在上周的数模比赛中遇到了DataFrame类型的数据,同时需要利用PyTorch建立BP神经网络,而PyTorch训练时输入的数据集需要是DataLoader。在网络上查询许久才找到使用方法,因此记录如下: 首先是用PyTorch建立神经网络结构(回归模型)
import torch
from torch import nn
# Regression MLP: 20 input features -> 20 -> 10 -> 1 output.
# Each hidden stage is Linear -> Dropout(p=0.3) -> Sigmoid; the final Linear
# layer emits a single unbounded value per sample.
net = nn.Sequential(
    nn.Sequential(
        nn.Linear(in_features=20, out_features=20),
        nn.Dropout(p=0.3),
        nn.Sigmoid(),
    ),
    nn.Sequential(
        nn.Linear(in_features=20, out_features=10),
        nn.Dropout(p=0.3),
        nn.Sigmoid(),
    ),
    nn.Linear(in_features=10, out_features=1),
)
建立好神经网络后,导入数据集,此次是利用pandas从excel文件中读取,具体代码如下:
import torch.utils.data.dataset as Dataset
import torch.utils.data as data
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, TensorDataset
def _to_float_tensor(frame):
    """Convert a pandas DataFrame/Series into a float32 torch tensor."""
    return torch.from_numpy(frame.values).to(torch.float32)


def get_dataLoader(df, feature_name, batch_size):
    """Build standardised train/validation DataLoaders from a DataFrame.

    Args:
        df: DataFrame containing the feature columns and a "label" column.
            The frame is left unmodified.
        feature_name: Column label(s) used to select the features from ``df``.
        batch_size: Mini-batch size used by both loaders.

    Returns:
        A ``(train_loader, test_loader)`` tuple. The data is split 70/30 with
        a fixed ``random_state`` of 2018. Features are z-score standardised
        using statistics of the training split only.
    """
    # Read the label without `pop` so the caller's DataFrame keeps its
    # "label" column and the function can be called again on the same frame.
    y = df["label"]
    x = df[feature_name]
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3, random_state=2018)

    # Fit the scaling on the training split only; using the full data set
    # would leak validation statistics into training.
    mean = x_train.mean()
    std = x_train.std()
    # A constant column has zero std; dividing by it would yield NaN/inf.
    if isinstance(std, pd.Series):
        std = std.replace(0, 1.0)
    elif std == 0:
        std = 1.0
    x_train = (x_train - mean) / std
    x_val = (x_val - mean) / std

    train_set = TensorDataset(_to_float_tensor(x_train), _to_float_tensor(y_train))
    test_set = TensorDataset(_to_float_tensor(x_val), _to_float_tensor(y_val))

    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, drop_last=True)
    # Keep the final partial batch during evaluation so every validation
    # sample is scored and the loader is never empty for small data sets.
    test_loader = DataLoader(test_set, batch_size=batch_size, drop_last=False)
    return train_loader, test_loader
在得到训练集以及测试集合后就可以对网络进行训练啦!!具体代码如下:
# Training configuration: number of epochs, target device, Adam optimiser
# (learning rate 1e-3) and mean-absolute-error loss.
epochs = 500
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)

# Move the model and the loss module to the selected device.
net = net.to(device)
loss_fn = nn.L1Loss().to(device)
print(device)

# Per-epoch loss histories, filled in by train() and eval().
train_loss_list, test_loss_list = [], []
def train(net, train_loader, optimizer, device, loss_fn):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        net: Model to optimise; switched to training mode.
        train_loader: Iterable yielding ``(features, targets)`` batches.
        optimizer: Optimiser updating the parameters of ``net``.
        device: Device the batches are moved to before the forward pass.
        loss_fn: Callable computing the loss from ``(prediction, target)``.

    Returns:
        The sum of the batch losses divided by the number of batches.

    Side effects:
        Appends the returned value to the module-level ``train_loss_list``
        and prints the loss of every tenth batch.
    """
    net.train()
    train_loss = 0
    nums = 0
    for step, (x, targets) in enumerate(train_loader):
        x = x.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        y_pred = net(x)
        # Give the targets the same shape as the prediction. Comparing a
        # (batch, 1) prediction with (batch,) targets would broadcast to
        # (batch, batch) and produce a wrong loss value.
        loss = loss_fn(y_pred, targets.reshape(y_pred.shape))
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        nums += 1
        if step % 10 == 0:
            # Print the scalar value rather than the tensor repr.
            print("第{}次训练loss:{}".format(step, loss.item()))

    epoch_loss = train_loss / nums
    train_loss_list.append(epoch_loss)
    return epoch_loss
def eval(net, test_loader, device, loss_fn):
    """Evaluate the model on a loader and return the mean per-batch loss.

    The name shadows the built-in ``eval``; it is kept so existing callers
    keep working.

    Args:
        net: Model to evaluate; switched to evaluation mode.
        test_loader: Iterable yielding ``(features, targets)`` batches.
        device: Device the batches are moved to before the forward pass.
        loss_fn: Callable computing the loss from ``(prediction, target)``.

    Returns:
        The sum of the batch losses divided by the number of batches.

    Side effects:
        Appends the returned value to the module-level ``test_loss_list``.
    """
    net.eval()
    test_loss = 0
    nums = 0
    # No gradients are needed for evaluation; disabling autograd avoids
    # building graphs and saves memory.
    with torch.no_grad():
        for X, targets in test_loader:
            X = X.to(device)
            targets = targets.to(device)
            y_pred = net(X)
            # Give the targets the same shape as the prediction. Comparing a
            # (batch, 1) prediction with (batch,) targets would broadcast to
            # (batch, batch) and produce a wrong loss value.
            loss = loss_fn(y_pred, targets.reshape(y_pred.shape))
            test_loss += loss.item()
            nums += 1

    mean_loss = test_loss / nums
    test_loss_list.append(mean_loss)
    return mean_loss
# Main loop: one training pass followed by one evaluation pass per epoch,
# reporting both mean losses with a 1-based epoch number.
for epoch in range(epochs):
    epoch_number = epoch + 1
    print("第{}次epoch训练开始".format(epoch_number))
    train_loss = train(net, train_loader, optimizer, device, loss_fn)
    loss = eval(net, test_loader, device, loss_fn)
    print("第{}次epoch训练集误差{},测试集误差:{}".format(epoch_number, train_loss, loss))
最后一步就是对训练误差进行画图啦!!!!!
import matplotlib.pyplot as plt
import numpy as np
# Draw the per-epoch training and test loss histories on the current axes.
loss_ax = plt.gca()
loss_ax.plot(train_loss_list, label="train")
loss_ax.plot(test_loss_list, label="test")
loss_ax.set_xlabel("Epoch", fontsize=15)
loss_ax.set_ylabel("Loss", fontsize=15)
loss_ax.legend(["train", "test"], fontsize=15)
plt.show()
另外在结尾处附上利用pandas筛选某些列的方法(自己笨,查了很多次,每次要用的时候又忘记了)以及在本次比赛中学到的一些处理技巧。首先:
# Binarise `label_4`: values of 3 or more become 1, everything else 0.
df2['label'] = df2['label_4'].apply(lambda grade: int(grade >= 3))
lgb打印特征重要性
def bar_plot(dataset, model_bst):
    """Plot and export the 20 highest feature importances of a model.

    Args:
        dataset: Object whose ``columns`` supply the feature names.
        model_bst: Model exposing ``feature_importance()``; presumably a
            LightGBM booster, as the surrounding notes mention lgb (confirm).

    Side effects:
        Writes the top-20 table to "特征以及相应的重要性.xlsx" and saves the
        bar chart to "第一问特征重要性20.png".
    """
    # NOTE(review): `sns` (seaborn) is not imported in the visible part of
    # this file; confirm it is imported before this function is called.
    plt.figure(figsize=(8, 10))
    importance_table = pd.DataFrame({
        'feature_name': list(dataset.columns),
        'importance': model_bst.feature_importance(),
    })
    # Rank features by importance and keep the 20 largest.
    top20 = importance_table.sort_values(by='importance', ascending=False).head(20)
    sns.barplot(x="importance", y="feature_name", data=top20, color="tomato")
    top20.to_excel("特征以及相应的重要性.xlsx", index=False)
    plt.savefig("第一问特征重要性20.png", dpi=1000, bbox_inches='tight')
# Plot and export the feature importances of `clf_first`, using the columns
# of `x_train` as feature names (both are defined outside this snippet).
bar_plot(x_train,clf_first)
绘制对比柱状图
# Grouped bar chart comparing absolute and squared error for each model.
labels = df["model"]
x = np.arange(len(df))
width = 0.35
half_width = width / 2

fig, ax = plt.subplots()
# The two series sit side by side, centred on each model's tick position.
rects1 = ax.bar(x - half_width, df["absolute_error"], width, label='absolute_error')
rects2 = ax.bar(x + half_width, df["squared_error"], width, label='squared_error')

ax.set_title('Error by models')
ax.set_xlabel('Model', fontsize=15)
ax.set_ylabel('Error', fontsize=15)
ax.set_xticks(x)
ax.set_xticklabels(labels, fontsize=15)
ax.legend(fontsize=15)
fig.tight_layout()
绘制子图
# Create a 2x2 grid of axes, then draw on the cell at row i, column j.
fig, axes = plt.subplots(nrows=2, ncols=2)
axes[i, j].plot(x, y)
模型打包以及加载
import pickle
# Persist the fitted model `clf` to disk.
with open('./get_ADMET.pkl', 'wb') as f:
    pickle.dump(clf, f)
print("保存模型成功!")

# Load a previously saved model. The context manager closes the file handle,
# which the bare `pickle.load(open(...))` form left open.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('get_Active.pkl', 'rb') as model_file:
    model_1 = pickle.load(model_file)
|