Data Preprocessing: processData
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
'''
train.csv:
Id,Label,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,I11,I12,I13,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
test.csv:
Id,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,I11,I12,I13,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
'''
train_df=pd.read_csv('../PNN_data/train.csv')
test_df=pd.read_csv('../PNN_data/test.csv')
print(train_df.shape, test_df.shape)
train_df.head()
label=train_df['Label']
del train_df['Label']
data_df = pd.concat((train_df, test_df))
data_df.head()
del data_df['Id']
sparse_feas=[col for col in data_df.columns if col[0]=='C']
dense_feas=[col for col in data_df.columns if col[0]=='I']
data_df[sparse_feas] = data_df[sparse_feas].fillna('-1')
data_df[dense_feas] = data_df[dense_feas].fillna(0)
for feat in sparse_feas:
    le = LabelEncoder()
    data_df[feat] = le.fit_transform(data_df[feat])
mms = MinMaxScaler()
data_df[dense_feas] = mms.fit_transform(data_df[dense_feas])
train = data_df[:train_df.shape[0]].copy()
test = data_df[train_df.shape[0]:].copy()
train['Label'] = label
train_set, val_set = train_test_split(train, test_size = 0.1, random_state=2020)
train_set.reset_index(drop=True, inplace=True)
val_set.reset_index(drop=True, inplace=True)
train_set.to_csv('../PNN_processData/train_set.csv', index=False)
val_set.to_csv('../PNN_processData/val_set.csv', index=False)
test.to_csv('../PNN_processData/test.csv', index=False)
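To make the two transformers concrete, here is a tiny standalone example on made-up values (a sketch, not the Criteo data), using the imports above:
toy = pd.DataFrame({'C1': ['a', 'b', 'a', '-1'], 'I1': [1.0, 4.0, 0.0, 2.0]})
toy['C1'] = LabelEncoder().fit_transform(toy['C1'])        # -> [1, 2, 1, 0] (integer ids, classes sorted)
toy[['I1']] = MinMaxScaler().fit_transform(toy[['I1']])    # -> [0.25, 1.0, 0.0, 0.5] (scaled to [0, 1])
print(toy)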
The PNN Model
Here we build a PNN network for a CTR prediction problem. The PyTorch modeling workflow has four main steps (step 4 is sketched at the end of this post):
1. Prepare the data
2. Build the model
3. Train the model
4. Use and save the model
import datetime
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, TensorDataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')
'''
train_set / val_set: I1~I13, C1~C26, Label
test_set: I1~I13, C1~C26
'''
train_set=pd.read_csv('../PNN_processData/train_set.csv')
test_set=pd.read_csv('../PNN_processData/test.csv')
val_set=pd.read_csv('../PNN_processData/val_set.csv')
data_df = pd.concat((train_set, val_set, test_set))
dense_feas = ['I'+str(i) for i in range(1, 14)]
sparse_feas = ['C'+str(i) for i in range(1, 27)]
sparse_feas_map = {}
for key in sparse_feas:
    sparse_feas_map[key] = data_df[key].nunique()   # vocabulary size of each sparse feature
features_info = [dense_feas, sparse_feas, sparse_feas_map]
dl_train_dataset = TensorDataset(torch.tensor(train_set.drop(columns='Label').values).float(), torch.tensor(train_set['Label'].values).float())
dl_val_dataset = TensorDataset(torch.tensor(val_set.drop(columns='Label').values).float(), torch.tensor(val_set['Label'].values).float())
dl_train = DataLoader(dl_train_dataset, shuffle=True, batch_size=16)
dl_valid = DataLoader(dl_val_dataset, shuffle=False, batch_size=16)   # no need to shuffle validation data
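A quick peek at one batch confirms the tensor shapes (39 feature columns: 13 dense + 26 sparse):
features, labels = next(iter(dl_train))
print(features.shape, labels.shape)   # torch.Size([16, 39]) torch.Size([16])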
a = [256, 128, 64]
list(zip(a[:-1], a[1:]))   # -> [(256, 128), (128, 64)]: the (in_features, out_features) pair for each Linear layer
class DNN(nn.Module):
    def __init__(self, hidden_units, dropout=0.):
        """
        hidden_units: list of layer widths, e.g. [256, 128, 64] is a two-layer network
                      (128 units, then 64); note the first element is the input dimension
        dropout: dropout rate
        """
        super(DNN, self).__init__()
        self.dnn_network = nn.ModuleList([
            nn.Linear(in_dim, out_dim) for in_dim, out_dim in zip(hidden_units[:-1], hidden_units[1:])
        ])
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        for linear in self.dnn_network:
            x = linear(x)
            x = F.relu(x)
        x = self.dropout(x)
        return x
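As a quick sanity check of the DNN block (a minimal sketch reusing the [256, 128, 64] example above), the output width should equal the last hidden width:
dnn = DNN([256, 128, 64])
print(dnn(torch.randn(2, 256)).shape)   # torch.Size([2, 64])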
class ProductLayer(nn.Module):
    def __init__(self, mode, embed_dim, field_num, hidden_units):
        super(ProductLayer, self).__init__()
        self.mode = mode
        # weights for the linear signal l_z
        self.w_z = nn.Parameter(torch.rand([field_num, embed_dim, hidden_units[0]]))
        # weights for the product signal l_p: the inner product works on
        # field_num x field_num interactions, the outer product on embed_dim x embed_dim
        if mode == 'in':
            self.w_p = nn.Parameter(torch.rand([field_num, field_num, hidden_units[0]]))
        else:
            self.w_p = nn.Parameter(torch.rand([embed_dim, embed_dim, hidden_units[0]]))
        # register the bias as a Parameter so the optimizer updates it and .to(device) moves it
        self.l_b = nn.Parameter(torch.rand([hidden_units[0], ]))

    def forward(self, z, sparse_embeds):
        # l_z: flatten the embeddings and project them to hidden_units[0]
        l_z = torch.mm(z.reshape(z.shape[0], -1), self.w_z.permute((2, 0, 1)).reshape(self.w_z.shape[2], -1).T)
        if self.mode == 'in':
            # inner product: pairwise dot products between the field embeddings
            p = torch.matmul(sparse_embeds, sparse_embeds.permute((0, 2, 1)))
        else:
            # outer product (superposition trick): sum the field embeddings first,
            # then take the outer product of that sum with itself
            f_sum = torch.unsqueeze(torch.sum(sparse_embeds, dim=1), dim=1)
            p = torch.matmul(f_sum.permute((0, 2, 1)), f_sum)
        l_p = torch.mm(p.reshape(p.shape[0], -1), self.w_p.permute((2, 0, 1)).reshape(self.w_p.shape[2], -1).T)
        output = l_p + l_z + self.l_b
        return output
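A shape check for the product layer (a sketch using this post's default sizes: 26 fields, embedding dim 10, first hidden width 256); both signals come out as (batch, hidden_units[0]):
pl = ProductLayer(mode='in', embed_dim=10, field_num=26, hidden_units=[256, 128, 64])
embeds = torch.randn(4, 26, 10)   # (batch, field_num, embed_dim)
print(pl(embeds, embeds).shape)   # torch.Size([4, 256])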
class PNN(nn.Module):
    def __init__(self, feature_info, hidden_units, mode='in', dnn_dropout=0., embed_dim=10, outdim=1):
        """
        PNN:
        feature_info: feature information (dense features, sparse features, sparse-feature vocabulary sizes)
        hidden_units: list of fully connected layer widths; note the first real hidden layer is
                      hidden_units[1], because hidden_units[0] is the input dimension
        dnn_dropout: dropout rate of the Dropout layer
        embed_dim: the embedding dimension m
        outdim: output dimension of the network
        """
        super(PNN, self).__init__()
        self.dense_feas, self.sparse_feas, self.sparse_feas_map = feature_info
        self.field_num = len(self.sparse_feas)
        self.dense_num = len(self.dense_feas)
        self.mode = mode
        self.embed_dim = embed_dim

        # one embedding table per sparse feature
        self.embed_layers = nn.ModuleDict({
            'embed_' + str(key): nn.Embedding(num_embeddings=val, embedding_dim=self.embed_dim)
            for key, val in self.sparse_feas_map.items()
        })

        self.product = ProductLayer(mode, embed_dim, self.field_num, hidden_units)

        # the DNN input is the product-layer output concatenated with the dense features
        hidden_units[0] += self.dense_num
        self.dnn_network = DNN(hidden_units, dnn_dropout)
        self.dense_final = nn.Linear(hidden_units[-1], 1)

    def forward(self, x):
        dense_inputs, sparse_inputs = x[:, :self.dense_num], x[:, self.dense_num:]
        sparse_inputs = sparse_inputs.long()
        sparse_embeds = [self.embed_layers['embed_' + key](sparse_inputs[:, i])
                         for key, i in zip(self.sparse_feas_map.keys(), range(sparse_inputs.shape[1]))]
        sparse_embeds = torch.stack(sparse_embeds)
        sparse_embeds = sparse_embeds.permute((1, 0, 2))   # (batch, field_num, embed_dim)

        # the linear signal z is just the embeddings themselves
        z = sparse_embeds
        product_out = self.product(z, sparse_embeds)

        # concatenate the product-layer output with the dense features and feed the DNN
        l1 = F.relu(torch.cat([product_out, dense_inputs], dim=-1))
        dnn_x = self.dnn_network(l1)

        outputs = torch.sigmoid(self.dense_final(dnn_x))
        outputs = outputs.squeeze(-1)
        return outputs
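Before training, an end-to-end smoke test on random data helps confirm the wiring (a minimal sketch; the vocabulary size of 10 per categorical field is a made-up assumption, not taken from the real data):
dummy_map = {'C' + str(i): 10 for i in range(1, 27)}   # assumed toy vocab sizes
dummy_info = [['I' + str(i) for i in range(1, 14)], list(dummy_map.keys()), dummy_map]
toy_net = PNN(dummy_info, [256, 128, 64])
toy_x = torch.cat([torch.rand(4, 13), torch.randint(0, 10, (4, 26)).float()], dim=1)
print(toy_net(toy_x).shape)   # torch.Size([4]): one probability per sample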
def auc(y_pred, y_true):
    pred = y_pred.data
    y = y_true.data
    return roc_auc_score(y, pred)
# instantiate the network; [256, 128, 64] matches the hidden_units example above
hidden_units = [256, 128, 64]
net = PNN(features_info, hidden_units)

loss_func = nn.BCELoss()
optimizer = torch.optim.Adam(params=net.parameters(), lr=0.0001)
metric_func = auc
metric_name = 'auc'
epochs = 6
log_step_freq = 10

dfhistory = pd.DataFrame(columns=["epoch", "loss", metric_name, "val_loss", "val_" + metric_name])
print('Start Training...')
nowtime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print('=========' * 8 + "%s" % nowtime)

for epoch in range(1, epochs + 1):
    # 1. training loop
    net.train()
    loss_sum = 0.0
    metric_sum = 0.0
    step = 1
    for step, (features, labels) in enumerate(dl_train, 1):
        optimizer.zero_grad()
        # forward
        predictions = net(features)
        loss = loss_func(predictions, labels)
        try:
            metric = metric_func(predictions, labels)
        except ValueError:
            # AUC is undefined when a batch contains only one class; count it as 0
            metric = 0.0
        # backward
        loss.backward()
        optimizer.step()
        loss_sum += loss.item()
        metric_sum += float(metric)
        if step % log_step_freq == 0:
            print(("[step = %d] loss: %.3f, " + metric_name + ": %.3f") %
                  (step, loss_sum / step, metric_sum / step))

    # 2. validation loop
    net.eval()
    val_loss_sum = 0.0
    val_metric_sum = 0.0
    val_step = 1
    for val_step, (features, labels) in enumerate(dl_valid, 1):
        with torch.no_grad():
            predictions = net(features)
            val_loss = loss_func(predictions, labels)
            try:
                val_metric = metric_func(predictions, labels)
            except ValueError:
                val_metric = 0.0
        val_loss_sum += val_loss.item()
        val_metric_sum += float(val_metric)

    # 3. record and report this epoch
    info = (epoch, loss_sum / step, metric_sum / step, val_loss_sum / val_step, val_metric_sum / val_step)
    dfhistory.loc[epoch - 1] = info
    print(("\nEPOCH = %d, loss = %.3f," + metric_name +
           " = %.3f, val_loss = %.3f, " + "val_" + metric_name + " = %.3f") % info)
    nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("\n" + "==========" * 8 + "%s" % nowtime)

print('Finished Training...')
The training results are as follows:
Start Training...
========================================================================2022-04-26 09:10:13
[step = 10] loss: 0.456, auc: 0.850
[step = 20] loss: 0.425, auc: 0.795
[step = 30] loss: 0.441, auc: 0.809
[step = 40] loss: 0.438, auc: 0.819
[step = 50] loss: 0.428, auc: 0.822
[step = 60] loss: 0.439, auc: 0.820
[step = 70] loss: 0.433, auc: 0.830
[step = 80] loss: 0.427, auc: 0.837
[step = 90] loss: 0.422, auc: 0.839
EPOCH = 1, loss = 0.422,auc = 0.839, val_loss = 0.443, val_auc = 0.676
================================================================================2022-04-26 09:10:22
[step = 10] loss: 0.380, auc: 0.849
[step = 20] loss: 0.441, auc: 0.842
[step = 30] loss: 0.447, auc: 0.838
[step = 40] loss: 0.433, auc: 0.851
[step = 50] loss: 0.415, auc: 0.858
[step = 60] loss: 0.421, auc: 0.851
[step = 70] loss: 0.418, auc: 0.861
[step = 80] loss: 0.414, auc: 0.848
[step = 90] loss: 0.410, auc: 0.854
EPOCH = 2, loss = 0.410,auc = 0.854, val_loss = 0.473, val_auc = 0.647
================================================================================2022-04-26 09:10:32
[step = 10] loss: 0.413, auc: 0.867
[step = 20] loss: 0.422, auc: 0.875
[step = 30] loss: 0.414, auc: 0.879
[step = 40] loss: 0.390, auc: 0.885
[step = 50] loss: 0.379, auc: 0.888
[step = 60] loss: 0.372, auc: 0.889
[step = 70] loss: 0.378, auc: 0.882
[step = 80] loss: 0.382, auc: 0.874
[step = 90] loss: 0.389, auc: 0.870
EPOCH = 3, loss = 0.389,auc = 0.870, val_loss = 0.457, val_auc = 0.579
================================================================================2022-04-26 09:10:35
[step = 10] loss: 0.351, auc: 0.953
[step = 20] loss: 0.374, auc: 0.913
[step = 30] loss: 0.361, auc: 0.898
[step = 40] loss: 0.355, auc: 0.863
[step = 50] loss: 0.343, auc: 0.853
[step = 60] loss: 0.346, auc: 0.858
[step = 70] loss: 0.361, auc: 0.856
[step = 80] loss: 0.364, auc: 0.865
[step = 90] loss: 0.372, auc: 0.858
EPOCH = 4, loss = 0.372,auc = 0.858, val_loss = 0.466, val_auc = 0.611
================================================================================2022-04-26 09:10:38
[step = 10] loss: 0.301, auc: 0.931
[step = 20] loss: 0.343, auc: 0.897
[step = 30] loss: 0.327, auc: 0.915
[step = 40] loss: 0.320, auc: 0.912
[step = 50] loss: 0.323, auc: 0.915
[step = 60] loss: 0.333, auc: 0.887
[step = 70] loss: 0.345, auc: 0.877
[step = 80] loss: 0.356, auc: 0.873
[step = 90] loss: 0.358, auc: 0.874
EPOCH = 5, loss = 0.358,auc = 0.874, val_loss = 0.456, val_auc = 0.645
================================================================================2022-04-26 09:10:41
[step = 10] loss: 0.360, auc: 0.858
[step = 20] loss: 0.346, auc: 0.858
[step = 30] loss: 0.386, auc: 0.883
[step = 40] loss: 0.433, auc: 0.875
[step = 50] loss: 0.401, auc: 0.888
[step = 60] loss: 0.412, auc: 0.887
[step = 70] loss: 0.405, auc: 0.876
[step = 80] loss: 0.388, auc: 0.878
[step = 90] loss: 0.384, auc: 0.881
EPOCH = 6, loss = 0.384,auc = 0.881, val_loss = 0.488, val_auc = 0.544
================================================================================2022-04-26 09:10:45
Finished Training...
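Finally, step 4 of the workflow above ("use and save the model") can look like the following minimal sketch; the save path is my own assumption:
# score the (Label-free) test set with the trained network
net.eval()
with torch.no_grad():
    test_pred = net(torch.tensor(test_set.values).float())

# persist the weights; the path below is an assumed location
torch.save(net.state_dict(), '../PNN_processData/pnn_model.pth')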