Overview
The goal of this article is binary sentiment classification of a sentence: positive or negative. The code lives at:
https://github.com/stay-leave/BI-LSTM-sentiment-classify
Input dataset format: a label of 1 means positive, 0 means negative. There is a txt version (train and test sets in separate files), for which I use the Baidu Qianyan dataset, and an xls version (train and test sets in a single file). Sample output is shown further below. The rest of this article walks through the whole pipeline.
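For reference, the txt version looks roughly like this. This is a hypothetical sample, consistent with how the extraction code below reads the file (a header row, then one tab-separated label and sentence per line):

label	text
1	这家店的服务很好,下次还会再来。
0	质量太差了,非常失望。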
Data processing
Goal: convert the raw data to tensors and load them into a DataLoader for later use. The idea is to extract the text from the txt or xls file, tokenise it, fix the sentence length, encode each sentence as a sequence of indices, and finally load everything into PyTorch's DataLoader class; a rough sketch of how the steps chain together is shown below.
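The methods in this section live on one data-processing class. As a sketch of how they chain together (the class name DataProcess and the constructor arguments are assumptions for illustration, not taken from the repo):

processor = DataProcess(inpath='train.txt', seq_length=200, batch_size=64)  # hypothetical wrapper class
vocab_to_index, sentence, label, sentences = processor.count_s()             # steps 1-3 below
train_loader = processor.data_for_train_txt(sentence, vocab_to_index, label)  # step 4 below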
1. Extracting the files
Extracting a txt file:
def txt_file(self, inpath):
    """Read a tab-separated txt file into a list of [label, sentence] rows."""
    data = []
    with open(self.inpath, 'r', encoding='utf-8') as fp:
        for line in fp:
            line = line.strip('\n')
            line = line.split('\t')
            data.append(line)
    data = data[1:]  # drop the header row
    return data
Extracting an xls file:
def xls_file(self, inpath):
    """Read the first sheet of an xls file into a list of rows."""
    data = xlrd.open_workbook(self.inpath, encoding_override='utf-8')
    table = data.sheets()[0]
    nrows = table.nrows
    numbers = []
    for i in range(1, nrows):  # start at 1 to skip the header row
        alldata = table.row_values(i)  # one row as a list of cell values
        numbers.append(alldata)
    return numbers
The result is as follows:
2. Tokenising the sentences
The data above contains both the sentences and the labels, so they have to be separated. The code below is for the txt file; for an xls file, comment out the label line in the splitt function and uncomment the line below it.
def tokenlize(self, sentence):
    """Strip URLs from a sentence, then segment it with jieba."""
    URL_REGEX = re.compile(r'\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', re.IGNORECASE)
    sentence = re.sub(URL_REGEX, '', sentence)
    sentence = jieba.cut(sentence.strip(), cut_all=False)  # precise mode
    out = []
    for word in sentence:
        out.append(word)
    return out

def splitt(self, data):
    """Split [label, sentence] rows into tokenised sentences and integer labels."""
    sentence = []
    label = []
    for i in data:
        sentence.append(self.tokenlize(i[1]))
        label.append(int(i[0]))
    sentence = tuple(sentence)
    label = tuple(label)
    return sentence, label
The result is as follows:
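As a quick sanity check, the two helpers can be exercised on a couple of toy rows (the data is hypothetical, and the exact jieba segmentation may vary):

rows = [['1', '这家店的服务很好'], ['0', '质量太差了']]
sentence, label = processor.splitt(rows)
# sentence -> (['这家', '店', '的', '服务', '很', '好'], ['质量', '太差', '了'])
# label    -> (1, 0)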
3. Building the vocabulary and encoding the sentences
The idea is to count word frequencies, convert each sentence to a sequence of indices, and truncate or pad sentences to the configured length. PAD (index 0) represents both padding and out-of-vocabulary words. First, build the (word: frequency) dictionary; switching between txt and xls works the same as above.
def count_s(self):
    """Build the word-to-index vocabulary from the training sentences."""
    sentence, label = self.splitt(self.txt_file(self.inpath))
    count_dict = dict()
    sentences = []
    for i in sentence:  # flatten all tokenised sentences into one word list
        sentences += i
    for item in sentences:  # count word frequencies
        if item in count_dict:
            count_dict[item] += 1
        else:
            count_dict[item] = 1
    # sort by frequency, descending, so frequent words get small indices
    count_dict_s = collections.OrderedDict(sorted(count_dict.items(), key=lambda t: t[1], reverse=True))
    vocab = list(count_dict_s.keys())
    vocab_index = [i for i in range(1, len(vocab) + 1)]  # indices start at 1
    vocab_to_index = dict(zip(vocab, vocab_index))
    vocab_to_index["PAD"] = 0  # 0 is reserved for padding and OOV words
    return vocab_to_index, sentence, label, sentences
The result is as follows: With this dictionary, a sentence can be encoded, i.e. converted into a sequence of indices. Conversely, a sequence of indices can be decoded back into a sentence.
def seq_to_array(self, seq, vocab_to_index):
    """Encode one tokenised sentence as a fixed-length list of indices."""
    seq_index = []
    for word in seq:
        if word in vocab_to_index:
            seq_index.append(vocab_to_index[word])
        else:
            seq_index.append(0)  # out-of-vocabulary words map to PAD (0)
    if len(seq_index) < self.seq_length:
        # left-pad short sentences with 0
        seq_index = [0] * (self.seq_length - len(seq_index)) + seq_index
    elif len(seq_index) > self.seq_length:
        # truncate long sentences
        seq_index = seq_index[:self.seq_length]
    return seq_index
For the sentence
‘你好!我是初学者!’
the conversion is as follows:
def array_to_seq(self, indices):
    """Decode lists of indices back into tokens (the inverse of seq_to_array)."""
    vocab_to_index, sentence, label, sentences = self.count_s()
    # invert the vocabulary once instead of scanning it for every index
    index_to_vocab = {value: key for key, value in vocab_to_index.items()}
    seqs = []
    for i in indices:
        seq = [index_to_vocab[j] for j in i if j in index_to_vocab]
        seqs.append(seq)
    return seqs
For the index sequence above
[[0, 0, 0, 0, 6322, 0, 4, 3, 724, 0]]
the decoded sentence is as follows. That completes sentence encoding; the data can now be loaded into tensors.
4. Loading the data into a DataLoader
Taking the training-set txt file as an example: first encode every sentence, then convert the lists to arrays, and finally load them into a DataLoader.
def data_for_train_txt(self, sentence, vocab_to_index, label):
    """Encode, shuffle, and wrap the training data in a DataLoader."""
    features = [self.seq_to_array(seq, vocab_to_index) for seq in sentence]
    random_order = list(range(len(features)))
    np.random.seed(2)  # fixed seed so the shuffle is reproducible
    np.random.shuffle(random_order)
    features_train = np.array([features[i] for i in random_order])
    label_train = np.array([label[i] for i in random_order])[:, np.newaxis]
    train_data = TensorDataset(torch.LongTensor(features_train),
                               torch.LongTensor(label_train))
    train_sampler = RandomSampler(train_data)
    # drop_last=True keeps every batch the same size, matching init_hidden
    train_loader = DataLoader(train_data, sampler=train_sampler, batch_size=self.batch_size, drop_last=True)
    return train_loader
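A quick shape check on one batch confirms the loader behaves as intended (a sketch assuming batch_size=64 and seq_length=200):

for inputs, labels in train_loader:
    print(inputs.shape)  # torch.Size([64, 200])
    print(labels.shape)  # torch.Size([64, 1])
    break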
Data processing is done! Next comes model construction.
Building the BI-LSTM model
For the theory behind this model, this blog post explains it very well:
https://blog.csdn.net/weixin_42118657/article/details/120022112
The implementation is below, with comments on nearly every step:
class BI_lstm(nn.Module):
    def __init__(self, vocab_size, vocab_to_index, n_layers, hidden_dim, embed, output_size, dropout):
        super(BI_lstm, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.embedding_dim = embed
        self.output_size = output_size
        # the PAD index is masked out of the embedding's gradient updates
        self.embedding = nn.Embedding(vocab_size, self.embedding_dim, padding_idx=vocab_to_index['PAD'])
        self.lstm = nn.LSTM(self.embedding_dim,
                            hidden_dim,
                            n_layers,
                            dropout=dropout,
                            batch_first=True,
                            bidirectional=True)
        # bidirectional: forward and backward outputs are concatenated, hence *2
        self.fc = nn.Linear(self.hidden_dim * 2, self.output_size)
        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, hidden):
        """
        x: the current input, of size (batch_size, seq_length)
        hidden: the previous hidden state and cell state, as a tuple (h, c),
        where h and c each have size (n_layers*2, batch_size, hidden_dim)
        """
        batch_size = x.size(0)
        x = x.long()
        embeds = self.embedding(x)                    # (batch, seq, embedding_dim)
        lstm_out, hidden = self.lstm(embeds, hidden)  # (batch, seq, hidden_dim*2)
        # flatten the time dimension; the width is hidden_dim*2 because the LSTM is bidirectional
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim * 2)
        out = self.dropout(lstm_out)
        out = self.tanh(out)
        out = self.fc(out)
        out = self.sigmoid(out)
        out = out.view(batch_size, -1)
        out = out[:, -1]  # keep only the prediction at the last time step
        return out, hidden

    def init_hidden(self, batch_size):
        """
        Initialise the hidden state: the first call to the LSTM has no previous
        state, so one is created here, filled with zeros.
        It is a tuple because the LSTM takes both a hidden state and a cell state.
        The leading dimension is n_layers*2 because the LSTM is bidirectional.
        """
        hidden = (torch.zeros(self.n_layers * 2, batch_size, self.hidden_dim).to(device),
                  torch.zeros(self.n_layers * 2, batch_size, self.hidden_dim).to(device))
        return hidden
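As an example instantiation (the hyperparameter values here are illustrative assumptions; the hard-coded reshape to 256 in the original forward pass suggests hidden_dim=128 was used):

# assumes `device` is defined as in the original code, e.g. torch.device('cuda')
model = BI_lstm(vocab_size=len(vocab_to_index),
                vocab_to_index=vocab_to_index,
                n_layers=2,
                hidden_dim=128,
                embed=300,
                output_size=1,
                dropout=0.5).to(device)
print(model)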
The result is as follows:
Training and evaluating the model
Feed the data to the model and train it.
def train(config, model, train_loader):
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
    criterion = nn.BCELoss()  # binary cross-entropy on the sigmoid output
    y_loss = []
    for e in range(config.epochs):
        h = model.init_hidden(config.batch_size)
        counter = 0
        train_losses = []
        for inputs, labels in train_loader:
            counter += 1
            inputs, labels = inputs.cuda(), labels.cuda()
            # detach the hidden state so gradients do not flow across batches
            h = tuple([each.data for each in h])
            output, h = model(inputs, h)
            output = output[:, np.newaxis]  # match the (batch, 1) label shape
            train_loss = criterion(output, labels.float())
            train_losses.append(train_loss.item())
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()
            if counter % config.print_every == 0:
                print("Epoch: {}/{}, ".format(e+1, config.epochs),
                      "Step: {}, ".format(counter),
                      "Loss: {:.6f}, ".format(train_loss.item()),
                      "Mean Loss: {:.6f}".format(np.mean(train_losses)))
            y_loss.append(train_loss.item())
    # plot the per-batch training loss curve
    x = [i for i in range(len(y_loss))]
    fig = plt.figure()
    plt.plot(x, y_loss)
    plt.show()
    torch.save(model, config.save_model_path)
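The config object only needs a handful of fields here; a minimal stand-in with illustrative values (not the repo's actual configuration) would be:

class Config:
    lr = 0.001
    epochs = 10
    batch_size = 64
    print_every = 100
    save_model_path = 'model.pth'

train(Config(), model, train_loader)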
After training, evaluate the model on the test set using accuracy; the sigmoid output is thresholded at 0.5 (via round) to obtain the predicted class:
def test(config, model, test_loader):
    model.eval()  # disable dropout during evaluation
    criterion = nn.BCELoss()
    h = model.init_hidden(config.batch_size)
    with torch.no_grad():
        count = 0       # number of correct predictions
        total = 0       # number of examples seen
        total_loss = 0  # accumulated per-batch losses
        l = 0           # number of batches
        for input_test, target_test in test_loader:
            h = tuple([each.data for each in h])
            input_test = input_test.type(torch.LongTensor)
            target_test = target_test.type(torch.LongTensor)
            target_test = target_test.squeeze(1)
            input_test = input_test.cuda()
            target_test = target_test.cuda()
            output_test, h = model(input_test, h)
            pred = output_test.cpu().numpy().tolist()
            target = target_test.cpu().numpy().tolist()
            for i, j in zip(pred, target):
                if round(i) == j:  # threshold the sigmoid output at 0.5
                    count = count + 1
            total += target_test.size(0)
            loss = criterion(output_test, target_test.float())
            total_loss += loss.item()  # accumulate the batch loss
            l = l + 1
        acc = 100 * count / total
        test_loss = total_loss / l
        print("test mean loss: {:.3f}".format(test_loss))
        print("test accuracy : {:.3f}".format(acc))
Using the model
The trained model can be used directly to predict the sentiment of sentences. Prediction code:
def predict(config, model, pred_loader):
    model.eval()  # disable dropout for inference
    pred_all = []
    with torch.no_grad():
        h = model.init_hidden(config.batch_size_pred)
        for dat, id in pred_loader:
            h = tuple([each.data for each in h])
            dat = dat.cuda()
            output, h = model(dat, h)
            pred = output.cpu().numpy().tolist()  # sigmoid outputs in [0, 1]
            pred_all = pred_all + pred
    return pred_all
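To feed save_file below, the sigmoid outputs can be paired with their ids and texts and thresholded into classes. A sketch, where id_text_pairs is a hypothetical list of (id, text) tuples aligned with pred_loader:

pred_all = predict(config, model, pred_loader)
alls = [[idx, text, p, round(p)] for (idx, text), p in zip(id_text_pairs, pred_all)]
save_file(config, alls)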
Saving the prediction results:
def save_file(config, alls):
    """Save the results to an excel file."""
    f = openpyxl.Workbook()
    sheet1 = f.create_sheet('sheet1')
    sheet1['A1'] = 'id'
    sheet1['B1'] = '评论内容'  # comment text
    sheet1['C1'] = '情感值'    # sentiment value (the sigmoid output)
    sheet1['D1'] = '情感类别'  # sentiment class (0/1)
    i = 2  # row 1 holds the headers
    for all in alls:
        for j in range(1, len(all) + 1):
            sheet1.cell(row=i, column=j, value=all[j - 1])
        i = i + 1
    f.save(config.save_pred_path)
Summary
This was a simple PyTorch implementation of a BI-LSTM for text classification. Treating the sigmoid output as a "sentiment value" is not really sound; it is better called a tendency score, or you can ignore it altogether and care only about the positive/negative class. Next I plan to learn to train with pre-trained word embeddings. I'm still a beginner and have a lot more to learn.
Reference blogs:
https://blog.csdn.net/qq_52785473/article/details/122800625
https://blog.csdn.net/qq_40276310/article/details/109248949
http://t.csdn.cn/qjkST
https://blog.51cto.com/u_11466419/5184189