title: TextRCNN Close Reading and Reproduction date: 2022-03-03 20:16:16 tags:
Paper: Recurrent Convolutional Neural Networks for Text Classification | Papers With Code
Model architecture
A bidirectional recurrent neural network followed by a max-pooling layer. The paper says the embeddings are pre-trained with word2vec, but I could not find the corresponding resources.
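For reference, the core computation of the paper can be summarized as below (notation follows the paper; this reproduction approximates the left and right context vectors c_l, c_r with the two directions of a BiLSTM):

$$
\begin{aligned}
c_l(w_i) &= f\big(W^{(l)} c_l(w_{i-1}) + W^{(sl)} e(w_{i-1})\big) \\
c_r(w_i) &= f\big(W^{(r)} c_r(w_{i+1}) + W^{(sr)} e(w_{i+1})\big) \\
x_i &= [\,c_l(w_i);\ e(w_i);\ c_r(w_i)\,] \\
y_i^{(2)} &= \tanh\big(W^{(2)} x_i + b^{(2)}\big) \\
y^{(3)} &= \max_i\, y_i^{(2)} \quad \text{(element-wise max over positions)} \\
y^{(4)} &= \operatorname{softmax}\big(W^{(4)} y^{(3)} + b^{(4)}\big)
\end{aligned}
$$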
Experiments
Data
I use the Stanford Sentiment Treebank dataset mentioned in the paper.
First the dataset is processed to pair each text with its sentiment value, using the sentiment_labels.txt file.
The data preprocessing here follows a CSDN blog post. Many thanks!
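For completeness, here is a minimal sketch of that preprocessing, assuming the standard file names inside the SST release (datasetSentences.txt, datasetSplit.txt, dictionary.txt, sentiment_labels.txt) and the train_final.txt / test_final.txt / valid_final.txt output names used below. The details (e.g. handling sentences whose text does not exactly match a dictionary phrase) may differ from what the CSDN post does.

```python
# Hypothetical sketch: join the raw SST files into "sentence \t score" files.
import pandas as pd

sents = pd.read_csv('datasetSentences.txt', sep='\t')        # sentence_index, sentence
split = pd.read_csv('datasetSplit.txt', sep=',')             # sentence_index, splitset_label
phrases = pd.read_csv('dictionary.txt', sep='|', header=None,
                      names=['phrase', 'phrase_id'])
scores = pd.read_csv('sentiment_labels.txt', sep='|')        # phrase ids, sentiment values

df = (sents.merge(split, on='sentence_index')
           .merge(phrases, left_on='sentence', right_on='phrase')
           .merge(scores, left_on='phrase_id', right_on='phrase ids'))

# splitset_label: 1 = train, 2 = test, 3 = dev
for name, split_id in [('train_final.txt', 1), ('test_final.txt', 2), ('valid_final.txt', 3)]:
    part = df[df.splitset_label == split_id]
    part[['sentence', 'sentiment values']].to_csv(name, sep='\t', header=False, index=False)
```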
Next, the dataset code:
```python
'''
author: yxr
date: 2022-2-26
introduce: data-processing part of the RCNN reproduction
'''
import pandas as pd
import torch
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

pad_size = 64  # fixed sequence length after truncation/padding


def make_dictionary():
    # Build a vocabulary over train/test/valid so every token has an id.
    df1 = pd.read_csv('./train_final.txt', header=None, delimiter='\t')
    df2 = pd.read_csv('./test_final.txt', header=None, delimiter='\t')
    df3 = pd.read_csv('./valid_final.txt', header=None, delimiter='\t')
    frame = [df1, df2, df3]
    df = pd.concat(frame, axis=0)
    df.columns = ['sentence', 'label']
    sentences = df.sentence.values
    word2ids = {'[SEP]': 0, '[CLS]': 1, '[PAD]': 2}
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    for sentence in sentences:
        tokens = tokenizer.tokenize(sentence)
        for token in tokens:
            if token not in word2ids.keys():
                word2ids[token] = len(word2ids)
    return word2ids


def label_class(score):
    # Map the continuous sentiment score in [0, 1] to one of five classes.
    if score >= 0 and score <= 0.2:
        tmp_label = 0
    elif score > 0.2 and score <= 0.4:
        tmp_label = 1
    elif score > 0.4 and score <= 0.6:
        tmp_label = 2
    elif score > 0.6 and score <= 0.8:
        tmp_label = 3
    else:
        tmp_label = 4
    return tmp_label


def data_process(data_path, word2idx):
    df = pd.read_csv(data_path, header=None, delimiter='\t')
    df.columns = ['sentence', 'label']  # 'label' is the raw sentiment score
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    sentences = df.sentence.values
    sentence_labels = df.label.values
    input_ids = []
    input_tokens = []
    labels = []
    for sent, ll in zip(sentences, sentence_labels):
        token_id = []
        # Prepend [CLS], truncate/pad to pad_size - 1 tokens, then append [SEP].
        tmp_tokens = tokenizer.tokenize('[CLS]' + sent)
        if len(tmp_tokens) > pad_size - 1:
            tmp_tokens = tmp_tokens[:pad_size - 1]
        else:
            tmp_tokens = tmp_tokens + ['[PAD]'] * (pad_size - 1 - len(tmp_tokens))
        tmp_tokens.append('[SEP]')
        for token in tmp_tokens:
            token_id.append(word2idx[token])
        tmp_label = label_class(ll)
        input_ids.append(token_id)
        input_tokens.append(tmp_tokens)
        labels.append(tmp_label)
    input_ids = torch.tensor(input_ids)
    labels = torch.tensor(labels)
    # Tokens (str) cannot be stored in a tensor, so they are returned separately.
    return TensorDataset(input_ids, labels), input_tokens
```
The tokenizer here reuses the one from BERT, which differs from the PyTorch implementations on GitHub. There are surely other tools that could be used instead, and they would be much lighter-weight.
Steps: build a custom vocabulary dictionary, tokenize the text, map tokens to input_ids with the vocabulary, and return a TensorDataset. Note that tensors cannot store str data, which is why the tokens are returned separately.
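A quick sanity check of that pipeline (assuming it is run in the same module, or after `from data_preprocess import make_dictionary, data_process`):

```python
word2idx = make_dictionary()
dataset, tokens = data_process('./train_final.txt', word2idx)

ids, label = dataset[0]   # ids: LongTensor of length pad_size, label: class id 0..4
print(tokens[0][:8])      # first few tokens of the first sentence (plain strings)
print(ids.shape)          # torch.Size([64])
```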
model
Next comes the core model part, which is essentially a straight re-implementation of the paper.
```python
import torch
import torch.nn as nn

embed_size = 300
hidden_size = 512
pad_size = 64
class_num = 5


class Embedding(nn.Module):
    def __init__(self, vocab_size, is_pretrain=False):
        super(Embedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        if is_pretrain:
            # Pretrained word2vec weights would be copied into
            # self.embedding.weight here and then frozen.
            self.embedding.weight.requires_grad = False

    def forward(self, input_ids):
        return self.embedding(input_ids)


class RCNN(nn.Module):
    def __init__(self, vocab_size):
        super(RCNN, self).__init__()
        self.embedding = Embedding(vocab_size=vocab_size)
        # batch_first=True so the LSTM sees (batch, seq_len, embed_size).
        self.bilstm = nn.LSTM(input_size=embed_size, hidden_size=hidden_size,
                              bidirectional=True, batch_first=True)
        self.linear1 = nn.Linear(embed_size + 2 * hidden_size, hidden_size)
        self.tanh = nn.Tanh()
        self.maxpooling = nn.MaxPool1d(pad_size)  # pool over the whole sequence
        self.linear2 = nn.Linear(hidden_size, class_num)
        self.softmax = nn.Softmax(dim=-1)  # only for probabilities at inference

    def forward(self, x):
        embed = self.embedding(x)               # (batch, seq, embed_size)
        lstm_out, _ = self.bilstm(embed)        # (batch, seq, 2 * hidden_size)
        out = torch.cat((embed, lstm_out), 2)   # (batch, seq, embed + 2 * hidden)
        out = self.tanh(self.linear1(out))      # (batch, seq, hidden_size)
        out = out.permute(0, 2, 1)              # (batch, hidden_size, seq)
        out = self.maxpooling(out).squeeze(-1)  # (batch, hidden_size)
        out = self.linear2(out)                 # (batch, class_num)
        # Return raw logits: nn.CrossEntropyLoss applies log-softmax itself,
        # so applying self.softmax here would effectively softmax twice.
        return out
```
Note the use of the LSTM here; I am still somewhat unclear about how the LSTM works internally and will go through the derivation again tomorrow.
When writing relatively simple network architectures in PyTorch, the most important thing is getting the tensor dimensions right, since most of the building blocks are already implemented and only need to be configured with the right arguments.
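To make the dimension bookkeeping concrete, here is a small shape walkthrough with a dummy batch, assuming the model file above is saved as rcnn_model.py (as it is imported in the training script below):

```python
import torch
import rcnn_model

model = rcnn_model.RCNN(vocab_size=1000)      # any vocab size works for the check
x = torch.randint(0, 1000, (10, 64))          # (batch, pad_size) token ids

embed = model.embedding(x)                    # (10, 64, 300)
lstm_out, _ = model.bilstm(embed)             # (10, 64, 1024) = 2 * hidden_size
cat = torch.cat((embed, lstm_out), 2)         # (10, 64, 1324)
feat = model.tanh(model.linear1(cat))         # (10, 64, 512)
pooled = model.maxpooling(feat.permute(0, 2, 1)).squeeze(-1)  # (10, 512)
logits = model.linear2(pooled)                # (10, 5)
print(logits.shape)                           # torch.Size([10, 5])
```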
train
Next, using the data processing and model parts above, feed the data into the model and start training.
```python
import sys
sys.path.append('./')
import gensim  # only needed if pretrained word2vec embeddings are used (see note below)
import torch
import torch.nn as nn
import torch.optim as Optim
import data_preprocess
from transformers import BertTokenizer
from torch.utils.data import DataLoader, RandomSampler
import rcnn_model
import time
import pandas as pd

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

print('=== loading data ===')
word2idx = data_preprocess.make_dictionary()
vocab_size = len(word2idx)
train_dataset, train_tokens = data_preprocess.data_process('./train_final.txt', word2idx)
train_loader = DataLoader(train_dataset,
                          sampler=RandomSampler(train_dataset),
                          batch_size=10)
test_dataset, test_tokens = data_preprocess.data_process('./test_final.txt', word2idx)
test_loader = DataLoader(test_dataset,
                         sampler=RandomSampler(test_dataset),
                         batch_size=10)

print('=== building model ===')
model = rcnn_model.RCNN(vocab_size=vocab_size)
criterion = nn.CrossEntropyLoss()
optimizer = Optim.Adadelta(model.parameters(), lr=0.03)

acc = []
for epoch in range(50):
    print('=== training ===')
    acc1 = []
    accuracy = 0.0
    data_batch_num = 0
    total_loss = 0.0
    t_begin = time.time()
    for batch in train_loader:
        input_ids = batch[0]
        labels = batch[1]
        output = model(input_ids)
        out_class = torch.argmax(output, dim=1)
        loss = criterion(output, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        accuracy += int(sum(out_class == labels))
        data_batch_num += len(labels)
    accuracy = float(accuracy / len(train_tokens))
    print('=== acc: %.3f' % accuracy, '== time:', time.time() - t_begin)
    acc1.append('{:.4f}'.format(accuracy))

    print('=== testing ===')
    accuracy = 0.0
    for batch in test_loader:
        input_ids = batch[0]
        labels = batch[1]
        with torch.no_grad():
            output = model(input_ids)
        out_class = torch.argmax(output, dim=1)
        accuracy += int(sum(out_class == labels))
    accuracy = float(accuracy / len(test_tokens))
    print('=== acc: %.3f' % accuracy)
    acc1.append('{:.4f}'.format(accuracy))
    acc.append(acc1)

# One row per epoch: training accuracy and test accuracy.
acc = pd.DataFrame(acc, columns=['train', 'test'])
acc.to_csv('accuracy.csv', index=False)
```
The embedding originally used a Google pre-trained embedding matrix: you pass in a word and get back its embedding vector. The results were mediocre, though, so I removed it and switched to a plain nn.Embedding(vocab_size, embed_size), letting the word embeddings be adjusted during training.
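For reference, a sketch of how such a pretrained matrix could be wired into nn.Embedding with gensim. The file name GoogleNews-vectors-negative300.bin and the random initialization for out-of-vocabulary words are my assumptions, not necessarily what was used originally:

```python
import torch
import torch.nn as nn
from gensim.models import KeyedVectors

# Assumed file: the 300-d Google News word2vec vectors.
w2v = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

def build_embedding(word2idx, embed_size=300, freeze=False):
    weight = torch.randn(len(word2idx), embed_size) * 0.01  # random init for OOV words
    for word, idx in word2idx.items():
        if word in w2v:
            weight[idx] = torch.tensor(w2v[word])
    return nn.Embedding.from_pretrained(weight, freeze=freeze)
```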
Results
According to the README that comes with the data, the dataset has five classes; test accuracy is around 37%, which falls short of the 40%+ reported in the paper.
The cause may lie in the word-embedding part, though I am not sure yet. Binning the continuous sentiment scores into classes probably loses some information, so I plan to drop the classification step, have the model output a value in [0, 1], and then judge accuracy from those predictions.
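A possible sketch of that regression variant (my own guess at the change, not something already implemented): replace the classification head with a single sigmoid output and train with MSE against the raw sentiment score, which also means data_process would have to return the raw scores instead of class ids.

```python
import torch
import torch.nn as nn
import rcnn_model

class RCNNRegression(rcnn_model.RCNN):
    """Hypothetical variant: predict the raw sentiment score in [0, 1]."""
    def __init__(self, vocab_size):
        super().__init__(vocab_size)
        self.linear2 = nn.Linear(rcnn_model.hidden_size, 1)  # single output unit

    def forward(self, x):
        out = super().forward(x)               # (batch, 1)
        return torch.sigmoid(out).squeeze(-1)  # score in [0, 1]

criterion = nn.MSELoss()  # compare against the raw score instead of a class index
```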
I also suspect the dataset quality is not very high; the labels do not seem to have been annotated by hand.