1.1 Overview of Text Serialization
- Before building a deep learning model, the text must be converted into a vector representation (word embedding). This is done in two steps: first convert the text into numbers (text serialization), and then convert the numbers into vectors.
- A practical approach is to store each word and its corresponding number in a dictionary, and to represent each sentence as a list of numbers, as in the small example below.
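For example, with a made-up toy dictionary (the words and indices here are purely illustrative):

```python
# A toy word-to-index dictionary; indices 0 and 1 are reserved for padding
# and unknown words (an assumption made for this illustration).
word2index = {"<PAD>": 0, "<UNK>": 1, "this": 2, "movie": 3, "is": 4, "great": 5}

sentence = ["this", "movie", "is", "great"]
# Serialization: look each word up, falling back to <UNK> for unseen words.
numbers = [word2index.get(w, word2index["<UNK>"]) for w in sentence]
print(numbers)  # [2, 3, 4, 5]
```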
1.2 The Text Serialization Process
Before implementing text serialization, the following points should be considered (a sketch of a vocabulary class covering them follows this list):

- How to map words to numbers using a dictionary;
- Different words occur with very different frequencies: should high-frequency or low-frequency words be filtered out?
- Once the dictionary is built, how to convert a sentence into a sequence of numbers, and how to convert a sequence of numbers back into a sentence;
- Sentences have different lengths: how to make all sentences in a batch the same length;
- How to handle words that do not appear in the dictionary (represent them with a special token).
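The code in section 2.2 assumes a vocabulary object `ws` (loaded from `ws.pkl`) that provides `transform(sentence, max_len)`, a `PAD` index, and a length. Its implementation is not reproduced in this chapter, so the sketch below is only an illustration of how such a class could address the points above; the class name `WordSequence`, the `fit`/`build_vocab`/`inverse_transform` methods, and the `min_count`/`max_count`/`max_features` parameters are assumptions made for this sketch.

```python
class WordSequence:
    """Minimal vocabulary sketch: maps words to indices and back."""
    PAD_TAG, UNK_TAG = "<PAD>", "<UNK>"
    PAD, UNK = 0, 1

    def __init__(self):
        self.dict = {self.PAD_TAG: self.PAD, self.UNK_TAG: self.UNK}
        self.count = {}  # word -> raw frequency

    def fit(self, sentence):
        # Accumulate word frequencies from one tokenized sentence.
        for word in sentence:
            self.count[word] = self.count.get(word, 0) + 1

    def build_vocab(self, min_count=5, max_count=None, max_features=None):
        # Filter out rare (and optionally very frequent) words,
        # then assign each remaining word a unique index.
        counts = {w: c for w, c in self.count.items() if c >= min_count}
        if max_count is not None:
            counts = {w: c for w, c in counts.items() if c <= max_count}
        if max_features is not None:
            counts = dict(sorted(counts.items(), key=lambda x: -x[1])[:max_features])
        for word in counts:
            self.dict[word] = len(self.dict)
        self.inverse_dict = {idx: word for word, idx in self.dict.items()}

    def transform(self, sentence, max_len=None):
        # Sentence -> list of indices, padded or truncated to max_len.
        if max_len is not None:
            sentence = sentence[:max_len] + [self.PAD_TAG] * (max_len - len(sentence))
        return [self.dict.get(word, self.UNK) for word in sentence]

    def inverse_transform(self, indices):
        # Indices -> words, useful for inspecting serialized sentences.
        return [self.inverse_dict.get(idx, self.UNK_TAG) for idx in indices]

    def __len__(self):
        return len(self.dict)
```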
2. Building the Text Sentiment Classification Model
2.1 Overall Workflow of Text Sentiment Classification
- Data preprocessing: read the text data and load it in batches using DataLoader; see data_prepare.py for the implementation.
- Text serialization: convert the text data into vector representations (word embeddings); this relies on the vocabulary built and saved by save_ws.py (a sketch of which follows this list).
- Model construction and evaluation: implemented in model.py.
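save_ws.py builds the vocabulary over the training reviews and pickles it as `ws.pkl`. Since its code is not shown in this chapter, the following is only a rough sketch of what it might contain; the `word_sequence` module name, the `min_count=5` threshold, and the simple whitespace tokenization are assumptions made for this sketch.

```python
import os
import pickle

from word_sequence import WordSequence  # hypothetical module holding the sketch above

data_base_path = r"data\aclImdb"

if __name__ == '__main__':
    ws = WordSequence()
    # Fit the vocabulary on the raw training reviews.
    for sub_dir in ["train/neg", "train/pos"]:
        dir_path = os.path.join(data_base_path, sub_dir)
        for file_name in os.listdir(dir_path):
            text = open(os.path.join(dir_path, file_name), encoding="utf-8").read()
            ws.fit(text.lower().split())  # simple whitespace tokenization for the sketch
    ws.build_vocab(min_count=5)
    # Persist the vocabulary so data_prepare.py and model.py can load it.
    pickle.dump(ws, open("ws.pkl", "wb"))
    print(len(ws))
```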
2.2 Code
data_prepare.py (data reading and batching):

```python
import torch
from torch.utils.data import DataLoader, Dataset
import os
import re
import pickle

data_base_path = r"data\aclImdb"
# Vocabulary built by save_ws.py; maps words to indices.
ws = pickle.load(open("ws.pkl", "rb"))
max_len = 20


def tokenize(text):
    # Punctuation and control characters to strip from the raw review text.
    filters = ['!', '"', '#', '$', '%', '&', '\(', '\)', '\*', '\+', ',', '-', '\.', '/', ':', ';', '<', '=', '>',
               '\?', '@', '\[', '\\\\', '\]', '\^', '_', '`', '\{', '\|', '\}', '~', '\t', '\n', '\x97', '\x96', '”', '“']
    # Remove HTML tags such as <br />, then the filtered characters.
    text = re.sub("<.*?>", " ", text, flags=re.S)
    text = re.sub("|".join(filters), " ", text, flags=re.S)
    return [i.strip() for i in text.split()]


class ImdbDataset(Dataset):
    def __init__(self, mode):
        super(ImdbDataset, self).__init__()
        if mode == "train":
            text_path = [os.path.join(data_base_path, i) for i in ["train/neg", "train/pos"]]
        else:
            text_path = [os.path.join(data_base_path, i) for i in ["test/neg", "test/pos"]]
        self.total_file_path_list = []
        for i in text_path:
            self.total_file_path_list.extend([os.path.join(i, j) for j in os.listdir(i)])

    def __getitem__(self, idx):
        cur_path = self.total_file_path_list[idx]
        cur_filename = os.path.basename(cur_path)
        # File names look like "id_rating.txt"; the rating (1-10) minus 1 becomes the label (0-9).
        label = int(cur_filename.split("_")[-1].split(".")[0]) - 1
        text = tokenize(open(cur_path, encoding="utf-8").read().strip())
        return label, text

    def __len__(self):
        return len(self.total_file_path_list)


def collate_fn(batch):
    # Each item is (label, token list); serialize and pad the token lists so they can be stacked.
    label, content = list(zip(*batch))
    content = [ws.transform(i, max_len=max_len) for i in content]
    content = torch.LongTensor(content)
    label = torch.LongTensor(label)
    return label, content


def get_dataloader(train_data=True):
    mode = "train" if train_data else "test"
    imdb_dataset = ImdbDataset(mode)
    dataloader = DataLoader(dataset=imdb_dataset, batch_size=10, shuffle=True, collate_fn=collate_fn)
    return dataloader


if __name__ == '__main__':
    text = "I cannot stay indifferent<br></br> to Lars| van Trier's films. "
    s = tokenize(text)
    dataset = ImdbDataset(mode="train")
    dataloader = get_dataloader()
    for idx, (label, text) in enumerate(dataloader):
        print("idx:", idx)
        print("label:", label)
        print("text:", text)
        break
```
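With the settings above (batch_size=10, max_len=20), each batch produced by get_dataloader() is a pair of tensors: labels of shape (10,) and serialized text of shape (10, 20). A quick sanity check (illustrative, not part of the original script):

```python
label, content = next(iter(get_dataloader()))
print(label.shape)    # torch.Size([10])
print(content.shape)  # torch.Size([10, 20])
```

The (10, 20) tensor is what the model below embeds to (10, 20, 300) and flattens to (10, 6000) before its linear layer.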
model.py (model construction, training, and evaluation):

```python
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam

from unit21.data_prepare import get_dataloader, max_len

ws = pickle.load(open("ws.pkl", "rb"))


class IMDBModel(nn.Module):
    def __init__(self, max_len):
        super(IMDBModel, self).__init__()
        # Embedding table: vocabulary size x 300; the padding index contributes zero vectors.
        self.embedding = nn.Embedding(len(ws), 300, padding_idx=ws.PAD)
        # Flattened embeddings (max_len * 300 features) -> 10 classes.
        self.fc = nn.Linear(max_len * 300, 10)

    def forward(self, x):
        embed = self.embedding(x)          # (batch, max_len, 300)
        embed = embed.view(x.size(0), -1)  # (batch, max_len * 300)
        out = self.fc(embed)
        return F.log_softmax(out, dim=-1)


model = IMDBModel(max_len)
optimizer = Adam(model.parameters(), 0.001)


def train(epoch):
    train_dataloader = get_dataloader()
    for idx, (target, input) in enumerate(train_dataloader):
        optimizer.zero_grad()
        output = model(input)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if idx % 10 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, idx * len(input), len(train_dataloader.dataset),
                100. * idx / len(train_dataloader), loss.item()))
            torch.save(model.state_dict(), "imdb_net.pkl")


def test():
    test_loss = 0
    correct = 0
    model.load_state_dict(torch.load("imdb_net.pkl"))
    model.eval()
    test_dataloader = get_dataloader(train_data=False)
    with torch.no_grad():
        for target, input in test_dataloader:
            output = model(input)
            test_loss += F.nll_loss(output, target, reduction="sum").item()
            pred = torch.max(output, dim=-1, keepdim=False)[-1]
            correct += pred.eq(target.data).sum()
    test_loss = test_loss / len(test_dataloader.dataset)
    print('\nTest set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        test_loss, correct, len(test_dataloader.dataset),
        100. * correct / len(test_dataloader.dataset)))


if __name__ == '__main__':
    epoch = 1
    train(epoch)
```
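Once trained, the model can also be used to score an individual review. The helper below is an illustrative sketch rather than part of the tutorial code; the example sentence is made up, and mapping the ten output classes back to 1-10 ratings simply undoes the "- 1" applied in __getitem__.

```python
from unit21.data_prepare import tokenize  # reuse the same tokenizer as training

def predict(sentence):
    # Serialize the review with the shared vocabulary and add a batch dimension.
    tokens = ws.transform(tokenize(sentence), max_len=max_len)
    input = torch.LongTensor([tokens])
    model.eval()
    with torch.no_grad():
        output = model(input)
    # Classes 0-9 correspond to the original 1-10 ratings (see __getitem__ above).
    return output.argmax(dim=-1).item() + 1

print(predict("This movie is absolutely wonderful, I loved every minute of it."))
```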