【参考:pytorch_BiLSTM 命名实体识别 手写代码_哔哩哔哩_bilibili】
【参考:shouxieai/nlp-bilstm_crf-ner: nlp-bilstm+crf-ner】
"""
2022/4/22
"""
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from tqdm import tqdm
def build_corpus(data_type, make_vocab=True, data_dir="data"):
    """Load a BMES-tagged corpus and optionally build the id vocabularies.

    Reads ``<data_dir>/<data_type>.char.bmes``. Every non-blank line must hold
    one token and one tag separated by whitespace; blank (or whitespace-only)
    lines separate sentences.

    :param data_type: split name used as the file-name prefix, e.g. ``'train'``.
    :param make_vocab: when true, also build and return ``word2id``/``tag2id``.
    :param data_dir: directory containing the corpus file.
    :return: ``(word_lists, tag_lists)`` sorted by ascending sentence length,
        or ``(word_lists, tag_lists, word2id, tag2id)`` when ``make_vocab``
        is true. ``word2id`` gains ``<UNK>`` and ``<PAD>``; ``tag2id`` gains
        ``<PAD>``.
    :raises ValueError: if a non-blank line does not split into two fields.
    """
    word_lists, tag_lists = [], []
    path = os.path.join(data_dir, data_type + '.char.bmes')
    with open(path, 'r', encoding='utf-8') as f:
        word_list, tag_list = [], []
        for line in f:
            fields = line.split()
            if fields:
                word, tag = fields
                word_list.append(word)
                tag_list.append(tag)
            elif word_list:
                # Sentence boundary. Repeated blank lines are ignored so that
                # no empty sentence ends up in the corpus.
                word_lists.append(word_list)
                tag_lists.append(tag_list)
                word_list, tag_list = [], []
        if word_list:
            # The file did not end with a blank line: keep the last sentence
            # instead of silently dropping it.
            word_lists.append(word_list)
            tag_lists.append(tag_list)

    # Sort sentences and their tags together so the pairing can never drift.
    # The sort is stable, so equal-length sentences keep their file order.
    pairs = sorted(zip(word_lists, tag_lists), key=lambda pair: len(pair[0]))
    word_lists = [words for words, _ in pairs]
    tag_lists = [tags for _, tags in pairs]

    if make_vocab:
        word2id = build_map(word_lists)
        tag2id = build_map(tag_lists)
        # Special symbols are appended last so they take the highest ids.
        word2id['<UNK>'] = len(word2id)
        word2id['<PAD>'] = len(word2id)
        tag2id['<PAD>'] = len(tag2id)
        return word_lists, tag_lists, word2id, tag2id
    return word_lists, tag_lists
def build_map(lists):
    """Assign consecutive integer ids to the distinct elements of nested lists.

    :param lists: a two-level iterable (e.g. a list of sentences).
    :return: dict mapping each distinct element to its id; ids start at 0 and
        follow the order in which elements are first seen.
    """
    maps = {}
    for row in lists:
        for item in row:
            # setdefault only inserts when the item is new, so an already
            # known item keeps its original id.
            maps.setdefault(item, len(maps))
    return maps
class MyDataset(Dataset):
    """Dataset of (token sequence, tag sequence) pairs encoded as id lists."""

    def __init__(self, datas, tags, word2idx, tag2idx):
        """Store the raw sequences and the lookup tables used to encode them.

        :param datas: list of token sequences.
        :param tags: list of tag sequences, parallel to ``datas``.
        :param word2idx: token -> id map; must contain ``<UNK>`` and ``<PAD>``.
        :param tag2idx: tag -> id map; must contain ``<PAD>``.
        """
        self.word2idx = word2idx
        self.tag2idx = tag2idx
        self.datas = datas
        self.tags = tags

    def __getitem__(self, index):
        """Return ``(token_ids, tag_ids)`` for the sample at ``index``.

        Tokens missing from ``word2idx`` are encoded as ``<UNK>``; a tag
        missing from ``tag2idx`` raises ``KeyError``.
        """
        tokens = self.datas[index]
        labels = self.tags[index]
        token_ids = []
        for token in tokens:
            token_ids.append(self.word2idx.get(token, self.word2idx["<UNK>"]))
        label_ids = []
        for label in labels:
            label_ids.append(self.tag2idx[label])
        return token_ids, label_ids

    def __len__(self):
        """Return the number of samples."""
        return len(self.datas)

    def pro_batch_data(self, batch_datas):
        """Collate function: right-pad a batch to its longest token sequence.

        :param batch_datas: iterable of ``(token_ids, tag_ids)`` pairs.
        :return: two ``int64`` tensors, padded token ids and padded tag ids.
        """
        pairs = [(seq, labels) for seq, labels in batch_datas]
        longest = max(len(seq) for seq, _ in pairs)
        word_pad = self.word2idx['<PAD>']
        tag_pad = self.tag2idx['<PAD>']
        padded_words = [seq + [word_pad] * (longest - len(seq)) for seq, _ in pairs]
        padded_tags = [labels + [tag_pad] * (longest - len(labels)) for _, labels in pairs]
        word_tensor = torch.tensor(padded_words, dtype=torch.int64)
        tag_tensor = torch.tensor(padded_tags, dtype=torch.long)
        return word_tensor, tag_tensor
class Mymodel(nn.Module):
    """Sequence tagger: embedding layer, (Bi)LSTM, then a linear classifier."""

    def __init__(self, corpus_num, embedding_dim, hidden_num, class_num, bi=True):
        """Build the layers.

        :param corpus_num: number of rows in the embedding table.
        :param embedding_dim: size of each embedding vector.
        :param hidden_num: LSTM hidden size per direction.
        :param class_num: number of output classes per token.
        :param bi: whether the LSTM is bidirectional.
        """
        super().__init__()
        self.embedding = nn.Embedding(corpus_num, embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_num,
            batch_first=True,
            bidirectional=bi,
        )
        # A bidirectional LSTM concatenates both directions, doubling the
        # feature size fed to the classifier.
        directions = 2 if bi else 1
        self.classifier = nn.Linear(hidden_num * directions, class_num)

    def forward(self, batch_data):
        """Return per-token class scores for a batch of token-id sequences.

        :param batch_data: integer tensor of token ids, batch dimension first.
        :return: tensor of scores whose last dimension is ``class_num``.
        """
        embedded = self.embedding(batch_data)
        lstm_out = self.lstm(embedded)[0]  # drop the (h_n, c_n) state tuple
        return self.classifier(lstm_out)
if __name__ == "__main__":
    # Script entry point: train (or load) the tagger, evaluate it on the dev
    # split, then tag sentences typed by the user.

    # Vocabularies are built from the training split only.
    train_data, train_tag, word2id, tag2id = build_corpus('train')
    # tag2id assigns ids in insertion order, so its keys listed in order give
    # the id -> tag table.
    id2tag = list(tag2id)
    dev_data, dev_tag = build_corpus('dev', make_vocab=False)

    # Model sizes come from the vocabularies (which include <UNK>/<PAD>), not
    # from the number of training sentences.
    # NOTE: a 'model.pth' saved with the earlier sizing has differently shaped
    # weights and must be deleted before running this version.
    corpus_num = len(word2id)
    class_num = len(tag2id)
    pad_tag_id = tag2id['<PAD>']

    epochs = 20
    train_batch_size = 30
    dev_batch_size = 30
    embedding_dim = 100
    hidden_num = 107
    bi = True
    lr = 0.001

    # The corpus is sorted by sentence length, so shuffle=False keeps
    # similar-length sentences in the same batch and limits padding.
    train_dataset = MyDataset(train_data, train_tag, word2id, tag2id)
    train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=False,
                                  collate_fn=train_dataset.pro_batch_data)
    dev_dataset = MyDataset(dev_data, dev_tag, word2id, tag2id)
    dev_dataloader = DataLoader(dev_dataset, batch_size=dev_batch_size, shuffle=False,
                                collate_fn=dev_dataset.pro_batch_data)

    model = Mymodel(corpus_num, embedding_dim, hidden_num, class_num, bi)
    # Padding positions carry no label information; exclude them from the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=pad_tag_id)
    # Named `optimizer` so the imported `optim` module is not shadowed.
    optimizer = optim.Adam(model.parameters(), lr=lr)

    if not os.path.exists('model.pth'):
        for epoch in range(epochs):
            model.train()
            train_loss = 0.0
            for batch_data, batch_tag in tqdm(train_dataloader):
                optimizer.zero_grad()
                predict = model(batch_data)
                # Flatten to (tokens, classes) / (tokens,) for the loss.
                predict = predict.reshape(-1, predict.shape[-1])
                batch_tag = batch_tag.reshape(-1)
                loss = criterion(predict, batch_tag)
                loss.backward()
                optimizer.step()
                train_loss += loss.item()
            # The criterion already averages over tokens, so report the mean
            # over batches.
            train_loss = train_loss / max(len(train_dataloader), 1)
            print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch, train_loss))
        torch.save({'model': model.state_dict()}, 'model.pth')
    else:
        state_dict = torch.load('model.pth')
        model.load_state_dict(state_dict['model'])

    run_eval = True  # named so the builtin `eval` is not shadowed
    if run_eval:
        model.eval()
        with torch.no_grad():
            dev_loss = 0.0
            real_label = []
            predict_label = []
            for batch_data, batch_tag in tqdm(dev_dataloader):
                predict = model(batch_data)
                predict = predict.reshape(-1, predict.shape[-1])
                predict_class = torch.argmax(predict, dim=1)
                predict_label.append(predict_class.numpy())
                batch_tag = batch_tag.reshape(-1)
                real_label.append(batch_tag.numpy())
                loss = criterion(predict, batch_tag)
                dev_loss += loss.item()
            real_label = np.concatenate(real_label)
            predict_label = np.concatenate(predict_label)
            # Score only real tokens; padding would otherwise distort accuracy.
            mask = real_label != pad_tag_id
            acc = np.sum(real_label[mask] == predict_label[mask]) / max(int(mask.sum()), 1)
            dev_loss = dev_loss / max(len(dev_dataloader), 1)
            print(f'dev_loss:{dev_loss},acc:{acc}')

    # Interactive tagging. An empty line or EOF ends the session.
    model.eval()
    while True:
        try:
            text = input("请输入:")
        except EOFError:
            break
        if not text:
            break
        # Characters unseen in training map to <UNK>; <PAD> is for padding only.
        text_index = [[word2id.get(ch, word2id["<UNK>"]) for ch in text]]
        text_index = torch.tensor(text_index, dtype=torch.int64)
        with torch.no_grad():
            predict = model(text_index)
        predict = torch.argmax(predict, dim=-1).reshape(-1)
        pre = [id2tag[i] for i in predict.tolist()]
        print([f'{w}_{s}' for w, s in zip(text, pre)])
模型效果不太理想,以下为交互式预测的示例输出:
请输入:一是加快和海口市政府及有关部门沟通衔接,就桂林洋校区的围墙边界、规划区土地使用等问题达成一致意见。二是组织地理与环境科学学院、生命科学学院、体育学院等有关教学单位实地查看,有效利用未开发的区域,作为培养学生的实训和实验基地。三是按照校园总体规划,进一步加强桂林洋校区基础设施的建设和管理工作,同时打造和谐向上的校园文化。
['一_O', '是_O', '加_O', '快_O', '和_O', '海_O', '口_M-ORG', '市_M-ORG', '政_M-ORG', '府_E-ORG', '及_O', '有_O', '关_O', '部_O', '门_O', '沟_O', '通_O', '衔_M-TITLE', '接_O', ',_O', '就_O', '桂_O', '林_O', '洋_O', '校_O', '区_O', '的_O', '围_O', '墙_O', '边_O', '界_O', '、_O', '规_B-ORG', '划_M-ORG', '区_M-ORG', '土_M-ORG', '地_M-ORG', '使_O', '用_O', '等_O', '问_O', '题_O', '达_O', '成_O', '一_O', '致_O', '意_O', '见_O', '。_O', '二_O', '是_O', '组_O', '织_O', '地_O', '理_O', '与_O', '环_O', '境_O', '科_O', '学_O', '学_O', '院_O', '、_O', '生_O', '命_M-ORG', '科_M-ORG', '学_M-ORG', '学_M-ORG', '院_E-ORG', '、_O', '体_B-ORG', '育_M-ORG', '学_M-ORG', '院_E-ORG', '等_O', '有_O', '关_O', '教_O', '学_O', '单_O', '位_O', '实_M-EDU', '地_M-EDU', '查_O', '看_O', ',_O', '有_O', '效_O', '利_O', '用_O', '未_O', '开_O', '发_O', '的_O', '区_O', '域_O', ',_O', '作_O', '为_O', '培_O', '养_O', '学_O', '生_O', '的_O', '实_O', '训_O', '和_O', '实_O', '验_O', '基_O', '地_O', '。_O', '三_O', '是_O', '按_O', '照_O', '校_O', '园_O', '总_O', '体_O', '规_O', '划_O', ',_O', '进_O', '一_O', '步_O', '加_O', '强_O', '桂_O', '林_O', '洋_O', '校_O', '区_O', '基_O', '础_O', '设_O', '施_E-TITLE', '的_O', '建_O', '设_O', '和_O', '管_O', '理_O', '工_O', '作_O', ',_O', '同_O', '时_O', '打_O', '造_O', '和_O', '谐_O', '向_M-ORG', '上_M-ORG', '的_O', '校_O', '园_E-ORG', '文_O', '化_E-TITLE', '。_O']
请输入:会议指出,桂林洋校区基础设施建设已经取得重要进展,学生公寓、第二公共教学楼等设施相继投入使用,进一步改善了学生的学习和生活条件。为进一步落实省委“能力提升年”的安排部署,切实加快基础设施建设,更好服务全校师生。
['会_O', '议_O', '指_O', '出_O', ',_O', '桂_O', '林_M-ORG', '洋_M-ORG', '校_M-ORG', '区_O', '基_M-ORG', '础_M-ORG', '设_M-ORG', '施_M-ORG', '建_M-ORG', '设_M-ORG', '已_M-ORG', '经_M-ORG', '取_O', '得_O', '重_O', '要_O', '进_O', '展_O', ',_O', '学_O', '生_O', '公_O', '寓_O', '、_O', '第_B-ORG', '二_O', '公_M-ORG', '共_O', '教_O', '学_O', '楼_O', '等_O', '设_O', '施_O', '相_O', '继_M-ORG', '投_E-ORG', '入_O', '使_O', '用_E-TITLE', ',_O', '进_O', '一_O', '步_O', '改_O', '善_O', '了_O', '学_O', '生_O', '的_O', '学_O', '习_O', '和_O', '生_O', '活_O', '条_O', '件_O', '。_O', '为_O', '进_O', '一_O', '步_O', '落_O', '实_O', '省_O', '委_O', '“_O', '能_O', '力_O', '提_O', '升_O', '年_O', '”_O', '的_O', '安_O', '排_O', '部_O', '署_O', ',_O', '切_O', '实_O', '加_O', '快_O', '基_M-ORG', '础_O', '设_O', '施_M-TITLE', '建_E-ORG', '设_E-ORG', ',_O', '更_O', '好_O', '服_O', '务_O', '全_M-TITLE', '校_M-TITLE', '师_O', '生_O', '。_O']
请输入:校领导过建春、刁晓平、陈险峰、李森、韩尚峰、刘汝兵、黄忆军、王任斌调研桂林洋校区基建工作并召开现场办公会,相关职能部门负责人参加调研
['校_O', '领_O', '导_O', '过_O', '建_O', '春_O', '、_O', '刁_O', '晓_M-ORG', '平_M-ORG', '、_O', '陈_M-ORG', '险_M-ORG', '峰_M-ORG', '、_O', '李_M-ORG', '森_M-ORG', '、_O', '韩_O', '尚_M-ORG', '峰_M-ORG', '、_M-ORG', '刘_M-ORG', '汝_M-ORG', '兵_E-NAME', '、_O', '黄_M-ORG', '忆_M-ORG', '军_M-ORG', '、_M-ORG', '王_M-ORG', '任_M-ORG', '斌_M-ORG', '调_M-ORG', '研_M-ORG', '桂_M-ORG', '林_M-ORG', '洋_M-ORG', '校_M-ORG', '区_M-ORG', '基_M-ORG', '建_M-ORG', '工_M-ORG', '作_M-ORG', '并_M-ORG', '召_M-ORG', '开_M-ORG', '现_O', '场_M-TITLE', '办_M-TITLE', '公_M-TITLE', '会_E-TITLE', ',_O', '相_O', '关_O', '职_O', '能_M-TITLE', '部_M-TITLE', '门_M-TITLE', '负_M-TITLE', '责_M-TITLE', '人_E-TITLE', '参_O', '加_O', '调_M-TITLE', '研_M-TITLE']
|