logging.conf
[loggers]
keys=root,std,file
[logger_root]
level=DEBUG
handlers=hand01,hand02,hand03
[logger_std]
handlers=hand01,hand02,hand03
qualname=std
propagate=0
[logger_file]
handlers=hand01,hand01_file,hand02_file
qualname=file
propagate=0
[handlers]
keys=hand01,hand02,hand03,hand01_file,hand02_file
[handler_hand01]
class=logging.StreamHandler
level=WARNING
formatter=form02
args=(sys.stderr,)
[handler_hand02]
class=logging.StreamHandler
level=ERROR
formatter=form03
args=(sys.stderr,)
[handler_hand03]
class=logging.StreamHandler
level=INFO
formatter=form03
args=(sys.stderr,)
[handler_hand01_file]
class=logging.FileHandler
level=ERROR
formatter=form01
args=('log/error.log', 'a', 'utf8')
[handler_hand02_file]
class=logging.handlers.RotatingFileHandler
level=INFO
formatter=form01
args=('log/log.log', 'a', 500*1024*1024, 5, 'utf8')
[formatters]
keys=form01,form02,form03
[formatter_form01]
format=%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s
datefmt=%Y %m %d %H:%M:%S
[formatter_form02]
format=%(name)-12s: %(levelname)-8s %(message)s
datefmt=%Y %m %d %H:%M:%S
[formatter_form03]
format=[%(asctime)s][%(levelname)s] %(message)s
datefmt=%Y %m %d %H:%M:%S
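Note that fileConfig does not recognize a standalone encoding key in a handler section, which is why the encoding is passed through the handlers' args above. A minimal sketch of how this config is consumed at startup (a hedged example: the log/ directory is inferred from the handler paths above, and fileConfig will not create it):

import os
import logging
import logging.config

# FileHandler/RotatingFileHandler open their files immediately, so the
# directory referenced in the handler args has to exist first.
os.makedirs('log', exist_ok=True)

logging.config.fileConfig('./config/logging.conf')

# 'std' matches the qualname of [logger_std]; propagate=0 keeps records
# from also reaching the root logger's handlers.
logger = logging.getLogger('std')
logger.info('service started')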
multinegativeentity.py
"""
@about : negative-entity service
@time : 2020/11/24 10:19
@author : fangsh
"""
import json
import time
import os
import torch
import torch.nn as nn
import numpy as np
from flask import Flask, request
from transformers import BertModel, BertConfig, BertTokenizer
import logging
import logging.config
from utils import entity_replace_multi_key, text_processed, entity_processed
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
logging.config.fileConfig('./config/logging.conf')
logger = logging.getLogger('std')
app = Flask(__name__)
class NeuralNet(nn.Module):
"""定义网络结构: bert对最后一层所有隐向量使用GAP,MAP
"""
def __init__(self, pretrained_path, num_labels=2):
super(NeuralNet, self).__init__()
self.config = BertConfig.from_pretrained(pretrained_path, num_labels=num_labels)
self.bert = BertModel.from_pretrained(pretrained_path, config=self.config)
self.dropout = nn.Dropout(.5)
self.linear = nn.Linear(3072, num_labels)
self.gap = torch.nn.AdaptiveAvgPool1d(1)
self.gmp = torch.nn.AdaptiveMaxPool1d(1)
def forward(self, input_ids, input_mask, input_seg):
last_hidden_states, pooled_out = self.bert(input_ids=input_ids,
attention_mask=input_mask,
token_type_ids=input_seg)
embedding = last_hidden_states.permute(0, 2, 1)
q = self.gap(embedding).squeeze(dim=-1)
a = self.gmp(embedding).squeeze(dim=-1)
t = last_hidden_states[:, -1]
e = last_hidden_states[:, 0]
x = torch.cat([q, a, t, e], dim=1)
out = self.linear(x)
out = out.view(-1, self.config.num_labels)
return out
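# A hedged shape trace of forward() above (not from the original source):
# with hidden size 768, last_hidden_states is (batch, seq_len, 768) and
# permute gives (batch, 768, seq_len). GAP and GMP each collapse the token
# axis to (batch, 768, 1), squeezed to (batch, 768); t and e are the
# last-token and [CLS] vectors, also (batch, 768) each. The concatenation is
# therefore (batch, 4 * 768) = (batch, 3072), matching the in_features of
# self.linear. (pooled_out and self.dropout are defined but unused here.)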
BERT_MODEL_PATH = './RoBERTa_zh_L12_PyTorch'
MAX_LENGTH = 200
NUM_LABELS = 2
MODEL_WEIGHT = './model_save/roberta-datav12_1.bin'
with torch.no_grad():
model = NeuralNet(BERT_MODEL_PATH, num_labels=NUM_LABELS)
model.cuda()
model.eval()
model.load_state_dict(torch.load(MODEL_WEIGHT))
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH)
def convert_to_feature(text):
inputs = tokenizer.encode_plus(text, max_length=MAX_LENGTH, truncation=True)
input_ids, input_mask, input_seg = inputs["input_ids"], inputs["attention_mask"], inputs["token_type_ids"]
padding_length = MAX_LENGTH - len(input_ids)
input_ids += [0] * padding_length
input_mask += [0] * padding_length
input_seg += [0] * padding_length
return input_ids, input_seg, input_mask
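# A hedged note on convert_to_feature (not in the original): the tuple comes
# back as (input_ids, input_seg, input_mask), token-type ids *before* the
# attention mask, and each list is zero-padded to MAX_LENGTH. The caller
# below unpacks in that same order, and the model's forward() then takes
# (input_ids, input_mask, input_seg).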
def multi_ent_predict(title, entities):
batch_token_ids, batch_segment_ids, batch_atten_mask = [], [], []
text = text_processed(title)
entities = [entity_processed(ent) for ent in entities]
for ent in entities:
text_p = entity_replace_multi_key(text, ent, entities)
logger.info("处理后:%s" % text_p)
token_ids, segment_ids, atten_mask = convert_to_feature(text_p)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_atten_mask.append(atten_mask)
with torch.no_grad():
batch_ids_tensor = torch.tensor(batch_token_ids, dtype=torch.long)
batch_seg_tensor = torch.tensor(batch_segment_ids, dtype=torch.long)
batch_atten_tensor = torch.tensor(batch_atten_mask, dtype=torch.long)
batch = [batch_ids_tensor, batch_atten_tensor, batch_seg_tensor]
batch = tuple(t.cuda() for t in batch)
pred = model(*batch)
pred = np.argmax(pred.cpu().detach().numpy(), axis=1)
logger.info(pred)
pred_ent = []
for i, j in zip(entities, pred):
if j == 1:
pred_ent.append(i)
return pred_ent
@app.route('/upload_and_predict', methods=['GET', 'POST'])
def upload_and_predict():
    result = {
        'code': 0,
        'msg': 'OK',
        'body': [],
        'sysnum': 'null'
    }
if request.method == 'POST':
t1 = time.time()
        try:
            data = request.get_data()
            json_data = json.loads(data.decode('utf-8'))
        except Exception:
            result['msg'] = 'ERROR: invalid request parameters'
            result['code'] = 10001
            # Return early: json_data is undefined if parsing failed.
            return json.dumps(result, ensure_ascii=False)
        logger.info(json_data)
        for i in json_data:
            text = i.get("text")
            entitys = i.get("entitys")
            entitys = entitys.split(',')
            pred_ent = multi_ent_predict(text, entitys)
            logger.info('text: {} entitys: {}'
                        .format(text, entitys))
            logger.info('pred_entitys: {} time: {:5.2f}s'
                        .format(pred_ent, time.time() - t1))
            result['body'].append(pred_ent)
        return json.dumps(result, ensure_ascii=False)
else:
        result['msg'] = 'ERROR: only POST requests are supported'
result['code'] = 10001
return json.dumps(result, ensure_ascii=False)
class MyException(Exception):
def __init__(self, message):
super().__init__()
self.message = message
if __name__ == "__main__":
app.config['JSON_AS_ASCII'] = False
app.run(host='0.0.0.0', port=21130, threaded=False)
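With the service running, the endpoint can be exercised with any HTTP client. A minimal sketch (hedged: host, port and the sample strings are placeholders; the payload shape, a JSON list of objects with a text field and a comma-separated entitys string, follows the handler above):

import requests

payload = [
    {"text": "company_a was fined for violations; company_b was not affected",
     "entitys": "company_a,company_b"},
]
# threaded=False in app.run means requests are served one at a time.
resp = requests.post('http://127.0.0.1:21130/upload_and_predict', json=payload)
# 'body' holds one list of predicted negative entities per input item.
print(resp.json()['body'])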
utils.py
import re
import pandas as pd
from htmllaundry import strip_markup
def strQ2B(ustring):
    """Convert full-width characters to half-width."""
    rstring = ""
    for uchar in ustring:
        inside_code = ord(uchar)
        if inside_code == 12288:  # full-width (ideographic) space -> ASCII space
            inside_code = 32
        elif 65281 <= inside_code <= 65374:  # full-width ASCII block
            inside_code -= 65248
        rstring += chr(inside_code)
    return rstring
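# A hedged illustration of the mapping above: the ideographic space (U+3000,
# code 12288) becomes an ASCII space, and the full-width ASCII block
# U+FF01..U+FF5E (65281..65374) is shifted down by 65248, so
#   strQ2B("ＡＢ１２，") == "AB12,"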
def entity_processed(ent):
    """
    Normalize an entity so it matches the processed text. Known mismatches:
    1. Upper/lower case differences in English names.
    2. Full-width parentheses in the entity, while the text has been
       normalized to half-width.
    3. Part of one company name recognized as another company: once the full
       name has been replaced, that substring can no longer be found, so it
       is simply dropped.
    :param ent:
    :return:
    """
    ent = ent.lower()
    ent = ent.replace("（", "(")
    ent = ent.replace("）", ")")
    return ent
def text_processed(text):
    """
    Preprocess the text. (Still to decide: whether special markers should be
    added at the entity positions in the text.)
    :param text:
    :return:
    """
    text = text.lower()
    text = strip_markup(text)
    text = strQ2B(text)
    text = re.sub(r"&[a-z]+;", "", text)  # strip leftover HTML entities
    text = re.sub(r"&[a-z]+$", "", text)  # and a truncated trailing one
    text = re.sub(r"\t", "", text)
    text = re.sub(r"\n", "", text)
    text = re.sub(r"“", "\"", text)
    text = re.sub(r"”", "\"", text)
    text = text.replace("@", "")  # '@' is reserved as the entity mask character
    return text
def entity_replace_one_key(sentence, cur_entity, entity_list):
    """Replace the entity currently being predicted in the sentence, handling
    the case where a short entity is contained in a longer one: e.g. with
    "中国信安" and "青海中国信安", naively replacing "中国信安" would also
    clobber part of the longer entity.
    Only handles the case where each entity has a single keyword.
    """
    entity_list.sort(key=lambda x: len(x), reverse=True)
    index = entity_list.index(cur_entity)
    flag = False
    for i in entity_list[:index]:
        if cur_entity in i:
            flag = True
    if flag:
        # Shield every longer entity that contains cur_entity with a
        # placeholder, replace cur_entity, then restore the placeholders.
        for i, entity in enumerate(entity_list[:index]):
            sentence = sentence.replace(entity, "<entity_%d>" % i)
        sentence = sentence.replace(cur_entity, "@" * len(cur_entity))
        for i, entity in enumerate(entity_list[:index]):
            sentence = sentence.replace("<entity_%d>" % i, entity)
    else:
        sentence = sentence.replace(cur_entity, "@" * len(cur_entity))
    return sentence
def entity_replace_multi_key(sentence, cur_entity, entity_list):
    """
    Handle entities that may each carry several keywords, joined by "_".
    :param sentence:
    :param cur_entity:
    :param entity_list:
    :return:
    """
entity_list_split = []
for i in entity_list:
entity_list_split.extend(i.split("_"))
for i in cur_entity.split("_"):
sentence = entity_replace_one_key(sentence, i, entity_list_split)
return sentence
def convert_to_single_ent(df):
    """
    Reshape the data to one entity per row and add a binary label marking
    whether the entity is negative.
    :return:
    """
data = []
for i in df.index.values:
text, all_entities, eng_entities = df.iloc[i]
text = text_processed(text)
all_entities, eng_entities = [entity_processed(ent) for ent in str(all_entities).split(",")], \
[entity_processed(ent) for ent in str(eng_entities).split(",")]
for ent in all_entities:
text_p = entity_replace_multi_key(text, ent, all_entities)
if "@" in text_p:
if ent in eng_entities:
data.append((text_p, ent, 1))
else:
data.append((text_p, ent, 0))
return pd.DataFrame(data)
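The interplay of the two replace helpers is easiest to see on a toy case. A hedged demo (the entity names are invented; in real inputs "_" joins the multiple keywords of one entity, as entity_replace_multi_key expects):

from utils import entity_replace_multi_key

sentence = "ab group and city ab group were both mentioned"
entities = ["ab group", "city ab group"]

# Masking "ab group" must not clobber the longer "city ab group": the longer
# entity is shielded with a placeholder first, then restored afterwards.
print(entity_replace_multi_key(sentence, "ab group", entities))
# -> "@@@@@@@@ and city ab group were both mentioned"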
run_classify.py
import torch
import pandas as pd
import numpy as np
import logging
import logging.config
from tqdm import tqdm
from torch import nn
from pytorch_transformers.modeling_bert import BertConfig, BertModel
from pytorch_transformers.tokenization_bert import BertTokenizer
from pytorch_transformers import AdamW
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
logging.config.fileConfig('./config/logging.conf')
logger = logging.getLogger('std')
max_length = 250
learning_rate = 2e-5
epochs = 15
patience = 5
batch_size = 16
do_train = True
do_eval = True
do_error_analyse = False
train_fpath = './data/processed/train.xlsx'
test_fpath = './data/processed/test.xlsx'
bert_model = '/data/sfang/BertPretrainedModel/torch/RoBERTa_zh_L12_PyTorch'
save_model = './model_save/'
file_name = 'roberta-FAQ-model'
class InputExample():
    """Data structure holding the raw fields of one example."""
    def __init__(self, text, entity, label):
        self.text = text
        self.entity = entity
        self.label = label

class InputFeature():
    """Data structure holding one example's features: the numeric arrays that
    can be fed straight into the model after preprocessing."""
    def __init__(self, x_ids, x_seg, atten_mask, label):
        self.features = {
            'x_ids': x_ids,
            'x_seg': x_seg,
            'atten_mask': atten_mask
        }
        self.label = label
class NeuralNet(nn.Module):
    """Network definition: the [CLS] pooled output concatenated with the mean
    of the entity-segment (token_type_id == 1) hidden states."""
    def __init__(self, pretrained_path, num_labels=2):
        super(NeuralNet, self).__init__()
        self.config = BertConfig.from_pretrained(pretrained_path, num_labels=num_labels)
        self.bert = BertModel.from_pretrained(pretrained_path, config=self.config)
        self.dropout = nn.Dropout(.5)
        self.linear = nn.Linear(self.config.hidden_size * 2, self.config.num_labels)
    def forward(self, x_ids, x_seg, x_mask):
        last_hidden_states, pooled_out = self.bert(input_ids=x_ids, token_type_ids=x_seg,
                                                   attention_mask=x_mask)
        pooled_output = self.dropout(pooled_out)
        # Mean-pool the hidden states of segment-B (the entity) tokens only.
        target = last_hidden_states * x_seg.float().unsqueeze(-1)
        target = target.sum(dim=1)
        target_div = x_seg.sum(dim=1)
        target = target.div(target_div.float().unsqueeze(-1))
        target_cls = torch.cat((target, pooled_output), -1)
        logits = self.linear(target_cls)
        return logits
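# A hedged sanity check of the segment-mean above (not part of the original
# script): for one sequence of length 4 with x_seg = [0, 0, 1, 1], only the
# last two token vectors contribute and target is their average:
#   h = torch.arange(8.).view(1, 4, 2)   # (batch, seq, hidden)
#   seg = torch.tensor([[0, 0, 1, 1]])
#   m = (h * seg.float().unsqueeze(-1)).sum(dim=1) / seg.sum(dim=1, keepdim=True).float()
#   # m -> tensor([[5., 6.]]), the mean of token vectors [4, 5] and [6, 7]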
def read_data(fpath):
    """Read the spreadsheet data."""
    examples = []
    df = pd.read_excel(fpath, header=None)
    df = df.dropna()
    for i in df.values.tolist():
        text, entity, label = i[1], i[2], i[4]
        examples.append(InputExample(text, entity, label))
    return examples, df
def convert_examples_to_features(examples, tokenizer):
    features = []
    for i, example in enumerate(examples):
        tokenA = tokenizer.tokenize(example.text)
        tokenB = tokenizer.tokenize(example.entity)
        # Reserve room for [CLS], two [SEP]s and the full entity.
        max_text_length = max_length - 3 - len(tokenB)
        tokenA = tokenA[:max_text_length]
        token = ["[CLS]"] + tokenA + ["[SEP]"] + tokenB + ["[SEP]"]
        x_ids = tokenizer.convert_tokens_to_ids(token)
        x_seg = [0] * (len(tokenA) + 2) + [1] * (len(tokenB) + 1)
        atten_mask = [1] * len(token)
        padding_length = max_length - len(token)
        x_ids += [0] * padding_length
        x_seg += [0] * padding_length
        atten_mask += [0] * padding_length
        label = example.label
        if i < 5:
            logger.info("********** examples *********")
            logger.info("ids: {}".format(i))
            logger.info("tokens: {}".format(' '.join(token)))
            logger.info("x_ids: {}".format(' '.join(map(str, x_ids))))
            logger.info("x_seg: {}".format(' '.join(map(str, x_seg))))
            logger.info("atten_mask: {}".format(atten_mask))
            logger.info("label: {}".format(label))
        features.append(InputFeature(x_ids, x_seg, atten_mask, label))
    return features
def select_field(features, field):
    """Return the given feature field across all examples."""
    return [ele.features[field] for ele in features]
def metric(y_true, y_pred):
acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='macro')
return acc, f1
tokenizer = BertTokenizer.from_pretrained(bert_model)
all_examples, _ = read_data(train_fpath)
all_features = convert_examples_to_features(examples=all_examples, tokenizer=tokenizer)
all_ids = np.array(select_field(all_features, 'x_ids'))
all_seg = np.array(select_field(all_features, 'x_seg'))
all_atten_mask = np.array(select_field(all_features, 'atten_mask'))
all_labels = [ele.label for ele in all_features]
test_examples, _ = read_data(test_fpath)
test_features = convert_examples_to_features(test_examples, tokenizer)
test_ids = torch.tensor(select_field(test_features, 'x_ids'), dtype=torch.long)
test_seg = torch.tensor(select_field(test_features, 'x_seg'), dtype=torch.long)
test_atten_mask = torch.tensor(select_field(test_features, 'atten_mask'), dtype=torch.long)
test_labels = torch.tensor([ele.label for ele in test_features], dtype=torch.long)
train_ids, valid_ids, train_seg, valid_seg, train_masks, valid_masks, train_labels, valid_labels = \
    train_test_split(all_ids, all_seg, all_atten_mask, all_labels, test_size=0.2, random_state=42)
print(np.array(train_ids).shape)
print(np.array(train_masks).shape)
print(np.array(train_labels).shape)
train_ids_tensor = torch.tensor(train_ids, dtype=torch.long)
train_seg_tensor = torch.tensor(train_seg, dtype=torch.long)
train_masks_tensor = torch.tensor(train_masks, dtype=torch.long)
train_labels_tensor = torch.tensor(train_labels, dtype=torch.long)
valid_ids_tensor = torch.tensor(valid_ids, dtype=torch.long)
valid_seg_tensor = torch.tensor(valid_seg, dtype=torch.long)
valid_masks_tensor = torch.tensor(valid_masks, dtype=torch.long)
valid_labels_tensor = torch.tensor(valid_labels, dtype=torch.long)
train_datasets = torch.utils.data.TensorDataset(train_ids_tensor, train_seg_tensor, train_masks_tensor, train_labels_tensor)
valid_datasets = torch.utils.data.TensorDataset(valid_ids_tensor, valid_seg_tensor, valid_masks_tensor, valid_labels_tensor)
test_datasets = torch.utils.data.TensorDataset(test_ids, test_seg, test_atten_mask, test_labels)
train_loader = torch.utils.data.DataLoader(train_datasets, shuffle=True, batch_size=batch_size)
valid_loader = torch.utils.data.DataLoader(valid_datasets, shuffle=False, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(test_datasets, shuffle=False, batch_size=batch_size)
early_stopping = 0
best_f1 = 0.0
if do_train:
    logger.info("*************** Train ******************")
    model = NeuralNet(bert_model)
    model.cuda()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    optim = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=1e-6)
    # Up-weight class 0 to counter the label imbalance.
    weights = torch.tensor([3.0, 1.0], dtype=torch.float).cuda()
    loss_fn = torch.nn.CrossEntropyLoss(weight=weights)
    for epoch in range(epochs):
        # Re-enable dropout each epoch; validation below switches to eval mode.
        model.train()
        train_loss = 0
        pbar = tqdm(train_loader)
        for i, batch in enumerate(pbar):
            batch = tuple(t.cuda() for t in batch)
            batch_ids, batch_seg, batch_mask, batch_label = batch
            preb = model(batch_ids, batch_seg, batch_mask)
            loss = loss_fn(preb, batch_label)
            optim.zero_grad()
            loss.backward()
            optim.step()
            train_loss += loss.item() / len(train_loader)
            pbar.set_description("loss %.4f" % loss)
        valid_loss = 0.0
        model.eval()
        valid_preb = np.zeros(shape=(valid_ids_tensor.shape[0], 2))
        with torch.no_grad():
            for i, batch in tqdm(enumerate(valid_loader)):
                batch = tuple(t.cuda() for t in batch)
                ids, segs, masks, labels = batch
                preb = model(ids, segs, masks)
                valid_loss += loss_fn(preb, labels).item() / len(valid_loader)
                valid_preb[batch_size * i:batch_size * (i + 1)] = F.softmax(preb, dim=1).cpu().numpy()
        acc, f1 = metric(valid_labels, np.argmax(valid_preb, axis=1))
        if f1 > best_f1:
            best_f1 = f1
            early_stopping = 0
            torch.save(model.state_dict(), save_model + '%s.bin' % file_name)
        else:
            early_stopping += 1
        logger.info(
            'epoch: %d, train loss: %.8f, valid loss: %.8f, acc: %.8f, f1: %.8f, best_f1: %.8f\n' %
            (epoch, train_loss, valid_loss, acc, f1, best_f1))
        if early_stopping >= patience:
            break
    torch.cuda.empty_cache()
if do_eval:
    logger.info("****************** Evaluate *******************")
    preds = []
    y_label = []
    with torch.no_grad():
        model = NeuralNet(bert_model)
        model.load_state_dict(torch.load(save_model + '%s.bin' % file_name))
        model.cuda()
        model.eval()  # disable dropout for inference
        for i, batch in enumerate(test_loader):
            batch = tuple(t.cuda() for t in batch)
            x_idx, x_seg, atten_mask, label = batch
            y_preb = model(x_idx, x_seg, atten_mask)
            label = label.cpu().numpy()
            y_preb = np.argmax(y_preb.cpu().numpy(), axis=1)
            preds.extend(y_preb)
            y_label.extend(label)
    logger.info(classification_report(y_label, preds))
if do_error_analyse:
    error_samples = []
    for i in range(len(test_examples)):
        if y_label[i] != preds[i]:
            error_samples.append((test_examples[i].text, y_label[i], preds[i]))
    pd.DataFrame(error_samples).to_excel("./output/error_samples.xlsx", index=False,
                                         header=['text', 'label', 'pred'])