1 导包
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
2 读取数据
数据保存在data/目录下,data/目录下共有四个文件夹,分别对应四种医学情景:出院情况、病史特点、诊疗过程和一般项目。每个文件夹下保存了该情景下的电子病历,包括两类文件:'xxx-yyy.txtoriginal.txt'和'xxx-yyy.txt'。'xxx-yyy.txtoriginal.txt'保存了xxx情境下第yyy号病历的病历文本,保存在txt的第一行中;'xxx-yyy.txt'为其对应的标签数据。
数据中共包含5种实体:治疗、身体部位、疾病和诊断、症状和体征、检查和检验。
with open('data/一般项目/一般项目-1.txtoriginal.txt') as f:
content = f.read().strip()
print(content)
女性,88岁,农民,双滦区应营子村人,主因右髋部摔伤后疼痛肿胀,活动受限5小时于2016-10-29;11:12入院。
# Read the matching label file (one entity annotation per line).
# Explicit encoding for the same reason as the record file above.
with open('data/一般项目/一般项目-1.txt', encoding='utf-8') as f:
    content_label = f.read().strip()
print(content_label)
右髋部 21 23 身体部位
疼痛 27 28 症状和体征
肿胀 29 30 症状和体征
可以看出,标签文件的数据格式为每行对应一个实体,每行格式为“实体内容 实体在文本中的开始位置 实体在文本中的结束位置 实体类别”。如第一行表示content[21:24]对应的便是’右髋部’,为身体部位实体类别。
3 数据标注
实体识别的数据标注方式主要有BIOES和BIO两种,详细的介绍参考实验手册。这里为使标注类别不至于太多,决定采用BIO方式。即将实体部分的第一个汉字标注为B,实体的其他部分的汉字标注为I,非实体部分标注为O。
将5种实体类别治疗、身体部位、疾病和诊断、症状和体征、检查和检验分别标记为TREATMENT、BODY、DISEASES、SIGNS、EXAMINATIONS。
则标记时,如:若为治疗类别的实体的第一个汉字,则将其标注为B-TREATMENT,该实体其他字标记为I-TREATMENT。
# Map the five Chinese entity categories to the tag suffixes used in
# the BIO scheme (e.g. B-TREATMENT / I-TREATMENT).
label_dict = {'治疗': 'TREATMENT',
              '身体部位': 'BODY',
              '疾病和诊断': 'DISEASES',
              '症状和体征': 'SIGNS',
              '检查和检验': 'EXAMINATIONS'}


def sentence2BIOlabel(sentence, label_from_file):
    '''
    Return the BIO tag sequence of `sentence`.

    Args:
        sentence: the sentence to tag (str).
        label_from_file: raw label text as read from the label .txt file,
            one entity per line in the form
            "entity<TAB>start<TAB>end<TAB>category",
            where `end` is the INCLUSIVE index of the entity's last char.

    Returns:
        sentence_label: list of str; the i-th item is the BIO tag of the
            i-th character of `sentence`.
    '''
    sentence_label = ['O'] * len(sentence)
    if not label_from_file:
        return sentence_label
    for line in label_from_file.split('\n'):
        line = line.strip()
        # Skip blank lines (e.g. a trailing newline in the label file);
        # the original code raised IndexError on them.
        if not line:
            continue
        entity_info = line.split('\t')
        if len(entity_info) < 4:
            continue  # malformed annotation line: ignore rather than crash
        start_index = int(entity_info[1])
        end_index = int(entity_info[2])
        entity_label = label_dict[entity_info[3]]
        # First character of the entity gets B-, the rest get I-.
        sentence_label[start_index] = 'B-' + entity_label
        for i in range(start_index + 1, end_index + 1):
            sentence_label[i] = 'I-' + entity_label
    return sentence_label
# Tag the sample record read above and show the resulting BIO sequence.
sentence_label_tmp = sentence2BIOlabel(content,content_label)
print(sentence_label_tmp)
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-BODY', 'I-BODY', 'I-BODY', 'O', 'O', 'O', 'B-SIGNS', 'I-SIGNS', 'B-SIGNS', 'I-SIGNS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
# Print each character next to its BIO tag, one pair per line.
for char, tag in zip(content, sentence_label_tmp):
    print(char, tag)
女 O
性 O
, O
8 O
8 O
岁 O
, O
农 O
民 O
, O
双 O
滦 O
区 O
应 O
营 O
子 O
村 O
人 O
, O
主 O
因 O
右 B-BODY
髋 I-BODY
部 I-BODY
摔 O
伤 O
后 O
疼 B-SIGNS
痛 I-SIGNS
肿 B-SIGNS
胀 I-SIGNS
, O
活 O
动 O
受 O
限 O
5 O
小 O
时 O
于 O
2 O
0 O
1 O
6 O
- O
1 O
0 O
- O
2 O
9 O
; O
1 O
1 O
: O
1 O
2 O
入 O
院 O
。 O
sentence_list = []
label_list = []
data_base_path = "./data/"
# Walk the scenario sub-directories and pair every "*.txtoriginal.txt"
# record file with its "*.txt" label file.
for parent_path in os.listdir(data_base_path):
    # os.path.join instead of manual string concatenation keeps the
    # paths portable and separator-safe.
    scenario_dir = os.path.join(data_base_path, parent_path)
    if os.path.isdir(scenario_dir):
        for file_name in os.listdir(scenario_dir):
            if "original" in file_name:
                # Explicit encoding: the records are Chinese text, so do
                # not depend on the platform's default locale encoding.
                with open(os.path.join(scenario_dir, file_name), "r", encoding="utf-8") as f:
                    content = f.read().strip()
                sentence_list.append(content)
                # "xxx-yyy.txtoriginal.txt" -> "xxx-yyy.txt" (label file)
                label_file_name = file_name.split("original")[0]
                with open(os.path.join(scenario_dir, label_file_name), "r", encoding="utf-8") as f:
                    content_label = f.read().strip()
                label = sentence2BIOlabel(content, content_label)
                label_list.append(label)
4 文本特征工程
要使用CRF算法对每个字进行标注,就需要获取每个字对应的特征。就需要对文本进行特征工程,这一部分就是构建一句话中每个字的特征。
# pkuseg ships a pretrained model for medical-domain text;
# postag=True makes seg.cut() return (word, POS-tag) tuples
# instead of bare words.
import pkuseg
seg = pkuseg.pkuseg(model_name='medicine',postag=True)
# Quick sanity check of the segmenter on a sample phrase.
seg.cut('发病原因为右髋部摔伤后疼痛肿胀')
[('发病', 'vn'),
('原因', 'n'),
('为', 'v'),
('右髋部', 'n'),
('摔伤', 'v'),
('后', 'f'),
('疼痛', 'a'),
('肿胀', 'v')]
pkuseg包对医疗方面的文本有较好的分词效果。seg.cut(文本)的输出格式为[(第一个词,第一个词的词性),(第二个词,第二个词的词性),…,(第n个词,第n个词的词性)]。稍后在构建每个字的特征时我们会用到pkuseg的分词功能。
# Load the THUOCL medical vocabulary; each line is "word<TAB>frequency"
# and we keep only the word column. Explicit encoding for Chinese text.
with open('THUOCL_medical.txt', encoding='utf-8') as f:
    medical_words = f.read().strip()
medical_words_list = [words.strip().split('\t')[0] for words in medical_words.split('\n')]
medical_words_list[:10]
['精神', '医院', '检查', '死亡', '恢复', '意识', '医疗', '治疗', '卫生', '患者']
进行完上述准备工作后,我们接下来正式来构造特征。
def word2feature(sentence, i):
    '''
    Build the simple character-level features for the i-th character.

    Args:
        sentence: the sentence being featurized (str).
        i: index of the character whose features are returned.

    Returns:
        simple_feature: dict mapping feature name -> feature value.
    '''
    current = sentence[i]
    # Sentinel tokens mark the sentence boundaries.
    previous = sentence[i - 1] if i > 0 else 'start'
    following = sentence[i + 1] if i + 1 < len(sentence) else 'end'
    return {
        'word': current,
        'pre_word': previous,
        'after_word': following,
        'pre_word_word': previous + current,      # character bigram (left)
        'word_after_word': current + following,   # character bigram (right)
        'bias': 1,
    }
def sentence2feature(sentence):
    '''
    Build the feature dict for every character of a sentence: the simple
    features from word2feature plus segmentation-based features (word,
    POS tag, neighboring words, medical-lexicon membership).

    Args:
        sentence: the sentence to featurize (str).

    Returns:
        sentence_feature_list: list of per-character feature dicts, in
            character order: [features of char 0, features of char 1, ...]
    '''
    sentence_feature_list = [word2feature(sentence, i) for i in range(len(sentence))]
    sentence_cut = seg.cut(sentence)
    # Hoist the lexicon into a set: `in` on a set is O(1) versus the
    # O(n) scan the original did on the list inside a nested loop.
    medical_words = set(medical_words_list)
    word_index = 0
    for i, (WORD, nominal) in enumerate(sentence_cut):
        # Every character of the segmented word shares the word-level
        # features; `if_first` marks the word-initial character.
        for j in range(word_index, word_index + len(WORD)):
            sentence_feature_list[j]['WORD'] = WORD
            sentence_feature_list[j]['nominal'] = nominal
            sentence_feature_list[j]['pre_WORD'] = sentence_cut[i-1][0] if i > 0 else 'START'
            sentence_feature_list[j]['after_WORD'] = sentence_cut[i+1][0] if i < len(sentence_cut)-1 else 'END'
            sentence_feature_list[j]['is_medicalwords'] = 1 if WORD in medical_words else 0
            sentence_feature_list[j]["if_first"] = 1 if j == word_index else 0
        word_index = word_index + len(WORD)
    # NOTE: the original dead check `if sentence_feature_list == None`
    # was removed - a list comprehension can never yield None.
    return sentence_feature_list
# Featurize every record; tqdm shows progress (~1200 sentences).
feature_list = [sentence2feature(sentence) for sentence in tqdm(sentence_list)]
100%|██████████| 1198/1198 [00:39<00:00, 30.34it/s]
len(feature_list)
1198
5 CRF模型搭建
# 70/30 train/test split; a fixed random_state keeps it reproducible.
x_train,x_test,y_train,y_test = train_test_split(feature_list, label_list, test_size=0.3, random_state=2020)
len(y_train)
838
# Linear-chain CRF trained with L-BFGS; c1/c2 are the L1/L2
# regularization coefficients of sklearn_crfsuite.
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False
)
crf.fit(x_train,y_train)
/home/chi/anaconda3/envs/greedyaiqa/lib/python3.6/site-packages/sklearn/base.py:197: FutureWarning: From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None.
FutureWarning)
CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=False,
averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)
def predict(sentence):
    '''
    Predict the BIO tagging of one sentence with the trained CRF model.

    Args:
        sentence: the sentence to tag (str).

    Returns:
        sent_bio: a list of BIO tag strings, one per character of
            `sentence` (crf.predict_single returns a tag sequence,
            not a dict).
    '''
    feature = sentence2feature(sentence)
    sent_bio = crf.predict_single(feature)
    return sent_bio
predict('这是由于耳膜损伤导致的')
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
可以看出,对于这句训练语料之外的句子,CRF模型将所有字都标注为O,并未识别出"耳膜"等实体,说明模型对域外文本的泛化能力有限。接下来我们用CRF模型对我们的测试集进行预测。
# Predict BIO tags for every test-set sentence.
y_pred = crf.predict(x_test)
6 模型评估
使用sklearn_crfsuite中自带的metrics包可对模型进行有效的评估
# Evaluate only on entity tags: 'O' dominates the corpus and would
# inflate every averaged score, so drop it from the label set.
labels = list(crf.classes_)
labels.remove('O')
labels
['B-TREATMENT',
'I-TREATMENT',
'B-EXAMINATIONS',
'I-EXAMINATIONS',
'B-BODY',
'I-BODY',
'B-SIGNS',
'I-SIGNS',
'B-DISEASES',
'I-DISEASES']
# Micro-averaged F1 over the entity labels only.
metrics.flat_f1_score(y_test, y_pred,
                      average='micro', labels=labels)
0.9421061480692708
# Per-label precision/recall/F1 breakdown on the test set.
print(metrics.flat_classification_report(
    y_test, y_pred, labels=labels, digits=3
))
precision recall f1-score support
B-TREATMENT 0.901 0.833 0.866 294
I-TREATMENT 0.919 0.887 0.903 1448
B-EXAMINATIONS 0.970 0.978 0.974 2975
I-EXAMINATIONS 0.964 0.971 0.968 6462
B-BODY 0.926 0.919 0.922 3314
I-BODY 0.917 0.928 0.922 6003
B-SIGNS 0.967 0.980 0.974 2501
I-SIGNS 0.969 0.980 0.974 2738
B-DISEASES 0.833 0.721 0.773 208
I-DISEASES 0.829 0.719 0.770 834
micro avg 0.943 0.942 0.942 26777
macro avg 0.919 0.892 0.905 26777
weighted avg 0.942 0.942 0.942 26777
?
7 BiLSTM-CRF
CRF模型还可与BiLSTM模型结合来解决实体识别问题,这样的好处是BiLSTM可以自动获取文本的特征,就不需要自己去定义特征,不需要再进行文本特征工程部分。
BiLSTM-CRF模型的实现细节均在BiLSTM_CRF.py中实现。
# Build the vocabularies with helpers from BiLSTM_CRF.py:
# word2id maps characters to indices, tag2id maps BIO tags to indices.
from BiLSTM_CRF import *
word2id = word_to_id(sentence_list)
tag2id = tag_to_id(label_list)
tag2id
{'O': 0,
'B-EXAMINATIONS': 1,
'I-EXAMINATIONS': 2,
'B-BODY': 3,
'I-BODY': 4,
'B-SIGNS': 5,
'I-SIGNS': 6,
'B-DISEASES': 7,
'I-DISEASES': 8,
'B-TREATMENT': 9,
'I-TREATMENT': 10,
'<unk>': 11,
'<pad>': 12,
'<start>': 13,
'<end>': 14}
LSTM模型训练的时候需要在word2id和tag2id中加入&lt;pad&gt;和&lt;unk&gt;,如果是加了CRF的LSTM还要加入&lt;start&gt;和&lt;end&gt;(解码的时候需要用到)。word2id的格式与tag2id的格式类似。
# Same 70/30 split with the same random_state as the CRF experiment, so
# both models are evaluated on the same test sentences; then convert the
# data to the id-sequence format the BiLSTM-CRF implementation expects.
x_train_lstmcrf,x_test_lstmcrf,y_train_lstmcrf,y_test_lstmcrf = train_test_split(sentence_list, label_list, test_size=0.3, random_state=2020)
x_train_lstmcrf,y_train_lstmcrf = prepocess_data_for_lstmcrf(x_train_lstmcrf,y_train_lstmcrf)
x_test_lstmcrf,y_test_lstmcrf = prepocess_data_for_lstmcrf(x_test_lstmcrf,y_test_lstmcrf,test=True)
model = BiLSTM_CRF_Model(vocab_size=len(word2id),out_size=len(tag2id),batch_size=64, epochs=30)
model.train(x_train_lstmcrf,y_train_lstmcrf,word2id,tag2id)
Epoch 1, step/total_step: 10/14 71.43% Loss:703.1040
Epoch 1, Val Loss:354.8586
Epoch 2, step/total_step: 10/14 71.43% Loss:419.8804
Epoch 2, Val Loss:280.7740
Epoch 3, step/total_step: 10/14 71.43% Loss:351.4792
Epoch 3, Val Loss:253.9364
Epoch 4, step/total_step: 10/14 71.43% Loss:320.9320
Epoch 4, Val Loss:202.8077
Epoch 5, step/total_step: 10/14 71.43% Loss:269.4268
Epoch 5, Val Loss:176.0763
Epoch 6, step/total_step: 10/14 71.43% Loss:232.1650
Epoch 6, Val Loss:150.6498
Epoch 7, step/total_step: 10/14 71.43% Loss:198.0663
Epoch 7, Val Loss:126.4110
Epoch 8, step/total_step: 10/14 71.43% Loss:166.0142
Epoch 8, Val Loss:106.4447
Epoch 9, step/total_step: 10/14 71.43% Loss:140.0015
Epoch 9, Val Loss:89.2141
Epoch 10, step/total_step: 10/14 71.43% Loss:117.0769
Epoch 10, Val Loss:76.6977
Epoch 11, step/total_step: 10/14 71.43% Loss:100.5820
Epoch 11, Val Loss:67.0900
Epoch 12, step/total_step: 10/14 71.43% Loss:87.8584
Epoch 12, Val Loss:60.0311
Epoch 13, step/total_step: 10/14 71.43% Loss:77.5198
Epoch 13, Val Loss:54.0604
Epoch 14, step/total_step: 10/14 71.43% Loss:70.3888
Epoch 14, Val Loss:49.1572
Epoch 15, step/total_step: 10/14 71.43% Loss:63.0964
Epoch 15, Val Loss:46.0829
Epoch 16, step/total_step: 10/14 71.43% Loss:59.1197
Epoch 16, Val Loss:41.5716
Epoch 17, step/total_step: 10/14 71.43% Loss:53.2706
Epoch 17, Val Loss:37.3219
Epoch 18, step/total_step: 10/14 71.43% Loss:49.1802
Epoch 18, Val Loss:33.9772
Epoch 19, step/total_step: 10/14 71.43% Loss:44.7907
Epoch 19, Val Loss:31.7810
Epoch 20, step/total_step: 10/14 71.43% Loss:41.8903
Epoch 20, Val Loss:29.3012
Epoch 21, step/total_step: 10/14 71.43% Loss:38.7684
Epoch 21, Val Loss:27.7391
Epoch 22, step/total_step: 10/14 71.43% Loss:37.1167
Epoch 22, Val Loss:27.8624
Epoch 23, step/total_step: 10/14 71.43% Loss:35.7134
Epoch 23, Val Loss:25.9406
Epoch 24, step/total_step: 10/14 71.43% Loss:33.8559
Epoch 24, Val Loss:25.1907
Epoch 25, step/total_step: 10/14 71.43% Loss:32.3450
Epoch 25, Val Loss:23.9051
Epoch 26, step/total_step: 10/14 71.43% Loss:31.1568
Epoch 26, Val Loss:22.7183
Epoch 27, step/total_step: 10/14 71.43% Loss:29.5120
Epoch 27, Val Loss:21.2255
Epoch 28, step/total_step: 10/14 71.43% Loss:28.1631
Epoch 28, Val Loss:20.2317
Epoch 29, step/total_step: 10/14 71.43% Loss:26.7550
Epoch 29, Val Loss:19.0803
Epoch 30, step/total_step: 10/14 71.43% Loss:25.4697
Epoch 30, Val Loss:18.3434
# Decode BiLSTM-CRF predictions for the test set.
y_pred_lstmcrf, _ = model.test(x_test_lstmcrf,y_test_lstmcrf,word2id,tag2id)
/pytorch/aten/src/ATen/native/cudnn/RNN.cpp:1236: UserWarning: RNN module weights are not part of single contiguous chunk of memory. This means they need to be compacted at every call, possibly greatly increasing memory usage. To compact weights again call flatten_parameters().
# Micro-averaged F1 of the BiLSTM-CRF, same entity label set as the CRF run.
metrics.flat_f1_score(y_test_lstmcrf, y_pred_lstmcrf,
                      average='micro', labels=labels)
0.9319616117106688
# Per-label breakdown for the BiLSTM-CRF model.
print(metrics.flat_classification_report(
    y_test_lstmcrf, y_pred_lstmcrf, labels=labels, digits=3
))
precision recall f1-score support
B-TREATMENT 0.873 0.748 0.806 294
I-TREATMENT 0.887 0.887 0.887 1448
B-EXAMINATIONS 0.952 0.974 0.963 2975
I-EXAMINATIONS 0.950 0.970 0.960 6462
B-BODY 0.913 0.909 0.911 3314
I-BODY 0.914 0.916 0.915 6003
B-SIGNS 0.956 0.946 0.951 2501
I-SIGNS 0.959 0.961 0.960 2738
B-DISEASES 0.880 0.740 0.804 208
I-DISEASES 0.850 0.749 0.797 834
micro avg 0.932 0.932 0.932 26777
macro avg 0.913 0.880 0.895 26777
weighted avg 0.931 0.932 0.931 26777
?
|