词性标注-隐马尔可夫模型
今天刚学完条件随机场,然后想找个例子实战一下,写写代码。于是关注到了词性标注,因为在学习的过程中,很多博客、视频、书等等都常常提到词性标注作为讲解的例子。然后我突然想到前面学HMM的过程中虽然利用盒子与球模型实现了HMM的算法,但是还没有用一个更广泛,更实际的例子去实现它,于是决定先用HMM实现词性标注。
流程很简单:先利用现有的语料库训练HMM模型,这是学习问题;然后利用Viterbi算法对任意输入的句子做词性标注,这是解码问题。
我们先来看监督式学习的情况
语料来自于https://github.com/junman/POS-tagging,利用并进一步封装了其中读取文件的代码以忽略与模型无关的细节
如下是dataset.py的文件内容,只关注模型的读者直接使用即可
import numpy as np
def read_dataset(path='corpus_POS.txt', encoding=None, pad_length=30):
    """Load the POS-tagged corpus and encode it as fixed-length index arrays.

    Every non-empty line of the corpus is one sentence made of
    space-separated ``word/tag`` tokens.  A token may start with ``[`` and/or
    end with ``]``; a leading bracket is dropped from the word and a trailing
    bracket is dropped from the tag.

    Args:
        path: Corpus file to read.
        encoding: Text encoding handed to ``open``; ``None`` keeps the
            platform default.
        pad_length: Every sentence is truncated or padded to this many
            tokens.

    Returns:
        A tuple ``(X_data, y_data, word_index, state_index)`` where
        ``X_data`` / ``y_data`` are integer arrays with one row per sentence
        holding word ids / tag ids, ``word_index`` maps each word to its id
        (in order of first appearance) and ``state_index`` maps each tag to
        its id.  Padding positions hold ``len(word_index)`` in ``X_data`` and
        ``len(state_index)`` in ``y_data``.

    Raises:
        ValueError: If a token contains no ``/`` separator.
        KeyError: If a tag within the first ``pad_length`` tokens of a
            sentence is not one of the known tags.
    """
    state_list = ['Ag', 'a', 'ad', 'an', 'Bg', 'b', 'c', 'Dg',
                  'd', 'e', 'f', 'h', 'i', 'j', 'k', 'l',
                  'Mg', 'm', 'Ng', 'n', 'nr', 'ns', 'nt', 'nx',
                  'nz', 'o', 'p', 'q', 'Rg', 'r', 's', 'na',
                  'Tg', 't', 'u', 'Vg', 'v', 'vd', 'vn', 'vvn',
                  'w', 'Yg', 'y', 'z']
    state_index = {pos: i for i, pos in enumerate(state_list)}

    word_index = {}
    sentences = []
    with open(path, encoding=encoding) as corpus:
        for line in corpus:
            # Skip the empty strings produced by repeated spaces instead of
            # crashing on them.
            tokens = [token for token in line.strip().split(" ") if token]
            if not tokens:
                continue
            vocabs = []
            classifies = []
            for token in tokens:
                position = token.find('/')
                if position < 0:
                    raise ValueError(
                        "token %r has no '/' separator" % (token,))
                word_start = 1 if '[' in token else 0
                tag_end = -1 if ']' in token else None
                word = token[word_start:position]
                vocabs.append(word)
                classifies.append(token[position + 1:tag_end])
                # Ids are handed out in order of first appearance.  A
                # bracketed token is handled like any other one: the rest of
                # the sentence is still read rather than being dropped.
                word_index.setdefault(word, len(word_index))
            sentences.append((vocabs, classifies))

    pad_word = len(word_index)
    pad_state = len(state_list)
    X_data = []
    y_data = []
    for vocabs, classifies in sentences:
        digit_vocabs = [word_index[word] for word in vocabs[:pad_length]]
        digit_classifies = [state_index[tag]
                            for tag in classifies[:pad_length]]
        missing = pad_length - len(digit_vocabs)
        X_data.append(digit_vocabs + [pad_word] * missing)
        y_data.append(digit_classifies + [pad_state] * missing)
    return np.array(X_data), np.array(y_data), word_index, state_index
然后就是利用我在白板推导系列Pytorch-隐马尔可夫模型-学习问题以及白板推导系列Pytorch-隐马尔可夫模型-解码问题两篇博客中实现的极大似然估计定义的模型代码和viterbi解码的代码,训练数据传进去,跑一跑,解个码就结束了
是这样吗?
我起初以为是这样的,结果现实狠狠的给了我一巴掌。
我这才知道之前写的代码存在多大的问题。
如果你看过我上面两篇文章,你用里面的代码去训练模型,你会发现你似乎永远看不到代码运行结束?这是为什么?监督式学习的代码之前在盒子和球模型不是表现挺好吗?模型参数什么的预测都挺准。但是有一个问题,我们当时用的盒子与球模型只有四个状态,两种观测,所以我们直接把A,B写成矩阵,一个一个算,模型仍然很快训练完成,因为整个模型只有
$4 + 4 \times 4 + 4 \times 2 = 28$ 个参数。
但是现在不一样了。
给定的语料中,状态(词性)有44个,观测(单词)有46571个,总共有
$44 + 44 \times 44 + 44 \times 46571 = 2051104$ 个参数。另外,我们定义的发射矩阵维度是44x46571,但其中大部分元素都是0,用这么庞大的空间存储这么一点点有效的数据,我亏的慌。
所以看到github上的代码后,我改啊改,改啊改,终于把它改成了库里面的形状
修改前
def train(self, data):
    """Estimate pi, A and B by exhaustive counting into dense matrices.

    ``data[k][0]`` is read as the state sequence of sample ``k`` and
    ``data[k][1]`` as its observation sequence.  Every entry of the
    ``N x N`` transition matrix and of the ``N x M`` emission matrix is
    computed by a full pass over all samples, which is why this version
    becomes impractically slow once ``M`` is large.

    Returns:
        The tuple ``(self.pi, self.A, self.B)``.
    """
    self.pi = np.zeros(shape=(self.N,))
    self.A = np.zeros(shape=(self.N, self.N))
    self.B = np.zeros(shape=(self.N, self.M))
    n_samples = len(data)
    self.T = len(data[0][0])

    # Initial distribution: share of samples that start in each state.
    for state in range(self.N):
        for sample in data:
            self.pi[state] += sample[0][0] == state
        self.pi[state] = self.pi[state] / n_samples

    # Transitions: (#times src is followed by dst) / (#times src occurs
    # before the last time step).
    for src in range(self.N):
        for dst in range(self.N):
            hits = 0
            visits = 0
            for sample in data:
                for t in range(self.T - 1):
                    hits += sample[0][t] == src and sample[0][t + 1] == dst
                    visits += sample[0][t] == src
            self.A[src][dst] = hits / visits

    # Emissions: (#times state emits symbol) / (#times state occurs).
    for state in range(self.N):
        for symbol in range(self.M):
            hits = 0
            visits = 0
            for sample in data:
                for t in range(self.T):
                    hits += sample[0][t] == state and sample[1][t] == symbol
                    visits += sample[0][t] == state
            self.B[state][symbol] = hits / visits
    return self.pi, self.A, self.B
修改后的监督学习方法如下
def train(self, O, I):
    """Fit the HMM by supervised maximum likelihood (relative frequencies).

    The transition and emission tables are stored sparsely as one dict per
    state, so only the (state, next state) and (state, observation) pairs
    that actually occur in the data take up memory.

    Args:
        O: 2-D array of observation indices, one row per sequence.
        I: 2-D array of state indices with the same shape as ``O``; values
            are used to index lists of length ``self.N``.

    Returns:
        The tuple ``(self.pi, self.A, self.B)`` where ``self.pi[i]`` is the
        share of sequences starting in state ``i``, ``self.A[i][j]`` is the
        probability of moving from ``i`` to ``j`` and ``self.B[i][o]`` is the
        probability of state ``i`` emitting ``o``.
    """
    self.pi = np.zeros(shape=(self.N,))
    self.A = [{} for _ in range(self.N)]
    self.B = [{} for _ in range(self.N)]
    n_sequences = I.shape[0]
    self.T = I.shape[1]

    for i in range(self.N):
        self.pi[i] = (I[:, 0] == i).sum() / n_sequences

    # Count with plain Python ints so the dict keys are ordinary integers.
    for states, observations in zip(I.tolist(), O.tolist()):
        for state, observation in zip(states, observations):
            emitted = self.B[state]
            emitted[observation] = emitted.get(observation, 0) + 1
        # The count must be read from the (source, target) cell being
        # updated, not from a cell addressed by the time step.
        for source, target in zip(states, states[1:]):
            outgoing = self.A[source]
            outgoing[target] = outgoing.get(target, 0) + 1

    for i in range(self.N):
        # Normalise transitions by the number of outgoing transitions (the
        # last time step has none) so that each non-empty row sums to 1.
        # An empty dict is simply skipped, so no division by zero can occur.
        outgoing_total = sum(self.A[i].values())
        for state in self.A[i]:
            self.A[i][state] = self.A[i][state] / outgoing_total
        emitted_total = sum(self.B[i].values())
        for observation in self.B[i]:
            self.B[i][observation] = self.B[i][observation] / emitted_total
    return self.pi, self.A, self.B
修改后的代码仍然是极大似然估计,但它不再是套用通过数学推导出来的公式,而是直接根据频率估计概率的思想编码,二者表现形式可能没有多大区别,但行为模式不同。
读者对比之后可以发现,状态转移矩阵A和发射矩阵B都从ndarray变成了字典的列表。并且可以看到不再出现
for k in range(self.M)
代码上的细节读者可慢慢体会。
读者可前往https://download.csdn.net/download/qq_41335232/45109996直接下载完整压缩包,或者自行复制本文代码创建文件
最后我把剩下的代码贴上来(dataset.py前面已经贴过了),然后语料corpus_POS.txt读者可前往https://github.com/junman/POS-tagging下载
import numpy as np
from dataset import read_dataset
class SupervisedModel:
    """Hidden Markov model trained from fully labelled sequences.

    Transition and emission probabilities are kept as one dict per state so
    that only pairs seen in the training data are stored.
    """

    def __init__(self, n_states) -> None:
        # One extra state is reserved on top of ``n_states``; the training
        # arrays use the index ``n_states`` for padded positions.
        self.N = n_states + 1

    def train(self, O, I):
        """Fit pi, A and B by relative-frequency (maximum likelihood) counts.

        Args:
            O: 2-D array of observation indices, one row per sequence.
            I: 2-D array of state indices with the same shape as ``O``;
                values are used to index lists of length ``self.N``.

        Returns:
            The tuple ``(self.pi, self.A, self.B)`` where ``self.pi[i]`` is
            the share of sequences starting in state ``i``, ``self.A[i][j]``
            is the probability of moving from ``i`` to ``j`` and
            ``self.B[i][o]`` is the probability of state ``i`` emitting ``o``.
        """
        self.pi = np.zeros(shape=(self.N,))
        self.A = [{} for _ in range(self.N)]
        self.B = [{} for _ in range(self.N)]
        n_sequences = I.shape[0]
        self.T = I.shape[1]

        for i in range(self.N):
            self.pi[i] = (I[:, 0] == i).sum() / n_sequences

        # Count with plain Python ints so the dict keys are ordinary integers.
        for states, observations in zip(I.tolist(), O.tolist()):
            for state, observation in zip(states, observations):
                emitted = self.B[state]
                emitted[observation] = emitted.get(observation, 0) + 1
            # The count must be read from the (source, target) cell being
            # updated, not from a cell addressed by the time step.
            for source, target in zip(states, states[1:]):
                outgoing = self.A[source]
                outgoing[target] = outgoing.get(target, 0) + 1

        for i in range(self.N):
            # Normalise transitions by the number of outgoing transitions
            # (the last time step has none) so each non-empty row sums to 1.
            # Empty dicts are skipped, so no division by zero can occur.
            outgoing_total = sum(self.A[i].values())
            for state in self.A[i]:
                self.A[i][state] = self.A[i][state] / outgoing_total
            emitted_total = sum(self.B[i].values())
            for observation in self.B[i]:
                self.B[i][observation] = self.B[i][observation] / emitted_total
        return self.pi, self.A, self.B

    def decode(self, O):
        """Return the most probable state sequence for ``O`` (Viterbi).

        Args:
            O: Sequence of observation indices.

        Returns:
            A list of state indices with one entry per observation; an empty
            list when ``O`` is empty.  Observations never emitted by any
            state during training make every path score 0, in which case the
            result falls back to state 0.
        """
        T = len(O)
        if T == 0:
            return []
        delta = np.zeros(shape=(T, self.N))
        fi = np.zeros(shape=(T, self.N), dtype=int)  # back-pointers
        for i in range(self.N):
            delta[0][i] = self.B[i].get(O[0], 0) * self.pi[i]
        for t in range(T - 1):
            # Rescaling a whole row by a positive constant leaves every
            # comparison below unchanged but keeps long products from
            # underflowing to zero.
            peak = delta[t].max()
            if peak > 0:
                delta[t] /= peak
            for i in range(self.N):
                best_val = 0
                best_prev = 0
                for j in range(self.N):
                    p = self.A[j].get(i, 0) * delta[t][j]
                    if p > best_val:
                        best_prev = j
                        best_val = p
                delta[t + 1][i] = self.B[i].get(O[t + 1], 0) * best_val
                fi[t + 1][i] = best_prev

        # Follow the back-pointers from the best final state.
        index = int(delta[T - 1].argmax())
        path = [index]
        for t in reversed(range(1, T)):
            index = int(fi[t, index])
            path.append(index)
        path.reverse()
        return path
# Load the encoded corpus when the module runs: padded word-id and tag-id
# arrays plus the word -> id and tag -> id lookup tables.
X_data,y_data,word_index,state_index = read_dataset()
def word2index(word_index, words):
    """Translate each word in ``words`` to its id via ``word_index``.

    Raises:
        KeyError: If a word is missing from ``word_index``.
    """
    return [word_index[token] for token in words]
def index2state(state_index, indexes):
    """Translate state ids back to tag names.

    The id of a tag is taken to be its position among the keys of
    ``state_index`` in iteration order.
    """
    names = list(state_index.keys())
    return [names[position] for position in indexes]
# SupervisedModel adds one extra state on top of the number of tags passed in.
model = SupervisedModel(len(state_index.keys()))
model.train(X_data,y_data)
# Tag a sample sentence: words -> word ids -> decoded state ids -> tag names.
words = ["我","要","吃饭"]
# word2index raises KeyError for a word that is not in word_index.
words = word2index(word_index,words)
posids = model.decode(words)
print(index2state(state_index,posids))
至于无监督的词性标注,我目前还没想到要怎么优化Baum-Welch的代码。从理论上说,直接使用之前的代码也是可以训练的,只是肯定需要很长的训练时间,并且效果也不怎么好。
读者可利用hmmlearn库简单验证
import numpy as np
from dataset import read_dataset
from hmmlearn import hmm
X_data,y_data,word_index,state_index = read_dataset()
n_states = len(state_index.keys())
# Newer hmmlearn releases provide the discrete-symbol emission model as
# CategoricalHMM; older ones only have it under the name MultinomialHMM.
hmm_class = getattr(hmm, "CategoricalHMM", hmm.MultinomialHMM)
model = hmm_class(n_components=n_states, n_iter=20, tol=0.001)
# hmmlearn takes all sequences stacked into a single (n_samples, 1) column
# together with the length of each individual sequence.
lengths = [X_data.shape[1]] * X_data.shape[0]
model.fit(X_data.reshape(-1, 1), lengths)
words = ["我","要","吃饭"]
# Look the ids up directly: word2index is not defined in this script.
word_ids = [word_index[word] for word in words]
model.decode(np.array([word_ids]).T,algorithm="viterbi")
|