[Reference: NLP-HMM hidden Markov + Viterbi word segmentation, with code, data and walkthrough, Bilibili] The PPT is clear and easy to follow, very good.
[Reference: shouxieai/nlp-hmm-word-cut: nlp-hmm-word-cut]
How to explain the Viterbi algorithm intuitively? - answer by 路生 - Zhihu; How to explain the Viterbi algorithm intuitively? - answer by JustCoder - Zhihu
PPT
Code
import pickle
from tqdm import tqdm
import numpy as np
import os
# Map one word to its BMES label string: S for a single character,
# otherwise B + (len - 2) * M + E.
def make_label(text_str):
    text_len = len(text_str)
    if text_len == 1:
        return "S"
    return "B" + "M" * (text_len - 2) + "E"
# Convert the space-segmented training text into a parallel file of BMES
# state sequences, one state string per word, aligned line by line.
def text_to_state(file="all_train_text.txt"):
    if os.path.exists("all_train_state.txt"):  # already generated, skip
        return
    all_data = open(file, "r", encoding="utf-8").read().split("\n")
    with open("all_train_state.txt", "w", encoding="utf-8") as f:
        for d_index, data in tqdm(enumerate(all_data)):
            if data:
                state_ = ""
                for w in data.split(" "):
                    if w:
                        state_ = state_ + make_label(w) + " "
                if d_index != len(all_data) - 1:  # keep line alignment with the text file
                    state_ = state_.strip() + "\n"
                f.write(state_)
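# Example line conversion (hypothetical corpus line):
#   all_train_text.txt : 什么 专业
#   all_train_state.txt: BE BE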
class HMM:
    def __init__(self, file_text="all_train_text.txt", file_state="all_train_state.txt"):
        # For a quick demo only the first 200 lines of the corpus are used.
        self.all_states = open(file_state, "r", encoding="utf-8").read().split("\n")[:200]
        self.all_texts = open(file_text, "r", encoding="utf-8").read().split("\n")[:200]
        self.states_to_index = {"B": 0, "M": 1, "S": 2, "E": 3}
        self.index_to_states = ["B", "M", "S", "E"]
        self.len_states = len(self.states_to_index)
        self.init_matrix = np.zeros((self.len_states))
        self.transfer_matrix = np.zeros((self.len_states, self.len_states))
        # "total" accumulates how many characters each state has emitted.
        self.emit_matrix = {"B": {"total": 0}, "M": {"total": 0}, "S": {"total": 0}, "E": {"total": 0}}
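    # The three parameter sets above hold raw counts until normalize() runs:
    #   init_matrix     -> pi : shape (4,),  P(state of the first character)
    #   transfer_matrix -> A  : shape (4,4), P(next state | current state)
    #   emit_matrix     -> B  : per-state dict, P(character | state)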
    # Count which state the first character of a line starts in.
    def cal_init_matrix(self, state):
        self.init_matrix[self.states_to_index[state[0]]] += 1
    # Count state-to-state transitions over the concatenated state string.
    def cal_transfer_matrix(self, states):
        sta_join = "".join(states)
        sta1 = sta_join[:-1]  # states s_1 .. s_{n-1}
        sta2 = sta_join[1:]   # states s_2 .. s_n
        for s1, s2 in zip(sta1, sta2):
            self.transfer_matrix[self.states_to_index[s1], self.states_to_index[s2]] += 1
    # Count how often each character is emitted by each state.
    def cal_emit_matrix(self, words, states):
        for word, state in zip("".join(words), "".join(states)):
            self.emit_matrix[state][word] = self.emit_matrix[state].get(word, 0) + 1
            self.emit_matrix[state]["total"] += 1
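    # After counting, each state's row might look like (hypothetical counts):
    #   emit_matrix["B"] == {"total": 3, "文": 2, "艺": 1}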
    # Turn raw counts into probabilities.
    def normalize(self):
        self.init_matrix = self.init_matrix / np.sum(self.init_matrix)
        self.transfer_matrix = self.transfer_matrix / np.sum(self.transfer_matrix, axis=1, keepdims=True)
        self.emit_matrix = {
            state: {word: t / word_times["total"] * 1000 for word, t in word_times.items() if word != "total"}
            for state, word_times in self.emit_matrix.items()}
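    # Note: the * 1000 factor above is the original repo's simple guard against
    # floating-point underflow when Viterbi multiplies many tiny probabilities.
    # Every candidate path picks up the same factor per emission, so the argmax
    # is unchanged; a log-probability formulation would be the cleaner fix.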
    # Count statistics over the corpus, then normalize; the three matrices are
    # cached in three_matrix.pkl (delete the file to force retraining).
    def train(self):
        if os.path.exists("three_matrix.pkl"):
            self.init_matrix, self.transfer_matrix, self.emit_matrix = pickle.load(open("three_matrix.pkl", "rb"))
            return
        for words, states in tqdm(zip(self.all_texts, self.all_states)):
            if not words:  # skip blank lines (e.g., a trailing newline)
                continue
            words = words.split(" ")
            states = states.split(" ")
            self.cal_init_matrix(states[0])
            self.cal_transfer_matrix(states)
            self.cal_emit_matrix(words, states)
        self.normalize()  # normalize once, after all counts are collected
        pickle.dump([self.init_matrix, self.transfer_matrix, self.emit_matrix], open("three_matrix.pkl", "wb"))
# Standard Viterbi decoding: find the most probable BMES state sequence for
# the input text, then insert a space after every S or E state.
def viterbi_t(text, hmm):
    states = hmm.index_to_states
    emit_p = hmm.emit_matrix
    trans_p = hmm.transfer_matrix
    start_p = hmm.init_matrix
    V = [{}]   # V[t][y]: best path score ending in state y at position t
    path = {}  # path[y]: best state sequence ending in state y
    for y in states:
        V[0][y] = start_p[hmm.states_to_index[y]] * emit_p[y].get(text[0], 0)
        path[y] = [y]
    for t in range(1, len(text)):
        V.append({})
        newpath = {}
        # Unseen character: no state ever emitted it during training.
        neverSeen = text[t] not in emit_p['S'].keys() and \
                    text[t] not in emit_p['M'].keys() and \
                    text[t] not in emit_p['E'].keys() and \
                    text[t] not in emit_p['B'].keys()
        for y in states:
            # For unseen characters fall back to emission probability 1.0,
            # so the transition probabilities alone decide the state.
            emitP = emit_p[y].get(text[t], 0) if not neverSeen else 1.0
            temp = []
            for y0 in states:
                if V[t - 1][y0] >= 0:  # probabilities are non-negative, so every predecessor is kept
                    temp.append((V[t - 1][y0] * trans_p[hmm.states_to_index[y0], hmm.states_to_index[y]] * emitP, y0))
            (prob, state) = max(temp)  # best predecessor state for y
            V[t][y] = prob
            newpath[y] = path[state] + [y]
        path = newpath
    # Pick the best complete path and cut the text at S/E states.
    (prob, state) = max([(V[len(text) - 1][y], y) for y in states])
    result = ""
    for t, s in zip(text, path[state]):
        result += t
        if s == "S" or s == "E":
            result += " "
    return result
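# How the cutting rule behaves on a hypothetical decoded path:
#   text = "总得懂", path = ["B", "E", "S"]  ->  "总得 懂 "
#   (a space is appended after every S or E state)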
if __name__ == "__main__":
    text_to_state()
    text = "一个人无论学什么专业,总得懂一些文学知识,有一点艺术素养,这对于丰富自己的思想和生活,提高自己的审美能力很有好处"
    hmm = HMM()
    hmm.train()
    result = viterbi_t(text, hmm)
    print(result)
Output:
一个 人 无 论学 什么 专业 , 总得 懂 一些 文学 知识 , 有 一点 艺术 素养 , 这 对于 丰富 自己 的 思想 和 生活 , 提高 自己 的 审美 能力 很 有 好处
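The result is mostly correct, but 无论 is cut wrongly into 无 论学: the demo trains on only the first 200 corpus lines (the [:200] slices in HMM.__init__), so many transition and emission counts are sparse. A minimal tweak, assuming the cached three_matrix.pkl is deleted first, is to drop the slices and train on the whole corpus:

self.all_states = open(file_state, "r", encoding="utf-8").read().split("\n")
self.all_texts = open(file_text, "r", encoding="utf-8").read().split("\n")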