参考链接:
https://blog.csdn.net/lcwdzl/article/details/78493637
代码源码地址: https://github.com/lankuohsing/Study_NLP
1. 机械分词算法
1.1. 正向最大匹配算法
# In[]
# Demo dictionary: the only multi-character strings the segmenter may emit.
custom_dict = {
    "机械", "分词", "方法", "机械分词方法", "又", "叫", "基于", "字符串", "匹配",
    "的", "它", "是", "按照", "一定的", "策略", "将", "待分析的", "与", "一个", "充分",
    "充分大的", "词典", "中", "词条", "进行", "若", "在", "找到", "某个", "则", "成功",
    "识别出", "词", "这是", "最", "简单", "分词方法", "但", "非常", "高效", "和", "常见",
}
input_sentence = "机械分词方法又叫基于字符串匹配的分词方法,它是按照一定的策略将待分析的字符串与一个“充分大的”词典中的词条进行匹配,若在词典中找到某个字符串,则匹配成功(识别出一个词)。这是最简单的分词方法,但非常高效和常见。"
# The longest dictionary entry bounds the matching window; it is also capped
# at the sentence length because a longer window could never match.
max_word_len = min(max(map(len, custom_dict), default=0), len(input_sentence))
# In[]


def forward_max_match(sentence, dictionary, max_word_len=None):
    """Segment *sentence* with the forward maximum matching algorithm.

    Scanning left to right, take the longest prefix of the remaining text
    (at most ``max_word_len`` characters) that is contained in
    ``dictionary``. If no prefix matches, emit the single leading character
    as its own token.

    Args:
        sentence: The text to segment.
        dictionary: A container of known words supporting ``in`` and
            iteration (a ``set`` gives O(1) lookups).
        max_word_len: Maximum candidate length. Defaults to the length of
            the longest entry in ``dictionary``.

    Returns:
        The list of tokens in order; joining them reproduces ``sentence``.
    """
    if max_word_len is None:
        max_word_len = max(map(len, dictionary), default=0)
    # A zero-width window (e.g. empty dictionary) would never consume any
    # input and loop forever, so always look at one character at least.
    window = max(max_word_len, 1)

    tokens = []
    start = 0
    total = len(sentence)
    while start < total:
        end = min(start + window, total)
        # Shrink the candidate from the right until it is a known word or
        # only one character is left (unknown characters stand alone).
        while end - start > 1 and sentence[start:end] not in dictionary:
            end -= 1
        tokens.append(sentence[start:end])
        # Advance a cursor instead of re-slicing the remaining sentence.
        start = end
    return tokens


seg_results = forward_max_match(input_sentence, custom_dict, max_word_len)
# In[]
print(seg_results)
|