# 1. 机械分词算法 (Mechanical word segmentation algorithms)

# 1.1. 正向最大匹配算法 (Forward maximum matching algorithm)

# In[]
# Demo vocabulary for dictionary-based ("mechanical") word segmentation.
custom_dict = {
    "机械", "分词", "方法", "机械分词方法", "又", "叫", "基于", "字符串", "匹配",
    "的", "它", "是", "按照", "一定的", "策略", "将", "待分析的", "与", "一个", "充分",
    "充分大的", "词典", "中", "词条", "进行", "若", "在", "找到", "某个", "则", "成功",
    "识别出", "词", "这是", "最", "简单", "分词方法", "但", "非常", "高效", "和", "常见",
}

# Sample text to segment.
input_sentence = "机械分词方法又叫基于字符串匹配的分词方法，它是按照一定的策略将待分析的字符串与一个“充分大的”词典中的词条进行匹配，若在词典中找到某个字符串，则匹配成功(识别出一个词)。这是最简单的分词方法，但非常高效和常见。"

# Longest candidate worth trying: the longest dictionary entry, capped at the
# sentence length (0 when the dictionary or the sentence is empty).
max_word_len = min(
    max((len(word) for word in custom_dict), default=0),
    len(input_sentence),
)


# In[]
def forward_max_match(sentence, dictionary, max_word_len=None):
    """Segment *sentence* with the forward maximum matching algorithm.

    Scanning left to right, the longest prefix of the remaining text (at most
    ``max_word_len`` characters) that is found in *dictionary* is emitted as
    the next token. When no multi-character prefix matches, the single
    leading character is emitted, whether or not it is in the dictionary.

    Args:
        sentence: The string to segment.
        dictionary: A container of known words supporting ``in`` and
            iteration (iteration is only used when ``max_word_len`` is
            omitted).
        max_word_len: Upper bound on candidate length. Defaults to the length
            of the longest entry in *dictionary*.

    Returns:
        A list of tokens whose concatenation equals *sentence*.
    """
    if max_word_len is None:
        max_word_len = max((len(word) for word in dictionary), default=0)

    total = len(sentence)
    # Always consider at least one character so every iteration consumes
    # input; a zero-width window (empty dictionary) would otherwise emit
    # empty tokens forever.
    window = max(1, min(max_word_len, total))

    tokens = []
    start = 0
    while start < total:
        # Try the longest candidate first and shrink from the right; stop
        # before length 1 because a lone character is always accepted.
        for end in range(min(start + window, total), start + 1, -1):
            if sentence[start:end] in dictionary:
                break
        else:
            end = start + 1
        tokens.append(sentence[start:end])
        start = end
    return tokens


seg_results = forward_max_match(input_sentence, custom_dict, max_word_len)
# In[]
print(seg_results)