示意图:
import collections
class TireNode(object):
def __init__(self):
self.children = collections.defaultdict(TireNode)
self.is_word = False
class Tire(object):
def __init__(self):
self.root = TireNode()
def insert(self,word):
current = self.root
for char in word:
current = current.children[char]
current.is_word = True
def search(self,word):
current = self.root
for char in word:
current = current.children.get(char)
if not current:
return False
return current.is_word
def startsWith(self,word):
current = self.root
for char in word:
current = current.children.get(char)
if not current:
return False
return True
def enumerateMatch(self,sentence,space = '_'):
current = self.root
sentence = list(sentence)
matched = []
while len(sentence) - 1:
if self.search(sentence):
matched.append('_'.join(sentence))
del sentence[-1]
return matched
t = Tire()
gazetteer = ['南京', '南京市', '市长', '长江', '长江大桥', '大桥']
for gaz in gazetteer:
t.insert(gaz)
matchs = ['南京', '南京市', '市长', '长江', '长江大桥', '大桥','江大桥','小黑']
print('###测试insert与search:\n')
for match in matchs:
print(match,':',t.search(match))
words = ['南', '京', '市', '长', '江', '大', '桥']
print('###测试enumerateMatch:\n')
print(t.enumerateMatch(words))
print('###应用Tire Tree,找到每一个char的对应的word:\n')
words = ['南', '京', '市', '长', '江', '大', '桥']
matched = []
for _ in range(len(words)):
matched.append([])
for index in range(len(words)):
current_word = words[index:]
print(index,''.join(current_word),end = ':')
result = t.enumerateMatch(current_word)
for res in result:
matched[index + len(res.split('_')) - 1].append(res)
print(' ',result)
print('matched:',matched)
输出:
###测试insert与search:
南京 : True 南京市 : True 市长 : True 长江 : True 长江大桥 : True 大桥 : True 江大桥 : False 小黑 : False ###测试enumerateMatch:
[‘南_京_市’, ‘南_京’] ###应用Tire Tree,找到每一个char的对应的word:
0 南京市长江大桥: [‘南_京_市’, ‘南_京’] 1 京市长江大桥: [] 2 市长江大桥: [‘市_长’] 3 长江大桥: [‘长_江_大_桥’, ‘长_江’] 4 江大桥: [] 5 大桥: [‘大_桥’] 6 桥: [] matched: [[], [‘南_京’], [‘南_京_市’], [‘市_长’], [‘长_江’], [], [‘长_江_大_桥’, ‘大_桥’]]
|