参考链接:
https://www.cnblogs.com/nullzx/p/7499397.html
1、Trie前缀树
用途:字符串排序统计,效率优于哈希算法
本质:N叉树
class TrieNode():
def __init__(self):
self.data = {}
self.is_word = False
class Trie:
def __init__(self):
"""
Initialize your data structure here.
"""
self.root = TrieNode()
def insert(self, word):
"""
Inserts a word into the trie.
:type word: str
:rtype: void
"""
node = self.root
for chars in word:
child = node.data.get(chars)
if not child:
node.data[chars] = TrieNode()
node = node.data[chars]
node.is_word = True
def search(self, word):
"""
Returns if the word is in the trie.
:type word: str
:rtype: bool
"""
node = self.root
for chars in word:
node = node.data.get(chars)
if not node:
return False
return node.is_word # 判断单词是否是完整的存在在trie树中
def startsWith(self, prefix):
"""
Returns if there is any word in the trie that starts with the given prefix.
:type prefix: str
:rtype: bool
"""
node = self.root
for chars in prefix:
node = node.data.get(chars)
if not node:
return False
return True
def get_start(self, prefix):
"""
Returns words started with prefix
:param prefix:
:return: words (list)
"""
def get_key(pre, pre_node):
word_list = []
if pre_node.is_word:
word_list.append(pre)
for x in pre_node.data.keys():
word_list.extend(get_key(pre + str(x), pre_node.data.get(x)))
return word_list
words = []
if not self.startsWith(prefix):
return words
if self.search(prefix):
words.append(prefix)
return words
node = self.root
for chars in prefix:
node = node.data.get(chars)
return get_key(prefix, node)
trie = Trie()
trie.insert("something")
trie.insert("somebody")
trie.insert("somebody1")
trie.insert("somebody3")
print(trie.search("key"))
print(trie.search("somebody3"))
print(trie.get_start('somebody'))
2. AC自动机
每个节点带有fail指针,同KMP算法一样,指向与自己具有最大公共前缀串的位置。通过动态规划思想,若孩子节点在父节点fail指向的孩子节点里,子节点fail指针为父节点fail指针的子节点,否则指向fail节点的fail节点,直到无法fail指向根节点。
class TrieNode():
def __init__(self):
self.child = {}
self.failto = None
self.is_word = False
'''
下面节点值可以根据具体场景进行赋值
'''
self.str_ = ''
self.num = 0
class AhoCorasickAutomation:
def __init__(self):
"""
Initialize your data structure here.
"""
self.root = TrieNode()
def buildTrieTree(self, wordlst):
for word in wordlst:
cur = self.root
for i, c in enumerate(word):
if c not in cur.child:
cur.child[c] = TrieNode()
ps = cur.str_
cur = cur.child[c]
cur.str_ = ps + c
cur.is_word = True
cur.num += 1
def build_AC_from_Trie(self):
queue = []
for child in self.root.child:
self.root.child[child].failto = self.root
queue.append(self.root.child[child])
while len(queue) > 0:
cur = queue.pop(0)
for child in cur.child.keys():
failto = cur.failto
while True:
if failto == None:
cur.child[child].failto = self.root
break
if child in failto.child:
cur.child[child].failto = failto.child[child]
break
else:
failto = failto.failto
queue.append(cur.child[child])
def ac_search(self, str_):
cur = self.root
result = {}
i = 0
n = len(str_)
while i < n:
c = str_[i]
if c in cur.child:
cur = cur.child[c]
if cur.is_word:
temp = cur.str_
result.setdefault(temp, [])
result[temp].append([i - len(temp) + 1, i, cur.num])
'''
处理所有其他长度公共字串
'''
fl = cur.failto
while fl:
if fl.is_word:
temp = fl.str_
result.setdefault(temp, [])
result[temp].append([i - len(temp) + 1, i, cur.failto.num])
fl = fl.failto
i += 1
else:
cur = cur.failto
if cur == None:
cur = self.root
i += 1
return result
acTree = AhoCorasickAutomation()
acTree.buildTrieTree(['abcdef', 'abhab', 'bcd', 'cde', 'cd', 'cdfkcdf', 'cde'])
acTree.build_AC_from_Trie()
res = acTree.ac_search('bcabcdebcedfabcdefababkabhabkcdbcde')
print(res)
import ahocorasick
ac = ahocorasick.Automaton()
wordlst = ['abcdef', 'abhab', 'bcd', 'cd', 'cde', 'cdfkcdf', 'cde']
for word in wordlst:
ac.add_word(word, [word])
ac.make_automaton()
for item in ac.iter('bcabcdebcedfabcdefababkabhabkcdbcde'):
print(item)
|