【字典树】用python实现Trie树
字典树常用做高效的文本词语保存,适用于敏感词过滤、关键词提取等场景。在字典树中相同前缀的词之间共享相同的树节点和路径。
字典树结构一般包括如下功能和属性:(1)构建;(2)添加;(3)删除;(4)前缀统计;(5)搜索
- 实现一:通过字典的嵌套来实现
class Trie(object):
"""
实现1:通过python自带的字典结构
具有如下基本功能:
(1)根据一组words进行TrieTree的构建
(2)添加某个word
(3)查询某个word
(4)删除某个word
"""
def __init__(self):
self.trie = {}
self.count = 0
def __repr__(self):
return str(self.trie)
def buildTree(self, wordList):
for word in wordList:
t = self.trie # 指向各节点的指针,初始化为root节点
for w in word:
if w not in t:
t[w] = {'count': 0}
t[w]['count'] += 1
t = t[w]
self.count += 1
t['end'] = 1
def add(self, word):
t = self.trie
for w in word:
if w not in t:
t[w] = {'count': 0}
t[w]['count'] += 1
t = t[w]
self.count += 1
t['end'] = 1
def delete(self, word):
# 仅仅改变end和count属性,字符串仍存在于存储中
# 先确定是否存在,若存在沿着的每条路径的count都需要-1
if not self.search(word):
return False
t = self.trie
for w in word:
t = t[w]
t['count'] -= 1
self.count -= 1
t['end'] = 0
def search(self, word):
t = self.trie
for w in word:
if w not in t:
return False
t = t[w]
if t.get('end') == 1:
return True
return False
def prefix_count(self, prefix):
t = self.trie
for w in prefix:
if w not in t:
return -1
t = t[w]
return t['count']
- 实现二:通过递归的节点类实现
class Trie(object):
"""
另一种实现
Trie可视作一种递归结构
"""
def __init__(self, depth=0):
self.children = {} # {key: Trie()}, key为每个层级对应的字符
self.depth = depth # 在整个字典树中的层级,root为0,因此可以视为word的长度
self.end = False # word终止标志
self.count = 0 # 以当前节点前序子串为前缀的word数量
self.words = []
def __repr__(self):
return str(self.words)
def insert(self, word: str) -> None:
cur = self
cur.count += 1
for w in word:
if w not in cur.children:
cur.children[w] = Trie(cur.depth+1)
cur = cur.children[w]
cur.count += 1
cur.words.append(word)
cur.end = True
def buildTree(self, words: list) -> None:
for word in words:
self.insert(word)
def prefix(self, pref):
cur = self
for p in pref:
if p not in cur.children:
return None
cur = cur.children[p]
return cur.words
def search(self, word):
cur = self
for w in word:
if w not in cur.children:
return False
cur = cur.children[w]
if cur.end:
return True
else:
return False
def remove(self, word):
if not self.search(word):
return False
cur = self
cur.count -= 1
for w in word:
cur = cur.children[w]
cur.count -= 1
cur.end = False