【字典树】用python实现Trie树

字典树常用做高效的文本词语保存,适用于敏感词过滤、关键词提取等场景。在字典树中相同前缀的词之间共享相同的树节点和路径。

字典树结构一般包括如下功能和属性:(1)构建;(2)添加;(3)删除;(4)前缀统计;(5)搜索

  • 实现一:通过字典的嵌套来实现
class Trie(object):
    """
    实现1:通过python自带的字典结构
    具有如下基本功能:
    (1)根据一组words进行TrieTree的构建
    (2)添加某个word
    (3)查询某个word
    (4)删除某个word
    """
    def __init__(self):
        self.trie = {}
        self.count = 0

    def __repr__(self):
        return str(self.trie)

    def buildTree(self, wordList):
        for word in wordList:
            t = self.trie             # 指向各节点的指针,初始化为root节点
            for w in word:
                if w not in t:
                    t[w] = {'count': 0}
                t[w]['count'] += 1
                t = t[w]

            self.count += 1
            t['end'] = 1

    def add(self, word):
        t = self.trie
        for w in word:
            if w not in t:
                t[w] = {'count': 0}
            t[w]['count'] += 1
            t = t[w]

        self.count += 1
        t['end'] = 1

    def delete(self, word):
        # 仅仅改变end和count属性,字符串仍存在于存储中
        # 先确定是否存在,若存在沿着的每条路径的count都需要-1
        if not self.search(word):
            return False

        t = self.trie
        for w in word:
            t = t[w]
            t['count'] -= 1

        self.count -= 1
        t['end'] = 0


    def search(self, word):
        t = self.trie
        for w in word:
            if w not in t:
                return False
            t = t[w]
        if t.get('end') == 1:
            return True
        return False

    def prefix_count(self, prefix):
        t = self.trie
        for w in prefix:
            if w not in t:
                return -1
            t = t[w]
        return t['count']
  • 实现二:通过递归的节点类实现
class Trie(object):
    """
    另一种实现
    Trie可视作一种递归结构
    """
    def __init__(self, depth=0):
        self.children = {}             # {key: Trie()},   key为每个层级对应的字符
        self.depth = depth           # 在整个字典树中的层级,root为0,因此可以视为word的长度
        self.end = False             # word终止标志
        self.count = 0               # 以当前节点前序子串为前缀的word数量
        self.words = []

    def __repr__(self):
        return str(self.words)

    def insert(self, word: str) -> None:
        cur = self
        cur.count += 1
        for w in word:
            if w not in cur.children:
                cur.children[w] = Trie(cur.depth+1)
            cur = cur.children[w]
            cur.count += 1
            cur.words.append(word)

        cur.end = True

    def buildTree(self, words: list) -> None:
        for word in words:
            self.insert(word)

    def prefix(self, pref):
        cur = self
        for p in pref:
            if p not in cur.children:
                return None
            cur = cur.children[p]
        return cur.words

    def search(self, word):
        cur = self
        for w in word:
            if w not in cur.children:
                return False
            cur = cur.children[w]
        if cur.end:
            return True
        else:
            return False

    def remove(self, word):
        if not self.search(word):
            return False
        cur = self
        cur.count -= 1
        for w in word:
            cur = cur.children[w]
            cur.count -= 1
        cur.end = False