說明:
在本人的工作中遇到了這樣的一個問題。需要用到字符串匹配的功能。
一邊是300字左右的句子,另一邊是幾個到幾十個詞的短語,然後過濾出包含短語的句子。
兩邊的數量都在千萬級別。最直接的想法是用兩層for循環兩兩匹配,但這樣時間複雜度太高,效率很低。
因此在這裡實現了基於多叉樹(Trie,前綴樹)的樹形匹配方式。(如果有問題,歡迎指正)
class Trie():
    """Character-level prefix tree (trie) for dictionary phrase matching.

    The tree is stored as nested dicts under ``self.trie["head"]``. Every key
    is a single character, except the multi-character key ``"<end>"``, which
    marks a node where a complete dictionary word terminates.
    """

    # Marker key stored in the node at which a dictionary word ends.
    _END = "<end>"
    # Dictionary file used when ``init_trie`` is given an empty path.
    _DEFAULT_PATH = r"D:/dict_words"

    def __init__(self):
        self.trie = {"head": {}}

    def init_trie(self, path, max_lines=1001):
        """Populate the trie from a UTF-8, tab-separated dictionary file.

        Each line is stripped and handed to ``add_word``, which uses the
        second tab-separated field as the word. Lines that contain no tab
        after stripping (blank or malformed lines) are skipped instead of
        aborting the load.

        Args:
            path: Dictionary file to read. An empty value falls back to the
                built-in default location, so ``init_trie("")`` keeps working.
            max_lines: Maximum number of lines to read. The default of 1001
                matches the number of lines the former hard-coded cut-off
                loaded. Pass ``None`` to read the whole file.

        Raises:
            OSError: If the file cannot be opened.
        """
        path = path or self._DEFAULT_PATH
        with open(path, "r", encoding="utf-8") as reader:
            for count, line in enumerate(reader, start=1):
                if max_lines is not None and count > max_lines:
                    break
                line = line.strip()
                if "\t" in line:
                    self.add_word(line)
        print("over")

    def add_word(self, word):
        """Insert one dictionary entry into the trie.

        Args:
            word: Either a tab-separated record whose second field is the
                word, or a plain word containing no tab. Whitespace around
                the word is ignored, and an empty word is not inserted.
        """
        fields = word.split("\t")
        text = (fields[1] if len(fields) > 1 else fields[0]).strip()
        if not text:
            return
        node = self.trie["head"]
        for char in text:
            node = node.setdefault(char, {})
        node.setdefault(self._END, {})

    def is_contain_word_wap(self, sent):
        """Return True if any dictionary word occurs anywhere in ``sent``.

        ``None`` and empty input yield False.
        """
        if not sent:
            return False
        head = self.trie["head"]
        end = self._END
        length = len(sent)
        # Walk the trie from every start offset by index, so no slice or
        # list copy of the sentence is made per offset.
        for start in range(length):
            node = head
            for pos in range(start, length):
                node = node.get(sent[pos])
                if node is None:
                    break
                if end in node:
                    return True
        return False

    def is_contain_word(self, sent):
        """Return True if some dictionary word is a prefix of ``sent``.

        The end marker is checked at every node on the way down, so a short
        word is still found when a longer entry shares its prefix.
        """
        node = self.trie["head"]
        for char in sent:
            node = node.get(char)
            if node is None:
                return False
            if self._END in node:
                return True
        return False

    def is_in_dict(self, word):
        """Return True only if ``word`` is exactly one of the dictionary words."""
        head = self.trie["head"]
        node = head
        for char in word:
            node = node.get(char)
            if node is None:
                return False
        # Still at the root means the input was empty, which is never a match.
        return node is not head and self._END in node
if __name__ == "__main__":
    # Small demo: build the trie, then print the result of a few
    # exact-lookup and substring-containment checks.
    print("running ")
    trie = Trie()
    # The argument is an empty string; the dictionary location is decided
    # inside init_trie.
    trie.init_trie("")

    lookup_samples = (
        "寧夏羊掌櫃牧業有限公司",
        "寧夏?掌櫃牧業有限公司",
        "??寧夏羊掌櫃牧業有限公司",
    )
    for candidate in lookup_samples:
        print(trie.is_in_dict(candidate))

    containment_samples = (
        "寧夏羊掌櫃牧業有限公司???",
        "???寧夏羊掌櫃牧業有限公",
        "???寧夏羊掌櫃牧業有限公司",
    )
    for sentence in containment_samples:
        print(trie.is_contain_word_wap(sentence))