將問題進行分析,和系統已有的分類進行關聯
構建字典數據
將構建的知識圖譜字典化,用於後面對問題的解析。下圖爲症狀的字典,其它字典同理
構建 Trie 字典樹
讀取字典數據,組裝成集合
# Directory that contains this file. os.path.dirname is used instead of
# splitting on '/', which produces an empty directory for Windows-style
# back-slash paths.
cur_dir = os.path.dirname(os.path.abspath(__file__))
# Paths of the feature-word dictionaries
self.disease_path = os.path.join(cur_dir, '../dict/disease.txt')
self.check_path = os.path.join(cur_dir, '../dict/examine.txt')
self.drug_path = os.path.join(cur_dir, '../dict/drug.txt')
self.food_path = os.path.join(cur_dir, '../dict/food.txt')
self.symptom_path = os.path.join(cur_dir, '../dict/symptom.txt')
self.deny_path = os.path.join(cur_dir, '../dict/deny.txt')  # only the path is set here; the file is not read in this excerpt
# Load the dictionaries: one entry per line, surrounding whitespace stripped,
# blank lines skipped. Each file is opened in a `with` block so its handle is
# closed as soon as the list has been built.
with open(self.disease_path, encoding="utf-8") as fh:
    self.disease_wds = [line.strip() for line in fh if line.strip()]  # e.g. ['乾眼', '右膝髕上囊及關節腔少量積液']
with open(self.check_path, encoding="utf-8") as fh:
    self.check_wds = [line.strip() for line in fh if line.strip()]  # e.g. ['膝關節核磁', '視力', '砂眼', '辨色力', '角膜', '眼底']
with open(self.drug_path, encoding="utf-8") as fh:
    self.drug_wds = [line.strip() for line in fh if line.strip()]
with open(self.food_path, encoding="utf-8") as fh:
    self.food_wds = [line.strip() for line in fh if line.strip()]
with open(self.symptom_path, encoding="utf-8") as fh:
    self.symptom_wds = [line.strip() for line in fh if line.strip()]  # e.g. ['畏光', '乾澀', '看東西有時候清楚有時候不清楚']
# Merge the entries of all five loaded dictionaries into one de-duplicated set,
# e.g. {'乾眼', '膝關節核磁', '視力', '畏光', '乾澀', ...}
self.region_words = set(self.disease_wds + self.check_wds + self.drug_wds + self.food_wds + self.symptom_wds)
構建 Trie 字典樹
Trie字典樹:https://www.cnblogs.com/vipsoft/p/17722820.html
Aho-Corasick 算法 AC自動機實現:https://www.cnblogs.com/vipsoft/p/17722761.html
# 目的是爲了將來對用戶提的問題,進行關鍵詞快速提取
def build_actree(self, word_list):
    """
    Build an Aho-Corasick automaton over the given words to speed up keyword filtering.

    :param word_list: iterable of words to index
    :return: the automaton after ``make_automaton()`` has been called; the value
        stored for each word is the tuple ``(index, word)``
    """
    automaton = ahocorasick.Automaton()
    for position, term in enumerate(word_list):
        # Insert the word into the trie together with its (position, word) payload
        automaton.add_word(term, (position, term))
    automaton.make_automaton()
    return automaton
按實體組裝字典
# 將 ['乾眼', '右膝髕上囊及關節腔少量積液','膝關節核磁', '視力', '砂眼', '辨色力', '角膜', '眼底'],進行分類,組裝成不同類型的字典
def build_wdtype_dict(self):
    """
    Map every word in ``self.region_words`` to the list of entity types it belongs to.

    A word that occurs in several dictionaries receives several labels, always
    in the fixed order disease, check, drug, food, symptom.

    :return: dict of ``word -> list of type labels``
    """
    # Convert each word list to a set once, so every membership test below is
    # O(1) instead of a linear scan of the list for every word.
    typed_vocab = (
        ('disease', set(self.disease_wds)),
        ('check', set(self.check_wds)),
        ('drug', set(self.drug_wds)),
        ('food', set(self.food_wds)),
        ('symptom', set(self.symptom_wds)),
    )
    return {
        wd: [label for label, vocab in typed_vocab if wd in vocab]
        for wd in self.region_words
    }
問題分析
通過AC算法,過濾關鍵詞
# "請問最近看東西有時候清楚有時候不清楚是怎麼回事"
def check_medical(self, question):
    """
    Extract the known dictionary words contained in a question and look up their types.

    :param question: the question text to scan
    :return: dict of ``word -> self.wdtype_dict.get(word)`` for every matched
        word that is not a proper substring of another matched word, in order
        of first match
    """
    # Every item yielded by region_tree.iter() carries the stored payload at
    # index 1; the payload's second element is the matched word
    # (assumes the tree was produced by build_actree -- TODO confirm).
    matched = [hit[1][1] for hit in self.region_tree.iter(question)]
    # Compare distinct words only, so repeated matches do not multiply the
    # pairwise substring checks.
    distinct = set(matched)
    # A word contained in a longer matched word is redundant. A set keeps the
    # membership test in the final filter O(1).
    shadowed = {
        short
        for short in distinct
        if any(short != longer and short in longer for longer in distinct)
    }
    # Iterate over `matched` (not `distinct`) to keep first-match key order.
    return {
        wd: self.wdtype_dict.get(wd)
        for wd in matched
        if wd not in shadowed
    }
解析出問題的類型
# Record the entities recognised in the question
data['args'] = medical_dict
# Fallback: when no question type has been selected but a symptom entity is
# present, classify the question as 'symptom_disease'
if question_types == [] and 'symptom' in types:
    question_types = ['symptom_disease']
# Store the final list of question types in the result dictionary
data['question_types'] = question_types
輸出字典
# Example 1: question text
question = "請問最近看東西有時候清楚有時候不清楚是怎麼回事"
# Final output
data = {'args': {'看東西有時候清楚有時候不清楚': ['symptom']}, 'question_types': ['symptom_disease']}
# Example 2: question text
question = "乾眼常用藥有哪些"
# Final output
data = {'args': {'乾眼': ['disease']}, 'question_types': ['disease_drug']}
# Example 3: question text
question = "乾眼哪些不能喫"
# Final output
data = {'args': {'乾眼': ['disease']}, 'question_types': ['disease_not_food']}
後面根據 question_types 生成 CQL語句