NLP Training Camp Q&A System: Replacing Misspelled Words in the User's Input Sentence

Build the vocabulary

# Python rstrip() strips the given trailing characters from a string (whitespace by default)
vocab = set([line.rstrip() for line in open('vocab.txt')]) 

print(vocab)

Generate all candidate words

def generate_candidates(word):
    '''
    word: the given input (a misspelled word)
    Returns all valid candidates.
    '''
    # Generate all words within edit distance 1 via
    # 1. insert  2. delete  3. replace
    # appl: replace: bppl, cppl, ...
    #       insert:  bappl, cappl, ...
    #       delete:  ppl, apl, app, ...
    
    # Assume only the 26 lowercase letters are used (we cannot predict
    # whether the user meant to type some other character such as
    # punctuation, but we do assume a letter may have been mistyped).
    letters = "abcdefghijklmnopqrstuvwxyz"
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    
    # insert
    inserts = [L + c + R for L, R in splits for c in letters]
    
    # delete
    deletes = [L + R[1:] for L, R in splits if R]
    
    # replace
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    
    candidates = set(inserts + deletes + replaces)
    
    # Filter out words that are not in the vocabulary
    return [word for word in candidates if word in vocab]
    
    
    
generate_candidates('apple')
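
When all edit-distance-1 candidates fall outside the vocabulary, a common extension (it reappears as a TODO in the main loop below) is to widen the search to edit distance 2. A minimal sketch, assuming an unfiltered helper edits1 that repeats the generation logic above; both function names are mine, not from the original code:

def edits1(word):
    # unfiltered edit-distance-1 set (same logic as generate_candidates)
    letters = "abcdefghijklmnopqrstuvwxyz"
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    inserts = [L + c + R for L, R in splits for c in letters]
    deletes = [L + R[1:] for L, R in splits if R]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    return set(inserts + deletes + replaces)

def generate_candidates2(word):
    # apply edits1 twice and filter by the vocabulary only at the end
    return list({w for e1 in edits1(word) for w in edits1(e1) if w in vocab})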

Read the Reuters corpus

import nltk
from nltk.corpus import reuters

nltk.download('reuters') # download the corpus if it is not already present

categories = reuters.categories()
corpus = reuters.sents(categories=categories)
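
reuters.sents returns the corpus as a sequence of tokenized sentences, each a list of word strings, which is exactly the shape the bigram counting below expects. A quick sanity check:

print(len(corpus))   # number of sentences
print(corpus[0])     # the first sentence, as a list of tokens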

Build the language model: bigram

term_count = {}
bigram_count = {}

for doc in corpus:
    doc = ['<s>'] + doc  # prepend a sentence-start marker
    for i in range(len(doc) - 1):
        
        # bigram: [i, i+1]
        term = doc[i]
        bigram = doc[i: i+2]
        
        if term in term_count:
            term_count[term] += 1
        else:
            term_count[term] = 1
            
        bigram = ' '.join(bigram) 
        if bigram in bigram_count:
            bigram_count[bigram] += 1
        else:
            bigram_count[bigram] = 1
            
# print(bigram_count)
# sklearn has ready-made packages for this
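
With these counts in hand, the language model score used later is the add-one (Laplace) smoothed bigram probability

    P(w_i | w_{i-1}) = (count(w_{i-1} w_i) + 1) / (count(w_{i-1}) + V)

where V is the vocabulary size. A minimal helper sketch (the name bigram_prob is mine, not from the original code):

def bigram_prob(prev_word, word):
    # add-one smoothed P(word | prev_word)
    V = len(term_count)
    bigram = prev_word + ' ' + word
    return (bigram_count.get(bigram, 0) + 1.0) / (term_count.get(prev_word, 0) + V)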

Estimating how users mistype: channel probability
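
Each line of spell-errors.txt maps a correct word to a comma-separated list of misspellings observed for it. An illustrative line (the format matches Peter Norvig's spell-errors data; this particular line is only an example):

raining: rainning, raning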

channel_prob = {}

for line in open('spell-errors.txt'):
    items = line.split(':')
    correct = items[0].strip()
    mistakes = [item.strip() for item in items[1].strip().split(',')]
    
    channel_prob[correct] = {}
    for mis in mistakes:
        # Probability that the user, meaning this correct word, typed that
        # mistake instead: a conditional probability, here uniform over
        # all of the word's listed mistakes....?
        channel_prob[correct][mis] = 1.0 / len(mistakes)
        
print(channel_prob)
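
These channel probabilities plug into the noisy channel objective used in the main function below: for a misspelled word m, pick the candidate c that maximizes

    score(c) = log p(c) + log p(m | c)

where p(c) comes from the bigram language model and p(m | c) from channel_prob.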

Main function

import numpy as np

V = len(term_count)  # vocabulary size, used for add-one smoothing

file = open('testdata.txt', 'r')
for line in file:
    items = line.rstrip().split('\t')
    line = items[2][:-1].split() # drop the sentence-final punctuation, then tokenize!!!!
    
    # e.g. ['I', 'like', 'playing']
    # enumerate gives each word's position, needed for the bigram lookup below
    for idx, word in enumerate(line):
        
        # strip interfering punctuation
        other = [",", "'"]
        if word[-1] in other:
            word = word[:-1]
        elif word[-2:] in ["'s", "'t"]:
            word = word[:-2]
            
        if word not in vocab:
            
            # word needs to be replaced with a correctly spelled one
            
            # Step 1: generate all valid candidates
            candidates = generate_candidates(word)
            
            # One option when candidates == []: generate more candidates,
            # e.g. everything within edit distance 2 (see the sketch above)
            # TODO: generate a larger candidate set on demand
            
            # Hitting this branch means that even two edit steps cannot turn
            # the string into a word we consider correct!!!
            if len(candidates) < 1:
                continue  # not recommended (this is simply wrong!!!)
                
            probs = []
            
            # For each candidate, compute its score:
            # score = p(correct) * p(mistake|correct)
            #       = log p(correct) + log p(mistake|correct)
            # Return the candidate with the highest score.
            for candi in candidates:
                prob = 0
                
                # a. channel probability
                
                # the candidate (the correct word) has this mistake in its list
                if candi in channel_prob and word in channel_prob[candi]:
                    prob += np.log(channel_prob[candi][word])
                # otherwise assign a tiny probability; either way we assume
                # the user really did mistype!
                else:
                    prob += np.log(0.0001) # not real smoothing, just a shortcut!
                    
                # b. language model probability
                # Note: the lecture version below indexes into items[2], which is
                # the raw sentence *string*, so the slice is a 2-character
                # substring that almost never matches a bigram key; this branch
                # never ran at all. Kept for reference:
                
#                 idx = items[2].index(word)+1
#                 if items[2][idx - 1] in bigram_count and candi in bigram_count[items[2][idx - 1]]:
#                     prob += np.log((bigram_count[items[2][idx - 1]][candi] + 1.0) / (
#                             term_count[bigram_count[items[2][idx - 1]]] + V))
                
                # My fix: use the token position idx from enumerate above; the
                # first word's left neighbour is the <s> sentence-start marker,
                # matching how the bigram counts were built.
                prev_word = '<s>' if idx == 0 else line[idx - 1]
                bigram = prev_word + ' ' + candi
                if bigram in bigram_count:
                    prob += np.log((bigram_count[bigram] + 1.0) /
                                   (term_count.get(prev_word, 0) + V))
                # TODO: also score the following bigram [candi, next_word]
                # prob += np.log(that bigram's probability)
                else:
                    prob += np.log(1.0 / V)
                    
                probs.append(prob)
            
            # pick the highest-scoring candidate as the correction
            max_idx = probs.index(max(probs))
            print(word, candidates[max_idx])
            
            
            
            
inverted_index = {} # a data structure one could use to build an inverted index
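
A minimal sketch of that idea, indexing the Reuters sentences loaded above (the variable names are my own):

inverted_index = {}
for sent_id, sent in enumerate(corpus):
    for w in sent:
        # map each word to the ids of the sentences that contain it
        inverted_index.setdefault(w, set()).add(sent_id)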