構建詞典庫
# Build the vocabulary set: one valid word per line in vocab.txt.
# str.rstrip() strips trailing whitespace (i.e. the newline) from each line.
with open('vocab.txt') as vocab_file:
    vocab = {line.rstrip() for line in vocab_file}
print(vocab)
需要生成所有候選集合
def generate_candidates(word, lexicon=None):
    """Return all valid candidate corrections for *word*.

    Generates every string at edit distance 1 from *word*
    (insert / delete / replace over the 26 lowercase letters) and keeps
    only those present in the vocabulary.

    word: the (possibly misspelled) input token
    lexicon: optional set of valid words to filter against; defaults to
             the module-level ``vocab`` built from vocab.txt
    """
    if lexicon is None:
        lexicon = vocab
    # Only the 26 lowercase letters are considered: we cannot predict
    # arbitrary characters (e.g. punctuation) the user meant to type, but
    # delete/replace still catches a letter mistyped as something else.
    letters = "abcdefghijklmnopqrstuvwxyz"
    # Every way to split the word into a (left, right) pair.
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    # insert: appl -> bappl, cappl, ...
    inserts = [L + c + R for L, R in splits for c in letters]
    # delete: appl -> ppl, apl, app, ...
    deletes = [L + R[1:] for L, R in splits if R]
    # replace: appl -> bppl, cppl, ...
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    candidates = set(inserts + deletes + replaces)
    # Keep only candidates that exist in the dictionary.
    return [w for w in candidates if w in lexicon]
# Smoke test: the result is discarded; this just exercises the function.
generate_candidates('apple')
讀取路透社語料庫
# Load the Reuters corpus as training data for the bigram language model.
import nltk
from nltk.corpus import reuters
nltk.download('reuters')  # download the corpus if it is not already present
categories = reuters.categories()
# corpus: tokenized sentences (lists of word strings) across all categories
corpus = reuters.sents(categories = categories)
構建語言模型:bigram
# Build the bigram language model: unigram and bigram frequency tables.
term_count = {}
bigram_count = {}
for sent in corpus:
    # Prepend a sentence-start marker so the first real word has a context.
    tokens = ['<s>'] + sent
    for i in range(len(tokens) - 1):
        # Count the left token of each adjacent pair.
        left = tokens[i]
        term_count[left] = term_count.get(left, 0) + 1
        # Count the pair itself, keyed as "w1 w2".
        pair = ' '.join(tokens[i:i + 2])
        bigram_count[pair] = bigram_count.get(pair, 0) + 1
# print(bigram_count)
# sklearn has ready-made utilities for this
用戶打錯的概率統計 - channel probability
# Channel model: P(mistake | correct), estimated from spell-errors.txt.
# Each line looks like "correct: mistake1, mistake2, ...".
channel_prob = {}
with open('spell-errors.txt') as errors_file:
    for line in errors_file:
        items = line.split(':')
        correct = items[0].strip()
        mistakes = [item.strip() for item in items[1].strip().split(',')]
        # Spread the probability mass uniformly over the observed mistakes
        # for this correct word — a crude conditional-probability estimate.
        channel_prob[correct] = {mis: 1.0 / len(mistakes) for mis in mistakes}
print(channel_prob)
主函數
# Main loop: spell-correct each test sentence with a noisy-channel model.
# score(candidate) = log P(candidate | prev word)   [bigram LM, add-one smoothed]
#                  + log P(observed word | candidate)  [channel model]
import numpy as np

V = len(term_count.keys())  # vocabulary size, used for add-one smoothing
with open('testdata.txt', 'r') as test_file:
    for raw_line in test_file:
        items = raw_line.rstrip().split('\t')
        # items[2] is the sentence; drop the trailing punctuation character.
        tokens = items[2][:-1].split()
        # e.g. ['I', 'like', 'playing']
        for pos, word in enumerate(tokens):
            # Strip trailing punctuation / clitics so dictionary lookup works.
            if word[-1] in [",", "'"]:
                word = word[:-1]
            elif word[-2:] in ["'s", "'t"]:
                word = word[:-2]
            if word in vocab:
                continue
            # Step 1: generate all valid candidate corrections.
            candidates = generate_candidates(word)
            # TODO: when empty, widen the search (e.g. edit distance <= 2)
            # instead of silently skipping the word.
            if len(candidates) < 1:
                continue
            probs = []
            # For each candidate compute
            # score = log p(correct) + log p(mistake | correct)
            # and keep the candidate with the highest score.
            for candi in candidates:
                prob = 0
                # a. channel probability: P(word | candi) if this mistake was
                #    observed in spell-errors.txt, else a tiny floor value.
                if candi in channel_prob and word in channel_prob[candi]:
                    prob += np.log(channel_prob[candi][word])
                else:
                    prob += np.log(0.0001)  # crude smoothing shortcut
                # b. bigram LM probability of candi given the previous TOKEN.
                #    NOTE(fix): the original indexed into the raw sentence
                #    string, producing a 2-character substring that could
                #    never match a "w1 w2" key, so this branch never ran.
                #    Use the token list, with '<s>' at sentence start to
                #    match how bigram_count/term_count were built.
                prev = tokens[pos - 1] if pos > 0 else '<s>'
                bigram = prev + ' ' + candi
                if bigram in bigram_count:
                    # add-one smoothing over the V-word vocabulary
                    prob += np.log((bigram_count[bigram] + 1.0) /
                                   (term_count.get(prev, 0) + V))
                else:
                    prob += np.log(1.0 / V)
                # TODO: also score the following bigram [candi, next word].
                probs.append(prob)
            max_idx = probs.index(max(probs))
            print(word, candidates[max_idx])
inverted_index = {}  # a dict is the natural data structure for building an inverted index