# 基於互信息與左右信息熵的新詞發現 — new-word discovery based on mutual information (PMI) and left/right information entropy

import re
from collections import Counter
import numpy as np
def ngram_words(file, ngram_cont):
    """Count every character n-gram of length 1..ngram_cont in the text.

    file: the raw text (a str) to slice into character n-grams.
    ngram_cont: the maximum n-gram length, inclusive.
    Returns a dict mapping each n-gram to its occurrence count.
    """
    words = []
    # Original used range(1, ngram_cont), which silently dropped the
    # longest (ngram_cont-character) grams; include them as documented.
    for n in range(1, ngram_cont + 1):
        words.extend(file[j:j + n] for j in range(len(file) - n + 1))
    return dict(Counter(words))  # collapse the list into frequency counts
def PMI(words_fre, pmi_threshold):
    """Filter multi-character grams whose cohesion score exceeds a threshold.

    words_fre: dict mapping gram -> raw count (as built by ngram_words).
    pmi_threshold: minimum score for a gram to be kept as a new-word candidate.
    Returns the list of grams (length >= 2) that pass, in dict order.

    NOTE(review): the score is count(xy) / min over splits of count(x)*count(y)
    -- raw counts, not probabilities, so this is not true PMI. The caller's
    threshold (0.05) is tuned to this unnormalized form, so it is preserved.
    """
    new_words = []
    for word in words_fre:
        if len(word) == 1:
            continue  # single characters cannot be split into x, y
        # Worst-case (most cohesive) split: minimize count(x) * count(y).
        p_x_p_y = min(words_fre.get(word[:j]) * words_fre.get(word[j:])
                      for j in range(1, len(word)))
        if words_fre.get(word) / p_x_p_y > pmi_threshold:
            new_words.append(word)
    return new_words
def calculate_entropy(items):
    """Return the Shannon entropy (in bits) of the symbols in `items`.

    items: a sequence of hashable symbols (here, single neighbour characters).
    Returns 0 for an empty sequence.

    The original parameter was named `list`, shadowing the builtin; renamed
    (all in-file callers pass positionally).
    """
    total = len(items)
    counts = Counter(items)  # symbol -> occurrence count
    # H = -sum(p * log2(p)); compute each probability once instead of
    # looking the count up twice per symbol as the original did.
    probs = [c / total for c in counts.values()]
    return (-1) * sum(p * np.log2(p) for p in probs)
def Entropy_left_right(words, text, ent_threshold):
    """Keep candidates whose left AND right neighbour entropies exceed a threshold.

    words: candidate new words (output of PMI).
    text: the full corpus to look each candidate up in.
    ent_threshold: minimum required for min(left entropy, right entropy).
    Returns the surviving candidates, in input order.
    """
    result_words = []
    for word in words:
        # re.escape: candidates may contain regex metacharacters; the original
        # interpolated the word raw and used a bare `except: pass`, which
        # silently dropped such words (and swallowed every other error too).
        neighbours = re.findall('(.)%s(.)' % re.escape(word), text)
        left_entropy = calculate_entropy([pair[0] for pair in neighbours])
        right_entropy = calculate_entropy([pair[1] for pair in neighbours])
        # A genuine word should appear in varied contexts on BOTH sides,
        # i.e. both neighbour distributions must have high entropy.
        if min(left_entropy, right_entropy) > ent_threshold:
            result_words.append(word)
    return result_words
# Punctuation / whitespace characters stripped from the corpus before analysis.
# NOTE: every entry is a single character (several '.'/',' duplicates included).
stop_word=['【','】',')','(','、',',','“','”','。','\n','《','》',' ','-','!','?','.','\'','[',']',':','/','.','"','\u3000','’','.',',','…','?']
with open("result.txt",'r',encoding='utf8') as f:
    text = f.read()
# All stop "words" are single characters, so one str.translate() pass
# (each codepoint mapped to None, i.e. deleted) replaces the original
# per-character replace() loop with a single C-level scan over the text.
text = text.translate(dict.fromkeys(map(ord, stop_word)))
ngram = 3             # longest n-gram length to count
PMI_threshold = 0.05  # cutoff for the unnormalized PMI cohesion score
ent_threshold = 1     # minimum left/right neighbour entropy, in bits
words_fre = ngram_words(text, ngram)                       # gram -> count
new_words = PMI(words_fre, PMI_threshold)                  # cohesive candidates
result = Entropy_left_right(new_words, text, ent_threshold)  # boundary-free words
print(result)

 

# --- scraped page footer (kept for provenance, commented so the file parses) ---
# 發表評論
# 所有評論
# 還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
# 相關文章