注:基於現有案例教程
完成一個相對簡單的 “關鍵字提取” 算法,以達到對自然語言處理的一個初步理解。
詞彙數據下載:
http://labfile.oss.aliyuncs.com/courses/741/nltk_data.tar.gz
也可以用下面的下載
import nltk

# Download the 'stopwords' and 'punkt' NLTK resources.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)
程序測試樣本數據下載:
http://labfile.oss.aliyuncs.com/courses/741/news.txt
nltk.tokenize
是 NLTK 提供的分詞工具包。所謂的分詞 tokenize
實際就是把段落分成句子,把句子分成一個個單詞的過程。我們導入的 sent_tokenize()
函數對應的是分段爲句。 word_tokenize()
函數對應的是分句爲詞。
stopwords
是一個列表,包含了英文中那些頻繁出現的詞,如 am、is、are。
defaultdict
是一個帶有默認值的字典容器。
punctuation
是一個列表,包含了英文中的標點和符號。
nlargest()
函數可以很快地求出一個容器中最大的 n
個數字。
導入這些包:
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest
基本思想:擁有關鍵詞最多的句子就是最重要的句子。我們把句子按照關鍵詞數量的多少排序,取前 n 句,即可彙總成我們的摘要。
整體代碼:
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest
# Constants used by the frequency-based summarizer.
# Tokens to ignore: English stop words plus every punctuation character.
# NOTE: this rebinds `stopwords`, shadowing the object imported from nltk.corpus.
stopwords = set(stopwords.words('english')).union(punctuation)
# Normalized-frequency cut-offs: compute_frequencies drops words whose
# frequency is >= max_cut or <= min_cut.
min_cut = 0.1
max_cut = 0.9
def compute_frequencies(word_sent):
    """Return normalized frequencies of the significant words in a text.

    Args:
        word_sent: Iterable of tokenized sentences, each an iterable of
            word strings.

    Returns:
        A ``defaultdict(int)`` mapping each retained word to its count
        divided by the highest count.  Words found in ``stopwords`` are
        never counted, and words whose normalized frequency is
        ``>= max_cut`` or ``<= min_cut`` are dropped.  The mapping is empty
        when no word survives the stop-word filter.
    """
    freq = defaultdict(int)
    for sentence in word_sent:
        for word in sentence:
            if word not in stopwords:
                freq[word] += 1

    # Nothing to normalize: max() would raise ValueError on an empty sequence.
    if not freq:
        return freq

    highest = float(max(freq.values()))
    # Iterate over a snapshot of the keys because entries are deleted below.
    for word in list(freq):
        freq[word] /= highest
        if freq[word] >= max_cut or freq[word] <= min_cut:
            del freq[word]
    return freq
def summarize(text, n):
    """Pick the n sentences of text with the highest keyword scores.

    A sentence's score is the sum of the normalized frequencies (from
    compute_frequencies) of the retained words it contains.  Sentences are
    returned best first; a sentence containing no retained word is never
    scored and therefore cannot be returned.

    Raises AssertionError when text has fewer than n sentences.
    """
    # Split the text into sentences.
    sents = sent_tokenize(text)
    # The text must contain at least n sentences.
    assert n <= len(sents)
    # Lower-case each sentence and split it into word tokens.
    word_sent = [word_tokenize(sentence.lower()) for sentence in sents]
    # freq[w] is the normalized frequency of the retained word w.
    freq = compute_frequencies(word_sent)

    # Maps a sentence index to its accumulated score (default 0).
    ranking = defaultdict(int)
    for position, tokens in enumerate(word_sent):
        weights = [freq[token] for token in tokens if token in freq]
        if weights:
            ranking[position] += sum(weights)

    return [sents[position] for position in rank(ranking, n)]
def rank(ranking, n):
    """Return up to n keys of ranking, ordered by descending value."""
    def score(key):
        return ranking.get(key)

    return nlargest(n, ranking.keys(), key=score)
if __name__ == '__main__':
    # Read the sample article; newline characters are removed outright.
    with open("news.txt", "r") as myfile:
        raw = myfile.read()
    text = raw.replace('\n', '')
    # Print the two top-ranked sentences, one per line.
    for sentence in summarize(text, 2):
        print(sentence)
運行結果:
這種方法只是單純地疊加句中關鍵詞的重要性,導致長句子佔有優勢。
下面使用 TextRank 算法完成新聞摘要提取。TextRank 對 PageRank 算法做了改進,使其可以計算每一個句子的重要性:
代碼如下:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import math
from itertools import product, count
from string import punctuation
from heapq import nlargest
# Tokens to ignore: English stop words plus every punctuation character.
# NOTE: this rebinds `stopwords`, shadowing the object imported from nltk.corpus.
stopwords = set(stopwords.words('english')).union(punctuation)
def calculate_similarity(sen1, sen2):
    """Return the word-overlap similarity of two tokenized sentences.

    The score is the number of words in sen1 that also occur in sen2,
    divided by log(len(sen1)) + log(len(sen2)).

    Args:
        sen1: Sequence of word tokens of the first sentence.
        sen2: Sequence of word tokens of the second sentence.

    Returns:
        The similarity as a float.  0.0 is returned when either sentence is
        empty or when both hold a single token, because the normalizer is
        then undefined or zero.
    """
    # log(0) is undefined, so an empty sentence is similar to nothing.
    if not sen1 or not sen2:
        return 0.0

    normalizer = math.log(len(sen1)) + math.log(len(sen2))
    # Two one-token sentences give log(1) + log(1) == 0; avoid dividing by zero.
    if normalizer == 0:
        return 0.0

    # Set membership keeps the overlap count linear in the sentence lengths.
    vocabulary = set(sen2)
    shared = sum(1 for word in sen1 if word in vocabulary)
    return shared / normalizer
def create_graph(word_sent):
    """Build the square sentence-similarity matrix.

    Entry [i][j] is calculate_similarity(word_sent[i], word_sent[j]) for
    i != j; every diagonal entry is 0.0.
    """
    size = len(word_sent)
    return [
        [
            0.0 if row == col
            else calculate_similarity(word_sent[row], word_sent[col])
            for col in range(size)
        ]
        for row in range(size)
    ]
def weighted_pagerank(weight_graph):
    """Iterate sentence scores until they stop changing.

    Every score starts at 0.5.  On each pass the scores are recomputed in
    place with calculate_score, so later sentences already see the values
    updated earlier in the same pass.  Iteration ends once different()
    reports no change against the previous pass.

    Returns the list of final scores, indexed like weight_graph.
    """
    size = len(weight_graph)
    scores = [0.5] * size
    previous = [0.0] * size
    while different(scores, previous):
        # Snapshot the scores before this pass overwrites them.
        previous = list(scores)
        for idx in range(size):
            scores[idx] = calculate_score(weight_graph, scores, idx)
    return scores
def different(scores, old_scores):
    """Return True if any score differs from its old value by 0.0001 or more."""
    return any(
        math.fabs(scores[idx] - old_scores[idx]) >= 0.0001
        for idx in range(len(scores))
    )
def calculate_score(weight_graph, scores, i, damping=0.85):
    """Return the updated TextRank score of sentence i.

    Computes
    (1 - damping) + damping * sum_j(w[j][i] * scores[j] / sum_k(w[j][k])).

    Args:
        weight_graph: Square matrix of edge weights between sentences.
        scores: Current score of every sentence, indexed like weight_graph.
        i: Index of the sentence whose score is computed.
        damping: Damping factor of the formula; defaults to 0.85.

    Returns:
        The new score of sentence i as a float.
    """
    length = len(weight_graph)
    incoming = 0.0
    for j in range(length):
        # Total weight leaving sentence j, used to normalize its vote.
        out_weight = sum(weight_graph[j][k] for k in range(length))
        # A sentence with no outgoing weight cannot be normalized; skip it
        # instead of raising ZeroDivisionError.
        if out_weight == 0:
            continue
        incoming += weight_graph[j][i] * scores[j] / out_weight
    return (1 - damping) + damping * incoming
def Summarize(text, n):
    """Return up to n sentences of text ranked highest by TextRank.

    The text is split into sentences, each sentence is lower-cased,
    tokenized and stripped of the tokens in ``stopwords``, a similarity
    graph is built over the sentences, and the scores produced by
    weighted_pagerank select the result.

    Args:
        text: The document to summarize.
        n: Maximum number of sentences to return.

    Returns:
        A list of original sentences, highest score first.  It holds fewer
        than n sentences when the text has fewer than n sentences.
    """
    sents = sent_tokenize(text)
    # Build filtered copies: removing items from a list while iterating
    # over it skips the element that follows each removal.
    word_sent = [
        [word for word in word_tokenize(sentence.lower())
         if word not in stopwords]
        for sentence in sents
    ]
    similarity_graph = create_graph(word_sent)
    scores = weighted_pagerank(similarity_graph)
    # Pair each score with its sentence index so the index survives the
    # selection; equal scores are ordered by the larger index first.
    sent_selected = nlargest(n, zip(scores, count()))
    # Iterate over what nlargest returned rather than range(n), which
    # would raise IndexError when fewer than n pairs exist.
    return [sents[index] for _, index in sent_selected]
if __name__ == '__main__':
    # Read the sample article; newline characters are removed outright.
    with open("news.txt", "r") as myfile:
        raw = myfile.read()
    text = raw.replace('\n', '')
    # Print the list holding the two top-ranked sentences.
    summary = Summarize(text, 2)
    print(summary)
生成結果如下:
識別出來的內容已經和之前的不同,雖然兩種方法好像都沒能把主題句找出來。