Note: based on an existing case tutorial.
We implement a relatively simple "keyword extraction" algorithm to gain an initial understanding of natural language processing.
Download the NLTK corpus data:
http://labfile.oss.aliyuncs.com/courses/741/nltk_data.tar.gz
Alternatively, download it from within Python:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
Download the sample news text used for testing:
http://labfile.oss.aliyuncs.com/courses/741/news.txt
nltk.tokenize is the tokenization toolkit provided by NLTK. Tokenization is simply the process of splitting a paragraph into sentences and splitting a sentence into individual words. The sent_tokenize() function we import splits a paragraph into sentences; word_tokenize() splits a sentence into words.
stopwords provides a list of the most frequent English words, such as am, is, and are.
defaultdict is a dictionary container with a default value.
punctuation is a string of English punctuation marks and symbols.
nlargest() quickly finds the n largest items in a container.
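A quick sketch of how these helpers behave; the example sentence and the expected values in the comments are made up for illustration and are not part of the tutorial:

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest

sample = "NLTK is a leading platform. It works with human language text."
print(sent_tokenize(sample))              # two sentences
print(word_tokenize(sample.lower()))      # ['nltk', 'is', 'a', 'leading', ...]
print('is' in stopwords.words('english')) # True
print(punctuation)                        # !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

counts = defaultdict(int)                 # missing keys start at 0
counts['nltk'] += 1
print(counts['nltk'], counts['other'])    # 1 0

print(nlargest(2, [3, 1, 4, 1, 5]))       # [5, 4]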
Import these packages:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest
The basic idea: the sentences that contain the most keywords are the most important ones. We sort the sentences by how many keywords they contain, take the top n, and combine them into our summary.
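As a toy illustration of this idea (the mini "corpus" and frequency values below are invented for illustration, not taken from news.txt):

from collections import defaultdict
from heapq import nlargest

# Hypothetical normalized keyword frequencies
freq = {'nltk': 0.8, 'summary': 0.5}

# Three toy sentences, already tokenized
sentences = [
    ['nltk', 'makes', 'tokenizing', 'easy'],    # contains 'nltk'            -> 0.8
    ['a', 'summary', 'built', 'with', 'nltk'],  # contains 'summary', 'nltk' -> 1.3
    ['nothing', 'relevant', 'here'],            # no keywords                -> 0.0
]

# Score each sentence by summing the frequencies of the keywords it contains
ranking = defaultdict(int)
for i, words in enumerate(sentences):
    for w in words:
        if w in freq:
            ranking[i] += freq[w]

# Indices of the two highest-scoring sentences: [1, 0]
print(nlargest(2, ranking, key=ranking.get))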
Full code:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest

# Define the constants we need
stopwords = set(stopwords.words('english') + list(punctuation))
max_cut = 0.9
min_cut = 0.1

def compute_frequencies(word_sent):
    freq = defaultdict(int)
    # Count how often each non-stop-word appears
    for s in word_sent:
        for word in s:
            if word not in stopwords:
                freq[word] += 1
    # Normalize by the highest count and drop words that are
    # too frequent (>= max_cut) or too rare (<= min_cut)
    m = float(max(freq.values()))
    for w in list(freq.keys()):
        freq[w] = freq[w] / m
        if freq[w] >= max_cut or freq[w] <= min_cut:
            # del removes the entry so the word is no longer treated as a keyword
            del freq[w]
    return freq

def summarize(text, n):
    # Split the text into sentences
    sents = sent_tokenize(text)
    # Make sure the text has at least n sentences (here n is 2)
    assert n <= len(sents)
    # Split each sentence into lower-cased words
    word_sent = [word_tokenize(s.lower()) for s in sents]
    # Compute word frequencies; freq[w] is the normalized frequency of w
    freq = compute_frequencies(word_sent)
    # A dictionary container with a default value of int (0)
    ranking = defaultdict(int)
    for i, word in enumerate(word_sent):
        for w in word:
            if w in freq:
                ranking[i] += freq[w]
    sents_idx = rank(ranking, n)
    return [sents[j] for j in sents_idx]

def rank(ranking, n):
    # Return the indices of the n highest-scoring sentences
    return nlargest(n, ranking, key=ranking.get)

if __name__ == '__main__':
    with open("news.txt", "r") as myfile:
        text = myfile.read().replace('\n', '')
    res = summarize(text, 2)
    for i in range(len(res)):
        print(res[i])
Running the script prints the two highest-scoring sentences from news.txt.
This method simply accumulates importance scores, so long sentences end up with an advantage.
Next we use the TextRank algorithm for news summary extraction. TextRank adapts the PageRank algorithm so that it can compute the importance of every sentence:
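To make the scoring in the code below easier to follow: the similarity between two sentences Si and Sj is the number of words they share divided by log(len(Si)) + log(len(Sj)), and each sentence score is updated iteratively with the weighted PageRank formula

WS(Si) = (1 - d) + d * sum over j of [ w(j, i) / sum over k of w(j, k) ] * WS(Sj)

where d = 0.85 is the damping factor and w(j, i) is the similarity between sentences j and i. Iteration stops once no score changes by 0.0001 or more.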
The code is as follows:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import math
from itertools import product, count
from string import punctuation
from heapq import nlargest

stopwords = set(stopwords.words('english') + list(punctuation))

def calculate_similarity(sen1, sen2):
    # Count the words the two sentences share and normalize by the
    # log of their lengths (the TextRank sentence similarity).
    counter = 0
    for word in sen1:
        if word in sen2:
            counter += 1
    return counter / (math.log(len(sen1)) + math.log(len(sen2)))

def create_graph(word_sent):
    # Build a num x num matrix of pairwise sentence similarities
    num = len(word_sent)
    board = [[0.0 for _ in range(num)] for _ in range(num)]
    for i, j in product(range(num), repeat=2):
        if i != j:
            board[i][j] = calculate_similarity(word_sent[i], word_sent[j])
    return board

def weighted_pagerank(weight_graph):
    # Iterate the weighted PageRank update until the scores converge
    scores = [0.5 for _ in range(len(weight_graph))]
    old_scores = [0.0 for _ in range(len(weight_graph))]
    while different(scores, old_scores):
        for i in range(len(weight_graph)):
            old_scores[i] = scores[i]
        for i in range(len(weight_graph)):
            scores[i] = calculate_score(weight_graph, scores, i)
    return scores

def different(scores, old_scores):
    # Convergence test: is any score still changing by 0.0001 or more?
    flag = False
    for i in range(len(scores)):
        if math.fabs(scores[i] - old_scores[i]) >= 0.0001:
            flag = True
            break
    return flag

def calculate_score(weight_graph, scores, i):
    # One weighted PageRank update for sentence i
    length = len(weight_graph)
    d = 0.85
    added_score = 0.0
    for j in range(length):
        fraction = weight_graph[j][i] * scores[j]
        denominator = 0.0
        for k in range(length):
            denominator += weight_graph[j][k]
        if denominator == 0:
            # Guard against a sentence that has no similarity to any other
            denominator = 1.0
        added_score += fraction / denominator
    weight_score = (1 - d) + d * added_score
    return weight_score

def Summarize(text, n):
    sents = sent_tokenize(text)
    word_sent = [word_tokenize(s.lower()) for s in sents]
    # Filter out stop words and punctuation; a list comprehension avoids the
    # skipped items that occur when removing from a list while iterating over it
    word_sent = [[word for word in s if word not in stopwords] for s in word_sent]
    similarity_graph = create_graph(word_sent)
    scores = weighted_pagerank(similarity_graph)
    # Pick the indices of the n highest-scoring sentences
    sent_selected = nlargest(n, zip(scores, count()))
    sent_index = []
    for i in range(n):
        sent_index.append(sent_selected[i][1])
    return [sents[i] for i in sent_index]

if __name__ == '__main__':
    with open("news.txt", "r") as myfile:
        text = myfile.read().replace('\n', '')
    print(Summarize(text, 2))
The generated result differs from the previous method's, although neither approach seems to have found the actual topic sentence.