Original project:
https://github.com/ztz818/Automatic-generation-of-text-summaries
Background reading:
Word2Vec theory: https://blog.csdn.net/Pit3369/article/details/96482304
Three methods for Chinese keyword extraction (TF-IDF, TextRank, word2vec):
https://blog.csdn.net/Pit3369/article/details/95594728
Calling summarize(text, n) returns the top-n sentences as the summary:

- tokens = cut_sentences(text): split the text into sentences on 。!?, returning them in tokens as [[sentence 1], [sentence 2], ...].
- Tokenize the sentences in tokens and remove stopwords, producing the list sents: [[tokens of sentence 1], [tokens of sentence 2], ...], with tokens separated by spaces.
- sents = filter_model(sents): walk the list and drop words that do not appear in the word2vec vocabulary.
- graph = create_graph(sents): take the sentence list and return the sentence-similarity graph board[n*n], where n is the number of sentences and element [i][j] is the similarity of sentences i and j.
  - Looping over i, j: board[i][j] = compute_similarity_by_avg(word_sent[i], word_sent[j]).
  - In compute_similarity_by_avg(sents_1, sents_2), the word vectors of each sentence are summed and divided by the sentence length to get an average sentence vector, which is used to compute the similarity of the two sentences; the result is returned to create_graph.
  - The similarity of the average word vectors of sentences i and j is stored in board[i][j] and returned in the similarity matrix graph[][].
- scores = weight_sentences_rank(graph): take the similarity matrix graph[n*n] and return the array of sentence scores, scores[i] being the score of sentence i, initialized to 0.5. A while loop iterates until each sentence's score is stable to within 0.0001:
  - scores[i] = calculate_score(weight_graph, scores, i) computes and returns the score of sentence i in the graph, looping over all sentences j:
    - numerator: fraction = weight_graph[j][i] * scores[j]
    - denominator: denominator += weight_graph[j][k], with k looping over the sentences connected to j
    - accumulation: added_score += fraction / denominator
    - following the PR-value computation of the PageRank algorithm, the final score is computed and returned: weighted_score = (1 - d) + d * added_score (see the formula below)
- sent_selected = nlargest(n, zip(scores, count())): take the n sentences with the highest scores in scores[] as the summary.
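For reference, this update is the sentence-level TextRank score, a weighted variant of PageRank with damping factor d = 0.85:

$$WS(V_i) = (1 - d) + d \sum_{V_j \in In(V_i)} \frac{w_{ji}}{\sum_{V_k \in Out(V_j)} w_{jk}} \, WS(V_j)$$

where w_{ji} is the similarity of sentences j and i from the graph, and In(V_i), Out(V_j) are the sentences linked to i and j (here: all other sentences).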
Similarity computation:
Sentence similarity is computed as follows:
To compare sentence A = ['word', 'you', 'me'] with sentence B = ['sentence', 'google', 'python'], take the word2vec vectors v1, v2, v3 of the three words in A and average them: Va(avg) = (v1 + v2 + v3) / 3. Do the same for B to get Vb(avg), then compute the cosine of the angle between the two vectors Va(avg) and Vb(avg); this cosine similarity is taken as the similarity of sentences A and B.
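A minimal self-contained sketch of this averaging-then-cosine scheme, using made-up 3-dimensional vectors in place of real word2vec embeddings (the vectors below are illustrative only, not from any trained model):

import numpy as np

# toy stand-ins for word2vec vectors; real embeddings have hundreds of dimensions
toy_model = {
    'word':     np.array([0.2, 0.1, 0.7]),
    'you':      np.array([0.5, 0.3, 0.1]),
    'me':       np.array([0.4, 0.4, 0.2]),
    'sentence': np.array([0.1, 0.2, 0.6]),
    'google':   np.array([0.6, 0.2, 0.3]),
    'python':   np.array([0.3, 0.5, 0.4]),
}

def avg_vector(words):
    # sum the word vectors and divide by sentence length, as described above
    return sum(toy_model[w] for w in words) / len(words)

va = avg_vector(['word', 'you', 'me'])
vb = avg_vector(['sentence', 'google', 'python'])

# cosine of the angle between the two average vectors
cos = np.dot(va, vb) / (np.linalg.norm(va) * np.linalg.norm(vb))
print(cos)  # taken as the similarity of sentences A and B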
import math
from heapq import nlargest
from itertools import count, product

import jieba
import numpy as np

# `model` is assumed throughout to be a word2vec model loaded elsewhere
# (see the usage sketch at the end)

def cut_sentences(sentence):
    puns = frozenset(u'。!?')  # sentence delimiters
    tmp = []
    for ch in sentence:
        tmp.append(ch)
        if ch in puns:
            yield ''.join(tmp)
            tmp = []
    yield ''.join(tmp)
# stopwords to filter out of sentences
def create_stopwords():
    # raw string avoids `\1` being read as an escape sequence in the path
    with open(r"G:\1graduate\news_stopwords.txt", 'r', encoding='utf-8') as f:
        stop_list = [line.strip() for line in f]
    return stop_list
def two_sentences_similarity(sents_1, sents_2):
    '''
    Compute the similarity of two sentences by word overlap
    (an alternative metric; not used by summarize below)
    :param sents_1:
    :param sents_2:
    :return:
    '''
    counter = 0
    for sent in sents_1:
        if sent in sents_2:
            counter += 1
    return counter / (math.log(len(sents_1) + len(sents_2)))
def create_graph(word_sent):
    """
    Take the list of tokenized sentences and return the graph of
    pairwise sentence similarities
    :param word_sent:
    :return:
    """
    num = len(word_sent)
    print("number of sentences:", num)
    board = [[0.0 for _ in range(num)] for _ in range(num)]  # num*num
    print("board size:", len(board), len(board[0]))
    """
    The averaged word vectors are the basis for the similarity of
    sentences i and j, stored into the matrix.
    product(A, B) returns tuples from the Cartesian product of A and B.
    """
    for i, j in product(range(num), repeat=2):
        if i != j:
            board[i][j] = compute_similarity_by_avg(word_sent[i], word_sent[j])
    return board
def cosine_similarity(vec1, vec2):
    '''
    Compute the cosine similarity of two vectors
    :param vec1:
    :param vec2:
    :return:
    (sum of element-wise products of X and Y) /
    ((square root of the sum of squares of X) * (square root of the sum of squares of Y))
    '''
    tx = np.array(vec1)
    ty = np.array(vec2)
    cos1 = np.sum(tx * ty)
    cos21 = np.sqrt(sum(tx ** 2))
    cos22 = np.sqrt(sum(ty ** 2))
    cosine_value = cos1 / float(cos21 * cos22)
    return cosine_value
def compute_similarity_by_avg(sents_1, sents_2):
    '''
    Compute sentence similarity from averaged word vectors
    :param sents_1:
    :param sents_2:
    :return:
    '''
    # print("computing sentence similarity:", sents_1, sents_2)
    if len(sents_1) == 0 or len(sents_2) == 0:
        return 0.0
    """
    The word vectors of a sentence are summed, then averaged, as the basis
    for the similarity computation. The initial if/else avoids entering the
    for loop with vec1 undefined. Out-of-vocabulary words are substituted
    with the vector of '中國'.
    """
    if sents_1[0] in model:
        vec1 = model[sents_1[0]]
    else:
        vec1 = model['中國']
    for word1 in sents_1[1:]:
        if word1 in model:
            vec1 = vec1 + model[word1]
        else:
            vec1 = vec1 + model['中國']
    if sents_2[0] in model:
        vec2 = model[sents_2[0]]
    else:
        vec2 = model['中國']
    for word2 in sents_2[1:]:
        if word2 in model:
            vec2 = vec2 + model[word2]
        else:
            vec2 = vec2 + model['中國']
    # divide each summed vector by its sentence length before comparing
    similarity = cosine_similarity(vec1 / len(sents_1), vec2 / len(sents_2))
    return similarity
def calculate_score(weight_graph, scores, i):
    """
    Compute the score of sentence i in the graph
    :param weight_graph:
    :param scores:
    :param i:
    :return:
    """
    length = len(weight_graph)
    d = 0.85
    added_score = 0.0
    for j in range(length):
        denominator = 0.0
        # numerator: edge weight from sentence j to sentence i, times j's score
        fraction = weight_graph[j][i] * scores[j]
        # denominator: total edge weight going out of sentence j
        for k in range(length):
            denominator += weight_graph[j][k]
        if denominator == 0:
            denominator = 1
        added_score += fraction / denominator
    # final score, following the PR-value computation of PageRank
    weighted_score = (1 - d) + d * added_score
    return weighted_score
def weight_sentences_rank(weight_graph):
    '''
    Take the similarity graph (matrix) and
    return the score of each sentence
    :param weight_graph:
    :return:
    '''
    # initial score is 0.5 for every sentence
    scores = [0.5 for _ in range(len(weight_graph))]  # one score per sentence
    old_scores = [0.0 for _ in range(len(weight_graph))]
    print("size of scores:", len(scores))
    # iterate until the scores stop changing between rounds
    while different(scores, old_scores):
        for i in range(len(weight_graph)):
            old_scores[i] = scores[i]
        for i in range(len(weight_graph)):
            # recompute the score of sentence i from the graph
            scores[i] = calculate_score(weight_graph, scores, i)
    return scores
def different(scores, old_scores):
    '''
    Check whether any score changed between iterations
    :param scores:
    :param old_scores:
    :return:
    '''
    flag = False
    for i in range(len(scores)):
        if math.fabs(scores[i] - old_scores[i]) >= 0.0001:
            flag = True
            break
    return flag
def filter_symbols(sents):
    stopwords = create_stopwords() + ['。', ' ', '.']
    _sents = []
    for sentence in sents:
        # build a filtered copy instead of calling remove() while iterating,
        # which would skip elements
        sentence = [word for word in sentence if word not in stopwords]
        if sentence:
            _sents.append(sentence)
    return _sents
def filter_model(sents):
    _sents = []
    for sentence in sents:
        # keep only words present in the word2vec vocabulary
        sentence = [word for word in sentence if word in model]
        if sentence:
            _sents.append(sentence)
        else:
            print("dropping sentence with no in-vocabulary words")
    return _sents
def summarize(text, n):
    # split into sentences on 。!?
    tokens = cut_sentences(text)
    sentences = []
    sents = []
    for sent in tokens:
        sentences.append(sent)
        sents.append([word for word in jieba.cut(sent) if word])
    # a list of lists: one inner list of tokens per sentence
    print(len(sents))
    # print(sents)
    # sents = filter_symbols(sents)
    # drop words missing from the model
    # (note: if filter_model drops a whole sentence, indices into
    # sents and sentences fall out of sync)
    sents = filter_model(sents)
    print("number of sentences after model filtering:", len(sents))
    graph = create_graph(sents)
    print("similarity graph built")
    print("graph size:", len(graph))  # number of sentences; element [i][j] is the similarity of sentences i and j
    scores = weight_sentences_rank(graph)
    # pair each score with its sentence index and take the top n
    sent_selected = nlargest(n, zip(scores, count()))
    sent_index = []
    print("sent_selected =", sent_selected)
    # (sentence score, sentence index) pairs
    for i in range(n):
        sent_index.append(sent_selected[i][1])
    return [sentences[i] for i in sent_index]
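A minimal usage sketch, assuming a pre-trained Chinese word2vec model loaded via gensim (the file names below are placeholders, not from the original project):

from gensim.models import KeyedVectors

# placeholder path to a word2vec-format embedding file
model = KeyedVectors.load_word2vec_format('zh_word2vec.bin', binary=True)

with open('news.txt', 'r', encoding='utf-8') as f:  # placeholder input file
    text = f.read()

# print the top-3 sentences as the summary
for sentence in summarize(text, 3):
    print(sentence)

The code above looks words up with `word in model` and `model[word]`, both of which gensim's KeyedVectors supports, so any word2vec-format embedding file should work here.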