Project Overview
The project is divided into three parts:
- crawling news from 今日頭條 (Toutiao)
- running entity analysis on the body text of the crawled articles and visualizing the results
- using the Storm framework to write the crawled news data into MySQL
This article focuses on the entity analysis part; the overall structure of the project is shown below.
For the Toutiao crawling part, see: the crawling post.
For the Storm stream-processing part, see: the Storm stream-processing post.
Project download: Toutiao crawling + entity analysis + Storm stream processing.
Code Walkthrough
The entity analysis code is divided into four modules: event mining; semantic analysis and entity extraction with LTP; building association relations between keywords and entities with the TextRank graph algorithm; and finally visualizing those relations with vis.js.
The complete code is given below.
text_grapher.py
from sentence_parser import *
import re
from collections import Counter
from GraphShow import *
from keywords_textrank import *
'''Event mining'''
class CrimeMining:
def __init__(self):
self.textranker = TextRank()
self.parser = LtpParser()
self.ners = ['nh', 'ni', 'ns']
self.ner_dict = {
'nh':'人物',
'ni':'機構',
'ns':'地名'
}
self.graph_shower = GraphShow()
    '''Remove text inside parentheses to reduce noise'''
def remove_noisy(self, content):
        p1 = re.compile(r'（[^）]*）')   # full-width parentheses
        p2 = re.compile(r'\([^\)]*\)')  # half-width parentheses
        return p2.sub('', p1.sub('', content))
    '''Collect named entities'''
def collect_ners(self, words, postags):
ners = []
for index, pos in enumerate(postags):
if pos in self.ners:
ners.append(words[index] + '/' + pos)
return ners
    '''Split the article into sentences'''
def seg_content(self, content):
return [sentence for sentence in re.split(r'[??!!。;;::\n\r]', content) if sentence]
    '''Segment a sentence and tag parts of speech'''
def process_sent(self, sent):
words, postags = self.parser.basic_process(sent)
return words, postags
    '''Build co-occurrence relations between entities'''
def collect_coexist(self, ner_sents, ners):
co_list = []
for sent in ner_sents:
words = [i[0] + '/' + i[1] for i in zip(sent[0], sent[1])]
co_ners = set(ners).intersection(set(words))
co_info = self.combination(list(co_ners))
co_list += co_info
if not co_list:
return []
return {i[0]:i[1] for i in Counter(co_list).most_common()}
    '''Enumerate all ordered pairs from a list'''
def combination(self, a):
combines = []
if len(a) == 0:
return []
for i in a:
for j in a:
if i == j:
continue
combines.append('@'.join([i, j]))
return combines
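    # Example with hypothetical values: if a clause contains co_ners =
    # {'張三/nh', '北京/ns'}, combination() yields
    # ['張三/nh@北京/ns', '北京/ns@張三/nh'], and Counter(co_list).most_common()
    # in collect_coexist() turns repeated pairs into co-occurrence counts.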
    '''Extract event (subject-verb-object) triples'''
def extract_triples(self, words, postags):
svo = []
tuples, child_dict_list = self.parser.parser_main(words, postags)
for tuple in tuples:
rel = tuple[-1]
if rel in ['SBV']:
sub_wd = tuple[1]
verb_wd = tuple[3]
obj = self.complete_VOB(verb_wd, child_dict_list)
subj = sub_wd
verb = verb_wd
if not obj:
svo.append([subj, verb])
else:
svo.append([subj, verb+obj])
return svo
    '''Keep only the triples that involve a named entity'''
def filter_triples(self, triples, ners):
ner_triples = []
for ner in ners:
for triple in triples:
if ner in triple:
ner_triples.append(triple)
return ner_triples
    '''Find the VOB (object) attached to an SBV verb'''
def complete_VOB(self, verb, child_dict_list):
for child in child_dict_list:
wd = child[0]
attr = child[3]
if wd == verb:
if 'VOB' not in attr:
continue
vob = attr['VOB'][0]
obj = vob[1]
return obj
return ''
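    # Note: each entry of child_dict_list is [word, postag, index, child_dict],
    # where child_dict maps a dependency relation (e.g. 'VOB') to the arcs built
    # in sentence_parser.syntax_parser; an arc is
    # [index, word, postag, head_word, head_postag, head_index, relation],
    # so vob[1] above is the dependent (object) word.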
    '''Extract keywords from the article'''
def extract_keywords(self, words_list):
return self.textranker.extract_keywords(words_list, 10)
    '''Relate entities to the article keywords'''
def rel_entity_keyword(self, ners, keyword, subsent):
events = []
rels = []
sents = []
ners = [i.split('/')[0] for i in set(ners)]
keyword = [i[0] for i in keyword]
for sent in subsent:
tmp = []
for wd in sent:
if wd in ners + keyword:
tmp.append(wd)
if len(tmp) > 1:
sents.append(tmp)
for ner in ners:
for sent in sents:
if ner in sent:
tmp = ['->'.join([ner, wd]) for wd in sent if wd in keyword and wd != ner and len(wd) > 1]
if tmp:
rels += tmp
for e in set(rels):
events.append([e.split('->')[0], e.split('->')[1]])
return events
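    # Example with hypothetical values: given ners = ['張三/nh'], a keyword
    # '調查' and a clause ['張三', '調查', ...], this produces the relation
    # event ['張三', '調查'].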
    '''Split the article into short clauses on punctuation'''
def seg_short_content(self, content):
return [sentence for sentence in re.split(r'[,,??!!。;;::\n\r\t ]', content) if sentence]
    '''Main entry point for the mining pipeline'''
def main(self,content,title):
if not content:
return []
        # remove noise from the article
content = self.remove_noisy(content)
        # split into full sentences
sents = self.seg_content(content)
        # split into short clauses
subsents = self.seg_short_content(content)
subsents_seg = []
        # words_list stores the [word, postag] pairs for the whole article
words_list = []
        # ner_sents keeps the sentences that contain named entities
ner_sents = []
        # ners keeps the named entities
ners = []
        # triples keeps subject-verb-object phrases
triples = []
        # events stores the article's event pairs
events = []
for sent in subsents:
words, postags = self.process_sent(sent)
words_list += [[i[0], i[1]] for i in zip(words, postags)]
subsents_seg.append([i[0] for i in zip(words, postags)])
ner = self.collect_ners(words, postags)
if ner:
triple = self.extract_triples(words, postags)
if not triple:
continue
triples += triple
ners += ner
ner_sents.append([words, postags])
        # extract the article keywords and add them to the graph
keywords = [i[0] for i in self.extract_keywords(words_list)]
for keyword in keywords:
name = keyword
cate = '關鍵詞'
events.append([name, cate])
        # build events from the triples
for t in triples:
if (t[0] in keywords or t[1] in keywords) and len(t[0]) > 1 and len(t[1]) > 1:
events.append([t[0], t[1]])
        # add high-frequency words to the graph
word_dict = [i for i in Counter([i[0] for i in words_list if i[1][0] in ['n', 'v'] and len(i[0]) > 1]).most_common()][:10]
for wd in word_dict:
name = wd[0]
cate = '高頻詞'
events.append([name, cate])
        # add the document-level named entities to the graph
ner_dict = {i[0]:i[1] for i in Counter(ners).most_common()}
for ner in ner_dict:
name = ner.split('/')[0]
cate = self.ner_dict[ner.split('/')[1]]
events.append([name, cate])
        # collect entity co-occurrence information and build the co-occurrence network
co_dict = self.collect_coexist(ner_sents, list(ner_dict.keys()))
co_events = [[i.split('@')[0].split('/')[0], i.split('@')[1].split('/')[0]] for i in co_dict]
events += co_events
        # relate keywords to entities
events_entity_keyword = self.rel_entity_keyword(ners, keywords, subsents_seg)
events += events_entity_keyword
        # render the event network as a graph page
self.graph_shower.create_page(events,title)
handler = CrimeMining()
def Entity_extraction(text,title):
    print('----------------- entering entity analysis -----------------')
handler.main(text,title)
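A minimal usage sketch (the input text and title here are hypothetical; it assumes the ./ltp_data models and an ./html output directory are in place):

from text_grapher import Entity_extraction

text = '...'  # body text of a crawled news article
Entity_extraction(text, 'demo')  # writes the event graph to ./html/demo.html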
sentence_parser.py
import os
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer
class LtpParser():
def __init__(self):
LTP_DIR = "./ltp_data"
self.segmentor = Segmentor()
self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))
self.postagger = Postagger()
self.postagger.load(os.path.join(LTP_DIR, "pos.model"))
self.parser = Parser()
self.parser.load(os.path.join(LTP_DIR, "parser.model"))
self.recognizer = NamedEntityRecognizer()
self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))
    '''Basic LTP operations'''
def basic_parser(self, words):
postags = list(self.postagger.postag(words))
netags = self.recognizer.recognize(words, postags)
return postags, netags
    '''POS tagging with LTP'''
def get_postag(self, words):
return list(self.postagger.postag(words))
    '''Organize the entity lists from the NER results'''
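    # LTP NE tags follow the BIESO scheme, e.g. 'S-Nh' (a single-token person
    # name), 'B-Ni'/'I-Ni'/'E-Ni' (begin/inside/end of an organization name),
    # 'O' (not an entity); the branches below key off the position prefix
    # ntag[0] and the type suffix ntag[-2:].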
def format_entity(self, words, netags, postags):
name_entity_dist = {}
name_entity_list = []
place_entity_list = []
organization_entity_list = []
ntag_E_Nh = ""
ntag_E_Ni = ""
ntag_E_Ns = ""
index = 0
for item in zip(words, netags):
word = item[0]
ntag = item[1]
if ntag[0] != "O":
if ntag[0] == "S":
if ntag[-2:] == "Nh":
name_entity_list.append(word+'_%s ' % index)
elif ntag[-2:] == "Ni":
organization_entity_list.append(word+'_%s ' % index)
else:
place_entity_list.append(word + '_%s ' % index)
elif ntag[0] == "B":
if ntag[-2:] == "Nh":
ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index
elif ntag[-2:] == "Ni":
ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index
else:
ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index
elif ntag[0] == "I":
if ntag[-2:] == "Nh":
ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index
elif ntag[-2:] == "Ni":
ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index
else:
ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index
else:
if ntag[-2:] == "Nh":
ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index
name_entity_list.append(ntag_E_Nh)
ntag_E_Nh = ""
elif ntag[-2:] == "Ni":
ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index
organization_entity_list.append(ntag_E_Ni)
ntag_E_Ni = ""
else:
ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index
place_entity_list.append(ntag_E_Ns)
ntag_E_Ns = ""
index += 1
name_entity_dist['nhs'] = self.modify_entity(name_entity_list, words, postags, 'nh')
name_entity_dist['nis'] = self.modify_entity(organization_entity_list, words, postags, 'ni')
name_entity_dist['nss'] = self.modify_entity(place_entity_list,words, postags, 'ns')
return name_entity_dist
    '''Normalize entities, preparing for rebuild_wordspostags'''
def modify_entity(self, entity_list, words, postags, tag):
entity_modify = []
if entity_list:
for entity in entity_list:
entity_dict = {}
subs = entity.split(' ')[:-1]
start_index = subs[0].split('_')[1]
end_index = subs[-1].split('_')[1]
                entity_dict['start_index'] = start_index
entity_dict['end_index'] = end_index
if start_index == entity_dict['end_index']:
consist = [words[int(start_index)] + '/' + postags[int(start_index)]]
else:
consist = [words[index] + '/' + postags[index] for index in range(int(start_index), int(end_index)+1)]
entity_dict['consist'] = consist
entity_dict['name'] = ''.join(tmp.split('_')[0] for tmp in subs) + '/' + tag
entity_modify.append(entity_dict)
return entity_modify
    '''Rebuild words and postags based on the NER results'''
def rebuild_wordspostags(self, name_entity_dist, words, postags):
pre = ' '.join([item[0] + '/' + item[1] for item in zip(words, postags)])
post = pre
for et, infos in name_entity_dist.items():
if infos:
for info in infos:
post = post.replace(' '.join(info['consist']), info['name'])
post = [word for word in post.split(' ') if len(word.split('/')) == 2 and word.split('/')[0]]
words = [tmp.split('/')[0] for tmp in post]
postags = [tmp.split('/')[1] for tmp in post]
return words, postags
    '''Format the dependency-parse results'''
def syntax_parser(self, words, postags):
arcs = self.parser.parse(words, postags)
words = ['Root'] + words
postags = ['w'] + postags
tuples = list()
for index in range(len(words)-1):
arc_index = arcs[index].head
arc_relation = arcs[index].relation
tuples.append([index+1, words[index+1], postags[index+1], words[arc_index], postags[arc_index], arc_index, arc_relation])
return tuples
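    # Each tuple is [index, word, postag, head_word, head_postag, head_index,
    # relation], with index 0 reserved for the virtual Root node.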
    '''For each word in the sentence, maintain a dict of its dependency children'''
def build_parse_child_dict(self, words, postags, tuples):
child_dict_list = list()
for index, word in enumerate(words):
child_dict = dict()
for arc in tuples:
if arc[3] == word:
if arc[-1] in child_dict:
child_dict[arc[-1]].append(arc)
else:
child_dict[arc[-1]] = []
child_dict[arc[-1]].append(arc)
child_dict_list.append([word, postags[index], index, child_dict])
return child_dict_list
    '''Parser entry point'''
def parser_main(self, words, postags):
tuples = self.syntax_parser(words, postags)
child_dict_list = self.build_parse_child_dict(words, postags, tuples)
return tuples, child_dict_list
    '''Basic linguistic analysis'''
def basic_process(self, sentence):
words = list(self.segmentor.segment(sentence))
postags, netags = self.basic_parser(words)
name_entity_dist = self.format_entity(words, netags, postags)
words, postags = self.rebuild_wordspostags(name_entity_dist, words, postags)
return words, postags
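A rough sketch of how the parser can be exercised on its own (the sentence is a placeholder; actual tokens depend on the LTP models in ./ltp_data):

from sentence_parser import LtpParser

parser = LtpParser()
words, postags = parser.basic_process('...')  # any Chinese sentence
# multi-token entities come back merged into single tokens whose postag is
# rewritten to nh (person), ni (organization) or ns (place)
tuples, child_dict_list = parser.parser_main(words, postags)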
keywords_textrank.py
import sys
from collections import defaultdict
'''TextRank graph'''
class textrank_graph:
def __init__(self):
self.graph = defaultdict(list)
        self.d = 0.85         # damping factor, conventionally set to 0.85
        self.min_diff = 1e-5  # convergence threshold
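        # Update rule (TextRank, after PageRank):
        #   WS(Vi) = (1 - d) + d * sum_{Vj in In(Vi)} [ w_ji / sum_{Vk in Out(Vj)} w_jk ] * WS(Vj)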
    # add a weighted edge in both directions between two nodes
def addEdge(self, start, end, weight):
self.graph[start].append((start, end, weight))
self.graph[end].append((end, start, weight))
    # rank the nodes
def rank(self):
        # uniform initial weight
        weight_default = 1.0 / (len(self.graph) or 1.0)
        # nodeweight_dict stores each node's weight
nodeweight_dict = defaultdict(float)
        # outsum_node_dict stores the total outgoing edge weight of each node
outsum_node_dict = defaultdict(float)
        # initialize weights from the edges in the graph
for node, out_edge in self.graph.items():
            # e.g. for node '是', out_edge is [('是', '全國', 1), ('是', '調查', 1), ('是', '失業率', 1), ('是', '城鎮', 1)]
            nodeweight_dict[node] = weight_default
outsum_node_dict[node] = sum((edge[2] for edge in out_edge), 0.0)
        # iterate the nodes in a fixed order so the results are deterministic
        sorted_keys = sorted(self.graph.keys())
        # iterate up to a fixed number of steps, tracking the total weight per step
step_dict = [0]
for step in range(1, 1000):
for node in sorted_keys:
s = 0
                # each neighbor contributes (edge_weight / outsum_node_dict[neighbor]) * nodeweight_dict[neighbor]
for e in self.graph[node]:
s += e[2] / outsum_node_dict[e[1]] * nodeweight_dict[e[1]]
                # update: (1 - d) + d * s
nodeweight_dict[node] = (1 - self.d) + self.d * s
step_dict.append(sum(nodeweight_dict.values()))
if abs(step_dict[step] - step_dict[step - 1]) <= self.min_diff:
break
        # Min-max normalization: a linear transform that maps the raw weights into [0, 1].
        # Initialize min to the largest and max to the smallest representable float.
        (min_rank, max_rank) = (sys.float_info.max, sys.float_info.min)
for w in nodeweight_dict.values():
if w < min_rank:
min_rank = w
if w > max_rank:
max_rank = w
for n, w in nodeweight_dict.items():
nodeweight_dict[n] = (w - min_rank/10.0) / (max_rank - min_rank/10.0)
return nodeweight_dict
'''Keyword extraction based on the TextRank graph algorithm'''
class TextRank:
def __init__(self):
        self.candi_pos = ['n', 'v']  # candidate POS prefixes: nouns and verbs
        self.stop_pos = ['nt']       # POS tags excluded from edges
        self.span = 5                # co-occurrence window size
def extract_keywords(self, word_list, num_keywords):
g = textrank_graph()
cm = defaultdict(int)
for i, word in enumerate(word_list):
if word[1][0] in self.candi_pos and len(word[0]) > 1:
for j in range(i + 1, i + self.span):
if j >= len(word_list):
break
if word_list[j][1][0] not in self.candi_pos or word_list[j][1] in self.stop_pos or len(word_list[j][0]) < 2:
continue
                    pair = (word[0], word_list[j][0])
                    cm[pair] += 1
for terms, w in cm.items():
g.addEdge(terms[0], terms[1], w)
nodes_rank = g.rank()
nodes_rank = sorted(nodes_rank.items(), key=lambda asd:asd[1], reverse=True)
return nodes_rank[:num_keywords]
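A quick sketch of how the ranker is driven (the word list is made up; entries are [word, postag] pairs, the same shape text_grapher.py assembles):

from keywords_textrank import TextRank

word_list = [['城鎮', 'n'], ['調查', 'v'], ['失業率', 'n'],
             ['城鎮', 'n'], ['失業率', 'n'], ['調查', 'v']]
ranker = TextRank()
print(ranker.extract_keywords(word_list, 3))  # [(word, weight), ...], best first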
GraphShow.py
'''Build the visualization page'''
class GraphShow():
def __init__(self):
self.base = '''
<html>
<head>
<script type="text/javascript" src="VIS/dist/vis.js"></script>
<link href="VIS/dist/vis.css" rel="stylesheet" type="text/css">
    <meta http-equiv="Content-Type" content="text/html; charset=gbk">
</head>
<body>
<div id="VIS_draw"></div>
<script type="text/javascript">
var nodes = data_nodes;
var edges = data_edges;
var container = document.getElementById("VIS_draw");
var data = {
nodes: nodes,
edges: edges
};
var options = {
nodes: {
shape: 'circle',
size: 15,
font: {
size: 15
}
},
edges: {
font: {
size: 10,
align: 'center'
},
color: 'red',
arrows: {
to: {enabled: true, scaleFactor: 1.2}
},
smooth: {enabled: true}
},
physics: {
enabled: true
}
};
var network = new vis.Network(container, data, options);
</script>
</body>
</html>
'''
    '''Organize the events into nodes and edges'''
def create_page(self, events,title):
nodes = []
for event in events:
nodes.append(event[0])
nodes.append(event[1])
node_dict = {node: index for index, node in enumerate(nodes)}
data_nodes = []
data_edges = []
for node, id in node_dict.items():
data = {}
data["group"] = 'Event'
data["id"] = id
data["label"] = node
data_nodes.append(data)
for edge in events:
data = {}
data['from'] = node_dict.get(edge[0])
data['label'] = ''
data['to'] = node_dict.get(edge[1])
data_edges.append(data)
self.create_html(data_nodes, data_edges,title)
return
    '''Generate the HTML file'''
def create_html(self, data_nodes, data_edges,title):
        f = open('./html/' + title + '.html', 'w+', encoding='gbk', errors='ignore')
        html = self.base.replace('data_nodes', str(data_nodes)).replace('data_edges', str(data_edges))
        f.write(html)
f.close()
        print('----------------- entity analysis finished -----------------')
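For reference, the node and edge lists substituted into the template look like this (illustrative labels):

data_nodes = [{'group': 'Event', 'id': 0, 'label': '張三'},
              {'group': 'Event', 'id': 1, 'label': '人物'}]
data_edges = [{'from': 0, 'label': '', 'to': 1}]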
Results
Here is what the final output looks like:
Summary
This part is honestly the hardest of the three. Much of it reuses code that others have already written; my main goal was the visualization, so it is perfectly fine to treat the whole module as a black-box interface. That leaves the stream-processing part, which I will cover in the next article. Stay tuned if you are interested.