Because of sensitive-word filtering and similar issues, every attempt to publish this post failed review, so it was uploaded as images instead. The related code is collected in the appendix at the end of the post.
Appendix:
t1.py:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/2/12 13:02
# @Author : ystraw
# @Site :
# @File : t1.py
# @Software: PyCharm Community Edition
# @function: collect data from the specified GitHub links
# after collecting the links, fetch the article content from each page according to its source site
import requests
import time
import datetime
from bs4 import BeautifulSoup
from openpyxl import Workbook
import random
from lxml import etree
from openpyxl import load_workbook
import getIpPool
proxies = getIpPool.getproxies()
MAX_num = 15   # upper bound of the proxy index to sample from
openFlag = 1   # 0 = disable the IP proxy, 1 = enable it
outTime = 10   # request timeout in seconds
# Write a file (overwrite, no append):
def writeFile(filename, file):
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(file)
        print(filename, 'written!')
# Write a file (append):
def writeFile_add(filename, file):
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(file)
        print(filename, 'written!')
# Read a file
def readFile(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        content = f.read()
        print(filename, 'read!')
        return content
# Write to Excel
def write_excel_xls(path, sheet_name, value, bHead):
    # number of data rows to write
    index = len(value)
    wb = Workbook()
    # activate the worksheet
    ws = wb.active
    # header row
    ws.append(bHead)
    for i in range(2, index + 2):
        for j in range(1, len(value[i-2]) + 1):
            ws.cell(row=i, column=j, value=value[i-2][j-1])
    # save
    wb.save(path)
    print(path + ' written successfully!')
# Collect the article links from the GitHub page
def getUrl(path, url):
ua_list = [
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
]
agent = random.choice(ua_list)
headers = {
"Connection": "keep-alive",
"Accept": "application/json, text/javascript, */*; q=0.01",
"Referer": "https://piaofang.maoyan.com/dashboard?movieId=1211270&date=2011-01-02",
"User-Agent": agent,
}
    print('Requesting URL:', url)
html = requests.get(url, headers=headers, verify=False).text
    # writeFile('data/top250.html', html)
    # xpath: extract the title column
    text = etree.HTML(html)
    trs = text.xpath('//div[@class="Box-body"]//tbody/tr/td[2]/text()')
    # bs4: extract the table rows
bs = BeautifulSoup(html, 'lxml')
div = bs.findAll('div', attrs={'class': 'Box-body'})[0]
# print(div)
trList = div.findAll('tr')
# print(len(trList))
cnt = 0
    # all collected rows
alldata = []
for tr in trList:
tds = tr.findAll('td')
if tds != []:
            # extract: date, title
            tempList = [tds[0].string, trs[cnt]]
            # extract the links in the [original URL, screenshot, translation, Archive] columns
for i in range(2, 6):
astring = ''
aList = tds[i].findAll('a')
for a in aList:
astring += a['href'] + ','
tempList.append(astring.strip(','))
print(tempList)
alldata.append(tempList)
cnt += 1
tableHead = ['日期', '標題', '原始URL', '截圖', '翻譯', 'Archive']
write_excel_xls(path, 'link', alldata, tableHead)
# Fetch a WeChat (mp.weixin.qq.com) article
def getdetailContent_1(url):
ua_list = [
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
]
agent = random.choice(ua_list)
headers = {
"Connection": "keep-alive",
"Accept": "application/json, text/javascript, */*; q=0.01",
"Referer": "https://piaofang.maoyan.com/dashboard?movieId=1211270&date=2011-01-02",
"User-Agent": agent,
}
    print('Requesting URL:', url)
global proxies
global openFlag
ip = proxies[random.randint(0, MAX_num if len(proxies) > MAX_num else len(proxies)-1)]
if openFlag == 1:
html = requests.get(url, timeout=outTime, headers=headers, proxies={ip[0]: ip[1]}, verify=False).text
# print(ip)
else:
html = requests.get(url, timeout=outTime, headers = headers, verify=False).text
# print(html)
text = etree.HTML(html)
context = text.xpath('string(//div[@class="rich_media_content "])').replace(' ', '').replace('\n', '')
# print(context.replace(' ', '').replace('\n', ''))
return context
# Fetch a Caixin (caixin.com) article
def getdetailContent_2(url):
ua_list = [
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
]
agent = random.choice(ua_list)
headers = {
"Connection": "keep-alive",
"Accept": "application/json, text/javascript, */*; q=0.01",
"Referer": "https://piaofang.maoyan.com/dashboard?movieId=1211270&date=2011-01-02",
"User-Agent": agent,
}
    print('Requesting URL:', url)
global proxies
global openFlag
ip = proxies[random.randint(0, MAX_num if len(proxies) > MAX_num else len(proxies)-1)]
if openFlag == 1:
html = requests.get(url, timeout=outTime, headers=headers, proxies={ip[0]: ip[1]}, verify=False).text
# print(ip)
else:
html = requests.get(url, timeout=outTime, headers = headers, verify=False).text
# print(html)
text = etree.HTML(html)
context = text.xpath('string(//div[@id="Main_Content_Val"])')
# print(context.replace(' ', '').replace('\n', ''))
# print('===============')
return context
# Fetch an Economic Observer (eeo.com.cn) article
def getdetailContent_3(url):
ua_list = [
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
]
agent = random.choice(ua_list)
headers = {
"Connection": "keep-alive",
"Accept": "application/json, text/javascript, */*; q=0.01",
"Referer": "https://piaofang.maoyan.com/dashboard?movieId=1211270&date=2011-01-02",
"User-Agent": agent,
}
    print('Requesting URL:', url)
    # re-encode the response below to work around garbled text on this site
global proxies
global openFlag
ip = proxies[random.randint(0, MAX_num if len(proxies) > MAX_num else len(proxies)-1)]
if openFlag == 1:
html = requests.get(url, timeout=outTime, headers=headers, proxies={ip[0]: ip[1]}, verify=False).text.encode('iso-8859-1')
# print(ip)
else:
html = requests.get(url, timeout=outTime, headers = headers, verify=False).text.encode('iso-8859-1')
# print(html)
text = etree.HTML(html)
context = text.xpath('string(//div[@class="xx_boxsing"])')
# print(context.replace(' ', '').replace('\n', ''))
# print('===============')
return context
# Fetch an article from Fang Fang's blog (fangfang.blog.caixin.com)
def getdetailContent_4(url):
ua_list = [
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
]
agent = random.choice(ua_list)
headers = {
"Connection": "keep-alive",
"Accept": "application/json, text/javascript, */*; q=0.01",
"Referer": "https://piaofang.maoyan.com/dashboard?movieId=1211270&date=2011-01-02",
"User-Agent": agent,
}
    print('Requesting URL:', url)
global proxies
global openFlag
ip = proxies[random.randint(0, MAX_num if len(proxies) > MAX_num else len(proxies)-1)]
if openFlag == 1:
html = requests.get(url, timeout=outTime, headers=headers, proxies={ip[0]: ip[1]}, verify=False).text
# print(ip)
else:
html = requests.get(url, timeout=outTime, headers = headers, verify=False).text
# print(html)
text = etree.HTML(html)
context = text.xpath('string(//div[@class="blog_content"])')
# print(context.replace(' ', '').replace('\n', ''))
# print('===============')
return context
# Fetch a China Business Journal (cb.com.cn) feature article
def getdetailContent_5(url):
ua_list = [
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
]
agent = random.choice(ua_list)
headers = {
"Connection": "keep-alive",
"Accept": "application/json, text/javascript, */*; q=0.01",
"Referer": "https://piaofang.maoyan.com/dashboard?movieId=1211270&date=2011-01-02",
"User-Agent": agent,
}
    print('Requesting URL:', url)
global proxies
global openFlag
ip = proxies[random.randint(0, MAX_num if len(proxies) > MAX_num else len(proxies)-1)]
if openFlag == 1:
html = requests.get(url, timeout=outTime, headers=headers, proxies={ip[0]: ip[1]}, verify=False).text
# print(ip)
else:
html = requests.get(url, timeout=outTime, headers = headers, verify=False).text
# print(html)
text = etree.HTML(html)
context = text.xpath('string(//div[@class="contentleft auto"])')
# print(context.replace(' ', '').replace('\n', ''))
# print('===============')
return context
# Fetch a Jiemian News (jiemian.com) article
def getdetailContent_6(url):
ua_list = [
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
]
agent = random.choice(ua_list)
headers = {
"Connection": "keep-alive",
"Accept": "application/json, text/javascript, */*; q=0.01",
"Referer": "https://piaofang.maoyan.com/dashboard?movieId=1211270&date=2011-01-02",
"User-Agent": agent,
}
    print('Requesting URL:', url)
global proxies
global openFlag
ip = proxies[random.randint(0, MAX_num if len(proxies) > MAX_num else len(proxies)-1)]
if openFlag == 1:
html = requests.get(url, timeout=outTime, headers=headers, proxies={ip[0]: ip[1]}, verify=False).text
# print(ip)
else:
html = requests.get(url, timeout=outTime, headers = headers, verify=False).text
# print(html)
text = etree.HTML(html)
context = text.xpath('string(//div[@class="article-content"])')
# print(context.replace(' ', '').replace('\n', ''))
# print('===============')
return context
# Read the link list from Excel and fetch the article content for each link
def getContent(path, savePath):
    # read the workbook
    wb = load_workbook(path)
    sheet_names = wb.sheetnames
    table = wb[sheet_names[0]]  # index 0 is the first sheet
    nrows = table.max_row     # number of rows
    ncols = table.max_column  # number of columns
    print(nrows, ncols)
cnt = 0
alldata = []
for i in range(2, nrows+1):
templist = []
for j in range(1, ncols+1):
# print(table.cell(i, j).value)
templist.append(table.cell(i, j).value)
        # detail-page link:
url = table.cell(i, 3).value.split(',')[0]
        try:
            if url[:24] == 'https://mp.weixin.qq.com':
                # WeChat official-account article
                content = getdetailContent_1(url)
                templist.append('微信公共號')
                templist.append(content)
            elif url[:24] == 'http://china.caixin.com/' or url[:22] == 'http://www.caixin.com/' or url[:25] == 'http://weekly.caixin.com/':
                # Caixin article
                content = getdetailContent_2(url)
                templist.append('財新網')
                templist.append(content)
            elif url[:22] == 'http://www.eeo.com.cn/':
                # The Economic Observer
                content = getdetailContent_3(url)
                templist.append('經濟觀察網')
                templist.append(content)
            elif url[:32] == 'http://fangfang.blog.caixin.com/':
                # Fang Fang's blog
                content = getdetailContent_4(url)
                templist.append('方方博客')
                templist.append(content)
            elif url[:21] == 'http://www.cb.com.cn/':
                # China Business Journal feature
                content = getdetailContent_5(url)
                templist.append('中國經營網')
                templist.append(content)
            elif url[:24] == 'https://www.jiemian.com/':
                # Jiemian News
                content = getdetailContent_6(url)
                templist.append('界面網')
                templist.append(content)
            else:
                # unrecognised source: count it and keep the row without article content
                cnt += 1
            alldata.append(templist)
        except Exception as ex:
            print('Exception:', ex)
        # time.sleep(random.randint(0, 2))
    print('Rows from unrecognised sources:', cnt)
tableHead = ['日期', '標題', '原始URL', '截圖', '翻譯', 'Archive','文章來源', '文章內容']
write_excel_xls(savePath, 'link', alldata, tableHead)
if __name__ == '__main__':
    '''
    Step 1: collect the links
    '''
    # data source
    # url = 'https://github.com/2019ncovmemory/nCovMemory#%E7%AC%AC%E4%B8%80%E8%B4%A2%E7%BB%8Fyimagazine'
    # # output file path:
    # path = './data/all_text_2.xlsx'
    # getUrl(path, url)
    '''
    Step 2: fetch the article content for every link
    '''
    # url = 'https://web.archive.org/web/20200204084331/http://www.caixin.com/2020-02-04/101511377.html'
    # path of the link file produced in step 1:
    path = './data/all_text_link_2.xlsx'
    # output path:
    savePath = './data/text_0.xlsx'
    getContent(path, savePath)
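Note: t1.py imports a local getIpPool module that is not included in this post. Judging only from how proxies is used above (each entry is indexed as ip[0] for the protocol and ip[1] for the proxy address), a minimal stand-in could look like the sketch below; the interface and the placeholder addresses are assumptions, not the original module.

# getIpPool.py -- assumed minimal stand-in for the proxy-pool module imported by t1.py
def getproxies():
    # each entry is [protocol, proxy address], matching proxies={ip[0]: ip[1]} in t1.py;
    # the addresses below are placeholders, replace them with proxies from your own pool
    return [
        ['http', 'http://127.0.0.1:8888'],
        ['https', 'https://127.0.0.1:8888'],
    ]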
t2.py:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/2/13 13:46
# @Author : ystraw
# @Site :
# @File : t2.py
# @Software: PyCharm Community Edition
# @function: post-process the alltext.xlsx file produced by t1
# 1. drop empty rows
import numpy
import pandas as pd
import jieba
# Read a file
def readFile(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        content = f.read()
        print(filename, 'read!')
        return content
# Drop rows whose article content is empty
def dealNull(path, savepath):
    data = pd.read_excel(path, sheet_name=0)
    df = pd.DataFrame(data)
    # print(data.head())   # first five rows by default
    # print(data.tail())   # last five rows by default
    print(data.shape)      # data size
    print(data.columns)    # column index
    # basic information about the table (dimensions, column names, dtypes, memory usage, ...)
    print(data.info())
    # dtype of each column
    print('dtypes:\n', df.dtypes)
    # flag rows whose article content ('文章內容') is not null
    data_notnull = data['文章內容'].notnull()
    # keep only the non-empty rows
    data_new = data[data_notnull]
    print('shape after dropping empty rows:\n', data_new.shape)
    # save
    data_new.to_excel(savepath, index=False, header=True)
# Tokenize the text and count word frequencies
def fenci(content):
    # read the stop-word file:
    sword = readFile('./data/stopword.txt')
    # build the stop-word list:
    sword = sword.split('\n')
    worddict = {}
    wordlist = []
    for w in jieba.cut(content, cut_all=False):  # cut_all=False is precise mode, True is full mode
        # keep tokens of length >= 2 that are not stop words or whitespace
        if (w not in sword) and w is not None and w != '\n' and len(w) >= 2:
            wordlist.append(w)
            worddict[w] = worddict.get(w, 0) + 1
    return [worddict, wordlist]
# Data pre-processing: tokenize every article
def preDeal(path, savepath):
    # read the data
    data = pd.read_excel(path, sheet_name=0)
    df = pd.DataFrame(data)
    # add a column for the tokenized text
    df['文章內容分詞'] = None
    for i in range(df.shape[0]):
        # tokenize
        rt = fenci(df['文章內容'].at[i])
        df['文章內容分詞'].at[i] = ' '.join(rt[1])
    # save
    df.to_excel(savepath, index=False, header=True)
if __name__ == '__main__':
    '''
    Data cleaning
    '''
    # # drop empty rows
    # path = './data/text_0.xlsx'
    # savepath = './data/text_1.xlsx'
    # dealNull(path, savepath)
    '''
    Data pre-processing
    '''
    path = './data/text_1.xlsx'
    savepath = './data/text_2.xlsx'
preDeal(path, savepath)
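A quick way to see what fenci() produces: the snippet below mirrors its core logic on a made-up sentence, without the stop-word file, so it can be run on its own (the sentence and the printed results are illustrative only).

# standalone illustration of the tokenize-and-count step (no stop-word filtering)
import jieba

sample = '今天天氣很好,我們一起去公園散步'   # made-up example sentence
words = [w for w in jieba.cut(sample, cut_all=False) if len(w) >= 2]
freq = {}
for w in words:
    freq[w] = freq.get(w, 0) + 1
print(words)   # tokens of length >= 2
print(freq)    # their frequencies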
t3.py:
# Import Geo; note that the 1.x import path differs from the 0.x one
# upgrade with: pip install --upgrade pyecharts
from pyecharts.charts import Geo
# import the configuration options
from pyecharts import options as opts
# ChartType: chart type, SymbolType: marker type
from pyecharts.globals import ChartType, SymbolType
# Read a file
def readFile(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        content = f.read()
        print(filename, 'read!')
        return content
geo = Geo()
# register coordinate points (name, longitude, latitude)
# read the city coordinate data:
zb_city = readFile('./data/1-5LineCity_2.txt')
# geo.add_coordinate(name="China", longitude=104.195, latitude=35.675)
cityList = zb_city.split('\n')
for cy in cityList:
    if cy is None or cy == '':
        continue
    temp = cy.split(',')
    geo.add_coordinate(name=temp[0], longitude=temp[2], latitude=temp[1])
# map type; use "world" for a world map
geo.add_schema(maptype="china")
# node weights (skip the header row):
cityList = readFile('./data/city_node.csv').split('\n')
data = []
for i in range(len(cityList)):
    city = cityList[i]
    if i == 0 or city == '' or city is None:
        continue
    data.append((city.split(' ')[0], int(city.split(' ')[2])))
# print(data)
# flow directions (edges), skipping the header row:
cityList = readFile('./data/city_edge.csv').split('\n')
data2 = []
for i in range(len(cityList)):
    city = cityList[i]
    if i == 0 or city == '' or city is None:
        continue
    # hide pairs that co-occur fewer than 200 times:
    if int(city.split(' ')[2]) < 200:
        continue
    data2.append((city.split(' ')[0], city.split(' ')[1]))
# print(data2)
# add the scatter points
# geo.add("", [("北京", 10), ("上海", 20), ("廣州", 30), ("成都", 40), ("哈爾濱", 50)], type_=ChartType.EFFECT_SCATTER)
geo.add("", data, type_=ChartType.EFFECT_SCATTER)
# add the flow lines: type_ is LINES and the ripple symbol is an arrow; available symbol types are
# 'circle', 'rect', 'roundRect', 'triangle', 'diamond', 'pin', 'arrow', 'none'
geo.add("geo-lines",
        data2,
        type_=ChartType.LINES,
        effect_opts=opts.EffectOpts(symbol=SymbolType.ARROW, symbol_size=10, color="yellow"),
        linestyle_opts=opts.LineStyleOpts(curve=0.2),
        is_large=True)
# show labels on the data points (set is_show=False to hide them)
geo.set_series_opts(label_opts=opts.LabelOpts(is_show=True))
# chart title; visualmap_opts=opts.VisualMapOpts() adds the visual-map control in the lower left
geo.set_global_opts(visualmap_opts=opts.VisualMapOpts(), title_opts=opts.TitleOpts(title="城市動態流向圖"))
# render inline in a notebook
geo.render_notebook()
# render to an html file (a target path can be passed in)
geo.render("城市動態流向圖.html")
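For reference, the three input files read above have the following layouts, inferred from the parsing code; the city rows shown are made-up examples. 1-5LineCity_2.txt is comma-separated as name,latitude,longitude, while city_node.csv and city_edge.csv are the space-separated files written by dataAnalysis.py further down.

# ./data/1-5LineCity_2.txt   (name,latitude,longitude)
北京市,39.904,116.407
上海市,31.230,121.473

# ./data/city_node.csv   (Id Label Weight)
Id Label Weight
北京 北京 1200

# ./data/city_edge.csv   (Source Target Weight)
Source Target Weight
北京 上海 350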
dataAnalysis.py:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/2/17 18:42
# @Author : ystraw
# @Site :
# @File : dataAnalysis.py
# @Software: PyCharm Community Edition
# @function: data analysis
import folium
import codecs
from folium.plugins import HeatMap
from pyecharts.charts import Geo
from pyecharts.charts import Map
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Read a file
def readFile(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        content = f.read()
        print(filename, 'read!')
        return content
# Descriptive analysis
def ms_analysis(filepath):
    # use the KaiTi font so Chinese characters render correctly
    plt.rcParams['font.sans-serif'] = ['KaiTi']
    plt.rcParams['font.serif'] = ['KaiTi']
    # read the data
    data = pd.read_excel(filepath)
    '''
    Article volume by source and publication date
    '''
    # plot the daily number of articles, split by source:
    # data.groupby('日期')['文章來源'].value_counts().unstack().fillna(value=0).plot(kind='bar', title='文章發佈量分日統計')
    # plt.show()
    # return
    '''
    City-mention analysis
    '''
    # read the city data and build a city dictionary:
    city = readFile('./data/1-5LineCity_2.txt')
    cityList = city.split('\n')
    # print(cityList)
    # initialise the city frequency table:
    cityDict = {}
    for cy in cityList:
        if cy is None or cy == '':
            continue
        temp = cy.split(',')
        cityDict[temp[0][:-1]] = 0   # strip the trailing '市' from the city name
    # print(cityDict)
    print(data.shape[0], data.shape[1])
    # count how often each city is mentioned
    for i in range(data.shape[0]):
        wordList = data['文章內容分詞'].at[i].split(' ')
        for word in wordList:
            try:
                cityDict[word] += 1
            except KeyError:
                pass
    print(cityDict)
    # draw the heat map:
    # fill in the data as [latitude, longitude, weight]
    data = []
    for cy in cityList:
        if cy is None or cy == '':
            continue
        temp = cy.split(',')
        data.append([float(temp[1]), float(temp[2]), cityDict[temp[0][:-1]]])
    # data = [[31.235929, 121.480539, 1500]]  # example
    print(data)
    map_osm = folium.Map([33., 113.], zoom_start=12)   # create the base map, initial zoom level 12
    HeatMap(data).add_to(map_osm)                      # add the heat layer to the map
    map_osm.save('./image/文章提及城市分佈.html')       # save the map as an html file
# Build the city co-occurrence files (nodes and edges)
def city_gx_analysis(filepath):
    citys = {}          # city -> mention count
    relationships = {}  # city -> {co-occurring city -> count}
    lineCitys = []      # cities mentioned in each article
    # build the city set:
    cityList = readFile('./data/1-5LineCity.txt').split('\n')
    citySet = set()
    for city in cityList:
        citySet.add(city.replace('市', ''))   # strip the '市' suffix
    # read the tokenized data
    data = pd.read_excel(filepath)
    # fill the co-occurrence counts
for i in range(data.shape[0]):
wordList = data['文章內容分詞'].at[i].split(' ')
lineCitys.append([])
for word in wordList:
if word not in citySet:
continue
lineCitys[-1].append(word)
            if citys.get(word) is None:
                citys[word] = 0
                relationships[word] = {}
            # increment the mention count
            citys[word] += 1
    # explore relationships
    for line in lineCitys:          # for each article
        for city1 in line:
            for city2 in line:      # every pair of cities in the article
                if city1 == city2:
                    continue
                if relationships[city1].get(city2) is None:   # first time this pair co-occurs
                    relationships[city1][city2] = 1
                else:
                    relationships[city1][city2] = relationships[city1][city2] + 1   # co-occurrence count + 1
# output
with codecs.open("./data/city_node.csv", "w", "utf-8") as f:
f.write("Id Label Weight\r\n")
for city, times in citys.items():
f.write(city + " " + city + " " + str(times) + "\r\n")
with codecs.open("./data/city_edge.csv", "w", "utf-8") as f:
f.write("Source Target Weight\r\n")
for city, edges in relationships.items():
for v, w in edges.items():
if w > 3:
f.write(city + " " + v + " " + str(w) + "\r\n")
if __name__ == '__main__':
    filepath = './data/text_2.xlsx'
    # descriptive analysis
    # ms_analysis(filepath)
    # analyse co-occurrence relationships between cities
    city_gx_analysis(filepath)
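A minimal sketch of the same pair-counting logic on two made-up token lists, to show what ends up in city_node.csv and city_edge.csv (the city names and counts are illustrative only):

# toy version of the mention/co-occurrence counting above
from collections import Counter
from itertools import permutations

articles = [['北京', '上海'], ['北京', '上海', '武漢']]   # two toy "articles", reduced to city names

node_weight = Counter()   # city -> number of mentions (Weight column of city_node.csv)
edge_weight = Counter()   # (city1, city2) -> co-occurrence count (Weight column of city_edge.csv)
for line in articles:
    node_weight.update(line)
    for a, b in permutations(line, 2):   # ordered pairs, matching the nested loops above
        edge_weight[(a, b)] += 1

print(node_weight)   # Counter({'北京': 2, '上海': 2, '武漢': 1})
print(edge_weight)   # e.g. ('北京', '上海'): 2, ('上海', '武漢'): 1, ...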
TF-IDF.py:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/2/23 22:20
# @Author : ystraw
# @Site :
# @File : TF-IDF.py
# @Software: PyCharm Community Edition
# @function: extract keywords from the article text
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from openpyxl import Workbook
# Write to Excel
def write_excel_xls(path, sheet_name, value, bHead):
    # number of data rows to write
    index = len(value)
    wb = Workbook()
    # activate the worksheet
    ws = wb.active
    # header row
    ws.append(bHead)
    for i in range(2, index + 2):
        for j in range(1, len(value[i-2]) + 1):
            ws.cell(row=i, column=j, value=value[i-2][j-1])
    # save
    wb.save(path)
    print(path + ' written successfully!')
def TQ():
    # read the data
    filepath = './data/text_2.xlsx'
    data = pd.read_excel(filepath)
    document = list(data['文章內容分詞'])
    # min_df: terms whose document frequency falls strictly below this threshold are dropped from the
    # vocabulary. A float is read as a proportion of documents, an int as an absolute count; the
    # parameter is ignored if a vocabulary is passed explicitly.
    tfidf_model = TfidfVectorizer(min_df=0.023).fit(document)
    # all distinct terms kept in the vocabulary
    # (in newer scikit-learn versions use get_feature_names_out() instead)
    feature = tfidf_model.get_feature_names()
    # print(feature)
    # e.g. ['一切', '一條', '便是', '全宇宙', '天狗', '日來', '星球']
    # mapping from each feature to its column id (the index in the list above)
    # print(tfidf_model.vocabulary_)
    # e.g. {'一條': 1, '天狗': 4, '日來': 5, '一切': 0, '星球': 6, '全宇宙': 3, '便是': 2}
    # tf-idf values of the kept features for every document:
    sparse_result = tfidf_model.transform(document)
    # print(sparse_result)
    # dense form: one row per document, one column per feature; each cell is that document's
    # tf-idf weight for the feature (0 if the feature does not occur in it)
weight = sparse_result.toarray()
    # build a term -> maximum tf-idf dictionary:
    feature_TFIDF = {}
    for i in range(len(weight)):
        for j in range(len(feature)):
            if feature[j] not in feature_TFIDF:
                feature_TFIDF[feature[j]] = weight[i][j]
            else:
                feature_TFIDF[feature[j]] = max(feature_TFIDF[feature[j]], weight[i][j])
    # sort by value:
    print('Top terms by TF-IDF:')
    alldata = []
    featureList = sorted(feature_TFIDF.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    # keep at most the top 600 terms, starting from the highest-scoring one
    for i in range(0, 600 if len(featureList) > 600 else len(featureList)):
        print(featureList[i][0], featureList[i][1])
        alldata.append([featureList[i][0], featureList[i][1]])
    # write to file:
    tableHead = ['關鍵詞', 'TF-IDF']
    import datetime
    filetime = str(datetime.datetime.now()).replace('-', '').replace(' ', '_').replace(':', '_')[:17]
    write_excel_xls('./data/關鍵詞_' + filetime + '.xlsx', 'link', alldata, tableHead)
def drawWordCloud():
    from wordcloud import WordCloud
    # scipy.misc.imread has been removed from SciPy; imageio.imread is a drop-in replacement here
    from imageio import imread
    # read the data
    filepath = './data/text_2.xlsx'
    data = pd.read_excel(filepath)
    document = list(data['文章內容分詞'])
    # join the documents into one space-separated text:
    # words = '一切 一條 便是 全宇宙 天狗 日來 星球'  # example
    words = ' '.join(document)
    # mask image:
    b_mask = imread('./image/ciyun.webp')
    # draw the word cloud:
    wc = WordCloud(
        background_color="white",         # background colour
        max_words=2000,                   # maximum number of words shown
        font_path="./image/simkai.ttf",   # font file (needed for Chinese)
        # min_font_size=5,
        # max_font_size=80,
        # width=400,                      # canvas width
        mask=b_mask
    )
    wc.generate(words)
    # write the rendered word cloud to an image file
    wc.to_file("./image/beijing_2.jpg")
if __name__ == '__main__':
    '''
    Extract keywords
    '''
    # TQ()
    '''
    Draw the word-cloud image
    '''
    drawWordCloud()
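For readers unfamiliar with TfidfVectorizer on pre-segmented Chinese text, here is a small self-contained sketch of the same steps; the three sample "documents" are made up.

# minimal TfidfVectorizer example on space-separated, pre-tokenized text
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['天狗 星球 全宇宙', '天狗 日來', '星球 日來 一切']   # three tiny tokenized documents

# min_df=1 keeps every term; TQ() above uses min_df=0.023 to drop very rare terms
model = TfidfVectorizer(min_df=1).fit(docs)
print(model.vocabulary_)              # term -> column index
weights = model.transform(docs).toarray()
print(weights.shape)                  # (3 documents, number of kept terms)
print(weights[0])                     # tf-idf weights of the first document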
LDA_主題模型.py:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/2/26 14:10
# @Author : ystraw
# @Site :
# @File : LDA_主題模型.py
# @Software: PyCharm Community Edition
# @function:
import pandas as pd
import numpy as np
def LDA():
    # read the data
    filepath = './data/text_2.xlsx'
    data = pd.read_excel(filepath)
    document = list(data['文章內容分詞'])
    # build the term-frequency vectors:
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation
    corpus = document
    cntVector = CountVectorizer()
    cntTf = cntVector.fit_transform(corpus)
    # the selected vocabulary (use get_feature_names_out() on newer scikit-learn versions)
    vocs = cntVector.get_feature_names()
    print('vocabulary size:', len(vocs))
    # print(vocs)
    # cntTf holds the term-frequency vector of every document
    # print(cntTf)
    # LDA topic model
    lda = LatentDirichletAllocation(n_components=4,      # number of topics
                                    max_iter=5,          # maximum number of EM iterations
                                    learning_method='online',
                                    learning_offset=20., # only meaningful for the online method (must be > 1); down-weights early training batches
                                    random_state=0)
    docres = lda.fit_transform(cntTf)
    # per-document topic distribution
    LDA_corpus = np.array(docres)
    print('document-topic probabilities:\n', LDA_corpus)
    # per-topic word weights: lda.components_ has one row per topic
    # print('topic-word matrix:\n', lda.components_)
    # the column with the largest probability gives each document's topic:
    arr = pd.DataFrame(LDA_corpus)
    data['主題類別'] = np.argmax(LDA_corpus, axis=1)   # index of the row maximum
    data['主題出現概率'] = arr.max(axis=1)             # value of the row maximum
    print('assigned topics:\n', data.head())
    data.to_excel('./data/LDA_主題分佈_類別.xlsx', index=False)
    # return
    # print the word weights of each topic
    tt_matrix = lda.components_
    # topic id
    topic_id = 0
    # collected rows
    datalist = []
    for tt_m in tt_matrix:
        # (word, weight) tuples
        tt_dict = [(name, tt) for name, tt in zip(vocs, tt_m)]
        tt_dict = sorted(tt_dict, key=lambda x: x[1], reverse=True)
        # alternatively, keep only words with weight > 0.6:
        # tt_dict = [tt_threshold for tt_threshold in tt_dict if tt_threshold[1] > 0.6]
        # keep the top 20 words of each topic:
        tt_dict = tt_dict[:20]
        print('Topic %d:' % topic_id, tt_dict)
        # store:
        datalist += [[tt_dict[i][0], tt_dict[i][1], topic_id] for i in range(len(tt_dict))]
        topic_id += 1
    # write to excel:
    # df = pd.DataFrame(datalist, columns=['特徵詞', '權重', '類別'])
    # df.to_excel('./data/LDA_主題分佈3.xlsx', index=False)
if __name__ == '__main__':
    '''
    Topic extraction with the LDA topic model:
    '''
    LDA()
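As a quick interpretation aid (toy numbers, not real output): each row of docres is a probability distribution over the four topics, so argmax picks the dominant topic and max gives its probability, which is exactly what fills the '主題類別' and '主題出現概率' columns above.

# toy document-topic matrix for three documents and four topics (illustrative values only)
import numpy as np

docres = np.array([[0.70, 0.10, 0.10, 0.10],
                   [0.05, 0.05, 0.80, 0.10],
                   [0.25, 0.25, 0.25, 0.25]])
print(np.argmax(docres, axis=1))   # [0 2 0] -> dominant topic of each document
print(docres.max(axis=1))          # [0.7  0.8  0.25] -> its probability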
Snownlp情感分析.py:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/2/27 12:54
# @Author : ystraw
# @Site :
# @File : Snownlp情感分析.py
# @Software: PyCharm Community Edition
# @function: sentiment analysis
import pandas as pd
from snownlp import SnowNLP
def qgjs():
    # read the data
    data = pd.read_excel('./data/LDA_主題分佈_類別.xlsx')
    # print(data.shape)
    # score the sentiment of each title
    score = []
    for i in range(0, data.shape[0]):
        s = SnowNLP(data['標題'].at[i])
        score.append(s.sentiments)
    data['情緒得分'] = score
    print(data.head())
    data.to_excel('./data/情緒得分.xlsx', index=False)
if __name__ == '__main__':
qgjs()
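For context, SnowNLP's sentiments attribute is a score between 0 and 1, where values close to 1 read as positive and values close to 0 as negative; a minimal check (the sentence is just an example):

# minimal SnowNLP sentiment check
from snownlp import SnowNLP

s = SnowNLP('這部電影真好看')   # example sentence
print(s.sentiments)             # a float in [0, 1]; closer to 1 means more positive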