用 pyecharts 對爬蟲後的數據進行可視化處理(生成餅圖、柱狀圖、地理位置圖、3D旋轉動圖、詞雲圖)及地圖問題的解決
首先,pyecharts 是一款融合了Python和echarts技術的強大的數據可視化工具,它的可視化類型比較多也很豐富,具體的可以參考pyecharts 中文網站:https://pyecharts.org/#/zh-cn/intro
pyecharts 的安裝方法
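可以參考 Selenium 的兩種安裝方法,任選一種即可(該文附有圖片和安裝步驟)。注意:本文的代碼使用的是 pyecharts 0.5.x 的寫法(例如 from pyecharts import Pie),1.0 及以上版本的接口與之不兼容,安裝時請指定版本,例如:pip install pyecharts==0.5.11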
綜合案例
1、爬取貓眼中的《一出好戲》的數據
__author__ = 'xiaoguo'
from urllib import request
import ssl, json
from datetime import datetime, timedelta
import time
# Fetch data
def get_data(url, timeout=10):
    """Download *url* with a mobile-browser User-Agent and return the raw body.

    Args:
        url: Address of the resource to fetch.
        timeout: Seconds to wait on the socket before giving up, so that a
            stalled connection cannot hang the crawl forever.

    Returns:
        The response body as ``bytes`` when the status code is 200,
        otherwise ``None``.

    Raises:
        Whatever ``urllib.request.urlopen`` raises (network failure, timeout,
        HTTP error status); callers decide whether to retry.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
    }
    req = request.Request(url, headers=headers)
    # NOTE(review): certificate verification is switched off through a private
    # ssl helper; kept as in the original, but confirm this is really wanted.
    context = ssl._create_unverified_context()
    # The context manager closes the connection even if read() fails.
    with request.urlopen(req, timeout=timeout, context=context) as response:
        if response.getcode() == 200:
            return response.read()
    return None
# Parse data
def parse_data(html):
    """Turn a comments JSON payload into a list of flat comment dicts.

    Args:
        html: JSON text (``str`` or ``bytes``) whose ``cmts`` entry holds the
            list of comment objects.

    Returns:
        A list of dicts with the keys ``id``, ``nickName``, ``cityName``,
        ``content``, ``score`` and ``startTime``. The list is empty when the
        payload has no ``cmts`` entry (or it is null).

    Raises:
        KeyError: If a comment lacks one of the mandatory fields.
    """
    comments = json.loads(html).get('cmts') or []
    contents = []
    for item in comments:
        # Records are later stored one per line, so every kind of line break
        # (including a bare carriage return, which text-mode reading treats as
        # a newline) has to be flattened to a space.
        text = (
            item['content']
            .replace('\r\n', ' ')
            .replace('\r', ' ')
            .replace('\n', ' ')
        )
        contents.append({
            'id': item['id'],
            'nickName': item['nickName'],
            # Some comments carry no city; a single space marks "unknown".
            'cityName': item.get('cityName', ' '),
            'content': text,
            'score': item['score'],
            'startTime': item['startTime'],
        })
    return contents
# Store the data in a text file
def save_to_txt():
    """Crawl comments backwards in time and append them to maoyanContent.txt.

    Starting from "now", each request asks for the comments older than the
    current cursor; the cursor then moves to one second before the oldest
    comment of the page. The crawl stops when the cursor reaches the fixed
    end time, when the server returns no body, or when a page is empty.

    Each comment becomes one line with six comma-separated fields:
    id, nickName, cityName, content, score, startTime.
    """
    time_format = '%Y-%m-%d %H:%M:%S'
    # Current time: the newest end of the crawl window.
    start_time = datetime.now().strftime(time_format)
    # Oldest timestamp of interest. Both cursors use the same zero-padded
    # format, so comparing them as strings orders them chronologically.
    end_time = '2018-08-08 00:00:00'
    while start_time > end_time:
        url = 'http://m.maoyan.com/mmdb/comments/movie/1203084.json?_v_=yes&offset=0&startTime=' + start_time.replace(' ', '%20')
        try:
            html = get_data(url)
        except Exception:
            # Best-effort single retry after a short pause. Catching Exception
            # (not a bare except) keeps Ctrl-C able to stop the crawl.
            time.sleep(0.5)
            html = get_data(url)
        else:
            time.sleep(0.2)
        if not html:
            # No usable body: nothing to parse, stop instead of crashing.
            break
        contents = parse_data(html)
        if not contents:
            # An empty page means there are no older comments to fetch.
            break
        print(contents)
        # Open the file once per page rather than once per comment.
        with open('maoyanContent.txt', mode='a', encoding='utf-8') as f:
            f.writelines(_format_record(item) for item in contents)
        # Move the cursor to just before the oldest comment on this page.
        oldest = datetime.strptime(contents[-1]['startTime'], time_format)
        start_time = (oldest - timedelta(seconds=1)).strftime(time_format)


def _format_record(item):
    """Serialise one comment dict as a single comma-separated text line."""
    fields = (
        str(item['id']),
        item['nickName'],
        item['cityName'],
        item['content'],
        str(item['score']),
        item['startTime'],
    )
    # A literal ASCII comma inside a field would shift the columns for readers
    # that split each line on ',', so it is stored as a full-width comma.
    return ','.join(field.replace(',', '\uff0c') for field in fields) + '\n'
# Run the crawl only when this file is executed as a script.
if __name__ == '__main__':
    save_to_txt()
2、用pyecharts對爬取到的星級評分進行餅圖可視化處理
starScore.py
__author__ = 'xiaoguo'
from pyecharts import Pie
# Collect the score of every comment and bucket it into star levels
def count_star_levels(lines):
    """Count how many comments fall into each star level.

    Each line is expected to look like
    ``id,nickName,cityName,content,score,startTime``. The score is read as
    the second field from the right, so commas inside the comment text do not
    shift it. Half scores round up (4.5 counts as five stars); lines that are
    too short, non-numeric, or outside 0.5-5 are ignored.

    Args:
        lines: Iterable of text lines (an open file works).

    Returns:
        A list of five counts ordered five-star, four-star, ..., one-star.
    """
    import math
    from collections import Counter

    levels = Counter()
    for line in lines:
        fields = line.rstrip('\r\n').rsplit(',', 2)
        if len(fields) < 3:
            continue
        try:
            level = math.ceil(float(fields[1]))
        except (ValueError, OverflowError):
            # Not a finite number: malformed record.
            continue
        if 1 <= level <= 5:
            levels[level] += 1
    return [levels[stars] for stars in (5, 4, 3, 2, 1)]


def main():
    """Read maoyanContent.txt and render the star-rating pie chart as HTML."""
    with open('maoyanContent.txt', mode='r', encoding='utf-8') as f:
        value = count_star_levels(f)
    # Star-level labels, in the same order as the counts above.
    attr = ['五星', '四星', '三星', '二星', '一星']
    pie = Pie(
        '《一出好戲》星級評分',
        title_pos='center',
        width=900,
    )
    pie.add(
        '',
        attr,
        value,
        is_label_show=True,
        legend_pos='left',
        legend_orient="vertical",
        radius=[20, 60],
    )
    pie.render('《一出好戲》電影評分餅圖.html')


if __name__ == '__main__':
    main()
3、用 jieba 分詞並用 wordcloud 對爬取到的電影評論進行詞雲圖可視化處理(這一步用的是 jieba、wordcloud 和 matplotlib,而不是 pyecharts)
以下的代碼可以根據給定的圖片生成和圖片一樣形狀的詞雲圖:
# 導入背景圖
bg_image = plt.imread('bg.jpg')
commentsWordCloud.py
__author__ = 'xiaoguo'
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from wordcloud import STOPWORDS
# Collect the text of every comment
def extract_comments(lines):
    """Pull the comment text out of each stored record.

    Each line is expected to look like
    ``id,nickName,cityName,content,score,startTime``. The two rightmost
    fields are cut off first and the first three on the left next, so commas
    inside the comment itself are kept rather than truncating it.
    NOTE(review): a comma inside nickName would still shift the fields.

    Args:
        lines: Iterable of text lines (an open file works).

    Returns:
        A list of non-blank comment strings; malformed lines are skipped.
    """
    comments = []
    for line in lines:
        parts = line.rstrip('\r\n').rsplit(',', 2)
        if len(parts) < 3:
            continue
        head = parts[0].split(',', 3)
        if len(head) < 4:
            continue
        text = head[3].strip()
        if text:
            comments.append(text)
    return comments


def main():
    """Build a word cloud, shaped like bg.jpg, from the stored comments."""
    with open('maoyanContent.txt', mode='r', encoding='utf-8') as f:
        contents = extract_comments(f)
    # Segment the plain text (not the list's repr, which would add brackets
    # and quotes) and join the tokens with spaces for the word cloud.
    words = ' '.join(jieba.cut(' '.join(contents), cut_all=False))
    # Words to leave out of the cloud.
    stop_words = STOPWORDS.copy()
    stop_words.update(('電影', '一出', '好戲', '有點'))
    # Background image used as the mask.
    bg_image = plt.imread('bg.jpg')
    # Word-cloud settings.
    wc = WordCloud(width=1024, height=768, background_color='white', mask=bg_image, stopwords=stop_words, max_font_size=400, random_state=50, font_path='STKAITI.TTF')
    wc.generate_from_text(words)
    # Draw the image without axes and display it.
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    # Save the image to a file.
    wc.to_file('黃渤一出好戲評論詞雲圖.jpg')


if __name__ == '__main__':
    main()
4、用pyecharts對爬取到的粉絲位置進行可視化處理
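在處理地理位置的時候可能會報錯,提示缺少城市地圖的包,需要用 pip 安裝以下三個包(例如:pip install echarts-countries-pypkg echarts-china-provinces-pypkg echarts-china-cities-pypkg):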
echarts-china-cities-pypkg (0.0.8)
echarts-china-provinces-pypkg (0.0.2)
echarts-countries-pypkg (0.1.4)
粉絲地理位置可視化圖:
粉絲排行3D可視化動圖
is_grid3d_rotate=True,
grid3d_rotate_speed=50,
以上的兩行代碼是決定這個3D圖是否可以轉動以及轉動的速度
fanLocation.py
__author__ = 'xiaoguo'
# 快速統計元素出現的次數庫
from collections import Counter
from pyecharts import Geo, Bar, Page, Bar3D
import json
import pandas as pd
def render():
    """Render the fan-location charts from maoyanContent.txt.

    Produces three stand-alone HTML files (geo map, top-20 bar chart, rotating
    3D bar chart) plus one combined page containing all three.
    """
    cities = _read_cities('maoyanContent.txt')
    # Reconcile the scraped names with the coordinate file; handle() edits the
    # list in place, dropping the names it cannot resolve.
    handle(cities)
    city_counts = Counter(cities).most_common()
    # The 20 most frequent cities, most frequent first.
    cities_top20 = city_counts[:20]

    page = Page(page_title='《一出好戲》')

    # Geographic scatter map of all cities.
    geo = Geo(
        "《一出好戲》粉絲位置分佈圖",
        "數據來源:貓眼",
        title_color="#fff",
        title_pos="center",
        width=1200,
        height=600,
        background_color="#404a59",
    )
    attr, value = geo.cast(city_counts)
    geo.add(
        "",
        attr,
        value,
        visual_range=[0, 400],
        visual_text_color="#fff",
        symbol_size=15,
        is_visualmap=True,
    )
    geo.render('《一出好戲》粉絲位置分佈圖.html')
    page.add(geo)

    # Bar chart of the top 20 cities.
    bar = Bar(
        "《一出好戲》粉絲來源城市TOP20",
        "數據來源:貓眼",
        title_pos="center",
        width=1200,
        height=600,
    )
    attr, value = bar.cast(cities_top20)
    bar.add(
        '',
        attr,
        value,
        is_label_show=True,
        is_visualmap=True,
    )
    bar.render('《一出好戲》粉絲來源排行榜TOP20—柱狀圖.html')
    page.add(bar)

    # Rotating 3D bar chart of the same top 20.
    x_name, y_name, data_xyz = _bar3d_series(cities_top20)
    range_color = ['#313695', '#4575b4', '#74add1', '#abd9e9', '#e0f3f8', '#ffffbf',
                   '#fee090', '#fdae61', '#f46d43', '#d73027', '#a50026']
    bar3D = Bar3D(
        '《一出好戲》粉絲排行榜TOP20',
        '數據來源:貓眼',
        width=1200,
        height=600,
        title_pos='center',
    )
    bar3D.add(
        "",
        x_name,
        y_name,
        data_xyz,
        is_visualmap=True,
        visual_range=[0, 20],
        visual_range_color=range_color,
        grid3d_width=150,
        grid3d_depth=100,
        # These two options turn on auto-rotation and set its speed.
        is_grid3d_rotate=True,
        grid3d_rotate_speed=50,
        grid3d_shading="lambert",
    )
    bar3D.render('《一出好戲》粉絲排行榜TOP—Bar3D.html')
    page.add(bar3D)

    page.render('《一出好戲粉絲分佈及排行榜TOP20》.html')


def _read_cities(path):
    """Return the non-blank city field of every well-formed record in *path*."""
    cities = []
    with open(path, mode='r', encoding='utf-8') as f:
        for line in f:
            fields = line.rstrip('\r\n').split(',')
            if len(fields) < 6:
                # Blank or truncated line: there is no city column to read.
                continue
            city = fields[2].strip()
            # Records without a city store a single space; skip those.
            if city:
                cities.append(city)
    return cities


def _bar3d_series(ranked):
    """Build Bar3D axis labels and [x, y, z] points from (city, count) pairs."""
    # One bar per city, placed on the diagonal in ranking order; works for any
    # number of cities and keeps the axis order deterministic.
    x_name = [str(rank) for rank in range(len(ranked))]
    y_name = [city for city, _ in ranked]
    data_xyz = [[rank, rank, count] for rank, (_, count) in enumerate(ranked)]
    return x_name, y_name, data_xyz
# Reconcile scraped place names with the coordinate file so lookups succeed
def handle(cities, coordinates_path='/存儲文件的位置/Library/Python/3.6/lib/python/site-packages/pyecharts/datasets/city_coordinates.json'):
    """Make every name in *cities* resolvable in the coordinate file.

    For each distinct city that is not already a key of the coordinate file:

    * if some known name starts with the city (abbreviated names such as a
      city written without its suffix), the city is added as an alias with
      that entry's coordinates;
    * otherwise, for names of three or more characters, the last character is
      dropped and the prefix search is repeated (covers a changed
      administrative suffix);
    * if nothing matches, or the name is blank, every occurrence of the city
      is removed from *cities*.

    Args:
        cities: List of city names; modified in place.
        coordinates_path: Path of the JSON file mapping place names to
            coordinates. The default is the placeholder path from the
            original script and must point at a real file.

    Returns:
        None. When aliases were added, the coordinate file is rewritten.
    """
    with open(coordinates_path, mode='r', encoding='utf-8') as f:
        data = json.load(f)

    aliases = {}
    unknown = set()
    # sorted() only makes the order of added aliases reproducible.
    for city in sorted(set(cities)):
        if city in data:
            # Exact names must never be overwritten by a prefix match.
            continue
        match = _find_coordinate_key(city, data)
        if match is None:
            unknown.add(city)
        else:
            aliases[city] = data[match]

    if unknown:
        # Slice assignment keeps the caller's list object.
        cities[:] = [city for city in cities if city not in unknown]

    if aliases:
        data.update(aliases)
        with open(coordinates_path, mode='w', encoding='utf-8') as f:
            # ensure_ascii=False keeps the Chinese names readable in the file.
            f.write(json.dumps(data, ensure_ascii=False))


def _find_coordinate_key(city, known_names):
    """Return the first known name that *city* abbreviates, or None."""
    if not city.strip():
        # A blank name is a prefix of everything; treat it as unknown.
        return None
    for name in known_names:
        if name.startswith(city):
            return name
    # Only fall back to the truncated form after every full-prefix candidate
    # has been tried, so a looser match cannot win over a closer one.
    if len(city) >= 3:
        stem = city[:-1]
        for name in known_names:
            if name.startswith(stem):
                return name
    return None
# Build the charts only when this file is executed as a script.
if __name__ == '__main__':
    render()
粉絲Top20可視化柱狀圖