Python可視化利器 — pyecharts 結合爬蟲對數據進行分析生成各種可視化圖形

用 pyecharts 對爬蟲後的數據進行可視化處理(生成餅圖、柱狀圖、地理位置圖、3D旋轉動圖、詞雲圖)及地圖問題的解決

首先,pyecharts 是一款融合了 Python 與 ECharts 技術的強大數據可視化工具,它支持的圖表類型十分豐富,具體可以參考 pyecharts 中文網站:https://pyecharts.org/#/zh-cn/intro

 pyecharts 的安裝方法

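安裝方式與 Selenium 相同,可以參考 Selenium 的兩種安裝方法(任選一種即可,該文附有圖示和安裝步驟)。注意:本文代碼使用的是 pyecharts 0.5.x 的 API(例如 from pyecharts import Pie),與 1.0 及之後的版本不兼容,可以用 pip install pyecharts==0.5.11 安裝。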

綜合案例

1、爬取貓眼中的《一出好戲》的數據

__author__ = 'xiaoguo'

from urllib import request
import ssl, json
from datetime import datetime, timedelta
import time

# 獲取數據
def get_data(url, timeout=10):
    """Download *url* and return the raw response body.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the socket before giving up, so a stalled
            connection cannot hang the crawler forever.

    Returns:
        The response body as ``bytes``.

    Raises:
        ConnectionError: The server answered with a status other than 200.
        urllib.error.URLError: ``urlopen`` could not complete the request.
    """
    # The request identifies itself as mobile Safari on an iPhone.
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
    }
    req = request.Request(url, headers=headers)
    # NOTE(review): certificate verification is switched off through a private
    # ssl helper; kept as in the original, but confirm this is still wanted.
    context = ssl._create_unverified_context()

    # The context manager closes the connection even when reading fails.
    with request.urlopen(req, timeout=timeout, context=context) as response:
        status = response.getcode()
        if status != 200:
            # Previously this path fell through to an unbound local variable.
            raise ConnectionError('unexpected HTTP status %s for %s' % (status, url))
        return response.read()

# 處理數據
def parse_data(html):
    """Extract the comment records from a JSON response body.

    Args:
        html: JSON text (``str`` or ``bytes``) whose top-level object holds the
            comments under the ``'cmts'`` key.

    Returns:
        A list of dicts with the keys ``id``, ``nickName``, ``cityName``,
        ``content``, ``score`` and ``startTime``. The list is empty when the
        payload has no ``'cmts'`` entry or the entry is null/empty.
    """
    # A missing or null 'cmts' entry is treated as "no comments" instead of
    # raising, so callers can detect an exhausted page.
    comments = json.loads(html).get('cmts') or []
    return [
        {
            'id': item['id'],
            'nickName': item['nickName'],
            # Some comments carry no city; a single space is stored instead.
            'cityName': item.get('cityName', ' '),
            # Line breaks are replaced by spaces.
            'content': item['content'].replace('\n', ' '),
            'score': item['score'],
            'startTime': item['startTime'],
        }
        for item in comments
    ]

# 存儲數據到文本文件中
def save_to_txt():
    """Crawl the comments page by page and append them to ``maoyanContent.txt``.

    Starting from the current time, each request asks for the comments posted
    before ``start_time``. The timestamp of the last comment on the page (minus
    one second) becomes the next ``start_time``. Crawling stops once
    ``start_time`` reaches ``2018-08-08 00:00:00`` or a page comes back empty.

    Each comment is written as one line:
    ``id,nickName,cityName,content,score,startTime``.
    """
    time_format = '%Y-%m-%d %H:%M:%S'
    # Current time: the newest comments are fetched first.
    start_time = datetime.now().strftime(time_format)
    # Crawling stops once start_time is no longer later than this.
    end_time = '2018-08-08 00:00:00'

    # Both values are zero-padded "YYYY-MM-DD HH:MM:SS" strings, so comparing
    # them as text orders them chronologically.
    while start_time > end_time:
        url = 'http://m.maoyan.com/mmdb/comments/movie/1203084.json?_v_=yes&offset=0&startTime=' + start_time.replace(' ', '%20')
        try:
            html = get_data(url)
        except Exception:
            # One retry after a short pause; a second failure propagates.
            # `Exception` (not a bare except) keeps Ctrl-C working.
            time.sleep(0.5)
            html = get_data(url)
        else:
            # Pause briefly between successful requests.
            time.sleep(0.2)

        contents = parse_data(html)
        print(contents)

        if not contents:
            # An empty page would otherwise raise IndexError below.
            break

        # Continue from one second before the last comment on this page.
        last_seen = datetime.strptime(contents[-1]['startTime'], time_format)
        start_time = (last_seen - timedelta(seconds=1)).strftime(time_format)

        # Open the file once per page instead of once per comment.
        with open('maoyanContent.txt', mode='a', encoding='utf-8') as f:
            for item in contents:
                f.write(str(item['id']) + ',' + item['nickName'] + ',' + item['cityName'] + ',' + item['content'] + ',' + str(item['score']) + ',' + item['startTime'] + '\n')

# Start the crawl only when this file is run as a script, not when imported.
if __name__ == '__main__':
    save_to_txt()

2、用pyecharts對爬取到的星級評分進行餅圖可視化處理

starScore.py

__author__ = 'xiaoguo'
from pyecharts import Pie
# Collect the score column of every saved comment.
star_rates = []
with open('maoyanContent.txt', mode='r', encoding='utf-8') as f:
    for line in f:
        # Row layout: id,nickName,cityName,content,score,startTime.
        # The comment text may itself contain commas, so the score is located
        # from the right-hand side, where the last two columns never do.
        fields = line.rstrip('\n').rsplit(',', 2)
        if len(fields) == 3:  # skip blank or truncated lines
            star_rates.append(fields[1])

# Star levels shown in the chart; each level groups a whole score with the
# half score below it (e.g. five stars = 5 and 4.5).
attr = ['五星', '四星', '三星', '二星', '一星']
value = [
    star_rates.count('5') + star_rates.count('4.5'),
    star_rates.count('4') + star_rates.count('3.5'),
    star_rates.count('3') + star_rates.count('2.5'),
    star_rates.count('2') + star_rates.count('1.5'),
    star_rates.count('1') + star_rates.count('0.5'),
]

pie = Pie('《一出好戲》星級評分',
          title_pos='center',
          width=900,
)
pie.add('',
        attr,
        value,
        is_label_show=True,
        legend_pos='left',
        legend_orient="vertical",
        radius=[20, 60],
)

# Write the chart to a standalone HTML file.
pie.render('《一出好戲》電影評分餅圖.html')

3、用 jieba 和 wordcloud 對爬取到的電影評論進行詞雲圖可視化處理

以下的代碼可以根據給定的圖片生成和圖片一樣形狀的詞雲圖:

# Load the background image
bg_image = plt.imread('bg.jpg')

commentsWordCloud.py

__author__ = 'xiaoguo'
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from wordcloud import STOPWORDS

# Collect the text of every saved comment.
contents = []
with open('maoyanContent.txt', mode='r', encoding='utf-8') as f:
    for line in f:
        # Row layout: id,nickName,cityName,content,score,startTime.
        fields = line.rstrip('\n').split(',')
        if len(fields) < 6:  # skip blank or truncated lines
            continue
        # The comment may contain commas itself: everything between the first
        # three columns and the last two belongs to it.
        content = ','.join(fields[3:-2])
        if content.strip():
            contents.append(content)

# Segment the comments into words. The texts are joined with spaces; passing
# str(contents) would feed the list's brackets, quotes and escape sequences
# to the tokenizer.
content_after_split = jieba.cut(' '.join(contents), cut_all=False)
words = ' '.join(content_after_split)           # space-separated tokens

# Words to leave out of the cloud.
stopWords = STOPWORDS.copy()
stopWords.add('電影')
stopWords.add('一出')
stopWords.add('好戲')
stopWords.add('有點')

# Load the background image; it is passed to WordCloud as the mask.
bg_image = plt.imread('bg.jpg')

# Configure the word cloud.
wc = WordCloud(width=1024, height=768, background_color='white', mask=bg_image, stopwords=stopWords, max_font_size=400, random_state=50, font_path='STKAITI.TTF')

# Build the cloud from the segmented text.
wc.generate_from_text(words)

# Draw the image.
plt.imshow(wc)
plt.axis('off')         # hide the axes
plt.show()              # display the image

# Save the image to a file.
wc.to_file('黃渤一出好戲評論詞雲圖.jpg')

4、用pyecharts對爬取到的粉絲位置進行可視化處理

在處理地理位置的時候可能會報錯,缺少城市的包,需要下載以下三個包:

echarts-china-cities-pypkg (0.0.8)
echarts-china-provinces-pypkg (0.0.2)
echarts-countries-pypkg (0.1.4)

粉絲地理位置可視化圖:

粉絲排行3D可視化動圖

is_grid3d_rotate=True,
grid3d_rotate_speed=50,

以上的兩行代碼是決定這個3D圖是否可以轉動以及轉動的速度

fanLocation.py

__author__ = 'xiaoguo'

# 快速統計元素出現的次數庫
from collections import Counter
from pyecharts import Geo, Bar, Page, Bar3D
import json
import pandas as pd

def render():
    """Build the fan-location charts from ``maoyanContent.txt``.

    Reads the city column of every saved comment, lets ``handle`` reconcile the
    names with the coordinate file, and renders three charts: a Geo map of all
    cities, a bar chart of the 20 most frequent cities and a rotating Bar3D of
    the same top 20. Each chart is written to its own HTML file and all three
    are also combined into one page.
    """
    # Collect the city of every comment (third column of each row).
    cities = []
    with open('maoyanContent.txt', mode='r', encoding='utf-8') as f:
        for line in f:
            fields = line.split(',')
            if len(fields) < 3:  # skip blank or truncated lines
                continue
            # Stripping also filters the single-space placeholder used for a
            # missing city, which the old `'' != city` check let through.
            city = fields[2].strip()
            if city:
                cities.append(city)

    # Align the city names with the names known to the coordinate file.
    handle(cities)
    data = Counter(cities).most_common()

    page = Page(page_title='《一出好戲》')

    # Geo map of every city.
    geo = Geo(
        "《一出好戲》粉絲位置分佈圖",
        "數據來源:貓眼",
        title_color="#fff",
        title_pos="center",
        width=1200,
        height=600,
        background_color="#404a59",
    )
    attr, value = geo.cast(data)
    geo.add(
        "",
        attr,
        value,
        visual_range=[0, 400],
        visual_text_color="#fff",
        symbol_size=15,
        is_visualmap=True,
    )
    geo.render('《一出好戲》粉絲位置分佈圖.html')
    page.add(geo)

    # Bar chart of the 20 most frequent cities.
    cities_top20 = Counter(cities).most_common(20)
    bar = Bar(
        "《一出好戲》粉絲來源城市TOP20",
        "數據來源:貓眼",
        title_pos="center",
        width=1200,
        height=600,
    )
    attr, value = bar.cast(cities_top20)
    bar.add(
        '',
        attr,
        value,
        is_label_show=True,
        is_visualmap=True,
    )
    bar.render('《一出好戲》粉絲來源排行榜TOP20—柱狀圖.html')
    page.add(bar)

    # Bar3D of the same ranking. The axes are built in ranking order and sized
    # by the number of cities actually found: the earlier set()-based axes had
    # arbitrary order, and a fixed range(20) failed with fewer than 20 cities.
    x_name = [str(rank) for rank in range(len(cities_top20))]
    y_name = [name for name, _ in cities_top20]
    # Each bar sits at (rank, city) with the comment count as its height.
    data_xyz = [[rank, rank, count] for rank, (_, count) in enumerate(cities_top20)]

    range_color = ['#313695', '#4575b4', '#74add1', '#abd9e9', '#e0f3f8', '#ffffbf',
                   '#fee090', '#fdae61', '#f46d43', '#d73027', '#a50026']
    bar3D = Bar3D(
        '《一出好戲》粉絲排行榜TOP20',
        '數據來源:貓眼',
        width=1200,
        height=600,
        title_pos='center',
    )
    bar3D.add(
        "",
        x_name,
        y_name,
        data_xyz,
        is_visualmap=True,
        visual_range=[0, 20],
        visual_range_color=range_color,
        grid3d_width=150,
        grid3d_depth=100,
        # These two options make the chart rotate and set the rotation speed.
        is_grid3d_rotate=True,
        grid3d_rotate_speed=50,
        grid3d_shading="lambert",
    )
    bar3D.render('《一出好戲》粉絲排行榜TOP—Bar3D.html')
    page.add(bar3D)

    # Combined page holding all three charts.
    page.render('《一出好戲粉絲分佈及排行榜TOP20》.html')

# 用來處理地名數據,解析座標文件中找不到地名的問題
_CITY_COORDINATES_PATH = '/存儲文件的位置/Library/Python/3.6/lib/python/site-packages/pyecharts/datasets/city_coordinates.json'


def handle(cities, coordinates_path=_CITY_COORDINATES_PATH):
    """Reconcile crawled city names with the coordinate file.

    For every distinct name in *cities* that is not a key of the coordinate
    file, the first key (in file order) that starts with the name is used as
    its alias, e.g. '鄭州' -> '鄭州市'. Names of three or more characters also
    match on everything but their last character, which covers renamed
    districts such as '溧水縣' -> '溧水區'. The alias coordinates are added to
    the file under the crawled name. Names with no match are removed from
    *cities*.

    Args:
        cities: List of city names; modified in place.
        coordinates_path: JSON file mapping place names to coordinates. It is
            read and then rewritten with the added aliases.
    """
    with open(coordinates_path, mode='r', encoding='utf-8') as f:
        data = json.load(f)

    data_new = data.copy()      # the original entries plus the aliases found
    unknown = set()
    for city in set(cities):
        if city in data:
            # An exact key always wins. The old scan could overwrite it with
            # an earlier prefix match, or drop the city when the exact key
            # happened to be the last one in the file.
            continue

        alias = None
        if city.strip():        # an empty name would "match" every key
            trimmed = city[:-1]
            for key in data:
                if key.startswith(city) or (len(city) >= 3 and key.startswith(trimmed)):
                    alias = key
                    break

        if alias is None:
            unknown.add(city)
        else:
            data_new[city] = data[alias]

    if unknown:
        # One pass, in place, so the caller's list object is the one updated.
        cities[:] = [city for city in cities if city not in unknown]

    # Write the extended mapping back; ensure_ascii=False keeps the Chinese
    # names readable in the file.
    with open(coordinates_path, mode='w', encoding='utf-8') as f:
        f.write(json.dumps(data_new, ensure_ascii=False))

# Build the charts only when this file is run as a script, not when imported.
if __name__ == '__main__':
    render()

粉絲Top20可視化柱狀圖

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章