A few days ago I saw a post on the CSDN WeChat official account about building a word cloud from reviews of The Story of Ming Lan (知否), and it looked like fun, so I reproduced it myself. It really is fun, but there are a few details to watch out for.
Step one, the code is as follows:
# -*-coding:utf-8-*-
# Import the required modules
import requests
from lxml import etree
import jieba
import time

# Browser request headers, to avoid being flagged as a bot
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36"
}
# Get the number of comment pages with an XPath match (commented out; kept for reference)
'''
def getPageNum(url):
    if url:
        req = requests.get(url, headers=header)
        html = etree.HTML(req.text)
        pageNum = html.xpath(u"//div[@class='paginator']/a[last()]/text()")[0]
        return pageNum
'''
# Start scraping: match the XPath to pull out the short comments
def getContent(url):
    if url:
        req = requests.get(url, headers=header)
        html = etree.HTML(req.text)
        data = html.xpath(u"//span[@class='short']/text()")
        return data
# Build a list of the per-page URLs
def getUrl():
    dataUrl = []
    # There aren't many comments; grab up to 100 pages here, though only about 66 pages actually exist
    for i in range(1, 100):
        url = "https://movie.douban.com/subject/26928226/comments?start=%s" % ((i - 1) * 20)
        # Append every page URL to the list
        dataUrl.append(url)
    return dataUrl
# Main program
if __name__ == '__main__':
    url = "https://movie.douban.com/subject/26928226/comments?start=20"
    # pageNum = getPageNum(url)
    data = getUrl()
    datas = []
    dic = dict()
    # Quick sanity check: data holds the URL list; datas is still empty here
    print(data)
    print(datas)
    # Open (or create) the output file
    file_handle = open('./3.txt', mode='w+', encoding='utf-8')
    for u in data:
        # Sleep so the script doesn't fire requests too fast; mimic a human reader
        time.sleep(1)
        for d in getContent(u):
            # Write each scraped comment to the txt file
            file_handle.write(d)
            print(d)
            # Sleep between comments as well, to avoid being detected and blocked
            time.sleep(1)
            # Segment the comment with jieba for the frequency analysis
            jdata = jieba.cut(d)
            for i in jdata:
                # Simple cleanup: strip whitespace and drop one-character tokens
                if len(i.strip()) > 1:
                    datas.append(i.strip())
    # Close the file once all pages have been written
    file_handle.close()
    # Show a simple frequency summary in the console
    for i in datas:
        if datas.count(i) > 1:
            dic[i] = datas.count(i)
    for key, values in dic.items():
        print("%s===%d" % (key, values))
All of the scraped comments end up in the text file 3.txt.
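One caveat on the scraper: it assumes every request succeeds. Douban may throttle or block rapid anonymous requests, and a block page simply yields an empty XPath result, so failures are silent. A minimal defensive sketch (getContentSafe is my own addition, not part of the original):

# Sketch only: same XPath as getContent, plus a status-code check so a
# block page doesn't silently return nothing
def getContentSafe(url):
    req = requests.get(url, headers=header)
    if req.status_code != 200:
        print("Request failed (%d) for %s" % (req.status_code, url))
        return []
    html = etree.HTML(req.text)
    return html.xpath(u"//span[@class='short']/text()")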
The second script turns 3.txt into the word cloud:

# -*- coding: utf-8 -*-
# Import the required modules
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Build the stop-word list from a file
def stopwordslist(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f.readlines()]
    return stopwords

# Segment the text with jieba; WordCloud can segment on its own,
# but jieba's results feel better
def seg_sentence(sentence):
    sentence_seged = jieba.cut(sentence.strip())
    stopwords = stopwordslist('stopwords1893.txt')  # path to the stop-word file
    outstr = []
    for word in sentence_seged:
        if word not in stopwords:
            if word != '\t' and word != ' ' and word != '\n':
                outstr.append(word)
    return outstr
# Open the text file and read it line by line
# (specify utf-8, since that is how the scraper wrote the file)
f = open("3.txt", 'r', encoding='utf-8')
lines = f.readlines()
sentence = ''
for line in lines:
    sentence = ''.join([sentence, line])  # join all lines into one string
f.close()
# Run the text through jieba
word_result_list = seg_sentence(sentence)
# Join the segmented words with commas
word_result = ','.join(word_result_list)
plt.figure(figsize=(12, 6))
# Path to a Chinese font file
font = r'SimHei.ttf'
# Word-cloud parameters
wc = WordCloud(
    background_color='white',  # white background
    colormap='winter',         # 'winter' color scheme
    font_path=font,            # Chinese font; required, or CJK text renders as boxes
    width=1280,                # width of the word-cloud image
    height=720,                # height of the word-cloud image
    max_font_size=150,         # largest font size used
    max_words=200              # maximum number of words displayed
)
# Feed the text to the word cloud
wc.generate(word_result)
# Display the word cloud
plt.imshow(wc)
# "off" hides the axis ticks and labels
plt.axis("off")
plt.show()
# Save the word cloud to the current directory
wc.to_file("pict_wordcloud.jpg")
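If a rectangular cloud feels plain, the wordcloud library also accepts a mask parameter to shape the output. A minimal sketch, assuming you supply a black-on-white silhouette image mask.png of your own (hypothetical filename):

import numpy as np
from PIL import Image
from wordcloud import WordCloud

# Words are only drawn inside the non-white region of the mask,
# and the mask's dimensions override the width/height settings
mask = np.array(Image.open('mask.png'))
wc_masked = WordCloud(
    background_color='white',
    font_path='SimHei.ttf',
    mask=mask,
    max_words=200
)
wc_masked.generate(word_result)
wc_masked.to_file("pict_wordcloud_masked.jpg")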
Note that you need to download two files first: the stop-word list stopwords1893.txt, and the font file SimHei.ttf.
If you're interested, grab them yourself~~
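Since both files have to sit next to the script, a small pre-flight check (my own addition, not part of the original post) can save a confusing traceback halfway through:

import os

# Warn early if either downloaded file is missing
for path in ('stopwords1893.txt', 'SimHei.ttf'):
    if not os.path.exists(path):
        print("Missing file: %s - download it into the script directory first" % path)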
The result looks like this:
Quite fun, haha. I've been following the show for a good while myself, and it's a good watch.