最近看了《哪吒之魔童降世》,有搞笑,有溫情,有劇情,有理念,強烈推薦,給國漫點贊。
最近正好在學習 Python 爬蟲,就試着爬取了這部電影的豆瓣影評。
涉及:
1. requests請求網頁
2. xpath提取數據
3. 爬取遇到“下一頁”操作時的處理
4. openpyxl將數據寫入excel
5. matplotlib.pyplot畫柱狀圖和圓形分佈圖
源碼:
import os
import re
import traceback

import matplotlib.pyplot as plt
import openpyxl
import requests
from lxml import etree
class DB_Film_review_Spider(object):
    """Crawl paginated Douban short comments, chart the ratings, export to Excel.

    Typical use: ``get_url()`` to crawl every reachable page, then
    ``process_data()`` to draw the charts and ``write_excel()`` to save the
    collected rows.
    """

    def __init__(self, url):
        """Remember the first page URL and initialise empty result lists.

        Args:
            url: URL of the first comment page. Later pages are derived from
                it by replacing the value of its ``start`` query parameter.
        """
        self.__start_url = url  # never modified; template for later pages
        self.__url = url        # URL that the next get_data() call fetches
        self.page = 0           # current value of the ``start`` offset
        # Accumulated results over all crawled pages, one list per field.
        self.total_author = []
        self.total_star_num = []
        self.total_comment_time = []
        self.total_recommendation_level = []
        self.total_text = []
        # Rating labels that are counted, and the matching counters.
        # NOTE(review): these labels must equal the scraped title text
        # character-for-character; confirm the script variant (Traditional
        # vs. Simplified Chinese) matches the site, otherwise counts stay 0.
        self.level = ['力薦', '推薦', '還行', '較差', '很差']
        self.num = [0, 0, 0, 0, 0]

    def get_data(self):
        """Fetch the current page and extract its fields into per-page lists.

        Sets ``self.author``, ``self.star_num``, ``self.comment_time``,
        ``self.recommendation_level`` and ``self.text``.

        NOTE(review): every field is selected page-wide into its own list, so
        the lists only line up by index if each comment carries every field.
        Presumably a comment without a rating shifts ``span[2]``/``span[3]``;
        confirm against the page markup.
        """
        # A timeout keeps a stalled connection from hanging the crawl forever.
        response = requests.get(self.__url, timeout=10)
        tree = etree.HTML(response.text)
        self.author = tree.xpath('//div[@class="mod-bd"]//div//div//a/@title')  # author
        self.star_num = tree.xpath('//div[@class="mod-bd"]//h3//span[@class="votes"]/text()')  # "useful" votes
        self.comment_time = tree.xpath('//div[@class="mod-bd"]//h3//span[@class="comment-info"]//span[3]/@title')  # comment time
        self.recommendation_level = tree.xpath('//div[@class="mod-bd"]//h3//span[@class="comment-info"]//span[2]/@title')  # rating label
        self.text = tree.xpath('//div[@class="mod-bd"]//p//span/text()')  # comment text

    def write_excel(self):
        """Write all collected rows to the Excel workbook.

        Loads the workbook if the file already exists, otherwise creates it.
        One row is written per collected author; a column whose list is
        shorter than the author list is left empty for the missing rows
        instead of aborting the export. Errors are printed, not raised.
        """
        file_path = "哪吒豆瓣影評.xlsx"
        column_headers = ["編號", "作者", "推薦程度", "評論時間", "點贊數", "詳細影評"]
        try:
            if os.path.exists(file_path):
                wb = openpyxl.load_workbook(file_path)
                ws = wb["Sheet"]
            else:
                wb = openpyxl.Workbook()
                ws = wb.active
            # Column widths.
            ws.column_dimensions["A"].width = 5
            ws.column_dimensions["B"].width = 30
            ws.column_dimensions["C"].width = 10
            ws.column_dimensions["D"].width = 23
            # NOTE(review): 700 looks larger than spreadsheet applications
            # accept for a column width; confirm the intended value.
            ws.column_dimensions["F"].width = 700
            for col, header in enumerate(column_headers, start=1):
                ws.cell(row=1, column=col, value=header)

            # Columns 3..6 and the list that feeds each of them.
            optional_columns = (
                (3, self.total_recommendation_level),
                (4, self.total_comment_time),
                (5, self.total_star_num),
                (6, self.total_text),
            )
            for i, author in enumerate(self.total_author):
                row = i + 2  # row 1 holds the headers
                ws.cell(row=row, column=1, value=i + 1)  # running number
                ws.cell(row=row, column=2, value=str(author))
                for col, values in optional_columns:
                    if i < len(values):
                        ws.cell(row=row, column=col, value=str(values[i]))
            wb.save(file_path)
        except Exception:
            # Best-effort export: report the problem and carry on.
            traceback.print_exc()

    def _url_for_offset(self, offset):
        """Return the initial URL with its ``start`` value set to *offset*."""
        url, replaced = re.subn(r"(?<=[?&]start=)\d*", str(offset),
                                self.__start_url, count=1)
        if replaced:
            return url
        # The initial URL had no ``start`` parameter: append one.
        separator = "&" if "?" in url else "?"
        return url + separator + "start=" + str(offset)

    def get_url(self):
        """Crawl page after page until a page yields no new author.

        After each page the ``start`` offset grows by 20 and the next URL is
        derived from the URL given to the constructor. Errors are printed,
        not raised; results gathered so far are kept.
        """
        try:
            while True:
                self.get_data()
                # Stop when this page added nothing new (also true for an
                # empty page, since the empty set is a subset of any set).
                if set(self.author).issubset(self.total_author):
                    break
                self.total_author.extend(self.author)
                self.total_star_num.extend(self.star_num)
                self.total_comment_time.extend(self.comment_time)
                self.total_recommendation_level.extend(self.recommendation_level)
                self.total_text.extend(self.text)
                self.page += 20  # pages differ only in the ``start`` value
                self.__url = self._url_for_offset(self.page)
        except Exception:
            # Best-effort crawl: report the problem and keep what we have.
            traceback.print_exc()

    def _count_levels(self):
        """Return how often each label in ``self.level`` was collected."""
        return [self.total_recommendation_level.count(label)
                for label in self.level]

    def process_data(self):
        """Count the ratings and show them as a bar chart and a pie chart.

        ``self.num`` is recomputed from scratch on every call, so calling
        this method repeatedly does not inflate the counts.
        """
        self.num = self._count_levels()
        if not any(self.num):
            # A pie chart cannot be drawn from all-zero data.
            print("No recognised rating labels were collected; nothing to plot.")
            return

        # Bar chart.
        plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render Chinese
        plt.rcParams['axes.unicode_minus'] = False    # render minus signs correctly
        plt.bar(range(len(self.level)), self.num, label='推薦等級',
                tick_label=self.level, fc='r')
        plt.title("推薦情況")
        plt.show()

        # Pie chart.
        plt.figure(figsize=(6, 9))  # figure size: width, height
        colors = ['red', 'yellowgreen', 'lightskyblue', 'green', 'gray']
        # Pull the first wedge slightly out of the pie; the value is the gap.
        explode = (0.05, 0, 0, 0, 0)
        # labeldistance: label position as a multiple of the radius.
        # autopct '%3.1f%%': minimum width 3, one decimal place, then '%'.
        # startangle: angle at which the first wedge starts.
        # pctdistance: distance of the percentage text from the centre.
        plt.pie(self.num, explode=explode, labels=self.level, colors=colors,
                labeldistance=1.1, autopct='%3.1f%%', shadow=False,
                startangle=90, pctdistance=0.6)
        plt.axis('equal')  # equal axis scaling so the pie is a circle
        plt.legend()
        plt.show()
if __name__ == '__main__':
    # Script entry point: crawl every reachable comment page, draw the
    # charts, then export the collected rows to Excel.
    first_page = "https://movie.douban.com/subject/26794435/comments?start=0&limit=20&sort=new_score&status=P"
    spider = DB_Film_review_Spider(first_page)
    spider.get_url()
    spider.process_data()
    spider.write_excel()
運行結果:
問題:
1. 代碼需優化的部分還有很多
2. 僅爬取了10頁(共220條)評論信息,之後的頁面需要登錄才能訪問,留待後續處理。