使用Python爬取豆瓣的影評,比爬取網易雲簡單,因爲不需要設置特定的headers。關於網易雲說幾句:它很難爬取,對請求頭有着嚴格的要求,前幾年那會還好些。
爬取結果分爲:用戶名,評價的星級,評論的內容
以後可能會通過評價的星級繪製一個餅圖之類的可視化圖表,算是一個基礎的爬蟲和分析吧
代碼如下:
import os
import requests
from lxml import etree
# Request headers that mimic a desktop browser so the request is less likely
# to be rejected as coming from a crawler.
headers = {
    'Host': 'movie.douban.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
}

# URL template: first placeholder is the movie id, second is the start offset.
COMMENTS_URL = 'https://movie.douban.com/subject/%s/comments?start=%d&limit=20&sort=new_score&status=P'
# Number of comments requested per page (matches ``limit=20`` in the URL).
PAGE_SIZE = 20
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10
SAVE_DIR = './douban'
SAVE_FILE = './douban/comments.text'
# Label written when a comment carries no usable star rating.
NO_GRADE = '暫無評價'
SEPARATOR = '==========================================================================\n'


def grade_label(css_class):
    """Convert a rating span's ``class`` attribute into a star label.

    The star count is read from the character at index 7, i.e. the first
    digit after the ``allstar`` prefix (``'allstar50 rating'`` -> ``'5星'``).
    Any value that does not follow that shape yields the "no rating" label
    instead of a bare ``'星'``.
    """
    digit = css_class[7:8]
    if css_class.startswith('allstar') and digit.isdigit():
        return digit + '星'
    return NO_GRADE


def first_text(nodes):
    """Return the first text child of the first node, or ``''`` if absent."""
    if not nodes:
        return ''
    pieces = nodes[0].xpath('./text()')
    return pieces[0] if pieces else ''


def parse_comments(page_html):
    """Yield ``(name, grade, text)`` for every comment block in *page_html*.

    Comments missing a user name or a body produce an empty string for that
    field rather than raising ``IndexError``.
    """
    # Skip blank responses before handing them to the parser.
    if not page_html or not page_html.strip():
        return
    tree = etree.HTML(page_html)
    # NOTE(review): guarding against a None tree for unparsable input --
    # confirm against the lxml version in use.
    if tree is None:
        return
    for node in tree.xpath('//div[@class="comment"]'):
        name = first_text(node.xpath('.//a[@class=""]'))
        rating_spans = node.xpath('.//span[contains(@class,"rating")]')
        if rating_spans:
            grade = grade_label(rating_spans[0].xpath('./@class')[0])
        else:
            grade = NO_GRADE
        text = first_text(node.xpath('.//span[@class="short"]'))
        yield name, grade, text


def fetch_page(movie_id, page_index):
    """Download one page of comments and return the response body as text.

    Raises ``requests.RequestException`` on network errors, timeouts, or a
    non-2xx HTTP status.
    """
    response = requests.get(
        url=COMMENTS_URL % (movie_id, PAGE_SIZE * page_index),
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    return response.text


def main():
    """Prompt for a movie id and page count, then append comments to disk."""
    movie_id = input('請輸入電影id:').strip()
    page = int(input('請輸入爬取的頁數:'))

    # Create the output folder once, instead of re-checking per comment.
    os.makedirs(SAVE_DIR, exist_ok=True)

    # Keep a single file handle open for the whole run.
    with open(SAVE_FILE, 'a', encoding='utf-8') as f:
        for i in range(page):
            try:
                page_html = fetch_page(movie_id, i)
            except requests.RequestException as exc:
                # Stop instead of silently writing nothing for later pages.
                print('第 %d 頁請求失敗,停止爬取:%s' % (i + 1, exc))
                break
            for name, grade, text in parse_comments(page_html):
                f.write('用戶名:%s\n' % name)
                f.write('評價:%s\n' % grade)
                f.write('評論內容:%s\n' % text)
                f.write(SEPARATOR)


if __name__ == '__main__':
    main()