使用 requests 庫和 XPath 語法爬取豆瓣電影 Top 250
即爬取下面的頁面(https://movie.douban.com/top250):
下面爲實現代碼:
'''
@Description: 爬取豆瓣電影 Top 250
@Author: sikaozhifu
@Date: 2020-06-07 15:16:23
@LastEditTime: 2020-06-07 17:04:13
@LastEditors: Please set LastEditors
'''
from lxml import etree
import requests
# Request headers sent with every page fetch (a desktop Chrome User-Agent).
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
}
# Listing URL template; the placeholder receives the ``start`` offset.
base_url = 'https://movie.douban.com/top250?start={}&filter='
# Step between the ``start`` offsets of consecutive pages.
num = 25
# Accumulates one dict per scraped movie, in page order.
movies = []


def _parse_movie(li):
    """Extract one movie record from a single ``<li>`` of the ranking list.

    Args:
        li: lxml element for one list item of the ``grid_view`` list.

    Returns:
        dict with the keys ``title``, ``image_url``, ``director``,
        ``category``, ``rating``, ``rating_num`` and ``quote``. Every value
        is a string; ``quote`` is ``''`` when the item has no quote.

    Raises:
        IndexError: if an expected node (other than the quote) is missing.
    """
    movie = {}
    movie['title'] = li.xpath('.//img/@alt')[0]
    movie['image_url'] = li.xpath('.//img/@src')[0]

    # Evaluate the info paragraph once: the first text node is stored as
    # "director", the second as "category".
    info = li.xpath('.//div[@class = "bd"]/p/text()')
    movie['director'] = info[0].strip()
    movie['category'] = info[1].strip()

    # The rating is kept as the raw ``class`` attribute of the first span.
    movie['rating'] = li.xpath('.//div[@class = "star"]/span[1]/@class')[0]
    movie['rating_num'] = li.xpath('.//div[@class = "star"]/span[2]/text()')[0]

    # Some movies have no quote, so the result list may be empty. Store the
    # text itself (not the one-element list) so the field is always a string.
    quote = li.xpath('.//span[@class = "inq"]/text()')
    movie['quote'] = quote[0] if quote else ''
    return movie


for x in range(0, 10):
    url = base_url.format(x * num)
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on HTTP errors instead of parsing an error page into
    # an empty result.
    response.raise_for_status()
    html = etree.HTML(response.text)
    lis = html.xpath('//div[@id = "content"]//ol[@class = "grid_view"]/li')
    for li in lis:
        movies.append(_parse_movie(li))

print(movies)
運行結果: