練習 XPath:爬取豆瓣正在上映的電影,獲取電影信息,並按評分對電影排序。
import requests
from lxml import etree
import operator
def get_text():
    """Download the Douban "now playing" page and save it as ``douan.html``.

    The Zhengzhou listing page is requested with a Firefox ``User-Agent``
    and a Douban ``Referer`` header, and the decoded response text is
    written to ``douan.html`` in the current working directory as UTF-8.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with a 4xx/5xx status code.
    """
    headers = {
        'Referer': 'https://movie.douban.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0',
    }
    url = 'https://movie.douban.com/cinema/nowplaying/zhengzhou/'
    # requests applies no timeout by default; bound the wait so a stalled
    # connection cannot hang the script indefinitely.
    resp = requests.get(url, headers=headers, timeout=10)
    # Fail loudly instead of saving an error page that would later be
    # parsed as if it were the movie listing.
    resp.raise_for_status()
    with open('douan.html', 'w', encoding='utf-8') as fp:
        fp.write(resp.text)
# 解析
def parser_contents():
    """Parse the saved ``douan.html`` and collect one info dict per movie.

    Returns:
        list[dict]: one dict per ``<li>`` element matched by the listing
        XPath. The six ``data-*`` fields keep the raw list returned by
        ``xpath()``; the poster, detail-page and ticket entries hold the
        first matching URL, or ``None`` when the element has no such link.
    """
    nowplaying = []
    parser = etree.HTMLParser(encoding='utf-8')
    html = etree.parse('douan.html', parser=parser)
    # Absolute path to the movie <li> items; it depends on the exact page
    # layout, so an empty result here usually means the markup changed.
    movies = html.xpath('/html/body/div[3]/div[1]/div/div[1]/div[3]/div[2]/ul/li')
    for movie in movies:
        # Attribute lookups return lists of strings (possibly empty).
        release = movie.xpath('.//@data-release')
        title = movie.xpath('.//@data-title')
        score = movie.xpath('.//@data-score')
        duration = movie.xpath('.//@data-duration')
        director = movie.xpath('.//@data-director')
        actors = movie.xpath('.//@data-actors')
        # A missing link must not abort the whole parse, so take the first
        # match when there is one and fall back to None otherwise.
        picture = _first_or_none(movie.xpath('.//li[1]//img/@src'))
        details = _first_or_none(movie.xpath('.//li[2]/a//@href'))
        buy_ticket = _first_or_none(movie.xpath('.//li[4]/a//@href'))
        info = {
            '電影名稱': title,
            '上映時間': release,
            '評分': score,
            '時長': duration,
            '導演': director,
            '演員': actors,
            '海報': picture,
            '詳細': details,
            '選座購票': buy_ticket,
        }
        nowplaying.append(info)
    return nowplaying


def _first_or_none(values):
    """Return the first item of *values*, or ``None`` when it is empty."""
    return values[0] if values else None
def rating_key(movie):
    """Return a movie's score as a float so that sorting is numeric.

    The score entry may be a list of strings (as produced by ``xpath()``)
    or a plain string. A missing, empty or non-numeric score counts as 0.0.
    """
    raw = movie.get('評分')
    if isinstance(raw, (list, tuple)):
        raw = raw[0] if raw else None
    try:
        return float(raw)
    except (TypeError, ValueError):
        return 0.0


if __name__ == '__main__':
    get_text()
    nowplaying = parser_contents()
    print(len(nowplaying))
    # Sort by score, highest first. Comparing the raw values would order
    # them as strings ('10.0' < '9.1'), so convert to a number first.
    nowplaying = sorted(nowplaying, key=rating_key, reverse=True)
    for i in nowplaying:
        print(i)
        print('\n')
運行截圖: