使用Python爬取豆瓣電影top250

本文爲學習記錄筆記,原創非常優秀~ 感謝@數挖小飛飛 思密達。
在運行源程序時,發現關於路徑的報錯。故,本文對@數挖小飛飛 大大的代碼做了一小點修改,添加了建立文件夾函數。
原文鏈接:https://blog.csdn.net/qq_36936730/article/details/104668162

1.修改部分

在運行原程序時,未手動建立文件夾“film_pic”。程序報錯如下:
在這裏插入圖片描述
添加函數如下,將 create_dir() 添加至主函數第一行即可。

# create dir
def create_dir(file_path=r'E:/PySource/film_pic'):
	"""Create an empty folder at ``file_path``.

	If a directory already exists at that path it is removed together with
	its contents and then created again. Missing parent directories are
	created as well.

	Args:
		file_path: Folder to (re)create. Defaults to the path that used to
			be hard-coded inside this function.

	Raises:
		OSError: If the folder cannot be removed or created, e.g. when
			``file_path`` exists but is a regular file.
	"""
	import os		# imported locally so the helper is self-contained
	import shutil

	if os.path.isdir(file_path):
		# os.rmdir() only removes EMPTY directories and raises OSError
		# otherwise; rmtree() removes the folder with everything in it.
		shutil.rmtree(file_path)

	# makedirs() also creates missing parent folders, which os.mkdir()
	# does not.
	os.makedirs(file_path)

2.運行結果

在這裏插入圖片描述在這裏插入圖片描述

3.完整學習代碼


# 發送請求——獲得頁面——解析頁面——抽取並儲存內容

import requests
import re
import json

# NOTE: the triple-quoted string below is never executed. It holds an earlier
# draft of the scraper (one request for the first page, the extraction regex
# and the image download helper) that is kept only for reference.
'''
# Web Capture
url = "https://movie.douban.com/top250?start=0&filter="
headers = {
	"user-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3742.400 QQBrowser/10.5.3866.400"
}
response = requests.get(url,headers=headers)
text = response.text

# information extraction
regix = '<div class="pic">.*?<em class="">(.*?)</em>.*?<img.*?src="(.*?)" class="">.*?' \
        'div class="info.*?class="hd".*?class="title">(.*?)</span>.*?class="other">(.*?)'\
        '</span>.*?<div class="bd">.*?<p class="">(.*?)<br>(.*?)</p>.*?' \
        'class="star.*?<span class="(.*?)"></span>.*?span class="rating_num".*?average">(.*?)</span>.*?<span>(.*?)</span>.*?' \
        'span class="inq"?>(.*?)</span>'
res = re.findall(regix, text, re.S)
print(res)

# image download defination
def down_image(url,name,headers):
	r = requests.get(url,headers = headers)
	filename = re.search('/public/(.*?)$',url,re.S).group(1)
	with open("film_pic/"+name.split('/')[0]+".jpg",'wb') as f:
		f.write(r.content)
'''

# HTTP request headers; the user-agent value is a desktop-browser string.
headers = {
	"user-Agent": (
		"Mozilla/5.0 (Windows NT 10.0; WOW64) "
		"AppleWebKit/537.36 (KHTML, like Gecko) "
		"Chrome/70.0.3538.25 Safari/537.36 "
		"Core/1.70.3742.400 QQBrowser/10.5.3866.400"
	),
}

# create dir
def create_dir(file_path=r'E:/PySource/film_pic'):
	"""Create an empty folder at ``file_path``.

	If a directory already exists at that path it is removed together with
	its contents and then created again. Missing parent directories are
	created as well.

	Args:
		file_path: Folder to (re)create. Defaults to the path that used to
			be hard-coded inside this function.

	Raises:
		OSError: If the folder cannot be removed or created, e.g. when
			``file_path`` exists but is a regular file.
	"""
	import os		# imported locally so the helper is self-contained
	import shutil

	if os.path.isdir(file_path):
		# os.rmdir() only removes EMPTY directories and raises OSError
		# otherwise; rmtree() removes the folder with everything in it.
		shutil.rmtree(file_path)

	# makedirs() also creates missing parent folders, which os.mkdir()
	# does not.
	os.makedirs(file_path)

# image download definition
def down_image(url,name,headers):
	"""Download one image and store it as ``film_pic/<name>.jpg``.

	Args:
		url: Address of the image to fetch.
		name: Text used for the file name; only the part before the first
			'/' is used.
		headers: HTTP headers sent with the request.

	Raises:
		requests.exceptions.RequestException: If the request fails or
			does not complete within the timeout.
		OSError: If the target folder or file cannot be written.
	"""
	import os

	# A timeout keeps one stalled connection from blocking the run forever.
	r = requests.get(url, headers=headers, timeout=10)

	# The previous version ran re.search('/public/(.*?)$', url).group(1)
	# into an unused variable; that raised AttributeError for any URL
	# without '/public/', so it has been dropped.

	# The file is written relative to the current working directory, so make
	# sure that relative folder exists instead of relying on an absolute one.
	os.makedirs("film_pic", exist_ok=True)

	file_stem = name.split('/')[0]
	with open("film_pic/" + file_stem + ".jpg", 'wb') as f:
		f.write(r.content)

# Web page parsing function
def parse_html(url):
	"""Fetch one list page and yield a dict for every matched movie entry.

	For each match the poster image is downloaded through ``down_image``
	before the record is yielded.

	Args:
		url: Address of the page to fetch.

	Yields:
		dict: One record per match with the keys '電影名稱', '導演和演員',
		'類型', '年份', '國家', '評分', '排名', '評價人數' and '評價'.
	"""
	# A timeout keeps one stalled connection from blocking the run forever.
	response = requests.get(url, headers=headers, timeout=10)
	text = response.text

	# Regex capture groups ([1: rank 2: image] [3: title 4: alias]
	# [5: director/cast 6: year/country/genre]
	# [7: star class 8: score 9: number of ratings] [10: quote])
	regix = '<div class="pic">.*?<em class="">(.*?)</em>.*?<img.*?src="(.*?)" class="">.*?' \
        'div class="info.*?class="hd".*?class="title">(.*?)</span>.*?class="other">(.*?)'\
        '</span>.*?<div class="bd">.*?<p class="">(.*?)<br>(.*?)</p>.*?' \
        'class="star.*?<span class="(.*?)"></span>.*?span class="rating_num".*?average">(.*?)</span>.*?<span>(.*?)</span>.*?' \
        'span class="inq"?>(.*?)</span>'

	# match all the results
	res = re.findall(regix, text, re.S)
	for item in res:
		rank = item[0]
		down_image(item[1], item[2], headers=headers)
		name = item[2] + ' ' + re.sub('&nbsp;', ' ', item[3])
		actor = re.sub('&nbsp;', '', item[4].strip())

		# str.strip('&nbsp;') removes any of the characters & n b s p ; from
		# both ends (it would turn 'spain' into 'ai'), not the entity itself,
		# so replace the entity explicitly and then trim whitespace.
		fields = [part.replace('&nbsp;', ' ').strip() for part in item[5].split('/')]
		# Pad to three fields so a short match yields empty strings instead
		# of raising IndexError and aborting the whole page.
		fields += [''] * (3 - len(fields))
		year, country, tp = fields[0], fields[1], fields[2]

		# Digits found in the star <span> class: one digit is rendered as
		# whole stars, more than one as "<first digit> and a half".
		digits = [ch for ch in item[6] if ch.isnumeric()]
		if not digits:
			# No digit in the class name: report the numeric score only.
			score = item[7] + '分'
		elif len(digits) == 1:
			score = digits[0] + '星/' + item[7] + '分'
		else:
			score = digits[0] + '星半/' + item[7] + '分'

		# Drops the last three characters of the captured text
		# (presumably a fixed suffix after the number - TODO confirm).
		rev_num = item[8][:-3]
		inq = item[9]

		# create dictionary
		yield{
			'電影名稱': name,'導演和演員': actor, '類型': tp, '年份': year, '國家': country, '評分': score,'排名': rank, '評價人數': rev_num, '評價': inq
		}

# define output function
def write_movies_file(str):
	"""Append ``str`` to top250_douban_film.txt as a single JSON line.

	Args:
		str: JSON-serialisable object to store. The parameter name shadows
			the builtin; it is kept for compatibility with callers.
	"""
	with open('top250_douban_film.txt', mode='a', encoding='utf-8') as out_file:
		# ensure_ascii=False keeps non-ASCII text readable in the file.
		serialized = json.dumps(str, ensure_ascii=False)
		out_file.write(serialized + '\n')
# define main()
def main():
	"""Prepare the output folder, then scrape the list page by page.

	Pages are requested with start offsets 0, 25, ..., 225; every record
	yielded by ``parse_html`` is printed and appended to the output file.
	"""
	create_dir()
	page_start = 0
	while page_start < 250:
		page_url = ''.join(('https://movie.douban.com/top250?start=', str(page_start), '&filter='))
		for movie in parse_html(page_url):
			print(movie)
			write_movies_file(movie)
		page_start += 25

# Entry point: run the scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章