本文爲學習記錄筆記,原創非常優秀~ 感謝@數挖小飛飛 思密達。
在運行源程序時,發現關於路徑的報錯。故,本文對@數挖小飛飛 大大的代碼做了一小點修改,添加了建立文件夾函數。
原文鏈接:https://blog.csdn.net/qq_36936730/article/details/104668162
1.修改部分
運行原程序時,若未事先手動建立文件夾“film_pic”,保存圖片時會因找不到該文件夾而報錯。程序報錯如下:
添加函數如下,將 create_dir() 添加至主函數 main() 的第一行即可。
# create dir
def create_dir(file_path=r'E:/PySource/film_pic'):
    """Make sure the image output folder exists.

    The folder (and any missing parent folders) is created when absent and
    left untouched when it already exists. Removing an existing folder with
    ``os.rmdir`` is avoided on purpose: ``os.rmdir`` raises ``OSError`` for a
    non-empty directory, which made every re-run fail once images had been
    saved.

    Args:
        file_path: Folder to create. Defaults to the path used by the
            original script.

    Raises:
        FileExistsError: If ``file_path`` exists but is not a directory.
    """
    import os  # local import keeps this snippet usable on its own

    os.makedirs(file_path, exist_ok=True)
2.運行結果
3.完整學習代碼
# 發送請求——獲得頁面——解析頁面——抽取並儲存內容
import requests
import re
import json
# Disabled earlier draft, kept as a bare string literal so it never executes:
# it fetches only the first result page, prints the regex matches and defines
# a download helper. The active code further down in the file supersedes it.
'''
# Web Capture
url = "https://movie.douban.com/top250?start=0&filter="
headers = {
"user-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3742.400 QQBrowser/10.5.3866.400"
}
response = requests.get(url,headers=headers)
text = response.text
# information extraction
regix = '<div class="pic">.*?<em class="">(.*?)</em>.*?<img.*?src="(.*?)" class="">.*?' \
'div class="info.*?class="hd".*?class="title">(.*?)</span>.*?class="other">(.*?)'\
'</span>.*?<div class="bd">.*?<p class="">(.*?)<br>(.*?)</p>.*?' \
'class="star.*?<span class="(.*?)"></span>.*?span class="rating_num".*?average">(.*?)</span>.*?<span>(.*?)</span>.*?' \
'span class="inq"?>(.*?)</span>'
res = re.findall(regix, text, re.S)
print(res)
# image download defination
def down_image(url,name,headers):
r = requests.get(url,headers = headers)
filename = re.search('/public/(.*?)$',url,re.S).group(1)
with open("film_pic/"+name.split('/')[0]+".jpg",'wb') as f:
f.write(r.content)
'''
# HTTP request headers shared by the page fetch and the image download.
# The value is a desktop-browser User-Agent string; presumably it is there so
# the site answers the script like a normal browser (not verifiable from here).
headers = {
"user-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3742.400 QQBrowser/10.5.3866.400"
}
# create dir
def create_dir(file_path=r'E:/PySource/film_pic'):
    """Make sure the image output folder exists.

    The folder (and any missing parent folders) is created when absent and
    left untouched when it already exists. Removing an existing folder with
    ``os.rmdir`` is avoided on purpose: ``os.rmdir`` raises ``OSError`` for a
    non-empty directory, which made every re-run fail once images had been
    saved.

    Args:
        file_path: Folder to create. Defaults to the path used by the
            original script.

    Raises:
        FileExistsError: If ``file_path`` exists but is not a directory.
    """
    import os

    os.makedirs(file_path, exist_ok=True)
# image download definition
def down_image(url, name, headers):
    """Download one image and store it as ``film_pic/<title>.jpg``.

    Args:
        url: Address of the image to fetch.
        name: Movie title; only the text before the first ``/`` is used as
            the file name.
        headers: HTTP headers forwarded to ``requests.get``.
    """
    import os

    # The file is written to a path relative to the working directory, so
    # make sure that folder exists here instead of relying on a folder that
    # was created somewhere else.
    os.makedirs("film_pic", exist_ok=True)

    r = requests.get(url, headers=headers)

    # An earlier ``re.search('/public/(.*?)$', url)`` lookup was removed: its
    # result was never used, and it raised AttributeError for any URL that
    # does not contain ``/public/``.
    with open("film_pic/" + name.split('/')[0] + ".jpg", 'wb') as f:
        f.write(r.content)
# Web page parsing function
def _replace_nbsp(text, replacement):
    """Replace non-breaking spaces (``&nbsp;`` entity or U+00A0) in *text*."""
    return text.replace('&nbsp;', replacement).replace('\xa0', replacement)


def _format_score(star_class, rating):
    """Build the score text from the star CSS class and the numeric rating."""
    digits = [ch for ch in star_class if ch.isnumeric()]
    if not digits:
        # No digit in the class name: report the rating alone instead of
        # failing with IndexError.
        return rating + '分'
    if len(digits) == 1:
        return digits[0] + '星/' + rating + '分'
    # More than one digit is reported as a half star on top of the first digit.
    return digits[0] + '星半/' + rating + '分'


def _movie_record(item):
    """Convert one 10-field regex match into the output dictionary."""
    name = item[2] + ' ' + _replace_nbsp(item[3], ' ')
    actor = _replace_nbsp(item[4].strip(), '')

    # Field 6 is '/'-separated: year / country / genre. Pad short values with
    # empty strings so a missing part does not raise IndexError.
    parts = [_replace_nbsp(part, ' ').strip() for part in item[5].split('/')]
    parts += [''] * (3 - len(parts))
    year, country, tp = parts[0], parts[1], parts[2]

    score = _format_score(item[6], item[7])
    # Drops the last three characters of the captured text; presumably a
    # fixed suffix after the count -- confirm against the page markup.
    rev_num = item[8][:-3]

    return {
        '電影名稱': name, '導演和演員': actor, '類型': tp, '年份': year,
        '國家': country, '評分': score, '排名': item[0],
        '評價人數': rev_num, '評價': item[9],
    }


def parse_html(url):
    """Fetch one listing page and yield a dictionary per matched movie.

    For every match the image is also downloaded through ``down_image``
    before the record is yielded.

    Args:
        url: Address of the listing page to fetch.

    Yields:
        dict: One record per regex match, keyed by the field labels used in
        the output file.
    """
    response = requests.get(url, headers=headers)
    text = response.text
    # Capture groups: [1: rank 2: image] [3: title 4: alias]
    # [5: director/cast 6: year/country/genre]
    # [7: star class 8: rating 9: number of ratings] [10: quote]
    # NOTE(review): every gap is a non-greedy ``.*?`` under re.S, so an entry
    # without the final "inq" span lets the match run on into a later entry.
    regix = '<div class="pic">.*?<em class="">(.*?)</em>.*?<img.*?src="(.*?)" class="">.*?' \
            'div class="info.*?class="hd".*?class="title">(.*?)</span>.*?class="other">(.*?)'\
            '</span>.*?<div class="bd">.*?<p class="">(.*?)<br>(.*?)</p>.*?' \
            'class="star.*?<span class="(.*?)"></span>.*?span class="rating_num".*?average">(.*?)</span>.*?<span>(.*?)</span>.*?' \
            'span class="inq"?>(.*?)</span>'
    # match all the results
    res = re.findall(regix, text, re.S)
    for item in res:
        down_image(item[1], item[2], headers=headers)
        yield _movie_record(item)
# define output function
def write_movies_file(str):
    """Append one record to ``top250_douban_film.txt`` as a JSON line.

    Args:
        str: JSON-serialisable record (name kept for compatibility even
            though it shadows the builtin inside this function).
    """
    with open('top250_douban_film.txt', mode='a', encoding='utf-8') as out:
        # Non-ASCII text is written as-is rather than as \u escapes.
        serialized = json.dumps(str, ensure_ascii=False)
        out.writelines([serialized, '\n'])
# define main()
def main():
    """Prepare the image folder, then scrape and store all ten result pages."""
    create_dir()
    # Page offsets 0, 25, ..., 225.
    for start in range(0, 250, 25):
        page_url = ''.join(
            ['https://movie.douban.com/top250?start=', str(start), '&filter=']
        )
        for movie in parse_html(page_url):
            print(movie)
            write_movies_file(movie)


# Run the scraper only when the file is executed directly.
if __name__ == '__main__':
    main()