# 模板1之爬取豆瓣電影排行榜
#
# 僞代碼,當模板

import requests
import bs4
import re

def open_url(url):
	"""Download ``url`` with a custom User-Agent header.

	Args:
		url: Address of the page to request.

	Returns:
		The response object returned by ``requests.get``.
	"""
	# Placeholder value: replace it with a real User-Agent string copied
	# from the browser's developer tools (F12) before running.
	headers = {
		'user-agent': '網頁F12去找',
	}
	# The imported module is ``requests``; the previous ``request.get``
	# referenced an undefined name and raised NameError.
	return requests.get(url, headers=headers)

def find_movies(res):
	"""Extract movie titles, ratings and detail text from one ranking page.

	Args:
		res: Response-like object whose ``text`` attribute holds the page HTML.

	Returns:
		A list of ``[title, rating, details]`` lists. The three columns are
		collected independently and paired by position; if one column ends
		up shorter than the others, the result is truncated to the shortest
		column instead of raising IndexError.
	"""
	soup = bs4.BeautifulSoup(res.text, 'html.parser')

	# Movie titles: text of the first <span> inside the link of each
	# <div class="hd">. ``class`` is a Python keyword, so Beautiful Soup
	# takes the CSS class filter as ``class_``.
	movies = [each.a.span.text for each in soup.find_all("div", class_="hd")]

	# Ratings: text of each <span class="rating_num">.
	ranks = [each.text for each in soup.find_all("span", class_="rating_num")]

	# Details: second and third lines of the first <p> in each <div class="bd">.
	messages = []
	for each in soup.find_all("div", class_="bd"):
		try:
			lines = each.p.text.split('\n')
			messages.append(lines[1].strip() + lines[2].strip())
		except (AttributeError, IndexError):
			# This <div class="bd"> has no <p>, or its text has fewer than
			# three lines; skip it as the original best-effort code did.
			continue

	# NOTE(review): a skipped entry above shifts later details up by one row;
	# confirm against the real page that only non-movie blocks are skipped.
	return [list(row) for row in zip(movies, ranks, messages)]
	
# Work out how many result pages there are in total.
def find_depth(res):
	"""Return the total number of pages shown by the page's pager.

	Args:
		res: Response-like object whose ``text`` attribute holds the page HTML.

	Returns:
		The page count as an ``int``.

	Raises:
		AttributeError: If the page has no ``<span class="next">`` element or
			the expected siblings before it are missing.
		ValueError: If the located text is not an integer.
	"""
	# ``BeautifulSoup`` was misspelled ``BeautifulSOup``, which raised
	# AttributeError on the bs4 module.
	soup = bs4.BeautifulSoup(res.text, 'html.parser')
	# Step back two siblings from the "next" span and read that node's text
	# as the page count.
	# NOTE(review): presumably the immediate sibling is whitespace and the one
	# before it is the last page-number link; confirm against the real markup.
	depth = soup.find('span', class_='next').previous_sibling.previous_sibling.text

	return int(depth)

def save_to_excel(result, filename="1.xlsx"):
	"""Write the scraped rows to an Excel workbook.

	Args:
		result: Iterable of rows, each a sequence of cell values matching
			the three header columns.
		filename: Path of the workbook to write. Defaults to ``"1.xlsx"``,
			the name the original code used.
	"""
	# Imported here because the file's top-level imports do not include
	# openpyxl, so the name was previously undefined.
	import openpyxl

	wb = openpyxl.Workbook()
	ws = wb.active
	ws.append(['電影名稱', '評分', '資料'])

	for each in result:
		ws.append(each)
	# Saving is a Workbook method; the worksheet object has no ``save``.
	wb.save(filename)
	
def main():
	"""Scrape every ranking page under ``host`` and save the rows to Excel.

	Fetches the first page to learn the page count, then requests each page
	with a ``start`` offset of 25 entries per page, collects the rows from
	``find_movies`` and passes them to ``save_to_excel``.
	"""
	# Placeholder value: replace it with the real ranking-page URL.
	host = "一個網頁的url"
	res = open_url(host)
	depth = find_depth(res)

	result = []
	for page in range(depth):
		# The ``start`` query parameter advances by 25 for each page.
		url = host + '/?start=' + str(25 * page)
		res = open_url(url)
		result.extend(find_movies(res))
	save_to_excel(result)


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
	main()
# 發表評論
# 所有評論
# 還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
# 相關文章