【python實現網絡爬蟲(8)】requests+bs4實現笑話大全數據爬取

這裏就直接給代碼和輸出結果

import re
import requests
from bs4 import BeautifulSoup


def get_url(n):
    """Build the list of listing-page URLs to crawl.

    Args:
        n: Number of URLs to generate. Page indices run from 0 to n - 1.

    Returns:
        A list of ``n`` URL strings, one per page index, in ascending order.
    """
    return [
        f"http://xiaohua.zol.com.cn/lengxiaohua/{page}.html"
        for page in range(n)
    ]

def get_data(ui, dic_h, dic_c, timeout=10):
    """Fetch one listing page and extract the entries it contains.

    Args:
        ui: URL of the listing page to download.
        dic_h: Mapping of HTTP request headers sent with the request.
        dic_c: Mapping of cookie names to values sent with the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        A list of dicts, one per ``<li>`` found under the
        ``<ul class="article-list">`` element, each with the keys
        '標題' (link text), '來源' (text of the node following the first
        ``<span>`` in the ``article-source`` div) and '內容' (summary text
        with all whitespace removed).

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
        AttributeError: If the page lacks one of the expected elements.
    """
    ri = requests.get(ui, headers=dic_h, cookies=dic_c, timeout=timeout)
    # Fail early on 4xx/5xx instead of trying to parse an error page.
    ri.raise_for_status()
    soupi = BeautifulSoup(ri.text, 'lxml')
    lis = soupi.find("ul", class_="article-list").find_all("li")
    lst = []
    for li in lis:
        title = li.find("a").text
        # The source name is the node immediately after the first <span>.
        source = li.find("div", class_="article-source").span.next_sibling.text
        # Strip every whitespace character so the summary is one compact string.
        content = re.sub(r"\s", "", li.find("div", class_="summary-text").text)
        lst.append({'標題': title, '來源': source, '內容': content})
    return lst


def parse_cookie_string(cookie_str):
    """Convert a ``name=value; name=value`` cookie header into a dict.

    Only the first ``=`` in each pair separates name from value, so values
    that themselves contain ``=`` are kept intact. Empty segments are
    skipped; a segment with no ``=`` maps its name to an empty string.

    Args:
        cookie_str: Cookie header text with pairs separated by ``"; "``.

    Returns:
        A dict mapping cookie names to their values.
    """
    parsed = {}
    for pair in cookie_str.split("; "):
        name, _, value = pair.partition("=")
        if name:
            parsed[name] = value
    return parsed


if __name__ == "__main__":
    urllst = get_url(10)
    dic_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}

    # Raw cookie header text; presumably copied from a browser session.
    cookies = 'ip_ck=5ceJ4vn3j7QuMjA4NTMwLjE1NzgxMzM1Nzc%3D; _ga=GA1.3.1225262396.1578133580; bdshare_firstime=1578133579985; z_pro_city=s_provice%3Dhenan%26s_city%3Dxinyang; userProvinceId=22; userCityId=274; userCountyId=0; userLocationId=98988; _gid=GA1.3.714541997.1581779195; questionnaire_close_today=1581724801; questionnaire_close_total=1; lv=1581813976; vn=7; Hm_lvt_ae5edc2bc4fc71370807f6187f0a2dd0=1581674234,1581677358,1581779195,1581813977; questionnaire_pv=1581811202; Hm_lpvt_ae5edc2bc4fc71370807f6187f0a2dd0=1581813980; 0eaca02be5352ff53a4b3abd16c22bb8=bs283k1v2g1fn2bo278t%7B%7BZ%7D%7D2%7B%7BZ%7D%7Dnull; 25c963336b4e0a1c4aa78f69eb50b24c=bs283k1v2g1fn2bo278t%7B%7BZ%7D%7D2%7B%7BZ%7D%7Dnull; MyZClick_0eaca02be5352ff53a4b3abd16c22bb8=/html/body/div%5B6%5D/div/div%5B2%5D/div/a/; MyZClick_25c963336b4e0a1c4aa78f69eb50b24c=/html/body/div%5B6%5D/div/div%5B2%5D/div/a/'
    dic_cookies = parse_cookie_string(cookies)

    data_lst = []   # every record collected so far, across all pages
    errorlst = []   # URLs whose download or parsing failed
    for u in urllst:
        try:
            data_lst.extend(get_data(u, dic_headers, dic_cookies))
            print("已經爬取{}條數據".format(len(data_lst)))
        except Exception as exc:
            # Catch Exception rather than everything, so Ctrl-C and
            # SystemExit still stop the crawl; record the URL and move on.
            errorlst.append(u)
            print("數據採集失敗,網址爲:", u, repr(exc))

輸出結果爲:
(此處原為程序運行輸出的截圖)
excel中的數據爲:
(此處原為excel數據的截圖;注意上面的代碼並未包含寫入excel的步驟,需另行將 data_lst 導出)

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章