【實例】Python爬取貓眼排行(正則)

一、使用庫

  • re 正則庫
  • requests HTTP庫

二、爬取目標

  • 貓眼排行(TOP100的電影排名、圖片、電影名、主演、上映時間和評分)

  • 地址:https://maoyan.com/board/4
    在這裏插入圖片描述
站點有反爬限制,不加 User-Agent 的情況下爬取會得到亂碼
    在這裏插入圖片描述

三、代碼逐步分析

3.1 爬取源代碼

# _*_ coding:utf-8 _*_

import requests
# 定義一個get_one_page()方法,並傳入url參數
def get_one_page(url):
	"""Fetch one Maoyan board page and return its HTML.

	Args:
		url: the board page URL to download.

	Returns:
		The response body text on HTTP 200, otherwise None.
	"""
	# A browser User-Agent is required; without it the site returns garbage.
	headers = {
		'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
	}
	# Fix: the original call had no timeout, so a stalled connection could
	# hang the crawler indefinitely.
	response = requests.get(url, headers=headers, timeout=10)
	if response.status_code == 200:
		return response.text
	return None
# 定義主函數
def main():
	"""Crawl the Maoyan TOP100 board page and dump its raw HTML to stdout."""
	board_url = 'https://maoyan.com/board/4'
	print(get_one_page(board_url))

# Kick off the crawl when the module is executed.
main()

(成功拿到貓眼首頁源代碼)
在這裏插入圖片描述

3.2 正則提取內容

Tips:正則提取內容的核心是正則表達式的編寫,正則編寫成功的前提是分析HTML源代碼規律

3.2.1 分析源碼規律

》》查看網頁源代碼,每個排名模塊都在一個<dd>標籤中,下方綠色方框中是我們提取的內容,我們可以用紅色下劃線作爲標誌位來寫入正則
在這裏插入圖片描述

3.2.2 正則匹配內容(分析舉例)

(這裏先用正則匹配排名,其它分析類同)

# _*_ coding:utf-8 _*_

import requests, re

def get_one_page(url):
	"""Request one board page; return its HTML text, or None on a non-200 status.

	The Cookie header carries a logged-in Maoyan session so repeated
	requests are not redirected to the verification page.
	"""
	request_headers = {
		'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
		'Cookie': '__mta=154448135.1590711099221.1590720315342.1590721027543.31; uuid_n_v=v1; uuid=F6762C20A14011EA9642EFCCC3619456B947C019F96D4B5FAAF8B1E90D84C541; _csrf=dfbe15498963df30026b4f6b8dd35d4b6bbaca2a320ce3124cf4697f19a06489; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=1725dc51ae0c8-0864648a72fbd-f7d1d38-e1000-1725dc51ae0c8; _lxsdk=F6762C20A14011EA9642EFCCC3619456B947C019F96D4B5FAAF8B1E90D84C541; mojo-uuid=36b40a0e6faced8727ef4645526b4f74; __mta=154448135.1590711099221.1590711291245.1590711298085.4; mojo-session-id={"id":"eeab46203559090c0bea4fe72e744e89","time":1590719256411}; lt=XKHOYXlVQikp_WQcKgzbhGPgqH0AAAAArwoAAEAX05M2TH7VqZxZNJ5ltWa7BFthNtQs6-v0cIyosaqZuMZaAjDpyqUEGSCTKD_-9Q; lt.sig=T35bvq4BmYRGdKFklLmgj7sBw7U; mojo-trace-id=16; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1590711098,1590721027; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be=1590721027; _lxsdk_s=1725e419696-e16-d95-d71%7C%7C31'
	}
	resp = requests.get(url, headers=request_headers)
	# Anything other than a clean 200 means we have no usable page.
	if resp.status_code != 200:
		return None
	return resp.text

# 這裏定義瞭解析提取頁面的方法parse_one_page(),作用是使用正則提取並打印源碼中的字符
def parse_one_page(html):
	"""Extract the movie rank strings from board-page HTML.

	Args:
		html: the page source returned by get_one_page().

	Returns:
		A list of rank strings (e.g. ['1', '2', ...]).

	Fix: the original printed the list but implicitly returned None,
	even though main() assigned the return value; the list is now returned.
	"""
	# Lazily match the rank inside the <i class="board-index-N"> node of each
	# <dd> block; re.S makes '.' also match newlines across the markup.
	pattern = re.compile('<dd>.*?board-index.*?>(.*?)</i>', re.S)
	result = re.findall(pattern , html)
	print(result)
	return result
# 主函數中實現定義的方法
def main():
	"""Fetch the first board page and run the rank extractor over it."""
	page_html = get_one_page('https://maoyan.com/board/4')
	parse_one_page(page_html)

main()

成功打印出電影排名
在這裏插入圖片描述

Tips:這裏多次訪問後再次提取顯示提取爲空,如下:
在這裏插入圖片描述
這裏由於多次請求跳往了美團驗證中心
在這裏插入圖片描述
可以先登錄貓眼電影,再在爬蟲請求頭中添加 Cookie 信息進行繞過(再次爬取恢復正常)
在這裏插入圖片描述

3.2.3 正則匹配內容(全部提取)

# _*_ coding:utf-8 _*_

import requests, re

def get_one_page(url):
	"""Download *url* with browser-like headers.

	Returns the HTML text on HTTP 200, otherwise None. The session
	cookie bypasses the Meituan verification redirect.
	"""
	ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
	cookie = '__mta=154448135.1590711099221.1590720315342.1590721027543.31; uuid_n_v=v1; uuid=F6762C20A14011EA9642EFCCC3619456B947C019F96D4B5FAAF8B1E90D84C541; _csrf=dfbe15498963df30026b4f6b8dd35d4b6bbaca2a320ce3124cf4697f19a06489; _lx_utm=utm_source%3DBaidu%26um_medium%3Dorganic; _lxsdk_cuid=1725dc51ae0c8-0864648a72fbd-f7d1d38-e1000-1725dc51ae0c8; _lxsdk=F6762C20A14011EA9642EFCCC3619456B947C019F96D4B5FAAF8B1E90D84C541; mojo-uuid=36b40a0e6faced8727ef4645526b4f74; __mta=154448135.1590711099221.1590711291245.1590711298085.4; mojo-session-id={"id":"eeab46203559090c0bea4fe72e744e89","time":1590719256411}; lt=XKHOYXlVQikp_WQcKgzbhGPgqH0AAAAArwoAAEAX05M2TH7VqZxZNJ5ltWa7BFthNtQs6-v0cIyosaqZuMZaAjDpyqUEGSCTKD_-9Q; lt.sig=T35bvq4BmYRGdKFklLmgj7sBw7U; mojo-trace-id=16; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1590711098,1590721027; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1590721027; _lxsdk_s=1725e419696-e16-d95-d71%7C%7C31'
	resp = requests.get(url, headers={'User-Agent': ua, 'Cookie': cookie})
	return resp.text if resp.status_code == 200 else None
	
def parse_one_page(html):
	"""Extract every movie field tuple from the board-page HTML.

	Args:
		html: the page source returned by get_one_page().

	Returns:
		A list of 7-tuples: (rank, image url, title, actors,
		release time, integer score part, fractional score part).

	Fix: the original printed result[0] and result[1] unconditionally,
	raising IndexError when fewer than two <dd> entries matched (e.g.
	after the redirect to the Meituan verification page), and dropped
	the parsed list even though main() assigned it.
	"""
	# Each capture group is anchored between fixed markers inside a
	# <dd>...</dd> block; re.S lets '.' span newlines.
	pattern = re.compile('<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)".*?name.*?a.*?">(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>' ,re.S)
	result = re.findall(pattern , html)
	# Show at most the first two entries (same output as before when >= 2 matched).
	print(*result[:2])
	return result

def main():
	"""Download the board page and parse all movie fields from it."""
	board_url = 'https://maoyan.com/board/4'
	parse_one_page(get_one_page(board_url))

main()

(提取完貓眼top10會是如下一個列表形式,這裏只顯示前兩個)
在這裏插入圖片描述

3.2.4 結果顯示優化

def get_one_page(url):
	"""Return the HTML body of *url*, or None when the response status is not 200."""
	# Browser UA plus a logged-in session cookie to avoid the anti-crawl redirect.
	hdrs = {}
	hdrs['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
	hdrs['Cookie'] = '__mta=154448135.1590711099221.1590720315342.1590721027543.31; uuid_n_v=v1; uuid=F6762C20A14011EA9642EFCCC3619456B947C019F96D4B5FAAF8B1E90D84C541; _csrf=dfbe15498963df30026b4f68dd35d4b6bbaca2a320ce3124cf4697f19a06489; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=1725dc51ae0c8-0864648a72fbd-f7d1d38-e1000-1725dc51ae0c8; _lxsdk=F6762C20A14011EA9642EFCCC3619456B947C019F96D4B5FAAF8B1E90D84C541; mojo-uuid=36b40a0e6faced8727ef4645526b4f74; __mta=154448135.1590711099221.1590711291245.1590711298085.4; mojo-session-id={"id":"eeab46203559090c0bea4fe72e744e89","time":1590719256411}; lt=XKHOYXlVQikp_WQcKgzbhGPgqH0AAAAArwoAAEAX05M2TH7VqZxZNJ5ltWa7BFthNtQs6-v0cIyosaqZuMZaAjDpyqUEGSCTKD_-9Q; lt.sig=T35bvq4BmYRGdKFklLmgj7sBw7U; mojo-trace-id=16; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1590711098,1590721027; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1590721027; _lxsdk_s=1725e419696-e16-d95-d71%7C%7C31'
	page = requests.get(url, headers=hdrs)
	if page.status_code == 200:
		return page.text
	return None
	
def parse_one_page(html):
	"""Parse board-page HTML into a list of movie dicts, printing each one.

	Args:
		html: the page source returned by get_one_page().

	Returns:
		A list of dicts with keys rank/image/title/actor/time/score.

	Fix: the original stored each record in a misleadingly named local
	`test` and returned None even though main() assigned the result;
	the records are now collected and returned.
	"""
	pattern = re.compile('<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)".*?name.*?a.*?">(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>' ,re.S)
	result = re.findall(pattern , html)
	movies = []
	# Re-pack each 7-tuple match into a labelled dict.
	for r in result:
		movie = {
			'rank': r[0],
			'image': r[1],
			# strip() removes the surrounding whitespace kept by the lazy match
			'title': r[2].strip(),
			'actor': r[3].strip(),
			'time': r[4].strip(),
			# the score is split across two <i> nodes (integer + fraction)
			'score': r[5].strip() + r[6].strip()
		}
		print(movie)
		movies.append(movie)
	return movies

def main():
	"""Fetch the board page and print each movie as a labelled dict."""
	page = get_one_page('https://maoyan.com/board/4')
	parse_one_page(page)

main()

效果如下
在這裏插入圖片描述

3.2.5 內容保存本地

# _*_ coding:utf-8 _*_

import requests, re, json

def get_one_page(url):
	"""Fetch one Maoyan board page; returns its HTML text, or None on failure."""
	browser_headers = {
		'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
		'Cookie': '__mta=154448135.1590711099221.1590720315342.1590721027543.31; uuid_n_v=v1; uuid=F6762C20A14011EA9642EFCCC3619456B947C019F96D4B5FAAF8B1E90D84C541; _csrf=dfbe15498963df30026b4f6b8dd35d4b6bbaca2a320ce3124cf4697f19a06489; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=1725dc51a0c8-0864648a72fbd-f7d1d38-e1000-1725dc51ae0c8; _lxsdk=F6762C20A14011EA9642EFCCC3619456B947C019F96D4B5FAAF8B1E90D84C541; mojo-uuid=36b40a0e6faced8727ef4645526b4f74; __mta=154448135.1590711099221.1590711291245.1590711298085.4; mojo-session-id={"id":"eeab46203559090c0bea4fe72e744e89","time":1590719256411}; lt=XKHOYXlVQikp_WQcKgzbhGPgqH0AAAAArwoAAEAX05M2TH7VqZxZNJ5ltWa7BFthNtQs6-v0cIyosaqZuMZaAjDpyqUEGSCTKD_-9Q; lt.sig=T35bvq4BmYRGdKFklLmgj7sBw7U; mojo-trace-id=16; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1590711098,1590721027; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1590721027; _lxsdk_s=1725e419696-e16-d95-d71%7C%7C31',
	}
	r = requests.get(url, headers=browser_headers)
	# Only a 200 response carries a usable page body.
	return r.text if r.status_code == 200 else None
	
def parse_one_page(html):
	"""Parse board-page HTML and append every movie record to maoyan_top.txt.

	Args:
		html: the page source returned by get_one_page().

	Returns:
		The list of parsed movie dicts (also written to the local file).

	Fix: the output file was re-opened once per movie inside the loop;
	it is now opened a single time for the whole batch.
	"""
	pattern = re.compile('<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)".*?name.*?a.*?">(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>' ,re.S)
	result = re.findall(pattern , html)

	movies = []
	for r in result:
		movies.append({
			'rank': r[0],
			'image': r[1],
			# strip() removes whitespace kept by the lazy regex match
			'title': r[2].strip(),
			'actor': r[3].strip(),
			'time': r[4].strip(),
			# score is split across two <i> nodes (integer + fraction)
			'score': r[5].strip() + r[6].strip()
		})
	# Open the output file once for all records instead of once per record.
	with open('maoyan_top.txt', 'a', encoding='utf-8') as f:
		for movie in movies:
			f.write(str(movie) + '\n')
			# Alternatively serialize with the json module:
			# ~ f.write(json.dumps(movie, ensure_ascii=False)+'\n')
	return movies

def main():
	"""Download the board page and persist the parsed movies locally."""
	source = get_one_page('https://maoyan.com/board/4')
	parse_one_page(source)

main()

本地保存文件成功
在這裏插入圖片描述

3.2.6 網頁分頁爬取

》》觀察每頁規律

對比第一頁和第二頁,發現offset=10爲第二頁,我們可以通過控制offset參數值來對這10頁進行爬取

在這裏插入圖片描述
》》通過導入offset偏移量即可,核心邏輯爲下:
(採用規範性寫法進入程序主入口)
在這裏插入圖片描述

四、TOP100排名爬取代碼

# _*_ coding:utf-8 _*_

import requests, re, json
# 定義get_one_page()方法,獲取貓眼每頁源碼
def get_one_page(url):
	"""Fetch the source of one Maoyan board page.

	Returns the HTML text on HTTP 200; None for any other status.
	"""
	crawl_headers = {
		'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
		'Cookie': '__mta=154448135.1590711099221.1590720315342.1590721027543.31; uuid_n_v=v1; uuid=F6762C20A14011EA9642EFCCC3619456B947C019F96D4B5FAAF8B1E90D84C541; _csrf=dfbe15498963df30026b4f6b8dd35d4b6bbaca2a320ce3124cf4697f19a06489; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=1725dc51ae0c8-0864648a72fbd-f7d1d38-e1000-1725dc51ae0c8; _lxsdk=F6762C20A14011EA9642EFCCC3619456B947C019F96D4B5FAAF8B1E90D84C541; mojo-uuid=36b40a0e6faced8727ef4645526b4f74; __mta=154448135.1590711099221.1590711291245.1590711298085.4; mojo-session-id={"id":"eeab46203559090c0be4fe72e744e89","time":1590719256411}; lt=XKHOYXlVQikp_WQcKgzbhGPgqH0AAAAArwoAAEAX05M2TH7VqZxZNJ5ltWa7BFthNtQs6-v0cIyosaqZuMZaAjDpyqUEGSCTKD_-9Q; lt.sig=T35bvq4BmYRGdKFklLmgj7sBw7U; mojo-trace-id=16; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1590711098,1590721027; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1590721027; _lxsdk_s=1725e419696-e16-d95-d71%7C%7C31'
	}
	reply = requests.get(url, headers=crawl_headers)
	if reply.status_code != 200:
		return None
	return reply.text
# 定義parse_one_page()方法,解析正則提取目標字符串(排名、圖片地址、電影名……)	
def parse_one_page(html):
	"""Extract the target fields (rank, image url, title, actors, release
	time, score) from one board page and append them to maoyan_top.txt.

	Args:
		html: the page source returned by get_one_page().

	Returns:
		The list of parsed movie dicts.

	Fix: the output file was re-opened inside the per-movie loop; it is
	now opened once per page.
	"""
	pattern = re.compile('<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)".*?name.*?a.*?">(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>' ,re.S)
	result = re.findall(pattern , html)
	# Turn each 7-tuple match into a labelled dict.
	movies = []
	for r in result:
		movies.append({
			'rank': r[0],
			'image': r[1],
			# strip() drops whitespace kept by the lazy regex match
			'title': r[2].strip(),
			'actor': r[3].strip(),
			'time': r[4].strip(),
			# score comes split across two <i> nodes (integer + fraction)
			'score': r[5].strip() + r[6].strip()
		})
	# Persist all records with a single open() per page.
	with open('maoyan_top.txt', 'a', encoding='utf-8') as f:
		for movie in movies:
			f.write(str(movie) + '\n')
	return movies
			
# 接收主函數中的offset偏移量,構造URL
def main(offset):
	"""Crawl a single board page selected by its *offset* query parameter."""
	page_url = 'https://maoyan.com/board/4?offset=' + str(offset)
	parse_one_page(get_one_page(page_url))

# Script entry point: walk all ten pages (offsets 0, 10, ..., 90).
if __name__ == '__main__':
	for page_offset in range(0, 100, 10):
		main(offset=page_offset)

(爬取結果如下)
在這裏插入圖片描述

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章