文章目錄
一、使用庫
- re 正則庫
- requests HTTP庫
二、爬取目標
- 貓眼排行(TOP100的電影排名、圖片、電影名、主演、上映時間和評分)
- 地址:https://maoyan.com/board/4
站點有反爬限制,不加 User-Agent 的情況下爬取會返回亂碼
三、代碼逐步分析
3.1 爬取源代碼
# _*_ coding:utf-8 _*_
import requests
# Fetch helper: takes the target URL and returns its HTML source.
def get_one_page(url):
    """Request *url* and return the HTML body, or None on a non-200 status.

    A desktop-Chrome User-Agent is sent because Maoyan serves garbled
    content to clients without one.
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
    }
    resp = requests.get(url, headers=headers)
    # Conditional expression instead of the if/return pair — same result.
    return resp.text if resp.status_code == 200 else None
# Entry point for this step of the tutorial.
def main():
    """Download the Maoyan TOP100 board page and print its raw HTML."""
    url = 'https://maoyan.com/board/4'
    html = get_one_page(url)
    print(html)


# Guard the call so importing this module does not trigger a network request.
if __name__ == '__main__':
    main()
(成功拿到貓眼首頁源代碼)
3.2 正則提取內容
Tips:正則提取內容的核心是正則表達式的編寫,正則編寫成功的前提是分析HTML源代碼規律
3.2.1 分析源碼規律
》》查看網頁源代碼,每個排名模塊都在一個<dd>標籤中,下方綠色方框中是我們提取的內容,我們可以用紅色下劃線作爲標誌位來寫入正則
3.2.2 正則匹配內容(分析舉例)
(這裏先用正則匹配排名,其它分析類同)
# _*_ coding:utf-8 _*_
import requests, re
def get_one_page(url):
    """Request *url* as a logged-in desktop browser; return HTML or None.

    The Cookie header carries a Maoyan session so repeated requests are not
    redirected to the Meituan verification page; the User-Agent avoids the
    anti-scraping garbled response.
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'Cookie':'__mta=154448135.1590711099221.1590720315342.1590721027543.31; uuid_n_v=v1; uuid=F6762C20A14011EA9642EFCCC3619456B947C019F96D4B5FAAF8B1E90D84C541; _csrf=dfbe15498963df30026b4f6b8dd35d4b6bbaca2a320ce3124cf4697f19a06489; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=1725dc51ae0c8-0864648a72fbd-f7d1d38-e1000-1725dc51ae0c8; _lxsdk=F6762C20A14011EA9642EFCCC3619456B947C019F96D4B5FAAF8B1E90D84C541; mojo-uuid=36b40a0e6faced8727ef4645526b4f74; __mta=154448135.1590711099221.1590711291245.1590711298085.4; mojo-session-id={"id":"eeab46203559090c0bea4fe72e744e89","time":1590719256411}; lt=XKHOYXlVQikp_WQcKgzbhGPgqH0AAAAArwoAAEAX05M2TH7VqZxZNJ5ltWa7BFthNtQs6-v0cIyosaqZuMZaAjDpyqUEGSCTKD_-9Q; lt.sig=T35bvq4BmYRGdKFklLmgj7sBw7U; mojo-trace-id=16; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1590711098,1590721027; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be=1590721027; _lxsdk_s=1725e419696-e16-d95-d71%7C%7C31'
    }
    resp = requests.get(url, headers=headers)
    return resp.text if resp.status_code == 200 else None
# Parsing helper: pulls the movie ranks out of one page of board HTML.
def parse_one_page(html):
    """Extract and return the movie ranks from a board page.

    The non-greedy pattern captures the text of the <i> node located via the
    ``board-index`` class marker; ``re.S`` makes ``.`` also match newlines
    so the match can span lines. The list is printed for inspection and
    returned so callers actually receive the data (previously the function
    implicitly returned None even though main() assigned its result).
    """
    pattern = re.compile('<dd>.*?board-index.*?>(.*?)</i>', re.S)
    result = re.findall(pattern, html)
    print(result)
    return result
# Entry point: fetch the board page and parse the ranks out of it.
def main():
    """Download the board page and print the extracted ranks."""
    url = 'https://maoyan.com/board/4'
    html = get_one_page(url)
    # get_one_page returns None on a non-200 response; skip parsing then.
    if html is not None:
        parse_one_page(html)


# Guard the call so importing this module does not trigger a network request.
if __name__ == '__main__':
    main()
成功打印出電影排名
Tips:這裏多次訪問後再次提取顯示提取爲空,如下:
這裏由於多次請求跳往了美團驗證中心
可以進行登錄貓眼電影在爬蟲中添加cookies信息進行繞過(再次爬取正常)
3.2.3 正則匹配內容(全部提取)
# _*_ coding:utf-8 _*_
import requests, re
def get_one_page(url):
    """Fetch *url* with browser headers; return the HTML text or None.

    Cookie keeps a Maoyan session alive (avoids the verification redirect);
    the User-Agent avoids the anti-scraping garbled response.
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'Cookie':'__mta=154448135.1590711099221.1590720315342.1590721027543.31; uuid_n_v=v1; uuid=F6762C20A14011EA9642EFCCC3619456B947C019F96D4B5FAAF8B1E90D84C541; _csrf=dfbe15498963df30026b4f6b8dd35d4b6bbaca2a320ce3124cf4697f19a06489; _lx_utm=utm_source%3DBaidu%26um_medium%3Dorganic; _lxsdk_cuid=1725dc51ae0c8-0864648a72fbd-f7d1d38-e1000-1725dc51ae0c8; _lxsdk=F6762C20A14011EA9642EFCCC3619456B947C019F96D4B5FAAF8B1E90D84C541; mojo-uuid=36b40a0e6faced8727ef4645526b4f74; __mta=154448135.1590711099221.1590711291245.1590711298085.4; mojo-session-id={"id":"eeab46203559090c0bea4fe72e744e89","time":1590719256411}; lt=XKHOYXlVQikp_WQcKgzbhGPgqH0AAAAArwoAAEAX05M2TH7VqZxZNJ5ltWa7BFthNtQs6-v0cIyosaqZuMZaAjDpyqUEGSCTKD_-9Q; lt.sig=T35bvq4BmYRGdKFklLmgj7sBw7U; mojo-trace-id=16; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1590711098,1590721027; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1590721027; _lxsdk_s=1725e419696-e16-d95-d71%7C%7C31'
    }
    resp = requests.get(url, headers=headers)
    return resp.text if resp.status_code == 200 else None
def parse_one_page(html):
    """Extract all fields of every movie entry from one board page.

    Each <dd> block yields a 7-tuple: (rank, image URL, title, star line,
    release time, score integer part, score fraction part). Anchoring each
    capture between the <dd>…</dd> pair keeps the non-greedy matches from
    bleeding into the next entry. Returns the list of tuples.
    """
    pattern = re.compile('<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)".*?name.*?a.*?">(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    result = re.findall(pattern, html)
    # Show at most the first two entries as a sample; slicing avoids the
    # IndexError the old result[0]/result[1] raised when the page had been
    # redirected to the verification center and nothing matched.
    print(*result[:2])
    return result
def main():
    """Download the board page and parse every movie entry from it."""
    url = 'https://maoyan.com/board/4'
    html = get_one_page(url)
    # get_one_page returns None on a non-200 response; skip parsing then.
    if html is not None:
        parse_one_page(html)


# Guard the call so importing this module does not trigger a network request.
if __name__ == '__main__':
    main()
(提取完貓眼top10會是如下一個列表形式,這裏只顯示前兩個)
3.2.4 結果顯示優化
def get_one_page(url):
    """Download *url* and hand back the page source, or None on failure.

    Sends a Chrome User-Agent (site returns garbage without one) plus a
    session Cookie (avoids the Meituan verification redirect).
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'Cookie':'__mta=154448135.1590711099221.1590720315342.1590721027543.31; uuid_n_v=v1; uuid=F6762C20A14011EA9642EFCCC3619456B947C019F96D4B5FAAF8B1E90D84C541; _csrf=dfbe15498963df30026b4f68dd35d4b6bbaca2a320ce3124cf4697f19a06489; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=1725dc51ae0c8-0864648a72fbd-f7d1d38-e1000-1725dc51ae0c8; _lxsdk=F6762C20A14011EA9642EFCCC3619456B947C019F96D4B5FAAF8B1E90D84C541; mojo-uuid=36b40a0e6faced8727ef4645526b4f74; __mta=154448135.1590711099221.1590711291245.1590711298085.4; mojo-session-id={"id":"eeab46203559090c0bea4fe72e744e89","time":1590719256411}; lt=XKHOYXlVQikp_WQcKgzbhGPgqH0AAAAArwoAAEAX05M2TH7VqZxZNJ5ltWa7BFthNtQs6-v0cIyosaqZuMZaAjDpyqUEGSCTKD_-9Q; lt.sig=T35bvq4BmYRGdKFklLmgj7sBw7U; mojo-trace-id=16; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1590711098,1590721027; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1590721027; _lxsdk_s=1725e419696-e16-d95-d71%7C%7C31'
    }
    resp = requests.get(url, headers=headers)
    return resp.text if resp.status_code == 200 else None
def parse_one_page(html):
    """Parse one board page into a list of movie dicts.

    Each <dd> block yields (rank, image URL, title, star line, release time,
    score integer part, score fraction part); the two score parts are
    concatenated into a single string. Every dict is printed, and the full
    list is returned so callers actually receive the data (previously the
    function implicitly returned None even though main() assigned its result).
    """
    pattern = re.compile('<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)".*?name.*?a.*?">(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    movies = []
    # Re-shape each captured tuple into a labelled dict.
    for r in re.findall(pattern, html):
        movie = {
            'rank': r[0],
            'image': r[1],
            # Captured fields carry surrounding whitespace/newlines.
            'title': r[2].strip(),
            'actor': r[3].strip(),
            'time': r[4].strip(),
            # The score is split across integer and fraction nodes in the markup.
            'score': r[5].strip() + r[6].strip()
        }
        print(movie)
        movies.append(movie)
    return movies
def main():
    """Download the board page and print each parsed movie dict."""
    url = 'https://maoyan.com/board/4'
    html = get_one_page(url)
    # get_one_page returns None on a non-200 response; skip parsing then.
    if html is not None:
        parse_one_page(html)


# Guard the call so importing this module does not trigger a network request.
if __name__ == '__main__':
    main()
效果如下
3.2.5 內容保存本地
# _*_ coding:utf-8 _*_
import requests, re, json
def get_one_page(url):
    """Return the HTML of *url*, or None if the request is not 200 OK.

    Browser-like headers (User-Agent + session Cookie) keep Maoyan from
    serving garbled content or redirecting to the verification page.
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'Cookie':'__mta=154448135.1590711099221.1590720315342.1590721027543.31; uuid_n_v=v1; uuid=F6762C20A14011EA9642EFCCC3619456B947C019F96D4B5FAAF8B1E90D84C541; _csrf=dfbe15498963df30026b4f6b8dd35d4b6bbaca2a320ce3124cf4697f19a06489; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=1725dc51a0c8-0864648a72fbd-f7d1d38-e1000-1725dc51ae0c8; _lxsdk=F6762C20A14011EA9642EFCCC3619456B947C019F96D4B5FAAF8B1E90D84C541; mojo-uuid=36b40a0e6faced8727ef4645526b4f74; __mta=154448135.1590711099221.1590711291245.1590711298085.4; mojo-session-id={"id":"eeab46203559090c0bea4fe72e744e89","time":1590719256411}; lt=XKHOYXlVQikp_WQcKgzbhGPgqH0AAAAArwoAAEAX05M2TH7VqZxZNJ5ltWa7BFthNtQs6-v0cIyosaqZuMZaAjDpyqUEGSCTKD_-9Q; lt.sig=T35bvq4BmYRGdKFklLmgj7sBw7U; mojo-trace-id=16; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1590711098,1590721027; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1590721027; _lxsdk_s=1725e419696-e16-d95-d71%7C%7C31'
    }
    resp = requests.get(url, headers=headers)
    return resp.text if resp.status_code == 200 else None
def parse_one_page(html):
    """Parse one board page and append each movie to maoyan_top.txt.

    Each <dd> block yields (rank, image URL, title, star line, release time,
    score integer part, score fraction part); the score parts are joined into
    one string. Writes one dict per line to the file and returns the list of
    dicts (previously the function implicitly returned None even though
    main() assigned its result).
    """
    pattern = re.compile('<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)".*?name.*?a.*?">(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    movies = []
    # Open the output file once per page instead of re-opening it for every
    # single movie, as the original loop did.
    with open('maoyan_top.txt', 'a', encoding='utf-8') as f:
        for r in re.findall(pattern, html):
            movie = {
                'rank': r[0],
                'image': r[1],
                # Captured fields carry surrounding whitespace/newlines.
                'title': r[2].strip(),
                'actor': r[3].strip(),
                'time': r[4].strip(),
                'score': r[5].strip() + r[6].strip()
            }
            f.write(str(movie) + '\n')
            # Alternative: serialize the dict (not the raw tuple) as JSON:
            # f.write(json.dumps(movie, ensure_ascii=False) + '\n')
            movies.append(movie)
    return movies
def main():
    """Download the board page and save every parsed movie to disk."""
    url = 'https://maoyan.com/board/4'
    html = get_one_page(url)
    # get_one_page returns None on a non-200 response; skip parsing then.
    if html is not None:
        parse_one_page(html)


# Guard the call so importing this module does not trigger a network request.
if __name__ == '__main__':
    main()
本地保存文件成功
3.2.6 網頁分頁爬取
》》觀察每頁規律
對比第一頁和第二頁,發現offset=10爲第二頁,我們可以通過控制offset參數值來對這10頁進行爬取
》》通過導入offset偏移量即可,核心邏輯爲下:
(採用規範性寫法進入程序主入口)
四、TOP100排名爬取代碼
# _*_ coding:utf-8 _*_
import requests, re, json
# Fetch one page of the Maoyan board and return its source.
def get_one_page(url):
    """Fetch *url*; return the HTML text, or None on a non-200 status.

    The User-Agent dodges the garbled anti-scraping response; the session
    Cookie keeps repeated requests off the Meituan verification page.
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'Cookie':'__mta=154448135.1590711099221.1590720315342.1590721027543.31; uuid_n_v=v1; uuid=F6762C20A14011EA9642EFCCC3619456B947C019F96D4B5FAAF8B1E90D84C541; _csrf=dfbe15498963df30026b4f6b8dd35d4b6bbaca2a320ce3124cf4697f19a06489; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=1725dc51ae0c8-0864648a72fbd-f7d1d38-e1000-1725dc51ae0c8; _lxsdk=F6762C20A14011EA9642EFCCC3619456B947C019F96D4B5FAAF8B1E90D84C541; mojo-uuid=36b40a0e6faced8727ef4645526b4f74; __mta=154448135.1590711099221.1590711291245.1590711298085.4; mojo-session-id={"id":"eeab46203559090c0be4fe72e744e89","time":1590719256411}; lt=XKHOYXlVQikp_WQcKgzbhGPgqH0AAAAArwoAAEAX05M2TH7VqZxZNJ5ltWa7BFthNtQs6-v0cIyosaqZuMZaAjDpyqUEGSCTKD_-9Q; lt.sig=T35bvq4BmYRGdKFklLmgj7sBw7U; mojo-trace-id=16; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1590711098,1590721027; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1590721027; _lxsdk_s=1725e419696-e16-d95-d71%7C%7C31'
    }
    resp = requests.get(url, headers=headers)
    return resp.text if resp.status_code == 200 else None
# Parse one page of board HTML (rank, image URL, title, stars, time, score).
def parse_one_page(html):
    """Extract every movie entry from *html* and append each to maoyan_top.txt.

    Each <dd> block yields (rank, image URL, title, star line, release time,
    score integer part, score fraction part); the two score parts are joined
    into one string. Writes one dict per line and returns the list of dicts
    (previously the function implicitly returned None even though main()
    assigned its result).
    """
    pattern = re.compile('<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)".*?name.*?a.*?">(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    movies = []
    # Open the output file once per page rather than once per movie.
    with open('maoyan_top.txt', 'a', encoding='utf-8') as f:
        for r in re.findall(pattern, html):
            movie = {
                'rank': r[0],
                'image': r[1],
                # Captured fields carry surrounding whitespace/newlines.
                'title': r[2].strip(),
                'actor': r[3].strip(),
                'time': r[4].strip(),
                'score': r[5].strip() + r[6].strip()
            }
            f.write(str(movie) + '\n')
            movies.append(movie)
    return movies
# Build the page URL from the offset passed in by the entry loop.
def main(offset):
    """Crawl one page of the board; *offset* selects the page (0, 10, ..., 90)."""
    url = 'https://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    # get_one_page returns None on a non-200 response; skip parsing then
    # (the old code passed None straight into parse_one_page and crashed).
    if html is not None:
        parse_one_page(html)
# Program entry: crawl all 10 pages, one offset per page (0, 10, ..., 90).
if __name__ == '__main__':
    for offset in range(0, 100, 10):
        main(offset=offset)
(爬取結果如下)