# -*- coding: utf-8 -*-
"""Scrape Maoyan's "Most Anticipated" board (http://maoyan.com/board/6).

Pipeline:
  1. fetch each paginated board page (offset 0, 10, 20, 30)
  2. regex-extract: movie name, lead actors, release time,
     this month's wish count, total wish count
  3. print each record as a dict (optionally persist as JSON lines)
"""
import re
import json
from multiprocessing import Pool  # process pool (not threads) for parallel fetches

# Maoyan serves a verification page to clients without a browser-like
# User-Agent, so send one explicitly.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
}

# Compiled once at module level instead of on every analysis() call.
# re.S lets '.' match newlines so the pattern can span tag boundaries.
PATTERN = re.compile(
    r'.*?<p class="name".*?data-val.*?>(.*?)</a></p>.*?class="star">(.*?)'
    r'</p>.*?class="releasetime">(.*?)</p>.*?class="month-wish">(.*?)<span>'
    r'.*?class="stonefont">.*?</span>.*?class="total-wish">(.*?)<span>'
    r'.*?class="stonefont">.*?</span>',
    re.S)


def get_page(url):
    """Fetch *url* and return the response body, or None on any failure.

    Bug fix: the original returned the RequestException object on error,
    which then crashed re.findall() downstream; callers expect str-or-None.
    """
    # Imported lazily so the pure parsing helpers above/below remain
    # usable without the third-party `requests` dependency installed.
    import requests
    from requests.exceptions import RequestException
    try:
        # timeout added so a stalled connection cannot hang a worker forever
        response = requests.get(url, headers=HEADERS, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def analysis(html):
    """Yield one dict per movie parsed out of a board page's HTML."""
    for item in PATTERN.findall(html):
        yield {
            'name': item[0],
            'actor': item[1].strip()[3:],  # drop the '主演:' prefix (3 chars)
            'time': item[2].strip()[5:],   # drop the '上映时间:' prefix (5 chars)
            'month_wish': item[3],
            'total_wish': item[4],
        }


def write_to_file(content):
    """Append *content* as one JSON line to result.txt (UTF-8, CJK kept).

    The redundant f.close() inside the original with-block was removed;
    the context manager already closes the file.
    """
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    """Fetch, parse and print the board page at pagination *offset*."""
    url = 'http://maoyan.com/board/6?offset=' + str(offset)
    html = get_page(url)
    if html is None:  # guard: get_page may fail (network error / non-200)
        return
    for item in analysis(html):
        print(item)
        # write_to_file(item)


if __name__ == '__main__':
    # Four pages of ten movies each, fetched in parallel worker processes.
    pool = Pool()
    pool.map(main, [i * 10 for i in range(0, 4)])
Python爬蟲:requests+正則爬取貓眼最受期待榜單
發表評論
所有評論
還沒有人評論，想成為第一個評論的人嗎？請在上方評論欄輸入並點擊發布。