# 1.打印程序執行時間
# 2.隨機的User-Agent(確保每次發請求使用隨機)
# 3.數據爬下來後做處理(字符串),定義成字典
# 4.一條龍:獲取 --> 調用解析 --> 數據處理
# 請求模塊
from urllib import request
import re
import time
import random
import csv
class Maoyan_spider(object):
    """Scrape the Maoyan Top-100 movie board and append records to maoyan.csv.

    Flow: main() -> get_page() (fetch) -> parse_page() (regex extract)
    -> write_page() (clean + append to CSV).
    """
    def __init__(self):
        # Board URL template; `offset` paginates in steps of 10.
        self.url = "https://maoyan.com/board/4?offset={}"
        # Pool of User-Agents; one is picked at random for every request.
        self.ua_list = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0'
        ]
        # 1-based counter of the page currently being crawled (progress output).
        self.page = 1
        # Compile the extraction pattern once instead of once per page.
        # Groups: (title, star list, release time). re.S lets .*? span newlines.
        self.pattern = re.compile(
            '<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>',
            re.S
        )

    def get_page(self, url):
        """Fetch one board page with a random User-Agent, parse it, return the HTML."""
        headers = {"User-Agent": random.choice(self.ua_list)}
        req = request.Request(url=url, headers=headers)
        # Context manager closes the connection even on error
        # (the original leaked the response object).
        with request.urlopen(req) as res:
            html = res.read().decode('utf-8')
        self.parse_page(html)
        return html

    def parse_page(self, html):
        """Extract (title, stars, release time) tuples from the HTML and save them."""
        # r_list: [('霸王別姬', '張國榮', '1993'), (), ()]
        r_list = self.pattern.findall(html)
        self.write_page(r_list)

    def write_page(self, r_list):
        """Strip whitespace from each record and append all rows to maoyan.csv.

        Uses writerows() with a pre-built list so the file is written in one call.
        """
        film_list = []
        # Explicit utf-8: without it the platform default encoding is used,
        # which can garble or reject Chinese titles (e.g. on Windows/GBK).
        with open('maoyan.csv', 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            for rt in r_list:
                # [5:15] drops a 5-char prefix (presumably "上映時間:") and
                # keeps the 10-char date — TODO confirm against live markup.
                t = (rt[0].strip(), rt[1].strip(), rt[2].strip()[5:15])
                film_list.append(t)
            # Outside the for loop: one writerows() call for the whole batch.
            writer.writerows(film_list)

    def main(self):
        """Crawl offsets 0, 10, 20, 30 with a random 1-3 s delay between pages."""
        for offset in range(0, 31, 10):
            url = self.url.format(offset)
            self.get_page(url)
            # Polite pause so requests are not fired back-to-back.
            time.sleep(random.randint(1, 3))
            print('第%d頁爬取完成' % self.page)
            self.page += 1
if __name__ == "__main__":
    # Time the full crawl and report elapsed seconds.
    begin = time.time()
    Maoyan_spider().main()
    elapsed = time.time() - begin
    print('執行時間爲:%.2f' % elapsed)