第一步,我們需要爬取豆瓣信息,
第二步,我們需要保存爬取下來的信息,
接下來我們導入模塊,
HTTP請求的 requests 模塊和Excel文件處理的 xlwt 模塊,
還有一個時間模塊 time,我們爬取的速度不能太快,否則會被網站識別到。
import requests
import xlwt
import time
接下來創建Excel表格並命名:
# Create the workbook and add one worksheet named '豆瓣'.
excel1 = xlwt.Workbook()
sheet1 = excel1.add_sheet('豆瓣', cell_overwrite_ok=True)

# Header row. write() takes (row, column, value): row 0 is the first row and
# the column index runs over the first, second, third, ... column.
for column, title in enumerate(('電影名稱', '地區/國家', '年份', '評分', '評價人數')):
    sheet1.write(0, column, title)
現在開始獲取數據,
分析豆瓣電影的頁面,將我們得到的有用信息寫入代碼,
# Paging parameters for one request. The starting offset is taken from the
# loop variable `pn` (instead of a fixed 0) so the same code can fetch every
# page. Browsing the other pages shows the offset is not 1, 2, 3, ... but
# advances by 20 per page: 0, 20, 40, ...
data = {
    'start': pn,
    'limit': '20'
}
# Browser-like headers copied from a real session.
# NOTE(review): the Cookie below is a captured browser session and appears to
# include login information; replace it with your own and avoid publishing it.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5702.400 QQBrowser/10.2.1893.400',
    'Referer': 'https://movie.douban.com/typerank?type_name=%E5%89%A7%E6%83%85&type=11&interval_id=100:90&action=',
    'Cookie':'bid=6nS16u78ysk; douban-fav-remind=1; __utmz=30149280.1536302200.2.2.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; ll="118371"; _vwo_uuid_v2=D1979D38657B6548A1BB551C1064F3530|bf98aa297a7ebd3c5226eab14e60b262; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.1052346626.1533889879.1536302200.1538099719.3; __utmc=30149280; __utma=223695111.1669584758.1538099719.1538099719.1538099719.1; __utmb=223695111.0.10.1538099719; __utmc=223695111; __utmz=223695111.1538099719.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); ps=y; dbcl2="185126615:6R0mlwkYAeI"; ck=ga0K; push_noty_num=0; push_doumail_num=0; __yadk_uid=KHFgBWW1h7s9vtx9sYMoQeaaZ8e8CNU1; __utmt=1; __utmv=30149280.18512; __utmb=30149280.8.10.1538099719; _pk_id.100001.4cf6=5aa5204a625172c4.1538099719.1.1538100311.1538099719.'
}
# The fixed part of the query stays in the URL; `start` and `limit` are
# appended by requests from `data`. They are passed as `params=` because
# `data=` would send them as a request body, which a GET endpoint does not read.
Requesturl = 'https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action='
res = requests.get(Requesturl, params=data, headers=headers, timeout=10)
# Fail with a clear HTTP error instead of a confusing JSON decoding error.
res.raise_for_status()
result = res.json()
這樣我們就得到了一頁電影的所有信息。
我們要對信息進行提取,只保留我們需要的,提取到之後,將信息寫入Excel裏面。
每頁有20條電影信息,我們用 for 循環逐條得到每個電影的信息。
# Copy every movie of the current page into the worksheet, one row per movie.
# `result` is the decoded JSON of the page and `n` is the next free worksheet
# row (row 0 holds the header). Iterating over `result` itself handles pages
# with fewer than 20 entries without relying on a swallowed IndexError.
for movie in (result if isinstance(result, list) else []):
    try:
        row = (
            movie['title'],
            movie['regions'][0],
            movie['release_date'],
            movie['score'],
            movie['vote_count'],
        )
    except (KeyError, IndexError, TypeError):
        # Skip only records that lack one of the expected fields; any other
        # error is no longer silently hidden.
        continue
    print(row[0])
    for column, value in enumerate(row):
        sheet1.write(n, column, value)
    n += 1
# Save once per page instead of once per row. The per-row one-second pause was
# dropped as well: no request is made per row, so it only slowed down writing.
excel1.save('豆瓣高分電影2.xls')
最後我們需要做的就是,遍歷他每一頁的數據,然而我們不知道他有多少頁,
我這邊寫了1000頁(range(0,20000,20),每頁20條),估計沒有這麼多,不管了。
# Walk through the pages: `pn` is the starting offset of each page and grows
# by 20 per page, so range(0, 20000, 20) covers 1000 pages.
for pn in range(0, 20000, 20):
    # Put the request and row-writing code from the previous steps here.
    # Pause one second per page so the requests are not sent too quickly.
    time.sleep(1)
    # Report which page has just been saved. Floor division keeps the page
    # number an integer ("0", "1", ...) rather than "0.0", "1.0" on Python 3.
    print('--------------這是第%s頁------------------' % (pn // 20))
全部代碼如下,
import requests
import xlwt
import time
# Ranking endpoint with the fixed part of the query; `start` and `limit` are
# added per request.
REQUEST_URL = 'https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action='
# Number of movies per page; the `start` offset advances by this amount.
PAGE_SIZE = 20
# Upper bound for the `start` offset (1000 pages); scraping stops earlier as
# soon as a page comes back empty.
MAX_OFFSET = 20000
# Pause between two page requests so the site is not hit too quickly.
REQUEST_DELAY_SECONDS = 1
OUTPUT_FILE = '豆瓣高分電影.xls'
COLUMN_TITLES = ('電影名稱', '地區/國家', '年份', '評分', '評價人數')
# Browser-like headers copied from a real session.
# NOTE(review): the Cookie below is a captured browser session and appears to
# include login information; replace it with your own and avoid publishing it.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5702.400 QQBrowser/10.2.1893.400',
    'Referer': 'https://movie.douban.com/typerank?type_name=%E5%89%A7%E6%83%85&type=11&interval_id=100:90&action=',
    'Cookie':'bid=6nS16u78ysk; douban-fav-remind=1; __utmz=30149280.1536302200.2.2.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; ll="118371"; _vwo_uuid_v2=D1979D38657B6548A1BB551C1064F3530|bf98aa297a7ebd3c5226eab14e60b262; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.1052346626.1533889879.1536302200.1538099719.3; __utmc=30149280; __utma=223695111.1669584758.1538099719.1538099719.1538099719.1; __utmb=223695111.0.10.1538099719; __utmc=223695111; __utmz=223695111.1538099719.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); ps=y; dbcl2="185126615:6R0mlwkYAeI"; ck=ga0K; push_noty_num=0; push_doumail_num=0; __yadk_uid=KHFgBWW1h7s9vtx9sYMoQeaaZ8e8CNU1; __utmt=1; __utmv=30149280.18512; __utmb=30149280.8.10.1538099719; _pk_id.100001.4cf6=5aa5204a625172c4.1538099719.1.1538100311.1538099719.'
}


def movie_row(movie):
    """Return the worksheet row for one movie record.

    The row is the tuple (title, first region, release date, score, vote
    count). Returns None when the record lacks one of these fields, so the
    caller can skip it.
    """
    try:
        return (
            movie['title'],
            movie['regions'][0],
            movie['release_date'],
            movie['score'],
            movie['vote_count'],
        )
    except (KeyError, IndexError, TypeError):
        return None


def fetch_page(pn):
    """Return the decoded JSON response for the page starting at offset pn.

    Raises requests.HTTPError when the server answers with an error status.
    """
    data = {
        'start': pn,
        'limit': str(PAGE_SIZE),
    }
    # `params=` puts the values into the query string; `data=` would send them
    # as a request body. The undefined `proxies` argument was removed because
    # it raised NameError on the first request.
    res = requests.get(REQUEST_URL, params=data, headers=HEADERS, timeout=10)
    res.raise_for_status()
    return res.json()


def main():
    """Scrape the ranking page by page and store the rows in OUTPUT_FILE."""
    excel1 = xlwt.Workbook()
    sheet1 = excel1.add_sheet('豆瓣', cell_overwrite_ok=True)
    for column, title in enumerate(COLUMN_TITLES):
        sheet1.write(0, column, title)

    n = 1  # next free worksheet row; row 0 holds the header
    for pn in range(0, MAX_OFFSET, PAGE_SIZE):
        result = fetch_page(pn)
        if not result:
            # An empty page means the end of the ranking was reached.
            break
        for movie in result:
            row = movie_row(movie)
            if row is None:
                continue
            print(row[0])
            for column, value in enumerate(row):
                sheet1.write(n, column, value)
            n += 1
        # Save after every page so progress survives a later failure.
        excel1.save(OUTPUT_FILE)
        # Floor division keeps the page number an integer on Python 3.
        print('--------------這是第%s頁------------------' % (pn // PAGE_SIZE))
        time.sleep(REQUEST_DELAY_SECONDS)


if __name__ == '__main__':
    main()