爬取前三頁的數據
第一次寫的代碼:
# Mini project: crawl Sogou's Zhihu search results for a keyword over a page range.
# Comparing the URL of page 1 with page 2 shows that only the `page` query
# parameter changes, so the base URL is fixed and the variables are packed
# into the request's query parameters.
import requests

url = 'http://zhihu.sogou.com/zhihu?'
headers = {
    # Browser User-Agent so the site does not reject the scripted request.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}

# Crawl the first three pages; range() excludes its end, so 1..4 covers pages 1-3
# (the original range(1, 3) stopped after page 2, contradicting the stated goal).
for page in range(1, 4):
    params = {
        'query': '人工智能',
        'page': page,
        'ie': 'utf-8'
    }
    response = requests.get(url=url, params=params, headers=headers)
    page_data = response.text
    # Persist each page to its own HTML file in the working directory, e.g. "1.html".
    with open(str(page) + '.html', 'w', encoding='utf-8') as f:
        f.write(page_data)
加上 os 模塊的文件夾操作之後,得到最終版本:
import requests
import os

# Final version: same crawler, extended with os-module handling so results are
# stored in a dedicated folder, and with a user-supplied keyword and page range.

# Create the output folder once, if it does not already exist.
if not os.path.exists('./pages'):
    os.mkdir('./pages')

# Search keyword, read from the user.
word = input('enter a word')

# Page range is chosen at run time; the end page is inclusive.
start_page = int(input('enter a start pageNum:'))
end_page = int(input('enter a end pageNum'))

# 1. Fixed, reusable base URL — the varying parts travel as query parameters.
url = 'http://zhihu.sogou.com/zhihu'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

# Fetch every page in [start_page, end_page].
for page in range(start_page, end_page + 1):
    params = {'query': word, 'page': page, 'ie': 'utf-8'}
    response = requests.get(url=url, params=params, headers=headers)
    # Markup of the page for this page number.
    page_text = response.text
    # Persist: one file per page, named "<word><page>.html", inside ./pages.
    file_name = word + str(page) + '.html'
    file_path = os.path.join('pages', file_name)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(page_text)
    print(f"第{page}頁數據寫入成功")