requests
文檔:http://cn.python-requests.org/zh_CN/latest/
安裝:pip --timeout=100 install requests
百度搜索
- 一個簡單的小例子 - 基於 requests 模塊的 get 請求 - 爬取百度搜索首頁
import requests
if __name__ == "__main__":
    # Fetch the Baidu homepage over HTTPS and save the HTML to a local file.
    target = "https://www.baidu.com"
    resp = requests.get(target)
    # Force UTF-8 so the Chinese page content decodes correctly.
    resp.encoding = 'utf-8'
    print("狀態碼:" + str(resp.status_code))
    html = resp.text
    print("頁面內容:" + html)
    with open('./baidu.html', 'w', encoding='utf-8') as out:
        out.write(html)
    print('爬取數據結束!')
搜狗搜索
- 基於 requests 模塊的 get 請求 - 爬取搜狗指定詞條對應的搜索結果頁面
import requests
if __name__ == '__main__':
    # Scrape the Sogou search-result page for a user-supplied keyword and
    # save it as <keyword>.html.
    ua_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    base_url = 'https://www.sogou.com/web'
    keyword = input('輸入查詢關鍵字:')
    # Pass the query string via the explicit `params` keyword (equivalent to
    # the positional form: requests.get(url, params, ...)).
    resp = requests.get(base_url, params={'query': keyword}, headers=ua_headers)
    out_name = keyword + '.html'
    with open(out_name, 'w', encoding='utf-8') as out:
        out.write(resp.text)
    print('數據爬取結束!')
百度翻譯
- 基於 requests 模塊的 post 請求 - 破解百度翻譯
import requests
import json
if __name__ == '__main__':
    # POST a keyword to Baidu Translate's suggestion (ajax) endpoint and
    # save the JSON response to <keyword>.json.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    post_url = 'https://fanyi.baidu.com/sug'
    word = input('輸入查詢關鍵字:')
    data = {
        'kw': word
    }
    # Explicit data= keyword; same behavior as the positional form.
    response = requests.post(post_url, data=data, headers=headers)
    dic_obj = response.json()
    print(dic_obj)
    fileName = word + '.json'
    # Fix: the original opened the file without ever closing it (leaked
    # handle, possibly unflushed data). A context manager guarantees the
    # file is closed even if json.dump raises.
    with open(fileName, 'w', encoding='utf-8') as fp:
        json.dump(dic_obj, fp, ensure_ascii=False)
    print('數據爬取結束!')
豆瓣喜劇電影排行榜
- 基於 requests 模塊 ajax 的 get 請求 - 爬取鏈接:https://movie.douban.com/ - 爬取豆瓣電影分類排行榜 - 喜劇片
import requests
import json
if __name__ == '__main__':
    # Query Douban's ajax chart endpoint for the comedy ranking (type 24)
    # and dump the JSON payload to a local file.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    param = {
        "type": "24",            # category id: comedy
        "interval_id": "100:90", # rating percentile band
        "action": "",
        "start": "0",            # offset of the first movie returned
        "limit": "20",           # number of movies per request
    }
    url = 'https://movie.douban.com/j/chart/top_list'
    # Explicit params= keyword; same behavior as the positional form.
    response = requests.get(url, params=param, headers=headers)
    dic_obj = response.json()
    print(dic_obj)
    fileName = '豆瓣電影排行榜.json'
    # Fix: the original never closed the file handle. Use a context manager
    # so the file is flushed and closed even if json.dump raises.
    with open(fileName, 'w', encoding='utf-8') as fp:
        json.dump(dic_obj, fp, ensure_ascii=False)
    print('數據爬取結束!')
企業信息爬取
- 爬取鏈接:
http://125.35.6.84:81/xk/
- 爬取企業化妝品生產許可證信息
import requests
import json
if __name__ == '__main__':
    # Scrape cosmetics production-licence records: first collect enterprise
    # IDs from the paginated list endpoint, then fetch each detail record
    # and dump everything into one JSON file.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/78.0.3904.108 Safari/537.36 '
    }
    url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
    # Collected enterprise IDs and their detail records.
    id_list = []
    detail_list = []
    # Fetch the first two pages of enterprise IDs (15 per page -> 30 IDs).
    for page in range(1, 3):
        param = {
            "on": "true",
            "page": str(page),  # fix: convert inline instead of rebinding the loop variable
            "pageSize": "15",
            "productName": "",
            "conditionType": "1",
            "applyname": "",
            "applysn": "",
        }
        # Explicit data= keyword; same behavior as the positional form.
        response = requests.post(url, data=param, headers=headers)
        json_ids = response.json()
        for dic in json_ids['list']:
            id_list.append(dic['ID'])
    post_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
    # Fix: renamed loop variable so it no longer shadows the builtin `id`.
    for enterprise_id in id_list:
        data = {
            'id': enterprise_id
        }
        res = requests.post(post_url, data=data, headers=headers)
        detail_list.append(res.json())
    fileName = '企業信息.json'
    # Fix: the original never closed the file handle. Use a context manager
    # so the file is flushed and closed even if json.dump raises.
    with open(fileName, 'w', encoding='utf-8') as fp:
        json.dump(detail_list, fp, ensure_ascii=False)
    print('數據爬取結束!')