工具
from urllib.parse import urlencode
import requests
網頁
- 進入今日頭條,搜索圖片新聞
- 往下進行網頁刷新的時候,按f12選擇xhr進行篩選可以看到ajax傳值
- 此處用urlencode來進行url編碼
- 其實沒怎麼進行數據處理,只是學習一下爬取的內容
要注意的地方
- 請求頭要全,除了表明是ajax之外其他的也需要,不然爬不到數據
- 最後一個參數timestamp是時間戳,自己隨便弄一個就行
- 爬取的數量通過offset來定義,這個我改成固定的參數了
代碼
from urllib.parse import urlencode
import requests
# Search endpoint; the URL-encoded query string is appended directly after
# the trailing '?'.
base_url = "https://www.toutiao.com/api/search/content/?"

# Request headers sent with every search call.
# NOTE(review): the cookie and csrftoken below are hard-coded values,
# presumably copied from a browser session -- they may expire; confirm
# before relying on them.
headers = {
    "accept": "application/json, text/javascript",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9",
    "content-type": "application/x-www-form-urlencoded",
    "cookie": "tt_webid=6786617025860912652; utm_source=huawei_llq_api; tt_webid=6786617025860912652; s_v_web_id=k6277fq4_t22lPv72_Hoox_47zF_81Qg_iArio6REnBId; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __tasessionId=6t99ps1z21580477014052; csrftoken=c076a79c3ed0c5219a9a6bd871311f74",
    "referer": "https://www.toutiao.com/search/?keyword=%E5%9B%BE%E7%89%87%E6%96%B0%E9%97%BB",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
    # Marks the request as an XMLHttpRequest (Ajax) call.
    "x-requested-with": "XMLHttpRequest",
}
import time
import datetime
def getpage(page, *, offset=20):
    """Build the search-API URL for one batch of results.

    Args:
        page: Accepted for backward compatibility only; it does not
            influence the generated URL.
        offset: Value sent as the ``offset`` query parameter. Defaults to
            20, the value that used to be hard-coded, so existing calls
            produce the same query as before.

    Returns:
        ``base_url`` followed by the URL-encoded query string.
    """
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': '圖片新聞',
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
        # Current time in milliseconds, so every call yields a fresh value.
        'timestamp': int(round(time.time() * 1000)),
    }
    # urlencode percent-encodes the non-ASCII keyword for us.
    return base_url + urlencode(params)
# Fetch one batch of search results and save the first item's abstract
# to test.txt.
try:
    # Build the URL once: every getpage() call embeds a fresh timestamp, so
    # calling it twice would print a different URL from the one requested.
    url = getpage(40)
    print(url)
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url, headers=headers, timeout=10)
    payload = response.json()
    items = payload.get('data')
    if not items:
        # Check before opening the file so an empty result does not leave a
        # truncated test.txt behind.
        print('no data returned by the search API')
    else:
        abstract = str(items[0].get('abstract'))
        with open('test.txt', 'w', encoding='utf-8') as f:
            f.write(abstract)
except Exception as e:
    # Best-effort script: report any failure instead of crashing, but let
    # KeyboardInterrupt / SystemExit propagate (BaseException swallowed them).
    print(e)