嗯,今天就讓我們來一起爬爬今日頭條的熱點新聞吧!
今日頭條地址:https://www.toutiao.com/ch/news_hot/
在瀏覽器中打開今日頭條的鏈接,選中左側的熱點,在瀏覽器開發者模式的 Network 面板下很快能找到一個帶有‘?category=news_hot…’字樣的請求,點擊進去就能看到請求地址了。如下圖:
該請求地址的數據全部存放在data 字段中,並且數據類型爲json。如下圖:
請求的鏈接地址爲:
https://www.toutiao.com/api/pc/feed/?category=news_hot&utm_source=toutiao&widen=1&max_behot_time=1577347347&max_behot_time_tmp=1577347347&tadrequire=true&as=A1450EF0A468003&cp=5E04F850E003EE1&_signature=VYMs9gAgEBe5v1fEUcnQ31WDLeAAAuI
其中有9個參數,對比如下表:
其中max_behot_time在獲取的json數據中獲得,具體數據見如下截圖:
請求地址中,有兩個參數as和cp,都是經過js加密處理的。不過也能找到相對應的加密算法:
加密算法:
// Excerpt of the page script that derives the "as" and "cp" request parameters.
var e = {};
// getHoney(): returns an object {as, cp} built from the current time and md5() of it.
e.getHoney = function() {
// t = current time in whole seconds; e (local, shadows the outer e) = t as upper-case hex;
// n = upper-cased string form of md5(t).
var t = Math.floor((new Date).getTime() / 1e3),
e = t.toString(16).toUpperCase(),
n = md5(t).toString().toUpperCase();
// If the hex timestamp is not exactly 8 characters long, return fixed fallback values.
if (8 != e.length) return {
as: "479BB4B7254C150",
cp: "7E0AC8874BB0985"
};
// o = first 5 chars of n, i = last 5 chars of n; a interleaves o[r] with e[r] for r = 0..4.
for (var o = n.slice(0, 5), i = n.slice(-5), a = "", r = 0; 5 > r; r++) a += o[r] + e[r];
// l interleaves e[s + 3] with i[s] for s = 0..4.
for (var l = "", s = 0; 5 > s; s++) l += e[s + 3] + i[s];
return {
as: "A1" + a + e.slice(-3),
cp: e.slice(0, 3) + l + "E1"
}
}, t.ascp = e
}(window, document), function() {
// Compute the pair once, store both values as cookies for domain "i.snssdk.com",
// and keep the result on window._honey.
var t = ascp.getHoney(),
e = {
path: "/",
domain: "i.snssdk.com"
};
$.cookie("cp", t.cp, e), $.cookie("as", t.as, e), window._honey = t
}(), Flow.prototype = {
// init: when this.url is set, show the loading / has-more tip and register the
// "scrollBottom" and click handlers that call loadmore().
init: function() {
var t = this;
this.url && (t.showState(t.auto_load ? NETWORKTIPS.LOADING : NETWORKTIPS.HASMORE), this.container.on("scrollBottom", function() {
t.auto_load && (t.lock || t.has_more && t.loadmore())
}), this.list_bottom.on("click", "a", function() {
return t.lock = !1, t.loadmore(), !1
}))
},
// loadmore: forwards this.url, this.type, this.param and the argument t to this.getData.
loadmore: function(t) {
this.getData(this.url, this.type, this.param, t)
},
python獲取as和cp值的代碼如下:
參考博客:https://www.cnblogs.com/xuchunlin/p/7097391.html
import time
import hashlib
def get_as_cp_args(now=None):
    """Build the ``as`` and ``cp`` query parameters for the feed request.

    Python port of the site's JavaScript ``getHoney`` routine: the current
    time in seconds is rendered as upper-case hex and interleaved with the
    first and last five characters of the upper-case MD5 hex digest of the
    decimal timestamp.

    Args:
        now: Unix timestamp in seconds. Defaults to the current time. Any
            fractional part is truncated to whole seconds.

    Returns:
        A dict with the keys ``'as'`` and ``'cp'``. When the hex timestamp
        is not exactly 8 characters long, the fixed fallback pair used by
        the JavaScript routine is returned.
    """
    # Truncate instead of rounding: the JavaScript reference uses Math.floor,
    # and rounding could yield a timestamp one second in the future.
    timestamp = int(time.time() if now is None else now)
    print(timestamp)
    hex_time = format(timestamp, "X")  # upper-case hex without the "0x" prefix
    print(hex_time)
    if len(hex_time) != 8:
        return {'as': "479BB4B7254C150",
                'cp': "7E0AC8874BB0985"}
    digest = hashlib.md5(str(timestamp).encode("utf8")).hexdigest().upper()
    head, tail = digest[:5], digest[-5:]
    # "as": digest head interleaved with the first five hex characters.
    as_middle = "".join(d + h for d, h in zip(head, hex_time))
    # "cp": hex characters 3..7 interleaved with the digest tail.
    cp_middle = "".join(h + d for h, d in zip(hex_time[3:], tail))
    result = {
        'as': "A1" + as_middle + hex_time[-3:],
        'cp': hex_time[:3] + cp_middle + "E1",
    }
    print(result)
    return result
這樣完整的鏈接就構成了,另外提一點就是:_signature參數去掉也是可以獲取到json數據的,因此這樣請求的鏈接就完成了。
全部代碼如下:
import requests
import json
import time
import hashlib
import xlwt
# 獲取as和cp參數的函數
def get_as_cp_args(now=None):
    """Build the ``as`` and ``cp`` query parameters for the feed request.

    Python port of the site's JavaScript ``getHoney`` routine: the current
    time in seconds is rendered as upper-case hex and interleaved with the
    first and last five characters of the upper-case MD5 hex digest of the
    decimal timestamp.

    Args:
        now: Unix timestamp in seconds. Defaults to the current time. Any
            fractional part is truncated to whole seconds.

    Returns:
        A dict with the keys ``'as'`` and ``'cp'``. When the hex timestamp
        is not exactly 8 characters long, the fixed fallback pair used by
        the JavaScript routine is returned.
    """
    # Truncate instead of rounding: the JavaScript reference uses Math.floor,
    # and rounding could yield a timestamp one second in the future.
    timestamp = int(time.time() if now is None else now)
    print(timestamp)
    hex_time = format(timestamp, "X")  # upper-case hex without the "0x" prefix
    print(hex_time)
    if len(hex_time) != 8:
        return {'as': "479BB4B7254C150",
                'cp': "7E0AC8874BB0985"}
    digest = hashlib.md5(str(timestamp).encode("utf8")).hexdigest().upper()
    head, tail = digest[:5], digest[-5:]
    # "as": digest head interleaved with the first five hex characters.
    as_middle = "".join(d + h for d, h in zip(head, hex_time))
    # "cp": hex characters 3..7 interleaved with the digest tail.
    cp_middle = "".join(h + d for h, d in zip(hex_time[3:], tail))
    result = {
        'as': "A1" + as_middle + hex_time[-3:],
        'cp': hex_time[:3] + cp_middle + "E1",
    }
    print(result)
    return result
#獲取解析json後的數據
def get_html_data(target_url, timeout=10):
    """Request *target_url* and return the response body parsed as JSON.

    Args:
        target_url: Fully assembled feed URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON document.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status.
        json.JSONDecodeError: If the body is not valid JSON.
    """
    # NOTE: replace these headers (the cookie in particular) with the ones
    # from your own browser session; copied verbatim the request will fail.
    headers = {"referer": "https://www.toutiao.com/",
               "accept": "text/javascript, text/html, application/xml, text/xml, */*",
               "content-type": "application/x-www-form-urlencoded",
               "cookie": "tt_webid=6774555886024279565; s_v_web_id=76cec5f9a5c4ee50215b678a6f53dea5; WEATHER24279565; csrftoken=bb8c835711d848db5dc5445604d0a9e9; __tasessionId=gphokc0el1577327623076",
               "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"}
    response = requests.get(target_url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    return json.loads(response.text)
# 解析數據,提取相關的字段
def get_parse_data(max_behot_time, base_url, start_url, refresh_count=3):
    """Fetch several feed pages and collect one row per news item.

    Args:
        max_behot_time: Value of the ``max_behot_time`` link parameter for
            the first request; later requests use the value returned under
            ``next`` by the previous response.
        base_url: Site root prepended to relative links.
        start_url: Request URL up to and including ``max_behot_time=``.
        refresh_count: Number of feed refreshes (requests) to perform. A
            refresh may return fewer items than usual, so the total number
            of rows is not necessarily a multiple of the page size.

    Returns:
        A list of rows ``[title, news link, source, source link]``.
    """
    max_behot_time = str(max_behot_time)
    # Accumulates the rows of every fetched page.
    excel_data = []
    for _ in range(refresh_count):
        # Fresh as/cp values are computed for every request.
        as_cp_args = get_as_cp_args()
        target_url = (start_url + max_behot_time
                      + '&max_behot_time_tmp=' + max_behot_time
                      + '&tadrequire=true&as=' + as_cp_args['as']
                      + '&cp=' + as_cp_args['cp'])
        res_data = get_html_data(target_url)
        time.sleep(1)
        for item in res_data['data']:
            source_url = item['source_url']
            # Only relative links need the site root; test the scheme prefix
            # rather than searching for "https" anywhere in the string.
            if not source_url.startswith(('http://', 'https://')):
                source_url = base_url + source_url
            row = [
                item['title'],                   # news title
                source_url,                      # news link
                item['source'],                  # publishing account
                base_url + item['media_url'],    # publishing account link
            ]
            excel_data.append(row)
            print(row)
        # Cursor for the next request; stop paging when the response has none
        # so the rows collected so far are still returned.
        next_info = res_data.get('next') or {}
        if 'max_behot_time' not in next_info:
            break
        max_behot_time = str(next_info['max_behot_time'])
    return excel_data
# 數據保存到Excel 表格中
def save_data(excel_data):
    """Write the collected rows, preceded by a header row, to an .xls file.

    The caller's list is left untouched: the header is written separately
    instead of being inserted into ``excel_data``, so calling this function
    again does not produce a duplicate header row.

    Args:
        excel_data: Iterable of rows; each row is a sequence of cell values.
    """
    header = ["新聞標題", "新聞鏈接", "頭條號", "頭條號鏈接"]
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    worksheet = workbook.add_sheet("sheet1", cell_overwrite_ok=True)
    for row_idx, row in enumerate([header, *excel_data]):
        for col_idx, value in enumerate(row):
            worksheet.write(row_idx, col_idx, value)
    workbook.save(r"今日頭條熱點新聞.xls")
    print("今日頭條新聞保存完畢!!")
if __name__ == '__main__':
    # Site root, passed on as the base address for links.
    site_root = 'https://www.toutiao.com'
    # First part of the request URL; the max_behot_time value is appended to it.
    feed_prefix = 'https://www.toutiao.com/api/pc/feed/?category=news_hot&utm_source=toutiao&widen=1&max_behot_time='
    # '0' is the initial value of the max_behot_time link parameter.
    rows = get_parse_data(max_behot_time='0', base_url=site_root, start_url=feed_prefix)
    save_data(excel_data=rows)
程序運行結束後Excel表格截圖: