Zhihu Homepage Crawler

I tried my hand at a crawler for the Zhihu homepage:

import re

import requests


首頁鏈接 = []  # collected answer links from the homepage feed (the name means "homepage links")


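# Browser-like headers; the cookie is mandatory, or Zhihu returns no feed data (see point 1 below).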
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
    'cookie': '_zap=d87a4437-7623-4ed0-9aea-8e3db9f0a2a7; _xsrf=WE09KPNwIEZZSwM7t95gSxiU0bH5VZfe; d_c0="AJBsNreRQBCPTnUIjLaLd2xSWLQtcZGToV8=|1571991072"; z_c0=Mi4xYjg4QUJBQUFBQUFBa0d3MnQ1RkFFQmNBQUFCaEFsVk5LUHlmWGdCNnpNN3Zta1NNeTJsdldITUgyU05ySERaNnJ3|1571991080|b93528ba749b936e200ca7a5d85c9653f2e0f932; tst=r; __utmv=51854390.100--|2=registration_date=20170128=1^3=entry_date=20170128=1; __utma=51854390.216570059.1573461536.1573461536.1573541524.2; __utmz=51854390.1573541524.2.2.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; q_c1=75d1f2e82fb3418ca72dd2006a184a02|1576219305000|1571997768000; tgw_l7_route=18884ea8e9aef06cacc0556da5cb4bf1; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1576739253,1576740717,1576744885,1576808320; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1576808320',
}

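# The first five or six feed items are embedded directly in the homepage HTML (point 2 below).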
resp = requests.get('https://www.zhihu.com/', headers=headers)
# print(resp.text)

# Absolute question URLs from content="..." attributes, plus the matching
# relative /question/<id>/answer/<id> paths from hrefs.
urls = re.findall(r'content="(https://www.zhihu.com/question/\d+?)"', resp.text, re.S)
urls2 = re.findall(r'href="(/question/\d+/answer/\d+)"', resp.text, re.S)
x = len(urls)
for i in range(x):
    # Drop the /question/<id> path, leaving the bare https://www.zhihu.com origin.
    urls[i] = re.sub(r'/question/\d+', '', urls[i])
    url = urls[i] + urls2[i]
    # print(url)
    首頁鏈接.append(url)

# Experiment: pulling an answer's excerpt straight from its own page.
# resp = requests.get('https://www.zhihu.com/question/355969351/answer/911683923', headers=headers)
# # print(resp.text)
# content = re.findall('"excerpt":"(.*?)"', resp.text, re.S)
# print(content)
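
# The rest of the feed is loaded via Ajax from the recommend endpoint, six items per call.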
for i in range(10):
    # Six items per page, so step after_id by six each round.
    after_id = x + i * 6 + 1
    url = ('https://www.zhihu.com/api/v3/feed/topstory/recommend'
           '?session_token=46796b9b57df0d43594dcc3c865270a8'
           '&desktop=true&page_number=5&limit=6&action=down'
           '&after_id={}').format(after_id)
    resp = requests.get(url, headers=headers)
    # print(resp.content.decode('raw_unicode_escape'))
    r_urls3 = []
    urls3 = re.findall(r'"url":"(https://api.zhihu.com/questions/\d+?)"', resp.text, re.S)
    # Some feed items are not Q&A but still carry a "brief" with an id, which would
    # misalign question links and answer ids, so the regex must require "answer".
    urls4 = re.findall(r'"brief":".*?answer.*?(\d+?)}"', resp.text, re.S)
    for u in urls3:
        # Strips 'api.', but the alternation also removes every 's', so 'https'
        # becomes 'http' and 'questions' becomes 'quetion' (patched below).
        u = re.sub('api.|s', '', u)
        r_urls3.append(u)
    s = len(urls4)
    for j in range(s):
        real_url = r_urls3[j] + '/answer/' + urls4[j]
        # Undo the corruption introduced by the substitution above.
        real_url = re.sub('quetion', 'question', real_url)
        # print(real_url)
        首頁鏈接.append(real_url)


print(len(首頁鏈接))
for link in 首頁鏈接:
    print(link)

# Sample output:
# https://www.zhihu.com/question/336203471/answer/930023039
# https://www.zhihu.com/question/303859624/answer/856155303

A few interesting points:

1. You have to send the cookie; without it you get no data back.

2. The five or six items loaded directly with the homepage have no anti-crawling measures; you can take them straight from the page source.

3. Everything after that is loaded via Ajax. If what you actually need is the page content, you can take it straight from the Ajax response instead of extracting links. The Ajax feed also carries ad-like entries and other non-Q&A links, which need to be filtered out (a json-based sketch follows at the end of this post).

4. If you do take the article content straight from the Ajax response, you need:

resp.content.decode('raw_unicode_escape')

(or parse the response as JSON; see the sketch at the end).

5. If you take the links instead, be careful: the link you get by naively concatenating the question id and the answer id is a dead link that 404s. The culprit is the earlier re.sub('api.|s', '', u), which strips every 's' and corrupts 'questions' into 'quetion', so it has to be patched back afterwards:

re.sub('quetion', 'question', real_url)

It really did turn question into quetion, seriously... (a cleaner one-step conversion is sketched just below).
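
The double substitution can be avoided entirely. A minimal sketch of a one-step conversion (api_to_web is a hypothetical helper, not part of the script above); replacing the whole 'https://api.zhihu.com/questions/' prefix at once means no stray 's' ever gets eaten:

def api_to_web(api_url, answer_id):
    # Swap the host and the plural /questions/ path segment in one precise step.
    base = api_url.replace('https://api.zhihu.com/questions/',
                           'https://www.zhihu.com/question/')
    return base + '/answer/' + answer_id

print(api_to_web('https://api.zhihu.com/questions/336203471', '930023039'))
# -> https://www.zhihu.com/question/336203471/answer/930023039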

With that patched, all the links are accessible.
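
As for points 3 and 4, the raw_unicode_escape decoding and the regexes over raw text can be skipped by parsing the Ajax response as JSON. A minimal sketch, reusing the headers (with cookie) from above; the field names (data, target, type, question, excerpt) are assumptions based on what the recommend endpoint returned at the time, and after_id=7 is just an example value, so verify both against the live payload:

feed_url = ('https://www.zhihu.com/api/v3/feed/topstory/recommend'
            '?session_token=46796b9b57df0d43594dcc3c865270a8'
            '&desktop=true&page_number=5&limit=6&action=down&after_id=7')
resp = requests.get(feed_url, headers=headers)

for item in resp.json().get('data', []):    # assumed: feed items live under 'data'
    target = item.get('target', {})
    if target.get('type') != 'answer':      # skip ads and other non-Q&A cards
        continue
    link = 'https://www.zhihu.com/question/{}/answer/{}'.format(
        target.get('question', {}).get('id'), target.get('id'))
    print(link)
    print(target.get('excerpt'))            # preview text, already unescaped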
