網絡爬蟲-知乎Live-Live評論與觀衆-MongoDB數據庫

1.解析了Ajax動態加載地址

2.使用文檔型(以鍵值對組織數據)的MongoDB數據庫存儲結果

代碼如下:

首先獲取zhihu-live中的各個Live鏈接地址

import json, time
import random
import requests
from pymongo import MongoClient

# Client for the MongoDB server on localhost:27017; crawled feed pages are
# stored in the `zhihu_live` collection of the `zhihu_live` database.
client = MongoClient(host='localhost', port=27017)
db = client['zhihu_live']
collection = db['zhihu_live']

# Paging state for the crawl loop: the URL to request next and the flag
# that stops the loop once it becomes true.
link = 'https://api.zhihu.com/lives/homefeed?includes=live'
is_end = False

def scrapy(link, max_attempts=3, retry_delay=1.0, timeout=10, session=None):
    """Fetch *link* with browser-like headers and return the response body.

    Args:
        link: URL to request.
        max_attempts: Total number of tries before giving up (at least one
            try is always made).
        retry_delay: Seconds to wait between failed tries; 0 disables the
            wait.
        timeout: Timeout in seconds passed to the HTTP call so a stalled
            connection cannot block the crawl indefinitely.
        session: Optional object exposing ``get(url, headers=..., timeout=...)``
            (for example a ``requests.Session``) to use instead of the
            module-level ``requests.get``.

    Returns:
        The response text, or ``None`` when every attempt failed.
    """
    headers = {
        'authority': 'api.zhihu.com',
        # The scheme separator must not contain a space.
        'origin': 'https://www.zhihu.com',
        'referer': 'https://www.zhihu.com/lives/897097999497437184/related',
        # Adjacent literals instead of backslash continuations, which
        # embedded the indentation spaces in the header value.
        'user-agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/63.0.3239.132 Safari/537.36'),
    }
    attempts = max(1, max_attempts)
    for attempt in range(attempts):
        try:
            fetch = requests.get if session is None else session.get
            r = fetch(link, headers=headers, timeout=timeout)
            return r.text
        except Exception as e:
            # Kept broad on purpose: every failure is treated as retryable,
            # but the number of retries is now bounded.
            print('Error:', e)
            if attempt + 1 < attempts and retry_delay > 0:
                time.sleep(retry_delay)
    return None

# Walk the home feed page by page, storing each page as one document, until
# the API reports the last page.
while not is_end:
    html = scrapy(link)
    if html is None:
        # scrapy() can come back without a body after a request error;
        # stop instead of crashing inside json.loads().
        print('Error:', 'no response body for', link)
        break
    decodejson = json.loads(html)
    paging = decodejson.get('paging') if isinstance(decodejson, dict) else None
    if not paging:
        # Without paging data there is no next link to follow, and the
        # payload is not a feed page worth storing.
        print('Error:', 'unexpected payload from', link)
        break
    collection.insert_one(decodejson)
    link = paging['next']
    is_end = paging['is_end']
    # Randomised 2-4 second pause between requests.
    time.sleep(random.randint(2, 3) + random.random())

 

然後獲取各個live鏈接地址裏的觀衆id等信息

import json, time
import random
import requests
from pymongo import MongoClient

# Client for the MongoDB server on localhost:27017. `collection` is the
# `zhihu_live` collection that id_get() reads the stored feed page from.
client = MongoClient(host='localhost', port=27017)
db = client['zhihu_live']
collection = db['zhihu_live']

def get_audience(live_id, max_failures=3, timeout=10):
    """Page through the member list of one Live and store every page.

    Each response is decoded from JSON, tagged with ``live_id`` and inserted
    into the ``zhihu_live_audience`` collection. Paging follows
    ``paging.next`` until ``paging.is_end`` is true.

    Args:
        live_id: Identifier of the Live as a string (it is concatenated into
            the request URL).
        max_failures: Number of consecutive failed iterations after which the
            crawl for this Live is abandoned.
        timeout: Timeout in seconds for each HTTP request.
    """
    headers = {
        'authority': 'api.zhihu.com',
        # The scheme separator must not contain a space.
        'origin': 'https://www.zhihu.com',
        'referer': 'https://www.zhihu.com/lives/897097999497437184/related',
        # Adjacent literals instead of backslash continuations, which
        # embedded the indentation spaces in the header value.
        'user-agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/63.0.3239.132 Safari/537.36'),
    }
    link = 'https://api.zhihu.com/lives/' + live_id + '/members?limit=10&offset=0'

    failures = 0
    is_end = False
    while not is_end:
        try:
            r = requests.get(link, headers=headers, timeout=timeout)
            decodejson = json.loads(r.text)
            # Read the paging data before storing anything, so a payload
            # without it is never inserted (and re-inserted on every retry).
            paging = decodejson['paging']
            next_link = paging['next']
            reached_end = paging['is_end']

            decodejson['live_id'] = live_id
            db.zhihu_live_audience.insert_one(decodejson)

            # Advance only after the page has been stored successfully.
            link, is_end = next_link, reached_end
            failures = 0
        except Exception as e:
            print('Error:', e)
            failures += 1
            if failures >= max_failures:
                # Bounded retries: a persistent error no longer loops forever.
                print('Error:', 'giving up on live', live_id,
                      'after', failures, 'consecutive failures')
                break
        # Randomised 2-4 second pause after every request, including
        # failed ones, so retries do not hammer the API.
        time.sleep(random.randint(2, 3) + random.random())

def id_get():
    """Crawl the audience of every Live listed in one stored feed page.

    Only the single document returned by ``collection.find_one()`` is
    processed. For each entry in its ``data`` list the Live id and the
    speaker's name are printed, then ``get_audience`` is called with the id.
    """
    first_page = collection.find_one()
    if first_page is None:
        # find_one() returns None when the collection holds no documents.
        print('Error:', 'no stored feed page found in the collection')
        return
    for each in first_page['data']:
        live = each['live']
        live_id = live['id']
        print(live_id, '\t', live['speaker']['member']['name'])
        get_audience(live_id)


# Start the audience crawl only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    id_get()

測試運行結果如下圖

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章