1.解析了Ajax動態加載的地址
2.使用MongoDB(文檔型數據庫,以鍵值對形式的文檔存儲數據)保存結果
代碼如下:
首先獲取zhihu-live中的各個Live鏈接地址
import json, time
import random
import requests
from pymongo import MongoClient
# MongoDB handle: every crawled feed page is stored as one document in the
# `zhihu_live` collection of the `zhihu_live` database on the local server.
client = MongoClient(host='localhost', port=27017)
db = client['zhihu_live']
collection = db['zhihu_live']

# Pagination state for the crawl loop: the URL of the page to fetch next and
# the flag that is overwritten from each response's `paging.is_end`.
link = 'https://api.zhihu.com/lives/homefeed?includes=live'
is_end = False
def scrapy(link, retries=3, timeout=10):
    """Fetch *link* with browser-like headers and return the response body.

    Args:
        link: URL to request.
        retries: Total number of attempts made before giving up.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.

    Returns:
        The response body as text, or ``None`` when every attempt failed
        with a request error.
    """
    headers = {
        'authority': 'api.zhihu.com',
        # The original values had a stray space after the scheme
        # ("https: //"), which is not a valid URL.
        'origin': 'https://www.zhihu.com',
        'referer': 'https://www.zhihu.com/lives/897097999497437184/related',
        # Adjacent literals instead of backslash continuations, so the parts
        # of the user-agent are separated by exactly one space.
        'user-agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/63.0.3239.132 Safari/537.36'
        ),
    }
    for attempt in range(1, retries + 1):
        try:
            r = requests.get(link, headers=headers, timeout=timeout)
            return r.text
        except requests.RequestException as e:
            print('Error:', e)
            if attempt < retries:
                # Short, growing pause before the next attempt.
                time.sleep(attempt)
    # All attempts failed. The original recursed without bound here and
    # discarded the recursive call's result.
    return None
# Walk the paginated home feed: store each page in MongoDB and follow
# `paging.next` until the API reports `paging.is_end`.
while not is_end:
    html = scrapy(link)
    if html is None:
        # scrapy() yields None when the request failed; stop rather than
        # handing None to json.loads().
        print('Error: no response for', link)
        break
    try:
        decodejson = json.loads(html)
        paging = decodejson['paging']
        next_link = paging['next']
        page_is_end = paging['is_end']
    except (ValueError, KeyError, TypeError) as e:
        # Body is not JSON, or lacks the paging information needed to go on.
        print('Error: unexpected response from', link, '-', e)
        break
    collection.insert_one(decodejson)
    link = next_link
    is_end = page_is_end
    if not is_end:
        # Randomised 2-4 second pause between consecutive requests.
        time.sleep(random.randint(2, 3) + random.random())
然後獲取各個live鏈接地址裏的觀衆id等信息
import json, time
import random
import requests
from pymongo import MongoClient
# MongoDB handle: `collection` is the `zhihu_live` collection (read by
# id_get), and `db` is also used to write to `zhihu_live_audience`.
client = MongoClient(host='localhost', port=27017)
db = client['zhihu_live']
collection = db['zhihu_live']
def get_audience(live_id, max_failures=3, timeout=10):
    """Download every members page of one Live and store it in MongoDB.

    Each page is saved as one document in ``db.zhihu_live_audience`` with an
    added ``live_id`` field. Pagination follows ``paging.next`` until the
    response reports ``paging.is_end``.

    Args:
        live_id: Identifier of the Live whose members are fetched.
        max_failures: Number of consecutive failed attempts on the same page
            after which the crawl of this Live is abandoned.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.
    """
    headers = {
        'authority': 'api.zhihu.com',
        # The original values had a stray space after the scheme
        # ("https: //"), which is not a valid URL.
        'origin': 'https://www.zhihu.com',
        'referer': 'https://www.zhihu.com/lives/897097999497437184/related',
        'user-agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/63.0.3239.132 Safari/537.36'
        ),
    }
    link = 'https://api.zhihu.com/lives/{}/members?limit=10&offset=0'.format(live_id)
    is_end = False
    failures = 0
    while not is_end:
        try:
            r = requests.get(link, headers=headers, timeout=timeout)
            decodejson = json.loads(r.text)
            # Read the paging info before inserting, so a page that cannot be
            # advanced past is not stored again on every retry.
            paging = decodejson['paging']
            next_link = paging['next']
            page_is_end = paging['is_end']
            decodejson['live_id'] = live_id
            db.zhihu_live_audience.insert_one(decodejson)
        except Exception as e:
            # Best-effort crawl: report and retry, but only a bounded number
            # of times. The original retried the same link forever with no
            # delay when the error persisted.
            failures += 1
            print('Error:', e)
            if failures >= max_failures:
                print('Error: giving up on live', live_id)
                break
            time.sleep(failures)
            continue
        failures = 0
        link = next_link
        is_end = page_is_end
        # Randomised 2-4 second pause between consecutive requests.
        time.sleep(random.randint(2, 3) + random.random())
def id_get():
    """Crawl the audience of every Live listed in the first stored feed page.

    Reads one document from ``collection``, prints each Live's id and
    speaker name, and calls ``get_audience`` for it. Does nothing except
    print a notice when the collection holds no document.
    """
    first_page = collection.find_one()
    if first_page is None:
        # find_one() returns None on an empty collection; subscripting it
        # would raise TypeError.
        print('Error: no feed page stored yet')
        return
    for each in first_page['data']:
        live = each['live']
        live_id = live['id']
        print(live_id, '\t', live['speaker']['member']['name'])
        get_audience(live_id)
# Start the audience crawl only when this file is run as a script.
if __name__ == '__main__':
    id_get()
測試運行結果如下圖