1. 網址
https://news.163.com/
2. 頁面解析
上面這部分可直接在源碼裏找到
下面這部分通過js異步加載
3. 異步加載部分
向下刷新頁面,查找發起請求的地址;url變化也比較明顯;找到接口後可直接獲取數據
4. 移動端接口
打開f12 ,選擇移動端,刷新頁面,即可跳轉到移動端頁面,如下
5. 移動端解析
移動端也是通過js異步加載,向下刷新查找數據接口;數據接口比較明顯,可直接訪問
6. 源碼參考
import re
import json
import aiohttp
import asyncio
class Spider(object):
    """Asynchronous scraper for the 163.com mobile news-list JSONP endpoint.

    Each call to :meth:`download` fetches one list page, extracts the
    article records from the ``artiList(...)`` JSONP wrapper and appends
    them to ``wangyi.json`` as one JSON object per line.
    """

    # Class-level list kept for callers that pass it to ``download``;
    # nothing in this class reads or writes it.
    data = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    # Key of the article list inside the decoded JSONP payload.
    channel = 'BBM54PGAwangning'
    # Destination file, opened in append mode.
    output_path = 'wangyi.json'

    # Opening of the JSONP wrapper that precedes the JSON document.
    _CALLBACK = 'artiList('
    # (output label, key in the API record), in output order.
    _FIELDS = (
        ('文檔id', 'docid'),          # document id
        ('來源', 'source'),           # source
        ('標題', 'title'),            # title
        ('優先權', 'priority'),       # priority
        ('詳情頁', 'url'),            # detail page URL
        ('評論數', 'commentCount'),   # comment count
        ('摘要', 'digest'),           # digest / summary
        ('首頁圖片', 'imgsrc'),       # front-page image
        ('發佈時間', 'ptime'),        # publication time
    )

    async def fetch(self, session, url):
        """Return the body of ``url`` decoded as UTF-8 text.

        :param session: an open ``aiohttp.ClientSession``
        :param url: address to request
        :return: response body as ``str``
        """
        async with session.get(url, headers=self.headers) as response:
            # The encoding is given explicitly instead of being guessed
            # from the response.
            return await response.text(encoding='utf-8')

    def _extract_records(self, html):
        """Decode the JSONP payload in ``html`` and return its article list.

        :raises ValueError: if the ``artiList(`` callback is absent or the
            text that follows is not valid JSON
        :raises KeyError: if the payload has no entry for ``self.channel``
        """
        start = html.find(self._CALLBACK)
        if start < 0:
            raise ValueError('no artiList(...) JSONP payload found in response')
        start += len(self._CALLBACK)
        # raw_decode() does not skip leading whitespace on its own.
        while start < len(html) and html[start].isspace():
            start += 1
        # raw_decode() stops at the end of the JSON document, so payloads
        # spanning several lines and any trailing ");" are handled.
        payload, _ = json.JSONDecoder().raw_decode(html, start)
        return payload[self.channel]

    async def parser(self, html):
        """Parse one list page and append its articles to the output file.

        Prints each article title and writes one JSON line per article.

        :param html: raw JSONP response text
        :return: None
        """
        records = self._extract_records(html)
        if not records:
            # Nothing to write; avoid creating an empty output file.
            return
        # Open the file once per page rather than once per article.
        with open(self.output_path, 'a', encoding='utf-8') as f:
            for record in records:
                items = {label: record[key] for label, key in self._FIELDS}
                print(items['標題'])
                f.write(json.dumps(items, ensure_ascii=False) + '\n')

    async def download(self, url, table=None):
        """Fetch ``url`` and hand the response text to :meth:`parser`.

        :param url: list-page address to fetch
        :param table: unused; accepted for compatibility with callers
        :return: None
        """
        # Cap the connection pool at 10 and skip certificate verification.
        # ``ssl=False`` replaces the deprecated ``verify_ssl=False``.
        connector = aiohttp.TCPConnector(limit=10, ssl=False)
        # The session is expected to close the connector it was given when
        # it exits, so no separate context manager is used for it.
        async with aiohttp.ClientSession(connector=connector) as session:
            html = await self.fetch(session, url)
        await self.parser(html)
async def main():
    """Download all 31 list pages (offsets 0, 10, ..., 300) concurrently."""
    urls = ['https://3g.163.com/touch/reconstruct/article/list/BBM54PGAwangning/{}-10.html'.format(i*10) for i in
            range(31)]
    # One spider is enough; each download() call builds its own session.
    spider = Spider()
    await asyncio.gather(*(spider.download(url, spider.data) for url in urls))


if __name__ == '__main__':
    import time
    t0 = time.perf_counter()
    # asyncio.run() creates and closes the event loop; calling
    # get_event_loop() with no running loop is deprecated.
    asyncio.run(main())
    print(time.perf_counter() - t0)
    # Example elapsed time from one run: 0.8280472755432129