import pymysql
import requests
from lxml import etree
from hashlib import sha1
import pickle
import zlib
import redis
def paqu():
    """Crawl the Zhihu hot list and cache each linked page in Redis.

    Fetches https://www.zhihu.com/hot, extracts the hot-item links, and
    stores a zlib-compressed pickle of each page's HTML text in the Redis
    hash ``zhihu``, keyed by the SHA-1 hex digest of the link URL.
    Already-cached links are skipped. Finally prints the total number of
    cached pages.

    Raises:
        requests.HTTPError: if the hot-list request returns a non-2xx status.
    """
    url = 'https://www.zhihu.com/hot'
    headers = {
        'Host': 'www.zhihu.com',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection': 'keep-alive',
        # BUG FIX: was misspelled as 'Pargma': 'np-cache' — that typo'd
        # header is sent verbatim and ignored; the real header is Pragma.
        'Pragma': 'no-cache',
        'Cookie': '自己的cookie',  # placeholder — substitute your own Zhihu cookie
        'Cache-Control': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Inter Mac OS X 10_12_4) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    }
    response = requests.get(url, headers=headers)
    # Fail fast on an error status instead of parsing and caching an error page.
    response.raise_for_status()
    html = etree.HTML(response.content)
    # hrefs of the hot items.
    # NOTE(review): assumed to be absolute URLs (they are fetched directly
    # below) — confirm against the live page markup.
    links = html.xpath('//div[contains(@class,"HotItem-content")]/a/@href')
    print(links)
    # Placeholder connection parameters — fill in your Redis host/password.
    client = redis.Redis(host='', password='', port=6379, db=1)
    for link in links:
        # SHA-1 digest of the URL serves as the field key in the 'zhihu' hash.
        field_key = sha1(link.encode('utf-8')).hexdigest()
        if not client.hexists('zhihu', field_key):
            html_page = requests.get(link, headers=headers).text
            # Compressed pickle is kept as the stored format for compatibility
            # with existing cache entries (readers must zlib.decompress then
            # pickle.loads; never unpickle data from an untrusted source).
            zipped_page = zlib.compress(pickle.dumps(html_page))
            client.hset('zhihu', field_key, zipped_page)
    print('總共緩存了{}個頁面'.format(client.hlen('zhihu')))
# Script entry point: run the crawler only when executed directly,
# not when imported as a module. (Indentation of the guarded call was
# lost in the original paste, making the file syntactically invalid.)
if __name__ == '__main__':
    paqu()
# Purpose: crawl the Zhihu hot-search list and cache the pages in Redis.