1. 新建項目
> scrapy startproject spider_pjt2_zhihu
> cd spider_pjt2_zhihu
> scrapy genspider zhihu www.zhihu.com
2. spider_pjt2_zhihu/utils/ZhihuAccount.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : ZhihuAccount.py
from selenium import webdriver
import requests
from time import sleep
try:
import http.cookiejar as cookielib
except Exception as e:
print("兼容Py2.x", e)
import cookielib # 兼容Py2.x
import os
class ZhihuAccount(object):
    """Maintain a logged-in Zhihu session backed by an on-disk cookie file.

    Entry point: check_login()
        True  -> the session holds valid login cookies.
        False -> cookies could not be loaded and a fresh login also failed.
    """
    def __init__(self):
        self.brower = None  # selenium Chrome driver; created lazily in login()
        self.session = requests.session()
        # Cookie file lives one directory above this module, under cookies/.
        self.filename = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
                                     'cookies/zhihu_cookie.text')
        self.session.cookies = cookielib.LWPCookieJar(filename=self.filename)
        self.headers = {
            'Referer': 'https://www.zhihu.com/signup?next=%2F',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/72.0.3626.121 Safari/537.36',
        }
        # Load a previously saved cookie; on failure this triggers a new login.
        self.load_cookies()
    def login(self, username='', password=''):
        """Drive a Chrome browser through the Zhihu login form.

        Browser cookies are copied into self.session and saved to
        self.filename. Returns True on success, False on any failure.
        """
        if username == '' or password == '':
            username = input('輸入名稱:')
            password = input('輸入密碼:')
        # NOTE(review): chromedriver path is hard-coded to a Windows location.
        self.brower = webdriver.Chrome(executable_path='D:/selenium/chromedriver.exe')
        self.brower.get('https://www.zhihu.com/signup?next=%2F')
        try:
            # Switch the signup page to the password-login tab.
            self.brower.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div/div[2]/div[2]/span').click()
            self.brower.find_element_by_xpath('//*[@id="root"]//input[@name="username"]').send_keys(username)
            sleep(2)
            self.brower.find_element_by_xpath('//*[@id="root"]//input[@name="password"]').send_keys(password)
            # Mask navigator.webdriver so Zhihu's bot detection does not block the login.
            self.brower.execute_script('Object.defineProperties(navigator,{webdriver:{get:() => false}});')
            self.brower.execute_script('window.navigator.webdriver')
            self.brower.find_element_by_xpath('//*/form/button').click()  # submit the form
            sleep(1)
            # Copy every browser cookie into the requests session, then persist it.
            for cookie in self.brower.get_cookies():
                self.session.cookies.set_cookie(
                    cookielib.Cookie(version=0, name=cookie['name'], value=cookie['value'],
                                     port='80', port_specified=False, domain=cookie['domain'],
                                     domain_specified=True, domain_initial_dot=False,
                                     path=cookie['path'], path_specified=True,
                                     secure=cookie['secure'], rest={},
                                     expires=cookie['expiry'] if "expiry" in cookie else None,
                                     discard=False, comment=None, comment_url=None, rfc2109=False))
            # Fix: LWPCookieJar.save() raises if the cookies/ directory is missing.
            os.makedirs(os.path.dirname(self.filename), exist_ok=True)
            self.session.cookies.save()
            return True
        except Exception as e_login:
            print("登錄失敗", e_login)
            return False
    def load_cookies(self):
        """Load saved cookies from disk; fall back to an interactive login.

        Returns True when a usable cookie jar is in place, False otherwise.
        """
        try:
            self.session.cookies.load(ignore_discard=True)
            return True
        except Exception as e_load:
            print("zhihu_cookie未能加載", e_load)
            print("正在重新登錄...")
            # First login attempt.
            if self.login():
                print("cookie成功加載")
                return True
            print("加載cookie失敗")
            return False
    def check_login(self):
        """Probe the account-settings page; HTTP 200 means we are logged in.

        Always closes the browser and the HTTP session before returning.
        """
        inbox_url = 'https://www.zhihu.com/settings/account'
        response = self.session.get(inbox_url, headers=self.headers, allow_redirects=False)
        status = True
        if response.status_code != 200:
            # Second login attempt.
            if not self.login():
                status = False
        # Release resources unconditionally.
        if self.brower:
            self.brower.quit()
        self.session.close()
        return status
spider_pjt2_zhihu/middlewares.py
from scrapy.downloadermiddlewares.cookies import CookiesMiddleware
from spider_pjt2_zhihu.utils.ZhihuAccount import ZhihuAccount
import requests
from collections import defaultdict
from scrapy.http.cookies import CookieJar
try:
import http.cookiejar as cookielib
except Exception as e:
print("兼容Py2.x", e)
import cookielib # 兼容Py2.x
class ZhihuCookiesMiddleware(CookiesMiddleware):
    """CookiesMiddleware that pre-seeds the 'zhihu' cookie jar from disk."""
    def __init__(self, debug=False):
        super().__init__(debug)
        self.load_zhihu_cookies()
    def load_zhihu_cookies(self):
        """Validate the saved Zhihu cookie and copy it into self.jars['zhihu'].

        First verifies (via ZhihuAccount) that the stored cookie still logs
        in, then re-reads the cookie file with a fresh session because
        ZhihuAccount.check_login() closes its own session.
        """
        account = ZhihuAccount()
        if not account.check_login():
            print("登錄失敗")
            return
        print("登錄成功")
        session = requests.session()
        try:
            session.cookies = cookielib.LWPCookieJar(filename=account.filename)
            # Already validated in ZhihuAccount, so no extra error handling here.
            session.cookies.load(ignore_discard=True)
            post_url = 'https://www.zhihu.com'
            response = session.get(post_url, headers=account.headers, allow_redirects=False)
            if response.status_code == 200:
                # NOTE: read cookies from the session, NOT from the response.
                for cookie in session.cookies:
                    self.jars['zhihu'].set_cookie(cookie)
        finally:
            # Fix: the requests session was never closed in the original code.
            session.close()
spider_pjt2_zhihu/settings.py
COOKIES_ENABLED = True
DOWNLOADER_MIDDLEWARES = {
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None, # 關閉
'spider_pjt2_zhihu.middlewares.ZhihuCookiesMiddleware': 1,
}
spider_pjt2_zhihu/spiders/zhihu.py
# -*- coding: utf-8 -*-
import scrapy
class ZhihuSpider(scrapy.Spider):
    """Spider that fetches zhihu.com using the pre-loaded 'zhihu' cookie jar."""
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']  # kept for reference; start_requests is overridden
    def __init__(self, *args, **kwargs):
        # Fix: forward *args/**kwargs so Scrapy can pass crawl arguments
        # (e.g. name overrides, -a options) when instantiating the spider.
        super(ZhihuSpider, self).__init__(*args, **kwargs)
        self.headers = {
            # 'Referer' is intentionally omitted; it had no observable effect.
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/72.0.3626.121 Safari/537.36',
        }
    def start_requests(self):
        """Issue the start URLs with our UA header and the shared cookie jar."""
        for url in self.start_urls:
            yield scrapy.Request(url, headers=self.headers, meta={'cookiejar': 'zhihu'})
    def parse(self, response):
        # Dump the raw body so the login/cookie flow can be verified by eye.
        print("body:\n", response.body)
測試
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : main.py
"""Entry point: run the 'zhihu' spider through Scrapy's command-line API."""
from scrapy.cmdline import execute
import sys
import os
# Make the project root importable regardless of the working directory.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
if __name__ == '__main__':
    # Fix: guard the crawl so importing this module does not start it.
    # execute() takes the argv-style command as a list.
    execute(["scrapy", "crawl", "zhihu"])
進一步操作:設計數據庫表
table: zhihu_question
question_id # 主鍵,問題編號
question_title # 問題的標題
created_time # 創建時間
updated_time # 更新時間
question_url # 問題鏈接
question_topics # 關鍵詞
question_content # 問題描述
answer_num # 回答數量
comments_num # 評論數,評論問題
watch_user_num # 關注者數量
click_num # 被瀏覽數量
crawl_time # 初始爬取時間
crawl_update_time # 最後爬取時間
table: zhihu_answer
answer_id # 回答id, 作爲主鍵
question_id # 問題id,作爲外鍵
answer_url # 回答鏈接
author_id # zhihu ID,由於可以匿名回答,可爲空
author_name # 回答者名稱,顯示名稱
answer_content # 回答內容
voteup_count # 贊同數
comment_count # 評論數
create_time # 回答時間
updated_time # 最後更新時間
crawl_time # 初始爬取時間
crawl_update_time # 最後爬取時間
正則表達式分析:
(.*question\/(\d+))(\/|$).*
參考:
scrapy添加cookie的三種方式 https://blog.csdn.net/qq_40655579/article/details/85126064
scrapy中如何設置應用cookies https://blog.csdn.net/fuck487/article/details/84617194
scrapy在中間件攜帶cookie發送請求 https://blog.csdn.net/qq_42336549/article/details/80991814
Scrapy源碼註解–CookiesMiddleware https://www.cnblogs.com/thunderLL/p/8060279.html