Scraping Zhihu with Scrapy

1. Create the project

> scrapy startproject spider_pjt2_zhihu

> cd spider_pjt2_zhihu

> scrapy genspider zhihu www.zhihu.com
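
Note that the utils/ and cookies/ directories used below are not created by these commands; add them manually inside the spider_pjt2_zhihu package (cookies/ will hold the zhihu_cookie.text file saved during login).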

2. spider_pjt2_zhihu/utils/ZhihuAccount.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File    : ZhihuAccount.py


from selenium import webdriver
import requests
from time import sleep

try:
    import http.cookiejar as cookielib
except ImportError:
    # Fall back for Python 2.x
    import cookielib

import os


class ZhihuAccount(object):
    """
    Entry point: check_login()
    Returns True if the session is (or becomes) logged in, False otherwise.
    """
    def __init__(self):
        self.browser = None
        self.session = requests.session()
        self.filename = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
                                     'cookies/zhihu_cookie.text')
        self.session.cookies = cookielib.LWPCookieJar(filename=self.filename)
        self.headers = {
            'Referer': 'https://www.zhihu.com/signup?next=%2F',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/72.0.3626.121 Safari/537.36',
        }
        # Load cookies (TODO: raise an exception when loading fails; not implemented yet)
        self.load_cookies()

    def login(self, username='', password=''):
        if username == '' or password == '':
            username = input('Username: ')
            password = input('Password: ')
        self.browser = webdriver.Chrome(executable_path='D:/selenium/chromedriver.exe')
        self.browser.get('https://www.zhihu.com/signup?next=%2F')
        try:
            # Switch to the password-login tab
            self.browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div/div[2]/div[2]/span').click()
            self.browser.find_element_by_xpath('//*[@id="root"]//input[@name="username"]').send_keys(username)
            sleep(2)
            self.browser.find_element_by_xpath('//*[@id="root"]//input[@name="password"]').send_keys(password)

            # Hide the navigator.webdriver flag so Zhihu does not detect the automated browser
            self.browser.execute_script('Object.defineProperties(navigator,{webdriver:{get:() => false}});')

            # Submit the login form
            self.browser.find_element_by_xpath('//*/form/button').click()

            sleep(1)
            # Copy the browser cookies into the requests session and save them to disk
            for cookie in self.browser.get_cookies():
                self.session.cookies.set_cookie(
                    cookielib.Cookie(version=0, name=cookie['name'], value=cookie['value'],
                                     port='80', port_specified=False, domain=cookie['domain'],
                                     domain_specified=True, domain_initial_dot=False,
                                     path=cookie['path'], path_specified=True,
                                     secure=cookie['secure'], rest={},
                                     expires=cookie['expiry'] if "expiry" in cookie else None,
                                     discard=False, comment=None, comment_url=None, rfc2109=False))

            self.session.cookies.save()
            return True
        except Exception as e_login:
            print("Login failed:", e_login)
            return False

    def load_cookies(self):
        try:
            self.session.cookies.load(ignore_discard=True)
            return True
        except Exception as e_load:
            print("Could not load the Zhihu cookies:", e_load)
            print("Logging in again...")
            # First login attempt:
            if self.login():
                print("Logged in and saved cookies")
                return True
            else:
                print("Failed to obtain cookies")
                return False

    def check_login(self):
        # Request the account settings page; the status code tells us whether we are logged in
        inbox_url = 'https://www.zhihu.com/settings/account'
        response = self.session.get(inbox_url, headers=self.headers, allow_redirects=False)
        status = True
        if not response.status_code == 200:
            # Second login attempt:
            if not self.login():
                status = False

        # Close the browser and the session
        if self.browser:
            self.browser.quit()
        self.session.close()

        return status
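
The helper can be tried on its own before wiring it into Scrapy. A minimal sketch, assuming the project root is on sys.path (this test script is not part of the project files above):

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Standalone sanity check for ZhihuAccount
from spider_pjt2_zhihu.utils.ZhihuAccount import ZhihuAccount

if __name__ == '__main__':
    account = ZhihuAccount()
    # check_login() returns True when the saved cookies (or a fresh Selenium login)
    # yield a 200 response from the account settings page
    print('logged in:', account.check_login())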

3. spider_pjt2_zhihu/middlewares.py

from scrapy.downloadermiddlewares.cookies import CookiesMiddleware
from spider_pjt2_zhihu.utils.ZhihuAccount import ZhihuAccount
import requests
try:
    import http.cookiejar as cookielib
except ImportError:
    # Fall back for Python 2.x
    import cookielib


class ZhihuCookiesMiddleware(CookiesMiddleware):
    def __init__(self, debug=False):
        super().__init__(debug)
        self.load_zhihu_cookies()

    def load_zhihu_cookies(self):
        # Load cookies/zhihu_cookie.text into the 'zhihu' cookie jar.
        # First make sure we are logged in (check_login re-logs in via Selenium if needed).
        account = ZhihuAccount()
        if not account.check_login():
            print("Login failed")
            return
        print("Login succeeded")

        # Re-create a requests session from the saved cookie file
        session = requests.session()
        session.cookies = cookielib.LWPCookieJar(filename=account.filename)
        session.cookies.load(ignore_discard=True)  # already validated in ZhihuAccount, no need to catch here

        post_url = 'https://www.zhihu.com'
        response = session.get(post_url, headers=account.headers, allow_redirects=False)
        if response.status_code == 200:
            # Pitfall: take the cookies from the session, not from the response
            for cookie in session.cookies:
                self.jars['zhihu'].set_cookie(cookie)
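
Why the key 'zhihu' matters: the parent CookiesMiddleware keeps one cookie jar per request.meta['cookiejar'] value, so a request that carries meta={'cookiejar': 'zhihu'} is served from the jar filled above. A minimal sketch of that routing (illustration only, not part of middlewares.py):

from collections import defaultdict
from scrapy.http import Request
from scrapy.http.cookies import CookieJar

jars = defaultdict(CookieJar)  # same structure as CookiesMiddleware.jars
request = Request('https://www.zhihu.com/', meta={'cookiejar': 'zhihu'})

jar = jars[request.meta.get('cookiejar')]  # selects the 'zhihu' jar
jar.add_cookie_header(request)             # writes the Cookie header onto the request
print(request.headers.get('Cookie'))       # empty here; populated once the jar holds cookies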

4. spider_pjt2_zhihu/settings.py

COOKIES_ENABLED = True

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,  # disable the built-in cookies middleware
    'spider_pjt2_zhihu.middlewares.ZhihuCookiesMiddleware': 1,
}
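
Setting the built-in CookiesMiddleware to None removes it, and the custom middleware registered at priority 1 runs early in the downloader middleware chain, so every request picks up its cookies from the preloaded 'zhihu' jar.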

5. spider_pjt2_zhihu/spiders/zhihu.py

# -*- coding: utf-8 -*-
import scrapy

class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']  # not actually used, since start_requests is overridden below

    def __init__(self):
        self.headers = {
            # 'Referer': 'https://www.zhihu.com/',  # optional; adds little here
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/72.0.3626.121 Safari/537.36',
        }

        super(ZhihuSpider, self).__init__()

    def start_requests(self):
        # Override start_requests so every request uses the 'zhihu' cookie jar
        for url in self.start_urls:
            yield scrapy.Request(url, headers=self.headers, meta={'cookiejar': 'zhihu'})

    def parse(self, response):
        print("body:\n", response.body)

6. Test

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File    : main.py

from scrapy.cmdline import execute

import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# execute() runs a Scrapy command; it takes the command as a list of arguments
execute(["scrapy", "crawl", "zhihu"])

7. Next step: design the database tables

table: zhihu_question
question_id         # primary key, question id
question_title      # question title
created_time        # creation time
updated_time        # last-updated time
question_url        # question URL
question_topics     # topics / keywords
question_content    # question description
answer_num          # number of answers
comments_num        # number of comments on the question
watch_user_num      # number of followers
click_num           # number of views
crawl_time          # time of first crawl
crawl_update_time   # time of most recent crawl

table: zhihu_answer
answer_id           # answer id, primary key
question_id         # question id, foreign key
answer_url          # answer URL
author_id           # Zhihu user id; may be null since answers can be anonymous
author_name         # display name of the answerer
answer_content      # answer body
voteup_count        # number of upvotes
comment_count       # number of comments
create_time         # time the answer was posted
updated_time        # time of last update
crawl_time          # time of first crawl
crawl_update_time   # time of most recent crawl
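
Before these rows reach the database, the same fields can be carried through the pipeline as Scrapy items. A sketch of the corresponding item definitions (an items.py like this is assumed; it is not part of the code above):

# spider_pjt2_zhihu/items.py  (assumed file, mirroring the two tables above)
import scrapy


class ZhihuQuestionItem(scrapy.Item):
    question_id = scrapy.Field()
    question_title = scrapy.Field()
    created_time = scrapy.Field()
    updated_time = scrapy.Field()
    question_url = scrapy.Field()
    question_topics = scrapy.Field()
    question_content = scrapy.Field()
    answer_num = scrapy.Field()
    comments_num = scrapy.Field()
    watch_user_num = scrapy.Field()
    click_num = scrapy.Field()
    crawl_time = scrapy.Field()
    crawl_update_time = scrapy.Field()


class ZhihuAnswerItem(scrapy.Item):
    answer_id = scrapy.Field()
    question_id = scrapy.Field()
    answer_url = scrapy.Field()
    author_id = scrapy.Field()
    author_name = scrapy.Field()
    answer_content = scrapy.Field()
    voteup_count = scrapy.Field()
    comment_count = scrapy.Field()
    create_time = scrapy.Field()
    updated_time = scrapy.Field()
    crawl_time = scrapy.Field()
    crawl_update_time = scrapy.Field()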

8. Regular expression analysis

(.*question\/(\d+))(\/|$).*
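
Group 1 captures the question URL up to the numeric id, group 2 captures the id itself, and the trailing (\/|$).* swallows anything after the id, such as /answer/... paths. A quick check in Python (the sample URLs are made up for illustration):

import re

# Same pattern as above: group(1) = question URL, group(2) = question id
pattern = re.compile(r'(.*question\/(\d+))(\/|$).*')

for url in ('https://www.zhihu.com/question/12345678',
            'https://www.zhihu.com/question/12345678/answer/87654321'):
    match = pattern.match(url)
    if match:
        print(match.group(1), match.group(2))
# prints https://www.zhihu.com/question/12345678 12345678 for both URLs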



References:

Three ways to add cookies in Scrapy: https://blog.csdn.net/qq_40655579/article/details/85126064
How to set and use cookies in Scrapy: https://blog.csdn.net/fuck487/article/details/84617194
Sending requests with cookies from a Scrapy middleware: https://blog.csdn.net/qq_42336549/article/details/80991814
Annotated Scrapy source (CookiesMiddleware): https://www.cnblogs.com/thunderLL/p/8060279.html
