Python Crawler, From Novice to Adept, Part 8: A First Look at scrapy-redis Distributed Crawling

A scrapy-redis distributed crawler that scrapes a Zhihu user's followee list, along with each followee's follower count and other profile info

Workflow

1. After logging in to Zhihu, open a user's profile page; there you can find the request URL for the followee list

 The response is JSON, with 20 user records per page.
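 For reference, the JSON that the spider consumes looks roughly like the sketch below; the field values are invented, and only the keys that parse() actually reads are shown.

# Illustrative sketch only -- values are made up; the keys match what the spider reads
response_json = {
    "data": [
        {
            "name": "some user",
            "url_token": "some-user-id",
            "headline": "a short bio",
            "follower_count": 123,
            "articles_count": 4,
            # ...plus the extra fields requested through the include parameter
        },
        # ...up to 20 entries per page
    ]
}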

2. Create the Scrapy project

3. Source code

3.1 userinfo.py

       Scrape the user info and parse the JSON into a dict. If a page contains exactly 20 records, bump the offset in the URL to request the next page. Each user's info is saved to an item, and a new request is issued with that user's url_token to crawl their followee list in turn.

# -*- coding: utf-8 -*-

import scrapy
from scrapy import Request
import json
from zhihu.items import ZhihuItem
import re


class UserinfoSpider(scrapy.Spider):
    name = 'userinfo'
    allowed_domains = ['zhihu.com']
    start_urls = ['https://www.zhihu.com/api/v4/members/wang-shun-61-24/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=20&limit=20']

    def parse(self, response):
        user_data = json.loads(response.body.decode("utf-8"))["data"]
        count = len(user_data)
        if count == 20:
            # A full page of 20 records suggests there may be another page.
            # re.findall returns a list of every substring matching the pattern,
            # so grab the current offset and bump it by 20 to build the next-page URL.
            offset = int(re.findall("&offset=(.*?)&", response.url)[0])
            next_offset = offset + 20
            next_page_url = response.url.replace(
                "&offset=" + str(offset) + "&", "&offset=" + str(next_offset) + "&")
            yield Request(url=next_page_url, callback=self.parse)

        for data in user_data:
            item = ZhihuItem()
            item["name"] = data["name"]
            item["url_token"] = data["url_token"]
            item["headline"] = data["headline"]
            item["follower_count"] = data["follower_count"]
            item["articles_count"] = data["articles_count"]
            yield item

            # Also crawl this user's own followee list.
            next_id_url = "https://www.zhihu.com/api/v4/members/" + data["url_token"] + "/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=20&limit=20"
            yield Request(url=next_id_url, callback=self.parse)




3.2 items.py

import scrapy


class ZhihuItem(scrapy.Item):
    # user name
    name = scrapy.Field()
    # URL token, used to build the user's API URL
    url_token = scrapy.Field()
    # headline (one-line bio)
    headline = scrapy.Field()
    # number of followers
    follower_count = scrapy.Field()
    # number of published articles
    articles_count = scrapy.Field()

3.3 settings.py

# -*- coding: utf-8 -*-

BOT_NAME = 'zhihu'

SPIDER_MODULES = ['zhihu.spiders']
NEWSPIDER_MODULE = 'zhihu.spiders'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Time in seconds the downloader waits between consecutive requests to the same website.
# This throttles the crawl so the server is not hit too hard.
DOWNLOAD_DELAY = 3

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    "Host": "www.zhihu.com",
    "cookie": '_zap=bf320ca4-f377-4f5d-a294-86a285cc1b07; _xsrf=lglbJpe1HZ2uYEmRfjc0RwFcNv3GsXVs; d_c0="ABBctvmj1RCPTnoZaMXWutUjIcug2t2mG4w=|1581995080"; capsion_ticket="2|1:0|10:1581995084|14:capsion_ticket|44:NzAzYWMxYjcwYzFlNDY1MWE0ZmFkNzIxODUzN2RjODE=|61dcd9dc1da5b58a1280caec5aac54ca0bd64b6506ea946ba455e990e4ad41ff"; z_c0="2|1:0|10:1581995119|4:z_c0|92:Mi4xOTAwdUFnQUFBQUFBRUZ5Mi1hUFZFQ1lBQUFCZ0FsVk5iNkk0WHdCMHFyRHBiaHhxX0M1OUppcEtPTzFVejNQcWxn|d8b66099962700028671a523b3f058dec5b1dd8d302d299edc616b8e41452d43"; q_c1=22c717de53034a299aadad488196c6c9|1581995151000|1581995151000; tshl=; tst=r; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1581995076,1582079852; KLBRSID=b33d76655747159914ef8c32323d16fd|1582105883|1582104976; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1582105885',
    "referer": "https://www.zhihu.com/"
}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    #'zhihu.middlewares.ZhihuDownloaderMiddleware': 543,
    'zhihu.middlewares.RandomUserAgentMiddleware': 542,
}


# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'zhihu.pipelines.ZhihuPipeline': 300,
}

USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]

# MongoDB settings: host / port / database name / collection name
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'zhihu'
MONGODB_DOCNAME = 'zhihu_collection'

3.4 middlewares.py

import random
from scrapy.utils.project import get_project_settings
import logging

# Downloader middleware that sets a random User-Agent on each request
class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        # Pick a random User-Agent from the USER_AGENT_LIST defined in settings
        settings = get_project_settings()
        user_agent = random.choice(settings["USER_AGENT_LIST"])
        request.headers["User-Agent"] = user_agent
        return None

    def process_response(self, request, response, spider):
        # Log the User-Agent that was actually sent, to verify the middleware took effect
        logger = logging.getLogger(__name__)
        logger.info("headers ::> User-Agent = " + str(request.headers['User-Agent'], encoding="utf8"))
        return response

3.5 pipelines.py

      Store the scraped data in MongoDB.

# -*- coding: utf-8 -*-

from scrapy.utils.project import get_project_settings  # read settings.py
import pymongo
from zhihu.items import ZhihuItem

class ZhihuPipeline(object):
    settings = get_project_settings()
    host = settings['MONGODB_HOST']
    port = settings['MONGODB_PORT']
    dbName = settings['MONGODB_DBNAME']
    collectionName = settings['MONGODB_DOCNAME']

    # Connect to the database before any items are processed
    def open_spider(self, spider):
        # create the client connection
        self.client = pymongo.MongoClient(host=self.host, port=self.port)
        # select the database
        self.db = self.client[self.dbName]
        # select the collection
        self.collection = self.db[self.collectionName]

    def process_item(self, item, spider):
        if isinstance(item, ZhihuItem):
            # Upsert keyed on url_token: update the existing document, or insert it if it does not exist
            self.collection.update_one({"url_token": item["url_token"]}, {"$set": dict(item)}, upsert=True)
        return item

    # Close the connection once all items have been processed
    def close_spider(self, spider):
        self.client.close()

4. Results

     The scraped data as shown in Robo 3T.
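     If Robo 3T is not at hand, the stored documents can also be checked with a quick pymongo script. This is a sketch only; it uses the database and collection names configured in settings.py (zhihu / zhihu_collection) and assumes MongoDB is running locally on the default port.

# Quick sanity check of the stored data (sketch; names taken from settings.py)
import pymongo

client = pymongo.MongoClient(host="127.0.0.1", port=27017)
collection = client["zhihu"]["zhihu_collection"]
print(collection.count_documents({}))        # number of users stored so far
print(collection.find_one({}, {"_id": 0}))   # one sample document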

5. Configuration changes for scrapy-redis distributed crawling

    First install Redis and the scrapy-redis package (e.g. pip install scrapy-redis).

    a) Change the spider's parent class, add a redis_key, and comment out the original start_urls

from scrapy_redis.spiders import RedisCrawlSpider


class UserinfoSpider(RedisCrawlSpider):
    name = 'userinfo'
    redis_key = "myspider:start_urls"
    allowed_domains = ['zhihu.com']
    #start_urls = ['https://www.zhihu.com/api/v4/members/wang-shun-61-24/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=20&limit=20']

    b) Add the Redis-related settings to settings.py

# Scrapy-Redis configuration
# Use the Scrapy-Redis scheduler, which stores the request queue in Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Class that handles request deduplication
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Whether to keep the Redis queues, i.e. keep the scheduler state and dedup records on close.
# True = keep, False = flush. Keeping them allows the crawl to be paused and resumed.
SCHEDULER_PERSIST = True
# Use the priority queue for scheduling requests (the default)
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'

# Other available queues: PriorityQueue (sorted set), FifoQueue (list), LifoQueue (list)
#SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'  # breadth-first
#SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'  # depth-first

# Redis connection
REDIS_HOST = '127.0.0.1'               # host
REDIS_PORT = 6379                      # port
#REDIS_PARAMS = {'password': 'xxx'}    # extra Redis connection parameters
REDIS_ENCODING = "utf-8"               # Redis encoding, default 'utf-8'
# Alternatively:
#REDIS_URL = 'redis://user:pass@hostname:9001'  # connection URL (takes precedence over the settings above)

  5.1 Start the Redis server and a Redis client, then push the starting URL onto the list named by redis_key, as sketched below
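  A minimal sketch of seeding the queue from Python, assuming the redis package is installed and Redis is listening on 127.0.0.1:6379; the same thing can be done from redis-cli with lpush myspider:start_urls <url>.

# Push the seed URL onto the spider's redis_key list (sketch)
import redis

start_url = ("https://www.zhihu.com/api/v4/members/wang-shun-61-24/followees"
             "?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count"
             "%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics"
             "&offset=20&limit=20")

r = redis.StrictRedis(host="127.0.0.1", port=6379)
r.lpush("myspider:start_urls", start_url)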

5.2 Run several spider instances at the same time (for example, scrapy crawl userinfo in multiple terminals) to simulate distributed crawling

 

References:

https://www.bilibili.com/video/av20220465?from=search&seid=5965231367183075581

https://cuiqingcai.com/8465.html

 

 

 

 

 

 
