scrapy-redis分布式爬虫 爬取知乎用户的关注列表信息及关注者的粉丝
流程
1. 登陆知乎后,进入个人主页,可以发现请求的url
响应为json格式,一页有20个用户信息
2. 创建scrapy项目
3. 源码
3.1 userinfo.py
爬取用户信息并转为字典:每页的用户信息都会保存到 item;若一页刚好有 20 条信息,则修改 url 中的 offset 来切换下一页;同时对每个用户更换 url,去爬取该用户自己的关注列表信息
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
import json
from zhihu.items import ZhihuItem
import re
class UserinfoSpider(scrapy.Spider):
    """Crawl Zhihu followee lists through the members JSON API.

    Each API page holds at most 20 users. For every followee found the
    spider yields one ZhihuItem plus a new request for that followee's own
    followee list, so the crawl spreads across the social graph.
    """
    name = 'userinfo'
    allowed_domains = ['zhihu.com']
    # NOTE(review): offset=20 makes the first request start at the second
    # page, skipping the seed user's first 20 followees — confirm intended.
    start_urls = ['https://www.zhihu.com/api/v4/members/wang-shun-61-24/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=20&limit=20']

    def parse(self, response):
        """Parse one followee page: paginate if full, then emit items.

        Yields ZhihuItem objects and follow-up Requests (next page and the
        followee lists of every user on this page).
        """
        # .get() guards against responses without a "data" key (e.g. an
        # API error payload); response.text already decodes the body.
        users = json.loads(response.text).get("data", [])

        # A full page (exactly 20 entries) means a next page may exist:
        # bump the offset in the URL by the page size and re-enter parse.
        if len(users) == 20:
            offset = int(re.findall(r"&offset=(.*?)&", response.url)[0])
            next_page_url = response.url.replace(
                "&offset=" + str(offset) + "&",
                "&offset=" + str(offset + 20) + "&")
            yield Request(url=next_page_url, callback=self.parse)

        for data in users:
            item = ZhihuItem()
            item["name"] = data["name"]
            item["url_token"] = data["url_token"]
            item["headline"] = data["headline"]
            item["follower_count"] = data["follower_count"]
            item["articles_count"] = data["articles_count"]
            yield item
            # Fan out: also crawl this followee's own followee list.
            next_id_url = ("https://www.zhihu.com/api/v4/members/"
                           + data["url_token"]
                           + "/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=20&limit=20")
            yield Request(url=next_id_url, callback=self.parse)
3.2 items.py
import scrapy
class ZhihuItem(scrapy.Item):
    """Profile fields scraped for one Zhihu user."""
    # Display name
    name = scrapy.Field()
    # URL slug uniquely identifying the user (also the upsert key in MongoDB)
    url_token = scrapy.Field()
    # Profile headline / personal signature
    headline = scrapy.Field()
    # Follower count
    follower_count = scrapy.Field()
    # Number of published articles
    articles_count = scrapy.Field()
3.3 settings.py
# -*- coding: utf-8 -*-
BOT_NAME = 'zhihu'
SPIDER_MODULES = ['zhihu.spiders']
NEWSPIDER_MODULE = 'zhihu.spiders'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Seconds the downloader waits between consecutive pages from the same
# site — throttles the crawl so we do not hammer the server.
DOWNLOAD_DELAY = 3
# Override the default request headers:
# SECURITY NOTE(review): this cookie is a live Zhihu session credential
# hard-coded in the repo — it will expire and should not be committed.
DEFAULT_REQUEST_HEADERS = {
"Host": "www.zhihu.com",
"cookie": '_zap=bf320ca4-f377-4f5d-a294-86a285cc1b07; _xsrf=lglbJpe1HZ2uYEmRfjc0RwFcNv3GsXVs; d_c0="ABBctvmj1RCPTnoZaMXWutUjIcug2t2mG4w=|1581995080"; capsion_ticket="2|1:0|10:1581995084|14:capsion_ticket|44:NzAzYWMxYjcwYzFlNDY1MWE0ZmFkNzIxODUzN2RjODE=|61dcd9dc1da5b58a1280caec5aac54ca0bd64b6506ea946ba455e990e4ad41ff"; z_c0="2|1:0|10:1581995119|4:z_c0|92:Mi4xOTAwdUFnQUFBQUFBRUZ5Mi1hUFZFQ1lBQUFCZ0FsVk5iNkk0WHdCMHFyRHBiaHhxX0M1OUppcEtPTzFVejNQcWxn|d8b66099962700028671a523b3f058dec5b1dd8d302d299edc616b8e41452d43"; q_c1=22c717de53034a299aadad488196c6c9|1581995151000|1581995151000; tshl=; tst=r; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1581995076,1582079852; KLBRSID=b33d76655747159914ef8c32323d16fd|1582105883|1582104976; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1582105885',
"referer": "https://www.zhihu.com/"
}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
#'zhihu.middlewares.ZhihuDownloaderMiddleware': 543,
'zhihu.middlewares.RandomUserAgentMiddleware': 542,
}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'zhihu.pipelines.ZhihuPipeline': 300,
}
# Pool of User-Agent strings; RandomUserAgentMiddleware picks one at
# random for every outgoing request.
USER_AGENT_LIST = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
# MongoDB connection parameters: host / port / database name / collection name
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'zhihu'
MONGODB_DOCNAME = 'zhihu_collection'
3.4 middlewares.py
import random
from scrapy.utils.project import get_project_settings
import logging
# 随机选择 User-Agent 的下载器中间件
class RandomUserAgentMiddleware(object):
    """Downloader middleware that sets a random User-Agent on each request.

    The User-Agent pool comes from the USER_AGENT_LIST project setting.
    """

    def __init__(self):
        # Load the pool once at startup; the original re-read the project
        # settings on every single request, which is pure overhead.
        self.user_agents = get_project_settings().get("USER_AGENT_LIST", [])

    def process_request(self, request, spider):
        """Assign a randomly chosen User-Agent header to the request.

        Returning None lets the request continue through the middleware
        chain and on to the downloader.
        """
        if self.user_agents:
            request.headers["User-Agent"] = random.choice(self.user_agents)
        return None

    def process_response(self, request, response, spider):
        """Log the User-Agent actually sent, to verify rotation works."""
        logger = logging.getLogger(__name__)
        # Lazy %-formatting at DEBUG level: cheap when disabled, and avoids
        # the brittle str(bytes, encoding=...) conversion of the original.
        logger.debug("headers ::> User-Agent = %s",
                     request.headers.get("User-Agent"))
        return response
3.5 pipelines.py
数据存放到mongodb数据库中
# -*- coding: utf-8 -*-
from scrapy.utils.project import get_project_settings # 获取settings.py
import pymongo
from zhihu.items import ZhihuItem
class ZhihuPipeline(object):
    """Persist ZhihuItem objects into MongoDB.

    Connection parameters are read from the project settings:
    MONGODB_HOST / MONGODB_PORT / MONGODB_DBNAME / MONGODB_DOCNAME.
    """
    settings = get_project_settings()
    host = settings['MONGODB_HOST']
    port = settings['MONGODB_PORT']
    dbName = settings['MONGODB_DBNAME']
    collectionName = settings['MONGODB_DOCNAME']

    def open_spider(self, spider):
        """Connect to MongoDB before any item is processed."""
        self.client = pymongo.MongoClient(host=self.host, port=self.port)
        # Selecting the database / collection is lazy; nothing is created
        # on the server until the first write.
        self.db = self.client[self.dbName]
        self.collection = self.db[self.collectionName]

    def process_item(self, item, spider):
        """Upsert the item keyed by url_token so each user is stored once."""
        if isinstance(item, ZhihuItem):
            # update() is deprecated in pymongo 3.x; update_one with
            # upsert=True inserts the document when the user is new.
            # dict(item) is required because pymongo cannot BSON-encode a
            # scrapy Item object directly.
            self.collection.update_one(
                {"url_token": item["url_token"]},
                {"$set": dict(item)},
                upsert=True,
            )
        return item

    def close_spider(self, spider):
        """Close the MongoDB connection once the spider finishes."""
        self.client.close()
4. 运行结果
Robo 3T显示数据
5. scrapy-redis 分布式修改配置
先安装 redis 以及 scrapy-redis
a) 修改spider继承类,添加redis_key, 注释掉原有的起始url:start_urls
from scrapy_redis.spiders import RedisCrawlSpider
class UserinfoSpider(RedisCrawlSpider):
    """Distributed variant of the spider: inherits RedisCrawlSpider and
    pulls its start URLs from the Redis list named by redis_key, so the
    hard-coded start_urls below is commented out."""
    name = 'userinfo'
    # Redis list key that workers pop start URLs from
    redis_key = "myspider:start_urls"
    allowed_domains = ['zhihu.com']
    #start_urls = ['https://www.zhihu.com/api/v4/members/wang-shun-61-24/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=20&limit=20']
b) settings.py中添加 redis的相关配置
# Scrapy-Redis configuration
# Use the scrapy-redis scheduler so the request queue lives in Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Redis-backed duplicate filter shared by all worker processes
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Keep the Redis queues on close, i.e. preserve the scheduler queue and
# dedup records when the spider stops.
# True = keep, False = clear. Keeping them allows pausing/resuming the crawl.
SCHEDULER_PERSIST = True
# Schedule requests with a priority queue (the default)
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
# Other available queues: PriorityQueue (sorted set), FifoQueue (list), LifoQueue (list)
#SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue' # breadth-first
#SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue' # depth-first
# Redis connection
REDIS_HOST = '127.0.0.1' # host name
REDIS_PORT = 6379 # port
#REDIS_PARAMS = {'password':'xxx'} # extra Redis connection parameters
REDIS_ENCODING = "utf-8" # Redis encoding. Default: 'utf-8'
# Alternatively (takes precedence over the settings above):
#REDIS_URL = 'redis://user:pass@hostname:9001' # connection URL
5.1 打开redis服务端和客户端,并向redis_key列表里添加url为起始爬取的url
5.2 同时运行几个来模拟分布式爬取
参考学习:
https://www.bilibili.com/video/av20220465?from=search&seid=5965231367183075581
https://cuiqingcai.com/8465.html