源码阅读如下:
"""Set User-Agent header per spider or use a default value from settings"""
from scrapy import signals
class UserAgentMiddleware(object):
    """Downloader middleware that fills in a default ``User-Agent`` header.

    The initial value comes from the ``USER_AGENT`` setting; a spider can
    override it by defining its own ``user_agent`` attribute.
    """

    def __init__(self, user_agent='Scrapy'):
        # Fallback UA used when neither settings nor the spider supply one.
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from crawler settings and hook ``spider_opened``."""
        middleware = cls(crawler.settings['USER_AGENT'])
        # When a spider starts, let its own ``user_agent`` attribute (if any)
        # take precedence over the value taken from settings.
        crawler.signals.connect(
            middleware.spider_opened, signal=signals.spider_opened
        )
        return middleware

    def spider_opened(self, spider):
        # Prefer the spider's own ``user_agent``; keep the current one otherwise.
        self.user_agent = getattr(spider, 'user_agent', self.user_agent)

    def process_request(self, request, spider):
        # Only fill in the header when a truthy UA is configured, and never
        # overwrite a User-Agent the request already carries.
        if not self.user_agent:
            return
        request.headers.setdefault(b'User-Agent', self.user_agent)
值得注意的是,在开启此中间件时,如果不主动配置USER_AGENT,请求头中的User-Agent会使用项目的默认值,直接暴露出这是一个Scrapy爬虫,因为项目中USER_AGENT配置默认值是这样的:
# Scrapy's built-in default for the USER_AGENT setting; identifies the bot
# and its version. NOTE: this line depends on
# `from importlib import import_module` being in scope (as in Scrapy's own
# default_settings module).
USER_AGENT = 'Scrapy/%s (+https://scrapy.org)' % import_module('scrapy').__version__
实际上我们需要的可能是能够切换user_agent的中间件,所以可以这样去实现:
1.在配置中将所有的user_agent组成一个列表
2.在中间件中用choice随机获取
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
from random import choice
class MyUserAgentMiddleware(UserAgentMiddleware):
    """Pick a random User-Agent from the configured pool on each request.

    Expects the ``USER_AGENT`` setting to hold a list/tuple of UA strings.
    """

    def process_request(self, request, spider):
        """Set a randomly chosen ``User-Agent`` header if none is present.

        Bug fix over the naive version: when ``self.user_agent`` is a plain
        string (e.g. the inherited default ``'Scrapy'``, or a single UA from
        settings), ``choice()`` on it would pick one *character*. A string
        value is now used as-is; only non-string sequences are sampled.
        """
        if not self.user_agent:
            return
        ua = self.user_agent
        if not isinstance(ua, (str, bytes)):
            # A pool of user agents was configured: sample one per request.
            ua = choice(ua)
        # Never overwrite a User-Agent the request already carries.
        request.headers.setdefault(b'User-Agent', ua)