# Required packages
import re
from urllib.parse import urlsplit

from scrapy import signals
from scrapy.exceptions import IgnoreRequest
# Middleware code
class UrlFilterMiddleware:
    """URL-filter middleware written by fcj.

    Drops any request whose URL path contains a blacklisted keyword by
    raising ``IgnoreRequest``, so the downloader never fetches it.
    """

    # Keyword blacklist.  Built once at class level instead of on every
    # request (the original rebuilt the set per call).
    # NOTE(review): 'scraf' / 'slippes' look like typos of 'scarf' /
    # 'slippers' — kept verbatim to preserve behavior; confirm intent.
    BLACK_SET = frozenset((
        'beanie', 'brief', 'choker', 'hat', 'scraf', 'pajamas', 'slippes',
    ))

    # Compiled once: the characters that separate keywords in a URL path.
    _SPLITTER = re.compile(r'[/_.-]')

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy factory hook: build the middleware and wire up signals.

        Bug fix: the original file never imported ``signals``; it is now
        imported at module level (``from scrapy import signals``).
        """
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        """Filter a request by the keywords found in its URL path.

        The path is split on '/', '_', '.' and '-' and the resulting
        tokens are intersected with the keyword blacklist — a set
        intersection avoids iterating to look for common elements.

        Raises:
            IgnoreRequest: when the path contains a blacklisted keyword.
                Per the Scrapy docs, this exception tells the scheduler
                or any downloader middleware to ignore the request.

        Returns:
            None otherwise, so Scrapy keeps processing the request.
        """
        path = urlsplit(request.url).path
        tokens = set(self._SPLITTER.split(path))
        if tokens & self.BLACK_SET:
            raise IgnoreRequest('url filter')
        return None

    def spider_opened(self, spider):
        # Signal handler: log when the spider starts.
        spider.logger.info('Spider opened: %s' % spider.name)