# Required packages
import re
from scrapy import signals
from scrapy.exceptions import IgnoreRequest


# Middleware code
class UrlFilterMiddleware:
    """url filter middleware written by fcj"""

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        from urllib.parse import urlsplit  # imported locally; only this method needs it
        # Blacklisted keywords
        black_set = {'beanie', 'brief', 'choker', 'hat', 'scarf', 'pajamas', 'slippers'}
        # urlsplit breaks the URL apart; the path component usually contains the keywords.
        split_result = urlsplit(url=request.url).path
        # Split the path on common separators and turn the pieces into a set.
        split_set = set(re.split(r'[/_.-]', split_result))
        # Intersect the two sets: a non-empty result means they share at least one
        # element, which avoids iterating over the sets to look for common items.
        intersection = split_set & black_set
        if len(intersection) > 0:
            # The URL contains a blacklisted keyword, so drop the request. From the
            # official docs: "This exception can be raised by the Scheduler or any
            # downloader middleware to indicate that the request should be ignored."
            raise IgnoreRequest('url filter')
        else:
            # Returning None lets the crawler keep processing the request.
            return None

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
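
As a quick sanity check of the splitting logic, here is how the path of a hypothetical product URL is broken into tokens and matched against the blacklist (the URL below is made up for illustration, not taken from a real site):

from urllib.parse import urlsplit
import re

black_set = {'beanie', 'brief', 'choker', 'hat', 'scarf', 'pajamas', 'slippers'}
url = 'https://example.com/women/hat-red_01.html'  # hypothetical URL for illustration
path = urlsplit(url).path                          # '/women/hat-red_01.html'
split_set = set(re.split(r'[/_.-]', path))         # {'', 'women', 'hat', 'red', '01', 'html'}
print(split_set & black_set)                       # {'hat'} -> the request would be ignored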
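
For the middleware to run at all, it has to be registered in the project's settings.py under DOWNLOADER_MIDDLEWARES. The module path below ('myproject.middlewares') is only an assumption; point it at the module that actually holds UrlFilterMiddleware, and pick a priority that fits the rest of your middleware stack:

# settings.py -- 'myproject.middlewares' is an assumed module path
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.UrlFilterMiddleware': 543,
}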