Documentation: https://scrapy-chs.readthedocs.io/zh_CN/0.24/topics/signals.html
Introduction to the Scrapy framework
- Scrapy is an application framework written in pure Python for crawling websites and extracting structured data, and it is useful in a wide range of scenarios.
- That is the power of a framework: you only need to customize a few modules to get a working spider that scrapes page content and images.
Scrapy architecture
- Scrapy Engine: handles the communication, signals and data transfer between the Spider, Item Pipeline, Downloader and Scheduler.
- Scheduler: accepts Requests sent by the engine, orders and enqueues them, and hands them back when the engine asks for more.
- Downloader: downloads every Request sent by the Scrapy Engine and returns the resulting Responses to the engine, which passes them on to the Spider.
- Spider: processes all Responses, extracts the data needed to fill Item fields, and submits any URLs that should be followed back to the engine, where they re-enter the Scheduler.
- Item Pipeline: where Items produced by the Spider are post-processed (analysed in detail, filtered, stored, etc.).
- Downloader Middlewares: components you can write to extend or customize the download behaviour.
- Spider Middlewares: components that extend and hook into the communication between the engine and the Spider (the Responses going into the Spider and the Requests coming out of it).
1. Install the Scrapy framework
- pip install scrapy
2. Create a Scrapy project
scrapy startproject <project_name>
3. Create a spider file
scrapy genspider <spider_name> <domain>
# -*- coding: utf-8 -*-
import scrapy


class BaiduSpider(scrapy.Spider):
    # Spider name
    name = 'baidu'
    # Domains the spider is allowed to crawl (more than one may be listed)
    allowed_domains = ['www.baidu.com']
    # Start URLs (more than one may be listed)
    start_urls = ['http://www.baidu.com/']

    def parse(self, response):
        '''
        Callback invoked once a start URL has been requested successfully.
        :param response: the response object
        :return:
        '''
        pass

# The parse method mainly extracts data, wraps it in an item and passes it on to the pipeline
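As a hedged sketch of what a filled-in parse usually looks like (the site, XPath expressions and field names below are invented for illustration and do not come from a real page), both extracted items and follow-up requests are produced by yielding:

import scrapy


class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['http://www.example.com/']

    def parse(self, response):
        # Extract one item per result block (selectors are placeholders)
        for row in response.xpath('//div[@class="result"]'):
            yield {
                'title': row.xpath('./h3/a/text()').get(),
                'link': row.xpath('./h3/a/@href').get(),
            }
        # Follow the "next page" link, if any; the new request re-enters the scheduler
        next_page = response.xpath('//a[@class="next"]/@href').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)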
Create a spider from a template
scrapy genspider -t crawl <spider_name> <domain>
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class TaobaoSpider(CrawlSpider):
    name = 'taobao'
    allowed_domains = ['www.taobao.com']
    start_urls = ['http://www.taobao.com/']

    '''
    Each Rule extracts the links that match its regular expression
    '''
    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    '''
    LinkExtractor: defines the link-extraction rules (regular expressions)
        allow=(),            : URLs that may be extracted
        restrict_xpaths=(),  : only extract links from the elements matched by these XPath expressions
        restrict_css=(),     : only extract links from the elements matched by these CSS selectors
        deny=(),             : URLs that must not be extracted (takes precedence over allow)
        allow_domains=(),    : domains from which links may be extracted
        deny_domains=(),     : domains from which links must not be extracted (takes precedence over allow_domains)
        unique=True,         : if the same URL is extracted several times, keep only one copy
        strip=True           : default True; leading and trailing whitespace is stripped from URLs
    '''

    '''
    Rule
        link_extractor,           : a LinkExtractor object
        callback=None,            : callback invoked for every matched response
        follow=None,              : whether to keep following links found in matched responses
        process_links=None,       : callback that can filter or modify every extracted link
        process_request=identity  : callback that can filter or modify every Request object
    '''

    # Note: never define a parse callback in a CrawlSpider - it would override the parent class method
    def parse_item(self, response):
        item = {}
        # item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
        # item['name'] = response.xpath('//div[@id="name"]').get()
        # item['description'] = response.xpath('//div[@id="description"]').get()
        return item
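To make the LinkExtractor and Rule parameters above concrete, here is a hedged sketch combining two rules (the site, URL pattern and XPath are made up for the example): one rule only follows pagination links, the other sends detail pages to parse_item.

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ExampleCrawlSpider(CrawlSpider):
    name = 'example_crawl'
    allowed_domains = ['www.example.com']
    start_urls = ['http://www.example.com/']

    rules = (
        # Follow pagination links found inside the pager element; no callback, just keep crawling
        Rule(LinkExtractor(restrict_xpaths='//div[@class="pager"]'), follow=True),
        # Hand every detail page to parse_item and keep following links found there
        Rule(LinkExtractor(allow=r'/item/\d+\.html'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        yield {'url': response.url, 'title': response.xpath('//title/text()').get()}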
The item pipeline component is a standalone Python class; its process_item() method must be implemented:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class BaiduPipeline(object):
    def __init__(self):
        # Initialise resources here, e.g. MySQL or MongoDB connection parameters
        pass

    def process_item(self, item, spider):
        """
        Process an item passed in by the spider
        :param item: the item object
        :param spider: the spider object
        :return:
        """
        return item

    def open_spider(self, spider):
        # Optional: called when the spider is opened
        pass

    def close_spider(self, spider):
        # Optional: called when the spider is closed; typically used to close MySQL/MongoDB connections
        pass
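As an example of what open_spider/close_spider are typically used for, here is a minimal sketch of a pipeline that writes items to MongoDB. It assumes pymongo is installed, and the MONGO_URI/MONGO_DB settings names are made up for this example, not built-in Scrapy settings.

import pymongo


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # MONGO_URI / MONGO_DB are assumed custom settings defined in settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI', 'mongodb://localhost:27017'),
            mongo_db=crawler.settings.get('MONGO_DB', 'scrapy_items'),
        )

    def open_spider(self, spider):
        # Open the connection once, when the spider starts
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        self.db['items'].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # Close the connection when the spider finishes
        self.client.close()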
settings.py configuration
# -*- coding: utf-8 -*-
# Scrapy settings for Baidu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'Baidu'
SPIDER_MODULES = ['Baidu.spiders']
NEWSPIDER_MODULE = 'Baidu.spiders'
LOG_FILE = "BaiduSpider.log"
LOG_LEVEL = "INFO"
FEED_EXPORT_ENCODING = 'utf-8'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'Baidu (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'Baidu.middlewares.BaiduSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'Baidu.middlewares.BaiduDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'Baidu.pipelines.BaiduPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# During debugging, read responses from the local cache first instead of re-sending every request
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
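Most of these values can also be overridden for a single spider through the custom_settings class attribute, which takes precedence over settings.py. A minimal sketch (the spider and the chosen values are arbitrary examples):

import scrapy


class SlowSpider(scrapy.Spider):
    name = 'slow'
    start_urls = ['http://www.example.com/']

    # Overrides the project-wide settings.py values for this spider only
    custom_settings = {
        'DOWNLOAD_DELAY': 3,
        'COOKIES_ENABLED': True,
    }

    def parse(self, response):
        pass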
Using Downloader Middleware
1. Set a random proxy
1.1 Add a pool of proxy IPs to settings.py
PROXIES = ['http://183.207.95.27:80', 'http://111.6.100.99:80', 'http://122.72.99.103:80',
'http://106.46.132.2:80', 'http://112.16.4.99:81', 'http://123.58.166.113:9000',
'http://118.178.124.33:3128', 'http://116.62.11.138:3128', 'http://121.42.176.133:3128',
'http://111.13.2.131:80', 'http://111.13.7.117:80', 'http://121.248.112.20:3128',
'http://112.5.56.108:3128', 'http://42.51.26.79:3128', 'http://183.232.65.201:3128',
'http://118.190.14.150:3128', 'http://123.57.221.41:3128', 'http://183.232.65.203:3128',
'http://166.111.77.32:3128', 'http://42.202.130.246:3128', 'http://122.228.25.97:8101',
'http://61.136.163.245:3128', 'http://121.40.23.227:3128', 'http://123.96.6.216:808',
'http://59.61.72.202:8080', 'http://114.141.166.242:80', 'http://61.136.163.246:3128',
'http://60.31.239.166:3128', 'http://114.55.31.115:3128', 'http://202.85.213.220:3128']
1.2 Add the following code to middlewares.py
import random

from scrapy import signals


class ProxyMiddleware(object):
    '''
    Downloader middleware that assigns a random proxy to every request
    '''
    def __init__(self, ip):
        self.ip = ip

    @classmethod
    def from_crawler(cls, crawler):
        # Read the PROXIES list defined in settings.py
        return cls(ip=crawler.settings.get('PROXIES'))

    def process_request(self, request, spider):
        # Pick a random proxy and attach it to the request
        ip = random.choice(self.ip)
        request.meta['proxy'] = ip
1.3 Finally, register the custom class in the downloader middleware settings (replace myproject with your actual project package, Baidu in this tutorial):
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.ProxyMiddleware': 543,
}
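One way to confirm that the proxies are really being used is a throwaway spider against an IP-echo endpoint; a minimal sketch (httpbin.org/ip simply returns the caller's IP address):

import scrapy


class ProxyCheckSpider(scrapy.Spider):
    name = 'proxy_check'
    start_urls = ['http://httpbin.org/ip']

    def parse(self, response):
        # The "origin" field should show one of the PROXIES, not your own IP
        self.logger.info('Downloaded through: %s', response.text)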
2. Set a random User-Agent
2.1 Add a pool of User-Agent strings to settings.py
MY_USER_AGENT = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
]
2.2 Add the following code to middlewares.py
import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class MyUserAgentMiddleware(UserAgentMiddleware):
    '''
    Downloader middleware that assigns a random User-Agent to every request
    '''
    def __init__(self, user_agent):
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        # Read the MY_USER_AGENT list defined in settings.py
        return cls(
            user_agent=crawler.settings.get('MY_USER_AGENT')
        )

    def process_request(self, request, spider):
        agent = random.choice(self.user_agent)
        request.headers['User-Agent'] = agent
2.3 Finally, add the custom MyUserAgentMiddleware class to DOWNLOADER_MIDDLEWARES and disable the built-in UserAgentMiddleware:
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'myproject.middlewares.MyUserAgentMiddleware': 400,
}
The Crawler object
settings  # the crawler's settings manager
    crawler.settings.get(name)
    set(name, value, priority='project')
    setdict(values, priority='project')
    setmodule(module, priority='project')
    get(name, default=None)
    getbool(name, default=False)
    getint(name, default=0)
    getfloat(name, default=0.0)
    getlist(name, default=None)
    getdict(name, default=None)
    copy()  # deep copy of the current settings
    freeze()
    frozencopy()
signals  # the crawler's signal manager
    crawler.signals.connect(receiver, signal)
    connect(receiver, signal)
    send_catch_log(signal, **kwargs)
    send_catch_log_deferred(signal, **kwargs)
    disconnect(receiver, signal)
    disconnect_all(signal)
stats  # the crawler's stats collector
    crawler.stats.get_value()
    get_value(key, default=None)
    get_stats()
    set_value(key, value)
    set_stats(stats)
    inc_value(key, count=1, start=0)
    max_value(key, value)
    min_value(key, value)
    clear_stats()
    open_spider(spider)
    close_spider(spider)
extensions  # the extension manager, keeps track of all enabled extensions
engine  # the execution engine, coordinates the crawler's core logic: scheduling, downloading and the spider
spider  # the spider currently being crawled; an instance of the spider class supplied when the crawler was created
crawl(*args, **kwargs)  # instantiates the spider class, starts the execution engine and starts the crawler
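A minimal sketch tying these pieces together in a custom extension (the extension name and the custom stat key are invented for the example): from_crawler receives the Crawler, registers a handler through crawler.signals and reads/writes values through crawler.stats. It would be enabled via the EXTENSIONS setting, e.g. {'Baidu.extensions.SpiderStatsExtension': 500}.

from scrapy import signals


class SpiderStatsExtension(object):
    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls(crawler.stats)
        # Register a callback on the spider_closed signal through the signal manager
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext

    def spider_closed(self, spider):
        # Read a built-in stat collected by Scrapy and record a custom one
        item_count = self.stats.get_value('item_scraped_count', 0)
        self.stats.set_value('custom/items_seen', item_count)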
Built-in Scrapy signals
engine_started       # the engine has started
engine_stopped       # the engine has stopped
spider_opened        # a spider has been opened
spider_idle          # a spider has gone idle
spider_closed        # a spider has been closed
spider_error         # a spider callback raised an error
request_scheduled    # the engine scheduled a Request
request_dropped      # the engine dropped a Request
response_received    # the engine received a new Response from the downloader
response_downloaded  # an HTTPResponse has been downloaded
item_scraped         # an item passed through all Item Pipelines without being dropped
item_dropped         # an item was dropped by raising DropItem in a pipeline
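As a hedged example of reacting to two of these signals from inside a spider (the spider, site and log messages are illustrative only), from_crawler is the usual place to connect handlers:

import scrapy
from scrapy import signals


class SignalDemoSpider(scrapy.Spider):
    name = 'signal_demo'
    start_urls = ['http://www.example.com/']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(SignalDemoSpider, cls).from_crawler(crawler, *args, **kwargs)
        # Subscribe to two of the built-in signals listed above
        crawler.signals.connect(spider.handle_spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(spider.handle_item_scraped, signal=signals.item_scraped)
        return spider

    def handle_spider_closed(self, spider, reason):
        self.logger.info('Spider closed (%s)', reason)

    def handle_item_scraped(self, item, response, spider):
        self.logger.info('Item passed all pipelines: %r', item)

    def parse(self, response):
        pass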