scrapy爬蟲的搭建過程(實戰篇)
1. 爬蟲功能
- 以 http://bbs.fengniao.com/forum/forum_125_1_lastpost.html 爲起始頁,爬取前十頁的信息,包括文章的標題、鏈接地址和圖片地址,保存到mongodb中。並下載對應的圖片到本地目錄。
2. 環境
- 系統:win7
- Scrapy 1.4.0
- mongodb v3.2
- python 3.6.1
3. 代碼
3.1. 創建爬蟲項目
# 第一步,進入需要放置爬蟲代碼的位置,下圖中指定目錄爲:E:\myScrapyCode
scrapy startproject fengniao #創建一個爬蟲項目fengniao
cd fengniao #進入到爬蟲項目目錄
scrapy genspider fengniaoClawer fengniao.com #創建一個具體的爬蟲fengniaoClawer, 並初始化域名
3.2. 代碼結構
3.3. 詳細代碼
- fengniaoClawer.py
# 文件:fengniaoClawer.py
# -*- coding: utf-8 -*-
import scrapy
from fengniao.items import FengniaoItem
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import TimeoutError, TCPTimedOutError, DNSLookupError, ConnectionRefusedError
class FengniaoclawerSpider(scrapy.Spider):
    """Crawl the first ten pages of the Fengniao forum listing.

    Yields one `FengniaoItem` per article (title, link, preview-image style
    attributes) and small diagnostic dict items when a page is lost or a
    request errors out.
    """

    name = 'fengniaoClawer'              # spider name passed to `scrapy crawl`
    allowed_domains = ['fengniao.com']   # URLs outside this domain are discarded

    # Manual retry budget: a response can come back with status 200 yet carry
    # only part of the page, so such responses are re-enqueued by hand.  The
    # request depth (incremented on every re-enqueue) acts as the counter.
    manualRetry = 8

    # Per-spider settings; these override settings.py.
    custom_settings = {
        'LOG_LEVEL': 'DEBUG',       # log verbosity
        'DOWNLOAD_DELAY': 0,        # no artificial delay between downloads
        'COOKIES_ENABLED': False,   # enabled by default
        'DEFAULT_REQUEST_HEADERS': {
            # 'Host': 'www.fengniao.com',
            'Referer': 'https://www.fengniao.com',
        },
        # Item pipelines, entered in ascending priority order.
        'ITEM_PIPELINES': {
            'fengniao.pipelines.ImagePipeline': 100,
            'fengniao.pipelines.FengniaoPipeline': 300,
        },
        # Image download settings.
        'IMAGES_STORE': 'fengniaoPhoto',  # directory, created if missing
        'IMAGES_EXPIRES': 90,             # days before an existing image is re-downloaded
        'IMAGES_MIN_HEIGHT': 100,         # images shorter than this are skipped
        'IMAGES_MIN_WIDTH': 100,          # images narrower than this are skipped
        # Downloader middlewares, entered in ascending priority order.
        'DOWNLOADER_MIDDLEWARES': {
            'fengniao.middlewares.ProxiesMiddleware': 400,
            'fengniao.middlewares.HeadersMiddleware': 543,
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
        },
        # Breadth-first relative to start_urls (local BFS, bounded by CONCURRENT_REQUESTS).
        'DEPTH_PRIORITY': 1,
        'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue',
        'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue',
        'REDIRECT_PRIORITY_ADJUST': 2,   # Default: +2
        'RETRY_PRIORITY_ADJUST': -1,     # Default: -1
        'RETRY_TIMES': 8,                # automatic retry count
        # Default: 2, can also be specified per-request using max_retry_times attribute of Request.meta
        'DOWNLOAD_TIMEOUT': 30,
        # This timeout can be set per spider using download_timeout spider attribute and per-request using download_timeout Request.meta key
        # 'DUPEFILTER_CLASS': "scrapy_redis.dupefilter.RFPDupeFilter",
        # 'SCHEDULER': "scrapy_redis.scheduler.Scheduler",
        # 'SCHEDULER_PERSIST': False,  # Don't cleanup redis queues, allows to pause/resume crawls.
        # Concurrency settings; tune against the target site, bandwidth and proxy pool.
        'CONCURRENT_REQUESTS': 110,       # default 16; max concurrent downloader requests
        # 'CONCURRENT_REQUESTS_PER_DOMAIN': 15,  # default 8; per-site concurrency cap
        'CONCURRENT_REQUESTS_PER_IP': 5,  # default 0; non-zero means the limit is per IP, not per domain
        'REACTOR_THREADPOOL_MAXSIZE': 20, # default 10
        # Crawl depth limit, relative to start_url.
        # NOTE: must exceed the retry depth, otherwise a request that exhausts
        # its retries also hits max depth and is silently dropped.
        'DEPTH_LIMIT': 10,
    }

    def start_requests(self):
        """Issue the very first request: page 1 of the forum listing."""
        startUrl = 'http://bbs.fengniao.com/forum/forum_125_1_lastpost.html'
        pageNum = 0
        yield scrapy.Request(
            url=startUrl,
            meta={
                'dont_redirect': True,
                'pageNum': pageNum,
            },
            callback=self.parseArticle,  # handler for the response
            errback=self.error,
        )

    def parseArticle(self, response):
        """Extract article items from a listing page and schedule pages 2-10."""
        self.logger.info(f"parseArticle: url = {response.url}, status = {response.status}, meta = {response.meta}")
        # print(f"parseArticle: text = {response.text}")
        # All article entries on this page.
        articleLst = response.xpath("//ul[@class='txtList']/li")
        # A non-empty list means the expected part of the page actually arrived.
        if articleLst:
            # Extract one item per article.
            for article in articleLst:
                # Initialise the item as declared in items.py.
                articleItem = FengniaoItem()
                articleItem['itemType'] = 'articleInfo'
                # Article title.
                articleItem['title'] = article.xpath("./h3/a/text()").extract()
                # Article link.
                articleItem['href'] = article.xpath("./h3/a/@href").extract()
                # Style attributes carrying the preview-image URLs.
                articleItem['picLst'] = article.xpath("./div[@class='picList']//a//@style").extract()
                # Hand the item over to the pipelines.
                self.logger.info(f"parseArticle: articleItem = {articleItem}")
                yield articleItem
            # Schedule the remaining listing pages.  Pages 2..10 inclusive give
            # ten pages in total counting the start page; the previous
            # range(2, 10) stopped at page 9 (only nine pages).  Duplicate
            # scheduling across responses is handled by the dupefilter.
            for pageNum in range(2, 11):
                # Build links of the form:
                # http://bbs.fengniao.com/forum/forum_125_<n>_lastpost.html
                nextUrl = "http://bbs.fengniao.com/forum/forum_125_" + str(pageNum) + "_lastpost.html"
                yield scrapy.Request(
                    url=nextUrl,
                    meta={'dont_redirect': True, 'pageNum': pageNum},
                    callback=self.parseArticle,
                    errback=self.error,
                )
        # A 200 response can still be truncated; re-enqueue it manually until
        # the retry budget is exhausted.
        elif response.meta['depth'] < self.manualRetry:
            request = response.request
            request.dont_filter = True
            yield request
        else:
            yield {'url': response.url, 'itemType': 'getPageLost'}  # diagnostic item, for logging only

    def error(self, failure):
        """Errback: classify the failure and emit a diagnostic item."""
        if failure.check(HttpError):
            response = failure.value.response
            # HTTP errors also get the manual retry treatment first.
            if response.meta['depth'] < self.manualRetry:
                failure.request.dont_filter = True
                yield failure.request
            else:
                yield {
                    'url': response.url,
                    'itemType': 'error',
                    'errorType': 'HttpError',
                    'depth': response.meta['depth'],
                    'priority': response.request.priority,
                    'status': response.status,
                    'callback': response.request.callback.__name__
                }  # diagnostic item, for logging only
        elif failure.check(TimeoutError, TCPTimedOutError, ConnectionRefusedError, DNSLookupError):
            request = failure.request
            yield {
                'url': request.url,
                'itemType': 'error',
                'errorType': 'TimeoutError',
                'priority': request.priority,
                'callback': request.callback.__name__
            }  # diagnostic item; only reached after the final timeout
        else:
            request = failure.request
            yield {
                'url': request.url,
                'itemType': 'error',
                'errorType': 'UnknownError',
                'priority': request.priority,
                'callback': request.callback.__name__
            }  # diagnostic item, for logging only

    def closed(self, reason):
        """Cleanup hook when the crawl ends (a good place to send a mail)."""
        self.logger.info(f"closed: spider finished, reason = {reason}")
- items.py
# 文件:items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
# 定義格式化數據
# 其實也可以不定義,因爲最終送往mongodb的數據是字典格式就行
# 定義的目的應該是有兩方面:
# 1. 方便開發人員清晰的知道要解析哪些字段,避免遺漏
# 2. 避免填充其他的字段,造成數據混亂,因爲如果不是這個裏面定義的字段,是無法賦值的
class FengniaoItem(scrapy.Item):
    """Container for one scraped forum article and its image bookkeeping."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    _id = scrapy.Field()           # even though mongodb adds _id itself, it must be declared here
    itemType = scrapy.Field()      # discriminator: 'articleInfo' vs diagnostic items
    title = scrapy.Field()         # article title(s)
    href = scrapy.Field()          # article link(s)
    picLst = scrapy.Field()        # raw style attributes containing preview-image URLs
    imagePathLst = scrapy.Field()  # local file names of the downloaded images
- dictionary.py
# 文件:dictionary.py
# 瀏覽器頭信息
# Pool of browser User-Agent strings; HeadersMiddleware picks one at random
# per request to mimic real browsers.
useragent = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400) ',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE) ',
    'Mozilla/2.02E (Win95; U)',
    'Mozilla/3.01Gold (Win95; I)',
    'Mozilla/4.8 [en] (Windows NT 5.1; U)',
    'Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) Gecko Netscape/7.1 (ax)',
    'Opera/7.50 (Windows XP; U)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.2; WOW64; Trident/5.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0',
    'Opera/9.25 (Windows NT 6.0; U; en)',
    'Opera/9.80 (Windows NT 5.2; U; en) Presto/2.2.15 Version/10.10',
    'Opera/9.80 (Windows NT 5.1; U; ru) Presto/2.7.39 Version/11.00',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.7.62 Version/11.01',
]
- middlewares.py
# 文件:middlewares.py
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
import random
from fengniao.dictionary import useragent
# 添加user-agent信息,模擬瀏覽器
class HeadersMiddleware:
    """Downloader middleware that rotates the User-Agent header per request."""

    def process_request(self, request, spider):
        # Pick a random browser signature so requests look less bot-like.
        chosen_agent = random.choice(useragent)
        request.headers['User-Agent'] = chosen_agent
# 添加代理,突破反爬機制
class ProxiesMiddleware:
    """Downloader middleware that routes requests through a paid proxy to
    defeat anti-crawling measures.

    NOTE(review): the proxy credentials are hard-coded below; consider moving
    them into settings.py or an environment variable.
    """

    def process_request(self, request, spider):
        # Requests explicitly flagged via meta['proxyFlag'] bypass the proxy.
        if request.meta.get('proxyFlag'):
            return
        # Abuyun proxy endpoint, user:password@host:port.
        request.meta['proxy'] = 'http://FHK87H210JK29JHH:[email protected]:9020'
class FengniaoSpiderMiddleware(object):
    """Project spider middleware (scrapy template): a pure pass-through.

    Not all methods need to be defined; scrapy acts as if an undefined
    method does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy uses this factory to create the middleware and wire signals.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        # Called for each response entering the spider; None means "continue".
        return None

    def process_spider_output(self, response, result, spider):
        # Forward whatever the spider produced, unchanged.
        # Must yield Request, dict or Item objects.
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # No special handling; returning None lets scrapy's defaults apply.
        pass

    def process_start_requests(self, start_requests, spider):
        # Like process_spider_output, but for the start requests (no response
        # is associated). Must yield requests only, never items.
        yield from start_requests

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
- pipelines.py
# 文件:pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
import re
# 用於將結構化數據,存儲進mongodb
class FengniaoPipeline(object):
    """Persist structured article items into mongodb; drop everything else."""

    def __init__(self, mongoUrl, mongoDB):
        self.mongo_url = mongoUrl
        self.mongo_db = mongoDB

    @classmethod
    def from_crawler(cls, crawler):
        # Built-in hook: pull the mongodb configuration from settings.py.
        return cls(
            mongoUrl=crawler.settings.get("MONGO_URI"),
            mongoDB=crawler.settings.get("MONGO_DB"),
        )

    def open_spider(self, spider):
        # Called once when the spider starts: open the mongodb connection.
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]
        self.db_table = self.db['fengniaoArticle']

    def process_item(self, item, spider):
        """Insert article items into mongodb and pass them on.

        Fix: the original fell through after the insert and implicitly
        returned None, which would hand None to any later pipeline; a
        pipeline must return the item (or raise DropItem).
        """
        # The spider object carries its own logger.
        spider.logger.info(f"FengniaoPipeline: item = {item}")
        if ("itemType" in item) and (item["itemType"] == "articleInfo"):
            try:
                insertRes = self.db_table.insert_one(item)
                spider.logger.info(f"FengniaoPipeline: insertRes = {insertRes.inserted_id}")
            except Exception as e:
                spider.logger.info(f"FengniaoPipeline: insertRes(fengniaoArticle) Exception = {e}")
            return item
        else:
            # Diagnostic items (errors, lost pages) end here; scrapy logs the drop.
            raise DropItem("fengniaoArticle record inserted!")

    def close_spider(self, spider):
        # Called once when the spider finishes: release the mongodb connection.
        self.client.close()
class ImagePipeline(ImagesPipeline):
    """Download the first preview image of each article via scrapy's images pipeline."""

    def get_media_requests(self, item, info):
        """Yield a download request for the first image URL found in picLst.

        When the downloads finish, the results are delivered to item_completed().
        Fix: the regex patterns are now raw strings; the originals contained
        invalid escape sequences ('\\(' in a non-raw literal), which is a
        SyntaxWarning on modern Python.
        """
        if 'picLst' in item:
            item["imagePathLst"] = []  # collects the local file names of downloaded images
            for pic in item['picLst']:
                # Style attribute looks like:
                # background-image:url(https://bbs.qn.img-space.com/201802/5/954d...be83.jpg?imageView2/2/w/400/q/90/ignore-error/1/)
                picUrlRe = re.search(r'background-image:url\((.*?)\?imageView', pic)
                if picUrlRe:
                    # The bare image URL, e.g.
                    # https://bbs.qn.img-space.com/201802/4/d094...56a5.jpg
                    picUrl = picUrlRe.group(1)
                    print(f"picUrl = {picUrl}")
                    # The file name: last path component, e.g. d094...56a5.jpg
                    picNameRe = re.search(r".*/(.*?)$", picUrl)
                    if picNameRe:
                        picName = picNameRe.group(1)
                        print(f"picName = {picName}")
                        # Hand the image URL to the downloader.
                        yield scrapy.Request(
                            url=picUrl,
                            meta={'picName': picName},
                        )
                        break  # only download the first image per article

    def file_path(self, request, response=None, info=None):
        """Store each image under its original name inside IMAGES_STORE."""
        image_guid = request.meta['picName']
        return f'{image_guid}'

    def item_completed(self, results, item, info):
        """Record the stored path of the (single) downloaded image on the item.

        results example:
        [(True, {'url': 'https://.../c9ab...0461.jpg', 'path': 'c9ab...0461.jpg', 'checksum': '...'})]
        """
        print(f"results = {results}")
        if results and results[0][0]:
            imagePath = results[0][1]['path']
            item["imagePathLst"].append(imagePath)
        return item
- settings.py
# 文件:settings.py
# -*- coding: utf-8 -*-
import datetime
# Scrapy settings for fengniao project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'fengniao'
SPIDER_MODULES = ['fengniao.spiders']
NEWSPIDER_MODULE = 'fengniao.spiders'
# MongoDB connection settings (consumed by FengniaoPipeline.from_crawler).
MONGO_URI = "localhost:27017"
MONGO_DB = 'fengniao'
# Log file name, stamped with today's date.
Date = datetime.datetime.now().strftime('%Y%m%d')
#LOG_FILE = f"fengniaoLog{Date}.txt"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'fengniao (+http://www.yourdomain.com)'
# Obey robots.txt rules -- honouring it makes most sites uncrawlable, so it is off.
# ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'fengniao.middlewares.FengniaoSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'fengniao.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'fengniao.pipelines.FengniaoPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
- main.py
# 文件:main.py
from scrapy import cmdline
# Resume-crawl variant: persists scheduler state under crawls/ so a paused
# crawl can be continued.
# cmdline.execute('scrapy crawl fengniaoClawer -s JOBDIR=crawls/storefengniaoClawer'.split())
cmdline.execute('scrapy crawl fengniaoClawer'.split())
- scrapy.cfg
# 文件:scrapy.cfg
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = fengniao.settings
[deploy]
#url = http://localhost:6800/
project = fengniao
3.4. 運行結果
數據庫
圖片下載