scrapy爬蟲的搭建過程(實戰篇)
1. 爬蟲功能
- 以 http://bbs.fengniao.com/forum/forum_125_1_lastpost.html 爲起始頁,爬取前十頁的信息,包括文章的標題、鏈接地址和圖片地址,保存到mongodb中。並下載對應的圖片到本地目錄。
2. 環境
- 系統:win7
- Scrapy 1.4.0
- mongodb v3.2
- python 3.6.1
3. 代碼
3.1. 創建爬蟲項目
# 第一步,進入需要放置爬蟲代碼的位置,下圖中指定目錄爲:E:\myScrapyCode
scrapy startproject fengniao #創建一個爬蟲項目fengniao
cd fengniao #進入到爬蟲項目目錄
scrapy genspider fengniaoClawer fengniao.com #創建一個具體的爬蟲fengniaoClawer, 並初始化域名
3.2. 代碼結構
3.3. 詳細代碼
- fengniaoClawer.py
# 文件:fengniaoClawer.py
# -*- coding: utf-8 -*-
import scrapy
from fengniao.items import FengniaoItem
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import TimeoutError, TCPTimedOutError, DNSLookupError, ConnectionRefusedError
class FengniaoclawerSpider(scrapy.Spider):
    """Crawl the first ten pages of the Fengniao forum listing.

    Yields one `FengniaoItem` per article (title, link, preview-image style
    attributes) and small diagnostic dict items when a page is lost or a
    request errors out.
    """

    name = 'fengniaoClawer'              # spider name passed to `scrapy crawl`
    allowed_domains = ['fengniao.com']   # URLs outside this domain are discarded

    # Manual retry budget: a response can come back with status 200 yet carry
    # only part of the page, so such responses are re-enqueued by hand.  The
    # request depth (incremented on every re-enqueue) acts as the counter.
    manualRetry = 8

    # Per-spider settings; these override settings.py.
    custom_settings = {
        'LOG_LEVEL': 'DEBUG',       # log verbosity
        'DOWNLOAD_DELAY': 0,        # no artificial delay between downloads
        'COOKIES_ENABLED': False,   # enabled by default
        'DEFAULT_REQUEST_HEADERS': {
            # 'Host': 'www.fengniao.com',
            'Referer': 'https://www.fengniao.com',
        },
        # Item pipelines, entered in ascending priority order.
        'ITEM_PIPELINES': {
            'fengniao.pipelines.ImagePipeline': 100,
            'fengniao.pipelines.FengniaoPipeline': 300,
        },
        # Image download settings.
        'IMAGES_STORE': 'fengniaoPhoto',  # directory, created if missing
        'IMAGES_EXPIRES': 90,             # days before an existing image is re-downloaded
        'IMAGES_MIN_HEIGHT': 100,         # images shorter than this are skipped
        'IMAGES_MIN_WIDTH': 100,          # images narrower than this are skipped
        # Downloader middlewares, entered in ascending priority order.
        'DOWNLOADER_MIDDLEWARES': {
            'fengniao.middlewares.ProxiesMiddleware': 400,
            'fengniao.middlewares.HeadersMiddleware': 543,
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
        },
        # Breadth-first relative to start_urls (local BFS, bounded by CONCURRENT_REQUESTS).
        'DEPTH_PRIORITY': 1,
        'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue',
        'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue',
        'REDIRECT_PRIORITY_ADJUST': 2,   # Default: +2
        'RETRY_PRIORITY_ADJUST': -1,     # Default: -1
        'RETRY_TIMES': 8,                # automatic retry count
        # Default: 2, can also be specified per-request using max_retry_times attribute of Request.meta
        'DOWNLOAD_TIMEOUT': 30,
        # This timeout can be set per spider using download_timeout spider attribute and per-request using download_timeout Request.meta key
        # 'DUPEFILTER_CLASS': "scrapy_redis.dupefilter.RFPDupeFilter",
        # 'SCHEDULER': "scrapy_redis.scheduler.Scheduler",
        # 'SCHEDULER_PERSIST': False,  # Don't cleanup redis queues, allows to pause/resume crawls.
        # Concurrency settings; tune against the target site, bandwidth and proxy pool.
        'CONCURRENT_REQUESTS': 110,       # default 16; max concurrent downloader requests
        # 'CONCURRENT_REQUESTS_PER_DOMAIN': 15,  # default 8; per-site concurrency cap
        'CONCURRENT_REQUESTS_PER_IP': 5,  # default 0; non-zero means the limit is per IP, not per domain
        'REACTOR_THREADPOOL_MAXSIZE': 20, # default 10
        # Crawl depth limit, relative to start_url.
        # NOTE: must exceed the retry depth, otherwise a request that exhausts
        # its retries also hits max depth and is silently dropped.
        'DEPTH_LIMIT': 10,
    }

    def start_requests(self):
        """Issue the very first request: page 1 of the forum listing."""
        startUrl = 'http://bbs.fengniao.com/forum/forum_125_1_lastpost.html'
        pageNum = 0
        yield scrapy.Request(
            url=startUrl,
            meta={
                'dont_redirect': True,
                'pageNum': pageNum,
            },
            callback=self.parseArticle,  # handler for the response
            errback=self.error,
        )

    def parseArticle(self, response):
        """Extract article items from a listing page and schedule pages 2-10."""
        self.logger.info(f"parseArticle: url = {response.url}, status = {response.status}, meta = {response.meta}")
        # print(f"parseArticle: text = {response.text}")
        # All article entries on this page.
        articleLst = response.xpath("//ul[@class='txtList']/li")
        # A non-empty list means the expected part of the page actually arrived.
        if articleLst:
            # Extract one item per article.
            for article in articleLst:
                # Initialise the item as declared in items.py.
                articleItem = FengniaoItem()
                articleItem['itemType'] = 'articleInfo'
                # Article title.
                articleItem['title'] = article.xpath("./h3/a/text()").extract()
                # Article link.
                articleItem['href'] = article.xpath("./h3/a/@href").extract()
                # Style attributes carrying the preview-image URLs.
                articleItem['picLst'] = article.xpath("./div[@class='picList']//a//@style").extract()
                # Hand the item over to the pipelines.
                self.logger.info(f"parseArticle: articleItem = {articleItem}")
                yield articleItem
            # Schedule the remaining listing pages.  Pages 2..10 inclusive give
            # ten pages in total counting the start page; the previous
            # range(2, 10) stopped at page 9 (only nine pages).  Duplicate
            # scheduling across responses is handled by the dupefilter.
            for pageNum in range(2, 11):
                # Build links of the form:
                # http://bbs.fengniao.com/forum/forum_125_<n>_lastpost.html
                nextUrl = "http://bbs.fengniao.com/forum/forum_125_" + str(pageNum) + "_lastpost.html"
                yield scrapy.Request(
                    url=nextUrl,
                    meta={'dont_redirect': True, 'pageNum': pageNum},
                    callback=self.parseArticle,
                    errback=self.error,
                )
        # A 200 response can still be truncated; re-enqueue it manually until
        # the retry budget is exhausted.
        elif response.meta['depth'] < self.manualRetry:
            request = response.request
            request.dont_filter = True
            yield request
        else:
            yield {'url': response.url, 'itemType': 'getPageLost'}  # diagnostic item, for logging only

    def error(self, failure):
        """Errback: classify the failure and emit a diagnostic item."""
        if failure.check(HttpError):
            response = failure.value.response
            # HTTP errors also get the manual retry treatment first.
            if response.meta['depth'] < self.manualRetry:
                failure.request.dont_filter = True
                yield failure.request
            else:
                yield {
                    'url': response.url,
                    'itemType': 'error',
                    'errorType': 'HttpError',
                    'depth': response.meta['depth'],
                    'priority': response.request.priority,
                    'status': response.status,
                    'callback': response.request.callback.__name__
                }  # diagnostic item, for logging only
        elif failure.check(TimeoutError, TCPTimedOutError, ConnectionRefusedError, DNSLookupError):
            request = failure.request
            yield {
                'url': request.url,
                'itemType': 'error',
                'errorType': 'TimeoutError',
                'priority': request.priority,
                'callback': request.callback.__name__
            }  # diagnostic item; only reached after the final timeout
        else:
            request = failure.request
            yield {
                'url': request.url,
                'itemType': 'error',
                'errorType': 'UnknownError',
                'priority': request.priority,
                'callback': request.callback.__name__
            }  # diagnostic item, for logging only

    def closed(self, reason):
        """Cleanup hook when the crawl ends (a good place to send a mail)."""
        self.logger.info(f"closed: spider finished, reason = {reason}")
- items.py
# 文件:items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
# 定義格式化數據
# 其實也可以不定義,因爲最終送往mongodb的數據是字典格式就行
# 定義的目的應該是有兩方面:
# 1. 方便開發人員清晰的知道要解析哪些字段,避免遺漏
# 2. 避免填充其他的字段,造成數據混亂,因爲如果不是這個裏面定義的字段,是無法賦值的
class FengniaoItem(scrapy.Item):
    """Container for one scraped forum article and its image bookkeeping."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    _id = scrapy.Field()           # even though mongodb adds _id itself, it must be declared here
    itemType = scrapy.Field()      # discriminator: 'articleInfo' vs diagnostic items
    title = scrapy.Field()         # article title(s)
    href = scrapy.Field()          # article link(s)
    picLst = scrapy.Field()        # raw style attributes containing preview-image URLs
    imagePathLst = scrapy.Field()  # local file names of the downloaded images
- dictionary.py
# 文件:dictionary.py
# 瀏覽器頭信息
# Pool of browser User-Agent strings; HeadersMiddleware picks one at random
# per request to mimic real browsers.
useragent = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400) ',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE) ',
    'Mozilla/2.02E (Win95; U)',
    'Mozilla/3.01Gold (Win95; I)',
    'Mozilla/4.8 [en] (Windows NT 5.1; U)',
    'Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) Gecko Netscape/7.1 (ax)',
    'Opera/7.50 (Windows XP; U)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.2; WOW64; Trident/5.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0',
    'Opera/9.25 (Windows NT 6.0; U; en)',
    'Opera/9.80 (Windows NT 5.2; U; en) Presto/2.2.15 Version/10.10',
    'Opera/9.80 (Windows NT 5.1; U; ru) Presto/2.7.39 Version/11.00',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.7.62 Version/11.01',
]
- middlewares.py
# 文件:middlewares.py
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
import random
from fengniao.dictionary import useragent
# 添加user-agent信息,模擬瀏覽器
class HeadersMiddleware:
    """Downloader middleware that rotates the User-Agent header per request."""

    def process_request(self, request, spider):
        # Pick a random browser signature so requests look less bot-like.
        chosen_agent = random.choice(useragent)
        request.headers['User-Agent'] = chosen_agent
# 添加代理,突破反爬機制
class ProxiesMiddleware:
    """Downloader middleware that routes requests through a paid proxy to
    defeat anti-crawling measures.

    NOTE(review): the proxy credentials are hard-coded below; consider moving
    them into settings.py or an environment variable.
    """

    def process_request(self, request, spider):
        # Requests explicitly flagged via meta['proxyFlag'] bypass the proxy.
        if request.meta.get('proxyFlag'):
            return
        # Abuyun proxy endpoint, user:password@host:port.
        request.meta['proxy'] = 'http://FHK87H210JK29JHH:[email protected]:9020'
class FengniaoSpiderMiddleware(object):
    """Project spider middleware (scrapy template): a pure pass-through.

    Not all methods need to be defined; scrapy acts as if an undefined
    method does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy uses this factory to create the middleware and wire signals.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        # Called for each response entering the spider; None means "continue".
        return None

    def process_spider_output(self, response, result, spider):
        # Forward whatever the spider produced, unchanged.
        # Must yield Request, dict or Item objects.
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # No special handling; returning None lets scrapy's defaults apply.
        pass

    def process_start_requests(self, start_requests, spider):
        # Like process_spider_output, but for the start requests (no response
        # is associated). Must yield requests only, never items.
        yield from start_requests

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
- pipelines.py
# 文件:pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
import re
# 用於將結構化數據,存儲進mongodb
class FengniaoPipeline(object):
    """Persist structured article items into mongodb; drop everything else."""

    def __init__(self, mongoUrl, mongoDB):
        self.mongo_url = mongoUrl
        self.mongo_db = mongoDB

    @classmethod
    def from_crawler(cls, crawler):
        # Built-in hook: pull the mongodb configuration from settings.py.
        return cls(
            mongoUrl=crawler.settings.get("MONGO_URI"),
            mongoDB=crawler.settings.get("MONGO_DB"),
        )

    def open_spider(self, spider):
        # Called once when the spider starts: open the mongodb connection.
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]
        self.db_table = self.db['fengniaoArticle']

    def process_item(self, item, spider):
        """Insert article items into mongodb and pass them on.

        Fix: the original fell through after the insert and implicitly
        returned None, which would hand None to any later pipeline; a
        pipeline must return the item (or raise DropItem).
        """
        # The spider object carries its own logger.
        spider.logger.info(f"FengniaoPipeline: item = {item}")
        if ("itemType" in item) and (item["itemType"] == "articleInfo"):
            try:
                insertRes = self.db_table.insert_one(item)
                spider.logger.info(f"FengniaoPipeline: insertRes = {insertRes.inserted_id}")
            except Exception as e:
                spider.logger.info(f"FengniaoPipeline: insertRes(fengniaoArticle) Exception = {e}")
            return item
        else:
            # Diagnostic items (errors, lost pages) end here; scrapy logs the drop.
            raise DropItem("fengniaoArticle record inserted!")

    def close_spider(self, spider):
        # Called once when the spider finishes: release the mongodb connection.
        self.client.close()
class ImagePipeline(ImagesPipeline):
    """Download the first preview image of each article via scrapy's images pipeline."""

    def get_media_requests(self, item, info):
        """Yield a download request for the first image URL found in picLst.

        When the downloads finish, the results are delivered to item_completed().
        Fix: the regex patterns are now raw strings; the originals contained
        invalid escape sequences ('\\(' in a non-raw literal), which is a
        SyntaxWarning on modern Python.
        """
        if 'picLst' in item:
            item["imagePathLst"] = []  # collects the local file names of downloaded images
            for pic in item['picLst']:
                # Style attribute looks like:
                # background-image:url(https://bbs.qn.img-space.com/201802/5/954d...be83.jpg?imageView2/2/w/400/q/90/ignore-error/1/)
                picUrlRe = re.search(r'background-image:url\((.*?)\?imageView', pic)
                if picUrlRe:
                    # The bare image URL, e.g.
                    # https://bbs.qn.img-space.com/201802/4/d094...56a5.jpg
                    picUrl = picUrlRe.group(1)
                    print(f"picUrl = {picUrl}")
                    # The file name: last path component, e.g. d094...56a5.jpg
                    picNameRe = re.search(r".*/(.*?)$", picUrl)
                    if picNameRe:
                        picName = picNameRe.group(1)
                        print(f"picName = {picName}")
                        # Hand the image URL to the downloader.
                        yield scrapy.Request(
                            url=picUrl,
                            meta={'picName': picName},
                        )
                        break  # only download the first image per article

    def file_path(self, request, response=None, info=None):
        """Store each image under its original name inside IMAGES_STORE."""
        image_guid = request.meta['picName']
        return f'{image_guid}'

    def item_completed(self, results, item, info):
        """Record the stored path of the (single) downloaded image on the item.

        results example:
        [(True, {'url': 'https://.../c9ab...0461.jpg', 'path': 'c9ab...0461.jpg', 'checksum': '...'})]
        """
        print(f"results = {results}")
        if results and results[0][0]:
            imagePath = results[0][1]['path']
            item["imagePathLst"].append(imagePath)
        return item
- settings.py
# 文件:settings.py
# -*- coding: utf-8 -*-
import datetime
# Scrapy settings for fengniao project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'fengniao'
SPIDER_MODULES = ['fengniao.spiders']
NEWSPIDER_MODULE = 'fengniao.spiders'
# MongoDB connection settings (consumed by FengniaoPipeline.from_crawler).
MONGO_URI = "localhost:27017"
MONGO_DB = 'fengniao'
# Log file name, stamped with today's date.
Date = datetime.datetime.now().strftime('%Y%m%d')
#LOG_FILE = f"fengniaoLog{Date}.txt"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'fengniao (+http://www.yourdomain.com)'
# Obey robots.txt rules -- honouring it makes most sites uncrawlable, so it is off.
# ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'fengniao.middlewares.FengniaoSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'fengniao.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'fengniao.pipelines.FengniaoPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
- main.py
# 文件:main.py
from scrapy import cmdline
# Resume-crawl variant: persists scheduler state under crawls/ so a paused
# crawl can be continued.
# cmdline.execute('scrapy crawl fengniaoClawer -s JOBDIR=crawls/storefengniaoClawer'.split())
cmdline.execute('scrapy crawl fengniaoClawer'.split())
- scrapy.cfg
# 文件:scrapy.cfg
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = fengniao.settings
[deploy]
#url = http://localhost:6800/
project = fengniao
3.4. 運行結果
數據庫
圖片下載