爬取目標:爬取360圖片前50頁,並實現本地CSV格式保存和MongoDB保存數據;下載支持異步和多線程,圖片下載通過繼承並重寫Scrapy內置的ImagesPipeline類實現,需理解好Item Pipeline組件
重點內容
第一次運行報錯:
報錯 ImportError: No module named 'PIL'
pip install -i https://pypi.douban.com/simple pillow 安裝模塊組件
注意爬取的url是Json格式,在XHR裏可以查看
注意爬取的目標地址是:https://image.so.com/z?ch=photography
例如:
Request URL:
https://image.so.com/zj?ch=photography&sn=30&listtype=new&temp=1
base_url要取的是Ajax接口地址 https://image.so.com/zj? ,而不是目標頁面地址 https://image.so.com/z? 。這個問題我被坑過:之前爬取的頁面都不是Ajax加載的,後續需要注意
上主要代碼:
images.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Spider, Request
from urllib.parse import urlencode
import json
from images360.items import ImageItem
class ImagesSpider(scrapy.Spider):
    """Spider that pages through the 360 image JSON endpoint.

    ``start_requests`` builds one request per page against
    ``http://image.so.com/zj?`` and ``parse`` turns every entry of the
    returned ``list`` array into an ``ImageItem``.
    """

    name = 'images'
    # Every request built in start_requests targets image.so.com, so that
    # host must be allowed; the original images.so.com entry is kept too.
    allowed_domains = ['image.so.com', 'images.so.com']
    # Not used for crawling because start_requests is overridden below.
    start_urls = ['http://images.so.com/']

    def parse(self, response):
        """Yield one ImageItem per entry of the ``list`` key in the JSON body.

        Args:
            response: response whose ``text`` is a JSON document.

        Yields:
            ImageItem with ``id``, ``url``, ``title`` and ``thumb`` filled
            from the entry's ``imageid``, ``qhimg_url``, ``group_title``
            and ``qhimg_thumb_url`` keys.
        """
        result = json.loads(response.text)
        # A missing or null 'list' yields nothing instead of raising
        # TypeError when iterating None.
        for image in result.get('list') or []:
            item = ImageItem()
            item['id'] = image.get('imageid')
            item['url'] = image.get('qhimg_url')
            item['title'] = image.get('group_title')
            item['thumb'] = image.get('qhimg_thumb_url')
            yield item

    def start_requests(self):
        """Yield one request per page, from page 1 up to the MAX_PAGE setting.

        Yields:
            Request for ``base_url`` plus the url-encoded query, with
            ``self.parse`` as callback.
        """
        data = {'ch': 'photography', 'listtype': 'new'}
        base_url = 'http://image.so.com/zj?'
        # getint copes with MAX_PAGE arriving as a string (e.g. set with -s
        # on the command line), which would break the arithmetic below.
        max_page = self.settings.getint('MAX_PAGE')
        # NOTE(review): sn starts at 30, so sn=0 is never requested --
        # confirm that skipping it is intended.
        for page in range(1, max_page + 1):
            data['sn'] = page * 30
            # urlencode (from urllib.parse) turns the dict into GET parameters.
            params = urlencode(data)
            url = base_url + params
            yield Request(url, self.parse)
#需要在settings.py中設置ROBOTSTXT_OBEY = False
items.py
from scrapy import Item, Field
class ImageItem(Item):
    """Item holding the fields scraped for one image."""

    # Plain class attribute (not a Field): a collection name that can be
    # read as ``item.collection``.
    collection = 'images'

    id = Field()
    url = Field()
    title = Field()
    # Thumbnail URL.
    thumb = Field()
pipelines.py
import pymongo
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
class MongoPipeline(object):
    """Item pipeline that stores every item in MongoDB.

    The target collection is taken from the item's ``collection`` attribute.
    """

    def __init__(self, mongo_uri, mongo_db):
        """Remember the connection parameters; no connection is opened yet.

        Args:
            mongo_uri: value passed to ``pymongo.MongoClient``.
            mongo_db: name of the database to use.
        """
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the MONGO_URI and MONGO_DB settings."""
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        """Create the client and select the database."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        """Insert the item as a document and return it unchanged."""
        name = item.collection
        # insert_one replaces Collection.insert, which was removed in
        # PyMongo 4.
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the client created in open_spider."""
        self.client.close()
class ImagePipeline(ImagesPipeline):
    """ImagesPipeline subclass that names files after the URL's last segment
    and drops items for which no image was downloaded."""

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the text after the last '/' of the request URL.

        ``item`` is keyword-only and optional, so the method works whether
        or not the caller supplies it.
        """
        url = request.url
        file_name = url.split('/')[-1]
        return file_name

    def item_completed(self, results, item, info):
        """Return the item if at least one result succeeded.

        Args:
            results: iterable of ``(ok, x)`` pairs; ``x['path']`` is read
                for every pair whose ``ok`` is truthy.
            item: the item the results belong to.
            info: unused here.

        Raises:
            DropItem: when no pair in ``results`` succeeded.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Downloaded Failed')
        return item

    def get_media_requests(self, item, info):
        """Yield a single Request for the item's ``url`` field."""
        yield Request(item['url'])
在settings.py添加
# Per the author's note, the crawl does not work unless this is disabled.
ROBOTSTXT_OBEY = False

# One entry per pipeline in use.
ITEM_PIPELINES = {
    'images360.pipelines.ImagePipeline': 300,
    'images360.pipelines.MongoPipeline': 301,
}

IMAGES_STORE = './images'
MAX_PAGE = 50
MONGO_URI = 'localhost'
MONGO_DB = 'images360'
pipelines.py
import pymongo
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
class MongoPipeline(object):
    """Item pipeline that stores every item in MongoDB.

    The target collection is taken from the item's ``collection`` attribute.
    """

    def __init__(self, mongo_uri, mongo_db):
        """Remember the connection parameters; no connection is opened yet.

        Args:
            mongo_uri: value passed to ``pymongo.MongoClient``.
            mongo_db: name of the database to use.
        """
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the MONGO_URI and MONGO_DB settings."""
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        """Create the client and select the database."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        """Insert the item as a document and return it unchanged."""
        name = item.collection
        # insert_one replaces Collection.insert, which was removed in
        # PyMongo 4.
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the client created in open_spider."""
        self.client.close()
class ImagePipeline(ImagesPipeline):
    """ImagesPipeline subclass that names files after the URL's last segment
    and drops items for which no image was downloaded."""

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the text after the last '/' of the request URL.

        ``item`` is keyword-only and optional, so the method works whether
        or not the caller supplies it.
        """
        url = request.url
        file_name = url.split('/')[-1]
        return file_name

    def item_completed(self, results, item, info):
        """Return the item if at least one result succeeded.

        Args:
            results: iterable of ``(ok, x)`` pairs; ``x['path']`` is read
                for every pair whose ``ok`` is truthy.
            item: the item the results belong to.
            info: unused here.

        Raises:
            DropItem: when no pair in ``results`` succeeded.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Downloaded Failed')
        return item

    def get_media_requests(self, item, info):
        """Yield a single Request for the item's ``url`` field."""
        yield Request(item['url'])
大神完整代碼:裏面包含Mysql數據庫的存儲
https://github.com/Python3WebSpider/Images360/tree/master/images360
參考資料:
https://blog.csdn.net/kuangshp128/article/details/80321099