Today's project: crawling www.aqistudy.cn, a site hosting nationwide historical air quality data.
- Crawl with a plain spider and store the results as a JSON file, as a CSV file, in MongoDB, and in Redis; crawl with a crawl_spider; crawl distributedly with scrapy-redis, in both the spider and crawl_spider variants.
Spider crawl
Steps:
- Identify the target data: the historical readings for each day
- Identify the page hierarchy: the city list is the first level and is a static page; the monthly data is the second level, a dynamically rendered HTML page; the daily data is the third level, also dynamically rendered
- items.py
- aqi.py
- pipelines.py
- Use a downloader middleware: drive Chrome with Selenium to issue the request like a real browser, build a custom response object, and return it directly
- Configure settings.py
Code
items.py
import scrapy


class AqiItem(scrapy.Item):
    # city name
    city_name = scrapy.Field()
    # 1. date
    data = scrapy.Field()
    # 2. air quality index
    aqi = scrapy.Field()
    # 3. quality level
    q_level = scrapy.Field()
    # 4. PM2.5
    pm2_5 = scrapy.Field()
    # 5. PM10
    pm10 = scrapy.Field()
    # 6. SO2
    so2 = scrapy.Field()
    # 7. CO
    co = scrapy.Field()
    # 8. NO2
    no2 = scrapy.Field()
    # 9. O3_8h
    o3 = scrapy.Field()
    # data source
    data_source = scrapy.Field()
    # download time
    data_time = scrapy.Field()
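The Field() declarations make AqiItem behave like a dict restricted to the declared keys. A quick interactive check (the values here are made up):

from AQI.items import AqiItem

item = AqiItem(city_name='Beijing', aqi='57')
item['pm2_5'] = '40'
print(dict(item))  # {'city_name': 'Beijing', 'aqi': '57', 'pm2_5': '40'}
# assigning an undeclared key, e.g. item['foo'] = 1, raises KeyError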
aqi.py
# -*- coding: utf-8 -*-
import scrapy

from AQI.items import AqiItem


class AqiSpider(scrapy.Spider):
    name = 'aqi'
    allowed_domains = ['aqistudy.cn']
    start_urls = ['https://www.aqistudy.cn/historydata/']
    base_url = 'https://www.aqistudy.cn/historydata/'

    def parse(self, response):
        # get the list of city names
        # ([0:1] keeps only the first city while debugging; drop the slice for a full crawl)
        city_name_list = response.xpath('/html/body/div[3]/div/div[1]/div[2]/div[2]/ul/div[2]/li/a/text()').extract()[0:1]
        # get the matching list of monthly-data links
        link_list = response.xpath('/html/body/div[3]/div/div[1]/div[2]/div[2]/ul/div[2]/li/a/@href').extract()[0:1]
        # walk both lists in parallel
        for name, link in zip(city_name_list, link_list):
            # put the city name on a fresh item
            item = AqiItem()
            item['city_name'] = name
            # build the full link
            link = self.base_url + link
            # request the monthly listing, carrying the item along in meta
            yield scrapy.Request(url=link, meta={'api_item': item}, callback=self.parse_month)

    def parse_month(self, response):
        # get the list of month urls
        # ([1:2] keeps only one month while debugging)
        month_url_list = response.xpath('/html/body/div[3]/div[1]/div[1]/table//tr/td[1]/a/@href').extract()[1:2]
        for month_url in month_url_list:
            # build the full url
            month_url = self.base_url + month_url
            # take the item out of meta and pass it on again
            item = response.meta['api_item']
            # request the daily-data page
            yield scrapy.Request(url=month_url, meta={'api_item': item}, callback=self.parse_day)

    def parse_day(self, response):
        # get all tr tags
        tr_list = response.xpath('//tr')
        # the first tr is the table header, drop it
        tr_list.pop(0)
        print('**' * 40)
        print(tr_list)
        # loop over the data rows
        for tr in tr_list:
            # copy the item so every row yields its own object;
            # reusing one instance would let later rows overwrite earlier ones
            item = response.meta['api_item'].copy()
            # 1. date
            item['data'] = tr.xpath('./td[1]/text()').extract_first()
            # 2. air quality index
            item['aqi'] = tr.xpath('./td[2]/text()').extract_first()
            # 3. quality level
            item['q_level'] = tr.xpath('./td[3]/span/text()').extract_first()
            # 4. PM2.5
            item['pm2_5'] = tr.xpath('./td[4]/text()').extract_first()
            # 5. PM10
            item['pm10'] = tr.xpath('./td[5]/text()').extract_first()
            # 6. SO2
            item['so2'] = tr.xpath('./td[6]/text()').extract_first()
            # 7. CO
            item['co'] = tr.xpath('./td[7]/text()').extract_first()
            # 8. NO2
            item['no2'] = tr.xpath('./td[8]/text()').extract_first()
            # 9. O3_8h
            item['o3'] = tr.xpath('./td[9]/text()').extract_first()
            # ---> engine ---> pipelines
            yield item
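The spider can also be launched from a script instead of the command line, which is handy when debugging in an IDE; Scrapy's cmdline helper is equivalent to running `scrapy crawl aqi` in the project root:

# run.py, placed next to scrapy.cfg in the project root
from scrapy import cmdline

cmdline.execute('scrapy crawl aqi'.split())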
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from datetime import datetime

import pymongo
import redis
from scrapy.exporters import CsvItemExporter


class AqiDataPipeline(object):
    def process_item(self, item, spider):
        # data source
        item['data_source'] = spider.name
        # download time
        item['data_time'] = str(datetime.utcnow())
        return item


# JSON pipeline
class AqiJsonPipeline(object):
    def open_spider(self, spider):
        self.file = open('api.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # ensure_ascii=False keeps Chinese city names readable in the file
        str_item = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(str_item)
        return item

    def close_spider(self, spider):
        self.file.close()
middlewares.py
import time

import scrapy
from selenium import webdriver


# a custom downloader implemented as a middleware, driven by webdriver
class ChromeMiddlewares(object):
    def process_request(self, request, spider):
        url = request.url
        # the index page is static; only intercept the JS-rendered pages
        if url != 'https://www.aqistudy.cn/historydata/':
            # fetch the page in a real browser
            driver = webdriver.Chrome()
            driver.get(url)
            # give the JavaScript time to render
            time.sleep(2)
            # grab the rendered source
            data = driver.page_source
            # close the browser
            driver.quit()
            # build our own response object and return it directly,
            # short-circuiting Scrapy's normal download
            return scrapy.http.HtmlResponse(url=url, body=data.encode('utf-8'), encoding='utf-8', request=request)
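Launching a fresh Chrome for every request is slow. A variant worth considering (a sketch, assuming Selenium's ChromeOptions and WebDriverWait APIs; SharedChromeMiddleware is a name invented here) reuses one headless browser for the whole crawl and waits for the data table instead of sleeping a fixed two seconds:

import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class SharedChromeMiddleware(object):
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        # one browser instance shared across all requests
        self.driver = webdriver.Chrome(options=options)

    def process_request(self, request, spider):
        if request.url == 'https://www.aqistudy.cn/historydata/':
            return None  # let Scrapy download the static index page itself
        self.driver.get(request.url)
        # wait until at least one data row exists instead of sleeping blindly
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//tr[2]')))
        body = self.driver.page_source.encode('utf-8')
        return scrapy.http.HtmlResponse(url=request.url, body=body,
                                        encoding='utf-8', request=request)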
settings.py
BOT_NAME = 'AQI'
SPIDER_MODULES = ['AQI.spiders']
NEWSPIDER_MODULE = 'AQI.spiders'
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
ROBOTSTXT_OBEY = False
DOWNLOADER_MIDDLEWARES = {
    'AQI.middlewares.ChromeMiddlewares': 543,
}
ITEM_PIPELINES = {
    'AQI.pipelines.AqiDataPipeline': 100,
}
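ITEM_PIPELINES above only enables the timestamping pipeline. To persist items as well, add the storage pipelines from pipelines.py (the JSON one above, plus the CSV/MongoDB/Redis ones shown below) with higher priority numbers; lower numbers run first, so the timestamp is set before anything is written:

ITEM_PIPELINES = {
    'AQI.pipelines.AqiDataPipeline': 100,   # stamps data_source / data_time first
    'AQI.pipelines.AqiJsonPipeline': 200,
    'AQI.pipelines.AqiCsvPipeline': 300,
    'AQI.pipelines.AqiMongoDBPipeline': 400,
    'AQI.pipelines.ApiRedisPipeline': 500,
}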
Results
Other storage options
CSV pipeline
# CSV pipeline
class AqiCsvPipeline(object):
    def open_spider(self, spider):
        # CsvItemExporter writes bytes, so the file must be opened in binary mode
        self.file = open('api.csv', 'wb')
        # create an exporter around the file
        self.writer = CsvItemExporter(self.file)
        # signal the start of the export
        self.writer.start_exporting()

    def process_item(self, item, spider):
        self.writer.export_item(item)
        return item

    def close_spider(self, spider):
        # finish exporting before closing the underlying file
        self.writer.finish_exporting()
        self.file.close()
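Left to itself, the exporter infers the CSV columns from the first item it sees. If a fixed column order matters, CsvItemExporter accepts a fields_to_export list; a drop-in variant of open_spider above (field names taken from items.py):

    def open_spider(self, spider):
        self.file = open('api.csv', 'wb')
        # pin the column order instead of letting it depend on the first item
        self.writer = CsvItemExporter(
            self.file,
            fields_to_export=['city_name', 'data', 'aqi', 'q_level', 'pm2_5',
                              'pm10', 'so2', 'co', 'no2', 'o3',
                              'data_source', 'data_time'])
        self.writer.start_exporting()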
MongoDB pipeline
# store into MongoDB; note that the mongod service must be running
class AqiMongoDBPipeline(object):
    def open_spider(self, spider):
        # connect to MongoDB
        self.client = pymongo.MongoClient('127.0.0.1', 27017)
        self.db = self.client['AQI_Mongo']
        self.collection = self.db['api']

    def process_item(self, item, spider):
        # store the item (insert_one replaces the deprecated insert)
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
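Re-running the spider inserts the same days a second time. One way to guard against that, assuming a (city_name, data) pair uniquely identifies a row, is a unique index plus upserts; a sketch of the two methods that would change:

    def open_spider(self, spider):
        self.client = pymongo.MongoClient('127.0.0.1', 27017)
        self.collection = self.client['AQI_Mongo']['api']
        # one document per (city, day); duplicate inserts become updates below
        self.collection.create_index(
            [('city_name', pymongo.ASCENDING), ('data', pymongo.ASCENDING)],
            unique=True)

    def process_item(self, item, spider):
        doc = dict(item)
        self.collection.replace_one(
            {'city_name': doc['city_name'], 'data': doc['data']},
            doc, upsert=True)
        return item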
Redis pipeline
# store into Redis
class ApiRedisPipeline(object):
    def open_spider(self, spider):
        self.client = redis.Redis('127.0.0.1', 6379)

    def process_item(self, item, spider):
        # redis-py cannot push a dict directly, so serialize to JSON first
        # (json is already imported at the top of pipelines.py)
        self.client.lpush('AQI_list', json.dumps(dict(item)))
        return item
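Reading the stored items back is then a matter of popping and decoding, for example from a quick standalone script:

import json

import redis

client = redis.Redis('127.0.0.1', 6379)
print(client.llen('AQI_list'))   # number of stored items
raw = client.rpop('AQI_list')    # oldest item, since lpush prepends
if raw is not None:
    print(json.loads(raw))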
CrawlSpider crawl
aqi_crawl.py
# -*- coding: utf-8 -*-
from AQI.items import AqiItem
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


# must inherit from CrawlSpider, otherwise the rules are ignored
class AqiSpider(CrawlSpider):
    name = 'aqi_crawl'
    allowed_domains = ['aqistudy.cn']
    start_urls = ['https://www.aqistudy.cn/historydata/']

    rules = (
        # 1. extract the city links (monthly listing pages) and follow them
        Rule(LinkExtractor(allow=r'monthdata\.php\?city=')),
        # 2. extract the month links (daily data pages) and parse them
        Rule(LinkExtractor(allow=r'daydata\.php\?city='), callback='parse_day', follow=False),
    )

    def parse_day(self, response):
        # parse the city name out of the page title
        city_name = response.xpath('//h2[@id="title"]/text()').extract_first()
        # get all tr tags
        tr_list = response.xpath('//tr')
        # the first tr is the table header, drop it
        tr_list.pop(0)
        # loop over the data rows
        for tr in tr_list:
            # build a fresh item per row
            item = AqiItem()
            # strip the fixed prefix/suffix around the city name in the title
            item['city_name'] = city_name[8:-11]
            # 1. date
            item['data'] = tr.xpath('./td[1]/text()').extract_first()
            # 2. air quality index
            item['aqi'] = tr.xpath('./td[2]/text()').extract_first()
            # 3. quality level
            item['q_level'] = tr.xpath('./td[3]/span/text()').extract_first()
            # 4. PM2.5
            item['pm2_5'] = tr.xpath('./td[4]/text()').extract_first()
            # 5. PM10
            item['pm10'] = tr.xpath('./td[5]/text()').extract_first()
            # 6. SO2
            item['so2'] = tr.xpath('./td[6]/text()').extract_first()
            # 7. CO
            item['co'] = tr.xpath('./td[7]/text()').extract_first()
            # 8. NO2
            item['no2'] = tr.xpath('./td[8]/text()').extract_first()
            # 9. O3_8h
            item['o3'] = tr.xpath('./td[9]/text()').extract_first()
            # ---> engine ---> pipelines
            yield item
- The other files are the same as in the aqi.py version
Distributed crawling with scrapy-redis
spider
- In the spider file, import and inherit from RedisSpider and set the key that identifies this crawl in Redis
- Adjust the settings file
api_redis.py
...
from scrapy_redis.spiders import RedisSpider


class AqiSpider(RedisSpider):
    name = 'aqi_redis'
    allowed_domains = ['aqistudy.cn']
    # the key by which distributed workers find their start URLs
    redis_key = 'api:start_urls'
...
settings.py
...
# scrapy-redis swaps in four components: the scheduler, the dupefilter,
# the spider base class, and (optionally) an item pipeline
# 1. enable the distributed dupefilter
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# 2. enable the distributed scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# 3. persist the queue in Redis: if the crawl stops after request 1000,
#    the next run resumes at 1001 instead of starting over
SCHEDULER_PERSIST = True
# 4. the Redis pipeline
# Redis host and port
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
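Workers started with scrapy crawl aqi_redis idle until a start URL appears under redis_key; pushing one from a short script (redis-cli works just as well) kicks the whole cluster off. The CrawlSpider variant below watches its own key, api_crawl_redis:

import redis

client = redis.Redis('127.0.0.1', 6379)
# the key matches redis_key in api_redis.py
client.lpush('api:start_urls', 'https://www.aqistudy.cn/historydata/')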
crawl_spider
- Import and inherit from RedisCrawlSpider and set the key that identifies this crawl in Redis
- The settings.py changes are the same as for the spider version
aqi_crawl_spider_redis.py
...
from scrapy_redis.spiders import RedisCrawlSpider


class AqiSpider(RedisCrawlSpider):
    name = 'aqi_crawl_redis'
    allowed_domains = ['aqistudy.cn']
    # the key by which distributed workers find their start URLs
    redis_key = 'api_crawl_redis'
...
- The settings.py additions are the same as for the spider module