由於scrapy使用異步下載,所以會出現下載小說章節的結果是亂序的。
可以通過下面的方法將章節順序傳遞給item,並保存起來:
在解析主頁得到所有章節信息(章節名、網址、還有順序)後,通過Request()的cb_kwargs傳遞一個關鍵字參數'order'給回調函數parse_item(),代表該章節的順序。
items.py:
# -*- coding: utf-8 -*-
import scrapy
class XiaoshuoItem(scrapy.Item):
    """One novel chapter scraped from the index page."""

    # Position of the chapter on the index page; the pipeline sorts by this.
    order = scrapy.Field()
    # Chapter title.
    name = scrapy.Field()
    # Chapter body (raw HTML of the content div).
    content = scrapy.Field()
xiaoshuo_spider.py:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from Xiaoshuo.items import XiaoshuoItem
class XiaoshuoSpider(scrapy.Spider):
    """Crawl a novel's index page, then download every chapter.

    Scrapy downloads asynchronously, so chapter responses arrive out of
    order. Each chapter's 1-based position on the index page is forwarded
    to the item callback via ``cb_kwargs={'order': ...}`` so a pipeline
    can restore the correct order afterwards.
    """

    name = 'Xiaoshuo_spider'
    start_urls = ['https://www.biquge.biz/0_844/']

    def parse(self, response):
        """Parse the index page: yield one Request per chapter link,
        passing the chapter's position through cb_kwargs."""
        sels = response.xpath('//div[@id="list"]//dd/a')
        for i, a in enumerate(sels):
            # Equivalent shorthand:
            # yield response.follow(a, callback=self.parse_item, cb_kwargs={'order': i + 1})
            # FIX: callback was misspelled `self.pasrse_item`, which would
            # raise AttributeError when the spider tried to schedule it.
            yield Request(response.urljoin(a.xpath('@href').get()),
                          callback=self.parse_item,
                          cb_kwargs={'order': i + 1})

    def parse_item(self, response, order):
        """Build one chapter item.

        `order` is the chapter's position on the index page, injected by
        `parse()` via cb_kwargs.
        """
        item = XiaoshuoItem()
        item['order'] = order
        item['name'] = response.xpath('//h1/text()').get()
        item['content'] = response.xpath('//div[@id="content"]').get()
        return item
pipelines.py
# -*- coding: utf-8 -*-
class XiaoshuoPipeline(object):
    """Buffer chapter items during the crawl and, when the spider closes,
    write them to a single HTML file sorted by their 'order' field."""

    def open_spider(self, spider):
        """Start with an empty buffer for incoming chapter items."""
        self.items = []

    def process_item(self, item, spider):
        """Buffer each chapter as it arrives.

        Arrival order is effectively random because downloads are
        asynchronous; sorting happens once, at close time.
        """
        self.items.append(item)
        return item

    def close_spider(self, spider):
        """Sort the buffered chapters by 'order' and emit one HTML file."""
        header = '<!DOCTYPE html><html lang="zh"><head><meta charset="UTF-8"></head><body>'
        footer = '</body></html>'
        # Restore the index-page order before rendering.
        self.items.sort(key=lambda chapter: chapter['order'])
        parts = [header]
        for chapter in self.items:
            parts.append('<h3>{}</h3>{}<br><hr><br>'.format(chapter['name'], chapter['content']))
        parts.append(footer)
        with open('御魂者傳奇.html', 'w', encoding='utf-8') as out:
            out.write(''.join(parts))
settings.py:(打開管道)
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Enable the pipeline. FIX: the dotted path must use the actual project
# package 'Xiaoshuo' (cf. `from Xiaoshuo.items import XiaoshuoItem` in the
# spider); the 'project_name' placeholder would leave the pipeline unloaded.
ITEM_PIPELINES = {
    'Xiaoshuo.pipelines.XiaoshuoPipeline': 300,
}