在 Terminal 輸入 scrapy startproject spider1,進入 spider1 目錄後再輸入 scrapy genspider qidian qidian.com
settings.py修改4處代碼:
# Browser-like User-Agent string assigned to the USER_AGENT setting.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36'
# robots.txt handling is switched off here.
# NOTE(review): confirm that crawling the target site this way is permitted.
ROBOTSTXT_OBEY = False
# Download delay set to 3 (presumably seconds between requests; see the Scrapy settings docs).
DOWNLOAD_DELAY = 3
# Uncomment the ITEM_PIPELINES block in the generated settings.py so that
# Spider1Pipeline is registered; 300 is the value assigned to that pipeline.
ITEM_PIPELINES = {
'spider1.pipelines.Spider1Pipeline': 300,
}
將結果導出為文件:執行 scrapy crawl qidian -o book.json(也可以導出為 book.csv 或 book.xml)
代碼:
qidian.py
# -*- coding: utf-8 -*-
import scrapy
from spider1.items import BookItem
class QidianSpider(scrapy.Spider):
    """Spider that collects book names and authors from a Qidian ranking page."""

    name = 'qidian'
    allowed_domains = ['qidian.com']
    start_urls = ['https://www.qidian.com/rank/yuepiao?chn=21']

    def parse(self, response):
        """Yield one ``BookItem`` per (name, author) pair found in ``response``.

        Args:
            response: The downloaded page for one of ``start_urls``.

        Yields:
            BookItem: An item with its ``name`` and ``author`` fields set.
        """
        names = response.xpath('//div[@class="book-mid-info"]//h4//text()').extract()
        authors = response.xpath('//div[@class="book-mid-info"]//p[@class="author"]/a[1]/text()').extract()
        # NOTE(review): the two lists are paired purely by position; this
        # assumes every entry contributes exactly one title text node and one
        # author link -- confirm against the live page markup.
        #
        # Alternative without an Item class: yield plain dicts instead, e.g.
        #     yield {'name': name, 'author': author}
        for name, author in zip(names, authors):
            # Build a fresh item on every iteration. Reusing one instance
            # created outside the loop would make every yielded reference
            # point at the same object, which is then overwritten by the
            # next book's data.
            yield BookItem(name=name, author=author)
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json
class Spider1Pipeline(object):
    """Item pipeline that writes every item to ``book.txt`` as one JSON line.

    Opening the file inside ``process_item`` would reopen it for every single
    item. Using ``open_spider`` / ``close_spider`` instead opens and closes
    the file exactly once.
    """

    def open_spider(self, spider):
        """Open ``book.txt`` for writing (truncating any previous content)."""
        # Despite its name, ``self.filename`` holds the open file object.
        self.filename = open('book.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Append ``item`` to the output file as a JSON line and return it.

        Args:
            item: The scraped item; anything ``dict()`` accepts.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, unchanged.
        """
        # An Item instance is not JSON-serializable by itself, so it is
        # converted with dict() first; for a plain dict this is a cheap copy.
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        line = json.dumps(dict(item), ensure_ascii=False)
        self.filename.write(line + '\n')
        return item

    def close_spider(self, spider):
        """Close the output file opened in ``open_spider``."""
        self.filename.close()
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class BookItem(scrapy.Item):
    """Item holding the two fields scraped for each book."""

    # Book name.
    name = scrapy.Field()
    # Author name.
    author = scrapy.Field()
結果: