Spider code
import scrapy


class QidianSpider(scrapy.Spider):
    name = 'qidian'
    allowed_domains = ['qidian.com']
    start_urls = ['https://www.qidian.com/all?orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0']

    def parse(self, response):
        # Group by top-level category; the first <li> is skipped
        li_list = response.xpath("//ul[@type='category']/li")[1:]
        for li in li_list:
            item = {}
            item["first_category_title"] = li.xpath("./a/text()").extract_first()
            item["first_category_url"] = "https:" + li.xpath("./a/@href").extract_first()
            # Follow each top-level category to collect its sub-categories
            yield scrapy.Request(
                url=item["first_category_url"],
                callback=self.parse_first_category,
                meta={"item": item}
            )

    def parse_first_category(self, response):
        # Group by sub-category
        dd_list = response.xpath("//div[@class='sub-type']/dl[@class='']/dd")
        first_category_title = response.meta["item"]["first_category_title"]
        for dd in dd_list:
            item = {}
            item["first_category_title"] = first_category_title
            item["second_category_title"] = dd.xpath("./a/text()").extract_first()
            item["second_category_url"] = "https:" + dd.xpath("./a/@href").extract_first()
            yield scrapy.Request(
                url=item["second_category_url"],
                callback=self.parse_second_category,
                meta={"item": item}
            )

    def parse_second_category(self, response):
        first_category_title = response.meta["item"]["first_category_title"]
        second_category_title = response.meta["item"]["second_category_title"]
        li_list = response.xpath("//ul[@class='all-img-list cf']/li")
        for li in li_list:
            item = {}
            item["first_category_title"] = first_category_title
            item["second_category_title"] = second_category_title
            item["book_name"] = li.xpath(".//h4/a/text()").extract_first()
            item["author_name"] = li.xpath(".//p[@class='author']/a[@class='name']/text()").extract_first()
            item["is_end"] = li.xpath(".//span/text()").extract_first()
            item["info"] = li.xpath(".//p[@class='intro']/text()").extract_first(default="").strip()
            item["book_poster_src"] = "http:" + li.xpath(".//div[@class='book-img-box']/a/img/@src").extract_first()
            yield item
        # Extract the href first: prepending "https:" to None would raise a
        # TypeError on the last page, before the None check could ever run
        next_href = response.xpath("//a[contains(text(), '>')]/@href").extract_first()
        if next_href is not None:
            yield scrapy.Request(
                url="https:" + next_href,
                callback=self.parse_second_category,
                # Keep the category titles in meta; otherwise the next page's
                # callback raises a KeyError on response.meta["item"]
                meta={"item": response.meta["item"]}
            )
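A note on the meta={"item": item} hand-off: on Scrapy 1.7 and later, cb_kwargs is the cleaner alternative, since values arrive as plain callback arguments instead of being dug out of response.meta. A minimal sketch of the same hand-off, as a drop-in change to the request in parse_first_category (nothing else assumed):

# Scrapy >= 1.7: pass the partially built item as a callback argument
yield scrapy.Request(
    url=item["second_category_url"],
    callback=self.parse_second_category,
    cb_kwargs={"item": item},
)

# The receiving callback then declares the parameter directly:
# def parse_second_category(self, response, item): ...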
Pipelines code
import json
import os

import requests

from p4.settings import USER_AGENT


class P4Pipeline(object):
    def open_spider(self, spider):
        # Create the output directories up front so the writes below don't fail
        os.makedirs("./data/img", exist_ok=True)
        self.fp = open("./data/book.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # One JSON object per line (JSON Lines format)
        json.dump(item, self.fp, ensure_ascii=False)
        self.fp.write("\n")
        self.save_poster(item["book_poster_src"], item["book_name"])
        print(item["book_name"] + " saved to local file successfully")
        return item

    def close_spider(self, spider):
        self.fp.close()

    def save_poster(self, url, title):
        file_name = "./data/img/" + title + ".jpg"
        with open(file_name, "wb") as f:
            # Header key must be "User-Agent"; the original "User_Agent"
            # would not be recognized as the user-agent header
            f.write(requests.get(url, headers={"User-Agent": USER_AGENT}).content)
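For any of this to run, the Scrapy project has to wire the pipeline in. A minimal settings.py sketch, assuming the project package is named p4 as the from p4.settings import above suggests (the user-agent string is illustrative, not from the original):

# p4/settings.py (excerpt)
BOT_NAME = "p4"

# Imported by P4Pipeline for the poster download; use a real browser UA string
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"

# Register the pipeline so process_item is called for each yielded item
ITEM_PIPELINES = {
    "p4.pipelines.P4Pipeline": 300,
}

The 300 is an arbitrary priority; lower numbers run earlier when multiple pipelines are enabled.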
Main method
from scrapy import cmdline

# Equivalent to running `scrapy crawl qidian` from the directory containing
# scrapy.cfg; handy for launching the spider under an IDE or debugger
cmdline.execute("scrapy crawl qidian".split())
The crawl results are as follows: