Spider code
import scrapy


class QidianSpider(scrapy.Spider):
    name = 'qidian'
    allowed_domains = ['qidian.com']
    start_urls = ['https://www.qidian.com/all?orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0']

    def parse(self, response):
        # Group by top-level category; the first <li> is skipped
        li_list = response.xpath("//ul[@type='category']/li")[1:]
        for li in li_list:
            item = {}
            item["first_category_title"] = li.xpath("./a/text()").extract_first()
            item["first_category_url"] = "https:" + li.xpath("./a/@href").extract_first()
            # Follow each top-level category to collect its sub-categories
            yield scrapy.Request(
                url=item["first_category_url"],
                callback=self.parse_first_category,
                meta={"item": item}
            )

    def parse_first_category(self, response):
        # Group by sub-category
        dd_list = response.xpath("//div[@class='sub-type']/dl[@class='']/dd")
        first_category_title = response.meta["item"]["first_category_title"]
        for dd in dd_list:
            item = {}
            item["first_category_title"] = first_category_title
            item["second_category_title"] = dd.xpath("./a/text()").extract_first()
            item["second_category_url"] = "https:" + dd.xpath("./a/@href").extract_first()
            yield scrapy.Request(
                url=item["second_category_url"],
                callback=self.parse_second_category,
                meta={"item": item}
            )

    def parse_second_category(self, response):
        first_category_title = response.meta["item"]["first_category_title"]
        second_category_title = response.meta["item"]["second_category_title"]
        li_list = response.xpath("//ul[@class='all-img-list cf']/li")
        for li in li_list:
            item = {}
            item["first_category_title"] = first_category_title
            item["second_category_title"] = second_category_title
            item["book_name"] = li.xpath(".//h4/a/text()").extract_first()
            item["author_name"] = li.xpath(".//p[@class='author']/a[@class='name']/text()").extract_first()
            item["is_end"] = li.xpath(".//span/text()").extract_first()
            item["info"] = li.xpath(".//p[@class='intro']/text()").extract_first(default="").strip()
            item["book_poster_src"] = "http:" + li.xpath(".//div[@class='book-img-box']/a/img/@src").extract_first()
            yield item
        # Extract the href first: prepending "https:" to None would raise a
        # TypeError on the last page, before the None check could ever run
        next_href = response.xpath("//a[contains(text(), '>')]/@href").extract_first()
        if next_href is not None:
            yield scrapy.Request(
                url="https:" + next_href,
                callback=self.parse_second_category,
                # Keep the category titles in meta; otherwise the next page's
                # callback raises a KeyError on response.meta["item"]
                meta={"item": response.meta["item"]}
            )
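A note on the meta={"item": item} hand-off: on Scrapy 1.7 and later, cb_kwargs is the cleaner alternative, since values arrive as plain callback arguments instead of being dug out of response.meta. A minimal sketch of the same hand-off, as a drop-in change to the request in parse_first_category (nothing else assumed):

# Scrapy >= 1.7: pass the partially built item as a callback argument
yield scrapy.Request(
    url=item["second_category_url"],
    callback=self.parse_second_category,
    cb_kwargs={"item": item},
)

# The receiving callback then declares the parameter directly:
# def parse_second_category(self, response, item): ...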
Pipelines code
import json
import os

import requests

from p4.settings import USER_AGENT


class P4Pipeline(object):
    def open_spider(self, spider):
        # Create the output directories up front so the writes below don't fail
        os.makedirs("./data/img", exist_ok=True)
        self.fp = open("./data/book.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # One JSON object per line (JSON Lines format)
        json.dump(item, self.fp, ensure_ascii=False)
        self.fp.write("\n")
        self.save_poster(item["book_poster_src"], item["book_name"])
        print(item["book_name"] + " saved to local file successfully")
        return item

    def close_spider(self, spider):
        self.fp.close()

    def save_poster(self, url, title):
        file_name = "./data/img/" + title + ".jpg"
        with open(file_name, "wb") as f:
            # Header key must be "User-Agent"; the original "User_Agent"
            # would not be recognized as the user-agent header
            f.write(requests.get(url, headers={"User-Agent": USER_AGENT}).content)
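For any of this to run, the Scrapy project has to wire the pipeline in. A minimal settings.py sketch, assuming the project package is named p4 as the from p4.settings import above suggests (the user-agent string is illustrative, not from the original):

# p4/settings.py (excerpt)
BOT_NAME = "p4"

# Imported by P4Pipeline for the poster download; use a real browser UA string
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"

# Register the pipeline so process_item is called for each yielded item
ITEM_PIPELINES = {
    "p4.pipelines.P4Pipeline": 300,
}

The 300 is an arbitrary priority; lower numbers run earlier when multiple pipelines are enabled.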
Main method
from scrapy import cmdline

# Equivalent to running `scrapy crawl qidian` from the directory containing
# scrapy.cfg; handy for launching the spider under an IDE or debugger
cmdline.execute("scrapy crawl qidian".split())
The crawl results are as follows: