toolkit-frame之toolkit-sprider(數據採集)---筆趣閣小說

採集筆趣閣小說,使用以下幾個核心包:

requests:2.10.0

beautifulsoup4:4.7.1

其中:

1、BaseFrame.__log__("開始採集中國船舶網的數據...") BaseFrame.__log__() 就是我個人封裝的日誌類,替換成print就可以。

2、response = requests.get(self.base_url, timeout=30, headers=UserAgent().get_random_header(self.base_url)) 這個就是封裝的一個隨機header,防止被認爲是機器,每次都模擬一個新的瀏覽器的header。代碼是這個:UserAgent().get_random_header(self.base_url) 隨機header類:https://blog.csdn.net/zy0412326/article/details/104258491

3、filepath = BaseConfig().CORPUS_ROOT + os.sep + "equipment_info.xlsx" 這個就是文件的路徑BaseConfig().CORPUS_ROOT替換成自己的文件路徑就可以了。

4、mmEntity = SpriderEntity() 這個就是一個實體類用來記錄採集過的數據,程序可以多次執行。防止數據重複採集的策略。

這裏我之前寫過防止重複採集的文章:https://blog.csdn.net/zy0412326/article/details/103224399

不廢話了直接上代碼:

import os
import requests
from bs4 import BeautifulSoup
from access.sprider.SpriderAccess import SpriderAccess
from base.BaseConfig import BaseConfig
from base.BaseFrame import BaseFrame
from business.sprider.UserAgent import UserAgent
from object.entity.SpriderEntity import SpriderEntity
from plugin.Tools import Tools


class QuLa:
    """Crawler that downloads ranked novels from the qu.la (biquge) site
    and saves each novel as a plain-text file, one file per title."""

    base_url = "https://www.qu.la/"
    # Root directory under which all downloaded novels are written.
    save_path = BaseConfig().CORPUS_ROOT + os.sep + "QuLa"

    def __init__(self):
        # Make sure the save directory exists before any download starts.
        Tools.judge_diskpath_exits_create(self.save_path)

    def sprider_story(self):
        """Crawl the ranking page, walk every book's chapter index, and
        download each chapter that has not been collected before.

        Dedup strategy: each (url, chapter) pair is looked up via
        SpriderAccess before downloading, so the script is safe to re-run.
        """
        BaseFrame.__log__("開始採集筆趣閣排行榜小說...")
        self.story_url = self.base_url + "paihangbang"
        try:
            response = requests.get(self.story_url, timeout=30,
                                    headers=UserAgent().get_random_header(self.story_url))
            response.encoding = 'UTF-8'
            soup = BeautifulSoup(response.text, "html5lib")
        except Exception as e:
            BaseFrame.__err__("採集出現錯誤" + str(e))
            # Bug fix: the original fell through here and dereferenced an
            # undefined `soup` below, raising NameError. Abort instead.
            return
        div_list = soup.findAll('div', attrs={"class": 'topbooks'})
        for div in div_list:
            a_list = div.find_all('a', attrs={"target": '_blank'})
            for a in a_list:
                content_url = self.base_url + a.get("href")
                txt_title = a.get("title")
                try:
                    response = requests.get(content_url, timeout=30,
                                            headers=UserAgent().get_random_header(content_url))
                    response.encoding = 'UTF-8'
                    soup = BeautifulSoup(response.text, "html5lib")
                    # The chapter index of a book lives in the first <dl>.
                    dl_tag = soup.find('dl')
                    a_list = dl_tag.find_all('a')
                    for a_tag in a_list:
                        href = a_tag.get("href")
                        # Only chapter links contain "book"; skip nav links.
                        if "book" in href:
                            url = self.base_url + href
                            chapter = a_tag.text

                            mm_entity = SpriderEntity()
                            mm_entity.sprider_base_url = self.base_url
                            mm_entity.create_datetime = Tools.get_current_datetime()
                            mm_entity.sprider_url = url
                            mm_entity.sprider_pic_title = chapter
                            mm_entity.sprider_pic_index = str(1)
                            # Download only chapters we have never seen.
                            if SpriderAccess().query_sprider_entity_by_urlandtitle(url, chapter) is None:
                                SpriderAccess().save_sprider(mm_entity)
                                self.get_content(url, chapter, txt_title)
                except Exception as e:
                    # Best-effort: log and continue with the next book.
                    BaseFrame.__err__("採集" + content_url + "出現錯誤" + str(e))

    def get_content(self, url, chapter, title, retry=3):
        """
        Download one chapter and append it to the novel's text file.

        :param url: chapter URL to fetch
        :param chapter: chapter name (written as a heading before the text)
        :param title: novel title, used as the target file name
        :param retry: remaining retry attempts on failure. Bug fix: the
            original retried recursively without any bound, which could
            recurse forever (or hit RecursionError) on a dead URL.
        :return: None
        """
        try:
            BaseFrame.__log__("正在採集" + url + "上的小說...")
            response = requests.get(url, timeout=60, headers=UserAgent().get_random_header(url))
            response.encoding = 'UTF-8'
            soup = BeautifulSoup(response.text, "html5lib")
            content = soup.find('div', attrs={"id": 'content'})
            # Strip the HTML wrapper and the site's anti-scrape script tag,
            # keeping only the plain chapter text with real newlines.
            content = chapter + "\n" + str(content). \
                replace('<br/>', '\n'). \
                replace("<script>chaptererror();</script>", ""). \
                replace("<div id=\"content\">", ""). \
                replace("</div>", "")
            txt_path = self.save_path + os.sep + str(title) + ".txt"
            Tools.write_string_to_txt(txt_path, content)
        except Exception as e:
            BaseFrame.__err__("採集" + chapter + "出現錯誤" + str(e) + "嘗試重新採集.")
            if retry > 0:
                self.get_content(url, chapter, title, retry - 1)


# Script entry point: run the ranking-list crawler once.
# (Removed a stray dead `pass` statement that sat at module level.)
if __name__ == '__main__':
    QuLa().sprider_story()
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章