採集筆趣閣小說,使用以下幾個核心包:
requests:2.10.0
beautifulsoup4:4.7.1
其中:
1、BaseFrame.__log__("開始採集中國船舶網的數據...") BaseFrame.__log__() 就是我個人封裝的日誌類,替換成print就可以。
2、response = requests.get(self.base_url, timeout=30, headers=UserAgent().get_random_header(self.base_url)) 這個就是封裝的一個隨機header,防止被認爲是機器,每次都模擬一個新的瀏覽器的header。代碼是這個:UserAgent().get_random_header(self.base_url) 隨機header類:https://blog.csdn.net/zy0412326/article/details/104258491
3、filepath = BaseConfig().CORPUS_ROOT + os.sep + "equipment_info.xlsx" 這個就是文件的路徑BaseConfig().CORPUS_ROOT替換成自己的文件路徑就可以了。
4、mmEntity = SpriderEntity() 這個就是一個實體類用來記錄採集過的數據,程序可以多次執行。防止數據重複採集的策略。
這裏我之前寫過防止重複採集的文章:https://blog.csdn.net/zy0412326/article/details/103224399
不廢話了直接上代碼:
import os

import requests
from bs4 import BeautifulSoup

from access.sprider.SpriderAccess import SpriderAccess
from base.BaseConfig import BaseConfig
from base.BaseFrame import BaseFrame
from business.sprider.UserAgent import UserAgent
from object.entity.SpriderEntity import SpriderEntity
from plugin.Tools import Tools


class QuLa:
    """Crawler for the biquge (qu.la) novel ranking pages.

    Walks the ranking page, follows every listed novel, and appends each
    chapter's text to one ``.txt`` file per novel under ``save_path``.
    Previously collected chapters are skipped via ``SpriderEntity`` records,
    so the program can be re-run safely.
    """

    base_url = "https://www.qu.la/"
    # Output directory; CORPUS_ROOT comes from the project-level config.
    save_path = BaseConfig().CORPUS_ROOT + os.sep + "QuLa"

    def __init__(self):
        # Make sure the output directory exists before any chapter is written.
        Tools.judge_diskpath_exits_create(self.save_path)

    def sprider_story(self):
        """Crawl the ranking page and download every listed chapter.

        Bug fix vs. the original: when the initial request failed, the code
        fell through and dereferenced an undefined ``soup`` (NameError).
        We now return early on fetch failure.
        """
        BaseFrame.__log__("開始採集筆趣閣排行榜小說...")
        self.story_url = self.base_url + "paihangbang"
        try:
            response = requests.get(self.story_url, timeout=30,
                                    headers=UserAgent().get_random_header(self.story_url))
            response.encoding = 'UTF-8'
            soup = BeautifulSoup(response.text, "html5lib")
        except Exception as e:
            BaseFrame.__err__("採集出現錯誤" + str(e))
            return  # without a parsed ranking page there is nothing to iterate

        div_list = soup.find_all('div', attrs={"class": 'topbooks'})
        for div in div_list:
            a_list = div.find_all('a', attrs={"target": '_blank'})
            for a in a_list:
                content_url = self.base_url + a.get("href")
                txt_title = a.get("title")
                try:
                    response = requests.get(content_url, timeout=30,
                                            headers=UserAgent().get_random_header(content_url))
                    response.encoding = 'UTF-8'
                    soup = BeautifulSoup(response.text, "html5lib")
                    dl_tag = soup.find('dl')
                    a_list = dl_tag.find_all('a')
                    for a_tag in a_list:
                        href = a_tag.get("href")
                        # Only chapter links (paths containing "book") are wanted.
                        if "book" not in href:
                            continue
                        url = self.base_url + href
                        chapter = a_tag.text
                        mm_entity = SpriderEntity()
                        mm_entity.sprider_base_url = self.base_url
                        mm_entity.create_datetime = Tools.get_current_datetime()
                        mm_entity.sprider_url = url
                        mm_entity.sprider_pic_title = chapter
                        mm_entity.sprider_pic_index = str(1)
                        # Dedup: skip chapters already collected in a previous run.
                        if SpriderAccess().query_sprider_entity_by_urlandtitle(url, chapter) is None:
                            SpriderAccess().save_sprider(mm_entity)
                            self.get_content(url, chapter, txt_title)
                except Exception as e:
                    BaseFrame.__err__("採集" + content_url + "出現錯誤" + str(e))

    def get_content(self, url, chapter, title, retries=3):
        """Download one chapter and append it to the novel's text file.

        :param url: chapter page URL
        :param chapter: chapter name, written as a heading before the text
        :param title: novel name, used as the output file name
        :param retries: remaining retry attempts on failure. The original
            code retried forever via unbounded recursion, which could
            overflow the stack on a persistently failing URL; the default
            keeps the retry behavior but bounds it.
        """
        try:
            BaseFrame.__log__("正在採集" + url + "上的小說...")
            response = requests.get(url, timeout=60,
                                    headers=UserAgent().get_random_header(url))
            response.encoding = 'UTF-8'
            soup = BeautifulSoup(response.text, "html5lib")
            content = soup.find('div', attrs={"id": 'content'})
            # Strip the page markup down to plain chapter text.
            content = chapter + "\n" + str(content) \
                .replace('<br/>', '\n') \
                .replace("<script>chaptererror();</script>", "") \
                .replace("<div id=\"content\">", "") \
                .replace("</div>", "")
            txt_path = self.save_path + os.sep + str(title) + ".txt"
            Tools.write_string_to_txt(txt_path, content)
        except Exception as e:
            BaseFrame.__err__("採集" + chapter + "出現錯誤" + str(e) + "嘗試重新採集.")
            if retries > 0:
                self.get_content(url, chapter, title, retries - 1)


if __name__ == '__main__':
    QuLa().sprider_story()