爬蟲之詩文傳頌

# encoding: utf-8
# author: Batac

import requests
import re
import json

class ShiwenSpider:
    """Crawler that pages through a poetry listing site and appends each poem
    to ``movice.txt`` as pretty-printed JSON lines."""

    def __init__(self):
        """Initialize pagination state and the request header."""
        self.current_page = 1
        # Placeholder until the real page count is parsed from the first page.
        self.total_page = 2
        self.base_url = self._build_url(self.current_page)
        self.header = {
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
        }

    @staticmethod
    def _build_url(page):
        """Return the listing URL for the given 1-based page number."""
        return "https://www.***.org/default_" + str(page) + ".aspx"

    def parse_url(self):
        """Fetch the current listing page and return its decoded HTML text."""
        # timeout so a stalled connection cannot hang the crawl forever
        response = requests.get(self.base_url, headers=self.header, timeout=10)
        return response.content.decode('utf-8')

    # NOTE(review): method name is a typo for "data_content" but is kept
    # unchanged so existing callers keep working.
    def data_contetn(self, html_str):
        """Extract poems from one listing page.

        Side effect: raises ``self.total_page`` to the page count parsed from
        the page's "sumPage" label whenever a larger value is seen.

        :param html_str: decoded HTML of a listing page
        :return: list of dicts with keys ``title``/``chaodai``/``name``/``content``
        """
        sum_page = re.findall(r'<label id="sumPage".*?>(.*?)</label>', html_str)
        if sum_page:
            page_count = int(sum_page[0])
            if self.total_page < page_count:
                self.total_page = page_count
        titles = re.findall(r'<div\sclass="yizhu">.*?<b>(.*?)</b>', html_str, re.DOTALL)
        # NOTE(review): the first <a> of the source line is stored under
        # "chaodai" and the second under "name" — verify against site markup,
        # these may be semantically swapped. Behavior preserved here.
        chaodai = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', html_str)
        author = re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', html_str)
        contents = re.findall(r'<div\sclass="contson"\sid=".*?">(.*?)</div>', html_str, re.DOTALL)
        # Strip any remaining inline tags (<p>, <br> ...) from each poem body.
        poems = [re.sub('<.*?>', '', content).strip() for content in contents]
        return [
            {"title": title, "chaodai": chao, "name": name, "content": con}
            for title, name, chao, con in zip(titles, author, chaodai, poems)
        ]

    def save_data(self, poem_list):
        """Append each poem dict in *poem_list* to movice.txt as JSON."""
        with open("movice.txt", "a", encoding="utf-8") as f:
            for poem in poem_list:
                f.write(json.dumps(poem, ensure_ascii=False, indent=2))
                f.write("\n")
        print("第" + str(self.current_page) + "頁保存結束")

    def run(self):
        """Crawl page by page until the parsed total page count is reached."""
        while self.total_page >= self.current_page:
            print("第" + str(self.current_page) + "頁開始查詢數據")
            html = self.parse_url()
            poems = self.data_contetn(html)
            self.save_data(poems)
            self.current_page += 1
            self.base_url = self._build_url(self.current_page)




if __name__ == "__main__":
    # Entry point: crawl every listing page and persist the poems.
    spider = ShiwenSpider()
    spider.run()

備註:項目只用作學習交流使用;

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章