# encoding: utf-8
# author: Batac
import requests
import re
import json
class ShiwenSpider:
    """Scrape paginated classical-poetry listing pages and save poems as JSON.

    Walks ``default_<page>.aspx`` listing pages, extracts each poem's title,
    dynasty, author and body with regular expressions, and appends the
    results to a local text file, one page at a time.
    """

    def __init__(self):
        """Initialize paging state, the first page URL, and request headers."""
        self.current_page = 1
        # The real page count is read from the first response; start at 2 so
        # the crawl loop fetches at least the first page.
        self.total_page = 2
        self.base_url = self._build_url(self.current_page)
        self.header = {
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
        }

    @staticmethod
    def _build_url(page):
        """Return the listing-page URL for *page*.

        The URL was previously built in two places (__init__ and run); this
        helper keeps the format string in one spot.
        """
        return "https://www.***.org/default_" + str(page) + ".aspx"

    def parse_url(self):
        """Fetch the current listing page and return its HTML decoded as UTF-8."""
        # timeout added so a stalled connection cannot hang the crawl forever
        html = requests.get(self.base_url, headers=self.header, timeout=10)
        return html.content.decode('utf-8')

    def data_content(self, html_str):
        """Parse one listing page's HTML.

        Updates ``self.total_page`` from the page counter embedded in the
        markup, then returns a list of dicts with keys ``title``, ``chaodai``
        (dynasty), ``name`` (author) and ``content`` (poem body).
        """
        total_page = re.findall(r'<label id="sumPage".*?>(.*?)</label>', html_str)
        if total_page:
            # Record the real total page count reported by the site.
            page = int(total_page[0])
            if self.total_page < page:
                self.total_page = page
        titles = re.findall(r'<div\sclass="yizhu">.*?<b>(.*?)</b>', html_str, re.DOTALL)
        chaodai = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', html_str)
        author = re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', html_str)
        contents = re.findall(r'<div\sclass="contson"\sid=".*?">(.*?)</div>', html_str, re.DOTALL)
        # Strip residual HTML tags from each poem body.
        pems = [re.sub('<.*?>', '', content).strip() for content in contents]
        total_list = []
        for title, name, chao, con in zip(titles, author, chaodai, pems):
            total_list.append({
                "title": title,
                "chaodai": chao,
                "name": name,
                "content": con,
            })
        return total_list

    # Backward-compatible alias: the method was originally published under
    # this misspelled name; existing callers keep working.
    data_contetn = data_content

    def save_data(self, poems):
        """Append *poems* (list of dicts) to movice.txt as pretty-printed JSON.

        The parameter was previously named ``list``, shadowing the builtin.
        NOTE(review): "movice.txt" looks like a typo, but renaming the file
        would change observable behaviour — kept as-is.
        """
        with open("movice.txt", "a", encoding="utf-8") as f:
            for content in poems:
                f.write(json.dumps(content, ensure_ascii=False, indent=2))
                f.write("\n")
        print("第"+str(self.current_page)+"頁保存結束")

    def run(self):
        """Crawl every page: fetch, parse, save, then advance to the next page."""
        while self.total_page >= self.current_page:
            print("第" + str(self.current_page) + "頁開始查詢數據")
            html = self.parse_url()
            poems = self.data_contetn(html)
            self.save_data(poems)
            self.current_page += 1
            self.base_url = self._build_url(self.current_page)
if __name__ == "__main__":
    # Run the crawler only when this file is executed as a script.
    spider = ShiwenSpider()
    spider.run()
# NOTE: 項目只用作學習交流使用 (this project is for learning/exchange purposes only).