用正則表達式爬取古詩詞網


在這裏插入圖片描述

import requests
import re
from pymongo import MongoClient


class Poetry:
    """Regex-based crawler for the poem list pages of gushiwen.org.

    Pages are fetched with ``requests``, poems are extracted with regular
    expressions, accumulated in ``self.poetries`` and finally written to the
    ``poetry`` collection of the ``spider`` MongoDB database.
    """

    # Each poem entry on a list page starts with this marker.
    _ENTRY_MARKER = '<div class="cont">'
    # re.DOTALL lets "." match "\n" as well, i.e. any character at all.
    _TITLE_RE = re.compile(r'<b>(.*?)</b>', re.DOTALL)
    _SOURCE_RE = re.compile(r'<p class="source">(.*?)</p>', re.DOTALL)
    _LINK_RE = re.compile(r'<a.*?>(.*?)</a>', re.DOTALL)
    _CONTENT_RE = re.compile(r'<div class="contson".*?>(.*?)</div>', re.DOTALL)
    # Used to strip inline tags and line breaks from a poem body.
    _TAG_OR_NEWLINE_RE = re.compile(r'<.*?>|\n')

    def __init__(self):
        """Set up the request headers, the MongoDB handle and the result list."""
        self.headers = {
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36"}
        self.client = MongoClient(host="127.0.0.1", port=27017)
        self.poetry = self.client['spider']['poetry']
        self.poetries = []

    def get_text(self, url, timeout=10):
        """Download ``url`` and return the response body as text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before ``requests``
                gives up. Without a timeout a stalled server would block
                the crawl indefinitely.

        Returns:
            The page decoded with the encoding detected from its content.
        """
        r = requests.get(url, headers=self.headers, timeout=timeout)
        # Trust the encoding detected from the body over the header default.
        r.encoding = r.apparent_encoding
        return r.text

    def parse_text(self, text):
        """Extract poems from a page and append them to ``self.poetries``.

        The page is split at every ``<div class="cont">`` marker and all
        fields of a poem are read from the same segment. This keeps title,
        dynasty, author and content of one record together even when some
        entry on the page lacks one of the elements. Entries that do not
        provide all four fields are skipped.

        Args:
            text: HTML source of one list page.
        """
        # Text before the first marker belongs to no entry, hence [1:].
        for entry in text.split(self._ENTRY_MARKER)[1:]:
            title = self._TITLE_RE.search(entry)
            source = self._SOURCE_RE.search(entry)
            body = self._CONTENT_RE.search(entry)
            if not (title and source and body):
                continue
            # The first link in the source paragraph is stored as the
            # dynasty and the second one as the author.
            links = self._LINK_RE.findall(source.group(1))
            if len(links) < 2:
                continue
            content = self._TAG_OR_NEWLINE_RE.sub("", body.group(1)).strip()
            self.poetries.append({
                'title': title.group(1),
                'dynasty': links[0],
                'author': links[1],
                'content': content,
            })

    def print_poetries(self):
        """Print every collected poem record, one per line."""
        for item in self.poetries:
            print(item)

    def save_to_db(self):
        """Insert all collected poems into the MongoDB collection.

        Does nothing when no poems were collected, because ``insert_many``
        rejects an empty document list.
        """
        if not self.poetries:
            return
        self.poetry.insert_many(self.poetries)

    def run(self):
        """Crawl list pages 1 to 10, then print and store the parsed poems."""
        template_url = "https://www.gushiwen.org/default_{}.aspx"
        for i in range(1, 11):
            url = template_url.format(i)
            text = self.get_text(url)
            self.parse_text(text)
        self.print_poetries()
        self.save_to_db()


# Script entry: create the crawler and run the whole fetch/parse/print/store
# sequence. There is no ``if __name__ == "__main__"`` guard, so this also
# executes when the module is imported.
p = Poetry()
p.run()
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章