使用正則表達式爬取古詩文網
爬取目標
具體字段爲:
- title 標題
- dynasty 朝代
- author 作者
- content 內容
- tag 標籤
實現代碼
'''
@Description: 使用正則表達式爬取古詩文網
@Author: sikaozhifu
@Date: 2020-06-09 14:55:44
@LastEditTime: 2020-06-09 15:55:47
@LastEditors: Please set LastEditors
'''
import requests
import re
from lxml import etree
# Module-level accumulator: parse_url() appends one dict per poem here,
# and main() prints the list.
poems = []
def parse_url(url, timeout=10):
    """Fetch one listing page and append the poems found on it to ``poems``.

    The page is downloaded with ``requests``. Title, dynasty, author and
    content are extracted with regular expressions; the tag line is read
    through an lxml XPath query. One dict per poem, with the keys
    ``title``, ``dynasty``, ``author``, ``content`` and ``tag``, is appended
    to the module-level ``poems`` list.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server to respond before
            ``requests`` gives up and raises. Defaults to 10.

    Returns:
        None. Results are accumulated in ``poems``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
    }
    # requests applies no timeout by default, so without one a stalled
    # server would block the whole script forever.
    # NOTE: the HTTP status code is not checked; an error page is parsed
    # like any other response.
    response = requests.get(url, headers=headers, timeout=timeout)
    text = response.text

    # re.DOTALL lets '.' match newlines, because the markup for a single
    # poem spans several lines.
    # Title: first <b>...</b> after each <div class="cont">.
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    # Dynasty: text of the first link after each <p class="source">.
    dynasties = re.findall(r'<p\sclass="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    # Author: text of the second link after each <p class="source">.
    authors = re.findall(r'<p\sclass="source">.*?<a.*?>.*?</a>.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    # Content: inner HTML of each <div class="contson" ...>.
    contents_temp = re.findall(r'<div\sclass="contson".*?>(.*?)</div>', text, re.DOTALL)
    # Drop the inline markup left inside the poem body and trim the ends.
    contents = [re.sub('<.*?>', '', raw).strip() for raw in contents_temp]

    # Tags are read with lxml instead of a regex.
    html = etree.HTML(text)
    # string(.) concatenates all text under the node; spaces, carriage
    # returns and line feeds are then removed from the tag field.
    tags = [
        node.xpath('string(.)').replace('\n', '').replace('\r', '').replace(' ', '')
        for node in html.xpath('//div[@class = "tag"]')
    ]

    # NOTE: the five lists are paired purely by position and zip() stops at
    # the shortest one. If one pattern matches a different number of times
    # than the others, surplus items are dropped and later fields can end up
    # paired with the wrong poem.
    for title, dynasty, author, content, tag in zip(titles, dynasties, authors, contents, tags):
        poems.append({
            'title': title,
            'dynasty': dynasty,
            'author': author,
            'content': content,
            'tag': tag,
        })
def main():
    """Scrape listing pages 1 through 10, then print the collected poems."""
    page_url = 'https://www.gushiwen.org/default_%s.aspx'
    page_numbers = range(1, 11)  # there are 10 pages in total
    for page in page_numbers:
        parse_url(page_url % page)
    print(poems)
# Start scraping only when this file is run directly, not when it is imported.
if __name__ == "__main__":
    main()