# 爬取一本三國演義 — scrape the full novel "Romance of the Three Kingdoms"
import urllib.request
from bs4 import BeautifulSoup
import time

# Site root and table-of-contents page for the novel.
BASE_URL = 'http://www.shicimingju.com'
TOC_URL = 'http://www.shicimingju.com/book/sanguoyanyi.html'
# Browser-like User-Agent so the site serves the normal page.
HEADERS = {
    'User-Agent': ' Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}


def _fetch_soup(url):
    """Download *url* with the shared headers and return a parsed BeautifulSoup.

    The response is closed via the context manager (the original leaked it).
    """
    request = urllib.request.Request(url=url, headers=HEADERS)
    with urllib.request.urlopen(request) as response:
        return BeautifulSoup(response.read(), 'html.parser')


def main():
    """Scrape every chapter listed in the TOC and append it to 三國演義.txt."""
    # Parse the table of contents: one <a> per chapter with title + relative link.
    toc_soup = _fetch_soup(TOC_URL)
    chapter_links = toc_soup.select('.book-mulu > ul > li > a')

    # `with` guarantees the file is closed even if a request fails mid-run
    # (the original `open`/`close` pair leaked the handle on any exception).
    with open('三國演義.txt', 'w', encoding='utf8') as fp:
        for anchor in chapter_links:
            title = anchor.text
            print('正在爬取--%s--....' % title)
            # Chapter links are site-relative; join with the site root.
            chapter_url = BASE_URL + anchor['href']
            chapter_soup = _fetch_soup(chapter_url)
            # select_one + None check instead of select(...)[0], so one
            # malformed chapter page skips instead of crashing the whole run.
            content_node = chapter_soup.select_one('.chapter_content')
            if content_node is None:
                continue
            fp.write(title + content_node.text)
            print('結束爬取--%s--' % title)
            # Be polite to the server between chapter requests.
            time.sleep(2)


if __name__ == '__main__':
    main()