# Python tricks: downloading novels (Python 小把戲之下載小說)
import re
import sys
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
'''
Download novels from the Biquge site: http://www.biquku.la/
Parameter:
    None
Returns:
    None
Modify:
    2019/01/28
'''
class book_downloader(object):
    '''
    Download novels from the Biquge site (http://www.biquku.la/) into local
    text files.

    An instance is bound to one novel catalogue page at a time. download()
    fetches every chapter linked from that page and appends it to
    "<novel title>.txt" (path relative to the current working directory).
    download_books() does the same for every novel linked from the site's
    front page, rebinding the instance to each novel in turn.
    '''

    # Front page scanned by get_book_url() for featured novels.
    _HOME_URL = 'http://www.biquku.la'
    # Seconds before an HTTP request is abandoned, so a stalled server
    # cannot hang the whole run.
    _TIMEOUT = 30
    # Naming the parser keeps the parse result independent of which optional
    # parsers happen to be installed (and silences bs4's warning).
    _PARSER = 'html.parser'
    # Matches anchors whose only attribute is href.
    _HREF_RE = re.compile(r'<a href="(.*?)">')

    def __init__(self, url):
        '''
        Bind the downloader to a novel catalogue page.
        Parameter:
            url: URL of the novel's chapter-list page on the Biquge site
        Returns:
            None
        Modify:
            2019/01/28
        '''
        self.__target = url
        self.__txtname = ''
        self.__names = []
        self.__urls = []
        self.__nums = 0
        self.__book_url = []

    @staticmethod
    def _absolute_url(base, href):
        '''Resolve href against base using standard URL joining rules.'''
        return urljoin(base, href)

    def _fetch_soup(self, url, encoding=None):
        '''Fetch url and return its body parsed as a BeautifulSoup tree.

        When encoding is given it overrides the encoding requests would
        otherwise pick for decoding the response body.
        '''
        response = requests.get(url=url, timeout=self._TIMEOUT)
        if encoding is not None:
            response.encoding = encoding
        return BeautifulSoup(response.text, self._PARSER)

    def get_download_url(self):
        '''
        Collect the novel title plus the name and URL of every chapter
        listed on the current catalogue page.
        Parameter:
            None
        Returns:
            None (results are stored on the instance)
        Raises:
            IndexError: the page lacks the expected <div id="info"> / <h1>
                or <div id="list"> elements
        Modify:
            2019/01/28
        '''
        soup = self._fetch_soup(self.__target)

        info = soup.find('div', id='info')
        title = info.find('h1') if info is not None else None
        if title is None:
            raise IndexError('no <div id="info"> with an <h1> title at %s'
                             % self.__target)
        chapter_list = soup.find('div', id='list')
        if chapter_list is None:
            raise IndexError('no <div id="list"> chapter list at %s'
                             % self.__target)

        self.__txtname = title.string

        # Build fresh lists instead of appending to the old ones so calling
        # this method (or download()) twice does not duplicate chapters.
        names = []
        urls = []
        for anchor in chapter_list.find_all('a'):
            href = anchor.get('href')
            if href is None:
                # An anchor without a target cannot be downloaded.
                continue
            # get_text() also works when the anchor wraps nested tags,
            # where .string would be None.
            names.append(anchor.get_text())
            # Hrefs may be relative, root-relative or absolute; resolve
            # them against the catalogue page rather than concatenating.
            urls.append(self._absolute_url(self.__target, href))

        self.__names = names
        self.__urls = urls
        self.__nums = len(urls)

    def get_txt(self, url):
        '''
        Fetch the text of one chapter.
        Parameter:
            url: URL of the chapter page
        Returns:
            The chapter text, with each run of four non-breaking spaces
            replaced by a newline followed by a space
        Raises:
            IndexError: the page has no <div id="content"> element
        Modify:
            2019/01/28
        '''
        soup = self._fetch_soup(url, encoding='utf-8')
        content = soup.find('div', id='content')
        if content is None:
            raise IndexError('no <div id="content"> at %s' % url)
        return content.text.replace('\xa0' * 4, '\n ')

    def write_txt(self, name, txt):
        '''
        Append one chapter (heading and body) to the novel's text file.
        Parameter:
            name: chapter title
            txt: chapter body
        Returns:
            None
        Modify:
            2019/01/28
        '''
        with open(str(self.__txtname) + '.txt', 'a', encoding='utf-8') as f:
            f.write('\n\n' + name + '\n\n')
            # The replace is repeated here because callers may pass text
            # that did not come from get_txt().
            f.write(' ' + txt.replace('\xa0' * 4, '\n '))

    def download(self):
        '''
        Download every chapter of the current novel and append it to the
        novel's text file, printing the progress on a single console line.
        Parameter:
            None
        Returns:
            None
        Modify:
            2019/01/28
        '''
        self.get_download_url()
        for n, (name, url) in enumerate(zip(self.__names, self.__urls)):
            # '\r' returns the cursor to column 0 so the next update
            # overwrites this one.
            sys.stdout.write(" %s 已下載:%.3f%%" % (self.__txtname, float(n * 100 / self.__nums)) + '\r')
            sys.stdout.flush()
            self.write_txt(name, self.get_txt(url))

    def get_book_url(self):
        '''
        Collect the URLs linked from the <div class="content"> blocks of
        the http://www.biquku.la/ front page.
        Parameter:
            None
        Returns:
            None (results are stored on the instance)
        Modify:
            2019/01/31
        '''
        soup = self._fetch_soup(self._HOME_URL)
        hrefs = []
        for block in soup.find_all('div', class_='content'):
            hrefs += self._HREF_RE.findall(str(block))
        # Resolve relative links against the front page, and drop repeats
        # (keeping first-seen order) so no book is appended to its file
        # twice. Assigning a new list also keeps repeated calls from
        # accumulating stale entries.
        self.__book_url = list(dict.fromkeys(
            self._absolute_url(self._HOME_URL, href) for href in hrefs))

    def download_books(self):
        '''
        Download every novel linked from the http://www.biquku.la/ front
        page. The URL given to the constructor is not used here: the
        instance is rebound to each discovered novel in turn.
        Parameter:
            None
        Returns:
            None
        Modify:
            2019/01/31
        '''
        self.get_book_url()
        for book_url in self.__book_url:
            self.__target = book_url
            self.download()
if __name__ == '__main__':
    # Script entry point: build a downloader seeded with one catalogue URL,
    # then fetch every book linked from the site's front page.
    # download_books() rebinds the downloader to each discovered book.
    start_url = 'http://www.biquku.la/18/18109/'
    downloader = book_downloader(start_url)
    downloader.download_books()