# Python 小把戏之下载小说 (Python tricks: downloading novels)
import re
import sys
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
'''
下载小说 网站: 笔趣阁 http://www.biquku.la/
Parameter:
无
Returns:
无
Modify:
2019/01/28
'''
class book_downloader(object):
    '''
    Download novels from the Biquge site (http://www.biquku.la/).

    Each novel is appended, chapter by chapter, to a UTF-8 text file in the
    current working directory named "<novel title>.txt".
    '''

    # Home page scanned by get_book_url() for novel links.
    _HOME_URL = 'http://www.biquku.la'
    # Passed explicitly so parsing does not depend on which optional parsers
    # happen to be installed (and to avoid BeautifulSoup's guessed-parser
    # warning).
    _PARSER = 'html.parser'
    # Seconds to wait for the server; without it requests.get() may block
    # indefinitely on a stalled connection.
    _TIMEOUT = 30

    def __init__(self, url):
        '''
        Initialise the downloader.
        Parameter:
            url: URL of a novel's table-of-contents page on the site
        Returns:
            None
        '''
        self.__target = url
        self.__txtname = ''     # novel title, also used as the output file name
        self.__names = []       # chapter titles, parallel to __urls
        self.__urls = []        # absolute chapter URLs
        self.__nums = 0         # number of chapters found
        self.__book_url = []    # novel URLs collected from the home page

    @staticmethod
    def _absolute_url(base, href):
        '''Resolve href against base the way a browser would.'''
        # Plain string concatenation breaks for root-relative ("/18/1.html")
        # and already-absolute hrefs; urljoin handles every form.
        return urljoin(base, href)

    @staticmethod
    def _first_tag(node, name, **attrs):
        '''Return the first matching tag under node, or raise IndexError.'''
        tag = node.find(name, **attrs)
        if tag is None:
            # IndexError is what the previous find_all(...)[0] lookup raised;
            # keep the type but say what was missing.
            raise IndexError(
                'no <%s> element matching %r found in the page' % (name, attrs))
        return tag

    def get_download_url(self):
        '''
        Fetch the table-of-contents page and collect the novel title plus the
        title and URL of every chapter.
        Parameter:
            None
        Returns:
            None (results are stored on the instance)
        Raises:
            IndexError: the page lacks the expected div#info/h1 or div#list
        '''
        # NOTE(review): unlike get_txt(), the response encoding is not forced
        # to UTF-8 here -- confirm the index page decodes correctly.
        req = requests.get(url=self.__target, timeout=self._TIMEOUT)
        page = BeautifulSoup(req.text, self._PARSER)

        info = self._first_tag(page, 'div', id='info')
        # get_text() always yields a str, whereas .string is None when the
        # tag has nested markup.
        self.__txtname = self._first_tag(info, 'h1').get_text()

        # href=True skips anchors that carry no link at all.
        links = self._first_tag(page, 'div', id='list').find_all('a', href=True)

        # Rebuild (not extend) the lists so calling this twice does not
        # duplicate chapters.
        self.__names = [link.get_text() for link in links]
        self.__urls = [self._absolute_url(self.__target, link['href'])
                       for link in links]
        self.__nums = len(links)

    def get_txt(self, url):
        '''
        Fetch one chapter page and return its text.
        Parameter:
            url: URL of the chapter page
        Returns:
            text of div#content, with each run of four non-breaking spaces
            turned into a line break
        Raises:
            IndexError: the page has no div#content
        '''
        r = requests.get(url, timeout=self._TIMEOUT)
        r.encoding = 'utf-8'
        page = BeautifulSoup(r.text, self._PARSER)
        content = self._first_tag(page, 'div', id='content')
        return content.text.replace('\xa0' * 4, '\n ')

    def write_txt(self, name, txt):
        '''
        Append one chapter to the novel's text file.
        Parameter:
            name: chapter title
            txt: chapter text
        Returns:
            None
        '''
        with open(str(self.__txtname) + '.txt', 'a', encoding='utf-8') as f:
            f.write('\n\n' + name + '\n\n')
            # A single write(); writelines() on a str would emit it one
            # character at a time.
            f.write(' ' + txt.replace('\xa0' * 4, '\n '))

    def download(self):
        '''
        Download every chapter of the current target novel and append them to
        the local text file, printing a progress percentage to stdout.
        Parameter:
            None
        Returns:
            None
        '''
        self.get_download_url()
        chapters = zip(self.__names, self.__urls)
        for n, (name, url) in enumerate(chapters):
            # '\r' rewinds the cursor so the progress line updates in place.
            sys.stdout.write(" %s 已下载:%.3f%%" % (self.__txtname, float(n * 100 / self.__nums)) + '\r')
            sys.stdout.flush()
            self.write_txt(name, self.get_txt(url))

    def get_book_url(self):
        '''
        Collect the URLs of the novels linked from div.content blocks on the
        site's home page.
        Parameter:
            None
        Returns:
            None (results are stored on the instance)
        '''
        html = requests.get(url=self._HOME_URL, timeout=self._TIMEOUT)
        page = BeautifulSoup(html.text, self._PARSER)

        found = []
        for block in page.find_all('div', class_='content'):
            for href in re.findall('<a href="(.*?)">', str(block)):
                found.append(self._absolute_url(self._HOME_URL, href))

        # Replace rather than extend, and drop repeated links (keeping order),
        # so no novel is downloaded -- and appended to its file -- twice.
        self.__book_url = list(dict.fromkeys(found))

    def download_books(self):
        '''
        Download every novel linked from the site's home page.
        Parameter:
            None
        Returns:
            None
        '''
        self.get_book_url()
        for each in self.__book_url:
            # get_download_url() rebuilds the chapter lists for each target.
            self.__target = each
            self.download()
if __name__ == '__main__':
    # Script entry point. The URL handed to the constructor is only the
    # initial target: download_books() replaces it with each novel URL it
    # collects from the site's home page.
    downloader = book_downloader('http://www.biquku.la/18/18109/')
    downloader.download_books()