Python 之下載小說

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import time
import os, sys
from pathlib import Path
# Downloader class
class downloader(object):
    """Scrape a novel's chapter index and save every chapter as a text file.

    Typical use: call ``get_download_url()`` to fill ``names`` / ``urls`` /
    ``nums``, call ``mkdirdir()`` to choose the output folder, then feed each
    chapter through ``get_contents()`` and ``writer()``.

    Pages are decoded as GBK, which is what the original script assumed for
    the target site.
    """

    # Seconds to wait on the server before a request is abandoned; without a
    # timeout a stalled connection would block the whole download forever.
    timeout = 30

    def __init__(self):
        self.server = 'http://www.xuliehao.org/'
        self.target = 'http://www.xuliehao.org/novel20931/'
        self.names = []  # chapter titles
        self.urls = []   # chapter links
        self.nums = 0    # number of chapters

    # Collect the chapter download addresses
    def get_download_url(self, skip: int = 15) -> None:
        """Fetch the index page and record each chapter's title and URL.

        The chapter links are the ``<a>`` tags inside the element whose id is
        ``list``. Results replace (rather than extend) ``self.names`` and
        ``self.urls`` so that repeated calls cannot leave them out of step
        with ``self.nums``.

        Args:
            skip: Number of leading links in the list to ignore. Defaults to
                15, the value the script was written with.
                NOTE(review): presumably these are "latest chapter" shortcuts
                that duplicate real entries -- confirm against the site.

        Raises:
            IndexError: If the page has no element with id ``list``.
        """
        from urllib.parse import urljoin

        req = requests.get(url=self.target, timeout=self.timeout)
        req.encoding = 'gbk'
        # Name the parser explicitly, matching get_contents(), instead of
        # letting BeautifulSoup guess one (and warn about it).
        soup = BeautifulSoup(req.text, 'lxml')
        listing = soup.find(id="list")
        if listing is None:
            raise IndexError(
                'no element with id="list" found at ' + self.target)

        names = []
        urls = []
        for anchor in listing.find_all('a')[skip:]:
            href = anchor.get('href')
            if not href:
                # An anchor without a target cannot be downloaded.
                continue
            # get_text() still yields a string when the anchor wraps nested
            # tags, where .string would be None and break writer().
            names.append(anchor.get_text())
            # urljoin copes with relative, root-relative and absolute hrefs
            # without producing a doubled slash.
            urls.append(urljoin(self.server, href))

        self.names = names
        self.urls = urls
        self.nums = len(names)

    # Fetch one chapter's text
    def get_contents(self, target: str) -> str:
        """Return the text of the element with id ``content`` at *target*.

        Args:
            target: URL of a chapter page.

        Raises:
            IndexError: If the page has no element with id ``content``.
        """
        req = requests.get(url=target, timeout=self.timeout)
        req.encoding = 'gbk'
        soup = BeautifulSoup(req.text, 'lxml')
        content = soup.find(id="content")
        if content is None:
            raise IndexError(
                'no element with id="content" found at ' + str(target))
        return content.text

    # Create the output folder
    def mkdirdir(self, dirname: str) -> None:
        """Ensure ``<cwd>/<dirname>`` exists and remember it as ``self.path``.

        Prints a notice instead of failing when the folder is already there.

        Args:
            dirname: Folder name, relative to the current working directory.
        """
        full_path = os.path.join(os.getcwd(), dirname)
        if Path(full_path).is_dir():
            print (full_path + " is exist")
        else:
            os.mkdir(full_path, 0o777)
        self.path = full_path

    # Write the scraped chapter text to a file
    def writer(self, name: str, text) -> None:
        """Append one chapter to ``<self.path>/<name>.txt`` (UTF-8).

        Spaces are removed from the chapter name only; the directory part of
        the path is left untouched so that folders containing spaces keep
        working. The title written inside the file keeps its spaces.

        Args:
            name: Chapter title, used both as file name and as heading.
            text: Chapter body, a string or an iterable of strings.
        """
        filename = name.replace(' ', '') + ".txt"  # drop spaces in the title
        file_path = os.path.join(self.path, filename)
        print (file_path)
        body = text if isinstance(text, str) else "".join(text)
        with open(file_path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.write(body)
            f.write('\n\n')
# Entry point
if __name__ == "__main__":
    # Build the chapter index and prepare the output folder first.
    dl = downloader()
    dl.get_download_url()
    dl.mkdirdir("testStory")

    # Then fetch and save the chapters one at a time, in index order.
    for title, link in zip(dl.names, dl.urls):
        chapter_text = dl.get_contents(link)
        dl.writer(title, chapter_text)

1. 小說目錄在 HTML 中對應的標籤如下(即 id="list" 的元素,各章節鏈接是其中的 <a> 標籤):

2. 每章小說內容在 HTML 中對應的標籤如下(即 id="content" 的元素):

 

 

 

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章