書荒閣小說爬取

把 dir 改成書荒閣某本小說的目錄頁網址,name 改為保存文件的名字(程序會自動加上 .txt 後綴),就可以把整本小說爬下來了。

from bs4 import BeautifulSoup
import requests

headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}

def download(url, path):
    """Fetch one chapter page and append its title and body text to a file.

    The response is decoded as GBK, the text of ``<div id="content">`` is
    extracted, and the page title followed by that text is appended to
    ``path``.

    Args:
        url: Address of the chapter page to fetch.
        path: File the chapter is appended to (append mode, UTF-8).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page contains no ``<div id="content">`` element.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    res = requests.get(url, headers=headers, timeout=30)
    # Fail on 4xx/5xx instead of saving an error page as chapter text.
    res.raise_for_status()
    res.encoding = 'gbk'
    bs = BeautifulSoup(res.text, 'html.parser')
    tag = bs.find('div', {'id': 'content'})
    if tag is None:
        raise ValueError('no <div id="content"> found in ' + url)
    # Fall back to the URL as a heading when the page has no <title>.
    title = bs.title.get_text() if bs.title is not None else url
    # Everything is extracted before the file is opened, so a malformed page
    # never leaves a title without its chapter body in the output.
    with open(path, 'a', encoding='utf-8') as f:
        f.write('\n\n' + title + '\n\n' + tag.get_text())


def getdir(url, baseurl):
    """Yield the chapter URLs listed on a novel's table-of-contents page.

    The ``<dt>`` and ``<dd>`` elements inside ``<div id="list">`` are taken in
    document order. Starting from the second element, the first ``<dt>`` found
    acts as a marker; only the entries after it are yielded.

    Args:
        url: Address of the table-of-contents page.
        baseurl: Prefix prepended to each link's ``href``.

    Yields:
        ``baseurl + href`` for every entry after the marker that has a link.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page has no ``<div id="list">`` element, or no
            marker ``<dt>`` can be found in it.
    """
    # A timeout keeps a stalled connection from hanging the crawl.
    res = requests.get(url, headers=headers, timeout=30)
    res.raise_for_status()
    bs = BeautifulSoup(res.text, 'html.parser')
    listing = bs.find('div', {'id': 'list'})
    if listing is None:
        raise ValueError('no <div id="list"> found in ' + url)
    tags = listing.find_all(['dd', 'dt'])

    # Bounded search for the marker <dt>; index 0 is deliberately skipped.
    marker = next(
        (i for i in range(1, len(tags)) if tags[i].name == 'dt'), None)
    if marker is None:
        raise ValueError('no chapter-list heading (<dt>) found in ' + url)

    for entry in tags[marker + 1:]:
        link = entry.a
        # Headings and empty rows carry no link; skip them instead of crashing.
        if link is None or not link.get('href'):
            continue
        yield baseurl + link['href']





# Crawl configuration. To choose the book interactively instead of
# hard-coding it, read both values from stdin:
#   dir = input()
#   name = input()
baseurl = 'https://www.shuhuangge.org'
dir = 'https://www.shuhuangge.org/0_71/'
name = '我欲封天'

# Every chapter is appended, in listing order, to one "<name>.txt" file.
for chapter_url in getdir(dir, baseurl):
    download(chapter_url, f'{name}.txt')
    # Print each finished chapter URL as a simple progress indicator.
    print(chapter_url)
    # Optional: pause between requests here (e.g. a one-second sleep, after
    # importing the time module) to go easier on the server.


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章