python爬蟲開發之“讀文章”網頁爬取

目標：

將小說網站上所有的青春校園小說保存到本地磁盤。（以小說標題爲目錄名，每個章節保存爲一個文件）

小說網站入口：

http://xs.duwenzhang.com/list1/

前兩天發表的那篇博客雖然能夠成功抓取，但是還存在着一些小問題，例如亂碼。

今天的我，邏輯比較清晰，索性就把它給修改了。

開發的過程中，主要出現了以下幾個問題：

由於從入口地址只能夠抓取到7頁小說列表所在的url，並且這些url中是有重複的（爲什麼呢？這是因爲首頁的url和入口的url是一樣的），所以需要把首頁爬取下來的url放入集合中去重。那麼怎麼才能夠抓取到全部19頁（青春校園小說一共有19頁）的url呢？我是先將首頁上的url抓取下來，再通過自己構造的函數，再一次地去抓取這些頁面上的url。（目前我的水平是這樣，如果有大佬能夠指點一下我，我將萬分感謝）
抓取下來的小說標題作爲目錄名時，需要注意：有的小說名很長……另外，小說名中還存在着創建目錄時不能使用的字符，例如 ':'、'\t' 等等，都需要將其替換掉。
還有就是有的小說內容是不存在的！因爲源網站上的資源找不到了（大概是內容被和諧了），所以通過BeautifulSoup獲取到的Tag對象需要先判斷是否爲空（None），不然會報錯。
最後附上源代碼，望大佬們指正

import os
import re
import urllib.request
from bs4 import BeautifulSoup


# Goal: save every campus-romance novel on this site to local disk
# (one directory per novel title, one text file per chapter).

def get_url_list(url):
    """Collect the listing-page URLs linked from the entry page.

    Args:
        url: Absolute URL of the category entry page
            (for example ``http://xs.duwenzhang.com/list1/``).

    Returns:
        A set containing ``url`` itself plus the absolute URL of every
        ``<a>`` on that page whose href matches ``/list1/<digits>.html``.

    Raises:
        urllib.error.URLError: If the entry page cannot be fetched.
    """
    from urllib.parse import urljoin

    # The entry page is itself the first listing page.
    page_urls = {url}
    # Parse inside the ``with`` so the HTTP response is closed deterministically.
    with urllib.request.urlopen(url) as response:
        soup = BeautifulSoup(response, 'html.parser')
    # The dot is escaped so only a literal ".html" suffix matches.
    for link in soup.find_all('a', href=re.compile(r'/list1/[0-9]+\.html')):
        # urljoin leaves absolute hrefs untouched and resolves relative ones
        # against the page they came from, so every result can be fetched.
        page_urls.add(urljoin(url, link['href']))
    return page_urls


def get_full_urlList(url_list):
    """Expand a seed collection of listing pages into every reachable one.

    Each page is fetched once; every ``/list1/<digits>.html`` link found on
    it that has not been seen before is queued and fetched in turn.

    Args:
        url_list: Iterable of absolute listing-page URLs to start from.

    Returns:
        A set with the seed URLs plus every listing-page URL discovered.
        An empty input yields an empty set without any network access.

    Raises:
        urllib.error.URLError: If a listing page cannot be fetched.
    """
    from collections import deque
    from urllib.parse import urljoin

    # Compiled once instead of once per page; the dot is escaped so only a
    # literal ".html" suffix matches.
    link_pattern = re.compile(r'/list1/[0-9]+\.html')

    pending = deque(url_list)
    # A set gives O(1) "already seen?" checks (the list scan was O(n)).
    seen = set(pending)
    while pending:
        page_url = pending.popleft()
        # Parse inside the ``with`` so the HTTP response is always closed.
        with urllib.request.urlopen(page_url) as response:
            # Encoding kept as in the original; only link hrefs are read
            # from this soup.
            soup = BeautifulSoup(response, 'html.parser',
                                 from_encoding='iso-8859-1')
        for link in soup.find_all('a', href=link_pattern):
            # Resolve relative hrefs so the queued URL can be opened later.
            target = urljoin(page_url, link['href'])
            if target not in seen:
                seen.add(target)
                pending.append(target)
    return seen


def Create_novel_dir_and_href(url_list, base_dir='F:/爬蟲文件/青春校園小說/'):
    """Create one directory per novel title and collect the novels' URLs.

    For every listing page, each ``<h2>`` text becomes a directory name under
    ``base_dir`` and each ``<a>`` whose href matches ``/1/<digits>/`` is
    recorded as a novel entry URL.

    Args:
        url_list: Iterable of absolute listing-page URLs.
        base_dir: Directory under which the per-novel directories are
            created. A trailing path separator is added if it is missing.
            Defaults to the location the script has always used.

    Returns:
        A set of absolute novel entry URLs.

    Raises:
        urllib.error.URLError: If a listing page cannot be fetched.
        OSError: If a directory cannot be created.
    """
    from urllib.parse import urljoin

    novel_link = re.compile(r'/1/[0-9]+/')  # compiled once, not per page
    # Normalise so plain concatenation below works with or without a
    # trailing separator; the default value is left unchanged by this.
    root = os.path.join(base_dir, '')

    href_set = set()
    for url in url_list:
        # Parse inside the ``with`` so the HTTP response is always closed.
        with urllib.request.urlopen(url) as response:
            soup = BeautifulSoup(response, 'html.parser', from_encoding='gbk')
        for href in soup.find_all('a', href=novel_link):
            # Resolve against the page URL instead of gluing on a fixed host,
            # which also keeps already-absolute hrefs intact.
            href_set.add(urljoin(url, href['href']))
        for title in soup.find_all('h2'):
            # '|' is not allowed in a directory name, so it becomes a space;
            # the name is capped at 20 characters. get_chapter_list rebuilds
            # this same name, so the naming rule must not drift.
            correct_title = title.get_text().replace('|', ' ')[:20]
            # makedirs also creates missing parents and, with exist_ok, does
            # not fail when the directory is already there.
            os.makedirs(root + correct_title, exist_ok=True)
    print('小說目錄創建成功！')
    return href_set


# Characters that are not allowed in a Windows file name, plus tab/newline
# whitespace that may come along with the scraped chapter title.
_ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\t\r\n]')


def get_chapter_list(url_list, base_dir='F:/爬蟲文件/青春校園小說/'):
    """Download every chapter of every novel and save each as a text file.

    For each novel entry page the ``<h1>`` text (capped at 12 characters)
    selects the novel directory, and every ``<a>`` whose href matches
    ``/1/<digits-and-slashes>.html`` is treated as a chapter page. A chapter
    is written to ``<novel dir>/<chapter title>.txt`` when the page has both
    a ``<dt>`` (title) and a ``<dd>`` (content) element.

    Pages that answer with an HTTP error status, novel pages without an
    ``<h1>`` and chapter pages without ``<dt>``/``<dd>`` are skipped.

    Args:
        url_list: Iterable of absolute novel entry URLs.
        base_dir: Directory that holds the per-novel directories. A trailing
            path separator is added if it is missing. Defaults to the
            location the script has always used.

    Returns:
        None.

    Raises:
        urllib.error.URLError: On connection-level failures (HTTP error
            statuses are reported and skipped instead).
        OSError: If a directory or file cannot be written.
    """
    from urllib.error import HTTPError
    from urllib.parse import urljoin

    # Compiled once; the dot is escaped so only a literal ".html" matches.
    chapter_link = re.compile(r'/1/[0-9/]+\.html')
    root = os.path.join(base_dir, '')

    for url in url_list:
        try:
            # Parse inside the ``with`` so the response is always closed.
            with urllib.request.urlopen(url) as response:
                soup = BeautifulSoup(response, 'html.parser',
                                     from_encoding='gbk')
        except HTTPError as err:
            print('頁面無法訪問，已跳過：%s (HTTP %s)' % (url, err.code))
            continue

        heading = soup.find('h1')
        if heading is None:
            # No title means no directory name can be derived.
            continue
        # Titles are capped at 12 characters; together with the fixed
        # 8-character prefix this mirrors the 20-character directory names
        # produced by Create_novel_dir_and_href.
        title = heading.get_text()[:12]
        dir_path = (root + '[青春 校園] ' + title).strip()
        # Do not rely on the directory having been created beforehand.
        os.makedirs(dir_path, exist_ok=True)

        # Resolve hrefs against the novel page so relative links are
        # fetchable; the set removes duplicate chapter links.
        chapter_href = {
            urljoin(url, link['href'])
            for link in soup.find_all('a', href=chapter_link)
        }
        for page_url in chapter_href:
            try:
                with urllib.request.urlopen(page_url) as response:
                    page = BeautifulSoup(response, 'html.parser',
                                         from_encoding='gbk')
            except HTTPError as err:
                print('頁面無法訪問，已跳過：%s (HTTP %s)' % (page_url, err.code))
                continue

            page_title = page.find(name='dt')
            page_content = page.find(name='dd')
            if page_title is None or page_content is None:
                # The chapter is missing on the source site.
                continue

            # The old pattern used "/?" (an optional slash), so '?' and other
            # illegal characters were never removed from the file name.
            page_title = _ILLEGAL_FILENAME_CHARS.sub('', page_title.get_text())
            page_path = dir_path + '/' + page_title + '.txt'
            # errors='ignore' silently drops characters GBK cannot encode.
            with open(page_path, 'w', encoding='gbk', errors='ignore') as f:
                f.write(page_content.get_text())
            print('成功寫入了小說：%s,小說章節：%s' % (title, page_title))
    print('爬取成功')


if __name__ == '__main__':
    # Root URL: the first listing page of the campus-novel category.
    entry_url = 'http://xs.duwenzhang.com/list1/'
    # Listing pages linked from the entry page (the entry page included).
    seed_pages = get_url_list(entry_url)
    # Follow those pages to discover the remaining listing pages.
    all_pages = get_full_urlList(seed_pages)
    # Create the novel directories and collect each novel's entry URL.
    novel_urls = Create_novel_dir_and_href(all_pages)
    # Download and store the chapters of every novel.
    get_chapter_list(novel_urls)