# 1.4 爬蟲-筆趣閣獲取小說例子

#筆趣閣網站
# 1.模擬搜索
# 2.圖書查詢-章節
# 3.獲取章節-內容
# 4.本地存儲:txt、mysql、

def searchBook():
    """Prompt for a book name, search biqukan.cc and cache the results.

    Populates the module-level ``book_dict`` (book title -> detail-page
    URL).  Relies on the module globals ``url_one``, ``head`` and ``time``
    defined in the ``__main__`` block.
    """
    print("****************筆趣閣小說圖書下載*****************")
    print("****************作者:一個低調的人*****************")
    bookName = input("請輸入圖書的名稱: ")
    # The site expects GBK-encoded query parameters; sending a raw Chinese
    # string (UTF-8) would garble the search keyword.
    bookName = bookName.encode("gbk")
    # Use the shared `time` timeout constant for consistency with the other
    # request calls in this script (was a hard-coded 10).
    resp = requests.get(url=url_one, params={"searchkey": bookName},
                        headers=head, timeout=time)
    if resp.status_code == 200:
        resp.encoding = "gbk"
        print(resp.text)  # debug: dump the raw search-result page
        soup = BeautifulSoup(resp.text, "html.parser")
        # Attribute access (soup.title / soup.img / soup.a) returns the
        # FIRST matching tag in the document.
        title = soup.title
        print(title)
        img = soup.img
        print(img)
        a = soup.a
        print(a)
        # .string extracts the tag's text content
        print(title.string, img, a.string)
        # attrs is a dict of the tag's attributes; get() avoids KeyError
        print(img.attrs.get("src"))
        # Each search hit lives in a <div class="caption"> element.
        div_list = soup.find_all(name="div", attrs={"class": "caption"})
        for div in div_list:
            bookname = div.h4.a.string
            bookurl = div.h4.a.attrs.get("href")
            bookauthor = div.small.string
            bookdir = div.p.string
            # all four fields must be present before the entry is usable
            if (bookname is not None and bookurl is not None
                    and bookauthor is not None and bookdir is not None):
                # BUG FIX: str.replace returns a NEW string; the original
                # code discarded the result so no stripping ever happened.
                bookname = bookname.replace(" ", "")
                bookurl = bookurl.replace(" ", "")
                bookauthor = bookauthor.replace(" ", "")
                bookdir = bookdir.replace(" ", "")
                print(bookname + "\n", bookurl + "\n", bookauthor + "\n", bookdir + "\n")
                # remember the book so getBookChapter() can fetch it later
                book_dict[bookname] = bookurl
    else:
        print("錯誤!重新開始")
        # retry by recursion; NOTE(review): unbounded if the site keeps failing
        searchBook()

def getBookChapter():
    """Prompt for a previously-found book title and download its chapters.

    Looks the title up in the module-level ``book_dict`` and hands every
    chapter link to ``getBookChapterContent``.
    """
    bookname = input("請輸入已找到的圖書的名稱: ")
    # membership test on the dict itself; .keys() was redundant
    if bookname in book_dict:
        resp = requests.get(url=book_dict[bookname], headers=head, timeout=time)
        if resp.status_code == 200:
            resp.encoding = "gbk"
            soup = BeautifulSoup(resp.text, "html.parser")
            title = soup.title.string  # already a plain string
            # BUG FIX: the original called `title.string` again, but
            # NavigableString has no .string attribute (AttributeError).
            print(title)
            # chapter links sit in <dd class="col-md-3"> entries
            dd_list = soup.find_all(name="dd", attrs={"class": "col-md-3"})
            for dd in dd_list:
                try:
                    chapter = dd.a.attrs.get("title")
                    chapterUrl = dd.a.attrs.get("href")
                    print(chapter, chapterUrl)
                    getBookChapterContent(chapter, chapterUrl,
                                          book_dict[bookname], bookname)
                except Exception:
                    # a malformed <dd> (e.g. missing <a>) is skipped
                    continue
    else:
        print("錯誤!重新開始")
        # NOTE(review): recursion retries forever on repeated bad input
        getBookChapter()

def getBookChapterContent(chapter, chapterUrl, bookUrl, bookname):
    """Download a single chapter page and persist its text.

    chapter    -- chapter title (used as the output file name)
    chapterUrl -- absolute or book-relative chapter URL
    bookUrl    -- book detail-page URL, used to absolutize chapterUrl
    bookname   -- book title, used for the output directory
    """
    # relative chapter links need the book URL prepended
    if "http" not in chapterUrl:
        chapterUrl = bookUrl + chapterUrl

    # headers/timeout added for consistency with the other requests
    resp = requests.get(url=chapterUrl, headers=head, timeout=time)
    # BUG FIX: was `resp.ststus_code` (typo) which raised AttributeError
    if resp.status_code == 200:
        resp.encoding = "gbk"
        soup4 = BeautifulSoup(resp.text, "html.parser")
        # find() returns a single Tag (or None), not a list.
        # NOTE(review): the attrs key "div" looks wrong — a selector such
        # as {"id": "htmlContent"} was probably intended; kept as-is
        # pending verification against the live page.
        div = soup4.find(name="div", attrs={"div": "htmlContent"})
        # guard: find() yields None when nothing matches (original crashed)
        if div is not None:
            text = div.text
            if text:  # skip empty chapters
                text = text.replace("<br/", "\n")  # convert breaks to newlines
                saveTxt(text, bookname, chapter)   # persist to disk
                chapter_dict[chapter] = text       # cache for saveCsv()
    else:
        print(bookname + "下載失敗!")

def saveTxt(text, bookname, chapter):
    """Write one chapter's text to ``小說<bookname>/<chapter>.txt``.

    text     -- chapter body (str)
    bookname -- book title, used to build the output directory name
    chapter  -- chapter title, used as the file name
    """
    path = "小說" + bookname
    if not os.path.exists(path):
        # BUG FIX: the original called os.mkdir() AND os.makedirs() on the
        # same path — the second call raised FileExistsError.  makedirs
        # alone also creates any missing intermediate directories.
        os.makedirs(path)
    # BUG FIX: binary mode ("wb") accepts no encoding argument (ValueError)
    # and cannot take a str payload; open in text mode with UTF-8 instead.
    # `with` guarantees the handle is flushed and closed.
    with open(path + "/" + chapter + ".txt", "w", encoding="utf-8") as file:
        file.write(text)

# csv可與數據庫之間互相導入
# csv files can be imported into / exported from a database
def saveCsv():
    """Dump every cached chapter (module-level ``chapter_dict``) to test.csv."""
    # BUG FIX: the header was a set literal (unordered); a list keeps the
    # column order stable.
    headers = ["章節名稱", "內容"]
    # BUG FIX: rows was a dict, but .append() was called on it below
    rows = []
    for key in chapter_dict:
        text = chapter_dict[key]
        row = [key, text]  # one row: chapter title + body
        rows.append(row)
        print("存儲中", row)
    # newline="" is required by the csv module so Windows doesn't get a
    # blank line between rows; `with` closes the file.
    with open("test.csv", "w", encoding="utf-8", newline="") as file:
        # BUG FIX: `csv.file.write(file)` does not exist — csv.writer()
        # creates the writer object.
        f_csv = csv.writer(file)
        f_csv.writerow(headers)  # single header row
        # BUG FIX: writerows (plural) writes a list of rows; writerow
        # would have emitted the whole list as one malformed row.
        f_csv.writerows(rows)
    print("創建表格成功")

#python 入口
if __name__ == '__main__':
    url_one = "https://www.biqukan.cc/modules/article/search.php"
    head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
    }
    time = 15
    import requests #請求
    from bs4 import BeautifulSoup # 解析類
    import os #管理目錄:創建,管理,修改
    import csv # csv管理

    book_dict = {}  # 存儲圖書的名稱:路徑
    chapter_dict = {}  # 章節:內容
    searchBook()
    getBookChapter()
    # saveCsv()
    pass
# (以下為博客頁面殘留文字,非程式碼)
# 發表評論
# 所有評論
# 還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
# 相關文章