1.4 Crawler Example: Downloading Novels from 笔趣阁

# The 笔趣阁 novel site
# 1. Simulate the search request
# 2. Query the book and list its chapters
# 3. Fetch each chapter's content
# 4. Store locally: txt, CSV, MySQL (see the pymysql sketch after saveCsv())

def searchBook():
    print("**************** 笔趣阁 novel downloader *****************")
    print("**************** Author: 一个低调的人 *****************")
    bookName = input("Enter a book title to search for: ")
    # 1. Escape the characters: Chinese is garbled in a URL unless encoded,
    #    and this site expects a GBK-encoded query
    bookName = bookName.encode("gbk")
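    # For reference, requests will percent-encode these GBK bytes when it
    # builds the query string; urllib.parse.quote shows the same
    # transformation directly (a sketch, the sample title is illustrative):
    #   from urllib.parse import quote
    #   quote("斗破苍穹", encoding="gbk")  # -> the percent-encoded GBK form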
    # 2. Send the search request
    resp = requests.get(url=url_one, params={"searchkey": bookName}, headers=head, timeout=time)
    # 3. Check whether the request succeeded
    if resp.status_code == 200:
        resp.encoding = "gbk"  # the site serves GBK-encoded pages
        print(resp.text)
        # 4. Parse the content: needs 1) the data source and 2) the "html.parser" backend
        soup = BeautifulSoup(resp.text, "html.parser")
        # 4.1 Tag: accessing soup.<name> returns the first tag with that name
        title = soup.title  # the <title> tag
        print(title)
        img = soup.img  # the first <img> tag
        print(img)
        a = soup.a  # the first <a> tag
        print(a)
        # 4.2 .string / .text return a tag's text content
        print(title.string, img, a.string)
        # 4.3 Attributes: attrs is a dict of the tag's attributes; use get(key)
        # print(img.attrs)
        print(img.attrs.get("src"))
        # e.g. {'class': ['navbar-logo'], 'src': '/novel/images/navbar-logo.svg'}
        # 4.4 Searching
        # find_all(): every matching tag, returned as a list [tag, tag, ...]
        # find() (same as soup.Tag): the first matching tag
        # name: the tag name; string matches one value, a list matches several
        # (see bs4Demo() after this function for a standalone illustration)
        div_list = soup.find_all(name="div", attrs={"class": "caption"})
        for div in div_list:
            # guard: .string can be None when a tag holds nested markup
            bookname = div.h4.a.string
            bookurl = div.h4.a.attrs.get("href")
            bookauthor = div.small.string
            bookdir = div.p.string
            # "and": every one of the conditions must hold
            if bookname is not None and bookurl is not None and bookauthor is not None and bookdir is not None:
                # str.replace() returns a new string, so assign the result back
                bookname = bookname.replace(" ", "")
                bookurl = bookurl.replace(" ", "")
                bookauthor = bookauthor.replace(" ", "")
                bookdir = bookdir.replace(" ", "")
                print(bookname + "\n", bookurl + "\n", bookauthor + "\n", bookdir + "\n")
                # 5. Save title -> URL into the book dictionary
                book_dict[bookname] = bookurl
    else:
        print("Search request failed, starting over.")
        searchBook()
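
# A standalone sketch of the BeautifulSoup lookups used above (Tag access,
# .string, attrs, find()/find_all()) on an inline HTML snippet; the helper
# name and the sample markup are illustrative, not part of the original:
def bs4Demo():
    from bs4 import BeautifulSoup
    html = ('<div class="caption"><h4><a href="/book/1/">Book</a></h4>'
            '<small>Author</small><p>Intro</p></div>')
    soup = BeautifulSoup(html, "html.parser")
    div = soup.find(name="div", attrs={"class": "caption"})  # first match, a Tag
    print(div.h4.a.string)             # text content of the <a> inside the <h4>
    print(div.h4.a.attrs.get("href"))  # attribute lookup via the attrs dict
    print(soup.find_all("a"))          # find_all() returns a list of Tags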

def getBookChapter():
    bookname = input("Enter the title of a book found above: ")
    # Check that the title exists in the dictionary
    # keys() returns a view of the dictionary's keys
    if bookname in book_dict.keys():
        resp = requests.get(url=book_dict[bookname], headers=head, timeout=time)
        # Check whether the request succeeded
        if resp.status_code == 200:
            resp.encoding = "gbk"
            soup = BeautifulSoup(resp.text, "html.parser")
            title = soup.title.string  # the page title, already a string
            print(title)
            dd_list = soup.find_all(name="dd", attrs={"class": "col-md-3"})
            for dd in dd_list:
                try:
                    chapter = dd.a.attrs.get("title")
                    chapterUrl = dd.a.attrs.get("href")
                    print(chapter, chapterUrl)
                    bookUrl = book_dict[bookname]
                    getBookChapterContent(chapter, chapterUrl, bookUrl, bookname)
                except Exception:
                    continue  # skip entries without a usable <a> tag
    else:
        print("Book not found, starting over.")
        getBookChapter()
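
# On failure, searchBook() and getBookChapter() call themselves again, which
# grows the call stack on repeated bad input. A plain loop avoids that; a
# minimal sketch (the helper name is illustrative, not part of the original):
def promptUntilFound(prompt, mapping):
    while True:
        key = input(prompt)
        if key in mapping:  # dict membership works without .keys()
            return key
        print("Not found, try again.")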

def getBookChapterContent(chapter, chapterUrl, bookUrl, bookname):
    # If the chapter link is relative, join it onto the book URL
    # (see the joinChapterUrl() sketch after this function)
    if "http" not in chapterUrl:
        chapterUrl = bookUrl + chapterUrl

    resp = requests.get(url=chapterUrl, headers=head, timeout=time)  # fetch the chapter page
    if resp.status_code == 200:
        resp.encoding = "gbk"
        soup4 = BeautifulSoup(resp.text, "html.parser")  # parse the page
        # find() returns a single Tag object (or None), not a list
        div = soup4.find(name="div", attrs={"id": "htmlContent"})
        if div is None:
            return
        text = div.text
        if text is not None and text != "":  # the content must not be empty
            text = text.replace("<br/>", "\n")  # normalize any leftover line breaks
            saveTxt(text, bookname, chapter)  # save to disk
            chapter_dict[chapter] = text  # cache in the dictionary
    else:
        print(bookname + " chapter download failed!")
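
# The "http" check above handles relative links by plain string concatenation,
# which assumes the book URL ends with "/". urllib.parse.urljoin from the
# standard library resolves relative links more robustly; a minimal sketch:
def joinChapterUrl(bookUrl, chapterUrl):
    from urllib.parse import urljoin
    # e.g. urljoin("https://www.biqukan.cc/book/123/", "456.html")
    #      -> "https://www.biqukan.cc/book/123/456.html"
    return urljoin(bookUrl, chapterUrl)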

def saveTxt(text, bookname, chapter):
    path = "novel_" + bookname
    # Create the directory if it does not exist yet
    if not os.path.exists(path):
        os.makedirs(path)  # creates intermediate directories as needed
    # File handling: create/open, write, flush the buffer, close
    file = open(path + "/" + chapter + ".txt", "w", encoding="utf-8")  # "w" write, "r" read
    file.write(text)
    file.flush()  # flush the write buffer
    file.close()
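
# The open/write/flush/close sequence above can also be expressed with a
# context manager, which flushes and closes the file even if write() raises;
# a minimal sketch of the same save (the helper name is illustrative):
def saveTxtWith(text, path, chapter):
    with open(path + "/" + chapter + ".txt", "w", encoding="utf-8") as file:
        file.write(text)  # flushed and closed automatically on exit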

# A CSV file can be imported into, and exported from, a database
def saveCsv():
    headers = ["chapter", "content"]  # the single header row
    rows = []  # the data rows, a 2-D list
    file = open("test.csv", "w", encoding="utf-8", newline="")  # create the CSV file
    f_csv = csv.writer(file)  # wrap the file in a CSV writer

    # Loop over everything cached in the chapter dictionary
    for key in chapter_dict.keys():  # iterate over the keys
        text = chapter_dict[key]
        row = [key, text]  # one row: chapter name and content
        rows.append(row)  # collect into rows
        print("Saving", row)

    f_csv.writerow(headers)  # write a single row: []
    f_csv.writerows(rows)  # write multiple rows: [[], [], []]
    file.close()
    print("CSV file written")

# Python entry point
if __name__ == '__main__':
    url_one = "https://www.biqukan.cc/modules/article/search.php"
    head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
    }
    time = 15  # request timeout in seconds
    import requests  # HTTP requests
    from bs4 import BeautifulSoup  # the HTML parser
    import os  # directory handling: create, inspect, modify
    import csv  # CSV handling

    book_dict = {}  # book title -> book URL
    chapter_dict = {}  # chapter title -> content
    searchBook()
    getBookChapter()
    # saveCsv()