Python Novel Scraping (Part 2): Scraping Basic Book Information

Having scraped the chapter list and chapter contents, we now scrape the book's basic information.
Building on the previous post, we scrape the book information and store it in a dictionary.

# -*- coding: utf-8 -*-
import urllib.request
import bs4
import re
import sqlite3

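# Download the raw HTML of a URL, sending a desktop User-Agent header so the
# request looks like it comes from a normal browser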
def getHtml(url):
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers = {"User-Agent":user_agent}
    request = urllib.request.Request(url,headers=headers)
    response = urllib.request.urlopen(request)
    html = response.read()
    return html


# Fetch a page and parse it into a BeautifulSoup object
def parse(url):
    html_doc = getHtml(url)
    sp = bs4.BeautifulSoup(html_doc, 'html.parser', from_encoding="utf-8")
    return sp

# Scrape the book's basic information
def get_book_baseinfo(url):
    # Get the information from the <div class="info"> block
    info = parse(url).find('div',class_ = 'info')
    book_info = {}
    if info:
        book_info['title'] = ''
        book_info['img'] = ''
        # Title
        book_info['title'] = info.find('h2').string

        # Cover image link
        img = info.find('div',class_ = 'cover')
        for im in img.children:
            # The image src is a relative path, so prepend the site domain to get a usable URL
            book_info['img'] = 'http://www.biqukan.com' + im.attrs['src']

        # Collect the text fragments from the <div class="small"> block; the
        # field labels (作者:, 分類:, 狀態:, ...) and their values may come
        # out as separate strings
        ifo = info.find('div',class_ = 'small')
        bkinfo = []
        for b in ifo:
            for v in b.children:
                t = v.string
                if t:
                    bkinfo.append(''.join(t))

        # Re-join the fragments: a string containing ':' starts a new
        # "label:value" field, and strings without ':' are appended to the current one
        spv = []
        cv = ''
        for v in bkinfo:
            if v.find(':') >= 0:
                if cv:
                    spv.append(cv)
                cv = v
            else:
                cv += v
        spv.append(cv)

        # Turn the "label:value" strings into dictionary entries
        for element in spv:
            its = [v.strip() for v in element.split(':')]
            if len(its) != 2:
                continue
            nm = its[0].lower()  # normalize the key to lowercase
            vu = its[1]
            book_info[nm] = vu

        # The '作者' key grabbed here would be overwritten by another '作者' entry from the synopsis section below, so rename it to 'auther' now
        book_info['auther'] = book_info.pop('作者')

        # Get the synopsis block (same approach as the basic information above)
        intro = info.find('div',class_ = 'intro')
        bkurl = []
        for b in intro:
            t = b.string
            if t:
                bkurl.append(''.join(t))

        bkjj = []
        cvx = ''
        for w in bkurl:
            if w.find(':') >= 0:
                if cvx:
                    bkjj.append(cvx)
                cvx = w
            else:
                cvx += w
        bkjj.append(cvx)

        for ele in bkjj:
            itis = [n.strip() for n in ele.split(':')]
            if len(itis) != 2:
                continue
            summ = itis[0].lower()  # normalize the key to lowercase
            vux = itis[1]
            book_info[summ] = vux

    # Crude but simple: rename the Chinese keys scraped from the page to English ones, which are easier to store in the database
    book_info['type'] = book_info.pop('分類')
    book_info['status'] = book_info.pop('狀態')
    book_info['num'] = book_info.pop('字數')
    book_info['updatatime'] = book_info.pop('更新時間')
    book_info['newchapter'] = book_info.pop('最新章節')
    book_info['authsummery'] = book_info.pop('作者')
    book_info['summery'] = book_info.pop('簡介')
    book_info['notipurl'] = book_info.pop('無彈窗推薦地址')

    return book_info



# Get the book's chapter list
def get_book_dir(url):
    books_dir = []
    name = parse(url).find('div', class_='listmain')
    if name:
        dd_items = name.find('dl')
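        # The <dl> contains two <dt> headings: the first precedes the "latest
        # chapters" block and the second precedes the full chapter list, so
        # only the <dd> items after the second <dt> are collected below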
        dt_num = 0
        for n in dd_items.children:
            ename = str(n.name).strip()
            if ename == 'dt':
                dt_num += 1
            if ename != 'dd':
                continue
            Catalog_info = {}
            if dt_num == 2:
                durls = n.find_all('a')[0]
                Catalog_info['name'] = (durls.get_text())
                Catalog_info['url'] = 'http://www.biqukan.com' + durls.get('href')
                books_dir.append(Catalog_info)
    return books_dir


# Get the content of a single chapter
def get_charpter_text(curl):
    # Simply fetch the chapter page at curl and extract its text
    text = parse(curl).find('div', class_='showtxt')
    if text:
        cont = text.get_text()
        # Strip the indentation entities and full-width spaces the site inserts
        cont = [str(cont).strip().replace('\r \xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0', '').replace('\u3000\u3000', '')]
        c = " ".join(cont)
        # Keep only the text up to the first occurrence of 'html' (meant to trim the trailing link the page appends)
        ctext = re.findall(r'^.*?html', c)
        return ctext
    else:
        return ''


# Get the whole book: chapter list plus the content of each chapter
def get_book(burl):
    # Chapter list
    book = get_book_dir(burl)
    if not book:
        return book

    # Content of each chapter
    for d in book:
        curl = d['url']
        try:
            print('Fetching chapter [{}] content from [{}]'.format(d['name'],d['url']))
            ctext = get_charpter_text(curl)
            d['text'] = ctext
            print(d['text'])
            print()
        except Exception as err:
            d['text'] = 'get failed'

    return book


if __name__ == '__main__':
    # Call get_book_baseinfo here to check the result
    book = get_book_baseinfo('http://www.biqukan.com/1_1094/')
    print(book)

Output:

{'title': '一念永恆', 'img': 'http://www.biqukan.com/files/article/image/1/1094/1094s.jpg', 'auther': '耳根', 'type': '玄幻小說', 'status': '連載', 'num': '3689058', 'updatatime': '2018-02-09 18:20:00', 'newchapter': '第1314章 你的選擇(終)', 'authsummery': '耳根所寫的《一念永恆》無彈窗免費全文閱讀爲轉載作品,章節由網友發佈。', 'summery': '一念成滄海,一念化桑田。一念斬千魔,一念誅萬仙。唯我念……永恆', 'notipurl': 'http://www.biqukan.com/1_1094/?_t_t_t=0.4355400702253367'}

Next, we will store this information in a database.
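As a preview, here is a minimal sketch of how the dictionary returned by get_book_baseinfo could be saved with the sqlite3 module imported above. The table name, column layout, and the save_book_info helper are my own assumptions; the actual storage code comes in the next post.

# A minimal sketch, not the final storage code: it assumes a table named
# "book" whose TEXT columns simply mirror the keys of book_info.
import sqlite3

def save_book_info(book_info, db_path='books.db'):
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    # Create the table on first use, one TEXT column per dictionary key
    cur.execute('CREATE TABLE IF NOT EXISTS book ({})'.format(
        ', '.join('{} TEXT'.format(k) for k in book_info)))
    # Insert the values in the same order as the keys
    cols = ', '.join(book_info.keys())
    marks = ', '.join('?' * len(book_info))
    cur.execute('INSERT INTO book ({}) VALUES ({})'.format(cols, marks),
                list(book_info.values()))
    conn.commit()
    conn.close()

# Usage: save_book_info(get_book_baseinfo('http://www.biqukan.com/1_1094/'))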
