爬完數據目錄和內容後,我們來爬取書籍的基本信息。
在上篇博客的基礎上,爬取書籍信息並存入字典
# -*- coding: utf-8 -*-
import urllib.request
import bs4
import re
import sqlite3
def getHtml(url):
    """Fetch the raw HTML bytes of *url*.

    A desktop-browser User-Agent is sent because the site rejects the
    default urllib UA.

    :param url: page address to download
    :return: response body as ``bytes``
    :raises urllib.error.URLError: on network / HTTP failure
    """
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers = {"User-Agent": user_agent}
    request = urllib.request.Request(url, headers=headers)
    # Use a context manager so the connection is always closed — the
    # original leaked the response object on every call.
    with urllib.request.urlopen(request) as response:
        html = response.read()
    return html
# 爬取整個網頁
def parse(url):
    """Download *url* and return it parsed as a BeautifulSoup tree."""
    page_bytes = getHtml(url)
    return bs4.BeautifulSoup(page_bytes, 'html.parser', from_encoding="utf-8")
# 爬取書籍基本信息
def get_book_baseinfo(url):
    """Scrape a book's metadata from a biqukan.com book page.

    Returns a dict with English keys: title, img, auther, type, status,
    num, updatatime, newchapter, authsummery, summery, notipurl.

    NOTE(review): assumes the 2018-era page layout of www.biqukan.com;
    the ``pop`` calls near the end raise KeyError if any expected
    Chinese-labelled field is missing from the page — TODO confirm
    that is acceptable for the caller.
    """
    # All metadata lives inside <div class="info">.
    info = parse(url).find('div',class_ = 'info')
    book_info = {}
    if info:
        book_info['title'] = ''
        book_info['img'] = ''
        # Book title
        book_info['title'] = info.find('h2').string
        # Cover image — the src attribute is site-relative, so the host
        # must be prepended to get a fetchable URL.
        img = info.find('div',class_ = 'cover')
        for im in img.children:
            book_info['img'] = 'http://www.biqukan.com' + im.attrs['src']
        # Collect every text fragment found in the <div class="small"> box.
        ifo = info.find('div',class_ = 'small')
        bkinfo = []
        for b in ifo:
            for v in b.children:
                t = v.string
                if t:
                    bkinfo.append(''.join(t))
        # Re-join the fragments: a fragment containing a fullwidth colon
        # starts a new "label:value" entry; colon-less fragments are
        # continuations of the current entry.
        spv = []
        cv = ''
        for v in bkinfo:
            if v.find(':') >= 0:
                if cv:
                    spv.append(cv)
                cv = v
            else:
                cv += v
        spv.append(cv)
        # Convert each "label:value" string into a dict entry.
        for element in spv:
            its = [v.strip() for v in element.split(':')]
            if len(its) != 2:
                continue
            nm = its[0].lower() # normalize the label to lower case
            # Python 2 leftover — never true on Python 3, kept as-is.
            if type(nm).__name__ == 'unicode':
                nm = nm.encode('utf-8')
            vu = its[1]
            book_info[nm] = vu
        # The key '作者' (author) would collide with the author line parsed
        # from the intro block below, so rename it to 'auther' first.
        book_info['auther'] = book_info.pop('作者')
        # Introduction block — parsed exactly like the info box above.
        intro = info.find('div',class_ = 'intro')
        bkurl = []
        for b in intro:
            t = b.string
            if t:
                bkurl.append(''.join(t))
        bkjj = []
        cvx = ''
        for w in bkurl:
            if w.find(':') >= 0:
                if cvx:
                    bkjj.append(cvx)
                cvx = w
            else:
                cvx += w
        bkjj.append(cvx)
        for ele in bkjj:
            itis = [n.strip() for n in ele.split(':')]
            if len(itis) != 2:
                continue
            summ = itis[0].lower() # normalize the label to lower case
            # Python 2 leftover — never true on Python 3, kept as-is.
            if type(summ).__name__ == 'unicode':
                summ = summ.encode('utf-8')
            vux = itis[1]
            book_info[summ] = vux
        # Remap the remaining Chinese keys to English ones so they can be
        # used directly as database column names.
        book_info['type'] = book_info.pop('分類')
        book_info['status'] = book_info.pop('狀態')
        book_info['num'] = book_info.pop('字數')
        book_info['updatatime'] = book_info.pop('更新時間')
        book_info['newchapter'] = book_info.pop('最新章節')
        book_info['authsummery'] = book_info.pop('作者')
        book_info['summery'] = book_info.pop('簡介')
        book_info['notipurl'] = book_info.pop('無彈窗推薦地址')
    return book_info
# 獲取書籍目錄
def get_book_dir(url):
    """Return the book's chapter directory as a list of
    ``{'name': ..., 'url': ...}`` dicts (empty list when the listing
    div is absent)."""
    chapters = []
    listing = parse(url).find('div', class_='listmain')
    if not listing:
        return chapters
    dl = listing.find('dl')
    section = 0  # number of <dt> section headers passed so far
    for node in dl.children:
        tag = str(node.name).strip()
        if tag == 'dt':
            section += 1
        if tag != 'dd':
            continue
        # Only the second <dt> section contains the full chapter list;
        # the first holds the "latest chapters" teaser.
        if section == 2:
            anchor = node.find_all('a')[0]
            chapters.append({
                'name': anchor.get_text(),
                'url': 'http://www.biqukan.com' + anchor.get('href'),
            })
    return chapters
# 獲取章節內容
def get_charpter_text(curl):
    """Fetch one chapter page and return its cleaned body text.

    Returns the list produced by ``re.findall`` (everything up to the
    first 'html' occurrence), or ``''`` when the content div is missing.
    """
    content = parse(curl).find('div', class_='showtxt')
    if not content:
        return ''
    raw = content.get_text()
    # Strip the site's indentation padding and ideographic spaces.
    cleaned = str(raw).strip().replace('\r \xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0', '').replace('\u3000\u3000', '')
    # Keep only the text before the trailing 'html' marker (drops the
    # appended site-link boilerplate).
    return re.findall(r'^.*?html', cleaned)
# 獲取書籍
def get_book(burl):
    """Download every chapter of the book at *burl*.

    Returns the chapter directory from :func:`get_book_dir` with a
    ``'text'`` entry added to each chapter dict; a chapter that fails
    to download gets ``'get failed'`` instead of raising.
    """
    chapters = get_book_dir(burl)
    if not chapters:
        return chapters
    for chapter in chapters:
        try:
            print('正在獲取章節【{}】【內容】【{}】'.format(chapter['name'],chapter['url']))
            chapter['text'] = get_charpter_text(chapter['url'])
            print(chapter['text'])
            print()
        except Exception:
            # Best-effort: mark the failure and keep going.
            chapter['text'] = 'get failed'
    return chapters
if __name__ == '__main__':
    # Demo: call get_book_baseinfo and print the scraped metadata dict.
    book = get_book_baseinfo('http://www.biqukan.com/1_1094/')
    print(book)
結果展示:
{'title': '一念永恆', 'img': 'http://www.biqukan.com/files/article/image/1/1094/1094s.jpg', 'auther': '耳根', 'type': '玄幻小說', 'status': '連載', 'num': '3689058', 'updatatime': '2018-02-09 18:20:00', 'newchapter': '第1314章 你的選擇(終)', 'authsummery': '耳根所寫的《一念永恆》無彈窗免費全文閱讀爲轉載作品,章節由網友發佈。', 'summery': '一念成滄海,一念化桑田。一念斬千魔,一念誅萬仙。唯我念……永恆', 'notipurl': 'http://www.biqukan.com/1_1094/?_t_t_t=0.4355400702253367'}
後面我們將這些信息存儲到數據庫。