爲方便後續做前端展示,本次主要優化章節字段根據爬取順序入庫,各功能函數模塊化。
# -*- coding: utf-8 -*-
import urllib.request
import bs4
import re
import sqlite3
import time
# Open (or create) the SQLite database; the connection ``cx`` is shared by
# every insert helper defined below.
print ('連接數據庫……')
cx = sqlite3.connect('PaChong.db')
# One-time schema setup, kept commented out so reruns do not fail with
# "table already exists".
# NOTE(review): "verchar" below is a typo for varchar (SQLite accepts any
# type name, so the tables still work); column names such as "updatatime"
# (sic) are referenced by the scraping code and must not be renamed here
# alone.
# cx.execute('''CREATE TABLE book_info(
#    id INTEGER PRIMARY KEY AUTOINCREMENT,
#    title verchar(128) not null,
#    img verchar(512) null,
#    auther verchar(64) null,
#    type verchar(128) null,
#    status verchar(64) null,
#    num int null,
#    updatatime datetime null,
#    newchapter verchar(512) null,
#    authsummery verchar(1024) null,
#    summery verchar(1024) null,
#    notipurl verchar(512) null);
#    ''')
# cx.execute('''CREATE TABLE book_chapter(
#    id INTEGER PRIMARY KEY AUTOINCREMENT,
#    book_id int null ,
#    chapter_no int null ,
#    chapter_name verchar(128) null,
#    chapter_url verchar(512) null,
#    chapter_content text null);
#    ''')
print("Table created successfully")
print("數據庫連接完成")
def getHtml(url):
    """Fetch *url* and return the raw response body as bytes.

    A desktop Chrome User-Agent is sent because the target site rejects
    the default urllib agent.
    """
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers = {"User-Agent": user_agent}
    request = urllib.request.Request(url, headers=headers)
    # Close the connection deterministically instead of leaking the socket
    # until garbage collection (the original never closed the response).
    with urllib.request.urlopen(request) as response:
        return response.read()
# Fetch a whole page and hand it to BeautifulSoup.
def parse(url):
    """Return a BeautifulSoup tree for the page at *url*."""
    raw_html = getHtml(url)
    soup = bs4.BeautifulSoup(raw_html, 'html.parser', from_encoding="utf-8")
    return soup
# Scrape a book's basic metadata.
def get_book_baseinfo(url):
    """Scrape the book page at *url* and return its metadata as a dict.

    The returned keys (title, img, auther, type, status, num, updatatime,
    newchapter, authsummery, summery, notipurl) match the book_info table
    columns, so the dict can be inserted directly.  Returns an empty dict
    when the page has no <div class="info"> block.
    """
    # All metadata lives inside <div class="info">.
    info = parse(url).find('div', class_='info')
    book_info = {}   # all collected metadata
    pre_small = []   # raw text fragments from the "small" block
    aft_small = []   # fragments re-joined so each entry is one "key:value"
    pre_intro = []   # text fragments from the intro block
    if info:
        book_info['title'] = ''
        book_info['img'] = ''
        # Title
        book_info['title'] = info.find('h2').string
        # Cover image — the src is site-relative, so prepend the host.
        img = info.find('div',class_ = 'cover')
        for im in img.children:
            book_info['img'] = 'http://www.biqukan.com' + im.attrs['src']
        # Author / category / status / word count / update time / newest chapter.
        ifo = info.find('div',class_ = 'small')
        for b in ifo:
            for v in b.children:
                t = v.string
                if t:
                    pre_small.append(''.join(t))
        # Re-join the fragments: a new entry starts at each fragment that
        # contains a ':'; fragments without one belong to the previous entry.
        cv = ''
        for v in pre_small:
            if v.find(':') >= 0:
                if cv:
                    aft_small.append(cv)
                cv = v
            else:
                cv += v
        aft_small.append(cv)
        # Turn each "key:value" entry into a dict item.
        for element in aft_small:
            its = [v.strip() for v in element.split(':')]
            if len(its) != 2:
                continue
            nm = its[0].lower()  # normalise keys to lower case
            # (The original also had a Python-2 `unicode` re-encode here;
            # that branch is dead code under Python 3 and was removed.)
            vu = its[1]
            book_info[nm] = vu
        # '作者' would collide with a key produced later, so rename it now.
        book_info['auther'] = book_info.pop('作者')
        # Intro block (summary / author blurb / no-popup URL).
        intro = info.find('div',class_ = 'intro')
        for b in intro:
            t = b.string
            if t:
                pre_intro.append(''.join(t))
        ext_info = extract_book_ext_info(''.join(pre_intro))
        # Map the remaining Chinese keys to the English column names used
        # by the book_info table.
        book_info['type'] = book_info.pop('分類')
        book_info['status'] = book_info.pop('狀態')
        book_info['num'] = book_info.pop('字數')
        book_info['updatatime'] = book_info.pop('更新時間')
        book_info['newchapter'] = book_info.pop('最新章節')
        # These three come from extract_book_ext_info above.
        book_info['authsummery'] = ext_info['作者']
        book_info['summery'] = ext_info['簡介']
        book_info['notipurl'] = ext_info['無彈窗推薦地址']
        print("正在獲取書籍【{}】".format(book_info['title']))
    return book_info
# Split the intro text into summary / author blurb / no-popup URL sections.
def extract_book_ext_info(ext_text):
    """Split *ext_text* into labelled sections and return them as a dict.

    The text is expected to contain the labels 簡介: / 作者: / 無彈窗推薦地址:
    (in that order on the live site); each label's value runs until the next
    label or the end of the string.  Labels that do not occur are skipped —
    the original code let str.find return -1 for a missing label, producing
    negative slice indices that mangled the other sections.
    """
    tag = ['簡介:', '作者:', '無彈窗推薦地址:']
    # Locate each label, dropping the ones that are absent.
    pos = sorted(p for p in (ext_text.find(t) for t in tag) if p >= 0)
    ext_info = {}
    if not pos:
        return ext_info
    # Slice boundaries: leading fragment, one slice per label, then the tail.
    bounds = [0] + pos + [len(ext_text)]
    for start, end in zip(bounds, bounds[1:]):
        txt = ext_text[start:end].strip()
        if not txt:
            continue
        dim_pos = txt.find(':')
        if dim_pos < 0:
            continue
        ext_info[txt[:dim_pos].strip()] = txt[dim_pos+1:].strip()
    return ext_info
# Scrape a book's table of contents.
def get_book_dir(url):
    """Return the chapter list of the book page at *url*.

    Each entry is a dict with 'chapter_name' and an absolute 'chapter_url'.
    Only the <dd> items after the second <dt> heading are kept (the first
    <dt> section lists "latest chapters", which would be duplicates).
    """
    books_dir = []
    listmain = parse(url).find('div', class_='listmain')
    if listmain:
        heading_count = 0
        for child in listmain.find('dl').children:
            tag_name = str(child.name).strip()
            if tag_name == 'dt':
                heading_count += 1
                continue
            if tag_name != 'dd' or heading_count != 2:
                continue
            anchor = child.find_all('a')[0]
            books_dir.append({
                'chapter_name': anchor.get_text(),
                'chapter_url': 'http://www.biqukan.com' + anchor.get('href'),
            })
    # print(books_dir)
    return books_dir
# Scrape the body text of one chapter page.
def get_charpter_text(curl):
    """Return the cleaned chapter text at *curl*, or '' when none is found."""
    content_div = parse(curl).find('div', class_='showtxt')
    if not content_div:
        return ''
    raw = content_div.get_text()
    # Strip the site's indentation runs (NBSP and ideographic spaces).
    cleaned = str(raw).strip().replace('\r \xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0', '').replace('\u3000\u3000', '')
    # Keep everything up to (and including) the first 'html' marker, which
    # trims the boilerplate the site appends after the chapter body.
    return ' '.join(re.findall(r'^.*?html', cleaned))
# Database helper: execute one statement and commit.
def SqlExec(conn, sql):
    """Execute *sql* on *conn*, commit, and return the cursor.

    Returns None when execution fails, so callers can use the result as a
    success flag; the failing statement is printed for diagnosis.
    """
    try:
        cur = conn.cursor()
        cur.execute(sql)
        conn.commit()
    except Exception as e:
        print('exec sql error[%s]' % sql)
        # Bug fix: the original `print(Exception, e)` printed the Exception
        # *class object* rather than the raised error's actual type.
        print(type(e).__name__, e)
        cur = None
    return cur
# Download every chapter of one book.
def get_book(burl):
    """Fetch the chapter list at *burl* and download each chapter's text.

    Returns the chapter list with 'chapter_no' (crawl order, as a string)
    and 'chapter_content' added to every entry; chapters that fail to
    download get 'get failed' as their content.  Returns the empty list
    when the table of contents itself cannot be fetched.
    """
    chapters = get_book_dir(burl)
    if not chapters:
        print('獲取數據目錄失敗:', burl)
        return chapters
    for index, chapter in enumerate(chapters):
        chapter_url = chapter['chapter_url']
        # Record the crawl order so the front end can sort after insertion.
        chapter['chapter_no'] = str(index)
        try:
            print('正在獲取....【{}】'.format(chapter['chapter_name']))
            chapter['chapter_content'] = get_charpter_text(chapter_url)
        except Exception:
            chapter['chapter_content'] = 'get failed'
    return chapters
# Insert one chapter row into book_chapter.
def insert_chapter(dic):
    """Insert the chapter dict *dic* into the book_chapter table.

    Column names are taken from the dict keys.  Returns *dic* unchanged
    when it is falsy so the caller can detect the failure.
    """
    if not dic:
        print("獲取基本信息失敗")
        return dic
    # Bug fix: escape embedded single quotes by doubling them (SQL standard);
    # chapter text routinely contains them and the original concatenation
    # then produced invalid SQL.
    values = [str(v).replace("'", "''") for v in dic.values()]
    sql = 'insert into book_chapter(' + ','.join(dic.keys()) + ') '
    sql += "values('" + "','".join(values) + "');"
    # Hand off to the shared DB helper.
    if SqlExec(cx, sql):
        print('正在插入...【{}】'.format(dic['chapter_name']))
    else:
        print(sql)
# Fetch one book's metadata and insert it into book_info.
def insert_baseinfo(burl):
    """Scrape the book page at *burl* and store its metadata in book_info.

    Returns the (falsy) metadata dict when scraping failed so the caller
    can detect the failure.
    """
    baseinfo = get_book_baseinfo(burl)
    if not baseinfo:
        print("獲取基本信息失敗")
        return baseinfo
    # Bug fix: escape embedded single quotes by doubling them (SQL standard);
    # summaries often contain them and the original concatenation then
    # produced invalid SQL.
    values = [str(v).replace("'", "''") for v in baseinfo.values()]
    sql = 'insert into book_info(' + ','.join(baseinfo.keys()) + ')'
    sql += " values('" + "','".join(values) + "');"
    # Hand off to the shared DB helper.
    if SqlExec(cx, sql):
        print('正在插入...書籍【{}】的基本信息'.format(baseinfo['title']))
    else:
        print(sql)
if __name__ == '__main__':
    url = 'http://www.biqukan.com/1_1093/'
    insert_baseinfo(url)   # scrape the book's metadata and store it
    books = get_book(url)  # fetch the chapter list and every chapter's text
    for i in books:        # insert each chapter row
        insert_chapter(i)
結果顯示:
同樣,要獲取更多書籍信息,可以加個循環
# Alternative entry point: crawl a range of book ids instead of one book.
if __name__ == '__main__':
    for i in range(1090,1100):
        url = 'http://www.biqukan.com/1_' + str(i) + '/'
        insert_baseinfo(url)      # scrape the book's metadata and store it
        books = get_book(url)     # fetch the chapter list and chapter text
        for per_ch in books:      # insert each chapter row
            insert_chapter(per_ch)
數據庫展示: