愛看小說的程序猿必備!
首先我們需要準備以下環境和庫(其中 queue、threading 是 Python 標準庫,無需另行安裝):
python3.5
requests
pymysql
lxml
queue
threading
import requests
from lxml import etree
import pymysql
def insertdb():
    """Scrape the site's novel index page and store every title/URL in MySQL.

    Downloads the index page, reads each ``<li>`` under
    ``div[@class='novellist']`` and inserts one ``(story_name, story_url)``
    row per entry into the ``story_list`` table. Each row is committed on its
    own; a row the database rejects is rolled back and skipped so the
    remaining rows are still stored. The connection is closed even when an
    unexpected error occurs.
    """
    index_url = "http://www.xbiquge.la/xiaoshuodaquan/"
    # A timeout keeps the script from hanging forever on a stalled server.
    req = requests.get(index_url, timeout=30)
    html = etree.HTML(req.content.decode("utf8", "ignore"))
    items = html.xpath("//div[@class='novellist']/ul/li")

    db = pymysql.connect(host='localhost', port=3306, user='root', password='123456', db='biquge', charset='utf8')
    insert_sql = "insert into story_list(story_name,story_url) values(%s, %s)"
    try:
        cursor = db.cursor()
        for item in items:
            name = "".join(item.xpath("./a/text()"))
            story_url = "".join(item.xpath("./a/@href"))
            try:
                cursor.execute(insert_sql, (name, story_url))
                db.commit()
            except pymysql.MySQLError:
                # Best effort: drop only the offending row and keep going.
                db.rollback()
    finally:
        db.close()
def main():
    """Script entry point: fill the story_list table from the site index."""
    insertdb()


if __name__ == '__main__':
    # The original called spider(), which is not defined in this script and
    # raised NameError; main() is the intended entry point.
    main()
- 下載小說
通過上一步,我們已經有了筆趣閣所有小說的 url 地址。接下來要做的就是在數據庫中查詢指定的小說,若存在則下載。
爲了加快小說的下載速度我們採取多線程訪問的方法。
import pymysql
import requests
from lxml import etree
from queue import Queue
import threading
# Module-level stop flag for the download() worker threads: they keep looping
# while it is False, and main() sets it to True once the chapter queue is empty.
q =False
def pyurl(name):
    """Look up the stored index URL of the novel titled *name*.

    Args:
        name: Exact novel title to search for in ``story_list.story_name``.

    Returns:
        ``(url, name)`` when a matching row exists, otherwise ``("", "")``
        after printing a message. When several rows match, the first one
        returned by the database is used.
    """
    db = pymysql.connect(host='localhost', port=3306, user='root', password='123456', db='biquge', charset='utf8')
    try:
        cursor = db.cursor()
        # Parameterised query: the title comes straight from user input, so it
        # must never be spliced into the SQL text (injection / broken quotes).
        cursor.execute("select story_url from story_list where story_name = %s", (name,))
        row = cursor.fetchone()
    except pymysql.MySQLError as exc:
        # A database failure is reported as such, not as "novel not found".
        print("查詢數據庫出錯: " + str(exc))
        return "", ""
    finally:
        db.close()

    if row is None or row[0] is None:
        print("暫無此小說,請等待")
        return "", ""
    return row[0], name
def spider(url, page_queue):
    """Collect the chapter links of one novel and put them on *page_queue*.

    Args:
        url: Address of the novel's table-of-contents page.
        page_queue: Queue that receives the ``href`` of every chapter link
            found under ``div[@id ='list']``, in page order.
    """
    # A timeout keeps the caller from hanging forever on a stalled server.
    req = requests.get(url, timeout=30)
    html = etree.HTML(req.content.decode("utf8", "ignore"))
    for entry in html.xpath("//div[@id ='list']/dl//dd"):
        title_url = "".join(entry.xpath("./a/@href"))
        # A <dd> without a link yields "", which would make a worker fetch
        # the site root instead of a chapter; skip those entries.
        if title_url:
            page_queue.put(title_url)
def download(page_queue, file, lock):
    """Worker loop: download queued chapters and append them to *file*.

    Runs until the module-level flag ``q`` becomes true. Each chapter URL
    taken from the queue is fetched, its title and text are extracted, and
    both are written to the shared file while holding *lock*.

    NOTE: chapters are written in the order the downloads finish, which with
    several workers is not necessarily table-of-contents order.

    Args:
        page_queue: Queue of chapter paths relative to the site root.
        file: Open, writable text file shared by all workers.
        lock: Lock serialising writes to *file*.
    """
    from queue import Empty

    while not q:
        try:
            # A blocking get with a short timeout avoids spinning at full CPU
            # on an empty queue while still re-checking the stop flag.
            url = page_queue.get(timeout=0.1)
        except Empty:
            continue

        try:
            req = requests.get("http://www.xbiquge.la" + url, timeout=30)
            html = etree.HTML(req.content.decode("utf8", "ignore"))
        except (requests.RequestException, ValueError) as exc:
            # Report the failed chapter instead of silently dropping it.
            print("章節下載失敗: " + url + " (" + str(exc) + ")")
            continue
        if html is None:
            # etree.HTML returns None when the response has no parsable markup.
            print("章節下載失敗: " + url)
            continue

        title = "".join(html.xpath("//div[@class='bookname']/h1/text()"))
        lists = html.xpath("//div[@id='content']/text()")
        # Presumably the replaced character was a non-breaking space (the page
        # indents paragraphs with &nbsp;); written as an explicit escape so it
        # survives editors and copy-paste. TODO confirm against the page.
        novel = "\n".join(lists).replace("\xa0", " ")

        # The lock keeps one chapter's title and text contiguous in the file.
        with lock:
            file.write(title)
            file.write("\n")
            file.write(novel)
            file.write("\n")
        print("章節"+title+"下載完畢")
def main():
    """Ask for a novel title and download all of its chapters to ``<title>.txt``.

    The title is looked up with pyurl(); when found, spider() fills a queue
    with the chapter links and eight download() threads drain it, appending
    to the output file. Nothing is downloaded when the lookup returns an
    empty URL.
    """
    import time

    global q

    print("請輸入你要查詢的小說名字:")
    x = input()
    url, name = pyurl(x)
    if url == "":
        return

    page_queue = Queue()
    # Fetch the chapter list before opening the output file, so a failure
    # here does not leave an empty file behind.
    spider(url, page_queue)
    lock = threading.Lock()
    q = False  # reset the stop flag in case main() is called more than once

    with open(x + ".txt", "a", encoding="utf8") as file:
        print("開始下載小說: "+name)
        threadlist = []
        try:
            # Eight workers, as intended; range(1, 8) only started seven.
            for _ in range(8):
                t = threading.Thread(target=download, args=(page_queue, file, lock))
                t.start()
                threadlist.append(t)
            # Wait for the queue to drain without burning a CPU core.
            while not page_queue.empty():
                time.sleep(0.1)
        finally:
            # Always stop and join the workers (also on Ctrl-C) before the
            # file is closed, so no thread writes to a closed file.
            q = True
            for thread in threadlist:
                thread.join()
    print("小說: "+name+" 下載完畢")


if __name__ == '__main__':
    main()
注意:上方兩段代碼要分別保存在不同的 .py 文件中運行。
歡迎交流學習!如有錯誤請大方指出,不要吝嗇你的喜歡!