爱看小说的程序猿必备!
首先我们需要准备 Python 3.5 运行环境,并安装以下库(其中 queue 和 threading 属于 Python 标准库,无需单独安装):
python3.5
requests
pymysql
lxml
queue
threading
import requests
from lxml import etree
import pymysql
def insertdb():
    """Scrape the xbiquge.la novel index and store each title/URL pair in MySQL.

    Fetches the "all novels" page, reads every ``<li>`` under the
    ``novellist`` div, and inserts one ``(story_name, story_url)`` row per
    entry into the ``story_list`` table of the local ``biquge`` database.
    Every row is committed individually, so a row that fails to insert is
    rolled back and reported without affecting the rows around it.

    Raises:
        requests.RequestException: if the index page cannot be fetched.
        pymysql.MySQLError: if the database connection cannot be opened.
    """
    index_url = "http://www.xbiquge.la/xiaoshuodaquan/"
    # A timeout keeps the script from hanging forever on a stalled server.
    resp = requests.get(index_url, timeout=30)
    html = etree.HTML(resp.content.decode("utf8", "ignore"))
    items = html.xpath("//div[@class='novellist']/ul/li")

    db = pymysql.connect(host='localhost', port=3306, user='root', password='123456', db='biquge', charset='utf8')
    try:
        cursor = db.cursor()
        insert_sql = "insert into story_list(story_name,story_url) values(%s, %s)"
        for item in items:
            name = "".join(item.xpath("./a/text()"))
            story_url = "".join(item.xpath("./a/@href"))
            # An entry without a link cannot be downloaded later; skip it.
            if not story_url:
                continue
            try:
                cursor.execute(insert_sql, (name, story_url))
                db.commit()
            except pymysql.MySQLError as exc:
                # Best effort: undo this row, report it, carry on with the rest.
                db.rollback()
                print("插入失败: " + name + " (" + str(exc) + ")")
    finally:
        # Release the connection even if the loop is interrupted.
        db.close()
def main():
    """Script entry point: fill the novel index table by calling ``insertdb``."""
    insertdb()
# Populate the database only when this file is executed as a script.
# This script defines no ``spider`` function; ``main`` is its entry point.
if __name__ == '__main__':
    main()
- 下载小说
通过上一步,我们已经把笔趣阁所有小说的 url 地址存入了数据库。接下来要做的就是查询指定的小说,若数据库中存在则下载。
为了加快小说的下载速度我们采取多线程访问的方法。
import pymysql
import requests
from lxml import etree
from queue import Queue
import threading
# Stop flag shared with the download workers: they loop while it is False,
# and main() sets it to True once the chapter queue has been drained.
q = False
def pyurl(name):
    """Look up a novel's index URL in the ``story_list`` table.

    Args:
        name: Exact novel title to search for.

    Returns:
        A ``(url, name)`` tuple. When no usable row is found, or the query
        fails, a notice is printed and both elements are empty strings.

    Raises:
        pymysql.MySQLError: if the database connection cannot be opened.
    """
    db = pymysql.connect(host='localhost', port=3306, user='root', password='123456', db='biquge', charset='utf8')
    url = ""
    try:
        cursor = db.cursor()
        # The title comes straight from user input, so it is passed as a
        # query parameter and never interpolated into the SQL text.
        cursor.execute("select story_url from story_list where story_name = %s", (name,))
        row = cursor.fetchone()
        if row and row[0]:
            url = row[0]
    except pymysql.MySQLError as exc:
        # Treated like "not found" below, but the cause is reported.
        print("数据库查询失败: " + str(exc))
    finally:
        db.close()

    if not url:
        name = ""
        print("暂无此小说,请等待")
    return url, name
def spider(url, page_queue):
    """Collect every chapter link from a novel's index page into a queue.

    Args:
        url: Address of the novel's chapter-list page.
        page_queue: Queue that receives one chapter ``href`` per ``<dd>``
            entry, in the order the entries appear on the page.

    Raises:
        requests.RequestException: if the index page cannot be fetched.
    """
    # A timeout keeps the caller from hanging forever on a stalled server.
    resp = requests.get(url, timeout=30)
    html = etree.HTML(resp.content.decode("utf8", "ignore"))
    for entry in html.xpath("//div[@id ='list']/dl//dd"):
        chapter_href = "".join(entry.xpath("./a/@href"))
        # A <dd> without a link would enqueue "", which the downloader would
        # turn into a request for the site's home page; skip it.
        if chapter_href:
            page_queue.put(chapter_href)
def download(page_queue, file, lock):
    """Worker loop: fetch queued chapters and append them to the output file.

    Keeps running until the module-level flag ``q`` becomes true. A chapter
    that fails to download or parse is reported and skipped so that one bad
    page does not stop the worker.

    NOTE(review): each worker writes as soon as its own chapter is ready, so
    with several workers the chapters may land in the file out of order.

    Args:
        page_queue: Queue of site-relative chapter paths.
        file: Writable text file shared by all workers.
        lock: Lock that serialises writes to ``file``.
    """
    from queue import Empty

    while not q:
        try:
            # Wait briefly instead of polling non-stop, so an empty queue
            # does not pin a CPU core while the stop flag is still unset.
            path = page_queue.get(timeout=0.1)
        except Empty:
            continue

        try:
            resp = requests.get("http://www.xbiquge.la" + path, timeout=30)
            html = etree.HTML(resp.content.decode("utf8", "ignore"))
            title = "".join(html.xpath("//div[@class='bookname']/h1/text()"))
            paragraphs = html.xpath("//div[@id='content']/text()")
            # Normalise non-breaking spaces to plain spaces; the escape makes
            # the otherwise invisible character explicit.
            novel = "\n".join(paragraphs).replace("\xa0", " ")
        except Exception as exc:
            # Worker boundary: report the failed chapter and keep going.
            print("章节下载失败: " + path + " (" + str(exc) + ")")
            continue

        print("章节"+title+"下载完毕")
        # Hold the lock for the whole chapter so that output from different
        # workers is never interleaved.
        with lock:
            file.write(title + "\n" + novel + "\n")
def main():
    """Prompt for a novel title, look it up, and download it with worker threads.

    Reads the title from standard input, resolves its index URL via
    ``pyurl``, queues every chapter link with ``spider``, and then runs
    eight ``download`` workers that append the chapters to ``<title>.txt``.
    Returns without downloading when the title is not found.
    """
    import time

    global q

    print("请输入你要查询的小说名字:")
    x = input()
    url, name = pyurl(x)
    if not url:
        return

    page_queue = Queue()
    # Collect the chapter list before creating the output file, so a failed
    # fetch does not leave an empty file behind.
    spider(url, page_queue)
    lock = threading.Lock()
    q = False  # reset so main() can be called more than once per process

    with open(x + ".txt", "a", encoding="utf8") as file:
        print("开始下载小说: "+name)
        threadlist = []
        # Start 8 worker threads that all run the download loop.
        for _ in range(8):
            t = threading.Thread(target=download, args=(page_queue, file, lock))
            t.start()
            threadlist.append(t)
        try:
            # Sleep between checks instead of spinning while the queue drains.
            while not page_queue.empty():
                time.sleep(0.1)
        finally:
            # Tell the workers to stop and wait for their in-flight chapters
            # before the file is closed, even if the wait was interrupted.
            q = True
            for thread in threadlist:
                thread.join()
    print("小说: "+name+" 下载完毕")
# Start the interactive downloader only when this file is run as a script.
if __name__ == "__main__":
    main()
注意:上方两段代码需要分别保存在不同的 .py 文件中运行。
欢迎交流学习!如有错误请大方指出,不要吝啬你的喜欢!