# Scrape a novel from Quanshuwang (quanshuwang.com) with Python
#-*- coding:UTF-8 -*- #編碼設置
#全書網獲取小說
from urllib2 import urlopen
import re
# Table-of-contents page of the novel to download.
first_url = 'http://www.quanshuwang.com/book/9/9055'

# The pages are decoded as GBK; the original script notes that skipping
# this step produces garbled text.
PAGE_ENCODING = 'gbk'


def fetch_html(url, encoding=PAGE_ENCODING):
    """Download *url* and return its body decoded to text.

    The response object is closed even when reading or decoding fails,
    so a long chapter list does not accumulate open connections.
    """
    response = urlopen(url)
    try:
        return response.read().decode(encoding)
    finally:
        response.close()


def _first_group(pattern, text, flags=0):
    """Return group 1 of the first match of *pattern* in *text*, or None."""
    match = re.search(pattern, text, flags)
    return match.group(1) if match else None


def parse_novel_info(html):
    """Extract the novel's title and author from the contents page.

    Returns a dict with the keys 'title' and 'author'. A field whose
    markup is not found is set to None instead of raising IndexError.
    """
    return {
        'title': _first_group(
            r'<div class="chapName">.*?<strong>(.*?)</strong>', html),
        'author': _first_group(
            r'<div class="chapName"><span class="r">(.*?)</span><strong>',
            html),
    }


def extract_chapter_links(html):
    """Return the href of every chapter link, in page order.

    Only anchors inside the chapter-directory div are considered. An
    empty list is returned when that div is missing, and anchors without
    a non-empty href attribute are skipped.
    """
    # The div tag is matched case-insensitively and across newlines.
    chapter_div = _first_group(
        r'<DIV class="clearfix dirconone">(.*?)</div>', html, re.S | re.I)
    if chapter_div is None:
        return []

    links = []
    for anchor in re.findall(r'<a.*?</a>', chapter_div):
        href = _first_group(r'href="(.*?)"', anchor)
        if href:
            links.append(href)
    return links


if __name__ == '__main__':
    html = fetch_html(first_url)
    # Title and author of the novel.
    novel_info = parse_novel_info(html)
    for href in extract_chapter_links(html):
        content = fetch_html(href)
        # Parenthesised single-argument form prints the same on Python 2 and 3.
        print(content)
        # TODO: extract the chapter body with a regex as above, strip
        # newlines, spaces and similar characters, then save it.