最近迷上一拳超人,在xx動漫上看漫畫時總是遇到各種網絡問題,索性直接爬下來
源碼如下
import os
import re
import urllib
import urllib.request  # required: saveInfo calls urllib.request.Request/urlopen

import requests
import bs4
from bs4 import BeautifulSoup
# Shared HTTP headers for every request: a session cookie for the image host
# (picHost) plus a desktop Chrome User-Agent, used by both the requests-based
# page fetches (getHTML) and the urllib-based image downloads (saveInfo).
headers = {
"cookie": "picHost=p17.xiaoshidi.net; Hm_lvt_cb51090e9c10cda176f81a7fa92c3dfc=1545054252,1545054291,1545054381,1545054404; Hm_lpvt_cb51090e9c10cda176f81a7fa92c3dfc=1545054475",
'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'
}
def getHTML(url):
    """GET ``url`` with the shared headers.

    Returns the ``requests.Response`` (encoding fixed up from the body) on
    success, or the sentinel ``0`` on any request failure — callers test
    ``html == 0`` to detect a missing page.
    """
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r
    except requests.exceptions.RequestException:
        # Broadened from HTTPError: previously a timeout or connection error
        # escaped this handler and bypassed the `return 0` error contract.
        return 0
def fillNeedInfo(url, html):
    """Extract the image path from the page's ``var mhurl="..."`` script variable.

    ``url`` is accepted for call-site compatibility but is not used.
    Raises IndexError when the page carries no mhurl (caller catches it).
    """
    matches = re.findall('var mhurl="(.*?)"', html.text)
    return matches[0]
def saveInfo(picUrl, picPath, chapter, page):
    """Download one page image and save it as ``<picPath>/<page>.<ext>``.

    Only jpg/png URLs are handled (anything else is silently skipped, as
    before). Download errors are printed, not raised, so one bad page does
    not abort the chapter.
    """
    ext = picUrl.split('.')[-1]
    if ext not in ('jpg', 'png'):
        return  # same silent skip as the original two-branch version
    try:
        req = urllib.request.Request(picUrl, headers=headers)
        # The png branch previously had no timeout and could hang forever;
        # both formats now share the 300 s limit.
        data = urllib.request.urlopen(req, timeout=300).read()
        # `with` closes the file; the original's explicit f.close() inside
        # the with-block was redundant.
        with open(picPath + '/' + str(page) + '.' + ext, 'wb') as f:
            f.write(data)
        print('第' + str(chapter) + '章第' + str(page) + '頁爬取成功')
    except Exception as e:
        print(str(e))
def updataUrl(url, chapter, page):
    """Return the page URL ``<url><chapter>/index_<page>.html``.

    The URL is also printed, which doubles as crawl progress output.
    """
    pageUrl = f'{url}{chapter}/index_{page}.html'
    print(pageUrl)
    return pageUrl
def getChapterNum(url):
    """Fetch the comic index page and return ``(chapter_id, title)`` tuples.

    Returns an empty list when the index page cannot be fetched — the
    original dereferenced ``.text`` on getHTML's ``0`` failure sentinel and
    crashed with AttributeError.
    """
    html = getHTML(url)
    if html == 0:
        return []
    chapterNumList = re.findall('a href="(.*?)/" title="(.*?)"', html.text)
    if chapterNumList:
        # First matched link on the index page is not a chapter; drop it.
        # Guarded: pop(0) on an empty list raised IndexError.
        chapterNumList.pop(0)
    return chapterNumList
# Module-level collector of exceptions hit during the crawl; dumped to
# stdout after main() finishes so one bad page never aborts the run.
exceptionList = []
def reptileMain(url):
    """Crawl every chapter listed at ``url``, saving images under image/<title>/.

    Per page: build the page URL, fetch it, extract the image path, download
    the image. A ``0`` from getHTML (404) marks the end of a chapter. Errors
    are appended to the module-level ``exceptionList`` instead of aborting.
    """
    leftPictureUrl = "http://p0.xiaoshidi.net/"  # image host prefix
    # exist_ok replaces the original bare `except: pass` around os.mkdir,
    # which would also have hidden unrelated OS errors.
    os.makedirs('image', exist_ok=True)
    chapterNumList = getChapterNum(url)  # [(chapter_id, title), ...]
    star = 1  # width of the crude per-chapter progress bar
    for chapterNum in chapterNumList:
        picPath = 'image/' + str(chapterNum[1])  # per-chapter directory
        # `page` increments unconditionally each iteration, matching the
        # original separate counter; 500 caps the pages per chapter.
        for page in range(0, 500):
            try:
                html = getHTML(updataUrl(url, chapterNum[0], page))
                if html == 0:
                    break  # 404 -> past the last page of this chapter
                pictureUrl = leftPictureUrl + fillNeedInfo(url, html)
                # Created lazily (first successful page) as before.
                os.makedirs(picPath, exist_ok=True)
                saveInfo(pictureUrl, picPath, chapterNum[0], page)
            except Exception as e:
                exceptionList.append(e)  # record and keep crawling
        # Progress bar: a growing row of stars, cycling 1, 4, 7, ... <= 35.
        for _ in range(star):
            print('*', end='')
        print(' ')
        star += 3
        if star > 35:
            star = 1
def main():
    """Entry point: prompt for the comic index URL, then crawl it."""
    url = input("請輸入風之動漫漫畫目錄網址:")
    print('開始爬取,爬取文件將新建文件目錄image,如果已經存在,請注意文件存放')
    reptileMain(url)
    print('爬取成功')
# Script tail: run the crawl, then dump any errors collected along the way.
# The __main__ guard is new — behavior is unchanged when run as a script,
# but importing this module no longer starts a crawl.
if __name__ == "__main__":
    main()
    print('程序出現以下錯誤:')
    for value in exceptionList:
        print(value)
    # Keep the console window open until the user presses Enter.
    over = input("程序運行結束,請敲回車結束或之間關閉")
水多輕噴
類庫安裝完成後直接打開文件,輸入需要爬取的漫畫目錄頁即可,如要爬取一拳超人:
https://www.fzdm.com/manhua/132/
將此頁面複製進去回車就行,注意132後面一定要有/
已成功爬取一拳超人和進擊的巨人全篇漫畫,不會使用程序需要漫畫的可以關注公衆號:九藝雜貨鋪