Scraping manga from 風之動漫 with a Python 3 crawler

I've recently been hooked on One-Punch Man, but reading it on xx動漫 I kept running into all sorts of network problems, so I decided to just scrape it down.

The source code is as follows:

import os
import re
import urllib.request

import requests

# Headers copied from a browser session; the cookie and User-Agent make the
# requests look like a normal browser so the site serves pages and images.
headers = {
    "cookie": "picHost=p17.xiaoshidi.net; Hm_lvt_cb51090e9c10cda176f81a7fa92c3dfc=1545054252,1545054291,1545054381,1545054404; Hm_lpvt_cb51090e9c10cda176f81a7fa92c3dfc=1545054475",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36"
}

def getHTML(url):
    # Fetch a page with the shared headers; return None on an HTTP error
    # (a 404 here means we have walked past the last page of a chapter).
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r
    except requests.exceptions.HTTPError:
        return None

def fillNeedInfo(url, html):
    # Extract the image path from the `var mhurl="..."` JavaScript variable
    # embedded in the reader page.
    text = html.text
    needText = re.findall('var mhurl="(.*?)"', text)
    return needText[0]

def saveInfo(picUrl, picPath, chapter, page):
    # Download one page image and save it under the chapter directory,
    # keeping the original extension (the site serves both jpg and png).
    picClass = picUrl.split('.')[-1]
    if picClass in ('jpg', 'png'):
        try:
            req = urllib.request.Request(picUrl, headers=headers)
            data = urllib.request.urlopen(req, timeout=300).read()
            with open(picPath + '/' + str(page) + '.' + picClass, 'wb') as f:
                f.write(data)
            print('Chapter ' + str(chapter) + ', page ' + str(page) + ' saved')
        except Exception as e:
            print(str(e))

def updateUrl(url, chapter, page):
    # Build the per-page URL: <catalog>/<chapter>/index_<page>.html,
    # e.g. https://www.fzdm.com/manhua/132/<chapter>/index_0.html
    url = url + str(chapter) + '/index_' + str(page) + '.html'
    print(url)
    return url

def getChapterNum(url):
    # Collect (chapter-id, title) pairs from the catalog page; the first
    # match is not a chapter link, so drop it.
    text = getHTML(url).text
    chapterNumList = re.findall('a href="(.*?)/" title="(.*?)"', text)
    chapterNumList.pop(0)
    return chapterNumList

exceptionList = []      # errors collected during the crawl, reported on exit

def reptileMain(url):
    leftPictureUrl = "http://p0.xiaoshidi.net/"     # prefix of the image host

    os.makedirs('image', exist_ok=True)     # top-level download directory

    chapterNumList = getChapterNum(url)     # list of (chapter-id, title) pairs

    star = 1        # width of the crude '*' progress bar
    for chapterNum in chapterNumList:
        page = 0        # page counter within the chapter
        picPath = 'image/' + str(chapterNum[1])     # per-chapter directory
        os.makedirs(picPath, exist_ok=True)

        for _ in range(500):        # a chapter has at most 500 pages
            try:
                html = getHTML(updateUrl(url, chapterNum[0], page))
                if html is None:    # 404: we are past the chapter's last page
                    break

                pictureUrl = leftPictureUrl + fillNeedInfo(url, html)
                saveInfo(pictureUrl, picPath, chapterNum[0], page)
            except Exception as e:
                exceptionList.append(e)     # record the error and keep going

            page += 1
            print('*' * star)       # crude progress indicator
            star += 3
            if star > 35:
                star = 1

def main():
    url = input("Enter the catalog URL of a manga on 風之動漫: ")
    print('Starting the crawl. Files go into a new image directory; if one already exists, watch out for mixed-in files.')
    reptileMain(url)
    print('Crawl finished')

main()

if exceptionList:
    print('The program hit the following errors:')
    for value in exceptionList:
        print(value)

over = input("Done. Press Enter to exit, or just close the window.")
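
For reference, everything the program downloads lands under a single image directory, one sub-directory per chapter (named with the chapter title taken from the catalog page), with pages numbered from 0:

image/
    <chapter title>/
        0.jpg
        1.jpg
        ...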

It's rough around the edges, so go easy on me.

Once the libraries are installed, just run the file and enter the catalog-page URL of the manga you want to scrape. For example, to get One-Punch Man:
Usage example

https://www.fzdm.com/manhua/132/
Paste that URL in at the prompt and press Enter; note that the trailing / after 132 is required.
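
If you want to confirm that the site still matches the two regexes the scraper relies on before kicking off a full run, a minimal sanity check along these lines will do (this assumes fzdm.com still serves the same markup as when this post was written; if either print comes out empty, the patterns need updating):

import re
import requests

headers = {'User-Agent': 'Mozilla/5.0'}

# Catalog page: should yield (chapter-id, title) pairs; index 0 is not a chapter.
toc = requests.get('https://www.fzdm.com/manhua/132/', headers=headers, timeout=30)
toc.encoding = toc.apparent_encoding
chapters = re.findall('a href="(.*?)/" title="(.*?)"', toc.text)
print(chapters[1:4])

# First page of one chapter: should yield a single image path for mhurl.
first = requests.get('https://www.fzdm.com/manhua/132/' + chapters[1][0] + '/index_0.html',
                     headers=headers, timeout=30)
print(re.findall('var mhurl="(.*?)"', first.text))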

I have successfully scraped all of One-Punch Man and Attack on Titan with this. If you can't get the program running but want the manga, you can follow the WeChat official account 九藝雜貨鋪.
