Web Scraping (Part 3): Downloading Manga

1. Related URLs and Libraries

Reference article: "Manga downloads: dynamic loading and anti-scraping are no big deal!"
Target site:

https://www.dmzj.com/

Required libraries:
requests, beautifulsoup4, tqdm (plus lxml, since the code below uses the lxml parser)
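
Assuming a standard Python 3 environment, they can be installed with pip (beautifulsoup4 is the package that provides the bs4 module):

pip install requests beautifulsoup4 lxml tqdm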

2. Code Implementation

import requests
import re
from bs4 import BeautifulSoup
from contextlib import closing
from tqdm import tqdm
import os
import time


class MangaSpider(object):
    def __init__(self):
        self.save_dir = '妖神記'  # output directory, named after the manga (Yao Shen Ji)
        if not os.path.exists(self.save_dir):
            os.mkdir(self.save_dir)

        self.target_url = 'https://www.dmzj.com/info/yaoshenji.html'

        self.chapter_list = []

    # 1. Send a request and return the decoded page content
    def send_request(self, url):
        response = requests.get(url)
        data = response.content.decode('utf-8')

        return data

    # 2. Parse the chapter list
    def parse_list_data(self, data):
        bs = BeautifulSoup(data, 'lxml')
        list_con_li = bs.find('ul', attrs={'class': 'list_con_li'})
        manga_list = list_con_li.find_all('a')

        for manga in manga_list:
            chapter_dict = {}
            chapter_dict['chapter'] = manga.text
            chapter_dict['url'] = manga.get('href')
            self.chapter_list.append(chapter_dict)

    def parse_pic_data(self, pics_data):
        bs_pics = BeautifulSoup(pics_data, 'lxml')
        script_info = bs_pics.script  # the image list is embedded in a <script> tag
        # image filenames are 13- or 14-digit numbers inside the script
        pics = re.findall(r'\d{13,14}', str(script_info))
        for j, pic in enumerate(pics):
            if len(pic) == 13:
                # pad 13-digit IDs with a '0' so they sort correctly among the 14-digit ones
                pics[j] = pic + '0'
        pics = sorted(pics, key=lambda x: int(x))

        # the two directory segments of the image URL are '|'-delimited in the script
        chapterpic_hou = re.findall(r'\|(\d{5})\|', str(script_info))[0]
        chapterpic_qian = re.findall(r'\|(\d{4})\|', str(script_info))[0]

        pics_url_sorted = []
        for pic in pics:
            if pic[-1] == '0':
                # the trailing '0' was only added for sorting; strip it to get the real filename
                url = 'https://images.dmzj.com/img/chapterpic/' + chapterpic_qian + '/' + chapterpic_hou + '/' + pic[:-1] + '.jpg'
            else:
                url = 'https://images.dmzj.com/img/chapterpic/' + chapterpic_qian + '/' + chapterpic_hou + '/' + pic + '.jpg'
            pics_url_sorted.append(url)

        return pics_url_sorted

    # 3. Download one chapter's images
    def download_pics(self, data):
        download_header = {
            'Referer': data['url']  # the image server rejects requests without a valid Referer
        }
        # drop dots so the chapter name is a valid directory name
        name = data['chapter'].replace('.', '')
        chapter_save_dir = os.path.join(self.save_dir, name)
        if not os.path.exists(chapter_save_dir):
            os.mkdir(chapter_save_dir)
        pics_data = self.send_request(url=data['url'])
        pics_url = self.parse_pic_data(pics_data)

        # Download and save each image
        for idx, pic_url in enumerate(pics_url):
            pic_name = '%03d.jpg' % (idx + 1)
            pic_save_path = os.path.join(chapter_save_dir, pic_name)
            with closing(requests.get(pic_url, headers=download_header, stream=True)) as response:
                if response.status_code == 200:
                    with open(pic_save_path, 'wb') as f:
                        # stream the image to disk in 1 KB chunks
                        for chunk in response.iter_content(chunk_size=1024):
                            f.write(chunk)
                else:
                    print('Request failed:', pic_url)
        time.sleep(5)  # pause between chapters to avoid hammering the server

    def run(self):
        data = self.send_request(url=self.target_url)
        self.parse_list_data(data)

        for data in tqdm(self.chapter_list):
            self.download_pics(data)


if __name__ == '__main__':
    MangaSpider().run()
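
The subtlest step above is parse_pic_data's handling of 13- versus 14-digit image IDs. A minimal standalone sketch with made-up IDs (hypothetical values, for illustration only) shows why the padding matters:

# Made-up IDs for illustration: two 14-digit IDs and one 13-digit ID
pics = ['15451234567890', '1545123456788', '15451234567891']

# A naive numeric sort puts every 13-digit ID first, breaking page order
print(sorted(pics, key=int))
# ['1545123456788', '15451234567890', '15451234567891']

# Pad 13-digit IDs with a trailing '0' so they sort among the 14-digit ones
padded = [p + '0' if len(p) == 13 else p for p in pics]
print(sorted(padded, key=int))
# ['15451234567880', '15451234567890', '15451234567891']

The pad exists only to make the sort comparable; it is stripped again when the image URL is assembled.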

3. Summary

  • Bypassing the Referer-based anti-scraping check (see the sketch below)
  • Parsing dynamically loaded data
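
The images host serves a picture only when the request's Referer points back at a dmzj.com chapter page. A minimal sketch of the difference, assuming the server still enforces this check (both URLs below are placeholders, not real pages):

import requests

img_url = 'https://images.dmzj.com/img/chapterpic/xxxx/yyyyy/zzzzzzzzzzzzz.jpg'  # placeholder

# Without a Referer the image host typically refuses the request
print(requests.get(img_url).status_code)  # expect 403 or similar

# With the chapter page as Referer the image downloads normally
headers = {'Referer': 'https://www.dmzj.com/view/yaoshenji/1.html'}  # hypothetical chapter URL
print(requests.get(img_url, headers=headers).status_code)  # expect 200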