文件下載太慢嗎?使用 httpx 和 asyncio 實現併發下載的小 demo

#!/usr/bin/env python
# -*- coding=utf8 -*-

import os
import time
import asyncio
import typing

from httpx import Response, AsyncClient, ConnectionClosed, ConnectTimeout, ReadTimeout

# fake
URLTypes = typing.Union["URL", str]
PATHTypes = typing.Union["PATH", str]


class ConcurrentDownloader(object):
    """Concurrent HTTP downloader.

    Splits a download into ``concurrent_num`` byte ranges, streams each
    range into its own part file (``<file>.<index>``, opened in append
    mode so interrupted sub-tasks can resume), then concatenates the part
    files into the final file.
    """

    # Default request headers; per-call ``headers`` kwargs are merged on top.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
                  'application/signed-exchange;v=b3',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/74.0.3729.169 Safari/537.36',
        'Accept-Encoding': 'gzip, deflate'
    }
    # NOTE(review): created at class-definition time and shared by every
    # instance. This works with the old httpx version this demo targets,
    # but newer httpx binds an AsyncClient to the running event loop —
    # confirm before reusing across multiple asyncio.run() calls.
    cli = AsyncClient()

    async def send_req(self, url: URLTypes, method: str = 'GET', **kwargs) -> typing.Union[Response, None]:
        """
        Send a single request, swallowing transport-level errors.
        :param url: URL to request
        :param method: HTTP method
        :param kwargs: extra keyword arguments forwarded to AsyncClient.request
        :return: the Response, or None when the request failed
        """
        headers = self.headers.copy()
        if kwargs.get("headers"):
            headers.update(kwargs.pop("headers"))
        try:
            return await self.cli.request(method, url, headers=headers, timeout=120, **kwargs)
        except (ConnectTimeout, ReadTimeout):
            print("連接/讀取超時,請檢查您的網絡和任務地址")
        except ConnectionClosed:
            print("連接被關閉")
        except Exception as f:
            print("訪問任務地址失敗:%s" % f)
        # BUG FIX: the original ``finally: return resp`` suppressed every
        # in-flight exception (including CancelledError / KeyboardInterrupt).
        # A plain return after the handlers keeps the "None on failure"
        # contract without masking exceptions the handlers did not catch.
        return None

    async def stream_download(self, url: URLTypes, file: PATHTypes, method: str = 'GET', **kwargs) -> bool:
        """
        Stream the response body into *file*.

        The file is opened in append mode ("ab") so that a retried call
        continues where the previous attempt stopped (resume support).
        :param url: download URL
        :param file: path the downloaded bytes are appended to
        :param method: HTTP method used for the request
        :param kwargs: extra keyword arguments forwarded to AsyncClient.stream
        :return: True on success, False on any error
        """
        headers = self.headers.copy()
        if kwargs.get("headers"):
            headers.update(kwargs.pop("headers"))
        try:
            async with self.cli.stream(method, url, headers=headers, timeout=None, **kwargs) as resp:
                with open(file, "ab") as f:
                    async for chunk in resp.aiter_bytes():
                        f.write(chunk)
            return True
        except asyncio.CancelledError:
            # BUG FIX: the original bare ``except:`` also swallowed task
            # cancellation; cancellation must propagate.
            raise
        except Exception as e:
            # BUG FIX: the bare ``except:`` hid every failure silently;
            # report it so callers can tell why a sub-task returned False.
            print("下載出錯:%s" % e)
            return False

    @staticmethod
    async def parse_cont_len(resp: Response) -> int:
        """
        Parse the Content-Length header of a response.
        :param resp: response object
        :return: content length in bytes, or 0 when absent/malformed
        """
        cont_len = resp.headers.get("content-length")
        if not cont_len or not cont_len.isdigit():
            print("get content-length failed")
            return 0
        return int(cont_len)

    async def get_cont_len(self, url: URLTypes) -> int:
        """
        Get the byte size of the resource behind *url* via a HEAD request.
        :param url: resource URL
        :return: size in bytes, or 0 when the request or parsing failed
        """
        resp = await self.send_req(url=url, method="HEAD")
        # BUG FIX: the original ``resp is None and 0 or await ...`` always
        # evaluated the ``or`` arm (the ``and`` arm yields falsy 0), so a
        # failed request crashed inside parse_cont_len(None).
        if resp is None:
            return 0
        return await self.parse_cont_len(resp)

    async def sub_download(self, url: URLTypes, sub_index: int, start: int, end: int, save_dir: PATHTypes) -> bool:
        """
        Download one byte range into its part file, resuming if a partial
        part file already exists.
        :param url: download URL
        :param sub_index: index of this sub-task (also the part-file suffix)
        :param start: first byte of the range (inclusive)
        :param end: last byte of the range (inclusive)
        :param save_dir: directory the final file will be saved in
        :return: True when the part file is complete, False otherwise
        """
        print("do--第%d個子任務開始下載" % sub_index)
        file_path = self.organize_filepath(url, save_dir)
        sub_filepath = file_path + ".%d" % sub_index
        # A Range of bytes=start-end is inclusive, so a complete part file
        # holds end - start + 1 bytes.
        expected_size = end - start + 1
        if os.path.exists(sub_filepath):
            # Resume: measure the part file with getsize() instead of
            # reading the whole file into memory as the original did.
            already_download_size = os.path.getsize(sub_filepath)
            # BUG FIX: the original compared against ``end - start`` and
            # accepted a part file that was one byte short, corrupting the
            # merged result.
            if already_download_size >= expected_size:
                print("sub task already downloaded, sub index: %s" % sub_index)
                print("第%d個子任務下載完成--done" % sub_index)
                return True
            start += already_download_size
        headers = {"Range": "bytes=%d-%d" % (start, end)}
        ret = await self.stream_download(url, sub_filepath, headers=headers)
        print("第%d個子任務下載完成--done" % sub_index)
        return ret

    @staticmethod
    def merge_sub_files(concurrent_num: int, file_path: PATHTypes) -> None:
        """
        Concatenate all part files into *file_path* and delete them.
        :param concurrent_num: number of part files (suffixes 0..n-1)
        :param file_path: destination path of the merged file
        :return: None
        """
        chunk_size = 64 * 1024
        with open(file_path, "wb") as f:
            for index in range(concurrent_num):
                sub_filepath = file_path + ".%d" % index
                with open(sub_filepath, "rb") as sub_f:
                    chunk = sub_f.read(chunk_size)
                    while chunk:
                        f.write(chunk)
                        chunk = sub_f.read(chunk_size)
                    # (the original also wrote the final empty chunk here —
                    # a harmless no-op that has been removed)
                os.remove(sub_filepath)

    async def download_subs(self, tasks: typing.Set[asyncio.Future]) -> None:
        """
        Await a set of sub-download tasks once and report failures.

        BUG FIX: the original recursed on the set of *already finished*
        Future objects whenever one returned False. Awaiting a finished
        Future yields the same result immediately, so a single failure
        caused unbounded recursion. A Future cannot be re-run; retrying
        requires recreating the coroutines, which _download now does.
        :param tasks: sub-download tasks to wait for
        :return: None
        """
        done, pending = await asyncio.wait(tasks)
        failed_num = sum(1 for task in done if task.result() is False) + len(pending)
        if failed_num:
            print("有%d個子任務下載失敗" % failed_num)

    async def _download(self, url: URLTypes, concurrent_num: int, save_dir: PATHTypes,
                        max_retries: int = 3) -> bool:
        """
        Run the whole download: split into ranges, download concurrently
        (retrying failed ranges by recreating their coroutines), then merge.
        :param url: download URL
        :param concurrent_num: number of concurrent byte ranges
        :param save_dir: directory to store the downloaded file in
        :param max_retries: attempts per range before giving up
        :return: True when every range downloaded and the merge ran
        """
        cont_len = await self.get_cont_len(url)
        if not cont_len:
            print("獲取文件大小失敗, 沒有執行下載任務")
            return False
        print("開始下載, 文件大小: %d[%.2fM], 併發數: %d" % (cont_len, cont_len / (1024 ** 2), concurrent_num))
        each_len = cont_len // concurrent_num + 1
        # BUG FIX: the original clamp ``end <= cont_len and end or cont_len``
        # pointed one byte past the end; inclusive ranges cap at cont_len - 1.
        ranges = []
        for i in range(concurrent_num):
            start = each_len * i
            end = min(each_len * (i + 1) - 1, cont_len - 1)
            ranges.append((i, start, end))
        remaining = ranges
        for _ in range(max_retries):
            results = await asyncio.gather(
                *(self.sub_download(url, i, start, end, save_dir) for i, start, end in remaining))
            remaining = [rng for rng, ok in zip(remaining, results) if not ok]
            if not remaining:
                break
        else:
            # BUG FIX: the original merged unconditionally; merging part
            # files of failed ranges would produce a corrupt file.
            print("有%d個子任務下載失敗, 未合併文件" % len(remaining))
            return False
        file_path = self.organize_filepath(url, save_dir)
        self.merge_sub_files(concurrent_num, file_path)
        return True

    @staticmethod
    def organize_filepath(url: URLTypes, save_dir: PATHTypes) -> PATHTypes:
        """
        Build the destination path from the URL's last path segment.
        NOTE(review): a URL with a query string keeps it in the filename —
        confirm the demo links never carry one.
        :param url: download URL
        :param save_dir: directory to store the downloaded file in
        :return: destination file path
        """
        filename = url.rsplit("/", 1)[-1]
        return os.path.join(save_dir, filename)

    def download(self, url: URLTypes, save_dir: PATHTypes = r".", concurrent_num: int = 10) -> bool:
        """
        Synchronous entry point: download *url* into *save_dir*.
        :param url: download URL
        :param save_dir: directory to store the downloaded file in (must exist)
        :param concurrent_num: number of concurrent byte ranges
        :return: True when the download succeeded, False otherwise
        :raises ValueError: when *save_dir* does not exist
        """
        print("下達了一個下載任務, 任務地址:%s" % url)
        st = time.time()
        if not os.path.exists(save_dir):
            raise ValueError("目錄不存在: %s" % save_dir)
        file_path = self.organize_filepath(url, save_dir)
        if os.path.exists(file_path):
            print("任務文件已存在, 未執行下載任務: %s" % file_path)
            return False
        ret = asyncio.run(self._download(url, concurrent_num, save_dir))
        et = time.time()
        print("任務運行結束, 總耗時: %.2f分鐘" % ((et - st) / 60))
        return ret


if __name__ == '__main__':
    # Demo: concurrently download a sample archive into the current directory.
    downloader = ConcurrentDownloader()
    succeeded = downloader.download("http://soft.down9.xyz/GitKraken_7565.zip")
    assert succeeded is True

流程:1.獲取下載任務的內容大小--2.根據內容大小切割爲若干個子任務--3.下載所有子任務--4.合併所有子任務的內容

要點:子任務使用流式下載,邊下邊存入文件以便支持斷點續傳

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章