#!/usr/bin/env python
# -*- coding=utf8 -*-
import os
import time
import asyncio
import typing
from httpx import Response, AsyncClient, ConnectionClosed, ConnectTimeout, ReadTimeout
# fake
URLTypes = typing.Union["URL", str]
PATHTypes = typing.Union["PATH", str]
class ConcurrentDownloader(object):
    """Download one URL as several concurrent HTTP ``Range`` requests.

    Workflow: probe the size with a ``HEAD`` request, split it into byte
    ranges, stream every range into its own ``<file>.<index>`` part file,
    then concatenate the parts into the final file.  Part files are opened in
    append mode, so an interrupted download resumes where it stopped.
    """

    # Default request headers; per-call ``headers`` are merged over a copy.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
                  'application/signed-exchange;v=b3',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/74.0.3729.169 Safari/537.36',
        'Accept-Encoding': 'gzip, deflate'
    }
    # NOTE(review): this client is created when the class body runs, shared by
    # every instance and never closed, while `download` starts a new event
    # loop per call via asyncio.run -- confirm the installed httpx tolerates
    # reusing one client across event loops.
    cli = AsyncClient()
    # How many times one part is attempted before it is reported as failed.
    max_attempts = 3

    def _merge_headers(self, kwargs: dict) -> dict:
        """Pop ``headers`` out of *kwargs* and merge it over the defaults.

        Popping unconditionally keeps a falsy ``headers`` value (``None``,
        ``{}``) from being passed to httpx a second time via ``**kwargs``.
        """
        merged = self.headers.copy()
        extra = kwargs.pop("headers", None)
        if extra:
            merged.update(extra)
        return merged

    async def send_req(self, url: URLTypes, method: str = 'GET', **kwargs) -> typing.Union[Response, None]:
        """
        Send a request.

        :param url: address to request
        :param method: HTTP method
        :param kwargs: extra keyword arguments forwarded to the client's
            ``request``; ``headers`` is merged over the class defaults and
            ``timeout`` defaults to 120 seconds
        :return: the response, or None when the request failed
        """
        try:
            headers = self._merge_headers(kwargs)
            kwargs.setdefault("timeout", 120)
            return await self.cli.request(method, url, headers=headers, **kwargs)
        except asyncio.CancelledError:
            # Cancellation must reach the caller (it is an Exception subclass
            # on Python 3.7, so it has to be re-raised explicitly).
            raise
        except (ConnectTimeout, ReadTimeout):
            print("連接/讀取超時,請檢查您的網絡和任務地址")
        except ConnectionClosed:
            print("連接被關閉")
        except Exception as exc:
            print("訪問任務地址失敗:%s" % exc)
        return None

    async def stream_download(self, url: URLTypes, file: PATHTypes, method: str = 'GET', **kwargs) -> bool:
        """
        Stream a response body to disk, appending to *file*.

        Nothing is written unless the status is a success.  When a ``Range``
        header was sent, only ``206 Partial Content`` is accepted: a ``200``
        would carry the whole resource and corrupt the part file.

        :param url: address of the download task
        :param file: path the content is appended to
        :param method: HTTP method used for the download
        :param kwargs: extra keyword arguments forwarded to the client's
            ``stream``; ``headers`` is merged over the class defaults and
            ``timeout`` defaults to None (no timeout)
        :return: True/False
        """
        headers = self._merge_headers(kwargs)
        kwargs.setdefault("timeout", None)
        ranged = any(str(key).lower() == "range" for key in headers)
        try:
            async with self.cli.stream(method, url, headers=headers, **kwargs) as resp:
                resp.raise_for_status()
                if ranged and resp.status_code != 206:
                    print("range request not honoured, status: %s" % resp.status_code)
                    return False
                with open(file, "ab") as f:
                    async for chunk in resp.aiter_bytes():
                        f.write(chunk)
            return True
        except asyncio.CancelledError:
            raise
        except Exception as exc:
            print("stream download failed: %s" % exc)
            return False

    @staticmethod
    async def parse_cont_len(resp: Response) -> int:
        """
        Parse ``content-length`` from the response headers.

        :param resp: response object
        :return: the content length, or 0 when the header is missing or is
            not a plain decimal number
        """
        cont_len = resp.headers.get("content-length")
        if not cont_len or not cont_len.isdigit():
            print("get content-length failed")
            return 0
        return int(cont_len)

    async def get_cont_len(self, url: URLTypes) -> int:
        """
        Get the size in bytes of the content behind *url* via ``HEAD``.

        :param url: link address
        :return: length in bytes, or 0 when it could not be determined
        """
        resp = await self.send_req(url=url, method="HEAD")
        if resp is None:
            return 0
        if resp.status_code >= 400:
            # The content-length of an error page is not the file size.
            print("HEAD request failed, status: %s" % resp.status_code)
            return 0
        return await self.parse_cont_len(resp)

    async def sub_download(self, url: URLTypes, sub_index: int, start: int, end: int, save_dir: PATHTypes) -> bool:
        """
        Sub download task: fetch bytes ``start``..``end`` (both inclusive).

        The part is stored as ``<file>.<sub_index>``.  Every attempt resumes
        after the bytes already on disk; up to ``max_attempts`` attempts are
        made.

        :param url: download link
        :param sub_index: number of this sub task
        :param start: first byte to download
        :param end: last byte to download (inclusive)
        :param save_dir: directory the download is saved to
        :return: bool flag, True when the part is complete
        """
        print("do--第%d個子任務開始下載" % sub_index)
        file_path = self.organize_filepath(url, save_dir)
        sub_filepath = file_path + ".%d" % sub_index
        expected_size = end - start + 1  # the range is inclusive on both ends
        ret = False
        for attempt in range(self.max_attempts):
            downloaded = os.path.getsize(sub_filepath) if os.path.exists(sub_filepath) else 0
            if downloaded > expected_size:
                # More bytes than the range holds: the part is stale or
                # corrupt, so start it over.
                os.remove(sub_filepath)
                downloaded = 0
            if downloaded == expected_size:
                if attempt == 0:
                    print("sub task already downloaded, sub index: %s" % sub_index)
                ret = True
                break
            headers = {"Range": "bytes=%d-%d" % (start + downloaded, end)}
            if await self.stream_download(url, sub_filepath, headers=headers):
                ret = True
                break
        if ret:
            print("第%d個子任務下載完成--done" % sub_index)
        else:
            print("sub task failed, sub index: %s" % sub_index)
        return ret

    @staticmethod
    def merge_sub_files(concurrent_num: int, file_path: PATHTypes) -> None:
        """
        Merge the content of all sub download tasks into *file_path*.

        Part files are removed only after the merged file has been written
        completely; if merging fails, the partial output is removed and the
        parts are kept.

        :param concurrent_num: total number of part files
        :param file_path: path the download task is saved to
        :return: None
        """
        chunk_size = 64 * 1024
        sub_filepaths = [file_path + ".%d" % index for index in range(concurrent_num)]
        try:
            with open(file_path, "wb") as f:
                for sub_filepath in sub_filepaths:
                    with open(sub_filepath, "rb") as sub_f:
                        while True:
                            chunk = sub_f.read(chunk_size)
                            if not chunk:
                                break
                            f.write(chunk)
        except Exception:
            # A half-written target would make `download` believe the file
            # already exists.
            if os.path.exists(file_path):
                os.remove(file_path)
            raise
        for sub_filepath in sub_filepaths:
            os.remove(sub_filepath)

    async def download_subs(self, tasks: typing.Set[asyncio.Future]) -> None:
        """
        Wait until all sub download tasks have finished.

        A finished task cannot be run again, so retrying happens inside
        ``sub_download``; this method only waits and reports tasks that
        raised.

        :param tasks: all sub download tasks
        :return: None
        """
        if not tasks:
            return  # asyncio.wait rejects an empty set
        done, _ = await asyncio.wait(tasks)
        for task in done:
            if not task.cancelled() and task.exception() is not None:
                print("sub task raised: %r" % task.exception())

    @staticmethod
    def _split_ranges(cont_len: int, concurrent_num: int) -> typing.List[typing.Tuple[int, int]]:
        """Split ``cont_len`` bytes into at most ``concurrent_num`` inclusive ``(start, end)`` ranges."""
        each_len = cont_len // concurrent_num + 1
        return [
            (start, min(start + each_len, cont_len) - 1)
            for start in range(0, cont_len, each_len)
        ]

    @staticmethod
    def _task_succeeded(task: asyncio.Future) -> bool:
        """Return True only for a finished task whose result is True."""
        if task.cancelled() or task.exception() is not None:
            return False
        return task.result() is True

    async def _download(self, url: URLTypes, concurrent_num: int, save_dir: PATHTypes) -> bool:
        """
        Run the download task.

        :param url: task link
        :param concurrent_num: number of concurrent sub tasks
        :param save_dir: directory the downloaded content is stored in
        :return: bool flag, True when the file was downloaded and merged
        :raises ValueError: when ``concurrent_num`` is smaller than 1
        """
        if concurrent_num < 1:
            raise ValueError("concurrent_num must be >= 1, got %r" % concurrent_num)
        cont_len = await self.get_cont_len(url)
        if not cont_len:
            print("獲取文件大小失敗, 沒有執行下載任務")
            return False
        print("開始下載, 文件大小: %d[%.2fM], 併發數: %d" % (cont_len, cont_len / (1024 ** 2), concurrent_num))
        ranges = self._split_ranges(cont_len, concurrent_num)
        tasks = {
            asyncio.ensure_future(self.sub_download(url, index, start, end, save_dir))
            for index, (start, end) in enumerate(ranges)
        }
        await self.download_subs(tasks)
        if not all(self._task_succeeded(task) for task in tasks):
            # Part files stay on disk so a later run can resume.
            print("some sub tasks failed, nothing was merged")
            return False
        file_path = self.organize_filepath(url, save_dir)
        # Small files may yield fewer parts than concurrent_num.
        self.merge_sub_files(len(ranges), file_path)
        return True

    @staticmethod
    def organize_filepath(url: URLTypes, save_dir: PATHTypes) -> PATHTypes:
        """
        Build the storage path from the download link and the target directory.

        The file name is everything after the last ``/`` of the link.

        :param url: download link
        :param save_dir: directory the downloaded content is stored in
        :return: path the downloaded file is stored at
        """
        filename = str(url).rsplit("/", 1)[-1]
        return os.path.join(save_dir, filename)

    def download(self, url: URLTypes, save_dir: PATHTypes = r".", concurrent_num: int = 10) -> bool:
        """
        Download the content behind a link.

        Runs the download with ``asyncio.run``.

        :param url: download link
        :param save_dir: directory the downloaded content is stored in
        :param concurrent_num: number of concurrent sub tasks
        :return: bool flag, True when the download succeeded; False when it
            failed or the target file already exists
        :raises ValueError: when ``save_dir`` is not an existing directory
        """
        print("下達了一個下載任務, 任務地址:%s" % url)
        st = time.time()
        if not os.path.isdir(save_dir):
            raise ValueError("目錄不存在: %s" % save_dir)
        file_path = self.organize_filepath(url, save_dir)
        if os.path.exists(file_path):
            print("任務文件已存在, 未執行下載任務: %s" % file_path)
            return False
        ret = asyncio.run(self._download(url, concurrent_num, save_dir))
        et = time.time()
        print("任務運行結束, 總耗時: %.2f分鐘" % ((et - st) / 60))
        return ret
if __name__ == '__main__':
    # Manual smoke run: download a sample archive into the current directory.
    task_link = "http://soft.down9.xyz/GitKraken_7565.zip"
    cd = ConcurrentDownloader()
    d_ret = cd.download(task_link)
    # An explicit check instead of `assert`, which is stripped under `python -O`.
    if d_ret is not True:
        raise SystemExit("download failed: %s" % task_link)
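# Flow: 1. get the content size of the download task -- 2. split it into several sub-tasks according to that size -- 3. download all sub-tasks -- 4. merge the content of all sub-tasks
# Key point: sub-tasks use streaming download, writing to the file while downloading, so that resuming an interrupted download is supported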