Python:通過 HTTP Range 請求頭(request header)多線程下載圖片、文件

寫爬蟲過程中發現圖片下載比較慢,遂使用多線程下載來提速

import threading
import requests


class MulThreadDownload(threading.Thread):
    """Worker thread that downloads one byte range of ``url``.

    The worker issues a GET request with a ``Range: bytes=start-end`` header
    and, on success, stores the response body in ``temp_dict`` under the key
    ``startpos``.  Nothing is stored when every attempt fails.

    Args:
        url: Address of the resource to download.
        startpos: First byte offset (inclusive) of the range.
        endpos: Last byte offset (inclusive) of the range.
        temp_dict: Dict shared with the caller; receives ``{startpos: bytes}``.
        headers: Optional extra request headers.  Never modified.
        proxies: Optional ``requests``-style proxies mapping.
    """

    # Upper bound on request attempts for a single range.
    MAX_ATTEMPTS = 10

    def __init__(self, url, startpos, endpos, temp_dict, headers, proxies):
        super().__init__()
        self.url = url
        self.startpos = startpos
        self.endpos = endpos
        self.temp_dict = temp_dict
        self.headers = headers
        self.proxies = proxies

    def _build_headers(self):
        """Return a fresh header dict carrying this worker's Range header."""
        # Copy instead of writing into self.headers: the same dict is usually
        # handed to every worker, so mutating it would let one thread
        # overwrite another thread's Range before its request is sent.
        headers = dict(self.headers) if self.headers else {}
        headers["Range"] = "bytes=%s-%s" % (self.startpos, self.endpos)
        return headers

    @staticmethod
    def _proxies_for_attempt(base, current, attempt):
        """Return the proxies mapping to use for retry number ``attempt``.

        On attempts 2 and 5 the ``https`` proxy of ``base`` is re-used as an
        ``http`` proxy with an ``http://`` scheme; on attempts 3 and 7 it is
        re-used with its URL unchanged.  On every other attempt, or when
        ``base`` has no ``https`` entry, ``current`` is returned as is.
        """
        https_proxy = base.get("https") if base else None
        if not https_proxy:
            return current
        if attempt in (2, 5):
            _, sep, rest = https_proxy.partition("://")
            return {"http": "http://" + (rest if sep else https_proxy)}
        if attempt in (3, 7):
            return {"http": https_proxy}
        return current

    def download(self, proxies):
        """Fetch this worker's byte range, retrying up to ``MAX_ATTEMPTS`` times.

        Args:
            proxies: Initial ``requests``-style proxies mapping (may be None).
        """
        headers = self._build_headers()
        current = proxies
        for attempt in range(self.MAX_ATTEMPTS):
            # Switch the proxy form on some retries to raise the success rate.
            # Always derive it from the original mapping: the switched mapping
            # no longer has an "https" entry to derive the next form from.
            current = self._proxies_for_attempt(proxies, current, attempt)
            try:
                res = requests.get(self.url, headers=headers, proxies=current, timeout=3)
            except Exception as e:
                # Best-effort download: report the failure and retry.
                print(f'{self.url} down load error {str(e)}')
                continue
            # Retry on error statuses so an error page is never stored as data.
            if res.ok and res.content:
                self.temp_dict[self.startpos] = res.content
                break

    def run(self):
        """Thread entry point: download the range with the configured proxies."""
        self.download(self.proxies)


def download_img_multi_thread(url, headers, proxies):
    # 獲取文件的大小和文件名
    filesize = 0
    if headers:
        filesize = int(requests.head(url, headers=headers, proxies=proxies).headers.get('Content-Length'))
    if not headers:
        filesize = int(requests.head(url, proxies=proxies).headers.get('Content-Length'))
    if filesize:
        # 線程數
        threadnum = 5
        # 信號量,同時只允許5個線程運行
        # threading.BoundedSemaphore(threadnum)
        # 默認5線程現在,也可以通過傳參的方式設置線程數
        step = filesize // threadnum
        mtd_list = []
        start = 0
        end = -1
        # 如果文件大小爲11字節,那就是獲取文件0-10的位置的數據。如果end = 10,說明數據已經獲取完了。
        temp_dict = dict()
        while end < filesize - 1:
            start = end + 1
            end = start + step - 1
            if end > filesize - 1:
                end = filesize - 1
            if filesize - 1 - end < step:
                end = filesize - 1
            t = MulThreadDownload(url, start, end, temp_dict, headers, proxies)
            t.start()
            mtd_list.append(t)

        for i in mtd_list:
            i.join()
        # 所有線程都下完,組合所有字節到一起
        temp_dict = sorted(temp_dict.items(), key=lambda x: x[0])
        temp_b = b''
        for i in temp_dict:
            temp_b = temp_b+i[1]
        if len(temp_b) == filesize:  # 校驗文件大小
            return temp_b
        else:
            print(f'file download failed temp_b {len(temp_b)} filesize {filesize}')
    return None
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章