寫爬蟲過程中發現圖片下載比較慢,遂使用多線程下載來提速。
import threading
import requests
class MulThreadDownload(threading.Thread):
    """Worker thread that fetches one inclusive byte range of a URL.

    On success the downloaded bytes are stored in ``temp_dict`` under the key
    ``startpos``. If every attempt fails nothing is stored, so the caller can
    detect the missing piece when it reassembles the ranges.
    """

    # Number of GET attempts made before the range is given up on.
    _MAX_ATTEMPTS = 10
    # Per-request timeout in seconds.
    _TIMEOUT = 3

    def __init__(self, url, startpos, endpos, temp_dict, headers, proxies):
        """Record what to download; the request itself is made by ``run``.

        Args:
            url: Address of the resource to download.
            startpos: Offset of the first byte of the range (inclusive, int).
            endpos: Offset of the last byte of the range (inclusive, int).
            temp_dict: Dict shared with the caller; receives
                ``{startpos: content}`` when the range was downloaded.
            headers: Optional dict of extra request headers, or ``None``.
                It is never modified by this class.
            proxies: Optional ``requests``-style proxy mapping, or ``None``.
        """
        super().__init__()
        self.url = url
        self.startpos = startpos
        self.endpos = endpos
        self.temp_dict = temp_dict
        self.headers = headers
        self.proxies = proxies

    def _range_headers(self):
        """Return a fresh header dict containing this worker's ``Range``.

        The caller's headers are copied rather than updated in place: the same
        dict is usually handed to every worker, and writing ``Range`` into it
        would let one thread send another thread's byte range.
        """
        headers = dict(self.headers) if self.headers else {}
        headers["Range"] = "bytes=%s-%s" % (self.startpos, self.endpos)
        return headers

    @staticmethod
    def _proxies_for_attempt(attempt, proxies):
        """Pick the proxy mapping to use for the given 0-based attempt.

        Attempts 0-1 use ``proxies`` unchanged. Later attempts alternate
        between two variants derived from the *original* ``"https"`` entry,
        both registered under the ``"http"`` key:

        * attempts 2, 5, 6: the proxy URL with its scheme switched to http;
        * attempts 3, 4, 7, 8, 9: the proxy URL as given.

        When ``proxies`` is empty or has no ``"https"`` entry there is nothing
        to derive from, so ``proxies`` is returned as is.
        """
        https_proxy = proxies.get("https") if proxies else None
        if attempt < 2 or not https_proxy:
            return proxies
        if attempt in (2, 5, 6) and https_proxy.startswith("https:"):
            # Only swap the scheme when it really is "https:"; slicing blindly
            # would corrupt URLs that use another scheme.
            return {"http": "http:" + https_proxy[len("https:"):]}
        return {"http": https_proxy}

    def download(self, proxies):
        """Download the byte range, retrying with alternative proxy settings.

        Args:
            proxies: ``requests``-style proxy mapping (or ``None``) used for
                the first attempts and as the basis for the alternatives.

        A response is accepted only when its body is non-empty and exactly as
        long as the requested range; anything else (for example an error page)
        triggers another attempt.
        """
        headers = self._range_headers()
        expected_length = self.endpos - self.startpos + 1
        for attempt in range(self._MAX_ATTEMPTS):
            # Switch the proxy form on later attempts to raise the success rate.
            attempt_proxies = self._proxies_for_attempt(attempt, proxies)
            try:
                res = requests.get(
                    self.url,
                    headers=headers,
                    proxies=attempt_proxies,
                    timeout=self._TIMEOUT,
                )
            except Exception as e:
                # Best effort inside a worker thread: report and retry.
                print(f'{self.url} down load error {str(e)}')
                continue
            content = res.content
            if content and len(content) == expected_length:
                self.temp_dict[self.startpos] = content
                return

    def run(self):
        """Thread entry point: download the range with the stored proxies."""
        self.download(self.proxies)
def _split_byte_ranges(filesize, parts):
    """Split ``filesize`` bytes into inclusive ``(start, end)`` offset pairs.

    At most ``parts`` ranges are produced (fewer when the file has fewer bytes
    than ``parts``); they are contiguous, ordered, and together cover offsets
    ``0 .. filesize - 1`` exactly once. Returns ``[]`` for an empty file.
    """
    if filesize <= 0:
        return []
    parts = min(parts, filesize)
    base, extra = divmod(filesize, parts)
    ranges = []
    start = 0
    for index in range(parts):
        # The first ``extra`` ranges take one additional byte each so the
        # remainder is spread out instead of producing extra ranges.
        length = base + 1 if index < extra else base
        ranges.append((start, start + length - 1))
        start += length
    return ranges


def download_img_multi_thread(url, headers, proxies, threadnum=5):
    """Download ``url`` in parallel byte ranges and return the whole body.

    The size is read from the ``Content-Length`` of a HEAD request, the byte
    range is divided between up to ``threadnum`` ``MulThreadDownload`` workers,
    and the pieces are concatenated in offset order.

    Args:
        url: Address of the file to download.
        headers: Optional dict of request headers, or ``None``. Each worker
            receives its own copy, so the dict passed in is not modified.
        proxies: Optional ``requests``-style proxy mapping, or ``None``.
        threadnum: Maximum number of download threads (default 5).

    Returns:
        The complete content as ``bytes``, or ``None`` when the size cannot be
        determined, is zero, or the reassembled data does not match it.

    Raises:
        ValueError: If ``threadnum`` is smaller than 1.
        requests.RequestException: If the initial HEAD request fails.
    """
    if threadnum < 1:
        raise ValueError("threadnum must be at least 1")

    # Get the file size. Redirects are followed so the size describes the
    # resource the workers' GET requests will actually reach, and a timeout
    # keeps a dead server from blocking the caller forever.
    head_response = requests.head(
        url,
        headers=headers or None,
        proxies=proxies,
        allow_redirects=True,
        timeout=10,
    )
    try:
        filesize = int(head_response.headers.get('Content-Length'))
    except (TypeError, ValueError):
        # Header missing or not a number: ranges cannot be computed.
        print(f'file download failed, no usable Content-Length for {url}')
        return None
    if not filesize:
        return None

    chunks = {}
    workers = []
    for start, end in _split_byte_ranges(filesize, threadnum):
        # A private header copy per worker avoids threads overwriting each
        # other's Range value through a shared dict.
        worker_headers = dict(headers) if headers else headers
        worker = MulThreadDownload(url, start, end, chunks, worker_headers, proxies)
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()

    # All threads are done: stitch the pieces together in offset order.
    data = b''.join(chunks[offset] for offset in sorted(chunks))
    if len(data) == filesize:  # verify the file size
        return data
    print(f'file download failed temp_b {len(data)} filesize {filesize}')
    return None