記得幾個月前剛畢業去面試時,因為一直依賴框架,有一次面試官讓我寫多線程爬蟲,我竟然沒寫出來,後悔不已。今天突然想起這件事,剛好打開了博客,便記錄一個利用 concurrent.futures 線程池庫實現的多線程爬蟲。技術有限,不足之處請在評論中指出。
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue
# Lock that get_url() holds while it reads from the request queue.
get_lock = threading.Lock()
def get_url():
    """Take the next URL off the shared request queue.

    The read is serialized through the module-level ``get_lock``.

    Returns:
        The next item from the module-level ``queue_list``.

    Note:
        ``queue_list.get()`` is called with its default (blocking)
        arguments, so this call waits, while holding the lock, until an
        item is available.
    """
    # The "with" form releases the lock even if get() raises (for example
    # on KeyboardInterrupt while blocked); the bare acquire()/release()
    # pair would have left the lock held forever in that case.
    with get_lock:
        return queue_list.get()
if __name__ == '__main__':
    # Script entry point: queue up the URLs, then hand each one to a pool of
    # five worker threads. Every task calls Chrome_Test().run(url) on a
    # fresh Chrome_Test instance (Chrome_Test is defined outside this block).
    url_list = ['www.badidu.com'] * 5

    # Build the request queue. It must remain a module-level name because
    # get_url() reads it as a global.
    queue_list = Queue()
    for i in url_list:
        queue_list.put(i)

    # Leaving the "with" block waits for every submitted task to finish.
    with ThreadPoolExecutor(max_workers=5) as t:
        obj_list = []
        # This thread is the only consumer of the queue, so one empty()
        # check per iteration is sufficient; the former nested re-check and
        # its "else: break" branch could never behave differently.
        while not queue_list.empty():
            url = get_url()
            # Schedule the crawl on a worker thread and keep its future.
            obj = t.submit(Chrome_Test().run, url)
            obj_list.append(obj)
            # NOTE(review): the URL is deliberately NOT put back on the
            # queue. Re-enqueueing every URL that was just taken keeps the
            # queue permanently non-empty, so this loop never terminated and
            # kept submitting tasks (and growing obj_list) without bound.
            # If retrying is wanted, re-enqueue only URLs whose task failed.