记得前几个月刚毕业参加面试时,由于一直依赖框架,有一次面试官让我写多线程爬虫,我竟然没写出来,后悔不已。今天突然想起这件事,又刚好打开了博客,便记录一个利用 concurrent.futures 线程池库实现的多线程爬虫。技术有限,如有不足,请在评论中指出。
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue
# Lock guarding reads from the shared request queue.
get_lock = threading.Lock()


def get_url():
    """Remove and return the next item from the module-level ``queue_list``.

    The item is fetched while ``get_lock`` is held. ``queue_list.get()`` is
    called with no arguments, so for a ``queue.Queue`` this blocks until an
    item is available; callers should make sure the queue is non-empty.

    Returns:
        The item taken from ``queue_list``.
    """
    # Use the lock as a context manager so it is always released, even when
    # ``get()`` raises; a lock left held would stall every later caller.
    with get_lock:
        return queue_list.get()
if __name__ == '__main__':
    # NOTE(review): 'www.badidu.com' looks like a typo for 'www.baidu.com'
    # and carries no URL scheme -- confirm what Chrome_Test.run expects.
    url_list = ['www.badidu.com'] * 5

    # Request queue. It must remain a module-level name because get_url()
    # reads ``queue_list`` as a global.
    queue_list = Queue()
    for url in url_list:
        queue_list.put(url)

    # Pool of five worker threads; leaving the ``with`` block waits for
    # every submitted task to finish.
    with ThreadPoolExecutor(max_workers=5) as t:
        obj_list = []
        # Submit each queued URL exactly once. Putting a URL back on the
        # queue right after submitting it would keep the queue non-empty
        # forever, so this loop would never end and obj_list would grow
        # without bound.
        while not queue_list.empty():
            url = get_url()
            # A fresh Chrome_Test instance per task; its run() receives the URL.
            obj = t.submit(Chrome_Test().run, url)
            obj_list.append(obj)

        # Surface worker failures instead of silently discarding them:
        # Future.result() re-raises whatever exception run() raised.
        for obj in as_completed(obj_list):
            try:
                obj.result()
            except Exception as exc:
                # Best effort: report and continue with the remaining tasks.
                print('crawl task failed: {!r}'.format(exc))