多線程爬蟲
關於python3多線程,大家去菜鳥學學就行了,並沒有多難
接下來我們來改造我們那個爬取指定多套圖的爬蟲(講一下基礎就行了,至於生產者消費者之類的有興趣的可以自己改)
一、數據存儲改成隊列
二、每次發送請求前用time.sleep(random)改變發送請求的頻率
三、發送請求後驗證接收信息的狀態碼page.status_code,如果不爲200(成功),則time.sleep(random)後繼續發送
(因爲偷懶,代碼並沒有優化,不過並不影響什麼)(⊙o⊙)…這個可能這兩天練習爬的多了封ip了?
也可能是代碼有問題,因爲是直接拿前面教程的代碼改的,有些地方可能不合理,所以這個代碼就不拿出來讓人笑話了,直接拿我之前學習的時候寫的吧
import requests
import os
import queue
import threading
import time
import random
from lxml import html
threadLock = threading.Lock()
threads = []
big_url_list = queue.Queue()
big_message = queue.Queue()
sleep = random.uniform(1.5, 2.5)
path = './收藏/'
def GetBigUrlList():
global big_url_list
print("開始獲取任務鏈接!")
with open("網址.txt", "r") as f:
url_list = f.read().splitlines()
f.close()
for url in url_list:
big_url_list.put(url)
print(big_url_list.queue)
def GetProxy():
proxy_list = [
'113.120.32.12:9999',
'171.15.173.185:9999'
]
proxy = random.choice(proxy_list)
proxies = {
'http': 'http://' + proxy,
'https': 'https://' + proxy,
}
return proxies
def GetHeaders():
user_agent_list = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
UserAgent=random.choice(user_agent_list)
headers = {'User-Agent': UserAgent}
return headers
class GetAllMessage(threading.Thread):
def run(self):
global big_url_list,big_message,sleep
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
while True:
if big_url_list.empty():
break
url = big_url_list.get()
try:
print("開始解析地址:%s" % (url))
page = requests.get(url=url,headers=headers)
while page.status_code != 200:
time.sleep(sleep)
page = requests.get(url=url, headers=headers)
tree = html.fromstring(page.text)
result = tree.xpath('//div[@class="pagenavi"]//a[last()-1]/@href')
number_end = int("".join(result).replace(url + '/', ''))
print("解析此套圖一共 %d 張" % (number_end))
print("開始解析此套圖所有圖片名稱和地址!")
img_lists = []
for i in range(1, number_end + 1):
page_url = url + "/" + str(i)
page2 = requests.get(url=page_url, headers=headers)
while page2.status_code != 200:
time.sleep(sleep)
page2 = requests.get(url=page_url, headers=headers)
tree = html.fromstring(page2.text)
img_url = tree.xpath('//div[@class="main-image"]//img/@src')
img_name = tree.xpath('//div[@class="content"]//h2[@class="main-title"]/text()')
img_list = {'img_name': img_name[0], 'img_url': img_url[0]}
img_lists.append(img_list)
small_message = {'big_name':img_lists[0]['img_name'],'big_url':url,'img_lists':img_lists}
big_message.put(small_message)
except Exception as e:
print(e)
class DownloadImg(threading.Thread):
def run(self):
global big_message,path,sleep
while True:
if big_message.empty():
break
small_message = big_message.get()
all_path = path + small_message['big_name']
print("開始構建套圖存儲目錄:%s" % (all_path))
is_exists = os.path.exists(all_path)
if not is_exists:
os.makedirs(all_path)
print("目錄 %s 構建成功!" % (all_path))
else:
print("目錄 %s 已存在!" % (all_path))
for i in range(len(small_message['img_lists'])):
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
"Referer": small_message['big_url']
}
# headers = GetHeaders()
# headers["Referer"] = small_message['big_url'] + "/" + str(i)
# img = requests.get(url=small_message['img_lists'][i]['img_url'], headers=headers,proxies=GetProxy())
img = requests.get(url=small_message['img_lists'][i]['img_url'], headers=headers)
while img.status_code != 200:
print(img.status_code)
time.sleep(sleep)
img = requests.get(url=small_message['img_lists'][i]['img_url'], headers=headers)
# img = requests.get(url=small_message['img_lists'][i]['img_url'], headers=headers,proxies=GetProxy())
with open(all_path + '/' + small_message['img_lists'][i]['img_name'] + '.jpg', "wb") as f:
f.write(img.content)
f.close()
except Exception as e:
print(e)
pass
print("套圖 %s 下載完畢!" % (small_message['big_name']))
if __name__ == '__main__':
start = time.clock()
GetBigUrlList()
for i in range(10):
t = GetAllMessage()
t.start()
threads.append(t)
for t in threads:
t.join()
threads.clear()
for i in range(5):
t = DownloadImg()
t.start()
threads.append(t)
for t in threads:
t.join()
end = time.clock()
print(end - start)
這個代碼還加了ip代理池和隨機頭部,不過ip代理並沒有開,因爲免費的代理ip都太慢了,如果想加,在發送請求加一個參數就行了
五個線程是摸索到的差不多是頻率的臨界,多了時間效果反而不好了