Python3 Crawler Tutorial (6)

Multithreaded Crawler

For Python3 multithreading itself, the Runoob (菜鳥教程) tutorials are enough to get you started; it is not that hard.
Next we will rework our earlier crawler that downloads multiple specified photo sets (only the basics are covered here; if you are interested in a producer-consumer design and the like, you can refactor it yourself).
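If you have not written threaded Python before, here is a minimal sketch of the Thread-subclass pattern the crawler below relies on (Worker is an illustrative name, not part of the crawler):

import threading

class Worker(threading.Thread):
    def run(self):
        # run() is what executes in the new thread once start() is called
        print("%s is running" % self.name)

workers = [Worker() for _ in range(3)]
for w in workers:
    w.start()  # launch each thread
for w in workers:
    w.join()   # wait for all of them to finish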
1. Data storage is switched to queues (see the short queue sketch after this list)
2. Before each request, call time.sleep() with a random delay to vary the request frequency
3. After each request, check the response status code page.status_code; if it is not 200 (success), time.sleep() for a random delay and then resend
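As a quick illustration of point 1, here is a minimal sketch of the thread-safe queue.Queue put/get pattern the crawler uses (the URLs are placeholders):

import queue

tasks = queue.Queue()
for url in ["http://example.com/a", "http://example.com/b"]:
    tasks.put(url)  # producer side: enqueue work items

while True:
    try:
        url = tasks.get_nowait()  # consumer side: non-blocking dequeue
    except queue.Empty:
        break  # queue drained, worker exits
    print("processing", url)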
(Out of laziness I did not optimize the code, but that does not really matter.) It seems my IP may have been banned after all the practice crawling of these past couple of days. It could also be a problem with the code, since I adapted it directly from the earlier tutorials and some parts may be questionable, so I will not show that version here to be laughed at; instead, here is the code I wrote earlier while learning.

import requests
import os
import queue
import threading
import time
import random
from lxml import html

threadLock = threading.Lock()
threads = []
big_url_list = queue.Queue()  # task queue: album URLs to parse
big_message = queue.Queue()   # result queue: parsed album metadata
path = './收藏/'              # download root directory ("收藏" means "favorites")

def GetBigUrlList():
    global big_url_list
    print("Fetching the task URLs...")
    # "網址.txt" holds one album URL per line
    with open("網址.txt", "r") as f:
        url_list = f.read().splitlines()
    for url in url_list:
        big_url_list.put(url)
    print(big_url_list.queue)  # debug: show the queued URLs

def GetProxy():
    # Randomly pick one proxy from a small free-proxy pool
    proxy_list = [
        '113.120.32.12:9999',
        '171.15.173.185:9999'
    ]
    proxy = random.choice(proxy_list)
    proxies = {
        'http': 'http://' + proxy,
        'https': 'https://' + proxy,
    }
    return proxies

def GetHeaders():
    # Randomly pick a User-Agent so the requests look less uniform
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    UserAgent=random.choice(user_agent_list)
    headers = {'User-Agent': UserAgent}
    return headers

class GetAllMessage(threading.Thread):
    # Worker thread: pull an album URL from big_url_list, walk every page
    # of the album, and push the collected image names and URLs onto big_message.
    def run(self):
        global big_url_list, big_message
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
        }

        while True:
            try:
                # non-blocking get avoids the empty-check/get race between threads
                url = big_url_list.get_nowait()
            except queue.Empty:
                break
            try:
                print("Parsing album URL: %s" % (url))
                page = requests.get(url=url, headers=headers)
                while page.status_code != 200:
                    time.sleep(random.uniform(1.5, 2.5))  # random back-off, then retry
                    page = requests.get(url=url, headers=headers)
                tree = html.fromstring(page.text)
                # the second-to-last link in the pager holds the last page number
                result = tree.xpath('//div[@class="pagenavi"]//a[last()-1]/@href')
                number_end = int("".join(result).replace(url + '/', ''))
                print("This album has %d images" % (number_end))
                print("Parsing the name and URL of every image in this album...")
                img_lists = []
                for i in range(1, number_end + 1):
                    page_url = url + "/" + str(i)
                    page2 = requests.get(url=page_url, headers=headers)
                    while page2.status_code != 200:
                        time.sleep(random.uniform(1.5, 2.5))  # random back-off, then retry
                        page2 = requests.get(url=page_url, headers=headers)
                    tree = html.fromstring(page2.text)
                    img_url = tree.xpath('//div[@class="main-image"]//img/@src')
                    img_name = tree.xpath('//div[@class="content"]//h2[@class="main-title"]/text()')
                    img_list = {'img_name': img_name[0], 'img_url': img_url[0]}
                    img_lists.append(img_list)
                small_message = {'big_name': img_lists[0]['img_name'], 'big_url': url, 'img_lists': img_lists}
                big_message.put(small_message)
            except Exception as e:
                print(e)

class DownloadImg(threading.Thread):
    # Worker thread: pull parsed album metadata from big_message and
    # download every image into a per-album directory.
    def run(self):
        global big_message, path
        while True:
            try:
                # non-blocking get avoids the empty-check/get race between threads
                small_message = big_message.get_nowait()
            except queue.Empty:
                break
            all_path = path + small_message['big_name']
            print("Creating album directory: %s" % (all_path))
            is_exists = os.path.exists(all_path)
            if not is_exists:
                os.makedirs(all_path)
                print("Directory %s created!" % (all_path))
            else:
                print("Directory %s already exists!" % (all_path))
            for i in range(len(small_message['img_lists'])):
                try:
                    # also send a Referer: many image hosts reject hotlinked
                    # requests that arrive without one
                    headers = {
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
                        "Referer": small_message['big_url']
                    }
                    # headers = GetHeaders()
                    # headers["Referer"] = small_message['big_url'] + "/" + str(i)
                    # img = requests.get(url=small_message['img_lists'][i]['img_url'], headers=headers,proxies=GetProxy())
                    img = requests.get(url=small_message['img_lists'][i]['img_url'], headers=headers)
                    while img.status_code != 200:
                        print(img.status_code)
                        time.sleep(random.uniform(1.5, 2.5))  # random back-off, then retry
                        img = requests.get(url=small_message['img_lists'][i]['img_url'], headers=headers)
                        # img = requests.get(url=small_message['img_lists'][i]['img_url'], headers=headers, proxies=GetProxy())
                    with open(all_path + '/' + small_message['img_lists'][i]['img_name'] + '.jpg', "wb") as f:
                        f.write(img.content)  # the with block closes the file automatically
                except Exception as e:
                    print(e)
            print("套圖 %s 下載完畢!" % (small_message['big_name']))

if __name__ == '__main__':
    start = time.perf_counter()  # time.clock() was removed in Python 3.8

    GetBigUrlList()

    # Stage 1: 10 parser threads drain big_url_list and fill big_message
    for i in range(10):
        t = GetAllMessage()
        t.start()
        threads.append(t)

    for t in threads:
        t.join()

    threads.clear()

    # Stage 2: 5 downloader threads drain big_message and download the images
    for i in range(5):
        t = DownloadImg()
        t.start()
        threads.append(t)

    for t in threads:
        t.join()
    end = time.perf_counter()
    print(end - start)

This code also includes an IP-proxy pool and random request headers, but the proxies are left disabled because free proxy IPs are all too slow; if you want to enable them, just add one extra argument when sending the request, as shown below.
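For reference, enabling them is one extra keyword argument on the request, reusing the GetProxy() and GetHeaders() helpers above (the image URL here is a placeholder):

img_url = 'https://example.com/some_image.jpg'  # placeholder URL
img = requests.get(url=img_url, headers=GetHeaders(), proxies=GetProxy())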
Five download threads turned out to be roughly the sweet spot for request frequency in my tests; with more threads, the total time actually got worse.
