Final Project - Testing Multithreading, Multiprocessing, Coroutine Code, and Redis-Based Distributed Crawling

Before learning the Scrapy framework, it helps to understand how these different ways of speeding up code relate to one another.

Each of the three approaches uses 5 workers (threads, processes, or coroutines) to request 50 web pages, and the speeds are compared.

First, multithreading:

This uses a queue plus multiple threads, which is also a common foundation for distributed crawlers.

This multithreading example uses the threading module; there are also other modules such as _thread.
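All of the examples below read their URL list from a file named alexa.txt. The file itself is not shown here; judging from the parsing code (split on a tab, take the second field), each line is assumed to hold a rank and a URL separated by a tab, roughly like this (the URLs are only placeholders):

1	http://www.google.com
2	http://www.youtube.com
3	http://www.facebook.com
...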

import time
import requests
import threading
import queue as Qe

threads = []
link_list = []
threadList = ['Thread-1', 'Thread-2', 'Thread-3', 'Thread-4', 'Thread-5']

# Read the tab-separated URL list; the with statement closes the file automatically
with open('alexa.txt', 'r') as file:
    file_list = file.readlines()
    for eachone in file_list:
        link = eachone.split('\t')[1].replace('\n', '')
        link_list.append(link)

start = time.time()

class MyThread(threading.Thread):
    def __init__(self, name, q):
        threading.Thread.__init__(self)
        self.name = name
        self.q = q

    def run(self):
        print('Starting ' + self.name)
        while True:
            try:
                crawler(self.name, self.q)
            except Qe.Empty:
                # the queue stayed empty for 2 seconds, so this thread is done
                break
        print('Exiting ' + self.name)

def crawler(threadName, q):
    # Take one URL from the queue; raises queue.Empty after 2 seconds if none is left
    url = q.get(timeout=2)
    try:
        r = requests.get(url, timeout=2)
        print(q.qsize(), threadName, r.status_code, url)
    except Exception as e:
        print(q.qsize(), threadName, url, 'Error:', e)

workQueue = Qe.Queue(50)

# Create the worker threads
for tName in threadList:
    thread = MyThread(tName, workQueue)
    thread.start()
    threads.append(thread)

# Fill the queue (the worker threads above wait up to 2 seconds on an empty queue)
for url in link_list:
    workQueue.put(url)

# Wait for all threads to finish
for t in threads:
    t.join()

end = time.time()
print('Total time:', end - start)
print('Exiting Main Thread')

Multiprocessing:

This uses non-blocking calls with the Pool API; there is also the Process class and others you can explore on your own (a minimal Process-based sketch is included after the Pool example below).

import time
import requests
from multiprocessing import Pool, Manager

link_list = []

with open('alexa.txt', 'r') as file:
    file_list = file.readlines()
    for eachone in file_list:
        link = eachone.split('\t')[1].replace('\n', '')
        link_list.append(link)

start = time.time()

def crawler(q, index):
    # Each worker process keeps pulling URLs until the shared queue is empty
    Process_id = 'Process-' + str(index)
    while not q.empty():
        url = q.get(timeout=2)
        try:
            r = requests.get(url, timeout=2)
            print(Process_id, q.qsize(), r.status_code, url)
        except Exception as e:
            print(Process_id, q.qsize(), url, 'Error:', e)


if __name__ == '__main__':
    manager = Manager()
    workQueue = manager.Queue(50)

    # Fill the shared queue
    for url in link_list:
        workQueue.put(url)

    po = Pool(processes=5)  # a pool of 5 worker processes
    for i in range(5):
        po.apply_async(crawler, args=(workQueue, i))

    print('Started processes')
    po.close()
    po.join()

    end = time.time()
    print('Total time:', end - start)
    print('Main process Ended')
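For reference, here is a minimal sketch of the same crawl written with the Process class mentioned above instead of Pool. It is not part of the original test; the alexa.txt format and the choice of 5 processes are simply carried over from the examples above.

import time
import requests
from queue import Empty
from multiprocessing import Process, Queue

def crawler(q, index):
    process_id = 'Process-' + str(index)
    while True:
        try:
            url = q.get(timeout=2)   # give up once the queue stays empty for 2 seconds
        except Empty:
            break
        try:
            r = requests.get(url, timeout=2)
            print(process_id, r.status_code, url)
        except Exception as e:
            print(process_id, url, 'Error:', e)

if __name__ == '__main__':
    link_list = []
    with open('alexa.txt', 'r') as file:
        for line in file:
            link_list.append(line.split('\t')[1].replace('\n', ''))

    workQueue = Queue(50)            # multiprocessing.Queue can be passed directly to Process
    for url in link_list:
        workQueue.put(url)

    start = time.time()
    processes = []
    for i in range(5):
        p = Process(target=crawler, args=(workQueue, i))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    print('Total time:', time.time() - start)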

 

Coroutines:

This coroutine example uses the widely used gevent library.

import time
import gevent
import requests
from gevent.queue import Queue, Empty

from gevent import monkey  # mark the potentially blocking I/O operations used below
monkey.patch_all()  # patch the standard library so blocking I/O runs cooperatively (asynchronously)

jobs = []
link_list = []

with open('alexa.txt', 'r') as file:
    file_list = file.readlines()
    for eachone in file_list:
        link = eachone.split('\t')[1].replace('\n', '')
        link_list.append(link)

start = time.time()

def crawler(index):
    # Each coroutine keeps pulling URLs until the shared queue is empty
    Process_id = 'Process-' + str(index)
    while not workQueue.empty():
        url = workQueue.get(timeout=2)
        try:
            r = requests.get(url, timeout=2)
            print(Process_id, workQueue.qsize(), r.status_code, url)
        except Exception as e:
            print(Process_id, workQueue.qsize(), url, 'Error:', e)

def boss():
    # Fill the queue with all of the URLs
    for url in link_list:
        workQueue.put_nowait(url)


if __name__ == '__main__':
    workQueue = Queue(50)

    gevent.spawn(boss).join()
    for i in range(5):
        jobs.append(gevent.spawn(crawler, i))
    gevent.joinall(jobs)

    end = time.time()
    print('Total time:', end - start)
    print('Main Ended')

Running the code above gives the following timing comparison:

Multithreading: 11.943 s

Multiprocessing: 9.652 s

Coroutines: 5.673 s

Coroutines clearly come out on top. Even better, the three techniques for speeding up code can be combined with one another to make a crawler faster still; a small sketch of one such combination follows below. (These results reflect only this particular test by the author; a more rigorous benchmark would be needed to draw general conclusions.)
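As one illustration of combining the techniques (a sketch only, not part of the benchmark above), the 50 URLs could be split across 5 processes, each of which runs 5 threads; the file name alexa.txt and the 5 x 5 split are assumptions carried over from the tests above.

import time
import requests
from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor

def fetch(args):
    index, url = args
    try:
        r = requests.get(url, timeout=2)
        print('Process-' + str(index), r.status_code, url)
    except Exception as e:
        print('Process-' + str(index), url, 'Error:', e)

def process_worker(args):
    index, urls = args
    # Inside each process, 5 threads work through that process's share of the URLs
    with ThreadPoolExecutor(max_workers=5) as executor:
        list(executor.map(fetch, [(index, u) for u in urls]))

if __name__ == '__main__':
    link_list = []
    with open('alexa.txt', 'r') as file:
        for line in file:
            link_list.append(line.split('\t')[1].replace('\n', ''))

    start = time.time()
    chunks = [(i, link_list[i::5]) for i in range(5)]   # one slice of URLs per process
    with Pool(processes=5) as po:
        po.map(process_worker, chunks)
    print('Total time:', time.time() - start)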

Next, the code for a distributed crawler that uses the Redis in-memory database to fetch images from the 50 web pages.

 

1. master (central coordinator)

import re
import time
import requests
from redis import Redis
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/52.0.2743.116 Safari/537.36'}

def push_redis_list():
    r = Redis(host='127.0.0.1', port=6379)

    link_list = []
    with open('alexa.txt', 'r') as file:
        file_list = file.readlines()
        for eachone in file_list:
            link = eachone.split('\t')[1].replace('\n', '')
            link_list.append(link)

    for url in link_list:
        try:
            response = requests.get(url, headers=headers, timeout=2)
            soup = BeautifulSoup(response.text, 'html.parser')
            img_list = soup.find_all('img')
            for img in img_list:
                img_url = img.get('src', '')  # some <img> tags have no src attribute
                if img_url != '':
                    print('Pushed image url:', img_url)
                    r.lpush('img_url', img_url)
        except Exception as e:
            print(url, 'Error:', e)
    print('Number of image links in the list:', r.llen('img_url'))

def get_img():
    r = Redis(host='127.0.0.1', port=6379)
    while True:
        try:
            url = r.lpop('img_url')      # returns None once the list is empty
            url = url.decode('utf-8')    # None.decode raises, handled by the outer except
            if url[:2] == '//':          # protocol-relative URL
                url = 'http:' + url
            try:
                response = requests.get(url, headers=headers, timeout=2)
                name = int(time.time())
                f = open('E:\\截圖庫\\' + str(name) + url[-4:], 'wb')
                f.write(response.content)
                f.close()
                print('Fetched image', url)
            except Exception as e:
                print('Problem while fetching the image:', e)
            time.sleep(3)
        except Exception as e:
            print('Error:', e)
            time.sleep(10)
            break

if __name__ == '__main__':
    this_machine = 'master'  # this copy runs on the master machine
    print('Starting distributed crawler')
    if this_machine == 'master':
        push_redis_list()
    else:
        get_img()

2. slave (crawler worker)

import re
import time
import requests
from redis import Redis
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/52.0.2743.116 Safari/537.36'}

def push_redis_list():
    r = Redis(host='127.0.0.1', port=6379)

    link_list = []
    with open('alexa.txt', 'r') as file:
        file_list = file.readlines()
        for eachone in file_list:
            link = eachone.split('\t')[1].replace('\n', '')
            link_list.append(link)

    for url in link_list:
        try:
            response = requests.get(url, headers=headers, timeout=2)
            soup = BeautifulSoup(response.text, 'html.parser')
            img_list = soup.find_all('img')
            for img in img_list:
                img_url = img.get('src', '')  # some <img> tags have no src attribute
                if img_url != '':
                    print('Pushed image url:', img_url)
                    r.lpush('img_url', img_url)
        except Exception as e:
            print(url, 'Error:', e)
    print('Number of image links in the list:', r.llen('img_url'))

def get_img():
    r = Redis(host='127.0.0.1', port=6379)
    while True:
        try:
            url = r.lpop('img_url')      # returns None once the list is empty
            url = url.decode('utf-8')    # None.decode raises, handled by the outer except
            if url[:2] == '//':          # protocol-relative URL
                url = 'http:' + url
            try:
                response = requests.get(url, headers=headers, timeout=2)
                name = int(time.time())
                f = open('E:\\截圖庫\\' + str(name) + url[-4:], 'wb')
                f.write(response.content)
                f.close()
                print('Fetched image', url)
            except Exception as e:
                print('Problem while fetching the image:', e)
            time.sleep(3)
        except Exception as e:
            print('Error:', e)
            time.sleep(10)
            break

if __name__ == '__main__':
    this_machine = 'slave'  # this copy runs on a slave machine
    print('Starting distributed crawler')
    if this_machine == 'master':
        push_redis_list()
    else:
        get_img()

There is only one master, but there can be many slaves. They run independently without interfering with one another, which greatly speeds up the crawler.
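In a real deployment each slave would point its Redis client at the master's address instead of 127.0.0.1. Below is a rough sketch of what the slave's connection could look like (the host is just a placeholder), using a blocking pop so the slave does not need the poll-and-sleep loop:

from redis import Redis

# Placeholder address of the machine running redis-server (adjust to your network)
r = Redis(host='192.168.0.100', port=6379)

# brpop blocks until an item arrives or the timeout expires, so no polling is needed
item = r.brpop('img_url', timeout=10)
if item is not None:
    _key, raw_url = item               # brpop returns a (list_name, value) pair
    url = raw_url.decode('utf-8')
    if url.startswith('//'):           # protocol-relative URL, as in get_img() above
        url = 'http:' + url
    print('Got image url:', url)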
