A small Python multithreaded crawler demo


Thread classes and instances:

Thread class          Instance thread list    Purpose                            Inherits from
ThreadCrawlCom_URL    threads_crawlC = []     fetch product-URL listing pages    threading.Thread
ThreadParseComUrl     threads_parseC = []     parse product URLs                 threading.Thread
ThreadCrawl           threads_crawl = []      fetch product detail pages         threading.Thread
ThreadParse           threads_parse = []      parse product details              threading.Thread
ThreadStore           threads_Store = []      store the parsed data              threading.Thread
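
All five classes share the same producer/consumer pattern: each thread pulls from an input queue without blocking, does one unit of work, pushes the result to an output queue, and loops until a global exit flag is set. A minimal sketch of that pattern (Worker, in_q, out_q and EXIT_FLAG are illustrative names, not identifiers from the demo):

import threading
from queue import Queue, Empty

EXIT_FLAG = False  # main sets this to True once the input queue has drained

class Worker(threading.Thread):
    def __init__(self, in_q, out_q):
        super().__init__()
        self.in_q = in_q
        self.out_q = out_q

    def run(self):
        while not EXIT_FLAG:
            try:
                task = self.in_q.get(False)  # non-blocking get; raises Empty
            except Empty:
                continue                     # nothing queued yet, keep looping
            self.out_q.put(task.upper())     # stand-in for the real work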

Code:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File  : Spider2.py
# @Author: zjian
# @Date  : 18-8-27
# @Contact  :[email protected]
# @Software : PyCharm

from queue import Queue
import threading
from lxml import etree
import requests
import time
import pymysql
from Fst_MT_Spider.models.Settings import init_url

class ThreadCrawlCom_URL(threading.Thread):

    def __init__(self, threadName, PageurlQueue, ComUrlQueue):
        super(ThreadCrawlCom_URL, self).__init__()
        self.threadName = threadName
        self.PageurlQueue = PageurlQueue
        self.ComUrlQueue = ComUrlQueue
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'}

    def run(self):
        print('[INFO] Starting %s' % self.threadName)
        while not PC_EXIT_FLAG:
            try:
                url = str(self.PageurlQueue.get(False))
                content = requests.get(url=url, headers=self.headers).text
                time.sleep(1)
                self.ComUrlQueue.put(content)  # enqueue the fetched listing-page HTML
            except:
                pass
        print('[INFO] %s thread finished' % self.threadName)

class ThreadCrawl(threading.Thread):
    def __init__(self, threadName, dataQueue, urlQueue):
        super(ThreadCrawl, self).__init__()
        # thread name
        self.threadName = threadName
        # data queue: fetched detail-page HTML waiting to be parsed
        self.dataQueue = dataQueue
        # URL queue: product detail URLs waiting to be fetched
        self.urlQueue = urlQueue
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'}

    def run(self):
        print('[INFO] Starting %s' % self.threadName)
        while not C_EXIT_FLAG:
            try:
                url = str(self.urlQueue.get(False))
                content = requests.get(url=url, headers=self.headers).text
                time.sleep(1)
                self.dataQueue.put(content)
            except:
                pass
        print('[INFO] %s thread finished' % self.threadName)

# Parses the product detail pages
class ThreadParse(threading.Thread):

    def __init__(self, threadName, dataQueue, resQueue):
        super(ThreadParse, self).__init__()
        self.threadName = threadName
        self.dataQueue = dataQueue
        self.resQueue = resQueue

    def run(self):
        print('[INFO] Starting %s thread' % self.threadName)
        while not P_EXIT_FLAG:
            try:
                response = self.dataQueue.get(False)
                self.parse(response)
            except:
                pass
        print('[INFO] %s thread finished' % self.threadName)

    def parse(self, response):
        # parse the raw HTML text into a DOM tree
        response = etree.HTML(response)
        title = ''.join(response.xpath('//*[@id="goods_detail_1"]/h1/text()')).strip()
        try:
            bd = response.xpath('//*[@id="goods_detail_1"]/h1/em/a/span/text()')[0]
        except:
            bd = ''
        goods_nums = response.xpath('//*[@id="goods_detail_1"]/ul/li[1]/span[1]/text()')[0]  # product item number
        o_price = response.xpath('//*[@id="goods_detail_1"]/ul/li[1]/del/text()')[0]
        n_price = response.xpath('//*[@id="goods_detail_1"]/ul/li[2]/em/text()')[0]
        sale_count = response.xpath('//*[@id="goods_detail_1"]/ul/li[3]/span/text()')[0]
        comment_nums = response.xpath('//*[@id="goods_detail_1"]/ul/li[4]/span[2]/a/text()')[0]
        score = response.xpath('//*[@id="goods_detail_1"]/ul/li[4]/span[1]/em/@title')[0]
        brand = response.xpath('//*[@id="J_attrBrandName"]/text()')[0]
        c_size = response.xpath('//*[@id="J_AttrUL"]/li[3]/text()')[0].replace('\n', '').replace(' ', '').replace('\xa0', ',')
        color = response.xpath('//*[@id="J_AttrUL"]/li[4]/text()')[0].replace('\n', '').replace(' ', '').replace('\xa0', ',')

        item = {'title': title,
                'bd': bd,
                'goods_nums': goods_nums,
                'o_price': o_price,
                'n_price': n_price,
                'sale_count': sale_count,
                'comment_nums': comment_nums,
                'score': score,
                'brand': brand,
                'c_size': c_size,
                'color': color
                }
        self.resQueue.put(item)

# A class that parses product URLs out of the fetched listing pages
class ThreadParseComUrl(threading.Thread):

    def __init__(self,threadName,urlQueue,ComUrlQueue):
        super(ThreadParseComUrl, self).__init__()
        self.threadName = threadName
        self.urlQueue = urlQueue
        self.ComUrlQueue = ComUrlQueue

    def run(self):
        print('[INFO] Starting %s thread' % self.threadName)
        while not PP_EXIT_FLAG:
            try:
                response = self.ComUrlQueue.get(False)
                self.parse(response)
            except:
                pass
        print('[INFO] %s thread finished' % self.threadName)

    def parse(self, response):
        response = etree.HTML(response)
        # extraction logic: pull the product link out of each <div> slot on the listing page
        for i in range(1, 41):
            try:
                url = 'http://www.handu.com/' + response.xpath('//*[@id="cate_right"]/div[3]/div[1]/div[{0}]/div/div[1]/a/@href'.format(i))[0]
                self.urlQueue.put(url)  # enqueue the extracted product URL
            except:
                print('[INFO] <div> node %s holds no product URL' % i)
                continue

class ThreadStore(threading.Thread):
    def __init__(self, threadName, resQueue, lock):
        super(ThreadStore, self).__init__()
        self.threadName = threadName
        self.resQueue = resQueue
        self.lock = lock
        self.conn = pymysql.connect(host='localhost', user='root', password='123456',
                                    database='databases_name', charset='utf8')
        self.cursor = self.conn.cursor()
        self.conn.ping(True)  # reconnect automatically if the connection has dropped

    def run(self):
        print('[INFO] Starting storage thread %s' % self.threadName)
        while not S_EXIT_FLAG:
            try:
                item = self.resQueue.get(False)
                self.store(item)
            except:
                pass
        print('[INFO] Storage thread %s finished' % self.threadName)

    def store(self, item):
        insert_sql = '''insert into Hstyle(title,bd,goods_nums,o_price,n_price,sale_count,score,comment_nums,brand,c_size,color)
                        values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
        params = (item['title'], item['bd'], item['goods_nums'], item['o_price'], item['n_price'], item['sale_count'],
                  item['score'], item['comment_nums'], item['brand'], item['c_size'], item['color'])
        with self.lock:
            try:
                self.cursor.execute(insert_sql, params)
                self.conn.commit()
            except Exception as e:
                print(e)

    def __del__(self):
        # close the cursor and connection when the thread object is reclaimed
        with self.lock:
            self.cursor.close()
            self.conn.close()

# Exit flags, one per thread group; main() sets each to True once the matching queue drains
C_EXIT_FLAG = False
P_EXIT_FLAG = False
S_EXIT_FLAG = False
PC_EXIT_FLAG = False
PP_EXIT_FLAG = False

def main():

    pageQueue = Queue()    # listing-page URLs (the product display / start pages)
    ComUrlQueue = Queue()  # fetched listing-page HTML, not yet parsed for product URLs
    urlQueue = Queue()     # extracted product URLs (already parsed)
    dataQueue = Queue()    # fetched product-detail HTML, not yet parsed
    resQueue = Queue()     # parsed result items
    lock = threading.Lock()  # lock shared by the storage threads

    pageurl_list = init_url()  # build the list of starting listing-page URLs
    for pageurl in pageurl_list:
        pageQueue.put(pageurl)

    # 1. threads that fetch the listing pages containing product URLs
    threads_crawlC = []
    for i in range(len(pageurl_list)):
        threadName = 'Crawl_URL-thds-' + str(i)
        thread_CC = ThreadCrawlCom_URL(threadName, pageQueue, ComUrlQueue)
        threads_crawlC.append(thread_CC)
    # 1.1 start the listing-page fetch threads
    for thread_CC in threads_crawlC:
        thread_CC.start()

    # 2. threads that parse product URLs out of the listing pages
    threads_parseC = []
    for i in range(100):
        threadName = 'Parse_CC-thds-' + str(i)
        thread_PC = ThreadParseComUrl(threadName, urlQueue, ComUrlQueue)
        threads_parseC.append(thread_PC)
    # 2.1 start the URL-parsing threads
    for thread_PC in threads_parseC:
        thread_PC.start()

    # 3. threads that fetch the product detail pages
    threads_crawl = []
    for i in range(100):  # testing showed that, within limits, more threads run faster; network speed also matters
        '''The total number of product URLs is unknown here, and urlQueue keeps growing
        while the previous stage is still crawling, so the count is fixed at 100 for now.'''
        threadName = 'Crawl-thds-' + str(i)
        thread_C = ThreadCrawl(threadName, dataQueue, urlQueue)
        threads_crawl.append(thread_C)
    # 3.1 start the detail-page fetch threads
    for thread_C in threads_crawl:
        thread_C.start()

    # 4. threads that parse the product detail pages
    threads_parse = []
    for i in range(100):
        threadName = 'Parse-thds-' + str(i)
        thread_P = ThreadParse(threadName, dataQueue, resQueue)
        threads_parse.append(thread_P)
    # 4.1 start the detail-parsing threads
    for thread_P in threads_parse:
        thread_P.start()

    # 5. storage threads
    threads_Store = []
    for i in range(100):
        threadName = 'Store-thds-' + str(i)
        thread_S = ThreadStore(threadName, resQueue, lock)
        threads_Store.append(thread_S)
    # 5.1 start the storage threads
    for thread_S in threads_Store:
        thread_S.start()

    '''Step numbering restarts here: shut each stage down in pipeline order.'''
    # 1. wait for the listing-page URL queue to drain, then stop the listing-page fetch threads
    while not pageQueue.empty():
        pass
    global PC_EXIT_FLAG
    PC_EXIT_FLAG = True
    for thread_CC in threads_crawlC:
        thread_CC.join()

    # 1.1 wait for the fetched listing-page queue to drain, then stop the URL-parsing threads
    while not ComUrlQueue.empty():
        pass
    global PP_EXIT_FLAG
    PP_EXIT_FLAG = True
    for thread_PC in threads_parseC:
        thread_PC.join()

    # 2. wait for the product-URL queue to drain, then stop the detail-page fetch threads
    while not urlQueue.empty():
        pass
    global C_EXIT_FLAG
    C_EXIT_FLAG = True
    for thread_C in threads_crawl:
        thread_C.join()

    # 2.1 wait for the fetched detail-page queue to drain, then stop the detail-parsing threads
    while not dataQueue.empty():
        pass
    global P_EXIT_FLAG
    P_EXIT_FLAG = True
    for thread_P in threads_parse:
        thread_P.join()

    # wait for the parsed-result queue to drain, then stop the database storage threads
    while not resQueue.empty():
        pass
    global S_EXIT_FLAG
    S_EXIT_FLAG = True
    for thread_S in threads_Store:
        thread_S.join()

if __name__ == '__main__':
    start = time.time()
    main()
    print('[INFO] Elapsed %s seconds' % (time.time() - start))

C_EXIT_FLAG = False
P_EXIT_FLAG = False
S_EXIT_FLAG = False
PC_EXIT_FLAG = False
PP_EXIT_FLAG = False

These five global variables are the exit conditions for the five thread groups: main() sets each flag to True once the corresponding queue has drained, and the matching threads then fall out of their while loops and terminate.
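
Note that main() waits for each queue with a busy loop (while not queue.empty(): pass), which keeps a CPU core spinning until the flag flips. A common alternative, not used in this demo, is to push one sentinel value per worker and let the workers block on get(); a minimal sketch of that pattern, with all names illustrative:

import threading
from queue import Queue

SENTINEL = None  # marker telling a worker to stop

def worker(q):
    while True:
        task = q.get()  # blocking get: no busy-wait, no global exit flag
        if task is SENTINEL:
            break
        print('processed', task)  # stand-in for the real work

q = Queue()
workers = [threading.Thread(target=worker, args=(q,)) for _ in range(4)]
for w in workers:
    w.start()
for task in range(10):
    q.put(task)
for _ in workers:
    q.put(SENTINEL)  # one sentinel per worker so every thread wakes up and exits
for w in workers:
    w.join()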

Program and data-flow diagram: (figure not reproduced here)

Speed test: (table of data volume vs. total elapsed time not reproduced here)

For more crawler code, see my GitHub: I have code and notes from learning web scraping and will organize and upload them when I have time; if any of it helps you, please leave a star. (One further suggestion: be gentle with the sites you test against. The premise of learning is not placing a heavy load on the other side's servers; keep that in mind.)
