Scraping Taobao search-page data with Python + Selenium and multiprocessing

1. Functionality

  • Given a list of keywords, search Taobao for each one and scrape the products in the search results, including title, price, sales count, and shop location, storing everything in MongoDB. Multiple processes are used to speed up the crawl. (A sample stored document is sketched below.)
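
For reference, each scraped product ends up as one MongoDB document shaped roughly like the sketch below. The field names come from the parsing code in section 3; all values here are fabricated:

# A fabricated example of one stored document (shape only; values are made up)
exampleProduct = {
    "_id": "1234567890",            # same as dataNid, the product's unique id
    "dataNid": "1234567890",
    "taobaoCategory": "auctions",   # from the data-category attribute
    "rank": "3",                    # position in the list (data-index)
    "imgSrc": "//img.example.com/demo.jpg",
    "title": "demo product title",
    "detailUrl": "//item.example.com/demo",
    "shopName": "demo shop",
    "shopHref": "//shop.example.com/demo",
    "shopID": "987654",
    "location": "上海",
    "dealCount": 375,               # units sold, parsed to int
    "price": 64.32,                 # parsed to float
    "item_type": "productMainInfo",
    "keyWord": "動漫",              # the category key from keySearchWords
    "page": 2,                      # which result page it came from
    "markClawerTools": 1,
    "reverse1": 1,
    "categorySearchWords": "動漫周邊",  # the phrase actually typed into the search box
}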

2. Environment

  • OS: Windows 7
  • MongoDB 3.4.6
  • Python 3.6.1
  • IDE: PyCharm
  • Chrome browser installed (63.0.3239.132, official build, 32-bit)
  • selenium 3.7.0
  • chromedriver v2.34 configured
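
To sanity-check this environment before running, a minimal probe like the one below can help (it assumes chromedriver is on the PATH and MongoDB is listening on 127.0.0.1:27017):

# Quick environment check (illustrative)
import selenium
import pymongo
from selenium import webdriver

print(selenium.__version__)   # expect 3.7.0
print(pymongo.MongoClient("127.0.0.1:27017").server_info()["version"])  # expect 3.4.6
browser = webdriver.Chrome()  # fails here if chromedriver is missing or mismatched
browser.quit()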

3. Code


from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

import pymongo
import time
import datetime

import re
import multiprocessing

import lxml.html
import lxml.etree

# ---------- 1. Configuration ------------
# Search keywords: category name -> [flag, the phrase actually typed into the search box]
keySearchWords = {
    "動漫": [1, "動漫周邊"],
    "水果": [1, "水果沙拉"],
}

# Database initialization
client = pymongo.MongoClient("127.0.0.1:27017")
db = client["taobao"]
db_coll = db["productInfo"]

# Maximum number of retries per page
retryMax = 8

chrome_options = webdriver.ChromeOptions()
# Disable image loading to speed up page fetches
# prefs = {"profile.managed_default_content_settings.images": 2}
# chrome_options.add_experimental_option("prefs", prefs)

# Enable headless mode (no browser window) for extra speed and stability
# chrome_options.add_argument('--headless')
# chrome_options.add_argument('--disable-gpu')


# ---------- 2. Page parsing ------------
# Extract each product's main info from the list page
def getProductMainInfo(htmlSource):
    try:
        resultTree = lxml.etree.HTML(htmlSource)
        # fix_html = lxml.html.tostring(resultTree, pretty_print=True)
        # print(f"htmlSource = {htmlSource}")

        productLst = resultTree.xpath("//div[@class='m-itemlist']//div[contains(@class, 'J_MouserOnverReq')]")
        print(f"productLst = {productLst}")
        productInfoLst = []
        for product in productLst:
            productInfo = {}

            # Unique identifier
            dataNid = product.xpath(".//div[contains(@class,'ctx-box')]//div[contains(@class, 'title')]/a/@data-nid")
            if len(dataNid) > 0:
                productInfo['dataNid'] = dataNid[0]
            else:
                productInfo['dataNid'] = 0
            productInfo['_id'] = productInfo['dataNid']

            taobaoCategory = product.xpath("@data-category")
            if len(taobaoCategory) > 0:
                productInfo['taobaoCategory'] = taobaoCategory[0]
            else:
                productInfo['taobaoCategory'] = 'unknown'

            rank = product.xpath("@data-index")
            if len(rank) > 0:
                productInfo['rank'] = rank[0]
            else:
                productInfo['rank'] = 0

            imgSrc = product.xpath(".//div[@class='pic']/a//img/@src")
            if len(imgSrc) > 0:
                productInfo['imgSrc'] = imgSrc[0]
            else:
                productInfo['imgSrc'] = ''

            title = product.xpath(".//div[contains(@class,'ctx-box')]//div[contains(@class, 'title')]/a/text()")
            productInfo['title'] = ''
            if len(title) > 0:
                for elem in title:
                    productInfo['title'] += elem.strip()

            detailUrl = product.xpath(".//div[contains(@class,'title')]//a/@href")
            if len(detailUrl) > 0:
                productInfo['detailUrl'] = detailUrl[0]
            else:
                productInfo['detailUrl'] = ''

            shopName = product.xpath(".//div[contains(@class,'ctx-box')]//div[@class='shop']/a[contains(@class,'shopname')]/span/text()")
            if len(shopName) > 0:
                productInfo['shopName'] = shopName[-1]
            else:
                productInfo['shopName'] = ''

            shopHref = product.xpath(".//div[contains(@class,'ctx-box')]//div[@class='shop']/a[contains(@class,'shopname')]/@href")
            if len(shopHref) > 0:
                productInfo['shopHref'] = shopHref[0]
            else:
                productInfo['shopHref'] = ''
            print(f"shopHref = {shopHref}")

            shopID = product.xpath(".//div[contains(@class,'ctx-box')]//div[@class='shop']/a[contains(@class,'shopname')]/@data-userid")
            if len(shopID) > 0:
                productInfo['shopID'] = shopID[0]
            else:
                productInfo['shopID'] = ''
            print(f"shopID = {shopID}")

            location = product.xpath(".//div[contains(@class,'clearfix')]//div[@class='location']/text()")
            if len(location) > 0:
                productInfo['location'] = location[0]
            else:
                productInfo['location'] = ''

            dealCountRes = product.xpath(".//div[contains(@class,'ctx-box')]//div[@class='deal-cnt']/text()")
            productInfo['dealCount'] = 0
            if len(dealCountRes) > 0 and dealCountRes[0] != '':
                # Parse the numeric part, e.g. '375人付款' -> 375
                dealCountRe = re.search(r'(\d+)', dealCountRes[0])
                productInfo['dealCount'] = int(dealCountRe.group(1)) if dealCountRe else 0

            # On some layouts the price sits in <span class="sm-offer-priceNum sw-dpl-offer-priceNum" title="¥6.60">
            # productInfo['price'] = product.xpath(".//span[contains(@class,'priceNum')]//text()")
            price = product.xpath(".//div[contains(@class,'price')]//strong/text()")
            if len(price) > 0:
                price = price[0]
            else:
                price = ''
            if price != '':
                # Parse the numeric part, e.g. price = '¥64.32' -> 64.32
                priceRe = re.search(r'([\d\.]+)', price)
                priceFloat = float(priceRe.group(1)) if priceRe else 0.0
                productInfo['price'] = priceFloat
            else:
                productInfo['price'] = 0.0

            print(f"productInfo = {productInfo}")
            productInfoLst.append(productInfo)
        return productInfoLst
    except Exception as e:
        print(f"解析List 商品信息出錯,e = {e}")
        return []
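
# Illustrative usage of getProductMainInfo (commented out so the script's behaviour
# is unchanged). The fragment below is hand-written: its class names mirror the
# XPaths above, but real Taobao markup differs and changes over time.
# sampleHtml = '''
# <div class="m-itemlist">
#   <div class="item J_MouserOnverReq" data-category="auctions" data-index="0">
#     <div class="pic"><a href="#"><img src="//img.example.com/x.jpg"/></a></div>
#     <div class="ctx-box">
#       <div class="price"><strong>64.32</strong></div>
#       <div class="deal-cnt">375人付款</div>
#       <div class="title"><a data-nid="123" href="//item.example.com/x">demo item</a></div>
#       <div class="shop"><a class="shopname" data-userid="42" href="//shop.example.com"><span>demo shop</span></a></div>
#       <div class="clearfix"><div class="location">上海</div></div>
#     </div>
#   </div>
# </div>'''
# getProductMainInfo(sampleHtml)
# # -> [{'dataNid': '123', '_id': '123', 'taobaoCategory': 'auctions', 'rank': '0',
# #      'title': 'demo item', 'dealCount': 375, 'price': 64.32, ...}]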


# Step 1: open the Taobao home page and search for the keyword
def searchKey(browser, wait, keyWord, categorySearchWords, retryCount):
    print(f"searchKey: enter, keyWord = {keyWord}, categorySearchWords = {categorySearchWords}, retryCount = {retryCount}")
    retryCount += 1
    if retryCount > retryMax:
        return (False, 0, keyWord)
    mainUrl = "https://www.taobao.com/"
    print(f"searchKey: 訪問taobao主頁, 進行搜索. mainUrl = {mainUrl}")
    browser.get(mainUrl)

    # Try to search
    try:
        # Wait for the search box to appear
        searchInput = wait.until(
            EC.presence_of_element_located((By.XPATH, "//input[@class='search-combobox-input']"))
        )
    except Exception as e:
        # The search box never appeared, so the page did not load properly; retry
        print(f"searchKey: search box not loaded yet, reloading the home page. retryCount = {retryCount}, url = {mainUrl}, e = {e}")
        return searchKey(browser, wait, keyWord, categorySearchWords, retryCount)
    else:
        try:
            # Re-locate the search box in case the page changed meanwhile and the element went stale
            searchInput = browser.find_element_by_xpath("//input[@class='search-combobox-input']")
            # Type the search phrase
            time.sleep(3)
            searchInput.clear()
            searchInput.send_keys(categorySearchWords)
            # Press the Enter key
            searchInput.send_keys(Keys.RETURN)
            print(f"searchKey: press return key.")
            time.sleep(3)

            # Wait for the search results to appear
            searchRes = wait.until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='m-itemlist']"))
            )
            print(f"searchKey: searchSuccess, searchRes = {searchRes}")
        except Exception as e:
            print(f"searchKey: search results not loaded yet, reloading the home page. retryCount = {retryCount}, url = {mainUrl}, e = {e}")
            return searchKey(browser, wait, keyWord, categorySearchWords, retryCount)
        else:
            # The results page loaded; now look for the total page count
            try:
                # Get the total number of result pages
                print(f"searchKey: search results appeared, looking for the total page count")
                totalPage = 0
                print(f"searchKey: totalPageInit = {totalPage}")
                # The element reads like: 共 100 頁 ("100 pages in total")
                totalRes = wait.until(
                    EC.presence_of_element_located((By.XPATH, "//div[@class='total']"))
                )
                # print(f"totalRes.text = {totalRes.text}")
                totalRe = re.search(r"(\d+)", totalRes.text)
                # print(f"totalRe = {totalRe}")
                totalPage = int(totalRe.group(1))
                print(f"searchKey: totalPage = {totalPage}")
                return (True, totalPage, keyWord)
            except Exception as e:
                print(f"searchKey: only one page of results. e = {e}")
                return (True, 1, keyWord)
            finally:
                # This block runs before the return statements above hand control back.
                # Reference: "Python: the pitfalls of mixing return and finally"
                # http://python.jobbole.com/88408/
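                # A tiny standalone illustration of that interaction (not part of the crawler):
                #   def demo():
                #       try:
                #           return "from try"      # the return value is computed first...
                #       finally:
                #           print("finally runs")  # ...then finally executes before the caller sees it
                #   demo()  # prints "finally runs", then evaluates to "from try"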
                try:
                    print(f"searchKey: saving the first page of results")
                    # Parse the page content:
                    if browser.page_source:
                        productInfoLst = getProductMainInfo(browser.page_source)
                        for product in productInfoLst:
                            product['item_type'] = "productMainInfo"
                            product['keyWord'] = keyWord
                            product['page'] = 1
                            product['markClawerTools'] = 1
                            product['reverse1'] = 1
                            product['categorySearchWords'] = categorySearchWords
                            try:
                                insertRes = db_coll.insert_one(product)
                                print(f"searchKey: insertRes = {insertRes.inserted_id}")
                            except Exception as e:
                                print(f"searchKey: insert_one Exception = {e}")
                except Exception as e:
                    print(f"searchKey: exception while saving the first page of results. Exception = {e}")


# Type a page number and click the submit button to flip to that page
def getNextListPage(browser, wait, pageNum, retryCount):
    print(f"getNextListPage: enter, pageNum = {pageNum}, retryCount = {retryCount}")
    retryCount += 1
    if retryCount > retryMax:
        return (False, [])

    # First, check whether the list page loaded by looking for the page-number input box
    try:
        pageInput = wait.until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='form']/input[@class='input J_Input']"))
        )
    except Exception as e:
        # Refresh the current page and go through the whole check again
        print(f"getNextListPage: page-number input box not loaded yet, refreshing the page and retrying. retryCount = {retryCount}")
        browser.refresh()
        time.sleep(2)
        return getNextListPage(browser, wait, pageNum, retryCount)
    else:
        # Type the page number to flip the page
        try:
            # Re-locate the input box in case the earlier element went stale
            pageInput = browser.find_element_by_xpath("//div[@class='form']/input[@class='input J_Input']")
            pageInput.clear()
            pageInput.send_keys(pageNum)
            submit = browser.find_element_by_xpath("//div[@class='form']/span[@class='btn J_Submit']")
            submit.click()
            # Check that the highlighted page number matches what was typed, i.e. the page flip succeeded
            activePage = wait.until(
                EC.text_to_be_present_in_element((By.XPATH, "//ul[@class='items']/li[@class='item active']/span"), str(pageNum))
            )
            # Check that the product list loaded after the page flip
            allProducts = wait.until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='m-itemlist']"))
            )
            # Parse the page content:
            if browser.page_source:
                productInfoLst = getProductMainInfo(browser.page_source)
                return (True, productInfoLst)
            else:
                # No page source means the page failed to load; refresh and retry
                browser.refresh()
                time.sleep(2)
                return getNextListPage(browser, wait, pageNum, retryCount)
        except Exception as e:
            print(f"getNextListPage: could not parse the products on this page, refreshing. pageNum = {pageNum}, retryCount = {retryCount}, e = {e}")
            browser.refresh()
            time.sleep(2)
            return getNextListPage(browser, wait, pageNum, retryCount)


# Browser worker, the unit of work for each process: the main Taobao crawling flow
def chromeProcessPer(queue, lock, mark):
    clawerStartTime = datetime.datetime.now()
    print(f"chromeProcessPer enter: clawerMark = {mark}, time = {clawerStartTime}")

    # Launch the browser and set up the explicit wait
    browser = webdriver.Chrome(chrome_options = chrome_options)
    browser.set_window_size(900, 900)  # sized to the desktop resolution, mainly to capture captcha screenshots
    wait = WebDriverWait(browser, timeout = 30)

    # Pull search keywords off the queue
    while not queue.empty():
        keyWordInfo = queue.get()

        # Acquire the lock so prints from different processes do not interleave and shared
        # state stays consistent: if several processes hit print at the same moment,
        # the output gets scrambled
        lock.acquire()
        # e.g. keyWordInProcess = ('動漫', [1, '動漫周邊']), markProcess = 1
        print(f"keyWordInProcess = {keyWordInfo}, markProcess = {mark}")
        lock.release()
        time.sleep(1)

        # Search with this category's search phrase
        searchRes = searchKey(browser, wait, keyWordInfo[0], keyWordInfo[1][1], 0)
        print(f"main: searchRes = {searchRes}, clawerMark = {mark}")
        if (searchRes is not None) and searchRes[0]:
            # Start saving from page 2: page 1 was already stored inside searchKey. Searches
            # with a single page also lack the page-number input box, so they cannot go
            # through the page-flipping flow below.
            if searchRes[1] > 1:
                for page in range(2, searchRes[1] + 1):
                    # time.sleep(5)      # adjust the delay to match Taobao's anti-scraping behaviour
                    listPageRes = getNextListPage(browser, wait, page, 0)
                    print(f"chromeProcesser: keyWord = {keyWordInfo}, page = {page}, listPageRes = {listPageRes},  mark = {mark}")
                    if (listPageRes is not None) and listPageRes[0]:
                        for product in listPageRes[1]:
                            product['item_type'] = "productMainInfo"
                            product['keyWord'] = keyWordInfo[0]
                            product['page'] = page
                            product['markClawerTools'] = 1
                            product['reverse1'] = 1
                            product['categorySearchWords'] = keyWordInfo[1][1]
                            try:
                                insertRes = db_coll.insert_one(product)
                                print(f"insertRes = {insertRes.inserted_id}")
                            except Exception as e:
                                print(f"insert_one Exception = {e}")
            else:
                print(f"main: keyWord = {keyWordInfo}, totalPage = {searchRes[1]}, clawerMark = {mark}")
    clawerEndTime = datetime.datetime.now()
    # Always quit the browser
    browser.quit()
    print(f"chromeProcessPer end: clawerMark = {mark}, time = {clawerEndTime}, timeUsed = {clawerEndTime - clawerStartTime}")


if __name__ == "__main__":
    mainStartTime = datetime.datetime.now()
    print(f"main: taobao mainStartTime = {mainStartTime}")

    lock = multiprocessing.Lock()       # process-shared lock
    queue = multiprocessing.Queue(300)  # queue of all the initial keywords; capacity 300

    for keyWord, keyWordValue in keySearchWords.items():
        print(f"keyWord = {keyWord}, keyWordValue = {keyWordValue}")
        # If the queue is too small, put() blocks until space frees up:
        # def put(self, obj, block=True, timeout=None):
        queue.put((keyWord, keyWordValue))
    print(f"queueBefore = {queue}")

    getKeyProcessLst = []
    # Spawn two worker processes and start them
    for i in range(2):
        # args passed to Process must be picklable; multiprocessing's own Queue and Lock are designed to be shared with child processes
        process = multiprocessing.Process(target = chromeProcessPer, args = (queue, lock, i))
        process.start()
        getKeyProcessLst.append(process)

    # Wait for all children to finish.
    # join blocks until the process terminates; without joining each process, the parent
    # would not block after starting the children and would run the final prints immediately
    for p in getKeyProcessLst:
        p.join()

    print(f"queueAfter = {queue}")
    queue.close()
    print(f"all queue used.")
    mainEndTime = datetime.datetime.now()
    print(f"### timeUsedTotal = {mainEndTime - mainStartTime}, mainStartTime = {mainStartTime}, mainEndTime = {mainEndTime}")


4. Results


  • Notes:
  • More processes means more load on the machine; find a count that suits your hardware, and pay attention to every comment in the code.
  • At the time of writing, Taobao's anti-scraping measures on this page are weak: no login required, no rate limiting, no captchas or pop-up ads. If new countermeasures appear, add the corresponding mechanisms from the earlier related posts; a randomized-delay sketch follows this list.
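
If throttling does appear, one low-effort mitigation is a randomized delay between page flips, so requests do not arrive at a fixed rhythm. A sketch (the bounds are arbitrary; tune them against the site's behaviour):

import random
import time

def politeSleep(minSec = 2, maxSec = 6):
    # Sleep a random interval between page flips
    time.sleep(random.uniform(minSec, maxSec))

# Call politeSleep() right before each getNextListPage(...) call in
# chromeProcessPer, in place of the commented-out time.sleep(5).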