Scraping Taobao search-page data with Python + Selenium and multiprocessing

1. Functionality

  • Given a list of keywords, search Taobao for each one and scrape the products in the search results, including title, price, sales count, and shop location, storing everything in MongoDB. Multiple processes are used to speed up the crawl. (A sample stored document is sketched below.)
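
For reference, each scraped product ends up as one MongoDB document shaped roughly like the sketch below. The field names come from the parsing code in section 3; all values here are fabricated:

# A fabricated example of one stored document (shape only; values are made up)
exampleProduct = {
    "_id": "1234567890",            # same as dataNid, the product's unique id
    "dataNid": "1234567890",
    "taobaoCategory": "auctions",   # from the data-category attribute
    "rank": "3",                    # position in the list (data-index)
    "imgSrc": "//img.example.com/demo.jpg",
    "title": "demo product title",
    "detailUrl": "//item.example.com/demo",
    "shopName": "demo shop",
    "shopHref": "//shop.example.com/demo",
    "shopID": "987654",
    "location": "上海",
    "dealCount": 375,               # units sold, parsed to int
    "price": 64.32,                 # parsed to float
    "item_type": "productMainInfo",
    "keyWord": "動漫",              # the category key from keySearchWords
    "page": 2,                      # which result page it came from
    "markClawerTools": 1,
    "reverse1": 1,
    "categorySearchWords": "動漫周邊",  # the phrase actually typed into the search box
}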

2. Environment

  • OS: Windows 7
  • MongoDB 3.4.6
  • Python 3.6.1
  • IDE: PyCharm
  • Chrome browser installed (63.0.3239.132, official build, 32-bit)
  • selenium 3.7.0
  • chromedriver v2.34 configured
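
To sanity-check this environment before running, a minimal probe like the one below can help (it assumes chromedriver is on the PATH and MongoDB is listening on 127.0.0.1:27017):

# Quick environment check (illustrative)
import selenium
import pymongo
from selenium import webdriver

print(selenium.__version__)   # expect 3.7.0
print(pymongo.MongoClient("127.0.0.1:27017").server_info()["version"])  # expect 3.4.6
browser = webdriver.Chrome()  # fails here if chromedriver is missing or mismatched
browser.quit()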

3. Code


from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

import pymongo
import time
import datetime

import re
import multiprocessing

import lxml.html
import lxml.etree

# ---------- 1. Configuration ------------
# Search keywords: category name -> [flag, the phrase actually typed into the search box]
keySearchWords = {
    "動漫": [1, "動漫周邊"],
    "水果": [1, "水果沙拉"],
}

# Database initialization
client = pymongo.MongoClient("127.0.0.1:27017")
db = client["taobao"]
db_coll = db["productInfo"]

# Maximum number of retries per page
retryMax = 8

chrome_options = webdriver.ChromeOptions()
# Disable image loading to speed up page fetches
# prefs = {"profile.managed_default_content_settings.images": 2}
# chrome_options.add_experimental_option("prefs", prefs)

# Enable headless mode (no browser window) for extra speed and stability
# chrome_options.add_argument('--headless')
# chrome_options.add_argument('--disable-gpu')


# ---------- 2. Page parsing ------------
# Extract each product's main info from the list page
def getProductMainInfo(htmlSource):
    try:
        resultTree = lxml.etree.HTML(htmlSource)
        # fix_html = lxml.html.tostring(resultTree, pretty_print=True)
        # print(f"htmlSource = {htmlSource}")

        productLst = resultTree.xpath("//div[@class='m-itemlist']//div[contains(@class, 'J_MouserOnverReq')]")
        print(f"productLst = {productLst}")
        productInfoLst = []
        for product in productLst:
            productInfo = {}

            # Unique identifier
            dataNid = product.xpath(".//div[contains(@class,'ctx-box')]//div[contains(@class, 'title')]/a/@data-nid")
            if len(dataNid) > 0:
                productInfo['dataNid'] = dataNid[0]
            else:
                productInfo['dataNid'] = 0
            productInfo['_id'] = productInfo['dataNid']

            taobaoCategory = product.xpath("@data-category")
            if len(taobaoCategory) > 0:
                productInfo['taobaoCategory'] = taobaoCategory[0]
            else:
                productInfo['taobaoCategory'] = 'unknown'

            rank = product.xpath("@data-index")
            if len(rank) > 0:
                productInfo['rank'] = rank[0]
            else:
                productInfo['rank'] = 0

            imgSrc = product.xpath(".//div[@class='pic']/a//img/@src")
            if len(imgSrc) > 0:
                productInfo['imgSrc'] = imgSrc[0]
            else:
                productInfo['imgSrc'] = ''

            title = product.xpath(".//div[contains(@class,'ctx-box')]//div[contains(@class, 'title')]/a/text()")
            productInfo['title'] = ''
            if len(title) > 0:
                for elem in title:
                    productInfo['title'] += elem.strip()

            detailUrl = product.xpath(".//div[contains(@class,'title')]//a/@href")
            if len(detailUrl) > 0:
                productInfo['detailUrl'] = detailUrl[0]
            else:
                productInfo['detailUrl'] = ''

            shopName = product.xpath(".//div[contains(@class,'ctx-box')]//div[@class='shop']/a[contains(@class,'shopname')]/span/text()")
            if len(shopName) > 0:
                productInfo['shopName'] = shopName[-1]
            else:
                productInfo['shopName'] = ''

            shopHref = product.xpath(".//div[contains(@class,'ctx-box')]//div[@class='shop']/a[contains(@class,'shopname')]/@href")
            if len(shopHref) > 0:
                productInfo['shopHref'] = shopHref[0]
            else:
                productInfo['shopHref'] = ''
            print(f"shopHref = {shopHref}")

            shopID = product.xpath(".//div[contains(@class,'ctx-box')]//div[@class='shop']/a[contains(@class,'shopname')]/@data-userid")
            if len(shopID) > 0:
                productInfo['shopID'] = shopID[0]
            else:
                productInfo['shopID'] = ''
            print(f"shopID = {shopID}")

            location = product.xpath(".//div[contains(@class,'clearfix')]//div[@class='location']/text()")
            if len(location) > 0:
                productInfo['location'] = location[0]
            else:
                productInfo['location'] = ''

            dealCountRes = product.xpath(".//div[contains(@class,'ctx-box')]//div[@class='deal-cnt']/text()")
            productInfo['dealCount'] = 0
            if len(dealCountRes) > 0 and dealCountRes[0] != '':
                # Parse the numeric part, e.g. '375人付款' -> 375
                dealCountRe = re.search(r'(\d+)', dealCountRes[0])
                productInfo['dealCount'] = int(dealCountRe.group(1)) if dealCountRe else 0

            # On some layouts the price sits in <span class="sm-offer-priceNum sw-dpl-offer-priceNum" title="¥6.60">
            # productInfo['price'] = product.xpath(".//span[contains(@class,'priceNum')]//text()")
            price = product.xpath(".//div[contains(@class,'price')]//strong/text()")
            if len(price) > 0:
                price = price[0]
            else:
                price = ''
            if price != '':
                # Parse the numeric part, e.g. price = '¥64.32' -> 64.32
                priceRe = re.search(r'([\d\.]+)', price)
                priceFloat = float(priceRe.group(1)) if priceRe else 0.0
                productInfo['price'] = priceFloat
            else:
                productInfo['price'] = 0.0

            print(f"productInfo = {productInfo}")
            productInfoLst.append(productInfo)
        return productInfoLst
    except Exception as e:
        print(f"解析List 商品信息出錯,e = {e}")
        return []
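
# Illustrative usage of getProductMainInfo (commented out so the script's behaviour
# is unchanged). The fragment below is hand-written: its class names mirror the
# XPaths above, but real Taobao markup differs and changes over time.
# sampleHtml = '''
# <div class="m-itemlist">
#   <div class="item J_MouserOnverReq" data-category="auctions" data-index="0">
#     <div class="pic"><a href="#"><img src="//img.example.com/x.jpg"/></a></div>
#     <div class="ctx-box">
#       <div class="price"><strong>64.32</strong></div>
#       <div class="deal-cnt">375人付款</div>
#       <div class="title"><a data-nid="123" href="//item.example.com/x">demo item</a></div>
#       <div class="shop"><a class="shopname" data-userid="42" href="//shop.example.com"><span>demo shop</span></a></div>
#       <div class="clearfix"><div class="location">上海</div></div>
#     </div>
#   </div>
# </div>'''
# getProductMainInfo(sampleHtml)
# # -> [{'dataNid': '123', '_id': '123', 'taobaoCategory': 'auctions', 'rank': '0',
# #      'title': 'demo item', 'dealCount': 375, 'price': 64.32, ...}]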


# Step 1: open the Taobao home page and search for the keyword
def searchKey(browser, wait, keyWord, categorySearchWords, retryCount):
    print(f"searchKey: enter, keyWord = {keyWord}, categorySearchWords = {categorySearchWords}, retryCount = {retryCount}")
    retryCount += 1
    if retryCount > retryMax:
        return (False, 0, keyWord)
    mainUrl = "https://www.taobao.com/"
    print(f"searchKey: 訪問taobao主頁, 進行搜索. mainUrl = {mainUrl}")
    browser.get(mainUrl)

    # Try to search
    try:
        # Wait for the search box to appear
        searchInput = wait.until(
            EC.presence_of_element_located((By.XPATH, "//input[@class='search-combobox-input']"))
        )
    except Exception as e:
        # The search box never appeared, so the page did not load properly; retry
        print(f"searchKey: search box not loaded yet, reloading the home page. retryCount = {retryCount}, url = {mainUrl}, e = {e}")
        return searchKey(browser, wait, keyWord, categorySearchWords, retryCount)
    else:
        try:
            # Re-locate the search box in case the page changed meanwhile and the element went stale
            searchInput = browser.find_element_by_xpath("//input[@class='search-combobox-input']")
            # Type the search phrase
            time.sleep(3)
            searchInput.clear()
            searchInput.send_keys(categorySearchWords)
            # Press the Enter key
            searchInput.send_keys(Keys.RETURN)
            print(f"searchKey: press return key.")
            time.sleep(3)

            # Wait for the search results to appear
            searchRes = wait.until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='m-itemlist']"))
            )
            print(f"searchKey: searchSuccess, searchRes = {searchRes}")
        except Exception as e:
            print(f"searchKey: search results not loaded yet, reloading the home page. retryCount = {retryCount}, url = {mainUrl}, e = {e}")
            return searchKey(browser, wait, keyWord, categorySearchWords, retryCount)
        else:
            # The results page loaded; now look for the total page count
            try:
                # Get the total number of result pages
                print(f"searchKey: search results appeared, looking for the total page count")
                totalPage = 0
                print(f"searchKey: totalPageInit = {totalPage}")
                # The element reads like: 共 100 頁 ("100 pages in total")
                totalRes = wait.until(
                    EC.presence_of_element_located((By.XPATH, "//div[@class='total']"))
                )
                # print(f"totalRes.text = {totalRes.text}")
                totalRe = re.search(r"(\d+)", totalRes.text)
                # print(f"totalRe = {totalRe}")
                totalPage = int(totalRe.group(1))
                print(f"searchKey: totalPage = {totalPage}")
                return (True, totalPage, keyWord)
            except Exception as e:
                print(f"searchKey: only one page of results. e = {e}")
                return (True, 1, keyWord)
            finally:
                # This block runs before the return statements above hand control back.
                # Reference: "Python: the pitfalls of mixing return and finally"
                # http://python.jobbole.com/88408/
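                # A tiny standalone illustration of that interaction (not part of the crawler):
                #   def demo():
                #       try:
                #           return "from try"      # the return value is computed first...
                #       finally:
                #           print("finally runs")  # ...then finally executes before the caller sees it
                #   demo()  # prints "finally runs", then evaluates to "from try"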
                try:
                    print(f"searchKey: saving the first page of results")
                    # Parse the page content:
                    if browser.page_source:
                        productInfoLst = getProductMainInfo(browser.page_source)
                        for product in productInfoLst:
                            product['item_type'] = "productMainInfo"
                            product['keyWord'] = keyWord
                            product['page'] = 1
                            product['markClawerTools'] = 1
                            product['reverse1'] = 1
                            product['categorySearchWords'] = categorySearchWords
                            try:
                                insertRes = db_coll.insert_one(product)
                                print(f"searchKey: insertRes = {insertRes.inserted_id}")
                            except Exception as e:
                                print(f"searchKey: insert_one Exception = {e}")
                except Exception as e:
                    print(f"searchKey: exception while saving the first page of results. Exception = {e}")


# Type a page number and click the submit button to flip to that page
def getNextListPage(browser, wait, pageNum, retryCount):
    print(f"getNextListPage: enter, pageNum = {pageNum}, retryCount = {retryCount}")
    retryCount += 1
    if retryCount > retryMax:
        return (False, [])

    # First, check whether the list page loaded by looking for the page-number input box
    try:
        pageInput = wait.until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='form']/input[@class='input J_Input']"))
        )
    except Exception as e:
        # Refresh the current page and go through the whole check again
        print(f"getNextListPage: page-number input box not loaded yet, refreshing the page and retrying. retryCount = {retryCount}")
        browser.refresh()
        time.sleep(2)
        return getNextListPage(browser, wait, pageNum, retryCount)
    else:
        # Type the page number to flip the page
        try:
            # Re-locate the input box in case the earlier element went stale
            pageInput = browser.find_element_by_xpath("//div[@class='form']/input[@class='input J_Input']")
            pageInput.clear()
            pageInput.send_keys(pageNum)
            submit = browser.find_element_by_xpath("//div[@class='form']/span[@class='btn J_Submit']")
            submit.click()
            # Check that the highlighted page number matches what was typed, i.e. the page flip succeeded
            activePage = wait.until(
                EC.text_to_be_present_in_element((By.XPATH, "//ul[@class='items']/li[@class='item active']/span"), str(pageNum))
            )
            # Check that the product list loaded after the page flip
            allProducts = wait.until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='m-itemlist']"))
            )
            # Parse the page content:
            if browser.page_source:
                productInfoLst = getProductMainInfo(browser.page_source)
                return (True, productInfoLst)
            else:
                # No page source means the page failed to load; refresh and retry
                browser.refresh()
                time.sleep(2)
                return getNextListPage(browser, wait, pageNum, retryCount)
        except Exception as e:
            print(f"getNextListPage: could not parse the products on this page, refreshing. pageNum = {pageNum}, retryCount = {retryCount}, e = {e}")
            browser.refresh()
            time.sleep(2)
            return getNextListPage(browser, wait, pageNum, retryCount)


# Browser worker, the unit of work for each process: the main Taobao crawling flow
def chromeProcessPer(queue, lock, mark):
    clawerStartTime = datetime.datetime.now()
    print(f"chromeProcessPer enter: clawerMark = {mark}, time = {clawerStartTime}")

    # Launch the browser and set up the explicit wait
    browser = webdriver.Chrome(chrome_options = chrome_options)
    browser.set_window_size(900, 900)  # sized to the desktop resolution, mainly to capture captcha screenshots
    wait = WebDriverWait(browser, timeout = 30)

    # Pull search keywords off the queue
    while not queue.empty():
        keyWordInfo = queue.get()

        # Acquire the lock so prints from different processes do not interleave and shared
        # state stays consistent: if several processes hit print at the same moment,
        # the output gets scrambled
        lock.acquire()
        # e.g. keyWordInProcess = ('動漫', [1, '動漫周邊']), markProcess = 1
        print(f"keyWordInProcess = {keyWordInfo}, markProcess = {mark}")
        lock.release()
        time.sleep(1)

        # Search with this category's search phrase
        searchRes = searchKey(browser, wait, keyWordInfo[0], keyWordInfo[1][1], 0)
        print(f"main: searchRes = {searchRes}, clawerMark = {mark}")
        if (searchRes is not None) and searchRes[0]:
            # Start saving from page 2: page 1 was already stored inside searchKey. Searches
            # with a single page also lack the page-number input box, so they cannot go
            # through the page-flipping flow below.
            if searchRes[1] > 1:
                for page in range(2, searchRes[1] + 1):
                    # time.sleep(5)      # adjust the delay to match Taobao's anti-scraping behaviour
                    listPageRes = getNextListPage(browser, wait, page, 0)
                    print(f"chromeProcesser: keyWord = {keyWordInfo}, page = {page}, listPageRes = {listPageRes},  mark = {mark}")
                    if (listPageRes is not None) and listPageRes[0]:
                        for product in listPageRes[1]:
                            product['item_type'] = "productMainInfo"
                            product['keyWord'] = keyWordInfo[0]
                            product['page'] = page
                            product['markClawerTools'] = 1
                            product['reverse1'] = 1
                            product['categorySearchWords'] = keyWordInfo[1][1]
                            try:
                                insertRes = db_coll.insert_one(product)
                                print(f"insertRes = {insertRes.inserted_id}")
                            except Exception as e:
                                print(f"insert_one Exception = {e}")
            else:
                print(f"main: keyWord = {keyWordInfo}, totalPage = {searchRes[1]}, clawerMark = {mark}")
    clawerEndTime = datetime.datetime.now()
    # Always quit the browser
    browser.quit()
    print(f"chromeProcessPer end: clawerMark = {mark}, time = {clawerEndTime}, timeUsed = {clawerEndTime - clawerStartTime}")


if __name__ == "__main__":
    mainStartTime = datetime.datetime.now()
    print(f"main: taobao mainStartTime = {mainStartTime}")

    lock = multiprocessing.Lock()       # process-shared lock
    queue = multiprocessing.Queue(300)  # queue of all the initial keywords; capacity 300

    for keyWord, keyWordValue in keySearchWords.items():
        print(f"keyWord = {keyWord}, keyWordValue = {keyWordValue}")
        # If the queue is too small, put() blocks until space frees up:
        # def put(self, obj, block=True, timeout=None):
        queue.put((keyWord, keyWordValue))
    print(f"queueBefore = {queue}")

    getKeyProcessLst = []
    # Spawn two worker processes and start them
    for i in range(2):
        # args passed to Process must be picklable; multiprocessing's own Queue and Lock are designed to be shared with child processes
        process = multiprocessing.Process(target = chromeProcessPer, args = (queue, lock, i))
        process.start()
        getKeyProcessLst.append(process)

    # Wait for all children to finish.
    # join blocks until the process terminates; without joining each process, the parent
    # would not block after starting the children and would run the final prints immediately
    for p in getKeyProcessLst:
        p.join()

    print(f"queueAfter = {queue}")
    queue.close()
    print(f"all queue used.")
    mainEndTime = datetime.datetime.now()
    print(f"### timeUsedTotal = {mainEndTime - mainStartTime}, mainStartTime = {mainStartTime}, mainEndTime = {mainEndTime}")


4. Results


  • Notes:
  • More processes means more load on the machine; find a count that suits your hardware, and pay attention to every comment in the code.
  • At the time of writing, Taobao's anti-scraping measures on this page are weak: no login required, no rate limiting, no captchas or pop-up ads. If new countermeasures appear, add the corresponding mechanisms from the earlier related posts; a randomized-delay sketch follows this list.
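
If throttling does appear, one low-effort mitigation is a randomized delay between page flips, so requests do not arrive at a fixed rhythm. A sketch (the bounds are arbitrary; tune them against the site's behaviour):

import random
import time

def politeSleep(minSec = 2, maxSec = 6):
    # Sleep a random interval between page flips
    time.sleep(random.uniform(minSec, maxSec))

# Call politeSleep() right before each getNextListPage(...) call in
# chromeProcessPer, in place of the commented-out time.sleep(5).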