python + selenium多進程爬取淘寶搜索頁數據
1. 功能描述
- 按照給定的關鍵詞,在淘寶搜索對應的產品,然後爬取搜索結果中產品的信息,包括:標題,價格,銷量,產地等信息,存入mongodb中,需要採用多進程提高爬取效率。
2. 環境
- 系統:win7
- MongoDB 3.4.6
- python 3.6.1
- IDE:pycharm
- 安裝過chrome瀏覽器(63.0.3239.132 (正式版本) 32 位)
- selenium 3.7.0
- 配置好chromedriver v2.34
3. 代碼
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import pymongo
import time
import datetime
import re
import multiprocessing
import lxml.html
import lxml.etree
# Crawl targets: display keyword -> [reserved flag, phrase actually typed
# into the Taobao search box].
keySearchWords = {
"動漫": [1, "動漫周邊"],
"水果": [1, "水果沙拉"],
}
# Shared MongoDB handle; every product document lands in taobao.productInfo.
client = pymongo.MongoClient("127.0.0.1:27017")
db = client["taobao"]
db_coll = db["productInfo"]
# Upper bound on retry attempts for page loads / searches.
retryMax = 8
# Chrome launch options (defaults; customize here if needed).
chrome_options = webdriver.ChromeOptions()
def getProductMainInfo(htmlSource):
    """Parse the product cards out of a Taobao search-result page.

    Args:
        htmlSource: raw HTML of the result page (``browser.page_source``).

    Returns:
        list[dict]: one dict per product (``_id``, ``title``, ``price``,
        ``dealCount``, shop fields, ...); ``[]`` when parsing fails entirely.
    """
    def _first(node, xpath, default):
        # First xpath match on *node*, or *default* when nothing matches.
        hits = node.xpath(xpath)
        return hits[0] if hits else default

    try:
        resultTree = lxml.etree.HTML(htmlSource)
        # Each product card on the result page carries the J_MouserOnverReq class.
        productLst = resultTree.xpath("//div[@class='m-itemlist']//div[contains(@class, 'J_MouserOnverReq')]")
        print(f"productLst = {productLst}")
        productInfoLst = []
        for product in productLst:
            productInfo = {}
            productInfo['dataNid'] = _first(
                product,
                ".//div[contains(@class,'ctx-box')]//div[contains(@class, 'title')]/a/@data-nid",
                0)
            # data-nid doubles as the Mongo _id, so re-crawled products are
            # rejected by insert_one instead of being stored twice.
            productInfo['_id'] = productInfo['dataNid']
            # NOTE: 'unknow' spelling kept as-is — existing DB records use it.
            productInfo['taobaoCategory'] = _first(product, "@data-category", 'unknow')
            productInfo['rank'] = _first(product, "@data-index", 0)
            productInfo['imgSrc'] = _first(product, ".//div[@class='pic']/a//img/@src", '')
            # The title is split across several text nodes (keyword highlighting).
            titleParts = product.xpath(".//div[contains(@class,'ctx-box')]//div[contains(@class, 'title')]/a/text()")
            productInfo['title'] = ''.join(part.strip() for part in titleParts)
            productInfo['detailUrl'] = _first(product, ".//div[contains(@class,'title')]//a/@href", '')
            # The shop-name span may hold several text nodes; the last is the name.
            shopName = product.xpath(".//div[contains(@class,'ctx-box')]//div[@class='shop']/a[contains(@class,'shopname')]/span/text()")
            productInfo['shopName'] = shopName[-1] if shopName else ''
            shopHref = product.xpath(".//div[contains(@class,'ctx-box')]//div[@class='shop']/a[contains(@class,'shopname')]/@href")
            productInfo['shopHref'] = shopHref[0] if shopHref else ''
            print(f"shopHref = {shopHref}")
            shopID = product.xpath(".//div[contains(@class,'ctx-box')]//div[@class='shop']/a[contains(@class,'shopname')]/@data-userid")
            productInfo['shopID'] = shopID[0] if shopID else ''
            print(f"shopID = {shopID}")
            productInfo['location'] = _first(product, ".//div[contains(@class,'clearfix')]//div[@class='location']/text()", '')
            # Deal count arrives as text like "1234人付款"; keep the leading integer.
            dealCount = _first(product, ".//div[contains(@class,'ctx-box')]//div[@class='deal-cnt']/text()", '')
            if dealCount:
                dealCountRe = re.search(r'(\d+)', dealCount)
                # BUGFIX: fall back to int 0 (was float 0.0) so the stored field
                # type is consistent across all code paths.
                productInfo['dealCount'] = int(dealCountRe.group(1)) if dealCountRe else 0
            else:
                # BUGFIX: the original never set 'dealCount' when the text node
                # existed but was empty; always set it now.
                productInfo['dealCount'] = 0
            price = _first(product, ".//div[contains(@class,'price')]//strong/text()", '')
            if price:
                priceRe = re.search(r'([\d\.]+)', price)
                productInfo['price'] = float(priceRe.group(1)) if priceRe else 0.0
            else:
                productInfo['price'] = 0.0
            print(f"productInfo = {productInfo}")
            productInfoLst.append(productInfo)
        return productInfoLst
    except Exception as e:
        print(f"解析List 商品信息出錯,e = {e}")
        return []
def searchKey(browser, wait, keyWord, categorySearchWords, retryCount):
    """Search *categorySearchWords* from the Taobao home page.

    Returns (success, totalPage, keyWord). As a side effect, the products of
    the first result page are parsed and stored into Mongo (in the ``finally``
    below, so it runs on both the multi-page and single-page return paths).
    Retries by recursing with retryCount + 1, up to ``retryMax`` attempts.
    """
    print(f"searchKey: enter, keyWord = {keyWord}, categorySearchWords = {categorySearchWords}, retryCount = {retryCount}")
    retryCount += 1
    if retryCount > retryMax:
        # Too many failed attempts: report failure, zero pages.
        return (False, 0, keyWord)
    mainUrl = "https://www.taobao.com/"
    print(f"searchKey: 訪問taobao主頁, 進行搜索. mainUrl = {mainUrl}")
    browser.get(mainUrl)
    try:
        input = wait.until(
            EC.presence_of_element_located((By.XPATH, "//input[@class='search-combobox-input']"))
        )
    except Exception as e:
        print(f"searchKey: 搜索框還沒有加載好,重新加載主頁. retryCount = {retryCount}, url = {mainUrl}, e = {e}")
        # BUGFIX: return the recursive retry's result. The original dropped it,
        # so the caller saw None even when a later attempt succeeded.
        return searchKey(browser, wait, keyWord, categorySearchWords, retryCount)
    else:
        try:
            input = browser.find_element_by_xpath("//input[@class='search-combobox-input']")
            time.sleep(3)
            input.clear()
            input.send_keys(categorySearchWords)
            input.send_keys(Keys.RETURN)
            print(f"searchKey: press return key.")
            time.sleep(3)
            searchRes = wait.until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='m-itemlist']"))
            )
            print(f"searchKey: searchSuccess, searchRes = {searchRes}")
        except Exception as e:
            print(f"searchKey: 搜索結果總頁數尚未加載好,重新加載主頁. retryCount = {retryCount}, url = {mainUrl}, e = {e}")
            # BUGFIX: same as above — propagate the retry's result.
            return searchKey(browser, wait, keyWord, categorySearchWords, retryCount)
        else:
            try:
                print(f"searchKey: 搜索結果已出現,開始尋找總頁數")
                totalPage = 0
                print(f"searchKey: totalPageInit = {totalPage}")
                totalRes = wait.until(
                    EC.presence_of_element_located((By.XPATH, "//div[@class='total']"))
                )
                totalRe = re.search(r"(\d+)", totalRes.text)
                totalPage = int(totalRe.group(1))
                print(f"searchKey: totalPage = {totalPage}")
                return (True, totalPage, keyWord)
            except Exception as e:
                # No pager element: the whole result fits on a single page.
                print(f"searchKey: 搜索結果就一頁. e = {e}")
                return (True, 1, keyWord)
            finally:
                # Store page 1 before the caller walks pages 2..N.
                try:
                    print(f"searchKey: 取第一頁的數據出來,進行存儲")
                    if browser.page_source:
                        productInfoLst = getProductMainInfo(browser.page_source)
                        for product in productInfoLst:
                            product['item_type'] = "productMainInfo"
                            product['keyWord'] = keyWord
                            product['page'] = 1
                            product['markClawerTools'] = 1
                            product['reverse1'] = 1
                            product['categorySearchWords'] = categorySearchWords
                            try:
                                insertRes = db_coll.insert_one(product)
                                print(f"searchKey: insertRes = {insertRes.inserted_id}")
                            except Exception as e:
                                # Duplicate _id etc.: log and continue with the next product.
                                print(f"searchKey: insert_one Exception = {e}")
                except Exception as e:
                    print(f"searchKey: 取第一頁數據出來這個過程出現異常。Exception = {e}")
def getNextListPage(browser, wait, pageNum, retryCount):
    """Jump to result page *pageNum* via the pager's page-number input box.

    Returns (success, productInfoLst). Retries by recursing with
    retryCount + 1 after a refresh, up to ``retryMax`` attempts.
    """
    print(f"getNextListPage: enter, pageNum = {pageNum}, retryCount = {retryCount}")
    retryCount += 1
    if retryCount > retryMax:
        return (False, [])
    try:
        input = wait.until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='form']/input[@class='input J_Input']"))
        )
    except Exception as e:
        print(f"getNextListPage: 頁碼輸入框尚未加載好,刷新當前頁面,循環這個過程. retryCount = {retryCount}")
        browser.refresh()
        time.sleep(2)
        # BUGFIX: return the retry's result. The original dropped it, so the
        # caller saw None even when a later attempt succeeded.
        return getNextListPage(browser, wait, pageNum, retryCount)
    else:
        try:
            input = browser.find_element_by_xpath("//div[@class='form']/input[@class='input J_Input']")
            input.clear()
            input.send_keys(pageNum)
            submit = browser.find_element_by_xpath("//div[@class='form']/span[@class='btn J_Submit']")
            submit.click()
            # Wait for the pager to highlight the requested page, then for the
            # item list to be present, before scraping the page source.
            activePage = wait.until(
                EC.text_to_be_present_in_element((By.XPATH, "//ul[@class='items']/li[@class='item active']/span"), str(pageNum))
            )
            allProducts = wait.until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='m-itemlist']"))
            )
            if browser.page_source:
                productInfoLst = getProductMainInfo(browser.page_source)
                return (True, productInfoLst)
            else:
                browser.refresh()
                time.sleep(2)
                # BUGFIX: propagate the retry's result.
                return getNextListPage(browser, wait, pageNum, retryCount)
        except Exception as e:
            print(f"getNextListPage: 無法解析當前頁列表下的商品,重新刷新,pageNum = {pageNum}, retryCount = {retryCount}, e = {e}")
            browser.refresh()
            time.sleep(2)
            # BUGFIX: propagate the retry's result.
            return getNextListPage(browser, wait, pageNum, retryCount)
def chromeProcessPer(queue, lock, mark):
    """Worker process *mark*: owns one Chrome instance and drains *queue*.

    Each queue item is ``(keyWord, [flag, categorySearchWords])``. The worker
    runs the search, then walks pages 2..totalPage and stores every product
    into Mongo (page 1 is stored inside searchKey itself).
    """
    # multiprocessing.Queue.get raises the stdlib queue.Empty on timeout; the
    # parameter named `queue` shadows the module, so import the name locally.
    from queue import Empty
    clawerStartTime = datetime.datetime.now()
    print(f"chromeProcessPer enter: clawerMark = {mark}, time = {clawerStartTime}")
    browser = webdriver.Chrome(chrome_options=chrome_options)
    browser.set_window_size(900, 900)
    wait = WebDriverWait(browser, timeout=30)
    while True:
        # BUGFIX: `while not queue.empty(): queue.get()` races with the other
        # worker — the queue can drain between the check and the blocking
        # get(), hanging this worker forever. A timed get() is the safe
        # drain pattern; Empty means the work is done.
        try:
            keyWordInfo = queue.get(timeout=5)
        except Empty:
            break
        with lock:
            # Lock only serializes the log line across workers.
            print(f"keyWordInProcess = {keyWordInfo}, markProcess = {mark}")
        time.sleep(1)
        searchRes = searchKey(browser, wait, keyWordInfo[0], keyWordInfo[1][1], 0)
        print(f"main: searchRes = {searchRes}, clawerMark = {mark}")
        if (searchRes is not None) and searchRes[0] == True:
            if searchRes[1] > 1:
                for page in range(2, searchRes[1] + 1):
                    listPageRes = getNextListPage(browser, wait, page, 0)
                    print(f"chromeProcesser: keyWord = {keyWordInfo}, page = {page}, listPageRes = {listPageRes}, mark = {mark}")
                    if (listPageRes is not None) and (listPageRes[0] == True):
                        for product in listPageRes[1]:
                            product['item_type'] = "productMainInfo"
                            product['keyWord'] = keyWordInfo[0]
                            product['page'] = page
                            product['markClawerTools'] = 1
                            product['reverse1'] = 1
                            product['categorySearchWords'] = keyWordInfo[1][1]
                            try:
                                insertRes = db_coll.insert_one(product)
                                print(f"insertRes = {insertRes.inserted_id}")
                            except Exception as e:
                                # Duplicate _id etc.: log and continue.
                                print(f"insert_one Exception = {e}")
            else:
                print(f"main: keyWord = {keyWordInfo}, totalPage = {searchRes[1]}, clawerMark = {mark}")
    clawerEndTime = datetime.datetime.now()
    browser.quit()
    print(f"chromeProcessPer end: clawerMark = {mark}, time = {clawerEndTime}, timeUsed = {clawerEndTime - clawerStartTime}")
if __name__ == "__main__":
    # Entry point: fill a shared work queue with every search keyword, fan
    # the work out to two Chrome worker processes, and wait for completion.
    mainStartTime = datetime.datetime.now()
    print(f"main: taobao mainStartTime = {mainStartTime}")
    lock = multiprocessing.Lock()
    queue = multiprocessing.Queue(300)
    for keyWord, keyWordValue in keySearchWords.items():
        print(f"keyWord = {keyWord}, keyWordValue = {keyWordValue}")
        queue.put((keyWord, keyWordValue))
    print(f"queueBefore = {queue}")
    # Two workers: each one drives its own Chrome instance.
    getKeyProcessLst = []
    for workerMark in range(2):
        worker = multiprocessing.Process(
            target=chromeProcessPer,
            args=(queue, lock, workerMark),
        )
        worker.start()
        getKeyProcessLst.append(worker)
    # Block until every worker has drained the queue and exited.
    for worker in getKeyProcessLst:
        worker.join()
    print(f"queueAfter = {queue}")
    queue.close()
    print(f"all queue used.")
    mainEndTime = datetime.datetime.now()
    print(f"### timeUsedTotal = {mainEndTime - mainStartTime}, mainStartTime = {mainStartTime}, mainEndTime = {mainEndTime}")
4. 結果
- 需要注意的是:
- 進程數越多,對電腦資源(CPU、內存)的壓力越大,應根據自己機器的配置選擇合適的進程數量,並在修改代碼前仔細閱讀代碼中的每一個註釋。
- 目前淘寶對這個頁面的反爬措施不強,不需要登錄,也沒有頻率限制,也沒有驗證碼或者廣告彈窗。如果有新的反爬措施,參考前面其他相關專題,增加這樣的機制。