使用 Selenium 爬取淘寶商品列表信息並存入 MongoDB

selenium_taobao_com.py

#!/usr/bin/env python3
# coding=utf-8
# Version:python3.6.1
# File:selenium_taobao_com.py
# Author:LGSP_Harold
from urllib.parse import quote

import pymongo
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from pyquery import PyQuery as pq


# Optional headless configuration, left disabled; uncomment these three lines
# (and drop the plain Firefox() call below) to run without a visible window.
# firefox_options = webdriver.FirefoxOptions()
# firefox_options.add_argument('--headless')
# browser = webdriver.Firefox(firefox_options=firefox_options)
# Shared Firefox session used by the scraping functions below; it is started
# as a module-level side effect when this file is executed or imported.
browser = webdriver.Firefox()
# Default explicit wait bound to the shared browser, with a 10 second timeout.
wait = WebDriverWait(browser, 10)

# MongoDB client; host, port and credentials are hard-coded in the URI.
# NOTE(review): consider loading the URI from configuration instead of source.
client = pymongo.MongoClient('mongodb://admin:admin@localhost:27017')
# Handle to the `db_taobao_com` database on that client.
db = client.db_taobao_com


def index_page(page, max_retries=3):
    """
    Open one page of the search results and scrape its products.

    For any page after the first, the page number is typed into the pager's
    jump box and submitted. The function then waits until the pager shows
    ``page`` as the active entry and at least one result item is present,
    scrolls to the "next page" link when there is one, and finally calls
    ``get_products()``.

    A timeout while waiting triggers another attempt. After ``max_retries``
    additional attempts the page is skipped with a message instead of
    retrying forever.

    :param page: page number to open (pages greater than 1 use the jump box)
    :param max_retries: number of extra attempts made after a timeout
    """
    print('正在爬取第', page, '')
    for _ in range(max_retries + 1):
        try:
            if page > 1:
                page_input = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
                submit = WebDriverWait(browser, 60, 3).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager div.form span.btn.J_Submit')))
                page_input.clear()
                page_input.send_keys(str(page))
                submit.click()
            # The pager must report the requested page as active before the
            # item list is trusted to belong to that page.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item')))
        except TimeoutException:
            # Bounded retry: try the same page again on the next iteration.
            continue

        # find_elements returns an empty list instead of raising, so a page
        # without a matching "next" link is still scraped.
        next_links = browser.find_elements(By.XPATH, '//li[@class="item next"]')
        if next_links:
            # Scrolling to the pager presumably forces lazily loaded items
            # to render before the page source is read -- TODO confirm.
            browser.execute_script('arguments[0].scrollIntoView();', next_links[0])
        get_products()
        return

    print('第', page, '頁多次超時,已跳過')


def get_products():
    """
    Parse the page currently loaded in the browser and store each product.

    Every element matching ``#mainsrp-itemlist .items .item`` is turned into
    a dict holding the image ``data-src`` attribute plus the text of the
    price, deal count, title, shop and location nodes. Each dict is printed
    and then handed to ``save_to_mongo``.
    """
    # (dict key, CSS selector) pairs whose text content is copied verbatim.
    text_fields = (
        ('price', '.price'),
        ('deal', '.deal-cnt'),
        ('title', '.title'),
        ('shop', '.shop'),
        ('location', '.location'),
    )
    document = pq(browser.page_source)
    for node in document('#mainsrp-itemlist .items .item').items():
        # 'image' is inserted first so the key order matches the printed output.
        product = {'image': node.find('.pic .img').attr('data-src')}
        for key, selector in text_fields:
            product[key] = node.find(selector).text()
        print(product)
        save_to_mongo(product)


def save_to_mongo(result):
    """
    Insert one document into the ``collection_product`` collection.

    Success and failure are both reported on stdout; any exception raised
    during the insert is printed and not re-raised.

    :param result: the document (dict) to insert
    """
    try:
        outcome = db.collection_product.insert_one(result)
        if outcome:
            print('存儲到MongoDB成功')
    except Exception as error:
        print('存儲到MongoDB失敗')
        print(error)


def login():
    """
    Log in to Taobao through the QR-code flow.

    Opens the login page, clicks the QR-code icon, prompts the user to scan
    the code with the Taobao app, waits for the logged-in nickname element
    to appear and then clicks the first site navigation menu link.

    :raises SystemExit: if any wait times out. The browser is closed first,
        so the script stops cleanly instead of continuing with a dead
        browser session.
    """
    browser.get(url='https://login.taobao.com/member/login.jhtml')

    # Username/password login is intentionally not automated: according to
    # the original author's note, Taobao's anti-crawling checks detect
    # Selenium and the captcha verification fails. Scanning the QR code with
    # the Taobao app is the workaround.

    try:
        qr_code = WebDriverWait(browser, 30, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'i.iconfont.icon-qrcode')))
        qr_code.click()

        print('請用淘寶APP掃碼登錄')

        # The nickname element is used as the signal that the scan succeeded.
        # until() either returns the element or raises TimeoutException, so
        # no truthiness check is needed.
        WebDriverWait(browser, 60, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.site-nav-login-info-nick')))
        # Presumably the link back to the site home page -- TODO confirm.
        nav_link = WebDriverWait(browser, 60, 3).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div.site-nav-menu-hd > a > span')))
        nav_link.click()
    except TimeoutException:
        print('登錄超時')
        browser.quit()
        # The shared browser is gone; returning normally would let the caller
        # keep driving a closed session and fail later with an unrelated error.
        raise SystemExit(1)


def main():
    """
    Log in, ask for a search keyword and a page count, then scrape each page.

    The keyword is URL-quoted into the Taobao search URL and pages
    1..page are visited in order through ``index_page``. The browser is
    closed when scraping finishes or when an error interrupts it.

    :raises ValueError: if the page count entered is not an integer
    """
    login()
    try:
        goods = input('輸入您要搜索的商品:')
        page = int(input('輸入您要爬取的總頁數:'))
        browser.get(url='https://s.taobao.com/search?q=' + quote(goods))

        for i in range(1, page + 1):
            index_page(i)
    finally:
        # Always release the Firefox process, even on invalid input or an
        # unexpected WebDriver error.
        browser.quit()


if __name__ == '__main__':
    main()

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章