使用 Selenium 爬取淘宝商品列表信息并存入 MongoDB

selenium_taobao_com.py

#!/usr/bin/env python3
# coding=utf-8
# Version:python3.6.1
# File:selenium_taobao_com.py
# Author:LGSP_Harold
from urllib.parse import quote

import pymongo
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from pyquery import PyQuery as pq


# Headless alternative, kept for reference: build the driver from options
# carrying the '--headless' argument instead of the plain constructor below.
# firefox_options = webdriver.FirefoxOptions()
# firefox_options.add_argument('--headless')
# browser = webdriver.Firefox(firefox_options=firefox_options)

# Single Firefox session shared by every function in this script.
browser = webdriver.Firefox()
# Default explicit wait: each condition gets up to 10 seconds.
wait = WebDriverWait(browser, timeout=10)

# MongoDB connection; the functions below write into the db_taobao_com database.
client = pymongo.MongoClient('mongodb://admin:admin@localhost:27017')
db = client['db_taobao_com']


def index_page(page, max_retries=3):
    """
    Crawl one search-result page and hand its products to get_products().

    For any page after the first, the page number is typed into the pager's
    jump box and submitted. The function then waits until the pager marks
    ``page`` as the active page and at least one item is present, scrolls
    the "next" button into view when one exists, and parses the page.

    A timeout while waiting is retried a bounded number of times instead of
    recursing without limit; when every attempt times out the page is
    skipped so the remaining pages can still be crawled.

    :param page: 1-based number of the result page to crawl.
    :param max_retries: extra attempts made after the first one times out.
    """
    attempts = max(max_retries, 0) + 1
    for _ in range(attempts):
        print('正在爬取第', page, '页')
        try:
            if page > 1:
                page_box = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
                submit = WebDriverWait(browser, 60, 3).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager div.form span.btn.J_Submit')))
                page_box.clear()
                page_box.send_keys(str(page))
                submit.click()
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item')))
        except TimeoutException:
            # Only the waits are retried; parsing and storing stay outside the
            # try so a retry can never insert the same products twice.
            continue

        # find_elements returns an empty list rather than raising when the
        # button is absent, so a page without a "next" button is still parsed.
        next_buttons = browser.find_elements(By.XPATH, '//li[@class="item next"]')
        if next_buttons:
            # Presumably scrolling down makes lazily loaded item content
            # render before the page source is read -- TODO confirm.
            browser.execute_script('arguments[0].scrollIntoView();', next_buttons[0])
        get_products()
        return

    print('第', page, '页多次超时，已跳过')


def get_products():
    """
    Parse the current page source and store every listed product.

    Each item under ``#mainsrp-itemlist`` becomes a dict with the keys
    image, price, deal, title, shop and location; the dict is printed and
    then passed to save_to_mongo().
    """
    # (dict key, CSS selector) pairs whose element text is copied verbatim.
    text_fields = (
        ('price', '.price'),
        ('deal', '.deal-cnt'),
        ('title', '.title'),
        ('shop', '.shop'),
        ('location', '.location'),
    )
    document = pq(browser.page_source)
    for entry in document('#mainsrp-itemlist .items .item').items():
        # The image URL is read from the data-src attribute, not from text.
        product = {'image': entry.find('.pic .img').attr('data-src')}
        for key, selector in text_fields:
            product[key] = entry.find(selector).text()
        print(product)
        save_to_mongo(product)


def save_to_mongo(result):
    """
    Insert one product document into the collection_product collection.

    Any failure is reported on stdout and swallowed so that one bad
    document does not stop the crawl.

    :param result: the document (dict) to store.
    """
    try:
        outcome = db['collection_product'].insert_one(result)
        if outcome:
            print('存储到MongoDB成功')
    except Exception as error:
        print('存储到MongoDB失败')
        print(error)


def login():
    """
    Log in to Taobao through the QR-code panel and report the outcome.

    Opens the login page, switches to the QR-code view and waits for the
    user to scan the code with the Taobao app. Once the logged-in nickname
    element appears, the first site-nav menu link is clicked (presumably
    the link back to the home page -- TODO confirm).

    Username/password login is deliberately not automated: Taobao's
    anti-crawling checks detect Selenium and the captcha verification then
    fails, so scanning the QR code with the app is the workaround.

    :return: True when login completed; False when one of the waits timed
        out, in which case the browser has already been closed.
    """
    url = 'https://login.taobao.com/member/login.jhtml'
    browser.get(url=url)

    try:
        qr_code = WebDriverWait(browser, 30, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'i.iconfont.icon-qrcode')))
        qr_code.click()

        print('请用淘宝APP扫码登录')

        # until() either returns the element or raises TimeoutException, so
        # its result needs no truthiness check.
        WebDriverWait(browser, 60, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.site-nav-login-info-nick')))
        url_index = WebDriverWait(browser, 60, 3).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div.site-nav-menu-hd > a > span')))
        url_index.click()
    except TimeoutException:
        print('登录超时')
        browser.quit()
        return False
    return True


def main():
    """
    Log in, ask for a keyword and a page count, then crawl every page.

    Stops immediately when login fails, because login() has already closed
    the browser by then. Otherwise the browser is always closed on the way
    out, including when the page count is not a valid integer or crawling
    raises.

    :raises ValueError: if the page count entered is not an integer.
    """
    if not login():
        return

    try:
        goods = input('输入您要搜索的商品：')
        page = int(input('输入您要爬取的总页数：'))
        url = 'https://s.taobao.com/search?q=' + quote(goods)
        browser.get(url=url)

        for i in range(1, page + 1):
            index_page(i)
    finally:
        # Always release the Firefox session, even on an error.
        browser.quit()


if __name__ == '__main__':
    main()

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

使用selenium爬取淘B商品列表信息入库MongoDb

京东面试：如何进行JVM调优？

美团一面：项目中有 10000 个 if else 如何优化？想了半天，被问懵了！

Python 将PowerPoint (PPT/PPTX) 转为HTML

SQL优化-20231016

jinja2 for循環中if語句不起作用

異步提交tinymce富文本

本地Flask項目添加SSL

Flask接入第三方（Facebook）登錄

設置overflow:hiden出現偏移現象

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結