Python:根據需求靈活爬取唯品會商品動態數據

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from pyquery import PyQuery as pq
import time
import random

# Required on Windows: the path below points at the chromedriver executable.
# NOTE(review): passing the driver path as a positional argument looks like the
# older Selenium calling style — confirm it matches the installed Selenium version.
browser = webdriver.Chrome(r"C:\Users\dell\AppData\Local\Google\Chrome\Application\chromedriver.exe")
# Maximize the browser window.
browser.maximize_window()
# Keyword typed into the site's search box; any keyword can be used here.
KEYWORD = "手機"


class VIP(object):
    """Scrape vip.com search-result listings through a Selenium browser.

    The class keeps no instance state: every method drives the module-level
    ``browser`` and searches for the module-level ``KEYWORD``.
    """

    # Upper bound, in seconds, on how long ``scroll`` waits for the in-page
    # scrolling script to report that it reached the bottom.
    SCROLL_TIMEOUT = 30

    def search(self):
        """Open vip.com, submit ``KEYWORD`` and scroll the first result page.

        :raises selenium.common.exceptions.TimeoutException: if the search
            box or the search button is not available within 5 seconds.
        :return: None
        """
        url = "https://www.vip.com/"
        browser.get(url)
        wait = WebDriverWait(browser, 5)
        # Wait for the search box to be present in the DOM.
        search_box = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".c-search-input")))
        # Wait for the search button to be clickable (sending ENTER to the
        # search box would be an alternative way to submit).
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".J-search-button")))
        time.sleep(random.randint(1, 4))
        # Drop whatever text the box already contains, then type the keyword.
        search_box.clear()
        search_box.send_keys(KEYWORD)
        time.sleep(random.randint(1, 4))
        # Simulate a human click on the search button.
        submit.click()
        time.sleep(random.randint(1, 4))
        # Scroll the result page down to the bottom. A single
        # ``window.scrollTo(0, document.body.scrollHeight)`` jump was found to
        # stop short of the bottom when a frame covers the page, which later
        # makes the next-page click time out; maximizing the window is another
        # way to avoid that overlay problem.
        self.scroll()
        time.sleep(random.randint(2, 4))

    def scroll(self):
        """Scroll the current page to the bottom step by step and wait for it.

        The injected script scrolls 100 px every 50 ms (after a 1 s delay)
        until it reaches ``document.body.scrollHeight``. Because the script
        runs asynchronously in the page, this method then polls a completion
        flag so callers only continue once scrolling has actually finished.
        If the flag is not set within ``SCROLL_TIMEOUT`` seconds the method
        reports it and returns anyway (best effort).

        :return: None
        """
        # Imported here so the block does not depend on a new file-level import.
        from selenium.common.exceptions import TimeoutException

        browser.execute_script("""
            (function () {
                var y = document.body.scrollTop;
                var step = 100;
                window.__vipScrollDone = false;
                window.scroll(0, y);
                function f() {
                    if (y < document.body.scrollHeight) {
                        y += step;
                        window.scroll(0, y);
                        setTimeout(f, 50);
                    }
                    else {
                        window.scroll(0, y);
                        document.title += "scroll-done";
                        window.__vipScrollDone = true;
                    }
                }
                setTimeout(f, 1000);
            })();
            """)
        try:
            # The flag is reset at the start of every run, so a marker left
            # over from a previous scroll on the same document cannot
            # satisfy this wait.
            WebDriverWait(browser, self.SCROLL_TIMEOUT).until(
                lambda drv: drv.execute_script("return window.__vipScrollDone === true;")
            )
        except TimeoutException:
            print("scroll did not finish within {} seconds; continuing".format(self.SCROLL_TIMEOUT))

    @staticmethod
    def _build_url(href):
        """Turn a product link's ``href`` attribute into an absolute URL.

        :param href: value of the ``href`` attribute, or ``None`` when the
            product card has no link.
        :return: ``""`` for a missing/empty href, the href itself when it
            already carries an http(s) scheme, otherwise the href prefixed
            with ``"http:"`` (the form needed for ``//host/path`` links).
        """
        if not href:
            return ""
        href = str(href)
        if href.startswith(("http://", "https://")):
            return href
        return "http:{}".format(href)

    def getData(self):
        """Parse the currently rendered page and save every product on it.

        Each ``.goods-list-item`` element is turned into a dict, appended to
        the output file via ``write`` and printed together with its index.

        :return: None
        """
        doc = pq(browser.page_source)
        for index, item in enumerate(doc(".goods-list-item").items()):
            product = {
                "URL:": self._build_url(item(".goods-image a").attr("href")),
                "DISCOUNT_PRICE:": item(".inner-exclusive").text(),
                "VIP_PRICE:": item(".goods-vipshop-wrap").text(),
                "DISCOUNT:": item(".goods-discount-wrap").text().replace("\n", " "),
                "TITLE:": item(".goods-title-info").text()
            }
            self.write(product)
            print(index, product)
        # One randomized pause per page. Parsing works on HTML that is
        # already captured, so pausing after every single item only slowed
        # the run down without spacing out any request.
        time.sleep(random.randint(1, 4))

    def write(self, content, path=r"vip_iPhone.csv"):
        """Append one record to the local output file.

        Despite the ``.csv`` extension of the default file, each record is
        stored as one line holding ``str(content)``.

        :param content: the record to store (``getData`` passes a dict).
        :param path: file to append to; defaults to ``vip_iPhone.csv`` in the
            current working directory.
        :return: None
        """
        with open(path, "a", encoding="utf-8") as file:
            file.write(str(content) + "\n")

    def nextPage(self):
        """Click the next-page control and scroll the page that follows.

        The new page is scrolled to the bottom the same way ``search`` does
        for the first page, so ``getData`` is not run against a page that was
        never scrolled.

        :raises selenium.common.exceptions.TimeoutException: if no clickable
            ``.cat-paging-next`` element shows up within 5 seconds.
        :return: None
        """
        wait = WebDriverWait(browser, 5)
        # Wait for the next-page button.
        next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".cat-paging-next")))
        time.sleep(4)
        next_button.click()
        # URL as reported by the browser right after the click.
        current_url = browser.current_url
        print(current_url)
        time.sleep(random.randint(1, 4))
        self.scroll()
        time.sleep(random.randint(2, 4))

    def execute(self, max_pages=19):
        """Run the whole scrape: search, then walk through the result pages.

        :param max_pages: maximum number of result pages to scrape, counting
            the first one. The default of 19 was chosen from the total page
            count of the product listing.
        :return: None
        """
        # Imported here so the block does not depend on a new file-level import.
        from selenium.common.exceptions import TimeoutException

        # Search first, then collect the first result page.
        self.search()
        self.getData()
        for _ in range(1, max_pages):
            try:
                self.nextPage()
            except TimeoutException:
                # No clickable next-page button: presumably the last page was
                # reached, so stop instead of crashing the whole run.
                print("no clickable next-page button found; stopping")
                break
            self.getData()

if __name__ == "__main__":
    vip = VIP()
    try:
        vip.execute()
    finally:
        # Always shut the browser session down, even when scraping fails,
        # so no browser/driver process is left running.
        browser.quit()

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼?請在上方評論欄輸入並且點擊發布。
相關文章