# python 爬京東商品信息 (Python: scrape JD.com product information)

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
from random import randint
from pyquery import PyQuery as pq
import pymongo


class JD(object):
    """Scrape JD.com search results with a Selenium-driven headless Chrome.

    The flow (see ``execute``) is: open the JD home page, search for
    ``self.keyword``, read the number of result pages, then parse each
    result page in turn. Every product found is appended to a text file
    (``write``) and inserted into MongoDB (``save_to_mongo``).
    """

    # Destination file for scraped rows. Each row is the ``str()`` of a dict
    # on its own line (despite the .csv extension).
    CSV_PATH = r"G:\個人總結\csv\jd_iphone.csv"
    # MongoDB connection settings used by ``save_to_mongo``.
    MONGO_URL = "localhost"
    MONGO_DB = "JingDong"
    MONGO_COLLECTION = "products"
    # Seconds to wait after starting the asynchronous scroll so that the
    # page has time to finish scrolling before its HTML is read.
    SCROLL_SETTLE_SECONDS = 5

    # pymongo client, created on first insert and reused afterwards.
    _mongo_client = None

    def __init__(self, keyword="iphone"):
        """Start a headless Chrome session.

        :param keyword: search term typed into the JD search box.
        """
        self.options = webdriver.ChromeOptions()
        # Run Chrome headless (no visible window).
        self.options.add_argument("headless")
        # NOTE(review): ``executable_path`` / ``chrome_options`` are the
        # Selenium 3 keyword names; newer Selenium releases expect
        # ``service=`` / ``options=`` -- confirm against the installed version.
        self.browser = webdriver.Chrome(
            executable_path=r"C:\Users\dell\AppData\Local\Google\Chrome\Application\chromedriver.exe",
            chrome_options=self.options
        )
        # Maximize the browser window.
        self.browser.maximize_window()
        self.keyword = keyword
        self.url = "https://www.jd.com/?cu=true&utm_source=baidu-search&utm_medium=cpc&utm_campaign=t_262767352_baidusearch&utm_term=106807362512_0_1ea216375c8242409e3b4487043f782b"

    @staticmethod
    def _absolute_url(href):
        """Return *href* as an ``https`` URL, or ``""`` when it is missing.

        Hrefs that already carry an ``http://`` / ``https://`` scheme are
        returned unchanged; anything else (e.g. ``//item.jd.com/1.html``)
        gets ``https:`` prepended.
        """
        if not href:
            return ""
        href = str(href)
        if href.startswith(("http://", "https://")):
            return href
        return "https:" + href

    @staticmethod
    def _parse_page_count(text):
        """Return the page count found in the pager *text*.

        Uses the first whitespace-separated token, so a selector that
        matched more than once (``"100 100"``) still parses. Falls back to
        1 when no number is present, so the current page is still scraped.
        """
        tokens = (text or "").split()
        try:
            return int(tokens[0])
        except (IndexError, ValueError):
            return 1

    def scroll(self):
        """Start scrolling the current page to the bottom in 100px steps.

        The scrolling runs asynchronously inside the browser (``setTimeout``);
        this method returns right after injecting the script. When the
        bottom is reached the script appends ``scroll-done`` to the title.
        """
        self.browser.execute_script(""" 
            (function () { 
                var y = document.body.scrollTop; 
                var step = 100; 
                window.scroll(0, y); 
                function f() { 
                    if (y < document.body.scrollHeight) { 
                        y += step; 
                        window.scroll(0, y); 
                        setTimeout(f, 50); 
                    }
                    else { 
                        window.scroll(0, y); 
                        document.title += "scroll-done"; 
                    } 
                } 
                setTimeout(f, 1000); 
            })(); 
            """)

    def skip(self):
        """Go from the home page to the search-result page.

        Types ``self.keyword`` into the search box, submits it, scrolls the
        result page and reads the pager.

        :return: total number of result pages (1 if no pager was found).
        """
        self.browser.get(self.url)
        wait = WebDriverWait(self.browser, 5)
        # Wait for the search input to be present.
        search_box = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#key')))
        # Wait for the search button to become clickable.
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".button")))
        time.sleep(2)
        # Clear any pre-filled keyword, then type ours.
        search_box.clear()
        search_box.send_keys(self.keyword)
        time.sleep(2)
        submit.click()
        # Give the result page a moment to load.
        time.sleep(randint(1, 3))
        # Scroll down the page and wait for the scroll to finish.
        self.scroll()
        time.sleep(self.SCROLL_SETTLE_SECONDS)
        doc = pq(self.browser.page_source)
        # Read the total number of result pages from the pager.
        return self._parse_page_count(doc(".p-skip b").text())

    def getData(self):
        """Parse every product on the page currently shown in the browser.

        The page is NOT reloaded here: ``skip`` / ``next`` have already
        navigated to the right result page. Re-opening a fixed search URL
        would always land on page 1. Each product is printed, appended to
        the output file and inserted into MongoDB.
        """
        doc = pq(self.browser.page_source)
        for index, item in enumerate(doc(".gl-item").items()):
            image_link = item(".p-img a")
            shop_link = item(".p-shop a")
            product = {
                "href": self._absolute_url(image_link.attr("href")),
                # A missing attribute yields "" rather than the text "None".
                "title": str(image_link.attr("title") or "").replace("\n", " "),
                "price": item(".p-price").text(),
                "name": str(item(".p-name em").text()).replace("\n", " "),
                "commit": item(".p-commit").text(),
                # "<shop name> <shop url>"; tolerates items with no shop link.
                "shop": "{} {}".format(
                    shop_link.text(), self._absolute_url(shop_link.attr("href"))
                ).strip(),
                "icons": item(".p-icons").text(),
                "stock": item(".p-stock").text()
            }
            # Pause 1-2 s per item; this paces the crawl as a whole.
            time.sleep(randint(1, 2))
            print(index, product)
            self.write(product)
            self.save_to_mongo(product)

    def next(self):
        """Click the "next page" button and scroll the new page."""
        wait = WebDriverWait(self.browser, 3)
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".pn-next")))
        time.sleep(3)
        submit.click()
        print(self.browser.current_url)
        time.sleep(2)
        self.scroll()
        # Same settle time as in ``skip``: the scroll is asynchronous and
        # ``getData`` reads the page right after this method returns.
        time.sleep(self.SCROLL_SETTLE_SECONDS)

    def write(self, content):
        """Append ``str(content)`` plus a newline to ``CSV_PATH`` (UTF-8)."""
        with open(self.CSV_PATH, 'a', encoding="utf-8") as file:
            file.write(str(content) + "\n")

    def save_to_mongo(self, result):
        """Insert *result* into the MongoDB collection (best effort).

        The client is created on first use and reused for later inserts
        instead of opening a new connection per product. An insert failure
        is reported on stdout and does not abort the crawl.
        """
        if self._mongo_client is None:
            self._mongo_client = pymongo.MongoClient(self.MONGO_URL)
        collection = self._mongo_client[self.MONGO_DB][self.MONGO_COLLECTION]
        try:
            collection.insert_one(result)
        except Exception as exc:
            print("存儲到MongoDB失敗", exc)

    def close(self):
        """Quit the browser and close the MongoDB client, if they exist."""
        browser = getattr(self, "browser", None)
        if browser is not None:
            browser.quit()
        if self._mongo_client is not None:
            self._mongo_client.close()
            self._mongo_client = None

    def execute(self):
        """Run the whole crawl, then release the browser and DB client."""
        try:
            count = self.skip()
            for i in range(count):
                print("第{}頁".format(i + 1))
                self.getData()
                # There is no further page to open after the last one.
                if i < count - 1:
                    self.next()
        finally:
            self.close()


if __name__ == "__main__":
    # Script entry point: build the scraper and run the full crawl.
    scraper = JD()
    scraper.execute()
# (Web-page residue captured along with the code; not part of the script.)
# 發表評論
# 所有評論
# 還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
# 相關文章