爬天貓、京東品牌

淘寶品牌

京東

# -*- coding:utf-8 -*-
# 爬蟲_selenium

import json
import itertools

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup
import time
import pandas as pd 
import numpy as np 


class Spider:
    """Scrape brand names from JD search-result filter panels via Selenium."""

    def __init__(self):
        # Selenium 4 removed the ``chrome_options=`` and ``executable_path=``
        # kwargs; the driver path now goes through a Service object.  The rest
        # of this class already uses the v4 API (find_element(By...)), so be
        # consistent here.  Function-scope import keeps the top of the script
        # unchanged.
        from selenium.webdriver.chrome.service import Service
        self.setOpts()
        self.bw = webdriver.Chrome(
            options=self.options,
            service=Service('./Chrome/chromedriver'),
        )
        self.fo = open('down.txt', 'a', encoding='utf-8')
        self.item_cnt = itertools.count(1)   # running count of finished words
        self.errors = []                     # words that raised during scraping

    def __del__(self):
        # __init__ may have died before these attributes existed; guard each.
        bw = getattr(self, 'bw', None)
        if bw is not None:
            bw.quit()   # quit() also terminates the chromedriver process
        fo = getattr(self, 'fo', None)
        if fo is not None:
            fo.close()
        print('error word', getattr(self, 'errors', []))

    def setOpts(self):
        """Configure Chrome: quiet logging plus a desktop user agent."""
        self.options = webdriver.ChromeOptions()
        self.options.add_argument('--log-level=3')
        self.options.add_argument(
            '--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/64.0.3282.186 Safari/537.36'
        )

    def save(self, obj):
        """Append one record to the output file as a JSON line."""
        self.fo.write(json.dumps(obj, ensure_ascii=False) + '\n')

    def parse(self, html, word):
        """Extract brand names from the result page and save one record each."""
        soup = BeautifulSoup(html, 'lxml')
        rank = itertools.count(1)
        # Brand-logo panel first; fall back to the plain-text brand list.
        nodes = soup.select(self.sle_node1) or soup.select(self.sle_node2)
        for node in nodes:
            self.save({
                'word': word,
                'sort': next(rank),
                'brand': node.get_text().strip(),
                'host': self.host,
            })

    def control(self, word):
        """Search *word* on JD, expand the brand filter, then parse the page."""
        wait = WebDriverWait(self.bw, 5)

        # Type the search word
        wait.until(EC.presence_of_element_located((By.ID, self.search_inp_id)))
        inp = self.bw.find_element(By.XPATH, self.search_inp)
        inp.clear()
        inp.send_keys(word)
        time.sleep(1)

        # Click search
        wait.until(EC.presence_of_element_located((By.ID, self.search_but_id)))
        self.bw.find_element(By.XPATH, self.search_but).click()
        time.sleep(2)

        # Expand the full brand list ("more")
        wait.until(EC.presence_of_element_located((By.ID, self.expand_but_id)))
        self.bw.find_element(By.XPATH, self.expand_but).click()
        time.sleep(2)

        self.parse(self.bw.page_source, word)

    def setAttrs(self):
        """Set JD-specific URLs, element locators and output metadata."""
        self.login_url = 'https://passport.jd.com/uc/login?'
        self.start_url = 'https://search.jd.com/Search?'

        self.search_inp_id = 'key'
        self.search_inp = '//*[@id="key"]'
        self.search_but_id = 'search-2014'
        self.search_but = '//*[@id="search-2014"]/div/button'
        self.expand_but_id = 'J_selector'
        self.expand_but = '//*[@id="J_selector"]/div[1]/div/div[3]/a[2]'

        self.host = 'JD'
        self.sle_node1 = '#J_selector > div.J_selectorLine.s-brand > div > div.sl-value > div.sl-v-logos > ul li a'
        self.sle_node2 = '#J_selector > div.J_selectorLine.s-brand > div > div.sl-value > div.sl-v-list > ul li a'

    def login(self):
        """Open the login page, wait for a manual login, then go to the start page."""
        self.bw.get(self.login_url)
        input('登錄後回車...')
        self.bw.get(self.start_url)
        time.sleep(3)

    def getWords(self):
        """Return the list of search words to scrape."""
        # words = pd.read_excel('搜索依據.xlsx',sheet_name=0)['word'].tolist()
        words = ['衣架', '保鮮膜', '雨傘', '記憶枕', '毛巾', '飯盒', '洗手液', '油漆', '牆貼', '檯燈', '節能燈', '數據線', '充電寶', '攝影']
        return words

    def action(self):
        """Log in, then scrape each search word, collecting failures."""
        self.setAttrs()
        self.login()
        for word in self.getWords():
            try:
                self.control(word)
                print('num【%s】' % next(self.item_cnt), word, 'is ok...')
            except Exception as e:
                self.errors.append(word)
                print(word, 'error', repr(e))



if __name__ == '__main__':
    # Entry point: build the spider and run the full scrape.
    spider = Spider()
    spider.action()

天貓

# -*- coding:utf-8 -*-
# 爬蟲_selenium

import json
import itertools

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup
import time
import pandas as pd 
import numpy as np 


class Spider:
    """Scrape brand names from Tmall search-result filter panels via Selenium."""

    def __init__(self):
        # Selenium 4 removed the ``chrome_options=`` and ``executable_path=``
        # kwargs; the driver path now goes through a Service object, matching
        # the v4 find_element(By...) calls used below.  Function-scope import
        # keeps the top of the script unchanged.
        from selenium.webdriver.chrome.service import Service
        self.setOpts()
        self.bw = webdriver.Chrome(
            options=self.options,
            service=Service('./Chrome/chromedriver'),
        )
        self.fo = open('down.txt', 'a', encoding='utf-8')
        self.item_cnt = itertools.count(1)   # running count of finished words
        self.errors = []                     # words that raised during scraping

    def __del__(self):
        # __init__ may have died before these attributes existed; guard each.
        bw = getattr(self, 'bw', None)
        if bw is not None:
            bw.quit()   # quit() also terminates the chromedriver process
        fo = getattr(self, 'fo', None)
        if fo is not None:
            fo.close()
        print('error word', getattr(self, 'errors', []))

    def setOpts(self):
        """Configure Chrome: quiet logging plus a desktop user agent."""
        self.options = webdriver.ChromeOptions()
        self.options.add_argument('--log-level=3')
        self.options.add_argument(
            '--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/64.0.3282.186 Safari/537.36'
        )

    def save(self, obj):
        """Append one record to the output file as a JSON line."""
        self.fo.write(json.dumps(obj, ensure_ascii=False) + '\n')

    def parse(self, html, word):
        """Extract brand names from the result page and save one record each."""
        soup = BeautifulSoup(html, 'lxml')
        rank = itertools.count(1)
        for node in soup.select(self.sle_node):
            self.save({
                'word': word,
                'sort': next(rank),
                # strip() for parity with the JD spider's output format
                'brand': node.get_text().strip(),
                'host': self.host,
            })

    def control(self, word):
        """Search *word* on Tmall, expand the brand filter, then parse."""
        wait = WebDriverWait(self.bw, 5)

        # Wait for the search input before typing (was missing: on a slow
        # page load, clear() raised NoSuchElementException).
        wait.until(EC.presence_of_element_located((By.XPATH, self.search_inp)))
        inp = self.bw.find_element(By.XPATH, self.search_inp)
        inp.clear()
        inp.send_keys(word)

        # Click search
        wait.until(EC.presence_of_element_located((By.ID, self.search_but_id)))
        self.bw.find_element(By.XPATH, self.search_but).click()
        time.sleep(3)

        # Expand the full brand list ("more")
        wait.until(EC.presence_of_element_located((By.ID, self.expand_but_id)))
        self.bw.find_element(By.XPATH, self.expand_but).click()
        time.sleep(2)

        self.parse(self.bw.page_source, word)

    def setAttrs(self):
        """Set Tmall-specific URLs, element locators and output metadata."""
        self.login_url = 'https://login.tmall.com/'
        self.start_url = 'https://www.tmall.com/'

        self.search_inp = '//*[@id="mq"]'
        self.search_but_id = 'mallSearch'
        self.search_but = '//*[@id="mallSearch"]/form/fieldset/div/button'
        self.expand_but_id = 'J_NavAttrsForm'
        self.expand_but = '//*[@id="J_NavAttrsForm"]/div/div[1]/div/div[2]/div[2]/a[2]'

        self.host = 'tmall'
        self.sle_node = '#J_NavAttrsForm > div > div.brandAttr.j_nav_brand > div > div.attrValues > ul > li a'

    def login(self):
        """Open the login page, wait for a manual login, then go to the start page."""
        self.setAttrs()
        self.bw.get(self.login_url)
        input('登錄後回車...')
        self.bw.get(self.start_url)
        time.sleep(3)

    def getWords(self):
        """Return the list of search words from the spreadsheet."""
        words = pd.read_excel('搜索依據.xlsx', sheet_name=0)['word'].tolist()
        # words = ['熨斗', '洗衣機', '電視', '榨汁機', '電飯煲', '固態硬盤', '存儲卡', '鼠標']
        return words

    def action(self):
        """Log in, then scrape each search word, collecting failures."""
        self.login()
        for word in self.getWords():
            try:
                self.control(word)
                print('num【%s】' % next(self.item_cnt), word, 'is ok...')
            except Exception as e:
                self.errors.append(word)
                print(word, 'error', repr(e))



if __name__ == '__main__':
    # Entry point: build the spider and run the full scrape.
    spider = Spider()
    spider.action()
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章