# JD (京東) brand spider
import json
import itertools
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np
class Spider:
    """Scrape the brand-filter list shown on JD (jd.com) search result pages.

    For each search word it drives a Chrome browser through the search form,
    expands the brand selector, parses the brand names out of the page source
    and appends one JSON line per brand to ``down.txt``.
    """

    def __init__(self):
        self.setOpts()
        # NOTE(review): chrome_options/executable_path is the Selenium 3 API;
        # Selenium 4 expects options=/service= — confirm installed version.
        self.bw = webdriver.Chrome(chrome_options=self.options,
                                   executable_path='./Chrome/chromedriver')
        self.fo = open('down.txt', 'a', encoding='utf-8')
        self.item_cnt = itertools.count(1)  # running count of processed words
        self.errors = []                    # words whose scrape raised

    def __del__(self):
        # Best-effort cleanup: __init__ may have failed part-way through, so
        # guard every attribute instead of assuming it exists.
        bw = getattr(self, 'bw', None)
        if bw is not None:
            try:
                # quit() terminates the chromedriver process; close() would
                # only close the current window and leak the driver.
                bw.quit()
            except Exception:
                pass
        fo = getattr(self, 'fo', None)
        if fo is not None:
            fo.close()
        print('error word', getattr(self, 'errors', []))

    def setOpts(self):
        """Build the ChromeOptions used to launch the browser."""
        self.options = webdriver.ChromeOptions()
        self.options.add_argument('--log-level=3')  # suppress console noise
        self.options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36')

    def save(self, obj):
        """Append *obj* to the output file as one JSON line (keeps CJK readable)."""
        self.fo.write(json.dumps(obj, ensure_ascii=False) + '\n')

    def parse(self, html, word):
        """Extract brand names from *html* and save one record per brand.

        Tries the logo-style brand list first, then falls back to the plain
        text list; records carry the search word, 1-based rank, brand name
        and host tag.
        """
        soup = BeautifulSoup(html, 'lxml')
        rank = itertools.count(1)
        brands = soup.select(self.sle_node1)
        if not brands:  # page variant without logo list
            brands = soup.select(self.sle_node2)
        for brand in (node.get_text().strip() for node in brands):
            self.save({
                'word': word,
                'sort': next(rank),
                'brand': brand,
                'host': self.host,
            })

    def control(self, word):
        """Drive the browser: search *word*, expand the brand panel, parse."""
        wait = WebDriverWait(self.bw, 5)
        wait.until(EC.presence_of_element_located((By.ID, self.search_inp_id)))
        inp = self.bw.find_element(By.XPATH, self.search_inp)
        inp.clear()
        inp.send_keys(word)
        time.sleep(1)
        wait.until(EC.presence_of_element_located((By.ID, self.search_but_id)))
        self.bw.find_element(By.XPATH, self.search_but).click()
        time.sleep(2)  # let the result page render before expanding brands
        wait.until(EC.presence_of_element_located((By.ID, self.expand_but_id)))
        self.bw.find_element(By.XPATH, self.expand_but).click()
        time.sleep(2)
        self.parse(self.bw.page_source, word)

    def setAttrs(self):
        """Define all JD-specific URLs, element locators and CSS selectors."""
        self.login_url = 'https://passport.jd.com/uc/login?'
        self.start_url = 'https://search.jd.com/Search?'
        self.search_inp_id = 'key'
        self.search_inp = '//*[@id="key"]'
        self.search_but_id = 'search-2014'
        self.search_but = '//*[@id="search-2014"]/div/button'
        self.expand_but_id = 'J_selector'
        self.expand_but = '//*[@id="J_selector"]/div[1]/div/div[3]/a[2]'
        self.host = 'JD'
        # Two page variants: brand logos vs. plain brand-name list.
        self.sle_node1 = '#J_selector > div.J_selectorLine.s-brand > div > div.sl-value > div.sl-v-logos > ul li a'
        self.sle_node2 = '#J_selector > div.J_selectorLine.s-brand > div > div.sl-value > div.sl-v-list > ul li a'

    def login(self):
        """Open the login page, wait for the user to log in manually, then go to the search page."""
        self.bw.get(self.login_url)
        input('登錄後回車...')
        self.bw.get(self.start_url)
        time.sleep(3)

    def getWords(self):
        """Return the list of search words to scrape."""
        words = ['衣架', '保鮮膜', '雨傘', '記憶枕', '毛巾', '飯盒', '洗手液', '油漆', '牆貼', '檯燈', '節能燈', '數據線', '充電寶', '攝影']
        return words

    def action(self):
        """Main entry point: log in, then scrape each word, collecting failures."""
        self.setAttrs()
        self.login()
        for word in self.getWords():
            try:
                self.control(word)
                print('num【%s】' % next(self.item_cnt), word, 'is ok...')
            except Exception as e:
                # Keep going on failure; failed words are reported at shutdown.
                self.errors.append(word)
                print(word, 'error', repr(e))
if __name__ == '__main__':
    # Script entry point: build the spider and run the full scrape.
    sp = Spider()
    sp.action()
# Tmall (天貓) brand spider
import json
import itertools
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np
class Spider:
    """Scrape the brand-filter list shown on Tmall (tmall.com) search pages.

    For each search word (read from an Excel sheet) it drives a Chrome
    browser through the search form, expands the brand attribute panel,
    parses the brand names and appends one JSON line per brand to
    ``down.txt``.
    """

    def __init__(self):
        self.setOpts()
        # NOTE(review): chrome_options/executable_path is the Selenium 3 API;
        # Selenium 4 expects options=/service= — confirm installed version.
        self.bw = webdriver.Chrome(chrome_options=self.options,
                                   executable_path='./Chrome/chromedriver')
        self.fo = open('down.txt', 'a', encoding='utf-8')
        self.item_cnt = itertools.count(1)  # running count of processed words
        self.errors = []                    # words whose scrape raised

    def __del__(self):
        # Best-effort cleanup: __init__ may have failed part-way through, so
        # guard every attribute instead of assuming it exists.
        bw = getattr(self, 'bw', None)
        if bw is not None:
            try:
                # quit() terminates the chromedriver process; close() would
                # only close the current window and leak the driver.
                bw.quit()
            except Exception:
                pass
        fo = getattr(self, 'fo', None)
        if fo is not None:
            fo.close()
        print('error word', getattr(self, 'errors', []))

    def setOpts(self):
        """Build the ChromeOptions used to launch the browser."""
        self.options = webdriver.ChromeOptions()
        self.options.add_argument('--log-level=3')  # suppress console noise
        self.options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36')

    def save(self, obj):
        """Append *obj* to the output file as one JSON line (keeps CJK readable)."""
        self.fo.write(json.dumps(obj, ensure_ascii=False) + '\n')

    def parse(self, html, word):
        """Extract brand names from *html* and save one record per brand.

        Brand text is stripped of surrounding whitespace, matching the JD
        variant of this spider; records carry the search word, 1-based rank,
        brand name and host tag.
        """
        soup = BeautifulSoup(html, 'lxml')
        rank = itertools.count(1)
        for node in soup.select(self.sle_node):
            self.save({
                'word': word,
                'sort': next(rank),
                'brand': node.get_text().strip(),
                'host': self.host,
            })

    def control(self, word):
        """Drive the browser: search *word*, expand the brand panel, parse."""
        wait = WebDriverWait(self.bw, 5)
        inp = self.bw.find_element(By.XPATH, self.search_inp)
        inp.clear()
        inp.send_keys(word)
        wait.until(EC.presence_of_element_located((By.ID, self.search_but_id)))
        self.bw.find_element(By.XPATH, self.search_but).click()
        time.sleep(3)  # let the result page render before expanding brands
        wait.until(EC.presence_of_element_located((By.ID, self.expand_but_id)))
        self.bw.find_element(By.XPATH, self.expand_but).click()
        time.sleep(2)
        self.parse(self.bw.page_source, word)

    def setAttrs(self):
        """Define all Tmall-specific URLs, element locators and CSS selectors."""
        self.login_url = 'https://login.tmall.com/'
        self.start_url = 'https://www.tmall.com/'
        self.search_inp = '//*[@id="mq"]'
        self.search_but_id = 'mallSearch'
        self.search_but = '//*[@id="mallSearch"]/form/fieldset/div/button'
        self.expand_but_id = 'J_NavAttrsForm'
        self.expand_but = '//*[@id="J_NavAttrsForm"]/div/div[1]/div/div[2]/div[2]/a[2]'
        self.host = 'tmall'
        self.sle_node = '#J_NavAttrsForm > div > div.brandAttr.j_nav_brand > div > div.attrValues > ul > li a'

    def login(self):
        """Set locators, open the login page, wait for a manual login, then go to the start page."""
        self.setAttrs()
        self.bw.get(self.login_url)
        input('登錄後回車...')
        self.bw.get(self.start_url)
        time.sleep(3)

    def getWords(self):
        """Read the list of search words from the first sheet of the Excel file."""
        words = pd.read_excel('搜索依據.xlsx', sheet_name=0)['word'].tolist()
        return words

    def action(self):
        """Main entry point: log in, then scrape each word, collecting failures."""
        self.login()
        for word in self.getWords():
            try:
                self.control(word)
                print('num【%s】' % next(self.item_cnt), word, 'is ok...')
            except Exception as e:
                # Keep going on failure; failed words are reported at shutdown.
                self.errors.append(word)
                print(word, 'error', repr(e))
if __name__ == '__main__':
    # Script entry point: build the spider and run the full scrape.
    sp = Spider()
    sp.action()