Web scraping: capturing dynamically loaded data that only appears after a mouse click or hover

Test site: https://www.zalando.de/damen-home/ (an e-commerce site).

Our goal is to scrape the links under each sub-category.

These links are loaded dynamically: the corresponding sub-menu only appears once the mouse hovers over the horizontal navigation bar at the top of the page.

Approach

Use Selenium to simulate moving the mouse onto the horizontal navigation bar, then immediately read the dynamic data that gets loaded underneath.

This involves manipulating page elements through the driver.
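
The core of the trick is ActionChains: move_to_element() fires the same mouseover events a real user would, so the site renders the hidden sub-menu into the DOM, where the driver can read it like any other element. Here is a minimal sketch of just that step, written against the same Selenium 3 style API as the script below; the two CSS selectors are hypothetical placeholders, not the real ones used later:

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains

driver = webdriver.Chrome()  # assumes chromedriver is on the PATH
driver.get('https://www.zalando.de/damen-home/')

# hover over one navigation entry (hypothetical selector)
menu_item = driver.find_element_by_css_selector('li.nav-item')
ActionChains(driver).move_to_element(menu_item).perform()

# the sub-menu is now in the DOM and can be read like any static element
for a in driver.find_elements_by_css_selector('a.sub-category-link'):  # hypothetical selector
    print(a.get_attribute('href'))

driver.quit()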

The source code:

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import os,time

# path to chromedriver, adjust to your environment
CHROME_DRIVER_PATH = 'D:\\Code\\imgageRecognition\\site_scrapy\\chromedriver.exe'



#load the dynamic page and return the sub-category links
def get_dynamic_htmlNavLink(site_url):
    print('start loading dynamic page', site_url)
    chrome_options = webdriver.ChromeOptions()
    #ban sandbox
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    #use headless
    #chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--ignore-ssl-errors')
    driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH,chrome_options=chrome_options)
    #print('dynamic laod web is', site_url)
    driver.set_page_load_timeout(100)
    #driver.set_script_timeout(100)
    try:
        driver.get(site_url)
    except Exception as e:
        driver.execute_script('window.stop()')  # stop loading once the timeout is hit
        print(e, 'dynamic web load timeout')
    action = ActionChains(driver)
    women_nav_tag = driver.find_element_by_css_selector('.z-navicat-header_categoryList')
    nav_tag_list = women_nav_tag.find_elements_by_css_selector('li')
    cate_list = []
    for tag in nav_tag_list:
        print(tag.text)
        action.move_to_element(tag).perform()  # hover to make the sub-menu render
        time.sleep(5)
        a_tag_list = driver.find_elements_by_css_selector('a.z-navicat-header_subCategoryLink')
        for a_tag in a_tag_list:
            href = a_tag.get_attribute('href')
            if href:
                print(href)
                cate_list.append(href)
    try:
        driver.quit()
    except:
        pass
    return cate_list


site_url = 'https://www.zalando.de/damen-home/'
get_dynamic_htmlNavLink(site_url)
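
One caveat before copying this into a new environment: the script uses the Selenium 3 style API. In current Selenium 4 releases the find_element_by_* helpers and the executable_path / chrome_options keyword arguments have been removed, so the equivalent calls would look roughly like the sketch below (reusing CHROME_DRIVER_PATH and chrome_options from the script above):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

driver = webdriver.Chrome(service=Service(CHROME_DRIVER_PATH), options=chrome_options)
nav = driver.find_element(By.CSS_SELECTOR, '.z-navicat-header_categoryList')
items = nav.find_elements(By.CSS_SELECTOR, 'li')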

Finally, here is the full script that scrapes all product data under every sub-category of the site; it includes the code above for obtaining the dynamic sub-category links:

import  requests,random,os,xlwt,math,time,re,pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains


CHROME_DRIVER_PATH = 'D:\\Code\\imgageRecognition\\site_scrapy\\chromedriver.exe'
save_path = 'C:\\Users\\SHEIN\\Desktop\\zalando\\'



#fetch a static page and return a BeautifulSoup object
def get_static_html(site_url):
    print('start loading page', site_url)
    headers_list = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    ]
    headers = {
        'user-agent': random.choice(headers_list),
        'Connection': 'keep-alive'
    }
    try:
        resp = requests.get(site_url, headers=headers)
    except Exception as inst:
        print(inst)
        requests.packages.urllib3.disable_warnings()
        resp = requests.get(site_url, headers=headers,verify=False)
    soup = BeautifulSoup(resp.text, 'html.parser')
    return soup



#save the html content to a local file
def download_html(content, html_path):
    if not os.path.exists(html_path):  # create the folder if it does not exist
        os.makedirs(html_path)
    print('download htmlfile path is:','{}.html'.format(html_path))
    try:
        with open('{}.html'.format(html_path), 'w+', encoding="utf-8") as f:
            f.write(content)
            f.close()
    except Exception as e:
        print(e)


#load the dynamic page and return the sub-category links
def get_dynamic_htmlNavLink(site_url):
    print('start loading dynamic page', site_url)
    chrome_options = webdriver.ChromeOptions()
    #ban sandbox
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    #use headless
    #chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--ignore-ssl-errors')
    driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH,chrome_options=chrome_options)
    #print('dynamic laod web is', site_url)
    driver.set_page_load_timeout(100)
    #driver.set_script_timeout(100)
    try:
        driver.get(site_url)
    except Exception as e:
        driver.execute_script('window.stop()')  # stop loading once the timeout is hit
        print(e, 'dynamic web load timeout')
    action = ActionChains(driver)
    women_nav_tag = driver.find_element_by_css_selector('.z-navicat-header_categoryList')
    nav_tag_list = women_nav_tag.find_elements_by_css_selector('li')
    cate_list = []
    for tag in nav_tag_list:
        print(tag.text)
        action.move_to_element(tag).perform()  # hover to make the sub-menu render
        time.sleep(5)
        a_tag_list = driver.find_elements_by_css_selector('a.z-navicat-header_subCategoryLink')
        for a_tag in a_tag_list:
            href = a_tag.get_attribute('href')
            if href:
                print(href)
                cate_list.append(href)
    try:
        driver.quit()
    except:
        pass
    return cate_list



#export the scraped rows to an Excel (.xls) file
def exportTask(heads,task_done,path,filename):
    if not os.path.exists(path):
        os.makedirs(path)
    task_xls = xlwt.Workbook(encoding='utf-8')
    task_sheet1 = task_xls.add_sheet('sheet1')
    #header row
    header_allign = xlwt.Alignment()
    header_allign.horz = xlwt.Alignment.HORZ_CENTER
    header_style = xlwt.XFStyle()
    header_style.alignment = header_allign
    for i in  range(len(heads)):
        task_sheet1.col(i).width = 12000
        task_sheet1.write(0,i,heads[i],header_style)
    #write the data rows
    for i in range(len(task_done)):
        for j in range(len(heads)):
            task_sheet1.write(i+1,j,task_done[i][heads[j]])
    filename = "{0}.xls".format(filename.replace(':','-'))
    print(os.path.join(path,filename))
    task_xls.save(os.path.join(path,filename))
    return filename



#get the total number of pages for a category
def getTotalPageNums(url):
    soup = get_static_html(url)

    #1. check whether the category title exists
    if len(soup.select('.cat_main-1dxBH')) == 0:
        return 0,''
    cate = soup.select('.cat_main-1dxBH')[0].text
    #2. check whether pagination exists
    exist_pagebean = soup.select('.cat_label-2W3Y8')
    if len(exist_pagebean) == 0:#no pagination, single page
        return 1,cate
    page_tag= exist_pagebean[0]
    page_msg = page_tag.text

    page_num = int(page_msg.split(' ')[3])  # token 3 of the pagination label is taken as the last page number
    if '?' in url:
        url_page = '{0}&p={1}'.format(url, page_num)
    else:
        url_page = '{0}?p={1}'.format(url, page_num)
    soup2 = get_static_html(url_page)
    page_tag2 = soup2.select('.cat_label-2W3Y8')[0]
    page_msg2 = page_tag2.text
    page_num2 = int(page_msg2.split(' ')[1])
    print(page_num2,cate)
    return page_num2,cate



#get all product info from one page of a category
def getInfoFromSoup(cate_url,url):
    soup = get_static_html(url)
    #print(soup.prettify())
    if len(soup.select('.cat_articleContain-1Z60A')) == 0:#no products on this page
        print('no products')
        return []
    else:
        cate = soup.select('.cat_main-1dxBH')[0].text
        info_list = []
        for tag in soup.select('.cat_articleContain-1Z60A'):
            info = { 'cate_url': cate_url,'cate': cate}

            link_tag = tag.select('.cat_infoDetail--ePcG')[0]
            info['product link'] = 'https://www.zalando.de' + link_tag.attrs['href']

            desc_brand_tag = tag.select('.cat_brandName-2XZRz')[0]
            desc_article_tag = tag.select('.cat_articleName--arFp')[0]
            info['desc'] = desc_brand_tag.text + '- ' + desc_article_tag.text


            price_tage = tag.select('.cat_originalPrice-2Oy4G')[0]
            info['price'] = price_tage.text
            pattern = re.compile(r'([0-9 ,]+)')
            price_num_str = pattern.findall(info['price'])
            if len(price_num_str) == 0:
                info['price_num'] = 0
            else:
                info['price_num'] = float(price_num_str[0].replace(',','.'))
                info['price'] = price_num_str[0] + ' €'


            #patttern = re.compile(r'[produkt]\/([0-9 a-z \-]+)\/#image')
            #patttern = re.compile(r'[produkt]\/[a-z 0-9 \-]+\-([0-9]+)\/')
            #info_id = patttern.findall(info['product link'])
            #print(info['product link'])
            info['product_id'] = link_tag.attrs['href']


            info_list.append(info)
            #print(info)
        return info_list



#get the category links from the home page and save them to a text file
def getCateUrl(path):
    url = 'https://www.zalando.de/damen-home/'
    cate_list = get_dynamic_htmlNavLink(url)
    if not os.path.exists(path):
        os.mkdir(path)
    try:
        with open(path + 'all_cate_link.txt', 'a+', encoding="utf-8") as f:
            for cate_url in cate_list:
                f.write(cate_url + '\n')
            f.close()
    except Exception as e:
        print(e)



#save all products of one category to an Excel file
def dowloadExcelByCate(cate_url,path,num):
    pagenum,cate = getTotalPageNums(cate_url)
    info_list = []
    if pagenum > 0:
        for i in range(1, pagenum + 1):
            if '?' in cate_url:
                url_page = '{0}&p={1}'.format(cate_url, i)
            else:
                url_page = '{0}?p={1}'.format(cate_url, i)
            info_list += getInfoFromSoup(cate_url,url_page)
            time.sleep(5)
        heads = ['cate_url','cate','desc','price','price_num','product_id','product link']
        filename = '{0}-{1}'.format(num,cate)
        exportTask(heads, info_list, path, filename)
        try:
            with open(path+'record.txt', 'a+', encoding="utf-8") as f:
                f.write(cate_url+'\n')
                f.close()
        except Exception as e:
            print(e)



#get the category links that have already been processed
def getDoneUrl(path,file_name):
    done_url = []
    if not os.path.exists(os.path.join(path,file_name)):  # nothing recorded yet (e.g. first run)
        return done_url
    with open(os.path.join(path,file_name), 'r', encoding="utf-8") as f:
        url_list = f.readlines()
        for url in url_list:
            done_url.append(url.rstrip('\n'))
        print(done_url)
    return done_url



#merge all per-category .xls files into one Excel workbook
def connectToOne(dir,to_dir):
    excel_list = []
    for file in os.listdir(dir):
        if file.endswith('.xls'):
            print("file:",file)
            excel_list.append(pd.read_excel(os.path.join(dir,file)))
    print('start merging')
    total_excel = pd.concat(excel_list)
    print('writing merged file')
    total_excel.to_excel(os.path.join(to_dir,'asos.xlsx'),index=False)



if __name__ == '__main__':
    cate_url_list = getDoneUrl(save_path,'all_cate_link.txt')
    done_url = getDoneUrl(save_path,'record.txt')
    for i in range(len(cate_url_list)):
        if cate_url_list[i] not in done_url:
            dowloadExcelByCate(cate_url_list[i],save_path,i+1)


    #getCateUrl(save_path)
    #cate_url = 'https://www.zalando.de/beauty-damen/bobbi-brown/?product_group=beauty&order=activation_date'
    #getTotalPageNums(cate_url)
    #getInfoFromSoup(cate_url,cate_url)
    #dowloadExcelByCate(cate_url, save_path, 1)

    #connectToOne(save_path,'C:\\Users\\SHEIN\\Desktop')
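
A note on running this: judging from the commented-out calls in the main block, the intended workflow is to run getCateUrl(save_path) once to generate all_cate_link.txt, then switch to the main loop, which skips every category already listed in record.txt (dowloadExcelByCate appends a line to that file after each category finishes, so an interrupted run can be resumed without redoing completed categories). connectToOne can be called at the end to merge the per-category .xls files into a single workbook.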

 
