selenium模塊
瀏覽器驅動
import time

from selenium import webdriver  # drives the browser
from selenium.webdriver import ActionChains  # drag gestures, e.g. for slider captchas
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard key constants
from selenium.webdriver.support import expected_conditions as EC  # used together with WebDriverWait
from selenium.webdriver.support.wait import WebDriverWait  # waits for page elements to load

# Obtain the driver: this starts a Chrome session that the snippets below reuse.
chrome = webdriver.Chrome()
顯隱等待
隱式等待:在 chrome.get('xxx') 之前設置,對之後所有元素的查找都有效,例如 chrome.implicitly_wait(10)。 顯式等待:在 chrome.get('xxx') 之後設置,只針對某個指定元素有效,例如 wait = WebDriverWait(chrome, 10),再配合 wait.until(EC.presence_of_element_located((By.ID, 'xxx'))) 使用。
網頁前進後退
# Browser history demo: load three sites in order, then step back and forward.
try:
    for url in (
        'https://www.baidu.com/',
        'https://www.tmall.com/',
        'https://www.jd.com/',
    ):
        chrome.get(url)

    chrome.back()      # go back one entry in the history
    chrome.forward()   # go forward one entry again
    time.sleep(5)      # keep the window open long enough to see the result
finally:
    # Runs whether or not the navigation above raised.
    chrome.close()
JS操作
# JavaScript demo: run a script inside the loaded page.
try:
    chrome.get('https://www.baidu.com/')

    # execute_script takes the JavaScript source as a string; a triple-quoted
    # string can be used to pass a multi-line script.
    alert_js = "alert('傻眼了吧!')"
    chrome.execute_script(alert_js)

    time.sleep(5)  # leave the alert visible for a moment
finally:
    chrome.close()
find_elements_by_xpath(Selenium 4 的寫法為 find_elements(By.XPATH, ...))
操作示例:
<html> <head> <base href='http://example.com/' /> <title>Example website</title> </head> <body> <div id='images'> <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a> <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a> <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a> <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a> <a href='image5.html'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a> </div> </body> </html>
使用路徑表達式來選取 XML 文檔中的節點或節點集
方法/屬性:tag_name、text、get_attribute('屬性名')、location(例如 img.location)
1.從根節點查找: /
2.從全局查找: //
3.查找某一層的下一層: //a/img
4.查找多個: find_elements_by_xpath('//a') ,得到一個列表
5.查找第3個a標籤的img: find_elements_by_xpath('//div/a[3]/img');注意 XPath 的位置從 1 開始計數,不是從 0 開始的索引。
6.查找id屬性: find_elements_by_xpath('//*[@id="images"]/a[3]/img')
交互操作
from selenium import webdriver  # drives the browser
from selenium.webdriver import ActionChains  # drag gestures, e.g. for slider captchas
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard key constants
from selenium.webdriver.support import expected_conditions as EC  # used together with WebDriverWait
from selenium.webdriver.support.wait import WebDriverWait  # waits for page elements to load
import time

# Interaction demo: type into a search box, submit with ENTER, then search
# again by clicking the search button.
chrome = webdriver.Chrome()
# Implicit wait, set before any page is loaded: applies to every lookup below.
chrome.implicitly_wait(10)

try:
    chrome.get('https://www.tmall.com/')

    # find_element(By.X, ...) is used instead of the find_element_by_* helpers,
    # which were removed in Selenium 4.3; this form works on older versions too.
    input_tag = chrome.find_element(By.ID, 'mq')
    input_tag.send_keys('時間革命')
    input_tag.send_keys(Keys.ENTER)

    # Look the search box up again after submitting (the earlier reference
    # presumably no longer belongs to the page now shown), then replace the
    # keyword.
    input_tag = chrome.find_element(By.ID, 'mq')
    input_tag.clear()
    input_tag.send_keys('唐詩三百首')

    # Second search: submit by clicking the button next to the input.
    button = chrome.find_element(
        By.XPATH, '//*[@class="mallSearch-input clearfix"]/button'
    )
    button.click()

    time.sleep(5)  # keep the window open long enough to see the result
finally:
    chrome.close()
# Drag-and-drop demo: drag the 'draggable' element onto the 'droppable' one.
try:
    chrome.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

    # Switch into the child frame before locating the two elements.
    # chrome.switch_to_frame('iframeResult') is the deprecated spelling.
    chrome.switch_to.frame('iframeResult')

    # find_element(By.ID, ...) replaces find_element_by_id, which was removed
    # in Selenium 4.3; this form works on older versions too.
    source = chrome.find_element(By.ID, 'draggable')
    target = chrome.find_element(By.ID, 'droppable')
    print(source, target)

    # To return to the parent page: chrome.switch_to.parent_frame()

    # Option 1: let Selenium perform the whole gesture in one call:
    #   ActionChains(chrome).drag_and_drop(source, target).perform()

    # Option 2: press and hold, move one pixel at a time, then release.
    ActionChains(chrome).click_and_hold(source).perform()

    # Horizontal gap between the two elements' x coordinates.
    distance = target.location.get('x') - source.location.get('x')
    s = 0
    while s < distance:
        ActionChains(chrome).move_by_offset(xoffset=1, yoffset=0).perform()
        s += 1

    ActionChains(chrome).release().perform()

    time.sleep(5)  # keep the window open long enough to see the result
finally:
    chrome.close()
爬取京東商品信息
from selenium import webdriver  # drives the browser
from selenium.webdriver import ActionChains  # drag gestures, e.g. for slider captchas
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard key constants
from selenium.webdriver.support import expected_conditions as EC  # used together with WebDriverWait
from selenium.webdriver.support.wait import WebDriverWait  # waits for page elements to load
import time

# Text block printed and written for each product; filled with
# (link, name, price, comment count).
_GOODS_TEMPLATE = '''
==============tank 商品信息 ================
商品鏈接: %s
商品名稱: %s
商品價格: %s
評論人數: %s
\n
'''


def _scrape_page(driver, key):
    """Print every product on the current results page and append it to '<key>.txt'."""
    goods_list = driver.find_element(By.ID, 'J_goodsList')
    items = goods_list.find_elements(By.CLASS_NAME, 'gl-item')

    # Open the output file once per page instead of once per product.
    with open('%s.txt' % key, 'a', encoding='utf-8') as out_file:
        for item in items:
            detail_link = item.find_element(By.CSS_SELECTOR, '.p-img a').get_attribute('href')
            g_name = item.find_element(By.CSS_SELECTOR, '.p-name em').text
            g_price = item.find_element(By.CSS_SELECTOR, '.p-price i').text
            g_commit = item.find_element(By.CSS_SELECTOR, '.p-commit a').text

            goods = _GOODS_TEMPLATE % (detail_link, g_name, g_price, g_commit)
            print(goods)
            out_file.write(goods)


def drver_star(driver, key, max_pages=None):
    """Scrape the search results page by page, then close the driver.

    Each product's link, name, price and comment count is printed and
    appended to '<key>.txt'. After each page the next-page link is clicked;
    scraping stops when that link cannot be found or when ``max_pages``
    pages have been processed.

    Args:
        driver: WebDriver already showing the first results page.
        key: Search keyword; also used as the output file's base name.
        max_pages: Optional upper bound on the number of pages to scrape.
            ``None`` (the default) means no bound.

    The driver's window is closed exactly once on exit, including when an
    error is raised. Paging is done with a loop rather than recursion, so the
    close is not repeated per page and the page count is not limited by the
    recursion depth.
    """
    # Local import: only this function needs the exception type.
    from selenium.common.exceptions import NoSuchElementException

    pages_done = 0
    try:
        while max_pages is None or pages_done < max_pages:
            _scrape_page(driver, key)
            pages_done += 1

            try:
                # NOTE(review): the link text must match the page's own
                # wording exactly - confirm against the live site.
                next_tag = driver.find_element(By.PARTIAL_LINK_TEXT, '下一頁')
            except NoSuchElementException:
                break  # no next-page link: this was the last page

            next_tag.click()
            time.sleep(2)  # give the next page time to load
    finally:
        driver.close()


if __name__ == '__main__':
    key = input('請輸入爬取的商品內容: ').strip()

    driver = webdriver.Chrome()
    driver.implicitly_wait(10)
    driver.get('https://www.jd.com/')

    # Type the keyword into the search box and submit it with ENTER.
    input_tag = driver.find_element(By.ID, 'key')
    input_tag.send_keys(key)
    input_tag.send_keys(Keys.ENTER)

    drver_star(driver, key)