selenium 爬蟲
1.加載cookies,加載headers,進行模擬登錄
@加載headers的方法
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from fake_useragent import UserAgent
from selenium import webdriver
# Method 1: copy the PhantomJS capability dict and inject a random User-Agent.
ua = UserAgent()
desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
desired_capabilities["phantomjs.page.settings.userAgent"] = ua.random
# NOTE(review): PhantomJS_Path is not defined anywhere in this snippet —
# it must be set to the phantomjs executable path before this line runs.
driver = webdriver.PhantomJS(executable_path=PhantomJS_Path,desired_capabilities = desired_capabilities)
# Method 2: build the capability dict with a fixed (mobile Chrome) User-Agent.
# NOTE(review): dcap is never passed to a driver here — presumably it would be
# given to webdriver.PhantomJS(...) the same way as Method 1.
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"]=( "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36" )
@等待時間
@睡眠等待(最不好的方法)
time.sleep()
@顯式等待
顯式等待是你在代碼中定義等待一定條件發生後再進一步執行你的代碼。 最糟糕的案例是使用time.sleep(),它將條件設置爲等待一個確切的時間段。 這裏有一些方便的方法讓你只等待需要的時間。WebDriverWait結合ExpectedCondition 是實現的一種方式。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Explicit wait: block until the element with id "myDynamicElement" is
# present in the DOM, or raise TimeoutException after 10 seconds.
driver = webdriver.Firefox()
driver.get("http://somedomain/url_that_delays_loading")
try:
    waiter = WebDriverWait(driver, 10)
    locator = (By.ID, "myDynamicElement")
    target = waiter.until(EC.presence_of_element_located(locator))
finally:
    # Always release the browser, even if the wait times out.
    driver.quit()
@隱式等待
隱式等待是在嘗試發現某個元素的時候,如果沒能立刻發現,就等待固定長度的時間。默認設置是0秒。一旦設置了隱式等待時間,它的作用範圍就是Webdriver對象實例的整個生命週期。
from selenium import webdriver

# Implicit wait: whenever an element is not found immediately, the driver
# polls the DOM for up to 10 seconds before giving up. The setting applies
# for the whole lifetime of this WebDriver instance.
driver = webdriver.Firefox()
driver.implicitly_wait(10)  # seconds
driver.get("http://somedomain/url_that_delays_loading")
dynamic_element = driver.find_element_by_id("myDynamicElement")
發現一個小問題,知道了爲啥我的selenium+PhantomJS,一直登錄不上了,原因是因爲我沒加載完全就結束操作了,注意一定要設置好等待時間,等待加載。
在middleware內集成selenium
class intergration_selenium(object):
    """Scrapy downloader middleware that fetches pages through a logged-in
    PhantomJS browser instead of Scrapy's default downloader.

    The driver is created once in __init__ so that every request reuses the
    same browser session (and thus the login cookies held by that session).
    """

    def __init__(self):
        # Initialise the driver once here to avoid repeated logins and
        # inconsistent window handles across requests.
        self.driver = self.driver_handle()
        self.count = 1  # screenshot counter (screenshot code is disabled below)

    def process_request(self, request, spider):
        """Fetch request.url with PhantomJS for the lagou spider.

        Returns an HtmlResponse (which makes Scrapy skip its own downloader)
        for "lagou_BaseSpider"; returns None implicitly for other spiders so
        they are downloaded normally.
        """
        if spider.name == "lagou_BaseSpider":
            from selenium.webdriver.support.ui import WebDriverWait

            self.driver.get(request.url)
            # Explicit wait for the page to finish loading instead of a
            # fixed time.sleep(8): returns as soon as document.readyState
            # is "complete", raises TimeoutException after 15 seconds.
            WebDriverWait(self.driver, 15).until(
                lambda d: d.execute_script("return document.readyState") == "complete"
            )
            print('正在訪問:', request.url)
            # self.driver.get_screenshot_as_file('{0}_.png'.format(self.count))
            # self.count += 1
            # Hand Scrapy a ready-made response (url, body, encoding, request);
            # returning a Response from a middleware bypasses the downloader.
            return HtmlResponse(
                url=self.driver.current_url,
                body=self.driver.page_source,
                encoding='utf-8',
                request=request,
            )

    def driver_handle(self):
        """Start PhantomJS with a random User-Agent, log in to lagou.com,
        and return the authenticated driver.

        Login cookies live inside the returned driver's session, so callers
        do not need to manage cookies explicitly.
        """
        from selenium.webdriver.support.ui import WebDriverWait

        lagou_login_url = r'https://passport.lagou.com/login/login.html'
        desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
        ua = UserAgent()
        desired_capabilities["phantomjs.page.settings.userAgent"] = ua.random
        driver = webdriver.PhantomJS(
            executable_path=r'E:\PhantomJS\phantomjs-2.1.1-windows\phantomjs-2.1.1-windows\bin\phantomjs.exe',
            desired_capabilities=desired_capabilities)
        driver.get(lagou_login_url)
        # send_keys() returns None, so its result is not worth binding.
        driver.find_element_by_xpath('//input[@placeholder="請輸入常用手機號/郵箱"]').send_keys('******')
        driver.find_element_by_xpath('//input[@placeholder="請輸入密碼"]').send_keys('******')
        driver.find_element_by_xpath('//input[@type="submit"]').click()
        # Explicit wait for the login redirect instead of a fixed
        # time.sleep(10): succeed as soon as the URL leaves the login page.
        WebDriverWait(driver, 15).until(lambda d: d.current_url != lagou_login_url)
        return driver
注意這句,導出4個值,分別爲url,html,encoding,request,直接返回HtmlResponse可以令scrapy不再調用downloader
return HtmlResponse(url=self.driver.current_url,body=self.driver.page_source,encoding='utf-8',request=request)