1.安裝
pip install selenium
2.訪問動態網頁
from selenium.webdriver import Chrome
from scrapy.selector import Selector

# Load the chromedriver. Because the browser executes JavaScript,
# browser.page_source contains ALL data, including dynamically generated content.
browser = Chrome(executable_path="/home/mata/Tools/driver/chromedriver")
try:
    browser.get(
        "https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.1.45766a1570CTei&id=566735119832&skuId=3599500084427&standard=1&user_id=268451883&cat_id=2&is_b=1&rn=b4f8b93029030636b209199c95921f38")
    # For data extraction, scrapy's Selector is recommended over selenium's finders.
    t_selector = Selector(text=browser.page_source)
    # Fixed: .extract() returns the matched text strings; printing the bare
    # SelectorList would show selector objects instead of the price.
    price = t_selector.xpath('//span[@class="tm-price"]/text()').extract()
    print(price)
finally:
    # Always shut the browser down, even if extraction fails.
    browser.quit()
3.模擬登錄知乎
# Simulated zhihu sign-in using the already-created browser.
browser.get("https://www.zhihu.com/signin")

import time

# Give the page 3 seconds to finish rendering; otherwise the input
# elements located below may not exist in the DOM yet.
time.sleep(3)

account_input = browser.find_element_by_xpath(
    '//div[contains(@class,"SignFlow-accountInput")]/input')
account_input.send_keys("13981826640")

password_input = browser.find_element_by_xpath('//div[@class="Input-wrapper"]/input')
password_input.send_keys("545462004GYP")

submit_button = browser.find_element_by_css_selector(
    'div.Login-content button.SignFlow-submitButton')
submit_button.click()

browser.quit()
4.模擬鼠標下拉刷新
browser.get("https://www.oschina.net/blog")

import time

# Wait for the initial page load before scrolling.
time.sleep(5)

# Scroll to the bottom 10 times; each scroll triggers the site's
# infinite-scroll loader to append more entries.
# (Removed the pointless `i += 1` — `for i in range(10)` already advances i.)
for _ in range(10):
    # execute_script runs raw JavaScript inside the page and returns its value.
    page_height = browser.execute_script(
        """
        window.scrollTo(0, document.body.scrollHeight);
        var lenOfPage = document.body.scrollHeight;
        return lenOfPage;
        """
        # Fixed: was `document.bodyscrollHeight` (missing dot), which is
        # undefined in the browser, so lenOfPage was always undefined.
    )
    # Pause so the newly requested content has time to arrive.
    time.sleep(3)
5.設置chromedriver不加載圖片
from selenium import webdriver

# Configure chromedriver to skip image downloads: pages then fetch only
# HTML and JS, which speeds up crawling and saves bandwidth.
chrome_opt = webdriver.ChromeOptions()
no_images_pref = {"profile.managed_default_content_settings.images": 2}
chrome_opt.add_experimental_option("prefs", no_images_pref)

browser = webdriver.Chrome(
    executable_path="/home/mata/Tools/driver/chromedriver",
    chrome_options=chrome_opt,
)
browser.get("https://www.taobao.com")
6.selenium集成到scrapy
middlewares.py中
from scrapy.http import HtmlResponse
class JSageMiddleware(object):
    """Downloader middleware that fetches selected requests through Chrome,
    so JavaScript-generated content is present in the response body."""

    def process_request(self, request, spider):
        # Only intercept requests from the spider that owns a browser.
        if spider.name == "myspider":
            # NOTE: this turns the normally asynchronous download into a
            # synchronous one, hurting throughput. For true async behaviour
            # a custom downloader is required (search GitHub for
            # "scrapy downloader").
            #
            # Fixed: the browser lives on the SPIDER (created in its
            # __init__, see section 6), so use spider.browser here —
            # the original's self.browser would raise AttributeError.
            spider.browser.get(request.url)
            import time
            time.sleep(3)
            # Returning an HtmlResponse short-circuits the download:
            # Scrapy hands this response straight to the spider and the
            # regular downloader is never invoked for this request.
            return HtmlResponse(url=spider.browser.current_url,
                                body=spider.browser.page_source,
                                encoding="utf-8",
                                request=request)
myspder.py中
# 分發器
from scrapy.xlib.pydispatch import dispatcher
# 信號
from scrapy import signals
def __init__(self):
    # One shared Chrome instance for the whole spider, instead of
    # launching a fresh browser for every URL.
    self.browser = webdriver.Chrome(
        executable_path="/home/mata/Tools/driver/chromedriver")
    # When Scrapy fires the spider_closed signal, run our cleanup handler.
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    super(JobboleSpider, self).__init__()
def spider_closed(self, spider):
    # Shut Chrome down when the spider exits.
    # Fixed: quit() terminates the whole chromedriver process;
    # close() (the original call) only closes the current window and can
    # leave the driver process running. quit() also matches the rest of
    # this file's snippets.
    print("spider closed")
    self.browser.quit()
7.chrome無界面運行
from pyvirtualdisplay import Display

# Fixed: the original imported Display but never used it, so Chrome still
# needed a real screen. Start a virtual X display (Xvfb) first, then any
# browser launched afterwards renders into it — i.e. runs "headless".
display = Display(visible=0, size=(800, 600))
display.start()

browser = Chrome(executable_path="/home/mata/Tools/driver/chromedriver")
browser.get(target_url)
print(browser.page_source)
....
Error: easyprocess.EasyProcessCheckInstalledError: cmd=['Xvfb', '-help'] OSError=[Errno 2] No such file or directory
[Solution]
sudo apt-get install xvfb
pip install xvfbwrapper
from:[https://stackoverflow.com/questions/32173839/easyprocess-easyprocesscheckinstallederror-cmd-xvfb-help-oserror-errno]