selenium動態網頁請求

1.安裝

pip install selenium

2.訪問動態網頁

from selenium.webdriver import Chrome
from scrapy.selector import Selector

# Start Chrome through the local chromedriver binary.
browser = Chrome(executable_path="/home/mata/Tools/driver/chromedriver")
browser.get(
    "https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.1.45766a1570CTei&id=566735119832&skuId=3599500084427&standard=1&user_id=268451883&cat_id=2&is_b=1&rn=b4f8b93029030636b209199c95921f38")

# browser.page_source contains the full rendered page,
# including JS-generated content.
# print(browser.page_source)

# For extraction, scrapy's Selector works well on the rendered source.
page = Selector(text=browser.page_source)
price = page.xpath('//span[@class="tm-price"]/text()')
print(price)
browser.quit()

3.模擬登錄知乎

# Open the sign-in page, then wait so the dynamically rendered inputs
# exist before we try to locate them.
browser.get("https://www.zhihu.com/signin")
import time
time.sleep(3)

# Fill in the account and password fields, then submit the form.
account_input = browser.find_element_by_xpath(
    '//div[contains(@class,"SignFlow-accountInput")]/input')
account_input.send_keys("13981826640")

password_input = browser.find_element_by_xpath('//div[@class="Input-wrapper"]/input')
password_input.send_keys("545462004GYP")

browser.find_element_by_css_selector('div.Login-content button.SignFlow-submitButton').click()
browser.quit()

4.模擬鼠標下拉刷新

browser.get("https://www.oschina.net/blog")
import time

# Give the initial page load time to finish before scrolling.
time.sleep(5)

# Scroll to the bottom repeatedly so lazily loaded content is fetched;
# pause between scrolls so the new items have time to render.
# (The original loop body also did `i += 1`, which has no effect on a
# `for` loop variable — removed.)
for _ in range(10):
    # Bug fix: the original JS read `document.bodyscrollHeight` (missing
    # dot), which is undefined, so the script returned `undefined`
    # instead of the page height.
    browser.execute_script(
        """
                window.scrollTo(0,document.body.scrollHeight);
                var lenOfPage=document.body.scrollHeight;
                return lenOfPage;
        """
    )

    time.sleep(3)

5.設置chromedriver不加載圖片

from selenium import webdriver

# Configure chromedriver to skip image downloads (pref value 2 = block);
# JS and HTML are still loaded, which speeds up page fetches.
image_blocking_prefs = {"profile.managed_default_content_settings.images": 2}
chrome_opt = webdriver.ChromeOptions()
chrome_opt.add_experimental_option("prefs", image_blocking_prefs)

browser = webdriver.Chrome(
    executable_path="/home/mata/Tools/driver/chromedriver",
    chrome_options=chrome_opt,
)

browser.get("https://www.taobao.com")

6.selenium集成到scrapy

middlewares.py中

from scrapy.http import HtmlResponse


class JSageMiddleware(object):
    """Downloader middleware that renders selected requests with Chrome.

    For requests issued by the spider named "myspider", the spider's shared
    Chrome instance fetches the URL so JS-generated content is present, and
    an HtmlResponse is returned, which short-circuits Scrapy's downloader.
    """

    def process_request(self, request, spider):
        if spider.name == "myspider":
            # Rendering through Chrome is synchronous and hurts throughput;
            # making this async requires a custom downloader (searchable on
            # GitHub as "scrapy downloader").
            # Bug fix: the original called self.browser.get() here but read
            # spider.browser below. The browser is created on the spider
            # (in its __init__), never on the middleware, so self.browser
            # would raise AttributeError — use spider.browser consistently.
            spider.browser.get(request.url)
            import time
            time.sleep(3)

            # Returning an HtmlResponse tells Scrapy to skip downloading
            # this URL itself and hand the rendered body to the spider.
            return HtmlResponse(url=spider.browser.current_url,
                                body=spider.browser.page_source,
                                encoding="utf-8",
                                request=request)

myspder.py中

# 分發器
from scrapy.xlib.pydispatch import dispatcher
# 信號
from scrapy import signals

    def __init__(self):
        # Share one Chrome instance across the whole crawl instead of
        # launching a new browser for every URL.
        self.browser = webdriver.Chrome(executable_path="/home/mata/Tools/driver/chromedriver")
        super(JobboleSpider, self).__init__()

        # Invoke self.spider_closed when the spider_closed signal fires,
        # so the browser is shut down when the crawl ends.
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        """Signal handler: shut down the shared Chrome when the crawl ends."""
        print("spider closed")
        self.browser.close()

7.chrome無界面運行

from pyvirtualdisplay import Display

# Bug fix: the original imported Display but never used it, so Chrome
# still required a real X server. Start a virtual display (Xvfb) first
# so Chrome can run on a headless machine, as this section intends.
display = Display(visible=0, size=(800, 600))
display.start()

browser = Chrome(executable_path="/home/mata/Tools/driver/chromedriver")
browser.get(target_url)

print(browser.page_source)
....


報錯 easyprocess.EasyProcessCheckInstalledError: cmd=['Xvfb', '-help'] OSError=[Errno 2] No such file or directory 的解決方案:

sudo apt-get install xvfb
pip install xvfbwrapper
from:[https://stackoverflow.com/questions/32173839/easyprocess-easyprocesscheckinstallederror-cmd-xvfb-help-oserror-errno]

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章