c# http請求ajax頁面

  我們在用Http請求的時候,某些頁面是ajax加載的,所以請求過來的頁面數據不完整。也就是說ajax局部加載數據的地方,我們請求不到,這時候該怎麼辦呢?

  WebDriver+phantomjs 這兩個組合在一起使用,可以完成此任務。分別簡單介紹下,WebDriver是一個前端的自動化測試框架,phantomjs是一個無界面的瀏覽器,基於webkit。WebDriver調用phantomjs.exe工作。下面是WebDriver提供的API,看來它能驅動各種瀏覽器工作。

        

  使用前準備:

       在NuGet上下載 Selenium.WebDriver 和 Selenium.PhantomJS.WebDriver 兩個包,在項目中引用 WebDriver.dll,並確保輸出目錄下有 phantomjs.exe。

  我們看一個完整的例子:

  

using OpenQA.Selenium;
using OpenQA.Selenium.PhantomJS;
using OpenQA.Selenium.Support.UI;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;

namespace ConsoleApplication1
{
    /// <summary>
    /// Contract for a crawler that loads a page asynchronously and reports its
    /// progress through events.
    /// </summary>
    public interface ICrawler
    {
        /// <summary>Event for the start of a crawl; carries the target address.</summary>
        event EventHandler<OnStartEventArgs> OnStart;
        /// <summary>Event for a finished crawl; carries the page source and timing data.</summary>
        event EventHandler<OnCompletedEvent> OnCompleted;
        /// <summary>Event for a failed crawl; carries the address and the exception.</summary>
        event EventHandler<OnErrorEventArgs> OnError;

        /// <summary>
        /// Starts crawling <paramref name="uri"/>.
        /// </summary>
        /// <param name="uri">Address of the page to load.</param>
        /// <param name="script">Script to run in the page.</param>
        /// <param name="opreation">
        /// Custom action, readiness condition and timeout for the crawl. The name is a
        /// misspelling of "operation"; it is kept because renaming it would break callers
        /// that use named arguments.
        /// </param>
        /// <returns>A task representing the crawl.</returns>
        Task Start(Uri uri, Script script, Operation opreation);
    }

    /// <summary>
    /// Describes what a crawler should do with a loaded page: an optional custom
    /// action, an optional readiness condition, and how long to wait for that condition.
    /// </summary>
    public class Operation
    {
        /// <summary>
        /// Optional callback that receives the driver so the caller can interact with the page.
        /// </summary>
        public Action<PhantomJSDriver> Action { get; set; }

        /// <summary>
        /// Optional predicate that tells the crawler the page is ready; it should return
        /// <c>true</c> once the awaited content is present.
        /// </summary>
        public Func<IWebDriver, bool> Condition { get; set; }

        /// <summary>
        /// Maximum time to wait for <see cref="Condition"/>, in milliseconds.
        /// The lower-case name is kept for compatibility with existing callers.
        /// </summary>
        public int timeout { get; set; }
    }

    /// <summary>
    /// A script to execute in the loaded page, together with its arguments.
    /// </summary>
    public class Script
    {
        /// <summary>Source text of the script.</summary>
        public string Code { get; set; }

        /// <summary>Arguments handed to the script when it is executed.</summary>
        public object[] Args { get; set; }
    }

    /// <summary>
    /// Event data describing a crawl that is about to start.
    /// </summary>
    /// <remarks>
    /// Derives from <see cref="EventArgs"/> so the type follows the standard .NET event
    /// pattern and satisfies the <c>EventHandler&lt;TEventArgs&gt;</c> constraint on
    /// framework versions that still enforce it.
    /// </remarks>
    public class OnStartEventArgs : EventArgs
    {
        /// <summary>Address of the page being crawled.</summary>
        public Uri Uri { get; set; }

        /// <summary>Creates the event data for the given address.</summary>
        /// <param name="uri">Address of the page being crawled.</param>
        public OnStartEventArgs(Uri uri)
        {
            this.Uri = uri;
        }
    }

    /// <summary>
    /// Event data describing a crawl that failed.
    /// </summary>
    /// <remarks>
    /// Derives from <see cref="EventArgs"/> so the type follows the standard .NET event
    /// pattern and satisfies the <c>EventHandler&lt;TEventArgs&gt;</c> constraint on
    /// framework versions that still enforce it.
    /// </remarks>
    public class OnErrorEventArgs : EventArgs
    {
        /// <summary>Address of the page that was being crawled.</summary>
        public Uri Uri { get; set; }

        /// <summary>The exception that interrupted the crawl.</summary>
        public Exception Exception { get; set; }

        /// <summary>Creates the event data for a failed crawl.</summary>
        /// <param name="uri">Address of the page that was being crawled.</param>
        /// <param name="ex">The exception that interrupted the crawl.</param>
        public OnErrorEventArgs(Uri uri, Exception ex)
        {
            this.Uri = uri;
            this.Exception = ex;
        }
    }



    /// <summary>
    /// Event data describing a crawl that finished successfully.
    /// </summary>
    /// <remarks>
    /// Derives from <see cref="EventArgs"/> so the type follows the standard .NET event
    /// pattern. The type name lacks the conventional "EventArgs" suffix; it is kept so
    /// existing subscribers continue to compile.
    /// </remarks>
    public class OnCompletedEvent : EventArgs
    {
        /// <summary>Address of the page that was crawled.</summary>
        public Uri Uri { get; set; }

        /// <summary>Managed id of the thread the crawl ran on.</summary>
        public int ThreadId { get; set; }

        /// <summary>Source of the page as reported by the driver.</summary>
        public string PageSource { get; private set; }

        /// <summary>Time the crawl took, in milliseconds.</summary>
        public long Milliseconds { get; private set; }

        /// <summary>
        /// Driver that loaded the page. <see cref="HighCrawler"/> shuts this driver down as
        /// soon as the completion handlers return, so it must only be used synchronously
        /// inside the handler.
        /// </summary>
        public PhantomJSDriver Driver { get; private set; }

        /// <summary>Creates the event data for a completed crawl.</summary>
        /// <param name="uri">Address of the page that was crawled.</param>
        /// <param name="threadId">Managed id of the thread the crawl ran on.</param>
        /// <param name="pageSource">Source of the page as reported by the driver.</param>
        /// <param name="milliseconds">Time the crawl took, in milliseconds.</param>
        /// <param name="driver">Driver that loaded the page.</param>
        public OnCompletedEvent(Uri uri, int threadId, string pageSource, long milliseconds, PhantomJSDriver driver)
        {
            this.Uri = uri;
            this.ThreadId = threadId;
            this.PageSource = pageSource;
            this.Milliseconds = milliseconds;
            this.Driver = driver;
        }
    }

    /// <summary>
    /// <see cref="ICrawler"/> implementation that loads a page in a headless PhantomJS
    /// browser, optionally runs a script and a custom action, waits for a readiness
    /// condition, and then reports the rendered page source through events.
    /// </summary>
    public class HighCrawler : ICrawler
    {
        /// <summary>Raised on the worker task before the browser is created.</summary>
        public event EventHandler<OnStartEventArgs> OnStart;

        /// <summary>
        /// Raised on the worker task once the page is ready. The driver in the event data
        /// is shut down right after the handlers return.
        /// </summary>
        public event EventHandler<OnCompletedEvent> OnCompleted;

        /// <summary>
        /// Raised on the worker task when navigation, the script, the action, the wait or
        /// a completion handler throws.
        /// </summary>
        public event EventHandler<OnErrorEventArgs> OnError;

        // NOTE(review): one service/options pair is shared by every driver this class
        // creates. Confirm that Selenium supports reusing a single PhantomJSDriverService
        // across several (possibly concurrent) drivers.
        private static readonly PhantomJSOptions _options;
        private static readonly PhantomJSDriverService _service;

        static HighCrawler()
        {
            var service = PhantomJSDriverService.CreateDefaultService();
            service.DiskCache = true;
            service.IgnoreSslErrors = true;
            service.HideCommandPromptWindow = true;
            service.LoadImages = false;
            service.LocalToRemoteUrlAccess = true;

            _service = service;

            _options = new PhantomJSOptions();
        }

        /// <summary>
        /// Crawls <paramref name="uri"/> on a background task.
        /// </summary>
        /// <param name="uri">Address of the page to load.</param>
        /// <param name="script">Script to execute after navigation, or <c>null</c> for none.</param>
        /// <param name="operation">
        /// Custom action, readiness condition and timeout (milliseconds), or <c>null</c>
        /// to skip both the action and the wait.
        /// </param>
        /// <returns>
        /// A task that completes when the crawl has finished and the browser has been shut
        /// down. Failures inside the crawl are reported through <see cref="OnError"/>;
        /// exceptions thrown by <see cref="OnStart"/> handlers, by driver construction or
        /// by driver shutdown fault the task instead.
        /// </returns>
        public Task Start(Uri uri, Script script, Operation operation)
        {
            return Task.Factory.StartNew(() =>
            {
                // Copy each event to a local before invoking so that a concurrent
                // unsubscribe cannot null it between the check and the call.
                var startHandler = OnStart;
                if (startHandler != null)
                {
                    startHandler(this, new OnStartEventArgs(uri));
                }

                var driver = new PhantomJSDriver(_service, _options);
                try
                {
                    // Stopwatch gives the total elapsed time; TimeSpan.Milliseconds would
                    // only yield the 0-999 ms component and silently drop whole seconds.
                    var watch = System.Diagnostics.Stopwatch.StartNew();
                    driver.Navigate().GoToUrl(uri.ToString());

                    if (script != null)
                    {
                        driver.ExecuteScript(script.Code, script.Args);
                    }

                    if (operation != null)
                    {
                        if (operation.Action != null)
                        {
                            operation.Action.Invoke(driver);
                        }

                        if (operation.Condition != null)
                        {
                            // Set the timeout for the readiness wait.
                            var driverWait = new WebDriverWait(driver, TimeSpan.FromMilliseconds(operation.timeout));
                            driverWait.Until(operation.Condition);
                        }
                    }

                    var threadId = Thread.CurrentThread.ManagedThreadId;
                    var milliseconds = watch.ElapsedMilliseconds;
                    var pageSource = driver.PageSource;

                    var completedHandler = OnCompleted;
                    if (completedHandler != null)
                    {
                        completedHandler(this, new OnCompletedEvent(uri, threadId, pageSource, milliseconds, driver));
                    }
                }
                catch (Exception ex)
                {
                    var errorHandler = OnError;
                    if (errorHandler != null)
                    {
                        errorHandler(this, new OnErrorEventArgs(uri, ex));
                    }
                }
                finally
                {
                    // Quit() closes every window and ends the session. Calling Close()
                    // first is redundant, and if it throws the browser process is never
                    // shut down.
                    driver.Quit();
                }
            });
        }
    }
}

  這是封裝了一個類,方便使用,我們看如何使用:

        /// <summary>
        /// Crawls a page with <see cref="HighCrawler"/>, waits until a given element is
        /// displayed, and prints the text of every element matched by an XPath expression.
        /// </summary>
        /// <param name="url">Address of the site to parse.</param>
        /// <param name="waitId">Id of the element to wait for, e.g. "search-main".</param>
        /// <param name="xpath">XPath of the elements to print, e.g. "//div[@class=\"article panel article-result\"]//h5[@class=\"title\"]//a".</param>
        /// <param name="timeout">Maximum time to wait for the element, in milliseconds.</param>
        private static void TestWaitForReady(string url, string waitId, string xpath, int timeout = 10000)
        {
            // Ready as soon as the awaited element is displayed; no extra page interaction.
            var operation = new Operation
            {
                Action = driver => { },
                Condition = driver => driver.FindElement(By.Id(waitId)).Displayed,
                timeout = timeout
            };

            var crawler = new HighCrawler();

            crawler.OnStart += (sender, args) =>
                Console.WriteLine("爬蟲開始抓取地址:" + args.Uri.ToString());

            crawler.OnError += (sender, args) =>
                Console.WriteLine("爬蟲出現錯誤:" + args.Uri.ToString() + ",異常信息" + args.Exception.ToString());

            crawler.OnCompleted += (sender, args) =>
            {
                Console.WriteLine("接收到的源碼長度:" + args.PageSource.Length);

                Thread.Sleep(1000);
                Console.WriteLine("爬蟲結束,花費時間:" + args.Milliseconds);

                // The driver is only usable inside this handler, so query it here.
                foreach (var match in args.Driver.FindElements(By.XPath(xpath)))
                {
                    Console.WriteLine(match.Text);
                }
            };

            // The returned task is not awaited: this method returns while the crawl is
            // still running on its background task.
            crawler.Start(new Uri(url), null, operation);
        }

  取ajax異步結果的核心原理:WebDriver把頁面上的某個元素,作爲標識,一旦出現此元素,表明ajax結束,這時候再返回結果,中間有個等待的過程。

 

  

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章