我們在用Http請求的時候,某些頁面是ajax加載的,所以請求過來的頁面數據不完整。也就是說ajax局部加載數據的地方,我們請求不到,這時候該怎麼辦呢?
WebDriver+phantomjs 這兩個組合在一起使用,可以完成此任務。分別簡單介紹下,WebDriver是一個前端的自動化測試框架,phantomjs是一個無界面的瀏覽器,基於webkit。WebDriver調用phantomjs.exe工作。下面是WebDriver提供的API,看來它能驅動各種瀏覽器工作。
使用前準備:
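在NuGet上,下載 Selenium.WebDriver、Selenium.Support 和 Selenium.PhantomJS.WebDriver 三個包(下面例子用到的 WebDriverWait 位於 OpenQA.Selenium.Support.UI 命名空間,由 Selenium.Support 包提供),在項目中引用 WebDriver.dll 和 WebDriver.Support.dll,在輸出目錄下要有phantomjs.exe。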
我們看一個完整的例子:
using OpenQA.Selenium;
using OpenQA.Selenium.PhantomJS;
using OpenQA.Selenium.Support.UI;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;

namespace ConsoleApplication1
{
    /// <summary>
    /// Contract for a crawler that loads a page and reports progress through events.
    /// </summary>
    public interface ICrawler
    {
        /// <summary>Raised before the crawl of a URI begins.</summary>
        event EventHandler<OnStartEventArgs> OnStart;

        /// <summary>Raised after the page was loaded and the wait condition was satisfied.</summary>
        event EventHandler<OnCompletedEvent> OnCompleted;

        /// <summary>Raised when the crawl fails with an exception.</summary>
        event EventHandler<OnErrorEventArgs> OnError;

        /// <summary>Starts crawling <paramref name="uri"/> and returns the task that performs the work.</summary>
        /// <param name="uri">Address of the page to load.</param>
        /// <param name="script">Optional JavaScript to execute after navigation; may be null.</param>
        /// <param name="opreation">Optional driver action, wait condition and timeout.</param>
        Task Start(Uri uri, Script script, Operation opreation);
    }

    /// <summary>
    /// Describes what to do with the driver after navigation and what to wait for.
    /// </summary>
    public class Operation
    {
        /// <summary>Optional callback invoked with the driver after navigation (and after the script, if any).</summary>
        public Action<PhantomJSDriver> Action;

        /// <summary>Optional condition polled through <see cref="WebDriverWait"/> until it returns true.</summary>
        public Func<IWebDriver, bool> Condition;

        /// <summary>Maximum time to wait for <see cref="Condition"/>, in milliseconds.</summary>
        public int timeout { get; set; }
    }

    /// <summary>
    /// JavaScript snippet and its arguments, executed in the page after navigation.
    /// </summary>
    public class Script
    {
        /// <summary>JavaScript source passed to the driver's ExecuteScript.</summary>
        public string Code { set; get; }

        /// <summary>Arguments passed alongside <see cref="Code"/>.</summary>
        public object[] Args { set; get; }
    }

    /// <summary>
    /// Payload of <see cref="ICrawler.OnStart"/>.
    /// </summary>
    public class OnStartEventArgs
    {
        /// <summary>Address about to be crawled.</summary>
        public Uri Uri { set; get; }

        public OnStartEventArgs(Uri uri)
        {
            this.Uri = uri;
        }
    }

    /// <summary>
    /// Payload of <see cref="ICrawler.OnError"/>.
    /// </summary>
    public class OnErrorEventArgs
    {
        /// <summary>Address whose crawl failed.</summary>
        public Uri Uri { set; get; }

        /// <summary>Exception caught while crawling.</summary>
        public Exception Exception { set; get; }

        public OnErrorEventArgs(Uri uri, Exception ex)
        {
            this.Uri = uri;
            this.Exception = ex;
        }
    }

    /// <summary>
    /// Payload of <see cref="ICrawler.OnCompleted"/>.
    /// </summary>
    public class OnCompletedEvent
    {
        /// <summary>Address that was crawled.</summary>
        public Uri Uri { set; get; }

        /// <summary>Managed id of the thread that performed the crawl.</summary>
        public int ThreadId { set; get; }

        /// <summary>Page source read from the driver after the wait condition passed.</summary>
        public string PageSource { get; private set; }

        /// <summary>Total elapsed time of the crawl in milliseconds.</summary>
        public long Milliseconds { get; private set; }

        /// <summary>
        /// Driver used for the crawl. The crawler quits it as soon as the OnCompleted
        /// handlers return, so it must only be used inside the handler.
        /// </summary>
        public PhantomJSDriver Driver { get; private set; }

        public OnCompletedEvent(Uri uri, int threadId, string pageSource, long milliseconds, PhantomJSDriver driver)
        {
            this.Uri = uri;
            this.ThreadId = threadId;
            this.PageSource = pageSource;
            this.Milliseconds = milliseconds;
            this.Driver = driver;
        }
    }

    /// <summary>
    /// Crawler that renders pages with PhantomJS so Ajax-loaded content is present in the result.
    /// </summary>
    public class HighCrawler : ICrawler
    {
        public event EventHandler<OnStartEventArgs> OnStart;
        public event EventHandler<OnCompletedEvent> OnCompleted;
        public event EventHandler<OnErrorEventArgs> OnError;

        // NOTE(review): a single service instance is shared by every driver created by this
        // class; confirm this is safe when several crawls run concurrently.
        private static PhantomJSOptions _options;
        private static PhantomJSDriverService _service;

        static HighCrawler()
        {
            var service = PhantomJSDriverService.CreateDefaultService();
            service.DiskCache = true;
            service.IgnoreSslErrors = true;
            service.HideCommandPromptWindow = true;
            service.LoadImages = false;
            service.LocalToRemoteUrlAccess = true;
            _service = service;
            _options = new PhantomJSOptions();
        }

        /// <summary>
        /// Loads <paramref name="uri"/> on a background task, optionally runs a script and a
        /// driver action, waits for the operation's condition, then raises OnCompleted.
        /// Any exception thrown while crawling is reported through OnError instead of
        /// faulting the returned task.
        /// </summary>
        /// <param name="uri">Address of the page to load.</param>
        /// <param name="script">Optional JavaScript to execute after navigation; may be null.</param>
        /// <param name="operation">Driver action, wait condition and timeout (milliseconds).</param>
        /// <returns>The task that performs the crawl.</returns>
        public Task Start(Uri uri, Script script, Operation operation)
        {
            return Task.Factory.StartNew(() =>
            {
                // Copy each delegate to a local before invoking so a concurrent
                // unsubscribe cannot null it between the check and the call.
                var onStart = this.OnStart;
                if (onStart != null)
                {
                    onStart(this, new OnStartEventArgs(uri));
                }

                PhantomJSDriver driver = null;
                try
                {
                    // Created inside the try so a start-up failure is also reported via OnError.
                    driver = new PhantomJSDriver(_service, _options);

                    var watch = Stopwatch.StartNew();
                    driver.Navigate().GoToUrl(uri.ToString());

                    if (script != null)
                    {
                        driver.ExecuteScript(script.Code, script.Args);
                    }

                    if (operation.Action != null)
                    {
                        operation.Action.Invoke(driver);
                    }

                    // Upper bound for how long the condition is polled.
                    var driverWait = new WebDriverWait(driver, TimeSpan.FromMilliseconds(operation.timeout));
                    if (operation.Condition != null)
                    {
                        driverWait.Until(operation.Condition);
                    }

                    var threadId = Thread.CurrentThread.ManagedThreadId;
                    // Total elapsed time; TimeSpan.Milliseconds would only give the 0-999 component.
                    var milliseconds = watch.ElapsedMilliseconds;
                    var pageSource = driver.PageSource;

                    var onCompleted = this.OnCompleted;
                    if (onCompleted != null)
                    {
                        onCompleted(this, new OnCompletedEvent(uri, threadId, pageSource, milliseconds, driver));
                    }
                }
                catch (Exception ex)
                {
                    var onError = this.OnError;
                    if (onError != null)
                    {
                        onError(this, new OnErrorEventArgs(uri, ex));
                    }
                }
                finally
                {
                    // Quit closes every window and ends the session. Calling Close first could
                    // throw and skip Quit, leaving the phantomjs process running.
                    if (driver != null)
                    {
                        driver.Quit();
                    }
                }
            });
        }
    }
}
這是封裝了一個類,方便使用,我們看如何使用:
/// <summary>
/// Crawls a page with <c>HighCrawler</c>, waits until a marker element is displayed,
/// then prints the text of every element matched by an XPath expression.
/// </summary>
/// <param name="url">Address of the page to crawl.</param>
/// <param name="waitId">Id of the element to wait for, e.g. "search-main".</param>
/// <param name="xpath">XPath of the elements to print, e.g. "//div[@class=\"article panel article-result\"]//h5[@class=\"title\"]//a".</param>
/// <param name="timeout">Value assigned to <c>Operation.timeout</c>; defaults to 10000.</param>
private static void TestWaitForReady(string url, string waitId, string xpath, int timeout = 10000)
{
    var crawler = new HighCrawler();

    // No extra driver action; only wait until the marker element is displayed.
    var waitForMarker = new Operation();
    waitForMarker.Action = driver => { };
    waitForMarker.Condition = driver => driver.FindElement(By.Id(waitId)).Displayed;
    waitForMarker.timeout = timeout;

    crawler.OnStart += (sender, args) =>
        Console.WriteLine("爬蟲開始抓取地址:" + args.Uri.ToString());

    crawler.OnError += (sender, args) =>
        Console.WriteLine("爬蟲出現錯誤:" + args.Uri.ToString() + ",異常信息" + args.Exception.ToString());

    crawler.OnCompleted += (sender, args) =>
    {
        Console.WriteLine("接收到的源碼長度:" + args.PageSource.Length);
        Thread.Sleep(1000);
        Console.WriteLine("爬蟲結束,花費時間:" + args.Milliseconds);

        // Query the still-open driver for the requested elements and print their text.
        foreach (var match in args.Driver.FindElements(By.XPath(xpath)))
        {
            Console.WriteLine(match.Text);
        }
    };

    // The returned task is not observed here; results arrive through the events above.
    crawler.Start(new Uri(url), null, waitForMarker);
}
取ajax異步結果的核心原理:WebDriver把頁面上的某個元素作爲標識,一旦出現此元素,表明ajax加載結束,這時候再返回結果,中間有個等待的過程;如果在設定的超時時間內該元素始終沒有出現,等待會拋出異常,並通過OnError事件通知調用方。