Java實現爬取網頁數據:PhantomJS+WebDriver

   本文根據工作中爬取數據的需要所做的工作整理而來。最初我使用了HttpClient+Jsoup,然而這種最簡單的方式只能採集普通的靜態頁面數據,以及暴露在瀏覽器F12調試窗口中的可見URL的數據;對於一些需要模仿瀏覽器行爲(比如點擊事件)的頁面,或者採用了JS框架進行重新佈局的頁面,就無能爲力了。因此,對於此類情況,經過摸索,最後得到了這個比較好一點的實踐方式。下面廢話不多說,來一個具體實踐:抓取 https://www.sosobtc.com/ 網頁上的數據。


第一步:創建Maven工程:mycrawler

第二步:導入Maven依賴:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.szzc.crawler</groupId>
  <artifactId>mycrawler</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <properties>
    <!-- The sources contain non-ASCII comments; pin the encoding so the build does not depend on the platform default. -->
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>
  <dependencies>
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-lang3</artifactId>
      <version>3.3.2</version>
    </dependency>
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.8.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.3.5</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.2</version>
      <scope>test</scope>
    </dependency>
    <!-- Selenium WebDriver API (By, WebDriver, WebElement, ChromeDriver, ...) -->
    <dependency>
      <groupId>org.seleniumhq.selenium</groupId>
      <artifactId>selenium-java</artifactId>
      <version>2.53.0</version>
    </dependency>
    <!-- Version and exclusions come from dependencyManagement below -->
    <dependency>
      <groupId>com.opera</groupId>
      <artifactId>operadriver</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-exec</artifactId>
      <version>1.3</version>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.17</version>
    </dependency>
    <!-- Provides org.openqa.selenium.phantomjs.PhantomJSDriver for headless runs -->
    <dependency>
      <groupId>com.github.detro</groupId>
      <artifactId>phantomjsdriver</artifactId>
      <version>1.2.0</version>
    </dependency>
  </dependencies>
  <dependencyManagement>
    <dependencies>
      <dependency>
        <groupId>com.opera</groupId>
        <artifactId>operadriver</artifactId>
        <version>0.16</version>
        <exclusions>
          <!-- Keep operadriver from pulling in its own selenium-remote-driver -->
          <exclusion>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-remote-driver</artifactId>
          </exclusion>
        </exclusions>
      </dependency>
    </dependencies>
  </dependencyManagement>
  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.3</version>
        <configuration>
          <!-- Plugin 3.3 defaults to source/target 1.5; the crawler classes use the
               diamond operator (new HashMap<>()), which needs at least Java 7. -->
          <source>1.7</source>
          <target>1.7</target>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>

第三步:封裝的實體類CoinData:

package com.szzc;

/**
 * Plain data holder for one scraped table row describing a coin on one market.
 *
 * <p>All values except {@code rowId} are kept as the raw strings read from the
 * page; no parsing or validation is performed here.
 */
public class CoinData {

	/** Sequence number of the row within the table it was read from. */
	private Integer rowId;
	/** Trading market name. */
	private String marketName;
	/** Latest price. */
	private String currentPrice;
	/** Platform price. */
	private String platformPrice;
	/** Highest price. */
	private String highestPrice;
	/** Lowest price. */
	private String lowestPrice;
	/** Rise/fall (price change). */
	private String upsAndDowns;
	/** Rate of increase. */
	private String increment;
	/** Trading volume. */
	private String trading;

	/** @return the row sequence number, or {@code null} if not set */
	public Integer getRowId() {
		return rowId;
	}

	/** @param rowId the row sequence number */
	public void setRowId(Integer rowId) {
		this.rowId = rowId;
	}

	/** @return the trading market name */
	public String getMarketName() {
		return marketName;
	}

	/** @param marketName the trading market name */
	public void setMarketName(String marketName) {
		this.marketName = marketName;
	}

	/** @return the latest price */
	public String getCurrentPrice() {
		return currentPrice;
	}

	/** @param currentPrice the latest price */
	public void setCurrentPrice(String currentPrice) {
		this.currentPrice = currentPrice;
	}

	/** @return the platform price */
	public String getPlatformPrice() {
		return platformPrice;
	}

	/** @param platformPrice the platform price */
	public void setPlatformPrice(String platformPrice) {
		this.platformPrice = platformPrice;
	}

	/** @return the highest price */
	public String getHighestPrice() {
		return highestPrice;
	}

	/** @param highestPrice the highest price */
	public void setHighestPrice(String highestPrice) {
		this.highestPrice = highestPrice;
	}

	/** @return the lowest price */
	public String getLowestPrice() {
		return lowestPrice;
	}

	/** @param lowestPrice the lowest price */
	public void setLowestPrice(String lowestPrice) {
		this.lowestPrice = lowestPrice;
	}

	/** @return the rise/fall value */
	public String getUpsAndDowns() {
		return upsAndDowns;
	}

	/** @param upsAndDowns the rise/fall value */
	public void setUpsAndDowns(String upsAndDowns) {
		this.upsAndDowns = upsAndDowns;
	}

	/** @return the rate of increase */
	public String getIncrement() {
		return increment;
	}

	/** @param increment the rate of increase */
	public void setIncrement(String increment) {
		this.increment = increment;
	}

	/** @return the trading volume */
	public String getTrading() {
		return trading;
	}

	/** @param trading the trading volume */
	public void setTrading(String trading) {
		this.trading = trading;
	}

	/**
	 * Renders every field as {@code name=value}.
	 *
	 * <p>The label {@code CurrentPrice} keeps its historical capitalisation so that
	 * existing output stays byte-for-byte the same.
	 */
	@Override
	public String toString() {
		return "CoinData [rowId=" + rowId + ", marketName=" + marketName + ", CurrentPrice=" + currentPrice
				+ ", platformPrice=" + platformPrice + ", highestPrice=" + highestPrice + ", lowestPrice=" + lowestPrice
				+ ", upsAndDowns=" + upsAndDowns + ", increment=" + increment + ", trading=" + trading + "]";
	}

}
第四步:抓取數據的Main方法所在的類:

package com.szzc;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.openqa.selenium.By;   
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;   
/**
 * Scrapes the per-coin market tables of https://www.sosobtc.com/ by driving a
 * real Chrome browser through Selenium WebDriver and prints every row found.
 */
public class FirstTest {

	public static final String TR = "tr";
	public static final String TD = "td";
	/**
	 * Retained only for backward compatibility. Row numbering is now tracked
	 * with a local counter inside {@link #getCoidData}, so this field is no
	 * longer modified and keeps its initial value of 1.
	 */
	public static Integer ROWID = 1;

	/** Number of leading cells of a row that are mapped onto a {@link CoinData}. */
	private static final int COLUMN_COUNT = 8;

	/** Map keys under which the scraped rows are stored, parallel to the id arrays below. */
	private static final String[] COIN_NAMES = {"btc", "ltc", "eth", "etc"};

	/** Ids of the div elements holding each coin's table. */
	private static final String[] tableDiv = {
		"default_market_tabs-pane-btc",
		"default_market_tabs-pane-ltc",
		"default_market_tabs-pane-eth",
		"default_market_tabs-pane-etc"
	};

	/** Ids of the tab elements that are clicked to switch between coins. */
	private static final String[] liIds = {
		"default_market_tabs-tab-btc",
		"default_market_tabs-tab-ltc",
		"default_market_tabs-tab-eth",
		"default_market_tabs-tab-etc"
	};

	/**
	 * Opens the page in Chrome, collects the rows of every coin tab and prints
	 * them to standard output.
	 *
	 * @param args unused
	 * @throws Exception if the browser cannot be driven or the thread is interrupted while waiting
	 */
	public static void main(String[] args) throws Exception {
		// Tell Selenium where the chromedriver executable lives, then start the browser.
		System.setProperty("webdriver.chrome.driver","D:/Google/chromedriver.exe");
		ChromeOptions options = new ChromeOptions();
		options.addArguments("--start-maximized", "allow-running-insecure-content", "--test-type");
		WebDriver driver = new ChromeDriver(options);
		try {
			driver.get("https://www.sosobtc.com/");
			// Give the page time to finish its initial rendering.
			Thread.sleep(5000);
			// Scraped rows, keyed by coin name.
			Map<String,List<CoinData>> data = new HashMap<>();
			// Click each tab in turn and read the table it reveals.
			for (int i = 0; i < liIds.length; i++) {
				data.put(COIN_NAMES[i], getCoidData(driver, liIds[i], tableDiv[i]));
			}
			// Walk COIN_NAMES rather than data.keySet() so the output follows the
			// order in which the tabs were scraped instead of HashMap key order.
			for (String coinName : COIN_NAMES) {
				for (CoinData coinData : data.get(coinName)) {
					System.out.println(coinData);
				}
			}
		} finally {
			// Always close the browser, even when scraping fails part-way through.
			driver.quit();
		}
	}

	/**
	 * Clicks one coin tab and converts the rows of the table it reveals into
	 * {@link CoinData} objects.
	 *
	 * <p>Rows are numbered from 1 on every call. Rows with fewer than
	 * {@value #COLUMN_COUNT} {@code td} cells (for example header rows) are skipped.
	 *
	 * @param driver the browser session, already showing the page
	 * @param liId id of the tab element that switches the displayed table
	 * @param id id of the div that contains the table
	 * @return the parsed rows in document order; empty if no row qualifies
	 * @throws Exception if an element cannot be found or the thread is interrupted while waiting
	 */
	public static List<CoinData> getCoidData(WebDriver driver,String liId,String id) throws Exception {
		// Switch to the requested coin's tab.
		driver.findElement(By.id(liId)).click();
		// Give the table time to refresh.
		Thread.sleep(500L);
		WebElement div = driver.findElement(By.id(id));
		List<WebElement> trs = div.findElements(By.tagName(TR));
		List<CoinData> coinDataList = new ArrayList<>();
		// Local counter: numbering restarts at 1 per call and cannot be left in
		// a stale state if an exception interrupts the loop.
		int rowId = 1;
		for (WebElement tr : trs) {
			List<WebElement> tds = tr.findElements(By.tagName(TD));
			// Skip rows that cannot supply every column read below.
			if (tds == null || tds.size() < COLUMN_COUNT) {
				continue;
			}
			CoinData coinData = new CoinData();
			coinData.setRowId(rowId++);
			coinData.setMarketName(tds.get(0).getText());
			coinData.setCurrentPrice(tds.get(1).getText());
			coinData.setPlatformPrice(tds.get(2).getText());
			coinData.setHighestPrice(tds.get(3).getText());
			coinData.setLowestPrice(tds.get(4).getText());
			coinData.setUpsAndDowns(tds.get(5).getText());
			coinData.setIncrement(tds.get(6).getText());
			coinData.setTrading(tds.get(7).getText());
			coinDataList.add(coinData);
		}
		return coinDataList;
	}
}


現在可以抓取到頁面的不同標籤下的數據了。但是不完美的是,每次運行程序還會有一個瀏覽器的窗口彈出來比較討厭。我們可以使用PhantomJS來實現無界面的瀏覽器效果:

實現抓取的Main函數所在的類:

package com.szzc;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.openqa.selenium.By;   
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.remote.DesiredCapabilities;   
/**
 * Headless variant of the scraper: reads the per-coin market tables of
 * https://www.sosobtc.com/ through PhantomJS instead of a visible browser
 * window and prints every row found.
 */
public class SecondTest {

	public static final String TR = "tr";
	public static final String TD = "td";
	/**
	 * Retained only for backward compatibility. Row numbering is now tracked
	 * with a local counter inside {@link #getCoidData}, so this field is no
	 * longer modified and keeps its initial value of 1.
	 */
	public static Integer ROWID = 1;

	/** System property that holds the PhantomJS executable location. */
	private static final String PHANTOMJS_PATH_PROPERTY = "phantomjs.binary.path";

	/** Number of leading cells of a row that are mapped onto a {@link CoinData}. */
	private static final int COLUMN_COUNT = 8;

	/** Map keys under which the scraped rows are stored, parallel to the id arrays below. */
	private static final String[] COIN_NAMES = {"btc", "ltc", "eth", "etc"};

	/** Ids of the div elements holding each coin's table. */
	private static final String[] tableDiv = {
		"default_market_tabs-pane-btc",
		"default_market_tabs-pane-ltc",
		"default_market_tabs-pane-eth",
		"default_market_tabs-pane-etc"
	};

	/** Ids of the tab elements that are clicked to switch between coins. */
	private static final String[] liIds = {
		"default_market_tabs-tab-btc",
		"default_market_tabs-tab-ltc",
		"default_market_tabs-tab-eth",
		"default_market_tabs-tab-etc"
	};

	/**
	 * Starts PhantomJS, collects the rows of every coin tab and prints them to
	 * standard output.
	 *
	 * @param args unused
	 * @throws Exception if the browser cannot be driven or the thread is interrupted while waiting
	 */
	public static void main(String[] args) throws Exception {
		// Previously the property was set twice in a row, so the Windows path
		// always won. Pick the default by operating system instead, and leave
		// a value supplied by the caller (e.g. via -D) untouched.
		if (System.getProperty(PHANTOMJS_PATH_PROPERTY) == null) {
			System.setProperty(PHANTOMJS_PATH_PROPERTY, defaultPhantomJsPath());
		}
		DesiredCapabilities desiredCapabilities = DesiredCapabilities.phantomjs();
		// Additional capabilities (e.g. browser header information) can be set here.
		WebDriver driver = new PhantomJSDriver(desiredCapabilities);
		try {
			driver.get("https://www.sosobtc.com/");
			// Give the page time to finish its initial rendering.
			Thread.sleep(5000);
			// Scraped rows, keyed by coin name.
			Map<String,List<CoinData>> data = new HashMap<>();
			// Click each tab in turn and read the table it reveals.
			for (int i = 0; i < liIds.length; i++) {
				data.put(COIN_NAMES[i], getCoidData(driver, liIds[i], tableDiv[i]));
			}
			// Walk COIN_NAMES rather than data.keySet() so the output follows the
			// order in which the tabs were scraped instead of HashMap key order.
			for (String coinName : COIN_NAMES) {
				for (CoinData coinData : data.get(coinName)) {
					System.out.println(coinData);
				}
			}
		} finally {
			// Always shut PhantomJS down, even when scraping fails part-way through.
			driver.quit();
		}
	}

	/** Returns the PhantomJS executable path used when none was configured: the bundled .exe on Windows, /usr/bin/phantomjs elsewhere. */
	private static String defaultPhantomJsPath() {
		String osName = System.getProperty("os.name", "");
		return osName.startsWith("Windows") ? "./phantomjs/win/phantomjs.exe" : "/usr/bin/phantomjs";
	}

	/**
	 * Clicks one coin tab and converts the rows of the table it reveals into
	 * {@link CoinData} objects.
	 *
	 * <p>Rows are numbered from 1 on every call. Rows with fewer than
	 * {@value #COLUMN_COUNT} {@code td} cells (for example header rows) are skipped.
	 *
	 * @param driver the browser session, already showing the page
	 * @param liId id of the tab element that switches the displayed table
	 * @param id id of the div that contains the table
	 * @return the parsed rows in document order; empty if no row qualifies
	 * @throws Exception if an element cannot be found or the thread is interrupted while waiting
	 */
	public static List<CoinData> getCoidData(WebDriver driver,String liId,String id) throws Exception {
		// Switch to the requested coin's tab.
		driver.findElement(By.id(liId)).click();
		// Give the table time to refresh.
		Thread.sleep(500L);
		WebElement div = driver.findElement(By.id(id));
		List<WebElement> trs = div.findElements(By.tagName(TR));
		List<CoinData> coinDataList = new ArrayList<>();
		// Local counter: numbering restarts at 1 per call and cannot be left in
		// a stale state if an exception interrupts the loop.
		int rowId = 1;
		for (WebElement tr : trs) {
			List<WebElement> tds = tr.findElements(By.tagName(TD));
			// Skip rows that cannot supply every column read below.
			if (tds == null || tds.size() < COLUMN_COUNT) {
				continue;
			}
			CoinData coinData = new CoinData();
			coinData.setRowId(rowId++);
			coinData.setMarketName(tds.get(0).getText());
			coinData.setCurrentPrice(tds.get(1).getText());
			coinData.setPlatformPrice(tds.get(2).getText());
			coinData.setHighestPrice(tds.get(3).getText());
			coinData.setLowestPrice(tds.get(4).getText());
			coinData.setUpsAndDowns(tds.get(5).getText());
			coinData.setIncrement(tds.get(6).getText());
			coinData.setTrading(tds.get(7).getText());
			coinDataList.add(coinData);
		}
		return coinDataList;
	}
}

至此我們已經可以完美的模仿一個瀏覽器的行爲,來簡單抓取一些網頁的數據了。


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章