以下是兩種爬蟲方式
**1. Jsoup 簡單爬蟲**
首先是普通 jsoup 爬取網頁信息。由於我搭建的是一個簡單的 Maven 項目,所以先上 Maven 依賴(以下 Maven 依賴對兩段代碼都適用):
<!-- jsoup: provides the org.jsoup classes (Jsoup, Connection) used to download pages. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.3</version>
</dependency>
<!-- NOTE(review): presumably added for the javax.xml.xpath API used by JsoupHelper; -->
<!-- confirm whether it is still required, since the JDK itself ships javax.xml.xpath. -->
<dependency>
<groupId>javax.xml</groupId>
<artifactId>jaxp-api</artifactId>
<version>1.4.2</version>
</dependency>
<!-- HtmlCleaner: provides the org.htmlcleaner classes (HtmlCleaner, DomSerializer, TagNode) -->
<!-- that JsoupHelper uses to turn downloaded HTML into a W3C DOM for XPath queries. -->
<dependency>
<groupId>net.sourceforge.htmlcleaner</groupId>
<artifactId>htmlcleaner</artifactId>
<version>2.9</version>
</dependency>
<!-- HtmlUnit: provides the com.gargoylesoftware.htmlunit classes (WebClient, HtmlPage, ...) -->
<!-- used by JsoupHttpClient to fill in and submit a search form. -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.29</version>
</dependency>
代碼部分:
package com.jsoup;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
/**
 * Static helpers that download a page with jsoup, convert its HTML to a W3C DOM
 * through HtmlCleaner, and query that DOM with XPath.
 */
public class JsoupHelper {

    /**
     * Downloads {@code url} and evaluates {@code xpath} against the HTML of its body.
     *
     * @param url   address of the page to download
     * @param xpath XPath expression, evaluated as a node set
     * @return the XPath result (requested as {@link XPathConstants#NODESET}), or
     *         {@code null} when the download fails with an {@link IOException}
     * @throws Exception if the DOM conversion or the XPath evaluation fails
     */
    public static Object fecthNode(String url, String xpath) throws Exception {
        String html;
        try {
            Connection connect = Jsoup.connect(url);
            html = connect.get().body().html();
        } catch (IOException e) {
            // Download failures are reported but not propagated; callers see null.
            e.printStackTrace();
            return null;
        }
        HtmlCleaner hc = new HtmlCleaner();
        TagNode tn = hc.clean(html);
        Document dom = new DomSerializer(new CleanerProperties()).createDOM(tn);
        XPath xPath = XPathFactory.newInstance().newXPath();
        return xPath.evaluate(xpath, dom, XPathConstants.NODESET);
    }

    /**
     * Collects the text content and the {@code href} attribute value of every node
     * matched by {@code xpath} (typically {@code <a>} elements).
     *
     * <p>Each pair is also printed to standard output as {@code text : href}.
     *
     * @param url   address of the page to download
     * @param xpath XPath expression selecting the nodes of interest
     * @return an insertion-ordered map from node text to {@code href} value; the value
     *         is an empty string when the node has no {@code href}. The map is empty
     *         when nothing matches or the page could not be downloaded.
     * @throws Exception if the DOM conversion or the XPath evaluation fails
     */
    public static Map<String, String> fecthByMap(String url, String xpath) throws Exception {
        Map<String, String> nodeMap = new LinkedHashMap<>();
        Object result = fecthNode(url, xpath);
        if (result instanceof NodeList) {
            NodeList nodeList = (NodeList) result;
            for (int i = 0; i < nodeList.getLength(); i++) {
                Node node = nodeList.item(i);
                if (node == null) {
                    continue;
                }
                String text = node.getTextContent();
                String href = attributeValue(node, "href");
                nodeMap.put(text, href);
                System.out.println(text + " : " + href);
            }
        }
        return nodeMap;
    }

    /**
     * Collects the value of one attribute from every node matched by {@code xpath}.
     *
     * <p>Each value is also printed to standard output as {@code text : value}.
     *
     * @param url   address of the page to download
     * @param xpath XPath expression selecting the nodes of interest
     * @param attr  name of the attribute to read from each matched node
     * @return one entry per matched node, in document order of the XPath result; the
     *         entry is an empty string when the node does not carry {@code attr}.
     *         The list is empty when nothing matches or the page could not be downloaded.
     * @throws Exception if the DOM conversion or the XPath evaluation fails
     */
    public static List<String> fecthAttr(String url, String xpath, String attr) throws Exception {
        List<String> list = new ArrayList<>();
        Object result = fecthNode(url, xpath);
        if (result instanceof NodeList) {
            NodeList nodeList = (NodeList) result;
            for (int i = 0; i < nodeList.getLength(); i++) {
                Node node = nodeList.item(i);
                if (node == null) {
                    continue;
                }
                String value = attributeValue(node, attr);
                list.add(value);
                System.out.println(node.getTextContent() + " : " + value);
            }
        }
        return list;
    }

    /**
     * Reads an attribute value without risking a NullPointerException.
     *
     * @param node node to inspect
     * @param name attribute name
     * @return the attribute's text, or an empty string when the node has no attribute
     *         map (non-element nodes such as text nodes) or lacks the attribute
     */
    private static String attributeValue(Node node, String name) {
        if (node.getAttributes() == null) {
            return "";
        }
        Node attribute = node.getAttributes().getNamedItem(name);
        return attribute == null ? "" : attribute.getTextContent();
    }

    public static void main(String[] args) throws Exception {
        fecthByMap("https://www.jianshu.com/u/df0f6525c1c5","//ul[@class='note-list']/li//a[@class='title']");
    }
}
效果如下:
2.jsoup+HtmlUnit動態獲取頁面信息
maven依賴參考文章開頭配置
代碼部分:
package com.jsoup;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlInput;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import java.util.List;
/**
 * Demonstrates driving a search form with HtmlUnit: load a page, type a query,
 * click the search button and print the elements selected by an XPath expression.
 */
public class JsoupHttpClient {

    /**
     * Runs the demo against Bing.
     *
     * <p>XPath cascading selection, using {@code //h3[@class='name']/a} as an example:
     * <ol>
     *   <li>{@code //} selects nodes in the document from the current node, regardless of position</li>
     *   <li>{@code h3} matches {@code <h3>} tags</li>
     *   <li>{@code [@class='name']} requires the {@code class} attribute to equal {@code name}</li>
     *   <li>{@code a} matches {@code <a>} tags</li>
     * </ol>
     */
    public static void main(String[] args) {
        // jsoup("http://wwww.baidu.com","kw","jsoup","su","//h3[@class='t']/a");
        jsoup("https://cn.bing.com/","sb_form_q","jsoup","sb_form_go","//li[@class='b_algo']");
    }

    /**
     * Submits a search and prints the text of every element matched on the result page.
     *
     * <p>Any exception is caught and its stack trace printed; nothing is propagated.
     *
     * @param url      page containing the search form
     * @param inputId  id of the search input box
     * @param inputVal value to type into the input box
     * @param btnId    id of the element that is clicked to start the search
     * @param xpath    XPath expression selecting the elements to print from the result page
     */
    public static void jsoup(String url, String inputId, String inputVal, String btnId, String xpath) {
        // try-with-resources closes the client (and its windows) even when a step fails.
        try (WebClient webClient = new WebClient()) {
            // Disable JavaScript support.
            webClient.getOptions().setJavaScriptEnabled(false);
            // Disable CSS support.
            webClient.getOptions().setCssEnabled(false);
            // Load the requested page.
            HtmlPage page = webClient.getPage(url);
            // Locate the search input box and fill in the query.
            HtmlInput input = page.getHtmlElementById(inputId);
            input.setValueAttribute(inputVal);
            // Locate the search button. Typed as HtmlElement so that a <button> (or any
            // other clickable element) works, not only an <input>.
            HtmlElement btn = page.getHtmlElementById(btnId);
            // Click it to run the search.
            HtmlPage page1 = btn.click();
            // Select the requested elements from the result page.
            List<HtmlElement> elementList = page1.getByXPath(xpath);
            System.out.println("----begin----");
            for (int i = 0; i < elementList.size(); i++) {
                // Print the text of each matched element, numbered from 1.
                System.out.println(i+1+"、"+elementList.get(i).asText());
            }
            System.out.println("----end----");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
效果:
這裏外加一個jsoup爬取豆瓣的小例子:
package com.jsoup;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.io.IOException;
/**
 * Small jsoup example that prints the details of the first entry of the
 * Douban Top 250 movie list.
 */
public class JsoupMovie {

    public static void main(String[] args) {
        crawlMovieinfo();
    }

    /**
     * Downloads the Douban Top 250 page and prints rank, detail URL, title, rating,
     * number of ratings and one-line quote of the first list entry.
     *
     * <p>If the page cannot be downloaded the stack trace is printed and the method
     * returns. A field whose element is missing from the page is printed as an empty
     * string instead of raising a NullPointerException.
     */
    public static void crawlMovieinfo() {
        // 1. Fetch the page.
        final String URL = "https://movie.douban.com/top250";
        Document document;
        try {
            document = Jsoup.connect(URL).get();
        } catch (IOException e) {
            e.printStackTrace();
            // Without a document there is nothing to parse.
            return;
        }
        // 2. Select the movie entries; only the first one is used here as a test.
        Element itemElement = document.select("ol li").first();
        if (itemElement == null) {
            System.err.println("No movie entry found at " + URL);
            return;
        }
        // 3.1 Rank
        System.out.println("排行榜:" + textOf(itemElement.select("em").first()));
        // 3.2 Detail page URL
        Element urlElement = itemElement.select("div.hd a").first();
        String urlString = urlElement == null ? "" : urlElement.attr("href");
        System.out.println("電影介紹網址:" + urlString);
        // 3.3 Title (nested inside the link element)
        String titleString = urlElement == null ? "" : textOf(urlElement.select("span.title").first());
        System.out.println("電影名:" + titleString);
        // 3.4 Rating
        System.out.println("評分:" + textOf(itemElement.select("div.star span.rating_num").first()));
        // 3.5 Number of ratings (last span inside the star block)
        System.out.println("人數:" + textOf(itemElement.select("div.star span").last()));
        // 3.6 One-line quote
        System.out.println("簡評:" + textOf(itemElement.select("p.quote span.inq").first()));
    }

    /**
     * Returns the text of {@code element}, or an empty string when it is {@code null}
     * (which is what {@code first()} / {@code last()} yield for an empty selection).
     */
    private static String textOf(Element element) {
        return element == null ? "" : element.text();
    }
}
效果: