聲明:文章內容全都是自己的學習總結,如有不對的地方請大家幫忙指出。有需要溝通交流的可加我QQ羣:425120333
前面兩篇的內容主要是爲了這裏更好的獲取頁面而做的準備,當然沒有前兩篇內容也是可以的,不過要將相應的調用語句註釋了。
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.net.HttpURLConnection;
import java.net.Proxy;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.DeflaterInputStream;
import java.util.zip.GZIPInputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
/**
 * Utility for downloading web pages as Jsoup {@link Document}s: optional proxy
 * support, retry-until-valid fetching, gzip/deflate decoding, and charset
 * detection (response header first, then the HTML body, then UTF-8).
 *
 * @Project: TAGLIB-CRAWLER
 * @File: DocumentUtil.java
 * @Date: 2016年11月28日
 * @Author: caiqibin
 */
public class DocumentUtil {

    private static final Log logger = LogFactory.getLog(DocumentUtil.class);

    // Matches "charset=..." in a Content-Type header or an HTML meta tag.
    // Compiled once as a constant instead of on every detectCharset() call.
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset=\"?([\\w\\d-]+)\"?;?", Pattern.CASE_INSENSITIVE);

    /**
     * Fetches a page directly with Jsoup's own HTTP client (35s timeout).
     *
     * @param url absolute URL to fetch
     * @return the parsed page, or {@code null} when the request fails
     */
    public static Document getDocumentWithJsoup(String url) {
        try {
            return Jsoup.connect(url).timeout(35000).get();
        } catch (IOException e) {
            // Hand the Throwable to the logger; no manual stack-trace formatting needed.
            logger.error(e.getMessage(), e);
            return null;
        }
    }

    /**
     * Fetches a page once, without a proxy.
     *
     * @param urlStr absolute URL to fetch
     * @return the parsed page, or {@code null} on failure
     */
    public static Document getDocument(String urlStr) {
        return repeatAcquireDocument(urlStr, false, 1);
    }

    /**
     * Fetches a page through a proxy, trying up to 30 times.
     *
     * @param urlStr absolute URL to fetch
     * @return the parsed page, or {@code null} if every attempt failed
     */
    public static Document getDocumentWithProxy(String urlStr) {
        return repeatAcquireDocument(urlStr, true, 30);
    }

    /**
     * Fetches a page repeatedly until it looks valid. A page is considered
     * valid here by a simple heuristic: it contains a {@code <title>} element.
     *
     * @param urlStr   absolute URL to fetch
     * @param useProxy whether to route the request through ProxyServerUtil's proxy
     * @param times    maximum number of fetch attempts
     * @return the last fetched page (may be {@code null} or still invalid)
     */
    public static Document repeatAcquireDocument(String urlStr, boolean useProxy, int times) {
        VerifyDocument verifyDocument = new VerifyDocument() {
            @Override
            boolean tryAgain(Document document) {
                // Retry when the fetch failed outright or the page has no <title>.
                return document == null || document.select("title").size() == 0;
            }
        };
        return repeatAcquireDocument(urlStr, useProxy, times, verifyDocument);
    }

    /**
     * Fetches a page repeatedly with a caller-supplied validity check.
     *
     * <p>Fix: the attempt counter now includes the first fetch, so at most
     * {@code times} requests are issued. The original issued an uncounted
     * initial fetch (times+1 attempts possible) and looped forever when
     * {@code times == 0} and the page never validated.
     *
     * @param urlStr         absolute URL to fetch
     * @param useProxy       whether to route the request through a proxy
     * @param times          maximum number of fetch attempts
     * @param verifyDocument callback deciding whether to fetch again
     * @return the last fetched page, or {@code null} when no attempt succeeded
     */
    public static Document repeatAcquireDocument(String urlStr, boolean useProxy, int times, VerifyDocument verifyDocument) {
        Document document = null;
        for (int attempt = 0; attempt < times; attempt++) {
            document = getDocument(urlStr, useProxy);
            if (!verifyDocument.tryAgain(document)) {
                break;
            }
        }
        return document;
    }

    /**
     * Fetches a single page over HttpURLConnection, optionally via a proxy.
     *
     * @param urlStr   absolute URL to fetch
     * @param useProxy whether to use ProxyServerUtil's proxy
     * @return the parsed page, or {@code null} on a non-200 response or I/O error
     */
    public static Document getDocument(String urlStr, boolean useProxy) {
        HttpURLConnection connection = null;
        try {
            URL url = new URL(urlStr);
            if (useProxy) {
                connection = (HttpURLConnection) url.openConnection(ProxyServerUtil.getProxy());
            } else {
                connection = (HttpURLConnection) url.openConnection();
            }
            connection.addRequestProperty("User-Agent", ProxyServerUtil.getUserAgent());
            connection.setConnectTimeout(7000);
            connection.setReadTimeout(7000);
            if (HttpURLConnection.HTTP_OK != connection.getResponseCode()) {
                logger.info("==========獲取頁面出錯 響應的Code是:" + connection.getResponseCode() + "============");
                return null;
            }
            return deCodingConnection(connection);
        } catch (IOException e) {
            logger.info("==========獲取頁面出錯的Url爲:" + urlStr + "============");
            logger.error(e.getMessage(), e);
            return null;
        } finally {
            // The original leaked the connection on the non-200 and exception paths.
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Reads the response body with the correct charset and parses it with Jsoup.
     *
     * @param connection an open connection whose response code was 200
     * @return the parsed page, or {@code null} when reading/decoding fails
     */
    private static Document deCodingConnection(HttpURLConnection connection) {
        try {
            connection.connect();
            // Prefer the charset declared in the Content-Type response header.
            String charset = detectCharset(connection.getHeaderField("Content-Type"));
            InputStream input = getInputStream(connection);
            ByteArrayOutputStream output = new ByteArrayOutputStream();
            try {
                byte[] buffer = new byte[4096];
                int count;
                while ((count = input.read(buffer, 0, buffer.length)) > 0) {
                    output.write(buffer, 0, count);
                }
            } finally {
                input.close(); // original leaked the stream when read() threw
            }
            if (charset == null || "".equals(charset)) {
                // No charset in the header: look for a meta charset inside the HTML.
                charset = detectCharset(output.toString());
                if (charset == null || "".equals(charset)) {
                    charset = "utf-8"; // last-resort default
                }
            }
            return Jsoup.parse(output.toString(charset));
        } catch (Exception e) {
            logger.info("==========解析頁面出錯:" + connection.getURL().toString() + "============");
            logger.error(e.getMessage(), e);
            return null;
        }
    }

    /**
     * Extracts the charset name from a "charset=..." declaration in the given
     * text (header value or raw HTML).
     *
     * @param input text to scan; may be null or empty
     * @return the charset name, or {@code null} when none is found
     */
    private static String detectCharset(String input) {
        if (input != null && !"".equals(input)) {
            Matcher matcher = CHARSET_PATTERN.matcher(input);
            if (matcher.find()) {
                return matcher.group(1);
            }
        }
        return null;
    }

    /**
     * Wraps the response stream according to the Content-Encoding header.
     *
     * <p>Bug fix: deflate responses were decoded with {@link DeflaterInputStream},
     * which COMPRESSES data; {@link InflaterInputStream} is the decompressing
     * counterpart, so deflate-encoded pages previously came back as garbage.
     *
     * @param conn an open connection
     * @return the (possibly decompressing) response stream
     * @throws IOException when the underlying stream cannot be opened
     */
    private static InputStream getInputStream(HttpURLConnection conn) throws IOException {
        String contentEncoding = conn.getHeaderField("Content-Encoding");
        if (contentEncoding != null) {
            contentEncoding = contentEncoding.toLowerCase();
            if (contentEncoding.indexOf("gzip") != -1) {
                return new GZIPInputStream(conn.getInputStream());
            } else if (contentEncoding.indexOf("deflate") != -1) {
                return new InflaterInputStream(conn.getInputStream());
            }
        }
        return conn.getInputStream();
    }
}
/**
 * Callback deciding whether a fetched page must be fetched again.
 *
 * @Project: TAGLIB-CRAWLER
 * @File: DocumentUtil.java
 * @Date: 2016年11月28日
 * @Author: caiqibin
 * @introduction: abstract hook for judging whether a page needs re-fetching
 */
abstract class VerifyDocument {
/**
 * Decides, from the fetched page, whether another fetch attempt is needed.
 *
 * @param document the page just fetched; may be null when the fetch failed
 * @return true to fetch the page again, false to accept it as-is
 */
abstract boolean tryAgain(Document document);
}
通過這個工具方法能獲取絕大多數網頁,不過像HTTPS請求的以及經過頁面跳轉的不一定能獲取到,後面可能還會寫一篇關於這種URL鏈接應該怎麼處理的博客(不一定會出。。。)
附:所用的jsoup在pom文件中的配置
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.7.3</version>
</dependency>