Fetching Web Page Content --- Document

    Disclaimer: everything in this article is my own learning summary; if anything is wrong, please help point it out. If you'd like to discuss, you can join my QQ group: 425120333
    The previous two posts were mainly preparation for the page fetching done here. They aren't strictly required, but without them you will need to comment out the corresponding calls.
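The code below calls ProxyServerUtil.getProxy() and ProxyServerUtil.getUserAgent() from those earlier posts. If you don't have them, a minimal stand-in like the following sketch is enough to compile and run; this is a hypothetical placeholder (direct connection, fixed User-Agent), not the implementation from the previous posts:

import java.net.Proxy;

// Minimal stand-in for the ProxyServerUtil built in the previous two posts.
// Only the two methods that DocumentUtil calls are sketched here.
public class ProxyServerUtil {

    // The real version would pick a proxy from a pool; this stand-in
    // simply returns a direct (no-proxy) connection.
    public static Proxy getProxy() {
        return Proxy.NO_PROXY;
    }

    // The real version would rotate User-Agent strings; this one is fixed.
    public static String getUserAgent() {
        return "Mozilla/5.0 (Windows NT 10.0; Win64; x64)";
    }
}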
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.net.HttpURLConnection;
import java.net.Proxy;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.InflaterInputStream;
import java.util.zip.GZIPInputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/**
 *
 * @Project: TAGLIB-CRAWLER
 * @File: DocumentUtil.java
 * @Date: 2016-11-28
 * @Author: caiqibin
 */
public class DocumentUtil {
    // Default number of attempts when fetching through a proxy
    private static final int DEFAULT_TRY_TIMES = 30;

    private static final Log logger = LogFactory.getLog(DocumentUtil.class);

    /**
     * @introduction: Fetch the page directly with Jsoup
     * @param url
     * @return
     */
    public static Document getDocumentWithJsoup(String url) {
        Document document = null;
        try {
            document = Jsoup.connect(url).timeout(35000).get();
        } catch (IOException e) {
            StringWriter sw = new StringWriter();
            PrintWriter pw = new PrintWriter(sw);
            e.printStackTrace(pw);
            logger.error(sw.toString());
        }
        return document;
    }

    /**
     * @introduction: Fetch the page without a proxy, with a single attempt
     * @param urlStr
     * @return
     */
    public static Document getDocument(String urlStr) {
        return repeatAcquireDocument(urlStr, false, 1);
    }

    /**
     * @introduction: Fetch the page through a proxy, with the default number of attempts
     * @param urlStr
     * @return
     */
    public static Document getDocumentWithProxy(String urlStr) {
        return repeatAcquireDocument(urlStr, true, DEFAULT_TRY_TIMES);
    }

    /**
     * @introduction: Try to fetch the page multiple times
     * @param urlStr
     * @param useProxy
     * @param times
     * @return
     */
    public static Document repeatAcquireDocument(String urlStr, boolean useProxy, int times) {
        // A simple check: does the page contain a <title> element?
        VerifyDocument verifyDocument = new VerifyDocument() {
            @Override
            boolean tryAgain(Document document) {
                if (document != null) {
                    Elements elements = document.select("title");
                    if (elements.size() > 0) {
                        return false;
                    }
                }
                return true;
            }
        };
        return repeatAcquireDocument(urlStr, useProxy, times, verifyDocument);
    }

    /**
     * @introduction: Try to fetch the page multiple times (override the page-verification method as needed)
     * @param urlStr
     * @param useProxy
     * @param times
     * @param verifyDocument
     * @return
     */
    public static Document repeatAcquireDocument(String urlStr, boolean useProxy, int times, VerifyDocument verifyDocument) {
        int tryTime = 0;
        Document document = getDocument(urlStr, useProxy);
        // Keep retrying until the verifier is satisfied or the attempt limit is reached
        while (verifyDocument.tryAgain(document) && tryTime < times) {
            document = getDocument(urlStr, useProxy);
            tryTime++;
        }
        return document;
    }

    /**
     * @introduction: Fetch the page
     * @param urlStr
     * @param useProxy
     * @return
     */
    public static Document getDocument(String urlStr, boolean useProxy) {
        try {
            HttpURLConnection connection = null;
            URL url = new URL(urlStr);
            if (useProxy) {
                Proxy proxy = ProxyServerUtil.getProxy();
                connection = (HttpURLConnection) url.openConnection(proxy);
            } else {
                connection = (HttpURLConnection) url.openConnection();
            }
            connection.addRequestProperty("User-Agent", ProxyServerUtil.getUserAgent());
            connection.setConnectTimeout(7000);
            connection.setReadTimeout(7000);
            int responseCode = connection.getResponseCode();
            if (HttpURLConnection.HTTP_OK != responseCode) {
                logger.info("==========Failed to fetch page, response code: " + responseCode + "============");
                return null;
            }
            Document document = deCodingConnection(connection);
            connection.disconnect();
            return document;
        } catch (IOException e) {
            logger.info("==========獲取頁面出錯的Url爲:" + urlStr + "============");
            StringWriter sw = new StringWriter();
            PrintWriter pw = new PrintWriter(sw);
            e.printStackTrace(pw);
            logger.error(sw.toString());
            return null;
        }
    }

    /**
     * @introduction: Decode the response with the correct charset
     * @param connection
     * @return
     */
    private static Document deCodingConnection(HttpURLConnection connection) {
        try {
            connection.connect();
            // Detect the charset so the page doesn't come out garbled
            String charset = connection.getHeaderField("Content-Type");
            charset = detectCharset(charset);
            InputStream input = getInputStream(connection);
            ByteArrayOutputStream output = new ByteArrayOutputStream();
            int count;
            byte[] buffers = new byte[4096];
            while ((count = input.read(buffers, 0, buffers.length)) > 0) {
                output.write(buffers, 0, count);
            }
            input.close();
            // If the charset was already found in the response header, there is no need to search the HTML
            if (charset == null || "".equals(charset)) {
                charset = detectCharset(output.toString());
                // If no charset is found in the HTML either, fall back to utf-8
                if (charset == null || "".equals(charset)) {
                    charset = "utf-8";
                }
            }
            String result = output.toString(charset);
            output.close();
            return Jsoup.parse(result);
        } catch (Exception e) {
            logger.info("==========解析頁面出錯:" + connection.getURL().toString() + "============");
            StringWriter sw = new StringWriter();
            PrintWriter pw = new PrintWriter(sw);
            e.printStackTrace(pw);
            logger.error(sw.toString());
            return null;
        }

    }

    private static String detectCharset(String input) {
        // Matches e.g. "text/html; charset=utf-8" or <meta charset="utf-8">
        Pattern pattern = Pattern.compile("charset\\s*=\\s*['\"]?([\\w-]+)", Pattern.CASE_INSENSITIVE);
        if (input != null && !"".equals(input)) {
            Matcher matcher = pattern.matcher(input);
            if (matcher.find()) {
                return matcher.group(1);
            }
        }
        return null;
    }

    private static InputStream getInputStream(HttpURLConnection conn) throws Exception {
        String contentEncoding = conn.getHeaderField("Content-Encoding");
        if (contentEncoding != null) {
            contentEncoding = contentEncoding.toLowerCase();
            if (contentEncoding.contains("gzip")) {
                return new GZIPInputStream(conn.getInputStream());
            } else if (contentEncoding.contains("deflate")) {
                // InflaterInputStream decompresses; DeflaterInputStream would compress
                return new InflaterInputStream(conn.getInputStream());
            }
        }
        return conn.getInputStream();
    }
}

/**
 *
 * @Project: TAGLIB-CRAWLER
 * @File: DocumentUtil.java
 * @Date: 2016-11-28
 * @Author: caiqibin
 * @introduction: Abstract class for deciding whether a page needs to be fetched again
 */
abstract class VerifyDocument {

    /**
     * @introduction: Decide from the fetched page whether it should be fetched again
     * @param document
     * @return
     */
    abstract boolean tryAgain(Document document);
}

This utility method can fetch the vast majority of web pages, but pages served over HTTPS or reached through redirects may not be retrievable. I may write a follow-up post on how to handle those kinds of URLs (no promises...).
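
For reference, here is a minimal usage sketch. The URL is just a placeholder, and the custom VerifyDocument here keeps retrying until the page contains at least one <h1> element:

import org.jsoup.nodes.Document;

// Must live in the same package as DocumentUtil, since VerifyDocument
// is package-private.
public class DocumentUtilDemo {
    public static void main(String[] args) {
        // Single attempt, no proxy
        Document doc = DocumentUtil.getDocument("http://example.com");
        if (doc != null) {
            System.out.println(doc.title());
        }

        // Up to 5 attempts with a custom check: only stop retrying
        // once the page contains at least one <h1> element
        Document checked = DocumentUtil.repeatAcquireDocument(
                "http://example.com", false, 5,
                new VerifyDocument() {
                    @Override
                    boolean tryAgain(Document document) {
                        return document == null || document.select("h1").isEmpty();
                    }
                });
        if (checked != null) {
            System.out.println(checked.select("h1").text());
        }
    }
}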

Appendix: the jsoup dependency as configured in the pom file

        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.7.3</version>
        </dependency>