Java爬蟲-快速入門 HttpClient+JSoup詳解

1. HttpClient與Jsoup簡介

1.1 HttpClient

HttpClient 是一個高效的、最新的、功能豐富的 HTTP 協議客戶端編程工具包,並且它支持 HTTP 協議最新的版本和建議。

HttpClient的作用

  • 實現了所有 HTTP 的方法(GET,POST,PUT,HEAD 等)
  • 支持自動轉向
  • 支持 HTTPS 協議
  • 支持代理服務器等

1.2 JSoup

jsoup是一款Java的HTML解析器,可直接解析某個URL地址、HTML文本內容。它提供了一套非常省力的API,可通過DOM,CSS以及類似於jQuery的操作方法來取出和操作數據。

JSoup的作用

  • 從一個URL,文件或字符串中解析HTML;
  • 使用DOM或CSS選擇器來查找、取出數據;
  • 可操作HTML元素、屬性、文本;

1.3 爲什麼要一起使用

HttpClient 屬於專業的抓取網頁的庫,可以設置代理,抓取失敗時還可以重試。

在我的實際使用中,單獨用jsoup也可以直接抓取網頁,但jsoup的抓取能力比較弱:API簡單,功能也簡單,它的定位主要是HTML解析器(類似htmlparser),而不是HTTP客戶端。測試過程中,用jsoup抓取頁面經常報錯(例如time out)。

因此,我們可以用httpclient抓取網頁,再用Jsoup.parse解析頁面。

2.項目maven依賴

<!-- jsoup: the HTML parser used to parse the fetched pages -->
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.12.1</version>
</dependency>
<!-- Apache HttpClient: the HTTP client used to fetch the pages -->
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.10</version>
</dependency>

3.HttpClientUtils工具類

/**
 * Static helpers for fetching pages with Apache HttpClient.
 *
 * <p>All request methods share one client configured with 5 second connect and socket
 * timeouts. The shared client is never closed. Request methods do not throw: on any failure
 * they log the exception and return an empty string.
 *
 * <p>SECURITY: the shared client trusts every TLS certificate and every host name, which makes
 * HTTPS connections vulnerable to man-in-the-middle attacks. This is the deliberate purpose of
 * the setup below (crawling sites with broken certificates); do not reuse it for sensitive
 * traffic.
 */
@SuppressWarnings("unused")
public class HttpClientUtils {

    /** Reports failures. Declared first because the client factory below logs through it. */
    private static final java.util.logging.Logger LOGGER =
            java.util.logging.Logger.getLogger(HttpClientUtils.class.getName());

    /** Charset for outgoing form bodies and for responses that do not declare one. */
    private static final java.nio.charset.Charset DEFAULT_CHARSET =
            java.nio.charset.StandardCharsets.UTF_8;

    /** The only status code for which the GET helpers return a body. */
    private static final int STATUS_OK = 200;

    /**
     * Finds the charset declared by an HTML meta tag. Covers both
     * {@code <meta charset=...>} and {@code <meta http-equiv=... content="...; charset=...">},
     * with double quotes, single quotes or no quotes around the value.
     */
    private static final Pattern META_CHARSET = Pattern.compile(
            "<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([a-z0-9._:-]+)", Pattern.CASE_INSENSITIVE);

    /** Browser-like headers sent by {@link #doGetWithHeaders(String)}. */
    private static final Map<String, String> DEFAULT_HEADERS = defaultHeaders();

    private static final CloseableHttpClient httpClient = buildHttpClient();

    /**
     * Builds the shared client that skips certificate and host name verification.
     *
     * <p>If the permissive TLS context cannot be created, a client with standard certificate
     * validation is returned instead, so the request methods never see a null client.
     */
    private static CloseableHttpClient buildHttpClient() {
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(5000)
                .setSocketTimeout(5000)
                .build();
        try {
            HostnameVerifier acceptAnyHost = (String hostname, SSLSession session) -> true;
            javax.net.ssl.TrustManager[] trustAllCerts = {new miTM()};
            javax.net.ssl.SSLContext sslContext = javax.net.ssl.SSLContext.getInstance("TLS");
            sslContext.init(null, trustAllCerts, null);
            // NOTE(review): this also disables certificate checks for every HttpsURLConnection
            // in the JVM, not only for this class. Kept because existing callers may rely on it.
            javax.net.ssl.HttpsURLConnection.setDefaultSSLSocketFactory(sslContext.getSocketFactory());
            return HttpClientBuilder.create()
                    .setDefaultRequestConfig(config)
                    .setSSLContext(sslContext)
                    .setSSLHostnameVerifier(acceptAnyHost)
                    .build();
        } catch (java.security.GeneralSecurityException e) {
            LOGGER.log(java.util.logging.Level.WARNING,
                    "Could not create the trust-all TLS context; using default certificate validation", e);
            return HttpClientBuilder.create().setDefaultRequestConfig(config).build();
        }
    }

    /** Creates the read-only header set used by {@link #doGetWithHeaders(String)}. */
    private static Map<String, String> defaultHeaders() {
        Map<String, String> headers = new java.util.LinkedHashMap<>();
        headers.put("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7");
        // "br" is deliberately not advertised: this client cannot decode brotli, so a server
        // that honoured it would send back a body we could only read as garbage.
        headers.put("Accept-Encoding", "gzip, deflate");
        headers.put("Accept-Language", "en-US,en;q=0.9,zh;q=0.8,zh-CN;q=0.7");
        headers.put("Connection", "keep-alive");
        headers.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36");
        return java.util.Collections.unmodifiableMap(headers);
    }

    /** Trust manager that accepts every certificate chain without checking it. */
    static class miTM implements javax.net.ssl.TrustManager, javax.net.ssl.X509TrustManager {

        @Override
        public void checkClientTrusted(X509Certificate[] x509Certificates, String s) {
            // Intentionally empty: every client certificate is accepted.
        }

        @Override
        public void checkServerTrusted(X509Certificate[] x509Certificates, String s) {
            // Intentionally empty: every server certificate is accepted.
        }

        @Override
        public X509Certificate[] getAcceptedIssuers() {
            return new X509Certificate[0];
        }
    }


    /**
     * Sends a GET request with a default set of browser-like headers and decodes the body
     * with the charset the page declares.
     *
     * <p>The charset is taken from the Content-Type response header, then from an HTML meta
     * tag, and finally falls back to UTF-8.
     *
     * @param url absolute URL including the scheme
     * @return the page content, or an empty string if the status is not 200 or the request fails
     */
    public static String doGetWithHeaders(String url) {
        try {
            return fetch(buildGet(url, null, DEFAULT_HEADERS), true);
        } catch (Exception e) {
            logFailure("GET", url, e);
            return "";
        }
    }

    /**
     * Sends a GET request with query parameters.
     *
     * @param url    absolute URL including the scheme
     * @param params query parameters appended to the URL; may be null or empty
     * @return the page content, or an empty string if the status is not 200 or the request fails
     */
    public static String doGet(String url, Map<String, String> params) {
        return doGet(url, params, null);
    }

    /**
     * Sends a GET request without query parameters.
     *
     * @param url absolute URL including the scheme
     * @return the page content, or an empty string if the status is not 200 or the request fails
     */
    public static String doGet(String url) {
        return doGet(url, null, null);
    }

    /**
     * Sends a GET request with both query parameters and request headers.
     *
     * <p>The body is decoded with the charset from the Content-Type response header, or UTF-8
     * when the header names none.
     *
     * @param url    absolute URL including the scheme
     * @param params query parameters appended to the URL; may be null or empty
     * @param header request headers to set; may be null or empty
     * @return the page content, or an empty string if the status is not 200 or the request fails
     */
    public static String doGet(String url, Map<String, String> params, Map<String, String> header) {
        try {
            return fetch(buildGet(url, params, header), false);
        } catch (Exception e) {
            logFailure("GET", url, e);
            return "";
        }
    }


    /**
     * Sends a POST request with form-encoded parameters.
     *
     * <p>The response body is returned whatever the status code is.
     *
     * @param url   absolute URL including the scheme
     * @param param form fields; when null, no request body is sent
     * @return the response body, or an empty string if there is none or the request fails
     */
    public static String doPost(String url, Map<String, String> param) {
        try {
            HttpPost httpPost = new HttpPost(url);
            if (param != null) {
                List<NameValuePair> formFields = new ArrayList<>();
                for (Map.Entry<String, String> field : param.entrySet()) {
                    formFields.add(new BasicNameValuePair(field.getKey(), field.getValue()));
                }
                // Encode explicitly as UTF-8: the one-argument constructor falls back to
                // ISO-8859-1, which corrupts non-ASCII (for example Chinese) field values.
                httpPost.setEntity(new UrlEncodedFormEntity(formFields, DEFAULT_CHARSET));
            }
            return send(httpPost);
        } catch (Exception e) {
            logFailure("POST", url, e);
            return "";
        }
    }

    /**
     * Sends a POST request without a body.
     *
     * @param url absolute URL including the scheme
     * @return the response body, or an empty string if there is none or the request fails
     */
    public static String doPost(String url) {
        return doPost(url, null);
    }

    /**
     * Sends a POST request whose body is the given JSON text.
     *
     * <p>The response body is returned whatever the status code is.
     *
     * @param url  absolute URL including the scheme
     * @param json request body, sent as {@code application/json}
     * @return the response body, or an empty string if there is none or the request fails
     */
    public static String doPostJson(String url, String json) {
        try {
            HttpPost httpPost = new HttpPost(url);
            httpPost.setEntity(new StringEntity(json, ContentType.APPLICATION_JSON));
            return send(httpPost);
        } catch (Exception e) {
            logFailure("POST", url, e);
            return "";
        }
    }

    /** Builds a GET request for {@code url} with optional query parameters and headers. */
    private static HttpGet buildGet(String url, Map<String, String> params, Map<String, String> headers)
            throws java.net.URISyntaxException {
        URIBuilder builder = new URIBuilder(url);
        if (params != null) {
            for (Map.Entry<String, String> param : params.entrySet()) {
                builder.addParameter(param.getKey(), param.getValue());
            }
        }
        URI uri = builder.build();
        HttpGet httpGet = new HttpGet(uri);
        if (headers != null) {
            for (Map.Entry<String, String> header : headers.entrySet()) {
                httpGet.setHeader(header.getKey(), header.getValue());
            }
        }
        return httpGet;
    }

    /**
     * Executes a GET request and returns the decoded body, or "" unless the status is 200.
     * When {@code sniffMetaCharset} is true the HTML meta tag is also consulted for the charset.
     */
    private static String fetch(HttpGet request, boolean sniffMetaCharset) throws IOException {
        try (CloseableHttpResponse response = httpClient.execute(request)) {
            if (response.getStatusLine().getStatusCode() != STATUS_OK) {
                return "";
            }
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return "";
            }
            if (!sniffMetaCharset) {
                return EntityUtils.toString(entity, DEFAULT_CHARSET);
            }
            byte[] body = EntityUtils.toByteArray(entity);
            return new String(body, detectCharset(entity, body));
        }
    }

    /** Executes a POST request and returns the decoded body regardless of the status code. */
    private static String send(HttpPost request) throws IOException {
        try (CloseableHttpResponse response = httpClient.execute(request)) {
            HttpEntity entity = response.getEntity();
            return entity == null ? "" : EntityUtils.toString(entity, DEFAULT_CHARSET);
        }
    }

    /**
     * Picks the charset for a response body: Content-Type header first, then the HTML meta
     * tag, then {@link #DEFAULT_CHARSET}. The result is local to the call, so one page's
     * declaration never affects how another response is decoded.
     */
    private static java.nio.charset.Charset detectCharset(HttpEntity entity, byte[] body) {
        try {
            ContentType contentType = ContentType.get(entity);
            if (contentType != null && contentType.getCharset() != null) {
                java.nio.charset.Charset fromHeader = resolveCharset(contentType.getCharset().name());
                if (fromHeader != null) {
                    return fromHeader;
                }
            }
        } catch (RuntimeException ignored) {
            // Malformed Content-Type or unknown charset in the header: try the meta tag instead.
        }
        // ISO-8859-1 maps every byte to one char, so the ASCII markup we search for is
        // preserved whatever the real encoding of the page is.
        String markup = new String(body, java.nio.charset.StandardCharsets.ISO_8859_1);
        Matcher matcher = META_CHARSET.matcher(markup);
        if (matcher.find()) {
            java.nio.charset.Charset fromMeta = resolveCharset(matcher.group(1));
            if (fromMeta != null) {
                return fromMeta;
            }
        }
        return DEFAULT_CHARSET;
    }

    /**
     * Maps a declared charset name to a supported charset, or returns null if there is none.
     * Pages labelled gb2312 are decoded as GBK, a superset of GB2312, because such pages often
     * contain characters outside plain GB2312.
     */
    private static java.nio.charset.Charset resolveCharset(String name) {
        try {
            if ("gb2312".equalsIgnoreCase(name) && java.nio.charset.Charset.isSupported("GBK")) {
                return java.nio.charset.Charset.forName("GBK");
            }
            if (java.nio.charset.Charset.isSupported(name)) {
                return java.nio.charset.Charset.forName(name);
            }
        } catch (IllegalArgumentException ignored) {
            // The page declared a syntactically illegal charset name: treat it as undeclared.
        }
        return null;
    }

    /** Logs a failed request together with the URL it was sent to. */
    private static void logFailure(String method, String url, Exception e) {
        LOGGER.log(java.util.logging.Level.WARNING, method + " request failed: " + url, e);
    }


}

注意:傳給該工具類的url必須帶上協議前綴(http:// 或 https://),否則請求會報錯。

4.JSoup的使用

4.1 使用dom方法來查找元素

/**
 * Demo: finding elements with jsoup's DOM-style methods
 * ({@code getElementsByTag}, {@code getElementById}, {@code getElementsByClass}).
 */
public class Test4 {
    public static void main(String[] args) {
        String uri = "https://www.yiibai.com/jsoup/jsoup-quick-start.html";
        String html = HttpClientUtils.doGet(uri);
        // Parse the fetched HTML with jsoup.
        Document document = Jsoup.parse(html);
        // getElementsByTag("a") returns every <a> element.
        Elements aTag = document.getElementsByTag("a");
        for (Element element : aTag) {
            // The locals below are unused on purpose; they only show the accessor methods.
            // text() returns the element's text content.
            String text = element.text();
            // html() returns the HTML inside the element.
            String html1 = element.html();
            // attr(String key) returns the value of the attribute named key.
            String href = element.attr("href");
        }

        // getElementById("xx") returns the element whose id is xx, or null when the page has
        // no such element (for example when the fetch failed and html is empty).
        Element id = document.getElementById("qq-group");
        if (id == null) {
            System.out.println("element #qq-group not found");
        } else {
            String text = id.text();
            // attributes() returns all attributes of the element.
            Attributes attributes = id.attributes();
            // Print each attribute name and value.
            for (Attribute attribute : attributes) {
                String key = attribute.getKey();
                String value = attribute.getValue();
                System.out.println("key="+key+"--->value="+value);
            }
        }

        // getElementsByClass("yy") returns every element whose class is yy.
        Elements aClass = document.getElementsByClass("article-content");

    }
}

4.2 使用選擇器語法來查找元素

/**
 * Demo: finding elements with jsoup's CSS-like selector syntax ({@code select}).
 */
public class Test5 {
    public static void main(String[] args) {
        String uri = "https://www.yiibai.com/jsoup/jsoup-quick-start.html";
        String html = HttpClientUtils.doGet(uri);
        // Parse the fetched HTML with jsoup.
        Document document = Jsoup.parse(html);
        // select("tagname") finds elements by tag name.
        Elements aTag = document.select("a");
        // select("#id") finds elements by id.
        Elements id = document.select("#qq-group");
        // select(".class") finds elements by class name.
        Elements class1 = document.select(".article-content");
        // select("[attribute]") finds elements that have the given attribute.
        Elements href = document.select("[href]");

        // select(":contains(text)") finds elements containing the given text; the search is
        // case-insensitive.
        Elements contains = document.select(":contains(JSoup安裝)");
        for (Element element : contains) {
            // Print the links among the matches: elements without an href yield "".
            String href1 = element.attr("href");
            // Compare by content; != on Strings compares object identity, not the text.
            if (!href1.isEmpty()) {
                System.out.println(href1);
            }
        }

        // select(":matches(regex)") finds elements whose text matches the regular expression.
        // "regex" here is a placeholder; replace it with a real pattern.
        Elements select = document.select(":matches(regex)");
    }
}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章