java爬虫带你爬天爬地爬人生,爬新浪

HttpClient简介

HttpClient最初是Apache Jakarta Commons下的子项目(现属于Apache HttpComponents项目),是一个高效的、功能丰富的、支持HTTP协议的客户端编程工具包,并且支持HTTP协议较新的版本。它的主要功能有:

(1) 实现了所有 HTTP 的方法(GET,POST,PUT,HEAD 等)

(2) 支持自动转向

(3) 支持 HTTPS 协议

(4) 支持代理服务器等

Jsoup简介

jsoup是一款Java的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。它的主要功能有:

(1) 从一个URL,文件或字符串中解析HTML;

(2) 使用DOM或CSS选择器来查找、取出数据;

(3) 可操作HTML元素、属性、文本;

使用步骤

代码
import org.apache.http.HttpEntity;

import org.apache.http.client.config.RequestConfig;

import org.apache.http.client.methods.CloseableHttpResponse;

import org.apache.http.client.methods.HttpGet;

import org.apache.http.client.protocol.HttpClientContext;

import org.apache.http.impl.client.CloseableHttpClient;

import org.apache.http.impl.client.HttpClientBuilder;

import org.apache.http.util.EntityUtils;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.junit.Test;

import java.util.List;

/**

  • HttpClient & Jsoup libruary test class

  • Created by xuyh at 2017/11/6 15:28.

*/

public classHttpClientJsoupTest{

@Test

public void test() {

        //通过httpClient获取网页响应,将返回的响应解析为纯文本

    HttpGet httpGet = new HttpGet("http://sports.sina.com.cn/");

    httpGet.setConfig(RequestConfig.custom().setSocketTimeout(30000).setConnectTimeout(30000).build());

    CloseableHttpClient httpClient = null;

    CloseableHttpResponse response = null;

    String responseStr = "";

    try {

        httpClient = HttpClientBuilder.create().build();

        HttpClientContext context = HttpClientContext.create();

        response = httpClient.execute(httpGet, context);

        int state = response.getStatusLine().getStatusCode();

        if (state != 200)

            responseStr = "";

        HttpEntity entity = response.getEntity();

        if (entity != null)

            responseStr = EntityUtils.toString(entity, "utf-8");

    } catch (Exception e) {

        e.printStackTrace();

    } finally {

        try {

            if (response != null)

                response.close();

            if (httpClient != null)

                httpClient.close();

        } catch (Exception ex) {

            ex.printStackTrace();

        }

    }

    if (responseStr == null)

        return;

    //将解析到的纯文本用Jsoup工具转换成Document文档并进行操作

    Document document = Jsoup.parse(responseStr);

    List<Element> elements = document.getElementsByAttributeValue("class", "phdnews_txt fr").first()

            .getElementsByAttributeValue("class", "phdnews_hdline");

    elements.forEach(element -> {

        for (Element e : element.getElementsByTag("a")) {

            System.out.println(e.attr("href"));

            System.out.println(e.text());

        }

    });

}                                                                                                                                                                                                                    

详解

新建HttpGet对象,该对象将向 http://sports.sina.com.cn/ 这个URL地址发送GET请求并获取响应,同时将socket超时时间和连接超时时间均设置为30000ms。

将HttpClient和Jsoup进行封装,形成一个工具类,内容如下:

import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.CookieStore;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.cookie.Cookie;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.security.GeneralSecurityException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.net.ssl.*;

/**

  • Http工具,包含:

  • 普通http请求工具(使用httpClient进行http,https请求的发送)

  • Created by xuyh at 2017/7/17 19:08.

*/

public classHttpUtils{

/**
  • 请求超时时间,默认20000ms

*/

private int timeout = 20000;

/**
  • cookie表

*/

private Map<String, String> cookieMap = new HashMap<>();

/**
  • 请求编码(处理返回结果),默认UTF-8

*/

private String charset = "UTF-8";

private static HttpUtils httpUtils;

privateHttpUtils(){

}

/**
  • 获取实例

*@return

*/

publicstaticHttpUtilsgetInstance(){

    if (httpUtils == null)

        httpUtils = new HttpUtils();

    return httpUtils;

}

/**
  • 清空cookieMap

*/

publicvoidinvalidCookieMap(){

    cookieMap.clear();

}

publicintgetTimeout(){

    return timeout;

}

/**
  • 设置请求超时时间

*@paramtimeout

*/

publicvoidsetTimeout(inttimeout){

    this.timeout = timeout;

}

publicStringgetCharset(){

    return charset;

}

/**
  • 设置请求字符编码集

*@paramcharset

*/

publicvoidsetCharset(String charset){

    this.charset = charset;

}

/**
  • 将网页返回为解析后的文档格式

*@paramhtml

*@return

*@throwsException

*/

publicstaticDocumentparseHtmlToDoc(String html)throwsException{

    return removeHtmlSpace(html);

}

privatestaticDocumentremoveHtmlSpace(String str){

    Document doc = Jsoup.parse(str);

    String result = doc.html().replace("&nbsp;", "");

    return Jsoup.parse(result);

}

/**
  • 执行get请求,返回doc

*@paramurl

*@return

*@throwsException

*/

publicDocumentexecuteGetAsDocument(String url)throwsException{

    return parseHtmlToDoc(executeGet(url));

}

/**
  • 执行get请求

*@paramurl

*@return

*@throwsException

*/

publicStringexecuteGet(String url)throwsException{

    HttpGet httpGet = new HttpGet(url);

    httpGet.setHeader("Cookie", convertCookieMapToString(cookieMap));

    httpGet.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());

    CloseableHttpClient httpClient = null;

    String str = "";

    try {

        httpClient = HttpClientBuilder.create().build();

        HttpClientContext context = HttpClientContext.create();

        CloseableHttpResponse response = httpClient.execute(httpGet, context);

        getCookiesFromCookieStore(context.getCookieStore(), cookieMap);

        int state = response.getStatusLine().getStatusCode();

        if (state == 404) {

            str = "";

        }

        try {

            HttpEntity entity = response.getEntity();

            if (entity != null) {

                str = EntityUtils.toString(entity, charset);

            }

        } finally {

            response.close();

        }

    } catch (IOException e) {

        throw e;

    } finally {

        try {

            if (httpClient != null)

                httpClient.close();

        } catch (IOException e) {

            throw e;

        }

    }

    return str;

}

/**
  • 用https执行get请求,返回doc

*@paramurl

*@return

*@throwsException

*/

publicDocumentexecuteGetWithSSLAsDocument(String url)throwsException{

    return parseHtmlToDoc(executeGetWithSSL(url));

}

/**
  • 用https执行get请求

*@paramurl

*@return

*@throwsException

*/

publicStringexecuteGetWithSSL(String url)throwsException{

    HttpGet httpGet = new HttpGet(url);

    httpGet.setHeader("Cookie", convertCookieMapToString(cookieMap));

    httpGet.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());

    CloseableHttpClient httpClient = null;

    String str = "";

    try {

        httpClient = createSSLInsecureClient();

        HttpClientContext context = HttpClientContext.create();

        CloseableHttpResponse response = httpClient.execute(httpGet, context);

        getCookiesFromCookieStore(context.getCookieStore(), cookieMap);

        int state = response.getStatusLine().getStatusCode();

        if (state == 404) {

            str = "";

        }

        try {

            HttpEntity entity = response.getEntity();

            if (entity != null) {

                str = EntityUtils.toString(entity, charset);

            }

        } finally {

            response.close();

        }

    } catch (IOException e) {

        throw e;

    } catch (GeneralSecurityException ex) {

        throw ex;

    } finally {

        try {

            if (httpClient != null)

                httpClient.close();

        } catch (IOException e) {

            throw e;

        }

    }

    return str;

}

/**
  • 执行post请求,返回doc

*@paramurl

*@paramparams

*@return

*@throwsException

*/

publicDocumentexecutePostAsDocument(String url, Map<String, String> params)throwsException{

    return parseHtmlToDoc(executePost(url, params));

}

/**
  • 执行post请求

*@paramurl

*@paramparams

*@return

*@throwsException

*/

publicStringexecutePost(String url, Map<String, String> params)throwsException{

    String reStr = "";

    HttpPost httpPost = new HttpPost(url);

    httpPost.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());

    httpPost.setHeader("Cookie", convertCookieMapToString(cookieMap));

    List<NameValuePair> paramsRe = new ArrayList<>();

    for (String key : params.keySet()) {

        paramsRe.add(new BasicNameValuePair(key, params.get(key)));

    }

    CloseableHttpClient httpclient = HttpClientBuilder.create().build();

    CloseableHttpResponse response;

    try {

        httpPost.setEntity(new UrlEncodedFormEntity(paramsRe));

        HttpClientContext context = HttpClientContext.create();

        response = httpclient.execute(httpPost, context);

        getCookiesFromCookieStore(context.getCookieStore(), cookieMap);

        HttpEntity entity = response.getEntity();

        reStr = EntityUtils.toString(entity, charset);

    } catch (IOException e) {

        throw e;

    } finally {

        httpPost.releaseConnection();

    }

    return reStr;

}

/**
  • 用https执行post请求,返回doc

*@paramurl

*@paramparams

*@return

*@throwsException

*/

publicDocumentexecutePostWithSSLAsDocument(String url, Map<String, String> params)throwsException{

    return parseHtmlToDoc(executePostWithSSL(url, params));

}

/**
  • 用https执行post请求

*@paramurl

*@paramparams

*@return

*@throwsException

*/

publicStringexecutePostWithSSL(String url, Map<String, String> params)throwsException{

    String re = "";

    HttpPost post = new HttpPost(url);

    List<NameValuePair> paramsRe = new ArrayList<>();

    for (String key : params.keySet()) {

        paramsRe.add(new BasicNameValuePair(key, params.get(key)));

    }

    post.setHeader("Cookie", convertCookieMapToString(cookieMap));

    post.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());

    CloseableHttpResponse response;

    try {

        CloseableHttpClient httpClientRe = createSSLInsecureClient();

        HttpClientContext contextRe = HttpClientContext.create();

        post.setEntity(new UrlEncodedFormEntity(paramsRe));

        response = httpClientRe.execute(post, contextRe);

        HttpEntity entity = response.getEntity();

        if (entity != null) {

            re = EntityUtils.toString(entity, charset);

        }

        getCookiesFromCookieStore(contextRe.getCookieStore(), cookieMap);

    } catch (Exception e) {

        throw e;

    }

    return re;

}

/**
  • 发送JSON格式body的POST请求

*@paramurl 地址

*@paramjsonBody json body

*@return

*@throwsException

*/

publicStringexecutePostWithJson(String url, String jsonBody)throwsException{

    String reStr = "";

    HttpPost httpPost = new HttpPost(url);

    httpPost.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());

    httpPost.setHeader("Cookie", convertCookieMapToString(cookieMap));

    CloseableHttpClient httpclient = HttpClientBuilder.create().build();

    CloseableHttpResponse response;

    try {

        httpPost.setEntity(new StringEntity(jsonBody, ContentType.APPLICATION_JSON));

        HttpClientContext context = HttpClientContext.create();

        response = httpclient.execute(httpPost, context);

        getCookiesFromCookieStore(context.getCookieStore(), cookieMap);

        HttpEntity entity = response.getEntity();

        reStr = EntityUtils.toString(entity, charset);

    } catch (IOException e) {

        throw e;

    } finally {

        httpPost.releaseConnection();

    }

    return reStr;

}

/**
  • 发送JSON格式body的SSL POST请求

*@paramurl 地址

*@paramjsonBody json body

*@return

*@throwsException

*/

publicStringexecutePostWithJsonAndSSL(String url, String jsonBody)throwsException{

    String re = "";

    HttpPost post = new HttpPost(url);

    post.setHeader("Cookie", convertCookieMapToString(cookieMap));

    post.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());

    CloseableHttpResponse response;

    try {

        CloseableHttpClient httpClientRe = createSSLInsecureClient();

        HttpClientContext contextRe = HttpClientContext.create();

        post.setEntity(new StringEntity(jsonBody, ContentType.APPLICATION_JSON));

        response = httpClientRe.execute(post, contextRe);

        HttpEntity entity = response.getEntity();

        if (entity != null) {

            re = EntityUtils.toString(entity, charset);

        }

        getCookiesFromCookieStore(contextRe.getCookieStore(), cookieMap);

    } catch (Exception e) {

        throw e;

    }

    return re;

}

privatevoidgetCookiesFromCookieStore(CookieStore cookieStore, Map<String, String> cookieMap){

    List<Cookie> cookies = cookieStore.getCookies();

    for (Cookie cookie : cookies) {

        cookieMap.put(cookie.getName(), cookie.getValue());

    }

}

privateStringconvertCookieMapToString(Map<String, String> map){

    String cookie = "";

    for (String key : map.keySet()) {

        cookie += (key + "=" + map.get(key) + "; ");

    }

    if (map.size() > 0) {

        cookie = cookie.substring(0, cookie.length() - 2);

    }

    return cookie;

}

/**
  • 创建 SSL连接

*@return

*@throwsGeneralSecurityException

*/

privatestaticCloseableHttpClientcreateSSLInsecureClient()throwsGeneralSecurityException{

    try {

        SSLContext sslContext = new SSLContextBuilder().loadTrustMaterial(null, (chain, authType) -> true).build();

        SSLConnectionSocketFactory sslConnectionSocketFactory = new SSLConnectionSocketFactory(sslContext,

                (s, sslContextL) -> true);

        return HttpClients.custom().setSSLSocketFactory(sslConnectionSocketFactory).build();

    } catch (GeneralSecurityException e) {

        throw e;

    }

}

}

给大家推荐一个程序员学习交流群:863621962。群里有分享的视频,还有思维导图

群公告有视频,都是干货的,你可以下载来看。主要分享分布式架构、高可扩展、高性能、高并发、性能优化、Spring boot、Redis、ActiveMQ、Nginx、Mycat、Netty、Jvm大型分布式项目实战学习架构师视频。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章