1. HttpClient與Jsoup簡介
1.1 HttpClient
HttpClient 是 Apache 提供的一個高效、功能豐富的 HTTP 協議客戶端編程工具包,並且支持 HTTP 協議最新的版本和建議。
HttpClient的作用
- 實現了所有 HTTP 的方法(GET,POST,PUT,HEAD 等)
- 支持自動轉向
- 支持 HTTPS 協議
- 支持代理服務器等
1.2 JSoup
jsoup是一款Java的HTML解析器,可直接解析某個URL地址、HTML文本內容。它提供了一套非常省力的API,可通過DOM,CSS以及類似於jQuery的操作方法來取出和操作數據。
JSoup的作用
- 從一個URL,文件或字符串中解析HTML;
- 使用DOM或CSS選擇器來查找、取出數據;
- 可操作HTML元素、屬性、文本;
1.3 爲什麼要一起使用
HttpClient 是專門用於發送 HTTP 請求、抓取網頁的庫,可以設置代理,抓取失敗時也可以重試。
在我的實際使用中,單獨用 jsoup 也可以直接抓取網頁,但 jsoup 在抓取方面比較弱:API 簡單,功能也簡單,它的定位主要是解析 HTML(類似對 htmlparser 功能的擴展)。測試過程中用 jsoup 抓取頁面經常報錯(例如連接超時 time out 等)。
因此,我們可以用httpclient抓取網頁,再用Jsoup.parse解析頁面。
2.項目maven依賴
<!-- jsoup: HTML parser used to parse the fetched pages -->
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.12.1</version>
</dependency>
<!-- Apache HttpClient: HTTP client used to fetch the pages -->
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.10</version>
</dependency>
3.HttpClientUtils工具類
/**
 * Static helpers that send HTTP GET/POST requests through one shared Apache
 * {@code CloseableHttpClient} and return the response body as text.
 *
 * <p>Every public method is best-effort: on any failure the exception is printed to
 * stderr and an empty string is returned, so callers must treat {@code ""} as
 * "no content". URLs must include the scheme ({@code http://} or {@code https://}).
 *
 * <p><b>Security note (review):</b> the shared client accepts every TLS certificate and
 * every host name, and the static initializer also installs the same trust-all socket
 * factory as the JVM-wide default for {@code HttpsURLConnection}. This is kept because
 * it is the original, deliberate behaviour of this class, but it disables protection
 * against man-in-the-middle attacks and should not be used for sensitive traffic.
 */
@SuppressWarnings("unused")
public class HttpClientUtils {

    /** Charset used to decode a response when nothing better is known. */
    private static final String CHARSET = "utf-8";

    /** The only status code for which the GET helpers return a body. */
    private static final int STATUS_CODE = 200;

    /** Connect and socket timeout, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 5000;

    /**
     * Finds the charset declared in an HTML {@code <meta>} tag. Group 1 is the charset
     * name. Handles both {@code <meta charset="x">} and
     * {@code <meta http-equiv=... content="text/html; charset=x">}, with double quotes,
     * single quotes or no quotes.
     */
    private static final Pattern pattern = Pattern.compile(
            "<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([a-z0-9_.:\\-]+)",
            Pattern.CASE_INSENSITIVE);

    /** Shared client; built once so connections can be reused across calls. */
    private static final CloseableHttpClient httpClient = buildClient();

    /**
     * Builds the shared client with 5 s timeouts and certificate/host-name checks disabled.
     *
     * <p>If the trust-all SSL context cannot be created, the failure is printed and a
     * client with default certificate verification is returned instead, so the field is
     * never left {@code null}.
     */
    private static CloseableHttpClient buildClient() {
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(TIMEOUT_MILLIS)
                .setSocketTimeout(TIMEOUT_MILLIS)
                .build();
        try {
            // Accept any host name: HTTPS requests bypass host-name verification.
            HostnameVerifier hv = (String hostname, SSLSession session) -> true;
            javax.net.ssl.TrustManager[] trustAllCerts = {new miTM()};
            javax.net.ssl.SSLContext sc = javax.net.ssl.SSLContext.getInstance("SSL");
            sc.init(null, trustAllCerts, null);
            // NOTE(review): this changes the default for every HttpsURLConnection in the JVM,
            // not only for this class. Kept for backward compatibility.
            javax.net.ssl.HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
            return HttpClientBuilder.create()
                    .setDefaultRequestConfig(config)
                    .setSSLContext(sc)
                    .setSSLHostnameVerifier(hv)
                    .build();
        } catch (java.security.GeneralSecurityException e) {
            e.printStackTrace();
            return HttpClientBuilder.create().setDefaultRequestConfig(config).build();
        }
    }

    /** Trust manager that accepts every client and server certificate without any check. */
    static class miTM implements javax.net.ssl.TrustManager, javax.net.ssl.X509TrustManager {
        @Override
        public void checkClientTrusted(X509Certificate[] x509Certificates, String s) {
            // Intentionally empty: every client certificate is accepted.
        }

        @Override
        public void checkServerTrusted(X509Certificate[] x509Certificates, String s) {
            // Intentionally empty: every server certificate is accepted.
        }

        @Override
        public X509Certificate[] getAcceptedIssuers() {
            return new X509Certificate[0];
        }
    }

    /**
     * Sends a GET request with a fixed set of browser-like request headers and decodes
     * the body with the charset declared in the page's {@code <meta>} tag.
     *
     * <p>If the page declares no charset, or declares one this JVM does not support,
     * the body is decoded as UTF-8.
     *
     * @param url the URL to fetch, including the scheme
     * @return the page content, or {@code ""} if the status is not 200 or the request fails
     */
    public static String doGetWithHeaders(String url) {
        try {
            HttpGet httpGet = new HttpGet(buildUri(url, null));
            httpGet.setHeader("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7");
            // Only advertise encodings the client can decode; "br" (brotli) is not among them.
            httpGet.setHeader("Accept-Encoding", "gzip, deflate");
            httpGet.setHeader("Accept-Language", "en-US,en;q=0.9,zh;q=0.8,zh-CN;q=0.7");
            httpGet.setHeader("Connection", "keep-alive");
            httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36");
            try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
                if (response.getStatusLine().getStatusCode() != STATUS_CODE) {
                    return "";
                }
                HttpEntity entity = response.getEntity();
                if (entity == null) {
                    return "";
                }
                byte[] bytes = EntityUtils.toByteArray(entity);
                // The charset is a per-response local: it must not leak into other requests.
                return new String(bytes, detectCharset(bytes));
            }
        } catch (Exception e) {
            // Best-effort contract: report the failure and return an empty result.
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Sends a GET request with optional query parameters.
     *
     * @param url    the URL to fetch, including the scheme
     * @param params query parameters to append; may be {@code null} or empty
     * @return the response body, or {@code ""} if the status is not 200 or the request fails
     */
    public static String doGet(String url, Map<String, String> params) {
        return doGet(url, params, null);
    }

    /**
     * Sends a GET request without query parameters.
     *
     * @param url the URL to fetch, including the scheme
     * @return the response body, or {@code ""} if the status is not 200 or the request fails
     */
    public static String doGet(String url) {
        return doGet(url, null);
    }

    /**
     * Sends a GET request with optional query parameters and optional request headers.
     *
     * @param url    the URL to fetch, including the scheme
     * @param params query parameters to append; may be {@code null} or empty
     * @param header request headers to set; may be {@code null} or empty
     * @return the response body, or {@code ""} if the status is not 200 or the request fails
     */
    public static String doGet(String url, Map<String, String> params, Map<String, String> header) {
        try {
            HttpGet httpGet = new HttpGet(buildUri(url, params));
            if (header != null) {
                for (Map.Entry<String, String> entry : header.entrySet()) {
                    httpGet.setHeader(entry.getKey(), entry.getValue());
                }
            }
            try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
                if (response.getStatusLine().getStatusCode() != STATUS_CODE) {
                    return "";
                }
                return readBody(response);
            }
        } catch (Exception e) {
            // Best-effort contract: report the failure and return an empty result.
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Sends a POST request whose body is the given parameters encoded as a UTF-8 form.
     *
     * <p>Unlike the GET helpers, the body is returned whatever the status code is.
     *
     * @param url   the URL to post to, including the scheme
     * @param param form parameters; {@code null} sends a POST without a body
     * @return the response body, or {@code ""} if the request fails
     */
    public static String doPost(String url, Map<String, String> param) {
        try {
            HttpPost httpPost = new HttpPost(url);
            if (param != null) {
                List<NameValuePair> paramList = new ArrayList<>();
                for (Map.Entry<String, String> entry : param.entrySet()) {
                    paramList.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
                }
                // Encode explicitly as UTF-8; the library default would mangle non-ASCII values.
                httpPost.setEntity(new UrlEncodedFormEntity(paramList, java.nio.charset.StandardCharsets.UTF_8));
            }
            try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
                return readBody(response);
            }
        } catch (Exception e) {
            // Best-effort contract: report the failure and return an empty result.
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Sends a POST request without a body.
     *
     * @param url the URL to post to, including the scheme
     * @return the response body, or {@code ""} if the request fails
     */
    public static String doPost(String url) {
        return doPost(url, null);
    }

    /**
     * Sends a POST request whose body is the given JSON text
     * ({@code Content-Type: application/json}).
     *
     * @param url  the URL to post to, including the scheme
     * @param json the request body as JSON text
     * @return the response body, or {@code ""} if the request fails
     */
    public static String doPostJson(String url, String json) {
        try {
            HttpPost httpPost = new HttpPost(url);
            httpPost.setEntity(new StringEntity(json, ContentType.APPLICATION_JSON));
            try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
                return readBody(response);
            }
        } catch (Exception e) {
            // Best-effort contract: report the failure and return an empty result.
            e.printStackTrace();
            return "";
        }
    }

    /** Builds the request URI from {@code url}, appending {@code params} as query parameters. */
    private static URI buildUri(String url, Map<String, String> params) throws java.net.URISyntaxException {
        URIBuilder builder = new URIBuilder(url);
        if (params != null) {
            for (Map.Entry<String, String> entry : params.entrySet()) {
                builder.addParameter(entry.getKey(), entry.getValue());
            }
        }
        return builder.build();
    }

    /** Reads the response body as text, falling back to UTF-8; a response without a body yields "". */
    private static String readBody(CloseableHttpResponse response) throws IOException {
        HttpEntity entity = response.getEntity();
        return entity == null ? "" : EntityUtils.toString(entity, CHARSET);
    }

    /** Returns the charset declared in the page's meta tag, or UTF-8 if absent or unsupported. */
    private static java.nio.charset.Charset detectCharset(byte[] bytes) {
        // ISO-8859-1 maps every byte to one char, so the ASCII markup can be searched
        // without depending on the platform default charset.
        String markup = new String(bytes, java.nio.charset.StandardCharsets.ISO_8859_1);
        Matcher matcher = pattern.matcher(markup);
        if (matcher.find()) {
            try {
                return java.nio.charset.Charset.forName(matcher.group(1));
            } catch (IllegalArgumentException ignored) {
                // Illegal or unsupported charset name in the page: use the default below.
            }
        }
        return java.nio.charset.StandardCharsets.UTF_8;
    }
}
注意:該工具類的 url 地址必須帶上 http:// 或 https:// 協議前綴,否則請求會失敗(工具類會打印異常堆棧,不會返回網頁內容)。
4.JSoup的使用
4.1 使用dom方法來查找元素
/**
 * Demo: fetch a page with {@code HttpClientUtils} and look up elements with jsoup's
 * DOM-style methods (by tag, by id, by class).
 */
public class Test4 {
    public static void main(String[] args) {
        String uri = "https://www.yiibai.com/jsoup/jsoup-quick-start.html";
        String html = HttpClientUtils.doGet(uri);
        // Parse the fetched HTML with jsoup.
        Document document = Jsoup.parse(html);

        // getElementsByTag("a") returns every <a> element.
        Elements aTag = document.getElementsByTag("a");
        for (Element element : aTag) {
            // text() returns the element's text content.
            String text = element.text();
            // html() returns the element's inner HTML.
            String html1 = element.html();
            // attr(key) returns the value of the attribute named key.
            String href = element.attr("href");
        }

        // getElementById("xx") returns the element whose id is xx, or null if there is none
        // (for example when the fetch failed and the document is empty), so guard against null.
        Element id = document.getElementById("qq-group");
        if (id != null) {
            String text = id.text();
            // attributes() returns all attributes of the element.
            Attributes attributes = id.attributes();
            // Print each attribute name and value.
            for (Attribute attribute : attributes) {
                String key = attribute.getKey();
                String value = attribute.getValue();
                System.out.println("key="+key+"--->value="+value);
            }
        }

        // getElementsByClass("yy") returns every element whose class is yy.
        Elements aClass = document.getElementsByClass("article-content");
    }
}
4.2 使用選擇器語法來查找元素
/**
 * Demo: fetch a page with {@code HttpClientUtils} and look up elements with jsoup's
 * CSS-like selector syntax.
 */
public class Test5 {
    public static void main(String[] args) {
        String uri = "https://www.yiibai.com/jsoup/jsoup-quick-start.html";
        String html = HttpClientUtils.doGet(uri);
        // Parse the fetched HTML with jsoup.
        Document document = Jsoup.parse(html);

        // select("tagname") finds elements by tag name.
        Elements aTag = document.select("a");
        // select("#id") finds elements by id.
        Elements id = document.select("#qq-group");
        // select(".class") finds elements by class name.
        Elements class1 = document.select(".article-content");
        // select("[attribute]") finds elements that have the given attribute.
        Elements href = document.select("[href]");
        // select(":contains(text)") finds elements containing the given text (case-insensitive).
        Elements contains = document.select(":contains(JSoup安裝)");
        for (Element element : contains) {
            // Print the link of each matching element that has one.
            String href1 = element.attr("href");
            // Compare by content: != on Strings compares references, not text.
            if (!href1.isEmpty()) {
                System.out.println(href1);
            }
        }
        // select(":matches(regex)") finds elements whose text matches the regular expression;
        // "regex" here is a placeholder to be replaced with a real pattern.
        Elements select = document.select(":matches(regex)");
    }
}