根據url讀取html有兩種方式
1.HttpURLConnection
2.Jsoup
兩種方式的maven依賴:
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpcore -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.4.10</version><!-- httpclient 4.5.6 depends on httpcore 4.4.10; keep them aligned -->
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.6</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpmime</artifactId>
<version>4.5.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.google.guava/guava -->
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>27.0.1-jre</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
兩種方式的代碼實現
兩種讀取方式及部分其他方法:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
/**
 * Utilities for fetching HTML from a URL — either via {@link HttpURLConnection}
 * or via Jsoup — and for post-processing the result (absolutizing {@code img src}
 * attributes, stripping font-related inline styles).
 */
public class ReadHTMLByUrl {
	/**
	 * Reads the raw HTML of a page via {@link HttpURLConnection}.
	 *
	 * @param u        the page URL as a string
	 * @param encoding the character encoding used to decode the response body
	 * @return the page content; line terminators are dropped (lines are concatenated)
	 * @throws Exception if the URL is malformed or the connection/read fails
	 */
	public static String readFile(String u, String encoding) throws Exception {
		StringBuilder html = new StringBuilder();
		URL url = new URL(u); // build a URL object from the string form
		HttpURLConnection urlConnection = (HttpURLConnection) url
				.openConnection(); // open the connection
		// try-with-resources closes the reader/stream; the original leaked it
		try (BufferedReader reader = new BufferedReader(new InputStreamReader(
				urlConnection.getInputStream(), encoding))) {
			String line;
			while ((line = reader.readLine()) != null) {
				html.append(line);
			}
		} finally {
			urlConnection.disconnect();
		}
		return html.toString();
	}
	/**
	 * Fetches a page with Jsoup and returns the HTML inside its {@code <body>}.
	 * On an initial I/O failure, retries up to 3 times, sleeping 5 seconds
	 * before each attempt.
	 *
	 * @param url the page URL
	 * @return the body HTML, or {@code ""} if the page could not be fetched
	 */
	public static String JsoupBodyHtml(String url){
		Document doc = null;
		try {
			doc = Jsoup.connect(url).get();
		} catch (IOException e) {
			for (int i = 0; i < 3; i++) {
				try {
					Thread.sleep(5 * 1000L);
					doc = Jsoup.connect(url).get();
					// connection succeeded — stop retrying
					break;
				} catch (InterruptedException e1) {
					// restore the interrupt flag and stop retrying immediately
					Thread.currentThread().interrupt();
					break;
				} catch (IOException e1) {
					// still failing — keep retrying until attempts are exhausted
				}
			}
		}
		return doc == null ? "" : doc.body() == null ? "" : doc.body().html();
	}
	/**
	 * Fetches a page with Jsoup and returns the whole document as an HTML string.
	 * On an initial I/O failure, waits 20 seconds and retries once.
	 *
	 * @param url the page URL
	 * @return the document HTML, or {@code ""} if the page could not be fetched
	 */
	public static String JsoupHtml(String url) {
		Document doc = null;
		try {
			doc = Jsoup.connect(url).get();
		} catch (IOException e) {
			e.printStackTrace();
			try {
				Thread.sleep(20 * 1000L);
				doc = Jsoup.connect(url).get();
			} catch (IOException e1) {
				e1.printStackTrace();
			} catch (InterruptedException e1) {
				// restore the interrupt flag so callers can observe it
				Thread.currentThread().interrupt();
			}
		}
		return doc == null ? "" : doc.toString();
	}
	/**
	 * Fetches a page, rewrites every relative {@code img src} to an absolute URL
	 * (prefixed with the page's directory URL), and removes any {@code style}
	 * attribute containing a font setting.
	 *
	 * @param url the page URL
	 * @return the rewritten body HTML, or {@code ""} on any failure
	 */
	public static String reWriteHtml(String url) {
		try {
			// directory portion of the URL, used to absolutize relative paths
			String domainUrl = url.substring(0, url.lastIndexOf("/") + 1);
			Document doc = Jsoup.connect(url).get();
			Element body = doc.body();
			// select()/getAllElements() never return null, so no guard is needed
			for (Element img : body.select("img")) {
				String src = img.attr("src");
				if (!src.startsWith("http")) {
					// relative path -> absolute path
					img.attr("src", domainUrl + src);
				}
			}
			// strip inline styles that carry font settings
			for (Element e : body.getAllElements()) {
				if (e.attr("style").contains("font")) {
					e.removeAttr("style");
				}
			}
			return body.html();
		} catch (Exception e) {
			e.printStackTrace();
			return "";
		}
	}
	/**
	 * Fetches the body HTML of {@code url} and rewrites every non-absolute
	 * {@code img src} by prefixing it with the page's directory URL.
	 *
	 * @param url the page URL; its directory portion becomes the src prefix
	 * @return the rewritten body HTML, or {@code ""} if the fetch failed
	 */
	public static String repairContent(String url){
		String content = JsoupBodyHtml(url);
		if ("".equals(content)) {
			return "";
		}
		// prefix to prepend to each relative img src
		String replaceHttp = url.substring(0, url.lastIndexOf("/") + 1);
		String patternStr = "<img\\s*([^>]*)\\s*src=\\\"(.*?)\\\"\\s*([^>]*)>";
		content = replSrc(content, replaceHttp, patternStr);
		return content;
	}
	/**
	 * Replaces every relative img src matched by {@code patternStr} in
	 * {@code content} with {@code replaceHttp + src}.
	 *
	 * @param content     the HTML to rewrite
	 * @param replaceHttp the prefix to prepend to relative src values
	 * @param patternStr  regex whose group 2 captures the src value
	 * @return the rewritten HTML
	 */
	private static String replSrc(String content, String replaceHttp, String patternStr) {
		Pattern pattern = Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
		Matcher matcher = pattern.matcher(content);
		// collect matches first, de-duplicated by key, to avoid replacing twice
		Map<String, String> repMap = new HashMap<String, String>();
		while (matcher.find()) {
			String src = matcher.group(2);
			// only rewrite src values that are not already absolute
			if (!src.startsWith("http://") && !src.startsWith("https://")) {
				if (!repMap.containsKey(src)) {
					repMap.put(src, replaceHttp + src);
				}
			}
		}
		// apply the de-duplicated replacements
		for (Map.Entry<String, String> entry : repMap.entrySet()) {
			content = content.replace(entry.getKey(), entry.getValue());
		}
		return content;
	}
	/**
	 * NOTE(review): unused private duplicate of {@link #replSrc}; the
	 * {@code pattern} parameter is never read. Kept for source compatibility —
	 * consider deleting it.
	 *
	 * @param content     the HTML to rewrite
	 * @param replaceHttp the prefix to prepend to relative src values
	 * @param pattern     unused
	 * @param matcher     matcher whose group 2 captures the src value
	 * @return the rewritten HTML
	 */
	private static String matchSrc(String content, String replaceHttp, Pattern pattern, Matcher matcher) {
		Map<String, String> repMap = new HashMap<String, String>();
		// collect matches, de-duplicated by key, to avoid double replacement
		while (matcher.find()) {
			String src = matcher.group(2);
			if (!src.startsWith("http://") && !src.startsWith("https://")) {
				if (!repMap.containsKey(src)) {
					repMap.put(src, replaceHttp + src);
				}
			}
		}
		for (Map.Entry<String, String> entry : repMap.entrySet()) {
			content = content.replace(entry.getKey(), entry.getValue());
		}
		return content;
	}
	/**
	 * Collects the second capture group of every match of {@code regex} in
	 * {@code source}. The regex must define at least two capture groups.
	 *
	 * @param regex  the pattern to match (group 2 is extracted)
	 * @param source the text to scan
	 * @return the list of group-2 values, possibly empty
	 */
	private static List<String> getMatchers(String regex, String source){
		Pattern pattern = Pattern.compile(regex);
		Matcher matcher = pattern.matcher(source);
		List<String> list = new ArrayList<String>();
		while (matcher.find()) {
			list.add(matcher.group(2));
		}
		return list;
	}
}