根據url讀取html文件

根據url讀取html有兩種方式

1.HttpURLConnection
2.Jsoup
兩種方式的maven依賴(HttpURLConnection 屬於 JDK 標準庫,無需額外依賴;下方代碼實際只 import 了 org.jsoup):

	<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpcore -->
	<!-- NOTE(review): none of the org.apache.httpcomponents artifacts below are imported by the ReadHTMLByUrl class in this file; its HttpURLConnection code path uses JDK classes only. -->
		<dependency>
			<groupId>org.apache.httpcomponents</groupId>
			<artifactId>httpcore</artifactId>
			<version>4.4.5</version>
		</dependency>
		<dependency>
			<groupId>org.apache.httpcomponents</groupId>
			<artifactId>httpclient</artifactId>
			<version>4.5.6</version>
		</dependency>
		<!-- NOTE(review): httpmime 4.5.2 differs from httpclient 4.5.6 above; presumably the two should track the same release. Confirm before use. -->
		<dependency>
			<groupId>org.apache.httpcomponents</groupId>
			<artifactId>httpmime</artifactId>
			<version>4.5.2</version>
		</dependency>
		<!-- https://mvnrepository.com/artifact/com.google.guava/guava -->
		<!-- NOTE(review): guava is not imported by the ReadHTMLByUrl class in this file either. -->
		<dependency>
			<groupId>com.google.guava</groupId>
			<artifactId>guava</artifactId>
			<version>27.0.1-jre</version>
		</dependency>
		<!-- jsoup is the only third-party library the ReadHTMLByUrl class imports (org.jsoup.Jsoup, org.jsoup.nodes.Document, org.jsoup.nodes.Element). -->
		<dependency>
		    <groupId>org.jsoup</groupId>
		    <artifactId>jsoup</artifactId>
		    <version>1.11.3</version>
		</dependency>

兩種方式的代碼實現

兩種讀取方式及部分其他方法:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;


/**
 * Static helpers that download an HTML page from a URL, either through the JDK's
 * {@link HttpURLConnection} or through jsoup, plus helpers that rewrite relative
 * {@code <img src>} values in the downloaded markup into absolute URLs.
 */
public class ReadHTMLByUrl {

    /** Number of additional attempts {@link #JsoupBodyHtml(String)} makes after the first failure. */
    private static final int BODY_MAX_RETRIES = 3;

    /** Pause before each additional attempt made by {@link #JsoupBodyHtml(String)}. */
    private static final long BODY_RETRY_DELAY_MILLIS = 5 * 1000L;

    /** Number of additional attempts {@link #JsoupHtml(String)} makes after the first failure. */
    private static final int HTML_MAX_RETRIES = 1;

    /** Pause before the additional attempt made by {@link #JsoupHtml(String)}. */
    private static final long HTML_RETRY_DELAY_MILLIS = 20 * 1000L;

    /**
     * Matches an {@code <img ...>} tag with a double-quoted {@code src} attribute; group 2 is
     * the attribute value. Compiled once because {@link Pattern} is immutable and reusable.
     */
    private static final Pattern IMG_SRC_PATTERN = Pattern.compile(
            "<img\\s*([^>]*)\\s*src=\\\"(.*?)\\\"\\s*([^>]*)>", Pattern.CASE_INSENSITIVE);

    /**
     * Downloads the resource at {@code u} with {@link HttpURLConnection} and returns its text.
     *
     * <p>The response is read line by line and the lines are concatenated without any
     * separator, so line terminators are not part of the result.
     * NOTE(review): dropping line terminators can glue together markup that was split across
     * lines; this is kept as-is because callers may rely on the single-line result.
     *
     * @param u        the URL to read; must use a protocol whose connection is an
     *                 {@link HttpURLConnection}, otherwise the cast fails with a
     *                 {@link ClassCastException}
     * @param encoding name of the charset used to decode the response bytes
     * @return the decoded response with line terminators removed
     * @throws Exception if the URL is malformed, the charset name is not supported, or the
     *                   connection or read fails
     */
    public static String readFile(String u, String encoding) throws Exception {
        URL url = new URL(u);
        HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
        // try-with-resources closes the reader (and the underlying stream) on every path.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(urlConnection.getInputStream(), encoding))) {
            StringBuilder html = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line);
            }
            return html.toString();
        } finally {
            // Also covers the case where the reader could not be constructed after the
            // stream had already been opened (e.g. unsupported charset name).
            urlConnection.disconnect();
        }
    }

    /**
     * Fetches the page at {@code url} with jsoup and returns the inner HTML of its
     * {@code <body>}.
     *
     * <p>If the first request fails with an {@link IOException}, up to three more attempts are
     * made, each preceded by a five second pause. Failures are not reported; an empty string
     * is returned instead. If the calling thread is interrupted while waiting, retrying stops
     * and the thread's interrupt flag is restored.
     *
     * @param url the page URL
     * @return the body's inner HTML, or {@code ""} if the page could not be fetched or has no
     *         body
     */
    public static String JsoupBodyHtml(String url) {
        Document doc = fetchWithRetry(url, BODY_MAX_RETRIES, BODY_RETRY_DELAY_MILLIS, false);
        if (doc == null || doc.body() == null) {
            return "";
        }
        return doc.body().html();
    }

    /**
     * Fetches the page at {@code url} with jsoup and returns the whole document as a string.
     *
     * <p>If the first request fails with an {@link IOException}, one more attempt is made after
     * a twenty second pause. Every failure's stack trace is printed to standard error. If the
     * calling thread is interrupted while waiting, the thread's interrupt flag is restored.
     *
     * @param url the page URL
     * @return the document's string form, or {@code ""} if the page could not be fetched
     */
    public static String JsoupHtml(String url) {
        Document doc = fetchWithRetry(url, HTML_MAX_RETRIES, HTML_RETRY_DELAY_MILLIS, true);
        return doc == null ? "" : doc.toString();
    }

    /**
     * Fetches the page at {@code url}, rewrites every non-empty {@code <img src>} to an
     * absolute URL, removes the {@code style} attribute from every element whose inline style
     * contains the (case-sensitive) text {@code font}, and returns the resulting body HTML.
     *
     * <p>This method never throws: any failure is printed to standard error and an empty
     * string is returned.
     *
     * @param url the page URL; also used as the base for resolving relative image sources
     * @return the rewritten body's inner HTML, or {@code ""} on any failure
     */
    public static String reWriteHtml(String url) {
        try {
            Document doc = Jsoup.connect(url).get();
            Element body = doc.body();

            for (Element img : body.select("img")) {
                String src = img.attr("src");
                // attr() yields "" for a missing attribute; do not invent a src in that case.
                if (!src.isEmpty()) {
                    img.attr("src", toAbsoluteUrl(url, src));
                }
            }

            for (Element element : body.getAllElements()) {
                if (element.attr("style").contains("font")) {
                    element.removeAttr("style");
                }
            }
            return body.html();
        } catch (Exception e) {
            // Deliberately broad: callers rely on getting "" instead of an exception.
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Fetches the body HTML of the page at {@code url} (see {@link #JsoupBodyHtml(String)}) and
     * rewrites the double-quoted {@code src} of its {@code <img>} tags to absolute URLs.
     *
     * @param url the page URL; also used as the base for resolving relative image sources
     * @return the body HTML with image sources made absolute, or {@code ""} if the body could
     *         not be fetched or is empty
     */
    public static String repairContent(String url) {
        String content = JsoupBodyHtml(url);
        if ("".equals(content)) {
            return "";
        }
        return replSrc(content, url, IMG_SRC_PATTERN);
    }

    /**
     * Returns {@code content} with capture group 2 of every {@code pattern} match replaced by
     * its absolute form.
     *
     * <p>Only the matched group's own character range is replaced, so identical text elsewhere
     * in {@code content} (for example inside an already absolute URL) is left alone.
     *
     * @param content the HTML text to rewrite
     * @param baseUrl the URL against which relative sources are resolved
     * @param pattern a pattern whose group 2 captures the source value
     * @return the rewritten text
     */
    private static String replSrc(String content, String baseUrl, Pattern pattern) {
        Matcher matcher = pattern.matcher(content);
        StringBuilder rewritten = new StringBuilder(content.length() + 64);
        int copiedUpTo = 0;
        while (matcher.find()) {
            rewritten.append(content, copiedUpTo, matcher.start(2));
            rewritten.append(toAbsoluteUrl(baseUrl, matcher.group(2)));
            copiedUpTo = matcher.end(2);
        }
        rewritten.append(content, copiedUpTo, content.length());
        return rewritten.toString();
    }

    /**
     * Collects capture group 2 of every match of {@code regex} in {@code source}.
     *
     * @param regex  a regular expression that declares at least two capture groups; otherwise
     *               {@link Matcher#group(int)} throws {@link IndexOutOfBoundsException}
     * @param source the text to search
     * @return the group-2 values in match order; an entry is {@code null} when group 2 did not
     *         take part in that match
     */
    private static List<String> getMatchers(String regex, String source) {
        Matcher matcher = Pattern.compile(regex).matcher(source);
        List<String> list = new ArrayList<String>();
        while (matcher.find()) {
            list.add(matcher.group(2));
        }
        return list;
    }

    /**
     * Requests {@code url} with jsoup, retrying after an {@link IOException}.
     *
     * @param url           the page URL
     * @param retries       how many additional attempts to make after the first failure
     * @param delayMillis   how long to sleep before each additional attempt
     * @param printFailures whether to print each failure's stack trace to standard error
     * @return the parsed document, or {@code null} if every attempt failed or the thread was
     *         interrupted while waiting
     */
    private static Document fetchWithRetry(String url, int retries, long delayMillis,
            boolean printFailures) {
        IOException firstFailure;
        try {
            return Jsoup.connect(url).get();
        } catch (IOException e) {
            firstFailure = e;
        }

        Document doc = null;
        for (int attempt = 0; attempt < retries && doc == null; attempt++) {
            try {
                Thread.sleep(delayMillis);
                doc = Jsoup.connect(url).get();
            } catch (InterruptedException e) {
                // Restore the flag so the caller can see the interruption, and stop waiting.
                Thread.currentThread().interrupt();
                if (printFailures) {
                    e.printStackTrace();
                }
                break;
            } catch (IOException e) {
                // Best effort: fall through to the next attempt (if any).
                if (printFailures) {
                    e.printStackTrace();
                }
            }
        }
        if (printFailures) {
            firstFailure.printStackTrace();
        }
        return doc;
    }

    /**
     * Resolves {@code src} against {@code baseUrl}.
     *
     * <p>Empty values and values that already start with {@code http://} or {@code https://}
     * are returned unchanged. Values that {@link URL} cannot resolve (for example
     * {@code data:} URIs, or any value when {@code baseUrl} itself is malformed) are also
     * returned unchanged rather than being corrupted.
     *
     * @param baseUrl the URL of the page that contains the reference
     * @param src     the reference found in the page
     * @return the absolute URL, or {@code src} itself when no resolution is needed or possible
     */
    private static String toAbsoluteUrl(String baseUrl, String src) {
        if (src.isEmpty() || src.startsWith("http://") || src.startsWith("https://")) {
            return src;
        }
        try {
            return new URL(new URL(baseUrl), src).toString();
        } catch (MalformedURLException e) {
            return src;
        }
    }
}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章