java獲取頁面編碼(Z)

文章出自：http://babyjoycry.javaeye.com/blog/587527 在此感謝原作者...\(^o^)/~

最近研究抓取網頁內容，發現要獲取頁面的編碼格式，Java沒有現成的實現方法，雖然csdn上有個達人寫了一篇文章，附有代碼，可惜，我沒有找到相關的包，不得已，只好自己動手豐衣足食了。

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import cpdetector.io.CodepageDetectorProxy;
import cpdetector.io.HTMLCodepageDetector;
import cpdetector.io.JChardetFacade;

/**
 * Determines the character encoding of a web page.
 *
 * <p>Three sources are consulted in order, and the first one that yields a
 * value wins:
 * <ol>
 *   <li>the {@code charset} parameter of the {@code Content-Type} response header,</li>
 *   <li>a {@code charset=...} declaration inside a {@code <meta>} tag of the page,</li>
 *   <li>analysis of the page content by the cpdetector library.</li>
 * </ol>
 * If all three fail, {@code "GBK"} is returned.
 */
public class PageEncodeDetector {
  /** Encoding reported when no other source yields one. */
  private static final String DEFAULT_CHARSET = "GBK";

  /** Matches one complete {@code <meta ...>} tag, in any letter case. */
  private static final Pattern META_TAG = Pattern.compile("<meta\\b[^>]*>",
      Pattern.CASE_INSENSITIVE);

  /**
   * Captures the value of a {@code charset=value} parameter. The value may be
   * quoted and ends at whitespace, a quote, or one of {@code / > ; ,}.
   */
  private static final Pattern CHARSET_PARAM = Pattern.compile(
      "\\bcharset\\s*=\\s*[\"']?\\s*([^\\s\"'/>;,]+)",
      Pattern.CASE_INSENSITIVE);

  private static final CodepageDetectorProxy detector = CodepageDetectorProxy
      .getInstance();

  static {
    detector.add(new HTMLCodepageDetector(false));
    detector.add(JChardetFacade.getInstance());
  }

  /**
   * Manual test: prints the encoding detected for a sample page.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    PageEncodeDetector web = new PageEncodeDetector();
    try {
      System.out.println(web.getCharset("http://www.baidu.com/"));
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  /**
   * Returns the character encoding of the page at the given address.
   *
   * <p>Values taken from the response header or from a {@code <meta>} tag are
   * returned in lower case. Failure to download the page body is reported on
   * {@code System.err} and detection then continues with the remaining
   * strategies.
   *
   * @param strurl
   *            page URL including the protocol, e.g. http://www.pujia.com
   * @return the detected encoding name, or {@code "GBK"} when none could be
   *         determined
   * @throws IOException
   *             if the URL is malformed or the connection cannot be opened
   */
  public String getCharset(String strurl) throws IOException {
    URL url = new URL(strurl);
    URLConnection connection = url.openConnection();
    try {
      connection.connect();

      // 1. Content-Type response header (looked up by the connection itself,
      // so the letter case of the header name does not matter).
      String fromHeader = extractCharset(connection.getContentType());
      if (fromHeader != null) {
        return fromHeader;
      }

      // 2. <meta> declaration inside the page. The body is read from the
      // connection that is already open rather than requesting the page again.
      String fromMeta = findMetaCharset(readBody(connection));
      if (fromMeta != null) {
        return fromMeta;
      }
    } finally {
      if (connection instanceof HttpURLConnection) {
        ((HttpURLConnection) connection).disconnect();
      }
    }

    // 3. Content analysis, then the fixed default.
    String detected = getFileEncoding(url);
    return detected != null ? detected : DEFAULT_CHARSET;
  }

  /**
   * Reads the response body as text. Bytes are decoded as ISO-8859-1 so that
   * every byte maps to exactly one character and ASCII markup such as
   * {@code <meta>} stays intact whatever the real encoding is. On an I/O
   * error the text read so far is returned.
   */
  private static String readBody(URLConnection connection) {
    StringBuilder html = new StringBuilder();
    BufferedReader reader = null;
    try {
      reader = new BufferedReader(new InputStreamReader(
          connection.getInputStream(), "ISO-8859-1"));
      String line;
      while ((line = reader.readLine()) != null) {
        // Keep a separator so tokens on adjacent lines do not run together.
        html.append(line).append('\n');
      }
    } catch (IOException e) {
      // Best effort: the remaining detection strategies can still succeed.
      System.err.println(e);
    } finally {
      if (reader != null) {
        try {
          reader.close();
        } catch (IOException ignored) {
          // Nothing useful can be done if closing fails.
        }
      }
    }
    return html.toString();
  }

  /**
   * Returns the charset declared by the first {@code <meta>} tag that carries
   * a {@code charset=} parameter, or {@code null} if no tag declares one.
   */
  private static String findMetaCharset(String html) {
    Matcher tag = META_TAG.matcher(html);
    while (tag.find()) {
      String charset = extractCharset(tag.group());
      if (charset != null) {
        return charset;
      }
    }
    return null;
  }

  /**
   * Returns the lower-cased value of the first {@code charset=} parameter in
   * {@code text}, or {@code null} if {@code text} is {@code null} or has none.
   */
  private static String extractCharset(String text) {
    if (text == null) {
      return null;
    }
    Matcher param = CHARSET_PARAM.matcher(text);
    if (!param.find()) {
      return null;
    }
    // Fixed locale: the default locale could alter ASCII letters (e.g. Turkish 'I').
    return param.group(1).toLowerCase(Locale.ENGLISH);
  }

  /**
   * Identifies the encoding of a page by handing its URL to the cpdetector
   * detectors registered in the static initializer.
   *
   * @param url
   *            address of the page to analyse
   * @return the name of the detected charset, or {@code null} if detection
   *         failed or produced no result
   */
  public static String getFileEncoding(URL url) {
    java.nio.charset.Charset charset = null;
    try {
      charset = detector.detectCodepage(url);
    } catch (Exception e) {
      // Best effort: the caller treats null as "unknown".
      System.out.println(e.getClass() + "分析" + "編碼失敗");
    }
    if (charset != null) {
      return charset.name();
    }
    return null;
  }

}

需要下載cpdetector_1.0.5.jar 和 chardet.jar

java獲取頁面編碼(Z)

python gdal 安裝使用（Windows， python 3.6.8）

[基礎]JDBC連接MySQL

MySQL5中文亂碼解決(Z)

[基礎]MySQL中的數據類型

MySQL 基本命令

我的友情鏈接

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結