文章出自:http://babyjoycry.javaeye.com/blog/587527 在此感謝原作者...\(^o^)/~
最近研究抓取網頁內容,發現要獲取頁面的編碼格式,Java沒有現成的實現方法,雖然csdn上有個達人寫了一篇文章,附有代碼,可惜,我沒有找到相關的包,不得已,只好自己動手豐衣足食了。
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import cpdetector.io.CodepageDetectorProxy;
import cpdetector.io.HTMLCodepageDetector;
import cpdetector.io.JChardetFacade;
/**
 * Determines the character encoding of a web page.
 *
 * <p>Three sources are consulted in order, and the first one that yields a
 * value wins: the {@code charset} parameter of the HTTP {@code Content-Type}
 * response header, a {@code charset} declaration inside a {@code <meta>} tag
 * of the markup, and finally byte-level analysis by the cpdetector library.
 * If none of them produces a result, {@code "GBK"} is reported.
 */
public class PageEncodeDetector {
    /** Encoding reported when every detection step fails. */
    private static final String DEFAULT_CHARSET = "GBK";

    /** Opening of the tags that are searched for a charset declaration. */
    private static final String META_OPEN = "<meta";

    /** Parameter name looked for in both the header value and the meta tags. */
    private static final String CHARSET_MARKER = "charset";

    /** Shared cpdetector proxy; the detectors it delegates to are registered below. */
    private static final CodepageDetectorProxy detector = CodepageDetectorProxy
            .getInstance();

    static {
        detector.add(new HTMLCodepageDetector(false));
        detector.add(JChardetFacade.getInstance());
    }

    /**
     * Manual smoke test: prints the detected encoding of a page.
     *
     * @param args optional; {@code args[0]} is the URL to inspect. Without
     *             arguments {@code http://www.baidu.com/} is used.
     */
    public static void main(String[] args) {
        String target = args.length > 0 ? args[0] : "http://www.baidu.com/";
        PageEncodeDetector web = new PageEncodeDetector();
        try {
            System.out.println(web.getCharset(target));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Detects the encoding of the page behind {@code strurl}.
     *
     * @param strurl page address; it must be an HTTP(S) URL such as
     *               {@code http://www.pujia.com}, because the connection is
     *               cast to {@link HttpURLConnection}
     * @return the charset name: lower-cased when taken from the header or a
     *         meta tag, the name reported by the detector when found by byte
     *         analysis, or {@code "GBK"} when nothing could be determined
     * @throws IOException if the URL is malformed or the connection cannot
     *                     be established
     */
    public String getCharset(String strurl) throws IOException {
        URL url = new URL(strurl);
        HttpURLConnection urlConnection = (HttpURLConnection) url
                .openConnection();
        String strencoding;
        String htmlcode;
        try {
            urlConnection.connect();

            // Step 1: the Content-Type response header. getContentType()
            // looks the header up case-insensitively and returns the raw
            // value (or null), so no List.toString() artifacts have to be
            // stripped afterwards.
            String contentType = urlConnection.getContentType();
            if (contentType != null) {
                strencoding = parseCharsetParam(contentType
                        .toLowerCase(java.util.Locale.ENGLISH));
                if (strencoding != null) {
                    return strencoding;
                }
            }

            // Step 2 needs the markup; read it from the connection that is
            // already open instead of requesting the page a second time.
            htmlcode = readBody(urlConnection);
        } finally {
            urlConnection.disconnect();
        }

        strencoding = findMetaCharset(htmlcode);
        if (strencoding != null) {
            return strencoding;
        }

        // Step 3: let cpdetector analyse the bytes of the page.
        strencoding = getFileEncoding(url);
        if (strencoding == null) {
            strencoding = DEFAULT_CHARSET;
        }
        return strencoding;
    }

    /**
     * Reads the response body as text. Reading is best effort: an I/O
     * failure is reported on {@code System.err} and whatever was received
     * up to that point is returned, so the caller can still fall back to
     * the remaining detection steps.
     */
    private static String readBody(HttpURLConnection connection) {
        StringBuilder body = new StringBuilder();
        InputStreamReader reader = null;
        try {
            // ISO-8859-1 maps every byte to exactly one char, so the ASCII
            // markup searched later survives whatever the real encoding is
            // and the result does not depend on the platform default.
            reader = new InputStreamReader(connection.getInputStream(),
                    "ISO-8859-1");
            char[] buffer = new char[4096];
            int count;
            while ((count = reader.read(buffer)) != -1) {
                body.append(buffer, 0, count);
            }
        } catch (IOException e) {
            System.err.println("Could not read page content from "
                    + connection.getURL() + ": " + e);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
        return body.toString();
    }

    /**
     * Scans every closed {@code <meta ...>} tag of {@code html}, in any
     * letter case, and returns the first charset declared in one of them,
     * or {@code null} if there is none.
     */
    private static String findMetaCharset(String html) {
        String lower = html.toLowerCase(java.util.Locale.ENGLISH);
        int begin = lower.indexOf(META_OPEN);
        while (begin != -1) {
            int end = lower.indexOf('>', begin);
            if (end == -1) {
                break; // unterminated tag at the end of the input
            }
            String charset = parseCharsetParam(lower.substring(begin, end));
            if (charset != null) {
                return charset;
            }
            // Resume after this tag; restarting at its own start would find
            // the same tag again and never terminate.
            begin = lower.indexOf(META_OPEN, end);
        }
        return null;
    }

    /**
     * Extracts the value of the first {@code charset=value} pair in
     * {@code text}. Whitespace around {@code '='} and one opening quote are
     * skipped, and the value ends at the first character that cannot be part
     * of a charset name, so trailing quotes, slashes or further attributes
     * are not glued onto the result.
     *
     * @param text lower-cased header value or tag text
     * @return the charset value, or {@code null} if no non-empty value exists
     */
    private static String parseCharsetParam(String text) {
        int from = 0;
        while (true) {
            int at = text.indexOf(CHARSET_MARKER, from);
            if (at == -1) {
                return null;
            }
            int pos = skipWhitespace(text, at + CHARSET_MARKER.length());
            if (pos < text.length() && text.charAt(pos) == '=') {
                pos = skipWhitespace(text, pos + 1);
                if (pos < text.length()
                        && (text.charAt(pos) == '"' || text.charAt(pos) == '\'')) {
                    pos++;
                }
                int stop = pos;
                while (stop < text.length()
                        && isCharsetNameChar(text.charAt(stop))) {
                    stop++;
                }
                if (stop > pos) {
                    return text.substring(pos, stop);
                }
            }
            // "charset" was not followed by a usable value; keep looking.
            from = at + CHARSET_MARKER.length();
        }
    }

    /** Returns the index of the first non-whitespace character at or after {@code pos}. */
    private static int skipWhitespace(String text, int pos) {
        while (pos < text.length() && Character.isWhitespace(text.charAt(pos))) {
            pos++;
        }
        return pos;
    }

    /** Tells whether {@code c} may appear in a charset name (ASCII letters, digits, {@code - _ . : +}). */
    private static boolean isCharsetNameChar(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
                || (c >= '0' && c <= '9')
                || c == '-' || c == '_' || c == '.' || c == ':' || c == '+';
    }

    /**
     * Detects the encoding of a page by handing its URL to the cpdetector
     * proxy. A detection failure is reported on {@code System.out} and
     * treated as "unknown".
     *
     * @param url address of the page to analyse
     * @return the name of the detected charset, or {@code null} if detection
     *         failed or produced no result
     */
    public static String getFileEncoding(URL url) {
        java.nio.charset.Charset charset = null;
        try {
            charset = detector.detectCodepage(url);
        } catch (Exception e) {
            System.out.println(e.getClass() + "分析編碼失敗");
        }
        if (charset != null) {
            return charset.name();
        }
        return null;
    }
}
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import cpdetector.io.CodepageDetectorProxy;
import cpdetector.io.HTMLCodepageDetector;
import cpdetector.io.JChardetFacade;
/**
 * Determines the character encoding of a web page.
 *
 * <p>Three sources are consulted in order, and the first one that yields a
 * value wins: the {@code charset} parameter of the HTTP {@code Content-Type}
 * response header, a {@code charset} declaration inside a {@code <meta>} tag
 * of the markup, and finally byte-level analysis by the cpdetector library.
 * If none of them produces a result, {@code "GBK"} is reported.
 */
public class PageEncodeDetector {
    /** Encoding reported when every detection step fails. */
    private static final String DEFAULT_CHARSET = "GBK";

    /** Opening of the tags that are searched for a charset declaration. */
    private static final String META_OPEN = "<meta";

    /** Parameter name looked for in both the header value and the meta tags. */
    private static final String CHARSET_MARKER = "charset";

    /** Shared cpdetector proxy; the detectors it delegates to are registered below. */
    private static final CodepageDetectorProxy detector = CodepageDetectorProxy
            .getInstance();

    static {
        detector.add(new HTMLCodepageDetector(false));
        detector.add(JChardetFacade.getInstance());
    }

    /**
     * Manual smoke test: prints the detected encoding of a page.
     *
     * @param args optional; {@code args[0]} is the URL to inspect. Without
     *             arguments {@code http://www.baidu.com/} is used.
     */
    public static void main(String[] args) {
        String target = args.length > 0 ? args[0] : "http://www.baidu.com/";
        PageEncodeDetector web = new PageEncodeDetector();
        try {
            System.out.println(web.getCharset(target));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Detects the encoding of the page behind {@code strurl}.
     *
     * @param strurl page address; it must be an HTTP(S) URL such as
     *               {@code http://www.pujia.com}, because the connection is
     *               cast to {@link HttpURLConnection}
     * @return the charset name: lower-cased when taken from the header or a
     *         meta tag, the name reported by the detector when found by byte
     *         analysis, or {@code "GBK"} when nothing could be determined
     * @throws IOException if the URL is malformed or the connection cannot
     *                     be established
     */
    public String getCharset(String strurl) throws IOException {
        URL url = new URL(strurl);
        HttpURLConnection urlConnection = (HttpURLConnection) url
                .openConnection();
        String strencoding;
        String htmlcode;
        try {
            urlConnection.connect();

            // Step 1: the Content-Type response header. getContentType()
            // looks the header up case-insensitively and returns the raw
            // value (or null), so no List.toString() artifacts have to be
            // stripped afterwards.
            String contentType = urlConnection.getContentType();
            if (contentType != null) {
                strencoding = parseCharsetParam(contentType
                        .toLowerCase(java.util.Locale.ENGLISH));
                if (strencoding != null) {
                    return strencoding;
                }
            }

            // Step 2 needs the markup; read it from the connection that is
            // already open instead of requesting the page a second time.
            htmlcode = readBody(urlConnection);
        } finally {
            urlConnection.disconnect();
        }

        strencoding = findMetaCharset(htmlcode);
        if (strencoding != null) {
            return strencoding;
        }

        // Step 3: let cpdetector analyse the bytes of the page.
        strencoding = getFileEncoding(url);
        if (strencoding == null) {
            strencoding = DEFAULT_CHARSET;
        }
        return strencoding;
    }

    /**
     * Reads the response body as text. Reading is best effort: an I/O
     * failure is reported on {@code System.err} and whatever was received
     * up to that point is returned, so the caller can still fall back to
     * the remaining detection steps.
     */
    private static String readBody(HttpURLConnection connection) {
        StringBuilder body = new StringBuilder();
        InputStreamReader reader = null;
        try {
            // ISO-8859-1 maps every byte to exactly one char, so the ASCII
            // markup searched later survives whatever the real encoding is
            // and the result does not depend on the platform default.
            reader = new InputStreamReader(connection.getInputStream(),
                    "ISO-8859-1");
            char[] buffer = new char[4096];
            int count;
            while ((count = reader.read(buffer)) != -1) {
                body.append(buffer, 0, count);
            }
        } catch (IOException e) {
            System.err.println("Could not read page content from "
                    + connection.getURL() + ": " + e);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
        return body.toString();
    }

    /**
     * Scans every closed {@code <meta ...>} tag of {@code html}, in any
     * letter case, and returns the first charset declared in one of them,
     * or {@code null} if there is none.
     */
    private static String findMetaCharset(String html) {
        String lower = html.toLowerCase(java.util.Locale.ENGLISH);
        int begin = lower.indexOf(META_OPEN);
        while (begin != -1) {
            int end = lower.indexOf('>', begin);
            if (end == -1) {
                break; // unterminated tag at the end of the input
            }
            String charset = parseCharsetParam(lower.substring(begin, end));
            if (charset != null) {
                return charset;
            }
            // Resume after this tag; restarting at its own start would find
            // the same tag again and never terminate.
            begin = lower.indexOf(META_OPEN, end);
        }
        return null;
    }

    /**
     * Extracts the value of the first {@code charset=value} pair in
     * {@code text}. Whitespace around {@code '='} and one opening quote are
     * skipped, and the value ends at the first character that cannot be part
     * of a charset name, so trailing quotes, slashes or further attributes
     * are not glued onto the result.
     *
     * @param text lower-cased header value or tag text
     * @return the charset value, or {@code null} if no non-empty value exists
     */
    private static String parseCharsetParam(String text) {
        int from = 0;
        while (true) {
            int at = text.indexOf(CHARSET_MARKER, from);
            if (at == -1) {
                return null;
            }
            int pos = skipWhitespace(text, at + CHARSET_MARKER.length());
            if (pos < text.length() && text.charAt(pos) == '=') {
                pos = skipWhitespace(text, pos + 1);
                if (pos < text.length()
                        && (text.charAt(pos) == '"' || text.charAt(pos) == '\'')) {
                    pos++;
                }
                int stop = pos;
                while (stop < text.length()
                        && isCharsetNameChar(text.charAt(stop))) {
                    stop++;
                }
                if (stop > pos) {
                    return text.substring(pos, stop);
                }
            }
            // "charset" was not followed by a usable value; keep looking.
            from = at + CHARSET_MARKER.length();
        }
    }

    /** Returns the index of the first non-whitespace character at or after {@code pos}. */
    private static int skipWhitespace(String text, int pos) {
        while (pos < text.length() && Character.isWhitespace(text.charAt(pos))) {
            pos++;
        }
        return pos;
    }

    /** Tells whether {@code c} may appear in a charset name (ASCII letters, digits, {@code - _ . : +}). */
    private static boolean isCharsetNameChar(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
                || (c >= '0' && c <= '9')
                || c == '-' || c == '_' || c == '.' || c == ':' || c == '+';
    }

    /**
     * Detects the encoding of a page by handing its URL to the cpdetector
     * proxy. A detection failure is reported on {@code System.out} and
     * treated as "unknown".
     *
     * @param url address of the page to analyse
     * @return the name of the detected charset, or {@code null} if detection
     *         failed or produced no result
     */
    public static String getFileEncoding(URL url) {
        java.nio.charset.Charset charset = null;
        try {
            charset = detector.detectCodepage(url);
        } catch (Exception e) {
            System.out.println(e.getClass() + "分析編碼失敗");
        }
        if (charset != null) {
            return charset.name();
        }
        return null;
    }
}
執行前需要下載 cpdetector_1.0.5.jar 和 chardet.jar,並將這兩個 jar 加入 classpath。