1. 首先下載三個 jar 包:chardet.jar、cpdetector_1.0.10.jar、antlr-2.7.2.jar,並將它們加入專案的 classpath。
2. 然後編寫如下工具類:
/*
* Copyright (C) 2015-2020 LianShi Inc.All Rights Reserved.
* Description:TODO
* @author diaowj:2016-5-4
*/
/**
*
*/
package com.enation.app.shop.core.action.backend;
import java.io.File;
import java.nio.charset.Charset;
import info.monitorenter.cpdetector.io.ASCIIDetector;
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.JChardetFacade;
import info.monitorenter.cpdetector.io.UnicodeDetector;
/**
* 描述
* @author diaowj
*/
import info.monitorenter.cpdetector.io.ParsingDetector;
/**
 * Utility that guesses the character encoding of a file with the third-party,
 * open-source cpdetector library.
 *
 * <p>Detection is heuristic; per the library's own description it is based on
 * statistics and is not guaranteed to be correct.
 *
 * @author diaowj
 */
public class FileCharsetDetector {

    /** Encoding name returned when detection fails or yields nothing usable. */
    private static final String DEFAULT_CHARSET_NAME = "GBK";

    /** Canonical name the detector reports for plain 7-bit ASCII content. */
    private static final String US_ASCII_NAME = "US-ASCII";

    /**
     * Name returned to callers in place of {@code US-ASCII}.
     * NOTE(review): presumably chosen because ISO-8859-1 is a superset of
     * ASCII; the original code does not state the reason.
     */
    private static final String ASCII_REPLACEMENT_NAME = "ISO_8859_1";

    /** File inspected by {@link #main(String[])} when no argument is given. */
    private static final String SAMPLE_FILE_PATH = "D:/商品抓取/碧雲天/detail/SC0123-25mg.txt";

    // Fully qualified so that this class needs no change to the file's import block.
    private static final java.util.logging.Logger LOG =
            java.util.logging.Logger.getLogger(FileCharsetDetector.class.getName());

    /**
     * Lazily builds the detector chain exactly once.
     *
     * <p>{@code CodepageDetectorProxy.getInstance()} is a shared instance, so
     * adding the detectors on every call (as this class used to do) keeps
     * adding a fresh {@code ParsingDetector} to it and mutates it while other
     * threads may be detecting. Registering inside a holder class limits the
     * registration to a single, class-initialization-guarded pass.
     * NOTE(review): other code that also adds detectors to the shared proxy
     * would still affect the chain used here.
     */
    private static final class DetectorHolder {

        static final CodepageDetectorProxy DETECTOR = createDetector();

        private DetectorHolder() {
        }

        /**
         * Registers the built-in detector implementations on the proxy.
         * The proxy uses the result of whichever detector is first to return
         * a non-null answer, so the registration order below matters.
         */
        private static CodepageDetectorProxy createDetector() {
            CodepageDetectorProxy proxy = CodepageDetectorProxy.getInstance();
            proxy.add(new ParsingDetector(false));
            proxy.add(UnicodeDetector.getInstance());
            // JChardetFacade internally uses classes from chardet.jar.
            proxy.add(JChardetFacade.getInstance());
            proxy.add(ASCIIDetector.getInstance());
            return proxy;
        }
    }

    /**
     * Detects the encoding of the given file using cpdetector.
     *
     * <p>This method is best-effort and never throws for a detection failure:
     * any exception raised while detecting is logged and the default encoding
     * is returned instead.
     *
     * @param file the file to inspect; {@code null} yields the default
     * @return the detected charset name; {@code "ISO_8859_1"} when the
     *         detector reports {@code US-ASCII}; {@code "GBK"} when the file
     *         is {@code null}, detection fails, or the detector reports a
     *         charset this JVM does not support
     */
    public static String getFileEncode(File file) {
        if (file == null) {
            // Same result the old code reached through a caught NullPointerException.
            return DEFAULT_CHARSET_NAME;
        }

        Charset detected = null;
        try {
            detected = DetectorHolder.DETECTOR.detectCodepage(file.toURI().toURL());
        } catch (Exception e) {
            // Deliberately broad: callers rely on getting the default back
            // rather than an exception, whatever the detector chain throws.
            LOG.log(java.util.logging.Level.WARNING,
                    "Charset detection failed for " + file + "; falling back to "
                            + DEFAULT_CHARSET_NAME, e);
        }

        if (detected == null) {
            return DEFAULT_CHARSET_NAME;
        }

        String detectedName = detected.name();
        if (US_ASCII_NAME.equals(detectedName)) {
            return ASCII_REPLACEMENT_NAME;
        }
        // NOTE(review): cpdetector appears to signal "could not determine" with
        // placeholder Charset objects rather than null - confirm against the
        // library. Such a name cannot be used to decode the file, so treat it
        // like a failed detection instead of handing it to the caller.
        if (!Charset.isSupported(detectedName)) {
            return DEFAULT_CHARSET_NAME;
        }
        return detectedName;
    }

    /**
     * Prints the detected encoding of one file.
     *
     * @param args optional; {@code args[0]} is the path of the file to
     *             inspect. Without arguments the built-in sample path is used.
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0) ? args[0] : SAMPLE_FILE_PATH;
        System.out.println(getFileEncode(new File(path)));
    }
}