準備工作:
- 最新NLPIR分詞系統下載包:http://ictclas.nlpir.org/downloads
- 授權文件更新:https://github.com/NLPIR-team/NLPIR/tree/431b6351f30ed7d606ac50fde1f4456c596998df/License 或者 https://github.com/NLPIR-team/NLPIR/tree/master/License
在eclipse中導入中科院分詞系統下載包中sample->JnaTest_NLPIR項目,優化NlpirTest.java文件代碼。
其中,官方原始demo代碼需要更改的地方:
CLibrary Instance = (CLibrary) Native.loadLibrary(
"D:\\tools\\NLPIR\\lib\\win64\\NLPIR", CLibrary.class);
中路徑改爲自己操作系統的.dll和.lib的上一級目錄,並加上NLPIR,不要後綴名
public static void main(String[] args) throws Exception {
String argu = "D:\\tools\\NLPIR";
路徑改爲Data文件夾的上一級目錄
☆注:遇到初始化問題,如果是文件過期問題,在上面鏈接中下載對應的最新授權文件,並替換掉項目裏Data文件下對應的授權文件;如果是配置問題,可能是導入的項目路徑中有中文。
本代碼可實現功能:
- 普通批量分詞
- 添加用戶詞典txt後批量分詞
- 批量提取關鍵字
package code;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import com.sun.jna.Library;
import com.sun.jna.Native;
/**
 * Batch driver for the native NLPIR word segmenter, called through JNA.
 *
 * <p>For every regular file in the source directory the program writes two files that carry
 * the same name as the input: one with the segmentation result and one with the extracted
 * keywords. Lines read from a user dictionary file are registered with the segmenter via
 * {@code NLPIR_AddUserWord}.
 */
public class Nlpir {

    /**
     * When {@code true}, the text is segmented a second time after the user words have been
     * registered and that second result is appended to the result file as well.
     */
    private static final boolean WRITE_USER_DICT_RESULT = false;

    /** JNA mapping of the native NLPIR functions used by this program. */
    public interface CLibrary extends Library {
        // Loaded once, when the interface is first used. The argument is the directory that
        // holds the native library followed by the library name "NLPIR", without an extension.
        CLibrary Instance = (CLibrary) Native.loadLibrary(
                "D:\\tools\\NLPIR\\lib\\win64\\NLPIR", CLibrary.class);

        /**
         * Initialises the segmenter. {@code main} passes the parent directory of the Data
         * folder, an encoding code and a licence code, and treats a return value of 0 as
         * failure.
         */
        public int NLPIR_Init(String sDataPath, int encoding,
                String sLicenceCode);

        /**
         * Segments the content of a string into words.
         * NOTE(review): bPOSTagged presumably switches part-of-speech tags on when non-zero;
         * confirm against the NLPIR documentation.
         */
        public String NLPIR_ParagraphProcess(String sSrc, int bPOSTagged);

        /**
         * Extracts keywords from a string.
         * NOTE(review): nMaxKeyLimit / bWeightOut presumably cap the number of keywords and
         * toggle weight output; confirm against the NLPIR documentation.
         */
        public String NLPIR_GetKeyWords(String sLine, int nMaxKeyLimit,
                boolean bWeightOut);

        /** Extracts keywords from a txt file; not used by this program. */
        public String NLPIR_GetFileKeyWords(String sLine, int nMaxKeyLimit,
                boolean bWeightOut);

        /** Registers one user-dictionary entry with the segmenter. */
        public int NLPIR_AddUserWord(String sWord);

        /** Returns the last error message; printed by {@code main} when initialisation fails. */
        public String NLPIR_GetLastErrorMsg();

        /** Called once by {@code main} when all work is finished. */
        public void NLPIR_Exit();
    }

    /**
     * Encodes {@code aidString} to bytes with {@code ori_encoding} and decodes those bytes
     * again with {@code new_encoding}.
     *
     * @param aidString    text to convert; must not be null
     * @param ori_encoding charset name used to encode the text to bytes
     * @param new_encoding charset name used to decode the bytes
     * @return the re-decoded text, or {@code null} when either charset name is unsupported
     *         (the stack trace is printed in that case)
     */
    public static String transString(String aidString, String ori_encoding,
            String new_encoding) {
        try {
            byte[] raw = aidString.getBytes(ori_encoding);
            return new String(raw, new_encoding);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Initialises NLPIR, processes every regular file of the source directory and shuts the
     * library down again.
     *
     * @param args unused
     * @throws Exception if the user dictionary cannot be read
     */
    public static void main(String[] args) throws Exception {
        String argu = "D:\\tools\\NLPIR"; // parent directory of the Data folder
        String userFilePath = "D:\\tools\\userdic.txt";
        String path = "D:\\tools\\source";
        String keywordDir = "D:\\tools\\keyword\\";
        String resultDir = "D:\\tools\\result\\";
        // NOTE(review): 1 was paired with "UTF-8" in the original demo; confirm the meaning of
        // the encoding codes against the NLPIR documentation.
        int charset_type = 1;

        int init_flag = CLibrary.Instance.NLPIR_Init(argu, charset_type, "0");
        if (0 == init_flag) {
            String reason = CLibrary.Instance.NLPIR_GetLastErrorMsg();
            System.err.println("初始化失敗!fail reason is "+reason);
            return;
        }

        // From here on the library is initialised, so NLPIR_Exit must run on every path.
        try {
            List<String> userWords = readUserWords(new File(userFilePath), "gbk");

            File[] files = new File(path).listFiles();
            if (files == null) {
                // listFiles() returns null when the path is not a readable directory.
                System.err.println("Cannot list input directory: " + path);
                return;
            }
            Arrays.sort(files);

            // PrintWriter(File) fails when the parent directory is missing.
            new File(keywordDir).mkdirs();
            new File(resultDir).mkdirs();

            for (File source : files) {
                if (!source.isFile()) {
                    continue;
                }
                try {
                    processFile(source, userWords, keywordDir, resultDir);
                } catch (Exception ex) {
                    // One broken file must not abort the rest of the batch.
                    ex.printStackTrace();
                }
            }
        } finally {
            CLibrary.Instance.NLPIR_Exit();
        }
    }

    /** Segments one file, extracts its keywords and writes both outputs under the input's name. */
    private static void processFile(File source, List<String> userWords,
            String keywordDir, String resultDir) throws java.io.IOException {
        String sInput = readWithoutLineBreaks(source, "gbk");
        String outName = source.getName();

        StringBuilder out = new StringBuilder();
        String segmented = CLibrary.Instance.NLPIR_ParagraphProcess(sInput, 1);
        out.append("未添加用戶詞典的分詞結果是: ").append(segmented).append("\r\n");

        // Each line of the dictionary file is passed to NLPIR as one entry.
        // NOTE(review): the original comment said the words in the dictionary txt are
        // separated by spaces; verify the expected entry format against the NLPIR docs.
        // NOTE(review): the words are registered while the first file is processed, so the
        // "without user dictionary" label presumably only holds for that first file - confirm
        // whether NLPIR keeps added words for the whole session.
        for (String word : userWords) {
            CLibrary.Instance.NLPIR_AddUserWord(word);
        }

        if (WRITE_USER_DICT_RESULT) {
            // Segment again so that the output really reflects the added user words.
            String withUserWords = CLibrary.Instance.NLPIR_ParagraphProcess(sInput, 1);
            out.append("增加用戶詞典後分詞結果是: ").append(withUserWords).append("\r\n");
        }

        String keywords = CLibrary.Instance.NLPIR_GetKeyWords(sInput, 10, false);
        writeText(new File(keywordDir + outName), "關鍵詞爲:" + keywords);
        writeText(new File(resultDir + outName), out.toString());
    }

    /** Reads the non-blank lines of the user dictionary file. */
    private static List<String> readUserWords(File dictionary, String encoding)
            throws java.io.IOException {
        List<String> words = new ArrayList<String>();
        FileInputStream in = new FileInputStream(dictionary);
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, encoding));
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.trim().length() > 0) {
                    words.add(line);
                }
            }
        } finally {
            in.close();
        }
        return words;
    }

    /** Reads a whole file and joins its lines without any separator. */
    private static String readWithoutLineBreaks(File source, String encoding)
            throws java.io.IOException {
        StringBuilder text = new StringBuilder();
        FileInputStream in = new FileInputStream(source);
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, encoding));
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line);
            }
        } finally {
            in.close();
        }
        return text.toString();
    }

    /** Writes text to a file with the platform default charset and reports write failures. */
    private static void writeText(File target, String text) throws java.io.IOException {
        PrintWriter writer = new PrintWriter(target);
        try {
            writer.write(text);
        } finally {
            writer.close();
        }
        // PrintWriter never throws on write; its error flag is the only failure signal.
        if (writer.checkError()) {
            throw new java.io.IOException("Failed to write " + target);
        }
    }
}
還有需要完善的地方,希望多多交流!!!