// Apache OpenNLP jar required by this demo — download: https://opennlp.apache.org/cgi-bin/download.cgi
// All other classes used here come from the Java standard library.
package com.npl.demo.utils;
import java.io.IOException;
import java.io.StreamTokenizer;
import java.io.StringReader;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Scanner;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.WhitespaceTokenizer;
/**
* Filename: NlpTokenization.java
* Description:
* Copyright: Copyright (c) 2019 All Rights Reserved.
* @author: wangk
* @version: 1.0
* Create at: 2019年5月5日 下午4:28:56
*
* Modification History:
* Date Author Version Description
* ------------------------------------------------------------------
* 2019年5月5日 wangk 1.0 1.0 Version
*
*/
public class NlpTokenization {

    /** Sample English paragraph used by the tokenization demos. */
    static String paragraph = "Let's The first sentence. The second sentence. Let's ";

    /** Sample English sentences (the typos are intentional demo data — do not "fix"). */
    static String[] sentences = {
        "Tim was agood neighbor. Perhaps not as good Bob "+
        "Haywood, but stille pretty good. Of course Mr. Adam "+
        "took the cake!"
    };

    // Chinese text can be word-split by inserting a space after every character
    // with a regex before handing it to a whitespace-based tokenizer, e.g.:
    //   String regex = "(.{1})";
    //   text = text.replaceAll(regex, "$1 ");
    static String chineseLanguage = "時代的碰撞|中國古典民樂與流行的相遇";

    public static void main(String[] args) {
        NlpTokenization to = new NlpTokenization();
        // Uncomment any of the following lines to try the other tokenizers:
        //to.scanner(paragraph);
        //to.split(chineseLanguage);
        //to.breakIterator(paragraph);
        //to.streamTokenizer(paragraph);
        //to.stringTokenizer(chineseLanguage);
        //to.textSplit(); // tokenizer performance benchmark
        to.openNlpSimpleTokenizer(chineseLanguage);
    }

    /**
     * Tokenizes text with {@link Scanner}, using space, comma and period as delimiters.
     * Prints each token and returns them.
     *
     * @param text the text to tokenize
     * @return the list of tokens, never {@code null}
     */
    public List<String> scanner(String text) {
        List<String> list = new ArrayList<>();
        // try-with-resources closes the Scanner (the original version leaked it)
        try (Scanner scanner = new Scanner(text)) {
            // Regex-based delimiter: a single space, comma or period.
            scanner.useDelimiter("[ ,.]");
            //scanner.reset(); // would restore the default whitespace delimiter
            while (scanner.hasNext()) {
                list.add(scanner.next());
            }
        }
        for (String token : list) {
            System.out.println(token);
        }
        return list;
    }

    /**
     * Tokenizes text with {@link String#split(String)} on runs of whitespace.
     *
     * @param text the text to tokenize
     * @return the list of tokens, never {@code null}
     */
    public List<String> split(String text) {
        String[] tokens = text.split("\\s+");
        for (String token : tokens) {
            System.out.println(token);
        }
        return Arrays.asList(tokens);
    }

    /**
     * Tokenizes text with {@link BreakIterator}, which exposes word boundaries.
     * Prints each boundary pair and the token between them.
     *
     * @param text the text to tokenize
     * @return the list of tokens (boundary-to-boundary substrings), never {@code null}
     */
    public List<String> breakIterator(String text) {
        List<String> tokens = new ArrayList<>();
        BreakIterator wordIterator = BreakIterator.getWordInstance();
        wordIterator.setText(text);
        int boundary = wordIterator.first();
        while (boundary != BreakIterator.DONE) { // DONE marks the final boundary
            int begin = boundary;
            System.out.print(boundary + "-");
            boundary = wordIterator.next();
            int end = boundary;
            if (end == BreakIterator.DONE) break;
            String token = text.substring(begin, end);
            System.out.println(boundary + "[" + token + "]");
            tokens.add(token);
        }
        return tokens;
    }

    /**
     * Tokenizes text with {@link StreamTokenizer} (usually built over a file reader;
     * here over a {@link StringReader}).
     *
     * @param text the text to tokenize
     * @return the list of tokens (words, numbers and single ordinary characters), never {@code null}
     */
    public List<String> streamTokenizer(String text) {
        List<String> tokens = new ArrayList<>();
        StreamTokenizer tokenizer = new StreamTokenizer(new StringReader(text));
        // By default the single- and double-quote characters start quoted strings;
        // an unmatched quote would swallow the rest of the input, so declare them
        // (and the comma) ordinary characters instead.
        tokenizer.ordinaryChar('\'');
        tokenizer.ordinaryChar(',');
        boolean isEOF = false; // loop-termination flag
        while (!isEOF) {
            try {
                int token = tokenizer.nextToken(); // returns the token type
                switch (token) {
                    case StreamTokenizer.TT_EOF: // end of the stream
                        isEOF = true;
                        break;
                    case StreamTokenizer.TT_EOL: // end of a line
                        break;
                    case StreamTokenizer.TT_NUMBER: // numeric token; value in nval
                        System.out.println(tokenizer.nval);
                        tokens.add(String.valueOf(tokenizer.nval));
                        break;
                    case StreamTokenizer.TT_WORD: // word token; value in sval
                        System.out.println(tokenizer.sval);
                        tokens.add(tokenizer.sval);
                        break;
                    default: // a single ordinary character
                        System.out.println((char) token);
                        tokens.add(String.valueOf((char) token));
                }
            } catch (IOException e) {
                // Abort on I/O error — the original looped forever here because
                // isEOF was never set in the catch branch.
                e.printStackTrace();
                isEOF = true;
            }
        }
        return tokens;
    }

    /**
     * Tokenizes text with {@link StringTokenizer}, which accepts strings from any source.
     *
     * @param text the text to tokenize
     * @return the list of tokens, never {@code null}
     */
    public List<String> stringTokenizer(String text) {
        List<String> tokens = new ArrayList<>();
        StringTokenizer st = new StringTokenizer(text);
        while (st.hasMoreTokens()) {
            String token = st.nextToken();
            System.out.println(token);
            tokens.add(token);
        }
        return tokens;
    }

    /**
     * Micro-benchmark comparing three ways of splitting a space-separated sample:
     * {@link StringTokenizer}, a precompiled {@link Pattern#split}, and a manual
     * {@code indexOf} loop. Prints the average time per run for each approach.
     */
    public void textSplit() {
        StringBuilder sb = new StringBuilder();
        for (int i = 100000; i < 100000 + 60; i++)
            sb.append(i).append(' ');
        String sample = sb.toString();
        int runs = 100000;
        for (int i = 0; i < 5; i++) {
            {
                long start = System.nanoTime();
                for (int r = 0; r < runs; r++) {
                    StringTokenizer st = new StringTokenizer(sample);
                    List<String> list = new ArrayList<String>();
                    while (st.hasMoreTokens())
                        list.add(st.nextToken());
                }
                long time = System.nanoTime() - start;
                System.out.printf("StringTokenizer took an average of %.1f us%n", time / runs / 1000.0);
            }
            {
                long start = System.nanoTime();
                // Compile once outside the timing loop — String.split would recompile per call.
                Pattern spacePattern = Pattern.compile(" ");
                for (int r = 0; r < runs; r++) {
                    List<String> list = Arrays.asList(spacePattern.split(sample, 0));
                }
                long time = System.nanoTime() - start;
                System.out.printf("Pattern.split took an average of %.1f us%n", time / runs / 1000.0);
            }
            {
                long start = System.nanoTime();
                for (int r = 0; r < runs; r++) {
                    List<String> list = new ArrayList<String>();
                    int pos = 0, end;
                    while ((end = sample.indexOf(' ', pos)) >= 0) {
                        list.add(sample.substring(pos, end));
                        pos = end + 1;
                    }
                }
                long time = System.nanoTime() - start;
                System.out.printf("indexOf loop took an average of %.1f us%n", time / runs / 1000.0);
            }
        }
    }

    /**
     * Tokenizes text with OpenNLP's {@link SimpleTokenizer}; punctuation becomes
     * separate tokens. A space is first inserted after every character so that
     * Chinese text splits into individual characters.
     *
     * @param text the text to tokenize
     * @return the list of tokens, never {@code null}
     */
    public List<String> openNlpSimpleTokenizer(String text) {
        SimpleTokenizer simpleTokenizer = SimpleTokenizer.INSTANCE;
        // Insert a space after every single character (per-character split for Chinese).
        String regex = "(.{1})";
        text = text.replaceAll (regex, "$1 ");
        String[] tokens = simpleTokenizer.tokenize(text);
        for (String token : tokens) {
            System.out.println(token);
        }
        return Arrays.asList(tokens);
    }

    /**
     * Tokenizes text with OpenNLP's {@link WhitespaceTokenizer}, which splits on
     * whitespace only.
     *
     * @param text the text to tokenize
     * @return the list of tokens, never {@code null}
     */
    public List<String> openNlpWhitespaceTokenizer(String text) {
        WhitespaceTokenizer whitespaceTokenizer = WhitespaceTokenizer.INSTANCE;
        String[] tokens = whitespaceTokenizer.tokenize(text);
        for (String token : tokens) {
            System.out.println(token);
        }
        return Arrays.asList(tokens);
    }
}