1、本文使用 pinyin4j.jar 包生成漢字的拼音。遇到多音字時,此 jar 包會返回多個拼音,那麼怎樣得到最準確的那一個呢?例如「長大」應轉為 zhangda,而不是 changda。
需要讀取 pinyin.txt 文件(多音字詞組表)進行過濾。
代碼如下:
package pingyin;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
public class TMP {
// Phrase table filled by initPinyin: key is the text before '#' on a table line (a pinyin
// reading), value is the list of space-separated characters/phrases that use that reading.
private static Map<String, List<String>> pinyinMap = new HashMap<String, List<String>>();
// NOTE(review): never read or written anywhere in the visible code — confirm before removing.
private static long count = 0;
/**
 * Demo entry point: loads the polyphone table, converts one sample string and prints
 * the input followed by " = " and its pinyin.
 *
 * @param args unused
 */
public static void main(String[] args) {
    // Sample input; swap in e.g. "中國人民銀行" to try a real phrase.
    final String input = "";

    initPinyin("./pinyin.txt");

    final String converted = convertChineseToPinyin(input);
    System.out.println(input + " = " + converted);
}
/**
 * Upper-cases the first character of a string and leaves the rest untouched.
 *
 * <p>The conversion uses {@link Locale#ROOT} so the result does not depend on the JVM's
 * default locale (under a Turkish default locale, for example, "i" would otherwise
 * become the dotted capital "İ").
 *
 * @param str the text to capitalise; may be {@code null}
 * @return {@code null} if {@code str} is {@code null}, an empty string if it is empty,
 *         otherwise {@code str} with its first character upper-cased
 */
public static String convertInitialToUpperCase(String str) {
    if (str == null || str.isEmpty()) {
        return str;
    }
    return str.substring(0, 1).toUpperCase(Locale.ROOT) + str.substring(1);
}
/**
 * Context windows tried, in order, when choosing the reading of a polyphonic character.
 * Each row is {startOffset, endOffset} relative to the character's index; the substring
 * [index + startOffset, index + endOffset) is looked up in the phrase table.
 */
private static final int[][] CONTEXT_WINDOWS = {
    {0, 3},   // the character plus the next two, e.g. 大西洋
    {0, 2},   // the character plus the next one, e.g. 大西
    {-2, 1},  // the previous two plus the character, e.g. 龍固大
    {-1, 1},  // the previous one plus the character, e.g. 固大
    {-1, 2},  // one on each side of the character, e.g. 固大西
};

/** Debug prefix printed when the window at the same index of CONTEXT_WINDOWS matches. */
private static final String[] CONTEXT_LABELS = {
    "last 2 > ", "last 1 > ", "before 2 < ", "before 1 < ", "before last 1 <> "
};

/**
 * Converts a string to pinyin, capitalising the first letter of every syllable.
 *
 * <p>ASCII characters are copied through unchanged. Each other character is looked up with
 * pinyin4j (lower case, no tone marks):
 * <ul>
 *   <li>no reading available (not a Chinese character): the character is copied through
 *       unchanged instead of aborting the whole conversion;</li>
 *   <li>one reading, or the first two readings identical: the first reading is used;</li>
 *   <li>otherwise the character is treated as polyphonic and the reading is chosen by
 *       looking up the surrounding text in the phrase table loaded by initPinyin.</li>
 * </ul>
 *
 * @param chinese the text to convert; may be {@code null}
 * @return the converted text; an empty string for {@code null} or empty input
 */
private static String convertChineseToPinyin(String chinese) {
    if (chinese == null) {
        return "";
    }

    HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
    format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
    format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);

    StringBuilder pinyin = new StringBuilder();
    for (int i = 0; i < chinese.length(); i++) {
        char ch = chinese.charAt(i);
        if (ch < 128) { // ASCII (0..127) is passed through as-is
            pinyin.append(ch);
            continue;
        }

        String[] readings;
        try {
            readings = PinyinHelper.toHanyuPinyinStringArray(ch, format);
        } catch (BadHanyuPinyinOutputFormatCombination e) {
            // Not expected with the fixed format configured above; the character is
            // skipped, as before.
            e.printStackTrace();
            continue;
        }

        if (readings == null || readings.length == 0) {
            // Not a Chinese character (e.g. full-width punctuation): keep it rather than
            // throwing away everything converted so far.
            pinyin.append(ch);
            continue;
        }

        String chosen;
        if (readings.length == 1 || readings[0].equals(readings[1])) {
            // Single reading, or duplicated first reading: take the first one.
            chosen = normalizeUmlaut(readings[0]);
        } else {
            System.out.println("多音字:" + ch);
            chosen = resolvePolyphone(chinese, i, readings);
        }
        pinyin.append(convertInitialToUpperCase(chosen));
    }
    return pinyin.toString();
}

/**
 * Replaces "u:" with "v" in a reading (pinyin4j's default notation writes ü as "u:").
 * Readings without "u:" are returned unchanged.
 */
private static String normalizeUmlaut(String py) {
    if (!py.contains("u:")) {
        return py;
    }
    String replaced = py.replace("u:", "v");
    System.out.println("filter u:" + replaced);
    return replaced;
}

/**
 * Chooses the reading of the polyphonic character at {@code index}.
 *
 * <p>Readings are tried in the order given; for each reading the windows in
 * CONTEXT_WINDOWS are tried in order and the first phrase-table hit wins. If no window
 * matches for any reading, the first reading whose table entry lists the single character
 * is used; if there is none either, the first reading is returned so the character is
 * never dropped from the output.
 */
private static String resolvePolyphone(String chinese, int index, String[] readings) {
    String[] candidates = new String[readings.length];
    for (int k = 0; k < readings.length; k++) {
        candidates[k] = normalizeUmlaut(readings[k]);
    }

    int length = chinese.length();
    for (String py : candidates) {
        List<String> phrases = pinyinMap.get(py);
        if (phrases == null) {
            continue;
        }
        for (int w = 0; w < CONTEXT_WINDOWS.length; w++) {
            int start = index + CONTEXT_WINDOWS[w][0];
            int end = index + CONTEXT_WINDOWS[w][1];
            if (start < 0 || end > length) {
                continue; // window would run past either end of the text
            }
            if (phrases.contains(chinese.substring(start, end))) {
                System.out.println(CONTEXT_LABELS[w] + py);
                return py;
            }
        }
    }

    // No phrase matched: use the reading registered for the bare character.
    String single = String.valueOf(chinese.charAt(index));
    for (String py : candidates) {
        List<String> phrases = pinyinMap.get(py);
        if (phrases != null && phrases.contains(single)) {
            System.out.println("default = " + py);
            return py;
        }
    }

    // Character is not in the table at all: fall back to the first reading.
    return candidates[0];
}
/**
 * Loads the polyphone phrase table into pinyinMap.
 *
 * <p>Each useful line has the form {@code pinyin#phrase1 phrase2 ...}: the text before the
 * first '#' becomes the map key and the whitespace-separated tokens after it become the
 * value list. Lines without a '#', or with an empty key or phrase list, are skipped. A
 * later line with the same key replaces the earlier entry.
 *
 * <p>The table is read as UTF-8 (assumes pinyin.txt is stored as UTF-8 — confirm if the
 * file was produced with a different encoding).
 *
 * @param fileName classpath resource name, resolved relative to this class; if it is
 *                 {@code null}, empty, or cannot be found, the bundled "pinyin.txt" is
 *                 used instead
 * @throws IllegalStateException if neither resource can be found
 */
public static void initPinyin(String fileName) {
    InputStream file = null;
    if (fileName != null && fileName.length() > 0) {
        file = TMP.class.getResourceAsStream(fileName);
    }
    if (file == null) {
        // Keeps the previous behaviour for callers whose argument does not resolve.
        file = TMP.class.getResourceAsStream("pinyin.txt");
    }
    if (file == null) {
        throw new IllegalStateException(
                "Polyphone table not found on classpath: " + fileName + " (also tried pinyin.txt)");
    }

    BufferedReader br = null;
    try {
        br = new BufferedReader(new InputStreamReader(file, "UTF-8"));
        String line;
        while ((line = br.readLine()) != null) {
            String[] parts = line.split("#");
            if (parts.length < 2) {
                continue; // blank or malformed line
            }
            String pinyin = parts[0].trim();
            String chinese = parts[1].trim();
            if (pinyin.length() == 0 || chinese.length() == 0) {
                continue;
            }
            pinyinMap.put(pinyin, Arrays.asList(chinese.split("\\s+")));
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        try {
            if (br != null) {
                br.close();
            } else {
                file.close(); // reader was never created, close the raw stream
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
}