需求:批量將.scel文件轉化爲可讀的txt文件(支持1對1、多對1、多對多),並從中提取中文詞(去重),支持追加內容。
成果:
使用:
package com.hxl.files;
import java.io.IOException;
/**
 * Usage walkthrough for the converters: one file to one file, a whole folder
 * merged into one file, and a whole folder converted file by file.
 */
public class Demo {
    /** Folder holding the sample .scel dictionaries. */
    private static final String SCEL_DIR = "/Users/ST_iOS/Desktop/test/ciku";

    public static void main(String[] args) {
        FileProcessing converter = new SougouScelFileProcessing();
        convertSingleFile(converter);
        mergeFolderIntoOneFile(converter);
        convertFolderFileByFile(converter);
    }

    /** One .scel file to one .txt file, appending to the target. */
    private static void convertSingleFile(FileProcessing converter) {
        String source = "/Users/ST_iOS/Desktop/test/ciku/89個節日.scel";
        String target = "/Users/ST_iOS/Desktop/test/ciku/txt/89個節日.txt";
        converter.parseFile(source, target, true);
    }

    /** Every .scel file of the folder into a single .txt file (line format: pinyin word). */
    private static void mergeFolderIntoOneFile(FileProcessing converter) {
        String target = "/Users/ST_iOS/Desktop/test/ciku/txt/彙總.txt";
        try {
            converter.parseFiles(SCEL_DIR, target, false);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Every .scel file of the folder into its own .txt file inside the configured target folder. */
    private static void convertFolderFileByFile(FileProcessing converter) {
        // Where the converted files are stored.
        converter.setTargetDir("/Users/ST_iOS/Desktop/test/ciku/多對多");
        converter.parseFile(SCEL_DIR, false);
    }
}
以上代碼用於將.scel文件轉化爲.txt文件,效果類似於圖1和圖2。如果要提取中文詞,只需將 scel 改爲 new TxtFileProcessing(),再調用同樣的方法,即可從txt詞庫文件中提取中文詞。
注意事項:
SougouScelFileProcessing 解析的源文件必須爲.scel文件類型,解析後的文件必須爲.txt文件類型,參數以絕對路徑進行傳遞
TxtFileProcessing 解析的源文件必須爲.txt,格式如圖2(拼音 中文),解析後的文件必須爲.txt文件類型,參數以絕對路徑進行傳遞
源碼:一共4個class,需要的朋友可自行拷入項目中使用,代碼解釋的也挺清晰的,就不多說了。
1、FileProcessing
package com.hxl.files;
import java.io.File;
import java.io.IOException;
/**
 * Base class for converters that parse dictionary source files and write the
 * result to {@code .txt} files.
 */
public abstract class FileProcessing {
    /** Folder that receives the generated files in folder mode; {@code null} until set. */
    protected String targetDir;

    /**
     * Parses a single file.
     *
     * @param filePath
     *            path of the source file to parse
     * @param targetFilePath
     *            path of the file to generate
     * @param isAppend
     *            {@code true} to append to existing content, {@code false} to overwrite it
     */
    public abstract void parseFile(String filePath, String targetFilePath, boolean isAppend);

    /**
     * Parses every file of a folder and merges the result into one file.
     *
     * @param fileDirPath
     *            path of the folder holding the source files
     * @param targetFilePath
     *            path of the file to generate
     * @param isAppend
     *            {@code true} to append to existing content, {@code false} to overwrite it
     * @throws IOException
     *             if reading or writing fails
     */
    public abstract void parseFiles(String fileDirPath, String targetFilePath, boolean isAppend) throws IOException;

    /**
     * Parses one file or every file of a folder. For a folder, one txt file is
     * generated per source file; they go into the folder configured with
     * {@link #setTargetDir(String)}, or next to the source files when no
     * target folder is set.
     *
     * @param filePath
     *            path of the source file or folder
     * @param isAppend
     *            {@code false} to overwrite content, {@code true} to append
     */
    public abstract void parseFile(String filePath, boolean isAppend);

    /**
     * Creates the parent folder of the given target file if it is missing.
     *
     * @param targetFilePath
     *            path of the target file; must end with {@code .txt}
     * @throws IllegalStateException
     *             if the path does not end with {@code .txt}
     */
    protected void createParentDir(String targetFilePath) {
        if (!targetFilePath.endsWith(".txt")) {
            throw new IllegalStateException("文件格式錯誤,後綴必須爲.txt,此格式爲 " + targetFilePath);
        }
        // File resolves the parent with the platform's separator rules; a bare
        // file name has no parent, so there is nothing to create.
        File parent = new File(targetFilePath).getParentFile();
        if (parent != null && !parent.exists()) {
            parent.mkdirs();
        }
    }

    /**
     * Parses one file or folder, overwriting existing output.
     *
     * @param filePath
     *            path of the source file or folder
     */
    public void parseFile(String filePath) {
        parseFile(filePath, false);
    }

    /**
     * @return the folder that receives generated txt files, or {@code null} if none was set
     */
    public String getTargetDir() {
        return targetDir;
    }

    /**
     * Sets the folder in which generated txt files are stored.
     *
     * @param targetDir
     *            folder path
     */
    public void setTargetDir(String targetDir) {
        this.targetDir = targetDir;
    }
}
2、SougouScelFileProcessing 繼承FileProcessing
package com.hxl.files;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
public class SougouScelFileProcessing extends FileProcessing {
private static final Logger log = Logger.getLogger("SougouScelFileProcessing");
protected static String encoding = "UTF-16LE";
protected ByteArrayOutputStream output = new ByteArrayOutputStream();
/**
* 解析單個或者多個文件,如果是多個文件則生成對應的txt文件,{@link #setTargetDir(String)},
* 如果targetDir不設置,則在當前文件夾下生成相應的txt文件
*
* @param filePath
* 源文件路徑
* @param isAppend
* false:覆蓋內容 true:附加內容
*/
public void parseFile(String filePath, boolean isAppend) {
File file = new File(filePath);
if (file.isDirectory()) {
File items[] = file.listFiles();
for (int i = 0; i < items.length; i++) {
if (!items[i].getName().endsWith(".scel")) {
continue;
}
if (targetDir == null) {
parseFile(items[i].getAbsolutePath(), items[i].getAbsolutePath().replace(".scel", ".txt"),
isAppend);
} else {
parseFile(items[i].getAbsolutePath(), targetDir + "/" + items[i].getName().replace(".scel", ".txt"),
isAppend);
}
}
} else {
parseFile(filePath, file.getAbsolutePath().replace(".scel", ".txt"), isAppend);
}
}
/**
* 解析單個scel文件
*
* @param filePath
* 源文件路徑
* @param targetFilePath目標文件路徑
* @param isAppend
* false:覆蓋內容 true:附加內容
*/
public void parseFile(String filePath, String targetFilePath, boolean isAppend) {
if (!targetFilePath.endsWith(".txt")) {
throw new IllegalStateException("文件格式錯誤,後綴必須爲.txt,此格式爲 " + targetFilePath);
}
if (!filePath.endsWith(".scel")) {
return;
}
File input = new File(filePath);
if (input.length() < 8) {
// 假如文件小於8字節,不去考慮它
return;
}
FileInputStream in = null;
SougouScelMdel model = null;
try {
in = new FileInputStream(input);
model = read(in);
if (model == null) {
return;
}
writeToTargetFile(model, targetFilePath, isAppend);
} catch (IOException e) {
log.log(Level.SEVERE, e.getMessage());
e.printStackTrace();
}
}
/**
* 解析多個文件夾,將解析後的內容放到一個文件裏
*
* @param fileDirPath
* 源文件夾路徑
* @param targetFilePath
* 目標文件路徑
* @param isAppend
* false:覆蓋內容 true:附加內容
* @throws FileNotFoundException
*/
public void parseFiles(String fileDirPath, String targetFilePath, boolean isAppend) throws IOException {
if (!targetFilePath.endsWith(".txt")) {
throw new IllegalStateException("文件格式錯誤,後綴必須爲.txt,此格式爲 " + targetFilePath);
}
File dir = new File(fileDirPath);
if (!dir.exists() || !dir.isDirectory()) {
throw new IllegalStateException("scel文件夾路徑錯誤 " + targetFilePath);
}
File scels[] = dir.listFiles();
ArrayList<SougouScelMdel> models = new ArrayList<>();
for (int i = 0; i < scels.length; i++) {
if (!scels[i].getName().endsWith(".scel")) {
continue;
}
FileInputStream in = null;
SougouScelMdel model = null;
in = new FileInputStream(scels[i]);
model = read(in);
if (model != null) {
models.add(model);
}
}
writeToTargetFile(models, targetFilePath, isAppend);
}
private void writeToTargetFile(SougouScelMdel model, String targetFilePath, boolean isAppend) throws IOException {
List<SougouScelMdel> models = new ArrayList<>();
models.add(model);
writeToTargetFile(models, targetFilePath, isAppend);
}
/**
* 將搜狗scel文件解析後的內容寫入txt文件
*
* @param models
* @param targetFilePath
* @param isAppend
* @throws IOException
*/
private void writeToTargetFile(List<SougouScelMdel> models, String targetFilePath, boolean isAppend)
throws IOException {
createParentDir(targetFilePath);
FileOutputStream out = new FileOutputStream(targetFilePath, isAppend);
int count = 0;
for (int k = 0; k < models.size(); k++) {
Map<String, List<String>> words = models.get(k).getWordMap(); // 詞<拼音,詞>
Set<Entry<String, List<String>>> set = words.entrySet();
Iterator<Entry<String, List<String>>> iter = set.iterator();
if (isAppend) {
out.write("\r\n".getBytes());
}
while (iter.hasNext()) {
Entry<String, List<String>> entry = iter.next();
List<String> list = entry.getValue();
int size = list.size();
for (int i = 0; i < size; i++) {
String word = list.get(i);
out.write((entry.getKey() + " ").getBytes());
out.write((word + "\n").getBytes());// 寫入txt文件
count++;
}
}
}
out.close();
log.info("生成" + targetFilePath.substring(targetFilePath.lastIndexOf("/") + 1) + "成功!,總計寫入: " + count + " 條數據!");
}
private SougouScelMdel read(InputStream in) {
SougouScelMdel model = new SougouScelMdel();
DataInputStream input = new DataInputStream(in);
int read;
try {
byte[] bytes = new byte[4];
input.readFully(bytes);
assert (bytes[0] == 0x40 && bytes[1] == 0x15 && bytes[2] == 0 && bytes[3] == 0);
input.readFully(bytes);
int flag1 = bytes[0];
assert (bytes[1] == 0x43 && bytes[2] == 0x53 && bytes[3] == 0x01);
int[] reads = new int[] { 8 };
model.setName(readString(input, 0x130, reads));
model.setType(readString(input, 0x338, reads));
model.setDescription(readString(input, 0x540, reads));
model.setSample(readString(input, 0xd40, reads));
read = reads[0];
input.skip(0x1540 - read);
read = 0x1540;
input.readFully(bytes);
read += 4;
assert (bytes[0] == (byte) 0x9D && bytes[1] == 0x01 && bytes[2] == 0 && bytes[3] == 0);
bytes = new byte[128];
Map<Integer, String> pyMap = new LinkedHashMap<Integer, String>();
while (true) {
int mark = readUnsignedShort(input);
int size = input.readUnsignedByte();
input.skip(1);
read += 4;
assert (size > 0 && (size % 2) == 0);
input.readFully(bytes, 0, size);
read += size;
String py = new String(bytes, 0, size, encoding);
// System.out.println(py);
pyMap.put(mark, py);
if ("zuo".equals(py)) {
break;
}
}
if (flag1 == 0x44) {
input.skip(0x2628 - read);
} else if (flag1 == 0x45) {
input.skip(0x26C4 - read);
} else {
throw new RuntimeException("出現意外,聯繫作者");
}
StringBuffer buffer = new StringBuffer();
Map<String, List<String>> wordMap = new LinkedHashMap<String, List<String>>();
while (true) {
int size = readUnsignedShort(input);
if (size < 0) {
break;
}
int count = readUnsignedShort(input);
int len = count / 2;
assert (len * 2 == count);
buffer.setLength(0);
for (int i = 0; i < len; i++) {
int key = readUnsignedShort(input);
buffer.append(pyMap.get(key)).append("'");
}
buffer.setLength(buffer.length() - 1);
String py = buffer.toString();
List<String> list = wordMap.get(py);
if (list == null) {
list = new ArrayList<String>();
wordMap.put(py, list);
}
for (int i = 0; i < size; i++) {
count = readUnsignedShort(input);
if (count > bytes.length) {
bytes = new byte[count];
}
input.readFully(bytes, 0, count);
String word = new String(bytes, 0, count, encoding);
// 接下來12個字節可能是詞頻或者類似信息
input.skip(12);
list.add(word);
}
}
model.setWordMap(wordMap);
return model;
} catch (IOException e) {
log.log(Level.SEVERE, e.getMessage());
e.printStackTrace();
} finally {
try {
in.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
return null;
}
protected String readString(DataInputStream input, int pos, int[] reads) throws IOException {
int read = reads[0];
input.skip(pos - read);
read = pos;
output.reset();
while (true) {
int c1 = input.read();
int c2 = input.read();
read += 2;
if (c1 == 0 && c2 == 0) {
break;
} else {
output.write(c1);
output.write(c2);
}
}
reads[0] = read;
return new String(output.toByteArray(), encoding);
}
protected final int readUnsignedShort(InputStream in) throws IOException {
int ch1 = in.read();
int ch2 = in.read();
if ((ch1 | ch2) < 0) {
return Integer.MIN_VALUE;
}
return (ch2 << 8) + (ch1 << 0);
}
}
3、SougouScelModel
package com.hxl.files;
import java.util.List;
import java.util.Map;
/**
 * Holder for the content of one parsed dictionary: four descriptive strings
 * and the word table keyed by pinyin.
 */
public class SougouScelModel {
    private String name;
    private String type;
    private String description;
    private String sample;
    /** Words grouped under the pinyin string they share. */
    private Map<String, List<String>> wordMap;

    /** @return the dictionary name, or {@code null} if not set */
    public String getName() {
        return this.name;
    }

    public void setName(String name) {
        this.name = name;
    }

    /** @return the dictionary type, or {@code null} if not set */
    public String getType() {
        return this.type;
    }

    public void setType(String type) {
        this.type = type;
    }

    /** @return the dictionary description, or {@code null} if not set */
    public String getDescription() {
        return this.description;
    }

    public void setDescription(String description) {
        this.description = description;
    }

    /** @return the sample text, or {@code null} if not set */
    public String getSample() {
        return this.sample;
    }

    public void setSample(String sample) {
        this.sample = sample;
    }

    /** @return the pinyin-to-words table, or {@code null} if not set */
    public Map<String, List<String>> getWordMap() {
        return this.wordMap;
    }

    /** Package-private: only code in the same package may assign the word table. */
    void setWordMap(Map<String, List<String>> wordMap) {
        this.wordMap = wordMap;
    }
}
4、TxtFileProcessing 繼承於FileProcessing
package com.hxl.files;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Extracts the words from txt files whose lines have the form
 * {@code "pinyin word"} and writes them, de-duplicated, one per line.
 */
public class TxtFileProcessing extends FileProcessing {
    private static final Logger log = Logger.getLogger("TxtFileProcessing");
    /** Charset used for both reading and writing txt files. */
    private String encoding = "UTF-8";

    private static final String TXT_SUFFIX = ".txt";

    /**
     * Extracts the words of one txt file into the target file.
     *
     * @param filePath
     *            path of the source txt file; other suffixes are skipped silently
     * @param targetFilePath
     *            path of the txt file to generate
     * @param isAppend
     *            {@code true} to merge with the words already in the target,
     *            {@code false} to replace the target
     * @throws IllegalStateException
     *             if {@code targetFilePath} does not end with {@code .txt}
     */
    @Override
    public void parseFile(String filePath, String targetFilePath, boolean isAppend) {
        if (!targetFilePath.endsWith(TXT_SUFFIX)) {
            throw new IllegalStateException("文件格式錯誤,後綴必須爲.txt,此格式爲 " + targetFilePath);
        }
        if (!filePath.endsWith(TXT_SUFFIX)) {
            return;
        }
        File inputFile = new File(filePath);
        if (!inputFile.exists()) {
            log.log(Level.SEVERE, filePath + " 文件不存在");
            return;
        }
        // Read the source before the target is touched, so that a target
        // pointing at the source file does not wipe the input first.
        List<String> words = readSourceFile(inputFile);
        mergeAndWrite(words, targetFilePath, isAppend);
    }

    /**
     * Extracts the words of one txt file or of every txt file in a folder,
     * generating one output file per source file. Without a target folder the
     * output is written next to the source with {@code 解析.txt} as suffix;
     * with a target folder the source file name is kept.
     *
     * @param filePath
     *            path of the source file or folder
     * @param isAppend
     *            {@code false} to overwrite content, {@code true} to append
     */
    @Override
    public void parseFile(String filePath, boolean isAppend) {
        File file = new File(filePath);
        if (!file.isDirectory()) {
            parseFile(filePath, toParsedPath(file.getAbsolutePath()), isAppend);
            return;
        }
        File[] items = file.listFiles();
        if (items == null) {
            // listFiles() yields null when the folder cannot be read.
            log.log(Level.SEVERE, "無法讀取文件夾 " + filePath);
            return;
        }
        for (File item : items) {
            if (!item.getName().endsWith(TXT_SUFFIX)) {
                continue;
            }
            String target = (targetDir == null)
                    ? toParsedPath(item.getAbsolutePath())
                    : targetDir + "/" + item.getName();
            parseFile(item.getAbsolutePath(), target, isAppend);
        }
    }

    /**
     * Extracts the words of every txt file in a folder into one target file.
     * A target file located inside the source folder is not treated as a source.
     *
     * @param fileDirPath
     *            path of the source folder
     * @param targetFilePath
     *            path of the txt file to generate
     * @param isAppend
     *            {@code true} to merge with the words already in the target,
     *            {@code false} to replace the target
     * @throws IOException
     *             if the folder cannot be listed
     * @throws IllegalStateException
     *             if the target does not end with {@code .txt} or the source
     *             path is not a folder
     */
    @Override
    public void parseFiles(String fileDirPath, String targetFilePath, boolean isAppend) throws IOException {
        if (!targetFilePath.endsWith(TXT_SUFFIX)) {
            throw new IllegalStateException("文件格式錯誤,後綴必須爲.txt,此格式爲 " + targetFilePath);
        }
        File fileDir = new File(fileDirPath);
        if (!fileDir.isDirectory()) {
            // Report the folder that is actually wrong, not the target file.
            throw new IllegalStateException("文件夾路徑錯誤 " + fileDirPath);
        }
        File[] files = fileDir.listFiles();
        if (files == null) {
            throw new IOException("無法讀取文件夾 " + fileDirPath);
        }
        File outputFile = new File(targetFilePath).getAbsoluteFile();
        List<String> words = new ArrayList<>();
        for (File file : files) {
            if (!file.getName().endsWith(TXT_SUFFIX) || file.getAbsoluteFile().equals(outputFile)) {
                continue;
            }
            words.addAll(readSourceFile(file));
        }
        mergeAndWrite(words, targetFilePath, isAppend);
    }

    /** Replaces a trailing {@code .txt} with {@code 解析.txt}; other paths are returned unchanged. */
    private static String toParsedPath(String path) {
        if (!path.endsWith(TXT_SUFFIX)) {
            return path;
        }
        return path.substring(0, path.length() - TXT_SUFFIX.length()) + "解析.txt";
    }

    /**
     * De-duplicates the extracted words, merged with the target's existing
     * lines in append mode, and writes them to the target file.
     */
    private void mergeAndWrite(List<String> words, String targetFilePath, boolean isAppend) {
        createParentDir(targetFilePath);
        File outputFile = new File(targetFilePath);
        HashSet<String> set = new HashSet<>();
        if (isAppend) {
            // The target holds one word per line, so it must be read as a
            // target file, not as a "pinyin word" source file.
            set.addAll(readTargetFile(outputFile));
        } else if (outputFile.exists()) {
            log.info(outputFile.getAbsolutePath() + " 文件存在,刪除...");
            outputFile.delete();
        }
        set.addAll(words);
        writeToTargetFile(set, outputFile);
    }

    /**
     * Writes the words to the target file, one per line.
     *
     * @param set
     *            de-duplicated words
     * @param outputFile
     *            target file
     */
    private void writeToTargetFile(HashSet<String> set, File outputFile) {
        StringBuilder buff = new StringBuilder();
        for (String word : set) {
            buff.append(word).append("\r\n");
        }
        try (FileOutputStream out = new FileOutputStream(outputFile)) {
            // Write with the same charset the readers use.
            out.write(buff.toString().getBytes(encoding));
        } catch (IOException e) {
            log.log(Level.SEVERE, e.getMessage(), e);
            return;
        }
        log.info("生成" + outputFile.getName() + "成功!,總計寫入: " + set.size() + " 條數據!");
    }

    /**
     * Reads a source file and collects its words. Each line is split on single
     * spaces; tokens at odd positions are taken as words, tokens at even
     * positions (the pinyin) are ignored.
     *
     * @param file
     *            source file
     * @return the words found; empty if the file is missing or unreadable
     */
    private List<String> readSourceFile(File file) {
        List<String> content = new ArrayList<>();
        if (!file.isFile()) {
            log.log(Level.SEVERE, "找不到源文件 " + file.getAbsolutePath());
            return content;
        }
        try (FileInputStream in = new FileInputStream(file);
                BufferedReader reader = new BufferedReader(new InputStreamReader(in, encoding))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] tokens = line.split(" ");
                for (int i = 1; i < tokens.length; i += 2) {
                    content.add(tokens[i]);
                }
            }
        } catch (IOException e) {
            log.log(Level.SEVERE, e.getMessage(), e);
        }
        return content;
    }

    /**
     * Reads an already generated word file.
     *
     * @param file
     *            word file
     * @return its non-blank lines; empty if the file is missing or unreadable
     */
    private List<String> readTargetFile(File file) {
        List<String> content = new ArrayList<>();
        if (!file.isFile()) {
            log.warning("找不到目標文件 " + file.getAbsolutePath());
            return content;
        }
        try (FileInputStream in = new FileInputStream(file);
                BufferedReader reader = new BufferedReader(new InputStreamReader(in, encoding))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (!line.trim().isEmpty()) {
                    content.add(line);
                }
            }
        } catch (IOException e) {
            log.log(Level.SEVERE, e.getMessage(), e);
        }
        return content;
    }
}