使用java將搜狗詞庫.scel文件轉化爲.txt文件

需求:批量將.scel文件轉化爲可視的.txt文件(支持1對1、多對1、多對多),並從中提取中文詞(去重),支持追加內容。

成果:

   




使用:

package com.hxl.files;

import java.io.IOException;

/**
 * Usage example: converts Sougou .scel dictionaries to .txt in the three
 * supported modes (one-to-one, many-to-one, many-to-many).
 */
public class Demo {

	public static void main(String[] args) {
		final FileProcessing converter = new SougouScelFileProcessing();

		// One-to-one: a single .scel file into a single txt file (appending to the target)
		final String singleSource = "/Users/ST_iOS/Desktop/test/ciku/89個節日.scel";
		final String singleTarget = "/Users/ST_iOS/Desktop/test/ciku/txt/89個節日.txt";
		converter.parseFile(singleSource, singleTarget, true);

		// Many-to-one: every .scel file of a folder merged into one txt file
		// (line format: pinyin word)
		final String sourceDir = "/Users/ST_iOS/Desktop/test/ciku";
		final String mergedTarget = "/Users/ST_iOS/Desktop/test/ciku/txt/彙總.txt";
		try {
			converter.parseFiles(sourceDir, mergedTarget, false);
		} catch (IOException ex) {
			ex.printStackTrace();
		}

		// Many-to-many: one txt file per .scel file, stored in the configured target folder
		converter.setTargetDir("/Users/ST_iOS/Desktop/test/ciku/多對多");
		converter.parseFile(sourceDir, false);
	}
}

以上代碼用於將.scel文件轉化爲.txt文件,效果類似於圖1和圖2。如果要從txt詞庫文件中提取中文詞,只需將 scel 改爲 new TxtFileProcessing(),再調用同樣的方法即可。

注意事項:

SougouScelFileProcessing 解析的源文件必須爲.scel文件類型,解析後的文件必須爲.txt文件類型,參數以絕對路徑進行傳遞

TxtFileProcessing   解析的源文件必須爲.txt,格式如圖2(拼音 中文),解析後的文件必須爲.txt文件類型,參數以絕對路徑進行傳遞


源碼:一共4個class,需要的朋友可自行拷入項目中使用,代碼解釋的也挺清晰的,就不多說了。

1、FileProcessing


package com.hxl.files;

import java.io.File;
import java.io.IOException;

/**
 * Base class of the dictionary converters. It defines the three parsing entry
 * points (one-to-one, many-to-one, many-to-many) and holds the optional output
 * directory used by the many-to-many mode.
 */
public abstract class FileProcessing {
	// Directory that receives the generated files; null means "not configured"
	protected String targetDir;

	/**
	 * Parses a single file.
	 * 
	 * @param filePath
	 *            path of the source file to parse
	 * @param targetFilePath
	 *            path of the file to generate
	 * @param isAppend
	 *            true to append to the target file, false to overwrite it
	 */
	public abstract void parseFile(String filePath, String targetFilePath, boolean isAppend);

	/**
	 * Parses several files and merges the result into one file.
	 * 
	 * @param fileDirPath
	 *            path of the folder holding the source files
	 * @param targetFilePath
	 *            path of the file to generate
	 * @param isAppend
	 *            true to append to the target file, false to overwrite it
	 * @throws IOException
	 *             if reading a source or writing the target fails
	 */
	public abstract void parseFiles(String fileDirPath, String targetFilePath, boolean isAppend) throws IOException;

	/**
	 * Parses one file or every file of a folder. For a folder, one txt file is
	 * generated per source file; see {@link #setTargetDir(String)} for where
	 * they are stored. When no target directory is set the txt files are
	 * generated next to their sources.
	 * 
	 * @param filePath
	 *            path of the source file or folder
	 * @param isAppend
	 *            false: overwrite the content, true: append to it
	 */
	public abstract void parseFile(String filePath, boolean isAppend);

	/**
	 * Creates the parent directory (and any missing ancestors) of the target
	 * file so that the file can be opened for writing afterwards.
	 * 
	 * @param targetFilePath
	 *            path of the target file; must end with ".txt"
	 * @throws IllegalStateException
	 *             if the path does not end with ".txt"
	 */
	protected void createParentDir(String targetFilePath) {
		if (!targetFilePath.endsWith(".txt")) {
			throw new IllegalStateException("文件格式錯誤,後綴必須爲.txt,此格式爲   " + targetFilePath);
		}
		// Let File resolve the parent so that the platform separator is honoured
		// instead of assuming "/".
		File parent = new File(targetFilePath).getParentFile();
		// A bare file name has no parent component: nothing to create.
		if (parent != null && !parent.exists()) {
			parent.mkdirs();
		}
	}

	/**
	 * Parses one file or folder, overwriting any existing output.
	 * 
	 * @param filePath
	 *            path of the source file or folder
	 */
	public void parseFile(String filePath) {
		parseFile(filePath, false);
	}

	/**
	 * @return the directory that receives the generated txt files, or null
	 *         when none has been set
	 */
	public String getTargetDir() {
		return targetDir;
	}

	/**
	 * Sets where the generated txt files are stored.
	 * 
	 * @param targetDir
	 *            path of the output folder
	 */
	public void setTargetDir(String targetDir) {
		this.targetDir = targetDir;
	}

}

2、SougouScelFileProcessing 繼承FileProcessing

package com.hxl.files;

import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;


public class SougouScelFileProcessing extends FileProcessing {
	private static final Logger log = Logger.getLogger("SougouScelFileProcessing");
	protected static String encoding = "UTF-16LE";
	protected ByteArrayOutputStream output = new ByteArrayOutputStream();

	/**
	 * 解析單個或者多個文件,如果是多個文件則生成對應的txt文件,{@link #setTargetDir(String)},
	 * 如果targetDir不設置,則在當前文件夾下生成相應的txt文件
	 * 
	 * @param filePath
	 *            源文件路徑
	 * @param isAppend
	 *            false:覆蓋內容 true:附加內容
	 */
	public void parseFile(String filePath, boolean isAppend) {
		File file = new File(filePath);
		if (file.isDirectory()) {
			File items[] = file.listFiles();
			for (int i = 0; i < items.length; i++) {
				if (!items[i].getName().endsWith(".scel")) {
					continue;
				}

				if (targetDir == null) {
					parseFile(items[i].getAbsolutePath(), items[i].getAbsolutePath().replace(".scel", ".txt"),
							isAppend);
				} else {
					parseFile(items[i].getAbsolutePath(), targetDir + "/" + items[i].getName().replace(".scel", ".txt"),
							isAppend);
				}

			}
		} else {
			parseFile(filePath, file.getAbsolutePath().replace(".scel", ".txt"), isAppend);
		}

	}

	/**
	 * 解析單個scel文件
	 * 
	 * @param filePath
	 *            源文件路徑
	 * @param targetFilePath目標文件路徑
	 * @param isAppend
	 *            false:覆蓋內容 true:附加內容
	 */
	public void parseFile(String filePath, String targetFilePath, boolean isAppend) {
		if (!targetFilePath.endsWith(".txt")) {
			throw new IllegalStateException("文件格式錯誤,後綴必須爲.txt,此格式爲   " + targetFilePath);
		}
		if (!filePath.endsWith(".scel")) {
			return;
		}
		File input = new File(filePath);
		if (input.length() < 8) {
			// 假如文件小於8字節,不去考慮它
			return;
		}
		FileInputStream in = null;
		SougouScelMdel model = null;
		try {
			in = new FileInputStream(input);
			model = read(in);
			if (model == null) {
				return;
			}
			writeToTargetFile(model, targetFilePath, isAppend);
		} catch (IOException e) {
			log.log(Level.SEVERE, e.getMessage());
			e.printStackTrace();
		}

	}

	/**
	 * 解析多個文件夾,將解析後的內容放到一個文件裏
	 * 
	 * @param fileDirPath
	 *            源文件夾路徑
	 * @param targetFilePath
	 *            目標文件路徑
	 * @param isAppend
	 *            false:覆蓋內容 true:附加內容
	 * @throws FileNotFoundException
	 */
	public void parseFiles(String fileDirPath, String targetFilePath, boolean isAppend) throws IOException {
		if (!targetFilePath.endsWith(".txt")) {
			throw new IllegalStateException("文件格式錯誤,後綴必須爲.txt,此格式爲   " + targetFilePath);
		}
		File dir = new File(fileDirPath);
		if (!dir.exists() || !dir.isDirectory()) {
			throw new IllegalStateException("scel文件夾路徑錯誤   " + targetFilePath);
		}
		File scels[] = dir.listFiles();
		ArrayList<SougouScelMdel> models = new ArrayList<>();
		for (int i = 0; i < scels.length; i++) {
			if (!scels[i].getName().endsWith(".scel")) {
				continue;
			}
			FileInputStream in = null;
			SougouScelMdel model = null;
			in = new FileInputStream(scels[i]);
			model = read(in);
			if (model != null) {
				models.add(model);
			}
		}
		writeToTargetFile(models, targetFilePath, isAppend);
	}

	private void writeToTargetFile(SougouScelMdel model, String targetFilePath, boolean isAppend) throws IOException {
		List<SougouScelMdel> models = new ArrayList<>();
		models.add(model);
		writeToTargetFile(models, targetFilePath, isAppend);

	}

	/**
	 * 將搜狗scel文件解析後的內容寫入txt文件
	 * 
	 * @param models
	 * @param targetFilePath
	 * @param isAppend
	 * @throws IOException
	 */
	private void writeToTargetFile(List<SougouScelMdel> models, String targetFilePath, boolean isAppend)
			throws IOException {
		createParentDir(targetFilePath);
		FileOutputStream out = new FileOutputStream(targetFilePath, isAppend);
		int count = 0;
		for (int k = 0; k < models.size(); k++) {
			Map<String, List<String>> words = models.get(k).getWordMap(); // 詞<拼音,詞>
			Set<Entry<String, List<String>>> set = words.entrySet();
			Iterator<Entry<String, List<String>>> iter = set.iterator();
			if (isAppend) {
				out.write("\r\n".getBytes());
			}
			while (iter.hasNext()) {
				Entry<String, List<String>> entry = iter.next();
				List<String> list = entry.getValue();

				int size = list.size();
				for (int i = 0; i < size; i++) {
					String word = list.get(i);
					out.write((entry.getKey() + " ").getBytes());
					out.write((word + "\n").getBytes());// 寫入txt文件
					count++;

				}
			}

		}
		out.close();
		log.info("生成" + targetFilePath.substring(targetFilePath.lastIndexOf("/") + 1) + "成功!,總計寫入: " + count + " 條數據!");

	}

	private SougouScelMdel read(InputStream in) {
		SougouScelMdel model = new SougouScelMdel();
		DataInputStream input = new DataInputStream(in);
		int read;
		try {
			byte[] bytes = new byte[4];
			input.readFully(bytes);
			assert (bytes[0] == 0x40 && bytes[1] == 0x15 && bytes[2] == 0 && bytes[3] == 0);
			input.readFully(bytes);
			int flag1 = bytes[0];
			assert (bytes[1] == 0x43 && bytes[2] == 0x53 && bytes[3] == 0x01);
			int[] reads = new int[] { 8 };
			model.setName(readString(input, 0x130, reads));
			model.setType(readString(input, 0x338, reads));
			model.setDescription(readString(input, 0x540, reads));
			model.setSample(readString(input, 0xd40, reads));
			read = reads[0];
			input.skip(0x1540 - read);
			read = 0x1540;
			input.readFully(bytes);
			read += 4;
			assert (bytes[0] == (byte) 0x9D && bytes[1] == 0x01 && bytes[2] == 0 && bytes[3] == 0);
			bytes = new byte[128];
			Map<Integer, String> pyMap = new LinkedHashMap<Integer, String>();
			while (true) {
				int mark = readUnsignedShort(input);
				int size = input.readUnsignedByte();
				input.skip(1);
				read += 4;
				assert (size > 0 && (size % 2) == 0);
				input.readFully(bytes, 0, size);
				read += size;
				String py = new String(bytes, 0, size, encoding);
				// System.out.println(py);
				pyMap.put(mark, py);
				if ("zuo".equals(py)) {
					break;
				}
			}
			if (flag1 == 0x44) {
				input.skip(0x2628 - read);
			} else if (flag1 == 0x45) {
				input.skip(0x26C4 - read);
			} else {
				throw new RuntimeException("出現意外,聯繫作者");
			}
			StringBuffer buffer = new StringBuffer();
			Map<String, List<String>> wordMap = new LinkedHashMap<String, List<String>>();
			while (true) {
				int size = readUnsignedShort(input);
				if (size < 0) {
					break;
				}
				int count = readUnsignedShort(input);
				int len = count / 2;
				assert (len * 2 == count);
				buffer.setLength(0);
				for (int i = 0; i < len; i++) {
					int key = readUnsignedShort(input);
					buffer.append(pyMap.get(key)).append("'");
				}
				buffer.setLength(buffer.length() - 1);
				String py = buffer.toString();
				List<String> list = wordMap.get(py);
				if (list == null) {
					list = new ArrayList<String>();
					wordMap.put(py, list);
				}
				for (int i = 0; i < size; i++) {
					count = readUnsignedShort(input);
					if (count > bytes.length) {
						bytes = new byte[count];
					}
					input.readFully(bytes, 0, count);
					String word = new String(bytes, 0, count, encoding);
					// 接下來12個字節可能是詞頻或者類似信息
					input.skip(12);
					list.add(word);
				}
			}
			model.setWordMap(wordMap);
			return model;
		} catch (IOException e) {
			log.log(Level.SEVERE, e.getMessage());
			e.printStackTrace();
		} finally {
			try {
				in.close();
			} catch (IOException e) {
				// TODO Auto-generated catch block
				e.printStackTrace();
			}
		}
		return null;
	}

	protected String readString(DataInputStream input, int pos, int[] reads) throws IOException {
		int read = reads[0];
		input.skip(pos - read);
		read = pos;
		output.reset();
		while (true) {
			int c1 = input.read();
			int c2 = input.read();
			read += 2;
			if (c1 == 0 && c2 == 0) {
				break;
			} else {
				output.write(c1);
				output.write(c2);
			}
		}
		reads[0] = read;
		return new String(output.toByteArray(), encoding);
	}

	protected final int readUnsignedShort(InputStream in) throws IOException {
		int ch1 = in.read();
		int ch2 = in.read();
		if ((ch1 | ch2) < 0) {
			return Integer.MIN_VALUE;
		}
		return (ch2 << 8) + (ch1 << 0);
	}

}

3、SougouScelModel


package com.hxl.files;

import java.util.List;
import java.util.Map;

/**
 * Parsed content of one Sougou .scel dictionary: four descriptive strings
 * plus the words grouped by pinyin.
 */
public class SougouScelModel {
    // Descriptive strings of the dictionary
    private String name;
    private String type;
    private String description;
    private String sample;

    // pinyin -> words sharing that pinyin
    private Map<String, List<String>> wordMap;

    /** @return the dictionary name, or null when not set */
    public String getName() {
        return this.name;
    }

    public void setName(String name) {
        this.name = name;
    }

    /** @return the dictionary type, or null when not set */
    public String getType() {
        return this.type;
    }

    public void setType(String type) {
        this.type = type;
    }

    /** @return the dictionary description, or null when not set */
    public String getDescription() {
        return this.description;
    }

    public void setDescription(String description) {
        this.description = description;
    }

    /** @return the sample text of the dictionary, or null when not set */
    public String getSample() {
        return this.sample;
    }

    public void setSample(String sample) {
        this.sample = sample;
    }

    /** @return the words keyed by pinyin, or null when not set */
    public Map<String, List<String>> getWordMap() {
        return this.wordMap;
    }

    // Package-private: only the parser of the same package fills the word map
    void setWordMap(Map<String, List<String>> wordMap) {
        this.wordMap = wordMap;
    }
}

4、TxtFileProcessing 繼承於FileProcessing

package com.hxl.files;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;


/**
 * Extracts the de-duplicated Chinese words from txt dictionaries whose lines
 * have the form "pinyin word" and writes them one per line.
 */
public class TxtFileProcessing extends FileProcessing {
	private static final Logger log = Logger.getLogger("TxtFileProcessing");
	private static final String TXT_SUFFIX = ".txt";
	// Suffix given to generated files that are stored next to their source
	private static final String PARSED_SUFFIX = "解析.txt";
	// Charset used both to read the sources and to write the target
	private String encoding = "UTF-8";

	/**
	 * Extracts the words of a single txt file. Sources that do not end with
	 * ".txt" are silently skipped; a missing source is logged.
	 * 
	 * @param filePath
	 *            path of the source file
	 * @param targetFilePath
	 *            path of the txt file to generate
	 * @param isAppend
	 *            true to keep the words already present in the target, false
	 *            to overwrite it
	 * @throws IllegalStateException
	 *             if the target path does not end with ".txt"
	 */
	@Override
	public void parseFile(String filePath, String targetFilePath, boolean isAppend) {
		requireTxtTarget(targetFilePath);
		if (!filePath.endsWith(TXT_SUFFIX)) {
			return;
		}
		File inputFile = new File(filePath);
		if (!inputFile.exists()) {
			log.log(Level.SEVERE, filePath + "   文件不存在");
			return;
		}
		createParentDir(targetFilePath);
		File outputFile = new File(targetFilePath);
		List<String> content = new ArrayList<>();
		if (isAppend) {
			// Keep what the target already holds
			content.addAll(readTargetFile(outputFile));
		}
		// Everything is read before the target is opened for writing, so a
		// target that is also the source is not wiped before being read.
		content.addAll(readSourceFile(inputFile));
		// De-duplicate
		HashSet<String> words = new HashSet<>(content);
		writeToTargetFile(words, outputFile);
	}

	/**
	 * Extracts the words of one txt file or of every txt file of a folder. For
	 * a folder, one file is generated per source, stored under the same name
	 * in the directory set with {@link #setTargetDir(String)} or, when none is
	 * set, next to its source with the suffix "解析.txt". A single file is
	 * always converted next to its source.
	 * 
	 * @param filePath
	 *            path of the source file or folder
	 * @param isAppend
	 *            false: overwrite the content, true: append to it
	 */
	@Override
	public void parseFile(String filePath, boolean isAppend) {
		File file = new File(filePath);
		if (!file.isDirectory()) {
			parseFile(filePath, toParsedPath(file.getAbsolutePath()), isAppend);
			return;
		}
		File[] items = file.listFiles();
		if (items == null) {
			// listFiles() returns null when the directory cannot be read
			log.log(Level.SEVERE, "無法讀取文件夾   " + file.getAbsolutePath());
			return;
		}
		for (File item : items) {
			if (!item.getName().endsWith(TXT_SUFFIX)) {
				continue;
			}
			String target;
			if (targetDir == null) {
				target = toParsedPath(item.getAbsolutePath());
			} else {
				target = new File(targetDir, item.getName()).getPath();
			}
			parseFile(item.getAbsolutePath(), target, isAppend);
		}
	}

	/**
	 * Extracts the words of every txt file of a folder into one file.
	 * 
	 * @param fileDirPath
	 *            path of the source folder
	 * @param targetFilePath
	 *            path of the txt file to generate
	 * @param isAppend
	 *            true to keep the words already present in the target, false
	 *            to overwrite it
	 * @throws IOException
	 *             if the folder cannot be listed
	 * @throws IllegalStateException
	 *             if the target path does not end with ".txt" or the source
	 *             path is not an existing folder
	 */
	@Override
	public void parseFiles(String fileDirPath, String targetFilePath, boolean isAppend) throws IOException {
		requireTxtTarget(targetFilePath);
		File fileDir = new File(fileDirPath);
		if (!fileDir.isDirectory()) {
			// Report the offending source folder, not the target file
			throw new IllegalStateException("文件夾路徑錯誤   " + fileDirPath);
		}
		File[] files = fileDir.listFiles();
		if (files == null) {
			throw new IOException("無法讀取文件夾   " + fileDir.getAbsolutePath());
		}
		createParentDir(targetFilePath);
		File outputFile = new File(targetFilePath);
		List<String> content = new ArrayList<>();
		if (isAppend) {
			// The target holds one word per line, not "pinyin word" pairs, so
			// it has to be read with readTargetFile.
			content.addAll(readTargetFile(outputFile));
		}
		File absoluteOutput = outputFile.getAbsoluteFile();
		for (File source : files) {
			if (!source.getName().endsWith(TXT_SUFFIX)) {
				continue;
			}
			if (source.getAbsoluteFile().equals(absoluteOutput)) {
				// The target may live in the source folder; it is not a source
				continue;
			}
			content.addAll(readSourceFile(source));
		}
		// De-duplicate
		HashSet<String> words = new HashSet<>(content);
		writeToTargetFile(words, outputFile);
	}

	// Rejects target paths that are not txt files.
	private static void requireTxtTarget(String targetFilePath) {
		if (!targetFilePath.endsWith(TXT_SUFFIX)) {
			throw new IllegalStateException("文件格式錯誤,後綴必須爲.txt,此格式爲   " + targetFilePath);
		}
	}

	// Swaps a trailing ".txt" for "解析.txt"; other occurrences (e.g. in a
	// folder name) and paths without that suffix are left untouched.
	private static String toParsedPath(String path) {
		if (!path.endsWith(TXT_SUFFIX)) {
			return path;
		}
		return path.substring(0, path.length() - TXT_SUFFIX.length()) + PARSED_SUFFIX;
	}

	/**
	 * Writes the words into the target file, one per line, replacing its
	 * previous content. Failures are logged.
	 * 
	 * @param set
	 *            words to write
	 * @param outputFile
	 *            target file
	 */
	private void writeToTargetFile(HashSet<String> set, File outputFile) {
		StringBuilder buff = new StringBuilder();
		for (String word : set) {
			buff.append(word).append("\r\n");
		}
		// try-with-resources: no close() on a stream that failed to open
		try (FileOutputStream out = new FileOutputStream(outputFile)) {
			out.write(buff.toString().getBytes(encoding));
		} catch (IOException e) {
			log.log(Level.SEVERE, e.getMessage(), e);
			// Do not report success below
			return;
		}
		log.info("生成" + outputFile.getName() + "成功!,總計寫入: " + set.size() + " 條數據!");
	}

	/**
	 * Reads a source file and collects its Chinese words. Each line is split
	 * on single spaces; the tokens at odd positions are taken as words, those
	 * at even positions as pinyin.
	 * 
	 * @param file
	 *            source file
	 * @return the words found, empty when the file is missing or unreadable
	 */
	private List<String> readSourceFile(File file) {
		List<String> content = new ArrayList<>();
		if (!file.isFile()) {
			log.log(Level.SEVERE, "找不到源文件   " + file.getAbsolutePath());
			return content;
		}
		try (BufferedReader reader = new BufferedReader(
				new InputStreamReader(new FileInputStream(file), encoding))) {
			String line;
			while ((line = reader.readLine()) != null) {
				String[] tokens = line.split(" ");
				// Odd positions hold the words, even positions the pinyin
				for (int i = 1; i < tokens.length; i += 2) {
					content.add(tokens[i]);
				}
			}
		} catch (IOException e) {
			log.log(Level.SEVERE, e.getMessage(), e);
		}
		return content;
	}

	/**
	 * Reads an already generated word file, one word per line, skipping blank
	 * lines.
	 * 
	 * @param file
	 *            word file
	 * @return the lines found, empty when the file is missing or unreadable
	 */
	private List<String> readTargetFile(File file) {
		List<String> content = new ArrayList<>();
		if (!file.isFile()) {
			log.warning("找不到目標文件  " + file.getAbsolutePath());
			return content;
		}
		try (BufferedReader reader = new BufferedReader(
				new InputStreamReader(new FileInputStream(file), encoding))) {
			String line;
			while ((line = reader.readLine()) != null) {
				if (!line.trim().isEmpty()) {
					content.add(line);
				}
			}
		} catch (IOException e) {
			log.log(Level.SEVERE, e.getMessage(), e);
		}
		return content;
	}

}


















發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章