package com.cmcm.goods_classification;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
 * Builds a word-frequency table from every file found under
 * {@link #DATASOURCEPATH}, skipping the words listed in the stop-word file,
 * and writes the table, sorted by descending frequency, through
 * {@link FileProcess#writeFile(List)}.
 *
 * <p>All state is kept in static fields, so the counts accumulate across
 * calls to {@link #pruneText(String)} / {@link #getFrequency(String[])}.
 */
public class WordProcess {
    /** Root folder whose files are tokenized and counted. */
    private static final String DATASOURCEPATH = "D://mallet_data//0DataSource//Watches_Child//Sports_Watches";
    /** Whitespace-separated stop-word list. */
    private static final String STOPWORDSPATH = "C://mallet-2.0.7//stoplists//en.txt";
    /** File the word/rank/frequency lines are appended to. */
    public static final String RESULTPATH = "D://automotives//result.txt";

    /** Word -> number of occurrences seen so far. */
    private static final Map<String, Integer> dataHash = new HashMap<String, Integer>();
    /** Words that must never be counted. */
    private static final Set<String> stopWordsSet = new HashSet<String>();

    /**
     * Loads the stop words, counts the words of every file under the data
     * folder, sorts the counts and writes them out.
     *
     * @param args unused
     * @throws Exception declared for backward compatibility
     */
    public static void main(String[] args) throws Exception {
        loadStopWords();
        FileProcess.readFolder(DATASOURCEPATH);
        List<Map.Entry<String, Integer>> dataList = hashSort();
        FileProcess.writeFile(dataList);
    }

    /**
     * Reads one text file, normalizes it and adds its words to the frequency
     * table.
     *
     * @param textPath path of the file to read
     */
    public static void pruneText(String textPath) {
        // Lower-case with a fixed locale so the result does not depend on the
        // machine's default locale (e.g. Turkish maps "I" to a dotless i).
        String text = FileProcess.readFile(textPath).toLowerCase(java.util.Locale.ENGLISH);
        // Replace every run of characters that are not letters, digits or
        // apostrophes (this includes all whitespace) with a single space.
        // The caret must be INSIDE the brackets to negate the class.
        text = text.replaceAll("[^a-zA-Z0-9']+", " ");
        // Split into words; a leading empty token is filtered in getFrequency.
        String[] words = text.split("\\s+");
        getFrequency(words);
    }

    /**
     * Adds each word of {@code words} to the frequency table. Empty tokens
     * and stop words are ignored.
     *
     * @param words tokens to count
     */
    public static void getFrequency(String[] words) {
        for (String word : words) {
            if (word == null || word.length() == 0 || stopWordsSet.contains(word)) {
                continue; // not a countable word
            }
            Integer seen = dataHash.get(word);
            if (seen == null) {
                dataHash.put(word, 1); // first occurrence
            } else {
                dataHash.put(word, seen + 1);
            }
        }
    }

    /**
     * Returns the entries of the frequency table ordered by descending
     * frequency; entries with equal frequency are ordered by word so the
     * result is deterministic.
     *
     * @return a new list holding the sorted entries
     */
    public static List<Map.Entry<String, Integer>> hashSort() {
        List<Map.Entry<String, Integer>> sorted =
                new ArrayList<Map.Entry<String, Integer>>(dataHash.entrySet());
        Collections.sort(sorted, new Comparator<Map.Entry<String, Integer>>() {
            public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                // Must return 0 for equal elements and be antisymmetric,
                // otherwise the sort may reject the comparator at runtime.
                int byFrequency = o2.getValue().compareTo(o1.getValue());
                if (byFrequency != 0) {
                    return byFrequency;
                }
                return o1.getKey().compareTo(o2.getKey());
            }
        });
        return sorted;
    }

    /**
     * Reads the stop-word file and stores every whitespace-separated token in
     * the stop-word set. Prints the number of tokens produced by the split.
     */
    public static void loadStopWords() {
        String stopWordsText = FileProcess.readFile(STOPWORDSPATH);
        // "\\s" already matches tab, carriage return and newline.
        String[] words = stopWordsText.split("\\s+");
        System.out.println(words.length);
        for (String word : words) {
            if (word.length() > 0) {
                stopWordsSet.add(word);
            }
        }
    }
}
package com.cmcm.goods_classification;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
public class FileProcess {
/**
* read all file in folder
* @param path
*/
public static void readFolder(String path) {
int fileNum = 0, folderNum = 0;
File file = new File(path);
if (file.exists()) {
LinkedList<File> list = new LinkedList<File>();
File[] files = file.listFiles();
for (File file2 : files) {
if (file2.isDirectory()) {//System.out.println("DIR : " + file2.getAbsolutePath());
list.add(file2);
folderNum++;
} else {
System.out.println("FILE: " + file2.getAbsolutePath());
WordProcess.pruneText(file2.getAbsolutePath());
fileNum++;
}
}
File temp_file;
while (!list.isEmpty()) {
temp_file = list.removeFirst();
files = temp_file.listFiles();
for (File file2 : files) {
if (file2.isDirectory()) {//System.out.println("DIR : " + file2.getAbsolutePath());
list.add(file2);
folderNum++;
} else {
System.out.println("FILE: " + file2.getAbsolutePath());
fileNum++;
WordProcess.pruneText(file2.getAbsolutePath());
}
}
}
} else {
System.out.println("File is not exist!");
}
System.out.println(" num dir is: " + folderNum + "\n num file is: "+ fileNum);
}
/**
* read content from filePath and return content
* @param filePath
*/
public static String readFile(String filePath) {
File file = new File(filePath);
StringBuffer result = new StringBuffer();
BufferedReader reader = null;
try {
reader = new BufferedReader(new FileReader(file));
String tempString = null;
while ((tempString = reader.readLine()) != null) {
result.append(" ");
result.append(tempString);
}
reader.close();
} catch (IOException e) {
e.printStackTrace();
} finally {
if (reader != null) {
try {
reader.close();
} catch (IOException e1) {
}
}
}
return result.toString();
}
/**
* write content into filePath
* @param dataList
*/
public static void writeFile(List<Map.Entry<String, Integer>> dataList) {
System.out.println("start write word and frequency");
int size = dataList.size();
File file = null;
FileWriter fileWrite = null;
PrintWriter pw = null;
int count = 1;
try {
// if file exist ,append ; if not, create
file = new File(WordProcess.RESULTPATH);
fileWrite = new FileWriter(file, true);
pw = new PrintWriter(fileWrite);
for (int i = 0; i < size; i++) {
String word = dataList.get(i).getKey();
int frequency = dataList.get(i).getValue();
// System.out.println(word + " : " + frequency);
pw.print(word);
pw.print(" ");
pw.print(count++);
pw.print(" ");
pw.print(frequency);
pw.println();
}
pw.flush();
fileWrite.flush();
} catch(IOException e) {
e.printStackTrace();
}finally{
try {
pw.close();
fileWrite.close();
} catch (IOException e) {
e.printStackTrace();
}
System.out.println("end write word and frequency");
}
}
}