Java 實現大文件統計字母出現個數

簡述:

用多線程統計大文件中各字母的出現次數,從而找出出現次數最多的字母

文件格式如,

Z
Q
S
D
N
O
E
U
...


所有的類,包圖



0. Constants.java

常量類

package com.anialy.test.io;

/**
 * Shared constants of the letter-counting demo: the alphabet used for the
 * generated data and the names of the files that are read and written.
 *
 * All fields are {@code final} so they cannot be reassigned at runtime.
 * Note that the elements of {@link #words} are still writable because Java
 * arrays are always mutable; callers must treat the array as read-only.
 */
public class Constants {
	/** The upper-case letters A-Z, one single-character string per letter. */
	public static final String[] words = new String[]{
		"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K","L",
		"M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z"};
	
	/** Name of the randomly generated data file. */
	public static final String OUTPUT_FILE_NAME = "output.txt";
	
	/** Name of the statistics result file. */
	public static final String RESULTS_FILE = "results.txt";
}



1. DataProductor.java

數據生成代碼,可以跑一段時間然後看輸出的文件

運行main函數即可

package com.anialy.test.io;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Random;

/**
 * Generates test data: a background thread keeps appending random upper-case
 * letters, one per line, to the file named by
 * {@link Constants#OUTPUT_FILE_NAME}.
 */
public class DataProductor {

	/** Number of lines appended to the file per write. */
	private static final int LINES_PER_BATCH = 10000;

	private static final File outputFile = new File(Constants.OUTPUT_FILE_NAME);

	public static void main(String[] args) {
		// Generate random data.
		new DataProductor().produceData();
	}

	/**
	 * Starts the generator thread and returns immediately. The thread never
	 * stops on its own; terminate the JVM once the file is large enough.
	 */
	public void produceData(){
		new Thread(new PrepareOutput()).start();
	}


	/**
	 * Worker that appends batches of random letters to the output file.
	 */
	private static class PrepareOutput implements Runnable{
		// One generator for the whole run instead of a new one per letter.
		private final Random random = new Random();

		/** Picks one entry of {@link Constants#words} at random. */
		private String genWord(){
			return Constants.words[random.nextInt(Constants.words.length)];
		}

		public void run(){
			while(true){
				// A fresh buffer per batch: only the new lines are written,
				// so memory use stays constant and earlier data is not
				// written to the file again.
				StringBuilder batch = new StringBuilder(LINES_PER_BATCH * 2);
				for(int i=0; i<LINES_PER_BATCH; i++){
					batch.append(genWord()).append('\n');
				}
				// Open in append mode so existing content is kept; the stream
				// is closed automatically, also when opening or writing fails.
				try (FileOutputStream out = new FileOutputStream(outputFile, true)) {
					out.write(batch.toString().getBytes("UTF-8"));
				} catch (IOException e) {
					// Best effort: report the failure and try the next batch.
					e.printStackTrace();
				}
			}
		}
	}
}


生成的數據文件



示例如下,左側爲 notepad 中的行號,右側是字符,每行一個字符,以換行符分隔




3. CalcDemo.java

分成多個子線程統計每個英文字符出現的次數

package com.anialy.test.io;

import java.io.File;

/**
 * Splits the data file into byte ranges and lets one {@code CalcThread} per
 * range count the letters in it. {@link #CALC_THREADS_NUM} workers get
 * equally sized ranges; one extra worker handles the bytes left over by the
 * integer division.
 */
public class CalcDemo {
	// number of workers that each process one equally sized range
	public static final int CALC_THREADS_NUM = 8;
	// the source file
	private static final File file = new File(Constants.OUTPUT_FILE_NAME);
	// total length in bytes, sampled once when this class is initialized
	private static final long totalBytes = file.length();
	// bytes per regular worker
	private static final long bytesPerThread = totalBytes / CALC_THREADS_NUM;
	// bytes left after the even split (remainder of the division above)
	private static final long bytesLeft = totalBytes % CALC_THREADS_NUM;
	
	/** Prints the file size and how it is divided among the workers. */
	private static void initInfo(){
		System.out.printf("file size: %d bytes\n", totalBytes);
		System.out.printf("per thread: %d bytes\n", bytesPerThread);
		System.out.printf("bytes left: %d bytes\n", bytesLeft);
	}
	
	/**
	 * Starts {@code CALC_THREADS_NUM + 1} workers (ids 0 to
	 * {@code CALC_THREADS_NUM}) and waits until all of them have finished.
	 * If the calling thread is interrupted while waiting, its interrupt flag
	 * is restored and the method returns early.
	 */
	public static void doCalc() {
		initInfo();
		Thread[] workers = new Thread[CALC_THREADS_NUM + 1];
		for(int threadId=0; threadId<=CALC_THREADS_NUM; threadId++){
			long start;
			long end;
			if(threadId == CALC_THREADS_NUM){
				// extra worker: everything after the evenly split part
				start = bytesPerThread * CALC_THREADS_NUM;
				end = totalBytes;
			}else if(bytesPerThread == 0){
				// File shorter than the worker count: the regular workers get
				// an empty range placed at end-of-file, where there is
				// nothing to read; the extra worker covers the whole file.
				start = totalBytes;
				end = totalBytes;
			}else{
				start = threadId * bytesPerThread;
				end = start + bytesPerThread;
			}
			workers[threadId] = new CalcThread(file, start, end, threadId);
			workers[threadId].start();
		}
		
		// Wait for every worker so that callers see complete results.
		for(Thread worker : workers){
			try {
				worker.join();
			} catch (InterruptedException e) {
				Thread.currentThread().interrupt();
				return;
			}
		}
	}
}


生成結果文件,



4.CalcThread.java

3中使用的統計文件的線程類

package com.anialy.test.io;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;

/**
 * Worker thread that counts how often each entry of {@link Constants#words}
 * occurs in the byte range {@code [start, end)} of a file and then writes
 * the counts, one {@code letter:count} line per letter, to the file
 * {@code result-<NO>} in the working directory.
 */
public class CalcThread extends Thread{
	// number of bytes requested from the file per read call
	private static final int BUFFER_SIZE = 8192;
	
	// Thread NO.
	private int NO = 0; 
	
	// the file to analyze; opened read-only in run()
	private final File file;
	
	private final long start; // offset of the first byte to count
	
	private final long end; // offset just past the last byte to count
	
	// letter -> occurrences; pre-filled with 0 so every letter is reported
	private final Map<String, Integer> map = new TreeMap<String, Integer>();
	
	/**
	 * @param file  the data file to read
	 * @param start offset of the first byte of the range (inclusive)
	 * @param end   offset of the end of the range (exclusive)
	 * @param NO    worker number, used in the result file name
	 */
	public CalcThread(File file, Long start, Long end, int NO) {
		this.file = file;
		this.start = start;
		this.end = end;
		this.NO = NO;
		
		for(String word : Constants.words){
			map.put(word, 0);
		}
	}

	/**
	 * Counts the letters of the range and writes the result file. The result
	 * file is written even if reading failed, with whatever was counted up to
	 * that point.
	 */
	@Override
	public void run() {
		System.out.printf("Thread %d is now processing !!\n", NO);
		try {
			countRange();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			outputResult();
		}
	}
	
	/**
	 * Reads the bytes of {@code [start, end)} in chunks and counts the known
	 * letters. Stops early at end of file; an empty or inverted range reads
	 * nothing.
	 */
	private void countRange() throws IOException {
		// Read-only access is sufficient and does not create a missing file.
		try (RandomAccessFile raf = new RandomAccessFile(file, "r")) {
			raf.seek(start);
			byte[] buffer = new byte[BUFFER_SIZE];
			long remaining = end - start;
			while(remaining > 0){
				int wanted = (int) Math.min(buffer.length, remaining);
				int read = raf.read(buffer, 0, wanted);
				if(read == -1){
					break;
				}
				for(int i=0; i<read; i++){
					countByte(buffer, i);
				}
				remaining -= read;
			}
		}
	}
	
	/**
	 * Increments the counter of the letter stored at {@code buffer[index]}.
	 * Bytes that are not one of the known letters, such as the line
	 * terminator after each letter, are ignored.
	 */
	private void countByte(byte[] buffer, int index) throws IOException {
		String letter = new String(buffer, index, 1, "UTF-8");
		Integer cnt = map.get(letter);
		if(cnt != null){
			map.put(letter, cnt + 1);
		}
	}
	
	/**
	 * Writes the counts to this worker's own file {@code result-<NO>},
	 * replacing any content left by an earlier run.
	 */
	private void outputResult() {
		StringBuilder report = new StringBuilder();
		for(Map.Entry<String, Integer> entry : map.entrySet()){
			report.append(entry.getKey()).append(':').append(entry.getValue()).append('\n');
		}
		File outputFile = new File("result-" + NO);
		try (FileOutputStream outputFileStream = new FileOutputStream(outputFile)) {
			outputFileStream.write(report.toString().getBytes("UTF-8"));
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	
	// main: manual test that counts the whole data file on a single worker
	public static void main(String[] args) {
		File file = new File(Constants.OUTPUT_FILE_NAME);
		new CalcThread(file, 0L, file.length(), 1).start();
	}
}

5. Conclusion.java

對分割統計所得的小文件,最後進行統計

package com.anialy.test.io;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;

/**
 * Merges the partial result files {@code result-0} to
 * {@code result-<CALC_THREADS_NUM>} into one total per letter and writes the
 * totals to {@code results.txt}.
 */
public class Conclusion {
	
	// letter -> total count over all partial result files
	static Map<String, Integer> resultMap = new TreeMap<String, Integer>();
	// file that receives the merged totals
	static File file = new File("results.txt");
	
	/**
	 * Sums up every partial result file and writes the totals once all of
	 * them have been read. Totals from an earlier call are discarded first,
	 * so calling this method repeatedly does not inflate the counts. A
	 * partial file that cannot be read is reported and skipped.
	 */
	public static void sumUp() {
		resultMap.clear();
		// include the thread to deal with byte-left
		for(int i=0; i<CalcDemo.CALC_THREADS_NUM+1; i++){
			addPartialResult(new File("result-" + i));
		}
		showResults();
	}
	
	/**
	 * Adds the counts of one partial result file to {@link #resultMap}.
	 * Each line is expected to look like {@code letter:count}; lines that do
	 * not match are reported on standard error and skipped.
	 */
	private static void addPartialResult(File partial) {
		try (BufferedReader reader = new BufferedReader(
				new InputStreamReader(new FileInputStream(partial), "UTF-8"))) {
			String line;
			while((line = reader.readLine()) != null){
				int separator = line.indexOf(':');
				if(separator < 1){
					if(line.trim().length() > 0){
						System.err.println("Skipping malformed line in " + partial + ": " + line);
					}
					continue;
				}
				String letter = line.substring(0, separator);
				int cnt;
				try {
					cnt = Integer.parseInt(line.substring(separator + 1).trim());
				} catch (NumberFormatException e) {
					System.err.println("Skipping malformed line in " + partial + ": " + line);
					continue;
				}
				Integer total = resultMap.get(letter);
				resultMap.put(letter, total == null ? cnt : total + cnt);
			}
		} catch (IOException e) {
			// also covers a missing partial file (FileNotFoundException)
			e.printStackTrace();
		}
	}
	
	
	/**
	 * Writes all totals to the result file, one {@code letter: count} line
	 * per letter, replacing the previous content of the file.
	 */
	private static void showResults(){
		StringBuilder report = new StringBuilder();
		for(Map.Entry<String, Integer> entry : resultMap.entrySet()){
			report.append(entry.getKey()).append(": ").append(entry.getValue()).append('\n');
		}
		
		try (FileOutputStream fos = new FileOutputStream(file)) {
			fos.write(report.toString().getBytes("UTF-8"));
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
	
	public static void main(String[] args) {
		sumUp();
	}
}


得到,



結果如下

A: 47177
B: 45772
C: 46418
D: 46785
E: 47498
F: 46146
G: 45693
H: 46705
I: 47734
J: 46402
K: 46519
L: 46679
M: 46958
N: 46577
O: 47473
P: 45797
Q: 46858
R: 46850
S: 47055
T: 46028
U: 45530
V: 45026
W: 46445
X: 46960
Y: 46718
Z: 46197

6. MainTest.java

主測試文件(主線程 sleep 20 秒以等待各統計線程完成;如果 output.txt 數據文件更大,則需要更長的等待時間)

package com.anialy.test.io;

import java.io.File;

/**
 * Entry point of the demo: deletes the result files of earlier runs, starts
 * the counting threads, waits, and finally merges the partial results.
 *
 * Package: com.anialy.test.io
 *
 * File: MainTest.java 
 *
 * Author: anialy   Date: 2014-9-3
 * 
 */
public class MainTest {
	// time granted to the counting threads before merging, in milliseconds;
	// larger data files may need a longer wait
	private static final long WAIT_FOR_WORKERS_MILLIS = 20000;

	public static void main(String[] args) {
		// clear data: partial result files of earlier runs ...
		for(int i=0; i<=CalcDemo.CALC_THREADS_NUM; i++){
			deleteIfPresent(new File("result-" + i));
		}
		// ... and the merged result file
		deleteIfPresent(new File(Constants.RESULTS_FILE));


		// CALC_THREADS_NUM+1(for the bytes-left) Threads to analyze data
		CalcDemo.doCalc();

		// time waiting for all threads' process 
		try {
			Thread.sleep(WAIT_FOR_WORKERS_MILLIS);
		} catch (InterruptedException e) {
			// The wait was cut short, so the partial results may be
			// incomplete: restore the interrupt flag and do not merge.
			Thread.currentThread().interrupt();
			e.printStackTrace();
			return;
		}

		// main thread to sum up data
		Conclusion.sumUp();
	}

	/**
	 * Deletes {@code file} if it is a regular file and reports a failed
	 * deletion on standard error, because a stale file would otherwise be
	 * mistaken for a result of the current run.
	 */
	private static void deleteIfPresent(File file) {
		if(file.isFile() && !file.delete()){
			System.err.println("Could not delete old file: " + file);
		}
	}
}














發佈了225 篇原創文章 · 獲贊 34 · 訪問量 222萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章