多路歸併排序


package merge;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;

/**
 * External multi-way merge sort demo.
 *
 * <p>External sorting targets files whose records live on disk and cannot be loaded into memory
 * in one go. The file is first split into chunks that do fit in memory; each chunk is sorted in
 * memory and written out as a sorted run. The sorted runs are then merged into a single sorted
 * output file.
 *
 * <p>NOTE(review): chunks are buffered in a {@link TreeSet} and the merged output in a
 * {@link LinkedHashSet}, so equal values that meet inside one buffer are written only once.
 *
 * @author jia.hej
 * @version $Id: MergeSort.java, v 0.1 2009-8-7 15:53:51 jia.hej Exp $
 */
public class MergeSort {

    /** 10 */
    private static final long TEN = 10;
    /** 100 */
    private static final long HUNDRED = 100;
    /** 1,000 */
    private static final long THOUSAND = 1000;
    /** 10,000 */
    private static final long TEN_THOUSAND = 10000; // author's timing note: 1078 / 00:00:01 078
    /** 100,000 */
    private static final long HUNDRED_THOUSAND = 100000; // author's timing note: 9656 / 00:00:09 656
    /** 1,000,000 */
    private static final long MILLION = 1000000; // author's timing note: 93733 / 00:01:33 733
    /** 10,000,000 */
    private static final long TEN_MILLION = 10000000; // author's timing note: 970144 / 00:16:10 144
    /** 100,000,000 */
    private static final long HUNDRED_MILLION = 100000000;
    /** 1,000,000,000 */
    private static final long BILLION = 1000000000;
    /** 10,000,000,000 */
    private static final long TEN_BILLION = 10000000000L;
    /** 100,000,000,000 */
    private static final long HUNDRED_BILLION = 100000000000L;

    private static final String INPUT_FILE = "c:\\test\\input.txt";

    private static final String OUTPUT_FILE = "c:\\test\\output.txt";

    /** Directory that receives the sorted chunk files. */
    private static final String TEMP_DIR = "c:\\test\\temp";

    /** Number of input lines per chunk file (and per buffered write batch). */
    private static final int SPLIT_SIZE = 10 * 10000;

    private static int numSize;

    /**
     * Generates a random input file, writes a fully in-memory sorted reference copy, then splits,
     * merges and times the external sort, and finally removes the chunk files.
     *
     * @param args unused
     * @throws Exception if any file operation fails
     */
    public static void main(String[] args) throws Exception {
        createDir("c:\\test");
        createFile(INPUT_FILE);
        numSize = createRandomNum(TEN_MILLION);

        // In-memory reference sort of the whole input, written next to it.
        sortFile(INPUT_FILE);

        long beginTime = System.currentTimeMillis();
        System.out.println("begin=" + beginTime);

        // Split the input into sorted chunk files.
        splitFile(INPUT_FILE, numSize);

        List<String> splitFilePathList = new ArrayList<String>();
        File[] files = new File(TEMP_DIR).listFiles();
        // listFiles() returns null when the directory is missing or unreadable.
        if (files != null) {
            for (File file : files) {
                splitFilePathList.add(file.getAbsolutePath());
            }
        }

        // Merge the chunk files into the output file.
        createFile(OUTPUT_FILE);
        mergeFile(splitFilePathList, OUTPUT_FILE);

        long endTime = System.currentTimeMillis();
        System.out.println("end=" + endTime);
        System.out.println("end-begin=" + (endTime - beginTime));
        SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss SSS");
        // The value formatted is a duration, not a wall-clock instant: pin the zone to UTC so the
        // time-of-day part equals the elapsed time instead of being shifted by the local offset.
        simpleDateFormat.setTimeZone(java.util.TimeZone.getTimeZone("UTC"));
        System.out.println(simpleDateFormat.format(endTime - beginTime));

        // Remove the chunk files. All readers are closed by now, so the files can be deleted
        // directly instead of forcing a GC and shelling out to "cmd /c del".
        deleteTextFiles(TEMP_DIR);
    }

    /**
     * Reads every line of {@code path} into an in-memory sorted set and writes the result to
     * {@code c:\test\input排序.txt}.
     *
     * @param path file containing one integer per line
     * @throws Exception if reading, parsing or writing fails
     */
    private static void sortFile(String path) throws Exception {
        SortedSet<Integer> set = new TreeSet<Integer>();
        BufferedReader bufferedReader = new BufferedReader(new FileReader(new File(path)));
        try {
            String value;
            while ((value = bufferedReader.readLine()) != null) {
                set.add(Integer.parseInt(value));
            }
        } finally {
            bufferedReader.close();
        }
        createFile("c:\\test\\input排序.txt");
        writeFile("c:\\test\\input排序.txt", set, false);
    }

    /**
     * Splits the input file into sorted chunk files {@code 1.txt}, {@code 2.txt}, ... inside the
     * temp directory, {@link #SPLIT_SIZE} input lines per chunk.
     *
     * <p>The trailing partial chunk is written as well; previously those records were dropped
     * whenever the line count was not a multiple of {@link #SPLIT_SIZE}.
     *
     * @param inputPath file containing one integer per line
     * @param numSize expected number of input lines; kept for call compatibility, chunking is
     *        driven by the lines actually read
     * @throws Exception if reading, parsing or writing fails
     */
    private static void splitFile(String inputPath, int numSize) throws Exception {
        createDir(TEMP_DIR);
        BufferedReader bufferedReader = new BufferedReader(new FileReader(new File(inputPath)));
        try {
            SortedSet<Integer> set = new TreeSet<Integer>();
            int count = 0; // lines read into the current chunk
            int fileNum = 1;
            String str;
            while ((str = bufferedReader.readLine()) != null) {
                set.add(Integer.parseInt(str));
                count++;
                // Chunk is full: write it out and start the next one.
                if (count >= SPLIT_SIZE) {
                    writeChunk(fileNum, set);
                    set.clear();
                    count = 0;
                    fileNum++;
                }
            }
            // Write the remainder; always produce at least 1.txt, even for an empty input.
            if (!set.isEmpty() || fileNum == 1) {
                writeChunk(fileNum, set);
            }
        } finally {
            bufferedReader.close();
        }
    }

    /** Writes one sorted chunk to {@code <temp dir>\<fileNum>.txt}, replacing any old content. */
    private static void writeChunk(int fileNum, Set<Integer> set) throws Exception {
        String chunkPath = TEMP_DIR + "\\" + fileNum + ".txt";
        createFile(chunkPath);
        writeFile(chunkPath, set, false);
    }

    /**
     * Merges the sorted chunk files into one sorted output file.
     *
     * <p>Example with 1.txt (1, 3, 5, 7, 9) and 2.txt (6, 8, 9):<br/>
     * 1 and 6 enter the set first.<br/>
     * Output 1 (from 1.txt), read 3; the set now holds 6 and 3.<br/>
     * Output 3 (from 1.txt), read 5; the set now holds 6 and 5.<br/>
     * Output 5 (from 1.txt), read 7; the set now holds 6 and 7.<br/>
     * Output 6 (from 2.txt), read 8; the set now holds 8 and 7.<br/>
     * Output 7 (from 1.txt), read 9; the set now holds 8 and 9.<br/>
     * Output 8 (from 2.txt), read 9; the set now holds 9 and 9.<br/>
     * Output 9.
     * </p>
     *
     * <p>The output file is overwritten by the first batch of this run and appended to
     * afterwards, so stale content from an earlier run never survives.
     *
     * @param splitFilePathList paths of the sorted chunk files, each named {@code <number>.txt}
     * @param outputPath path of the merged output file
     * @throws Exception if reading, parsing or writing fails
     */
    private static void mergeFile(List<String> splitFilePathList, String outputPath)
            throws Exception {
        // One FileInfo per chunk file, ordered by its current value.
        SortedSet<FileInfo> fileInfoSet = new TreeSet<FileInfo>(new FileInfoComparator());
        // Every reader opened here, so all of them get closed even on failure.
        List<BufferedReader> openedReaders = new ArrayList<BufferedReader>();
        try {
            for (int i = 0; i < splitFilePathList.size(); i++) {
                String splitFilePath = splitFilePathList.get(i);
                BufferedReader bufferedReader =
                        new BufferedReader(new FileReader(new File(splitFilePath)));
                openedReaders.add(bufferedReader);

                FileInfo fileInfo = new FileInfo();
                // File number = file name without directory and ".txt" suffix.
                fileInfo.setFileNum(Integer.parseInt(splitFilePath.substring(
                        splitFilePath.lastIndexOf("\\") + 1, splitFilePath.indexOf(".txt"))));
                fileInfo.setReader(bufferedReader);
                String value = bufferedReader.readLine();
                if (value != null) {
                    fileInfo.setValue(value);
                    fileInfo.setLineNum(fileInfo.getLineNum() + 1);
                    fileInfoSet.add(fileInfo);
                }
                // An empty chunk file contributes nothing; its reader is closed in finally.
            }

            Set<Integer> valueSet = new LinkedHashSet<Integer>();
            boolean hasWritten = false;
            int count = 1;
            while (!fileInfoSet.isEmpty()) {
                // Take the smallest head out of the set before touching it: mutating an element
                // that is still inside a TreeSet would corrupt the ordering.
                FileInfo currentFileInfo = fileInfoSet.first();
                fileInfoSet.remove(currentFileInfo);

                int fileNum = currentFileInfo.getFileNum();
                int lineNum = currentFileInfo.getLineNum();
                String value = currentFileInfo.getValue();

                valueSet.add(Integer.parseInt(value));
                // Flush the buffer in batches of SPLIT_SIZE.
                if (valueSet.size() >= SPLIT_SIZE) {
                    writeFile(outputPath, valueSet, hasWritten);
                    valueSet.clear();
                    hasWritten = true;
                }

                // Advance this file; re-insert it unless it is exhausted.
                if (currentFileInfo.readNextValue()) {
                    fileInfoSet.add(currentFileInfo);
                }

                System.out.println("----- MergeFile:" + count++ + " -----");
                System.out.println("fileNum=" + fileNum);
                System.out.println("lineNum=" + lineNum);
                System.out.println("value=" + value);
                System.out.println("----------------------------");
            }

            // Write the remainder. When nothing was written at all this still truncates the
            // output so it reflects this run only.
            if (!valueSet.isEmpty() || !hasWritten) {
                writeFile(outputPath, valueSet, hasWritten);
            }
        } finally {
            closeQuietly(openedReaders);
        }
    }

    /**
     * Writes {@code numSize} random integers in the range {@code [1, numSize]} to the input
     * file, one per line.
     *
     * <p>Numbers are unique within each batch of {@link #SPLIT_SIZE}, not across batches. The
     * input file is overwritten by the first batch and appended to afterwards.
     *
     * @param numSize how many numbers to generate; must not exceed {@link Integer#MAX_VALUE}
     * @return the number of values written
     * @throws IllegalArgumentException if {@code numSize} exceeds {@link Integer#MAX_VALUE}
     * @throws Exception if writing fails
     */
    private static int createRandomNum(long numSize) throws Exception {
        if (numSize > Integer.MAX_VALUE) {
            // The int counter and int values below could never reach such a size.
            throw new IllegalArgumentException("numSize too large: " + numSize);
        }
        Set<Integer> set = new LinkedHashSet<Integer>();
        int count = 0;
        boolean hasWritten = false;
        while (count < numSize) {
            int num = (int) (Math.random() * numSize + 1);
            if (set.add(num)) {
                count++;
            }
            // Flush the buffer in batches of SPLIT_SIZE.
            if (set.size() >= SPLIT_SIZE) {
                writeFile(INPUT_FILE, set, hasWritten);
                set.clear();
                hasWritten = true;
            }
        }

        // Write the remainder (or truncate the file when nothing was generated).
        if (!set.isEmpty() || !hasWritten) {
            writeFile(INPUT_FILE, set, hasWritten);
        }
        return count;
    }

    /**
     * Creates the directory if it does not exist yet.
     *
     * @param dirPath directory to create
     */
    private static void createDir(String dirPath) {
        File dir = new File(dirPath);
        if (!dir.exists()) {
            if (dir.mkdir()) {
                System.out.println(dir.getName() + " is create.");
            }
        }
    }

    /**
     * Creates the file if it does not exist yet; an existing file is left untouched.
     *
     * @param path file to create
     * @throws Exception if the file cannot be created
     */
    private static void createFile(String path) throws Exception {
        File file = new File(path);
        if (!file.exists()) {
            if (file.createNewFile()) {
                System.out.println(file.getName() + " is create.");
            }
        }
    }

    /**
     * Writes every element of {@code set} to {@code path}, one per line, in iteration order.
     *
     * @param path target file
     * @param set values to write
     * @param isAppend {@code true} to append, {@code false} to overwrite
     * @throws Exception if writing fails
     */
    private static void writeFile(String path, Set<Integer> set, boolean isAppend) throws Exception {
        BufferedWriter bufferedWriter =
                new BufferedWriter(new FileWriter(new File(path), isAppend));
        try {
            for (Integer value : set) {
                bufferedWriter.write(value.toString());
                bufferedWriter.newLine();
            }
            bufferedWriter.flush();
        } finally {
            // Closing the buffered writer also closes the underlying FileWriter.
            bufferedWriter.close();
        }
    }

    /** Deletes every {@code *.txt} file directly inside {@code dirPath}. */
    private static void deleteTextFiles(String dirPath) {
        File[] files = new File(dirPath).listFiles();
        if (files == null) {
            return;
        }
        for (File file : files) {
            if (file.isFile() && file.getName().toLowerCase().endsWith(".txt") && !file.delete()) {
                System.err.println("could not delete " + file.getAbsolutePath());
            }
        }
    }

    /** Closes every reader; closing an already closed reader has no effect. */
    private static void closeQuietly(List<BufferedReader> readers) {
        for (BufferedReader reader : readers) {
            try {
                reader.close();
            } catch (java.io.IOException ignored) {
                // Nothing useful can be done about a failed close of a read-only stream.
            }
        }
    }
}


package merge;

import java.io.BufferedReader;

/**
 * Read cursor over one sorted chunk file: the file number, the reader, the line most recently
 * read and its line number.
 *
 * @author jia.hej
 * @version $Id: FileInfo.java, v 0.1 2009-8-1 02:11:30 jia.hej Exp $
 */
public class FileInfo {

    /** File number. */
    private int fileNum;

    /** Number of the line most recently read (0 before the first read). */
    private int lineNum = 0;

    /** Line most recently read. */
    private String value;

    /** Reader the lines are pulled from. */
    private BufferedReader reader;

    /** Set once the reader has reported end of file and has been closed. */
    private boolean exhausted = false;

    /**
     * Reads the next line from the reader.
     *
     * <p>On success the line becomes the current value and the line number is incremented. At
     * end of file the reader is closed; the current value and line number stay unchanged. Calls
     * made without a reader, or after end of file was reached, return {@code false} instead of
     * failing.
     *
     * @return {@code true} if a line was read, {@code false} if there is nothing (more) to read
     * @throws Exception if reading from or closing the reader fails
     */
    public boolean readNextValue() throws Exception {
        if (this.reader == null || this.exhausted) {
            return false;
        }
        String next = this.reader.readLine();
        if (next != null) {
            this.value = next;
            this.lineNum++;
            return true;
        }
        // Mark first so a later call stays a no-op even if close() throws.
        this.exhausted = true;
        this.reader.close();
        return false;
    }

    /** @return the file number */
    public int getFileNum() {
        return fileNum;
    }

    /** @param fileNum the file number */
    public void setFileNum(int fileNum) {
        this.fileNum = fileNum;
    }

    /** @return the number of the line most recently read */
    public int getLineNum() {
        return lineNum;
    }

    /** @param lineNum the number of the line most recently read */
    public void setLineNum(int lineNum) {
        this.lineNum = lineNum;
    }

    /** @return the line most recently read */
    public String getValue() {
        return value;
    }

    /** @param value the line most recently read */
    public void setValue(String value) {
        this.value = value;
    }

    /** @return the reader, which may already be closed */
    public BufferedReader getReader() {
        return reader;
    }

    /**
     * Sets the reader and makes this cursor readable again.
     *
     * @param reader the reader to pull lines from
     */
    public void setReader(BufferedReader reader) {
        this.reader = reader;
        this.exhausted = false;
    }
}


package merge;

import java.util.Comparator;

/**
 * Orders {@link FileInfo} cursors by their current value parsed as an integer, breaking ties
 * with the file number so that cursors of different files holding the same value stay distinct.
 *
 * @author jia.hej
 * @version $Id: FileInfoComparator.java, v 0.1 2009-8-7 13:42:05 jia.hej Exp $
 */
public class FileInfoComparator implements Comparator<FileInfo> {

    /**
     * Compares two cursors by current value, then by file number.
     *
     * <p>Uses explicit comparisons rather than subtraction: a difference of two ints can
     * overflow and report the wrong sign.
     *
     * @param o1 first cursor
     * @param o2 second cursor
     * @return a negative number, zero or a positive number as {@code o1} sorts before, equal to
     *         or after {@code o2}
     * @throws NumberFormatException if either current value is not a parsable integer
     */
    @Override
    public int compare(FileInfo o1, FileInfo o2) {
        int value1 = Integer.parseInt(o1.getValue());
        int value2 = Integer.parseInt(o2.getValue());
        if (value1 != value2) {
            return value1 < value2 ? -1 : 1;
        }
        // Equal values: fall back to the file number.
        int fileNum1 = o1.getFileNum();
        int fileNum2 = o2.getFileNum();
        if (fileNum1 == fileNum2) {
            return 0;
        }
        return fileNum1 < fileNum2 ? -1 : 1;
    }

}



重寫的第二版

package merge;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;

/**
 * External multi-way merge sort demo (second version).
 *
 * <p>Generates a file of random integers, splits it into sorted chunk files of
 * {@link #SPLIT_SIZE} lines, and merges the chunks into one sorted output file by repeatedly
 * emitting the smallest current value among all chunk files.
 *
 * @author jia.hej
 * @version $Id: MergeSort.java, v 0.1 2017-5-6 00:56:01 jia.hej Exp $
 */
public class MergeSort {

    /** 10 */
    private static final long TEN = 10;
    /** 100 */
    private static final long HUNDRED = 100;
    /** 1,000 */
    private static final long THOUSAND = 1000;
    /** 10,000 */
    private static final long TEN_THOUSAND = 10000; // author's timing note: 1078 / 00:00:01 078
    /** 100,000 */
    private static final long HUNDRED_THOUSAND = 100000; // author's timing note: 9656 / 00:00:09 656
    /** 1,000,000 */
    private static final long MILLION = 1000000; // author's timing note: 93733 / 00:01:33 733
    /** 10,000,000 */
    private static final long TEN_MILLION = 10000000; // author's timing note: 970144 / 00:16:10 144
    /** 100,000,000 */
    private static final long HUNDRED_MILLION = 100000000;
    /** 1,000,000,000 */
    private static final long BILLION = 1000000000;
    /** 10,000,000,000 */
    private static final long TEN_BILLION = 10000000000L;
    /** 100,000,000,000 */
    private static final long HUNDRED_BILLION = 100000000000L;

    private static final String INPUT_FILE = "c:\\test\\input.txt";

    private static final String OUTPUT_FILE = "c:\\test\\output.txt";

    /** Directory that receives the sorted chunk files. */
    private static final String TEMP_DIR = "c:\\test\\temp";

    /** Directory that receives the per-chunk re-sorted copies. */
    private static final String SORT_DIR = "c:\\test\\sort";

    /** Number of input lines per chunk file. */
    private static final int SPLIT_SIZE = 1000;

    /**
     * Generates the random input, then splits, sorts, merges and times the external sort, and
     * finally removes the temporary files.
     *
     * @param args unused
     * @throws Exception if any file operation fails
     */
    public static void main(String[] args) throws Exception {
        createDir("c:\\test");
        createFile(INPUT_FILE);
        int numberSize = createRandomNum(HUNDRED_THOUSAND);

        long beginTime = System.currentTimeMillis();

        // Split the input into sorted chunk files.
        splitFile(INPUT_FILE, numberSize);

        createDir(SORT_DIR);
        List<String> splitFilePathList = new ArrayList<String>();
        File[] files = new File(TEMP_DIR).listFiles();
        // listFiles() returns null when the directory is missing or unreadable.
        if (files != null) {
            for (File file : files) {
                splitFilePathList.add(file.getAbsolutePath());

                // NOTE(review): splitFile already writes each chunk sorted and the merge below
                // reads the chunk files, not these copies, so this pass looks redundant.
                sortFile(file.getAbsolutePath());
            }
        }

        // Merge the chunk files into the output file.
        createFile(OUTPUT_FILE);
        mergeFile(splitFilePathList, OUTPUT_FILE);

        System.out.println("begin=" + beginTime);
        long endTime = System.currentTimeMillis();
        System.out.println("end=" + endTime);
        System.out.println("總耗時=" + ((endTime - beginTime) / 1000) + "s");
        SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss SSS");
        // The value formatted is a duration, not a wall-clock instant: pin the zone to UTC so the
        // time-of-day part equals the elapsed time instead of being shifted by the local offset.
        simpleDateFormat.setTimeZone(java.util.TimeZone.getTimeZone("UTC"));
        System.out.println(simpleDateFormat.format(endTime - beginTime));

        // Remove the temporary files. All readers are closed by now, so the files can be deleted
        // directly instead of forcing a GC and shelling out to "cmd /c del".
        deleteTextFiles(TEMP_DIR);
        deleteTextFiles(SORT_DIR);
    }

    /**
     * Reads every line of {@code path} into an in-memory sorted set and writes the result to a
     * file of the same name inside the sort directory.
     *
     * @param path file containing one integer per line
     * @throws Exception if reading, parsing or writing fails
     */
    private static void sortFile(String path) throws Exception {
        SortedSet<Integer> set = new TreeSet<Integer>();
        File file = new File(path);
        BufferedReader bufferedReader = new BufferedReader(new FileReader(file));
        try {
            String value;
            while ((value = bufferedReader.readLine()) != null) {
                set.add(Integer.parseInt(value));
            }
        } finally {
            bufferedReader.close();
        }
        String sortedPath = SORT_DIR + "\\" + file.getName();
        createFile(sortedPath);
        writeFile(sortedPath, set, false);
    }

    /**
     * Splits the input file into sorted chunk files {@code 1.txt}, {@code 2.txt}, ... inside the
     * temp directory, {@link #SPLIT_SIZE} input lines per chunk.
     *
     * <p>The trailing partial chunk is written as well; previously those records were dropped
     * whenever the line count was not a multiple of {@link #SPLIT_SIZE}. Chunks are buffered in a
     * {@link TreeSet}, so equal values inside one chunk are written once.
     *
     * @param inputPath file containing one integer per line
     * @param numberSize expected number of input lines; kept for call compatibility, chunking is
     *        driven by the lines actually read
     * @throws Exception if reading, parsing or writing fails
     */
    private static void splitFile(String inputPath, int numberSize) throws Exception {
        createDir(TEMP_DIR);
        BufferedReader bufferedReader = new BufferedReader(new FileReader(new File(inputPath)));
        try {
            SortedSet<Integer> set = new TreeSet<Integer>();
            int count = 0; // lines read into the current chunk
            int fileNum = 1;
            String str;
            while ((str = bufferedReader.readLine()) != null) {
                set.add(Integer.parseInt(str));
                count++;
                // Chunk is full: write it out and start the next one.
                if (count >= SPLIT_SIZE) {
                    writeChunk(fileNum, set);
                    set.clear();
                    count = 0;
                    fileNum++;
                }
            }
            // Write the remainder; always produce at least 1.txt, even for an empty input.
            if (!set.isEmpty() || fileNum == 1) {
                writeChunk(fileNum, set);
            }
        } finally {
            bufferedReader.close();
        }
    }

    /** Writes one sorted chunk to {@code <temp dir>\<fileNum>.txt}, replacing any old content. */
    private static void writeChunk(int fileNum, Set<Integer> set) throws Exception {
        String chunkPath = TEMP_DIR + "\\" + fileNum + ".txt";
        createFile(chunkPath);
        writeFile(chunkPath, set, false);
    }

    /**
     * Merges the sorted chunk files into one sorted output file.
     *
     * <p>Example with 1.txt (1, 3, 5, 7, 9) and 2.txt (6, 8, 9):<br/>
     * The heads are 1 and 6.<br/>
     * Output 1 (from 1.txt), read 3; the heads are 3 and 6.<br/>
     * Output 3 (from 1.txt), read 5; the heads are 5 and 6.<br/>
     * Output 5 (from 1.txt), read 7; the heads are 7 and 6.<br/>
     * Output 6 (from 2.txt), read 8; the heads are 7 and 8.<br/>
     * Output 7 (from 1.txt), read 9; the heads are 9 and 8.<br/>
     * Output 8 (from 2.txt), read 9; the heads are 9 and 9.<br/>
     * Output 9 twice, once per file.
     * </p>
     *
     * <p>The loop runs until every chunk file is exhausted, and the head values are tracked per
     * file rather than in a map keyed by value, so files whose heads are equal no longer
     * overwrite each other. The output file is overwritten.
     *
     * @param splitFilePathList paths of the sorted chunk files, each named {@code <number>.txt}
     * @param outputPath path of the merged output file
     * @throws Exception if reading, parsing or writing fails
     */
    private static void mergeFile(List<String> splitFilePathList, String outputPath)
            throws Exception {
        // Parallel lists: sources.get(i) is a chunk file that still has data and heads.get(i)
        // is its current value, parsed once.
        List<FileInfo> sources = new ArrayList<FileInfo>();
        List<Integer> heads = new ArrayList<Integer>();
        // Every reader opened here, so all of them get closed even on failure.
        List<BufferedReader> openedReaders = new ArrayList<BufferedReader>();
        BufferedWriter outputBufferedWriter = null;
        try {
            for (int i = 0; i < splitFilePathList.size(); i++) {
                String splitFilePath = splitFilePathList.get(i);
                BufferedReader bufferedReader =
                        new BufferedReader(new FileReader(new File(splitFilePath)));
                openedReaders.add(bufferedReader);

                FileInfo fileInfo = new FileInfo();
                // File number = file name without directory and ".txt" suffix.
                fileInfo.setFileNo(Integer.parseInt(splitFilePath.substring(
                        splitFilePath.lastIndexOf("\\") + 1, splitFilePath.indexOf(".txt"))));
                fileInfo.setReader(bufferedReader);
                String value = bufferedReader.readLine();
                if (value != null) {
                    fileInfo.setCurrentValue(value);
                    fileInfo.setCurrentLineNo(fileInfo.getCurrentLineNo() + 1);
                    sources.add(fileInfo);
                    heads.add(Integer.valueOf(value));
                }
                // An empty chunk file contributes nothing; its reader is closed in finally.
            }

            outputBufferedWriter = new BufferedWriter(new FileWriter(new File(outputPath), false));

            int mergedCount = 0;
            while (!sources.isEmpty()) {
                // Find the file holding the smallest head (first one wins on ties).
                int minIndex = 0;
                for (int i = 1; i < heads.size(); i++) {
                    if (heads.get(i).intValue() < heads.get(minIndex).intValue()) {
                        minIndex = i;
                    }
                }

                // Emit the smallest value; the buffered writer is flushed once at the end.
                outputBufferedWriter.write(heads.get(minIndex).toString());
                outputBufferedWriter.newLine();
                mergedCount++;

                FileInfo currentFileInfo = sources.get(minIndex);
                System.out.println("----- MergeFile:" + mergedCount + " -----");
                System.out.println("fileNo=" + currentFileInfo.getFileNo());
                System.out.println("currentLineNo=" + currentFileInfo.getCurrentLineNo());
                System.out.println("currentValue=" + currentFileInfo.getCurrentValue());
                System.out.println("----------------------------");

                // Advance that file, or drop it once it is exhausted.
                if (currentFileInfo.readNextValue()) {
                    heads.set(minIndex, Integer.valueOf(currentFileInfo.getCurrentValue()));
                } else {
                    currentFileInfo.closeReader();
                    sources.remove(minIndex);
                    heads.remove(minIndex);
                }
            }
            outputBufferedWriter.flush();
        } finally {
            try {
                if (outputBufferedWriter != null) {
                    outputBufferedWriter.close();
                }
            } finally {
                closeQuietly(openedReaders);
            }
        }
    }

    /**
     * Writes {@code numberSize} distinct random integers in the range {@code [0, numberSize)} to
     * the input file, one per line, replacing any previous content.
     *
     * @param numberSize how many numbers to generate; must not exceed {@link Integer#MAX_VALUE}
     * @return the number of values written
     * @throws IllegalArgumentException if {@code numberSize} exceeds {@link Integer#MAX_VALUE}
     * @throws Exception if writing fails
     */
    private static int createRandomNum(long numberSize) throws Exception {
        if (numberSize > Integer.MAX_VALUE) {
            // The int counter and int values below could never reach such a size.
            throw new IllegalArgumentException("numberSize too large: " + numberSize);
        }
        int count = 0;
        Set<Integer> set = new LinkedHashSet<Integer>();

        while (count < numberSize) {
            int num = (int) (Math.random() * numberSize);
            if (set.add(num)) {
                count++;
            }
        }

        writeFile(INPUT_FILE, set, false);
        return count;
    }

    /**
     * Re-creates the directory: an existing directory is deleted first (which only succeeds when
     * it is empty) and then created again.
     *
     * @param dirPath directory to create
     */
    private static void createDir(String dirPath) {
        File dir = new File(dirPath);
        if (dir.exists()) {
            dir.delete();
        }

        if (dir.mkdir()) {
            System.out.println("[" + dir.getName() + " dir is create]");
        }
    }

    /**
     * Re-creates the file: an existing file is deleted first and then created empty.
     *
     * @param path file to create
     * @throws Exception if the file cannot be created
     */
    private static void createFile(String path) throws Exception {
        File file = new File(path);
        if (file.exists()) {
            file.delete();
        }

        if (file.createNewFile()) {
            System.out.println("[" + file.getName() + " file is create]");
        }
    }

    /**
     * Writes every element of {@code set} to {@code path}, one per line, in iteration order.
     *
     * @param path target file
     * @param set values to write
     * @param isAppend {@code true} to append, {@code false} to overwrite
     * @throws Exception if writing fails
     */
    private static void writeFile(String path, Set<Integer> set, boolean isAppend) throws Exception {
        BufferedWriter bufferedWriter =
                new BufferedWriter(new FileWriter(new File(path), isAppend));
        try {
            for (Integer value : set) {
                bufferedWriter.write(value.toString());
                bufferedWriter.newLine();
            }
            bufferedWriter.flush();
        } finally {
            // Closing the buffered writer also closes the underlying FileWriter.
            bufferedWriter.close();
        }
    }

    /** Deletes every {@code *.txt} file directly inside {@code dirPath}. */
    private static void deleteTextFiles(String dirPath) {
        File[] files = new File(dirPath).listFiles();
        if (files == null) {
            return;
        }
        for (File file : files) {
            if (file.isFile() && file.getName().toLowerCase().endsWith(".txt") && !file.delete()) {
                System.err.println("could not delete " + file.getAbsolutePath());
            }
        }
    }

    /** Closes every reader; closing an already closed reader has no effect. */
    private static void closeQuietly(List<BufferedReader> readers) {
        for (BufferedReader reader : readers) {
            try {
                reader.close();
            } catch (java.io.IOException ignored) {
                // Nothing useful can be done about a failed close of a read-only stream.
            }
        }
    }

}


package merge;

import java.io.BufferedReader;
import java.io.IOException;

/**
 * Read cursor over one chunk file: the file number, the reader, the line most recently read and
 * its line number.
 *
 * @author jia.hej
 * @version $Id: FileInfo.java, v 0.1 2017-5-6 00:56:01 jia.hej Exp $
 */
public class FileInfo {

    /** File number. */
    private int fileNo = 0;

    /** Number of the line most recently read (0 before the first read). */
    private int currentLineNo = 0;

    /** Line most recently read. */
    private String currentValue;

    /** Reader the lines are pulled from; {@code null} once end of file was reached. */
    private BufferedReader reader;

    /**
     * Reads the next line from the reader.
     *
     * <p>On success the line becomes the current value and the line number is incremented. At
     * end of file the reader is closed and dropped; the current value and line number stay
     * unchanged.
     *
     * @return {@code true} if a line was read, {@code false} if no reader is set or the end of
     *         the file was reached
     * @throws Exception if reading from or closing the reader fails
     */
    public boolean readNextValue() throws Exception {
        final BufferedReader source = this.reader;
        if (source == null) {
            return false;
        }

        final String line = source.readLine();
        if (line == null) {
            // End of file: release the reader and forget it.
            source.close();
            this.reader = null;
            return false;
        }

        this.currentValue = line;
        this.currentLineNo += 1;
        return true;
    }

    /**
     * Closes the reader if one is still set.
     *
     * @throws IOException if closing fails
     */
    public void closeReader() throws IOException {
        if (this.reader == null) {
            return;
        }
        this.reader.close();
    }

    /** @return the file number */
    public int getFileNo() {
        return this.fileNo;
    }

    /** @param fileNo the file number */
    public void setFileNo(int fileNo) {
        this.fileNo = fileNo;
    }

    /** @return the number of the line most recently read */
    public int getCurrentLineNo() {
        return this.currentLineNo;
    }

    /** @param currentLineNo the number of the line most recently read */
    public void setCurrentLineNo(int currentLineNo) {
        this.currentLineNo = currentLineNo;
    }

    /** @return the line most recently read */
    public String getCurrentValue() {
        return this.currentValue;
    }

    /** @param currentValue the line most recently read */
    public void setCurrentValue(String currentValue) {
        this.currentValue = currentValue;
    }

    /** @return the reader, or {@code null} if none is set or the end of file was reached */
    public BufferedReader getReader() {
        return this.reader;
    }

    /** @param reader the reader to pull lines from */
    public void setReader(BufferedReader reader) {
        this.reader = reader;
    }

}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章