有一個1G大小的文件,裏面每一行是一個詞,
詞的大小不超過16字節,內存限制大小是10M。要求返回頻數最高的100個詞。
1、找出一種分類方式(找到散列方式或散列函數);
2、特殊情況考慮,防止分類後單類文件過大問題;
3、對分類的文件進行歸併。
1、分類方式(儘可能保證相同類型在一個文件中):
按照單詞的首字母(26個英文字母)及單詞長度分類;
例:文件名:a4,b5,c8....
字母小於4個爲一組,大於等於4小於等於5爲一組,大於5爲一組,共三組。
最多產生文件個數:26乘以3共78個。
2、假設某一個文件過大:首先按長度再次分割,
再按第二、三.....個英文字母再次分割;若所有字母都相同,則爲同一個單詞。
3、結果(歸併)。
代碼下載地址
package com.wqq.study.demo.service.bigdata;
import java.io.*;
import java.util.*;
/**
 * Finds the most frequent words in a large text file that holds one word per line.
 *
 * <p>The input is first split into bucket files keyed by the word's first character
 * and a length class ({@code 3} for fewer than four characters, {@code 4}, {@code 5},
 * and {@code 6} for six or more). Every occurrence of a word therefore lands in the
 * same bucket, so each bucket can be counted on its own and only its best
 * {@link #LIMIT} entries need to be kept for the final merge.
 *
 * <p>Words are lower-cased before bucketing, so counting is case-insensitive.
 */
public class BigData2 {
    /** Default input file read by {@link #main(String[])}. */
    public static String BIG_FILE_NAME = "C:\\Users\\wqq\\Desktop\\bigdata\\bigdata.txt";
    /** Not referenced by any code in this class. */
    public static String SORT_FILE_NAME = "C:\\Users\\wqq\\Desktop\\bigdata\\bigdatasort.txt";
    /** Maximum number of entries returned (and kept per bucket). */
    public static Integer LIMIT = 100;
    /** Line terminator written after every word in a bucket file. */
    public static String LINE_SEPARATOR = "\r\n";

    /** Default directory prefix (including trailing separator) for bucket files. */
    private static final String TEMP_FILE_PREFIX = "C:\\Users\\wqq\\Desktop\\bigdata\\";
    /** Charset used for every file this class reads or writes; matches the generator. */
    private static final String CHARSET_NAME = "utf-8";

    /** Orders entries by count, highest first. Equal counts keep their relative order. */
    private static final Comparator<Map.Entry<String, Integer>> BY_COUNT_DESC =
            new Comparator<Map.Entry<String, Integer>>() {
                @Override
                public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                    return o2.getValue().compareTo(o1.getValue());
                }
            };

    /**
     * Runs the split-and-count pipeline on {@link #BIG_FILE_NAME} and prints how many
     * entries were returned.
     */
    public static void main(String[] args) throws Exception {
        List<Map.Entry<String, Integer>> return100 = separateFile();
        System.out.println("return100:" + return100.size());
    }

    /** Splits and counts {@link #BIG_FILE_NAME} using the default bucket directory. */
    private static List<Map.Entry<String, Integer>> separateFile() {
        return separateFile(BIG_FILE_NAME, TEMP_FILE_PREFIX);
    }

    /**
     * Splits {@code sourceFile} into bucket files and returns the most frequent words.
     *
     * @param sourceFile     path of the input file, one word per line
     * @param tempFilePrefix text prepended to every bucket file name; normally a
     *                       directory path ending with a separator
     * @return up to {@link #LIMIT} entries ordered by descending count; an empty list
     *         if the input could not be read or a bucket could not be written
     */
    public static List<Map.Entry<String, Integer>> separateFile(String sourceFile, String tempFilePrefix) {
        Set<String> bucketFiles = new TreeSet<>();
        Map<String, Writer> writers = new HashMap<>();
        boolean splitSucceeded = false;
        try (BufferedReader reader = openReader(sourceFile)) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Locale.ROOT keeps the lower-casing independent of the default locale.
                line = line.toLowerCase(Locale.ROOT);
                String bucketFile = proFileName(tempFilePrefix, line);
                Writer writer = writers.get(bucketFile);
                if (writer == null) {
                    writer = new OutputStreamWriter(new FileOutputStream(bucketFile), CHARSET_NAME);
                    writers.put(bucketFile, writer);
                    bucketFiles.add(bucketFile);
                }
                writer.write(line);
                writer.write(LINE_SEPARATOR);
            }
            splitSucceeded = true;
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // The writers buffer their output, so every bucket must be closed (and
            // thereby flushed) before any of them is read back for counting.
            for (Writer writer : writers.values()) {
                try {
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                    splitSucceeded = false;
                }
            }
        }
        if (!splitSucceeded) {
            // Counting incomplete buckets would silently report wrong frequencies.
            return new ArrayList<>();
        }
        try {
            return getSort(bucketFiles);
        } catch (Exception e) {
            e.printStackTrace();
            return new ArrayList<>();
        }
    }

    /** Opens {@code fileName} for line-oriented reading in {@link #CHARSET_NAME}. */
    private static BufferedReader openReader(String fileName) throws IOException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(fileName), CHARSET_NAME));
    }

    /** Returns the bucket file for {@code line} under the default bucket directory. */
    private static String proFileName(String line) {
        return proFileName(TEMP_FILE_PREFIX, line);
    }

    /**
     * Builds the bucket file name {@code <prefix>temp<first char><length class>.txt}.
     *
     * <p>An empty line has no first character and maps to {@code temp3.txt}. A first
     * character that is neither a letter nor a digit is replaced by {@code '_'} so that
     * characters such as {@code ?}, {@code *} or {@code \} never reach the file name.
     */
    private static String proFileName(String tempFilePrefix, String line) {
        int len = line.length();
        StringBuilder name = new StringBuilder(tempFilePrefix).append("temp");
        if (len > 0) {
            char first = line.charAt(0);
            name.append(Character.isLetterOrDigit(first) ? first : '_');
        }
        // Length classes: below 4 -> 3, exactly 4 -> 4, exactly 5 -> 5, above 5 -> 6.
        name.append(Math.max(3, Math.min(len, 6)));
        name.append(".txt");
        return name.toString();
    }

    /**
     * Counts the lines of every given file and merges the per-file winners.
     *
     * <p>Each file is counted in isolation and only its {@link #LIMIT} most frequent
     * lines are carried into the merge. The result is the overall top list only when
     * no word is spread across more than one of the files.
     *
     * @param fileNameList files to count, processed in the set's iteration order;
     *                     {@code null} is treated as empty
     * @return up to {@link #LIMIT} entries ordered by descending count
     * @throws Exception if a file cannot be opened or read
     */
    public static List<Map.Entry<String, Integer>> getSort(Set<String> fileNameList) throws Exception {
        List<Map.Entry<String, Integer>> merged = new ArrayList<>();
        if (fileNameList == null) {
            return merged;
        }
        int limit = Math.max(0, LIMIT);
        for (String fileName : fileNameList) {
            Map<String, Integer> counts = new HashMap<>();
            try (BufferedReader reader = openReader(fileName)) {
                String word;
                while ((word = reader.readLine()) != null) {
                    Integer seen = counts.get(word);
                    counts.put(word, seen == null ? 1 : seen + 1);
                }
            }
            List<Map.Entry<String, Integer>> entries = new ArrayList<>(counts.entrySet());
            Collections.sort(entries, BY_COUNT_DESC);
            int keep = Math.min(limit, entries.size());
            for (Map.Entry<String, Integer> entry : entries.subList(0, keep)) {
                // Copy the entry so the kept winners do not reference the bucket's map nodes.
                merged.add(new AbstractMap.SimpleEntry<String, Integer>(entry));
            }
        }
        Collections.sort(merged, BY_COUNT_DESC);
        if (merged.size() > limit) {
            return new ArrayList<>(merged.subList(0, limit));
        }
        return merged;
    }

    /** Returns a random word of two to six ASCII letters in mixed case. */
    private String randomWord() {
        int length = 2 + (int) (Math.random() * 5);
        StringBuilder word = new StringBuilder(length);
        for (int i = 0; i < length; i++) {
            word.append((char) randomChar());
        }
        return word.toString();
    }

    /** Returns the ASCII code of a random letter, upper or lower case with equal odds. */
    private byte randomChar() {
        // 65 is 'A', 97 is 'a'.
        int base = ((int) (Math.random() * 2) == 0) ? 65 : 97;
        byte offset = (byte) (Math.random() * 26);
        return (byte) (base + offset);
    }

    /**
     * Appends 100,000,000 random words, one per line, to the sample input file.
     *
     * @throws IOException if the file cannot be opened or written
     */
    public void StringBufferDemo() throws IOException {
        File file = new File("C:\\Users\\wqq\\Desktop\\bigdata\\bigdata.txt");
        // Opening in append mode creates the file when it is missing. The buffer avoids
        // one system call per word, and try-with-resources closes the stream on failure.
        try (OutputStream out = new BufferedOutputStream(new FileOutputStream(file, true))) {
            for (int i = 0; i < 100000000; i++) {
                out.write((randomWord() + "\r\n").getBytes(CHARSET_NAME));
            }
        }
    }
}