Requirement
Both HDFS and MapReduce lose efficiency when handling large numbers of small files, yet in practice such scenarios are hard to avoid, so a corresponding solution is needed.
Analysis
Small-file optimization generally comes down to the following approaches:
1. At data-collection time, merge small files (or small batches of data) into larger files before uploading them to HDFS.
2. Before business processing, run a MapReduce job on HDFS that merges the small files; this is the approach implemented below.
3. At MapReduce processing time, use CombineTextInputFormat to improve efficiency, as shown in the sketch after this list.
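For approach 3, the following is a minimal driver sketch using Hadoop's built-in CombineTextInputFormat; the class name CombineDemo_Driver, the 4 MB split cap, and the argument-based paths are illustrative assumptions, not part of the original article. The point is simply that many small files are packed into each split instead of one split per file:

package inputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CombineDemo_Driver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(CombineDemo_Driver.class);
        // Pack many small files into one split instead of generating one split per file
        job.setInputFormatClass(CombineTextInputFormat.class);
        // Illustrative cap: each combined split holds at most ~4 MB of input (tune to the block size)
        CombineTextInputFormat.setMaxInputSplitSize(job, 4 * 1024 * 1024);
        CombineTextInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Note that this approach only reduces the number of map tasks; the small files themselves remain on HDFS, so approaches 1 and 2 are still needed when NameNode metadata pressure is the real problem.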
Implementation and Code
Custom InputFormat
package inputformat;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import java.io.IOException;

public class Custom_FileInputFormat extends FileInputFormat<NullWritable, BytesWritable> {
    /**
     * Mark every file as non-splittable so that each small file is kept intact
     * and handed to a mapper as one whole record.
     * @param context  the job context
     * @param filename the file being checked
     * @return false, so the file is never split
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }

    @Override
    public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        Custom_RecordReader custom_recordReader = new Custom_RecordReader();
        custom_recordReader.initialize(inputSplit, taskAttemptContext);
        return custom_recordReader;
    }
}
Custom RecordReader
package inputformat;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class Custom_RecordReader extends RecordReader<NullWritable, BytesWritable> {
    private FileSplit fileSplit;
    private Configuration conf;
    private BytesWritable bytesWritable = new BytesWritable();
    // whether the single record (the whole file) has already been emitted
    private boolean processed = false;

    /**
     * @param split   the split wrapping the whole file
     * @param context the task context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        this.fileSplit = (FileSplit) split;
        this.conf = context.getConfiguration();
    }

    // Read the next record: the whole file is emitted as a single key/value pair
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!processed) {
            // Get the file path
            Path path = fileSplit.getPath();
            // Get the FileSystem object
            FileSystem fileSystem = null;
            FSDataInputStream inputStream = null;
            try {
                fileSystem = FileSystem.get(conf);
                // Open the file
                inputStream = fileSystem.open(path);
                // Allocate a byte array as large as the file
                byte[] bytes = new byte[(int) fileSplit.getLength()];
                // Read the entire stream into the byte array
                IOUtils.readFully(inputStream, bytes, 0, bytes.length);
                // Wrap the byte array in the BytesWritable value
                bytesWritable.set(bytes, 0, bytes.length);
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                if (null != inputStream) {
                    inputStream.close();
                }
                if (null != fileSystem) {
                    fileSystem.close();
                }
            }
            processed = true;
            return true;
        } else {
            return false;
        }
    }

    // Current key
    @Override
    public NullWritable getCurrentKey() throws IOException, InterruptedException {
        return NullWritable.get();
    }

    // Current value
    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return bytesWritable;
    }

    // Progress: 1 once the single record has been read, 0 before that
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return processed ? 1 : 0;
    }

    // Nothing to clean up; the streams are closed in nextKeyValue()
    @Override
    public void close() throws IOException {
    }
}
Define the Mapper class
package inputformat;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class Custom_Mapper extends Mapper<NullWritable, BytesWritable, Text, BytesWritable> {
    @Override
    protected void map(NullWritable key, BytesWritable value, Context context) throws IOException, InterruptedException {
        // Use the file name as the output key so every small file stays identifiable in the merged output
        FileSplit fileSplit = (FileSplit) context.getInputSplit();
        String name = fileSplit.getPath().getName();
        context.write(new Text(name), value);
    }
}
Define the MapReduce job (driver)
package inputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import java.io.IOException;

public class Custom_Driver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(Custom_Driver.class);
        // Use the custom InputFormat so that every small file becomes exactly one record
        job.setInputFormatClass(Custom_FileInputFormat.class);
        Custom_FileInputFormat.addInputPath(job, new Path("G:\\input"));
        job.setMapperClass(Custom_Mapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);
        // Write the merged (fileName, fileBytes) pairs into a SequenceFile
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputPath(job, new Path("G:\\FileInputFormat_output"));
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
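The job's output is a SequenceFile whose keys are the original file names and whose values are the raw file bytes. As a quick way to verify the result, here is a minimal sketch that iterates over the merged records with Hadoop's SequenceFile.Reader; the class name SequenceFileInspector and the hard-coded part-r-00000 path are assumptions for illustration:

package inputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SequenceFileInspector {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Assumed output file name of the job above (single default reducer)
        Path path = new Path("G:\\FileInputFormat_output\\part-r-00000");
        try (SequenceFile.Reader reader =
                     new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))) {
            Text key = new Text();                     // original file name
            BytesWritable value = new BytesWritable(); // original file content
            while (reader.next(key, value)) {
                System.out.println(key + " : " + value.getLength() + " bytes");
            }
        }
    }
}

On a real cluster, hdfs dfs -text on the output file gives a similar (if less readable) view of the merged records.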