Custom InputFormat
Requirement
Both HDFS and MapReduce lose efficiency when dealing with small files, yet in practice it is hard to avoid workloads with large numbers of small files, so a corresponding solution is needed.
Before running:
After running:
Analysis
Small-file optimization essentially comes down to the following approaches:
- At data-collection time, merge small files or small batches of data into large files before uploading them to HDFS
- Before business processing, run a MapReduce program on HDFS to merge the small files
- At MapReduce processing time, use CombineTextInputFormat to improve efficiency (see the sketch after this list)
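For the third option, here is a minimal, hedged sketch of a driver using Hadoop's built-in CombineTextInputFormat; the class name CombineDriver and the 4 MB split cap are illustrative assumptions, and a concrete job would still configure its mapper, reducer and key/value classes as usual.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class CombineDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(CombineDriver.class);
        // Pack many small files into a few splits instead of one split per file
        job.setInputFormatClass(CombineTextInputFormat.class);
        // Illustrative cap: each combined split holds at most roughly 4 MB of input
        CombineTextInputFormat.setMaxInputSplitSize(job, 4 * 1024 * 1024);
        // Mapper, reducer and key/value classes would be configured here as in any normal job
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}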
Implementation
What is implemented here is the second approach above.
Core mechanism of the program:
Define a custom InputFormat
Override the RecordReader so that it reads one complete file at a time and wraps it as a single KV pair
Use SequenceFileOutputFormat on the output side to write the merged file
Custom InputFormat
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
public class CusInputFormat extends FileInputFormat<NullWritable, BytesWritable> {
    // Never split a file: each small file must be read as a single record
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }

    @Override
    public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        // Hand each split (i.e. each whole file) to the custom RecordReader
        CusRecordReader cusRecordReader = new CusRecordReader();
        cusRecordReader.initialize(split, context);
        return cusRecordReader;
    }
}
Custom RecordReader
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
public class CusRecordReader extends RecordReader<NullWritable, BytesWritable> {
    // Job configuration
    private Configuration conf;
    // The split to read (one whole file, since isSplitable() returns false)
    private FileSplit split;
    // Whether the single record of this split has already been read
    private boolean progress = false;
    // The value to emit: the complete file content
    private BytesWritable values = new BytesWritable();

    @Override
    public void close() throws IOException {
        // Nothing to close here; the input stream is closed in nextKeyValue()
    }

    @Override
    public NullWritable getCurrentKey() throws IOException, InterruptedException {
        return NullWritable.get();
    }

    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return values;
    }

    // Reading progress: 1 once the single record has been consumed, 0 otherwise
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return this.progress ? 1 : 0;
    }

    // Initialization: remember the split and the job configuration
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        this.split = (FileSplit) split;
        this.conf = context.getConfiguration();
    }

    // Reads the entire file as a single record (the whole file is buffered in memory,
    // which is acceptable here because the inputs are small files)
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!progress) {
            // 1. Allocate a buffer the size of the file
            byte[] data = new byte[(int) this.split.getLength()];
            // Get the path of the file backing this split
            Path path = split.getPath();
            // Get the file system
            FileSystem fs = path.getFileSystem(conf);
            // Read the full file content into the buffer
            FSDataInputStream fis = fs.open(path);
            IOUtils.readFully(fis, data, 0, data.length);
            // Expose the file content as the current value
            values.set(data, 0, data.length);
            IOUtils.closeStream(fis);
            this.progress = true;
            return true;
        }
        return false;
    }
}
Map side
import java.io.IOException;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
public class FileMap extends Mapper<NullWritable, BytesWritable, Text, BytesWritable> {
    Text outputkey = new Text();

    @Override
    protected void map(NullWritable key, BytesWritable value, Context context)
            throws IOException, InterruptedException {
        // Emit the file path as the key and the full file content as the value
        context.write(outputkey, value);
    }

    // Runs before map(), and only once per split
    @Override
    protected void setup(Mapper<NullWritable, BytesWritable, Text, BytesWritable>.Context context)
            throws IOException, InterruptedException {
        // 1. Get the split information for this task
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        // 2. Get the path of the file behind the split
        String name = inputSplit.getPath().toString();
        // 3. Use the file path as the output key
        outputkey.set(name);
    }
}
Reduce side
import java.io.IOException;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class FileReduce extends Reducer<Text, BytesWritable, Text, BytesWritable> {
    @Override
    protected void reduce(Text key, Iterable<BytesWritable> values, Context context)
            throws IOException, InterruptedException {
        // Each key is a unique file path, so there is exactly one value per key
        context.write(key, values.iterator().next());
    }
}
Driver (main method)
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
public class Driver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(Driver.class);
        job.setMapperClass(FileMap.class);
        job.setReducerClass(FileReduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);
        // Read each whole small file as a single record
        job.setInputFormatClass(CusInputFormat.class);
        // Write the merged result as a SequenceFile, as described in the analysis above
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
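To check that the merge produced the expected SequenceFile, here is a small, hedged verification sketch; the class name ReadMergedFile is illustrative, and the path passed in should be the part-r-00000 file under the job's output directory.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
public class ReadMergedFile {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Assumed location of the merged output; adjust to the job's actual output file
        Path path = new Path(args[0]);
        try (SequenceFile.Reader reader =
                new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))) {
            Text key = new Text();
            BytesWritable value = new BytesWritable();
            // Each record is one original small file: key = file path, value = file bytes
            while (reader.next(key, value)) {
                System.out.println(key + " : " + value.getLength() + " bytes");
            }
        }
    }
}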
Custom OutputFormat
Requirement
Some raw logs need enhancement processing. The workflow:
- Read data from the raw log files
- Use a URL field in each log record to look up information in an external knowledge base and enrich the raw record with it
- If enrichment succeeds, write the record to the enhanced-results directory; if it fails, extract the URL field from the raw record and write it to the to-be-crawled list directory
Before running:
After running:
Analysis
The key point of the program is that a single MapReduce job must write two kinds of results to different directories depending on the data. This kind of flexible output requirement can be met by customizing the OutputFormat.
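As a side note, Hadoop also ships MultipleOutputs, which covers the same "two kinds of results from one job" case without a custom OutputFormat. The sketch below is illustrative only: the named outputs enhanced and tocrawl and the class name MosReduce are assumptions, and the driver would additionally need MultipleOutputs.addNamedOutput(...) calls for both names. The rest of this article follows the custom OutputFormat approach.
import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
public class MosReduce extends Reducer<Text, NullWritable, Text, NullWritable> {
    private MultipleOutputs<Text, NullWritable> mos;

    @Override
    protected void setup(Context context) {
        mos = new MultipleOutputs<>(context);
    }

    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        // Route each record to one of two named outputs under the job's output directory;
        // here the success/failure of the enhancement is assumed to be encoded in the line itself
        if (key.toString().contains("tocrawl")) {
            mos.write("tocrawl", key, NullWritable.get());
        } else {
            mos.write("enhanced", key, NullWritable.get());
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        mos.close();
    }
}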
Implementation
Key points of the implementation:
- Access external resources from within MapReduce (a sketch follows this list)
- Define a custom OutputFormat and override its RecordWriter, in particular the write() method that actually emits the data
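For the first point, here is a hedged sketch of loading an external knowledge base once per task in Mapper.setup() and consulting it during map(); the path /data/knowledgebase.txt, the tab-separated field layout, and the class name EnhanceMap are illustrative assumptions rather than part of the original code.
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class EnhanceMap extends Mapper<LongWritable, Text, Text, NullWritable> {
    // In-memory copy of the external knowledge base: url -> extra content
    private final Map<String, String> knowledgeBase = new HashMap<>();
    private final Text k = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Illustrative path; in practice this would point at the real knowledge base
        Path kbPath = new Path("/data/knowledgebase.txt");
        FileSystem fs = kbPath.getFileSystem(context.getConfiguration());
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(kbPath)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Assumed layout: url <TAB> extra content
                String[] fields = line.split("\t", 2);
                if (fields.length == 2) {
                    knowledgeBase.put(fields[0], fields[1]);
                }
            }
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Assumed layout of the raw log: the URL is the second tab-separated field
        String[] fields = value.toString().split("\t");
        String url = fields.length > 1 ? fields[1] : "";
        String extra = knowledgeBase.get(url);
        if (extra != null) {
            // Enhancement succeeded: append the knowledge-base content to the raw line
            k.set(value.toString() + "\t" + extra);
        } else {
            // Enhancement failed: emit only the URL so it can be crawled later
            k.set(url + "\ttocrawl");
        }
        context.write(k, NullWritable.get());
    }
}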
The full code implementation follows:
Custom OutputFormat
import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class FilteroutputFormat extends FileOutputFormat<Text, NullWritable> {
    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext context)
            throws IOException, InterruptedException {
        // Hand all output records to the custom RecordWriter defined below
        return new FilterRecoderWriter(context);
    }
}
Custom RecordWriter
import java.io.IOException;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
public class FilterRecoderWriter extends RecordWriter<Text, NullWritable> {
    // Output streams for the two categories of records
    FSDataOutputStream adminOut = null;
    FSDataOutputStream otherOut = null;

    public FilterRecoderWriter(TaskAttemptContext context) {
        // 1. Get the file system
        FileSystem fs = null;
        try {
            fs = FileSystem.get(context.getConfiguration());
            // Output paths for the two categories of records
            Path adminPath = new Path("C:\\Users\\55454_000\\Desktop\\adminlog");
            Path otherPath = new Path("C:\\Users\\55454_000\\Desktop\\otherlog");
            // Create the two output streams
            adminOut = fs.create(adminPath);
            otherOut = fs.create(otherPath);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    @Override
    public void close(TaskAttemptContext arg0) throws IOException, InterruptedException {
        if (adminOut != null) {
            adminOut.close();
        }
        if (otherOut != null) {
            otherOut.close();
        }
    }

    @Override
    public void write(Text key, NullWritable value) throws IOException, InterruptedException {
        // Route each record by content: lines containing "admin" go to adminOut, the rest to otherOut
        // (a trailing newline is appended so that records end up on separate lines)
        if (key.toString().contains("admin")) {
            adminOut.write((key.toString() + "\n").getBytes());
        } else {
            otherOut.write((key.toString() + "\n").getBytes());
        }
    }
}
Map side
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class FilterMap extends Mapper<LongWritable, Text, Text, NullWritable> {
    Text k = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Take the whole input line as the output key
        String line = value.toString();
        k.set(line);
        // 2. Emit it; NullWritable serves as a placeholder value
        context.write(k, NullWritable.get());
    }
}
Reduce side
import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class FilterReduce extends Reducer<Text, NullWritable, Text, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        // Pass each line through; the routing into adminlog/otherlog happens in the custom RecordWriter
        context.write(key, NullWritable.get());
    }
}
Driver (main method)
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Driver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(Driver.class);
        job.setMapperClass(FilterMap.class);
        job.setReducerClass(FilterReduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // Use the custom OutputFormat so records are routed to the two log files
        job.setOutputFormatClass(FilteroutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // An output directory is still required even though the RecordWriter writes to its own paths:
        // FileOutputFormat checks it and places the _SUCCESS marker there
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}