hadoop08 -- Custom InputFormat and OutputFormat

Custom InputFormat

Requirement

Both HDFS and MapReduce lose efficiency when dealing with large numbers of small files, yet in practice such scenarios are hard to avoid, so a corresponding solution is needed.
Before running: (screenshot omitted)
After running: (screenshot omitted)

Analysis

Small-file optimization essentially comes down to the following approaches:

  1. At data-collection time, merge small files or small batches of data into large files before uploading them to HDFS
  2. Before business processing, run a MapReduce program on HDFS to merge the small files
  3. During MapReduce processing, use CombineTextInputFormat to improve efficiency (see the sketch below)
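
For approach 3, the following is a minimal driver-side sketch (the class name and the 4 MB split size are assumptions chosen for illustration) of plugging in CombineTextInputFormat so that many small files are packed into fewer splits:

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;

public class CombineInputSketch {
	// Pack many small files into fewer splits; call this from a driver before submitting the job.
	public static void configure(Job job) {
		job.setInputFormatClass(CombineTextInputFormat.class);
		// Cap each combined split at 4 MB (illustrative value, tune for your cluster).
		CombineTextInputFormat.setMaxInputSplitSize(job, 4 * 1024 * 1024);
	}
}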

Implementation

This section implements approach 2 above.
The core mechanism of the program:

	Define a custom InputFormat
	Override the RecordReader so that one complete file is read at a time and wrapped as a single KV pair
	On the output side, use SequenceFileOutputFormat to write out the merged file
Custom InputFormat
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class CusInputFormat extends FileInputFormat<NullWritable, BytesWritable> {

	// Never split a file: each small file is handled as exactly one split
	@Override
	protected boolean isSplitable(JobContext context, Path filename) {
		return false;
	}

	@Override
	public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
			throws IOException, InterruptedException {
		// Create the record reader and initialize it with the current split
		CusRecordReader cusRecordReader = new CusRecordReader();
		cusRecordReader.initialize(split, context);
		return cusRecordReader;
	}

}

Custom RecordReader
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class CusRecordReader extends RecordReader<NullWritable, BytesWritable> {

	// Hadoop configuration
	private Configuration conf;
	// The file split handled by this reader
	private FileSplit split;
	// Whether the single record has already been read
	private boolean progress = false;
	// The value to emit: the entire file content as bytes
	private BytesWritable values = new BytesWritable();

	@Override
	public void close() throws IOException {
		// Nothing to close here: the input stream is closed inside nextKeyValue()
	}

	@Override
	public NullWritable getCurrentKey() throws IOException, InterruptedException {
		// The key is not used, so return the NullWritable singleton
		return NullWritable.get();
	}

	@Override
	public BytesWritable getCurrentValue() throws IOException, InterruptedException {
		// Return the file content filled in by nextKeyValue()
		return values;
	}

	// Report read progress
	@Override
	public float getProgress() throws IOException, InterruptedException {
		// 1 once the single record has been read, 0 before
		return this.progress ? 1 : 0;
	}

	// Initialization: cache the split and the job configuration
	@Override
	public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
		this.split = (FileSplit) split;
		this.conf = context.getConfiguration();
	}

	// Read the entire file as a single key-value pair
	@Override
	public boolean nextKeyValue() throws IOException, InterruptedException {
		if (!progress) {
			// 1. Allocate a buffer large enough to hold the whole file
			byte[] data = new byte[(int) this.split.getLength()];
			FileSystem fs = null;
			FSDataInputStream fis = null;

			// 2. Get the path of the file to read
			Path path = split.getPath();
			// 3. Get the file system for that path
			fs = path.getFileSystem(conf);

			// 4. Open the file and read all of its bytes into the buffer
			fis = fs.open(path);
			IOUtils.readFully(fis, data, 0, data.length);

			// 5. Expose the file content as the current value
			values.set(data, 0, data.length);

			IOUtils.closeStream(fis);

			// Mark the single record as consumed so the next call returns false
			this.progress = true;
			return true;
		}
		return false;
	}

}
Map side
import java.io.IOException;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class FileMap extends Mapper<NullWritable, BytesWritable, Text, BytesWritable> {

	// Output key: the path of the file being processed
	Text outputkey = new Text();

	@Override
	protected void map(NullWritable key, BytesWritable value, Context context)
			throws IOException, InterruptedException {
		// Emit (file path, file bytes); outputkey was set once in setup()
		context.write(outputkey, value);
	}

	// Runs once per map task, before any call to map()
	@Override
	protected void setup(Mapper<NullWritable, BytesWritable, Text, BytesWritable>.Context context)
			throws IOException, InterruptedException {
		// 1. Get the input split handled by this task
		FileSplit inputSplit = (FileSplit) context.getInputSplit();

		// 2. Use the full file path as the key
		String name = inputSplit.getPath().toString();

		// 3. Set the output key
		outputkey.set(name);

	}

}
Reduce side
import java.io.IOException;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class FileReduce extends Reducer<Text, BytesWritable, Text, BytesWritable> {
	@Override
	protected void reduce(Text key, Iterable<BytesWritable> values, Context context)
			throws IOException, InterruptedException {
		// Each key is a unique file path, so there is exactly one value: the file content
		context.write(key, values.iterator().next());
	} 
}

Driver (main class)
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class Driver {
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf = new Configuration();

		Job job = Job.getInstance(conf);

		job.setJarByClass(Driver.class);
		job.setMapperClass(FileMap.class);
		job.setReducerClass(FileReduce.class);

		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(BytesWritable.class);

		job.setOutputKeyClass(Text.class); 
		job.setOutputValueClass(BytesWritable.class);
		// Use the custom InputFormat and write the merged result as a SequenceFile
		job.setInputFormatClass(CusInputFormat.class);
		job.setOutputFormatClass(SequenceFileOutputFormat.class);

		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		boolean res = job.waitForCompletion(true);

		System.exit(res ? 0 : 1);
	}
}
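
Once the job finishes, the merged output can be inspected with a small standalone reader. A minimal sketch of such a reader (the class name and the idea of passing a part-file path as args[0] are assumptions) looks like this, assuming the job wrote Text keys and BytesWritable values into a SequenceFile as configured above:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class MergedFileInspector {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// Path to one output part file of the merge job, e.g. <output dir>/part-r-00000
		Path path = new Path(args[0]);
		try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))) {
			Text key = new Text();
			BytesWritable value = new BytesWritable();
			// Each record is (original file path, original file content)
			while (reader.next(key, value)) {
				System.out.println(key + " -> " + value.getLength() + " bytes");
			}
		}
	}
}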

Custom OutputFormat

Requirement

Some raw logs need to be enhanced and parsed. The workflow:

  1. Read data from the raw log files
  2. Use a URL field in each log record to look up information in an external knowledge base and append it to the raw record
  3. If the enhancement succeeds, write the record to the enhanced-result directory; if it fails, extract the URL field from the raw record and write it to the to-be-crawled directory
Before running: (screenshot omitted)
After running: (screenshot omitted)

Analysis

The key point is that, within a single MapReduce program, two kinds of results must be written to different directories depending on the data. This kind of flexible output requirement can be implemented with a custom OutputFormat.

Implementation

Implementation points:

  1. Access external resources from within MapReduce (see the sketch below)
  2. Define a custom OutputFormat, override its RecordWriter, and in particular the write() method that actually emits the data
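
The sample code below simplifies the requirement to routing lines that contain "admin" versus all other lines, so it does not show point 1. A minimal sketch of what the external lookup could look like is given here; the class name EnhanceMapSketch, the knowledge-base path, its tab-separated url/content layout, and the "tocrawl" marker are all assumptions for illustration:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Sketch only: the knowledge-base location and its "url<TAB>content" layout are assumptions.
public class EnhanceMapSketch extends Mapper<LongWritable, Text, Text, NullWritable> {

	private final Map<String, String> knowledgeBase = new HashMap<String, String>();
	private final Text outKey = new Text();

	@Override
	protected void setup(Context context) throws IOException, InterruptedException {
		// Load the external knowledge base once per map task (implementation point 1)
		Path kbPath = new Path("/ruledata/url_content.dat"); // hypothetical path
		FileSystem fs = kbPath.getFileSystem(context.getConfiguration());
		BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(kbPath)));
		String line;
		while ((line = reader.readLine()) != null) {
			String[] fields = line.split("\t");
			if (fields.length == 2) {
				knowledgeBase.put(fields[0], fields[1]);
			}
		}
		reader.close();
	}

	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		// Assume the URL is the first tab-separated field of the raw log line
		String line = value.toString();
		String url = line.split("\t")[0];
		String content = knowledgeBase.get(url);
		if (content != null) {
			// Enhancement succeeded: append the looked-up content to the raw record
			outKey.set(line + "\t" + content);
		} else {
			// Enhancement failed: emit only the URL so it can be routed to the to-crawl list
			outKey.set(url + "\ttocrawl");
		}
		context.write(outKey, NullWritable.get());
	}
}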

The code is as follows:

Custom OutputFormat
import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FilteroutputFormat extends FileOutputFormat<Text, NullWritable> {

	@Override
	public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext context)
			throws IOException, InterruptedException {
		// Hand every record to the custom RecordWriter defined below
		return new FilterRecoderWriter(context);
	}

}
Custom RecordWriter
import java.io.IOException;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class FilterRecoderWriter extends RecordWriter<Text, NullWritable> {

	// Output streams for the two result files
	FSDataOutputStream adminOut = null;
	FSDataOutputStream otherOut = null;

	public FilterRecoderWriter(TaskAttemptContext context) {
		// 1. Get the file system
		FileSystem fs = null;
		try {
			fs = FileSystem.get(context.getConfiguration());

			// 2. Build the two output paths (hardcoded here, so this only suits a local run)
			Path adminPath = new Path("C:\\Users\\55454_000\\Desktop\\adminlog");
			Path otherPath = new Path("C:\\Users\\55454_000\\Desktop\\otherlog");

			// 3. Open an output stream for each path
			adminOut = fs.create(adminPath);
			otherOut = fs.create(otherPath);

		} catch (Exception e) {
			e.printStackTrace();
		}

	}

	@Override
	public void close(TaskAttemptContext arg0) throws IOException, InterruptedException {
		// Close both streams when the task finishes
		if (adminOut != null) {
			adminOut.close();
		}
		if (otherOut != null) {
			otherOut.close();
		}
	}

	@Override
	public void write(Text key, NullWritable value) throws IOException, InterruptedException {
		// Route each record by content: lines containing "admin" go to adminOut, the rest to otherOut
		if (key.toString().contains("admin")) {
			adminOut.write((key.toString() + "\n").getBytes());
		} else {
			otherOut.write((key.toString() + "\n").getBytes());
		}

	}

}
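The two output paths above are hardcoded to a local desktop directory, which only works for a local run. As a design note, a constructor sketch that instead derives the paths from the output directory configured in the Driver could look like this (the file names admin.log and other.log are assumptions, and FileOutputFormat must be imported from org.apache.hadoop.mapreduce.lib.output):

	public FilterRecoderWriter(TaskAttemptContext context) {
		try {
			FileSystem fs = FileSystem.get(context.getConfiguration());
			// Put both files under the directory passed to FileOutputFormat.setOutputPath() in the Driver
			Path outputDir = FileOutputFormat.getOutputPath(context);
			adminOut = fs.create(new Path(outputDir, "admin.log"));
			otherOut = fs.create(new Path(outputDir, "other.log"));
		} catch (Exception e) {
			e.printStackTrace();
		}
	}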
Map side
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class FilterMap extends Mapper<LongWritable, Text, Text, NullWritable> {

	Text k = new Text();

	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		// 1. Get the current line
		String line = value.toString();

		// 2. Use the whole line as the output key
		k.set(line);

		// 3. Emit the line; routing to the two output files happens in the custom RecordWriter
		context.write(k, NullWritable.get());
	}
}
Reduce side
import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class FilterReduce extends Reducer<Text, NullWritable, Text, NullWritable> {
	@Override
	protected void reduce(Text key, Iterable<NullWritable> values, Context context)
			throws IOException, InterruptedException {
		// Forward every line unchanged; the admin/other routing is done by the custom RecordWriter
		context.write(key, NullWritable.get());
	}
}

Driver (main class)
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Driver {
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf = new Configuration();

		Job job = Job.getInstance(conf);

		job.setJarByClass(Driver.class);
		job.setMapperClass(FilterMap.class);
		job.setReducerClass(FilterReduce.class);

		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(NullWritable.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);
		job.setOutputFormatClass(FilteroutputFormat.class);

		FileInputFormat.setInputPaths(job, new Path(args[0]));
		// An output path is still required even though the data files are written by the
		// custom RecordWriter: FileOutputFormat's committer places the _SUCCESS marker there
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		boolean res = job.waitForCompletion(true);

		System.exit(res ? 0 : 1);
	}
}