MapReduce一次Map讀n行數據

MR題目:如何一次Map讀n行數據

輸入數據

{
"name":"ta",
"age":12,
"sex":1
}
{
"name":"la",
"age":13,
"sex":2
}
{
"name":"la",
"age":13,
"sex":2
}
{
"name":"la",
"age":13,
"sex":2
}
{
"name":"la",
"age":13,
"sex":2
}
{
"name":"la",
"age":13,
"sex":2
}

輸出數據

{"name":"la","age":13,"sex":2}
{"name":"la","age":13,"sex":2}
{"name":"la","age":13,"sex":2}
{"name":"la","age":13,"sex":2}
{"name":"la","age":13,"sex":2}
{"name":"ta","age":12,"sex":1}

運行記錄

Map-Reduce Framework
		Map input records=6   //輸入文件共30行,而Map只收到6條記錄,可以看出確實是一次讀五行
		Map output records=6

代碼

job

/**
 * Driver that runs a job whose input is read through {@link JsonInputFormat}
 * and whose mapper writes every value it receives straight to the output.
 *
 * <p>Usage: {@code JSONJob [inputPath [outputPath]]}. When an argument is
 * omitted the corresponding default path below is used.
 */
public class JSONJob {
    /** Input path used when no first command-line argument is supplied. */
    private static final String DEFAULT_INPUT_PATH = "d:/work/in/2";
    /** Output path used when no second command-line argument is supplied. */
    private static final String DEFAULT_OUTPUT_PATH = "d:/work/out/2";

    /**
     * Mapper that emits each incoming value as the output key, paired with a
     * {@link NullWritable} value. The incoming key is ignored.
     */
    static class JSONMapper extends Mapper<IntWritable, Text, Text, NullWritable> {
        @Override
        protected void map(IntWritable key, Text value, Context context) throws IOException, InterruptedException {
            context.write(value, NullWritable.get());
        }
    }

    /**
     * Configures and submits the job, then exits with status 0 on success and 1 on failure.
     *
     * @param args optional {@code [inputPath, outputPath]}; missing entries fall back to the defaults
     * @throws IOException            if the job cannot be created or submitted
     * @throws ClassNotFoundException if a configured job class cannot be loaded
     * @throws InterruptedException   if the wait for job completion is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Paths come from the command line when given, so the job is not tied to one machine.
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;
        String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_PATH;

        // 1. Create the job
        Configuration con = new Configuration();
        Job job = Job.getInstance(con);

        // 2. Tell Hadoop which jar contains the job classes
        job.setJarByClass(JSONJob.class);
        // The key point: plug in the custom logic that decides how records are read
        job.setInputFormatClass(JsonInputFormat.class);

        // 3. Configure the mapper (no reducer class is set)
        job.setMapperClass(JSONMapper.class);

        // 4. Configure the mapper output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        // 5. Configure the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // 6. Configure the input and output locations
        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        // 7. Submit the job and wait for it to finish
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}

InputFormat

//因爲是文本,所以直接用了FileInputFormat來處理
public class JsonInputFormat extends FileInputFormat<IntWritable, Text> {
    //如果要切片,我沒想到辦法做到從JSON最後一個}切,所以直接不切片了,多大都進一個MapTask
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }

    @Override
    public RecordReader<IntWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        return new FiveLineRecordReader(split, context);
    }
	//自定義RecordReader,讀數據的邏輯
    class FiveLineRecordReader extends RecordReader<IntWritable, Text> {
        FileSplit fileSplit;  //切片信息
        Configuration conf;  //job配置信息
        Boolean progress;   //一個標誌位,用來記錄是否讀到文件末尾的。false代表到了文件末尾
        IntWritable lineNum;  //按行讀,記錄的行號。類似默認的那個偏移量
        Text value; 		//存儲讀到的Json

        public FiveLineRecordReader(InputSplit fileSplit, TaskAttemptContext context) throws IOException, InterruptedException {
            initialize(fileSplit, context);
        }
		//初始化數據。無關緊要
        @Override
        public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
            this.fileSplit = (FileSplit) split;
            conf = context.getConfiguration();
            lineNum = new IntWritable(0);
            progress = true;
            value = new Text();
        }
		
        //核心方法
        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            if (progress) {
                String path = fileSplit.getPath().toUri().getPath();
                path = path.substring(1);//我這裏是本地跑的所以處理了以下。
                //1. 獲取文件字符流
                BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(path)));
                
                //2. 獲取上一次讀到的位置
                int lineNumNow = lineNum.get();
                
                //3. 跳過之前讀過的行
                for (int i = 0; i < lineNumNow; i++) {
                    bufferedReader.readLine();
                }
                //3. 新讀5行
                StringBuffer json = new StringBuffer();
                for (int i = 0; i < 5; i++) {
                    String tem = bufferedReader.readLine();
                    if (tem != null) {
                        json.append(tem);
                    } else break;
                }
                //4. 記錄結果
                value.set(json.toString());
                lineNum.set(lineNum.get()+5);
                //5. 判斷是否到文件末尾。
                if (bufferedReader.readLine() == null) {
                    progress = false;
                }
                return true;//nextKeyValue 返回true代表後面還有map數據
            }
            return false;
        }

        @Override
        public IntWritable getCurrentKey() throws IOException, InterruptedException {
            return lineNum;
        }

        @Override
        public Text getCurrentValue() throws IOException, InterruptedException {
            return value;
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            return progress == true ? 0 : 1;
        }

        @Override
        public void close() throws IOException {

        }
    }
}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章