1. 前言
需求:對輸入文件中的數據進行排序,輸入文件中的每一行均爲一個數字,即爲一個數據。要求在輸出文件中每行輸出兩個數字,第一個代表原始數據在數據集中的順次,第二個代表原始數據。
2. MapReduce實現排序的原理
在MapReduce中默認可以進行排序。
- 如果key是封裝爲IntWritable類型,那麼MapReduce按照數字大小對key進行排序。
- 如果key是封裝爲String的Text類型,那麼MapReduce按照字典順序對字符串排序。
默認排序規則:
按照key值進行排序,所以應該使用封裝int的IntWritable型數據結構,也就是在map中將讀入的數據轉化成IntWritable型,然後作爲key值輸出(value是任意的),reduce拿到<key, value-list>之後,將輸入的key作爲value輸出,並根據value-list中元素的個數決定輸出的次數。輸出的key是一個全局變量,它統計當前key的順次。注意:這種做法要求作業只使用一個reducer;如果有多個reducer,每個reducer只會拿到一部分key,輸出就不是全局有序的,順次也會在每個reducer中各自從1開始計數。
3. 上傳文件
執行以下命令,將本地的sort文件上傳到HDFS的/sort路徑:
hadoop fs -put sort /sort
4. 代碼實現
package com.mapreduce.sort;
import com.mapreduce.wordcount.WordCountMapper;
import com.mapreduce.wordcount.WordCountReducer;
import com.mapreduce.wordcount.WordCountRunJob;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
import java.net.URI;
public class SortApp {
private static final String INPUT_PATH = "hdfs://master001:9000/sort";
private static final String OUTPUT_PATH = "hdfs://master001:9000/outputsort";
public static class MyMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable>{
private static IntWritable data = new IntWritable(); //靜態變量了解一下
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException{
String line = value.toString();
data.set(Integer.parseInt(line));
context.write(data, new IntWritable(1));
}
}
public static class MyReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable>{
private static IntWritable data = new IntWritable(1);
public void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException{
for(IntWritable val : values){
context.write(data, key);
data = new IntWritable(data.get() + 1);
}
}
}
public static void main(String[] args) throws Exception{
System.setProperty("HADOOP_USER_NAME", "hadoop");
Configuration conf = new Configuration();
//提升代碼的健壯性
final FileSystem fileSystem = FileSystem.get(URI.create(INPUT_PATH), conf);
if(fileSystem.exists(new Path(OUTPUT_PATH))){
fileSystem.delete(new Path(OUTPUT_PATH), true);
}
Job job = Job.getInstance(conf, "SortApp");
//run jar class 主方法
job.setJarByClass(SortApp.class);
//設置map
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class);
//設置reduce
job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(IntWritable.class);
//設置input format
job.setInputFormatClass(TextInputFormat.class);
FileInputFormat.addInputPath(job, new Path(INPUT_PATH));
//設置output format
job.setOutputFormatClass(TextOutputFormat.class);
FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));
//提交job
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}