Prerequisites:
1. Hadoop is installed and running. For Hadoop installation and configuration, see: Installing and Configuring Hadoop 1.2.1 on Ubuntu.
2. The IDE is set up and working. For IDE configuration, see: Setting Up a Hadoop Source-Reading Environment on Ubuntu.
MapReduce programming examples:
MapReduce Programming Examples (1): running the first MapReduce program, WordCount, in the IDE, with code analysis.
MapReduce Programming Examples (5): implementing a single-table join with MapReduce.
Sorting is fairly simple, so let's go straight to the code; it is commented throughout. Feedback and discussion are welcome.
The overall idea is to rely on MapReduce's built-in sorting of keys, combined with a partitioner that distributes keys to partitions in order of their value ranges. Within each reduce, MapReduce sorts keys by default: Text keys lexicographically, IntWritable keys numerically.
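For a concrete picture (hypothetical data, made up for illustration): if the input files contain one integer per line, say 2, 32, 654, 5956 and 65223, then with the default single reduce task the job emits each value preceded by its rank, smallest first:

1	2
2	32
3	654
4	5956
5	65223

With several reduce tasks, the custom partitioner below keeps the output partitions globally ordered by value, but each reducer's rank counter starts again at 1.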
package com.t.hadoop;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/**
 * Sorting.
 * Relies on MapReduce's default sorting of keys.
 * Extends Partitioner and overrides getPartition so that the mapper output is
 * distributed across partitions in overall key order; each reducer then sorts
 * its own key range.
 * A (per-reducer) static counter records each value's position in the output.
 * @author daT [email protected]
 *
 */
public class Sort {
	public static class SortMapper extends Mapper<Object, Text, IntWritable, IntWritable> {
		// Emit the value to be sorted as the output key; the output value is arbitrary (a constant 1 here).
		@Override
		protected void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			System.out.println("Key: " + key + " " + "Value: " + value);
			context.write(new IntWritable(Integer.parseInt(value.toString().trim())), new IntWritable(1));
		}
	}
	public static class SortReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
		public static IntWritable lineNum = new IntWritable(1); // position (rank) of the current key in this reducer's output
		// Count the values for this key and emit the key once per occurrence.
		@Override
		protected void reduce(IntWritable key, Iterable<IntWritable> value, Context context)
				throws IOException, InterruptedException {
			System.out.println("lineNum: " + lineNum);
			for (IntWritable i : value) {
				context.write(lineNum, key);
			}
			lineNum = new IntWritable(lineNum.get() + 1);
		}
	}
	public static class SortPartitioner extends Partitioner<IntWritable, IntWritable> {
		// Assign each record to a partition based on its key's value range.
		@Override
		public int getPartition(IntWritable key, IntWritable value, int partitionNum) {
			System.out.println("partitionNum: " + partitionNum);
			int maxnum = 23492; // maximum input value, chosen by hand for this example. Hadoop also ships
			                    // a sampler and a total-order partitioner that do this automatically
			                    // (see the TotalOrderPartitioner sketch after the listing).
			int bound = maxnum / partitionNum;
			int keyNum = key.get();
			for (int i = 0; i < partitionNum; i++) {
				if (keyNum > bound * i && keyNum <= bound * (i + 1)) {
					return i;
				}
			}
			// Keys outside (0, maxnum] must still map to a legal partition:
			// returning -1 here would make the job fail with "Illegal partition".
			return keyNum <= 0 ? 0 : partitionNum - 1;
		}
	}
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf = new Configuration();
		String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
		if (otherArgs.length < 2) {
			System.err.println("Usage: Sort <in> <out>");
			System.exit(2);
		}
		Job job = new Job(conf);
		job.setJarByClass(Sort.class);
		job.setMapperClass(SortMapper.class);
		job.setPartitionerClass(SortPartitioner.class); // no combiner is needed here, but the custom partitioner must be set;
		                                                // it only takes effect when the job runs with more than one reduce task
		job.setReducerClass(SortReducer.class);
		job.setOutputKeyClass(IntWritable.class);
		job.setOutputValueClass(IntWritable.class);
		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}
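To run the job (the jar name and paths here are hypothetical), package the class and submit it with something like: hadoop jar sort.jar com.t.hadoop.Sort /user/hadoop/sort/in /user/hadoop/sort/out.

The comment in getPartition mentions that Hadoop ships its own sampler and range partitioner, which can replace the hand-written SortPartitioner. Below is a minimal sketch of that approach, not the original post's code. It assumes Hadoop 2.x, where the new-API classes org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner and InputSampler live (Hadoop 1.x has old-API equivalents under org.apache.hadoop.mapred.lib), and it assumes the input is a SequenceFile whose keys and values are already IntWritable, because InputSampler samples the input keys and so the mapper must pass keys through unchanged. The class name TotalOrderSort and the partition-file path are made up for illustration.

package com.t.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderSort {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf, "total order sort");
		job.setJarByClass(TotalOrderSort.class);

		// Identity mapper: keys pass through unchanged, so the sampled input
		// keys match the map output keys that the partitioner will see.
		job.setMapperClass(Mapper.class);
		job.setReducerClass(Sort.SortReducer.class);

		job.setInputFormatClass(SequenceFileInputFormat.class);
		job.setOutputKeyClass(IntWritable.class);
		job.setOutputValueClass(IntWritable.class);
		job.setNumReduceTasks(4);

		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		// Sample the input keys, write the range boundaries to a partition
		// file on HDFS, and let TotalOrderPartitioner assign keys to reducers
		// by range instead of the hand-coded bounds in SortPartitioner.
		job.setPartitionerClass(TotalOrderPartitioner.class);
		TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
				new Path(args[1] + "_partitions.lst"));
		InputSampler.writePartitionFile(job,
				new InputSampler.RandomSampler<IntWritable, IntWritable>(0.1, 1000));

		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}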