MapReduce Secondary Sort

Secondary Sort

A secondary sort orders records that share the same first field by their second field. Put simply, we sometimes need to sort by key first and, when keys are equal, sort by value as well; achieving both orderings in a single job is what is called a secondary sort. For instance, given (hadoop, 300), (hive, 550), (hadoop, 100), sorting by name and then by amount yields (hadoop, 100), (hadoop, 300), (hive, 550).
As an example, an e-commerce platform records the amount of every order placed by each user; the requirement is to sort all order amounts belonging to the same user, with the user names in the output sorted as well.

This involves grouping.

Grouping

1) Concept: a grouping comparator defines which map output keys are treated as belonging to the same group on the reduce side; all values whose keys compare as equal are handed to a single reduce() call. For example, with a comparator that compares only the account field, the three hadoop@apache records in the sample data below arrive in one reduce() call even though their full keys differ. (Note that parallelism across reducers comes from partitioning; grouping only controls how records are bundled within a reduce task.)

2) Custom grouping and sorting

Define a class that extends WritableComparator and override compare() to set the comparison strategy.
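
As a minimal sketch of the pattern (written against the AccountWritable key class defined in the next listing, and deliberately close to the job's own MyCompartor1 further below):

package com.hnxy.mr.Sort;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Sketch of a grouping comparator: group AccountWritable keys by account only
public class AccountGroupComparator extends WritableComparator {

	protected AccountGroupComparator() {
		// true: instantiate keys so compare() receives deserialized objects
		super(AccountWritable.class, true);
	}

	@Override
	public int compare(WritableComparable a, WritableComparable b) {
		AccountWritable ka = (AccountWritable) a;
		AccountWritable kb = (AccountWritable) b;
		// Keys that compare as 0 end up in the same reduce() group
		return ka.getAccount().compareTo(kb.getAccount());
	}
}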

Sample data:

hadoop@apache	200
hive@apache	550
yarn@apache	580
hive@apache	159
hadoop@apache	300
hive@apache	258
hadoop@apache	100

First, define the custom serializable and comparable key class:

package com.hnxy.mr.Sort;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class AccountWritable implements WritableComparable<AccountWritable> {
	// Fields: account name and order cost
	private String account;
	private Long cost;

	public String getAccount() {
		return account;
	}

	public void setAccount(String account) {
		this.account = account;
	}

	public Long getCost() {
		return cost;
	}

	public void setCost(Long cost) {
		this.cost = cost;
	}

	@Override
	public String toString() {
		return "[account=" + account + ", cost=" + cost + "]";
	}

	@Override
	public void write(DataOutput out) throws IOException {
		// Serialize the fields in a fixed order
		out.writeUTF(account);
		out.writeLong(cost);
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		// Deserialize in the same order as write()
		this.account = in.readUTF();
		this.cost = in.readLong();
	}

	@Override
	public int compareTo(AccountWritable o) {
		// Natural ordering: ascending by account (cost is not considered here)
		return this.getAccount().compareTo(o.getAccount());
	}
}
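
As a quick local sanity check, the natural ordering defined by compareTo() can be exercised outside Hadoop. AccountWritableDemo below is a hypothetical harness, not part of the job:

package com.hnxy.mr.Sort;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

// Hypothetical test harness for AccountWritable's natural ordering
public class AccountWritableDemo {
	public static void main(String[] args) {
		List<AccountWritable> keys = new ArrayList<AccountWritable>();
		String[][] rows = { { "hive@apache", "550" }, { "hadoop@apache", "200" }, { "yarn@apache", "580" } };
		for (String[] row : rows) {
			AccountWritable w = new AccountWritable();
			w.setAccount(row[0]);
			w.setCost(Long.parseLong(row[1]));
			keys.add(w);
		}
		// compareTo() orders by account only, ascending; cost is ignored
		Collections.sort(keys);
		for (AccountWritable w : keys) {
			System.out.println(w);
		}
		// Prints hadoop@apache first, then hive@apache, then yarn@apache
	}
}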

The driver class, containing the mapper, the partitioner, and both comparators:

package com.hnxy.mr.Sort;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MyMRSecondarySort extends Configured implements Tool{
	// Input line format: hadoop@apache<TAB>200
	public static class SecondarySortMapper extends Mapper<LongWritable, Text, AccountWritable, NullWritable> {
		AccountWritable outkey = new AccountWritable();
		String[] str = null;
		@Override
		protected void map(LongWritable key, Text value,
				Mapper<LongWritable, Text, AccountWritable, NullWritable>.Context context)
				throws IOException, InterruptedException {
			// Parse the line: account<TAB>cost
			str = value.toString().split("\t");
			if(str.length == 2){
				outkey.setAccount(str[0]);
				outkey.setCost(Long.parseLong(str[1]));
				context.write(outkey, NullWritable.get());
			}
		}
	}
	// Requirement: hadoop records go into one output file; yarn and hive records are stored together in another
	private static class MyPartitioner extends Partitioner<AccountWritable, NullWritable>{
		@Override
		public int getPartition(AccountWritable key, NullWritable value, int numPartitions) {
			// Custom partition rule: hadoop keys go to partition 0, everything else to partition 1
			if(key.getAccount().startsWith("hadoop")){
				return 0;
			}else{
				return 1;			
			}
		}		
	}
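	// Note: returning 1 assumes the job runs with at least two reduce tasks;
	// with a single reducer, partition 1 would cause an "Illegal partition"
	// error at map time, which is why run() sets job.setNumReduceTasks(2).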
	// Sort comparator (external comparator applied during the shuffle)
	private static class MyCompartor extends WritableComparator{
		public MyCompartor(){
			super(AccountWritable.class,true);
		}
		@Override
		public int compare(WritableComparable a, WritableComparable b) {
			// Cast to the concrete key type
			AccountWritable aa = (AccountWritable)a;
			AccountWritable ab = (AccountWritable)b;
			// Sort by account name in descending order
			int result = ab.getAccount().compareTo(aa.getAccount());
			// If the accounts are equal, compare the second field (cost), also descending
			if(result==0){
				result = ab.getCost().compareTo(aa.getCost());
			}
			return result;
		}
	}
	// Grouping comparator
	private static class MyCompartor1 extends WritableComparator{
		public MyCompartor1(){
			super(AccountWritable.class,true);
		}
		@Override
		public int compare(WritableComparable a, WritableComparable b) {
			// Cast to the concrete key type
			AccountWritable aa = (AccountWritable)a;
			AccountWritable ab = (AccountWritable)b;
			// This looks like one more sort comparison, but it is only used for
			// grouping: keys with the same account compare as 0, so their values
			// are delivered to a single reduce() call.
			int result = aa.getAccount().compareTo(ab.getAccount());
			return result;
		}
	}
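	// Runtime order, regardless of the order of the set*Class() calls in run():
	// 1. MyPartitioner routes each map output key to a reduce task (partition);
	// 2. MyCompartor sorts the keys inside each partition (account desc, cost desc);
	// 3. MyCompartor1 groups adjacent keys with equal accounts into one reduce() call.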
	@Override
	public int run(String[] args) throws Exception {
		// Exit code of the job
		int count = 0;
		// Get the configuration
		Configuration conf = this.getConf();
		FileSystem fs = FileSystem.get(conf);
		// Input and output paths
		Path in = new Path(args[0]);
		Path out = new Path(args[1]);
		// Delete the output directory if it already exists
		if(fs.exists(out)){
			fs.delete(out,true);
			System.out.println("Old OutPut Path is Deleted!");
		}
		// Job name, optionally supplied on the command line with -Djob_name=?
		String jobName = conf.get("job_name");
		if(jobName == null || jobName.trim().isEmpty()){
			jobName = "Job_By_Su";
		}
		Job job = Job.getInstance(conf,jobName);
		// Set the jar and the mapper class
		job.setJarByClass(MyMRSecondarySort.class);
		job.setMapperClass(SecondarySortMapper.class);
		// Two reduce tasks, matching the two partitions produced by MyPartitioner
		job.setNumReduceTasks(2);
		
		// Register the grouping comparator, partitioner, and sort comparator.
		// The order of these setter calls does not matter; see the runtime-order
		// note above the run() method.
		job.setGroupingComparatorClass(MyCompartor1.class);
		job.setPartitionerClass(MyPartitioner.class);
		job.setSortComparatorClass(MyCompartor.class);
		// Always set the map output key/value types explicitly;
		// they cannot be inferred and differ from the defaults
		job.setMapOutputKeyClass(AccountWritable.class);
		job.setMapOutputValueClass(NullWritable.class);
		// Input and output format classes
		job.setInputFormatClass(TextInputFormat.class);
		job.setOutputFormatClass(TextOutputFormat.class);
		// Set the job's input and output paths
		FileInputFormat.addInputPath(job, in);
		FileOutputFormat.setOutputPath(job, out);		
		// Run the job; use the conventional exit code (0 = success, 1 = failure)
		count = job.waitForCompletion(true) ? 0 : 1;
		// Return the exit code
		return count;		
	}

	 public static void main(String[] args) throws Exception {
		System.exit(ToolRunner.run(new MyMRSecondarySort(), args));
	}
}
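
Since no reducer class is set, the job runs with the default identity reducer, which writes each key once per record, and TextOutputFormat omits the NullWritable values. For the sample data above, the two output files should therefore look roughly like this:

part-r-00000:

[account=hadoop@apache, cost=300]
[account=hadoop@apache, cost=200]
[account=hadoop@apache, cost=100]

part-r-00001:

[account=yarn@apache, cost=580]
[account=hive@apache, cost=550]
[account=hive@apache, cost=258]
[account=hive@apache, cost=159]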

 
