二次排序
二次排序,指對第1個字段相同的數據,再使用第2個字段進行排序。說白了就是我們有時需要先按照key進行排序,如果key相同,再按val排序;通過一個程序實現這種兩級排序的做法,我們稱爲二次排序。
舉個例子,電商平臺記錄了每一個用戶每一筆訂單的訂單金額,現在要求對屬於同一個用戶的所有訂單金額進行排序,
並且輸出的用戶名也要有序。
這裏涉及到了分組
分組 grouping
1) 概念:主要定義哪些key可以放置在一組,設置組之後reduce在處理的時候就可以分組並行處理,這樣能提高reduce的並行運算性能;
2) 自定義分組排序
定義實現一個WritableComparator,重寫compare(), 設置比較策略;
數據:
hadoop@apache 200
hive@apache 550
yarn@apache 580
hive@apache 159
hadoop@apache 300
hive@apache 258
hadoop@apache 100
首先自定義序列化比較類:
package com.hnxy.mr.Sort;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
 * Composite key for the secondary-sort job: an account name plus an order cost.
 *
 * <p>Sort order of {@link #compareTo}: account ascending, then cost ascending,
 * so the natural ordering of the key itself already implements a secondary
 * sort (the job may still override it with an external sort comparator).
 *
 * <p>{@code equals}/{@code hashCode} are defined consistently with
 * {@code compareTo}, as required for a well-behaved MapReduce key.
 */
public class AccountWritable implements WritableComparable<AccountWritable> {

    // Account name, e.g. "hadoop@apache" (first input column).
    private String account;
    // Order cost (second input column). NOTE(review): serialization assumes
    // both fields are non-null; writeUTF/writeLong throw NPE otherwise.
    private Long cost;

    public String getAccount() {
        return account;
    }

    public void setAccount(String account) {
        this.account = account;
    }

    public Long getCost() {
        return cost;
    }

    public void setCost(Long cost) {
        this.cost = cost;
    }

    @Override
    public String toString() {
        return "[account=" + account + ", cost=" + cost + "]";
    }

    /** Serializes the key: account as UTF string, then cost as a long. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(account);
        out.writeLong(cost);
    }

    /** Deserializes the key in the same field order as {@link #write}. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.account = in.readUTF();
        this.cost = in.readLong();
    }

    /**
     * Natural ordering: account ascending; ties broken by cost ascending.
     * The original compared the account only, which made the ordering
     * inconsistent with field-wise equality for keys differing only in cost.
     */
    @Override
    public int compareTo(AccountWritable o) {
        int result = this.account.compareTo(o.account);
        if (result == 0) {
            result = this.cost.compareTo(o.cost);
        }
        return result;
    }

    /** Field-wise, null-safe equality, consistent with {@link #compareTo}. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof AccountWritable)) {
            return false;
        }
        AccountWritable other = (AccountWritable) obj;
        boolean sameAccount =
                (account == null) ? other.account == null : account.equals(other.account);
        boolean sameCost =
                (cost == null) ? other.cost == null : cost.equals(other.cost);
        return sameAccount && sameCost;
    }

    /** Hash over both fields, consistent with {@link #equals}. */
    @Override
    public int hashCode() {
        int h = (account == null) ? 0 : account.hashCode();
        return 31 * h + ((cost == null) ? 0 : cost.hashCode());
    }
}
具體的實現類:
package com.hnxy.mr.Sort;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class MyMRSecondarySort extends Configured implements Tool{
//hadoop@apache 200 數據
public static class SecondarySortMapper extends Mapper<LongWritable, Text, AccountWritable, NullWritable> {
AccountWritable outkey = new AccountWritable();
String[] str = null;
@Override
protected void map(LongWritable key, Text value,
Mapper<LongWritable, Text, AccountWritable, NullWritable>.Context context)
throws IOException, InterruptedException {
// 常規業務邏輯
str = value.toString().split("\t");
if(str.length == 2 && str != null){
outkey.setAccount(str[0]);
outkey.setCost(Long.parseLong(str[1]));
context.write(outkey, NullWritable.get());
}
}
}
// 客戶的要求是這樣的 : hadoop方一個文件 yarn --> 放在一起 hive單獨存放
private static class MyPartitioner extends Partitioner<AccountWritable, NullWritable>{
@Override
public int getPartition(AccountWritable key, NullWritable value, int numPartitions) {
// 自定義分區規則
if(key.getAccount().startsWith("hadoop")){
return 0;
}else{
return 1;
}
}
}
//排序 外部比較器
private static class MyCompartor extends WritableComparator{
public MyCompartor(){
super(AccountWritable.class,true);
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
// 強轉類型
AccountWritable aa = (AccountWritable)a;
AccountWritable ab = (AccountWritable)b;
//設置返回 名稱倒序進行分組
int result = ab.getAccount().compareTo(aa.getAccount());
//如果組內第一值相同就比較第二個
if(result==0){
result = ab.getCost().compareTo(aa.getCost());
}
return result;
}
}
//分組
private static class MyCompartor1 extends WritableComparator{
public MyCompartor1(){
super(AccountWritable.class,true);
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
// 強轉類型
AccountWritable aa =(AccountWritable)a;
AccountWritable ab = (AccountWritable)b;
//設置返回值 名稱和分組倒序進行排序
//這裏有個疑問,爲什麼感覺像有進行了一次外部比較。其實這次是按照名字分組 相等就是返回0
int result = aa.getAccount().compareTo(ab.getAccount());
return result;
}
}
@Override
public int run(String[] args) throws Exception {
// 創建方法的返回值
int count = 0;
// 獲取配置參數
Configuration conf = this.getConf();
// 判斷輸出目錄是否存在
FileSystem fs = FileSystem.get(conf);
// 設定輸入與輸出類
Path in = new Path(args[0]);
Path out = new Path(args[1]);
if(fs.exists(out)){
fs.delete(out,true);
System.out.println("Old OutPut Path is Deleted!");
}
// 定義job
// -Djob_name=?
// 定義工作名稱
String jobName = conf.get("job_name");
if(!(null != jobName && !"".equals(jobName.trim()))){
jobName = "Job_By_Su";
}
Job job = Job.getInstance(conf,jobName);
// 設置jar_mr類
job.setJarByClass(MyMRSecondarySort.class);
job.setMapperClass(SecondarySortMapper.class);
job.setNumReduceTasks(2);
// 按照名稱的hashcode進行分區
// 先分組
job.setGroupingComparatorClass(MyCompartor1.class);
// 再分區
job.setPartitionerClass(MyPartitioner.class);
// 最後組內排序
job.setSortComparatorClass(MyCompartor.class);
// 輸出類型怎麼設置?
// 一定要自己設置一下輸出類型
job.setMapOutputKeyClass(AccountWritable.class);
job.setMapOutputValueClass(NullWritable.class);
// 一定要自己設置一下輸出類型
// 設置格式化類
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
// 設置輸入與輸出路徑
FileInputFormat.addInputPath(job, in);
FileOutputFormat.setOutputPath(job, out);
// 執行
count = job.waitForCompletion(true)?1:0;
// 返回
return count;
}
public static void main(String[] args) throws Exception {
System.exit(ToolRunner.run(new MyMRSecondarySort(), args));
}
}