MapReduce之自定義分組實現
0.概念
-
說明:分組是一種特殊的比較器,對key做比較,並進行歸併,類似於合併同類項,也類似於SQL中的分組查詢
-
要求:通過自定義分組比較器實現將emp.csv中的數據按照部門號分成三個分組,並顯示出每組的人員名稱,
最終顯示的結果格式如下所示:
<10,CLARK1;KING1;MILLER1>
<20,CLARK2;KING2;MILLER2>
<30,CLARK3;KING3;MILLER3>
思路:
1、需要自定義分組比較器
2、需要自定義Employee對象的排序規則,並注意排序所用的首要字段必須與分組比較器所用的字段相同(本例均為部門號),這樣同一分組的記錄在排序後才會相鄰
3、mapper和reducer都需要修改
本文用到的文件內容
1.依賴
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.3</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.7.3</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-common</artifactId>
<version>2.7.3</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>2.7.3</version>
</dependency>
2.在資源目錄(resources)下添加log4j.properties文件,文件內容如下
# Root logger: only FATAL events, sent to the console appender "dest1".
log4j.rootLogger=FATAL, dest1
# Dedicated logger "dsaLogging": DEBUG and above, sent to the file appender "dsa".
log4j.logger.dsaLogging=DEBUG, dsa
# Do not forward dsaLogging events to the root logger's appenders.
log4j.additivity.dsaLogging=false
# Console appender used by the root logger.
log4j.appender.dest1=org.apache.log4j.ConsoleAppender
log4j.appender.dest1.layout=org.apache.log4j.PatternLayout
# Pattern: level (left-aligned, width 5), caller location, message, newline.
log4j.appender.dest1.layout.ConversionPattern=%-5p:%l: %m%n
log4j.appender.dest1.ImmediateFlush=true
# Rolling file appender used by dsaLogging; writes to ./logs/dsa.log.
log4j.appender.dsa=org.apache.log4j.RollingFileAppender
log4j.appender.dsa.File=./logs/dsa.log
# Roll the file over once it reaches 2000KB.
log4j.appender.dsa.MaxFileSize=2000KB
# Previously MaxBackupIndex=2
# Keep up to 5 rolled-over backup files.
log4j.appender.dsa.MaxBackupIndex=5
log4j.appender.dsa.layout=org.apache.log4j.PatternLayout
# Pattern: caller location, date, message, newline.
log4j.appender.dsa.layout.ConversionPattern=%l:%d: %m%n
3.編寫序列化類Employee,編寫排序規則
package com.mr.group;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Hadoop-serializable employee record that is also used as the map-output key.
 *
 * <p>Ordering ({@link #compareTo}) is department number ascending, then salary
 * descending. {@link #equals} is consistent with that ordering, and
 * {@link #hashCode} depends on the department number only, so that a
 * hash-based partitioner routes every record of one department to the same
 * reduce task.
 *
 * <p>{@link #write} and {@link #readFields} must keep the same field order:
 * deptNo, empSalary, empNo, empName, empJob, leaderNo, hireDate, empBonus.
 */
public class Employee implements WritableComparable<Employee> {
    // Sample input row: 7369,SMITH,CLERK,7902,1980/12/17,800,,20
    private IntWritable empNo;
    private Text empName;
    private Text empJob;
    private IntWritable leaderNo;
    private Text hireDate;
    private IntWritable empSalary;
    private Text empBonus;
    private IntWritable deptNo;

    /** Creates an employee whose numeric fields are 0 and whose text fields are empty. */
    public Employee() {
        this.empNo = new IntWritable();
        this.empName = new Text("");
        this.empJob = new Text("");
        this.leaderNo = new IntWritable();
        this.hireDate = new Text("");
        this.empSalary = new IntWritable();
        this.empBonus = new Text("");
        this.deptNo = new IntWritable();
    }

    /**
     * Creates an employee from plain Java values, wrapping each in its Writable type.
     */
    public Employee(int empNo, String empName, String empJob, int leaderNo,
                    String hireDate, int empSalary, String empBonus, int deptNo) {
        this.empNo = new IntWritable(empNo);
        this.empName = new Text(empName);
        this.empJob = new Text(empJob);
        this.leaderNo = new IntWritable(leaderNo);
        this.hireDate = new Text(hireDate);
        this.empSalary = new IntWritable(empSalary);
        this.empBonus = new Text(empBonus);
        this.deptNo = new IntWritable(deptNo);
    }

    /**
     * Serializes all fields to {@code out}.
     *
     * @param out destination stream
     * @throws IOException if writing any field fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        // Field order must match readFields exactly.
        this.deptNo.write(out);
        this.empSalary.write(out);
        this.empNo.write(out);
        this.empName.write(out);
        this.empJob.write(out);
        this.leaderNo.write(out);
        this.hireDate.write(out);
        this.empBonus.write(out);
    }

    /**
     * Deserializes all fields from {@code in}, in the order used by {@link #write}.
     *
     * @param in source stream
     * @throws IOException if reading any field fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.deptNo.readFields(in);
        this.empSalary.readFields(in);
        this.empNo.readFields(in);
        this.empName.readFields(in);
        this.empJob.readFields(in);
        this.leaderNo.readFields(in);
        this.hireDate.readFields(in);
        this.empBonus.readFields(in);
    }

    @Override
    public String toString() {
        return "Employee{" +
                "empNo=" + empNo +
                ", empName=" + empName +
                ", empJob=" + empJob +
                ", leaderNo=" + leaderNo +
                ", hireDate=" + hireDate +
                ", empSalary=" + empSalary +
                ", empBonus=" + empBonus +
                ", deptNo=" + deptNo +
                '}';
    }

    public IntWritable getEmpNo() {
        return empNo;
    }

    public void setEmpNo(IntWritable empNo) {
        this.empNo = empNo;
    }

    public Text getEmpName() {
        return empName;
    }

    public void setEmpName(Text empName) {
        this.empName = empName;
    }

    public Text getEmpJob() {
        return empJob;
    }

    public void setEmpJob(Text empJob) {
        this.empJob = empJob;
    }

    public IntWritable getLeaderNo() {
        return leaderNo;
    }

    public void setLeaderNo(IntWritable leaderNo) {
        this.leaderNo = leaderNo;
    }

    public Text getHireDate() {
        return hireDate;
    }

    public void setHireDate(Text hireDate) {
        this.hireDate = hireDate;
    }

    public IntWritable getEmpSalary() {
        return empSalary;
    }

    public void setEmpSalary(IntWritable empSalary) {
        this.empSalary = empSalary;
    }

    public Text getEmpBonus() {
        return empBonus;
    }

    public void setEmpBonus(Text empBonus) {
        this.empBonus = empBonus;
    }

    public IntWritable getDeptNo() {
        return deptNo;
    }

    public void setDeptNo(IntWritable deptNo) {
        this.deptNo = deptNo;
    }

    /**
     * Custom sort order: department number ascending, then salary descending.
     *
     * @param o the employee to compare against
     * @return a negative value, zero, or a positive value as this employee
     *         sorts before, equal to, or after {@code o}
     */
    @Override
    public int compareTo(Employee o) {
        int byDept = Integer.compare(this.deptNo.get(), o.getDeptNo().get());
        if (byDept != 0) {
            return byDept;
        }
        // Same department: operands are swapped so the higher salary sorts first.
        return Integer.compare(o.getEmpSalary().get(), this.empSalary.get());
    }

    /**
     * Two employees are equal when {@link #compareTo} reports 0, i.e. they share
     * the department number and the salary.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof Employee)) {
            return false;
        }
        return compareTo((Employee) obj) == 0;
    }

    /**
     * Hashes on the department number only. Equal employees therefore hash
     * alike, and a partitioner based on {@code key.hashCode()} keeps one
     * department on one reduce task.
     */
    @Override
    public int hashCode() {
        return deptNo.get();
    }
}
4.編寫自定義分組類MyEmployeeGrouper
package com.mr.group;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Grouping comparator that treats two {@code Employee} keys as belonging to
 * the same group when their department numbers compare equal.
 */
public class MyEmployeeGrouper extends WritableComparator {

    /** Registers {@code Employee} as the compared key type, with instance creation enabled. */
    public MyEmployeeGrouper() {
        super(Employee.class, true);
    }

    /**
     * Compares two keys by department number only.
     *
     * @param a first key; must be an {@code Employee}
     * @param b second key; must be an {@code Employee}
     * @return the result of comparing the department number of {@code a}
     *         with that of {@code b}
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        final Employee left = (Employee) a;
        final Employee right = (Employee) b;
        return left.getDeptNo().compareTo(right.getDeptNo());
    }
}
5.編寫mapper類
package com.mr.group;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class EmpGroupMapper extends Mapper<LongWritable, Text, Employee, Text> {
Employee employee = new Employee();//保證對象的hashCode一致
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//數據格式:<0,7369,SMITH,CLERK,7902,1980/12/17,800,,20>
System.out.println("====key:" + key + "value:" + value.toString() + " ====== ");
//1、分詞
String[] splits = value.toString().split(",");
//2、創建Employee對象,並且賦值
employee.setDeptNo(new IntWritable(Integer.valueOf(splits[7])));
employee.setEmpSalary(new IntWritable(Integer.valueOf(splits[5])));
employee.setEmpName(new Text(splits[1]));
employee.setEmpJob(new Text(splits[2]));
//3、通過context寫出去
context.write(employee,employee.getEmpName());
}
}
6.編寫reduce類
package com.mr.group;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reducer that joins the employee names of one group into a single
 * {@code ;}-separated string and emits it under the key's department number.
 */
public class EmpGroupReducer extends Reducer<Employee, Text, IntWritable, Text> {

    /**
     * Builds the name list for one group and writes {@code <deptNo, names>}.
     *
     * <p>Each name is placed in front of the names seen so far, so the output
     * lists names in the reverse of the order in which {@code values} yields
     * them. Separators appear only between names, with no trailing {@code ;}.
     *
     * @param key     key of the group; its department number becomes the output key
     * @param values  employee names belonging to the group
     * @param context used to emit the result
     */
    @Override
    protected void reduce(Employee key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // 1. Collect the employee names of this group.
        System.out.println("======key===========" + key.toString());
        StringBuilder names = new StringBuilder();
        boolean first = true;
        for (Text e : values) {
            String name = e.toString();
            if (!first) {
                names.insert(0, ';');
            }
            names.insert(0, name);
            first = false;
            System.out.println("======value===========" + name);
        }
        // 2. Emit the result.
        context.write(key.getDeptNo(), new Text(names.toString()));
    }
}
7.編寫Job類
package com.mr.group;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.util.Random;
/**
 * Driver that configures and runs the employee-grouping MapReduce job.
 */
public class EmpGroupJob {
    /** Input file used when no path is given on the command line. */
    private static final String DEFAULT_INPUT = "D:\\emp.csv";

    /**
     * Configures the job, runs it, and prints whether it succeeded.
     *
     * @param args optional: {@code args[0]} input path (default {@code D:\emp.csv}),
     *             {@code args[1]} output directory (default: a fresh directory
     *             from {@link #getOutputDir()})
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
        String outputPath = (args != null && args.length > 1) ? args[1] : getOutputDir();

        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        // Identify the jar that contains the job classes, needed when the job
        // is submitted to a cluster rather than run locally.
        job.setJarByClass(EmpGroupJob.class);

        job.setMapperClass(EmpGroupMapper.class);
        job.setMapOutputKeyClass(Employee.class);
        job.setMapOutputValueClass(Text.class);

        // Grouping comparator.
        job.setGroupingComparatorClass(MyEmployeeGrouper.class);

        // Reducer.
        job.setReducerClass(EmpGroupReducer.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        boolean result = job.waitForCompletion(true);
        if (result) {
            System.out.println("運行成功");
        } else {
            System.out.println("運行失敗");
        }
    }

    /**
     * Produces an output directory path under {@code D:\output\} made unique by
     * the current time in milliseconds and a random number below 1000.
     *
     * @return a path of the form {@code D:\output\result_<millis>_<random>}
     */
    public static String getOutputDir() {
        String prefix = "D:\\output\\";
        long time = System.currentTimeMillis();
        int random = new Random().nextInt(1000);
        return prefix + "result_" + time + "_" + random;
    }
}
8.運行結果