利用MapReduce實現自定義排序
1.第一種實現的方式是:將數據封裝到list中,使用集合的排序方法實現排序在cleanup方法中加入代碼實現排序;
所以這裏只適合把所有的數據都放在一個pojo中;
2.第二種是將pojo類作爲key從Mapper傳遞到Reducer,利用shuffle階段按key自動排序的特性,然後在Reducer中遍歷輸出。
3.共同點:兩種方式都要實現WritableComparable接口,重寫compareTo()方法,然後指定兩個int類型的數據相減,
或者也可以指定多字段進行排序
1.Pojo類(用來實現排序的,而且這裏的數據是來自之前統計的結果,所以只有兩個字段:技能點、數量)
package com.data.test.Mysql_skill;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/*
自定義的類作爲MapReduce傳輸對象的時候,必須序列化,實現WritableComparable 接口
泛型爲map輸出的key的類型
可以把需要的字段都封裝過來,map輸出的value就可以用NullWritable來代替
任何用作鍵來使用的類都應該實現WritableComparable接口
*/
/**
 * Sort key for the skill-count job: holds one skill tag and its aggregated count.
 * Implements WritableComparable&lt;outPojo&gt; (the type parameter is required so that
 * {@link #compareTo(outPojo)} is a real override and Hadoop can compare keys)
 * so instances can be serialized between Mapper and Reducer and sorted.
 */
public class outPojo implements WritableComparable<outPojo> {
    String skill;
    int count;

    /** No-arg constructor required by Hadoop's reflection-based deserialization. */
    public outPojo() {
    }

    public outPojo(String skill, int count) {
        this.skill = skill;
        this.count = count;
    }

    public String getSkill() {
        return skill;
    }

    public void setSkill(String skill) {
        this.skill = skill;
    }

    public int getCount() {
        return count;
    }

    public void setCount(int count) {
        this.count = count;
    }

    /** Serialization: object ==> bytes. Called when the map output is sent to the reducer. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(skill);
        out.writeInt(count);
    }

    /** Deserialization on the reduce side; field order MUST match {@link #write(DataOutput)}. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.skill = in.readUTF();
        this.count = in.readInt();
    }

    /**
     * Multi-field ordering: count descending first, then skill ascending as a tie-break.
     * Integer.compare avoids the overflow that plain subtraction (a - b) can cause.
     */
    @Override
    public int compareTo(outPojo o) {
        int cmp = Integer.compare(o.count, this.count); // descending by count
        if (cmp == 0) {
            cmp = this.skill.compareTo(o.skill); // ascending by skill on ties
        }
        return cmp;
    }

    /** Keys used in hash-based containers/partitioners must define equals consistently. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof outPojo)) {
            return false;
        }
        outPojo other = (outPojo) obj;
        return count == other.count && Objects.equals(skill, other.skill);
    }

    @Override
    public int hashCode() {
        return Objects.hash(skill, count);
    }

    /**
     * Controls how the object is rendered when written as job output;
     * only the count is emitted here (the skill is already the output key).
     */
    @Override
    public String toString() {
        return String.valueOf(count);
    }
}
2.Pojo類(用來實現從Mysql中讀取數據的,所以不僅實現序列化接口
Writable,還實現了操作數據庫的DBWritable)
// 由於這裏的測試的數據來源是mysql,所以還多了一個實體類
package com.data.test.Mysql_skill;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
/**
 * Row mapping for one record of the MySQL table read by DBInputFormat.
 * Implements Writable (Hadoop wire serialization) and DBWritable (JDBC
 * read/write). Field order is strictly positional: write(DataOutput),
 * readFields(DataInput), write(PreparedStatement) and readFields(ResultSet)
 * must all list the 12 fields in the same order.
 */
public class sourcePojo implements Writable, DBWritable {
int id; // record id
String city; // city
String company; // company name
String companySize; // company headcount
String companyType; // company ownership type
String edulevel; // required education level
String emplType; // full-time / part-time
String extractSkillTag; // skills required for this position
String jobName; // job title
String salary; // salary
String welfare; // benefits
String workingExp; // required years of experience
// No-arg constructor required by Hadoop's reflection-based instantiation.
public sourcePojo() {
}
public sourcePojo(int id, String city, String company, String companySize, String companyType, String edulevel, String emplType, String extractSkillTag, String jobName, String salary, String welfare, String workingExp) {
this.id = id;
this.city = city;
this.company = company;
this.companySize = companySize;
this.companyType = companyType;
this.edulevel = edulevel;
this.emplType = emplType;
this.extractSkillTag = extractSkillTag;
this.jobName = jobName;
this.salary = salary;
this.welfare = welfare;
this.workingExp = workingExp;
}
public int getId() {
return id;
}
public void setId(int id) {
this.id = id;
}
public String getCity() {
return city;
}
public void setCity(String city) {
this.city = city;
}
public String getCompany() {
return company;
}
public void setCompany(String company) {
this.company = company;
}
public String getCompanySize() {
return companySize;
}
public void setCompanySize(String companySize) {
this.companySize = companySize;
}
public String getCompanyType() {
return companyType;
}
public void setCompanyType(String companyType) {
this.companyType = companyType;
}
public String getEdulevel() {
return edulevel;
}
public void setEdulevel(String edulevel) {
this.edulevel = edulevel;
}
public String getEmplType() {
return emplType;
}
public void setEmplType(String emplType) {
this.emplType = emplType;
}
public String getExtractSkillTag() {
return extractSkillTag;
}
public void setExtractSkillTag(String extractSkillTag) {
this.extractSkillTag = extractSkillTag;
}
public String getJobName() {
return jobName;
}
public void setJobName(String jobName) {
this.jobName = jobName;
}
public String getSalary() {
return salary;
}
public void setSalary(String salary) {
this.salary = salary;
}
public String getWelfare() {
return welfare;
}
public void setWelfare(String welfare) {
this.welfare = welfare;
}
public String getWorkingExp() {
return workingExp;
}
public void setWorkingExp(String workingExp) {
this.workingExp = workingExp;
}
// Hadoop serialization: object ==> bytes.
// NOTE(review): writeUTF throws NPE if a column is NULL in the database —
// assumes all 12 columns are NOT NULL; verify against the table schema.
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(this.id);
out.writeUTF(this.city);
out.writeUTF(this.company);
out.writeUTF(this.companySize);
out.writeUTF(this.companyType);
out.writeUTF(this.edulevel);
out.writeUTF(this.emplType);
out.writeUTF(this.extractSkillTag);
out.writeUTF(this.jobName);
out.writeUTF(this.salary);
out.writeUTF(this.welfare);
out.writeUTF(this.workingExp);
}
// Hadoop deserialization: must read fields in exactly the order they were written.
@Override
public void readFields(DataInput in) throws IOException {
this.id = in.readInt();
this.city = in.readUTF();
this.company = in.readUTF();
this.companySize = in.readUTF();
this.companyType = in.readUTF();
this.edulevel = in.readUTF();
this.emplType = in.readUTF();
this.extractSkillTag = in.readUTF();
this.jobName = in.readUTF();
this.salary = in.readUTF();
this.welfare = in.readUTF();
this.workingExp = in.readUTF();
}
// DBWritable write: binds fields to an INSERT/UPDATE statement.
// Note: PreparedStatement parameter indexes are 1-based.
@Override
public void write(PreparedStatement statement) throws SQLException {
statement.setInt(1,this.id);
statement.setString(2,this.city);
statement.setString(3,this.company);
statement.setString(4,this.companySize);
statement.setString(5,this.companyType);
statement.setString(6,this.edulevel);
statement.setString(7,this.emplType);
statement.setString(8,this.extractSkillTag);
statement.setString(9,this.jobName);
statement.setString(10,this.salary);
statement.setString(11,this.welfare);
statement.setString(12,this.workingExp);
}
// DBWritable read: populates fields from a SELECT result row.
// Note: ResultSet column indexes are also 1-based; order must match the
// column order of the query configured in DBInputFormat.setInput.
@Override
public void readFields(ResultSet resultSet) throws SQLException {
this.id = resultSet.getInt(1);
this.city = resultSet.getString(2);
this.company = resultSet.getString(3);
this.companySize = resultSet.getString(4);
this.companyType = resultSet.getString(5);
this.edulevel = resultSet.getString(6);
this.emplType = resultSet.getString(7);
this.extractSkillTag = resultSet.getString(8);
this.jobName = resultSet.getString(9);
this.salary = resultSet.getString(10);
this.welfare = resultSet.getString(11);
this.workingExp = resultSet.getString(12);
}
// toString defines the pipe-delimited line format the Mapper receives
// (value.toString() is split on "|" there), so the delimiter and field
// order here are part of the job's data contract — do not change casually.
@Override
public String toString() {
return id + "|"+city+ "|"+company+ "|"+companySize+ "|"+companyType+"|"+edulevel+ "|"+emplType+ "|"+extractSkillTag+ "|"+jobName+ "|"+salary+ "|"+welfare+ "|"+workingExp;
}
}
3.Mapper類
// 數據的來源是實體類,所以首先那個實體類要實現序列化(Writable)的方法
package com.data.test.Mysql_skill;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class MyMapper extends Mapper {
@Override
protected void map(Object key, sourcePojo value, Context context) throws IOException, InterruptedException {
// 這裏拿到的就是數據庫中的一條數據,然後存在了一行中
String words = value.toString();
// 這是指定分隔符對不同的數據進行分割
String[] string = words.split(“\|”);
// 測試:string.length = 12
// System.out.println(string.length);
// 崗位技能點
String skills = string[7];
// 使用正則表達式匹配你想要的數據
Pattern pattern = Pattern.compile(“\'(.*?)\'”); // 利用Pattern對象指定正則
Matcher matcher = pattern.matcher(skills); // 利用Matcher進行匹配
// 循環依次去取數據,所取到的數據就是你要取到的數據
while (matcher.find()){
// 根據標識符將一個一個篩出來的數據放到list中
String temp = matcher.group();
// 對數據中不符合規範的進行清洗
temp = temp.replace(“\'”,””);
// 對全角與半角字符的處理
temp = temp.replace(“.”,”.”);
// 在循環彙總依次將處理過之後的數據,並設置每一個的值是1
context.write(new Text(temp),new IntWritable(1));
}
}
}
4.Reducer類
package com.data.test.Mysql_skill;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * Sums the per-skill counts, buffers them, and in cleanup() sorts the buffer
 * (outPojo.compareTo: count descending) to emit the top 10 skills.
 * The generic parameters are required: with a raw {@code Reducer} the
 * {@code reduce(Text, Iterable, Context)} signature would not override the
 * framework's reduce method and would never be called.
 */
public class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /** One sortable (skill, count) entry per distinct key, filled by reduce(). */
    private final List<outPojo> list = new ArrayList<>();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) {
        // Sum the 1s emitted by the mapper for this skill.
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        // Buffer instead of writing, so cleanup() can sort globally first.
        list.add(new outPojo(key.toString(), sum));
    }

    /**
     * Runs exactly once after all reduce() calls, which makes it the right
     * place for whole-dataset operations such as this top-K selection.
     */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // outPojo implements Comparable: count descending, skill as tie-break.
        Collections.sort(list);
        int emitted = 0;
        for (outPojo out : list) {
            context.write(new Text(out.getSkill()), new IntWritable(out.getCount()));
            // Stop after the top 10 entries.
            if (++emitted == 10) {
                break;
            }
        }
    }
}
5.Main類
package com.data.test.Mysql_skill;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
public class Main {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
conf.set(“fs.defaultFS”,”hdfs://lky01:9000″);
DBConfiguration.configureDB(conf,"com.mysql.jdbc.Driver",
"jdbc:mysql://lky01:3306/migrate","root","123456");
// 新建一個任務對象
Job job = Job.getInstance(conf);
job.setJarByClass(Main.class);
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
// 設置Mapper的輸出類型
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// 設置Reducer輸出類型
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// 設置數據的讀入的格式
job.setInputFormatClass(DBInputFormat.class);
DBInputFormat.setInput(job, sourcePojo.class,"select * from zhaopin","select count(1) from zhaopin");
// 設置數據的輸出格式
job.setOutputFormatClass(TextOutputFormat.class);
// 設置與Mysql數據連接的jar的位置(當前的路徑是自己在hdfs中創建的,並且上傳了對應版本的jar包)
job.addArchiveToClassPath(new Path("hdfs://lky01:9000/lib/mysql/mysql-connector-java-5.1.39.jar"));
// 設置輸出路徑到hdfs
FileSystem fs = FileSystem.get(conf);
Path outPath = new Path("/output_wang_skill");
if (fs.exists(outPath))
fs.delete(outPath,true);
FileOutputFormat.setOutputPath(job,outPath);
// 提交任務等待完成、
System.exit(job.waitForCompletion(true) ? 0 : 1);
}