利用MapReduce實現自定義排序

1.第一種實現的方式是：在Reducer中將數據封裝到list中，然後在cleanup方法中調用集合的排序方法實現排序；
這種方式要求把所有待排序的數據都封裝在同一個pojo中，並且全部保存在list裏；
2.第二種是將pojo類作爲key從Mapper傳遞到Reducer，由框架在shuffle階段按照key的compareTo()方法排序，然後在Reducer中遍歷並依次輸出。
3.共同點：兩種方式都要實現WritableComparable接口，重寫compareTo()方法，在其中比較指定的字段（int類型的字段建議使用Integer.compare，避免兩數相減造成溢出），
或者也可以指定多字段進行排序

1.Pojo類（用來實現排序的，而且這裏的數據是來自之前統計的結果，所以只有兩個字段：技能點、數量）

package com.data.test.Mysql_skill;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * A (skill, count) pair that can be serialized by Hadoop and sorted.
 *
 * <p>Any custom class used as a MapReduce key must implement
 * {@code WritableComparable}; the type parameter is the class itself so that
 * {@link #compareTo(outPojo)} receives a typed argument. All fields needed
 * downstream can be packed into this object, in which case the accompanying
 * value can simply be {@code NullWritable}.
 *
 * <p>Ordering: {@code count} descending, then {@code skill} descending.
 */
public class outPojo implements WritableComparable<outPojo> {
    String skill;
    int count;

    /** No-arg constructor, kept so the class can be instantiated before {@link #readFields}. */
    public outPojo() {
    }

    /**
     * @param skill the skill name
     * @param count how many times the skill occurred
     */
    public outPojo(String skill, int count) {
        this.skill = skill;
        this.count = count;
    }

    public String getSkill() {
        return skill;
    }

    public void setSkill(String skill) {
        this.skill = skill;
    }

    public int getCount() {
        return count;
    }

    public void setCount(int count) {
        this.count = count;
    }

    /**
     * Serializes this object (object to bytes).
     *
     * @param out the sink to write to
     * @throws IOException if writing fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(skill);
        out.writeInt(count);
    }

    /**
     * Deserializes this object. Fields are read in exactly the order
     * {@link #write(DataOutput)} wrote them.
     *
     * @param in the source to read from
     * @throws IOException if reading fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.skill = in.readUTF();
        this.count = in.readInt();
    }

    /**
     * Multi-field ordering: larger {@code count} first; on equal counts the
     * {@code skill} strings are compared, also in descending order.
     *
     * @param o the object to compare against
     * @return negative, zero or positive as this object sorts before, equal to
     *         or after {@code o}
     */
    @Override
    public int compareTo(outPojo o) {
        // Integer.compare instead of subtraction: a difference of two ints can overflow.
        int byCount = Integer.compare(o.count, this.count);
        if (byCount != 0) {
            return byCount;
        }
        // Same count: fall back to the skill name (operands swapped => descending).
        return o.skill.compareTo(this.skill);
    }

    /** Equality on both fields, consistent with {@link #compareTo(outPojo)}. */
    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (!(other instanceof outPojo)) {
            return false;
        }
        outPojo that = (outPojo) other;
        boolean sameSkill = (skill == null) ? that.skill == null : skill.equals(that.skill);
        return sameSkill && count == that.count;
    }

    /** Value-based hash so that equal objects hash identically. */
    @Override
    public int hashCode() {
        return 31 * (skill == null ? 0 : skill.hashCode()) + count;
    }

    /**
     * Controls the text form used when this object is written as output.
     * Only the {@code count} field is emitted here.
     */
    @Override
    public String toString() {
        return String.valueOf(count);
    }
}

2.Pojo類（用來實現從Mysql中讀取數據的，所以不僅實現序列化接口Writable，還實現了操作數據庫的DBWritable）
// 由於這裏的測試的數據來源是mysql，所以還多了一個實體類
package com.data.test.Mysql_skill;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

/**
 * One job-posting row: an int id plus eleven string columns.
 *
 * <p>Implements {@code Writable} for Hadoop serialization and
 * {@code DBWritable} for reading from / writing to JDBC.
 */
public class sourcePojo implements Writable, DBWritable {
    int id; // row id
    String city; // city
    String company; // company name
    String companySize; // company head count
    String companyType; // company type
    String edulevel; // required education level
    String emplType; // full-time / part-time
    String extractSkillTag; // skills required by the position
    String jobName; // position name
    String salary; // salary
    String welfare; // benefits
    String workingExp; // required years of experience

    public sourcePojo() {
    }

    public sourcePojo(int id, String city, String company, String companySize, String companyType, String edulevel, String emplType, String extractSkillTag, String jobName, String salary, String welfare, String workingExp) {
        this.id = id;
        this.city = city;
        this.company = company;
        this.companySize = companySize;
        this.companyType = companyType;
        this.edulevel = edulevel;
        this.emplType = emplType;
        this.extractSkillTag = extractSkillTag;
        this.jobName = jobName;
        this.salary = salary;
        this.welfare = welfare;
        this.workingExp = workingExp;
    }

    public int getId() {
        return id;
    }

    public void setId(int id) {
        this.id = id;
    }

    public String getCity() {
        return city;
    }

    public void setCity(String city) {
        this.city = city;
    }

    public String getCompany() {
        return company;
    }

    public void setCompany(String company) {
        this.company = company;
    }

    public String getCompanySize() {
        return companySize;
    }

    public void setCompanySize(String companySize) {
        this.companySize = companySize;
    }

    public String getCompanyType() {
        return companyType;
    }

    public void setCompanyType(String companyType) {
        this.companyType = companyType;
    }

    public String getEdulevel() {
        return edulevel;
    }

    public void setEdulevel(String edulevel) {
        this.edulevel = edulevel;
    }

    public String getEmplType() {
        return emplType;
    }

    public void setEmplType(String emplType) {
        this.emplType = emplType;
    }

    public String getExtractSkillTag() {
        return extractSkillTag;
    }

    public void setExtractSkillTag(String extractSkillTag) {
        this.extractSkillTag = extractSkillTag;
    }

    public String getJobName() {
        return jobName;
    }

    public void setJobName(String jobName) {
        this.jobName = jobName;
    }

    public String getSalary() {
        return salary;
    }

    public void setSalary(String salary) {
        this.salary = salary;
    }

    public String getWelfare() {
        return welfare;
    }

    public void setWelfare(String welfare) {
        this.welfare = welfare;
    }

    public String getWorkingExp() {
        return workingExp;
    }

    public void setWorkingExp(String workingExp) {
        this.workingExp = workingExp;
    }

    /** Maps {@code null} to the empty string; {@code DataOutput.writeUTF} rejects null. */
    private static String nullToEmpty(String s) {
        return s == null ? "" : s;
    }

    /**
     * Hadoop serialization. A {@code null} string field is written as the
     * empty string, so it is read back as {@code ""} rather than {@code null}.
     *
     * @param out the sink to write to
     * @throws IOException if writing fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(this.id);
        out.writeUTF(nullToEmpty(this.city));
        out.writeUTF(nullToEmpty(this.company));
        out.writeUTF(nullToEmpty(this.companySize));
        out.writeUTF(nullToEmpty(this.companyType));
        out.writeUTF(nullToEmpty(this.edulevel));
        out.writeUTF(nullToEmpty(this.emplType));
        out.writeUTF(nullToEmpty(this.extractSkillTag));
        out.writeUTF(nullToEmpty(this.jobName));
        out.writeUTF(nullToEmpty(this.salary));
        out.writeUTF(nullToEmpty(this.welfare));
        out.writeUTF(nullToEmpty(this.workingExp));
    }

    /**
     * Hadoop deserialization; reads the fields in the order
     * {@link #write(DataOutput)} wrote them.
     *
     * @param in the source to read from
     * @throws IOException if reading fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.id = in.readInt();
        this.city = in.readUTF();
        this.company = in.readUTF();
        this.companySize = in.readUTF();
        this.companyType = in.readUTF();
        this.edulevel = in.readUTF();
        this.emplType = in.readUTF();
        this.extractSkillTag = in.readUTF();
        this.jobName = in.readUTF();
        this.salary = in.readUTF();
        this.welfare = in.readUTF();
        this.workingExp = in.readUTF();
    }

    /**
     * Binds the twelve fields to statement parameters 1..12
     * ({@code PreparedStatement} indexes are 1-based).
     *
     * @param statement the statement to populate
     * @throws SQLException if a parameter cannot be set
     */
    @Override
    public void write(PreparedStatement statement) throws SQLException {
        statement.setInt(1, this.id);
        statement.setString(2, this.city);
        statement.setString(3, this.company);
        statement.setString(4, this.companySize);
        statement.setString(5, this.companyType);
        statement.setString(6, this.edulevel);
        statement.setString(7, this.emplType);
        statement.setString(8, this.extractSkillTag);
        statement.setString(9, this.jobName);
        statement.setString(10, this.salary);
        statement.setString(11, this.welfare);
        statement.setString(12, this.workingExp);
    }

    /**
     * Populates the twelve fields from result columns 1..12
     * ({@code ResultSet} indexes are 1-based).
     *
     * @param resultSet the row to read
     * @throws SQLException if a column cannot be read
     */
    @Override
    public void readFields(ResultSet resultSet) throws SQLException {
        this.id = resultSet.getInt(1);
        this.city = resultSet.getString(2);
        this.company = resultSet.getString(3);
        this.companySize = resultSet.getString(4);
        this.companyType = resultSet.getString(5);
        this.edulevel = resultSet.getString(6);
        this.emplType = resultSet.getString(7);
        this.extractSkillTag = resultSet.getString(8);
        this.jobName = resultSet.getString(9);
        this.salary = resultSet.getString(10);
        this.welfare = resultSet.getString(11);
        this.workingExp = resultSet.getString(12);
    }

    /**
     * Renders the row as its twelve fields joined by {@code |}, in declaration
     * order. Field values are not escaped.
     */
    @Override
    public String toString() {
        return id + "|"+city+ "|"+company+ "|"+companySize+ "|"+companyType+"|"+edulevel+ "|"+emplType+ "|"+extractSkillTag+ "|"+jobName+ "|"+salary+ "|"+welfare+ "|"+workingExp;
    }
}

3.Mapper類

// 數據的來源是實體類，所以首先那個實體類要實現序列化（Writable）的方法
package com.data.test.Mysql_skill;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class MyMapper extends Mapper {
@Override
protected void map(Object key, sourcePojo value, Context context) throws IOException, InterruptedException {
// 這裏拿到的就是數據庫中的一條數據，然後存在了一行中
String words = value.toString();
// 這是指定分隔符對不同的數據進行分割
String[] string = words.split(“\|”);
// 測試：string.length = 12
// System.out.println(string.length);
// 崗位技能點
String skills = string[7];
// 使用正則表達式匹配你想要的數據
Pattern pattern = Pattern.compile(“\'(.*?)\'”); // 利用Pattern對象指定正則
Matcher matcher = pattern.matcher(skills); // 利用Matcher進行匹配
// 循環依次去取數據，所取到的數據就是你要取到的數據
while (matcher.find()){
// 根據標識符將一個一個篩出來的數據放到list中
String temp = matcher.group();
// 對數據中不符合規範的進行清洗
temp = temp.replace(“\'”,””);
// 對全角與半角字符的處理
temp = temp.replace(“．”,”.”);
// 在循環彙總依次將處理過之後的數據，並設置每一個的值是1
context.write(new Text(temp),new IntWritable(1));
}
}
}

4.Reducer類

package com.data.test.Mysql_skill;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Sums the counts per skill, buffers every (skill, total) pair, and in
 * {@link #cleanup} writes the first {@link #TOP_N} pairs in sorted order.
 *
 * <p>Each reduce task buffers and sorts only the keys it received, so the
 * output is a global top-N only when the job runs with a single reduce task.
 */
public class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    /** Maximum number of records written by {@link #cleanup}. */
    private static final int TOP_N = 10;

    // Collected (skill, total) pairs; sorted and emitted in cleanup().
    final List<outPojo> list = new ArrayList<>();

    /**
     * Adds up all values for {@code key} and buffers the result; nothing is
     * written to the context here.
     *
     * @param key     the skill name
     * @param values  the partial counts for that skill
     * @param context not used
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        list.add(new outPojo(key.toString(), sum));
    }

    /**
     * Sorts the buffered pairs by their natural order and writes at most
     * {@link #TOP_N} of them as {@code (skill, count)}.
     *
     * @param context sink for the output records
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        Collections.sort(list);
        int limit = Math.min(TOP_N, list.size());
        for (outPojo out : list.subList(0, limit)) {
            context.write(new Text(out.getSkill()), new IntWritable(out.getCount()));
        }
    }
}

5.Main類

package com.data.test.Mysql_skill;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
public class Main {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
conf.set(“fs.defaultFS”,”hdfs://lky01:9000″);
DBConfiguration.configureDB(conf,"com.mysql.jdbc.Driver",
            "jdbc:mysql://lky01:3306/migrate","root","123456");

    // 新建一個任務對象
    Job job = Job.getInstance(conf);
    job.setJarByClass(Main.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    // 設置Mapper的輸出類型
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    // 設置Reducer輸出類型
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // 設置數據的讀入的格式
    job.setInputFormatClass(DBInputFormat.class);
    DBInputFormat.setInput(job, sourcePojo.class,"select * from zhaopin","select count(1) from zhaopin");

    // 設置數據的輸出格式
    job.setOutputFormatClass(TextOutputFormat.class);

    // 設置與Mysql數據連接的jar的位置(當前的路徑是自己在hdfs中創建的，並且上傳了對應版本的jar包)
    job.addArchiveToClassPath(new Path("hdfs://lky01:9000/lib/mysql/mysql-connector-java-5.1.39.jar"));

    // 設置輸出路徑到hdfs
    FileSystem fs = FileSystem.get(conf);
    Path outPath = new Path("/output_wang_skill");
    if (fs.exists(outPath))
        fs.delete(outPath,true);
    FileOutputFormat.setOutputPath(job,outPath);
    // 提交任務等待完成、
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

利用MapReduce實現自定義排序

利用MapReduce實現自定義排序

利用MapReduce實現自定義排序

Java環境變量配置及NetBeans安裝

C語言實訓任務之學生成績管理系統

MySQL新版登錄報錯：Access denied for user 'root'@'localhost' (using password:YES)解決方法

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結