目錄
1.需求
訂單表
商品表
實現機制:
以關聯條件(商品id)作爲map輸出的key,把兩表中滿足join條件的數據連同其來源文件的信息一起發往同一個reduce task,再在reduce中完成數據的串聯
2.創建join對象
package com.czxy.order;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Hadoop {@link Writable} holding the columns of an order record, a product record, or the
 * joined combination of both.
 *
 * <p>Each field is written with {@code writeUTF}. A field that is {@code null} at write time is
 * sent as the literal text {@code "null"} and therefore reads back as that four-character string.
 * Accessors, {@code equals}/{@code hashCode} and {@code toString} come from Lombok's
 * {@code @Data}.
 */
@Data
@NoArgsConstructor
public class JoinBean implements Writable {
    private String id;         // order id
    private String date;       // order date
    private String pid;        // product id (the join key)
    private String amount;     // quantity ordered
    private String name;       // product name
    private String categoryId; // category id
    private String price;      // price

    /**
     * Serializes the seven fields in declaration order.
     *
     * @param out sink to write to
     * @throws IOException if the underlying stream fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        String[] columns = {id, date, pid, amount, name, categoryId, price};
        for (String column : columns) {
            // String.valueOf turns a null field into the text "null", as writeUTF rejects null.
            out.writeUTF(String.valueOf(column));
        }
    }

    /**
     * Restores the seven fields in the same order {@link #write} emitted them.
     *
     * @param in source to read from
     * @throws IOException if the underlying stream fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        id = in.readUTF();
        date = in.readUTF();
        pid = in.readUTF();
        amount = in.readUTF();
        name = in.readUTF();
        categoryId = in.readUTF();
        price = in.readUTF();
    }
}
3.實現Map Join
注意:需要先把商品表的數據上傳到HDFS
3.1創建map代碼
package com.czxy.order;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import javax.xml.transform.Source;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
/**
 * Map-side join mapper.
 *
 * <p>{@link #setup} loads the product table from the first registered cache file into an
 * in-memory map keyed by product id. {@link #map} then joins each order line against that map
 * and emits {@code (pid, joinedBean.toString())}.
 *
 * <p>Both inputs are comma-separated with four columns:
 * product = {@code pid,name,categoryId,price}; order = {@code id,date,pid,amount}.
 * Lines with fewer columns and orders without a matching product are skipped and counted
 * under the {@code MapJoin} counter group instead of failing the task.
 */
public class MapJoinMapper extends Mapper<LongWritable, Text, Text, Text> {
    /** Number of comma-separated columns expected in product and order lines. */
    private static final int COLUMN_COUNT = 4;
    /** Byte-order mark that may prefix the first line of a UTF-8 file. */
    private static final String BOM = "\uFEFF";
    /** Counter group used to report skipped records. */
    private static final String COUNTER_GROUP = "MapJoin";

    /** Product table keyed by product id; filled once in {@link #setup}. */
    private final Map<String, JoinBean> joinMap = new HashMap<>();

    /**
     * Reads the product table from the first cache file into {@link #joinMap}.
     *
     * @param context task context supplying the configuration and counters
     * @throws IOException if no cache file is registered or the file cannot be read
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Locate the cache files registered by the driver.
        URI[] urls = DistributedCache.getCacheFiles(context.getConfiguration());
        if (urls == null || urls.length == 0) {
            throw new IOException("No cache file registered; the product table is required for the map join");
        }
        // FileSystem.get returns a cached instance shared across the task JVM, so it must NOT
        // be closed here: closing it would break every later user of the same file system.
        FileSystem fs = FileSystem.get(urls[0], context.getConfiguration());
        // try-with-resources closes the reader (and the wrapped HDFS stream) even on failure.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                fs.open(new Path(urls[0])), java.nio.charset.StandardCharsets.UTF_8))) {
            boolean firstLine = true;
            String line;
            while ((line = reader.readLine()) != null) {
                if (firstLine) {
                    firstLine = false;
                    // Drop a leading BOM so the first product id is a clean lookup key.
                    if (line.startsWith(BOM)) {
                        line = line.substring(BOM.length());
                    }
                }
                String[] split = line.split(",");
                if (split.length < COLUMN_COUNT) {
                    // Blank or truncated line: skip it rather than crash on a missing column.
                    context.getCounter(COUNTER_GROUP, "MALFORMED_PRODUCT_LINES").increment(1);
                    continue;
                }
                JoinBean product = new JoinBean();
                product.setPid(split[0]);
                product.setName(split[1]);
                product.setCategoryId(split[2]);
                product.setPrice(split[3]);
                joinMap.put(product.getPid(), product);
            }
        }
    }

    /**
     * Joins one order line with its product and emits the result keyed by product id.
     *
     * @param key     position key supplied by the input format (unused)
     * @param value   one order line: {@code id,date,pid,amount}
     * @param context task context used to emit output and update counters
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] split = value.toString().split(",");
        if (split.length < COLUMN_COUNT) {
            context.getCounter(COUNTER_GROUP, "MALFORMED_ORDER_LINES").increment(1);
            return;
        }
        String pid = split[2];
        JoinBean product = joinMap.get(pid);
        if (product == null) {
            // Inner-join semantics: an order whose product is unknown produces no output.
            context.getCounter(COUNTER_GROUP, "UNMATCHED_ORDERS").increment(1);
            return;
        }
        // Build a fresh bean so the cached product entry is never mutated between records.
        JoinBean joined = new JoinBean();
        joined.setId(split[0]);
        joined.setDate(split[1]);
        joined.setPid(pid);
        joined.setAmount(split[3]);
        joined.setName(product.getName());
        joined.setCategoryId(product.getCategoryId());
        joined.setPrice(product.getPrice());
        context.write(new Text(pid), new Text(joined.toString()));
    }
}
3.2實現啓動類
package com.czxy.order;
import com.czxy.flow.FlowBean;
import com.czxy.flow.FlowDriver;
import com.czxy.flow.FlowMapper;
import com.czxy.flow.FlowReduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.net.URI;
/**
 * Driver for the map-side join job: registers the product table as a cache file and runs
 * {@link MapJoinMapper} over the orders file.
 */
public class MapJoinDriver extends Configured implements Tool {
    /**
     * Configures and runs the job, blocking until it finishes.
     *
     * @param args remaining command-line arguments (currently unused; paths are fixed)
     * @return {@code 0} if the job succeeded, {@code 1} otherwise
     * @throws Exception if job setup or submission fails
     */
    @Override
    public int run(String[] args) throws Exception {
        // Use the configuration injected by ToolRunner so generic options (-D, -conf, -fs, ...)
        // take effect; fall back to a fresh one only when run() is invoked directly.
        Configuration configuration = getConf();
        if (configuration == null) {
            configuration = new Configuration();
        }
        // Register the HDFS product file as a cache file. This must happen BEFORE
        // Job.getInstance, which works on its own copy of the configuration.
        DistributedCache.addCacheFile(new URI("hdfs://192.168.100.201:8020/input/product.txt"), configuration);
        Job job = Job.getInstance(configuration);
        // Lets Hadoop locate the jar containing this class when running on a cluster.
        job.setJarByClass(MapJoinDriver.class);
        job.setMapperClass(MapJoinMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // NOTE(review): no reducer or reduce-task count is configured here, so the framework
        // defaults apply; for a pure map-side join consider job.setNumReduceTasks(0) — confirm
        // that nothing depends on the current output file layout first.
        // Input: the orders file.
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("./data/join/orders.txt"));
        // Output directory.
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("./outPut/join/map"));
        // Submit and wait for completion.
        boolean succeeded = job.waitForCompletion(true);
        return succeeded ? 0 : 1;
    }

    /**
     * Command-line entry point; the process exit status mirrors the job result.
     *
     * @param args command-line arguments, parsed by {@link ToolRunner}
     * @throws Exception if the job cannot be run
     */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new MapJoinDriver(), args));
    }
}
4.實現reduce join
4.1創建map類
package com.czxy.order;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
/**
 * Mapper for the reduce-side join.
 *
 * <p>Reads lines from both the orders file and the product file, tags each record by filling
 * only the fields that belong to its source table, and emits it keyed by product id so that
 * matching rows from both tables meet in the same reduce call.
 *
 * <p>A record is treated as an order when its input file name contains {@code "orders"}
 * ({@code id,date,pid,amount}); anything else is treated as a product
 * ({@code pid,name,categoryId,price}).
 */
public class ReduceJoinMap extends Mapper<LongWritable, Text, Text, JoinBean> {
    /** Number of comma-separated columns expected in every input line. */
    private static final int COLUMN_COUNT = 4;
    /** Byte-order mark that may prefix the first line of a UTF-8 file. */
    private static final String BOM = "\uFEFF";

    /**
     * Converts one input line into a partially filled {@link JoinBean} keyed by product id.
     *
     * @param key     position key supplied by the input format (unused)
     * @param value   one comma-separated line from either input file
     * @param context task context used to find the source file and emit output
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        // A BOM left on the first line would corrupt the first column (the join key for
        // product rows), so strip it defensively.
        if (line.startsWith(BOM)) {
            line = line.substring(BOM.length());
        }
        String[] split = line.split(",");
        if (split.length < COLUMN_COUNT) {
            // Blank or truncated line: skip it rather than crash on a missing column.
            context.getCounter("ReduceJoin", "MALFORMED_LINES").increment(1);
            return;
        }
        // The name of the file this split belongs to tells us which table the row is from.
        FileSplit fileSplit = (FileSplit) context.getInputSplit();
        String fileName = fileSplit.getPath().getName();
        JoinBean joinBean = new JoinBean();
        if (fileName.contains("orders")) {
            joinBean.setId(split[0]);
            joinBean.setDate(split[1]);
            joinBean.setPid(split[2]);
            joinBean.setAmount(split[3]);
        } else {
            joinBean.setPid(split[0]);
            joinBean.setName(split[1]);
            joinBean.setCategoryId(split[2]);
            joinBean.setPrice(split[3]);
        }
        context.write(new Text(joinBean.getPid()), joinBean);
    }
}
4.2創建reduce代碼
package com.czxy.order;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Reducer for the reduce-side join: for one product id, combines the product row with every
 * order row that references it and emits one joined {@link JoinBean} per order.
 *
 * <p>Values for a key arrive in no guaranteed order, so all order rows are buffered and the
 * join is emitted only after the whole group has been read. Keys that have orders but no
 * product (or a product but no orders) produce no output.
 */
public class ReduceJoinReduce extends Reducer<Text, JoinBean, Text, JoinBean> {
    /**
     * Joins all orders of one product id with that product.
     *
     * @param key     product id shared by every value in the group
     * @param values  order rows and product row(s) for this product id
     * @param context task context used to emit the joined rows
     */
    @Override
    protected void reduce(Text key, Iterable<JoinBean> values, Context context) throws IOException, InterruptedException {
        List<JoinBean> orders = new ArrayList<>();
        JoinBean product = null;
        for (JoinBean value : values) {
            // The framework may reuse the same value instance across iterations, so copy the
            // fields into a fresh bean before keeping a reference beyond this iteration.
            if (isOrder(value)) {
                JoinBean order = new JoinBean();
                order.setId(value.getId());
                order.setDate(value.getDate());
                order.setPid(value.getPid());
                order.setAmount(value.getAmount());
                orders.add(order);
            } else {
                // If several product rows share the id, the last one read wins.
                product = new JoinBean();
                product.setPid(value.getPid());
                product.setName(value.getName());
                product.setCategoryId(value.getCategoryId());
                product.setPrice(value.getPrice());
            }
        }
        if (product == null) {
            // Inner join: orders without a product are dropped.
            return;
        }
        for (JoinBean order : orders) {
            order.setName(product.getName());
            order.setCategoryId(product.getCategoryId());
            order.setPrice(product.getPrice());
            context.write(key, order);
        }
    }

    /**
     * Tells whether a value came from the orders table. Order rows never have a product name
     * set; after serialization an unset name reads back as the text {@code "null"}.
     */
    private static boolean isOrder(JoinBean value) {
        String name = value.getName();
        return name == null || name.equals("null");
    }
}
4.3reduce的啓動類
package com.czxy.order;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.net.URI;
/**
 * Driver for the reduce-side join job: runs {@link ReduceJoinMap} and {@link ReduceJoinReduce}
 * over every file in the join input directory.
 */
public class ReduceJoinDriver extends Configured implements Tool {
    /**
     * Configures and runs the job, blocking until it finishes.
     *
     * @param args remaining command-line arguments (currently unused; paths are fixed)
     * @return {@code 0} if the job succeeded, {@code 1} otherwise
     * @throws Exception if job setup or submission fails
     */
    @Override
    public int run(String[] args) throws Exception {
        // Use the configuration injected by ToolRunner so generic options (-D, -conf, -fs, ...)
        // take effect; fall back to a fresh one only when run() is invoked directly.
        Configuration configuration = getConf();
        if (configuration == null) {
            configuration = new Configuration();
        }
        Job job = Job.getInstance(configuration);
        // Lets Hadoop locate the jar containing this class when running on a cluster.
        job.setJarByClass(ReduceJoinDriver.class);
        // Map phase: emits (Text pid, JoinBean row).
        job.setMapperClass(ReduceJoinMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(JoinBean.class);
        // Reduce phase: emits (Text pid, JoinBean joinedRow).
        job.setReducerClass(ReduceJoinReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(JoinBean.class);
        // Input: the directory holding both the orders file and the product file.
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("./data/join/"));
        // Output directory.
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("./outPut/join/reduce"));
        // Submit and wait for completion.
        boolean succeeded = job.waitForCompletion(true);
        return succeeded ? 0 : 1;
    }

    /**
     * Command-line entry point; the process exit status mirrors the job result.
     *
     * @param args command-line arguments, parsed by {@link ToolRunner}
     * @throws Exception if the job cannot be run
     */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new ReduceJoinDriver(), args));
    }
}
5.需要的資源及執行結果
orders.txt
1001,20150710,P0001,2
1002,20150710,P0001,3
1002,20150710,P0002,3
product.txt
P0001,小米5,1000,2000
P0002,錘子T1,1000,3000