1. Applications of Reduce Join

(1) What the join involves

Work on the Map side: tag each key/value pair with a marker identifying which table or file the record came from. Then emit the join field as the key, and the remaining fields plus the newly added tag as the value.

Work on the Reduce side: by the time reduce() is called, grouping by the join field (the key) has already been done. Within each group we only need to separate the records that came from different source files (using the tags added in the Map phase) and then merge them.
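To make the two phases concrete, here is roughly what the intermediate data looks like for the sample files used in the example below (order.txt and pd.txt): the Map side emits the join field pid as the key and the tagged record as the value, and each reduce() call then sees one pid group containing both kinds of records.

```text
// Map output: key = pid, value = tagged record
01 -> (order, id=1001, amount=1)      01 -> (pd, pname=小米)
01 -> (order, id=1004, amount=4)      02 -> (pd, pname=華爲)
...

// Reduce group for key = 01
(order, 1001, 1), (order, 1004, 4), (pd, 小米)  ->  joined: 1001 1 小米 and 1004 4 小米
```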
(2) Reduce Join: a worked example

- Requirement: merge the data in pd.txt into order.txt on the product id pid. The two input files look like this:

```text
// order.txt  (id  pid  amount)
1001	01	1
1002	02	2
1003	03	3
1004	01	4
1005	02	5
1006	03	6

// pd.txt  (pid  pname)
01	小米
02	華爲
03	格力
```
- Create the package com.easysir.reducejoin.

- Create the TableBean class:

```java
package com.easysir.reducejoin;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class TableBean implements Writable {

    private String id;     // order id
    private String pid;    // product id
    private int amount;    // quantity ordered
    private String pname;  // product name
    private String flag;   // tag marking which table the record comes from

    public TableBean() {
        super();
    }

    public TableBean(String id, String pid, int amount, String pname, String flag) {
        super();
        this.id = id;
        this.pid = pid;
        this.amount = amount;
        this.pname = pname;
        this.flag = flag;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(id);
        out.writeUTF(pid);
        out.writeInt(amount);
        out.writeUTF(pname);
        out.writeUTF(flag);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        id = in.readUTF();
        pid = in.readUTF();
        amount = in.readInt();
        pname = in.readUTF();
        flag = in.readUTF();
    }

    public String getId() { return id; }

    public void setId(String id) { this.id = id; }

    public String getPid() { return pid; }

    public void setPid(String pid) { this.pid = pid; }

    public int getAmount() { return amount; }

    public void setAmount(int amount) { this.amount = amount; }

    public String getPname() { return pname; }

    public void setPname(String pname) { this.pname = pname; }

    public String getFlag() { return flag; }

    public void setFlag(String flag) { this.flag = flag; }

    @Override
    public String toString() {
        return id + "\t" + amount + "\t" + pname;
    }
}
```
- Create the TableMapper class:

```java
package com.easysir.reducejoin;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class TableMapper extends Mapper<LongWritable, Text, Text, TableBean> {

    String name;
    TableBean tableBean = new TableBean();
    Text k = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get the name of the file this split comes from
        FileSplit fileSplit = (FileSplit) context.getInputSplit();
        name = fileSplit.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        // 1 Read one line
        String line = value.toString();
        String[] fields = line.split("\t");

        // 2 Decide which table the record belongs to
        if (name.startsWith("order")) {
            // order table
            tableBean.setId(fields[0]);
            tableBean.setPid(fields[1]);
            tableBean.setAmount(Integer.parseInt(fields[2]));
            tableBean.setPname("");
            tableBean.setFlag("order");
            k.set(fields[1]);
        } else {
            // product table
            tableBean.setId("");
            tableBean.setPid(fields[0]);
            tableBean.setAmount(0);
            tableBean.setPname(fields[1]);
            tableBean.setFlag("pd");
            k.set(fields[0]);
        }

        context.write(k, tableBean);
    }
}
```
- Create the TableReducer class:

```java
package com.easysir.reducejoin;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

public class TableReducer extends Reducer<Text, TableBean, TableBean, NullWritable> {

    @Override
    protected void reduce(Text key, Iterable<TableBean> values, Context context) throws IOException, InterruptedException {

        // Holds every order record in this group
        ArrayList<TableBean> orderBeans = new ArrayList<>();
        // Holds the product record of this group
        TableBean pdBean = new TableBean();

        for (TableBean tableBean : values) {
            if ("order".equals(tableBean.getFlag())) {
                // The framework reuses the value object, so copy it before storing
                TableBean tmpBean = new TableBean();
                try {
                    BeanUtils.copyProperties(tmpBean, tableBean);
                    orderBeans.add(tmpBean);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
            } else {
                try {
                    BeanUtils.copyProperties(pdBean, tableBean);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
            }
        }

        // Fill in the product name for every order record and emit it
        for (TableBean tableBean : orderBeans) {
            tableBean.setPname(pdBean.getPname());
            context.write(tableBean, NullWritable.get());
        }
    }
}
```
- Create the TableDriver class:

```java
package com.easysir.reducejoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class TableDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        // Set the input and output paths to match your own machine
        args = new String[]{"E:\\idea-workspace\\mrWordCount\\input3", "E:\\idea-workspace\\mrWordCount\\output"};

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(TableDriver.class);
        job.setMapperClass(TableMapper.class);
        job.setReducerClass(TableReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(TableBean.class);

        job.setOutputKeyClass(TableBean.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
```
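With the sample files above, the job should emit one joined line per order record, with the product name filled in from pd.txt; the field layout follows TableBean.toString() (id, amount, pname). The exact row order within each pid group may vary, but the output looks roughly like:

```text
1001	1	小米
1004	4	小米
1002	2	華爲
1005	5	華爲
1003	3	格力
1006	6	格力
```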
(3) Summary

- Drawback: with this approach the actual merging is done in the Reduce phase. The Reduce side carries almost all of the processing load while the Map nodes do very little computation, so resources are used unevenly, and the Reduce phase is very prone to data skew.
- Solution: merge the data on the Map side instead (Map join).
2. Applications of Map Join

(1) Map Join in detail

- Use case: Map join suits the scenario where one table is very large and the other is very small.
- Advantage: the small table is cached on the Map side and the join logic is applied there. Moving this work to the Map side relieves the pressure on the Reduce side and reduces data skew as much as possible.
- Steps: (1) in the Mapper's setup phase, read the cached file into an in-memory collection; (2) in the driver, register the file to be cached:

```java
// Cache an ordinary file on the node where the task runs
job.addCacheFile(new URI("file path"));
```
(2) Map Join: a worked example

- Requirement: as before, merge the data in pd.txt into order.txt on the product id pid:

```text
// order.txt  (id  pid  amount)
1001	01	1
1002	02	2
1003	03	3
1004	01	4
1005	02	5
1006	03	6

// pd.txt  (pid  pname)
01	小米
02	華爲
03	格力
```
- Create the package com.easysir.mapjoin.

- Create the DistributedCacheMapper class:

```java
package com.easysir.mapjoin;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

public class DistributedCacheMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    Map<String, String> pdMap = new HashMap<>();
    Text k = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {

        // 1 Get the cached file
        URI[] cacheFiles = context.getCacheFiles();
        String path = cacheFiles[0].getPath();

        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));

        String line;
        while (StringUtils.isNotEmpty(line = reader.readLine())) {
            // 2 Split the line
            String[] fields = line.split("\t");
            // 3 Cache pid -> pname in the map
            pdMap.put(fields[0], fields[1]);
        }

        // 4 Close the stream
        reader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        // 1 Read one line
        String line = value.toString();

        // 2 Split it
        String[] fields = line.split("\t");

        // 3 Get the product id
        String pId = fields[1];

        // 4 Look up the product name
        String pdName = pdMap.get(pId);

        // 5 Append the product name to the line
        k.set(line + "\t" + pdName);

        // 6 Emit
        context.write(k, NullWritable.get());
    }
}
```
- Create the DistributedCacheDriver class:

```java
package com.easysir.mapjoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.net.URI;

public class DistributedCacheDriver {

    public static void main(String[] args) throws Exception {

        // Set the input and output paths to match your own machine
        args = new String[]{"E:\\idea-workspace\\mrWordCount\\input3\\order.txt", "E:\\idea-workspace\\mrWordCount\\output"};

        // 1 Get the job instance
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // 2 Set the jar path
        job.setJarByClass(DistributedCacheDriver.class);

        // 3 Set the mapper
        job.setMapperClass(DistributedCacheMapper.class);

        // 4 Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // 5 Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 6 Load the cached file
        job.addCacheFile(new URI("file:///E:/idea-workspace/mrWordCount/input3/pd.txt"));

        // 7 A Map-side join needs no Reduce phase, so set the number of reduce tasks to 0
        job.setNumReduceTasks(0);

        // 8 Submit
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
```
- Result:

```text
1001	01	1	小米
1002	02	2	華爲
1003	03	3	格力
1004	01	4	小米
1005	02	5	華爲
1006	03	6	格力
```