文章目錄
map端join算法實現
原理闡述
適用於關聯表中有小表的情形:
可以將小表分發到所有的map節點,這樣,map節點就可以在本地對自己所讀到的大表數據進行join
並輸出最終結果,可以大大提高join操作的併發度,加快處理速度
實現示例
--先在mapper類中預先定義好小表,進行join
--引入實際場景中的解決方案:一次加載數據庫或者用distributedcache
總結
適用場景
一個大表join一個小表
實現方式:
a. 將小表先準備在一個hdfs的目錄中
b. 在代碼的main方法中用job.addCacheFile()將其分發到maptask的工作目錄下;還需要將reduce task的數量設置爲0
c. 在代碼的mapper的setup方法中用本地文件api讀取小表文件到內存中
d. 在map方法中根據輸入數據匹配內存小表進行拼接即可
代碼實現
緩存小表
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Map-side join mapper: loads the small "product" table into memory once per
 * task (setup), then appends the product name to every order line it reads
 * (map). Emits the joined line as the key with a NullWritable value, so the
 * job runs map-only.
 */
public class CacheMap extends Mapper<LongWritable, Text, Text, NullWritable> {

    // In-memory copy of the small table: product id -> product name.
    Map<String, String> pMap = new HashMap<>();
    Text k = new Text();

    /**
     * Loads the product file into {@link #pMap}.
     *
     * NOTE(review): the absolute Windows path is a development artifact; in a
     * real cluster the file should be shipped with job.addCacheFile() and read
     * here by its local name in the task working directory.
     */
    @Override
    protected void setup(Mapper<LongWritable, Text, Text, NullWritable>.Context context)
            throws IOException, InterruptedException {
        // try-with-resources guarantees the reader is closed even on error.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream("C:\\Users\\55454_000\\Desktop\\product.txt"), "UTF-8"))) {
            String line;
            // Bug fix: the original condition StringUtils.isNotEmpty(readLine())
            // stopped at the first blank line instead of reading through to EOF.
            while ((line = reader.readLine()) != null) {
                if (StringUtils.isBlank(line)) {
                    continue; // tolerate blank lines instead of terminating
                }
                String[] fields = line.split(",");
                pMap.put(fields[0], fields[1]);
            }
        }
    }

    /**
     * Joins one order line (format: orderId,date,pid,...) against the cached
     * product table and writes "originalLine \t productName".
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] fields = line.split(",");
        // Product id is the third column of the order record.
        String pid = fields[2];
        // Robustness: emit an empty name instead of the literal "null" when the
        // product id has no match in the small table.
        String pName = pMap.getOrDefault(pid, "");
        k.set(line + "\t" + pName);
        context.write(k, NullWritable.get());
    }
}
主函數Driver
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver for the map-side join job: map-only, with compressed output.
 *
 * args[0] = input path (big order table), args[1] = output path.
 */
public class Driver {
    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        Configuration conf = new Configuration();
        // Bug fix: configure the Configuration BEFORE Job.getInstance(conf) —
        // the Job copies the Configuration, so settings applied to conf after
        // job creation were silently ignored in the original.
        conf.setBoolean("mapreduce.map.output.compress", true);
        // Bug fix: the codec belongs under "...compress.codec"; the original
        // overwrote the boolean "...compress" key with a class name.
        conf.setClass("mapreduce.map.output.compress.codec", BZip2Codec.class, CompressionCodec.class);

        Job job = Job.getInstance(conf);
        job.setJarByClass(Driver.class);
        job.setMapperClass(CacheMap.class);
        // Ship the small table to every map task's working directory.
        // job.addCacheFile(new URI("file:///C:/Users/55454_000/Desktop/product.txt"));
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        // Map-side join is map-only: no reducers (see step b of the recipe).
        job.setNumReduceTasks(0);

        // Compress the final job output with bzip2.
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
reduce端join實現
實現
自定義數據類型
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
/**
 * Joined order/product record for the reduce-side join: used as the map
 * output value and as the reduce output key.
 *
 * The {@code flag} field marks the source table of a pre-join record:
 * "0" = order table, "1" = product table.
 *
 * NOTE(review): writeUTF/readUTF reject null strings, so every String field
 * must be set (possibly to "") before the bean is serialized — the mappers
 * in this file always fill all fields via setInfobeanWritable.
 */
public class InfobeanWritable implements Writable {
private int order_id; // order id
private String date; // order date
private String pid; // product id (the join key)
private String name; // product name
private String category_id; // product category
private double price; // product price
private String flag;// source marker: "0" = order table, "1" = product table
// No-arg constructor: required by Hadoop to instantiate the Writable
// during deserialization.
public InfobeanWritable() {
}
// Convenience constructor: delegates to the all-fields setter.
public InfobeanWritable(int order_id, String date, String pid, int amount, String name, String category_id,
double price, String flag) {
this.setInfobeanWritable(order_id, date, pid, amount, name, category_id, price, flag);
}
@Override
public String toString() {
return "InfobeanWritable [order_id=" + order_id + ", date=" + date + ", pid=" + pid + ", amount=" + amount
+ ", name=" + name + ", category_id=" + category_id + ", price=" + price + ", flag=" + flag + "]";
}
// Sets every field in one call, letting callers reuse a single bean
// instance per input record (the mapper does exactly that).
public void setInfobeanWritable(int order_id, String date, String pid, int amount, String name, String category_id,
double price, String flag) {
this.order_id = order_id;
this.date = date;
this.pid = pid;
this.amount = amount;
this.name = name;
this.category_id = category_id;
this.price = price;
this.flag = flag;
}
public int getOrder_id() {
return order_id;
}
public void setOrder_id(int order_id) {
this.order_id = order_id;
}
public String getDate() {
return date;
}
public void setDate(String date) {
this.date = date;
}
public String getPid() {
return pid;
}
public void setPid(String pid) {
this.pid = pid;
}
public int getAmount() {
return amount;
}
public void setAmount(int amount) {
this.amount = amount;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getCategory_id() {
return category_id;
}
public void setCategory_id(String category_id) {
this.category_id = category_id;
}
public double getPrice() {
return price;
}
public void setPrice(double price) {
this.price = price;
}
public String getFlag() {
return flag;
}
public void setFlag(String flag) {
this.flag = flag;
}
@Override
public void readFields(DataInput in) throws IOException {
// Deserialization: field order MUST exactly mirror write() below.
this.order_id = in.readInt();
this.date = in.readUTF();
this.pid = in.readUTF();
this.amount = in.readInt();
this.name = in.readUTF();
this.category_id = in.readUTF();
this.price = in.readDouble();
this.flag = in.readUTF();
}
@Override
public void write(DataOutput out) throws IOException {
// Serialization: field order MUST exactly mirror readFields() above.
out.writeInt(this.order_id);
out.writeUTF(this.date);
out.writeUTF(this.pid);
out.writeInt(this.amount);
out.writeUTF(this.name);
out.writeUTF(this.category_id);
out.writeDouble(this.price);
out.writeUTF(this.flag);
}
}
map端
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
/**
 * Map side of the reduce-side join.
 *
 * Decides which table a line came from by its input file name, tags the
 * record with a flag ("0" = order, "1" = product), and emits it keyed by
 * product id so matching records of both tables meet in one reduce() call.
 */
public class MapJoin extends Mapper<LongWritable, Text, Text, InfobeanWritable> {

    // Reused across map() calls to avoid a per-record allocation.
    Text outputkey = new Text();
    InfobeanWritable infobean = new InfobeanWritable();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        // The source table is identified by the file this split belongs to.
        FileSplit fileSplit = (FileSplit) context.getInputSplit();
        String fileName = fileSplit.getPath().getName();
        String[] fields = line.split(",");
        String pid;
        if (fileName.startsWith("order")) {
            // Order record: orderId,date,pid,amount — product fields left empty.
            // parseInt/parseDouble avoid the needless boxing of valueOf.
            int order_id = Integer.parseInt(fields[0]);
            String date = fields[1];
            pid = fields[2];
            int amount = Integer.parseInt(fields[3]);
            infobean.setInfobeanWritable(order_id, date, pid, amount, "", "", 0, "0");
        } else {
            // Product record: pid,name,categoryId,price — order fields left empty.
            pid = fields[0];
            String name = fields[1];
            String category_id = fields[2];
            double price = Double.parseDouble(fields[3]);
            infobean.setInfobeanWritable(0, "", pid, 0, name, category_id, price, "1");
        }
        // Join key: the product id shared by both tables.
        outputkey.set(pid);
        context.write(outputkey, infobean);
    }
}
reduce端
import java.io.IOException;
import java.util.ArrayList;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Reduce side of the join: for one product id, collects the single product
 * record (flag "1") and all order records (flag "0"), then emits each order
 * enriched with the product's name, category and price.
 *
 * Records must be copied out of the values iterable because Hadoop reuses
 * the same Writable instance on every iteration.
 */
public class JoinReduce extends Reducer<Text, InfobeanWritable, InfobeanWritable, NullWritable> {

    /**
     * Copies every field of src into dst via the bean's own all-fields setter.
     * Replaces the original reflective BeanUtils.copyProperties call, which
     * paid reflection cost per record and swallowed its checked exceptions
     * with printStackTrace.
     */
    private static void copyInto(InfobeanWritable dst, InfobeanWritable src) {
        dst.setInfobeanWritable(src.getOrder_id(), src.getDate(), src.getPid(), src.getAmount(),
                src.getName(), src.getCategory_id(), src.getPrice(), src.getFlag());
    }

    @Override
    protected void reduce(Text key, Iterable<InfobeanWritable> values, Context context)
            throws IOException, InterruptedException {
        ArrayList<InfobeanWritable> orderlist = new ArrayList<>();
        InfobeanWritable pdBean = new InfobeanWritable();
        for (InfobeanWritable value : values) {
            if ("1".equals(value.getFlag())) {
                // Product record: keep a private copy for the join below.
                copyInto(pdBean, value);
            } else {
                // Order record: copy before buffering (Hadoop reuses 'value').
                InfobeanWritable odBean = new InfobeanWritable();
                copyInto(odBean, value);
                orderlist.add(odBean);
            }
        }
        // Enrich every buffered order with the product's fields and emit it.
        for (InfobeanWritable bean : orderlist) {
            bean.setName(pdBean.getName());
            bean.setCategory_id(pdBean.getCategory_id());
            bean.setPrice(pdBean.getPrice());
            context.write(bean, NullWritable.get());
        }
    }
}
主函數Driver
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver for the reduce-side join job.
 *
 * args[0] = input path (order and product files), args[1] = output path.
 */
public class Driver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(Driver.class);
        job.setMapperClass(MapJoin.class);
        job.setReducerClass(JoinReduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(InfobeanWritable.class);
        // Bug fix: JoinReduce writes (InfobeanWritable, NullWritable) pairs; the
        // original declared Text as the output key class, which triggers a
        // key-class mismatch error at runtime when the reducer emits a record.
        job.setOutputKeyClass(InfobeanWritable.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
數據壓縮
作用
有效減少磁盤空間或者IO的帶寬
常用的壓縮的方式
壓縮的格式 | 是否切分 | 解壓縮 |
---|---|---|
gzip | 否 | 不需要處理 |
Bzip2 | 是 | 不需要處理 |
Snappy | 否 | 不需要處理 |
Snappy 特點
Snappy 需要單獨安裝(Hadoop 發行版默認可能不含其本地庫;在 Hive 等組件中使用前同樣需先安裝)
Snappy 速度是最快的
使用壓縮的情況
在不頻繁進行計算的時候, 並且有大量文件傳輸的情景下可以使用壓縮
使用階段
1.輸入階段
2.map輸出階段
// 在driver類中開啓map端的壓縮
config.setBoolean("mapreduce.map.output.compress", true);
//設置壓縮方式
config.setClass("mapreduce.map.output.compress.codec", BZip2Codec.class, CompressionCodec.class);
3.reduce輸出階段
//開啓reduce端壓縮
FileOutputFormat.setCompressOutput(job, true);
//壓縮格式的設置
FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);