平時開發的時候,經常會遇到需要將HDFS上多個數據源的某個字段做爲關聯字段、得出多個數據源的笛卡兒積的場景。瞭解了MapReduce多數據源關聯的基本思路後,實現起來就很簡單。
基本思路:在Map輸入階段獲取輸入路徑,在Map輸出階段根據路徑的不同加以區分,即將關聯的列作爲Key,並在Value中標記該條數據來自哪個數據源;接着在Reduce階段,reduce方法的入參會得到Key相同的所有Value的集合,這樣便可對數據進行相應的組裝,從而完成2個數據源的關聯,多個數據源的關聯也類似。
下面附上本人寫的一個demo:
package com.mclaren.hadoop.mr;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.lang.StringUtils;
import org.apache.directory.api.util.Strings;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.mapreduce.PutSortReducer;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mclaren.hadoop.config.ConfSource;
/**
*
* @ClassName: Sync2HBaseJob
* @Description: 數據入hbase MapReduce
* @author Mclaren.Pan
* @date 2014年11月5日 上午9:49:04
*
*/
public class Sync2HBaseJob {
// SLF4J logger used by the driver methods of this class and by ExportHBaseMapper.
private static final Logger LOG = LoggerFactory.getLogger(Sync2HBaseJob.class);
// Parsed command line; assigned in getCommandParam(...) and read in getConf().
private static CommandLine cl = null;
// Holder for the resolved job settings; populated in getConf() and read in sync2HBase().
private static ConfClz confClz = new ConfClz();
/**
 * Map side of the two-source join.
 *
 * <p>Each input line is tagged with the data source it came from (decided by
 * whether the path of the current input split contains {@code DS_A}) and is
 * emitted under its join column, which is the second column of the line
 * (index 1). The emitted value has the form {@code <tag>@_@<col0>,<col1>,...}.
 *
 * <p>Lines from data source A are split with the configured {@code separator};
 * all other lines are split on {@code $}.
 */
public static class Sync2HBaseMapper extends Mapper<Object, Text, Text, Text> {
    /** Index of the column both data sources are joined on. */
    private static final int JOIN_COLUMN = 1;

    /** Column separator for data source A, read from the job configuration. */
    private String separator = "";
    /** Path of the current input split; used to tell the data sources apart. */
    private String flag;
    /** Output key/value holders, reused across calls to avoid per-record allocation. */
    private final Text k = new Text();
    private final Text v = new Text();

    /**
     * Reads the separator from the job configuration and remembers the path
     * of the split this task is processing.
     */
    @Override
    protected void setup(Context context) {
        separator = context.getConfiguration().get("separator");
        FileSplit split = (FileSplit) context.getInputSplit();
        flag = split.getPath().toString();
    }

    /**
     * Tags a non-blank input line with its data source and emits it under the
     * join column.
     *
     * <p>Lines that do not contain the join column are counted under the
     * {@code Sync2HBaseMapper/MALFORMED_LINES} counter and skipped instead of
     * failing the whole task with an {@code ArrayIndexOutOfBoundsException}.
     *
     * @param key     input key (unused)
     * @param value   one line of input
     * @param context task context used to emit the tagged record
     */
    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        if (StringUtils.isBlank(value.toString())) {
            return;
        }
        boolean fromSourceA = flag.contains("DS_A");
        String[] vals = splitLine(value, fromSourceA ? separator : "$");
        if (vals.length <= JOIN_COLUMN) {
            context.getCounter("Sync2HBaseMapper", "MALFORMED_LINES").increment(1);
            return;
        }
        k.set(vals[JOIN_COLUMN]);
        v.set((fromSourceA ? "DS_A@_@" : "DS_B@_@") + StringUtils.join(vals, ","));
        context.write(k, v);
    }

    /**
     * Splits a trimmed line into columns, keeping trailing empty columns.
     *
     * @param lineValue the raw line
     * @param split     the separator; {@code |} (also used when empty or null)
     *                  and {@code $} are matched literally, any other value is
     *                  passed to {@link String#split(String, int)} as a regular
     *                  expression
     * @return the columns of the line
     */
    private String[] splitLine(Text lineValue, String split) {
        String line = Strings.trim(lineValue.toString());
        if (StringUtils.isEmpty(split) || "|".equals(split)) {
            return line.split("\\|", -1);
        }
        if ("$".equals(split)) {
            return line.split("\\$", -1);
        }
        return line.split(split, -1);
    }
}
/**
 * Reduce side of the two-source join.
 *
 * <p>For one join key, receives the tagged records produced by
 * {@code Sync2HBaseMapper} ({@code <tag>@_@<comma-joined columns>}) and emits
 * one output record for every (data source A record, data source B record)
 * pair. The output key is {@code <B column 4>_<A column 0>} and the output
 * value is the comma-joined data source A record.
 *
 * <p>Both sides are buffered before anything is emitted, because the order in
 * which values reach the reducer is not guaranteed; pairing records while
 * streaming would silently drop matches whenever records do not arrive as
 * "one A first, then all B". The buffers hold all records of a single key in
 * memory.
 */
public static class Sync2HBaseReducer extends
        Reducer<Text, Text, Text, Text> {
    /** Index of the data source B column that prefixes the output row key. */
    private static final int ACC_NBR_COLUMN = 4;

    /**
     * Joins all data source A records with all data source B records that
     * share {@code key}.
     *
     * <p>Values without a payload, and data source B records that have no
     * column {@value #ACC_NBR_COLUMN}, are counted under the
     * {@code Sync2HBaseReducer/MALFORMED_RECORDS} counter and skipped.
     *
     * @param key     the join key
     * @param values  tagged records from both data sources
     * @param context task context used to emit the joined records
     */
    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        List<String[]> sourceARows = new ArrayList<String[]>();
        List<String> accNbrs = new ArrayList<String>();

        for (Text val : values) {
            // Convert to String right away: the framework may reuse the Text
            // instance between iterations, so the object itself must not be kept.
            // The limit of 2 keeps a payload intact even if it contains "@_@".
            String[] data = val.toString().split("@_@", 2);
            if (data.length < 2) {
                context.getCounter("Sync2HBaseReducer", "MALFORMED_RECORDS").increment(1);
                continue;
            }
            String[] cols = Strings.trim(data[1]).split(",", -1);
            if (data[0].contains("DS_A")) {
                sourceARows.add(cols);
            } else if (cols.length > ACC_NBR_COLUMN) {
                accNbrs.add(cols[ACC_NBR_COLUMN]);
            } else {
                context.getCounter("Sync2HBaseReducer", "MALFORMED_RECORDS").increment(1);
            }
        }

        // One-to-many and many-to-many matches must all be written.
        for (String[] row : sourceARows) {
            String joinedRow = StringUtils.join(row, ",");
            for (String accNbr : accNbrs) {
                context.write(new Text(accNbr + "_" + row[0]), new Text(joinedRow));
            }
        }
    }
}
/**
*
* @ClassName: ExportHBaseMapper
* @Description: TODO
* @author Mclaren.Pan
* @date 2014年12月4日 上午12:35:09
*
*/
public static class ExportHBaseMapper extends Mapper<Text, Text, ImmutableBytesWritable, Put> {
private String[] headers = {};
private String separator = "";
private String familyCol = "";
private String rowKeyStr = "";
private String dataSourceID = "";
@Override
protected void setup(Context context){
String headerStr_ = context.getConfiguration().get("headerStr");
headers = headerStr_.split("\\|", -1);
separator = context.getConfiguration().get("separator");
familyCol = context.getConfiguration().get("familyCol");
rowKeyStr = context.getConfiguration().get("rowKeyStr");
dataSourceID = context.getConfiguration().get("dataSourceID");
}
@Override
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
//分解行
String[] vals = splitLine(value);
//獲取rowKey
byte[] rowKeyVal = key.copyBytes();
//插入hbase
try {
Put put = buildPutInstance(rowKeyVal, vals);
ImmutableBytesWritable immutableBytesWritable = new ImmutableBytesWritable();
immutableBytesWritable.set(rowKeyVal);
context.write(immutableBytesWritable, put);
} catch (Exception e) {
LOG.error("", e);
}
}
/**
*
* @MethodName: buildPutInstance
* @Description: 組裝Put實例
* @param rowKeyVal
* @param vals
* @return Put
* @throws
*/
private Put buildPutInstance(byte[] rowKeyVal, String[] vals) {
String item = "";
String itemDataType = "";
byte[] familyCol_bytes = Bytes.toBytes(familyCol);
Put put = new Put(rowKeyVal);
for (int i = 0; i < vals.length; i++) {
item = Strings.trim(vals[i]);
itemDataType = ConfSource.getDataTypeFromConf(dataSourceID,
headers[i]);
if ("" != itemDataType) {
if (itemDataType.contains("NUMBER")) {
if (itemDataType.contains(",")) {
if ("".equals(item.trim())) {
put.add(familyCol_bytes, Bytes.toBytes(headers[i]),
Bytes.toBytes(0.0));
}
else {
Double itemVal = 0.0;
itemVal = Double.parseDouble(item.trim());
put.add(familyCol_bytes, Bytes.toBytes(headers[i]),
Bytes.toBytes(itemVal));
}
} else {
Pattern pattern = Pattern.compile("(\\d+)");
Matcher matcher = pattern.matcher(itemDataType);
int precision = 0;
if (matcher.find()) {
precision = Integer.valueOf(matcher.group(1));
}
//// NUMBER如果超過10位,可能超過int表示範圍-2147483648~2147483647,會報錯,這裏轉成long
if (precision >= 10) {
if ("".equals(item.trim())) {
put.add(familyCol_bytes, Bytes.toBytes(headers[i]),
Bytes.toBytes(0));
}
else {
Double itemVal = Double.parseDouble(item.trim());
put.add(familyCol_bytes, Bytes.toBytes(headers[i]),
Bytes.toBytes(itemVal));
}
} else {
if ("".equals(item.trim())) {
put.add(familyCol_bytes, Bytes.toBytes(headers[i]),
Bytes.toBytes(0));
}
else {
Integer itemVal = 0;
itemVal = Integer.parseInt(item.trim());
put.add(familyCol_bytes, Bytes.toBytes(headers[i]),
Bytes.toBytes(itemVal));
}
}
}
} else {
put.add(familyCol_bytes, Bytes.toBytes(headers[i]),
Bytes.toBytes(item));
}
} else {
put.add(familyCol_bytes, Bytes.toBytes(headers[i]), Bytes.toBytes(item));
}
}
return put;
}
/**
*
* @MethodName: splitLine
* @Description: 分割行
* @param lineValue
* @return String[]
* @throws
*/
private String[] splitLine(Text lineValue) {
String line = lineValue.toString().trim();
String[] vals;
if ("|".equals(separator) || StringUtils.isEmpty(separator)) {
vals = line.split("\\|", -1);
}
else {
vals = line.split(separator, -1);
}
return vals;
}
/**
*
* @MethodName: getRowKeyVal
* @Description: 獲取rowkey
* @param headers
* @param colsPerLine
* @throws InterruptedException
* @return byte[]
* @throws
*/
private byte[] getRowKeyVal(String[] headers, String[] colsPerLine) throws InterruptedException {
String[] rowKeys = rowKeyStr.split(",");
StringBuilder rkBuf = new StringBuilder();
for (String rowKey : rowKeys) {
int rkIdx = Arrays.asList(headers).indexOf(rowKey);
rkBuf.append(colsPerLine[rkIdx]).append("_");
}
int rkBufLen = rkBuf.length();
byte[] rowKeyVal = Bytes.toBytes(rkBuf.delete(rkBufLen - 1, rkBufLen).toString());
return rowKeyVal;
}
}
/**
 * Runs the two-stage pipeline.
 *
 * <ol>
 *   <li>Job 1 joins the input data sources ({@code Sync2HBaseMapper} /
 *       {@code Sync2HBaseReducer}) and writes the result to {@code /mr_temp}.</li>
 *   <li>Job 2 reads {@code /mr_temp} as key/value text and turns it into HFiles
 *       for the target HBase table ({@code ExportHBaseMapper} plus
 *       {@code HFileOutputFormat2.configureIncrementalLoad}).</li>
 * </ol>
 *
 * <p>Job 2 is only started when job 1 succeeded; otherwise it would read
 * missing or stale intermediate data.
 *
 * @throws IOException if a job cannot be set up, is interrupted, or fails
 */
private void sync2HBase() throws IOException {
    Path tempDir = new Path("/mr_temp");

    Configuration conf1 = ConfSource.getHBaseConf();
    Job job1 = new Job(conf1, "2個數據源關聯");
    job1.setJarByClass(Sync2HBaseJob.class);
    job1.setMapperClass(Sync2HBaseMapper.class);
    job1.setReducerClass(Sync2HBaseReducer.class);
    job1.setOutputKeyClass(Text.class);
    job1.setOutputValueClass(Text.class);
    FileInputFormat.addInputPaths(job1, confClz.getDownloadPath());
    // Remove the intermediate output of a previous run: the job refuses to
    // start when its output directory already exists.
    tempDir.getFileSystem(conf1).delete(tempDir, true);
    FileOutputFormat.setOutputPath(job1, tempDir);
    applyJobParams(job1.getConfiguration());
    runJob(job1);

    Configuration conf2 = ConfSource.getHBaseConf();
    Job job2 = new Job(conf2, "Import into hbase table"
            + confClz.getHbaseTable() + " from "
            + confClz.getDownloadPath());
    job2.setJarByClass(Sync2HBaseJob.class);
    job2.setInputFormatClass(KeyValueTextInputFormat.class);
    FileInputFormat.setInputPaths(job2, tempDir);
    job2.setMapperClass(ExportHBaseMapper.class);
    HTable table = new HTable(conf2, confClz.getHbaseTable());
    try {
        job2.setReducerClass(PutSortReducer.class);
        FileOutputFormat.setOutputPath(job2, new Path(confClz.getHfilePath()));
        job2.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job2.setMapOutputValueClass(Put.class);
        HFileOutputFormat2.configureIncrementalLoad(job2, table);
        TableMapReduceUtil.addDependencyJars(job2);
        applyJobParams(job2.getConfiguration());
        runJob(job2);
    } finally {
        // Release the table's client resources whether or not the job succeeded.
        table.close();
    }
}

/** Copies the resolved job settings into a job configuration so that tasks can read them. */
private void applyJobParams(Configuration jobConf) {
    jobConf.set("headerStr", confClz.getHeaderStr());
    jobConf.set("separator", confClz.getSeparator());
    jobConf.set("familyCol", confClz.getFamilyCol());
    jobConf.set("rowKeyStr", confClz.getRowKeyStr());
    jobConf.set("dataSourceID", confClz.getDataSourceID());
}

/** Runs a job to completion and turns every kind of failure into an IOException. */
private void runJob(Job job) throws IOException {
    boolean succeeded;
    try {
        succeeded = job.waitForCompletion(true);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        throw new IOException("Interrupted while waiting for job: " + job.getJobName(), e);
    } catch (ClassNotFoundException e) {
        throw new IOException("Could not load classes for job: " + job.getJobName(), e);
    }
    if (!succeeded) {
        throw new IOException("Job failed: " + job.getJobName());
    }
}
/**
 * Resolves the job settings from the parsed command line and the data source
 * configuration, validates them, and stores them in {@code confClz}.
 *
 * <p>Reads the {@code dsid}, {@code tb}, {@code path} and {@code hfilePath}
 * command-line options and the {@code <dsid>.header}, {@code <dsid>.separator},
 * {@code <dsid>.familyCol}, {@code <dsid>.rowKey} and {@code <dsid>.hdfsPath}
 * properties. Logs an error and terminates the JVM with status 1 when a
 * required setting is missing or when a row key column is not one of the
 * header columns.
 */
private void getConf() {
    // Data source id.
    String dataSourceID = cl.getOptionValue("dsid");
    if (StringUtils.isEmpty(dataSourceID)) {
        LOG.error("沒有指定數據源ID");
        System.exit(1);
    }
    // Column names, '|'-separated. Checked before use so that a missing
    // property is reported instead of causing a NullPointerException.
    String headerStr = ConfSource.getProperty(dataSourceID + ".header");
    if (StringUtils.isEmpty(headerStr)) {
        LOG.error("數據源配置中未找到列信息(header)!");
        System.exit(1);
    }
    String[] headers = headerStr.split("\\|", -1);
    // Column separator of the data files.
    String separator = ConfSource.getProperty(dataSourceID + ".separator");
    // Target HBase table.
    String hbaseTable = cl.getOptionValue("tb");
    if (StringUtils.isEmpty(hbaseTable)) {
        LOG.error("沒有指定導入的HBASE表名");
        System.exit(1);
    }
    // Column family.
    String familyCol = ConfSource.getProperty(dataSourceID + ".familyCol");
    // Row key; a comma-separated list of columns forms a composite row key.
    String rowKeyStr = ConfSource.getProperty(dataSourceID + ".rowKey");
    if (StringUtils.isEmpty(rowKeyStr)) {
        LOG.error("數據源配置中未找到RowKey!");
        System.exit(1);
    }
    for (String rk : rowKeyStr.split(",")) {
        if (!Arrays.asList(headers).contains(rk)) {
            LOG.error("指定RowKey在列數據中未找到!");
            System.exit(1);
        }
    }
    // HDFS input path: the command-line option wins, the configured default is the fallback.
    String downloadPath = cl.getOptionValue("path");
    if (StringUtils.isEmpty(downloadPath)) {
        downloadPath = ConfSource.getProperty(dataSourceID + ".hdfsPath");
        if (StringUtils.isEmpty(downloadPath)) {
            LOG.error("沒有指定數據文件地址,並且默認數據文件地址未來找到!");
            System.exit(1);
        }
    }
    // HDFS directory the HFiles are written to.
    String hfilePath = cl.getOptionValue("hfilePath");
    if (StringUtils.isEmpty(hfilePath)) {
        LOG.error("必須設置hfile存放的hdfs路徑!");
        System.exit(1);
    }
    confClz.setDataSourceID(dataSourceID);
    confClz.setFamilyCol(familyCol);
    confClz.setRowKeyStr(rowKeyStr);
    confClz.setHbaseTable(hbaseTable);
    confClz.setHeaderStr(headerStr);
    confClz.setSeparator(separator);
    confClz.setDownloadPath(downloadPath);
    confClz.setHfilePath(hfilePath);
}
/**
 * Parses the command-line arguments into the shared {@code cl} field.
 *
 * <p>Recognised options: {@code dsid}, {@code path}, {@code tb} and
 * {@code hfilePath}, each taking one argument. When parsing fails, the error
 * is logged, the usage help is printed, and the JVM terminates with status 1.
 *
 * @param args the raw command-line arguments
 */
private void getCommandParam(String[] args) {
    Options opt = new Options();
    opt.addOption("dsid", true, "data source identity");
    opt.addOption("path", true, "hdfs absolute path");
    opt.addOption("tb", true, "which hbase table to export");
    opt.addOption("hfilePath", "hfile path", true, "hfile output hdfs path");
    String formatStr = "sh hadoop [this jar path][-dsid][-path][-tb][-hfilePath] ";
    CommandLineParser parser = new PosixParser();
    try {
        cl = parser.parse(opt, args);
    } catch (Exception e) {
        // Log before exiting; a statement placed after System.exit never runs.
        LOG.error("Failed to parse command line arguments", e);
        new HelpFormatter().printHelp(formatStr, opt);
        System.exit(1);
    }
}
/**
 * Entry point: parses the command line, resolves the job settings and runs
 * the join and HFile-generation jobs. Exits with status 1 when any step
 * fails, so that calling scripts and schedulers can detect the failure.
 *
 * @param args the command-line arguments
 */
public static void main(String[] args) {
    try {
        Sync2HBaseJob job = new Sync2HBaseJob();
        job.getCommandParam(args);
        job.getConf();
        job.sync2HBase();
    } catch (Exception e) {
        LOG.error("Sync2HBaseJob failed", e);
        System.exit(1);
    }
}
}