Joining Two Data Sources with MapReduce

In day-to-day development we often need to join multiple data sources on HDFS using a shared field as the join key, effectively producing the Cartesian product of the records that match on that key. Once you understand the basic approach to multi-source joins in MapReduce, the task is straightforward.

The basic idea: in the map phase, read the path of the current input split; when emitting map output, use that path to tell the sources apart, i.e. emit the join column as the key and embed a source tag in the value. In the reduce phase, the reduce method receives all values that share a key, so records from both sources arrive together and can be assembled into joined rows. That completes a two-source join; joins over more data sources work the same way.
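
To make this concrete, here is a made-up example (the field values and separators are hypothetical; the DS_A@_@ / DS_B@_@ tags match the demo below). Suppose both sources carry the join key in field 1:

    DS_A line:  1001|K42|foo          ->  map emits  (K42, "DS_A@_@1001,K42,foo")
    DS_B line:  2001$K42$bar$x$ACC7   ->  map emits  (K42, "DS_B@_@2001,K42,bar,x,ACC7")

The framework groups both values under key K42, so a single reduce() call sees records from both sources and can emit the joined row — keyed ACC7_1001 in the demo's scheme (DS_B's field 4 plus DS_A's field 0).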

Below is a demo I wrote:

package com.mclaren.hadoop.mr;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.lang.StringUtils;
import org.apache.directory.api.util.Strings;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.mapreduce.PutSortReducer;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mclaren.hadoop.config.ConfSource;


/**
 * 
 * @ClassName: Sync2HBaseJob
 * @Description: MapReduce jobs that join two data sources and load the result into HBase
 * @author Mclaren.Pan
 * @date 2014-11-05 09:49:04
 *
 */
public class Sync2HBaseJob {
	
	private static final Logger LOG = LoggerFactory.getLogger(Sync2HBaseJob.class);
	
	private static CommandLine cl = null; 
	
	private static ConfClz confClz = new ConfClz();
	
	public static class Sync2HBaseMapper extends Mapper<Object, Text, Text, Text> {
		private String[] headers = {};
    	
    	private String familyCol = "";
    	
    	private String rowKeyStr = "";
    	
    	private String dataSourceID = "";
    	
    	private String separator = "";
    	
    	// identifies which data source the current input split belongs to
    	private String flag;

    	private Text k;
    	
		private Text v;
    	
    	
        @Override
        protected void setup(Context context){
        	String headerStr_ = context.getConfiguration().get("headerStr");
        	headers = headerStr_.split("\\|", -1);
        	separator = context.getConfiguration().get("separator");
        	familyCol = context.getConfiguration().get("familyCol");
        	rowKeyStr = context.getConfiguration().get("rowKeyStr");
        	dataSourceID = context.getConfiguration().get("dataSourceID");
        	
        	// the path of the input split identifies which data source this
        	// mapper instance is reading (job1 registers both sources'
        	// directories as input paths)
        	FileSplit split = (FileSplit) context.getInputSplit();
        	flag = split.getPath().toString();
        }
        
        @Override
        public void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			if (StringUtils.isNotBlank(value.toString())) {
				// record comes from data source A
				if (flag.contains("DS_A")) {
					String[] vals = splitLine(value, separator);
					k = new Text(vals[1]);
					StringBuilder sb = new StringBuilder();
					for (String item : vals) {
						sb.append(item).append(",");
					}
					v = new Text("DS_A@_@"
							+ sb.delete(sb.length() - 1, sb.length())
									.toString());
				} else {
					// otherwise the record comes from data source B
					String[] vals = splitLine(value, "$");
					k = new Text(vals[1]);
					StringBuilder sb = new StringBuilder();
					for (String item : vals) {
						sb.append(item).append(",");
					}
					v = new Text("DS_B@_@"
							+ sb.delete(sb.length() - 1, sb.length())
									.toString());
				}
				context.write(k, v);
			}
    	}
        
        /**
         * 
         * @MethodName: splitLine
         * @Description: splits a line into fields on the given separator
         * @param lineValue
         * @param split
         * @return String[] 
         * @throws
         */
        private String[] splitLine(Text lineValue, String split) {
    		String line = Strings.trim(lineValue.toString());
    		String[] vals;
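    		// "|" and "$" are regex metacharacters, so they must be escaped
    		// before being passed to String.split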
    		if ("|".equals(split) || StringUtils.isEmpty(split)) {
    			vals = line.split("\\|", -1);
    		} else if ("$".equals(split)) {
    			vals = line.split("\\$", -1);
    		} else {
    			vals = line.split(split, -1);
    		}
    		return vals;
        }
        
    }
    
    /**
     * 
     * @ClassName: Sync2HBaseReducer
     * @Description: joins records from the two data sources that share the same key
     * @author Mclaren.Pan
     * @date 2014-12-02 17:01:17
     *
     */
    public static class Sync2HBaseReducer extends
		Reducer<Text, Text, Text, Text> {
    	private String[] headers = {};
    	
    	private String familyCol = "";
    	
    	private String rowKeyStr = "";
    	
    	private String dataSourceID = "";
    	
    	private String separator = "";
    	
    	@Override
		protected void setup(Context context) throws IOException,
				InterruptedException {
    		String headerStr_ = context.getConfiguration().get("headerStr");
        	headers = headerStr_.split("\\|", -1);
        	separator = context.getConfiguration().get("separator");
        	familyCol = context.getConfiguration().get("familyCol");
        	rowKeyStr = context.getConfiguration().get("rowKeyStr");
        	dataSourceID = context.getConfiguration().get("dataSourceID");
		}

		@Override
		protected void cleanup(Context context) throws IOException,
				InterruptedException {
		}
		
		@Override
		public void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			String[] finalLine = null;
			List<String> accNbrs = new ArrayList<String>();
			
			// all values sharing the same key arrive together, so this iterable
			// mixes records from both data sources; their order is not
			// guaranteed, which is why the DS_B side is buffered before joining
			for (Text val : values) {
				String[] data = val.toString().split("@_@");
				if (data[0].contains("DS_A")) {
					finalLine = splitLine(new Text(data[1]), ",");
				} else {
					accNbrs.add(splitLine(new Text(data[1]), ",")[4]);
				}
			}
			if (finalLine == null) {
				return; // no DS_A record for this key, nothing to join
			}
			StringBuilder sb = new StringBuilder();
			for (String col : finalLine) {
				sb.append(col).append(",");
			}
			String line = sb.delete(sb.length() - 1, sb.length()).toString();
			// the join can be one-to-many, so emit one row per DS_B match
			for (String acc_nbr : accNbrs) {
				context.write(new Text(acc_nbr + "_" + finalLine[0]),
						new Text(line));
			}
		}
		
        private String[] splitLine(Text lineValue, String split) {
    		String line = Strings.trim(lineValue.toString());
    		String[] vals;
    		if ("|".equals(split) || StringUtils.isEmpty(split)) {
    			vals = line.split("\\|", -1);
    		} else {
    			vals = line.split(split, -1);
    		}
    		return vals;
        }
    }
    
    /**
     * 
     * @ClassName: ExportHBaseMapper
     * @Description: converts each joined line into an HBase Put for HFile output
     * @author Mclaren.Pan
     * @date 2014-12-04 00:35:09
     *
     */
    public static class ExportHBaseMapper extends Mapper<Text, Text, ImmutableBytesWritable, Put> {

    	private String[] headers = {};
    	
    	private String separator = "";
    	
    	private String familyCol = "";
    	
    	private String rowKeyStr = "";
    	
    	private String dataSourceID = "";
    	
        @Override
        protected void setup(Context context){
        	String headerStr_ = context.getConfiguration().get("headerStr");
        	headers = headerStr_.split("\\|", -1);
        	separator = context.getConfiguration().get("separator");
        	familyCol = context.getConfiguration().get("familyCol");
        	rowKeyStr = context.getConfiguration().get("rowKeyStr");
        	dataSourceID = context.getConfiguration().get("dataSourceID");
        }
        
        @Override
        public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
        	
        	// split the joined line into columns
        	String[] vals = splitLine(value);
        	
        	// KeyValueTextInputFormat hands us job1's output key, which is
        	// already the row key
        	byte[] rowKeyVal = key.copyBytes();
    		
        	// build the Put and emit it for the HFile output format
        	try {
        		Put put = buildPutInstance(rowKeyVal, vals);
        		ImmutableBytesWritable immutableBytesWritable = new ImmutableBytesWritable();
        		immutableBytesWritable.set(rowKeyVal);
        		context.write(immutableBytesWritable, put);
        	} catch (Exception e) {
        		LOG.error("failed to build Put for row " + key, e);
        	}
    		
    	}
        
        
        /**
         * 
         * @MethodName: buildPutInstance
         * @Description: assembles a Put instance, typing each column by its configured data type
         * @param rowKeyVal
         * @param vals
         * @return Put 
         * @throws
         */
        private Put buildPutInstance(byte[] rowKeyVal, String[] vals) {
        	String item = "";
        	String itemDataType = "";
        	byte[] familyCol_bytes = Bytes.toBytes(familyCol);
        	Put put = new Put(rowKeyVal);
    		for (int i = 0; i < vals.length; i++) {
    			item = Strings.trim(vals[i]);
    			itemDataType = ConfSource.getDataTypeFromConf(dataSourceID,
    					headers[i]);
    			if ("" != itemDataType) {
    				if (itemDataType.contains("NUMBER")) {
    					if (itemDataType.contains(",")) {
    						if ("".equals(item.trim())) {
    							put.add(familyCol_bytes, Bytes.toBytes(headers[i]),
    									Bytes.toBytes(0.0));
    						}
    						else {
    							Double itemVal = 0.0;
    							itemVal = Double.parseDouble(item.trim());
    							put.add(familyCol_bytes, Bytes.toBytes(headers[i]),
    									Bytes.toBytes(itemVal));
    						}
    					} else {
    						Pattern pattern = Pattern.compile("(\\d+)");
    						Matcher matcher = pattern.matcher(itemDataType);
    						int precision = 0;
    						if (matcher.find()) {
    							precision = Integer.valueOf(matcher.group(1));
    						}
    						//// NUMBER如果超過10位,可能超過int表示範圍-2147483648~2147483647,會報錯,這裏轉成long
    						if (precision >= 10) {
    							if ("".equals(item.trim())) {
    								put.add(familyCol_bytes, Bytes.toBytes(headers[i]),
    										Bytes.toBytes(0));
    							}
    							else {
    								Double itemVal = Double.parseDouble(item.trim());
        							put.add(familyCol_bytes, Bytes.toBytes(headers[i]),
        									Bytes.toBytes(itemVal));
    							}
    						} else {
    							if ("".equals(item.trim())) {
    								put.add(familyCol_bytes, Bytes.toBytes(headers[i]),
    										Bytes.toBytes(0));
    							}
    							else {
    								Integer itemVal = 0;
    								itemVal = Integer.parseInt(item.trim());
    								put.add(familyCol_bytes, Bytes.toBytes(headers[i]),
    										Bytes.toBytes(itemVal));
    							}
    						}
    					}
    				} else {
    					put.add(familyCol_bytes, Bytes.toBytes(headers[i]),
    							Bytes.toBytes(item));
    				}
    			} else {
    				put.add(familyCol_bytes, Bytes.toBytes(headers[i]), Bytes.toBytes(item));
    			}
    		}
    		return put;
        }
        
        /**
         * 
         * @MethodName: splitLine
         * @Description: splits a line into fields using the configured separator
         * @param lineValue
         * @return String[] 
         * @throws
         */
        private String[] splitLine(Text lineValue) {
    		String line = lineValue.toString().trim();
    		String[] vals;
    		if ("|".equals(separator) || StringUtils.isEmpty(separator)) {
    			vals = line.split("\\|", -1);
    		}
    		else  {
    			vals = line.split(separator, -1);
    		}
    		return vals;
        }
        
        /**
         * 
         * @MethodName: getRowKeyVal
         * @Description: builds a (possibly composite) row key from the
         *               configured key columns; unused in this demo because
         *               job1's reducer already emits the row key
         * @param headers
         * @param colsPerLine
         * @throws InterruptedException
         * @return byte[] 
         */
        private byte[] getRowKeyVal(String[] headers, String[] colsPerLine) throws InterruptedException {
        	String[] rowKeys = rowKeyStr.split(",");
        	StringBuilder rkBuf = new StringBuilder();
        	for (String rowKey : rowKeys) {
        		int rkIdx = Arrays.asList(headers).indexOf(rowKey);
        		rkBuf.append(colsPerLine[rkIdx]).append("_");
        	}
        	int rkBufLen = rkBuf.length();
    		byte[] rowKeyVal = Bytes.toBytes(rkBuf.delete(rkBufLen - 1, rkBufLen).toString());
    		return rowKeyVal;
        }
    }

    /**
     * 
     * @MethodName: sync2HBase
     * @Description: runs the two chained jobs: the join job, then the
     *               HFile export job for HBase bulk loading
     * @throws IOException
     * @return void 
     */
    private void sync2HBase() throws IOException{
    	
		Configuration conf1 = ConfSource.getHBaseConf();
		Job job1 = new Job(conf1, "join two data sources");
		job1.setJarByClass(Sync2HBaseJob.class);
		job1.setMapperClass(Sync2HBaseMapper.class);
		job1.setReducerClass(Sync2HBaseReducer.class);
		job1.setOutputKeyClass(Text.class);
		job1.setOutputValueClass(Text.class);
		// addInputPaths accepts a comma-separated list, so the directories of
		// both data sources can be registered here
		FileInputFormat.addInputPaths(job1, confClz.getDownloadPath());
		FileOutputFormat.setOutputPath(job1, new Path("/mr_temp"));
		// pass runtime parameters down to the map/reduce tasks
		job1.getConfiguration().set("headerStr", confClz.getHeaderStr());
		job1.getConfiguration().set("separator", confClz.getSeparator());
		job1.getConfiguration().set("familyCol", confClz.getFamilyCol());
		job1.getConfiguration().set("rowKeyStr", confClz.getRowKeyStr());
		job1.getConfiguration().set("dataSourceID", confClz.getDataSourceID());
		
		try {
			job1.waitForCompletion(true);
		} catch (Exception e) {
			LOG.error("join job failed", e);
		}
		
		try {
			Configuration conf2 = ConfSource.getHBaseConf();
			Job job2 = new Job(conf2, "Import into hbase table "
					+ confClz.getHbaseTable() + " from "
					+ confClz.getDownloadPath());
			job2.setJarByClass(Sync2HBaseJob.class);
			job2.setInputFormatClass(KeyValueTextInputFormat.class);
			FileInputFormat.setInputPaths(job2, new Path("/mr_temp"));
			job2.setMapperClass(ExportHBaseMapper.class);
			HTable table = new HTable(conf2, confClz.getHbaseTable());
			job2.setReducerClass(PutSortReducer.class);
			Path outputDir = new Path(confClz.getHfilePath());
			FileOutputFormat.setOutputPath(job2, outputDir);
			job2.setMapOutputKeyClass(ImmutableBytesWritable.class);
			job2.setMapOutputValueClass(Put.class);
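			// configureIncrementalLoad sets the number of reduce tasks to the
			// table's region count and installs a total-order partitioner that
			// matches the region boundaries, so each reducer writes the HFiles
			// for one region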
			HFileOutputFormat2.configureIncrementalLoad(job2, table);
			TableMapReduceUtil.addDependencyJars(job2);

			// pass runtime parameters down to the map tasks
			job2.getConfiguration().set("headerStr", confClz.getHeaderStr());
			job2.getConfiguration().set("separator", confClz.getSeparator());
			job2.getConfiguration().set("familyCol", confClz.getFamilyCol());
			job2.getConfiguration().set("rowKeyStr", confClz.getRowKeyStr());
			job2.getConfiguration().set("dataSourceID", confClz.getDataSourceID());

			job2.waitForCompletion(true);
		} catch (Exception e) {
			LOG.error("HFile export job failed", e);
		}
		
    }
    
    /**
     * 
     * @MethodName: getConf
     * @Description: load and validate the configuration
     * @return void 
     * @throws
     */
	private void getConf() {
		
		// read the data source ID
		String dataSourceID = cl.getOptionValue("dsid");
		if (StringUtils.isEmpty(dataSourceID)) {
			LOG.error("沒有指定數據源ID");
			System.exit(1);
		}
		
		// read the column headers
		String headerStr = ConfSource.getProperty(dataSourceID + ".header");
		String[] headers = headerStr.split("\\|", -1);
		
		// read the field separator
		String separator = ConfSource.getProperty(dataSourceID + ".separator");
		
		// read the target HBase table name
		String hbaseTable = cl.getOptionValue("tb");
		
		if (StringUtils.isEmpty(hbaseTable)) {
			LOG.error("沒有指定導入的HBASE表名");
			System.exit(1);
		}
		
		// read the column family
		String familyCol = ConfSource.getProperty(dataSourceID + ".familyCol"); 
		
		// read the row key definition; composite row keys are supported
		String rowKeyStr = ConfSource.getProperty(dataSourceID + ".rowKey");
		String[] rowKeys = rowKeyStr.split(",");
		for (String rk : rowKeys) {
			int rkIdx = Arrays.asList(headers).indexOf(rk);
			if (-1 == rkIdx) {
				LOG.error("指定RowKey在列數據中未找到!");
				System.exit(1);
			}
		}

		// read the HDFS input path
		String downloadPath = "";
		String inputPath = cl.getOptionValue("path");
		if (StringUtils.isEmpty(inputPath)) {
			String dataFilePath = ConfSource.getProperty(dataSourceID + ".hdfsPath");
			if (StringUtils.isEmpty(dataFilePath)) {
				LOG.error("沒有指定數據文件地址,並且默認數據文件地址未來找到!");
				System.exit(1);
			} else {
				downloadPath = dataFilePath;
			}
		} else {
			downloadPath = inputPath;
		}

		// HDFS path where the generated HFiles will be written
		String hfilePath = cl.getOptionValue("hfilePath");
		if (StringUtils.isEmpty(hfilePath)) {
			LOG.error("必須設置hfile存放的hdfs路徑!");
			System.exit(1);
		}

		confClz.setDataSourceID(dataSourceID);
		confClz.setFamilyCol(familyCol);
		confClz.setRowKeyStr(rowKeyStr);
		confClz.setHbaseTable(hbaseTable);
		confClz.setHeaderStr(headerStr);
		confClz.setSeparator(separator);
		confClz.setDownloadPath(downloadPath);
		confClz.setHfilePath(hfilePath);
	}
    
    /**
     * 
     * @MethodName: getCommandParam
     * @Description: parse command-line arguments
     * @param args
     * @return void 
     * @throws
     */
    private void getCommandParam(String[] args) {
    	Options opt = new Options();
    	opt.addOption("dsid", true, "data source identity");
    	opt.addOption("path", true, "hdfs absolute path");
    	opt.addOption("tb", true, "which hbase table to export");
    	opt.addOption("hfilePath", "hfile path", true, "hfile output hdfs path");
    	String formatStr = "sh hadoop [this jar path][-dsid][-path][-tb][-bulkpath] ";
    	HelpFormatter formatter = new HelpFormatter();
    	CommandLineParser parser = new PosixParser();
    	try {
    		cl = parser.parse(opt, args); 
    	} catch (Exception e) {
    		formatter.printHelp(formatStr, opt);
    		System.exit(1);
    		LOG.error("", e);
    	}
    }
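
    // A hypothetical invocation (the jar name and paths are examples only):
    //   hadoop jar sync2hbase.jar com.mclaren.hadoop.mr.Sync2HBaseJob \
    //       -dsid DS_A -tb T_JOIN_RESULT -path /data/ds_a,/data/ds_b -hfilePath /hfiles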
    
    public static void main(String[] args) {
    	try {
    		Sync2HBaseJob job = new Sync2HBaseJob();
    		job.getCommandParam(args);
    		job.getConf();
    		job.sync2HBase();
    	}
    	catch (Exception e) {
    		LOG.error("job failed", e);
    	}
    }
    	
}
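
One caveat: the demo stops once job2 has written the HFiles under hfilePath; HFileOutputFormat2 does not load them into the table by itself. In the HBase releases this code targets, the final step is LoadIncrementalHFiles (the completebulkload tool). A minimal sketch, reusing the conf2, outputDir, and table variables from sync2HBase (an assumption on my part; adjust to your HBase version's API):

import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;

// after job2.waitForCompletion(true) returns successfully:
LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf2);
// moves the generated HFiles into the table's regions
loader.doBulkLoad(outputDir, table);

The same step can also be run from the shell: hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles <hfilePath> <tableName>.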

