Writing HBase data to HDFS in Parquet format with MapReduce

package com.sitech;

import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.apache.parquet.hadoop.example.GroupWriteSupport;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Properties;

/**
 * Exports an HBase table to HDFS as Parquet via a map-only MapReduce job.
 * Worked examples of the column configuration and launch command follow the code.
 */
public class HbaseToHdfs {

    /**
     * Loads the job properties from the given file path.
     *
     * @param path the path of the properties file
     * @return the loaded properties
     * @throws Exception if the file cannot be read
     */
    public static Properties init(final String path) throws Exception {
        Properties properties = new Properties();
        InputStream inputStream = new FileInputStream(new File(path));
        try {
            properties.load(inputStream);
        } finally {
            inputStream.close();
        }
        return properties;
    }

    /**
     * Builds the field list of the Parquet schema from a comma-separated column list.
     *
     * @param columns the comma-separated column names
     * @return one "required binary <name>;" line per column
     */
    public static String getSchema(String columns) {
        StringBuilder schemaBuilder = new StringBuilder();
        for (String column : columns.split(",")) {
            schemaBuilder.append("required binary ").append(column).append(";\n");
        }
        return schemaBuilder.toString();
    }

    /**
     * Creates the submittable job.
     *
     * @param conf    the configuration
     * @param dfsArgs the command-line arguments
     * @return the configured job
     * @throws Exception if job setup fails
     */
    public static Job createSubmittableJob(Configuration conf, String[] dfsArgs) throws Exception {
        String hbaseTable = dfsArgs[0];
        String hdfsPath = dfsArgs[1];
        String startTime = dfsArgs[2];
        String endTime = dfsArgs[3];
        String family = dfsArgs[4];
        String qualifier = dfsArgs[5];
        String propertiesPath = dfsArgs[6];

        // Load the column configuration for this table.
        Properties properties = init(propertiesPath);
        String columns = properties.getProperty(hbaseTable).trim();

        // Build the Parquet write schema: one "required binary <name>;" per column.
        String writeSchema = "message example {\n" + getSchema(columns) + "}";
        conf.set("parquet.example.schema", writeSchema);
        conf.set("columns", columns);
        conf.set("family", family);
        conf.set("qualifier", qualifier);
        conf.set("mapreduce.output.fileoutputformat.compress", "false");

        Job job = Job.getInstance(conf, "HbaseToHDFS_" + hbaseTable);
        job.setJarByClass(HbaseToHdfs.class);
        job.setMapperClass(MyMapper.class);
        job.setNumReduceTasks(0);
        job.setOutputFormatClass(ParquetOutputFormat.class);

        // Ship the Parquet jars to the task classpath. TODO: make the HDFS lib path configurable, e.g. /libjars/hbase
        String libPath = "/user/sjzx_b/bss/lib";
        List<String> jars = new ArrayList<String>();
        jars.add("/parquet-column-1.8.1.jar");
        jars.add("/parquet-common-1.8.1.jar");
        jars.add("/parquet-encoding-1.8.1.jar");
        jars.add("/parquet-hadoop-1.8.1.jar");
        jars.add("/parquet-jackson-1.8.1.jar");
        jars.add("/parquet-format-2.3.0-incubating.jar");
        for (String jar : jars) {
            job.addArchiveToClassPath(new Path(libPath + jar));
        }

        TableMapReduceUtil.initTableMapperJob(initScans(hbaseTable, startTime, endTime),
                MyMapper.class, Void.class, Group.class, job);

        // Remove the output directory if it already exists.
        FileSystem fs = FileSystem.get(conf);
        Path out = new Path(hdfsPath);
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
        ParquetOutputFormat.setOutputPath(job, out);
        ParquetOutputFormat.setWriteSupportClass(job, GroupWriteSupport.class);
        return job;
    }

    private static List<Scan> initScans(String tableName, String startTime, String endTime) {
        Scan scan = new Scan();
        scan.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, Bytes.toBytes(tableName));
        scan.setCaching(500);
        scan.setCacheBlocks(false);
        SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
        try {
            long timeStart = sdf.parse(startTime).getTime();
            long timeEnd = sdf.parse(endTime).getTime();
            scan.setTimeRange(timeStart, timeEnd);
        } catch (ParseException e) {
            System.out.println("Error: invalid date format, expected yyyyMMdd");
            e.printStackTrace();
        } catch (IOException e) {
            System.out.println("Error: IOException while setting the scan time range");
            e.printStackTrace();
        }
        return Lists.newArrayList(scan);
    }

    /**
     * The mapper: converts each HBase row into a Parquet Group.
     */
    public static class MyMapper extends TableMapper<Void, Group> {
        private String family;
        private String qualifier;
        private String columns;
        private SimpleGroupFactory factory;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            family = context.getConfiguration().get("family");
            qualifier = context.getConfiguration().get("qualifier");
            columns = context.getConfiguration().get("columns");
            factory = new SimpleGroupFactory(GroupWriteSupport.getSchema(context.getConfiguration()));
        }

        @Override
        protected void map(ImmutableBytesWritable key, Result r, Context context)
                throws IOException, InterruptedException {
            if (r == null) {
                return;
            }
            byte[] value = r.getValue(family.getBytes(), qualifier.getBytes());
            // Skip rows that have no value in the configured family:qualifier.
            if (value == null) {
                return;
            }
            String stringValue = new String(value, "UTF-8");
            Group group = factory.newGroup();
            context.write(null, getGroup(stringValue, group));
        }

        /**
         * Fills the group with the column values parsed from the cell value.
         *
         * @param stringValue the \u0007-delimited cell value
         * @param group       the group to fill
         * @return the filled group
         */
        protected Group getGroup(String stringValue, Group group) {
            String[] columnNames = columns.split(",");
            String[] values = stringValue.split("\u0007");
            if (columnNames.length == values.length) {
                for (int i = 0; i < columnNames.length; i++) {
                    group.append(columnNames[i], values[i]);
                }
            }
            return group;
        }
    }

    /**
     * The entry point of the application.
     *
     * @param args 1. hbase_table 2. output HDFS path 3. start_time 4. end_time 5. family 6. qualifier 7. properties_path
     * @throws Exception the exception
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        String[] dfsArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (dfsArgs.length != 7) {
            System.err.println("Wrong number of arguments: " + dfsArgs.length);
            System.exit(2);
        }
        long jobStartTime = System.currentTimeMillis();
        Job job = createSubmittableJob(conf, dfsArgs);
        job.waitForCompletion(true);
        long jobEndTime = System.currentTimeMillis();
        long elapsedTime = jobEndTime - jobStartTime;
        SimpleDateFormat sdf = new SimpleDateFormat("HH:mm:ss");
        String record = " StartTime: " + sdf.format(new Date(jobStartTime))
                + ", EndTime: " + sdf.format(new Date(jobEndTime))
                + ", ElapsedTime: " + (elapsedTime / 1000.0) + "s";
        System.out.println(record);
    }
}
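The job reads its column layout from the properties file passed as the seventh argument: each key is an HBase table name, and the value is the comma-separated list of logical columns packed into that table's single \u0007-delimited cell. As a hypothetical example (the table and column names are illustrative, not from the original post), an entry such as

    user_info=name,age,city

would make getSchema produce the write schema

    message example {
    required binary name;
    required binary age;
    required binary city;
    }

and the mapper would then expect each cell value to carry exactly three \u0007-separated fields, e.g. alice\u000730\u0007beijing; rows whose field count does not match the column list are written as empty groups.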
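A minimal launch sketch; the jar name and every path are assumptions, but the seven positional arguments match what main expects (table, output path, start date, end date, family, qualifier, properties file; dates in yyyyMMdd form, used to bound the scan via Scan.setTimeRange):

    hadoop jar hbase-to-hdfs.jar com.sitech.HbaseToHdfs \
        user_info /user/sjzx_b/out/user_info 20180101 20180201 f q /home/sjzx_b/columns.properties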
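To spot-check the result, here is a hedged read-back sketch using the parquet-hadoop example API; the class name is made up, the part-file path is passed as an argument, and the name field comes from the hypothetical schema above:

    import org.apache.hadoop.fs.Path;
    import org.apache.parquet.example.data.Group;
    import org.apache.parquet.hadoop.ParquetReader;
    import org.apache.parquet.hadoop.example.GroupReadSupport;

    public class ReadBackCheck {
        public static void main(String[] args) throws Exception {
            // args[0]: a Parquet part file written by the job (hypothetical path).
            ParquetReader<Group> reader =
                    ParquetReader.builder(new GroupReadSupport(), new Path(args[0])).build();
            Group group;
            int shown = 0;
            // Print the first ten rows; every field was written as required binary,
            // so it reads back as a string.
            while ((group = reader.read()) != null && shown++ < 10) {
                System.out.println(group.getString("name", 0));
            }
            reader.close();
        }
    }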