Writing HBase data to HDFS in Parquet format with MapReduce
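
This job scans an HBase table over a given time range and writes each row's cell value out as one Parquet record on HDFS. The column list for each table is read from a properties file whose key is the HBase table name and whose value is a comma-separated list of column names; the job turns that list into a Parquet message schema made of required binary fields. As an illustration (the table and column names below are hypothetical, not from the original job), a properties entry such as

user_table=user_id,city,balance

produces the write schema

message example {
required binary user_id;
required binary city;
required binary balance;
}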

package com.sitech;

import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.apache.parquet.hadoop.example.GroupWriteSupport;
import org.apache.parquet.example.data.Group;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Properties;

public class HbaseToHdfs {
    /**
     * Loads properties from the given file path.
     *
     * @param path path to the properties file
     * @return the loaded properties
     * @throws Exception if the file cannot be read
     */
    public static Properties init(final String path) throws Exception {
        Properties properties = new Properties();
        // try-with-resources always closes the stream and lets load failures propagate
        // instead of being silently swallowed
        try (InputStream inputStream = new FileInputStream(new File(path))) {
            properties.load(inputStream);
        }
        return properties;
    }

    /**
     * Builds one "required binary" field declaration per column.
     *
     * @param cloumns comma-separated column names
     * @return the Parquet field declarations, one per line
     */
    public static String getSchema(String cloumns) {
        StringBuilder schemaBuffer = new StringBuilder();
        for (String tmp : cloumns.split(",")) {
            schemaBuffer.append("required binary ").append(tmp).append(";\n");
        }
        return schemaBuffer.toString();
    }

    /**
     * Creates and configures the HBase-to-Parquet MapReduce job.
     *
     * @param conf    the Hadoop/HBase configuration
     * @param dfsArgs the remaining command-line arguments (see {@link #main(String[])})
     * @return the configured job
     * @throws Exception the exception
     */
    public static Job createSubmittableJob(Configuration conf, String[] dfsArgs) throws Exception {
        String hbase_table = dfsArgs[0];
        String HDFS_path = dfsArgs[1];
        String start_time = dfsArgs[2];
        String end_time = dfsArgs[3];
        String family = dfsArgs[4];
        String qualifier = dfsArgs[5];
        String properties_path = dfsArgs[6];

        // Load the column list for this HBase table from the properties file
        Properties properties = init(properties_path);
        String cloumns = properties.getProperty(hbase_table).trim();

        // Build the Parquet write schema: one "required binary name;\n" entry per column
        String writeSchema = "message example {\n" +
                getSchema(cloumns) +
                "}";
        conf.set("parquet.example.schema", writeSchema);
        conf.set("cloumns", cloumns);

        conf.set("family", family);
        conf.set("qualifier", qualifier);
        conf.set("mapreduce.output.fileoutputformat.compress", "false");
        Job job = Job.getInstance(conf, "HbaseToHDFS_" + hbase_table);
        job.setJarByClass(HbaseToHdfs.class);
        job.setMapperClass(MyMapper.class);
        job.setNumReduceTasks(0);
        job.setOutputFormatClass(ParquetOutputFormat.class);

        String hbasePath = "/user/sjzx_b/bss/lib"; // TODO: HDFS directory that holds the Parquet jars (e.g. /libjars/hbase)
        ArrayList<String> parquetJars = Lists.newArrayList(
                "/parquet-column-1.8.1.jar",
                "/parquet-common-1.8.1.jar",
                "/parquet-encoding-1.8.1.jar",
                "/parquet-hadoop-1.8.1.jar",
                "/parquet-jackson-1.8.1.jar",
                "/parquet-format-2.3.0-incubating.jar");

        for (String jar : parquetJars) {
            job.addArchiveToClassPath(new Path(hbasePath + jar));
        }


        TableMapReduceUtil.initTableMapperJob(
                initScans(hbase_table, start_time, end_time), MyMapper.class, Void.class, Group.class, job);
        FileSystem fs = FileSystem.get(conf);
        Path out = new Path(HDFS_path);
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
//        FileOutputFormat.setOutputPath(job, out);
        ParquetOutputFormat.setOutputPath(job, new Path(HDFS_path));
        ParquetOutputFormat.setWriteSupportClass(job, GroupWriteSupport.class);
        return job;
    }

    private static List<Scan> initScans(String tableName, String start_time, String end_time) {
        Scan scan = new Scan();
        scan.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, Bytes.toBytes(tableName));
        scan.setCaching(500);
        scan.setCacheBlocks(false);

        SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
        try {
            long timeStart = sdf.parse(start_time).getTime();
            long timeEnd = sdf.parse(end_time).getTime();
            scan.setTimeRange(timeStart, timeEnd);
        } catch (ParseException e) {
            System.out.println("Error: start/end time must match the yyyyMMdd format");
            e.printStackTrace();
        } catch (IOException e) {
            System.out.println("Error: failed to set the scan time range");
            e.printStackTrace();
        }
        return Lists.newArrayList(scan);
    }
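
    // Illustration (hypothetical dates, not from the original job): with start_time "20240101" and
    // end_time "20240102", both strings are parsed as local-time midnight, so the scan returns cell
    // timestamps from 2024-01-01 00:00:00 (inclusive) up to 2024-01-02 00:00:00 (exclusive), per the
    // [minStamp, maxStamp) semantics of Scan.setTimeRange.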

    /**
     * Mapper that converts each HBase row into a Parquet {@link Group}.
     */
    public static class MyMapper extends TableMapper<Void, Group> {
        private String family;
        private String qualifier;
        private String cloumns;
        private SimpleGroupFactory factory;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            family = context.getConfiguration().get("family");
            qualifier = context.getConfiguration().get("qualifier");
            cloumns = context.getConfiguration().get("cloumns");
            factory = new SimpleGroupFactory(GroupWriteSupport.getSchema(context.getConfiguration()));
        }

        @Override
        protected void map(ImmutableBytesWritable key, Result r, Context context) throws IOException, InterruptedException {
            if (r != null) {
                byte[] value = r.getValue(Bytes.toBytes(family), Bytes.toBytes(qualifier));
                if (value == null) {
                    return; // the row does not contain the configured family:qualifier
                }
                String stringValue = new String(value, "UTF-8");
                Group group = factory.newGroup();
                context.write(null, getGroup(stringValue, group));
            }
        }

        /**
         * Splits the cell value on the ASCII BEL (0x07) delimiter and appends one field per
         * configured column.
         *
         * @param stringValue the raw cell value
         * @param group       the group to populate
         * @return the populated group
         */
        protected Group getGroup(String stringValue, Group group) {
            String[] cloum = cloumns.split(",");
            String[] value = stringValue.split("\u0007");
            // Only populate the group when the value has exactly one field per configured column;
            // otherwise the group is returned empty.
            if (cloum.length == value.length) {
                for (int i = 0; i < cloum.length; i++) {
                    group.append(cloum[i], value[i]);
                }
            }
            return group;
        }
    }

    /**
     * The entry point of application.
     *
     * @param args the input arguments: 1.hbase_table 2.HDFS output path 3.start time (yyyyMMdd)
     *             4.end time (yyyyMMdd) 5.column family 6.qualifier 7.properties file path
     * @throws Exception the exception
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        String[] dfsArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (dfsArgs.length != 7) {
            System.err.println("Wrong number of arguments: " + dfsArgs.length);
            System.err.println("Usage: HbaseToHdfs <hbase_table> <hdfs_output_path> "
                    + "<start_time yyyyMMdd> <end_time yyyyMMdd> <family> <qualifier> <properties_path>");
            System.exit(2);
        }

        long jobStartTime = System.currentTimeMillis();

        Job job = createSubmittableJob(conf, dfsArgs);
        boolean success = job.waitForCompletion(true);

        long jobEndTime = System.currentTimeMillis();
        long elapsedMillis = jobEndTime - jobStartTime;

        SimpleDateFormat sdf = new SimpleDateFormat("HH:mm:ss");
        String time1 = sdf.format(new Date(jobStartTime));
        String time2 = sdf.format(new Date(jobEndTime));
        String record = " StartTime: " + time1 + ", EndTime: " + time2 + ", ElapsedTime: " + (elapsedMillis / 1000.0) + "s\n";

        System.out.println(record);
        System.exit(success ? 0 : 1);
    }
}
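
To see what the mapper hands to ParquetOutputFormat for a single row, here is a minimal standalone sketch of the same Parquet example API (MessageTypeParser, SimpleGroupFactory, Group) used above. The class name, schema, column list, and sample cell value are made up for illustration and are not part of the original job:

import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class GroupDemo {
    public static void main(String[] args) {
        // Hypothetical schema matching a "user_id,city,balance" column list.
        MessageType schema = MessageTypeParser.parseMessageType(
                "message example {\n"
                        + "required binary user_id;\n"
                        + "required binary city;\n"
                        + "required binary balance;\n"
                        + "}");
        SimpleGroupFactory factory = new SimpleGroupFactory(schema);

        // A made-up HBase cell value: fields joined by the ASCII BEL (0x07) character,
        // which is the delimiter MyMapper.getGroup splits on.
        String cellValue = "u001" + "\u0007" + "beijing" + "\u0007" + "12.50";
        String[] columns = "user_id,city,balance".split(",");
        String[] values = cellValue.split("\u0007");

        // Populate one field per column, exactly as getGroup does.
        Group group = factory.newGroup();
        for (int i = 0; i < columns.length; i++) {
            group.append(columns[i], values[i]);
        }
        System.out.println(group); // prints the field values one per line
    }
}

Run it with the Parquet jars listed in createSubmittableJob on the classpath; the printed group is what each map() call writes as one Parquet record in the job above.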