Writing HBase data to HDFS in Parquet format with MapReduce
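
This job scans an HBase table over a given time range and writes each row's cell value out as one Parquet record on HDFS. The column list for each table is read from a properties file whose key is the HBase table name and whose value is a comma-separated list of column names; the job turns that list into a Parquet message schema made of required binary fields. As an illustration (the table and column names below are hypothetical, not from the original job), a properties entry such as

user_table=user_id,city,balance

produces the write schema

message example {
required binary user_id;
required binary city;
required binary balance;
}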

package com.sitech;

import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.apache.parquet.hadoop.example.GroupWriteSupport;
import org.apache.parquet.example.data.Group;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Properties;

public class HbaseToHdfs {
    /**
     * Loads properties from the given file path.
     *
     * @param path path to the properties file
     * @return the loaded properties
     * @throws Exception if the file cannot be read
     */
    public static Properties init(final String path) throws Exception {
        Properties properties = new Properties();
        // try-with-resources always closes the stream and lets load failures propagate
        // instead of being silently swallowed
        try (InputStream inputStream = new FileInputStream(new File(path))) {
            properties.load(inputStream);
        }
        return properties;
    }

    /**
     * Builds one "required binary" field declaration per column.
     *
     * @param cloumns comma-separated column names
     * @return the Parquet field declarations, one per line
     */
    public static String getSchema(String cloumns) {
        StringBuilder schemaBuffer = new StringBuilder();
        for (String tmp : cloumns.split(",")) {
            schemaBuffer.append("required binary ").append(tmp).append(";\n");
        }
        return schemaBuffer.toString();
    }

    /**
     * Creates and configures the HBase-to-Parquet MapReduce job.
     *
     * @param conf    the Hadoop/HBase configuration
     * @param dfsArgs the remaining command-line arguments (see {@link #main(String[])})
     * @return the configured job
     * @throws Exception the exception
     */
    public static Job createSubmittableJob(Configuration conf, String[] dfsArgs) throws Exception {
        String hbase_table = dfsArgs[0];
        String HDFS_path = dfsArgs[1];
        String start_time = dfsArgs[2];
        String end_time = dfsArgs[3];
        String family = dfsArgs[4];
        String qualifier = dfsArgs[5];
        String properties_path = dfsArgs[6];

        // Load the column list for this HBase table from the properties file
        Properties properties = init(properties_path);
        String cloumns = properties.getProperty(hbase_table).trim();

        // Build the Parquet write schema: one "required binary name;\n" entry per column
        String writeSchema = "message example {\n" +
                getSchema(cloumns) +
                "}";
        conf.set("parquet.example.schema", writeSchema);
        conf.set("cloumns", cloumns);

        conf.set("family", family);
        conf.set("qualifier", qualifier);
        conf.set("mapreduce.output.fileoutputformat.compress", "false");
        Job job = Job.getInstance(conf, "HbaseToHDFS_" + hbase_table);
        job.setJarByClass(HbaseToHdfs.class);
        job.setMapperClass(MyMapper.class);
        job.setNumReduceTasks(0);
        job.setOutputFormatClass(ParquetOutputFormat.class);

        String hbasePath = "/user/sjzx_b/bss/lib"; // TODO: HDFS directory that holds the Parquet jars (e.g. /libjars/hbase)
        ArrayList<String> parquetJars = Lists.newArrayList(
                "/parquet-column-1.8.1.jar",
                "/parquet-common-1.8.1.jar",
                "/parquet-encoding-1.8.1.jar",
                "/parquet-hadoop-1.8.1.jar",
                "/parquet-jackson-1.8.1.jar",
                "/parquet-format-2.3.0-incubating.jar");

        for (String jar : parquetJars) {
            job.addArchiveToClassPath(new Path(hbasePath + jar));
        }


        TableMapReduceUtil.initTableMapperJob(
                initScans(hbase_table, start_time, end_time), MyMapper.class, Void.class, Group.class, job);
        FileSystem fs = FileSystem.get(conf);
        Path out = new Path(HDFS_path);
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
//        FileOutputFormat.setOutputPath(job, out);
        ParquetOutputFormat.setOutputPath(job, new Path(HDFS_path));
        ParquetOutputFormat.setWriteSupportClass(job, GroupWriteSupport.class);
        return job;
    }

    private static List<Scan> initScans(String tableName, String start_time, String end_time) {
        Scan scan = new Scan();
        scan.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, Bytes.toBytes(tableName));
        scan.setCaching(500);
        scan.setCacheBlocks(false);

        SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
        try {
            long timeStart = sdf.parse(start_time).getTime();
            long timeEnd = sdf.parse(end_time).getTime();
            scan.setTimeRange(timeStart, timeEnd);
        } catch (ParseException e) {
            System.out.println("Error: start/end time must match the yyyyMMdd format");
            e.printStackTrace();
        } catch (IOException e) {
            System.out.println("Error: failed to set the scan time range");
            e.printStackTrace();
        }
        return Lists.newArrayList(scan);
    }
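
    // Illustration (hypothetical dates, not from the original job): with start_time "20240101" and
    // end_time "20240102", both strings are parsed as local-time midnight, so the scan returns cell
    // timestamps from 2024-01-01 00:00:00 (inclusive) up to 2024-01-02 00:00:00 (exclusive), per the
    // [minStamp, maxStamp) semantics of Scan.setTimeRange.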

    /**
     * Mapper that converts each HBase row into a Parquet {@link Group}.
     */
    public static class MyMapper extends TableMapper<Void, Group> {
        private String family;
        private String qualifier;
        private String cloumns;
        private SimpleGroupFactory factory;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            family = context.getConfiguration().get("family");
            qualifier = context.getConfiguration().get("qualifier");
            cloumns = context.getConfiguration().get("cloumns");
            factory = new SimpleGroupFactory(GroupWriteSupport.getSchema(context.getConfiguration()));
        }

        @Override
        protected void map(ImmutableBytesWritable key, Result r, Context context) throws IOException, InterruptedException {
            if (r != null) {
                byte[] value = r.getValue(Bytes.toBytes(family), Bytes.toBytes(qualifier));
                if (value == null) {
                    return; // the row does not contain the configured family:qualifier
                }
                String stringValue = new String(value, "UTF-8");
                Group group = factory.newGroup();
                context.write(null, getGroup(stringValue, group));
            }
        }

        /**
         * Splits the cell value on the ASCII BEL (0x07) delimiter and appends one field per
         * configured column.
         *
         * @param stringValue the raw cell value
         * @param group       the group to populate
         * @return the populated group
         */
        protected Group getGroup(String stringValue, Group group) {
            String[] cloum = cloumns.split(",");
            String[] value = stringValue.split("\u0007");
            // Only populate the group when the value has exactly one field per configured column;
            // otherwise the group is returned empty.
            if (cloum.length == value.length) {
                for (int i = 0; i < cloum.length; i++) {
                    group.append(cloum[i], value[i]);
                }
            }
            return group;
        }
    }

    /**
     * The entry point of application.
     *
     * @param args the input arguments: 1.hbase_table 2.HDFS output path 3.start time (yyyyMMdd)
     *             4.end time (yyyyMMdd) 5.column family 6.qualifier 7.properties file path
     * @throws Exception the exception
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        String[] dfsArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (dfsArgs.length != 7) {
            System.err.println("Wrong number of arguments: " + dfsArgs.length);
            System.err.println("Usage: HbaseToHdfs <hbase_table> <hdfs_output_path> "
                    + "<start_time yyyyMMdd> <end_time yyyyMMdd> <family> <qualifier> <properties_path>");
            System.exit(2);
        }

        long jobStartTime = System.currentTimeMillis();

        Job job = createSubmittableJob(conf, dfsArgs);
        boolean success = job.waitForCompletion(true);

        long jobEndTime = System.currentTimeMillis();
        long elapsedMillis = jobEndTime - jobStartTime;

        SimpleDateFormat sdf = new SimpleDateFormat("HH:mm:ss");
        String time1 = sdf.format(new Date(jobStartTime));
        String time2 = sdf.format(new Date(jobEndTime));
        String record = " StartTime: " + time1 + ", EndTime: " + time2 + ", ElapsedTime: " + (elapsedMillis / 1000.0) + "s\n";

        System.out.println(record);
        System.exit(success ? 0 : 1);
    }
}
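
To see what the mapper hands to ParquetOutputFormat for a single row, here is a minimal standalone sketch of the same Parquet example API (MessageTypeParser, SimpleGroupFactory, Group) used above. The class name, schema, column list, and sample cell value are made up for illustration and are not part of the original job:

import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class GroupDemo {
    public static void main(String[] args) {
        // Hypothetical schema matching a "user_id,city,balance" column list.
        MessageType schema = MessageTypeParser.parseMessageType(
                "message example {\n"
                        + "required binary user_id;\n"
                        + "required binary city;\n"
                        + "required binary balance;\n"
                        + "}");
        SimpleGroupFactory factory = new SimpleGroupFactory(schema);

        // A made-up HBase cell value: fields joined by the ASCII BEL (0x07) character,
        // which is the delimiter MyMapper.getGroup splits on.
        String cellValue = "u001" + "\u0007" + "beijing" + "\u0007" + "12.50";
        String[] columns = "user_id,city,balance".split(",");
        String[] values = cellValue.split("\u0007");

        // Populate one field per column, exactly as getGroup does.
        Group group = factory.newGroup();
        for (int i = 0; i < columns.length; i++) {
            group.append(columns[i], values[i]);
        }
        System.out.println(group); // prints the field values one per line
    }
}

Run it with the Parquet jars listed in createSubmittableJob on the classpath; the printed group is what each map() call writes as one Parquet record in the job above.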