Flink Two-Table Join: Stream/Batch Processing of Kafka Data into HBase

This post shows how to consume two Kafka topics with Flink, join them with the Table/SQL API, and write the joined result to HBase.

  1. Write the sample data into the Kafka topics via Flume (a plain Kafka producer sketch for testing without Flume is shown after the sample data below).

Kafka topic1 data:

name, age, sexy, proctime.proctime
java,18,男,20190516
rose,28,女,20190516
tom,38,男,20190516
jack,18,男,20190516
luoli,19,女,20190516

Kafka topic2 data:

name, age, sexy, city, proctime.proctime
java,18,男,hangzh,20190516
rose,28,女,xian,20190516
tom,38,男,shanghai,20190516
jack,18,男,beijin,20190516
luoli,19,女,baoji,20190516
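For local testing without a Flume agent, the sample rows can also be published with a plain Kafka producer. The sketch below is an assumption, not part of the original setup: the class name SampleDataProducer is hypothetical, the broker address and the topic names topic1/topic2 are placeholders matching the example above, and a kafka-clients dependency is assumed to be on the classpath.

package com.sitesh;

import java.util.Properties;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

// Hypothetical helper: pushes one sample row into each topic for a quick end-to-end test.
public class SampleDataProducer {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "xxxx.xxx.xxx.xxx:9092"); // placeholder broker list
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");

        try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
            // topic1 rows: name,age,sexy,date
            producer.send(new ProducerRecord<>("topic1", "java,18,男,20190516"));
            // topic2 rows: name,age,sexy,city,date
            producer.send(new ProducerRecord<>("topic2", "java,18,男,hangzh,20190516"));
            producer.flush();
        }
    }
}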

 

  2. Implementation code:

package com.sitesh;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.runtime.state.filesystem.FsStateBackend;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;

import java.io.IOException;
import java.util.Properties;
import java.util.logging.Level;
import java.util.logging.Logger;


public class SqlJoinWithKafka {
    public static void main(String[] args) throws Exception{

        System.out.println("use command as: ");
        System.out.println("flink run --class com.sitech.flink.examples.tablesql.SqlJoinWithKafka" +
                " /opt/test.jar --topic topic-test -bootstrap.servers xxxx.xxx.xxx.xxx:9092");
        System.out.println("******************************************************************************************");
        System.out.println("<topic> is the kafka topic name");
        System.out.println("<bootstrap.servers> is the ip:port list of brokers");
        System.out.println("******************************************************************************************");

        //Get the execution environments (DataStream + Table)
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);

        //Time characteristic (note: the tables below actually use processing-time attributes via proctime.proctime)
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        //Checkpoint configuration
        env.enableCheckpointing(5000);//trigger a checkpoint every 5000 ms (checkpoint interval)
//        env.setStateBackend(new FsStateBackend("hdfs://172.21.3.66:12001/flink/checkpoints"));

        //Set the parallelism to 1
        env.setParallelism(1);
        //Kafka configuration (bootstrap.servers can be passed via the command-line arguments shown above)
        ParameterTool paraTool = ParameterTool.fromArgs(args);
        Properties prop = new Properties();
        prop.setProperty("bootstrap.servers", paraTool.get("bootstrap.servers", "xxxxxxx"));
        prop.setProperty("group.id","sql-join");

        //Stream1: read from Kafka topic1; commit offsets to Kafka on checkpoints and start from the latest offset
        DataStream<Tuple3<String, String, String>> kafkaStream1 = env.addSource(new FlinkKafkaConsumer010<>("topic1",
                new SimpleStringSchema(),
                prop).setCommitOffsetsOnCheckpoints(true).setStartFromLatest()).map(new MapFunction<String, Tuple3<String, String, String>>() {
            @Override
            public Tuple3<String, String, String> map(String s) throws Exception {
                String[] word = s.split(",");
                return new Tuple3<>(word[0], word[1], word[2]);
            }
        });
        //Register Stream1 as Table1 (proctime.proctime appends a processing-time attribute)
        tableEnv.registerDataStream("Table1", kafkaStream1, "name, age, sexy, proctime.proctime");

        //Stream2: read from Kafka topic2; commit offsets to Kafka on checkpoints and start from the latest offset
        DataStream<Tuple4<String, String, String,String>> kafkaStream2 = env.addSource(new FlinkKafkaConsumer010<>("topic2",
                new SimpleStringSchema(),
                prop).setCommitOffsetsOnCheckpoints(true).setStartFromLatest()).map(new MapFunction<String, Tuple4<String, String, String,String>>() {
            @Override
            public Tuple4<String, String, String,String> map(String s) throws Exception {
                String[] word = s.split(",");
                return new Tuple4<>(word[0], word[1], word[2],word[3]);
            }
        });
        //Register Stream2 as Table2
        tableEnv.registerDataStream("Table2", kafkaStream2, "name, age, sexy, city, proctime.proctime");

        //Run a SQL join: stitch together the rows where Table1.name = Table2.name
        //Table1: name, age, sexy, proctime
        //Table2: name, age, sexy, city, proctime
        //Output fields: [0]name, [1]age, [2]sexy, [3]city, [4]proctime
        Table t1 = tableEnv.scan("Table2");
        System.out.println("========== Table2 data ==========");
        tableEnv.toRetractStream(t1, Row.class).print();
        Table result = tableEnv.sqlQuery(
                "SELECT t1.name, t1.age, t1.sexy,t2.city,t2.proctime\n" +
                        "FROM Table1 AS t1\n" +
                        "LEFT JOIN Table2 AS t2\n" +
                        "ON t1.name = t2.name\n"
//                "AND t1.age = t2.age\n" +
//                "AND t1.sexy = t2.sexy\n" +
                //Optionally restrict the join to a time window (interval join)
//                "AND t1.proctime BETWEEN t2.proctime - INTERVAL '10' SECOND AND t2.proctime + INTERVAL '10' SECOND"
        );


        /**
         * Convert the join result to a DataStream and write it to HBase in real time, one record per row
         */
        //(Optional) explicit result type, kept for reference
//        TupleTypeInfo<Tuple5<String, String, String,String, Timestamp>> tupleType = new TupleTypeInfo<>(
//                Types.STRING(),
//                Types.STRING(),
//                Types.STRING(),
//                Types.STRING(),
//                Types.SQL_TIMESTAMP());
        //Convert the join result to a retract stream of Tuple2<Boolean, Row>, where f0 is true for an insert and false for a retraction
        DataStream<Tuple2<Boolean,Row>> dsTuple = tableEnv.toRetractStream(result, Row.class);
        System.out.println("========== Join result ==========");
        dsTuple.print();
        //Write the query result to HBase; the map is used only for its side effect of calling HBaseSink per record
        dsTuple.rebalance().map(new MapFunction<Tuple2<Boolean, Row>, Tuple2<Boolean, Row>>() {
            @Override
            public Tuple2<Boolean, Row> map(Tuple2<Boolean, Row> value) throws IOException {
                HBaseSink(value);
                return value;
            }
        });

        try {
            System.setProperty("HADOOP_USER_NAME","e3base");
            env.execute("Kafka-Flink Sql");
        } catch (Exception e) {
            Logger.getLogger(SqlJoinWithKafka.class.getName()).log(Level.SEVERE, null, e);
            e.printStackTrace();
        }
    }

    /** HBaseSink
     * Write one joined record into a single HBase row
     * @param m the (retract flag, row) pair produced by toRetractStream
     * @throws IOException
     */
    public static void HBaseSink(Tuple2<Boolean,Row> m)throws IOException
    {
        Configuration config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.quorum", "xxxxxxxxxxxxxxxxxx");
        config.set("hbase.master", "xxxxxxxx");
        config.set("hbase.zookeeper.property.clientPort","11001");
     
        config.setInt("hbase.rpc.timeout", 20000);
        config.setInt("hbase.client.operation.timeout", 30000);
        config.setInt("hbase.client.scanner.timeout.period", 200000);
        //config.set(TableOutputFormat.OUTPUT_TABLE, hbasetable);

        Connection c = ConnectionFactory.createConnection(config);
        Admin admin = c.getAdmin();
        if(!admin.tableExists(TableName.valueOf("flinktest1"))){
            admin.createTable(new HTableDescriptor(TableName.valueOf("flinktest1")).addFamily(new HColumnDescriptor("cf")));
        }
        org.apache.hadoop.hbase.client.Table t = c.getTable(TableName.valueOf("flinktest1"));
//        TimeStamp ts = new TimeStamp(new Date());
//        Date date = ts.getDate();
        //Alternative: use the first 9 characters of the record as the rowkey
//        String rowkey = m.f1.toString().substring(0,9);
        //Use the current system timestamp as the rowkey
        String rowkey = String.valueOf(System.currentTimeMillis());
        System.out.println(rowkey);
        Put put = new Put(org.apache.hadoop.hbase.util.Bytes.toBytes(rowkey));
        put.addColumn(org.apache.hadoop.hbase.util.Bytes.toBytes("cf"), org.apache.hadoop.hbase.util.Bytes.toBytes("test"),
                org.apache.hadoop.hbase.util.Bytes.toBytes(m.f1.toString()));
        t.put(put);

        t.close();
        c.close();
    }
}
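One caveat of the HBaseSink() helper above is that it opens and closes an HBase Connection for every record. A more common pattern is a RichSinkFunction that opens the connection once per parallel task in open() and reuses it. The following is a minimal sketch under that assumption; the class name HBaseRetractSink is hypothetical, while the ZooKeeper quorum placeholder, client port 11001, table flinktest1 and column family cf are the same values used in the code above.

package com.sitesh;

import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import org.apache.flink.types.Row;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

// Hypothetical sink: keeps one HBase connection per parallel task instead of one per record.
public class HBaseRetractSink extends RichSinkFunction<Tuple2<Boolean, Row>> {
    private transient Connection connection;
    private transient Table table;

    @Override
    public void open(Configuration parameters) throws Exception {
        org.apache.hadoop.conf.Configuration config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.quorum", "xxxxxxxxxxxxxxxxxx"); // placeholder quorum
        config.set("hbase.zookeeper.property.clientPort", "11001");
        connection = ConnectionFactory.createConnection(config);
        table = connection.getTable(TableName.valueOf("flinktest1"));
    }

    @Override
    public void invoke(Tuple2<Boolean, Row> value) throws Exception {
        // Only write accumulate records (f0 == true); retractions are skipped in this sketch.
        if (!value.f0) {
            return;
        }
        String rowkey = String.valueOf(System.currentTimeMillis());
        Put put = new Put(Bytes.toBytes(rowkey));
        put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("test"), Bytes.toBytes(value.f1.toString()));
        table.put(put);
    }

    @Override
    public void close() throws Exception {
        if (table != null) { table.close(); }
        if (connection != null) { connection.close(); }
    }
}

With such a sink in place, the rebalance().map(...) block in the job could be replaced by dsTuple.addSink(new HBaseRetractSink()).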


Final result (scan of the HBase table flinktest1):

1557976616020                             column=cf:test, timestamp=1557976616088, value=java,18,\xE7\x94\xB7,hangzh,2019-05-16 03:16:55.313                          
 1557976616250                             column=cf:test, timestamp=1557976616257, value=rose,28,\xE5\xA5\xB3,xian,2019-05-16 03:16:55.324                            
 1557976616408                             column=cf:test, timestamp=1557976616414, value=tom,38,\xE7\x94\xB7,shanghai,2019-05-16 03:16:55.324                         
 1557976616560                             column=cf:test, timestamp=1557976616566, value=jack,18,\xE7\x94\xB7,beijin,2019-05-16 03:16:55.325                          
 1557976616723                             column=cf:test, timestamp=1557976616729, value=luoli,19,\xE5\xA5\xB3,baoji,2019-05-16 03:16:55.325