Flink流批處理消費kafka 數據寫入hbase
- 通過flume將數據寫入kafka topic
Kafka topic1 數據:
name, age, sexy, proctime.proctime
java,18,男,20190516
rose,28,女,20190516
tom,38,男,20190516
jack,18,男,20190516
luoli,19,女,20190516
Kafka topic2 數據:
name, age, sexy, city, proctime.proctime
java,18,男,hangzh,20190516
rose,28,女,xian,20190516
tom,38,男,shanghai,20190516
jack,18,男,beijin,20190516
luoli,19,女,baoji,20190516
- 具體實現代碼:
package com.sitesh;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.runtime.state.filesystem.FsStateBackend;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import java.io.IOException;
import java.util.Properties;
import java.util.logging.Level;
import java.util.logging.Logger;
public class SqlJoinWithKafka {
    /**
     * Streaming SQL-join demo: consumes two Kafka topics (topic1: name,age,sexy;
     * topic2: name,age,sexy,city), registers each as a table with a processing-time
     * attribute, LEFT JOINs them on {@code name}, and writes every joined row to
     * the HBase table {@code flinktest1}.
     *
     * @param args parsed by {@link ParameterTool}; supports --bootstrap.servers
     * @throws Exception if the Flink job cannot be built or submitted
     */
    public static void main(String[] args) throws Exception {
        System.out.println("use command as: ");
        // Class name must match this file's package (com.sitesh), not a stale path.
        System.out.println("flink run --class com.sitesh.SqlJoinWithKafka" +
                " /opt/test.jar --topic topic-test -bootstrap.servers xxxx.xxx.xxx.xxx:9092");
        System.out.println("******************************************************************************************");
        System.out.println("<topic> is the kafka topic name");
        System.out.println("<bootstrap.servers> is the ip:port list of brokers");
        System.out.println("******************************************************************************************");
        // Obtain the streaming execution and table environments.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);
        // NOTE(review): EventTime is configured but both tables only declare
        // proctime attributes and no watermarks are assigned — looks like
        // ProcessingTime was intended; confirm before changing.
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        // Checkpoint every 5000 ms; this also drives the Kafka offset commits
        // enabled via setCommitOffsetsOnCheckpoints(true) below.
        env.enableCheckpointing(5000);
        // env.setStateBackend(new FsStateBackend("hdfs://172.21.3.66:12001/flink/checkpoints"));
        // Single-task pipeline keeps the demo output ordered and easy to read.
        env.setParallelism(1);
        // Kafka consumer configuration; --bootstrap.servers overrides the placeholder.
        ParameterTool paraTool = ParameterTool.fromArgs(args);
        Properties prop = new Properties();
        prop.setProperty("bootstrap.servers", paraTool.get("bootstrap.servers", "xxxxxxx"));
        prop.setProperty("group.id", "sql-join");
        // Stream1: topic1 records "name,age,sexy" -> Tuple3. Offsets are committed
        // on checkpoints; consumption starts from the latest offset.
        DataStream<Tuple3<String, String, String>> kafkaStream1 = env
                .addSource(new FlinkKafkaConsumer010<>("topic1", new SimpleStringSchema(), prop)
                        .setCommitOffsetsOnCheckpoints(true)
                        .setStartFromLatest())
                .map(new MapFunction<String, Tuple3<String, String, String>>() {
                    @Override
                    public Tuple3<String, String, String> map(String s) {
                        String[] word = s.split(",");
                        return new Tuple3<>(word[0], word[1], word[2]);
                    }
                });
        // Register Stream1 as Table1 with a processing-time attribute.
        tableEnv.registerDataStream("Table1", kafkaStream1, "name, age, sexy, proctime.proctime");
        // Stream2: topic2 records "name,age,sexy,city" -> Tuple4; same offset policy.
        DataStream<Tuple4<String, String, String, String>> kafkaStream2 = env
                .addSource(new FlinkKafkaConsumer010<>("topic2", new SimpleStringSchema(), prop)
                        .setCommitOffsetsOnCheckpoints(true)
                        .setStartFromLatest())
                .map(new MapFunction<String, Tuple4<String, String, String, String>>() {
                    @Override
                    public Tuple4<String, String, String, String> map(String s) {
                        String[] word = s.split(",");
                        return new Tuple4<>(word[0], word[1], word[2], word[3]);
                    }
                });
        // Register Stream2 as Table2 with a processing-time attribute.
        tableEnv.registerDataStream("Table2", kafkaStream2, "name, age, sexy, city, proctime.proctime");
        // SQL LEFT JOIN: stitch rows where Table1.name = Table2.name.
        // Table1: name, age, sexy, proctime.proctime
        // Table2: name, age, sexy, city, proctime.proctime
        // Output fields: [0]name, [1]age, [2]sexy, [3]city, [4]proctime
        Table t1 = tableEnv.scan("Table2");
        System.out.println("========== Table2數據 ==========");
        tableEnv.toRetractStream(t1, Row.class).print();
        Table result = tableEnv.sqlQuery(
                "SELECT t1.name, t1.age, t1.sexy,t2.city,t2.proctime\n" +
                "FROM Table1 AS t1\n" +
                "LEFT JOIN Table2 AS t2\n" +
                "ON t1.name = t2.name\n"
                // "AND t1.age = t2.age\n" +
                // "AND t1.sexy = t2.sexy" +
                // Restrict the join to a time window, e.g.:
                // "AND t1.proctime BETWEEN t2.proctime - INTERVAL '10' SECOND AND t2.proctime + INTERVAL '10' SECOND"
        );
        // Convert the join result to a retract stream: f0 == true means an
        // accumulate (insert) message, f0 == false a retraction of a prior row.
        DataStream<Tuple2<Boolean, Row>> dsTuple = tableEnv.toRetractStream(result, Row.class);
        System.out.println("========== Join結果 ==========");
        dsTuple.print();
        // Persist the join result to HBase, one row per record.
        dsTuple.rebalance().map(new MapFunction<Tuple2<Boolean, Row>, Object>() {
            @Override
            public Tuple2<Boolean, Row> map(Tuple2<Boolean, Row> value) throws IOException {
                // Only write accumulate messages; a LEFT JOIN retract stream also
                // emits retractions (f0 == false), which must not be re-inserted
                // into HBase as fresh rows.
                if (value.f0) {
                    HBaseSink(value);
                }
                return value;
            }
        });
        try {
            System.setProperty("HADOOP_USER_NAME", "e3base");
            env.execute("Kafka-Flink Sql");
        } catch (Exception e) {
            // Logging with the exception attached already records the stack trace.
            Logger.getLogger(SqlJoinWithKafka.class.getName()).log(Level.SEVERE, null, e);
        }
    }

    /**
     * Writes one joined record to HBase table {@code flinktest1}, column
     * {@code cf:test}, using the current wall-clock millis as the rowkey.
     *
     * <p>NOTE(review): opening a fresh HBase {@link Connection} per record is very
     * expensive; a RichSinkFunction holding one connection in open()/close() is
     * the proper fix. Kept per-call here to preserve the demo's structure, but
     * all resources are now closed via try-with-resources even on failure.
     *
     * @param m retract-stream element; {@code m.f1} is the joined row to persist
     * @throws IOException if the HBase connection or the put fails
     */
    public static void HBaseSink(Tuple2<Boolean, Row> m) throws IOException {
        Configuration config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.quorum", "xxxxxxxxxxxxxxxxxx");
        config.set("hbase.master", "xxxxxxxx");
        config.set("hbase.zookeeper.property.clientPort", "11001");
        config.setInt("hbase.rpc.timeout", 20000);
        config.setInt("hbase.client.operation.timeout", 30000);
        config.setInt("hbase.client.scanner.timeout.period", 200000);
        // config.set(TableOutputFormat.OUTPUT_TABLE, hbasetable);
        TableName tableName = TableName.valueOf("flinktest1");
        try (Connection c = ConnectionFactory.createConnection(config)) {
            // Admin was previously leaked; close it as soon as the table check is done.
            try (Admin admin = c.getAdmin()) {
                if (!admin.tableExists(tableName)) {
                    admin.createTable(new HTableDescriptor(tableName)
                            .addFamily(new HColumnDescriptor("cf")));
                }
            }
            try (org.apache.hadoop.hbase.client.Table t = c.getTable(tableName)) {
                // Rowkey = current system timestamp. NOTE(review): millisecond
                // timestamps can collide under load — rows would overwrite each other.
                String rowkey = String.valueOf(System.currentTimeMillis());
                System.out.println(rowkey);
                Put put = new Put(org.apache.hadoop.hbase.util.Bytes.toBytes(rowkey));
                put.addColumn(org.apache.hadoop.hbase.util.Bytes.toBytes("cf"),
                        org.apache.hadoop.hbase.util.Bytes.toBytes("test"),
                        org.apache.hadoop.hbase.util.Bytes.toBytes(m.f1.toString()));
                t.put(put);
            }
        }
    }
}
最後結果:
1557976616020 column=cf:test, timestamp=1557976616088, value=java,18,\xE7\x94\xB7,hangzh,2019-05-16 03:16:55.313
1557976616250 column=cf:test, timestamp=1557976616257, value=rose,28,\xE5\xA5\xB3,xian,2019-05-16 03:16:55.324
1557976616408 column=cf:test, timestamp=1557976616414, value=tom,38,\xE7\x94\xB7,shanghai,2019-05-16 03:16:55.324
1557976616560 column=cf:test, timestamp=1557976616566, value=jack,18,\xE7\x94\xB7,beijin,2019-05-16 03:16:55.325
1557976616723 column=cf:test, timestamp=1557976616729, value=luoli,19,\xE5\xA5\xB3,baoji,2019-05-16 03:16:55.325