Flink針對標準的流處理和批處理提供了兩種關係型API:Table API 和 SQL。Table API 可以直接進行select、filter、join等操作;Flink SQL則是基於Apache Calcite實現標準的SQL,和SQL語言一致,適合大部分開發人員。
Flink Table API和SQL 捆綁在Flink-Table依賴中,如果要使用需要添加以下依賴:
以Flink 1.7.2爲例
<!--java-->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table_2.11</artifactId>
<version>1.7.2</version>
</dependency>
<!--scala-->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-scala_2.11</artifactId>
<version>1.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-scala_2.11</artifactId>
<version>1.7.2</version>
</dependency>
Table API和SQL的基本使用
一、首先需要創建一個TableEnvironment。TableEnvironment可以實現以下功能:
- 通過內部目錄創建表
- 通過外部目錄創建表
- 執行sql查詢
- 註冊用戶自定義的Function
- 將DataStream和DataSet 轉換成 Table
流數據查詢
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
StreamTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);
批數據查詢
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
BatchTableEnvironment tableEnv = BatchTableEnvironment.getTableEnvironment(env);
二、通過獲取到的TableEnvironment對象創建Table對象,有兩種類型的Table對象:輸入Table(TableSource)和輸出Table(TableSink)
TableSource
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
StreamTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);
//CsvTableSource: 文件路徑、字段名、字段類型
TableSource csvSource = new CsvTableSource("path",new String[]{"name","age"},new TypeInformation[]{Types.STRING,Types.INT});
//註冊一個TableSource,稱爲CsvTable
tableEnv.registerTableSource("CsvTable", csvSource);
TableSink
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
StreamTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);
//通過TableSink把數據寫到外部
//創建一個TableSink
TableSink csvSink = new CsvTableSink("path", ","); //文件路徑、字段分隔符(如 ",")
//定義字段名和類型
String[] fieldNames = {"cid", "cname", "revsum"};
TypeInformation[] filedTypes = {Types.INT, Types.STRING, Types.INT};
//註冊一個TableSink
tableEnv.registerTableSink("CsvSinkTable", fieldNames, filedTypes, csvSink);
三、使用Table API和SQL操作
SQL
//使用SQL操作table
//計算來自法國的收入
Table revenue = tableEnv.sqlQuery("select cid, cname, sum(revenue) as revsum " +
"from orders " +
"where country = 'france' " +
"group by cid, cname");
Table API
Table orders = tableEnv.scan("orders");
Table revenue = orders.filter("country === 'france'").groupBy("cid, cname").select("cid, cname, revenue.sum as revSum");
四、DataStream、DataSet和Table之間的轉換
Table->DataStream
//將Table中的數據轉化成DataStream<ROW>
DataStream<Row> dsRow = ((org.apache.flink.table.api.java.StreamTableEnvironment) tableEnv).toAppendStream(表對象, Row.class);
//將Table中的數據轉化成DataStream<Tuple2>
TupleTypeInfo<Tuple2<String, Integer>> tupleType = new TupleTypeInfo<>(Types.STRING, Types.INT);
DataStream<Tuple2<String, Integer>> dsTuple = ((org.apache.flink.table.api.java.StreamTableEnvironment) tableEnv).toAppendStream(revenue, tupleType);
Table->DataSet
//將Table中的數據轉化成DataSet<Row>
DataSet<Row> dsRow = ((org.apache.flink.table.api.java.BatchTableEnvironment) tableEnv).toDataSet(表對象, Row.class);
//將Table中的數據轉化成DataSet<Tuple2>
TupleTypeInfo<Tuple2<String, Integer>> tupleType = new TupleTypeInfo<>(Types.STRING, Types.INT);
DataSet<Tuple2<String, Integer>> dsTuple = ((org.apache.flink.table.api.java.BatchTableEnvironment) tableEnv).toDataSet(revenue, tupleType);
DataStream->Table
//將DataStream轉化成Table
DataStream<Tuple2<String,String>> stream = ...;
((org.apache.flink.table.api.java.StreamTableEnvironment) tableEnv).registerDataStream("mytable", stream);
完整代碼
package com.basic;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.typeutils.TupleTypeInfo;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.StreamTableEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.sinks.CsvTableSink;
import org.apache.flink.table.sinks.TableSink;
import org.apache.flink.table.sources.CsvTableSource;
import org.apache.flink.table.sources.TableSource;
import org.apache.flink.types.Row;
/**
* FlinkTable 任務 流數據查詢
*/
public class FlinkTableJobStream {
    public static void main(String[] args) {
        // 1. Create the streaming TableEnvironment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);

        // Create a TableSource.
        // CsvTableSource: file path, field names, field types.
        TableSource csvSource = new CsvTableSource("path",
                new String[]{"name", "age"},
                new TypeInformation[]{Types.STRING, Types.INT});
        // Register the TableSource under the name "CsvTable".
        tableEnv.registerTableSource("CsvTable", csvSource);

        // Query with SQL: revenue of customers from France.
        // BUG FIX: the original concatenation lacked spaces between clauses and
        // produced the invalid SQL "...from orderswhere country = 'france'group by...".
        // NOTE(review): only "CsvTable" is registered above; querying "orders" will
        // fail at runtime unless an "orders" table is also registered — confirm intent.
        Table revenue = tableEnv.sqlQuery("select cid, cname, sum(revenue) as revsum " +
                "from orders " +
                "where country = 'france' " +
                "group by cid, cname");

        // Convert the Table into an append-only DataStream<Row>.
        DataStream<Row> dsRow =
                ((org.apache.flink.table.api.java.StreamTableEnvironment) tableEnv)
                        .toAppendStream(revenue, Row.class);
        // Convert the Table into a DataStream<Tuple2<String, Integer>>.
        TupleTypeInfo<Tuple2<String, Integer>> tupleType =
                new TupleTypeInfo<>(Types.STRING, Types.INT);
        DataStream<Tuple2<String, Integer>> dsTuple =
                ((org.apache.flink.table.api.java.StreamTableEnvironment) tableEnv)
                        .toAppendStream(revenue, tupleType);

        // Write the result out through a TableSink.
        // CsvTableSink: file path, field delimiter.
        // BUG FIX: the second constructor argument is the field delimiter; the
        // original passed the placeholder text "字段之間的格式 ," which would be
        // written literally between fields.
        TableSink csvSink = new CsvTableSink("path", ",");
        // Define the sink's field names and types.
        String[] fieldNames = {"cid", "cname", "revsum"};
        TypeInformation[] fieldTypes = {Types.INT, Types.STRING, Types.INT};
        // Register the TableSink under the name "CsvSinkTable".
        tableEnv.registerTableSink("CsvSinkTable", fieldNames, fieldTypes, csvSink);
        // Emit the query result into the registered sink.
        revenue.insertInto("CsvSinkTable");
    }
}
package com.basic;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.typeutils.TupleTypeInfo;
import org.apache.flink.table.api.BatchTableEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.sinks.CsvTableSink;
import org.apache.flink.table.sinks.TableSink;
import org.apache.flink.types.Row;
/**
* FlinkTable 任務 批數據查詢
*/
public class FlinkTableBatch {
    public static void main(String[] args) {
        // 1. Create the batch TableEnvironment.
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        BatchTableEnvironment tableEnv = BatchTableEnvironment.getTableEnvironment(env);

        // Query with SQL: revenue of customers from France.
        // BUG FIX: the original concatenation lacked spaces between clauses and
        // produced the invalid SQL "...from orderswhere country = 'france'group by...".
        // NOTE(review): no table named "orders" is registered in this class; the
        // query will fail at runtime until a source is registered — confirm intent.
        Table revenue = tableEnv.sqlQuery("select cid, cname, sum(revenue) as revsum " +
                "from orders " +
                "where country = 'france' " +
                "group by cid, cname");

        // Convert the Table into a DataSet<Row>.
        DataSet<Row> dsRow =
                ((org.apache.flink.table.api.java.BatchTableEnvironment) tableEnv)
                        .toDataSet(revenue, Row.class);
        // Convert the Table into a DataSet<Tuple2<String, Integer>>.
        TupleTypeInfo<Tuple2<String, Integer>> tupleType =
                new TupleTypeInfo<>(Types.STRING, Types.INT);
        DataSet<Tuple2<String, Integer>> dsTuple =
                ((org.apache.flink.table.api.java.BatchTableEnvironment) tableEnv)
                        .toDataSet(revenue, tupleType);

        // Write the result out through a TableSink.
        // CsvTableSink: file path, field delimiter.
        // BUG FIX: the second constructor argument is the field delimiter; the
        // original passed the placeholder text "字段之間的格式 ," which would be
        // written literally between fields.
        TableSink csvSink = new CsvTableSink("path", ",");
        // Define the sink's field names and types.
        String[] fieldNames = {"cid", "cname", "revsum"};
        TypeInformation[] fieldTypes = {Types.INT, Types.STRING, Types.INT};
        // Register the TableSink under the name "CsvSinkTable".
        tableEnv.registerTableSink("CsvSinkTable", fieldNames, fieldTypes, csvSink);
        // Emit the query result into the registered sink.
        revenue.insertInto("CsvSinkTable");
    }
}
pom文件
<dependencies>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>1.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_2.11</artifactId>
<version>1.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka-0.11_2.11</artifactId>
<version>1.7.2</version>
</dependency>
<!--flink table核心包-->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table_2.11</artifactId>
<version>1.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-scala_2.11</artifactId>
<version>1.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-scala_2.11</artifactId>
<version>1.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-core</artifactId>
<version>1.7.2</version>
</dependency>
</dependencies>