Analysis and Usage of the Flink Table API and SQL (Part 1)

Flink provides two relational APIs for both stream and batch processing: the Table API and SQL. The Table API lets you apply operations such as select, filter, and join directly on tables, while Flink SQL, built on Apache Calcite, implements standard SQL and is therefore accessible to most developers.

The Table API and SQL are bundled in the flink-table dependency. To use them, add the following dependencies (Flink 1.7.2 is used as the example throughout):

        <!--java-->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table_2.11</artifactId>
            <version>1.7.2</version>
        </dependency>
        <!--scala-->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-scala_2.11</artifactId>
            <version>1.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_2.11</artifactId>
            <version>1.7.2</version>
        </dependency>

Basic Usage of the Table API and SQL

Step 1: Create a TableEnvironment. The TableEnvironment provides the following capabilities:

  1. Creating tables in the internal catalog
  2. Creating tables from an external catalog
  3. Executing SQL queries
  4. Registering user-defined functions (a sketch follows the environment examples below)
  5. Converting a DataStream or DataSet into a Table

Streaming query

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);

Batch query

        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        BatchTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);
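
Item 4 of the list above mentions user-defined functions. Below is a minimal sketch of registering and calling a scalar function, assuming the tableEnv created above and the CsvTable registered in Step 2 below; HashCode and the query are hypothetical examples (the class needs org.apache.flink.table.functions.ScalarFunction and is declared at class level):

        //A minimal user-defined scalar function (hypothetical example)
        public static class HashCode extends ScalarFunction {
            public int eval(String s) {
                return s.hashCode();
            }
        }

        //Register the function, then call it from SQL (or the Table API)
        tableEnv.registerFunction("hashCode", new HashCode());
        Table hashed = tableEnv.sqlQuery("SELECT name, hashCode(name) FROM CsvTable");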

Step 2: Create Table objects through the TableEnvironment. There are two kinds: input tables (TableSource) and output tables (TableSink).

TableSource

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);
        
        //CsvTableSource: file path, field names, field types ("path" is a placeholder)
        TableSource csvSource = new CsvTableSource("path", new String[]{"name", "age"}, new TypeInformation[]{Types.STRING, Types.INT});
        //Register the TableSource under the name CsvTable
        tableEnv.registerTableSource("CsvTable", csvSource);
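
CsvTableSource also ships a builder, which reads more clearly for wider schemas; a sketch equivalent to the constructor call above:

        //Equivalent construction via the builder API
        CsvTableSource builtSource = CsvTableSource.builder()
                .path("path")
                .field("name", Types.STRING)
                .field("age", Types.INT)
                .build();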

TableSink

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);
        
        //Write data out through a TableSink
        //Create a TableSink: file path and field delimiter
        TableSink csvSink = new CsvTableSink("path", ",");
        //Define the field names and types
        String[] fieldNames = {"cid", "cname", "revsum"};
        TypeInformation[] fieldTypes = {Types.INT, Types.STRING, Types.INT};
        //Register the TableSink under the name CsvSinkTable
        tableEnv.registerTableSink("CsvSinkTable", fieldNames, fieldTypes, csvSink);
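
A registered sink can also be targeted from SQL with sqlUpdate. A minimal sketch, assuming an orders table is registered (see the registration sketch in Step 3); since CsvTableSink is append-only, the streaming query here avoids aggregation:

        //INSERT INTO a registered sink via SQL (no GROUP BY: an append-only
        //sink cannot accept updating results on a stream)
        tableEnv.sqlUpdate("INSERT INTO CsvSinkTable " +
                "SELECT cid, cname, revenue AS revsum FROM orders " +
                "WHERE country = 'france'");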

Step 3: Query with the Table API and SQL.

SQL

        //Query the table with SQL
        //Compute the revenue from France (mind the trailing spaces in each string fragment)
        Table revenue = tableEnv.sqlQuery("select cid, cname, sum(revenue) as revsum " +
                "from orders " +
                "where country = 'france' " +
                "group by cid, cname");

Table API

        Table orders = tableEnv.scan("orders");
        Table revenue = orders.filter("country === 'france'")
                .groupBy("cid, cname")
                .select("cid, cname, revenue.sum as revSum");

Step 4: Converting between DataStream, DataSet, and Table.

Table->DataStream

        //Assumes tableEnv is an org.apache.flink.table.api.java.StreamTableEnvironment
        //Convert a Table into a DataStream<Row> (table stands for any Table object)
        DataStream<Row> dsRow = tableEnv.toAppendStream(table, Row.class);

        //Convert a Table into a DataStream<Tuple2>; the tuple arity and field types must match the table schema
        TupleTypeInfo<Tuple2<String, Integer>> tupleType = new TupleTypeInfo<>(Types.STRING, Types.INT);
        DataStream<Tuple2<String, Integer>> dsTuple = tableEnv.toAppendStream(table, tupleType);
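
toAppendStream only works for append-only tables. A table that the query keeps updating, such as the grouped revenue aggregation from Step 3, has to be converted with toRetractStream; the Boolean field marks each record as an insert (true) or a retraction (false):

        //Updating tables (e.g. a GROUP BY result on a stream) need a retract stream
        DataStream<Tuple2<Boolean, Row>> retractStream = tableEnv.toRetractStream(revenue, Row.class);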

Table->DataSet

        //Assumes tableEnv is an org.apache.flink.table.api.java.BatchTableEnvironment
        //Convert a Table into a DataSet<Row>
        DataSet<Row> dsRow = tableEnv.toDataSet(table, Row.class);
        //Convert a Table into a DataSet<Tuple2>; the tuple arity and field types must match the table schema
        TupleTypeInfo<Tuple2<String, Integer>> tupleType = new TupleTypeInfo<>(Types.STRING, Types.INT);
        DataSet<Tuple2<String, Integer>> dsTuple = tableEnv.toDataSet(table, tupleType);
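
During development, printing the converted DataSet is a quick way to inspect the result; note that in the batch API print() itself triggers job execution:

        //Collect and print the result (triggers execution of the batch job)
        dsRow.print();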

DataStream->Table

        //Register a DataStream as a table named mytable (tableEnv is a java StreamTableEnvironment)
        DataStream<Tuple2<String, String>> stream = ...;
        tableEnv.registerDataStream("mytable", stream);
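
A DataStream can also be turned into a Table object directly, and field names can be supplied in both variants; a minimal sketch under the same assumptions (the field names are illustrative):

        //Convert directly to a Table, renaming the tuple fields f0, f1
        Table streamTable = tableEnv.fromDataStream(stream, "name, value");
        //The same renaming works when registering the stream as a table
        tableEnv.registerDataStream("mytable2", stream, "name, value");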

Complete code

package com.basic;

import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.typeutils.TupleTypeInfo;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import org.apache.flink.table.sinks.CsvTableSink;
import org.apache.flink.table.sinks.TableSink;
import org.apache.flink.table.sources.CsvTableSource;
import org.apache.flink.table.sources.TableSource;
import org.apache.flink.types.Row;

/**
 * Flink Table job: streaming query
 */
public class FlinkTableJobStream {
    public static void main(String[] args) throws Exception {
        //1. Create the TableEnvironment (the java StreamTableEnvironment, so no casts are needed)
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);

        //Create a TableSource
        //CsvTableSource: file path, field names, field types ("path" is a placeholder)
        TableSource csvSource = new CsvTableSource("path",
                new String[]{"cid", "cname", "revenue", "country"},
                new TypeInformation[]{Types.INT, Types.STRING, Types.INT, Types.STRING});
        //Register the TableSource under the name orders, matching the query below
        tableEnv.registerTableSource("orders", csvSource);

        //Query the table with SQL
        //Compute the revenue from France
        Table revenue = tableEnv.sqlQuery("select cid, cname, sum(revenue) as revsum " +
                "from orders " +
                "where country = 'france' " +
                "group by cid, cname");

        //The grouped aggregation keeps updating its result, so the Table must be
        //converted with toRetractStream (the Boolean marks insert vs. retraction)
        DataStream<Tuple2<Boolean, Row>> dsRow = tableEnv.toRetractStream(revenue, Row.class);

        //The same conversion with an explicit tuple type matching the 3-column result
        TupleTypeInfo<Tuple3<Integer, String, Integer>> tupleType =
                new TupleTypeInfo<>(Types.INT, Types.STRING, Types.INT);
        DataStream<Tuple2<Boolean, Tuple3<Integer, String, Integer>>> dsTuple =
                tableEnv.toRetractStream(revenue, tupleType);

        //Write data out through a TableSink
        //Create a TableSink: file path and field delimiter
        TableSink csvSink = new CsvTableSink("path", ",");
        //Define the field names and types
        String[] fieldNames = {"cid", "cname", "revsum"};
        TypeInformation[] fieldTypes = {Types.INT, Types.STRING, Types.INT};
        //Register the TableSink under the name CsvSinkTable
        tableEnv.registerTableSink("CsvSinkTable", fieldNames, fieldTypes, csvSink);

        //CsvTableSink is append-only, so emit an append-only projection;
        //the updating aggregate above cannot be written to a CSV sink
        Table frenchOrders = tableEnv.sqlQuery("select cid, cname, revenue as revsum " +
                "from orders where country = 'france'");
        frenchOrders.insertInto("CsvSinkTable");

        env.execute("FlinkTableJobStream");
    }
}

package com.basic;

import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.typeutils.TupleTypeInfo;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.BatchTableEnvironment;
import org.apache.flink.table.sinks.CsvTableSink;
import org.apache.flink.table.sinks.TableSink;
import org.apache.flink.table.sources.CsvTableSource;
import org.apache.flink.table.sources.TableSource;
import org.apache.flink.types.Row;

/**
 * Flink Table job: batch query
 */
public class FlinkTableBatch {
    public static void main(String[] args) throws Exception {
        //1. Create the TableEnvironment
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        BatchTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);

        //Register the source table the query reads from ("path" is a placeholder)
        TableSource csvSource = new CsvTableSource("path",
                new String[]{"cid", "cname", "revenue", "country"},
                new TypeInformation[]{Types.INT, Types.STRING, Types.INT, Types.STRING});
        tableEnv.registerTableSource("orders", csvSource);

        //Query the table with SQL
        //Compute the revenue from France
        Table revenue = tableEnv.sqlQuery("select cid, cname, sum(revenue) as revsum " +
                "from orders " +
                "where country = 'france' " +
                "group by cid, cname");

        //Convert the Table into a DataSet<Row>
        DataSet<Row> dsRow = tableEnv.toDataSet(revenue, Row.class);
        //Convert the Table into a DataSet<Tuple3> matching the 3-column result schema
        TupleTypeInfo<Tuple3<Integer, String, Integer>> tupleType =
                new TupleTypeInfo<>(Types.INT, Types.STRING, Types.INT);
        DataSet<Tuple3<Integer, String, Integer>> dsTuple = tableEnv.toDataSet(revenue, tupleType);

        //Write data out through a TableSink
        //Create a TableSink: file path and field delimiter
        TableSink csvSink = new CsvTableSink("path", ",");
        //Define the field names and types
        String[] fieldNames = {"cid", "cname", "revsum"};
        TypeInformation[] fieldTypes = {Types.INT, Types.STRING, Types.INT};
        //Register the TableSink under the name CsvSinkTable
        tableEnv.registerTableSink("CsvSinkTable", fieldNames, fieldTypes, csvSink);

        //Emit the query result into the registered sink
        revenue.insertInto("CsvSinkTable");

        env.execute("FlinkTableBatch");
    }
}

POM file

    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>1.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.11</artifactId>
            <version>1.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka-0.11_2.11</artifactId>
            <version>1.7.2</version>
        </dependency>
        <!--Flink Table core dependency-->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table_2.11</artifactId>
            <version>1.7.2</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-scala_2.11</artifactId>
            <version>1.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_2.11</artifactId>
            <version>1.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-core</artifactId>
            <version>1.7.2</version>
        </dependency>
    </dependencies>