Analysis and Usage of the Flink Table API and SQL (Part 1)

Flink offers two relational APIs for standard stream and batch processing: the Table API and SQL. The Table API lets you apply operations such as select, filter, and join directly on tables; Flink SQL, built on Apache Calcite, implements standard SQL, so it is accessible to most developers.
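
As a quick contrast, here is a minimal sketch of the same filter expressed both ways, assuming a hypothetical table named users (with a name column) has already been registered in a TableEnvironment; the sections below explain how to set all of this up:

        //Table API: chained method calls on a Table object
        Table apiResult = tableEnv.scan("users").filter("name === 'alice'");
        //SQL: a standard query string, parsed and optimized by Calcite
        Table sqlResult = tableEnv.sqlQuery("select name from users where name = 'alice'");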

The Table API and SQL are bundled in the flink-table dependency; to use them, add the following dependencies (Flink 1.7.2 is used as an example). Note that in Flink 1.7 the flink-table module is implemented in Scala, so the Scala API dependencies are required even for pure-Java programs:

        <!-- Table API & SQL -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table_2.11</artifactId>
            <version>1.7.2</version>
        </dependency>
        <!-- Scala APIs, required by flink-table even in Java projects -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-scala_2.11</artifactId>
            <version>1.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_2.11</artifactId>
            <version>1.7.2</version>
        </dependency>

Basic Usage of the Table API and SQL

I. First, create a TableEnvironment. A TableEnvironment provides the following capabilities:

  1. Registering tables in the internal catalog
  2. Registering tables from external catalogs
  3. Executing SQL queries
  4. Registering user-defined functions (see the sketch after this list)
  5. Converting a DataStream or DataSet into a Table
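
Capability 4 is not demonstrated elsewhere in this article, so here is a minimal sketch, assuming a tableEnv created as in the snippets below; HashCode is a hypothetical function name:

        import org.apache.flink.table.functions.ScalarFunction;

        //a scalar UDF that maps a string to its hash code
        public class HashCode extends ScalarFunction {
            public int eval(String s) {
                return s.hashCode();
            }
        }

        //register the function under a name, then call it from SQL or the Table API
        tableEnv.registerFunction("hashCode", new HashCode());
        Table hashed = tableEnv.sqlQuery("select name, hashCode(name) from CsvTable");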

Stream queries

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);

Batch queries

        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        BatchTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);

II. Use the obtained TableEnvironment to create Table objects. There are two kinds of tables: input tables (TableSource) and output tables (TableSink).

TableSource

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);
        
        //CsvTableSource: file path, field names, field types
        TableSource csvSource = new CsvTableSource("path", new String[]{"name","age"}, new TypeInformation[]{Types.STRING, Types.INT});
        //register the TableSource under the name CsvTable
        tableEnv.registerTableSource("CsvTable", csvSource);

TableSink

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);
        
        //write data to an external system through a TableSink
        //create a TableSink: output path and field delimiter
        TableSink csvSink = new CsvTableSink("path", ",");
        //define the field names and types
        String[] fieldNames = {"cid", "cname", "revsum"};
        TypeInformation[] fieldTypes = {Types.INT, Types.STRING, Types.INT};
        //register the TableSink
        tableEnv.registerTableSink("CsvSinkTable", fieldNames, fieldTypes, csvSink);
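
The declared field names and types must match the schema of the table that is later written to the sink. Writing is then a single call; the revenue table used here is defined in section III below:

        //emit a query result into the registered sink
        revenue.insertInto("CsvSinkTable");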

III. Querying with the Table API and SQL

SQL

        //query a table with SQL (this assumes a table named "orders" has been registered)
        //compute the revenue of customers from France
        Table revenue = tableEnv.sqlQuery("select cid, cname, sum(revenue) as revsum " +
                "from orders " +
                "where country = 'france' " +
                "group by cid, cname");

Table API

        Table orders = tableEnv.scan("orders");
        Table revenue = orders
                .filter("country === 'france'")
                .groupBy("cid, cname")
                .select("cid, cname, revenue.sum AS revSum");

IV. Conversions between DataStream, DataSet, and Table

Table->DataStream

        //convert a Table into a DataStream<Row>
        DataStream<Row> dsRow = ((org.apache.flink.table.api.java.StreamTableEnvironment) tableEnv).toAppendStream(csvTable, Row.class);

        //convert a Table into a DataStream<Tuple2>; the tuple's arity and field
        //types must match the table schema (here: name STRING, age INT)
        TupleTypeInfo<Tuple2<String, Integer>> tupleType = new TupleTypeInfo<>(Types.STRING, Types.INT);
        DataStream<Tuple2<String, Integer>> dsTuple = ((org.apache.flink.table.api.java.StreamTableEnvironment) tableEnv).toAppendStream(csvTable, tupleType);
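
toAppendStream only works for tables that are never updated. A continuously updated result, such as the GROUP BY aggregation from section III, must instead be converted with toRetractStream, which pairs each row with a Boolean flag: true for an insertion, false for a retraction:

        //convert an updating Table (e.g. the aggregated revenue table) into a retract stream
        DataStream<Tuple2<Boolean, Row>> retractStream =
                ((org.apache.flink.table.api.java.StreamTableEnvironment) tableEnv).toRetractStream(revenue, Row.class);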

Table->DataSet

        //convert a Table into a DataSet<Row> (table is any previously obtained Table)
        DataSet<Row> dsRow = ((org.apache.flink.table.api.java.BatchTableEnvironment) tableEnv).toDataSet(table, Row.class);
        //convert a Table into a DataSet<Tuple2>; as above, the tuple's arity and
        //field types must match the table schema (here assumed STRING, INT)
        TupleTypeInfo<Tuple2<String, Integer>> tupleType = new TupleTypeInfo<>(Types.STRING, Types.INT);
        DataSet<Tuple2<String, Integer>> dsTuple = ((org.apache.flink.table.api.java.BatchTableEnvironment) tableEnv).toDataSet(table, tupleType);
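
Conversion is not limited to Row and tuples: a POJO whose public fields match the table's column names works as well. RevenueResult below is a hypothetical class for the cid/cname/revsum schema of the revenue query:

        //a POJO matching the result schema of the revenue query
        public static class RevenueResult {
            public Integer cid;
            public String cname;
            public Integer revsum;
        }

        //fields are mapped by name, so column order does not matter
        DataSet<RevenueResult> dsPojo = ((org.apache.flink.table.api.java.BatchTableEnvironment) tableEnv).toDataSet(revenue, RevenueResult.class);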

DataStream->Table

        //register a DataStream as a table
        DataStream<Tuple2<String,String>> stream = ...;
        ((org.apache.flink.table.api.java.StreamTableEnvironment) tableEnv).registerDataStream("mytable", stream);
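
Field names can also be given explicitly instead of the defaults f0, f1, and a batch environment offers the analogous registerDataSet for the DataSet->Table direction (the table and field names below are illustrative):

        //register with explicit field names
        ((org.apache.flink.table.api.java.StreamTableEnvironment) tableEnv).registerDataStream("mytable2", stream, "user, product");

        //DataSet -> Table works the same way in a batch environment
        DataSet<Tuple2<String,String>> dataset = ...;
        ((org.apache.flink.table.api.java.BatchTableEnvironment) tableEnv).registerDataSet("mydataset", dataset);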

Complete code

package com.basic;

import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.typeutils.TupleTypeInfo;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import org.apache.flink.table.sinks.CsvTableSink;
import org.apache.flink.table.sinks.TableSink;
import org.apache.flink.table.sources.CsvTableSource;
import org.apache.flink.table.sources.TableSource;
import org.apache.flink.types.Row;

/**
 * FlinkTable job: stream query
 */
public class FlinkTableJobStream {
    public static void main(String[] args) throws Exception {
        //1. create the TableEnvironment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        //use the java.StreamTableEnvironment directly so no casts are needed below
        StreamTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);

        //create a TableSource
        //CsvTableSource: file path, field names, field types
        TableSource csvSource = new CsvTableSource("path",
                new String[]{"cid", "cname", "revenue", "country"},
                new TypeInformation[]{Types.INT, Types.STRING, Types.INT, Types.STRING});
        //register the TableSource under the name "orders" so the query below can find it
        tableEnv.registerTableSource("orders", csvSource);

        //query the table with SQL: the orders of customers from France
        //(the GROUP BY aggregation from section III would produce a continuously
        //updating table, which cannot be converted with toAppendStream or written
        //to the append-only CsvTableSink, so this streaming job stays append-only)
        Table revenue = tableEnv.sqlQuery("select cid, cname, revenue " +
                "from orders " +
                "where country = 'france'");

        //convert the Table into a DataStream<Row>
        DataStream<Row> dsRow = tableEnv.toAppendStream(revenue, Row.class);

        //convert the Table into a DataStream<Tuple3> matching the three columns
        TupleTypeInfo<Tuple3<Integer, String, Integer>> tupleType =
                new TupleTypeInfo<>(Types.INT, Types.STRING, Types.INT);
        DataStream<Tuple3<Integer, String, Integer>> dsTuple = tableEnv.toAppendStream(revenue, tupleType);

        //write data to an external system through a TableSink
        //create a TableSink: output path and field delimiter
        TableSink csvSink = new CsvTableSink("path", ",");
        //define the field names and types
        String[] fieldNames = {"cid", "cname", "revenue"};
        TypeInformation[] fieldTypes = {Types.INT, Types.STRING, Types.INT};
        //register the TableSink
        tableEnv.registerTableSink("CsvSinkTable", fieldNames, fieldTypes, csvSink);

        //emit the result into the TableSink
        revenue.insertInto("CsvSinkTable");

        //a streaming job only runs once execute() is called
        env.execute("FlinkTableJobStream");
    }
}

package com.basic;

import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.typeutils.TupleTypeInfo;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.BatchTableEnvironment;
import org.apache.flink.table.sinks.CsvTableSink;
import org.apache.flink.table.sinks.TableSink;
import org.apache.flink.table.sources.CsvTableSource;
import org.apache.flink.table.sources.TableSource;
import org.apache.flink.types.Row;

/**
 * FlinkTable job: batch query
 */
public class FlinkTableBatch {
    public static void main(String[] args) throws Exception {
        //1. create the TableEnvironment
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        BatchTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);

        //register a TableSource named "orders" so the query below can find it
        TableSource csvSource = new CsvTableSource("path",
                new String[]{"cid", "cname", "revenue", "country"},
                new TypeInformation[]{Types.INT, Types.STRING, Types.INT, Types.STRING});
        tableEnv.registerTableSource("orders", csvSource);

        //query the table with SQL
        //compute the revenue of customers from France
        Table revenue = tableEnv.sqlQuery("select cid, cname, sum(revenue) as revsum " +
                "from orders " +
                "where country = 'france' " +
                "group by cid, cname");

        //convert the Table into a DataSet<Row>
        DataSet<Row> dsRow = tableEnv.toDataSet(revenue, Row.class);
        //convert the Table into a DataSet<Tuple3> matching the three columns
        TupleTypeInfo<Tuple3<Integer, String, Integer>> tupleType =
                new TupleTypeInfo<>(Types.INT, Types.STRING, Types.INT);
        DataSet<Tuple3<Integer, String, Integer>> dsTuple = tableEnv.toDataSet(revenue, tupleType);

        //write data to an external system through a TableSink
        //create a TableSink: output path and field delimiter
        TableSink csvSink = new CsvTableSink("path", ",");
        //define the field names and types
        String[] fieldNames = {"cid", "cname", "revsum"};
        TypeInformation[] fieldTypes = {Types.INT, Types.STRING, Types.INT};
        //register the TableSink
        tableEnv.registerTableSink("CsvSinkTable", fieldNames, fieldTypes, csvSink);

        //emit the result into the TableSink
        revenue.insertInto("CsvSinkTable");

        //like a streaming job, the batch write only runs once execute() is called
        env.execute("FlinkTableBatch");
    }
}

pom.xml

    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>1.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.11</artifactId>
            <version>1.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka-0.11_2.11</artifactId>
            <version>1.7.2</version>
        </dependency>
        <!-- flink-table core dependency -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table_2.11</artifactId>
            <version>1.7.2</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-scala_2.11</artifactId>
            <version>1.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_2.11</artifactId>
            <version>1.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-core</artifactId>
            <version>1.7.2</version>
        </dependency>
    </dependencies>