1. The first step, and an important one, is dependencies. Since the job runs locally, you should have a Hadoop environment configured; without it, startup fails with an error. Download winutils.exe and set the HADOOP_HOME environment variable to point at it.
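If you would rather not set the environment variable globally, a common workaround is to point Hadoop at the winutils directory from code before anything touches the filesystem. A minimal sketch, assuming winutils.exe was downloaded to C:\hadoop\bin (the path is a placeholder; adjust to your machine):

// Workaround sketch for local Windows runs: tell Hadoop where winutils.exe
// lives (the "hadoop.home.dir" system property is checked before HADOOP_HOME).
// Call this before creating the Flink execution environment.
object WinutilsSetup {
  def configure(): Unit = {
    if (System.getProperty("hadoop.home.dir") == null) {
      System.setProperty("hadoop.home.dir", "C:\\hadoop") // bin\winutils.exe sits under here
    }
  }
}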
2. For convenience, here is the full pom file (the parts that matter for this post are the Hive and Hadoop dependencies):
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>pijiuya</groupId>
    <artifactId>FlinkExample</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <flink.version>1.10.0</flink.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>${flink.version}</version>
            <!-- "provided" here would mean the dependency is used only at compile
                 time, not at runtime or when packaging -->
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.11</artifactId>
            <version>${flink.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-scala_2.11</artifactId>
            <version>${flink.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_2.11</artifactId>
            <version>${flink.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.bahir</groupId>
            <artifactId>flink-connector-redis_2.11</artifactId>
            <version>1.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-statebackend-rocksdb_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka-0.11_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>0.11.0.3</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.25</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.25</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table</artifactId>
            <version>${flink.version}</version>
            <type>pom</type>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-java-bridge_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <!-- or... -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-scala-bridge_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-common</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-java</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <!--<dependency>-->
            <!--<groupId>org.apache.flink</groupId>-->
            <!--<artifactId>flink-table-api-scala_${scala.binary.version}</artifactId>-->
            <!--<version>${flink.version}</version>-->
        <!--</dependency>-->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-scala_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner-blink_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-jdbc_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-csv</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>statefun-sdk</artifactId>
            <version>2.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>statefun-flink-harness</artifactId>
            <version>2.0.0</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.60</version>
        </dependency>
        <dependency>
            <groupId>redis.clients</groupId>
            <artifactId>jedis</artifactId>
            <version>2.9.0</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <!-- Date/time library -->
        <dependency>
            <groupId>joda-time</groupId>
            <artifactId>joda-time</artifactId>
            <version>2.9.2</version>
        </dependency>
        <!-- Hive dependencies -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-hive_2.11</artifactId>
            <version>1.10.0</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>1.1.0</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <!-- Hadoop dependencies -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.6.0-cdh5.16.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.6.0-cdh5.16.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.6.0-cdh5.16.1</version>
        </dependency>
    </dependencies>

    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>true</enabled>
            </snapshots>
        </repository>
    </repositories>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.1.0</version>
                <configuration>
                    <createDependencyReducedPom>false</createDependencyReducedPom>
                </configuration>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <!-- When packaging, set this to the main class of the job
                                         you want to run (only one mainClass element is honored) -->
                                    <!--<mainClass>application.StormToFlink_demo</mainClass>-->
                                    <!--<mainClass>application.SaveDataToHbase</mainClass>-->
                                    <!--<mainClass>application.CheckPointState_demo</mainClass>-->
                                    <!--<mainClass>application.storm.FlinkTest</mainClass>-->
                                    <!--<mainClass>application.storm.FlinkMatchTopicApplication_develop</mainClass>-->
                                    <!--<mainClass>batch.WordCount_demo</mainClass>-->
                                    <mainClass>developing_scala.kafka2RedisDemo_test</mainClass>
                                </transformer>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
                                    <resource>reference.conf</resource>
                                </transformer>
                            </transformers>
                            <filters>
                                <filter>
                                    <artifact>*:*:*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                    <encoding>utf8</encoding>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
3. Since everyone runs a different Hive version, consult the official documentation; the supported-versions table there makes the required connector dependencies clear at a glance:
https://ci.apache.org/projects/flink/flink-docs-release-1.10/dev/table/hive/#connecting-to-hive
4. Next, download the hive-site.xml config file and place it in some directory. I am demonstrating on a local Windows 10 machine, so I dropped it into an arbitrary path.
Note that on CDH or HDP clusters the downloaded config file may contain an encrypted password or may be missing some properties entirely; check the official docs for the properties you must have:
https://ci.apache.org/projects/flink/flink-docs-release-1.10/dev/table/hive/hive_catalog.html
<configuration>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://localhost/metastore?createDatabaseIfNotExist=true</value>
    <description>metadata is stored in a MySQL server</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
    <description>MySQL JDBC driver class</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>...</value>
    <description>user name for connecting to mysql server</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>...</value>
    <description>password for connecting to mysql server</description>
  </property>
  <property>
    <name>hive.metastore.uris</name>
    <value>thrift://localhost:9083</value>
    <description>IP address (or fully-qualified domain name) and port of the metastore host</description>
  </property>
  <property>
    <name>hive.metastore.schema.verification</name>
    <value>true</value>
  </property>
</configuration>
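Before wiring the config into the Flink job, it is worth a quick smoke test that the directory holding this hive-site.xml can actually reach the metastore. A small sketch (the catalog name, path, and Hive version mirror the ones used in step 5 below):

import org.apache.flink.table.catalog.hive.HiveCatalog

// Smoke-test sketch: open the catalog directly and list its databases.
// This fails early if hive-site.xml is missing or the metastore is unreachable.
object HiveCatalogSmokeTest {
  def main(args: Array[String]): Unit = {
    val catalog = new HiveCatalog("rtdw", "default", "G:\\Flink SQL開發文件", "1.1.0")
    catalog.open()
    println(catalog.listDatabases()) // e.g. [default]
    catalog.close()
  }
}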
5. Next comes the code, which is straightforward:
package flink_sql

import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.table.api.{EnvironmentSettings, Table}
import org.apache.flink.table.api.scala.StreamTableEnvironment
import org.apache.flink.table.catalog.hive.HiveCatalog

/**
 * todo Read data from Kafka and register the table through a Hive-backed catalog
 */
object Sql_source_kafka {

  def main(args: Array[String]): Unit = {
    import org.apache.flink.api.scala._

    val streamEnv = StreamExecutionEnvironment.getExecutionEnvironment
    streamEnv.setParallelism(1)
    // streamEnv.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    // streamEnv.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime)

    val tableEnvSettings = EnvironmentSettings.newInstance()
      .useBlinkPlanner()
      .inStreamingMode()
      .build()
    val tableEnv = StreamTableEnvironment.create(streamEnv, tableEnvSettings)

    val catalog = new HiveCatalog(
      "rtdw",                  // catalog name
      "default",               // default database
      "G:\\Flink SQL開發文件",  // directory containing hive-site.xml
      "1.1.0"                  // Hive version
    )

    // todo Register the catalog
    tableEnv.registerCatalog("rtdw", catalog)
    // todo Use this catalog; the DDL below is executed against it
    tableEnv.useCatalog("rtdw")

    // todo Create a database if needed
    // val createDbSql1 = "CREATE DATABASE IF NOT EXISTS rtdw.default"
    // val createDbSql1 = "USE DATABASE default"
    // tableEnv.sqlUpdate(createDbSql1)

    // todo Which catalogs exist
    val catalogs: Array[String] = tableEnv.listCatalogs()
    println(catalogs.toList)
    // todo Which tables exist
    val tables = tableEnv.listTables()
    println(tables.toList)

    // todo Sample of the Kafka message format this job consumes
    val kafkaLogStr = "{\"eventType\": \"clickBuyNow\",\"userId\": \"97470180\",\"ts\": 1585136092541}"

    // tableEnv.sqlUpdate("DROP TABLE rtdw.ods.streaming_user_active_log2")
    val createTableSql_new =
      """CREATE TABLE flink_test_03 (
        |  eventType STRING,
        |  userId STRING,
        |  ts STRING
        |) WITH (
        |  'connector.type' = 'kafka',
        |  'connector.version' = '0.11',
        |  'connector.topic' = 'flink_test_topic',
        |  'connector.startup-mode' = 'earliest-offset',
        |  'connector.properties.zookeeper.connect' = 'node1:2181,node2:2181,node3:2181',
        |  'connector.properties.bootstrap.servers' = 'node1:9092,node2:9092,node3:9092',
        |  'connector.properties.group.id' = 'flink_test_1',
        |  'format.type' = 'json',
        |  'format.derive-schema' = 'true',
        |  'update-mode' = 'append'
        |)""".stripMargin
    tableEnv.sqlUpdate(createTableSql_new)

    val querySql =
      """SELECT eventType,
        |       userId,
        |       ts
        |FROM flink_test_03""".stripMargin
    val result: Table = tableEnv.sqlQuery(querySql)

    println("Printing the query result")
    val resultStream: DataStream[(String, String, String)] =
      tableEnv.toAppendStream[(String, String, String)](result)
    resultStream.print()

    streamEnv.execute()
  }
}
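One pitfall worth noting: because the catalog is backed by the Hive metastore, the CREATE TABLE DDL above persists across runs, so executing the job a second time fails with a table-already-exists error. A minimal guard, placed just before the sqlUpdate(createTableSql_new) call (a sketch; DROP TABLE IF EXISTS is part of the Flink 1.10 DDL syntax):

// Sketch: make the job re-runnable against the same Hive metastore by
// dropping the previous run's table before recreating it.
tableEnv.sqlUpdate("DROP TABLE IF EXISTS flink_test_03")
tableEnv.sqlUpdate(createTableSql_new)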
6. After the job runs, we can check in Hive whether the table's metadata exists. In practice the table itself shows up, but its columns do not, and querying the MySQL metastore directly gives the same result:
Running DESCRIBE FORMATTED flink_test_03; does print the table's information, yet the MySQL metastore holds no column information for the table.
7. In this situation, all we can do for now is print the Table's schema and record the columns manually; proper management of this metadata is left for later study. (A likely explanation: for non-Hive "generic" tables, the HiveCatalog serializes the Flink schema into the table's properties rather than into Hive column metadata, so neither Hive nor the metastore's column tables ever see the fields.)
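As a stopgap, the schema can be recovered from the Table object itself and logged. A minimal sketch that would go at the end of the main method in step 5 (printSchema and getSchema are standard Flink 1.10 Table API calls; where you persist the output is up to you):

// Sketch: recover the column information via the Table API, since Hive
// itself cannot see the fields of this generic table.
result.printSchema() // prints the tree: root |-- eventType: STRING ...

// Or walk the schema programmatically and persist it wherever needed.
val schema = result.getSchema
schema.getFieldNames.zip(schema.getFieldDataTypes).foreach {
  case (name, dataType) => println(s"$name -> $dataType")
}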