一. Introduction
The basics of Flink, its streaming API, and Kafka are not repeated here; for background, see these posts:
Flink: Flink Streaming API Programming Guide
Kafka: Kafka Basics
二. Code in Practice
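The program below uses the Kafka 0.9 connector (FlinkKafkaConsumer09). In an sbt build, the dependencies would look roughly like this sketch; the Flink version and the Scala 2.11 suffix implied by %% are assumptions, so match them to your cluster:

// build.sbt (sketch) -- versions are assumptions; match them to your cluster
libraryDependencies ++= Seq(
  "org.apache.flink" %% "flink-streaming-scala"     % "1.4.2",
  "org.apache.flink" %% "flink-connector-kafka-0.9" % "1.4.2"
)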
package cn.kafka

import java.util.Properties

import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.windowing.time.Time
// import org.apache.flink.api.scala._          // implicit conversions for bounded (DataSet) programs
import org.apache.flink.streaming.api.scala._   // implicit conversions for unbounded (DataStream) programs
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer09
object FlinkKafka {
  def main(args: Array[String]): Unit = {
    // Set up the streaming execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // Use a parallelism of 1 in development so the output is easy to read
    env.setParallelism(1)

    val properties = new Properties()
    properties.setProperty("bootstrap.servers", "master:9092,slave01:9092,slave02:9092")
    // zookeeper.connect is only required by the 0.8 consumer; it is unnecessary (but harmless) for 0.9+
    properties.setProperty("zookeeper.connect", "master:2181,slave01:2181,slave02:2181")
    properties.setProperty("group.id", "spark")
    properties.setProperty("enable.auto.commit", "true")
    properties.setProperty("auto.commit.interval.ms", "5000")
    /**
     * auto.offset.reset controls where consumption starts when this group has
     * no committed offset (if a committed offset exists, consumption resumes from it):
     *   latest:   start from the end of the topic (only new records)
     *   earliest: start from the beginning (re-reads existing records)
     */
    properties.setProperty("auto.offset.reset", "latest")
    // No explicit Kafka (de)serializer settings are needed here: the SimpleStringSchema
    // passed to the consumer below handles deserialization, and key.serializer is a
    // producer-side property that does not belong in consumer config.
    // Create the Kafka source: consume the "spark" topic as plain strings
    val consumer: FlinkKafkaConsumer09[String] = new FlinkKafkaConsumer09[String](
      "spark", new SimpleStringSchema(), properties
    )
    val kafkaDataStream: DataStream[String] = env.addSource(consumer)

    // Classic word count over 2-second tumbling processing-time windows
    val result: DataStream[(String, Int)] = kafkaDataStream
      .flatMap(row => row.split("\\s+")) // "\\s+" avoids empty tokens on repeated whitespace
      .map(row => (row, 1))
      .keyBy(_._1)
      .timeWindow(Time.seconds(2))
      .sum(1)

    result.print()
    env.execute("FlinkKafkaWordCount")
  }
}
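To sanity-check the windowing pipeline without a Kafka cluster, the source can be swapped for a socket stream. This is a minimal sketch, assuming netcat is listening locally via nc -lk 9999:

package cn.kafka

import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time

object SocketWordCount {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    // Same word-count pipeline as above, fed from a socket instead of Kafka
    env.socketTextStream("localhost", 9999)
      .flatMap(_.split("\\s+"))
      .map((_, 1))
      .keyBy(_._1)
      .timeWindow(Time.seconds(2))
      .sum(1)
      .print()
    env.execute("SocketWordCount")
  }
}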
三. Execution Results
1. Kafka producer
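With the job running, messages can be pushed into the "spark" topic using the console producer that ships with Kafka (the broker list matches the code above):

bin/kafka-console-producer.sh --broker-list master:9092,slave01:9092,slave02:9092 --topic spark

Each line typed at this prompt becomes one Kafka record consumed by the Flink job.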
2. Flink consumer
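The job prints one (word, count) pair per word for every 2-second window. For example, typing "hello flink hello" into the producer within a single window would yield output along these lines (illustrative, not captured from a real run):

(hello,2)
(flink,1)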