sum
在對 DataStream 進行 keyBy 之後,使用 sum 函數按 key 聚合計數
package com.stanley.wordcount
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.api.scala._
/**
* Created by admin on 2020/7/2.
*/
object SumWordCount {
  /**
   * Streaming word count using the built-in `sum` aggregation.
   * Reads lines from a socket, splits them into words, and emits a
   * running (word, count) total per word.
   */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Single parallel task keeps the printed output easy to follow.
    env.setParallelism(1)

    // Source: text lines from host "node1", port 9999.
    val lines: DataStream[String] = env.socketTextStream("node1", 9999)

    // Tokenize, pair each word with 1, key by the word, then sum the counts.
    val counts: DataStream[(String, Int)] =
      lines
        .flatMap(_.split(" "))
        .map(word => (word, 1))
        .keyBy(0)
        .sum(1)

    counts.print("sum_wordcount")
    env.execute("wc test")
  }
}
processfunction
調用最底層的 ProcessFunction,將每個 key 的 count 保存為一個 keyed state
package com.stanley.wordcount
import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.api.java.tuple.Tuple
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.api.scala._
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.util.Collector
/**
* Created by admin on 2020/7/2.
*/
object ProcessWordCount {
  /**
   * Streaming word count implemented with a low-level KeyedProcessFunction.
   * The per-word running count lives in keyed state inside MyProcessFunction.
   */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    val lines: DataStream[String] = env.socketTextStream("node1", 9999)

    // Tokenize and key by word, then hand each element to the custom
    // process function, which keeps the running count in ValueState.
    val counts: DataStream[(String, Int)] =
      lines
        .flatMap(_.split(" "))
        .map(word => (word, 1))
        .keyBy(0)
        .process(new MyProcessFunction)

    counts.print("process_wordcount")
    env.execute("wc test")
  }
}
/**
 * Keeps a per-key running word count in keyed [[ValueState]] and emits the
 * updated (word, total) pair for every incoming element.
 *
 * Because the count is held in managed keyed state, it is included in
 * checkpoints and restored after a failure.
 */
class MyProcessFunction extends KeyedProcessFunction[Tuple, (String, Int), (String, Int)] {
  // Per-key running total; initialized lazily in open().
  private var countState: ValueState[Int] = _

  /** Lifecycle hook: register the state descriptor before processing starts. */
  override def open(parameters: Configuration): Unit = {
    countState = getRuntimeContext.getState[Int](new ValueStateDescriptor[Int]("count", classOf[Int]))
  }

  /**
   * Adds the element's own count to the stored total and emits the result.
   *
   * Fix: accumulate `i._2` instead of a hard-coded 1, so the function also
   * works if an upstream operator pre-aggregates counts greater than 1.
   * With the current pipeline (map to (word, 1)) the behavior is unchanged.
   */
  override def processElement(i: (String, Int), context: KeyedProcessFunction[Tuple, (String, Int), (String, Int)]#Context, collector: Collector[(String, Int)]): Unit = {
    // An unset ValueState[Int] yields null, which Scala unboxes to 0.
    val updated = countState.value() + i._2
    countState.update(updated)
    collector.collect((i._1, updated))
  }
}
RichMapFunction
RichMapFunction 和 ProcessFunction 一樣,都繼承自 AbstractRichFunction,因此同樣擁有生命週期方法和運行時上下文,也能訪問 keyed state
package com.stanley.wordcount
import org.apache.flink.api.common.functions.RichMapFunction
import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.api.java.tuple.Tuple
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.api.scala._
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.util.Collector
/**
* Created by admin on 2020/7/2.
*/
object RichWordCount {
  /**
   * Streaming word count implemented with a RichMapFunction.
   * Like the ProcessFunction variant, the per-word running count is kept
   * in keyed state inside MyRichMapFunction.
   */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    val lines: DataStream[String] = env.socketTextStream("node1", 9999)

    // Tokenize, key by word, and let the rich map function maintain the
    // per-key count in ValueState.
    val counts: DataStream[(String, Int)] =
      lines
        .flatMap(_.split(" "))
        .map(word => (word, 1))
        .keyBy(0)
        .map(new MyRichMapFunction)

    counts.print("rich_wordcount")
    env.execute("wc test")
  }
}
/**
 * Rich map function that maintains a per-key running word count in keyed
 * [[ValueState]] and maps each incoming (word, n) to (word, running total).
 *
 * Must be applied after keyBy, since ValueState is keyed state.
 */
class MyRichMapFunction extends RichMapFunction[(String, Int), (String, Int)] {
  // Per-key running total; initialized lazily in open().
  private var countState: ValueState[Int] = _

  /** Lifecycle hook: register the state descriptor before processing starts. */
  override def open(parameters: Configuration): Unit = {
    countState = getRuntimeContext.getState[Int](new ValueStateDescriptor[Int]("count", classOf[Int]))
  }

  /**
   * Adds the element's own count to the stored total and returns the result.
   *
   * Fix: accumulate `in._2` instead of a hard-coded 1, so the function also
   * works if an upstream operator pre-aggregates counts greater than 1.
   * With the current pipeline (map to (word, 1)) the behavior is unchanged.
   */
  override def map(in: (String, Int)): (String, Int) = {
    // An unset ValueState[Int] yields null, which Scala unboxes to 0.
    val updated = countState.value() + in._2
    countState.update(updated)
    (in._1, updated)
  }
}
總結
sum 方法適合在邏輯比較簡單的聚合計算中使用;ProcessFunction 和 RichMapFunction 在實際應用中可以將狀態保存到狀態後端,出現故障時可通過 checkpoint 恢復狀態。