GitHub代码
https://github.com/SmallScorpion/flink-tutorial.git
状态后端(State Backends)
- 每传入一条数据,有状态的算子任务都会读取和更新状态
- 由于有效的状态访问对于处理数据的低延迟至关重要,因此每个并行任务都会在本地维护其状态,以确保快速的状态访问
- 状态的存储、访问以及维护,由一个可插入的组件决定,这个组件就叫做状态后端(state backend)
- 状态后端主要负责两件事:本地的状态管理,以及将检查点(checkpoint)状态写入远程存储
选择一个状态后端
Pom
<!-- RocksDBStateBackend -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-statebackend-rocksdb_2.11</artifactId>
<version>1.10.0</version>
</dependency>
在集群模式下,也可以在配置文件(flink-conf.yaml)中设置状态后端
状态小应用
获取上一次的温度,与本次获取的数据进行对比,两次温度相差达到 10.0 及以上则进行报警输出,类似 reduce 算子
import com.atguigu.bean.SensorReading
import org.apache.flink.api.common.functions.RichFlatMapFunction
import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector
/**
 * Demo job: reads comma-separated sensor lines from a socket, keys them by sensor id and
 * runs [[TempChangeAlert]] with a threshold of 10.0 on each keyed stream.
 *
 * Each input line is expected to hold three comma-separated fields: an id, a Long
 * (presumably a timestamp) and a Double temperature, e.g. `sensor_1,1547718199,35.8`.
 * A line with fewer fields or non-numeric values still fails the job.
 */
object StateTempChangeAlertTest {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    val inputDStream: DataStream[String] = env.socketTextStream("hadoop102", 7777)

    val dataDstream: DataStream[SensorReading] = inputDStream.map { line =>
      // Trim every field: Long parsing rejects surrounding whitespace, so input such as
      // "sensor_1, 1547718199, 35.8" would otherwise fail with a NumberFormatException.
      val fields: Array[String] = line.split(",").map(_.trim)
      SensorReading(fields(0), fields(1).toLong, fields(2).toDouble)
    }

    // Key by a type-checked selector instead of the runtime-resolved field name "id".
    val resultDStream: DataStream[(String, Double, Double)] = dataDstream
      .keyBy(_.id)
      .flatMap(TempChangeAlert(10.0))

    dataDstream.print("data")
    resultDStream.print("result")

    env.execute("stateBackendsApp test job")
  }
}
/**
 * Stateful flat-map that compares each reading's temperature with the previously stored
 * temperature and emits `(id, previousTemperature, currentTemperature)` when the absolute
 * difference is at least `tpr`.
 *
 * The first reading processed against a given state never emits; it only seeds the state.
 *
 * @param tpr alert threshold: minimum absolute temperature change that triggers an output
 */
case class TempChangeAlert(tpr: Double)
    extends RichFlatMapFunction[SensorReading, (String, Double, Double)] {

  // Temperature of the most recently processed reading.
  var lastTempState: ValueState[Double] = _
  // Set to true once a first reading has been processed.
  var firstId: ValueState[Boolean] = _

  /** Obtains both state handles from the runtime context. */
  override def open(parameters: Configuration): Unit = {
    val ctx = getRuntimeContext
    lastTempState = ctx.getState(new ValueStateDescriptor[Double]("last_time", classOf[Double]))
    firstId = ctx.getState(new ValueStateDescriptor[Boolean]("first_id", classOf[Boolean]))
  }

  /**
   * Updates the stored temperature and emits an alert tuple when the change from the
   * previous temperature reaches the threshold.
   *
   * @param value the incoming sensor reading
   * @param out   collector receiving `(id, previousTemperature, currentTemperature)`
   */
  override def flatMap(value: SensorReading, out: Collector[(String, Double, Double)]): Unit = {
    // Snapshot both states before touching them.
    val previousTemp: Double = lastTempState.value()
    val seenBefore: Boolean = firstId.value()

    if (!seenBefore) firstId.update(true)
    lastTempState.update(value.temperature)

    // On the very first reading the stored temperature reads as 0.0, so without the
    // seenBefore guard almost any first reading would trigger an alert.
    val delta: Double = math.abs(value.temperature - previousTemp)
    if (seenBefore && delta >= tpr) {
      out.collect((value.id, previousTemp, value.temperature))
    }
  }
}
使用已有的 API(flatMapWithState)进行状态编程,实现上面的小 Demo
import com.atguigu.bean.SensorReading
import org.apache.flink.streaming.api.scala._
/**
 * Demo job: same temperature-change alert as [[TempChangeAlert]], implemented with the
 * built-in `flatMapWithState` operator instead of a hand-written rich function.
 *
 * Each input line is expected to hold three comma-separated fields: an id, a Long
 * (presumably a timestamp) and a Double temperature, e.g. `sensor_1,1547718199,35.8`.
 * A line with fewer fields or non-numeric values still fails the job.
 */
object FlatMapWithStateTest {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    val inputDStream: DataStream[String] = env.socketTextStream("hadoop102", 7777)

    val dataDstream: DataStream[SensorReading] = inputDStream.map { line =>
      // Trim every field: Long parsing rejects surrounding whitespace, so input such as
      // "sensor_1, 1547718199, 35.8" would otherwise fail with a NumberFormatException.
      val fields: Array[String] = line.split(",").map(_.trim)
      SensorReading(fields(0), fields(1).toLong, fields(2).toDouble)
    }

    // Minimum absolute temperature change between consecutive readings that raises an alert.
    val alertThreshold: Double = 10.0

    // Key by a type-checked selector instead of the runtime-resolved field name "id".
    // The state is the previous temperature: None until a first reading has been seen.
    val resultDStream: DataStream[(String, Double, Double)] = dataDstream
      .keyBy(_.id)
      .flatMapWithState[(String, Double, Double), Double] {
        // First reading: nothing to compare against, just seed the state.
        case (reading: SensorReading, None) =>
          (List.empty, Some(reading.temperature))

        // Later readings: emit (id, previous, current) when the change reaches the threshold;
        // the state always moves on to the current temperature.
        case (reading: SensorReading, Some(lastTemp)) =>
          val alerts: List[(String, Double, Double)] =
            if ((reading.temperature - lastTemp).abs >= alertThreshold) {
              List((reading.id, lastTemp, reading.temperature))
            } else {
              List.empty
            }
          (alerts, Some(reading.temperature))
      }

    dataDstream.print("data")
    resultDStream.print("result")

    env.execute("stateBackendsApp test job")
  }
}