Flink開發五步之第三步:Transform算子彙總(直接動手操練吧)

map

package com.third_transform
import org.apache.flink.streaming.api.scala._

object Transform_Map {
  /** Demonstrates the `map` operator: rewrites each "a b" line as "a__b". */
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment

    // Bounded demo input; every element holds two space-separated words.
    val lines = Array("hello flink","hello world1","hello world1","hello world2")
    val source: DataStream[String] = env.fromCollection(lines)

    // map: exactly one output element per input element.
    val joined: DataStream[String] = source.map { line =>
      val parts = line.split(" ")
      parts(0) + "__" + parts(1)
    }

    joined.print("stream")
    env.execute()
  }
}

flatmap

package com.third_transform

import org.apache.flink.streaming.api.scala._

object Transform_FlatMap {
  /** Demonstrates the `flatMap` operator: explodes each line into its words. */
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment

    val lines = Array("hello flink","hello world1","hello world1","hello world2")
    val source: DataStream[String] = env.fromCollection(lines)

    // flatMap: zero or more output elements per input element — here, one per word.
    val words: DataStream[String] = source.flatMap { line => line.split(" ") }

    words.print("stream")
    env.execute()
  }
}

filter

package com.third_transform

import org.apache.flink.streaming.api.scala._

object Transform_Filter {
  /** Demonstrates the `filter` operator: splits lines into words and drops "hello". */
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment

    val lines = Array("hello flink","hello world1","hello world1","hello world2")
    val source: DataStream[String] = env.fromCollection(lines)

    // Explode each line into words, then keep every word except "hello".
    val kept: DataStream[String] = source
      .flatMap { line => line.split(" ") }
      .filter(word => word != "hello")

    kept.print("stream")
    env.execute()
  }
}

keyby + reduce

package com.third_transform

import org.apache.flink.api.java.tuple.Tuple
import org.apache.flink.streaming.api.scala._

object Transform_KeyBy {
  /**
   * Demonstrates keyBy + reduce.
   *
   * Fix: the original used index-based `keyBy(0)`, which is deprecated and types
   * the key as the opaque `org.apache.flink.api.java.tuple.Tuple`. A typed key
   * selector (`keyBy(_._1)`) is type-safe and gives the key its real type, String.
   */
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    val arr: Array[String] = Array("hello1 flink","hello1 world1","hello2 world1","hello3 world2")
    val ds: DataStream[String] = env.fromCollection(arr)

    // DataStream ===>>> KeyedStream, keyed by the first word of each line.
    val keyByedKS: KeyedStream[(String, String), String] = ds.map { line =>
      val wordArr: Array[String] = line.split(" ")
      (wordArr(0), wordArr(1))
    }.keyBy(_._1)

    // KeyedStream ===>>> DataStream
    // reduce is a rolling aggregation on the keyed stream: it merges the current
    // element with the previous aggregate and emits EVERY intermediate result,
    // not only the final one.
    val reducedDS: DataStream[(String, String)] =
      keyByedKS.reduce((v1, v2) => (v1._1 + "---" + v2._1, v1._2 + "---" + v2._2))

    reducedDS.print("stream")
    env.execute()
  }
}

滾動聚合算子(rolling Aggregation)

sum()

min()

max()

minBy()

maxBy()

package com.third_transform

import org.apache.flink.api.java.tuple.Tuple
import org.apache.flink.streaming.api.scala._

object Transform_RollingAggregation {
  /**
   * Demonstrates the rolling aggregation operators on a keyed stream:
   * sum / min / max / minBy / maxBy, each applied to the second tuple field.
   *
   * Fix: replaced the deprecated index-based `keyBy(0)` with a typed key
   * selector, so the key is a String instead of the opaque java `Tuple`.
   */
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    val arr: Array[String] = Array("hello 1","hello 2","world 2","world 3")
    val ds: DataStream[String] = env.fromCollection(arr)

    // Parse "word n" lines into (word, n) pairs and key by the word.
    val keyByedKS: KeyedStream[(String, Int), String] = ds.map { line =>
      val wordArr: Array[String] = line.split(" ")
      (wordArr(0), wordArr(1).toInt)   // demo input guarantees an integer second token
    }.keyBy(_._1)

    // Rolling aggregations emit a result for every incoming element.
    // min/max update only the aggregated field; minBy/maxBy return the whole
    // element that holds the extreme value.
    val sumedDS: DataStream[(String, Int)] = keyByedKS.sum(1)
    val minedDS: DataStream[(String, Int)] = keyByedKS.min(1)
    val maxedDS: DataStream[(String, Int)] = keyByedKS.max(1)
    val minByedDS: DataStream[(String, Int)] = keyByedKS.minBy(1)
    val maxByedDS: DataStream[(String, Int)] = keyByedKS.maxBy(1)
    sumedDS.print("stream1")
    minedDS.print("stream2")
    maxedDS.print("stream3")
    minByedDS.print("stream4")
    maxByedDS.print("stream5")
    env.execute()
  }
}

split和select

DataStream → SplitStream:根據某些特徵把一個 DataStream 拆分成兩個或者多個 DataStream。
SplitStream → DataStream:從一個 SplitStream 中獲取一個或者多個 DataStream。

package com.third_transform

import org.apache.flink.streaming.api.scala._

object Transform_SplitAndSelect {
  /**
   * Demonstrates split/select: tag elements into named branches, then pick
   * branches back out as DataStreams.
   *
   * NOTE(review): split/select is deprecated in newer Flink releases in favor
   * of side outputs (OutputTag + process function) — kept here to match the
   * original API usage; confirm against the project's Flink version.
   */
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment

    val pairs = Array(("hello1", 1),("hello2", 2), ("hello2", 3), ("hello3", 4))
    val source: DataStream[(String, Int)] = env.fromCollection(pairs)

    // Tag every element: value > 2 goes to "big", everything else to "small".
    val branches: SplitStream[(String, Int)] =
      source.split(t => if (t._2 > 2) Seq("big") else Seq("small"))

    // select pulls one or more named branches back out as plain DataStreams.
    val big: DataStream[(String, Int)] = branches.select("big")
    val small: DataStream[(String, Int)] = branches.select("small")
    val all: DataStream[(String, Int)] = branches.select("big", "small")

    big.print("bigDS")
    small.print("smallDS")
    all.print("allDS")
    env.execute()
  }
}

connect和comap

package com.third_transform

import org.apache.flink.streaming.api.scala._

object Transform_ConnectAndComap {
  /**
   * Demonstrates connect + CoMap: two streams of different element types are
   * connected, then mapped with one function per input stream into a single
   * output stream.
   */
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment

    val left: DataStream[(String, Int)] =
      env.fromCollection(Array(("hello1", 1),("hello2", 2), ("hello2", 3), ("hello3", 4)))
    val right: DataStream[(Int, String)] =
      env.fromCollection(Array((1,"hello"), (2,"hello"), (3,"hello")))

    // connect keeps the two element types distinct inside one stream pair.
    val connected: ConnectedStreams[(String, Int), (Int, String)] = left.connect(right)

    // CoMap: a separate map function per side; the branches produce different
    // tuple types, so the merged stream is declared with the widened (Any, Any).
    val merged: DataStream[(Any, Any)] = connected.map(
      a => (a._1, a._2 - 5),
      b => (b._1 + 5, b._2)
    )

    merged.print("stream")
    env.execute()
  }
}

union

package com.third_transform

import org.apache.flink.streaming.api.scala._

object Transform_Union {
  /** Demonstrates `union`: merges two streams of the SAME element type into one. */
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment

    val first: DataStream[(String, Int)] =
      env.fromCollection(Array(("hello1", 1),("hello2", 2), ("hello2", 3), ("hello3", 4)))
    val second: DataStream[(String, Int)] =
      env.fromCollection(Array(("hello2", 2), ("hello2", 3), ("hello3", 4)))

    // Unlike connect, union requires identical element types on both sides.
    val merged: DataStream[(String, Int)] = first.union(second)

    merged.print("stream")
    env.execute()
  }
}

待續

split和select的新寫法

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章