- FlatMap
import org.apache.flink.api.scala.{ExecutionEnvironment, _}
object FlatMapFunction01 {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    env.getConfig.disableSysoutLogging()
    // Create some sample test data
    val text = env.fromElements("flink hadoop", "spark hive")
    // Use flatMap to split the data; the function is applied to every element
    val text2 = text.flatMap(_.split("\\s+"))
    text2.print()
  }
}
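
Besides the one-argument lambda above, the Scala DataSet API also offers a flatMap variant that takes a Collector, which is convenient when you want to split and filter in a single pass. The following is a minimal sketch along the same lines (the object name FlatMapFunction02 and the filtering of empty tokens are illustrative additions, not part of the original example):

import org.apache.flink.api.scala.{ExecutionEnvironment, _}
import org.apache.flink.util.Collector
object FlatMapFunction02 {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    val text = env.fromElements("flink hadoop", "spark hive")
    // The (value, Collector) variant lets us split and drop empty tokens in one pass
    val words = text.flatMap { (line: String, out: Collector[String]) =>
      line.split("\\s+").filter(_.nonEmpty).foreach(w => out.collect(w))
    }
    words.print()
  }
}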
- Map
import org.apache.flink.api.scala.{ExecutionEnvironment, _}
object MapFunction01 {
  def main(args: Array[String]): Unit = {
    // Create the execution environment
    val env = ExecutionEnvironment.getExecutionEnvironment
    env.getConfig.disableSysoutLogging()
    // Create some sample test data
    val text = env.fromElements("flink hadoop", "spark hive").flatMap(_.split("\\s+"))
    // Convert each word to upper case and compute its length
    val res1 = text.map(str => (str.toUpperCase(), str.trim.length))
    res1.print()
    // The same mapping, but emitting a case class
    val res2 = text.map(line => LineCount(line.toUpperCase(), line.length))
    res2.print()
  }
}
case class LineCount(line: String, count: Int) {
  override def toString: String = line + " " + count
}
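
map can also be given an org.apache.flink.api.common.functions.MapFunction instead of a lambda, which is useful when the transformation carries more logic or needs to be reused. A minimal sketch of the same upper-case-and-length mapping (the object name MapFunction02 is illustrative):

import org.apache.flink.api.common.functions.MapFunction
import org.apache.flink.api.scala.{ExecutionEnvironment, _}
object MapFunction02 {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    val words = env.fromElements("flink hadoop", "spark hive").flatMap(_.split("\\s+"))
    // Same transformation as above, expressed with the MapFunction interface instead of a lambda
    val upper = words.map(new MapFunction[String, (String, Int)] {
      override def map(value: String): (String, Int) = (value.toUpperCase, value.length)
    })
    upper.print()
  }
}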
- MapPartition
import java.lang
import org.apache.flink.api.common.functions.MapPartitionFunction
import org.apache.flink.api.scala.{ExecutionEnvironment, _}
import org.apache.flink.util.Collector
object MapPartitionFunction01 {
  def main(args: Array[String]): Unit = {
    // Create the execution environment
    val env = ExecutionEnvironment.getExecutionEnvironment
    env.getConfig.disableSysoutLogging()
    // Create some sample test data
    val text = env.fromElements("flink hadoop", "spark hive").flatMap(_.split("\\s+"))
    // Count at the granularity of a partition:
    // mapPartition applies the function once per partition of the DataSet and produces another DataSet.
    // It suits data that is not grouped; to transform individual elements, map is a better fit.
    // In MapPartitionFunction[String, Long], String is the input element type and Long is the result type (Long because we are counting).
    val text2 = text.mapPartition(new MapPartitionFunction[String, Long]() {
      override def mapPartition(iterable: lang.Iterable[String], collector: Collector[Long]): Unit = {
        var count = 0
        val iterator = iterable.iterator()
        while (iterator.hasNext) {
          iterator.next()
          count += 1
        }
        collector.collect(count)
      }
    })
    text2.print()
    // Prepend a prefix to every element
    val text3 = text.mapPartition(new MapPartitionFunction[String, String] {
      override def mapPartition(values: lang.Iterable[String], out: Collector[String]): Unit = {
        val iterator = values.iterator()
        while (iterator.hasNext) {
          var str = iterator.next()
          str = "prefix-" + str
          out.collect(str)
        }
      }
    })
    text3.print()
  }
}
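
The Scala API additionally accepts plain Iterator functions for mapPartition, which reads more naturally than implementing the Java interface. A minimal sketch that repeats both operations above in that style (the object name MapPartitionFunction02 is illustrative):

import org.apache.flink.api.scala.{ExecutionEnvironment, _}
object MapPartitionFunction02 {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    val words = env.fromElements("flink hadoop", "spark hive").flatMap(_.split("\\s+"))
    // Count the elements of each partition using the Iterator variant
    val counts = words.mapPartition((partition: Iterator[String]) => Iterator(partition.size.toLong))
    counts.print()
    // Prefix every element, again per partition
    val prefixed = words.mapPartition((partition: Iterator[String]) => partition.map("prefix-" + _))
    prefixed.print()
  }
}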
- Reduce
import org.apache.flink.api.common.functions.ReduceFunction
import org.apache.flink.api.scala._
import org.apache.flink.api.scala.ExecutionEnvironment
object ReduceFunction01 {
  def main(args: Array[String]): Unit = {
    // Create the execution environment
    val env = ExecutionEnvironment.getExecutionEnvironment
    env.getConfig.disableSysoutLogging()
    // Create some sample test data
    val text = env.fromElements("flink hadoop", "spark hive hadoop flink", "flink").flatMap(_.split("\\s+"))
    // reduce() combines the input elements pairwise with user-defined logic and returns a single result
    val text2 = text.reduce((str1, str2) => str1.concat(str2))
    text2.print()
    println("------------------------------------------------")
    val text3 = text.reduce(new ReduceFunction[String] {
      override def reduce(value1: String, value2: String): String = {
        println("The first value to combine:" + value1)
        println("The second value to combine:" + value2)
        value1.concat(value2)
      }
    })
    text3.print()
  }
}
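
In practice reduce is usually combined with groupBy, so that elements are combined per key rather than across the whole DataSet. A minimal word-count sketch built on the same test data (the object name ReduceFunction02 is illustrative):

import org.apache.flink.api.scala.{ExecutionEnvironment, _}
object ReduceFunction02 {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    val words = env.fromElements("flink hadoop", "spark hive hadoop flink", "flink").flatMap(_.split("\\s+"))
    // Classic word count: group by the word, then reduce each group by summing the counts
    val counts = words
      .map((_, 1))
      .groupBy(0)
      .reduce((a, b) => (a._1, a._2 + b._2))
    counts.print()
  }
}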
- ReduceGroup
import java.lang
import org.apache.flink.api.common.functions.GroupReduceFunction
import org.apache.flink.api.scala._
import org.apache.flink.api.scala.ExecutionEnvironment
import org.apache.flink.util.Collector
object ReduceGroupFunction01 {
  def main(args: Array[String]): Unit = {
    // Create the execution environment
    val env = ExecutionEnvironment.getExecutionEnvironment
    env.getConfig.disableSysoutLogging()
    // Create some sample test data
    val text = env.fromElements("flink hadoop", "spark hive hadoop flink", "flink").flatMap(_.split("\\s+"))
    // Typically you group the data first, then compute over each group (here: a count per word)
    val text2 = text.map((_, 1)).groupBy(0).reduceGroup(new GroupReduceFunction[(String, Int), (String, Int)] {
      override def reduce(values: lang.Iterable[(String, Int)], out: Collector[(String, Int)]): Unit = {
        val iterator = values.iterator()
        var word = ""
        var cnt = 0
        while (iterator.hasNext) {
          val item = iterator.next()
          word = item._1
          cnt += item._2
        }
        out.collect((word, cnt))
      }
    })
    text2.print()
  }
}
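
The same grouped count can be written with the Scala iterator variant of reduceGroup, avoiding the explicit GroupReduceFunction. A minimal sketch (the object name ReduceGroupFunction02 is illustrative):

import org.apache.flink.api.scala.{ExecutionEnvironment, _}
object ReduceGroupFunction02 {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    val words = env.fromElements("flink hadoop", "spark hive hadoop flink", "flink").flatMap(_.split("\\s+"))
    // Same word count as above, expressed with an Iterator function per group
    val counts = words
      .map((_, 1))
      .groupBy(0)
      .reduceGroup { pairs: Iterator[(String, Int)] =>
        val list = pairs.toList
        (list.head._1, list.map(_._2).sum)
      }
    counts.print()
  }
}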
- Join
import org.apache.flink.api.scala.{ExecutionEnvironment, _}
object JoinFunction01 {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    env.getConfig.disableSysoutLogging()
    val stuDataSet = env.fromElements(
      (1, "張三", "男", 21),
      (2, "彭霞", "女", 18),
      (3, "李四", "男", 20),
      (4, "李莉", "女", 23),
      (5, "倩倩", "女", 21)
    )
    val scoreDataSet = env.fromElements(
      (1, 90),
      (2, 84),
      (3, 80),
      (4, 92),
      (5, 87)
    )
    // where() selects the join key of the left DataSet; equalTo() selects the join key of the right DataSet
    val res = stuDataSet.join(scoreDataSet)
      .where(0)
      .equalTo(0)
    res.print()
  }
}
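
Without a join function the result is a DataSet of (left, right) tuple pairs. Passing a function right after equalTo lets you project the joined pair into a flat record, e.g. (name, score). A minimal sketch on a subset of the same data (the object name JoinFunction02 is illustrative):

import org.apache.flink.api.scala.{ExecutionEnvironment, _}
object JoinFunction02 {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    val stu = env.fromElements((1, "張三", "男", 21), (2, "彭霞", "女", 18))
    val score = env.fromElements((1, 90), (2, 84))
    // Project each joined pair to (name, score) instead of keeping the full tuples
    val res = stu.join(score).where(0).equalTo(0) {
      (s, sc) => (s._2, sc._2)
    }
    res.print()
  }
}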