// Requirement: group records by uuid (id) and sort each group by utc.
// Correct approach: group by uuid on the driver BEFORE parallelizing, so every
// record of a given uuid travels together into one partition element and the
// whole group can be sorted in a single place.
val conf = new SparkConf()
  .setAppName("flow")
  .setMaster("local[*]")
  // NOTE(review): use classOf[X] rather than X.getClass — for a case class,
  // `A.getClass` evaluated on the companion object registers the companion's
  // class (A$) with Kryo, not the class of the instances actually serialized.
  // Assumes Trip/Line/Log/... are classes like A — TODO confirm.
  .registerKryoClasses(Array[Class[_]](classOf[A], classOf[Trip], classOf[Line], classOf[Log], classOf[LogMinor], classOf[LogData], classOf[UnConformData], classOf[LineX], classOf[MatchDataMajor]))
val sparkSession = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val list: List[A] = List(A(1, 234), A(1, 123), A(1, 345), A(1, 456))
// groupBy happens before the data is sharded, yielding one (uuid, records) pair
// per key; parallelize then distributes whole groups, never splitting a uuid.
val data = sparkSession.sparkContext.parallelize(list.groupBy(_.uuid).toList)
data.foreachPartition { partition =>
  // Each element is a complete (uuid, records) group, so sorting here is global
  // for that uuid.
  partition.foreach(_._2.sortBy(_.utc).foreach(println))
}
/* Result:
A(1,123)
A(1,234)
A(1,345)
A(1,456)*/
// Incorrect approach (kept deliberately to demonstrate the bug): grouping
// INSIDE foreachPartition only sees the records that happened to land in that
// partition. A uuid whose records are spread across partitions is never
// brought back together — with local[*] each record here ends up alone in a
// one-element group, so the "sort" is meaningless.
val conf = new SparkConf()
  .setAppName("flow")
  .setMaster("local[*]")
  // NOTE(review): classOf[X] instead of X.getClass — X.getClass on a companion
  // object registers the companion's class (X$), not the instance class.
  // Assumes Trip/Line/Log/... are classes like A — TODO confirm.
  .registerKryoClasses(Array[Class[_]](classOf[A], classOf[Trip], classOf[Line], classOf[Log], classOf[LogMinor], classOf[LogData], classOf[UnConformData], classOf[LineX], classOf[MatchDataMajor]))
val sparkSession = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val list: List[A] = List(A(1, 234), A(1, 123), A(1, 345), A(1, 456))
val data = sparkSession.sparkContext.parallelize(list)
// Dropped the dead `var num = 0` stranded before the lambda and the pointless
// `val unit: Unit =` binding — foreachPartition returns Unit anyway.
data.foreachPartition { partition =>
  // This groupBy is per-partition only — that is exactly the mistake being shown.
  partition.toList.groupBy(_.uuid).map(_._2.sortBy(_.utc)).foreach(println)
}
/**
 * Result:
 * List(A(1,123))
 * List(A(1,234))
 * List(A(1,345))
 * List(A(1,456))
 */