Scala/Spark 中,groupBy 應該用在 partition(分片)之前還是之後?

 需求:根據 id(uuid)分組,並在每組內按 utc 排序後輸出


    // Correct approach: group by key BEFORE parallelizing, so each RDD
    // element already carries the complete group for one uuid.
    val sparkConf = new SparkConf()
      .setAppName("flow")
      .setMaster("local[*]")
      .registerKryoClasses(Array[Class[_]](A.getClass, Trip.getClass, Line.getClass, Log.getClass, LogMinor.getClass, LogData.getClass, UnConformData.getClass, LineX.getClass, MatchDataMajor.getClass))

    val session = SparkSession.builder().config(sparkConf).enableHiveSupport().getOrCreate()

    val records: List[A] = List(A(1, 234), A(1, 123), A(1, 345), A(1, 456))
    // groupBy runs on the driver, before the data is split into partitions,
    // so every record with the same uuid lands in one (uuid, group) pair.
    val grouped = session.sparkContext.parallelize(records.groupBy(_.uuid).toList)
    grouped.foreachPartition { part =>
      part.foreach { case (_, group) => group.sortBy(_.utc).foreach(println) }
    }
    /* Output:
       A(1,123)
       A(1,234)
       A(1,345)
       A(1,456) */
val conf = new SparkConf()
  .setAppName("flow")
  .setMaster("local[*]")
  .registerKryoClasses(Array[Class[_]](A.getClass, Trip.getClass, Line.getClass, Log.getClass, LogMinor.getClass, LogData.getClass, UnConformData.getClass, LineX.getClass, MatchDataMajor.getClass))

val sparkSession = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
// Wrong approach (kept deliberately as the counter-example): the raw records
// are parallelized first, so rows sharing a uuid may be scattered across
// partitions before any grouping happens.
val list: List[A] = List(A(1, 234), A(1, 123), A(1, 345), A(1, 456))
val data = sparkSession.sparkContext.parallelize(list)

data.foreachPartition { partition =>
  // groupBy here runs per-partition, not over the whole dataset, so each
  // partition can only group the records it happens to hold.
  partition.toList.groupBy(_.uuid).map(_._2.sortBy(_.utc)).foreach(println)
}
/**
  * Result: each partition held a single record, so four one-element groups
  * are printed instead of one fully sorted group:
  * List(A(1,123))
  * List(A(1,234))
  * List(A(1,345))
  * List(A(1,456))
  */

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章