RDD Operators (1)

An RDD (Resilient Distributed Dataset) is an immutable, partitioned collection distributed across the cluster.
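
#Because RDDs are immutable, every transformation returns a new RDD and leaves the source untouched. A minimal sketch, not from the original session:

scala> val base = sc.parallelize(List(1, 2, 3))
scala> val doubled = base.map(_ * 2)
scala> base.collect
#expected: Array(1, 2, 3)   (base is unchanged; doubled is a separate RDD)
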
#Common transformations (lazily evaluated: they only build a new RDD, nothing runs until an action is called)
#Create an RDD by parallelizing a Scala collection

scala> val rdd1 = sc.parallelize(Array(1,2,3,4,5,6,7,8,9,10))
rdd1: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at parallelize at <console>:24

#Check the number of partitions of the RDD
scala> rdd1.partitions.length
res0: Int = 4
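
#The default partition count here (4) comes from the Spark configuration (typically the number of cores in local mode). You can set it explicitly by passing numSlices as the second argument to parallelize. A sketch, not from the original session:

scala> sc.parallelize(1 to 10, 3).partitions.length
#expected: Int = 3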

scala> val rdd1 = sc.parallelize(List(1,2,3,4,5))
rdd1: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[1] at parallelize at <console>:24

scala> val rdd1 = sc.parallelize(List(1,2,3,4,5)).map(_*2).sortBy(x=>x,true)
rdd1: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[8] at sortBy at <console>:24

scala> rdd1.filter(_>3)
res1: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[9] at filter at <console>:27

scala> val rdd2 = sc.parallelize(List(1,2,3,4,5)).map(_*2).sortBy(x=>x+"",true)
rdd2: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[16] at sortBy at <console>:24
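
#Note the sort key above: x=>x+"" converts each Int to a String, so the ordering is lexicographic rather than numeric. With rdd2's data (2, 4, 6, 8, 10), "10" sorts before "2". Expected result, not shown in the original session:

scala> rdd2.collect
#expected: Array(10, 2, 4, 6, 8)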

scala> val rdd4 = sc.parallelize(Array("a a1","b b2 b3","c","d e"))
rdd4: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[17] at parallelize at <console>:24

scala> rdd4.flatMap(_.split(" ")).collect
res2: Array[String] = Array(a, a1, b, b2, b3, c, d, e)                          

scala> val rdd5 = sc.parallelize(List(List("a a1","b b2 b3","c","d e"),List("a a1","b b2 b3","c","d e")))
rdd5: org.apache.spark.rdd.RDD[List[String]] = ParallelCollectionRDD[19] at parallelize at <console>:24

scala> rdd5.flatMap(_.flatMap(_.split(" "))).collect
res3: Array[String] = Array(a, a1, b, b2, b3, c, d, e, a, a1, b, b2, b3, c, d, e)

scala> rdd5.map(_.flatMap(_.split(" "))).collect
res4: Array[List[String]] = Array(List(a, a1, b, b2, b3, c, d, e), List(a, a1, b, b2, b3, c, d, e))
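
#The contrast above: map produces exactly one output per input, so the nested List structure survives (res4), while an outer flatMap flattens the results (res3). A sketch, not from the original session, that flattens res4's output in a second step to reproduce res3:

scala> rdd5.map(_.flatMap(_.split(" "))).flatMap(x => x).collect
#expected: Array(a, a1, b, b2, b3, c, d, e, a, a1, b, b2, b3, c, d, e)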

##union: combines two RDDs; duplicates are kept, and the element types must match
scala> val rdd6 = sc.parallelize(List(1,2,3,4,5))
rdd6: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[22] at parallelize at <console>:24

scala> val rdd7 = sc.parallelize(List(3,4,5,6,7))
rdd7: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[23] at parallelize at <console>:24

scala> val rdd8 = rdd6.union(rdd7)
rdd8: org.apache.spark.rdd.RDD[Int] = UnionRDD[24] at union at <console>:28

scala> rdd8.distinct.sortBy(x=>x).collect
res5: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7)                                   
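
#union simply concatenates the two RDDs' partitions, so the overlapping elements (3, 4, 5) survive until distinct removes them; ++ is an alias for union. A sketch, not from the original session:

scala> (rdd6 ++ rdd7).count
#expected: Long = 10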

##intersection: returns the elements common to both RDDs (the shuffle means result order is not guaranteed)
scala> val rdd9 = rdd6.intersection(rdd7)
rdd9: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[38] at intersection at <console>:28

scala> rdd9.collect
res6: Array[Int] = Array(4, 5, 3)                                               

##join: inner join of two pair RDDs on key; keys that appear in only one RDD are dropped
scala> val rdd1 = sc.parallelize(List(("tom",1),("jerry",2),("kitty", 3)))
rdd1: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[39] at parallelize at <console>:24

scala> val rdd2 = sc.parallelize(List(("jerry", 9),("shuke", 7), ("tom", 2)))
rdd2: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[40] at parallelize at <console>:24

scala> val rdd3 = rdd1.join(rdd2)
rdd3: org.apache.spark.rdd.RDD[(String, (Int, Int))] = MapPartitionsRDD[43] at join at <console>:28

scala> rdd3.collect
res7: Array[(String, (Int, Int))] = Array((tom,(1,2)), (jerry,(2,9)))           

scala> val rdd3 = rdd1.leftOuterJoin(rdd2)
rdd3: org.apache.spark.rdd.RDD[(String, (Int, Option[Int]))] = MapPartitionsRDD[46] at leftOuterJoin at <console>:28

scala> rdd3.collect
res8: Array[(String, (Int, Option[Int]))] = Array((tom,(1,Some(2))), (jerry,(2,Some(9))), (kitty,(3,None)))

scala> val rdd3 = rdd1.rightOuterJoin(rdd2)
rdd3: org.apache.spark.rdd.RDD[(String, (Option[Int], Int))] = MapPartitionsRDD[49] at rightOuterJoin at <console>:28

scala> rdd3.collect
res9: Array[(String, (Option[Int], Int))] = Array((tom,(Some(1),2)), (jerry,(Some(2),9)), (shuke,(None,7)))
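
#fullOuterJoin, not shown in the original session, rounds out the join family: both sides become Option, so unmatched keys from either RDD are kept. A sketch:

scala> rdd1.fullOuterJoin(rdd2).collect
#expected (order may vary): Array((tom,(Some(1),Some(2))), (jerry,(Some(2),Some(9))), (kitty,(Some(3),None)), (shuke,(None,Some(7))))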

scala> val rdd3 = rdd1 union rdd2
rdd3: org.apache.spark.rdd.RDD[(String, Int)] = UnionRDD[50] at union at <console>:28

scala> rdd3.collect
res10: Array[(String, Int)] = Array((tom,1), (jerry,2), (kitty,3), (jerry,9), (shuke,7), (tom,2))


##groupByKey
scala> rdd3.groupByKey
res11: org.apache.spark.rdd.RDD[(String, Iterable[Int])] = ShuffledRDD[51] at groupByKey at <console>:31

scala> rdd3.groupByKey.collect
res12: Array[(String, Iterable[Int])] = Array((tom,CompactBuffer(1, 2)), (shuke,CompactBuffer(7)), (kitty,CompactBuffer(3)), (jerry,CompactBuffer(2, 9)))

scala> rdd3.groupByKey.map(x=>(x._1,x._2.sum)).collect
res15: Array[(String, Int)] = Array((tom,3), (shuke,7), (kitty,3), (jerry,11))

scala> rdd3.groupByKey.mapValues(_.sum).collect
res16: Array[(String, Int)] = Array((tom,3), (shuke,7), (kitty,3), (jerry,11))
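
#For per-key aggregation, reduceByKey is usually preferable to groupByKey: it combines values on the map side before shuffling, instead of moving every raw pair across the network. An equivalent sketch, not from the original session:

scala> rdd3.reduceByKey(_ + _).collect
#expected (order may vary): Array((tom,3), (shuke,7), (kitty,3), (jerry,11))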


##WordCount
scala> sc.textFile("/spark/input/a.txt").flatMap(x=>x.split(" ")).map((_,1)).reduceByKey(_+_).sortBy(_._2,false).collect
res19: Array[(String, Int)] = Array((hello,4), (jim,1), (jarry,1), (wo,1), (ni,1))

scala> sc.textFile("/spark/input/a.txt").flatMap(x=>x.split(" ")).map((_,1)).groupByKey.map(t=>(t._1, t._2.sum)).collect
res20: Array[(String, Int)] = Array((jim,1), (jarry,1), (wo,1), (hello,4), (ni,1))

##cogroup: for each key, collects the values from both RDDs into a pair of Iterables; a key missing from one side gets an empty collection
scala> val rdd1 = sc.parallelize(List(("tom", 1),("tom", 2),("jerry", 3), ("kitty", 4)))
rdd1: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[75] at parallelize at <console>:24

scala> val rdd2 = sc.parallelize(List(("jerry", 2),("tom", 1),("shuke", 2)))
rdd2: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[76] at parallelize at <console>:24

scala> val rdd3 = rdd1.cogroup(rdd2)
rdd3: org.apache.spark.rdd.RDD[(String, (Iterable[Int], Iterable[Int]))] = MapPartitionsRDD[78] at cogroup at <console>:28

scala> rdd3.collect
res21: Array[(String, (Iterable[Int], Iterable[Int]))] = Array((tom,(CompactBuffer(1, 2),CompactBuffer(1))), (jerry,(CompactBuffer(3),CompactBuffer(2))), (shuke,(CompactBuffer(),CompactBuffer(2))), (kitty,(CompactBuffer(4),CompactBuffer())))

scala> val rdd4 = rdd3.map(t=>(t._1, t._2._1.sum+t._2._2.sum))
rdd4: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[79] at map at <console>:30

scala> rdd4.collect
res22: Array[(String, Int)] = Array((tom,4), (jerry,5), (shuke,2), (kitty,4))
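
#The same per-key totals can be computed without cogroup by unioning the two pair RDDs and reducing. A sketch, not from the original session:

scala> rdd1.union(rdd2).reduceByKey(_ + _).collect
#expected (order may vary): Array((tom,4), (jerry,5), (shuke,2), (kitty,4))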

##cartesian: computes the Cartesian product
scala> val rdd1 = sc.parallelize(List("tom", "jerry"))
rdd1: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[80] at parallelize at <console>:24

scala> val rdd2 = sc.parallelize(List("tom", "kitty", "shuke"))
rdd2: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[82] at parallelize at <console>:24

scala> val rdd3 = rdd1.cartesian(rdd2)
rdd3: org.apache.spark.rdd.RDD[(String, String)] = CartesianRDD[83] at cartesian at <console>:28


##Spark actions: these trigger job execution and return results to the driver
scala> rdd3.collect
res23: Array[(String, String)] = Array((tom,tom), (tom,kitty), (tom,shuke), (jerry,tom), (jerry,kitty), (jerry,shuke))
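
#cartesian pairs every element of rdd1 with every element of rdd2, so the result size is the product of the two counts; it gets expensive quickly on large RDDs. A sketch, not from the original session:

scala> rdd3.count
#expected: Long = 6   (2 elements x 3 elements)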

scala> val rdd1 = sc.parallelize(List(1,2,3,4,5), 2)
rdd1: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[84] at parallelize at <console>:24

scala> rdd1.collect
res24: Array[Int] = Array(1, 2, 3, 4, 5)

scala> rdd1.count
res25: Long = 5

scala> rdd1.top(2)
res26: Array[Int] = Array(5, 4)

scala> rdd1.take(2)
res27: Array[Int] = Array(1, 2)

scala> rdd1.first
res28: Int = 1

scala> rdd1.takeOrdered(3)
res29: Array[Int] = Array(1, 2, 3)
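
#Another common action is reduce, which aggregates all elements with a binary function. A sketch, not from the original session:

scala> rdd1.reduce(_ + _)
#expected: Int = 15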

 
