RDD(Resilient Distributed Dataset,彈性分佈式數據集)是一個分佈式的不可變集合
#常用的Transformation(即轉換,延遲加載)
#通過並行化scala集合創建RDD
scala> val rdd1 = sc.parallelize(Array(1,2,3,4,5,6,7,8,9,10))
rdd1: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at parallelize at <console>:24
scala> rdd1.partition
partitioner partitions
scala> rdd1.partitions.length
length lengthCompare
#查看RDD分區數量
scala> rdd1.partitions.length
res0: Int = 4
scala> val rdd1 = sc.parallelize(List(1,2,3,4,5))
rdd1: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[1] at parallelize at <console>:24
scala> val rdd1 = sc.parallelize(List(1,2,3,4,5)).map(_*2).sortBy(x=>x,true)
rdd1: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[8] at sortBy at <console>:24
scala> rdd1.filter(_>3)
res1: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[9] at filter at <console>:27
scala> val rdd2 = sc.parallelize(List(1,2,3,4,5)).map(_*2).sortBy(x=>x+"",true)
rdd2: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[16] at sortBy at <console>:24
scala> val rdd4 = sc.parallelize(Array("a a1","b b2 b3","c","d e"))
rdd4: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[17] at parallelize at <console>:24
scala> rdd4.flatMap(_.split(" ")).collect
res2: Array[String] = Array(a, a1, b, b2, b3, c, d, e)
scala> val rdd5 = sc.parallelize(List(List("a a1","b b2 b3","c","d e"),List("a a1","b b2 b3","c","d e")))
rdd5: org.apache.spark.rdd.RDD[List[String]] = ParallelCollectionRDD[19] at parallelize at <console>:24
scala> rdd5.flatMap(_.flatMap(_.split(" "))).collect
res3: Array[String] = Array(a, a1, b, b2, b3, c, d, e, a, a1, b, b2, b3, c, d, e)
scala> rdd5.map(_.flatMap(_.split(" "))).collect
res4: Array[List[String]] = Array(List(a, a1, b, b2, b3, c, d, e), List(a, a1, b, b2, b3, c, d, e))
##union求並集,注意類型要一致
scala> val rdd6 = sc.parallelize(List(1,2,3,4,5))
rdd6: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[22] at parallelize at <console>:24
scala> val rdd7 = sc.parallelize(List(3,4,5,6,7))
rdd7: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[23] at parallelize at <console>:24
scala> val rdd8 = rdd6.union(rdd7)
rdd8: org.apache.spark.rdd.RDD[Int] = UnionRDD[24] at union at <console>:28
scala> rdd8.distinct.sortBy(x=>x).collect
res5: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7)
##intersection求交集
scala> val rdd9 = rdd6.intersection(rdd7)
rdd9: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[38] at intersection at <console>:28
scala> rdd9.collect
res6: Array[Int] = Array(4, 5, 3)
##join(內連接):按key關聯兩個RDD,只保留兩邊都存在的key
scala> val rdd1 = sc.parallelize(List(("tom",1),("jerry",2),("kitty", 3)))
rdd1: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[39] at parallelize at <console>:24
scala> val rdd2 = sc.parallelize(List(("jerry", 9),("shuke", 7), ("tom", 2)))
rdd2: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[40] at parallelize at <console>:24
scala> val rdd3 = rdd1.join(rdd2)
rdd3: org.apache.spark.rdd.RDD[(String, (Int, Int))] = MapPartitionsRDD[43] at join at <console>:28
scala> rdd3.collect
res7: Array[(String, (Int, Int))] = Array((tom,(1,2)), (jerry,(2,9)))
scala> val rdd3 = rdd1.leftOuterJoin(rdd2)
rdd3: org.apache.spark.rdd.RDD[(String, (Int, Option[Int]))] = MapPartitionsRDD[46] at leftOuterJoin at <console>:28
scala> rdd3.collect
res8: Array[(String, (Int, Option[Int]))] = Array((tom,(1,Some(2))), (jerry,(2,Some(9))), (kitty,(3,None)))
scala> val rdd3 = rdd1.rightOuterJoin(rdd2)
rdd3: org.apache.spark.rdd.RDD[(String, (Option[Int], Int))] = MapPartitionsRDD[49] at rightOuterJoin at <console>:28
scala> rdd3.collect
res9: Array[(String, (Option[Int], Int))] = Array((tom,(Some(1),2)), (jerry,(Some(2),9)), (shuke,(None,7)))
scala> val rdd3 = rdd1 union rdd2
rdd3: org.apache.spark.rdd.RDD[(String, Int)] = UnionRDD[50] at union at <console>:28
scala> rdd3.collect
res10: Array[(String, Int)] = Array((tom,1), (jerry,2), (kitty,3), (jerry,9), (shuke,7), (tom,2))
##groupByKey
scala> rdd3.groupByKey
res11: org.apache.spark.rdd.RDD[(String, Iterable[Int])] = ShuffledRDD[51] at groupByKey at <console>:31
scala> rdd3.groupByKey.collect
res12: Array[(String, Iterable[Int])] = Array((tom,CompactBuffer(1, 2)), (shuke,CompactBuffer(7)), (kitty,CompactBuffer(3)), (jerry,CompactBuffer(2, 9)))
scala> rdd3.groupByKey.map(x=>(x._1,x._2.sum)).collect
res15: Array[(String, Int)] = Array((tom,3), (shuke,7), (kitty,3), (jerry,11))
scala> rdd3.groupByKey.mapValues(_.sum).collect
res16: Array[(String, Int)] = Array((tom,3), (shuke,7), (kitty,3), (jerry,11))
##WordCount
scala> sc.textFile("/spark/input/a.txt").flatMap(x=>x.split(" ")).map((_,1)).reduceByKey(_+_).sortBy(_._2,false).collect
res19: Array[(String, Int)] = Array((hello,4), (jim,1), (jarry,1), (wo,1), (ni,1))
scala> sc.textFile("/spark/input/a.txt").flatMap(x=>x.split(" ")).map((_,1)).groupByKey.map(t=>(t._1, t._2.sum)).collect
res20: Array[(String, Int)] = Array((jim,1), (jarry,1), (wo,1), (hello,4), (ni,1))
#cogroup
scala> val rdd1 = sc.parallelize(List(("tom", 1),("tom", 2),("jerry", 3), ("kitty", 4)))
rdd1: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[75] at parallelize at <console>:24
scala> val rdd2 = sc.parallelize(List(("jerry", 2),("tom", 1),("shuke", 2)))
rdd2: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[76] at parallelize at <console>:24
scala> val rdd3 = rdd1.cogroup(rdd2)
rdd3: org.apache.spark.rdd.RDD[(String, (Iterable[Int], Iterable[Int]))] = MapPartitionsRDD[78] at cogroup at <console>:28
scala> rdd3.collect
res21: Array[(String, (Iterable[Int], Iterable[Int]))] = Array((tom,(CompactBuffer(1, 2),CompactBuffer(1))), (jerry,(CompactBuffer(3),CompactBuffer(2))), (shuke,(CompactBuffer(),CompactBuffer(2))), (kitty,(CompactBuffer(4),CompactBuffer())))
scala> val rdd4 = rdd3.map(t=>(t._1, t._2._1.sum+t._2._2.sum))
rdd4: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[79] at map at <console>:30
scala> rdd4.collect
res22: Array[(String, Int)] = Array((tom,4), (jerry,5), (shuke,2), (kitty,4))
##cartesian求笛卡爾積
scala> val rdd1 = sc.parallelize(List("tom", "jerry"))
rdd1: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[80] at parallelize at <console>:24
scala> val rdd2 = sc.parallelize(List(("jerry", 2),("tom", 1),("shuke", 2)))
rdd2: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[81] at parallelize at <console>:24
scala> val rdd2 = sc.parallelize(List("tom", "kitty", "shuke"))
rdd2: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[82] at parallelize at <console>:24
scala> val rdd3 = rdd1.cartesian(rdd2)
rdd3: org.apache.spark.rdd.RDD[(String, String)] = CartesianRDD[83] at cartesian at <console>:28
##spark action
scala> rdd3.collect
res23: Array[(String, String)] = Array((tom,tom), (tom,kitty), (tom,shuke), (jerry,tom), (jerry,kitty), (jerry,shuke))
scala> val rdd1 = sc.parallelize(List(1,2,3,4,5), 2)
rdd1: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[84] at parallelize at <console>:24
scala> rdd1.collect
res24: Array[Int] = Array(1, 2, 3, 4, 5)
scala> rdd1.count
res25: Long = 5
scala> rdd1.top(2)
res26: Array[Int] = Array(5, 4)
scala> rdd1.take(2)
res27: Array[Int] = Array(1, 2)
scala> rdd1.first
res28: Int = 1
scala> rdd1.takeOrdered(3)
res29: Array[Int] = Array(1, 2, 3)