-
基礎轉換操作
-
鍵值轉換操作
鍵值轉換操作
-
cogroup[W](other: RDD[(K, W)]): RDD[(K, (Iterable[V], Iterable[W]))]
-
cogroup[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, (Iterable[V], Iterable[W]))]
-
cogroup[W](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (Iterable[V], Iterable[W]))]
-
cogroup[W1, W2](other1: RDD[(K, W1)], other2: RDD[(K, W2)]): RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2]))]
-
cogroup[W1, W2](other1: RDD[(K, W1)], other2: RDD[(K, W2)], numPartitions: Int): RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2]))]
-
cogroup[W1, W2](other1: RDD[(K, W1)], other2: RDD[(K, W2)], partitioner: Partitioner): RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2]))]
-
cogroup[W1, W2, W3](other1: RDD[(K, W1)], other2: RDD[(K, W2)], other3: RDD[(K, W3)]): RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2], Iterable[W3]))]
-
cogroup[W1, W2, W3](other1: RDD[(K, W1)], other2: RDD[(K, W2)], other3: RDD[(K, W3)], numPartitions: Int): RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2], Iterable[W3]))]
-
cogroup[W1, W2, W3](other1: RDD[(K, W1)], other2: RDD[(K, W2)], other3: RDD[(K, W3)], partitioner: Partitioner): RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2], Iterable[W3]))]
cogroup類似於SQL中的全外連接,返回左右RDD中的記錄,關聯不上的爲空
scala> var rdd1 = sc.makeRDD(Array(("A", "1"), ("B", "2"), ("C", "3")), 2)
rdd1: org.apache.spark.rdd.RDD[(String, String)] = ParallelCollectionRDD[26] at makeRDD at <console>:24
scala> var rdd2 = sc.makeRDD(Array(("A", "a"), ("C", "c"), ("D", "d")), 2)
rdd2: org.apache.spark.rdd.RDD[(String, String)] = ParallelCollectionRDD[27] at makeRDD at <console>:24
scala> var rdd3 = sc.makeRDD(Array(("A", "A"), ("E", "E")), 2)
rdd3: org.apache.spark.rdd.RDD[(String, String)] = ParallelCollectionRDD[28] at makeRDD at <console>:24
scala> rdd1.cogroup(rdd2).collect
res26: Array[(String, (Iterable[String], Iterable[String]))] = Array((B,(CompactBuffer(2),CompactBuffer())), (D,(CompactBuffer(),CompactBuffer(d))), (A,(CompactBuffer(1),CompactBuffer(a))), (C,(CompactBuffer(3),CompactBuffer(c))))
scala> rdd1.cogroup(rdd2, rdd3).collect
res27: Array[(String, (Iterable[String], Iterable[String], Iterable[String]))] = Array((B,(CompactBuffer(2),CompactBuffer(),CompactBuffer())), (D,(CompactBuffer(),CompactBuffer(d),CompactBuffer())), (A,(CompactBuffer(1),CompactBuffer(a),CompactBuffer(A))), (C,(CompactBuffer(3),CompactBuffer(c),CompactBuffer())), (E,(CompactBuffer(),CompactBuffer(),CompactBuffer(E))))
-
join[W](other: RDD[(K, W)]): RDD[(K, (V, W))]
-
join[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, (V, W))]
-
join[W](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (V, W))]
-
fullOuterJoin[W](other: RDD[(K, W)]): RDD[(K, (Option[V], Option[W]))]
-
fullOuterJoin[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, (Option[V], Option[W]))]
-
fullOuterJoin[W](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (Option[V], Option[W]))]
-
leftOuterJoin[W](other: RDD[(K, W)]): RDD[(K, (V, Option[W]))]
-
leftOuterJoin[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, (V, Option[W]))]
-
leftOuterJoin[W](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (V, Option[W]))]
-
rightOuterJoin[W](other: RDD[(K, W)]): RDD[(K, (Option[V], W))]
-
rightOuterJoin[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, (Option[V], W))]
-
rightOuterJoin[W](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (Option[V], W))]
join、fullOuterJoin、leftOuterJoin和rightOuterJoin操作對RDD[K, V]中K值相等的進行連接操作,分別對應內連接、全連接、左連接和右連接,其內部都是通過cogroup實現的。
scala> var rdd1 = sc.makeRDD(Array(("A", "1"), ("B", "2"), ("C", "3")), 2)
rdd1: org.apache.spark.rdd.RDD[(String, String)] = ParallelCollectionRDD[26] at makeRDD at <console>:24
scala> var rdd2 = sc.makeRDD(Array(("A", "a"), ("C", "c"), ("D", "d")), 2)
rdd2: org.apache.spark.rdd.RDD[(String, String)] = ParallelCollectionRDD[27] at makeRDD at <console>:24
scala> rdd1.join(rdd2).collect
res28: Array[(String, (String, String))] = Array((A,(1,a)), (C,(3,c)))
scala> rdd1.leftOuterJoin(rdd2).collect
res29: Array[(String, (String, Option[String]))] = Array((B,(2,None)), (A,(1,Some(a))), (C,(3,Some(c))))
scala> rdd1.rightOuterJoin(rdd2).collect
res30: Array[(String, (Option[String], String))] = Array((D,(None,d)), (A,(Some(1),a)), (C,(Some(3),c)))
scala> rdd1.fullOuterJoin(rdd2)
res31: org.apache.spark.rdd.RDD[(String, (Option[String], Option[String]))] = MapPartitionsRDD[46] at fullOuterJoin at <console>:28
scala> rdd1.fullOuterJoin(rdd2).collect
res32: Array[(String, (Option[String], Option[String]))] = Array((B,(Some(2),None)), (D,(None,Some(d))), (A,(Some(1),Some(a))), (C,(Some(3),Some(c))))
-
subtractByKey[W](other: RDD[(K, W)]): RDD[(K, V)]
-
subtractByKey[W](other: RDD[(K, W)], p: Partitioner): RDD[(K, V)]
-
subtractByKey[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, V)]
subtractByKey操作類似於subtract,區別在於針對的是鍵值RDD[K, V]
scala> var rdd1 = sc.makeRDD(Array(("A", "1"), ("B", "2"), ("C", "3")), 2)
rdd1: org.apache.spark.rdd.RDD[(String, String)] = ParallelCollectionRDD[26] at makeRDD at <console>:24
scala> var rdd2 = sc.makeRDD(Array(("A", "a"), ("C", "c"), ("D", "d")), 2)
rdd2: org.apache.spark.rdd.RDD[(String, String)] = ParallelCollectionRDD[27] at makeRDD at <console>:24
scala> rdd1.subtractByKey(rdd2).collect
res33: Array[(String, String)] = Array((B,2))
scala> rdd2.subtractByKey(rdd1).collect
res34: Array[(String, String)] = Array((D,d))