scala> val rawRDDA=sc.parallelize(List("!! bb ## cc","%% cc bb %%","cc && ++ aa"),3)
rawRDDA: org.apache.spark.rdd.RDD[String]= ParallelCollectionRDD[46] at parallelize at <console>:29
scala> val rawRDDB=sc.parallelize(List(("xx",99),("yy",88),("xx",99),("zz",99)),2)
rawRDDB: org.apache.spark.rdd.RDD[(String, Int)]= ParallelCollectionRDD[47] at parallelize at <console>:29
scala> val rawRDDC=sc.parallelize(List(("yy",88)),1)
rawRDDC: org.apache.spark.rdd.RDD[(String, Int)]= ParallelCollectionRDD[48] at parallelize at <console>:29
scala> import org.apache.spark.HashPartitioner
import org.apache.spark.HashPartitioner
scala>
scala> val tempResultRDDA=rawRDDA.flatMap(line=>line.split(" ")).filter(allword=>{allword.contains("aa")||allword.contains("bb")}).map(word=>(word,1)).partitionBy(new HashPartitioner(2)).groupByKey().map((P:(String,Iterable[Int]))=>(P._1,P._2.sum))// count occurrences of "aa"/"bb" words in A
tempResultRDDA: org.apache.spark.rdd.RDD[(String, Int)]= MapPartitionsRDD[54] at map at <console>:37
scala>
scala> val tempResultRDDBC = rawRDDB.distinct.subtract(rawRDDC)// remove C's pairs from (deduplicated) B
tempResultRDDBC: org.apache.spark.rdd.RDD[(String, Int)]= MapPartitionsRDD[61] at subtract at <console>:34
scala>
scala> val resultRDDABC = tempResultRDDA.union(tempResultRDDBC)// take the union
resultRDDABC: org.apache.spark.rdd.RDD[(String, Int)]= UnionRDD[62] at union at <console>:34
scala> resultRDDABC.collect
res14: Array[(String, Int)]=Array((aa,1),(bb,2),(xx,99),(zz,99))