- wordcount: 統計詞頻, 排序
- 歷年溫度最值: max, min, avg
part1: spark rdd 之 wordcount
spark-shell
//word.txt
// WordCount in spark-shell: count word frequencies, then sort two ways.
// Input file contains whitespace-separated words, e.g. "a b c e f e f".
val file = "/home/wang/txt/word.txt"
import org.apache.spark.rdd.RDD
val rdd1: RDD[String] = sc.textFile(file)
// Split lines into words, drop empty tokens (repeated spaces), pair each word with 1.
// NOTE: element type is (String, Int) — the original's RDD[(String,Integer)] annotation
// does not conform (RDD is invariant and Int is not java.lang.Integer).
val rdd2: RDD[(String, Int)] = rdd1
  .flatMap(_.split(" "))
  .filter(_.trim.nonEmpty)
  .map((_, 1))
// Sum the 1s per word: (word, count), e.g. (e,2), (b,1)
val rdd3: RDD[(String, Int)] = rdd2.reduceByKey(_ + _)
val arr = rdd3.collect()
// Sorted by word, ascending, e.g. (435,1), (46,1), (a,1)
val wordsRdd: RDD[(String, Int)] = rdd3.sortByKey(ascending = true)
val arr1: Array[(String, Int)] = wordsRdd.collect()
// Sorted by frequency, descending, e.g. (e,4), (f,2)
val countRdd: RDD[(String, Int)] = rdd3.sortBy(_._2, ascending = false)
val arr2: Array[(String, Int)] = countRdd.collect()
scala api: 同樣的 wordcount 以 Scala 獨立程式 (maven 專案) 執行
<!--maven依賴 -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.1.0</version>
</dependency>
object Wc {
  /**
   * Entry point for the standalone word-count job.
   * Builds a local-mode SparkContext; the RDD pipeline from the
   * spark-shell snippet above goes where the placeholder comment is.
   */
  def main(args: Array[String]): Unit = {
    // "local" = single-JVM execution; app name appears in the Spark UI.
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("wc")
    // SparkContext is the entry point for creating RDDs.
    val sc = new SparkContext(conf)
    // ... the spark-shell word-count code from above ...
    sc.stop() // release executor resources when done
  }
}
part2: spark rdd 之 maxTemp
思路1:groupByKey
//( 年, 溫度)=> groupByKey
//tmp.txt
// Approach 1: groupByKey — gather all readings per year, then aggregate each group.
// Input file lines look like "1982,-8" (year,temperature).
val file = "/home/wang/txt/tmp.txt"
import org.apache.spark.rdd.RDD
// Parse each line into (year, temperature). Uses Int (the original's
// RDD[(Integer,Integer)] annotation does not conform to what map produces).
val rdd1: RDD[(Int, Int)] = sc.textFile(file).map { line =>
  val fields = line.split(",")
  (fields(0).toInt, fields(1).toInt) // e.g. (1982,-8)
}
// All temperatures for a year in one group, e.g. (1984,CompactBuffer(-4, 26)).
val rdd2: RDD[(Int, Iterable[Int])] = rdd1.groupByKey()
// Per year: (year, max, min, avg). Groups produced by groupByKey are never
// empty, so max/min/sum are safe here.
val rdd3: RDD[(Int, Int, Int, Double)] = rdd2.map { case (year, temps) =>
  val avg = temps.sum / temps.size.toDouble
  (year, temps.max, temps.min, avg) // e.g. (1984,26,-4,11.0)
}
val arr: Array[(Int, Int, Int, Double)] = rdd3.collect()
思路2: reduceByKey
// (年, max溫度, min溫度, 溫度sum, 溫度count )=> reduceByKey(聚合函數)
// Approach 2: reduceByKey — carry (max, min, sum, count) per year and merge
// partial aggregates pairwise (avoids shuffling every reading like groupByKey).
// Input file lines look like "1982,-8" (year,temperature).
val file = "/home/wang/txt/tmp.txt"
import org.apache.spark.rdd.RDD
// Seed each reading as (year, (max, min, sum, count)) = (year, (tmp, tmp, tmp, 1)).
val rdd1: RDD[(Int, (Int, Int, Int, Int))] = sc.textFile(file).map { line =>
  val fields = line.split(",")
  val year = fields(0).toInt
  val tmp  = fields(1).toInt
  (year, (tmp, tmp, tmp, 1))
}
// Merge two partial aggregates for the same year.
// BUG FIX vs original: `max`/`min` were unqualified (not in scope — must be
// math.max/math.min), and the sum/count fields were swapped (the original's
// `size = _3 + _3` actually summed temperatures and `sum = _4 + _4` counted),
// which made the downstream average count/sum (e.g. 0.091 instead of 11.0).
val rdd2: RDD[(Int, (Int, Int, Int, Int))] = rdd1.reduceByKey { (a, b) =>
  val maxTmp = math.max(a._1, b._1)
  val minTmp = math.min(a._2, b._2)
  val sum    = a._3 + b._3 // running sum of temperatures
  val count  = a._4 + b._4 // number of readings
  (maxTmp, minTmp, sum, count) // e.g. (1984,(26,-4,22,2))
}
// Final shape: (year, (max, min, avg)), average rounded to 3 decimal places.
val rdd3: RDD[(Int, (Int, Int, Double))] = rdd2.map { case (year, (maxTmp, minTmp, sum, count)) =>
  val avg = (sum / count.toDouble).formatted("%.3f").toDouble
  (year, (maxTmp, minTmp, avg)) // e.g. (1984,(26,-4,11.0))
}
// BUG FIX vs original: the element type must match rdd3's nested tuple shape.
val arr: Array[(Int, (Int, Int, Double))] = rdd3.collect()