數據結構概覽:
流程梳理:
1.讀取文件
2.抽取需要的列
3.以年月爲基礎,進行reduceByKey統計dongsi地區的PM
4.排序
5.獲取結果
代碼實現
package cn.ityuge.spark.rdd
import org.apache.ivy.util.StringUtils
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.Test
class PmDataStatistic {

  /**
   * Aggregates the Dongsi-station PM readings from the Beijing PM CSV.
   *
   * Pipeline:
   *   1. read the headerless CSV from the local filesystem
   *   2. project (year, month) -> PM_Dongsi (columns 1, 2 and 6)
   *   3. drop rows whose PM_Dongsi value is empty or "NA"
   *   4. sum the readings per (year, month) via reduceByKey
   *   5. sort descending by total and print the top 10
   */
  @Test
  def pmProcess: Unit = {
    val conf = new SparkConf().setAppName("pmDataStatistic").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      // 1. Read the raw file (local Windows path, no header row).
      val source = sc.textFile("file:///C:\\Users\\monster\\Desktop\\my_code\\spark\\data\\BeijingPM20100101_20151231_noheader.csv")

      // 2. Split each line exactly ONCE (the original called split(",")
      //    three times per record), skip malformed short rows, keep
      //    ((year, month), pm) pairs, drop empty/"NA" readings, then
      //    sum per month and sort by total, largest first.
      val resultData = source
        .map(_.split(","))
        .filter(_.length > 6) // guard: cols(6) on a short row would throw
        .map(cols => ((cols(1), cols(2)), cols(6)))
        .filter { case (_, pm) => !pm.isEmpty && !pm.equalsIgnoreCase("NA") }
        .map { case (key, pm) => (key, pm.toInt) }
        .reduceByKey(_ + _)
        .sortBy(_._2, ascending = false)

      // 3. Materialise only the 10 largest monthly totals.
      resultData.take(10).foreach(println)
    } finally {
      // Always release the SparkContext, even if the job fails,
      // so repeated test runs don't leak local Spark resources.
      sc.stop()
    }
  }
}