What does it do?
Building on the cumulative per-province, per-city ad click counts computed in requirement 7, register a SparkSQL temporary table and query it with SQL to obtain each province's Top3 hottest ads.
Requirement analysis
- In requirement 7 we already computed, in real time, the ad click counts for every province and city, with keys in the format (date_province_city_adid).
- Now we only need to remap the key to (date_province_adid) and accumulate the counts with reduceByKey (see the sketch after this list).
- Finally, register a SparkSQL temporary table and do the ranking in SQL.
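To make the key remap concrete, here is a minimal sketch on a single hypothetical pair; the sample values are invented, only the date_province_city_adid layout comes from requirement 7:

// Hypothetical input pair in the date_province_city_adid format.
val sample = ("20240101_Hebei_Shijiazhuang_3", 12L)
val (key, count) = sample
val parts = key.split("_")
// Drop the city segment (index 2); keep date, province and adid.
val remapped = (parts(0) + "_" + parts(1) + "_" + parts(3), count)
// remapped == ("20240101_Hebei_3", 12)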
Step-by-step:
1. Remap the key format
//1. Remap the key to date_province_adid; the value is still the original count.
val key2ProvinceCountDStream = key2ProvinceCityCountDStream.map {
  case (key, count) =>
    val keySplit = key.split("_")
    val date = keySplit(0)
    val province = keySplit(1)
    val adid = keySplit(3) // keySplit(2) is the city, which we drop
    (date + "_" + province + "_" + adid, count)
}
2. Aggregate and register a temporary table
//2. Aggregate the counts, then register a temporary table.
val key2ProvinceAggCountDStream = key2ProvinceCountDStream.reduceByKey(_ + _)
val top3DStream = key2ProvinceAggCountDStream.transform {
  rdd =>
    // Split the key back into columns so the RDD can become a DataFrame.
    val temp = rdd.map {
      case (key, count) =>
        val keySplit = key.split("_")
        val date = keySplit(0)
        val province = keySplit(1)
        val adid = keySplit(2).toLong
        (date, province, adid, count)
    }
    import sparkSession.implicits._
    temp.toDF("date", "province", "adid", "count").createOrReplaceTempView("tmp_basic_info")
    // row_number() numbers the ads within each (date, province) partition by
    // descending count; keeping rank <= 3 yields the Top3 ads per province.
    val sql = "select date, province, adid, count from (" +
      "select date, province, adid, count, " +
      "row_number() over (partition by date, province order by count desc) rank " +
      "from tmp_basic_info) t " +
      "where rank <= 3"
    sparkSession.sql(sql).rdd
}
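As a side note, the same Top3 ranking can be written with the DataFrame API instead of a SQL string. A minimal sketch, assuming basicInfoDF stands for the DataFrame produced by toDF(...) above (the name is ours, not from the original code):

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{col, row_number}

// Same window as in the SQL: partition by (date, province), order by count desc.
val w = Window.partitionBy("date", "province").orderBy(col("count").desc)
val top3DF = basicInfoDF
  .withColumn("rank", row_number().over(w))
  .where(col("rank") <= 3)
  .drop("rank")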
3. Wrap the rows and write them to the database
//3. Wrap each Row into an AdProvinceTop3 bean.
top3DStream.foreachRDD {
  // rdd : RDD[Row], one batch of Top3 results
  rdd =>
    rdd.foreachPartition {
      // items : Iterator[Row] for this partition
      items =>
        val top3Array = new ArrayBuffer[AdProvinceTop3]()
        for (item <- items) {
          val date = item.getAs[String]("date")
          val province = item.getAs[String]("province")
          val adid = item.getAs[Long]("adid")
          val count = item.getAs[Long]("count")
          top3Array += AdProvinceTop3(date, province, adid, count)
        }
        // Note: this println runs on the executors, so output appears in executor logs.
        top3Array.foreach(println)
        //AdProvinceTop3DAO.updateBatch(top3Array.toArray)
    }
}
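AdProvinceTop3 and AdProvinceTop3DAO are defined elsewhere in the project. Inferred from the fields used above, they presumably look something like the following sketch; the field names and the DAO body are assumptions, not the project's actual definitions:

// Assumed shape, inferred from the getAs calls above.
case class AdProvinceTop3(date: String, province: String, adid: Long, count: Long)

object AdProvinceTop3DAO {
  // Assumed signature: batch-upsert Top3 rows into the database via JDBC.
  def updateBatch(records: Array[AdProvinceTop3]): Unit = {
    // JDBC upsert elided
  }
}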
Complete code
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.DStream

import scala.collection.mutable.ArrayBuffer

def proveinceTope3Adver(sparkSession: SparkSession,
                        key2ProvinceCityCountDStream: DStream[(String, Long)]) = {
  //1. Remap the key to date_province_adid; the value is still the original count.
  val key2ProvinceCountDStream = key2ProvinceCityCountDStream.map {
    case (key, count) =>
      val keySplit = key.split("_")
      val date = keySplit(0)
      val province = keySplit(1)
      val adid = keySplit(3) // keySplit(2) is the city, which we drop
      (date + "_" + province + "_" + adid, count)
  }
  //2. Aggregate the counts, then register a temporary table.
  val key2ProvinceAggCountDStream = key2ProvinceCountDStream.reduceByKey(_ + _)
  val top3DStream = key2ProvinceAggCountDStream.transform {
    rdd =>
      val temp = rdd.map {
        case (key, count) =>
          val keySplit = key.split("_")
          val date = keySplit(0)
          val province = keySplit(1)
          val adid = keySplit(2).toLong
          (date, province, adid, count)
      }
      import sparkSession.implicits._
      temp.toDF("date", "province", "adid", "count").createOrReplaceTempView("tmp_basic_info")
      val sql = "select date, province, adid, count from (" +
        "select date, province, adid, count, " +
        "row_number() over (partition by date, province order by count desc) rank " +
        "from tmp_basic_info) t " +
        "where rank <= 3"
      sparkSession.sql(sql).rdd
  }
  //3. Wrap each Row into an AdProvinceTop3 bean.
  top3DStream.foreachRDD {
    // rdd : RDD[Row]
    rdd =>
      rdd.foreachPartition {
        // items : Iterator[Row]
        items =>
          val top3Array = new ArrayBuffer[AdProvinceTop3]()
          for (item <- items) {
            val date = item.getAs[String]("date")
            val province = item.getAs[String]("province")
            val adid = item.getAs[Long]("adid")
            val count = item.getAs[Long]("count")
            top3Array += AdProvinceTop3(date, province, adid, count)
          }
          top3Array.foreach(println)
          //AdProvinceTop3DAO.updateBatch(top3Array.toArray)
      }
  }
}
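Finally, a sketch of a call site for testing the function locally. Everything here (app name, batch interval, the hand-made queue stream) is illustrative; in the actual project key2ProvinceCityCountDStream comes from the Kafka click stream built in requirement 7:

import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable

// Local driver sketch: feed one hand-made batch through proveinceTope3Adver.
val sparkSession = SparkSession.builder()
  .appName("Top3Demo") // illustrative app name
  .master("local[2]")
  .getOrCreate()
val ssc = new StreamingContext(sparkSession.sparkContext, Seconds(5))
// In the real project this DStream is the requirement-7 aggregate from Kafka.
val queue = mutable.Queue(sparkSession.sparkContext.makeRDD(Seq(
  ("20240101_Hebei_Shijiazhuang_3", 12L),
  ("20240101_Hebei_Baoding_3", 30L),
  ("20240101_Hebei_Baoding_7", 8L))))
val key2ProvinceCityCountDStream = ssc.queueStream(queue)
proveinceTope3Adver(sparkSession, key2ProvinceCityCountDStream)
ssc.start()
ssc.awaitTermination()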