做什麼?
統計需求三中得到的Top10熱門品類中的Top10活躍Session,對Top10熱門品類中的每個品類都取Top10活躍Session,評判活躍Session的指標是一個Session對一個品類的點擊次數。
需求解析
- 需要什麼?
Top10熱門品類中,每個品類的Top10活躍Session
- 依據什麼排序?
一個Session對一個品類的點擊次數
- 需要怎樣的原始數據?
用戶點擊的品類爲Top10熱門品類的其中一個(使用filter算子過濾)
- 怎麼做?
步驟解析
- 獲取點擊過Top10品類的所有用戶行爲
// 1. Collect the category ids of the top-10 hot categories into an array.
val top10Arr = top10Category.map { case (_, fullInfo) =>
  StringUtil.getFieldFromConcatString(fullInfo, "\\|", Constants.FIELD_CATEGORY_ID).toLong
}
// 2. Keep only the actions whose clicked category id is one of those ids.
val filterRDD = sessionId2FilterActionRDD.filter { case (_, visitAction) =>
  top10Arr.contains(visitAction.click_category_id)
}
- 按照sessionId分組聚合統計,配合yield將數據轉化爲(categoryId, str),其中str的格式爲"sessionId=count"
// 3. Group the filtered actions by sessionId and count, per session, how many
//    times each category was clicked. Emits (categoryId, "sessionId=count").
val GroupFilterRDD = filterRDD.groupByKey()
val cid2SessionCountRDD = GroupFilterRDD.flatMap { case (sessionId, actions) =>
  // categoryId -> number of clicks made by this session
  val countMap = new mutable.HashMap[Long, Long]
  for (action <- actions) {
    val cId = action.click_category_id
    countMap.update(cId, countMap.getOrElse(cId, 0L) + 1L)
  }
  // The string must be built from the grouped key `sessionId`. The original
  // used `session`, which is not the session id (in the full listing it is
  // the SparkSession parameter), so the encoded id was wrong.
  for ((cId, count) <- countMap) yield (cId, sessionId + "=" + count)
}
- 按照categoryId分組聚合,排序
// 4. Gather every "sessionId=count" entry under its category id.
val cid2GroupRDD = cid2SessionCountRDD.groupByKey()
// 5. Within each category, order the entries by click count (highest first)
//    and keep the first ten.
val top10ActiveSession = cid2GroupRDD.flatMap { case (cid, sessionCounts) =>
  // Each entry is encoded as "sessionId=count"; this reads the count part.
  def clicksOf(entry: String): Long = entry.split("=")(1).toLong

  val ranked = sessionCounts.toList.sortBy(clicksOf)(Ordering[Long].reverse)
  for (entry <- ranked.take(10)) yield {
    val fields = entry.split("=")
    Top10Session(taskUUID, cid, fields(0), fields(1).toLong)
  }
}
- 寫入數據庫
// 6. Write the result to the database through the JDBC data source.
// Uses the names defined elsewhere in this file: `session` is the SparkSession
// and `top10ActiveSession` is the RDD built in step 5 (the original snippet
// referred to the undefined `sparkSession` and `top10SessionRDD`).
import session.implicits._
top10ActiveSession.toDF().write
  .format("jdbc")
  .option("url", ConfigurationManager.config.getString(Constants.JDBC_URL))
  .option("user", ConfigurationManager.config.getString(Constants.JDBC_USER))
  .option("password", ConfigurationManager.config.getString(Constants.JDBC_PASSWORD))
  .option("dbtable", "top10_session_0308")
  .mode(SaveMode.Append)
  .save()
完整代碼:
package server
import commons.constant.Constants
import commons.model.{Top10Session, UserVisitAction}
import commons.utils.StringUtil
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import scala.collection.mutable
/**
 * Computes, for every top-10 hot category, the ten sessions that clicked that
 * category most often.
 */
class serverFour extends Serializable {

  /**
   * Finds the top-10 most active sessions for each of the given categories,
   * where activity is the number of clicks a session made on the category.
   *
   * @param session                   Spark session; only the disabled JDBC write at the end
   *                                  of this method would use it
   * @param taskUUID                  identifier copied into every produced Top10Session
   * @param sessionId2FilterActionRDD (sessionId, action) pairs to analyse
   * @param top10Category             (sortKey, info) pairs; the category id is read from
   *                                  `info` with StringUtil.getFieldFromConcatString
   * @return one Top10Session per (category, session) pair, at most ten per category
   */
  def top10ActiveSession(session: SparkSession, taskUUID: String,
                         sessionId2FilterActionRDD: RDD[(String, UserVisitAction)],
                         top10Category: Array[(SortKey, String)]): RDD[Top10Session] = {
    // 1. Category ids of the top-10 hot categories.
    val top10Arr = top10Category.map { case (_, info) =>
      StringUtil.getFieldFromConcatString(info, "\\|", Constants.FIELD_CATEGORY_ID).toLong
    }

    // 2. Keep only the actions whose clicked category is one of those ids.
    val filterRDD = sessionId2FilterActionRDD.filter { case (_, action) =>
      top10Arr.contains(action.click_category_id)
    }

    // 3. Group by sessionId and count the clicks per category inside each
    //    session. Emits (categoryId, "sessionId=count").
    val cid2SessionCountRDD = filterRDD.groupByKey().flatMap { case (sessionId, actions) =>
      val countMap = new mutable.HashMap[Long, Long]
      for (action <- actions) {
        val cId = action.click_category_id
        countMap.update(cId, countMap.getOrElse(cId, 0L) + 1L)
      }
      // Must use the grouped key `sessionId`; the original concatenated the
      // SparkSession parameter `session`, producing a wrong session id.
      for ((cId, count) <- countMap) yield (cId, sessionId + "=" + count)
    }

    // 4. Gather all "sessionId=count" entries of a category together.
    val cid2GroupRDD = cid2SessionCountRDD.groupByKey()

    // 5. Per category: order by click count, highest first, and keep ten.
    val top10SessionRDD = cid2GroupRDD.flatMap { case (cid, sessionCounts) =>
      // Decode each entry once, splitting on the last '=' so that a session id
      // containing '=' is still decoded correctly.
      val decoded = sessionCounts.toList.map { entry =>
        val sep = entry.lastIndexOf('=')
        (entry.substring(0, sep), entry.substring(sep + 1).toLong)
      }
      decoded
        .sortBy(_._2)(Ordering[Long].reverse)
        .take(10)
        .map { case (sessionId, count) => Top10Session(taskUUID, cid, sessionId, count) }
    }

    // NOTE(review): debug output kept from the original. It runs a Spark action
    // in addition to whatever the caller runs on the returned RDD.
    top10SessionRDD.foreach(println)

    // 6. Writing to the database is currently disabled:
    /* import session.implicits._
    top10SessionRDD.toDF().write
      .format("jdbc")
      .option("url", ConfigurationManager.config.getString(Constants.JDBC_URL))
      .option("user", ConfigurationManager.config.getString(Constants.JDBC_USER))
      .option("password", ConfigurationManager.config.getString(Constants.JDBC_PASSWORD))
      .option("dbtable", "top10_session_0308")
      .mode(SaveMode.Append)
      .save()*/

    top10SessionRDD
  }
}