使用SparkStreaming+SparkSQL實現在線動態計算出特定時間窗口下的不同種類商品中的熱門商品排名
1、Streaming+SQL技術實現解析
2、Streaming+SQL實現實戰
啓動hive metastore
hive --service metastore &
package com.tom.spark.sparkstreaming
import org.apache.spark.{SparkConf, rdd}
import org.apache.spark.sql.Row
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.streaming.{Durations, Seconds, StreamingContext}
/**
* 使用SparkStreaming+Spark SQL來在線動態計算電商中不同類別中最熱門的商品排名,例如手機這個類別下面最熱門的三種手機
* 電視這個類別下最熱門的三種電視,該實例在實際生產環境下具有非常重大的意義
*
* 實現技術:SparkStreaming+Spark SQL,之所以Spark Streaming能夠使用ML、SQL、graphX等功能是因爲有foreachRDD和transform
* 等接口,這些接口中其實是基於RDD進行操作的,所以以RDD爲基石,就可以直接使用Spark其他所有的功能,就像直接調用API一樣簡單。
* 假設說這裏的數據的格式:user item category,例如Rocky Samsung Android
*/
/**
 * Computes, over a sliding window, the top-3 most-clicked items per category
 * and persists each window's result to the `categorytop3` DB table.
 *
 * Input lines arrive on a socket in the form "user item category"
 * (e.g. "Rocky Samsung Android"). Streaming hands each window's data to
 * Spark SQL via foreachRDD, which is what lets us rank with a window function.
 */
object OnlineTop3ItemForEachCategory2DB {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("OnlineTop3ItemForEachCategory2DB").setMaster("local[2]")
    // The batch interval is the basic unit of job generation in Spark Streaming;
    // the window length and slide interval below must be integer multiples of it.
    val ssc = new StreamingContext(conf, Durations.seconds(5))
    // Checkpointing is required by reduceByKeyAndWindow's inverse-function form.
    ssc.checkpoint("/root/Documents/sparkApps/checkpoint")

    val userClickLogDStream = ssc.socketTextStream("Master", 9999)

    // Keep only well-formed "user item category" lines and key each click by
    // "category_item" with a count of 1.
    // FIX: the original built the key as fields(2) + "_" + fields(2) — the
    // category joined with itself — so the item was dropped and every ranking
    // was meaningless. The key must be category (index 2) + "_" + item (index 1).
    val formattedUserClickLogsDStream = userClickLogDStream
      .filter(_.split(" ").length == 3)
      .map { clickLog =>
        val fields = clickLog.split(" ")
        (fields(2) + "_" + fields(1), 1)
      }

    // 60-second window sliding every 20 seconds. The inverse function (_ - _)
    // lets Spark update the window incrementally instead of recomputing it.
    val categoryUserClickLogsDStream =
      formattedUserClickLogsDStream.reduceByKeyAndWindow(_ + _, _ - _, Seconds(60), Seconds(20))

    categoryUserClickLogsDStream.foreachRDD { rdd =>
      if (rdd.isEmpty()) {
        println("No data inputted!!!")
      } else {
        // Split each "category_item" key back apart and build Rows for SQL.
        val categoryItemRow = rdd.map { reducedItem =>
          val parts = reducedItem._1.split("_")
          Row(parts(0), parts(1), reducedItem._2)
        }
        val structType = StructType(Array(
          StructField("category", StringType, true),
          StructField("item", StringType, true),
          StructField("click_count", IntegerType, true)
        ))
        val hiveContext = new HiveContext(rdd.context)
        val categoryItemDF = hiveContext.createDataFrame(categoryItemRow, structType)
        categoryItemDF.registerTempTable("categoryItemTable")

        // Rank items within each category and keep the top 3.
        // FIX: the original concatenation yielded "...subqueryWHERE rank <=3"
        // (no space before WHERE), which is invalid SQL and fails at runtime.
        val resultDataFrame = hiveContext.sql(
          "SELECT category, item, click_count FROM " +
            "(SELECT category, item, click_count, " +
            "row_number() OVER (PARTITION BY category ORDER BY click_count DESC) rank " +
            "FROM categoryItemTable) subquery " +
            "WHERE rank <= 3")

        val resultRowRDD = resultDataFrame.rdd
        resultRowRDD.foreachPartition { partitionOfRecords =>
          // ConnectionPool is a static, lazily initialized pool of connections.
          if (partitionOfRecords.isEmpty) {
            println("This RDD is not null, but partition is null!!!")
          } else {
            val connection = ConnectionPool.getConnection()
            try {
              // FIX: the original created a new Statement for every record and
              // never closed any of them — a per-record resource leak. One
              // Statement per partition, closed in finally, is sufficient.
              // NOTE(review): string-built SQL is injection-prone; a
              // PreparedStatement would be safer if the input were untrusted.
              val stmt = connection.createStatement()
              try {
                partitionOfRecords.foreach { record =>
                  val sql = "insert into categorytop3(category, item, click_count) values ('" +
                    record.getAs("category") + "','" +
                    record.getAs("item") + "'," + record.getAs("click_count") + ")"
                  stmt.executeUpdate(sql)
                }
              } finally {
                stmt.close()
              }
            } finally {
              // FIX: return the connection even when an insert throws,
              // otherwise the pool leaks a connection per failed partition.
              ConnectionPool.returnConnection(connection)
            }
          }
        }
      }
    }
    // In production the per-window results would typically also be published to
    // Kafka for downstream consumers (e.g. a billing system) to pull.
    ssc.start()
    ssc.awaitTermination()
  }
}