做什麼?
從Kafka獲取實時數據,對每個用戶的點擊次數進行累加並寫入MySQL,當一天之內一個用戶對一個廣告的點擊次數超過100次時,將用戶加入黑名單中。
需求解析
1.先從kafka得到數據,數據格式是:(timestamp province city userid adid),
2.接着統計該批次的數據中不同用戶對同一個廣告的點擊數量。注意key不是原始的timestamp,而是先把時間戳轉成「天」粒度的日期,以date_userId_adid爲key進行reduceByKey——這樣才能實現「一天之內」的累加
3.更新mysql
4.從mysql中讀取數據,過濾數據,剩下的是超過100次點擊的數據
5.更新Mysql的黑名單表
步驟解析
1.從Kafka中獲取數據,初步過濾數據:
// 5-second micro-batches on the existing SparkContext.
val ssc = new StreamingContext(sparkSession.sparkContext, Seconds(5))
// Broker list and topic names come from external configuration.
val kafka_brokers = ConfigurationManager.config.getString("kafka.broker.list")
val kafka_topics = ConfigurationManager.config.getString(Constants.KAFKA_TOPICS)
val kafkaParam = Map(
"bootstrap.servers" -> kafka_brokers,
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "0",
// auto.offset.reset:
//   latest:   use the group's committed offset if one exists; otherwise start from the newest records
//   earliest: use the group's committed offset if one exists; otherwise start from the oldest records
//   none:     use the group's committed offset if one exists; otherwise throw an error
"auto.offset.reset" -> "latest",
// Offsets are not auto-committed back to Kafka.
"enable.auto.commit" -> (false:java.lang.Boolean)
)
// adRealTimeDStream: DStream[RDD RDD RDD ...] RDD[message] message: key value
val adRealTimeDStream = KafkaUtils.createDirectStream[String, String](ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](Array(kafka_topics), kafkaParam)
)
// Keep only each record's value: a raw log line "timestamp province city userid adid".
val adReadTimeValueDStream=adRealTimeDStream.map(item=>item.value);
// Per batch: re-read the blacklist from MySQL (so new entries take effect on the
// next batch) and drop log lines produced by blacklisted users.
val adRealTimeFilterDstream=adReadTimeValueDStream.transform{
RDDS=>{
val blackList=AdBlacklistDAO.findAll();
val black=blackList.map(item=>item.userid);
RDDS.filter{
log=>{
// Field 3 of the space-separated log line is the user id.
val userId=log.split(" ")(3).toLong;
!black.contains(userId);
}
}
}
}
2.先統計每個用戶的點擊次數,key爲date_userId_adid(時間戳先轉成「天」粒度的日期,以便按天累加)
// Map each log line to (date_userId_adid, 1) so counts accumulate per user,
// per ad, per day, then sum the ones within this batch.
val key2NumDStream=adRealTimeFilterDstream.map {
case (log)=>{
val logSplit = log.split(" ")
val timeStamp = logSplit(0).toLong
// Collapse the millisecond timestamp to a day-granularity key
// (exact format per DateUtils.formatDateKey).
val dateKey = DateUtils.formatDateKey(new Date(timeStamp))
val userId = logSplit(3).toLong
val adid = logSplit(4).toLong
val key = dateKey + "_" + userId + "_" + adid
(key, 1L)
}
}
val keyCountStream=key2NumDStream.reduceByKey(_+_);
3.更新Mysql,
// Persist this batch's (date_userId_adid, count) pairs to MySQL, one DAO
// round-trip per partition.
keyCountStream.foreachRDD{
RDDS=>RDDS.foreachPartition{
part=>{
val clickCountArray=new ArrayBuffer[AdUserClickCount]();
for((k,v)<-part){
val keySplit = k.split("_")
val date = keySplit(0)
val userId = keySplit(1).toLong
val adid = keySplit(2).toLong
clickCountArray += AdUserClickCount(date, userId, adid, v)
}
// Skip the DAO call for empty partitions.
if (clickCountArray.size>0){
// NOTE(review): `flag` is a driver-side var mutated inside an executor
// closure; the assignment is never visible to the driver, so driver
// logic gated on `flag == 1` can never fire. The gate should be removed
// and the downstream stages wired unconditionally.
flag=1;
AdUserClickCountDAO.updateBatch1(clickCountArray.toArray);
}
}
}
}
4.對keyCountStream中的每個rdd,通過查詢數據庫,獲取累計點擊次數,從而進行過濾操作
// Keep only the keys whose accumulated click count in MySQL exceeds the
// 100-clicks-per-day threshold stated in the requirement. (The original
// compared against 10 — apparently a leftover test value — which would
// blacklist users far too aggressively.)
val filterKeyCountStream = keyCountStream.filter {
  case (key, count) => {
    val keySplit = key.split("_")
    val date = keySplit(0)
    val userId = keySplit(1).toLong
    val adid = keySplit(2).toLong
    // One MySQL lookup per key per batch; reduceByKey has already collapsed
    // each (date, user, ad) combination to a single record.
    val clickCount = AdUserClickCountDAO.findClickCountByMultiKey(date, userId, adid)
    if (clickCount > 100) {
      println("userID:" + userId + " exceeded the daily click threshold, blacklisting")
      true
    } else {
      false
    }
  }
}
5.將剩下的數據加入黑名單中
// Extract the offending user ids, deduplicate them within the batch, and
// insert them into the MySQL blacklist table.
val filterBlackListDstream=filterKeyCountStream.map{
case (key,count)=>{
// Field 1 of "date_userId_adid" is the user id.
key.split("_")(1).toLong
}
}.transform(rdds=>rdds.distinct());
filterBlackListDstream.foreachRDD{
rdds=>rdds.foreachPartition{
part=>{
val buffer=new ListBuffer[AdBlacklist];
for(userId<-part){
buffer+=AdBlacklist(userId);
}
// NOTE(review): assumes insertBatch ignores/updates ids already present in
// the table (users can exceed the threshold in multiple batches) — confirm.
AdBlacklistDAO.insertBatch(buffer.toArray)
}
}
}
完整代碼:
package scala
import java.util.Date
import commons.conf.ConfigurationManager
import commons.constant.Constants
import commons.model.{AdBlacklist, AdUserClickCount}
import commons.utils.DateUtils
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.collection.mutable.{ArrayBuffer, ListBuffer}
object advertStat {

  def main(args: Array[String]): Unit = {
    // Local Spark session; Kryo speeds up closure/RDD serialization.
    val sparkConf = new SparkConf()
      .setAppName("adver")
      .setMaster("local[*]")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val sparkSession = SparkSession.builder().config(sparkConf).enableHiveSupport().getOrCreate()
    sparkSession.sparkContext.setLogLevel("ERROR")

    // 5-second micro-batches.
    val ssc = new StreamingContext(sparkSession.sparkContext, Seconds(5))

    val kafka_brokers = ConfigurationManager.config.getString("kafka.broker.list")
    val kafka_topics = ConfigurationManager.config.getString(Constants.KAFKA_TOPICS)

    val kafkaParam = Map(
      "bootstrap.servers" -> kafka_brokers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "0",
      // auto.offset.reset:
      //   latest:   use the group's committed offset if present, otherwise start from the newest records
      //   earliest: use the group's committed offset if present, otherwise start from the oldest records
      //   none:     use the group's committed offset if present, otherwise fail
      "auto.offset.reset" -> "latest",
      // Offsets are not auto-committed back to Kafka.
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    // Each record value is a raw log line: "timestamp province city userid adid".
    val adRealTimeDStream = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Array(kafka_topics), kafkaParam)
    )
    val adReadTimeValueDStream = adRealTimeDStream.map(item => item.value)

    // Per batch: re-read the blacklist from MySQL (so new entries take effect
    // on the next batch) and drop log lines from blacklisted users.
    val adRealTimeFilterDstream = adReadTimeValueDStream.transform { rdd =>
      val blackUserIds = AdBlacklistDAO.findAll().map(_.userid)
      rdd.filter { log =>
        // Field 3 of the space-separated log line is the user id.
        !blackUserIds.contains(log.split(" ")(3).toLong)
      }
    }

    // Requirement 1: maintain the blacklist in real time.
    generateBlackList(adRealTimeFilterDstream)

    ssc.start()
    ssc.awaitTermination()
  }

  /**
   * Accumulates per-(day, user, ad) click counts into MySQL and adds users
   * whose count for a single ad exceeds 100 within one day to the MySQL
   * blacklist table.
   *
   * @param adRealTimeFilterDstream stream of raw log lines
   *                                ("timestamp province city userid adid"),
   *                                already filtered against the blacklist
   */
  def generateBlackList(adRealTimeFilterDstream: DStream[String]): Unit = {
    // 1. Map every log line to (date_userId_adid, 1) — the timestamp is
    // collapsed to a day key so counts accumulate per day — then sum per key
    // within the batch.
    val key2NumDStream = adRealTimeFilterDstream.map { log =>
      val logSplit = log.split(" ")
      val timeStamp = logSplit(0).toLong
      val dateKey = DateUtils.formatDateKey(new Date(timeStamp)) // day granularity
      val userId = logSplit(3).toLong
      val adid = logSplit(4).toLong
      (dateKey + "_" + userId + "_" + adid, 1L)
    }
    val keyCountStream = key2NumDStream.reduceByKey(_ + _)

    // 2. Persist the per-batch counts to MySQL (updateBatch1 accumulates).
    //
    // NOTE(review): the original code set a driver-side `var flag` inside this
    // closure and gated steps 3-4 behind `if (flag == 1)`. That gate was
    // evaluated once, at graph-construction time, when flag was still 0 — so
    // the blacklist stages were never registered with the StreamingContext and
    // the blacklist was never written. Even at runtime, mutations made inside
    // executor closures are not visible on the driver. Steps 3-4 are now wired
    // unconditionally; empty batches are cheap no-ops.
    keyCountStream.foreachRDD { rdd =>
      rdd.foreachPartition { partition =>
        val clickCounts = new ArrayBuffer[AdUserClickCount]()
        for ((key, count) <- partition) {
          val keySplit = key.split("_")
          clickCounts += AdUserClickCount(keySplit(0), keySplit(1).toLong, keySplit(2).toLong, count)
        }
        // Skip the DAO round-trip for empty partitions.
        if (clickCounts.nonEmpty) {
          AdUserClickCountDAO.updateBatch1(clickCounts.toArray)
        }
      }
    }

    // 3. Keep only the keys whose accumulated count in MySQL exceeds the
    // 100-clicks-per-day threshold from the requirement (the original compared
    // against 10, apparently a leftover test value).
    val filterKeyCountStream = keyCountStream.filter { case (key, _) =>
      val keySplit = key.split("_")
      val date = keySplit(0)
      val userId = keySplit(1).toLong
      val adid = keySplit(2).toLong
      val clickCount = AdUserClickCountDAO.findClickCountByMultiKey(date, userId, adid)
      if (clickCount > 100) {
        println("userID:" + userId + " exceeded the daily click threshold, blacklisting")
        true
      } else {
        false
      }
    }

    // 4. Extract the offending user ids, deduplicate within the batch, and
    // insert them into the blacklist table.
    val filterBlackListDstream = filterKeyCountStream
      .map { case (key, _) => key.split("_")(1).toLong }
      .transform(_.distinct())

    filterBlackListDstream.foreachRDD { rdd =>
      rdd.foreachPartition { partition =>
        val buffer = new ListBuffer[AdBlacklist]
        for (userId <- partition) {
          buffer += AdBlacklist(userId)
        }
        // NOTE(review): assumes insertBatch tolerates ids already present in
        // the table (a user can cross the threshold in multiple batches) — confirm.
        AdBlacklistDAO.insertBatch(buffer.toArray)
      }
    }
  }
}