Spark Real-Time Project, Day 6 – Joining the Order Table with the User Dimension Table

Create the HBase table (via Phoenix)

create table gmall_user_info (id varchar primary key, info.user_level varchar, info.birthday varchar, info.gender varchar, info.age_group varchar, info.gender_name varchar) SALT_BUCKETS = 3
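
To confirm the table was created, you can open the Phoenix sqlline shell. This is a minimal sanity check; it assumes sqlline.py is on the PATH and uses the same ZooKeeper quorum as the code below:

sqlline.py hadoop102,hadoop103,hadoop104:2181
!tables
select * from GMALL_USER_INFO limit 5;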

UserInfo

In scala\com\atguigu\gmall\realtime\bean\UserInfo.scala

case class UserInfo(id: String, user_level: String, birthday: String, gender: String,
                    var age_group: String,
                    var gender_name: String)

UserInfoApp

import java.text.SimpleDateFormat
import java.util

import com.alibaba.fastjson.JSON
import com.atguigu.gmall.realtime.bean.UserInfo
import com.atguigu.gmall.realtime.utils.{MyKafkaUtil, OffsetManagerUtil}
import org.apache.hadoop.conf.Configuration
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange}


object UserInfoApp {

  def main(args: Array[String]): Unit = {

    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("dim_user_info_app")

    val ssc = new StreamingContext(sparkConf, Seconds(5))
    val topic = "ODS_T_USER_INFO";
    val groupId = "gmall_user_info_group"


    ///////////////////// Offset handling ///////////////////////////
    val offset: Map[TopicPartition, Long] = OffsetManagerUtil.getOffset(groupId, topic)

    var inputDstream: InputDStream[ConsumerRecord[String, String]] = null
    // If a saved offset exists in Redis, resume the Kafka stream from it; otherwise read from Kafka's default (latest) offsets
    if (offset != null && offset.size > 0) {
      inputDstream = MyKafkaUtil.getKafkaStream(topic, ssc, offset, groupId)
      //startInputDstream.map(_.value).print(1000)
    } else {
      inputDstream = MyKafkaUtil.getKafkaStream(topic, ssc, groupId)
    }

    // Capture the offset ranges of each batch
    var offsetRanges: Array[OffsetRange] = null
    val inputGetOffsetDstream: DStream[ConsumerRecord[String, String]] = inputDstream.transform { rdd =>
      offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd
    }

    val userInfoDstream: DStream[UserInfo] = inputGetOffsetDstream.map { record =>
      val userInfoJsonStr: String = record.value()
      val userInfo: UserInfo = JSON.parseObject(userInfoJsonStr, classOf[UserInfo])
      // Derive the age group from the birthday
      val formatter = new SimpleDateFormat("yyyy-MM-dd")
      val date: util.Date = formatter.parse(userInfo.birthday)
      val curTs: Long = System.currentTimeMillis()
      val betweenMs = curTs - date.getTime // milliseconds between the two timestamps
      val age = betweenMs / 1000L / 60L / 60L / 24L / 365L
      if (age < 20) {
        userInfo.age_group = "20歲及以下" // 20 and under
      } else if (age > 30) {
        userInfo.age_group = "30歲以上" // over 30
      } else {
        userInfo.age_group = "21歲到30歲" // 21 to 30
      }
      // Map the gender code to a display name
      if (userInfo.gender == "M") {
        userInfo.gender_name = "男" // male
      } else {
        userInfo.gender_name = "女" // female
      }
      userInfo
    }

    userInfoDstream.foreachRDD { rdd =>
      // Write the user dimension records to the GMALL_USER_INFO table in HBase through Phoenix
      import org.apache.phoenix.spark._
      rdd.saveToPhoenix("GMALL_USER_INFO", Seq("ID", "USER_LEVEL", "BIRTHDAY", "GENDER", "AGE_GROUP", "GENDER_NAME"),
        new Configuration, Some("hadoop102,hadoop103,hadoop104:2181"))

      // Commit the offsets only after the batch has been written
      OffsetManagerUtil.saveOffset(groupId, topic, offsetRanges)
    }

    ssc.start()
    ssc.awaitTermination()

  }

}

Initialize the historical data with maxwell-bootstrap

bin/maxwell-bootstrap --user maxwell  --password 123123 --host hadoop102  --database spark_gmall  --table user_info  --client_id maxwell_1
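
If the ODS splitting job from the earlier days is still running, the bootstrapped rows are forwarded to the ODS_T_USER_INFO topic. A quick check (a sketch that assumes a Kafka broker at hadoop102:9092 and the Kafka CLI scripts on the PATH):

kafka-console-consumer.sh --bootstrap-server hadoop102:9092 --topic ODS_T_USER_INFO --from-beginning --max-messages 5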


Modify OrderInfoApp

In scala\com\atguigu\gmall\realtime\app\dw\OrderInfoApp.scala

import com.alibaba.fastjson.{JSON, JSONObject}
import com.atguigu.gmall.realtime.bean.{OrderInfo, UserState}
import com.atguigu.gmall.realtime.utils.{MyKafkaUtil, OffsetManagerUtil, PhoenixUtil}
import org.apache.hadoop.conf.Configuration
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.spark.SparkConf
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange}

object OrderInfoApp {


  def main(args: Array[String]): Unit = {


    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("dw_order_info_app")

    val ssc = new StreamingContext(sparkConf, Seconds(5))
    val topic = "ODS_T_ORDER_INFO";
    val groupId = "base_order_info_group"


    ///////////////////// Offset handling ///////////////////////////
    val offset: Map[TopicPartition, Long] = OffsetManagerUtil.getOffset(groupId, topic)

    var inputDstream: InputDStream[ConsumerRecord[String, String]] = null
    // If a saved offset exists in Redis, resume the Kafka stream from it; otherwise read from Kafka's default (latest) offsets
    if (offset != null && offset.size > 0) {
      inputDstream = MyKafkaUtil.getKafkaStream(topic, ssc, offset, groupId)
      //startInputDstream.map(_.value).print(1000)
    } else {
      inputDstream = MyKafkaUtil.getKafkaStream(topic, ssc, groupId)
    }

    // Capture the offset ranges of each batch
    var offsetRanges: Array[OffsetRange] = null
    val inputGetOffsetDstream: DStream[ConsumerRecord[String, String]] = inputDstream.transform { rdd =>
      offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd
    }


    ///////////////////// Business logic ///////////////////////////

    // Basic conversion: fill in the date and hour fields derived from create_time
    val orderInfoDstream: DStream[OrderInfo] = inputGetOffsetDstream.map { record =>
      val jsonString: String = record.value()
      val orderInfo: OrderInfo = JSON.parseObject(jsonString, classOf[OrderInfo])

      val datetimeArr: Array[String] = orderInfo.create_time.split(" ")
      //println(datetimeArr.toString + "---------------------------------------")
      orderInfo.create_date = datetimeArr(0)
      val timeArr: Array[String] = datetimeArr(1).split(":")
      orderInfo.create_hour = timeArr(0)

      orderInfo
    }

    // Use mapPartitions so Phoenix is queried once per partition instead of once per record
    val orderInfoWithfirstDstream: DStream[OrderInfo] = orderInfoDstream.mapPartitions { orderInfoItr =>
      // Materialize the partition's order records into a List
      val orderInfoList: List[OrderInfo] = orderInfoItr.toList

      if (orderInfoList.size > 0) {
        // Collect all user ids of this partition
        val userIdList: List[String] = orderInfoList.map(_.user_id.toString)
        // println(userIdList.size + "--------------------")
        // Query the consumption state of all these users in a single SQL statement
        val sql = "select user_id,if_consumed from user_state where user_id in ('" + userIdList.mkString("','") + "')"
        // if_consumed marks whether the user has already placed a (first) order
        val userStateList: List[JSONObject] = PhoenixUtil.queryList(sql)

        // Convert the query result into a Map to avoid a nested loop
        // Note: the returned field names are upper case!
        val userStateMap: Map[String, String] = userStateList.map(userStateJsonObj =>
          (userStateJsonObj.getString("USER_ID"), userStateJsonObj.getString("IF_CONSUMED"))
        ).toMap

        // Compare the current batch against the state already stored in the database
        for (orderInfo <- orderInfoList) {
          // Look up if_consumed for this user; null if the user has no consumption record yet
          val userIfConsumed: String = userStateMap.getOrElse(orderInfo.user_id.toString, null)
          if (userIfConsumed != null && userIfConsumed == "1") {
            // The user has consumed before, so this is not a first order
            orderInfo.if_first_order = "0"
          } else {
            // No previous consumption record: mark as a first order
            orderInfo.if_first_order = "1"
          }
        }
      }
      orderInfoList.toIterator
    }

    // Fix: if the same user places several orders within one batch and it is their first purchase,
    // every one of those orders would otherwise be flagged as a first order
    // Re-key the stream as (user_id, orderInfo)
    val orderInfoWithUidDstream: DStream[(Long, OrderInfo)] = orderInfoWithfirstDstream.map(orderInfo => (orderInfo.user_id, orderInfo))
    // Group by user id
    val orderInfoGroupbyUidDstream: DStream[(Long, Iterable[OrderInfo])] = orderInfoWithUidDstream.groupByKey()
    // Flatten back to individual orders
    val orderInfoFinalFirstDstream: DStream[OrderInfo] = orderInfoGroupbyUidDstream.flatMap { case (userId, orderInfoItr) =>
      val orderInfoList: List[OrderInfo] = orderInfoItr.toList
      // Only user groups flagged as first order and containing more than one order need fixing
      if (orderInfoList(0).if_first_order == "1" && orderInfoList.size > 1) {
        // Sort this user's orders of the batch by creation time
        val orderInfoSortedList: List[OrderInfo] = orderInfoList.sortWith { (orderInfo1, orderInfo2) =>
          orderInfo1.create_time < orderInfo2.create_time
        }
        // Keep the earliest order as the first order and mark all the others as non-first
        for (i <- 1 to orderInfoSortedList.size - 1) {
          orderInfoSortedList(i).if_first_order = "0"
        }
        orderInfoSortedList.toIterator
      } else {
        orderInfoList.toIterator
      }
    }


    //////////////// Merge the province dimension /////////////////

    val orderInfoWithProvinceDstream: DStream[OrderInfo] = orderInfoFinalFirstDstream.transform { rdd =>

      // Runs on the driver: query the whole province dimension once per batch
      val sql = "select id,name,region_id,area_code from gmall_province_info"
      val provinceJsonObjList: List[JSONObject] = PhoenixUtil.queryList(sql)

      // Turn the list into a Map keyed by province id
      val provinceJsonObjMap: Map[Long, JSONObject] = provinceJsonObjList.map { jsonObj =>
        (jsonObj.getLongValue("ID"), jsonObj)
      }.toMap

      // Broadcast the map to the executors
      val provinceJsonObjMapBc: Broadcast[Map[Long, JSONObject]] = ssc.sparkContext.broadcast(provinceJsonObjMap)

      // Runs on the executors
      val orderInfoWithProvinceRDD: RDD[OrderInfo] = rdd.mapPartitions { orderInfoItr =>

        // Read the broadcast value: a Map[Long, JSONObject]
        val provinceJsonObjMap: Map[Long, JSONObject] = provinceJsonObjMapBc.value

        // An iterator can only be traversed once, so materialize it into a List first
        val orderInfoList: List[OrderInfo] = orderInfoItr.toList

        for (orderInfo <- orderInfoList) {
          // Look up the province of this order; null if not found
          val provinceJsonObj: JSONObject = provinceJsonObjMap.getOrElse(orderInfo.province_id, null)
          if (provinceJsonObj != null) {
            // Copy the province fields onto the order
            orderInfo.province_name = provinceJsonObj.getString("NAME")
            println(orderInfo.province_name + "-----------")
            orderInfo.province_area_code = provinceJsonObj.getString("AREA_CODE")
          }
        }

        orderInfoList.toIterator
      }

      orderInfoWithProvinceRDD
    }


    /////////////// Merge the user dimension ////////////////////
    val orderInfoWithUserDstream: DStream[OrderInfo] = orderInfoWithProvinceDstream.mapPartitions { orderInfoItr =>
      val orderList: List[OrderInfo] = orderInfoItr.toList
      if (orderList.size > 0) {
        // Query the user dimension table for all user ids of this partition in one statement
        val userIdList: List[Long] = orderList.map(_.user_id)
        val sql = "select id, user_level, birthday, gender, age_group, gender_name from gmall_user_info where id in ('" + userIdList.mkString("','") + "')"
        val userJsonObjList: List[JSONObject] = PhoenixUtil.queryList(sql)
        val userJsonObjMap: Map[Long, JSONObject] = userJsonObjList.map(userJsonObj => (userJsonObj.getLongValue("ID"), userJsonObj)).toMap
        for (orderInfo <- orderList) {
          // Look up the user of this order; guard against users missing from the dimension table
          val userJsonObj: JSONObject = userJsonObjMap.getOrElse(orderInfo.user_id, null)
          if (userJsonObj != null) {
            orderInfo.user_age_group = userJsonObj.getString("AGE_GROUP")
            orderInfo.user_gender = userJsonObj.getString("GENDER_NAME")
          }
        }
      }
      orderList.toIterator
    }

    orderInfoWithUserDstream.cache()

    orderInfoWithUserDstream.print(1000)


    orderInfoWithUserDstream.foreachRDD { rdd =>
      // Persist the consumption state of users whose order was flagged as a first order
      val userStatRDD: RDD[UserState] = rdd.filter(_.if_first_order == "1").map(orderInfo =>
        UserState(orderInfo.user_id.toString, orderInfo.if_first_order)
      )
      import org.apache.phoenix.spark._
      userStatRDD.saveToPhoenix("user_state",
        Seq("USER_ID", "IF_CONSUMED"),
        new Configuration,
        Some("hadoop102,hadoop103,hadoop104:2181"))

/*      // Write to ES
      //   println("order count: " + rdd.count())
      orderWithProvinceDstream.foreachRDD { rdd =>
        rdd.foreachPartition { orderInfoItr =>
          val orderList: List[OrderInfo] = orderInfoItr.toList
          val orderWithKeyList: List[(String, OrderInfo)] = orderList.map(orderInfo => (orderInfo.id.toString, orderInfo))
          val dateStr: String = new SimpleDateFormat("yyyyMMdd").format(new Date)
          //  MyEsUtil.saveBulk(orderWithKeyList, "gmall1122_order_info-" + dateStr)

          for (orderInfo <- orderList) {
            println(orderInfo)
            MyKafkaSink.send("DW_ORDER_INFO", orderInfo.id.toString, JSON.toJSONString(orderInfo, new SerializeConfig(true)))
          }

        }

        OffsetManagerUtil.saveOffset(groupId, topic, offsetRanges)*/
      // Commit the offsets after the batch has been processed
      OffsetManagerUtil.saveOffset(groupId, topic, offsetRanges)

    }

    /* dbJsonObjDstream.foreachRDD { rdd =>
       rdd.foreachPartition { jsonObjItr =>

         for (jsonObj <- jsonObjItr) {
           val dataObj: JSONObject = jsonObj.getJSONObject("data")
           val tableName = jsonObj.getString("table")
           val id = dataObj.getString("id")
           val topic = "ODS_T_" + tableName.toUpperCase
           MyKafkaSink.send(topic, id, dataObj.toJSONString)
         }
       }
       OffsetManager.saveOffset(groupId, topic, offsetRanges)

     }*/
    ssc.start()
    ssc.awaitTermination()

  }
}

Overall directory structure

(Screenshot: overall project directory structure)

Testing

(Screenshot: test output)
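
One way to exercise the join (a sketch, not the original test run): start UserInfoApp and OrderInfoApp, produce some new order data in the spark_gmall database, and check that the orders printed by print(1000) now carry province_name, province_area_code, user_age_group and user_gender. The persisted first-order state can then be inspected in sqlline:

select * from user_state limit 10;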
