Create the four dimension tables in HBase (via Phoenix)
create table gmall_base_category3 (id varchar primary key, info.name varchar, info.category2_id varchar) SALT_BUCKETS = 3;

create table gmall_base_trademark (id varchar primary key, info.tm_name varchar) SALT_BUCKETS = 3;

create table gmall_sku_info (id varchar primary key, info.spu_id varchar, info.price varchar, info.sku_name varchar, info.tm_id varchar,
    info.category3_id varchar, info.create_time varchar, info.category3_name varchar, info.spu_name varchar, info.tm_name varchar) SALT_BUCKETS = 3;

create table gmall_spu_info (id varchar primary key, info.spu_name varchar) SALT_BUCKETS = 3;
Overall bean structure
Create OrderDetail
scala\com\atguigu\gmall\realtime\bean\OrderDetail.scala
case class OrderDetail(
  id: Long,
  order_id: Long,
  sku_id: Long,
  order_price: Double,
  sku_num: Long,
  sku_name: String,
  create_time: String,
  // the remaining fields are vars: they are filled in later from the sku dimension table
  var spu_id: Long,
  var tm_id: Long,
  var category3_id: Long,
  var spu_name: String,
  var tm_name: String,
  var category3_name: String
)
Create the sku, spu, trademark, and category-3 beans respectively
scala\com\atguigu\gmall\realtime\bean\dim\BaseCategory3.scala
case class BaseCategory3(
  id: String,
  name: String,
  category2_id: String
)
scala\com\atguigu\gmall\realtime\bean\dim\BaseTrademark.scala
case class BaseTrademark(tm_id: String, tm_name: String)
scala\com\atguigu\gmall\realtime\bean\dim\SkuInfo.scala
case class SkuInfo(
  id: String,
  spu_id: String,
  price: String,
  sku_name: String,
  tm_id: String,
  category3_id: String,
  create_time: String,
  // vars below are filled in by joining the other three dimension tables
  var category3_name: String,
  var spu_name: String,
  var tm_name: String
)
scala\com\atguigu\gmall\realtime\bean\dim\SpuInfo.scala
case class SpuInfo(id: String, spu_name: String)
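Since fastjson instantiates these case classes through their constructors, the field names must match the JSON keys in the ODS records exactly; any unmatched String field simply comes back null. A quick local sanity check can catch mismatches before wiring up Kafka — a minimal sketch, where the sample payload and the BeanParseCheck object are hypothetical:

import com.alibaba.fastjson.JSON
import com.atguigu.gmall.realtime.bean.dim.BaseCategory3

object BeanParseCheck {
  def main(args: Array[String]): Unit = {
    // Hypothetical payload in the shape Maxwell writes to ODS_T_BASE_CATEGORY3
    val jsonStr = """{"id":"61","name":"手机","category2_id":"13"}"""
    val obj: BaseCategory3 = JSON.parseObject(jsonStr, classOf[BaseCategory3])
    println(obj) // fields with no matching JSON key print as null
  }
}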
BaseCategory3App
scala\com\atguigu\gmall\realtime\app\dim\BaseCategory3App.scala
import com.alibaba.fastjson.JSON
import com.atguigu.gmall.realtime.bean.dim.BaseCategory3
import com.atguigu.gmall.realtime.utils.{MyKafkaUtil, OffsetManagerUtil}
import org.apache.hadoop.conf.Configuration
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
object BaseCategory3App {
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("dim_base_category3_app")
    val ssc = new StreamingContext(sparkConf, Seconds(5))
    val topic = "ODS_T_BASE_CATEGORY3"
    val groupId = "base_category3_group"

    ///////////////////// offset handling ///////////////////////////
    val offset: Map[TopicPartition, Long] = OffsetManagerUtil.getOffset(groupId, topic)
    var inputDstream: InputDStream[ConsumerRecord[String, String]] = null
    // If Redis holds saved offsets for this group, resume from them; otherwise start from Kafka's default position
    if (offset != null && offset.size > 0) {
      inputDstream = MyKafkaUtil.getKafkaStream(topic, ssc, offset, groupId)
    } else {
      inputDstream = MyKafkaUtil.getKafkaStream(topic, ssc, groupId)
    }

    // Capture each batch's offset ranges so they can be committed after processing
    var offsetRanges: Array[OffsetRange] = null
    val inputGetOffsetDstream: DStream[ConsumerRecord[String, String]] = inputDstream.transform { rdd =>
      offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd
    }

    // Convert each record's JSON payload into a bean
    val objectDstream: DStream[BaseCategory3] = inputGetOffsetDstream.map { record =>
      val jsonStr: String = record.value()
      val obj: BaseCategory3 = JSON.parseObject(jsonStr, classOf[BaseCategory3])
      obj
    }

    // Save to HBase via Phoenix, then commit the offsets
    objectDstream.foreachRDD { rdd =>
      import org.apache.phoenix.spark._
      rdd.saveToPhoenix("GMALL_BASE_CATEGORY3", Seq("ID", "NAME", "CATEGORY2_ID"),
        new Configuration, Some("hadoop102,hadoop103,hadoop104:2181"))
      OffsetManagerUtil.saveOffset(groupId, topic, offsetRanges)
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
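OffsetManagerUtil is used here but not shown in this section. A minimal sketch, assuming offsets are kept in a Redis hash keyed "offset:<topic>:<groupId>" with one field per partition — the key layout and the Redis host/port are assumptions:

import java.util
import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.OffsetRange
import redis.clients.jedis.Jedis
import scala.collection.JavaConverters._

object OffsetManagerUtil {
  // Read the saved offsets for this consumer group; returns null when nothing was saved yet
  def getOffset(groupId: String, topic: String): Map[TopicPartition, Long] = {
    val jedis = new Jedis("hadoop102", 6379) // assumed Redis endpoint
    val offsetMap: util.Map[String, String] = jedis.hgetAll("offset:" + topic + ":" + groupId)
    jedis.close()
    if (offsetMap == null || offsetMap.isEmpty) null
    else offsetMap.asScala.map { case (partition, offset) =>
      (new TopicPartition(topic, partition.toInt), offset.toLong)
    }.toMap
  }

  // Commit the offsets of the batch that was just processed
  def saveOffset(groupId: String, topic: String, offsetRanges: Array[OffsetRange]): Unit = {
    if (offsetRanges != null && offsetRanges.nonEmpty) {
      val offsetMap = new util.HashMap[String, String]()
      for (range <- offsetRanges) {
        // untilOffset is where the next batch should resume
        offsetMap.put(range.partition.toString, range.untilOffset.toString)
      }
      val jedis = new Jedis("hadoop102", 6379)
      jedis.hmset("offset:" + topic + ":" + groupId, offsetMap)
      jedis.close()
    }
  }
}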
BaseTrademarkApp
scala\com\atguigu\gmall\realtime\app\dim\BaseTrademarkApp.scala
import com.alibaba.fastjson.JSON
import com.atguigu.gmall.realtime.bean.dim.BaseTrademark
import com.atguigu.gmall.realtime.utils.{MyKafkaUtil, OffsetManagerUtil}
import org.apache.hadoop.conf.Configuration
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
object BaseTrademarkApp {
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("dim_base_trademark_app")
    val ssc = new StreamingContext(sparkConf, Seconds(5))
    val topic = "ODS_T_BASE_TRADEMARK"
    val groupId = "dim_base_trademark_group"

    ///////////////////// offset handling ///////////////////////////
    val offset: Map[TopicPartition, Long] = OffsetManagerUtil.getOffset(groupId, topic)
    var inputDstream: InputDStream[ConsumerRecord[String, String]] = null
    // If Redis holds saved offsets for this group, resume from them; otherwise start from Kafka's default position
    if (offset != null && offset.size > 0) {
      inputDstream = MyKafkaUtil.getKafkaStream(topic, ssc, offset, groupId)
    } else {
      inputDstream = MyKafkaUtil.getKafkaStream(topic, ssc, groupId)
    }

    // Capture each batch's offset ranges so they can be committed after processing
    var offsetRanges: Array[OffsetRange] = null
    val inputGetOffsetDstream: DStream[ConsumerRecord[String, String]] = inputDstream.transform { rdd =>
      offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd
    }

    // Convert each record's JSON payload into a bean
    val objectDstream: DStream[BaseTrademark] = inputGetOffsetDstream.map { record =>
      val jsonStr: String = record.value()
      val obj: BaseTrademark = JSON.parseObject(jsonStr, classOf[BaseTrademark])
      obj
    }

    // Save to HBase via Phoenix (case-class fields map to the listed columns by position), then commit the offsets
    objectDstream.foreachRDD { rdd =>
      import org.apache.phoenix.spark._
      rdd.saveToPhoenix("GMALL_BASE_TRADEMARK", Seq("ID", "TM_NAME"),
        new Configuration, Some("hadoop102,hadoop103,hadoop104:2181"))
      OffsetManagerUtil.saveOffset(groupId, topic, offsetRanges)
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
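Every app above goes through the two MyKafkaUtil.getKafkaStream overloads: one that resumes from explicit offsets and one that falls back to Kafka's default position. A minimal sketch, assuming the standard spark-streaming-kafka-0-10 direct stream; the broker list and consumer settings are assumptions:

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}

object MyKafkaUtil {
  private def kafkaParams(groupId: String): Map[String, Object] = Map(
    "bootstrap.servers" -> "hadoop102:9092,hadoop103:9092,hadoop104:9092", // assumed broker list
    "key.deserializer" -> classOf[StringDeserializer],
    "value.deserializer" -> classOf[StringDeserializer],
    "group.id" -> groupId,
    "auto.offset.reset" -> "latest",
    "enable.auto.commit" -> (false: java.lang.Boolean) // offsets are committed manually via Redis
  )

  // Resume from the offsets saved in Redis
  def getKafkaStream(topic: String, ssc: StreamingContext, offsets: Map[TopicPartition, Long],
                     groupId: String): InputDStream[ConsumerRecord[String, String]] =
    KafkaUtils.createDirectStream[String, String](ssc, LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Array(topic), kafkaParams(groupId), offsets))

  // No saved offsets: fall back to auto.offset.reset
  def getKafkaStream(topic: String, ssc: StreamingContext,
                     groupId: String): InputDStream[ConsumerRecord[String, String]] =
    KafkaUtils.createDirectStream[String, String](ssc, LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Array(topic), kafkaParams(groupId)))
}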
SkuInfoApp
scala\com\atguigu\gmall\realtime\app\dim\SkuInfoApp.scala
import com.alibaba.fastjson.{JSON, JSONObject}
import com.atguigu.gmall.realtime.bean.dim.SkuInfo
import com.atguigu.gmall.realtime.utils.{MyKafkaUtil, OffsetManagerUtil, PhoenixUtil}
import org.apache.hadoop.conf.Configuration
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.spark.SparkConf
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
object SkuInfoApp {
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("dim_sku_info_app")
    val ssc = new StreamingContext(sparkConf, Seconds(5))
    val topic = "ODS_T_SKU_INFO"
    val groupId = "dim_sku_info_group"

    ///////////////////// offset handling ///////////////////////////
    val offset: Map[TopicPartition, Long] = OffsetManagerUtil.getOffset(groupId, topic)
    var inputDstream: InputDStream[ConsumerRecord[String, String]] = null
    // If Redis holds saved offsets for this group, resume from them; otherwise start from Kafka's default position
    if (offset != null && offset.size > 0) {
      inputDstream = MyKafkaUtil.getKafkaStream(topic, ssc, offset, groupId)
    } else {
      inputDstream = MyKafkaUtil.getKafkaStream(topic, ssc, groupId)
    }

    // Capture each batch's offset ranges so they can be committed after processing
    var offsetRanges: Array[OffsetRange] = null
    val inputGetOffsetDstream: DStream[ConsumerRecord[String, String]] = inputDstream.transform { rdd =>
      offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd
    }

    val objectDstream: DStream[SkuInfo] = inputGetOffsetDstream.map { record =>
      val jsonStr: String = record.value()
      val obj: SkuInfo = JSON.parseObject(jsonStr, classOf[SkuInfo])
      obj
    }

    // Join sku_info with the other three dimension tables
    val skuInfoDstream: DStream[SkuInfo] = objectDstream.transform { rdd =>
      if (rdd.count() > 0) {
        // category3 -- these queries run on the driver once per batch
        val category3Sql = "select id ,name from gmall_base_category3"
        val category3List: List[JSONObject] = PhoenixUtil.queryList(category3Sql)
        val category3Map: Map[String, JSONObject] = category3List.map(jsonObj => (jsonObj.getString("ID"), jsonObj)).toMap
        // trademark
        val tmSql = "select id ,tm_name from gmall_base_trademark"
        val tmList: List[JSONObject] = PhoenixUtil.queryList(tmSql)
        val tmMap: Map[String, JSONObject] = tmList.map(jsonObj => (jsonObj.getString("ID"), jsonObj)).toMap
        // spu
        val spuSql = "select id ,spu_name from gmall_spu_info"
        val spuList: List[JSONObject] = PhoenixUtil.queryList(spuSql)
        val spuMap: Map[String, JSONObject] = spuList.map(jsonObj => (jsonObj.getString("ID"), jsonObj)).toMap

        // Gather the three maps into one list and broadcast it to the executors
        val dimList = List[Map[String, JSONObject]](category3Map, tmMap, spuMap)
        val dimBC: Broadcast[List[Map[String, JSONObject]]] = ssc.sparkContext.broadcast(dimList)

        val skuInfoRDD: RDD[SkuInfo] = rdd.mapPartitions { skuInfoItr => // runs on the executors
          val dimList: List[Map[String, JSONObject]] = dimBC.value // receive the broadcast value
          val category3Map: Map[String, JSONObject] = dimList(0)
          val tmMap: Map[String, JSONObject] = dimList(1)
          val spuMap: Map[String, JSONObject] = dimList(2)
          val skuInfoList: List[SkuInfo] = skuInfoItr.toList
          for (skuInfo <- skuInfoList) {
            val category3JsonObj: JSONObject = category3Map.getOrElse(skuInfo.category3_id, null) // look up in the map
            if (category3JsonObj != null) {
              skuInfo.category3_name = category3JsonObj.getString("NAME")
            }
            val tmJsonObj: JSONObject = tmMap.getOrElse(skuInfo.tm_id, null)
            if (tmJsonObj != null) {
              skuInfo.tm_name = tmJsonObj.getString("TM_NAME")
            }
            val spuJsonObj: JSONObject = spuMap.getOrElse(skuInfo.spu_id, null)
            if (spuJsonObj != null) {
              skuInfo.spu_name = spuJsonObj.getString("SPU_NAME")
            }
          }
          skuInfoList.toIterator
        }
        skuInfoRDD
      } else {
        rdd
      }
    }

    skuInfoDstream.foreachRDD { rdd =>
      import org.apache.phoenix.spark._
      rdd.saveToPhoenix("GMALL_SKU_INFO", Seq("ID", "SPU_ID", "PRICE", "SKU_NAME", "TM_ID", "CATEGORY3_ID", "CREATE_TIME", "CATEGORY3_NAME", "SPU_NAME", "TM_NAME"),
        new Configuration, Some("hadoop102,hadoop103,hadoop104:2181"))
      OffsetManagerUtil.saveOffset(groupId, topic, offsetRanges)
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
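SkuInfoApp is the first app that reads back from Phoenix through PhoenixUtil.queryList, which is not shown in this section. A minimal sketch, assuming the Phoenix thick JDBC driver and the same ZooKeeper quorum used above; error handling and connection pooling are omitted:

import java.sql.{Connection, DriverManager, ResultSet, ResultSetMetaData}
import com.alibaba.fastjson.JSONObject
import scala.collection.mutable.ListBuffer

object PhoenixUtil {
  // Run a query and return each row as a JSONObject keyed by (upper-cased) column name
  def queryList(sql: String): List[JSONObject] = {
    Class.forName("org.apache.phoenix.jdbc.PhoenixDriver")
    val conn: Connection = DriverManager.getConnection("jdbc:phoenix:hadoop102,hadoop103,hadoop104:2181")
    val stmt = conn.createStatement()
    val rs: ResultSet = stmt.executeQuery(sql)
    val md: ResultSetMetaData = rs.getMetaData
    val resultList = new ListBuffer[JSONObject]()
    while (rs.next()) {
      val row = new JSONObject()
      for (i <- 1 to md.getColumnCount) {
        row.put(md.getColumnName(i), rs.getObject(i)) // Phoenix reports column names in upper case
      }
      resultList += row
    }
    rs.close(); stmt.close(); conn.close()
    resultList.toList
  }
}

This is why the lookups in SkuInfoApp and OrderDetailApp read fields as "ID", "NAME", "TM_NAME", and so on.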
SpuInfoApp
scala\com\atguigu\gmall\realtime\app\dim\SpuInfoApp.scala
import com.alibaba.fastjson.JSON
import com.atguigu.gmall.realtime.bean.dim.SpuInfo
import com.atguigu.gmall.realtime.utils.{MyKafkaUtil, OffsetManagerUtil}
import org.apache.hadoop.conf.Configuration
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
object SpuInfoApp {
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("dim_spu_info_app")
    val ssc = new StreamingContext(sparkConf, Seconds(5))
    val topic = "ODS_T_SPU_INFO"
    val groupId = "dim_spu_info_group"

    ///////////////////// offset handling ///////////////////////////
    val offset: Map[TopicPartition, Long] = OffsetManagerUtil.getOffset(groupId, topic)
    var inputDstream: InputDStream[ConsumerRecord[String, String]] = null
    // If Redis holds saved offsets for this group, resume from them; otherwise start from Kafka's default position
    if (offset != null && offset.size > 0) {
      inputDstream = MyKafkaUtil.getKafkaStream(topic, ssc, offset, groupId)
    } else {
      inputDstream = MyKafkaUtil.getKafkaStream(topic, ssc, groupId)
    }

    // Capture each batch's offset ranges so they can be committed after processing
    var offsetRanges: Array[OffsetRange] = null
    val inputGetOffsetDstream: DStream[ConsumerRecord[String, String]] = inputDstream.transform { rdd =>
      offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd
    }

    // Convert each record's JSON payload into a bean
    val objectDstream: DStream[SpuInfo] = inputGetOffsetDstream.map { record =>
      val jsonStr: String = record.value()
      val obj: SpuInfo = JSON.parseObject(jsonStr, classOf[SpuInfo])
      obj
    }

    // Save to HBase via Phoenix, then commit the offsets
    objectDstream.foreachRDD { rdd =>
      import org.apache.phoenix.spark._
      rdd.saveToPhoenix("GMALL_SPU_INFO", Seq("ID", "SPU_NAME"),
        new Configuration, Some("hadoop102,hadoop103,hadoop104:2181"))
      OffsetManagerUtil.saveOffset(groupId, topic, offsetRanges)
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
OrderDetailApp
As with OrderInfo, nothing is written downstream yet because the template has not been created; the output code is left commented out. (The commented block below sends the merged records to the DW_ORDER_DETAIL Kafka topic.)
scala\com\atguigu\gmall\realtime\app\dw\OrderDetailApp.scala
import com.alibaba.fastjson.serializer.SerializeConfig
import com.alibaba.fastjson.{JSON, JSONObject}
import com.atguigu.gmall.realtime.bean.OrderDetail
import com.atguigu.gmall.realtime.utils.{MyKafkaSinkUtil, MyKafkaUtil, OffsetManagerUtil, PhoenixUtil}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object OrderDetailApp {
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("dw_order_detail_app")
    val ssc = new StreamingContext(sparkConf, Seconds(5))
    val topic = "ODS_T_ORDER_DETAIL"
    val groupId = "dw_order_detail_group"

    ///////////////////// offset handling ///////////////////////////
    val offset: Map[TopicPartition, Long] = OffsetManagerUtil.getOffset(groupId, topic)
    var inputDstream: InputDStream[ConsumerRecord[String, String]] = null
    // If Redis holds saved offsets for this group, resume from them; otherwise start from Kafka's default position
    if (offset != null && offset.size > 0) {
      inputDstream = MyKafkaUtil.getKafkaStream(topic, ssc, offset, groupId)
    } else {
      inputDstream = MyKafkaUtil.getKafkaStream(topic, ssc, groupId)
    }

    // Capture each batch's offset ranges so they can be committed after processing
    var offsetRanges: Array[OffsetRange] = null
    val inputGetOffsetDstream: DStream[ConsumerRecord[String, String]] = inputDstream.transform { rdd =>
      offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd
    }

    ///////////////////// business logic ///////////////////////////
    val orderDetailDstream: DStream[OrderDetail] = inputGetOffsetDstream.map { record =>
      val jsonString: String = record.value()
      val orderDetail: OrderDetail = JSON.parseObject(jsonString, classOf[OrderDetail])
      orderDetail
    }

    /////////////// merge in the sku dimension fields ////////////////////
    val orderDetailWithSkuDstream: DStream[OrderDetail] = orderDetailDstream.mapPartitions { orderDetailItr =>
      val orderDetailList: List[OrderDetail] = orderDetailItr.toList
      if (orderDetailList.nonEmpty) {
        // One Phoenix query per partition: fetch every sku referenced in this partition at once
        val skuIdList: List[Long] = orderDetailList.map(_.sku_id)
        val sql = "select id ,tm_id,spu_id,category3_id,tm_name ,spu_name,category3_name from gmall_sku_info where id in ('" + skuIdList.mkString("','") + "')"
        val skuJsonObjList: List[JSONObject] = PhoenixUtil.queryList(sql)
        val skuJsonObjMap: Map[Long, JSONObject] = skuJsonObjList.map(skuJsonObj => (skuJsonObj.getLongValue("ID"), skuJsonObj)).toMap
        for (orderDetail <- orderDetailList) {
          val skuJsonObj: JSONObject = skuJsonObjMap.getOrElse(orderDetail.sku_id, null)
          if (skuJsonObj != null) { // guard against skus missing from the dimension table
            orderDetail.spu_id = skuJsonObj.getLong("SPU_ID")
            orderDetail.spu_name = skuJsonObj.getString("SPU_NAME")
            orderDetail.tm_id = skuJsonObj.getLong("TM_ID")
            orderDetail.tm_name = skuJsonObj.getString("TM_NAME")
            orderDetail.category3_id = skuJsonObj.getLong("CATEGORY3_ID")
            orderDetail.category3_name = skuJsonObj.getString("CATEGORY3_NAME")
          }
        }
      }
      orderDetailList.toIterator
    }

    orderDetailWithSkuDstream.cache()
    orderDetailWithSkuDstream.print(1000)

    /* // Write the merged records back to Kafka (DW layer); disabled until downstream is ready
    orderDetailWithSkuDstream.foreachRDD { rdd =>
      rdd.foreachPartition { orderDetailItr =>
        val orderDetailList: List[OrderDetail] = orderDetailItr.toList
        for (orderDetail <- orderDetailList) {
          MyKafkaSinkUtil.send("DW_ORDER_DETAIL", orderDetail.order_id.toString, JSON.toJSONString(orderDetail, new SerializeConfig(true)))
        }
      }
      OffsetManagerUtil.saveOffset(groupId, topic, offsetRanges)
    } */

    // While the Kafka write above is disabled, commit offsets once per batch here;
    // calling saveOffset outside foreachRDD would run only once at startup, before any batch is processed
    orderDetailWithSkuDstream.foreachRDD { _ =>
      OffsetManagerUtil.saveOffset(groupId, topic, offsetRanges)
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
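The commented-out block relies on MyKafkaSinkUtil, which is also not shown in this section. A minimal sketch, assuming a lazily created KafkaProducer reused within each executor; the broker list and the idempotence setting are assumptions:

import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.kafka.common.serialization.StringSerializer

object MyKafkaSinkUtil {
  private var producer: KafkaProducer[String, String] = _

  private def createProducer(): KafkaProducer[String, String] = {
    val props = new Properties()
    props.put("bootstrap.servers", "hadoop102:9092,hadoop103:9092,hadoop104:9092") // assumed broker list
    props.put("key.serializer", classOf[StringSerializer].getName)
    props.put("value.serializer", classOf[StringSerializer].getName)
    props.put("enable.idempotence", "true") // avoid duplicates on producer retries
    new KafkaProducer[String, String](props)
  }

  // Keyed send: records with the same key (here order_id) land in the same partition
  def send(topic: String, key: String, value: String): Unit = {
    if (producer == null) producer = createProducer()
    producer.send(new ProducerRecord[String, String](topic, key, value))
  }
}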
Load the dimension tables (Maxwell bootstrap)
bin/maxwell-bootstrap --user maxwell --password 123123 --host hadoop102 --database spark_gmall --table base_category3 --client_id maxwell_1
bin/maxwell-bootstrap --user maxwell --password 123123 --host hadoop102 --database spark_gmall --table base_trademark --client_id maxwell_1
bin/maxwell-bootstrap --user maxwell --password 123123 --host hadoop102 --database spark_gmall --table sku_info --client_id maxwell_1
bin/maxwell-bootstrap --user maxwell --password 123123 --host hadoop102 --database spark_gmall --table spu_info --client_id maxwell_1
(Screenshots: query results confirming the bootstrapped data in base_category3, base_trademark, sku_info, and spu_info.)
Testing OrderDetailApp
In the printed output, the third-from-last field was not populated successfully; this will be fixed later.