Spark Streaming + Kafka: saving offset data to MySQL, HBase and Redis

Kafka, a popular distributed publish/subscribe messaging system known for high throughput, low latency and high reliability, has become a common source of streaming data for Spark Streaming.

Put plainly, the approach in the official documentation is this: obtain OffsetRange objects from the (Java)InputDStream; each OffsetRange carries the complete offset information for one partition of the topic, and Spark Streaming produces a fresh set of ranges for every batch. All you need is a suitable place to persist them (for example HBase or HDFS), and you can manage offsets yourself.
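A minimal sketch of that pattern with the 0.10 direct API; trackOffsets and saveOffsets are illustrative names (not part of the official API), and the store behind saveOffsets can be MySQL, HBase or Redis as in the sections below.

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange}

// Generic pattern: grab the OffsetRange array of every batch and hand it to a store of your choice
def trackOffsets(stream: InputDStream[ConsumerRecord[String, String]],
                 saveOffsets: Array[OffsetRange] => Unit): Unit = {
  stream.foreachRDD { rdd =>
    // only the RDDs produced directly by the Kafka direct stream can be cast to HasOffsetRanges
    val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    // ... process rdd here ...
    saveOffsets(offsetRanges) // persist (topic, partition, untilOffset) after processing
  }
}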

1. Spark Streaming reading Kafka via the direct approach, with offsets saved in MySQL

Create a table named offset in the database. The original post shows its structure as a MySQL screenshot; it holds the columns groupId, topic, partitions and untilOffset.
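A minimal DDL sketch of that table, reconstructed from the columns the code below reads and writes; the column types and the primary key are assumptions, not the original screenshot:

-- hypothetical reconstruction of the offset table used by SparkStreamingOffsetMySql
CREATE TABLE offset (
  groupId     VARCHAR(100) NOT NULL,
  topic       VARCHAR(100) NOT NULL,
  partitions  VARCHAR(20)  NOT NULL,
  untilOffset VARCHAR(20)  NOT NULL,
  PRIMARY KEY (groupId, topic, partitions)  -- lets REPLACE INTO act as an upsert per group/topic/partition
);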

Configuration

 

// Database connection settings
// In IDEA, create a new file named application.conf under the resources folder
db.default.driver="com.mysql.jdbc.Driver"
db.default.url="jdbc:mysql://hadoop01:3306/kafkaOffset?characterEncoding=utf-8"
db.default.user="root"
db.default.password="root"

 

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaCluster.Err
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaCluster, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Duration, StreamingContext}
import scalikejdbc._
import scalikejdbc.config.DBs

/*
 Save the offsets to MySQL (old 0.8 direct API, offsets persisted with scalikejdbc)
 */
object SparkStreamingOffsetMySql {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("medd").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Duration(5000))
    // Basic configuration
    val groupid = "GPMMCC"
    val topic = "mysqlDemo"
    val brokerList = "hadoop01:9092,hadoop02:9092,hadoop03:9092"
   // val zkQuorum = "hadoop01:2181,hadoop02:2181,hadoop03:2181"
    val topics = Set(topic)
    // Kafka parameters
    val kafkaParams = Map(
      "metadata.broker.list"->brokerList,
      "group.id"->groupid,
      "auto.offset.reset"->kafka.api.OffsetRequest.SmallestTimeString
    )
    // Load the application.conf configuration  https://www.jianshu.com/p/2369a020e604
    DBs.setup()     // connect to MySQL
    // No need to query ZooKeeper for offsets; read them directly from MySQL
    val fromdbOffset: Map[TopicAndPartition, Long] =
      DB.readOnly {
        implicit session => {
          // Query every saved offset for this consumer group
          // and map each row to a (TopicAndPartition, offset) tuple
          SQL(s"select * from offset where groupId = '${groupid}'")
            .map(m => (TopicAndPartition(m.string("topic"), m.string("partitions").toInt), m.string("untilOffset").toLong))
            .toList().apply()
        }.toMap // finally toMap, because the declared return type is a Map
      }
 
    // Create a DStream for reading the data
    var kafkaDStream : InputDStream[(String,String)] = null
 
    // Decide how to start based on what was found in MySQL
    if(fromdbOffset.isEmpty){
      // First run: no saved offsets, start a plain direct stream
      kafkaDStream = KafkaUtils.createDirectStream[String,String,StringDecoder,
        StringDecoder](ssc,kafkaParams,topics)
    }else{
      // 1. Do not consume records twice
      // 2. Make sure the saved offsets are still valid
      var checkOffset = Map[TopicAndPartition,Long]()
 
      // Load the Kafka cluster metadata
      val kafkaCluster = new KafkaCluster(kafkaParams)
      // First get the earliest available offset for every topic/partition we saved
      val earliesOffset: Either[Err, Map[TopicAndPartition, KafkaCluster.LeaderOffset]
        ] = kafkaCluster.getEarliestLeaderOffsets(fromdbOffset.keySet)
 
      // Then compare the offsets stored in MySQL with the earliest offsets in Kafka
      if(earliesOffset.isRight){
        // Extract the map of earliest leader offsets
        val tap: Map[TopicAndPartition, KafkaCluster.LeaderOffset] =
          earliesOffset.right.get
        // Take the larger of the two, so we never request an offset Kafka has already deleted
        checkOffset = fromdbOffset.map(f => {
          val kafkaTopicOffset = tap(f._1).offset
          if (f._2 > kafkaTopicOffset) {
            f
          } else {
            (f._1, kafkaTopicOffset)
          }
        })
      }
      val messageHandler=(mmd:MessageAndMetadata[String,String])=>{
        (mmd.key(),mmd.message())
      }
      // Not the first run: resume reading from the saved offsets
      kafkaDStream = KafkaUtils.createDirectStream[String,String,StringDecoder
        ,StringDecoder,(String,String)](ssc,kafkaParams,checkOffset
        ,messageHandler)
    }
    var offsetRanges = Array[OffsetRange]()
    kafkaDStream.foreachRDD(kafkaRDD=>{
      offsetRanges = kafkaRDD.asInstanceOf[HasOffsetRanges].offsetRanges
      val map: RDD[String] = kafkaRDD.map(_._2)
      map.foreach(println)
 
      // Update the offsets in MySQL inside one local transaction
      DB.localTx(implicit session =>{
        // Walk every topic/partition offset range of this batch
        for (o <- offsetRanges){
          /*SQL("update offset set groupId=?, topic=?, partitions=?," +
            "untilOffset=?").bind(groupid,o.topic,o.partition,o.untilOffset).update().apply()*/
          SQL("replace into offset(groupId,topic,partitions,untilOffset) values(?,?,?,?)").bind(
            groupid, o.topic, o.partition.toString, o.untilOffset.toString
          ).update().apply()
        }
      })
    })
    ssc.start()
    ssc.awaitTermination()
  }
}

Source: https://blog.csdn.net/Lu_Xiao_Yue/article/details/84110045

 

/* Kafka offsets stored in a database: before pulling data from Kafka, Spark first reads the offsets back from MySQL (Kafka 0.10 direct API). */

import java.sql.{DriverManager, ResultSet}

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, KafkaUtils, LocationStrategies, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable

object StreamingKafkaMysqlOffset {
  // Log level
  Logger.getLogger("org").setLevel(Level.WARN)
 
  def main(args: Array[String]): Unit = {
    // SparkConf for local execution
    val conf: SparkConf = new SparkConf()
      .setMaster("local[*]")
      .setAppName(this.getClass.getSimpleName)
      // maximum number of messages pulled per partition per second
      .set("spark.streaming.kafka.maxRatePerPartition", "100")
      // serialization
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      // enabling RDD compression is recommended
      .set("spark.rdd.compress", "true")
 
    // StreamingContext
    val ssc: StreamingContext = new StreamingContext(conf, Seconds(1))
 
    // Consumer group and topic (defined before they are referenced in kafkaParams)
    val groupId = "topic_group0"
    val topic = "order"
    val topics = Array(topic)

    // Kafka parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "hadoop:9092,hadoop-01:9092,hadoop-02:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupId,
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> (false: java.lang.Boolean) // we manage offsets ourselves
    )
    // Offsets to resume from, loaded from MySQL
    val offsets: mutable.HashMap[TopicPartition, Long] = mutable.HashMap[TopicPartition, Long]()
    val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata?characterEncoding=utf-8", "root", "123456")
 
    val pstm = conn.prepareStatement("select * from mysqloffset where groupId = ? and topic = ? ")
    pstm.setString(1, groupId)
    pstm.setString(2, topic)
 
    val result: ResultSet = pstm.executeQuery()
    while (result.next()) {
      // Load the saved offset of each partition from the database
      val p = result.getInt("partition")
      val f = result.getLong("untilOffset")
      val partition: TopicPartition = new TopicPartition(topic, p)
      offsets.put(partition, f)
    }
 
    val stream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      Subscribe[String, String](topics, kafkaParams, offsets)
    )
 
    //轉換成RDD
    stream.foreachRDD(rdd => {
      //手動指定分區的地方
      val ranges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      println("長度=" + ranges.length)
      ranges.foreach(println)
      //: RDD[(String, Int)]
      val result = rdd.map(_.value()).flatMap(_.split(",")).map((_, 1)).reduceByKey(_ + _)
      result.foreach(println)
 
      //      result.foreachPartition(p => {
      //        val jedis: Jedis = ToolsRedisMysql.getJedis()
      //        //        val jedis = RedisUtils.getJedis
      //        p.foreach(zookeeper => {
      //          jedis.hincrBy("wc1", zookeeper._1, zookeeper._2)
      //        })
      //        jedis.close()
      //      })
 
      // 把偏移量的Array  寫入到mysql中
      ranges.foreach(zookeeper => {
        // 思考,需要保存哪些數據呢?   起始的offset不需要  還需要加上 groupid
 
        val pstm = conn.prepareStatement("replace into mysqloffset values (?,?,?,?)")
        pstm.setString(1, zookeeper.topic)
        pstm.setInt(2, zookeeper.partition)
        pstm.setLong(3, zookeeper.untilOffset)
        pstm.setString(4, groupId)
        pstm.execute()
        pstm.close()
      })
    })
    ssc.start()
    ssc.awaitTermination()
 
  }
}

2. Saving offsets to HBase

 

import java.net.URLDecoder

import org.apache.hadoop.hbase.client.{ConnectionFactory, HTable, Put, Result, Scan}
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp
import org.apache.hadoop.hbase.filter.{BinaryComparator, RowFilter}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, KeyValue, TableName}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable
 
/** Single consumer group case: offsets are managed by hand.
  *   1. Read the offsets from HBase, then pull data from Kafka.
  *      There is no multi-group consumption here, so no group information is stored.
  *      htable:  hbase_consumer_offset
  *      family:  topic_partition_offset
  *      columns: topic
  *               partition
  *               offset
  *      rowkey:  topic_partition
  *   2. After a batch has been processed, save its until offset back to HBase.
  *   3. If Kafka has been down for a long time, reading restarts from the earliest
  *      offset in Kafka; that case still needs extra handling.
  */
object OffsetOperate {
  var hbaseProp = PropertiesUtil.getProperties("hbase")
  var kafkaconsumePro = PropertiesUtil.getProperties("kafkaconsume")
  def main(args: Array[String]): Unit = {
 
  val conf = new SparkConf().setAppName("sparkStreaming - offset operate")
    .setMaster("local[2]") // --master local[2] | spark://xx:7077 | yarn
    .set("spark.testing.memory", "2147480000")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc,Seconds(5))
 
    // Kafka configuration
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> kafkaconsumePro.getProperty("bootstrap.servers"),
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> kafkaconsumePro.getProperty("group"),
      "auto.offset.reset" -> "earliest", // on the first run, read from the beginning of the topic
      "enable.auto.commit" -> (false: java.lang.Boolean) // Kafka does not commit consumed offsets itself
    )
 
    // Topics to subscribe to
    val topics = Array(kafkaconsumePro.getProperty("topics"))
    // HBase connection configuration
    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set("hbase.zookeeper.quorum",hbaseProp.getProperty("quorum")) // ZooKeeper quorum
    hbaseConf.set("hbase.zookeeper.property.clientPort","2181")
    hbaseConf.set("hbase.master", hbaseProp.getProperty("hbase_master"))
    hbaseConf.set("hbase.defaults.for.version.skip", "true")
    // Create the connection objects
    val conn = ConnectionFactory.createConnection(hbaseConf)
    val admin = conn.getAdmin
    val tn = TableName.valueOf("hbase_consumer_offset") // HBase table name
    val isExist = admin.tableExists(tn)
    val streams : InputDStream[ConsumerRecord[String,String]]= {
    if(isExist) {
      val table = new HTable(hbaseConf, "hbase_consumer_offset")
      // Row keys have the form topic_partition, so filter on the "<topic>_" prefix
      val filter = new RowFilter(CompareOp.GREATER_OR_EQUAL, new BinaryComparator(Bytes.toBytes(topics(0) + "_")))
      println("============ row filter created ==========")
      val s = new Scan()
      s.setFilter(filter)
      val rs = table.getScanner(s)
 
      // Rebuild the offsets from the scan result
      val fromOffsets = scala.collection.mutable.Map[TopicPartition, Long]()
      var s1 = ""
      var s2 = 0
      var s3: Long = 0
        // Read up to 200 rows; each row holds the saved offset of one topic/partition
        for (r: Result <- rs.next(200)) {
          println("rowKey : " + new String(r.getRow))
          for (keyvalue: KeyValue <- r.raw()) {
            if ("topic".equals(new String(keyvalue.getQualifier))) {
              s1 = new String(keyvalue.getValue)
              println("columnFamily :" + new String(keyvalue.getFamily) + " column :" + new String(keyvalue.getQualifier) + s1)
            } else if ("partition".equals(new String(keyvalue.getQualifier))) {
              s2 = Bytes.toInt(keyvalue.getValue)
              println("columnFamily :" + new String(keyvalue.getFamily) + " column :" + new String(keyvalue.getQualifier) + s2)
            } else if ("offset".equals(new String(keyvalue.getQualifier))) {
              s3 = Bytes.toLong(keyvalue.getValue)
              println("columnFamily :" + new String(keyvalue.getFamily) + " column :" + new String(keyvalue.getQualifier) + s3)
            }
          }
          fromOffsets.put(new TopicPartition(s1, s2), s3)
        }
      println("fromOffset is : " + fromOffsets)
        // Resume from the offsets found in HBase
        KafkaUtils.createDirectStream[String, String](ssc, LocationStrategies.PreferConsistent,
          ConsumerStrategies.Assign[String, String](fromOffsets.keySet, kafkaParams, fromOffsets))
      } else { // the offset table does not exist in HBase yet: create it and consume from the beginning of the topic
        val htable = new HTableDescriptor(TableName.valueOf("hbase_consumer_offset"))
        htable.addFamily(new HColumnDescriptor("topic_partition_offset"))
        admin.createTable(htable)
        println("table created successfully ========" + htable)
      KafkaUtils.createDirectStream[String, String](ssc, LocationStrategies.PreferConsistent, ConsumerStrategies.Subscribe[String, String](topics, kafkaParams))
      }
    }
  // val dstream = streams.map(x => URLDecoder.decode(x.value()))
 
    // Update the offsets after the batch has been processed successfully
    streams.foreachRDD{ rdd =>
      //if(!rdd.isEmpty()){
      // Ideally the business computation and the offset write run as one unit,
      // so that both succeed or both fail, giving exactly-once consumption
      import scala.collection.JavaConversions._
      val table = new HTable(hbaseConf,"hbase_consumer_offset")
      table.setAutoFlush(false, false)
      var putList:List[Put] = List()
        val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges  // cast the RDD[ConsumerRecord[String,String]] to HasOffsetRanges
        for(offsetRange <- offsetRanges){
          println("the topic is "+offsetRange.topic)
          println("the partition is "+offsetRange.partition)
          println("the fromOffset is "+offsetRange.fromOffset)
          println("the untilOffset is "+offsetRange.untilOffset)
          println("the object is "+offsetRange)
          val put = new Put(Bytes.toBytes(offsetRange.topic+"_"+offsetRange.partition)) // rowkey: topic_partition
          put.add(Bytes.toBytes("topic_partition_offset"),Bytes.toBytes("topic"),Bytes.toBytes(offsetRange.topic))
          put.add(Bytes.toBytes("topic_partition_offset"),Bytes.toBytes("partition"),Bytes.toBytes(offsetRange.partition))
          put.add(Bytes.toBytes("topic_partition_offset"),Bytes.toBytes("offset"),Bytes.toBytes(offsetRange.untilOffset))
          putList = put+:putList
        }
 
        // the program's actual computation logic
        rdd.map{x => URLDecoder.decode(x.value())}.collect.foreach(println)
      //  }
      table.put(putList)
      table.flushCommits()
      println("add and compute data success !")
      }
    ssc.start()
    ssc.awaitTermination()
  }
}
Reference: https://www.jianshu.com/p/667e0f58b7b9

The Spark Streaming implementation is shown above. Note that the ConsumerRecord class is not serializable; when using it, take care not to ship record objects to other worker nodes (for example through collect or a shuffle), to avoid serialization errors.
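A minimal sketch of the safe pattern, assuming a 0.10 direct stream named stream like the ones above: convert each record to plain values before anything that would serialize it.

// Sketch: never ship ConsumerRecord itself to other nodes; map it to simple types first
stream.foreachRDD { rdd =>
  val ranges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  // convert to (key, value) tuples before collect(), repartition() or any shuffle
  val pairs = rdd.map(record => (record.key(), record.value()))
  pairs.collect().foreach(println)
  // ... persist `ranges` to the chosen store afterwards ...
}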

3. Storing offsets in Redis (in-memory), which gives faster reads and writes

With multiple partitions and multiple consumer groups, the key design is: one Redis hash per consumer group and topic, with the partition number as the hash field and the offset as the value.

gtKey = groupid + "/" + topic is used as the unique identifier:

conn.hset(gtKey, partition.toString, offset.toString)
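Reading the hash back when the job restarts is the mirror image; a short sketch assuming the old 0.8 API types used below, a Jedis connection conn, and the gtKey and topic values from the full example:

// Sketch: rebuild the fromOffset map from the Redis hash groupid/topic -> {partition: offset}
import scala.collection.JavaConversions._
var fromOffset = Map[TopicAndPartition, Long]()
for ((partition, offset) <- conn.hgetAll(gtKey)) {
  fromOffset += TopicAndPartition(topic, partition.toInt) -> offset.toLong
}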

http://www.pianshen.com/article/8095259521/

 

import java.util

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Duration, StreamingContext}
import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}

object KafkaDricteRedis {
 
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("redis").setMaster("local[*]")
    val ssc = new StreamingContext(conf, new Duration(5000))
 
    val groupid = "GB01" // consumer group name
    val topic = "wordcount3" // topic name
    // groupid/topic is used as the unique key in Redis for the partition offsets,
    // stored as a Redis hash
    val gtKey = groupid + "/" + topic
    // topics
    val topics = Set(topic)
    // ZooKeeper address
    val zkQuorum = "hadoop01:2181,hadoop02:2181,hadoop03:2181"
    // brokerList
    val brokerList = "hadoop01:9092,hadoop03:9092"
 
    val kafkaParams = Map(
      "metadata.broker.list" -> brokerList,
      "group.id" -> groupid,
      // consume from the beginning if no offset is available
      "auto.offset.reset" -> kafka.api.OffsetRequest.SmallestTimeString
    )
    // Offsets per topic/partition, passed in when the InputDStream is created;
    // reading resumes from these offsets
    var fromOffset: Map[TopicAndPartition, Long] = Map[TopicAndPartition, Long]()
 
    var kafkaDStream: InputDStream[(String, String)] = null
    // Get a Jedis connection
    val conn = getConnection()
    // conn.flushDB()
    // Fetch all keys currently stored in Redis
    val values: util.Set[String] = conn.keys("*")
    // A key like GB01/wordcount3 means offsets were stored for this group/topic before
    if (values.contains(gtKey)) {
      // Get all (field, value) pairs of the hash stored under GB01/wordcount3
      val allKey: util.Map[String, String] = conn.hgetAll(gtKey)
      // JavaConversions lets us treat the Java map as a Scala collection
      import scala.collection.JavaConversions._
      val list: List[(String, String)] = allKey.toList
      // In each (k, v) pair, k is the partition and v is the offset
      for (key <- list) { // key is a tuple here
        // Build a TopicAndPartition from the topic and the partition number
        val tp = TopicAndPartition(topic, key._1.toInt)
        // Record the offset saved for this topic/partition
        fromOffset += tp -> key._2.toLong
      }
      // The stream elements are (key, value) pairs; the Kafka key is null by default,
      // the value is the Kafka message payload
      val messageHandler = (mmd: MessageAndMetadata[String, String]) => {
        (mmd.key(), mmd.message())
      }
      // Offsets were found: resume from them
      kafkaDStream = KafkaUtils.createDirectStream[String, String, StringDecoder,
        StringDecoder, (String, String)](ssc, kafkaParams, fromOffset, messageHandler)
    } else {
      // Nothing was stored before: create a fresh InputDStream starting from the default offsets
      kafkaDStream = KafkaUtils.createDirectStream[String, String, StringDecoder,
        StringDecoder](ssc, kafkaParams, topics)
    }
    // Used to update the offsets; OffsetRange exposes the partition and its offsets
    var offsetRanges = Array[OffsetRange]()

    kafkaDStream.foreachRDD(kafkaRDD => {
      // The underlying RDD is a KafkaRDD, which can be cast to HasOffsetRanges
      val ranges: HasOffsetRanges = kafkaRDD.asInstanceOf[HasOffsetRanges]
      offsetRanges = ranges.offsetRanges
      // Take the value only (the key is null by default and not used)
      val lines: RDD[String] = kafkaRDD.map(_._2)
      lines.foreach(x => println(x + "==========================="))
      // Update the offsets in Redis
      for (o <- offsetRanges) {
        // until offset of this batch
        val offset = o.untilOffset
        // partition number
        val partition = o.partition
        println("partition: " + partition)
        println("offset: " + offset)
        // Write the partition and offset into the Redis hash with hset
        conn.hset(gtKey, partition.toString, offset.toString)
      }
    })
    ssc.start()
    ssc.awaitTermination()
  }
  // Jedis connection pool
  def getConnection(): Jedis = {
    // Create a JedisPoolConfig and set its parameters before building the pool
    val conf = new JedisPoolConfig()
    // maximum number of connections
    conf.setMaxTotal(20)
    // maximum number of idle connections
    conf.setMaxIdle(20)
    val pool = new JedisPool(conf, "hadoop01", 6379)

    val jedis = pool.getResource()
    // password
    jedis.auth("123")
    jedis
  }
}

————————————————

 

Copyright notice: this is an original article by CSDN blogger 曹雪朋, licensed under CC 4.0 BY-SA; please include the original source link and this notice when reposting.
Original link: https://blog.csdn.net/qq_22473611/java/article/details/87973702