Spark Streaming: implementing custom Kafka offset storage

KafkaUtils.createDirectStream

Unlike the Receiver-based approach, the direct stream periodically queries Kafka for the latest offset of each topic+partition, then processes each batch according to the resulting offset ranges, using Kafka's simple consumer API (a minimal setup is sketched right after this list).
Advantages:
A. Simplified parallelism: there is no need to create multiple Kafka input streams; this method creates as many RDD partitions as the topic has Kafka partitions, and reads from Kafka in parallel.
B. Efficiency: no write-ahead log (WAL) is needed. The WAL approach copies the data twice: once by Kafka's own replication, and once more when writing it to the WAL.
C. Exactly-once semantics: the traditional approach reads Kafka through the high-level consumer API and writes offsets to ZooKeeper, so data can be lost or re-read whenever the offsets in ZooKeeper and those tracked by the StreamingContext diverge. The direct approach uses the low-level API and keeps offsets only in the Spark checkpoint, eliminating that inconsistency. The drawback is that ZooKeeper-based Kafka monitoring tools can no longer see the consumer's progress.
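For contrast, here is a minimal sketch of the plain direct stream with the spark-streaming-kafka-0-10 integration, where Spark itself tracks offsets in the checkpoint. The broker list, topic name, and group id are placeholders (not from this post), and an existing StreamingContext ssc is assumed:

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}

// Placeholders: adjust brokers, topic and group id for your cluster.
val demoKafkaParams = Map[String, Object](
  "bootstrap.servers" -> "localhost:9092",
  "key.deserializer" -> classOf[StringDeserializer],
  "value.deserializer" -> classOf[StringDeserializer],
  "group.id" -> "demo_group",
  "enable.auto.commit" -> (false: java.lang.Boolean)
)

// Subscribe lets Kafka assign the partitions; offsets live in the checkpoint.
val demoStream = KafkaUtils.createDirectStream[String, String](
  ssc,
  LocationStrategies.PreferConsistent,
  ConsumerStrategies.Subscribe[String, String](Seq("demo_topic"), demoKafkaParams)
)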

Here we save the offsets in Redis instead. First we need a small Redis utility:

import org.apache.commons.pool2.impl.GenericObjectPoolConfig
import redis.clients.jedis.JedisPool

/**
 * @author https://blog.csdn.net/qq_38704184
 * @package pool
 * @date 2019/11/25 10:09
 * @version 1.0
 */
object InternalRedisClient extends Serializable {
  @transient private var pool: JedisPool = null

  // Convenience overload with default testOnBorrow/testOnReturn/maxWaitMillis.
  def makePool(redisHost: String, redisPort: Int, redisTimeout: Int,
               maxTotal: Int, maxIdle: Int, minIdle: Int): Unit = {
    makePool(redisHost, redisPort, redisTimeout, maxTotal, maxIdle, minIdle, true, false, 10000)
  }

  def makePool(redisHost: String, redisPort: Int, redisTimeout: Int,
               maxTotal: Int, maxIdle: Int, minIdle: Int, testOnBorrow: Boolean,
               testOnReturn: Boolean, maxWaitMillis: Long): Unit = {
    if (pool == null) {
      val poolConfig = new GenericObjectPoolConfig()
      poolConfig.setMaxTotal(maxTotal)
      poolConfig.setMaxIdle(maxIdle)
      poolConfig.setMinIdle(minIdle)
      poolConfig.setTestOnBorrow(testOnBorrow)
      poolConfig.setTestOnReturn(testOnReturn)
      poolConfig.setMaxWaitMillis(maxWaitMillis)
      pool = new JedisPool(poolConfig, redisHost, redisPort, redisTimeout)

      // Destroy the pool when the JVM exits so connections are released.
      val hook = new Thread {
        override def run(): Unit = pool.destroy()
      }
      sys.addShutdownHook(hook.run)
    }
  }

  def getPool: JedisPool = {
    assert(pool != null)
    pool
  }
}
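A typical way to borrow a connection from this pool is to wrap it in try/finally so the connection is always returned, even if an exception is thrown. A small sketch (the key and value are purely illustrative):

import redis.clients.jedis.Jedis

val jedis: Jedis = InternalRedisClient.getPool.getResource
try {
  jedis.set("demo_key", "demo_value") // illustrative key/value only
} finally {
  jedis.close() // on a pooled Jedis, close() returns the connection to the pool
}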

Initialize the Redis pool:

def initRedisPool(): Unit = {
    //    redis configuration
    val maxTotal = 20
    val maxIdle = 10
    val minIdle = 1
    val redisHost = "127.0.0.1"
    val redisPort = 6379
    val redisTimeout = 30000
    InternalRedisClient.makePool(redisHost, redisPort, redisTimeout, maxTotal, maxIdle, minIdle)
  }

Fetch the offsets committed last time:

def getLastCommittedOffsets(groupId: String, topicName: String, partitions: Int): Map[TopicPartition, Long] = {
    if (LOG.isInfoEnabled())
      LOG.info("||--Topic:{},getLastCommittedOffsets from Redis--||", topicName)

    //      fetch the offsets saved in Redis on the previous run
    val jedis: Jedis = InternalRedisClient.getPool.getResource
    val fromOffsets = collection.mutable.HashMap.empty[TopicPartition, Long]
    for (partition <- 0 until partitions) {
      val groupId_topic_partition_key = groupId + "_" + topicName + "_" + partition
      val lastSaveOffset: String = jedis.get(groupId_topic_partition_key)
      val lastOffset: Long = if (lastSaveOffset == null) 0L else lastSaveOffset.toLong
      fromOffsets += (new TopicPartition(topicName, partition) -> lastOffset)
    }
    jedis.close()
    fromOffsets.toMap
  }
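One caveat the code above does not handle: if Kafka's retention policy has already deleted messages up to the offset saved in Redis, seeking to that offset will fail at startup. Below is a sketch of clamping the saved offsets to the earliest offsets still available on the broker, using the standard KafkaConsumer API; clampToEarliest and its consumer setup are an assumption of mine, not part of the original code:

import java.util.Properties
import scala.collection.JavaConverters._
import org.apache.kafka.clients.consumer.KafkaConsumer
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer

// Sketch: make sure saved offsets are not older than what Kafka still retains.
def clampToEarliest(saved: Map[TopicPartition, Long], bootstrapServers: String): Map[TopicPartition, Long] = {
  val props = new Properties()
  props.put("bootstrap.servers", bootstrapServers)
  props.put("key.deserializer", classOf[StringDeserializer].getName)
  props.put("value.deserializer", classOf[StringDeserializer].getName)
  val consumer = new KafkaConsumer[String, String](props)
  try {
    // beginningOffsets returns, per partition, the earliest offset still on the broker
    val earliest = consumer.beginningOffsets(saved.keys.toList.asJava).asScala
    saved.map { case (tp, offset) => tp -> math.max(offset, earliest(tp).longValue()) }
  } finally {
    consumer.close()
  }
}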

Start the streaming job:

def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)

    //    initialize the redis pool
    initRedisPool()

    val conf: SparkConf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[*]")
    val ssc = new StreamingContext(conf, Seconds(60))

    val topic: String = "mysql_store_offset"
    val group: String = "mysql_offset"
    //    The maximum number of records returned in a single call to poll
    val maxPoll = 2000

    val kafkaParams = Map(
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "cm01:9092,cm02:9092,cm03:9092",
      ConsumerConfig.GROUP_ID_CONFIG -> group,
      ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> "false",
      ConsumerConfig.MAX_POLL_RECORDS_CONFIG -> maxPoll.toString,
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer]
    )
    //    specify the total number of partitions of the topic here
    val topicPartitionToLong: Map[TopicPartition, Long] = getLastCommittedOffsets(group, topic, 3)
    //    create the Kafka direct stream from the saved offsets
    val kafkaTopicDS: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Assign[String, String](topicPartitionToLong.keys.toList, kafkaParams, topicPartitionToLong)
    )

    kafkaTopicDS.foreachRDD(rdd => {
      val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      if (!rdd.isEmpty()) {
        val jedis: Jedis = InternalRedisClient.getPool.getResource
        val p: Pipeline = jedis.pipelined()
        //        open a Redis transaction (MULTI)
        p.multi()

        //        process the data
        rdd.foreach(record => {
          println(s"${record.topic()},${record.partition()},${record.offset()},${record.value()}")
        })
        //        save the offsets consumed by this batch
        offsetRanges.foreach(offsetRange => {
          println(s"partition:${offsetRange.partition},fromOffset:${offsetRange.fromOffset},untilOffset:${offsetRange.untilOffset}")
          //        the key must match the one read back in getLastCommittedOffsets: groupId_topic_partition
          val groupID_topic_partition_key = s"${group}_${offsetRange.topic}_${offsetRange.partition}"
          p.set(groupID_topic_partition_key, offsetRange.untilOffset.toString)
        })
        })
        //        commit the transaction (EXEC)
        p.exec()
        //        flush the pipeline
        p.sync()
        jedis.close()
      }
    })
    ssc.start()
    ssc.awaitTermination()
  }
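Two notes on the semantics of this pattern. The MULTI/EXEC pipeline makes the per-partition offset writes for a batch atomic, but the data processing itself happens outside that transaction, so a crash after processing and before EXEC replays the batch on restart: this is at-least-once delivery unless the processing is idempotent or results and offsets are written in a single transaction. To shrink the replay window you can also let Spark finish the in-flight batch before stopping; a sketch of adding the standard setting to the SparkConf built in main above:

val conf: SparkConf = new SparkConf()
  .setAppName(this.getClass.getSimpleName)
  .setMaster("local[*]")
  // finish the current batch before stopping on SIGTERM
  .set("spark.streaming.stopGracefullyOnShutdown", "true")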

 
