KafkaUtils.createDirectStream
區別Receiver接收數據,這種方式定期地從kafka的topic+partition中查詢最新的偏移量,再根據偏移量範圍在每個batch裏面處理數據,使用的是kafka的簡單消費者api
優點:
A、 簡化並行,不需要多個kafka輸入流,該方法將會創建和kafka分區一樣的rdd個數,而且會從kafka並行讀取。
B、高效,這種方式並不需要WAL,WAL模式需要對數據複製兩次,第一次是被kafka複製,另一次是寫到wal中
C、恰好一次語義(Exactly-once-semantics),傳統的讀取kafka數據是通過kafka高層次api把偏移量寫入zookeeper中,存在數據丟失的可能性是zookeeper中和ssc的偏移量不一致。EOS通過實現kafka低層次api,偏移量僅僅被ssc保存在checkpoint中,消除了zk和ssc偏移量不一致的問題。缺點是無法使用基於zookeeper的kafka監控工具
offset保存於Redis中,首先需要編寫RedisUtil:
import org.apache.commons.pool2.impl.GenericObjectPoolConfig
import redis.clients.jedis.JedisPool
/**
* @author https://blog.csdn.net/qq_38704184
* @package pool
* @date 2019/11/25 10:09
* @version 1.0
*/
object InternalRedisClient extends Serializable {
  // Lazily-created singleton pool; @transient so the pool is rebuilt on each
  // JVM after deserialization instead of being shipped from the driver.
  @transient private var pool: JedisPool = null

  /** Create the shared pool with defaults: no borrow/return tests, 10s max wait. */
  def makePool(redisHost: String, redisPort: Int, redisTimeout: Int,
               maxTotal: Int, maxIdle: Int, minIdle: Int): Unit = {
    makePool(redisHost, redisPort, redisTimeout, maxTotal, maxIdle, minIdle, true, false, 10000)
  }

  /**
   * Create the shared pool if it does not exist yet.
   *
   * Synchronized so two concurrent callers cannot both observe `pool == null`
   * and leak a second JedisPool (the original check-then-act was racy).
   */
  def makePool(redisHost: String, redisPort: Int, redisTimeout: Int,
               maxTotal: Int, maxIdle: Int, minIdle: Int, testOnBorrow: Boolean,
               testOnReturn: Boolean, maxWaitMillis: Long): Unit = synchronized {
    if (pool == null) {
      val poolConfig = new GenericObjectPoolConfig()
      poolConfig.setMaxTotal(maxTotal)
      poolConfig.setMaxIdle(maxIdle)
      poolConfig.setMinIdle(minIdle)
      poolConfig.setTestOnBorrow(testOnBorrow)
      poolConfig.setTestOnReturn(testOnReturn)
      poolConfig.setMaxWaitMillis(maxWaitMillis)
      pool = new JedisPool(poolConfig, redisHost, redisPort, redisTimeout)
      // Release pool resources on JVM exit. sys.addShutdownHook already runs
      // the given block in a hook thread; the original's extra Thread wrapper
      // (whose run() was invoked synchronously) added nothing.
      sys.addShutdownHook(pool.destroy())
    }
  }

  /** Return the shared pool; fails fast when makePool was never called. */
  def getPool: JedisPool = {
    assert(pool != null, "InternalRedisClient.makePool must be called before getPool")
    pool
  }
}
初始化Redis Pool:
def initRedisPool = {
  // Target Redis instance and connection-pool sizing used by this job.
  val host    = "127.0.0.1"
  val port    = 6379
  val timeout = 30000
  val total   = 20 // max simultaneous connections
  val idleMax = 10
  val idleMin = 1
  InternalRedisClient.makePool(host, port, timeout, total, idleMax, idleMin)
}
獲取上次提交的offset:
/**
 * Recover the last committed offset for every partition of `topicName` from Redis.
 *
 * @param groupId    consumer group; part of the Redis key `group_topic_partition`
 * @param topicName  topic whose offsets are being restored
 * @param partitions total number of partitions of the topic
 * @return one entry per partition; 0L for partitions with no saved offset yet
 */
def getLastCommittedOffsets(groupId: String, topicName: String, partitions: Int): Map[TopicPartition, Long] = {
  if (LOG.isInfoEnabled())
    LOG.info("||--Topic:{},getLastCommittedOffsets from Redis--||", topicName)
  val jedis: Jedis = InternalRedisClient.getPool.getResource
  try {
    val fromOffsets = collection.mutable.HashMap.empty[TopicPartition, Long]
    for (partition <- 0 until partitions) {
      // Key layout must match what the streaming job writes: group_topic_partition
      val groupId_topic_partition_key = groupId + "_" + topicName + "_" + partition
      val lastSaveOffset: String = jedis.get(groupId_topic_partition_key)
      // A missing key means this partition was never consumed; start from offset 0.
      val lastOffset: Long = if (lastSaveOffset == null) 0L else lastSaveOffset.toLong
      fromOffsets += (new TopicPartition(topicName, partition) -> lastOffset)
    }
    fromOffsets.toMap
  } finally {
    // Always return the connection to the pool, even when a get/parse fails;
    // the original leaked the connection on any exception.
    jedis.close()
  }
}
開啓streaming處理:
def main(args: Array[String]): Unit = {
  Logger.getLogger("org").setLevel(Level.WARN)
  // Create the shared Redis connection pool before the stream starts.
  initRedisPool
  val conf: SparkConf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[*]")
  val ssc = new StreamingContext(conf, Seconds(60))
  val topic: String = "mysql_store_offset"
  val group: String = "mysql_offset"
  // The maximum number of records returned in a single call to poll
  val maxPoll = 2000
  val kafkaParams = Map(
    ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "cm01:9092,cm02:9092,cm03:9092",
    ConsumerConfig.GROUP_ID_CONFIG -> group,
    // Offsets are committed manually to Redis, so Kafka auto-commit must stay off.
    ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> "false",
    ConsumerConfig.MAX_POLL_RECORDS_CONFIG -> maxPoll.toString,
    ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
    ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer]
  )
  // Total partition count of the topic. NOTE(review): hard-coded to 3 — must match the real topic.
  val topicPartitionToLong: Map[TopicPartition, Long] = getLastCommittedOffsets(group, topic, 3)
  // Build the direct stream starting from the offsets recovered from Redis.
  val kafkaTopicDS: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
    ssc,
    LocationStrategies.PreferConsistent,
    ConsumerStrategies.Assign[String, String](topicPartitionToLong.keys.toList, kafkaParams, topicPartitionToLong)
  )
  kafkaTopicDS.foreachRDD(rdd => {
    val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    if (!rdd.isEmpty()) {
      val jedis: Jedis = InternalRedisClient.getPool.getResource
      try {
        val p: Pipeline = jedis.pipelined()
        // Open a MULTI/EXEC transaction so all partition offsets commit atomically.
        p.multi()
        // Process the batch of records (runs on the executors).
        rdd.foreach(record => {
          println(s"${record.topic()},${record.partition()},${record.offset()},${record.value()}")
        })
        // Persist the end offset of every partition for the next run.
        offsetRanges.foreach(offsetRange => {
          println(s"partition:${offsetRange.partition},fromOffset:${offsetRange.fromOffset},untilOffset:${offsetRange.untilOffset}")
          // BUG FIX: the original key appended the partition TWICE
          // (group_topic_partition_partition), so the saved offsets were never
          // found by getLastCommittedOffsets, which reads group_topic_partition.
          val groupID_topic_partition_key = s"${group}_${offsetRange.topic}_${offsetRange.partition}"
          p.set(groupID_topic_partition_key, offsetRange.untilOffset + "")
        })
        // Queue EXEC, then flush the pipeline and read every response.
        p.exec()
        p.sync()
      } finally {
        // Return the connection to the pool even if the batch fails;
        // the original leaked it on any exception inside the batch.
        jedis.close()
      }
    }
  })
  ssc.start()
  ssc.awaitTermination()
}