import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.HasOffsetRanges;
import org.apache.spark.streaming.kafka.KafkaCluster;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.apache.spark.streaming.kafka.OffsetRange;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import kafka.common.TopicAndPartition;
import kafka.message.MessageAndMetadata;
import kafka.serializer.StringDecoder;
import scala.Predef;
import scala.Tuple2;
import scala.collection.JavaConversions;
/**
* Spark Streaming consuming Kafka with the direct approach, storing the offset read by each partition
*/
public final class JavaDirectKafkaWordCount {
private static final Logger LOG = LoggerFactory.getLogger(JavaDirectKafkaWordCount.class);
public static void main(String[] args) {
if (args.length < 2) {
System.err.println("Usage: JavaDirectKafkaWordCount <brokers> <topics>\n" +
" <brokers> is a list of one or more Kafka brokers\n" +
" <topics> is a list of one or more kafka topics to consume from\n\n");
System.exit(1);
}
//StreamingExamples.setStreamingLogLevels();
String brokers = args[0]; // Kafka broker list
String topics = args[1]; // topics to consume
long seconds = 10; // batch interval in seconds
// Create context with a 10-second batch interval
SparkConf sparkConf = new SparkConf().setAppName("JavaDirectKafkaWordCount");
JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(seconds));
// Set up the Kafka parameters
HashSet<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
HashMap<String, String> kafkaParams = new HashMap<>();
kafkaParams.put("metadata.broker.list", brokers);
final String groupId = kafkaParams.get("group.id");
// Create the Kafka management object
final KafkaCluster kafkaCluster = getKafkaCluster(kafkaParams);
// Initialize the starting offsets
Map<TopicAndPartition, Long> fromOffsets = fromOffsets(topicsSet, kafkaParams, groupId, kafkaCluster, null);
// Create the Kafka direct stream
JavaInputDStream<String> stream = KafkaUtils.createDirectStream(jssc,
String.class, String.class, StringDecoder.class,
StringDecoder.class, String.class, kafkaParams,
fromOffsets,
new Function<MessageAndMetadata<String, String>, String>() {
/**
*
*/
private static final long serialVersionUID = 1L;
public String call(MessageAndMetadata<String, String> v1)
throws Exception {
return v1.message();
}
});
// Print the first elements of each batch
stream.print();
// Store the consumed offsets
storeConsumerOffsets(groupId, kafkaCluster, stream);
// Start the computation
jssc.start();
jssc.awaitTermination();
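// Shutdown sketch (a hypothetical alternative, not in the original flow):
// jssc.awaitTerminationOrTimeout(60000) bounds the run, after which
// jssc.stop(true, true) stops gracefully, draining in-flight batches.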
}
/**
 * @param groupId consumer group id
 * @param kafkaCluster Kafka management object
 * @param stream the Kafka direct stream
 */
private static <T> void storeConsumerOffsets(final String groupId, final KafkaCluster kafkaCluster, JavaInputDStream<T> stream) {
long l = System.currentTimeMillis();
stream.foreachRDD(new VoidFunction<JavaRDD<T>>() {
private static final long serialVersionUID = 1L;
@Override
public void call(JavaRDD<T> javaRDD) throws Exception {
// Store the position consumed by each partition, keyed by group.id
OffsetRange[] offsets = ((HasOffsetRanges) javaRDD.rdd()).offsetRanges();
for (OffsetRange o : offsets) {
// Wrap the topic/partition-to-offset mapping in a Java Map
TopicAndPartition topicAndPartition = new TopicAndPartition(o.topic(), o.partition());
Map<TopicAndPartition, Object> topicAndPartitionObjectMap = new HashMap<>();
topicAndPartitionObjectMap.put(topicAndPartition, o.untilOffset());
// Convert the Java map to a Scala immutable Map
scala.collection.immutable.Map<TopicAndPartition, Object> scalaTopicAndPartitionObjectMap =
JavaConversions.mapAsScalaMap(topicAndPartitionObjectMap).toMap(new Predef.$less$colon$less<Tuple2<TopicAndPartition, Object>, Tuple2<TopicAndPartition, Object>>() {
private static final long serialVersionUID = 1L;
public Tuple2<TopicAndPartition, Object> apply(Tuple2<TopicAndPartition, Object> v1) {
return v1;
}
});
// Write the updated offsets back through kafkaCluster
kafkaCluster.setConsumerOffsets(groupId, scalaTopicAndPartitionObjectMap);
}
}
});
// Log how long registering the foreachRDD took (this runs once at setup, not per batch)
LOG.info("storeConsumerOffsets time:" + (System.currentTimeMillis() - l));
}
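/**
 * A minimal at-least-once sketch (an addition, not from the original post;
 * the helper name and the count() stand-in are hypothetical): doing the
 * batch work before committing means a failure ahead of the commit causes
 * the batch to be re-read on restart rather than silently skipped. Assumes
 * Scala 2.10/2.11, whose Predef.conforms() supplies the identity evidence
 * that the anonymous $less$colon$less classes elsewhere spell out by hand.
 */
private static void processThenStoreOffsets(final String groupId, final KafkaCluster kafkaCluster, JavaInputDStream<String> stream) {
stream.foreachRDD(new VoidFunction<JavaRDD<String>>() {
private static final long serialVersionUID = 1L;
@Override
public void call(JavaRDD<String> rdd) throws Exception {
// Capture the ranges before doing any work
OffsetRange[] offsets = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
// Stand-in for the real batch work; count() forces the RDD to be computed
LOG.info("processed " + rdd.count() + " records");
// Commit only after the work above has succeeded
for (OffsetRange o : offsets) {
Map<TopicAndPartition, Object> map = new HashMap<>();
map.put(new TopicAndPartition(o.topic(), o.partition()), o.untilOffset());
kafkaCluster.setConsumerOffsets(groupId,
JavaConversions.mapAsScalaMap(map).toMap(Predef.<Tuple2<TopicAndPartition, Object>>conforms()));
}
}
});
}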
/**
 * Fetch partition info and set the starting offset for each partition
 *
 * @param topicsSet all topics
 * @param kafkaParams Kafka configuration
 * @param groupId consumer group id
 * @param kafkaCluster Kafka management object
 * @param offset custom starting offset (null to resume from stored offsets)
 * @return starting offsets per partition
 */
private static Map<TopicAndPartition, Long> fromOffsets(HashSet<String> topicsSet, HashMap<String, String> kafkaParams, String groupId, KafkaCluster kafkaCluster, Long offset) {
long l = System.currentTimeMillis();
// Starting offset for every partition
Map<TopicAndPartition, Long> fromOffsets = new HashMap<>();
// Convert java.util.Set to an immutable scala Set
scala.collection.immutable.Set<String> immutableTopics = JavaConversions
.asScalaSet(topicsSet)
.toSet();
// Fetch the partition info for all topics
scala.collection.immutable.Set<TopicAndPartition> scalaTopicAndPartitionSet = kafkaCluster
.getPartitions(immutableTopics)
.right()
.get();
if (offset != null || kafkaCluster.getConsumerOffsets(groupId,
scalaTopicAndPartitionSet).isLeft()) {
// No stored offsets for this group (or a custom offset was given); default to 0
offset = (offset == null ? 0L : offset);
// Assign that offset to every partition
scala.collection.Iterator<TopicAndPartition> iterator = scalaTopicAndPartitionSet.iterator();
while (iterator.hasNext()) {
fromOffsets.put(iterator.next(), offset);
}
} else {
// Resume reading from the previously stored offsets
scala.collection.Map<TopicAndPartition, Object> consumerOffsets = kafkaCluster
.getConsumerOffsets(groupId,
scalaTopicAndPartitionSet).right().get();
scala.collection.Iterator<Tuple2<TopicAndPartition, Object>> iterator = consumerOffsets.iterator();
while (iterator.hasNext()) {
Tuple2<TopicAndPartition, Object> next = iterator.next();
offset = (long) next._2();
fromOffsets.put(next._1(), offset);
}
}
// Log how long the offset lookup took
LOG.info("fromOffsets time:" + (System.currentTimeMillis() - l));
return fromOffsets;
}
/**
 * Convert kafkaParams to a Scala map and use it to build the KafkaCluster
 *
 * @param kafkaParams Kafka configuration
 * @return KafkaCluster management helper
 */
private static KafkaCluster getKafkaCluster(HashMap<String, String> kafkaParams) {
// Convert the Java map to a Scala immutable Map
scala.collection.immutable.Map<String, String> immutableKafkaParam = JavaConversions
.mapAsScalaMap(kafkaParams)
.toMap(new Predef.$less$colon$less<Tuple2<String, String>, Tuple2<String, String>>() {
private static final long serialVersionUID = 1L;
public Tuple2<String, String> apply(Tuple2<String, String> v1) {
return v1;
}
});
return new KafkaCluster(immutableKafkaParam);
}
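/**
 * Alternative conversion sketch (an addition, assuming Scala 2.10/2.11 on
 * the classpath): Predef.conforms() returns the identity <:< evidence, so
 * the hand-written anonymous $less$colon$less subclass above can be dropped.
 */
private static KafkaCluster getKafkaClusterViaConforms(HashMap<String, String> kafkaParams) {
scala.collection.immutable.Map<String, String> immutableKafkaParam = JavaConversions
.mapAsScalaMap(kafkaParams)
.toMap(Predef.<Tuple2<String, String>>conforms());
return new KafkaCluster(immutableKafkaParam);
}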
}