動手實戰Spark Streaming Broadcast、Accumulator實現在線黑名單過濾和計數
1、自定義Receiver分析
2、自定義Receiver實戰
package com.tom.spark.SparkApps.sparkstreaming;
import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.hive.ql.parse.HiveParser.ifExists_return;
import org.apache.spark.Accumulator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.Time;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;
public class SparkStreamingBroadcastAccumulator {
private static volatile Broadcast<List<String>> broadcastList = null;
private static volatile Accumulator<Integer> accumulator = null;
/**
* @param args
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
//好處:1、checkpoint 2、工廠
SparkConf conf = new SparkConf().setAppName("SparkStreamingBroadcastAccumulator").setMaster("hdfs://Master:7077/");
JavaStreamingContext javassc = new JavaStreamingContext(conf, Durations.seconds(15));
//沒有action廣播不會發出
//使用Broadcast廣播黑名單到每個Executor中
broadcastList = javassc.sparkContext().broadcast(Arrays.asList("Hadoop","Mahout","Hive"));
//全局計數器,用於統計在線過濾了多少個黑名單
accumulator = javassc.sparkContext().accumulator(0, "OnlineBlacklistCounter");
//創建Kafka元數據來讓Spark Streaming這個Kafka Consumer利用
JavaReceiverInputDStream<String> lines = javassc.socketTextStream("Master", 9999);
JavaPairDStream<String, Integer> pairs = lines.mapToPair(new PairFunction<String, String, Integer>() {
public Tuple2<String, Integer> call(String t) throws Exception {
// TODO Auto-generated method stub
return new Tuple2<String, Integer>(t, 1);
}
});
JavaPairDStream<String, Integer> wordsCount = pairs.reduceByKey(new Function2<Integer, Integer, Integer>(){
//對相同的key,進行Value的累加(包括Local和Reducer級別同時Reduce)
public Integer call(Integer v1, Integer v2) throws Exception {
// TODO Auto-generated method stub
return v1 + v2;
}
});
wordsCount.foreachRDD(new Function2<JavaPairRDD<String, Integer>, Time, Void>() {
public Void call(JavaPairRDD<String, Integer> rdd, Time time)
throws Exception {
// TODO Auto-generated method stub
rdd.filter(new Function<Tuple2<String,Integer>, Boolean>() {
public Boolean call(Tuple2<String, Integer> wordPair) throws Exception {
if(broadcastList.value().contains(wordPair._1)) {
accumulator.add(wordPair._2);
return false;
} else {
return true;
}
}
}).collect();
System.out.println(broadcastList.value().toString() + " : " + accumulator.value());
return null;
}
});
wordsCount.print();
/**
* Spark Streaming 執行引擎也就是Driver開始運行,Driver啓動的時候是位於一條新的線程中的,當然其內部有消息循環體,用於
* 接收應用程序本身或者Executor中的消息,
*/
javassc.start();
javassc.awaitTermination();
javassc.close();
}
}