Spark Broadcast Variables
A broadcast variable ships a driver-side variable to every Executor so that tasks can use it there.
Notes
1. An RDD itself cannot be broadcast, because an RDD does not store data; you can, however, collect an RDD's result and broadcast that.
2. A broadcast variable can only be defined on the Driver, not on an Executor.
3. The value of a broadcast variable can be modified on the Driver, but never on an Executor.
4. If an Executor uses a Driver-side variable without broadcasting it, there are as many copies of the variable as there are tasks on that Executor.
5. With a broadcast variable, each Executor holds only a single copy of the Driver-side variable (see the sketch after this list).
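A minimal sketch of points 4 and 5 (the file path and the list contents are illustrative):
import org.apache.spark.{SparkConf, SparkContext}

val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("copies"))

// without a broadcast: bigList is captured in each task's closure,
// so one copy is serialized and shipped per task
val bigList = List("hello", "world")
sc.textFile("./words.txt").filter(x => bigList.contains(x)).count()

// with a broadcast: the value is shipped to each Executor once and
// every task running there shares the same read-only copy
val bc = sc.broadcast(bigList)
sc.textFile("./words.txt").filter(x => bc.value.contains(x)).count()

sc.stop()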
Using a Static Broadcast Variable
val conf = new SparkConf()
conf.setMaster("local").setAppName("brocast")
val sc = new SparkContext(conf)
val list = List("hello xasxt")
// broadcast the driver-side list to the executors
val broadCast = sc.broadcast(list)
val lineRDD = sc.textFile("./words.txt")
// executors read the broadcast value via .value
lineRDD.filter { x => broadCast.value.contains(x) }.foreach { println }
sc.stop()
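When the broadcast value is no longer needed, broadCast.unpersist() asynchronously deletes the cached copies on the executors (the value is re-sent if it is used again), and broadCast.destroy() releases the variable permanently.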
Using a Dynamic Broadcast Variable
In real projects the broadcast value is sometimes dynamic, for example it has to be refreshed once a minute. This can be done: a broadcast variable is initialized on the driver and read on the executors, which cannot modify it, so the update must happen on the driver, where the stale broadcast is unpersisted and the fresh value is re-broadcast.
package com.unionpay.ysf

import java.sql.{Connection, DriverManager, ResultSet, Statement}
import java.text.SimpleDateFormat
import java.util.Date

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object test3 {
  @volatile private var instance: Broadcast[Map[String, Double]] = null
  var kafkaStreams: InputDStream[ConsumerRecord[String, String]] = null
  val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss:SSS")

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.INFO)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.INFO)
    Logger.getLogger("org.apache.kafka.clients.consumer").setLevel(Level.INFO)
    val conf = new SparkConf().setAppName("Spark Streaming TO ES TOPIC")
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    @transient
    val scc = new StreamingContext(conf, Seconds(1))
    val topic = "topic_combine"
    val topicSet = Set(topic) // the Kafka topic to subscribe to
    val kafkaParams = Map[String, Object](
      "auto.offset.reset" -> "earliest", // latest; earliest
      "value.deserializer" -> classOf[StringDeserializer] // key/value deserializers
      , "key.deserializer" -> classOf[StringDeserializer]
      , "bootstrap.servers" -> "192.168.38.12:9092"
      , "group.id" -> "groupId_es"
      , "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    // initialize the broadcast instance
    getInstance(scc.sparkContext)
    kafkaStreams = KafkaUtils.createDirectStream[String, String](
      scc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](topicSet, kafkaParams))
    kafkaStreams.foreachRDD(rdd => {
      val current_time = sdf.format(new Date())
      val new_time = current_time.substring(14, 16).toLong // minutes component of the timestamp
      if (new_time % 5 == 0) {
        update(rdd.sparkContext, true) // refresh the broadcast when the minute is divisible by 5
      }
      if (!rdd.isEmpty()) {
        val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges // offset ranges of this batch
        rdd.foreachPartition(pr => {
          pr.foreach(pair => {
            val d = pair.value()
            if (instance.value.contains(d)) {
              // your own processing logic
            }
          })
        })
      }
    })
    scc.start()
    scc.awaitTermination()
  }

  /**
   * Fetch data from SQL Server into a Map.
   * @return a Map of url -> WarningPrice
   */
  def getSqlServerData(): Map[String, Double] = {
    var map = Map[String, Double]()
    var conn: Connection = null
    var stmt: Statement = null
    var rs: ResultSet = null
    val url = ""
    val user_name = ""
    val password = ""
    val sql = ""
    try {
      conn = DriverManager.getConnection(url, user_name, password)
      stmt = conn.createStatement
      rs = stmt.executeQuery(sql)
      while (rs.next) {
        val url = rs.getString("url")
        val WarningPrice = rs.getString("WarningPrice").toDouble
        map += (url -> WarningPrice)
      }
    } catch {
      case e: Exception =>
        e.printStackTrace()
        println("SQL Server connection failed: " + e)
    } finally {
      // close resources even when the query throws
      if (rs != null) rs.close()
      if (stmt != null) stmt.close()
      if (conn != null) conn.close()
    }
    map
  }

  /**
   * Unpersist the old broadcast and re-broadcast the fresh data.
   * @param sc
   * @param blocking whether unpersist blocks until the old copies are removed
   */
  def update(sc: SparkContext, blocking: Boolean = false): Unit = {
    if (instance != null) {
      instance.unpersist(blocking) // drop the stale copies on the executors
    }
    instance = sc.broadcast(getSqlServerData())
  }

  /**
   * Initialize the instance (double-checked locking).
   * @param sc
   * @return
   */
  def getInstance(sc: SparkContext): Broadcast[Map[String, Double]] = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          instance = sc.broadcast(getSqlServerData())
        }
      }
    }
    instance
  }
}
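Two details of this pattern are worth noting. The @volatile on instance makes a freshly assigned broadcast reference immediately visible to the threads that read it, and the double-checked locking in getInstance ensures it is created only once. In update, the old broadcast is unpersisted first (with blocking = true the call waits until the executors have dropped their copies) and only then is the fresh map re-broadcast, so the next batch reads the new value through instance.value.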
Spark Accumulators
An accumulator acts as a cluster-wide aggregating variable, commonly used for counting and statistics.
import org.apache.spark.{SparkConf, SparkContext}

object AccumulatorOperator {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local").setAppName("accumulator")
    val sc = new SparkContext(conf)
    val accumulator = sc.accumulator(0)
    sc.textFile("./records.txt", 2).foreach { // two partitions
      x => {
        accumulator.add(1)
        println(accumulator)
      }
    }
    println(accumulator.value) // read the final value on the driver
    sc.stop()
  }
}
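Note that sc.accumulator(0) is the old accumulator API and has been deprecated since Spark 2.0 in favor of the AccumulatorV2 helpers. A minimal sketch of the same count using longAccumulator (the accumulator name is illustrative):
import org.apache.spark.{SparkConf, SparkContext}

object LongAccumulatorOperator {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("longAccumulator")
    val sc = new SparkContext(conf)
    // a named accumulator also shows up in the Spark web UI
    val counter = sc.longAccumulator("recordCount")
    sc.textFile("./records.txt", 2).foreach(_ => counter.add(1))
    println(counter.value) // read the final value on the driver
    sc.stop()
  }
}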
package com.spark.spark.others;

import org.apache.spark.Accumulator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;

/**
 * An accumulator is defined and given its initial value on the Driver and
 * read there; it is incremented on the Executors.
 * @author root
 */
public class AccumulatorOperator {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setMaster("local").setAppName("accumulator");
        JavaSparkContext sc = new JavaSparkContext(conf);
        final Accumulator<Integer> accumulator = sc.accumulator(0);
        // accumulator.setValue(1000);
        sc.textFile("./words.txt", 2).foreach(new VoidFunction<String>() {
            private static final long serialVersionUID = 1L;
            @Override
            public void call(String t) throws Exception {
                accumulator.add(1);
                // System.out.println(accumulator.value());
                System.out.println(accumulator);
            }
        });
        System.out.println(accumulator.value());
        sc.stop();
    }
}
Notes
An accumulator is defined and given its initial value on the Driver; its final value can only be read on the Driver, and it can only be updated (added to) on the Executors.
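One caveat worth illustrating: Spark guarantees that accumulator updates inside actions (such as foreach) are applied exactly once per task, but updates inside transformations (such as map) run every time the lineage is recomputed and can inflate the count. A minimal sketch, assuming ./records.txt exists:
import org.apache.spark.{SparkConf, SparkContext}

object AccumulatorCaveat {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("accCaveat"))
    val acc = sc.longAccumulator("mapped")
    // the accumulator is updated inside a transformation, not an action
    val mapped = sc.textFile("./records.txt", 2).map { x => acc.add(1); x }
    mapped.count() // first evaluation of the lineage
    mapped.count() // recomputes the map, so acc is incremented again
    println(acc.value) // roughly twice the number of records
    sc.stop()
  }
}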