object FlumePushDemo {
  /**
   * Flume push-mode demo: Flume's avro sink pushes events to a Spark
   * Streaming receiver listening on the address/port below, and the
   * job word-counts each 2-second batch.
   */
  def main(args: Array[String]): Unit = {
    // Silence Spark's verbose INFO logging; keep warnings visible.
    Logger.getLogger("org").setLevel(Level.WARN)
    // "local[2]" requires at least two threads: one runs the receiver,
    // the other dispatches received data to worker tasks for execution.
    val sparkConf = new SparkConf().setAppName("FlumePushDemo").setMaster("local[2]")
    val sparkContext = new SparkContext(sparkConf)
    val streamingContext = new StreamingContext(sparkContext, Seconds(2))
    // This is the address of the node where the Spark program starts;
    // the Flume avro sink must be configured to push to it.
    val events = FlumeUtils.createStream(streamingContext, "192.168.10.11", 8008)
    val wordCounts = events
      .flatMap(record => new String(record.event.getBody.array()).split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
    wordCounts.print()
    streamingContext.start()
    streamingContext.awaitTermination()
  }
}
# 這個是啓動命令,到flume的安裝路徑
# bin/flume-ng agent -n a1 -c conf/ -f config/flume-push.conf -Dflume.root.logger=INFO,console
# flume 主動推送數據到spark上
# Name the components on this agent
a1.sources = r1
a1.sinks = k1 k2
a1.channels = c1
# Source: run a shell command as the event source
a1.sources.r1.type = exec
# Tail the monitored file under the Linux directory
a1.sources.r1.command = tail -F /home/hadoop/access.log
a1.sources.r1.channels = c1
# Describe the sink
# avro sink binds to a host:port and pushes events to the Spark receiver
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = 192.168.10.11
a1.sinks.k1.port = 8008
# Also print event info to the console for debugging
a1.sinks.k2.type = logger
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
# NOTE(review): duplicate of the a1.sources.r1.channels line above — harmless but redundant
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c1
我去,弄了好幾次了Flume配置文件開頭總是顯示<span style="font-size:14px;">,結尾顯示</span>,大家在使用的時候注意,把這些去掉。
第一,拷貝三個jar包放到flume的lib目錄下
spark-streaming-flume-sink_2.10-1.6.1.jar
scala-library-2.10.5.jar
commons-lang3-3.3.2.jar
第二,使用 FlumeUtils.createPollingStream 創建 DStream
object FlumePullDemo {
  /**
   * Flume pull-mode demo: Spark Streaming actively polls events from
   * Flume's SparkSink at the address(es) below, and the job word-counts
   * each 2-second batch.
   */
  def main(args: Array[String]): Unit = {
    // Silence Spark's verbose INFO logging; keep warnings visible.
    Logger.getLogger("org").setLevel(Level.WARN)
    // "local[2]" requires at least two threads: one runs the receiver,
    // the other dispatches received data to worker tasks for execution.
    val sparkConf = new SparkConf().setAppName("FlumePullDemo").setMaster("local[2]")
    val sparkContext = new SparkContext(sparkConf)
    val streamingContext = new StreamingContext(sparkContext, Seconds(2))
    // Address of the node where the Flume SparkSink is running; more
    // addresses can be appended to poll several sinks.
    val sinkAddresses: Seq[InetSocketAddress] =
      Seq(new InetSocketAddress("192.168.10.11", 8008))
    val events = FlumeUtils.createPollingStream(streamingContext, sinkAddresses, StorageLevel.MEMORY_ONLY)
    val wordCounts = events
      .flatMap(record => new String(record.event.getBody.array()).split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
    wordCounts.print()
    streamingContext.start()
    streamingContext.awaitTermination()
  }
}
配置flume文件
# 執行代碼
# bin/flume-ng agent -n a1 -c conf/ -f config/flume-pull.conf -Dflume.root.logger=INFO,console
# spark 主動到flume上拉取數據
# Name the components on this agent
a1.sources = r1
a1.sinks = k1 k2
a1.channels = c1
# Source: run a shell command as the event source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /home/hadoop/access.log
a1.sources.r1.channels = c1
# Describe the sink
# Tell Flume to sink into the Spark-provided SparkSink component,
# which Spark Streaming then polls for events
a1.sinks.k1.type = org.apache.spark.streaming.flume.sink.SparkSink
a1.sinks.k1.hostname = 192.168.10.11
a1.sinks.k1.port = 8008
# Also print event info to the console for debugging
a1.sinks.k2.type = logger
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 1000
# Bind the source and sink to the channel
# NOTE(review): duplicate of the a1.sources.r1.channels line above — harmless but redundant
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c1