Learning Big Data: Flume + Kafka + Spark Streaming

1. Set up the Kafka environment:

For reference, see:

https://blog.csdn.net/weixin_37835915/article/details/103786157

(1) Start ZooKeeper

(2) Start Kafka

(3) Create a topic

(4) Start a consumer
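On a single local machine with Kafka's bundled ZooKeeper and the default ports (2181 for ZooKeeper, 9092 for the broker), these four steps map roughly to the commands below, run from the Kafka installation directory. This is only a sketch: script names and flags vary by Kafka version (newer releases create topics with --bootstrap-server rather than --zookeeper), and the topic name flume matches the Flume sink configured in section 2.

# (1) start the ZooKeeper instance bundled with Kafka
bin/zookeeper-server-start.sh config/zookeeper.properties

# (2) start the Kafka broker
bin/kafka-server-start.sh config/server.properties

# (3) create the topic the Flume sink will write to
bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic flume

# (4) start a console consumer to watch the topic
bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic flume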

 

2. Set up the Flume environment:

http://www.apache.org/dyn/closer.lua/flume/1.9.0/apache-flume-1.9.0-bin.tar.gz
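A quick way to fetch and unpack it (the archive.apache.org URL is just one mirror option):

# download and extract Flume 1.9.0
wget https://archive.apache.org/dist/flume/1.9.0/apache-flume-1.9.0-bin.tar.gz
tar -xzf apache-flume-1.9.0-bin.tar.gz
cd apache-flume-1.9.0-bin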

(1) After extracting the archive, add an example.conf file under the conf directory with the following content:

# Name the components of this agent; a1 is the agent name

a1.sources = r1

a1.sinks = k1

a1.channels = c1


# Describe and configure the source component: r1

a1.sources.r1.type = netcat

a1.sources.r1.bind = localhost

a1.sources.r1.port = 44444


# Describe and configure the sink component: k1

a1.sinks.k1.channel = c1

a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink

a1.sinks.k1.kafka.topic = flume

a1.sinks.k1.kafka.bootstrap.servers = localhost:9092

a1.sinks.k1.kafka.flumeBatchSize = 20

a1.sinks.k1.kafka.producer.acks = 1

a1.sinks.k1.kafka.producer.linger.ms = 1

a1.sinks.k1.kafka.producer.compression.type = snappy


# Describe and configure the channel component; a memory channel is used here

a1.channels.c1.type = memory

a1.channels.c1.capacity = 1000

a1.channels.c1.transactionCapacity = 100


# Bind the source and the sink to the channel

a1.sources.r1.channels = c1

a1.sinks.k1.channel = c1

(2) Start Flume: flume-ng agent --conf ../conf --conf-file ../conf/example.conf --name a1 -Dflume.root.logger=INFO,console

In general: flume-ng agent --conf "<config directory>" --conf-file "<config file>" --name "<agent name defined in the config file>"

The agent started successfully if you see: Created serverSocket:sun.nio.ch.ServerSocketChannelImpl[/127.0.0.1:44444]

(3) Run telnet localhost 44444 and type hello world; if the text shows up in Flume, the source is working.
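To check the whole Flume → Kafka path rather than just the netcat source, it helps to keep a console consumer open on the flume topic while typing into the telnet session. A rough sketch (paths relative to the Kafka installation):

# terminal 1: feed a few test lines into the netcat source
telnet localhost 44444

# terminal 2: the same lines should appear on the "flume" topic
bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic flume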

3. The Spark Streaming code:

package com.spark.self


import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.storage.StorageLevel
import org.apache.log4j.{Level, Logger}
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.dstream.DStream

object WordCountSprakStreaming {
  val numThreads = 1
  //  val topics = "test"
  //  val topics = "sparkStreamingTest"
  val topics = "flume"
  val zkQuorum = "localhost:2181"
  val group = "consumer1"
  val brokers = "localhost:9092"

  def main(args: Array[String]): Unit = {
    //    receiver
    direct
  }

  def service(): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
    val conf = new SparkConf().setAppName("SparkFlumeNGWordCount").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(5))
    // use checkpointing to store offsets and the running state
    ssc.checkpoint("/out")
    // create the Kafka input DStream
    // mode 1: receiver-based approach; the map value is the number of consumer threads for the topic
    val topic = Map("test" -> 1)
    // createStream reads through ZooKeeper (Kafka's old high-level consumer)
    val data = KafkaUtils.createStream(ssc, "localhost:2181", "mygroup", topic, StorageLevel.MEMORY_AND_DISK)
    // stateful word count accumulated across batches
    val updateFunc = (curVal: Seq[Int], preVal: Option[Int]) => {
      // add this batch's counts to the previous total
      val total = curVal.sum
      // the previous total defaults to 0 on the first batch
      val previous = preVal.getOrElse(0)
      // Some wraps the new state kept for the next batch
      Some(total + previous)
    }
    data.map(_._2).flatMap(_.split(" ")).map(word => (word, 1)).updateStateByKey(updateFunc).print()
    // start the streaming context
    ssc.start()
    ssc.awaitTermination()
  }

  def receiver() = {
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    val conf = new SparkConf().setAppName("kafka test").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint("/out")
    val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
    val lines = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap).map(_._2)
    val updateFunc = (curVal: Seq[Int], preVal: Option[Int]) => {
      // add this batch's counts to the previous total
      val total = curVal.sum
      // the previous total defaults to 0 on the first batch
      val previous = preVal.getOrElse(0)
      // Some wraps the new state kept for the next batch
      Some(total + previous)
    }
    val words = lines.flatMap(_.split(" ")).map(x => (x, 1))
    words.reduceByKey(_ + _).updateStateByKey(updateFunc).print()
    ssc.start()
    ssc.awaitTermination()
  }


  def direct() = {
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
    val conf = new SparkConf().setMaster("local[2]").setAppName("kafka test")
    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint("/out")
    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicsSet)
    val lines = messages.map(_._2)
    val words = lines.flatMap(_.split(" ")).map(x => (x, 1))
    val updateFunc = (curVal: Seq[Int], preVal: Option[Int]) => {
      // add this batch's counts to the previous total
      val total = curVal.sum
      // the previous total defaults to 0 on the first batch
      val previous = preVal.getOrElse(0)
      // Some wraps the new state kept for the next batch
      Some(total + previous)
    }
    words.reduceByKey(_ + _).updateStateByKey(updateFunc).print()
    ssc.start()
    ssc.awaitTermination()
  }

}
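The code above uses the old Kafka 0.8 integration (KafkaUtils.createStream / createDirectStream with StringDecoder), which ships as the separate spark-streaming-kafka-0-8 artifact and was dropped in Spark 3. One way to run it, assuming Spark 2.x built for Scala 2.11, is to pull the connector in with --packages; the connector version should track your Spark version, and the jar path below is only a placeholder for your own build output:

# submit locally; the jar name is a placeholder for your packaged project
spark-submit \
  --master local[2] \
  --class com.spark.self.WordCountSprakStreaming \
  --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.2.0 \
  target/scala-2.11/wordcount_2.11-0.1.jar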

For more on Flume's concepts and configuration, see:

https://blog.csdn.net/weixin_37835915/article/details/103184553
