1、監控本地文件夾下的文件信息
- import org.apache.spark.SparkConf
- import org.apache.spark.streaming.{Seconds, StreamingContext}
- import org.apache.spark.streaming.StreamingContext._
object HdfsWordCount {
  /** Word count over files appearing in a watched local directory.
    * Runs locally with 2 threads: one receives data, one processes batches.
    */
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("HdfsWordCount").setMaster("local[2]")
    // 20-second batch interval.
    val ssc = new StreamingContext(sparkConf, Seconds(20))
    // Stream of lines from newly created files under the watched directory.
    val fileStream = ssc.textFileStream("/home/mmicky/temp/")
    val counts = fileStream
      .flatMap(line => line.split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
    counts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
1)構建socket模擬週期發送數據
- import java.io.{PrintWriter}
- import java.net.ServerSocket
- import scala.io.Source
object SaleSimulation {
  // Sales simulator: listens on a port and replays random lines of a file to
  // each connected client at a fixed interval.
  // Usage: <filename> <port> <millisecond>

  // Shared generator. The original allocated a new java.util.Random on every
  // index() call, which is wasteful and — with time-based default seeding —
  // risks identically-seeded generators on rapid successive calls.
  private val rdm = new java.util.Random

  /** Pick a uniformly random row index in [0, length). */
  def index(length: Int) = rdm.nextInt(length)

  def main(args: Array[String]) {
    if (args.length != 3) {
      System.err.println("Usage: <filename> <port> <millisecond>")
      System.exit(1)
    }
    val filename = args(0)
    // Read the whole file once up front and close the handle afterwards;
    // the original leaked the Source.
    val source = Source.fromFile(filename)
    val lines = try source.getLines.toList finally source.close()
    val filerow = lines.length
    val listener = new ServerSocket(args(1).toInt)
    // Accept clients forever; each client is served by its own thread.
    while (true) {
      val socket = listener.accept()
      new Thread() {
        override def run = {
          println("Got client connected from: " + socket.getInetAddress)
          // try/finally guarantees the client socket is closed even when the
          // client disconnects and a write fails; the original placed
          // socket.close() after an infinite loop, making it unreachable.
          try {
            val out = new PrintWriter(socket.getOutputStream(), true)
            while (true) {
              Thread.sleep(args(2).toLong)
              val content = lines(index(filerow))
              println(content)
              out.write(content + '\n')
              out.flush()
            }
          } finally {
            socket.close()
          }
        }
      }.start()
    }
  }
}
- 運行:java -cp week5.jar week5.SaleSimulation /home/mmicky/data/spark/people.txt 9999 1000 //從people文件隨機讀取,發送端口9999,間隔1秒
- import org.apache.spark.{SparkContext, SparkConf}
- import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext}
- import org.apache.spark.streaming.StreamingContext._
- import org.apache.spark.storage.StorageLevel
object NetworkWordCount {
  /** Word count over a comma-separated socket text stream.
    * args(0) = host, args(1) = port.
    */
  def main(args: Array[String]) {
    // Two local threads: one listens on the socket, one processes batches.
    val conf = new SparkConf().setAppName("NetworkWordCount").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // 5-second batch interval.
    val ssc = new StreamingContext(sc, Seconds(5))
    // Received blocks are stored serialized, spilling to disk when memory fills.
    val stream = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_AND_DISK_SER)
    val counts = stream
      .flatMap(_.split(","))
      .map((_, 1))
      .reduceByKey(_ + _)
    counts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
- import org.apache.spark.{SparkContext, SparkConf}
- import org.apache.spark.streaming.{Seconds, StreamingContext}
- import org.apache.spark.streaming.StreamingContext._
object StatefulWordCount {
  /** Word count whose per-key totals accumulate across batches via
    * updateStateByKey. args(0) = host, args(1) = port.
    */
  def main(args: Array[String]) {
    // Merge this batch's counts for a key with the running total carried
    // over from earlier batches; a key never seen before starts at 0.
    val updateFunc = (values: Seq[Int], state: Option[Int]) => {
      Some(values.sum + state.getOrElse(0))
    }
    val conf = new SparkConf().setAppName("StatefulWordCount").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    // Stateful streams must checkpoint so accumulated counts survive a
    // failure; with no interval configured, a checkpoint is written for
    // every incoming batch.
    ssc.checkpoint(".")
    val stream = ssc.socketTextStream(args(0), args(1).toInt)
    val pairs = stream.flatMap(_.split(",")).map(word => (word, 1))
    // Apply updateFunc per key across batches.
    val totals = pairs.updateStateByKey[Int](updateFunc)
    totals.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
- import org.apache.spark.{SparkContext, SparkConf}
- import org.apache.spark.storage.StorageLevel
- import org.apache.spark.streaming._
- import org.apache.spark.streaming.StreamingContext._
object WindowWordCount {
  /** Sliding-window word count over a comma-separated socket text stream.
    * args(0) = host, args(1) = port,
    * args(2) = window length in seconds, args(3) = slide interval in seconds.
    */
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("WindowWordCount").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // 5-second batch interval.
    val ssc = new StreamingContext(sc, Seconds(5))
    ssc.checkpoint(".")
    val stream = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_ONLY_SER)
    val words = stream.flatMap(_.split(","))
    // Windowed count:
    //   args(2) = window length  — must be a multiple of the 5s batch interval, e.g. 30
    //   args(3) = slide interval — also a multiple of the batch interval, e.g. 10
    // i.e. every args(3) seconds, count words over the last args(2) seconds.
    val windowed = words
      .map(word => (word, 1))
      .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(args(2).toInt), Seconds(args(3).toInt))
    // Incremental alternative (add the new slice, subtract the slice that
    // fell out of the window):
    //   words.map((_, 1)).reduceByKeyAndWindow(_ + _, _ - _, Seconds(args(2).toInt), Seconds(args(3).toInt))
    // To sort by count, go through transform — sortByKey works on a single
    // RDD, while a DStream is a sequence of RDDs:
    //   windowed.map { case (w, c) => (c, w) }.transform(_.sortByKey(false))
    windowed.print()
    ssc.start()
    ssc.awaitTermination()
  }
}