1、監控本地文件夾下的文件信息
- import org.apache.spark.SparkConf
- import org.apache.spark.streaming.{Seconds, StreamingContext}
- import org.apache.spark.streaming.StreamingContext._
object HdfsWordCount {
  /** Word count over files appearing in a watched local directory.
    * Runs locally with 2 threads: one receives data, one processes batches.
    */
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("HdfsWordCount").setMaster("local[2]")
    // 20-second batch interval.
    val ssc = new StreamingContext(sparkConf, Seconds(20))
    // Stream of lines from newly created files under the watched directory.
    val fileStream = ssc.textFileStream("/home/mmicky/temp/")
    val counts = fileStream
      .flatMap(line => line.split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
    counts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
1)構建socket模擬週期發送數據
- import java.io.{PrintWriter}
- import java.net.ServerSocket
- import scala.io.Source
object SaleSimulation {
  // Sales simulator: listens on a port and replays random lines of a file to
  // each connected client at a fixed interval.
  // Usage: <filename> <port> <millisecond>

  // Shared generator. The original allocated a new java.util.Random on every
  // index() call, which is wasteful and — with time-based default seeding —
  // risks identically-seeded generators on rapid successive calls.
  private val rdm = new java.util.Random

  /** Pick a uniformly random row index in [0, length). */
  def index(length: Int) = rdm.nextInt(length)

  def main(args: Array[String]) {
    if (args.length != 3) {
      System.err.println("Usage: <filename> <port> <millisecond>")
      System.exit(1)
    }
    val filename = args(0)
    // Read the whole file once up front and close the handle afterwards;
    // the original leaked the Source.
    val source = Source.fromFile(filename)
    val lines = try source.getLines.toList finally source.close()
    val filerow = lines.length
    val listener = new ServerSocket(args(1).toInt)
    // Accept clients forever; each client is served by its own thread.
    while (true) {
      val socket = listener.accept()
      new Thread() {
        override def run = {
          println("Got client connected from: " + socket.getInetAddress)
          // try/finally guarantees the client socket is closed even when the
          // client disconnects and a write fails; the original placed
          // socket.close() after an infinite loop, making it unreachable.
          try {
            val out = new PrintWriter(socket.getOutputStream(), true)
            while (true) {
              Thread.sleep(args(2).toLong)
              val content = lines(index(filerow))
              println(content)
              out.write(content + '\n')
              out.flush()
            }
          } finally {
            socket.close()
          }
        }
      }.start()
    }
  }
}
- 運行:java -cp week5.jar week5.SaleSimulation /home/mmicky/data/spark/people.txt 9999 1000 //從people文件隨機讀取,發送端口9999,間隔1秒
- import org.apache.spark.{SparkContext, SparkConf}
- import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext}
- import org.apache.spark.streaming.StreamingContext._
- import org.apache.spark.storage.StorageLevel
object NetworkWordCount {
  /** Word count over a comma-separated socket text stream.
    * args(0) = host, args(1) = port.
    */
  def main(args: Array[String]) {
    // Two local threads: one listens on the socket, one processes batches.
    val conf = new SparkConf().setAppName("NetworkWordCount").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // 5-second batch interval.
    val ssc = new StreamingContext(sc, Seconds(5))
    // Received blocks are stored serialized, spilling to disk when memory fills.
    val stream = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_AND_DISK_SER)
    val counts = stream
      .flatMap(_.split(","))
      .map((_, 1))
      .reduceByKey(_ + _)
    counts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
- import org.apache.spark.{SparkContext, SparkConf}
- import org.apache.spark.streaming.{Seconds, StreamingContext}
- import org.apache.spark.streaming.StreamingContext._
object StatefulWordCount {
  /** Word count whose per-key totals accumulate across batches via
    * updateStateByKey. args(0) = host, args(1) = port.
    */
  def main(args: Array[String]) {
    // Merge this batch's counts for a key with the running total carried
    // over from earlier batches; a key never seen before starts at 0.
    val updateFunc = (values: Seq[Int], state: Option[Int]) => {
      Some(values.sum + state.getOrElse(0))
    }
    val conf = new SparkConf().setAppName("StatefulWordCount").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    // Stateful streams must checkpoint so accumulated counts survive a
    // failure; with no interval configured, a checkpoint is written for
    // every incoming batch.
    ssc.checkpoint(".")
    val stream = ssc.socketTextStream(args(0), args(1).toInt)
    val pairs = stream.flatMap(_.split(",")).map(word => (word, 1))
    // Apply updateFunc per key across batches.
    val totals = pairs.updateStateByKey[Int](updateFunc)
    totals.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
- import org.apache.spark.{SparkContext, SparkConf}
- import org.apache.spark.storage.StorageLevel
- import org.apache.spark.streaming._
- import org.apache.spark.streaming.StreamingContext._
object WindowWordCount {
  /** Sliding-window word count over a comma-separated socket text stream.
    * args(0) = host, args(1) = port,
    * args(2) = window length in seconds, args(3) = slide interval in seconds.
    */
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("WindowWordCount").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // 5-second batch interval.
    val ssc = new StreamingContext(sc, Seconds(5))
    ssc.checkpoint(".")
    val stream = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_ONLY_SER)
    val words = stream.flatMap(_.split(","))
    // Windowed count:
    //   args(2) = window length  — must be a multiple of the 5s batch interval, e.g. 30
    //   args(3) = slide interval — also a multiple of the batch interval, e.g. 10
    // i.e. every args(3) seconds, count words over the last args(2) seconds.
    val windowed = words
      .map(word => (word, 1))
      .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(args(2).toInt), Seconds(args(3).toInt))
    // Incremental alternative (add the new slice, subtract the slice that
    // fell out of the window):
    //   words.map((_, 1)).reduceByKeyAndWindow(_ + _, _ - _, Seconds(args(2).toInt), Seconds(args(3).toInt))
    // To sort by count, go through transform — sortByKey works on a single
    // RDD, while a DStream is a sequence of RDDs:
    //   windowed.map { case (w, c) => (c, w) }.transform(_.sortByKey(false))
    windowed.print()
    ssc.start()
    ssc.awaitTermination()
  }
}