Spark Structured Streaming消費kafka的數據sink全過程

前言:
kafka的消息是通過Filebeat採集Nginx的日誌後寫入kafka的。
spark消費kafka的數據

1、源消息:

{
    "@timestamp":"2020-04-18T09:30:41.525Z",
    "@metadata":{
        "beat":"filebeat",
        "type":"_doc",
        "version":"7.2.0",
        "topic":"bigdata_nginx_access"
    },
    "message":"192.168.25.1 - - [18/Apr/2020:14:15:45 +0800] \"GET /nocar/Download?a=520&g=520p&b=h&p=123456 HTTP/1.1\" 200 103580 \"-\""
}

2、依賴 pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.test.bigdata</groupId>
    <artifactId>scalaspark</artifactId>
    <version>1.0-SNAPSHOT</version>


    <properties>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <!-- java.version was referenced by the scala-maven-plugin javacArgs below
             but never defined, so -source/-target received the literal text
             "${java.version}". Keep it in sync with maven.compiler.* -->
        <java.version>1.8</java.version>
        <scala.version>2.11.8</scala.version>
        <spark.version>2.4.0</spark.version>
        <hadoop.version>3.0.0</hadoop.version>
        <jackson.version>2.6.2</jackson.version>
        <encoding>UTF-8</encoding>
    </properties>


    <repositories>
        <repository>
            <id>scala-tools.org</id>
            <name>Scala-Tools Maven2 Repository</name>
            <url>http://scala-tools.org/repo-Releases</url>
        </repository>
    </repositories>
    <pluginRepositories>
        <pluginRepository>
            <id>scala-tools.org</id>
            <name>Scala-Tools Maven2 Repository</name>
            <url>http://scala-tools.org/repo-Releases</url>
        </pluginRepository>
    </pluginRepositories>

    <dependencies>

        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming-kafka-0-10_2.11 -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql-kafka-0-10_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.44</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-core</artifactId>
            <version>${jackson.version}</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>${jackson.version}</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-annotations</artifactId>
            <version>${jackson.version}</version>
        </dependency>
        <!-- SECURITY: fastjson 1.1.41 is very old and affected by multiple known
             deserialization RCE vulnerabilities; upgrade before any production use. -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.1.41</version>
        </dependency>



    </dependencies>
    <build>
        <resources>
            <resource>
                <directory>src/config</directory>
                <includes>
                    <include>**/*.properties</include>
                </includes>
            </resource>
        </resources>
        <plugins>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <classifier>dist</classifier>
                    <appendAssemblyId>true</appendAssemblyId>
                    <descriptorRefs>
                        <!-- The child element of descriptorRefs is descriptorRef;
                             the original <descriptor> tag is silently ignored and
                             the fat jar is never produced. -->
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <!-- Was hard-coded to 1.7, contradicting the 1.8
                         maven.compiler.* properties declared above. -->
                    <source>${maven.compiler.source}</source>
                    <target>${maven.compiler.target}</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
                        <phase>process-resources</phase>
                        <goals>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <scalaVersion>${scala.version}</scalaVersion>
                    <recompileMode>incremental</recompileMode>
                    <useZincServer>true</useZincServer>
                    <args>
                        <arg>-unchecked</arg>
                        <arg>-deprecation</arg>
                        <arg>-feature</arg>
                    </args>
                    <jvmArgs>
                        <jvmArg>-Xms1024m</jvmArg>
                        <jvmArg>-Xmx1024m</jvmArg>
                    </jvmArgs>
                    <javacArgs>
                        <javacArg>-source</javacArg>
                        <javacArg>${java.version}</javacArg>
                        <javacArg>-target</javacArg>
                        <javacArg>${java.version}</javacArg>
                        <javacArg>-Xlint:all,-serial,-path</javacArg>
                    </javacArgs>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.antlr</groupId>
                <artifactId>antlr4-maven-plugin</artifactId>
                <version>4.3</version>
                <executions>
                    <execution>
                        <id>antlr</id>
                        <goals>
                            <goal>antlr4</goal>
                        </goals>
                        <phase>none</phase>
                    </execution>
                </executions>
                <configuration>
                    <outputDirectory>src/test/java</outputDirectory>
                    <listener>true</listener>
                    <treatWarningsAsErrors>true</treatWarningsAsErrors>
                </configuration>
            </plugin>
        </plugins>
    </build>


</project>

3、代碼:

package com.test.kafka

import java.text.SimpleDateFormat
import java.util.concurrent.TimeUnit
import java.util.regex.{Matcher, Pattern}
import java.util.{Date, Locale}

import com.google.gson.Gson
import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery, StreamingQueryException, Trigger}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

import scala.util.control.NonFatal

object StrutsSparkTest {

  def main(args: Array[String]): Unit = {

    //System.setProperty("hadoop.home.dir", "D:\\Software\\hadoop-common-2.6.0-bin-master\\")
    // Create the SparkSession (local mode, all cores — for testing).
    val spark: SparkSession = SparkSession.builder()
      .appName("struts_spark")
      .master("local[*]")
      .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    import spark.implicits._

    // Kafka source settings.
    val topic = "WordCount"
    val kafkaCluer = "192.168.25.121:9092,192.168.25.122:9092,192.168.25.123:9092"

    // Source: subscribe to the topic, starting from the latest offsets.
    val df: DataFrame = spark.readStream.format("kafka")
      .option("kafka.bootstrap.servers", kafkaCluer)
      .option("subscribe", topic)
      .option("startingOffsets", "latest")
      .load()
    val kafkaDF: Dataset[(String, String)] =
      df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)").as[(String, String)]

    // Channel: parse the Filebeat JSON envelope and drop dirty records.
    // NOTE: handleMessage2CaseClass never returns null — on parse failure it
    // returns the sentinel Msg(null, null, null), so the original
    // `.filter(_ != null)` removed nothing; filter on the message field.
    // We also extract the raw log line here instead of going through
    // select("message") + Row.toString, which wrapped the line in "[...]"
    // and polluted the first field with a leading '['.
    val msg: Dataset[String] = kafkaDF
      .map(record => handleMessage2CaseClass(record._2))
      .filter(m => m != null && m.message != null)
      .map(_.message)
      .filter { line =>
        // Keep only lines with enough fields that start with a valid IP.
        val fields: Array[String] = line.split(" ")
        fields.length > 10 && isIp(fields(0))
      }

    // Split each nginx access-log line into a structured record.
    val nginx: Dataset[nginxDesc] = msg.map(line => {
      val strings: Array[String] = line.split(" ")
      if (strings(6).contains("?")) {
        val urlStr: Array[String] = strings(6).split("\\?")
        strings(6) = urlStr(0) // url path
        strings(7) = urlStr(1) // query string
      }
      if (strings(3).contains("[")) {
        // nginx time field looks like "[18/Apr/2020:14:15:45".
        // HH (24-hour clock) is required: the original pattern used "hh"
        // (12-hour), which made every afternoon timestamp fail/shift.
        val formatter = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.ENGLISH)
        val date: Date = formatter.parse(strings(3).replace("[", ""))
        val format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
        strings(3) = format.format(date) // normalized nginx time
      }
      if (strings(5).contains("\""))
        strings(5) = strings(5).replace("\"", "") // HTTP method (GET/POST)
      nginxDesc(strings(0), strings(3), strings(5), strings(6), strings(7), strings(8).toInt, strings(9).toInt)
    })

    // Sink: write to local CSV files, mainly for verification.
//    val query: StreamingQuery = nginx.writeStream
//      .outputMode("append")
//      .format("console")
//      .start()
    val outputPath = "F:\\tmp\\spark\\data"           // output path
    val checkpointLocation = "F:\\tmp\\spark\\checkpoint" // checkpoint path

    val query: StreamingQuery = nginx.writeStream
      .format("csv")
      .option("path", outputPath)
      .option("checkpointLocation", checkpointLocation)
      .outputMode(OutputMode.Append)
      .trigger(Trigger.ProcessingTime(1, TimeUnit.MINUTES))
      .start()

    try
      query.awaitTermination()
    catch {
      case e: StreamingQueryException =>
        e.printStackTrace()
    }
  }


  /**
    * Checks whether the given string is a valid dotted-quad IPv4 address.
    *
    * Uses matches() so the WHOLE string must be an IP: the original find()
    * accepted any string merely containing an IP (e.g. "[192.168.1.1").
    *
    * @param addr candidate address
    * @return true if addr is a well-formed IPv4 address
    */
  def isIp(addr: String): Boolean = {
    if (addr == null || addr.length < 7 || addr.length > 15)
      false
    else {
      val pat: Pattern = Pattern.compile("([1-9]|[1-9]\\d|1\\d{2}|2[0-4]\\d|25[0-5])(\\.(\\d|[1-9]\\d|1\\d{2}|2[0-4]\\d|25[0-5])){3}")
      pat.matcher(addr).matches()
    }
  }

  /***
    * One parsed nginx access-log record.
    *
    * @param ip        client IP
    * @param nginxTime request time, normalized to yyyy-MM-dd HH:mm:ss
    * @param method    HTTP method
    * @param urlStr    request path (query string stripped)
    * @param http      query string / protocol field
    * @param status    HTTP status code
    * @param flow      response size in bytes
    */
  case class nginxDesc(ip:String, nginxTime:String, method:String, urlStr:String, http:String,
                       status:Int,flow:Int )

  /** Filebeat envelope around one log line.
    *
    * NOTE(review): Gson maps the JSON field "message" to Msg.message, but
    * "@timestamp"/"@metadata" do NOT bind to timestamp/metadata without
    * @SerializedName, so those fields stay null — confirm if they are needed.
    *
    * @param timestamp event timestamp
    * @param metadata beat metadata
    * @param message raw nginx log line
    */
  case class Msg(timestamp:String,metadata:String ,message:String)

  /**
    * Parses one Kafka record value (Filebeat JSON) into a Msg.
    *
    * @param jsonStr raw JSON string
    * @return the parsed Msg, or the sentinel Msg(null, null, null) when
    *         parsing fails (callers must filter on the message field)
    */
  def handleMessage2CaseClass(jsonStr: String): Msg = {
    val gson = new Gson()
    try {
      gson.fromJson(jsonStr, classOf[Msg])
    } catch {
      // NonFatal keeps OutOfMemoryError/InterruptedException propagating.
      case NonFatal(ex) =>
        ex.printStackTrace() // to stderr
        System.err.println("exception===>: ...") // to stderr
        // Return a sentinel object instead of null
        Msg(null,null,null)
    }
  }


}


4、結果:

1、第一次更新
2020/06/09 只是打印的消息體。
在這裏插入圖片描述
2、第二次更新
2020/06/10,優化了消息源的判斷,異常捕捉,拆分nginx日誌
在這裏插入圖片描述
3、第三次更新
2020/06/11,優化了數據的sink,驗證數據
在這裏插入圖片描述
在這裏插入圖片描述

數據分析小白入門指南

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章