SQLAccessLogAnalyzer in Spark SQL

1. SQLAccessLogAnalyzerSpark

package com.ibeifeng.bigdata.spark.sql

import com.ibeifeng.bigdata.spark.core.ApacheAccessLog
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkContext, SparkConf}

/**
 * Created by XuanYu on 2017/3/5.
 */
object SQLAccessLogAnalyzerSpark {

  def main(args: Array[String]) {

    // configuration
    val sparkConf = new SparkConf()
    sparkConf.setAppName("SQLAccessLogAnalyzer Application").setMaster("local[2]")

    // create (or reuse) the SparkContext
    val sc = SparkContext.getOrCreate(sparkConf)

    /**
     * SQLContext: the entry point for Spark SQL
     */
    val sqlContext = new SQLContext(sc)
    // this is used to implicitly convert an RDD to a DataFrame.
    import sqlContext.implicits._

// ================================================================================
    // user-defined functions
    // Example 1: register a UDF that upper-cases a string
    sqlContext.udf.register(
      "toUpper",  // function name
      (word: String) => word.toUpperCase()
    )


    // Example 2: round a Double to `scale` decimal places
    sqlContext.udf.register(
      "double_scala",
      (numValue: Double, scale: Int) => {
        import java.math.BigDecimal
        val db = new BigDecimal(numValue)
        db.setScale(scale, BigDecimal.ROUND_HALF_UP).doubleValue()
      }
    )

    // Example 3: register a UDAF (AvgSalUDAF, sketched in section 3 below)
    sqlContext.udf.register(
      "sal_avg",
      AvgSalUDAF
    )


    // ================================================================================
    val logFile = "/test/access_log"
    val accessLogsRdd: RDD[ApacheAccessLog] = sc
      .textFile(logFile)   // read file from hdfs
      // filter
      .filter(ApacheAccessLog.isValidateLogLine)
      // parse
      .map(log => ApacheAccessLog.parseLogLine(log))
    /**
     * RDD -> DataFrame
     *   one way: an RDD of a case class, converted with toDF()
     */
    // Create DataFrame
    val accessLogsDF: DataFrame = accessLogsRdd.toDF()

    // print the schema
    /**
     * root
       |-- ipAddress: string (nullable = true)
       |-- clientIdented: string (nullable = true)
       |-- userId: string (nullable = true)
       |-- dateTime: string (nullable = true)
       |-- method: string (nullable = true)
       |-- endpoint: string (nullable = true)
       |-- protocol: string (nullable = true)
       |-- responseCode: integer (nullable = false)
       |-- contentSize: long (nullable = false)
     */
    accessLogsDF.printSchema()

    /**
     * Analyze with SQL next. SQL runs against tables, so register the
     * DataFrame as a temporary table, then query it with SQL.
     */
    // register the DataFrame as a temp table
    accessLogsDF.registerTempTable("access_log")
    // cache the table in memory
    sqlContext.cacheTable("access_log")
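
    // Illustrative only (not in the original): quick sanity checks of the
    // functions registered above, run against the cached table.
    // sqlContext.sql("SELECT toUpper(method) FROM access_log LIMIT 3").show()
    // sqlContext.sql("SELECT sal_avg(contentSize) FROM access_log").show()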

    /**
     * Requirement 1:
     *   The average, min, and max content size of responses returned from the server.
     */
    val contentSizesRow = sqlContext.sql(
      """
        |SELECT
        |  SUM(contentSize), COUNT(*), MIN(contentSize), MAX(contentSize)
        |FROM
        |  access_log
      """.stripMargin).first()

    // println
    println("Content Size Avg: %s, Min: %s, Max: %s".format(
      contentSizesRow.getLong(0) / contentSizesRow.getLong(1),
      contentSizesRow(2),
      contentSizesRow(3)
    ))
    // Content Size Avg: 7777, Min: 0, Max: 138789
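
    // Illustrative alternative (not in the original): let SQL compute the
    // average directly and round it with the double_scala UDF registered above.
    // sqlContext.sql(
    //   "SELECT double_scala(AVG(contentSize), 2) FROM access_log").show()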

    /**
     * Requirement 2:
     *   A count of the response codes returned.
     */
    val responseCodeToCount: Array[(Int, Long)] = sqlContext.sql(
      """
        |SELECT
        | responseCode, COUNT(*)
        |FROM
        | access_log
        |GROUP BY
        |  responseCode
        |LIMIT
        |  10
      """.stripMargin).map(row => (row.getInt(0), row.getLong(1))).collect()
    println(
      s"""Response Code Count: ${responseCodeToCount.mkString("[",", ","]")}"""
    )

    /**
     * Requirement 3:
     *   All IP addresses that have accessed this server more than N times (here N = 10).
     */
    val ipAddresses: Array[String] = sqlContext.sql(
      """
        |SELECT
        | ipAddress, COUNT(*) AS total
        |FROM
        | access_log
        |GROUP BY
        | ipAddress
        |HAVING
        | total > 10
        |LIMIT
        | 10
      """.stripMargin).map(row => row.getString(0)).collect()
    println(
      s"""IP Addresses: ${ipAddresses.mkString("[",", ","]")}"""
    )

    /**
     * Requirement 4:
     *   The top endpoints requested, by count.
     */
    val topEndpoints: Array[(String, Long)] = sqlContext.sql(
      """
        |SELECT
        | endpoint, COUNT(*) AS total
        |FROM
        | access_log
        |GROUP BY
        | endpoint
        |ORDER BY
        | total DESC
        |LIMIT
        | 10
      """.stripMargin).map(row => (row.getString(0), row.getLong(1))).collect()
    println(
      s"""Top Endpoints: ${topEndpoints.mkString("[",", ","]")}"""
    )
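
    // Illustrative equivalent (not in the original) using the DataFrame API:
    // accessLogsDF.groupBy("endpoint").count()
    //   .orderBy($"count".desc).limit(10).show()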

    // uncache the table
    sqlContext.uncacheTable("access_log")


    /**
     * keep the application alive so the Spark web UI can still be inspected (debugging)
     */
    Thread.sleep(10000000)

    // stop the SparkContext
    sc.stop()
  }


}

2. ApacheAccessLog

package com.ibeifeng.bigdata.spark.core

import scala.util.matching.Regex
import scala.util.matching.Regex.Match

/**
 * Sample line:
 * 1.1.1.1 - - [21/Jul/2014:10:00:00 -0800] "GET /chapter1/java/src/main/java/com/databricks/apps/logs/LogAnalyzer.java HTTP/1.1" 200 1234
 */
case class ApacheAccessLog(
  ipAddress: String,
  clientIdented: String,
  userId: String,
  dateTime: String,
  method: String,
  endpoint: String,
  protocol: String,
  responseCode: Int,
  contentSize: Long )

object ApacheAccessLog{

  // regex matching one access-log line
  val PATTERN: Regex = """^(\S+) (-|\S+) (-|\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\d+)""".r
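  // capture groups: (1) ipAddress, (2) clientIdented, (3) userId, (4) dateTime,
  // (5) method, (6) endpoint, (7) protocol, (8) responseCode, (9) contentSize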


  /**
   * Check whether a line matches the access-log pattern.
   *
   * @param log a raw log line
   * @return true if the line can be parsed
   */
  def isValidateLogLine(log: String): Boolean = {
    // try to match the line against the pattern
    val res: Option[Match] = PATTERN.findFirstMatchIn(log)

    // valid when the pattern matched
    res.isDefined
  }


  /**
   * Parse a log line into the corresponding case class.
   *
   * @param log a raw log line
   * @return the parsed ApacheAccessLog
   */
  def parseLogLine(log: String): ApacheAccessLog = {
    // match the line against the regular expression
    val res: Option[Match] = PATTERN.findFirstMatchIn(log)

    // fail fast on lines that do not match
    if(res.isEmpty){
      throw new RuntimeException("Cannot parse log line: " + log)
    }

    // get value
    val m: Match = res.get

    // return
    ApacheAccessLog(
      m.group(1),
      m.group(2),
      m.group(3),
      m.group(4),
      m.group(5),
      m.group(6),
      m.group(7),
      m.group(8).toInt,
      m.group(9).toLong
    )
  }


}
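
3. AvgSalUDAF (sketch)

The AvgSalUDAF object registered as "sal_avg" in section 1 is not shown in the original listing. The sketch below shows what a minimal average UDAF could look like, assuming the UserDefinedAggregateFunction API available since Spark 1.5; the actual implementation may differ.

package com.ibeifeng.bigdata.spark.sql

import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

object AvgSalUDAF extends UserDefinedAggregateFunction {

  // input: a single Double column (the value to average)
  override def inputSchema: StructType = StructType(
    StructField("inputValue", DoubleType, nullable = true) :: Nil
  )

  // aggregation buffer: running sum and running count
  override def bufferSchema: StructType = StructType(
    StructField("sum", DoubleType, nullable = true) ::
    StructField("count", LongType, nullable = true) :: Nil
  )

  // result type: the average, as a Double
  override def dataType: DataType = DoubleType

  // the same input always produces the same output
  override def deterministic: Boolean = true

  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = 0.0   // sum
    buffer(1) = 0L    // count
  }

  // fold one input row into the buffer
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    if (!input.isNullAt(0)) {
      buffer(0) = buffer.getDouble(0) + input.getDouble(0)
      buffer(1) = buffer.getLong(1) + 1L
    }
  }

  // merge two partial buffers computed on different partitions
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    buffer1(0) = buffer1.getDouble(0) + buffer2.getDouble(0)
    buffer1(1) = buffer1.getLong(1) + buffer2.getLong(1)
  }

  // final result: sum / count (0.0 when no rows were aggregated)
  override def evaluate(buffer: Row): Any =
    if (buffer.getLong(1) == 0L) 0.0 else buffer.getDouble(0) / buffer.getLong(1)
}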
