1. SQLAccessLogAnalyzerSpark
package com.ibeifeng.bigdata.spark.sql

import com.ibeifeng.bigdata.spark.core.ApacheAccessLog
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkContext, SparkConf}

/**
 * Created by XuanYu on 2017/3/5.
 */
object SQLAccessLogAnalyzerSpark {

  def main(args: Array[String]) {
    // Read the configuration
    val sparkConf = new SparkConf()
    sparkConf.setAppName("SQLAccessLogAnalyzer Application").setMaster("local[2]")

    // Create the SparkContext instance
    val sc = SparkContext.getOrCreate(sparkConf)

    /**
     * Create the Spark SQL entry point: SQLContext
     */
    val sqlContext = new SQLContext(sc)
    // this is used to implicitly convert an RDD to a DataFrame.
    import sqlContext.implicits._

    // ================================================================================
    // User-defined functions

    // Case 1: register a UDF (one input value -> one output value)
    sqlContext.udf.register(
      "toUpper", // function name
      (word: String) => word.toUpperCase()
    )

    // Case 2: round a Double value to the given number of decimal places
    sqlContext.udf.register(
      "double_scala",
      (numValue: Double, scale: Int) => {
        import java.math.BigDecimal
        val db = new BigDecimal(numValue)
        db.setScale(scale, BigDecimal.ROUND_HALF_UP).doubleValue()
      }
    )

    // Case 3: register a user-defined aggregate function (UDAF)
    sqlContext.udf.register(
      "sal_avg",
      AvgSalUDAF
    )
    // ================================================================================

    val logFile = "/test/access_log"

    val accessLogsRdd: RDD[ApacheAccessLog] = sc
      .textFile(logFile) // read file from HDFS
      // filter out lines that do not match the log format
      .filter(ApacheAccessLog.isValidateLogLine)
      // parse each line into a case class
      .map(log => ApacheAccessLog.parseLogLine(log))

    /**
     * RDD -> DataFrame
     * One way to do this is via an RDD of a case class (RDD[CASE CLASS]).
     */
    // Create DataFrame
    val accessLogsDF: DataFrame = accessLogsRdd.toDF()

    // Print the schema
    /**
     * root
     *  |-- ipAddress: string (nullable = true)
     *  |-- clientIdented: string (nullable = true)
     *  |-- userId: string (nullable = true)
     *  |-- dateTime: string (nullable = true)
     *  |-- method: string (nullable = true)
     *  |-- endpoint: string (nullable = true)
     *  |-- protocol: string (nullable = true)
     *  |-- responseCode: integer (nullable = false)
     *  |-- contentSize: long (nullable = false)
     */
    accessLogsDF.printSchema()

    /**
     * The analysis below uses SQL statements. SQL runs against tables, so the DataFrame
     * is first registered as a table and the queries are then issued against it.
     */
    // Register the DataFrame as a temporary table
    accessLogsDF.registerTempTable("access_log")

    // Cache the table's data in memory
    sqlContext.cacheTable("access_log")

    /**
     * Requirement 1: The average, min, and max content size of responses returned from the server.
     */
    val contentSizesRow = sqlContext.sql(
      """
        |SELECT
        |  SUM(contentSize), COUNT(*), MIN(contentSize), MAX(contentSize)
        |FROM
        |  access_log
      """.stripMargin).first()

    // println
    println("Content Size Avg: %s, Min: %s, Max: %s".format(
      contentSizesRow.getLong(0) / contentSizesRow.getLong(1),
      contentSizesRow(2),
      contentSizesRow(3)
    ))
    // Content Size Avg: 7777, Min: 0, Max: 138789

    /**
     * Requirement 2: A count of response codes returned.
     */
    val responseCodeToCount: Array[(Int, Long)] = sqlContext.sql(
      """
        |SELECT
        |  responseCode, COUNT(*)
        |FROM
        |  access_log
        |GROUP BY
        |  responseCode
        |LIMIT
        |  10
      """.stripMargin).map(row => (row.getInt(0), row.getLong(1))).collect()

    println(
      s"""Response Code Count: ${responseCodeToCount.mkString("[", ", ", "]")}"""
    )

    /**
     * Requirement 3: All IP addresses that have accessed this server more than N times.
     */
    val ipAddresses: Array[String] = sqlContext.sql(
      """
        |SELECT
        |  ipAddress, COUNT(*) AS total
        |FROM
        |  access_log
        |GROUP BY
        |  ipAddress
        |HAVING
        |  total > 10
        |LIMIT
        |  10
      """.stripMargin).map(row => row.getString(0)).collect()

    println(
      s"""IP Addresses: ${ipAddresses.mkString("[", ", ", "]")}"""
    )

    /**
     * Requirement 4: The top endpoints requested by count.
     */
    val topEndpoints: Array[(String, Long)] = sqlContext.sql(
      """
        |SELECT
        |  endpoint, COUNT(*) AS total
        |FROM
        |  access_log
        |GROUP BY
        |  endpoint
        |ORDER BY
        |  total DESC
        |LIMIT
        |  10
      """.stripMargin).map(row => (row.getString(0), row.getLong(1))).collect()

    println(
      s"""Top Endpoints: ${topEndpoints.mkString("[", ", ", "]")}"""
    )

    // Uncache the table
    sqlContext.uncacheTable("access_log")

    /**
     * Keep the application alive so the web UI can be inspected for debugging and monitoring.
     */
    Thread.sleep(10000000)

    // Stop the SparkContext
    sc.stop()
  }
}
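The registration of "sal_avg" above refers to an AvgSalUDAF object that is not shown in this post. Below is a minimal sketch of what such a UDAF could look like with the UserDefinedAggregateFunction API available since Spark 1.5; the column names, buffer layout, and null handling are my assumptions, not the original implementation.

package com.ibeifeng.bigdata.spark.sql

import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

// Hypothetical stand-in for the AvgSalUDAF referenced above: averages a Double column
// by keeping a running sum and a row count in the aggregation buffer.
object AvgSalUDAF extends UserDefinedAggregateFunction {

  // Input: a single Double column
  override def inputSchema: StructType = StructType(StructField("sal", DoubleType) :: Nil)

  // Buffer: running sum and row count
  override def bufferSchema: StructType = StructType(
    StructField("sum", DoubleType) :: StructField("count", LongType) :: Nil)

  // Result type of the aggregation
  override def dataType: DataType = DoubleType

  override def deterministic: Boolean = true

  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = 0.0
    buffer(1) = 0L
  }

  // Fold one input row into the buffer
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    if (!input.isNullAt(0)) {
      buffer(0) = buffer.getDouble(0) + input.getDouble(0)
      buffer(1) = buffer.getLong(1) + 1L
    }
  }

  // Merge two partial buffers (e.g. from different partitions)
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    buffer1(0) = buffer1.getDouble(0) + buffer2.getDouble(0)
    buffer1(1) = buffer1.getLong(1) + buffer2.getLong(1)
  }

  // Final result: sum / count
  override def evaluate(buffer: Row): Double =
    if (buffer.getLong(1) == 0L) 0.0 else buffer.getDouble(0) / buffer.getLong(1)
}

With a UDAF like this registered, a query such as SELECT sal_avg(contentSize) FROM access_log could compute the average content size the same way the built-in AVG does.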
2. ApacheAccessLog
package com.ibeifeng.bigdata.spark.core

import scala.util.matching.Regex
import scala.util.matching.Regex.Match

/**
 * Sample data:
 * 1.1.1.1 - - [21/Jul/2014:10:00:00 -0800] "GET /chapter1/java/src/main/java/com/databricks/apps/logs/LogAnalyzer.java HTTP/1.1" 200 1234
 */
case class ApacheAccessLog(
  ipAddress: String,
  clientIdented: String,
  userId: String,
  dateTime: String,
  method: String,
  endpoint: String,
  protocol: String,
  responseCode: Int,
  contentSize: Long
)

object ApacheAccessLog {

  // regex describing one access-log line
  val PATTERN: Regex = """^(\S+) (-|\S+) (-|\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\d+)""".r

  /**
   * Checks whether a log line matches the expected format, so invalid lines can be filtered out.
   * @param log
   * @return
   */
  def isValidateLogLine(log: String): Boolean = {
    // try to match the line against the regex
    val res: Option[Match] = PATTERN.findFirstMatchIn(log)
    // the line is valid if the regex matched
    res.isDefined
  }

  /**
   * Parses one line of the log file into the corresponding case class.
   *
   * @param log
   * @return
   */
  def parseLogLine(log: String): ApacheAccessLog = {
    // match the line against the regex
    val res: Option[Match] = PATTERN.findFirstMatchIn(log)
    // fail fast on lines that do not match
    if (res.isEmpty) {
      throw new RuntimeException("Cannot parse log line: " + log)
    }
    // get the match
    val m: Match = res.get
    // build the case class from the capture groups
    ApacheAccessLog(
      m.group(1),
      m.group(2),
      m.group(3),
      m.group(4),
      m.group(5),
      m.group(6),
      m.group(7),
      m.group(8).toInt,
      m.group(9).toLong
    )
  }
}
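As a quick sanity check, the two helpers can be exercised against the sample line quoted in the header comment. This is a small standalone sketch (the object name ApacheAccessLogCheck is mine, not part of the original code):

package com.ibeifeng.bigdata.spark.core

// Standalone check of the regex and parser against the sample line shown above.
object ApacheAccessLogCheck {
  def main(args: Array[String]): Unit = {
    val line = "1.1.1.1 - - [21/Jul/2014:10:00:00 -0800] " +
      "\"GET /chapter1/java/src/main/java/com/databricks/apps/logs/LogAnalyzer.java HTTP/1.1\" 200 1234"

    // expected to print true, since the line matches the regex
    println(ApacheAccessLog.isValidateLogLine(line))

    // expected to print the parsed case class, with responseCode = 200 and contentSize = 1234
    println(ApacheAccessLog.parseLogLine(line))
  }
}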