Spark項目練習(根據規則庫統計地區用戶訪問量,寫入數據庫)

項目說明:附件爲待計算數據的demo(原文此處爲數據下載鏈接)。

分析用戶訪問數據,將訪問IP計算分析,根據規則庫,統計出各省份的訪問量。

其中,用到知識點:1、常用算法二分法。2、IP地址轉換成10進制數的簡單算法。3、spark對Mysql數據庫的操作。

具體程序如下:

package cn.allengao.Location

import java.sql.{Connection, Date, DriverManager, PreparedStatement}
import org.apache.spark.{SparkConf, SparkContext}

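/**
 * class_name: IPLocation
 * package: cn.allengao.Location
 * describe: Analyses user access data: converts each visiting IP to a number, matches it
 *           against the IP rule table, and counts the visits per province.
 * create_user: Allen Gao
 * create_date: 2018/2/1
 * create_time: 9:06
 **/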
object IPLocation {

  /**
   * Writes one partition of `(province, count)` pairs into the MySQL table `location_info`.
   *
   * One connection and one prepared statement are opened per call and reused for every row.
   * Failures are reported on stdout and do not propagate (best-effort write, as before),
   * but the exception itself is now included in the message so the cause is not lost.
   */
  val data2MySQL: Iterator[(String, Int)] => Unit = (iterator: Iterator[(String, Int)]) => {
    val sql = "INSERT INTO location_info (location, counts, access_date) VALUES (?, ?, ?)"
    var conn: Connection = null
    var ps: PreparedStatement = null
    try {
      conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata", "root", "toor")
      // Prepare once and re-bind per row; preparing inside the loop leaked every
      // statement except the last one.
      ps = conn.prepareStatement(sql)
      iterator.foreach { case (location, count) =>
        ps.setString(1, location)
        ps.setInt(2, count)
        ps.setDate(3, new Date(System.currentTimeMillis()))
        ps.executeUpdate()
      }
    } catch {
      case e: Exception => println(s"Mysql Exception: $e")
    } finally {
      // Nested so that a failing ps.close() cannot prevent the connection from closing.
      try {
        if (ps != null) ps.close()
      } finally {
        if (conn != null) conn.close()
      }
    }
  }

  /**
   * Converts a dotted IP string into a Long so that addresses can be compared numerically.
   *
   * Each dot-separated fragment is parsed as a number and shifted in, 8 bits at a time,
   * from left to right (e.g. "1.0.1.0" becomes 16777472).
   *
   * @param ip dotted address string
   * @return the numeric value of the address
   * @throws NumberFormatException if a fragment is not a valid number
   */
  def ip2Long(ip: String): Long =
    ip.split("[.]").foldLeft(0L)((acc, fragment) => (acc << 8L) | fragment.toLong)

  /**
   * Binary search for the rule whose inclusive range `[start, end]` contains `ip`.
   *
   * The result is only meaningful when `lines` is sorted ascending by range start and the
   * ranges do not overlap.
   *
   * @param lines rules as `(start, end, province)`; start and end are decimal strings
   * @param ip    numeric IP to look up
   * @return index of the matching rule, or -1 when no range contains `ip`
   */
  def binarySearch(lines: Array[(String, String, String)], ip: Long): Int = {
    @scala.annotation.tailrec
    def search(low: Int, high: Int): Int =
      if (low > high) -1
      else {
        // low + (high - low) / 2 cannot overflow, unlike (low + high) / 2.
        val middle = low + (high - low) / 2
        val rule = lines(middle)
        if (ip < rule._1.toLong) search(low, middle - 1)
        else if (ip <= rule._2.toLong) middle
        else search(middle + 1, high)
      }

    search(0, lines.length - 1)
  }

  /** Returns the province of the rule containing `ip`, or None when no rule matches. */
  private def lookupProvince(rules: Array[(String, String, String)], ip: Long): Option[String] = {
    val index = binarySearch(rules, ip)
    if (index >= 0) Some(rules(index)._3) else None
  }

  /**
   * Job entry point: loads the IP rule table, broadcasts it, maps every access-log line to
   * a province, sums the visits per province and writes the totals to MySQL.
   *
   * Access-log lines without an IP field, and IPs that fall outside every rule range, are
   * skipped; previously a single unmatched IP failed the job with an
   * ArrayIndexOutOfBoundsException.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("IpLocation").setMaster("local[2]")
    val sc = new SparkContext(conf)
    try {
      // Rule line format:
      // 1.0.1.0|1.0.3.255|16777472|16778239|亞洲|中國|福建|福州||電信|350100|China|CN|119.306239|26.075302
      val ipRulesRdd = sc.textFile("j://information//ip.txt").map { line =>
        val fields = line.split("\\|")
        val startIP = fields(2)  // numeric range start
        val endIP = fields(3)    // numeric range end
        val province = fields(6) // province name
        (startIP, endIP, province)
      }
      // Bring the complete rule table to the driver ...
      val ipRulesArray = ipRulesRdd.collect()
      // ... and broadcast it so that tasks can read it from the executors.
      val ipRulesBroadcast = sc.broadcast(ipRulesArray)

      // Load the click-stream log and tag every usable line with its province.
      val ipsRDD = sc.textFile("j://information//access_log").flatMap { line =>
        val fields = line.split("\\|")
        val province =
          if (fields.length > 1) lookupProvince(ipRulesBroadcast.value, ip2Long(fields(1)))
          else None
        province.map(name => (name, 1)).toList
      }

      // Total visits per province.
      val res = ipsRDD.reduceByKey(_ + _)
      // Write each partition's totals to MySQL.
      res.foreachPartition(data2MySQL(_))

      //println(res.collect().toBuffer)
    } finally {
      // Stop the context even when the job fails.
      sc.stop()
    }
  }
}

打開Mysql數據庫,新建數據庫bigdata,新建表location_info,新建四個字段,id(非空,自增),location(varchar),counts(int),access_date(datetime)。運行程序,刷新數據庫,可見如下結果:


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章