算法小白的第一次嘗試----出行模式分析(矩陣聚類,數據實戰)


主要功能: 根據交通出行數據,通過刷卡記錄,獲取用戶的所有出行od,以天爲單位構建矩陣,對矩陣進行聚類

主要採用了kmeans進行聚類,輪盤法(kmeans++思想)進行簇初始化,採用SSE(拐點)進行聚類效果評價

kmeans++ 與kmeans參考該篇博客:

https://www.cnblogs.com/wang2825/articles/8696830.html

[圖片:在這裏插入圖片描述] [圖片:在這裏插入圖片描述] [圖片:在這裏插入圖片描述] [圖片:在這裏插入圖片描述]

SSE選擇最佳K:
在這裏插入圖片描述

import breeze.linalg.{Axis, DenseMatrix, sum}
import breeze.numerics._
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

import scala.collection.mutable.{ArrayBuffer, ListBuffer}

/**
  * Created by GangTian on 2020/4/30 in SIBAT
  */
object Kmeans {
  val time_coefficient = 0.5 //時間係數(向量距離計算)
  val geo_coefficient = 0.5 // 地理位置係數(向量距離計算)
  val k_factor = 0.5 //最大聚類係數,最大簇數 = k_factor * daysOfTrip(出行總天數)
  val iterator = 10 //k-means迭代次數
  val mse = 0.01 //簇中心變化幅度

  /**
    * Demo entry point.
    *
    * Builds a small hand-made trip data set, prints it through Spark, clusters the daily
    * trip matrices for every k in 1..kmax, selects the k at the elbow of the SSE curve and
    * prints one (date, cluster label) pair per day.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)
    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    try {
      val sc = spark.sparkContext
      import spark.implicits._

      // 1. Data set: synthetic records, features are NOT standardised here
      //    (the author notes that the real data was pre-processed).
      //    Tuple layout: card_id, date, departLocation, departLon, departLat,
      //                  arrivalLocation, arrivalLon, arrivalLat, departTime, arrivalTime
      val list = new ListBuffer[(String, String, String, Double, Double, String, Double, Double, Double, Double)]()
      list.append(("001", "2020-05-01", "A", 1, 1, "B", 1.1, 1.1, 7.5, 8.0))
      list.append(("001", "2020-05-01", "B", 1.1, 1.1, "A", 1, 1, 18.5, 19.0))
      list.append(("001", "2020-05-02", "C", 7, 7, "D", 8, 8, 9.5, 10.5))
      list.append(("001", "2020-05-02", "D", 8, 8, "C", 7, 7, 20.5, 21.5))
      list.append(("001", "2020-05-03", "A", 1, 1, "B", 1.1, 1.1, 7.6, 8.1))
      list.append(("001", "2020-05-03", "A", 1, 1, "B", 1.1, 1.1, 7.6, 8.1))
      list.append(("001", "2020-05-03", "A", 1, 1, "B", 1.1, 1.1, 7.6, 8.1))
      list.append(("001", "2020-05-03", "A", 1, 1, "B", 1.1, 1.1, 7.6, 8.1))
      list.append(("001", "2020-05-03", "B", 1.1, 1.1, "A", 1, 1, 18.4, 19.2))
      list.append(("001", "2020-05-04", "C", 7, 7, "D", 8, 8, 9.6, 10.4))
      list.append(("001", "2020-05-04", "D", 8, 8, "C", 7, 7, 20.7, 21.6))
      list.append(("001", "2020-05-05", "A", 1, 1, "B", 1.1, 1.1, 7.7, 8.3))
      list.append(("001", "2020-05-05", "B", 1.1, 1.1, "A", 1, 1, 18.3, 19.0))
      list.append(("001", "2020-05-06", "C", 7, 7, "D", 8, 8, 9.2, 10.2))
      list.append(("001", "2020-05-06", "C", 7, 7, "D", 8, 8, 9.2, 10.2))
      list.append(("001", "2020-05-06", "D", 8, 8, "C", 7, 7, 20.2, 21.3))
      list.append(("001", "2020-05-07", "E", 5, 5, "F", 15, 15, 6.1, 8.1))
      list.append(("001", "2020-05-07", "F", 15, 15, "G", 20, 20, 16.1, 18.1))
      list.append(("001", "2020-05-07", "G", 20, 20, "E", 5, 5, 20.1, 22.1))
      list.append(("001", "2020-05-08", "A", 1, 1, "D", 8, 8, 0.5, 10.5))
      list.append(("001", "2020-05-08", "D", 8, 8, "F", 15, 15, 10.5, 14.5))
      list.append(("001", "2020-05-08", "F", 15, 15, "A", 1, 1, 17.5, 21.5))
      list.append(("001", "2020-05-08", "F", 15, 15, "A", 1, 1, 17.5, 21.5))
      list.append(("001", "2020-05-08", "F", 15, 15, "A", 1, 1, 17.5, 21.5))
      list.append(("001", "2020-05-09", "E", 5, 5, "F", 15, 15, 6.3, 8.4))
      list.append(("001", "2020-05-09", "F", 15, 15, "G", 20, 20, 16.4, 18.2))
      list.append(("001", "2020-05-09", "G", 20, 20, "E", 5, 5, 20.2, 21.9))

      sc.parallelize(list).toDF("card_id", "date", "departLocation", "departLon", "departLat", "arrivalLocation", "arrivalLon", "arrivalLat", "departTime", "arrivalTime")
        .sort("date").show(100)

      // 2. Run k-means once for every candidate k and keep (k, SSE, assignments).
      val kmax = 8 // largest number of clusters tried
      val res = new ListBuffer[((Int, Double, ListBuffer[(DenseMatrix[Double], DenseMatrix[Double])]))]()
      for (i <- 1 to kmax) {
        res.append(run(list, i))
      }

      // 3. Pick k at the elbow of the (k, SSE) curve.
      val best_k = calPointToLineDis(res.map(tp => (tp._1, tp._2)))
      println("best_k: " + best_k)

      // 4. Map every day back to a readable date and a 1-based cluster label.
      //    Days sharing the same center matrix belong to the same cluster.
      val dateArr = new ArrayBuffer[(String, Int)]()
      res.filter(_._1 == best_k).head._3.groupBy(_._2).toSeq.zipWithIndex
        .foreach { case ((_, members), idx) =>
          for (r <- members) {
            // Column 0 of every row holds the date encoded as yyyyMMdd.
            val date_pre = r._1(0, 0).toLong.toString
            val date = date_pre.substring(0, 4) + "-" + date_pre.substring(4, 6) + "-" + date_pre.substring(6)
            dateArr.append((date, idx + 1))
          }
        }
      dateArr.foreach(println(_))
    } finally {
      // Always release the local Spark context, even when clustering fails.
      spark.stop()
    }
  }

  /**
    * calculate the distance of one point to the line
    * @return
    */
  def calPointToLineDis(points:ListBuffer[(Int,Double)]):Double={
    // Work on the points ordered by k so the chord runs from the smallest to the largest k.
    val ordered = points.sortBy(_._1)
    val (firstK, firstSse) = ordered.head
    val (lastK, lastSse) = ordered.last

    // Chord through the two end points: sse = slope * k + intercept.
    val slope = (firstSse - lastSse) / (firstK - lastK)
    val intercept = firstSse - slope * firstK

    // The elbow is the point with the largest vertical offset from the chord.
    // `>= 0` keeps the later point on ties, and Double.compare ranks NaN highest,
    // which is the same outcome as a stable ascending sort followed by `.last`.
    val offsets = ordered.map { case (kValue, sse) => (kValue, math.abs(slope * kValue + intercept - sse)) }
    val (elbowK, _) = offsets.reduceLeft { (best, candidate) =>
      if (java.lang.Double.compare(candidate._2, best._2) >= 0) candidate else best
    }
    elbowK
  }


  /**
    * the entry of kmeans program, the method return the SSE and matrix and cluster within the k clusters,the matrix and cluster are in the shape of tuple
    * @param list
    * @param k
    * @return
    */
  def run(list: ListBuffer[(String, String, String, Double, Double, String, Double, Double, Double, Double)],k:Int):(Int,Double, ListBuffer[(DenseMatrix[Double], DenseMatrix[Double])]) = {
    // 1. One matrix per day of trips.
    val samples = list2Matrix(list)

    // 2. Seed the centers (k-means++ style) and copy them into a replaceable buffer.
    var centers = ListBuffer(initialize_cluster(samples, k): _*)

    // 3. Alternate assignment and center refresh until the centers stop changing
    //    or the iteration budget is used up.
    var assignments = new ListBuffer[(DenseMatrix[Double], DenseMatrix[Double])]()
    var converged = false
    var round = 0
    while (round < iterator && !converged) {
      // Pair every sample with the center it is closest to.
      assignments = samples.map(sample => (sample, calDisOfMatrix2(sample, centers)))

      // Recompute the centers from the current assignment.
      val refreshed = reflash_center(assignments)

      // Any refreshed center that is not among the current ones means the centers moved.
      if (refreshed.exists(center => !centers.contains(center))) centers = refreshed
      else converged = true

      round += 1
    }

    // 4. SSE of the last assignment for this k.
    (k, estimate(assignments), assignments)
  }


  /**
    * calculate the SSE(sum of the squared error)
    * @param clusterResult
    * @return
    */
  def estimate(clusterResult: ListBuffer[(DenseMatrix[Double], DenseMatrix[Double])]):Double =
    // Add up, in list order, the squared distance of every sample to its own center.
    clusterResult.foldLeft(0.0) { case (total, (sample, center)) =>
      total + calDisOfMatrix(sample, Array(center))
    }


  /** reflash the center
    *
    * @param res res store the matrix and it corresponding center in tuple form
    */
  def reflash_center(res: ListBuffer[(DenseMatrix[Double], DenseMatrix[Double])]): ListBuffer[DenseMatrix[Double]] = {
    // All centers share one shape; read it from the first assignment.
    val template = res.head._2
    val nRows = template.rows
    val nCols = template.cols

    val refreshed = new ListBuffer[DenseMatrix[Double]]()
    // Samples assigned to the same center form one cluster.
    res.groupBy(_._2).foreach { case (_, assigned) =>
      // Element-wise sum of the cluster's samples, accumulated in place.
      val total = DenseMatrix.zeros[Double](nRows, nCols)
      assigned.foreach { case (sample, _) => total += sample }

      // New center = element-wise mean of the cluster's samples.
      val mean: DenseMatrix[Double] = total / (assigned.length * 1.0)
      refreshed.append(mean)
    }
    refreshed
  }


  /**
    * based on the kmeans++ algorithm to select the k clusters
    *
    * @param list input data with shape of denseMatrix
    * @param k    represent the initialize numbers of clusters
    * @return
    */
  /**
    * Selects the initial cluster centers following the k-means++ idea: the first center is
    * drawn uniformly from the samples, every further center is drawn with probability
    * proportional to the value returned by `calDisOfMatrix` (squared distance to the
    * nearest already chosen center).
    *
    * Fewer than k centers are returned when the samples run out, i.e. when every sample is
    * already equal to a chosen center. For k <= 1 exactly one center is returned.
    *
    * @param list input samples, one matrix per day
    * @param k    requested number of centers
    * @return the chosen centers, at most k of them
    * @throws IllegalArgumentException if `list` is empty
    */
  def initialize_cluster(list: ListBuffer[DenseMatrix[Double]], k: Int): Array[DenseMatrix[Double]] = {
    require(list.nonEmpty, "initialize_cluster needs at least one sample matrix")

    val clusters = new ArrayBuffer[DenseMatrix[Double]]()
    // First center: uniform random sample.
    clusters.append(list((math.random * list.length).toInt))

    // Remaining k - 1 centers: roulette-wheel selection.
    var candidatesLeft = true
    for (_ <- 1 until k if candidatesLeft) {
      val chosen = clusters.toArray // hoisted: identical for every candidate of this round
      val candidates = list.filterNot(sample => clusters.contains(sample))
      if (candidates.isEmpty) {
        // Nothing left to pick from; stop instead of failing on an empty wheel.
        candidatesLeft = false
      } else {
        val weights = candidates.map(sample => calDisOfMatrix(sample, chosen))
        clusters.append(candidates(rouletteIndex(weights, math.random)))
      }
    }
    clusters.toArray
  }

  /**
    * Maps a uniform draw in [0, 1) to an index, each index being hit with probability
    * proportional to its weight. Falls back to a uniform choice when the weights do not
    * add up to a positive number (all zero, or NaN), so a pick is always made.
    *
    * @param weights non-empty, non-negative weights
    * @param draw    uniform random number in [0, 1)
    * @return an index into `weights`
    */
  private def rouletteIndex(weights: scala.collection.Seq[Double], draw: Double): Int = {
    val total = weights.sum
    if (!(total > 0.0)) {
      // Degenerate wheel: every candidate is equally (un)likely.
      math.min((draw * weights.length).toInt, weights.length - 1)
    } else {
      val target = draw * total
      // cumulative(i) = weights(0) + ... + weights(i)
      val cumulative = weights.scanLeft(0.0)(_ + _).tail
      // Strict comparison so that zero-weight candidates are never selected.
      val hit = cumulative.indexWhere(target < _)
      // Rounding can push the target onto the total; take the last selectable candidate then.
      if (hit >= 0) hit else weights.lastIndexWhere(_ > 0.0)
    }
  }


  /**
    * calculate the number of departure and destination pairs,and expandding the one day data to a matrix,
    * so every day has the same matrix shape including rows and cols
    *
    * @param list
    * @return
    */
  /**
    * Turns the trip records into one matrix per day, all with the same shape.
    *
    * Every origin-destination (OD) pair gets as many rows as its highest number of
    * occurrences on any single day; days with fewer trips for that pair are padded with
    * zero rows. Each row has 7 columns:
    * date (yyyyMMdd as a number), departLon, departLat, arrivalLon, arrivalLat,
    * departTime, arrivalTime.
    *
    * OD pairs are keyed by the (departure, arrival) tuple rather than by the concatenated
    * names, so that e.g. "A" -> "BC" and "AB" -> "C" stay distinct.
    *
    * @param list trip records: (card_id, date, departLocation, departLon, departLat,
    *             arrivalLocation, arrivalLon, arrivalLat, departTime, arrivalTime)
    * @return one matrix per distinct date, in the iteration order of the date grouping
    */
  def list2Matrix(list: ListBuffer[(String, String, String, Double, Double, String, Double, Double, Double, Double)]): ListBuffer[DenseMatrix[Double]] = {
    val cols = 7 // number of matrix columns

    // Records grouped by day.
    val days = list.groupBy(_._2).toArray

    // 1./2. Every OD pair together with its highest per-day frequency.
    val odNum = list.map(tp => (tp._3, tp._6)).distinct.sorted.map { od =>
      val maxPerDay = days.map { case (_, trips) => trips.count(tp => (tp._3, tp._6) == od) }.max
      (od, maxPerDay)
    }

    // 3. Number of matrix rows: one slot per OD occurrence.
    val rows = odNum.map(_._2).sum

    // 4. Build the matrix of each day.
    val matrix_list = new ListBuffer[DenseMatrix[Double]]()
    for ((day, records) <- days) {
      val date = day.replace("-", "").toDouble
      val trips = records.sortBy(_._9) // ordered by departure time

      // Row values, OD pair by OD pair: observed trips first, then zero padding.
      val values = odNum.flatMap { case (od, slots) =>
        val observed = trips.filter(tp => (tp._3, tp._6) == od)
          .map(tp => Seq(date, tp._4, tp._5, tp._7, tp._8, tp._9, tp._10))
        val padding = Seq.fill(slots - observed.length)(Seq(date, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0))
        (observed ++ padding).flatten
      }

      // 5. The values are laid out row by row, so fill a cols x rows matrix
      //    (column-major) and transpose it to get rows x cols.
      val matrix = new DenseMatrix[Double](cols, rows, values.toArray).t
      matrix_list.append(matrix)
    }
    matrix_list
  }


  /**
    * calculate the distance between the input matrix of the clusters,and return the least distance
    *
    * @param matrix
    * @param clusters
    */
  def calDisOfMatrix(matrix: DenseMatrix[Double], clusters: Array[DenseMatrix[Double]]): Double = {
    // Smallest weighted distance to any center; stays at Double.MaxValue when none is smaller.
    val nearest = clusters.foldLeft(Double.MaxValue) { (best, center) =>
      val diff = matrix - center
      // Columns 1-2 and 3-4: per-row Euclidean norm, summed over the rows.
      val departGap = sum(sqrt(sum(pow(diff(::, 1 to 2), 2.0), Axis._1)))
      val arrivalGap = sum(sqrt(sum(pow(diff(::, 3 to 4), 2.0), Axis._1)))
      // Columns 5-6: absolute differences, summed over the rows.
      val timeGap = sum(sum(abs(diff(::, 5 to 6)), Axis._1))
      val weighted = geo_coefficient * (departGap + arrivalGap) + time_coefficient * timeGap
      if (weighted < best) weighted else best
    }
    // The squared value of the smallest distance is what callers receive.
    math.pow(nearest, 2.0)
  }

  /**
    * calculate the least distance between the matrix and the clusters,and return the cluster center which the matrix belongs
    *
    * @param matrix
    * @param clusters
    */
  def calDisOfMatrix2(matrix: DenseMatrix[Double], clusters: ListBuffer[DenseMatrix[Double]]): DenseMatrix[Double] = {
    // Carry (smallest distance so far, its center); the center stays null until
    // some distance falls below Double.MaxValue.
    val start: (Double, DenseMatrix[Double]) = (Double.MaxValue, null)
    val (_, closest) = clusters.foldLeft(start) { case ((best, bestCenter), candidate) =>
      val diff = matrix - candidate
      // Columns 1-2 and 3-4: per-row Euclidean norm, summed over the rows.
      val departGap = sum(sqrt(sum(pow(diff(::, 1 to 2), 2.0), Axis._1)))
      val arrivalGap = sum(sqrt(sum(pow(diff(::, 3 to 4), 2.0), Axis._1)))
      // Columns 5-6: absolute differences, summed over the rows.
      val timeGap = sum(sum(abs(diff(::, 5 to 6)), Axis._1))
      val weighted = geo_coefficient * (departGap + arrivalGap) + time_coefficient * timeGap
      if (weighted < best) (weighted, candidate) else (best, bestCenter)
    }
    closest
  }
}

數據(真實,非上述)實戰結果:

在這裏插入圖片描述

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章