算法小白的第一次嘗試----出行模式分析（矩陣聚類，數據實戰）


主要功能：根據交通出行的刷卡記錄，獲取用戶每天的所有出行 OD（起點—終點）對，以天爲單位構建矩陣，再對這些矩陣進行聚類。

主要採用 k-means 進行聚類，以輪盤法（k-means++ 思想）初始化簇中心，並採用 SSE（拐點法）評價聚類效果。

kmeans++ 與kmeans參考該篇博客：

https://www.cnblogs.com/wang2825/articles/8696830.html

用 SSE 選擇最佳 K：對每個 K 計算 SSE，取 SSE–K 曲線上距離首尾兩點連線最遠的點（拐點）作爲最佳 K。

import breeze.linalg.{Axis, DenseMatrix, sum}
import breeze.numerics._
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

import scala.collection.mutable.{ArrayBuffer, ListBuffer}

/**
  * Created by GangTian on 2020/4/30 in SIBAT
  */
object Kmeans {
  // Weight of the time columns (departure/arrival time) in the matrix distance.
  val time_coefficient = 0.5 // time coefficient (vector distance computation)
  // Weight of the geographic columns (departure/arrival lon-lat) in the matrix distance.
  val geo_coefficient = 0.5 // geographic coefficient (vector distance computation)
  // Intended cap on the number of clusters: max clusters = k_factor * daysOfTrip (total travel days).
  // NOTE(review): not referenced anywhere in the visible code; main hard-codes its own upper bound.
  val k_factor = 0.5 // maximum-cluster coefficient
  // Upper bound on the number of assignment/update rounds performed by run.
  val iterator = 10 // number of k-means iterations
  // Intended tolerance on how much the cluster centers may move.
  // NOTE(review): not referenced anywhere in the visible code; run compares centers for exact equality.
  val mse = 0.01 // magnitude of cluster-center change

  /**
    * Demo entry point: builds a hand-made trip data set for one card, clusters its days for
    * k = 1..kmax, picks the best k from the SSE curve and prints each date with its cluster label.
    *
    * The SparkSession is only used to display the input as a DataFrame; it is stopped in a
    * `finally` block so the local Spark context is released even if clustering fails.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)
    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    try {
      val sc = spark.sparkContext
      import spark.implicits._

      // 1. Data set preparation: synthetic records whose features are NOT normalized
      //    (the original author notes that the real data was normalized).
      //    Tuple layout: card_id, date, departLocation, departLon, departLat,
      //                  arrivalLocation, arrivalLon, arrivalLat, departTime, arrivalTime
      val list = new ListBuffer[(String, String, String, Double, Double, String, Double, Double, Double, Double)]()
      list.append(("001", "2020-05-01", "A", 1, 1, "B", 1.1, 1.1, 7.5, 8.0))
      list.append(("001", "2020-05-01", "B", 1.1, 1.1, "A", 1, 1, 18.5, 19.0))
      list.append(("001", "2020-05-02", "C", 7, 7, "D", 8, 8, 9.5, 10.5))
      list.append(("001", "2020-05-02", "D", 8, 8, "C", 7, 7, 20.5, 21.5))
      list.append(("001", "2020-05-03", "A", 1, 1, "B", 1.1, 1.1, 7.6, 8.1))
      list.append(("001", "2020-05-03", "A", 1, 1, "B", 1.1, 1.1, 7.6, 8.1))
      list.append(("001", "2020-05-03", "A", 1, 1, "B", 1.1, 1.1, 7.6, 8.1))
      list.append(("001", "2020-05-03", "A", 1, 1, "B", 1.1, 1.1, 7.6, 8.1))
      list.append(("001", "2020-05-03", "B", 1.1, 1.1, "A", 1, 1, 18.4, 19.2))
      list.append(("001", "2020-05-04", "C", 7, 7, "D", 8, 8, 9.6, 10.4))
      list.append(("001", "2020-05-04", "D", 8, 8, "C", 7, 7, 20.7, 21.6))
      list.append(("001", "2020-05-05", "A", 1, 1, "B", 1.1, 1.1, 7.7, 8.3))
      list.append(("001", "2020-05-05", "B", 1.1, 1.1, "A", 1, 1, 18.3, 19.0))
      list.append(("001", "2020-05-06", "C", 7, 7, "D", 8, 8, 9.2, 10.2))
      list.append(("001", "2020-05-06", "C", 7, 7, "D", 8, 8, 9.2, 10.2))
      list.append(("001", "2020-05-06", "D", 8, 8, "C", 7, 7, 20.2, 21.3))
      list.append(("001", "2020-05-07", "E", 5, 5, "F", 15, 15, 6.1, 8.1))
      list.append(("001", "2020-05-07", "F", 15, 15, "G", 20, 20, 16.1, 18.1))
      list.append(("001", "2020-05-07", "G", 20, 20, "E", 5, 5, 20.1, 22.1))
      list.append(("001", "2020-05-08", "A", 1, 1, "D", 8, 8, 0.5, 10.5))
      list.append(("001", "2020-05-08", "D", 8, 8, "F", 15, 15, 10.5, 14.5))
      list.append(("001", "2020-05-08", "F", 15, 15, "A", 1, 1, 17.5, 21.5))
      list.append(("001", "2020-05-08", "F", 15, 15, "A", 1, 1, 17.5, 21.5))
      list.append(("001", "2020-05-08", "F", 15, 15, "A", 1, 1, 17.5, 21.5))
      list.append(("001", "2020-05-09", "E", 5, 5, "F", 15, 15, 6.3, 8.4))
      list.append(("001", "2020-05-09", "F", 15, 15, "G", 20, 20, 16.4, 18.2))
      list.append(("001", "2020-05-09", "G", 20, 20, "E", 5, 5, 20.2, 21.9))

      sc.parallelize(list).toDF("card_id", "date", "departLocation", "departLon", "departLat", "arrivalLocation", "arrivalLon", "arrivalLat", "departTime", "arrivalTime")
        .sort("date").show(100)

      // 2. Cluster once per candidate k. The demo data spans 9 distinct dates, so k stops at 8.
      val kmax = 8
      val res = new ListBuffer[(Int, Double, ListBuffer[(DenseMatrix[Double], DenseMatrix[Double])])]()
      for (i <- 1 to kmax) {
        res.append(run(list, i))
      }

      // 3. Elbow of the (k, SSE) curve. The helper returns a Double, so the value prints as e.g. "3.0".
      val best_k = calPointToLineDis(res.map(tp => (tp._1, tp._2)))
      println("best_k: " + best_k)

      // 4. Map the winning clustering back to dates. Cell (0,0) of every day matrix holds the
      //    date encoded as yyyyMMdd; days sharing a center receive the same 1-based label.
      val dateArr = new ArrayBuffer[(String, Int)]()
      val bestAssignments = res.find(_._1 == best_k).map(_._3)
        .getOrElse(new ListBuffer[(DenseMatrix[Double], DenseMatrix[Double])]())
      bestAssignments.groupBy(_._2).values.zipWithIndex.foreach { case (members, idx) =>
        for (r <- members) {
          val date_pre = r._1(0, 0).toLong.toString
          val date = date_pre.substring(0, 4) + "-" + date_pre.substring(4, 6) + "-" + date_pre.substring(6)
          dateArr.append((date, idx + 1))
        }
      }
      dateArr.foreach(println(_))
    } finally {
      // Release the local Spark context; the original never stopped it.
      spark.stop()
    }
  }

  /**
    * Locate the elbow of a (k, SSE) curve.
    *
    * The points are ordered by k, a straight line is drawn through the first and the last
    * point, and the k whose SSE lies farthest (vertically) from that line is returned.
    * Vertical and perpendicular distance differ only by a constant factor for a fixed line,
    * so both select the same k. When several k tie for the largest distance, the largest
    * such k wins.
    *
    * @param points (k, SSE) pairs, in any order; must not be empty
    * @return the selected k, widened to Double
    * @throws IllegalArgumentException if `points` is empty
    */
  def calPointToLineDis(points: ListBuffer[(Int, Double)]): Double = {
    require(points.nonEmpty, "calPointToLineDis requires at least one (k, SSE) point")
    val ordered = points.sortBy(_._1)
    val (startK, startSse) = ordered.head
    val (endK, endSse) = ordered.last

    if (startK == endK) {
      // Only one distinct k: there is no chord to measure against (the slope would be 0/0),
      // so that k is the only possible answer.
      endK.toDouble
    } else {
      // Coefficients of the chord y = slope * x + intercept through the two end points.
      val slope = (startSse - endSse) / (startK - endK)
      val intercept = startSse - slope * startK

      // Single pass keeping the farthest point; ">=" lets later points win ties.
      val (elbowK, _) = ordered.foldLeft((startK, -1.0)) { case (best, (k, sse)) =>
        val dis = math.abs(slope * k + intercept - sse)
        if (dis >= best._2) (k, dis) else best
      }
      elbowK.toDouble
    }
  }


  /**
    * Run k-means once for a given k.
    *
    * The trip records are converted to one matrix per day, k initial centers are drawn with
    * the k-means++ style seeding, and assignment/update rounds are repeated until every
    * refreshed center already exists among the current centers or `iterator` rounds have
    * been executed.
    *
    * @param list raw trip records
    * @param k    requested number of clusters
    * @return (k, SSE of the last assignment, the last assignment as (day matrix, center) pairs)
    */
  def run(list: ListBuffer[(String, String, String, Double, Double, String, Double, Double, Double, Double)],k:Int):(Int,Double, ListBuffer[(DenseMatrix[Double], DenseMatrix[Double])]) = {
    // 1. One matrix per day.
    val dayMatrices = list2Matrix(list)

    // 2. Seed the centers and keep them in a replaceable buffer.
    var centers = ListBuffer(initialize_cluster(dayMatrices, k): _*)

    // 3. Alternate between assigning days to their nearest center and refreshing the centers.
    var assignment = new ListBuffer[(DenseMatrix[Double], DenseMatrix[Double])]()
    var converged = false
    var round = 0
    while (round < iterator && !converged) {
      // Pair each day matrix with the center it is closest to.
      val assigned = dayMatrices.map(m => (m, calDisOfMatrix2(m, centers)))
      assignment = assigned

      // Recompute the centers from the members just assigned to them.
      val refreshed = reflash_center(assigned)

      // Converged when no refreshed center is new; otherwise adopt the refreshed set.
      if (refreshed.forall(c => centers.contains(c))) {
        converged = true
      } else {
        centers = refreshed
      }
      round += 1
    }

    // 4. SSE of the final assignment for this k.
    val sse = estimate(assignment)
    (k, sse, assignment)
  }


  /**
    * Compute the SSE (sum of squared errors) of a clustering.
    *
    * Each pair contributes the value calDisOfMatrix returns for the day matrix against its
    * own center, i.e. the squared distance between the two.
    *
    * @param clusterResult (day matrix, assigned center) pairs
    * @return the accumulated squared distance; 0.0 for an empty input
    */
  def estimate(clusterResult: ListBuffer[(DenseMatrix[Double], DenseMatrix[Double])]):Double =
    clusterResult.foldLeft(0.0) { case (acc, (dayMatrix, center)) =>
      acc + calDisOfMatrix(dayMatrix, Array(center))
    }


  /**
    * Refresh the cluster centers from the current assignment.
    *
    * Pairs are grouped by their assigned center and every group is replaced by the
    * element-wise mean of its member matrices. A center without any member does not appear
    * in the result.
    *
    * @param res (day matrix, assigned center) pairs; must not be empty because the matrix
    *            shape is read from the first pair
    * @return one mean matrix per distinct assigned center
    */
  def reflash_center(res: ListBuffer[(DenseMatrix[Double], DenseMatrix[Double])]): ListBuffer[DenseMatrix[Double]] = {
    // Every center has the same shape, so the first one serves as the template.
    val template = res.head._2
    val refreshed = new ListBuffer[DenseMatrix[Double]]()
    res.groupBy(_._2).foreach { case (_, assigned) =>
      val members = assigned.map(_._1)
      // Element-wise sum of the members, starting from an all-zero matrix.
      val total = members.foldLeft(DenseMatrix.zeros[Double](template.rows, template.cols)) {
        (acc, member) => acc + member
      }
      val mean: DenseMatrix[Double] = total / (members.length * 1.0)
      refreshed.append(mean)
    }
    refreshed
  }


  /**
    * Choose the initial cluster centers with k-means++ style roulette-wheel seeding.
    *
    * The first center is a uniformly random sample. Every further center is drawn from the
    * samples not yet chosen, with probability proportional to the value calDisOfMatrix
    * returns for that sample (its squared distance to the nearest center chosen so far).
    *
    * Edge cases handled explicitly:
    *  - no unchosen sample is left: seeding stops early, so fewer than k centers are returned;
    *  - all remaining samples have zero (or non-finite) total weight, e.g. days that differ
    *    only in the date column, which the distance ignores: the next center is drawn
    *    uniformly instead of dividing by zero;
    *  - the random threshold lands exactly on an interval boundary or past the last
    *    cumulative weight: a candidate is always selected.
    *
    * @param list input samples, one matrix per day; must not be empty
    * @param k    requested number of centers; at least one center is always returned
    * @return the chosen centers, at most `max(k, 1)` of them
    * @throws IllegalArgumentException if `list` is empty
    */
  def initialize_cluster(list: ListBuffer[DenseMatrix[Double]], k: Int): Array[DenseMatrix[Double]] = {
    require(list.nonEmpty, "initialize_cluster requires at least one sample")
    val clusters = new ArrayBuffer[DenseMatrix[Double]]()

    // First center: uniform random pick.
    clusters.append(list((math.random * list.length).toInt))

    // Remaining k - 1 centers: roulette wheel over the unchosen samples.
    var exhausted = false
    for (_ <- 1 until k if !exhausted) {
      // 1. Weight of every unchosen sample = squared distance to its nearest chosen center.
      val chosenSoFar = clusters.toArray
      val candidates = list
        .filterNot(m => clusters.contains(m))
        .map(m => (m, calDisOfMatrix(m, chosenSoFar)))

      if (candidates.isEmpty) {
        exhausted = true
      } else {
        val total = candidates.map(_._2).sum
        val picked =
          if (total > 0.0 && !total.isInfinity) {
            // 2. Cumulative weights define one half-open interval per candidate.
            val cumulative = candidates.scanLeft(0.0)((acc, c) => acc + c._2).tail
            // 3. Take the first interval containing the threshold; rounding can leave the
            //    last cumulative weight slightly below the threshold, so fall back to the
            //    last candidate.
            val threshold = math.random * total
            val idx = cumulative.indexWhere(upper => threshold < upper)
            if (idx >= 0) candidates(idx)._1 else candidates.last._1
          } else {
            // Degenerate weights (all zero, NaN or infinite): uniform pick.
            candidates((math.random * candidates.length).toInt)._1
          }
        clusters.append(picked)
      }
    }
    clusters.toArray
  }


  /**
    * Turn the trip records into one matrix per day, all with the same shape.
    *
    * For every origin-destination (OD) pair the highest number of occurrences within a
    * single day is determined; that many rows are reserved for the pair in every day matrix.
    * A day fills its rows with its own trips for the pair (ordered by departure time) and
    * pads the rest with zero rows. Each row has 7 columns:
    * date (yyyyMMdd as Double), departLon, departLat, arrivalLon, arrivalLat, departTime,
    * arrivalTime. Padding rows also carry the date in column 0.
    *
    * OD pairs are keyed by the (origin, destination) tuple rather than the concatenated
    * string, so pairs such as ("A", "BC") and ("AB", "C") are no longer merged. The row order
    * still follows the concatenated name, which keeps the layout of non-colliding inputs
    * unchanged.
    *
    * @param list raw trip records: card_id, date ("yyyy-MM-dd"), departLocation, departLon,
    *             departLat, arrivalLocation, arrivalLon, arrivalLat, departTime, arrivalTime
    * @return one matrix per distinct date, in the iteration order of the date grouping
    * @throws NumberFormatException if a date is not numeric once its dashes are removed
    */
  def list2Matrix(list: ListBuffer[(String, String, String, Double, Double, String, Double, Double, Double, Double)]): ListBuffer[DenseMatrix[Double]] = {
    val cols = 7 // columns per row, see the layout above

    // 1. All distinct OD pairs seen in the history, ordered by their concatenated name.
    val ods = list.map(tp => (tp._3, tp._6)).distinct.sortBy { case (origin, dest) => origin + dest }

    // 2. For every OD pair, the highest number of occurrences within a single day.
    val groups = list.groupBy(_._2).toArray
    val odNum = ods.map { od =>
      val maxPerDay = groups
        .map { case (_, trips) => trips.count(tp => (tp._3, tp._6) == od) }
        .foldLeft(0)((a, b) => math.max(a, b))
      (od, maxPerDay)
    }

    // 3. Every day matrix has one row per reserved slot.
    val rows = odNum.map(_._2).sum

    // 4. Build the matrix of each day.
    val matrix_list = new ListBuffer[DenseMatrix[Double]]()
    for ((day, trips) <- groups) {
      val date = day.replace("-", "").toDouble
      val record = trips.sortBy(_._9) // order by departure time
      val emptyRow = Seq(date, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)

      // Row-major values: real trips of the OD pair first, then zero rows up to its slot count.
      val values = odNum.flatMap { case (od, slots) =>
        val matched = record.filter(tp => (tp._3, tp._6) == od)
        val filled = matched.flatMap(tp => Seq(date, tp._4, tp._5, tp._7, tp._8, tp._9, tp._10))
        val padding = Seq.fill(slots - matched.length)(emptyRow).flatten
        filled ++ padding
      }

      // 5. DenseMatrix is column-major: load each record as a column, then transpose so that
      //    every record becomes a row.
      val matrix = new DenseMatrix[Double](cols, rows, values.toArray).t
      matrix_list.append(matrix)
    }
    matrix_list
  }


  /**
    * Weighted distance between a day matrix and one cluster center.
    *
    * Column 0 is ignored. Per row, the Euclidean norm of the difference over columns 1-2 and
    * over columns 3-4 is taken, plus the absolute difference over columns 5-6; the row values
    * are summed and combined as
    * `geo_coefficient * (cols 1-2 + cols 3-4) + time_coefficient * (cols 5-6)`.
    * With the layout produced by list2Matrix these are the departure position, the arrival
    * position and the departure/arrival times.
    *
    * @param matrix  day matrix
    * @param cluster cluster center with the same shape
    * @return the non-squared weighted distance
    */
  private def matrixDistance(matrix: DenseMatrix[Double], cluster: DenseMatrix[Double]): Double = {
    val m = matrix - cluster
    val dis_1 = sum(sqrt(sum(pow(m(::, 1 to (2)), 2.0), Axis._1)))
    val dis_2 = sum(sqrt(sum(pow(m(::, 3 to (4)), 2.0), Axis._1)))
    val dis_3 = sum(sum(abs(m(::, 5 to (6))), Axis._1))
    geo_coefficient * (dis_1 + dis_2) + time_coefficient * dis_3
  }

  /**
    * Squared distance from a matrix to its nearest cluster center.
    *
    * @param matrix   day matrix
    * @param clusters candidate centers
    * @return the square of the smallest weighted distance; for an empty `clusters` the
    *         search starts at Double.MaxValue and its square (positive infinity) is returned
    */
  def calDisOfMatrix(matrix: DenseMatrix[Double], clusters: Array[DenseMatrix[Double]]): Double = {
    var dis = Double.MaxValue // start from the largest value so any real distance replaces it
    for (cluster <- clusters) {
      val distance = matrixDistance(matrix, cluster)
      if (distance < dis) dis = distance
    }
    math.pow(dis, 2.0)
  }

  /**
    * Find the cluster center a matrix belongs to, i.e. the one at the smallest weighted
    * distance. The first center wins ties; it is also returned when no distance is
    * comparable (for example all NaN), so the result is never null.
    *
    * @param matrix   day matrix
    * @param clusters candidate centers; must not be empty
    * @return the nearest center (the same instance that is stored in `clusters`)
    * @throws IllegalArgumentException if `clusters` is empty
    */
  def calDisOfMatrix2(matrix: DenseMatrix[Double], clusters: ListBuffer[DenseMatrix[Double]]): DenseMatrix[Double] = {
    require(clusters.nonEmpty, "calDisOfMatrix2 requires at least one cluster center")
    var dis = Double.MaxValue // start from the largest value so any real distance replaces it
    var center: DenseMatrix[Double] = clusters.head
    for (cluster <- clusters) {
      val distance = matrixDistance(matrix, cluster)
      if (distance < dis) {
        dis = distance
        center = cluster
      }
    }
    center
  }
}

數據（真實，非上述）實戰結果：

算法小白的第一次嘗試----出行模式分析（矩陣聚類，數據實戰）

idea快速構建sbt項目，挑戰全網最全最細（親測，1分鐘內可成功構建sbt）

一文徹底搞懂spark的shuffle過程（shuffle write）

算法小白的第一次嘗試---判斷點是否在不規則區域範圍內（手撕）

算法小白的第一次嘗試---PCA（主成分分析）降維【適合各種緯度數據】

spark讀取csv中文亂碼

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結