算法小白的第二次嘗試----NavieBayes


>樸素貝葉斯原理見該博客（強烈推薦）：
>https://mp.weixin.qq.com/s?src=11&timestamp=1584688625&ver=2227&signature=O754zhc6apcSqOgNLOcewFs6K3RMvj9Tuz1nB4I*-IfaZLh5wlbpKA8iJxFtQ*xLy3FoYyW*pB2t7puAhrS7WS8uZLuH2XBdcv8u1Cp2u-Elufc7IvQ67zGNA6uFwLGC&new=1

/**
  * Created by GangTian on 2020/3/22 in SIBAT
  * 針對bayes中預測速度太慢的部分進行改進，個人認爲主要原因在於生成的模型過於簡單了。
  * NavieBayes2生成模型時，各特徵均加入了未知數X，這樣可以極大地提升預測時的速度，
  * 但當各features維度太高時（幾十維或上百維時），生成模型所需時間很長。
  * 不過針對features維度較低時可用。當然，Spark ml可直接調用Bayes模型
  */

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions.udf
import scala.collection.mutable.ArrayBuffer

/**
  * Naive Bayes classifier for categorical features, built on Spark DataFrames.
  *
  * `train` produces a model table with one row per (feature, feature value, label)
  * combination, plus one placeholder row per (feature, label) for values that were
  * never seen during training. `predict` looks those rows up and picks, for every
  * input record, the label with the highest posterior score.
  *
  * All feature values and labels are compared as strings (every column is cast to
  * String first); a null feature value is treated as the text "null".
  *
  * Known limitation: the placeholder for unseen values is the literal text "X". If a
  * feature really contains the value "X", the model holds both the real row and the
  * placeholder row for it, and `predict` uses both.
  */
object NavieBayes {

  /** Name of the class column that `train` expects in its input. */
  private val LabelColumn = "label"

  /** Text stored in the model's `optional` column for feature values never seen in training. */
  private val UnseenValue = "X"

  /** Laplace smoothing constant; keeps every probability strictly positive. */
  private val Lambda = 1.0

  /** One model row: (optional, label, num, featureName, historyFeaturesNum, count, condition_pro, pri_pro). */
  private type ModelRow = (String, String, Long, String, Long, Long, Double, Double)

  /** Maps a null cell to the text "null" so that null values can be used as lookup keys. */
  private def asText(value: String): String = Option(value).getOrElse("null")

  /**
    * Trains the model: computes the smoothed prior of every label and the smoothed
    * conditional probability of every feature value given every label.
    *
    * The aggregated counts are collected to the driver (one Spark job for the labels
    * and one per feature) and the model table is assembled there, so the number of
    * distinct (value, label) pairs per feature must fit in driver memory.
    *
    * @param trainData columns f1, f2, ..., fn plus a column named "label"; column types
    *                  and feature column names are arbitrary
    * @return the model, with columns `optional`, `label`, `num`, `featureName`,
    *         `historyFeaturesNum`, `count`, `condition_pro`, `pri_pro` (in that order).
    *         It is empty when `trainData` has no rows or no feature columns.
    * @throws IllegalArgumentException if the label column contains null
    */
  def train(trainData: DataFrame): DataFrame = {
    import trainData.sparkSession.implicits._

    val columns = trainData.columns
    val transData = trainData.select(columns.map(name => col(name).cast("String")): _*)
    val featuresCol = columns.filter(_ != LabelColumn)

    // 1. Label frequencies and smoothed priors.
    val labelCounts: List[(String, Long)] = transData
      .groupBy(LabelColumn)
      .count()
      .collect()
      .toList
      .map(row => (row.getString(0), row.getLong(1)))
    require(
      !labelCounts.exists { case (label, _) => label == null },
      s"Column '$LabelColumn' must not contain null values"
    )

    val totalRecord = labelCounts.map(_._2).sum
    val labelNum = labelCounts.length
    val priors: Map[String, Double] = labelCounts.map { case (label, labelCount) =>
      label -> ((labelCount + Lambda) / (totalRecord + labelNum * Lambda))
    }.toMap

    // 2. Per feature: count every (value, label) pair, then emit one row for each
    //    value/label combination and one placeholder row per label.
    val modelRows: List[ModelRow] = featuresCol.toList.flatMap { featureName =>
      // Grouping on the two columns directly (instead of on a comma-joined key) keeps
      // values that contain commas, and empty values, intact.
      val grouped = transData
        .groupBy(col(featureName), col(LabelColumn))
        .count()
        .collect()

      // Number of distinct raw values of this feature (a null counts as one value).
      val historyFeaturesNum = grouped.map(_.getString(0)).distinct.length.toLong

      // Occurrences per (value, label); null and the text "null" are merged.
      val observed: Map[(String, String), Long] = grouped
        .map(row => ((asText(row.getString(0)), row.getString(1)), row.getLong(2)))
        .groupBy(_._1)
        .map { case (key, hits) => key -> hits.map(_._2).sum }

      val optionalValues = observed.keys.map(_._1).toList.distinct
      val k = optionalValues.length

      val knownRows: List[ModelRow] = for {
        value <- optionalValues
        (label, labelCount) <- labelCounts
      } yield {
        val (num, conditionPro) = observed.get((value, label)) match {
          case Some(hits) =>
            (hits, (hits + Lambda) / (labelCount + historyFeaturesNum * Lambda))
          case None =>
            // The value exists for this feature but never together with this label.
            (0L, Lambda / (k * Lambda + labelCount))
        }
        (value, label, num, featureName, historyFeaturesNum, labelCount, conditionPro, priors(label))
      }

      // Placeholder rows: the unseen value is counted as one additional category.
      val unseenRows: List[ModelRow] = labelCounts.map { case (label, labelCount) =>
        val conditionPro = Lambda / ((k + 1) * Lambda + labelCount)
        (UnseenValue, label, 0L, featureName, historyFeaturesNum, labelCount, conditionPro, priors(label))
      }

      knownRows ++ unseenRows
    }

    modelRows.toDF(
      "optional", "label", "num", "featureName", "historyFeaturesNum", "count", "condition_pro", "pri_pro"
    )
  }

  /**
    * Predicts a label for every row of `testData`.
    *
    * The model and the test rows are collected to the driver once and scored locally.
    * Scores are accumulated as sums of logarithms, which ranks labels exactly like the
    * product of probabilities but cannot underflow to zero.
    *
    * @param testData same layout as the training data, without the label column
    * @param model    the table returned by `train`
    * @return a single-column DataFrame named `prediction`, one row per input row, in
    *         the order in which the input rows were collected
    * @throws NoSuchElementException if the model yields no candidate label for a record
    *                                (for example, an empty model)
    */
  def predict(testData: DataFrame, model: DataFrame): DataFrame = {
    import model.sparkSession.implicits._

    val columns = testData.columns
    val transData = testData.select(columns.map(name => col(name).cast("String")): _*)

    // Read every cell by position; no join/split round trip, so commas and empty
    // values inside the data cannot shift or drop columns.
    val testRecord: List[Array[String]] = transData
      .collect()
      .toList
      .map(row => Array.tabulate(columns.length)(i => asText(row.getString(i))))

    val collectedModel = model
      .select("featureName", "optional", "label", "condition_pro", "pri_pro")
      .collect()
      .toList

    // (featureName, optional) -> all (label, condition_pro) rows stored for that value.
    val conditionPros: Map[(String, String), List[(String, Double)]] = collectedModel
      .map(row => ((row.getString(0), row.getString(1)), (row.getString(2), row.getDouble(3))))
      .groupBy(_._1)
      .map { case (key, hits) => key -> hits.map(_._2) }

    val priors: Map[String, Double] =
      collectedModel.map(row => row.getString(2) -> row.getDouble(4)).toMap

    val predictLabel: List[String] = testRecord.map { record =>
      // Conditional probabilities of every feature value, for every label. Values the
      // model does not know fall back to the placeholder rows; columns the model has
      // no rows for contribute nothing.
      val contributions: List[(String, Double)] = columns.indices.toList.flatMap { i =>
        val featureName = columns(i)
        conditionPros.getOrElse(
          (featureName, record(i)),
          conditionPros.getOrElse((featureName, UnseenValue), Nil)
        )
      }

      // log(prior) + sum of log(conditional) per label.
      val scores: List[(String, Double)] = contributions
        .groupBy(_._1)
        .toList
        .flatMap { case (label, hits) =>
          priors.get(label).map { prior =>
            label -> (math.log(prior) + hits.map(hit => math.log(hit._2)).sum)
          }
        }

      if (scores.isEmpty) {
        throw new NoSuchElementException(
          "The model contains no probabilities for any feature column of the input record"
        )
      }
      scores.maxBy(_._2)._1
    }

    predictLabel.toDF("prediction")
  }
}
測試數據集：
生成的貝葉斯模型：
算法小白的第二次嘗試----NavieBayes

idea快速構建sbt項目，挑戰全網最全最細（親測，1分鐘內可成功構建sbt）

一文徹底搞懂spark的shuffle過程（shuffle write）

算法小白的第一次嘗試---判斷點是否在不規則區域範圍內（手撕）

算法小白的第一次嘗試---PCA（主成分分析）降維【適合各種緯度數據】

spark讀取csv中文亂碼

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結