>樸素貝葉斯原理見該博客(強烈推薦):
>https://mp.weixin.qq.com/s?src=11&timestamp=1584688625&ver=2227&signature=O754zhc6apcSqOgNLOcewFs6K3RMvj9Tuz1nB4I*-IfaZLh5wlbpKA8iJxFtQ*xLy3FoYyW*pB2t7puAhrS7WS8uZLuH2XBdcv8u1Cp2u-Elufc7IvQ67zGNA6uFwLGC&new=1
/**
* Created by GangTian on 2020/3/22 in SIBAT
* 針對bayes中預測速度太慢的部分進行改進,個人認爲主要原因在於生成的模型過於簡單了。
* NavieBayes2生成模型時,各特徵均加入了未知數X,這樣可以極大的提升預測時的速度,
* 但當各features維度太高時(幾十維或上百維時),生成模型所需時間很長。
* 不過針對features維度較低時可用。當然,Spark ml可直接調用Bayes模型
*/
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions.udf
import scala.collection.mutable.ArrayBuffer
/**
 * Categorical naive Bayes classifier on top of Spark DataFrames.
 *
 * `train` turns labelled history into a lookup table of Laplace-smoothed
 * conditional probabilities plus prior probabilities; `predict` scores new rows
 * against that table. Every column is treated as a categorical string.
 *
 * The model is small (one row per feature value and label), so both methods
 * work on driver-side collections once the per-feature counts have been
 * aggregated by Spark. This is intended for low-cardinality features.
 */
object NavieBayes {

  import org.apache.spark.sql.functions.{coalesce, lit}

  /** Name of the column that holds the class label in the training data and in the model. */
  private val LabelCol = "label"

  /** Value stored in `optional` for the rows that describe a feature value never seen in training. */
  private val UnseenToken = "X"

  /** Category that a null feature value is mapped to, in training and in prediction alike. */
  private val NullToken = "null"

  /** Casts every column to string and replaces nulls by [[NullToken]], keeping the column names. */
  private def asStrings(data: DataFrame): DataFrame =
    data.select(data.columns.map(c => coalesce(col(c).cast("string"), lit(NullToken)).as(c)): _*)

  /**
   * Trains the model: computes the prior probability of every label and the
   * smoothed conditional probability of every feature value given every label.
   *
   * Rows whose label is null are ignored. Null feature values are treated as
   * the category "null". Values are never joined into delimited strings, so
   * they may contain any character, including commas.
   *
   * @param trainData columns f1, f2, ..., fn, label; the features may have any type and
   *                  any name, the label column must be named "label"
   * @return the model, one row per (feature, value, label) combination, with the columns
   *         optional, label, num, featureName, historyFeaturesNum, count, condition_pro
   *         and pri_pro. For every feature and label there is one extra row whose
   *         `optional` is "X"; it carries the probability used for values that were
   *         never seen in training. The model is empty when there is nothing to train on.
   */
  def train(trainData: DataFrame): DataFrame = {
    import trainData.sparkSession.implicits._

    // Laplace smoothing (lamda = 1) keeps unseen combinations from getting probability zero.
    val lamda = 1

    // (occurrences + lamda) / (records with that label + number of categories * lamda)
    def smoothed(occurrences: Long, labelCount: Long, categories: Long): Double =
      (occurrences + lamda) * 1.0 / (labelCount + categories * lamda)

    val featuresCol = trainData.columns.filter(_ != LabelCol)
    val transData = asStrings(trainData.filter(col(LabelCol).isNotNull))

    // 1. Prior probability of every label.
    val labelCounts: Map[String, Long] = transData.groupBy(LabelCol).count().collect()
      .map(row => row.getString(0) -> row.getLong(1)).toMap
    val labels = labelCounts.keys.toSeq.sorted
    val totalRecord = labelCounts.values.sum
    val priors: Map[String, Double] = labelCounts.map { case (label, count) =>
      label -> (count + lamda) / (totalRecord + labels.size * lamda * 1.0)
    }

    // 2. Conditional probabilities, one aggregation job per feature.
    val rows = featuresCol.toSeq.flatMap { featureName =>
      val observed: Map[(String, String), Long] =
        transData.groupBy(featureName, LabelCol).count().collect()
          .map(row => (row.getString(0), row.getString(1)) -> row.getLong(2)).toMap
      val values = observed.keys.map(_._1).toSeq.distinct
      val categories = values.size.toLong

      // Every value is paired with every label; combinations that never occurred get
      // num = 0 and therefore only the smoothing mass.
      val seen = for (value <- values; label <- labels) yield {
        val num = observed.getOrElse((value, label), 0L)
        val labelCount = labelCounts(label)
        (value, label, num, featureName, categories, labelCount,
          smoothed(num, labelCount, categories), priors(label))
      }

      // The unseen value counts as one additional category of the feature.
      val unseen = labels.map { label =>
        val labelCount = labelCounts(label)
        (UnseenToken, label, 0L, featureName, categories, labelCount,
          smoothed(0L, labelCount, categories + 1), priors(label))
      }

      seen ++ unseen
    }

    rows.toDF("optional", "label", "num", "featureName", "historyFeaturesNum", "count",
      "condition_pro", "pri_pro")
  }

  /**
   * Predicts the most probable label for every row of the input.
   *
   * The model is collected to the driver once and every row is scored locally as
   * prior * product of the conditional probabilities of its feature values. A value
   * that the model does not know for a feature uses that feature's "X" row. Columns
   * that are not features of the model do not contribute to the score; if no column
   * matches, the label with the highest prior is predicted. Ties are resolved in
   * favour of the alphabetically first label.
   *
   * @param testData same layout as the training data, without the label column
   * @param model    the model produced by `train`
   * @return a DataFrame with the single column "prediction", one row per input row
   * @throws NoSuchElementException if there are rows to predict but the model is empty
   */
  def predict(testData: DataFrame, model: DataFrame): DataFrame = {
    import model.sparkSession.implicits._

    val columns = testData.columns
    val modelRows = model
      .select("featureName", "optional", "label", "condition_pro", "pri_pro")
      .collect()

    // label -> prior probability, in a fixed order so that ties are deterministic
    val priors = modelRows.map(row => (row.getString(2), row.getDouble(4))).distinct
      .sortBy(_._1).toSeq

    // featureName -> feature value -> (label, conditional probability) pairs
    val likelihoods: Map[String, Map[String, Seq[(String, Double)]]] =
      modelRows.groupBy(_.getString(0)).map { case (featureName, featureRows) =>
        featureName -> featureRows.groupBy(_.getString(1)).map { case (value, valueRows) =>
          value -> valueRows.map(row => (row.getString(2), row.getDouble(3))).toSeq
        }
      }

    // Rows are read field by field, so values containing commas or empty strings are safe.
    val records = asStrings(testData).collect()
    if (records.nonEmpty && priors.isEmpty) {
      throw new NoSuchElementException("cannot predict: the model contains no labels")
    }

    val predictLabel = records.map { record =>
      val factors = columns.indices.flatMap { i =>
        likelihoods.get(columns(i)).toSeq.flatMap { byValue =>
          byValue.getOrElse(record.getString(i),
            byValue.getOrElse(UnseenToken, Seq.empty[(String, Double)]))
        }
      }
      val conditional: Map[String, Double] = factors.groupBy(_._1).map { case (label, fs) =>
        label -> fs.map(_._2).product
      }
      priors
        .map { case (label, prior) => (label, conditional.getOrElse(label, 1.0) * prior) }
        .maxBy(_._2)._1
    }.toList

    predictLabel.toDF("prediction")
  }
}
測試數據集:
生成的貝葉斯模型: