-------------------------------------------------------------------------------------
筆者追求算法實現,不喜歡大篇幅敘述原理,有關LDA(線性判別分析)理論推薦查看該篇博客
https://www.cnblogs.com/pinard/p/6244265.html
-------------------------------------------------------------------------------------
import breeze.linalg.DenseMatrix
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.feature.{LabeledPoint,VectorAssembler}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import scala.collection.mutable.ArrayBuffer
/** Linear discriminant analysis (LDA), used to reduce the dimensionality
 * of a labeled, linearly separable dataset.
 * Data Source: http://archive.ics.uci.edu/ml/datasets/Wine
 * @author XiaoTangBao
 * @date 2019/4/24 10:32
 * @version 1.0
 */
object LDA {
  /**
   * Loads the Wine dataset, assembles its 13 feature columns into one vector
   * column, projects every sample down to 2 dimensions with LDA, and prints
   * the projected coordinates (all first components, a blank line, then all
   * second components).
   */
  def main(args: Array[String]): Unit = {
    // Suppress Spark's verbose INFO/WARN logging.
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    // Spark initialization.
    val spark = SparkSession.builder().master("local[4]").appName("LDA").getOrCreate()
    // Data source: http://archive.ics.uci.edu/ml/datasets/Wine
    // Each line is: class label followed by 13 comma-separated numeric features.
    val data = spark.sparkContext.textFile("G:\\mldata\\wine.data").map(line => line.split(","))
      .map(arr => arr.map(str => str.toDouble)).map(arr => Row(arr(0),arr(1),arr(2),arr(3),arr(4),arr(5),
        arr(6),arr(7),arr(8),arr(9),arr(10),arr(11),arr(12),arr(13)))
    // Feature names and schema used to build the DataFrame later.
    val featuresArr = Array("Alcohol","Malic acid","Ash","Alcalinity of ash","Magnesium",
      "Total phenols","Flavanoids","Nonflavanoid phenols","Proanthocyanins","Color intensity",
      "Hue","OD280/OD315 of diluted wines","Proline")
    val schema = StructType(List(StructField("label",DoubleType,true),StructField("Alcohol",DoubleType,true),StructField("Malic acid",DoubleType,true),
      StructField("Ash",DoubleType,true),StructField("Alcalinity of ash",DoubleType,true),StructField("Magnesium",DoubleType,true)
      ,StructField("Total phenols",DoubleType,true),StructField("Flavanoids",DoubleType,true),StructField("Nonflavanoid phenols",DoubleType,true)
      ,StructField("Proanthocyanins",DoubleType,true),StructField("Color intensity",DoubleType,true),StructField("Hue",DoubleType,true)
      ,StructField("OD280/OD315 of diluted wines",DoubleType,true),StructField("Proline",DoubleType,true)))
    val oridf = spark.createDataFrame(data,schema)
    // Assemble the 13 feature columns into a single "features" vector column.
    val vectorAsb = new VectorAssembler().setInputCols(featuresArr).setOutputCol("features")
    // Keep only (label, features) and run LDA down to 2 dimensions.
    val newdf = vectorAsb.transform(oridf).select("label","features")
    val rpg = run(newdf,2)
    // rpg is (2 x sampleCount): row 0 holds the first projected component,
    // row 1 the second.
    val arr = ArrayBuffer[(Double,Double)]()
    for (j <- 0 until rpg.cols) arr.append((rpg(0,j),rpg(1,j)))
    arr.foreach(tp => println(tp._1))
    println()
    arr.foreach(tp => println(tp._2))
  }

  /**
   * Trains an LDA projection on the supplied data and applies it.
   *
   * Builds the within-class scatter matrix Sw and between-class scatter
   * matrix Sb, then takes the top eigenvectors of inv(Sw) * Sb as the
   * projection basis.
   *
   * @param df training data with exactly two columns: "label" (Double) and
   *           "features" (spark.ml vector)
   * @param nb target dimensionality after the projection
   * @return a (nb x sampleCount) matrix; each column is one projected sample
   */
  def run(df:DataFrame,nb:Int)={
    import breeze.linalg._
    // Extract features and labels with typed accessors. (The original code
    // parsed Row.toString with character replacement, which is fragile and
    // can lose precision.)
    val trainData = df.select("features").rdd
      .map(row => row.getAs[org.apache.spark.ml.linalg.Vector](0).toArray).collect()
    val labels = df.select("label").rdd.map(row => row.getDouble(0)).collect()
    // Number of feature dimensions.
    val tz = trainData(0).length
    // All samples packed into one column-major matrix: each column is a sample.
    val big_Matrx = new DenseMatrix[Double](tz, trainData.length, trainData.flatten)
    // Per-dimension mean over all samples (global mean vector).
    val big_mean = sum(big_Matrx, Axis._1) *= (1.0 / big_Matrx.cols)
    // Distinct class labels.
    val allLabel = labels.distinct
    // Accumulators for the total within-class (Sw) and between-class (Sb)
    // scatter matrices.
    var total_Sw = DenseMatrix.zeros[Double](tz, tz)
    var total_Sb = DenseMatrix.zeros[Double](tz, tz)
    for (lbl <- allLabel) {
      // Indices and count of the samples belonging to this class.
      val idx = labels.indices.filter(i => labels(i) == lbl)
      val record = idx.length
      // Class samples as a (tz x record) column-major matrix.
      val sk = idx.flatMap(i => trainData(i)).toArray
      val d1 = new DenseMatrix[Double](tz, record, sk)
      // Per-dimension mean of this class.
      val cols_mean = sum(d1, Axis._1) *= (1.0 / d1.cols)
      // Center the class samples around the class mean.
      for (j <- 0 until d1.cols) d1(::, j) := d1(::, j) - cols_mean
      // Within-class contribution: sum of outer products of centered samples.
      total_Sw = total_Sw + d1 * d1.t
      // Between-class contribution: weighted outer product of the class-mean
      // offset from the global mean.
      val zf = (cols_mean - big_mean).toDenseMatrix.t
      total_Sb = total_Sb + record.toDouble * zf * zf.t
    }
    // inv(Sw) * Sb: its leading eigenvectors span the optimal LDA subspace.
    val Sw_Sb = inv(total_Sw) * total_Sb
    // Eigendecompose ONCE (the original called eig(Sw_Sb) twice).
    val eigResult = eig(Sw_Sb)
    val eigValues = eigResult.eigenvalues
    // Note: each COLUMN of eigVectors is one eigenvector.
    val eigVectors = eigResult.eigenvectors
    // Select the nb eigenvectors with the largest eigenvalues, numerically.
    // (The original serialized vectors through toString and re-parsed them,
    // which can elide entries and lose precision.)
    val topIdx = (0 until eigValues.length).sortBy(j => eigValues(j)).reverse.take(nb)
    // rt: (nb x tz) projection matrix whose rows are the chosen eigenvectors.
    val rt = DenseMatrix.zeros[Double](nb, tz)
    for ((col, r) <- topIdx.zipWithIndex) rt(r, ::) := eigVectors(::, col).t
    // Projected dataset: (nb x sampleCount).
    val lastData = rt * big_Matrx
    lastData
  }
}
根據實驗結果數據繪製圖像如下圖所示:
該結果與Python 直接調取LDA方法結果相差較大:
這是由於Spark和python求取的取特徵向量不同導致,因爲矩陣特徵向量本身就非唯一,
同一特徵值對應的特徵向量有無數個,將Spark求取的第二個特徵向量乘以-1後,結果如下:
此時可以發現,該結果與Python調包結果幾乎一致,橫縱座標的不同依舊是由於特徵向量的不完全一致導致的。因爲作者僅僅改變了向量的方向,並沒有對向量進行縮放。