import breeze.linalg.{Axis, DenseMatrix, eigSym, sum}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import scala.collection.mutable.ArrayBuffer
/** Dimensionality reduction by principal component analysis (PCA), implemented with Breeze.
  *
  * The input convention used throughout this object is "one record per column":
  * each row of the matrix is one original dimension (feature) and each column is one sample.
  *
  * @author XiaoTangBao
  * @date 2019/4/16 9:16
  * @version 1.0
  */
object PCA2 {

  /** Runs the reduction on a small hard-coded 6 x 5 matrix for k = 2 and k = 1 and prints
    * each result flattened into a vector.
    */
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    val originalData = DenseMatrix(
      (65.0, 77.0, 67.0, 80.0, 74.0),
      (61.0, 77.0, 63.0, 69.0, 70.0),
      (72.0, 76.0, 49.0, 75.0, 80.0),
      (84.0, 64.0, 65.0, 74.0, 84.0),
      (81.0, 70.0, 67.0, 74.0, 82.0),
      (79.0, 55.0, 57.0, 63.0, 74.0)
    )
    val n1 = lower_dimension(originalData, 2).toDenseVector
    println(n1)
    val n2 = lower_dimension(originalData, 1).toDenseVector
    println(n2)
  }

  /** Projects `data` onto its `k` leading principal components.
    *
    * Steps: subtract each row's mean, form the covariance matrix `X * X.t / n` of the centered
    * data (n = number of columns), take its symmetric eigendecomposition, and multiply the
    * centered data by the eigenvectors belonging to the `k` largest eigenvalues.
    *
    * The input matrix is not modified.
    *
    * @param data the original high-dimensional data; each column is one record and each row is
    *             one dimension of the original space
    * @param k    the number of dimensions to keep; values larger than `data.rows` yield
    *             `data.rows` components
    * @return a matrix with `min(k, data.rows)` rows and `data.cols` columns; row i holds the
    *         coordinates of every record along the i-th principal component, ordered by
    *         descending eigenvalue. The sign of each row depends on the sign the eigensolver
    *         chose for the corresponding eigenvector.
    * @throws IllegalArgumentException if `data` has no rows or no columns, or if `k < 1`
    */
  def lower_dimension(data: DenseMatrix[Double], k: Int): DenseMatrix[Double] = {
    require(data.rows > 0 && data.cols > 0, "data must contain at least one row and one column")
    require(k >= 1, s"k must be at least 1, got $k")

    // Center the samples (row-wise, because every row is one dimension).
    val centered = centerRows(data)

    // Covariance matrix of the samples; `covariance` is a fresh matrix, so scaling it in place
    // touches nothing the caller can see.
    val covariance = centered * centered.t
    covariance *= (1.0 / centered.cols)

    // Decompose once and reuse both parts.
    val decomposition = eigSym(covariance)
    val eigenvalues = decomposition.eigenvalues
    val eigenvectors = decomposition.eigenvectors

    // Pick the eigenvectors that belong to the k largest eigenvalues.
    val leading = (0 until eigenvalues.length).sortBy(i => eigenvalues(i)).reverse.take(k)

    // The eigenvector for eigenvalue i is COLUMN i of `eigenvectors`, so each selected column
    // is laid out as one row of the projection matrix.
    val components = DenseMatrix.tabulate(leading.length, eigenvectors.rows) { (i, j) =>
      eigenvectors(j, leading(i))
    }

    // The data set after dimensionality reduction.
    components * centered
  }

  /** Returns a new matrix in which the mean of every row of `data` has been subtracted from
    * that row. A fresh matrix is built instead of writing through `data.t`, because a Breeze
    * transpose is a view onto the same storage and writing to it would alter the caller's data.
    */
  private def centerRows(data: DenseMatrix[Double]): DenseMatrix[Double] = {
    // Sum along each row, then scale to obtain the per-row mean.
    val rowMeans = sum(data, Axis._1)
    rowMeans *= (1.0 / data.cols)
    DenseMatrix.tabulate(data.rows, data.cols)((r, c) => data(r, c) - rowMeans(r))
  }
}
/*
 * ------------- result (output recorded by the original author) -------------
 * DenseVector(0.9541373503102593, -4.557705968512854, -2.949615262199667, 9.19621431038052, 11.263691236035559, -4.304111580798186, -2.9710447411780567, -1.0679052967809501, -6.297168582968044, 0.7335085357115019)
 * DenseVector(-8.596024639184229, 9.03696444029072, 4.432821425965438, -1.2632454620166214, -3.6105157650553084)
 *
 * Source article: "An algorithm beginner's first attempt --- dimensionality reduction with PCA (principal component analysis)"
 * Post a comment
 * All comments
 * No one has commented yet. Want to be the first? Enter your comment in the box above and click publish.
 */