算法小白的第一次嘗試---PCA(主成分分析)降維

import breeze.linalg.{Axis, DenseMatrix, eigSym, sum}                                                                                                    
import org.apache.log4j.{Level, Logger}                                                                                                                  
import org.apache.spark.ml.feature.LabeledPoint                                                                                                          
import org.apache.spark.ml.linalg.Vectors                                                                                                                
import scala.collection.mutable.ArrayBuffer                                                                                                              
                                                                                                                                                         
/** Dimensionality reduction by principal component analysis (PCA), built on Breeze.
  *
  * @author XiaoTangBao
  * @date 2019/4/16 9:16
  * @version 1.0
  */
object PCA2 {

  /** Demo entry point: projects a 6-feature x 5-sample matrix onto its top 2 and
    * then its top 1 principal component(s) and prints the flattened results.
    */
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    // Each row is one feature (dimension); each column is one sample (record).
    val originalData = DenseMatrix(
      (65.0, 77.0, 67.0, 80.0, 74.0),
      (61.0, 77.0, 63.0, 69.0, 70.0),
      (72.0, 76.0, 49.0, 75.0, 80.0),
      (84.0, 64.0, 65.0, 74.0, 84.0),
      (81.0, 70.0, 67.0, 74.0, 82.0),
      (79.0, 55.0, 57.0, 63.0, 74.0)
    )
    val n1 = lower_dimension(originalData, 2).toDenseVector
    println(n1)
    val n2 = lower_dimension(originalData, 1).toDenseVector
    println(n2)
  }

  /** Projects `data` onto its `k` leading principal components.
    *
    * The input matrix is left unmodified; centering is done on a fresh copy.
    *
    * @param data the original high-dimensional data; each column is one record and
    *             each row is one dimension (feature) of the original data
    * @param k    the number of dimensions to keep; must be at least 1. If `k` exceeds
    *             the number of rows of `data`, all components are returned.
    * @return a matrix with `min(k, data.rows)` rows and `data.cols` columns; row `r`
    *         holds the coordinates of every record along the component with the
    *         `r`-th largest eigenvalue. The sign of each row is arbitrary, because
    *         an eigenvector is only determined up to sign.
    * @throws IllegalArgumentException if `data` is empty or `k` is less than 1
    */
  def lower_dimension(data: DenseMatrix[Double], k: Int): DenseMatrix[Double] = {
    require(data.rows > 0 && data.cols > 0, "data must have at least one row and one column")
    require(k >= 1, s"k must be at least 1, got $k")

    // Mean of every row (feature) across all records.
    val rowMeans = sum(data, Axis._1).*=(1.0 / data.cols)

    // Center the samples into a NEW matrix. `data.t` is only a view onto the same
    // storage, so writing through it would silently modify the caller's matrix.
    val centered = DenseMatrix.tabulate[Double](data.rows, data.cols) { (i, j) =>
      data(i, j) - rowMeans(i)
    }

    // Covariance matrix of the features (normalised by the number of records).
    val covMatrix = (centered * centered.t).*=(1.0 / centered.cols)

    // Decompose once; eigenvector i is COLUMN i of `eigenvectors`, matching eigenvalue i.
    val decomposition = eigSym(covMatrix)
    val eigValues = decomposition.eigenvalues
    val eigVectors = decomposition.eigenvectors

    // Column indices of the k largest eigenvalues, largest first.
    val topColumns = (0 until eigValues.length).sortBy(i => -eigValues(i)).take(k)

    // Projection matrix: row r is the (transposed) eigenvector for the r-th largest eigenvalue.
    val projection = DenseMatrix.tabulate[Double](topColumns.length, eigVectors.rows) { (r, c) =>
      eigVectors(c, topColumns(r))
    }

    // The data set after dimensionality reduction.
    projection * centered
  }
}
-------------result-----------------------------------------------------------------
DenseVector(0.9541373503102593, -4.557705968512854, -2.949615262199667, 9.19621431038052, 11.263691236035559, -4.304111580798186, -2.9710447411780567, -1.0679052967809501, -6.297168582968044, 0.7335085357115019)
DenseVector(-8.596024639184229, 9.03696444029072, 4.432821425965438, -1.2632454620166214, -3.6105157650553084)

在這裏插入圖片描述

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章