PCA算法cpp實現

PCA過程:

  • 零均值化
  • 求協方差
  • 求特徵值及特徵向量
  • 按特徵值大小排序特徵向量
  • 取前k行組成變換矩陣
  • 使用變換矩陣即可進行降維或還原

測試數據集(68040*32):CorelFeatures-mld/ColorHistogram.asc

#include <iostream>
#include <math.h>
#include <string.h>
#include <string>
#include <vector>
#include <map>
#include <fstream>
#include <iomanip>

using namespace std;

/**
* @brief 計算協方差矩陣
* @param mat    m*n 輸入矩陣
* @param covar  n*n 協方差矩陣
* @param mean   列均值向量
* @param scale  是否縮放
* @return
*/
template<typename _Tp>
int calcCovarMatrix(const std::vector<std::vector<_Tp>>& mat, std::vector<std::vector<_Tp>>& covar, std::vector<_Tp>& mean, bool scale = false)
{
    const int rows = mat.size();
    const int cols = mat[0].size();
    const int nsamples = rows;
    double scale_ = 1.;
    if (scale) scale_ = 1. / (nsamples /*- 1*/);

    covar.resize(cols);
    for (int i = 0; i < cols; ++i)
        covar[i].resize(cols, (_Tp)0);
    mean.resize(cols, (_Tp)0);

    for (int w = 0; w < cols; ++w) {
        for (int h = 0; h < rows; ++h) {
            mean[w] += mat[h][w];
        }
    }

    for (auto& value : mean) {
        value = 1. / rows * value;
    }

    for (int i = 0; i < cols; ++i) {
        std::vector<_Tp> col_buf(rows, (_Tp)0);
        for (int k = 0; k < rows; ++k)
            col_buf[k] = mat[k][i] - mean[i];

        for (int j = 0; j < cols; ++j) {
            double s0 = 0;
            for (int k = 0; k < rows; ++k) {
                s0 += col_buf[k] * (mat[k][j] - mean[j]);
            }
            covar[i][j] = (_Tp)(s0 * scale_);
        }
    }

    return 0;
}

/**
* @brief 求實對稱矩陣的特徵值及特徵向量的雅克比法
* 利用雅格比(Jacobi)方法求實對稱矩陣的全部特徵值及特徵向量
* @param pMatrix                n*n 存放實對稱矩陣
* @param pdblVects              n*n 返回特徵向量(按列存儲)
* @param pdbEigenValues         特徵值數組
* @param dbEps                  精度
* @param nJt                    最大迭代次數
* @return
*/
int JacbiCor(const vector<vector<double> >& mat, vector<vector<double> >& vects, vector<double>& values, double dbEps, int nJt)
{
    int nDim = mat.size();
    double pMatrix[nDim*nDim];
    for(int i=0; i<nDim; i++){
        for(int j=0; j<nDim; j++){
            pMatrix[i*nDim+j] = mat[i][j];
        }
    }
    double* pdblVects = new double[nDim*nDim];
    double* pdbEigenValues = new double[nDim];
    for(int i = 0; i < nDim; i ++)
    {
        pdblVects[i*nDim+i] = 1.0f;
        for(int j = 0; j < nDim; j ++)
        {
            if(i != j)
                pdblVects[i*nDim+j]=0.0f;
        }
    }

    int nCount = 0;     //迭代次數
    while(1)
    {
        //在pMatrix的非對角線上找到最大元素
        double dbMax = pMatrix[1];
        int nRow = 0;
        int nCol = 1;
        for (int i = 0; i < nDim; i ++)          //行
        {
            for (int j = 0; j < nDim; j ++)      //列
            {
                double d = fabs(pMatrix[i*nDim+j]);

                if((i!=j) && (d> dbMax))
                {
                    dbMax = d;
                    nRow = i;
                    nCol = j;
                }
            }
        }

        if(dbMax < dbEps)     //精度符合要求
            break;

        if(nCount > nJt)       //迭代次數超過限制
            break;

        nCount++;

        double dbApp = pMatrix[nRow*nDim+nRow];
        double dbApq = pMatrix[nRow*nDim+nCol];
        double dbAqq = pMatrix[nCol*nDim+nCol];

        //計算旋轉角度
        double dbAngle = 0.5*atan2(-2*dbApq,dbAqq-dbApp);
        double dbSinTheta = sin(dbAngle);
        double dbCosTheta = cos(dbAngle);
        double dbSin2Theta = sin(2*dbAngle);
        double dbCos2Theta = cos(2*dbAngle);

        pMatrix[nRow*nDim+nRow] = dbApp*dbCosTheta*dbCosTheta +
            dbAqq*dbSinTheta*dbSinTheta + 2*dbApq*dbCosTheta*dbSinTheta;
        pMatrix[nCol*nDim+nCol] = dbApp*dbSinTheta*dbSinTheta +
            dbAqq*dbCosTheta*dbCosTheta - 2*dbApq*dbCosTheta*dbSinTheta;
        pMatrix[nRow*nDim+nCol] = 0.5*(dbAqq-dbApp)*dbSin2Theta + dbApq*dbCos2Theta;
        pMatrix[nCol*nDim+nRow] = pMatrix[nRow*nDim+nCol];

        for(int i = 0; i < nDim; i ++)
        {
            if((i!=nCol) && (i!=nRow))
            {
                int u = i*nDim + nRow;  //p
                int w = i*nDim + nCol;  //q
                dbMax = pMatrix[u];
                pMatrix[u]= pMatrix[w]*dbSinTheta + dbMax*dbCosTheta;
                pMatrix[w]= pMatrix[w]*dbCosTheta - dbMax*dbSinTheta;
            }
        }

        for (int j = 0; j < nDim; j ++)
        {
            if((j!=nCol) && (j!=nRow))
            {
                int u = nRow*nDim + j;  //p
                int w = nCol*nDim + j;  //q
                dbMax = pMatrix[u];
                pMatrix[u]= pMatrix[w]*dbSinTheta + dbMax*dbCosTheta;
                pMatrix[w]= pMatrix[w]*dbCosTheta - dbMax*dbSinTheta;
            }
        }

        //計算特徵向量
        for(int i = 0; i < nDim; i ++)
        {
            int u = i*nDim + nRow;      //p
            int w = i*nDim + nCol;      //q
            dbMax = pdblVects[u];
            pdblVects[u] = pdblVects[w]*dbSinTheta + dbMax*dbCosTheta;
            pdblVects[w] = pdblVects[w]*dbCosTheta - dbMax*dbSinTheta;
        }

    }

    //對特徵值進行排序以及重新排列特徵向量,特徵值即pMatrix主對角線上的元素
    std::map<double,int> mapEigen;
    for(int i = 0; i < nDim; i ++)
    {
        pdbEigenValues[i] = pMatrix[i*nDim+i];

        mapEigen.insert(make_pair( pdbEigenValues[i],i ) );
    }

    double *pdbTmpVec = new double[nDim*nDim];
    std::map<double,int>::reverse_iterator iter = mapEigen.rbegin();
    for (int j = 0; iter != mapEigen.rend(),j < nDim; ++iter,++j)
    {
        for (int i = 0; i < nDim; i ++)
        {
            pdbTmpVec[i*nDim+j] = pdblVects[i*nDim + iter->second];
        }

        //特徵值重新排列
        pdbEigenValues[j] = iter->first;
    }

    //設定正負號
    for(int i = 0; i < nDim; i ++)
    {
        double dSumVec = 0;
        for(int j = 0; j < nDim; j ++)
            dSumVec += pdbTmpVec[j * nDim + i];
        if(dSumVec<0)
        {
            for(int j = 0;j < nDim; j ++)
                pdbTmpVec[j * nDim + i] *= -1;
        }
    }

    memcpy(pdblVects,pdbTmpVec,sizeof(double)*nDim*nDim);
    for(int i=0; i<nDim; i++){
        values[i] = pdbEigenValues[i];
        for(int j=0; j<nDim; j++){
            vects[i][j] = pdblVects[i*nDim+j];
        }
    }
    delete []pdbTmpVec;
    delete []pdblVects;
    delete []pdbEigenValues;

    return 0;
}

/**
* @brief 矩陣乘法
* 須滿足 a的列數 == b的行數
* @param a      m*k 矩陣
* @param b      k*n 矩陣
* @param result m*n 矩陣 乘法結果
* @return
*/
int mul(const vector<vector<double> >& a, const vector<vector<double> >& b, vector<vector<double> >& result){
    const int m = a.size();
    const int n = a[0].size();
    const int k = b[0].size();
    //cout<<"mul: "<<a.size()<<"*"<<a[0].size()<<" "<<b.size()<<"*"<<b[0].size()<<endl;
    for (int i = 0; i < m; i++)  //a的行數
    {
        vector<double> t;
        for (int j = 0; j < k; j++)      //b的列數
        {
            double sum = 0;
            for (int w = 0; w < n; w++)  //a的列數
            {
                sum += a[i][w] * b[w][j];
            }
            t.push_back(sum);
        }
        result.push_back(t);
    }
    return 0;
}

/**
* @brief 向量方差
* 須滿足 a b 維數相等
* @param a      n維向量
* @param b      n維向量
* @param result 方差
* @return
*/
int variance(vector<double> a, vector<double> b, double& result){
    result = 0;
    const int n = a.size();
    for(int i=0; i<n; i++){
        result += (a[i]-b[i])*(a[i]-b[i]);
    }
    result /= n;
    return 0;
}

const int m = 68040;    //行,向量數
const int n = 32;       //列,維數

double kee = 0.95;      //能量閾值,計算最小滿足的k 也可直接指定k

int main(){
    fstream fs("ColorHistogram.asc");

    vector<vector<double> > data; //原始數據
    vector<double> mean;//均值
    vector<vector<double> > covar;//協方差

    for(int i=0; i<m; i++){
        vector<double> line;
        for(int j=0; j<=n; j++){
            double t;
            fs>>t;
            if(j != 0){
                line.push_back(t);
            }
        }
        data.push_back(line);
    }
    fs.close();
    cout<<"read data over."<<endl;

    calcCovarMatrix(data, covar, mean);
    cout<<"calcCovarMatrix over."<<endl;

    vector<vector<double> > pdblVects;
    pdblVects.resize(n);
    for(int i=0; i<n; i++) pdblVects[i].resize(n);
    vector<double> pdbEigenValues;
    pdbEigenValues.resize(n);

    JacbiCor(covar, pdblVects, pdbEigenValues, 0.1, 1000);
    cout<<"JacbiCor over."<<endl;

    cout<<"eigenvalue: "<<endl;
    double sum_pdbEigenValues = 0;
    for(auto v : pdbEigenValues){
        sum_pdbEigenValues += v;
        cout<<v<<"\t";
    }
    cout<<endl;

    //計算最小滿足kee的k值
    int k = 0;
    double t_sum = 0;
    for(; k<n; k++){
        t_sum += pdbEigenValues[k];
        cout<<k<<" "<<t_sum/sum_pdbEigenValues<<endl;
        if(t_sum/sum_pdbEigenValues > kee){
            break;
        }
    }
    k++;
    //k = 32;
    cout<<"k: "<<k<<endl;

    //旋轉以用於矩陣乘法
    vector<vector<double> > rotate_pdblVects;
    rotate_pdblVects.resize(n);
    for(int i=0; i<k; i++){
        for(int j=0; j<n; j++){
            rotate_pdblVects[j].push_back(pdblVects[i][j]);
        }
    }
    //降維
    vector<vector<double> > z;
    mul(data, rotate_pdblVects, z);
    cout<<"to k-dim over."<<endl;
    //還原
    vector<vector<double> > xap;
    mul(z, pdblVects, xap);
    cout<<"back n-dim over."<<endl;
    //與原始數據進行對比
    cout.setf(ios::fixed);
    for(int i=0; i<15; i++){
        cout<<fixed<<setprecision(4)<<data[0][i]<<"\t";
    }
    cout<<endl;
    for(int i=0; i<15; i++){
        cout<<fixed<<setprecision(4)<<xap[0][i]<<"\t";
    }
    cout<<endl;
    //計算與原始數據的標準差
    cout<<"variance: "<<endl;
    for(int i=0; i<10; i++){
        double vari;
        variance(data[i], xap[i], vari);
        cout<<i<<":\t"<<sqrt(vari)<<endl;
    }
    return 0;
}

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章