Machine Learning: PCA Principles and Code Implementation

'''
PCA (Principal Component Analysis)

Take the iris dataset as an example: the feature space is 4-dimensional (each training sample has 4 features).
We want to reduce the dimensionality of the features by projecting them onto a principal component / principal direction.
A principal component is itself a 4-dimensional vector in the feature space; every sample's feature vector is projected
onto it, and we want the projected values to be as spread out as possible (the larger the variance, the better the data
are separated).

How is the projection computed?
With an inner product: first normalize the principal-component vector to unit length, then project the current feature
vector onto it, i.e. take the dot product of the two vectors (np.dot(feat_x.T, main_feat)).

The problem thus becomes: find a principal-component vector such that the projections of all input samples onto that
direction are as spread out as possible. The principal direction / principal component is this optimal projection direction.

Below, the original 4-dimensional features are reduced to 2 dimensions both with sklearn's PCA and with a hand-written
Python implementation, and the results are compared.
Since PCA is applied while preprocessing the input data, it does not need the class labels of the input data.
'''
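
# A minimal sketch of the projection idea described above (toy data, not the iris set):
# each row of X_toy is one centered sample, u and v are unit-length candidate directions,
# and the projection of every sample onto a direction is just a dot product.
# The direction whose projections have the larger variance is the better principal direction.
import numpy as np  # also imported below; repeated here so this sketch is self-contained

X_toy = np.array([[2.0, 1.9], [1.0, 1.1], [-1.0, -0.9], [-2.0, -2.1]])
X_toy = X_toy - X_toy.mean(axis=0)          # center the toy data
u = np.array([1.0, 1.0]) / np.sqrt(2.0)     # unit vector along the diagonal
v = np.array([1.0, -1.0]) / np.sqrt(2.0)    # unit vector orthogonal to u
proj_u = np.dot(X_toy, u)                   # scalar projections onto u
proj_v = np.dot(X_toy, v)                   # scalar projections onto v
# proj_u.var() is much larger than proj_v.var(), so u is the better projection direction
# print(proj_u.var(), proj_v.var())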

import numpy as np
# variance is the statistic that measures how spread out the data are
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

data_file='F:\\machine_learning\\2\\iris.txt'
class_dict={"setosa":0,"versicolor":1,"virginica":2}
data=np.zeros((150,4),dtype=np.float64)
label=np.zeros((150,))
with open(data_file,'r') as file:
    for idx,line in enumerate(file.readlines()[1:]):  # skip the header row, start reading from the second line
        line=line.strip().split(' ')
        data[idx,:]=np.array([float(line[1]),float(line[2]),float(line[3]),float(line[4])])
        label[idx]=class_dict[line[-1][1:-1]]  # line[-1][1:-1] strips the quotes around the species name
# print(data,data.shape)
# print(label)

def vis_dataset(x_feat, ylabel):
    '''
    :param x_feat: 150*2  reduced features of the input data (only the first two columns are plotted)
    :param ylabel: 150,   labels of the input data
    :return:
    '''
    fig = plt.figure()
    # set the x-axis label
    plt.xlabel('x1')
    # set the y-axis label
    plt.ylabel('x2')
    # draw one scatter plot per class
    feat_class_0 = x_feat[np.where(ylabel == 0)]
    # print(feat_class_0.shape)

    plt.scatter(feat_class_0[:, 0], feat_class_0[:, 1], color='red', marker='^', label='class_0')

    feat_class_1 = x_feat[np.where(ylabel == 1)]
    plt.scatter(feat_class_1[:, 0], feat_class_1[:, 1], color='green', marker='o', label='class_1')

    feat_class_2 = x_feat[np.where(ylabel == 2)]
    plt.scatter(feat_class_2[:, 0], feat_class_2[:, 1], color='blue', marker='x', label='class_2')
    # add the legend
    plt.legend(loc='upper right')

    # display the figure
    plt.show()

# Method 1: PCA via the sklearn implementation
pca = PCA(n_components=2)
pca.fit(data)
# print(pca.explained_variance_)
newX=pca.transform(data)
print(newX.shape,newX)
# scatter plot of the data after PCA dimensionality reduction
vis_dataset(newX,label)

# Method 2: PCA implemented from scratch in Python
# Steps (implemented in compute_pca below):
# 1. Subtract the per-dimension mean from the feature matrix, i.e. center the data
#    (this step is required: it is a precondition for the PCA derivation)
# 2. Compute the covariance matrix of the centered feature matrix, then its eigenvalues
#    and the corresponding eigenvectors
# 3. Sort the eigenvalues in descending order (by absolute value here), take the eigenvectors
#    corresponding to the k largest eigenvalues, and stack them into the projection matrix
# 4. Multiply the centered feature matrix by the projection matrix to obtain the reduced feature matrix

def compute_pca(input_data,n_feat):
    '''
    :param input_data: feature matrix to reduce with PCA   150*4
    :param n_feat: desired output dimensionality after PCA
    :return:
    '''
    # step 1: zero-center every feature dimension of the input data;
    # this is the precondition for the PCA formulas used below
    mean=np.average(input_data,axis=0)
    input_feat=input_data-mean

    # input_feat = [150,4], centered independently in each feature dimension

    covariance_matrix=np.dot(input_feat.T,input_feat)

    # step 2: (unnormalized) covariance matrix of the centered features
    # covariance_matrix = [4,4]

    eigen_value, eigen_vector = np.linalg.eig(covariance_matrix)

    # eigenvalues of the covariance matrix go into eigen_value,
    # the corresponding eigenvectors into eigen_vector

    # print('eigen_value',eigen_value/149,eigen_vector)  # dividing by n-1 = 149 gives the sample-covariance eigenvalues

    index=np.argsort(-np.abs(eigen_value))

    # print('index',index)

    # step 3: sort the eigenvalues in descending order

    used_index=index[:n_feat]

    # keep only the n_feat largest eigenvalues and their eigenvectors

    used_value=eigen_value[used_index]
    used_vector=eigen_vector[:,used_index]

    '''
    Note in particular:
    numpy.linalg.eig returns the eigenvectors as the COLUMNS of the returned matrix.
    My earlier code assumed they were rows, so selecting the eigenvectors corresponding
    to the top-k eigenvalues was wrong.
    '''

    # print('used_value',used_value,used_vector)

    # step 4: project the centered features onto the selected eigenvectors
    output_feat=np.dot(input_feat,used_vector)

    return output_feat
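
# A quick sanity check of the note above (a small sketch on a toy symmetric matrix, not tied to the iris data):
# np.linalg.eig stores the eigenvectors as columns, i.e. A @ vecs[:, i] == vals[i] * vecs[:, i] for every i
_A = np.array([[2.0, 1.0], [1.0, 2.0]])
_vals, _vecs = np.linalg.eig(_A)
for _i in range(len(_vals)):
    assert np.allclose(np.dot(_A, _vecs[:, _i]), _vals[_i] * _vecs[:, _i])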

newX_2=compute_pca(data,2)
print(newX_2)
vis_dataset(newX_2,label)

Result produced by the sklearn package:

Result produced by my Python code (my projected feature matrix has the opposite sign in the second dimension compared to sklearn's; this is expected, since an eigenvector is only determined up to its sign, so flipping a principal direction does not change how spread out the projected data are, and the two results are equivalent).
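To make this concrete, the following small check (a sketch, assuming newX from sklearn and newX_2 from compute_pca as computed by the script above) verifies whether the two results agree column by column, either exactly or up to a sign flip:

for col in range(2):
    same = np.allclose(newX[:, col], newX_2[:, col])
    flipped = np.allclose(newX[:, col], -newX_2[:, col])
    print('column', col, 'identical:', same, 'sign-flipped:', flipped)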

Data file used by the code:

iris.txt
"Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
"1" 5.1 3.5 1.4 0.2 "setosa"
"2" 4.9 3 1.4 0.2 "setosa"
"3" 4.7 3.2 1.3 0.2 "setosa"
"4" 4.6 3.1 1.5 0.2 "setosa"
"5" 5 3.6 1.4 0.2 "setosa"
"6" 5.4 3.9 1.7 0.4 "setosa"
"7" 4.6 3.4 1.4 0.3 "setosa"
"8" 5 3.4 1.5 0.2 "setosa"
"9" 4.4 2.9 1.4 0.2 "setosa"
"10" 4.9 3.1 1.5 0.1 "setosa"
"11" 5.4 3.7 1.5 0.2 "setosa"
"12" 4.8 3.4 1.6 0.2 "setosa"
"13" 4.8 3 1.4 0.1 "setosa"
"14" 4.3 3 1.1 0.1 "setosa"
"15" 5.8 4 1.2 0.2 "setosa"
"16" 5.7 4.4 1.5 0.4 "setosa"
"17" 5.4 3.9 1.3 0.4 "setosa"
"18" 5.1 3.5 1.4 0.3 "setosa"
"19" 5.7 3.8 1.7 0.3 "setosa"
"20" 5.1 3.8 1.5 0.3 "setosa"
"21" 5.4 3.4 1.7 0.2 "setosa"
"22" 5.1 3.7 1.5 0.4 "setosa"
"23" 4.6 3.6 1 0.2 "setosa"
"24" 5.1 3.3 1.7 0.5 "setosa"
"25" 4.8 3.4 1.9 0.2 "setosa"
"26" 5 3 1.6 0.2 "setosa"
"27" 5 3.4 1.6 0.4 "setosa"
"28" 5.2 3.5 1.5 0.2 "setosa"
"29" 5.2 3.4 1.4 0.2 "setosa"
"30" 4.7 3.2 1.6 0.2 "setosa"
"31" 4.8 3.1 1.6 0.2 "setosa"
"32" 5.4 3.4 1.5 0.4 "setosa"
"33" 5.2 4.1 1.5 0.1 "setosa"
"34" 5.5 4.2 1.4 0.2 "setosa"
"35" 4.9 3.1 1.5 0.2 "setosa"
"36" 5 3.2 1.2 0.2 "setosa"
"37" 5.5 3.5 1.3 0.2 "setosa"
"38" 4.9 3.6 1.4 0.1 "setosa"
"39" 4.4 3 1.3 0.2 "setosa"
"40" 5.1 3.4 1.5 0.2 "setosa"
"41" 5 3.5 1.3 0.3 "setosa"
"42" 4.5 2.3 1.3 0.3 "setosa"
"43" 4.4 3.2 1.3 0.2 "setosa"
"44" 5 3.5 1.6 0.6 "setosa"
"45" 5.1 3.8 1.9 0.4 "setosa"
"46" 4.8 3 1.4 0.3 "setosa"
"47" 5.1 3.8 1.6 0.2 "setosa"
"48" 4.6 3.2 1.4 0.2 "setosa"
"49" 5.3 3.7 1.5 0.2 "setosa"
"50" 5 3.3 1.4 0.2 "setosa"
"51" 7 3.2 4.7 1.4 "versicolor"
"52" 6.4 3.2 4.5 1.5 "versicolor"
"53" 6.9 3.1 4.9 1.5 "versicolor"
"54" 5.5 2.3 4 1.3 "versicolor"
"55" 6.5 2.8 4.6 1.5 "versicolor"
"56" 5.7 2.8 4.5 1.3 "versicolor"
"57" 6.3 3.3 4.7 1.6 "versicolor"
"58" 4.9 2.4 3.3 1 "versicolor"
"59" 6.6 2.9 4.6 1.3 "versicolor"
"60" 5.2 2.7 3.9 1.4 "versicolor"
"61" 5 2 3.5 1 "versicolor"
"62" 5.9 3 4.2 1.5 "versicolor"
"63" 6 2.2 4 1 "versicolor"
"64" 6.1 2.9 4.7 1.4 "versicolor"
"65" 5.6 2.9 3.6 1.3 "versicolor"
"66" 6.7 3.1 4.4 1.4 "versicolor"
"67" 5.6 3 4.5 1.5 "versicolor"
"68" 5.8 2.7 4.1 1 "versicolor"
"69" 6.2 2.2 4.5 1.5 "versicolor"
"70" 5.6 2.5 3.9 1.1 "versicolor"
"71" 5.9 3.2 4.8 1.8 "versicolor"
"72" 6.1 2.8 4 1.3 "versicolor"
"73" 6.3 2.5 4.9 1.5 "versicolor"
"74" 6.1 2.8 4.7 1.2 "versicolor"
"75" 6.4 2.9 4.3 1.3 "versicolor"
"76" 6.6 3 4.4 1.4 "versicolor"
"77" 6.8 2.8 4.8 1.4 "versicolor"
"78" 6.7 3 5 1.7 "versicolor"
"79" 6 2.9 4.5 1.5 "versicolor"
"80" 5.7 2.6 3.5 1 "versicolor"
"81" 5.5 2.4 3.8 1.1 "versicolor"
"82" 5.5 2.4 3.7 1 "versicolor"
"83" 5.8 2.7 3.9 1.2 "versicolor"
"84" 6 2.7 5.1 1.6 "versicolor"
"85" 5.4 3 4.5 1.5 "versicolor"
"86" 6 3.4 4.5 1.6 "versicolor"
"87" 6.7 3.1 4.7 1.5 "versicolor"
"88" 6.3 2.3 4.4 1.3 "versicolor"
"89" 5.6 3 4.1 1.3 "versicolor"
"90" 5.5 2.5 4 1.3 "versicolor"
"91" 5.5 2.6 4.4 1.2 "versicolor"
"92" 6.1 3 4.6 1.4 "versicolor"
"93" 5.8 2.6 4 1.2 "versicolor"
"94" 5 2.3 3.3 1 "versicolor"
"95" 5.6 2.7 4.2 1.3 "versicolor"
"96" 5.7 3 4.2 1.2 "versicolor"
"97" 5.7 2.9 4.2 1.3 "versicolor"
"98" 6.2 2.9 4.3 1.3 "versicolor"
"99" 5.1 2.5 3 1.1 "versicolor"
"100" 5.7 2.8 4.1 1.3 "versicolor"
"101" 6.3 3.3 6 2.5 "virginica"
"102" 5.8 2.7 5.1 1.9 "virginica"
"103" 7.1 3 5.9 2.1 "virginica"
"104" 6.3 2.9 5.6 1.8 "virginica"
"105" 6.5 3 5.8 2.2 "virginica"
"106" 7.6 3 6.6 2.1 "virginica"
"107" 4.9 2.5 4.5 1.7 "virginica"
"108" 7.3 2.9 6.3 1.8 "virginica"
"109" 6.7 2.5 5.8 1.8 "virginica"
"110" 7.2 3.6 6.1 2.5 "virginica"
"111" 6.5 3.2 5.1 2 "virginica"
"112" 6.4 2.7 5.3 1.9 "virginica"
"113" 6.8 3 5.5 2.1 "virginica"
"114" 5.7 2.5 5 2 "virginica"
"115" 5.8 2.8 5.1 2.4 "virginica"
"116" 6.4 3.2 5.3 2.3 "virginica"
"117" 6.5 3 5.5 1.8 "virginica"
"118" 7.7 3.8 6.7 2.2 "virginica"
"119" 7.7 2.6 6.9 2.3 "virginica"
"120" 6 2.2 5 1.5 "virginica"
"121" 6.9 3.2 5.7 2.3 "virginica"
"122" 5.6 2.8 4.9 2 "virginica"
"123" 7.7 2.8 6.7 2 "virginica"
"124" 6.3 2.7 4.9 1.8 "virginica"
"125" 6.7 3.3 5.7 2.1 "virginica"
"126" 7.2 3.2 6 1.8 "virginica"
"127" 6.2 2.8 4.8 1.8 "virginica"
"128" 6.1 3 4.9 1.8 "virginica"
"129" 6.4 2.8 5.6 2.1 "virginica"
"130" 7.2 3 5.8 1.6 "virginica"
"131" 7.4 2.8 6.1 1.9 "virginica"
"132" 7.9 3.8 6.4 2 "virginica"
"133" 6.4 2.8 5.6 2.2 "virginica"
"134" 6.3 2.8 5.1 1.5 "virginica"
"135" 6.1 2.6 5.6 1.4 "virginica"
"136" 7.7 3 6.1 2.3 "virginica"
"137" 6.3 3.4 5.6 2.4 "virginica"
"138" 6.4 3.1 5.5 1.8 "virginica"
"139" 6 3 4.8 1.8 "virginica"
"140" 6.9 3.1 5.4 2.1 "virginica"
"141" 6.7 3.1 5.6 2.4 "virginica"
"142" 6.9 3.1 5.1 2.3 "virginica"
"143" 5.8 2.7 5.1 1.9 "virginica"
"144" 6.8 3.2 5.9 2.3 "virginica"
"145" 6.7 3.3 5.7 2.5 "virginica"
"146" 6.7 3 5.2 2.3 "virginica"
"147" 6.3 2.5 5 1.9 "virginica"
"148" 6.5 3 5.2 2 "virginica"
"149" 6.2 3.4 5.4 2.3 "virginica"
"150" 5.9 3 5.1 1.8 "virginica"
