# Reference:
# 機器學習實戰 (Machine Learning in Action)
'''
@version: 0.0.1
@Author: tqrs
@dev: python3 vscode
@Date: 2019-11-12 12:40:12
@LastEditTime: 2019-11-12 21:04:40
@FilePath: \\機器學習實戰\\13-PCA\\PCA.py
@Description: 在低維下,數據更容易進行處理,相關特徵可能在數據中更明確地顯示出來。PCA降維是把數據從原來的座標系轉換到了新的座標系,新座標系的選擇是由數據本身決定的。
'''
import numpy as np
import matplotlib.pyplot as plt
def loadDataSet(fileName=r'13-PCA\testSet.txt'):
    """
    Load a whitespace-delimited numeric text file into a matrix.

    Every non-blank line becomes one row. Each whitespace-separated token on
    a line is converted with ``float``, so any spelling ``float`` accepts
    (including ``NaN``) is allowed.

    Keyword Arguments:
        fileName {str} -- path of the text file to read
            (default: the testSet.txt sample in the 13-PCA folder)

    Returns:
        numpy.matrix -- one row per non-blank line of the file
    """
    with open(fileName) as fr:
        # Blank lines (typically a trailing newline at the end of the file)
        # would produce an empty row and make the row list ragged, so they
        # are skipped.
        rows = [
            [float(token) for token in line.split()]
            for line in fr
            if line.strip()
        ]
    # np.mat was removed in NumPy 2.0; np.asmatrix is the same conversion and
    # exists in every NumPy version.
    return np.asmatrix(rows)
def pca(dataMat, topNfeat=9999999):
    """
    Reduce a data set with principal component analysis (PCA).

    Steps:
        1. Subtract the per-feature mean.
        2. Compute the covariance matrix of the centered data.
        3. Compute the eigenvalues and eigenvectors of the covariance matrix.
        4. Sort the eigenvalues from largest to smallest.
        5. Keep the eigenvectors that belong to the topNfeat largest
           eigenvalues.
        6. Project the centered data onto those eigenvectors.

    Arguments:
        dataMat -- data set with one sample per row and one feature per
            column; a numpy.matrix or anything np.asmatrix accepts

    Keyword Arguments:
        topNfeat {int} -- number of components to keep (default: {9999999});
            a value larger than the number of features keeps all of them

    Returns:
        tuple -- (lowDDataMat, reconMat), where lowDDataMat is the centered
            data projected onto the kept eigenvectors and reconMat is that
            projection mapped back into the original feature space with the
            mean added back
    """
    # Coerce to a matrix so that `*` below is always a matrix product, even
    # when the caller passes a plain ndarray or a nested list.
    dataMat = np.asmatrix(dataMat)
    meanVals = np.mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals

    # rowvar=False: columns are the variables, rows are the observations.
    covMat = np.cov(meanRemoved, rowvar=False)

    # np.mat was removed in NumPy 2.0; np.asmatrix is the same conversion.
    eigVals, eigVects = np.linalg.eig(np.asmatrix(covMat))

    # argsort is ascending, so reverse it and take the first topNfeat indices
    # to get the largest eigenvalues first.
    eigValInd = np.argsort(eigVals)[::-1][:topNfeat]
    redEigVects = np.asmatrix(eigVects)[:, eigValInd]

    lowDDataMat = meanRemoved * redEigVects
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat
def test_pca():
    """
    Plot the sample data set against its one-component PCA reconstruction.

    Loads the test set, reduces it to a single component with pca(), prints
    the shape of the reduced matrix, and shows a scatter plot with the
    original points drawn as triangles and the reconstructed points drawn as
    red circles.
    """
    samples = loadDataSet(r'13-PCA\testSet.txt')
    reduced, rebuilt = pca(samples, 1)
    print('lowDMat shape:', reduced.shape)

    def column(matrix, index):
        # Flatten one matrix column into a 1-D array that scatter() accepts.
        return matrix[:, index].flatten().A[0]

    axes = plt.figure().add_subplot(111)
    axes.scatter(column(samples, 0), column(samples, 1), marker='^', s=90)
    axes.scatter(column(rebuilt, 0), column(rebuilt, 1),
                 marker='o', s=50, c='red')
    plt.show()
def replaceNanWithMean(fileName=r'13-PCA\secom.data'):
    """
    Load a data set and replace every NaN with the mean of its column.

    The mean of a column is computed from its non-NaN entries only. A column
    that contains nothing but NaN has no usable mean; it is filled with 0.0
    so that no NaN is left in the returned matrix.

    Keyword Arguments:
        fileName {str} -- path of the data file handed to loadDataSet
            (default: the secom.data file in the 13-PCA folder)

    Returns:
        the matrix returned by loadDataSet, modified in place so that it
        contains no NaN entries
    """
    datMat = loadDataSet(fileName)
    numFeat = np.shape(datMat)[1]
    for i in range(numFeat):
        # Work on a flat copy of the column; datMat itself is only written to
        # in the assignment below.
        column = np.asarray(datMat[:, i]).ravel()
        missing = np.isnan(column)
        if not missing.any():
            continue
        present = column[~missing]
        # An all-NaN column would otherwise yield a NaN "mean" (plus a
        # RuntimeWarning) and stay NaN.
        meanVal = present.mean() if present.size else 0.0
        datMat[np.nonzero(missing)[0], i] = meanVal
    return datMat
if __name__ == '__main__':
    # Script entry point: print the eigenvalues of the covariance matrix of
    # the NaN-cleaned data set.
    dataMat = replaceNanWithMean()
    meanVals = np.mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals
    # rowvar=False: columns are the variables, rows are the observations.
    covMat = np.cov(meanRemoved, rowvar=False)
    # np.mat was removed in NumPy 2.0; np.asmatrix is the same conversion.
    eigVals, eigVects = np.linalg.eig(np.asmatrix(covMat))
    print(eigVals)