import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

# Load the iris dataset: 150 samples, 4 numeric features, 3 classes.
iris = load_iris()
X = iris.data    # feature matrix, shape (150, 4)
y = iris.target  # class labels in {0, 1, 2}

# Inspect the raw data. Bare expressions only display in a notebook,
# so print() them explicitly to keep this usable as a script.
print(X.shape)
print(pd.DataFrame(X).head())
# First rows of the feature matrix (captured notebook output, kept as a comment):
#      |  0  |  1  |  2  |  3  |
#   ---|-----|-----|-----|-----|
#    0 | 5.1 | 3.5 | 1.4 | 0.2 |
#    1 | 4.9 | 3.0 | 1.4 | 0.2 |
#    2 | 4.7 | 3.2 | 1.3 | 0.2 |
#    3 | 4.6 | 3.1 | 1.5 | 0.2 |
#    4 | 5.0 | 3.6 | 1.4 | 0.2 |
#    5 | 5.4 | 3.9 | 1.7 | 0.4 |
#    6 | 4.6 | 3.4 | 1.4 | 0.3 |
#    7 | 5.0 | 3.4 | 1.5 | 0.2 |
#    8 | 4.4 | 2.9 | 1.4 | 0.2 |
#    9 | 4.9 | 3.1 | 1.5 | 0.1 |
#   10 | 5.4 | 3.7 | 1.5 | 0.2 |
# Show the label vector alongside the features (print so it works as a script).
print(pd.DataFrame(y).head())

# Reduce the 4-dimensional features to 2 principal components.
# fit_transform fuses the separate fit() + transform() calls.
pca = PCA(n_components=2)
X_dr = pca.fit_transform(X)

# Variance carried by each retained component (absolute value, and as a
# fraction of the total variance of the original data).
print(pca.explained_variance_)
print(pca.explained_variance_ratio_)

# Scatter plot of the projected samples, one colour per iris species.
# Enumerating target_names avoids hard-coding the class indices [0, 1, 2].
colors = ['red', 'black', 'orange']
plt.figure()
for i, species in enumerate(iris.target_names):
    plt.scatter(X_dr[y == i, 0], X_dr[y == i, 1],
                alpha=.7, c=colors[i], label=species)
plt.legend()
plt.title('PCA of IRIS dataset')
plt.show()
import numpy as np

# Cumulative explained-variance curve: fit PCA with all components kept
# and plot how much of the total variance the first k components retain.
# This is the standard aid for choosing n_components.
pca_line = PCA().fit(X)
# Derive the component count from the fitted model instead of
# hard-coding [1, 2, 3, 4], so the code works for any feature count.
ks = np.arange(1, len(pca_line.explained_variance_ratio_) + 1)
plt.plot(ks, np.cumsum(pca_line.explained_variance_ratio_))
plt.xticks(ks)
plt.xlabel("number of components after dimension reduction")
plt.ylabel("cumulative explained variance")
plt.show()
# NOTE: For PCA dimensionality reduction, the information loss is judged
# mainly by comparing the variance retained after reduction against the
# total variance of the original data.