Algorithm: Principal Component Analysis (PCA) for Dimensionality Reduction of High-Dimensional Data

Data preprocessing: standardization (feature scaling):

https://en.wikipedia.org/wiki/Feature_scaling

Before we implement PCA, we will need to do some data preprocessing. In this assessment, some of these steps will be implemented by you, and others we will take care of. However, when you are working on real-world problems, you will need to do all of these steps yourself!

The preprocessing steps we will perform are:

  1. Convert the unsigned integer 8 (uint8) encoding of the pixels to floating-point numbers between 0 and 1.
  2. Subtract the mean μ from each image.
  3. Scale each dimension of each image by 1/σ, where σ is the standard deviation (a sketch of all three steps is given below).
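
A minimal sketch of these three steps in NumPy (assuming `images` is the uint8 MNIST pixel array loaded further below; the variable names here are only illustrative):

import numpy as np

# 1. convert uint8 pixel values to floats in the range [0, 1]
X = images.reshape(-1, 28 * 28).astype(np.float64) / 255.

# 2. subtract the per-pixel mean from each image
mu = X.mean(axis=0)

# 3. divide each pixel dimension by its standard deviation,
#    guarding against zero-variance pixels (e.g. always-black corners)
std = X.std(axis=0)
std[std == 0] = 1.
X_normalized = (X - mu) / std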

1. PCA

Now we will implement PCA. Before we do that, let's pause for a moment and think about the steps for performing PCA. Assume that we are performing PCA on some dataset X for M principal components. We then need to perform the following steps, which we break into parts:

  1. Data normalization (normalize).
  2. Find the eigenvalues and corresponding eigenvectors of the covariance matrix S, sorted in descending order of the eigenvalues (eig).

After these steps, we can then compute the projection and reconstruction of the data onto the space spanned by the top M eigenvectors.

Recall that the principal basis vector is the eigenvector associated with the largest eigenvalue of the covariance matrix.
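
Written out with the conventions used in the code below: for zero-mean data X of shape (N, D), the covariance matrix is S = (1/N)XᵀX; the projection matrix onto the subspace spanned by the columns of a basis B of shape (D, M) is P = B(BᵀB)⁻¹Bᵀ; and the reconstruction of the data is X̃ = (PXᵀ)ᵀ = XP, since P is symmetric.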

 

Code:

# PACKAGE: DO NOT EDIT THIS CELL
import numpy as np
import timeit

# PACKAGE: DO NOT EDIT THIS CELL
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from ipywidgets import interact

from load_data import load_mnist

MNIST = load_mnist()
images, labels = MNIST['data'], MNIST['target']

# GRADED FUNCTION: DO NOT EDIT THIS LINE

def normalize(X):
    """Normalize the given dataset X
    Args:
        X: ndarray, dataset
    
    Returns:
        (Xbar, mean, std): tuple of ndarray, Xbar is the normalized dataset
        with mean 0 and standard deviation 1; mean and std are the 
        mean and standard deviation respectively.
    
    Note:
        You will encounter dimensions where the standard deviation is
        zero, for those when you do normalization the normalized data
        will be NaN. Handle this by setting using `std = 1` for those 
        dimensions when doing normalization.
    """
    mu = np.mean(X, axis=0, keepdims=True)    # per-dimension mean of X
    std = np.std(X, axis=0, keepdims=True)    # per-dimension standard deviation
    std_filled = std.copy()
    std_filled[std == 0] = 1.                 # avoid dividing by zero on constant dimensions
    Xbar = (X - mu) / std_filled              # normalized data
    return Xbar, mu, std

def eig(S):
    """Compute the eigenvalues and corresponding eigenvectors 
        for the covariance matrix S.
    Args:
        S: ndarray, covariance matrix
    
    Returns:
        (eigvals, eigvecs): ndarray, the eigenvalues and eigenvectors

    Note:
        the eigenvals and eigenvecs should be sorted in descending
        order of the eigen values
    """
    eVals, eVecs = np.linalg.eig(S)
    order = np.argsort(eVals)[::-1] # sort the eigenvals in descending order.
    eVals = eVals[order]
    eVecs = eVecs[:, order]
    return (eVals, eVecs)

def projection_matrix(B):
    """Compute the projection matrix onto the space spanned by `B`
    Args:
        B: ndarray of dimension (D, M), the basis for the subspace
    
    Returns:
        P: the projection matrix
    """
    P = B @ np.linalg.inv(B.T @ B) @ B.T
    return P
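
# (Optional sanity check, not part of the graded assignment.) A projection matrix
# should be symmetric and idempotent; a quick check on a random, hypothetical basis:
_B = np.random.randn(5, 2)
_P = projection_matrix(_B)
assert np.allclose(_P, _P.T)       # P is symmetric
assert np.allclose(_P @ _P, _P)    # P is idempotent: projecting twice changes nothing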

def PCA(X, num_components):
    """
    Args:
        X: ndarray of size (N, D), where D is the dimension of the data,
           and N is the number of datapoints
        num_components: the number of principal components to use.
    Returns:
        X_reconstruct: ndarray of the reconstruction
        of X from the first `num_components` principal components.
    """
    # your solution should take advantage of the functions you have implemented above.
    N, D = X.shape
    X_normalized, mu, std = normalize(X)
    # covariance matrix of the normalized (zero-mean) data
    S = (X_normalized.T @ X_normalized) / N
    eig_vals, eig_vecs = eig(S)
    # basis of the principal subspace: the top `num_components` eigenvectors
    onb = eig_vecs[:, :num_components]
    # projection matrix P has dimension (D, D)
    P = projection_matrix(onb)
    # project the data onto the principal subspace
    # (X is assumed to be normalized already; the function is called with Xbar below)
    X_projection = P @ X.T
    return X_projection.T

## Some preprocessing of the data
NUM_DATAPOINTS = 1000
X = (images.reshape(-1, 28 * 28)[:NUM_DATAPOINTS]) / 255.
Xbar, mu, std = normalize(X)

for num_component in range(1, 20):
    from sklearn.decomposition import PCA as SKPCA
    # We can compute a standard solution given by scikit-learn's implementation of PCA
    pca = SKPCA(n_components=num_component, svd_solver='full')
    sklearn_reconst = pca.inverse_transform(pca.fit_transform(Xbar))
    reconst = PCA(Xbar, num_component)
    np.testing.assert_almost_equal(reconst, sklearn_reconst)
    print(np.square(reconst - sklearn_reconst).sum())

Result: the reconstruction error relative to scikit-learn's PCA is effectively zero for every number of components. (The complex dtype with zero imaginary part comes from np.linalg.eig, which does not know the covariance matrix is symmetric; np.linalg.eigh would return purely real output.)

 

(8.5153870005e-24+0j)
(8.09790151532e-24+0j)
(9.61487939311e-24+0j)
(6.39164394758e-24+0j)
(1.19817697147e-23+0j)
(9.18939009489e-24+0j)
(2.46356799263e-23+0j)
(2.04450491509e-23+0j)
(2.35281327024e-23+0j)
(2.33297802189e-22+0j)
(9.45193136857e-23+0j)
(9.82734807213e-23+0j)
(1.596514124e-22+0j)
(7.20916435378e-23+0j)
(2.9098190907e-23+0j)
(3.7462168164e-23+0j)
(3.22053322424e-23+0j)
(2.71427239921e-23+0j)
(1.11240190546e-22+0j)

 

Now we calculate the mean squared error (MSE) of the reconstruction for different numbers of principal components:

def mse(predict, actual):
    """Helper function for computing the mean squared error (MSE)"""
    return np.square(predict - actual).sum(axis=1).mean()

loss = []
reconstructions = []
# iterate over different number of principal components, and compute the MSE
for num_component in range(1, 100):
    reconst = PCA(Xbar, num_component)
    error = mse(reconst, Xbar)
    reconstructions.append(reconst)
    print('n = {:d}, reconstruction_error = {:f}'.format(num_component, error))
    loss.append((num_component, error))

reconstructions = np.asarray(reconstructions)
reconstructions = reconstructions * std + mu # "unnormalize" the reconstructed image
loss = np.asarray(loss)

import pandas as pd
# create a table showing the number of principal components and MSE
pd.DataFrame(loss).head()

fig, ax = plt.subplots()
ax.plot(loss[:,0], loss[:,1]);
ax.axhline(100, linestyle='--', color='r', linewidth=2)
ax.xaxis.set_ticks(np.arange(1, 100, 5));
ax.set(xlabel='num_components', ylabel='MSE', title='MSE vs number of principal components');

@interact(image_idx=(0, 1000))
def show_num_components_reconst(image_idx):
    fig, ax = plt.subplots(figsize=(20., 20.))
    actual = X[image_idx]
    # concatenate the actual and reconstructed images as large image before plotting it
    x = np.concatenate([actual[np.newaxis, :], reconstructions[:, image_idx]])
    ax.imshow(np.hstack(x.reshape(-1, 28, 28)[np.arange(10)]),
              cmap='gray');
    ax.axvline(28, color='orange', linewidth=2)

@interact(i=(0, 10))
def show_pca_digits(i=1):
    """Show the i th digit and its reconstruction"""
    plt.figure(figsize=(4,4))
    actual_sample = X[i].reshape(28,28)
    reconst_sample = (reconst[i, :] * std + mu).reshape(28, 28)
    plt.imshow(np.hstack([actual_sample, reconst_sample]), cmap='gray')
    plt.show()

 

 

 

2. PCA for high-dimensional datasets

 

Sometimes, the dimensionality of our dataset may be larger than the number of samples we have. In that case, performing PCA with the implementation above can be inefficient, because it requires the eigendecomposition of a D×D covariance matrix. Instead, as mentioned in the lectures, we can implement PCA in a more efficient manner, which we call "PCA for high-dimensional data" (PCA_high_dim).

Below are the steps for performing PCA on a high-dimensional dataset (why this works is explained after the list):

  1. Compute the N×N matrix S' = (1/N)XXᵀ instead of the D×D covariance matrix.
  2. Find its eigenvalues and corresponding eigenvectors, sorted in descending order (eig).
  3. Map the top eigenvectors back into data space: the columns of U = XᵀV span the principal subspace.
  4. Project the data onto that subspace and reconstruct it, exactly as before.
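
Why this works: if v is an eigenvector of S' = (1/N)XXᵀ with eigenvalue λ, then left-multiplying (1/N)XXᵀv = λv by Xᵀ gives (1/N)XᵀX(Xᵀv) = λ(Xᵀv), so Xᵀv is an eigenvector of the original covariance matrix S = (1/N)XᵀX with the same eigenvalue. When N ≪ D we therefore only have to eigendecompose an N×N matrix. The recovered basis U = XᵀV is in general not orthonormal, but projection_matrix above handles that through the (BᵀB)⁻¹ factor.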

 

# GRADED FUNCTION: DO NOT EDIT THIS LINE
### PCA for high dimensional datasets

def PCA_high_dim(X, n_components):
    """Compute PCA for small sample size but high-dimensional features. 
    Args:
        X: ndarray of size (N, D), where D is the dimension of the sample,
           and N is the number of samples
        n_components: the number of principal components to use.
    Returns:
        X_reconstruct: (N, D) ndarray, the reconstruction
        of X from the first `n_components` principal components.
    """
    N, D = X.shape
    S_prime = (X @ X.T) / N
    code_prime, onb_prime = eig(S_prime)
    code_prime = code_prime[:n_components]
    onb_prime = onb_prime[:, :n_components]
    # map the eigenvectors back into data space: U spans the principal subspace
    U = X.T @ onb_prime # (D, N) @ (N, n_components)
    P = projection_matrix(U)
    X_projection = P @ X.T
    return X_projection.T

Test case: both implementations should produce (almost) the same reconstruction.

np.testing.assert_almost_equal(PCA(Xbar, 2), PCA_high_dim(Xbar, 2))

Running time analysis:

Now let's compare the running time of PCA and PCA_high_dim.

Tips for running benchmarks or computationally expensive code:

When you have a computation that takes a non-negligible amount of time, try separating the code that produces the output from the code that analyzes the results (e.g. plotting the results, computing statistics of the results). That way you don't have to recompute everything when you only want to produce more analysis.

The next cell includes a function that records the time taken to execute a function f by repeating it `repeat` times. You do not need to modify it, but you can use it to compare the running time of any functions you are interested in.

 

def time(f, repeat=10):
    times = []
    for _ in range(repeat):
        start = timeit.default_timer()
        f()
        stop = timeit.default_timer()
        times.append(stop-start)
    return np.mean(times), np.std(times)

times_mm0 = []
times_mm1 = []

# iterate over datasets of different size
for datasetsize in np.arange(4, 784, step=20):
    XX = Xbar[:datasetsize] # select the first `datasetsize` samples in the dataset
    # record the running time for computing X.T @ X
    mu, sigma = time(lambda : XX.T @ XX)
    times_mm0.append((datasetsize, mu, sigma))
    
    # record the running time for computing X @ X.T
    mu, sigma = time(lambda : XX @ XX.T)
    times_mm1.append((datasetsize, mu, sigma))
    
times_mm0 = np.asarray(times_mm0)
times_mm1 = np.asarray(times_mm1)


fig, ax = plt.subplots()
ax.set(xlabel='size of dataset', ylabel='running time')
bar = ax.errorbar(times_mm0[:, 0], times_mm0[:, 1], times_mm0[:, 2], label="$X^T X$ (PCA)", linewidth=2)
ax.errorbar(times_mm1[:, 0], times_mm1[:, 1], times_mm1[:, 2], label="$X X^T$ (PCA_high_dim)", linewidth=2)
ax.legend()

 

Benchmark PCA against PCA_high_dim:

times0 = []
times1 = []

# iterate over datasets of different size
for datasetsize in np.arange(4, 784, step=100):
    XX = Xbar[:datasetsize]
    npc = 2
    mu, sigma = time(lambda : PCA(XX, npc), repeat=10)
    times0.append((datasetsize, mu, sigma))
    
    mu, sigma = time(lambda : PCA_high_dim(XX, npc), repeat=10)
    times1.append((datasetsize, mu, sigma))
    
times0 = np.asarray(times0)
times1 = np.asarray(times1)

fig, ax = plt.subplots()
ax.set(xlabel='number of datapoints', ylabel='run time')
ax.errorbar(times0[:, 0], times0[:, 1], times0[:, 2], label="PCA", linewidth=2)
ax.errorbar(times1[:, 0], times1[:, 1], times1[:, 2], label="PCA_high_dim", linewidth=2)
ax.legend();
