吳恩達 machine learning 編程作業 python實現 ex8

# -*- coding: utf-8 -*-
"""
Created on Sat Jul  4 01:40:18 2020

@author: cheetah023
"""

import numpy as np
import scipy.io as sci
import matplotlib.pyplot as plt

#函數定義
def estimateGaussian(X):
    """Estimate the per-feature Gaussian parameters of a data set.

    Parameters
    ----------
    X : array_like, shape (m, n)
        One example per row, one feature per column.

    Returns
    -------
    mu : numpy.ndarray, shape (n,)
        Mean of every column of ``X``.
    sigma2 : numpy.ndarray, shape (n,)
        Variance of every column of ``X``, normalised by ``m``
        (not ``m - 1``).
    """
    X = np.asarray(X)
    # Unpacking two values keeps the requirement that X is two-dimensional.
    m, _ = X.shape
    mu = np.sum(X, axis=0) / m
    # Broadcasting subtracts each column's mean from that column in one step.
    sigma2 = np.sum((X - mu) ** 2, axis=0) / m
    return mu, sigma2
def multivariateGaussian(X, mu, sigma2):
    """Evaluate a multivariate Gaussian density at every row of ``X``.

    Parameters
    ----------
    X : array_like, shape (m, k)
        Points at which the density is evaluated, one per row.
    mu : array_like with k elements
        Mean of the distribution; any shape is accepted and flattened.
    sigma2 : array_like
        Either a (k, k) covariance matrix or a vector of k variances
        (1-D, or a single row/column), which is treated as the diagonal
        of the covariance matrix.

    Returns
    -------
    numpy.ndarray, shape (m, 1)
        Density value for each row of ``X``.
    """
    X = np.asarray(X, dtype=float)
    mu = np.ravel(mu)
    sigma2 = np.asarray(sigma2, dtype=float)
    k = mu.size

    # A vector of variances describes a diagonal covariance matrix.
    if sigma2.ndim == 1 or (sigma2.ndim == 2 and 1 in sigma2.shape):
        sigma2 = np.diag(sigma2.ravel())

    # Normalising constant: (2*pi)^(-k/2) * |Sigma|^(-1/2).
    left = np.power(2 * np.pi, -(k / 2))
    left = left * np.power(np.linalg.det(sigma2), -(1 / 2))

    # The inverse does not depend on the row, so compute it only once.
    sigma_inv = np.linalg.inv(sigma2)
    centered = X - mu
    # Row-wise quadratic form (x - mu)^T Sigma^-1 (x - mu).
    quad = np.sum(np.dot(centered, sigma_inv) * centered, axis=1)

    p = left * np.exp(-(1 / 2) * quad)
    return p.reshape(-1, 1)
def visualizeFit(X,  mu, sigma2):
    """Draw contour lines of the fitted Gaussian density on the current plot.

    The density is evaluated with multivariateGaussian on a 50 x 50 grid
    covering [0, 30] x [0, 30] and contoured at the levels
    10**-20, 10**-17, ..., 10**-2.  ``X`` is accepted but not used here.
    """
    # Both axes share the same 50 sample positions between 0 and 30.
    ticks = np.linspace(0, 30, 50)
    grid_x, grid_y = np.meshgrid(ticks, ticks)

    # One grid point per row so the density function can consume it.
    grid_points = np.column_stack([grid_x.flatten(), grid_y.flatten()])
    print('X_temp:', grid_points.shape)

    density = multivariateGaussian(grid_points, mu, sigma2)
    contour_levels = [10 ** exponent for exponent in range(-20, 0, 3)]
    plt.contour(grid_x, grid_y, np.reshape(density, grid_x.shape), contour_levels)
def selectThreshold(yval, pval, num_steps=1000):
    """Pick the probability threshold that maximises F1 on labelled data.

    An example is flagged as an anomaly when its probability is strictly
    below the threshold.  Candidate thresholds are ``num_steps`` evenly
    spaced values from ``min(pval)`` to ``max(pval)`` inclusive.

    Parameters
    ----------
    yval : array_like
        Ground-truth labels, 1 for anomaly and 0 for normal.  Any shape;
        it is flattened.  Other label values are ignored.
    pval : array_like
        Probability of each example, same number of elements as ``yval``.
    num_steps : int, optional
        Number of candidate thresholds to try (default 1000).

    Returns
    -------
    bestEpsilon : float
        Smallest candidate threshold that reaches the best F1 score
        (0.0 if no candidate produces a true positive).
    bestF1 : float
        The best F1 score found (0.0 if no true positive is ever found).

    Raises
    ------
    ValueError
        If ``pval`` is empty or its size differs from ``yval``.
    """
    labels = np.ravel(yval)
    probs = np.ravel(pval)
    if probs.size == 0:
        raise ValueError('pval must contain at least one probability')
    if labels.size != probs.size:
        raise ValueError('yval and pval must have the same number of elements')

    is_anomaly = labels == 1
    is_normal = labels == 0

    bestEpsilon = 0.0
    bestF1 = 0.0
    # Flattened input makes min/max scalars, so every epsilon is a scalar too.
    for epsilon in np.linspace(probs.min(), probs.max(), num_steps):
        flagged = probs < epsilon
        tp = np.count_nonzero(flagged & is_anomaly)
        if tp == 0:
            # Without true positives F1 is 0 and cannot beat the current
            # best; skipping also avoids dividing by zero below.
            continue
        fp = np.count_nonzero(flagged & is_normal)
        fn = np.count_nonzero((probs >= epsilon) & is_anomaly)

        prec = tp / (tp + fp)
        rec = tp / (tp + fn)
        F1 = 2 * (prec * rec) / (prec + rec)
        if F1 > bestF1:
            bestEpsilon = float(epsilon)
            bestF1 = F1
    return bestEpsilon, bestF1
# Part 1: Load Example Dataset
# The .mat file is read for the training set X and the labelled
# cross-validation set (Xval, yval).
data = sci.loadmat('ex8data1.mat')
X = data['X']
Xval = data['Xval']
yval = data['yval']
plt.figure(0)
plt.scatter(X[:, 0], X[:, 1], marker='x', c='b')
plt.xlabel('Latency (ms)')
plt.ylabel('Throughput (mb/s)')

# Part 2: Estimate the dataset statistics
# Fit a per-feature Gaussian and overlay its density contours on the scatter.
mu, sigma2 = estimateGaussian(X)
p = multivariateGaussian(X, mu, sigma2)
visualizeFit(X, mu, sigma2)

# Part 3: Find Outliers
# Choose the threshold on the cross-validation set, then apply it to X.
pval = multivariateGaussian(Xval, mu, sigma2)
epsilon, F1 = selectThreshold(yval, pval)
print('Best epsilon found using cross-validation:', epsilon)
print('Best F1 on Cross Validation Set:', F1)
print('(you should see a value epsilon of about 8.99e-05)')
print('(you should see a Best F1 value of  0.875000)\n')

# p has one column, so the first index array holds the outlier row numbers.
outliers = np.where(p < epsilon)
# Hollow circles keep the underlying 'x' markers visible.
plt.scatter(X[outliers[0], 0], X[outliers[0], 1], marker='o',
            facecolors='none', edgecolor='r')

# Part 4: Multidimensional Outliers
# Same pipeline on the second data set.
data = sci.loadmat('ex8data2.mat')
X = data['X']
Xval = data['Xval']
yval = data['yval']
mu, sigma2 = estimateGaussian(X)
p = multivariateGaussian(X, mu, sigma2)
pval = multivariateGaussian(Xval, mu, sigma2)
epsilon, F1 = selectThreshold(yval, pval)
print('for ex8data2.mat:')
print('Best epsilon found using cross-validation:', epsilon)
print('Best F1 on Cross Validation Set:', F1)
print('(you should see a value epsilon of about 1.38e-18)')
print('(you should see a Best F1 value of 0.615385)')
# np.sum reduces the whole (m, 1) mask to one number; the builtin sum would
# add row by row and print a one-element array instead of a count.
print('# Outliers found:', int(np.sum(p < epsilon)))

# Display the figure when the file is run as a plain script.
plt.show()



運行結果:

X_temp: (2500, 2)
Best epsilon found using cross-validation: [8.99985263e-05]
Best F1 on Cross Validation Set: 0.8750000000000001
(you should see a value epsilon of about 8.99e-05)
(you should see a Best F1 value of  0.875000)

for ex8data2.mat:
Best epsilon found using cross-validation: [1.3786075e-18]
Best F1 on Cross Validation Set: 0.6153846153846154
(you should see a value epsilon of about 1.38e-18)
(you should see a Best F1 value of 0.615385)
# Outliers found: [117]

總結:

1、在實現selectThreshold的時候本來想用矩陣的邏輯運算來實現,沒成功。還是使用的兩個for循環,不過看着還行邏輯比較清楚

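2、在計算多變量高斯分佈multivariateGaussian()時,使用的是這個公式(其中 k 為特徵維度,Σ 為協方差矩陣):

$$p(x;\mu,\Sigma)=\frac{1}{(2\pi)^{k/2}\,|\Sigma|^{1/2}}\exp\left(-\frac{1}{2}(x-\mu)^{T}\Sigma^{-1}(x-\mu)\right)$$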

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章