Neural Networks Learning
machine learning
吳恩達機器學習編程練習4,用反向傳播神經網絡識別手寫數字。數據集由5000個20*20分辨率的手寫數字圖片組成。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.io as io
import scipy.misc
import scipy.optimize as opt
import numpy.linalg as lina
# Load the training examples (X) and their labels (y) from the exercise .mat file.
data1 = io.loadmat('D:/python/practise/sample/machine-learning-ex4/data/ex4data1.mat')
X, y = data1['X'], data1['y']
# Load the weight matrices shipped with the exercise (presumably pre-trained -- verify).
data2 = io.loadmat('D:/python/practise/sample/machine-learning-ex4/data/ex4weights.mat')
Theta_1, Theta_2 = data2['Theta1'], data2['Theta2']
# Prepend a column of ones to X so column 0 acts as the bias input.
X = np.insert(X, 0, 1, axis = 1)
1 Neural Networks
1.1 visualizing the data
def show_1_number(num):
    """Display row ``num`` of the module-level ``X`` as a 20x20 image.

    Args:
        num: Row index into ``X``.

    The array is handed to ``plt.imshow`` directly.  The original went
    through ``scipy.misc.toimage``, which no longer exists in current SciPy
    releases; ``imshow`` already scales a 2-D array to the colormap range.
    """
    # Skip column 0 (the inserted bias 1), fold the remaining 400 values into
    # 20x20 and transpose, as the original did (presumably the pixels are
    # stored column by column -- verify against the data set description).
    pixels = X[num, 1:].reshape(20, 20).T
    plt.figure(figsize=(3, 3))
    plt.imshow(pixels)
show_1_number(33)
# 完全參照Cowry5師傅的方法,自己實在懶得寫了
def plot_100_image(X):
    """Draw 100 randomly picked rows of ``X`` on a 10x10 grid of axes.

    Args:
        X: 2-D array whose rows each hold 400 pixel values (no bias column).

    Rows are picked with ``np.random.choice`` using its default arguments,
    so the same row can appear more than once.
    """
    picks = np.random.choice(np.arange(X.shape[0]), 100)
    chosen = X[picks, :]  # (100, 400)
    fig, ax_array = plt.subplots(nrows=10, ncols=10, sharey=True, sharex=True, figsize=(8, 8))
    # .flat walks the axes grid row by row, matching index 10 * row + column.
    for position, axis in enumerate(ax_array.flat):
        axis.matshow(chosen[position].reshape((20, 20)).T, cmap='gray_r')
    plt.xticks([])
    plt.yticks([])
    plt.show()
plot_100_image(X[:, 1: ])
1.3 feedforward and cost function
開始定義函數
def sigmoid(z):
    """Return the logistic function 1 / (1 + e^(-z)), element-wise for arrays."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator
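神經網絡的代價函數(無正則化)求值:$J(\Theta) = -\dfrac{1}{m}\sum_{i=1}^{m}\sum_{k=1}^{K}\left[y_k^{(i)}\log\left(h_\Theta(x^{(i)})_k\right) + \left(1-y_k^{(i)}\right)\log\left(1-h_\Theta(x^{(i)})_k\right)\right]$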
def h_forward(X, theta_1, theta_2):
    """Run the three-layer network forward over every row of ``X``.

    Args:
        X: Input matrix whose rows are examples, bias column already included.
        theta_1: Weights from the input layer to the hidden layer.
        theta_2: Weights from the hidden layer (plus bias) to the output layer.

    Returns:
        Output-layer activations, one row per example.
    """
    hidden = sigmoid(X.dot(theta_1.T))          # (m, 25) with the exercise weights
    hidden = np.insert(hidden, 0, 1, axis=1)    # prepend the bias unit -> (m, 26)
    return sigmoid(hidden.dot(theta_2.T))       # (m, 10)
y_test_1 = h_forward(X, Theta_1, Theta_2)
y_test_1[0]
array([1.12661530e-04, 1.74127856e-03, 2.52696959e-03, 1.84032321e-05,
9.36263860e-03, 3.99270267e-03, 5.51517524e-03, 4.01468105e-04,
6.48072305e-03, 9.95734012e-01])
y_test_1.shape
(5000, 10)
# Adjust y
y_adj = y - 1 # after subtracting 1 the labels line up with the column indices of h(x)
y_ser = pd.Series(y_adj.reshape(-1)) # flatten to 1-D so it can be wrapped in a Series
y_matrix = pd.get_dummies(y_ser).values # expand y into one indicator column per distinct label
def J_func(X, y_matrix, theta_1, theta_2):
    """Return the unregularised cross-entropy cost, averaged over the rows of ``X``.

    Args:
        X: Input matrix with the bias column included.
        y_matrix: Indicator matrix of the labels, same shape as the network output.
        theta_1: Input-to-hidden weights.
        theta_2: Hidden-to-output weights.

    Returns:
        The scalar cost.
    """
    h = h_forward(X, theta_1, theta_2)
    # A saturated sigmoid returns exactly 0.0 or 1.0; log(0) is -inf and
    # 0 * -inf is nan, which would poison the sum.  Keep h strictly inside (0, 1).
    tiny = np.finfo(float).eps
    h = np.clip(h, tiny, 1 - tiny)
    matrix = -(y_matrix * np.log(h) + (1 - y_matrix) * np.log(1 - h))
    # sum() without an axis adds up every element of the matrix
    j_value = matrix.sum() / len(X)
    return j_value
J_func(X, y_matrix, Theta_1, Theta_2)
0.2876291651613189
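神經網絡的代價函數(正則化)求值:$J_{reg}(\Theta) = J(\Theta) + \dfrac{\lambda}{2m}\left[\sum_{j}\sum_{k\ge 1}\left(\Theta_{jk}^{(1)}\right)^2 + \sum_{j}\sum_{k\ge 1}\left(\Theta_{jk}^{(2)}\right)^2\right]$
每一個 θ(除了截距項,即各矩陣的第一列)都要算到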
def J_func_reg(X, y_matrix, theta_1, theta_2, c=1):
    """Return the cross-entropy cost plus an L2 penalty on the non-bias weights.

    Args:
        X: Input matrix with the bias column included.
        y_matrix: Indicator matrix of the labels.
        theta_1: Input-to-hidden weights.
        theta_2: Hidden-to-output weights.
        c: Regularisation strength.

    Returns:
        The scalar regularised cost.
    """
    # Column 0 of each matrix holds the bias weights, which are not penalised.
    squared_weights = np.square(theta_1[:, 1:]).sum() + np.square(theta_2[:, 1:]).sum()
    penalty = squared_weights * c / (2 * len(X))
    return J_func(X, y_matrix, theta_1, theta_2) + penalty
J_func_reg(X, y_matrix, Theta_1, Theta_2)
0.38376985909092365
2 Backpropagation
from IPython.display import Image
Image(filename = 'C:/Users/dennis/backprop.png')
2.1 sigmoid gradient
def sigmoid_gradient(z):
    """Return the derivative of the sigmoid at ``z``: sigmoid(z) * (1 - sigmoid(z))."""
    activation = sigmoid(z)
    return activation * (1 - activation)
sigmoid_gradient(0) #test
0.25
2.2 random initialize
# Half-width of the interval the initial weights are drawn from.
INIT_EPSILON = 0.12
# rand() is scaled by 2 * INIT_EPSILON and shifted by -INIT_EPSILON, so the
# values are spread around zero within +/- INIT_EPSILON.
theta_1_init = np.random.rand(25, X.shape[1])*2*INIT_EPSILON - INIT_EPSILON
theta_2_init = np.random.rand(10,26)*2*INIT_EPSILON - INIT_EPSILON
2.3 Backpropagation
#單個樣本版本的前向傳播
def h_forward_multi(x, theta_1, theta_2):
    """Forward pass for one example given as a 1-D vector.

    Args:
        x: Single input row, bias entry included.
        theta_1: Input-to-hidden weights.
        theta_2: Hidden-to-output weights.

    Returns:
        Tuple ``(a_2, a_3)``: hidden activations with the bias unit prepended,
        and the output activations.
    """
    hidden = sigmoid(x.dot(theta_1.T))
    # No axis argument here: the array is 1-D, so np.insert just prepends the 1.
    hidden_with_bias = np.insert(hidden, 0, 1)
    output = sigmoid(hidden_with_bias.dot(theta_2.T))
    return hidden_with_bias, output
# adjust my y
y_mine = y.copy() # work on a copy, otherwise the original y would be modified
y_mine[y_mine == 10] = 0 # relabel 10 as 0 (presumably 10 encodes the digit zero -- verify)
y_mine_ser = pd.Series(y_mine.reshape(-1)) # flatten to 1-D so it can be wrapped in a Series
y_mine_matrix = pd.get_dummies(y_mine_ser).values # one indicator column per distinct label
y_mine_matrix.shape
(5000, 10)
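第w個樣本的cost函數爲:$J^{(w)} = -\sum_{k=1}^{K}\left[y_k^{(w)}\log a_k^{(3)} + \left(1-y_k^{(w)}\right)\log\left(1-a_k^{(3)}\right)\right]$
單個樣本的偏導數爲:$\dfrac{\partial J^{(w)}}{\partial \Theta_{ij}^{(l)}} = \delta_i^{(l+1)}\,a_j^{(l)}$
在輸出層的爲:$\delta^{(3)} = a^{(3)} - y^{(w)}$
在隱藏層的爲:$\delta^{(2)} = \left(\Theta^{(2)}\right)^{T}\delta^{(3)} \odot a^{(2)} \odot \left(1-a^{(2)}\right)$(再去掉偏置項對應的 $\delta_0^{(2)}$)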
def backprop_forloop(X, y_mine_matrix, theta_1_init, theta_2_init):
    """Compute the unregularised backpropagation gradients, one example at a time.

    Args:
        X: Input matrix with the bias column included; rows are examples.
        y_mine_matrix: Indicator matrix of the labels, one row per example.
        theta_1_init: Input-to-hidden weights.
        theta_2_init: Hidden-to-output weights.

    Returns:
        Tuple ``(D_1, D_2)`` of averaged gradients with the same shapes as
        ``theta_1_init`` and ``theta_2_init``.
    """
    # Size the accumulators from the weights themselves instead of the
    # literals (25, 401) / (10, 26), so other layer sizes work as well.
    Delta_1 = np.zeros(theta_1_init.shape)
    Delta_2 = np.zeros(theta_2_init.shape)
    for w, a_1 in enumerate(X):
        a_2, a_3 = h_forward_multi(a_1, theta_1_init, theta_2_init)
        # Output-layer error as a column vector.
        delta_3 = (a_3 - y_mine_matrix[w]).reshape((-1, 1))
        # Hidden-layer error; a_2 * (1 - a_2) is the sigmoid derivative
        # expressed through the activations.
        delta_2 = theta_2_init.T.dot(delta_3) * (a_2 * (1 - a_2)).reshape((-1, 1))
        # Drop the first entry: the bias unit has no delta.
        delta_2 = delta_2[1:]
        # Accumulate the gradient contributions of this example.
        Delta_1 = Delta_1 + delta_2.dot(a_1.reshape((1, -1)))
        Delta_2 = Delta_2 + delta_3.dot(a_2.reshape((1, -1)))
    D_1 = Delta_1 / len(X)
    D_2 = Delta_2 / len(X)
    return D_1, D_2
D_1, D_2 = backprop_forloop(X, y_mine_matrix, theta_1_init, theta_2_init)
2.4 gradient checking
# 扁平化並連接數據
def unrolling_data(data_1, data_2):
    """Flatten both arrays and join them, ``data_1`` first, into one 1-D vector."""
    return np.concatenate((data_1.ravel(), data_2.ravel()))
# 摺疊並拆開數據
def rolling_data(rolling_data):
    """Split a flat parameter vector back into its two weight matrices.

    Args:
        rolling_data: 1-D vector holding a 25x401 matrix followed by a
            10x26 matrix.

    Returns:
        Tuple ``(data_1, data_2)`` with shapes (25, 401) and (10, 26).
    """
    boundary = 25 * 401
    head, tail = rolling_data[:boundary], rolling_data[boundary:]
    return head.reshape((25, 401)), tail.reshape((10, 26))
補充一個輸入扁平化θ的代價函數版本
# 無正則化版本
def J_func_unrolltheta(theta_unrolling, X, y_matrix):
    """Unregularised cost for a flat parameter vector (the form an optimiser passes).

    Args:
        theta_unrolling: Both weight matrices flattened into one vector.
        X: Input matrix with the bias column included.
        y_matrix: Indicator matrix of the labels.

    Returns:
        The scalar cost from ``J_func``.
    """
    first_layer, second_layer = rolling_data(theta_unrolling)
    cost = J_func(X, y_matrix, first_layer, second_layer)
    return cost
# 加入正則化版本
def J_func_reg_unrolltheta(theta_unrolling, X, y_matrix, c = 1):
    """Regularised cost for a flat parameter vector.

    Args:
        theta_unrolling: Both weight matrices flattened into one vector.
        X: Input matrix with the bias column included.
        y_matrix: Indicator matrix of the labels.
        c: Regularisation strength.

    Returns:
        The scalar cost from ``J_func_reg``.
    """
    first_layer, second_layer = rolling_data(theta_unrolling)
    cost = J_func_reg(X, y_matrix, first_layer, second_layer, c)
    return cost
def generate_numgrad(theta_unrolling):
    """Approximate the gradient of the unregularised cost by central differences.

    Uses the module-level ``X`` and ``y_mine_matrix`` as the data set.

    Args:
        theta_unrolling: Flat parameter vector (NumPy array); not modified.

    Returns:
        1-D array with one finite-difference estimate per parameter.
    """
    EPSILON = 1e-4
    gradApprox = np.zeros(len(theta_unrolling))
    # One working copy for the whole loop: each pass perturbs a single entry
    # and puts it back, instead of copying the full vector twice per parameter.
    probe = theta_unrolling.copy()
    for i, base in enumerate(theta_unrolling):
        probe[i] = base + EPSILON
        cost_plus = J_func_unrolltheta(probe, X, y_mine_matrix)
        probe[i] = base - EPSILON
        cost_minus = J_func_unrolltheta(probe, X, y_mine_matrix)
        probe[i] = base  # restore before moving to the next parameter
        gradApprox[i] = (cost_plus - cost_minus) / (2 * EPSILON)
    return gradApprox
def gradient_checking(theta_1, theta_2, D_1, D_2):
    """Return the relative difference between analytic and numerical gradients.

    Args:
        theta_1: Input-to-hidden weights at which the gradients were taken.
        theta_2: Hidden-to-output weights at which the gradients were taken.
        D_1: Analytic gradient for ``theta_1``.
        D_2: Analytic gradient for ``theta_2``.

    Returns:
        ``norm(analytic - numeric) / norm(analytic + numeric)``.
    """
    analytic = unrolling_data(D_1, D_2)
    numeric = generate_numgrad(unrolling_data(theta_1, theta_2))
    gap = lina.norm(analytic - numeric)
    scale = lina.norm(analytic + numeric)
    return gap / scale
Ng_diff = gradient_checking(theta_1_init, theta_2_init, D_1, D_2)
#diffrence = np.abs(D_unrolling - gradApprox)
#diffrence.max() # you should see a relative difference that is less than 1e-9 result = 5.939121416886906e-11
# Evaluate the norm of the difference between two solutions.
# If you have a correct implementation, and assuming you used EPSILON = 0.0001
# in computeNumericalGradient.m, then diff below should be less than 1e-9
# diff = norm(numgrad-grad)/norm(numgrad+grad);
#Ng_diff = lina.norm(D_unrolling - gradApprox) / lina.norm(D_unrolling + gradApprox)
Ng_diff # 用吳恩達的方法算出的結果
8.953099337199837e-11
2.5 regularized neural network
#用for循環累加每個樣本的梯度
def backprop_gradient_forloop_reg(theta_unrolling, X, y_mine_matrix, c=1):
    """Compute the regularised gradient by looping over the examples one by one.

    Args:
        theta_unrolling: Flat parameter vector, split by ``rolling_data``.
        X: Input matrix with the bias column included; rows are examples.
        y_mine_matrix: Indicator matrix of the labels, one row per example.
        c: Regularisation strength.

    Returns:
        Both gradient matrices flattened into one vector by ``unrolling_data``.
    """
    theta_1, theta_2 = rolling_data(theta_unrolling)
    Delta_1 = np.zeros((25, 401))
    Delta_2 = np.zeros((10, 26))
    for w,t in enumerate(X):
        a_1 = t # 1 * 401
        a_2, a_3 = h_forward_multi(a_1, theta_1, theta_2)
        delta_3 = (a_3 - y_mine_matrix[w]).reshape((-1,1)) # 10 * 1
        delta_2 = theta_2.T.dot(delta_3)*((a_2*(1-a_2)).reshape((-1,1))) # 26 * 1
        delta_2 = delta_2[1:] # drop delta_0: the bias unit has no delta, 25 * 1
        # accumulate the gradient for all the examples
        Delta_1 = Delta_1 + delta_2.dot(a_1.reshape((1,401))) # 25 * 401
        Delta_2 = Delta_2 + delta_3.dot(a_2.reshape((1,26))) # 10 * 26
    # set the first column of each theta copy to 0 so the bias weights add no penalty
    reg_1 = theta_1.copy()
    reg_1[:, 0] = 0
    reg_2 = theta_2.copy()
    reg_2[:, 0] = 0
    m = len(X)
    D_1_reg = Delta_1 / m + (c/m) * reg_1
    D_2_reg = Delta_2 / m + (c/m) * reg_2
    return unrolling_data(D_1_reg, D_2_reg)
用矩陣乘法代替for循環,運算速度明顯加快
#矩陣版的前向傳播
def h_forward_multi_matrix(x, theta_1, theta_2):
    """Forward pass over a whole matrix of examples, keeping the hidden layer.

    Args:
        x: Input matrix with the bias column included; rows are examples.
        theta_1: Input-to-hidden weights.
        theta_2: Hidden-to-output weights.

    Returns:
        Tuple ``(a_2, a_3)``: hidden activations with a leading bias column,
        and the output activations, each with one row per example.
    """
    hidden = sigmoid(x.dot(theta_1.T))                   # (m, 25) with the exercise weights
    hidden_with_bias = np.insert(hidden, 0, 1, axis=1)   # (m, 26)
    output = sigmoid(hidden_with_bias.dot(theta_2.T))    # (m, 10)
    return hidden_with_bias, output
#用矩陣方法求梯度總和
def backprop_gradient_matrix_reg(theta_unrolling, X, y_mine_matrix, c=1):
    """Compute the regularised gradient for all examples with matrix products.

    Args:
        theta_unrolling: Flat parameter vector, split by ``rolling_data``.
        X: Input matrix with the bias column included; rows are examples.
        y_mine_matrix: Indicator matrix of the labels, one row per example.
        c: Regularisation strength.

    Returns:
        Both gradient matrices flattened into one vector by ``unrolling_data``.
    """
    theta_1, theta_2 = rolling_data(theta_unrolling)
    m = len(X)
    a_2, a_3 = h_forward_multi_matrix(X, theta_1, theta_2)
    delta_3 = a_3 - y_mine_matrix
    # Back-propagate through theta_2, then drop column 0: the bias unit has no delta.
    delta_2 = (delta_3.dot(theta_2) * (a_2 * (1 - a_2)))[:, 1:]
    # Each matrix product sums the per-example outer products in one step.
    summed_pairs = (
        (theta_1, delta_2.T.dot(X)),
        (theta_2, delta_3.T.dot(a_2)),
    )
    gradients = []
    for weights, summed in summed_pairs:
        # Zero the first column of a copy so the bias weights add no penalty.
        penalty = weights.copy()
        penalty[:, 0] = 0
        gradients.append(summed / m + (c / m) * penalty)
    return unrolling_data(gradients[0], gradients[1])
theta_init_unrolling = unrolling_data(theta_1_init, theta_2_init)
D_unrolling_reg = backprop_gradient_matrix_reg(theta_init_unrolling, X, y_mine_matrix, c=1)
# 加入正則項後,生成數值梯度也要用正則化的代價函數算
def generate_numgrad_reg(theta_unrolling, c=1):
    """Approximate the gradient of the regularised cost by central differences.

    Uses the module-level ``X`` and ``y_mine_matrix`` as the data set.

    Args:
        theta_unrolling: Flat parameter vector (NumPy array); not modified.
        c: Regularisation strength passed to the cost function.

    Returns:
        1-D array with one finite-difference estimate per parameter.
    """
    EPSILON = 1e-4
    gradApprox = np.zeros(len(theta_unrolling))
    # One working copy for the whole loop: each pass perturbs a single entry
    # and puts it back, instead of copying the full vector twice per parameter.
    probe = theta_unrolling.copy()
    for i, base in enumerate(theta_unrolling):
        probe[i] = base + EPSILON
        cost_plus = J_func_reg_unrolltheta(probe, X, y_mine_matrix, c)
        probe[i] = base - EPSILON
        cost_minus = J_func_reg_unrolltheta(probe, X, y_mine_matrix, c)
        probe[i] = base  # restore before moving to the next parameter
        gradApprox[i] = (cost_plus - cost_minus) / (2 * EPSILON)
    return gradApprox
def gradient_checking_reg(theta_unrolling, D_unrolling_reg, c=1):
    """Return the relative difference between analytic and numerical regularised gradients.

    Args:
        theta_unrolling: Flat parameter vector at which the gradient was taken.
        D_unrolling_reg: Flat analytic gradient to be checked.
        c: Regularisation strength used for the numerical gradient.

    Returns:
        ``norm(analytic - numeric) / norm(analytic + numeric)``.
    """
    numeric = generate_numgrad_reg(theta_unrolling, c)
    gap = lina.norm(D_unrolling_reg - numeric)
    scale = lina.norm(D_unrolling_reg + numeric)
    return gap / scale
Ng_diff_reg = gradient_checking_reg(theta_init_unrolling, D_unrolling_reg, c=1)
Ng_diff_reg
9.170318454686764e-11
2.6 learning parameters
def tnc_training(theta, x, y, c):
    """Minimise the regularised cost with SciPy's TNC method.

    Args:
        theta: Flat initial parameter vector.
        x: Input matrix with the bias column included.
        y: Indicator matrix of the labels.
        c: Regularisation strength.

    Returns:
        The result object returned by ``opt.minimize``.
    """
    solver_options = {'maxiter' : 400}
    return opt.minimize(
        fun=J_func_reg_unrolltheta,
        x0=theta,
        args=(x, y, c),
        method='TNC',
        jac=backprop_gradient_matrix_reg,
        options=solver_options,
    )
res = tnc_training(theta_init_unrolling, X, y_mine_matrix, 1.28) # lambda用1.28
res
fun: 0.36464137778688066
jac: array([-2.16633907e-04, 4.44557739e-08, 1.08284677e-07, ...,
3.33837037e-05, -1.29912542e-04, 2.92099707e-05])
message: 'Max. number of function evaluations reached'
nfev: 400
nit: 28
status: 3
success: False
x: array([ 5.07021100e-01, 1.73655367e-04, 4.22987020e-04, ...,
-1.23295909e+00, -6.60098833e-01, -1.39914331e+00])
def h_forward_more(theta_unrolling, X):
    """Return the network output for ``X`` given a flat parameter vector.

    Args:
        theta_unrolling: Both weight matrices flattened into one vector.
        X: Input matrix with the bias column included; rows are examples.

    Returns:
        Output-layer activations, one row per example.
    """
    theta_1, theta_2 = rolling_data(theta_unrolling)
    # The forward pass is the same computation as h_forward, so reuse it
    # rather than keeping a second copy of the layer-by-layer code.
    return h_forward(X, theta_1, theta_2)
# Predicted class for each row = index of the largest output unit.
y_predict = h_forward_more(res.x, X).argmax(axis=1)
y_mine = y_mine.reshape(-1) # flatten the labels to 1-D so the comparison below is element-wise
(y_predict == y_mine).mean() # fraction of matching rows, measured on the same X that was used for training
0.9928
3 Visualizing the hidden layer
X.shape
(5000, 401)
def visual_hidden(theta, x_sample):
    """Show, for one sample, the element-wise product with each hidden unit's weights.

    Args:
        theta: Flat parameter vector; only the first weight matrix is used.
        x_sample: One input row with the bias entry included.

    The 25 resulting rows are drawn as 20x20 images on a 5x5 grid.  The
    arrays go to ``imshow`` directly: ``scipy.misc.toimage``, which the
    original used, no longer exists in current SciPy releases, and
    ``imshow`` scales each 2-D array to the colormap range on its own.
    """
    theta_1 = rolling_data(theta)[0]
    # The sample row broadcasts across every row of theta_1; column 0 (bias)
    # is then dropped, leaving 400 values per hidden unit.
    display_data = (x_sample * theta_1)[:, 1:]
    fig, axes = plt.subplots(5, 5, sharex=True, sharey=True, figsize=(6, 6))
    # axes.flat walks the grid row by row, pairing axis k with hidden unit k.
    for axis, unit in zip(axes.flat, display_data):
        axis.imshow(unit.reshape((20, 20)).T)
    plt.subplots_adjust(wspace=0, hspace=0)
visual_hidden(res.x, X[600])
用下一章ex5交叉驗證的思路選擇 λ
First, shuffle the original dataset, and then split the shuffled dataset into a training set and a cross-validation set with a ratio of 7:3.
def randomly_split_dataset(X, y_matrix, ratio_train):
    """Shuffle the examples and split them into a training and a cross-validation part.

    Args:
        X: Feature matrix, one row per example.
        y_matrix: Label matrix with the same number of rows as ``X``.
        ratio_train: Fraction of the rows that goes into the training part.

    Returns:
        Tuple ``(X_train, y_train_matrix, X_cv, y_cv_matrix)``.
    """
    # Take the feature/label boundary from X itself instead of the literal 401,
    # so inputs with any number of feature columns are split correctly.
    n_features = X.shape[1]
    # Glue features and labels together so one permutation keeps rows paired.
    shuffled = np.random.permutation(np.c_[X, y_matrix])
    n_train = int(len(X) * ratio_train)
    data_train, data_cv = shuffled[:n_train], shuffled[n_train:]
    random_X_train = data_train[:, :n_features]
    random_y_train_matrix = data_train[:, n_features:]
    random_X_cv = data_cv[:, :n_features]
    random_y_cv_matrix = data_cv[:, n_features:]
    return random_X_train, random_y_train_matrix, random_X_cv, random_y_cv_matrix
用各個 λ 值進行訓練,得出各個模型,再計算 $J_{train}$ 與 $J_{cv}$:
def single_lambda_learn(theta_init_unrolling, X_train, y_train_matrix, X_cv, y_cv_matrix, c):
    """Train with one regularisation strength and report both unregularised costs.

    Args:
        theta_init_unrolling: Flat initial parameter vector.
        X_train: Training inputs.
        y_train_matrix: Training label matrix.
        X_cv: Cross-validation inputs.
        y_cv_matrix: Cross-validation label matrix.
        c: Regularisation strength used during training only.

    Returns:
        Tuple ``(J_train, J_cv)``.
    """
    fitted = tnc_training(theta_init_unrolling, X_train, y_train_matrix, c).x
    # Both errors are evaluated without the penalty term.
    train_cost = J_func_unrolltheta(fitted, X_train, y_train_matrix)
    cv_cost = J_func_unrolltheta(fitted, X_cv, y_cv_matrix)
    return train_cost, cv_cost
def more_lambda_learn(theta_init_unrolling, X_train, y_train_matrix, X_cv, y_cv_matrix, lambda_vec):
    """Train once per value in ``lambda_vec`` and collect the resulting costs.

    Args:
        theta_init_unrolling: Flat initial parameter vector.
        X_train: Training inputs.
        y_train_matrix: Training label matrix.
        X_cv: Cross-validation inputs.
        y_cv_matrix: Cross-validation label matrix.
        lambda_vec: Regularisation strengths to try.

    Returns:
        Tuple ``(J_train, J_cv)`` of arrays, ordered like ``lambda_vec``.
    """
    cost_pairs = [
        single_lambda_learn(theta_init_unrolling, X_train, y_train_matrix, X_cv, y_cv_matrix, c)
        for c in lambda_vec
    ]
    # Transpose the list of (train, cv) pairs into two sequences.
    train_costs, cv_costs = zip(*cost_pairs)
    return np.array(train_costs), np.array(cv_costs)
X_train, y_train_matrix, X_cv, y_cv_matrix = randomly_split_dataset(X, y_mine_matrix, ratio_train = 0.7)
def gen_2times_lambdaVec(start):
    """Return ``[0, start, 2*start, 4*start, ...]`` with twelve entries in total.

    Args:
        start: First non-zero value; each later entry doubles the previous one.
    """
    vec = [0, start]
    value = start
    for _ in range(10):
        value = value * 2
        vec.append(value)
    return vec
times_lambdaVec = gen_2times_lambdaVec(0.01)
times_lambdaVec
[0, 0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1.28, 2.56, 5.12, 10.24]
J_train, J_cv = more_lambda_learn(theta_init_unrolling, X_train, y_train_matrix, X_cv, y_cv_matrix, times_lambdaVec)
D:\Program Files (x86)\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: RuntimeWarning: invalid value encountered in multiply
This is separate from the ipykernel package so we can avoid doing imports until
D:\Program Files (x86)\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: RuntimeWarning: divide by zero encountered in log
This is separate from the ipykernel package so we can avoid doing imports until
def lambda_curve(J_train, J_cv, lambda_vec):
    """Plot training and cross-validation cost against the regularisation strength.

    Args:
        J_train: Training costs, one per entry of ``lambda_vec``.
        J_cv: Cross-validation costs, one per entry of ``lambda_vec``.
        lambda_vec: Regularisation strengths used on the x axis.
    """
    plt.figure(figsize=(9, 6))
    series = ((J_train, 'b', 'J_train'), (J_cv, 'g', 'J_cv'))
    for costs, colour, tag in series:
        plt.plot(lambda_vec, costs, color=colour, label=tag)
    plt.xlabel('lambda')
    plt.ylabel('error')
    plt.grid(True)
    plt.title('Lambda Curve', fontsize=14)
    plt.legend()
lambda_curve(J_train, J_cv, times_lambdaVec)
λ 取 1.28 時 $J_{cv}$ 值最小,泛化能力最好
實驗識別自己手寫的數字圖片
# PIL's Image module is imported as `im`; the name `Image` is already taken by
# the IPython.display import earlier in the notebook.
from PIL import Image as im
# Open the three self-written digit images from disk.
img_5 = im.open('D:/python/practise/sample/machine-learning-ex4/handwrite/5.png')
img_6 = im.open('D:/python/practise/sample/machine-learning-ex4/handwrite/6.png')
img_9 = im.open('D:/python/practise/sample/machine-learning-ex4/handwrite/9.png')
# Show the three images side by side.
fig, ax = plt.subplots(1, 3, figsize = (10, 3))
ax[0].imshow(img_5)
ax[1].imshow(img_6)
ax[2].imshow(img_9)
<matplotlib.image.AxesImage at 0x23b021fa320>
# Flatten each image into one row and stack the three rows into a matrix.
test = np.array([np.array(img_5).ravel(), np.array(img_6).ravel(), np.array(img_9).ravel()])
# Prepend the bias column of ones, as was done for X.
test = np.insert(test, 0, 1, axis = 1)
# NOTE(review): the pixels enter the network exactly as PIL decoded them. Nothing
# here checks that the images are 20x20 single-channel, that their value range
# matches X, or that they are laid out the way X rows are (the display code
# transposes X rows with .T) -- confirm, since a mismatch would lead to wrong predictions.
h_forward_more(res.x, test).argmax(axis = 1) # uses the training result from section 2.6
array([0, 9, 8], dtype=int64)
泛化性能實在是差!!!