Andrew Ng, deeplearning.ai, Course 2 Week 2: Optimization

1、Code

import numpy as np
import matplotlib.pyplot as plt
import scipy.io
import math
import sklearn
import sklearn.datasets

import opt_utils #see the accompanying data package, or copy it from the bottom of this article
import testCase  #see the accompanying data package, or copy it from the bottom of this article

#%matplotlib inline #uncomment this line if you are using a Jupyter Notebook
plt.rcParams['figure.figsize'] = (7.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'


def update_parameters_with_gd(parameters, grads, learning_rate):
	L = len(parameters) // 2
	for l in range(L):
		parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * grads["dW" + str(l + 1)]
		parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * grads["db" + str(l + 1)]

	return parameters
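
#The update rule applied above (plain gradient descent), for each layer l = 1, ..., L:
#   W[l] := W[l] - learning_rate * dW[l]
#   b[l] := b[l] - learning_rate * db[l]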

def random_mini_batches(X, Y, mini_batch_size = 64, seed = 0):
	np.random.seed(seed)
	m = Y.shape[1]
	mini_batches = [ ]

	#Step 1: shuffle the columns of (X, Y) with the same permutation
	permutation = list(np.random.permutation(m))
	shuffled_X = X[:, permutation]  #fancy indexing over the columns of the np.array
	shuffled_Y = Y[:, permutation].reshape((1, m))

	#Step 2: partition (shuffled_X, shuffled_Y) into complete mini-batches
	num_complete_minibatches = m // mini_batch_size
	for k in range(num_complete_minibatches):
		mini_batch_X = shuffled_X[:, k * mini_batch_size:(k+1)*mini_batch_size]
		mini_batch_Y = shuffled_Y[:, k * mini_batch_size:(k+1)*mini_batch_size]
		mini_batch = (mini_batch_X, mini_batch_Y)
		mini_batches.append(mini_batch)
	#handle the end case (last mini-batch smaller than mini_batch_size)
	if m % mini_batch_size != 0:
		mini_batch_X = shuffled_X[:, mini_batch_size * num_complete_minibatches:]
		mini_batch_Y = shuffled_Y[:, mini_batch_size * num_complete_minibatches:]

		mini_batch = (mini_batch_X, mini_batch_Y)
		mini_batches.append(mini_batch)
	return mini_batches
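
#Example (hypothetical numbers): with m = 148 and mini_batch_size = 64, the loop above
#produces two complete mini-batches of 64 examples each, and the trailing `if` block
#appends one final mini-batch with the remaining 148 - 2*64 = 20 examples.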

def initialize_velocity(parameters):
	L = len(parameters) // 2
	v = { }
	for l in range(L):
		v["dW" + str(l + 1)] = np.zeros_like(parameters["W" + str(l + 1)])
		v["db" + str(l + 1)] = np.zeros_like(parameters["b" + str(l + 1)])

	return v

def update_parameters_with_momentum(parameters, grads, v, beta, learning_rate):
	L = len(parameters) // 2
	for l in range(L):
		v["dW" + str(l+1)] = beta * v["dW" + str(l+1)] + (1 - beta) * grads["dW" + str(l+1)]
		v["db" + str(l+1)] = beta * v["db" + str(l+1)] + (1 - beta) * grads["db"+ str(l+1)]

		parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate*v["dW" + str(l+1)]
		parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate*v["db" + str(l+1)]

	return parameters,v
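
#Momentum update applied above, for each layer l:
#   v_dW[l] := beta * v_dW[l] + (1 - beta) * dW[l]
#   W[l]    := W[l] - learning_rate * v_dW[l]
#(and analogously for b). With beta = 0 this reduces to plain gradient descent.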

def initialize_adam(parameters):
	L = len(parameters) //2 
	v = { }
	s = { }
	for l in range(L):
		v["dW" + str(l+1)] = np.zeros_like(parameters["W" + str(l+1)])
		v["db" + str(l+1)] = np.zeros_like(parameters["b" + str(l+1)])
		s["dW" + str(l+1)] = np.zeros_like(parameters["W" + str(l+1)])
		s["db" + str(l+1)] = np.zeros_like(parameters["b" + str(l+1)])
	return v,s
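
#v will track the exponentially weighted average of the gradients (the momentum term)
#and s the exponentially weighted average of the squared gradients (the RMSprop term).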

def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8):
	L = len(parameters) // 2
	v_corrected = { }
	s_corrected = { }
	for l in range(L):
		v["dW" + str(l+1)] = beta1*v["dW" + str(l+1)] + (1-beta1) * grads["dW"+ str(l+1)]
		v["db" + str(l+1)] = beta1*v["db" + str(l+1)] + (1-beta1) * grads["db" + str(l+1)]
		v_corrected["dW" + str(l+1)] = v["dW" + str(l+1)] / (1-np.power(beta1,t))
		v_corrected["db" + str(l+1)] = v["db" + str(l+1)] / (1-np.power(beta1,t))

		s["dW" + str(l+1)] = beta2*s["dW"+str(l+1)] +(1-beta2) * np.power(grads["dW" + str(l+1)],2)
		s["db" + str(l+1)] = beta2*s["db"+str(l+1)] +(1-beta2) * np.power(grads["db" + str(l+1)],2)
		s_corrected["dW" + str(l+1)] = s["dW" + str(l+1)] / (1-np.power(beta2,t))
		s_corrected["db" + str(l+1)] = s["db" + str(l+1)] / (1-np.power(beta2,t))

		parameters["W" + str(l+1)] = parameters["W" +str(l+1)] - learning_rate*(v_corrected["dW"+str(l+1)]/np.sqrt(s_corrected["dW"+str(l+1)] + epsilon))
		

		parameters["b" + str(l+1)] = parameters["b" +str(l+1)] - learning_rate*(v_corrected["db"+str(l+1)]/np.sqrt(s_corrected["db"+str(l+1)] + epsilon))
		assert(parameters["b1"].shape == (5,1))
	return parameters, v, s
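
#Adam update applied above, for each layer l, where t counts the update steps taken:
#   v := beta1 * v + (1 - beta1) * dW                          (first-moment estimate)
#   s := beta2 * s + (1 - beta2) * dW**2                       (second-moment estimate)
#   v_hat = v / (1 - beta1**t),  s_hat = s / (1 - beta2**t)    (bias correction)
#   W := W - learning_rate * v_hat / sqrt(s_hat + epsilon)
#(and analogously for b; epsilon is included only for numerical stability).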

def model(X, Y, layer_dims, optimizer, learning_rate = 0.0007,
		 mini_batch_size = 64, beta = 0.9, beta1 = 0.9, beta2 = 0.999,
		 epsilon = 1e-8, num_epochs = 10000, print_cost = True, is_plot = True):
	
	L = len(layer_dims)
	costs = []
	t = 0
	seed = 10

	parameters = opt_utils.initialize_parameters(layer_dims)

	if optimizer == "gd":
		pass
	elif optimizer == "momentum":
		v = initialize_velocity(parameters)
	elif optimizer == "adam":
		v,s = initialize_adam(parameters)
	else:
		print("optimizer parameter is not recognized, exiting.")
		exit(1)

	for i in range(num_epochs):
				
		#reshuffle the training set differently at every epoch
		seed = seed + 1
		mini_batches = random_mini_batches(X, Y, mini_batch_size, seed)

		
		for mini_batch in mini_batches:
						
			#select one mini-batch
			(mini_batch_X, mini_batch_Y) = mini_batch

			#forward propagation
			AL, cache = opt_utils.forward_propagation(mini_batch_X, parameters)

			#compute the cost
			cost = opt_utils.compute_cost(AL, mini_batch_Y)

			#backward propagation
			grads = opt_utils.backward_propagation(mini_batch_X, mini_batch_Y, cache)

			#update the parameters
			if optimizer == "gd":
				parameters = update_parameters_with_gd(parameters, grads, learning_rate)
			elif optimizer == "momentum":
				parameters,v = update_parameters_with_momentum(parameters, grads, v, beta, learning_rate)
			elif optimizer == "adam":
				t = t + 1  #Adam step counter, used for bias correction
				parameters, v, s = update_parameters_with_adam(parameters, grads, v, s, t, learning_rate, beta1, beta2, epsilon)
		if i % 100 == 0:
			costs.append(cost)
			if print_cost and i % 1000 == 0:
				print("第 %s 次迭代,cost = %s:"%(str(i), str(cost)))

	if is_plot:
		plt.plot(costs)
		plt.xlabel("epochs(per 100)")
		plt.ylabel("cost")
		plt.title("learning_rate = " + str(learning_rate))
		plt.show()

	return parameters
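
#A hedged usage sketch (not part of the original assignment): train the same 3-layer
#network with each optimizer and compare the printed costs. It assumes the course data
#package provides opt_utils.load_dataset(is_plot=...) as used in the unit tests below.
if __name__ == "__main__":
	train_X, train_Y = opt_utils.load_dataset(is_plot = False)
	layer_dims = [train_X.shape[0], 5, 2, 1]
	for opt in ("gd", "momentum", "adam"):
		print("optimizer = " + opt)
		model(train_X, train_Y, layer_dims, optimizer = opt, num_epochs = 1000, is_plot = False)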


2、Unit tests

from optimize_algorithm import *
from testCase import *

def line(s):
	print("="*10 + s + "="*10)
"""
line("test for update_parameters_with_gd")
parameters, grads, learning_rate = update_parameters_with_gd_test_case()
parameters = update_parameters_with_gd(parameters, grads, learning_rate)
print("W1 = " + str(parameters["W1"]))
print("b1 = " + str(parameters["b1"]))
print("W2 = " + str(parameters["W2"]))
print("b2 = " + str(parameters["b2"]))


line("test for random_mini_batches")
X_asses, Y_asses, mini_batch_size = random_mini_batches_test_case()
mini_batches = random_mini_batches(X_asses, Y_asses, mini_batch_size)
print("第1個mini_batch_X 的維度爲:",mini_batches[0][0].shape)
print("第1個mini_batch_Y 的維度爲:",mini_batches[0][1].shape)
print("第2個mini_batch_X 的維度爲:",mini_batches[1][0].shape)
print("第2個mini_batch_Y 的維度爲:",mini_batches[1][1].shape)
print("第3個mini_batch_X 的維度爲:",mini_batches[2][0].shape)
print("第3個mini_batch_Y 的維度爲:",mini_batches[2][1].shape)

line("test for initialize_velocity")
parameters = initialize_velocity_test_case()
v = initialize_velocity(parameters)
print('v["dW1"] = ' + str(v["dW1"]))
print('v["db1"] = ' + str(v["db1"]))
print('v["dW2"] = ' + str(v["dW2"]))
print('v["db2"] = ' + str(v["db2"]))

line("test for update_parameters_with_momentum")
parameters, grads, v = update_parameters_with_momentum_test_case()
parameters, v = update_parameters_with_momentum(parameters, grads, v, beta = 0.9, learning_rate= 0.01)
print("W1 = " + str(parameters["W1"]))
print("b1 = " + str(parameters["b1"]))
print("W2 = " + str(parameters["W2"]))
print("b2 = " + str(parameters["b2"]))
print('v["dW1"] = ' + str(v["dW1"]))
print('v["db1"] = ' + str(v["db1"]))
print('v["dW2"] = ' + str(v["dW2"]))
print('v["db2"] = ' + str(v["db2"]))
print('g["dW1"] = ' + str(grads["dW1"]))
print('g["db1"] = ' + str(grads["db1"]))
print('g["dW2"] = ' + str(grads["dW2"]))
print('g["db2"] = ' + str(grads["db2"]))

line("test for initialize_adam")
parameters = initialize_adam_test_case()
v,s = initialize_adam(parameters)
print('v["dW1"] = ' + str(v["dW1"])) 
print('v["db1"] = ' + str(v["db1"])) 
print('v["dW2"] = ' + str(v["dW2"])) 
print('v["db2"] = ' + str(v["db2"])) 
print('s["dW1"] = ' + str(s["dW1"])) 
print('s["db1"] = ' + str(s["db1"])) 
print('s["dW2"] = ' + str(s["dW2"])) 
print('s["db2"] = ' + str(s["db2"])) 

line("test for update_parameters_with_adam")
parameters,grads, v,s = update_parameters_with_adam_test_case()
parameters,v,s = update_parameters_with_adam(parameters,grads,v,s,t=2)
print("W1 = " + str(parameters["W1"]))
print("b1 = " + str(parameters["b1"]))
print("W2 = " + str(parameters["W2"]))
print("b2 = " + str(parameters["b2"]))
print('v["dW1"] = ' + str(v["dW1"])) 
print('v["db1"] = ' + str(v["db1"])) 
print('v["dW2"] = ' + str(v["dW2"])) 
print('v["db2"] = ' + str(v["db2"])) 
print('s["dW1"] = ' + str(s["dW1"])) 
print('s["db1"] = ' + str(s["db1"])) 
print('s["dW2"] = ' + str(s["dW2"])) 
print('s["db2"] = ' + str(s["db2"])) 


#train a 3-layer model with the Adam optimizer, then plot its decision boundary
train_X, train_Y = opt_utils.load_dataset(is_plot = True)
layer_dims = [train_X.shape[0], 5, 2, 1]
parameters = model(train_X, train_Y, layer_dims, optimizer = "adam", is_plot = True)


prediction = opt_utils.predict(train_X, train_Y, parameters)

plt.title("Model with gradient descent optimization")
axes = plt.gca()
axes.set_xlim([-1.5, 2.5])
axes.set_ylim([-1, 1.5])
opt_utils.plot_decision_boundary(lambda x :opt_utils.predict_dec(parameters, x.T), train_X, train_Y)
"""