Introduction
It has been a year since I started graduate school. Although I have been working on deep-learning projects the whole time, I have hardly ever dug into the underlying theory. My day-to-day work has mostly been taking other people's papers and engineering code, grabbing ready-made models such as GNN, RGIN and RDL, wiring them together with popular deep-learning frameworks like TensorFlow, and copying patterns without really understanding the details, the meaning of the parameters, or how the frameworks are structured. So I am using the holiday to catch up on the theoretical basics, which is how I found this purely theoretical, build-everything-by-hand gem of a book. [Click here to download the book and its companion code: 深度學習入門:基於Python的理論與實踐 (Deep Learning from Scratch)]
A study tip: pair the book with the CSDN open-course series <5天搞定深度學習入門系列>; the two together make working through the material far more enjoyable.
1. Create path_config.json
The goal is to configure the project paths in one place, so that later development does not run into path-related bugs.
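For reference, a minimal sketch of how the config file could be created (my addition; the only key the snippet below actually reads is root_path, and the path value is just a placeholder):

import json

config = {"root_path": "/path/to/your/project"}  # placeholder: point this at your project root
with open("../path_config.json", "w", encoding="utf-8") as f:
    json.dump(config, f, ensure_ascii=False, indent=4)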
import sys, os
import json
import numpy as np

with open(r"../path_config.json", "r", encoding="utf-8") as f:
    config = json.loads(f.read())
root_path = config["root_path"]
sys.path.append(root_path)  # add the project root to the import path
print("Initialization successful")
Initialization successful
2. Activation functions
# Activation functions

# Identity function: y = x (used for regression output layers)
def identity_function(x):
    return x

# Step function
def step_function(x):
    return np.array(x > 0, dtype=int)  # np.int is deprecated; use the builtin int

# Softmax for multi-class classification: returns the predicted probabilities
def softmax(x):
    if x.ndim == 2:
        x = x.T
        x = x - np.max(x, axis=0)
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T

    x = x - np.max(x)  # guard against overflow
    return np.exp(x) / np.sum(np.exp(x))

# ReLU: returns the activation f_x and its derivative d_x
def ReLU(x):
    f_x = np.maximum(0, x)
    d_x = f_x.copy()
    d_x[d_x > 0] = 1
    return f_x, d_x

def relu(x):
    return np.maximum(0, x)

def relu_grad(x):
    grad = np.zeros_like(x)  # np.zeros(x) would misread x as a shape; zeros_like is what we want
    grad[x >= 0] = 1
    return grad

# Sigmoid: activation sig, derivative sig*(1-sig)
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_grad(x):
    return (1.0 - sigmoid(x)) * sigmoid(x)

# tanh: returns the activation tan and its derivative 1 - tan**2
def tanha(x):
    tan = (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))
    return tan, 1 - np.square(tan)
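A quick sanity check on these helpers (my addition; it assumes the functions above are already defined in the session):

x = np.array([[0.3, 2.9, 4.0],
              [1.0, -1.0, 0.5]])

probs = softmax(x)
print(probs.sum(axis=1))                            # each row sums to 1.0

s = sigmoid(x)
print(np.allclose(sigmoid_grad(x), s * (1 - s)))    # True: derivative equals sig*(1-sig)

tan, dtan = tanha(x)
print(np.allclose(dtan, 1 - tan**2))                # True: derivative equals 1 - tanh(x)**2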
3. Loss functions
# Loss functions

y = np.array([0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0])  # predicted probabilities
t = np.array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0])                         # one-hot ground truth

# Mean squared error
def mean_squared_error(y, t):
    return 0.5 * np.sum((y - t)**2)
print("Mean squared error:", mean_squared_error(y, t))  # 0.09750000000000003

# Cross-entropy error
def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    # If the targets are one-hot vectors, convert them to class-label indices
    if t.size == y.size:
        t = t.argmax(axis=1)
    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size
print("Cross-entropy error:", cross_entropy_error(y, t))  # 0.510825457099338

# Softmax followed by cross-entropy (applied to y itself here, purely as a demonstration)
def softmax_loss(X, t):
    y = softmax(X)
    return cross_entropy_error(y, t)
print("softmax_loss error:", softmax_loss(y, t))
Mean squared error: 0.09750000000000003
Cross-entropy error: 0.510825457099338
softmax_loss error: 1.8194936854234711
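One detail worth spelling out (my addition, assuming the functions above are in scope): cross_entropy_error accepts either one-hot targets or integer class labels, because the t.size == y.size branch converts one-hot rows into argmax indices. Both forms give the same loss:

y_batch = np.array([[0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0],
                    [0.1, 0.05, 0.1, 0.0, 0.05, 0.1, 0.0, 0.6, 0.0, 0.0]])
t_onehot = np.array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 1, 0, 0]])
t_labels = np.array([2, 7])

print(cross_entropy_error(y_batch, t_onehot))   # same value...
print(cross_entropy_error(y_batch, t_labels))   # ...as with integer labels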
4. Gradients
# Gradients

# Numerical gradient: f is the target function, x is a NumPy array (make sure it holds floats)
def numerical_gradient(f, x):
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)  # array with the same shape as x
    # The book iterates over x.size (every element); using len(x) instead loops over the first
    # axis only, so for a 2-D array each x[idx] below is an entire row. That keeps the loop from
    # crashing on 2-D input, but the 2-D results further down are row-wise difference quotients,
    # not per-element gradients; the commented-out np.nditer version handles any shape correctly.
    for idx in range(len(x)):
        tmp_val = x[idx]
        # f(x+h)
        x[idx] = tmp_val + h
        fxh1 = f(x)
        # f(x-h)
        x[idx] = tmp_val - h
        fxh2 = f(x)
        grad[idx] = (fxh1 - fxh2) / (2*h)
        x[idx] = tmp_val  # restore the original value
    return grad

# The book's general version, which visits every element of x regardless of shape:
# def numerical_gradient(f, x):
#     h = 1e-4  # 0.0001
#     grad = np.zeros_like(x)
#     it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
#     while not it.finished:
#         idx = it.multi_index
#         tmp_val = x[idx]
#         x[idx] = float(tmp_val) + h
#         fxh1 = f(x)  # f(x+h)
#         x[idx] = tmp_val - h
#         fxh2 = f(x)  # f(x-h)
#         grad[idx] = (fxh1 - fxh2) / (2*h)
#         x[idx] = tmp_val  # restore the original value
#         it.iternext()
#     return grad

def function_2(x):
    return np.sum(x**2)

r = numerical_gradient(function_2, np.array([[3.0, 4.0], [2.0, 3.0], [2.0, 3.0]]))
print(r)
r = numerical_gradient(function_2, np.array([3.0, 4.0]))  # analytic gradient: array([6., 8.])
print(r)
r = numerical_gradient(function_2, np.array([[2.0, 3.0, 4.0], [2.0, 3.0, 2.0], [4.0, 5.0, 6.0]]))
print(r)
[[7.0001 7.0001]
 [5.0001 5.0001]
 [5.0001 5.0001]]
[6. 8.]
[[ 9.00015  9.00015  9.00015]
 [ 7.00015  7.00015  7.00015]
 [15.00015 15.00015 15.00015]]
Only the 1-D case matches the analytic gradient [6., 8.]. In the 2-D calls the loop perturbs a whole row at a time (and x[idx] is a view of that row), so each printed row is essentially the sum of that row's partial derivatives rather than the element-wise gradient. Use the np.nditer version above when you need the true gradient of a multi-dimensional array.
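As a small illustration of how the gradient is actually used (my addition, sticking to the 1-D case that the function above handles correctly), a plain gradient-descent loop that minimizes function_2:

def gradient_descent(f, init_x, lr=0.1, step_num=100):
    x = init_x.copy()
    for _ in range(step_num):
        grad = numerical_gradient(f, x)
        x -= lr * grad   # step against the gradient
    return x

print(gradient_descent(function_2, np.array([-3.0, 4.0]), lr=0.1, step_num=100))
# converges towards [0, 0], the minimum of sum(x**2)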
5. The neural network model
# Two-layer neural network
class TwoLayerNet():
    def __init__(self, layers=[784, 50, 10], seed=200, weight_init_std=0.01):  # seed is accepted but not used
        # Initialize the weights
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(layers[0], layers[1])
        self.params['b1'] = np.zeros(layers[1])
        self.params['W2'] = weight_init_std * np.random.randn(layers[1], layers[2])
        self.params['b2'] = np.zeros(layers[2])

    def predict(self, x):
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']
        # Forward pass
        z1 = sigmoid(np.dot(x, W1) + b1)
        y = softmax(np.dot(z1, W2) + b2)
        return y

    # x: input data, t: targets
    def loss(self, x, t):
        y = self.predict(x)
        return cross_entropy_error(y, t)  # cross-entropy loss

    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        t = np.argmax(t, axis=1)
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    # x: input data, t: targets
    def numerical_gradient(self, x, t):
        loss_W = lambda W: self.loss(x, t)
        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])
        return grads

    def gradient(self, x, t):  # analytic gradients via backpropagation (much faster than numerical_gradient)
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']
        grads = {}
        batch_num = x.shape[0]
        # forward
        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        y = softmax(a2)
        # backward
        dy = (y - t) / batch_num
        grads['W2'] = np.dot(z1.T, dy)
        grads['b2'] = np.sum(dy, axis=0)
        da1 = np.dot(dy, W2.T)
        dz1 = sigmoid_grad(a1) * da1
        grads['W1'] = np.dot(x.T, dz1)
        grads['b1'] = np.sum(dz1, axis=0)
        return grads

    def train(self, x_train, t_train, x_test, t_test, epochs=10000, batch_size=100, learning_rate=0.1):
        # Note: "epochs" here actually counts mini-batch iterations
        train_size = x_train.shape[0]
        train_loss_list = []
        train_acc_list = []
        test_acc_list = []
        iter_per_epoch = max(train_size / batch_size, 1)
        for i in range(epochs):
            batch_mask = np.random.choice(train_size, batch_size)
            x_batch = x_train[batch_mask]
            t_batch = t_train[batch_mask]
            # Compute the gradients: predict y from x, measure the loss against t, then backpropagate
            # grad = self.numerical_gradient(x_batch, t_batch)
            grad = self.gradient(x_batch, t_batch)
            # Update the parameters
            for key in ('W1', 'b1', 'W2', 'b2'):
                self.params[key] -= learning_rate * grad[key]
            loss = self.loss(x_batch, t_batch)
            train_loss_list.append(loss)
            if i % iter_per_epoch == 0:
                train_acc = self.accuracy(x_train, t_train)  # accuracy over the full training set
                test_acc = self.accuracy(x_test, t_test)     # accuracy over the full test set
                train_acc_list.append(train_acc)             # saved for the visualizations later
                test_acc_list.append(test_acc)
                print("train acc, test acc | " + str(train_acc) + ", " + str(test_acc))
        return train_loss_list, train_acc_list, test_acc_list
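Before training on MNIST, it is worth verifying that the backprop gradients from gradient() agree with numerical ones. The check below is my own addition (not in the original post); it uses a tiny network and batch so the slow numerical differentiation stays fast, and it uses the general np.nditer variant because the row-wise numerical_gradient from section 4 does not perturb individual elements of 2-D arrays:

def numerical_gradient_all(f, x):
    # General numerical gradient that visits every element of x
    h = 1e-4
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        idx = it.multi_index
        tmp_val = x[idx]
        x[idx] = tmp_val + h
        fxh1 = f(x)
        x[idx] = tmp_val - h
        fxh2 = f(x)
        grad[idx] = (fxh1 - fxh2) / (2 * h)
        x[idx] = tmp_val
        it.iternext()
    return grad

np.random.seed(0)
net = TwoLayerNet(layers=[4, 5, 3])                        # tiny network, chosen only for speed
x_dummy = np.random.rand(2, 4)                             # 2 samples, 4 features
t_dummy = np.array([[0.0, 0.0, 1.0], [1.0, 0.0, 0.0]])     # one-hot targets

grads_bp = net.gradient(x_dummy, t_dummy)
loss_W = lambda W: net.loss(x_dummy, t_dummy)
for key in ('W1', 'b1', 'W2', 'b2'):
    grad_num = numerical_gradient_all(loss_W, net.params[key])
    print(key, np.average(np.abs(grads_bp[key] - grad_num)))   # expect very small values, e.g. below 1e-6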
6. Main program
# Start training
import numpy as np
from data_set.mnist import load_mnist

(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)  # load the data
network = TwoLayerNet()
train_loss_list, train_acc_list, test_acc_list = network.train(x_train, t_train, x_test, t_test)
(10000, 784)
<class 'numpy.ndarray'>
train acc, test acc | 0.09863333333333334, 0.0958
train acc, test acc | 0.7864833333333333, 0.7901
train acc, test acc | 0.8757833333333334, 0.8787
train acc, test acc | 0.8993166666666667, 0.9018
train acc, test acc | 0.9092166666666667, 0.9125
train acc, test acc | 0.9160166666666667, 0.9172
train acc, test acc | 0.92075, 0.9241
train acc, test acc | 0.9245166666666667, 0.9264
train acc, test acc | 0.9291, 0.9308
train acc, test acc | 0.932, 0.9332
train acc, test acc | 0.9350833333333334, 0.9349
train acc, test acc | 0.9381166666666667, 0.9383
train acc, test acc | 0.9407666666666666, 0.9405
train acc, test acc | 0.9432166666666667, 0.9434
train acc, test acc | 0.9443333333333334, 0.9441
train acc, test acc | 0.9464666666666667, 0.9461
train acc, test acc | 0.9483833333333334, 0.9471
7. Visualizing the results
import matplotlib.pyplot as plt
%matplotlib inline
# Plot the accuracy curves
markers = {'train': 'o', 'test': 's'}
x = np.arange(len(train_acc_list))
plt.plot(x, train_acc_list, label='train acc')
plt.plot(x, test_acc_list, label='test acc', linestyle='--')
plt.xlabel("epochs")
plt.ylabel("accuracy")
plt.ylim(0, 1.0)
plt.legend(loc='lower right')
plt.show()
Accuracy curves
import matplotlib.pyplot as plt
%matplotlib inline
# Plot the training-loss curve (one value per mini-batch iteration)
x = np.arange(len(train_loss_list))
plt.plot(x, train_loss_list, label='train loss')
plt.xlabel("iterations")
plt.ylabel("loss")
plt.ylim(0, 3.0)
plt.legend(loc='lower right')
plt.show()
Training-loss curve
8. Changing the update rule
Plain gradient descent clearly oscillates back and forth as the gradient direction changes, which makes training inefficient, so this section swaps in different parameter-update rules. Final test accuracy for each (the update equations are sketched right after this list):
SGD: 0.9455
Momentum: 0.9719
AdaGrad: 0.9724
Adam (combining the two ideas above): 0.9652
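For reference, the update equations behind these numbers, written as small standalone helpers over dicts of parameters and gradients (my sketch of the standard formulas; the class below implements the same logic as update_* methods):

import numpy as np

def sgd_update(params, grads, lr=0.1):
    # Plain SGD: step straight down the gradient
    for k in params:
        params[k] -= lr * grads[k]

def momentum_update(params, grads, v, lr=0.1, momentum=0.9):
    # Momentum: v is a decaying velocity, so the parameters coast like a rolling ball
    for k in params:
        v[k] = momentum * v[k] - lr * grads[k]
        params[k] += v[k]

def adagrad_update(params, grads, h, lr=0.1):
    # AdaGrad: h accumulates squared gradients, shrinking the step for frequently-updated parameters
    for k in params:
        h[k] += grads[k] * grads[k]
        params[k] -= lr * grads[k] / (np.sqrt(h[k]) + 1e-7)

Adam keeps both a Momentum-style first moment and an AdaGrad-style second moment, with a bias-corrected learning rate; see update_adam in the class below.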
Code:
import sys, os
import json
import numpy as np

sys.path.append("../")  # add the project root to the import path
from src.relation_function import *  # the helper functions from the earlier sections live here
from data_set.mnist import load_mnist

(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)  # load the data
print(x_train.shape)
print("Initialization successful")
# Two-layer neural network with selectable update rules
class TwoLayerNet():
    def __init__(self, layers=[784, 50, 10], seed=200, weight_init_std=0.01, lr=0.001):  # seed is accepted but not used
        # Initialize the weights (scaled by 1/sqrt(n) this time instead of a fixed std)
        self.params = {}
        self.params['W1'] = np.random.randn(layers[0], layers[1]) / np.sqrt(layers[0])
        self.params['b1'] = np.zeros(layers[1])
        self.params['W2'] = np.random.randn(layers[1], layers[2]) / np.sqrt(layers[1])
        self.params['b2'] = np.zeros(layers[2])
        self.v = None        # velocity for Momentum (also reused as the second moment in Adam)
        self.h = None        # accumulated squared gradients for AdaGrad
        self.m = None        # first moment for Adam
        self.beta1 = 0.9
        self.beta2 = 0.999
        self.iter = 0
        self.lr = lr

    def predict(self, x):
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']
        # Forward pass
        z1 = sigmoid(np.dot(x, W1) + b1)
        y = softmax(np.dot(z1, W2) + b2)
        return y

    # x: input data, t: targets
    def loss(self, x, t):
        y = self.predict(x)
        return cross_entropy_error(y, t)  # cross-entropy loss

    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        t = np.argmax(t, axis=1)
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    # x: input data, t: targets
    def numerical_gradient(self, x, t):
        loss_W = lambda W: self.loss(x, t)
        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])
        return grads

    def gradient(self, x, t):  # analytic gradients via backpropagation
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']
        grads = {}
        batch_num = x.shape[0]
        # forward
        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        y = softmax(a2)
        # backward
        dy = (y - t) / batch_num
        grads['W2'] = np.dot(z1.T, dy)
        grads['b2'] = np.sum(dy, axis=0)
        da1 = np.dot(dy, W2.T)
        dz1 = sigmoid_grad(a1) * da1
        grads['W1'] = np.dot(x.T, dz1)
        grads['b1'] = np.sum(dz1, axis=0)
        return grads

    # Plain SGD update
    def update(self, x_batch, t_batch, learning_rate):
        # Compute the gradients: predict y from x, measure the loss against t, then backpropagate
        grad = self.gradient(x_batch, t_batch)
        # Update the parameters
        for key in self.params.keys():
            self.params[key] -= learning_rate * grad[key]

    # Momentum update: the parameters behave like a ball rolling over the loss surface
    def update_momentum(self, x_batch, t_batch, learning_rate, momentum=0.9):
        grad = self.gradient(x_batch, t_batch)
        if self.v is None:  # initialize the velocity
            self.v = {}
            for key, val in self.params.items():
                self.v[key] = np.zeros_like(val)
        # Update the parameters
        for key in self.params.keys():
            self.v[key] = momentum * self.v[key] - learning_rate * grad[key]
            self.params[key] += self.v[key]

    # AdaGrad update: shrinks the effective learning rate of frequently-updated parameters
    def update_adaGrad(self, x_batch, t_batch, learning_rate):
        grad = self.gradient(x_batch, t_batch)
        if self.h is None:  # initialize the squared-gradient accumulator
            self.h = {}
            for key, val in self.params.items():
                self.h[key] = np.zeros_like(val)
        # Update the parameters
        for key in self.params.keys():
            self.h[key] += grad[key] * grad[key]
            self.params[key] -= learning_rate * grad[key] / (np.sqrt(self.h[key]) + 1e-7)

    # Adam update: combines Momentum (first moment m) and AdaGrad-style scaling (second moment v).
    # Note that it reuses self.v, so do not mix Momentum and Adam within the same run.
    def update_adam(self, x_batch, t_batch):
        grads = self.gradient(x_batch, t_batch)
        if self.m is None:  # initialize both moment estimates
            self.m, self.v = {}, {}
            for key, val in self.params.items():
                self.m[key] = np.zeros_like(val)
                self.v[key] = np.zeros_like(val)
        self.iter += 1
        lr_t = self.lr * np.sqrt(1.0 - self.beta2**self.iter) / (1.0 - self.beta1**self.iter)
        # Update the parameters
        for key in self.params.keys():
            self.m[key] += (1 - self.beta1) * (grads[key] - self.m[key])
            self.v[key] += (1 - self.beta2) * (grads[key]**2 - self.v[key])
            self.params[key] -= lr_t * self.m[key] / (np.sqrt(self.v[key]) + 1e-7)

    def train(self, x_train, t_train, x_test, t_test, epochs=10000, batch_size=100, learning_rate=0.1):
        # Note: "epochs" here actually counts mini-batch iterations
        train_size = x_train.shape[0]
        train_loss_list = []
        train_acc_list = []
        test_acc_list = []
        iter_per_epoch = max(train_size / batch_size, 1)
        for i in range(epochs):
            start = (i * batch_size) % 60000   # 60000 = MNIST training-set size; sequential mini-batches
            end = start + batch_size
            # batch_mask = np.random.choice(train_size, batch_size)  # random sampling, as in section 5
            x_batch = x_train[start:end]
            t_batch = t_train[start:end]
            # Pick one update rule; the others are left commented out for comparison
            # self.update(x_batch, t_batch, learning_rate)
            # self.update_momentum(x_batch, t_batch, learning_rate)
            # self.update_adaGrad(x_batch, t_batch, learning_rate)
            self.update_adam(x_batch, t_batch)
            loss = self.loss(x_batch, t_batch)
            train_loss_list.append(loss)
            if i % iter_per_epoch == 0:
                train_acc = self.accuracy(x_train, t_train)  # accuracy over the full training set
                test_acc = self.accuracy(x_test, t_test)     # accuracy over the full test set
                train_acc_list.append(train_acc)             # saved for the visualizations later
                test_acc_list.append(test_acc)
                print("train acc, test acc | " + str(train_acc) + ", " + str(test_acc))
        return train_loss_list, train_acc_list, test_acc_list
# Start training
import numpy as np
from data_set.mnist import load_mnist

(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)  # load the data
network = TwoLayerNet()
train_loss_list, train_acc_list, test_acc_list = network.train(x_train, t_train, x_test, t_test)
train acc, test acc | 0.09751666666666667, 0.0974
train acc, test acc | 0.9031333333333333, 0.9093
train acc, test acc | 0.92455, 0.924
train acc, test acc | 0.9364166666666667, 0.935
train acc, test acc | 0.94445, 0.9398
train acc, test acc | 0.9502833333333334, 0.9455
train acc, test acc | 0.9552833333333334, 0.9489
train acc, test acc | 0.9593, 0.9514
train acc, test acc | 0.9628333333333333, 0.9543
train acc, test acc | 0.9657, 0.9566
train acc, test acc | 0.9684, 0.9587
train acc, test acc | 0.9706, 0.9603
train acc, test acc | 0.97255, 0.9619
train acc, test acc | 0.9742166666666666, 0.9626
train acc, test acc | 0.9759333333333333, 0.9641
train acc, test acc | 0.9771166666666666, 0.9643
train acc, test acc | 0.9785166666666667, 0.9651
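To reproduce the per-optimizer numbers listed at the top of this section, one option (my sketch, not part of the original post) is to train a fresh network per update rule, driving the update methods directly; it assumes TwoLayerNet and the MNIST arrays above are already in scope:

rules = {
    'SGD':      lambda net, xb, tb: net.update(xb, tb, learning_rate=0.1),
    'Momentum': lambda net, xb, tb: net.update_momentum(xb, tb, learning_rate=0.1),
    'AdaGrad':  lambda net, xb, tb: net.update_adaGrad(xb, tb, learning_rate=0.1),
    'Adam':     lambda net, xb, tb: net.update_adam(xb, tb),
}

batch_size = 100
iterations = 10000   # same budget as the runs above; reduce for a quicker, rougher comparison
for name, step in rules.items():
    net = TwoLayerNet()
    for i in range(iterations):
        start = (i * batch_size) % x_train.shape[0]
        xb = x_train[start:start + batch_size]
        tb = t_train[start:start + batch_size]
        step(net, xb, tb)
    print(name, net.accuracy(x_test, t_test))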