Introduction
It has been a year since I started graduate school. Although I have been working on deep-learning projects the whole time, I have hardly ever dug into the underlying theory. My day-to-day work has mostly been taking other people's papers and engineering code, grabbing ready-made models such as GNN, RGIN and RDL, wiring them together with popular deep-learning frameworks like TensorFlow, and copying patterns without really understanding the details, the meaning of the parameters, or how the frameworks are structured. So I am using the holiday to catch up on the theoretical basics, which is how I found this purely theoretical, build-everything-by-hand gem of a book. [Click here to download the book and its companion code: 深度學習入門:基於Python的理論與實踐 (Deep Learning from Scratch)]
A study tip: pair the book with the CSDN open-course series <5天搞定深度學習入門系列>; the two together make working through the material far more enjoyable.
1. Create path_config.json
The goal is to configure the project paths in one place, so that later development does not run into path-related bugs.
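For reference, a minimal sketch of how the config file could be created (my addition; the only key the snippet below actually reads is root_path, and the path value is just a placeholder):

import json

config = {"root_path": "/path/to/your/project"}  # placeholder: point this at your project root
with open("../path_config.json", "w", encoding="utf-8") as f:
    json.dump(config, f, ensure_ascii=False, indent=4)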
import sys, os
import json
import numpy as np

with open(r"../path_config.json", "r", encoding="utf-8") as f:
    config = json.loads(f.read())
root_path = config["root_path"]
sys.path.append(root_path)  # add the project root to the import path
print("Initialization successful")
Initialization successful
2. Activation functions
# Activation functions

# Identity function: y = x (used for regression output layers)
def identity_function(x):
    return x

# Step function
def step_function(x):
    return np.array(x > 0, dtype=int)  # np.int is deprecated; use the builtin int

# Softmax for multi-class classification: returns the predicted probabilities
def softmax(x):
    if x.ndim == 2:
        x = x.T
        x = x - np.max(x, axis=0)
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T

    x = x - np.max(x)  # guard against overflow
    return np.exp(x) / np.sum(np.exp(x))

# ReLU: returns the activation f_x and its derivative d_x
def ReLU(x):
    f_x = np.maximum(0, x)
    d_x = f_x.copy()
    d_x[d_x > 0] = 1
    return f_x, d_x

def relu(x):
    return np.maximum(0, x)

def relu_grad(x):
    grad = np.zeros_like(x)  # np.zeros(x) would misread x as a shape; zeros_like is what we want
    grad[x >= 0] = 1
    return grad

# Sigmoid: activation sig, derivative sig*(1-sig)
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_grad(x):
    return (1.0 - sigmoid(x)) * sigmoid(x)

# tanh: returns the activation tan and its derivative 1 - tan**2
def tanha(x):
    tan = (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))
    return tan, 1 - np.square(tan)
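A quick sanity check on these helpers (my addition; it assumes the functions above are already defined in the session):

x = np.array([[0.3, 2.9, 4.0],
              [1.0, -1.0, 0.5]])

probs = softmax(x)
print(probs.sum(axis=1))                            # each row sums to 1.0

s = sigmoid(x)
print(np.allclose(sigmoid_grad(x), s * (1 - s)))    # True: derivative equals sig*(1-sig)

tan, dtan = tanha(x)
print(np.allclose(dtan, 1 - tan**2))                # True: derivative equals 1 - tanh(x)**2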
3. Loss functions
# Loss functions

y = np.array([0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0])  # predicted probabilities
t = np.array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0])                         # one-hot ground truth

# Mean squared error
def mean_squared_error(y, t):
    return 0.5 * np.sum((y - t)**2)
print("Mean squared error:", mean_squared_error(y, t))  # 0.09750000000000003

# Cross-entropy error
def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    # If the targets are one-hot vectors, convert them to class-label indices
    if t.size == y.size:
        t = t.argmax(axis=1)
    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size
print("Cross-entropy error:", cross_entropy_error(y, t))  # 0.510825457099338

# Softmax followed by cross-entropy (applied to y itself here, purely as a demonstration)
def softmax_loss(X, t):
    y = softmax(X)
    return cross_entropy_error(y, t)
print("softmax_loss error:", softmax_loss(y, t))
Mean squared error: 0.09750000000000003
Cross-entropy error: 0.510825457099338
softmax_loss error: 1.8194936854234711
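One detail worth spelling out (my addition, assuming the functions above are in scope): cross_entropy_error accepts either one-hot targets or integer class labels, because the t.size == y.size branch converts one-hot rows into argmax indices. Both forms give the same loss:

y_batch = np.array([[0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0],
                    [0.1, 0.05, 0.1, 0.0, 0.05, 0.1, 0.0, 0.6, 0.0, 0.0]])
t_onehot = np.array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 1, 0, 0]])
t_labels = np.array([2, 7])

print(cross_entropy_error(y_batch, t_onehot))   # same value...
print(cross_entropy_error(y_batch, t_labels))   # ...as with integer labels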
4. Gradients
# Gradients

# Numerical gradient: f is the target function, x is a NumPy array (make sure it holds floats)
def numerical_gradient(f, x):
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)  # array with the same shape as x
    # The book iterates over x.size (every element); using len(x) instead loops over the first
    # axis only, so for a 2-D array each x[idx] below is an entire row. That keeps the loop from
    # crashing on 2-D input, but the 2-D results further down are row-wise difference quotients,
    # not per-element gradients; the commented-out np.nditer version handles any shape correctly.
    for idx in range(len(x)):
        tmp_val = x[idx]
        # f(x+h)
        x[idx] = tmp_val + h
        fxh1 = f(x)
        # f(x-h)
        x[idx] = tmp_val - h
        fxh2 = f(x)
        grad[idx] = (fxh1 - fxh2) / (2*h)
        x[idx] = tmp_val  # restore the original value
    return grad

# The book's general version, which visits every element of x regardless of shape:
# def numerical_gradient(f, x):
#     h = 1e-4  # 0.0001
#     grad = np.zeros_like(x)
#     it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
#     while not it.finished:
#         idx = it.multi_index
#         tmp_val = x[idx]
#         x[idx] = float(tmp_val) + h
#         fxh1 = f(x)  # f(x+h)
#         x[idx] = tmp_val - h
#         fxh2 = f(x)  # f(x-h)
#         grad[idx] = (fxh1 - fxh2) / (2*h)
#         x[idx] = tmp_val  # restore the original value
#         it.iternext()
#     return grad

def function_2(x):
    return np.sum(x**2)

r = numerical_gradient(function_2, np.array([[3.0, 4.0], [2.0, 3.0], [2.0, 3.0]]))
print(r)
r = numerical_gradient(function_2, np.array([3.0, 4.0]))  # analytic gradient: array([6., 8.])
print(r)
r = numerical_gradient(function_2, np.array([[2.0, 3.0, 4.0], [2.0, 3.0, 2.0], [4.0, 5.0, 6.0]]))
print(r)
[[7.0001 7.0001]
 [5.0001 5.0001]
 [5.0001 5.0001]]
[6. 8.]
[[ 9.00015  9.00015  9.00015]
 [ 7.00015  7.00015  7.00015]
 [15.00015 15.00015 15.00015]]
Only the 1-D case matches the analytic gradient [6., 8.]. In the 2-D calls the loop perturbs a whole row at a time (and x[idx] is a view of that row), so each printed row is essentially the sum of that row's partial derivatives rather than the element-wise gradient. Use the np.nditer version above when you need the true gradient of a multi-dimensional array.
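As a small illustration of how the gradient is actually used (my addition, sticking to the 1-D case that the function above handles correctly), a plain gradient-descent loop that minimizes function_2:

def gradient_descent(f, init_x, lr=0.1, step_num=100):
    x = init_x.copy()
    for _ in range(step_num):
        grad = numerical_gradient(f, x)
        x -= lr * grad   # step against the gradient
    return x

print(gradient_descent(function_2, np.array([-3.0, 4.0]), lr=0.1, step_num=100))
# converges towards [0, 0], the minimum of sum(x**2)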
5. The neural network model
# Two-layer neural network
class TwoLayerNet():
    def __init__(self, layers=[784, 50, 10], seed=200, weight_init_std=0.01):  # seed is accepted but not used
        # Initialize the weights
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(layers[0], layers[1])
        self.params['b1'] = np.zeros(layers[1])
        self.params['W2'] = weight_init_std * np.random.randn(layers[1], layers[2])
        self.params['b2'] = np.zeros(layers[2])

    def predict(self, x):
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']
        # Forward pass
        z1 = sigmoid(np.dot(x, W1) + b1)
        y = softmax(np.dot(z1, W2) + b2)
        return y

    # x: input data, t: targets
    def loss(self, x, t):
        y = self.predict(x)
        return cross_entropy_error(y, t)  # cross-entropy loss

    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        t = np.argmax(t, axis=1)
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    # x: input data, t: targets
    def numerical_gradient(self, x, t):
        loss_W = lambda W: self.loss(x, t)
        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])
        return grads

    def gradient(self, x, t):  # analytic gradients via backpropagation (much faster than numerical_gradient)
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']
        grads = {}
        batch_num = x.shape[0]
        # forward
        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        y = softmax(a2)
        # backward
        dy = (y - t) / batch_num
        grads['W2'] = np.dot(z1.T, dy)
        grads['b2'] = np.sum(dy, axis=0)
        da1 = np.dot(dy, W2.T)
        dz1 = sigmoid_grad(a1) * da1
        grads['W1'] = np.dot(x.T, dz1)
        grads['b1'] = np.sum(dz1, axis=0)
        return grads

    def train(self, x_train, t_train, x_test, t_test, epochs=10000, batch_size=100, learning_rate=0.1):
        # Note: "epochs" here actually counts mini-batch iterations
        train_size = x_train.shape[0]
        train_loss_list = []
        train_acc_list = []
        test_acc_list = []
        iter_per_epoch = max(train_size / batch_size, 1)
        for i in range(epochs):
            batch_mask = np.random.choice(train_size, batch_size)
            x_batch = x_train[batch_mask]
            t_batch = t_train[batch_mask]
            # Compute the gradients: predict y from x, measure the loss against t, then backpropagate
            # grad = self.numerical_gradient(x_batch, t_batch)
            grad = self.gradient(x_batch, t_batch)
            # Update the parameters
            for key in ('W1', 'b1', 'W2', 'b2'):
                self.params[key] -= learning_rate * grad[key]
            loss = self.loss(x_batch, t_batch)
            train_loss_list.append(loss)
            if i % iter_per_epoch == 0:
                train_acc = self.accuracy(x_train, t_train)  # accuracy over the full training set
                test_acc = self.accuracy(x_test, t_test)     # accuracy over the full test set
                train_acc_list.append(train_acc)             # saved for the visualizations later
                test_acc_list.append(test_acc)
                print("train acc, test acc | " + str(train_acc) + ", " + str(test_acc))
        return train_loss_list, train_acc_list, test_acc_list
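Before training on MNIST, it is worth verifying that the backprop gradients from gradient() agree with numerical ones. The check below is my own addition (not in the original post); it uses a tiny network and batch so the slow numerical differentiation stays fast, and it uses the general np.nditer variant because the row-wise numerical_gradient from section 4 does not perturb individual elements of 2-D arrays:

def numerical_gradient_all(f, x):
    # General numerical gradient that visits every element of x
    h = 1e-4
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        idx = it.multi_index
        tmp_val = x[idx]
        x[idx] = tmp_val + h
        fxh1 = f(x)
        x[idx] = tmp_val - h
        fxh2 = f(x)
        grad[idx] = (fxh1 - fxh2) / (2 * h)
        x[idx] = tmp_val
        it.iternext()
    return grad

np.random.seed(0)
net = TwoLayerNet(layers=[4, 5, 3])                        # tiny network, chosen only for speed
x_dummy = np.random.rand(2, 4)                             # 2 samples, 4 features
t_dummy = np.array([[0.0, 0.0, 1.0], [1.0, 0.0, 0.0]])     # one-hot targets

grads_bp = net.gradient(x_dummy, t_dummy)
loss_W = lambda W: net.loss(x_dummy, t_dummy)
for key in ('W1', 'b1', 'W2', 'b2'):
    grad_num = numerical_gradient_all(loss_W, net.params[key])
    print(key, np.average(np.abs(grads_bp[key] - grad_num)))   # expect very small values, e.g. below 1e-6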
6. Main program
# Start training
import numpy as np
from data_set.mnist import load_mnist

(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)  # load the data
network = TwoLayerNet()
train_loss_list, train_acc_list, test_acc_list = network.train(x_train, t_train, x_test, t_test)
(10000, 784)
<class 'numpy.ndarray'>
train acc, test acc | 0.09863333333333334, 0.0958
train acc, test acc | 0.7864833333333333, 0.7901
train acc, test acc | 0.8757833333333334, 0.8787
train acc, test acc | 0.8993166666666667, 0.9018
train acc, test acc | 0.9092166666666667, 0.9125
train acc, test acc | 0.9160166666666667, 0.9172
train acc, test acc | 0.92075, 0.9241
train acc, test acc | 0.9245166666666667, 0.9264
train acc, test acc | 0.9291, 0.9308
train acc, test acc | 0.932, 0.9332
train acc, test acc | 0.9350833333333334, 0.9349
train acc, test acc | 0.9381166666666667, 0.9383
train acc, test acc | 0.9407666666666666, 0.9405
train acc, test acc | 0.9432166666666667, 0.9434
train acc, test acc | 0.9443333333333334, 0.9441
train acc, test acc | 0.9464666666666667, 0.9461
train acc, test acc | 0.9483833333333334, 0.9471
7. Visualizing the results
import matplotlib.pyplot as plt
%matplotlib inline
# Plot the accuracy curves
markers = {'train': 'o', 'test': 's'}
x = np.arange(len(train_acc_list))
plt.plot(x, train_acc_list, label='train acc')
plt.plot(x, test_acc_list, label='test acc', linestyle='--')
plt.xlabel("epochs")
plt.ylabel("accuracy")
plt.ylim(0, 1.0)
plt.legend(loc='lower right')
plt.show()
Accuracy curves
import matplotlib.pyplot as plt
%matplotlib inline
# Plot the training-loss curve (one value per mini-batch iteration)
x = np.arange(len(train_loss_list))
plt.plot(x, train_loss_list, label='train loss')
plt.xlabel("iterations")
plt.ylabel("loss")
plt.ylim(0, 3.0)
plt.legend(loc='lower right')
plt.show()
Training-loss curve
8. Changing the update rule
Plain gradient descent clearly oscillates back and forth as the gradient direction changes, which makes training inefficient, so this section swaps in different parameter-update rules. Final test accuracy for each (the update equations are sketched right after this list):
SGD: 0.9455
Momentum: 0.9719
AdaGrad: 0.9724
Adam (combining the two ideas above): 0.9652
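For reference, the update equations behind these numbers, written as small standalone helpers over dicts of parameters and gradients (my sketch of the standard formulas; the class below implements the same logic as update_* methods):

import numpy as np

def sgd_update(params, grads, lr=0.1):
    # Plain SGD: step straight down the gradient
    for k in params:
        params[k] -= lr * grads[k]

def momentum_update(params, grads, v, lr=0.1, momentum=0.9):
    # Momentum: v is a decaying velocity, so the parameters coast like a rolling ball
    for k in params:
        v[k] = momentum * v[k] - lr * grads[k]
        params[k] += v[k]

def adagrad_update(params, grads, h, lr=0.1):
    # AdaGrad: h accumulates squared gradients, shrinking the step for frequently-updated parameters
    for k in params:
        h[k] += grads[k] * grads[k]
        params[k] -= lr * grads[k] / (np.sqrt(h[k]) + 1e-7)

Adam keeps both a Momentum-style first moment and an AdaGrad-style second moment, with a bias-corrected learning rate; see update_adam in the class below.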
Code:
import sys, os
import json
import numpy as np

sys.path.append("../")  # add the project root to the import path
from src.relation_function import *  # the helper functions from the earlier sections live here
from data_set.mnist import load_mnist

(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)  # load the data
print(x_train.shape)
print("Initialization successful")
# Two-layer neural network with selectable update rules
class TwoLayerNet():
    def __init__(self, layers=[784, 50, 10], seed=200, weight_init_std=0.01, lr=0.001):  # seed is accepted but not used
        # Initialize the weights (scaled by 1/sqrt(n) this time instead of a fixed std)
        self.params = {}
        self.params['W1'] = np.random.randn(layers[0], layers[1]) / np.sqrt(layers[0])
        self.params['b1'] = np.zeros(layers[1])
        self.params['W2'] = np.random.randn(layers[1], layers[2]) / np.sqrt(layers[1])
        self.params['b2'] = np.zeros(layers[2])
        self.v = None        # velocity for Momentum (also reused as the second moment in Adam)
        self.h = None        # accumulated squared gradients for AdaGrad
        self.m = None        # first moment for Adam
        self.beta1 = 0.9
        self.beta2 = 0.999
        self.iter = 0
        self.lr = lr

    def predict(self, x):
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']
        # Forward pass
        z1 = sigmoid(np.dot(x, W1) + b1)
        y = softmax(np.dot(z1, W2) + b2)
        return y

    # x: input data, t: targets
    def loss(self, x, t):
        y = self.predict(x)
        return cross_entropy_error(y, t)  # cross-entropy loss

    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        t = np.argmax(t, axis=1)
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    # x: input data, t: targets
    def numerical_gradient(self, x, t):
        loss_W = lambda W: self.loss(x, t)
        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])
        return grads

    def gradient(self, x, t):  # analytic gradients via backpropagation
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']
        grads = {}
        batch_num = x.shape[0]
        # forward
        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        y = softmax(a2)
        # backward
        dy = (y - t) / batch_num
        grads['W2'] = np.dot(z1.T, dy)
        grads['b2'] = np.sum(dy, axis=0)
        da1 = np.dot(dy, W2.T)
        dz1 = sigmoid_grad(a1) * da1
        grads['W1'] = np.dot(x.T, dz1)
        grads['b1'] = np.sum(dz1, axis=0)
        return grads

    # Plain SGD update
    def update(self, x_batch, t_batch, learning_rate):
        # Compute the gradients: predict y from x, measure the loss against t, then backpropagate
        grad = self.gradient(x_batch, t_batch)
        # Update the parameters
        for key in self.params.keys():
            self.params[key] -= learning_rate * grad[key]

    # Momentum update: the parameters behave like a ball rolling over the loss surface
    def update_momentum(self, x_batch, t_batch, learning_rate, momentum=0.9):
        grad = self.gradient(x_batch, t_batch)
        if self.v is None:  # initialize the velocity
            self.v = {}
            for key, val in self.params.items():
                self.v[key] = np.zeros_like(val)
        # Update the parameters
        for key in self.params.keys():
            self.v[key] = momentum * self.v[key] - learning_rate * grad[key]
            self.params[key] += self.v[key]

    # AdaGrad update: shrinks the effective learning rate of frequently-updated parameters
    def update_adaGrad(self, x_batch, t_batch, learning_rate):
        grad = self.gradient(x_batch, t_batch)
        if self.h is None:  # initialize the squared-gradient accumulator
            self.h = {}
            for key, val in self.params.items():
                self.h[key] = np.zeros_like(val)
        # Update the parameters
        for key in self.params.keys():
            self.h[key] += grad[key] * grad[key]
            self.params[key] -= learning_rate * grad[key] / (np.sqrt(self.h[key]) + 1e-7)

    # Adam update: combines Momentum (first moment m) and AdaGrad-style scaling (second moment v).
    # Note that it reuses self.v, so do not mix Momentum and Adam within the same run.
    def update_adam(self, x_batch, t_batch):
        grads = self.gradient(x_batch, t_batch)
        if self.m is None:  # initialize both moment estimates
            self.m, self.v = {}, {}
            for key, val in self.params.items():
                self.m[key] = np.zeros_like(val)
                self.v[key] = np.zeros_like(val)
        self.iter += 1
        lr_t = self.lr * np.sqrt(1.0 - self.beta2**self.iter) / (1.0 - self.beta1**self.iter)
        # Update the parameters
        for key in self.params.keys():
            self.m[key] += (1 - self.beta1) * (grads[key] - self.m[key])
            self.v[key] += (1 - self.beta2) * (grads[key]**2 - self.v[key])
            self.params[key] -= lr_t * self.m[key] / (np.sqrt(self.v[key]) + 1e-7)

    def train(self, x_train, t_train, x_test, t_test, epochs=10000, batch_size=100, learning_rate=0.1):
        # Note: "epochs" here actually counts mini-batch iterations
        train_size = x_train.shape[0]
        train_loss_list = []
        train_acc_list = []
        test_acc_list = []
        iter_per_epoch = max(train_size / batch_size, 1)
        for i in range(epochs):
            start = (i * batch_size) % 60000   # 60000 = MNIST training-set size; sequential mini-batches
            end = start + batch_size
            # batch_mask = np.random.choice(train_size, batch_size)  # random sampling, as in section 5
            x_batch = x_train[start:end]
            t_batch = t_train[start:end]
            # Pick one update rule; the others are left commented out for comparison
            # self.update(x_batch, t_batch, learning_rate)
            # self.update_momentum(x_batch, t_batch, learning_rate)
            # self.update_adaGrad(x_batch, t_batch, learning_rate)
            self.update_adam(x_batch, t_batch)
            loss = self.loss(x_batch, t_batch)
            train_loss_list.append(loss)
            if i % iter_per_epoch == 0:
                train_acc = self.accuracy(x_train, t_train)  # accuracy over the full training set
                test_acc = self.accuracy(x_test, t_test)     # accuracy over the full test set
                train_acc_list.append(train_acc)             # saved for the visualizations later
                test_acc_list.append(test_acc)
                print("train acc, test acc | " + str(train_acc) + ", " + str(test_acc))
        return train_loss_list, train_acc_list, test_acc_list
# Start training
import numpy as np
from data_set.mnist import load_mnist

(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)  # load the data
network = TwoLayerNet()
train_loss_list, train_acc_list, test_acc_list = network.train(x_train, t_train, x_test, t_test)
train acc, test acc | 0.09751666666666667, 0.0974
train acc, test acc | 0.9031333333333333, 0.9093
train acc, test acc | 0.92455, 0.924
train acc, test acc | 0.9364166666666667, 0.935
train acc, test acc | 0.94445, 0.9398
train acc, test acc | 0.9502833333333334, 0.9455
train acc, test acc | 0.9552833333333334, 0.9489
train acc, test acc | 0.9593, 0.9514
train acc, test acc | 0.9628333333333333, 0.9543
train acc, test acc | 0.9657, 0.9566
train acc, test acc | 0.9684, 0.9587
train acc, test acc | 0.9706, 0.9603
train acc, test acc | 0.97255, 0.9619
train acc, test acc | 0.9742166666666666, 0.9626
train acc, test acc | 0.9759333333333333, 0.9641
train acc, test acc | 0.9771166666666666, 0.9643
train acc, test acc | 0.9785166666666667, 0.9651
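To reproduce the per-optimizer numbers listed at the top of this section, one option (my sketch, not part of the original post) is to train a fresh network per update rule, driving the update methods directly; it assumes TwoLayerNet and the MNIST arrays above are already in scope:

rules = {
    'SGD':      lambda net, xb, tb: net.update(xb, tb, learning_rate=0.1),
    'Momentum': lambda net, xb, tb: net.update_momentum(xb, tb, learning_rate=0.1),
    'AdaGrad':  lambda net, xb, tb: net.update_adaGrad(xb, tb, learning_rate=0.1),
    'Adam':     lambda net, xb, tb: net.update_adam(xb, tb),
}

batch_size = 100
iterations = 10000   # same budget as the runs above; reduce for a quicker, rougher comparison
for name, step in rules.items():
    net = TwoLayerNet()
    for i in range(iterations):
        start = (i * batch_size) % x_train.shape[0]
        xb = x_train[start:start + batch_size]
        tb = t_train[start:start + batch_size]
        step(net, xb, tb)
    print(name, net.accuracy(x_test, t_test))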