[Python 3.6] A comprehensive exercise on arbitrary-depth BP neural networks (fully connected, no convolutions), written following Stanford's cs231n course

The CIFAR-10 dataset
CIFAR-10 is a 10-class image dataset of 60000 32×32 RGB color images, of which 50000 are training images and 10000 are test images; the validation set used here is taken out of the training images.

Hidden-layer activation function: ReLU
Output-layer loss function: softmax
Number of features (dimensions) per training sample: 32×32×3, where 3 is the number of RGB color channels
Training set size: 1000 samples
Validation set size: 1000 samples
Since this exercise is not about the model's generalization ability but about how to implement the network in code, no test set is needed.
Number of hidden layers: 2
Hidden-layer sizes: 100 and 100 neurons (first and second hidden layers)
Standard deviation of the initial weight matrices: 0.05
Number of training iterations: 100
Batch size: 500 training samples per iteration
Output-layer size: 10 neurons, i.e. 10 output classes
Learning rate: 0.001
Regularization coefficient: 0.0 (no regularization)
Learning-rate decay: none
Optimizer: Adam
Batch normalization: enabled (each hidden layer's forward pass is sketched below)
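
In the training loop below, every hidden layer applies an affine transform, batch normalization, a ReLU activation and inverted dropout, in that order (the dropout keep probability of 0.7 is set in the training script). A minimal per-layer sketch, assuming the helper functions affine_forward, BN_forward, relu_forward and dropout_forward defined in the code section; the names h, W, b, gamma and beta are placeholders for one layer's activations and parameters:

h = affine_forward(h, W, b)                                    # affine transform
h, bn_cache = BN_forward(h, {'gamma': gamma, 'beta': beta})    # batch normalization
h = relu_forward(h)                                            # ReLU activation
h, mask = dropout_forward(h, {'p': 0.7})                       # inverted dropout (p = keep probability)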

'i_b_h': input between hidden, i.e. from the input layer to the first hidden layer
'h_b_o': hidden between output, i.e. from the last hidden layer to the output layer
'w_h_i_b_h_i+1': weight matrix from the i-th hidden layer to the (i+1)-th hidden layer
'b_h_i_b_h_i+1': bias from the i-th hidden layer to the (i+1)-th hidden layer
'w_h_b_o': weight matrix from the last hidden layer to the output layer
'b_h_b_o': bias from the last hidden layer to the output layer
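
As a concrete illustration of this naming scheme, with the two hidden layers used here the parameter dictionary ends up with the keys below. This is a small sketch that only reproduces the key names generated by the initialization code in the code section; hidden_dim_example and keys are illustrative names:

hidden_dim_example = [100, 100]                     # two hidden layers, as in this exercise
keys = ['W_i_b_h', 'b_i_b_h']                       # input layer -> first hidden layer
for i in range(len(hidden_dim_example) - 1):        # hidden layer i+1 -> hidden layer i+2
    keys += ['W_h%d_b_h%d' % (i + 1, i + 2), 'b_h%d_b_h%d' % (i + 1, i + 2)]
keys += ['W_h_b_o', 'b_h_b_o']                      # last hidden layer -> output layer
print(keys)  # ['W_i_b_h', 'b_i_b_h', 'W_h1_b_h2', 'b_h1_b_h2', 'W_h_b_o', 'b_h_b_o']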

Training results
At the moment the training loss and the training accuracy sometimes increase together; I do not yet know how to fix this.
(The training loss and accuracy plots were shown here.)
Every individual module has been tested and passes its checks.
Code

# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
import os
import _pickle as pickle

def rel_error(x, y):
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))

def eval_numerical_gradient_array(f, x, df, h=1e-5):
  """
  Evaluate a numeric gradient for a function that accepts a numpy
  array and returns a numpy array.
  """
  grad = np.zeros_like(x)
  it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
  while not it.finished:
    ix = it.multi_index
    
    oldval = x[ix]
    x[ix] = oldval + h
    pos = f(x).copy()
    x[ix] = oldval - h
    neg = f(x).copy()
    x[ix] = oldval
    
    grad[ix] = np.sum((pos - neg) * df) / (2 * h)
    it.iternext()
  return grad
    
# Load the CIFAR-10 image dataset
def get_CIFAR10_data(num_training=1000, num_validation=1000, num_test=0):
    cifar10_dir = 'F:/Python programs/Neural Networks/深度學習實戰/cifar-10-batches-py'
    xs = []
    ys = []
    for b in range(1,6):
        f = os.path.join(cifar10_dir, 'data_batch_%d' % (b, ))
        with open(f, 'rb') as f:
            datadict = pickle.load(f, encoding = 'latin1')
            X = datadict['data']
            Y = datadict['labels']
            X = X.reshape(10000, 3, 32, 32).transpose(0,2,3,1).astype("float")
            Y = np.array(Y)
        xs.append(X)
        ys.append(Y)    
    X_train = np.concatenate(xs)
    y_train = np.concatenate(ys)
    del X, Y
    with open(os.path.join(cifar10_dir, 'test_batch'), 'rb') as f:
        datadict = pickle.load(f, encoding = 'latin1')
        X_test = datadict['data']
        y_test = datadict['labels']
        X_test = X_test.reshape(10000, 3, 32, 32).transpose(0,2,3,1).astype("float")
        y_test = np.array(y_test)
    # Take the num_validation images that follow the first num_training images of the original 50000 training images as the validation set
    mask = range(num_training, num_training + num_validation)
    X_val = X_train[mask]
    y_val = y_train[mask]
    mask = range(num_training)
    X_train = X_train[mask]
    y_train = y_train[mask]
    # Take the first num_test images of the original 10000-image test set as the test set
    mask = range(num_test)
    X_test = X_test[mask]
    y_test = y_test[mask]
    # Zero-center the data using the training-set mean image
    mean_image = np.mean(X_train, axis=0)
    X_train -= mean_image
    X_val -= mean_image
    X_test -= mean_image
    # Reshape to (N, C, H, W)
    X_train = X_train.transpose(0, 3, 1, 2).copy()
    X_val = X_val.transpose(0, 3, 1, 2).copy()
    X_test = X_test.transpose(0, 3, 1, 2).copy()
    return {
      'X_train': X_train, 'y_train': y_train,
      'X_val': X_val, 'y_val': y_val,
      'X_test': X_test, 'y_test': y_test,
    }

# Affine forward pass
def affine_forward(x, w, b):
    """
    Computes the forward pass for an affine (fully connected) layer.
    Inputs: x, N*D (for RGB images, D = height*width*3); N is the number of samples, D the number of features
            w, D*M weight matrix; M is the number of neurons in the next layer
            b, bias vector, M*1
    Returns: out, N*M
    """
    out = None
    # Number of samples
    N = x.shape[0]
    # Flatten x into a 2-D array with N rows
    x = np.reshape(x, (N,-1))
    # Append a column of ones to x as the bias input x0, so x becomes N*(D+1)
    x = np.hstack((x, np.ones((N, 1))))
    # Append the bias vector b as the last row of w (the weights of x0), so w becomes (D+1)*M
    w = np.vstack((w, np.transpose(b)))
    # Compute the next-layer activations for all samples, e.g. 2 samples and 3 next-layer neurons give a 2*3 matrix
    out = x.dot(w)
    return out

# Affine backward pass
def affine_backward(dout, cache):
    """
    Computes the backward pass for an affine layer.
    Inputs:
    dout: upstream gradient, N*M
    cache: the tuple (x, w, b) saved from the forward pass
    Returns:
    dx: gradient w.r.t. the input data, N*d1*d2*...*dk
    dw: gradient w.r.t. the weights, D*M
    db: gradient w.r.t. the bias, M*1
    """
    x, w, b = cache
    dx, dw, db = None, None, None
    # Backward pass
    # Number of samples
    N = x.shape[0]
    # Remember the original shape of x, then flatten it to N*D
    x_shape = x.shape
    x = np.reshape(x, (N, -1))
    # Gradient with respect to the input
    dx = dout.dot(np.transpose(w))
    # Gradient with respect to the weights
    dw = np.transpose(x)
    dw = dw.dot(dout)
    # Gradient with respect to the bias
    db = np.sum(dout, axis = 0)
    # Reshape dx back to the original input shape
    dx = np.reshape(dx, x_shape)
    return dx, dw, db
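
# Optional self-check for affine_forward / affine_backward, in the same style as the
# module tests further below. This is a sketch: the shapes and variable names here are
# arbitrary examples, not part of the network itself.
x_affine_test = np.random.randn(10, 2, 3)
w_affine_test = np.random.randn(6, 5)
b_affine_test = np.random.randn(5)
dout_affine_test = np.random.randn(10, 5)
dx_num = eval_numerical_gradient_array(lambda x: affine_forward(x, w_affine_test, b_affine_test), x_affine_test, dout_affine_test)
dw_num = eval_numerical_gradient_array(lambda w: affine_forward(x_affine_test, w, b_affine_test), w_affine_test, dout_affine_test)
db_num = eval_numerical_gradient_array(lambda b: affine_forward(x_affine_test, w_affine_test, b), b_affine_test, dout_affine_test)
dx_test, dw_test, db_test = affine_backward(dout_affine_test, (x_affine_test, w_affine_test, b_affine_test))
print('Affine self-check: dx error', rel_error(dx_num, dx_test),
      ', dw error', rel_error(dw_num, dw_test), ', db error', rel_error(db_num, db_test))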

# ReLU forward pass
def relu_forward(x):
    """
    Computes the forward pass of the ReLU activation.
    Input:
    x - input data
    Returns:
    out - same shape as the input data
    """
    out = None
    out = np.maximum(x, 0)
    return out

# ReLU backward pass
def relu_backward(dout, x):
    """
    Computes the backward pass of the ReLU activation.
    Inputs:
    dout - upstream gradient
    x - the input data x from the forward pass
    Returns:
    dx - gradient with respect to x
    """
    dx = dout.copy()
    dx[x <= 0] = 0
    return dx
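
# Optional gradient check for relu_forward / relu_backward (a sketch with made-up shapes).
# Note: entries of x that happen to lie within h of 0 can make the numeric gradient at the
# ReLU kink slightly off; with random Gaussian inputs this is very unlikely.
x_relu_test = np.random.randn(8, 6)
dout_relu_test = np.random.randn(*x_relu_test.shape)
dx_num = eval_numerical_gradient_array(lambda x: relu_forward(x), x_relu_test, dout_relu_test)
dx_relu_test = relu_backward(dout_relu_test, x_relu_test)
print('ReLU self-check: dx error', rel_error(dx_num, dx_relu_test))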

# Softmax loss function
def softmax_loss(X, y):
    """
    Softmax (cross-entropy) loss, without regularization.
    Inputs:
    X: activations of the output layer of the network, N*C
    y: ground-truth labels of the training data
    Returns:
    loss: the loss value
    dx: gradient of the loss with respect to X
    """
    # Initialize the loss
    loss = 0.0
    # Compute the loss -------------
    # Number of training samples N
    num_train = X.shape[0]
    # Number of classes C
    num_categories = X.shape[1]
    # Numerator of the normalized probabilities, N*C
    # Subtract the row-wise maximum from X first so that the exponentials cannot overflow
    score_fenzi = X - np.max(X, axis = 1, keepdims = True)
    score_fenzi = np.exp(score_fenzi)
    # Denominator of the normalized probabilities: the row sums of the numerator, N*1
    score_fenmu = np.sum(score_fenzi, axis = 1, keepdims = True)
    # Replicate the denominator across the columns
    score_fenmu = score_fenmu.dot(np.ones((1, num_categories)))
    # Normalized probabilities, N*C
    prob = score_fenzi / score_fenmu
    y_true = np.zeros((num_train,  num_categories))
    # One-hot encode the labels, e.g. a sample of class 3 becomes [0,0,1,0,0,0,0,0,0,0]
    y_true[range(num_train), y] = 1.0
    # Multiplying y_true with log(prob) elementwise keeps only the log-probability of each sample's true class;
    # summing over all samples and negating gives the average cross-entropy loss
    loss = -np.sum(y_true * np.log(prob)) / num_train
    # Compute the gradient --------------
    dx = prob.copy()
    dx[np.arange(num_train), y] -= 1
    dx /= num_train
    return loss, dx
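
# Optional sanity check for softmax_loss (a sketch): with small random scores and 10 classes,
# the loss should be close to -log(1/10), roughly 2.3, and dx should have the same shape as the scores.
scores_test = 0.001 * np.random.randn(50, 10)
labels_test = np.random.randint(10, size = 50)
loss_test, dx_softmax_test = softmax_loss(scores_test, labels_test)
print('Softmax self-check: loss =', loss_test, '(expected about 2.3), dx shape =', dx_softmax_test.shape)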

# Dropout forward pass
def dropout_forward(x, param):
    """
    Performs the forward pass of (inverted) dropout during training.
    Inputs:
    x: input data
    param: dictionary of dropout parameters
      p: keep probability; each neuron is kept with probability p and scaled by 1/p
      seed: random seed (optional)
    Returns:
    out: same shape as the input data
    mask: the dropout mask, needed for the backward pass
    """
    dropout_p = param['p']
    if 'seed' in param:
        np.random.seed(param['seed'])
    mask = None
    out = None
    mask = (np.random.rand(*x.shape) < dropout_p) / dropout_p
    out = x * mask
    out = out.astype(x.dtype, copy = False)
    return out, mask

# Dropout backward pass
def dropout_backward(dout, mask):
    dx = None
    dx = dout * mask
    return dx
 
# SGD with momentum
def SGD_Momentum(w, dw, config = None):
    """
    Stochastic gradient descent with momentum.
    config keys:
       - learning_rate: the learning rate
       - momentum: momentum factor in [0, 1]; 0 disables momentum and reduces the update to plain SGD
       - velocity: the update velocity, same shape as w and dw
    """
    if config is None:
        config = {}
    config.setdefault('learning_rate', 1e-3)
    config.setdefault('momentum', 0.9)
    config.setdefault('velocity', np.zeros_like(w))
    next_w = None
    config['velocity'] = config['momentum'] * config['velocity'] - config['learning_rate'] * dw
    next_w = w + config['velocity']
    return next_w, config

# RMSProp optimizer
def RMSProp(w, dw, config = None):
    """
    RMSProp update rule.
    config keys:
        learning_rate
        decay_rate: decay factor for the accumulated squared gradients, in [0, 1]
        epsilon: small constant to avoid division by zero
    """
    if config is None:
        config = {}
    config.setdefault('learning_rate', 1e-3)
    config.setdefault('decay_rate', 0.99)
    config.setdefault('epsilon', 1e-8)
    config.setdefault('cache', np.zeros_like(w))
    next_w = None
    config['cache'] = config['decay_rate'] * config['cache'] + (1 - config['decay_rate']) * dw ** 2
    next_w = w - config['learning_rate'] * dw / (np.sqrt(config['cache']) + config['epsilon'])
    return next_w, config

# Adam optimizer
def Adam(w, dw, config = None):
    """
    Adam update rule.
    config keys:
        learning_rate
        beta1: decay factor for the first-moment (momentum) estimate
        beta2: decay factor for the second-moment (squared-gradient) estimate
        epsilon: small constant to avoid division by zero
        v: first-moment estimate
        r: second-moment estimate
        t: iteration counter
    """
    if config is None:
        config = {}
    config.setdefault('learning_rate', 1e-3)
    config.setdefault('beta1', 0.9)
    config.setdefault('beta2', 0.999)
    config.setdefault('epsilon', 1e-8)
    config.setdefault('r', np.zeros_like(w))
    config.setdefault('v', np.zeros_like(w))
    config.setdefault('t', 0)
    config['t'] += 1
    next_w = None
    config['v'] = config['beta1'] * config['v'] + (1 - config['beta1']) * dw
    config['r'] = config['beta2'] * config['r'] + (1 - config['beta2']) * dw ** 2
    config['vb'] = config['v'] / (1 - config['beta1'] ** (config['t']))
    config['rb'] = config['r'] / (1 - config['beta2'] ** (config['t']))
    next_w = w - config['learning_rate'] * config['vb'] / (np.sqrt(config['rb']) + config['epsilon'])
    return next_w, config

# Batch normalization, forward pass
def BN_forward(x, bn_param):
    """
    Batch normalization with momentum-style running estimates of the population mean and variance.
    Inputs:
    x: input data (N, D)
    bn_param: dictionary with
        gamma: (D, ) scale parameter
        beta: (D, ) shift parameter
        eps: small constant for numerical stability
        momentum: decay factor of the running averages
        running_mean: running mean (D, ), used at test time, only maintained during training
        running_var: running variance (D, ), used at test time, only maintained during training
    Returns:
    out: output (N, D)
    cache: values saved for the backward pass
    """
    eps = bn_param.setdefault('eps', 1e-7)
    momentum = bn_param.setdefault('momentum', 0.9)
    N, D = x.shape
    running_mean = bn_param.setdefault('running_mean', np.zeros(D, dtype = x.dtype))
    running_var = bn_param.setdefault('running_var', np.zeros(D, dtype = x.dtype))
    gamma = bn_param.setdefault('gamma', np.ones((D), dtype = x.dtype))
    beta = bn_param.setdefault('beta', np.zeros((D), dtype = x.dtype))
    out, cache = None, None
    # Batch mean
    mean = np.mean(x, axis = 0, keepdims = True)
    # Shift to zero mean
    x_mu = x - mean
    # Batch variance
    var = np.sum(x_mu ** 2, axis = 0, keepdims = True) / N
    # Standard deviation
    x_std = np.sqrt(var + eps)
    # Normalize the data
    x_nor = x_mu / x_std
    # Scale and shift
    out = gamma * x_nor + beta
    # Update the running mean and variance
    bn_param['running_mean'] = momentum * running_mean + (1 - momentum) * mean
    bn_param['running_var'] = momentum * running_var + (1 - momentum) * var
    # Save intermediate results for the backward pass
    cache = {'bn_param': bn_param, 'x_nor': x_nor, 'x_std': x_std, 'x_mu': x_mu, 'N': N}
    return out, cache

# Batch normalization, backward pass
def BN_backward(dout, cache):
    """
    Backward pass of batch normalization.
    Inputs:
    dout: upstream gradient
    cache: the cache returned by BN_forward
    Returns:
    dx: gradient w.r.t. the data (N, D)
    dgamma: gradient w.r.t. gamma (D, )
    dbeta: gradient w.r.t. beta (D, )
    """
    dx, dgamma, dbeta = None, None, None
    dbeta = np.sum(dout, axis = 0)
    dgamma = np.sum(cache['x_nor'] * dout, axis = 0)
    N = cache['N']
    dx = (1.0 / N) * cache['bn_param']['gamma'] / cache['x_std'] * (N * dout - np.sum(dout, axis = 0)
          - cache['x_nor'] * np.sum(dout * cache['x_nor'], axis = 0))
    return dx, dgamma, dbeta
    
# Dropout forward test
x_dropout_test = np.random.randn(500, 500) + 10
out, _ = dropout_forward(x_dropout_test, {'p': 0.5})
print('Dropout forward self-check-----------------------')
print('Mean of the input data:', x_dropout_test.mean())
print('Mean after dropout:', out.mean())
if rel_error(x_dropout_test.mean(), out.mean()) < 1e-3:
    print('Test passed ^_^\n')
else:
    print('Test failed\n')

# Dropout backward test
x_dropout_test = np.random.randn(10, 10) + 10
dout = np.random.randn(*x_dropout_test.shape)
dropout_param = {'p': 0.8, 'seed': 123}
out, mask_test = dropout_forward(x_dropout_test, dropout_param)
dx = dropout_backward(dout, mask_test)
dx_num = eval_numerical_gradient_array(lambda xx: dropout_forward(xx, dropout_param)[0], x_dropout_test, dout)
print('Dropout backward self-check-----------------------')
print('Relative error:', rel_error(dx, dx_num))
if rel_error(dx, dx_num) < 1e-10:
    print('Test passed ^_^\n')
else:
    print('Test failed\n')

# SGD_Momentum test
N, D = 4, 5
w = np.linspace(-0.4, 0.6, num = N * D). reshape(N, D)
dw = np.linspace(-0.6, 0.4, num = N * D). reshape(N, D)
v = np.linspace(0.6, 0.9, num = N * D). reshape(N, D)
config_test = {'learning_rate': 1e-3, 'momentum': 0.9, 'velocity': v}
next_w, config_test = SGD_Momentum(w, dw, config = config_test)
expected_next_w = np.asarray([
    [ 0.1406,      0.20738947,  0.27417895,  0.34096842,  0.40775789],
    [ 0.47454737,  0.54133684,  0.60812632,  0.67491579,  0.74170526],
    [ 0.80849474,  0.87528421,  0.94207368,  1.00886316,  1.07565263],
    [ 1.14244211,  1.20923158,  1.27602105,  1.34281053,  1.4096    ]])
expected_velocity = np.asarray([
    [ 0.5406,      0.55475789,  0.56891579, 0.58307368,  0.59723158],
    [ 0.61138947,  0.62554737,  0.63970526,  0.65386316,  0.66802105],
    [ 0.68217895,  0.69633684,  0.71049474,  0.72465263,  0.73881053],
    [ 0.75296842,  0.76712632,  0.78128421,  0.79544211,  0.8096    ]])
print('SGD_Momentum updater self-check---------------------')
print('Updated-weight error: ', rel_error(next_w, expected_next_w))
print('Velocity error: ', rel_error(expected_velocity, config_test['velocity']))
if rel_error(next_w, expected_next_w) < 1e-8 and rel_error(expected_velocity, config_test['velocity']) < 1e-8:
    print('Test passed ^_^\n')
else:
    print('Test failed\n')

# RMSProp test
N, D = 4, 5
w = np.linspace(-0.4, 0.6, num = N * D). reshape(N, D)
dw = np.linspace(-0.6, 0.4, num = N * D). reshape(N, D)
cache = np.linspace(0.6, 0.9, num = N * D). reshape(N, D)
config_test = {'learning_rate': 1e-2, 'cache': cache}
next_w, config_test = RMSProp(w, dw, config = config_test)
expected_next_w = np.asarray([
    [-0.39223849, -0.34037513, -0.28849239, -0.23659121, -0.18467247],
    [-0.132737,   -0.08078555, -0.02881884,  0.02316247,  0.07515774],
    [ 0.12716641,  0.17918792,  0.23122175,  0.28326742,  0.33532447],
    [ 0.38739248,  0.43947102,  0.49155973,  0.54365823,  0.59576619]])
expected_cache = np.asarray([
    [ 0.5976,      0.6126277,   0.6277108,   0.64284931,  0.65804321],
    [ 0.67329252,  0.68859723,  0.70395734,  0.71937285,  0.73484377],
    [ 0.75037008,  0.7659518,   0.78158892,  0.79728144,  0.81302936],
    [ 0.82883269,  0.84469141,  0.86060554,  0.87657507,  0.8926    ]])
print('RMSProp updater self-check--------------------------')
print('Updated-weight error:', rel_error(expected_next_w, next_w))
print('cache error:', rel_error(expected_cache, config_test['cache']))
if rel_error(expected_next_w, next_w) < 1e-7 and rel_error(expected_cache, config_test['cache']) < 1e-7:
    print('Test passed ^_^\n')
else:
    print('Test failed\n')

# Adam test
N, D = 4, 5
w = np.linspace(-0.4, 0.6, num = N * D). reshape(N, D)
dw = np.linspace(-0.6, 0.4, num = N * D). reshape(N, D)
v = np.linspace(0.6, 0.9, num = N * D). reshape(N, D)
r = np.linspace(0.7, 0.5, num = N * D). reshape(N, D)
config_test = {'learning_rate': 1e-2, 'beta1': 0.9, 'beta2': 0.999, 'epsilon': 1e-7, 'r': r, 'v': v, 't': 5}
next_w, config_test = Adam(w, dw, config = config_test)
expected_next_w = np.asarray([
    [-0.40094747, -0.34836187, -0.29577703, -0.24319299, -0.19060977],
    [-0.1380274,  -0.08544591, -0.03286534,  0.01971428,  0.0722929],
    [ 0.1248705,   0.17744702,  0.23002243,  0.28259667,  0.33516969],
    [ 0.38774145,  0.44031188,  0.49288093,  0.54544852,  0.59801459]])
expected_r = np.asarray([
    [ 0.69966,     0.68908382,  0.67851319,  0.66794809,  0.65738853,],
    [ 0.64683452,  0.63628604,  0.6257431,   0.61520571,  0.60467385,],
    [ 0.59414753,  0.58362676,  0.57311152,  0.56260183,  0.55209767,],
    [ 0.54159906,  0.53110598,  0.52061845,  0.51013645,  0.49966,   ]])
expected_v = np.asarray([
    [ 0.48,        0.49947368,  0.51894737,  0.53842105,  0.55789474],
    [ 0.57736842,  0.59684211,  0.61631579,  0.63578947,  0.65526316],
    [ 0.67473684,  0.69421053,  0.71368421,  0.73315789,  0.75263158],
    [ 0.77210526,  0.79157895,  0.81105263,  0.83052632,  0.85      ]])
print('Adam updater self-check--------------------------')
print('Updated-weight error:', rel_error(expected_next_w, next_w))
print('v error:', rel_error(expected_v, config_test['v']))
print('r error:', rel_error(expected_r, config_test['r']))
if rel_error(expected_next_w, next_w) < 1e-6 and rel_error(expected_v, config_test['v']) < 1e-7 and rel_error(expected_r, config_test['r']) < 1e-7:
    print('Test passed ^_^\n')
else:
    print('Test failed\n')

# BN forward test
N, D1, D2, D3 = 200, 50, 60, 3
X = np.random.randn(N, D1)
W1 = np.random.randn(D1, D2)
W2 = np.random.randn(D2, D3)
a = np.maximum(0, X.dot(W1)).dot(W2)
print('BN forward self-check---------------------------------')
print("Before batch normalization:")
print('Means:', a.mean(axis = 0))
print('Stds:', a.std(axis = 0))
print('After batch normalization (should have zero mean and unit std):')
a_norm, _ = BN_forward(a, {'gamma': np.ones(D3), 'beta': np.zeros(D3)})
print('Means:', a_norm.mean(axis = 0))
print('Stds:', a_norm.std(axis = 0))
if np.all(np.abs(a_norm.mean(axis = 0)) < 1e-7) and np.all(np.abs(a_norm.std(axis = 0) - 1.0) < 1e-7):
    print('Test passed ^_^\n')
else:
    print('Test failed\n')
    
# BN backward test
N, D = 4, 5
x = 5 * np.random.randn(N, D) + 12
gamma = np.random.randn(D)
beta = np.random.randn(D)
dout = np.random.randn(N, D)
bn_param = {'gamma': gamma, 'beta': beta}
fx = lambda x: BN_forward(x, bn_param)[0]
fg = lambda a: BN_forward(x, bn_param)[0]
fb = lambda b: BN_forward(x, bn_param)[0]
dx_num = eval_numerical_gradient_array(fx, x, dout)
da_num = eval_numerical_gradient_array(fg, gamma, dout)
db_num = eval_numerical_gradient_array(fb, beta, dout)
_, cache = BN_forward(x, bn_param)
dx, dgamma, dbeta = BN_backward(dout, cache)
print('BN backward self-check-----------------------------')
print('dx error:', rel_error(dx_num, dx))
print('dgamma error:', rel_error(da_num, dgamma))
print('dbeta error:', rel_error(db_num, dbeta))
if rel_error(dx_num, dx) < 1e-5 and rel_error(da_num, dgamma) < 1e-8 and rel_error(db_num, dbeta) < 1e-8:
    print('Test passed ^_^\n')
else:
    print('Test failed\n')

# Load the CIFAR-10 dataset
# Input data
print('Loading data...')
data = get_CIFAR10_data()
X_train = data['X_train']
y_train = data['y_train']
X_val = data['X_val']
y_val = data['y_val']
X_test = data['X_test']
y_test = data['y_test']
for k, v in data.items():
    print(f"{k}:", v.shape)
print('done\n')

# Train a neural network with multiple hidden layers -------------------------------------------------

# Network hyperparameters
hidden_dim = [100, 100] # number of neurons in each hidden layer, from the first to the last
hidden_layers_num = len(hidden_dim) # number of hidden layers
weight_scale = 5e-2 # standard deviation of the initial weights (defined here, although the initialization below uses He-style scaling)
input_dim = 32*32*3 # number of input features, i.e. the input dimension
num_classes = 10 # number of output neurons
params = {} # dictionary holding all weight matrices and bias vectors
print("Initializing parameters...", end = '')
# Initialize all weight matrices and bias vectors
# Weights and biases from the input layer to the first hidden layer
params['W_i_b_h'] = np.random.randn(input_dim, hidden_dim[0]) / np.sqrt(input_dim / 2)
params['b_i_b_h'] = np.zeros(hidden_dim[0])
# Weights and biases between consecutive hidden layers
for i in range(hidden_layers_num - 1):
    if hidden_layers_num == 1:
        break
    params['W_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] = np.random.randn(hidden_dim[i], hidden_dim[i + 1]) / np.sqrt(hidden_dim[i] / 2)
    params['b_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] = np.zeros(hidden_dim[i + 1])
# Weights and biases from the last hidden layer to the output layer
params['W_h_b_o'] = np.random.randn(hidden_dim[-1], num_classes) / np.sqrt(hidden_dim[-1] / 2)
params['b_h_b_o'] = np.zeros(num_classes)
# Dictionary for the dropout masks
dropout_mask = {}

# Training hyperparameters
num_iters = 100 # number of iterations
batch_size = 500 # number of training samples drawn at random for each iteration (one batch)
dropout_param = {'p': 0.7} # dropout parameter (keep probability)
learning_rate = 1e-3 # learning rate
verbose = True # whether to print training progress
reg = 0.0 # regularization coefficient
learning_rate_decay = 1.0 # learning-rate decay
iterations_per_epoch = max(X_train.shape[0] / batch_size, 1) # number of batches in one pass over the training set
# Available updaters: Original, Momentum, RMSProp, Adam
optimizer = 'Adam'
# Histories of the loss, training accuracy and validation accuracy
loss_history = [] # loss of each iteration
train_history = [] # training accuracy
val_history = [] # validation accuracy
# Dictionaries holding the weight and bias gradients of each iteration
grad = {}
forward_out = {}
# Batch-normalization bookkeeping
bn_param = {}
gamma = {}
beta = {}
# Optimizer state (Momentum/RMSProp/Adam) and initialization of the BN gamma and beta parameters
config = {'W_i_b_h': None, 'W_h_b_o': None, 'b_i_b_h': None, 'b_h_b_o': None, 'gamma_i_b_h': None, 'beta_i_b_h': None}
gamma['i_b_h'] = np.ones(hidden_dim[0])
beta['i_b_h'] = np.zeros(hidden_dim[0])
bn_dgamma = {}
bn_dbeta = {}
bn_cache = {}
for i in range(hidden_layers_num - 1):
    config['W_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] = None
    config['b_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] = None
    config['gamma_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] = None
    config['beta_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] = None
    gamma['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] = np.ones(hidden_dim[i + 1])
    beta['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] = np.zeros(hidden_dim[i + 1])
print('done\n')


print('Start training')
# Training loop
for it in range(num_iters):
    num_train = X_train.shape[0] # total number of training samples
    X_batch = None
    y_batch = None
    # Randomly draw batch_size training samples for this batch
    # Sample batch_size indices from 0..num_train-1 as the indices of this batch
    i = np.random.choice(range(num_train), batch_size, replace = False)
    X_batch = X_train[i,:]
    # The labels y must match the selected samples X, e.g. picking sample 3 means picking label 3
    y_batch = y_train[i]
    # Forward pass ------------------------
    # Input layer to the first hidden layer
    # Affine forward
    forward_out_i2h = affine_forward(X_batch, params['W_i_b_h'], params['b_i_b_h'])
    # Batch normalization forward
    forward_out_i2h, bn_cache['i_b_h'] = BN_forward(forward_out_i2h, {'gamma': gamma['i_b_h'], 'beta': beta['i_b_h']})
    # ReLU forward
    forward_out['i2h'] = relu_forward(forward_out_i2h)
    # Dropout forward
    forward_out['i2h'], dropout_mask['i2h'] = dropout_forward(forward_out['i2h'], dropout_param)
    # From the second hidden layer to the last hidden layer
    if hidden_layers_num > 1:
        for i in range(hidden_layers_num - 1):
            if i == 0:
                forward_out['h' + str(i) + '_2_' + 'h' + str(i + 1)] = forward_out['i2h']
            # Affine forward
            forward_out['h' + str(i + 1) + '_2_' + 'h' + str(i + 2)] = affine_forward(forward_out['h' + str(i) + '_2_' + 'h' + str(i + 1)], 
                                                params['W_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], 
                                                params['b_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)])
            # Batch normalization forward
            forward_out['h' + str(i + 1) + '_2_' + 'h' + str(i + 2)], bn_cache['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] = BN_forward(forward_out['h' + str(i + 1) + '_2_' + 'h' + str(i + 2)], 
                                  {'gamma': gamma['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], 'beta': beta['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)]})
            # ReLU forward
            forward_out['h' + str(i + 1) + '_2_' + 'h' + str(i + 2)] = relu_forward(forward_out['h' + str(i + 1) + '_2_' + 'h' + str(i + 2)])
            # Dropout forward
            forward_out['h' + str(i + 1) + '_2_' + 'h' + str(i + 2)], dropout_mask['h' + str(i + 1) + '_2_' + 'h' + str(i + 2)] = dropout_forward(forward_out['h' + str(i + 1) + '_2_' + 'h' + str(i + 2)], dropout_param)
        forward_out_hidden = forward_out['h' + str(hidden_layers_num - 1) + '_2_' + 'h' + str(hidden_layers_num)]
    else:
        forward_out_hidden = forward_out['i2h']
    # Last hidden layer to output layer
    scores = affine_forward(forward_out_hidden, params['W_h_b_o'], params['b_h_b_o'])
    # Softmax loss at the output layer: total loss and its gradient
    loss, grad_out = softmax_loss(scores, y_batch)
    # Add the regularization term to the total loss
    loss += 0.5 * reg * (np.sum(params['W_i_b_h'] ** 2) + np.sum(params['W_h_b_o'] ** 2))
    for i in range(hidden_layers_num - 1):
        loss += 0.5 * reg * (np.sum(params['W_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] ** 2))
    # Record the loss
    loss_history.append(loss)
    # Backward pass -------------------------------------------------------------------
    # Gradients from the output layer back to the last hidden layer (dx, weight gradient, bias gradient)
    dx, grad['W_o_b_h'], grad['b_o_b_h'] = affine_backward(grad_out, (forward_out_hidden, params['W_h_b_o'], params['b_h_b_o']))
    # Add the regularization term to the output-layer weight gradient
    grad['W_o_b_h'] += reg * params['W_h_b_o']
    # Gradients between hidden layers, from the last hidden layer back to the first
    if hidden_layers_num > 1:
        for i in range(hidden_layers_num - 1, 0, -1):
            if i == hidden_layers_num - 1:
                forward_out['h' + str(i + 1) + '_2_' + 'h' + str(i + 2)] = forward_out_hidden
            # Gradients from hidden layer i+1 back to hidden layer i
            # Dropout backward
            dx = dropout_backward(dx, dropout_mask['h' + str(i) + '_2_' + 'h' + str(i + 1)])
            # ReLU backward
            dx = relu_backward(dx, forward_out['h' + str(i + 1) + '_2_' + 'h' + str(i + 2)])
            # Batch normalization backward
            dx, bn_dgamma['h' + str(i) + '_b_' + 'h' + str(i + 1)], bn_dbeta['h' + str(i) + '_b_' + 'h' + str(i + 1)] = BN_backward(dx, bn_cache['h' + str(i) + '_b_' + 'h' + str(i + 1)])
            # Affine backward
            dx, grad['W_h' + str(i + 1) + '_b_' + 'h' + str(i)], grad['b_h' + str(i + 1) + '_b_' + 'h' + str(i)] = affine_backward(dx, 
                                (forward_out['h' + str(i - 1) + '_2_' + 'h' + str(i)], 
                                 params['W_h' + str(i) + '_b_' + 'h' + str(i + 1)], 
                                 params['b_h' + str(i) + '_b_' + 'h' + str(i + 1)]))
            # Add the regularization term to this layer's weight gradient
            grad['W_h' + str(i + 1) + '_b_' + 'h' + str(i)] += reg * params['W_h' + str(i) + '_b_' + 'h' + str(i + 1)]
        forward_relu_out = forward_out['h' + str(0) + '_2_' + 'h' + str(1)]
    else:
        forward_relu_out = forward_out_hidden
    # Gradients from the first hidden layer back to the input layer
    # Dropout backward
    dx = dropout_backward(dx, dropout_mask['i2h'])
    # ReLU backward
    dx = relu_backward(dx, forward_relu_out)
    # Batch normalization backward
    dx, bn_dgamma['i_b_h'], bn_dbeta['i_b_h'] = BN_backward(dx, bn_cache['i_b_h'])
    dx, grad['W_h_b_i'], grad['b_h_b_i'] = affine_backward(dx, (X_batch, params['W_i_b_h'], params['b_i_b_h']))
    # Add the regularization term to the first-layer weight gradient
    grad['W_h_b_i'] += reg * params['W_i_b_h']
    # The backward pass is complete; we are back at the input layer
    # Update the weights and biases using the gradients and the learning rate
    # First update the input-to-first-hidden and last-hidden-to-output weights and biases
    if optimizer == 'Original':
        params['W_i_b_h'] -= learning_rate * grad['W_h_b_i']
        params['W_h_b_o'] -= learning_rate * grad['W_o_b_h']
        params['b_i_b_h'] -= learning_rate * grad['b_h_b_i']
        params['b_h_b_o'] -= learning_rate * grad['b_o_b_h']
        gamma['i_b_h'] -= learning_rate * bn_dgamma['i_b_h']
        beta['i_b_h'] -= learning_rate * bn_dbeta['i_b_h']
        learning_rate *= learning_rate_decay
    elif optimizer == 'Momentum':
        params['W_i_b_h'], config['W_i_b_h'] = SGD_Momentum(params['W_i_b_h'], grad['W_h_b_i'], config['W_i_b_h'])
        params['W_h_b_o'], config['W_h_b_o'] = SGD_Momentum(params['W_h_b_o'], grad['W_o_b_h'], config['W_h_b_o'])
        params['b_i_b_h'], config['b_i_b_h'] = SGD_Momentum(params['b_i_b_h'], grad['b_h_b_i'], config['b_i_b_h'])
        params['b_h_b_o'], config['b_h_b_o'] = SGD_Momentum(params['b_h_b_o'], grad['b_o_b_h'], config['b_h_b_o'])
        gamma['i_b_h'], config['gamma_i_b_h'] = SGD_Momentum(gamma['i_b_h'], bn_dgamma['i_b_h'], config['gamma_i_b_h'])
        beta['i_b_h'], config['beta_i_b_h'] = SGD_Momentum(beta['i_b_h'], bn_dbeta['i_b_h'], config['beta_i_b_h'])
    elif optimizer == 'RMSProp':
        params['W_i_b_h'], config['W_i_b_h'] = RMSProp(params['W_i_b_h'], grad['W_h_b_i'], config['W_i_b_h'])
        params['W_h_b_o'], config['W_h_b_o'] = RMSProp(params['W_h_b_o'], grad['W_o_b_h'], config['W_h_b_o'])
        params['b_i_b_h'], config['b_i_b_h'] = RMSProp(params['b_i_b_h'], grad['b_h_b_i'], config['b_i_b_h'])
        params['b_h_b_o'], config['b_h_b_o'] = RMSProp(params['b_h_b_o'], grad['b_o_b_h'], config['b_h_b_o'])
        gamma['i_b_h'], config['gamma_i_b_h'] = RMSProp(gamma['i_b_h'], bn_dgamma['i_b_h'], config['gamma_i_b_h'])
        beta['i_b_h'], config['beta_i_b_h'] = RMSProp(beta['i_b_h'], bn_dbeta['i_b_h'], config['beta_i_b_h'])
    elif optimizer == 'Adam':
        params['W_i_b_h'], config['W_i_b_h'] = Adam(params['W_i_b_h'], grad['W_h_b_i'], config['W_i_b_h'])
        params['W_h_b_o'], config['W_h_b_o'] = Adam(params['W_h_b_o'], grad['W_o_b_h'], config['W_h_b_o'])
        params['b_i_b_h'], config['b_i_b_h'] = Adam(params['b_i_b_h'], grad['b_h_b_i'], config['b_i_b_h'])
        params['b_h_b_o'], config['b_h_b_o'] = Adam(params['b_h_b_o'], grad['b_o_b_h'], config['b_h_b_o'])
        gamma['i_b_h'], config['gamma_i_b_h'] = Adam(gamma['i_b_h'], bn_dgamma['i_b_h'], config['gamma_i_b_h'])
        beta['i_b_h'], config['beta_i_b_h'] = Adam(beta['i_b_h'], bn_dbeta['i_b_h'], config['beta_i_b_h'])
    # If there is more than one hidden layer, also update the weights and biases between hidden layers
    if hidden_layers_num > 1:
        if optimizer == 'Original':
            for i in range(hidden_layers_num - 1):
                params['W_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] -= learning_rate * grad['W_h' + str(i + 2) + '_b_' + 'h' + str(i + 1)]
                params['b_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] -= learning_rate * grad['b_h' + str(i + 2) + '_b_' + 'h' + str(i + 1)]
                gamma['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] -= learning_rate * bn_dgamma['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)]
                beta['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] -= learning_rate * bn_dbeta['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)]
            learning_rate *= learning_rate_decay
        elif optimizer == 'Momentum':
            for i in range(hidden_layers_num - 1):
                params['W_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], config['W_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] = SGD_Momentum(params['W_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], 
                                 grad['W_h' + str(i + 2) + '_b_' + 'h' + str(i + 1)], 
                                 config['W_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)])
                params['b_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], config['b_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] = SGD_Momentum(params['b_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], 
                                 grad['b_h' + str(i + 2) + '_b_' + 'h' + str(i + 1)], 
                                 config['b_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)])
                gamma['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], config['gamma_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] = SGD_Momentum(gamma['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], 
                                 bn_dgamma['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], 
                                 config['gamma_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)])
                beta['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], config['beta_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] = SGD_Momentum(beta['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], 
                                 bn_dbeta['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], 
                                 config['beta_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)])
        elif optimizer == 'RMSProp':
            for i in range(hidden_layers_num - 1):
                params['W_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], config['W_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] = RMSProp(params['W_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], 
                                 grad['W_h' + str(i + 2) + '_b_' + 'h' + str(i + 1)], 
                                 config['W_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)])
                params['b_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], config['b_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] = RMSProp(params['b_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], 
                                 grad['b_h' + str(i + 2) + '_b_' + 'h' + str(i + 1)], 
                                 config['b_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)])
                gamma['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], config['gamma_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] = RMSProp(gamma['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], 
                                 bn_dgamma['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], 
                                 config['gamma_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)])
                beta['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], config['beta_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] = RMSProp(beta['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], 
                                 bn_dbeta['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], 
                                 config['beta_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)])
        elif optimizer == 'Adam':
            for i in range(hidden_layers_num - 1):
                params['W_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], config['W_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] = Adam(params['W_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], 
                                 grad['W_h' + str(i + 2) + '_b_' + 'h' + str(i + 1)], 
                                 config['W_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)])
                params['b_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], config['b_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] = Adam(params['b_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], 
                                 grad['b_h' + str(i + 2) + '_b_' + 'h' + str(i + 1)], 
                                 config['b_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)])
                gamma['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], config['gamma_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] = Adam(gamma['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], 
                                 bn_dgamma['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], 
                                 config['gamma_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)])
                beta['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], config['beta_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] = Adam(beta['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], 
                                 bn_dbeta['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], 
                                 config['beta_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)])
    if verbose:
        print(f"Iteration {it}/{num_iters}, loss {loss}")
    if it % iterations_per_epoch == 0:
        # Make a prediction to compute this iteration's training and validation accuracy
        # Training accuracy
        pred_out = affine_forward(X_batch, params['W_i_b_h'], params['b_i_b_h'])
        pred_out = (pred_out - bn_cache['i_b_h']['bn_param']['running_mean']) / np.sqrt(bn_cache['i_b_h']['bn_param']['running_var'])
        pred_out = gamma['i_b_h'] * pred_out + beta['i_b_h']
        pred_out = relu_forward(pred_out)
        if hidden_layers_num > 1:
            for i in range(hidden_layers_num - 1):
                pred_out = affine_forward(pred_out, 
                                          params['W_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], 
                                          params['b_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], )
                pred_out = (pred_out - bn_cache['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)]['bn_param']['running_mean']) / \
                                          np.sqrt(bn_cache['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)]['bn_param']['running_var'])
                pred_out = gamma['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] * pred_out + beta['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)]
                pred_out = relu_forward(pred_out)
        pred_out = affine_forward(pred_out, params['W_h_b_o'], params['b_h_b_o'])
        pred_out = np.argmax(pred_out, axis = 1)
        train_acc = np.mean(pred_out == y_batch)
        train_history.append(train_acc)
        # Validation accuracy
        valpred_out = affine_forward(X_val, params['W_i_b_h'], params['b_i_b_h'])
        valpred_out = (valpred_out - bn_cache['i_b_h']['bn_param']['running_mean']) / np.sqrt(bn_cache['i_b_h']['bn_param']['running_var'])
        valpred_out = gamma['i_b_h'] * valpred_out + beta['i_b_h']
        valpred_out = relu_forward(valpred_out)
        if hidden_layers_num > 1:
            for i in range(hidden_layers_num - 1):
                valpred_out = affine_forward(valpred_out, 
                                          params['W_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], 
                                          params['b_h' + str(i + 1) + '_b_' + 'h' + str(i + 2)], )
                valpred_out = (valpred_out - bn_cache['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)]['bn_param']['running_mean']) / \
                                          np.sqrt(bn_cache['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)]['bn_param']['running_var'])
                valpred_out = gamma['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)] * valpred_out + beta['h' + str(i + 1) + '_b_' + 'h' + str(i + 2)]
                valpred_out = relu_forward(valpred_out)
        valpred_out = affine_forward(valpred_out, params['W_h_b_o'], params['b_h_b_o'])
        valpred_out = np.argmax(valpred_out, axis = 1)
        val_acc = np.mean(valpred_out == y_val)
        val_history.append(val_acc)
        print(f'Training accuracy {train_acc}, validation accuracy {val_acc}')
    
# Plot the results
plt.figure(1)
plt.plot(loss_history)
plt.title('Training Loss')
plt.figure(2)
plt.plot(train_history, label = 'Training Accuracy')
plt.legend()
plt.title('Accuracy')
plt.plot(val_history, label = 'Validation Accuracy')
plt.legend()