# 神經網絡學習（四）

## 假設

• 樣本維度是2維
• 分類結果有三類
• 一共兩個樣本$(x_{11},x_{12},y_1),(x_{21},x_{22},y_2)$

## 公式推導

### 前向傳播

\begin{aligned} z_1 &= \omega_{11}x_1+ \omega_{12}x_2+b_1 \\ z_2 &= \omega_{21}x_1+ \omega_{22}x_2+b_2 \\ z_3 &= \omega_{31}x_1+ \omega_{32}x_2+b_3 \\ a_1 &= \frac{e^{z_1}}{e^{z_1}+e^{z_2}+e^{z_3}} \\ a_2 &= \frac{e^{z_2}}{e^{z_1}+e^{z_2}+e^{z_3}} \\ a_3 &= \frac{e^{z_3}}{e^{z_1}+e^{z_2}+e^{z_3}} \\ \end{aligned}

\begin{aligned} z_1 &= \omega_{11}x_1+ \omega_{12}x_2+b_1 \\ z_2 &= \omega_{21}x_1+ \omega_{22}x_2+b_2 \\ z_3 &= \omega_{31}x_1+ \omega_{32}x_2+b_3 \\ a_1 &= \frac{1}{1+e^{-z_1}} \\ a_2 &= \frac{1}{1+e^{-z_2}} \\ a_3 &= \frac{1}{1+e^{-z_3}} \\ \end{aligned}

\begin{aligned} E=-\left(y_1\log a_1+y_2\log a_2 +y_3\log a_3\right) \end{aligned}

\begin{aligned} E=\frac{1}{2}\left[(y_1-a_1)^2+(y_2-a_2)^2+(y_3-a_3)^2\right] \end{aligned}

### 反向傳播

\begin{aligned} \frac{\partial E}{\partial \omega_{11}}=-(y_1-a_1)a_1(1-a_1)x_{11} \end{aligned}
**我們要明確，在使用平方誤差損失函數時我們的目的是什麼？**我們是想知道$\nabla \omega_{11}$和誤差$y_1-a_1$的關係，即$\nabla \omega_{11}=f(y_1-a_1)$

$A=|y_1-a_1|$
$y_1=1$時，$A=y_1-a_1$，即$a_1=y_1-A$，代入上式得
\begin{aligned} \frac{\partial E}{\partial \omega_{11}}=-A(y_1-A)(1-(y_1-A))x_{11}=-A^2(1-A)x_{11} \end{aligned}
$y_1=0$時，$A=a_1$，代入上式得
\begin{aligned} \frac{\partial E}{\partial \omega_{11}}=-(-A)A(1-A)x_{11}=A^2(1-A)x_{11} \end{aligned}

# %matplotlib inline  # IPython magic — only valid inside a notebook, not plain Python
import matplotlib.pyplot as plt
import numpy as np

# Plot the squared-error gradient magnitude f(A) = A^2 * (1 - A) against the
# absolute error A = |y - a|: the gradient vanishes as A approaches 1, which
# is why learning stalls when the output is badly wrong.
A = np.linspace(0, 1, 100)
plt.plot(A, A ** 2 * (1 - A))
plt.xlabel("|error|")
# Raw strings: "\d" in a normal literal is an invalid escape sequence
# (SyntaxWarning on modern Python); r"..." keeps the same string value.
plt.ylabel(r"$\delta w_{11}$")
plt.title(r"$\delta w_{11}$=f(A)")
plt.show()


\begin{aligned} \frac{\partial E}{\partial \omega_{11}}=a_1-y_1 \end{aligned}
$y_1=1$時，$A=y_1-a_1$，即$a_1=y_1-A$，代入上式得:
\begin{aligned} \frac{\partial E}{\partial \omega_{11}}=-A \end{aligned}
$y_1=0$時，$A=a_1$，代入上式得:
\begin{aligned} \frac{\partial E}{\partial \omega_{11}}=A \end{aligned}

## 實驗驗證

#### 交叉熵代價函數

import numpy as np
from sklearn.preprocessing import LabelBinarizer  # 標籤二值化
from sklearn.model_selection import train_test_split  # 切割數據,交叉驗證法
import matplotlib.pyplot as plt

def sigmoid(x):
    """Logistic activation: maps any real input (scalar or array) into (0, 1)."""
    z = np.exp(-x)
    return 1 / (1 + z)

def dsigmoid(x):
    """Sigmoid derivative, expressed in terms of the sigmoid OUTPUT x: s'(z) = s(z)(1 - s(z))."""
    return (1 - x) * x

def softmax(x):
    """Softmax over the first row of a 2-D array.

    Args:
        x: array-like of shape (1, n) (only row 0 is used, matching the
           original implementation).

    Returns:
        2-D array of shape (1, n) whose entries are non-negative and sum to 1.
    """
    row = np.asarray(x)[0]
    # Subtract the max before exponentiating: softmax is invariant to a
    # constant shift, and this prevents overflow for large logits.
    e = np.exp(row - np.max(row))
    # Vectorized NumPy replaces the original Python loops (which also
    # shadowed the builtin `sum`).
    return np.atleast_2d(e / e.sum())

class NeuralNetwork:
    """Two-layer network (input -> hidden -> output) trained by SGD.

    The output layer uses softmax and the update follows the cross-entropy
    gradient, so the output delta is simply (target - output).

    NOTE(review): `train` reads the module-level globals `Error`, `X_test`
    and `y_test` — confirm they are defined before calling.
    """

    def __init__(self, layers):  # e.g. (64, 100, 10)
        # Random weights in [-1, 1); the extra row/column carries the bias.
        self.V = np.random.random((layers[0] + 1, layers[1] + 1)) * 2 - 1
        self.W = np.random.random((layers[1] + 1, layers[2])) * 2 - 1

    def train(self, X, y, lr=0.11, epochs=10000):
        # Append a bias column of ones to every input row.
        padded = np.ones([X.shape[0], X.shape[1] + 1])
        padded[:, 0:-1] = X
        X = padded

        for step in range(epochs + 1):
            # Stochastic gradient descent: pick one random training sample.
            idx = np.random.randint(X.shape[0])
            sample = np.atleast_2d([X[idx]])

            # Forward pass: sigmoid hidden layer, softmax output layer.
            hidden = sigmoid(np.dot(sample, self.V))
            output = softmax(np.dot(hidden, self.W))

            # Record |error| of the first output unit in the global list
            # (used for plotting elsewhere in the document).
            Error.append(np.abs(y[idx][0] - output[0][0]))

            # Backward pass: cross-entropy + softmax yields (target - output).
            out_delta = y[idx] - output
            hid_delta = out_delta.dot(self.W.T) * dsigmoid(hidden)

            # Gradient-ascent-style update on both weight matrices.
            self.W += lr * hidden.T.dot(out_delta)
            self.V += lr * sample.T.dot(hid_delta)

            # Learning-rate decay kicks in only past 40000 steps
            # (dead with the default epochs=10000).
            if step > 40000:
                lr = lr * 0.99

            # Report test accuracy every 1000 steps (globals X_test/y_test).
            if step % 1000 == 0:
                guesses = []
                for j in range(X_test.shape[0]):
                    # argmax over the output scores picks the predicted label.
                    scores = self.predict(X_test[j])
                    guesses.append(np.argmax(scores))
                accuracy = np.mean(np.equal(guesses, y_test))
                print('迭代次數：', step, '準確率：', accuracy)

    def predict(self, x):
        # Append the bias term, promote to a 2-D row, then forward-pass.
        padded = np.ones([x.shape[0] + 1])
        padded[0:-1] = x
        row = np.atleast_2d(padded)

        hidden = sigmoid(np.dot(row, self.V))
        return softmax(np.dot(hidden, self.W))
# Load the 8x8 digits dataset. BUG FIX: `digits` was used without ever being
# defined — it must come from sklearn.
from sklearn.datasets import load_digits

digits = load_digits()
X = digits.data
Y = digits.target
# Normalize inputs to [0, 1]: large raw values would saturate the sigmoid
# (outputs pinned near 1), which stalls learning.
X -= X.min()
X /= X.max()

NN = NeuralNetwork([64, 80, 10])
# Split into train/test sets.
X_train, X_test, y_train, y_test = train_test_split(X, Y)
# Binarize labels: decimal class index -> one-hot vector.
labels_train = LabelBinarizer().fit_transform(y_train)
labels_test = LabelBinarizer().fit_transform(y_test)
# `global` is a no-op at module level — a plain assignment suffices.
Error = []
print('開始訓練')
NN.train(X_train, labels_train, epochs=40000)
print('訓練結束')


#### 平方誤差代價函數

import numpy as np
from sklearn.preprocessing import LabelBinarizer  # 標籤二值化
from sklearn.model_selection import train_test_split  # 切割數據,交叉驗證法
import matplotlib.pyplot as plt

def sigmoid(x):
    """Logistic activation: squashes a scalar or array elementwise into (0, 1)."""
    neg_exp = np.exp(-x)
    return 1 / (1 + neg_exp)

def dsigmoid(x):
    """Derivative of the sigmoid given the already-activated value x."""
    return (1 - x) * x

def softmax(x):
    """Softmax over the first row of a 2-D array.

    Args:
        x: array-like of shape (1, n) (only row 0 is used, matching the
           original implementation).

    Returns:
        2-D array of shape (1, n): non-negative entries summing to 1.
    """
    row = np.asarray(x)[0]
    # Max-shift for numerical stability; softmax is shift-invariant.
    e = np.exp(row - np.max(row))
    # Vectorized NumPy replaces the original Python loops (which also
    # shadowed the builtin `sum`).
    return np.atleast_2d(e / e.sum())

class NeuralNetwork:
    """Two-layer network trained with squared-error loss and sigmoid outputs.

    The output delta includes the sigmoid derivative factor, which is what
    slows learning when the error is large (the point of this experiment).

    NOTE(review): `train` reads the module-level globals `Error`, `X_test`
    and `y_test` — confirm they are defined before calling.
    """

    def __init__(self, layers):  # e.g. (64, 100, 10)
        # Random weights in [-1, 1); the extra row/column carries the bias.
        self.V = np.random.random((layers[0] + 1, layers[1] + 1)) * 2 - 1
        self.W = np.random.random((layers[1] + 1, layers[2])) * 2 - 1

    def train(self, X, y, lr=0.11, epochs=10000):
        # Append a bias column of ones to every input row.
        temp = np.ones([X.shape[0], X.shape[1] + 1])
        temp[:, 0:-1] = X
        X = temp

        for n in range(epochs + 1):
            # Stochastic gradient descent: one random sample per step.
            i = np.random.randint(X.shape[0])
            x = np.atleast_2d([X[i]])

            # Forward pass: sigmoid activations on BOTH layers.
            L1 = sigmoid(np.dot(x, self.V))
            L2 = sigmoid(np.dot(L1, self.W))

            # Squared-error gradient through the sigmoid output:
            # delta = (target - output) * sigma'(output).
            L2_delta = (y[i] - L2) * dsigmoid(L2)
            L1_delta = L2_delta.dot(self.W.T) * dsigmoid(L1)
            # Record |error| of the first output unit (global list, for plots).
            Error.append(np.abs(y[i][0] - L2[0][0]))

            # Weight updates.
            self.W += lr * L1.T.dot(L2_delta)
            self.V += lr * x.T.dot(L1_delta)
            # Learning-rate decay past 40000 steps (dead with default epochs).
            if n > 40000:
                lr = lr * 0.99
            # Report test accuracy every 1000 steps (globals X_test/y_test).
            if n % 1000 == 0:
                predictions = []
                for j in range(X_test.shape[0]):
                    # argmax over the output scores picks the predicted label.
                    o = self.predict(X_test[j])
                    predictions.append(np.argmax(o))
                accuracy = np.mean(np.equal(predictions, y_test))
                print('迭代次數：', n, '準確率：', accuracy)

    def predict(self, x):
        # Append the bias term, promote to a 2-D row, then forward-pass.
        temp = np.ones([x.shape[0] + 1])
        temp[0:-1] = x
        x = np.atleast_2d(temp)

        L1 = sigmoid(np.dot(x, self.V))
        # BUG FIX: was softmax, but this network is trained with a sigmoid
        # output layer — use sigmoid for consistent probabilities. argmax
        # (and hence the reported accuracy) is unchanged, since both are
        # monotone in the logits.
        L2 = sigmoid(np.dot(L1, self.W))
        return L2
# Load the 8x8 digits dataset. BUG FIX: `digits` was used without ever being
# defined — it must come from sklearn.
from sklearn.datasets import load_digits

digits = load_digits()
X = digits.data
Y = digits.target
# Normalize inputs to [0, 1]: large raw values would saturate the sigmoid
# (outputs pinned near 1), which stalls learning.
X -= X.min()
X /= X.max()

NN = NeuralNetwork([64, 80, 10])
# Split into train/test sets.
X_train, X_test, y_train, y_test = train_test_split(X, Y)
# Binarize labels: decimal class index -> one-hot vector.
labels_train = LabelBinarizer().fit_transform(y_train)
labels_test = LabelBinarizer().fit_transform(y_test)
# `global` is a no-op at module level — a plain assignment suffices.
Error = []