摘要
剛開始學習 DL 時,會接觸到各種代價函數與各種分佈;這裏分享一個用 NN 實現 XOR 的例子。
關鍵在於:XOR 不是線性可分的,因此網絡必須使用非線性 Activation(如 Sigmoid / ReLU / Tanh)才能進行正確分類。
輸入輸出
# XOR truth table: one row per sample, two binary input features
X = np.array([
    [0, 0],
    [0, 1],
    [1, 0],
    [1, 1],
])
# Expected XOR output for each row of X, stored as a column vector
y = np.array([
    [0],
    [1],
    [1],
    [0],
])
代價函數 J(Θ)
MSE
作爲最傳統的代價函數,MSE 形式簡單,效果也穩定。
Cross Entropy
引用DL書中的一句話來理解交叉熵:
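可以直觀地理解到:CE 與預測誤差緊密相關——預測值 $h$ 越接近標籤 $y$,CE 越小。對二分類問題,$J(\Theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[y^{(i)}\log h^{(i)} + (1-y^{(i)})\log\left(1-h^{(i)}\right)\right]$。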
而且在使用梯度下降時,偏導數也非常好求,參考以下文章求CE的偏導數:
http://www.cnblogs.com/python27/p/MachineLearningWeek05.html
示例代碼
參考以下博客:http://www.cnblogs.com/Belter/p/6711160.html
代碼中用到的 CE 偏導數推導,見以下文章:
http://www.cnblogs.com/python27/p/MachineLearningWeek05.html
import numpy as np
import matplotlib.pyplot as plt
from scipy.special import expit
# Neural network for XOR: layer sizes
HIDDEN_LAYER_SIZE = 2
INPUT_LAYER = 2  # number of input features
NUM_LABELS = 1  # number of output units

# XOR truth table: one row per sample, two binary input features
X = np.array([
    [0, 0],
    [0, 1],
    [1, 0],
    [1, 1],
])
# Expected XOR output for each row of X, stored as a column vector
y = np.array([
    [0],
    [1],
    [1],
    [0],
])
# Initialise weights so that every entry lies in [-epsilon, epsilon)
def rand_initialize_weights(L_in, L_out, epsilon):
    """Return a random weight matrix for one fully connected layer.

    The layer has ``L_in`` incoming and ``L_out`` outgoing connections.
    The result has shape ``(L_out, 1 + L_in)``; the extra first column
    holds the weights applied to the bias unit.

    Entries are drawn with ``np.random.rand`` and rescaled from
    ``[0, 1)`` to ``[-epsilon, epsilon)``.
    """
    shape = (L_out, L_in + 1)
    unit_draws = np.random.rand(*shape)
    return unit_draws * 2 * epsilon - epsilon
def activate(x):
    """Apply the network's activation function to ``x``.

    Delegates to ``sigmoid``.
    """
    activated = sigmoid(x)
    return activated
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``.

    Works element-wise on arrays; computed with ``scipy.special.expit``.
    """
    squashed = expit(x)
    return squashed
def sigmoid_gradient(z):
    """Return the derivative of the sigmoid at ``z``, element-wise.

    Uses the identity ``s'(z) = s(z) * (1 - s(z))``.
    """
    s = sigmoid(z)
    return np.multiply(s, 1 - s)
# Cross-entropy cost (and its gradients) for the binary-classification network
def nn_cost_function(theta1, theta2, X, y):
    """Compute the cross-entropy cost and its gradients for a 3-layer sigmoid net.

    The network is input -> sigmoid hidden layer -> sigmoid output layer.
    A bias unit (constant 1) is prepended to the input and to the hidden
    activations, so the first column of each weight matrix is the bias weight.

    Args:
        theta1: Hidden-layer weights, shape ``(n_hidden, 1 + n_in)``.
        theta2: Output-layer weights, shape ``(n_out, 1 + n_hidden)``.
        X: Samples, shape ``(m, n_in)``, one sample per row.
        y: Targets in {0, 1}, shape ``(m, n_out)``.

    Returns:
        A dict with:
            ``'theta1_grad'``: dJ/dtheta1 averaged over the samples
                (no regularisation term), same shape as ``theta1``.
            ``'theta2_grad'``: dJ/dtheta2 averaged over the samples,
                same shape as ``theta2``.
            ``'J'``: mean cross-entropy cost over the samples.
            ``'h'``: network predictions, shape ``(m, n_out)``.
    """
    theta1 = np.asarray(theta1, dtype=float)
    theta2 = np.asarray(theta2, dtype=float)
    X = np.asarray(X, dtype=float)
    m = X.shape[0]
    y = np.asarray(y, dtype=float).reshape(m, -1)

    # Forward pass for all samples at once (rows are samples).
    bias = np.ones((m, 1))
    a_1 = np.hstack((bias, X))  # m x (1 + n_in): input plus bias unit
    hidden = expit(np.dot(a_1, theta1.T))  # m x n_hidden: hidden activations
    a_2 = np.hstack((bias, hidden))  # m x (1 + n_hidden): hidden plus bias unit
    h_total = expit(np.dot(a_2, theta2.T))  # m x n_out: predictions

    # Backward pass (chain rule). With a sigmoid output and cross-entropy
    # cost, the output-layer error reduces to (prediction - target).
    delta_3 = h_total - y  # m x n_out
    # Hidden-layer error, bias unit excluded; sigmoid'(z) = a * (1 - a).
    delta_2 = np.dot(delta_3, theta2[:, 1:]) * hidden * (1 - hidden)

    # Gradients averaged over the samples.
    theta1_grad = (1.0 / m) * np.dot(delta_2.T, a_1)
    theta2_grad = (1.0 / m) * np.dot(delta_3.T, a_2)

    # Keep predictions strictly inside (0, 1) for the logarithms only, so a
    # saturated output yields a large finite cost instead of NaN/inf.
    tiny = np.finfo(float).eps
    h_safe = np.clip(h_total, tiny, 1 - tiny)
    J = (1.0 / m) * np.sum(-y * np.log(h_safe) - (1 - y) * np.log(1 - h_safe))

    return {
        'theta1_grad': theta1_grad,
        'theta2_grad': theta2_grad,
        'J': J,
        'h': h_total
    }
# Random starting weights for both layers, drawn from [-1, 1)
theta1 = rand_initialize_weights(INPUT_LAYER, HIDDEN_LAYER_SIZE, epsilon=1)  # 2*3
theta2 = rand_initialize_weights(HIDDEN_LAYER_SIZE, NUM_LABELS, epsilon=1)  # 1*3

iter_times = 10000  # earlier problem no. 2: too few iterations
alpha = 0.5  # earlier problem no. 3: learning rate too small
result = {'J': [], 'h': []}
theta_s = {}

for i in range(iter_times):
    cost_fun_result = nn_cost_function(theta1=theta1, theta2=theta2, X=X, y=y)
    theta1_g = cost_fun_result['theta1_grad']
    theta2_g = cost_fun_result['theta2_grad']
    J = cost_fun_result['J']
    h_current = cost_fun_result['h']

    # Record the cost and predictions computed with the pre-update weights
    result['J'].append(J)
    result['h'].append(h_current)

    # Gradient-descent step, updating the weight matrices in place
    theta1 -= alpha * theta1_g
    theta2 -= alpha * theta2_g

# Cost per iteration, then the predictions from the first and last iteration
plt.plot(result['J'])
plt.show()
print(result['h'][0], result['h'][-1])
使用Keras實現
# Implementation of XOR using NN in Keras
import numpy as np
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, Activation
def main():
    """Train a small Keras network on the XOR truth table and plot its loss.

    Builds a 2-input network with a 32-unit ReLU hidden layer and a single
    sigmoid output, trains it with SGD on binary cross-entropy for 10000
    epochs, then shows a scatter plot of the loss per epoch.
    """
    # XOR truth table and the matching 0/1 labels
    X = np.array([[0, 0], [1, 0], [0, 1], [1, 1]])
    y = np.array([0, 1, 1, 0])

    # Same layer stack as repeated model.add(...) calls, given as a list
    model = Sequential([
        Dense(32, input_shape=(2, )),
        Activation('relu'),
        Dense(1),
        Activation('sigmoid'),
    ])
    model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])

    hist = model.fit(x=X, y=y, epochs=10000)

    # One point per epoch
    losses = hist.history['loss']
    plt.scatter(range(len(losses)), losses)
    plt.show()
# Run the Keras XOR demo only when this file is executed as a script
if __name__ == "__main__":
    main()