深度學習模型的訓練過程實際上就是在進行參數估計,待估參數是網絡模型的權值參數。
線性迴歸模型的目標函數實際上是關於參數theta的二次函數,因而必然是凸函數;在設計矩陣滿秩(或加入L2正則項)時,存在唯一的全局最小值點。因此只要訓練設置得當(學習率足夠小、迭代次數足夠多),各種梯度類方法(不同的學習率、batch size)都會收斂到同一個全局最小值點;但若學習率過大,迭代仍可能發散而無法收斂。
梯度下降法
1.隨機梯度下降法
對於訓練數據集(規模爲m)中的每個訓練樣本,都會更新一次參數值,相當於batch size=1
2.批梯度下降法
將整個訓練數據集在參數上得到的梯度平均值,對參數更新,batch size=m
3.mini-batch 小批量梯度下降法
batch size設定,將在若干個樣本上計算得到的梯度平均值作爲梯度,更新參數。
import os
import numpy as np
class train_linear_reg():
    """Ridge (L2-regularized) linear regression solved in closed form.

    The model is ``y ~ X @ theta``; training minimizes

        0.5 * ||X @ theta - y||^2 + 0.5 * lam * ||theta||^2

    via the normal equations (see ``min_square``).
    """

    def __init__(self, x_train, y_train, lam):
        """
        :param x_train: training design matrix, shape (m, d)
        :param y_train: training targets, shape (m, 1)
        :param lam: L2 regularization strength (lambda >= 0)
        """
        self.x_train = x_train
        self.y_train = y_train
        self.lam = lam
        self.num_train = self.x_train.shape[0]
        self.num_feat = self.x_train.shape[1]
        # Random init, only used until min_square() (or a caller) sets theta.
        # Bug fix: the original referenced the bare name `num_feat`, which
        # only worked by accident when a global of that name existed.
        self.theta = np.random.randn(self.num_feat).reshape(self.num_feat, 1)

    def min_square(self):
        """Solve the ridge normal equations.

        theta = (X^T X + lam * I)^{-1} X^T y

        :return: the fitted parameter vector, shape (d, 1)
        """
        gram = np.dot(self.x_train.T, self.x_train) + self.lam * np.eye(self.num_feat)
        # np.linalg.solve is numerically preferable to forming the inverse
        # explicitly, and lam > 0 guarantees the system is non-singular.
        self.theta = np.linalg.solve(gram, np.dot(self.x_train.T, self.y_train))
        return self.theta

    def cal_Loss(self, x, y):
        """Regularized squared-error loss of the current theta on (x, y).

        :param x: design matrix, shape (n, d)
        :param y: targets, shape (n, 1)
        :return: 0.5 * (||x @ theta - y||^2 + lam * ||theta||^2), a (1, 1) array
        """
        residual = np.dot(x, self.theta) - y
        loss = np.dot(residual.T, residual)
        # Bug fix: the original added the penalty term un-weighted; the ridge
        # objective (and the gradient in the removed SGD code, lam * theta)
        # weights it by lam.
        loss += self.lam * np.dot(self.theta.T, self.theta)
        return loss / 2

    def pred(self, test_x, test_y):
        """Evaluate the loss of the (fitted) model on a held-out set."""
        return self.cal_Loss(test_x, test_y)
if __name__=='__main__':
data_path='F:\\machine_learning\\yaling\\hw_2'
with open(os.path.join(data_path,'train-matrix.txt'),'r') as train_file:
all_lines=train_file.readlines()
# print(type(all_lines[0])) str
num_train=int(all_lines[0].strip())
num_feat=int(all_lines[1].strip())
# print(num_train,num_feat)
train_matrix=np.zeros((num_train,num_feat))
train_label=np.zeros((num_train,1))
for i in range(num_train):
train_matrix[i,:]=np.array(list(map(float,all_lines[i+2].split())))
train_label[i,0]=float(all_lines[2+num_train+i])
# print(train_matrix[0][0],train_label[0,0])
# 進行10折交叉驗證,實際上進行交叉驗證所使用的數據集是不包含測試數據集的
# 將訓練數據集均等地劃分成10個等分,每次隨機取出1等分作爲驗證集,剩下9分作爲訓練集
# 對於每個當前的超參數,都需要進行10次的訓練,這樣最終對於每個超參數將會得到10個在驗證數據集上面的評估結果
# 將10個評估結果取平均值就是當前超參數的驗證數據集上的準確率
# 挑選出在驗證集上準確率最高的超參數作爲最優的超參數
# 給出在測試集上的評估結果作爲當前模型(不考慮具體超參數設置)的泛化性能指標
lam_list=[0.0125, 0.025, 0.05, 0.1, 0.2]
# lam_list = [0.1]
# epoch_num=1000
k=10
example_per_flod=num_train//k
best_solver=None
pred_error=float('inf')
# lr=1e-8
# batch_size=100
for lam in lam_list:
temp_lam_error=0.0
for flod in range(k):
# print(train_matrix[:flod*example_per_flod,:].shape)
# print(train_matrix[(flod+1)*example_per_flod:,:].shape)
if flod==0:
temp_train_x = train_matrix[(flod + 1) * example_per_flod:, :]
temp_train_y = train_label[(flod + 1) * example_per_flod:, :]
elif flod==k-1:
temp_train_x = train_matrix[:flod * example_per_flod, :]
temp_train_y = train_label[:flod * example_per_flod, :]
else:
temp_train_x = np.concatenate(
(train_matrix[:flod * example_per_flod, :], train_matrix[(flod + 1) * example_per_flod:, :]),
axis=0)
temp_train_y = np.concatenate(
(train_label[:flod * example_per_flod, :], train_label[(flod + 1) * example_per_flod:, :]), axis=0)
temp_test_x=train_matrix[flod*example_per_flod:(flod+1)*example_per_flod,:]
temp_test_y=train_label[flod*example_per_flod:(flod+1)*example_per_flod,:]
# print('here',temp_train_y.shape,temp_train_x.shape)
# print(temp_train_y)
temp_solver=train_linear_reg(temp_train_x,temp_train_y,lam)
# for epoch in range(epoch_num):
# epoch_loss,_=temp_solver.train_an_epoch()
#
# print('epoch:%d,train epoch_loss:%.8f'%(epoch,epoch_loss))
temp_solver.min_square()
temp_lam_error+=temp_solver.pred(temp_test_x,temp_test_y)
temp_pred_error=temp_lam_error/k
print('lam:%.6f,valid loss:%.8f' % (lam, temp_pred_error))
if temp_pred_error<pred_error:
pred_error=temp_pred_error
best_solver=temp_solver
# lam:0.012500,valid loss:2159.87260888
# lam:0.025000,valid loss:2159.87141162
# lam:0.050000,valid loss:2159.86902188
# lam:0.100000,valid loss:2159.86426159
# lam:0.200000,valid loss:2159.85481769
print(best_solver.theta,best_solver.lam)
# [[13.19911254]
# [-9.31965636]
# [ 9.04841504]
# [12.40358965]
# [ 4.98294389]
# [-3.25304597]
# [-4.01073318]
# [12.87214272]
# [-4.9180319 ]
# [-7.32203696]] 0.2
new_solver=train_linear_reg(train_matrix,train_label,best_solver.lam)
# 讀取測試數據和測試數據集的標籤
with open(os.path.join(data_path,'test-matrix.txt'),'r') as test_file:
all_lines=test_file.readlines()
# print(type(all_lines[0])) str
num_test=int(all_lines[0].strip())
num_feat=int(all_lines[1].strip())
# print(num_train,num_feat)
test_matrix=np.zeros((num_test,num_feat))
test_label=np.zeros((num_test,1))
for i in range(num_test):
test_matrix[i,:]=np.array(list(map(float,all_lines[i+2].split())))
test_label[i,0]=float(all_lines[2+num_test+i])
test_err=new_solver.pred(test_matrix,test_label)
print('test_err:',test_err)
with open(os.path.join(data_path,'true-beta.txt'),'r') as t:
all_lines=t.readlines()
num_feat=int(all_lines[0])
true_beta=np.zeros((num_feat,1))
for j in range(num_feat):
true_beta[j,:]=float(all_lines[j+1])
distance=true_beta-new_solver.theta
distance=np.dot(distance.T,distance)
print('distance',distance)
# test_err: [[2628296.29671165]]
# distance [[816.7090605]]
# 使用貪婪算法所得到的最優beta值
A=[]
beta=np.zeros((num_feat,1))
for temp_k in range(6):
error=np.dot(train_matrix,beta)-train_label
all_samples=np.abs(np.dot(train_matrix.T,error))
temp_i=np.argmax(all_samples,axis=0)
if temp_i not in A:
A.append(temp_i)
temp_x=train_matrix.copy()
used_x=np.zeros((num_train,0))
# for i in range(num_feat):
# if i not in A:
# beta[i,:]=0
# else:
used_list=[]
for i in A:
used_list.append(train_matrix[:,i])
used_x=np.concatenate(used_list,axis=1)
print('used_x',used_x.shape)
used_beta=np.linalg.inv(np.dot(used_x.T,used_x))
mid=np.dot(used_x.T,train_label)
used_beta=np.dot(used_beta,mid)
used_beta=used_beta.tolist()
for p in range(num_feat):
if p not in A:
beta[p,:]=0
else:
beta[p,:]=used_beta.pop(0)
print(beta)
here_solver=train_linear_reg(train_matrix,train_label,0)
here_solver.theta=beta
print(here_solver.pred(test_matrix,test_label))