Machine Learning: Logistic Regression

Thanks to:
https://blog.csdn.net/lu597203933/article/details/38468303

https://www.bilibili.com/video/av10590361/?p=31&t=176  logistic regression chapter

and Machine Learning in Action, Chapter 5

 

'''
Logistic regression: despite the name, it solves a classification problem.
Gradient descent is used to find the minimum point of the loss function.
The regression coefficients are refined through iterative updates; this parameter-update
process of the logistic regression classifier is the model's learning/training process,
and can equally be seen as regressing toward the best coefficients.
'''
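# Summary of the formulas implemented below (added for reference; it matches the
# forward() method further down):
#   prediction:   p = sigmoid(X @ w) = 1 / (1 + exp(-X @ w))
#   loss:         L = -mean( y*log(p) + (1-y)*log(1-p) )
#   gradient:     dL/dw = X^T (p - y)
#   update:       w <- w - lr * dL/dw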
import numpy as np
import os
import matplotlib.pyplot as plt
import random
def Read_Data(data_path):
    '''
    Read the whitespace-separated txt file at the given path and convert it into numpy arrays.
    :param data_path: full path to the data file
    :return: data_array [num_samples, num_feat], label_array [num_samples]
    '''
    file_obj=open(data_path)
    all_lines=file_obj.readlines()
    num_samples=len(all_lines)
    num_feat=len(all_lines[0].split())-1
    data_array=np.zeros((num_samples,num_feat))
    label_array=np.zeros((num_samples))
    # print(data_array.shape)
    for i,line in enumerate(all_lines):
        line=line.split()
        for j in range(num_feat):
            # float() parses a leading '-' directly, so no special case for negative values is needed
            data_array[i][j]=float(line[j])
        label_array[i]=float(line[-1])
    file_obj.close()
    return data_array,label_array
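# Assumed file layout (not spelled out in the original post): every line of the Ch05 txt
# files holds whitespace-separated feature values followed by a 0/1 class label,
# e.g. a line such as "-0.017612  14.053064  0" becomes one row of data_array plus one
# entry of label_array.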

class logistic_reg(object):
    def __init__(self,num_feat):
        # self.weight=np.random.rand(num_feat).reshape(num_feat,1)
        self.weight = np.ones(num_feat).reshape(num_feat, 1)
        # self.bias=np.ones((1))
    def forward(self,train_data,train_label,lr,is_training):
        # Use the predictions from the forward pass together with the training labels to
        # compute the loss, then update the model parameters.
        '''
        :param train_data: numpy.array = [num_samples, num_feat]
        :return: pred when is_training is False, otherwise (pred, loss)
        '''
        # train_data=np.concatenate((train_data,np.ones((train_data.shape[0],1))),axis=1)
        # prepend a column of ones so that weight[0] acts as the bias term
        train_data=np.concatenate((np.ones((train_data.shape[0],1)),train_data),axis=1)

        pred=np.dot(train_data,self.weight)
        # pred shape [num_samples,1]
        pred=1+np.exp(-pred)
        pred=1/pred
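        # note (added): the two lines above compute sigmoid(X @ w); for scores with a large
        # magnitude np.exp(-pred) can overflow, and scipy.special.expit is a numerically
        # safer drop-in sigmoid if that ever becomes an issue here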
        # forward pass: predicted probabilities under the current regression coefficients

        if not is_training:# in evaluation mode, return the predicted probabilities directly
            return pred
        else:
            log_fore=-np.log(pred)# cross-entropy term for samples whose ground-truth label is 1
            log_back=-np.log(1-pred)# cross-entropy term for samples whose ground-truth label is 0

            if len(train_label.shape)==1:
                train_label=np.expand_dims(train_label,axis=1)

            loss=np.sum(log_fore[np.where(train_label==1)])+np.sum(log_back[np.where(train_label==0)])
            loss/=train_data.shape[0]
            # loss computation finished

            # gradient descent: for sigmoid + cross-entropy the gradient of the loss
            # with respect to the weights reduces to X^T (pred - label)
            error=pred-train_label
            dW=np.dot(train_data.T,error)
            # dW = [num_feat,1]
            # dW/=train_data.shape[0]
            db=np.mean(pred-train_label)

            self.weight-=lr*dW
            # self.bias-=lr*db
            return pred,loss
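# Why error = pred - label (derivation sketch, added for reference): with p = sigmoid(x . w)
# and per-sample loss L = -[ y*log(p) + (1-y)*log(1-p) ],
#   dL/dw = (dL/dp)*(dp/dz)*(dz/dw) = ((p - y) / (p*(1-p))) * p*(1-p) * x = (p - y) * x,
# so stacking all samples gives dW = X^T (pred - label), which is what forward() computes.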

if __name__=='__main__':
    data_root_path='F:\\machine_learning\\Ch05'
    data_path=os.path.join(data_root_path,'testSet.txt')
    data_array,label_array=Read_Data(data_path)
    # print(data_array,label_array)
    # print(data_array.shape)

    log_model=logistic_reg(data_array.shape[1]+1)# +1 for the bias column prepended inside forward()

    epoch=500
    lr=0.001
    for i in range(epoch):
        pred,loss=log_model.forward(data_array,label_array,lr,is_training=True)
        # print('epoch',i,'loss',loss)
    print('weight',log_model.weight)
    # print('bias',log_model.bias)

    print(pred.shape,np.min(pred),np.max(pred))

    prediction=np.where(pred>0.5,1,0)
    prediction=prediction.reshape(-1)

    # print(prediction[:50])
    # print(label_array[:50])

    accuracy=np.sum(prediction==label_array)/label_array.shape[0]
    print('full batch training , accuracy',accuracy)
    '''
    With batch gradient descent, every iteration/update of the gradient uses the whole
    training set, i.e. all samples are processed at once; this is known as batch processing.
    accuracy 0.96
    '''
    #  [[ 0.82234723]
    #  [-0.27227592]
    #  [ 1.03490603]]

    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    # set the title
    ax1.set_title('two dimension data')
    # set the x-axis label
    plt.xlabel('x0')
    # set the y-axis label
    plt.ylabel('x1')
    # scatter plots of the two classes
    fore_point=data_array[np.where(label_array==1)]
    back_point=data_array[np.where(label_array==0)]
    ax1.scatter(x=fore_point[:, 0], y=fore_point[:, 1], c='r', marker='o', label='fore_point')
    ax1.scatter(x=back_point[:, 0], y=back_point[:, 1], c='b', marker='^', label='back_point')
    # show the legend; passing label= to scatter and calling legend() once gives correct entries
    plt.legend()

    # decision boundary: weight[0] + weight[1]*x0 + weight[2]*x1 = 0, i.e. x1 = -(w1/w2)*x0 - w0/w2
    x=np.arange(-3,3,0.1)
    y=-(log_model.weight[1]/log_model.weight[2])*x-(log_model.weight[0]/log_model.weight[2])
    plt.scatter(x,y,c='g', marker='.')

    # show the figure
    plt.show()

    '''
    Batch gradient descent (every update of the regression coefficients uses all samples of the
    training set) becomes computationally expensive on large data sets, which is why stochastic
    gradient descent is introduced.
    Stochastic gradient descent: each update of the regression coefficients uses the gradient
    computed from a single training sample; this is also called online learning.

    Stochastic gradient descent can be viewed as mini-batch gradient descent with batch size = 1.
    The common practice in deep learning is: for every epoch, first randomly shuffle the whole
    training set, then in each step draw batch_size samples from the shuffled data, update the
    regression coefficients once, and move on to the next step. Within one epoch the samples drawn
    in different steps do not overlap, and together the samples used by all steps of one epoch
    cover the entire training set.
    A polynomial learning rate can be combined with the mini-batch gradient descent below
    (the loop as written falls back to a simple step decay), i.e. the learning rate decays
    along a polynomial curve during training:
    lr(t) = base_lr * (1 - t/T) ** power
    where t is the current step and T is the total number of steps.
    '''
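    # A minimal sketch of the polynomial schedule described above (added for illustration; the
    # training loops below use a simple step decay instead). base_lr and power are assumed values.
    def poly_lr(step, total_steps, base_lr=0.1, power=0.9):
        # decays from base_lr toward 0 as step approaches total_steps
        return base_lr * (1 - step / total_steps) ** power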
    log_model_2 = logistic_reg(data_array.shape[1] + 1)

    epoch = 50
    lr=0.1
    # base_lr = 0.1
    batch_size=1
    if data_array.shape[0]%batch_size==0:
        num_step=data_array.shape[0]//batch_size
    else:
        num_step = int(data_array.shape[0]/ batch_size)+1
    # T = epoch * num_step
    for i in range(epoch):
        # randomly shuffle the entire training set at the start of every epoch
        index=random.sample(range(data_array.shape[0]), data_array.shape[0])
        # step decay: shrink the learning rate by a factor of 10 every 25 epochs
        if i%25==0:
            lr*=0.1
        # print(i,lr)
        for j in range(num_step):
            # lr=base_lr*(num_step*i+j+1)/()
            row_start=j*batch_size
            row_end=(j+1)*batch_size
            if row_end>data_array.shape[0]:
                row_end=data_array.shape[0]
            pred, loss = log_model_2.forward(data_array[index[row_start:row_end]], label_array[index[row_start:row_end]], lr, is_training=True)
            # print('epoch',i,'loss',loss)
    pred=log_model_2.forward(data_array,label_array,lr=0,is_training=False)
    prediction = np.where(pred > 0.5, 1, 0)
    prediction = prediction.reshape(-1)

    accuracy = np.sum(prediction == label_array) / label_array.shape[0]
    print('mini batch accuracy', accuracy)
    # print('weight', log_model.weight)
    # print('bias',log_model.bias)
    # mini batch accuracy 0.96   after 50 passes over the data this matches the accuracy of 500 iterations of batch gradient descent

    '''
    Read a real, larger data set: the horse colic data.
    '''
    data_train,label_train=Read_Data(os.path.join(data_root_path,'horseColicTraining.txt'))
    data_test,label_test=Read_Data(os.path.join(data_root_path,'horseColicTest.txt'))
    print(data_train.shape,label_train.shape)

    # (299, 21) (299,)   each training sample has 21 features

    horse_model=logistic_reg(num_feat=data_train.shape[1]+1)

    epoch = 50
    lr=0.1
    # base_lr = 0.1
    batch_size=50
    if data_train.shape[0]%batch_size==0:
        num_step=data_train.shape[0]//batch_size
    else:
        num_step = int(data_train.shape[0]/ batch_size)+1
    # T = epoch * num_step
    for i in range(epoch):
        # randomly shuffle the entire training set at the start of every epoch
        index=random.sample(range(data_train.shape[0]), data_train.shape[0])
        if i%25==0:
            lr*=0.1
        # print(i,lr)
        for j in range(num_step):
            # lr=base_lr*(num_step*i+j+1)/()
            row_start=j*batch_size
            row_end=(j+1)*batch_size
            if row_end>data_train.shape[0]:
                row_end=data_train.shape[0]
            pred, loss = horse_model.forward(data_train[index[row_start:row_end]], label_train[index[row_start:row_end]], lr, is_training=True)
            # print('epoch',i,'loss',loss)
        pred=horse_model.forward(data_test,label_test,lr=0,is_training=False)
        prediction = np.where(pred > 0.5, 1, 0)
        prediction = prediction.reshape(-1)

        accuracy = np.sum(prediction == label_test) / label_test.shape[0]
        print('epoch',i,'horse mini batch accuracy', accuracy)
        '''
        epoch 48 horse mini batch accuracy 0.7611940298507462
        epoch 49 horse mini batch accuracy 0.7014925373134329
        '''

 
