Bagging for a binary classification problem with 4-dimensional features (bagging of logistic regression)

The task this time is to take a single logistic regression classifier as the baseline, train with bagging on top of it, and see whether performance improves.

First, a brief introduction to bagging:

Bagging repeatedly draws bootstrap samples (sampling with replacement) from the training set, fits one model on each resample, and finally averages the parameters (or takes a majority vote) to produce the final classifier. The resampling itself already suggests that bagging is essentially an effort to reduce variance. The sub-models fitted on different resamples can be neither completely independent of one another nor completely identical, so the averaged model sits between those two extremes: if the $n$ sub-models were fully independent, averaging would cut the variance to $\frac{\mathrm{Var}(X_i)}{n}$, while if they were identical, averaging would leave it at $\mathrm{Var}(X_i)$. The variance of the bagged model therefore lies between $\frac{\mathrm{Var}(X_i)}{n}$ and $\mathrm{Var}(X_i)$, which is clearly a reduction. Reducing bias, on the other hand, is mainly what boosting does; I will write about that in the next post.
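To make that interpolation concrete, here is the standard calculation (a sketch, assuming $n$ identically distributed sub-models $X_1,\dots,X_n$ with variance $\sigma^2$ and pairwise correlation $\rho$):

$$\mathrm{Var}\left(\frac{1}{n}\sum_{i=1}^{n}X_i\right)=\rho\sigma^2+\frac{1-\rho}{n}\sigma^2$$

At $\rho=0$ (fully independent sub-models) this reduces to $\frac{\sigma^2}{n}$; at $\rho=1$ (identical sub-models) it is $\sigma^2$. Bootstrap resamples overlap heavily, so $\rho$ sits strictly between 0 and 1, and the bagged variance lands strictly between the two endpoints.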

The code is given below:

from numpy import *
import numpy as np
import math
import random

# Helper functions for sampling and loading the data

# Bootstrap: draw k rows from data, sampling with replacement
def Bootstrap(data, k):
    sample = []
    for i in range(k):
        sample.append(data[random.randint(0, len(data) - 1)])
    return sample
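# Quick sanity check of the sampler (hypothetical toy data, not part of the task):
# toy = [[1.0, 0], [2.0, 1], [3.0, 0]]
# print(Bootstrap(toy, 5))  # five rows drawn from toy; duplicates are expected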



def loadDataSet(fileName):
    # Note: no constant 1.0 column is prepended here, so the model has no
    # separate bias term b in y = wx + b.
    xArr = []
    yArr = []
    for line in open(fileName).readlines():
        curLine = line.strip().split()  # split() with no argument handles any mix of spaces and tabs
        xonerow = []
        for i in range(len(curLine) - 1):
            xonerow.append(float(curLine[i]))  # every column except the last is a feature value x
        xArr.append(xonerow)
        yArr.append(int(curLine[-1]))  # the last column is the label y
    return xArr, yArr
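# For reference, a line of Train.txt is assumed to look like this (hypothetical values):
# 0.5  1.2  -0.3  2.1  1
# i.e. four whitespace-separated feature values followed by the 0/1 label.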

# loadDataSet2: load every column (features and label together), so whole rows
# can be bootstrap-sampled as a unit
def loadDataSet2(fileName):
    xArr = []
    for line in open(fileName).readlines():
        curLine = line.strip().split()
        xonerow = []
        for i in range(len(curLine)):
            xonerow.append(float(curLine[i]))  # keep the label column as well
        xArr.append(xonerow)
    return xArr

def sigmoid(X):
    return 1/(1+exp(-X))
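# Note: exp(-X) overflows for large-magnitude negative X and raises a RuntimeWarning.
# A clipped variant avoids that (a sketch, not used by the original code):
def sigmoid_stable(X):
    return 1/(1+np.exp(-np.clip(X, -500, 500)))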

def gradAscent(data_x, data_y):
    data_xrix = mat(data_x)  # (m, n)
    data_y = mat(data_y).transpose()  # (m, 1)
    m, n = shape(data_xrix)
    Weights = ones((n, 1))  # initialization, (n, 1)
    alpha = 0.001  # step size
    maxCycles = 700  # number of iterations
    # A small regularization parameter shrinks the weights a little each step,
    # which keeps any single weight from growing too large
    reg_lambda = math.exp(-8)
    for i in range(maxCycles):
        h = sigmoid(data_xrix * Weights)  # predicted probabilities, (m, 1)
        error = data_y - h  # y - h, (m, 1)
        Weights = (1 - reg_lambda) * Weights + alpha * data_xrix.transpose() * error  # gradient ascent step
    return Weights
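# Each iteration above is one full-batch gradient ascent step on the log-likelihood,
# with weight decay folded in:
#   W <- (1 - lambda) * W + alpha * X^T (y - sigmoid(X W))
# i.e. the regularized maximum-likelihood update for logistic regression.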

def Judgefunction(test_y):
    # Threshold the sigmoid outputs at 0.5 to produce 0/1 class labels
    val = []
    rel = []
    for i in range(test_y.shape[0]):
        val.append(test_y[i, 0])
        if val[i] < 0.5:
            rel.append(0)
        else:
            rel.append(1)
    return rel
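# An equivalent vectorized version (a sketch using numpy directly):
# def Judgefunction(test_y):
#     return (np.asarray(test_y)[:, 0] >= 0.5).astype(int).tolist()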

if __name__ == "__main__":
    # Single-classifier baseline
    data_x, data_y = loadDataSet('C:/Users/Carzolar/Desktop/bagging and boosting/Train.txt')
    Weights = gradAscent(data_x, data_y)
    print('The single-classifier Weights are:')
    print(Weights)

    # Test the single model on the held-out set
    test_x, real_y = loadDataSet('C:/Users/Carzolar/Desktop/bagging and boosting/Test.txt')
    test_y = sigmoid(mat(test_x) * Weights)
    result = Judgefunction(test_y)



    # Bagging classifier
    data = loadDataSet2('C:/Users/Carzolar/Desktop/bagging and boosting/Train.txt')
    times = input('Please input the bagging times:')
    result_mat = []
    for i in range(int(times)):
        Sample = Bootstrap(data, 400)  # bootstrap sample of 400 rows
        print('Round', i + 1, 'sample is:')
        print(Sample)
        # extract x and y from this round's sample
        sample_x = np.mat(Sample)[:, 0:4]
        sample_y0 = np.mat(Sample)[:, -1]
        sample_y1 = sample_y0.transpose()
        sample_y2 = sample_y1.tolist()
        sample_y = sample_y2[0]
        weights = gradAscent(sample_x, sample_y)
        print('The weights are:\n', weights)

        # use this round's weights to predict the test data and record the result
        sample_test_x, sample_real_y = loadDataSet('C:/Users/Carzolar/Desktop/bagging and boosting/Test.txt')
        sample_test_y = sigmoid(mat(sample_test_x) * weights)
        test_result = Judgefunction(sample_test_y)
        print("This round's result is:\n", test_result)
        result_mat.append(test_result)

    print('The result matrix is:\n', result_mat)
    final_result = []
    # Majority vote: for each test sample (column), take the most common prediction
    for i in range(int(mat(result_mat).shape[1])):
        real_list1 = mat(result_mat)[:, i].transpose().tolist()
        real_list = real_list1[0]
        final_result.append(max(real_list, key=real_list.count))

    print('The single logistic regression test result is:\n', result)
    print('After voting, the result is:\n', final_result)
    print('While the real result is:\n', real_y)

    # Error-rate calculation
    frag1 = 0
    frag2 = 0
    for i in range(len(real_y)):
        if result[i] != real_y[i]:
            frag1 += 1
        if final_result[i] != real_y[i]:
            frag2 += 1
    single_error_rate = frag1 / len(real_y)
    final_error_rate = frag2 / len(real_y)
    print('Single logistic error rate is:\n', single_error_rate)
    print('After bagging, the error rate is:\n', final_error_rate)
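The voting step above relies on max(real_list, key=real_list.count); an equivalent and slightly cleaner formulation (a sketch, not part of the original script) uses collections.Counter:

from collections import Counter

def majority_vote(result_mat):
    # result_mat holds one row of 0/1 predictions per bagging round;
    # zip(*...) groups the predictions column-wise, one column per test sample
    return [Counter(col).most_common(1)[0][0] for col in zip(*result_mat)]

With an odd number of rounds this matches final_result exactly; with an even number, ties are broken by whichever label Counter encountered first.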




The bagging here uses majority voting; I ran it with 10, 50, and 100 rounds to observe the behavior:

After all three bagging runs, the error rate had not decreased. But when I changed the step-size hyperparameter (alpha) of the gradient ascent, I found:

This situation should be easy to explain: when gradient ascent has not found a good (local) optimum, bagging can help reduce the error rate on top of it. Perhaps, when tuning an algorithm, the size of bagging's improvement could be used as a signal of how good the base algorithm itself is?
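One way to probe this would be to sweep the step size and compare the single-model error rate against the bagged one at each setting. A hypothetical sketch (it assumes a variant gradAscentAlpha that exposes the step size as a parameter, which the code above hardcodes at 0.001):

def error_rate(pred, truth):
    # fraction of mismatched labels
    return sum(p != t for p, t in zip(pred, truth)) / len(truth)

# gradAscentAlpha(data_x, data_y, alpha) is assumed to be gradAscent with alpha exposed
for alpha in [0.0001, 0.001, 0.01, 0.1]:
    w = gradAscentAlpha(data_x, data_y, alpha)
    pred = Judgefunction(sigmoid(mat(test_x) * w))
    print('alpha =', alpha, 'single-model error rate =', error_rate(pred, real_y))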
