這次的任務是在用 logistic 單一分類器的基礎上,用 bagging 進行訓練,看看有沒有提升。
首先對bagging做一些介紹:
bagging主要是對樣本進行重複放回的採樣,對每一次重採樣都訓練得到一個模型,最後取平均參數(或者進行投票)產生最後的分類器。其實從重採樣就能感受到bagging實際上是在爲減少variance做努力。每一次採樣得到的子模型既不可能完全互相獨立,也不可能完全相同,因此最後的模型介於兩個極端之間:若 n 個子模型完全獨立,平均後的 variance 爲 σ²/n;若完全相同,variance 仍爲 σ²。實際的 variance 介於 σ²/n 和 σ² 之間,顯然降低了 variance。對於bias則主要是boosting的功效,下一次博客再寫。
下面給出代碼
from numpy import *
import numpy as np
import math
import random
import array
import matplotlib.pyplot as plt
# Helper functions for loading the data sets
# Bootstrap: draw a size-k sample from data with replacement
def Boostrap(data, k):
    """Draw a bootstrap sample: k items picked from data with replacement.

    Args:
        data: sequence to resample from (must be non-empty when k > 0).
        k: number of draws.

    Returns:
        List of k items sampled uniformly with replacement (k=0 gives []).
    """
    # random.choices performs uniform sampling with replacement in one call,
    # replacing the manual randint-index loop.
    return random.choices(data, k=k)
def loadDataSet(fileName):
    """Load a whitespace-separated data file into features and labels.

    Each non-empty line holds numeric feature columns followed by an
    integer class label in the last column.

    Args:
        fileName: path to the text file.

    Returns:
        (xArr, yArr): list of float feature rows, and list of int labels.
    """
    xArr = []
    yArr = []
    # 'with' guarantees the file handle is closed (the original leaked it)
    with open(fileName) as fh:
        for line in fh:
            # split() with no argument handles any mix of spaces/tabs
            curLine = line.strip().split()
            if not curLine:
                continue  # tolerate blank lines instead of crashing
            # every column except the last is a feature value
            xArr.append([float(v) for v in curLine[:-1]])
            yArr.append(int(curLine[-1]))  # last column is the class label
    return xArr, yArr
# loadDataSet2
def loadDataSet2(fileName):
    """Load every whitespace-separated column of a data file as floats.

    Unlike loadDataSet, the label column is NOT split off — each row keeps
    all columns (the bagging driver slices x/y out of the matrix itself).

    Args:
        fileName: path to the text file.

    Returns:
        List of rows, each a list of floats (label included, as a float).
    """
    xArr = []
    # 'with' guarantees the file handle is closed (the original leaked it)
    with open(fileName) as fh:
        for line in fh:
            curLine = line.strip().split()
            if not curLine:
                continue  # tolerate blank lines instead of crashing
            xArr.append([float(v) for v in curLine])
    return xArr
# def loadDataSet(fileName):
# data_x=[];data_y=[]
# # fr=open('machinelearninginaction/Ch05/testSet.txt')
# for line in open(fileName).readlines():
# lineArr=line.strip().split()
# data_x.append([1.0,float(lineArr[0]),float(lineArr[1])])#特徵數據集,添加1是構造常數項x0
# data_y.append(int(lineArr[-1]))#分類數據集
# return data_x,data_y
def sigmoid(X):
    """Logistic function 1 / (1 + e^-X); applies elementwise to numpy input."""
    z = exp(-X)
    return 1.0 / (1.0 + z)
def gradAscent(data_x, data_y):
    """Batch gradient ascent for logistic regression.

    Runs a fixed number of full-batch updates with a small L2 weight-decay
    factor folded into each step.

    Args:
        data_x: (m, n) feature rows (list or matrix).
        data_y: length-m sequence of 0/1 labels.

    Returns:
        (n, 1) numpy matrix of fitted weights.
    """
    X = mat(data_x)                       # (m, n) design matrix
    y_col = mat(data_y).T                 # labels as an (m, 1) column
    _, n_features = shape(X)
    alpha = 0.001                         # step size
    reg_lambda = math.exp(-8)             # tiny decay to keep weights bounded
    weights = ones((n_features, 1))       # start from all-ones
    for _ in range(700):                  # fixed iteration budget
        residual = y_col - sigmoid(X * weights)   # y - f(Xw), shape (m, 1)
        # shrink-then-step: weight decay plus the gradient-ascent update
        weights = (1 - reg_lambda) * weights + alpha * X.T * residual
    return weights
def Judgefunction(test_y):
    """Threshold an (m, 1) column of probabilities into hard 0/1 labels.

    Args:
        test_y: (m, 1) numpy matrix of sigmoid outputs.

    Returns:
        List of m ints: 0 where the probability is below 0.5, else 1.
    """
    labels = []
    for row in range(test_y.shape[0]):
        prob = test_y[row, 0]
        labels.append(0 if prob < 0.5 else 1)
    return labels
if __name__== "__main__":
    # --- Part 1: single logistic-regression classifier ---
    # NOTE(review): absolute Windows paths are hard-coded — adjust per machine.
    data_x, data_y = loadDataSet('C:/Users/Carzolar/Desktop/bagging and boosting/Train.txt')
    # print(mat(data_x).shape[0])
    Weights = gradAscent(data_x,data_y)
    # print(data_x)
    # print(data_y)
    result=[]
    print('The single Weights is :')
    print(Weights)
    # test model
    test_x, real_y= loadDataSet('C:/Users/Carzolar/Desktop/bagging and boosting/Test.txt')
    # list * np.matrix: numpy promotes test_x, yielding an (m, 1) score column
    test_y = sigmoid(test_x*Weights)
    real_test_y=test_y[0,0]
    real_test_y2=test_y[1,0]
    result=Judgefunction(test_y)
    # Result
    # print('test_y is:',test_y)
    # --- Part 2: bagging ensemble of logistic classifiers ---
    data = loadDataSet2('C:/Users/Carzolar/Desktop/bagging and boosting/Train.txt')
    times = input('Please input the bagging times:')
    result_mat=[]
    for i in range(int(times)):
        # bootstrap: 400 rows drawn with replacement from the training set
        Sample = Boostrap(data,400)
        print('In times',times,'Sample is:')
        print(Sample)
        #extract the x and y in this loop
        # assumes 5 columns: 4 feature columns then the label — TODO confirm
        sample_x = np.mat(Sample)[:,0:4]
        sample_y0 = np.mat(Sample)[:,-1]
        # print(sample_x)
        # matrix column -> flat python list of labels
        sample_y1=sample_y0.transpose()
        sample_y2 = sample_y1.tolist()
        sample_y = sample_y2[0]
        # print(sample_y)
        weights = gradAscent(sample_x,sample_y)
        print('The weight is:\n',weights)
        #using this weights to predict the test data and record
        # NOTE(review): test file is re-read every iteration; could be hoisted
        sample_test_x,sample_real_y = loadDataSet('C:/Users/Carzolar/Desktop/bagging and boosting/Test.txt')
        sample_test_y = sigmoid(sample_test_x*weights)
        test_result = Judgefunction(sample_test_y)
        # real_result = Judgefunction(mat(sample_real_y))
        # print(sample_test_y)
        print('This time result is:\n',test_result)
        # print('while the real result is:\n',real_y)
        # one row of predictions per bagged classifier
        result_mat.append(test_result)
    print('The result matrix is :\n',result_mat)
    final_result = []
    # print(mat(result_mat).shape[1])
    # real_list1 = mat(result_mat)[:,1].transpose().tolist()
    # print(real_list1[0])
    # print(max(real_list1[0],key=real_list1[0].count))
    # real_list = real_list1[0]
    # majority vote per test sample, across all bagged classifiers
    for i in range (int(mat(result_mat).shape[1])):
        real_list1 = mat(result_mat)[:,i].transpose().tolist()
        real_list = real_list1[0]
        # most common label wins (ties resolved by first occurrence in the list)
        final_result.append(max(real_list1[0],key=real_list1[0].count))
    print('The single logstic regression test_result is:\n ', result)
    print('After voting, the result is:\n',final_result)
    # print('real_y is: ', real_y)
    print('While the real result is :\n',real_y)
    # error-rate comparison: single classifier vs bagged vote
    frag1=0
    frag2=0
    for i in range(len(real_y)):
        if result[i]!=real_y[i]:
            frag1 +=1
        if final_result[i]!=real_y[i]:
            frag2 +=1
    single_error_rate = frag1/len(real_y)
    final_error_rate = frag2/len(real_y)
    print('Single logistic error rate is : \n',single_error_rate)
    print('After bagging, the error rate is :\n',final_error_rate)
這裏使用的是投票法的bagging,分別用10次,50次和100次迭代去觀察:
發現三次bagging之後,錯誤率並沒有減少。但是當我改變了SGD裏的超參數(步長)發現:
其實應該很好解釋這種情況,也就是當SGD並沒有找到局部最優時,bagging能在此基礎上幫忙減少錯誤率。可能在算法調優時能通過bagging的效果來發現原算法的優劣?