# AdaBoost algorithm (AdaBoost算法)

# coding=utf-8
from math import log, exp
from numpy.core.umath import sign
from numpy.ma import multiply
from numpy import *

def loadSimpleDate():
    """Return the tiny 5-point training set used in the AdaBoost demo.

    Returns:
        (dataMat, classLabels): a 5x2 numpy matrix of 2-D feature vectors
        and a list with the matching class labels (+1.0 / -1.0).
    """
    points = [[1.0, 2.1],
              [2.0, 1.1],
              [1.3, 1.0],
              [1.0, 1.0],
              [2.0, 1.0]]
    labels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return matrix(points), labels

def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Classify every sample by thresholding a single feature column.

    Samples falling on the `threshIneq` ('lt' or 'gt') side of `threshVal`
    along feature `dimen` get label -1.0; all others keep the default +1.0.

    Returns an (m, 1) array of +1.0 / -1.0 predictions.
    """
    predictions = ones((shape(dataMatrix)[0], 1))
    column = dataMatrix[:, dimen]
    if threshIneq == 'lt':
        mask = column <= threshVal
    else:
        mask = column > threshVal
    predictions[mask] = -1.0
    return predictions

# D is the (m, 1) column vector of per-sample weights.
def buildStump(dataArr, classLabels, D):
    """Find the decision stump with the lowest D-weighted training error.

    Scans every feature, a grid of candidate thresholds spanning that
    feature's range, and both inequality directions, keeping the stump
    whose predictions minimize the weighted 0/1 error.

    Returns:
        bestStump:   dict with keys 'dim', 'thresh', 'ineq'
        minError:    the best weighted error (1x1 matrix)
        bestClasEst: (m, 1) predictions of the best stump
    """
    dataMatrix = mat(dataArr)
    labelMat = mat(classLabels).T
    m, n = shape(dataMatrix)
    numSteps = 10.0  # number of threshold steps across each feature's range
    bestStump = {}   # records the winning stump's configuration
    bestClasEst = mat(zeros((m, 1)))
    minError = inf
    for dim in range(n):  # try every feature
        lo = dataMatrix[:, dim].min()
        hi = dataMatrix[:, dim].max()
        stepSize = (hi - lo) / numSteps
        for step in range(-1, int(numSteps) + 1):
            threshVal = lo + float(step) * stepSize  # candidate threshold
            for inequal in ['lt', 'gt']:
                predictedVals = stumpClassify(dataMatrix, dim, threshVal, inequal)
                # 0/1 misclassification indicator, then weight it by D.
                errArr = mat(ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                weightedError = D.T * errArr
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = dim
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst

def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    """Train an AdaBoost ensemble of decision stumps.

    Runs up to `numIt` boosting rounds; each round fits the best stump
    under the current sample weights D, computes its vote weight alpha,
    and re-weights the samples. Stops early once the ensemble's training
    error reaches zero.

    Returns a list of stump dicts, each carrying 'dim', 'thresh', 'ineq'
    and its vote weight 'alpha'.
    """
    weakClassArr = []
    m = shape(dataArr)[0]
    D = mat(ones((m, 1)) / m)         # start with uniform sample weights
    aggClassEst = mat(zeros((m, 1)))  # running weighted vote per sample
    labelCol = mat(classLabels).T
    for _ in range(numIt):
        stump, error, classEst = buildStump(dataArr, classLabels, D)
        # Classifier weight: lower error -> larger alpha.
        # max(error, 1e-16) guards against division by zero at error == 0.
        alpha = float(0.5 * log((1.0 - error) / max(error, 1e-16)))
        stump['alpha'] = alpha
        weakClassArr.append(stump)
        # Re-weight samples: shrink correctly classified, grow misclassified.
        expon = multiply(-1 * alpha * labelCol, classEst)
        D = multiply(D, exp(expon))
        D = D / D.sum()
        # Track ensemble training error; quit the loop early when it is 0.
        aggClassEst += alpha * classEst
        aggErrors = multiply(sign(aggClassEst) != labelCol, ones((m, 1)))
        errorRate = aggErrors.sum() / m
        print("total error: ", errorRate)
        if errorRate == 0.0:
            break
    return weakClassArr

# datToClass: a single point or a set of points to classify.
def adaClassify(datToClass, classifierArr):
    """Classify points with a trained AdaBoost ensemble.

    Accumulates each weak classifier's alpha-weighted vote and returns
    the sign (+1/-1, as an (m, 1) matrix) of the aggregate score.
    """
    dataMatrix = mat(datToClass)
    m = shape(dataMatrix)[0]
    aggClassEst = mat(zeros((m, 1)))
    for stump in classifierArr:
        # Each stump votes +1/-1; its vote is scaled by its alpha.
        vote = stumpClassify(dataMatrix, stump['dim'],
                             stump['thresh'], stump['ineq'])
        aggClassEst += stump['alpha'] * vote
        print(aggClassEst)
    return sign(aggClassEst)

def main():
    """Train an AdaBoost ensemble on the toy data set and classify (0, 0)."""
    dataMat, classLabels = loadSimpleDate()
    classifierArr = adaBoostTrainDS(dataMat, classLabels, 30)
    t = adaClassify([0, 0], classifierArr)
    # Fixed: `print t` was Python 2 statement syntax (a SyntaxError under
    # Python 3, which the rest of this file targets via print(...) calls).
    # Also dropped the unused local weight vector D.
    print(t)

if __name__ == "__main__":
    main()

# --- Web-scrape residue below (blog comment-section boilerplate, not code) ---
# 發表評論
# 所有評論
# 還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
# 相關文章