Code Annotations: Machine Learning in Action, Chapter 6: Support Vector Machines

A note before we begin: while working through Machine Learning in Action I found that much of the book's code comes without comments, which is a challenge for newcomers. I am posting my own annotations of the code here for reference; corrections are welcome.

1. Efficient optimization with the SMO algorithm

#coding:gbk
from numpy import *

#Function: load the data set
#Input:  file name
#Output: data matrix, label vector
def loadDataSet(fileName):
    dataMat = []#data matrix
    labelMat = []#label vector
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = line.strip().split('\t')#strip() removes surrounding whitespace, split() splits the line on tabs
        dataMat.append([float(lineArr[0]), float(lineArr[1])])#the first two columns are the two features
        labelMat.append(float(lineArr[2]))#the third column is the class label
    return dataMat, labelMat

#Function: randomly pick an integer in [0, m) that is different from i
#Input:  the index i that must be excluded, the upper bound m
#Output: the randomly chosen integer
def selectJrand(i, m):
    j = i
    while (j == i):
        j = int(random.uniform(0, m))
    return j

#Function: clip aj so that it stays inside the interval [L, H]
#Input:  the value aj to clip, the upper bound H, the lower bound L
#Output: the clipped value aj
def clipAlpha(aj, H, L):
    if aj > H:#aj is above H, clip it to H
        aj = H
    if L > aj:#aj is below L, clip it to L
        aj = L
    return aj
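
A quick way to exercise the three helper functions above (a minimal sketch; it assumes the book's testSet.txt data file is in the working directory):

dataArr, labelArr = loadDataSet('testSet.txt')  #assumed data file from the book
print labelArr[0:5]             #the first few class labels, each +1.0 or -1.0
print selectJrand(0, 100)       #a random index in [0, 100) other than 0
print clipAlpha(5.2, 4.0, 0.0)  #5.2 clipped to the upper bound, i.e. 4.0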

#Function: the simplified SMO algorithm
#Input:  data matrix dataMatIn, label vector classLabels, constant C, error tolerance toler, maximum number of iterations maxIter
#Output: the hyperplane intercept b, the Lagrange multipliers alphas
def smoSimple(dataMatIn, classLabels, C, toler, maxIter):
    dataMatrix = mat(dataMatIn)
    labelMat = mat(classLabels).transpose()
    b = 0
    m, n = shape(dataMatrix)#number of rows and columns, i.e. number of training samples and number of features
    alphas = mat(zeros((m, 1)))#an m x 1 matrix of multipliers, initialised to zero
    iter = 0
    while (iter < maxIter):#loop until the maximum number of iterations is exceeded
        alphaPairsChanged = 0
        for i in range(m):
            #type numpy.info(numpy.multiply) at the interpreter prompt for details on element-wise multiplication
            #for the derivation see Eq. 6.12 in Machine Learning (Zhou Zhihua)
            fXi = float(multiply(alphas, labelMat).T * \
                        (dataMatrix * dataMatrix[i, :].T)) + b
            Ei = fXi - float(labelMat[i])#prediction error for sample i
            #if the error is large, the alpha corresponding to this sample can be optimised
            if ((labelMat[i] * Ei < -toler) and (alphas[i] < C)) or \
                    ((labelMat[i] * Ei > toler) and (alphas[i] > 0)):
                #randomly pick an integer in [0, m) other than i, i.e. randomly choose the second alpha
                j = selectJrand(i, m)
                #compute the error for the second alpha, alphas[j]
                fXj = float(multiply(alphas, labelMat).T * (dataMatrix * dataMatrix[j, :].T)) + b
                Ej = fXj - float(labelMat[j])
                #do not write alphaIold = alphas[i] directly, otherwise alphaIold and alphas[i] would refer to the same memory
                alphaIold = alphas[i].copy()
                alphaJold = alphas[j].copy()
                #the bounds L and H below follow Platt's SMO paper; worth rereading this chapter after studying that paper
                if (labelMat[i] != labelMat[j]):
                    L = max(0, alphas[j] - alphas[i])
                    H = min(C, C + alphas[j] - alphas[i])
                else:
                    L = max(0, alphas[j] + alphas[i] - C)
                    H = min(C, alphas[j] + alphas[i])
                if L == H:
                    print "L == H"
                    continue
                eta = 2.0 * dataMatrix[i, :] * dataMatrix[j, :].T - \
                    dataMatrix[i, :] * dataMatrix[i, :].T - \
                    dataMatrix[j, :] * dataMatrix[j, :].T
                if eta >= 0:
                    print "eta >= 0"
                    continue
                alphas[j] -= labelMat[j] * (Ei - Ej) / eta
                alphas[j] = clipAlpha(alphas[j], H, L)#keep alphas[j] inside the interval [L, H]
                #check whether alphas[j] changed appreciably; if not, move on to the next i
                if (abs(alphas[j] - alphaJold) < 0.00001):
                    print "j not moving enough"
                    continue
                #labelMat[i] and labelMat[j] both have absolute value 1, so alphas[i] changes by the same amount as alphas[j]
                #this keeps alphas[i] * labelMat[i] + alphas[j] * labelMat[j] constant,
                #i.e. Delta(alphas[i]) * labelMat[i] + Delta(alphas[j]) * labelMat[j] = 0
                alphas[i] += labelMat[j] * labelMat[i] * (alphaJold - alphas[j])
                b1 = b - Ei - labelMat[i] * (alphas[i] - alphaIold) * \
                              dataMatrix[i, :] * dataMatrix[i, :].T - \
                    labelMat[j] * (alphas[j] - alphaJold) * dataMatrix[i, :] * dataMatrix[j, :].T
                b2 = b - Ej - labelMat[i] * (alphas[i] - alphaIold) * \
                    dataMatrix[i, :] * dataMatrix[j, :].T - \
                    labelMat[j] * (alphas[j] - alphaJold) * \
                    dataMatrix[j, :] * dataMatrix[j, :].T
                if (0 < alphas[i]) and (C > alphas[i]):
                    b = b1
                elif (0 < alphas[j]) and (C > alphas[j]):
                    b = b2
                else:
                    b = (b1 + b2) / 2.0
                alphaPairsChanged += 1
                print "iter: %d i: %d, pairs changed %d" % (iter, i, alphaPairsChanged)
        #note the counter is not simply incremented every pass: a pass only counts as an iteration when no alpha
        #pair changed, and any change resets it, so the loop ends after maxIter consecutive passes without updates
        if (alphaPairsChanged == 0):
            iter += 1
        else:
            iter = 0
        print "iteration number: %d" % iter
    return b, alphas
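
A minimal sketch of running the simplified SMO on the loaded data; the parameter values C = 0.6, toler = 0.001, maxIter = 40 follow the book's example, and the exact result varies from run to run because the second alpha is chosen at random:

dataArr, labelArr = loadDataSet('testSet.txt')  #assumed data file from the book
b, alphas = smoSimple(dataArr, labelArr, 0.6, 0.001, 40)
print b                   #intercept of the separating hyperplane
print alphas[alphas > 0]  #only the non-zero multipliers, i.e. those of the support vectors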


2. Speeding up optimization with the full Platt SMO algorithm

#Function: a data structure that keeps all the important values in one place
#Input:  data matrix dataMatIn, label vector classLabels, constant C, error tolerance toler, kernel tuple kTup
#Output: none (the values are stored as attributes of the object)
class optStruct:
    def __init__(self, dataMatIn, classLabels, C, toler, kTup):#__init__ initialises the newly created object
        self.X = dataMatIn
        self.labelMat = classLabels
        self.C = C
        self.tol = toler
        self.m = shape(dataMatIn)[0]#number of rows of dataMatIn
        self.alphas = mat(zeros((self.m, 1)))#(self.m, 1) is a tuple, likewise below
        self.b = 0
        #m x 2 error cache: the first column flags whether the cached value is valid,
        #the second column holds the error value E itself
        self.eCache = mat(zeros((self.m, 2)))
        self.K = mat(zeros((self.m, self.m)))#build the kernel matrix
        for i in range(self.m):
            self.K[:, i] = kernelTrans(self.X, self.X[i, :], kTup)

#Function: compute the error for the k-th alpha
#Input:  the data structure oS, the index k
#Output: the error value Ek
def calcEk(oS, k):
    #fXk = float(multiply(oS.alphas, oS.labelMat).T * (oS.X * oS.X[k, :].T)) + oS.b
    fXk = float(multiply(oS.alphas, oS.labelMat).T * oS.K[:, k] + oS.b)
    Ek = fXk - float(oS.labelMat[k])
    return Ek

#Function: choose the second alpha so that the step size |Ei - Ej| is maximised
#Input:  the index i of the first alpha, the data structure oS, the error Ei of the first alpha
#Output: the index of the second alpha and its error value
def selectJ(i, oS, Ei):
    maxK = -1#index j giving the largest step size
    maxDeltaE = 0#largest step size found so far
    Ej = 0#error of the chosen alpha
    oS.eCache[i] = [1, Ei]#mark the cached error for i as valid
    #.A converts the matrix to an ndarray; nonzero() returns the indices of the non-zero entries,
    #and [0] takes the array of row indices from the tuple it returns
    #this line therefore collects the indices whose validity flag (the first column of eCache) is set
    validEcacheList = nonzero(oS.eCache[:, 0].A)[0]
    if (len(validEcacheList)) > 1:#at least two valid cached errors
        for k in validEcacheList:#search among the valid cached errors
            if k == i:
                continue
            Ek = calcEk(oS, k)
            deltaE = abs(Ei - Ek)
            if (deltaE > maxDeltaE):#found a larger step size
                maxK = k
                maxDeltaE = deltaE
                Ej = Ek
        return maxK, Ej
    else:
        j = selectJrand(i, oS.m)#randomly pick an index j
        Ej = calcEk(oS, j)#and compute its error Ej
    return j, Ej

#Function: recompute the error for the k-th alpha and store it in the data structure
#Input:  the data structure oS, the index k
#Output: none
def updataEk(oS, k):
    Ek = calcEk(oS, k)
    oS.eCache[k] = [1, Ek]

#Function: the inner loop of the full Platt SMO; tries to update a pair of alphas in the data structure
#Input:  the index i of the first alpha, the data structure oS
#Output: 1 if a pair of alphas was successfully updated, 0 otherwise
def innerL(i, oS):
    Ei = calcEk(oS, i)
    if ((oS.labelMat[i] * Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or \
            ((oS.labelMat[i] * Ei > oS.tol) and (oS.alphas[i] > 0)):
        j, Ej = selectJ(i, oS, Ei)#choose the second alpha so that the step size is maximised
        alphaIold = oS.alphas[i].copy()
        alphaJold = oS.alphas[j].copy()
        if (oS.labelMat[i] != oS.labelMat[j]):
            L = max(0, oS.alphas[j] - oS.alphas[i])
            H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
        else:
            L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
            H = min(oS.C, oS.alphas[j] + oS.alphas[i])
        if L == H:
            print "L == H"
            return 0
        #eta = 2.0 * oS.X[i, :] * oS.X[j, :].T - oS.X[i, :] * oS.X[i, :].T - \
           #oS.X[j, :] * oS.X[j, :].T
        eta = 2.0 * oS.K[i, j] - oS.K[i, i] - oS.K[j, j]
        if eta >= 0:
            print "eta >= 0"
            return 0
        oS.alphas[j] -= oS.labelMat[j] * (Ei - Ej) / eta
        oS.alphas[j] = clipAlpha(oS.alphas[j], H, L)
        updataEk(oS, j)
        if (abs(oS.alphas[j] - alphaJold) < 0.00001):
            print "j not moving enough"
            return 0
        oS.alphas[i] += oS.labelMat[j] * oS.labelMat[i] * \
                        (alphaJold - oS.alphas[j])
        updataEk(oS, i)
        #b1 = oS.b - Ei - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.X[i, :] * oS.X[i, :].T - \
            #oS.labelMat[j] * (oS.alphas[j] - alphaJold) * oS.X[i, :] * oS.X[j, :].T
        #b2 = oS.b - Ej - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.X[i, :] * oS.X[j, :].T - \
             #oS.labelMat[j] * (oS.alphas[j] - alphaJold) * oS.X[j, :] * oS.X[j, :].T
        b1 = oS.b - Ei - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.K[i, i] - \
             oS.labelMat[j] * (oS.alphas[j] - alphaJold) * oS.K[i, j]
        b2 = oS.b - Ej - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.K[i, j] - \
             oS.labelMat[j] * (oS.alphas[j] - alphaJold) * oS.K[j, j]
        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]):
            oS.b = b1
        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]):
            oS.b = b2
        else:
            oS.b = (b1 + b2) / 2.0
        return 1
    else:
        return 0

#Function: the outer loop of the full Platt SMO
#Input:  data matrix dataMatIn, label vector classLabels, constant C, error tolerance toler, maximum number of iterations maxIter, kernel tuple kTup
#Output: the hyperplane intercept b, the Lagrange multipliers alphas
def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup=('lin', 0)):
    oS = optStruct(mat(dataMatIn), mat(classLabels).transpose(), C, toler, kTup)#build the data structure
    iter = 0#here one iteration means one pass through the loop body
    entireSet = True
    alphaPairsChanged = 0
    while (iter < maxIter) and ((alphaPairsChanged > 0) or (entireSet)):
        alphaPairsChanged = 0
        if entireSet:#branch 1: sweep over the whole data set
            for i in range(oS.m):
                alphaPairsChanged += innerL(i, oS)
                print "fullSet, iter: %d i: %d, pairs changed %d" % \
                      (iter, i, alphaPairsChanged)
            iter += 1
        else:
            nonBoundIs = nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
            for i in nonBoundIs:
                alphaPairsChanged += innerL(i, oS)
                print "non-bound, iter: %d i: %d, pairs changed %d" % \
                      (iter, i, alphaPairsChanged)
            iter += 1
        #entireSet == True means the pass we just finished swept the whole set;
        #alphaPairsChanged == 0 means no alpha pair was modified during that pass
        if entireSet:
            entireSet = False
        #the first pass sweeps the whole set; after that only the non-bound alphas are visited,
        #unless a non-bound pass modifies no alpha pair, in which case the whole set is swept again
        elif (alphaPairsChanged == 0):
            entireSet = True
        print "iteration number: %d" % iter
    return oS.b, oS.alphas
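
The full Platt SMO is driven the same way as the simplified version; a minimal sketch with the same parameter values and the same assumed data file (the default kTup selects a linear kernel):

dataArr, labelArr = loadDataSet('testSet.txt')  #assumed data file from the book
b, alphas = smoP(dataArr, labelArr, 0.6, 0.001, 40)  #linear kernel by default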

#Function: compute the normal vector w of the separating hyperplane
#Input:  the Lagrange multipliers alphas, the data matrix dataArr, the label vector classLabels
#Output: the normal vector w
def calcWs(alphas, dataArr, classLabels):
    X = mat(dataArr)
    labelMat = mat(classLabels).transpose()
    m, n = shape(X)
    w = zeros((n, 1))
    for i in range(m):
        w += multiply(alphas[i] * labelMat[i], X[i, :].T)
    return w
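
Once w and b are known, a point is classified by the sign of the decision value w.T * x + b. A minimal sketch, continuing from the smoP call above and using the first training sample as an example:

ws = calcWs(alphas, dataArr, labelArr)  #normal vector of the separating hyperplane
datMat = mat(dataArr)
print datMat[0] * mat(ws) + b           #compare sign(.) of this value with labelArr[0]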


3. Using kernels on complex data

#Function: the kernel transformation
#Input:  the data matrix X, one row A of the data, the kernel tuple kTup (name and parameter)
#Output: the column vector of kernel values between every row of X and A
def kernelTrans(X, A, kTup):
    m, n = shape(X)#number of rows and columns of the data matrix
    K = mat(zeros((m, 1)))
    if kTup[0] == 'lin':#linear kernel
        K = X * A.T
    elif kTup[0] == 'rbf':#Gaussian kernel, i.e. the radial basis function kernel
        for j in range(m):
            deltaRow = X[j, :] - A
            K[j] = deltaRow * deltaRow.T
        K = exp(K / (-1 * kTup[1] ** 2))
    else:#an unrecognised kernel name
        #raise the exception explicitly
        raise NameError('Houston We Have a Problem -- \
                        That Kernel is not recognized')
    return K
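
For the 'rbf' branch the function above evaluates K(x_j, a) = exp(-||x_j - a||^2 / kTup[1]^2). A minimal cross-check of kernelTrans against that formula computed directly with NumPy, on toy data that is not from the book:

X = mat([[1.0, 0.0], [0.0, 1.0], [2.0, 2.0]])  #toy data, not from the book
A = mat([[1.0, 1.0]])
sigma = 1.3
K1 = kernelTrans(X, A, ('rbf', sigma))
K2 = exp(-sum(power(X - A, 2), axis=1) / sigma ** 2)  #the formula written out directly
print K1.T
print K2.T  #the two rows should match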

#Function: test RBF-kernel classification on the book's two data sets
#Input:  k1, the user-defined RBF parameter sigma (kernelTrans divides the squared distance by k1 squared)
#Output: none
def testRbf(k1 = 1.3):
    dataArr, labelArr = loadDataSet('testSetRBF.txt')
    b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, ('rbf', k1))
    datMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
    svInd = nonzero(alphas.A > 0)[0]#indices of the support vectors
    sVs = datMat[svInd]#the support vectors themselves
    labelSV = labelMat[svInd]#class labels of the support vectors
    print "there are %d Support Vectors" % shape(sVs)[0]
    m, n = shape(datMat)
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs, datMat[i, :], ('rbf', k1))
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b#prediction on the training sample
        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print "the training error rate is %f" % (float(errorCount) / m)
    dataArr, labelArr = loadDataSet('testSetRBF2.txt')
    errorCount = 0
    datMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
    m, n = shape(datMat)
    for i in range(m):
        kernelEval = kernelTrans(sVs, datMat[i, :], ('rbf', k1))
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b#prediction on the test sample
        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print "the test error rate is: %f" % (float(errorCount) / m)


4. Revisiting handwriting recognition

#Function: convert a 32 x 32 image file into a 1 x 1024 row vector
#Input:  file name
#Output: a 1 x 1024 matrix
def img2vector(filename):
    returnVect = zeros((1, 1024))
    fr = open(filename)
    for i in range(32):
        lineStr = fr.readline()
        for j in range(32):
            returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect

#Function: load the image files of a directory into a matrix
#Input:  the directory name
#Output: the image matrix, the vector of image labels
def loadImages(dirName):
    from os import listdir
    hwLabels = []
    trainingFileList = listdir(dirName)#list of file names under the dirName directory
    m = len(trainingFileList)#number of files under the dirName directory
    trainingMat = zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]#file name
        fileStr = fileNameStr.split('.')[0]#file name without the .txt extension
        classNumStr = int(fileStr.split('_')[0])#the digit this image represents
        if classNumStr == 9:#the digit 9 is labelled -1
            hwLabels.append(-1)
        else:#any other digit (here the digit 1) is labelled +1
            hwLabels.append(1)
        trainingMat[i, :] = img2vector('%s/%s' % (dirName, fileNameStr))
    return trainingMat, hwLabels

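#Function: test digit classification with a kernel (the digit 9 against the rest)
#Input:  the kernel tuple kTup, by default an RBF kernel with sigma = 10
#Output: none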
def testDigits(kTup = ('rbf', 10)):
    dataArr, labelArr = loadImages('trainingDigits')
    b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, kTup)
    datMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
    svInd = nonzero(alphas.A > 0)[0]  # indices of the support vectors
    sVs = datMat[svInd]  # the support vectors themselves
    labelSV = labelMat[svInd]  # class labels of the support vectors
    print "there are %d Support Vectors" % shape(sVs)[0]
    m, n = shape(datMat)
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs, datMat[i, :], kTup)
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b  # prediction on the training sample
        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print "the training error rate is %f" % (float(errorCount) / m)
    dataArr, labelArr = loadImages('testDigits')
    errorCount = 0
    datMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
    m, n = shape(datMat)
    for i in range(m):
        kernelEval = kernelTrans(sVs, datMat[i, :], kTup)
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b  # prediction on the test sample
        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print "the test error rate is: %f" % (float(errorCount) / m)

