支持向量機的代碼實現

前言:
本篇文章主要實現了《機器學習實戰》的支持向量機部分,我在代碼註釋中也儘量標明了各變量對應公式中的哪個符號。我建議搭配《統計學習方法》進行實戰,下面這篇博文對於公式推導也是很不錯的。

https://blog.csdn.net/u011067360/article/details/26503719

另外本篇依賴於jupyter notebook。故在代碼後面會出現相應的結果。

正文:

SMO算法求支持向量機

SMO算法中的輔助函數

def loadDataSet(fileName):
    """Load a tab-separated data file.

    Each line must contain: x1 <tab> x2 <tab> label.

    Args:
        fileName: path to the data file.

    Returns:
        (dataMat, labelMat): list of [x1, x2] feature pairs and the
        matching list of class labels as floats.
    """
    dataMat = []
    labelMat = []
    # 'with' guarantees the file handle is closed even if parsing fails;
    # the original opened the file and never closed it.
    with open(fileName) as fr:
        for line in fr:
            lineArr = line.strip().split('\t')
            dataMat.append([float(lineArr[0]), float(lineArr[1])])
            labelMat.append(float(lineArr[2]))
    return dataMat, labelMat
def selectJrand(i,m):
    """Pick a random alpha index different from i.

    Args:
        i: index of the first alpha (must not be chosen again).
        m: total number of alphas.

    Returns:
        A random integer in [0, m) that is not equal to i.
    """
    candidate = i
    # keep drawing until we land on a different index
    while candidate == i:
        candidate = int(random.uniform(0, m))
    return candidate
def clipAlpha(aj,H,L):
    """Clamp the alpha value aj into the interval [L, H]."""
    return max(L, min(aj, H))
dataArr,labelArr = loadDataSet("E:\\DataMining\\Project\\MLBook\\機器學習實戰源代碼\\machinelearninginaction\\Ch06\\testSet.txt")

簡化版SMO算法

def smoSimple(dataMatIn, classLabels, C, toler, maxIter):
    """Simplified SMO (Sequential Minimal Optimization) for a linear SVM.

    Args:
        dataMatIn: data set, list of feature vectors.
        classLabels: class labels (+1/-1), one per sample.
        C: regularization constant (upper bound for every alpha).
        toler: tolerance used when testing the KKT conditions.
        maxIter: number of consecutive full passes without any alpha
            change required before terminating.

    Returns:
        (b, alphas): the bias term and the (m, 1) matrix of Lagrange
        multipliers.
    """
    dataMatrix = mat(dataMatIn)
    labelMat = mat(classLabels).transpose()
    b = 0
    m, n = shape(dataMatrix)
    alphas = mat(zeros((m, 1)))
    # counts consecutive full passes in which no alpha changed
    # (renamed from 'iter', which shadowed the builtin)
    passCount = 0
    while passCount < maxIter:
        alphaPairsChanged = 0  # records whether any alpha pair was optimized
        for i in range(m):
            # multiply() is element-wise; fXi is the predicted value for sample i
            fXi = float(multiply(alphas, labelMat).T *
                        (dataMatrix * dataMatrix[i, :].T)) + b
            Ei = fXi - float(labelMat[i])  # prediction error for sample i
            # Optimize the alphas that violate the KKT conditions
            # (u_i = prediction, a_i = alphas[i]):
            #   y_i*u_i <= 1 with a_i < C violates KKT (should be a_i = C)
            #   y_i*u_i >= 1 with a_i > 0 violates KKT (should be a_i = 0)
            #   y_i*u_i == 1 with a_i = 0 or a_i = C violates KKT
            #     (should be 0 < a_i < C)
            # Samples with 0 < a_i < C (support vectors sitting on the
            # margin) are the ones examined first.
            if ((labelMat[i] * Ei < -toler) and (alphas[i] < C)) or \
                    ((labelMat[i] * Ei > toler) and (alphas[i] > 0)):
                # pick the second alpha at random
                j = selectJrand(i, m)
                fXj = float(multiply(alphas, labelMat).T *
                            (dataMatrix * dataMatrix[j, :].T)) + b
                Ej = fXj - float(labelMat[j])
                alphaIold = alphas[i].copy()
                alphaJold = alphas[j].copy()
                # bounds L, H differ depending on whether the labels agree
                if labelMat[i] != labelMat[j]:
                    L = max(0, alphas[j] - alphas[i])
                    H = min(C, C + alphas[j] - alphas[i])
                else:
                    L = max(0, alphas[j] + alphas[i] - C)
                    H = min(C, alphas[j] + alphas[i])
                if L == H:
                    print('L==H')
                    continue
                # eta: optimal amount to change alphas[j]
                eta = 2.0 * dataMatrix[i, :] * dataMatrix[j, :].T - \
                      dataMatrix[i, :] * dataMatrix[i, :].T - \
                      dataMatrix[j, :] * dataMatrix[j, :].T
                if eta >= 0:
                    print('eta>=0')
                    continue
                # '-=' because eta here is the negative of the book's formula
                alphas[j] -= labelMat[j] * (Ei - Ej) / eta
                alphas[j] = clipAlpha(alphas[j], H, L)  # clamp alphas[j]
                if abs(alphas[j] - alphaJold) < 0.00001:
                    print('j not moving enough')
                    continue
                # update alphas[i] by the same amount in the opposite direction
                alphas[i] += labelMat[j] * labelMat[i] * (alphaJold - alphas[j])
                # recompute the threshold b from both updated alphas
                b1 = b - Ei - labelMat[i] * (alphas[i] - alphaIold) * \
                    dataMatrix[i, :] * dataMatrix[i, :].T - \
                    labelMat[j] * (alphas[j] - alphaJold) * dataMatrix[i, :] * dataMatrix[j, :].T
                b2 = b - Ej - labelMat[i] * (alphas[i] - alphaIold) * \
                    dataMatrix[i, :] * dataMatrix[j, :].T - \
                    labelMat[j] * (alphas[j] - alphaJold) * dataMatrix[j, :] * dataMatrix[j, :].T
                if (0 < alphas[i]) and (C > alphas[i]):
                    b = b1
                elif (0 < alphas[j]) and (C > alphas[j]):
                    b = b2
                else:
                    b = (b1 + b2) / 2.0
                alphaPairsChanged += 1
                # fixed log-message typo: 'paris' -> 'pairs'
                print('iter:{0} i:{1} pairs change {2}'.format(passCount, i, alphaPairsChanged))
        if alphaPairsChanged == 0:
            passCount += 1
        else:
            passCount = 0
        # fixed log-message typo: 'numebr' -> 'number'
        print('iteration number :{}'.format(passCount))
    return b, alphas
# NOTE(review): this star import should sit at the top of the file, before
# the definitions above that rely on numpy names (mat, zeros, multiply, ...).
from numpy import *
# Train the simplified SMO: C=0.6, tolerance=0.001, up to 40 quiet passes.
b,alphas = smoSimple(dataArr,labelArr,0.6,0.001,40)
L==H
L==H
iter:0 i:8 paris change 2
j not moving enough
j not moving enough
iteration numebr :39
j not moving enough
j not moving enough
iteration numebr :40
b
matrix([[-3.85138014]])
alphas[alphas>0]
matrix([[0.11792374, 0.243914  , 0.00461456, 0.35722318]])

瞭解哪些數據點是支持向量

# Support vectors are exactly the samples whose alpha is strictly positive.
for i in range(100):
    if alphas[i] > 0.0:
        print(dataArr[i],labelArr[i])  # print the sample at this index
[4.658191, 3.507396] -1.0
[3.457096, -0.082216] -1.0
[5.286862, -2.358286] 1.0
[6.080573, 0.418886] 1.0

作圖標記出來

def plotTool():
    """Scatter-plot both classes and annotate the four support vectors
    found by the simplified-SMO run above (coordinates are hard-coded
    from that run's output)."""
    import matplotlib.pyplot as plt
    dataMat,labelMat = loadDataSet("E:\\DataMining\\Project\\MLBook\\機器學習實戰源代碼\\machinelearninginaction\\Ch06\\testSet.txt")
    dataArr = array(dataMat)
    n = shape(dataArr)[0]
    # split the samples into the two classes for colored plotting
    xcord1 = [];ycord1 = []
    xcord2 = [];ycord2 = []
    for i in range(n):
        if int (labelMat[i]) ==1:
            xcord1.append(dataArr[i,0]);ycord1.append(dataArr[i,1])
        else:
            xcord2.append(dataArr[i,0]);ycord2.append(dataArr[i,1])     
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1,ycord1,c='red',marker='s')
    ax.scatter(xcord2,ycord2,c='green')
    plt.xlabel('x1');plt.ylabel('x2')
    # mark the support vectors with arrows
    ax.annotate("SVector",xy=(4.658191, 3.507396),xycoords='data',xytext=(20,0),textcoords='offset points',
               arrowprops=dict(arrowstyle='->',connectionstyle='arc3,rad=-0.2'))
    ax.annotate("SVector",xy=(3.457096, -0.082216),xycoords='data',xytext=(20,0),textcoords='offset points',
               arrowprops=dict(arrowstyle='->',connectionstyle='arc3,rad=-0.2'))
    ax.annotate("SVector",xy=(5.286862, -2.358286),xycoords='data',xytext=(10,20),textcoords='offset points',
               arrowprops=dict(arrowstyle='->',connectionstyle='arc3,rad=-0.2'))
    ax.annotate("SVector",xy=(6.080573, 0.418886),xycoords='data',xytext=(10,20),textcoords='offset points',
               arrowprops=dict(arrowstyle='->',connectionstyle='arc3,rad=-0.2'))                           
    plt.show()
plotTool()

這裏寫圖片描述

計算w值

def calWs(alphas,dataArr,classLabels):
    """Recover the SVM weight vector: w = sum_i alpha_i * y_i * x_i.

    Returns an (n, 1) array, where n is the feature count.
    """
    samples = mat(dataArr)
    labels = mat(classLabels).transpose()
    numSamples, numFeatures = shape(samples)
    w = zeros((numFeatures, 1))
    # accumulate each sample's contribution, weighted by alpha_i * y_i
    for idx in range(numSamples):
        coeff = alphas[idx] * labels[idx]
        w += multiply(coeff, samples[idx, :].T)
    return w
w = calWs(alphas,dataArr,labelArr)

完整版Platt SMO的支持函數

PS:在calcEk()註釋掉了後面引入核函數需要修改的部分

class optStruct:
    def __init__(self, dataMatIn, classLabels, C, toler):
        """Container bundling every value the Platt SMO routines share."""
        self.X = dataMatIn            # training samples (m x n matrix)
        self.labelMat = classLabels   # column matrix of +1/-1 labels
        self.C = C                    # regularization constant
        self.tol = toler              # KKT tolerance
        self.m = shape(dataMatIn)[0]  # number of samples (rows)
        self.alphas = mat(zeros((self.m, 1)))
        self.b = 0
        # error cache: column 0 is a validity flag, column 1 the cached E value
        self.eCache = mat(zeros((self.m, 2)))
def calcEk(oS,k):
    """Return E_k, the prediction error for sample k.

    Args:
        oS: optStruct instance holding X, labelMat, alphas and b.
        k: index of the sample to evaluate.
    """
    # linear-kernel prediction for sample k
    kernelCol = oS.X * oS.X[k, :].T
    fXk = float(multiply(oS.alphas, oS.labelMat).T * kernelCol) + oS.b
    # kernel version (enable together with oS.K when using kernelTrans):
    # fXk = float(multiply(oS.alphas, oS.labelMat).T * oS.K[:, k] + oS.b)
    return fXk - float(oS.labelMat[k])

def selectJ(i,oS,Ei):
    """Heuristic choice of the second alpha for the inner loop.

    Picks the j whose cached error differs most from Ei (maximising the
    step size); falls back to a random index while the error cache has
    no other valid entries yet.

    Returns:
        (j, Ej): the chosen index and its error value.
    """
    bestK, bestDelta, bestEj = -1, 0, 0
    oS.eCache[i] = [1, Ei]  # mark Ei as valid in the cache
    # nonzero(...)[0] yields the row indices of cache entries whose
    # validity flag (column 0) is set; .A converts matrix -> ndarray.
    validEcacheList = nonzero(oS.eCache[:, 0].A)[0]
    if len(validEcacheList) > 1:
        # scan every valid cached error, keeping the largest |Ei - Ek|
        for k in validEcacheList:
            if k == i:
                continue
            Ek = calcEk(oS, k)
            deltaE = abs(Ei - Ek)
            if deltaE > bestDelta:
                bestK, bestDelta, bestEj = k, deltaE, Ek
        return bestK, bestEj
    # first pass: no other usable cache entries, choose j at random
    j = selectJrand(i, oS.m)
    return j, calcEk(oS, j)

def updateEk(oS,k):
    """Recompute E_k and store it in the error cache as a valid entry."""
    oS.eCache[k] = [1, calcEk(oS, k)]

完整Platt SMO 算法中的優化例程

PS:下面代碼內部增添了在引入核函數後的內容,會註釋掉,要用時再刪除註釋即可
  File "<ipython-input-121-0f2fd9f66347>", line 1
    PS:下面代碼內部增添了在引入核函數後的內容,會註釋掉,要用時再刪除註釋即可
                                         ^
SyntaxError: invalid character in identifier
def innerL(i,oS):
    """Inner loop of Platt SMO: try to optimize the alpha pair (i, j).

    Args:
        i: index of the first alpha.
        oS: optStruct holding the shared state.

    Returns:
        1 if the pair of alphas was changed, 0 otherwise.
    """
    Ei = calcEk(oS,i)
    # proceed only if alpha[i] violates the KKT conditions within tolerance
    if ((oS.labelMat[i]*Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or\
    ((oS.labelMat[i]*Ei > oS.tol) and (oS.alphas[i] > 0)):
        j,Ej = selectJ(i,oS,Ei)
        alphaIold = oS.alphas[i].copy();alphaJold = oS.alphas[j].copy()
        # bounds L, H differ depending on whether the two labels agree
        if (oS.labelMat[i] != oS.labelMat[j]):
            L = max(0,oS.alphas[j]-oS.alphas[i])
            H = min(oS.C,oS.C + oS.alphas[j] - oS.alphas[i])
        else:
            L = max(0,oS.alphas[j] + oS.alphas[i] - oS.C)
            H = min(oS.C,oS.alphas[j] + oS.alphas[i])
        if L == H:
            print('L==H');return 0
        # next line: eta without a kernel:
        eta = 2.0 * oS.X[i,:]*oS.X[j,:].T - oS.X[i,:]*oS.X[i,:].T - oS.X[j,:]*oS.X[j,:].T
        # next line: eta with a kernel (uncomment when oS.K exists):
        #eta = 2.0 * oS.K[i,j] - oS.K[i,i] - oS.K[j,j]
        if eta >= 0:
            print('eta>=0');return 0
        oS.alphas[j] -= oS.labelMat[j]*(Ei - Ej)/eta
        oS.alphas[j] = clipAlpha(oS.alphas[j],H,L)
        updateEk(oS,j) # push the refreshed error for j into the cache
        if (abs(oS.alphas[j] - alphaJold) < 0.00001):
            print('j not moving enough');return 0
        # move alphas[i] by the same amount in the opposite direction
        oS.alphas[i] += oS.labelMat[j]*oS.labelMat[i]*(alphaJold - oS.alphas[j])
        updateEk(oS,i)
        # recompute the threshold b
        # b1, b2 below: without a kernel:
        b1 = oS.b - Ei -oS.labelMat[i]*(oS.alphas[i]-alphaIold)*\
            oS.X[i,:]*oS.X[i,:].T - oS.labelMat[j]*\
            (oS.alphas[j] - alphaJold)*oS.X[i,:]*oS.X[j,:].T
        b2 = oS.b - Ej -oS.labelMat[i]*(oS.alphas[i]-alphaIold)*\
            oS.X[i,:]*oS.X[j,:].T - oS.labelMat[j]*\
            (oS.alphas[j] - alphaJold)*oS.X[j,:]*oS.X[j,:].T
        # b1, b2 below: with a kernel (uncomment when oS.K exists):
        #b1 = oS.b - Ei - oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,i] -\
        #    oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[i,j]
        #b2 = oS.b - Ej - oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,j] -\
        #    oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[j,j]
        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]):
            oS.b = b1
        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]):
            oS.b = b2
        else:
            oS.b = (b1 + b2)/2.0
        return 1
    else : return 0

完整版Platt SMO的外循環代碼

def smoP(dataMatIn,classLabels,C,toler,maxIter,kTup=('lin',0)):
    """Full Platt SMO outer loop.

    Alternates between sweeps over the whole data set and sweeps over
    the non-bound alphas (0 < alpha < C), stopping when no pair changes
    in a non-bound sweep or maxIter sweeps have run.

    Returns:
        (b, alphas): the threshold and the matrix of Lagrange multipliers.
    """
    oS = optStruct(mat(dataMatIn), mat(classLabels).transpose(), C, toler, kTup)
    sweep = 0
    entireSet = True
    alphaPairsChanged = 0
    while sweep < maxIter and (alphaPairsChanged > 0 or entireSet):
        alphaPairsChanged = 0
        if entireSet:
            # sweep every sample
            for i in range(oS.m):
                alphaPairsChanged += innerL(i,oS)
                print('fullSet,iter:{0} i:{1},pairs changed {2}'.format(sweep,i,alphaPairsChanged))
        else:
            # sweep only the non-bound alphas (0 < alpha < C)
            nonBoundIs = nonzero((oS.alphas.A > 0)*(oS.alphas.A < C))[0]
            for i in nonBoundIs:
                alphaPairsChanged += innerL(i,oS)
                print('non-bound,iter:{0} i:{1},pairs changed {2}'.format(sweep,i,alphaPairsChanged))
        sweep += 1
        # alternate: full sweep -> non-bound sweeps until nothing moves
        if entireSet:
            entireSet = False
        elif alphaPairsChanged == 0:
            entireSet = True
        print("iteration number:{}".format(sweep))
    return oS.b, oS.alphas

進行測試

b,alphas = smoP(dataArr,labelArr,0.6,0.001,40)
fullSet,iter:0 i:0,pairs changed 1
fullSet,iter:3 i:96,pairs changed 0
fullSet,iter:3 i:97,pairs changed 0
fullSet,iter:3 i:98,pairs changed 0
fullSet,iter:3 i:99,pairs changed 0
iteration number:4

瞭解哪些數據點是支持向量,可以看到與簡易版SMO是不一樣的

# Support vectors found by the full Platt SMO: samples with alpha > 0.
for i in range(100):
    if alphas[i] > 0.0:
        print(dataArr[i],labelArr[i])  # print the sample at this index
[3.634009, 1.730537] -1.0
[3.125951, 0.293251] -1.0
[4.658191, 3.507396] -1.0
[3.223038, -0.552392] -1.0
[3.457096, -0.082216] -1.0
[5.286862, -2.358286] 1.0
[6.080573, 0.418886] 1.0

計算w值

ws = calWs(alphas,dataArr,labelArr)
ws
array([[ 0.74764704],
       [-0.17895243]])

以第一個數據點爲例進行分類:大於0爲正類,否則負類

dataMat = mat(dataArr)
dataMat[0]*mat(ws)+b
matrix([[-0.98996178]])

在複雜數據上應用核函數

核函數轉換函數

def kernelTrans(X,A,kTup):
    """Evaluate the kernel between every row of X and the single row A.

    Args:
        X: (m, n) matrix of samples.
        A: one (1, n) sample row.
        kTup: kernel description tuple — kTup[0] names the kernel
            ('lin' or 'rbf'), kTup[1] is the rbf width parameter.

    Returns:
        (m, 1) column matrix K with K[j] = kernel(X[j], A).

    Raises:
        NameError: if the kernel name is not recognized.
    """
    m, n = shape(X)
    K = mat(zeros((m, 1)))
    kernelName = kTup[0]
    if kernelName == 'lin':
        # linear kernel: plain inner products
        K = X * A.T
    elif kernelName == 'rbf':
        # Gaussian kernel: exp(-||x_j - A||^2 / sigma^2), element-wise
        for rowIdx in range(m):
            diff = X[rowIdx, :] - A
            K[rowIdx] = diff * diff.T
        K = exp(K / (-1 * kTup[1] ** 2))
    else:
        raise NameError('Houston We Have a Problem - - That Kernel is not recognized')
    return K

class optStruct:
    def __init__(self, dataMatIn, classLabels, C, toler, kTup):
        """Shared SMO state, kernel-enabled version (adds the K matrix)."""
        self.X = dataMatIn            # training samples (m x n matrix)
        self.labelMat = classLabels   # column matrix of +1/-1 labels
        self.C = C                    # regularization constant
        self.tol = toler              # KKT tolerance
        self.m = shape(dataMatIn)[0]  # number of samples (rows)
        self.alphas = mat(zeros((self.m, 1)))
        self.b = 0
        self.eCache = mat(zeros((self.m, 2)))
        # precompute the full (m, m) kernel matrix, one column per sample
        self.K = mat(zeros((self.m, self.m)))
        for col in range(self.m):
            self.K[:, col] = kernelTrans(self.X, self.X[col, :], kTup)

利用核函數進行分類的徑向基測試函數

def testRbf(k1=1.3):
    """Train and evaluate an RBF-kernel SVM on the book's RBF data sets.

    Args:
        k1: width parameter (sigma) of the Gaussian kernel.
    """
    # -- training --
    dataArr,labelArr = loadDataSet('E:\\DataMining\\Project\\MLBook\\機器學習實戰源代碼\\machinelearninginaction\\Ch06\\testSetRBF.txt')
    b,alphas = smoP(dataArr,labelArr,200,0.0001,10000,('rbf',k1))
    dataMat = mat(dataArr);labelMat = mat(labelArr).transpose()
    # rows with alpha > 0 are the support vectors
    svInd=nonzero(alphas.A>0)[0]
    sVs = dataMat[svInd]
    labelSV = labelMat[svInd]
    print("there are {} Support Vectors".format(shape(sVs)[0]))
    m,n = shape(dataMat)
    errorCount = 0
    for i in range(m):
        # classification only needs kernels against the support vectors
        kernelEval = kernelTrans(sVs,dataMat[i,:],('rbf',k1))
        predict = kernelEval.T*multiply(labelSV,alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print('the training error rate is:{}'.format(float(errorCount)/m))
    # -- testing (held-out set, same support vectors) --
    dataArr,labelArr = loadDataSet('E:\\DataMining\\Project\\MLBook\\機器學習實戰源代碼\\machinelearninginaction\\Ch06\\testSetRBF2.txt')
    errorCount = 0
    dataMat = mat(dataArr);labelMat = mat(labelArr).transpose()
    m,n = shape(dataMat)
    for i in range(m):
        kernelEval = kernelTrans(sVs,dataMat[i,:],('rbf',k1))
        predict = kernelEval.T*multiply(labelSV,alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print('the test error rate is:{}'.format(float(errorCount)/m))

在測試前要將前面innerL()和calcEk()的註釋掉未引入核函數的代碼

testRbf()
fullSet,iter:0 i:0,pairs changed 1
fullSet,iter:5 i:97,pairs changed 0
fullSet,iter:5 i:98,pairs changed 0
fullSet,iter:5 i:99,pairs changed 0
iteration number:6
there are 27 Support Vectors
the training error rate is:0.01
the test error rate is:0.02
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章