前言:
本篇文章主要實現了《機器學習實戰》的支持向量機部分,我在代碼註釋中也儘量標明了各變量對應公式中的符號。我建議搭配《統計學習方法》一起實戰;另外網上也有不少對公式推導講解得很不錯的博文,可以一併參考。
另外本篇依賴於jupyter notebook。故在代碼後面會出現相應的結果。
正文:
SMO算法求支持向量機
SMO算法中的輔助函數
def loadDataSet(fileName):
    """Load a tab-separated data file.

    Every column except the last is treated as a feature and the last
    column as the class label. For the book's testSet*.txt files (two
    feature columns plus a label) this matches the original hard-coded
    behavior exactly, while also supporting wider files.

    Args:
        fileName: path of the data file to read.

    Returns:
        (dataMat, labelMat): list of per-sample feature lists, and the
        list of float class labels.
    """
    dataMat = []
    labelMat = []
    # `with` guarantees the handle is closed (the original leaked it).
    with open(fileName) as fr:
        for line in fr:
            lineArr = line.strip().split('\t')
            dataMat.append([float(v) for v in lineArr[:-1]])
            labelMat.append(float(lineArr[-1]))
    return dataMat, labelMat
def selectJrand(i, m):
    """Pick a random alpha index different from i.

    Args:
        i: index of the first alpha (to be excluded).
        m: total number of alphas.

    Returns:
        A random integer j in [0, m) with j != i.
    """
    while True:
        j = int(random.uniform(0, m))
        if j != i:
            return j
def clipAlpha(aj, H, L):
    """Clamp alpha value aj into the box [L, H].

    Mirrors the original order of operations (clip to H first, then
    raise to L), so L wins if the bounds are ever inverted.
    """
    return max(L, min(aj, H))
# Load the 2-D toy data set shipped with the book (absolute Windows path;
# adjust to your local copy of the Ch06 sources).
dataArr,labelArr = loadDataSet("E:\\DataMining\\Project\\MLBook\\機器學習實戰源代碼\\machinelearninginaction\\Ch06\\testSet.txt")
簡化版SMO算法
def smoSimple(dataMatIn, classLabels, C, toler, maxIter):
    """Simplified SMO: train a linear SVM and return (b, alphas).

    Args:
        dataMatIn: training samples (m x n, list of feature vectors).
        classLabels: class labels in {-1.0, +1.0}, length m.
        C: soft-margin penalty constant.
        toler: KKT-violation tolerance.
        maxIter: number of consecutive full passes with no alpha change
            required before exiting.

    Returns:
        b: scalar threshold (1x1 matrix).
        alphas: m x 1 matrix of Lagrange multipliers.
    """
    dataMatrix = mat(dataMatIn)
    labelMat = mat(classLabels).transpose()
    b = 0
    m, n = shape(dataMatrix)
    alphas = mat(zeros((m, 1)))
    # Consecutive passes over the data with no alpha updated
    # (renamed from `iter`, which shadows the builtin).
    iter_num = 0
    while iter_num < maxIter:
        alphaPairsChanged = 0  # whether any alpha pair was optimized this pass
        for i in range(m):
            # fXi is the predicted value for sample i; multiply() is element-wise.
            fXi = float(multiply(alphas, labelMat).T *
                        (dataMatrix * dataMatrix[i, :].T)) + b
            Ei = fXi - float(labelMat[i])
            # Select an alpha_i that violates the KKT conditions (within toler):
            #   y_i*u_i <= 1 requires alpha_i = C; alpha_i < C is a violation
            #   y_i*u_i >= 1 requires alpha_i = 0; alpha_i > 0 is a violation
            #   y_i*u_i == 1 requires 0 < alpha_i < C
            if ((labelMat[i] * Ei < -toler) and (alphas[i] < C)) or \
               ((labelMat[i] * Ei > toler) and (alphas[i] > 0)):
                # Choose the second alpha at random.
                j = selectJrand(i, m)
                fXj = float(multiply(alphas, labelMat).T *
                            (dataMatrix * dataMatrix[j, :].T)) + b
                Ej = fXj - float(labelMat[j])
                alphaIold = alphas[i].copy()
                alphaJold = alphas[j].copy()
                # Box bounds L, H depend on whether the two labels agree.
                if labelMat[i] != labelMat[j]:
                    L = max(0, alphas[j] - alphas[i])
                    H = min(C, C + alphas[j] - alphas[i])
                else:
                    L = max(0, alphas[j] + alphas[i] - C)
                    H = min(C, alphas[j] + alphas[i])
                if L == H:
                    print('L==H')
                    continue
                # eta is the NEGATIVE of the textbook quantity, hence `-=` below.
                eta = 2.0 * dataMatrix[i, :] * dataMatrix[j, :].T - \
                    dataMatrix[i, :] * dataMatrix[i, :].T - \
                    dataMatrix[j, :] * dataMatrix[j, :].T
                if eta >= 0:
                    print('eta>=0')
                    continue
                alphas[j] -= labelMat[j] * (Ei - Ej) / eta
                alphas[j] = clipAlpha(alphas[j], H, L)  # clip alpha_j into [L, H]
                if abs(alphas[j] - alphaJold) < 0.00001:
                    print('j not moving enough')
                    continue
                # Update alpha_i by the same amount in the opposite direction.
                alphas[i] += labelMat[j] * labelMat[i] * (alphaJold - alphas[j])
                # Recompute the threshold b.
                b1 = b - Ei - labelMat[i] * (alphas[i] - alphaIold) * \
                    dataMatrix[i, :] * dataMatrix[i, :].T - \
                    labelMat[j] * (alphas[j] - alphaJold) * dataMatrix[i, :] * dataMatrix[j, :].T
                b2 = b - Ej - labelMat[i] * (alphas[i] - alphaIold) * \
                    dataMatrix[i, :] * dataMatrix[j, :].T - \
                    labelMat[j] * (alphas[j] - alphaJold) * dataMatrix[j, :] * dataMatrix[j, :].T
                if (0 < alphas[i]) and (C > alphas[i]):
                    b = b1
                elif (0 < alphas[j]) and (C > alphas[j]):
                    b = b2
                else:
                    b = (b1 + b2) / 2.0
                alphaPairsChanged += 1
                # Message typo fixed: "paris change" -> "pairs changed".
                print('iter:{0} i:{1} pairs changed {2}'.format(iter_num, i, alphaPairsChanged))
        if alphaPairsChanged == 0:
            iter_num += 1
        else:
            iter_num = 0
        # Message typo fixed: "numebr" -> "number".
        print('iteration number :{}'.format(iter_num))
    return b, alphas
# NOTE(review): in the original notebook this import was executed before the
# cells above; every numpy name used there (mat, zeros, multiply, shape, ...)
# comes from this star-import.
from numpy import *
# Train the simplified SMO: C=0.6, tolerance 0.001, up to 40 quiet passes.
b,alphas = smoSimple(dataArr,labelArr,0.6,0.001,40)
L==H
L==H
iter:0 i:8 paris change 2
j not moving enough
j not moving enough
iteration numebr :39
j not moving enough
j not moving enough
iteration numebr :40
b
matrix([[-3.85138014]])
alphas[alphas>0]
matrix([[0.11792374, 0.243914 , 0.00461456, 0.35722318]])
瞭解哪些數據點是支持向量
# Print the samples whose alpha > 0 — these are the support vectors.
for i in range(100):
    if alphas[i] > 0.0:
        print(dataArr[i],labelArr[i]) # print the sample at this index
[4.658191, 3.507396] -1.0
[3.457096, -0.082216] -1.0
[5.286862, -2.358286] 1.0
[6.080573, 0.418886] 1.0
作圖標記出來
def plotTool():
    """Scatter-plot the two classes of testSet.txt and annotate the four
    support vectors found by the simplified SMO run above."""
    import matplotlib.pyplot as plt
    dataMat, labelMat = loadDataSet("E:\\DataMining\\Project\\MLBook\\機器學習實戰源代碼\\machinelearninginaction\\Ch06\\testSet.txt")
    points = array(dataMat)
    pos_x, pos_y = [], []
    neg_x, neg_y = [], []
    for idx in range(shape(points)[0]):
        if int(labelMat[idx]) == 1:
            pos_x.append(points[idx, 0])
            pos_y.append(points[idx, 1])
        else:
            neg_x.append(points[idx, 0])
            neg_y.append(points[idx, 1])
    fig = plt.figure()
    axis = fig.add_subplot(111)
    axis.scatter(pos_x, pos_y, c='red', marker='s')
    axis.scatter(neg_x, neg_y, c='green')
    plt.xlabel('x1')
    plt.ylabel('x2')
    # Mark each support vector with an arrow annotation (coordinates taken
    # from the simplified-SMO result printed earlier in the notebook).
    sv_annotations = [((4.658191, 3.507396), (20, 0)),
                      ((3.457096, -0.082216), (20, 0)),
                      ((5.286862, -2.358286), (10, 20)),
                      ((6.080573, 0.418886), (10, 20))]
    for xy, offset in sv_annotations:
        axis.annotate("SVector", xy=xy, xycoords='data', xytext=offset,
                      textcoords='offset points',
                      arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=-0.2'))
    plt.show()
# Draw the scatter plot with the four support vectors annotated.
plotTool()
計算w值
def calWs(alphas, dataArr, classLabels):
    """Recover the weight vector w = sum_i alpha_i * y_i * x_i.

    Args:
        alphas: m x 1 matrix of trained Lagrange multipliers.
        dataArr: training samples (m x n).
        classLabels: class labels in {-1.0, +1.0}.

    Returns:
        n x 1 array holding the separating hyperplane's weights.
    """
    X = mat(dataArr)
    labelMat = mat(classLabels).transpose()
    m, n = shape(X)
    w = zeros((n, 1))
    for idx in range(m):
        # Only support vectors (alpha > 0) actually contribute to the sum.
        w += multiply(alphas[idx] * labelMat[idx], X[idx, :].T)
    return w
# Recover the weight vector w from the trained alphas.
w = calWs(alphas,dataArr,labelArr)
完整版Platt SMO的支持函數
PS:在calcEk()註釋掉了後面引入核函數需要修改的部分
class optStruct:
    """Container for the state shared by the full Platt SMO routines."""

    def __init__(self, dataMatIn, classLabels, C, toler):
        """Store the data set, hyper-parameters and working buffers."""
        self.X = dataMatIn            # training samples (matrix)
        self.labelMat = classLabels   # column matrix of labels
        self.C = C                    # soft-margin constant
        self.tol = toler              # KKT tolerance
        self.m = shape(dataMatIn)[0]  # number of samples (rows)
        self.b = 0                    # threshold, updated during training
        self.alphas = mat(zeros((self.m, 1)))
        # Error cache: column 0 is a validity flag, column 1 the cached E value.
        self.eCache = mat(zeros((self.m, 2)))
def calcEk(oS, k):
    """Compute the prediction error E_k = f(x_k) - y_k for sample k.

    Args:
        oS: an optStruct instance (uses alphas, labelMat, X, b).
        k: sample index.

    Returns:
        The error E_k as a float.
    """
    # f(x_k) = sum_i alpha_i * y_i * <x_i, x_k> + b  (linear-kernel form)
    coeffs = multiply(oS.alphas, oS.labelMat).T
    fXk = float(coeffs * (oS.X * oS.X[k, :].T)) + oS.b
    # Kernelized variant (needs the kernel-enabled optStruct with oS.K):
    # fXk = float(multiply(oS.alphas, oS.labelMat).T * oS.K[:, k] + oS.b)
    return fXk - float(oS.labelMat[k])
def selectJ(i,oS,Ei):
    '''Heuristically choose the second alpha for the inner loop.

    Picks the cached index k that maximizes the step size |Ei - Ek|;
    falls back to a random index when the error cache holds no other
    valid entries yet. Returns (j, Ej).

    NOTE(review): if every cached |Ei - Ek| is 0, maxK stays -1 and
    (-1, 0) is returned — callers index with it, which then refers to
    the last sample. Verify this is the intended book behavior.'''
    maxK = -1;maxDeltaE = 0;Ej = 0
    # Mark E_i as valid in the cache before scanning it.
    oS.eCache[i] = [1,Ei]
    # nonzero(a) returns a tuple of index arrays (one per axis) giving the
    # positions of the non-zero entries, e.g.:
    #   >>> b2 = np.array([[True, False, True], [True, False, False]])
    #   >>> np.nonzero(b2)
    #   (array([0, 0, 1]), array([0, 2, 0]))
    # meaning b2[0,0], b2[0,2] and b2[1,0] are non-zero.
    # matrix.A converts a numpy matrix to a plain ndarray.
    validEcacheList = nonzero(oS.eCache[:,0].A)[0]
    if (len(validEcacheList)) > 1:
        # Scan every index whose cached E is flagged valid.
        for k in validEcacheList:
            if k == i:
                continue
            Ek = calcEk(oS,k)
            deltaE = abs(Ei - Ek)
            if (deltaE > maxDeltaE):
                maxK = k;maxDeltaE = deltaE;Ej = Ek
        return maxK,Ej
    else:
        # First pass: nothing else cached, so choose j at random.
        j = selectJrand(i,oS.m)
        Ej = calcEk(oS,j)
        return j,Ej
def updateEk(oS, k):
    """Recompute the error for sample k and store it in the cache,
    marking the entry as valid (flag column set to 1)."""
    oS.eCache[k] = [1, calcEk(oS, k)]
完整Platt SMO 算法中的優化例程
PS:下面代碼內部增添了在引入核函數後的內容,會註釋掉,要用時再刪除註釋即可
File "<ipython-input-121-0f2fd9f66347>", line 1
PS:下面代碼內部增添了在引入核函數後的內容,會註釋掉,要用時再刪除註釋即可
^
SyntaxError: invalid character in identifier
(註:上面這個 SyntaxError 是誤把這行說明文字當作代碼在 notebook 中執行所致,與正文代碼無關,可以忽略。)
def innerL(i,oS):
    '''Platt SMO inner loop: attempt to optimize the alpha pair (i, j).

    Args:
        i: index of the first alpha.
        oS: optStruct holding the data set, alphas, b and the error cache.

    Returns:
        1 if the alpha pair was changed, 0 otherwise.'''
    Ei = calcEk(oS,i)
    # Only optimize alpha_i if it violates the KKT conditions within oS.tol.
    if ((oS.labelMat[i]*Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or\
       ((oS.labelMat[i]*Ei > oS.tol) and (oS.alphas[i] > 0)):
        j,Ej = selectJ(i,oS,Ei)
        alphaIold = oS.alphas[i].copy();alphaJold = oS.alphas[j].copy()
        # Box bounds L, H depend on whether the two labels agree.
        if (oS.labelMat[i] != oS.labelMat[j]):
            L = max(0,oS.alphas[j]-oS.alphas[i])
            H = min(oS.C,oS.C + oS.alphas[j] - oS.alphas[i])
        else:
            L = max(0,oS.alphas[j] + oS.alphas[i] - oS.C)
            H = min(oS.C,oS.alphas[j] + oS.alphas[i])
        if L == H:
            print('L==H');return 0
        # eta: linear (no-kernel) version.
        eta = 2.0 * oS.X[i,:]*oS.X[j,:].T - oS.X[i,:]*oS.X[i,:].T - oS.X[j,:]*oS.X[j,:].T
        # eta: kernelized version (requires the kernel-enabled optStruct's oS.K):
        #eta = 2.0 * oS.K[i,j] - oS.K[i,i] - oS.K[j,j]
        if eta >= 0:
            print('eta>=0');return 0
        oS.alphas[j] -= oS.labelMat[j]*(Ei - Ej)/eta
        oS.alphas[j] = clipAlpha(oS.alphas[j],H,L)
        updateEk(oS,j) # refresh the error cache entry for j
        if (abs(oS.alphas[j] - alphaJold) < 0.00001):
            print('j not moving enough');return 0
        # Move alpha_i by the same amount in the opposite direction.
        oS.alphas[i] += oS.labelMat[j]*oS.labelMat[i]*(alphaJold - oS.alphas[j])
        updateEk(oS,i)
        # Update the threshold b.
        # b1, b2: linear (no-kernel) version.
        b1 = oS.b - Ei -oS.labelMat[i]*(oS.alphas[i]-alphaIold)*\
             oS.X[i,:]*oS.X[i,:].T - oS.labelMat[j]*\
             (oS.alphas[j] - alphaJold)*oS.X[i,:]*oS.X[j,:].T
        b2 = oS.b - Ej -oS.labelMat[i]*(oS.alphas[i]-alphaIold)*\
             oS.X[i,:]*oS.X[j,:].T - oS.labelMat[j]*\
             (oS.alphas[j] - alphaJold)*oS.X[j,:]*oS.X[j,:].T
        # b1, b2: kernelized version (requires oS.K):
        #b1 = oS.b - Ei - oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,i] -\
        #     oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[i,j]
        #b2 = oS.b - Ej - oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,j] -\
        #     oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[j,j]
        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]):
            oS.b = b1
        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]):
            oS.b = b2
        else:
            oS.b = (b1 + b2)/2.0
        return 1
    else : return 0
完整版Platt SMO的外循環代碼
def smoP(dataMatIn,classLabels,C,toler,maxIter,kTup=('lin',0)):
    '''Full Platt SMO outer loop.

    Alternates between a pass over the whole data set and passes over
    only the non-bound alphas (0 < alpha < C), until maxIter passes are
    reached or no alpha changes in either sweep mode. Returns (b, alphas).

    NOTE(review): kTup is forwarded to optStruct, so this requires the
    kernel-enabled optStruct defined later in the file (5-argument
    __init__); the earlier 4-argument optStruct would raise a TypeError.'''
    oS = optStruct(mat(dataMatIn),mat(classLabels).transpose(),C,toler,kTup)
    iter = 0
    entireSet = True;alphaPairsChanged = 0
    while(iter < maxIter) and ((alphaPairsChanged > 0) or (entireSet)):
        alphaPairsChanged = 0
        if entireSet:
            # Full pass over every sample.
            for i in range(oS.m):
                alphaPairsChanged += innerL(i,oS)
                print('fullSet,iter:{0} i:{1},pairs changed {2}'.format(iter,i,alphaPairsChanged))
            iter += 1
        else:
            nonBoundIs = nonzero((oS.alphas.A > 0)*(oS.alphas.A < C))[0]
            # Pass over the non-bound alphas only.
            for i in nonBoundIs:
                alphaPairsChanged += innerL(i,oS)
                print('non-bound,iter:{0} i:{1},pairs changed {2}'.format(iter,i,alphaPairsChanged))
            iter += 1
        # Toggle between sweep modes; return to full sweeps only when a
        # non-bound sweep made no progress.
        if entireSet:
            entireSet = False
        elif (alphaPairsChanged == 0):
            entireSet = True
        print("iteration number:{}".format(iter))
    return oS.b,oS.alphas
進行測試
# Run the full Platt SMO with the same hyper-parameters as the simplified run.
b,alphas = smoP(dataArr,labelArr,0.6,0.001,40)
fullSet,iter:0 i:0,pairs changed 1
fullSet,iter:3 i:96,pairs changed 0
fullSet,iter:3 i:97,pairs changed 0
fullSet,iter:3 i:98,pairs changed 0
fullSet,iter:3 i:99,pairs changed 0
iteration number:4
瞭解哪些數據點是支持向量,可以看到與簡易版SMO是不一樣的
# Print the support vectors found by full SMO (a different set than the
# simplified version produced).
for i in range(100):
    if alphas[i] > 0.0:
        print(dataArr[i],labelArr[i]) # print the sample at this index
[3.634009, 1.730537] -1.0
[3.125951, 0.293251] -1.0
[4.658191, 3.507396] -1.0
[3.223038, -0.552392] -1.0
[3.457096, -0.082216] -1.0
[5.286862, -2.358286] 1.0
[6.080573, 0.418886] 1.0
計算w值
ws = calWs(alphas,dataArr,labelArr)
ws
array([[ 0.74764704],
[-0.17895243]])
以第一個數據點爲例進行分類:大於0爲正類,否則負類
dataMat = mat(dataArr)
dataMat[0]*mat(ws)+b
matrix([[-0.98996178]])
在複雜數據上應用核函數
核函數轉換函數
def kernelTrans(X, A, kTup):
    '''Evaluate the chosen kernel between every row of X and the row vector A.

    Args:
        X: m x n matrix of samples.
        A: 1 x n sample row.
        kTup: kernel description tuple — kTup[0] is the type string
            ('lin' or 'rbf'); for 'rbf', kTup[1] is the width parameter.

    Returns:
        m x 1 matrix of kernel values K(X[j], A).

    Raises:
        NameError: if the kernel type string is not recognized.
    '''
    m, n = shape(X)
    kind = kTup[0]
    if kind == 'lin':
        # Linear kernel: plain inner products.
        return X * A.T
    if kind == 'rbf':
        # Gaussian kernel, using the book's K = exp(-||x - a||^2 / sigma^2)
        # normalization (no factor of 2 in the denominator).
        K = mat(zeros((m, 1)))
        for row in range(m):
            diff = X[row, :] - A
            K[row] = diff * diff.T
        return exp(K / (-1 * kTup[1] ** 2))
    raise NameError('Houston We Have a Problem - - That Kernel is not recognized')
class optStruct:
    """SMO state container, extended with a precomputed kernel matrix.

    Redefines the earlier optStruct: the extra kTup argument selects the
    kernel, and self.K caches all pairwise kernel values up front."""

    def __init__(self, dataMatIn, classLabels, C, toler, kTup):
        self.X = dataMatIn            # training samples (matrix)
        self.labelMat = classLabels   # column matrix of labels
        self.C = C                    # soft-margin constant
        self.tol = toler              # KKT tolerance
        self.m = shape(dataMatIn)[0]  # number of samples (rows)
        self.b = 0
        self.alphas = mat(zeros((self.m, 1)))
        self.eCache = mat(zeros((self.m, 2)))
        # Precompute the full m x m kernel matrix, one column per sample;
        # kTup selects the kernel type and parameter (see kernelTrans).
        self.K = mat(zeros((self.m, self.m)))
        for col in range(self.m):
            self.K[:, col] = kernelTrans(self.X, self.X[col, :], kTup)
利用核函數進行分類的徑向基測試函數
def testRbf(k1=1.3):
    '''Train an RBF-kernel SVM on testSetRBF.txt and report the training
    error rate, then the error rate on the held-out testSetRBF2.txt.

    Args:
        k1: RBF width parameter passed as kTup=('rbf', k1).

    NOTE(review): requires the kernel-enabled optStruct and the kernel
    variants inside innerL()/calcEk() to be uncommented first (see the
    note near the end of the article).'''
    # -------- training phase --------
    dataArr,labelArr = loadDataSet('E:\\DataMining\\Project\\MLBook\\機器學習實戰源代碼\\machinelearninginaction\\Ch06\\testSetRBF.txt')
    b,alphas = smoP(dataArr,labelArr,200,0.0001,10000,('rbf',k1))
    dataMat = mat(dataArr);labelMat = mat(labelArr).transpose()
    # Keep only the support vectors (alpha > 0); they fully determine predictions.
    svInd=nonzero(alphas.A>0)[0]
    sVs = dataMat[svInd]
    labelSV = labelMat[svInd]
    print("there are {} Support Vectors".format(shape(sVs)[0]))
    m,n = shape(dataMat)
    errorCount = 0
    for i in range(m):
        # Predict via the kernel expansion over the support vectors only.
        kernelEval = kernelTrans(sVs,dataMat[i,:],('rbf',k1))
        predict = kernelEval.T*multiply(labelSV,alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print('the training error rate is:{}'.format(float(errorCount)/m))
    # -------- testing phase on a separate file --------
    dataArr,labelArr = loadDataSet('E:\\DataMining\\Project\\MLBook\\機器學習實戰源代碼\\machinelearninginaction\\Ch06\\testSetRBF2.txt')
    errorCount = 0
    dataMat = mat(dataArr);labelMat = mat(labelArr).transpose()
    m,n = shape(dataMat)
    for i in range(m):
        kernelEval = kernelTrans(sVs,dataMat[i,:],('rbf',k1))
        predict = kernelEval.T*multiply(labelSV,alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print('the test error rate is:{}'.format(float(errorCount)/m))
在運行 testRbf() 之前,要把前面 innerL() 和 calcEk() 中「未引入核函數」的那幾行代碼註釋掉,並取消「引入核函數後」那幾行的註釋,否則計算仍會走線性(非核)路徑。
# Run the RBF-kernel demo (expects the kernel code paths to be enabled).
testRbf()
fullSet,iter:0 i:0,pairs changed 1
fullSet,iter:5 i:97,pairs changed 0
fullSet,iter:5 i:98,pairs changed 0
fullSet,iter:5 i:99,pairs changed 0
iteration number:6
there are 27 Support Vectors
the training error rate is:0.01
the test error rate is:0.02