寫在開頭的話:在學習《機器學習實戰》的過程中發現書中很多代碼並沒有註釋,這對新入門的同學是一個挑戰,特此貼出我對代碼做出的註釋,僅供參考,歡迎指正。
1、SMO高效優化算法
#coding:gbk
from numpy import *
def loadDataSet(fileName):
    """Load a tab-separated data set with two features and one label per line.

    Each non-blank line must contain at least three tab-separated fields:
    the first two are parsed as float features, the third as a float label.

    Args:
        fileName: path of the text file to read.

    Returns:
        (dataMat, labelMat): a list of [x0, x1] float pairs and a list of
        float labels, in file order.

    Raises:
        ValueError: a field cannot be parsed as a float.
        IndexError: a non-blank line has fewer than three fields.
    """
    dataMat = []
    labelMat = []
    # The context manager guarantees the handle is closed even on parse errors.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (commonly a trailing newline at EOF).
                continue
            fields = stripped.split('\t')
            dataMat.append([float(fields[0]), float(fields[1])])
            labelMat.append(float(fields[2]))
    return dataMat, labelMat
def selectJrand(i, m):
    """Pick a random integer index in [0, m) that differs from i.

    Args:
        i: the index that must not be returned.
        m: exclusive upper bound of the index range.

    Returns:
        An int j with j != i, obtained by truncating a uniform draw
        from [0, m).

    Raises:
        ValueError: when 0 is the only candidate (m <= 1) and i is 0, so no
            other index exists; without this check the loop never ends.
    """
    if m <= 1 and i == 0:
        raise ValueError("no index other than %r exists in [0, %r)" % (i, m))
    j = i
    # Redraw until the candidate differs from the excluded index.
    while j == i:
        j = int(random.uniform(0, m))
    return j
def clipAlpha(aj, H, L):
    """Clamp aj to the interval [L, H].

    The upper bound is applied first and the lower bound second, so when
    L > H the result is L.

    Args:
        aj: value to clamp.
        H: upper bound.
        L: lower bound.

    Returns:
        H if aj > H, then L if L exceeds that intermediate value,
        otherwise the value unchanged.
    """
    # Written as explicit comparisons rather than max()/min() so that the
    # result does not depend on which max/min names are in scope.
    capped = H if aj > H else aj
    return L if L > capped else capped
def smoSimple(dataMatIn, classLabels, C, toler, maxIter):
    """Train a linear SVM with the simplified SMO algorithm.

    For every sample i that violates the KKT conditions (within toler) a
    second index j is drawn at random with selectJrand, and the pair
    (alphas[i], alphas[j]) is optimised jointly.

    Args:
        dataMatIn: training samples, one row per sample.
        classLabels: sequence of class labels, one per sample.
        C: upper bound applied to every alpha.
        toler: tolerance used in the KKT violation test.
        maxIter: number of consecutive full passes without any alpha
            change required before the loop stops.

    Returns:
        (b, alphas): the offset term (int 0 until the first update, a 1x1
        matrix afterwards) and the m x 1 matrix of Lagrange multipliers.
    """
    # asmatrix is used instead of the `mat` alias, which newer NumPy releases
    # no longer provide.
    dataMatrix = asmatrix(dataMatIn)
    labelMat = asmatrix(classLabels).transpose()
    b = 0
    m = shape(dataMatrix)[0]  # number of training samples
    alphas = asmatrix(zeros((m, 1)))
    # Counts consecutive passes with no change; reset to 0 whenever a pair
    # is updated, so the loop ends only after maxIter quiet passes in a row.
    quietPasses = 0
    while quietPasses < maxIter:
        alphaPairsChanged = 0
        for i in range(m):
            xi = dataMatrix[i, :]
            # Decision value f(x_i) = sum_k alpha_k * y_k * <x_k, x_i> + b.
            fXi = float(multiply(alphas, labelMat).T * (dataMatrix * xi.T)) + b
            Ei = fXi - float(labelMat[i])  # prediction error on sample i
            violatesKKT = ((labelMat[i] * Ei < -toler) and (alphas[i] < C)) or \
                ((labelMat[i] * Ei > toler) and (alphas[i] > 0))
            if violatesKKT:
                # Choose the second multiplier at random.
                j = selectJrand(i, m)
                xj = dataMatrix[j, :]
                fXj = float(multiply(alphas, labelMat).T * (dataMatrix * xj.T)) + b
                Ej = fXj - float(labelMat[j])
                # Copy: a plain reference would follow the in-place updates.
                alphaIold = alphas[i].copy()
                alphaJold = alphas[j].copy()
                # Ends of the feasible segment for alphas[j].
                if labelMat[i] != labelMat[j]:
                    low = alphas[j] - alphas[i]
                    high = C + alphas[j] - alphas[i]
                else:
                    low = alphas[j] + alphas[i] - C
                    high = alphas[j] + alphas[i]
                # Explicit comparisons (same result as max(0, low) and
                # min(C, high)) so the code does not depend on max/min, which
                # a star import from numpy may rebind.
                L = low if low > 0 else 0
                H = high if high < C else C
                if L == H:
                    print("L == H")
                    continue
                eta = 2.0 * xi * xj.T - xi * xi.T - xj * xj.T
                if eta >= 0:
                    print("eta >= 0")
                    continue
                alphas[j] -= labelMat[j] * (Ei - Ej) / eta
                alphas[j] = clipAlpha(alphas[j], H, L)  # keep within [L, H]
                # Skip the pair when alphas[j] barely moved.
                if abs(alphas[j] - alphaJold) < 0.00001:
                    print("j not moving enough")
                    continue
                # Move alphas[i] by the same amount in the opposite direction
                # (scaled by the labels) so that
                # alphas[i]*y_i + alphas[j]*y_j stays constant.
                alphas[i] += labelMat[j] * labelMat[i] * (alphaJold - alphas[j])
                b1 = b - Ei - labelMat[i] * (alphas[i] - alphaIold) * \
                    xi * xi.T - \
                    labelMat[j] * (alphas[j] - alphaJold) * xi * xj.T
                b2 = b - Ej - labelMat[i] * (alphas[i] - alphaIold) * \
                    xi * xj.T - \
                    labelMat[j] * (alphas[j] - alphaJold) * \
                    xj * xj.T
                if (0 < alphas[i]) and (C > alphas[i]):
                    b = b1
                elif (0 < alphas[j]) and (C > alphas[j]):
                    b = b2
                else:
                    b = (b1 + b2) / 2.0
                alphaPairsChanged += 1
                print("iter: %d i: %d, pairs changed %d" % (quietPasses, i, alphaPairsChanged))
        if alphaPairsChanged == 0:
            quietPasses += 1
        else:
            quietPasses = 0
        print("iteration number: %d" % quietPasses)
    return b, alphas
2、利用完整Platt SMO算法加速優化
class optStruct:
    """Container for the state shared by the full Platt SMO routines.

    Attributes:
        X: training samples as passed in (indexed as X[i, :]).
        labelMat: class labels as passed in.
        C: upper bound on every alpha.
        tol: tolerance used in the KKT violation test.
        m: number of rows of X.
        alphas: m x 1 matrix of Lagrange multipliers, initially zero.
        b: offset term, initially 0.
        eCache: m x 2 matrix; column 0 is a validity flag, column 1 the
            cached error value for that sample.
        K: m x m kernel matrix; column i is kernelTrans(X, X[i, :], kTup).
    """

    def __init__(self, dataMatIn, classLabes, C, toler, kTup):
        """Store the inputs and precompute the kernel matrix.

        Args:
            dataMatIn: training samples, one row per sample.
            classLabes: class labels, one per sample.
            C: upper bound on every alpha.
            toler: tolerance used in the KKT violation test.
            kTup: kernel description forwarded unchanged to kernelTrans.
        """
        self.X = dataMatIn
        self.labelMat = classLabes
        self.C = C
        self.tol = toler
        self.m = shape(dataMatIn)[0]
        # asmatrix is used instead of the `mat` alias, which newer NumPy
        # releases no longer provide.
        self.alphas = asmatrix(zeros((self.m, 1)))
        self.b = 0
        self.eCache = asmatrix(zeros((self.m, 2)))
        self.K = asmatrix(zeros((self.m, self.m)))
        # Fill the kernel matrix one column at a time.
        for i in range(self.m):
            self.K[:, i] = kernelTrans(self.X, self.X[i, :], kTup)
def calcEk(oS, k):
    """Return the prediction error for sample k.

    The decision value is computed from column k of the precomputed kernel
    matrix: f(x_k) = (alphas * labels)^T K[:, k] + b.

    Args:
        oS: object exposing alphas, labelMat, K and b.
        k: index of the sample.

    Returns:
        f(x_k) minus the label of sample k, as a float.
    """
    weightedLabels = multiply(oS.alphas, oS.labelMat)
    prediction = float(weightedLabels.T * oS.K[:, k] + oS.b)
    return prediction - float(oS.labelMat[k])
def selectJ(i, oS, Ei):
    """Choose the second alpha index j for the pair (i, j).

    Among the samples whose error-cache entry is flagged valid, the one with
    the largest |Ei - Ek| is chosen. If no such candidate exists, j is drawn
    at random with selectJrand.

    Side effect: marks the cache entry of i as valid with value Ei.

    Args:
        i: index of the first alpha.
        oS: object exposing eCache and m (and whatever calcEk reads).
        Ei: error of sample i.

    Returns:
        (j, Ej): the chosen index and its error.
    """
    maxK = -1        # best candidate so far; -1 means "none found"
    maxDeltaE = 0    # largest |Ei - Ek| seen so far
    Ej = 0
    oS.eCache[i] = [1, Ei]
    # .A turns the flag column into an ndarray; nonzero() returns a tuple of
    # index arrays and [0] holds the row indices of the valid entries.
    validEcacheList = nonzero(oS.eCache[:, 0].A)[0]
    if len(validEcacheList) > 1:
        for k in validEcacheList:
            if k == i:
                continue
            Ek = calcEk(oS, k)
            deltaE = abs(Ei - Ek)
            if deltaE > maxDeltaE:
                maxK = k
                maxDeltaE = deltaE
                Ej = Ek
        if maxK >= 0:
            return maxK, Ej
        # Every cached error equals Ei, so no candidate was selected.
        # Fall through to the random choice instead of returning index -1
        # together with a placeholder error of 0.
    j = selectJrand(i, oS.m)
    Ej = calcEk(oS, j)
    return j, Ej
def updataEk(oS, k):
    """Recompute the error of sample k and store it in the error cache.

    The cache row is written as [1, Ek]; the leading 1 flags the entry as
    valid.

    Args:
        oS: object exposing eCache (and whatever calcEk reads).
        k: index of the sample to refresh.
    """
    oS.eCache[k] = [1, calcEk(oS, k)]
def innerL(i, oS):
    """Run one inner-loop step of the full Platt SMO for index i.

    If sample i violates the KKT conditions (within oS.tol), a partner j is
    chosen with selectJ and the pair (alphas[i], alphas[j]) is optimised,
    updating oS.alphas, oS.b and the error cache in place.

    Args:
        i: index of the first alpha.
        oS: shared optimisation state (alphas, labelMat, K, b, C, tol, ...).

    Returns:
        1 if the pair was updated, 0 otherwise.
    """
    Ei = calcEk(oS, i)
    violatesKKT = ((oS.labelMat[i] * Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or \
        ((oS.labelMat[i] * Ei > oS.tol) and (oS.alphas[i] > 0))
    if not violatesKKT:
        return 0

    j, Ej = selectJ(i, oS, Ei)
    # Copy: a plain reference would follow the in-place updates below.
    alphaIold = oS.alphas[i].copy()
    alphaJold = oS.alphas[j].copy()
    # Ends of the feasible segment for alphas[j].
    if oS.labelMat[i] != oS.labelMat[j]:
        low = oS.alphas[j] - oS.alphas[i]
        high = oS.C + oS.alphas[j] - oS.alphas[i]
    else:
        low = oS.alphas[j] + oS.alphas[i] - oS.C
        high = oS.alphas[j] + oS.alphas[i]
    # Explicit comparisons (same result as max(0, low) and min(C, high)) so
    # the code does not depend on max/min, which a star import from numpy
    # may rebind.
    L = low if low > 0 else 0
    H = high if high < oS.C else oS.C
    if L == H:
        print("L == H")
        return 0
    eta = 2.0 * oS.K[i, j] - oS.K[i, i] - oS.K[j, j]
    if eta >= 0:
        print("eta >= 0")
        return 0
    oS.alphas[j] -= oS.labelMat[j] * (Ei - Ej) / eta
    oS.alphas[j] = clipAlpha(oS.alphas[j], H, L)
    updataEk(oS, j)
    if abs(oS.alphas[j] - alphaJold) < 0.00001:
        print("j not moving enough")
        return 0
    # Move alphas[i] by the same amount in the opposite direction (scaled by
    # the labels).
    oS.alphas[i] += oS.labelMat[j] * oS.labelMat[i] * \
        (alphaJold - oS.alphas[j])
    updataEk(oS, i)
    stepI = oS.labelMat[i] * (oS.alphas[i] - alphaIold)
    stepJ = oS.labelMat[j] * (oS.alphas[j] - alphaJold)
    b1 = oS.b - Ei - stepI * oS.K[i, i] - stepJ * oS.K[i, j]
    b2 = oS.b - Ej - stepI * oS.K[i, j] - stepJ * oS.K[j, j]
    if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]):
        oS.b = b1
    elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]):
        oS.b = b2
    else:
        oS.b = (b1 + b2) / 2.0
    return 1
def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup=('lin', 0)):
    """Train an SVM with the full Platt SMO outer loop.

    Passes alternate between sweeping every sample and sweeping only the
    non-bound samples (0 < alpha < C). After a full sweep the next pass is a
    non-bound sweep; a non-bound sweep that changes nothing triggers another
    full sweep. The loop ends after maxIter passes, or when a full sweep
    changes no pair.

    Args:
        dataMatIn: training samples, one row per sample.
        classLabels: sequence of class labels, one per sample.
        C: upper bound on every alpha.
        toler: tolerance used in the KKT violation test.
        maxIter: maximum number of passes.
        kTup: kernel description forwarded to optStruct.

    Returns:
        (b, alphas): the offset term and the m x 1 matrix of multipliers
        held by the optimisation state.
    """
    # asmatrix is used instead of the `mat` alias, which newer NumPy releases
    # no longer provide.
    oS = optStruct(asmatrix(dataMatIn), asmatrix(classLabels).transpose(),
                   C, toler, kTup)
    passes = 0
    entireSet = True
    alphaPairsChanged = 0
    while (passes < maxIter) and ((alphaPairsChanged > 0) or entireSet):
        alphaPairsChanged = 0
        if entireSet:
            candidates = range(oS.m)
            progressFmt = "fullSet, iter: %d i: %d, pairs changed %d"
        else:
            # Row indices of the alphas strictly between 0 and C.
            candidates = nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
            progressFmt = "non-bound, iter: %d i: %d, pairs changed %d"
        for i in candidates:
            alphaPairsChanged += innerL(i, oS)
            print(progressFmt % (passes, i, alphaPairsChanged))
        passes += 1
        if entireSet:
            # A full sweep is always followed by a non-bound sweep.
            entireSet = False
        elif alphaPairsChanged == 0:
            # The non-bound sweep made no progress: sweep everything again.
            entireSet = True
        print("iteration number: %d" % passes)
    return oS.b, oS.alphas
def calcWs(alphas, dataArr, classLabels):
    """Compute the hyperplane normal vector w = sum_i alpha_i * y_i * x_i.

    Args:
        alphas: Lagrange multipliers, indexable per sample (alphas[i]).
        dataArr: training samples, one row per sample.
        classLabels: sequence of class labels, one per sample.

    Returns:
        An n x 1 ndarray, where n is the number of features.
    """
    # asmatrix is used instead of the `mat` alias, which newer NumPy releases
    # no longer provide.
    X = asmatrix(dataArr)
    labelMat = asmatrix(classLabels).transpose()
    m, n = shape(X)
    w = zeros((n, 1))
    for i in range(m):
        # Samples with alpha == 0 contribute nothing to the sum.
        w += multiply(alphas[i] * labelMat[i], X[i, :].T)
    return w
3、在複雜數據上應用核函數
def kernelTrans(X, A, kTup):
    """Evaluate the kernel between every row of X and the single row A.

    Args:
        X: m x n samples, one row per sample.
        A: 1 x n sample to compare against.
        kTup: kernel description. kTup[0] is the kernel name: 'lin' for the
            linear kernel, 'rbf' for the radial basis function kernel.
            For 'rbf', kTup[1] is the width parameter; it is squared here,
            giving exp(-||x - A||^2 / kTup[1]**2).

    Returns:
        An m x 1 matrix of kernel values.

    Raises:
        NameError: kTup[0] is not a recognised kernel name.
    """
    # Coerce to matrix so that `*` is a matrix product; asmatrix does not
    # copy inputs that are already matrices.
    X = asmatrix(X)
    A = asmatrix(A)
    if kTup[0] == 'lin':
        return X * A.T
    if kTup[0] == 'rbf':
        # Squared Euclidean distance of every row to A, computed for all
        # rows at once instead of a Python-level loop.
        deltaRows = X - A
        sqDist = multiply(deltaRows, deltaRows).sum(axis=1)
        return exp(sqDist / (-1 * kTup[1] ** 2))
    raise NameError('Houston We Have a Problem -- '
                    'That Kernel is not recognized')
def testRbf(k1 = 1.3):
    """Train an RBF-kernel SVM and print its training and test error rates.

    Trains on 'testSetRBF.txt' with smoP, prints the number of support
    vectors and the training error rate, then evaluates the same model on
    'testSetRBF2.txt' and prints the test error rate.

    Args:
        k1: RBF width parameter, passed to the kernel as kTup[1]
            (kernelTrans squares it).
    """
    kTup = ('rbf', k1)
    dataArr, labelArr = loadDataSet('testSetRBF.txt')
    b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, kTup)
    # asmatrix is used instead of the `mat` alias, which newer NumPy releases
    # no longer provide.
    datMat = asmatrix(dataArr)
    labelMat = asmatrix(labelArr).transpose()
    svInd = nonzero(alphas.A > 0)[0]  # row indices of the support vectors
    sVs = datMat[svInd]               # support vectors
    labelSV = labelMat[svInd]         # labels of the support vectors
    print("there are %d Support Vectors" % shape(sVs)[0])
    # alpha_i * y_i per support vector; identical for every prediction.
    svWeights = multiply(labelSV, alphas[svInd])

    def errorRate(samples, labels):
        """Fraction of samples whose predicted sign differs from the label."""
        sampleMat = asmatrix(samples)
        m = shape(sampleMat)[0]
        errorCount = 0
        for i in range(m):
            kernelEval = kernelTrans(sVs, sampleMat[i, :], kTup)
            predict = kernelEval.T * svWeights + b
            if sign(predict) != sign(labels[i]):
                errorCount += 1
        return float(errorCount) / m

    print("the training error rate is %f" % errorRate(dataArr, labelArr))
    dataArr, labelArr = loadDataSet('testSetRBF2.txt')
    print("the test error rate is: %f" % errorRate(dataArr, labelArr))
4、手寫識別問題回顧
def img2vector(filename):
    """Read a 32x32 text image of digits into a 1 x 1024 row vector.

    The first 32 characters of each of the first 32 lines are converted with
    int() and stored row by row.

    Args:
        filename: path of the text image.

    Returns:
        A 1 x 1024 ndarray; element 32*i + j holds character j of line i.

    Raises:
        IndexError: a line has fewer than 32 characters, or the file has
            fewer than 32 lines.
        ValueError: a character cannot be converted by int().
    """
    returnVect = zeros((1, 1024))
    # The context manager guarantees the handle is closed even on errors.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
def loadImages(dirName):
    """Load every text image in a directory as a binary-labelled data set.

    File names are expected to look like '<digit>_<anything>.<ext>'; the
    part before the first '_' is parsed as the digit. Digit 9 is labelled
    -1, every other digit +1.

    Args:
        dirName: directory containing the image files.

    Returns:
        (trainingMat, hwLabels): an m x 1024 ndarray with one image per row
        (as produced by img2vector) and a list of m labels (-1 or 1), in the
        order returned by os.listdir.
    """
    from os import listdir
    fileNames = listdir(dirName)
    count = len(fileNames)
    hwLabels = []
    trainingMat = zeros((count, 1024))
    for row, name in enumerate(fileNames):
        baseName = name.split('.')[0]          # file name without extension
        digit = int(baseName.split('_')[0])    # digit encoded in the name
        hwLabels.append(-1 if digit == 9 else 1)
        trainingMat[row, :] = img2vector('%s/%s' % (dirName, name))
    return trainingMat, hwLabels
def testDigits(kTup = ('rbf', 10)):
    """Train an SVM on handwritten digits and print its error rates.

    Trains on the images in 'trainingDigits' with smoP, prints the number of
    support vectors and the training error rate, then evaluates the same
    model on the images in 'testDigits' and prints the test error rate.

    Args:
        kTup: kernel description forwarded to smoP and kernelTrans.
    """
    dataArr, labelArr = loadImages('trainingDigits')
    b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, kTup)
    # asmatrix is used instead of the `mat` alias, which newer NumPy releases
    # no longer provide.
    datMat = asmatrix(dataArr)
    labelMat = asmatrix(labelArr).transpose()
    svInd = nonzero(alphas.A > 0)[0]  # row indices of the support vectors
    sVs = datMat[svInd]               # support vectors
    labelSV = labelMat[svInd]         # labels of the support vectors
    print("there are %d Support Vectors" % shape(sVs)[0])
    # alpha_i * y_i per support vector; identical for every prediction.
    svWeights = multiply(labelSV, alphas[svInd])

    def errorRate(samples, labels):
        """Fraction of samples whose predicted sign differs from the label."""
        sampleMat = asmatrix(samples)
        m = shape(sampleMat)[0]
        errorCount = 0
        for i in range(m):
            kernelEval = kernelTrans(sVs, sampleMat[i, :], kTup)
            predict = kernelEval.T * svWeights + b
            if sign(predict) != sign(labels[i]):
                errorCount += 1
        return float(errorCount) / m

    print("the training error rate is %f" % errorRate(dataArr, labelArr))
    dataArr, labelArr = loadImages('testDigits')
    print("the test error rate is: %f" % errorRate(dataArr, labelArr))