機器學習 | K近鄰算法

由於近期學業繁重QAQ，所以我就不說廢話了，直接上代碼~

使用K近鄰算法改進約會網站

from numpy import *
import operator
import matplotlib
import matplotlib.pyplot as plt

# Load a tab-separated data file into a NumPy feature matrix and a label list.
def file2matrix(filename):
    """Parse a tab-separated text file into a feature matrix and labels.

    Every non-blank line is stripped and split on tab characters. The first
    three fields become one row of the feature matrix and the last field is
    converted with ``int`` and appended to the label list. Blank lines are
    skipped so that a trailing empty line does not break the conversion.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: ``returnMat`` is a float
        array with one row per non-blank line and three columns, and
        ``classLabelVector`` is a list of ints in the same row order.

    Raises:
        ValueError: If a field cannot be converted to a number.
    """
    # The context manager closes the file even if parsing fails later on.
    with open(filename) as fr:
        stripped = [line.strip() for line in fr]
    arrayOLines = [line for line in stripped if line]

    # Size the matrix from the lines that actually carry data.
    returnMat = zeros((len(arrayOLines), 3))
    classLabelVector = []
    for index, line in enumerate(arrayOLines):
        listFromLine = line.split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
        
# Scatter-plot the first two feature columns, styled by class label.
def myDraw(datingDataMat,datingLabels):
    """Display a scatter plot of column 0 against column 1 of the data.

    Every label is multiplied by 15.0 and the resulting array is passed to
    the plot both as the marker sizes and as the marker colour values.

    Args:
        datingDataMat: Two-dimensional array; columns 0 and 1 are plotted.
        datingLabels: Sequence of numeric labels, one per row.
    """
    # One scaled copy of the labels drives both marker size and colour.
    scaledLabels = 15.0 * array(datingLabels)
    xValues = datingDataMat[:, 0]
    yValues = datingDataMat[:, 1]

    # A single set of axes filling the whole figure (1 row, 1 column).
    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)
    axes.scatter(xValues, yValues, s=scaledLabels, c=scaledLabels)
    plt.show()
        
# Min-max normalise every feature column.
def autoNorm(dataSet):
    """Rescale each column of ``dataSet`` with ``(value - min) / (max - min)``.

    A column whose values are all equal has a range of 0. Such a column is
    divided by 1 instead, so it normalises to 0.0 rather than NaN.

    Args:
        dataSet: Two-dimensional NumPy array, one sample per row.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the normalised array,
        the per-column ``max - min`` (left at 0 for constant columns) and
        the per-column minimum.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Only the divisor is patched; the returned ranges stay the true ranges.
    divisors = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / divisors
    return normDataSet, ranges, minVals
        
# Classify one sample by majority vote among its k nearest neighbours.
def classify0(inX,dataSet,labels,k):
    """Return the most frequent label among the nearest rows of ``dataSet``.

    Neighbours are ranked by the sum of squared differences to ``inX``. If
    ``k`` exceeds the number of rows, every row takes part in the vote. When
    several labels share the highest count, the one met first while walking
    the neighbours from nearest to farthest is returned.

    Args:
        inX: The sample to classify; must broadcast against a row of
            ``dataSet``.
        dataSet: Two-dimensional NumPy array of known samples, one per row.
        labels: Sequence of labels, indexable by the row numbers of
            ``dataSet``.
        k: Number of neighbours that vote.

    Returns:
        The winning label.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``dataSet`` has no rows.
    """
    dataSetSize = dataSet.shape[0]
    if k < 1 or dataSetSize == 0:
        raise ValueError("classify0 needs k >= 1 and a non-empty dataSet")

    # Broadcasting subtracts inX from every row without building a tiled copy.
    diffMat = inX - dataSet
    distances = (diffMat ** 2).sum(axis=1)

    # Slicing caps the neighbour count at the number of available rows.
    nearestIndices = distances.argsort()[:k]

    classCount = {}
    for rowIndex in nearestIndices:
        voteIlabel = labels[rowIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # max() keeps the first label that reaches the top count.
    return max(classCount.items(), key=operator.itemgetter(1))[0]
        
# Split the data into a test part and a training part and measure how well
# the classifier performs on the test part.
def datingClassTest(hoRatio=0.10,filename='datingTestSet2.txt',k=3):
    """Run a hold-out evaluation of ``classify0`` and print the error rate.

    The file is loaded with ``file2matrix`` and normalised with
    ``autoNorm``. The first ``int(rows * hoRatio)`` rows are used as test
    samples and the remaining rows as the known samples. Each prediction is
    printed next to the real label, then the overall error rate is printed
    and the raw data is plotted with ``myDraw``.

    Args:
        hoRatio: Fraction of the rows held out for testing.
        filename: Data file handed to ``file2matrix``.
        k: Number of neighbours handed to ``classify0``.

    Raises:
        ValueError: If the split leaves no test rows or no training rows.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, _, _ = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    # Without this guard an empty test part ends in a division by zero below.
    if not 0 < numTestVecs < m:
        raise ValueError(
            "hoRatio=%r leaves %d test rows out of %d" % (hoRatio, numTestVecs, m))

    # The training part is the same for every test row, so slice it once.
    trainingMat = normMat[numTestVecs:m, :]
    trainingLabels = datingLabels[numTestVecs:m]

    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainingMat, trainingLabels, k)
        print("the classifierResult came back with: %d,the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    myDraw(datingDataMat, datingLabels)
        
# Prediction function. The three features asked for are:
#   - percentage of time spent playing video games
#   - frequent flyer miles earned per year
#   - litres of ice cream consumed per week
def classifyPerson():
    """Ask for the three feature values and print the predicted category.

    The values are read with ``input``, normalised with the minimum and
    range that ``autoNorm`` computes for 'datingTestSet2.txt', and
    classified with ``classify0`` using 3 neighbours. The predicted label
    minus one indexes the list of result descriptions.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(input("玩視頻遊戲所消耗的時間百分比?"))
    ffMiles = float(input("每年獲得的飛行常客里程數?"))
    iceCream = float(input("每週消費的冰淇淋公升數?"))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # Miles come first here, unlike the prompt order; presumably this is the
    # column order of the data file - verify against the file.
    inArr = array([ffMiles, percentTats, iceCream])
    # A zero range would turn the division into inf/NaN, so divide by 1 there.
    safeRanges = where(ranges == 0, 1, ranges)
    classifierResult = classify0((inArr - minVals) / safeRanges, normMat, datingLabels, 3)
    print("You will probably like this person: ", resultList[classifierResult - 1])
        
        
def main():
    """Script entry point: run the interactive classifyPerson() prediction."""
    classifyPerson()
    
    
# Call main() only when this file is executed as a script, not when imported.
if __name__=='__main__':
    main()

datingTestSet2.txt數據預覽

40920    8.326976    0.953952    3
14488    7.153469    1.673904    2
26052    1.441871    0.805124    1
75136    13.147394    0.428964    1
38344    1.669788    0.134296    1
72993    10.141740    1.032955    1
35948    6.830792    1.213192    3
42666    13.276369    0.543880    3
67497    8.631577    0.749278    1
35483    12.273169    1.508053    3
50242    3.723498    0.831917    1
63275    8.385879    1.669485    1
5569    4.875435    0.728658    2
51052    4.680098    0.625224    1
77372    15.299570    0.331351    1
43673    1.889461    0.191283    1
61364    7.516754    1.269164    1
69673    14.239195    0.261333    1
15669    0.000000    1.250185    2
28488    10.528555    1.304844    3
6487    3.540265    0.822483    2
37708    2.991551    0.833920    1
22620    5.297865    0.638306    2
28782    6.593803    0.187108    3
19739    2.816760    1.686209    2
36788    12.458258    0.649617    3
5741    0.000000    1.656418    2
28567    9.968648    0.731232    3
6808    1.364838    0.640103    2
41611    0.230453    1.151996    1
36661    11.865402    0.882810    3
43605    0.120460    1.352013    1
15360    8.545204    1.340429    3
63796    5.856649    0.160006    1
10743    9.665618    0.778626    2
70808    9.778763    1.084103    1
72011    4.932976    0.632026    1
5914    2.216246    0.587095    2
14851    14.305636    0.632317    3
33553    12.591889    0.686581    3
44952    3.424649    1.004504    1
17934    0.000000    0.147573    2
27738    8.533823    0.205324    3
29290    9.829528    0.238620    3
42330    11.492186    0.263499    3
36429    3.570968    0.832254    1
39623    1.771228    0.207612    1
32404    3.513921    0.991854    1
27268    4.398172    0.975024    1
5477    4.276823    1.174874    2
14254    5.946014    1.614244    2
68613    13.798970    0.724375    1
41539    10.393591    1.663724    3
7917    3.007577    0.297302    2
21331    1.031938    0.486174    2
8338    4.751212    0.064693    2
5176    3.692269    1.655113    2
18983    10.448091    0.267652    3
68837    10.585786    0.329557    1
13438    1.604501    0.069064    2
48849    3.679497    0.961466    1
12285    3.795146    0.696694    2
7826    2.531885    1.659173    2
5565    9.733340    0.977746    2
10346    6.093067    1.413798    2
1823    7.712960    1.054927    2
9744    11.470364    0.760461    3
16857    2.886529    0.934416    2
39336    10.054373    1.138351    3
65230    9.972470    0.881876    1
2463    2.335785    1.366145    2
27353    11.375155    1.528626    3
16191    0.000000    0.605619    2
12258    4.126787    0.357501    2
42377    6.319522    1.058602    1
25607    8.680527    0.086955    3
77450    14.856391    1.129823    1
58732    2.454285    0.222380    1
46426    7.292202    0.548607    3
32688    8.745137    0.857348    3
64890    8.579001    0.683048    1
8554    2.507302    0.869177    2
28861    11.415476    1.505466    3
42050    4.838540    1.680892    1
32193    10.339507    0.583646    3
64895    6.573742    1.151433    1
2355    6.539397    0.462065    2
0    2.209159    0.723567    2
70406    11.196378    0.836326    1
57399    4.229595    0.128253    1
41732    9.505944    0.005273    3
11429    8.652725    1.348934    3
75270    17.101108    0.490712    1
5459    7.871839    0.717662    2
73520    8.262131    1.361646    1
40279    9.015635    1.658555    3
21540    9.215351    0.806762    3
17694    6.375007    0.033678    2
22329    2.262014    1.022169    1
46570    5.677110    0.709469    1
...

使用K近鄰算法實現手寫識別

from numpy import *
import operator
from os import listdir

# Flatten a 32x32 text image into a 1x1024 vector so that it can be fed to
# the same classifier as before.
def img2vector(filename):
    """Read a 32x32 grid of digit characters into a 1x1024 array.

    The first 32 characters of each of the first 32 lines are converted
    with ``int``. The character at row ``i``, column ``j`` is stored at
    position ``32 * i + j`` of the result.

    Args:
        filename: Path of the text file to read.

    Returns:
        A float array of shape ``(1, 1024)``.

    Raises:
        IndexError: If one of the first 32 lines has fewer than 32
            characters.
        ValueError: If a character is not a digit.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even if a conversion fails.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect

# Classify one sample by majority vote among its k nearest neighbours.
def classify0(inX,dataSet,labels,k):
    """Return the most frequent label among the nearest rows of ``dataSet``.

    Neighbours are ranked by the sum of squared differences to ``inX``. If
    ``k`` exceeds the number of rows, every row takes part in the vote. When
    several labels share the highest count, the one met first while walking
    the neighbours from nearest to farthest is returned.

    Args:
        inX: The sample to classify; must broadcast against a row of
            ``dataSet``.
        dataSet: Two-dimensional NumPy array of known samples, one per row.
        labels: Sequence of labels, indexable by the row numbers of
            ``dataSet``.
        k: Number of neighbours that vote.

    Returns:
        The winning label.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``dataSet`` has no rows.
    """
    dataSetSize = dataSet.shape[0]
    if k < 1 or dataSetSize == 0:
        raise ValueError("classify0 needs k >= 1 and a non-empty dataSet")

    # Broadcasting subtracts inX from every row without building a tiled copy.
    diffMat = inX - dataSet
    distances = (diffMat ** 2).sum(axis=1)

    # Slicing caps the neighbour count at the number of available rows.
    nearestIndices = distances.argsort()[:k]

    classCount = {}
    for rowIndex in nearestIndices:
        voteIlabel = labels[rowIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # max() keeps the first label that reaches the top count.
    return max(classCount.items(), key=operator.itemgetter(1))[0]

# Extract the class digit from a sample file name.
def _digitFromFileName(fileNameStr):
    # '7_200.txt' is instance 200 of the digit 7: keep the part before the
    # first '.', then the part before the first '_'.
    return int(fileNameStr.split('.')[0].split('_')[0])


# Handwritten digit recognition test.
def handwritingClassTest(trainingDir='trainingDigits',testDir='testDigits',k=3):
    """Classify every file in ``testDir`` against ``trainingDir``.

    Each training file is converted with ``img2vector`` into one row of a
    1024-column matrix, and its label is parsed from the file name. Each
    test file is then classified with ``classify0``. The prediction is
    printed next to the real label, followed by the number of errors and
    the error rate.

    Args:
        trainingDir: Directory holding the training sample files.
        testDir: Directory holding the test sample files.
        k: Number of neighbours handed to ``classify0``.

    Raises:
        ValueError: If either directory contains no files.
    """
    trainingFileList = listdir(trainingDir)
    m = len(trainingFileList)
    testFileList = listdir(testDir)
    mTest = len(testFileList)
    # Without this guard an empty test directory ends in a division by zero.
    if m == 0 or mTest == 0:
        raise ValueError(
            "need at least one training file and one test file (found %d and %d)"
            % (m, mTest))

    # One row per training file, one column per pixel.
    hwLabels = []
    trainingMat = zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(_digitFromFileName(fileNameStr))
        trainingMat[i, :] = img2vector('%s/%s' % (trainingDir, fileNameStr))

    errorCount = 0.0
    for fileNameStr in testFileList:
        classNumStr = _digitFromFileName(fileNameStr)
        vectorUnderTest = img2vector('%s/%s' % (testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        print("the classifier came back with: %d,the real answer is: %d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))
    
    
def main():
    """Script entry point: run handwritingClassTest()."""
    # Disabled debugging aid: convert one sample file with img2vector and
    # print the first 31 values of the resulting vector.
    #testVector=img2vector('./MLiA_SourceCode/machinelearninginaction/Ch02/digits/testDigits/0_13.txt')
    #print('testVector:')
    #print(testVector[0,0:31])
    handwritingClassTest()
    
# Call main() only when this file is executed as a script, not when imported.
if __name__=='__main__':
    main()

0_0.txt數據預覽

00000000000001111000000000000000
00000000000011111110000000000000
00000000001111111111000000000000
00000001111111111111100000000000
00000001111111011111100000000000
00000011111110000011110000000000
00000011111110000000111000000000
00000011111110000000111100000000
00000011111110000000011100000000
00000011111110000000011100000000
00000011111100000000011110000000
00000011111100000000001110000000
00000011111100000000001110000000
00000001111110000000000111000000
00000001111110000000000111000000
00000001111110000000000111000000
00000001111110000000000111000000
00000011111110000000001111000000
00000011110110000000001111000000
00000011110000000000011110000000
00000001111000000000001111000000
00000001111000000000011111000000
00000001111000000000111110000000
00000001111000000001111100000000
00000000111000000111111000000000
00000000111100011111110000000000
00000000111111111111110000000000
00000000011111111111110000000000
00000000011111111111100000000000
00000000001111111110000000000000
00000000000111110000000000000000
00000000000011000000000000000000

機器學習 | K近鄰算法

使用K近鄰算法改進約會網站

datingTestSet2.txt數據預覽

使用K近鄰算法實現手寫識別

0_0.txt數據預覽

2024年DataOps趨勢預測：AI不會取代數據工程師

雲原生週刊：K8s 中的服務和網絡｜ 2024.4.29

通過Http鏈接地址爬取有贊微信商城商品信息及下載至EXCEL

多人同時導出 Excel 幹崩服務器！新來的阿里大佬給出的解決方案太優雅了！

[轉帖]cpupower

今天，昨天，近七天，近30天，近90天，js封裝

華爲云云原生FinOps解決方案，釋放雲原生最大價值

用C / C ++實現BP神經網絡

機器學習 | K-均值聚類

機器學習 | CNN卷積神經網絡

機器學習 | 樹迴歸

機器學習 | 預測數值型數據：迴歸

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結