K-mean均值算法原理講解和代碼實戰

前言

最近在學習吳恩達機器學習課程，剛剛學完第一個無監督學習算法，搭配着機器學習實戰裏面的k-mean實戰，成功的將理論和實際結合了起來。接下來，咱們簡單的分析算法原理之後，着重講解一下源代碼。

項目github地址：K-mean算法實戰

文章目錄

K-mean算法原理解析

K表示的是組的個數，也就是你想把這些數據分成幾類。

算法主要思路：

隨機選擇k個點，作爲聚類的中心點
將所有數據進行歸類，歸類標準是按照歐幾里得距離，數據離哪個中心點近，就屬於哪一類
移動聚類點。計算屬於該中心點的數據的平均值，將聚類點移動到平均值位置
不斷重複2、3，直到數據的歸類不再發生變化

圖形過程演示：

我使用了數據量是50，然後K值設定的是2，也就是分爲兩類

下面咱們看一下這個動圖（文末有生成動圖python代碼地址）：

接下來給大家解析一下：

下圖是隨機初始化的兩個K點，當然這個初始化的範圍是在數據點的範圍之內

下圖開始進行2、3步的循環：

直到數據歸屬不發生改變，則判定爲當前K點爲最佳值。數據歸屬是根據各個數據點距K點的距離判定的，使用的方法是歐幾里得距離公式。

代碼解析

一、生成數據

def CreatData():
    """Write 50 random 2-D sample points to ``data.txt``.

    Each x value is ``np.random.rand() * 3``; the matching y value is x plus
    noise of ``np.random.rand() * 2 - 1``. One point is written per line in
    the form ``x<TAB>y``. Any existing ``data.txt`` is overwritten.
    """
    xs = np.random.rand(50) * 3
    # The noise is drawn one sample at a time, after all x values exist.
    ys = [x + np.random.rand() * 2 - 1 for x in xs]
    with open('data.txt', 'w') as out:
        out.writelines(
            str(x) + '\t' + str(y) + '\n' for x, y in zip(xs, ys)
        )

二、讀取數據

def loadDateSet(fileName):
    """Load a tab-separated numeric text file into a list of float rows.

    Args:
        fileName: Path of the text file; each line holds tab-separated numbers.

    Returns:
        A list containing one list of floats per non-blank line, in file order.

    Raises:
        ValueError: If a field cannot be converted by ``float``.
    """
    dataMat = []
    # The context manager closes the handle even if a row fails to parse.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank line carries no data; float('') would raise.
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat

這裏主要是說一下map這個方法

map(function, iterable) 此方法的作用是將第一個參數（函數）依次作用於第二個參數（列表或其他可迭代對象）中的每一個元素。在 Python 3 中它返回的是一個迭代器，需要用 list() 轉換後才能得到列表。

舉個例子：

>>>def square(x) :            # 計算平方數
...     return x ** 2
... 
>>> list(map(square, [1,2,3,4,5]))   # 計算列表各個元素的平方（Python 3 中需用 list() 取出結果）
[1, 4, 9, 16, 25]

map(float,curline) 的作用可以理解爲將curline 列表中的數字格式轉換成float類型的。

三、歐幾里得距離計算

def distEclud(vecA,vecB):
    """Return the Euclidean distance between ``vecA`` and ``vecB``.

    The difference is squared element-wise, summed, and square-rooted.
    """
    delta = vecA - vecB
    squared = np.power(delta, 2)
    return np.sqrt(np.sum(squared))

四、顯示數據變化過程

def showProcess(clusterAssment,centroids,dataSet=None):
    """Plot the current cluster membership together with the centroids.

    Args:
        clusterAssment: 2-D array whose column 0 holds each point's cluster
            index.
        centroids: Sequence of centroids, one per row; entries 0 and 1 of
            each row are used as the plot coordinates.
        dataSet: 2-D array of the points to draw (columns 0 and 1 are
            plotted). When omitted, the module-level ``datamat`` is used.
    """
    points = datamat if dataSet is None else dataSet
    # Cluster 0 is red and cluster 1 is green; further clusters cycle on.
    palette = ('red', 'green', 'blue', 'orange', 'purple', 'brown')
    labels = np.asarray(clusterAssment)[:, 0]
    for idx in range(len(centroids)):
        # A boolean mask selects the members without an O(n^2) index scan.
        members = points[labels == idx]
        plt.plot(members[:, 0], members[:, 1], 'o',
                 color=palette[idx % len(palette)])
    for idx in range(len(centroids)):
        # facecolors='none' draws a hollow marker; recent Matplotlib
        # releases reject color='' as an invalid colour.
        plt.scatter([centroids[idx][0]], [centroids[idx][1]],
                    facecolors='none', marker='o',
                    edgecolors=palette[idx % len(palette)], linewidths=3)
    plt.show()

五、初始化K點

def randCent(dataSet,k):
    """Create ``k`` random centroids inside the bounding box of the data.

    Args:
        dataSet: 2-D array-like of shape (m, n), one sample per row.
        k: Number of centroids to generate.

    Returns:
        A float ndarray of shape (k, n). Column j is computed as
        ``min_j + (max_j - min_j) * np.random.rand(k)``, so every centroid
        coordinate lies within the range of the data in that dimension.
    """
    data = np.asarray(dataSet)
    n = data.shape[1]  # number of dimensions
    # One row per centroid, one column per dimension.
    centroids = np.zeros((k, n))
    for j in range(n):
        minJ = np.min(data[:, j])
        rangeJ = float(np.max(data[:, j]) - minJ)
        # rand(k) is 1-D, so it fits the column slice of length k directly.
        centroids[:, j] = minJ + rangeJ * np.random.rand(k)
    return centroids

六、執行算法邏輯主體，即上述步驟 2、3

def kMeans(dataSet,k,distMeans=distEclud,createCent = randCent):
    """Cluster the rows of ``dataSet`` into ``k`` groups with k-means.

    The function plots the initial centroids over the data, then alternates
    between assigning every point to its nearest centroid and moving each
    centroid to the mean of its members until no assignment changes. The
    iteration number is printed and ``showProcess`` is called on each pass.

    Args:
        dataSet: 2-D NumPy array, one sample per row (columns 0 and 1 are
            used for the initial plot).
        k: Number of clusters.
        distMeans: Callable ``(vecA, vecB) -> distance``.
        createCent: Callable ``(dataSet, k) -> centroids`` returning one
            centroid per row.

    Returns:
        ``(centroids, clusterAssment)`` where ``clusterAssment`` has shape
        (m, 2): column 0 is the assigned cluster index and column 1 is the
        squared distance to that cluster's centroid.
    """
    m = np.shape(dataSet)[0]  # number of samples
    clusterAssment = np.zeros((m, 2))
    # Start from -1 (no cluster) so the first pass always counts as a change;
    # with a zero start, points first assigned to cluster 0 looked unchanged
    # and the loop could stop before the centroids had settled.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet,k)
    palette = ('red', 'green', 'blue', 'orange', 'purple', 'brown')
    for idx in range(k):
        # facecolors='none' draws a hollow marker; recent Matplotlib
        # releases reject color='' as an invalid colour.
        plt.scatter([centroids[idx][0]], [centroids[idx][1]],
                    facecolors='none', marker='o',
                    edgecolors=palette[idx % len(palette)], linewidths=3)
    plt.plot(dataSet[:,0],dataSet[:,1],'o',color='yellow')
    plt.show()
    clusterChanged = True  # set whenever any point switches cluster
    flag = 1  # iteration counter, used only for the progress message
    while clusterChanged:
        print("當前迭代次數爲：{}".format(flag))
        flag += 1
        clusterChanged = False
        for i in range(m):
            # Infinity is a safe upper bound whatever the scale of the data.
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                distJI = distMeans(centroids[j,:],dataSet[i,:])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i,0] != minIndex:
                clusterChanged = True
            # Store the cluster index and the squared distance to its centroid.
            clusterAssment[i,:] = minIndex,minDist**2
        for cent in range(k):
            # Rows of dataSet currently assigned to this cluster.
            ptsInClust = dataSet[np.nonzero(clusterAssment[:,0]==cent)[0]]
            if ptsInClust.shape[0] > 0:
                centroids[cent,:] = np.mean(ptsInClust,axis=0)
            # An empty cluster keeps its previous centroid; the mean of an
            # empty selection would be NaN and the cluster could never
            # attract a point again.
        showProcess(clusterAssment,centroids)
    return centroids,clusterAssment

全部代碼

import numpy as np
import matplotlib.pyplot as plt

def CreatData():
    """Write 50 random 2-D sample points to ``data.txt``.

    Each x value is ``np.random.rand() * 3``; the matching y value is x plus
    noise of ``np.random.rand() * 2 - 1``. One point is written per line in
    the form ``x<TAB>y``. Any existing ``data.txt`` is overwritten.
    """
    xs = np.random.rand(50) * 3
    # The noise is drawn one sample at a time, after all x values exist.
    ys = [x + np.random.rand() * 2 - 1 for x in xs]
    with open('data.txt', 'w') as out:
        out.writelines(
            str(x) + '\t' + str(y) + '\n' for x, y in zip(xs, ys)
        )
def loadDateSet(fileName):
    """Load a tab-separated numeric text file into a list of float rows.

    Args:
        fileName: Path of the text file; each line holds tab-separated numbers.

    Returns:
        A list containing one list of floats per non-blank line, in file order.

    Raises:
        ValueError: If a field cannot be converted by ``float``.
    """
    dataMat = []
    # The context manager closes the handle even if a row fails to parse.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank line carries no data; float('') would raise.
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat

def distEclud(vecA,vecB):
    """Return the Euclidean distance between ``vecA`` and ``vecB``.

    The difference is squared element-wise, summed, and square-rooted.
    """
    delta = vecA - vecB
    squared = np.power(delta, 2)
    return np.sqrt(np.sum(squared))
def showProcess(clusterAssment,centroids,dataSet=None):
    """Plot the current cluster membership together with the centroids.

    Args:
        clusterAssment: 2-D array whose column 0 holds each point's cluster
            index.
        centroids: Sequence of centroids, one per row; entries 0 and 1 of
            each row are used as the plot coordinates.
        dataSet: 2-D array of the points to draw (columns 0 and 1 are
            plotted). When omitted, the module-level ``datamat`` is used.
    """
    points = datamat if dataSet is None else dataSet
    # Cluster 0 is red and cluster 1 is green; further clusters cycle on.
    palette = ('red', 'green', 'blue', 'orange', 'purple', 'brown')
    labels = np.asarray(clusterAssment)[:, 0]
    for idx in range(len(centroids)):
        # A boolean mask selects the members without an O(n^2) index scan.
        members = points[labels == idx]
        plt.plot(members[:, 0], members[:, 1], 'o',
                 color=palette[idx % len(palette)])
    for idx in range(len(centroids)):
        # facecolors='none' draws a hollow marker; recent Matplotlib
        # releases reject color='' as an invalid colour.
        plt.scatter([centroids[idx][0]], [centroids[idx][1]],
                    facecolors='none', marker='o',
                    edgecolors=palette[idx % len(palette)], linewidths=3)
    plt.show()
def randCent(dataSet,k):
    """Create ``k`` random centroids inside the bounding box of the data.

    Args:
        dataSet: 2-D array-like of shape (m, n), one sample per row.
        k: Number of centroids to generate.

    Returns:
        A float ndarray of shape (k, n). Column j is computed as
        ``min_j + (max_j - min_j) * np.random.rand(k)``, so every centroid
        coordinate lies within the range of the data in that dimension.
    """
    data = np.asarray(dataSet)
    n = data.shape[1]  # number of dimensions
    # One row per centroid, one column per dimension (the earlier (n, 2)
    # allocation only worked when both k and n were 2).
    centroids = np.zeros((k, n))
    for j in range(n):
        minJ = np.min(data[:, j])
        rangeJ = float(np.max(data[:, j]) - minJ)
        # rand(k) is 1-D, so no reshape to a hard-coded length is needed.
        centroids[:, j] = minJ + rangeJ * np.random.rand(k)
    return centroids

def kMeans(dataSet,k,distMeans=distEclud,createCent = randCent):
    """Cluster the rows of ``dataSet`` into ``k`` groups with k-means.

    The function prints and plots the initial centroids, then alternates
    between assigning every point to its nearest centroid and moving each
    centroid to the mean of its members until no assignment changes. The
    iteration number is printed on every pass.

    Args:
        dataSet: 2-D NumPy array, one sample per row (columns 0 and 1 are
            used for the initial plot).
        k: Number of clusters.
        distMeans: Callable ``(vecA, vecB) -> distance``.
        createCent: Callable ``(dataSet, k) -> centroids`` returning one
            centroid per row.

    Returns:
        ``(centroids, clusterAssment)`` where ``clusterAssment`` has shape
        (m, 2): column 0 is the assigned cluster index and column 1 is the
        squared distance to that cluster's centroid.
    """
    m = np.shape(dataSet)[0]  # number of samples
    clusterAssment = np.zeros((m, 2))
    # Start from -1 (no cluster) so the first pass always counts as a change;
    # with a zero start, points first assigned to cluster 0 looked unchanged
    # and the loop could stop before the centroids had settled.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet,k)
    print("初始化的矩陣",centroids)
    palette = ('red', 'green', 'blue', 'orange', 'purple', 'brown')
    for idx in range(k):
        # facecolors='none' draws a hollow marker; recent Matplotlib
        # releases reject color='' as an invalid colour.
        plt.scatter([centroids[idx][0]], [centroids[idx][1]],
                    facecolors='none', marker='o',
                    edgecolors=palette[idx % len(palette)], linewidths=3)
    plt.plot(dataSet[:,0],dataSet[:,1],'o',color='yellow')
    plt.show()
    clusterChanged = True  # set whenever any point switches cluster
    flag = 1  # iteration counter, used only for the progress message
    while clusterChanged:
        print("當前迭代次數爲：{}".format(flag))
        flag += 1
        clusterChanged = False
        for i in range(m):
            # Infinity is a safe upper bound whatever the scale of the data.
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                distJI = distMeans(centroids[j,:],dataSet[i,:])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i,0] != minIndex:
                clusterChanged = True
            # Store the cluster index and the squared distance to its centroid.
            clusterAssment[i,:] = minIndex,minDist**2
        for cent in range(k):
            # Rows of dataSet currently assigned to this cluster.
            ptsInClust = dataSet[np.nonzero(clusterAssment[:,0]==cent)[0]]
            if ptsInClust.shape[0] > 0:
                centroids[cent,:] = np.mean(ptsInClust,axis=0)
            # An empty cluster keeps its previous centroid; the mean of an
            # empty selection would be NaN and the cluster could never
            # attract a point again.
        # Per-iteration plotting is disabled in this listing:
        #showProcess(clusterAssment,centroids)
    return centroids,clusterAssment


if __name__ == '__main__':
    # Regenerate the random sample file before every run.
    CreatData()
    # Bound at module level because showProcess reads the global `datamat`.
    datamat = np.array(loadDateSet(fileName='data.txt'))
    # Split the samples into two clusters.
    centroids, clusterAssment = kMeans(dataSet=datamat, k=2)

總結

K-mean均值算法的原理非常簡單，就是通過計算數據點距分類點的距離大小來判斷所屬關係，臨界條件就是數據的所屬關係不再發生改變。但是這種方式也存在一定的問題，很容易陷入到局部最優解。爲了更好的解決這個問題，我們下一章說一下改進後的算法，也就是二分K-均值法。

直達鏈接：二分K-mean均值算法

另附 python小程序-生成GIF圖和分解GIF圖：小程序地址

K-mean均值算法原理講解和代碼實戰