數據挖掘:K-Means算法的原理與Python實現

原創

2020-06-22 05:19

數據挖掘基礎:K-Means算法的原理與Python實現

原理

K-Means是一種基於樣本間相似度量的間接聚類方法，屬於非監督學習方法。K-Means接受參數k，將n個數據對象劃分爲k個聚類。劃分每一個數據對象的依據爲該對象與k個聚類中心的相似度（或者距離）：選擇相似度最高（即距離最近）的聚類，將這個數據對象劃入這個聚類。同時，也需要更新這個聚類的中心點。

輸入:

k個聚類的中心點的位置；
n個數據對象的位置；

輸出:

將這n個數據對象劃入這k個聚類中，即計算出這n個數據對象各自所屬的聚類。

計算過程:

1. 計算點p與這k個聚類的距離l1,l2,…,lk，並得到l1,l2,…,lk的最小值lm（即點p與聚類m的距離最近，值爲lm）。

如下圖所示，共有3個聚類，每一個聚類的中心位置爲黑色粗邊框的點。現在需要計算點p，也即圖中灰色點所屬的聚類。計算得到點p與紅色聚類的距離爲1.2，與藍色聚類的距離爲2.5，與綠色聚類的距離爲3.1。因而，點p應當屬於紅色聚類。如下圖所示：

2. 將點p劃入到聚類m，並重新計算聚類m的中心點位置。

繼續上述示例，點p被劃入了紅色聚類。從而，紅色聚類的中心也發生了變化。如下圖所示:

3. 重複以上步驟，直到n個數據對象全部計算完成。

實現

運行前提:

Python運行環境與編輯環境；
matplotlib.pyplot圖形庫，可用於快速繪製2D圖表，與MATLAB中的plot命令類似，而且用法也基本相同。

代碼:

[python] view plain copy

print?

# coding=utf-8
'''
作者:Jairus Chan
程序:kmeans算法
'''
import matplotlib.pyplot as plt
import math
import numpy
import random
# dotOringalNum: number of seed points generated for each cluster.
dotOringalNum = 100
# dotAddNum: number of random test points classified after seeding.
dotAddNum = 1000
# Plot colour of each cluster, in the same order as CLUSTER_BOUNDS.
colors = ['b', 'g', 'r', 'y']

# ((x_min, x_max), (y_min, y_max)) of each seed cluster. The values are passed
# to random.randint and then divided by 100, so (1000, 3000) yields
# coordinates between 10.00 and 30.00.
CLUSTER_BOUNDS = [
    ((1000, 3000), (1000, 3000)),  # blue: lower-left corner
    ((4000, 6000), (4000, 6000)),  # green: upper-right corner
    ((1000, 3000), (4000, 6000)),  # red: upper-left corner
    ((4000, 6000), (1000, 3000)),  # yellow: lower-right corner
]


def make_cluster(count, x_range, y_range):
    """Generate one seed cluster of ``count`` random points.

    Args:
        count: number of points to generate.
        x_range: ``(low, high)`` bounds given to ``random.randint`` for x;
            the drawn integer is divided by 100.
        y_range: same as ``x_range`` but for y.

    Returns:
        A list whose element 0 is ``[sum_x, sum_y]`` (the sum of the
        coordinates of all points in the cluster) followed by the points
        themselves as ``[x, y]`` lists.
    """
    points = []
    sum_x = 0.0
    sum_y = 0.0
    for _ in range(count):
        # x is drawn before y for every point, as in the original script.
        x = float(random.randint(*x_range)) / 100
        y = float(random.randint(*y_range)) / 100
        points.append([x, y])
        sum_x += x
        sum_y += y
    return [[sum_x, sum_y]] + points


def nearest_cluster(sets, x, y):
    """Return the index of the cluster whose centroid is closest to (x, y).

    Each cluster uses the layout produced by ``make_cluster``: element 0 is
    the coordinate sum, so the centroid is that sum divided by the number of
    points. Squared Euclidean distance is compared; ties go to the cluster
    with the lowest index.

    Raises:
        ValueError: if no cluster contains at least one point.
    """
    best_index = None
    # Start from infinity rather than a magic constant so that points
    # arbitrarily far from every centroid are still assigned correctly.
    best_dist = float('inf')
    for index, cluster in enumerate(sets):
        size = len(cluster) - 1
        if size <= 0:
            # An empty cluster has no centroid; skip it instead of dividing
            # by zero.
            continue
        dx = cluster[0][0] / size - x
        dy = cluster[0][1] / size - y
        dist = dx * dx + dy * dy
        if dist < best_dist:
            best_index = index
            best_dist = dist
    if best_index is None:
        raise ValueError("no non-empty cluster to assign the point to")
    return best_index


def assign_point(sets, x, y):
    """Add (x, y) to its nearest cluster in place and return that index.

    The chosen cluster's coordinate sum (element 0) is updated, which moves
    its centroid for every later assignment.
    """
    index = nearest_cluster(sets, x, y)
    cluster = sets[index]
    cluster[0][0] += x
    cluster[0][1] += y
    cluster.append([x, y])
    return index


def main():
    """Seed the clusters, classify the random test points and plot them."""
    sets = [
        make_cluster(dotOringalNum, x_range, y_range)
        for x_range, y_range in CLUSTER_BOUNDS
    ]

    # Classify the test points one at a time; each assignment updates the
    # centroid used for the following points.
    for _ in range(dotAddNum):
        tx = float(random.randint(0, 7000)) / 100
        ty = float(random.randint(0, 7000)) / 100
        assign_point(sets, tx, ty)

    # Draw every point, coloured by the cluster it ended up in.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for color, cluster in zip(colors, sets):
        xs = [point[0] for point in cluster[1:]]
        ys = [point[1] for point in cluster[1:]]
        ax.plot(xs, ys, color=color, linestyle='', marker='.')
    plt.show()


if __name__ == "__main__":
    main()

# coding=utf-8

'''
作者:Jairus Chan
程序:kmeans算法
'''

import matplotlib.pyplot as plt
import math
import numpy
import random

# dotOringalNum: number of seed points generated for each cluster.
dotOringalNum = 100
# dotAddNum: number of random test points classified at the end.
dotAddNum = 1000
fig = plt.figure()
ax = fig.add_subplot(111)
sets = []
colors = ['b', 'g', 'r', 'y']

# Cluster 1: blue, lower-left corner.
a = []
txx = 0.0
tyy = 0.0
for i in range(0, dotOringalNum):
    tx = float(random.randint(1000, 3000)) / 100
    ty = float(random.randint(1000, 3000)) / 100
    a.append([tx, ty])
    txx += tx
    tyy += ty
# Element 0 of every cluster list holds the sum of the x and y coordinates
# of its points; the points themselves start at index 1.
a.insert(0, [txx, tyy])
sets.append(a)

# Cluster 2: green, upper-right corner.
b = []
txx = 0.0
tyy = 0.0
for i in range(0, dotOringalNum):
    tx = float(random.randint(4000, 6000)) / 100
    ty = float(random.randint(4000, 6000)) / 100
    b.append([tx, ty])
    txx += tx
    tyy += ty
b.insert(0, [txx, tyy])
sets.append(b)

# Cluster 3: red, upper-left corner.
c = []
txx = 0.0
tyy = 0.0
for i in range(0, dotOringalNum):
    tx = float(random.randint(1000, 3000)) / 100
    ty = float(random.randint(4000, 6000)) / 100
    c.append([tx, ty])
    txx += tx
    tyy += ty
c.insert(0, [txx, tyy])
sets.append(c)

# Cluster 4: yellow, lower-right corner.
d = []
txx = 0.0
tyy = 0.0
for i in range(0, dotOringalNum):
    tx = float(random.randint(4000, 6000)) / 100
    ty = float(random.randint(1000, 3000)) / 100
    d.append([tx, ty])
    txx += tx
    tyy += ty
d.insert(0, [txx, tyy])
sets.append(d)

# Classify the test points: each one joins the cluster with the nearest
# centroid, and that cluster's coordinate sum is updated so its centroid
# moves for the following points.
for i in range(0, dotAddNum):
    tx = float(random.randint(0, 7000)) / 100
    ty = float(random.randint(0, 7000)) / 100
    # Start from infinity instead of a fixed bound so a point far from
    # every centroid is still compared correctly.
    dist = float('inf')
    setBelong = 0
    for j in range(0, 4):
        # Number of points in the cluster (element 0 is the sum, not a point).
        length = len(sets[j]) - 1
        centX = sets[j][0][0] / length
        centY = sets[j][0][1] / length
        # Squared Euclidean distance from the point to this centroid.
        sqDist = (centX - tx) * (centX - tx) + (centY - ty) * (centY - ty)
        if sqDist < dist:
            setBelong = j
            dist = sqDist
    sets[setBelong][0][0] += tx
    sets[setBelong][0][1] += ty
    sets[setBelong].append([tx, ty])

# Plot all points, coloured by cluster.
for i in range(0, 4):
    tx = []
    ty = []
    for j in range(1, len(sets[i])):
        tx.append(sets[i][j][0])
        ty.append(sets[i][j][1])
    ax.plot(tx, ty, color=colors[i], linestyle='', marker='.')
plt.show()

運行效果:

本博客中所有的博文都爲筆者（Jairus Chan）原創。

如需轉載，請標明出處：http://blog.csdn.net/JairusChan。

如果您對本文有任何的意見與建議，請聯繫筆者（JairusChan）。

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

數據挖掘:K-Means算法的原理與Python實現

數據挖掘基礎:K-Means算法的原理與Python實現

原理

輸入:

輸出:

計算過程:

實現

運行前提:

代碼:

運行效果:

數據挖掘:K-Means算法的原理與Python實現

Centos安裝完成後添加第三方源

Oracle 中的join大法

人工智能投資真熱還是“虛火”？投資仍是小衆行爲

防SQL注入的快捷方法

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結