# Reference: Machine Learning in Action (機器學習實戰)
'''
@version: 0.0.1
@Author: Huang
@dev: python3 vscode
@Date: 2019-11-10 11:39:30
@LastEditTime: 2019-11-10 17:57:13
@FilePath: \\機器學習實戰\\10-K均值聚類算法\\kMeans.py
@Description: 聚類是一種無監督的學習,它將相似的對象歸到同一個簇中
'''
import numpy as np
import matplotlib.pyplot as plt
def loadDataSet(filename):
    """Parse a whitespace-delimited text file into a list of float rows.

    Arguments:
        filename -- path of the data file, one sample per line
    Returns:
        list[list[float]] -- one inner list of floats per file line
    """
    with open(filename) as fr:
        return [[float(token) for token in line.strip().split()]
                for line in fr]
def distEclud(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    Arguments:
        vecA -- first point (array or row matrix)
        vecB -- second point (same shape as vecA)
    Returns:
        Euclidean distance between the two points
    """
    diff = vecA - vecB
    return np.sqrt(np.multiply(diff, diff).sum())
def randCent(dataSet, k):
    """Build k random centroids inside the bounding box of the data.

    Each centroid coordinate is drawn uniformly between the column's
    minimum and maximum, so every centroid lies within the data range.

    Arguments:
        dataSet {mat} -- (m, n) data matrix
        k {int} -- number of centroids to create
    Returns:
        [mat] -- (k, n) matrix of random centroids
    """
    numFeatures = np.shape(dataSet)[1]
    centroids = np.mat(np.zeros((k, numFeatures)))
    for col in range(numFeatures):
        colMin = dataSet[:, col].min()
        colSpan = float(dataSet[:, col].max() - colMin)
        centroids[:, col] = colMin + colSpan * np.random.rand(k, 1)
    return centroids
def my_KMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """K-means clustering.

    Create k starting centroids, then repeat until no assignment changes:
    assign every point to its nearest centroid, then move each centroid
    to the mean of the points assigned to it.

    Arguments:
        dataSet {mat} -- (m, n) data matrix, one sample per row
        k {int} -- number of clusters
    Keyword Arguments:
        distMeas -- distance function (default: {distEclud})
        createCent -- initial-centroid factory (default: {randCent})
    Returns:
        centroids -- (k, n) matrix of cluster centroids
        clusterAssment -- (m, 2) matrix: [cluster index, squared distance]
    """
    m = np.shape(dataSet)[0]
    # Column 0: assigned cluster index; column 1: squared distance to it.
    clusterAssment = np.mat(np.zeros((m, 2)))
    # BUGFIX: the distMeas/createCent parameters were accepted but ignored
    # (randCent/distEclud were hard-coded), so callers such as
    # biKmeans(..., distMeas=distSLC) silently used the wrong metric.
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            prevCluster = clusterAssment[i, 0]
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                curDist = distMeas(centroids[j, :], dataSet[i, :])
                if curDist < minDist:
                    minDist = curDist
                    minIndex = j
            # BUGFIX: store the SQUARED distance so biKmeans' SSE sums
            # (seeded with dist**2) compare like with like.
            clusterAssment[i, :] = minIndex, minDist**2
            if prevCluster != minIndex:
                clusterChanged = True
        for cent in range(k):
            ptsInClust = dataSet[np.nonzero(clusterAssment[:, 0].A == cent)[0]]
            # Guard empty clusters: np.mean over zero rows yields NaN,
            # which would poison the centroid for all later iterations.
            if len(ptsInClust) > 0:
                centroids[cent, :] = np.mean(ptsInClust, axis=0)
    return centroids, clusterAssment
def plt_my_KMeans():
    """Cluster testSet.txt into 4 groups and plot points plus centroids."""
    data = np.mat(loadDataSet(r'./10-K均值聚類算法/testSet.txt'))
    centroids, assignments = my_KMeans(data, 4)
    numPoints = np.shape(data)[0]
    xs = data[:, 0].A.reshape(numPoints)
    ys = data[:, 1].A.reshape(numPoints)
    # Colour each point by its assigned cluster index.
    labels = assignments.A[:, 0].reshape(numPoints)
    plt.scatter(xs, ys, c=labels)
    # Mark the learned centroids with red triangles.
    plt.scatter(centroids.A[:, 0],
                centroids.A[:, 1],
                c='red',
                marker='^')
    plt.show()
def biKmeans(dataSet, k, distMeas=distEclud):
    """Bisecting k-means clustering.

    Start with all points in one cluster; while fewer than k clusters
    exist, trial-split each cluster with 2-means and keep the split
    that gives the lowest total SSE (sum of squared errors).

    Arguments:
        dataSet {mat} -- (m, n) data matrix, one sample per row
        k {int} -- target number of clusters
    Keyword Arguments:
        distMeas -- distance function (default: {distEclud})
    Returns:
        centroids -- (k, n) matrix of cluster centroids
        clusterAssment -- (m, 2) matrix: [cluster index, error]
    """
    m, n = np.shape(dataSet)
    clusterAssment = np.mat(np.zeros((m, 2)))
    # The initial single cluster's centroid is the global mean.
    centroid0 = np.mean(dataSet, axis=0).tolist()[0]
    cenList = [centroid0]
    for j in range(m):
        # Seed column 1 with each point's squared distance to the mean.
        clusterAssment[j, 1] = distMeas(np.mat(centroid0), dataSet[j, :])**2
    while len(cenList) < k:
        lowestSSE = np.inf
        for i in range(len(cenList)):
            # Points currently assigned to cluster i.
            ptscurrCluster = dataSet[np.nonzero(
                clusterAssment[:, 0].A == i)[0], :]
            # Trial split of cluster i into two sub-clusters.
            centroidMat, splitClustAss = my_KMeans(ptscurrCluster, 2, distMeas)
            ssesplit = np.sum(splitClustAss[:, 1])
            # Error of all points NOT in cluster i is unaffected by the split.
            ssenotsplit = np.sum(
                clusterAssment[np.nonzero(clusterAssment[:, 0].A != i)[0], 1])
            if ssesplit + ssenotsplit < lowestSSE:
                bestCentToSplit = i
                bestnewCent = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = ssenotsplit + ssesplit
        # Relabel the winning split's assignments: sub-cluster 1 becomes a
        # brand-new cluster index, sub-cluster 0 keeps the split cluster's
        # old index. Order matters: relabel 1 before 0.
        bestClustAss[np.nonzero(
            bestClustAss[:, 0].A == 1)[0], 0] = len(cenList)
        bestClustAss[np.nonzero(
            bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        # Replace the old centroid and append the new one.
        cenList[bestCentToSplit] = bestnewCent[0, :].A.reshape(n)
        cenList.append(bestnewCent[1, :].A.reshape(n))
        # Write the split's assignments back for the affected points only.
        clusterAssment[np.nonzero(
            clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClustAss
    return np.mat(cenList), clusterAssment
def plt_biKmeans():
    """Cluster testSet2.txt into 3 groups with bisecting k-means and plot."""
    data = np.mat(loadDataSet(r'./10-K均值聚類算法/testSet2.txt'))
    centroids, assignments = biKmeans(data, 3)
    numPoints = np.shape(data)[0]
    xs = data[:, 0].A.reshape(numPoints)
    ys = data[:, 1].A.reshape(numPoints)
    # Colour each point by its assigned cluster index.
    labels = assignments.A[:, 0].reshape(numPoints)
    plt.scatter(xs, ys, c=labels)
    # Mark the learned centroids with red plus signs.
    plt.scatter(centroids.A[:, 0], centroids.A[:, 1], c='red', marker='+')
    plt.show()
def distSLC(vecA, vecB):
    """Great-circle distance in kilometres between two points on Earth.

    Uses the spherical law of cosines with an Earth radius of 6371 km.
    Each input is a (1, 2) row matrix; column 0 is longitude and
    column 1 is latitude, both in degrees.
    """
    latA = vecA[0, 1] * np.pi / 180
    latB = vecB[0, 1] * np.pi / 180
    deltaLon = np.pi * (vecB[0, 0] - vecA[0, 0]) / 180
    cosAngle = np.sin(latA) * np.sin(latB) + np.cos(latA) * np.cos(
        latB) * np.cos(deltaLon)
    return np.arccos(cosAngle) * 6371.0
def clusterClubs(numClust=5):
    """Cluster club locations from places.txt and draw them over a map.

    Reads tab-separated lines from places.txt, bi-k-means clusters the
    coordinates with the spherical distance distSLC, then scatters each
    cluster with a distinct marker on top of the Portland.png image.

    Keyword Arguments:
        numClust -- number of clusters to form (default: {5})
    """
    datList = []
    for line in open(r'./10-K均值聚類算法/places.txt').readlines():
        lineArr = line.split('\t')
        # Columns 4 and 3 are presumably longitude and latitude —
        # TODO(review): confirm against the places.txt layout.
        datList.append([float(lineArr[4]), float(lineArr[3])])
    datMat = np.mat(datList)
    myCentroids, clustAssing = biKmeans(datMat, numClust, distMeas=distSLC)
    fig = plt.figure()
    rect = [0.1, 0.1, 0.8, 0.8]
    scatterMarkers = ['s', 'o', '^', '8', 'p', 'd', 'v', 'h', '>', '<']
    axprops = dict(xticks=[], yticks=[])
    # Background axes: the map image with axis ticks hidden.
    ax0 = fig.add_axes(rect, label='ax0', **axprops)
    imgP = plt.imread(r'./10-K均值聚類算法/Portland.png')
    ax0.imshow(imgP)
    # Foreground axes share the same rect so points overlay the map.
    ax1 = fig.add_axes(rect, label='ax1', frameon=False)
    for i in range(numClust):
        ptsInCurrCluster = datMat[np.nonzero(clustAssing[:, 0].A == i)[0], :]
        # Cycle marker styles so clusters stay visually distinguishable.
        markerStyle = scatterMarkers[i % len(scatterMarkers)]
        ax1.scatter(ptsInCurrCluster[:, 0].flatten().A[0],
                    ptsInCurrCluster[:, 1].flatten().A[0],
                    marker=markerStyle,
                    s=90)
    # Centroids drawn last, as large plus signs.
    ax1.scatter(myCentroids[:, 0].flatten().A[0],
                myCentroids[:, 1].flatten().A[0],
                marker='+',
                s=300)
    plt.show()
if __name__ == '__main__':
    # Demo: cluster the club locations into five groups and plot them.
    clusterClubs(numClust=5)