导言
基于 NumPy 实现 k-means 要比基于 TensorFlow 实现更简单;基于 TensorFlow 的实现可以参考我的这篇博文:https://blog.csdn.net/qq_41058526/article/details/104093115
具体实现
import numpy as np
import copy
import random
# 计算两个向量之间距离
def cal_distance(a, b):
    """Return the Euclidean (L2) distance between ``a`` and ``b``.

    Both operands are converted to float arrays before subtracting, so
    plain Python sequences are accepted and integer inputs (in particular
    unsigned or narrow integer dtypes) cannot wrap around during the
    subtraction or the squaring.

    Args:
        a: First vector (array-like of numbers).
        b: Second vector, broadcast-compatible with ``a``.

    Returns:
        The distance as a NumPy floating-point scalar.
    """
    diff = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
    # Square root of the sum of squared component differences.
    return np.sqrt(np.sum(diff ** 2))
# 计算簇的中心
def cal_cluster_center(cluster):
    """Return the centroid of ``cluster``.

    The centroid is the mean taken along axis 0, i.e. the component-wise
    average over the members of the cluster.
    """
    centroid = np.mean(cluster, axis=0)
    return centroid
def kmeans(data, k, max_iter):
    """Group ``data`` into at most ``k`` clusters with the k-means algorithm.

    ``k`` distinct samples are drawn at random (via the ``random`` module)
    as the initial centers. Each pass assigns every sample to its nearest
    center by Euclidean distance, with ties going to the lowest cluster
    index, and then moves each center to the mean of its members. The loop
    stops once the centers no longer move (``np.allclose``) or after
    ``max_iter`` passes.

    Args:
        data: Sized sequence of samples (e.g. a 2-D array or a list of
            equal-length rows) convertible to a float array.
        k: Number of clusters; must satisfy ``1 <= k <= len(data)``.
        max_iter: Maximum number of assignment/update passes; at least 1.

    Returns:
        A dict mapping cluster index (``int``) to the list of samples
        assigned to that cluster in the last pass. The samples are the
        original elements of ``data``, in input order. A cluster that
        received no samples has no entry.

    Raises:
        ValueError: If ``max_iter < 1`` or ``k`` is outside
            ``1..len(data)``.
    """
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    points = np.asarray(data, dtype=float)
    n_points = len(points)
    if not 1 <= k <= n_points:
        raise ValueError("k must be between 1 and the number of samples")

    # One flat float row per sample so distances reduce over a single axis.
    flat = points.reshape(n_points, -1)

    # Shuffle all indices and keep the first k: k distinct random samples.
    order = list(range(n_points))
    random.shuffle(order)
    centers = flat[order[:k]]

    cluster_dict = {}
    for _ in range(max_iter):
        # Distance of every sample to every center, one center at a time.
        dists = np.stack(
            [np.sqrt(((flat - center) ** 2).sum(axis=1)) for center in centers],
            axis=1,
        )
        # argmin returns the first minimum, so ties go to the lowest index.
        labels = dists.argmin(axis=1)

        # Group the original samples by their assigned cluster.
        cluster_dict = {}
        for sample, label in zip(data, labels):
            cluster_dict.setdefault(int(label), []).append(sample)

        # Move each non-empty cluster's center to the mean of its members;
        # a cluster that received no samples keeps its previous center.
        new_centers = centers.copy()
        for idx in cluster_dict:
            new_centers[idx] = flat[labels == idx].mean(axis=0)

        # Converged: the centers did not move during this pass.
        if np.allclose(centers, new_centers):
            break
        centers = new_centers

    return cluster_dict