這次寫得噁心死我了。第一次隨機選取中心點的代碼似乎有問題:第一輪迭代結束後,可能會出現某個中心點完全不合適、沒有任何一個點被標記爲該中心點的情況,接着就報錯。這導致我的代碼時靈時不靈,一開始完全想不到bug的原因。
而且,雖然用pandas自己實現確實能幫忙鞏固pandas的知識,但我還是覺得以前走入了誤區。機器學習重要的部分應該是對理論的理解和輪子的使用;至於書中的代碼,理解理論實現的具體過程就行了。自己在這裏費心費力造輪子實在意義不大:雖然在造的過程中也能加深理解,但事倍功半。
所以還是決定,以後學習就結合西瓜書的理論、實戰中的實現思路和sklearn的代碼實現了。
import numpy as np
import pandas as pd
from pandas import DataFrame,Series
# Load the sample points at import time: a tab-separated text file with no
# header row, so pandas assigns default integer column labels.
# NOTE(review): the path is an absolute, machine-specific Windows location;
# the module cannot be imported on a machine where this file is missing.
path = r'C:\Users\36955\Downloads\mlp\Ch10\testSet2.txt'
data = pd.read_csv(path,sep='\t',header=None)
def randCent(dataSet, k):
    """Create k random starting centroids inside the bounding box of dataSet.

    Args:
        dataSet: pandas DataFrame of numeric features, one row per sample.
        k: number of centroids to generate.

    Returns:
        DataFrame with k rows and the same column labels as dataSet. Every
        value lies between the minimum and maximum of its column.
    """
    n = dataSet.shape[1]
    # Use positional numpy arrays: looking columns up by the integer label j
    # only works when the columns carry the default 0..n-1 labels.
    data_min = np.asarray(dataSet.min(0), dtype=float)
    data_range = np.asarray(dataSet.max(0), dtype=float) - data_min
    # rand(n, k).T hands the random numbers out one column at a time, so a
    # seeded run produces the same centroids as a per-column fill loop.
    offsets = np.random.rand(n, k).T
    # Keep dataSet's column labels so that label-aligned arithmetic between a
    # data row and a centroid row lines up.
    return DataFrame(data_min + data_range * offsets, columns=dataSet.columns)
def distEclud(vecA, vecB):
    """Return the Euclidean distance between two equal-length vectors.

    Args:
        vecA: sequence, numpy array or pandas Series of numbers.
        vecB: same kind of object, with the same number of elements.

    Returns:
        The square root of the summed squared element-wise differences.
    """
    # Compare element by position. Subtracting two pandas Series aligns them
    # on their labels instead, and np.sum on the result skips the NaN values
    # that mismatched labels produce, so the distance would silently be 0.
    delta = np.asarray(vecA, dtype=float) - np.asarray(vecB, dtype=float)
    return np.sqrt(np.sum(np.power(delta, 2)))
def kMeans(data,k):
    """Cluster the rows of data into k groups with the k-means algorithm.

    Args:
        data: pandas DataFrame of numeric features, one row per sample. The
            index may be arbitrary; rows are handled by position.
        k: number of clusters.

    Returns:
        A tuple (centroids, clusterAssment).
        centroids is a DataFrame with k rows and data's column labels; row j
        is the centre of cluster j.
        clusterAssment is a DataFrame with a default 0..m-1 index and the
        columns 'clusterName' (cluster label stored as a float) and 'dist'
        (squared distance to the assigned centroid); row i describes the
        i-th row of data.
    """
    points = np.asarray(data, dtype=float)
    m = points.shape[0]
    # np.array copies, so the centres can be updated in place below.
    centers = np.array(randCent(data, k), dtype=float)
    # Start from an impossible label so the first pass always counts as a
    # change and distances are never reported against the random centres.
    labels = np.full(m, -1.0)
    sq_dists = np.zeros(m)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        # Assignment step: attach every point to its nearest centre.
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                dist = distEclud(points[i], centers[j])
                if dist < minDist:
                    minDist = dist
                    minIndex = j
            if labels[i] != minIndex:
                clusterChanged = True
            labels[i] = minIndex
            sq_dists[i] = minDist ** 2
        # Update step. Row j always stays the centre of cluster j, even when
        # a cluster received no points; a group-by mean would drop that row
        # and shift the remaining centres.
        spare = np.where(np.isfinite(sq_dists), sq_dists, 0.0)
        for j in range(k):
            members = points[labels == j]
            if len(members):
                centers[j] = members.mean(axis=0)
            elif m and spare.max() > 0:
                # Empty cluster: restart it on the point that is currently
                # worst served by its own centre, then run another pass.
                far = spare.argmax()
                centers[j] = points[far]
                spare[far] = 0.0  # do not hand the same point to two clusters
                clusterChanged = True
    clusterAssment = DataFrame({'clusterName': labels, 'dist': sq_dists},
                               columns=['clusterName', 'dist'])
    centroids = DataFrame(centers, columns=data.columns)
    centroids.index.name = 'clusterName'
    return centroids,clusterAssment
#print kMeans(data,3)
def biKmeans(data,k):
    """Cluster data into k groups by repeatedly bisecting one cluster.

    Starts with a single cluster holding every row. While fewer than k
    clusters exist, each cluster is tentatively split in two with kMeans and
    the split giving the lowest total sum of squared errors is kept.

    Args:
        data: pandas DataFrame of numeric features, one row per sample.
        k: number of clusters to produce.

    Returns:
        A tuple (centList, clusterAssment).
        centList is a list of centroids, each a list of coordinates;
        position j is the centre of cluster j.
        clusterAssment is a DataFrame with a default 0..m-1 index and the
        columns 'clusterName' (cluster label stored as a float) and 'dist'
        (squared distance to the assigned centroid); row i describes the
        i-th row of data.

    Raises:
        ValueError: if no existing cluster can be split any further before
            k clusters have been reached.
    """
    m = data.shape[0]
    centroid0 = list(data.mean(0))
    centList = [centroid0]
    # Labels and squared distances are kept in arrays addressed by row
    # position, so results of a split can be written back to exactly the
    # rows that were split.
    labels = np.zeros(m)
    # Every row starts in cluster 0; record its squared distance to the
    # overall mean so 'dist' is meaningful before the first split.
    sq_dists = np.array(((data - data.mean(0)) ** 2).sum(axis=1), dtype=float)
    while len(centList)<k:
        lowestSSE = np.inf
        best = None
        for i in range(len(centList)):
            in_cluster = labels == i
            rows = np.flatnonzero(in_cluster)
            if len(rows) < 2:
                continue  # a cluster with fewer than two points cannot be bisected
            # Give kMeans a frame with a fresh 0..n-1 index; its assignment
            # table is positional and is mapped back through `rows` below.
            splitCluster = data.iloc[rows, :].reset_index(drop=True)
            splitCentroids,splitClusterAssement = kMeans(splitCluster,2)
            if len(splitCentroids) < 2:
                continue  # the split did not yield two centroids
            splitSSE = splitClusterAssement['dist'].sum()
            nosplitSSE = sq_dists[~in_cluster].sum()
            if (splitSSE+nosplitSSE)<lowestSSE:
                lowestSSE = splitSSE+nosplitSSE
                best = (i, rows, splitCentroids, splitClusterAssement)
        if best is None:
            raise ValueError(
                'biKmeans could not bisect any cluster to reach %d clusters' % k)
        besttosplit, rows, newCentroids, newAssement = best
        newLabel = len(centList)
        centList[besttosplit] = list(newCentroids.iloc[0,:])
        centList.append(list(newCentroids.iloc[1,:]))
        # Relabel both halves in one step from the untouched 0/1 labels, so
        # the first half is not relabelled a second time when besttosplit
        # happens to be 1.
        first_half = np.asarray(newAssement['clusterName']) == 0
        labels[rows] = np.where(first_half, besttosplit, newLabel)
        sq_dists[rows] = np.asarray(newAssement['dist'], dtype=float)
    clusterAssment = DataFrame({'clusterName': labels, 'dist': sq_dists},
                               columns=['clusterName', 'dist'])
    return centList,clusterAssment
# Run the bisecting clustering on the loaded sample with four clusters and
# show the result. The parenthesised single-argument form prints the same
# text under both Python 2 and Python 3.
print(biKmeans(data, 4))