import numpy as np
import sklearn.cluster as ske
#from sklearn.cluster import DBSCAN
from sklearn import metrics
import matplotlib.pyplot as plt
# Default location of the tab-separated network-duration log.
DATA_PATH = 'c:/pythonpractice/python-data/netDuration.txt'


def parse_online_times(lines):
    """Build the per-MAC (start hour, online duration) table from log lines.

    The first line is treated as a header and skipped.  Every other
    non-blank line is split on tabs: field 2 is used as the MAC key,
    field 4 supplies the start hour (the text before the first ':' in its
    second space-separated token) and field 6 the online duration.  When a
    MAC occurs more than once, the later record replaces the earlier one.

    Args:
        lines: sequence of raw text lines, header first.

    Returns:
        A ``(mac2id, onlinetimes)`` pair.  ``mac2id`` maps each MAC to its
        index in ``onlinetimes``; ``onlinetimes`` is a list of
        ``(starttime, onlinetime)`` integer tuples.

    Raises:
        IndexError, ValueError: if a non-blank data line does not have the
            expected fields or its numeric fields are not integers.
    """
    mac2id = {}
    onlinetimes = []
    for line in lines[1:]:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        fields = line.split('\t')  # split once instead of once per field
        mac = fields[2]
        onlinetime = int(fields[6])
        starttime = int(fields[4].split(' ')[1].split(':')[0])
        record = (starttime, onlinetime)
        if mac not in mac2id:
            # The index into onlinetimes doubles as the id of this MAC.
            mac2id[mac] = len(onlinetimes)
            onlinetimes.append(record)
        else:
            # Store a plain tuple, exactly like the append branch.  Wrapping
            # it in a list would make the rows ragged and prevent NumPy from
            # building a regular 2-column numeric array.
            onlinetimes[mac2id[mac]] = record
    return mac2id, onlinetimes


def main(path=DATA_PATH):
    """Cluster the start hours found in the log at *path* and print a report.

    Prints the DBSCAN labels, the share of noise points, the number of
    clusters, the silhouette coefficient (when it is defined) and the
    members of every cluster.
    """
    # The context manager guarantees the file handle is closed.
    with open(path) as f:
        lines = f.readlines()

    mac2id, onlinetimes = parse_online_times(lines)

    # Two columns: start hour, online duration.
    real_X = np.array(onlinetimes).reshape((-1, 2))
    # Cluster on the start hour only; keep it 2-D (n_samples, 1) for fit().
    X = real_X[:, 0:1]

    db = ske.DBSCAN(eps=0.01, min_samples=20).fit(X)
    labels = db.labels_
    print('Labels:')
    print(labels)

    # Points labelled -1 are the ones DBSCAN classified as noise.
    raito = len(labels[labels[:] == -1]) / len(labels)
    print('noise raito:', format(raito, '.2%'))

    distinct_labels = set(labels)
    n_clusters_ = len(distinct_labels) - (1 if -1 in labels else 0)
    print('estimated sumber of clusters: %d' % n_clusters_)

    # silhouette_score needs between 2 and n_samples - 1 distinct labels;
    # outside that range it raises instead of returning a number.
    if 2 <= len(distinct_labels) <= len(labels) - 1:
        print('silhousette coofficient: %0.3f' % metrics.silhouette_score(X, labels))
    else:
        print('silhousette coofficient: undefined (needs at least 2 distinct labels)')

    # Print the members of each cluster together with its label.
    for i in range(n_clusters_):
        print('Cluster', i, ':')
        print(list(X[labels == i].flatten()))


if __name__ == '__main__':
    main()
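# Page title: python-scikit-learn-DBSCAN
# (Web-page residue, not part of the script:)
# Post a comment
# All comments
# No comments yet. Want to be the first to comment? Enter your comment in the box above and click Publish.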