這是最後一個GUI,主要是k-中心聚類算法的實現,之後要總結一下經典學習算法了
import pandas as pd
from pylab import mpl
import numpy as np
mpl.rcParams['font.sans-serif'] = ['FangSong'] # set the default sans-serif font used for figure text
mpl.rcParams['axes.unicode_minus'] = False # work around the minus sign '-' being drawn as a box in saved figures
import matplotlib.pyplot as plt
%matplotlib inline
# Load the spreadsheet of test results into a DataFrame and preview its first rows.
test=pd.read_excel('data/2017年南區污水試驗檢測結果統計表(周檢).xlsx')
test.head()
| | id | 取樣地點 | 取樣日期 | 編號 | 化學需氧量 | 氨氮 | 總氮 | 總磷 | 錳 |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 南區污水 | 2017-05-22 | 170522WST01 | 1.21 | 0.025 | 0.05 | 0.030 | 0.01 |
1 | 2 | 南區污水 | 2017-05-22 | 170522WST02 | 360.00 | 0.091 | 50.50 | 0.910 | 0.01 |
2 | 3 | 南區污水 | 2017-06-01 | 170601WST03 | 20.70 | 0.025 | 2.03 | 0.006 | 0.01 |
3 | 4 | 南區污水 | 2017-06-01 | 170601WST04 | 39.60 | 0.605 | 27.60 | 2.430 | 0.01 |
4 | 5 | 南區科研 | 2017-06-05 | 170605WST04 | 45.80 | 0.312 | 21.70 | 0.983 | 0.01 |
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 9 columns):
id 18 non-null int64
取樣地點 18 non-null object
取樣日期 18 non-null datetime64[ns]
編號 18 non-null object
化學需氧量 18 non-null float64
氨氮 18 non-null float64
總氮 18 non-null float64
總磷 18 non-null float64
錳 18 non-null float64
dtypes: datetime64[ns](1), float64(5), int64(1), object(2)
memory usage: 1.3+ KB
test.hist(figsize=(20,10))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x00000151DEEE1588>,
<matplotlib.axes._subplots.AxesSubplot object at 0x00000151DEF36828>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x00000151E0FF1BA8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x00000151E1022278>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x00000151E1046908>,
<matplotlib.axes._subplots.AxesSubplot object at 0x00000151E1046940>]],
dtype=object)
![output_4_1.png](output_4_1.png)
test['取樣地點'].unique()
array(['南區污水', '南區科研', '反滲透水', '超濾出水', '超濾進水', '超濾產水'], dtype=object)
# Build the feature table by dropping the three non-numeric columns.
# NOTE(review): the integer `id` column is kept, so it takes part in the clustering as a feature — confirm this is intended.
X = test.drop(['取樣地點','編號','取樣日期'],axis=1)
通過聚類算法驗證是否可行
def randomCenter(data, k):
    '''
    Randomly initialise k cluster centers inside the bounding box of the data.

    Each coordinate of each center is drawn with ``np.random.rand`` and scaled
    to lie between the minimum and the maximum of the matching feature column.

    :param data: training data, one sample per row and one feature per column
    :param k: number of cluster centers
    :return: k x n ``np.matrix`` with the initial centers (n = number of features)
    '''
    samples = np.asarray(data)
    n = samples.shape[1]  # number of features
    lower = samples.min(axis=0)
    span = samples.max(axis=0) - lower
    # rand(n, k).T consumes the random stream feature by feature, i.e. in the
    # same order as drawing a (k, 1) column per feature, so seeded runs
    # reproduce the centers of the loop-based version.
    unit = np.random.rand(n, k).T
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    return np.asmatrix(lower + unit * span)
def kmeans(data, k, cent):
    '''
    Find cluster centers with the k-means iteration.

    The samples are repeatedly assigned to their nearest center (squared
    Euclidean distance, ties go to the lowest cluster index) and every center
    is moved to the mean of its samples, until no assignment changes.

    :param data: training data, one sample per row (ndarray or ``np.matrix``)
    :param k: number of cluster centers
    :param cent: initial cluster centers, one per row; updated in place and also returned
    :return: ``(subCenter, cent)`` -- ``subCenter`` is an m x 2 ``np.matrix`` whose
             columns are the cluster index of each sample and its squared distance
             to that cluster's center; ``cent`` holds the trained centers
    :raises ValueError: if k is smaller than 1
    '''
    if k < 1:
        raise ValueError("k must be at least 1")
    points = np.asarray(data, dtype=float)
    m = points.shape[0]  # number of samples
    # -1 means "not assigned yet", so the first pass always counts as a change
    # (a zero-initialised label would look identical to "already in cluster 0").
    labels = np.full(m, -1)
    while True:
        centers = np.asarray(cent, dtype=float)[:k]
        # squared distance of every sample to every center, shape (m, k)
        offsets = points[:, np.newaxis, :] - centers[np.newaxis, :, :]
        sq_dist = (offsets ** 2).sum(axis=2)
        nearest = sq_dist.argmin(axis=1)
        if np.array_equal(nearest, labels):
            break  # assignments are stable, so the centers are final
        labels = nearest
        # recompute the cluster centers
        for j in range(k):
            members = points[labels == j]
            # An empty cluster keeps its previous center; dividing by a zero
            # member count would turn the center into NaN.
            if len(members):
                cent[j, :] = members.mean(axis=0)
    # distances are always taken against the final centers, for every sample
    best = sq_dist[np.arange(m), labels]
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    subCenter = np.asmatrix(np.column_stack((labels, best)).astype(float))
    return subCenter, cent
def save_result(file_name, data):
    '''
    Write 2-D data to a text file, one row per line with tab-separated values.

    :param file_name: name of the file to write; an existing file is overwritten
    :param data: 2-D data to save; must support ``np.shape`` and ``data[i, j]`` indexing
    :return: None
    '''
    m, n = np.shape(data)
    # The context manager closes the file even when converting or writing a row fails.
    with open(file_name, "w") as f:
        for i in range(m):
            f.write("\t".join(str(data[i, j]) for j in range(n)) + "\n")
def distance(vecA, vecB):
    '''
    Compute the squared Euclidean distance between two vectors.

    Both 1 x n ``np.matrix`` rows and plain 1-D arrays are accepted, in any
    combination; the inputs are flattened before the difference is taken.

    :param vecA: coordinates of vector A
    :param vecB: coordinates of vector B
    :return: squared Euclidean distance between the two vectors (a NumPy scalar)
    '''
    delta = np.asarray(vecA).ravel() - np.asarray(vecB).ravel()
    return np.dot(delta, delta)
if __name__ == "__main__":
    # number of cluster centers
    k = 4
    # despite its name, file_path holds the feature table X as an array
    file_path = np.array(X)
    initial_centers = randomCenter(file_path, k)
    subCenter, center = kmeans(file_path, k, initial_centers)
D:\sofewore\anaconda\lib\site-packages\ipykernel_launcher.py:98: RuntimeWarning: invalid value encountered in double_scalars
# Print the cluster index (first column of subCenter) assigned to every sample.
subCenters = np.array(subCenter)
for i, j in enumerate(subCenters):
    message = '第{}個變量屬於第{}類'.format(i, j[0])
    print(message)
第0個變量屬於第2.0類
第1個變量屬於第0.0類
第2個變量屬於第2.0類
第3個變量屬於第2.0類
第4個變量屬於第2.0類
第5個變量屬於第2.0類
第6個變量屬於第3.0類
第7個變量屬於第2.0類
第8個變量屬於第2.0類
第9個變量屬於第2.0類
第10個變量屬於第2.0類
第11個變量屬於第2.0類
第12個變量屬於第2.0類
第13個變量屬於第2.0類
第14個變量屬於第2.0類
第15個變量屬於第2.0類
第16個變量屬於第2.0類
第17個變量屬於第2.0類
k中心聚類算法
import numpy as np
import random
def kMedoids(D, k, tmax=100):
    """
    Cluster points around k medoids, given their pairwise distance matrix.

    :param D: 2-D array of pairwise distances between the points
    :param k: number of medoids (clusters)
    :param tmax: maximum number of assignment/update iterations
    :return: ``(M, C)`` -- ``M`` is the array of k medoid indices and ``C`` is a
             dict mapping each cluster number to the array of its member indices
    :raises ValueError: if k is smaller than 1 or larger than the number of
             distinct points available as medoids
    """
    # determine dimensions of distance matrix D
    m, n = D.shape
    if k < 1:
        raise ValueError('at least one medoid is required')
    if k > n:
        raise ValueError('too many medoids')

    # find a set of valid initial cluster medoid indices since we
    # can't seed different clusters with two points at the same location
    valid_medoid_inds = set(range(n))
    invalid_medoid_inds = set()
    rs, cs = np.where(D == 0)
    # the rows, cols must be shuffled because we will keep the first duplicate below
    index_shuf = list(range(len(rs)))
    np.random.shuffle(index_shuf)
    rs = rs[index_shuf]
    cs = cs[index_shuf]
    for r, c in zip(rs, cs):
        # if there are two points with a distance of 0...
        # keep the first one for cluster init
        if r < c and r not in invalid_medoid_inds:
            invalid_medoid_inds.add(c)
    valid_medoid_inds = list(valid_medoid_inds - invalid_medoid_inds)

    if k > len(valid_medoid_inds):
        raise ValueError('too many medoids (after removing {} duplicate points)'.format(
            len(invalid_medoid_inds)))

    # randomly initialize an array of k medoid indices
    M = np.array(valid_medoid_inds)
    np.random.shuffle(M)
    M = np.sort(M[:k])

    # create a copy of the array of medoid indices
    Mnew = np.copy(M)

    # initialize a dictionary to represent clusters
    C = {}
    for t in range(tmax):
        # determine clusters, i. e. arrays of data indices
        J = np.argmin(D[:, M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J == kappa)[0]
        # update cluster medoids
        for kappa in range(k):
            members = C[kappa]
            # an empty cluster keeps its current medoid; argmin over an
            # empty array would raise
            if members.size == 0:
                continue
            J = np.mean(D[np.ix_(members, members)], axis=1)
            Mnew[kappa] = members[np.argmin(J)]
        # Mnew is compared position by position (cluster kappa <-> M[kappa]);
        # the former `np.sort(Mnew)` call discarded its result and is dropped.
        # check for convergence
        if np.array_equal(M, Mnew):
            break
        M = np.copy(Mnew)
    else:
        # tmax reached without convergence:
        # final update of cluster memberships
        J = np.argmin(D[:, M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J == kappa)[0]

    # return results
    return M, C
# # coding: utf-8
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np
# 3 points in dataset
# data = np.array(X)
# # distance matrix
# D = pairwise_distances(data, metric='euclidean')
# # split into 2 clusters
# M, C = kMedoids(D, 4)
# print('medoids:')
# for point_idx in M:
# print( data[point_idx] )
# print('')
# print('clustering result:')
# for label in C:
# for point_idx in C[label]:
# print('第{0}類: 第{1}個變量'.format(label, data[point_idx][0]))
from tkinter import *
import tkinter.filedialog
def run1():
    '''
    Button callback: cluster the spreadsheet data with k-means and show the result.

    Reads the number of clusters from the ``inp1`` entry, loads the hard-coded
    spreadsheet, runs ``kmeans`` from randomly initialised centers and appends
    a ``{row index: [cluster index]}`` mapping to the ``txt2`` text box.
    '''
    # NOTE(review): the spreadsheet path is fixed here; a file picked through
    # the file dialog is not used by this callback.
    cluster_count = int(inp1.get())  # number of cluster centers
    table = pd.read_excel('data/2017年南區污水試驗檢測結果統計表(周檢).xlsx')
    features = np.array(table.drop(['取樣地點','編號','取樣日期'],axis=1))
    print(features)
    start_centers = randomCenter(features, cluster_count)
    assignments, _ = kmeans(features, cluster_count, start_centers)
    labels = {
        index: [int(row[0])]
        for index, row in enumerate(np.array(assignments))
    }
    txt2.insert(END, labels)
def xz():
    '''
    Button callback: let the user pick a spreadsheet and preview its contents.

    The chosen path is shown in the ``lb4`` label and the table read from it is
    appended to the ``txt`` text box; a cancelled dialog only updates the label.
    '''
    filename = tkinter.filedialog.askopenfilename()
    # Test truthiness rather than `!= ''`: a cancelled dialog normally yields
    # '', but some Tk builds reportedly return an empty tuple, which would
    # slip past the string comparison and crash on concatenation below.
    if not filename:
        lb4.config(text='您沒有選擇任何文件')
        return
    lb4.config(text='您選擇的文件是'+filename)
    raw = pd.read_excel(filename)
    txt.insert(END, raw)
def cop():
    '''
    Menu callback: open a second window that runs k-medoids clustering.

    Builds a ``Toplevel`` child of the global ``root`` window with an entry for
    the number of clusters, two text boxes (input table and clustering result),
    a "start" button bound to ``run2`` and a file-dialog button bound to ``xz2``.
    '''
    def run2():
        # Cluster the hard-coded spreadsheet with kMedoids and show the result.
        b = int(inp12.get())  # number of clusters entered by the user
        test = pd.read_excel('data/2017年南區污水試驗檢測結果統計表(周檢).xlsx')
        X = test.drop(['取樣地點','編號','取樣日期'],axis=1)
        txt12.insert(END, X)
        data = np.array(X)
        D = pairwise_distances(data, metric='euclidean')
        M, C = kMedoids(D, b)
        s = {}
        for label in C:
            for point_idx in C[label]:
                # keyed by the first column of the row, converted to int
                # (presumably the sample `id` -- verify against the table)
                s[int(data[point_idx][0])] = [label]
        txt22.insert(END, s)

    def xz2():
        # Let the user pick a spreadsheet and preview it in the left text box.
        filename = tkinter.filedialog.askopenfilename()
        # Test truthiness rather than `!= ''`: a cancelled dialog normally
        # yields '', but some Tk builds reportedly return an empty tuple.
        if not filename:
            lb42.config(text='您沒有選擇任何文件')
            return
        lb42.config(text='您選擇的文件是'+filename)
        raw = pd.read_excel(filename)
        txt12.insert(END, raw)

    winNew = Toplevel(root)
    winNew.geometry('800x700')
    winNew.title('大數據課程設計')
    lb2 = Label(winNew, text='k-中心數據聚類系統',font=('黑體',22,'bold'))
    lb2.place(relx=0.12, rely=0.05, relwidth=0.8, relheight=0.1)
    lb2 = Label(winNew, text='輸入簇數k的值')
    lb2.place(relx=0.4, rely=0.2)
    inp12 = Entry(winNew)
    inp12.place(relx=0.52, rely=0.2,relwidth=0.15, relheight=0.05)
    txt12 = Text(winNew)
    txt12.place(relx=0.1,rely=0.3, relheight=0.6,relwidth=0.4)
    txt22 = Text(winNew)
    txt22.place(relx=0.55,rely=0.3, relheight=0.6,relwidth=0.5)
    # NOTE(review): this menu is built but never attached to winNew
    # (no winNew.config(menu=...) call), so it is not displayed.
    mainmenu = Menu(winNew)
    menuFile = Menu(mainmenu)  # menu group menuFile
    mainmenu.add_cascade(label="文件",menu=menuFile)
    menuEdit = Menu(mainmenu)  # menu group menuEdit
    btn1 = Button(winNew, text='開始計算', command=run2)
    btn1.place(relx=0.7, rely=0.2, relwidth=0.12, relheight=0.05)
    btn2 = Button(winNew,text='彈出文件選擇對話框',command=xz2)
    btn2.place(relx=0.1, rely=0.2)
    lb42 = Label(winNew,text='')
    lb42.place(relx=0.1, rely=0.05)
def cut():
    '''
    Create the K-means title, the cluster-count entry and the two text boxes.

    NOTE(review): a second ``Tk`` instance (``root2``) is created and sized,
    but every widget below is attached to the global ``root`` -- confirm which
    window was intended. The widgets are kept in local names only.
    '''
    root2 = Tk()
    root2.geometry('800x700')
    root.title('大數據課程設計')

    heading = Label(root, text='K-means數據聚類系統',font=('黑體',22,'bold'))
    heading.place(relx=0.12, rely=0.05, relwidth=0.8, relheight=0.1)

    prompt = Label(root, text='輸入簇數k的值')
    prompt.place(relx=0.1, rely=0.2)

    k_entry = Entry(root)
    k_entry.place(relx=0.22, rely=0.2,relwidth=0.15, relheight=0.05)

    left_box = Text(root)
    left_box.place(relx=0.1,rely=0.3, relheight=0.6,relwidth=0.3)

    right_box = Text(root)
    right_box.place(relx=0.5,rely=0.3, relheight=0.6,relwidth=0.45)
def popupmenu(event):
    '''Post the global ``mainmenu`` at the root-window coordinates carried by ``event``.'''
    x, y = event.x_root, event.y_root
    mainmenu.post(x, y)
# Build the main K-means window: title, cluster-count entry, two text boxes,
# the action buttons and the menu that opens the k-medoids window.
root = Tk()
root.geometry('800x700')
root.title('大數據課程設計')
lb2 = Label(root, text='K-means數據聚類系統',font=('黑體',22,'bold'))
lb2.place(relx=0.12, rely=0.05, relwidth=0.8, relheight=0.1)
lb2 = Label(root, text='輸入簇數k的值')
lb2.place(relx=0.4, rely=0.2)
inp1 = Entry(root)
inp1.place(relx=0.52, rely=0.2,relwidth=0.15, relheight=0.05)
txt = Text(root)
txt.place(relx=0.1,rely=0.3, relheight=0.6,relwidth=0.42)
txt2 = Text(root)
txt2.place(relx=0.55,rely=0.3, relheight=0.6,relwidth=0.4)
btn1 = Button(root, text='開始計算', command=run1)
btn1.place(relx=0.7, rely=0.2, relwidth=0.12, relheight=0.05)
btn2=Button(root,text='彈出文件選擇對話框',command=xz)
btn2.place(relx=0.1, rely=0.2)
lb4 = Label(root,text='')
lb4.place(relx=0.1, rely=0.05)
mainmenu = Menu(root)
menuFile = Menu(mainmenu) # menu group menuFile (not added to the menu bar)
menuEdit = Menu(mainmenu) # menu group menuEdit
mainmenu.add_cascade(label="算法",menu=menuEdit)
menuEdit.add_command(label="k-中心",command=cop)
root.config(menu=mainmenu)
# Event patterns need angle brackets: a bare 'Button-3' is read as a sequence
# of key presses, so the right-click popup handler was never invoked.
root.bind('<Button-3>',popupmenu) # bind the right mouse button on the root window
root.mainloop()
[[1.00e+00 1.21e+00 2.50e-02 5.00e-02 3.00e-02 1.00e-02]
[2.00e+00 3.60e+02 9.10e-02 5.05e+01 9.10e-01 1.00e-02]
[3.00e+00 2.07e+01 2.50e-02 2.03e+00 6.00e-03 1.00e-02]
[4.00e+00 3.96e+01 6.05e-01 2.76e+01 2.43e+00 1.00e-02]
[5.00e+00 4.58e+01 3.12e-01 2.17e+01 9.83e-01 1.00e-02]
[6.00e+00 1.62e+00 2.50e-02 2.68e+00 3.36e-02 1.00e-02]
[7.00e+00 1.71e+02 5.50e-01 5.94e+01 2.15e+00 1.00e-02]
[8.00e+00 8.47e+00 7.20e-02 2.41e+00 1.36e-02 1.00e-02]
[9.00e+00 3.35e+01 3.04e-01 2.09e+01 2.44e-01 1.00e-02]
[1.00e+01 5.10e+01 1.17e+00 4.23e+01 8.23e-01 1.00e-02]
[1.10e+01 2.63e+00 3.21e-01 2.46e+00 1.50e-02 1.00e-02]
[1.20e+01 2.13e+01 1.47e-01 1.81e+01 2.40e-01 1.00e-02]
[1.30e+01 2.03e+01 2.21e-01 1.84e+01 1.97e-01 1.00e-02]
[1.40e+01 5.06e+01 4.12e-01 3.64e+01 8.34e-01 1.00e-02]
[1.50e+01 2.35e+01 2.54e-01 2.31e+01 3.39e-01 1.00e-02]
[1.60e+01 2.17e+01 2.07e-01 2.22e+01 3.16e-01 1.00e-02]
[1.70e+01 2.56e+00 3.40e-02 2.66e+00 2.40e-02 1.00e-02]
[1.80e+01 4.91e+01 3.79e-01 4.43e+01 1.15e+00 1.60e-02]]
# Stand-alone run of the k-means pipeline used by the GUI callback.
test=pd.read_excel('data/2017年南區污水試驗檢測結果統計表(周檢).xlsx')
X = test.drop(['取樣地點','編號','取樣日期'],axis=1)
k = 3  # number of cluster centers
file_path = np.array(X)
print(file_path)
# Pass k to randomCenter too, so the number of initial centers always
# matches the number of clusters requested from kmeans.
subCenter, center = kmeans(file_path, k, randomCenter(file_path, k))
subCenters = np.array(subCenter)
print(subCenters[0])
# map each row index to a one-element list holding its cluster index
data={}
for i,j in enumerate(subCenters):
    data[i]=[int(j[0])]