原理
數據集
http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data
使用時我直接把數據放進了 txt 文件並加上了標題欄;標題欄加不加都無所謂,但若不加標題欄,程序需要稍作修改,否則會少讀一條數據。
ID Diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave_mean symmetry_mean fractal_mean radius_sd texture_sd perimeter_sd area_sd smoothness_sd compactness_sd concavity_sd concave_sd symmetry_sd fractal_sd radius_max texture_max perimeter_max area_max smoothness_max compactness_max concavity_max concave_max symmetry_max fractal_max
程序
寫的不好,歡迎指正
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author:albert time:2018/11/28
import random
def read_dataset(address, val_list):  # Read tab-separated rows from a text file into a list.
    """Append every line of the file at *address* to *val_list* as a list of fields.

    Each line is split on tabs.  Returns *val_list* (also mutated in place).

    Fixes vs. the original: the file handle was never closed (now a
    ``with`` block) and the trailing newline was kept on the last field
    of every row (now stripped before splitting).
    """
    with open(address) as f:
        for colum in f:
            val_list.append(colum.rstrip('\n').split('\t'))
    return val_list
def split_dataset(dataset, percent):  # Split into a random test set of `percent` and the remaining training set.
    """Drop the header row of *dataset*, then randomly move ``percent`` of
    the remaining rows into a test list.

    Mutates *dataset* in place (header removed, test rows extracted) and
    returns ``(training_rows, test_rows)``.

    Fix: the original used ``dataset.remove(dataset[number])``, which
    deletes the first row *equal* to the chosen one — wrong when duplicate
    rows exist, and an O(n) scan per removal.  ``pop(index)`` deletes
    exactly the sampled row.
    """
    del dataset[0]  # drop the header row
    n = int(len(dataset) * percent)
    test_data = []
    for _ in range(n):
        number = random.randrange(len(dataset))
        test_data.append(dataset.pop(number))
    return dataset, test_data
def get_label(data):
    """Collect the diagnosis label (column 1) from every row of *data*."""
    return [row[1] for row in data]
def min_max_train(data):  # Min-max normalize the attribute columns of the training set in place.
    """Scale columns 2.. of *data* to [0, 1] using each column's own range.

    Columns 0 (ID) and 1 (label) are skipped; their slots in ``max_val``
    and ``min_val`` hold placeholder zeros so indices line up with the
    data columns.

    Returns ``(data, max_val, min_val)`` — the normalized rows plus the
    per-column extremes, which the caller feeds to ``min_max_test``.

    Fixes: values are converted with ``float`` once per column instead of
    repeatedly, and a constant column (zero range) now normalizes to 0.0
    instead of raising ZeroDivisionError.
    """
    n_rows = len(data)      # number of samples
    n_cols = len(data[0])   # number of columns (ID + label + attributes)
    max_val = [0, 0]
    min_val = [0, 0]
    for i in range(2, n_cols):
        column = [float(row[i]) for row in data]
        hi = max(column)
        lo = min(column)
        max_val.append(hi)
        min_val.append(lo)
        span = hi - lo
        for j in range(n_rows):
            # Constant columns carry no information; map them to 0.0.
            data[j][i] = (column[j] - lo) / span if span else 0.0
    return data, max_val, min_val
def min_max_test(data, max_val, min_val):
    """Normalize columns 2.. of the test rows using the TRAINING extremes.

    ``max_val``/``min_val`` come from ``min_max_train``; test values
    outside the training range therefore map outside [0, 1].

    Fix: a constant training column (zero range) now maps to 0.0 instead
    of raising ZeroDivisionError — consistent with ``min_max_train``.

    Returns the normalized *data* (also mutated in place).
    """
    n_rows = len(data)      # number of samples
    n_cols = len(data[0])   # number of columns
    for i in range(2, n_cols):
        span = max_val[i] - min_val[i]
        for j in range(n_rows):
            data[j][i] = (float(data[j][i]) - min_val[i]) / span if span else 0.0
    return data
def distance(train_data, test_data):
    """Euclidean distance from one test row to every training row.

    Attribute columns start at index 2 (ID and label are skipped).
    Returns a list of ``[distance, row_index]`` pairs, one per training
    row, in training-set order.
    """
    n_attribute = len(train_data[0])
    di = []
    for idx, row in enumerate(train_data):
        squared = sum((test_data[col] - row[col]) ** 2
                      for col in range(2, n_attribute))
        di.append([squared ** 0.5, idx])
    return di
def sort_k(distance, k):
    """Return the *k* entries of *distance* with the smallest distances,
    sorted ascending.

    Fix: the original seeded a buffer with the first k entries and, for
    each later entry, overwrote the FIRST buffer element it beat instead
    of the LARGEST — e.g. buffer [1, 5, 9] plus a new distance 2 became
    [1, 2, 9], evicting 5 while keeping 9 — so the result was not always
    the k nearest neighbours.  Sorting by distance and slicing is both
    correct and simpler.
    """
    return sorted(distance, key=lambda pair: pair[0])[:k]
def find_result(k_di, train_data):
    """Majority vote over the labels of the k nearest neighbours.

    *k_di* is a list of ``[distance, train_row_index]`` pairs; the label
    lives in column 1 of each training row.  Prints the share of each
    class, then returns ``'M'``, ``'B'``, or ``'same'`` on a tie.
    """
    votes = [train_data[entry[1]][1] for entry in k_di]
    m_count = votes.count('M')
    b_count = votes.count('B')
    total = len(k_di)
    print('THE PERCENT OF M IS %f' % (m_count / total))
    print('THE PERCENT OF B IS %f' % (b_count / total))
    if m_count > b_count:
        return 'M'
    if b_count > m_count:
        return 'B'
    return 'same'
# Demo driver: load the WDBC data, hold out 20% as a test set, min-max
# normalize with the training extremes, then classify ONE test sample
# with k=25 nearest neighbours and compare against its true label.
file_address = 'C:/Users/J/Desktop/wdbc.data.txt' # local path to the text copy of the WDBC data
val_list = []
read_dataset(file_address, val_list)
train_data, test_data = split_dataset(val_list, 0.2)
train_data, max_val, min_val = min_max_train(train_data)
test_data = min_max_test(test_data, max_val, min_val)
di = distance(train_data, test_data[0])  # distances from the first test sample to all training rows
print(di)
k_di = sort_k(di, 25)  # k = 25 nearest neighbours
print(k_di)
result = find_result(k_di, train_data)
# Only a single test sample is classified here as a demonstration;
# adapt the loop over test_data as needed.
print(result)           # predicted label
print(test_data[0][1])  # true label of that sample