import pandas as pd
import numpy as np
# 1-nearest-neighbour classification of the samples in a testing CSV using the
# samples in a training CSV. In both files the last column is the label and
# every preceding column is a feature.


def _min_max_scale(train_x, test_x):
    """Rescale both feature matrices with the per-column min/max of *train_x*.

    Args:
        train_x: float array of shape (n_train, n_features).
        test_x: float array of shape (n_test, n_features).

    Returns:
        A ``(scaled_train_x, scaled_test_x)`` tuple of new arrays; the inputs
        are not modified.
    """
    col_min = train_x.min(axis=0)
    col_range = train_x.max(axis=0) - col_min
    # A constant training column has zero range; dividing by 1 instead maps
    # it to 0 and avoids a division by zero.
    col_range[col_range == 0] = 1.0
    return (train_x - col_min) / col_range, (test_x - col_min) / col_range


def _predict_nearest(train_x, train_y, test_x):
    """Return, for each row of *test_x*, the label of its closest training row.

    Closeness is the squared Euclidean distance. ``np.argmin`` returns the
    first minimum, so ties go to the earliest training sample.
    """
    nearest = []
    for sample in test_x:
        delta = train_x - sample
        nearest.append(np.argmin(np.sum(delta * delta, axis=1)))
    # Explicit integer dtype keeps the fancy index valid when test_x is empty.
    return train_y[np.asarray(nearest, dtype=np.intp)]


def main(train_path="iris-data-training.csv", test_path="iris-data-testing.csv"):
    """Classify the test samples, print the results, and return the accuracy.

    Prints the predicted labels, then the true labels, then the recognition
    rate (fraction of test samples whose predicted label equals the true one).

    Args:
        train_path: CSV file with the training samples.
        test_path: CSV file with the testing samples.

    Returns:
        The recognition rate as a float; ``nan`` when the test file has no rows.

    Raises:
        ValueError: if the training file has no rows.
    """
    # Read the data with pandas and convert the DataFrames to np.array.
    train_array = pd.read_csv(train_path, encoding="GBK").values
    test_array = pd.read_csv(test_path, encoding="GBK").values
    if train_array.shape[0] == 0:
        raise ValueError("training set is empty; nearest neighbour is undefined")

    # Split features from labels. The features are copied to float so that
    # scaling is not truncated by an integer dtype and does not depend on the
    # dtype of the label column.
    train_x = train_array[:, :-1].astype(float)
    train_y = train_array[:, -1]
    test_x = test_array[:, :-1].astype(float)
    test_y = test_array[:, -1]

    # Normalise the features with the training set's min/max.
    train_x, test_x = _min_max_scale(train_x, test_x)

    # Predict the label of every test sample with the nearest-neighbour rule.
    predicted = _predict_nearest(train_x, train_y, test_x)
    print(list(predicted))
    print(test_y.tolist())

    # Recognition rate. Labels are compared with == rather than subtracted,
    # so non-numeric labels work as well.
    if len(test_y) == 0:
        rate = float("nan")
    else:
        rate = np.count_nonzero(predicted == test_y) / len(test_y)
    print(rate)
    return rate


if __name__ == "__main__":
    main()