# 這裏使用 Python 復現 KNN 算法 (re-implementation of the KNN algorithm in Python):
import pandas as pd
import numpy as np
class KNN:
    """K-nearest-neighbours classifier: Euclidean distance + majority vote."""

    def __init__(self):
        # Training data; populated by fit(). Starts as None until fitted.
        self.x_train: pd.DataFrame = None
        self.y_train: pd.Series = None

    def euclidean_distance(self, x1: pd.Series, x2: pd.Series) -> float:
        """Return the Euclidean distance between two feature vectors."""
        # np.asarray replaces the deprecated np.mat matrix type.
        a = np.asarray(x1, dtype=float)
        b = np.asarray(x2, dtype=float)
        return float(np.sqrt(np.sum(np.square(a - b))))

    def fit(self, x_train, y_train):
        """Store training features and labels (lazy learner: no work here)."""
        self.x_train = x_train
        self.y_train = y_train

    def get_classify(self, x_test: pd.DataFrame, topk: int):
        """Predict a label for every row of x_test by majority vote among its
        topk nearest training samples.

        Returns a list of predicted labels, or None on invalid input
        (model not fitted, no test data, or topk <= 0).
        """
        x_train = self.x_train
        y_train = self.y_train
        if x_train is None or x_test is None or topk <= 0:
            return None
        # BUGFIX: train and test sets may have different numbers of rows;
        # only the feature dimension (column count) must match.
        assert x_train.shape[1] == x_test.shape[1]
        label_classify = []
        # For each test sample, scan the whole training set.
        for _, test_data in x_test.iterrows():
            distance_list = [
                self.euclidean_distance(train_data, test_data)
                for _, train_data in x_train.iterrows()
            ]
            # Positional indices of training rows, sorted by ascending distance.
            index_list = np.argsort(np.array(distance_list))
            # BUGFIX: argsort yields positional indices, so use .iloc —
            # y_train[index] breaks when the Series has a non-default index.
            # Slicing to topk (instead of a [0]*topk buffer) also avoids
            # padding the vote with bogus 0-labels when topk > len(x_train).
            topk_labels = [y_train.iloc[i] for i in index_list[:topk]]
            # Majority vote among the nearest neighbours.
            label_classify.append(max(topk_labels, key=topk_labels.count))
        return label_classify
# 整理數據
data = pd.read_csv('測試數據集.csv', header=None)
test_data = pd.read_csv('驗證數據集.csv', header=None)
x_train = data.drop(0, axis=1) # 測試數據集的特徵
y_train = data[0] # 測試數據集的分類
x_test = test_data.drop(0, axis=1) # 驗證數據集的特徵
# 調用模型
knn_model = KNN()
knn_model.fit(x_train=x_train, y_train=y_train)
y_test = knn_model.get_classify(x_test=x_test, topk=20)
# 測試
y_test_true = test_data[0]
err = 0 # 表示錯誤的個數
for i, data in enumerate(y_test):
true_num = y_test_true[i]
if data != true_num:
err += 1