KNN 代碼復現(Python 版)

這裏使用 Python 復現 KNN 算法:

import pandas as pd
import numpy as np


class KNN:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    ``fit`` only stores the training data; all distance computation happens
    in ``get_classify``.
    """

    def __init__(self):
        self.x_train: "pd.DataFrame | None" = None
        # Kept for backward compatibility; nothing in this class reads it.
        self.x_test: "pd.DataFrame | None" = None
        # Initialised here so that get_classify() on an unfitted model takes
        # the "return None" path instead of raising AttributeError.
        self.y_train = None

    def euclidean_distance(self, x1: pd.Series, x2: pd.Series):
        """Return the Euclidean distance between ``x1`` and ``x2``.

        Both arguments may be any array-like of numbers. Squared differences
        are summed over the last axis, so two 1-D vectors give a scalar,
        while a 2-D matrix of row vectors and one 1-D vector give one
        distance per row.
        """
        # np.asarray replaces np.mat, which NumPy 2.0 removed.
        a = np.asarray(x1, dtype=float)
        b = np.asarray(x2, dtype=float)
        return np.sqrt(np.sum(np.square(a - b), axis=-1))

    def fit(self, x_train, y_train):
        """Store the training features and their labels.

        Args:
            x_train: Training features, one sample per row.
            y_train: Class labels, one per row of ``x_train``.
        """
        self.x_train = x_train
        self.y_train = y_train

    def get_classify(self, x_test: pd.DataFrame, topk: int):
        """Predict a class label for every row of ``x_test``.

        Each test sample receives the most frequent label among its ``topk``
        nearest training samples. When several labels share the highest
        count, the one whose first vote comes from the nearest neighbour
        wins.

        Args:
            x_test: Samples to classify, one per row, with the same number
                of feature columns as the training data.
            topk: Number of neighbours that vote. Values larger than the
                training set are capped at the training-set size.

        Returns:
            A list with one predicted label per row of ``x_test``, or
            ``None`` if the model is not fitted, ``x_test`` is ``None`` or
            ``topk`` is not positive.

        Raises:
            ValueError: If the training set is empty, the inputs are not
                2-D, the feature counts differ, or the number of labels
                does not match the number of training rows.
        """
        x_train = self.x_train
        y_train = self.y_train
        if x_train is None or y_train is None or x_test is None or topk <= 0:
            return None

        # Work on plain arrays: rows and labels are then addressed by
        # position, independent of whatever index the pandas objects carry.
        train = np.asarray(x_train, dtype=float)
        test = np.asarray(x_test, dtype=float)
        labels = np.asarray(y_train).reshape(-1)

        if train.ndim != 2 or test.ndim != 2:
            raise ValueError("x_train and x_test must be two-dimensional")
        if train.shape[0] == 0:
            raise ValueError("x_train contains no samples")
        # Only the feature count has to agree; the row counts may differ.
        if train.shape[1] != test.shape[1]:
            raise ValueError(
                "x_train and x_test must have the same number of features"
            )
        if labels.shape[0] != train.shape[0]:
            raise ValueError("y_train must hold one label per row of x_train")

        # Never let more neighbours vote than there are training samples.
        k = min(topk, train.shape[0])

        label_classify = []
        for sample in test:
            distances = self.euclidean_distance(train, sample)
            # A stable sort keeps equally distant neighbours in training
            # order, which makes the result deterministic.
            nearest = np.argsort(distances, kind="stable")[:k]
            votes = [labels[position] for position in nearest]
            label_classify.append(max(votes, key=votes.count))
        return label_classify


# Prepare the data. In both CSV files column 0 is used as the class label
# and the remaining columns as the features.
data = pd.read_csv('測試數據集.csv', header=None)
test_data = pd.read_csv('驗證數據集.csv', header=None)
x_train = data.drop(0, axis=1)  # features the model is fitted on
y_train = data[0]  # labels the model is fitted on
x_test = test_data.drop(0, axis=1)  # features of the samples to classify

# Run the model.
knn_model = KNN()
knn_model.fit(x_train=x_train, y_train=y_train)
y_test = knn_model.get_classify(x_test=x_test, topk=20)

# Evaluate: `err` is the number of predictions that differ from the true label.
y_test_true = test_data[0]
# Predictions and true labels are paired by position, and the loop names are
# chosen so that the `data` frame loaded above is not overwritten.
err = sum(
    1
    for predicted, actual in zip(y_test, y_test_true.tolist())
    if predicted != actual
)
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章