kNN Algorithm Implementation

kNN Algorithm

import numpy as np
import matplotlib.pyplot as plt
raw_data_X = [[3.4, 2.3],
              [3.1, 1.8],
              [1.3, 3.4],
              [3.6, 4.7],
              [2.3, 2.9],
              [7.4, 4.7],
              [5.7, 3.5],
              [9.2, 2.5],
              [7.8, 3.4],
              [7.9, 0.8],
             ]
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
X_train = np.array(raw_data_X)
y_train = np.array(raw_data_y)
X_train
array([[3.4, 2.3],
       [3.1, 1.8],
       [1.3, 3.4],
       [3.6, 4.7],
       [2.3, 2.9],
       [7.4, 4.7],
       [5.7, 3.5],
       [9.2, 2.5],
       [7.8, 3.4],
       [7.9, 0.8]])
y_train
array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
# the point to predict
x = np.array([8.1, 3.4])
plt.scatter(X_train[y_train == 0, 0], X_train[y_train == 0, 1], color='g')
plt.scatter(X_train[y_train == 1, 0], X_train[y_train == 1, 1], color='r')
plt.scatter(x[0], x[1], color='b')
<matplotlib.collections.PathCollection at 0x20ba3c947b8>

[Figure: scatter plot of the training samples (green = class 0, red = class 1) and the point to predict (blue)]

1. The kNN Implementation Process

  • Euclidean distance
    Dist = \sqrt{\sum_{i=1}^{n}\left(X_i^{(a)} - X_i^{(b)}\right)^2}
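For example, the distance between the training point (7.8, 3.4) and the point to predict (8.1, 3.4) is \sqrt{(7.8 - 8.1)^2 + (3.4 - 3.4)^2} = 0.3, which turns out to be the smallest entry in the distances list computed below.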
from math import sqrt

distances = []
for x_train in X_train:
    # compute the Euclidean distance
    d = sqrt(np.sum((x_train - x) ** 2))
    distances.append(d)

# the same distances can be computed with a list comprehension:
# distances = [sqrt(np.sum((x_train - x) ** 2)) for x_train in X_train]
distances
[4.827007354458868,
 5.2497618993626745,
 6.8,
 4.684015371452148,
 5.821511831131154,
 1.4764823060233399,
 2.4020824298928622,
 1.4212670403551892,
 0.2999999999999998,
 2.607680962081059]
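For reference (this vectorized form is my addition, not part of the original notebook), NumPy can compute all ten distances in a single broadcasted expression:

distances = np.sqrt(np.sum((X_train - x) ** 2, axis=1))
# or equivalently: np.linalg.norm(X_train - x, axis=1)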
nearest = np.argsort(distances)
nearest
array([8, 7, 5, 6, 9, 3, 0, 1, 4, 2], dtype=int64)
# find the k nearest points
k = 6
topK_y = [y_train[i] for i in nearest[:k]]
topK_y
[1, 1, 1, 1, 1, 0]
from collections import Counter
# count the class labels among the k nearest neighbors
Counter(topK_y)
Counter({1: 5, 0: 1})
votes = Counter(topK_y)
# return the most common labels; here we only ask for the top 1
votes.most_common(1)
[(1, 5)]
# the final prediction
predict_y = votes.most_common(1)[0][0]
predict_y
1
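The steps above can be collected into a single helper function. This is only a consolidation sketch of the code in this section (the name kNN_classify is my own, and it reuses the np, sqrt and Counter imports from above):

def kNN_classify(k, X_train, y_train, x):
    """Hand-written kNN: predict the label of a single sample x."""
    distances = [sqrt(np.sum((x_train - x) ** 2)) for x_train in X_train]
    nearest = np.argsort(distances)
    topK_y = [y_train[i] for i in nearest[:k]]
    votes = Counter(topK_y)
    return votes.most_common(1)[0][0]

kNN_classify(6, X_train, y_train, x)   # -> 1 for the point (8.1, 3.4)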

2. Using kNN from scikit-learn

from sklearn.neighbors import KNeighborsClassifier
KNN_classifier = KNeighborsClassifier(n_neighbors=6)
KNN_classifier.fit(X_train, y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=6, p=2,
           weights='uniform')
# in sklearn, predict expects a 2D array of samples
X_predict = x.reshape(1, -1)
X_predict
array([[8.1, 3.4]])
KNN_classifier.predict(X_predict)
array([1])
y_predict = KNN_classifier.predict(X_predict)
y_predict[0]
1
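On a real dataset the classifier would normally be evaluated on held-out data. Below is a minimal sketch using scikit-learn's train_test_split and the estimator's score method; the Iris dataset is used only for illustration and is not part of the original notebook:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()
X_tr, X_te, y_tr, y_te = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

clf = KNeighborsClassifier(n_neighbors=6)
clf.fit(X_tr, y_tr)
clf.score(X_te, y_te)   # mean accuracy on the held-out test set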

3. Reorganizing Our kNN Code

from kNN.kNN import KNNClassifier
knn_clf = KNNClassifier(k=6)
knn_clf.fit(X_train, y_train)
KNN(k=6)
y_predict = knn_clf.predict(X_predict)
y_predict
array([1])

KNNClassifier.py

import numpy as np
from math import sqrt
from collections import Counter
from metrics import accuracy_score

class KNNClassifier:
    def __init__(self, k):
        assert k >= 1, "k must be valid"
        self.k = k
        self._X_train = None
        self._y_train = None

    def fit(self, X_train, y_train):
        """Train the kNN classifier on the training data X_train and y_train"""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        assert self.k <= X_train.shape[0], \
            "the size of X_train must be at least k."

        self._X_train = X_train
        self._y_train = y_train
        return self

    def predict(self, X_predict):
        """Given a data matrix X_predict, return the predicted labels for all of its samples"""
        assert self._X_train is not None and self._y_train is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == self._X_train.shape[1], \
            "the feature number of X_predict must be equal to X_train"

        y_predict = [self._predict(x_predict) for x_predict in X_predict]
        return np.array(y_predict)

    def _predict(self, x_predict):
        """Given a single sample x_predict, return its predicted label"""
        assert x_predict.shape[0] == self._X_train.shape[1], \
            "the feature number of x must be equal to X_train"

        dist = [sqrt(np.sum((x_train - x_predict) ** 2)) for x_train in self._X_train]
        nearest = np.argsort(dist)
        top_K = [self._y_train[i] for i in nearest[:self.k]]
        votes = Counter(top_K)
        return votes.most_common(1)[0][0]

    def score(self, X_test, y_test):
        """Compute the classifier's accuracy on the test set X_test, y_test"""
        y_predict = self.predict(X_test)
        return accuracy_score(y_test, y_predict)

    def __repr__(self):
        return "KNN(k=%d)" % self.k

metrics.py

import numpy as np

def accuracy_score(y_true, y_predict):
    """Compute the accuracy between y_true and y_predict"""
    assert y_true.shape[0] == y_predict.shape[0], \
        "the size of y_true must be equal to the size of y_predict"

    return sum(y_true == y_predict) / len(y_true)
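
As a quick usage note (my addition, not part of the original post), accuracy_score simply compares the two label arrays element-wise; knn_clf.score(X_test, y_test) follows the same pattern once a held-out test set is available:

import numpy as np
from metrics import accuracy_score

y_true = np.array([0, 1, 1, 0])
y_predict = np.array([0, 1, 0, 0])
accuracy_score(y_true, y_predict)   # 0.75, since three of the four labels match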

Project source code
