機器學習之K近鄰算法 kNN(1)

可以說kNN是機器學習中非常特殊的、沒有模型的算法。爲了和其他算法統一,可以認爲訓練數據集就是模型本身。

1. kNN算法基本實現

import numpy as np
import matplotlib.pyplot as plt
from math import sqrt
from collections import Counter

# Feature matrix: one [feature_0, feature_1] row per training sample.
raw_data_x = [
    [3.393533211, 2.331273381],
    [3.110073483, 1.781539638],
    [1.343808831, 3.368360954],
    [3.582294042, 4.679179110],
    [2.280362439, 2.866990263],
    [7.423469421, 4.694522875],
    [5.745051997, 3.533989803],
    [9.172168622, 2.511101045],
    [7.792783481, 3.424088941],
    [7.939820817, 0.791637231],
]

# Class label of each sample, row-aligned with raw_data_x:
# the first five samples are class 0, the last five are class 1.
raw_data_y = [0] * 5 + [1] * 5

# Training set as NumPy arrays.
x_train, y_train = np.array(raw_data_x), np.array(raw_data_y)

# A new, unlabeled sample whose class has to be predicted.
new = np.array([8.093607318, 3.365731514])

# Training samples, one colour per class: class 0 in green, class 1 in red.
for label, colour in ((0, 'g'), (1, 'r')):
    members = x_train[y_train == label]
    plt.scatter(members[:, 0], members[:, 1], color=colour)
# The new sample, drawn in blue.
plt.scatter(new[0], new[1], color='b')
# plt.show()

# Judging from the plot, the new sample belongs to class 1.

# The kNN procedure, step by step.

# Euclidean distance from every training sample to the new sample.
# np.sum((x - new) ** 2) is equivalent to
# (x[0] - new[0]) ** 2 + (x[1] - new[1]) ** 2.
distances = [sqrt(np.sum((x - new) ** 2)) for x in x_train]

# Indices of the training samples, ordered from nearest to farthest.
nearest = np.argsort(distances)

# Number of neighbours that take part in the vote.
K = 6

# Class labels of the K nearest training samples.
topK_y = list(y_train[nearest[:K]])

# Voting: count how often each label occurs among the K neighbours.
votes = Counter(topK_y)

# The predicted class is the label with the most votes.
predict_y = votes.most_common(1)[0][0]

print(predict_y)

2.函數

很容易把上述的過程整理出來寫出一個函數

import numpy as np
from math import sqrt
from collections import Counter


def kNN_classify(k, x_train, y_train, new):
    """Predict the class of a single sample with k-nearest-neighbours voting.

    Args:
        k: Number of nearest neighbours that vote; must satisfy
            1 <= k <= number of training samples.
        x_train: Training features, array-like of shape (n_samples, n_features).
        y_train: Training labels, array-like of shape (n_samples,).
        new: The sample to classify, array-like of shape (n_features,).

    Returns:
        The label that occurs most often among the k nearest training
        samples (Euclidean distance). On a tie in vote counts,
        ``Counter.most_common`` keeps first-encountered order, so the tied
        label whose neighbour is nearest wins.

    Raises:
        AssertionError: If any argument fails the checks below.
    """
    # Accept plain lists as well as ndarrays; ndarrays pass through uncopied.
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)
    new = np.asarray(new)

    # Validate the arguments.
    assert 1 <= k <= x_train.shape[0], "k must be valid "
    assert x_train.shape[0] == y_train.shape[0], "the size of x_train must equal to the size of y_train"
    # A 2-D `new` would broadcast against x_train instead of being one sample.
    assert new.ndim == 1, "new must be a single sample (1-D array)"
    assert x_train.shape[1] == new.shape[0], "the feature number of new must be equal to x_train"

    # Euclidean distance from every training sample to `new`, in one
    # vectorised expression instead of a Python-level loop over rows.
    distance = np.sqrt(np.sum((x_train - new) ** 2, axis=1))

    # Training-sample indices ordered from nearest to farthest.
    nearest = np.argsort(distance)

    # Labels of the k nearest samples, then majority vote.
    topK_y = [y_train[i] for i in nearest[:k]]
    votes = Counter(topK_y)
    return votes.most_common(1)[0][0]

3.使用sklearn中的kNN算法

from sklearn.neighbors import KNeighborsClassifier
import numpy as np

# Feature matrix: one [feature_0, feature_1] row per training sample.
raw_data_x = [
    [3.393533211, 2.331273381],
    [3.110073483, 1.781539638],
    [1.343808831, 3.368360954],
    [3.582294042, 4.679179110],
    [2.280362439, 2.866990263],
    [7.423469421, 4.694522875],
    [5.745051997, 3.533989803],
    [9.172168622, 2.511101045],
    [7.792783481, 3.424088941],
    [7.939820817, 0.791637231],
]

# Class label of each sample, row-aligned with raw_data_x:
# the first five samples are class 0, the last five are class 1.
raw_data_y = [0] * 5 + [1] * 5

# Training set as NumPy arrays.
x_train, y_train = np.array(raw_data_x), np.array(raw_data_y)

# The sample to classify, wrapped in an outer list so that it forms a
# 2-D array with a single row (a batch of one sample).
new = np.array([[8.093607318, 3.365731514]])

# Create a classifier that votes among the 6 nearest neighbours and fit it
# on the training set (scikit-learn's fit returns the estimator itself).
kNN_classifier = KNeighborsClassifier(n_neighbors=6).fit(x_train, y_train)

# Predict the class of every row in `new` and show the result.
predicted_labels = kNN_classifier.predict(new)
print(predicted_labels)

4.模擬sklearn的方式使用面向對象的方式實現

import numpy as np
from math import sqrt
from collections import Counter


class KNNClassifier:
    """k-nearest-neighbours classifier with a scikit-learn style interface.

    kNN has no real training step: ``fit`` only stores the training set, and
    ``predict`` classifies each sample by a majority vote among its ``k``
    nearest training samples (Euclidean distance).
    """

    def __init__(self, k):
        """Initialise the classifier.

        Args:
            k: Number of nearest neighbours that vote; must be at least 1.

        Raises:
            AssertionError: If ``k`` is smaller than 1.
        """
        assert 1 <= k, "k must be valid "
        self.k = k
        self._x_train = None
        self._y_train = None

    def __repr__(self):
        return f"{type(self).__name__}(k={self.k!r})"

    def fit(self, x_train, y_train):
        """Store the training set ``x_train`` / ``y_train``.

        Args:
            x_train: Training features, array-like of shape
                (n_samples, n_features).
            y_train: Training labels, array-like of shape (n_samples,).

        Returns:
            The classifier itself, so calls can be chained.

        Raises:
            AssertionError: If the two arrays differ in length, or if there
                are fewer training samples than ``k``.
        """
        # Accept plain lists as well as ndarrays; ndarrays pass through uncopied.
        x_train = np.asarray(x_train)
        y_train = np.asarray(y_train)
        assert x_train.shape[0] == y_train.shape[0], "the size of x_train must equal to the size of y_train"
        assert self.k <= x_train.shape[0], "k must be less than or equal to the size of x_train"

        self._x_train = x_train
        self._y_train = y_train
        return self

    def predict(self, new):
        """Predict a label for every row of ``new``.

        Args:
            new: Samples to classify, array-like of shape
                (n_queries, n_features).

        Returns:
            A NumPy array with one predicted label per row of ``new``.

        Raises:
            AssertionError: If called before ``fit``, or if the feature count
                of ``new`` differs from that of the training set.
        """
        assert self._x_train is not None and self._y_train is not None, "must fit before predict!"
        new = np.asarray(new)
        assert new.shape[1] == self._x_train.shape[1], "the feature number of new must be equal to x_train"

        y_predict = [self._predict(x) for x in new]
        return np.array(y_predict)

    def _predict(self, x):
        """Predict the label of the single sample ``x`` (shape (n_features,))."""
        assert x.shape[0] == self._x_train.shape[1], "the feature number of x must be equal to x_train"
        # Euclidean distance from every training sample to x, vectorised
        # over the rows of the training matrix.
        distance = np.sqrt(np.sum((self._x_train - x) ** 2, axis=1))

        # Training-sample indices ordered from nearest to farthest.
        nearest = np.argsort(distance)

        # Labels of the k nearest samples, then majority vote.
        topK_y = [self._y_train[i] for i in nearest[:self.k]]
        votes = Counter(topK_y)
        return votes.most_common(1)[0][0]
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章