knn.py
本文爲 落魄陶陶 原創,轉載請註明出處
數據來源及源碼參見github
- 學習並參考《機器學習實戰》第二章
- 主要使用Pandas庫
- knn.py爲基本算法實現,基於數據knn.xlsx
- knn_dating.py爲約會預測,基於數據datingTestSet.txt
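# 1. Read the data into a DataFrame df in which every row is x1, x2, x3, ..., xn, y
# 2. Compute the distance between the given point target = (x1, x2, ..., xn) and every point of df from step 1
# 3. Sort the distances from step 2 in ascending order and take the y values of the k nearest points
#    (k is the `n` argument of classify(); it is unrelated to the number of features n above)
# 4. The y value that occurs most often in step 3 is the predicted y of target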
from collections import namedtuple
import numpy as np
import pandas as pd
# Scaling record kept for one feature column: its minimum, its maximum and
# their difference (max - min). Despite the variable name this is a tuple type.
MinMaxDict = namedtuple(typename='MinMax', field_names='min max range')
def load_data(path: str = 'knn.xlsx') -> pd.DataFrame:
    """Load the sample data set from an Excel workbook.

    :param path: location of the workbook. Defaults to ``knn.xlsx`` (resolved
        against the current working directory), the file this function
        originally had hard-coded.
    :return: the workbook contents as a DataFrame; the last column is the
        label and all other columns are the inputs.
    """
    return pd.read_excel(path)
def norm(df):
    """Min-max scale every feature column of ``df``.

    Each column except the last is rescaled with ``(value - min) / (max - min)``.
    The last column is treated as the label and is left as it is.

    :param df: DataFrame whose last column is the label and whose other
        columns are numeric features.
    :return: ``(scaled, min_max_list)``. ``scaled`` is a scaled copy of ``df``;
        ``min_max_list`` holds one ``MinMaxDict(min, max, range)`` per feature
        column, in column order, so a query point can be scaled the same way.
    """
    # Work on a copy: the caller's frame stays untouched, and passing a slice
    # of another frame no longer triggers chained-assignment warnings.
    scaled = df.copy()
    min_max_list = []
    for column in scaled.columns[:-1]:
        col = scaled[column]
        min_val, max_val = col.min(), col.max()
        length = max_val - min_val
        if length == 0:
            # A constant column carries no information; dividing would give
            # 0/0 (NaN) and poison every distance computed from it.
            scaled[column] = 0.0
        else:
            scaled[column] = (col - min_val) / length
        min_max_list.append(MinMaxDict(min_val, max_val, length))
    return scaled, min_max_list
def norm_target(target, min_max_list):
    """Scale one sample with previously recorded per-feature statistics.

    :param target: indexable sequence of raw feature values, in the same
        order as ``min_max_list``. Entries beyond ``len(min_max_list)`` are
        ignored.
    :param min_max_list: one record per feature exposing ``min`` and
        ``range`` attributes (as produced by ``norm``).
    :return: tuple with one scaled value per entry of ``min_max_list``.
    :raises IndexError: if ``target`` has fewer entries than ``min_max_list``.
    """
    scaled = []
    for i, stats in enumerate(min_max_list):
        if stats.range == 0:
            # The feature was constant when the statistics were recorded;
            # map it to 0.0 instead of dividing by zero.
            scaled.append(0.0)
        else:
            scaled.append((target[i] - stats.min) / stats.range)
    return tuple(scaled)
def classify(n, df, target):
    """Predict the label of ``target`` by a vote among its nearest rows.

    A two-column frame ``[labels, distance]`` is built: ``labels`` is the
    label column of ``df`` and ``distance`` is the Euclidean distance from
    ``target`` to each row of ``df``. The frame is sorted by distance, the
    first ``n`` rows are kept, and the label that occurs most often among
    them is the result.

    :param n: number of nearest rows that take part in the vote.
    :param df: training data; the last column is the label, all other
        columns are features.
    :param target: feature values of the point to classify, one per feature
        column of ``df``.
    :return: the first entry of the descending label counts of the ``n``
        nearest rows.
    """
    feature_names = df.columns[:-1]
    label_name = df.columns[-1]

    # Row-wise difference to the target, then the Euclidean norm per row.
    offsets = df.loc[:, feature_names].sub(target, axis='columns')
    distances = np.sqrt((offsets ** 2).sum(axis=1))

    neighbours = pd.DataFrame()
    neighbours['labels'] = df[label_name].copy()
    neighbours['distance'] = distances

    nearest = neighbours.sort_values(by='distance', ascending=True).iloc[:n]
    votes = nearest['labels'].value_counts(ascending=False)
    return votes.index[0]
if __name__ == '__main__':
    n = 5  # number of nearest neighbours that vote
    # 1. Load the data into a DataFrame.
    df = load_data()
    # 2. Scale the features and keep each column's min / max / range so the
    #    query point can be scaled the same way.
    df, min_max_list = norm(df)
    # 3. Scale the query point.
    item = (0, 10)
    item = norm_target(item, min_max_list)
    # 4. Compute the Euclidean distance from the query point to the existing
    #    data and take the n smallest to obtain the classification.
    result = classify(n, df, item)
    print(result)
約會預測
knn_dating.py
import pandas as pd
import knn
def load_dating_data(file, ratio):
    """Load the dating data set and split it into training and testing parts.

    :param file: path or file-like object accepted by ``pandas.read_csv``.
        The content is tab-separated with four fields per row: three numeric
        features followed by a textual label.
    :param ratio: fraction of the rows (taken from the top of the file) that
        goes into the training part; the remaining rows form the testing part.
    :return: ``(training, testing)`` DataFrames with the columns
        ``fly_miles``, ``game_time``, ``ice_cream`` and ``labels``, where
        ``labels`` is encoded as smallDoses=1, didntLike=2, largeDoses=3.
    :raises ValueError: if a row carries a label other than the three above,
        or if the file does not have exactly four columns.
    """
    label_codes = {'smallDoses': 1, 'didntLike': 2, 'largeDoses': 3}

    # header=None: every column is renamed right after reading, which
    # suggests the file has no header row -- NOTE(review): confirm against
    # the data file. Without it, pandas uses the first sample as column names
    # and that sample is silently lost.
    df = pd.read_csv(file, sep='\t', header=None)
    df.columns = ['fly_miles', 'game_time', 'ice_cream', 'labels']

    codes = df['labels'].map(label_codes)
    unknown = df.loc[codes.isna(), 'labels']
    if not unknown.empty:
        # Series.map with a dict yields NaN for unmapped values; fail loudly
        # instead, as the list lookup this replaces did.
        raise ValueError(f'unknown label(s): {sorted(set(map(str, unknown)))}')
    df['labels'] = codes.astype('int64')

    num = int(df.shape[0] * ratio)
    # Independent copies, so later modification of either part neither
    # affects the other nor raises chained-assignment warnings.
    return df.iloc[:num].copy(), df.iloc[num:].copy()
# class DatingKnn(KNN):
#
# def __init__(self, n, ratio):
# super().__init__(n)
# num = int(self.df.shape[0] * ratio)
# self.testing_set = self.df[num:]
# self.df = self.df[:num]
#
# def load_data(self):
# return load_dating_data('datingTestSet.txt')
#
# def test(self):
# results = pd.Series([self.classify(item[1:-1], False) for item in self.testing_set.itertuples()])
# labels = self.testing_set.labels
# labels.index = results.index
# bingo = (labels - results).value_counts()[0]
# print(bingo / results.shape[0])
# # print(labels-results)
# # print(self.testing_set.labels)
if __name__ == '__main__':
    n = 5  # number of nearest neighbours that vote
    ratio = 0.9  # 90% of the data is used for training, the remaining 10% for testing
    # 1. Load the data and split it into training and testing DataFrames.
    training_set, testing_set = load_dating_data('datingTestSet.txt', ratio)
    # 2. Scale the training features and keep each column's min / max / range
    #    so the test samples can be scaled the same way.
    df, min_max_list = knn.norm(training_set)
    results_list = []
    for item in testing_set.itertuples(index=False):
        # 3. Scale the sample (the last field is its label, so drop it).
        features = knn.norm_target(item[:-1], min_max_list)
        # 4. Compute the Euclidean distance to the training data and take the
        #    n smallest to obtain the classification.
        results_list.append(knn.classify(n, df, features))
    # 5. Use the testing set to compute the accuracy.
    total = len(results_list)
    # Comparing two Series aligns them on their index, so give the true
    # labels the same 0..total-1 index the predictions get.
    labels = testing_set[testing_set.columns[-1]].reset_index(drop=True)
    results = pd.Series(results_list)
    # Count matches directly; looking up the 0 entry of a difference
    # histogram fails when no prediction is correct.
    bingo = int((labels == results).sum())
    percent = bingo / total if total else float('nan')
    print(percent)