本文使用KNN算法及其變種,對Kaggle上的糖尿病數據集進行分類預測。廢話省略,直接上代碼:
'''
kaggle大賽中的糖尿病預測。
'''
import pandas as pd;
import numpy as np;
from sklearn.neighbors import KNeighborsClassifier;
import matplotlib.pyplot as plt;
from sklearn.model_selection import train_test_split;
def PandasReadData(filepath):
    """Load a CSV file into a DataFrame and print a short overview of it.

    Three things are printed: the frame's shape, its first rows, and the
    number of rows for each value of the "Outcome" column.

    For the Kaggle diabetes file the original author recorded a shape of
    (768, 9): 768 samples, 8 features and 1 label with two classes, 0 and 1.

    Args:
        filepath: Path (or any source accepted by ``pd.read_csv``) of the CSV.

    Returns:
        The DataFrame read from ``filepath``.
    """
    frame = pd.read_csv(filepath)

    # Overall dimensions as (rows, columns).
    print(frame.shape)
    # Preview of the leading rows.
    print(frame.head())

    # Label statistics: how many samples fall into each "Outcome" class.
    class_counts = frame.groupby("Outcome").size()
    print(class_counts)

    return frame
def splitdata(data, test_size=0.3, random_state=None):
    """Split the dataset into training and test features and labels.

    Columns 0-7 are taken as the features and column 8 as the label.

    Args:
        data: DataFrame with the eight feature columns first and the label
            in the ninth column (index 8).
        test_size: Fraction of the rows held out for testing. Defaults to
            0.3, the value that was previously hard-coded.
        random_state: Optional seed forwarded to ``train_test_split``. The
            default ``None`` keeps the split different on every run; pass an
            int to make a run reproducible.

    Returns:
        A tuple ``(x_train, x_test, y_train, y_test)``.
    """
    # Start at column 0. Slicing from column 1 silently dropped the first
    # feature, leaving only 7 of the 8 features for training.
    features = data.iloc[:, 0:8]
    labels = data.iloc[:, 8]
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )
    return x_train, x_test, y_train, y_test
def KNN(x_train, x_test, y_train, y_test):
    """Fit three nearest-neighbour classifiers and print each test score.

    The models are a plain 3-neighbour KNN, a distance-weighted 5-neighbour
    KNN, and a radius-based classifier with radius 50.0. Each is fitted on
    the training split, and the result of ``score`` on the test split (mean
    accuracy for scikit-learn classifiers) is printed with the prefixes
    ``knn1:``, ``knn2:`` and ``knn3:``.

    Args:
        x_train: Training features.
        x_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        None. The scores are only printed.
    """
    # Imported locally because the module-level imports only bring in
    # KNeighborsClassifier.
    from sklearn.neighbors import RadiusNeighborsClassifier

    # Plain KNN: unweighted vote among the 3 nearest neighbours.
    knn1 = KNeighborsClassifier(n_neighbors=3)
    knn1.fit(x_train, y_train)
    print("knn1:{}".format(knn1.score(x_test, y_test)))

    # Weighted KNN: closer neighbours get a larger say in the vote.
    knn2 = KNeighborsClassifier(n_neighbors=5, weights="distance")
    knn2.fit(x_train, y_train)
    print("knn2:{}".format(knn2.score(x_test, y_test)))

    # Radius KNN: vote among all training points within the given radius.
    # KNeighborsClassifier has no ``radius`` parameter, so the dedicated
    # RadiusNeighborsClassifier is used instead. ``outlier_label`` keeps
    # prediction from raising when a test point has no neighbour in range.
    knn3 = RadiusNeighborsClassifier(radius=50.0, outlier_label="most_frequent")
    knn3.fit(x_train, y_train)
    print("knn3:{}".format(knn3.score(x_test, y_test)))
if __name__ == '__main__':
    # Script entry point: load the CSV, split it into train/test parts, and
    # pass the four resulting pieces straight to the KNN comparison.
    KNN(*splitdata(PandasReadData("diabetes.csv")))
運行結果:
(768, 9)
Pregnancies Glucose ... Age Outcome
0 6 148 ... 50 1
1 1 85 ... 31 0
2 8 183 ... 32 1
3 1 89 ... 21 0
4 0 137 ... 33 1
[5 rows x 9 columns]
Outcome
0 500
1 268
dtype: int64
knn1:0.7705627705627706
knn2:0.7489177489177489
knn3:0.7619047619047619
從這一次的運行結果來看,改進後的算法似乎還沒有普通的KNN算法好。不過需要注意:訓練集與測試集是隨機劃分的,每次運行的結果都會有波動;knn1與knn2使用的k值也不同(3和5),而且特徵沒有做標準化。因此僅憑這一次運行,還不能斷定哪種算法更好。