本文使用 KNN 算法及其变种(距离加权 KNN、半径 KNN),对 Kaggle 上的糖尿病数据集进行预测。废话不多说,直接上代码:
'''
kaggle大赛中的糖尿病预测。
'''
import pandas as pd;
import numpy as np;
from sklearn.neighbors import KNeighborsClassifier;
import matplotlib.pyplot as plt;
from sklearn.model_selection import train_test_split;
def PandasReadData(filepath):
    """Load the diabetes CSV into a DataFrame and print a quick summary.

    Three things are printed: the frame's shape, its first rows, and the
    number of rows for each value of the ``Outcome`` column.

    For the ``diabetes.csv`` used by the author this showed 768 samples with
    8 feature columns and 1 label column, the label taking the two values
    0 and 1.

    Args:
        filepath: Location of the CSV file, passed straight to
            ``pandas.read_csv``. The file must contain an ``Outcome`` column.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    data = pd.read_csv(filepath)
    print(data.shape)  # (768, 9) in the author's run
    print(data.head())
    # Per-class sample counts of the label column.
    print(data.groupby("Outcome").size())
    return data
def splitdata(data, test_size=0.3, random_state=None):
    """Split a labelled DataFrame into train/test features and labels.

    The last column of ``data`` is used as the label and every other column
    as a feature.

    Args:
        data: DataFrame whose final column is the label.
        test_size: Fraction of the rows held out for testing (default 0.3).
        random_state: Seed forwarded to ``train_test_split``. The default
            ``None`` gives a different shuffle on every call; pass an int
            for a reproducible split.

    Returns:
        A tuple ``(x_train, x_test, y_train, y_test)``.
    """
    # The previous slice ``iloc[:, 1:8]`` started at column 1 and therefore
    # silently discarded the first feature column; take every column except
    # the label instead.
    features = data.iloc[:, :-1]
    labels = data.iloc[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )
    return x_train, x_test, y_train, y_test
def KNN(x_train, x_test, y_train, y_test):
    """Fit three nearest-neighbour classifiers and print each test accuracy.

    The three models are:

    * ``knn1`` - plain k-NN with ``n_neighbors=3`` and uniform votes;
    * ``knn2`` - k-NN with ``n_neighbors=5`` and distance-weighted votes;
    * ``knn3`` - radius-based neighbours with ``radius=50.0``.

    Args:
        x_train: Training feature matrix.
        x_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        None. Each accuracy is printed as ``knnN:<score>``.
    """
    # Local import: the file's top-level imports only bring in
    # KNeighborsClassifier.
    from sklearn.neighbors import RadiusNeighborsClassifier

    # Plain k-NN.
    knn1 = KNeighborsClassifier(n_neighbors=3)
    knn1.fit(x_train, y_train)
    print("knn1:{}".format(knn1.score(x_test, y_test)))

    # Distance-weighted k-NN.
    knn2 = KNeighborsClassifier(n_neighbors=5, weights="distance")
    knn2.fit(x_train, y_train)
    print("knn2:{}".format(knn2.score(x_test, y_test)))

    # Radius-based k-NN. KNeighborsClassifier has no ``radius`` parameter, so
    # the radius variant needs RadiusNeighborsClassifier. A test point with no
    # training sample inside the radius would otherwise raise ValueError, so
    # such outliers are assigned the most frequent training label.
    knn3 = RadiusNeighborsClassifier(radius=50.0, outlier_label="most_frequent")
    knn3.fit(x_train, y_train)
    print("knn3:{}".format(knn3.score(x_test, y_test)))
if __name__ == '__main__':
    # Script entry point: load the CSV, split it, then train and score the
    # classifiers.
    dataset = PandasReadData("diabetes.csv")
    splits = splitdata(dataset)
    KNN(*splits)
运行结果:
(768, 9)
Pregnancies Glucose ... Age Outcome
0 6 148 ... 50 1
1 1 85 ... 31 0
2 8 183 ... 32 1
3 1 89 ... 21 0
4 0 137 ... 33 1
[5 rows x 9 columns]
Outcome
0 500
1 268
dtype: int64
knn1:0.7705627705627706
knn2:0.7489177489177489
knn3:0.7619047619047619
从这一次运行的结果来看,两个变种的准确率都略低于普通的 knn。不过三者的差距很小(约 1~2 个百分点),而且这里只做了一次随机划分,各模型的 k 值也不相同(3 和 5),特征也没有做标准化,所以仅凭这个结果还不能断定改进后的算法不如普通的 knn。