機器學習之K-最鄰近算法(3)

本文使用kNN算法及其變種,對Kaggle競賽中的糖尿病數據集進行預測。廢話省略,直接看代碼。

'''
kaggle大賽中的糖尿病預測。
'''
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
def PandasReadData(filepath):
    """Load the Pima diabetes CSV and print quick summary statistics.

    Args:
        filepath: path to a CSV file whose label column is named "Outcome".

    Returns:
        The full DataFrame (feature columns plus the "Outcome" label).
    """
    data = pd.read_csv(filepath)
    print(data.shape)  # e.g. (768, 9) for the Kaggle diabetes dataset
    print(data.head())
    # Class balance of the label: the dataset has 768 samples,
    # 8 features and one binary label (0 / 1).
    print(data.groupby("Outcome").size())
    return data
def splitdata(data):
    """Split the diabetes DataFrame into train/test features and labels.

    Bug fix: the original sliced columns ``1:8``, silently dropping the
    first feature (Pregnancies) and training on only 7 of the 8 features.
    Columns 0..7 are the features; column 8 is the "Outcome" label.

    Returns:
        x_train, x_test, y_train, y_test — a random 70% / 30% split.
    """
    x = data.iloc[:, 0:8]  # all 8 feature columns
    y = data.iloc[:, 8]    # binary label column ("Outcome")
    return train_test_split(x, y, test_size=0.3)
def KNN(x_train, x_test, y_train, y_test):
    """Fit three nearest-neighbour variants and print their test accuracy.

    Bug fix: the original passed ``radius=50.0`` to ``KNeighborsClassifier``,
    which does not perform radius-based classification — the parameter was
    either ignored or rejected, so the third score was never a radius kNN.
    The intended estimator is ``RadiusNeighborsClassifier``.
    """
    # Plain kNN: majority vote among the 3 nearest neighbours.
    knn1 = KNeighborsClassifier(n_neighbors=3)
    knn1.fit(x_train, y_train)
    print("knn1:{}".format(knn1.score(x_test, y_test)))

    # Distance-weighted kNN: closer neighbours get larger voting weight.
    knn2 = KNeighborsClassifier(n_neighbors=5, weights="distance")
    knn2.fit(x_train, y_train)
    print("knn2:{}".format(knn2.score(x_test, y_test)))

    # Radius kNN: vote among all neighbours within radius 50.
    knn3 = RadiusNeighborsClassifier(radius=50.0)
    knn3.fit(x_train, y_train)
    print("knn3:{}".format(knn3.score(x_test, y_test)))
if __name__ == '__main__':
    # Load the dataset, split it, then compare the three kNN variants.
    diabetes = PandasReadData("diabetes.csv")
    KNN(*splitdata(diabetes))

運行結果:

(768, 9)
   Pregnancies  Glucose   ...     Age  Outcome
0            6      148   ...      50        1
1            1       85   ...      31        0
2            8      183   ...      32        1
3            1       89   ...      21        0
4            0      137   ...      33        1

[5 rows x 9 columns]
Outcome
0    500
1    268
dtype: int64
knn1:0.7705627705627706
knn2:0.7489177489177489
knn3:0.7619047619047619

從這一次的結果看,改進後的算法似乎還不如普通的kNN算法。不過要注意:這只是一次隨機劃分(train_test_split 未固定 random_state)的結果,單次比較並不可靠;更嚴謹的做法是用交叉驗證多次取平均之後再下結論。

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章