1、KNN分类算法
KNN分类算法(K-Nearest-Neighbors Classification),又叫K近邻算法,是一个概念极其简单,而分类效果又很优秀的分类算法。
它的核心思想就是,要确定测试样本属于哪一类,就寻找所有训练样本中与该测试样本“距离”最近的前K个样本,然后看这K个样本大部分属于哪一类,那么就认为这个测试样本也属于这一类。简单地说,就是让最相似的K个样本来投票决定。
数据集下载地址:https://archive.ics.uci.edu/ml/machine-learning-databases/iris/
数据集信息:
这也许是模式识别文献中最著名的数据集。费舍尔(Fisher)的论文是该领域的经典之作,至今仍被频繁引用(例如可参见Duda & Hart)。数据集包含3个类别,每类50个实例,其中
每个类别对应一种鸢尾属植物。其中一类与另外两类是线性可分的;后两类彼此之间不是线性可分的。
预测属性:鸢尾属植物的类别。
UCI中的Iris(鸢尾属植物)数据集。Iris数据包含150条样本记录,分别取自三种不同的鸢尾属植物setosa、versicolor和virginica的花朵样本,每一
类各50条记录,其中每条记录有4个属性:萼片长度(sepal length)、萼片宽度(sepal width)、花瓣长度(petal length)和花瓣宽度(petal width)。
这是一个极其简单的问题域。
# -*- coding: UTF-8 -*-
"""Hand-written k-nearest-neighbours (KNN) classifier for a CSV data set.

Created on 2016/7/17
@author: chen

Each CSV row is expected to hold four numeric feature columns followed by
a class label in the last column.
"""
import csv  # CSV parsing
import random  # random train/test split
import math
import operator
from collections import Counter

# Number of leading numeric feature columns converted to float in each row.
_NUM_FEATURES = 4


def loadDataset(filename, split, trainingSet=None, testSet=None):
    """Read a CSV file and randomly split its rows into train and test sets.

    Args:
        filename: Path of the CSV file to read.
        split: Probability (0..1) that a row is put into ``trainingSet``;
            otherwise it goes into ``testSet``.
        trainingSet: List that receives the training rows (appended in place).
            A fresh list is created when omitted.
        testSet: List that receives the test rows (appended in place).
            A fresh list is created when omitted.

    Returns:
        The ``(trainingSet, testSet)`` pair, so the function is also usable
        without pre-allocated output lists.

    Raises:
        ValueError: If one of the first four fields of a row is not numeric.
        IndexError: If a non-blank row has fewer than four fields.
    """
    # Avoid mutable default arguments: defaults would be shared across calls.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []

    # Text mode so csv.reader receives str on both Python 2 and Python 3.
    with open(filename, "r") as csvfile:
        for row in csv.reader(csvfile):
            # Skip blank lines instead of unconditionally dropping the last
            # row (which lost a real sample when no blank line was present).
            if not any(field.strip() for field in row):
                continue
            for col in range(_NUM_FEATURES):
                row[col] = float(row[col])
            if random.random() < split:
                trainingSet.append(row)
            else:
                # The current row goes to the test set. Indexing by the
                # leftover feature-loop variable here previously made every
                # test sample the same row.
                testSet.append(row)
    return trainingSet, testSet


def euclideanDistance(instance1, instance2, length):
    """Return the Euclidean distance over the first ``length`` components."""
    squared = sum((instance1[i] - instance2[i]) ** 2 for i in range(length))
    return math.sqrt(squared)


def getNeighbors(trainingSet, testInstance, k):
    """Return the ``k`` training rows closest to ``testInstance``.

    The last element of ``testInstance`` is treated as its label and is
    excluded from the distance computation. When ``k`` exceeds the number of
    training rows, all training rows are returned (nearest first).
    """
    length = len(testInstance) - 1
    # Distance from the test instance to every training instance.
    distances = [
        (candidate, euclideanDistance(testInstance, candidate, length))
        for candidate in trainingSet
    ]
    distances.sort(key=operator.itemgetter(1))
    # Slicing tolerates k > len(trainingSet); max() keeps k < 0 returning [].
    return [candidate for candidate, _ in distances[:max(k, 0)]]


def getResponse(neighbors):
    """Return the majority class label (last column) among ``neighbors``.

    Raises:
        IndexError: If ``neighbors`` is empty.
    """
    votes = Counter(neighbor[-1] for neighbor in neighbors)
    return votes.most_common(1)[0][0]


def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose label equals its prediction.

    An empty ``testSet`` yields 0.0 instead of a division by zero.
    """
    if not testSet:
        return 0.0
    correct = sum(
        1 for i, row in enumerate(testSet) if row[-1] == predictions[i]
    )
    return (correct / float(len(testSet))) * 100.0


def main():
    """Split the data file, classify each test row with k=3, print results."""
    trainingSet = []  # training rows
    testSet = []  # test rows
    split = 0.67  # fraction of rows used for training
    loadDataset(r"../data/iris.txt", split, trainingSet, testSet)
    print("Train set :" + repr(len(trainingSet)))
    print("Test set :" + repr(len(testSet)))

    predictions = []
    k = 3
    for sample in testSet:
        neighbors = getNeighbors(trainingSet, sample, k)
        result = getResponse(neighbors)
        predictions.append(result)
        print(">predicted = " + repr(result) + ",actual = " + repr(sample[-1]))
    accuracy = getAccuracy(testSet, predictions)
    print("Accuracy:" + repr(accuracy) + "%")


if __name__ == "__main__":
    main()
为了检验上述程序是否正确,编写以下代码作为对照,测试时只需运行上面的代码。
#coding:utf-8
"""Cross-check: classify one sample with scikit-learn's KNN on Iris.

Created on 2016-07-17
@author: chen
"""
from sklearn.datasets import load_iris
from sklearn import neighbors
import sklearn

# Load the Iris data set and display it.
iris = load_iris()
# print() with a single argument behaves the same on Python 2 and Python 3;
# the former print statements were a SyntaxError on Python 3.
print(iris)

knn = neighbors.KNeighborsClassifier()
# Fit the classifier on the whole data set (no train/test split here).
knn.fit(iris.data, iris.target)

# Predict the class index of a single four-feature sample.
predict = knn.predict([[0.1, 0.2, 0.3, 0.4]])
print(predict)
# Map the predicted class index back to its class name.
print(iris.target_names[predict])
Train set :92
Test set :39
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
>predicted = 'Iris-setosa',actual = 'Iris-setosa'
Accuracy:100.0%
[Finished in 1.4s]