# Code from the book "Machine Learning in Action" (机器学习实战).
# !/usr/bin/python
# -*- coding: utf-8 -*-
import numpy as np
import operator
def creatDataSet():
    """Return the four-point toy data set used to try out the classifier.

    Returns:
        A tuple ``(group, labels)``: ``group`` is a 4x2 float array of
        sample coordinates and ``labels`` is the list of class names,
        where ``labels[i]`` is the class of ``group[i]``.
    """
    # Two samples near (1, 1) belong to class 'A', two near (0, 0) to 'B'.
    points = [
        [1.0, 1.1],
        [1.0, 1.0],
        [0.0, 0.0],
        [0.0, 0.1],
    ]
    classes = ['A'] * 2 + ['B'] * 2
    return np.array(points), classes
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training rows.

    Nearness is measured with the Euclidean distance between ``inX`` and
    every row of ``dataSet``.

    Args:
        inX: The sample to classify; one row with as many values as
            ``dataSet`` has columns.
        dataSet: Training samples, one row per sample.
        labels: Training labels; ``labels[i]`` is the class of row ``i``
            of ``dataSet``.
        k: Number of nearest rows that take part in the vote.

    Returns:
        The label that received the most votes among the k nearest rows.
        Ties are resolved by the iteration order of the vote dictionary.

    Raises:
        IndexError: If ``k`` is smaller than 1 or larger than the number
            of rows in ``dataSet``.
    """
    # Broadcasting subtracts every training row from inX, giving
    # (x0 - x1), (y0 - y1), ... per row without building a tiled copy.
    diffMat = np.asarray(inX) - np.asarray(dataSet)
    sqDistances = (diffMat ** 2).sum(axis=1)  # row-wise sum of squares
    distances = sqDistances ** 0.5  # Euclidean distance to each row
    sortedDistIndices = distances.argsort()  # row indices, nearest first

    classCount = {}  # label -> number of votes
    for i in range(k):
        voteIlabel = labels[sortedDistIndices[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # items() is available on both Python 2 and Python 3, whereas
    # iteritems() only exists on Python 2.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def file2matrix(filename):
    """Read a tab-separated text file into a feature matrix and a label list.

    Each non-blank line supplies one sample: its first three fields are
    the features and its last field is the integer class label. Blank
    lines are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: ``returnMat`` is a float
        array with one row per sample and 3 columns, and
        ``classLabelVector`` is the list of integer labels in file order.

    Raises:
        ValueError: If a feature field is not a number or the last field
            of a line is not an integer.
    """
    # The with-block closes the file even when parsing fails later on.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = np.zeros((len(rows), 3))  # one row per sample, 3 features
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))  # last field is the label
    return returnMat, classLabelVector
def autoNorm(dataSet):
    """Min-max normalise every column of ``dataSet``.

    Each value becomes ``(x - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of 0; such a column
    is normalised to all zeros instead of producing NaN.

    Args:
        dataSet: 2-D numeric array, one row per sample.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: ``normDataSet`` is the
        normalised array, ``ranges`` is the per-column ``max - min`` (still
        0 for constant columns), and ``minVals`` is the per-column minimum.
    """
    dataSet = np.asarray(dataSet)
    minVals = dataSet.min(0)  # per-column minimum
    maxVals = dataSet.max(0)  # per-column maximum
    ranges = maxVals - minVals  # denominator: max - min
    # Divide by 1.0 where the range is 0 so constant columns do not
    # trigger a division by zero; their numerator is 0 anyway.
    safeRanges = np.where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
# Test: hold-out evaluation of the kNN classifier on the dating data set.
import knn

hoRatio = 0.1  # fraction of the samples used as the test set
datingDataMat, datingLabels = knn.file2matrix('Data/datingTestSet2.txt')  # load the data set
normMat, ranges, minVals = knn.autoNorm(datingDataMat)  # min-max normalise the features
m = normMat.shape[0]
numTestVecs = int(m * hoRatio)  # number of test rows
errorCount = 0.0
classifierResultAll = []  # every prediction, in test order
for i in range(numTestVecs):
    # The first numTestVecs rows are classified against the remaining rows.
    classifierResult = knn.classify0(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 3)
    classifierResultAll.append(classifierResult)
    # Calling print with one pre-formatted string gives the same output on
    # Python 2 and Python 3; the print statement is Python 2 only.
    print("the classifier came back with :%d, the real answer is : %d" % (classifierResult, datingLabels[i]))
    if classifierResult != datingLabels[i]:
        errorCount += 1
print("the total error rate is : %f" % (errorCount / float(numTestVecs)))