Taxicab geometry, better known as the Manhattan distance, is a term coined by Hermann Minkowski in the 19th century. It is a metric used in geometric measure spaces: the distance between two points is the sum of the absolute differences of their coordinates in a standard coordinate system, i.e. d(x, y) = |x1 - y1| + |x2 - y2| + ... + |xn - yn|.
In numpy, the Manhattan distances from a test point Y_test[i] to every row of the training set X_train can be written in a single vectorized expression:
distances = np.sum(np.abs(X_train - np.tile(Y_test[i], (X_train.shape[0], 1))), axis=1)
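As a quick sanity check, here is a minimal, self-contained sketch with two made-up points (not part of the project data) just to confirm what the expression computes:
import numpy as np

a = np.array([1.0, 2.0])   # hypothetical point (1, 2)
b = np.array([4.0, 6.0])   # hypothetical point (4, 6)

# Manhattan distance: |1 - 4| + |2 - 6| = 3 + 4 = 7
print(np.sum(np.abs(a - b)))   # 7.0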
Of course, numpy itself also lets you obtain several of these metrics, the Manhattan distance included, directly from the Minkowski distance by changing the value of p (the ord argument of np.linalg.norm). The details are as follows:
import numpy as np
from scipy.spatial import distance as dist  # needed by jaccard_similarity_coefficient below


def minkowski_distance(vec1, vec2, p=3):
    """
    Minkowski distance
    p = 1 gives the Manhattan distance
    p = 2 gives the Euclidean distance
    p → ∞ gives the Chebyshev distance
    :param vec1: first vector (1-D numpy array)
    :param vec2: second vector (1-D numpy array)
    :param p: order of the norm
    :return: Minkowski distance between vec1 and vec2
    """
    # return sum([abs(x - y) ** p for (x, y) in zip(vec1, vec2)]) ** (1 / p)
    return np.linalg.norm(vec1 - vec2, ord=p)

def cosine_distance(vec1, vec2):
    """
    Cosine of the angle between the two vectors
    (strictly speaking this is the cosine similarity; 1 minus this value is the cosine distance)
    :param vec1: first vector
    :param vec2: second vector
    :return: cosine similarity of vec1 and vec2
    """
    vec1_norm = np.linalg.norm(vec1)
    vec2_norm = np.linalg.norm(vec2)
    return vec1.dot(vec2) / (vec1_norm * vec2_norm)

def euclidean_distance(vec1, vec2):
    """
    Euclidean distance
    :param vec1: first vector
    :param vec2: second vector
    :return: Euclidean (L2) distance between vec1 and vec2
    """
    # return np.sqrt(np.sum(np.square(vec1 - vec2)))
    # return sum([(x - y) ** 2 for (x, y) in zip(vec1, vec2)]) ** 0.5
    return np.linalg.norm(vec1 - vec2, ord=2)

def manhattan_distance(vec1, vec2):
    """
    Manhattan distance
    :param vec1: first vector
    :param vec2: second vector
    :return: Manhattan (L1) distance between vec1 and vec2
    """
    # return np.sum(np.abs(vec1 - vec2))
    return np.linalg.norm(vec1 - vec2, ord=1)

def chebyshev_distance(vec1, vec2):
    """
    Chebyshev distance
    :param vec1: first vector
    :param vec2: second vector
    :return: Chebyshev (L∞) distance between vec1 and vec2
    """
    # return np.abs(vec1 - vec2).max()
    return np.linalg.norm(vec1 - vec2, ord=np.inf)

def hamming_distance(vec1, vec2):
    """
    Hamming distance: the number of positions at which the two vectors differ
    :param vec1: first vector
    :param vec2: second vector
    :return: number of differing components
    """
    return np.shape(np.nonzero(vec1 - vec2)[0])[0]

def jaccard_similarity_coefficient(vec1, vec2):
    """
    Jaccard distance (scipy's 'jaccard' metric returns the Jaccard-Needham dissimilarity)
    :param vec1: first boolean/binary vector
    :param vec2: second boolean/binary vector
    :return: Jaccard distance between vec1 and vec2
    """
    return dist.pdist(np.array([vec1, vec2]), 'jaccard')
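A quick usage sketch of the helpers above (the vectors v1 and v2 are made up purely for illustration, and SciPy is assumed to be installed for the Jaccard helper):
v1 = np.array([1.0, 2.0, 3.0])
v2 = np.array([4.0, 6.0, 3.0])

print(minkowski_distance(v1, v2, p=1))   # 7.0, same as the Manhattan distance
print(manhattan_distance(v1, v2))        # 7.0
print(euclidean_distance(v1, v2))        # 5.0
print(chebyshev_distance(v1, v2))        # 4.0
print(cosine_distance(v1, v2))           # ≈ 0.855
print(hamming_distance(v1, v2))          # 2, two components differ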
This is the problem I ran into in the first exercise of my KNN study: classifying the input test set using either the Manhattan distance or the Euclidean distance. The complete project code is as follows:
import numpy as np
import matplotlib.pyplot as plt
import operator


def creatDataSet():
    # six 2-D training points and their class labels
    group = np.array([[1.0, 2.0],
                      [1.2, 0.1],
                      [0.1, 1.4],
                      [0.3, 3.5],
                      [1.1, 1.0],
                      [0.5, 1.5]])
    lebals = np.array(['A', 'A', 'B', 'B', 'A', 'B'])
    return group, lebals


def KNN_classify(k, dis, X_train, x_train, Y_test):
    assert dis == 'E' or dis == 'M', 'dis must be E or M (E: Euclidean distance, M: Manhattan distance)'
    num_test = Y_test.shape[0]
    leballist = []
    if dis == 'E':
        for i in range(num_test):
            # Euclidean distance from the i-th test point to every training point
            distances = np.sqrt(np.sum(((X_train - np.tile(Y_test[i], (X_train.shape[0], 1))) ** 2), axis=1))
            nearest_k = np.argsort(distances)
            topK = nearest_k[:k]
            classCount = {}
            for j in topK:
                classCount[x_train[j]] = classCount.get(x_train[j], 0) + 1
            sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
            leballist.append(sortedClassCount[0][0])
        return np.array(leballist)
    else:
        for i in range(num_test):
            # Manhattan distance from the i-th test point to every training point
            distances = np.sum(np.abs(X_train - np.tile(Y_test[i], (X_train.shape[0], 1))), axis=1)
            nearest_k = np.argsort(distances)
            topK = nearest_k[:k]
            classCount = {}
            for j in topK:
                classCount[x_train[j]] = classCount.get(x_train[j], 0) + 1
            sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
            leballist.append(sortedClassCount[0][0])
        return np.array(leballist)


if __name__ == "__main__":
    group, lebals = creatDataSet()
    y_test_res = KNN_classify(1, 'M', group, lebals, np.array([[1.0, 2.1], [0.4, 2.0]]))
    print(y_test_res)
    plt.show()
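Since matplotlib is imported but nothing is drawn before plt.show(), here is a minimal plotting sketch (assuming the group and lebals arrays returned by creatDataSet() and the same two test points as above) that scatters the training points by class:
# plot the training points, one colour per class label
for label in np.unique(lebals):
    pts = group[lebals == label]
    plt.scatter(pts[:, 0], pts[:, 1], label='class ' + label)
# plot the two test points as crosses
plt.scatter([1.0, 0.4], [2.1, 2.0], marker='x', label='test points')
plt.legend()
plt.show()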