Top-k similarity performance comparison (kd-tree, ball tree, faiss, annoy, linear search)

There are many algorithms for computing top-k vector similarity.

Top-k vector similarity shows up in many scenarios: recommendation systems often use item2vec and precompute top-k item similarities offline, search uses query2vec top-k lookups, and word2vec itself needs the top-k nearest words. This post picks a few classic approaches and compares their performance: kd-tree, ball tree, faiss, linear search, and annoy. Everything runs on a local Mac CPU, and the vectors come from a word2vec model trained on a small game-domain corpus. Two preparation steps matter: the annoy index has to be built and saved to disk beforehand, and the vectors are L2-normalized during recall so that cosine similarity and Euclidean distance produce the same ranking, which makes the methods directly comparable.
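Since the script below only loads an already-saved annoy.model, here is a minimal sketch of how that index could have been built from the same word2vec vectors. The 200-dimensional size matches the code below; the number of trees (10) is an assumption, and newer Annoy versions also require an explicit metric argument (e.g. 'angular'):

import gensim
from annoy import AnnoyIndex

path = "/Users/zhoumeixu/Documents/python/word2vec/bin/vectors.bin"
model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True, unicode_errors='ignore')
model.init_sims(replace=True)  # L2-normalize so angular (cosine) and Euclidean rank identically

annoy_model = AnnoyIndex(200)  # 200-dim vectors; pass 'angular' as the metric on newer Annoy versions
for idx, word in enumerate(model.vocab.keys()):
    annoy_model.add_item(idx, model[word])  # item ids must match the word2idx order used below
annoy_model.build(10)  # tree count is an assumption; more trees -> better recall, larger index
annoy_model.save('/Users/zhoumeixu/Documents/python/word2vec/bin/annoy.model')

With the index saved, the full comparison script follows: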

import time, random
import numpy as np
from sklearn.neighbors import KDTree, BallTree
from sklearn import preprocessing
from annoy import AnnoyIndex

import faiss

import gensim
import warnings
warnings.filterwarnings("ignore")

path = "/Users/zhoumeixu/Documents/python/word2vec/bin/vectors.bin"

model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True,unicode_errors='ignore')
model.init_sims(replace=True)  # L2-normalize: for unit vectors, (a-b)^2 = a^2 + b^2 - 2ab = 2 - 2*cos(a, b)
words = ["王者"] * 1000  # repeat the same query 1,000 times for timing
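# Illustrative sanity check: with unit-norm vectors, squared Euclidean distance equals
# 2 - 2 * cosine similarity, so both rank neighbors identically.
# "王者" and "榮耀" are words known to be in this model (see the result section below).
_a, _b = model["王者"], model["榮耀"]
assert abs(np.sum((_a - _b) ** 2) - (2 - 2 * np.dot(_a, _b))) < 1e-4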

class ANNSearch:
    def __init__(self, model):
        # instance-level containers (class-level mutable attributes would be shared across instances)
        self.word2idx = {}
        self.idx2word = {}
        self.data = []
        for counter, key in enumerate(model.vocab.keys()):
            self.data.append(model[key])
            self.word2idx[key] = counter
            self.idx2word[counter] = key

        # The vectors are already unit-normalized by init_sims(replace=True) above; otherwise,
        # L2-normalize here so that cosine similarity and Euclidean distance are equivalent:
        # self.data = preprocessing.normalize(np.array(self.data), norm='l2')
        self.data = np.array(self.data)

        # ball tree and kd-tree (leaf_size is a tunable hyperparameter)
        self.balltree = BallTree(self.data, leaf_size=100)
        self.kdtree = KDTree(self.data, leaf_size=100)


        # Alternative: an exact (brute-force) inner-product index
        # self.faiss_index = faiss.IndexFlatIP(200)
        # self.faiss_index.add(self.data)

        # IVF index: a coarse quantizer over 120 clusters; 80 clusters are probed per query
        self.quantizer = faiss.IndexFlatIP(200)  # IndexIVFFlat needs another index as its coarse quantizer
        self.faiss_index = faiss.IndexIVFFlat(self.quantizer, 200, 120, faiss.METRIC_L2)
        self.faiss_index.train(self.data)
        self.faiss_index.nprobe = 80
        self.faiss_index.add(self.data)  # adding is somewhat slower for an IVF index


    # the query word itself always comes back as the nearest neighbor, so skip index 0
    def search_by_vector_kd(self, v, k=10):
        dists, inds = self.kdtree.query([v], k)
        return zip([self.idx2word[idx] for idx in inds[0][1:]], dists[0][1:])

    # same as above, but using the ball tree
    def search_by_vector_ball(self, v, k=10):
        dists, inds = self.balltree.query([v], k)
        return zip([self.idx2word[idx] for idx in inds[0][1:]], dists[0][1:])

    def search(self, query, k=10, type="kd"):
        vector = self.data[self.word2idx[query]]
        if type == "kd":
            return self.search_by_vector_kd(vector, k)
        else:
            return self.search_by_vector_ball(vector, k)

    def search_by_faiss(self, query, k=10):
        vector = self.data[self.word2idx[query]]
        dists, inds = self.faiss_index.search(vector.reshape(-1, 200), k)
        return zip([self.idx2word[idx] for idx in inds[0][1:]], dists[0][1:])


    def search_by_annoy(self, query, annoymodel, k=10):
        index = self.word2idx[query]
        result = annoymodel.get_nns_by_item(index, k)
        return [self.idx2word[idx] for idx in result[1:]]  # skip the query word itself




def time_test():

    # Linear Search
    start = time.time()
    for word in words:
        model.most_similar(word, topn=10)
    stop = time.time()
    print("time/query by (gensim's) Linear Search = %.2f s" % (float(stop - start)))

    search_model = ANNSearch(model)

    # faiss search
    start = time.time()
    for word in words:
        search_model.search_by_faiss(word, k=10)
    stop = time.time()
    print("time/query by faiss Search = %.2f s" % (float(stop - start)))

    # KDTree Search
    start = time.time()
    for word in words:
        search_model.search(word, k=10)
    stop = time.time()
    print("time/query by kdTree Search = %.2f s" % (float(stop - start)))

    ## ballTree Search
    start = time.time()
    for word in words:
        search_model.search(word, k=10, type="ball")
    stop = time.time()
    print("time/query by BallTree Search = %.2f s" % (float(stop - start)))


    # annoy search
    annoy_model = AnnoyIndex(200)
    annoy_model.load('/Users/zhoumeixu/Documents/python/word2vec/bin/annoy.model')
    start = time.time()
    for word in words:
        search_model.search_by_annoy(word, annoy_model, k=10)
    stop = time.time()
    print("time/query by annoy Search = %.2f s" % (float(stop - start)))





def result_test():
    print("gensim:",model.most_similar("王者", topn=5))

    search_model = ANNSearch(model)

    print("kd tree:",list(search_model.search("王者", k=6)))

    print("ball tree:",list(search_model.search("王者", k=6,type="ball")))

    print("faiss:",list(search_model.search_by_fais("王者",k=6)))

    annoy_model = AnnoyIndex(200)
    annoy_model.load('/Users/zhoumeixu/Documents/python/word2vec/bin/annoy.model')

    print("annoy:",list(search_model.search_by_annoy("王者",annoy_model,k=6)))

if __name__=="__main__":
    #time_test()

    result_test()



 

Accuracy:

gensim: [('榮耀', 0.9508273005485535), ('李白', 0.5334799289703369), ('韓信', 0.46967631578445435), ('李元芳', 0.46955445408821106), ('諸葛亮', 0.45386096835136414)]
kd tree: [('榮耀', 0.3136006177182185), ('李白', 0.9659400491036177), ('韓信', 1.029877357244912), ('李元芳', 1.0299956733648787), ('諸葛亮', 1.0451210619937399)]
ball tree: [('榮耀', 0.3136006177182185), ('李白', 0.9659400491036177), ('韓信', 1.029877357244912), ('李元芳', 1.0299956733648787), ('諸葛亮', 1.0451210619937399)]
faiss: [('榮耀', 0.09834535), ('李白', 0.9330402), ('韓信', 1.0606475), ('李元芳', 1.060891), ('諸葛亮', 1.092278)]
annoy: ['榮耀', '李白', '韓信', '李元芳', '諸葛亮']

You can see that every method returns exactly the same top-k neighbors. Only the reported numbers differ: gensim prints cosine similarity, the kd-tree and ball tree print Euclidean distance, and faiss prints squared Euclidean distance; with unit vectors these are related by d^2 = 2 - 2*cos, so the rankings are identical.

 

Timing (total over the 1,000 queries):

time/query by (gensim's) Linear Search = 0.72 s
time/query by faiss Search = 1.09 s
time/query by kdTree Search = 6.80 s
time/query by BallTree Search = 6.42 s
time/query by annoy Search = 0.03 s

Here annoy is clearly the fastest and the kd-tree the slowest; in a GPU environment, faiss would most likely come out on top.
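For reference, a minimal sketch of what moving a faiss index onto a GPU might look like with a CUDA build (faiss-gpu). This is untested here, and the flat inner-product index is an illustrative choice rather than the IVF setup used above:

import faiss

search_model = ANNSearch(model)                          # reuse the class and word2vec model from above
res = faiss.StandardGpuResources()                       # GPU resources (assumes faiss-gpu and a CUDA device)
cpu_index = faiss.IndexFlatIP(200)                       # exact inner-product index over the 200-dim vectors
gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index)    # copy the index to GPU 0
gpu_index.add(search_model.data)                         # add the normalized vectors
dists, inds = gpu_index.search(search_model.data[:1], 10)  # query with one vector, top-10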

 
