大規模向量相似度計算(一)——hnswlib的基本使用示例

基於向量相似的召回策略在推薦系統中越發普及和成熟。

基於向量相似的召回被更加廣泛地應用到推薦系統當中。

今天我們來聊一聊如何基於hnswlib來進行大規模的物品和物品之間的相似度計算。

1. 安裝 hnswlib

pip install hnswlib

2. 構建索引

# Create an index over dim-dimensional vectors using the 'l2' space.
# (dim and num_elements must be defined by the caller.)
import hnswlib
index = hnswlib.Index(space='l2', dim=dim)
# Initialise the index for at most num_elements items; ef_construction and M
# are hnswlib build parameters.
index.init_index(max_elements=num_elements, ef_construction=200, M=16)

3. 添加向量

# Insert a batch of vectors into the index together with their labels.
index.add_items(vectors, labels)

4. 近鄰檢索

# Set the query-time ef parameter to 1.2 * k (truncated to an int).
index.set_ef(int(k * 1.2))
# Look up the k nearest neighbours of data; returns their labels and distances.
labels, distances = index.knn_query(data, k=k)

5. 完整代碼

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import hnswlib
import argparse
import requests
import logging
import rediscluster as rc
from contextlib import closing
from retry import retry
import numpy as np

# Configure the root logger: timestamped records, INFO level and above.
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s %(levelname)s %(message)s',
    level=logging.INFO,
)
# Set the level explicitly as well, so it applies even when basicConfig is a
# no-op because handlers were already installed.
logging.getLogger().setLevel(logging.INFO)


def get_image_vector(path, batch_size):
    """Stream labels and vectors from a tab-separated text file in batches.

    Every non-blank line must contain a label and a comma-separated list of
    floats, separated by a tab. Any further tab-separated columns are
    ignored. Blank lines are skipped.

    Args:
        path: Path of the text file to read.
        batch_size: Maximum number of rows per yielded batch.

    Yields:
        ``(labels, vectors)`` tuples of NumPy arrays built from up to
        ``batch_size`` consecutive rows. Labels are kept exactly as read from
        the file (strings). The final batch may be shorter than
        ``batch_size``; an empty batch is never yielded.

    Raises:
        IndexError: If a non-blank line has no tab-separated vector column.
        ValueError: If a vector component cannot be parsed as a float.
    """
    image_labels = []
    image_vectors = []
    # The context manager closes the file once iteration finishes or the
    # generator itself is closed.
    with open(path, "r") as source:
        for raw_line in source:
            stripped = raw_line.strip()
            if not stripped:
                # Tolerate blank lines, e.g. a trailing newline at end of file.
                continue
            fields = stripped.split("\t")
            label, vector = fields[0], fields[1]
            image_labels.append(label)
            image_vectors.append([float(value) for value in vector.split(",")])
            if len(image_labels) >= batch_size:
                yield np.array(image_labels), np.array(image_vectors)
                image_labels = []
                image_vectors = []
    # Flush the remainder only when there is one, so consumers never receive
    # an empty batch (row count an exact multiple of batch_size, or no rows).
    if image_labels:
        yield np.array(image_labels), np.array(image_vectors)


def build_hnsw_index(path, dim, num_elements, batch_size):
    """Build an hnswlib index from the labelled vectors stored in ``path``.

    Args:
        path: Path of the vector file, read through ``get_image_vector``.
        dim: Dimensionality passed to ``hnswlib.Index``.
        num_elements: Capacity passed to ``init_index`` as ``max_elements``.
        batch_size: Number of rows inserted per ``add_items`` call.

    Returns:
        The ``hnswlib.Index`` (``'l2'`` space) holding every vector read.
    """
    index = hnswlib.Index(space='l2', dim=dim)
    index.init_index(max_elements=num_elements, ef_construction=200, M=16)
    added = 0
    for labels, vectors in get_image_vector(path, batch_size):
        if len(labels) == 0:
            # Nothing to insert; skip defensively rather than hand hnswlib
            # an empty array.
            continue
        # NOTE(review): labels are passed as read from the file (strings);
        # hnswlib presumably needs integer-convertible ids -- confirm.
        index.add_items(vectors, labels)
        # Log the real cumulative row count so a short final batch is not
        # reported as a full one.
        added += len(labels)
        logging.info("add items index:{}".format(added))
    return index


def top_k(index, data, k):
    """Query ``index`` for the ``k`` nearest neighbours of ``data``.

    Before querying, the index's ``ef`` parameter is set to ``1.2 * k``
    truncated to an integer.

    Args:
        index: Object exposing ``set_ef`` and ``knn_query``.
        data: Query vector(s), forwarded to ``knn_query``.
        k: Number of neighbours requested per query.

    Returns:
        The ``(labels, distances)`` pair produced by ``knn_query``.
    """
    query_ef = int(k * 1.2)
    index.set_ef(query_ef)
    neighbour_labels, neighbour_distances = index.knn_query(data, k=k)
    return neighbour_labels, neighbour_distances


def main():
    """Build the index, then write every item's top-k neighbours to a file.

    Options are read from the module-level ``args`` dict (keys ``input``,
    ``dim``, ``size``, ``batch_size``, ``k`` and ``output``). The input file
    is read twice: once to build the index and once to query it.

    Each output line has the form
    ``label<TAB>target:distance,target:distance,...``.
    """
    path = args["input"]
    dim = args["dim"]
    num_elements = args["size"]
    batch_size = args["batch_size"]
    k = args["k"]
    output_path = args["output"]
    # Open the output first so a bad path fails before the index is built;
    # the context manager guarantees the file is flushed and closed.
    with open(output_path, "w") as output:
        index = build_hnsw_index(path, dim, num_elements, batch_size)
        processed = 0
        for labels, vectors in get_image_vector(path, batch_size):
            if len(labels) == 0:
                # Nothing to query; skip defensively rather than hand hnswlib
                # an empty array.
                continue
            targets, distances = top_k(index, vectors, k)
            for label, label_targets, label_distances in zip(labels, targets, distances):
                neighbours = ",".join(
                    "{}:{}".format(target, distance)
                    for target, distance in zip(label_targets, label_distances)
                )
                output.write("{}\t{}\n".format(label, neighbours))
            # Log the real cumulative row count so a short final batch is not
            # reported as a full one.
            processed += len(labels)
            logging.info("build top k index:{}".format(processed))


if __name__ == "__main__":
    # Command-line interface; main() reads the parsed options from the
    # module-level ``args`` dict.
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--input", type=str, default="",
        help="path of input items vector",
    )
    ap.add_argument(
        "--dim", type=int, default=2048,
        help="dim of items vector",
    )
    ap.add_argument(
        "--size", type=int, default=6000000,
        help="elements number of the items",
    )
    ap.add_argument(
        "--batch_size", type=int, default=100000,
        help="batch size to process",
    )
    ap.add_argument(
        "--k", type=int, default=100,
        help="top k result",
    )
    ap.add_argument(
        "--output", type=str, default="",
        help="output path of the result",
    )
    args = vars(ap.parse_args())
    main()

6. 測試數據和代碼下載

點擊上方鏈接可下載,或者關注微信公衆號:查叔筆錄。回覆(hnswlib)下載

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章