隨着基於向量相似度的召回策略越發普及和成熟，
這類召回方式也被更加廣泛地應用到推薦系統當中。
今天我們來聊一聊如何基於hnswlib來進行大規模的物品和物品之間的相似度計算。
1. 安裝 hnswlib
pip install hnswlib
2. 構建索引
# Create an index for `dim`-dimensional vectors in the 'l2' space, then
# initialise it with `num_elements` as its max_elements setting.
# NOTE(review): `dim` and `num_elements` must be defined by the caller.
import hnswlib
index = hnswlib.Index(space='l2', dim=dim)
index.init_index(max_elements=num_elements, ef_construction=200, M=16)
3. 添加向量
# Insert `vectors` into the index, passing `labels` as their ids.
index.add_items(vectors, labels)
4. 近鄰檢索
# Set the query-time ef to int(k * 1.2), then ask the index for the
# k nearest neighbours of `data`; the call returns labels and distances.
index.set_ef(int(k * 1.2))
labels, distances = index.knn_query(data, k=k)
5. 完整代碼
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import hnswlib
import argparse
import requests
import logging
import rediscluster as rc
from contextlib import closing
from retry import retry
import numpy as np
# Configure the root logger: INFO level, timestamped and level-tagged records.
_root_logger = logging.getLogger()
_root_logger.setLevel(logging.INFO)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
def get_image_vector(path, batch_size):
    """Stream labelled vectors from a tab-separated text file in batches.

    Every non-blank line of ``path`` is split on TAB: the first field is the
    label (kept as a string) and the second field is a comma-separated list
    of numbers that are parsed as floats.

    Args:
        path: Path of the text file to read.
        batch_size: Number of rows collected before a batch is yielded.

    Yields:
        ``(labels, vectors)`` tuples of NumPy arrays. The last batch may hold
        fewer than ``batch_size`` rows; an empty batch is never yielded.

    Raises:
        IndexError: If a non-blank line has no TAB-separated second field.
        ValueError: If a vector component cannot be parsed as a float.
    """
    image_labels = []
    image_vectors = []
    # The context manager closes the file once iteration finishes, an error
    # is raised, or the generator itself is closed.
    with open(path, "r") as source:
        for raw_line in source:
            stripped = raw_line.strip()
            if not stripped:
                # Blank lines carry no data; skipping them avoids an
                # IndexError on the field access below.
                continue
            fields = stripped.split("\t")
            label, vector = fields[0], fields[1]
            image_labels.append(label)
            image_vectors.append([float(value) for value in vector.split(",")])
            if len(image_labels) >= batch_size:
                # np.array copies the data, so the lists can be reused.
                yield np.array(image_labels), np.array(image_vectors)
                image_labels.clear()
                image_vectors.clear()
    # Flush the remainder only when there is one, so callers never receive
    # an empty (shape (0,)) array.
    if image_labels:
        yield np.array(image_labels), np.array(image_vectors)
def build_hnsw_index(path, dim, num_elements, batch_size):
    """Build an hnswlib index from the vectors stored in ``path``.

    The index is created in the 'l2' space with ``ef_construction=200`` and
    ``M=16``, then filled batch by batch from ``get_image_vector``.

    Args:
        path: Path of the TAB-separated vector file.
        dim: Dimensionality passed to ``hnswlib.Index``.
        num_elements: Value passed as ``max_elements`` to ``init_index``.
        batch_size: Number of rows read and inserted per batch.

    Returns:
        The populated ``hnswlib.Index``.
    """
    index = hnswlib.Index(space='l2', dim=dim)
    index.init_index(max_elements=num_elements, ef_construction=200, M=16)
    added = 0
    for labels, vectors in get_image_vector(path, batch_size):
        # len() is used because truth-testing a NumPy array is ambiguous.
        # An empty batch would reach add_items as a shape-(0,) array, which
        # does not match the index dimensionality, so skip it.
        if len(labels) == 0:
            continue
        # NOTE(review): labels are passed on as the strings read from the
        # file; confirm they are numeric ids that hnswlib accepts.
        index.add_items(vectors, labels)
        # Log the real running total rather than batches * batch_size, which
        # over-reports when the last batch is only partially filled.
        added += len(labels)
        logging.info("add items index:{}".format(added))
    return index
def top_k(index, data, k):
    """Return the ``k`` nearest neighbours of ``data`` from ``index``.

    The index's ef is set to ``int(k * 1.2)`` before the query is issued.

    Args:
        index: Object exposing ``set_ef`` and ``knn_query``.
        data: Query vectors forwarded unchanged to ``knn_query``.
        k: Number of neighbours requested.

    Returns:
        A ``(labels, distances)`` tuple as produced by ``index.knn_query``.
    """
    search_ef = int(k * 1.2)
    index.set_ef(search_ef)
    neighbour_labels, neighbour_distances = index.knn_query(data, k=k)
    return neighbour_labels, neighbour_distances
def main():
    """Build the index, then write every item's top-k neighbours to a file.

    Reads its settings from the module-level ``args`` dict (keys ``input``,
    ``dim``, ``size``, ``batch_size``, ``k`` and ``output``), which the
    ``__main__`` block fills from the command line.

    Each output line has the form ``<label>`` TAB
    ``<target>:<distance>,<target>:<distance>,...``.
    """
    path = args["input"]
    dim = args["dim"]
    num_elements = args["size"]
    batch_size = args["batch_size"]
    k = args["k"]
    output_path = args["output"]
    # Open the output first so a bad path fails before the index build, and
    # use a context manager so the file is flushed and closed on any exit.
    with open(output_path, "w") as output:
        index = build_hnsw_index(path, dim, num_elements, batch_size)
        written = 0
        for labels, vectors in get_image_vector(path, batch_size):
            # len() is used because truth-testing a NumPy array is ambiguous.
            # An empty batch cannot be queried, so skip it.
            if len(labels) == 0:
                continue
            targets, distances = top_k(index, vectors, k)
            rows = []
            for label, label_targets, label_distances in zip(labels, targets, distances):
                neighbours = ",".join(
                    "{}:{}".format(target, distance)
                    for target, distance in zip(label_targets, label_distances)
                )
                rows.append("{}\t{}\n".format(label, neighbours))
            output.writelines(rows)
            # Log the real running total rather than batches * batch_size.
            written += len(rows)
            logging.info("build top k index:{}".format(written))
if __name__ == "__main__":
    # Parse the command line into the module-level ``args`` dict that
    # ``main`` reads, then run the job.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input", default="", type=str, help="path of input items vector"
    )
    parser.add_argument(
        "--dim", default=2048, type=int, help="dim of items vector"
    )
    parser.add_argument(
        "--size", default=6000000, type=int, help="elements number of the items"
    )
    parser.add_argument(
        "--batch_size", default=100000, type=int, help="batch size to process"
    )
    parser.add_argument(
        "--k", default=100, type=int, help="top k result"
    )
    parser.add_argument(
        "--output", default="", type=str, help="output path of the result"
    )
    args = vars(parser.parse_args())
    main()
6. 測試數據和代碼下載
點擊上方鏈接可下載，或者關注微信公衆號「查叔筆錄」，回覆「hnswlib」下載。