推薦系統:ann算法之ngt

在推薦或者搜索場景中,高質量的召回都是很有必要的,有時候ann搜索算法(Approximate Nearest Neighbor)可以幫助我們實現這一個功能
ngt是yahoo日本團隊基於graph和tree做的的ann搜索工具,地址:https://github.com/yahoojapan/NGT
它實現了onng/pnng的算法,對應的論文
onng: https://arxiv.org/abs/1810.07355
pnng: https://www.aclweb.org/anthology/P16-1214.pdf

我最初接觸近鄰搜索算法是從kdtree開始的,kdtree的思路是二叉搜索樹的思路,通過不斷分割子空間,然後鎖定部分子空間進行搜索。但是,kdtree是很容易碰到維度災難的,實際項目中很少使用。github上比較有名的集中ann算法,都可以再這個benchmark找到
在這裏插入圖片描述

開源的ann算法有很多,爲什麼選擇ngt?

  1. 看上面的benchmark,它是可以保持優勢的
  2. github上維護比較勤,代碼的質量比較高
  3. ngt還有一個兄弟開源代碼倉, ngtd,也就是ngt的server版本,主要開發語言是go,調用ngt提供的cgo binding,這個對於算法的工程化非常有用,因爲c++的後端開發成本還是有點高的

快速入門

ngtd這裏不展開講,因爲它跟算法相關的不多,這裏主要講ngt以及go調用ngt的binding

step1:

安裝ngt
https://github.com/yahoojapan/NGT/releases
我用的是1.9.1
下載後解壓,

cd NGT-x.x.x
mkdir build
cd build 
cmake ..
make 
make install

ngt的依賴很少,很容易就安裝成功了,注意可能需要sudo權限

step2:

安裝gongt https://github.com/yahoojapan/gongt

go get -u github.com/yahoojapan/gongt

step3

驗證安裝
隨機生成128維的數據,生成100k條數據,生成索引,然後做10次ann搜索,看看耗時
由於用的是隨機數,這裏不驗證準確性
測試代碼

package main

import (
	"fmt"
	"os"
	"runtime"
	"github.com/yahoojapan/gongt"
	"math/rand"
	"time"
)

func getVectors(dimension, rowCount int) [][]float64 {
	vec := make([][]float64, rowCount)
	for i := 0; i < rowCount; i++ {
		vec[i] = make([]float64, dimension)
		for j := 0; j < dimension; j++ {
			vec[i][j] = rand.Float64()
		}
	}
	return vec
}

func create(indexName string) error{
	vectors := getVectors(128, 100*1000)

	fmt.Printf("generate %d random vectors\n", len(vectors))

	n := gongt.New(indexName).SetObjectType(gongt.Float).SetDimension(len(vectors[0])).Open()
	defer n.Close()

	for _, v := range vectors {
		_, err := n.Insert(v)
		if err!=nil{
			return  err
		}
	}

	if err := n.CreateAndSaveIndex(runtime.NumCPU()); err != nil {
		return err
	}
	return nil
}

func search(n *gongt.NGT) {
	//n.GetDim()
	vectors := getVectors(128, 10)
	fmt.Printf("search %d items\n", len(vectors))

	for k, v := range vectors {
		result, err := n.Search(v, 10, gongt.DefaultEpsilon)
		if err != nil {
			panic(err)
		}
		fmt.Printf("result[%v]:%+v \n", k, result)
	}
}

func main() {
	rand.Seed(10)
	indexName := "testIndex"
	start := time.Now()
	_, err := os.Stat(indexName)
	if err != nil {
		// not exist, then create database
		err := create(indexName)
		if err!=nil{
			panic(err)
		}
		fmt.Printf("create duration:%v\n", time.Now().Sub(start))
		start = time.Now()
	}
	ngt := gongt.New(indexName).Open()
	defer ngt.Close()
	search(ngt)
	fmt.Printf("search duration:%v \n", time.Now().Sub(start))
}

輸出

generate 100000 random vectors
create duration:2m55.182075292s
search 10 items
result[0]:[{ID:4453 Distance:3.9691321849823} {ID:7138 Distance:3.9754562377929688} {ID:20912 Distance:3.979109048843384} {ID:55578 Distance:4.021002769470215} {ID:50904 Distance:4.026379585266113} {ID:25982 Distance:4.03567361831665} {ID:37933 Distance:4.0385355949401855} {ID:12410 Distance:4.049818992614746} {ID:5115 Distance:4.0575408935546875} {ID:9653 Distance:4.06024169921875}] 
result[1]:[{ID:3241 Distance:3.7383930683135986} {ID:47653 Distance:3.769625663757324} {ID:33205 Distance:3.77927303314209} {ID:40472 Distance:3.817638397216797} {ID:38679 Distance:3.834383010864258} {ID:12487 Distance:3.8415846824645996} {ID:17778 Distance:3.861241340637207} {ID:30227 Distance:3.8905539512634277} {ID:91308 Distance:3.903515577316284} {ID:9489 Distance:3.9384703636169434}] 
result[2]:[{ID:10999 Distance:3.5933420658111572} {ID:93639 Distance:3.7078354358673096} {ID:39766 Distance:3.708099603652954} {ID:4030 Distance:3.710101842880249} {ID:34369 Distance:3.7109930515289307} {ID:29719 Distance:3.7181615829467773} {ID:39147 Distance:3.7279679775238037} {ID:68061 Distance:3.7346577644348145} {ID:43473 Distance:3.7695436477661133} {ID:16032 Distance:3.791642189025879}] 
result[3]:[{ID:69316 Distance:3.7236363887786865} {ID:22104 Distance:3.7963035106658936} {ID:10002 Distance:3.807844638824463} {ID:17405 Distance:3.8084936141967773} {ID:40514 Distance:3.8241684436798096} {ID:6449 Distance:3.847242593765259} {ID:23647 Distance:3.86711049079895} {ID:79469 Distance:3.8762102127075195} {ID:21532 Distance:3.8934686183929443} {ID:52289 Distance:3.912428617477417}] 
result[4]:[{ID:29526 Distance:3.6651875972747803} {ID:91662 Distance:3.7099499702453613} {ID:10173 Distance:3.817878007888794} {ID:58227 Distance:3.8235466480255127} {ID:23559 Distance:3.8804848194122314} {ID:95411 Distance:3.8835465908050537} {ID:72048 Distance:3.9418129920959473} {ID:88035 Distance:3.944735527038574} {ID:41345 Distance:3.960390090942383} {ID:75584 Distance:3.9682157039642334}] 
result[5]:[{ID:18617 Distance:3.6518163681030273} {ID:3132 Distance:3.744781732559204} {ID:22819 Distance:3.864849805831909} {ID:67798 Distance:3.87032151222229} {ID:87321 Distance:3.9066879749298096} {ID:23139 Distance:3.9347174167633057} {ID:6745 Distance:3.9412641525268555} {ID:7138 Distance:3.9555041790008545} {ID:97341 Distance:3.959703207015991} {ID:71935 Distance:3.9600508213043213}] 
result[6]:[{ID:83274 Distance:3.7253623008728027} {ID:99977 Distance:3.7412521839141846} {ID:43642 Distance:3.8960955142974854} {ID:80104 Distance:3.9404213428497314} {ID:20479 Distance:3.949878215789795} {ID:94469 Distance:3.954838514328003} {ID:27608 Distance:3.9659769535064697} {ID:407 Distance:3.970268726348877} {ID:24981 Distance:3.9909749031066895} {ID:12989 Distance:4.001890182495117}] 
result[7]:[{ID:56672 Distance:3.8234822750091553} {ID:7893 Distance:3.8627994060516357} {ID:4895 Distance:3.9049932956695557} {ID:30108 Distance:3.9499318599700928} {ID:43823 Distance:3.975146532058716} {ID:75563 Distance:3.9958245754241943} {ID:51034 Distance:4.007666110992432} {ID:61472 Distance:4.03348970413208} {ID:11582 Distance:4.047774791717529} {ID:74277 Distance:4.051519393920898}] 
result[8]:[{ID:13815 Distance:3.8047754764556885} {ID:76880 Distance:3.823500871658325} {ID:30706 Distance:3.9870598316192627} {ID:50794 Distance:4.009201526641846} {ID:93799 Distance:4.011028289794922} {ID:35995 Distance:4.020217418670654} {ID:33510 Distance:4.036275863647461} {ID:90881 Distance:4.04132080078125} {ID:31972 Distance:4.0461859703063965} {ID:33738 Distance:4.0546112060546875}] 
result[9]:[{ID:68548 Distance:3.8462207317352295} {ID:1528 Distance:3.8728442192077637} {ID:14922 Distance:3.8863134384155273} {ID:62409 Distance:3.899791717529297} {ID:71139 Distance:3.915640115737915} {ID:1429 Distance:3.956453800201416} {ID:98372 Distance:3.9577293395996094} {ID:59946 Distance:3.9726102352142334} {ID:18247 Distance:3.9744997024536133} {ID:80053 Distance:3.976346254348755}] 
search duration:98.390243ms

測試環境是一個2核的虛擬機,時間的絕對值沒有太大參考意義。至此驗證gongt安裝成功

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章