SPTAG
SPTAG copied to clipboard
Long build and search time with 100k vectors
SPTAG Performance stats with Docker on MACOS Dimensions = 768 Number of vectors = 100k Number of test queries = 3 Distance Metric: L2/Cosine (Time in secs)
Build time : 378.7892005443573 BKT Search time : 0.2576255798339844 BKT (3 queries) Addition time : 185.24826049804688 BKT Deletion Time : 1.4446532726287842 BKT
Build time : 323.30899143218994 KDT Search time : 0.3578498363494873 KDT (n=3) Addition time : 426.7439365386963 KDT Deletion Time : 1.5057392120361328 KDT
The Build and vector Addition time can still be ignored, but the search time is enormous and can't be compromised. The search time is not so high with 10 dim 100k vectors.
This does not seem to be the expected behavior as it claims to be scalable for high dimensional datasets.
I compared it with FAISS on the same data, FAISS [IVFFlatIP] Dimensions = 768 Number of vectors = 100k Number of test queries = 3 Build time : 0.41316819190979004 Search Time: 0.0009050369262695312 (3 queries)
Please use the below script to reproduce the code.
import SPTAG
import numpy as np
import time
n = 100000
k = 3
r = 3
def testBuild(algo, distmethod, x, out):
i = SPTAG.AnnIndex(algo, 'Float', x.shape[1])
i.SetBuildParam("NumberOfThreads", '6')
i.SetBuildParam("DistCalcMethod", distmethod)
if i.Build(x, x.shape[0]):
i.Save(out)
def testBuildWithMetaData(algo, distmethod, x, s, out):
i = SPTAG.AnnIndex(algo, 'Float', x.shape[1])
i.SetBuildParam("NumberOfThreads", '6')
i.SetBuildParam("DistCalcMethod", distmethod)
if i.BuildWithMetaData(x, s, x.shape[0], False):
i.Save(out)
def testSearch(index, q, k):
j = SPTAG.AnnIndex.Load(index)
for t in range(q.shape[0]):
result = j.Search(q[t], k)
print (result[0]) # ids
print (result[1]) # distances
def testSearchWithMetaData(index, q, k):
j = SPTAG.AnnIndex.Load(index)
j.SetSearchParam("MaxCheck", '1024')
for t in range(q.shape[0]):
result = j.SearchWithMetaData(q[t], k)
print (result[0]) # ids
print (result[1]) # distances
print (result[2]) # metadata
def testAdd(index, x, out, algo, distmethod):
if index != None:
i = SPTAG.AnnIndex.Load(index)
else:
i = SPTAG.AnnIndex(algo, 'Float', x.shape[1])
i.SetBuildParam("NumberOfThreads", '6')
i.SetBuildParam("DistCalcMethod", distmethod)
if i.Add(x, x.shape[0]):
i.Save(out)
def testAddWithMetaData(index, x, s, out, algo, distmethod):
if index != None:
i = SPTAG.AnnIndex.Load(index)
else:
i = SPTAG.AnnIndex(algo, 'Float', x.shape[1])
i.SetBuildParam("NumberOfThreads", '6')
i.SetBuildParam("DistCalcMethod", distmethod)
if i.AddWithMetaData(x, s, x.shape[0]):
i.Save(out)
def testDelete(index, x, out):
i = SPTAG.AnnIndex.Load(index)
ret = i.Delete(x, x.shape[0])
print (ret)
i.Save(out)
def Test(algo, distmethod):
vector_dimension = 768
x = np.ones((n, vector_dimension), dtype=np.float32) * np.reshape(np.arange(n, dtype=np.float32), (n, 1))
q = np.ones((r, vector_dimension), dtype=np.float32) * np.reshape(np.arange(r, dtype=np.float32), (r, 1)) * 2
m = ''
for i in range(n):
m += str(i) + '\n'
m = m.encode()
index = algo
print ("Build.............................")
st = time.time()
testBuild(algo, distmethod, x, 'testindices')
et = time.time()
print("Build time : ", et - st, index)
st = time.time()
testSearch('testindices', q, k)
et = time.time()
print("Search time : ", et - st, index)
print ("Add.............................")
st = time.time()
testAdd('testindices', x, 'testindices', algo, distmethod)
et = time.time()
print("Addition time : ", et - st, index)
testSearch('testindices', q, k)
print ("Delete.............................")
st = time.time()
testDelete('testindices', q, 'testindices')
et = time.time()
print("Deletion Time : ", et - st, index)
testSearch('testindices', q, k)
print ("AddWithMetaData.............................")
testAddWithMetaData(None, x, m, 'testindices', algo, distmethod)
testSearchWithMetaData('testindices', q, k)
print ("Delete.............................")
testDelete('testindices', q, 'testindices')
testSearchWithMetaData('testindices', q, k)
if __name__ == '__main__':
Test('BKT', 'L2')
# Test('KDT', 'L2')
Code to reproduce the results with FAISS:
Installation: pip install faiss-cpu --no-cache
import faiss
import numpy as np
import time
n = 100000
k = 3
r = 3
d = 768
nlist = 10
vector_dimension = 768
x = np.ones((n, vector_dimension), dtype=np.float32) * np.reshape(np.arange(n, dtype=np.float32), (n, 1))
q = np.ones((r, vector_dimension), dtype=np.float32) * np.reshape(np.arange(r, dtype=np.float32), (r, 1)) * 2
st = time.time()
quantizer = faiss.IndexFlatIP(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
index.train(x)
index.add(q)
et = time.time()
print("Build Time: ", et-st)
st = time.time()
D, I = index.search(q, k=3)
print("Search Time: ", time.time()-st)
Is there something I am doing wrong or is this the expected behavior? Are there some parameters that can be tuned to help scale search. Thanks in advance.
Did you solve it?