BIVF selector
Work towards #3503
- Expanded C API to include search_with_params for binary indexes.
- Allow a selector to be passed when performing search using BIVF indexes.
@mdouze @asadoughi please take a look and assign the right reviewers when able, thanks!
@gtwang01 has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.
Can you include some unit tests to verify the changes?
Just highlighting an issue with the PR: it causes an 80% regression in CPU time for the faiss/perf_tests/bench_scalar_quantizer_encode.cpp performance test. Please look into it as we cannot merge the PR as is.
Just highlighting an issue with the PR: it causes an 80% regression in CPU time for the faiss/perf_tests/bench_scalar_quantizer_encode.cpp performance test. Please look into it as we cannot merge the PR as is.
This does not affect SQ, so we can disregard the perf test, but there are cmake test failures.
@metonymic-smokey
=================================== FAILURES =================================== _________________________ TestPreassigned.test_binary __________________________
self = <test_contrib.TestPreassigned testMethod=test_binary>
@unittest.skipIf(
platform.system() == 'Windows'
and sys.version_info[0] == 3
and sys.version_info[1] == 12,
'test_binary hangs for Windows on Python 3.12.'
)
def test_binary(self):
ds = datasets.SyntheticDataset(128, 2000, 2000, 200)
d = ds.d
xt = ds.get_train()
xq = ds.get_queries()
xb = ds.get_database()
# define alternative quantizer on the 20 first dims of vectors
# (will be in float)
km = faiss.Kmeans(20, 50)
km.train(xt[:, :20].copy())
alt_quantizer = km.index
binarizer = faiss.index_factory(d, "ITQ,LSHt")
binarizer.train(xt)
xb_bin = binarizer.sa_encode(xb)
xq_bin = binarizer.sa_encode(xq)
index = faiss.index_binary_factory(d, "BIVF200")
fake_centroids = np.zeros((index.nlist, index.d // 8), dtype="uint8")
index.quantizer.add(fake_centroids)
index.is_trained = True
# add elements xb
a = alt_quantizer.search(xb[:, :20].copy(), 1)[1].ravel()
ivf_tools.add_preassigned(index, xb_bin, a)
# recompute GT in binary
k = 15
ib = faiss.IndexBinaryFlat(128)
ib.add(xb_bin)
Dgt, Igt = ib.search(xq_bin, k)
# search elements xq, increase nprobe, check 4 first results w/
# groundtruth
prev_inter_perf = 0
for nprobe in 1, 10, 20:
index.nprobe = nprobe
a = alt_quantizer.search(xq[:, :20].copy(), index.nprobe)[1]
D, I = ivf_tools.search_preassigned(index, xq_bin, k, a)
inter_perf = faiss.eval_intersection(I, Igt)
self.assertGreaterEqual(inter_perf, prev_inter_perf)
prev_inter_perf = inter_perf
# test range search
index.nprobe = 20
a = alt_quantizer.search(xq[:, :20].copy(), index.nprobe)[1]
# just to find a reasonable radius
D, I = ivf_tools.search_preassigned(index, xq_bin, 4, a)
radius = int(D.max() + 1)
lims, DR, IR = ivf_tools.range_search_preassigned(
index, xq_bin, radius, a)
# with that radius the k-NN results are a subset of the range
# search results
for q in range(len(xq)):
l0, l1 = lims[q], lims[q + 1]
self.assertTrue(set(I[q]) <= set(IR[l0:l1]))
E AssertionError: False is not true
tests/test_contrib.py:467: AssertionError _________________________ TestBinaryIVF.test_ivf_flat2 _________________________
self = <test_index_binary.TestBinaryIVF testMethod=test_ivf_flat2>
def test_ivf_flat2(self):
d = self.xq.shape[1] * 8
quantizer = faiss.IndexBinaryFlat(d)
index = faiss.IndexBinaryIVF(quantizer, d, 8)
index.cp.min_points_per_centroid = 5 # quiet warning
index.nprobe = 4
index.train(self.xt)
index.add(self.xb)
Divfflat, _ = index.search(self.xq, 10)
# Some centroids are equidistant from the query points.
# So the answer will depend on the implementation of the heap.
self.assertGreater((self.Dref == Divfflat).sum(), 4100)
E AssertionError: 34 not greater than 4100
tests/test_index_binary.py:199: AssertionError ____________________ TestBinaryIVF.test_ivf_flat_exhaustive ____________________
self = <test_index_binary.TestBinaryIVF testMethod=test_ivf_flat_exhaustive>
def test_ivf_flat_exhaustive(self):
d = self.xq.shape[1] * 8
quantizer = faiss.IndexBinaryFlat(d)
index = faiss.IndexBinaryIVF(quantizer, d, 8)
index.cp.min_points_per_centroid = 5 # quiet warning
index.nprobe = 8
index.train(self.xt)
index.add(self.xb)
Divfflat, _ = index.search(self.xq, 10)
np.testing.assert_array_equal(self.Dref, Divfflat)
tests/test_index_binary.py:184:
args = (
@wraps(func)
def inner(*args, **kwds):
with self._recreate_cm():
return func(*args, **kwds)
E AssertionError:
E Arrays are not equal
E
E Mismatched elements: 4965 / 5000 (99.3%)
E Max absolute difference: 8
E Max relative difference: 0.72727273
E x: array([[7, 7, 8, ..., 9, 9, 9],
E [7, 7, 8, ..., 9, 9, 9],
E [6, 7, 7, ..., 9, 9, 9],...
E y: array([[10, 12, 13, ..., 15, 15, 16],
E [10, 12, 12, ..., 14, 14, 14],
E [ 8, 9, 10, ..., 13, 13, 13],...
../../../miniconda3/lib/python3.11/contextlib.py:81: AssertionError ________________________ TestBinaryIVF.test_ivf_nprobe _________________________
self = <test_index_binary.TestBinaryIVF testMethod=test_ivf_nprobe>
def test_ivf_nprobe(self):
"""Test in case of nprobe > nlist."""
d = self.xq.shape[1] * 8
xt, xb, xq = self.xt, self.xb, self.xq
# nlist = 10
index = faiss.index_binary_factory(d, "BIVF10")
# When nprobe >= nlist, it is equivalent to an IndexFlat.
index.train(xt)
index.add(xb)
index.nprobe = 2048
k = 5
# test kNN search
D, I = index.search(xq, k)
ref_index = faiss.index_binary_factory(d, "BFlat")
ref_index.add(xb)
ref_D, ref_I = ref_index.search(xq, k)
assert np.all(D == ref_D)
E assert False E + where False = <function all at 0x7f4a70550670>(array([[10, 1..., dtype=int32) == array([[7, 7,..., dtype=int32) E + where <function all at 0x7f4a70550670> = np.all E Full diff: E - array([[7, 7, 8, 8, 8], E - [7, 7, 8, 8, 8], E - [6, 7, 7, 8, 8], E + array([[10, 14, 14, 15, 15], E + [12, 12, 13, 14, 14], E + [ 9, 11, 13, 14, 14], E ..., E - [6, 7, 8, 8, 8], E - [7, 8, 8, 8, 8], E + [ 9, 10, 10, 11, 12], E + [10, 11, 11, 11, 11], E - [6, 7, 8, 8, 9]], dtype=int32, E ? ^ ^ ^ ^ ^ E + [10, 12, 12, 13, 13]], dtype=int32, E ? ^^ ^^ ^^ ^^ ^^ E ))
tests/test_index_binary.py:289: AssertionError ----------------------------- Captured stderr call ----------------------------- WARNING clustering 200 points to 10 centroids: please provide at least 390 training points _________________________ TestBinaryIVF.test_ivf_range _________________________
self = <test_index_binary.TestBinaryIVF testMethod=test_ivf_range>
def test_ivf_range(self):
d = self.xq.shape[1] * 8
quantizer = faiss.IndexBinaryFlat(d)
index = faiss.IndexBinaryIVF(quantizer, d, 8)
index.cp.min_points_per_centroid = 5 # quiet warning
index.nprobe = 4
index.train(self.xt)
index.add(self.xb)
D, I = index.search(self.xq, 10)
radius = int(np.median(D[:, -1]) + 1)
Lr, Dr, Ir = index.range_search(self.xq, radius)
for i in range(len(self.xq)):
res = Ir[Lr[i]:Lr[i + 1]]
if D[i, -1] < radius:
self.assertTrue(set(I[i]) <= set(res))
else:
subset = I[i, D[i, :] < radius]
self.assertTrue(set(subset) == set(res))
E AssertionError: False is not true
tests/test_index_binary.py:221: AssertionError ____________________ TestBinaryIVF.test_search_per_invlist _____________________
self = <test_index_binary.TestBinaryIVF testMethod=test_search_per_invlist>
def test_search_per_invlist(self):
d = self.xq.shape[1] * 8
quantizer = faiss.IndexBinaryFlat(d)
index = faiss.IndexBinaryIVF(quantizer, d, 10)
index.cp.min_points_per_centroid = 5 # quiet warning
index.train(self.xt)
index.add(self.xb)
index.nprobe = 3
Dref, Iref = index.search(self.xq, 10)
index.per_invlist_search = True
D2, I2 = index.search(self.xq, 10)
compare_binary_result_lists(Dref, Iref, D2, I2)
tests/test_index_binary.py:313:
D1 = array([[10, 14, 14, ..., 15, 15, 15], [11, 12, 13, ..., 15, 15, 16], [10, 11, 12, ..., 15, 16, 16], ...10, 11, 11, ..., 13, 14, 14], [11, 11, 11, ..., 15, 16, 16], [10, 12, 12, ..., 13, 13, 14]], dtype=int32) I1 = array([[ 38, 4, 44, ..., 41, 47, 52], [127, 55, 34, ..., 8, 18, 66], [136, 7, 111, ..., ...233, 251, ..., 124, 84, 88], [ 3, 27, 63, ..., 83, 51, 76], [ 14, 90, 96, ..., 75, 112, 16]]) D2 = array([[ 7, 7, 8, ..., 9, 9, 9], [ 7, 8, 8, ..., 10, 10, 10], [ 7, 8, 8, ..., 9, 10, 10], ... 8, 8, 9, ..., 9, 10, 10], [ 8, 8, 8, ..., 9, 9, 9], [ 8, 9, 9, ..., 10, 10, 10]], dtype=int32) I2 = array([[ 812, 866, 121, ..., 1042, 1350, 1448], [1382, 364, 790, ..., 42, 294, 945], [ 460, 140...1426, 384, 841], [ 459, 533, 1082, ..., 677, 746, 767], [ 428, 153, 1053, ..., 324, 420, 670]])
def compare_binary_result_lists(D1, I1, D2, I2):
"""comparing result lists is difficult because there are many
ties. Here we sort by (distance, index) pairs and ignore the largest
distance of each result. Compatible result lists should pass this."""
assert D1.shape == I1.shape == D2.shape == I2.shape
n, k = D1.shape
ndiff = (D1 != D2).sum()
assert ndiff == 0, '%d differences in distance matrix %s' % (
ndiff, D1.shape)
E AssertionError: 4952 differences in distance matrix (500, 10)
tests/common_faiss_tests.py:115: AssertionError _____________ TestIndexBinaryFromFloat.test_wrapped_quantizer_HNSW _____________
self = <test_index_binary_from_float.TestIndexBinaryFromFloat testMethod=test_wrapped_quantizer_HNSW>
def test_wrapped_quantizer_HNSW(self):
def bin2float2d(v):
n, d = v.shape
vf = ((v.reshape(-1, 1) >> np.arange(8)) & 1).astype("float32")
vf *= 2
vf -= 1
return vf.reshape(n, d * 8)
d = 256
nt = 12800
nb = 10000
nq = 500
(xt, xb, xq) = make_binary_dataset(d, nb, nt, nq)
index_ref = faiss.IndexBinaryFlat(d)
index_ref.add(xb)
nlist = 256
clus = faiss.Clustering(d, nlist)
clus_index = faiss.IndexFlatL2(d)
xt_f = bin2float2d(xt)
clus.train(xt_f, clus_index)
centroids = faiss.vector_to_array(clus.centroids).reshape(-1, clus.d)
hnsw_quantizer = faiss.IndexHNSWFlat(d, 32)
hnsw_quantizer.add(centroids)
hnsw_quantizer.is_trained = True
wrapped_quantizer = faiss.IndexBinaryFromFloat(hnsw_quantizer)
assert nlist == hnsw_quantizer.ntotal
assert nlist == wrapped_quantizer.ntotal
assert wrapped_quantizer.is_trained
index = faiss.IndexBinaryIVF(wrapped_quantizer, d,
hnsw_quantizer.ntotal)
index.nprobe = 128
assert index.is_trained
index.add(xb)
D_ref, I_ref = index_ref.search(xq, 10)
D, I = index.search(xq, 10)
recall = sum(gti[0] in Di[:10] for gti, Di in zip(D_ref, D)) \
/ float(D_ref.shape[0])
assert recall >= 0.77, "recall = %g" % recall
E AssertionError: recall = 0.05 E assert 0.05 >= 0.77
tests/test_index_binary_from_float.py:160: AssertionError _____________ TestIndexBinaryFromFloat.test_wrapped_quantizer_IMI ______________
self = <test_index_binary_from_float.TestIndexBinaryFromFloat testMethod=test_wrapped_quantizer_IMI>
def test_wrapped_quantizer_IMI(self):
d = 256
nt = 3500
nb = 10000
nq = 500
(xt, xb, xq) = make_binary_dataset(d, nb, nt, nq)
index_ref = faiss.IndexBinaryFlat(d)
index_ref.add(xb)
nlist_exp = 6
nlist = 2 ** (2 * nlist_exp)
float_quantizer = faiss.MultiIndexQuantizer(d, 2, nlist_exp)
wrapped_quantizer = faiss.IndexBinaryFromFloat(float_quantizer)
wrapped_quantizer.train(xt)
assert nlist == float_quantizer.ntotal
index = faiss.IndexBinaryIVF(wrapped_quantizer, d,
float_quantizer.ntotal)
index.nprobe = 2048
assert index.is_trained
index.add(xb)
D_ref, I_ref = index_ref.search(xq, 10)
D, I = index.search(xq, 10)
recall = sum(gti[0] in Di[:10] for gti, Di in zip(D_ref, D)) \
/ float(D_ref.shape[0])
assert recall > 0.82, "recall = %g" % recall
E AssertionError: recall = 0.332 E assert 0.332 > 0.82
tests/test_index_binary_from_float.py:108: AssertionError
- generated xml file: /home/runner/work/faiss/faiss/test-results/pytest/results.xml - =========================== short test summary info ============================ FAILED tests/test_contrib.py::TestPreassigned::test_binary - AssertionError: False is not true FAILED tests/test_index_binary.py::TestBinaryIVF::test_ivf_flat2 - AssertionError: 34 not greater than 4100 FAILED tests/test_index_binary.py::TestBinaryIVF::test_ivf_flat_exhaustive - AssertionError: Arrays are not equal
Mismatched elements: 4965 / 5000 (99.3%) Max absolute difference: 8 Max relative difference: 0.72727273 x: array([[7, 7, 8, ..., 9, 9, 9], [7, 7, 8, ..., 9, 9, 9], [6, 7, 7, ..., 9, 9, 9],... y: array([[10, 12, 13, ..., 15, 15, 16], [10, 12, 12, ..., 14, 14, 14], [ 8, 9, 10, ..., 13, 13, 13],... FAILED tests/test_index_binary.py::TestBinaryIVF::test_ivf_nprobe - assert False
- where False = <function all at 0x7f4a70550670>(array([[10, 1..., dtype=int32) == array([[7, 7,..., dtype=int32)
- where <function all at 0x7f4a70550670> = np.all Full diff: - array([[7, 7, 8, 8, 8], - [7, 7, 8, 8, 8], - [6, 7, 7, 8, 8], + array([[10, 14, 14, 15, 15], + [12, 12, 13, 14, 14], + [ 9, 11, 13, 14, 14], ..., - [6, 7, 8, 8, 8], - [7, 8, 8, 8, 8], + [ 9, 10, 10, 11, 12], + [10, 11, 11, 11, 11], - [6, 7, 8, 8, 9]], dtype=int32, ? ^ ^ ^ ^ ^ + [10, 12, 12, 13, 13]], dtype=int32, ? ^^ ^^ ^^ ^^ ^^ )) FAILED tests/test_index_binary.py::TestBinaryIVF::test_ivf_range - AssertionError: False is not true FAILED tests/test_index_binary.py::TestBinaryIVF::test_search_per_invlist - AssertionError: 4952 differences in distance matrix (500, 10) FAILED tests/test_index_binary_from_float.py::TestIndexBinaryFromFloat::test_wrapped_quantizer_HNSW - AssertionError: recall = 0.05 assert 0.05 >= 0.77 FAILED tests/test_index_binary_from_float.py::TestIndexBinaryFromFloat::test_wrapped_quantizer_IMI - AssertionError: recall = 0.332 assert 0.332 > 0.82 ============= 8 failed, 895 passed, 1 skipped in 266.89s (0:04:26) =============
Training ResidualQuantizer, with 4 steps on 3000 32D vectors [0.092 s, 0.076 s clustering] train stage 0, 6 bits, kmeans objective 33217.3, total distance 200231, beam_size 1->5 (batch size 3000) [0.171 s, 0.149 s clustering] train stage 1, 6 bits, kmeans objective 155050, total distance 143425, beam_size 5->5 (batch size 3000) [0.262 s, 0.232 s clustering] train stage 2, 6 bits, kmeans objective 112965, total distance 107746, beam_size 5->5 (batch size 3000) [0.356 s, 0.318 s clustering] train stage 3, 6 bits, kmeans objective 85549.6, total distance 81995.9, beam_size 5->5 (batch size 3000) [0.001 s] encode stage 0, 6 bits, total error 67433, beam_size 5 [0.002 s] encode stage 1, 6 bits, total error 51657.3, beam_size 5 [0.003 s] encode stage 2, 6 bits, total error 40562.2, beam_size 5 [0.004 s] encode stage 3, 6 bits, total error 32055.1, beam_size 5
Apologies for the delay @mnorris11 , looking into this.
@mnorris11 I don't seem to be able to reproduce these locally:
(faiss) aditiahuja@TN9RR9RV4Y tests % python -m unittest test_index_binary.TestBinaryIVF
.WARNING clustering 200 points to 8 centroids: please provide at least 312 training points
..WARNING clustering 200 points to 10 centroids: please provide at least 390 training points
....
----------------------------------------------------------------------
Ran 7 tests in 0.090s
OK
@mnorris11 I don't seem to be able to reproduce these locally:
(faiss) aditiahuja@TN9RR9RV4Y tests % python -m unittest test_index_binary.TestBinaryIVF .WARNING clustering 200 points to 8 centroids: please provide at least 312 training points ..WARNING clustering 200 points to 10 centroids: please provide at least 390 training points .... ---------------------------------------------------------------------- Ran 7 tests in 0.090s OK
Quick sanity check: What are the cmake steps you used to install Faiss?
@mnorris11 ran these, as far as I know:
cmake .. -DFAISS_ENABLE_GPU=OFF -DFAISS_ENABLE_PYTHON=ON -DPython_EXECUTABLE=$(which python3) -DBUILD_TESTING=ON
@mnorris11 ran these, as far as I know:
cmake .. -DFAISS_ENABLE_GPU=OFF -DFAISS_ENABLE_PYTHON=ON -DPython_EXECUTABLE=$(which python3) -DBUILD_TESTING=ON
@metonymic-smokey if you exactly follow https://github.com/facebookresearch/faiss/blob/main/INSTALL.md, are you able to repro?
@mnorris11 when I run the C++ tests based on the instructions there, I get errors in test mmap_binary_flatcodes, not the tests which fail in CI here. I'm facing some issues running the python tests so will get back to you on that in a bit.
(Assigning it to you @metonymic-smokey for issue tracking purposes)