knowhere
knowhere copied to clipboard
Poor performance of building index for binary vectors
Deploy milvus 2.1.4 and run this script:
import numpy as np
import pandas as pd
import time
from pymilvus import (
connections,
utility,
FieldSchema,
CollectionSchema,
DataType,
Collection,
)
from bitstring import BitArray
from random import randint
# --- Benchmark configuration -------------------------------------------------
# Dimension of the binary vector field, as passed to the schema below.
fp_size = 8192
milvus_host = "127.0.0.1"
milvus_port = "19530"
collection_name = "emols_fp_1024_test"

# --- Connect and obtain the collection ---------------------------------------
print("*** Connect to milvus ***")
connections.connect("default", host=milvus_host, port=milvus_port)

has = utility.has_collection(collection_name)
print(f"Does collection exist in Milvus: {has}")

if has:
    # The collection already exists on the server: open it by name only.
    ligands_collection = Collection(collection_name)
else:
    print("*** Create collection ***")
    # Primary key with auto_id enabled, so inserts supply only the other fields.
    id_field = FieldSchema(
        name="id", dtype=DataType.INT64, is_primary=True, auto_id=True
    )
    smiles_field = FieldSchema(
        name="smiles", dtype=DataType.VARCHAR, max_length=200
    )
    fingerprint_field = FieldSchema(
        name="morgan_fingerprint",
        dtype=DataType.BINARY_VECTOR,
        dim=fp_size,
        is_primary=False,
    )
    fields = [id_field, smiles_field, fingerprint_field]
    schema = CollectionSchema(fields, collection_name)
    ligands_collection = Collection(
        collection_name, schema, consistency_level="Strong"
    )
    print("Collection created")

print(ligands_collection)
# --- Insert synthetic data: 240 chunks of `chunk_size` rows each -------------
chunk_size = 100000
smiles = ["test_string"] * chunk_size
for i in range(0, 240):
    print("i", i)
    # A binary vector of dimension `fp_size` is sent as fp_size / 8 packed
    # bytes, so draw `fp_size` random bits and pack them 8 per byte.
    # Encoding a "0101..." text string as UTF-8 would instead yield bytes that
    # are all 0x30/0x31, i.e. not random bits, and would only match the schema
    # dimension by coincidence.
    bits = [randint(0, 1) for _ in range(fp_size)]
    packed_fp = np.packbits(bits).tobytes()
    # One random fingerprint is generated per chunk and repeated for every
    # row of that chunk, which keeps data generation cheap.
    fps_bytes = [packed_fp] * chunk_size
    ligands_collection.insert([smiles, fps_bytes])
# --- Load the collection, then request the index -----------------------------
print("*** Load collection ***")
ligands_collection.load()

print("*** Create index TANIMOTO BIN_IVF_FLAT ***")
# Index parameters are kept separate from the index description for readability.
index_params = {"nlist": 1024}
index = {
    "index_type": "BIN_IVF_FLAT",
    "metric_type": "TANIMOTO",
    "params": index_params,
}
ligands_collection.create_index("morgan_fingerprint", index)
print("index finished")

# Print the segment info reported for this collection and how many entries it has.
info = utility.get_query_segment_info(collection_name=collection_name)
print(info)
segment_count = len(info)
print("segments count", segment_count)
# --- Poll index-building progress and time each progress step ----------------
# `pre` holds the last observed `indexed_rows`; `start` is when the current
# step began. The timer starts now (not at 0) so that the first reported
# duration is a real elapsed time rather than seconds since the Unix epoch.
pre = 0
start = time.time()
while True:
    prog = utility.index_building_progress(collection_name=collection_name)
    print(prog)
    indexed_rows = prog['indexed_rows']
    if indexed_rows != pre:
        end = time.time()
        print("index a segment cost", (end - start), "s")
        start = time.time()
        pre = indexed_rows
    # Stop once every row is reported as indexed instead of polling forever.
    # `total_rows` is read defensively: if it is missing or still 0, keep
    # polling. NOTE(review): assumes `total_rows` reflects all inserted rows
    # by the time polling starts - confirm against the server version in use.
    total_rows = prog.get('total_rows', 0)
    if total_rows > 0 and indexed_rows >= total_rows:
        break
    time.sleep(3)
This script inserts 24M binary vectors (8192 dim) into Milvus and builds a BIN_IVF_FLAT index with nlist=1024. After the insert has finished, 120 segments are generated, and each segment has about 200,000 rows. The indexing process is very slow: building the index for a single segment takes about 5 minutes. It seems the index-building process for binary vectors is single-threaded (only one CPU is busy).
/assign @cydrain