fast-elasticsearch-vector-scoring icon indicating copy to clipboard operation
fast-elasticsearch-vector-scoring copied to clipboard

Response has no hits even though index consists of vectors

Open Karthik-Suresh93 opened this issue 4 years ago • 1 comments

Hi @lior-k @zewelor @ran22 @cakirmuha ,

Thanks for this great plugin! I was testing it out with Elasticsearch version 6.8.1 (from the same branch of the plugin). I was able to index the data and even get a response when I queried over it. Unfortunately, the hits are empty. Here is the code I used (similar to https://github.com/lior-k/fast-elasticsearch-vector-scoring/issues/25).

import base64
import numpy as np
import json

_float32_dtype = np.dtype('>f4')

import elasticsearch
print(elasticsearch.__version__)

def decode_float_list(base64_string):
    """Decode a base64 string into a list of Python floats.

    The decoded bytes are interpreted with the module-level
    ``_float32_dtype`` and converted to a plain list.
    """
    raw_bytes = base64.b64decode(base64_string)
    values = np.frombuffer(raw_bytes, dtype=_float32_dtype)
    return values.tolist()


def encode_array(arr):
    """Return *arr* as a base64 text string.

    The input is converted to a NumPy array, cast to the module-level
    ``_float32_dtype``, and the resulting buffer is base64-encoded.
    """
    typed = np.array(arr).astype(_float32_dtype)
    encoded = base64.b64encode(typed)
    return str(encoded, "utf-8")

def create_index(name):
    """Create index *name* with ``embedding_vector`` mapped as a binary field.

    The index is created with 2 shards and 1 replica. The field mapping is
    nested under the top-level ``"mappings"`` key of the request body, which
    is where the create-index API expects it.

    Args:
        name: Name of the index to create.
    """
    # Typeless mapping body: "properties" sits directly under "mappings" and
    # the request is sent with include_type_name=false. This form needs
    # Elasticsearch 6.7 or newer. On older 6.x releases, drop the parameter
    # and nest "properties" under a mapping type name instead, e.g.
    #     "mappings": {"_doc": {"properties": {...}}}
    request_body = '''{
        "settings": {
            "number_of_shards": 2,
            "number_of_replicas": 1
        },

        "mappings": {
            "properties": {
                "embedding_vector": {
                    "type": "binary",
                    "doc_values": true
                }
            }
        }
    }'''
    print(f"creating {name} index... {request_body}")
    es.indices.create(index=name, body=request_body, include_type_name=False)


def index_data(data):
    """Index every vector in *data* as its own document in ``INDEX_NAME``.

    Each document stores a 1-based running number in its ``id`` field and the
    vector, encoded with ``encode_array``, in ``embedding_vector``.
    """
    for doc_number, vec in enumerate(data, start=1):
        document = {
            "id": doc_number,
            "embedding_vector": encode_array(vec),
        }
        es.index(index=INDEX_NAME, body=document)


def search(vector=None, size=5):
    """Run a ``binary_vector_score`` function-score query on ``INDEX_NAME``.

    The request is printed as JSON before it is sent.

    Args:
        vector: Query vector, as a list of floats, passed to the scoring
            script. Defaults to the 64-element sample vector below. Its
            length should match the dimensionality of the indexed vectors.
        size: Maximum number of hits to request. Defaults to 5.

    Returns:
        The response returned by ``es.search``.
    """
    if vector is None:
        vector = [
            -0.09217305481433868, 0.010635560378432274, -0.02878434956073761, 0.06988169997930527,
            0.1273992955684662, -0.023723633959889412, 0.05490724742412567, -0.12124507874250412,
            -0.023694118484854698, 0.014595639891922474, 0.1471538096666336, 0.044936809688806534,
            -0.02795785665512085, -0.05665992572903633, -0.2441125512123108, 0.2755320072174072,
            0.11451690644025803, 0.20242854952812195, -0.1387604922056198, 0.05219579488039017,
            0.1145530641078949, 0.09967200458049774, 0.2161576747894287, 0.06157230958342552,
            0.10350126028060913, 0.20387393236160278, 0.1367097795009613, 0.02070528082549572,
            0.19238869845867157, 0.059613026678562164, 0.014012521132826805, 0.16701748967170715,
            0.04985826835036278, -0.10990987718105316, -0.12032567709684372, -0.1450948715209961,
            0.13585780560970306, 0.037511035799980164, 0.04251480475068092, 0.10693439096212387,
            -0.08861573040485382, -0.07457160204648972, 0.0549330934882164, 0.19136285781860352,
            0.03346432000398636, -0.03652812913060188, -0.1902569830417633, 0.03250952064990997,
            -0.3061246871948242, 0.05219300463795662, -0.07879918068647385, 0.1403723508119583,
            -0.08893408626317978, -0.24330253899097443, -0.07105310261249542, -0.18161986768245697,
            0.15501035749912262, -0.216160386800766, -0.06377710402011871, -0.07671763002872467,
            0.05360138416290283, -0.052845533937215805, -0.02905619889497757, 0.08279753476381302,
        ]

    # Named "request" rather than "search" so the dict does not shadow this
    # function inside its own body.
    request = {
        "query": {
            "function_score": {
                # "replace" makes the script's score the document's final score.
                "boost_mode": "replace",
                "script_score": {
                    "script": {
                        "source": "binary_vector_score",
                        "lang": "knn",
                        "params": {
                            "cosine": False,
                            "field": "embedding_vector",
                            "vector": vector,
                        },
                    },
                },
            },
        },
        "size": size,
    }
    print(json.dumps(request))
    return es.search(index=INDEX_NAME, body=request)


if __name__ == '__main__':
    # End-to-end check: create the index, index random vectors, then query.

    # Only the ``elasticsearch`` module itself is imported by this file, so
    # the client class is referenced through the module.
    # workaround for http error for max long HTTP lines (>4096 bytes). either upgrade or downgrade es for permanent solution
    # NOTE(review): presumably this refers to send_get_body_as='POST' - confirm.
    es = elasticsearch.Elasticsearch(
        "localhost:9200",
        send_get_body_as='POST',
        retry_on_timeout=True,
        timeout=5000,
    )

    INDEX_NAME = "testindex16"
    create_index(INDEX_NAME)

    # 10 samples, each an 8x8 nested list; encode_array serialises every
    # sample as one contiguous buffer of 64 float32 values.
    data = np.random.rand(10, 8, 8).tolist()

    index_data(data)

    # Newly indexed documents only become searchable after a refresh, so
    # force one here; otherwise the search below can run before the documents
    # are visible and report zero hits.
    es.indices.refresh(index=INDEX_NAME)

    res = search()
    print(res)

This is the result I get:

{'took': 0, 'timed_out': False, '_shards': {'total': 2, 'successful': 2, 'skipped': 0, 'failed': 0}, 'hits': {'total': 0, 'max_score': None, 'hits': []}}

Please help me out with this issue.

Karthik-Suresh93 avatar Apr 01 '20 06:04 Karthik-Suresh93

Hi, sorry for the late response, and thanks! Same as issue #25: your indexed vector dimension (data = np.random.rand(10, 8, 8).tolist()) is 8, while you are using a 64-dimension vector in the query. By design, the plugin ignores vectors that do not have the same number of dimensions.

So change the query to ask for the KNN of an 8-dimensional vector.

lior-k avatar Jun 17 '20 19:06 lior-k