redis-vl-python icon indicating copy to clipboard operation
redis-vl-python copied to clipboard

Issue with search dialect 3 and JSON

Open tylerhutcherson opened this issue 1 year ago • 4 comments

The Redis query engine's dialect 3 API changes the output format for JSON documents:

# Vector search against the "embedding" field, explicitly requesting dialect 3.
dialect_3_query = VectorQuery(
    vector=[0.23, 0.12, -0.03, 0.98],
    vector_field_name="embedding",
    return_fields=["name", "description", "price"],
    filter_expression=filter_expression,
    dialect=3,
)
index.query(dialect_3_query)

returns:

[{'id': 'product:36edbfd1372144759975f01fe6968bbf',
  'vector_distance': '1.08839428425',
  'name': '["Wireless earbuds"]',
  'description': '["Wireless Bluetooth in-ear headphones"]',
  'price': '[64.99]'}]

BUT dialect 2 returns:

[{'id': 'product:36edbfd1372144759975f01fe6968bbf',
  'vector_distance': '1.08839428425',
  'name': 'Wireless earbuds',
  'description': 'Wireless Bluetooth in-ear headphones',
  'price': '64.99'}]

(which is the correct, user-expected format).

Additionally FilterQuery types break when using dialect=3...

Need to fix the parsing layer in RedisVL.

tylerhutcherson avatar Apr 19 '24 01:04 tylerhutcherson

Schema YAML:

%%writefile schema.yaml

index:
    name: products
    prefix: product
    storage_type: json

fields:
    - name: name
      type: text
    - name: description
      type: text
    - name: connection_type
      path: $.connection.type  # index item from nested object
      type: tag
    - name: price
      type: numeric
    - name: stock
      type: numeric
    - name: color
      path: $.colors.*  # index array of TAGs
      type: tag
    - name: embedding
      type: vector
      attrs:
          dims: 4
          algorithm: flat
          distance_metric: cosine
    - name: embeddings
      path: $.embeddings[*]  # index array of VECTORs
      type: vector
      attrs:
          dims: 4
          algorithm: hnsw
          distance_metric: l2

Code to reproduce:

from redis import Redis

from redisvl.schema import IndexSchema
from redisvl.index import SearchIndex


# Two sample product documents. Each carries a nested "connection" object,
# a "colors" array, a single "embedding" vector and an array of "embeddings".
data = [
    {
        "name": "Noise-cancelling Bluetooth headphones",
        "description": "Wireless Bluetooth headphones with noise-cancelling technology",
        "connection": {"wireless": True, "type": "Bluetooth"},
        "price": 99.98,
        "stock": 25,
        "colors": ["black", "silver"],
        "embedding": [0.87, -0.15, 0.55, 0.03],
        "embeddings": [
            [0.56, -0.34, 0.69, 0.02],
            [0.94, -0.23, 0.45, 0.19],
        ],
    },
    {
        "name": "Wireless earbuds",
        "description": "Wireless Bluetooth in-ear headphones",
        "connection": {"wireless": True, "type": "Bluetooth"},
        "price": 64.99,
        "stock": 17,
        "colors": ["red", "black", "white"],
        "embedding": [-0.7, -0.51, 0.88, 0.14],
        "embeddings": [
            [0.54, -0.14, 0.79, 0.92],
            [0.94, -0.93, 0.45, 0.16],
        ],
    },
]

# Build the index schema from the YAML file written above.
schema = IndexSchema.from_yaml("schema.yaml")
# Client for a Redis server on localhost:6379.
client = Redis.from_url("redis://localhost:6379")
index = SearchIndex(schema, client)
# NOTE(review): overwrite/drop presumably replace any existing "products"
# index and its documents so the repro starts clean -- confirm against redisvl.
index.create(overwrite=True, drop=True)
# Load the sample documents; the return value is kept as `keys`.
keys = index.load(data)

tylerhutcherson avatar Apr 19 '24 02:04 tylerhutcherson

@tylerhutcherson I added a test as shown below, but it passes??? Can you take a look and see if you see anything missing?

import pytest
from redis import Redis
from redis.commands.search.query import Query

from redisvl.index import SearchIndex
from redisvl.query import VectorQuery, FilterQuery
from redisvl.query.filter import Tag
from redisvl.schema.schema import IndexSchema

@pytest.fixture
def sample_data():
    """Return the two product documents loaded into the test index.

    Each document has a nested ``connection`` object, a ``colors`` array,
    a single ``embedding`` vector and an array of ``embeddings``.
    """
    headphones = {
        "name": "Noise-cancelling Bluetooth headphones",
        "description": "Wireless Bluetooth headphones with noise-cancelling technology",
        "connection": {"wireless": True, "type": "Bluetooth"},
        "price": 99.98,
        "stock": 25,
        "colors": ["black", "silver"],
        "embedding": [0.87, -0.15, 0.55, 0.03],
        "embeddings": [
            [0.56, -0.34, 0.69, 0.02],
            [0.94, -0.23, 0.45, 0.19],
        ],
    }
    earbuds = {
        "name": "Wireless earbuds",
        "description": "Wireless Bluetooth in-ear headphones",
        "connection": {"wireless": True, "type": "Bluetooth"},
        "price": 64.99,
        "stock": 17,
        "colors": ["red", "black", "white"],
        "embedding": [-0.7, -0.51, 0.88, 0.14],
        "embeddings": [
            [0.54, -0.14, 0.79, 0.92],
            [0.94, -0.93, 0.45, 0.16],
        ],
    }
    return [headphones, earbuds]

@pytest.fixture
def schema_dict():
    """Return the index schema for the sample products as a plain dict.

    Describes a JSON-storage index named ``products`` with text, tag,
    numeric and vector fields; three of the fields use explicit JSON paths.
    """
    index_settings = {
        "name": "products",
        "prefix": "product",
        "storage_type": "json",
    }

    # Single vector stored directly on the document.
    flat_embedding = {
        "name": "embedding",
        "type": "vector",
        "attrs": {"dims": 4, "algorithm": "flat", "distance_metric": "cosine"},
    }
    # Array of vectors, addressed through a JSON path.
    hnsw_embeddings = {
        "name": "embeddings",
        "path": "$.embeddings[*]",
        "type": "vector",
        "attrs": {"dims": 4, "algorithm": "hnsw", "distance_metric": "l2"},
    }

    fields = [
        {"name": "name", "type": "text"},
        {"name": "description", "type": "text"},
        {"name": "connection_type", "path": "$.connection.type", "type": "tag"},
        {"name": "price", "type": "numeric"},
        {"name": "stock", "type": "numeric"},
        {"name": "color", "path": "$.colors.*", "type": "tag"},
        flat_embedding,
        hnsw_embeddings,
    ]
    return {"index": index_settings, "fields": fields}

@pytest.fixture
def index(sample_data, redis_url, schema_dict):
    """Yield a SearchIndex created from ``schema_dict`` and loaded with data.

    Args:
        sample_data: Documents to load into the index.
        redis_url: Redis connection URL; this fixture is not defined in this
            snippet (presumably it comes from the project's conftest).
        schema_dict: Schema dictionary passed to ``IndexSchema.from_dict``.

    Yields:
        The created and populated ``SearchIndex``.
    """
    index_schema = IndexSchema.from_dict(schema_dict)
    redis_client = Redis.from_url(redis_url)
    # Named differently from the fixture function to avoid shadowing it.
    search_index = SearchIndex(index_schema, redis_client)
    search_index.create(overwrite=True, drop=True)
    try:
        search_index.load(sample_data)
        yield search_index
    finally:
        # Runs even when loading fails, so a half-built index is not left
        # behind; code placed after a bare `yield` is skipped in that case.
        search_index.delete(drop=True)

def test_dialect_3_json(index, sample_data):
    """Check that dialect 3 queries on a JSON index return unwrapped values.

    Runs a ``VectorQuery`` and a ``FilterQuery`` with ``dialect=3`` and
    asserts that none of the requested return fields is a JSON array.

    The field values come back as strings, so a bare
    ``isinstance(value, list)`` check can never fail: a wrapped value such as
    ``'["Wireless earbuds"]'`` is still a ``str``. Each value is therefore
    JSON-decoded before the check. Plain strings that are not valid JSON
    (e.g. ``'Wireless earbuds'``) are kept as-is, so correctly formatted
    output passes instead of raising a decode error.

    Args:
        index: The populated search index under test.
        sample_data: Requested so the sample documents fixture is set up;
            not read directly here.
    """
    import json  # local import keeps the test self-contained

    return_fields = ["name", "description", "price"]

    def decode(value):
        """Return the JSON-decoded value, or the value itself if not JSON."""
        try:
            return json.loads(value)
        except (TypeError, ValueError):
            # TypeError: not a str/bytes; ValueError: not valid JSON text.
            return value

    def assert_unwrapped(results):
        """Assert there are results and no return field is a JSON array."""
        assert len(results) > 0
        for result in results:
            for field in return_fields:
                raw = result[field]
                assert not isinstance(decode(raw), list), (
                    f"field {field!r} came back wrapped in a list: {raw!r}"
                )

    # Create a VectorQuery with dialect 3
    vector_query = VectorQuery(
        vector=[0.23, 0.12, -0.03, 0.98],
        vector_field_name="embedding",
        return_fields=list(return_fields),
        dialect=3
    )

    # Execute the query
    results = index.query(vector_query)

    # Print the results
    print("VectorQuery Results:")
    print(results)

    # Assert the expected format of the results
    assert_unwrapped(results)

    # Create a FilterQuery with dialect 3
    filter_query = FilterQuery(
        filter_expression=Tag("color") == "black",
        return_fields=list(return_fields),
        dialect=3
    )

    # Execute the query
    results = index.query(filter_query)

    # Print the results
    print("FilterQuery Results:")
    print(results)

    # Assert the expected format of the results
    assert_unwrapped(results)

bsbodden avatar May 13 '24 06:05 bsbodden

@bsbodden it's outputting as a string, so for your test check to work it would need to be cast. (image)

rbs333 avatar May 13 '24 18:05 rbs333

@bsbodden it can be changed to the following (or something more elegant); then the test correctly bombs:

# Decode each returned string as JSON before checking for a list wrapper.
for result in results:
    for field in ("name", "description", "price"):
        assert not isinstance(json.loads(result[field]), list)

rbs333 avatar May 13 '24 18:05 rbs333