elasticsearch icon indicating copy to clipboard operation
elasticsearch copied to clipboard

Add sparse_vector query

Open kderusso opened this issue 9 months ago • 3 comments

Relates to https://github.com/elastic/elasticsearch/issues/106261

This PR introduces a new `sparse_vector` query that combines the functionality of the `text_expansion` and `weighted_tokens` queries. Eventually, this query will replace both of them.

Actions that will occur in future PRs:

  • Deprecating the text_expansion and weighted_tokens queries
  • Removing references to the weighted_tokens query in the SparseVectorQueryBuilder
  • Moving the sparse_vector query outside of the ML plugin (requires some inference API work)

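Examples of how to use this new query type. The four requests below show it with an inference ID and query text, with a precomputed query vector of token weights, and then each of those two forms again with token pruning enabled: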

POST /docs/_search
{
  "query": {
    "sparse_vector": {
      "field": "content_embedding",
      "inference_id": "my-elser-model",
      "query": "how is the weather in jamaica"
    }
  }
}

POST /docs/_search
{
  "query": {
    "sparse_vector": {
      "field": "content_embedding",
      "query_vector": {
        "heat": 0.8471008,
        "atmosphere": 0.24251926,
        "very": 0.15496244,
        "brazil": 0.34641686,
        "winter": 0.5070546,
        "hardy": 0.15455529,
        "cold": 0.34308356,
        "sun": 0.052155916,
        "summer": 0.44945887,
        "beautiful": 0.28135294,
        "caribbean": 1.0542055,
        "jamaican": 1.4637164,
        "jamaica": 2.7778716,
        "geography": 0.48493838,
        "weather": 1.9407427,
        "temperature": 1.0894402,
        "season": 0.50890195,
        "quite": 0.20918,
        "cuba": 0.17437862,
        "rain": 0.14961956,
        "africa": 0.35994464,
        "festival": 0.3104579,
        "pleasant": 0.28202823,
        "island": 0.11068311,
        "forecast": 0.34817883,
        "climate": 1.2159579,
        "humid": 0.8120001,
        "fiji": 0.28404987,
        "tropical": 1.0291497,
        "te": 0.15929775,
        "warm": 1.3682823,
        "kingston": 0.104007065,
        "culture": 0.1396368,
        "beach": 0.085419096,
        "visit": 0.007837615,
        "barbados": 0.23561467,
        "desert": 0.05903566
      }
    }
  }
}

POST /docs/_search
{
  "query": {
    "sparse_vector": {
      "field": "content_embedding",
      "inference_id": "my-elser-model",
      "query": "how is the weather in jamaica",
      "prune": true,
      "pruning_config": {
        "tokens_freq_ratio_threshold": 5,
        "tokens_weight_threshold": 0.4,
        "only_score_pruned_tokens": false
      }
    }
  }
}

POST /docs/_search
{
  "query": {
    "sparse_vector": {
      "field": "content_embedding",
      "query_vector": {
        "heat": 0.8471008,
        "atmosphere": 0.24251926,
        "very": 0.15496244,
        "brazil": 0.34641686,
        "winter": 0.5070546,
        "hardy": 0.15455529,
        "cold": 0.34308356,
        "sun": 0.052155916,
        "summer": 0.44945887,
        "beautiful": 0.28135294,
        "caribbean": 1.0542055,
        "jamaican": 1.4637164,
        "jamaica": 2.7778716,
        "geography": 0.48493838,
        "weather": 1.9407427,
        "temperature": 1.0894402,
        "season": 0.50890195,
        "quite": 0.20918,
        "cuba": 0.17437862,
        "rain": 0.14961956,
        "africa": 0.35994464,
        "festival": 0.3104579,
        "pleasant": 0.28202823,
        "island": 0.11068311,
        "forecast": 0.34817883,
        "climate": 1.2159579,
        "humid": 0.8120001,
        "fiji": 0.28404987,
        "tropical": 1.0291497,
        "te": 0.15929775,
        "warm": 1.3682823,
        "kingston": 0.104007065,
        "culture": 0.1396368,
        "beach": 0.085419096,
        "visit": 0.007837615,
        "barbados": 0.23561467,
        "desert": 0.05903566
      },
      "prune": true,
      "pruning_config": {
        "tokens_freq_ratio_threshold": 5,
        "tokens_weight_threshold": 0.4,
        "only_score_pruned_tokens": false
      }
    }
  }
}

kderusso avatar May 03 '24 14:05 kderusso

Hi @kderusso, I've created a changelog YAML for you.

elasticsearchmachine avatar May 07 '24 18:05 elasticsearchmachine

@elasticmachine merge upstream

kderusso avatar May 09 '24 17:05 kderusso

Pinging @elastic/ml-core (Team:ML)

elasticsearchmachine avatar May 10 '24 15:05 elasticsearchmachine

@elasticmachine update branch

kderusso avatar May 22 '24 16:05 kderusso