elasticsearch
elasticsearch copied to clipboard
Add sparse_vector query
Relates to https://github.com/elastic/elasticsearch/issues/106261
This PR introduces a new `sparse_vector` query that combines the functionality of the `text_expansion` and `weighted_tokens` queries. Eventually, this query will replace the other two queries.
Actions that will occur in future PRs:
- Deprecating the `text_expansion` and `weighted_tokens` queries
- Removing references to the `weighted_tokens` query in the `SparseVectorQueryBuilder`
- Moving the `sparse_vector` query outside of the ML plugin (requires some inference API work)
Examples of how to use this new query type:
POST /docs/_search
{
"query": {
"sparse_vector": {
"field": "content_embedding",
"inference_id": "my-elser-model",
"query": "how is the weather in jamaica"
}
}
}
POST /docs/_search
{
"query": {
"sparse_vector": {
"field": "content_embedding",
"query_vector": {
"heat": 0.8471008,
"atmosphere": 0.24251926,
"very": 0.15496244,
"brazil": 0.34641686,
"winter": 0.5070546,
"hardy": 0.15455529,
"cold": 0.34308356,
"sun": 0.052155916,
"summer": 0.44945887,
"beautiful": 0.28135294,
"caribbean": 1.0542055,
"jamaican": 1.4637164,
"jamaica": 2.7778716,
"geography": 0.48493838,
"weather": 1.9407427,
"temperature": 1.0894402,
"season": 0.50890195,
"quite": 0.20918,
"cuba": 0.17437862,
"rain": 0.14961956,
"africa": 0.35994464,
"festival": 0.3104579,
"pleasant": 0.28202823,
"island": 0.11068311,
"forecast": 0.34817883,
"climate": 1.2159579,
"humid": 0.8120001,
"fiji": 0.28404987,
"tropical": 1.0291497,
"te": 0.15929775,
"warm": 1.3682823,
"kingston": 0.104007065,
"culture": 0.1396368,
"beach": 0.085419096,
"visit": 0.007837615,
"barbados": 0.23561467,
"desert": 0.05903566
}
}
}
}
POST /docs/_search
{
"query": {
"sparse_vector": {
"field": "content_embedding",
"inference_id": "my-elser-model",
"query": "how is the weather in jamaica",
"prune": true,
"pruning_config": {
"tokens_freq_ratio_threshold": 5,
"tokens_weight_threshold": 0.4,
"only_score_pruned_tokens": false
}
}
}
}
POST /docs/_search
{
"query": {
"sparse_vector": {
"field": "content_embedding",
"query_vector": {
"heat": 0.8471008,
"atmosphere": 0.24251926,
"very": 0.15496244,
"brazil": 0.34641686,
"winter": 0.5070546,
"hardy": 0.15455529,
"cold": 0.34308356,
"sun": 0.052155916,
"summer": 0.44945887,
"beautiful": 0.28135294,
"caribbean": 1.0542055,
"jamaican": 1.4637164,
"jamaica": 2.7778716,
"geography": 0.48493838,
"weather": 1.9407427,
"temperature": 1.0894402,
"season": 0.50890195,
"quite": 0.20918,
"cuba": 0.17437862,
"rain": 0.14961956,
"africa": 0.35994464,
"festival": 0.3104579,
"pleasant": 0.28202823,
"island": 0.11068311,
"forecast": 0.34817883,
"climate": 1.2159579,
"humid": 0.8120001,
"fiji": 0.28404987,
"tropical": 1.0291497,
"te": 0.15929775,
"warm": 1.3682823,
"kingston": 0.104007065,
"culture": 0.1396368,
"beach": 0.085419096,
"visit": 0.007837615,
"barbados": 0.23561467,
"desert": 0.05903566
},
"prune": true,
"pruning_config": {
"tokens_freq_ratio_threshold": 5,
"tokens_weight_threshold": 0.4,
"only_score_pruned_tokens": false
}
}
}
}
Hi @kderusso, I've created a changelog YAML for you.
@elasticmachine merge upstream
Pinging @elastic/ml-core (Team:ML)
@elasticmachine update branch