tantivy
Attempt to multiply with overflow
While searching my index of around 3B documents, whose biggest segments look like this:
{
  "index_settings": {
    "docstore_compression": "lz4",
    "docstore_blocksize": 16384
  },
  "segments": [
    {
      "segment_id": "66e5b4e1-b1d5-4aed-aba6-51565f3d6acc",
      "max_doc": 1054732330,
      "deletes": null
    },
    {
      "segment_id": "61483fc5-0bd4-4e85-b9d5-1723b6d90173",
      "max_doc": 105936249,
      "deletes": null
    },
    {
      "segment_id": "0646c5b8-8bf6-4a95-be54-27724cf3f0e4",
      "max_doc": 53501794,
      "deletes": null
    },
    {
      "segment_id": "965d3f1c-d4a7-4545-946e-382fc469885b",
      "max_doc": 53411250,
      "deletes": null
    },
Performing a search such as:
let (count, top_docs) = searcher.search(
    &query,
    &(
        Count,
        TopDocs::with_limit(items_per_page)
            .and_offset(offset)
            .order_by_fast_field::<DateTime>("date", tantivy::Order::Desc),
    ),
)?;
Produces the following panic in debug mode:
thread 'tantivy-search-2' panicked at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/bitpacker/src/bitpacker.rs:97:28:
attempt to multiply with overflow
stack backtrace:
0: rust_begin_unwind
at /rustc/9fc6b43126469e3858e2fe86cafb4f0fd5068869/library/std/src/panicking.rs:665:5
1: core::panicking::panic_fmt
at /rustc/9fc6b43126469e3858e2fe86cafb4f0fd5068869/library/core/src/panicking.rs:76:14
2: core::panicking::panic_const::panic_const_mul_overflow
at /rustc/9fc6b43126469e3858e2fe86cafb4f0fd5068869/library/core/src/panicking.rs:182:21
3: tantivy_bitpacker::bitpacker::BitUnpacker::get
at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/bitpacker/src/bitpacker.rs:97:28
4: <tantivy_columnar::column_values::u64_based::bitpacked::BitpackedReader as tantivy_columnar::column_values::ColumnValues>::get_val
at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/columnar/src/column_values/u64_based/bitpacked.rs:64:55
5: <tantivy_columnar::column_values::monotonic_column::MonotonicMappingColumn<C,T,Input> as tantivy_columnar::column_values::ColumnValues<Output>>::get_val
at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/columnar/src/column_values/monotonic_column.rs:55:24
6: <alloc::sync::Arc<dyn tantivy_columnar::column_values::ColumnValues<T>> as tantivy_columnar::column_values::ColumnValues<T>>::get_val
at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/columnar/src/column_values/mod.rs:201:9
7: tantivy_columnar::column::Column<T>::values_for_doc::{{closure}}
at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/columnar/src/column/mod.rs:136:40
8: core::ops::function::impls::<impl core::ops::function::FnOnce<A> for &mut F>::call_once
at /rustc/9fc6b43126469e3858e2fe86cafb4f0fd5068869/library/core/src/ops/function.rs:305:13
9: core::option::Option<T>::map
at /rustc/9fc6b43126469e3858e2fe86cafb4f0fd5068869/library/core/src/option.rs:1113:29
10: <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::next
at /rustc/9fc6b43126469e3858e2fe86cafb4f0fd5068869/library/core/src/iter/adapters/map.rs:107:26
11: tantivy_columnar::column::Column<T>::first
at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/columnar/src/column/mod.rs:88:9
12: <tantivy_columnar::column::FirstValueWithDefault<T> as tantivy_columnar::column_values::ColumnValues<T>>::get_val
at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/columnar/src/column/mod.rs:200:9
13: <alloc::sync::Arc<dyn tantivy_columnar::column_values::ColumnValues<T>> as tantivy_columnar::column_values::ColumnValues<T>>::get_val
at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/columnar/src/column_values/mod.rs:201:9
14: <tantivy::collector::top_score_collector::ScorerByFastFieldReader as tantivy::collector::custom_score_top_collector::CustomSegmentScorer<u64>>::score
at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/src/collector/top_score_collector.rs:149:21
15: <tantivy::collector::custom_score_top_collector::CustomScoreTopSegmentCollector<T,TScore> as tantivy::collector::SegmentCollector>::collect
at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/src/collector/custom_score_top_collector.rs:94:21
16: <(Left,Right) as tantivy::collector::SegmentCollector>::collect
at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/src/collector/mod.rs:342:9
17: tantivy::collector::SegmentCollector::collect_block
at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/src/collector/mod.rs:283:13
18: tantivy::collector::Collector::collect_segment::{{closure}}
at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/src/collector/mod.rs:199:21
19: core::ops::function::impls::<impl core::ops::function::FnMut<A> for &mut F>::call_mut
at /rustc/9fc6b43126469e3858e2fe86cafb4f0fd5068869/library/core/src/ops/function.rs:294:13
20: tantivy::query::weight::for_each_docset_buffered
at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/src/query/weight.rs:30:9
21: tantivy::query::weight::Weight::for_each_no_score
at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/src/query/weight.rs:109:9
22: tantivy::collector::Collector::collect_segment
at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/src/collector/mod.rs:198:17
23: tantivy::core::searcher::Searcher::search_with_executor::{{closure}}
at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/src/core/searcher.rs:231:17
24: tantivy::core::executor::Executor::map::{{closure}}::{{closure}}
at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/src/core/executor.rs:68:45
25: rayon_core::scope::Scope::spawn::{{closure}}::{{closure}}
at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/scope/mod.rs:526:57
26: <core::panic::unwind_safe::AssertUnwindSafe<F> as core::ops::function::FnOnce<()>>::call_once
at /rustc/9fc6b43126469e3858e2fe86cafb4f0fd5068869/library/core/src/panic/unwind_safe.rs:272:9
27: std::panicking::try::do_call
at /rustc/9fc6b43126469e3858e2fe86cafb4f0fd5068869/library/std/src/panicking.rs:557:40
28: __rust_try
29: std::panicking::try
at /rustc/9fc6b43126469e3858e2fe86cafb4f0fd5068869/library/std/src/panicking.rs:520:19
30: std::panic::catch_unwind
at /rustc/9fc6b43126469e3858e2fe86cafb4f0fd5068869/library/std/src/panic.rs:358:14
31: rayon_core::unwind::halt_unwinding
at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/unwind.rs:17:5
32: rayon_core::scope::ScopeBase::execute_job_closure
at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/scope/mod.rs:689:28
33: rayon_core::scope::ScopeBase::execute_job
at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/scope/mod.rs:679:29
34: rayon_core::scope::Scope::spawn::{{closure}}
at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/scope/mod.rs:526:13
35: <rayon_core::job::HeapJob<BODY> as rayon_core::job::Job>::execute
at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/job.rs:169:9
36: rayon_core::job::JobRef::execute
at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/job.rs:64:9
37: rayon_core::registry::WorkerThread::execute
at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:860:9
38: rayon_core::registry::WorkerThread::wait_until_cold
at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:794:21
39: rayon_core::registry::WorkerThread::wait_until
at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:769:13
40: rayon_core::registry::WorkerThread::wait_until_out_of_work
at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:818:9
41: rayon_core::registry::main_loop
at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:923:5
42: rayon_core::registry::ThreadBuilder::run
at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:53:18
43: <rayon_core::registry::DefaultSpawn as rayon_core::registry::ThreadSpawn>::spawn::{{closure}}
at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:98:20
note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.
However, in release mode the query runs without panicking, but the ordering/sorting is broken.
I am running tantivy at rev 4aa8cd24707be1255599284f52eb6d388cf86ae8
We currently have a limit after which we don't merge segments. This limit does not take multi-valued columns into account, whose value count can exceed the u32 value space. As a temporary fix we can reduce MAX_DOC_LIMIT, but a proper solution would check the number of values in each column.
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
///
/// We do not allow segments with more than
pub const MAX_DOC_LIMIT: u32 = 1 << 31;
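To illustrate with hypothetical numbers: a segment comfortably under MAX_DOC_LIMIT can still hold more column values than u32 can address once a field is multi-valued:
let max_doc: u64 = 200_000_000; // well under MAX_DOC_LIMIT (2^31)
let avg_values_per_doc: u64 = 30; // hypothetical multi-valued fast field
let total_values = max_doc * avg_values_per_doc;
assert!(total_values > u64::from(u32::MAX)); // 6e9 values > ~4.29e9 addressable in u32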
Does that mean my current index is "toasted" and I should basically reindex? (While taking care of not having these larger segments).
I think yes, it seems toasted. How many multi-values per doc does your column have, on average?
I don't actually use multi values at all (if you mean storing multiple values under the same "key" per doc).
So there are no arrays in your data? In that case, the issue is probably still similar, but I'm not sure what exactly causes it.
Yes, no arrays.
Here's my schema
let mut document = doc!(
    self.indexer.id => id,
    self.indexer.date => DateTime::from_timestamp_secs(something as i64),
    self.indexer.string1 => String::new(),
    self.indexer.string2 => String::new(),
    self.indexer.string3 => String::new(),
);
@PSeitz Would a reproduction be useful? I've been thinking about generating a 1B-doc segment from a minimal repro to see how things go.
@Barre is there a rationale for having such gigantic segments? We recommend around 10 million docs per segment.
Does that mean my current index is "toasted" and I should basically reindex? (While taking care of not having these larger segments).
I'm afraid yes
@PSeitz Would a reproduction be useful? I've been thinking about generating a 1B-doc segment from a minimal repro to see how things go.
Thanks to the stack trace you attached, we know the problem is coming from here.
It computes the address of the value to get, expressed in bits (we do bitpacking), and it uses u32 to do so. Since you have billions of docs in the segment, your bit array is larger than 2^32 bits (2^32 / 8 bytes) and the computation overflows.
If the data is critical and reindexing is not an option for you, you can try to modify tantivy to use u64 in this computation.
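For illustration, a minimal sketch of the failing computation and the u64 widening (hypothetical function names, not the actual tantivy code):
// Bit address of the idx-th value in a bitpacked array. Computed in u32,
// this overflows once the array exceeds 2^32 bits: it panics in debug mode
// and wraps silently in release mode (hence the broken sort order).
fn bit_addr_u32(idx: u32, bits_per_value: u32) -> u32 {
    idx * bits_per_value
}

// Widening both operands to u64 before multiplying avoids the overflow.
fn bit_addr_u64(idx: u32, bits_per_value: u32) -> u64 {
    u64::from(idx) * u64::from(bits_per_value)
}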
@Barre is there a rationale for having such gigantic segments? We recommend around 10 million docs per segment.
Thanks for the feedback on segment sizes! In my case, 10M would probably mean too many segments, and the compression ratio wouldn't be as good.
There's no particular reason why I "needed" 1B doc segments. I just indexed with default settings and it "happened naturally." Wouldn't it make sense to lower that maximum docs limit if the default merge policy can make things problematic?
If the data is critical and reindexing is not an option for you, you can try to modify tantivy to use u64 in this computation.
I ended up reindexing with a 106M max doc limit.
In my case, 10M would probably mean too many segments, and the compression ratio wouldn't be as good.
I don't think this is true.
I just indexed with default settings and it "happened naturally."
This is odd. The default settings do not do this. Do you use a program that merges everything at the end, or something like that?
I don't think this is true.
I was specifically thinking about the FST, which may become more efficient the more entries it contains.
This is odd. The default settings do not do this. Do you use a program that merges everything at the end, or something like that?
I don't. It's just default settings with default merge policy.
Here's how I open my index:
let mut index = IndexBuilder::new()
    .schema(schema.clone())
    .settings(IndexSettings {
        docstore_compression: tantivy::store::Compressor::Lz4,
        docstore_compress_dedicated_thread: true,
        ..Default::default()
    })
    .open_or_create(directory)?;
let index_writer_options = IndexWriterOptions::builder()
    .num_merge_threads(num_cpus::get_physical())
    .num_worker_threads(num_cpus::get_physical())
    .memory_budget_per_thread(1_000_000_000)
    .build();
Maybe it's because of
.num_merge_threads(num_cpus::get_physical())
which makes merging more eager? Not quite default in that case, you are right.
No... This is not it. Can you share the entire main?
#[derive(Clone)]
pub struct Indexer {
    pub id: Field,
    pub text_indexing: Field,
    pub schema: Schema,
    pub index: Index,
    pub index_reader: IndexReader,
}

impl Indexer {
    pub fn new() -> anyhow::Result<Self> {
        let mut schema_builder = Schema::builder();
        let text_indexing = TextFieldIndexing::default()
            .set_tokenizer("custom_tokenizer")
            .set_index_option(IndexRecordOption::Basic);
        let text_options = TextOptions::default()
            .set_indexing_options(text_indexing)
            .set_stored();
        let date_options = DateOptions::from(INDEXED)
            .set_stored()
            .set_fast()
            .set_precision(tantivy::schema::DateTimePrecision::Seconds);
        let id = schema_builder.add_u64_field("id", FAST | STORED);
        let date = schema_builder.add_date_field("date", date_options);
        let schema = schema_builder.build();
        let directory = tantivy::directory::MmapDirectory::open("/tank/tantivy/")?;
        let mut index = IndexBuilder::new()
            .schema(schema.clone())
            .settings(IndexSettings {
                docstore_compression: tantivy::store::Compressor::Lz4,
                docstore_compress_dedicated_thread: true,
                ..Default::default()
            })
            .open_or_create(directory)?;
        index.set_multithread_executor(num_cpus::get()).unwrap();
        index.tokenizers().register(
            "custom_tokenizer",
            TextAnalyzer::builder(CustomTokenizer)
                .filter(LowerCaser)
                .build(),
        );
        let index_reader = index.reader()?;
        Ok(Self {
            id,
            text_indexing,
            schema,
            index,
            index_reader,
        })
    }

    pub fn get_writer(&self) -> anyhow::Result<IndexWriter> {
        let index_writer_options = IndexWriterOptions::builder()
            .num_merge_threads(num_cpus::get_physical())
            .num_worker_threads(num_cpus::get_physical())
            .memory_budget_per_thread(1_000_000_000)
            .build();
        Ok(self.index.writer_with_options(index_writer_options)?)
    }
}
Still nothing special in there.
To get segments that large, you would have had to override the default merge policy, or merge the index on your own. You don't have code doing this?
I didn't do anything like this.
Though, unless I am missing something, I am not seeing anything preventing merging large segments together in https://github.com/quickwit-oss/tantivy/blob/4aa8cd24707be1255599284f52eb6d388cf86ae8/src/indexer/log_merge_policy.rs
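For anyone hitting this in the meantime, here's a sketch of explicitly capping merge inputs through the policy's setter (assuming an existing index_writer; the 10M figure is arbitrary, and the import path may vary by version):
use tantivy::merge_policy::LogMergePolicy;

// Segments above this doc count are no longer picked as merge inputs.
// Caveat (see below): this filters input segments only; it does not by
// itself bound the size of the merged output.
let mut merge_policy = LogMergePolicy::default();
merge_policy.set_max_docs_before_merge(10_000_000);
index_writer.set_merge_policy(Box::new(merge_policy));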
Just an update with logs I got today to show that this happens without doing anything weird with the merge policies:
tantivy::indexer::segment_updater] Starting merge - [Seg("990299f8"), Seg("6c119178"), Seg("e56906ea"), Seg("8e9b2d3f"), Seg("9d77cd01"), Seg("5e378e0f"), Seg("f208a97a"), Seg("2efae50e"), Seg("621dfa08"), Seg("f7f64e9b"), Seg("ad1d1fff"), Seg("ba344d38"), Seg("ad33db6e"), Seg("d9dcd4cc"), Seg("d2262b24"), Seg("a02fb0dd"), Seg("61010fb7"), Seg("6ed569b1"), Seg("ece522d3"), Seg("eb046e13"), Seg("1ad9fba7"), Seg("d0673cb1"), Seg("f2b230fe"), Seg("a1098374"), Seg("7396832c"), Seg("f37d20e9"), Seg("db98459d"), Seg("3440ffe7"), Seg("fef807fe"), Seg("e48b3a38"), Seg("b67ab22c"), Seg("311e5fcf")]
tantivy::indexer::segment_updater] Merge of [Seg("990299f8"), Seg("6c119178"), Seg("e56906ea"), Seg("8e9b2d3f"), Seg("9d77cd01"), Seg("5e378e0f"), Seg("f208a97a"), Seg("2efae50e"), Seg("621dfa08"), Seg("f7f64e9b"), Seg("ad1d1fff"), Seg("ba344d38"), Seg("ad33db6e"), Seg("d9dcd4cc"), Seg("d2262b24"), Seg("a02fb0dd"), Seg("61010fb7"), Seg("6ed569b1"), Seg("ece522d3"), Seg("eb046e13"), Seg("1ad9fba7"), Seg("d0673cb1"), Seg("f2b230fe"), Seg("a1098374"), Seg("7396832c"), Seg("f37d20e9"), Seg("db98459d"), Seg("3440ffe7"), Seg("fef807fe"), Seg("e48b3a38"), Seg("b67ab22c"), Seg("311e5fcf")] was cancelled: InvalidArgument("The segment resulting from this merge would have 113750818 docs,which exceeds the limit 106000000.")
Merge of {..} was cancelled: InvalidArgument("The segment resulting from this merge would have 113750818 docs,which exceeds the limit 106000000.")
The LogMergePolicy only checks that a single segment does not exceed 10 million docs, but not that a merge group as a whole stays under the global threshold of 1 billion:
let size_sorted_segments = segments
    .iter()
    .filter(|seg| seg.num_docs() <= (self.max_docs_before_merge as u32)) // 10 mio check
    .sorted_by_key(|seg| std::cmp::Reverse(seg.max_doc()))
    .collect::<Vec<&SegmentMeta>>();
That's a bug, but you don't reach it here with 32 segments in the merge.
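A hypothetical sketch of the missing group-level check (illustrative, not actual tantivy code; candidate_group stands for the merge candidate under consideration):
// Reject a merge candidate whose combined doc count would exceed the hard
// limit, instead of only filtering individual segments by size.
fn group_within_limit(candidate_group: &[&SegmentMeta], limit: u64) -> bool {
    let total_docs: u64 = candidate_group
        .iter()
        .map(|seg| u64::from(seg.max_doc()))
        .sum();
    total_docs < limit
}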
For reference, what I changed to stay safe: https://github.com/Barre/tantivy/blob/8c7866be64a5573fc1884c4fe49bc8ac1e6b0b52/src/indexer/merger.rs#L27