tantivy icon indicating copy to clipboard operation
tantivy copied to clipboard

Attempt to multiply with overflow

Open Barre opened this issue 10 months ago • 22 comments

While searching my index of around 3B documents which looks like this for the biggest segments:


{
  "index_settings": {
    "docstore_compression": "lz4",
    "docstore_blocksize": 16384
  },
  "segments": [
    {
      "segment_id": "66e5b4e1-b1d5-4aed-aba6-51565f3d6acc",
      "max_doc": 1054732330,
      "deletes": null
    },
    {
      "segment_id": "61483fc5-0bd4-4e85-b9d5-1723b6d90173",
      "max_doc": 105936249,
      "deletes": null
    },
    {
      "segment_id": "0646c5b8-8bf6-4a95-be54-27724cf3f0e4",
      "max_doc": 53501794,
      "deletes": null
    },
    {
      "segment_id": "965d3f1c-d4a7-4545-946e-382fc469885b",
      "max_doc": 53411250,
      "deletes": null
    },

Performing a search such as:


        let (count, top_docs) = searcher.search(
            &query,
            &(
                Count,
                TopDocs::with_limit(items_per_page)
                    .and_offset(offset)
                    .order_by_fast_field::<DateTime>(
                        "date",
                        tantivy::Order::Desc,
                    ),
            ),
        )?;

Produces the following in debug mode

thread 'tantivy-search-2' panicked at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/bitpacker/src/bitpacker.rs:97:28:
attempt to multiply with overflow
stack backtrace:
   0: rust_begin_unwind
             at /rustc/9fc6b43126469e3858e2fe86cafb4f0fd5068869/library/std/src/panicking.rs:665:5
   1: core::panicking::panic_fmt
             at /rustc/9fc6b43126469e3858e2fe86cafb4f0fd5068869/library/core/src/panicking.rs:76:14
   2: core::panicking::panic_const::panic_const_mul_overflow
             at /rustc/9fc6b43126469e3858e2fe86cafb4f0fd5068869/library/core/src/panicking.rs:182:21
   3: tantivy_bitpacker::bitpacker::BitUnpacker::get
             at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/bitpacker/src/bitpacker.rs:97:28
   4: <tantivy_columnar::column_values::u64_based::bitpacked::BitpackedReader as tantivy_columnar::column_values::ColumnValues>::get_val
             at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/columnar/src/column_values/u64_based/bitpacked.rs:64:55
   5: <tantivy_columnar::column_values::monotonic_column::MonotonicMappingColumn<C,T,Input> as tantivy_columnar::column_values::ColumnValues<Output>>::get_val
             at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/columnar/src/column_values/monotonic_column.rs:55:24
   6: <alloc::sync::Arc<dyn tantivy_columnar::column_values::ColumnValues<T>> as tantivy_columnar::column_values::ColumnValues<T>>::get_val
             at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/columnar/src/column_values/mod.rs:201:9
   7: tantivy_columnar::column::Column<T>::values_for_doc::{{closure}}
             at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/columnar/src/column/mod.rs:136:40
   8: core::ops::function::impls::<impl core::ops::function::FnOnce<A> for &mut F>::call_once
             at /rustc/9fc6b43126469e3858e2fe86cafb4f0fd5068869/library/core/src/ops/function.rs:305:13
   9: core::option::Option<T>::map
             at /rustc/9fc6b43126469e3858e2fe86cafb4f0fd5068869/library/core/src/option.rs:1113:29
  10: <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::next
             at /rustc/9fc6b43126469e3858e2fe86cafb4f0fd5068869/library/core/src/iter/adapters/map.rs:107:26
  11: tantivy_columnar::column::Column<T>::first
             at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/columnar/src/column/mod.rs:88:9
  12: <tantivy_columnar::column::FirstValueWithDefault<T> as tantivy_columnar::column_values::ColumnValues<T>>::get_val
             at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/columnar/src/column/mod.rs:200:9
  13: <alloc::sync::Arc<dyn tantivy_columnar::column_values::ColumnValues<T>> as tantivy_columnar::column_values::ColumnValues<T>>::get_val
             at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/columnar/src/column_values/mod.rs:201:9
  14: <tantivy::collector::top_score_collector::ScorerByFastFieldReader as tantivy::collector::custom_score_top_collector::CustomSegmentScorer<u64>>::score
             at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/src/collector/top_score_collector.rs:149:21
  15: <tantivy::collector::custom_score_top_collector::CustomScoreTopSegmentCollector<T,TScore> as tantivy::collector::SegmentCollector>::collect
             at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/src/collector/custom_score_top_collector.rs:94:21
  16: <(Left,Right) as tantivy::collector::SegmentCollector>::collect
             at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/src/collector/mod.rs:342:9
  17: tantivy::collector::SegmentCollector::collect_block
             at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/src/collector/mod.rs:283:13
  18: tantivy::collector::Collector::collect_segment::{{closure}}
             at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/src/collector/mod.rs:199:21
  19: core::ops::function::impls::<impl core::ops::function::FnMut<A> for &mut F>::call_mut
             at /rustc/9fc6b43126469e3858e2fe86cafb4f0fd5068869/library/core/src/ops/function.rs:294:13
  20: tantivy::query::weight::for_each_docset_buffered
             at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/src/query/weight.rs:30:9
  21: tantivy::query::weight::Weight::for_each_no_score
             at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/src/query/weight.rs:109:9
  22: tantivy::collector::Collector::collect_segment
             at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/src/collector/mod.rs:198:17
  23: tantivy::core::searcher::Searcher::search_with_executor::{{closure}}
             at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/src/core/searcher.rs:231:17
  24: tantivy::core::executor::Executor::map::{{closure}}::{{closure}}
             at .cargo/git/checkouts/tantivy-f70b7ea03dadae9a/4aa8cd2/src/core/executor.rs:68:45
  25: rayon_core::scope::Scope::spawn::{{closure}}::{{closure}}
             at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/scope/mod.rs:526:57
  26: <core::panic::unwind_safe::AssertUnwindSafe<F> as core::ops::function::FnOnce<()>>::call_once
             at /rustc/9fc6b43126469e3858e2fe86cafb4f0fd5068869/library/core/src/panic/unwind_safe.rs:272:9
  27: std::panicking::try::do_call
             at /rustc/9fc6b43126469e3858e2fe86cafb4f0fd5068869/library/std/src/panicking.rs:557:40
  28: __rust_try
  29: std::panicking::try
             at /rustc/9fc6b43126469e3858e2fe86cafb4f0fd5068869/library/std/src/panicking.rs:520:19
  30: std::panic::catch_unwind
             at /rustc/9fc6b43126469e3858e2fe86cafb4f0fd5068869/library/std/src/panic.rs:358:14
  31: rayon_core::unwind::halt_unwinding
             at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/unwind.rs:17:5
  32: rayon_core::scope::ScopeBase::execute_job_closure
             at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/scope/mod.rs:689:28
  33: rayon_core::scope::ScopeBase::execute_job
             at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/scope/mod.rs:679:29
  34: rayon_core::scope::Scope::spawn::{{closure}}
             at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/scope/mod.rs:526:13
  35: <rayon_core::job::HeapJob<BODY> as rayon_core::job::Job>::execute
             at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/job.rs:169:9
  36: rayon_core::job::JobRef::execute
             at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/job.rs:64:9
  37: rayon_core::registry::WorkerThread::execute
             at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:860:9
  38: rayon_core::registry::WorkerThread::wait_until_cold
             at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:794:21
  39: rayon_core::registry::WorkerThread::wait_until
             at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:769:13
  40: rayon_core::registry::WorkerThread::wait_until_out_of_work
             at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:818:9
  41: rayon_core::registry::main_loop
             at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:923:5
  42: rayon_core::registry::ThreadBuilder::run
             at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:53:18
  43: <rayon_core::registry::DefaultSpawn as rayon_core::registry::ThreadSpawn>::spawn::{{closure}}
             at .cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:98:20
note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.

However, my query runs in release mode but ordering/sorting is broken.

I am running tantivy at rev 4aa8cd24707be1255599284f52eb6d388cf86ae8

Barre avatar Feb 04 '25 17:02 Barre

We have a limit currently after which don't merge segments. This limit does not take into account multi values in columns, which could exceed the value space u32. As a temporary fix we can reduce MAX_DOC_LIMIT, but a proper solution would check the number of values in each column.

/// Segment's max doc must be `< MAX_DOC_LIMIT`.
///
/// We do not allow segments with more than
pub const MAX_DOC_LIMIT: u32 = 1 << 31;

PSeitz avatar Feb 04 '25 17:02 PSeitz

We have a limit currently after which don't merge segments. This limit does not take into account multi values in columns, which could exceed the value space u32. As a temporary fix we can reduce MAX_DOC_LIMIT, but a proper solution would check the number of values in each column.

/// Segment's max doc must be `< MAX_DOC_LIMIT`.
///
/// We do not allow segments with more than
pub const MAX_DOC_LIMIT: u32 = 1 << 31;

Does that mean my current index is "toasted" and I should basically reindex? (While taking care of not having these larger segments).

Barre avatar Feb 04 '25 17:02 Barre

I think yes, it seems toasted. How many multi values per doc on average does your column have?

PSeitz avatar Feb 04 '25 18:02 PSeitz

I think yes, it seems toasted. How many multi values per doc on average does your column have?

I don't actually do multi values at all (If you mean storing multiple the same ""key"" per doc)

Barre avatar Feb 04 '25 18:02 Barre

So there are no arrays in your data? In that case, the issue is probably still similar, but not sure what exactly causes it.

PSeitz avatar Feb 04 '25 18:02 PSeitz

Yes, no arrays.

Here's my schema

        let mut document = doc!(
            self.indexer.id =>id,
            self.indexer.date => DateTime::from_timestamp_secs(something as i64),
            self.indexer.string1 =>String::new(),
            self.indexer.string2 => String::new(),
            self.indexer.string3 => String::new(),
        );

Barre avatar Feb 04 '25 19:02 Barre

@PSeitz Would a reproduction be useful? I've been thinking about generating a 1B docs segment from a minimal repo to see how things goes.

Barre avatar Feb 06 '25 18:02 Barre

@Barre is there a rationale to having such gigantic segments? We recommend around 10millions docs per segment.

fulmicoton avatar Feb 07 '25 07:02 fulmicoton

Does that mean my current index is "toasted" and I should basically reindex? (While taking care of not having these larger segments).

I'm afraid yes

fulmicoton avatar Feb 07 '25 07:02 fulmicoton

@PSeitz Would a reproduction be useful? I've been thinking about generating a 1B docs segment from a minimal repo to see how things goes.

Thanks to the stack trace you join, we know the problem is coming from here.

It computes the address of the value to get, but expressed in "bits" (we do bitpacking). It uses u32 to do so. Since you have billions of doc in the segment, your bit array is larger than 2^32 / 8 bytes and this overflows.

If the data is critical and reindexing is not an option for you, you can try to modify tantivy to use u64 in this computation.

fulmicoton avatar Feb 07 '25 07:02 fulmicoton

@Barre is there a rationale to having such gigantic segments? We recommend around 10millions docs per segment.

Thanks for the feedback on segment sizes! In my case, 10M would probably mean too many segments, and the compression ratio wouldn't be as good.

There's no particular reason why I "needed" 1B doc segments. I just indexed with default settings and it "happened naturally." Wouldn't it make sense to lower that maximum docs limit if the default merge policy can make things problematic?

If the data is critical and reindexing is not an option for you, you can try to modify tantivy to use u64 in this computation.

I ended up reindexing with a 106M max doc limit.

Barre avatar Feb 07 '25 13:02 Barre

In my case, 10M would probably mean too many segments, and the compression ratio wouldn't be as good.

I don't think this is true.

I just indexed with default settings and it "happened naturally."

This is odd. The default settings does not do this. You use a program that merges everything at the end or something like that no?

fulmicoton avatar Feb 07 '25 16:02 fulmicoton

I don't think this is true.

I was specifically thinking about the FST that may become more efficient as more entries it contains.

This is odd. The default settings does not do this. You use a program that merges everything at the end or something like that no?

I don't. It's just default settings with default merge policy.

Barre avatar Feb 07 '25 17:02 Barre

Here's how I open my index:


let mut index = IndexBuilder::new()
            .schema(schema.clone())
            .settings(IndexSettings {
                docstore_compression: tantivy::store::Compressor::Lz4,
                docstore_compress_dedicated_thread: true,
                ..default::Default::default()
            })
            .open_or_create(directory)?;


let index_writer_options = IndexWriterOptions::builder()
    .num_merge_threads(num_cpus::get_physical())
    .num_worker_threads(num_cpus::get_physical())
    .memory_budget_per_thread(1_000_000_000)
    .build();

Maybe it's because of

.num_merge_threads(num_cpus::get_physical())

which makes merging more eager? Not quite default in that case, you are right.

Barre avatar Feb 07 '25 17:02 Barre

No... This is not it. Can you share the entire main?

fulmicoton avatar Feb 07 '25 17:02 fulmicoton


#[derive(Clone)]
pub struct Indexer {
    pub id: Field,
    pub text_indexing: Field,
    pub schema: Schema,
    pub index: Index,
    pub index_reader: IndexReader,
}

impl Indexer {
    pub fn new() -> anyhow::Result<Self> {
        let mut schema_builder = Schema::builder();
        let text_indexing = TextFieldIndexing::default()
            .set_tokenizer("custom_tokenizer")
            .set_index_option(IndexRecordOption::Basic);
        let text_options = TextOptions::default()
            .set_indexing_options(text_indexing)
            .set_stored();

        let date_options = DateOptions::from(INDEXED)
            .set_stored()
            .set_fast()
            .set_precision(tantivy::schema::DateTimePrecision::Seconds);

        let id = schema_builder.add_u64_field("id", FAST | STORED);

        let date =
            schema_builder.add_date_field("date", date_options);

        let schema = schema_builder.build();

        let directory = tantivy::directory::MmapDirectory::open("/tank/tantivy/")?;

        let mut index = IndexBuilder::new()
            .schema(schema.clone())
            .settings(IndexSettings {
                docstore_compression: tantivy::store::Compressor::Lz4,
                docstore_compress_dedicated_thread: true,
                ..default::Default::default()
            })
            .open_or_create(directory)?;

        index.set_multithread_executor(num_cpus::get()).unwrap();

        index.tokenizers().register(
            "custom_tokenizer",
            TextAnalyzer::builder(CustomTokenizer)
                .filter(LowerCaser)
                .build(),
        );

        let index_reader = index.reader()?;

        Ok(Self {
            id,
            text_indexing,
            schema,
            index,
            index_reader,
        })
    }

    pub fn get_writer(&self) -> anyhow::Result<IndexWriter> {
        let index_writer_options = IndexWriterOptions::builder()
            .num_merge_threads(num_cpus::get_physical())
            .num_worker_threads(num_cpus::get_physical())
            .memory_budget_per_thread(1_000_000_000)
            .build();

        Ok(self.index.writer_with_options(index_writer_options)?)
    }
}

Barre avatar Feb 07 '25 17:02 Barre

still nothing special in there.

fulmicoton avatar Feb 07 '25 17:02 fulmicoton

To get segments that large, you should have overridden the default merge policy, or merged index on your own. You don't have code doing this?

fulmicoton avatar Feb 07 '25 17:02 fulmicoton

To get segments that large, you should have overridden the default merge policy, or merged index on your own. You don't have code doing this?

I didn't do anything like this.

Though, unless I am missing something, I am not seeing anything preventing merging large segments together in https://github.com/quickwit-oss/tantivy/blob/4aa8cd24707be1255599284f52eb6d388cf86ae8/src/indexer/log_merge_policy.rs

Barre avatar Feb 07 '25 18:02 Barre

Just an update with logs I got today to show that this happens without doing anything weird with the merge policies:

tantivy::indexer::segment_updater] Starting merge  - [Seg("990299f8"), Seg("6c119178"), Seg("e56906ea"), Seg("8e9b2d3f"), Seg("9d77cd01"), Seg("5e378e0f"), Seg("f208a97a"), Seg("2efae50e"), Seg("621dfa08"), Seg("f7f64e9b"), Seg("ad1d1fff"), Seg("ba344d38"), Seg("ad33db6e"), Seg("d9dcd4cc"), Seg("d2262b24"), Seg("a02fb0dd"), Seg("61010fb7"), Seg("6ed569b1"), Seg("ece522d3"), Seg("eb046e13"), Seg("1ad9fba7"), Seg("d0673cb1"), Seg("f2b230fe"), Seg("a1098374"), Seg("7396832c"), Seg("f37d20e9"), Seg("db98459d"), Seg("3440ffe7"), Seg("fef807fe"), Seg("e48b3a38"), Seg("b67ab22c"), Seg("311e5fcf")]
tantivy::indexer::segment_updater] Merge of [Seg("990299f8"), Seg("6c119178"), Seg("e56906ea"), Seg("8e9b2d3f"), Seg("9d77cd01"), Seg("5e378e0f"), Seg("f208a97a"), Seg("2efae50e"), Seg("621dfa08"), Seg("f7f64e9b"), Seg("ad1d1fff"), Seg("ba344d38"), Seg("ad33db6e"), Seg("d9dcd4cc"), Seg("d2262b24"), Seg("a02fb0dd"), Seg("61010fb7"), Seg("6ed569b1"), Seg("ece522d3"), Seg("eb046e13"), Seg("1ad9fba7"), Seg("d0673cb1"), Seg("f2b230fe"), Seg("a1098374"), Seg("7396832c"), Seg("f37d20e9"), Seg("db98459d"), Seg("3440ffe7"), Seg("fef807fe"), Seg("e48b3a38"), Seg("b67ab22c"), Seg("311e5fcf")] was cancelled: InvalidArgument("The segment resulting from this merge would have 113750818 docs,which exceeds the limit 106000000.")

Merge of {..} was cancelled: InvalidArgument("The segment resulting from this merge would have 113750818 docs,which exceeds the limit 106000000.")

Barre avatar Apr 17 '25 21:04 Barre

The LogMergePolicy only checks if a single segment does not exceed 10 Mio docs, but not that a group does not exceed the global threshold of 1 billion.

        let size_sorted_segments = segments
            .iter()
            .filter(|seg| seg.num_docs() <= (self.max_docs_before_merge as u32)) // 10 mio check
            .sorted_by_key(|seg| std::cmp::Reverse(seg.max_doc()))
            .collect::<Vec<&SegmentMeta>>();

That's a bug, but you don't reach this here with 32 segments in the merge.

PSeitz avatar Apr 18 '25 02:04 PSeitz

The LogMergePolicy only checks if a single segment does not exceed 10 Mio docs, but not that a group does not exceed the global threshold of 1 billion.

    let size_sorted_segments = segments
        .iter()
        .filter(|seg| seg.num_docs() <= (self.max_docs_before_merge as u32)) // 10 mio check
        .sorted_by_key(|seg| std::cmp::Reverse(seg.max_doc()))
        .collect::<Vec<&SegmentMeta>>();

That's a bug, but you don't reach this here with 32 segments in the merge.

For reference, what I changed to stay safe: https://github.com/Barre/tantivy/blob/8c7866be64a5573fc1884c4fe49bc8ac1e6b0b52/src/indexer/merger.rs#L27

Barre avatar Apr 18 '25 08:04 Barre