tantivy
tantivy copied to clipboard
Panic occurred while committing a writer when the index is sorted by a text field.
Which version of tantivy are you using? tantivy v0.21.1
To Reproduce
use tantivy::schema::{SchemaBuilder, FAST, STORED, STRING, TEXT};
use tantivy::{doc, IndexBuilder, IndexSettings, IndexSortByField, Order};
/// Minimal reproduction: creates an in-RAM index whose settings request
/// sorting (ascending) by the text field `id`, adds two documents, then commits.
///
/// Per the accompanying report, the commit panics with
/// "index out of bounds: the len is 0 but the index is 0".
fn main() -> tantivy::Result<()> {
    // Schema with two text fields: `id` (STRING | FAST | STORED) and
    // `name` (TEXT | STORED).
    let mut builder = SchemaBuilder::new();
    let id_field = builder.add_text_field("id", STRING | FAST | STORED);
    let name_field = builder.add_text_field("name", TEXT | STORED);
    let schema = builder.build();

    // Request index sorting on the text field `id`; every other setting
    // keeps its default value.
    let sort_by_id = IndexSortByField {
        field: "id".to_string(),
        order: Order::Asc,
    };
    let settings = IndexSettings {
        sort_by_field: Some(sort_by_id),
        ..Default::default()
    };

    let index = IndexBuilder::new()
        .schema(schema)
        .settings(settings)
        .create_in_ram()?;

    // Writer created with the same 50_000_000 argument as in the report.
    let mut index_writer = index.writer(50_000_000)?;
    index_writer.add_document(doc! {
        id_field => "0001",
        name_field => "name1"
    })?;
    index_writer.add_document(doc! {
        id_field => "0002",
        name_field => "name2"
    })?;

    // The report observes the panic while committing.
    index_writer.commit()?;
    index_writer.wait_merging_threads()?;
    Ok(())
}
index out of bounds: the len is 0 but the index is 0
stack backtrace:
0: std::panicking::begin_panic_handler
at /rustc/8ace7ea1f7cbba7b4f031e66c54ca237a0d65de6/library\std\src\panicking.rs:647
1: core::panicking::panic_fmt
at /rustc/8ace7ea1f7cbba7b4f031e66c54ca237a0d65de6/library\core\src\panicking.rs:72
2: core::panicking::panic_bounds_check
at /rustc/8ace7ea1f7cbba7b4f031e66c54ca237a0d65de6/library\core\src\panicking.rs:208
3: core::slice::index::impl$2::index<u32>
at /rustc/8ace7ea1f7cbba7b4f031e66c54ca237a0d65de6\library\core\src\slice\index.rs:255
4: alloc::vec::impl$12::index<u32,usize,alloc::alloc::Global>
at /rustc/8ace7ea1f7cbba7b4f031e66c54ca237a0d65de6\library\alloc\src\vec\mod.rs:2771
5: tantivy::indexer::doc_id_mapping::DocIdMapping::get_new_doc_id
at E:\Library_Store\Cargo\registry\src\mirrors.ustc.edu.cn-61ef6e0cd06fb9b8\tantivy-0.21.1\src\indexer\doc_id_mapping.rs:94
6: tantivy::postings::recorder::impl$4::serialize::closure$0
at E:\Library_Store\Cargo\registry\src\mirrors.ustc.edu.cn-61ef6e0cd06fb9b8\tantivy-0.21.1\src\postings\recorder.rs:131
7: core::ops::function::impls::impl$4::call_once<tuple$<u32>,tantivy::postings::recorder::impl$4::serialize::closure_env$0>
at /rustc/8ace7ea1f7cbba7b4f031e66c54ca237a0d65de6\library\core\src\ops\function.rs:305
8: enum2$<core::option::Option<u32> >::map
at /rustc/8ace7ea1f7cbba7b4f031e66c54ca237a0d65de6\library\core\src\option.rs:1072
9: core::iter::adapters::map::impl$2::next<u32,tantivy::postings::recorder::VInt32Reader,tantivy::postings::recorder::impl$4::serialize::closure_env$0>
at /rustc/8ace7ea1f7cbba7b4f031e66c54ca237a0d65de6\library\core\src\iter\adapters\map.rs:108
10: alloc::vec::Vec<u32,alloc::alloc::Global>::extend_desugared<u32,alloc::alloc::Global,core::iter::adapters::map::Map<tantivy::postings::recorder::VInt32Reader,tantivy::postings::recorder::impl$4::serialize::closure_env$0> >
at /rustc/8ace7ea1f7cbba7b4f031e66c54ca237a0d65de6\library\alloc\src\vec\mod.rs:2930
11: alloc::vec::spec_extend::impl$0::spec_extend<u32,core::iter::adapters::map::Map<tantivy::postings::recorder::VInt32Reader,tantivy::postings::recorder::impl$4::serialize::closure_env$0>,alloc::alloc::Global>
at /rustc/8ace7ea1f7cbba7b4f031e66c54ca237a0d65de6\library\alloc\src\vec\spec_extend.rs:17
12: alloc::vec::impl$18::extend<u32,alloc::alloc::Global,core::iter::adapters::map::Map<tantivy::postings::recorder::VInt32Reader,tantivy::postings::recorder::impl$4::serialize::closure_env$0> >
at /rustc/8ace7ea1f7cbba7b4f031e66c54ca237a0d65de6\library\alloc\src\vec\mod.rs:2904
13: tantivy::postings::recorder::impl$4::serialize
at E:\Library_Store\Cargo\registry\src\mirrors.ustc.edu.cn-61ef6e0cd06fb9b8\tantivy-0.21.1\src\postings\recorder.rs:129
14: tantivy::postings::postings_writer::SpecializedPostingsWriter<tantivy::postings::recorder::DocIdRecorder>::serialize_one_term<tantivy::postings::recorder::DocIdRecorder>
at E:\Library_Store\Cargo\registry\src\mirrors.ustc.edu.cn-61ef6e0cd06fb9b8\tantivy-0.21.1\src\postings\postings_writer.rs:175
15: tantivy::postings::postings_writer::impl$2::serialize<tantivy::postings::recorder::DocIdRecorder>
at E:\Library_Store\Cargo\registry\src\mirrors.ustc.edu.cn-61ef6e0cd06fb9b8\tantivy-0.21.1\src\postings\postings_writer.rs:214
16: tantivy::postings::postings_writer::serialize_postings
at E:\Library_Store\Cargo\registry\src\mirrors.ustc.edu.cn-61ef6e0cd06fb9b8\tantivy-0.21.1\src\postings\postings_writer.rs:66
17: tantivy::indexer::segment_writer::remap_and_write
at E:\Library_Store\Cargo\registry\src\mirrors.ustc.edu.cn-61ef6e0cd06fb9b8\tantivy-0.21.1\src\indexer\segment_writer.rs:402
18: tantivy::indexer::segment_writer::SegmentWriter::finalize
at E:\Library_Store\Cargo\registry\src\mirrors.ustc.edu.cn-61ef6e0cd06fb9b8\tantivy-0.21.1\src\indexer\segment_writer.rs:148
19: tantivy::indexer::index_writer::index_documents
at E:\Library_Store\Cargo\registry\src\mirrors.ustc.edu.cn-61ef6e0cd06fb9b8\tantivy-0.21.1\src\indexer\index_writer.rs:199
20: tantivy::indexer::index_writer::impl$0::add_indexing_worker::closure$0
at E:\Library_Store\Cargo\registry\src\mirrors.ustc.edu.cn-61ef6e0cd06fb9b8\tantivy-0.21.1\src\indexer\index_writer.rs:428
note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.
stack backtrace:
0: std::panicking::begin_panic_handler
at /rustc/8ace7ea1f7cbba7b4f031e66c54ca237a0d65de6/library\std\src\panicking.rs:647
Error: ErrorInThread("Any { .. }")
1: core::panicking::panic_fmt
at /rustc/8ace7ea1f7cbba7b4f031e66c54ca237a0d65de6/library\core\src\panicking.rs:72
sort by is only supported on numerical fields. Why do you want to sort by text field?
sort by is only supported on numerical fields. Why do you want to sort by text field?
Because my source data comes from MongoDB. Its id field is of type ObjectId, and there is no way to store an ObjectId as a number in tantivy. Therefore, I use text to store the id field. For keeping the order, I would like to sort by the id field with text type.
I suggest we just remove the ability to sort the index by something. It has brought more bugs, confusion than any other feature.
Because tantivy has its own docId field, and tantivy's interface does not provide a save method (override by query), query results become confusing in concurrent testing (single-threaded modification + multithreaded query). So, I think index sort is necessary.
I don't understand your sentence.
For keeping the order, I would like to sort by the id field with text type.
Why do you want to keep order in the tantivy index?
I suggest we just remove the ability to sort the index by something. It has brought more bugs, confusion than any other feature.
I also don't think its maintenance cost justifies the potential gains currently. In practice there's little to no benefit, and some confusion around it. Range queries could be accelerated by using binary search instead of a full scan, but we don't do that currently. Compression may be improved, though we don't have much data about that.
Tantivy users may have custom queries on top of sorting. Hard to tell if and how they use it, but only performance should be affected when removing it.
If we decide to remove it I would add a deprecation warning in the upcoming release.