jieba-rs
jieba-rs copied to clipboard
tantivy 0.24.1 使用 jieba-rs 0.7.2 索引大量文件、且每个文件都比较大的情况下崩溃
tantivy 0.24.1 使用 jieba-rs 0.7.2 索引大量文件时崩溃。每一个文件都比较大,其中最大的 txt 有 250 多 MB。
具体错误表现是memory allocation of 4024077400 bytes failed
崩溃的时候堆栈调用是
[Inlined] std::sys::pal::windows::abort_internal() mod.rs:340
[Inlined] std::process::abort() process.rs:2425
std::alloc::rust_oom() alloc.rs:377
std::alloc::_::__rg_oom() alloc.rs:372
[Inlined] alloc::alloc::handle_alloc_error::rt_error() alloc.rs:405
alloc::alloc::handle_alloc_error() alloc.rs:411
alloc::raw_vec::handle_error() mod.rs:798
alloc::raw_vec::RawVecInner<alloc::alloc::Global>::with_capacity_in<alloc::alloc::Global>(u64,alloc::alloc::Global,core::alloc::layout::Layout,*mut core::panic::location::Location) mod.rs:430
[Inlined] alloc::raw_vec::RawVec<usize,alloc::alloc::Global>::with_capacity_in(u64,alloc::alloc::Global,*mut core::panic::location::Location) mod.rs:190
[Inlined] alloc::vec::Vec<usize,alloc::alloc::Global>::with_capacity_in(u64,alloc::alloc::Global,*mut core::panic::location::Location) mod.rs:815
alloc::vec::Vec<usize,alloc::alloc::Global>::with_capacity<usize>(u64,*mut core::panic::location::Location) mod.rs:495
jieba_rs::sparse_dag::StaticSparseDAG::with_size_hint(u64) sparse_dag.rs:37
jieba_rs::Jieba::cut_internal(ref$<str$>,bool,bool) lib.rs:656
jieba_rs::Jieba::cut(ref$<str$>,bool) lib.rs:711
jieba_rs::Jieba::tokenize(ref$<str$>,jieba_rs::TokenizeMode,bool) lib.rs:777
tantivy_jieba::impl$1::token_stream(*mut tantivy_jieba::JiebaTokenizer,ref$<str$>) lib.rs:74
tantivy::tokenizer::tokenizer::impl$2::box_token_stream<tantivy_jieba::JiebaTokenizer>(*mut tantivy_jieba::JiebaTokenizer,ref$<str$>) tokenizer.rs:42
tantivy::tokenizer::tokenizer::impl$0::token_stream(*mut alloc::boxed::Box<dyn$<tantivy::tokenizer::tokenizer::BoxableTokenizer>,alloc::alloc::Global>,ref$<str$>) tokenizer.rs:20
tantivy::tokenizer::tokenizer::TextAnalyzer::token_stream(ref$<str$>) tokenizer.rs:69
tantivy::indexer::segment_writer::SegmentWriter::index_document<tantivy::schema::document::default_document::CompactDoc>(*mut tantivy::schema::document::default_document::CompactDoc) segment_writer.rs:201
tantivy::indexer::segment_writer::SegmentWriter::add_document<tantivy::schema::document::default_document::CompactDoc>(tantivy::indexer::operation::AddOperation<tantivy::schema::document::default_document::CompactDoc>) segment_writer.rs:355
tantivy::indexer::index_writer::index_documents<tantivy::schema::document::default_document::CompactDoc>(u64,tantivy::index::segment::Segment,ref_mut$<dyn$<core::iter::traits::iterator::Iterator<assoc$<Item,smallvec::SmallVec<array$<tantivy::indexer::operation::AddOperation<tantivy::schema::document::default_document::CompactDoc>,4> > > > > >,*mut tantivy::indexer::segment_updater::SegmentUpdater,tantivy::indexer::delete_queue::DeleteCursor) index_writer.rs:192
tantivy::indexer::index_writer::impl$0::add_indexing_worker::closure$0<tantivy::schema::document::default_document::CompactDoc>(tantivy::indexer::index_writer::impl$0::add_indexing_worker::closure_env$0<tantivy::schema::document::default_document::CompactDoc>) index_writer.rs:450
std::sys::backtrace::__rust_begin_short_backtrace<tantivy::indexer::index_writer::impl$0::add_indexing_worker::closure_env$0<tantivy::schema::document::default_document::CompactDoc>,enum2$<core::result::Result<tuple$<>,enum2$<tantivy::error::TantivyError> > > >(tantivy::indexer::index_writer::impl$0::add_indexing_worker::closure_env$0<tantivy::schema::document::default_document::CompactDoc>) backtrace.rs:152
std::thread::impl$0::spawn_unchecked_::closure$1::closure$0<tantivy::indexer::index_writer::impl$0::add_indexing_worker::closure_env$0<tantivy::schema::document::default_document::CompactDoc>,enum2$<core::result::Result<tuple$<>,enum2$<tantivy::error::TantivyError> > > >(std::thread::impl$0::spawn_unchecked_::closure$1::closure_env$0<tantivy::indexer::index_writer::impl$0::add_indexing_worker::closure_env$0<tantivy::schema::document::default_document::CompactDoc>,enum2$<core::result::Result<tuple$<>,enum2$<tantivy::error::TantivyError> > > >) mod.rs:559
core::panic::unwind_safe::impl$25::call_once<enum2$<core::result::Result<tuple$<>,enum2$<tantivy::error::TantivyError> > >,std::thread::impl$0::spawn_unchecked_::closure$1::closure_env$0<tantivy::indexer::index_writer::impl$0::add_indexing_worker::closure_env$0<tantivy::schema::document::default_document::CompactDoc>,enum2$<core::result::Result<tuple$<>,enum2$<tantivy::error::TantivyError> > > > >(core::panic::unwind_safe::AssertUnwindSafe<std::thread::impl$0::spawn_unchecked_::closure$1::closure_env$0<tantivy::indexer::index_writer::impl$0::add_indexing_worker::closure_env$0<tantivy::schema::document::default_document::CompactDoc>,enum2$<core::result::Result<tuple$<>,enum2$<tantivy::error::TantivyError> > > > >) unwind_safe.rs:272
std::panicking::try::do_call<core::panic::unwind_safe::AssertUnwindSafe<std::thread::impl$0::spawn_unchecked_::closure$1::closure_env$0<tantivy::indexer::index_writer::impl$0::add_indexing_worker::closure_env$0<tantivy::schema::document::default_document::CompactDoc>,enum2$<core::result::Result<tuple$<>,enum2$<tantivy::error::TantivyError> > > > >,enum2$<core::result::Result<tuple$<>,enum2$<tantivy::error::TantivyError> > > >(*mut u8) panicking.rs:589
<unknown> 0x00007ff66b5056c3
[Inlined] std::panicking::try(core::panic::unwind_safe::AssertUnwindSafe<std::thread::impl$0::spawn_unchecked_::closure$1::closure_env$0<tantivy::indexer::index_writer::impl$0::add_indexing_worker::closure_env$0<tantivy::schema::document::default_document::CompactDoc>,enum2$<core::result::Result<tuple$<>,enum2$<tantivy::error::TantivyError> > > > >) panicking.rs:552
[Inlined] std::panic::catch_unwind(core::panic::unwind_safe::AssertUnwindSafe<std::thread::impl$0::spawn_unchecked_::closure$1::closure_env$0<tantivy::indexer::index_writer::impl$0::add_indexing_worker::closure_env$0<tantivy::schema::document::default_document::CompactDoc>,enum2$<core::result::Result<tuple$<>,enum2$<tantivy::error::TantivyError> > > > >) panic.rs:359
std::thread::impl$0::spawn_unchecked_::closure$1<tantivy::indexer::index_writer::impl$0::add_indexing_worker::closure_env$0<tantivy::schema::document::default_document::CompactDoc>,enum2$<core::result::Result<tuple$<>,enum2$<tantivy::error::TantivyError> > > >(std::thread::impl$0::spawn_unchecked_::closure_env$1<tantivy::indexer::index_writer::impl$0::add_indexing_worker::closure_env$0<tantivy::schema::document::default_document::CompactDoc>,enum2$<core::result::Result<tuple$<>,enum2$<tantivy::error::TantivyError> > > >) mod.rs:557
core::ops::function::FnOnce::call_once<std::thread::impl$0::spawn_unchecked_::closure_env$1<tantivy::indexer::index_writer::impl$0::add_indexing_worker::closure_env$0<tantivy::schema::document::default_document::CompactDoc>,enum2$<core::result::Result<tuple$<>,enum2$<tantivy::error::TantivyError> > > >,tuple$<> >(*mut std::thread::impl$0::spawn_unchecked_::closure_env$1<tantivy::indexer::index_writer::impl$0::add_indexing_worker::closure_env$0<tantivy::schema::document::default_document::CompactDoc>,enum2$<core::result::Result<tuple$<>,enum2$<tantivy::error::TantivyError> > > >) function.rs:250
[Inlined] alloc::boxed::impl$28::call_once() boxed.rs:1966
[Inlined] alloc::boxed::impl$28::call_once() boxed.rs:1966
std::sys::pal::windows::thread::impl$0::new::thread_start() thread.rs:56
<unknown> 0x00007ffcffc2259d
<unknown> 0x00007ffd016eaf78
Could you please try to make a minimal repro? Thanks.
Could you please try to make a minimal repro? Thanks.
感谢回复,请查看https://github.com/hkhk368/jieba-rs-test
用法很简单:创建一个 test.txt,里面放入超过 200MB 的大量文本。我在 Windows 11 23H2、32G 内存的环境下运行,就会出现 memory allocation of 4024077400 bytes failed,崩溃时的堆栈就是上面那个。
我的test.txt的内容是这样的,我遍历我的整个硬盘,把每一个文件的完整路径,文件名,文件大小,文件修改时间等信息都放到这个txt中,这个txt就会非常大,因为我文件非常多,然后运行我的repo就会复现上面的错误。