rust-punkt
rust-punkt copied to clipboard
Panic with multibyte string
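It seems that the length of a string is calculated incorrectly somewhere: training on a multibyte (non-ASCII) string panics because a string is sliced at a byte index that is not a character boundary.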
Code to reproduce:
use punkt::*;
use punkt::params::*;
/// Minimal reproduction of the panic: train on a short Cyrillic text, then
/// iterate over the sentences produced for that same text and debug-print them.
fn main() {
    // Non-ASCII input: the letters here are multibyte in UTF-8.
    let text = "Функция. Речи.";

    let trainer = Trainer::<Standard>::new();
    let mut training_data = TrainingData::new();
    // Per the reported backtrace, the panic is raised inside this call.
    trainer.train(text, &mut training_data);

    let sentences = SentenceTokenizer::<Standard>::new(text, &training_data);
    for sentence in sentences {
        println!("{:?}", sentence);
    }
}
RUST_BACKTRACE=1 cargo run
Finished dev [unoptimized + debuginfo] target(s) in 0.02s
Running `target/debug/comprehensibility`
thread 'main' panicked at 'byte index 13 is not a char boundary; it is inside 'я' (bytes 12..14) of `функция`', src/libcore/str/mod.rs:2036:5
stack backtrace:
0: backtrace::backtrace::libunwind::trace
at /cargo/registry/src/github.com-1ecc6299db9ec823/backtrace-0.3.29/src/backtrace/libunwind.rs:88
1: backtrace::backtrace::trace_unsynchronized
at /cargo/registry/src/github.com-1ecc6299db9ec823/backtrace-0.3.29/src/backtrace/mod.rs:66
2: std::sys_common::backtrace::_print
at src/libstd/sys_common/backtrace.rs:47
3: std::sys_common::backtrace::print
at src/libstd/sys_common/backtrace.rs:36
4: std::panicking::default_hook::{{closure}}
at src/libstd/panicking.rs:200
5: std::panicking::default_hook
at src/libstd/panicking.rs:214
6: std::panicking::rust_panic_with_hook
at src/libstd/panicking.rs:477
7: std::panicking::continue_panic_fmt
at src/libstd/panicking.rs:384
8: rust_begin_unwind
at src/libstd/panicking.rs:311
9: core::panicking::panic_fmt
at src/libcore/panicking.rs:85
10: core::str::slice_error_fail
at src/libcore/str/mod.rs:0
11: core::str::traits::<impl core::slice::SliceIndex<str> for core::ops::range::RangeTo<usize>>::index::{{closure}}
at /rustc/4560cb830fce63fcffdc4558f4281aaac6a3a1ba/src/libcore/str/mod.rs:1823
12: core::option::Option<T>::unwrap_or_else
at /rustc/4560cb830fce63fcffdc4558f4281aaac6a3a1ba/src/libcore/option.rs:419
13: core::str::traits::<impl core::slice::SliceIndex<str> for core::ops::range::RangeTo<usize>>::index
at /rustc/4560cb830fce63fcffdc4558f4281aaac6a3a1ba/src/libcore/str/mod.rs:1823
14: core::str::traits::<impl core::ops::index::Index<I> for str>::index
at /rustc/4560cb830fce63fcffdc4558f4281aaac6a3a1ba/src/libcore/str/mod.rs:1625
15: punkt::trainer::is_rare_abbrev_type
at /home/kirill/.cargo/registry/src/github.com-1ecc6299db9ec823/punkt-1.0.5/src/trainer.rs:449
16: punkt::trainer::Trainer<P>::train
at /home/kirill/.cargo/registry/src/github.com-1ecc6299db9ec823/punkt-1.0.5/src/trainer.rs:382
17: comprehensibility::main
at src/main.rs:10
18: std::rt::lang_start::{{closure}}
at /rustc/4560cb830fce63fcffdc4558f4281aaac6a3a1ba/src/libstd/rt.rs:64
19: std::rt::lang_start_internal::{{closure}}
at src/libstd/rt.rs:49
20: std::panicking::try::do_call
at src/libstd/panicking.rs:296
21: __rust_maybe_catch_panic
at src/libpanic_unwind/lib.rs:80
22: std::panicking::try
at src/libstd/panicking.rs:275
23: std::panic::catch_unwind
at src/libstd/panic.rs:394
24: std::rt::lang_start_internal
at src/libstd/rt.rs:48
25: std::rt::lang_start
at /rustc/4560cb830fce63fcffdc4558f4281aaac6a3a1ba/src/libstd/rt.rs:64
26: main
27: __libc_start_main
28: _start
note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.