logos icon indicating copy to clipboard operation
logos copied to clipboard

Lexer panics when parsing multi-byte character with `.` pattern

Open andrewhickman opened this issue 2 years ago • 4 comments

This program:

use logos::{Logos, Lexer};

// Minimal token type used to reproduce the bug: the lexer is derived by
// `logos`, and tokens borrow their text from the input for lifetime `'a`.
#[derive(Logos, Debug, PartialEq)]
pub enum Token<'a> {
    // Any single character matched by the regex `.`; the payload is the
    // matched slice of the input. This is the pattern the issue is about.
    #[regex(".")]
    Char(&'a str),
    // Variant designated as the lexer's error token via `#[error]`.
    #[error]
    Error,
}

// Reproduction driver: lexes a single multi-byte character and checks the
// first token.
fn main() {
    // The whole input is one emoji, i.e. one character spanning several bytes.
    let mut lexer = Token::lexer("😀");

    // Expected: a single `Char` token holding the whole emoji. Per the report
    // this assertion fails, and formatting the failure message panics again
    // while slicing the string (see the backtrace that follows).
    assert_eq!(lexer.next(), Some(Token::Char("😀")));
}

fails with the following backtrace:

Backtrace:
thread 'main' panicked at 'byte index 4 is out of bounds of `stack backtrace:
   0:     0x7ff657108e71 - std::backtrace_rs::backtrace::dbghelp::trace
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\..\..\backtrace\src\backtrace\dbghelp.rs:98
   1:     0x7ff657108e71 - std::backtrace_rs::backtrace::trace_unsynchronized
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\..\..\backtrace\src\backtrace\mod.rs:66
   2:     0x7ff657108e71 - std::sys_common::backtrace::_print_fmt
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\sys_common\backtrace.rs:66
   3:     0x7ff657108e71 - std::sys_common::backtrace::_print::impl$0::fmt
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\sys_common\backtrace.rs:45
   4:     0x7ff6571178aa - core::fmt::write
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\fmt\mod.rs:1190
   5:     0x7ff657106379 - std::io::Write::write_fmt<std::sys::windows::stdio::Stderr>
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\io\mod.rs:1657
   6:     0x7ff65710ae72 - std::sys_common::backtrace::_print
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\sys_common\backtrace.rs:48
   7:     0x7ff65710ae72 - std::sys_common::backtrace::print
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\sys_common\backtrace.rs:35
   8:     0x7ff65710ae72 - std::panicking::default_hook::closure$1
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:295
   9:     0x7ff65710aa35 - std::panicking::default_hook
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:314
  10:     0x7ff65710b4c8 - std::panicking::rust_panic_with_hook
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:698
  11:     0x7ff65710b37d - std::panicking::begin_panic_handler::closure$0
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:588
  12:     0x7ff6571097a7 - std::sys_common::backtrace::__rust_end_short_backtrace<std::panicking::begin_panic_handler::closure_env$0,never$>        
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\sys_common\backtrace.rs:138
  13:     0x7ff65710aff9 - std::panicking::begin_panic_handler
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:584
  14:     0x7ff65711ccd5 - core::panicking::panic_fmt
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\panicking.rs:143
  15:     0x7ff65711d166 - core::fmt::Arguments::new_v1
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\fmt\mod.rs:387
  16:     0x7ff65711d166 - core::str::slice_error_fail
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\str\mod.rs:91
  17:     0x7ff657118847 - core::str::traits::impl$7::index
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\str\traits.rs:214
  18:     0x7ff657118847 - core::str::traits::impl$4::index
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\str\traits.rs:64
  19:     0x7ff657118847 - core::fmt::impl$16::fmt
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\fmt\mod.rs:2189
  20:     0x7ff657103ca0 - core::fmt::impl$52::fmt<str>
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\core\src\fmt\mod.rs:2128
  21:     0x7ff657103cd6 - core::fmt::impl$52::fmt<str>
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\core\src\fmt\mod.rs:2128
  22:     0x7ff6571171ae - core::fmt::builders::impl$4::field::closure$0
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\fmt\builders.rs:332
  23:     0x7ff6571171ae - core::result::Result::and_then
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\result.rs:1311
  24:     0x7ff6571171ae - core::fmt::builders::DebugTuple::field
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\fmt\builders.rs:338
  25:     0x7ff65710325c - logos_repro::impl$1::fmt
                               at C:\Users\andre\Repositories\prost-compiler\logos-repro\src\main.rs:3
  26:     0x7ff6571033a6 - core::fmt::impl$52::fmt<enum$<logos_repro::Token, 1, 18446744073709551615, Char> >
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\core\src\fmt\mod.rs:2128
  27:     0x7ff6571171ae - core::fmt::builders::impl$4::field::closure$0
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\fmt\builders.rs:332
  28:     0x7ff6571171ae - core::result::Result::and_then
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\result.rs:1311
  29:     0x7ff6571171ae - core::fmt::builders::DebugTuple::field
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\fmt\builders.rs:338
  30:     0x7ff6571018b9 - core::option::impl$46::fmt<enum$<logos_repro::Token, 1, 18446744073709551615, Char> >
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\core\src\option.rs:512
  31:     0x7ff6571018f6 - core::fmt::impl$52::fmt<enum$<core::option::Option<enum$<logos_repro::Token, 1, 18446744073709551615, Char> > > >        
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\core\src\fmt\mod.rs:2128
  32:     0x7ff6571178aa - core::fmt::write
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\fmt\mod.rs:1190
  33:     0x7ff65710b1f3 - core::fmt::Write::write_fmt
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\core\src\fmt\mod.rs:186
  34:     0x7ff65710b1f3 - std::panicking::begin_panic_handler::impl$0::fill::closure$0
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:550
  35:     0x7ff65710b1f3 - core::option::Option::get_or_insert_with
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\core\src\option.rs:1522
  36:     0x7ff65710b1f3 - std::panicking::begin_panic_handler::PanicPayload::fill
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:548
  37:     0x7ff65710b1f3 - std::panicking::begin_panic_handler::impl$1::get
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:566
  38:     0x7ff65710b4b7 - std::panicking::rust_panic_with_hook
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:697
  39:     0x7ff65710b37d - std::panicking::begin_panic_handler::closure$0
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:588
  40:     0x7ff6571097a7 - std::sys_common::backtrace::__rust_end_short_backtrace<std::panicking::begin_panic_handler::closure_env$0,never$>        
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\sys_common\backtrace.rs:138
  41:     0x7ff65710aff9 - std::panicking::begin_panic_handler
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:584
  42:     0x7ff65711ccd5 - core::panicking::panic_fmt
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\panicking.rs:143
  43:     0x7ff657116b81 - core::fmt::Arguments::new_v1
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\fmt\mod.rs:387
  44:     0x7ff657116b81 - core::panicking::assert_failed_inner
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\panicking.rs:225
  45:     0x7ff6571025aa - core::panicking::assert_failed<enum$<core::option::Option<enum$<logos_repro::Token, 1, 18446744073709551615, Char> > >,enum$<core::option::Option<enum$<logos_repro::Token, 1, 18446744073709551615, Char> > > >
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\core\src\panicking.rs:182
  46:     0x7ff657102feb - logos_repro::main
                               at C:\Users\andre\Repositories\prost-compiler\logos-repro\src\main.rs:14
  47:     0x7ff6571023cb - core::ops::function::FnOnce::call_once<void (*)(),tuple$<> >
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\core\src\ops\function.rs:227
  48:     0x7ff65710408b - std::sys_common::backtrace::__rust_begin_short_backtrace<void (*)(),tuple$<> >
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\std\src\sys_common\backtrace.rs:122
  49:     0x7ff657101191 - std::rt::lang_start::closure$0<tuple$<> >
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\std\src\rt.rs:145
  50:     0x7ff6571086df - core::ops::function::impls::impl$2::call_once
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\core\src\ops\function.rs:259
  51:     0x7ff6571086df - std::panicking::try::do_call
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:492
  52:     0x7ff6571086df - std::panicking::try
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:456
  53:     0x7ff6571086df - std::panic::catch_unwind
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panic.rs:137
  54:     0x7ff6571086df - std::rt::lang_start_internal::closure$2
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\rt.rs:128
  55:     0x7ff6571086df - std::panicking::try::do_call
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:492
  56:     0x7ff6571086df - std::panicking::try
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:456
  57:     0x7ff6571086df - std::panic::catch_unwind
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panic.rs:137
  58:     0x7ff6571086df - std::rt::lang_start_internal
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\rt.rs:128
  59:     0x7ff65710115f - std::rt::lang_start<tuple$<> >
                               at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\std\src\rt.rs:144
  60:     0x7ff657103356 - main
  61:     0x7ff65711b67c - invoke_main
                               at d:\agent\_work\4\s\src\vctools\crt\vcstartup\src\startup\exe_common.inl:78
  62:     0x7ff65711b67c - __scrt_common_main_seh
                               at d:\agent\_work\4\s\src\vctools\crt\vcstartup\src\startup\exe_common.inl:288
  63:     0x7ffc1ed47034 - BaseThreadInitThunk
  64:     0x7ffc20822651 - RtlUserThreadStart
thread panicked while panicking. aborting.
error: process didn't exit successfully: `target\debug\logos-repro.exe` (exit code: 0xc0000409, STATUS_STACK_BUFFER_OVERRUN)

andrewhickman avatar Jun 10 '22 22:06 andrewhickman

Well, that's a corner I cut, and I should have been smarter about it when I did. Should be an easy enough fix.

maciejhirsz avatar Jun 11 '22 06:06 maciejhirsz

Also hitting this recently. Is there any workaround I can apply now?

P.S. It seems replacing . with \p{ANY} does not fix the problem, either.

ruifengx avatar Jun 24 '22 10:06 ruifengx

Silly me. There is indeed a simple workaround.

Define this callback to parse the multi-byte character manually:

/// Lexer callback that widens the current match so that it covers exactly one
/// whole character, measured from the start of the match.
///
/// Panics if no character is available at the start of the current match.
fn single_any_char(lexer: &mut Lexer<StringLexeme>) {
    let matched = lexer.span();
    let rest = &lexer.source()[matched.start..];
    // number of bytes the first character occupies in the input
    let width = rest.chars().next().unwrap().len_utf8();
    // advance by whatever part of that character is not already matched
    lexer.bump(width - matched.len());
    // if desired, return `lexer.slice()` here
}

and then use it on the variant:

// Token type for the workaround: same shape as the reproduction, but the `.`
// pattern is paired with the `single_any_char` callback.
#[derive(Logos)]
pub enum Token {
    // Matches `.` and then runs `single_any_char` on the lexer.
    #[regex(".", single_any_char)]
    Char,
    // Variant designated as the lexer's error token via `#[error]`.
    #[error]
    Error,
}

Sidenote: It seems Lexer::slice calls the unsafe function str::get_unchecked, so if my understanding is correct, the program as illustrated by OP triggers undefined behavior (UB) inside the library. Fortunately, for the callback above it should be fine (the invariant is guaranteed by the Chars iterator and the check in Lexer::bump).

ruifengx avatar Jun 24 '22 11:06 ruifengx

@andrewhickman @ruifengx I'm pretty busy right now, but if either of you would tackle this, the culprit is here exactly:

https://github.com/maciejhirsz/logos/blob/51c1f8c1bca990758e05c5600becc6d9c10bd6e4/logos-codegen/src/graph/regex.rs#L123-L123

That !is_ascii(&class) check forces the following branch, which converts unicode ranges (char to char) that don't have any non-ASCII breakpoints into byte ranges (u8 to u8).

Just removing that check should fix this bug, although it might break other tests that depend on a flat structure. Treating unicode ranges with no breakpoints > 127 as byte ranges is safe when performed in loops ([^x]+ for example), since all non-ASCII unicode bytes fall into the 128..=255 range (the topmost bit is always set for the first byte and all continuation bytes in UTF-8-encoded scalars), which is why this optimization exists there in the first place.

In other words the "proper" solution here would be changing this check to !is_ascii(&class) || !parent_is_loop (if I'm doing the bool logic in my head right) and then propagating parent_is_loop as an argument through recursive parse_mir calls. The only place where it would need to be set to true is here:

https://github.com/maciejhirsz/logos/blob/51c1f8c1bca990758e05c5600becc6d9c10bd6e4/logos-codegen/src/graph/regex.rs#L22-L33

maciejhirsz avatar Jun 24 '22 12:06 maciejhirsz