logos
logos copied to clipboard
Lexer panics when parsing multi-byte character with `.` pattern
This program:
use logos::{Logos, Lexer};
/// Token type for the reproduction: a single `.` pattern plus the error variant.
#[derive(Logos, Debug, PartialEq)]
pub enum Token<'a> {
// `.` is meant to match one character; the variant is expected to carry
// the matched text (the assertion in `main` compares it against "😀").
#[regex(".")]
Char(&'a str),
// Variant marked with logos' `#[error]` attribute.
#[error]
Error,
}
// Lexes a single multi-byte (four bytes in UTF-8) character and expects it
// back as exactly one `Char` token.
fn main() {
let mut lexer = Token::lexer("😀");
// Per this report the assertion fails, and formatting the failure message
// panics a second time while Debug-printing the returned token's string
// slice ("thread panicked while panicking" in the backtrace).
assert_eq!(lexer.next(), Some(Token::Char("😀")));
}
fails with the following backtrace:
Backtrace:
thread 'main' panicked at 'byte index 4 is out of bounds of `stack backtrace:
0: 0x7ff657108e71 - std::backtrace_rs::backtrace::dbghelp::trace
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\..\..\backtrace\src\backtrace\dbghelp.rs:98
1: 0x7ff657108e71 - std::backtrace_rs::backtrace::trace_unsynchronized
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\..\..\backtrace\src\backtrace\mod.rs:66
2: 0x7ff657108e71 - std::sys_common::backtrace::_print_fmt
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\sys_common\backtrace.rs:66
3: 0x7ff657108e71 - std::sys_common::backtrace::_print::impl$0::fmt
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\sys_common\backtrace.rs:45
4: 0x7ff6571178aa - core::fmt::write
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\fmt\mod.rs:1190
5: 0x7ff657106379 - std::io::Write::write_fmt<std::sys::windows::stdio::Stderr>
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\io\mod.rs:1657
6: 0x7ff65710ae72 - std::sys_common::backtrace::_print
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\sys_common\backtrace.rs:48
7: 0x7ff65710ae72 - std::sys_common::backtrace::print
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\sys_common\backtrace.rs:35
8: 0x7ff65710ae72 - std::panicking::default_hook::closure$1
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:295
9: 0x7ff65710aa35 - std::panicking::default_hook
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:314
10: 0x7ff65710b4c8 - std::panicking::rust_panic_with_hook
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:698
11: 0x7ff65710b37d - std::panicking::begin_panic_handler::closure$0
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:588
12: 0x7ff6571097a7 - std::sys_common::backtrace::__rust_end_short_backtrace<std::panicking::begin_panic_handler::closure_env$0,never$>
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\sys_common\backtrace.rs:138
13: 0x7ff65710aff9 - std::panicking::begin_panic_handler
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:584
14: 0x7ff65711ccd5 - core::panicking::panic_fmt
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\panicking.rs:143
15: 0x7ff65711d166 - core::fmt::Arguments::new_v1
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\fmt\mod.rs:387
16: 0x7ff65711d166 - core::str::slice_error_fail
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\str\mod.rs:91
17: 0x7ff657118847 - core::str::traits::impl$7::index
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\str\traits.rs:214
18: 0x7ff657118847 - core::str::traits::impl$4::index
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\str\traits.rs:64
19: 0x7ff657118847 - core::fmt::impl$16::fmt
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\fmt\mod.rs:2189
20: 0x7ff657103ca0 - core::fmt::impl$52::fmt<str>
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\core\src\fmt\mod.rs:2128
21: 0x7ff657103cd6 - core::fmt::impl$52::fmt<str>
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\core\src\fmt\mod.rs:2128
22: 0x7ff6571171ae - core::fmt::builders::impl$4::field::closure$0
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\fmt\builders.rs:332
23: 0x7ff6571171ae - core::result::Result::and_then
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\result.rs:1311
24: 0x7ff6571171ae - core::fmt::builders::DebugTuple::field
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\fmt\builders.rs:338
25: 0x7ff65710325c - logos_repro::impl$1::fmt
at C:\Users\andre\Repositories\prost-compiler\logos-repro\src\main.rs:3
26: 0x7ff6571033a6 - core::fmt::impl$52::fmt<enum$<logos_repro::Token, 1, 18446744073709551615, Char> >
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\core\src\fmt\mod.rs:2128
27: 0x7ff6571171ae - core::fmt::builders::impl$4::field::closure$0
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\fmt\builders.rs:332
28: 0x7ff6571171ae - core::result::Result::and_then
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\result.rs:1311
29: 0x7ff6571171ae - core::fmt::builders::DebugTuple::field
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\fmt\builders.rs:338
30: 0x7ff6571018b9 - core::option::impl$46::fmt<enum$<logos_repro::Token, 1, 18446744073709551615, Char> >
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\core\src\option.rs:512
31: 0x7ff6571018f6 - core::fmt::impl$52::fmt<enum$<core::option::Option<enum$<logos_repro::Token, 1, 18446744073709551615, Char> > > >
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\core\src\fmt\mod.rs:2128
32: 0x7ff6571178aa - core::fmt::write
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\fmt\mod.rs:1190
33: 0x7ff65710b1f3 - core::fmt::Write::write_fmt
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\core\src\fmt\mod.rs:186
34: 0x7ff65710b1f3 - std::panicking::begin_panic_handler::impl$0::fill::closure$0
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:550
35: 0x7ff65710b1f3 - core::option::Option::get_or_insert_with
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\core\src\option.rs:1522
36: 0x7ff65710b1f3 - std::panicking::begin_panic_handler::PanicPayload::fill
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:548
37: 0x7ff65710b1f3 - std::panicking::begin_panic_handler::impl$1::get
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:566
38: 0x7ff65710b4b7 - std::panicking::rust_panic_with_hook
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:697
39: 0x7ff65710b37d - std::panicking::begin_panic_handler::closure$0
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:588
40: 0x7ff6571097a7 - std::sys_common::backtrace::__rust_end_short_backtrace<std::panicking::begin_panic_handler::closure_env$0,never$>
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\sys_common\backtrace.rs:138
41: 0x7ff65710aff9 - std::panicking::begin_panic_handler
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:584
42: 0x7ff65711ccd5 - core::panicking::panic_fmt
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\panicking.rs:143
43: 0x7ff657116b81 - core::fmt::Arguments::new_v1
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\fmt\mod.rs:387
44: 0x7ff657116b81 - core::panicking::assert_failed_inner
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\core\src\panicking.rs:225
45: 0x7ff6571025aa - core::panicking::assert_failed<enum$<core::option::Option<enum$<logos_repro::Token, 1, 18446744073709551615, Char> > >,enum$<core::option::Option<enum$<logos_repro::Token, 1, 18446744073709551615, Char> > > >
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\core\src\panicking.rs:182
46: 0x7ff657102feb - logos_repro::main
at C:\Users\andre\Repositories\prost-compiler\logos-repro\src\main.rs:14
47: 0x7ff6571023cb - core::ops::function::FnOnce::call_once<void (*)(),tuple$<> >
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\core\src\ops\function.rs:227
48: 0x7ff65710408b - std::sys_common::backtrace::__rust_begin_short_backtrace<void (*)(),tuple$<> >
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\std\src\sys_common\backtrace.rs:122
49: 0x7ff657101191 - std::rt::lang_start::closure$0<tuple$<> >
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\std\src\rt.rs:145
50: 0x7ff6571086df - core::ops::function::impls::impl$2::call_once
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\core\src\ops\function.rs:259
51: 0x7ff6571086df - std::panicking::try::do_call
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:492
52: 0x7ff6571086df - std::panicking::try
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:456
53: 0x7ff6571086df - std::panic::catch_unwind
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panic.rs:137
54: 0x7ff6571086df - std::rt::lang_start_internal::closure$2
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\rt.rs:128
55: 0x7ff6571086df - std::panicking::try::do_call
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:492
56: 0x7ff6571086df - std::panicking::try
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panicking.rs:456
57: 0x7ff6571086df - std::panic::catch_unwind
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\panic.rs:137
58: 0x7ff6571086df - std::rt::lang_start_internal
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\/library\std\src\rt.rs:128
59: 0x7ff65710115f - std::rt::lang_start<tuple$<> >
at /rustc/7737e0b5c4103216d6fd8cf941b7ab9bdbaace7c\library\std\src\rt.rs:144
60: 0x7ff657103356 - main
61: 0x7ff65711b67c - invoke_main
at d:\agent\_work\4\s\src\vctools\crt\vcstartup\src\startup\exe_common.inl:78
62: 0x7ff65711b67c - __scrt_common_main_seh
at d:\agent\_work\4\s\src\vctools\crt\vcstartup\src\startup\exe_common.inl:288
63: 0x7ffc1ed47034 - BaseThreadInitThunk
64: 0x7ffc20822651 - RtlUserThreadStart
thread panicked while panicking. aborting.
error: process didn't exit successfully: `target\debug\logos-repro.exe` (exit code: 0xc0000409, STATUS_STACK_BUFFER_OVERRUN)
Well, that's a corner I cut that I should have been smarter about when I did it. Should be an easy enough fix.
Also hitting this recently. Is there any workaround I can apply now?
P.S. It seems replacing `.` with `\p{ANY}` does not fix the problem, either.
Silly me. There is indeed a simple workaround.
Define this callback to parse the multi-byte character manually:
/// Logos callback that widens the current match so it covers the whole first
/// character starting at the token's start offset.
///
/// Panics if nothing remains in the source at the token start.
fn single_any_char(lexer: &mut Lexer<StringLexeme>) {
    let matched = lexer.span();
    // Everything from the start of the current token to the end of the source.
    let rest = &lexer.source()[matched.start..];
    // UTF-8 byte length of the first character in `rest`.
    let char_len = rest.chars().next().map(char::len_utf8).unwrap();
    // Consume the bytes of that character the lexer has not matched yet.
    lexer.bump(char_len - matched.len());
    // if desired, return `lexer.slice()` here
}
and then use it on the variant:
/// Token type using the workaround: `single_any_char` is attached as the
/// callback for the `.` pattern.
#[derive(Logos)]
pub enum Token {
// The callback bumps the lexer so the match spans one full character.
#[regex(".", single_any_char)]
Char,
// Variant marked with logos' `#[error]` attribute.
#[error]
Error,
}
Sidenote: It seems `Lexer::slice` calls the unsafe function `str::get_unchecked`, so if my understanding is correct, the program as illustrated by OP invokes library UB. Fortunately, for the callback above it should be fine (the invariant is guaranteed by the `Chars` iterator and the check in `Lexer::bump`).
@andrewhickman @ruifengx I'm pretty busy right now, but if either of you would tackle this, the culprit is here exactly:
https://github.com/maciejhirsz/logos/blob/51c1f8c1bca990758e05c5600becc6d9c10bd6e4/logos-codegen/src/graph/regex.rs#L123-L123
That `!is_ascii(&class)` check forces the following branch, which converts unicode ranges (`char` to `char`) that don't have any non-ascii breakpoints into byte ranges (`u8` to `u8`).
Just removing that check should fix this bug, although it might break other tests that depend on a flat structure. Treating unicode ranges with no break points > 127 as byte ranges is safe when performed in loops (`[^x]+` for example), since all bytes of non-ascii characters fall into the 128..=255 range (the topmost bit is always set for the first byte and all continuation bytes in utf8-encoded scalars), which is why this optimization exists there in the first place.
In other words, the "proper" solution here would be changing this check to `!is_ascii(&class) || !parent_is_loop` (if I'm doing the bool logic in my head right) and then propagating `parent_is_loop` as an argument through the recursive `parse_mir` calls. The only place where it would need to be set to `true` is here:
https://github.com/maciejhirsz/logos/blob/51c1f8c1bca990758e05c5600becc6d9c10bd6e4/logos-codegen/src/graph/regex.rs#L22-L33