logos Thread 'rustc' has overflowed it's stack

Not really sure what's wrong with my regexes. I tested them in Python and they worked fine, so I'm pretty sure they're valid under the rules of the regex crate too. (I also don't get any info about what Rustc was doing when it overflowed.) Here's my full Lexer enum:

use logos::Logos;

#[derive(Logos, Debug, PartialEq, PartialOrd, Eq, Ord, Clone, Copy)]
pub enum Token {
    #[regex(r#"[\p{Zl}\p{Zp}\p{Zs}\x0A\x0B\x0C\x0D\x85]"#)]
    Whitespace,
    #[regex(r#"[\p{Cf}]"#)]
    Format,
    #[regex(r#"[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}][\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}\p{Mn}\p{Mc}\p{Nd}\p{Pc}]*"#)]
    Identifier,
    #[regex(r#"[0-9]([0-9_]?)*(\.[0-9]([0-9_]?)*)?([eE][+-]?[0-9]([0-9_]?)*)?"#)]
    DecimalLiteral,
    #[regex(r#"([0-9]?)*[0-9_]#([0-9A-Fa-f]?)[0-9A-Fa-f_]*(\.([0-9A-Fa-f]?)[0-9A-Fa-f_]*)?#([eE][+-]?([0-9]?)[0-9_]*)?"#)]
    BasedLiteral,
    #[regex(r#"'[\pL\pM\pN\pP\pS\p{Zs}]'"#)]
    CharacterLiteral,
    #[regex(r#""(("")|[^"])*""#)]
    StringLiteral,
    #[regex(r#"--[^\n]*"#)]
    Comment,
    #[regex(r#"[&-/:->@\[\]|]"#)]
    SimpleDelimiter,
    #[regex(r#"(?:\*\*|\.\.|[/:]=|<[<->]|=>|>[=>])"#)]
    CompoundDelimiter,
    // Keywords
    #[regex(r#"(?i)abort"#)]
    Abort,
    #[regex(r#"(?i)abs"#)]
    Abs,
    #[regex(r#"(?i)abstract"#)]
    Abstract,
    #[regex(r#"(?i)accept"#)]
    Accept,
    #[regex(r#"(?i)access"#)]
    Access,
    #[regex(r#"(?i)aliased"#)]
    Aliased,
    #[regex(r#"(?i)all"#)]
    All,
    #[regex(r#"(?i)and"#)]
    And,
    #[regex(r#"(?i)array"#)]
    Array,
    #[regex(r#"(?i)at"#)]
    At,
    #[regex(r#"(?i)begin"#)]
    Begin,
    #[regex(r#"(?i)body"#)]
    Body,
    #[regex(r#"(?i)case"#)]
    Case,
    #[regex(r#"(?i)constant"#)]
    Constant,
    #[regex(r#"(?i)declare"#)]
    Declare,
    #[regex(r#"(?i)delay"#)]
    Delay,
    #[regex(r#"(?i)delta"#)]
    Delta,
    #[regex(r#"(?i)digits"#)]
    Digits,
    #[regex(r#"(?i)do"#)]
    Do,
    #[regex(r#"(?i)else"#)]
    Else,
    #[regex(r#"(?i)elsif"#)]
    Elsif,
    #[regex(r#"(?i)end"#)]
    End,
    #[regex(r#"(?i)entry"#)]
    Entry,
    #[regex(r#"(?i)exception"#)]
    Exception,
    #[regex(r#"(?i)exit"#)]
    Exit,
    #[regex(r#"(?i)for#"#)]
    For,
    #[regex(r#"(?i)function"#)]
    Function,
    #[regex(r#"(?i)generic"#)]
    Generic,
    #[regex(r#"(?i)goto"#)]
    Goto,
    #[regex(r#"(?i)if"#)]
    If,
    #[regex(r#"(?i)in"#)]
    In,
    #[regex(r#"(?i)interface"#)]
    Interface,
    #[regex(r#"(?i)is"#)]
    Is,
    #[regex(r#"(?i)limited"#)]
    Limited,
    #[regex(r#"(?i)loop"#)]
    Loop,
    #[regex(r#"(?i)mod"#)]
    Mod,
    #[regex(r#"(?i)new"#)]
    New,
    #[regex(r#"(?i)not"#)]
    Not,
    #[regex(r#"(?i)null"#)]
    Null,
    #[regex(r#"(?i)of"#)]
    Of,
    #[regex(r#"(?i)or#"#)]
    Or,
    #[regex(r#"(?i)others"#)]
    Others,
    #[regex(r#"(?i)out"#)]
    Out,
    #[regex(r#"(?i)overriding"#)]
    Overriding,
    #[regex(r#"(?i)package"#)]
    Package,
    #[regex(r#"(?i)parallel"#)]
    Parallel,
    #[regex(r#"(?i)pragma"#)]
    Pragma,
    #[regex(r#"(?i)private"#)]
    Private,
    #[regex(r#"(?i)procedure"#)]
    Procedure,
    #[regex(r#"(?i)protected"#)]
    Protected,
    #[regex(r#"(?i)raise"#)]
    Raise,
    #[regex(r#"(?i)range"#)]
    Range,
    #[regex(r#"(?i)record"#)]
    Record,
    #[regex(r#"(?i)rem"#)]
    Rem,
    #[regex(r#"(?i)renames"#)]
    Renames,
    #[regex(r#"(?i)requeue"#)]
    Requeue,
    #[regex(r#"(?i)return"#)]
    Return,
    #[regex(r#"(?i)reverse"#)]
    Reverse,
    #[regex(r#"(?i)select"#)]
    Select,
    #[regex(r#"(?i)separate"#)]
    Separate,
    #[regex(r#"(?i)some"#)]
    Some,
    #[regex(r#"(?i)subtype"#)]
    Subtype,
    #[regex(r#"(?i)synchronized"#)]
    Synchronized,
    #[regex(r#"(?i)tagged"#)]
    Tagged,
    #[regex(r#"(?i)task"#)]
    Task,
    #[regex(r#"(?i)terminate"#)]
    Terminate,
    #[regex(r#"(?i)then"#)]
    Then,
    #[regex(r#"(?i)type"#)]
    Type,
    #[regex(r#"(?i)until"#)]
    Until,
    #[regex(r#"(?i)use"#)]
    Use,
    #[regex(r#"(?i)when"#)]
    When,
    #[regex(r#"(?i)while"#)]
    While,
    #[regex(r#"(?i)with"#)]
    With,
    #[regex(r#"(?i)xor#"#)]
    Xor,
    // Other unicode categories (for CST only), these will trigger errors when parsed
    #[regex(r#"[\pS]"#)]
    Symbol,
}

Any thoughts? (I know that these are really complex regexes, but the EBNF rules they came from are also quite complex.)

Jun 07 '23 15:06 ethindp

Hello,

Logos does not support the whole set of valid regexes, so this may explain. A few already closed (or open) issues are on similar topics, so I suggest you check them.

Logos’ documentation on that topic is quite sparse for the moment, but I hope someday it will be clear what is supported and what isn’t :)

Jun 07 '23 15:06 jeertmans

@jeertmans Is there a way around this for now? I'd like to avoid writing my own lexer if I can help it, particularly if my handwritten one would be suboptimal compared to what this would generate.

Jun 07 '23 17:06 ethindp

Did you identify the regex(es) that caused the problem?

Jun 08 '23 06:06 jeertmans

For reference, this is the offending pattern. ([0-9]?)*. Logos hangs on some repetitions which contain empty patterns.

Jun 21 '23 22:06 Artikae

What is the purpose of ([0-9]?)*? If I remember correctly, Logos does not support subgroups (it ignores them, I think), so it would be equivalent to [0-9]*.

Jun 22 '23 09:06 jeertmans

logos logos copied to clipboard

Thread 'rustc' has overflowed it's stack

logos
logos copied to clipboard