langcc icon indicating copy to clipboard operation
langcc copied to clipboard

Invalid tokens with some characters

Open hernanmd opened this issue 3 years ago • 0 comments

I am experimenting with langcc to generate a Smalltalk parser from one of the language's published BNF grammars. However, I get an "Invalid token" error for some characters, e.g. `%`, `<`, `>`, and `=`.

Ex:

langcc -vvvv vw.lang gen
[000:00:00.000510] Invalid token
[000:00:00.000510] Line 2, column 92:
[000:00:00.000510] 
[000:00:00.000510]       binary_character <- `+` | `/` | `\` | `*` | `~` | `=` | `@` | `|` | `?` | `!` | `,` | `%` | `<` | `>`;
[000:00:00.000510]                                                                                              ^               
[000:00:00.000510] 

How can I escape them?

This is the .lang file I have written so far. (The lexer and parser stanzas are simply copied from the manual, because I have not yet reached that part. :-))

tokens {
    binary_character <- `+` | `/` | `\` | `*` | `~` | `=` | `@` | `|` | `?` | `!` | `,` | `%` | `<` | `>`;
 
    digit <= `0`..`9`;
    digits <= digit*;
    letter <= `a`..`z`;
    id <= letter (letter | digit)*;

    named_literal <= `nil` | `true` | `false` | `super`;
    byte_array_literal <= `#` byte_array_literal_body;
    byte_array_literal_body <= `[` number* `]`;
    literal <= #L[e::`-`] number | named_literal | symbol_literal | character_literal | string array_literal | byte_array_literal;

    comment <= `"` (non_quote_character | `'` )* `"`;
    separator <= (whitespace_character | comment)+;

    big_digits <= (digit | letter)+;
    optional_fraction_and_exponent <= #L[e::`.` digits] #L[e::(`e` | `d` | `s`) #L[e::`-`] digits];
    number <= digits (`r` #L[e::`-`] big_digits | optional_fraction_and_exponent);
   
    extended_letter <= letter | `_`;
    identifier <= extended_letter (extended_letter | digit)*;
    whitespace_character <=  ` ` | `\n` | `\t`;
    
    keyword <= identifier `:`;
    block_argument <= `:` identifier;
    assignment_operator <= `:` `=`;
    binary_selector <= (`-` | binary_character) [binary_character];
    character_constant <= `$` (non_quote_character | `'` | `"`);    
    non_quote_character <= digit | letter | binary_character | whitespace_character | `[` | `]` | `{` | `}` | `(` | `)` | `_` | `^` | `;` | `$` | `#` | `:` | `.` | `-` | ``;
    string <= `'` (non_quote_character | `'` `'` | `"`)* `'`;
    token <= keyword | block_argument | assignment_operator | binary_selector | character_constant | string;

    unary_selector <= identifier;
    symbol <= identifier | binary_selector | keyword+;
    unary_message <= unary_selector;
    binary_message <= binary_selector primary unary_message*;
    keyword_message <= (keyword primary unary_message* binary_message*)+;
    cascaded_messages <= (`;` (unary_message | binary_message | keyword_message))*;
    messages <= unary_message+ binary_message* [keyword_message] | binary_message+ [keyword_message] | keyword_message;
    rest_of_expression <= #L[e::messages cascaded_messages];
    pseudovariable_name <= `self` | `thisContext`;
    primary <= extended_binding_name | binding_reference | pseudovariable_name | literal | block_constructor | `(` expression `)`;

    extended_binding_name <= binding_name #L[e::( `.` binding_name )* ];
    expression <= ( extended_binding_name | binding_reference ) (assignment_operator expression | rest_of_expression) | keyword `=` expression | primary rest_of_expression | `super` messages cascaded_messages;
    expression_list <= expression (`.` expression)* #L[e::`.`];

    binding_name <= identifier ( named_literal | pseudovariable_name );
    declared_variable_name <= binding_name;
    temporary_list <= declared_variable_name*;
    temporaries <= `|` temporary_list `|` | `||`;

    pragma <= `<` ( keyword literal )+ `>`;
    statements <= #L[e::`^` expression #L[e::`.`] | expression #L[e::`.` statements]];
    block_constructor <= `[` #L[e::block_declarations] statements `]`;
    block_declarations <= temporaries | block_argument+ (`|` [temporaries] | `||` temporary_list `|` | `|||`);
    message_pattern <= unary_selector | binary_selector declared_variable_name | (keyword declared_variable_name)+;
    method <= message_pattern pragma #L[e::temporaries] statements;

    top <= method;
}

lexer {
    main { body }

    mode body {
        top => { emit; }
        ws => { pass; }
        `"` => { push comment_single; pass; }
        eof => { pop; }
    }

    mode comment_single {
        `\n` => { pop_extract; }
        eof => { pop_extract; }
        _ => { pass; }
    }
}

parser {
    main { S }
}

hernanmd avatar Oct 28 '22 00:10 hernanmd