langcc
langcc copied to clipboard
Invalid tokens with some characters
I am experimenting with langcc to generate a Smalltalk parser from one of the published Smalltalk BNF grammars. However, I run into an "Invalid token" error with some characters, e.g. `%`, `<`, `>` and `=`.
Example:
langcc -vvvv vw.lang gen
[000:00:00.000510] Invalid token
[000:00:00.000510] Line 2, column 92:
[000:00:00.000510]
[000:00:00.000510] binary_character <- `+` | `/` | `\` | `*` | `~` | `=` | `@` | `|` | `?` | `!` | `,` | `%` | `<` | `>`;
[000:00:00.000510] ^
[000:00:00.000510]
How can I escape them?
This is the .lang file I have written so far. The lexer and parser stanzas are simply copied from the manual, because I have not yet reached that part :-)
// Token stanza: rules transcribed from a Smalltalk BNF. Both lexical rules
// (digit, identifier, string, ...) and phrase-level rules (expression,
// statements, method, ...) are declared here; `top` aliases `method`.
tokens {
// NOTE(review): this is the line quoted in the "Invalid token" report.
// It is the only rule written with `<-`; every other rule uses `<=`
// (TODO confirm whether that is intended). The third alternative is a single
// backslash between backticks -- possibly the backslash is read as escaping
// the closing backtick, which would leave later characters such as % outside
// any literal; verify the escape rules in the langcc manual.
binary_character <- `+` | `/` | `\` | `*` | `~` | `=` | `@` | `|` | `?` | `!` | `,` | `%` | `<` | `>`;
digit <= `0`..`9`;
digits <= digit*;
// Lower-case letters only.
letter <= `a`..`z`;
id <= letter (letter | digit)*;
named_literal <= `nil` | `true` | `false` | `super`;
byte_array_literal <= `#` byte_array_literal_body;
byte_array_literal_body <= `[` number* `]`;
// NOTE(review): symbol_literal, character_literal and array_literal have no
// rule in this stanza (symbol and character_constant do), and `string
// array_literal` is written as a sequence with no `|` between the two names.
literal <= #L[e::`-`] number | named_literal | symbol_literal | character_literal | string array_literal | byte_array_literal;
comment <= `"` (non_quote_character | `'` )* `"`;
separator <= (whitespace_character | comment)+;
big_digits <= (digit | letter)+;
optional_fraction_and_exponent <= #L[e::`.` digits] #L[e::(`e` | `d` | `s`) #L[e::`-`] digits];
number <= digits (`r` #L[e::`-`] big_digits | optional_fraction_and_exponent);
extended_letter <= letter | `_`;
identifier <= extended_letter (extended_letter | digit)*;
whitespace_character <= ` ` | `\n` | `\t`;
keyword <= identifier `:`;
block_argument <= `:` identifier;
assignment_operator <= `:` `=`;
binary_selector <= (`-` | binary_character) [binary_character];
character_constant <= `$` (non_quote_character | `'` | `"`);
// NOTE(review): the last alternative below is an empty literal (two adjacent
// backticks) -- confirm whether a character was lost here.
non_quote_character <= digit | letter | binary_character | whitespace_character | `[` | `]` | `{` | `}` | `(` | `)` | `_` | `^` | `;` | `$` | `#` | `:` | `.` | `-` | ``;
string <= `'` (non_quote_character | `'` `'` | `"`)* `'`;
token <= keyword | block_argument | assignment_operator | binary_selector | character_constant | string;
unary_selector <= identifier;
symbol <= identifier | binary_selector | keyword+;
unary_message <= unary_selector;
binary_message <= binary_selector primary unary_message*;
keyword_message <= (keyword primary unary_message* binary_message*)+;
cascaded_messages <= (`;` (unary_message | binary_message | keyword_message))*;
messages <= unary_message+ binary_message* [keyword_message] | binary_message+ [keyword_message] | keyword_message;
rest_of_expression <= #L[e::messages cascaded_messages];
pseudovariable_name <= `self` | `thisContext`;
// NOTE(review): binding_reference (used here and in expression) has no rule
// in this stanza.
primary <= extended_binding_name | binding_reference | pseudovariable_name | literal | block_constructor | `(` expression `)`;
extended_binding_name <= binding_name #L[e::( `.` binding_name )* ];
expression <= ( extended_binding_name | binding_reference ) (assignment_operator expression | rest_of_expression) | keyword `=` expression | primary rest_of_expression | `super` messages cascaded_messages;
expression_list <= expression (`.` expression)* #L[e::`.`];
binding_name <= identifier ( named_literal | pseudovariable_name );
declared_variable_name <= binding_name;
temporary_list <= declared_variable_name*;
temporaries <= `|` temporary_list `|` | `||`;
pragma <= `<` ( keyword literal )+ `>`;
statements <= #L[e::`^` expression #L[e::`.`] | expression #L[e::`.` statements]];
block_constructor <= `[` #L[e::block_declarations] statements `]`;
block_declarations <= temporaries | block_argument+ (`|` [temporaries] | `||` temporary_list `|` | `|||`);
message_pattern <= unary_selector | binary_selector declared_variable_name | (keyword declared_variable_name)+;
method <= message_pattern pragma #L[e::temporaries] statements;
// Alias consumed by the lexer's body mode (`top => { emit; }`).
top <= method;
}
// Lexer stanza, copied from the manual per the author's note and not yet
// adapted to the token rules of this grammar.
lexer {
// Lexing starts in the body mode.
main { body }
mode body {
top => { emit; }
// NOTE(review): `ws` has no rule in the tokens stanza of this file
// (whitespace_character and separator do) -- confirm which name is meant.
ws => { pass; }
// A double quote switches to the comment_single mode.
`"` => { push comment_single; pass; }
eof => { pop; }
}
// NOTE(review): this mode is left at a newline or at end of input, whereas the
// `comment` token rule is closed by a second double quote -- confirm which
// behaviour is wanted.
mode comment_single {
`\n` => { pop_extract; }
eof => { pop_extract; }
_ => { pass; }
}
}
// Parser stanza, copied from the manual per the author's note.
// NOTE(review): the start symbol `S` has no rule in this stanza, and no parser
// rules are declared at all -- to be filled in.
parser {
main { S }
}