PeppaPEG
PeppaPEG copied to clipboard
Grammar to parse tree-siter grammars
People writing grammars for https://github.com/tree-sitter/tree-sitter need to use a ugly JavaScript spaghetti to write grammars and convert then to a json that the actual parser uses to do it's job.
I've been looking for a simple way to write the grammars in a style like yacc/bison/peg and then the tool would write the final json for the tree-sitter parser, I think that PeppaPEG would be a nice fit for this job and for that we need read the actual json grammars and convert to an augmented EBNF then people would write/extend on that EBNF and the tool would again write back the json.
A feature like this one would improve the visibility and user base of PeppaPEG.
Example here is a Lua grammar for tree-sitter https://github.com/Azganoth/tree-sitter-lua/blob/master/grammar.js and here the final json https://github.com/Azganoth/tree-sitter-lua/blob/master/src/grammar.json and here an initial EBNF grammar :
%name lua
%extras
comment
/[\s\n]/
;
%conflicts
[ _prefix ]
[ _expression _variable_declarator ]
[ _expression function_call_statement ]
[ function_name function_name_field ]
;
%externals
comment
string
;
%inline
_statement
;
program : _statement* return_statement? ;
return_statement : "return" ( _expression ( "," _expression ) * ) ? _empty_statement? ;
_statement : expression | variable_declaration | local_variable_declaration | do_statement | if_statement | while_statement | repeat_statement | for_statement | for_in_statement | goto_statement | break_statement | label_statement | _empty_statement | function | local_function | function_call ;
variable_declaration : ( variable_declarator ( "," variable_declarator ) * ) "=" ( _expression ( "," _expression ) * ) ;
local_variable_declaration : "local" variable_declarator ( "=" ( _expression ( "," _expression ) * ) ) ? ;
_variable_declarator : identifier | ( _prefix "[" _expression "]" ) | field_expression ;
field_expression : _prefix "." property_identifier ;
_local_variable_declarator : identifier ( "," identifier ) * ;
do_statement : "do" _statement* return_statement? "end" ;
if_statement : "if" condition_expression "then" _statement* return_statement? elseif* else? "end" ;
elseif : "elseif" condition_expression "then" _statement* return_statement? ;
else : "else" _statement* return_statement? ;
while_statement : "while" condition_expression "do" _statement* return_statement? "end" ;
repeat_statement : "repeat" _statement* return_statement? "until" condition_expression ;
for_statement : "for" loop_expression "do" _statement* return_statement? "end" ;
for_in_statement : "for" loop_expression "do" _statement* return_statement? "end" ;
_loop_expression : identifier "=" _expression "," _expression ( "," _expression ) ? ;
_in_loop_expression : ( identifier ( "," identifier ) * ) "in" ( _expression ( "," _expression ) * ) ;
goto_statement : "goto" identifier ;
break_statement : "break" ;
label_statement : "::" identifier "::" ;
_empty_statement : ";" ;
function_statement : "function" function_name _function_body ;
local_function_statement : "local" "function" identifier _function_body ;
function_call_statement : ( ( ( _prefix arguments ) | ( _prefix ":" method arguments ) ) ) ;
arguments : ( "(" ( _expression ( "," _expression ) * ) ? ")" ) | table | string ;
function_name : ( identifier | function_name_field ) ( ":" method ) ? ;
function_name_field : ( identifier ) ( "." property_identifier ) * ;
parameters : "(" ( ( self | spread | identifier ) ( "," identifier ) * ( "," spread ) ? ) ? ")" ;
_function_body : parameters _statement* return_statement? "end" ;
_expression : spread | _prefix | next | function_definition | table | binary_operation | unary_operation | string | number | nil | true | false | identifier ;
spread : "..." ;
self : "self" ;
next : "next" ;
global_variable : "_G" | "_VERSION" ;
_prefix : self | global_variable | _variable_declarator | ( function_call ) | ( "(" _expression ")" ) ;
function_definition : "function" _function_body ;
table : "{" _field_sequence? "}" ;
field : ( "[" _expression "]" "=" _expression ) | ( identifier "=" _expression ) | _expression ;
_field_sequence : ( ( field ( _field_sep field ) * _field_sep? ) ) ;
_field_sep : "," | ";" ;
binary_operation : ( ( _expression "or" _expression ) ) | ( ( _expression "and" _expression ) ) | ( ( _expression "<" _expression ) ) | ( ( _expression "<=" _expression ) ) | ( ( _expression "==" _expression ) ) | ( ( _expression "~=" _expression ) ) | ( ( _expression ">=" _expression ) ) | ( ( _expression ">" _expression ) ) | ( ( _expression "|" _expression ) ) | ( ( _expression "~" _expression ) ) | ( ( _expression "&" _expression ) ) | ( ( _expression "<<" _expression ) ) | ( ( _expression ">>" _expression ) ) | ( ( _expression "+" _expression ) ) | ( ( _expression "-" _expression ) ) | ( ( _expression "*" _expression ) ) | ( ( _expression "/" _expression ) ) | ( ( _expression "//" _expression ) ) | ( ( _expression "%" _expression ) ) | ( ( _expression ".." _expression ) ) | ( ( _expression "^" _expression ) ) ;
unary_operation : ( ( ( "not" | "#" | "-" | "~" ) _expression ) ) ;
number : ( ( ( ( ( "0" | ( "0" ? /[1-9]/ /[0-9]+/? ) ) "." /[0-9]+/? ( ( "e" | "E" ) ( ( "-" | "+" ) ? /[0-9]+/ ) ) ? ) | ( "." /[0-9]+/ ( ( "e" | "E" ) ( ( "-" | "+" ) ? /[0-9]+/ ) ) ? ) | ( ( "0" | ( "0" ? /[1-9]/ /[0-9]+/? ) ) ( ( "e" | "E" ) ( ( "-" | "+" ) ? /[0-9]+/ ) ) ? ) ) | ( ( "0x" | "0X" ) /[a-fA-F0-9]+/ ( "." /[a-fA-F0-9]+/ ) ? ( ( "p" | "P" ) ( ( "-" | "+" ) ? /[0-9]+/ ) ) ? ) ) ) ;
nil : "nil" ;
true : "true" ;
false : "false" ;
identifier : /[a-zA-Z_][a-zA-Z0-9_]*/ ;
Generated by this script (using quickjs https://bellard.org/quickjs/):
import * as std from "std";
import * as os from "os";
//print(typeof process);
//if(typeof std == "undefined") {
//}
const fname_list = [
"tree-sitter-lua/src/grammar.json",
];
//const fname_base = "A_grammars/tree-sitter/";
const fname_base = "../tree-sitter/";
function parseJsonGrammar(fname)
{
let fd = std.open(fname_base + fname, "r");
let json = fd.readAsString();
fd.close();
let out_fname = fname.replace(/\/.+/, ".ebnf0");
//print(out_fname); return;
//fd = std.open(out_fname, "w");
fd = std.open("/dev/stdout", "w"); //std.stdout;
json = JSON.parse(json);
//print(json);
fd.printf("%%name %s\n", json.name);
if(json.word) fd.printf("%%word %s\n", json.word);
let manageTuples = function(dict) {
for(var idx in dict) {
let value = dict[idx];
//print(value);
switch(value.type) {
case "SYMBOL":
fd.printf("\t%s\n", value.name);
break;
case "PATTERN":
fd.printf("\t/%s/\n", value.value);
break;
case "STRING":
fd.printf("\t'%s'\n", value.value);
break;
case "TOKEN":
fd.printf("\t'%s'\n", value.content);
break;
default:
throw("Unmanaged type " + value.type);
}
}
};
let manageArrays = function(ary) {
for(var idx in ary) {
let value = ary[idx];
fd.printf("\t[");
for(var idx2 in value) {
fd.printf(" %s", value[idx2]);
}
fd.printf(" ]\n");
}
};
if( json.extras && json.extras.length ) {
fd.printf("%%extras\n");
manageTuples(json.extras);
fd.printf("\t;\n");
}
if( json.conflicts && json.conflicts.length ) {
fd.printf("%%conflicts\n");
manageArrays(json.conflicts);
fd.printf("\t;\n");
}
if( json.externals && json.externals.length ) {
fd.printf("%%externals\n");
manageTuples(json.externals);
fd.printf("\t;\n");
}
if( json.precedences && json.precedences.length) {
fd.printf("%%precedences\n");
for(var idx in json.precedences) {
let value = json.precedences[idx];
fd.printf("\t[");
for(var idx2 in value) {
let pval = value[idx2];
switch(pval.type) {
case "SYMBOL":
fd.printf(" %s", pval.name);
break;
case "STRING":
fd.printf(" '%s'", pval.value);
break;
default:
throw("Unmanaged type " + pval.type);
}
}
fd.printf(" ]\n");
}
fd.printf("\t;\n");
}
if( json.inline && json.inline.length ) {
fd.printf("%%inline\n");
for(var idx in json.inline) {
let value = json.inline[idx];
fd.printf("\t%s\n", value);
}
fd.printf("\t;\n");
}
if( json.supertypes && json.supertypes.length ) {
fd.printf("%%supertypes\n");
for(var idx in json.supertypes) {
let value = json.supertypes[idx];
fd.printf("\t%s\n", value);
}
fd.printf("\t;\n");
}
let manageAlias = function (rules, alias_list) {
for(var idx in rules) {
let rhs = rules[idx2];
for(var idx2 in rhs) {
}
}
};
let alias_list = {};
let str_list = [];
let manageRule = function (name, rule, depth) {
//print(name, rule.type); //, typeof rule);
switch(rule.type)
{
case "ALIAS":
//fd.printf(" ( ");
//manageRule(rule.type, rule.content, depth+1);
//fd.printf(" )@%s ", rule.value);
fd.printf("%s", rule.value);
//alias_list[rule.value] = ;
break;
case "BLANK":
print(rule.type);
break;
case "CHOICE": {
let members = rule.members;
let mcount = members.length;
let isOptional = mcount == 2 && members[1].type == "BLANK";
if(isOptional) {
//if(mcount > 2 && depth) fd.printf(" (");
manageRule(rule.type, members[0], depth+1);
//if(mcount > 2 && depth) fd.printf(" )");
fd.printf("? ");
}
else
{
//fd.printf("=== %d : %d : %d ===", mcount, depth, mcount > 1 && depth);
if(mcount > 1 && depth) fd.printf(" (");
for(var idx in members) {
if(idx > 0) fd.printf(" | ");
manageRule(rule.type, members[idx], depth+1);
}
if(mcount > 1 && depth) fd.printf(" ) ");
}
}
break;
case "FIELD":
//print(rule.type, rule.name);
fd.printf(" ( ");
manageRule(rule.type, rule.content, depth+1);
fd.printf(" ) ");
break;
case "IMMEDIATE_TOKEN":
fd.printf(" ( ");
manageRule(rule.type, rule.content, depth+1);
fd.printf(" ) ");
break;
case "PATTERN": {
fd.printf(" /%s/", rule.value);
}
break;
case "PREC":
case "PREC_DYNAMIC":
case "PREC_LEFT":
case "PREC_RIGHT":
fd.printf(" ( ");
manageRule(rule.type, rule.content, depth+1);
fd.printf(" ) ");
break;
case "REPEAT":
//if(depth) fd.printf(" ( ");
manageRule(rule.type, rule.content, depth+1);
//if(depth) fd.printf(" )");
fd.printf("* ");
break;
case "REPEAT1":
//if(depth) fd.printf(" (");
manageRule(rule.type, rule.content, depth+1);
//if(depth) fd.printf(" )");
fd.printf("+ ");
break;
case "SEQ": {
let members = rule.members;
let mcount = members.length;
//fd.printf("=== %d : %d ===", mcount, depth);
if(mcount > 1 && depth) fd.printf(" ( ");
for(var idx in members) {
manageRule(rule.type, members[idx], depth+1);
}
if(mcount > 1 && depth) fd.printf(" ) ");
}
break;
case "STRING": {
let value = rule.value;
//print(rule.type, value);
switch(value) {
case "\0": fd.printf(" '\\0' "); break;
case "\b": fd.printf(" '\\b' "); break;
case "\f": fd.printf(" '\\f' "); break;
case "\n": fd.printf(" '\\n' "); break;
case "\r": fd.printf(" '\\r' "); break;
case "\t": fd.printf(" '\\t' "); break;
case "\v": fd.printf(" '\\v' "); break;
case "\\": fd.printf(" '\\\\' "); break;
case "'": fd.printf(" \"'\" "); break;
case "\"": fd.printf(" '\"' "); break;
default:
//value = value.replace(/\\/g, "\\\\");
value = value.replace(/\t/g, "\\t"); //order matter
if(value.indexOf("'") >= 0) fd.printf(" \"%s\" ", value);
else fd.printf(" \"%s\" ", value);
}
}
break;
case "SYMBOL":
//print(rule.type, rule.name);
fd.printf(" %s", rule.name);
break;
case "TOKEN":
fd.printf(" ( ");
manageRule(rule.type, rule.content, depth+1);
fd.printf(" ) ");
break;
default:
throw("Unknown rule type: " + rule.type);
}
}
let rules = json.rules;
for(var idx in rules) {
let rule = rules[idx];
//print(rule);
if(rule.type == "xTOKEN") {
let str = idx + " : " + + " .";
}
else {
fd.printf("%s : ", idx);
manageRule(idx, rule, 0);
}
fd.printf(" ;\n");
}
fd.close();
}
for(let idx in fname_list)
{
let fname = fname_list[idx];
print(fname);
parseJsonGrammar(fname);
}