PeppaPEG icon indicating copy to clipboard operation
PeppaPEG copied to clipboard

Grammar to parse tree-siter grammars

Open mingodad opened this issue 2 years ago • 0 comments

People writing grammars for https://github.com/tree-sitter/tree-sitter need to use a ugly JavaScript spaghetti to write grammars and convert then to a json that the actual parser uses to do it's job.

I've been looking for a simple way to write the grammars in a style like yacc/bison/peg and then the tool would write the final json for the tree-sitter parser, I think that PeppaPEG would be a nice fit for this job and for that we need read the actual json grammars and convert to an augmented EBNF then people would write/extend on that EBNF and the tool would again write back the json.

A feature like this one would improve the visibility and user base of PeppaPEG.

Example here is a Lua grammar for tree-sitter https://github.com/Azganoth/tree-sitter-lua/blob/master/grammar.js and here the final json https://github.com/Azganoth/tree-sitter-lua/blob/master/src/grammar.json and here an initial EBNF grammar :

%name lua
%extras
	comment
	/[\s\n]/
	;
%conflicts
	[ _prefix ]
	[ _expression _variable_declarator ]
	[ _expression function_call_statement ]
	[ function_name function_name_field ]
	;
%externals
	comment
	string
	;
%inline
	_statement
	;
program :  _statement*  return_statement?  ;
return_statement :  "return"  (  _expression (  ","  _expression ) *  ) ?  _empty_statement?  ;
_statement : expression |  variable_declaration |  local_variable_declaration |  do_statement |  if_statement |  while_statement |  repeat_statement |  for_statement |  for_in_statement |  goto_statement |  break_statement |  label_statement |  _empty_statement | function | local_function | function_call ;
variable_declaration :  ( variable_declarator (  "," variable_declarator ) *  )  "="  (  _expression (  ","  _expression ) *  )  ;
local_variable_declaration :  "local" variable_declarator (  "="  (  _expression (  ","  _expression ) *  )  ) ?  ;
_variable_declarator :  identifier |  (  _prefix "["  _expression "]"  )  |  field_expression ;
field_expression :  _prefix "." property_identifier ;
_local_variable_declarator :  identifier (  ","  identifier ) *  ;
do_statement :  "do"  _statement*  return_statement?  "end"  ;
if_statement :  "if" condition_expression "then"  _statement*  return_statement?  elseif*  else?  "end"  ;
elseif :  "elseif" condition_expression "then"  _statement*  return_statement?  ;
else :  "else"  _statement*  return_statement?  ;
while_statement :  "while" condition_expression "do"  _statement*  return_statement?  "end"  ;
repeat_statement :  "repeat"  _statement*  return_statement?  "until" condition_expression ;
for_statement :  "for" loop_expression "do"  _statement*  return_statement?  "end"  ;
for_in_statement :  "for" loop_expression "do"  _statement*  return_statement?  "end"  ;
_loop_expression :  identifier "="  _expression ","  _expression (  ","  _expression ) ?  ;
_in_loop_expression :  (  identifier (  ","  identifier ) *  )  "in"  (  _expression (  ","  _expression ) *  )  ;
goto_statement :  "goto"  identifier ;
break_statement :  "break"  ;
label_statement :  "::"  identifier "::"  ;
_empty_statement :  ";"  ;
function_statement :  "function"  function_name _function_body ;
local_function_statement :  "local"  "function"  identifier _function_body ;
function_call_statement :   (  ( (  _prefix arguments )  |  (  _prefix ":" method arguments )  )  )  ;
arguments :  (  "("  (  _expression (  ","  _expression ) *  ) ?  ")"  )  |  table |  string ;
function_name :  ( identifier |  function_name_field )  (  ":" method ) ?  ;
function_name_field :  (  identifier )  (  "." property_identifier ) *  ;
parameters :  "("  (  ( self |  spread |  identifier )  (  ","  identifier ) *  (  ","  spread ) ?  ) ?  ")"  ;
_function_body :  parameters _statement*  return_statement?  "end"  ;
_expression :  spread |  _prefix |  next |  function_definition |  table |  binary_operation |  unary_operation |  string |  number |  nil |  true |  false |  identifier ;
spread :  "..."  ;
self :  "self"  ;
next :  "next"  ;
global_variable :  "_G"  |  "_VERSION"  ;
_prefix :  self |  global_variable |  _variable_declarator |   ( function_call )  |  (  "("  _expression ")"  )  ;
function_definition :  "function"  _function_body ;
table :  "{"  _field_sequence?  "}"  ;
field :  (  "["  _expression "]"  "="  _expression )  |  (  identifier "="  _expression )  |  _expression ;
_field_sequence :   (  (  field (  _field_sep field ) *  _field_sep?  )  )  ;
_field_sep :  ","  |  ";"  ;
binary_operation :   (  (  _expression "or"  _expression )  )  |   (  (  _expression "and"  _expression )  )  |   (  (  _expression "<"  _expression )  )  |   (  (  _expression "<="  _expression )  )  |   (  (  _expression "=="  _expression )  )  |   (  (  _expression "~="  _expression )  )  |   (  (  _expression ">="  _expression )  )  |   (  (  _expression ">"  _expression )  )  |   (  (  _expression "|"  _expression )  )  |   (  (  _expression "~"  _expression )  )  |   (  (  _expression "&"  _expression )  )  |   (  (  _expression "<<"  _expression )  )  |   (  (  _expression ">>"  _expression )  )  |   (  (  _expression "+"  _expression )  )  |   (  (  _expression "-"  _expression )  )  |   (  (  _expression "*"  _expression )  )  |   (  (  _expression "/"  _expression )  )  |   (  (  _expression "//"  _expression )  )  |   (  (  _expression "%"  _expression )  )  |   (  (  _expression ".."  _expression )  )  |   (  (  _expression "^"  _expression )  )  ;
unary_operation :   (  (  ( "not"  |  "#"  |  "-"  |  "~"  )  _expression )  )  ;
number :  (  ( ( (  ( "0"  |  (  "0" ?  /[1-9]/ /[0-9]+/?  )  )  "."  /[0-9]+/?  (  ( "e"  |  "E"  )  (  ( "-"  |  "+"  ) ?  /[0-9]+/ )  ) ?  )  |  (  "."  /[0-9]+/ (  ( "e"  |  "E"  )  (  ( "-"  |  "+"  ) ?  /[0-9]+/ )  ) ?  )  |  (  ( "0"  |  (  "0" ?  /[1-9]/ /[0-9]+/?  )  )  (  ( "e"  |  "E"  )  (  ( "-"  |  "+"  ) ?  /[0-9]+/ )  ) ?  )  )  |  (  ( "0x"  |  "0X"  )  /[a-fA-F0-9]+/ (  "."  /[a-fA-F0-9]+/ ) ?  (  ( "p"  |  "P"  )  (  ( "-"  |  "+"  ) ?  /[0-9]+/ )  ) ?  )  )  )  ;
nil :  "nil"  ;
true :  "true"  ;
false :  "false"  ;
identifier :  /[a-zA-Z_][a-zA-Z0-9_]*/ ;

Generated by this script (using quickjs https://bellard.org/quickjs/):

import * as std from "std";
import * as os from "os";

//print(typeof process);
//if(typeof std == "undefined") {
//}

const fname_list = [
	"tree-sitter-lua/src/grammar.json",
];

//const fname_base = "A_grammars/tree-sitter/";
const fname_base = "../tree-sitter/";

function parseJsonGrammar(fname)
{
	let fd = std.open(fname_base + fname, "r");
	let json = fd.readAsString();
	fd.close();

	let out_fname = fname.replace(/\/.+/, ".ebnf0");
	//print(out_fname); return;
	//fd = std.open(out_fname, "w");
	fd = std.open("/dev/stdout", "w"); //std.stdout;

	json = JSON.parse(json);
	//print(json);

	fd.printf("%%name %s\n", json.name);
	if(json.word) fd.printf("%%word %s\n", json.word);

	let manageTuples = function(dict) {
		for(var idx in  dict) {
			let value =  dict[idx];
			//print(value);
			switch(value.type) {
				case "SYMBOL":
					fd.printf("\t%s\n", value.name);
					break;
				case "PATTERN":
					fd.printf("\t/%s/\n", value.value);
					break;
				case "STRING":
					fd.printf("\t'%s'\n", value.value);
					break;
				case "TOKEN":
					fd.printf("\t'%s'\n", value.content);
					break;
				default:
					throw("Unmanaged type " + value.type);
			}
		}
	};

	let manageArrays = function(ary) {
		for(var idx in  ary) {
			let value =  ary[idx];
			fd.printf("\t[");
			for(var idx2 in value) {
				fd.printf(" %s", value[idx2]);
			}
			fd.printf(" ]\n");
		}
	};

	if( json.extras && json.extras.length ) {
		fd.printf("%%extras\n");
		manageTuples(json.extras);
		fd.printf("\t;\n");
	}

	if( json.conflicts && json.conflicts.length ) {
		fd.printf("%%conflicts\n");
		manageArrays(json.conflicts);
		fd.printf("\t;\n");
	}

	if( json.externals && json.externals.length ) {
		fd.printf("%%externals\n");
		manageTuples(json.externals);
		fd.printf("\t;\n");
	}

	if( json.precedences  && json.precedences.length) {
		fd.printf("%%precedences\n");
		for(var idx in  json.precedences) {
			let value =  json.precedences[idx];
			fd.printf("\t[");
			for(var idx2 in value) {
				let pval = value[idx2];
				switch(pval.type) {
					case "SYMBOL":
						fd.printf(" %s", pval.name);
						break;
					case "STRING":
						fd.printf(" '%s'", pval.value);
						break;
					default:
						throw("Unmanaged type " + pval.type);
				}
			}
			fd.printf(" ]\n");
		}
		fd.printf("\t;\n");
	}

	if( json.inline  && json.inline.length ) {
		fd.printf("%%inline\n");
		for(var idx in  json.inline) {
			let value =  json.inline[idx];
			fd.printf("\t%s\n", value);
		}
		fd.printf("\t;\n");
	}

	if( json.supertypes  && json.supertypes.length ) {
		fd.printf("%%supertypes\n");
		for(var idx in  json.supertypes) {
			let value =  json.supertypes[idx];
			fd.printf("\t%s\n", value);
		}
		fd.printf("\t;\n");
	}

	let manageAlias = function (rules, alias_list) {
		for(var idx in  rules) {
			let rhs = rules[idx2];
			for(var idx2 in  rhs) {

			}
		}
	};
	let alias_list = {};

	let str_list = [];

	let manageRule = function (name, rule, depth) {
		//print(name, rule.type); //, typeof rule);
		switch(rule.type)
		{
			case "ALIAS":
				//fd.printf(" ( ");
				//manageRule(rule.type, rule.content, depth+1);
				//fd.printf(" )@%s ", rule.value);
				fd.printf("%s", rule.value);
				//alias_list[rule.value] = ;
			break;
			case "BLANK":
				print(rule.type);
			break;
			case "CHOICE": {
				let members = rule.members;
				let mcount = members.length;
				let isOptional = mcount == 2 && members[1].type == "BLANK";
				if(isOptional) {
					//if(mcount > 2 && depth) fd.printf(" (");
					manageRule(rule.type, members[0], depth+1);
					//if(mcount > 2 && depth) fd.printf(" )");
					fd.printf("? ");
				}
				else
				{
					//fd.printf("=== %d : %d : %d ===", mcount, depth, mcount > 1 && depth);
					if(mcount > 1 && depth) fd.printf(" (");
					for(var idx in members) {
						if(idx > 0) fd.printf(" | ");
						manageRule(rule.type, members[idx], depth+1);
					}
					if(mcount > 1 && depth) fd.printf(" ) ");
				}
			}
			break;
			case "FIELD":
				//print(rule.type, rule.name);
				fd.printf(" ( ");
				manageRule(rule.type, rule.content, depth+1);
				fd.printf(" ) ");
			break;
			case "IMMEDIATE_TOKEN":
				fd.printf(" ( ");
				manageRule(rule.type, rule.content, depth+1);
				fd.printf(" ) ");
			break;
			case "PATTERN": {
				fd.printf(" /%s/", rule.value);
			}
			break;
			case "PREC":
			case "PREC_DYNAMIC":
			case "PREC_LEFT":
			case "PREC_RIGHT":
				fd.printf("  ( ");
				manageRule(rule.type, rule.content, depth+1);
				fd.printf(" ) ");
			break;
			case "REPEAT":
				//if(depth) fd.printf(" ( ");
				manageRule(rule.type, rule.content, depth+1);
				//if(depth) fd.printf(" )");
				fd.printf("* ");
			break;
			case "REPEAT1":
				//if(depth) fd.printf(" (");
				manageRule(rule.type, rule.content, depth+1);
				//if(depth) fd.printf(" )");
				fd.printf("+ ");
			break;
			case "SEQ": {
				let members = rule.members;
				let mcount = members.length;
				//fd.printf("=== %d : %d ===", mcount, depth);
				if(mcount > 1 && depth) fd.printf(" ( ");
				for(var idx in members) {
					manageRule(rule.type, members[idx], depth+1);
				}
				if(mcount > 1 && depth) fd.printf(" ) ");
			}
			break;

			case "STRING": {
				let value = rule.value;
				//print(rule.type, value);
				switch(value) {
					case "\0": fd.printf(" '\\0' "); break;
					case "\b": fd.printf(" '\\b' "); break;
					case "\f": fd.printf(" '\\f' "); break;
					case "\n": fd.printf(" '\\n' "); break;
					case "\r": fd.printf(" '\\r' "); break;
					case "\t": fd.printf(" '\\t' "); break;
					case "\v": fd.printf(" '\\v' "); break;
					case "\\": fd.printf(" '\\\\' "); break;
					case "'": fd.printf(" \"'\" "); break;
					case "\"": fd.printf(" '\"' "); break;
					default:
						//value = value.replace(/\\/g, "\\\\");
						value = value.replace(/\t/g, "\\t"); //order matter
						if(value.indexOf("'") >= 0) fd.printf(" \"%s\" ", value);
						else fd.printf(" \"%s\" ", value);
				}
			}
			break;

			case "SYMBOL":
				//print(rule.type, rule.name);
				fd.printf(" %s", rule.name);
			break;

			case "TOKEN":
				fd.printf(" ( ");
				manageRule(rule.type, rule.content, depth+1);
				fd.printf(" ) ");
			break;

			default:
				throw("Unknown rule type: " + rule.type);
		}
	}

	let rules = json.rules;
	for(var idx in rules) {
		let rule = rules[idx];
		//print(rule);
		if(rule.type == "xTOKEN") {
			let str = idx + " : " + + " .";
		}
		else {
			fd.printf("%s : ", idx);
			manageRule(idx, rule, 0);
		}
		fd.printf(" ;\n");
	}
	fd.close();
}

for(let idx in fname_list)
{
	let fname = fname_list[idx];
	print(fname);
	parseJsonGrammar(fname);
}

mingodad avatar Dec 14 '21 14:12 mingodad