langium AST to code transformation (serializer / stringifier)

We need to transform a model (AST) into its textual representation. It would be great if Langium supported this.

Apr 26 '25 04:04 AresEkb

I assume here you mean generically mapping an AST to its textual syntax according to the language's grammar definition, which we call serializing. Generating arbitrary code is already covered with the generator API.

May 13 '25 07:05 spoenemann

Yes, I've implemented the following:

import { AstNode, Grammar, GrammarAST, isAstNode, isReference } from 'langium';

/**
 * Options object that `print` accepts and hands down unchanged to every
 * recursive printing function.
 */
export interface PrintContext {
  // NOTE(review): none of the printing functions visible in this file read this
  // field (indentation is applied with a fixed two-space unit). Presumably it is
  // reserved for a configurable indentation string; confirm before relying on it.
  indent: string;
}

/**
 * Stateful reader over the properties of one AST node.
 *
 * Scalar properties are returned as-is on every call. Array-valued properties
 * are consumed one element per call: the reader remembers, per property name,
 * how many elements it has already handed out and returns the next one
 * (`undefined` once the array is exhausted).
 */
class AstNodePropertyGetter {
  /** The wrapped AST node. */
  private node: AstNode;
  /** Next element index to hand out, keyed by property name. */
  private indices: Map<string, number>;

  /**
   * @param object The AST node whose properties will be read.
   */
  constructor(object: AstNode) {
    this.indices = new Map<string, number>();
    this.node = object;
  }

  /** @returns The `$type` of the wrapped node. */
  type() {
    const { $type } = this.node;
    return $type;
  }

  /**
   * Reads a property of the wrapped node.
   *
   * @param property Name of the property to read.
   * @returns The property value, or for arrays the next not-yet-returned element.
   */
  get(property: string) {
    const bag = this.node as unknown as Record<string, unknown>;
    const raw = bag[property];
    if (Array.isArray(raw)) {
      // Advance the per-property cursor so the following call yields the next element.
      const cursor = this.indices.get(property) ?? 0;
      this.indices.set(property, cursor + 1);
      return raw[cursor];
    }
    return raw;
  }
}

/**
 * Serializes an AST node to text by walking the grammar rule named after the
 * node's `$type`.
 *
 * @param node The AST node to print.
 * @param grammar The grammar whose rules drive the printing.
 * @param context Printing options, passed through to the element printers.
 * @returns The printed text: trimmed, with trailing spaces removed from every
 *   line, indented by brace depth and terminated by a single line break.
 * @throws Error if the grammar has no parser rule whose name equals the node's `$type`.
 */
export function print(node: AstNode, grammar: Grammar, context: PrintContext = { indent: '' }) {
  const rule = grammar.rules.find(
    (candidate) => GrammarAST.isParserRule(candidate) && candidate.name === node.$type,
  );
  if (!rule) {
    throw new Error(`Rule not found for ${node.$type}`);
  }

  const getter = new AstNodePropertyGetter(node);
  const raw = printElement(getter, grammar, rule.definition, context);

  // An unprintable node yields an empty body; otherwise drop the outer
  // whitespace and any spaces left dangling at line ends.
  const cleaned = raw === undefined ? '' : raw.trim().replace(/[ ]+$/gm, '');
  return `${indent(cleaned)}\n`;
}

/**
 * Prints one grammar element against the given node, dispatching on the
 * element kind.
 *
 * - Groups, alternatives and assignments are printed through `repeat`, which
 *   applies the element's cardinality.
 * - A call to a rule named `_NL` prints a line break.
 * - A call to a fragment rule, or to a rule named like the node's type, prints
 *   that rule's definition against the same node; any other rule call does not match.
 * - An action matches (printing nothing) only when its inferred type is the node's type.
 * - A keyword prints its value.
 *
 * @param node Property reader for the AST node being printed.
 * @param grammar The grammar, passed through to nested printers.
 * @param element The grammar element to print.
 * @param context Printing options, passed through unchanged.
 * @returns The printed text, or `undefined` when the element does not match the node.
 * @throws Error for any other element kind.
 */
function printElement(
  node: AstNodePropertyGetter,
  grammar: Grammar,
  element: GrammarAST.AbstractElement,
  context: PrintContext,
): string | undefined {
  if (GrammarAST.isGroup(element)) {
    return repeat(node, grammar, element, context, printGroup);
  }
  if (GrammarAST.isAlternatives(element)) {
    return repeat(node, grammar, element, context, printAlternatives);
  }
  if (GrammarAST.isAssignment(element)) {
    return repeat(node, grammar, element, context, printAssignment);
  }

  if (GrammarAST.isRuleCall(element)) {
    const target = element.rule.ref;
    if (target?.name === '_NL') {
      return '\n';
    }
    // Fragments are inlined unconditionally; ordinary rules only when they
    // are the rule that produced this node's type.
    const applies = target?.fragment || node.type() === target?.name;
    return applies ? printElement(node, grammar, target?.definition, context) : undefined;
  }

  if (GrammarAST.isAction(element)) {
    if (node.type() === element.inferredType?.name) {
      return '';
    }
    return undefined;
  }

  if (GrammarAST.isKeyword(element)) {
    return element.value;
  }

  throw new Error(`Unsupported ${element.$type}`);
}

/**
 * Prints every element of a group in order and joins the pieces.
 *
 * Printing stops at the first element that does not match. Values already read
 * from the node by earlier elements stay consumed in that case.
 *
 * @param node Property reader for the AST node being printed.
 * @param grammar The grammar, passed through to nested printers.
 * @param element The group to print.
 * @param context Printing options, passed through unchanged.
 * @returns The joined text, or `undefined` if any element of the group does not match.
 */
function printGroup(
  node: AstNodePropertyGetter,
  grammar: Grammar,
  element: GrammarAST.Group,
  context: PrintContext,
): string | undefined {
  const pieces: string[] = [];
  // `every` short-circuits on the first non-matching child, exactly like an early return.
  const complete = element.elements.every((child) => {
    const piece = printElement(node, grammar, child, context);
    if (piece === undefined) {
      return false;
    }
    pieces.push(piece);
    return true;
  });
  return complete ? join(pieces) : undefined;
}

/**
 * Prints the first alternative that matches the node.
 *
 * Alternatives are tried in declaration order; later ones are not evaluated
 * once one has produced text.
 *
 * @param node Property reader for the AST node being printed.
 * @param grammar The grammar, passed through to nested printers.
 * @param element The alternatives to choose from.
 * @param context Printing options, passed through unchanged.
 * @returns The text of the first matching alternative, or `undefined` if none matches.
 */
function printAlternatives(
  node: AstNodePropertyGetter,
  grammar: Grammar,
  element: GrammarAST.Alternatives,
  context: PrintContext,
): string | undefined {
  let chosen: string | undefined;
  // `some` stops iterating as soon as an option yields text.
  element.elements.some((option) => {
    chosen = printElement(node, grammar, option, context);
    return chosen !== undefined;
  });
  return chosen;
}

/** Characters that are backslash-escaped inside a single-quoted STRING literal. */
const STRING_LITERAL_ESCAPES = new Map<string, string>([
  ['\\', '\\\\'],
  ['\b', '\\b'],
  ['\f', '\\f'],
  ['\n', '\\n'],
  ['\r', '\\r'],
  ['\t', '\\t'],
  ['\v', '\\v'],
  ["'", "\\'"],
]);

/**
 * Wraps a string in single quotes, escaping EVERY backslash, quote and control
 * character it contains (a single pass, so inserted backslashes are never re-escaped).
 */
function quoteStringLiteral(value: string): string {
  // Inside a character class `\b` denotes the backspace character.
  const escaped = value.replace(/[\\\b\f\n\r\t\v']/g, (ch) => STRING_LITERAL_ESCAPES.get(ch) ?? ch);
  return `'${escaped}'`;
}

/**
 * Prints the value an assignment stores on the node.
 *
 * Supported terminals:
 * - a call to a parser rule: the value must be an AST node and is printed with
 *   that rule's definition;
 * - a call to a terminal rule: strings are printed verbatim, except for the
 *   rule named `STRING`, whose value is quoted and escaped; numbers are printed
 *   with `toString()`;
 * - a cross-reference: the reference's `$refText` is printed.
 *
 * @param node Property reader for the AST node being printed; each call consumes
 *   one value of `element.feature`.
 * @param grammar The grammar, passed through to nested printers.
 * @param element The assignment to print.
 * @param context Printing options, passed through unchanged.
 * @returns The printed value, or `undefined` when the node has no (further)
 *   value for the feature.
 * @throws Error when the value's runtime type does not fit the terminal, when
 *   the called rule is neither a parser nor a terminal rule, or when the
 *   terminal kind is unsupported.
 */
function printAssignment(
  node: AstNodePropertyGetter,
  grammar: Grammar,
  element: GrammarAST.Assignment,
  context: PrintContext,
): string | undefined {
  const terminal = element.terminal;

  if (GrammarAST.isRuleCall(terminal)) {
    const value = node.get(element.feature);
    if (value === undefined) {
      return undefined;
    }
    const rule = terminal.rule.ref;
    if (GrammarAST.isParserRule(rule)) {
      if (!isAstNode(value)) {
        throw new Error(`Expected AST node but got '${value}' with type ${typeof value}`);
      }
      // Child nodes get their own reader so list cursors do not leak between nodes.
      return printElement(new AstNodePropertyGetter(value), grammar, rule.definition, context);
    }
    if (GrammarAST.isTerminalRule(rule)) {
      if (typeof value === 'string') {
        return rule.name === 'STRING' ? quoteStringLiteral(value) : value;
      }
      if (typeof value === 'number') {
        return value.toString();
      }
      throw new Error(`Unsupported value '${value}' with type ${typeof value}`);
    }
    throw new Error(`Unsupported rule call target for feature '${element.feature}'`);
  }

  if (GrammarAST.isCrossReference(terminal)) {
    const value = node.get(element.feature);
    if (value === undefined) {
      return undefined;
    }
    if (!isReference(value)) {
      throw new Error(`Expected cross-reference but got '${value}' with type ${typeof value}`);
    }
    return value.$refText;
  }

  throw new Error(`Unsupported terminal ${element.terminal.$type}`);
}

/** Upper bound on iterations of one repeated element; guards against endless loops. */
const MAX_REPEAT_COUNT = 10000;

/**
 * Applies an element's cardinality around a single-shot printer.
 *
 * The printer is invoked once, or for `*` / `+` repeatedly until it reports no
 * match. The collected pieces are joined with `join`.
 *
 * @param node Property reader for the AST node being printed.
 * @param grammar The grammar, passed through to `func`.
 * @param element The grammar element whose cardinality is applied.
 * @param context Printing options, passed through unchanged.
 * @param func Prints one occurrence of `element`; returns `undefined` for "no match".
 * @returns The joined text. For `?` and `*` an empty string is returned when
 *   nothing matched; otherwise `undefined` signals that the element did not match.
 * @throws Error when a repeated element matches more than `MAX_REPEAT_COUNT`
 *   times, which happens when it matches without consuming any value from the node.
 */
function repeat<T extends GrammarAST.AbstractElement>(
  node: AstNodePropertyGetter,
  grammar: Grammar,
  element: T,
  context: PrintContext,
  func: (node: AstNodePropertyGetter, grammar: Grammar, element: T, context: PrintContext) => string | undefined,
) {
  const results: string[] = [];
  const { cardinality } = element;
  const many = cardinality === '*' || cardinality === '+';
  let i = 0;
  do {
    const value = func(node, grammar, element, context);
    if (value === undefined) {
      break;
    }
    if (i++ >= MAX_REPEAT_COUNT) {
      throw new Error(
        `Exceeded ${MAX_REPEAT_COUNT} repetitions while printing ${element.$type}; ` +
          'the repeated element probably matches without consuming any value from the node',
      );
    }
    results.push(value);
  } while (many);

  // Optional cardinalities succeed with empty output; mandatory ones need at least one match.
  const optional = cardinality === '?' || cardinality === '*';
  return results.length || optional ? join(results) : undefined;
}

/**
 * Concatenates printed pieces, inserting a single space between neighbours
 * except where the spacing rules below glue them together.
 *
 * A piece is appended without a space when the text so far is empty or ends
 * with a line break or `(`, or when the piece itself starts with `,`, `(` or
 * `)`. Empty pieces never contribute a space.
 *
 * @param strings The pieces to concatenate, in order.
 * @returns The concatenated text.
 */
function join(strings: string[]) {
  const gluingSuffixes = ['\n', '('];
  const gluingPrefixes = [',', '(', ')'];

  return strings.reduce((text, piece) => {
    const glued =
      text.length === 0 ||
      gluingSuffixes.some((suffix) => text.endsWith(suffix)) ||
      gluingPrefixes.some((prefix) => piece.startsWith(prefix));
    if (glued) {
      return text + piece;
    }
    return piece ? `${text} ${piece}` : text;
  }, '');
}

/**
 * Indents every line by two spaces per level of `{` ... `}` nesting.
 *
 * Indentation is inserted after a line break unless the next character is
 * another line break (blank lines stay empty). A line that starts with `}` is
 * indented one level less, so the brace lines up with the line that opened its block.
 *
 * The nesting depth never drops below zero: an unmatched `}` is copied through
 * instead of producing a negative repeat count (which would throw a `RangeError`).
 *
 * Braces are counted wherever they occur, including inside quoted strings.
 *
 * @param str The text to indent.
 * @returns The indented text.
 */
function indent(str: string) {
  const unit = '  ';
  let result = '';
  let depth = 0;
  for (let i = 0; i < str.length; i++) {
    const ch = str[i];
    const next = str[i + 1];
    result += ch;
    if (ch === '{') {
      depth++;
    } else if (ch === '}') {
      // Clamp so stray closing braces cannot push the depth negative.
      depth = Math.max(0, depth - 1);
    } else if (ch === '\n' && next !== '\n') {
      // The closing brace belongs to the enclosing level.
      const level = next === '}' ? depth - 1 : depth;
      result += unit.repeat(Math.max(0, level));
    }
  }
  return result;
}

It's not universal, but it works fine for my test grammar:

// Test grammar for the printer: a small class-modelling language with classes,
// properties and a set of configurable data types.
grammar ClassModel

// Root rule: the shared Localization prefix, the 'classModel' header with a
// name, then any mix of classes and data types.
entry ClassModel:
    Localization
    'classModel' name=ID
    (classes+=Class | dataTypes+=DataType)*;

// A class has a kind, a name, an optional comma-separated 'extends' list of
// references to other classes and an optional braced body of properties.
Class:
    Localization
    kind=ClassKind name=ID ('extends' generals+=[Class:ID] (',' generals+=[Class:ID])*)? ('{'
        properties+=Property*
    _NL? _NL? '}')?;

// Each alternative starts with an action that creates a distinct inferred type,
// so the keyword(s) that were used are recoverable from the node's type.
ClassKind:
    {infer ClassKind_Regular} 'class' |
    {infer ClassKind_Abstract} 'abstract' 'class' |
    {infer ClassKind_Interface} 'interface';

// A property is either an attribute or a reference.
Property:
    Attribute | Reference;

// Attribute: a named property whose type is a reference to a DataType.
Attribute:
    Localization
    'attribute' name=ID dataType=[DataType:ID];

// Reference: a named property whose target is a reference to a Class.
Reference:
    Localization
    'reference' name=ID target=[Class:ID];

// Union of all data type declarations.
DataType:
    StringType | NumericType | BooleanType | TimeType | UuidType | EnumeratedType;

// String type with optional facets inside an optional braced body.
StringType:
    Localization
    'string' name=ID ('{'
        (_NL? 'length' length=INT)?
        (_NL? 'minLength' minLength=INT)?
        (_NL? 'maxLength' maxLength=INT)?
        (_NL? 'pattern' pattern=STRING)?
    _NL? '}')?;

// Numeric type with optional facets inside an optional braced body.
NumericType:
    Localization
    'numeric' name=ID ('{'
        (_NL? 'size' size=INT)?
        (_NL? 'totalDigits' totalDigits=INT)?
        (_NL? 'fractionDigits' fractionDigits=INT)?
        (_NL? 'minInclusive' minInclusive=INT)?
        (_NL? 'minExclusive' minExclusive=INT)?
        (_NL? 'maxInclusive' maxInclusive=INT)?
        (_NL? 'maxExclusive' maxExclusive=INT)?
        // NOTE(review): the 'measurementUnit' keyword assigns to a feature named
        // `pattern`, the same name StringType uses for its 'pattern' facet. This
        // looks like a copy-paste; `measurementUnit=STRING` may be intended. Confirm.
        (_NL? 'measurementUnit' pattern=STRING)?
    _NL? '}')?;

// Boolean type; the optional braced body has no facets.
BooleanType:
    Localization
    'boolean' name=ID ('{' _NL? '}')?;

// Time type with optional unit lists, fraction digits and recurrence.
TimeType:
    Localization
    'time' name=ID ('{'
        (_NL? 'instantUnits' instantUnits+=TimeUnit+)?
        (_NL? 'instantFractionDigits' instantFractionDigits=INT)?
        (_NL? 'durationUnits' durationUnits+=TimeUnit+)?
        (_NL? 'durationFractionDigits' durationFractionDigits=INT)?
        (_NL? 'recurrence' recurrence=TimeUnit)?
    _NL? '}')?;

// Like ClassKind: one inferred type per keyword.
TimeUnit:
    {infer TimeUnit_Year} 'year' |
    {infer TimeUnit_Quarter} 'quarter' |
    {infer TimeUnit_Month} 'month' |
    {infer TimeUnit_Week} 'week' |
    {infer TimeUnit_Day} 'day' |
    {infer TimeUnit_Hour} 'hour' |
    {infer TimeUnit_Minute} 'minute' |
    {infer TimeUnit_Second} 'second';

// UUID type; the optional braced body has no facets.
UuidType:
    Localization
    'uuid' name=ID ('{' _NL? '}')?;

// Enumerated type with an optional braced list of literals.
EnumeratedType:
    Localization
    'enumerated' name=ID ('{'
        literals+=EnumeratedTypeLiteral*
    _NL? _NL? '}')?;

// A single named literal of an enumerated type.
EnumeratedTypeLiteral:
    Localization
    name=ID;

// Shared prefix of most rules: any number of '@name(...)' and
// '@description(...)' annotations, each holding one key/value entry.
fragment Localization:
    _NL?
    (_NL? '@name' '(' localizedName+=EStringToStringMapEntry ')' |
     _NL? '@description' '(' localizedDescription+=EStringToStringMapEntry ')')*
    _NL?;

// Key/value pair of two STRING literals separated by a comma.
EStringToStringMapEntry:
    key=STRING ',' value=STRING;

// Layout marker: a string-returning rule that matches the literal '__NL__'.
// The TypeScript printer in this file prints a line break for every call to a
// rule named `_NL`.
_NL returns string: '__NL__';

terminal ID: /[_a-zA-Z][\w_]*/;
terminal INT: /\d+/;
// Single-quoted string; a backslash escapes the character that follows it.
terminal STRING: /'(\\.|[^'])*'/; 

hidden terminal WS: /\s+/;
hidden terminal ML_COMMENT: /\/\*[\s\S]*?\*\//;
hidden terminal SL_COMMENT: /\/\/[^\n\r]*/;

I had to add an `_NL` token that tells the code printer where to insert a new line. It's similar to the printing instructions in EMFText. I find this approach simpler than the code formatters implemented in Java/TypeScript in Xtext/Langium.

May 13 '25 07:05 AresEkb

langium langium copied to clipboard

AST to code transformation (serializer / stringifier)

langium
langium copied to clipboard