all-of-javascript icon indicating copy to clipboard operation
all-of-javascript copied to clipboard

learn the-super-tiny-compiler

Open cbbfcd opened this issue 6 years ago • 0 comments

//                   LISP           -->          C
//  2 + 2          (add 2 2)                 add(2, 2)
//  4 - 2          (subtract 4 2)            subtract(4, 2)
//  2 + (4 - 2)    (add 2 (subtract 4 2))    add(2, subtract(4, 2))
// learn compiler from https://github.com/jamiebuilds/the-super-tiny-compiler/blob/master/the-super-tiny-compiler.js

// 1. Parsing(解析过程,将输入转出为AST)
//    Lexical Analysis(input) => tokens,  Syntactic Analysis(tokens) => AST
// 2. Transformation(转换过程,输入AST转出新的AST(或者是直接修改原AST))
//    Traversal(DFS)--深度优先遍历整个抽象语法树, Transformer--定义了visitor去遍历处理每一个节点
// 3. Code Generation(用转换后的AST,产出代码的过程)

// Lexical Analysis
function tokenizer(input){
  let tokens = []
  let current = 0
  while(current < input.length){
    let char = input[current]
    if(char === '('){
      tokens.push({
        type: 'paren',
        value: '('
      })
      current++
      continue
    }
    if(char === ')'){
      tokens.push({
        type: 'paren',
        value: ')'
      })
      current++
      continue
    }
    // 空格跳过
    const WHITESPACE = /\s/
    if(WHITESPACE.test(char)){
      current++
      continue
    }
    // 数字
    const NUMBERS = /[0-9]/
    if(NUMBERS.test(char)){
      let value = ''
      while(NUMBERS.test(char)){
        value += char
        char = input[++current]
      }
      tokens.push({
        type: 'number',
        value
      })
      continue
    }
    // 字符串 (concat "foo" "bar")
    if(char === '"'){
      let value = ''
      char = input[++current]
      while(char !== '"'){
        value += char
        char = input[++current]
      }
      char = input[++current] // 跳过最后的引号
      tokens.push({type: 'string', value})
      continue
    }
    // 字母
    const LETTERS = /[a-z]/i
    if(LETTERS.test(char)){
      let value = ''
      while(LETTERS.test(char)){
        value += char
        char = input[++current]
      }
      tokens.push({
        type: 'name',
        value: value
      });
      continue;
    }
    throw new TypeError('no matched character type!')
  }
  return tokens
}

// Syntactic Analysis
function parser(tokens){
  let current = 0
  function walk(){
    let token = tokens[current]
    if(token.type === 'number'){
      current++
      return {
        type: 'NumberLiteral',
        value: token.value
      }
    }
    if(token.type === 'string'){
      current++
      return {
        type: 'StringLiteral',
        value: token.value
      }
    }
    if(token.type === 'paren' && token.value === '('){
      // 跳过无意义的 (
      token = tokens[++current]
      // 调用的函数
      let node = {
        type: 'CallExpression',
        name: token.value,
        params: []
      }
      // 再次跳过
      token = tokens[++current]
      // 这些都是调用函数的参数了
      while((token.type !== 'paren') || (token.type === 'paren' && token.value !== ')')){
        node.params.push(walk())
        // 因为current++了
        token = tokens[current]
      }
      // 跳过右圆括号
      current++
      return node
    }
    throw new TypeError('no matched character type!')
  }
  let ast = {
    type: 'Program',
    body: []
  }
  // 处理多个表达式的情况
  while(current < tokens.length){
    ast.body.push(walk())
  }
  return ast
}

// Traversal--DFS(visitor访问每一个节点,对其进行操作)
// traverser 与下面的 transformer 配合,这种设计结构也是精辟!!
function traverser(ast, visitor){
  function traverserArray(array, parent){
    array.forEach(function(child){
      traverserNode(child, parent)
    })
  }
  function traverserNode(node, parent){
    // visitor对每个节点进行操作,分别设计了入口和出口的处理
    let methods = visitor[node.type]
    if(methods && methods.enter){
      methods.enter(node, parent)
    }
    switch(node.type){
      case 'Program':
        traverserArray(node.body, node)
        break
      case 'CallExpression':
        traverserArray(node.params, node)
        break
      case 'NumberLiteral':
      case 'StringLiteral':
        break
      default:
        throw new TypeError(node.type)
    }
    if (methods && methods.exit) {
      methods.exit(node, parent);
    }
  }
  traverserNode(ast, null)
}

// Transformation(ast --> newAst)
function transformer(ast){
  const newAst = {
    type: 'Program',
    body: []
  }
  // 创建一个上下文环境
  // 这里作者玩儿了点技巧,指针指向同一块儿内存,所以在ast._context中加的东西,就是给了newAst.body的
  // 细细品味,非常巧妙的方法!!
  ast._context = newAst.body
  // 定义visitor
  traverser(ast, {
    NumberLiteral: {
      enter(node, parent){
        parent._context.push({
          type: 'NumberLiteral',
          value: node.value,
        });
      }
    },
    StringLiteral: {
      enter(node, parent){
        parent._context.push({
          type: 'StringLiteral',
          value: node.value,
        });
      }
    },
    CallExpression: {
      enter(node, parent){
        let expression = {
          type: 'CallExpression',
          callee: {
            type: 'Identifier',
            name: node.name,
          },
          arguments: [],
        };
        // 同样的内存玩儿法
        node._context = expression.arguments;
        if (parent.type !== 'CallExpression') {
          expression = {
            type: 'ExpressionStatement',
            expression: expression,
          };
        }
        parent._context.push(expression)
      }
    }
  })
  return newAst
}

// Code Generation
function codeGenerator(node){
  switch(node.type){
    case 'Program':
      return node.body.map(codeGenerator).join('\n')
    case 'ExpressionStatement':
      return (codeGenerator(node.expression) + ';')
    case 'CallExpression':
      return (codeGenerator(node.callee) + '(' + node.arguments.map(codeGenerator).join(',') + ')')
    case 'Identifier':
      return node.name
    case 'NumberLiteral':
      return node.value
    case 'StringLiteral':
      return '"' + node.value + '"'
    default:
      throw new TypeError('error!')
  }
}

// compiler
function compiler(input){
  let tokens = tokenizer(input)
  let ast = parser(tokens)
  let newAst = transformer(ast)
  let output = codeGenerator(newAst)
  return output
}

module.exports = {
  tokenizer,
  parser,
  transformer,
  codeGenerator,
  compiler
}

cbbfcd avatar Sep 26 '18 11:09 cbbfcd