all-of-javascript
all-of-javascript copied to clipboard
learn the-super-tiny-compiler
// LISP --> C
// 2 + 2 (add 2 2) add(2, 2)
// 4 - 2 (subtract 4 2) subtract(4, 2)
// 2 + (4 - 2) (add 2 (subtract 4 2)) add(2, subtract(4, 2))
// learn compiler from https://github.com/jamiebuilds/the-super-tiny-compiler/blob/master/the-super-tiny-compiler.js
// 1. Parsing(解析过程,将输入转出为AST)
// Lexical Analysis(input) => tokens, Syntactic Analysis(tokens) => AST
// 2. Transformation(转换过程,输入AST转出新的AST(或者是直接修改原AST))
// Traversal(DFS)--深度优先遍历整个抽象语法树, Transformer--定义了visitor去遍历处理每一个节点
// 3. Code Generation(用转换后的AST,产出代码的过程)
// Lexical Analysis
function tokenizer(input){
let tokens = []
let current = 0
while(current < input.length){
let char = input[current]
if(char === '('){
tokens.push({
type: 'paren',
value: '('
})
current++
continue
}
if(char === ')'){
tokens.push({
type: 'paren',
value: ')'
})
current++
continue
}
// 空格跳过
const WHITESPACE = /\s/
if(WHITESPACE.test(char)){
current++
continue
}
// 数字
const NUMBERS = /[0-9]/
if(NUMBERS.test(char)){
let value = ''
while(NUMBERS.test(char)){
value += char
char = input[++current]
}
tokens.push({
type: 'number',
value
})
continue
}
// 字符串 (concat "foo" "bar")
if(char === '"'){
let value = ''
char = input[++current]
while(char !== '"'){
value += char
char = input[++current]
}
char = input[++current] // 跳过最后的引号
tokens.push({type: 'string', value})
continue
}
// 字母
const LETTERS = /[a-z]/i
if(LETTERS.test(char)){
let value = ''
while(LETTERS.test(char)){
value += char
char = input[++current]
}
tokens.push({
type: 'name',
value: value
});
continue;
}
throw new TypeError('no matched character type!')
}
return tokens
}
// Syntactic Analysis
function parser(tokens){
let current = 0
function walk(){
let token = tokens[current]
if(token.type === 'number'){
current++
return {
type: 'NumberLiteral',
value: token.value
}
}
if(token.type === 'string'){
current++
return {
type: 'StringLiteral',
value: token.value
}
}
if(token.type === 'paren' && token.value === '('){
// 跳过无意义的 (
token = tokens[++current]
// 调用的函数
let node = {
type: 'CallExpression',
name: token.value,
params: []
}
// 再次跳过
token = tokens[++current]
// 这些都是调用函数的参数了
while((token.type !== 'paren') || (token.type === 'paren' && token.value !== ')')){
node.params.push(walk())
// 因为current++了
token = tokens[current]
}
// 跳过右圆括号
current++
return node
}
throw new TypeError('no matched character type!')
}
let ast = {
type: 'Program',
body: []
}
// 处理多个表达式的情况
while(current < tokens.length){
ast.body.push(walk())
}
return ast
}
// Traversal--DFS(visitor访问每一个节点,对其进行操作)
// traverser 与下面的 transformer 配合,这种设计结构也是精辟!!
function traverser(ast, visitor){
function traverserArray(array, parent){
array.forEach(function(child){
traverserNode(child, parent)
})
}
function traverserNode(node, parent){
// visitor对每个节点进行操作,分别设计了入口和出口的处理
let methods = visitor[node.type]
if(methods && methods.enter){
methods.enter(node, parent)
}
switch(node.type){
case 'Program':
traverserArray(node.body, node)
break
case 'CallExpression':
traverserArray(node.params, node)
break
case 'NumberLiteral':
case 'StringLiteral':
break
default:
throw new TypeError(node.type)
}
if (methods && methods.exit) {
methods.exit(node, parent);
}
}
traverserNode(ast, null)
}
// Transformation(ast --> newAst)
function transformer(ast){
const newAst = {
type: 'Program',
body: []
}
// 创建一个上下文环境
// 这里作者玩儿了点技巧,指针指向同一块儿内存,所以在ast._context中加的东西,就是给了newAst.body的
// 细细品味,非常巧妙的方法!!
ast._context = newAst.body
// 定义visitor
traverser(ast, {
NumberLiteral: {
enter(node, parent){
parent._context.push({
type: 'NumberLiteral',
value: node.value,
});
}
},
StringLiteral: {
enter(node, parent){
parent._context.push({
type: 'StringLiteral',
value: node.value,
});
}
},
CallExpression: {
enter(node, parent){
let expression = {
type: 'CallExpression',
callee: {
type: 'Identifier',
name: node.name,
},
arguments: [],
};
// 同样的内存玩儿法
node._context = expression.arguments;
if (parent.type !== 'CallExpression') {
expression = {
type: 'ExpressionStatement',
expression: expression,
};
}
parent._context.push(expression)
}
}
})
return newAst
}
// Code Generation
function codeGenerator(node){
switch(node.type){
case 'Program':
return node.body.map(codeGenerator).join('\n')
case 'ExpressionStatement':
return (codeGenerator(node.expression) + ';')
case 'CallExpression':
return (codeGenerator(node.callee) + '(' + node.arguments.map(codeGenerator).join(',') + ')')
case 'Identifier':
return node.name
case 'NumberLiteral':
return node.value
case 'StringLiteral':
return '"' + node.value + '"'
default:
throw new TypeError('error!')
}
}
// compiler
function compiler(input){
let tokens = tokenizer(input)
let ast = parser(tokens)
let newAst = transformer(ast)
let output = codeGenerator(newAst)
return output
}
module.exports = {
tokenizer,
parser,
transformer,
codeGenerator,
compiler
}