一个简单的语法转换器
实现一个 LISP to C 的语法转换器
我们就直奔主题吧!我们的目的是将以下 LISP 代码转换成 C 的代码
(add 2 (subtract 4 2))
add(2, subtract(4, 2));
一个语法转换器需要的三个阶段
- Parsing: 这个阶段的作用是将代码解析出来,然后转换成抽象语法树(AST)。
- Transformation: 在这个阶段可以做我们任何想做的操作,一般来讲就是两种 AST 语法之间相互的转换。
- Code Generation: 这个阶段我们把已经更新过的 AST 再转换成新的格式的代码。
1 我们重点讲下 Parsing 阶段
学过编译原理的同学们应该都知道,Parsing 会有两大阶段:
- 词法分析: 是计算机科学中将字符序列转换为标记(token)序列的过程。
- 语法分析: 是根据某种给定的形式文法对由单词序列(如英语单词序列)构成的输入文本进行分析并确定其语法结构的一种过程。
从语意上理解比较抽象,我这里用一个 code demo 来讲一下这两个过程。
1.1 语法分析
// input
const input = "(add 2 (subtract 4 2))";
// output
const tokens = [
{ type: "paren", value: "(" },
{ type: "name", value: "add" },
{ type: "number", value: "2" },
{ type: "paren", value: "(" },
{ type: "name", value: "subtract" },
{ type: "number", value: "4" },
{ type: "number", value: "2" },
{ type: "paren", value: ")" },
{ type: "paren", value: ")" },
];
那么我们来看看是怎么转换的呢?以下是 tokenizer 的实现:
- 通过一个 current 值(可以理解成一个指针)去遍历整个字符串。
- 判断每一个字符串的类型,然后 push 到 tokens 数组内。
- 获取到完成的 tokens 列表。
const tokenizer = (input: string) => {
let current = 0;
let tokens = [];
while (current < input.length) {
let char = input[current];
// 左括号
if (char === "(") {
tokens.push({
type: "paren",
value: "(",
});
current++;
continue;
}
// 右括号
if (char === ")") {
tokens.push({
type: "paren",
value: ")",
});
current++;
continue;
}
// 空格直接跳过
let WHITESPACE = /\s/;
if (WHITESPACE.test(char)) {
current++;
continue;
}
// 数字取整个数字 直到遇到非数字的字符串为止
let NUMBERS = /[0-9]/;
if (NUMBERS.test(char)) {
let value = "";
while (NUMBERS.test(char)) {
value += char;
char = input[++current];
}
tokens.push({ type: "number", value });
continue;
}
// 双引号 取整个字符串 直到遇到下一个双引号为止
if (char === '"') {
let value = "";
char = input[++current];
while (char !== '"') {
value += char;
char = input[++current];
}
char = input[++current];
tokens.push({ type: "string", value });
continue;
}
// 操作符 取整个字符串 直到遇到下一个非[a-z]的字符为止,一般为空格
let LETTERS = /[a-z]/i;
if (LETTERS.test(char)) {
let value = "";
while (LETTERS.test(char)) {
value += char;
char = input[++current];
}
tokens.push({ type: "name", value });
continue;
}
throw new TypeError("I dont know what this character is: " + char);
}
return tokens;
};
1.2 语法分析
// input
const tokens = [
{ type: "paren", value: "(" },
{ type: "name", value: "add" },
{ type: "number", value: "2" },
{ type: "paren", value: "(" },
{ type: "name", value: "subtract" },
{ type: "number", value: "4" },
{ type: "number", value: "2" },
{ type: "paren", value: ")" },
{ type: "paren", value: ")" },
];
// output
const ast = {
type: "Program",
body: [
{
type: "CallExpression",
name: "add",
params: [
{
type: "NumberLiteral",
value: "2",
},
{
type: "CallExpression",
name: "subtract",
params: [
{
type: "NumberLiteral",
value: "4",
},
{
type: "NumberLiteral",
value: "2",
},
],
},
],
},
],
};
再看下这个转换的过程需要哪些步骤:
- 通过一个 current 值(可以理解成一个指针)去遍历整个 tokens 数组。
- 对每一个 token 的不同类型做不同的处理。
- 形成一个基于 LISP 生成的 AST。
interface IToken {
type: string;
value: string | number;
}
const parser = (tokens: IToken[]) => {
let current = 0;
function walk() {
let token = tokens[current];
// 数字
if (token.type === "number") {
current++;
return {
type: "NumberLiteral",
value: token.value,
};
}
// 字符串
if (token.type === "string") {
current++;
return {
type: "StringLiteral",
value: token.value,
};
}
// 带括号的表达式
if (token.type === "paren" && token.value === "(") {
token = tokens[++current];
let node: IASTExpressionNode = {
type: "CallExpression",
name: token.value,
params: [],
};
token = tokens[++current];
while (
token.type !== "paren" ||
(token.type === "paren" && token.value !== ")")
) {
// 进一步遍历
node.params.push(walk());
token = tokens[current];
}
current++;
return node;
}
throw new TypeError(token.type);
}
let ast: IAST = {
type: "Program",
body: [],
};
while (current < tokens.length) {
// 生成ast
ast.body.push(walk());
}
return ast;
};
2. Transformation: 将 LISP 的 AST 转成 C 的 AST
两种语言之间很可能存在语法差别,所以需要通过 Transformation 将语法转换掉。
// input
const ast = {
type: "Program",
body: [
{
type: "CallExpression",
name: "add",
params: [
{
type: "NumberLiteral",
value: "2",
},
{
type: "CallExpression",
name: "subtract",
params: [
{
type: "NumberLiteral",
value: "4",
},
{
type: "NumberLiteral",
value: "2",
},
],
},
],
},
],
};
// output
const newAst = {
type: "Program",
body: [
{
type: "ExpressionStatement",
expression: {
type: "CallExpression",
callee: {
type: "Identifier",
name: "add",
},
arguments: [
{
type: "NumberLiteral",
value: "2",
},
{
type: "CallExpression",
callee: {
type: "Identifier",
name: "subtract",
},
arguments: [
{
type: "NumberLiteral",
value: "4",
},
{
type: "NumberLiteral",
value: "2",
},
],
},
],
},
},
],
};
对 AST 内的每一种类型填充 enter 和 exit 函数(处理前和处理后)分别在遍历进入和退出的时候执行
function transformer(ast) {
let newAst = {
type: "Program",
body: [],
};
ast._context = newAst.body;
function traverser(ast, visitor) {
function traverseArray(array, parent) {
array.forEach((child) => {
traverseNode(child, parent);
});
}
function traverseNode(node, parent) {
let methods = visitor[node.type];
if (methods && methods.enter) {
methods.enter(node, parent);
}
switch (node.type) {
case "Program":
traverseArray(node.body, node);
break;
case "CallExpression":
traverseArray(node.params, node);
break;
case "NumberLiteral":
case "StringLiteral":
break;
default:
throw new TypeError(node.type);
}
if (methods && methods.exit) {
methods.exit(node, parent);
}
}
traverseNode(ast, null);
}
traverser(ast, {
NumberLiteral: {
enter(node, parent) {
parent._context.push({
type: "NumberLiteral",
value: node.value,
});
},
},
StringLiteral: {
enter(node, parent) {
parent._context.push({
type: "StringLiteral",
value: node.value,
});
},
},
CallExpression: {
enter(node, parent) {
let expression = {
type: "CallExpression",
callee: {
type: "Identifier",
name: node.name,
},
arguments: [],
};
node._context = expression.arguments;
if (parent.type !== "CallExpression") {
expression = {
type: "ExpressionStatement",
expression: expression,
};
}
parent._context.push(expression);
},
},
});
return newAst;
}
3. Code Generation:将 C 的 AST 转化成 C 的 code
// input
const newAst = {
type: "Program",
body: [
{
type: "ExpressionStatement",
expression: {
type: "CallExpression",
callee: {
type: "Identifier",
name: "add",
},
arguments: [
{
type: "NumberLiteral",
value: "2",
},
{
type: "CallExpression",
callee: {
type: "Identifier",
name: "subtract",
},
arguments: [
{
type: "NumberLiteral",
value: "4",
},
{
type: "NumberLiteral",
value: "2",
},
],
},
],
},
},
],
};
// output
const output = "add(2, subtract(4, 2));";
- 深度优先遍历基于 C 的 AST。
- 通过不同类型处理成不同的字符串。
- 将所有字符串组合成为 C 的 code。
const codeGenerator = (node) => {
switch (node.type) {
case "Program":
return node.body.map(codeGenerator).join("\n");
case "ExpressionStatement":
return codeGenerator(node.expression) + ";";
case "CallExpression":
return (
codeGenerator(node.callee) +
"(" +
node.arguments.map(codeGenerator).join(", ") +
")"
);
case "Identifier":
return node.name;
case "NumberLiteral":
return node.value;
case "StringLiteral":
return '"' + node.value + '"';
default:
throw new TypeError(node.type);
}
};