Introduction: The Ultimate Translation
Every time you write print("Hello, World!") and hit compile, a minor miracle occurs. Your human-readable code transforms through multiple layers of abstraction, eventually becoming a sequence of 1s and 0s that your CPU can execute directly. This transformation is the work of a compiler—one of the most sophisticated programs ever written.
Understanding compilers is essential for serious programmers because:
- Performance: Knowing what the compiler does helps you write faster code
- Debugging: Compiler errors make more sense when you understand compilation phases
- Language design: Creating DSLs or new languages requires compiler knowledge
- Career growth: Compiler engineering is a specialized, high-value skill
This article will demystify compilers by building a simple one from scratch, explaining each phase with working code.
The Compilation Pipeline
Modern compilers follow a multi-stage pipeline:
interface CompilerPipeline {
lexer: (source: string) => Token[];
parser: (tokens: Token[]) => AST;
semanticAnalyzer: (ast: AST) => TypedAST;
optimizer: (ir: IR) => IR;
codeGenerator: (ir: IR) => MachineCode;
}
Let us build each component.
Phase 1: Lexical Analysis (Lexing)
The lexer breaks source code into tokens—the smallest meaningful units.
enum TokenType {
NUMBER,
IDENTIFIER,
KEYWORD,
OPERATOR,
LPAREN,
RPAREN,
LBRACE,
RBRACE,
SEMICOLON,
EOF
}
interface Token {
type: TokenType;
value: string;
line: number;
column: number;
}
class Lexer {
private pos = 0;
private line = 1;
private column = 1;
private keywords = new Set(["let", "if", "else", "while", "return", "function"]);
constructor(private source: string) {}
private peek(): string {
return this.source[this.pos] || "";
}
private advance(): string {
const char = this.source[this.pos++];
if (char === "\n") {
this.line++;
this.column = 1;
} else {
this.column++;
}
return char;
}
private skipWhitespace(): void {
while (/\s/.test(this.peek())) {
this.advance();
}
}
private readNumber(): Token {
const start = { line: this.line, column: this.column };
let num = "";
while (/[0-9.]/.test(this.peek())) {
num += this.advance();
}
return { type: TokenType.NUMBER, value: num, ...start };
}
private readIdentifier(): Token {
const start = { line: this.line, column: this.column };
let id = "";
while (/[a-zA-Z0-9_]/.test(this.peek())) {
id += this.advance();
}
const type = this.keywords.has(id) ? TokenType.KEYWORD : TokenType.IDENTIFIER;
return { type, value: id, ...start };
}
tokenize(): Token[] {
const tokens: Token[] = [];
while (this.pos < this.source.length) {
this.skipWhitespace();
if (this.pos >= this.source.length) break;
const char = this.peek();
const start = { line: this.line, column: this.column };
if (/[0-9]/.test(char)) {
tokens.push(this.readNumber());
} else if (/[a-zA-Z_]/.test(char)) {
tokens.push(this.readIdentifier());
} else if ("+-*/=<>!".includes(char)) {
let op = this.advance();
if ("=<>!".includes(char) && this.peek() === "=") {
op += this.advance();
}
tokens.push({ type: TokenType.OPERATOR, value: op, ...start });
} else if (char === "(") {
this.advance();
tokens.push({ type: TokenType.LPAREN, value: "(", ...start });
} else if (char === ")") {
this.advance();
tokens.push({ type: TokenType.RPAREN, value: ")", ...start });
} else if (char === "{") {
this.advance();
tokens.push({ type: TokenType.LBRACE, value: "{", ...start });
} else if (char === "}") {
this.advance();
tokens.push({ type: TokenType.RBRACE, value: "}", ...start });
} else if (char === ";") {
this.advance();
tokens.push({ type: TokenType.SEMICOLON, value: ";", ...start });
} else {
throw new Error("Unexpected character: " + char + " at " + start.line + ":" + start.column);
}
}
tokens.push({ type: TokenType.EOF, value: "", line: this.line, column: this.column });
return tokens;
}
}
const code = "let x = 42 + 10;";
const lexer = new Lexer(code);
const tokens = lexer.tokenize();
console.log(tokens);

Phase 2: Syntax Analysis (Parsing)
The parser builds an Abstract Syntax Tree (AST) from tokens, encoding the grammatical structure.
type ASTNode =
| NumberLiteral
| Identifier
| BinaryOp
| Assignment
| Block
| IfStatement
| WhileLoop
| FunctionDecl;
interface NumberLiteral {
type: "NumberLiteral";
value: number;
}
interface Identifier {
type: "Identifier";
name: string;
}
interface BinaryOp {
type: "BinaryOp";
operator: string;
left: ASTNode;
right: ASTNode;
}
interface Assignment {
type: "Assignment";
name: string;
value: ASTNode;
}
class Parser {
private pos = 0;
constructor(private tokens: Token[]) {}
private peek(): Token {
return this.tokens[this.pos];
}
private advance(): Token {
return this.tokens[this.pos++];
}
private expect(type: TokenType): Token {
const token = this.advance();
if (token.type !== type) {
throw new Error("Expected " + TokenType[type] + ", got " + TokenType[token.type]);
}
return token;
}
private parseExpression(minPrecedence = 0): ASTNode {
let left = this.parsePrimary();
while (this.peek().type === TokenType.OPERATOR) {
const op = this.peek().value;
const precedence = this.getPrecedence(op);
if (precedence < minPrecedence) break;
this.advance();
const right = this.parseExpression(precedence + 1);
left = { type: "BinaryOp", operator: op, left, right };
}
return left;
}
private getPrecedence(op: string): number {
const precedences: Record<string, number> = {
"=": 1,
"==": 2, "!=": 2,
"<": 3, ">": 3, "<=": 3, ">=": 3,
"+": 4, "-": 4,
"*": 5, "/": 5
};
return precedences[op] || 0;
}
private parsePrimary(): ASTNode {
const token = this.peek();
if (token.type === TokenType.NUMBER) {
this.advance();
return { type: "NumberLiteral", value: parseFloat(token.value) };
}
if (token.type === TokenType.IDENTIFIER) {
this.advance();
return { type: "Identifier", name: token.value };
}
if (token.type === TokenType.LPAREN) {
this.advance();
const expr = this.parseExpression();
this.expect(TokenType.RPAREN);
return expr;
}
throw new Error("Unexpected token: " + token.value);
}
parse(): ASTNode[] {
const statements: ASTNode[] = [];
while (this.peek().type !== TokenType.EOF) {
statements.push(this.parseStatement());
}
return statements;
}
private parseStatement(): ASTNode {
const token = this.peek();
if (token.type === TokenType.KEYWORD && token.value === "let") {
return this.parseAssignment();
}
const expr = this.parseExpression();
this.expect(TokenType.SEMICOLON);
return expr;
}
private parseAssignment(): Assignment {
this.expect(TokenType.KEYWORD);
const nameToken = this.expect(TokenType.IDENTIFIER);
this.advance();
const value = this.parseExpression();
this.expect(TokenType.SEMICOLON);
return {
type: "Assignment",
name: nameToken.value,
value
};
}
}
const parser = new Parser(tokens);
const ast = parser.parse();
console.log(JSON.stringify(ast, null, 2));
Phase 3: Semantic Analysis
Semantic analysis checks that the program makes sense:
class SemanticAnalyzer {
private symbolTable = new Map<string, Type>();
private currentScope: Scope;
analyze(ast: ASTNode[]): void {
for (const node of ast) {
this.analyzeNode(node);
}
}
private analyzeNode(node: ASTNode): Type {
switch (node.type) {
case "NumberLiteral":
return { kind: "number" };
case "Identifier":
const type = this.symbolTable.get(node.name);
if (!type) {
throw new Error("Undefined variable: " + node.name);
}
return type;
case "BinaryOp":
const leftType = this.analyzeNode(node.left);
const rightType = this.analyzeNode(node.right);
if (!this.typesMatch(leftType, rightType)) {
throw new Error("Type mismatch in binary operation");
}
return leftType;
case "Assignment":
const valueType = this.analyzeNode(node.value);
this.symbolTable.set(node.name, valueType);
return valueType;
default:
throw new Error("Unknown node type");
}
}
private typesMatch(a: Type, b: Type): boolean {
return a.kind === b.kind;
}
}

Phase 5: Optimization
Optimizers transform IR to faster equivalent code:
class Optimizer {
optimize(ir: TACInstruction[]): TACInstruction[] {
let optimized = ir;
let changed = true;
while (changed) {
changed = false;
const newIR = this.constantFolding(optimized);
if (newIR.length < optimized.length) {
changed = true;
optimized = newIR;
}
optimized = this.deadCodeElimination(optimized);
}
return optimized;
}
private constantFolding(ir: TACInstruction[]): TACInstruction[] {
const constants = new Map<string, number>();
const optimized: TACInstruction[] = [];
for (const instr of ir) {
if (instr.op === "assign" && typeof instr.src === "number") {
constants.set(instr.dest, instr.src);
optimized.push(instr);
} else if (["add", "sub", "mul", "div"].includes(instr.op)) {
const leftVal = constants.get(instr.left);
const rightVal = constants.get(instr.right);
if (leftVal !== undefined && rightVal !== undefined) {
let result: number;
switch (instr.op) {
case "add": result = leftVal + rightVal; break;
case "sub": result = leftVal - rightVal; break;
case "mul": result = leftVal * rightVal; break;
case "div": result = leftVal / rightVal; break;
}
constants.set(instr.dest, result);
optimized.push({ op: "assign", dest: instr.dest, src: result });
} else {
optimized.push(instr);
}
} else {
optimized.push(instr);
}
}
return optimized;
}
private deadCodeElimination(ir: TACInstruction[]): TACInstruction[] {
const used = new Set<string>();
for (const instr of ir) {
if (["add", "sub", "mul", "div"].includes(instr.op)) {
used.add(instr.left);
used.add(instr.right);
}
}
return ir.filter(instr => {
if (instr.op === "assign" && instr.dest.startsWith("t")) {
return used.has(instr.dest);
}
return true;
});
}
}

Phase 6: Code Generation
Finally, generate target machine code (here, x86-64 assembly):
class X86CodeGenerator {
private assembly: string[] = [];
private registerAllocator = new RegisterAllocator();
generate(ir: TACInstruction[]): string {
this.assembly.push("section .text");
this.assembly.push("global _start");
this.assembly.push("_start:");
for (const instr of ir) {
this.generateInstruction(instr);
}
this.assembly.push(" ; Exit");
this.assembly.push(" mov rax, 60");
this.assembly.push(" xor rdi, rdi");
this.assembly.push(" syscall");
return this.assembly.join("\n");
}
private generateInstruction(instr: TACInstruction): void {
switch (instr.op) {
case "assign":
if (typeof instr.src === "number") {
const reg = this.registerAllocator.allocate(instr.dest);
this.assembly.push(" mov " + reg + ", " + instr.src);
} else {
const srcReg = this.registerAllocator.get(instr.src);
const destReg = this.registerAllocator.allocate(instr.dest);
this.assembly.push(" mov " + destReg + ", " + srcReg);
}
break;
case "add": {
const leftReg = this.registerAllocator.get(instr.left);
const rightReg = this.registerAllocator.get(instr.right);
const destReg = this.registerAllocator.allocate(instr.dest);
this.assembly.push(" mov " + destReg + ", " + leftReg);
this.assembly.push(" add " + destReg + ", " + rightReg);
break;
}
case "mul": {
const leftReg = this.registerAllocator.get(instr.left);
const rightReg = this.registerAllocator.get(instr.right);
const destReg = this.registerAllocator.allocate(instr.dest);
this.assembly.push(" mov rax, " + leftReg);
this.assembly.push(" imul rax, " + rightReg);
this.assembly.push(" mov " + destReg + ", rax");
break;
}
}
}
}
class RegisterAllocator {
private registers = ["rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9"];
private allocation = new Map<string, string>();
private nextReg = 0;
allocate(variable: string): string {
if (this.allocation.has(variable)) {
return this.allocation.get(variable)!;
}
const reg = this.registers[this.nextReg % this.registers.length];
this.nextReg++;
this.allocation.set(variable, reg);
return reg;
}
get(variable: string): string {
if (!this.allocation.has(variable)) {
throw new Error("Variable not allocated: " + variable);
}
return this.allocation.get(variable)!;
}
}
Putting It All Together
class Compiler {
compile(source: string): string {
console.log("Lexing...");
const lexer = new Lexer(source);
const tokens = lexer.tokenize();
console.log("Parsing...");
const parser = new Parser(tokens);
const ast = parser.parse();
console.log("Semantic analysis...");
const analyzer = new SemanticAnalyzer();
analyzer.analyze(ast);
console.log("Generating IR...");
const irGen = new IRGenerator();
const ir = irGen.generate(ast);
console.log("Optimizing...");
const optimizer = new Optimizer();
const optimizedIR = optimizer.optimize(ir);
console.log("Generating assembly...");
const codeGen = new X86CodeGenerator();
const assembly = codeGen.generate(optimizedIR);
return assembly;
}
}
const compiler = new Compiler();
const program = "let x = 10 + 20 * 30;";
const assembly = compiler.compile(program);
console.log(assembly);
Output assembly:
section .text
global _start
_start:
mov rax, 20
imul rax, 30
mov rbx, rax
mov rcx, 10
mov rcx, rcx
add rcx, rbx
mov rdx, rcx
mov rax, 60
xor rdi, rdi
syscall
Conclusion
Compilers are the bridge between human thought and machine execution. Understanding them makes you a better programmer.
Key takeaways:
- Compilation is a multi-phase pipeline
- Each phase has a specific responsibility
- IR enables target-independent optimization
- Real compilers use sophisticated algorithms (graph coloring for register allocation, SSA for optimization)
Modern compilers like LLVM and GCC are marvels of engineering with millions of lines of code. But at their core, they follow the same principles we explored here.