From 86fea958f4a379e352fc6edb026bef0301564624 Mon Sep 17 00:00:00 2001 From: James Martin Date: Fri, 28 Jul 2023 20:03:10 -0700 Subject: [PATCH] (Algebraic language) Fibonacci sequence works. --- Makefile | 2 +- docs/intermediate-representations.md | 111 ----- src/bytecode.c | 38 +- src/bytecode.h | 23 +- src/ir.c | 196 -------- src/ir.h | 116 ----- src/lang.c | 655 --------------------------- src/lang.h | 32 -- src/lex.c | 520 ++++----------------- src/lex.h | 95 ++-- src/main.c | 451 +++++++----------- src/parse.c | 302 ------------ src/parse.h | 6 - 13 files changed, 334 insertions(+), 2213 deletions(-) delete mode 100644 docs/intermediate-representations.md delete mode 100644 src/ir.c delete mode 100644 src/ir.h delete mode 100644 src/lang.c delete mode 100644 src/lang.h delete mode 100644 src/parse.c delete mode 100644 src/parse.h diff --git a/Makefile b/Makefile index 8e1fc0b..6bd5e26 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ SHELL = /bin/sh CFLAGS = -std=c99 -pedantic -Wextra -Os LDFLAGS = -lc -OBJECTS = bytecode.o format.o io.o main.o x86encode.o +OBJECTS = bytecode.o format.o io.o lex.o main.o x86encode.o .PHONY: passc passc: .bin $(OBJECTS) diff --git a/docs/intermediate-representations.md b/docs/intermediate-representations.md deleted file mode 100644 index 9a5df4b..0000000 --- a/docs/intermediate-representations.md +++ /dev/null @@ -1,111 +0,0 @@ -# Intermediate Representations - -## Bytecode - -### Instructions -Instructions for times: - -* `comm : a * b <=> b * a` -* `assocl : a * (b * c) => (a * b) * c` -* `assocr : (a * b) * c => a * (b * c)` -* `mapl (f : a => b) : a * c => b * c` -* `mapr (f : b => c) : a * b => a * c` -* `unitil : a => a * 1` -* `unitir : a => 1 * a` -* `unitel : a * 1 => a` -* `uniter : 1 * a => a` - -Instructions for plus: - -* `comm : a + b <=> b + a` -* `assocl : a + (b + c) => (a + b) + c` -* `assocr : (a + b) + c => a + (b + c)` -* `mapl (f : a => b) : a + c => b + c` -* `mapr (f : b => c) : a + b => a + c` -* `inl (b : type) : a => a + b` -* `inr (b : type) : b => a + b` -* `out : a + a => a` - -Distributivity: - -* `distl : a * (b + c) => (a * b) + (a * c)` -* `distr : (a + b) * c => (a * c) + (b * c)` -* `factl : (a * b) + (a * c) => a * (b + c)` -* `factr : (a * c) + (b * c) => (a + b) * c` - -Recursion: - -* `project: rec r. f(r) -> f(rec r. f(r))` -* `embed: f(rec r. f(r)) -> rec r. f(r)` - -`project` and `embed` are no-ops which exist to make type-checking easier -(i.e. isorecursive over equirecursive types). - -#### Most instructions are redundant -Most of these instructions are redundant: - -* All of the l/r variants can be implemented in terms of each other - using commutativity. -* All of the plus instructions can be implemented in terms of `map`, `in`, and `out`. -* Alternatively, we could have replaced `map` and `out` with a single instruction, - `if (f : a => c) (g : b => c) : a + b => c`. - -So "morally", there are only about 10 instructions: `comm`, `assoc`, `map`, `uniti`, `unite`, -`inl`, `inr`, `if`, `dist`, and `fact`. - -#### Most instructions are reversible -Inverses of instructions: - -* `comm` / `comm` -* `assocl` / `assocr` -* `map f` / `map f*` -* `uniti` / `unite` -* `dist` / `fact` - -The only irreversible instructions are `in` and `out`. - -#### Instructions are algebraic laws -We have a symmetric monoidal category with coproducts where `*` distributes over `+`. -This isn't quite a distributive symmetric monoidal category, because `*` isn't a product. - -Likewise, we *almost* have a distributive lattice (characterized as a meet-semilattice -with binary joins), but `*` isn't guaranteed to be idempotent. - -The reversible fragment is a wide dagger symmetric monoidal subcategory. - -#### That's really all we need -We simply don't need functions, polymorphism, or `0`. - -`0` isn't very interesting when characterized as an initial object -or as the unit for `+`; I find it's only interesting in the context of -second-order polymorphism, as `forall a. a`. - -## Finite-state 1-bit cons machine -Instructions: - -* `comm` -* `assoc` -* `factor` -* `dist` -* `map` -* `unite` -* `uniti` -* `inl` -* `inr` - -Redundant instructions: - -* `l`/`r` variants -* `out` - -There is a finite number of states, and a state transition table -which determines the next state based on the current state and -a single bit extracted using `dist`. - -## Finite-state random-access 1-bit register machine -Instructions: - -* `x <- enum(imm, y)` -* `w <- struct(x, y, z)` -* `free x` - diff --git a/src/bytecode.c b/src/bytecode.c index dd2cf69..9decce2 100644 --- a/src/bytecode.c +++ b/src/bytecode.c @@ -61,7 +61,7 @@ void assocr(void) { x86_inst_xchg_r64_m64(AX, DX); } -void distl(void) { +void distr(void) { // a, b + c // a * b + a * c @@ -74,7 +74,7 @@ void distl(void) { // Awfully convenient how that works out, huh? } -void distr(void) { +void distl(void) { // The intermediate states here are ill-typed, but ultimately everything // gets shuffled around to the right locations. @@ -96,7 +96,7 @@ void distr(void) { x86_inst_xchg_r64_rax(DX); } -void factl(void) { +void factr(void) { // a * b + a * c: // a * (b + c) @@ -107,7 +107,7 @@ void factl(void) { x86_inst_xchg_r64_m64(AX, DX); } -void factr(void) { +void factl(void) { // a * c + b * c // (a + b) * c @@ -172,23 +172,23 @@ void mapr_end(void) { x86_inst_pop_r64(AX); } -void unitil(void) { +void unitir(void) { allocate_cons(); x86_inst_xchg_r64_rax(DX); x86_inst_mov_r64_r64(DX, DI); } -void unitir(void) { +void unitil(void) { allocate_cons(); x86_inst_mov_r64_r64(AX, DI); } -void unitel(void) { +void uniter(void) { x86_inst_xchg_r64_rax(DX); free_cons(); } -void uniter(void) { +void unitel(void) { free_cons(); } @@ -377,6 +377,17 @@ void out(void) { free_cons(); } +void jump(symbol sym) { + inst_jump(sym); +} + +void jump_if(symbol a, symbol b) { + x86_inst_test_r8_r8(AX, AX); + out(); + inst_jump_if_zero(a); + inst_jump(b); +} + static void inst_load(reg dest, symbol sym) { x86_inst_lea_r64_rip_disp32_op(dest); relocate_pc32(sym); @@ -386,7 +397,7 @@ static symbol one_symbol; static symbol loop_point; static symbol exit_point; -void quit(void) { +void halt(void) { inst_jump(exit_point); } @@ -469,14 +480,13 @@ symbol init_bytecode(void) { //x86_inst_lea_r64_m64_disp8(DI, SP, -16); x86_inst_mov_r64_r64(DI, SP); x86_inst_sub_r64_imm8(DI, 16); - x86_inst_push_r64(DI); - x86_inst_push_r64(DI); + x86_inst_xor_r32_r32(R14, R14); + x86_inst_push_r64(R14); + x86_inst_push_r64(R14); - // Initial state is a unit in the left. - // (Right states will be loop states.) + // Initial state is a unit. x86_inst_mov_r64_r64(AX, DI); x86_inst_mov_r64_r64(DX, DI); - inl(); loop_point = new_symbol(); define_executable_symbol(loop_point); diff --git a/src/bytecode.h b/src/bytecode.h index efe9ba4..62b687f 100644 --- a/src/bytecode.h +++ b/src/bytecode.h @@ -10,13 +10,13 @@ void assocl(void); /// (a * b) * c => a * (b * c) void assocr(void); /// a * (b + c) => (a * b) + (a * c) -void distl(void); -/// (a + b) * c => (a * c) + (b * c) void distr(void); +/// (a + b) * c => (a * c) + (b * c) +void distl(void); /// (a * b) + (a * c) => a * (b + c) -void factl(void); -/// (a * c) + (b * c) => (a + b) * c void factr(void); +/// (a * c) + (b * c) => (a + b) * c +void factl(void); /// (a => b) => (a * c => b * c) void mapl_begin(void); void mapl_end(void); @@ -24,13 +24,13 @@ void mapl_end(void); void mapr_begin(void); void mapr_end(void); /// a => a * 1 -void unitil(void); -/// a => 1 * a void unitir(void); +/// a => 1 * a +void unitil(void); /// a * 1 => a -void unitel(void); -/// 1 * a => a void uniter(void); +/// 1 * a => a +void unitel(void); /// a + b <=> b + a void comm_plus(void); /// a + (b + c) => (a + b) + c @@ -49,9 +49,12 @@ void inl(void); void inr(void); /// a + a => a void out(void); - -void quit(void); +/// end the program +void halt(void); symbol init_bytecode(void); void finish_bytecode(void); +void jump(symbol sym); +void jump_if(symbol a, symbol b); + #endif diff --git a/src/ir.c b/src/ir.c deleted file mode 100644 index 387ebb2..0000000 --- a/src/ir.c +++ /dev/null @@ -1,196 +0,0 @@ -/// This file serves conceptually as the intermediate representation (IR) -/// of the compiler. Compared to "asm", this file is aware of stack frames, -/// control flow blocks and labels, compound types like structs and enums, -/// and register allocation. - -#include "asm.h" -#include "format.h" -#include "ir.h" -#include "x86encode.h" - -#include -#include -#include -#include -#include -#include -#include - -#define MAX_STACK_FRAMES 32 -#define MAX_LABELS 256 -#define MAX_FIXUPS 256 - -struct stack_frame { - uint32_t depth; - uint32_t label_depth; -}; - -struct label { - uint32_t frame; - uint32_t argc; - symbol symbol; -}; - -static uint32_t stack_depth = 0; -static uint32_t stack_frame = 0; -static struct stack_frame stack_frames[MAX_STACK_FRAMES]; -static uint32_t label_depth = 0; -static struct label labels[MAX_LABELS]; - -void init_ir(var* argc, var* argv, var* env) { - assert(stack_depth == 0 && stack_frame == 0); - x86_inst_mov_r64_r64(BP, SP); - x86_inst_add_r64_imm8(BP, 8 * 3); - *env = stack_depth++; - *argv = stack_depth++; - *argc = stack_depth++; -} - -void enter(void) { - assert(stack_frame < MAX_STACK_FRAMES); - printf("ENTERING: %i, %i\n", stack_depth, label_depth); - struct stack_frame frame = { .depth = stack_depth, .label_depth = label_depth }; - stack_frames[stack_frame] = frame; - stack_frame++; - // exit label - declare(0); -} - -void leave(var* args) { - assert(stack_frame > 0); - struct stack_frame frame = stack_frames[stack_frame - 1]; - stack_depth = frame.depth; - label_depth = frame.label_depth; - define(frame.label_depth, args); - stack_frame--; -} - -label declare(uint32_t argc) { - assert(label_depth < MAX_LABELS); - symbol sym = new_symbol(); - struct label label = { .frame = stack_frame, .argc = argc, .symbol = sym }; - labels[label_depth] = label; - return label_depth++; -} - -label declare_exit(uint32_t argc) { - label label = stack_frames[stack_frame - 1].label_depth; - labels[label].argc = argc; - return label; -} - -void define(label l, var* args) { - struct label* label = &labels[l]; - printf("DEFINING %i (%i)\n", l, label->argc); - define_executable_symbol(label->symbol); - assert(label->frame == stack_frame); - for (uint32_t i = 0; i < label->argc; i++) { - args[i] = stack_depth + i; - } - stack_depth += label->argc; -} - -void load_var(reg reg, var var) { - // the stack grows downward, so the bottom of the stack, BP, points to nothing; - // subtracting 8 causes it to point to the first variable, 0. - // (each variable is 8 bytes.) - x86_inst_mov_r64_m64_disp(reg, BP, -(var * 8) - 8); -} - -var push_var(reg reg) { - x86_inst_push_r64(reg); - return stack_depth++; -} - -void load_args(struct label* label, var* args) { - struct stack_frame* dest_frame = &stack_frames[label->frame - 1]; - uint32_t depth_diff = stack_depth - dest_frame->depth; - if (depth_diff > 0) { - // FIXME: should be immX!!! - x86_inst_add_r64_imm8(SP, depth_diff * 8); - } - for (uint32_t arg = 0; arg < label->argc; arg++) { - load_var(AX, args[arg]); - x86_inst_push_r64(AX); - } - stack_depth = dest_frame->depth + label->argc; -} - -void jump(label l, var* args) { - struct label* label = &labels[l]; - printf("JUMP %i (%i)\n", l, label->argc); - load_args(label, args); - inst_jump(label->symbol); -} - -void jump_table(size_t branches, label* labels, var index, var* args) { - assert(0); // UNIMPLEMENTED -} - -void jump_if(label t, label e, var cond, var* args) { - struct label* then = &labels[t]; - struct label* else_ = &labels[e]; - printf("JUMP_IF %i ELSE %i (%i)\n", t, e, then->argc); - assert(then->argc == else_->argc && then->frame == else_->frame); - load_var(BX, cond); - load_args(then, args); - inst_jump_if_not_zero(then->symbol, BX); - inst_jump(else_->symbol); -} - -var lit(uint64_t lit) { - x86_inst_mov_r64_imm(AX, lit); - x86_inst_push_r64(AX); - return stack_depth++; -} - -var lit_string(char* str) { - fprintf(stderr, "error: string literals not yet implemented\n"); - exit(1); -} - -var add(var addend1, var addend2) { - load_var(AX, addend1); - load_var(BX, addend2); - x86_inst_add_r64_r64(AX, BX); - return push_var(AX); -} - -var sub(var subtrahend, var minuend) { - // TODO: use modr/m - load_var(AX, subtrahend); - load_var(BX, minuend); - x86_inst_sub_r64_r64(AX, BX); - return push_var(AX); -} - -// Linux system call: https://blog.rchapman.org/posts/Linux_System_Call_Table_for_x86_64/ -var syscall(size_t argc, var* args) { - assert(argc > 0 && argc <= 7); - switch(argc) { - case 7: - load_var(R9, args[6]); - __attribute__((fallthrough)); - case 6: - load_var(R8, args[5]); - __attribute__ ((fallthrough)); - case 5: - load_var(R10, args[4]); - __attribute__ ((fallthrough)); - case 4: - load_var(DX, args[3]); - __attribute__ ((fallthrough)); - case 3: - load_var(SI, args[2]); - __attribute__ ((fallthrough)); - case 2: - load_var(DI, args[1]); - __attribute__ ((fallthrough)); - case 1: - // the system call number, not an argument - load_var(AX, args[0]); - } - // NOTE: syscall clobbers rcx and r11. - x86_inst_syscall(); - return push_var(AX); -} diff --git a/src/ir.h b/src/ir.h deleted file mode 100644 index 94d53a5..0000000 --- a/src/ir.h +++ /dev/null @@ -1,116 +0,0 @@ -#ifndef _IR_H -#define _IR_H - -#include -#include - -typedef uint32_t var; -typedef uint32_t label; - -struct jump_target { - label label; - var* args; -}; - -/// Declare a new label in the current scope with the provided number -/// of arguments. -/// -/// Local variables (not part of a stack frame generated by `define` or `enter`) -/// will not be in scope of the definition of the label. -label declare(uint32_t argc); - -/// Define a label and create a new scope for local variables. -/// -/// The new scope will have access to all of the variables -/// of the parent scope of the label and the label's arguments, -/// but not any local variables from previous definitions. -void define(label label, var* args); - -/// Create a new scope which encompasses all local variables defined up to this point. -/// -/// This allows nested definitions to have access to local variables. -void enter(void); - -/// Jump to label, unconditionally. Ends the continuation. -void jump(struct jump_target dest); - -/// Jump to `then` if `cond` is not zero; jump to `else` otherwise. -/// Ends the continuation. -void jump_if(struct jump_target then, struct jump_target else_, var cond); - -/// Jump to the `index`th destination. Ends the continuation. -void jump_table(uint32_t destc, struct jump_target* destinations, var index); - -/// Call this at the beginning of execution. -/// It performs initialization and stuff. -void init_ir(var* argc, var* argv, var* env); - -/// Enter a new block. -/// -/// All labels defined in this block will have access to all variables -/// which are in scope as of calling `enter`. You will be able to jump -/// to any label which is defined in this block from here -/// to the symmetric `leave`. -/// -/// This also generates a new label corresponding with the end of the block, -/// which will be automatically defined when you call `leave`. -void enter(void); - -/// Leave a block. -/// -/// This will restore the context to how it was when `enter` was called, -/// plus the return values declared by the call to `declare_continue`. -void leave(var* rets); - -/// Declare a new label in the innermost block. -/// -/// This label can only be called from the block or nested blocks. -/// This label must be called with the given number of arguments. -label declare(uint32_t argc); - -//// Declare an exit label for the surrounding block. -/// -/// Calling this label will exit the surrounding blocks. -/// The usual restrictions for labels apply. -label declare_exit(uint32_t retc); - -// Define a label in the innermost block, automatically terminating -/// any previous labels. -/// -/// All variables defined prior to the beginning of this block will be in scope. -/// The arguments associated with the label will be in scope. -/// Variables defined *after* the beginning of the block but *prior* to this label -/// will *not* be in scope. -/// -/// From this label you can jump to any label in the enclosing block -/// or any parent block. -void define(label label, var* args); - -/// Jump to label, unconditionally; never returns. -void jump(label label, var* args); - -/// Jump to `index`th label in table; never returns. -/// -/// All labels must be at the same depth and accept the same arguments. -/// `index` must not be out of bounds. -void jump_table(size_t branches, label* labels, var index, var* args); - -/// Jump to `then` if cond is not zero, `else` if cond is zero. -void jump_if(label then, label else_, var cond, var* args); - -/// Integer literal. -var lit(uint64_t lit); - -/// String literal. -var lit_string(char* str); - -/// Addition. -var add(var addend1, var addend2); - -/// Subtraction. -var sub(var subtrahend, var minuend); - -/// Perform a system call. -var syscall(size_t argc, var* args); - -#endif diff --git a/src/lang.c b/src/lang.c deleted file mode 100644 index bcc2583..0000000 --- a/src/lang.c +++ /dev/null @@ -1,655 +0,0 @@ -#include "ir.h" -#include "lang.h" - -#include -#include -#include -#include -#include - -#define MAX_CONTEXT 32 -#define MAX_ASSIGNMENTS 256 -#define MAX_ARGUMENTS 256 -#define MAX_OPERATORS 256 - -struct assignment { - char* name; - var ref; -}; - -enum block_state { - BLOCK_CLEAN, - BLOCK_ASSIGN, - BLOCK_EXPR, -}; - -struct block_crumb { - enum block_state state; - uint32_t assignment_count; - struct assignment assignments[MAX_ASSIGNMENTS]; - var final; -}; - -enum if_state { - IF_COND, - IF_THEN, - IF_ELSE, - IF_END, -}; - -struct if_crumb { - enum if_state state; - label then; - label else_; - label end; -}; - -enum loop_state { - LOOP_CLEAN, - LOOP_CVAR_INIT, - LOOP_BODY, -}; - -struct loop_crumb { - enum loop_state state; - char* label_name; - label next; - label exit; - uint32_t assignment_count; - var initializers[MAX_ASSIGNMENTS]; - struct assignment assignments[MAX_ASSIGNMENTS]; -}; - -struct expr_crumb { - uint32_t argument_count; - uint32_t operator_count; - var arguments[MAX_ARGUMENTS]; - enum operator_ operators[MAX_OPERATORS]; -}; - -struct jump_crumb { - label label; - uint32_t arity; - uint32_t argument_count; - var arguments[MAX_ARGUMENTS]; -}; - -enum crumb_type { - BLOCK_CRUMB, - IF_CRUMB, - LOOP_CRUMB, - EXPR_CRUMB, - JUMP_CRUMB, -}; - -union crumb_data { - struct block_crumb block; - struct if_crumb if_; - struct loop_crumb loop; - struct expr_crumb expr; - struct jump_crumb jump; -}; - -struct crumb { - enum crumb_type type; - union crumb_data data; -}; - -static uint32_t context_depth = 1; -static struct crumb context[MAX_CONTEXT]; - -static char* copy_str(char* str) { - unsigned long len = strlen(str); - char* new = malloc(len * sizeof(char) + 1); - memcpy(new, str, len); - new[len] = 0; - return new; -} - -static void push(struct crumb crumb) { - context[context_depth] = crumb; - context_depth++; -} - -static void push_new_block(void) { - union crumb_data data; - struct block_crumb block = { - .state = BLOCK_CLEAN, - .assignment_count = 0, - .final = (var) -1, - }; - data.block = block; - struct crumb crumb = { - .type = BLOCK_CRUMB, - .data = data, - }; - push(crumb); -} - -static void push_new_expr(void) { - struct expr_crumb exprc = { - .argument_count = 0, - .operator_count = 0, - }; - union crumb_data data; - data.expr = exprc; - struct crumb crumb = { - .type = EXPR_CRUMB, - .data = data, - }; - push(crumb); -} - -struct label_and_arity { - label label; - uint32_t arity; -}; - -static void push_new_jump(struct label_and_arity label) { - union crumb_data data; - data.jump.label = label.label; - data.jump.arity = label.arity; - data.jump.argument_count = 0; - struct crumb crumb = { - .type = JUMP_CRUMB, - .data = data, - }; - push(crumb); -} - -static void push_argument(var ref) { - struct crumb* ctx = &context[context_depth - 1]; - assert(ctx->type == EXPR_CRUMB); - struct expr_crumb* exprc = &ctx->data.expr; - if (exprc->argument_count > MAX_ARGUMENTS) { - fprintf(stderr, "error: exceeded maximum number of arguments in expression\n"); - exit(1); - } - exprc->arguments[exprc->argument_count] = ref; - exprc->argument_count++; -} - -static void push_cvar_name(char* name) { - struct crumb* ctx = &context[context_depth - 1]; - assert(ctx->type == LOOP_CRUMB); - struct loop_crumb* loopc = &ctx->data.loop; - if (loopc->assignment_count == MAX_ASSIGNMENTS) { - fprintf(stderr, "error: exceed maximum number of assignments in loop cvars\n"); - exit(1); - } - loopc->assignments[loopc->assignment_count].name = copy_str(name); -} - -static void push_cvar(var ref) { - struct crumb* ctx = &context[context_depth - 1]; - assert(ctx->type == LOOP_CRUMB); - struct loop_crumb* loopc = &ctx->data.loop; - if (loopc->assignment_count > MAX_ASSIGNMENTS) { - fprintf(stderr, "error: exceed maximum number of assignments in loop cvars\n"); - exit(1); - } - loopc->initializers[loopc->assignment_count] = ref; - loopc->assignment_count++; -} - -static var lookup_assignment( - uint32_t assignment_count, - struct assignment* assignments, - char* name -) { - for (uint32_t i = assignment_count; i > 0; i--) { - struct assignment asgn = assignments[i - 1]; - if (strcmp(asgn.name, name) == 0) { - return asgn.ref; - } - } - return (var) -1; -} - -static var lookup_var(char* name) { - for (uint32_t i = context_depth; i > 0; i--) { - struct crumb ctx = context[i - 1]; - var ref = (var) -1; - switch (ctx.type) { - case LOOP_CRUMB: - if (ctx.data.loop.state != LOOP_BODY) { - break; - } - ref = lookup_assignment( - ctx.data.loop.assignment_count, - ctx.data.loop.assignments, - name - ); - break; - case BLOCK_CRUMB: - ref = lookup_assignment( - ctx.data.block.assignment_count, - ctx.data.block.assignments, - name - ); - break; - default: - continue; - } - if (ref != (var) -1) { - return ref; - } - } - fprintf(stderr, "name resolution error: unknown variable %s\n", name); - exit(1); -} - -enum label_type { - NEXT_LABEL, - EXIT_LABEL, - RETURN_LABEL, -}; - -static const char* label_type_name(enum label_type type) { - switch (type) { - case NEXT_LABEL: - return "next"; - case EXIT_LABEL: - return "exit"; - case RETURN_LABEL: - return "return"; - } -} - -static struct label_and_arity lookup_label(enum label_type type, char* name) { - for (uint32_t i = context_depth; i > 0; i--) { - struct crumb ctx = context[i - 1]; - switch (ctx.type) { - case LOOP_CRUMB: - if (name == NULL || strcmp(name, ctx.data.loop.label_name) == 0) { - struct label_and_arity label; - if (type == NEXT_LABEL) { - label.label = ctx.data.loop.next; - label.arity = ctx.data.loop.assignment_count; - return label; - } - if (type == EXIT_LABEL) { - label.label = ctx.data.loop.exit; - label.arity = 1; - return label; - } - } - break; - default: - continue; - } - } - if (name == NULL) { - fprintf(stderr, "name resolution error: no %s label in scope\n", label_type_name(type)); - } else { - fprintf(stderr, "name resolution error: unknown label %s\n", name); - } - exit(1); -} - -static void reduce_expression_binop(struct expr_crumb* exprc, var (*emit)(var arg1, var arg2)) { - assert(exprc->argument_count >= 2); - var arg1 = exprc->arguments[0]; - var arg2 = exprc->arguments[1]; - exprc->arguments[0] = emit(arg1, arg2); - memmove(&exprc->arguments[1], &exprc->arguments[2], exprc->argument_count - 2); - exprc->argument_count--; -} - -static var reduce_expression(struct expr_crumb* exprc) { - // TODO FIXME: operator precedence - if (exprc->operator_count > 0 || exprc->argument_count > 1) { - fprintf(stderr, "warning: expression reduction may be incorrect\n"); - //exit(1); - } - for (uint32_t op_ix = 0; op_ix < exprc->operator_count; op_ix++) { - switch (exprc->operators[op_ix]) { - case OP_ADD: - reduce_expression_binop(exprc, add); - break; - case OP_SUB: - reduce_expression_binop(exprc, sub); - break; - default: - fprintf(stderr, "error: operator not implemented: %i", exprc->operators[op_ix]); - exit(1); - } - } - exprc->operator_count = 0; - assert(exprc->argument_count == 1); - return exprc->arguments[0]; -} - -void enter_block(void) { - printf("** enter_block\n"); - struct crumb* ctx = &context[context_depth - 1]; - switch (ctx->type) { - case BLOCK_CRUMB: - // we should have seen a stmt_assign or stmt_expr first, - // either of which pushes an expr crumb. - assert(0); - case EXPR_CRUMB: { - // this block is purely a scope/sequencing thing - // with no special semantics - break; - } - case IF_CRUMB: { - struct if_crumb ifc = ctx->data.if_; - switch (ifc.state) { - case IF_COND: - case IF_END: - assert(0); - case IF_THEN: - define(ifc.then, NULL); - break; - case IF_ELSE: - define(ifc.else_, NULL); - break; - } - break; - } - case LOOP_CRUMB: { - struct loop_crumb* loopc = &ctx->data.loop; - assert(loopc->state == LOOP_CLEAN); - loopc->state = LOOP_BODY; - loopc->next = declare(loopc->assignment_count); - printf("LOOP %i END %i\n", loopc->next, loopc->exit); - var args[MAX_ASSIGNMENTS]; - define(loopc->next, args); - // TODO NOTE: is this the correct order? - for (uint32_t i = 0; i < loopc->assignment_count; i++) { - loopc->assignments[i].ref = args[i]; - } - break; - } - default: - assert(0); - } - push_new_block(); -} - -void stmt_assign(char* name) { - printf("** stmt_assign\n"); - struct crumb* ctx = &context[context_depth - 1]; - assert(ctx->type == BLOCK_CRUMB); - struct block_crumb* blockc = &ctx->data.block; - assert(blockc->state == BLOCK_CLEAN); - if (blockc->assignment_count == MAX_ASSIGNMENTS) { - fprintf(stderr, "error: exceeded maximum number of assignments in block\n"); - exit(1); - } - blockc->state = BLOCK_ASSIGN; - blockc->assignments[blockc->assignment_count].name = copy_str(name); - push_new_expr(); -} - -void stmt_expr(void) { - printf("** stmt_expr\n"); - struct crumb* ctx = &context[context_depth - 1]; - assert(ctx->type == BLOCK_CRUMB); - struct block_crumb* blockc = &ctx->data.block; - assert(blockc->state == BLOCK_CLEAN); - blockc->state = BLOCK_EXPR; - push_new_expr(); -} - -void exit_block(void) { - printf("** exit_block\n"); - struct crumb* ctx = &context[context_depth - 1]; - assert(ctx->type == BLOCK_CRUMB); - struct block_crumb blockc = ctx->data.block; - assert(blockc.state == BLOCK_CLEAN); - var ret = blockc.final; - if (ret == (var) -1) { - // TODO: better way to handle empty blocks - ret = lit(0); - } - context_depth--; - ctx = &context[context_depth - 1]; - switch (ctx->type) { - case EXPR_CRUMB: { - push_argument(ret); - break; - } - case IF_CRUMB: { - struct if_crumb* ifc = &ctx->data.if_; - assert(ifc->state == IF_THEN || ifc->state == IF_ELSE); - jump(ifc->end, &ret); - if (ifc->state == IF_THEN) { - ifc->state = IF_ELSE; - } else if (ifc->state == IF_ELSE) { - ifc->state = IF_END; - } - break; - } - case LOOP_CRUMB: { - // unlike with `if`, there is no `exit_loop`, so we do clean-up here. - struct loop_crumb loopc = ctx->data.loop; - assert(loopc.state == LOOP_BODY); - jump(loopc.exit, &ret); - context_depth--; - for (uint32_t i = 0; i < loopc.assignment_count; i++) { - free(loopc.assignments[i].name); - } - leave(&ret); - push_argument(ret); - break; - } - default: - assert(0); - } - for (uint32_t i = 0; i < blockc.assignment_count; i++) { - free(blockc.assignments[i].name); - } -} - -void exit_expr(void) { - printf("** exit_expr\n"); - struct crumb* ctx = &context[context_depth - 1]; - assert(ctx->type == EXPR_CRUMB); - struct expr_crumb* exprc = &ctx->data.expr; - assert(exprc->argument_count > 0); - var ret = reduce_expression(exprc); - context_depth--; - ctx = &context[context_depth - 1]; - switch (ctx->type) { - case BLOCK_CRUMB: { - struct block_crumb* blockc = &ctx->data.block; - blockc->final = ret; - switch (blockc->state) { - case BLOCK_CLEAN: - assert(0); - case BLOCK_EXPR: - blockc->state = BLOCK_CLEAN; - break; - case BLOCK_ASSIGN: - blockc->assignments[blockc->assignment_count].ref = ret; - blockc->assignment_count++; - blockc->state = BLOCK_CLEAN; - break; - } - break; - } - case IF_CRUMB: { - struct if_crumb* ifc = &ctx->data.if_; - assert(ifc->state == IF_COND); - jump_if(ifc->then, ifc->else_, ret, NULL); - ifc->state = IF_THEN; - break; - } - case EXPR_CRUMB: - push_argument(ret); - break; - case LOOP_CRUMB: { - struct loop_crumb* loopc = &ctx->data.loop; - assert(loopc->state == LOOP_CVAR_INIT); - push_cvar(ret); - loopc->state = LOOP_CLEAN; - break; - } - case JUMP_CRUMB: { - // TODO FIXME: this is *completely wrong* for `next`! - struct jump_crumb jumpc = ctx->data.jump; - fprintf(stderr, "args: %i, arity: %i\n", jumpc.argument_count, jumpc.arity); - assert(jumpc.argument_count + 1 == jumpc.arity); - jumpc.arguments[jumpc.argument_count] = ret; - jump(jumpc.label, jumpc.arguments); - // TODO: better way to handle returning impossible value - context_depth--; - push_argument(ret); - break; - } - } -} - -void enter_if(void) { - printf("** enter_if\n"); - enter(); - label then = declare(0); - label else_ = declare(0); - label end = declare_exit(1); - printf("IF THEN %i ELSE %i EXIT %i\n", then, else_, end); - struct if_crumb ifc = { - .state = IF_COND, - .then = then, - .else_ = else_, - .end = end, - }; - union crumb_data data; - data.if_ = ifc; - struct crumb ctx = { - .type = IF_CRUMB, - .data = data, - }; - push(ctx); - push_new_expr(); -} - -void exit_if(void) { - printf("** exit_if\n"); - struct crumb ctx = context[context_depth - 1]; - assert(ctx.type == IF_CRUMB); - struct if_crumb ifc = ctx.data.if_; - switch (ifc.state) { - case IF_COND: - case IF_THEN: - assert(0); - case IF_ELSE: { - define(ifc.else_, NULL); - var ret = lit(0); - jump(ifc.end, &ret); - break; - } - case IF_END: - break; - } - var ret; - leave(&ret); - context_depth--; - push_argument(ret); -} - -void enter_loop(char* label_name) { - printf("** enter_loop\n"); - enter(); - label exit = declare_exit(1); - struct loop_crumb loopc = { - .state = LOOP_CLEAN, - .label_name = copy_str(label_name), - .assignment_count = 0, - .exit = exit - }; - union crumb_data data; - data.loop = loopc; - struct crumb ctx = { - .type = LOOP_CRUMB, - .data = data, - }; - push(ctx); -} - -void cvar_pass(char* name) { - printf("** cvar_pass\n"); - push_cvar_name(name); - push_cvar(lookup_var(name)); -} - -void cvar_init(char* name) { - printf("** cvar_init\n"); - struct crumb* ctx = &context[context_depth - 1]; - assert(ctx->type == LOOP_CRUMB); - struct loop_crumb* loopc = &ctx->data.loop; - loopc->state = LOOP_CVAR_INIT; - push_cvar_name(name); - push_new_expr(); -} - -void expr_next(char* label) { - printf("** expr_next\n"); - push_new_jump(lookup_label(NEXT_LABEL, label)); - push_new_expr(); -} - -void expr_exit(char* label) { - printf("** expr_exit\n"); - push_new_jump(lookup_label(EXIT_LABEL, label)); - push_new_expr(); -} - -void expr_return(void) { - printf("** expr_return\n"); - push_new_jump(lookup_label(RETURN_LABEL, NULL)); - push_new_expr(); -} - -void enter_group(void) { - printf("** enter_group\n"); - push_new_expr(); -} - -void exit_group(void) { - printf("** exit_group\n"); - // exit_expr is sufficient -} - -void expr_op(enum operator_ op) { - printf("** expr_op %i\n", op); - struct crumb* ctx = &context[context_depth - 1]; - assert(ctx->type == EXPR_CRUMB); - struct expr_crumb* exprc = &ctx->data.expr; - if (op == OP_JUXT && context_depth > 1) { - // HACK: should handle continuations separately from expressions - struct crumb* ctx2 = &context[context_depth - 2]; - if (ctx2->type == JUMP_CRUMB) { - struct jump_crumb* jumpc = &ctx2->data.jump; - var ret = reduce_expression(exprc); - assert(jumpc->argument_count < MAX_ARGUMENTS); - jumpc->arguments[jumpc->argument_count] = ret; - jumpc->argument_count++; - context_depth--; - push_new_expr(); - return; - } - } - if (exprc->operator_count > MAX_OPERATORS) { - fprintf(stderr, "error: exceeded maximum number of operators in expression\n"); - exit(1); - } - exprc->operators[exprc->operator_count] = op; - exprc->operator_count++; -} - -void expr_string(char* string) { - printf("** expr_string %s\n", string); - push_argument(lit_string(string)); -} - -void expr_integer(int64_t num) { - printf("** expr_integer %lli\n", num); - push_argument(lit((uint64_t) num)); -} - -void expr_var(char* var) { - printf("** expr_var %s\n", var); - push_argument(lookup_var(var)); -} diff --git a/src/lang.h b/src/lang.h deleted file mode 100644 index d493d22..0000000 --- a/src/lang.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef LANG_H -#define LANG_H - -#include "lex.h" - -void enter_block(void); -void stmt_assign(char* name); -void stmt_expr(void); -void exit_block(void); - -void exit_expr(void); - -void enter_if(void); -void exit_if(void); - -void enter_loop(char* label); -void cvar_pass(char* name); -void cvar_init(char* name); - -void expr_next(char* label); -void expr_exit(char* label); -void expr_return(void); - -void enter_group(void); -void exit_group(void); - -void expr_op(enum operator_ op); -void expr_string(char* string); -void expr_integer(int64_t num); -void expr_var(char* var); - -#endif diff --git a/src/lex.c b/src/lex.c index d1b0d6d..ee68131 100644 --- a/src/lex.c +++ b/src/lex.c @@ -7,37 +7,58 @@ #include "lex/indent.h" #include "io.h" -_Bool is_unary(enum operator_ op) { - return op == OP_SUB - || op == OP_INV - || op == OP_NOT; +static const char* const keywords[KEYWORD_COUNT] = { + "comm", + "assocl", + "assocr", + "distl", + "distr", + "factl", + "factr", + "mapl", + "mapr", + "unitil", + "unitir", + "unitel", + "uniter", + "comm+", + "assocl+", + "assocr+", + "mapl+", + "mapr+", + "inl", + "inr", + "out", + "halt", + "if", +}; + +static struct token simple(enum token_type type) { + struct token tok = { type, 0 }; + return tok; } -_Bool is_binary(enum operator_ op) { - return op == OP_EQ - || op == OP_ADD - || op == OP_SUB - || op == OP_MUL - || op == OP_DIV - || op == OP_MOD - || op == OP_AND - || op == OP_OR - || op == OP_XOR - || op == OP_SHL - || op == OP_SAR - || op == OP_SHR - || op == OP_GT - || op == OP_LT - || op == OP_GTE - || op == OP_LTE - || op == OP_NE - || op == OP_TYPE - || op == OP_FUN; +#define MAX_STR_LEN 4096 +static size_t str_index; +// alternate string buffers between tokens to prevent overwriting buffer. +// we're LL(1) so 2 buffers is sufficient. + +// NOTE: I later changed the code and it wasn't sufficient anymore, lmao. +static int which_buf = 0; +static char str_buf_1[MAX_STR_LEN]; +static char str_buf_2[MAX_STR_LEN]; +static char str_buf_3[MAX_STR_LEN]; + +static char* str_buf(void) { + which_buf = (which_buf + 1) % 3; + switch (which_buf) { + case 0: return str_buf_1; + case 1: return str_buf_2; + case 2: return str_buf_3; + } + assert(false); } -_Bool is_lit(struct token tok) { - return tok.type == TOK_INTEGER || tok.type == TOK_STRING || tok.type == TOK_NAME; -} static _Bool is_alpha(char c) { return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); @@ -56,142 +77,10 @@ static _Bool is_id_char(char c) { return is_alphanumeric(c) || c == '_'; } -static struct token simple(enum token_type type) { - struct token tok = { type, 0 }; - return tok; +_Bool is_whitespace(char c) { + return c == ' ' || c == '\t' || c == '\r' || c == '\n'; } -static struct token op(enum operator_ op) { - union token_data data; - data.op = op; - struct token tok = { TOK_OPERATOR, data }; - return tok; -} - -static uint8_t digit_value(char c) { - if (is_digit(c)) { return c - '0'; } - if (c >= 'A' && c <= 'Z') { return c - 'A' + 0xA; } - if (c >= 'a' && c <= 'z') { return c - 'a' + 0xA; } - assert(0); -} - -static _Bool is_digit_in(uint8_t base, char c) { - if (!is_alphanumeric(c)) { - return false; - } - return digit_value(c) < base; -} - -static uint64_t lex_digits(uint8_t base) { - uint64_t acc = 0; - while (true) { - char c = peekc(); - if (!is_digit_in(base, c)) { - // commas are legal digit separators - if (c == ',' && is_digit_in(base, peekc())) { - nextc(); - continue; - } - break; - } - nextc(); - uint8_t digit = digit_value(c); - // (val * base + digit) > UINT64_MAX - if (acc > ((UINT64_MAX - digit) / base)) { - fprintf(stderr, "lexical error: integer literal overflow\n"); - exit(1); - } - acc *= base; - acc += digit; - } - return acc; -} - -static struct token integer_tok(uint64_t integer) { - union token_data data; - data.int_ = integer; - struct token tok = { TOK_INTEGER, data }; - return tok; -} - -static struct token lex_integer(_Bool sign) { - uint8_t base = 10; - if (peekc() == '0') { - nextc(); - if (peekc() == 'b') { - base = 2; - nextc(); - } else if (peekc() == 'x') { - base = 16; - nextc(); - } - if(!is_digit(peekc() && !is_id_char(peekc()))) { - return integer_tok(0); - } - } - if (!is_digit_in(base, peekc())) { - fprintf(stderr, "lexical error: expected base-%i digits\n", base); - exit(1); - } - uint64_t acc = lex_digits(10); - if (is_id_char(peekc())) { - fprintf(stderr, "lexical error: must put space between integer and following identifier\n"); - exit(1); - } - if (sign && acc > INT64_MAX) { - fprintf(stderr, "lexical error: signed integer literal overflow\n"); - exit(1); - } - int64_t val = (int64_t) acc; - if (sign) { - val = -val; - } - return integer_tok(val); -} - -#define MAX_STR_LEN 4096 -static size_t str_index; -// alternate string buffers between tokens to prevent overwriting buffer. -// we're LL(1) so 2 buffers is sufficient. -static _Bool which_buf = false; -static char str_buf_1[MAX_STR_LEN]; -static char str_buf_2[MAX_STR_LEN]; - -static char* str_buf(void) { - which_buf = !which_buf; - if (which_buf) { - return str_buf_1; - } - return str_buf_2; -} - -static char* lex_string(void) { - // TODO: string escapes, multi-line strings, no length limit on strings - str_index = 0; - char* buf = str_buf(); - while (true) { - char c = nextc(); - if (str_index == MAX_STR_LEN - 1) { - fprintf(stderr, "lexical error: string too long\n"); - exit(1); - } - if (c == 0) { - fprintf(stderr, "lexical error: unclosed string (reached end of file)\n"); - exit(1); - } - if (c == '"') { - break; - } - if (c == '\n') { - fprintf(stderr, "lexical error: unclosed string (reached end of line)\n"); - exit(1); - } - buf[str_index] = c; - str_index++; - } - buf[str_index] = 0; - return buf; -} static char* lex_identifier(void) { str_index = 0; @@ -211,204 +100,53 @@ static char* lex_identifier(void) { return buf; } -static uint32_t indent_level = 0; -static uint32_t pending_level = 0; -static _Bool level_is_block[MAX_INDENTS] = {true}; -// going back to a previous indentation level. -// if we're going back, then we insert a terminator. -static _Bool going_back = false; - static struct token lex(void) { char c = peekc(); - if (is_newline(c)) { - indent_level = lex_indentation(); - if (indent_level <= pending_level) { - going_back = true; - } - } - while (indent_level > pending_level) { - pending_level++; - if (level_is_block[pending_level]) { - return simple(TOK_OPEN_BLOCK); - } - } - while (indent_level < pending_level) { - _Bool was_block = level_is_block[pending_level]; - level_is_block[pending_level] = false; - pending_level--; - if (was_block) { - return simple(TOK_CLOSE_BLOCK); - } - } - if (going_back) { - going_back = false; - if (level_is_block[indent_level]) { - return simple(TOK_TERMINATOR); - } - } - c = peekc(); - while (is_indent(c)) { - nextc(); - c = peekc(); - } - _Bool sign = false; - switch (c) { - case 0: - nextc(); - return simple(TOK_EOF); - case '"': { - nextc(); - union token_data data; - data.string = lex_string(); - struct token tok = { TOK_STRING, data }; - return tok; - } - case '\'': { - nextc(); - union token_data data; - data.label = lex_identifier(); - struct token tok = { TOK_LABEL, data }; - return tok; - } - case ':': - nextc(); - while (is_indent(peekc())) { - nextc(); - } - if (is_newline(peekc())) { - level_is_block[indent_level + 1] = true; - return lex(); - } - return op(OP_TYPE); - case '{': - nextc(); - return simple(TOK_OPEN_BLOCK); - case '}': - nextc(); - return simple(TOK_CLOSE_BLOCK); - case '(': - nextc(); - return simple(TOK_OPEN_GROUP); - case ')': - nextc(); - return simple(TOK_CLOSE_GROUP); - case ';': - nextc(); - return simple(TOK_TERMINATOR); - case ',': - nextc(); - return simple(TOK_SEPARATOR); - case '=': - nextc(); - return simple(TOK_EQUALS); - case '-': - nextc(); - if (peekc() == '>') { - nextc(); - return op(OP_FUN); - } - if (is_digit(peekc())) { - return lex_integer(true); - } - return op(OP_SUB); - case '+': - nextc(); - return op(OP_ADD); - case '*': - nextc(); - return op(OP_MUL); - case '/': - nextc(); - return op(OP_DIV); - case '%': - nextc(); - return op(OP_MOD); - case '~': - nextc(); - return op(OP_INV); - case '&': - nextc(); - return op(OP_AND); - case '|': - nextc(); - return op(OP_OR); - case '^': - nextc(); - return op(OP_XOR); - case '!': - nextc(); - if (peekc() == '=') { - nextc(); - return op(OP_NE); - } - return op(OP_NOT); - case '>': + while (true) { + // skip whitespace + while (is_whitespace(c)) { nextc(); c = peekc(); - if (c == '=') { - nextc(); - return op(OP_GTE); - } - if (c == '>') { - nextc(); - if (peekc() == '>') { - nextc(); - return op(OP_SHR); - } - return op(OP_SAR); - } - return op(OP_GT); - case '<': - nextc(); - c = peekc(); - if (c == '<') { - nextc(); - return op(OP_SHL); - } - if (c == '=') { - nextc(); - return op(OP_LTE); - } - return op(OP_LT); - } - if (is_digit(c)) { - return lex_integer(false); - } - char* name = lex_identifier(); - if (strcmp(name, "if") == 0) { - return simple(TOK_IF); - } - if (strcmp(name, "else") == 0) { - return simple(TOK_ELSE); - } - if (strcmp(name, "match") == 0) { - return simple(TOK_MATCH); - } - if (strcmp(name, "case") == 0) { - return simple(TOK_CASE); - } - if (strcmp(name, "loop") == 0) { - return simple(TOK_LOOP); - } - if (strcmp(name, "fn") == 0) { - return simple(TOK_FN); - } - if (strcmp(name, "next") == 0) { - return simple(TOK_NEXT); - } - if (strcmp(name, "exit") == 0) { - return simple(TOK_EXIT); - } - if (strcmp(name, "recurse") == 0) { - return simple(TOK_RECURSE); - } - if (strcmp(name, "return") == 0) { - return simple(TOK_RETURN); + } + // skip line comments + if (c == '!') { + do { + c = nextc(); + } while (c != '\n'); + } else { + break; + } } - union token_data data; - data.name = name; - struct token tok = { TOK_NAME, data }; + // syntax + switch (c) { + case 0: + return simple(TOK_EOF); + case '{': + nextc(); + return simple(TOK_MAP_BEGIN); + case '}': + nextc(); + return simple(TOK_MAP_END); + } + + char* name = lex_identifier(); + + // keywords + for (size_t kwd = 0; kwd < KEYWORD_COUNT; kwd++) { + if (strcmp(name, keywords[kwd]) == 0) { + return simple((enum token_type) kwd); + } + } + + enum token_type type = TOK_JUMP; + // labels + if (peekc() == ':') { + type = TOK_LABEL; + nextc(); + } + + struct token tok = { type, name }; return tok; } @@ -418,7 +156,6 @@ static struct token peek_buf; struct token next(void) { if (!init) { init = true; - indent_level = lex_indentation(); next(); } struct token tmp = peek_buf; @@ -429,78 +166,3 @@ struct token next(void) { struct token peek(void) { return peek_buf; } - -void print_token(struct token tok) { - switch (tok.type) { - case TOK_NAME: - fprintf(stdout, "%s", tok.data.name); - break; - case TOK_LABEL: - fprintf(stdout, "'%s", tok.data.label); - break; - case TOK_INTEGER: - fprintf(stdout, "%zi", tok.data.int_); - break; - case TOK_STRING: - fprintf(stdout, "\"%s\"", tok.data.string); - break; - case TOK_OPEN_GROUP: - fprintf(stdout, "("); - break; - case TOK_CLOSE_GROUP: - fprintf(stdout, ")"); - break; - case TOK_OPEN_BLOCK: - fprintf(stdout, "{"); - break; - case TOK_CLOSE_BLOCK: - fprintf(stdout, "}"); - break; - case TOK_TERMINATOR: - fprintf(stdout, ";"); - break; - case TOK_SEPARATOR: - fprintf(stdout, ","); - break; - case TOK_OPERATOR: - // TODO: printing for operators - fprintf(stdout, "OP:%i", tok.data.op); - break; - case TOK_EOF: - fprintf(stdout, ""); - break; - case TOK_CASE: - fprintf(stdout, "case"); - break; - case TOK_ELSE: - fprintf(stdout, "else"); - break; - case TOK_EQUALS: - fprintf(stdout, "="); - break; - case TOK_EXIT: - fprintf(stdout, "exit"); - break; - case TOK_FN: - fprintf(stdout, "fn"); - break; - case TOK_IF: - fprintf(stdout, "if"); - break; - case TOK_LOOP: - fprintf(stdout, "loop"); - break; - case TOK_NEXT: - fprintf(stdout, "next"); - break; - case TOK_RETURN: - fprintf(stdout, "return"); - break; - case TOK_RECURSE: - fprintf(stdout, "recurse"); - break; - case TOK_MATCH: - fprintf(stdout, "match"); - break; - } -} diff --git a/src/lex.h b/src/lex.h index 50041ba..4e20c3b 100644 --- a/src/lex.h +++ b/src/lex.h @@ -4,79 +4,44 @@ #include #include +#define KEYWORD_COUNT 23 + enum token_type { + TOK_COMM = 0, + TOK_ASSOCL = 1, + TOK_ASSOCR = 2, + TOK_DISTL = 3, + TOK_DISTR = 4, + TOK_FACTL = 5, + TOK_FACTR = 6, + TOK_MAPL = 7, + TOK_MAPR = 8, + TOK_UNITIL = 9, + TOK_UNITIR = 10, + TOK_UNITEL = 11, + TOK_UNITER = 12, + TOK_COMM_PLUS = 13, + TOK_ASSOCL_PLUS = 14, + TOK_ASSOCR_PLUS = 15, + TOK_MAPL_PLUS = 16, + TOK_MAPR_PLUS = 17, + TOK_INL = 18, + TOK_INR = 19, + TOK_OUT = 20, + TOK_HALT = 21, + TOK_IF = 22, + TOK_LABEL, + TOK_JUMP, + TOK_MAP_BEGIN, + TOK_MAP_END, TOK_EOF, // end of file - TOK_NAME, // foo, bar_quux123, loop - TOK_LABEL, // 'my_loop - TOK_INTEGER, // -123, 0xDEADBEEF - TOK_STRING, // "..." - TOK_OPERATOR, - TOK_OPEN_BLOCK, // `{` or `:` at the end of a line - TOK_CLOSE_BLOCK, // `}` or inferred from indentation - TOK_OPEN_GROUP, // `(` - TOK_CLOSE_GROUP, // `)` - TOK_TERMINATOR, // `;` or inferred from indentation, used to separate statements in blocks - TOK_SEPARATOR, // `,`, used to separate variables in initializers - TOK_EQUALS, // `=`, used for assignments or as an equality operator - TOK_IF, // if - TOK_ELSE, // else - TOK_MATCH, // match - TOK_CASE, // case - TOK_LOOP, // loop - TOK_FN, // fn - TOK_NEXT, // next - TOK_EXIT, // exit - TOK_RECURSE, // recurse - TOK_RETURN, // return -}; - -enum operator_ { - OP_EQ, // = - - OP_ADD, // + - OP_SUB, // - - OP_MUL, // * - OP_DIV, // / - OP_MOD, // % - - OP_INV, // ~ - OP_AND, // & - OP_OR, // | - OP_XOR, // ^ - OP_SHL, // << - OP_SAR, // >> - OP_SHR, // >>> - - OP_NOT, // ! - OP_GT, // > - OP_LT, // < - OP_GTE, // >= - OP_LTE, // <= - OP_NE, // != - - OP_TYPE, // : - OP_FUN, // -> - - OP_JUXT, // space! but this is not emitted by the lexer. -}; - -union token_data { - char* name; - char* label; - char* string; - int64_t int_; - enum operator_ op; }; struct token { enum token_type type; - union token_data data; + char* identifier; }; -_Bool is_unary(enum operator_ op); -_Bool is_binary(enum operator_ op); -_Bool is_lit(struct token tok); - struct token next(void); struct token peek(void); diff --git a/src/main.c b/src/main.c index d8b3f84..1a65873 100644 --- a/src/main.c +++ b/src/main.c @@ -2,310 +2,209 @@ #include #include #include +#include #include "bytecode.h" #include "format.h" #include "io.h" +#include "lex.h" #define ELF_HEADER_SIZE 0xb0 +enum map_type { + MAP_LEFT_TIMES, + MAP_RIGHT_TIMES, + MAP_LEFT_PLUS, + MAP_RIGHT_PLUS, +}; -// a + (b + (c + d)) -// (a + b) + (c + d) -// (b + a) + (c + d) -// b + (a + (c + d)) -// +static enum map_type maps[16]; +static size_t mapi = 0; -void transition_right(void) { - assocl_plus(); - mapl_plus_begin(); - out(); - inr(); - mapl_plus_end(); - assocr_plus(); +static char* label_names[2048]; +static symbol label_symbols[2048]; +static size_t labeli = 0; + +enum map_type pop(void) { + if (mapi <= 0) { + fprintf(stderr, "unmatched }\n"); + exit(1); + } + + return maps[--mapi]; } -void transition_left(void) { - out(); - inl(); +void push(enum map_type type) { + if (mapi >= 16) { + fprintf(stderr, "out of maps\n"); + exit(1); + } + + maps[mapi++] = type; } -void jump_from_to(size_t from, size_t to) { - if (from < to) { - mapl_plus_begin(); - inl(); - for(; from <= to; to--) { - inr(); - } - mapl_plus_end(); - mapr_plus_begin(); - inr(); - mapr_plus_end(); - out(); - } else if (to > from) { - for (size_t i = 0; i < from - to; i++) { - mapr_plus_end(); - mapl_plus_begin(); - inl(); - mapl_plus_end(); +symbol lookup_label(const char* name) { + for (size_t i = 0; i < labeli; i++) { + if (strcmp(label_names[i], name) == 0) { + fprintf(stderr, "%s\n", name); + return label_symbols[i]; } } -} - -void transition_into(void) { - assocl_plus(); - mapl_plus_begin(); - mapl_plus_begin(); - inl(); - mapl_plus_end(); - out(); - inr(); - mapl_plus_end(); - assocr_plus(); -} - -void transition_while(void) { - assocl_plus(); - mapl_plus_begin(); - mapr_plus_begin(); - inr(); - mapr_plus_end(); - out(); - mapl_plus_end(); - assocr_plus(); -} - -void inc(void) { - inr(); - factl(); -} - -void new_nat(void) { - // ctx - unitil(); // ctx * 1 - inl(); // ctx * 1 + ctx * 1 - factl(); // ctx * (1 + 1) -} - -void swap(void) { - assocr(); - mapr_begin(); - comm(); - mapr_end(); - assocl(); -} - -static void select_var(size_t var) { - // (... * a) * (b * (c * ...)) - for (size_t i = 0; i < var; i++) { - assocr(); - // ((... * a) * b) * (c * ...) + if (labeli >= 2048) { + fprintf(stderr, "out of labels\n"); + exit(1); } - comm(); - // (c * ...) * ((... * a) * b) - assocl(); - // ((c * ...) * (... * a)) * b + + fprintf(stderr, "%s:\n", name); + + unsigned long len = strlen(name) + 1; + label_names[labeli] = malloc(len); + memcpy(label_names[labeli], name, len); + label_symbols[labeli] = new_symbol(); + return label_symbols[labeli++]; } -static void unselect_var(size_t var) { - assocr(); - comm(); - for (size_t i = 0; i < var; i++) { - assocl(); +void nomap(void) { + if (mapi > 0) { + fprintf(stderr, "expected all maps to be closed before new label or EOF\n"); + exit(1); } } -static void case_on(size_t var) { - select_var(var); - distr(); - mapl_plus_begin(); { - unselect_var(var); - } mapl_plus_end(); - mapr_plus_begin(); { - unselect_var(var); - } mapr_plus_end(); -} - -static void snipe(size_t var) { - select_var(var); - unitel(); - comm(); - for (size_t i = 0; i < var; i++) { - assocl(); +void begin_map(enum map_type type) { + if (next().type != TOK_MAP_BEGIN) { + fprintf(stderr, "expected {\n"); + exit(1); } + push(type); } symbol compile(void) { symbol entry_point = init_bytecode(); - // This is the program we're trying to execute: - // - // fib n = fib_acc n 0 1 - // fib_acc 0 a b = a - // fib_acc (S n) a b = fib_acc n b (a + b) - // - // Looks simple, right? Well, things are a bit more complicated than that. - // - // 1. In `fib_acc 0`, we implicitly drop the value of `b`. Because we do not have - // weakening, we will have to free `b` explicitly here. - // - // fib_acc 0 a 0 = a - // fib_acc 0 a (S b) = fib_acc 0 a b - // - // 2. In `fib_acc (S n)`, we use `b` twice. We do not have contraction, so we must - // explicitly duplicate it, or implicitly duplicate it when we consume `b`. - // - // 3. We do not have addition as a built-in; we will need to define it ourselves. - // Moreover, we do not have functions, so it must be fused into the definition - // of fib_acc. - // - // -- We will duplicate `b` into the first argument (the new `a`) - // -- while adding it to the second argument (`a`, which will become the new `b`). - // fib_acc (S n) a b = fib_acc_plus n 0 a b - // fib_acc_plus n a b' 0 = fib_acc n a b' - // fib_acc_plus n a b' (S b) = fib_acc_plus n (S a) (S b') b - // - // 4. We'll have to do a lot of tedious work shuffling variables around. - // We don't even have implicit associativity, much less commutativity! - // - // We have this hierarchy of states: - // - // 1. start(1) - // 2. fib(n) - // 3. fib_acc(n, a, b) - // 4. fib_acc(0, a, b) - // 5. fib_acc_0(a b) - // - - // States: - // * start(1) - // * fib(n) - // * fib_acc(n, a, b) - // * fib_acc_Z(1, (a, b)) - // * fib_acc_Z_free(a, b) - // * fib_acc_Z_done - // * fib_acc_S - // * fib_acc_S_copy - // * fib_acc_S_copy_done - // * fib_acc_S_copy_S - - // State 0: starting state - mapl_plus_begin(); - // Initialize with integer (5). - inl(); - inr(); - inr(); - inr(); - inr(); - inr(); - mapl_plus_end(); - transition_right(); - - mapr_plus_begin(); - // State 1: fib(n); - mapl_plus_begin(); - // a = 0 - new_nat(); - // b = 1 - new_nat(); - inc(); - mapl_plus_end(); - transition_right(); - - mapr_plus_begin(); - // State 2: fib_acc(n, a, b) - mapl_plus_begin(); - // if n=1, we return the accumulated value - assocr(); - distl(); - mapl_plus_end(); - transition_right(); - - mapr_plus_begin(); - mapl_plus_begin(); - // State 3.1.1: fib_acc_Z(1, (a, b)) - mapl_plus_begin(); - uniter(); - // (a, b) - mapl_plus_end(); - transition_into(); - - mapr_plus_begin(); - // State 3.1.2.1: fib_acc_Z_free(a, b) - mapl_plus_begin(); - // n * (1 + n) - distr(); - mapl_plus_end(); - transition_while(); - - // State 3.1.2.2: fib_acc_Z_done - mapr_plus_begin(); - uniter(); - quit(); - mapr_plus_end(); - mapr_plus_end(); - mapl_plus_end(); - - mapr_plus_begin(); - // State 4: fib_acc_S - mapl_plus_begin(); - assocl(); - new_nat(); - swap(); - new_nat(); - swap(); - mapl_plus_end(); - transition_into(); - - mapr_plus_begin(); - mapl_plus_begin(); - // State 5.1: fib_acc_S_copy(n, a, b1, b2, b) - mapl_plus_begin(); - distl(); - mapl_plus_end(); - transition_into(); - mapr_plus_begin(); - mapl_plus_begin(); - // State 5.2.1: fib_acc_S_copy_done(n, a, b, b, 1) - uniter(); - // TODO: - mapl_plus_end(); - mapr_plus_begin(); - // State 5.2.2: fib_acc_S_copy_S(n, a, b1, b2, b) - mapr_plus_end(); - mapr_plus_end(); - mapl_plus_end(); - - mapr_plus_end(); - mapr_plus_end(); - mapr_plus_end(); - mapr_plus_end(); - mapr_plus_end(); - - - // State 1: fib(n) - assocl_plus(); - mapl_plus_begin(); - - // switch to state 2 - out(); - inr(); - mapl_plus_end(); - assocr_plus(); - - mapr_plus_begin(); - // State 2: fib_acc(n, a, b) - mapl_plus_begin(); - - - // State 2.1: transition to state 3 - mapl_plus_begin(); - - - mapr_plus_end(); - mapr_plus_end(); - + while (true) { + struct token tok = next(); + switch (tok.type) { + case TOK_COMM: + comm(); + break; + case TOK_ASSOCL: + assocl(); + break; + case TOK_ASSOCR: + assocr(); + break; + case TOK_DISTL: + distl(); + break; + case TOK_DISTR: + distr(); + break; + case TOK_FACTL: + factl(); + break; + case TOK_FACTR: + factr(); + break; + case TOK_MAPL: + begin_map(MAP_LEFT_TIMES); + mapl_begin(); + break; + case TOK_MAPR: + begin_map(MAP_RIGHT_TIMES); + mapr_begin(); + break; + case TOK_UNITIL: + unitil(); + break; + case TOK_UNITIR: + unitir(); + break; + case TOK_UNITEL: + unitel(); + break; + case TOK_UNITER: + uniter(); + break; + case TOK_COMM_PLUS: + comm_plus(); + break; + case TOK_ASSOCL_PLUS: + assocl_plus(); + break; + case TOK_ASSOCR_PLUS: + assocr_plus(); + break; + case TOK_MAPL_PLUS: + begin_map(MAP_LEFT_PLUS); + mapl_plus_begin(); + break; + case TOK_MAPR_PLUS: + begin_map(MAP_RIGHT_PLUS); + mapr_plus_begin(); + break; + case TOK_INL: + inl(); + break; + case TOK_INR: + inr(); + break; + case TOK_OUT: + out(); + break; + case TOK_HALT: + halt(); + break; + case TOK_LABEL: + nomap(); + define_executable_symbol(lookup_label(tok.identifier)); + break; + case TOK_JUMP: + fprintf(stderr, "!jump %s\n", tok.identifier); + nomap(); + jump(lookup_label(tok.identifier)); + break; + case TOK_MAP_BEGIN: + fprintf(stderr, "unexpected {\n"); + exit(1); + break; + case TOK_MAP_END: + ; enum map_type type = pop(); + switch (type) { + case MAP_LEFT_TIMES: + mapl_end(); + break; + case MAP_RIGHT_TIMES: + mapr_end(); + break; + case MAP_LEFT_PLUS: + mapl_plus_end(); + break; + case MAP_RIGHT_PLUS: + mapr_plus_end(); + break; + } + break; + case TOK_EOF: + goto eof; + case TOK_IF: + nomap(); + struct token a = next(); + struct token b = next(); + fprintf(stderr, "!if %s %s\n", a.identifier, b.identifier); + symbol aa = lookup_label(a.identifier); + symbol bb = lookup_label(b.identifier); + if (a.type != TOK_JUMP || b.type != TOK_JUMP) { + fprintf(stderr, "arguments to 'if' should be labels\n"); + exit(1); + } + jump_if(aa, bb); + break; + } + } +eof: + nomap(); finish_bytecode(); return entry_point; } diff --git a/src/parse.c b/src/parse.c deleted file mode 100644 index 0775b40..0000000 --- a/src/parse.c +++ /dev/null @@ -1,302 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "lang.h" -#include "lex.h" -#include "parse.h" - -enum state { - ST_BLOCK, - ST_BLOCK_BODY, - ST_BLOCK_CONT, - ST_BLOCK_CLOSE, - ST_ASSIGN, - ST_EXPR, - // HACK: The existence of this state. - // Also, the entire structure of the parser is ugly. - ST_EXPR_HACK, - ST_EXPR_CONT, - ST_EXPR_END, - ST_GROUP, - ST_IF_ELSE, - ST_IF_END, - ST_LOOP_VARS, - ST_LOOP_VARS_CONT, -}; - -const char* state_name(enum state st) { - switch (st) { - case ST_BLOCK: - return "{"; - case ST_BLOCK_BODY: - return "B"; - case ST_BLOCK_CONT: - return ";"; - case ST_BLOCK_CLOSE: - return "}"; - case ST_ASSIGN: - return "="; - case ST_EXPR: - return "x"; - case ST_EXPR_CONT: - return "c"; - case ST_GROUP: - return "("; - case ST_IF_ELSE: - return "|"; - case ST_LOOP_VARS: - return "v"; - case ST_LOOP_VARS_CONT: - return ","; - case ST_EXPR_END: - return "E"; - case ST_EXPR_HACK: - return "H"; - case ST_IF_END: - return "i"; - } -} - -#define MAX_CONTEXT 256 -static uint32_t sp = 0; -static enum state stack[MAX_CONTEXT]; - -static void debug_print(struct token tok, struct token next) { - for (uint32_t i = 0; i < sp; i++) { - printf("%s", state_name(stack[i])); - } - printf(" "); - print_token(tok); - printf(" "); - print_token(next); - printf("\n"); -} - - -static void push(enum state state) { - stack[sp] = state; - sp++; -} - -static enum state pop(void) { - assert(sp != 0); - sp--; - return stack[sp]; -} - -static _Bool is_assignment(struct token tok, struct token next) { - return tok.type == TOK_NAME && next.type == TOK_EQUALS; -} - - -static _Bool is_expr(struct token tok) { - return is_lit(tok) - || tok.type == TOK_NAME - || tok.type == TOK_OPEN_GROUP - || tok.type == TOK_IF - || tok.type == TOK_MATCH - || tok.type == TOK_FN - || tok.type == TOK_LOOP - || tok.type == TOK_NEXT - || tok.type == TOK_EXIT - || tok.type == TOK_NEXT - || tok.type == TOK_RETURN - || tok.type == TOK_RECURSE - || tok.type == TOK_MATCH; -} - -#define syntax_error(msg) fprintf(stderr, "syntax error: %s\n", msg); exit(1) - -void parse(void) { - sp = 0; - // TODO: add support for the top-level instead of this block hack - push(ST_BLOCK_BODY); - struct token tok = next(); - struct token nxt = peek(); - while (sp > 0) { - debug_print(tok, nxt); - // FIXME: stack underflow because we're faking the top-level with blocks - switch (pop()) { - case ST_BLOCK: - if (tok.type == TOK_OPEN_BLOCK) { - push(ST_BLOCK_CLOSE); - push(ST_BLOCK_BODY); - enter_block(); - break; - } - syntax_error("expected beginning of block"); - break; - case ST_BLOCK_BODY: - if (is_assignment(tok, nxt)) { - push(ST_BLOCK_CONT); - push(ST_ASSIGN); - stmt_assign(tok.data.name); - break; - } - if (is_expr(tok)) { - push(ST_BLOCK_CONT); - push(ST_EXPR); - stmt_expr(); - continue; - } - continue; - case ST_BLOCK_CONT: - if (tok.type == TOK_TERMINATOR) { - push(ST_BLOCK_BODY); - break; - } - continue; - case ST_BLOCK_CLOSE: - if (tok.type == TOK_CLOSE_BLOCK) { - exit_block(); - break; - } - syntax_error("expected end of block"); - case ST_ASSIGN: - assert(tok.type == TOK_OPERATOR || tok.data.op == OP_EQ); - push(ST_EXPR); - break; - case ST_EXPR: - push(ST_EXPR_END); - push(ST_EXPR_HACK); - continue; - case ST_EXPR_HACK: - switch (tok.type) { - case TOK_STRING: - push(ST_EXPR_CONT); - expr_string(tok.data.string); - break; - case TOK_INTEGER: - push(ST_EXPR_CONT); - expr_integer(tok.data.int_); - break; - case TOK_IF: - push(ST_IF_END); - push(ST_IF_ELSE); - push(ST_BLOCK); - push(ST_EXPR); - enter_if(); - break; - case TOK_LOOP: - push(ST_BLOCK); - push(ST_LOOP_VARS); - if (nxt.type == TOK_LABEL) { - next(); - enter_loop(nxt.data.label); - } else { - enter_loop(NULL); - } - break; - case TOK_NEXT: - push(ST_EXPR); - if (nxt.type == TOK_LABEL) { - next(); - expr_next(nxt.data.label); - } else { - expr_next(NULL); - } - break; - case TOK_EXIT: - push(ST_EXPR); - if (nxt.type == TOK_LABEL) { - next(); - expr_exit(nxt.data.label); - } else { - expr_exit(NULL); - } - break; - case TOK_RETURN: - push(ST_EXPR); - expr_return(); - break; - case TOK_NAME: - push(ST_EXPR_CONT); - expr_var(tok.data.name); - break; - case TOK_OPEN_GROUP: - push(ST_EXPR_CONT); - push(ST_GROUP); - push(ST_EXPR); - enter_group(); - break; - case TOK_OPERATOR: - if (is_unary(tok.data.op)) { - push(ST_EXPR_CONT); - push(ST_EXPR_HACK); - expr_op(tok.data.op); - break; - } - syntax_error("only unary operators allowed at beginning of expression"); - case TOK_OPEN_BLOCK: - push(ST_BLOCK); - continue; - default: - syntax_error("expected expression"); - } - break; - case ST_EXPR_CONT: - if (is_expr(tok)) { - push(ST_EXPR_HACK); - expr_op(OP_JUXT); - continue; - } - if (tok.type == TOK_OPERATOR && is_binary(tok.data.op)) { - push(ST_EXPR_HACK); - expr_op(tok.data.op); - break; - } - continue; - case ST_EXPR_END: - exit_expr(); - continue; - case ST_GROUP: - if (tok.type == TOK_CLOSE_GROUP) { - exit_group(); - break; - } - syntax_error("mismatched parentheses"); - case ST_IF_ELSE: - if (tok.type == TOK_ELSE) { - push(ST_BLOCK); - break; - } - continue; - case ST_IF_END: - exit_if(); - continue; - case ST_LOOP_VARS: - if (is_assignment(tok, nxt)) { - push(ST_LOOP_VARS_CONT); - push(ST_ASSIGN); - cvar_init(tok.data.name); - break; - } - if (tok.type == TOK_NAME) { - push(ST_LOOP_VARS_CONT); - cvar_pass(tok.data.name); - break; - } - continue; - case ST_LOOP_VARS_CONT: - if (tok.type == TOK_SEPARATOR) { - push(ST_LOOP_VARS); - break; - } - continue; - } - tok = next(); - nxt = peek(); - } - if (tok.type != TOK_EOF) { - fprintf(stderr, "syntax error: finished parsing before end of file\n"); - exit(1); - } - if (sp > 0) { - fprintf(stderr, "syntax error: unfinished business at end of file: %i, %i\n", sp, stack[0]); - exit(1); - } -} diff --git a/src/parse.h b/src/parse.h deleted file mode 100644 index 1acd6dc..0000000 --- a/src/parse.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef PARSE_H -#define PARSE_H - -void parse(void); - -#endif