From d7c0eef7ae305379755de6acafaee89203ce6211 Mon Sep 17 00:00:00 2001
From: James Martin
Date: Wed, 7 Sep 2022 20:42:37 -0700
Subject: [PATCH] Implemented parser!

Recognition only, no output. Also no top-level declarations or operator
precedence.

The syntax is LL(1). LL syntax seems necessary because our codegen requires
emitting certain code (e.g. entering control) prior to any codegen inside that
context, whereas something like LR would presumably parse the inner expression
before recognizing the control structure. There may be some way to work around
this; I don't know, I'm not a parsing expert.

Certain parts of the syntax are wonky, e.g. juxtaposition as function
application means a missing semicolon can give confusing results. I suspect
indentation-sensitive syntax would work more nicely, and intend to implement it
some time in the future.
---
 Makefile    |   2 +-
 src/io.c    |   1 +
 src/io.h    |   6 +-
 src/ir.h    |   1 +
 src/lex.c   |  97 +++++++++++++++++++-
 src/lex.h   |   7 ++
 src/main.c  |  45 +---------
 src/parse.c | 258 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/parse.h |   6 ++
 9 files changed, 373 insertions(+), 50 deletions(-)
 create mode 100644 src/parse.c
 create mode 100644 src/parse.h

diff --git a/Makefile b/Makefile
index 9584e56..729d4e7 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@ SHELL = /bin/sh
 
 CFLAGS = -std=c99 -pedantic -Wextra -Os
 LDFLAGS = -lc
-OBJECTS = asm.o io.o ir.o lex.o main.o x86encode.o
+OBJECTS = asm.o io.o ir.o lex.o main.o parse.o x86encode.o
 
 .PHONY: passc
 passc: .bin $(OBJECTS)
diff --git a/src/io.c b/src/io.c
index aaea749..6c946f9 100644
--- a/src/io.c
+++ b/src/io.c
@@ -2,6 +2,7 @@
 
 #include
 #include
+#include
 #include
 
 #ifdef __unix__
diff --git a/src/io.h b/src/io.h
index b981f65..f865434 100644
--- a/src/io.h
+++ b/src/io.h
@@ -1,8 +1,8 @@
-#ifndef _IO_H
-#define _IO_H
+#ifndef IO_H
+#define IO_H
 
+#include
 #include
-#include
 
 extern uint32_t here;
 
diff --git a/src/ir.h b/src/ir.h
index cd61831..9aec2de 100644
--- a/src/ir.h
+++ b/src/ir.h
@@ -28,6 +28,7 @@ label enter(uint32_t retc);
 /// plus the return values declared by the call to `enter`.
 void leave(var* args);
 
+label declare_continue(uint32_t retc);
 
 /// Declare a new label in the innermost block.
 ///
diff --git a/src/lex.c b/src/lex.c
index 18f835c..5c294c6 100644
--- a/src/lex.c
+++ b/src/lex.c
@@ -1,12 +1,48 @@
 #include
 #include
 #include
+#include
 #include
-#include
 
+#include "lex.h"
 #include "io.h"
 
+/// True when `op` is one of the unary operators: OP_SUB, OP_INV or OP_NOT.
+_Bool is_unary(enum operator_ op) {
+    return op == OP_SUB
+        || op == OP_INV
+        || op == OP_NOT;
+}
+
+/// True when `op` is accepted as a binary operator (OP_SUB is also unary).
+_Bool is_binary(enum operator_ op) {
+    return op == OP_EQ
+        || op == OP_ADD
+        || op == OP_SUB
+        || op == OP_MUL
+        || op == OP_DIV
+        || op == OP_MOD
+        || op == OP_AND
+        || op == OP_OR
+        || op == OP_XOR
+        || op == OP_SHL
+        || op == OP_SAR
+        || op == OP_SHR
+        || op == OP_GT
+        || op == OP_LT
+        || op == OP_GTE
+        || op == OP_LTE
+        || op == OP_NE
+        || op == OP_TYPE
+        || op == OP_FUN;
+}
+
+/// True for integer, string and name tokens.
+_Bool is_lit(struct token tok) {
+    return tok.type == TOK_INTEGER || tok.type == TOK_STRING || tok.type == TOK_NAME;
+}
+
 static _Bool is_whitespace(char c) {
     return c == ' ' || c == '\t' || c == '\r' || c == '\n';
 }
 
@@ -153,6 +189,19 @@ static struct token lex_integer(void) {
 static size_t str_index;
 static char str_buf[MAX_STR_LEN];
 
+/// Copy the first `str_index` bytes of `str_buf` into a new NUL-terminated heap string.
+static char* leak_buf(void) {
+    // FIXME: memory leak
+    char* str = malloc(str_index + 1);
+    if (str == NULL) {
+        fprintf(stderr, "error: out of memory\n");
+        exit(1);
+    }
+    memcpy(str, str_buf, str_index);
+    str[str_index] = 0;
+    return str;
+}
+
 static char* lex_string(void) {
     char* buf;
     size_t len = 1;
@@ -185,7 +234,7 @@ static char* lex_string(void) {
         skip(1);
     }
     str_buf[str_index] = 0;
-    return str_buf;
+    return leak_buf();
 }
 
 static char* lex_identifier(void) {
@@ -208,7 +257,7 @@ static char* lex_identifier(void) {
         exit(1);
     }
     str_buf[str_index] = 0;
-    return str_buf;
+    return leak_buf();
 }
 
 struct token lex(void) {
@@ -348,3 +397,45 @@ struct token lex(void) {
     struct token tok = { TOK_NAME, data };
     return tok;
 }
+
+/// Write a human-readable form of `tok` to stdout, without a trailing newline.
+void print_token(struct token tok) {
+    switch (tok.type) {
+    case TOK_NAME:
+        fprintf(stdout, "%s", tok.data.name);
+        break;
+    case TOK_LABEL:
+        fprintf(stdout, "'%s", tok.data.label);
+        break;
+    case TOK_INTEGER:
+        fprintf(stdout, "%zi", tok.data.int_);
+        break;
+    case TOK_STRING:
+        fprintf(stdout, "\"%s\"", tok.data.string);
+        break;
+    case TOK_OPEN_GROUP:
+        fprintf(stdout, "(");
+        break;
+    case TOK_CLOSE_GROUP:
+        fprintf(stdout, ")");
+        break;
+    case TOK_OPEN_BLOCK:
+        fprintf(stdout, "{");
+        break;
+    case TOK_CLOSE_BLOCK:
+        fprintf(stdout, "}");
+        break;
+    case TOK_TERMINATOR:
+        fprintf(stdout, ";");
+        break;
+    case TOK_SEPARATOR:
+        fprintf(stdout, ",");
+        break;
+    case TOK_OPERATOR:
+        fprintf(stdout, "OP: %i", tok.data.op);
+        break;
+    case TOK_EOF:
+        fprintf(stdout, "EOF");
+        break;
+    }
+}
diff --git a/src/lex.h b/src/lex.h
index efd7ec6..b981382 100644
--- a/src/lex.h
+++ b/src/lex.h
@@ -1,6 +1,7 @@
 #ifndef LEX_H
 #define LEX_H
 
+#include
 #include
 
 enum token_type {
@@ -59,6 +60,12 @@ struct token {
     union token_data data;
 };
 
+_Bool is_unary(enum operator_ op);
+_Bool is_binary(enum operator_ op);
+_Bool is_lit(struct token tok);
+
 struct token lex(void);
 
+void print_token(struct token tok);
+
 #endif
diff --git a/src/main.c b/src/main.c
index 148fa7e..90b81d0 100644
--- a/src/main.c
+++ b/src/main.c
@@ -9,7 +9,7 @@
 
 #include "io.h"
 #include "ir.h"
-#include "lex.h"
+#include "parse.h"
 
 #define ELF_HEADER_SIZE 0xb0
 
@@ -76,48 +76,7 @@ int main(int argc, char** argv) {
     }
     open_files(argv[2], argv[1]);
 
-    struct token tok;
-    do {
-        tok = lex();
-        switch (tok.type) {
-        case TOK_NAME:
-            fprintf(stdout, "%s\n", tok.data.name);
-            break;
-        case TOK_LABEL:
-            fprintf(stdout, "'%s\n", tok.data.label);
-            break;
-        case TOK_INTEGER:
-            fprintf(stdout, "%zi\n", tok.data.int_);
-            break;
-        case TOK_STRING:
-            fprintf(stdout, "\"%s\"\n", tok.data.string);
-            break;
-        case TOK_OPEN_GROUP:
-            fprintf(stdout, "(\n");
-            break;
-        case TOK_CLOSE_GROUP:
-            fprintf(stdout, ")\n");
-            break;
-        case TOK_OPEN_BLOCK:
-            fprintf(stdout, "{\n");
-            break;
-        case TOK_CLOSE_BLOCK:
-            fprintf(stdout, "}\n");
-            break;
-        case TOK_TERMINATOR:
-            fprintf(stdout, ";\n");
-            break;
-        case TOK_SEPARATOR:
-            fprintf(stdout, ",\n");
-            break;
-        case TOK_OPERATOR:
-            fprintf(stdout, "OP: %i\n", tok.data.op);
-            break;
-        case TOK_EOF:
-            fprintf(stdout, "EOF\n");
-            break;
-        }
-    } while (tok.type != TOK_EOF);
+    parse();
 
     reserve(ELF_HEADER_SIZE);
     size_t entry_point = compile();
diff --git a/src/parse.c b/src/parse.c
new file mode 100644
index 0000000..4d3cd7f
--- /dev/null
+++ b/src/parse.c
@@ -0,0 +1,258 @@
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "lex.h"
+#include "parse.h"
+
+/// States the parser can still be waiting to match; see `parse`.
+enum state {
+    ST_BLOCK_OPEN,
+    ST_BLOCK,
+    ST_ASSIGN,
+    ST_EXPR,
+    ST_EXPR_CONT,
+    ST_GROUP,
+    ST_IF_ELSE,
+    ST_LOOP_VARS,
+    ST_LOOP_VARS_CONT,
+    ST_LABEL,
+};
+
+/// Short mnemonic for a state, as printed by `debug_print`.
+const char* state_name(enum state st) {
+    switch (st) {
+    case ST_BLOCK_OPEN:
+        return "{";
+    case ST_BLOCK:
+        return ";";
+    case ST_ASSIGN:
+        return "=";
+    case ST_EXPR:
+        return "x";
+    case ST_EXPR_CONT:
+        return "c";
+    case ST_GROUP:
+        return "(";
+    case ST_IF_ELSE:
+        return "|";
+    case ST_LOOP_VARS:
+        return "v";
+    case ST_LOOP_VARS_CONT:
+        return ",";
+    case ST_LABEL:
+        return "'";
+    }
+    // Only reached for a value outside `enum state`; never fall off the end.
+    return "?";
+}
+
+#define MAX_CONTEXT 256
+static uint32_t sp = 0;
+static enum state stack[MAX_CONTEXT];
+
+/// Print the pending states (bottom of stack first), then `tok` and `next`.
+static void debug_print(struct token tok, struct token next) {
+    for (uint32_t i = 0; i < sp; i++) {
+        printf("%s", state_name(stack[i]));
+    }
+    printf(" ");
+    print_token(tok);
+    printf(" ");
+    print_token(next);
+    printf("\n");
+}
+
+
+/// Push a pending state, exiting with an error once MAX_CONTEXT is exceeded.
+static void push(enum state state) {
+    if (sp >= MAX_CONTEXT) {
+        fprintf(stderr, "syntax error: nesting too deep\n");
+        exit(1);
+    }
+    stack[sp] = state;
+    sp++;
+}
+
+/// Pop and return the most recently pushed state.
+static enum state pop(void) {
+    assert(sp != 0);
+    sp--;
+    return stack[sp];
+}
+
+/// True when `tok` is a name and `next` is the `OP_EQ` operator.
+static _Bool is_assignment(struct token tok, struct token next) {
+    return tok.type == TOK_NAME && next.type == TOK_OPERATOR && next.data.op == OP_EQ;
+}
+
+
+/// True when `tok` can start an expression: a literal, a name or `(`.
+static _Bool is_expr(struct token tok) {
+    if (is_lit(tok) || tok.type == TOK_OPEN_GROUP) {
+        return true;
+    }
+    return tok.type == TOK_NAME;
+}
+
+// do/while makes the macro a single statement, safe in an unbraced `if`.
+#define syntax_error(msg) do { fprintf(stderr, "syntax error: %s\n", msg); exit(1); } while (0)
+
+/// Recognize the token stream returned by `lex`; nothing is emitted yet.
+/// Exits with status 1 and a message on stderr when a syntax error is found.
+///
+/// `stack` holds the states still to be matched, `tok` is the current token
+/// and `next` is one token of lookahead. Inside the switch, `break` consumes
+/// `tok`, while `continue` re-examines the same token in the next state.
+void parse(void) {
+    // TODO: add support for the top-level instead of this block hack
+    push(ST_BLOCK);
+    struct token tok = lex();
+    struct token next = lex();
+    while (sp > 0) {
+        debug_print(tok, next);
+        // FIXME: stack underflow because we're faking the top-level with blocks
+        switch (pop()) {
+        case ST_BLOCK_OPEN:
+            if (tok.type != TOK_OPEN_BLOCK) {
+                syntax_error("expected open block (`{`)");
+            }
+            push(ST_BLOCK);
+            break;
+        case ST_BLOCK:
+            if (tok.type == TOK_CLOSE_BLOCK) {
+                break;
+            }
+            if (tok.type == TOK_TERMINATOR) {
+                push(ST_BLOCK);
+                break;
+            }
+            if (is_assignment(tok, next)) {
+                push(ST_BLOCK);
+                push(ST_ASSIGN);
+                break;
+            }
+            if (is_expr(tok)) {
+                push(ST_BLOCK);
+                push(ST_EXPR);
+                continue;
+            }
+            // Any other token is consumed here and ends the block without re-pushing it.
+            break;
+        case ST_ASSIGN:
+            // ST_ASSIGN is only pushed after is_assignment() matched, so `tok` is the operator.
+            assert(tok.type == TOK_OPERATOR && tok.data.op == OP_EQ);
+            push(ST_EXPR);
+            break;
+        case ST_EXPR:
+            push(ST_EXPR_CONT);
+            if (tok.type == TOK_STRING) {
+                break;
+            }
+            if (tok.type == TOK_INTEGER) {
+                break;
+            }
+            if (tok.type == TOK_NAME) {
+                char* name = tok.data.name;
+                if (strcmp(name, "if") == 0) {
+                    push(ST_IF_ELSE);
+                    push(ST_BLOCK_OPEN);
+                    push(ST_EXPR);
+                    break;
+                }
+                if (strcmp(name, "loop") == 0) {
+                    push(ST_BLOCK_OPEN);
+                    push(ST_LOOP_VARS);
+                    if (next.type == TOK_LABEL) {
+                        push(ST_LABEL);
+                    }
+                    break;
+                }
+                if (strcmp(name, "next") == 0) {
+                    push(ST_LOOP_VARS);
+                    if (next.type == TOK_LABEL) {
+                        push(ST_LABEL);
+                    }
+                    break;
+                }
+                if (strcmp(name, "exit") == 0) {
+                    push(ST_EXPR);
+                    if (next.type == TOK_LABEL) {
+                        push(ST_LABEL);
+                    }
+                    break;
+                }
+                if (strcmp(name, "return") == 0) {
+                    push(ST_EXPR);
+                    break;
+                }
+                break;
+            }
+            if (tok.type == TOK_OPEN_GROUP) {
+                push(ST_GROUP);
+                push(ST_EXPR);
+                break;
+            }
+            if (tok.type == TOK_OPERATOR && is_unary(tok.data.op)) {
+                push(ST_EXPR);
+                break;
+            }
+            syntax_error("expected expression");
+        case ST_EXPR_CONT:
+            if (is_expr(tok)) {
+                push(ST_EXPR);
+                continue;
+            }
+            if (tok.type == TOK_OPERATOR && is_binary(tok.data.op)) {
+                push(ST_EXPR);
+                break;
+            }
+            continue;
+        case ST_GROUP:
+            if (tok.type == TOK_CLOSE_GROUP) {
+                break;
+            }
+            syntax_error("mismatched parentheses");
+        case ST_IF_ELSE:
+            if (tok.type == TOK_NAME && strcmp(tok.data.name, "else") == 0) {
+                push(ST_BLOCK_OPEN);
+                break;
+            }
+            continue;
+        case ST_LOOP_VARS:
+            if (is_assignment(tok, next)) {
+                push(ST_LOOP_VARS_CONT);
+                push(ST_ASSIGN);
+                break;
+            }
+            if (tok.type == TOK_NAME) {
+                push(ST_LOOP_VARS_CONT);
+                break;
+            }
+            continue;
+        case ST_LOOP_VARS_CONT:
+            if (tok.type == TOK_SEPARATOR) {
+                push(ST_LOOP_VARS);
+                break;
+            }
+            continue;
+        case ST_LABEL:
+            assert(tok.type == TOK_LABEL);
+            break;
+        }
+        tok = next;
+        next = lex();
+    }
+    if (tok.type != TOK_EOF) {
+        fprintf(stderr, "syntax error: finished parsing before end of file\n");
+        exit(1);
+    }
+    // Defensive only: the loop above exits solely when the stack is empty.
+    if (sp > 0) {
+        fprintf(stderr, "syntax error: unfinished business at end of file: %u, %i\n", (unsigned)sp, (int)stack[0]);
+        exit(1);
+    }
+}
diff --git a/src/parse.h b/src/parse.h
new file mode 100644
index 0000000..1acd6dc
--- /dev/null
+++ b/src/parse.h
@@ -0,0 +1,6 @@
+#ifndef PARSE_H
+#define PARSE_H
+
+void parse(void);
+
+#endif