From 3fe367675a9857036e1eeffaa93993037d501bd9 Mon Sep 17 00:00:00 2001 From: James Martin Date: Sat, 10 Sep 2022 14:58:22 -0700 Subject: [PATCH] Removed radix#int syntax, added keywords to lexer. --- docs/syntax.md | 33 ++++++---- src/ir.c | 2 +- src/lex.c | 163 ++++++++++++++++++++++++++++++++++++------------- src/lex.h | 25 +++++--- src/parse.c | 107 ++++++++++++++++---------------- 5 files changed, 216 insertions(+), 114 deletions(-) diff --git a/docs/syntax.md b/docs/syntax.md index 1ffcee5..c30c88e 100644 --- a/docs/syntax.md +++ b/docs/syntax.md @@ -3,15 +3,22 @@ The grammar is LL(1). ```ebnf -block = open-block, block-body, close-block -block-body = stmt, [{ terminator, stmt }] ; +block = open-block, block-body, close-block ; +block-body = [ stmt, [{ terminator, stmt }] ] ; stmt = assignment | expr ; assignment = var, [":", expr], "=", expr ; +match-block = open-block, match-block-body, close-block ; +match-block-body = [ match-case, [{ terminator, match-case }] ] ; +match-case = "case", pattern-vars, block | "else", block ; +matchable-block = open-block, { match-block-body | block-body }, close-block ; expr = "if", expr, block, [ "else", block ] - | "loop", [ label ], control-vars, block - | "next", [ label ] - | "exit", [ label ], expr - | "return", expr + | "match", [ label ], init-vars, match-block + | "loop", [ label ], init-vars, matchable-block + | "fn", [ label ], arg-vars, matchable-block + | "next", [ label ], expr-cont + | "exit", [ label ], expr-cont + | "rec", [ label ], expr-cont + | "return", [ label ], expr-cont (* these expressions can be used as the LHS of *) (* a function application or binary operator. 
*) | "(", expr, ")", expr-cont @@ -22,8 +29,11 @@ expr = "if", expr, block, [ "else", block ] ; (* an optional binary operator or function application *) expr-cont = [ binop, expr | expr ] ; -control-vars = [ control-var, [{ ",", control-var }] ] ; -control-var = assignment | var ; +init-vars = [ init-var, [{ ",", init-var }] ] ; +init-var = assignment | var ; +arg-vars = [{ "(", var, type-annotation, ")" }] ; +pattern-vars = ? TODO ? ; +type-annotation = ":", expr ; ``` ## Lexemes @@ -45,12 +55,12 @@ binop = "+" | "-" | "*" | "/" | "%" (* types *) | ":" | "->" ; -num = ["-"], { decimal-digit | "," }, ["#", { digit | "," }] ; +num = ["-"], [ "0b" | "0x" ], digit, [{ ",", digit | digit }] ; string = '"', [{ -('"' | newline }], '"' ; label = "'", identifier ; identifier = alpha, [{ alphanumeric | "_" }] ; -alpha = ? 'A'..'Z' | 'a'..'z' ? ; +alpha = ? 'A'..'Z' ? | ? 'a'..'z' ? ; decimal-digit = ? '0'..'9' ? ; alphanumeric = decimal-digit | alpha ; digit = alphanumeric ; @@ -58,8 +68,7 @@ newline = "\r" | "\n" ; ``` A number is a series of base 10 digits by default. -You may use a different base using the syntax `base#digits`, -e.g. `2#100101`, `16#DEADBEEF`. +You may use base 2 or base 16 via `0b100101` and `0xDEADBEEF`. ## Blocks & Terminators The rules for blocks and terminators. 
diff --git a/src/ir.c b/src/ir.c index 12d81db..512f101 100644 --- a/src/ir.c +++ b/src/ir.c @@ -102,7 +102,7 @@ void jump_table(size_t branches, label* labels, var index, var* args) { } void jump_if(label label, var cond, var* args) { - assert(0); // UNIMPLEMENTED + //assert(0); // UNIMPLEMENTED } void load_var(reg reg, var var) { diff --git a/src/lex.c b/src/lex.c index d140c93..d1b0d6d 100644 --- a/src/lex.c +++ b/src/lex.c @@ -52,7 +52,8 @@ static _Bool is_alphanumeric(char c) { } static _Bool is_id_char(char c) { - return is_alpha(c) || is_digit(c) || c == '_'; + // TODO: allow unicode identifiers + return is_alphanumeric(c) || c == '_'; } static struct token simple(enum token_type type) { @@ -67,62 +68,75 @@ static struct token op(enum operator_ op) { return tok; } -static uint8_t digit_value(uint8_t base, char c) { - assert(base <= 36); +static uint8_t digit_value(char c) { if (is_digit(c)) { return c - '0'; } if (c >= 'A' && c <= 'Z') { return c - 'A' + 0xA; } if (c >= 'a' && c <= 'z') { return c - 'a' + 0xA; } assert(0); } -static _Bool is_extended_digit(uint8_t base, char c) { +static _Bool is_digit_in(uint8_t base, char c) { if (!is_alphanumeric(c)) { return false; } - uint8_t val = digit_value(base, c); - return val < base; + return digit_value(c) < base; } static uint64_t lex_digits(uint8_t base) { - _Bool at_least_one_char = false; uint64_t acc = 0; while (true) { char c = peekc(); - if (!is_extended_digit(base, c)) { + if (!is_digit_in(base, c)) { // commas are legal digit separators - if (c == ',' && is_extended_digit(base, peekc())) { + if (c == ',' && is_digit_in(base, peekc())) { nextc(); continue; } break; } nextc(); - uint8_t digit = digit_value(base, c); + uint8_t digit = digit_value(c); // (val * base + digit) > UINT64_MAX if (acc > ((UINT64_MAX - digit) / base)) { fprintf(stderr, "lexical error: integer literal overflow\n"); exit(1); } - at_least_one_char = true; acc *= base; acc += digit; } - if (!at_least_one_char) { - fprintf(stderr, 
"lexical error: expected digits\n"); - exit(1); - } return acc; } +static struct token integer_tok(uint64_t integer) { + union token_data data; + data.int_ = integer; + struct token tok = { TOK_INTEGER, data }; + return tok; +} + static struct token lex_integer(_Bool sign) { - uint64_t acc = lex_digits(10); - if (peekc() == '#') { - if (acc != 2 || acc != 8 && acc != 10 && acc != 16) { - fprintf(stderr, "lexical error: illegal integer literal base\n"); - exit(1); - } + uint8_t base = 10; + if (peekc() == '0') { nextc(); - acc = lex_digits((uint8_t) acc); + if (peekc() == 'b') { + base = 2; + nextc(); + } else if (peekc() == 'x') { + base = 16; + nextc(); + } + if(!is_digit(peekc() && !is_id_char(peekc()))) { + return integer_tok(0); + } + } + if (!is_digit_in(base, peekc())) { + fprintf(stderr, "lexical error: expected base-%i digits\n", base); + exit(1); + } + uint64_t acc = lex_digits(10); + if (is_id_char(peekc())) { + fprintf(stderr, "lexical error: must put space between integer and following identifier\n"); + exit(1); } if (sign && acc > INT64_MAX) { fprintf(stderr, "lexical error: signed integer literal overflow\n"); @@ -132,10 +146,7 @@ static struct token lex_integer(_Bool sign) { if (sign) { val = -val; } - union token_data data; - data.int_ = val; - struct token tok = { TOK_INTEGER, data }; - return tok; + return integer_tok(val); } #define MAX_STR_LEN 4096 @@ -155,6 +166,7 @@ static char* str_buf(void) { } static char* lex_string(void) { + // TODO: string escapes, multi-line strings, no length limit on strings str_index = 0; char* buf = str_buf(); while (true) { @@ -167,7 +179,6 @@ static char* lex_string(void) { fprintf(stderr, "lexical error: unclosed string (reached end of file)\n"); exit(1); } - // TODO: string escapes, multi-line strings if (c == '"') { break; } @@ -259,6 +270,16 @@ static struct token lex(void) { struct token tok = { TOK_LABEL, data }; return tok; } + case ':': + nextc(); + while (is_indent(peekc())) { + nextc(); + } + if 
(is_newline(peekc())) { + level_is_block[indent_level + 1] = true; + return lex(); + } + return op(OP_TYPE); case '{': nextc(); return simple(TOK_OPEN_BLOCK); @@ -279,10 +300,7 @@ static struct token lex(void) { return simple(TOK_SEPARATOR); case '=': nextc(); - return op(OP_EQ); - case '+': - nextc(); - return op(OP_ADD); + return simple(TOK_EQUALS); case '-': nextc(); if (peekc() == '>') { @@ -293,6 +311,9 @@ static struct token lex(void) { return lex_integer(true); } return op(OP_SUB); + case '+': + nextc(); + return op(OP_ADD); case '*': nextc(); return op(OP_MUL); @@ -321,16 +342,6 @@ static struct token lex(void) { return op(OP_NE); } return op(OP_NOT); - case ':': - nextc(); - while (is_indent(peekc())) { - nextc(); - } - if (is_newline(peekc())) { - level_is_block[indent_level + 1] = true; - return lex(); - } - return op(OP_TYPE); case '>': nextc(); c = peekc(); @@ -363,8 +374,40 @@ static struct token lex(void) { if (is_digit(c)) { return lex_integer(false); } + char* name = lex_identifier(); + if (strcmp(name, "if") == 0) { + return simple(TOK_IF); + } + if (strcmp(name, "else") == 0) { + return simple(TOK_ELSE); + } + if (strcmp(name, "match") == 0) { + return simple(TOK_MATCH); + } + if (strcmp(name, "case") == 0) { + return simple(TOK_CASE); + } + if (strcmp(name, "loop") == 0) { + return simple(TOK_LOOP); + } + if (strcmp(name, "fn") == 0) { + return simple(TOK_FN); + } + if (strcmp(name, "next") == 0) { + return simple(TOK_NEXT); + } + if (strcmp(name, "exit") == 0) { + return simple(TOK_EXIT); + } + if (strcmp(name, "recurse") == 0) { + return simple(TOK_RECURSE); + } + if (strcmp(name, "return") == 0) { + return simple(TOK_RETURN); + } + union token_data data; - data.name = lex_identifier(); + data.name = name; struct token tok = { TOK_NAME, data }; return tok; } @@ -420,10 +463,44 @@ void print_token(struct token tok) { fprintf(stdout, ","); break; case TOK_OPERATOR: + // TODO: printing for operators fprintf(stdout, "OP:%i", tok.data.op); break; 
case TOK_EOF: - fprintf(stdout, "EOF"); + fprintf(stdout, ""); + break; + case TOK_CASE: + fprintf(stdout, "case"); + break; + case TOK_ELSE: + fprintf(stdout, "else"); + break; + case TOK_EQUALS: + fprintf(stdout, "="); + break; + case TOK_EXIT: + fprintf(stdout, "exit"); + break; + case TOK_FN: + fprintf(stdout, "fn"); + break; + case TOK_IF: + fprintf(stdout, "if"); + break; + case TOK_LOOP: + fprintf(stdout, "loop"); + break; + case TOK_NEXT: + fprintf(stdout, "next"); + break; + case TOK_RETURN: + fprintf(stdout, "return"); + break; + case TOK_RECURSE: + fprintf(stdout, "recurse"); + break; + case TOK_MATCH: + fprintf(stdout, "match"); break; } } diff --git a/src/lex.h b/src/lex.h index d1f2437..50041ba 100644 --- a/src/lex.h +++ b/src/lex.h @@ -8,15 +8,26 @@ enum token_type { TOK_EOF, // end of file TOK_NAME, // foo, bar_quux123, loop TOK_LABEL, // 'my_loop - TOK_INTEGER, // -123, 16#DEADBEEF + TOK_INTEGER, // -123, 0xDEADBEEF TOK_STRING, // "..." TOK_OPERATOR, - TOK_OPEN_GROUP, // ( - TOK_CLOSE_GROUP, // ) - TOK_OPEN_BLOCK, // { - TOK_CLOSE_BLOCK, // } - TOK_TERMINATOR, // ; - TOK_SEPARATOR, // , + TOK_OPEN_BLOCK, // `{` or `:` at the end of a line + TOK_CLOSE_BLOCK, // `}` or inferred from indentation + TOK_OPEN_GROUP, // `(` + TOK_CLOSE_GROUP, // `)` + TOK_TERMINATOR, // `;` or inferred from indentation, used to separate statements in blocks + TOK_SEPARATOR, // `,`, used to separate variables in initializers + TOK_EQUALS, // `=`, used for assignments or as an equality operator + TOK_IF, // if + TOK_ELSE, // else + TOK_MATCH, // match + TOK_CASE, // case + TOK_LOOP, // loop + TOK_FN, // fn + TOK_NEXT, // next + TOK_EXIT, // exit + TOK_RECURSE, // recurse + TOK_RETURN, // return }; enum operator_ { diff --git a/src/parse.c b/src/parse.c index 56d48d7..b1bc112 100644 --- a/src/parse.c +++ b/src/parse.c @@ -16,6 +16,8 @@ enum state { ST_BLOCK_CLOSE, ST_ASSIGN, ST_EXPR, + // HACK: The existence of this state. 
+ // Also, the entire structure of the parser is ugly. ST_EXPR_HACK, ST_EXPR_CONT, ST_EXPR_END, @@ -87,16 +89,24 @@ static enum state pop(void) { } static _Bool is_assignment(struct token tok, struct token next) { - return tok.type == TOK_NAME && next.type == TOK_OPERATOR && next.data.op == OP_EQ; - + return tok.type == TOK_NAME && next.type == TOK_EQUALS; } static _Bool is_expr(struct token tok) { - if (is_lit(tok) || tok.type == TOK_OPEN_GROUP) { - return true; - } - return tok.type == TOK_NAME; + return is_lit(tok) + || tok.type == TOK_NAME + || tok.type == TOK_OPEN_GROUP + || tok.type == TOK_IF + || tok.type == TOK_MATCH + || tok.type == TOK_FN + || tok.type == TOK_LOOP + || tok.type == TOK_NEXT + || tok.type == TOK_EXIT + || tok.type == TOK_NEXT + || tok.type == TOK_RETURN + || tok.type == TOK_RECURSE + || tok.type == TOK_MATCH; } #define syntax_error(msg) fprintf(stderr, "syntax error: %s\n", msg); exit(1) @@ -155,27 +165,23 @@ void parse(void) { push(ST_EXPR_HACK); continue; case ST_EXPR_HACK: - if (tok.type == TOK_STRING) { - push(ST_EXPR_CONT); - expr_string(tok.data.string); - break; - } - if (tok.type == TOK_INTEGER) { - push(ST_EXPR_CONT); - expr_integer(tok.data.int_); - break; - } - if (tok.type == TOK_NAME) { - char* name = tok.data.name; - if (strcmp(name, "if") == 0) { + switch (tok.type) { + case TOK_STRING: + push(ST_EXPR_CONT); + expr_string(tok.data.string); + break; + case TOK_INTEGER: + push(ST_EXPR_CONT); + expr_integer(tok.data.int_); + break; + case TOK_IF: push(ST_IF_END); push(ST_IF_ELSE); push(ST_BLOCK); push(ST_EXPR); enter_if(); break; - } - if (strcmp(name, "loop") == 0) { + case TOK_LOOP: push(ST_BLOCK); push(ST_LOOP_VARS); if (nxt.type == TOK_LABEL) { @@ -185,9 +191,8 @@ void parse(void) { enter_loop(NULL); } break; - } - if (strcmp(name, "next") == 0) { - push(ST_LOOP_VARS); + case TOK_NEXT: + push(ST_EXPR_CONT); if (nxt.type == TOK_LABEL) { next(); expr_next(nxt.data.label); @@ -195,8 +200,7 @@ void parse(void) { expr_next(NULL); 
} break; - } - if (strcmp(name, "exit") == 0) { + case TOK_EXIT: push(ST_EXPR); if (nxt.type == TOK_LABEL) { next(); @@ -205,34 +209,35 @@ void parse(void) { expr_exit(NULL); } break; - } - if (strcmp(name, "return") == 0) { + case TOK_RETURN: push(ST_EXPR); expr_return(); break; - } - push(ST_EXPR_CONT); - expr_var(tok.data.name); - break; + case TOK_NAME: + push(ST_EXPR_CONT); + expr_var(tok.data.name); + break; + case TOK_OPEN_GROUP: + push(ST_EXPR_CONT); + push(ST_GROUP); + push(ST_EXPR); + enter_group(); + break; + case TOK_OPERATOR: + if (is_unary(tok.data.op)) { + push(ST_EXPR_CONT); + push(ST_EXPR_HACK); + expr_op(tok.data.op); + break; + } + syntax_error("only unary operators allowed at beginning of expression"); + case TOK_OPEN_BLOCK: + push(ST_BLOCK); + continue; + default: + syntax_error("expected expression"); } - if (tok.type == TOK_OPEN_GROUP) { - push(ST_EXPR_CONT); - push(ST_GROUP); - push(ST_EXPR); - enter_group(); - break; - } - if (tok.type == TOK_OPERATOR && is_unary(tok.data.op)) { - push(ST_EXPR_CONT); - push(ST_EXPR_HACK); - expr_op(tok.data.op); - break; - } - if (tok.type == TOK_OPEN_BLOCK) { - push(ST_BLOCK); - continue; - } - syntax_error("expected expression"); + break; case ST_EXPR_CONT: if (is_expr(tok)) { push(ST_EXPR_HACK); @@ -255,7 +260,7 @@ void parse(void) { } syntax_error("mismatched parentheses"); case ST_IF_ELSE: - if (tok.type == TOK_NAME && strcmp(tok.data.name, "else") == 0) { + if (tok.type == TOK_ELSE) { push(ST_BLOCK); break; }