Greatly simplify lexer thanks to new knowledge of lookahead.

Now I know that the parser is LL(1) and that the lexer also needs only one character of lookahead, which allows me to dramatically simplify the interface for IO input and improve the interface to the lexer. Even if I did want unbounded peek, I'd want the interface to be `peek(off)`, not that awful buffer. I intend to use the new lexer interface to make the parser states more stateful, so that a single state can potentially read multiple tokens in a row. Then, states would only be needed for recursive structures, without awkward intermediate states like ST_LABEL, which exists only to let me burn a token. I also removed the nonsense related to base 64 parsing, because it was unclear how to handle it, and the advantages of having it weren't clear. I kept support for bases up to 36, but honestly I might want to consider getting rid of everything but decimal, hex, and binary anyway. I'm not sure whether I'd want to keep using the current syntax for the radix either.
2022-09-07 23:02:15 -07:00 · 2022-09-07 23:02:15 -07:00 · bce39fdc22
parent d7c0eef7ae
commit bce39fdc22
5 changed files with 173 additions and 222 deletions
--- a/src/io.c
+++ b/src/io.c
@ -1,6 +1,8 @@
 #include "io.h"

+#include <assert.h>
 #include <errno.h>
+#include <stdbool.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
@ -104,38 +106,39 @@ void patch_i32(size_t off, int32_t x) {
    patch_u32(off, (uint32_t) x);
 }

-#define MAX_LOOKAHEAD 4
-static size_t read_buf_len = 0;
-static char read_buf[MAX_LOOKAHEAD];
+static char peek_buf;
+static _Bool peeked;

-char* peek(size_t* len) {
-    if (*len >= MAX_LOOKAHEAD) {
-        fprintf(stderr, "syntax error: maximum lookahead exceeded\n");
-        exit(1);
-    }
-    if (*len >= read_buf_len) {
-        size_t inc = fread(read_buf + read_buf_len, 1, *len - read_buf_len, infile);
+static char next_(void) {
+    int c = getc(infile);
+    if (c == EOF) {
        if (ferror(infile)) {
            fprintf(stderr, "failed to read source file: %s\n", strerror(errno));
            exit(1);
        }
-        read_buf_len += inc;
-        *len = read_buf_len;
+        return 0;
    }
-    return read_buf;
+    return c;
 }

-void skip(size_t off) {
-    if (read_buf_len > off) {
-        memmove(read_buf, &read_buf[off], read_buf_len - off);
-        read_buf_len -= off;
-    } else {
-        if (read_buf_len < off) {
-            if (fseek(infile, off - read_buf_len, SEEK_CUR) != 0) {
-                fprintf(stderr, "failed to seek in source file: %s\n", strerror(errno));
-                exit(1);
-            }
-        }
-        read_buf_len = 0;
+char nextc(void) {
+    if (peeked) {
+        peeked = false;
+        return peek_buf;
    }
+    return next_();
+}
+
+void unnextc(char c) {
+    assert(!peeked);
+    peek_buf = c;
+    peeked = true;
+}
+
+char peekc(void) {
+    if (!peeked) {
+        peek_buf = next_();
+        peeked = true;
+    }
+    return peek_buf;
 }
--- a/src/io.h
+++ b/src/io.h
@ -19,7 +19,8 @@ void patch(size_t off, const void* ptr, size_t count);
 void patch_u32(size_t off, uint32_t x);
 void patch_i32(size_t off, int32_t x);

-char* peek(size_t* len);
-void skip(size_t off);
+char nextc(void);
+void unnextc(char c);
+char peekc(void);

 #endif
--- a/src/lex.c
+++ b/src/lex.c
@ -52,10 +52,6 @@ static _Bool is_digit(char c) {
    return c >= '0' && c <= '9';
 }

-static _Bool begins_integer(char c) {
-    return is_digit(c) || c == '-';
-}
-
 static _Bool id_char(char c) {
    return is_alpha(c) || is_digit(c) || c == '_';
 }
@ -73,72 +69,41 @@ static struct token op(enum operator_ op) {
 }

 static uint8_t digit_value(uint8_t base, char c) {
-    // TODO: sort out this mess
-
-    // restrict bases to avoid having to make decisions about how to handle
-    // upper vs. lower and base64. (letters before digits? seriously?)
-    if (base != 2 && base != 4 && base != 8 && base != 10 && base != 16) {
-        fprintf(stderr, "lexical error: illegal integer base (for now)\n");
-        exit(1);
-    }
-    // who invented this???? why can't 0 be 0? screw you.
-    if (base == 64) {
-        if (is_digit(c)) return c - '0' + 52;
-        if (c >= 'A' && c <= 'Z') return c - 'A';
-        if (c >= 'a' && c <= 'Z') return c - 'a' + 26;
-        if (c == '+') return 62;
-        // c == '/'
-        return 63;
-    }
-    if (is_digit(c)) return c - '0';
-    if (c >= 'A' && c <= 'Z') return c - 'A' + 10;
-    if (c >= 'a' && c <= 'z') {
-        if (base > 36) {
-            return c - 'a' + 36;
-        }
-        return c - 'a' + 10;
-    }
-    if (c == '+') return 62;
-    // c == '/'
-    return 63;
+    assert(base <= 36);
+    if (is_digit(c)) { return c - '0'; }
+    if (c >= 'A' && c <= 'Z') { return c - 'A' + 0xA; }
+    if (c >= 'a' && c <= 'z') { return c - 'a' + 0xA; }
+    assert(0);
 }

 static _Bool is_extended_digit(uint8_t base, char c) {
-    if (!is_digit(c) && !is_alpha(c) && c != '+' && c != '/')
+    if (!is_digit(c) && !is_alpha(c)) {
        return false;
+    }
    uint8_t val = digit_value(base, c);
-    if (val > base)
-        return false;
-    return true;
+    return val < base;
 }

 static uint64_t lex_digits(uint8_t base) {
-    uint64_t acc = 0;
-    char* buf;
-    size_t len;
    _Bool at_least_one_char = false;
+    uint64_t acc = 0;
    while (true) {
-        len = 1;
-        buf = peek(&len);
-        if (!is_extended_digit(base, buf[0])) {
+        char c = peekc();
+        if (!is_extended_digit(base, c)) {
            // commas are legal digit separators
-            if (buf[0] == ',') {
-                len = 2;
-                buf = peek(&len);
-                if (len == 2 && is_extended_digit(base, buf[1])) {
-                    skip(1);
-                    continue;
-                }
+            if (c == ',' && is_extended_digit(base, peekc())) {
+                nextc();
+                continue;
            }
            break;
        }
-        uint8_t digit = digit_value(base, buf[0]);
+        nextc();
+        uint8_t digit = digit_value(base, c);
        // (val * base + digit) > UINT64_MAX
        if (acc > ((UINT64_MAX - digit) / base)) {
            fprintf(stderr, "lexical error: integer literal overflow\n");
            exit(1);
        }
-        skip(1);
        at_least_one_char = true;
        acc *= base;
        acc += digit;
@ -150,32 +115,28 @@ static uint64_t lex_digits(uint8_t base) {
    return acc;
 }

-static struct token lex_integer(void) {
-    _Bool sign = false;
-    char* buf;
-    size_t len = 1;
-    buf = peek(&len);
-    assert(len > 0 && begins_integer(buf[0]));
-    if (buf[0] == '-') {
-        sign = true;
-        skip(1);
-    }
+static struct token lex_integer(_Bool sign) {
    uint64_t acc = lex_digits(10);
-    len = 1;
-    buf = peek(&len);
-    if (len == 1 && buf[0] == '#') {
-        if (acc > 64) {
+    if (peekc() == '#') {
+        if (acc < 2) {
+            fprintf(stderr, "lexical error: integer literal base too small\n");
+            exit(1);
+        }
+        if (acc > 36) {
            fprintf(stderr, "lexical error: integer literal base too large\n");
            exit(1);
        }
-        skip(1);
+        nextc();
        acc = lex_digits((uint8_t) acc);
    }
    if (sign && acc > INT64_MAX) {
        fprintf(stderr, "lexical error: integer literal overflow due to sign\n");
        exit(1);
    }
-    int64_t val = sign ? -(int64_t) acc : (int64_t) acc;
+    int64_t val = (int64_t) acc;
+    if (sign) {
+        val = -val;
+    }
    union token_data data;
    data.int_ = val;
    struct token tok = { TOK_INTEGER, data };
@ -184,212 +145,196 @@ static struct token lex_integer(void) {

 #define MAX_STR_LEN 4096
 static size_t str_index;
-static char str_buf[MAX_STR_LEN];
+// alternate between two string buffers on successive string/identifier tokens,
+// so lexing one does not overwrite the previous one's text. we're LL(1), so 2 buffers is sufficient.
+static _Bool which_buf = false;
+static char str_buf_1[MAX_STR_LEN];
+static char str_buf_2[MAX_STR_LEN];

-static char* leak_buf(void) {
-    // FIXME: memory leak
-    char* str = malloc(str_index + 1);
-    memcpy(str, str_buf, str_index);
-    str[str_index] = 0;
-    return str;
+static char* str_buf(void) {
+    which_buf = !which_buf;
+    if (which_buf) {
+        return str_buf_1;
+    }
+    return str_buf_2;
 }

 static char* lex_string(void) {
-    char* buf;
-    size_t len = 1;
-    buf = peek(&len);
-    assert(len == 1 && buf[0] == '"');
-    skip(1);
    str_index = 0;
+    char* buf = str_buf();
    while (true) {
+        char c = nextc();
        if (str_index == MAX_STR_LEN - 1) {
            fprintf(stderr, "lexical error: string too long\n");
            exit(1);
        }
-        len = 1;
-        buf = peek(&len);
-        if (len < 1) {
+        if (c == 0) {
            fprintf(stderr, "lexical error: unclosed string (reached end of file)\n");
            exit(1);
        }
        // TODO: string escapes, multi-line strings
-        if (buf[0] == '"') {
-            skip(1);
+        if (c == '"') {
            break;
        }
-        if (buf[0] == '\n') {
+        if (c == '\n') {
            fprintf(stderr, "lexical error: unclosed string (reached end of line)\n");
            exit(1);
        }
-        str_buf[str_index] = buf[0];
+        buf[str_index] = c;
        str_index++;
-        skip(1);
    }
-    str_buf[str_index] = 0;
-    return leak_buf();
+    buf[str_index] = 0;
+    return buf;
 }

 static char* lex_identifier(void) {
-    char* buf;
-    size_t len;
-    char c;
    str_index = 0;
+    char* buf = str_buf();
    while (true) {
-        len = 1;
-        buf = peek(&len);
-        if (len == 0) break;
-        c = buf[0];
+        char c = peekc();
        if (!is_alpha(c) && !is_digit(c) && c != '_') break;
-        skip(1);
-        str_buf[str_index] = c;
+        nextc();
+        buf[str_index] = c;
        str_index++;
    }
    if (str_index == 0) {
        fprintf(stderr, "lexical error: expected identifier (possibly illegal character?)\n");
        exit(1);
    }
-    str_buf[str_index] = 0;
-    return leak_buf();
+    buf[str_index] = 0;
+    return buf;
 }

-struct token lex(void) {
-    char* buf;
-    size_t len;
-    len = 1;
-    buf = peek(&len);
-    if (len < 1) {
-        return simple(TOK_EOF);
-    }
-    char c = buf[0];
-    while (is_whitespace(c)) {
-        skip(1);
-        len = 1;
-        peek(&len);
-        if (len == 0) {
-            return simple(TOK_EOF);
-        }
-        c = buf[0];
-    }
-    if (begins_integer(c)) {
-        len = 2;
-        peek(&len);
-        if (is_digit(buf[0]) || (len > 1 && is_digit(buf[1])))
-            return lex_integer();
-    }
-    if (c == '"') {
-        union token_data data;
-        data.string = lex_string();
-        struct token tok = { TOK_STRING, data };
-        return tok;
-    }
-    if (c == '\'') {
-        skip(1);
-        union token_data data;
-        data.label = lex_identifier();
-        struct token tok = { TOK_LABEL, data };
-        return tok;
-    }
+static struct token lex(void) {
+    char c;
+    do {
+        c = nextc();
+    } while (is_whitespace(c));
+    _Bool sign = false;
    switch (c) {
+        case 0:
+            return simple(TOK_EOF);
+        case '"': {
+            union token_data data;
+            data.string = lex_string();
+            struct token tok = { TOK_STRING, data };
+            return tok;
+        }
+        case '\'': {
+            union token_data data;
+            data.label = lex_identifier();
+            struct token tok = { TOK_LABEL, data };
+            return tok;
+        }
        case '{':
-            skip(1);
            return simple(TOK_OPEN_BLOCK);
        case '}':
-            skip(1);
            return simple(TOK_CLOSE_BLOCK);
        case '(':
-            skip(1);
            return simple(TOK_OPEN_GROUP);
        case ')':
-            skip(1);
            return simple(TOK_CLOSE_GROUP);
        case ';':
-            skip(1);
            return simple(TOK_TERMINATOR);
        case ',':
-            skip(1);
            return simple(TOK_SEPARATOR);
        case '=':
-            skip(1);
            return op(OP_EQ);
        case '+':
-            skip(1);
            return op(OP_ADD);
        case '-':
-            skip(1);
-            len = 1;
-            buf = peek(&len);
-            if (len == 1 && buf[0] == '>') {
-                 skip(1);
+            if (peekc() == '>') {
+                nextc();
                return op(OP_FUN);
            }
+            if (is_digit(peekc())) {
+                return lex_integer(true);
+            }
            return op(OP_SUB);
        case '*':
-            skip(1);
            return op(OP_MUL);
        case '/':
-            skip(1);
            return op(OP_DIV);
        case '%':
-            skip(1);
            return op(OP_MOD);
        case '~':
-            skip(1);
            return op(OP_INV);
        case '&':
-            skip(1);
            return op(OP_AND);
        case '|':
-            skip(1);
            return op(OP_OR);
        case '^':
-            skip(1);
            return op(OP_XOR);
        case '!':
-            skip(1);
-            len = 1;
-             buf = peek(&len);
-            if (len == 1 && buf[0] == '=') {
-                skip(1);
+            if (peekc() == '=') {
+                nextc();
                return op(OP_NE);
            }
            return op(OP_NOT);
        case ':':
-            skip(1);
            return op(OP_TYPE);
-         case '>':
-            skip(1);
-            len = 2;
-            buf = peek(&len);
-            if (len == 2 && buf[0] == '>' && buf[1] == '>') {
-                skip(2);
-                return op(OP_SHR);
-            } else if (len >= 1 && buf[0] == '>') {
-                skip(1);
-                return op(OP_SAR);
-            } else if (len >= 1 && buf[0] == '=') {
-                skip(1);
+        case '>':
+            c = peekc();
+            if (c == '=') {
+                nextc();
                return op(OP_GTE);
            }
+            if (c == '>') {
+                nextc();
+                if (peekc() == '>') {
+                    nextc();
+                    return op(OP_SHR);
+                }
+                return op(OP_SAR);
+            }
            return op(OP_GT);
        case '<':
-            skip(1);
-            len = 1;
-            buf = peek(&len);
-            if (len == 1 && buf[0] == '<') {
-                skip(1);
+            c = peekc();
+            if (c == '<') {
+                nextc();
                return op(OP_SHL);
-            } else if (len == 1 && buf[0] == '=') {
-                skip(1);
+            }
+            if (c == '=') {
+                nextc();
                return op(OP_LTE);
            }
            return op(OP_LT);
    }
+    unnextc(c);
+    if (is_digit(c)) {
+        return lex_integer(false);
+    }
    union token_data data;
    data.name = lex_identifier();
    struct token tok = { TOK_NAME, data };
    return tok;
 }

+static _Bool peeked = false;
+static struct token peek_buf;
+
+struct token next(void) {
+    if (peeked) {
+        peeked = false;
+        return peek_buf;
+    }
+    return lex();
+}
+
+void unnext(struct token tok) {
+    assert(!peeked);
+    peeked = true;
+    peek_buf = tok;
+}
+
+struct token peek(void) {
+    if (!peeked) {
+        peek_buf = lex();
+        peeked = true;
+    }
+    return peek_buf;
+}
+
 void print_token(struct token tok) {
    switch (tok.type) {
        case TOK_NAME:
@ -423,7 +368,7 @@ void print_token(struct token tok) {
            fprintf(stdout, ",");
            break;
        case TOK_OPERATOR:
-            fprintf(stdout, "OP: %i", tok.data.op);
+            fprintf(stdout, "OP:%i", tok.data.op);
            break;
        case TOK_EOF:
            fprintf(stdout, "EOF");
--- a/src/lex.h
+++ b/src/lex.h
@ -5,6 +5,7 @@
 #include <stdint.h>

 enum token_type {
+    TOK_EOF,         // end of file
    TOK_NAME,        // foo, bar_quux123, loop
    TOK_LABEL,       // 'my_loop
    TOK_INTEGER,     // -123, 16#DEADBEEF
@ -16,7 +17,6 @@ enum token_type {
    TOK_CLOSE_BLOCK, // }
    TOK_TERMINATOR,  // ;
    TOK_SEPARATOR,   // ,
-    TOK_EOF,         // end of file
 };

 enum operator_ {
@ -64,7 +64,9 @@ _Bool is_unary(enum operator_ op);
 _Bool is_binary(enum operator_ op);
 _Bool is_lit(struct token tok);

-struct token lex(void);
+struct token next(void);
+void unnext(struct token tok);
+struct token peek(void);

 void print_token(struct token tok);

--- a/src/parse.c
+++ b/src/parse.c
@ -90,10 +90,11 @@ static _Bool is_expr(struct token tok) {
 void parse(void) {
    // TODO: add support for the top-level instead of this block hack
    push(ST_BLOCK);
-    struct token tok = lex();
-    struct token next = lex();
+    struct token tok = next();
+    struct token nxt;
    while (sp > 0) {
-        debug_print(tok, next);
+        nxt = peek();
+        debug_print(tok, nxt);
        // FIXME: stack underflow because we're faking the top-level with blocks
        switch (pop()) {
            case ST_BLOCK_OPEN:
@ -110,7 +111,7 @@ void parse(void) {
                    push(ST_BLOCK);
                    break;
                }
-                if (is_assignment(tok, next)) {
+                if (is_assignment(tok, nxt)) {
                    push(ST_BLOCK);
                    push(ST_ASSIGN);
                    break;
@ -144,21 +145,21 @@ void parse(void) {
                    if (strcmp(name, "loop") == 0) {
                        push(ST_BLOCK_OPEN);
                        push(ST_LOOP_VARS);
-                        if (next.type == TOK_LABEL) {
+                        if (nxt.type == TOK_LABEL) {
                            push(ST_LABEL);
                        }
                        break;
                    }
                    if (strcmp(name, "next") == 0) {
                        push(ST_LOOP_VARS);
-                        if (next.type == TOK_LABEL) {
+                        if (nxt.type == TOK_LABEL) {
                            push(ST_LABEL);
                        }
                        break;
                    }
                    if (strcmp(name, "exit") == 0) {
                        push(ST_EXPR);
-                        if (next.type == TOK_LABEL) {
+                        if (nxt.type == TOK_LABEL) {
                            push(ST_LABEL);
                        }
                        break;
@ -201,7 +202,7 @@ void parse(void) {
                }
                continue;
            case ST_LOOP_VARS:
-                if (is_assignment(tok, next)) {
+                if (is_assignment(tok, nxt)) {
                    push(ST_LOOP_VARS_CONT);
                    push(ST_ASSIGN);
                    break;
@ -221,8 +222,7 @@ void parse(void) {
                assert(tok.type == TOK_LABEL);
                break;
        }
-        tok = next;
-        next = lex();
+        tok = next();
    }
    if (tok.type != TOK_EOF) {
        fprintf(stderr, "syntax error: finished parsing before end of file\n");