From bce39fdc22fc0abdfd66100c374388a4bbb6c1d1 Mon Sep 17 00:00:00 2001
From: James Martin <james@jtm.dev>
Date: Wed, 7 Sep 2022 23:02:15 -0700
Subject: [PATCH] Greatly simplify lexer thanks to new knowledge of lookahead.

I now know that the parser is LL(1) and that the lexer also needs
only one character of lookahead, which allows me to dramatically
simplify the interface for IO input and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not the old `peek(&len)` buffer.

I intend to use the new lexer interface to make the parser
states more stateful, so that a single state can potentially
read multiple tokens in a row. States would then only be needed
for recursive structures, without awkward intermediate states
like ST_LABEL, which exists only to let me burn a token.

I also removed the nonsense related to base-64 parsing, because
it was unclear how to handle it and the advantages of having it
weren't clear. I kept support for bases 2 through 36, but
honestly I might want to consider getting rid of everything but
decimal, hex, and binary anyway. I'm not sure I want to keep
the current radix syntax (e.g. `16#DEADBEEF`) either.
---
 src/io.c    |  53 ++++-----
 src/io.h    |   5 +-
 src/lex.c   | 311 +++++++++++++++++++++-------------------------------
 src/lex.h   |   6 +-
 src/parse.c |  20 ++--
 5 files changed, 173 insertions(+), 222 deletions(-)

diff --git a/src/io.c b/src/io.c
index 6c946f9..aaea51b 100644
--- a/src/io.c
+++ b/src/io.c
@@ -1,6 +1,8 @@
 #include "io.h"
 
+#include <assert.h>
 #include <errno.h>
+#include <stdbool.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
@@ -104,38 +106,39 @@ void patch_i32(size_t off, int32_t x) {
     patch_u32(off, (uint32_t) x);
 }
 
-#define MAX_LOOKAHEAD 4
-static size_t read_buf_len = 0;
-static char read_buf[MAX_LOOKAHEAD];
+static char peek_buf;
+static _Bool peeked;
 
-char* peek(size_t* len) {
-    if (*len >= MAX_LOOKAHEAD) {
-        fprintf(stderr, "syntax error: maximum lookahead exceeded\n");
-        exit(1);
-    }
-    if (*len >= read_buf_len) {
-        size_t inc = fread(read_buf + read_buf_len, 1, *len - read_buf_len, infile);
+static char next_(void) {
+    int c = getc(infile);
+    if (c == EOF) {
         if (ferror(infile)) {
             fprintf(stderr, "failed to read source file: %s\n", strerror(errno));
             exit(1);
         }
-        read_buf_len += inc;
-        *len = read_buf_len;
+        return 0;
     }
-    return read_buf;
+    return c;
 }
 
-void skip(size_t off) {
-    if (read_buf_len > off) {
-        memmove(read_buf, &read_buf[off], read_buf_len - off);
-        read_buf_len -= off;
-    } else {
-        if (read_buf_len < off) {
-            if (fseek(infile, off - read_buf_len, SEEK_CUR) != 0) {
-                fprintf(stderr, "failed to seek in source file: %s\n", strerror(errno));
-                exit(1);
-            }
-        }
-        read_buf_len = 0;
+char nextc(void) {
+    if (peeked) {
+        peeked = false;
+        return peek_buf;
     }
+    return next_();
+}
+
+void unnextc(char c) {
+    assert(!peeked);
+    peek_buf = c;
+    peeked = true;
+}
+
+char peekc(void) {
+    if (!peeked) {
+        peek_buf = next_();
+        peeked = true;
+    }
+    return peek_buf;
 }
diff --git a/src/io.h b/src/io.h
index f865434..de359da 100644
--- a/src/io.h
+++ b/src/io.h
@@ -19,7 +19,8 @@ void patch(size_t off, const void* ptr, size_t count);
 void patch_u32(size_t off, uint32_t x);
 void patch_i32(size_t off, int32_t x);
 
-char* peek(size_t* len);
-void skip(size_t off);
+char nextc(void);
+void unnextc(char c);
+char peekc(void);
 
 #endif
diff --git a/src/lex.c b/src/lex.c
index 5c294c6..0e83443 100644
--- a/src/lex.c
+++ b/src/lex.c
@@ -52,10 +52,6 @@ static _Bool is_digit(char c) {
     return c >= '0' && c <= '9';
 }
 
-static _Bool begins_integer(char c) {
-    return is_digit(c) || c == '-';
-}
-
 static _Bool id_char(char c) {
     return is_alpha(c) || is_digit(c) || c == '_';
 }
@@ -73,72 +69,41 @@ static struct token op(enum operator_ op) {
 }
 
 static uint8_t digit_value(uint8_t base, char c) {
-    // TODO: sort out this mess
-
-    // restrict bases to avoid having to make decisions about how to handle
-    // upper vs. lower and base64. (letters before digits? seriously?)
-    if (base != 2 && base != 4 && base != 8 && base != 10 && base != 16) {
-        fprintf(stderr, "lexical error: illegal integer base (for now)\n");
-        exit(1);
-    }
-    // who invented this???? why can't 0 be 0? screw you.
-    if (base == 64) {
-        if (is_digit(c)) return c - '0' + 52;
-        if (c >= 'A' && c <= 'Z') return c - 'A';
-        if (c >= 'a' && c <= 'Z') return c - 'a' + 26;
-        if (c == '+') return 62;
-        // c == '/'
-        return 63;
-    }
-    if (is_digit(c)) return c - '0';
-    if (c >= 'A' && c <= 'Z') return c - 'A' + 10;
-    if (c >= 'a' && c <= 'z') {
-        if (base > 36) {
-            return c - 'a' + 36;
-        }
-        return c - 'a' + 10;
-    }
-    if (c == '+') return 62;
-    // c == '/'
-    return 63;
+    assert(base <= 36);
+    if (is_digit(c)) { return c - '0'; }
+    if (c >= 'A' && c <= 'Z') { return c - 'A' + 0xA; }
+    if (c >= 'a' && c <= 'z') { return c - 'a' + 0xA; }
+    assert(0);
 }
 
 static _Bool is_extended_digit(uint8_t base, char c) {
-    if (!is_digit(c) && !is_alpha(c) && c != '+' && c != '/')
+    if (!is_digit(c) && !is_alpha(c)) {
         return false;
+    }
     uint8_t val = digit_value(base, c);
-    if (val > base)
-        return false;
-    return true;
+    return val < base;
 }
 
 static uint64_t lex_digits(uint8_t base) {
-    uint64_t acc = 0;
-    char* buf;
-    size_t len;
     _Bool at_least_one_char = false;
+    uint64_t acc = 0;
     while (true) {
-        len = 1;
-        buf = peek(&len);
-        if (!is_extended_digit(base, buf[0])) {
+        char c = peekc();
+        if (!is_extended_digit(base, c)) {
             // commas are legal digit separators
-            if (buf[0] == ',') {
-                len = 2;
-                buf = peek(&len);
-                if (len == 2 && is_extended_digit(base, buf[1])) {
-                    skip(1);
-                    continue;
-                }
+            if (c == ',' && is_extended_digit(base, peekc())) {
+                nextc();
+                continue;
             }
             break;
         }
-        uint8_t digit = digit_value(base, buf[0]);
+        nextc();
+        uint8_t digit = digit_value(base, c);
         // (val * base + digit) > UINT64_MAX
         if (acc > ((UINT64_MAX - digit) / base)) {
             fprintf(stderr, "lexical error: integer literal overflow\n");
             exit(1);
         }
-        skip(1);
         at_least_one_char = true;
         acc *= base;
         acc += digit;
@@ -150,32 +115,28 @@ static uint64_t lex_digits(uint8_t base) {
     return acc;
 }
 
-static struct token lex_integer(void) {
-    _Bool sign = false;
-    char* buf;
-    size_t len = 1;
-    buf = peek(&len);
-    assert(len > 0 && begins_integer(buf[0]));
-    if (buf[0] == '-') {
-        sign = true;
-        skip(1);
-    }
+static struct token lex_integer(_Bool sign) {
     uint64_t acc = lex_digits(10);
-    len = 1;
-    buf = peek(&len);
-    if (len == 1 && buf[0] == '#') {
-        if (acc > 64) {
+    if (peekc() == '#') {
+        if (acc < 2) {
+            fprintf(stderr, "lexical error: integer literal base too small\n");
+            exit(1);
+        }
+        if (acc > 36) {
             fprintf(stderr, "lexical error: integer literal base too large\n");
             exit(1);
         }
-        skip(1);
+        nextc();
         acc = lex_digits((uint8_t) acc);
     }
     if (sign && acc > INT64_MAX) {
         fprintf(stderr, "lexical error: integer literal overflow due to sign\n");
         exit(1);
     }
-    int64_t val = sign ? -(int64_t) acc : (int64_t) acc;
+    int64_t val = (int64_t) acc;
+    if (sign) {
+        val = -val;
+    }
     union token_data data;
     data.int_ = val;
     struct token tok = { TOK_INTEGER, data };
@@ -184,212 +145,196 @@ static struct token lex_integer(void) {
 
 #define MAX_STR_LEN 4096
 static size_t str_index;
-static char str_buf[MAX_STR_LEN];
+// Alternate between two string buffers so that lexing a new token does not
+// overwrite the previous token's text. The parser is LL(1), so two suffice.
+static _Bool which_buf = false;
+static char str_buf_1[MAX_STR_LEN];
+static char str_buf_2[MAX_STR_LEN];
 
-static char* leak_buf(void) {
-    // FIXME: memory leak
-    char* str = malloc(str_index + 1);
-    memcpy(str, str_buf, str_index);
-    str[str_index] = 0;
-    return str;
+static char* str_buf(void) {
+    which_buf = !which_buf;
+    if (which_buf) {
+        return str_buf_1;
+    }
+    return str_buf_2;
 }
 
 static char* lex_string(void) {
-    char* buf;
-    size_t len = 1;
-    buf = peek(&len);
-    assert(len == 1 && buf[0] == '"');
-    skip(1);
     str_index = 0;
+    char* buf = str_buf();
     while (true) {
+        char c = nextc();
         if (str_index == MAX_STR_LEN - 1) {
             fprintf(stderr, "lexical error: string too long\n");
             exit(1);
         }
-        len = 1;
-        buf = peek(&len);
-        if (len < 1) {
+        if (c == 0) {
             fprintf(stderr, "lexical error: unclosed string (reached end of file)\n");
             exit(1);
         }
         // TODO: string escapes, multi-line strings
-        if (buf[0] == '"') {
-            skip(1);
+        if (c == '"') {
             break;
         }
-        if (buf[0] == '\n') {
+        if (c == '\n') {
             fprintf(stderr, "lexical error: unclosed string (reached end of line)\n");
             exit(1);
         }
-        str_buf[str_index] = buf[0];
+        buf[str_index] = c;
         str_index++;
-        skip(1);
     }
-    str_buf[str_index] = 0;
-    return leak_buf();
+    buf[str_index] = 0;
+    return buf;
 }
 
 static char* lex_identifier(void) {
-    char* buf;
-    size_t len;
-    char c;
     str_index = 0;
+    char* buf = str_buf();
     while (true) {
-        len = 1;
-        buf = peek(&len);
-        if (len == 0) break;
-        c = buf[0];
+        char c = peekc();
         if (!is_alpha(c) && !is_digit(c) && c != '_') break;
-        skip(1);
-        str_buf[str_index] = c;
+        nextc();
+        buf[str_index] = c;
         str_index++;
     }
     if (str_index == 0) {
         fprintf(stderr, "lexical error: expected identifier (possibly illegal character?)\n");
         exit(1);
     }
-    str_buf[str_index] = 0;
-    return leak_buf();
+    buf[str_index] = 0;
+    return buf;
 }
 
-struct token lex(void) {
-    char* buf;
-    size_t len;
-    len = 1;
-    buf = peek(&len);
-    if (len < 1) {
-        return simple(TOK_EOF);
-    }
-    char c = buf[0];
-    while (is_whitespace(c)) {
-        skip(1);
-        len = 1;
-        peek(&len);
-        if (len == 0) {
-            return simple(TOK_EOF);
-        }
-        c = buf[0];
-    }
-    if (begins_integer(c)) {
-        len = 2;
-        peek(&len);
-        if (is_digit(buf[0]) || (len > 1 && is_digit(buf[1])))
-            return lex_integer();
-    }
-    if (c == '"') {
-        union token_data data;
-        data.string = lex_string();
-        struct token tok = { TOK_STRING, data };
-        return tok;
-    }
-    if (c == '\'') {
-        skip(1);
-        union token_data data;
-        data.label = lex_identifier();
-        struct token tok = { TOK_LABEL, data };
-        return tok;
-    }
+static struct token lex(void) {
+    char c;
+    do {
+        c = nextc();
+    } while (is_whitespace(c));
+    _Bool sign = false;
     switch (c) {
+        case 0:
+            return simple(TOK_EOF);
+        case '"': {
+            union token_data data;
+            data.string = lex_string();
+            struct token tok = { TOK_STRING, data };
+            return tok;
+        }
+        case '\'': {
+            union token_data data;
+            data.label = lex_identifier();
+            struct token tok = { TOK_LABEL, data };
+            return tok;
+        }
         case '{':
-            skip(1);
             return simple(TOK_OPEN_BLOCK);
         case '}':
-            skip(1);
             return simple(TOK_CLOSE_BLOCK);
         case '(':
-            skip(1);
             return simple(TOK_OPEN_GROUP);
         case ')':
-            skip(1);
             return simple(TOK_CLOSE_GROUP);
         case ';':
-            skip(1);
             return simple(TOK_TERMINATOR);
         case ',':
-            skip(1);
             return simple(TOK_SEPARATOR);
         case '=':
-            skip(1);
             return op(OP_EQ);
         case '+':
-            skip(1);
             return op(OP_ADD);
         case '-':
-            skip(1);
-            len = 1;
-            buf = peek(&len);
-            if (len == 1 && buf[0] == '>') {
-                 skip(1);
+            if (peekc() == '>') {
+                nextc();
                 return op(OP_FUN);
             }
+            if (is_digit(peekc())) {
+                return lex_integer(true);
+            }
             return op(OP_SUB);
         case '*':
-            skip(1);
             return op(OP_MUL);
         case '/':
-            skip(1);
             return op(OP_DIV);
         case '%':
-            skip(1);
             return op(OP_MOD);
         case '~':
-            skip(1);
             return op(OP_INV);
         case '&':
-            skip(1);
             return op(OP_AND);
         case '|':
-            skip(1);
             return op(OP_OR);
         case '^':
-            skip(1);
             return op(OP_XOR);
         case '!':
-            skip(1);
-            len = 1;
-             buf = peek(&len);
-            if (len == 1 && buf[0] == '=') {
-                skip(1);
+            if (peekc() == '=') {
+                nextc();
                 return op(OP_NE);
             }
             return op(OP_NOT);
         case ':':
-            skip(1);
             return op(OP_TYPE);
-         case '>':
-            skip(1);
-            len = 2;
-            buf = peek(&len);
-            if (len == 2 && buf[0] == '>' && buf[1] == '>') {
-                skip(2);
-                return op(OP_SHR);
-            } else if (len >= 1 && buf[0] == '>') {
-                skip(1);
-                return op(OP_SAR);
-            } else if (len >= 1 && buf[0] == '=') {
-                skip(1);
+        case '>':
+            c = peekc();
+            if (c == '=') {
+                nextc();
                 return op(OP_GTE);
             }
+            if (c == '>') {
+                nextc();
+                if (peekc() == '>') {
+                    nextc();
+                    return op(OP_SHR);
+                }
+                return op(OP_SAR);
+            }
             return op(OP_GT);
         case '<':
-            skip(1);
-            len = 1;
-            buf = peek(&len);
-            if (len == 1 && buf[0] == '<') {
-                skip(1);
+            c = peekc();
+            if (c == '<') {
+                nextc();
                 return op(OP_SHL);
-            } else if (len == 1 && buf[0] == '=') {
-                skip(1);
+            }
+            if (c == '=') {
+                nextc();
                 return op(OP_LTE);
             }
             return op(OP_LT);
     }
+    unnextc(c);
+    if (is_digit(c)) {
+        return lex_integer(false);
+    }
     union token_data data;
     data.name = lex_identifier();
     struct token tok = { TOK_NAME, data };
     return tok;
 }
 
+static _Bool peeked = false;
+static struct token peek_buf;
+
+struct token next(void) {
+    if (peeked) {
+        peeked = false;
+        return peek_buf;
+    }
+    return lex();
+}
+
+void unnext(struct token tok) {
+    assert(!peeked);
+    peeked = true;
+    peek_buf = tok;
+}
+
+struct token peek(void) {
+    if (!peeked) {
+        peek_buf = lex();
+        peeked = true;
+    }
+    return peek_buf;
+}
+
 void print_token(struct token tok) {
     switch (tok.type) {
         case TOK_NAME:
@@ -423,7 +368,7 @@ void print_token(struct token tok) {
             fprintf(stdout, ",");
             break;
         case TOK_OPERATOR:
-            fprintf(stdout, "OP: %i", tok.data.op);
+            fprintf(stdout, "OP:%i", tok.data.op);
             break;
         case TOK_EOF:
             fprintf(stdout, "EOF");
diff --git a/src/lex.h b/src/lex.h
index b981382..488c61b 100644
--- a/src/lex.h
+++ b/src/lex.h
@@ -5,6 +5,7 @@
 #include <stdint.h>
 
 enum token_type {
+    TOK_EOF,         // end of file
     TOK_NAME,        // foo, bar_quux123, loop
     TOK_LABEL,       // 'my_loop
     TOK_INTEGER,     // -123, 16#DEADBEEF
@@ -16,7 +17,6 @@ enum token_type {
     TOK_CLOSE_BLOCK, // }
     TOK_TERMINATOR,  // ;
     TOK_SEPARATOR,   // ,
-    TOK_EOF,         // end of file
 };
 
 enum operator_ {
@@ -64,7 +64,9 @@ _Bool is_unary(enum operator_ op);
 _Bool is_binary(enum operator_ op);
 _Bool is_lit(struct token tok);
 
-struct token lex(void);
+struct token next(void);
+void unnext(struct token tok);
+struct token peek(void);
 
 void print_token(struct token tok);
 
diff --git a/src/parse.c b/src/parse.c
index 4d3cd7f..bf6e0a6 100644
--- a/src/parse.c
+++ b/src/parse.c
@@ -90,10 +90,11 @@ static _Bool is_expr(struct token tok) {
 void parse(void) {
     // TODO: add support for the top-level instead of this block hack
     push(ST_BLOCK);
-    struct token tok = lex();
-    struct token next = lex();
+    struct token tok = next();
+    struct token nxt;
     while (sp > 0) {
-        debug_print(tok, next);
+        nxt = peek();
+        debug_print(tok, nxt);
         // FIXME: stack underflow because we're faking the top-level with blocks
         switch (pop()) {
             case ST_BLOCK_OPEN:
@@ -110,7 +111,7 @@ void parse(void) {
                     push(ST_BLOCK);
                     break;
                 }
-                if (is_assignment(tok, next)) {
+                if (is_assignment(tok, nxt)) {
                     push(ST_BLOCK);
                     push(ST_ASSIGN);
                     break;
@@ -144,21 +145,21 @@ void parse(void) {
                     if (strcmp(name, "loop") == 0) {
                         push(ST_BLOCK_OPEN);
                         push(ST_LOOP_VARS);
-                        if (next.type == TOK_LABEL) {
+                        if (nxt.type == TOK_LABEL) {
                             push(ST_LABEL);
                         }
                         break;
                     }
                     if (strcmp(name, "next") == 0) {
                         push(ST_LOOP_VARS);
-                        if (next.type == TOK_LABEL) {
+                        if (nxt.type == TOK_LABEL) {
                             push(ST_LABEL);
                         }
                         break;
                     }
                     if (strcmp(name, "exit") == 0) {
                         push(ST_EXPR);
-                        if (next.type == TOK_LABEL) {
+                        if (nxt.type == TOK_LABEL) {
                             push(ST_LABEL);
                         }
                         break;
@@ -201,7 +202,7 @@ void parse(void) {
                 }
                 continue;
             case ST_LOOP_VARS:
-                if (is_assignment(tok, next)) {
+                if (is_assignment(tok, nxt)) {
                     push(ST_LOOP_VARS_CONT);
                     push(ST_ASSIGN);
                     break;
@@ -221,8 +222,7 @@ void parse(void) {
                 assert(tok.type == TOK_LABEL);
                 break;
         }
-        tok = next;
-        next = lex();
+        tok = next();
     }
     if (tok.type != TOK_EOF) {
         fprintf(stderr, "syntax error: finished parsing before end of file\n");