From bce39fdc22fc0abdfd66100c374388a4bbb6c1d1 Mon Sep 17 00:00:00 2001 From: James Martin Date: Wed, 7 Sep 2022 23:02:15 -0700 Subject: [PATCH] Greatly simplify lexer thanks to new knowledge of lookahead. I now know that the parser is LL(1) and that the lexer also only needs one-character lookahead, which allows me to dramatically simplify the interface for IO input, and improve the interface to the lexer. Even if I did want unbounded peek, I'd want the interface to be `peek(off)`, not that awful buffer. I intend to use the new lexer interface to let each parser state do more work, potentially reading multiple tokens in a row. Then, states would only be needed for recursive structures, without the awkward intermediate states like ST_LABEL, which exists only to let me burn a token. I also removed the nonsense related to base 64 parsing, because it was unclear how to handle it, and the advantages of having it weren't clear. I kept support for bases up to 36, but honestly I might want to consider getting rid of everything but decimal, hex, and binary anyway. I'm not sure if I'd want to keep using the current syntax for the radix either. 
--- src/io.c | 53 ++++----- src/io.h | 5 +- src/lex.c | 311 +++++++++++++++++++++------------------------------- src/lex.h | 6 +- src/parse.c | 20 ++-- 5 files changed, 173 insertions(+), 222 deletions(-) diff --git a/src/io.c b/src/io.c index 6c946f9..aaea51b 100644 --- a/src/io.c +++ b/src/io.c @@ -1,6 +1,8 @@ #include "io.h" +#include #include +#include #include #include #include @@ -104,38 +106,39 @@ void patch_i32(size_t off, int32_t x) { patch_u32(off, (uint32_t) x); } -#define MAX_LOOKAHEAD 4 -static size_t read_buf_len = 0; -static char read_buf[MAX_LOOKAHEAD]; +static char peek_buf; +static _Bool peeked; -char* peek(size_t* len) { - if (*len >= MAX_LOOKAHEAD) { - fprintf(stderr, "syntax error: maximum lookahead exceeded\n"); - exit(1); - } - if (*len >= read_buf_len) { - size_t inc = fread(read_buf + read_buf_len, 1, *len - read_buf_len, infile); +static char next_(void) { + int c = getc(infile); + if (c == EOF) { if (ferror(infile)) { fprintf(stderr, "failed to read source file: %s\n", strerror(errno)); exit(1); } - read_buf_len += inc; - *len = read_buf_len; + return 0; } - return read_buf; + return c; } -void skip(size_t off) { - if (read_buf_len > off) { - memmove(read_buf, &read_buf[off], read_buf_len - off); - read_buf_len -= off; - } else { - if (read_buf_len < off) { - if (fseek(infile, off - read_buf_len, SEEK_CUR) != 0) { - fprintf(stderr, "failed to seek in source file: %s\n", strerror(errno)); - exit(1); - } - } - read_buf_len = 0; +char nextc(void) { + if (peeked) { + peeked = false; + return peek_buf; } + return next_(); +} + +void unnextc(char c) { + assert(!peeked); + peek_buf = c; + peeked = true; +} + +char peekc(void) { + if (!peeked) { + peek_buf = next_(); + peeked = true; + } + return peek_buf; } diff --git a/src/io.h b/src/io.h index f865434..de359da 100644 --- a/src/io.h +++ b/src/io.h @@ -19,7 +19,8 @@ void patch(size_t off, const void* ptr, size_t count); void patch_u32(size_t off, uint32_t x); void patch_i32(size_t off, int32_t 
x); -char* peek(size_t* len); -void skip(size_t off); +char nextc(void); +void unnextc(char c); +char peekc(void); #endif diff --git a/src/lex.c b/src/lex.c index 5c294c6..0e83443 100644 --- a/src/lex.c +++ b/src/lex.c @@ -52,10 +52,6 @@ static _Bool is_digit(char c) { return c >= '0' && c <= '9'; } -static _Bool begins_integer(char c) { - return is_digit(c) || c == '-'; -} - static _Bool id_char(char c) { return is_alpha(c) || is_digit(c) || c == '_'; } @@ -73,72 +69,41 @@ static struct token op(enum operator_ op) { } static uint8_t digit_value(uint8_t base, char c) { - // TODO: sort out this mess - - // restrict bases to avoid having to make decisions about how to handle - // upper vs. lower and base64. (letters before digits? seriously?) - if (base != 2 && base != 4 && base != 8 && base != 10 && base != 16) { - fprintf(stderr, "lexical error: illegal integer base (for now)\n"); - exit(1); - } - // who invented this???? why can't 0 be 0? screw you. - if (base == 64) { - if (is_digit(c)) return c - '0' + 52; - if (c >= 'A' && c <= 'Z') return c - 'A'; - if (c >= 'a' && c <= 'Z') return c - 'a' + 26; - if (c == '+') return 62; - // c == '/' - return 63; - } - if (is_digit(c)) return c - '0'; - if (c >= 'A' && c <= 'Z') return c - 'A' + 10; - if (c >= 'a' && c <= 'z') { - if (base > 36) { - return c - 'a' + 36; - } - return c - 'a' + 10; - } - if (c == '+') return 62; - // c == '/' - return 63; + assert(base <= 36); + if (is_digit(c)) { return c - '0'; } + if (c >= 'A' && c <= 'Z') { return c - 'A' + 0xA; } + if (c >= 'a' && c <= 'z') { return c - 'a' + 0xA; } + assert(0); } static _Bool is_extended_digit(uint8_t base, char c) { - if (!is_digit(c) && !is_alpha(c) && c != '+' && c != '/') + if (!is_digit(c) && !is_alpha(c)) { return false; + } uint8_t val = digit_value(base, c); - if (val > base) - return false; - return true; + return val < base; } static uint64_t lex_digits(uint8_t base) { - uint64_t acc = 0; - char* buf; - size_t len; _Bool at_least_one_char = 
false; + uint64_t acc = 0; while (true) { - len = 1; - buf = peek(&len); - if (!is_extended_digit(base, buf[0])) { + char c = peekc(); + if (!is_extended_digit(base, c)) { // commas are legal digit separators - if (buf[0] == ',') { - len = 2; - buf = peek(&len); - if (len == 2 && is_extended_digit(base, buf[1])) { - skip(1); - continue; - } + if (c == ',' && is_extended_digit(base, peekc())) { + nextc(); + continue; } break; } - uint8_t digit = digit_value(base, buf[0]); + nextc(); + uint8_t digit = digit_value(base, c); // (val * base + digit) > UINT64_MAX if (acc > ((UINT64_MAX - digit) / base)) { fprintf(stderr, "lexical error: integer literal overflow\n"); exit(1); } - skip(1); at_least_one_char = true; acc *= base; acc += digit; @@ -150,32 +115,28 @@ static uint64_t lex_digits(uint8_t base) { return acc; } -static struct token lex_integer(void) { - _Bool sign = false; - char* buf; - size_t len = 1; - buf = peek(&len); - assert(len > 0 && begins_integer(buf[0])); - if (buf[0] == '-') { - sign = true; - skip(1); - } +static struct token lex_integer(_Bool sign) { uint64_t acc = lex_digits(10); - len = 1; - buf = peek(&len); - if (len == 1 && buf[0] == '#') { - if (acc > 64) { + if (peekc() == '#') { + if (acc < 2) { + fprintf(stderr, "lexical error: integer literal base too small\n"); + exit(1); + } + if (acc > 36) { fprintf(stderr, "lexical error: integer literal base too large\n"); exit(1); } - skip(1); + nextc(); acc = lex_digits((uint8_t) acc); } if (sign && acc > INT64_MAX) { fprintf(stderr, "lexical error: integer literal overflow due to sign\n"); exit(1); } - int64_t val = sign ? 
-(int64_t) acc : (int64_t) acc; + int64_t val = (int64_t) acc; + if (sign) { + val = -val; + } union token_data data; data.int_ = val; struct token tok = { TOK_INTEGER, data }; @@ -184,212 +145,196 @@ static struct token lex_integer(void) { #define MAX_STR_LEN 4096 static size_t str_index; -static char str_buf[MAX_STR_LEN]; +// alternate string buffers between tokens to prevent overwriting buffer. +// we're LL(1) so 2 buffers is sufficient. +static _Bool which_buf = false; +static char str_buf_1[MAX_STR_LEN]; +static char str_buf_2[MAX_STR_LEN]; -static char* leak_buf(void) { - // FIXME: memory leak - char* str = malloc(str_index + 1); - memcpy(str, str_buf, str_index); - str[str_index] = 0; - return str; +static char* str_buf(void) { + which_buf = !which_buf; + if (which_buf) { + return str_buf_1; + } + return str_buf_2; } static char* lex_string(void) { - char* buf; - size_t len = 1; - buf = peek(&len); - assert(len == 1 && buf[0] == '"'); - skip(1); str_index = 0; + char* buf = str_buf(); while (true) { + char c = nextc(); if (str_index == MAX_STR_LEN - 1) { fprintf(stderr, "lexical error: string too long\n"); exit(1); } - len = 1; - buf = peek(&len); - if (len < 1) { + if (c == 0) { fprintf(stderr, "lexical error: unclosed string (reached end of file)\n"); exit(1); } // TODO: string escapes, multi-line strings - if (buf[0] == '"') { - skip(1); + if (c == '"') { break; } - if (buf[0] == '\n') { + if (c == '\n') { fprintf(stderr, "lexical error: unclosed string (reached end of line)\n"); exit(1); } - str_buf[str_index] = buf[0]; + buf[str_index] = c; str_index++; - skip(1); } - str_buf[str_index] = 0; - return leak_buf(); + buf[str_index] = 0; + return buf; } static char* lex_identifier(void) { - char* buf; - size_t len; - char c; str_index = 0; + char* buf = str_buf(); while (true) { - len = 1; - buf = peek(&len); - if (len == 0) break; - c = buf[0]; + char c = peekc(); if (!is_alpha(c) && !is_digit(c) && c != '_') break; - skip(1); - str_buf[str_index] = c; + 
nextc(); + buf[str_index] = c; str_index++; } if (str_index == 0) { fprintf(stderr, "lexical error: expected identifier (possibly illegal character?)\n"); exit(1); } - str_buf[str_index] = 0; - return leak_buf(); + buf[str_index] = 0; + return buf; } -struct token lex(void) { - char* buf; - size_t len; - len = 1; - buf = peek(&len); - if (len < 1) { - return simple(TOK_EOF); - } - char c = buf[0]; - while (is_whitespace(c)) { - skip(1); - len = 1; - peek(&len); - if (len == 0) { - return simple(TOK_EOF); - } - c = buf[0]; - } - if (begins_integer(c)) { - len = 2; - peek(&len); - if (is_digit(buf[0]) || (len > 1 && is_digit(buf[1]))) - return lex_integer(); - } - if (c == '"') { - union token_data data; - data.string = lex_string(); - struct token tok = { TOK_STRING, data }; - return tok; - } - if (c == '\'') { - skip(1); - union token_data data; - data.label = lex_identifier(); - struct token tok = { TOK_LABEL, data }; - return tok; - } +static struct token lex(void) { + char c; + do { + c = nextc(); + } while (is_whitespace(c)); + _Bool sign = false; switch (c) { + case 0: + return simple(TOK_EOF); + case '"': { + union token_data data; + data.string = lex_string(); + struct token tok = { TOK_STRING, data }; + return tok; + } + case '\'': { + union token_data data; + data.label = lex_identifier(); + struct token tok = { TOK_LABEL, data }; + return tok; + } case '{': - skip(1); return simple(TOK_OPEN_BLOCK); case '}': - skip(1); return simple(TOK_CLOSE_BLOCK); case '(': - skip(1); return simple(TOK_OPEN_GROUP); case ')': - skip(1); return simple(TOK_CLOSE_GROUP); case ';': - skip(1); return simple(TOK_TERMINATOR); case ',': - skip(1); return simple(TOK_SEPARATOR); case '=': - skip(1); return op(OP_EQ); case '+': - skip(1); return op(OP_ADD); case '-': - skip(1); - len = 1; - buf = peek(&len); - if (len == 1 && buf[0] == '>') { - skip(1); + if (peekc() == '>') { + nextc(); return op(OP_FUN); } + if (is_digit(peekc())) { + return lex_integer(true); + } return 
op(OP_SUB); case '*': - skip(1); return op(OP_MUL); case '/': - skip(1); return op(OP_DIV); case '%': - skip(1); return op(OP_MOD); case '~': - skip(1); return op(OP_INV); case '&': - skip(1); return op(OP_AND); case '|': - skip(1); return op(OP_OR); case '^': - skip(1); return op(OP_XOR); case '!': - skip(1); - len = 1; - buf = peek(&len); - if (len == 1 && buf[0] == '=') { - skip(1); + if (peekc() == '=') { + nextc(); return op(OP_NE); } return op(OP_NOT); case ':': - skip(1); return op(OP_TYPE); - case '>': - skip(1); - len = 2; - buf = peek(&len); - if (len == 2 && buf[0] == '>' && buf[1] == '>') { - skip(2); - return op(OP_SHR); - } else if (len >= 1 && buf[0] == '>') { - skip(1); - return op(OP_SAR); - } else if (len >= 1 && buf[0] == '=') { - skip(1); + case '>': + c = peekc(); + if (c == '=') { + nextc(); return op(OP_GTE); } + if (c == '>') { + nextc(); + if (peekc() == '>') { + nextc(); + return op(OP_SHR); + } + return op(OP_SAR); + } return op(OP_GT); case '<': - skip(1); - len = 1; - buf = peek(&len); - if (len == 1 && buf[0] == '<') { - skip(1); + c = peekc(); + if (c == '<') { + nextc(); return op(OP_SHL); - } else if (len == 1 && buf[0] == '=') { - skip(1); + } + if (c == '=') { + nextc(); return op(OP_LTE); } return op(OP_LT); } + unnextc(c); + if (is_digit(c)) { + return lex_integer(false); + } union token_data data; data.name = lex_identifier(); struct token tok = { TOK_NAME, data }; return tok; } +static _Bool peeked = false; +static struct token peek_buf; + +struct token next(void) { + if (peeked) { + peeked = false; + return peek_buf; + } + return lex(); +} + +void unnext(struct token tok) { + assert(!peeked); + peeked = true; + peek_buf = tok; +} + +struct token peek(void) { + if (!peeked) { + peek_buf = lex(); + peeked = true; + } + return peek_buf; +} + void print_token(struct token tok) { switch (tok.type) { case TOK_NAME: @@ -423,7 +368,7 @@ void print_token(struct token tok) { fprintf(stdout, ","); break; case TOK_OPERATOR: - 
fprintf(stdout, "OP: %i", tok.data.op); + fprintf(stdout, "OP:%i", tok.data.op); break; case TOK_EOF: fprintf(stdout, "EOF"); diff --git a/src/lex.h b/src/lex.h index b981382..488c61b 100644 --- a/src/lex.h +++ b/src/lex.h @@ -5,6 +5,7 @@ #include enum token_type { + TOK_EOF, // end of file TOK_NAME, // foo, bar_quux123, loop TOK_LABEL, // 'my_loop TOK_INTEGER, // -123, 16#DEADBEEF @@ -16,7 +17,6 @@ enum token_type { TOK_CLOSE_BLOCK, // } TOK_TERMINATOR, // ; TOK_SEPARATOR, // , - TOK_EOF, // end of file }; enum operator_ { @@ -64,7 +64,9 @@ _Bool is_unary(enum operator_ op); _Bool is_binary(enum operator_ op); _Bool is_lit(struct token tok); -struct token lex(void); +struct token next(void); +void unnext(struct token tok); +struct token peek(void); void print_token(struct token tok); diff --git a/src/parse.c b/src/parse.c index 4d3cd7f..bf6e0a6 100644 --- a/src/parse.c +++ b/src/parse.c @@ -90,10 +90,11 @@ static _Bool is_expr(struct token tok) { void parse(void) { // TODO: add support for the top-level instead of this block hack push(ST_BLOCK); - struct token tok = lex(); - struct token next = lex(); + struct token tok = next(); + struct token nxt; while (sp > 0) { - debug_print(tok, next); + nxt = peek(); + debug_print(tok, nxt); // FIXME: stack underflow because we're faking the top-level with blocks switch (pop()) { case ST_BLOCK_OPEN: @@ -110,7 +111,7 @@ void parse(void) { push(ST_BLOCK); break; } - if (is_assignment(tok, next)) { + if (is_assignment(tok, nxt)) { push(ST_BLOCK); push(ST_ASSIGN); break; @@ -144,21 +145,21 @@ void parse(void) { if (strcmp(name, "loop") == 0) { push(ST_BLOCK_OPEN); push(ST_LOOP_VARS); - if (next.type == TOK_LABEL) { + if (nxt.type == TOK_LABEL) { push(ST_LABEL); } break; } if (strcmp(name, "next") == 0) { push(ST_LOOP_VARS); - if (next.type == TOK_LABEL) { + if (nxt.type == TOK_LABEL) { push(ST_LABEL); } break; } if (strcmp(name, "exit") == 0) { push(ST_EXPR); - if (next.type == TOK_LABEL) { + if (nxt.type == TOK_LABEL) { 
push(ST_LABEL); } break; @@ -201,7 +202,7 @@ void parse(void) { } continue; case ST_LOOP_VARS: - if (is_assignment(tok, next)) { + if (is_assignment(tok, nxt)) { push(ST_LOOP_VARS_CONT); push(ST_ASSIGN); break; @@ -221,8 +222,7 @@ void parse(void) { assert(tok.type == TOK_LABEL); break; } - tok = next; - next = lex(); + tok = next(); } if (tok.type != TOK_EOF) { fprintf(stderr, "syntax error: finished parsing before end of file\n");