From 162683d63e085e5317d71d4b34ec659984145a46 Mon Sep 17 00:00:00 2001
From: James Martin
Date: Wed, 7 Sep 2022 10:22:38 -0700
Subject: [PATCH] Hacked together a god-awful hand-written lexer.

---
 Makefile   |   2 +-
 src/io.c   |  45 ++++++-
 src/io.h   |   3 +
 src/lex.c  | 379 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/lex.h  |  64 +++++++++
 src/main.c |  46 ++++++-
 6 files changed, 536 insertions(+), 3 deletions(-)
 create mode 100644 src/lex.h

diff --git a/Makefile b/Makefile
index 841ab72..9584e56 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@ SHELL = /bin/sh
 CFLAGS = -std=c99 -pedantic -Wextra -Os
 LDFLAGS = -lc
 
-OBJECTS = asm.o io.o ir.o main.o x86encode.o
+OBJECTS = asm.o io.o ir.o lex.o main.o x86encode.o
 
 .PHONY: passc
 passc: .bin $(OBJECTS)
diff --git a/src/io.c b/src/io.c
index 314c6ea..aaea749 100644
--- a/src/io.c
+++ b/src/io.c
@@ -34,7 +34,6 @@ void close_files(void) {
     if (fclose(outfile) != 0) {
         fprintf(stderr, "failed to close output file: %s\n", strerror(errno));
         // NOTE: ideally we'd do this on any dirty exit
-        // TODO: use portable tempfiles and then just copy the entire file at the end?
         if (remove(outfile_name) != 0) {
             fprintf(stderr, "failed to remove output file; if it exists, it is corrupt: %s\n", strerror(errno));
         }
@@ -103,3 +102,47 @@ void patch_u32(size_t off, uint32_t x) {
 void patch_i32(size_t off, int32_t x) {
     patch_u32(off, (uint32_t) x);
 }
+
+// Number of bytes of lookahead the lexer may request from peek().
+#define MAX_LOOKAHEAD 4
+static size_t read_buf_len = 0;
+static char read_buf[MAX_LOOKAHEAD];
+
+// Makes up to *len upcoming bytes of the source file visible without
+// consuming them. If the file ends early, *len is lowered to the number of
+// bytes actually available (0 at end of file); bytes past *len are stale.
+// The returned buffer is static and is overwritten by later peek()/skip().
+char* peek(size_t* len) {
+    // the buffer holds exactly MAX_LOOKAHEAD bytes, so that many may be requested
+    if (*len > MAX_LOOKAHEAD) {
+        fprintf(stderr, "syntax error: maximum lookahead exceeded\n");
+        exit(1);
+    }
+    if (*len > read_buf_len) {
+        size_t inc = fread(read_buf + read_buf_len, 1, *len - read_buf_len, infile);
+        if (ferror(infile)) {
+            fprintf(stderr, "failed to read source file: %s\n", strerror(errno));
+            exit(1);
+        }
+        read_buf_len += inc;
+        *len = read_buf_len;
+    }
+    return read_buf;
+}
+
+// Consumes off bytes of input. Buffered bytes are dropped first; anything
+// beyond the buffer is skipped by seeking forward in the source file.
+void skip(size_t off) {
+    if (read_buf_len > off) {
+        memmove(read_buf, &read_buf[off], read_buf_len - off);
+        read_buf_len -= off;
+    } else {
+        if (read_buf_len < off) {
+            if (fseek(infile, (long) (off - read_buf_len), SEEK_CUR) != 0) {
+                fprintf(stderr, "failed to seek in source file: %s\n", strerror(errno));
+                exit(1);
+            }
+        }
+        read_buf_len = 0;
+    }
+}
diff --git a/src/io.h b/src/io.h
index 3f5dc11..b981f65 100644
--- a/src/io.h
+++ b/src/io.h
@@ -19,4 +19,7 @@ void patch(size_t off, const void* ptr, size_t count);
 void patch_u32(size_t off, uint32_t x);
 void patch_i32(size_t off, int32_t x);
 
+char* peek(size_t* len);
+void skip(size_t off);
+
 #endif
diff --git a/src/lex.c b/src/lex.c
index e69de29..18f835c 100644
--- a/src/lex.c
+++ b/src/lex.c
@@ -0,0 +1,379 @@
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "lex.h"
+#include "io.h"
+
+static _Bool is_whitespace(char c) {
+    return c == ' ' || c == '\t' || c == '\r' || c == '\n';
+}
+
+static _Bool is_alpha(char c) {
+    return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
+}
+
+static _Bool is_digit(char c) {
+    return c >= '0' && c <= '9';
+}
+
+// '-' only begins an integer when a digit follows; lex() checks that.
+static _Bool begins_integer(char c) {
+    return is_digit(c) || c == '-';
+}
+
+static _Bool id_char(char c) {
+    return is_alpha(c) || is_digit(c) || c == '_';
+}
+
+// Builds a token that carries no payload.
+static struct token simple(enum token_type type) {
+    struct token tok = { type, { 0 } };
+    return tok;
+}
+
+// Builds an operator token.
+static struct token op(enum operator_ op) {
+    union token_data data;
+    data.op = op;
+    struct token tok = { TOK_OPERATOR, data };
+    return tok;
+}
+
+// Maps a digit character to its numeric value. The result may be >= base;
+// callers must reject such characters (see is_extended_digit).
+static uint8_t digit_value(uint8_t base, char c) {
+    // TODO: sort out this mess
+
+    // restrict bases to avoid having to make decisions about how to handle
+    // upper vs. lower and base64. (letters before digits? seriously?)
+    if (base != 2 && base != 4 && base != 8 && base != 10 && base != 16) {
+        fprintf(stderr, "lexical error: illegal integer base (for now)\n");
+        exit(1);
+    }
+    // NOTE: the base-64 and base > 36 paths below are unreachable until the
+    // restriction above is lifted.
+    // who invented this???? why can't 0 be 0? screw you.
+    if (base == 64) {
+        if (is_digit(c)) return c - '0' + 52;
+        if (c >= 'A' && c <= 'Z') return c - 'A';
+        if (c >= 'a' && c <= 'z') return c - 'a' + 26;
+        if (c == '+') return 62;
+        // c == '/'
+        return 63;
+    }
+    if (is_digit(c)) return c - '0';
+    if (c >= 'A' && c <= 'Z') return c - 'A' + 10;
+    if (c >= 'a' && c <= 'z') {
+        if (base > 36) {
+            return c - 'a' + 36;
+        }
+        return c - 'a' + 10;
+    }
+    if (c == '+') return 62;
+    // c == '/'
+    return 63;
+}
+
+// Reports whether c is a valid digit in the given base.
+static _Bool is_extended_digit(uint8_t base, char c) {
+    if (!is_digit(c) && !is_alpha(c) && c != '+' && c != '/')
+        return false;
+    uint8_t val = digit_value(base, c);
+    // valid digit values are 0 .. base - 1
+    if (val >= base)
+        return false;
+    return true;
+}
+
+// Lexes a run of digits in the given base and returns its value. A ',' that
+// is directly followed by a digit is skipped as a separator. Exits on
+// overflow or when no digit is present.
+static uint64_t lex_digits(uint8_t base) {
+    uint64_t acc = 0;
+    char* buf;
+    size_t len;
+    _Bool at_least_one_char = false;
+    while (true) {
+        len = 1;
+        buf = peek(&len);
+        // at end of file the buffer only holds stale bytes; stop here
+        if (len == 0) break;
+        if (!is_extended_digit(base, buf[0])) {
+            // commas are legal digit separators
+            if (buf[0] == ',') {
+                len = 2;
+                buf = peek(&len);
+                if (len == 2 && is_extended_digit(base, buf[1])) {
+                    skip(1);
+                    continue;
+                }
+            }
+            break;
+        }
+        uint8_t digit = digit_value(base, buf[0]);
+        // (val * base + digit) > UINT64_MAX
+        if (acc > ((UINT64_MAX - digit) / base)) {
+            fprintf(stderr, "lexical error: integer literal overflow\n");
+            exit(1);
+        }
+        skip(1);
+        at_least_one_char = true;
+        acc *= base;
+        acc += digit;
+    }
+    if (!at_least_one_char) {
+        fprintf(stderr, "lexical error: expected digits\n");
+        exit(1);
+    }
+    return acc;
+}
+
+// Lexes an integer literal: optional '-', decimal digits, and optionally
+// '#' followed by digits in the base named by the leading number
+// (e.g. 16#DEADBEEF).
+static struct token lex_integer(void) {
+    _Bool sign = false;
+    char* buf;
+    size_t len = 1;
+    buf = peek(&len);
+    assert(len > 0 && begins_integer(buf[0]));
+    if (buf[0] == '-') {
+        sign = true;
+        skip(1);
+    }
+    uint64_t acc = lex_digits(10);
+    len = 1;
+    buf = peek(&len);
+    if (len == 1 && buf[0] == '#') {
+        if (acc > 64) {
+            fprintf(stderr, "lexical error: integer literal base too large\n");
+            exit(1);
+        }
+        skip(1);
+        acc = lex_digits((uint8_t) acc);
+    }
+    if (sign && acc > INT64_MAX) {
+        fprintf(stderr, "lexical error: integer literal overflow due to sign\n");
+        exit(1);
+    }
+    int64_t val = sign ? -(int64_t) acc : (int64_t) acc;
+    union token_data data;
+    data.int_ = val;
+    struct token tok = { TOK_INTEGER, data };
+    return tok;
+}
+
+#define MAX_STR_LEN 4096
+static size_t str_index;
+static char str_buf[MAX_STR_LEN];
+
+// Lexes a double-quoted string into the shared str_buf and returns it. The
+// result is only valid until the next string, label or name is lexed.
+static char* lex_string(void) {
+    char* buf;
+    size_t len = 1;
+    buf = peek(&len);
+    assert(len == 1 && buf[0] == '"');
+    skip(1);
+    str_index = 0;
+    while (true) {
+        if (str_index == MAX_STR_LEN - 1) {
+            fprintf(stderr, "lexical error: string too long\n");
+            exit(1);
+        }
+        len = 1;
+        buf = peek(&len);
+        if (len < 1) {
+            fprintf(stderr, "lexical error: unclosed string (reached end of file)\n");
+            exit(1);
+        }
+        // TODO: string escapes, multi-line strings
+        if (buf[0] == '"') {
+            skip(1);
+            break;
+        }
+        if (buf[0] == '\n') {
+            fprintf(stderr, "lexical error: unclosed string (reached end of line)\n");
+            exit(1);
+        }
+        str_buf[str_index] = buf[0];
+        str_index++;
+        skip(1);
+    }
+    str_buf[str_index] = 0;
+    return str_buf;
+}
+
+// Lexes a run of identifier characters into the shared str_buf and returns
+// it. The result is only valid until the next string, label or name is lexed.
+static char* lex_identifier(void) {
+    char* buf;
+    size_t len;
+    char c;
+    str_index = 0;
+    while (true) {
+        // str_buf is fixed-size; leave room for the terminating NUL
+        if (str_index == MAX_STR_LEN - 1) {
+            fprintf(stderr, "lexical error: identifier too long\n");
+            exit(1);
+        }
+        len = 1;
+        buf = peek(&len);
+        if (len == 0) break;
+        c = buf[0];
+        if (!id_char(c)) break;
+        skip(1);
+        str_buf[str_index] = c;
+        str_index++;
+    }
+    if (str_index == 0) {
+        fprintf(stderr, "lexical error: expected identifier (possibly illegal character?)\n");
+        exit(1);
+    }
+    str_buf[str_index] = 0;
+    return str_buf;
+}
+
+// Returns the next token of the source file, or TOK_EOF at end of input.
+// String payloads (name, label, string) point into a shared static buffer
+// that later calls may overwrite.
+struct token lex(void) {
+    char* buf;
+    size_t len;
+    len = 1;
+    buf = peek(&len);
+    if (len < 1) {
+        return simple(TOK_EOF);
+    }
+    char c = buf[0];
+    while (is_whitespace(c)) {
+        skip(1);
+        len = 1;
+        buf = peek(&len);
+        if (len == 0) {
+            return simple(TOK_EOF);
+        }
+        c = buf[0];
+    }
+    if (begins_integer(c)) {
+        len = 2;
+        buf = peek(&len);
+        if (is_digit(buf[0]) || (len > 1 && is_digit(buf[1])))
+            return lex_integer();
+    }
+    if (c == '"') {
+        union token_data data;
+        data.string = lex_string();
+        struct token tok = { TOK_STRING, data };
+        return tok;
+    }
+    if (c == '\'') {
+        skip(1);
+        union token_data data;
+        data.label = lex_identifier();
+        struct token tok = { TOK_LABEL, data };
+        return tok;
+    }
+    switch (c) {
+    case '{':
+        skip(1);
+        return simple(TOK_OPEN_BLOCK);
+    case '}':
+        skip(1);
+        return simple(TOK_CLOSE_BLOCK);
+    case '(':
+        skip(1);
+        return simple(TOK_OPEN_GROUP);
+    case ')':
+        skip(1);
+        return simple(TOK_CLOSE_GROUP);
+    case ';':
+        skip(1);
+        return simple(TOK_TERMINATOR);
+    case ',':
+        skip(1);
+        return simple(TOK_SEPARATOR);
+    case '=':
+        skip(1);
+        return op(OP_EQ);
+    case '+':
+        skip(1);
+        return op(OP_ADD);
+    case '-':
+        skip(1);
+        len = 1;
+        buf = peek(&len);
+        if (len == 1 && buf[0] == '>') {
+            skip(1);
+            return op(OP_FUN);
+        }
+        return op(OP_SUB);
+    case '*':
+        skip(1);
+        return op(OP_MUL);
+    case '/':
+        skip(1);
+        return op(OP_DIV);
+    case '%':
+        skip(1);
+        return op(OP_MOD);
+    case '~':
+        skip(1);
+        return op(OP_INV);
+    case '&':
+        skip(1);
+        return op(OP_AND);
+    case '|':
+        skip(1);
+        return op(OP_OR);
+    case '^':
+        skip(1);
+        return op(OP_XOR);
+    case '!':
+        skip(1);
+        len = 1;
+        buf = peek(&len);
+        if (len == 1 && buf[0] == '=') {
+            skip(1);
+            return op(OP_NE);
+        }
+        return op(OP_NOT);
+    case ':':
+        skip(1);
+        return op(OP_TYPE);
+    case '>':
+        skip(1);
+        len = 2;
+        buf = peek(&len);
+        if (len == 2 && buf[0] == '>' && buf[1] == '>') {
+            skip(2);
+            return op(OP_SHR);
+        } else if (len >= 1 && buf[0] == '>') {
+            skip(1);
+            return op(OP_SAR);
+        } else if (len >= 1 && buf[0] == '=') {
+            skip(1);
+            return op(OP_GTE);
+        }
+        return op(OP_GT);
+    case '<':
+        skip(1);
+        len = 1;
+        buf = peek(&len);
+        if (len == 1 && buf[0] == '<') {
+            skip(1);
+            return op(OP_SHL);
+        } else if (len == 1 && buf[0] == '=') {
+            skip(1);
+            return op(OP_LTE);
+        }
+        return op(OP_LT);
+    }
+    union token_data data;
+    data.name = lex_identifier();
+    struct token tok = { TOK_NAME, data };
+    return tok;
+}
diff --git a/src/lex.h b/src/lex.h
new file mode 100644
index 0000000..efd7ec6
--- /dev/null
+++ b/src/lex.h
@@ -0,0 +1,64 @@
+#ifndef LEX_H
+#define LEX_H
+
+#include <stdint.h>
+
+enum token_type {
+    TOK_NAME, // foo, bar_quux123, loop
+    TOK_LABEL, // 'my_loop
+    TOK_INTEGER, // -123, 16#DEADBEEF
+    TOK_STRING, // "..."
+    TOK_OPERATOR,
+    TOK_OPEN_GROUP, // (
+    TOK_CLOSE_GROUP, // )
+    TOK_OPEN_BLOCK, // {
+    TOK_CLOSE_BLOCK, // }
+    TOK_TERMINATOR, // ;
+    TOK_SEPARATOR, // ,
+    TOK_EOF, // end of file
+};
+
+enum operator_ {
+    OP_EQ, // =
+
+    OP_ADD, // +
+    OP_SUB, // -
+    OP_MUL, // *
+    OP_DIV, // /
+    OP_MOD, // %
+
+    OP_INV, // ~
+    OP_AND, // &
+    OP_OR, // |
+    OP_XOR, // ^
+    OP_SHL, // <<
+    OP_SAR, // >>
+    OP_SHR, // >>>
+
+    OP_NOT, // !
+    OP_GT, // >
+    OP_LT, // <
+    OP_GTE, // >=
+    OP_LTE, // <=
+    OP_NE, // !=
+
+    OP_TYPE, // :
+    OP_FUN, // ->
+};
+
+union token_data {
+    char* name;
+    char* label;
+    char* string;
+    int64_t int_;
+    enum operator_ op;
+};
+
+struct token {
+    enum token_type type;
+    union token_data data;
+};
+
+struct token lex(void);
+
+#endif
diff --git a/src/main.c b/src/main.c
index 8094d27..148fa7e 100644
--- a/src/main.c
+++ b/src/main.c
@@ -9,7 +9,7 @@
 
 #include "io.h"
 #include "ir.h"
-
+#include "lex.h"
 
 #define ELF_HEADER_SIZE 0xb0
 
@@ -76,6 +76,50 @@ int main(int argc, char** argv) {
     }
 
     open_files(argv[2], argv[1]);
+    struct token tok;
+    do {
+        tok = lex();
+        switch (tok.type) {
+        case TOK_NAME:
+            fprintf(stdout, "%s\n", tok.data.name);
+            break;
+        case TOK_LABEL:
+            fprintf(stdout, "'%s\n", tok.data.label);
+            break;
+        case TOK_INTEGER:
+            // cast so the argument type matches the conversion specifier
+            fprintf(stdout, "%lld\n", (long long) tok.data.int_);
+            break;
+        case TOK_STRING:
+            fprintf(stdout, "\"%s\"\n", tok.data.string);
+            break;
+        case TOK_OPEN_GROUP:
+            fprintf(stdout, "(\n");
+            break;
+        case TOK_CLOSE_GROUP:
+            fprintf(stdout, ")\n");
+            break;
+        case TOK_OPEN_BLOCK:
+            fprintf(stdout, "{\n");
+            break;
+        case TOK_CLOSE_BLOCK:
+            fprintf(stdout, "}\n");
+            break;
+        case TOK_TERMINATOR:
+            fprintf(stdout, ";\n");
+            break;
+        case TOK_SEPARATOR:
+            fprintf(stdout, ",\n");
+            break;
+        case TOK_OPERATOR:
+            fprintf(stdout, "OP: %i\n", tok.data.op);
+            break;
+        case TOK_EOF:
+            fprintf(stdout, "EOF\n");
+            break;
+        }
+    } while (tok.type != TOK_EOF);
+
     reserve(ELF_HEADER_SIZE);
     size_t entry_point = compile();
     write_elf((uint64_t) entry_point);