#include #include #include #include #include #include "lex.h" #include "io.h" static _Bool is_whitespace(char c) { return c == ' ' || c == '\t' || c == '\r' || c == '\n'; } static _Bool is_alpha(char c) { return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); } static _Bool is_digit(char c) { return c >= '0' && c <= '9'; } static _Bool begins_integer(char c) { return is_digit(c) || c == '-'; } static _Bool id_char(char c) { return is_alpha(c) || is_digit(c) || c == '_'; } static struct token simple(enum token_type type) { struct token tok = { type, 0 }; return tok; } static struct token op(enum operator_ op) { union token_data data; data.op = op; struct token tok = { TOK_OPERATOR, data }; return tok; } static uint8_t digit_value(uint8_t base, char c) { // TODO: sort out this mess // restrict bases to avoid having to make decisions about how to handle // upper vs. lower and base64. (letters before digits? seriously?) if (base != 2 && base != 4 && base != 8 && base != 10 && base != 16) { fprintf(stderr, "lexical error: illegal integer base (for now)\n"); exit(1); } // who invented this???? why can't 0 be 0? screw you. if (base == 64) { if (is_digit(c)) return c - '0' + 52; if (c >= 'A' && c <= 'Z') return c - 'A'; if (c >= 'a' && c <= 'Z') return c - 'a' + 26; if (c == '+') return 62; // c == '/' return 63; } if (is_digit(c)) return c - '0'; if (c >= 'A' && c <= 'Z') return c - 'A' + 10; if (c >= 'a' && c <= 'z') { if (base > 36) { return c - 'a' + 36; } return c - 'a' + 10; } if (c == '+') return 62; // c == '/' return 63; } static _Bool is_extended_digit(uint8_t base, char c) { if (!is_digit(c) && !is_alpha(c) && c != '+' && c != '/') return false; uint8_t val = digit_value(base, c); if (val > base) return false; return true; } static uint64_t lex_digits(uint8_t base) { uint64_t acc = 0; char* buf; size_t len; _Bool at_least_one_char = false; while (true) { len = 1; buf = peek(&len); if (!is_extended_digit(base, buf[0])) { // commas are legal digit separators if (buf[0] == ',') { len = 2; buf = peek(&len); if (len == 2 && is_extended_digit(base, buf[1])) { skip(1); continue; } } break; } uint8_t digit = digit_value(base, buf[0]); // (val * base + digit) > UINT64_MAX if (acc > ((UINT64_MAX - digit) / base)) { fprintf(stderr, "lexical error: integer literal overflow\n"); exit(1); } skip(1); at_least_one_char = true; acc *= base; acc += digit; } if (!at_least_one_char) { fprintf(stderr, "lexical error: expected digits\n"); exit(1); } return acc; } static struct token lex_integer(void) { _Bool sign = false; char* buf; size_t len = 1; buf = peek(&len); assert(len > 0 && begins_integer(buf[0])); if (buf[0] == '-') { sign = true; skip(1); } uint64_t acc = lex_digits(10); len = 1; buf = peek(&len); if (len == 1 && buf[0] == '#') { if (acc > 64) { fprintf(stderr, "lexical error: integer literal base too large\n"); exit(1); } skip(1); acc = lex_digits((uint8_t) acc); } if (sign && acc > INT64_MAX) { fprintf(stderr, "lexical error: integer literal overflow due to sign\n"); exit(1); } int64_t val = sign ? -(int64_t) acc : (int64_t) acc; union token_data data; data.int_ = val; struct token tok = { TOK_INTEGER, data }; return tok; } #define MAX_STR_LEN 4096 static size_t str_index; static char str_buf[MAX_STR_LEN]; static char* lex_string(void) { char* buf; size_t len = 1; buf = peek(&len); assert(len == 1 && buf[0] == '"'); skip(1); str_index = 0; while (true) { if (str_index == MAX_STR_LEN - 1) { fprintf(stderr, "lexical error: string too long\n"); exit(1); } len = 1; buf = peek(&len); if (len < 1) { fprintf(stderr, "lexical error: unclosed string (reached end of file)\n"); exit(1); } // TODO: string escapes, multi-line strings if (buf[0] == '"') { skip(1); break; } if (buf[0] == '\n') { fprintf(stderr, "lexical error: unclosed string (reached end of line)\n"); exit(1); } str_buf[str_index] = buf[0]; str_index++; skip(1); } str_buf[str_index] = 0; return str_buf; } static char* lex_identifier(void) { char* buf; size_t len; char c; str_index = 0; while (true) { len = 1; buf = peek(&len); if (len == 0) break; c = buf[0]; if (!is_alpha(c) && !is_digit(c) && c != '_') break; skip(1); str_buf[str_index] = c; str_index++; } if (str_index == 0) { fprintf(stderr, "lexical error: expected identifier (possibly illegal character?)\n"); exit(1); } str_buf[str_index] = 0; return str_buf; } struct token lex(void) { char* buf; size_t len; len = 1; buf = peek(&len); if (len < 1) { return simple(TOK_EOF); } char c = buf[0]; while (is_whitespace(c)) { skip(1); len = 1; peek(&len); if (len == 0) { return simple(TOK_EOF); } c = buf[0]; } if (begins_integer(c)) { len = 2; peek(&len); if (is_digit(buf[0]) || (len > 1 && is_digit(buf[1]))) return lex_integer(); } if (c == '"') { union token_data data; data.string = lex_string(); struct token tok = { TOK_STRING, data }; return tok; } if (c == '\'') { skip(1); union token_data data; data.label = lex_identifier(); struct token tok = { TOK_LABEL, data }; return tok; } switch (c) { case '{': skip(1); return simple(TOK_OPEN_BLOCK); case '}': skip(1); return simple(TOK_CLOSE_BLOCK); case '(': skip(1); return simple(TOK_OPEN_GROUP); case ')': skip(1); return simple(TOK_CLOSE_GROUP); case ';': skip(1); return simple(TOK_TERMINATOR); case ',': skip(1); return simple(TOK_SEPARATOR); case '=': skip(1); return op(OP_EQ); case '+': skip(1); return op(OP_ADD); case '-': skip(1); len = 1; buf = peek(&len); if (len == 1 && buf[0] == '>') { skip(1); return op(OP_FUN); } return op(OP_SUB); case '*': skip(1); return op(OP_MUL); case '/': skip(1); return op(OP_DIV); case '%': skip(1); return op(OP_MOD); case '~': skip(1); return op(OP_INV); case '&': skip(1); return op(OP_AND); case '|': skip(1); return op(OP_OR); case '^': skip(1); return op(OP_XOR); case '!': skip(1); len = 1; buf = peek(&len); if (len == 1 && buf[0] == '=') { skip(1); return op(OP_NE); } return op(OP_NOT); case ':': skip(1); return op(OP_TYPE); case '>': skip(1); len = 2; buf = peek(&len); if (len == 2 && buf[0] == '>' && buf[1] == '>') { skip(2); return op(OP_SHR); } else if (len >= 1 && buf[0] == '>') { skip(1); return op(OP_SAR); } else if (len >= 1 && buf[0] == '=') { skip(1); return op(OP_GTE); } return op(OP_GT); case '<': skip(1); len = 1; buf = peek(&len); if (len == 1 && buf[0] == '<') { skip(1); return op(OP_SHL); } else if (len == 1 && buf[0] == '=') { skip(1); return op(OP_LTE); } return op(OP_LT); } union token_data data; data.name = lex_identifier(); struct token tok = { TOK_NAME, data }; return tok; }