#include #include #include #include #include "lex.h" #include "lex/indent.h" #include "io.h" _Bool is_unary(enum operator_ op) { return op == OP_SUB || op == OP_INV || op == OP_NOT; } _Bool is_binary(enum operator_ op) { return op == OP_EQ || op == OP_ADD || op == OP_SUB || op == OP_MUL || op == OP_DIV || op == OP_MOD || op == OP_AND || op == OP_OR || op == OP_XOR || op == OP_SHL || op == OP_SAR || op == OP_SHR || op == OP_GT || op == OP_LT || op == OP_GTE || op == OP_LTE || op == OP_NE || op == OP_TYPE || op == OP_FUN; } _Bool is_lit(struct token tok) { return tok.type == TOK_INTEGER || tok.type == TOK_STRING || tok.type == TOK_NAME; } static _Bool is_alpha(char c) { return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); } static _Bool is_digit(char c) { return c >= '0' && c <= '9'; } static _Bool is_alphanumeric(char c) { return is_alpha(c) || is_digit(c); } static _Bool is_id_char(char c) { return is_alpha(c) || is_digit(c) || c == '_'; } static struct token simple(enum token_type type) { struct token tok = { type, 0 }; return tok; } static struct token op(enum operator_ op) { union token_data data; data.op = op; struct token tok = { TOK_OPERATOR, data }; return tok; } static uint8_t digit_value(uint8_t base, char c) { assert(base <= 36); if (is_digit(c)) { return c - '0'; } if (c >= 'A' && c <= 'Z') { return c - 'A' + 0xA; } if (c >= 'a' && c <= 'z') { return c - 'a' + 0xA; } assert(0); } static _Bool is_extended_digit(uint8_t base, char c) { if (!is_alphanumeric(c)) { return false; } uint8_t val = digit_value(base, c); return val < base; } static uint64_t lex_digits(uint8_t base) { _Bool at_least_one_char = false; uint64_t acc = 0; while (true) { char c = peekc(); if (!is_extended_digit(base, c)) { // commas are legal digit separators if (c == ',' && is_extended_digit(base, peekc())) { nextc(); continue; } break; } nextc(); uint8_t digit = digit_value(base, c); // (val * base + digit) > UINT64_MAX if (acc > ((UINT64_MAX - digit) / base)) { fprintf(stderr, "lexical error: integer literal overflow\n"); exit(1); } at_least_one_char = true; acc *= base; acc += digit; } if (!at_least_one_char) { fprintf(stderr, "lexical error: expected digits\n"); exit(1); } return acc; } static struct token lex_integer(_Bool sign) { uint64_t acc = lex_digits(10); if (peekc() == '#') { if (acc != 2 || acc != 8 && acc != 10 && acc != 16) { fprintf(stderr, "lexical error: illegal integer literal base\n"); exit(1); } nextc(); acc = lex_digits((uint8_t) acc); } if (sign && acc > INT64_MAX) { fprintf(stderr, "lexical error: signed integer literal overflow\n"); exit(1); } int64_t val = (int64_t) acc; if (sign) { val = -val; } union token_data data; data.int_ = val; struct token tok = { TOK_INTEGER, data }; return tok; } #define MAX_STR_LEN 4096 static size_t str_index; // alternate string buffers between tokens to prevent overwriting buffer. // we're LL(1) so 2 buffers is sufficient. static _Bool which_buf = false; static char str_buf_1[MAX_STR_LEN]; static char str_buf_2[MAX_STR_LEN]; static char* str_buf(void) { which_buf = !which_buf; if (which_buf) { return str_buf_1; } return str_buf_2; } static char* lex_string(void) { str_index = 0; char* buf = str_buf(); while (true) { char c = nextc(); if (str_index == MAX_STR_LEN - 1) { fprintf(stderr, "lexical error: string too long\n"); exit(1); } if (c == 0) { fprintf(stderr, "lexical error: unclosed string (reached end of file)\n"); exit(1); } // TODO: string escapes, multi-line strings if (c == '"') { break; } if (c == '\n') { fprintf(stderr, "lexical error: unclosed string (reached end of line)\n"); exit(1); } buf[str_index] = c; str_index++; } buf[str_index] = 0; return buf; } static char* lex_identifier(void) { str_index = 0; char* buf = str_buf(); while (true) { char c = peekc(); if (!is_id_char(c)) break; nextc(); buf[str_index] = c; str_index++; } if (str_index == 0) { fprintf(stderr, "lexical error: expected identifier (possibly illegal character?)\n"); exit(1); } buf[str_index] = 0; return buf; } static uint32_t indent_level = 0; static uint32_t pending_level = 0; static _Bool level_is_block[MAX_INDENTS] = {true}; // going back to a previous indentation level. // if we're going back, then we insert a terminator. static _Bool going_back = false; static struct token lex(void) { char c = peekc(); if (is_newline(c)) { indent_level = lex_indentation(); if (indent_level <= pending_level) { going_back = true; } } while (indent_level > pending_level) { pending_level++; if (level_is_block[pending_level]) { return simple(TOK_OPEN_BLOCK); } } while (indent_level < pending_level) { _Bool was_block = level_is_block[pending_level]; level_is_block[pending_level] = false; pending_level--; if (was_block) { return simple(TOK_CLOSE_BLOCK); } } if (going_back) { going_back = false; if (level_is_block[indent_level]) { return simple(TOK_TERMINATOR); } } c = peekc(); while (is_indent(c)) { nextc(); c = peekc(); } _Bool sign = false; switch (c) { case 0: nextc(); return simple(TOK_EOF); case '"': { nextc(); union token_data data; data.string = lex_string(); struct token tok = { TOK_STRING, data }; return tok; } case '\'': { nextc(); union token_data data; data.label = lex_identifier(); struct token tok = { TOK_LABEL, data }; return tok; } case '{': nextc(); return simple(TOK_OPEN_BLOCK); case '}': nextc(); return simple(TOK_CLOSE_BLOCK); case '(': nextc(); return simple(TOK_OPEN_GROUP); case ')': nextc(); return simple(TOK_CLOSE_GROUP); case ';': nextc(); return simple(TOK_TERMINATOR); case ',': nextc(); return simple(TOK_SEPARATOR); case '=': nextc(); return op(OP_EQ); case '+': nextc(); return op(OP_ADD); case '-': nextc(); if (peekc() == '>') { nextc(); return op(OP_FUN); } if (is_digit(peekc())) { return lex_integer(true); } return op(OP_SUB); case '*': nextc(); return op(OP_MUL); case '/': nextc(); return op(OP_DIV); case '%': nextc(); return op(OP_MOD); case '~': nextc(); return op(OP_INV); case '&': nextc(); return op(OP_AND); case '|': nextc(); return op(OP_OR); case '^': nextc(); return op(OP_XOR); case '!': nextc(); if (peekc() == '=') { nextc(); return op(OP_NE); } return op(OP_NOT); case ':': nextc(); while (is_indent(peekc())) { nextc(); } if (is_newline(peekc())) { level_is_block[indent_level + 1] = true; return lex(); } return op(OP_TYPE); case '>': nextc(); c = peekc(); if (c == '=') { nextc(); return op(OP_GTE); } if (c == '>') { nextc(); if (peekc() == '>') { nextc(); return op(OP_SHR); } return op(OP_SAR); } return op(OP_GT); case '<': nextc(); c = peekc(); if (c == '<') { nextc(); return op(OP_SHL); } if (c == '=') { nextc(); return op(OP_LTE); } return op(OP_LT); } if (is_digit(c)) { return lex_integer(false); } union token_data data; data.name = lex_identifier(); struct token tok = { TOK_NAME, data }; return tok; } static _Bool init = false; static struct token peek_buf; struct token next(void) { if (!init) { init = true; indent_level = lex_indentation(); next(); } struct token tmp = peek_buf; peek_buf = lex(); return tmp; } struct token peek(void) { return peek_buf; } void print_token(struct token tok) { switch (tok.type) { case TOK_NAME: fprintf(stdout, "%s", tok.data.name); break; case TOK_LABEL: fprintf(stdout, "'%s", tok.data.label); break; case TOK_INTEGER: fprintf(stdout, "%zi", tok.data.int_); break; case TOK_STRING: fprintf(stdout, "\"%s\"", tok.data.string); break; case TOK_OPEN_GROUP: fprintf(stdout, "("); break; case TOK_CLOSE_GROUP: fprintf(stdout, ")"); break; case TOK_OPEN_BLOCK: fprintf(stdout, "{"); break; case TOK_CLOSE_BLOCK: fprintf(stdout, "}"); break; case TOK_TERMINATOR: fprintf(stdout, ";"); break; case TOK_SEPARATOR: fprintf(stdout, ","); break; case TOK_OPERATOR: fprintf(stdout, "OP:%i", tok.data.op); break; case TOK_EOF: fprintf(stdout, "EOF"); break; } }