507 lines
12 KiB
C
507 lines
12 KiB
C
#include <assert.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#include "lex.h"
|
|
#include "lex/indent.h"
|
|
#include "io.h"
|
|
|
|
_Bool is_unary(enum operator_ op) {
|
|
return op == OP_SUB
|
|
|| op == OP_INV
|
|
|| op == OP_NOT;
|
|
}
|
|
|
|
_Bool is_binary(enum operator_ op) {
|
|
return op == OP_EQ
|
|
|| op == OP_ADD
|
|
|| op == OP_SUB
|
|
|| op == OP_MUL
|
|
|| op == OP_DIV
|
|
|| op == OP_MOD
|
|
|| op == OP_AND
|
|
|| op == OP_OR
|
|
|| op == OP_XOR
|
|
|| op == OP_SHL
|
|
|| op == OP_SAR
|
|
|| op == OP_SHR
|
|
|| op == OP_GT
|
|
|| op == OP_LT
|
|
|| op == OP_GTE
|
|
|| op == OP_LTE
|
|
|| op == OP_NE
|
|
|| op == OP_TYPE
|
|
|| op == OP_FUN;
|
|
}
|
|
|
|
_Bool is_lit(struct token tok) {
|
|
return tok.type == TOK_INTEGER || tok.type == TOK_STRING || tok.type == TOK_NAME;
|
|
}
|
|
|
|
// True for the unaccented Latin letters 'A'-'Z' and 'a'-'z' only.
static _Bool is_alpha(char c) {
    if (c >= 'a' && c <= 'z') {
        return 1;
    }
    return c >= 'A' && c <= 'Z';
}
|
|
|
|
// True for the decimal digits '0'-'9'.
static _Bool is_digit(char c) {
    if (c < '0') {
        return 0;
    }
    return c <= '9';
}
|
|
|
|
// True when `c` is a letter (per is_alpha) or a decimal digit (per is_digit).
static _Bool is_alphanumeric(char c) {
    if (is_alpha(c)) {
        return 1;
    }
    return is_digit(c);
}
|
|
|
|
// True when `c` may appear in an identifier: a letter, a digit or '_'.
// TODO: allow unicode identifiers
static _Bool is_id_char(char c) {
    if (c == '_') {
        return 1;
    }
    return is_alphanumeric(c);
}
|
|
|
|
static struct token simple(enum token_type type) {
|
|
struct token tok = { type, 0 };
|
|
return tok;
|
|
}
|
|
|
|
static struct token op(enum operator_ op) {
|
|
union token_data data;
|
|
data.op = op;
|
|
struct token tok = { TOK_OPERATOR, data };
|
|
return tok;
|
|
}
|
|
|
|
// Maps a digit character to its numeric value:
//   '0'-'9' -> 0-9, 'A'-'Z' and 'a'-'z' -> 10-35.
// Passing any other character is a programming error and terminates the
// process; callers are expected to filter input first (see is_digit_in).
static uint8_t digit_value(char c) {
    if (c >= '0' && c <= '9') { return (uint8_t) (c - '0'); }
    if (c >= 'A' && c <= 'Z') { return (uint8_t) (c - 'A' + 0xA); }
    if (c >= 'a' && c <= 'z') { return (uint8_t) (c - 'a' + 0xA); }
    // assert() compiles to nothing under NDEBUG, which would let control run
    // off the end of a non-void function. abort() keeps the failure explicit
    // in every build configuration.
    assert(0 && "digit_value: character is not alphanumeric");
    abort();
}
|
|
|
|
// True when `c` is a valid digit in the given `base`, i.e. it is alphanumeric
// and its digit_value() is below `base`.
// The alphanumeric test comes first because digit_value() rejects anything else.
static _Bool is_digit_in(uint8_t base, char c) {
    return is_alphanumeric(c) && digit_value(c) < base;
}
|
|
|
|
// Consumes a run of digits valid in `base` from the input and returns their
// value. Stops (without consuming) at the first character that is not a digit.
// Exits with a lexical error if the value does not fit in 64 bits.
static uint64_t lex_digits(uint8_t base) {
    uint64_t value = 0;
    for (;;) {
        char ch = peekc();
        if (!is_digit_in(base, ch)) {
            // commas are legal digit separators
            // NOTE(review): the second peekc() presumably yields the same ','
            // that was just examined, so this condition can never hold and
            // separators are never actually skipped. Fixing it needs a
            // two-character lookahead - confirm what io.h offers.
            if (ch != ',' || !is_digit_in(base, peekc())) {
                break;
            }
            nextc();
            continue;
        }
        nextc();
        uint8_t d = digit_value(ch);
        // reject before multiplying: (value * base + d) > UINT64_MAX
        if (value > ((UINT64_MAX - d) / base)) {
            fprintf(stderr, "lexical error: integer literal overflow\n");
            exit(1);
        }
        value = value * base + d;
    }
    return value;
}
|
|
|
|
static struct token integer_tok(uint64_t integer) {
|
|
union token_data data;
|
|
data.int_ = integer;
|
|
struct token tok = { TOK_INTEGER, data };
|
|
return tok;
|
|
}
|
|
|
|
// Lexes an integer literal; the next input character must be its first digit.
//
// sign: true when the caller already consumed a leading '-', in which case the
//       resulting value is negated.
//
// Recognised forms: decimal digits, "0b" + binary digits, "0x" + hex digits.
// A '0' (optionally followed by a base prefix) that is not followed by any
// identifier character lexes as zero.
//
// Exits with a lexical error when the digits do not match the base, when an
// identifier character directly follows the literal, or when the value is out
// of range.
static struct token lex_integer(_Bool sign) {
    uint8_t base = 10;
    if (peekc() == '0') {
        nextc();
        if (peekc() == 'b') {
            base = 2;
            nextc();
        } else if (peekc() == 'x') {
            base = 16;
            nextc();
        }
        // nothing digit-like follows: the literal is just zero.
        // (is_id_char() already covers digits, so one test is enough.)
        if (!is_id_char(peekc())) {
            return integer_tok(0);
        }
    }
    if (!is_digit_in(base, peekc())) {
        fprintf(stderr, "lexical error: expected base-%i digits\n", base);
        exit(1);
    }
    // parse in the base selected by the prefix, not unconditionally base 10
    uint64_t acc = lex_digits(base);
    if (is_id_char(peekc())) {
        fprintf(stderr, "lexical error: must put space between integer and following identifier\n");
        exit(1);
    }
    if (!sign) {
        return integer_tok(acc);
    }
    // the most negative int64 has magnitude INT64_MAX + 1, so allow exactly that
    const uint64_t max_magnitude = (uint64_t) INT64_MAX + 1;
    if (acc > max_magnitude) {
        fprintf(stderr, "lexical error: signed integer literal overflow\n");
        exit(1);
    }
    // negate in unsigned arithmetic: always well defined, and it produces the
    // two's complement bit pattern of the negative value
    return integer_tok((uint64_t) 0 - acc);
}
|
|
|
|
// capacity of each string buffer, including the terminating NUL
#define MAX_STR_LEN 4096

// write position inside the buffer currently being filled
static size_t str_index;

// alternate string buffers between tokens to prevent overwriting buffer.
// we're LL(1) so 2 buffers is sufficient.
static _Bool which_buf = false;
static char str_buf_1[MAX_STR_LEN];
static char str_buf_2[MAX_STR_LEN];

// Flips the active buffer and returns it: the first call yields str_buf_1,
// the next str_buf_2, and so on.
static char* str_buf(void) {
    which_buf = !which_buf;
    return which_buf ? str_buf_1 : str_buf_2;
}
|
|
|
|
static char* lex_string(void) {
|
|
// TODO: string escapes, multi-line strings, no length limit on strings
|
|
str_index = 0;
|
|
char* buf = str_buf();
|
|
while (true) {
|
|
char c = nextc();
|
|
if (str_index == MAX_STR_LEN - 1) {
|
|
fprintf(stderr, "lexical error: string too long\n");
|
|
exit(1);
|
|
}
|
|
if (c == 0) {
|
|
fprintf(stderr, "lexical error: unclosed string (reached end of file)\n");
|
|
exit(1);
|
|
}
|
|
if (c == '"') {
|
|
break;
|
|
}
|
|
if (c == '\n') {
|
|
fprintf(stderr, "lexical error: unclosed string (reached end of line)\n");
|
|
exit(1);
|
|
}
|
|
buf[str_index] = c;
|
|
str_index++;
|
|
}
|
|
buf[str_index] = 0;
|
|
return buf;
|
|
}
|
|
|
|
static char* lex_identifier(void) {
|
|
str_index = 0;
|
|
char* buf = str_buf();
|
|
while (true) {
|
|
char c = peekc();
|
|
if (!is_id_char(c)) break;
|
|
nextc();
|
|
buf[str_index] = c;
|
|
str_index++;
|
|
}
|
|
if (str_index == 0) {
|
|
fprintf(stderr, "lexical error: expected identifier (possibly illegal character?)\n");
|
|
exit(1);
|
|
}
|
|
buf[str_index] = 0;
|
|
return buf;
|
|
}
|
|
|
|
static uint32_t indent_level = 0;
|
|
static uint32_t pending_level = 0;
|
|
static _Bool level_is_block[MAX_INDENTS] = {true};
|
|
// going back to a previous indentation level.
|
|
// if we're going back, then we insert a terminator.
|
|
static _Bool going_back = false;
|
|
|
|
static struct token lex(void) {
|
|
char c = peekc();
|
|
if (is_newline(c)) {
|
|
indent_level = lex_indentation();
|
|
if (indent_level <= pending_level) {
|
|
going_back = true;
|
|
}
|
|
}
|
|
while (indent_level > pending_level) {
|
|
pending_level++;
|
|
if (level_is_block[pending_level]) {
|
|
return simple(TOK_OPEN_BLOCK);
|
|
}
|
|
}
|
|
while (indent_level < pending_level) {
|
|
_Bool was_block = level_is_block[pending_level];
|
|
level_is_block[pending_level] = false;
|
|
pending_level--;
|
|
if (was_block) {
|
|
return simple(TOK_CLOSE_BLOCK);
|
|
}
|
|
}
|
|
if (going_back) {
|
|
going_back = false;
|
|
if (level_is_block[indent_level]) {
|
|
return simple(TOK_TERMINATOR);
|
|
}
|
|
}
|
|
c = peekc();
|
|
while (is_indent(c)) {
|
|
nextc();
|
|
c = peekc();
|
|
}
|
|
_Bool sign = false;
|
|
switch (c) {
|
|
case 0:
|
|
nextc();
|
|
return simple(TOK_EOF);
|
|
case '"': {
|
|
nextc();
|
|
union token_data data;
|
|
data.string = lex_string();
|
|
struct token tok = { TOK_STRING, data };
|
|
return tok;
|
|
}
|
|
case '\'': {
|
|
nextc();
|
|
union token_data data;
|
|
data.label = lex_identifier();
|
|
struct token tok = { TOK_LABEL, data };
|
|
return tok;
|
|
}
|
|
case ':':
|
|
nextc();
|
|
while (is_indent(peekc())) {
|
|
nextc();
|
|
}
|
|
if (is_newline(peekc())) {
|
|
level_is_block[indent_level + 1] = true;
|
|
return lex();
|
|
}
|
|
return op(OP_TYPE);
|
|
case '{':
|
|
nextc();
|
|
return simple(TOK_OPEN_BLOCK);
|
|
case '}':
|
|
nextc();
|
|
return simple(TOK_CLOSE_BLOCK);
|
|
case '(':
|
|
nextc();
|
|
return simple(TOK_OPEN_GROUP);
|
|
case ')':
|
|
nextc();
|
|
return simple(TOK_CLOSE_GROUP);
|
|
case ';':
|
|
nextc();
|
|
return simple(TOK_TERMINATOR);
|
|
case ',':
|
|
nextc();
|
|
return simple(TOK_SEPARATOR);
|
|
case '=':
|
|
nextc();
|
|
return simple(TOK_EQUALS);
|
|
case '-':
|
|
nextc();
|
|
if (peekc() == '>') {
|
|
nextc();
|
|
return op(OP_FUN);
|
|
}
|
|
if (is_digit(peekc())) {
|
|
return lex_integer(true);
|
|
}
|
|
return op(OP_SUB);
|
|
case '+':
|
|
nextc();
|
|
return op(OP_ADD);
|
|
case '*':
|
|
nextc();
|
|
return op(OP_MUL);
|
|
case '/':
|
|
nextc();
|
|
return op(OP_DIV);
|
|
case '%':
|
|
nextc();
|
|
return op(OP_MOD);
|
|
case '~':
|
|
nextc();
|
|
return op(OP_INV);
|
|
case '&':
|
|
nextc();
|
|
return op(OP_AND);
|
|
case '|':
|
|
nextc();
|
|
return op(OP_OR);
|
|
case '^':
|
|
nextc();
|
|
return op(OP_XOR);
|
|
case '!':
|
|
nextc();
|
|
if (peekc() == '=') {
|
|
nextc();
|
|
return op(OP_NE);
|
|
}
|
|
return op(OP_NOT);
|
|
case '>':
|
|
nextc();
|
|
c = peekc();
|
|
if (c == '=') {
|
|
nextc();
|
|
return op(OP_GTE);
|
|
}
|
|
if (c == '>') {
|
|
nextc();
|
|
if (peekc() == '>') {
|
|
nextc();
|
|
return op(OP_SHR);
|
|
}
|
|
return op(OP_SAR);
|
|
}
|
|
return op(OP_GT);
|
|
case '<':
|
|
nextc();
|
|
c = peekc();
|
|
if (c == '<') {
|
|
nextc();
|
|
return op(OP_SHL);
|
|
}
|
|
if (c == '=') {
|
|
nextc();
|
|
return op(OP_LTE);
|
|
}
|
|
return op(OP_LT);
|
|
}
|
|
if (is_digit(c)) {
|
|
return lex_integer(false);
|
|
}
|
|
char* name = lex_identifier();
|
|
if (strcmp(name, "if") == 0) {
|
|
return simple(TOK_IF);
|
|
}
|
|
if (strcmp(name, "else") == 0) {
|
|
return simple(TOK_ELSE);
|
|
}
|
|
if (strcmp(name, "match") == 0) {
|
|
return simple(TOK_MATCH);
|
|
}
|
|
if (strcmp(name, "case") == 0) {
|
|
return simple(TOK_CASE);
|
|
}
|
|
if (strcmp(name, "loop") == 0) {
|
|
return simple(TOK_LOOP);
|
|
}
|
|
if (strcmp(name, "fn") == 0) {
|
|
return simple(TOK_FN);
|
|
}
|
|
if (strcmp(name, "next") == 0) {
|
|
return simple(TOK_NEXT);
|
|
}
|
|
if (strcmp(name, "exit") == 0) {
|
|
return simple(TOK_EXIT);
|
|
}
|
|
if (strcmp(name, "recurse") == 0) {
|
|
return simple(TOK_RECURSE);
|
|
}
|
|
if (strcmp(name, "return") == 0) {
|
|
return simple(TOK_RETURN);
|
|
}
|
|
|
|
union token_data data;
|
|
data.name = name;
|
|
struct token tok = { TOK_NAME, data };
|
|
return tok;
|
|
}
|
|
|
|
static _Bool init = false;
|
|
static struct token peek_buf;
|
|
|
|
struct token next(void) {
|
|
if (!init) {
|
|
init = true;
|
|
indent_level = lex_indentation();
|
|
next();
|
|
}
|
|
struct token tmp = peek_buf;
|
|
peek_buf = lex();
|
|
return tmp;
|
|
}
|
|
|
|
struct token peek(void) {
|
|
return peek_buf;
|
|
}
|
|
|
|
void print_token(struct token tok) {
|
|
switch (tok.type) {
|
|
case TOK_NAME:
|
|
fprintf(stdout, "%s", tok.data.name);
|
|
break;
|
|
case TOK_LABEL:
|
|
fprintf(stdout, "'%s", tok.data.label);
|
|
break;
|
|
case TOK_INTEGER:
|
|
fprintf(stdout, "%zi", tok.data.int_);
|
|
break;
|
|
case TOK_STRING:
|
|
fprintf(stdout, "\"%s\"", tok.data.string);
|
|
break;
|
|
case TOK_OPEN_GROUP:
|
|
fprintf(stdout, "(");
|
|
break;
|
|
case TOK_CLOSE_GROUP:
|
|
fprintf(stdout, ")");
|
|
break;
|
|
case TOK_OPEN_BLOCK:
|
|
fprintf(stdout, "{");
|
|
break;
|
|
case TOK_CLOSE_BLOCK:
|
|
fprintf(stdout, "}");
|
|
break;
|
|
case TOK_TERMINATOR:
|
|
fprintf(stdout, ";");
|
|
break;
|
|
case TOK_SEPARATOR:
|
|
fprintf(stdout, ",");
|
|
break;
|
|
case TOK_OPERATOR:
|
|
// TODO: printing for operators
|
|
fprintf(stdout, "OP:%i", tok.data.op);
|
|
break;
|
|
case TOK_EOF:
|
|
fprintf(stdout, "<EOF>");
|
|
break;
|
|
case TOK_CASE:
|
|
fprintf(stdout, "case");
|
|
break;
|
|
case TOK_ELSE:
|
|
fprintf(stdout, "else");
|
|
break;
|
|
case TOK_EQUALS:
|
|
fprintf(stdout, "=");
|
|
break;
|
|
case TOK_EXIT:
|
|
fprintf(stdout, "exit");
|
|
break;
|
|
case TOK_FN:
|
|
fprintf(stdout, "fn");
|
|
break;
|
|
case TOK_IF:
|
|
fprintf(stdout, "if");
|
|
break;
|
|
case TOK_LOOP:
|
|
fprintf(stdout, "loop");
|
|
break;
|
|
case TOK_NEXT:
|
|
fprintf(stdout, "next");
|
|
break;
|
|
case TOK_RETURN:
|
|
fprintf(stdout, "return");
|
|
break;
|
|
case TOK_RECURSE:
|
|
fprintf(stdout, "recurse");
|
|
break;
|
|
case TOK_MATCH:
|
|
fprintf(stdout, "match");
|
|
break;
|
|
}
|
|
}
|