pass-lang/src/lex.c

378 lines
9.2 KiB
C

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "lex.h"
#include "io.h"
/* Tells whether `op` is one of the operators accepted in prefix (unary)
 * position: OP_SUB, OP_INV or OP_NOT. */
_Bool is_unary(enum operator_ op) {
    switch (op) {
    case OP_SUB:
    case OP_INV:
    case OP_NOT:
        return true;
    default:
        return false;
    }
}
/* Tells whether `op` is one of the operators accepted in infix (binary)
 * position. OP_SUB appears here as well as in is_unary(). */
_Bool is_binary(enum operator_ op) {
    switch (op) {
    case OP_EQ:
    case OP_ADD:
    case OP_SUB:
    case OP_MUL:
    case OP_DIV:
    case OP_MOD:
    case OP_AND:
    case OP_OR:
    case OP_XOR:
    case OP_SHL:
    case OP_SAR:
    case OP_SHR:
    case OP_GT:
    case OP_LT:
    case OP_GTE:
    case OP_LTE:
    case OP_NE:
    case OP_TYPE:
    case OP_FUN:
        return true;
    default:
        return false;
    }
}
/* Tells whether `tok` is an integer, string or name token. */
_Bool is_lit(struct token tok) {
    switch (tok.type) {
    case TOK_INTEGER:
    case TOK_STRING:
    case TOK_NAME:
        return true;
    default:
        return false;
    }
}
/* True for the four characters the lexer skips between tokens:
 * space, tab, carriage return and line feed. */
static _Bool is_whitespace(char c) {
    switch (c) {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
        return true;
    default:
        return false;
    }
}
/* True for ASCII letters 'A'-'Z' and 'a'-'z' only. */
static _Bool is_alpha(char c) {
    _Bool upper = 'A' <= c && c <= 'Z';
    _Bool lower = 'a' <= c && c <= 'z';
    return upper || lower;
}
/* True for the ASCII decimal digits '0'-'9'. */
static _Bool is_digit(char c) {
    if (c < '0') {
        return false;
    }
    return c <= '9';
}
/* True for characters allowed inside an identifier: ASCII letters,
 * decimal digits and the underscore. */
static _Bool id_char(char c) {
    if (c == '_') {
        return true;
    }
    return is_alpha(c) || is_digit(c);
}
static struct token simple(enum token_type type) {
struct token tok = { type, 0 };
return tok;
}
static struct token op(enum operator_ op) {
union token_data data;
data.op = op;
struct token tok = { TOK_OPERATOR, data };
return tok;
}
/*
 * Maps an ASCII alphanumeric character to its value as a digit:
 * '0'-'9' give 0-9, and 'A'-'Z' / 'a'-'z' give 10-35 (case-insensitive).
 *
 * `base` is only sanity-checked (it must not exceed 36); the result is NOT
 * compared against it, so callers must test `result < base` themselves.
 *
 * Passing a non-alphanumeric character is a caller bug: it trips an assertion
 * in debug builds. When assertions are compiled out (NDEBUG) the function
 * returns UINT8_MAX, which is never a valid digit in any supported base,
 * instead of falling off the end of a non-void function.
 */
static uint8_t digit_value(uint8_t base, char c) {
    assert(base <= 36);
    if (c >= '0' && c <= '9') { return (uint8_t) (c - '0'); }
    if (c >= 'A' && c <= 'Z') { return (uint8_t) (c - 'A' + 0xA); }
    if (c >= 'a' && c <= 'z') { return (uint8_t) (c - 'a' + 0xA); }
    assert(0 && "digit_value: character is not alphanumeric");
    return UINT8_MAX;
}
/* True when `c` is an alphanumeric character whose digit value is valid in
 * the given base (i.e. strictly less than `base`). */
static _Bool is_extended_digit(uint8_t base, char c) {
    _Bool alnum = is_digit(c) || is_alpha(c);
    /* digit_value() is only called for alphanumerics, as it requires */
    return alnum && digit_value(base, c) < base;
}
// Reads a run of digits in the given base from the input and returns their
// value as an unsigned 64-bit integer.
//
// Characters are examined with peekc() and consumed with nextc(). Reading
// stops at the first character that is_extended_digit() rejects for `base`;
// the loop breaks before calling nextc() on that character.
//
// Prints a "lexical error" message to stderr and exits with status 1 if the
// value would exceed UINT64_MAX, or if no digit at all was consumed.
static uint64_t lex_digits(uint8_t base) {
// set once any digit has been consumed, so an empty run can be rejected
_Bool at_least_one_char = false;
uint64_t acc = 0;
while (true) {
char c = peekc();
if (!is_extended_digit(base, c)) {
// commas are intended to be legal digit separators.
// NOTE(review): there is no nextc() between the peekc() above and the one
// in the condition below, so presumably the second peekc() returns the same
// ',' again. If so, the condition can never be true and a comma always ends
// the literal. Confirm against peekc()'s implementation before relying on
// separators; a fix needs two characters of lookahead.
if (c == ',' && is_extended_digit(base, peekc())) {
nextc();
continue;
}
break;
}
nextc();
uint8_t digit = digit_value(base, c);
// reject when (acc * base + digit) > UINT64_MAX; the test is rearranged
// so that the comparison itself cannot overflow
if (acc > ((UINT64_MAX - digit) / base)) {
fprintf(stderr, "lexical error: integer literal overflow\n");
exit(1);
}
at_least_one_char = true;
acc *= base;
acc += digit;
}
if (!at_least_one_char) {
fprintf(stderr, "lexical error: expected digits\n");
exit(1);
}
return acc;
}
/*
 * Lexes an integer literal and returns it as a TOK_INTEGER token.
 *
 * Accepted forms are `digits` (decimal) and `radix#digits`, where the decimal
 * radix before '#' must be in the range 2..36. `sign` is true when the caller
 * has already consumed a leading '-', in which case the value is negated.
 *
 * Prints a "lexical error" message and exits with status 1 when the radix is
 * out of range or when a negative literal's magnitude exceeds 2^63. The most
 * negative value, -2^63 (INT64_MIN), is accepted.
 *
 * Non-negative literals above INT64_MAX are not rejected; they are converted
 * to int64_t with a plain cast, whose result for out-of-range values is
 * implementation-defined.
 */
static struct token lex_integer(_Bool sign) {
    uint64_t acc = lex_digits(10);
    if (peekc() == '#') {
        /* what was just read is the radix, not the value */
        if (acc < 2) {
            fprintf(stderr, "lexical error: integer literal base too small\n");
            exit(1);
        }
        if (acc > 36) {
            fprintf(stderr, "lexical error: integer literal base too large\n");
            exit(1);
        }
        nextc();
        acc = lex_digits((uint8_t) acc);
    }

    /* largest magnitude a negative literal may have: |INT64_MIN| == 2^63 */
    const uint64_t max_negative_magnitude = (uint64_t) INT64_MAX + 1;
    if (sign && acc > max_negative_magnitude) {
        fprintf(stderr, "lexical error: integer literal overflow due to sign\n");
        exit(1);
    }

    int64_t val;
    if (sign && acc == max_negative_magnitude) {
        /* 2^63 does not fit in int64_t, so it cannot be cast and negated */
        val = INT64_MIN;
    } else {
        val = (int64_t) acc;
        if (sign) {
            val = -val;
        }
    }

    union token_data data;
    data.int_ = val;
    struct token tok = { TOK_INTEGER, data };
    return tok;
}
/* Capacity of each text buffer, including the terminating NUL. */
#define MAX_STR_LEN 4096
/* Write position inside the buffer currently being filled. */
static size_t str_index;
/* Text payloads are stored in two static buffers that are used alternately,
 * so that lexing one token does not overwrite the text of the token lexed
 * just before it. Per the original author: the parser is LL(1), so two
 * buffers are sufficient. */
static _Bool which_buf = false;
static char str_buf_1[MAX_STR_LEN];
static char str_buf_2[MAX_STR_LEN];
/* Flips the buffer selector and returns the buffer it now selects; successive
 * calls return str_buf_1, str_buf_2, str_buf_1, ... */
static char* str_buf(void) {
    which_buf = !which_buf;
    return which_buf ? str_buf_1 : str_buf_2;
}
/*
 * Lexes the body of a string literal; the opening '"' has already been
 * consumed by the caller. Characters are copied into the next alternating
 * text buffer up to (not including) the closing '"', and the NUL-terminated
 * buffer is returned. The buffer is static storage and is reused by later
 * calls to str_buf().
 *
 * Prints a "lexical error" message and exits with status 1 if the input ends
 * (nextc() returns 0) or a newline is reached before the closing quote, or if
 * the contents do not fit in MAX_STR_LEN - 1 characters.
 */
static char* lex_string(void) {
    str_index = 0;
    char* buf = str_buf();
    while (true) {
        char c = nextc();
        if (c == 0) {
            fprintf(stderr, "lexical error: unclosed string (reached end of file)\n");
            exit(1);
        }
        // TODO: string escapes, multi-line strings
        if (c == '"') {
            break;
        }
        if (c == '\n') {
            fprintf(stderr, "lexical error: unclosed string (reached end of line)\n");
            exit(1);
        }
        // Only check capacity once we know `c` must be stored, so a string
        // of exactly MAX_STR_LEN - 1 characters is still accepted; the last
        // slot is reserved for the terminating NUL.
        if (str_index == MAX_STR_LEN - 1) {
            fprintf(stderr, "lexical error: string too long\n");
            exit(1);
        }
        buf[str_index] = c;
        str_index++;
    }
    buf[str_index] = 0;
    return buf;
}
static char* lex_identifier(void) {
str_index = 0;
char* buf = str_buf();
while (true) {
char c = peekc();
if (!is_alpha(c) && !is_digit(c) && c != '_') break;
nextc();
buf[str_index] = c;
str_index++;
}
if (str_index == 0) {
fprintf(stderr, "lexical error: expected identifier (possibly illegal character?)\n");
exit(1);
}
buf[str_index] = 0;
return buf;
}
/*
 * Reads and returns the next token from the input.
 *
 * Leading whitespace is skipped. A character value of 0 from nextc() yields
 * TOK_EOF. Punctuation and operators are recognised directly, using peekc()
 * for one character of lookahead on multi-character operators:
 *   "->" OP_FUN, "!=" OP_NE, ">=" OP_GTE, ">>" OP_SAR, ">>>" OP_SHR,
 *   "<<" OP_SHL, "<=" OP_LTE.
 * A '-' immediately followed by a digit starts a negative integer literal.
 * A '"' starts a string, and a '\'' followed by an identifier forms a label.
 *
 * Any other character is pushed back with unnextc() and lexed as an integer
 * literal if it is a digit, or as an identifier otherwise. lex_identifier()
 * reports an error and exits if the character cannot start an identifier.
 */
static struct token lex(void) {
    char c;
    do {
        c = nextc();
    } while (is_whitespace(c));
    switch (c) {
    case 0:
        return simple(TOK_EOF);
    case '"': {
        union token_data data;
        data.string = lex_string();
        struct token tok = { TOK_STRING, data };
        return tok;
    }
    case '\'': {
        union token_data data;
        data.label = lex_identifier();
        struct token tok = { TOK_LABEL, data };
        return tok;
    }
    case '{':
        return simple(TOK_OPEN_BLOCK);
    case '}':
        return simple(TOK_CLOSE_BLOCK);
    case '(':
        return simple(TOK_OPEN_GROUP);
    case ')':
        return simple(TOK_CLOSE_GROUP);
    case ';':
        return simple(TOK_TERMINATOR);
    case ',':
        return simple(TOK_SEPARATOR);
    case '=':
        return op(OP_EQ);
    case '+':
        return op(OP_ADD);
    case '-':
        if (peekc() == '>') {
            nextc();
            return op(OP_FUN);
        }
        // '-' directly followed by a digit is a negative literal
        if (is_digit(peekc())) {
            return lex_integer(true);
        }
        return op(OP_SUB);
    case '*':
        return op(OP_MUL);
    case '/':
        return op(OP_DIV);
    case '%':
        return op(OP_MOD);
    case '~':
        return op(OP_INV);
    case '&':
        return op(OP_AND);
    case '|':
        return op(OP_OR);
    case '^':
        return op(OP_XOR);
    case '!':
        if (peekc() == '=') {
            nextc();
            return op(OP_NE);
        }
        return op(OP_NOT);
    case ':':
        return op(OP_TYPE);
    case '>':
        c = peekc();
        if (c == '=') {
            nextc();
            return op(OP_GTE);
        }
        if (c == '>') {
            nextc();
            // ">>>" is OP_SHR, plain ">>" is OP_SAR
            if (peekc() == '>') {
                nextc();
                return op(OP_SHR);
            }
            return op(OP_SAR);
        }
        return op(OP_GT);
    case '<':
        c = peekc();
        if (c == '<') {
            nextc();
            return op(OP_SHL);
        }
        if (c == '=') {
            nextc();
            return op(OP_LTE);
        }
        return op(OP_LT);
    default:
        // not punctuation: handled as an integer or identifier below
        break;
    }
    // Only the default case reaches this point, so `c` is still the character
    // read by nextc(); push it back so the sub-lexers see the whole token.
    unnextc(c);
    if (is_digit(c)) {
        return lex_integer(false);
    }
    union token_data data;
    data.name = lex_identifier();
    struct token tok = { TOK_NAME, data };
    return tok;
}
/* One-token lookahead buffer shared by next(), unnext() and peek():
 * `peek_buf` holds a pending token whenever `peeked` is true. */
static _Bool peeked = false;
static struct token peek_buf;
/* Returns the next token, consuming it. A token buffered by peek() or
 * unnext() is handed out first; otherwise a fresh token is lexed. */
struct token next(void) {
    if (!peeked) {
        return lex();
    }
    peeked = false;
    return peek_buf;
}
/* Pushes `tok` back so that the following next() or peek() returns it.
 * Only one token can be pending: the buffer must be empty, which is checked
 * by an assertion. */
void unnext(struct token tok) {
    assert(!peeked);
    peek_buf = tok;
    peeked = true;
}
/* Returns the next token without consuming it. The token is lexed on first
 * use and kept in the lookahead buffer until next() takes it. */
struct token peek(void) {
    if (peeked) {
        return peek_buf;
    }
    peek_buf = lex();
    peeked = true;
    return peek_buf;
}
/*
 * Writes a textual form of `tok` to stdout, with no trailing newline:
 * names and strings by their text (strings in double quotes, without
 * re-escaping), labels with a leading apostrophe, integers in decimal,
 * punctuation as the character itself, operators as "OP:<numeric enum value>"
 * and end of input as "EOF". Token types not listed print nothing.
 */
void print_token(struct token tok) {
    switch (tok.type) {
    case TOK_NAME:
        fprintf(stdout, "%s", tok.data.name);
        break;
    case TOK_LABEL:
        fprintf(stdout, "'%s", tok.data.label);
        break;
    case TOK_INTEGER:
        // The payload is stored from an int64_t in lex_integer(); "%zi" is
        // meant for size_t-sized values and does not portably match it.
        // long long is at least 64 bits wide, so the cast cannot lose data.
        fprintf(stdout, "%lld", (long long) tok.data.int_);
        break;
    case TOK_STRING:
        fprintf(stdout, "\"%s\"", tok.data.string);
        break;
    case TOK_OPEN_GROUP:
        fprintf(stdout, "(");
        break;
    case TOK_CLOSE_GROUP:
        fprintf(stdout, ")");
        break;
    case TOK_OPEN_BLOCK:
        fprintf(stdout, "{");
        break;
    case TOK_CLOSE_BLOCK:
        fprintf(stdout, "}");
        break;
    case TOK_TERMINATOR:
        fprintf(stdout, ";");
        break;
    case TOK_SEPARATOR:
        fprintf(stdout, ",");
        break;
    case TOK_OPERATOR:
        // an enum's underlying type is implementation-defined; make the
        // argument an int so it matches "%i" exactly
        fprintf(stdout, "OP:%i", (int) tok.data.op);
        break;
    case TOK_EOF:
        fprintf(stdout, "EOF");
        break;
    }
}