pass-lang/src/lex.c

351 lines
8.8 KiB
C

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "lex.h"
#include "io.h"
// Reports whether `c` is one of the blank characters skipped between tokens:
// space, horizontal tab, carriage return or line feed.
static _Bool is_whitespace(char c) {
    switch (c) {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
        return true;
    default:
        return false;
    }
}
// Reports whether `c` is an ASCII letter ('A'..'Z' or 'a'..'z').
static _Bool is_alpha(char c) {
    const _Bool upper = ('A' <= c) && (c <= 'Z');
    const _Bool lower = ('a' <= c) && (c <= 'z');
    return upper || lower;
}
// Reports whether `c` is an ASCII decimal digit ('0'..'9').
static _Bool is_digit(char c) {
    if (c < '0') {
        return false;
    }
    return c <= '9';
}
// Reports whether `c` can be the first character of an integer literal:
// either a minus sign or a decimal digit.
static _Bool begins_integer(char c) {
    if (c == '-') {
        return true;
    }
    return is_digit(c);
}
// Reports whether `c` may appear in an identifier: an ASCII letter, a decimal
// digit or an underscore.
static _Bool id_char(char c) {
    if (c == '_') {
        return true;
    }
    return is_digit(c) || is_alpha(c);
}
static struct token simple(enum token_type type) {
struct token tok = { type, 0 };
return tok;
}
static struct token op(enum operator_ op) {
union token_data data;
data.op = op;
struct token tok = { TOK_OPERATOR, data };
return tok;
}
// Maps the digit character `c` to its numeric value when read in `base`.
//
// Only bases 2, 4, 8, 10 and 16 are accepted for now; any other base prints a
// lexical error to stderr and terminates the process with exit status 1.
// For the accepted bases the mapping is:
//   '0'..'9'              -> 0..9
//   'A'..'Z' and 'a'..'z' -> 10..35 (case-insensitive)
//   '+'                   -> 62
//   anything else         -> 63
// The result is NOT checked against `base`; the caller has to compare the
// returned value with the base to decide whether `c` is a legal digit.
static uint8_t digit_value(uint8_t base, char c) {
    // TODO: sort out this mess
    // Bases are restricted to avoid having to decide how upper vs. lower case
    // and the base64 alphabet (letters before digits) should be handled.
    if (base != 2 && base != 4 && base != 8 && base != 10 && base != 16) {
        fprintf(stderr, "lexical error: illegal integer base (for now)\n");
        exit(1);
    }
    // Standard base64 alphabet: 'A'..'Z' = 0..25, 'a'..'z' = 26..51,
    // '0'..'9' = 52..61, '+' = 62, '/' = 63. Unreachable while the base
    // restriction above is in place; kept for when it is lifted.
    if (base == 64) {
        if (is_digit(c)) return c - '0' + 52;
        if (c >= 'A' && c <= 'Z') return c - 'A';
        // The upper bound used to be 'Z', which made this test always false.
        if (c >= 'a' && c <= 'z') return c - 'a' + 26;
        if (c == '+') return 62;
        // c == '/'
        return 63;
    }
    if (is_digit(c)) return c - '0';
    if (c >= 'A' && c <= 'Z') return c - 'A' + 10;
    if (c >= 'a' && c <= 'z') {
        // Above base 36 lower case continues after upper case instead of
        // aliasing it (also unreachable under the current restriction).
        if (base > 36) {
            return c - 'a' + 36;
        }
        return c - 'a' + 10;
    }
    if (c == '+') return 62;
    // c == '/'
    return 63;
}
// Reports whether `c` is a legal digit of an integer literal in `base`.
//
// Characters other than ASCII letters, decimal digits, '+' and '/' are
// rejected immediately. For the rest, the value comes from digit_value, which
// terminates the process if `base` is not one it supports.
static _Bool is_extended_digit(uint8_t base, char c) {
    if (!is_digit(c) && !is_alpha(c) && c != '+' && c != '/')
        return false;
    // Legal digits in base b have values 0..b-1. A value equal to the base is
    // already out of range (e.g. 'A' == 10 is not a decimal digit, '8' is not
    // an octal digit), so the comparison must be strict.
    return digit_value(base, c) < base;
}
// Consumes a run of digits in `base` from the input and returns their value.
//
// A comma is accepted as a digit separator, but only when it is immediately
// followed by another legal digit; the comma itself is skipped and adds
// nothing to the value. Lexing stops at the first character that is neither
// a digit nor such a separator, or at end of input.
// NOTE(review): because of the separator rule, input such as `1,2` is read as
// the single number 12 -- confirm this is intended where ',' is also used as
// a list separator.
//
// Prints a lexical error to stderr and exits with status 1 when the value
// does not fit in 64 unsigned bits or when no digit was consumed at all.
static uint64_t lex_digits(uint8_t base) {
    uint64_t acc = 0;
    _Bool at_least_one_char = false;
    while (true) {
        size_t len = 1;
        char* buf = peek(&len);
        // End of input: there is no character to inspect, so do not read
        // buf[0]. The other lexing routines treat len == 0 the same way.
        if (len < 1)
            break;
        if (!is_extended_digit(base, buf[0])) {
            // commas are legal digit separators
            if (buf[0] == ',') {
                len = 2;
                buf = peek(&len);
                if (len == 2 && is_extended_digit(base, buf[1])) {
                    skip(1);
                    continue;
                }
            }
            break;
        }
        uint8_t digit = digit_value(base, buf[0]);
        // Equivalent to (acc * base + digit) > UINT64_MAX, rearranged so the
        // check itself cannot overflow.
        if (acc > ((UINT64_MAX - digit) / base)) {
            fprintf(stderr, "lexical error: integer literal overflow\n");
            exit(1);
        }
        skip(1);
        at_least_one_char = true;
        acc *= base;
        acc += digit;
    }
    if (!at_least_one_char) {
        fprintf(stderr, "lexical error: expected digits\n");
        exit(1);
    }
    return acc;
}
static struct token lex_integer(void) {
_Bool sign = false;
char* buf;
size_t len = 1;
buf = peek(&len);
assert(len > 0 && begins_integer(buf[0]));
if (buf[0] == '-') {
sign = true;
skip(1);
}
uint64_t acc = lex_digits(10);
len = 1;
buf = peek(&len);
if (len == 1 && buf[0] == '#') {
if (acc > 64) {
fprintf(stderr, "lexical error: integer literal base too large\n");
exit(1);
}
skip(1);
acc = lex_digits((uint8_t) acc);
}
if (sign && acc > INT64_MAX) {
fprintf(stderr, "lexical error: integer literal overflow due to sign\n");
exit(1);
}
int64_t val = sign ? -(int64_t) acc : (int64_t) acc;
union token_data data;
data.int_ = val;
struct token tok = { TOK_INTEGER, data };
return tok;
}
// Capacity in bytes of the shared text buffer below, terminating NUL included.
#define MAX_STR_LEN 4096
// Number of characters currently stored in str_buf, i.e. the next write index.
static size_t str_index;
// Scratch buffer that lex_string and lex_identifier fill and return a pointer
// to. Both restart at index 0, so each call overwrites the previous contents.
static char str_buf[MAX_STR_LEN];
// Lexes a double-quoted string literal and returns its contents (without the
// quotes) as a NUL-terminated string stored in the shared str_buf.
//
// The caller must have positioned the input on the opening '"'. The returned
// pointer refers to static storage that the next string or identifier lexed
// will overwrite.
//
// Exits with status 1 after printing a lexical error when the input ends or a
// line feed is reached before the closing quote, or once MAX_STR_LEN - 1
// characters have been stored (the limit is checked before the next character
// is examined, so the longest accepted string has MAX_STR_LEN - 2 characters).
static char* lex_string(void) {
    size_t avail = 1;
    char* cursor = peek(&avail);
    assert(avail == 1 && cursor[0] == '"');
    skip(1);

    str_index = 0;
    for (;;) {
        if (str_index == MAX_STR_LEN - 1) {
            fprintf(stderr, "lexical error: string too long\n");
            exit(1);
        }

        avail = 1;
        cursor = peek(&avail);
        if (avail < 1) {
            fprintf(stderr, "lexical error: unclosed string (reached end of file)\n");
            exit(1);
        }

        // TODO: string escapes, multi-line strings
        const char ch = cursor[0];
        if (ch == '"') {
            // Closing quote: consume it and stop.
            skip(1);
            break;
        }
        if (ch == '\n') {
            fprintf(stderr, "lexical error: unclosed string (reached end of line)\n");
            exit(1);
        }

        str_buf[str_index++] = ch;
        skip(1);
    }

    str_buf[str_index] = 0;
    return str_buf;
}
static char* lex_identifier(void) {
char* buf;
size_t len;
char c;
str_index = 0;
while (true) {
len = 1;
buf = peek(&len);
if (len == 0) break;
c = buf[0];
if (!is_alpha(c) && !is_digit(c) && c != '_') break;
skip(1);
str_buf[str_index] = c;
str_index++;
}
if (str_index == 0) {
fprintf(stderr, "lexical error: expected identifier (possibly illegal character?)\n");
exit(1);
}
str_buf[str_index] = 0;
return str_buf;
}
// Reads and returns the next token from the input.
//
// Leading whitespace is skipped. TOK_EOF is returned when the input is
// exhausted. Recognised tokens, in the order they are tried:
//   - integer literals: a digit, or '-' immediately followed by a digit
//   - string literals:  introduced by '"'
//   - labels:           '\'' followed by an identifier
//   - punctuation and operators (see the switch below)
//   - names:            anything else must be an identifier
//
// For TOK_STRING, TOK_LABEL and TOK_NAME the payload points into a static
// buffer shared by the string and identifier lexers, so it is overwritten the
// next time a string or identifier is lexed.
//
// Lexical errors are reported by the helper routines, which print to stderr
// and exit with status 1.
struct token lex(void) {
    size_t len = 1;
    char* buf = peek(&len);
    if (len < 1) {
        return simple(TOK_EOF);
    }
    char c = buf[0];
    while (is_whitespace(c)) {
        skip(1);
        len = 1;
        // Take the pointer from this peek: the one obtained before skip()
        // is not guaranteed to describe the current input position.
        buf = peek(&len);
        if (len == 0) {
            return simple(TOK_EOF);
        }
        c = buf[0];
    }
    if (begins_integer(c)) {
        // A lone '-' is an operator; it only starts a literal when a digit
        // follows, so look one character further ahead.
        len = 2;
        buf = peek(&len);
        if (is_digit(c) || (len > 1 && is_digit(buf[1])))
            return lex_integer();
    }
    if (c == '"') {
        union token_data data;
        data.string = lex_string();
        struct token tok = { TOK_STRING, data };
        return tok;
    }
    if (c == '\'') {
        skip(1);
        union token_data data;
        data.label = lex_identifier();
        struct token tok = { TOK_LABEL, data };
        return tok;
    }
    switch (c) {
    case '{':
        skip(1);
        return simple(TOK_OPEN_BLOCK);
    case '}':
        skip(1);
        return simple(TOK_CLOSE_BLOCK);
    case '(':
        skip(1);
        return simple(TOK_OPEN_GROUP);
    case ')':
        skip(1);
        return simple(TOK_CLOSE_GROUP);
    case ';':
        skip(1);
        return simple(TOK_TERMINATOR);
    case ',':
        skip(1);
        return simple(TOK_SEPARATOR);
    case '=':
        skip(1);
        return op(OP_EQ);
    case '+':
        skip(1);
        return op(OP_ADD);
    case '-':
        // "->" or "-"
        skip(1);
        len = 1;
        buf = peek(&len);
        if (len == 1 && buf[0] == '>') {
            skip(1);
            return op(OP_FUN);
        }
        return op(OP_SUB);
    case '*':
        skip(1);
        return op(OP_MUL);
    case '/':
        skip(1);
        return op(OP_DIV);
    case '%':
        skip(1);
        return op(OP_MOD);
    case '~':
        skip(1);
        return op(OP_INV);
    case '&':
        skip(1);
        return op(OP_AND);
    case '|':
        skip(1);
        return op(OP_OR);
    case '^':
        skip(1);
        return op(OP_XOR);
    case '!':
        // "!=" or "!"
        skip(1);
        len = 1;
        buf = peek(&len);
        if (len == 1 && buf[0] == '=') {
            skip(1);
            return op(OP_NE);
        }
        return op(OP_NOT);
    case ':':
        skip(1);
        return op(OP_TYPE);
    case '>':
        // ">>>", ">>", ">=" or ">"; the longest match wins.
        skip(1);
        len = 2;
        buf = peek(&len);
        if (len == 2 && buf[0] == '>' && buf[1] == '>') {
            skip(2);
            return op(OP_SHR);
        } else if (len >= 1 && buf[0] == '>') {
            skip(1);
            return op(OP_SAR);
        } else if (len >= 1 && buf[0] == '=') {
            skip(1);
            return op(OP_GTE);
        }
        return op(OP_GT);
    case '<':
        // "<<", "<=" or "<"
        skip(1);
        len = 1;
        buf = peek(&len);
        if (len == 1 && buf[0] == '<') {
            skip(1);
            return op(OP_SHL);
        } else if (len == 1 && buf[0] == '=') {
            skip(1);
            return op(OP_LTE);
        }
        return op(OP_LT);
    default:
        // Not punctuation: fall through to the identifier case below.
        break;
    }
    union token_data data;
    data.name = lex_identifier();
    struct token tok = { TOK_NAME, data };
    return tok;
}