pass-lang/src/lex.c

169 lines
3.3 KiB
C

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "lex.h"
#include "lex/indent.h"
#include "io.h"
// Keyword spellings recognised by the lexer.
//
// lex() compares each identifier against this table in order and, on a match,
// casts the entry's index to enum token_type. The order of the entries
// therefore has to match the order of the keyword enumerators, which are
// declared outside this file (presumably in lex.h -- TODO confirm).
//
// NOTE(review): the entries ending in '+' cannot be produced by
// lex_identifier() as written in this file, because is_id_char() only accepts
// letters, digits and '_'. Confirm whether '+' is meant to be an identifier
// character.
static const char* const keywords[KEYWORD_COUNT] = {
"comm",
"assocl",
"assocr",
"distl",
"distr",
"factl",
"factr",
"mapl",
"mapr",
"unitil",
"unitir",
"unitel",
"uniter",
// '+' variants (see the review note above)
"comm+",
"assocl+",
"assocr+",
"mapl+",
"mapr+",
"inl",
"inr",
"out",
"halt",
"if",
};
static struct token simple(enum token_type type) {
struct token tok = { type, 0 };
return tok;
}
// Capacity in bytes of each identifier buffer, including the terminating NUL.
#define MAX_STR_LEN 4096
// Number of characters written so far into the buffer currently being filled.
static size_t str_index;
// Identifier text is stored in one of three static buffers that are handed out
// in rotation, so a string returned for one token is not overwritten by the
// next identifier. A buffer is reused on every third call to str_buf().
static int which_buf = 0;
static char str_buf_1[MAX_STR_LEN];
static char str_buf_2[MAX_STR_LEN];
static char str_buf_3[MAX_STR_LEN];

// Advances the rotation and returns the buffer that is now current.
static char *str_buf(void) {
    static char *const ring[] = { str_buf_1, str_buf_2, str_buf_3 };

    which_buf = (which_buf + 1) % 3;
    return ring[which_buf];
}
// Returns whether c is an unaccented Latin letter, 'A'-'Z' or 'a'-'z'.
static _Bool is_alpha(char c) {
    const _Bool upper = c >= 'A' && c <= 'Z';
    const _Bool lower = c >= 'a' && c <= 'z';
    return upper || lower;
}
// Returns whether c is a decimal digit, '0'-'9'.
static _Bool is_digit(char c) {
    if (c < '0') {
        return 0;
    }
    return c <= '9';
}
// Returns whether c is a letter (per is_alpha) or a decimal digit (per
// is_digit).
static _Bool is_alphanumeric(char c) {
    if (is_alpha(c)) {
        return 1;
    }
    return is_digit(c);
}
// Returns whether c may appear in an identifier: a letter, a digit, or '_'.
static _Bool is_id_char(char c) {
    // TODO: allow unicode identifiers
    if (c == '_') {
        return 1;
    }
    return is_alphanumeric(c);
}
// Returns whether c is one of the characters the lexer skips between tokens:
// space, tab, carriage return or newline.
_Bool is_whitespace(char c) {
    switch (c) {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
        return 1;
    default:
        return 0;
    }
}
static char* lex_identifier(void) {
str_index = 0;
char* buf = str_buf();
while (true) {
char c = peekc();
if (!is_id_char(c)) break;
nextc();
buf[str_index] = c;
str_index++;
}
if (str_index == 0) {
fprintf(stderr, "lexical error: expected identifier (possibly illegal character?)\n");
exit(1);
}
buf[str_index] = 0;
return buf;
}
// Lexes one token starting at the current input position.
//
// Whitespace and '!' line comments are skipped first. Then:
//   - a 0 character yields TOK_EOF (nothing is consumed),
//   - '{' and '}' are consumed and yield TOK_MAP_BEGIN and TOK_MAP_END,
//   - anything else must be an identifier (lex_identifier() exits otherwise):
//     an identifier equal to an entry of keywords[] becomes the token whose
//     type is that entry's index, an identifier directly followed by ':'
//     becomes TOK_LABEL (the ':' is consumed), and any other identifier
//     becomes TOK_JUMP.
//
// TOK_LABEL and TOK_JUMP tokens carry the identifier text returned by
// lex_identifier(), which lives in one of the rotating static buffers.
static struct token lex(void) {
    char c = peekc();

    for (;;) {
        // skip whitespace
        while (is_whitespace(c)) {
            nextc();
            c = peekc();
        }
        if (c != '!') {
            break;
        }
        // Skip a line comment up to, but not including, its newline; the
        // whitespace loop above then consumes the newline. The loop inspects
        // the input only through peekc(), so c always names the character that
        // has not been consumed yet. It also stops on 0, which the switch
        // below treats as end of input, so a comment on the last line without
        // a trailing newline cannot loop forever.
        while (c != '\n' && c != 0) {
            nextc();
            c = peekc();
        }
    }

    // syntax
    switch (c) {
    case 0:
        return simple(TOK_EOF);
    case '{':
        nextc();
        return simple(TOK_MAP_BEGIN);
    case '}':
        nextc();
        return simple(TOK_MAP_END);
    default:
        break;
    }

    char *name = lex_identifier();

    // keywords: the table index doubles as the token type
    for (size_t kwd = 0; kwd < KEYWORD_COUNT; kwd++) {
        if (strcmp(name, keywords[kwd]) == 0) {
            return simple((enum token_type) kwd);
        }
    }

    // labels are identifiers immediately followed by ':'; everything else is a
    // jump to the named label
    enum token_type type = TOK_JUMP;
    if (peekc() == ':') {
        type = TOK_LABEL;
        nextc();
    }
    struct token tok = { type, name };
    return tok;
}
static _Bool init = false;
static struct token peek_buf;
struct token next(void) {
if (!init) {
init = true;
next();
}
struct token tmp = peek_buf;
peek_buf = lex();
return tmp;
}
struct token peek(void) {
return peek_buf;
}