2022-09-07 10:22:38 -07:00
|
|
|
#include <assert.h>
|
2022-09-07 20:42:37 -07:00
|
|
|
#include <stdio.h>
|
2022-09-07 10:22:38 -07:00
|
|
|
#include <stdlib.h>
|
2022-09-07 20:42:37 -07:00
|
|
|
#include <string.h>
|
2022-09-07 10:22:38 -07:00
|
|
|
|
|
|
|
#include "lex.h"
|
2022-09-08 16:01:31 -07:00
|
|
|
#include "lex/indent.h"
|
2022-09-07 10:22:38 -07:00
|
|
|
#include "io.h"
|
|
|
|
|
2023-07-28 20:03:10 -07:00
|
|
|
// Spellings of the keyword tokens.
//
// lex() casts the index of the matching entry directly to `enum token_type`,
// so the order of this table must stay in step with that enum.
//
// NOTE(review): the "+" spellings contain a character that is_id_char()
// rejects -- verify that the lexer can actually produce them.
static const char* const keywords[KEYWORD_COUNT] = {
    "comm", "assocl", "assocr",
    "distl", "distr",
    "factl", "factr",
    "mapl", "mapr",
    "unitil", "unitir", "unitel", "uniter",
    "comm+", "assocl+", "assocr+",
    "mapl+", "mapr+",
    "inl", "inr", "out",
    "halt",
    "if",
};
|
2022-09-07 20:42:37 -07:00
|
|
|
|
2023-07-28 20:03:10 -07:00
|
|
|
static struct token simple(enum token_type type) {
|
|
|
|
struct token tok = { type, 0 };
|
|
|
|
return tok;
|
2022-09-07 20:42:37 -07:00
|
|
|
}
|
|
|
|
|
2023-07-28 20:03:10 -07:00
|
|
|
// Size in bytes of each identifier buffer (text plus terminating NUL).
#define MAX_STR_LEN 4096
// Number of characters written so far into the buffer being filled.
static size_t str_index;
// Identifier text lives in a small pool of static buffers that is rotated on
// every request, so that the text of a recently lexed token is not
// overwritten by the token lexed right after it.
// Two buffers used to be enough for the LL(1) parser; after later changes
// they no longer were, hence three.
static int which_buf = 0;
static char str_buf_1[MAX_STR_LEN];
static char str_buf_2[MAX_STR_LEN];
static char str_buf_3[MAX_STR_LEN];

// Advances the rotation and returns the buffer to fill next.
//
// The returned buffer is handed out again on the third call after this one,
// so its contents must be consumed (or copied) before then.
static char* str_buf(void) {
    // A lookup table instead of switch + assert: every path now returns a
    // value, even when assertions are compiled out with NDEBUG.
    char* const pool[] = { str_buf_1, str_buf_2, str_buf_3 };
    which_buf = (which_buf + 1) % (int) (sizeof pool / sizeof pool[0]);
    return pool[which_buf];
}
|
|
|
|
|
2023-07-28 20:03:10 -07:00
|
|
|
|
2022-09-07 10:22:38 -07:00
|
|
|
// Reports whether c is one of the letters 'A'..'Z' or 'a'..'z'.
static _Bool is_alpha(char c) {
    if (c >= 'a' && c <= 'z') {
        return true;
    }
    return c >= 'A' && c <= 'Z';
}
|
|
|
|
|
|
|
|
// Reports whether c is one of the decimal digits '0'..'9'.
static _Bool is_digit(char c) {
    if (c < '0' || c > '9') {
        return false;
    }
    return true;
}
|
|
|
|
|
2022-09-08 16:01:31 -07:00
|
|
|
// Reports whether c is a letter (is_alpha) or a decimal digit (is_digit).
static _Bool is_alphanumeric(char c) {
    if (is_alpha(c)) {
        return true;
    }
    return is_digit(c);
}
|
|
|
|
|
|
|
|
// Reports whether c may appear in an identifier: a letter, a decimal digit,
// or an underscore.
static _Bool is_id_char(char c) {
    // TODO: allow unicode identifiers
    if (c == '_') {
        return true;
    }
    return is_alphanumeric(c);
}
|
|
|
|
|
2023-07-28 20:03:10 -07:00
|
|
|
// Reports whether c is a space, tab, carriage return or newline.
_Bool is_whitespace(char c) {
    switch (c) {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
        return true;
    default:
        return false;
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
static char* lex_identifier(void) {
|
|
|
|
str_index = 0;
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
char* buf = str_buf();
|
2022-09-07 10:22:38 -07:00
|
|
|
while (true) {
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
char c = peekc();
|
2022-09-08 16:01:31 -07:00
|
|
|
if (!is_id_char(c)) break;
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
nextc();
|
|
|
|
buf[str_index] = c;
|
2022-09-07 10:22:38 -07:00
|
|
|
str_index++;
|
|
|
|
}
|
|
|
|
if (str_index == 0) {
|
|
|
|
fprintf(stderr, "lexical error: expected identifier (possibly illegal character?)\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
buf[str_index] = 0;
|
|
|
|
return buf;
|
2022-09-07 10:22:38 -07:00
|
|
|
}
|
|
|
|
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
static struct token lex(void) {
|
2022-09-08 16:01:31 -07:00
|
|
|
char c = peekc();
|
2023-07-28 20:03:10 -07:00
|
|
|
while (true) {
|
|
|
|
// skip whitespace
|
|
|
|
while (is_whitespace(c)) {
|
|
|
|
nextc();
|
|
|
|
c = peekc();
|
2022-09-08 16:01:31 -07:00
|
|
|
}
|
2023-07-28 20:03:10 -07:00
|
|
|
// skip line comments
|
|
|
|
if (c == '!') {
|
|
|
|
do {
|
|
|
|
c = nextc();
|
|
|
|
} while (c != '\n');
|
|
|
|
} else {
|
|
|
|
break;
|
2022-09-08 16:01:31 -07:00
|
|
|
}
|
|
|
|
}
|
2023-07-28 20:03:10 -07:00
|
|
|
|
|
|
|
// syntax
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
switch (c) {
|
|
|
|
case 0:
|
2022-09-07 10:22:38 -07:00
|
|
|
return simple(TOK_EOF);
|
|
|
|
case '{':
|
2022-09-08 16:01:31 -07:00
|
|
|
nextc();
|
2023-07-28 20:03:10 -07:00
|
|
|
return simple(TOK_MAP_BEGIN);
|
2022-09-07 10:22:38 -07:00
|
|
|
case '}':
|
2022-09-08 16:01:31 -07:00
|
|
|
nextc();
|
2023-07-28 20:03:10 -07:00
|
|
|
return simple(TOK_MAP_END);
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
}
|
2023-07-28 20:03:10 -07:00
|
|
|
|
2022-09-10 14:58:22 -07:00
|
|
|
char* name = lex_identifier();
|
2023-07-28 20:03:10 -07:00
|
|
|
|
|
|
|
// keywords
|
|
|
|
for (size_t kwd = 0; kwd < KEYWORD_COUNT; kwd++) {
|
|
|
|
if (strcmp(name, keywords[kwd]) == 0) {
|
|
|
|
return simple((enum token_type) kwd);
|
|
|
|
}
|
2022-09-10 14:58:22 -07:00
|
|
|
}
|
2023-07-28 20:03:10 -07:00
|
|
|
|
|
|
|
enum token_type type = TOK_JUMP;
|
|
|
|
// labels
|
|
|
|
if (peekc() == ':') {
|
|
|
|
type = TOK_LABEL;
|
|
|
|
nextc();
|
2022-09-10 14:58:22 -07:00
|
|
|
}
|
|
|
|
|
2023-07-28 20:03:10 -07:00
|
|
|
struct token tok = { type, name };
|
2022-09-07 10:22:38 -07:00
|
|
|
return tok;
|
|
|
|
}
|
2022-09-07 20:42:37 -07:00
|
|
|
|
2022-09-08 16:01:31 -07:00
|
|
|
// Set by next() on its first call, when it fills peek_buf for the first time.
static _Bool init = false;
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
// One-token lookahead: the token that the next call to next() will return.
static struct token peek_buf;
|
|
|
|
|
|
|
|
struct token next(void) {
|
2022-09-08 16:01:31 -07:00
|
|
|
if (!init) {
|
|
|
|
init = true;
|
|
|
|
next();
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
}
|
2022-09-08 16:01:31 -07:00
|
|
|
struct token tmp = peek_buf;
|
|
|
|
peek_buf = lex();
|
|
|
|
return tmp;
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
struct token peek(void) {
|
|
|
|
return peek_buf;
|
|
|
|
}
|