2022-09-07 10:22:38 -07:00
|
|
|
#include <assert.h>
|
2022-09-07 20:42:37 -07:00
|
|
|
#include <stdio.h>
|
2022-09-07 10:22:38 -07:00
|
|
|
#include <stdlib.h>
|
2022-09-07 20:42:37 -07:00
|
|
|
#include <string.h>
|
2022-09-07 10:22:38 -07:00
|
|
|
|
|
|
|
#include "lex.h"
|
2022-09-08 16:01:31 -07:00
|
|
|
#include "lex/indent.h"
|
2022-09-07 10:22:38 -07:00
|
|
|
#include "io.h"
|
|
|
|
|
2022-09-07 20:42:37 -07:00
|
|
|
_Bool is_unary(enum operator_ op) {
|
|
|
|
return op == OP_SUB
|
|
|
|
|| op == OP_INV
|
|
|
|
|| op == OP_NOT;
|
|
|
|
}
|
|
|
|
|
|
|
|
_Bool is_binary(enum operator_ op) {
|
|
|
|
return op == OP_EQ
|
|
|
|
|| op == OP_ADD
|
|
|
|
|| op == OP_SUB
|
|
|
|
|| op == OP_MUL
|
|
|
|
|| op == OP_DIV
|
|
|
|
|| op == OP_MOD
|
|
|
|
|| op == OP_AND
|
|
|
|
|| op == OP_OR
|
|
|
|
|| op == OP_XOR
|
|
|
|
|| op == OP_SHL
|
|
|
|
|| op == OP_SAR
|
|
|
|
|| op == OP_SHR
|
|
|
|
|| op == OP_GT
|
|
|
|
|| op == OP_LT
|
|
|
|
|| op == OP_GTE
|
|
|
|
|| op == OP_LTE
|
|
|
|
|| op == OP_NE
|
|
|
|
|| op == OP_TYPE
|
|
|
|
|| op == OP_FUN;
|
|
|
|
}
|
|
|
|
|
|
|
|
_Bool is_lit(struct token tok) {
|
|
|
|
return tok.type == TOK_INTEGER || tok.type == TOK_STRING || tok.type == TOK_NAME;
|
|
|
|
}
|
|
|
|
|
2022-09-07 10:22:38 -07:00
|
|
|
// True for the unaccented Latin letters 'A'-'Z' and 'a'-'z' only.
static _Bool is_alpha(char c) {
    const _Bool upper = 'A' <= c && c <= 'Z';
    const _Bool lower = 'a' <= c && c <= 'z';
    return upper || lower;
}
|
|
|
|
|
|
|
|
// True for the decimal digits '0'-'9' only.
static _Bool is_digit(char c) {
    if (c < '0' || c > '9') {
        return false;
    }
    return true;
}
|
|
|
|
|
2022-09-08 16:01:31 -07:00
|
|
|
// True when `c` is a letter (per is_alpha) or a decimal digit (per is_digit).
static _Bool is_alphanumeric(char c) {
    if (is_alpha(c)) {
        return true;
    }
    return is_digit(c);
}
|
|
|
|
|
|
|
|
// True when `c` may appear in an identifier: a letter, a decimal digit or an
// underscore.
static _Bool is_id_char(char c) {
    if (c == '_') {
        return true;
    }
    return is_alpha(c) || is_digit(c);
}
|
|
|
|
|
|
|
|
static struct token simple(enum token_type type) {
|
|
|
|
struct token tok = { type, 0 };
|
|
|
|
return tok;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct token op(enum operator_ op) {
|
|
|
|
union token_data data;
|
|
|
|
data.op = op;
|
|
|
|
struct token tok = { TOK_OPERATOR, data };
|
|
|
|
return tok;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Converts one digit character to its numeric value: '0'-'9' give 0-9 and
// letters of either case give 10-35 ('a'/'A' = 10 ... 'z'/'Z' = 35).
//
// `base` is only sanity-checked (it must be <= 36); the returned value is NOT
// compared against it, so callers must do that themselves.
//
// `c` must be alphanumeric. Any other character trips an assertion. When
// assertions are compiled out (NDEBUG) the function returns UINT8_MAX instead
// of running off the end of a non-void function; UINT8_MAX is larger than any
// supported base, so a `value < base` test still rejects it.
static uint8_t digit_value(uint8_t base, char c) {
    assert(base <= 36);
    (void) base; // only used by the assertion above

    if (c >= '0' && c <= '9') { return (uint8_t) (c - '0'); }
    if (c >= 'A' && c <= 'Z') { return (uint8_t) (c - 'A' + 0xA); }
    if (c >= 'a' && c <= 'z') { return (uint8_t) (c - 'a' + 0xA); }

    assert(0);
    return UINT8_MAX;
}
|
|
|
|
|
|
|
|
// True when `c` is a valid digit in `base`, i.e. it is alphanumeric and its
// digit value is below `base`. The alphanumeric test comes first so that
// digit_value() is never handed a character it cannot convert.
static _Bool is_extended_digit(uint8_t base, char c) {
    return is_alphanumeric(c) && digit_value(base, c) < base;
}
|
|
|
|
|
|
|
|
// Consumes a run of digits in `base` from the input and returns their value.
// Stops at the first character that is not a digit in `base`, leaving it
// unconsumed. Prints a lexical error and exits with status 1 if the value
// does not fit in 64 bits or if no digit was read at all.
static uint64_t lex_digits(uint8_t base) {
    uint64_t total = 0;
    _Bool saw_digit = false;

    for (;;) {
        char ch = peekc();

        if (is_extended_digit(base, ch)) {
            nextc();
            uint8_t digit = digit_value(base, ch);
            // reject the digit if (total * base + digit) > UINT64_MAX
            if (total > ((UINT64_MAX - digit) / base)) {
                fprintf(stderr, "lexical error: integer literal overflow\n");
                exit(1);
            }
            saw_digit = true;
            total = total * base + digit;
            continue;
        }

        // commas are legal digit separators
        // NOTE(review): this second peekc() presumably returns the same ','
        // already held in `ch`, in which case this branch can never be taken.
        // Confirm against io.h; a real fix needs a look at the character
        // after the comma.
        if (ch == ',' && is_extended_digit(base, peekc())) {
            nextc();
            continue;
        }
        break;
    }

    if (!saw_digit) {
        fprintf(stderr, "lexical error: expected digits\n");
        exit(1);
    }
    return total;
}
|
|
|
|
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
static struct token lex_integer(_Bool sign) {
|
2022-09-07 10:22:38 -07:00
|
|
|
uint64_t acc = lex_digits(10);
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
if (peekc() == '#') {
|
2022-09-08 16:01:31 -07:00
|
|
|
if (acc != 2 || acc != 8 && acc != 10 && acc != 16) {
|
|
|
|
fprintf(stderr, "lexical error: illegal integer literal base\n");
|
2022-09-07 10:22:38 -07:00
|
|
|
exit(1);
|
|
|
|
}
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
nextc();
|
2022-09-07 10:22:38 -07:00
|
|
|
acc = lex_digits((uint8_t) acc);
|
|
|
|
}
|
|
|
|
if (sign && acc > INT64_MAX) {
|
2022-09-08 16:01:31 -07:00
|
|
|
fprintf(stderr, "lexical error: signed integer literal overflow\n");
|
2022-09-07 10:22:38 -07:00
|
|
|
exit(1);
|
|
|
|
}
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
int64_t val = (int64_t) acc;
|
|
|
|
if (sign) {
|
|
|
|
val = -val;
|
|
|
|
}
|
2022-09-07 10:22:38 -07:00
|
|
|
union token_data data;
|
|
|
|
data.int_ = val;
|
|
|
|
struct token tok = { TOK_INTEGER, data };
|
|
|
|
return tok;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Size in bytes of each string buffer below.
#define MAX_STR_LEN 4096

// String and identifier text is written into one of two static buffers.
// The buffers alternate between tokens so that lexing a new token does not
// overwrite the text of the previous one; because the parser is LL(1), two
// buffers are sufficient.
static char str_buf_1[MAX_STR_LEN];
static char str_buf_2[MAX_STR_LEN];

// Selects which of the two buffers is handed out.
static _Bool which_buf;

// Write position inside the buffer currently being filled.
static size_t str_index;
|
2022-09-07 10:22:38 -07:00
|
|
|
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
static char* str_buf(void) {
|
|
|
|
which_buf = !which_buf;
|
|
|
|
if (which_buf) {
|
|
|
|
return str_buf_1;
|
|
|
|
}
|
|
|
|
return str_buf_2;
|
2022-09-07 20:42:37 -07:00
|
|
|
}
|
|
|
|
|
2022-09-07 10:22:38 -07:00
|
|
|
static char* lex_string(void) {
|
|
|
|
str_index = 0;
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
char* buf = str_buf();
|
2022-09-07 10:22:38 -07:00
|
|
|
while (true) {
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
char c = nextc();
|
2022-09-07 10:22:38 -07:00
|
|
|
if (str_index == MAX_STR_LEN - 1) {
|
|
|
|
fprintf(stderr, "lexical error: string too long\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
if (c == 0) {
|
2022-09-07 10:22:38 -07:00
|
|
|
fprintf(stderr, "lexical error: unclosed string (reached end of file)\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
// TODO: string escapes, multi-line strings
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
if (c == '"') {
|
2022-09-07 10:22:38 -07:00
|
|
|
break;
|
|
|
|
}
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
if (c == '\n') {
|
2022-09-07 10:22:38 -07:00
|
|
|
fprintf(stderr, "lexical error: unclosed string (reached end of line)\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
buf[str_index] = c;
|
2022-09-07 10:22:38 -07:00
|
|
|
str_index++;
|
|
|
|
}
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
buf[str_index] = 0;
|
|
|
|
return buf;
|
2022-09-07 10:22:38 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
static char* lex_identifier(void) {
|
|
|
|
str_index = 0;
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
char* buf = str_buf();
|
2022-09-07 10:22:38 -07:00
|
|
|
while (true) {
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
char c = peekc();
|
2022-09-08 16:01:31 -07:00
|
|
|
if (!is_id_char(c)) break;
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
nextc();
|
|
|
|
buf[str_index] = c;
|
2022-09-07 10:22:38 -07:00
|
|
|
str_index++;
|
|
|
|
}
|
|
|
|
if (str_index == 0) {
|
|
|
|
fprintf(stderr, "lexical error: expected identifier (possibly illegal character?)\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
buf[str_index] = 0;
|
|
|
|
return buf;
|
2022-09-07 10:22:38 -07:00
|
|
|
}
|
|
|
|
|
2022-09-08 16:01:31 -07:00
|
|
|
static uint32_t indent_level = 0;
|
|
|
|
static uint32_t pending_level = 0;
|
|
|
|
static _Bool level_is_block[MAX_INDENTS] = {true};
|
|
|
|
// going back to a previous indentation level.
|
|
|
|
// if we're going back, then we insert a terminator.
|
|
|
|
static _Bool going_back = false;
|
|
|
|
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
static struct token lex(void) {
|
2022-09-08 16:01:31 -07:00
|
|
|
char c = peekc();
|
|
|
|
if (is_newline(c)) {
|
|
|
|
indent_level = lex_indentation();
|
|
|
|
if (indent_level <= pending_level) {
|
|
|
|
going_back = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
while (indent_level > pending_level) {
|
|
|
|
pending_level++;
|
|
|
|
if (level_is_block[pending_level]) {
|
|
|
|
return simple(TOK_OPEN_BLOCK);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
while (indent_level < pending_level) {
|
|
|
|
_Bool was_block = level_is_block[pending_level];
|
|
|
|
level_is_block[pending_level] = false;
|
|
|
|
pending_level--;
|
|
|
|
if (was_block) {
|
|
|
|
return simple(TOK_CLOSE_BLOCK);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (going_back) {
|
|
|
|
going_back = false;
|
|
|
|
if (level_is_block[indent_level]) {
|
|
|
|
return simple(TOK_TERMINATOR);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
c = peekc();
|
|
|
|
while (is_indent(c)) {
|
|
|
|
nextc();
|
|
|
|
c = peekc();
|
|
|
|
}
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
_Bool sign = false;
|
|
|
|
switch (c) {
|
|
|
|
case 0:
|
2022-09-08 16:01:31 -07:00
|
|
|
nextc();
|
2022-09-07 10:22:38 -07:00
|
|
|
return simple(TOK_EOF);
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
case '"': {
|
2022-09-08 16:01:31 -07:00
|
|
|
nextc();
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
union token_data data;
|
|
|
|
data.string = lex_string();
|
|
|
|
struct token tok = { TOK_STRING, data };
|
|
|
|
return tok;
|
|
|
|
}
|
|
|
|
case '\'': {
|
2022-09-08 16:01:31 -07:00
|
|
|
nextc();
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
union token_data data;
|
|
|
|
data.label = lex_identifier();
|
|
|
|
struct token tok = { TOK_LABEL, data };
|
|
|
|
return tok;
|
2022-09-07 10:22:38 -07:00
|
|
|
}
|
|
|
|
case '{':
|
2022-09-08 16:01:31 -07:00
|
|
|
nextc();
|
2022-09-07 10:22:38 -07:00
|
|
|
return simple(TOK_OPEN_BLOCK);
|
|
|
|
case '}':
|
2022-09-08 16:01:31 -07:00
|
|
|
nextc();
|
2022-09-07 10:22:38 -07:00
|
|
|
return simple(TOK_CLOSE_BLOCK);
|
|
|
|
case '(':
|
2022-09-08 16:01:31 -07:00
|
|
|
nextc();
|
2022-09-07 10:22:38 -07:00
|
|
|
return simple(TOK_OPEN_GROUP);
|
|
|
|
case ')':
|
2022-09-08 16:01:31 -07:00
|
|
|
nextc();
|
2022-09-07 10:22:38 -07:00
|
|
|
return simple(TOK_CLOSE_GROUP);
|
|
|
|
case ';':
|
2022-09-08 16:01:31 -07:00
|
|
|
nextc();
|
2022-09-07 10:22:38 -07:00
|
|
|
return simple(TOK_TERMINATOR);
|
|
|
|
case ',':
|
2022-09-08 16:01:31 -07:00
|
|
|
nextc();
|
2022-09-07 10:22:38 -07:00
|
|
|
return simple(TOK_SEPARATOR);
|
|
|
|
case '=':
|
2022-09-08 16:01:31 -07:00
|
|
|
nextc();
|
2022-09-07 10:22:38 -07:00
|
|
|
return op(OP_EQ);
|
|
|
|
case '+':
|
2022-09-08 16:01:31 -07:00
|
|
|
nextc();
|
2022-09-07 10:22:38 -07:00
|
|
|
return op(OP_ADD);
|
|
|
|
case '-':
|
2022-09-08 16:01:31 -07:00
|
|
|
nextc();
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
if (peekc() == '>') {
|
|
|
|
nextc();
|
2022-09-07 10:22:38 -07:00
|
|
|
return op(OP_FUN);
|
|
|
|
}
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
if (is_digit(peekc())) {
|
|
|
|
return lex_integer(true);
|
|
|
|
}
|
2022-09-07 10:22:38 -07:00
|
|
|
return op(OP_SUB);
|
|
|
|
case '*':
|
2022-09-08 16:01:31 -07:00
|
|
|
nextc();
|
2022-09-07 10:22:38 -07:00
|
|
|
return op(OP_MUL);
|
|
|
|
case '/':
|
2022-09-08 16:01:31 -07:00
|
|
|
nextc();
|
2022-09-07 10:22:38 -07:00
|
|
|
return op(OP_DIV);
|
|
|
|
case '%':
|
2022-09-08 16:01:31 -07:00
|
|
|
nextc();
|
2022-09-07 10:22:38 -07:00
|
|
|
return op(OP_MOD);
|
|
|
|
case '~':
|
2022-09-08 16:01:31 -07:00
|
|
|
nextc();
|
2022-09-07 10:22:38 -07:00
|
|
|
return op(OP_INV);
|
|
|
|
case '&':
|
2022-09-08 16:01:31 -07:00
|
|
|
nextc();
|
2022-09-07 10:22:38 -07:00
|
|
|
return op(OP_AND);
|
|
|
|
case '|':
|
2022-09-08 16:01:31 -07:00
|
|
|
nextc();
|
2022-09-07 10:22:38 -07:00
|
|
|
return op(OP_OR);
|
|
|
|
case '^':
|
2022-09-08 16:01:31 -07:00
|
|
|
nextc();
|
2022-09-07 10:22:38 -07:00
|
|
|
return op(OP_XOR);
|
|
|
|
case '!':
|
2022-09-08 16:01:31 -07:00
|
|
|
nextc();
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
if (peekc() == '=') {
|
|
|
|
nextc();
|
2022-09-07 10:22:38 -07:00
|
|
|
return op(OP_NE);
|
|
|
|
}
|
|
|
|
return op(OP_NOT);
|
|
|
|
case ':':
|
2022-09-08 16:01:31 -07:00
|
|
|
nextc();
|
|
|
|
while (is_indent(peekc())) {
|
|
|
|
nextc();
|
|
|
|
}
|
|
|
|
if (is_newline(peekc())) {
|
|
|
|
level_is_block[indent_level + 1] = true;
|
|
|
|
return lex();
|
|
|
|
}
|
2022-09-07 10:22:38 -07:00
|
|
|
return op(OP_TYPE);
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
case '>':
|
2022-09-08 16:01:31 -07:00
|
|
|
nextc();
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
c = peekc();
|
|
|
|
if (c == '=') {
|
|
|
|
nextc();
|
2022-09-07 10:22:38 -07:00
|
|
|
return op(OP_GTE);
|
|
|
|
}
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
if (c == '>') {
|
|
|
|
nextc();
|
|
|
|
if (peekc() == '>') {
|
|
|
|
nextc();
|
|
|
|
return op(OP_SHR);
|
|
|
|
}
|
|
|
|
return op(OP_SAR);
|
|
|
|
}
|
2022-09-07 10:22:38 -07:00
|
|
|
return op(OP_GT);
|
|
|
|
case '<':
|
2022-09-08 16:01:31 -07:00
|
|
|
nextc();
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
c = peekc();
|
|
|
|
if (c == '<') {
|
|
|
|
nextc();
|
2022-09-07 10:22:38 -07:00
|
|
|
return op(OP_SHL);
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
}
|
|
|
|
if (c == '=') {
|
|
|
|
nextc();
|
2022-09-07 10:22:38 -07:00
|
|
|
return op(OP_LTE);
|
|
|
|
}
|
|
|
|
return op(OP_LT);
|
|
|
|
}
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
if (is_digit(c)) {
|
|
|
|
return lex_integer(false);
|
|
|
|
}
|
2022-09-07 10:22:38 -07:00
|
|
|
union token_data data;
|
|
|
|
data.name = lex_identifier();
|
|
|
|
struct token tok = { TOK_NAME, data };
|
|
|
|
return tok;
|
|
|
|
}
|
2022-09-07 20:42:37 -07:00
|
|
|
|
2022-09-08 16:01:31 -07:00
|
|
|
// False until the one-token lookahead in peek_buf has been primed.
static _Bool init = false;
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
// One-token lookahead: the token that the next call to next() will return,
// and the value peek() reports.
static struct token peek_buf;
|
|
|
|
|
|
|
|
struct token next(void) {
|
2022-09-08 16:01:31 -07:00
|
|
|
if (!init) {
|
|
|
|
init = true;
|
|
|
|
indent_level = lex_indentation();
|
|
|
|
next();
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
}
|
2022-09-08 16:01:31 -07:00
|
|
|
struct token tmp = peek_buf;
|
|
|
|
peek_buf = lex();
|
|
|
|
return tmp;
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
struct token peek(void) {
|
|
|
|
return peek_buf;
|
|
|
|
}
|
|
|
|
|
2022-09-07 20:42:37 -07:00
|
|
|
void print_token(struct token tok) {
|
|
|
|
switch (tok.type) {
|
|
|
|
case TOK_NAME:
|
|
|
|
fprintf(stdout, "%s", tok.data.name);
|
|
|
|
break;
|
|
|
|
case TOK_LABEL:
|
|
|
|
fprintf(stdout, "'%s", tok.data.label);
|
|
|
|
break;
|
|
|
|
case TOK_INTEGER:
|
|
|
|
fprintf(stdout, "%zi", tok.data.int_);
|
|
|
|
break;
|
|
|
|
case TOK_STRING:
|
|
|
|
fprintf(stdout, "\"%s\"", tok.data.string);
|
|
|
|
break;
|
|
|
|
case TOK_OPEN_GROUP:
|
|
|
|
fprintf(stdout, "(");
|
|
|
|
break;
|
|
|
|
case TOK_CLOSE_GROUP:
|
|
|
|
fprintf(stdout, ")");
|
|
|
|
break;
|
|
|
|
case TOK_OPEN_BLOCK:
|
|
|
|
fprintf(stdout, "{");
|
|
|
|
break;
|
|
|
|
case TOK_CLOSE_BLOCK:
|
|
|
|
fprintf(stdout, "}");
|
|
|
|
break;
|
|
|
|
case TOK_TERMINATOR:
|
|
|
|
fprintf(stdout, ";");
|
|
|
|
break;
|
|
|
|
case TOK_SEPARATOR:
|
|
|
|
fprintf(stdout, ",");
|
|
|
|
break;
|
|
|
|
case TOK_OPERATOR:
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
fprintf(stdout, "OP:%i", tok.data.op);
|
2022-09-07 20:42:37 -07:00
|
|
|
break;
|
|
|
|
case TOK_EOF:
|
|
|
|
fprintf(stdout, "EOF");
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|