2022-09-07 20:42:37 -07:00
|
|
|
#include <assert.h>
|
|
|
|
#include <stdbool.h>
|
|
|
|
#include <stdint.h>
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <string.h>
|
|
|
|
|
|
|
|
#include "lex.h"
|
|
|
|
#include "parse.h"
|
|
|
|
|
|
|
|
/*
 * States of the parser's explicit context stack (see parse()).
 * Each comment describes what parse() does when the state is popped.
 */
enum state {
    ST_BLOCK = 0,       /* requires TOK_OPEN_BLOCK, then a body and a close */
    ST_BLOCK_BODY,      /* one assignment or expression statement, or nothing */
    ST_BLOCK_CONT,      /* a TOK_TERMINATOR here continues with another body */
    ST_BLOCK_CLOSE,     /* requires TOK_CLOSE_BLOCK */
    ST_ASSIGN,          /* positioned on the `=` of an assignment; an expression follows */
    ST_EXPR,            /* start of an expression */
    ST_EXPR_CONT,       /* after an operand: a binary operator or another expression may follow */
    ST_GROUP,           /* requires TOK_CLOSE_GROUP */
    ST_IF_ELSE,         /* optional `else` followed by a block */
    ST_LOOP_VARS,       /* optional loop variable: a name or an assignment */
    ST_LOOP_VARS_CONT,  /* a TOK_SEPARATOR here continues with another loop variable */
};
|
|
|
|
|
|
|
|
const char* state_name(enum state st) {
|
|
|
|
switch (st) {
|
|
|
|
case ST_BLOCK:
|
2022-09-08 16:01:31 -07:00
|
|
|
return "{";
|
|
|
|
case ST_BLOCK_BODY:
|
|
|
|
return "B";
|
|
|
|
case ST_BLOCK_CONT:
|
2022-09-07 20:42:37 -07:00
|
|
|
return ";";
|
2022-09-08 16:01:31 -07:00
|
|
|
case ST_BLOCK_CLOSE:
|
|
|
|
return "}";
|
2022-09-07 20:42:37 -07:00
|
|
|
case ST_ASSIGN:
|
|
|
|
return "=";
|
|
|
|
case ST_EXPR:
|
|
|
|
return "x";
|
|
|
|
case ST_EXPR_CONT:
|
|
|
|
return "c";
|
|
|
|
case ST_GROUP:
|
|
|
|
return "(";
|
|
|
|
case ST_IF_ELSE:
|
|
|
|
return "|";
|
|
|
|
case ST_LOOP_VARS:
|
|
|
|
return "v";
|
|
|
|
case ST_LOOP_VARS_CONT:
|
|
|
|
return ",";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#define MAX_CONTEXT 256
|
|
|
|
static uint32_t sp = 0;
|
|
|
|
static enum state stack[MAX_CONTEXT];
|
|
|
|
|
|
|
|
static void debug_print(struct token tok, struct token next) {
|
|
|
|
for (uint32_t i = 0; i < sp; i++) {
|
|
|
|
printf("%s", state_name(stack[i]));
|
|
|
|
}
|
|
|
|
printf(" ");
|
|
|
|
print_token(tok);
|
|
|
|
printf(" ");
|
|
|
|
print_token(next);
|
|
|
|
printf("\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static void push(enum state state) {
|
|
|
|
stack[sp] = state;
|
|
|
|
sp++;
|
|
|
|
}
|
|
|
|
|
|
|
|
static enum state pop(void) {
|
|
|
|
assert(sp != 0);
|
|
|
|
sp--;
|
|
|
|
return stack[sp];
|
|
|
|
}
|
|
|
|
|
|
|
|
static _Bool is_assignment(struct token tok, struct token next) {
|
|
|
|
return tok.type == TOK_NAME && next.type == TOK_OPERATOR && next.data.op == OP_EQ;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static _Bool is_expr(struct token tok) {
|
|
|
|
if (is_lit(tok) || tok.type == TOK_OPEN_GROUP) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return tok.type == TOK_NAME;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Prints "syntax error: <msg>" to stderr and exits with status 1.
// Wrapped in do { } while (0) so the macro expands to a single statement;
// without it, `if (cond) syntax_error("...");` would exit unconditionally.
#define syntax_error(msg) \
    do { \
        fprintf(stderr, "syntax error: %s\n", (msg)); \
        exit(1); \
    } while (0)
|
|
|
|
|
|
|
|
void parse(void) {
|
2022-09-08 16:01:31 -07:00
|
|
|
sp = 0;
|
2022-09-07 20:42:37 -07:00
|
|
|
// TODO: add support for the top-level instead of this block hack
|
2022-09-08 16:01:31 -07:00
|
|
|
push(ST_BLOCK_BODY);
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
struct token tok = next();
|
2022-09-08 16:01:31 -07:00
|
|
|
struct token nxt = peek();
|
2022-09-07 20:42:37 -07:00
|
|
|
while (sp > 0) {
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
debug_print(tok, nxt);
|
2022-09-07 20:42:37 -07:00
|
|
|
// FIXME: stack underflow because we're faking the top-level with blocks
|
|
|
|
switch (pop()) {
|
|
|
|
case ST_BLOCK:
|
2022-09-08 16:01:31 -07:00
|
|
|
if (tok.type == TOK_OPEN_BLOCK) {
|
|
|
|
push(ST_BLOCK_CLOSE);
|
|
|
|
push(ST_BLOCK_BODY);
|
2022-09-07 20:42:37 -07:00
|
|
|
break;
|
|
|
|
}
|
2022-09-08 16:01:31 -07:00
|
|
|
syntax_error("expected beginning of block");
|
|
|
|
break;
|
|
|
|
case ST_BLOCK_BODY:
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
if (is_assignment(tok, nxt)) {
|
2022-09-08 16:01:31 -07:00
|
|
|
push(ST_BLOCK_CONT);
|
2022-09-07 20:42:37 -07:00
|
|
|
push(ST_ASSIGN);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (is_expr(tok)) {
|
2022-09-08 16:01:31 -07:00
|
|
|
push(ST_BLOCK_CONT);
|
2022-09-07 20:42:37 -07:00
|
|
|
push(ST_EXPR);
|
|
|
|
continue;
|
|
|
|
}
|
2022-09-08 16:01:31 -07:00
|
|
|
continue;
|
|
|
|
case ST_BLOCK_CONT:
|
|
|
|
if (tok.type == TOK_TERMINATOR) {
|
|
|
|
push(ST_BLOCK_BODY);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
case ST_BLOCK_CLOSE:
|
|
|
|
if (tok.type == TOK_CLOSE_BLOCK) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
syntax_error("expected end of block");
|
2022-09-07 20:42:37 -07:00
|
|
|
case ST_ASSIGN:
|
|
|
|
assert(tok.type == TOK_OPERATOR || tok.data.op == OP_EQ);
|
|
|
|
push(ST_EXPR);
|
|
|
|
break;
|
|
|
|
case ST_EXPR:
|
|
|
|
if (tok.type == TOK_STRING) {
|
2022-09-08 16:01:31 -07:00
|
|
|
push(ST_EXPR_CONT);
|
2022-09-07 20:42:37 -07:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (tok.type == TOK_INTEGER) {
|
2022-09-08 16:01:31 -07:00
|
|
|
push(ST_EXPR_CONT);
|
2022-09-07 20:42:37 -07:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (tok.type == TOK_NAME) {
|
|
|
|
char* name = tok.data.name;
|
|
|
|
if (strcmp(name, "if") == 0) {
|
|
|
|
push(ST_IF_ELSE);
|
2022-09-08 16:01:31 -07:00
|
|
|
push(ST_BLOCK);
|
2022-09-07 20:42:37 -07:00
|
|
|
push(ST_EXPR);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (strcmp(name, "loop") == 0) {
|
2022-09-08 16:01:31 -07:00
|
|
|
push(ST_BLOCK);
|
2022-09-07 20:42:37 -07:00
|
|
|
push(ST_LOOP_VARS);
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
if (nxt.type == TOK_LABEL) {
|
2022-09-08 16:01:31 -07:00
|
|
|
next();
|
2022-09-07 20:42:37 -07:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (strcmp(name, "next") == 0) {
|
|
|
|
push(ST_LOOP_VARS);
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
if (nxt.type == TOK_LABEL) {
|
2022-09-08 16:01:31 -07:00
|
|
|
next();
|
2022-09-07 20:42:37 -07:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (strcmp(name, "exit") == 0) {
|
|
|
|
push(ST_EXPR);
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
if (nxt.type == TOK_LABEL) {
|
2022-09-08 16:01:31 -07:00
|
|
|
next();
|
2022-09-07 20:42:37 -07:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (strcmp(name, "return") == 0) {
|
|
|
|
push(ST_EXPR);
|
|
|
|
break;
|
|
|
|
}
|
2022-09-08 16:01:31 -07:00
|
|
|
push(ST_EXPR_CONT);
|
2022-09-07 20:42:37 -07:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (tok.type == TOK_OPEN_GROUP) {
|
2022-09-08 16:01:31 -07:00
|
|
|
push(ST_EXPR_CONT);
|
2022-09-07 20:42:37 -07:00
|
|
|
push(ST_GROUP);
|
|
|
|
push(ST_EXPR);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (tok.type == TOK_OPERATOR && is_unary(tok.data.op)) {
|
2022-09-08 16:01:31 -07:00
|
|
|
push(ST_EXPR_CONT);
|
2022-09-07 20:42:37 -07:00
|
|
|
push(ST_EXPR);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
syntax_error("expected expression");
|
|
|
|
case ST_EXPR_CONT:
|
|
|
|
if (is_expr(tok)) {
|
|
|
|
push(ST_EXPR);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (tok.type == TOK_OPERATOR && is_binary(tok.data.op)) {
|
|
|
|
push(ST_EXPR);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
case ST_GROUP:
|
|
|
|
if (tok.type == TOK_CLOSE_GROUP) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
syntax_error("mismatched parentheses");
|
|
|
|
case ST_IF_ELSE:
|
|
|
|
if (tok.type == TOK_NAME && strcmp(tok.data.name, "else") == 0) {
|
2022-09-08 16:01:31 -07:00
|
|
|
push(ST_BLOCK);
|
2022-09-07 20:42:37 -07:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
case ST_LOOP_VARS:
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
if (is_assignment(tok, nxt)) {
|
2022-09-07 20:42:37 -07:00
|
|
|
push(ST_LOOP_VARS_CONT);
|
|
|
|
push(ST_ASSIGN);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (tok.type == TOK_NAME) {
|
|
|
|
push(ST_LOOP_VARS_CONT);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
case ST_LOOP_VARS_CONT:
|
|
|
|
if (tok.type == TOK_SEPARATOR) {
|
|
|
|
push(ST_LOOP_VARS);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
tok = next();
|
2022-09-08 16:01:31 -07:00
|
|
|
nxt = peek();
|
2022-09-07 20:42:37 -07:00
|
|
|
}
|
|
|
|
if (tok.type != TOK_EOF) {
|
|
|
|
fprintf(stderr, "syntax error: finished parsing before end of file\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
if (sp > 0) {
|
|
|
|
fprintf(stderr, "syntax error: unfinished business at end of file: %i, %i\n", sp, stack[0]);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
}
|