Implemented parser! Recognition only, no output.

Also no top-level declarations or operator precedence.

The syntax is LL(1). LL syntax seems necessary because
our codegen requires emitting certain code (e.g. entering control)
prior to any codegen inside that context, whereas something like
LR would presumably parse the inner expression before recognizing
the control structure. There may be some way to work around this;
I don't know, I'm not a parsing expert.

Certain parts of the syntax are wonky, e.g. juxtaposition as
function application means a missing semicolon can give confusing
results. I suspect indentation-sensitive syntax would work
more nicely, and intend to implement it some time in the future.
master
James T. Martin 2022-09-07 20:42:37 -07:00
parent 162683d63e
commit d7c0eef7ae
Signed by: james
GPG Key ID: D6FB2F9892F9B225
9 changed files with 341 additions and 50 deletions

View File

@ -6,7 +6,7 @@ SHELL = /bin/sh
CFLAGS = -std=c99 -pedantic -Wextra -Os
LDFLAGS = -lc
OBJECTS = asm.o io.o ir.o lex.o main.o x86encode.o
OBJECTS = asm.o io.o ir.o lex.o main.o parse.o x86encode.o
.PHONY: passc
passc: .bin $(OBJECTS)

View File

@ -2,6 +2,7 @@
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#ifdef __unix__

View File

@ -1,8 +1,8 @@
#ifndef _IO_H
#define _IO_H
#ifndef IO_H
#define IO_H
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
extern uint32_t here;

View File

@ -28,6 +28,7 @@ label enter(uint32_t retc);
/// plus the return values declared by the call to `enter`.
void leave(var* args);
label declare_continue(uint32_t retc);
/// Declare a new label in the innermost block.
///

View File

@ -1,12 +1,45 @@
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include "lex.h"
#include "io.h"
/// Report whether `op` may appear as a prefix (unary) operator.
///
/// Only OP_SUB, OP_INV and OP_NOT qualify; every other operator
/// yields false.
_Bool is_unary(enum operator_ op) {
    if (op == OP_SUB) {
        return true;
    }
    if (op == OP_INV) {
        return true;
    }
    return op == OP_NOT;
}
_Bool is_binary(enum operator_ op) {
return op == OP_EQ
|| op == OP_ADD
|| op == OP_SUB
|| op == OP_MUL
|| op == OP_DIV
|| op == OP_MOD
|| op == OP_AND
|| op == OP_OR
|| op == OP_XOR
|| op == OP_SHL
|| op == OP_SAR
|| op == OP_SHR
|| op == OP_GT
|| op == OP_LT
|| op == OP_GTE
|| op == OP_LTE
|| op == OP_NE
|| op == OP_TYPE
|| op == OP_FUN;
}
/// Report whether `tok` is an integer, a string or a name token.
_Bool is_lit(struct token tok) {
    switch (tok.type) {
    case TOK_INTEGER:
    case TOK_STRING:
    case TOK_NAME:
        return true;
    default:
        return false;
    }
}
/// Report whether `c` is a space, tab, carriage return or newline.
static _Bool is_whitespace(char c) {
    switch (c) {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
        return true;
    default:
        return false;
    }
}
@ -153,6 +186,14 @@ static struct token lex_integer(void) {
static size_t str_index;
static char str_buf[MAX_STR_LEN];
/// Copy the first `str_index` bytes of `str_buf` into a freshly
/// allocated, NUL-terminated string and return it.
///
/// The returned pointer is owned by the caller. If the allocation
/// fails, an error is printed to stderr and the process exits with
/// status 1, so the result is never NULL.
static char* leak_buf(void) {
    // FIXME: memory leak
    char* str = malloc(str_index + 1);
    if (str == NULL) {
        fprintf(stderr, "error: out of memory\n");
        exit(1);
    }
    memcpy(str, str_buf, str_index);
    str[str_index] = '\0';
    return str;
}
static char* lex_string(void) {
char* buf;
size_t len = 1;
@ -185,7 +226,7 @@ static char* lex_string(void) {
skip(1);
}
str_buf[str_index] = 0;
return str_buf;
return leak_buf();
}
static char* lex_identifier(void) {
@ -208,7 +249,7 @@ static char* lex_identifier(void) {
exit(1);
}
str_buf[str_index] = 0;
return str_buf;
return leak_buf();
}
struct token lex(void) {
@ -348,3 +389,44 @@ struct token lex(void) {
struct token tok = { TOK_NAME, data };
return tok;
}
/// Write a short textual rendering of `tok` to stdout, without a
/// trailing newline.
///
/// Names, labels (prefixed with `'`), integers and strings (in double
/// quotes) print their payload; punctuation tokens print their
/// character; operators print as `OP: <number>`; end of input prints
/// `EOF`.
void print_token(struct token tok) {
    switch (tok.type) {
    case TOK_NAME:
        printf("%s", tok.data.name);
        break;
    case TOK_LABEL:
        printf("'%s", tok.data.label);
        break;
    case TOK_INTEGER:
        printf("%zi", tok.data.int_);
        break;
    case TOK_STRING:
        printf("\"%s\"", tok.data.string);
        break;
    case TOK_OPEN_GROUP:
        fputs("(", stdout);
        break;
    case TOK_CLOSE_GROUP:
        fputs(")", stdout);
        break;
    case TOK_OPEN_BLOCK:
        fputs("{", stdout);
        break;
    case TOK_CLOSE_BLOCK:
        fputs("}", stdout);
        break;
    case TOK_TERMINATOR:
        fputs(";", stdout);
        break;
    case TOK_SEPARATOR:
        fputs(",", stdout);
        break;
    case TOK_OPERATOR:
        printf("OP: %i", tok.data.op);
        break;
    case TOK_EOF:
        fputs("EOF", stdout);
        break;
    }
}

View File

@ -1,6 +1,7 @@
#ifndef LEX_H
#define LEX_H
#include <stdbool.h>
#include <stdint.h>
enum token_type {
@ -59,6 +60,12 @@ struct token {
union token_data data;
};
_Bool is_unary(enum operator_ op);
_Bool is_binary(enum operator_ op);
_Bool is_lit(struct token tok);
struct token lex(void);
void print_token(struct token tok);
#endif

View File

@ -9,7 +9,7 @@
#include "io.h"
#include "ir.h"
#include "lex.h"
#include "parse.h"
#define ELF_HEADER_SIZE 0xb0
@ -76,48 +76,7 @@ int main(int argc, char** argv) {
}
open_files(argv[2], argv[1]);
struct token tok;
do {
tok = lex();
switch (tok.type) {
case TOK_NAME:
fprintf(stdout, "%s\n", tok.data.name);
break;
case TOK_LABEL:
fprintf(stdout, "'%s\n", tok.data.label);
break;
case TOK_INTEGER:
fprintf(stdout, "%zi\n", tok.data.int_);
break;
case TOK_STRING:
fprintf(stdout, "\"%s\"\n", tok.data.string);
break;
case TOK_OPEN_GROUP:
fprintf(stdout, "(\n");
break;
case TOK_CLOSE_GROUP:
fprintf(stdout, ")\n");
break;
case TOK_OPEN_BLOCK:
fprintf(stdout, "{\n");
break;
case TOK_CLOSE_BLOCK:
fprintf(stdout, "}\n");
break;
case TOK_TERMINATOR:
fprintf(stdout, ";\n");
break;
case TOK_SEPARATOR:
fprintf(stdout, ",\n");
break;
case TOK_OPERATOR:
fprintf(stdout, "OP: %i\n", tok.data.op);
break;
case TOK_EOF:
fprintf(stdout, "EOF\n");
break;
}
} while (tok.type != TOK_EOF);
parse();
reserve(ELF_HEADER_SIZE);
size_t entry_point = compile();

235
src/parse.c Normal file
View File

@ -0,0 +1,235 @@
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "lex.h"
#include "parse.h"
/// Parser states kept on the context stack used by `parse`.
/// Each state names what the parser still expects to match.
enum state {
// Expect `{`, then continue as ST_BLOCK.
ST_BLOCK_OPEN,
// Inside a block: statements until a closing `}`.
ST_BLOCK,
// Positioned on the `=` of an assignment; an expression follows.
ST_ASSIGN,
// Expect the start of an expression.
ST_EXPR,
// After an expression head: optionally more juxtaposed
// expressions or a binary operator followed by an expression.
ST_EXPR_CONT,
// Expect the `)` closing a parenthesized group.
ST_GROUP,
// After an `if` block: optional `else` followed by a block.
ST_IF_ELSE,
// Optional variable (name, or name `=` expression) of `loop`/`next`.
ST_LOOP_VARS,
// After a loop variable: optional `,` followed by more variables.
ST_LOOP_VARS_CONT,
// Expect a label token.
ST_LABEL,
};
/// Return a one-character mnemonic for `st`, used by the parser's
/// debug trace to render the context stack compactly.
///
/// The returned string is a static literal and must not be freed.
/// A value outside `enum state` yields "?" instead of falling off the
/// end of the function without a return value.
const char* state_name(enum state st) {
    switch (st) {
    case ST_BLOCK_OPEN:
        return "{";
    case ST_BLOCK:
        return ";";
    case ST_ASSIGN:
        return "=";
    case ST_EXPR:
        return "x";
    case ST_EXPR_CONT:
        return "c";
    case ST_GROUP:
        return "(";
    case ST_IF_ELSE:
        return "|";
    case ST_LOOP_VARS:
        return "v";
    case ST_LOOP_VARS_CONT:
        return ",";
    case ST_LABEL:
        return "'";
    }
    // No `default:` above, so the compiler can still warn when a new
    // state is added without a mnemonic.
    return "?";
}
// Capacity of the parser's context stack, in states.
#define MAX_CONTEXT 256
// Number of states currently on the stack; `stack[sp - 1]` is the top.
static uint32_t sp = 0;
// Pending parser states, innermost (next to be matched) last.
static enum state stack[MAX_CONTEXT];
/// Print one line of parser trace to stdout: the mnemonics of every
/// state on the context stack (bottom first), a space, the current
/// token, a space, the lookahead token, and a newline.
static void debug_print(struct token tok, struct token next) {
    uint32_t depth = 0;
    while (depth < sp) {
        fputs(state_name(stack[depth]), stdout);
        depth++;
    }
    putchar(' ');
    print_token(tok);
    putchar(' ');
    print_token(next);
    putchar('\n');
}
static void push(enum state state) {
stack[sp] = state;
sp++;
}
/// Remove and return the state on top of the parser's context stack.
///
/// Popping an empty stack is a programming error and trips an
/// assertion.
static enum state pop(void) {
    assert(sp != 0);
    return stack[--sp];
}
/// Report whether `tok` followed by `next` begins an assignment,
/// i.e. a name token immediately followed by the OP_EQ operator.
static _Bool is_assignment(struct token tok, struct token next) {
    if (tok.type != TOK_NAME) {
        return false;
    }
    if (next.type != TOK_OPERATOR) {
        return false;
    }
    return next.data.op == OP_EQ;
}
/// Report whether `tok` can begin an expression: anything `is_lit`
/// accepts, an opening parenthesis, or a name.
static _Bool is_expr(struct token tok) {
    return is_lit(tok)
        || tok.type == TOK_OPEN_GROUP
        || tok.type == TOK_NAME;
}
/// Print "syntax error: <msg>" to stderr and exit with status 1.
///
/// Wrapped in `do { ... } while (0)` so the expansion is a single
/// statement: it stays correct in an unbraced `if`/`else`, and the
/// `exit` can no longer escape the condition guarding the call.
#define syntax_error(msg) \
    do { \
        fprintf(stderr, "syntax error: %s\n", (msg)); \
        exit(1); \
    } while (0)
void parse(void) {
// TODO: add support for the top-level instead of this block hack
push(ST_BLOCK);
struct token tok = lex();
struct token next = lex();
while (sp > 0) {
debug_print(tok, next);
// FIXME: stack underflow because we're faking the top-level with blocks
switch (pop()) {
case ST_BLOCK_OPEN:
if (tok.type != TOK_OPEN_BLOCK) {
syntax_error("expected open block (`{`)");
}
push(ST_BLOCK);
break;
case ST_BLOCK:
if (tok.type == TOK_CLOSE_BLOCK) {
break;
}
if (tok.type == TOK_TERMINATOR) {
push(ST_BLOCK);
break;
}
if (is_assignment(tok, next)) {
push(ST_BLOCK);
push(ST_ASSIGN);
break;
}
if (is_expr(tok)) {
push(ST_BLOCK);
push(ST_EXPR);
continue;
}
break;
case ST_ASSIGN:
assert(tok.type == TOK_OPERATOR || tok.data.op == OP_EQ);
push(ST_EXPR);
break;
case ST_EXPR:
push(ST_EXPR_CONT);
if (tok.type == TOK_STRING) {
break;
}
if (tok.type == TOK_INTEGER) {
break;
}
if (tok.type == TOK_NAME) {
char* name = tok.data.name;
if (strcmp(name, "if") == 0) {
push(ST_IF_ELSE);
push(ST_BLOCK_OPEN);
push(ST_EXPR);
break;
}
if (strcmp(name, "loop") == 0) {
push(ST_BLOCK_OPEN);
push(ST_LOOP_VARS);
if (next.type == TOK_LABEL) {
push(ST_LABEL);
}
break;
}
if (strcmp(name, "next") == 0) {
push(ST_LOOP_VARS);
if (next.type == TOK_LABEL) {
push(ST_LABEL);
}
break;
}
if (strcmp(name, "exit") == 0) {
push(ST_EXPR);
if (next.type == TOK_LABEL) {
push(ST_LABEL);
}
break;
}
if (strcmp(name, "return") == 0) {
push(ST_EXPR);
break;
}
break;
}
if (tok.type == TOK_OPEN_GROUP) {
push(ST_GROUP);
push(ST_EXPR);
break;
}
if (tok.type == TOK_OPERATOR && is_unary(tok.data.op)) {
push(ST_EXPR);
break;
}
syntax_error("expected expression");
case ST_EXPR_CONT:
if (is_expr(tok)) {
push(ST_EXPR);
continue;
}
if (tok.type == TOK_OPERATOR && is_binary(tok.data.op)) {
push(ST_EXPR);
break;
}
continue;
case ST_GROUP:
if (tok.type == TOK_CLOSE_GROUP) {
break;
}
syntax_error("mismatched parentheses");
case ST_IF_ELSE:
if (tok.type == TOK_NAME && strcmp(tok.data.name, "else") == 0) {
push(ST_BLOCK_OPEN);
break;
}
continue;
case ST_LOOP_VARS:
if (is_assignment(tok, next)) {
push(ST_LOOP_VARS_CONT);
push(ST_ASSIGN);
break;
}
if (tok.type == TOK_NAME) {
push(ST_LOOP_VARS_CONT);
break;
}
continue;
case ST_LOOP_VARS_CONT:
if (tok.type == TOK_SEPARATOR) {
push(ST_LOOP_VARS);
break;
}
continue;
case ST_LABEL:
assert(tok.type == TOK_LABEL);
break;
}
tok = next;
next = lex();
}
if (tok.type != TOK_EOF) {
fprintf(stderr, "syntax error: finished parsing before end of file\n");
exit(1);
}
if (sp > 0) {
fprintf(stderr, "syntax error: unfinished business at end of file: %i, %i\n", sp, stack[0]);
exit(1);
}
}

6
src/parse.h Normal file
View File

@ -0,0 +1,6 @@
#ifndef PARSE_H
#define PARSE_H
/// Recognize the program read through `lex()`.
///
/// Produces no result; prints a debug trace to stdout while parsing.
/// On a syntax error, prints a message to stderr and exits with
/// status 1.
void parse(void);
#endif