diff --git a/Makefile b/Makefile index 729d4e7..709ab72 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ SHELL = /bin/sh CFLAGS = -std=c99 -pedantic -Wextra -Os LDFLAGS = -lc -OBJECTS = asm.o io.o ir.o lex.o main.o parse.o x86encode.o +OBJECTS = asm.o io.o ir.o lex.o lex/indent.o main.o parse.o x86encode.o .PHONY: passc passc: .bin $(OBJECTS) @@ -14,11 +14,16 @@ passc: .bin $(OBJECTS) .PHONY: .bin .bin: - @mkdir -p bin/obj + @mkdir -p bin/obj bin/obj/lex %.o : src/%.c $(CC) -c $(CFLAGS) $< -o bin/obj/$@ +# if you know how to avoid this duplication for subdirectories, +# please send a patch or tell me how! +lex/indent.o : src/lex/indent.c + $(CC) -c $(CFLAGS) $< -o bin/obj/$@ + .PHONY: clean clean: @-rm -rf bin diff --git a/docs/syntax.md b/docs/syntax.md index ff9483c..d551708 100644 --- a/docs/syntax.md +++ b/docs/syntax.md @@ -3,8 +3,8 @@ The grammar is LL(1). ```ebnf -block = open-block, ("pass" | block-body), close-block ; -block-body = stmt, [{ terminator, stmt }] ; +block = open-block, block-body, close-block ; +block-body = stmt, [{ terminator, stmt }] ; stmt = assignment | expr ; assignment = var, [":", expr], "=", expr ; expr = "if", expr, block, [ "else", block ] @@ -32,6 +32,9 @@ then the lexer is regular. If you use indentation-sensitive syntax, then lexing is context-sensitive. ```ebnf +open-block = "{" | ":", ? indentation-based ? ; +close-block = "}" | ? indentation-based ? ; +terminator = ";" | ? indentation-based ? ; unop = "-" | "~" | "!" ; (* arithmetic *) binop = "+" | "-" | "*" | "/" | "%" @@ -45,7 +48,7 @@ binop = "+" | "-" | "*" | "/" | "%" num = ["-"], { decimal-digit | "," }, ["#", { digit | "," }] ; string = '"', [{ -('"' | newline }], '"' ; label = "'", identifier ; -identifier = alpha, [{ alphanumeric }] ; +identifier = alpha, [{ alphanumeric | "_" }] ; alpha = ? 'A'..'Z' | 'a'..'z' ? ; decimal-digit = ? '0'..'9' ? 
; diff --git a/src/io.c b/src/io.c index aaea51b..db45be5 100644 --- a/src/io.c +++ b/src/io.c @@ -106,39 +106,35 @@ void patch_i32(size_t off, int32_t x) { patch_u32(off, (uint32_t) x); } +static _Bool init = false; static char peek_buf; -static _Bool peeked; static char next_(void) { int c = getc(infile); if (c == EOF) { if (ferror(infile)) { fprintf(stderr, "failed to read source file: %s\n", strerror(errno)); exit(1); } - return 0; + c = 0; } return c; } char nextc(void) { - if (peeked) { - peeked = false; - return peek_buf; + if (!init) { + init = true; + peek_buf = next_(); } - return next_(); -} - -void unnextc(char c) { - assert(!peeked); - peek_buf = c; - peeked = true; + char tmp = peek_buf; /* the character being consumed */ + peek_buf = next_(); + return tmp; } char peekc(void) { - if (!peeked) { - peek_buf = next_(); - peeked = true; + if (!init) { + init = true; + peek_buf = next_(); /* prime the lookahead without consuming it */ } return peek_buf; } diff --git a/src/lex.c b/src/lex.c index 0e83443..d140c93 100644 --- a/src/lex.c +++ b/src/lex.c @@ -1,11 +1,10 @@ #include -#include -#include #include #include #include #include "lex.h" +#include "lex/indent.h" #include "io.h" _Bool is_unary(enum operator_ op) { @@ -40,10 +39,6 @@ _Bool is_lit(struct token tok) { return tok.type == TOK_INTEGER || tok.type == TOK_STRING || tok.type == TOK_NAME; } -static _Bool is_whitespace(char c) { - return c == ' ' || c == '\t' || c == '\r' || c == '\n'; -} - static _Bool is_alpha(char c) { return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); } @@ -52,7 +47,11 @@ static _Bool is_digit(char c) { return c >= '0' && c <= '9'; } -static _Bool id_char(char c) { +static _Bool is_alphanumeric(char c) { + return is_alpha(c) || is_digit(c); +} + +static _Bool is_id_char(char c) { return is_alpha(c) || is_digit(c) || c == '_'; } @@ -77,7 +76,7 @@ static uint8_t digit_value(uint8_t base, char c) { } static _Bool is_extended_digit(uint8_t base, char c) { - if (!is_digit(c) && !is_alpha(c)) { + if (!is_alphanumeric(c)) { return 
false; } uint8_t val = digit_value(base, c); @@ -118,19 +117,15 @@ static uint64_t lex_digits(uint8_t base) { static struct token lex_integer(_Bool sign) { uint64_t acc = lex_digits(10); if (peekc() == '#') { - if (acc < 2) { - fprintf(stderr, "lexical error: integer literal base too small\n"); - exit(1); - } - if (acc > 36) { - fprintf(stderr, "lexical error: integer literal base too large\n"); + if (acc != 2 && acc != 8 && acc != 10 && acc != 16) { /* only bases 2, 8, 10 and 16 are legal */ + fprintf(stderr, "lexical error: illegal integer literal base\n"); exit(1); } nextc(); acc = lex_digits((uint8_t) acc); } if (sign && acc > INT64_MAX) { - fprintf(stderr, "lexical error: integer literal overflow due to sign\n"); + fprintf(stderr, "lexical error: signed integer literal overflow\n"); exit(1); } int64_t val = (int64_t) acc; @@ -192,7 +187,7 @@ static char* lex_identifier(void) { char* buf = str_buf(); while (true) { char c = peekc(); - if (!is_alpha(c) && !is_digit(c) && c != '_') break; + if (!is_id_char(c)) break; nextc(); buf[str_index] = c; str_index++; @@ -205,44 +200,91 @@ static char* lex_identifier(void) { return buf; } +static uint32_t indent_level = 0; +static uint32_t pending_level = 0; +static _Bool level_is_block[MAX_INDENTS] = {true}; +// going back to a previous indentation level. +// if we're going back, then we insert a terminator. 
+static _Bool going_back = false; + static struct token lex(void) { - char c; - do { - c = nextc(); - } while (is_whitespace(c)); + char c = peekc(); + if (is_newline(c)) { + indent_level = lex_indentation(); + if (indent_level <= pending_level) { + going_back = true; + } + } + while (indent_level > pending_level) { + pending_level++; + if (level_is_block[pending_level]) { + return simple(TOK_OPEN_BLOCK); + } + } + while (indent_level < pending_level) { + _Bool was_block = level_is_block[pending_level]; + level_is_block[pending_level] = false; + pending_level--; + if (was_block) { + return simple(TOK_CLOSE_BLOCK); + } + } + if (going_back) { + going_back = false; + if (level_is_block[indent_level]) { + return simple(TOK_TERMINATOR); + } + } + c = peekc(); + while (is_indent(c)) { + nextc(); + c = peekc(); + } _Bool sign = false; switch (c) { case 0: + nextc(); return simple(TOK_EOF); case '"': { + nextc(); union token_data data; data.string = lex_string(); struct token tok = { TOK_STRING, data }; return tok; } case '\'': { + nextc(); union token_data data; data.label = lex_identifier(); struct token tok = { TOK_LABEL, data }; return tok; } case '{': + nextc(); return simple(TOK_OPEN_BLOCK); case '}': + nextc(); return simple(TOK_CLOSE_BLOCK); case '(': + nextc(); return simple(TOK_OPEN_GROUP); case ')': + nextc(); return simple(TOK_CLOSE_GROUP); case ';': + nextc(); return simple(TOK_TERMINATOR); case ',': + nextc(); return simple(TOK_SEPARATOR); case '=': + nextc(); return op(OP_EQ); case '+': + nextc(); return op(OP_ADD); case '-': + nextc(); if (peekc() == '>') { nextc(); return op(OP_FUN); @@ -252,28 +294,45 @@ static struct token lex(void) { } return op(OP_SUB); case '*': + nextc(); return op(OP_MUL); case '/': + nextc(); return op(OP_DIV); case '%': + nextc(); return op(OP_MOD); case '~': + nextc(); return op(OP_INV); case '&': + nextc(); return op(OP_AND); case '|': + nextc(); return op(OP_OR); case '^': + nextc(); return op(OP_XOR); case '!': + nextc(); if 
(peekc() == '=') { nextc(); return op(OP_NE); } return op(OP_NOT); case ':': + nextc(); + while (is_indent(peekc())) { + nextc(); + } + if (is_newline(peekc())) { + level_is_block[indent_level + 1] = true; + return lex(); + } return op(OP_TYPE); case '>': + nextc(); c = peekc(); if (c == '=') { nextc(); @@ -289,6 +348,7 @@ static struct token lex(void) { } return op(OP_GT); case '<': + nextc(); c = peekc(); if (c == '<') { nextc(); @@ -300,7 +360,6 @@ static struct token lex(void) { } return op(OP_LT); } - unnextc(c); if (is_digit(c)) { return lex_integer(false); } @@ -310,28 +369,21 @@ static struct token lex(void) { return tok; } -static _Bool peeked = false; +static _Bool init = false; static struct token peek_buf; struct token next(void) { - if (peeked) { - peeked = false; - return peek_buf; + if (!init) { + init = true; + indent_level = lex_indentation(); + next(); } - return lex(); -} - -void unnext(struct token tok) { - assert(!peeked); - peeked = true; - peek_buf = tok; + struct token tmp = peek_buf; + peek_buf = lex(); + return tmp; } struct token peek(void) { - if (!peeked) { - peek_buf = lex(); - peeked = true; - } return peek_buf; } diff --git a/src/lex.h b/src/lex.h index 488c61b..ed35e61 100644 --- a/src/lex.h +++ b/src/lex.h @@ -65,7 +65,6 @@ _Bool is_binary(enum operator_ op); _Bool is_lit(struct token tok); struct token next(void); -void unnext(struct token tok); struct token peek(void); void print_token(struct token tok); diff --git a/src/lex/indent.c b/src/lex/indent.c new file mode 100644 index 0000000..a6defd5 --- /dev/null +++ b/src/lex/indent.c @@ -0,0 +1,166 @@ +/// +/// See `docs/syntax.md#indentation-levels` for an explanation of the indent level algorithm. 
+/// + +#include +#include +#include +#include + +#include "../io.h" +#include "indent.h" + +_Bool is_indent(char c) { + return c == ' ' || c == '\t'; +} + +_Bool is_newline(char c) { + return c == '\r' || c == '\n'; +} + +enum indent_type { + INDENT_TABS, + INDENT_SPACES, +}; + +struct indent { + uint32_t tabs; + uint32_t spaces; +}; + +static char good_indent(enum indent_type type) { + switch (type) { + case INDENT_TABS: + return '\t'; + case INDENT_SPACES: + return ' '; + } +} + +static char bad_indent(enum indent_type type) { + switch (type) { + case INDENT_TABS: + return ' '; + case INDENT_SPACES: + return '\t'; + } +} + +static uint32_t indent_levels = 0; +static struct indent indents[MAX_INDENTS]; +static uint32_t additional_line_length = 0; + +static _Bool tabs_allowed(void) { + return indent_levels == 0 || indents[indent_levels - 1].spaces == 0; +} + +// We only throw errors on bad indentation if the line is not empty. +// This function spins to the end of the line to determine whether to throw the error. 
+static void indent_error(enum indent_type type) { + char c = peekc(); + while (is_indent(c)) { + nextc(); + c = peekc(); + if (is_newline(c)) { + return; + } + } + switch (type) { + case INDENT_SPACES: + fprintf(stderr, "lexical error: previous line used spaces at this indentation level; this line used tabs\n"); + exit(1); + case INDENT_TABS: + fprintf(stderr, "lexical error: previous lines used tabs at this indentation level; this line used spaces\n"); + exit(1); + } +} + +static void expect_indent(enum indent_type type, uint32_t depth) { + char good = good_indent(type); + char bad = bad_indent(type); + char c = peekc(); + for (uint32_t i = 0; i < depth; i++) { + if (c == bad) { + indent_error(type); + return; + } + if (is_newline(c)) { + return; + } + if (c != good) { + fprintf(stderr, "lexical error: indentation does not match any preceding indentation level\n"); + exit(1); + } + nextc(); + c = peekc(); + } +} + +static uint32_t count_indents(enum indent_type type) { + uint32_t counter = 0; + char indent = good_indent(type); + char c = peekc(); + while (c == indent) { + counter++; + nextc(); + c = peekc(); + } + return counter; +} + +static void new_indent(void) { + struct indent indent = { 0, 0 }; + indent.tabs = count_indents(INDENT_TABS); + indent.spaces = count_indents(INDENT_SPACES); + char c = peekc(); + if (c == '\t' && (indent.spaces > 0 || !tabs_allowed())) { + fprintf(stderr, "lexical error: all tabs on a line must precede all spaces\n"); + exit(1); + } + if (is_newline(c)) { + return; + } + if (indent_levels == MAX_INDENTS) { + fprintf(stderr, "lexical error: too many indentation levels! 
factor your code!\n"); + exit(1); + } + indents[indent_levels] = indent; + indent_levels++; +} + +int32_t lex_indentation(void) { + uint32_t indent_level = 0; + char c = peekc(); + while (true) { + while (is_newline(c)) { + indent_level = 0; /* a fresh line starts: forget the level counted on a blank line */ + nextc(); + c = peekc(); + } + if (c == 0) { + indent_levels = 0; + return indent_level; + } + if (!is_indent(c)) { + break; + } + while (is_indent(c) && indent_level < indent_levels) { + struct indent indent = indents[indent_level]; + expect_indent(INDENT_TABS, indent.tabs); + expect_indent(INDENT_SPACES, indent.spaces); + indent_level++; + c = peekc(); + } + if (is_indent(c)) { + new_indent(); + c = peekc(); + if (!is_newline(c)) { + /* new_indent() has already stored this level and bumped indent_levels */ + return indent_levels; + } + } + c = peekc(); + } + indent_levels = indent_level; + return indent_levels; +} diff --git a/src/lex/indent.h b/src/lex/indent.h new file mode 100644 index 0000000..5504d4a --- /dev/null +++ b/src/lex/indent.h @@ -0,0 +1,14 @@ +#ifndef LEX_INDENT_H +#define LEX_INDENT_H + +#include +#include + +#define MAX_INDENTS 32 + +_Bool is_newline(char c); +_Bool is_indent(char c); + +int32_t lex_indentation(void); + +#endif diff --git a/src/parse.c b/src/parse.c index bf6e0a6..872590f 100644 --- a/src/parse.c +++ b/src/parse.c @@ -9,8 +9,10 @@ #include "parse.h" enum state { - ST_BLOCK_OPEN, ST_BLOCK, + ST_BLOCK_BODY, + ST_BLOCK_CONT, + ST_BLOCK_CLOSE, ST_ASSIGN, ST_EXPR, ST_EXPR_CONT, @@ -18,15 +20,18 @@ enum state { ST_IF_ELSE, ST_LOOP_VARS, ST_LOOP_VARS_CONT, - ST_LABEL, }; const char* state_name(enum state st) { switch (st) { - case ST_BLOCK_OPEN: - return "{"; case ST_BLOCK: + return "{"; + case ST_BLOCK_BODY: + return "B"; + case ST_BLOCK_CONT: return ";"; + case ST_BLOCK_CLOSE: + return "}"; case ST_ASSIGN: return "="; case ST_EXPR: @@ -41,8 +46,6 @@ const char* state_name(enum state st) { return "v"; case ST_LOOP_VARS_CONT: return ","; - case ST_LABEL: - return "'"; } } @@ -88,79 +91,86 @@ static _Bool is_expr(struct token tok) { #define 
syntax_error(msg) fprintf(stderr, "syntax error: %s\n", msg); exit(1) void parse(void) { + sp = 0; // TODO: add support for the top-level instead of this block hack - push(ST_BLOCK); + push(ST_BLOCK_BODY); struct token tok = next(); - struct token nxt; + struct token nxt = peek(); while (sp > 0) { - nxt = peek(); debug_print(tok, nxt); // FIXME: stack underflow because we're faking the top-level with blocks switch (pop()) { - case ST_BLOCK_OPEN: - if (tok.type != TOK_OPEN_BLOCK) { - syntax_error("expected open block (`{`)"); - } - push(ST_BLOCK); - break; case ST_BLOCK: - if (tok.type == TOK_CLOSE_BLOCK) { - break; - } - if (tok.type == TOK_TERMINATOR) { - push(ST_BLOCK); + if (tok.type == TOK_OPEN_BLOCK) { + push(ST_BLOCK_CLOSE); + push(ST_BLOCK_BODY); break; } + syntax_error("expected beginning of block"); + break; + case ST_BLOCK_BODY: if (is_assignment(tok, nxt)) { - push(ST_BLOCK); + push(ST_BLOCK_CONT); push(ST_ASSIGN); break; } if (is_expr(tok)) { - push(ST_BLOCK); + push(ST_BLOCK_CONT); push(ST_EXPR); continue; } - break; + continue; + case ST_BLOCK_CONT: + if (tok.type == TOK_TERMINATOR) { + push(ST_BLOCK_BODY); + break; + } + continue; + case ST_BLOCK_CLOSE: + if (tok.type == TOK_CLOSE_BLOCK) { + break; + } + syntax_error("expected end of block"); case ST_ASSIGN: assert(tok.type == TOK_OPERATOR || tok.data.op == OP_EQ); push(ST_EXPR); break; case ST_EXPR: - push(ST_EXPR_CONT); if (tok.type == TOK_STRING) { + push(ST_EXPR_CONT); break; } if (tok.type == TOK_INTEGER) { + push(ST_EXPR_CONT); break; } if (tok.type == TOK_NAME) { char* name = tok.data.name; if (strcmp(name, "if") == 0) { push(ST_IF_ELSE); - push(ST_BLOCK_OPEN); + push(ST_BLOCK); push(ST_EXPR); break; } if (strcmp(name, "loop") == 0) { - push(ST_BLOCK_OPEN); + push(ST_BLOCK); push(ST_LOOP_VARS); if (nxt.type == TOK_LABEL) { - push(ST_LABEL); + next(); } break; } if (strcmp(name, "next") == 0) { push(ST_LOOP_VARS); if (nxt.type == TOK_LABEL) { - push(ST_LABEL); + next(); } break; } if 
(strcmp(name, "exit") == 0) { push(ST_EXPR); if (nxt.type == TOK_LABEL) { - push(ST_LABEL); + next(); } break; } @@ -168,14 +178,17 @@ void parse(void) { push(ST_EXPR); break; } + push(ST_EXPR_CONT); break; } if (tok.type == TOK_OPEN_GROUP) { + push(ST_EXPR_CONT); push(ST_GROUP); push(ST_EXPR); break; } if (tok.type == TOK_OPERATOR && is_unary(tok.data.op)) { + push(ST_EXPR_CONT); push(ST_EXPR); break; } @@ -197,7 +210,7 @@ void parse(void) { syntax_error("mismatched parentheses"); case ST_IF_ELSE: if (tok.type == TOK_NAME && strcmp(tok.data.name, "else") == 0) { - push(ST_BLOCK_OPEN); + push(ST_BLOCK); break; } continue; @@ -218,11 +231,9 @@ void parse(void) { break; } continue; - case ST_LABEL: - assert(tok.type == TOK_LABEL); - break; } tok = next(); + nxt = peek(); } if (tok.type != TOK_EOF) { fprintf(stderr, "syntax error: finished parsing before end of file\n");