Removed radix#int syntax, added keywords to lexer.

master
James T. Martin 2022-09-10 14:58:22 -07:00
parent 4c4ebeecfc
commit 3fe367675a
Signed by: james
GPG Key ID: D6FB2F9892F9B225
5 changed files with 216 additions and 114 deletions

View File

@ -3,15 +3,22 @@
The grammar is LL(1). The grammar is LL(1).
```ebnf ```ebnf
block = open-block, block-body, close-block block = open-block, block-body, close-block ;
block-body = stmt, [{ terminator, stmt }] ; block-body = [ stmt, [{ terminator, stmt }] ] ;
stmt = assignment | expr ; stmt = assignment | expr ;
assignment = var, [":", expr], "=", expr ; assignment = var, [":", expr], "=", expr ;
match-block = open-block, match-block-body, close-block ;
match-block-body = [ match-case, [{ terminator, match-case }] ] ;
match-case = "case", pattern-vars, block | "else", block ;
matchable-block = open-block, { match-block-body | block-body }, close-block ;
expr = "if", expr, block, [ "else", block ] expr = "if", expr, block, [ "else", block ]
| "loop", [ label ], control-vars, block | "match", [ label ], init-vars, match-block
| "next", [ label ] | "loop", [ label ], init-vars, matchable-block
| "exit", [ label ], expr | "fn", [ label ], arg-vars, matchable-block
| "return", expr | "next", [ label ], expr-cont
| "exit", [ label ], expr-cont
| "recurse", [ label ], expr-cont
| "return", [ label ], expr-cont
(* these expressions can be used as the LHS of *) (* these expressions can be used as the LHS of *)
(* a function application or binary operator. *) (* a function application or binary operator. *)
| "(", expr, ")", expr-cont | "(", expr, ")", expr-cont
@ -22,8 +29,11 @@ expr = "if", expr, block, [ "else", block ]
; ;
(* an optional binary operator or function application *) (* an optional binary operator or function application *)
expr-cont = [ binop, expr | expr ] ; expr-cont = [ binop, expr | expr ] ;
control-vars = [ control-var, [{ ",", control-var }] ] ; init-vars = [ init-var, [{ ",", init-var }] ] ;
control-var = assignment | var ; init-var = assignment | var ;
arg-vars = [{ "(", var, type-annotation, ")" }] ;
pattern-vars = ? TODO ? ;
type-annotation = ":", expr ;
``` ```
## Lexemes ## Lexemes
@ -45,12 +55,12 @@ binop = "+" | "-" | "*" | "/" | "%"
(* types *) (* types *)
| ":" | "->" | ":" | "->"
; ;
num = ["-"], { decimal-digit | "," }, ["#", { digit | "," }] ; num = ["-"], [ "0b" | "0x" ], digit, [{ ",", digit | digit }] ;
string = '"', [{ -('"' | newline) }], '"' ; string = '"', [{ -('"' | newline) }], '"' ;
label = "'", identifier ; label = "'", identifier ;
identifier = alpha, [{ alphanumeric | "_" }] ; identifier = alpha, [{ alphanumeric | "_" }] ;
alpha = ? 'A'..'Z' | 'a'..'z' ? ; alpha = ? 'A'..'Z' ? | ? 'a'..'z' ? ;
decimal-digit = ? '0'..'9' ? ; decimal-digit = ? '0'..'9' ? ;
alphanumeric = decimal-digit | alpha ; alphanumeric = decimal-digit | alpha ;
digit = alphanumeric ; digit = alphanumeric ;
@ -58,8 +68,7 @@ newline = "\r" | "\n" ;
``` ```
A number is a series of base 10 digits by default. A number is a series of base 10 digits by default.
You may use a different base using the syntax `base#digits`, You may use base 2 or base 16 via `0b100101` and `0xDEADBEEF`.
e.g. `2#100101`, `16#DEADBEEF`.
## Blocks & Terminators ## Blocks & Terminators
The rules for blocks and terminators. The rules for blocks and terminators.

View File

@ -102,7 +102,7 @@ void jump_table(size_t branches, label* labels, var index, var* args) {
} }
void jump_if(label label, var cond, var* args) { void jump_if(label label, var cond, var* args) {
assert(0); // UNIMPLEMENTED //assert(0); // UNIMPLEMENTED
} }
void load_var(reg reg, var var) { void load_var(reg reg, var var) {

163
src/lex.c
View File

@ -52,7 +52,8 @@ static _Bool is_alphanumeric(char c) {
} }
static _Bool is_id_char(char c) { static _Bool is_id_char(char c) {
return is_alpha(c) || is_digit(c) || c == '_'; // TODO: allow unicode identifiers
return is_alphanumeric(c) || c == '_';
} }
static struct token simple(enum token_type type) { static struct token simple(enum token_type type) {
@ -67,62 +68,75 @@ static struct token op(enum operator_ op) {
return tok; return tok;
} }
static uint8_t digit_value(uint8_t base, char c) { static uint8_t digit_value(char c) {
assert(base <= 36);
if (is_digit(c)) { return c - '0'; } if (is_digit(c)) { return c - '0'; }
if (c >= 'A' && c <= 'Z') { return c - 'A' + 0xA; } if (c >= 'A' && c <= 'Z') { return c - 'A' + 0xA; }
if (c >= 'a' && c <= 'z') { return c - 'a' + 0xA; } if (c >= 'a' && c <= 'z') { return c - 'a' + 0xA; }
assert(0); assert(0);
} }
static _Bool is_extended_digit(uint8_t base, char c) { static _Bool is_digit_in(uint8_t base, char c) {
if (!is_alphanumeric(c)) { if (!is_alphanumeric(c)) {
return false; return false;
} }
uint8_t val = digit_value(base, c); return digit_value(c) < base;
return val < base;
} }
static uint64_t lex_digits(uint8_t base) { static uint64_t lex_digits(uint8_t base) {
_Bool at_least_one_char = false;
uint64_t acc = 0; uint64_t acc = 0;
while (true) { while (true) {
char c = peekc(); char c = peekc();
if (!is_extended_digit(base, c)) { if (!is_digit_in(base, c)) {
// commas are legal digit separators // commas are legal digit separators
if (c == ',' && is_extended_digit(base, peekc())) { if (c == ',' && is_digit_in(base, peekc())) {
nextc(); nextc();
continue; continue;
} }
break; break;
} }
nextc(); nextc();
uint8_t digit = digit_value(base, c); uint8_t digit = digit_value(c);
// (val * base + digit) > UINT64_MAX // (val * base + digit) > UINT64_MAX
if (acc > ((UINT64_MAX - digit) / base)) { if (acc > ((UINT64_MAX - digit) / base)) {
fprintf(stderr, "lexical error: integer literal overflow\n"); fprintf(stderr, "lexical error: integer literal overflow\n");
exit(1); exit(1);
} }
at_least_one_char = true;
acc *= base; acc *= base;
acc += digit; acc += digit;
} }
if (!at_least_one_char) {
fprintf(stderr, "lexical error: expected digits\n");
exit(1);
}
return acc; return acc;
} }
static struct token integer_tok(uint64_t integer) {
union token_data data;
data.int_ = integer;
struct token tok = { TOK_INTEGER, data };
return tok;
}
static struct token lex_integer(_Bool sign) { static struct token lex_integer(_Bool sign) {
uint64_t acc = lex_digits(10); uint8_t base = 10;
if (peekc() == '#') { if (peekc() == '0') {
if (acc != 2 || acc != 8 && acc != 10 && acc != 16) {
fprintf(stderr, "lexical error: illegal integer literal base\n");
exit(1);
}
nextc(); nextc();
acc = lex_digits((uint8_t) acc); if (peekc() == 'b') {
base = 2;
nextc();
} else if (peekc() == 'x') {
base = 16;
nextc();
}
if(!is_digit(peekc() && !is_id_char(peekc()))) {
return integer_tok(0);
}
}
if (!is_digit_in(base, peekc())) {
fprintf(stderr, "lexical error: expected base-%i digits\n", base);
exit(1);
}
uint64_t acc = lex_digits(10);
if (is_id_char(peekc())) {
fprintf(stderr, "lexical error: must put space between integer and following identifier\n");
exit(1);
} }
if (sign && acc > INT64_MAX) { if (sign && acc > INT64_MAX) {
fprintf(stderr, "lexical error: signed integer literal overflow\n"); fprintf(stderr, "lexical error: signed integer literal overflow\n");
@ -132,10 +146,7 @@ static struct token lex_integer(_Bool sign) {
if (sign) { if (sign) {
val = -val; val = -val;
} }
union token_data data; return integer_tok(val);
data.int_ = val;
struct token tok = { TOK_INTEGER, data };
return tok;
} }
#define MAX_STR_LEN 4096 #define MAX_STR_LEN 4096
@ -155,6 +166,7 @@ static char* str_buf(void) {
} }
static char* lex_string(void) { static char* lex_string(void) {
// TODO: string escapes, multi-line strings, no length limit on strings
str_index = 0; str_index = 0;
char* buf = str_buf(); char* buf = str_buf();
while (true) { while (true) {
@ -167,7 +179,6 @@ static char* lex_string(void) {
fprintf(stderr, "lexical error: unclosed string (reached end of file)\n"); fprintf(stderr, "lexical error: unclosed string (reached end of file)\n");
exit(1); exit(1);
} }
// TODO: string escapes, multi-line strings
if (c == '"') { if (c == '"') {
break; break;
} }
@ -259,6 +270,16 @@ static struct token lex(void) {
struct token tok = { TOK_LABEL, data }; struct token tok = { TOK_LABEL, data };
return tok; return tok;
} }
case ':':
nextc();
while (is_indent(peekc())) {
nextc();
}
if (is_newline(peekc())) {
level_is_block[indent_level + 1] = true;
return lex();
}
return op(OP_TYPE);
case '{': case '{':
nextc(); nextc();
return simple(TOK_OPEN_BLOCK); return simple(TOK_OPEN_BLOCK);
@ -279,10 +300,7 @@ static struct token lex(void) {
return simple(TOK_SEPARATOR); return simple(TOK_SEPARATOR);
case '=': case '=':
nextc(); nextc();
return op(OP_EQ); return simple(TOK_EQUALS);
case '+':
nextc();
return op(OP_ADD);
case '-': case '-':
nextc(); nextc();
if (peekc() == '>') { if (peekc() == '>') {
@ -293,6 +311,9 @@ static struct token lex(void) {
return lex_integer(true); return lex_integer(true);
} }
return op(OP_SUB); return op(OP_SUB);
case '+':
nextc();
return op(OP_ADD);
case '*': case '*':
nextc(); nextc();
return op(OP_MUL); return op(OP_MUL);
@ -321,16 +342,6 @@ static struct token lex(void) {
return op(OP_NE); return op(OP_NE);
} }
return op(OP_NOT); return op(OP_NOT);
case ':':
nextc();
while (is_indent(peekc())) {
nextc();
}
if (is_newline(peekc())) {
level_is_block[indent_level + 1] = true;
return lex();
}
return op(OP_TYPE);
case '>': case '>':
nextc(); nextc();
c = peekc(); c = peekc();
@ -363,8 +374,40 @@ static struct token lex(void) {
if (is_digit(c)) { if (is_digit(c)) {
return lex_integer(false); return lex_integer(false);
} }
char* name = lex_identifier();
if (strcmp(name, "if") == 0) {
return simple(TOK_IF);
}
if (strcmp(name, "else") == 0) {
return simple(TOK_ELSE);
}
if (strcmp(name, "match") == 0) {
return simple(TOK_MATCH);
}
if (strcmp(name, "case") == 0) {
return simple(TOK_CASE);
}
if (strcmp(name, "loop") == 0) {
return simple(TOK_LOOP);
}
if (strcmp(name, "fn") == 0) {
return simple(TOK_FN);
}
if (strcmp(name, "next") == 0) {
return simple(TOK_NEXT);
}
if (strcmp(name, "exit") == 0) {
return simple(TOK_EXIT);
}
if (strcmp(name, "recurse") == 0) {
return simple(TOK_RECURSE);
}
if (strcmp(name, "return") == 0) {
return simple(TOK_RETURN);
}
union token_data data; union token_data data;
data.name = lex_identifier(); data.name = name;
struct token tok = { TOK_NAME, data }; struct token tok = { TOK_NAME, data };
return tok; return tok;
} }
@ -420,10 +463,44 @@ void print_token(struct token tok) {
fprintf(stdout, ","); fprintf(stdout, ",");
break; break;
case TOK_OPERATOR: case TOK_OPERATOR:
// TODO: printing for operators
fprintf(stdout, "OP:%i", tok.data.op); fprintf(stdout, "OP:%i", tok.data.op);
break; break;
case TOK_EOF: case TOK_EOF:
fprintf(stdout, "EOF"); fprintf(stdout, "<EOF>");
break;
case TOK_CASE:
fprintf(stdout, "case");
break;
case TOK_ELSE:
fprintf(stdout, "else");
break;
case TOK_EQUALS:
fprintf(stdout, "=");
break;
case TOK_EXIT:
fprintf(stdout, "exit");
break;
case TOK_FN:
fprintf(stdout, "fn");
break;
case TOK_IF:
fprintf(stdout, "if");
break;
case TOK_LOOP:
fprintf(stdout, "loop");
break;
case TOK_NEXT:
fprintf(stdout, "next");
break;
case TOK_RETURN:
fprintf(stdout, "return");
break;
case TOK_RECURSE:
fprintf(stdout, "recurse");
break;
case TOK_MATCH:
fprintf(stdout, "match");
break; break;
} }
} }

View File

@ -8,15 +8,26 @@ enum token_type {
TOK_EOF, // end of file TOK_EOF, // end of file
TOK_NAME, // foo, bar_quux123, loop TOK_NAME, // foo, bar_quux123, loop
TOK_LABEL, // 'my_loop TOK_LABEL, // 'my_loop
TOK_INTEGER, // -123, 16#DEADBEEF TOK_INTEGER, // -123, 0xDEADBEEF
TOK_STRING, // "..." TOK_STRING, // "..."
TOK_OPERATOR, TOK_OPERATOR,
TOK_OPEN_GROUP, // ( TOK_OPEN_BLOCK, // `{` or `:` at the end of a line
TOK_CLOSE_GROUP, // ) TOK_CLOSE_BLOCK, // `}` or inferred from indentation
TOK_OPEN_BLOCK, // { TOK_OPEN_GROUP, // `(`
TOK_CLOSE_BLOCK, // } TOK_CLOSE_GROUP, // `)`
TOK_TERMINATOR, // ; TOK_TERMINATOR, // `;` or inferred from indentation, used to separate statements in blocks
TOK_SEPARATOR, // , TOK_SEPARATOR, // `,`, used to separate variables in initializers
TOK_EQUALS, // `=`, used for assignments or as an equality operator
TOK_IF, // if
TOK_ELSE, // else
TOK_MATCH, // match
TOK_CASE, // case
TOK_LOOP, // loop
TOK_FN, // fn
TOK_NEXT, // next
TOK_EXIT, // exit
TOK_RECURSE, // recurse
TOK_RETURN, // return
}; };
enum operator_ { enum operator_ {

View File

@ -16,6 +16,8 @@ enum state {
ST_BLOCK_CLOSE, ST_BLOCK_CLOSE,
ST_ASSIGN, ST_ASSIGN,
ST_EXPR, ST_EXPR,
// HACK: The existence of this state.
// Also, the entire structure of the parser is ugly.
ST_EXPR_HACK, ST_EXPR_HACK,
ST_EXPR_CONT, ST_EXPR_CONT,
ST_EXPR_END, ST_EXPR_END,
@ -87,16 +89,24 @@ static enum state pop(void) {
} }
static _Bool is_assignment(struct token tok, struct token next) { static _Bool is_assignment(struct token tok, struct token next) {
return tok.type == TOK_NAME && next.type == TOK_OPERATOR && next.data.op == OP_EQ; return tok.type == TOK_NAME && next.type == TOK_EQUALS;
} }
static _Bool is_expr(struct token tok) { static _Bool is_expr(struct token tok) {
if (is_lit(tok) || tok.type == TOK_OPEN_GROUP) { return is_lit(tok)
return true; || tok.type == TOK_NAME
} || tok.type == TOK_OPEN_GROUP
return tok.type == TOK_NAME; || tok.type == TOK_IF
|| tok.type == TOK_MATCH
|| tok.type == TOK_FN
|| tok.type == TOK_LOOP
|| tok.type == TOK_NEXT
|| tok.type == TOK_EXIT
|| tok.type == TOK_NEXT
|| tok.type == TOK_RETURN
|| tok.type == TOK_RECURSE
|| tok.type == TOK_MATCH;
} }
#define syntax_error(msg) fprintf(stderr, "syntax error: %s\n", msg); exit(1) #define syntax_error(msg) fprintf(stderr, "syntax error: %s\n", msg); exit(1)
@ -155,27 +165,23 @@ void parse(void) {
push(ST_EXPR_HACK); push(ST_EXPR_HACK);
continue; continue;
case ST_EXPR_HACK: case ST_EXPR_HACK:
if (tok.type == TOK_STRING) { switch (tok.type) {
push(ST_EXPR_CONT); case TOK_STRING:
expr_string(tok.data.string); push(ST_EXPR_CONT);
break; expr_string(tok.data.string);
} break;
if (tok.type == TOK_INTEGER) { case TOK_INTEGER:
push(ST_EXPR_CONT); push(ST_EXPR_CONT);
expr_integer(tok.data.int_); expr_integer(tok.data.int_);
break; break;
} case TOK_IF:
if (tok.type == TOK_NAME) {
char* name = tok.data.name;
if (strcmp(name, "if") == 0) {
push(ST_IF_END); push(ST_IF_END);
push(ST_IF_ELSE); push(ST_IF_ELSE);
push(ST_BLOCK); push(ST_BLOCK);
push(ST_EXPR); push(ST_EXPR);
enter_if(); enter_if();
break; break;
} case TOK_LOOP:
if (strcmp(name, "loop") == 0) {
push(ST_BLOCK); push(ST_BLOCK);
push(ST_LOOP_VARS); push(ST_LOOP_VARS);
if (nxt.type == TOK_LABEL) { if (nxt.type == TOK_LABEL) {
@ -185,9 +191,8 @@ void parse(void) {
enter_loop(NULL); enter_loop(NULL);
} }
break; break;
} case TOK_NEXT:
if (strcmp(name, "next") == 0) { push(ST_EXPR_CONT);
push(ST_LOOP_VARS);
if (nxt.type == TOK_LABEL) { if (nxt.type == TOK_LABEL) {
next(); next();
expr_next(nxt.data.label); expr_next(nxt.data.label);
@ -195,8 +200,7 @@ void parse(void) {
expr_next(NULL); expr_next(NULL);
} }
break; break;
} case TOK_EXIT:
if (strcmp(name, "exit") == 0) {
push(ST_EXPR); push(ST_EXPR);
if (nxt.type == TOK_LABEL) { if (nxt.type == TOK_LABEL) {
next(); next();
@ -205,34 +209,35 @@ void parse(void) {
expr_exit(NULL); expr_exit(NULL);
} }
break; break;
} case TOK_RETURN:
if (strcmp(name, "return") == 0) {
push(ST_EXPR); push(ST_EXPR);
expr_return(); expr_return();
break; break;
} case TOK_NAME:
push(ST_EXPR_CONT); push(ST_EXPR_CONT);
expr_var(tok.data.name); expr_var(tok.data.name);
break; break;
case TOK_OPEN_GROUP:
push(ST_EXPR_CONT);
push(ST_GROUP);
push(ST_EXPR);
enter_group();
break;
case TOK_OPERATOR:
if (is_unary(tok.data.op)) {
push(ST_EXPR_CONT);
push(ST_EXPR_HACK);
expr_op(tok.data.op);
break;
}
syntax_error("only unary operators allowed at beginning of expression");
case TOK_OPEN_BLOCK:
push(ST_BLOCK);
continue;
default:
syntax_error("expected expression");
} }
if (tok.type == TOK_OPEN_GROUP) { break;
push(ST_EXPR_CONT);
push(ST_GROUP);
push(ST_EXPR);
enter_group();
break;
}
if (tok.type == TOK_OPERATOR && is_unary(tok.data.op)) {
push(ST_EXPR_CONT);
push(ST_EXPR_HACK);
expr_op(tok.data.op);
break;
}
if (tok.type == TOK_OPEN_BLOCK) {
push(ST_BLOCK);
continue;
}
syntax_error("expected expression");
case ST_EXPR_CONT: case ST_EXPR_CONT:
if (is_expr(tok)) { if (is_expr(tok)) {
push(ST_EXPR_HACK); push(ST_EXPR_HACK);
@ -255,7 +260,7 @@ void parse(void) {
} }
syntax_error("mismatched parentheses"); syntax_error("mismatched parentheses");
case ST_IF_ELSE: case ST_IF_ELSE:
if (tok.type == TOK_NAME && strcmp(tok.data.name, "else") == 0) { if (tok.type == TOK_ELSE) {
push(ST_BLOCK); push(ST_BLOCK);
break; break;
} }