// pass-lang/src/lex.c

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "lex.h"
#include "lex/indent.h"
#include "io.h"

// Spellings of the language's keywords.
// lex() casts the index of a matching entry directly to enum token_type, so
// the order of this table is significant: entries must not be reordered
// without changing the enum to match.
// NOTE(review): presumably entry i corresponds to the i-th enumerator of
// enum token_type (declared outside this file) -- confirm against lex.h.
static const char* const keywords[KEYWORD_COUNT] = {
    "comm",
    "assocl",
    "assocr",
    "distl",
    "distr",
    "factl",
    "factr",
    "mapl",
    "mapr",
    "unitil",
    "unitir",
    "unitel",
    "uniter",
    "comm+",
    "assocl+",
    "assocr+",
    "mapl+",
    "mapr+",
    "inl",
    "inr",
    "out",
    "halt",
    "if",
};

static struct token simple(enum token_type type) {
    struct token tok = { type, 0 };
    return tok;
}

// Size in bytes of each string buffer below.
#define MAX_STR_LEN 4096
// Write position inside the string buffer that is currently being filled.
static size_t str_index;

// Lexed strings are stored in a small ring of static buffers, so that handing
// out a buffer for a new token does not overwrite the text of the tokens lexed
// just before it. Two buffers were originally enough for the LL(1) parser; a
// later change to the code made a third one necessary.
static int which_buf = 0;
static char str_buf_1[MAX_STR_LEN];
static char str_buf_2[MAX_STR_LEN];
static char str_buf_3[MAX_STR_LEN];

// Advances the ring by one position and returns the buffer now selected.
// The same buffer is handed out again three calls later, so a returned
// pointer is only good until then.
static char* str_buf(void) {
    static char* const ring[] = { str_buf_1, str_buf_2, str_buf_3 };

    which_buf = (which_buf + 1) % 3;
    return ring[which_buf];
}


// True for the ASCII letters 'a'..'z' and 'A'..'Z', false for anything else.
static _Bool is_alpha(char c) {
    if (c >= 'a' && c <= 'z') {
        return true;
    }
    return 'A' <= c && c <= 'Z';
}

// True for the decimal digits '0'..'9', false for anything else.
static _Bool is_digit(char c) {
    if (c < '0') {
        return false;
    }
    return c <= '9';
}

// True when c is either a decimal digit or an ASCII letter.
static _Bool is_alphanumeric(char c) {
    if (is_digit(c)) {
        return true;
    }
    return is_alpha(c);
}

// True when c may appear in an identifier: an underscore, an ASCII letter or
// a decimal digit.
static _Bool is_id_char(char c) {
    // TODO: allow unicode identifiers
    if (c == '_') {
        return true;
    }
    return is_alphanumeric(c);
}

// True for space, horizontal tab, carriage return and line feed.
_Bool is_whitespace(char c) {
    switch (c) {
        case ' ':
        case '\t':
        case '\r':
        case '\n':
            return true;
        default:
            return false;
    }
}


static char* lex_identifier(void) {
    str_index = 0;
    char* buf = str_buf();
    while (true) {
        char c = peekc();
        if (!is_id_char(c)) break;
        nextc();
        buf[str_index] = c;
        str_index++;
    }
    if (str_index == 0) {
        fprintf(stderr, "lexical error: expected identifier (possibly illegal character?)\n");
        exit(1);
    }
    buf[str_index] = 0;
    return buf;
}

// Returns the index in keywords[] of the entry spelled by the first `len`
// bytes of `name` immediately followed by `suffix`, or KEYWORD_COUNT when no
// entry matches. `name` must be at least `len` bytes long.
static size_t find_keyword(const char* name, size_t len, const char* suffix) {
    for (size_t kwd = 0; kwd < KEYWORD_COUNT; kwd++) {
        const char* kw = keywords[kwd];
        // strncmp only succeeds if kw has at least len characters, so kw + len
        // is within the keyword (at worst pointing at its terminating NUL).
        if (strncmp(kw, name, len) == 0 && strcmp(kw + len, suffix) == 0) {
            return kwd;
        }
    }
    return KEYWORD_COUNT;
}

// Lexes and returns the next token from the input.
//
// Whitespace and line comments (from '!' to the end of the line) are skipped.
// A NUL character from peekc() yields TOK_EOF; '{' and '}' yield
// TOK_MAP_BEGIN and TOK_MAP_END. Anything else must be an identifier:
//   - if it spells a keyword, the token type is the keyword's index in
//     keywords[] and the token carries no name;
//   - if it is directly followed by ':', the colon is consumed and the token
//     is a TOK_LABEL carrying the name;
//   - otherwise the token is a TOK_JUMP carrying the name.
// The name points into a buffer owned by str_buf() and is overwritten when
// that buffer is reused.
//
// Exits via lex_identifier() when the input is not a valid token.
static struct token lex(void) {
    char c = peekc();
    while (true) {
        // skip whitespace
        while (is_whitespace(c)) {
            nextc();
            c = peekc();
        }
        if (c != '!') {
            break;
        }
        // Skip a line comment. The newline itself is left for the whitespace
        // loop above. The loop also stops at end of input (NUL), so a comment
        // on the last line without a trailing newline cannot spin forever.
        // The current character is re-read with peekc() after every advance
        // rather than taken from nextc()'s return value.
        while (c != '\n' && c != 0) {
            nextc();
            c = peekc();
        }
    }

    // syntax
    switch (c) {
        case 0:
            return simple(TOK_EOF);
        case '{':
            nextc();
            return simple(TOK_MAP_BEGIN);
        case '}':
            nextc();
            return simple(TOK_MAP_END);
        default:
            break;
    }

    char* name = lex_identifier();
    size_t len = strlen(name);

    // Keywords ending in '+' ("comm+", "mapl+", ...): '+' is not an identifier
    // character, so lex_identifier() stops in front of it. Only consume the
    // '+' when name and '+' together spell a keyword; otherwise it is left in
    // the input exactly as before.
    if (peekc() == '+') {
        size_t plus_kwd = find_keyword(name, len, "+");
        if (plus_kwd < KEYWORD_COUNT) {
            nextc();
            return simple((enum token_type) plus_kwd);
        }
    }

    // plain keywords
    size_t kwd = find_keyword(name, len, "");
    if (kwd < KEYWORD_COUNT) {
        return simple((enum token_type) kwd);
    }

    enum token_type type = TOK_JUMP;
    // labels
    if (peekc() == ':') {
        type = TOK_LABEL;
        nextc();
    }

    struct token tok = { type, name };
    return tok;
}

static _Bool init = false;
static struct token peek_buf;

struct token next(void) {
    if (!init) {
        init = true;
        next();
    }
    struct token tmp = peek_buf;
    peek_buf = lex();
    return tmp;
}

struct token peek(void) {
    return peek_buf;
}