pass-lang/src/io.c

#include "io.h"

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#ifdef __unix__
#include <sys/stat.h>
#endif

// Path of the output file; kept so close_files() can remove or chmod it.
static const char* outfile_name = NULL;
// Source file stream, opened for reading by open_files().
static FILE* infile = NULL;
// Output file stream, opened for writing by open_files().
static FILE* outfile = NULL;

// Opens the source file for binary reading and the output file for binary
// writing, storing both streams in this file's statics. The output path is
// remembered in `outfile_name` for later use.
//
// If either file cannot be opened, prints the strerror(errno) text to stderr
// and exits with status 1.
void open_files(const char* infile_name, const char* outfile_name_) {
    outfile_name = outfile_name_;

    if ((infile = fopen(infile_name, "rb")) == NULL) {
        fprintf(stderr, "failed to open source file: %s\n", strerror(errno));
        exit(1);
    }

    if ((outfile = fopen(outfile_name, "wb")) == NULL) {
        fprintf(stderr, "failed to open output file: %s\n", strerror(errno));
        exit(1);
    }
}

// Closes both streams and finalizes the output file.
//
// If closing the output stream fails, the error is reported, removal of the
// output file is attempted (a second message is printed if that fails too),
// and the process exits with status 1.
//
// On Unix, the output file's mode is then set to 0777. A failure there is
// reported as a warning only, since the file contents are already written.
void close_files(void) {
    // The input stream is read-only, so its close result is not checked.
    fclose(infile);
    if (fclose(outfile) != 0) {
        fprintf(stderr, "failed to close output file: %s\n", strerror(errno));
        // NOTE: ideally we'd do this on any dirty exit
        if (remove(outfile_name) != 0) {
            fprintf(stderr, "failed to remove output file; if it exists, it is corrupt: %s\n", strerror(errno));
        }
        exit(1);
    }

    #ifdef __unix__
    // Best effort: do not fail the whole run over permissions, but do not
    // hide the problem either.
    if (chmod(outfile_name, 0777) != 0) {
        fprintf(stderr, "warning: failed to set permissions on output file: %s\n", strerror(errno));
    }
    #endif
}

// Moves the output file position forward by `len` bytes without writing
// anything (presumably so the skipped region can be filled in later with
// patch() -- confirm against callers).
//
// Exits with status 1 if `len` cannot be represented as the `long` offset
// that fseek takes, or if the seek itself fails.
void reserve(size_t len) {
    // fseek takes a long; reject lengths that do not survive the conversion
    // instead of silently seeking by a truncated or negative amount.
    long delta = (long) len;
    if (delta < 0 || (size_t) delta != len) {
        fprintf(stderr, "failed to reserve space in output file: length too large\n");
        exit(1);
    }
    if (fseek(outfile, delta, SEEK_CUR) != 0) {
        fprintf(stderr, "failed to reserve space in output file: %s\n", strerror(errno));
        exit(1);
    }
}

// Writes `count` bytes starting at `ptr` to the output file at its current
// position. If the stream's error indicator is set afterwards, prints a
// message to stderr and exits with status 1.
void emit(const void* restrict ptr, size_t count) {
    (void) fwrite(ptr, 1, count, outfile);
    if (!ferror(outfile)) {
        return;
    }
    fprintf(stderr, "failed to write to output file\n");
    exit(1);
}

// Emits the single byte `x` to the output file via emit().
void emit_u8(uint8_t x) {
    emit(&x, sizeof x);
}

// Emits the bytes of `x` to the output file via emit(), exactly as they are
// laid out in memory (i.e. in host byte order).
void emit_u32(uint32_t x) {
    emit(&x, sizeof x);
}

// Emits the bytes of `x` to the output file via emit(), exactly as they are
// laid out in memory (i.e. in host byte order).
void emit_u64(uint64_t x) {
    emit(&x, sizeof x);
}

// Overwrites `count` bytes at absolute offset `off` in the output file with
// the bytes at `ptr`, then restores the stream position that was current on
// entry. If any step fails, prints a message to stderr and exits with
// status 1.
void patch(size_t off, const void* ptr, size_t count) {
    fpos_t resume_at;
    const long target = (long) off;

    // Remember where normal emission should continue.
    if (fgetpos(outfile, &resume_at)) {
        fprintf(stderr, "failed to save file position before patch: %s\n", strerror(errno));
        exit(1);
    }

    if (fseek(outfile, target, SEEK_SET)) {
        fprintf(stderr, "failed to set file position for patch: %s\n", strerror(errno));
        exit(1);
    }

    (void) fwrite(ptr, 1, count, outfile);
    if (ferror(outfile)) {
        fprintf(stderr, "failed to patch output file: %s\n", strerror(errno));
        exit(1);
    }

    // Go back to where we were before the patch.
    if (fsetpos(outfile, &resume_at)) {
        fprintf(stderr, "failed to restore file position after patch: %s\n", strerror(errno));
        exit(1);
    }
}

// Overwrites the bytes at offset `off` in the output file with the in-memory
// bytes of `x`, via patch().
void patch_u32(size_t off, uint32_t x) {
    patch(off, &x, sizeof x);
}

// Signed counterpart of patch_u32(): converts `x` to uint32_t and patches
// the result at offset `off`.
void patch_i32(size_t off, int32_t x) {
    const uint32_t bits = (uint32_t) x;
    patch_u32(off, bits);
}

// Set to true by the first call to nextc() or peekc().
static bool init = false;
// One-character lookahead: the value peekc() returns once `init` is true.
static char peek_buf = 0;

// Reads one byte from the source file.
//
// Returns the byte as a char, or 0 at end of file. On a read error, prints
// the strerror(errno) text to stderr and exits with status 1.
static char next_(void) {
    // Keep the result in an int: getc returns either an unsigned char value
    // or EOF, and narrowing to char before the comparison would either make
    // byte 0xFF look like EOF (signed char) or make EOF undetectable
    // (unsigned char).
    int c = getc(infile);
    if (c == EOF) {
        if (ferror(infile)) {
            fprintf(stderr, "failed to read source file: %s\n", strerror(errno));
            exit(1);
        }
        return 0;
    }
    return (char) c;
}

// Advances the input by one character and returns the new lookahead
// character, i.e. the same value a following peekc() returns.
//
// If this is the first use of the reader (`init` still false), the lookahead
// is first filled with the first input character and then advanced, so the
// value returned is the second character read.
//
// NOTE(review): this returns the character *after* advancing, not the one
// just consumed; peekc()'s first-use path relies on that. Confirm callers
// expect it.
char nextc(void) {
    if (!init) {
        init = true;
        peek_buf = next_();
    }
    peek_buf = next_();
    return peek_buf;
}

// Returns the current lookahead character without advancing the input.
//
// On first use (`init` still false) it marks the reader as initialized and
// delegates to nextc(), which reads one character into the lookahead and
// returns it.
char peekc(void) {
    if (init) {
        return peek_buf;
    }
    init = true;
    return nextc();
}
Initial commit. 2022-09-05 23:48:56 -07:00			`#include "io.h"`

Greatly simplify lexer thanks to new knowledge of lookahead. Now I know that the parser is LL(1) and the lexer also only needs one-character lookahead, which allows me to dramatically simplify the interface for IO input, and improve the interface to the lexer. Even if I did want unbounded peek, I'd want the interface to be `peek(off)`, not that awful buffer. I intend to use the new lexer interface to make the parser states more stateful, and potentially read multiple tokens in a row. Then, states would only be needed for recursive structures, without the awkward intermediate states like ST_LABEL which exists only to let me burn a token. I also removed the nonsense related to base 64 parsing, because it was unclear how to handle it, and the advantages of having it weren't clear. I kept up to base 36, but honestly I might want to consider getting rid of everything but decimal, hex, and binary anyway. I'm not sure if I'd want to keep using the current syntax for the radix either. 2022-09-07 23:02:15 -07:00			`#include <assert.h>`
Initial commit. 2022-09-05 23:48:56 -07:00			`#include <errno.h>`
Greatly simplify lexer thanks to new knowledge of lookahead. Now I know that the parser is LL(1) and the lexer also only needs one-character lookahead, which allows me to dramatically simplify the interface for IO input, and improve the interface to the lexer. Even if I did want unbounded peek, I'd want the interface to be `peek(off)`, not that awful buffer. I intend to use the new lexer interface to make the parser states more stateful, and potentially read multiple tokens in a row. Then, states would only be needed for recursive structures, without the awkward intermediate states like ST_LABEL which exists only to let me burn a token. I also removed the nonsense related to base 64 parsing, because it was unclear how to handle it, and the advantages of having it weren't clear. I kept up to base 36, but honestly I might want to consider getting rid of everything but decimal, hex, and binary anyway. I'm not sure if I'd want to keep using the current syntax for the radix either. 2022-09-07 23:02:15 -07:00			`#include <stdbool.h>`
Initial commit. 2022-09-05 23:48:56 -07:00			`#include <stdlib.h>`
Implemented parser! Recognition only, no output. Also no top-level declarations or operator precedence. The syntax is LL(1). LL syntax seems necessary because our codegen requires emitting certain code (e.g. entering control) prior to any codegen inside that context, whereas something like LR would presumably parse the inner expression before recognizing the control structure. There may be some way to work around this; I don't know, I'm not a parsing expert. Certain parts of the syntax are wonky, e.g. juxtaposition as function application means a missing semicolon can give confusing results. I suspect indentation-sensitive syntax would work more nicely, and intend to implement it some time in the future. 2022-09-07 20:42:37 -07:00			`#include <stdio.h>`
Initial commit. 2022-09-05 23:48:56 -07:00			`#include <string.h>`

Remove gratuitous platform-specific IO. I don't need some fancy atomic output file updating or posix_fadvise. I removed all platform-specific code except for a single `chmod`. That's not to say there's no advantage to atomically reading or writing files, but for this project, the first rule needs to be KISS. It's premature optimization and overengineering. 2022-09-06 23:16:23 -07:00			`#ifdef __unix__`
Initial commit. 2022-09-05 23:48:56 -07:00			`#include <sys/stat.h>`
			`#endif`

			`static const char* outfile_name;`
Remove gratuitous platform-specific IO. I don't need some fancy atomic output file updating or posix_fadvise. I removed all platform-specific code except for a single `chmod`. That's not to say there's no advantage to atomically reading or writing files, but for this project, the first rule needs to be KISS. It's premature optimization and overengineering. 2022-09-06 23:16:23 -07:00			`static FILE* infile;`
			`static FILE* outfile;`
Initial commit. 2022-09-05 23:48:56 -07:00
			`void open_files(const char* infile_name, const char* outfile_name_) {`
			`outfile_name = outfile_name_;`
			`infile = fopen(infile_name, "rb");`
			`if (infile == NULL) {`
			`fprintf(stderr, "failed to open source file: %s\n", strerror(errno));`
			`exit(1);`
			`}`

			`outfile = fopen(outfile_name, "wb");`
			`if (outfile == NULL) {`
			`fprintf(stderr, "failed to open output file: %s\n", strerror(errno));`
			`exit(1);`
			`}`
			`}`

			`void close_files(void) {`
Remove gratuitous platform-specific IO. I don't need some fancy atomic output file updating or posix_fadvise. I removed all platform-specific code except for a single `chmod`. That's not to say there's no advantage to atomically reading or writing files, but for this project, the first rule needs to be KISS. It's premature optimization and overengineering. 2022-09-06 23:16:23 -07:00			`fclose(infile);`
Initial commit. 2022-09-05 23:48:56 -07:00			`if (fclose(outfile) != 0) {`
			`fprintf(stderr, "failed to close output file: %s\n", strerror(errno));`
			`// NOTE: ideally we'd do this on any dirty exit`
			`if (remove(outfile_name) != 0) {`
Remove gratuitous platform-specific IO. I don't need some fancy atomic output file updating or posix_fadvise. I removed all platform-specific code except for a single `chmod`. That's not to say there's no advantage to atomically reading or writing files, but for this project, the first rule needs to be KISS. It's premature optimization and overengineering. 2022-09-06 23:16:23 -07:00			`fprintf(stderr, "failed to remove output file; if it exists, it is corrupt: %s\n", strerror(errno));`
Initial commit. 2022-09-05 23:48:56 -07:00			`}`
			`exit(1);`
			`}`
Remove gratuitous platform-specific IO. I don't need some fancy atomic output file updating or posix_fadvise. I removed all platform-specific code except for a single `chmod`. That's not to say there's no advantage to atomically reading or writing files, but for this project, the first rule needs to be KISS. It's premature optimization and overengineering. 2022-09-06 23:16:23 -07:00
			`#ifdef __unix__`
			`chmod(outfile_name, 0777);`
			`#endif`
			`}`

			`void reserve(size_t len) {`
			`if (fseek(outfile, len, SEEK_CUR) != 0) {`
			`fprintf(stderr, "failed to reserve space in in output file: %s\n", strerror(errno));`
			`exit(1);`
			`}`
Initial commit. 2022-09-05 23:48:56 -07:00			`}`

			`void emit(const void* restrict ptr, size_t count) {`
			`fwrite(ptr, 1, count, outfile);`
			`if (ferror(outfile)) {`
			`fprintf(stderr, "failed to write to output file\n");`
			`exit(1);`
			`}`
			`}`

			`void emit_u8(uint8_t x) {`
			`emit(&x, sizeof(uint8_t));`
			`}`

			`void emit_u32(uint32_t x) {`
			`emit(&x, sizeof(uint32_t));`
			`}`

			`void emit_u64(uint64_t x) {`
			`emit(&x, sizeof(uint64_t));`
			`}`

			`void patch(size_t off, const void* ptr, size_t count) {`
			`fpos_t save;`
			`if (fgetpos(outfile, &save) != 0) {`
			`fprintf(stderr, "failed to save file position before patch: %s\n", strerror(errno));`
			`exit(1);`
			`}`
			`if (fseek(outfile, (long) off, SEEK_SET) != 0) {`
			`fprintf(stderr, "failed to set file position for patch: %s\n", strerror(errno));`
			`exit(1);`
			`}`
			`fwrite(ptr, 1, count, outfile);`
			`if (ferror(outfile) != 0) {`
			`fprintf(stderr, "failed to patch output file: %s\n", strerror(errno));`
			`exit(1);`
			`}`
			`if (fsetpos(outfile, &save) != 0) {`
			`fprintf(stderr, "failed to restore file position after patch: %s\n", strerror(errno));`
			`exit(1);`
			`}`
			`}`

			`void patch_u32(size_t off, uint32_t x) {`
			`patch(off, &x, sizeof(uint32_t));`
			`}`

			`void patch_i32(size_t off, int32_t x) {`
			`patch_u32(off, (uint32_t) x);`
			`}`
Hacked together a god-awful hand-written lexer. 2022-09-07 10:22:38 -07:00
Indentation-sensitive syntax! 2022-09-08 16:01:31 -07:00			`static _Bool init = false;`
Greatly simplify lexer thanks to new knowledge of lookahead. Now I know that the parser is LL(1) and the lexer also only needs one-character lookahead, which allows me to dramatically simplify the interface for IO input, and improve the interface to the lexer. Even if I did want unbounded peek, I'd want the interface to be `peek(off)`, not that awful buffer. I intend to use the new lexer interface to make the parser states more stateful, and potentially read multiple tokens in a row. Then, states would only be needed for recursive structures, without the awkward intermediate states like ST_LABEL which exists only to let me burn a token. I also removed the nonsense related to base 64 parsing, because it was unclear how to handle it, and the advantages of having it weren't clear. I kept up to base 36, but honestly I might want to consider getting rid of everything but decimal, hex, and binary anyway. I'm not sure if I'd want to keep using the current syntax for the radix either. 2022-09-07 23:02:15 -07:00			`static char peek_buf;`
Hacked together a god-awful hand-written lexer. 2022-09-07 10:22:38 -07:00
Greatly simplify lexer thanks to new knowledge of lookahead. Now I know that the parser is LL(1) and the lexer also only needs one-character lookahead, which allows me to dramatically simplify the interface for IO input, and improve the interface to the lexer. Even if I did want unbounded peek, I'd want the interface to be `peek(off)`, not that awful buffer. I intend to use the new lexer interface to make the parser states more stateful, and potentially read multiple tokens in a row. Then, states would only be needed for recursive structures, without the awkward intermediate states like ST_LABEL which exists only to let me burn a token. I also removed the nonsense related to base 64 parsing, because it was unclear how to handle it, and the advantages of having it weren't clear. I kept up to base 36, but honestly I might want to consider getting rid of everything but decimal, hex, and binary anyway. I'm not sure if I'd want to keep using the current syntax for the radix either. 2022-09-07 23:02:15 -07:00			`static char next_(void) {`
Indentation-sensitive syntax! 2022-09-08 16:01:31 -07:00			`char c = getc(infile);`
Greatly simplify lexer thanks to new knowledge of lookahead. Now I know that the parser is LL(1) and the lexer also only needs one-character lookahead, which allows me to dramatically simplify the interface for IO input, and improve the interface to the lexer. Even if I did want unbounded peek, I'd want the interface to be `peek(off)`, not that awful buffer. I intend to use the new lexer interface to make the parser states more stateful, and potentially read multiple tokens in a row. Then, states would only be needed for recursive structures, without the awkward intermediate states like ST_LABEL which exists only to let me burn a token. I also removed the nonsense related to base 64 parsing, because it was unclear how to handle it, and the advantages of having it weren't clear. I kept up to base 36, but honestly I might want to consider getting rid of everything but decimal, hex, and binary anyway. I'm not sure if I'd want to keep using the current syntax for the radix either. 2022-09-07 23:02:15 -07:00			`if (c == EOF) {`
Hacked together a god-awful hand-written lexer. 2022-09-07 10:22:38 -07:00			`if (ferror(infile)) {`
			`fprintf(stderr, "failed to read source file: %s\n", strerror(errno));`
			`exit(1);`
			`}`
Indentation-sensitive syntax! 2022-09-08 16:01:31 -07:00			`c = 0;`
Hacked together a god-awful hand-written lexer. 2022-09-07 10:22:38 -07:00			`}`
Greatly simplify lexer thanks to new knowledge of lookahead. Now I know that the parser is LL(1) and the lexer also only needs one-character lookahead, which allows me to dramatically simplify the interface for IO input, and improve the interface to the lexer. Even if I did want unbounded peek, I'd want the interface to be `peek(off)`, not that awful buffer. I intend to use the new lexer interface to make the parser states more stateful, and potentially read multiple tokens in a row. Then, states would only be needed for recursive structures, without the awkward intermediate states like ST_LABEL which exists only to let me burn a token. I also removed the nonsense related to base 64 parsing, because it was unclear how to handle it, and the advantages of having it weren't clear. I kept up to base 36, but honestly I might want to consider getting rid of everything but decimal, hex, and binary anyway. I'm not sure if I'd want to keep using the current syntax for the radix either. 2022-09-07 23:02:15 -07:00			`return c;`
Hacked together a god-awful hand-written lexer. 2022-09-07 10:22:38 -07:00			`}`

Greatly simplify lexer thanks to new knowledge of lookahead. Now I know that the parser is LL(1) and the lexer also only needs one-character lookahead, which allows me to dramatically simplify the interface for IO input, and improve the interface to the lexer. Even if I did want unbounded peek, I'd want the interface to be `peek(off)`, not that awful buffer. I intend to use the new lexer interface to make the parser states more stateful, and potentially read multiple tokens in a row. Then, states would only be needed for recursive structures, without the awkward intermediate states like ST_LABEL which exists only to let me burn a token. I also removed the nonsense related to base 64 parsing, because it was unclear how to handle it, and the advantages of having it weren't clear. I kept up to base 36, but honestly I might want to consider getting rid of everything but decimal, hex, and binary anyway. I'm not sure if I'd want to keep using the current syntax for the radix either. 2022-09-07 23:02:15 -07:00			`char nextc(void) {`
Indentation-sensitive syntax! 2022-09-08 16:01:31 -07:00			`if (!init) {`
			`init = true;`
			`peek_buf = next_();`
Greatly simplify lexer thanks to new knowledge of lookahead. Now I know that the parser is LL(1) and the lexer also only needs one-character lookahead, which allows me to dramatically simplify the interface for IO input, and improve the interface to the lexer. Even if I did want unbounded peek, I'd want the interface to be `peek(off)`, not that awful buffer. I intend to use the new lexer interface to make the parser states more stateful, and potentially read multiple tokens in a row. Then, states would only be needed for recursive structures, without the awkward intermediate states like ST_LABEL which exists only to let me burn a token. I also removed the nonsense related to base 64 parsing, because it was unclear how to handle it, and the advantages of having it weren't clear. I kept up to base 36, but honestly I might want to consider getting rid of everything but decimal, hex, and binary anyway. I'm not sure if I'd want to keep using the current syntax for the radix either. 2022-09-07 23:02:15 -07:00			`}`
Indentation-sensitive syntax! 2022-09-08 16:01:31 -07:00			`int tmp = peek_buf;`
			`peek_buf = next_();`
			`return peek_buf;`
Greatly simplify lexer thanks to new knowledge of lookahead. Now I know that the parser is LL(1) and the lexer also only needs one-character lookahead, which allows me to dramatically simplify the interface for IO input, and improve the interface to the lexer. Even if I did want unbounded peek, I'd want the interface to be `peek(off)`, not that awful buffer. I intend to use the new lexer interface to make the parser states more stateful, and potentially read multiple tokens in a row. Then, states would only be needed for recursive structures, without the awkward intermediate states like ST_LABEL which exists only to let me burn a token. I also removed the nonsense related to base 64 parsing, because it was unclear how to handle it, and the advantages of having it weren't clear. I kept up to base 36, but honestly I might want to consider getting rid of everything but decimal, hex, and binary anyway. I'm not sure if I'd want to keep using the current syntax for the radix either. 2022-09-07 23:02:15 -07:00			`}`

			`char peekc(void) {`
Indentation-sensitive syntax! 2022-09-08 16:01:31 -07:00			`if (!init) {`
			`init = true;`
			`return nextc();`
Hacked together a god-awful hand-written lexer. 2022-09-07 10:22:38 -07:00			`}`
Greatly simplify lexer thanks to new knowledge of lookahead. Now I know that the parser is LL(1) and the lexer also only needs one-character lookahead, which allows me to dramatically simplify the interface for IO input, and improve the interface to the lexer. Even if I did want unbounded peek, I'd want the interface to be `peek(off)`, not that awful buffer. I intend to use the new lexer interface to make the parser states more stateful, and potentially read multiple tokens in a row. Then, states would only be needed for recursive structures, without the awkward intermediate states like ST_LABEL which exists only to let me burn a token. I also removed the nonsense related to base 64 parsing, because it was unclear how to handle it, and the advantages of having it weren't clear. I kept up to base 36, but honestly I might want to consider getting rid of everything but decimal, hex, and binary anyway. I'm not sure if I'd want to keep using the current syntax for the radix either. 2022-09-07 23:02:15 -07:00			`return peek_buf;`
Hacked together a god-awful hand-written lexer. 2022-09-07 10:22:38 -07:00			`}`