2022-09-05 23:48:56 -07:00
|
|
|
#include "io.h"
|
|
|
|
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
#include <assert.h>
|
2022-09-05 23:48:56 -07:00
|
|
|
#include <errno.h>
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
#include <stdbool.h>
|
2022-09-05 23:48:56 -07:00
|
|
|
#include <stdlib.h>
|
2022-09-07 20:42:37 -07:00
|
|
|
#include <stdio.h>
|
2022-09-05 23:48:56 -07:00
|
|
|
#include <string.h>
|
|
|
|
|
2022-09-06 23:16:23 -07:00
|
|
|
#ifdef __unix__
|
2022-09-05 23:48:56 -07:00
|
|
|
#include <sys/stat.h>
|
|
|
|
#endif
|
|
|
|
|
|
|
|
// Path of the output file as passed to open_files; close_files uses it to
// remove the file after a failed close and to chmod it on unix.
static const char* outfile_name;
|
2022-09-06 23:16:23 -07:00
|
|
|
// Source file stream, opened for binary reading by open_files.
static FILE* infile;
|
|
|
|
// Output file stream, opened for binary writing by open_files.
static FILE* outfile;
|
2022-09-05 23:48:56 -07:00
|
|
|
|
|
|
|
void open_files(const char* infile_name, const char* outfile_name_) {
|
|
|
|
outfile_name = outfile_name_;
|
|
|
|
infile = fopen(infile_name, "rb");
|
|
|
|
if (infile == NULL) {
|
|
|
|
fprintf(stderr, "failed to open source file: %s\n", strerror(errno));
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
outfile = fopen(outfile_name, "wb");
|
|
|
|
if (outfile == NULL) {
|
|
|
|
fprintf(stderr, "failed to open output file: %s\n", strerror(errno));
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void close_files(void) {
|
2022-09-06 23:16:23 -07:00
|
|
|
fclose(infile);
|
2022-09-05 23:48:56 -07:00
|
|
|
if (fclose(outfile) != 0) {
|
|
|
|
fprintf(stderr, "failed to close output file: %s\n", strerror(errno));
|
|
|
|
// NOTE: ideally we'd do this on any dirty exit
|
|
|
|
if (remove(outfile_name) != 0) {
|
2022-09-06 23:16:23 -07:00
|
|
|
fprintf(stderr, "failed to remove output file; if it exists, it is corrupt: %s\n", strerror(errno));
|
2022-09-05 23:48:56 -07:00
|
|
|
}
|
|
|
|
exit(1);
|
|
|
|
}
|
2022-09-06 23:16:23 -07:00
|
|
|
|
|
|
|
#ifdef __unix__
|
|
|
|
chmod(outfile_name, 0777);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
void reserve(size_t len) {
|
|
|
|
if (fseek(outfile, len, SEEK_CUR) != 0) {
|
|
|
|
fprintf(stderr, "failed to reserve space in in output file: %s\n", strerror(errno));
|
|
|
|
exit(1);
|
|
|
|
}
|
2022-09-05 23:48:56 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
void emit(const void* restrict ptr, size_t count) {
|
|
|
|
fwrite(ptr, 1, count, outfile);
|
|
|
|
if (ferror(outfile)) {
|
|
|
|
fprintf(stderr, "failed to write to output file\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Writes the single byte `x` to the output file via emit.
void emit_u8(uint8_t x) {
    emit(&x, sizeof x);
}
|
|
|
|
|
|
|
|
// Writes the four bytes of `x` to the output file via emit, in the host's
// in-memory byte order (the object representation is written as-is).
void emit_u32(uint32_t x) {
    emit(&x, sizeof x);
}
|
|
|
|
|
|
|
|
// Writes the eight bytes of `x` to the output file via emit, in the host's
// in-memory byte order (the object representation is written as-is).
void emit_u64(uint64_t x) {
    emit(&x, sizeof x);
}
|
|
|
|
|
|
|
|
void patch(size_t off, const void* ptr, size_t count) {
|
|
|
|
fpos_t save;
|
|
|
|
if (fgetpos(outfile, &save) != 0) {
|
|
|
|
fprintf(stderr, "failed to save file position before patch: %s\n", strerror(errno));
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
if (fseek(outfile, (long) off, SEEK_SET) != 0) {
|
|
|
|
fprintf(stderr, "failed to set file position for patch: %s\n", strerror(errno));
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
fwrite(ptr, 1, count, outfile);
|
|
|
|
if (ferror(outfile) != 0) {
|
|
|
|
fprintf(stderr, "failed to patch output file: %s\n", strerror(errno));
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
if (fsetpos(outfile, &save) != 0) {
|
|
|
|
fprintf(stderr, "failed to restore file position after patch: %s\n", strerror(errno));
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Overwrites four bytes at absolute offset `off` of the output file with the
// in-memory bytes of `x`, via patch.
void patch_u32(size_t off, uint32_t x) {
    patch(off, &x, sizeof x);
}
|
|
|
|
|
|
|
|
// Signed counterpart of patch_u32: converts `x` to uint32_t and patches the
// resulting four bytes at absolute offset `off`.
void patch_i32(size_t off, int32_t x) {
    uint32_t bits = (uint32_t) x;
    patch_u32(off, bits);
}
|
2022-09-07 10:22:38 -07:00
|
|
|
|
2022-09-08 16:01:31 -07:00
|
|
|
// Whether the lookahead buffer has been set up yet; set to true by the first
// call to nextc or peekc.
static _Bool init = false;
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
// One-character lookahead: the character most recently read by nextc, which
// peekc returns without reading further.
static char peek_buf;
|
2022-09-07 10:22:38 -07:00
|
|
|
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
static char next_(void) {
|
2022-09-08 16:01:31 -07:00
|
|
|
char c = getc(infile);
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
if (c == EOF) {
|
2022-09-07 10:22:38 -07:00
|
|
|
if (ferror(infile)) {
|
|
|
|
fprintf(stderr, "failed to read source file: %s\n", strerror(errno));
|
|
|
|
exit(1);
|
|
|
|
}
|
2022-09-08 16:01:31 -07:00
|
|
|
c = 0;
|
2022-09-07 10:22:38 -07:00
|
|
|
}
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
return c;
|
2022-09-07 10:22:38 -07:00
|
|
|
}
|
|
|
|
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
char nextc(void) {
|
2022-09-08 16:01:31 -07:00
|
|
|
if (!init) {
|
|
|
|
init = true;
|
|
|
|
peek_buf = next_();
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
}
|
2022-09-08 16:01:31 -07:00
|
|
|
int tmp = peek_buf;
|
|
|
|
peek_buf = next_();
|
|
|
|
return peek_buf;
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
char peekc(void) {
|
2022-09-08 16:01:31 -07:00
|
|
|
if (!init) {
|
|
|
|
init = true;
|
|
|
|
return nextc();
|
2022-09-07 10:22:38 -07:00
|
|
|
}
|
Greatly simplify lexer thanks to new knowledge of lookahead.
Now I know that the parser is LL(1) and the lexer also only
needs one-character lookahead, which allows me to dramatically
simplify the interface for IO input, and improve the interface
to the lexer. Even if I did want unbounded peek, I'd want
the interface to be `peek(off)`, not that awful buffer.
I intend to use the new lexer interface to make the parser
states more stateful, and potentially read multiple tokens
in a row. Then, states would only be needed for recursive
structures, without the awkward intermediate states like
ST_LABEL which exists only to let me burn a token.
I also removed the nonsense related to base 64 parsing,
because it was unclear how to handle it, and the advantages
of having it weren't clear. I kept up to base 36, but honestly
I might want to consider getting rid of everything but decimal,
hex, and binary anyway. I'm not sure if I'd want to keep using
the current syntax for the radix either.
2022-09-07 23:02:15 -07:00
|
|
|
return peek_buf;
|
2022-09-07 10:22:38 -07:00
|
|
|
}
|