From e5af28536bfb0f4c9131df56d2009ba5196f5e3a Mon Sep 17 00:00:00 2001 From: Lizzy Fleckenstein Date: Sun, 12 Apr 2026 20:57:06 +0200 Subject: init --- src/lex.c | 213 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 src/lex.c (limited to 'src/lex.c') diff --git a/src/lex.c b/src/lex.c new file mode 100644 index 0000000..29ddba8 --- /dev/null +++ b/src/lex.c @@ -0,0 +1,213 @@ +#include +#include +#include "lex.h" + +#define CHAR_COMMENT '#' + +static bool source_peek(struct source *src, char *c) +{ + if (src->offset >= src->contents.len) + return false; + *c = src->contents.ptr[src->offset]; + return true; +} + +static void source_next(struct source *src) +{ + assert(src->offset < src->contents.len); + src->offset++; +} + +static bool source_get(struct source *src, char *c) +{ + if (!source_peek(src, c)) + return false; + source_next(src); + return true; +} + +static void source_unget(struct source *src) +{ + assert(src->offset); + src->offset--; +} + +static struct range loc_from(struct source *src, size_t start) +{ + return (struct range) { .start = start, .end = src->offset }; +} + +static struct range loc_last(struct source *src) +{ + return loc_from(src, src->offset-1); +} + +static str str_from(struct source *src, size_t start) +{ + return (str) { src->offset - start, src->contents.ptr + start }; +} + +static bool is_blank_char(char c) +{ + return isblank(c) || c == '\n'; +} + +static bool is_num_start_char(char c) +{ + return isdigit(c) || c == '+' || c == '-' || c == '.'; +} + +static bool is_num_char(char c) +{ + return isxdigit(c) || c == '+' || c == '-' || c == '.'; +} + +static bool is_name_char(char c) +{ + return isprint(c) && !is_blank_char(c) && c != CHAR_COMMENT && c != '(' && c != ')'; +} + +static void source_skip(struct source *src) +{ + char c; + while (source_peek(src, &c) && is_blank_char(c)) + source_next(src); + if (source_peek(src, &c) && c == CHAR_COMMENT) + while (source_get(src, &c) && c != '\n') + ; +} + +static str lex_name(struct source *src) +{ + size_t start = src->offset; + char c; + while (source_peek(src, &c) && is_name_char(c)) + source_next(src); + return str_from(src, start); +} + +static str lex_string(struct source *src, size_t start) +{ + strbuf buf = { .cap = 32 }; + char c; + while (source_get(src, &c)) { + if (c == '\"') { + str s = arraybuf_cast(buf); + arraybuf_insert(&src->strings, s); + return s; + } + // TODO: more escapes + if (c == '\\') { + if (!source_get(src, &c)) + source_error(src, loc_last(src), "unterminated escape"); + switch (c) { + case '\\': + c = '\\'; + break; + case '\"': + c = '\"'; + break; + default: + source_error(src, loc_last(src), "unknown escape"); + break; + } + } + arraybuf_insert(&buf, c); + } + source_error(src, loc_from(src, start), "unterminated string"); +} + +static double lex_number(struct source *src) +{ + size_t start = src->offset; + char c; + while (source_peek(src, &c) && is_num_char(c)) + source_next(src); + double num; + if (!str_parse_double(str_from(src, start), &num)) + source_error(src, loc_from(src, start), "invalid number"); + return num; +} + +enum op_category op_category_tab[] = { +#define OP_TYPE_CAT(name, sym, cat) OPC_##cat, + OP_TYPES(OP_TYPE_CAT) +#undef OP_TYPE_CAT +}; + +str op_name_tab[] = { +#define OP_TYPE_NAME(name, sym, cat) S(name), + OP_TYPES(OP_TYPE_NAME) +#undef OP_TYPE_NAME +}; + +static bool lex_op_type(str in_sym, enum op_type *op) +{ + if (0) {} +#define OP_TYPE_CASE(sym, name, cat) else if (array_eq(in_sym, S(sym))) { *op = OP_##name; return true; } + OP_TYPES(OP_TYPE_CASE) +#undef OP_TYPE_CASE + else return false; +} + +bool lex_token(struct source *src, struct token *tok) +{ + source_skip(src); + size_t start = src->offset; + char c; + if (!source_get(src, &c)) + return false; + switch (c) { + case '(': { + tok->type = TOKEN_OP; + source_skip(src); + + size_t start_name = src->offset; + str name = lex_name(src); + tok->op.name_loc = loc_from(src, start_name); + + if (!lex_op_type(name, &tok->op.type)) + source_error(src, tok->op.name_loc, "unknown operation"); + + arraybuf(struct token) children = {}; + while ((source_skip(src), !(source_peek(src, &c) && c == ')'))) { + arraybuf_grow(&children, 1); + if (!lex_token(src, &children.ptr[children.len++])) + source_error(src, loc_from(src, start), "unterminated operation"); + } + source_next(src); + array_assign(&tok->op.children, children); + break; + } + case '$': + tok->type = TOKEN_VAR; + tok->var = lex_name(src); + break; + case '"': + tok->type = TOKEN_STRING; + tok->string = lex_string(src, start); + break; + default: + source_unget(src); + if (is_num_start_char(c)) { + tok->type = TOKEN_NUMBER; + tok->number = lex_number(src); + } else if (is_name_char(c)) { + tok->type = TOKEN_IDENT; + tok->ident = lex_name(src); + } else { + source_error(src, loc_last(src), "expected token"); + } + break; + } + tok->loc = loc_from(src, start); + return true; +} + +void free_token(struct token *tok) +{ + if (tok->type != TOKEN_OP) return; + for (size_t i = 0; i < tok->op.children.len; i++) + free_token(&tok->op.children.ptr[i]); + free(tok->op.children.ptr); +} -- cgit v1.2.3