From e079370a92766181ff5281c5d6cea03a1fce5b93 Mon Sep 17 00:00:00 2001 From: "Anna (navi) Figueiredo Gomes" Date: Fri, 5 Jan 2024 22:51:33 +0100 Subject: initial release Signed-off-by: Anna (navi) Figueiredo Gomes --- src/string.c | 273 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 273 insertions(+) create mode 100644 src/string.c (limited to 'src/string.c') diff --git a/src/string.c b/src/string.c new file mode 100644 index 0000000..2aea375 --- /dev/null +++ b/src/string.c @@ -0,0 +1,273 @@ +#include +#include +#include +#include +#include "internal.h" + +char *json_string_get(const struct json *str) { + if (str->type != JSON_STRING) + return NULL; + return str->string; +} + +enum json_parse_result parse_string(struct json **json_out, struct raw_json *raw) { + struct json *json = json_new_string(); + if (!json) + return JSON_OOM; + enum json_parse_result ret = parse_raw_string(&json->string, raw); + if (ret == JSON_PARSE_OK) { + *json_out = json; + } else { + json_delete(json); + } + return ret; +} + +static inline +enum json_parse_result parse_octet(uint8_t *out, const char ch) { + uint8_t value = ch; + if (value >= '0' && value <= '9') { + *out = value - '0'; + return JSON_PARSE_OK; + } + + value |= 0x20; // set 6th bit to force lowercase. + + if (value >= 'a' && value <= 'f') { + *out = 9 + (value & 0xF); // 'a' & 00001111 == 0001, but 0xA == 10, so +9 + return JSON_PARSE_OK; + } + + return JSON_PARSE_STRING_INVALID_UNICODE_ERR; +} + +static inline +enum json_parse_result parse_hex_pair(uint32_t *out, struct raw_json *raw) { + if (raw->index + 4 > raw->size) { + return JSON_PARSE_STRING_INVALID_UNICODE_ERR; + } + enum json_parse_result ret = JSON_PARSE_OK; + uint8_t hihi = 0; + uint8_t hilo = 0; + + ret = parse_octet(&hihi, raw->data[raw->index++]); + if (ret != JSON_PARSE_OK) + return ret; + hihi <<= 4; + ret = parse_octet(&hilo, raw->data[raw->index++]); + if (ret != JSON_PARSE_OK) + return ret; + + uint8_t hi = hihi | hilo; + + ret = parse_octet(&hihi, raw->data[raw->index++]); + if (ret != JSON_PARSE_OK) + return ret; + hihi <<= 4; + ret = parse_octet(&hilo, raw->data[raw->index++]); + if (ret != JSON_PARSE_OK) + return ret; + + uint8_t lo = hihi | hilo; + + *out = (hi << 8) | lo; + return ret; +} + +static inline +enum json_parse_result parse_unicode(char *str, size_t len, size_t i, struct raw_json *raw) { + enum json_parse_result ret = JSON_PARSE_OK; + uint32_t codepoint = 0; + ret = parse_hex_pair(&codepoint, raw); + if (ret != JSON_PARSE_OK) { + return ret; + } + + if (codepoint <= 0x1F || (codepoint >= 0x7F && codepoint <= 0x9F)) { + if (i + 6 > len || i + 6 > raw->size) { + return JSON_PARSE_STRING_INVALID_ERR; + } + + /* unescaping the same codepoints as we do on ascii */ + switch (codepoint) { + case 0x8: + str[i++] = '\b'; + return JSON_PARSE_OK; + case 0x9: + str[i++] = '\t'; + return JSON_PARSE_OK; + case 0xa: + str[i++] = '\n'; + return JSON_PARSE_OK; + case 0xc: + str[i++] = '\f'; + return JSON_PARSE_OK; + case 0xd: + str[i++] = '\r'; + return JSON_PARSE_OK; + } + + /* rolling back index so we can parse the codepoint again */ + raw->index -= 6; + for (size_t limit = i + 6; i < limit; i++) { + str[i] = raw->data[raw->index++]; + } + /* the calling function expects to do the last advance */ + raw->index--; + return ret; + } + + /* here we're dealing with a utf-16 surrogate pair*/ + if (codepoint >= 0xD800 && codepoint <= 0xDFFF) { + if (raw->index + 6 > raw->size + || (raw->data[raw->index++] != '\\' || raw->data[raw->index++] != 'u') + || codepoint < 0xD800 || codepoint > 0xDBFF) { + ret = JSON_PARSE_STRING_INVALID_UNICODE_ERR; + return ret; + } + uint32_t second_codepoint = 0; + ret = parse_hex_pair(&second_codepoint, raw); + if (ret != JSON_PARSE_OK) { + return ret; + } + + if (second_codepoint < 0xDC00 || second_codepoint > 0xDFFF) { + return JSON_PARSE_STRING_INVALID_UNICODE_ERR; + } + /* 0x3FF = 00000011111111 - we mask the lower 10 bits of both, + * the first codepoint make up the high 10 bits of the result, + * the second makes up the low 10 bits */ + codepoint = ((codepoint & 0x3FF) << 10) | (second_codepoint & 0x3FF); + codepoint += 0x10000; + if (codepoint < 0x110000 && i + 4 < len) { + str[i++] = 0xF0 | (codepoint >> 18); + str[i++] = 0x80 | ((codepoint >> 12) & 0x3F); + str[i++] = 0x80 | ((codepoint >> 6) & 0x3F); + str[i] = 0x80 | (codepoint & 0x3F); + } else { + ret = JSON_PARSE_STRING_INVALID_UNICODE_ERR; + return ret; + } + /* here it's utf-8 */ + } else { + if (codepoint < 0x80) { + str[i] = codepoint; + } else if (codepoint < 0x800) { + str[i++] = 0xC0 | (codepoint >> 6); + str[i] = 0x80 | (codepoint & 0x3F); + } else if (codepoint < 0x10000) { + str[i++] = 0xE0 | (codepoint >> 12); + str[i++] = 0x80 | ((codepoint >> 6) & 0x3F); + str[i] = 0x80 | (codepoint & 0x3F); + } else { + ret = JSON_PARSE_STRING_INVALID_UNICODE_ERR; + return ret; + } + } + + /* the calling function expects to do the last advance */ + raw->index--; + return ret; +} + +enum json_parse_result parse_raw_string(char **str_out, struct raw_json *raw) { + assert(raw->data[raw->index] == '"'); + assert(str_out); + assert(raw); + + char *str = NULL; + enum json_parse_result ret = JSON_PARSE_OK; + size_t end = raw->index + 1; + size_t skipped = 0; + for (; end < raw->size && raw->data[end] != '"'; end++) { + switch (raw->data[end]) { + case '\n': + case '\f': + case '\b': + case '\r': + case '\t': + case 0: + ret = JSON_PARSE_STRING_INVALID_ESCAPE_ERR; + goto err; + case '\\': + /* for unicode escapes, we can't skip bytes, since control + * escape codes will not be parsed, and will be stored as is */ + if (++end < raw->size && raw->data[end] != 'u') { + skipped++; + } + break; + } + } + + if (raw->data[end] != '"') { + ret = JSON_PARSE_STRING_INVALID_ERR; + goto err; + } + + size_t len = end - raw->index - skipped; + str = calloc(len + 1, sizeof(char)); + if (!str) { + return JSON_OOM; + } + size_t i; + for (i = 0, raw->index++; raw->index < raw->size && raw->data[raw->index] != '"' && i < len; raw->index++, i++) { + if (raw->data[raw->index] == '\\' && raw->index + 1 < raw->size) { + switch (raw->data[++raw->index]) { + case '\\': + case '\"': + case '/': + break; + case 'n': + str[i] = '\n'; + continue; + case 'f': + str[i] = '\f'; + continue; + case 'b': + str[i] = '\b'; + continue; + case 'r': + str[i] = '\r'; + continue; + case 't': + str[i] = '\t'; + continue; + case 'u': { + raw->index++; + ret = parse_unicode(str, len, i, raw); + if (ret != JSON_PARSE_OK) + goto err; + continue; + } + default: + ret = JSON_PARSE_STRING_INVALID_ESCAPE_ERR; + goto err; + } + } + str[i] = raw->data[raw->index]; + } + + if (raw->data[raw->index++] != '"') { + ret = JSON_PARSE_INVALID_STRING_ERR; + goto err; + } + + *str_out = str; + goto end; +err: + free(str); +end: + return ret; +} + +void json_string_set(struct json *dest, const char *string) { + if (dest->type != JSON_STRING) { + char *key = dest->key; + dest->key = NULL; + json_clear(dest); + dest->key = key; + dest->type = JSON_STRING; + } + free(dest->string); + dest->string = strdup(string); +} -- cgit v1.2.3