summaryrefslogtreecommitdiff
path: root/src/string.c
diff options
context:
space:
mode:
authorAnna (navi) Figueiredo Gomes <navi@vlhl.dev>2024-01-05 22:51:33 +0100
committerAnna (navi) Figueiredo Gomes <navi@vlhl.dev>2024-01-06 18:58:13 +0100
commite079370a92766181ff5281c5d6cea03a1fce5b93 (patch)
treec05ffce648ea7cb5e3c3ebb102ac4c80ff9b4c35 /src/string.c
initial release
Signed-off-by: Anna (navi) Figueiredo Gomes <navi@vlhl.dev>
Diffstat (limited to 'src/string.c')
-rw-r--r--src/string.c273
1 files changed, 273 insertions, 0 deletions
diff --git a/src/string.c b/src/string.c
new file mode 100644
index 0000000..2aea375
--- /dev/null
+++ b/src/string.c
@@ -0,0 +1,273 @@
+#include <assert.h>
+#include <string.h>
+#include <stdint.h>
+#include <json.h>
+#include "internal.h"
+
+char *json_string_get(const struct json *str) {
+ if (str->type != JSON_STRING)
+ return NULL;
+ return str->string;
+}
+
+enum json_parse_result parse_string(struct json **json_out, struct raw_json *raw) {
+ struct json *json = json_new_string();
+ if (!json)
+ return JSON_OOM;
+ enum json_parse_result ret = parse_raw_string(&json->string, raw);
+ if (ret == JSON_PARSE_OK) {
+ *json_out = json;
+ } else {
+ json_delete(json);
+ }
+ return ret;
+}
+
+static inline
+enum json_parse_result parse_octet(uint8_t *out, const char ch) {
+ uint8_t value = ch;
+ if (value >= '0' && value <= '9') {
+ *out = value - '0';
+ return JSON_PARSE_OK;
+ }
+
+ value |= 0x20; // set 6th bit to force lowercase.
+
+ if (value >= 'a' && value <= 'f') {
+ *out = 9 + (value & 0xF); // 'a' & 00001111 == 0001, but 0xA == 10, so +9
+ return JSON_PARSE_OK;
+ }
+
+ return JSON_PARSE_STRING_INVALID_UNICODE_ERR;
+}
+
+static inline
+enum json_parse_result parse_hex_pair(uint32_t *out, struct raw_json *raw) {
+ if (raw->index + 4 > raw->size) {
+ return JSON_PARSE_STRING_INVALID_UNICODE_ERR;
+ }
+ enum json_parse_result ret = JSON_PARSE_OK;
+ uint8_t hihi = 0;
+ uint8_t hilo = 0;
+
+ ret = parse_octet(&hihi, raw->data[raw->index++]);
+ if (ret != JSON_PARSE_OK)
+ return ret;
+ hihi <<= 4;
+ ret = parse_octet(&hilo, raw->data[raw->index++]);
+ if (ret != JSON_PARSE_OK)
+ return ret;
+
+ uint8_t hi = hihi | hilo;
+
+ ret = parse_octet(&hihi, raw->data[raw->index++]);
+ if (ret != JSON_PARSE_OK)
+ return ret;
+ hihi <<= 4;
+ ret = parse_octet(&hilo, raw->data[raw->index++]);
+ if (ret != JSON_PARSE_OK)
+ return ret;
+
+ uint8_t lo = hihi | hilo;
+
+ *out = (hi << 8) | lo;
+ return ret;
+}
+
+static inline
+enum json_parse_result parse_unicode(char *str, size_t len, size_t i, struct raw_json *raw) {
+ enum json_parse_result ret = JSON_PARSE_OK;
+ uint32_t codepoint = 0;
+ ret = parse_hex_pair(&codepoint, raw);
+ if (ret != JSON_PARSE_OK) {
+ return ret;
+ }
+
+ if (codepoint <= 0x1F || (codepoint >= 0x7F && codepoint <= 0x9F)) {
+ if (i + 6 > len || i + 6 > raw->size) {
+ return JSON_PARSE_STRING_INVALID_ERR;
+ }
+
+ /* unescaping the same codepoints as we do on ascii */
+ switch (codepoint) {
+ case 0x8:
+ str[i++] = '\b';
+ return JSON_PARSE_OK;
+ case 0x9:
+ str[i++] = '\t';
+ return JSON_PARSE_OK;
+ case 0xa:
+ str[i++] = '\n';
+ return JSON_PARSE_OK;
+ case 0xc:
+ str[i++] = '\f';
+ return JSON_PARSE_OK;
+ case 0xd:
+ str[i++] = '\r';
+ return JSON_PARSE_OK;
+ }
+
+ /* rolling back index so we can parse the codepoint again */
+ raw->index -= 6;
+ for (size_t limit = i + 6; i < limit; i++) {
+ str[i] = raw->data[raw->index++];
+ }
+ /* the calling function expects to do the last advance */
+ raw->index--;
+ return ret;
+ }
+
+ /* here we're dealing with a utf-16 surrogate pair*/
+ if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
+ if (raw->index + 6 > raw->size
+ || (raw->data[raw->index++] != '\\' || raw->data[raw->index++] != 'u')
+ || codepoint < 0xD800 || codepoint > 0xDBFF) {
+ ret = JSON_PARSE_STRING_INVALID_UNICODE_ERR;
+ return ret;
+ }
+ uint32_t second_codepoint = 0;
+ ret = parse_hex_pair(&second_codepoint, raw);
+ if (ret != JSON_PARSE_OK) {
+ return ret;
+ }
+
+ if (second_codepoint < 0xDC00 || second_codepoint > 0xDFFF) {
+ return JSON_PARSE_STRING_INVALID_UNICODE_ERR;
+ }
+ /* 0x3FF = 00000011111111 - we mask the lower 10 bits of both,
+ * the first codepoint make up the high 10 bits of the result,
+ * the second makes up the low 10 bits */
+ codepoint = ((codepoint & 0x3FF) << 10) | (second_codepoint & 0x3FF);
+ codepoint += 0x10000;
+ if (codepoint < 0x110000 && i + 4 < len) {
+ str[i++] = 0xF0 | (codepoint >> 18);
+ str[i++] = 0x80 | ((codepoint >> 12) & 0x3F);
+ str[i++] = 0x80 | ((codepoint >> 6) & 0x3F);
+ str[i] = 0x80 | (codepoint & 0x3F);
+ } else {
+ ret = JSON_PARSE_STRING_INVALID_UNICODE_ERR;
+ return ret;
+ }
+ /* here it's utf-8 */
+ } else {
+ if (codepoint < 0x80) {
+ str[i] = codepoint;
+ } else if (codepoint < 0x800) {
+ str[i++] = 0xC0 | (codepoint >> 6);
+ str[i] = 0x80 | (codepoint & 0x3F);
+ } else if (codepoint < 0x10000) {
+ str[i++] = 0xE0 | (codepoint >> 12);
+ str[i++] = 0x80 | ((codepoint >> 6) & 0x3F);
+ str[i] = 0x80 | (codepoint & 0x3F);
+ } else {
+ ret = JSON_PARSE_STRING_INVALID_UNICODE_ERR;
+ return ret;
+ }
+ }
+
+ /* the calling function expects to do the last advance */
+ raw->index--;
+ return ret;
+}
+
+enum json_parse_result parse_raw_string(char **str_out, struct raw_json *raw) {
+ assert(raw->data[raw->index] == '"');
+ assert(str_out);
+ assert(raw);
+
+ char *str = NULL;
+ enum json_parse_result ret = JSON_PARSE_OK;
+ size_t end = raw->index + 1;
+ size_t skipped = 0;
+ for (; end < raw->size && raw->data[end] != '"'; end++) {
+ switch (raw->data[end]) {
+ case '\n':
+ case '\f':
+ case '\b':
+ case '\r':
+ case '\t':
+ case 0:
+ ret = JSON_PARSE_STRING_INVALID_ESCAPE_ERR;
+ goto err;
+ case '\\':
+ /* for unicode escapes, we can't skip bytes, since control
+ * escape codes will not be parsed, and will be stored as is */
+ if (++end < raw->size && raw->data[end] != 'u') {
+ skipped++;
+ }
+ break;
+ }
+ }
+
+ if (raw->data[end] != '"') {
+ ret = JSON_PARSE_STRING_INVALID_ERR;
+ goto err;
+ }
+
+ size_t len = end - raw->index - skipped;
+ str = calloc(len + 1, sizeof(char));
+ if (!str) {
+ return JSON_OOM;
+ }
+ size_t i;
+ for (i = 0, raw->index++; raw->index < raw->size && raw->data[raw->index] != '"' && i < len; raw->index++, i++) {
+ if (raw->data[raw->index] == '\\' && raw->index + 1 < raw->size) {
+ switch (raw->data[++raw->index]) {
+ case '\\':
+ case '\"':
+ case '/':
+ break;
+ case 'n':
+ str[i] = '\n';
+ continue;
+ case 'f':
+ str[i] = '\f';
+ continue;
+ case 'b':
+ str[i] = '\b';
+ continue;
+ case 'r':
+ str[i] = '\r';
+ continue;
+ case 't':
+ str[i] = '\t';
+ continue;
+ case 'u': {
+ raw->index++;
+ ret = parse_unicode(str, len, i, raw);
+ if (ret != JSON_PARSE_OK)
+ goto err;
+ continue;
+ }
+ default:
+ ret = JSON_PARSE_STRING_INVALID_ESCAPE_ERR;
+ goto err;
+ }
+ }
+ str[i] = raw->data[raw->index];
+ }
+
+ if (raw->data[raw->index++] != '"') {
+ ret = JSON_PARSE_INVALID_STRING_ERR;
+ goto err;
+ }
+
+ *str_out = str;
+ goto end;
+err:
+ free(str);
+end:
+ return ret;
+}
+
+void json_string_set(struct json *dest, const char *string) {
+ if (dest->type != JSON_STRING) {
+ char *key = dest->key;
+ dest->key = NULL;
+ json_clear(dest);
+ dest->key = key;
+ dest->type = JSON_STRING;
+ }
+ free(dest->string);
+ dest->string = strdup(string);
+}