From e079370a92766181ff5281c5d6cea03a1fce5b93 Mon Sep 17 00:00:00 2001
From: "Anna (navi) Figueiredo Gomes" <navi@vlhl.dev>
Date: Fri, 5 Jan 2024 22:51:33 +0100
Subject: initial release

Signed-off-by: Anna (navi) Figueiredo Gomes <navi@vlhl.dev>
---
 src/string.c | 273 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 273 insertions(+)
 create mode 100644 src/string.c

(limited to 'src/string.c')

diff --git a/src/string.c b/src/string.c
new file mode 100644
index 0000000..2aea375
--- /dev/null
+++ b/src/string.c
@@ -0,0 +1,273 @@
+#include <assert.h>
+#include <string.h>
+#include <stdint.h>
+#include <json.h>
+#include "internal.h"
+
+/*
+ * Return the character pointer stored in a JSON string node.
+ *
+ * Yields NULL when the node's type is anything other than JSON_STRING.
+ * The pointer returned is the node's own `string` member, not a copy.
+ */
+char *json_string_get(const struct json *str) {
+	return str->type == JSON_STRING ? str->string : NULL;
+}
+
+/*
+ * Parse a string literal from `raw` into a freshly created JSON string node.
+ *
+ * The node comes from json_new_string(); its `string` member is filled in by
+ * parse_raw_string(). On success the node is stored in *json_out. On a parse
+ * failure the node is released with json_delete() and *json_out is left
+ * untouched.
+ *
+ * Returns JSON_OOM when the node cannot be created, otherwise whatever
+ * parse_raw_string() reported.
+ */
+enum json_parse_result parse_string(struct json **json_out, struct raw_json *raw) {
+	struct json *node = json_new_string();
+	if (!node)
+		return JSON_OOM;
+
+	enum json_parse_result status = parse_raw_string(&node->string, raw);
+	if (status != JSON_PARSE_OK) {
+		json_delete(node);
+		return status;
+	}
+
+	*json_out = node;
+	return status;
+}
+
+/*
+ * Decode a single hexadecimal digit (despite the name, this yields one
+ * nibble, 0..15) from `ch` into *out. Both upper- and lowercase a-f are
+ * accepted.
+ *
+ * Returns JSON_PARSE_OK on success, or
+ * JSON_PARSE_STRING_INVALID_UNICODE_ERR when `ch` is not a hex digit, in
+ * which case *out is not written.
+ */
+static inline
+enum json_parse_result parse_octet(uint8_t *out, const char ch) {
+	const uint8_t byte = ch;
+
+	if (byte >= '0' && byte <= '9') {
+		*out = byte - '0';
+		return JSON_PARSE_OK;
+	}
+
+	/* OR-ing in 0x20 folds 'A'..'F' onto 'a'..'f' */
+	const uint8_t lower = byte | 0x20;
+	if (lower < 'a' || lower > 'f')
+		return JSON_PARSE_STRING_INVALID_UNICODE_ERR;
+
+	*out = lower - 'a' + 10;
+	return JSON_PARSE_OK;
+}
+
+/*
+ * Read four hexadecimal digits starting at raw->index and combine them, most
+ * significant digit first, into a 16-bit value stored in *out.
+ *
+ * raw->index is advanced past every digit that was examined, including a
+ * digit that turned out to be invalid. *out is written only on success.
+ *
+ * Returns JSON_PARSE_STRING_INVALID_UNICODE_ERR when fewer than four bytes
+ * remain in the input, the error reported by parse_octet() for a bad digit,
+ * or JSON_PARSE_OK.
+ */
+static inline
+enum json_parse_result parse_hex_pair(uint32_t *out, struct raw_json *raw) {
+	if (raw->index + 4 > raw->size)
+		return JSON_PARSE_STRING_INVALID_UNICODE_ERR;
+
+	uint32_t value = 0;
+	for (int digit = 0; digit < 4; digit++) {
+		uint8_t nibble = 0;
+		enum json_parse_result status = parse_octet(&nibble, raw->data[raw->index++]);
+		if (status != JSON_PARSE_OK)
+			return status;
+		value = (value << 4) | nibble;
+	}
+
+	*out = value;
+	return JSON_PARSE_OK;
+}
+
+/*
+ * Decode one \uXXXX escape (or a \uXXXX\uXXXX surrogate pair) into `str`.
+ *
+ * On entry raw->index is on the first hex digit and *pos is the next free
+ * slot in `str`, which has room for `len` bytes.
+ *
+ * What gets written:
+ *  - U+0008, U+0009, U+000A, U+000C and U+000D become the single control byte
+ *  - every other code point <= U+001F or in U+007F..U+009F is kept in its
+ *    six-byte escaped form, copied verbatim from the input
+ *  - a high surrogate must be followed by a \u low surrogate; the pair is
+ *    combined and written as four UTF-8 bytes
+ *  - anything else is written as one to three UTF-8 bytes
+ *
+ * On success *pos is left on the LAST byte written and raw->index on the
+ * last input byte consumed: the caller's loop performs the final advance of
+ * both. On failure *pos is not updated.
+ */
+static inline
+enum json_parse_result parse_unicode(char *str, size_t len, size_t *pos, struct raw_json *raw) {
+	size_t i = *pos;
+	uint32_t codepoint = 0;
+	enum json_parse_result ret = parse_hex_pair(&codepoint, raw);
+	if (ret != JSON_PARSE_OK) {
+		return ret;
+	}
+
+	if (codepoint <= 0x1F || (codepoint >= 0x7F && codepoint <= 0x9F)) {
+		if (i + 6 > len) {
+			return JSON_PARSE_STRING_INVALID_ERR;
+		}
+
+		/* unescaping the same codepoints as we do on ascii */
+		switch (codepoint) {
+			case 0x8:
+				str[i] = '\b';
+				break;
+			case 0x9:
+				str[i] = '\t';
+				break;
+			case 0xa:
+				str[i] = '\n';
+				break;
+			case 0xc:
+				str[i] = '\f';
+				break;
+			case 0xd:
+				str[i] = '\r';
+				break;
+			default:
+				/* raw->index is just past the four digits, so the
+				 * backslash that opened the escape is six bytes back */
+				for (size_t src = raw->index - 6; src < raw->index; src++) {
+					str[i++] = raw->data[src];
+				}
+				/* step back onto the last byte written */
+				i--;
+				break;
+		}
+
+		*pos = i;
+		/* the calling function expects to do the last advance */
+		raw->index--;
+		return JSON_PARSE_OK;
+	}
+
+	/* here we're dealing with a utf-16 surrogate pair */
+	if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
+		/* a low surrogate cannot come first, and a high surrogate must be
+		 * followed by another complete \uXXXX escape */
+		if (codepoint > 0xDBFF
+				|| raw->index + 6 > raw->size
+				|| raw->data[raw->index] != '\\'
+				|| raw->data[raw->index + 1] != 'u') {
+			return JSON_PARSE_STRING_INVALID_UNICODE_ERR;
+		}
+		raw->index += 2;
+
+		uint32_t second_codepoint = 0;
+		ret = parse_hex_pair(&second_codepoint, raw);
+		if (ret != JSON_PARSE_OK) {
+			return ret;
+		}
+
+		if (second_codepoint < 0xDC00 || second_codepoint > 0xDFFF) {
+			return JSON_PARSE_STRING_INVALID_UNICODE_ERR;
+		}
+		/* 0x3FF masks the lower 10 bits of both halves: the first
+		 * codepoint makes up the high 10 bits of the result, the second
+		 * makes up the low 10 bits */
+		codepoint = ((codepoint & 0x3FF) << 10) | (second_codepoint & 0x3FF);
+		codepoint += 0x10000;
+		if (i + 4 > len) {
+			return JSON_PARSE_STRING_INVALID_UNICODE_ERR;
+		}
+		str[i++] = (char)(0xF0 | (codepoint >> 18));
+		str[i++] = (char)(0x80 | ((codepoint >> 12) & 0x3F));
+		str[i++] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
+		str[i] = (char)(0x80 | (codepoint & 0x3F));
+	/* here it's a single codepoint, encoded as utf-8 */
+	} else if (codepoint < 0x80) {
+		if (i + 1 > len) {
+			return JSON_PARSE_STRING_INVALID_UNICODE_ERR;
+		}
+		str[i] = (char)codepoint;
+	} else if (codepoint < 0x800) {
+		if (i + 2 > len) {
+			return JSON_PARSE_STRING_INVALID_UNICODE_ERR;
+		}
+		str[i++] = (char)(0xC0 | (codepoint >> 6));
+		str[i] = (char)(0x80 | (codepoint & 0x3F));
+	} else {
+		/* four hex digits cannot exceed 0xFFFF, so three bytes suffice */
+		if (i + 3 > len) {
+			return JSON_PARSE_STRING_INVALID_UNICODE_ERR;
+		}
+		str[i++] = (char)(0xE0 | (codepoint >> 12));
+		str[i++] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
+		str[i] = (char)(0x80 | (codepoint & 0x3F));
+	}
+
+	*pos = i;
+	/* the calling function expects to do the last advance */
+	raw->index--;
+	return JSON_PARSE_OK;
+}
+
+/*
+ * Parse the double-quoted string literal that starts at raw->index and
+ * store its decoded contents in *str_out.
+ *
+ * raw->index must be on the opening quote (asserted). The input is walked
+ * twice: a first pass finds the closing quote and an upper bound for the
+ * decoded length, a second pass decodes escapes into a zero-filled buffer
+ * obtained from calloc().
+ *
+ * On success *str_out receives that buffer (NUL-terminated, to be released
+ * with free()) and raw->index is left just past the closing quote. On
+ * failure the buffer is freed and *str_out is not written.
+ *
+ * Returns:
+ *  - JSON_PARSE_OK on success
+ *  - JSON_PARSE_STRING_INVALID_ESCAPE_ERR for an unescaped newline, form
+ *    feed, backspace, carriage return, tab or NUL byte inside the literal,
+ *    or for an unknown backslash escape
+ *  - JSON_PARSE_STRING_INVALID_ERR when no closing quote is found
+ *  - JSON_OOM when the buffer cannot be allocated
+ *  - whatever parse_unicode() reports for a bad \u escape
+ */
+enum json_parse_result parse_raw_string(char **str_out, struct raw_json *raw) {
+	assert(raw);
+	assert(str_out);
+	assert(raw->data[raw->index] == '"');
+
+	char *str = NULL;
+	enum json_parse_result ret = JSON_PARSE_OK;
+	size_t end = raw->index + 1;
+	size_t skipped = 0;
+	for (; end < raw->size && raw->data[end] != '"'; end++) {
+		switch (raw->data[end]) {
+			case '\n':
+			case '\f':
+			case '\b':
+			case '\r':
+			case '\t':
+			case 0:
+				ret = JSON_PARSE_STRING_INVALID_ESCAPE_ERR;
+				goto err;
+			case '\\':
+				/* a two-byte escape decodes to one byte, so one byte can
+				 * be dropped from the estimate. unicode escapes keep all
+				 * six bytes, since control codepoints are stored as is */
+				if (++end < raw->size && raw->data[end] != 'u') {
+					skipped++;
+				}
+				break;
+			default:
+				break;
+		}
+	}
+
+	/* `end` may have run past the input (by up to two bytes after a trailing
+	 * backslash), so it must be range-checked before it is dereferenced */
+	if (end >= raw->size || raw->data[end] != '"') {
+		ret = JSON_PARSE_STRING_INVALID_ERR;
+		goto err;
+	}
+
+	/* upper bound on the decoded size; it also counts the opening quote,
+	 * which leaves one spare byte */
+	size_t len = end - raw->index - skipped;
+	str = calloc(len + 1, sizeof(char));
+	if (!str) {
+		return JSON_OOM;
+	}
+	size_t i;
+	for (i = 0, raw->index++; raw->index < raw->size && raw->data[raw->index] != '"' && i < len; raw->index++, i++) {
+		if (raw->data[raw->index] == '\\' && raw->index + 1 < raw->size) {
+			switch (raw->data[++raw->index]) {
+				case '\\':
+				case '\"':
+				case '/':
+					/* copied literally by the assignment after the switch */
+					break;
+				case 'n':
+					str[i] = '\n';
+					continue;
+				case 'f':
+					str[i] = '\f';
+					continue;
+				case 'b':
+					str[i] = '\b';
+					continue;
+				case 'r':
+					str[i] = '\r';
+					continue;
+				case 't':
+					str[i] = '\t';
+					continue;
+				case 'u': {
+					raw->index++;
+					/* may write several bytes; it leaves `i` on the last
+					 * one so the loop increment steps past it */
+					ret = parse_unicode(str, len, &i, raw);
+					if (ret != JSON_PARSE_OK)
+						goto err;
+					continue;
+				}
+				default:
+					ret = JSON_PARSE_STRING_INVALID_ESCAPE_ERR;
+					goto err;
+			}
+		}
+		str[i] = raw->data[raw->index];
+	}
+
+	/* NOTE(review): this error name is spelled differently from
+	 * JSON_PARSE_STRING_INVALID_ERR used above - confirm against the enum */
+	if (raw->index >= raw->size || raw->data[raw->index++] != '"') {
+		ret = JSON_PARSE_INVALID_STRING_ERR;
+		goto err;
+	}
+
+	*str_out = str;
+	goto end;
+err:
+	free(str);
+end:
+	return ret;
+}
+
+/*
+ * Make `dest` a JSON string node holding a private copy of `string`.
+ *
+ * When `dest` is not already a string its contents are released with
+ * json_clear() while its key is preserved, and its type becomes JSON_STRING.
+ * Any string previously held is freed.
+ *
+ * The copy is taken before anything is released, so `string` may point into
+ * memory owned by `dest` (for example the result of json_string_get(dest)).
+ * A NULL `string`, or a failed allocation, leaves dest->string set to NULL.
+ */
+void json_string_set(struct json *dest, const char *string) {
+	/* duplicate first: freeing before copying would read freed memory
+	 * whenever `string` aliases dest's own storage */
+	char *copy = string ? strdup(string) : NULL;
+
+	if (dest->type != JSON_STRING) {
+		char *key = dest->key;
+		/* hide the key so json_clear() does not see it */
+		dest->key = NULL;
+		json_clear(dest);
+		dest->key = key;
+		dest->type = JSON_STRING;
+	}
+	free(dest->string);
+	dest->string = copy;
+}
-- 
cgit v1.2.3