aboutsummaryrefslogtreecommitdiff
path: root/common/unicode.c
diff options
context:
space:
mode:
Diffstat (limited to 'common/unicode.c')
-rw-r--r--common/unicode.c101
1 files changed, 101 insertions, 0 deletions
diff --git a/common/unicode.c b/common/unicode.c
new file mode 100644
index 00000000..38a9b48e
--- /dev/null
+++ b/common/unicode.c
@@ -0,0 +1,101 @@
+#include <stdint.h>
+#include <stddef.h>
+#include "unicode.h"
+
+size_t utf8_chsize(uint32_t ch) {
+ if (ch < 0x80) {
+ return 1;
+ } else if (ch < 0x800) {
+ return 2;
+ } else if (ch < 0x10000) {
+ return 3;
+ }
+ return 4;
+}
+
+static const uint8_t masks[] = {
+ 0x7F,
+ 0x1F,
+ 0x0F,
+ 0x07,
+ 0x03,
+ 0x01
+};
+
+uint32_t utf8_decode(const char **char_str) {
+ uint8_t **s = (uint8_t **)char_str;
+
+ uint32_t cp = 0;
+ if (**s < 128) {
+ // shortcut
+ cp = **s;
+ ++*s;
+ return cp;
+ }
+ int size = utf8_size((char *)*s);
+ if (size == -1) {
+ ++*s;
+ return UTF8_INVALID;
+ }
+ uint8_t mask = masks[size - 1];
+ cp = **s & mask;
+ ++*s;
+ while (--size) {
+ cp <<= 6;
+ cp |= **s & 0x3f;
+ ++*s;
+ }
+ return cp;
+}
+
+size_t utf8_encode(char *str, uint32_t ch) {
+ size_t len = 0;
+ uint8_t first;
+
+ if (ch < 0x80) {
+ first = 0;
+ len = 1;
+ } else if (ch < 0x800) {
+ first = 0xc0;
+ len = 2;
+ } else if (ch < 0x10000) {
+ first = 0xe0;
+ len = 3;
+ } else {
+ first = 0xf0;
+ len = 4;
+ }
+
+ for (size_t i = len - 1; i > 0; --i) {
+ str[i] = (ch & 0x3f) | 0x80;
+ ch >>= 6;
+ }
+
+ str[0] = ch | first;
+ return len;
+}
+
+
+static const struct {
+ uint8_t mask;
+ uint8_t result;
+ int octets;
+} sizes[] = {
+ { 0x80, 0x00, 1 },
+ { 0xE0, 0xC0, 2 },
+ { 0xF0, 0xE0, 3 },
+ { 0xF8, 0xF0, 4 },
+ { 0xFC, 0xF8, 5 },
+ { 0xFE, 0xF8, 6 },
+ { 0x80, 0x80, -1 },
+};
+
+int utf8_size(const char *s) {
+ uint8_t c = (uint8_t)*s;
+ for (size_t i = 0; i < sizeof(sizes) / 2; ++i) {
+ if ((c & sizes[i].mask) == sizes[i].result) {
+ return sizes[i].octets;
+ }
+ }
+ return -1;
+}