diff options
Diffstat (limited to 'common/unicode.c')
-rw-r--r-- | common/unicode.c | 101 |
1 files changed, 101 insertions, 0 deletions
diff --git a/common/unicode.c b/common/unicode.c new file mode 100644 index 00000000..38a9b48e --- /dev/null +++ b/common/unicode.c @@ -0,0 +1,101 @@ +#include <stdint.h> +#include <stddef.h> +#include "unicode.h" + +size_t utf8_chsize(uint32_t ch) { + if (ch < 0x80) { + return 1; + } else if (ch < 0x800) { + return 2; + } else if (ch < 0x10000) { + return 3; + } + return 4; +} + +static const uint8_t masks[] = { + 0x7F, + 0x1F, + 0x0F, + 0x07, + 0x03, + 0x01 +}; + +uint32_t utf8_decode(const char **char_str) { + uint8_t **s = (uint8_t **)char_str; + + uint32_t cp = 0; + if (**s < 128) { + // shortcut + cp = **s; + ++*s; + return cp; + } + int size = utf8_size((char *)*s); + if (size == -1) { + ++*s; + return UTF8_INVALID; + } + uint8_t mask = masks[size - 1]; + cp = **s & mask; + ++*s; + while (--size) { + cp <<= 6; + cp |= **s & 0x3f; + ++*s; + } + return cp; +} + +size_t utf8_encode(char *str, uint32_t ch) { + size_t len = 0; + uint8_t first; + + if (ch < 0x80) { + first = 0; + len = 1; + } else if (ch < 0x800) { + first = 0xc0; + len = 2; + } else if (ch < 0x10000) { + first = 0xe0; + len = 3; + } else { + first = 0xf0; + len = 4; + } + + for (size_t i = len - 1; i > 0; --i) { + str[i] = (ch & 0x3f) | 0x80; + ch >>= 6; + } + + str[0] = ch | first; + return len; +} + + +static const struct { + uint8_t mask; + uint8_t result; + int octets; +} sizes[] = { + { 0x80, 0x00, 1 }, + { 0xE0, 0xC0, 2 }, + { 0xF0, 0xE0, 3 }, + { 0xF8, 0xF0, 4 }, + { 0xFC, 0xF8, 5 }, + { 0xFE, 0xF8, 6 }, + { 0x80, 0x80, -1 }, +}; + +int utf8_size(const char *s) { + uint8_t c = (uint8_t)*s; + for (size_t i = 0; i < sizeof(sizes) / 2; ++i) { + if ((c & sizes[i].mask) == sizes[i].result) { + return sizes[i].octets; + } + } + return -1; +} |