diff options
author | Michael Forney <mforney@mforney.org> | 2021-07-06 18:51:56 -0700 |
---|---|---|
committer | Michael Forney <mforney@mforney.org> | 2021-07-06 19:09:10 -0700 |
commit | 70a532525946a28dc1ffacc542ba6a5b67aee986 (patch) | |
tree | bff296a43c06fff62bed451509782cfbc703167a | |
parent | 0f348202d1282d2b38c3a84547693a7a82b25724 (diff) |
Add functions for encoding/decoding UTF-8/16
These will be needed to implement wide string literals.
-rw-r--r-- | Makefile | 2 | ||||
-rw-r--r-- | utf.c | 80 | ||||
-rw-r--r-- | utf.h | 3 |
3 files changed, 85 insertions, 0 deletions
@@ -37,6 +37,7 @@ SRC=\ token.c\ tree.c\ type.c\ + utf.c\ util.c\ $(BACKEND).c OBJ=$(SRC:%.c=$(objdir)/%.o) @@ -61,6 +62,7 @@ $(objdir)/targ.o : targ.c util.h cc.h $(stagedeps) ; $(CC) $(CFLAGS) $(objdir)/token.o : token.c util.h cc.h $(stagedeps) ; $(CC) $(CFLAGS) -c -o $@ token.c $(objdir)/tree.o : tree.c util.h $(stagedeps) ; $(CC) $(CFLAGS) -c -o $@ tree.c $(objdir)/type.o : type.c util.h cc.h $(stagedeps) ; $(CC) $(CFLAGS) -c -o $@ type.c +$(objdir)/utf.o : utf.c utf.h $(stagedeps) ; $(CC) $(CFLAGS) -c -o $@ utf.c $(objdir)/util.o : util.c util.h $(stagedeps) ; $(CC) $(CFLAGS) -c -o $@ util.c # Make sure stage2 and stage3 binaries are stripped by adding -s to @@ -0,0 +1,80 @@ +#include <uchar.h> +#include "utf.h" + +size_t +utf8enc(char32_t c, char *s) +{ + if (c < 0x80) { + s[0] = c; + return 1; + } + if (c < 0x800) { + s[0] = 0xc0 | c >> 6; + s[1] = 0x80 | c & 0x3f; + return 2; + } + if (c < 0xd800 || c - 0xe000 < 0x2000) { + s[0] = 0xe0 | c >> 12; + s[1] = 0x80 | c >> 6 & 0x3f; + s[2] = 0x80 | c & 0x3f; + return 3; + } + if (c - 0x10000 < 0x100000) { + s[0] = 0xf0 | c >> 18; + s[1] = 0x80 | c >> 12 & 0x3f; + s[2] = 0x80 | c >> 6 & 0x3f; + s[3] = 0x80 | c & 0x3f; + return 4; + } + return -1; +} + +size_t +utf8dec(const char *s, size_t n, char32_t *c) +{ + size_t i, l; + unsigned char b; + char32_t x; + + b = s[0]; + if (b < 0x80) { + *c = b; + return 1; + } + if ((b & 0xe0) == 0xc0) { + x = b & 0x1f; + l = 2; + } else if ((b & 0xf0) == 0xe0) { + x = b & 0x0f; + l = 3; + } else if ((b & 0xf8) == 0xf0) { + x = b & 0x07; + l = 4; + } + if (n < l) + return -1; + for (i = 1; i < l; ++i) { + b = *++s; + if ((b & 0xc0) != 0x80) + return -1; + x = x << 6 | b & 0x3f; + } + *c = x; + return l; +} + +size_t +utf16enc(char32_t c, char16_t *s) +{ + if (c < 0xd800 || c - 0xe000 < 0x2000) { + s[0] = c; + return 1; + } + c -= 0x10000; + if (c < 0x100000) { + s[0] = 0xd800 | c >> 10 & 0x3ff; + s[1] = 0xdc00 | c & 0x3ff; + return 2; + } + return -1; +} @@ -0,0 +1,3 @@ +size_t utf8enc(char32_t, char *); +size_t utf8dec(const char *, size_t, char32_t *); +size_t utf16enc(char32_t, char16_t *); |