aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Michael Forney <mforney@mforney.org> 2021-07-06 18:51:56 -0700
committer: Michael Forney <mforney@mforney.org> 2021-07-06 19:09:10 -0700
commit: 70a532525946a28dc1ffacc542ba6a5b67aee986 (patch)
tree: bff296a43c06fff62bed451509782cfbc703167a
parent: 0f348202d1282d2b38c3a84547693a7a82b25724 (diff)
Add functions for encoding/decoding UTF-8/16
These will be needed to implement wide string literals.
-rw-r--r-- Makefile 2
-rw-r--r-- utf.c 80
-rw-r--r-- utf.h 3
3 files changed, 85 insertions, 0 deletions
diff --git a/Makefile b/Makefile
index f53225d..b04f111 100644
--- a/Makefile
+++ b/Makefile
@@ -37,6 +37,7 @@ SRC=\
token.c\
tree.c\
type.c\
+ utf.c\
util.c\
$(BACKEND).c
OBJ=$(SRC:%.c=$(objdir)/%.o)
@@ -61,6 +62,7 @@ $(objdir)/targ.o : targ.c util.h cc.h $(stagedeps) ; $(CC) $(CFLAGS)
$(objdir)/token.o : token.c util.h cc.h $(stagedeps) ; $(CC) $(CFLAGS) -c -o $@ token.c
$(objdir)/tree.o : tree.c util.h $(stagedeps) ; $(CC) $(CFLAGS) -c -o $@ tree.c
$(objdir)/type.o : type.c util.h cc.h $(stagedeps) ; $(CC) $(CFLAGS) -c -o $@ type.c
+$(objdir)/utf.o : utf.c utf.h $(stagedeps) ; $(CC) $(CFLAGS) -c -o $@ utf.c
$(objdir)/util.o : util.c util.h $(stagedeps) ; $(CC) $(CFLAGS) -c -o $@ util.c
# Make sure stage2 and stage3 binaries are stripped by adding -s to
diff --git a/utf.c b/utf.c
new file mode 100644
index 0000000..a8f5a49
--- /dev/null
+++ b/utf.c
@@ -0,0 +1,80 @@
+#include <uchar.h>
+#include "utf.h"
+
+/*
+ * Encode the code point c as UTF-8, storing the result in s.
+ *
+ * Between 1 and 4 bytes are written; no terminating NUL is added.
+ * Returns the number of bytes stored, or (size_t)-1 without writing
+ * anything when c is a UTF-16 surrogate (0xd800-0xdfff) or lies above
+ * 0x10ffff.
+ */
+size_t
+utf8enc(char32_t c, char *s)
+{
+	/* marker bits of the first byte, indexed by sequence length */
+	static const unsigned char lead[] = {0x00, 0x00, 0xc0, 0xe0, 0xf0};
+	size_t len, i;
+
+	if (c < 0x80)
+		len = 1;
+	else if (c < 0x800)
+		len = 2;
+	else if (c < 0xd800 || (c >= 0xe000 && c <= 0xffff))
+		len = 3;
+	else if (c >= 0x10000 && c <= 0x10ffff)
+		len = 4;
+	else
+		return -1;
+
+	/* fill the continuation bytes back to front, 6 payload bits each */
+	for (i = len - 1; i > 0; --i) {
+		s[i] = 0x80 | (c & 0x3f);
+		c >>= 6;
+	}
+	/* the bits left in c are exactly the first byte's payload */
+	s[0] = lead[len] | c;
+	return len;
+}
+
+/*
+ * Decode a single UTF-8 encoded code point from the first n bytes of s.
+ *
+ * On success the code point is stored in *c and the number of bytes
+ * consumed (1-4) is returned. On failure (size_t)-1 is returned and *c
+ * is left untouched. Failure means one of:
+ *   - n is 0,
+ *   - the first byte is a stray continuation byte or is 0xf8-0xff,
+ *   - the sequence needs more than n bytes,
+ *   - a following byte is not a continuation byte (10xxxxxx),
+ *   - the sequence is overlong, or decodes to a UTF-16 surrogate or to
+ *     a value above 0x10ffff (none of which utf8enc would produce).
+ */
+size_t
+utf8dec(const char *s, size_t n, char32_t *c)
+{
+	/* smallest code point that genuinely needs a sequence of length l */
+	static const char32_t min[] = {0, 0, 0x80, 0x800, 0x10000};
+	size_t i, l;
+	unsigned char b;
+	char32_t x;
+
+	if (n == 0)
+		return -1;
+	b = s[0];
+	if (b < 0x80) {
+		*c = b;
+		return 1;
+	}
+	if ((b & 0xe0) == 0xc0) {
+		x = b & 0x1f;
+		l = 2;
+	} else if ((b & 0xf0) == 0xe0) {
+		x = b & 0x0f;
+		l = 3;
+	} else if ((b & 0xf8) == 0xf0) {
+		x = b & 0x07;
+		l = 4;
+	} else {
+		/*
+		 * Not a valid first byte. Without this branch l and x
+		 * would be read uninitialized below.
+		 */
+		return -1;
+	}
+	if (n < l)
+		return -1;
+	for (i = 1; i < l; ++i) {
+		b = s[i];
+		if ((b & 0xc0) != 0x80)
+			return -1;
+		x = x << 6 | (b & 0x3f);
+	}
+	/* unsigned wrap-around makes the middle test "0xd800 <= x <= 0xdfff" */
+	if (x < min[l] || x - 0xd800 < 0x800 || x > 0x10ffff)
+		return -1;
+	*c = x;
+	return l;
+}
+
+/*
+ * Encode the code point c as UTF-16, storing the result in s.
+ *
+ * Code points below 0x10000 outside the surrogate range take one code
+ * unit; 0x10000-0x10ffff take a high/low surrogate pair. Returns the
+ * number of code units stored, or (size_t)-1 without writing anything
+ * when c is itself a surrogate (0xd800-0xdfff) or lies above 0x10ffff.
+ */
+size_t
+utf16enc(char32_t c, char16_t *s)
+{
+	char32_t off, hi, lo;
+
+	if (c < 0xd800 || (c >= 0xe000 && c <= 0xffff)) {
+		s[0] = c;
+		return 1;
+	}
+
+	/*
+	 * Offset into the supplementary planes. For a surrogate input the
+	 * unsigned subtraction wraps to a huge value and is rejected by the
+	 * same range test as code points above 0x10ffff.
+	 */
+	off = c - 0x10000;
+	if (off >= 0x100000)
+		return -1;
+
+	hi = (off >> 10) & 0x3ff;	/* upper 10 bits */
+	lo = off & 0x3ff;		/* lower 10 bits */
+	s[0] = 0xd800 | hi;
+	s[1] = 0xdc00 | lo;
+	return 2;
+}
diff --git a/utf.h b/utf.h
new file mode 100644
index 0000000..3f360ad
--- /dev/null
+++ b/utf.h
@@ -0,0 +1,3 @@
+/*
+ * Encode a code point as UTF-8 into the given buffer (1-4 bytes written).
+ * Returns the byte count, or (size_t)-1 if the code point is a surrogate
+ * or above 0x10ffff.
+ */
+size_t utf8enc(char32_t, char *);
+/*
+ * Decode one UTF-8 sequence from a buffer of the given length into the
+ * output code point. Returns the number of bytes consumed, or (size_t)-1
+ * if the sequence is truncated or contains a bad continuation byte.
+ */
+size_t utf8dec(const char *, size_t, char32_t *);
+/*
+ * Encode a code point as UTF-16 into the given buffer (1 or 2 code units
+ * written). Returns the unit count, or (size_t)-1 if the code point is a
+ * surrogate or above 0x10ffff.
+ */
+size_t utf16enc(char32_t, char16_t *);