From e2d6bba40d79ce8b59c2a8b49f6d7241183ae15a Mon Sep 17 00:00:00 2001 From: cinap_lenrek Date: Thu, 20 Dec 2012 17:58:26 +0100 Subject: file: detect and unwrap utf-16 encoded text formats --- sys/src/cmd/file.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 64 insertions(+), 4 deletions(-) diff --git a/sys/src/cmd/file.c b/sys/src/cmd/file.c index 640494c7d..dec241f96 100644 --- a/sys/src/cmd/file.c +++ b/sys/src/cmd/file.c @@ -325,6 +325,68 @@ chartorune1(Rune1 *rune, char *str) return 4; } +void +utfconv(void) +{ + Rune r; + uchar *rb; + char *p, *e; + int i; + + if(nbuf < 4) + return; + + if(memcmp(buf, "\x00\x00\xFE\xFF", 4) == 0){ + if(!mime) + print("utf-32be "); + return; + } else + if(memcmp(buf, "\xFE\xFF\x00\x00", 4) == 0){ + if(!mime) + print("utf-32le "); + return; + } else + if(memcmp(buf, "\xEF\xBB\xBF", 3) == 0){ + memmove(buf, buf+3, nbuf-3); + nbuf -= 3; + return; + } else + if(memcmp(buf, "\xFE\xFF", 2) == 0){ + if(!mime) + print("utf-16be "); + + nbuf -= 2; + rb = malloc(nbuf+1); + memmove(rb, buf+2, nbuf); + p = (char*)buf; + e = p+nbuf-4; + for(i=0; i