diff options
author | cinap_lenrek <cinap_lenrek@felloff.net> | 2014-05-11 00:54:59 +0200 |
---|---|---|
committer | cinap_lenrek <cinap_lenrek@felloff.net> | 2014-05-11 00:54:59 +0200 |
commit | edca217bb99f7c32413c117239d12acdc223e811 (patch) | |
tree | c77aa8a8494ffe8784bf3b4e264579a50c2c4233 | |
parent | 7388792a124756a528666cb5c375ee919db9ca11 (diff) | |
download | plan9front-edca217bb99f7c32413c117239d12acdc223e811.tar.xz |
tcs: handle surrogate pairs
-rw-r--r-- | sys/src/cmd/tcs/conv_big5.c | 6 | ||||
-rw-r--r-- | sys/src/cmd/tcs/conv_gb.c | 6 | ||||
-rw-r--r-- | sys/src/cmd/tcs/conv_gbk.c | 6 | ||||
-rw-r--r-- | sys/src/cmd/tcs/conv_jis.c | 21 | ||||
-rw-r--r-- | sys/src/cmd/tcs/conv_ksc.c | 6 | ||||
-rw-r--r-- | sys/src/cmd/tcs/hdr.h | 2 | ||||
-rw-r--r-- | sys/src/cmd/tcs/html.c | 40 | ||||
-rw-r--r-- | sys/src/cmd/tcs/tcs.c | 142 | ||||
-rw-r--r-- | sys/src/cmd/tcs/tune.c | 21 | ||||
-rw-r--r-- | sys/src/cmd/tcs/utf.c | 14 |
10 files changed, 149 insertions, 115 deletions
diff --git a/sys/src/cmd/tcs/conv_big5.c b/sys/src/cmd/tcs/conv_big5.c index 87ecc88ae..8e6b60858 100644 --- a/sys/src/cmd/tcs/conv_big5.c +++ b/sys/src/cmd/tcs/conv_big5.c @@ -82,7 +82,7 @@ big5proc(int c, Rune **r, long input_loc) } void -big5_in(int fd, long *notused, struct convert *out) +big5_in(int fd, long *, struct convert *out) { Rune ob[N]; Rune *r, *re; @@ -90,7 +90,6 @@ big5_in(int fd, long *notused, struct convert *out) int n, i; long nin; - USED(notused); r = ob; re = ob+N-3; nin = 0; @@ -114,14 +113,13 @@ big5_in(int fd, long *notused, struct convert *out) } void -big5_out(Rune *base, int n, long *notused) +big5_out(Rune *base, int n, long *) { char *p; int i; Rune r; static int first = 1; - USED(notused); if(first){ first = 0; for(i = 0; i < NRUNE; i++) diff --git a/sys/src/cmd/tcs/conv_gb.c b/sys/src/cmd/tcs/conv_gb.c index 626323ad9..81fcc0d45 100644 --- a/sys/src/cmd/tcs/conv_gb.c +++ b/sys/src/cmd/tcs/conv_gb.c @@ -60,7 +60,7 @@ gbproc(int c, Rune **r, long input_loc) } void -gb_in(int fd, long *notused, struct convert *out) +gb_in(int fd, long *, struct convert *out) { Rune ob[N]; Rune *r, *re; @@ -68,7 +68,6 @@ gb_in(int fd, long *notused, struct convert *out) int n, i; long nin; - USED(notused); r = ob; re = ob+N-3; nin = 0; @@ -92,14 +91,13 @@ gb_in(int fd, long *notused, struct convert *out) } void -gb_out(Rune *base, int n, long *notused) +gb_out(Rune *base, int n, long *) { char *p; int i; Rune r; static int first = 1; - USED(notused); if(first){ first = 0; for(i = 0; i < NRUNE; i++) diff --git a/sys/src/cmd/tcs/conv_gbk.c b/sys/src/cmd/tcs/conv_gbk.c index 9f870f126..6b2650723 100644 --- a/sys/src/cmd/tcs/conv_gbk.c +++ b/sys/src/cmd/tcs/conv_gbk.c @@ -51,7 +51,7 @@ gbkproc(int c, Rune **r, long input_loc) } void -gbk_in(int fd, long *notused, struct convert *out) +gbk_in(int fd, long *, struct convert *out) { Rune ob[N]; Rune *r, *re; @@ -59,7 +59,6 @@ gbk_in(int fd, long *notused, struct convert *out) int n, i; long nin; - USED(notused); r = ob; re = ob+N-3; nin = 0; @@ -84,14 +83,13 @@ gbk_in(int fd, long *notused, struct convert *out) void -gbk_out(Rune *base, int n, long *notused) +gbk_out(Rune *base, int n, long *) { char *p; int i; Rune r; static int first = 1; - USED(notused); if(first){ first = 0; for(i = 0; i < NRUNE; i++) diff --git a/sys/src/cmd/tcs/conv_jis.c b/sys/src/cmd/tcs/conv_jis.c index 61d0aa046..4b979a7c5 100644 --- a/sys/src/cmd/tcs/conv_jis.c +++ b/sys/src/cmd/tcs/conv_jis.c @@ -367,30 +367,26 @@ do_in(int fd, void (*procfn)(int, Rune **, long), struct convert *out) } void -jis_in(int fd, long *notused, struct convert *out) +jis_in(int fd, long *, struct convert *out) { - USED(notused); do_in(fd, alljis, out); } void -ujis_in(int fd, long *notused, struct convert *out) +ujis_in(int fd, long *, struct convert *out) { - USED(notused); do_in(fd, ujis, out); } void -msjis_in(int fd, long *notused, struct convert *out) +msjis_in(int fd, long *, struct convert *out) { - USED(notused); do_in(fd, ms, out); } void -jisjis_in(int fd, long *notused, struct convert *out) +jisjis_in(int fd, long *, struct convert *out) { - USED(notused); do_in(fd, jis, out); } @@ -417,14 +413,13 @@ tab_init(void) /* jis-kanji, or ISO 2022-JP */ void -jisjis_out(Rune *base, int n, long *notused) +jisjis_out(Rune *base, int n, long *) { char *p; int i; Rune r; static enum { ascii, japan646, jp2022 } state = ascii; - USED(notused); if(first) tab_init(); nrunes += n; @@ -462,13 +457,12 @@ jisjis_out(Rune *base, int n, long *notused) /* ms-kanji, or Shift-JIS */ void -msjis_out(Rune *base, int n, long *notused) +msjis_out(Rune *base, int n, long *) { char *p; int i, hi, lo; Rune r; - USED(notused); if(first) tab_init(); nrunes += n; @@ -501,13 +495,12 @@ msjis_out(Rune *base, int n, long *notused) /* ujis, or EUC */ void -ujis_out(Rune *base, int n, long *notused) +ujis_out(Rune *base, int n, long *) { char *p; int i; Rune r; - USED(notused); if(first) tab_init(); nrunes += n; diff --git a/sys/src/cmd/tcs/conv_ksc.c b/sys/src/cmd/tcs/conv_ksc.c index 74a40a591..76f352e30 100644 --- a/sys/src/cmd/tcs/conv_ksc.c +++ b/sys/src/cmd/tcs/conv_ksc.c @@ -81,7 +81,7 @@ ukscproc(int c, Rune **r, long input_loc) } void -uksc_in(int fd, long *notused, struct convert *out) +uksc_in(int fd, long *, struct convert *out) { Rune ob[N]; Rune *r, *re; @@ -89,7 +89,6 @@ uksc_in(int fd, long *notused, struct convert *out) int n, i; long nin; - USED(notused); r = ob; re = ob+N-3; nin = 0; @@ -113,7 +112,7 @@ uksc_in(int fd, long *notused, struct convert *out) } void -uksc_out(Rune *base, int n, long *notused) +uksc_out(Rune *base, int n, long *) { char *p; int i; @@ -121,7 +120,6 @@ uksc_out(Rune *base, int n, long *notused) long l; static int first = 1; - USED(notused); if(first){ first = 0; for(i = 0; i < NRUNE; i++) diff --git a/sys/src/cmd/tcs/hdr.h b/sys/src/cmd/tcs/hdr.h index b065f2a38..b5b29cff2 100644 --- a/sys/src/cmd/tcs/hdr.h +++ b/sys/src/cmd/tcs/hdr.h @@ -19,6 +19,7 @@ struct convert *conv(char *, int); typedef void (*Infn)(int, long *, struct convert *); typedef void (*Outfn)(Rune *, int, long *); void outtable(Rune *, int, long *); +int fixsurrogate(Rune *rp, Rune r2); void utf_in(int, long *, struct convert *); void utf_out(Rune *, int, long *); @@ -41,6 +42,5 @@ extern char obuf[UTFmax*N]; /* maximum bloat from N runes */ #define EXIT(n,s) exits(s) #else #define EPR fprintf(stderr, -#define USED(x) /* in plan 9, USED(x) tells the compiler to treat x as used */ #define EXIT(n,s) exit(n) #endif diff --git a/sys/src/cmd/tcs/html.c b/sys/src/cmd/tcs/html.c index 36c7cf43d..9caec778e 100644 --- a/sys/src/cmd/tcs/html.c +++ b/sys/src/cmd/tcs/html.c @@ -2141,24 +2141,22 @@ findbyrune(Rune r) } void -html_in(int fd, long *x, struct convert *out) +html_in(int fd, long *, struct convert *out) { char buf[100], *p; Biobuf b; - Rune rbuf[N]; - Rune *r, *er; + Rune *r, *er, r2; int c, s, i; - USED(x); - html_init(); - r = rbuf; - er = rbuf+N; + r = runes; + er = runes+N; + r2 = 0; Binit(&b, fd, OREAD); while((c = Bgetrune(&b)) != Beof){ if(r >= er){ - OUT(out, rbuf, r-rbuf); - r = rbuf; + OUT(out, runes, r-runes); + r = runes; } if(c == '&'){ s = 0; @@ -2185,7 +2183,7 @@ html_in(int fd, long *x, struct convert *out) c = strtol(buf+3, &p, 16); else c = strtol(buf+2, &p, 10); - if(*p || c >= NRUNE || c < 0) + if(*p || c < 0) goto bad; goto out; } @@ -2196,10 +2194,11 @@ html_in(int fd, long *x, struct convert *out) for(p=buf; p<buf+i; ){ p += chartorune(r++, p); if(r >= er){ - OUT(out, rbuf, r-rbuf); - r = rbuf; + OUT(out, runes, r-runes); + r = runes; } } + r2 = 0; continue; out: if((c & 0x7f) == c && strchr("<>&\"'", c)){ @@ -2207,25 +2206,30 @@ html_in(int fd, long *x, struct convert *out) i = sprint(buf, "&%s", findbyrune(c)); goto bad; } + } + *r = c; + if(fixsurrogate(r, r2)){ + r2 = *r; + continue; } - *r++ = c; + r2 = 0; + r++; } - if(r > rbuf) - OUT(out, rbuf, r-rbuf); - OUT(out, rbuf, 0); + if(r > runes) + OUT(out, runes, r-runes); + OUT(out, runes, 0); } /* * use biobuf because can use more than UTFmax bytes per rune */ void -html_out(Rune *r, int n, long *x) +html_out(Rune *r, int n, long *) { char *s; Biobuf b; Rune *er; - USED(x); html_init(); Binit(&b, 1, OWRITE); er = r+n; diff --git a/sys/src/cmd/tcs/tcs.c b/sys/src/cmd/tcs/tcs.c index 2a8b5b353..7732a03b5 100644 --- a/sys/src/cmd/tcs/tcs.c +++ b/sys/src/cmd/tcs/tcs.c @@ -73,7 +73,6 @@ main(int argc, char **argv) break; } ARGEND - USED(argc); if(verbose) squawk = 1; if(listem){ @@ -214,49 +213,63 @@ Again: } void -unicode_in_be(int fd, long *notused, struct convert *out) +unicode_in_be(int fd, long *, struct convert *out) { - int i, n; - Rune buf[N], r; - uchar *p; + uchar buf[2*N], *p, *e; + Rune *r, r2; + int n; - USED(notused); - while((n = cread(fd, (char *)buf, 2*N, 2)) > 0){ - /* go backwards as sizeof(Rune) >= 2 */ - p = (uchar*)buf + n; + r2 = 0; + while((n = cread(fd, (char*)buf, 2*N, 2)) > 0){ ninput += n; - n /= 2; - for(i=n-1; i>=0; i--){ - r = *(--p); - r |= *(--p) << 8; - buf[i] = r; + p = buf; + e = buf + n; + r = runes; + while(p < e){ + *r = *p++ << 8; + *r |= *p++; + if(fixsurrogate(r, r2)){ + r2 = *r; + continue; + } + r2 = 0; + r++; + } + if(r > runes){ + OUT(out, runes, r-runes); } - OUT(out, buf, n); } - OUT(out, buf, 0); + OUT(out, runes, 0); } void -unicode_in_le(int fd, long *notused, struct convert *out) +unicode_in_le(int fd, long *, struct convert *out) { - int i, n; - Rune buf[N], r; - uchar *p; + uchar buf[2*N], *p, *e; + Rune *r, r2; + int n; - USED(notused); - while((n = cread(fd, (char *)buf, 2*N, 2)) > 0){ - /* go backwards as sizeof(Rune) >= 2 */ - p = (uchar*)buf + n; + r2 = 0; + while((n = cread(fd, (char*)buf, 2*N, 2)) > 0){ ninput += n; - n /= 2; - for(i=n-1; i>=0; i--){ - r = *(--p) << 8; - r |= *(--p); - buf[i] = r; + p = buf; + e = buf + n; + r = runes; + while(p < e){ + *r = *p++; + *r |= *p++ << 8; + if(fixsurrogate(r, r2)){ + r2 = *r; + continue; + } + r2 = 0; + r++; + } + if(r > runes){ + OUT(out, runes, r-runes); } - OUT(out, buf, n); } - OUT(out, buf, 0); + OUT(out, runes, 0); } void @@ -284,41 +297,57 @@ unicode_in(int fd, long *notused, struct convert *out) } void -unicode_out_be(Rune *base, int n, long *notused) +unicode_out_be(Rune *base, int n, long *) { int i; uchar *p; - Rune r; + unsigned long r; - USED(notused); p = (uchar*)base; for(i=0; i<n; i++){ r = base[i]; - *p++ = r>>8; - *p++ = r; + if(r > 0xFFFF){ + r -= 0x10000; + *p++ = ((r>>18)&3) + 0xD8; + *p++ = r>>10; + *p++ = ((r>>8)&3) + 0xDC; + *p++ = r; + } else { + *p++ = r>>8; + *p++ = r; + } } nrunes += n; - noutput += 2*n; - write(1, (char *)base, 2*n); + n = p - (uchar*)base; + noutput += n; + write(1, (char *)base, n); } void -unicode_out_le(Rune *base, int n, long *notused) +unicode_out_le(Rune *base, int n, long *) { int i; uchar *p; - Rune r; + unsigned long r; - USED(notused); p = (uchar*)base; for(i=0; i<n; i++){ r = base[i]; - *p++ = r; - *p++ = r>>8; + if(r > 0xFFFF){ + r -= 0x10000; + *p++ = r>>10; + *p++ = ((r>>18)&3) + 0xD8; + *p++ = r; + *p++ = ((r>>8)&3) + 0xDC; + } else { + *p++ = r; + *p++ = r>>8; + } } nrunes += n; - noutput += 2*n; - write(1, (char *)base, 2*n); + n = p - (uchar*)base; + noutput += n; + write(1, (char *)base, n); } void @@ -403,6 +432,29 @@ outtable(Rune *base, int n, long *map) write(1, obuf, p-obuf); } +int +fixsurrogate(Rune *rp, Rune r2) +{ + Rune r1; + + r1 = *rp; + if(r1 >= 0xD800 && r1 <= 0xDBFF){ + if(r2 >= 0xDC00 && r2 <= 0xDFFF){ + *rp = 0x10000 + (((r1 - 0xD800)<<10) | (r2 - 0xDC00)); + return 0; + } + return 1; + } else + if(r1 >= 0xDC00 && r1 <= 0xDFFF){ + if(r2 >= 0xD800 && r2 <= 0xDBFF){ + *rp = 0x10000 + (((r2 - 0xD800)<<10) | (r1 - 0xDC00)); + return 0; + } + return 1; + } + return 0; +} + long tabascii[256] = { 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, diff --git a/sys/src/cmd/tcs/tune.c b/sys/src/cmd/tcs/tune.c index 1046b7796..22cd1c8d0 100644 --- a/sys/src/cmd/tcs/tune.c +++ b/sys/src/cmd/tcs/tune.c @@ -106,22 +106,20 @@ findindex(Rune *rstr, int size, Rune r) } void -tune_in(int fd, long *x, struct convert *out) +tune_in(int fd, long *, struct convert *out) { Biobuf b; - Rune rbuf[N]; Rune *r, *er, tr; int c, i; - USED(x); - r = rbuf; - er = rbuf+N-3; + r = runes; + er = runes+N-3; Binit(&b, fd, OREAD); while((c = Bgetrune(&b)) != Beof){ ninput += b.runesize; if(r >= er){ - OUT(out, rbuf, r-rbuf); - r = rbuf; + OUT(out, runes, r-runes); + r = runes; } if(c>=0xe210/**/ && c <= 0xe38c/**/ && (i = c%16) < nelem(t2)){ if(c >= 0xe380/**/){ @@ -172,13 +170,13 @@ tune_in(int fd, long *x, struct convert *out) break; } } - if(r > rbuf) - OUT(out, rbuf, r-rbuf); - OUT(out, rbuf, 0); + if(r > runes) + OUT(out, runes, r-runes); + OUT(out, runes, 0); } void -tune_out(Rune *r, int n, long *x) +tune_out(Rune *r, int n, long *) { static int state = 0; static Rune lastr; @@ -186,7 +184,6 @@ tune_out(Rune *r, int n, long *x) char *p; int i; - USED(x); nrunes += n; er = r+n; for(p = obuf; r < er; r++){ diff --git a/sys/src/cmd/tcs/utf.c b/sys/src/cmd/tcs/utf.c index 764ef9f7b..381c5facc 100644 --- a/sys/src/cmd/tcs/utf.c +++ b/sys/src/cmd/tcs/utf.c @@ -27,13 +27,12 @@ int fullisorune(char *str, int n); int isochartorune(Rune *rune, char *str); void -utf_in(int fd, long *notused, struct convert *out) +utf_in(int fd, long *, struct convert *out) { char buf[N]; int i, j, c, n, tot; - ulong l; + unsigned long l; - USED(notused); tot = 0; while((n = read(fd, buf+tot, N-tot)) >= 0){ tot += n; @@ -65,12 +64,11 @@ utf_in(int fd, long *notused, struct convert *out) } void -utf_out(Rune *base, int n, long *notused) +utf_out(Rune *base, int n, long *) { char *p; Rune *r; - USED(notused); nrunes += n; for(r = base, p = obuf; n-- > 0; r++){ p += our_wctomb(p, *r); @@ -80,12 +78,11 @@ utf_out(Rune *base, int n, long *notused) } void -isoutf_in(int fd, long *notused, struct convert *out) +isoutf_in(int fd, long *, struct convert *out) { char buf[N]; int i, j, c, n, tot; - USED(notused); tot = 0; while((n = read(fd, buf+tot, N-tot)) >= 0){ tot += n; @@ -117,12 +114,11 @@ isoutf_in(int fd, long *notused, struct convert *out) } void -isoutf_out(Rune *base, int n, long *notused) +isoutf_out(Rune *base, int n, long *) { char *p; Rune *r; - USED(notused); nrunes += n; for(r = base, p = obuf; n-- > 0; r++) p += runetoisoutf(p, r); |