summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: cinap_lenrek <cinap_lenrek@felloff.net> 2014-05-11 00:54:59 +0200
committer: cinap_lenrek <cinap_lenrek@felloff.net> 2014-05-11 00:54:59 +0200
commit: edca217bb99f7c32413c117239d12acdc223e811 (patch)
tree: c77aa8a8494ffe8784bf3b4e264579a50c2c4233
parent: 7388792a124756a528666cb5c375ee919db9ca11 (diff)
download: plan9front-edca217bb99f7c32413c117239d12acdc223e811.tar.xz
tcs: handle surrogate pairs
-rw-r--r-- sys/src/cmd/tcs/conv_big5.c 6
-rw-r--r-- sys/src/cmd/tcs/conv_gb.c 6
-rw-r--r-- sys/src/cmd/tcs/conv_gbk.c 6
-rw-r--r-- sys/src/cmd/tcs/conv_jis.c 21
-rw-r--r-- sys/src/cmd/tcs/conv_ksc.c 6
-rw-r--r-- sys/src/cmd/tcs/hdr.h 2
-rw-r--r-- sys/src/cmd/tcs/html.c 40
-rw-r--r-- sys/src/cmd/tcs/tcs.c 142
-rw-r--r-- sys/src/cmd/tcs/tune.c 21
-rw-r--r-- sys/src/cmd/tcs/utf.c 14
10 files changed, 149 insertions, 115 deletions
diff --git a/sys/src/cmd/tcs/conv_big5.c b/sys/src/cmd/tcs/conv_big5.c
index 87ecc88ae..8e6b60858 100644
--- a/sys/src/cmd/tcs/conv_big5.c
+++ b/sys/src/cmd/tcs/conv_big5.c
@@ -82,7 +82,7 @@ big5proc(int c, Rune **r, long input_loc)
}
void
-big5_in(int fd, long *notused, struct convert *out)
+big5_in(int fd, long *, struct convert *out)
{
Rune ob[N];
Rune *r, *re;
@@ -90,7 +90,6 @@ big5_in(int fd, long *notused, struct convert *out)
int n, i;
long nin;
- USED(notused);
r = ob;
re = ob+N-3;
nin = 0;
@@ -114,14 +113,13 @@ big5_in(int fd, long *notused, struct convert *out)
}
void
-big5_out(Rune *base, int n, long *notused)
+big5_out(Rune *base, int n, long *)
{
char *p;
int i;
Rune r;
static int first = 1;
- USED(notused);
if(first){
first = 0;
for(i = 0; i < NRUNE; i++)
diff --git a/sys/src/cmd/tcs/conv_gb.c b/sys/src/cmd/tcs/conv_gb.c
index 626323ad9..81fcc0d45 100644
--- a/sys/src/cmd/tcs/conv_gb.c
+++ b/sys/src/cmd/tcs/conv_gb.c
@@ -60,7 +60,7 @@ gbproc(int c, Rune **r, long input_loc)
}
void
-gb_in(int fd, long *notused, struct convert *out)
+gb_in(int fd, long *, struct convert *out)
{
Rune ob[N];
Rune *r, *re;
@@ -68,7 +68,6 @@ gb_in(int fd, long *notused, struct convert *out)
int n, i;
long nin;
- USED(notused);
r = ob;
re = ob+N-3;
nin = 0;
@@ -92,14 +91,13 @@ gb_in(int fd, long *notused, struct convert *out)
}
void
-gb_out(Rune *base, int n, long *notused)
+gb_out(Rune *base, int n, long *)
{
char *p;
int i;
Rune r;
static int first = 1;
- USED(notused);
if(first){
first = 0;
for(i = 0; i < NRUNE; i++)
diff --git a/sys/src/cmd/tcs/conv_gbk.c b/sys/src/cmd/tcs/conv_gbk.c
index 9f870f126..6b2650723 100644
--- a/sys/src/cmd/tcs/conv_gbk.c
+++ b/sys/src/cmd/tcs/conv_gbk.c
@@ -51,7 +51,7 @@ gbkproc(int c, Rune **r, long input_loc)
}
void
-gbk_in(int fd, long *notused, struct convert *out)
+gbk_in(int fd, long *, struct convert *out)
{
Rune ob[N];
Rune *r, *re;
@@ -59,7 +59,6 @@ gbk_in(int fd, long *notused, struct convert *out)
int n, i;
long nin;
- USED(notused);
r = ob;
re = ob+N-3;
nin = 0;
@@ -84,14 +83,13 @@ gbk_in(int fd, long *notused, struct convert *out)
void
-gbk_out(Rune *base, int n, long *notused)
+gbk_out(Rune *base, int n, long *)
{
char *p;
int i;
Rune r;
static int first = 1;
- USED(notused);
if(first){
first = 0;
for(i = 0; i < NRUNE; i++)
diff --git a/sys/src/cmd/tcs/conv_jis.c b/sys/src/cmd/tcs/conv_jis.c
index 61d0aa046..4b979a7c5 100644
--- a/sys/src/cmd/tcs/conv_jis.c
+++ b/sys/src/cmd/tcs/conv_jis.c
@@ -367,30 +367,26 @@ do_in(int fd, void (*procfn)(int, Rune **, long), struct convert *out)
}
void
-jis_in(int fd, long *notused, struct convert *out)
+jis_in(int fd, long *, struct convert *out)
{
- USED(notused);
do_in(fd, alljis, out);
}
void
-ujis_in(int fd, long *notused, struct convert *out)
+ujis_in(int fd, long *, struct convert *out)
{
- USED(notused);
do_in(fd, ujis, out);
}
void
-msjis_in(int fd, long *notused, struct convert *out)
+msjis_in(int fd, long *, struct convert *out)
{
- USED(notused);
do_in(fd, ms, out);
}
void
-jisjis_in(int fd, long *notused, struct convert *out)
+jisjis_in(int fd, long *, struct convert *out)
{
- USED(notused);
do_in(fd, jis, out);
}
@@ -417,14 +413,13 @@ tab_init(void)
/* jis-kanji, or ISO 2022-JP */
void
-jisjis_out(Rune *base, int n, long *notused)
+jisjis_out(Rune *base, int n, long *)
{
char *p;
int i;
Rune r;
static enum { ascii, japan646, jp2022 } state = ascii;
- USED(notused);
if(first)
tab_init();
nrunes += n;
@@ -462,13 +457,12 @@ jisjis_out(Rune *base, int n, long *notused)
/* ms-kanji, or Shift-JIS */
void
-msjis_out(Rune *base, int n, long *notused)
+msjis_out(Rune *base, int n, long *)
{
char *p;
int i, hi, lo;
Rune r;
- USED(notused);
if(first)
tab_init();
nrunes += n;
@@ -501,13 +495,12 @@ msjis_out(Rune *base, int n, long *notused)
/* ujis, or EUC */
void
-ujis_out(Rune *base, int n, long *notused)
+ujis_out(Rune *base, int n, long *)
{
char *p;
int i;
Rune r;
- USED(notused);
if(first)
tab_init();
nrunes += n;
diff --git a/sys/src/cmd/tcs/conv_ksc.c b/sys/src/cmd/tcs/conv_ksc.c
index 74a40a591..76f352e30 100644
--- a/sys/src/cmd/tcs/conv_ksc.c
+++ b/sys/src/cmd/tcs/conv_ksc.c
@@ -81,7 +81,7 @@ ukscproc(int c, Rune **r, long input_loc)
}
void
-uksc_in(int fd, long *notused, struct convert *out)
+uksc_in(int fd, long *, struct convert *out)
{
Rune ob[N];
Rune *r, *re;
@@ -89,7 +89,6 @@ uksc_in(int fd, long *notused, struct convert *out)
int n, i;
long nin;
- USED(notused);
r = ob;
re = ob+N-3;
nin = 0;
@@ -113,7 +112,7 @@ uksc_in(int fd, long *notused, struct convert *out)
}
void
-uksc_out(Rune *base, int n, long *notused)
+uksc_out(Rune *base, int n, long *)
{
char *p;
int i;
@@ -121,7 +120,6 @@ uksc_out(Rune *base, int n, long *notused)
long l;
static int first = 1;
- USED(notused);
if(first){
first = 0;
for(i = 0; i < NRUNE; i++)
diff --git a/sys/src/cmd/tcs/hdr.h b/sys/src/cmd/tcs/hdr.h
index b065f2a38..b5b29cff2 100644
--- a/sys/src/cmd/tcs/hdr.h
+++ b/sys/src/cmd/tcs/hdr.h
@@ -19,6 +19,7 @@ struct convert *conv(char *, int);
typedef void (*Infn)(int, long *, struct convert *);
typedef void (*Outfn)(Rune *, int, long *);
void outtable(Rune *, int, long *);
+int fixsurrogate(Rune *rp, Rune r2);
void utf_in(int, long *, struct convert *);
void utf_out(Rune *, int, long *);
@@ -41,6 +42,5 @@ extern char obuf[UTFmax*N]; /* maximum bloat from N runes */
#define EXIT(n,s) exits(s)
#else
#define EPR fprintf(stderr,
-#define USED(x) /* in plan 9, USED(x) tells the compiler to treat x as used */
#define EXIT(n,s) exit(n)
#endif
diff --git a/sys/src/cmd/tcs/html.c b/sys/src/cmd/tcs/html.c
index 36c7cf43d..9caec778e 100644
--- a/sys/src/cmd/tcs/html.c
+++ b/sys/src/cmd/tcs/html.c
@@ -2141,24 +2141,22 @@ findbyrune(Rune r)
}
void
-html_in(int fd, long *x, struct convert *out)
+html_in(int fd, long *, struct convert *out)
{
char buf[100], *p;
Biobuf b;
- Rune rbuf[N];
- Rune *r, *er;
+ Rune *r, *er, r2;
int c, s, i;
- USED(x);
-
html_init();
- r = rbuf;
- er = rbuf+N;
+ r = runes;
+ er = runes+N;
+ r2 = 0;
Binit(&b, fd, OREAD);
while((c = Bgetrune(&b)) != Beof){
if(r >= er){
- OUT(out, rbuf, r-rbuf);
- r = rbuf;
+ OUT(out, runes, r-runes);
+ r = runes;
}
if(c == '&'){
s = 0;
@@ -2185,7 +2183,7 @@ html_in(int fd, long *x, struct convert *out)
c = strtol(buf+3, &p, 16);
else
c = strtol(buf+2, &p, 10);
- if(*p || c >= NRUNE || c < 0)
+ if(*p || c < 0)
goto bad;
goto out;
}
@@ -2196,10 +2194,11 @@ html_in(int fd, long *x, struct convert *out)
for(p=buf; p<buf+i; ){
p += chartorune(r++, p);
if(r >= er){
- OUT(out, rbuf, r-rbuf);
- r = rbuf;
+ OUT(out, runes, r-runes);
+ r = runes;
}
}
+ r2 = 0;
continue;
out:
if((c & 0x7f) == c && strchr("<>&\"'", c)){
@@ -2207,25 +2206,30 @@ html_in(int fd, long *x, struct convert *out)
i = sprint(buf, "&%s", findbyrune(c));
goto bad;
}
+ }
+ *r = c;
+ if(fixsurrogate(r, r2)){
+ r2 = *r;
+ continue;
}
- *r++ = c;
+ r2 = 0;
+ r++;
}
- if(r > rbuf)
- OUT(out, rbuf, r-rbuf);
- OUT(out, rbuf, 0);
+ if(r > runes)
+ OUT(out, runes, r-runes);
+ OUT(out, runes, 0);
}
/*
* use biobuf because can use more than UTFmax bytes per rune
*/
void
-html_out(Rune *r, int n, long *x)
+html_out(Rune *r, int n, long *)
{
char *s;
Biobuf b;
Rune *er;
- USED(x);
html_init();
Binit(&b, 1, OWRITE);
er = r+n;
diff --git a/sys/src/cmd/tcs/tcs.c b/sys/src/cmd/tcs/tcs.c
index 2a8b5b353..7732a03b5 100644
--- a/sys/src/cmd/tcs/tcs.c
+++ b/sys/src/cmd/tcs/tcs.c
@@ -73,7 +73,6 @@ main(int argc, char **argv)
break;
} ARGEND
- USED(argc);
if(verbose)
squawk = 1;
if(listem){
@@ -214,49 +213,63 @@ Again:
}
void
-unicode_in_be(int fd, long *notused, struct convert *out)
+unicode_in_be(int fd, long *, struct convert *out)
{
- int i, n;
- Rune buf[N], r;
- uchar *p;
+ uchar buf[2*N], *p, *e;
+ Rune *r, r2;
+ int n;
- USED(notused);
- while((n = cread(fd, (char *)buf, 2*N, 2)) > 0){
- /* go backwards as sizeof(Rune) >= 2 */
- p = (uchar*)buf + n;
+ r2 = 0;
+ while((n = cread(fd, (char*)buf, 2*N, 2)) > 0){
ninput += n;
- n /= 2;
- for(i=n-1; i>=0; i--){
- r = *(--p);
- r |= *(--p) << 8;
- buf[i] = r;
+ p = buf;
+ e = buf + n;
+ r = runes;
+ while(p < e){
+ *r = *p++ << 8;
+ *r |= *p++;
+ if(fixsurrogate(r, r2)){
+ r2 = *r;
+ continue;
+ }
+ r2 = 0;
+ r++;
+ }
+ if(r > runes){
+ OUT(out, runes, r-runes);
}
- OUT(out, buf, n);
}
- OUT(out, buf, 0);
+ OUT(out, runes, 0);
}
void
-unicode_in_le(int fd, long *notused, struct convert *out)
+unicode_in_le(int fd, long *, struct convert *out)
{
- int i, n;
- Rune buf[N], r;
- uchar *p;
+ uchar buf[2*N], *p, *e;
+ Rune *r, r2;
+ int n;
- USED(notused);
- while((n = cread(fd, (char *)buf, 2*N, 2)) > 0){
- /* go backwards as sizeof(Rune) >= 2 */
- p = (uchar*)buf + n;
+ r2 = 0;
+ while((n = cread(fd, (char*)buf, 2*N, 2)) > 0){
ninput += n;
- n /= 2;
- for(i=n-1; i>=0; i--){
- r = *(--p) << 8;
- r |= *(--p);
- buf[i] = r;
+ p = buf;
+ e = buf + n;
+ r = runes;
+ while(p < e){
+ *r = *p++;
+ *r |= *p++ << 8;
+ if(fixsurrogate(r, r2)){
+ r2 = *r;
+ continue;
+ }
+ r2 = 0;
+ r++;
+ }
+ if(r > runes){
+ OUT(out, runes, r-runes);
}
- OUT(out, buf, n);
}
- OUT(out, buf, 0);
+ OUT(out, runes, 0);
}
void
@@ -284,41 +297,57 @@ unicode_in(int fd, long *notused, struct convert *out)
}
void
-unicode_out_be(Rune *base, int n, long *notused)
+unicode_out_be(Rune *base, int n, long *)
{
int i;
uchar *p;
- Rune r;
+ unsigned long r;
- USED(notused);
p = (uchar*)base;
for(i=0; i<n; i++){
r = base[i];
- *p++ = r>>8;
- *p++ = r;
+ if(r > 0xFFFF){
+ r -= 0x10000;
+ *p++ = ((r>>18)&3) + 0xD8;
+ *p++ = r>>10;
+ *p++ = ((r>>8)&3) + 0xDC;
+ *p++ = r;
+ } else {
+ *p++ = r>>8;
+ *p++ = r;
+ }
}
nrunes += n;
- noutput += 2*n;
- write(1, (char *)base, 2*n);
+ n = p - (uchar*)base;
+ noutput += n;
+ write(1, (char *)base, n);
}
void
-unicode_out_le(Rune *base, int n, long *notused)
+unicode_out_le(Rune *base, int n, long *)
{
int i;
uchar *p;
- Rune r;
+ unsigned long r;
- USED(notused);
p = (uchar*)base;
for(i=0; i<n; i++){
r = base[i];
- *p++ = r;
- *p++ = r>>8;
+ if(r > 0xFFFF){
+ r -= 0x10000;
+ *p++ = r>>10;
+ *p++ = ((r>>18)&3) + 0xD8;
+ *p++ = r;
+ *p++ = ((r>>8)&3) + 0xDC;
+ } else {
+ *p++ = r;
+ *p++ = r>>8;
+ }
}
nrunes += n;
- noutput += 2*n;
- write(1, (char *)base, 2*n);
+ n = p - (uchar*)base;
+ noutput += n;
+ write(1, (char *)base, n);
}
void
@@ -403,6 +432,29 @@ outtable(Rune *base, int n, long *map)
write(1, obuf, p-obuf);
}
+int
+fixsurrogate(Rune *rp, Rune r2)
+{
+ Rune r1;
+
+ r1 = *rp;
+ if(r1 >= 0xD800 && r1 <= 0xDBFF){
+ if(r2 >= 0xDC00 && r2 <= 0xDFFF){
+ *rp = 0x10000 + (((r1 - 0xD800)<<10) | (r2 - 0xDC00));
+ return 0;
+ }
+ return 1;
+ } else
+ if(r1 >= 0xDC00 && r1 <= 0xDFFF){
+ if(r2 >= 0xD800 && r2 <= 0xDBFF){
+ *rp = 0x10000 + (((r2 - 0xD800)<<10) | (r1 - 0xDC00));
+ return 0;
+ }
+ return 1;
+ }
+ return 0;
+}
+
long tabascii[256] =
{
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
diff --git a/sys/src/cmd/tcs/tune.c b/sys/src/cmd/tcs/tune.c
index 1046b7796..22cd1c8d0 100644
--- a/sys/src/cmd/tcs/tune.c
+++ b/sys/src/cmd/tcs/tune.c
@@ -106,22 +106,20 @@ findindex(Rune *rstr, int size, Rune r)
}
void
-tune_in(int fd, long *x, struct convert *out)
+tune_in(int fd, long *, struct convert *out)
{
Biobuf b;
- Rune rbuf[N];
Rune *r, *er, tr;
int c, i;
- USED(x);
- r = rbuf;
- er = rbuf+N-3;
+ r = runes;
+ er = runes+N-3;
Binit(&b, fd, OREAD);
while((c = Bgetrune(&b)) != Beof){
ninput += b.runesize;
if(r >= er){
- OUT(out, rbuf, r-rbuf);
- r = rbuf;
+ OUT(out, runes, r-runes);
+ r = runes;
}
if(c>=0xe210/**/ && c <= 0xe38c/**/ && (i = c%16) < nelem(t2)){
if(c >= 0xe380/**/){
@@ -172,13 +170,13 @@ tune_in(int fd, long *x, struct convert *out)
break;
}
}
- if(r > rbuf)
- OUT(out, rbuf, r-rbuf);
- OUT(out, rbuf, 0);
+ if(r > runes)
+ OUT(out, runes, r-runes);
+ OUT(out, runes, 0);
}
void
-tune_out(Rune *r, int n, long *x)
+tune_out(Rune *r, int n, long *)
{
static int state = 0;
static Rune lastr;
@@ -186,7 +184,6 @@ tune_out(Rune *r, int n, long *x)
char *p;
int i;
- USED(x);
nrunes += n;
er = r+n;
for(p = obuf; r < er; r++){
diff --git a/sys/src/cmd/tcs/utf.c b/sys/src/cmd/tcs/utf.c
index 764ef9f7b..381c5facc 100644
--- a/sys/src/cmd/tcs/utf.c
+++ b/sys/src/cmd/tcs/utf.c
@@ -27,13 +27,12 @@ int fullisorune(char *str, int n);
int isochartorune(Rune *rune, char *str);
void
-utf_in(int fd, long *notused, struct convert *out)
+utf_in(int fd, long *, struct convert *out)
{
char buf[N];
int i, j, c, n, tot;
- ulong l;
+ unsigned long l;
- USED(notused);
tot = 0;
while((n = read(fd, buf+tot, N-tot)) >= 0){
tot += n;
@@ -65,12 +64,11 @@ utf_in(int fd, long *notused, struct convert *out)
}
void
-utf_out(Rune *base, int n, long *notused)
+utf_out(Rune *base, int n, long *)
{
char *p;
Rune *r;
- USED(notused);
nrunes += n;
for(r = base, p = obuf; n-- > 0; r++){
p += our_wctomb(p, *r);
@@ -80,12 +78,11 @@ utf_out(Rune *base, int n, long *notused)
}
void
-isoutf_in(int fd, long *notused, struct convert *out)
+isoutf_in(int fd, long *, struct convert *out)
{
char buf[N];
int i, j, c, n, tot;
- USED(notused);
tot = 0;
while((n = read(fd, buf+tot, N-tot)) >= 0){
tot += n;
@@ -117,12 +114,11 @@ isoutf_in(int fd, long *notused, struct convert *out)
}
void
-isoutf_out(Rune *base, int n, long *notused)
+isoutf_out(Rune *base, int n, long *)
{
char *p;
Rune *r;
- USED(notused);
nrunes += n;
for(r = base, p = obuf; n-- > 0; r++)
p += runetoisoutf(p, r);