Hello, I have fixed a rather serious bug in wily, but first some small ones. (patches agains wily-0.13.41-9libs) If you enter a rune with hex value 0000 (zero) with M-X, 9libs barfs on stderr and wily starts acting goofy. I thought a while about what to do about this, and I think it should be regarded as a deliberate action from the user. Wily can't have rune zero in the buffer, but it can represent an error rune, so I made it insert a such. This has the advantage that you can enter an error rune in the tag to search for it in a large text with B3. I did this a lot when fixing the bib bug coming later. diff -rN -u old-diff/wily-9libs/wily/keyboard.c new-diff/wily-9libs/wily/keyboard.c --- old-diff/wily-9libs/wily/keyboard.c Fri Mar 4 02:07:39 2005 +++ new-diff/wily-9libs/wily/keyboard.c Fri Mar 4 01:56:42 2005 @@ -16,6 +16,8 @@ void dokeyboard(View *v, Rune r) { switch(r) { + case 0: addrune(v,Runeerror); break; + case DownArrow: case UpArrow: case Home: Here's a rendering bug. Put the cursor towards the end of a long text. Do 'Get short-file' in the tag. Wily will reuse the view, update and refresh it. However, it will forget to update visible.p0, and thus be lost in the new file, with various interesting results. diff -rN -u old-diff/wily-9libs/wily/view.c new-diff/wily-9libs/wily/view.c --- old-diff/wily-9libs/wily/view.c Fri Mar 4 02:07:39 2005 +++ new-diff/wily-9libs/wily/view.c Fri Mar 4 01:49:22 2005 @@ -369,6 +369,7 @@ view_refresh(View*v) { if(ISVISIBLE(v)) { + v->visible.p0 = 0; v->sel = range(v->visible.p0, v->visible.p0); frdelete(&v->f, 0, v->f.nchars); fill(v); And here's the big one. Large texts with multi-byte utfs are trashed. This applies to both reading files and piping data. Both file reading and data pipeing is chopped up in buffersize portions, and if a multibyte utf sequence crosses a buffer border, it will be interpreted as one or more illegal utf sequences. First some new functions. In utftotext I also set utfHadNulls to true if the text (really) contains illegal utf, since this will (as well as null chars) make it impossible to save the file as it was. Another bug was that utftotext_unconverted is not set to zero when there where no unconverted utfs, which did confuse functions that called utftotext multiple times in sequence. diff -rN -u old-diff/wily-9libs/wily/proto.h new-diff/wily-9libs/wily/proto.h --- old-diff/wily-9libs/wily/proto.h Fri Mar 4 02:07:39 2005 +++ new-diff/wily-9libs/wily/proto.h Fri Mar 4 01:35:31 2005 @@ -228,6 +228,8 @@ int diag (char*,char*, ...); RETSIGTYPE cleanup_and_abort (int); RETSIGTYPE cleanup_and_die (int); +int fillutfchar (char*,int,char*,int); +int unfullutfbytes (char*,int); Rstring utf2rstring (char*utf); /* vgeom.c */ diff -rN -u old-diff/wily-9libs/wily/util.c new-diff/wily-9libs/wily/util.c --- old-diff/wily-9libs/wily/util.c Fri Mar 4 02:07:39 2005 +++ new-diff/wily-9libs/wily/util.c Fri Mar 4 01:49:22 2005 @@ -75,6 +75,42 @@ } /* +** Append bytes from `s' (of length `ns') to `r' (of length `nr') until +** `r' has a full rune. Stop if `s' ends or an illegal utf sequence +** is detected. Return the number of bytes appended. +** sizeof `r' must be at least UTFmax+1 bytes. +*/ +int +fillutfchar(char *r, int nr, char *s, int ns) { + int c = 0; + + r += nr; + while ((nr < UTFmax) && (ns > 0) && ((((uchar)*s)&0xC0) == 0x80)) + *r++=*s++, nr++, ns--, c++; + *r = '\0'; + return c; +} + +/* +** If utf string `s' of length `n' ends with an uncomplete rune, return +** the number of bytes in that rune. (If `s' ends with an illegal utf +** sequence, return zero.) + */ +int +unfullutfbytes(char *s, int n) { + int e = 1; + + for (e = 1; (e < n) && (e < UTFmax); ++e) { + if (((uchar)s[n-e]&0xC0) != 0x80) { + if (!fullrune(s+n-e, e)) + return e; + break; + } + } + return 0; +} + +/* * Return Rstring for utf. Either s.r0 == s.r1 == 0, * or s.r0 will need to be free */ @@ -435,8 +471,22 @@ /* Convert UTF from s1 to s2 into runes, store them at r * Returns the number of runes stored. - * Sets utfHadNulls to indicate that some of the UTF contained nulls, - * which are _not_ converted into null Runes. + * Sets utfHadNulls to indicate that some of the UTF contained nulls or bad UTF. + * Bad UTF and nulls will be replaced with rune Runeerror (u0080). + * Set utftotext_unconverted to the number of bytes before the end that + * belongs to a char extending beyond the end and wasn't converted. + * + * This function will look beyond the end s2 if the last UTF combination + * is not completely within the range. If converting an entire range + * at once, it should have an extra '\0' at s2 (right after the end) + * so that an incomplete (bad) multibyte UTF combination at the end + * will be correctly detected. It converting a subrange of a larger UTF + * sequence, make sure the end s2 is after a complete UTF combination, + * OR leave enough bytes beyond the end to complete any multibyte UTF char + * (UTFmax) and examine utftotext_unconverted after the call returns. + * + * Note: This function will be called multiple times during a big read, + * so utfHadNulls must be preset to false by the caller as appropriate. */ ulong utftotext(Rune *r, char *s1, char *s2) @@ -456,6 +506,8 @@ *q = *v++; } else { v += chartorune(q, v); + if (*q == Runeerror) + utfHadNulls = true; } assert(*q); q++; @@ -465,6 +517,8 @@ q--; length = runelen(*q); utftotext_unconverted = s2 - (v-length); + } else { + utftotext_unconverted = 0; } return q-r; } diff -rN -u old-diff/wily-9libs/wily/file.c new-diff/wily-9libs/wily/file.c --- old-diff/wily-9libs/wily/file.c Fri Mar 4 02:07:39 2005 +++ new-diff/wily-9libs/wily/file.c Fri Mar 4 01:51:03 2005 @@ -179,7 +179,7 @@ text_read(d->t, fd, buf->st_size); close(fd); if (utfHadNulls) { - diag(path, "removed nulls from [%s]", path); + errno=0, diag(path, "nulls or illegal utf in [%s]", path); data_setbackup(d,0); } File reads is the simple one. I have also fixed an overallocation of memory that got bundled in this diff. The size for a new buffer was multiplied with Runesize twice. diff -rN -u old-diff/wily-9libs/wily/text.c new-diff/wily-9libs/wily/text.c --- old-diff/wily-9libs/wily/text.c Fri Mar 4 02:07:39 2005 +++ new-diff/wily-9libs/wily/text.c Fri Mar 4 02:06:40 2005 @@ -58,12 +58,12 @@ int desired, nread; char buf[BUFFERSIZE]; extern int utftotext_unconverted; - int offset; + int offset, end, stop; /* Ensure we have enough rune space. Do it one * block to avoid fragmentation */ - desired = len*sizeof(Rune) + GAPSIZE; + desired = len + GAPSIZE; if (t->alloced < desired) { t->alloced = desired; if (t->text) @@ -73,23 +73,30 @@ t->length = 0; t->pos = 0; offset = 0; + while(len > 0) { - desired = MIN(len, BUFFERSIZE - offset); + desired = MIN(len, BUFFERSIZE - offset - 1); nread = read(fd, buf + offset, desired); if(nread<=0) return -1; - t->length += utftotext(t->text+t->length, buf, buf + nread +offset); len -= nread; + stop = end = offset + nread; + buf[end] = '\0'; + if (len > 0) /* if this is not the end of the file */ + stop -= unfullutfbytes(buf, end); + t->length += utftotext(t->text+t->length, buf, buf + stop); /* * If there were bytes at the end of the buffer that * weren't a complete rune, copy them back to the * start of the buffer for the next time. */ - if ( (offset = utftotext_unconverted)) { - memcpy(buf, buf + desired - offset, offset); - } + offset = end - stop; + if (offset > 0) + memcpy(buf, buf + end - offset, offset); } + assert(offset == 0); + t->gap = range(t->length, t->alloced); undo_reset(t); undo_start(t); Piped data is a bit complicated. I have added a small buffer in struct Key, in which to concatenate split multi-byte utfs. Then it is a smallish nightmare to cover all cases of split utf, real illegal utf, extremely small chunks of data and end of data. diff -rN -u old-diff/wily-9libs/wily/event.c new-diff/wily-9libs/wily/event.c --- old-diff/wily-9libs/wily/event.c Fri Mar 4 02:07:39 2005 +++ new-diff/wily-9libs/wily/event.c Fri Mar 4 01:42:37 2005 @@ -43,6 +43,13 @@ */ View *v; Bool first; + + /* + ** Runes with multibyte UTF codes must not be split across + ** block boundaries. Strip them off and join them in this buffer. + */ + char obuf[UTFmax+1]; + int nobuf; }; /* @@ -64,6 +71,7 @@ static Key* key_new(ulong key, int fd, Keytype t); static Key* key_findcmd(char*cmd); static void oneword(char*s, char*l); +static void output_fullutfs(Key*k, int n, char*s); static void output(Key*k, int n, char*s); static void ex_accept(int n, char*s); @@ -110,6 +118,9 @@ olabel(k->olabel, label); addrunning(k->cmd); } + + k->nobuf = 0; + return 0; } @@ -138,7 +149,7 @@ switch(k->t){ case Klisten: ex_accept(n,s); break; - case Kout: output(k, n, s); break; + case Kout: output_fullutfs(k, n, s); break; case Kmsg: outmsg(k, n, s); break; default: error("bad key type %d", k->t); break; } @@ -191,7 +202,9 @@ estop(k->key); switch(k->t) { - case Kout: if(!k->v)rmrunning(k->cmd); break; + case Kout: if (k->nobuf > 0) output(k, k->nobuf, k->obuf); + if (!k->v) rmrunning(k->cmd); + break; case Kmsg: data_fdstop(k->fd); break; case Klisten: error("Klisten closed"); default: error("bad key type %d", k->t); @@ -231,6 +244,39 @@ /* break at first whitespace */ if( (s = strpbrk(s, whitespace)) ) *s='\0'; +} + +/* +** Key has a buffer where we concatenate runes that get split between two +** (or more -- very small?) blocks. This is accomplished in four steps: +** 1) If we have an uncomplete rune from last call, try to complete it. +** 2) If output isn't exhausted and we have something from last call, +** output it -- it's either a full rune or bad utf. +** 3) Put away any unfull rune at the and of the output. +** 4) Output any remaining output, which is now only full utfs. +*/ +static void +output_fullutfs(Key *k, int n, char *s) { + if (k->nobuf > 0) { + int m = fillutfchar(k->obuf, k->nobuf, s, n); + k->nobuf+=m, s+=m, n-=m; + } + if ((n > 0) && (k->nobuf > 0)) { + output(k, k->nobuf, k->obuf); + k->nobuf = 0; + } + if (n > 0) { + int m = unfullutfbytes(s, n); + if (m > 0) { + memcpy(k->obuf, s+n-m, m); + k->obuf[m] = '\0'; + n -= m; + } + k->nobuf=m; + } + if (n > 0) { + output(k, n, s); + } } /* Copy the 'n' bytes of data in 's' to the window or output directory -- Tommy Pettersson <ptp@xxxxxxxxxxxxxx>