fix for big bug, trashed multi-byte utf

From: Tommy Pettersson <ptp@xxxxxxxxxxxxxx>
To: wilyfans@xxxxxxxxxxxxx
Date: Fri, 4 Mar 2005 03:42:08 +0100
Hello,

I have fixed a rather serious bug in wily, but first some
small ones.  (patches agains wily-0.13.41-9libs)


If you enter a rune with hex value 0000 (zero) with M-X, 9libs
barfs on stderr and wily starts acting goofy.  I thought a
while about what to do about this, and I think it should be
regarded as a deliberate action from the user.  Wily can't
have rune zero in the buffer, but it can represent an error
rune, so I made it insert a such.  This has the advantage
that you can enter an error rune in the tag to search for it
in a large text with B3.  I did this a lot when fixing the
bib bug coming later.

diff -rN -u old-diff/wily-9libs/wily/keyboard.c 
new-diff/wily-9libs/wily/keyboard.c
--- old-diff/wily-9libs/wily/keyboard.c Fri Mar  4 02:07:39 2005
+++ new-diff/wily-9libs/wily/keyboard.c Fri Mar  4 01:56:42 2005
@@ -16,6 +16,8 @@
 void
 dokeyboard(View *v, Rune r) {
        switch(r) {
+       case 0:                 addrune(v,Runeerror); break;
+
        case DownArrow:
        case UpArrow:
        case Home:


Here's a rendering bug.  Put the cursor towards the end of a
long text.  Do 'Get short-file' in the tag.  Wily will reuse
the view, update and refresh it.  However, it will forget
to update visible.p0, and thus be lost in the new file,
with various interesting results.

diff -rN -u old-diff/wily-9libs/wily/view.c new-diff/wily-9libs/wily/view.c
--- old-diff/wily-9libs/wily/view.c     Fri Mar  4 02:07:39 2005
+++ new-diff/wily-9libs/wily/view.c     Fri Mar  4 01:49:22 2005
@@ -369,6 +369,7 @@
 view_refresh(View*v)
 {
        if(ISVISIBLE(v)) {
+               v->visible.p0 = 0;
                v->sel = range(v->visible.p0, v->visible.p0);
                frdelete(&v->f, 0, v->f.nchars);
                fill(v);


And here's the big one.  Large texts with multi-byte utfs are
trashed.  This applies to both reading files and piping data.
Both file reading and data pipeing is chopped up in buffersize
portions, and if a multibyte utf sequence crosses a buffer
border, it will be interpreted as one or more illegal utf
sequences.

First some new functions.  In utftotext I also set utfHadNulls
to true if the text (really) contains illegal utf, since this
will (as well as null chars) make it impossible to save the
file as it was.  Another bug was that utftotext_unconverted
is not set to zero when there where no unconverted utfs,
which did confuse functions that called utftotext multiple
times in sequence.

diff -rN -u old-diff/wily-9libs/wily/proto.h new-diff/wily-9libs/wily/proto.h
--- old-diff/wily-9libs/wily/proto.h    Fri Mar  4 02:07:39 2005
+++ new-diff/wily-9libs/wily/proto.h    Fri Mar  4 01:35:31 2005
@@ -228,6 +228,8 @@
 int            diag                            (char*,char*, ...);
 RETSIGTYPE     cleanup_and_abort       (int);
 RETSIGTYPE     cleanup_and_die (int);
+int            fillutfchar             (char*,int,char*,int);
+int            unfullutfbytes          (char*,int);
 Rstring        utf2rstring             (char*utf);
 
 /* vgeom.c */
diff -rN -u old-diff/wily-9libs/wily/util.c new-diff/wily-9libs/wily/util.c
--- old-diff/wily-9libs/wily/util.c     Fri Mar  4 02:07:39 2005
+++ new-diff/wily-9libs/wily/util.c     Fri Mar  4 01:49:22 2005
@@ -75,6 +75,42 @@
 }
 
 /*
+** Append bytes from `s' (of length `ns') to `r' (of length `nr') until
+** `r' has a full rune.  Stop if `s' ends or an illegal utf sequence
+** is detected.  Return the number of bytes appended.
+** sizeof `r' must be at least UTFmax+1 bytes.
+*/
+int
+fillutfchar(char *r, int nr, char *s, int ns) {
+       int     c = 0;
+
+       r += nr;
+       while ((nr < UTFmax) && (ns > 0) && ((((uchar)*s)&0xC0) == 0x80))
+               *r++=*s++, nr++, ns--, c++;
+       *r = '\0';
+       return c;
+}
+
+/*
+** If utf string `s' of length `n' ends with an uncomplete rune, return
+** the number of bytes in that rune.  (If `s' ends with an illegal utf
+** sequence, return zero.)
+ */
+int
+unfullutfbytes(char *s, int n) {
+       int     e = 1;
+
+       for (e = 1; (e < n) && (e < UTFmax); ++e) {
+               if (((uchar)s[n-e]&0xC0) != 0x80) {
+                       if (!fullrune(s+n-e, e))
+                               return e;
+                       break;
+               }
+       }
+       return 0;
+}
+
+/*
  * Return Rstring for utf.  Either s.r0 == s.r1 == 0, 
  * or s.r0 will need to be free
  */
@@ -435,8 +471,22 @@
 
 /* Convert UTF from s1 to s2 into runes, store them at r
  * Returns the number of runes stored.
- * Sets utfHadNulls to indicate that some of the UTF contained nulls,
- * which are _not_ converted into null Runes.
+ * Sets utfHadNulls to indicate that some of the UTF contained nulls or bad 
UTF.
+ * Bad UTF and nulls will be replaced with rune Runeerror (u0080).
+ * Set utftotext_unconverted to the number of bytes before the end that
+ * belongs to a char extending beyond the end and wasn't converted.
+ *
+ * This function will look beyond the end s2 if the last UTF combination
+ * is not completely within the range.  If converting an entire range
+ * at once, it should have an extra '\0' at s2 (right after the end)
+ * so that an incomplete (bad) multibyte UTF combination at the end
+ * will be correctly detected.  It converting a subrange of a larger UTF
+ * sequence, make sure the end s2 is after a complete UTF combination,
+ * OR leave enough bytes beyond the end to complete any multibyte UTF char
+ * (UTFmax) and examine utftotext_unconverted after the call returns.
+ *
+ * Note: This function will be called multiple times during a big read,
+ * so utfHadNulls must be preset to false by the caller as appropriate.
  */
 ulong
 utftotext(Rune *r, char *s1, char *s2)
@@ -456,6 +506,8 @@
                        *q = *v++;
                } else {
                        v += chartorune(q, v);
+                       if (*q == Runeerror)
+                               utfHadNulls = true;
                }
                assert(*q);
                q++;
@@ -465,6 +517,8 @@
                q--;
                length  = runelen(*q);
                utftotext_unconverted = s2 - (v-length);
+       } else {
+               utftotext_unconverted = 0;
        }
        return q-r;
 }
diff -rN -u old-diff/wily-9libs/wily/file.c new-diff/wily-9libs/wily/file.c
--- old-diff/wily-9libs/wily/file.c     Fri Mar  4 02:07:39 2005
+++ new-diff/wily-9libs/wily/file.c     Fri Mar  4 01:51:03 2005
@@ -179,7 +179,7 @@
        text_read(d->t, fd, buf->st_size);
        close(fd);
        if (utfHadNulls) {
-               diag(path, "removed nulls from [%s]", path);
+               errno=0, diag(path, "nulls or illegal utf in [%s]", path);
                data_setbackup(d,0);
        }
 


File reads is the simple one.  I have also fixed an
overallocation of memory that got bundled in this diff.
The size for a new buffer was multiplied with Runesize twice.

diff -rN -u old-diff/wily-9libs/wily/text.c new-diff/wily-9libs/wily/text.c
--- old-diff/wily-9libs/wily/text.c     Fri Mar  4 02:07:39 2005
+++ new-diff/wily-9libs/wily/text.c     Fri Mar  4 02:06:40 2005
@@ -58,12 +58,12 @@
        int     desired, nread;
        char    buf[BUFFERSIZE];
        extern int utftotext_unconverted;
-       int     offset;
+       int     offset, end, stop;
        
        /* Ensure we have enough rune space.  Do it one
         * block to avoid fragmentation
         */
-       desired = len*sizeof(Rune) + GAPSIZE;
+       desired = len + GAPSIZE;
        if (t->alloced < desired) {
                t->alloced = desired;
                if (t->text)
@@ -73,23 +73,30 @@
        t->length = 0;
        t->pos = 0;
        offset = 0;
+
        while(len > 0) {
-               desired = MIN(len, BUFFERSIZE - offset);
+               desired = MIN(len, BUFFERSIZE - offset - 1);
                nread = read(fd, buf + offset, desired);
                if(nread<=0)
                        return -1;
-               t->length += utftotext(t->text+t->length, buf, buf + nread 
+offset);
                len -= nread;
+               stop = end = offset + nread;
+               buf[end] = '\0';
+               if (len > 0) /* if this is not the end of the file */
+                       stop -= unfullutfbytes(buf, end);
+               t->length += utftotext(t->text+t->length, buf, buf + stop);
                
                /*
                 * If there were bytes at the end of the buffer that
                 * weren't a complete rune, copy them back to the
                 * start of the buffer for the next time.
                 */
-               if ( (offset = utftotext_unconverted)) {
-                       memcpy(buf, buf + desired - offset, offset);
-               }
+               offset = end - stop;
+               if (offset > 0)
+                       memcpy(buf, buf + end - offset, offset);
        }
+       assert(offset == 0);
+
        t->gap = range(t->length, t->alloced);
        undo_reset(t);
        undo_start(t);


Piped data is a bit complicated.  I have added a small buffer
in struct Key, in which to concatenate split multi-byte utfs.
Then it is a smallish nightmare to cover all cases of split
utf, real illegal utf, extremely small chunks of data and
end of data.

diff -rN -u old-diff/wily-9libs/wily/event.c new-diff/wily-9libs/wily/event.c
--- old-diff/wily-9libs/wily/event.c    Fri Mar  4 02:07:39 2005
+++ new-diff/wily-9libs/wily/event.c    Fri Mar  4 01:42:37 2005
@@ -43,6 +43,13 @@
         */
        View            *v;
        Bool            first;
+
+       /*
+       ** Runes with multibyte UTF codes must not be split across
+       ** block boundaries.  Strip them off and join them in this buffer.
+       */
+       char            obuf[UTFmax+1];
+       int             nobuf;
 };
 
 /*
@@ -64,6 +71,7 @@
 static Key*    key_new(ulong key, int fd, Keytype t);
 static Key*    key_findcmd(char*cmd);
 static void    oneword(char*s, char*l);
+static void    output_fullutfs(Key*k, int n, char*s);
 static void    output(Key*k, int n, char*s);
 static void    ex_accept(int n, char*s);
 
@@ -110,6 +118,9 @@
                olabel(k->olabel, label);
                addrunning(k->cmd);
        }
+
+       k->nobuf = 0;
+
        return 0;
 }
 
@@ -138,7 +149,7 @@
 
        switch(k->t){
        case Klisten:   ex_accept(n,s); break;
-       case Kout:      output(k, n, s);        break;
+       case Kout:      output_fullutfs(k, n, s);       break;
        case Kmsg:      outmsg(k, n, s);        break;
        default:                error("bad key type %d", k->t);         break;
        }
@@ -191,7 +202,9 @@
        estop(k->key);
 
        switch(k->t) {
-       case Kout:      if(!k->v)rmrunning(k->cmd); break;
+       case Kout:      if (k->nobuf > 0)       output(k, k->nobuf, k->obuf);
+                               if (!k->v)                      
rmrunning(k->cmd);
+                               break;
        case Kmsg:      data_fdstop(k->fd); break;
        case Klisten:   error("Klisten closed");
        default:                error("bad key type %d", k->t);
@@ -231,6 +244,39 @@
        /* break at first whitespace */
        if( (s = strpbrk(s, whitespace)) )
                *s='\0';
+}
+
+/*
+** Key has a buffer where we concatenate runes that get split between two
+** (or more -- very small?) blocks.  This is accomplished in four steps:
+** 1)  If we have an uncomplete rune from last call, try to complete it.
+** 2)  If output isn't exhausted and we have something from last call,
+** output it -- it's either a full rune or bad utf.
+** 3)  Put away any unfull rune at the and of the output.
+** 4)  Output any remaining output, which is now only full utfs.
+*/
+static void
+output_fullutfs(Key *k, int n, char *s) {
+       if (k->nobuf > 0) {
+               int m = fillutfchar(k->obuf, k->nobuf, s, n);
+               k->nobuf+=m, s+=m, n-=m;
+       }
+       if ((n > 0) && (k->nobuf > 0)) {
+               output(k, k->nobuf, k->obuf);
+               k->nobuf = 0;
+       }
+       if (n > 0) {
+               int m = unfullutfbytes(s, n);
+               if (m > 0) {
+                       memcpy(k->obuf, s+n-m, m);
+                       k->obuf[m] = '\0';
+                       n -= m;
+               }
+               k->nobuf=m;
+       }
+       if (n > 0) {
+               output(k, n, s);
+       }
 }
 
 /* Copy the 'n' bytes of data in 's' to the window or output directory


-- 
Tommy Pettersson <ptp@xxxxxxxxxxxxxx>
fix for big bug, trashed multi-byte utf

Other related posts: