[haiku-commits] r35875 - haiku/trunk/src/kits/textencoding

  • From: michael.w.pfeiffer@xxxxxxxxx
  • To: haiku-commits@xxxxxxxxxxxxx
  • Date: Tue, 16 Mar 2010 10:28:53 +0100 (CET)

Author: laplace
Date: 2010-03-16 10:28:53 +0100 (Tue, 16 Mar 2010)
New Revision: 35875
Changeset: http://dev.haiku-os.org/changeset/35875/haiku

Modified:
   haiku/trunk/src/kits/textencoding/utf8_conversions.cpp
Log:
Replace each occurence of an invalide character. Before all invalid
characters where omitted and the substitute character was append at the end
of the input text.
Added comment how the continuation of incomplete multibyte sequences
could be solved.
Please review.


Modified: haiku/trunk/src/kits/textencoding/utf8_conversions.cpp
===================================================================
--- haiku/trunk/src/kits/textencoding/utf8_conversions.cpp      2010-03-16 
09:21:26 UTC (rev 35874)
+++ haiku/trunk/src/kits/textencoding/utf8_conversions.cpp      2010-03-16 
09:28:53 UTC (rev 35875)
@@ -29,6 +29,51 @@
 int iconvctl(iconv_t icd, int request, void* argument);
 
 
+static void
+discard_invalid_input_character(iconv_t* conversion, char** inputBuffer,
+       size_t* inputLeft)
+{
+       if (*inputLeft == 0)
+               return;
+
+       char outputBuffer[1];
+
+       // skip the invalid input character only
+       size_t left = 1;
+       for (; left <= *inputLeft; left ++) {
+               // reset internal state
+               iconv(*conversion, NULL, NULL, NULL, NULL);
+
+               char* buffer = *inputBuffer;
+               char* output = outputBuffer;
+               size_t outputLeft = 1;
+               size_t size = iconv(*conversion, &buffer, &left,
+                       &output, &outputLeft);
+
+               if (size != (size_t)-1) {
+                       // should not reach here
+                       break;
+               }
+
+               if (errno == EINVAL) {
+                       // too few input bytes provided,
+                       // increase input buffer size and try again
+                       continue;
+               }
+
+               if (errno == EILSEQ) {
+                       // minimal size of input buffer found
+                       break;
+               }
+
+               // should not reach here
+       };
+
+       *inputBuffer += left;
+       *inputLeft -= left;
+}
+
+
 status_t
 convert_encoding(const char* from, const char* to, const char* src,
        int32* srcLen, char* dst, int32* dstLen, int32* state,
@@ -71,11 +116,8 @@
                        switch (errno) {
                                case EILSEQ: // unable to generate a 
corresponding character
                                {
-                                       // discard the input character
-                                       const int one = 1, zero = 0;
-                                       iconvctl(conversion, 
ICONV_SET_DISCARD_ILSEQ, (void*)&one);
-                                       iconv(conversion, inputBuffer, 
&inputLeft, &dst, &outputLeft);
-                                       iconvctl(conversion, 
ICONV_SET_DISCARD_ILSEQ, (void*)&zero);
+                                       
discard_invalid_input_character(&conversion, inputBuffer,
+                                               &inputLeft);
 
                                        // prepare to convert the substitute 
character to target encoding
                                        char original = substitute;
@@ -96,7 +138,12 @@
                                        break;
                                }
 
-                               case EINVAL: // incomplete multibyte sequence 
in the input
+                               case EINVAL: // incomplete multibyte sequence 
at the end of the input
+                                       // TODO inputLeft bytes from 
inputBuffer should
+                                       // be stored in state variable, so that 
conversion
+                                       // can continue when the caller 
provides the missing
+                                       // bytes with the next call of this 
method
+
                                        // we just eat bad bytes, as part of 
robustness/best-effort
                                        inputBuffer++;
                                        inputLeft--;


Other related posts: