[arachne] Re: UTF-8 support

  • From: "Glenn McCorkle" <glennmcc@xxxxxxxxxx>
  • To: arachne@xxxxxxxxxxxxx
  • Date: Thu, 27 Mar 2008 19:14:43 -0500

Arachne at FreeLists---The Arachne Fan Club!

On Thu, 27 Mar 2008 12:27:22, Christof Lange wrote:

> Hi developers,

> well, it seems I have slept quite a long time...

> When I went online with A193, I was surprised by a comprehensive
> UTF-8 support within the main program. Thank you, thank you, thank
> you! I probably can use Arachne for another 10-15 years.

> I will have to check, whether I have installed all the necessary
> mime.cfg commands or codepage tables. My first attempt to read
> UTF-8 encoded pages with my ISO-8859-2 font got this result:

> 1. The page is not altered. When I save the page or edit it, the
> charset information in the header and the 16bit characters are
> preserved.

> 2. On the display and while looking at the HTML source with F6 the
> unicode characters are rendered with the 8bit character set.

> 3. The conversion is almost correct. Those characters which are
> different between the Windows character set cp1250 and iso-8859-2
> are displayed as an ASCII 127 character. This is probably not a bug,
> but due to my yet incomplete installation or some keyword in
> arachne.cfg may be missing/wrong.

Hi Christof,

If you think the existing UTF-8 support in the copy of v1.93 you have
now is good......

You are going to LOVE the updated package just now uploaded to ...
http://cisnet.com/glennmcc/a193gpl/

You can now add or modify the interpretation of _any_ 2 byte
or 3 byte UTF-8 encoded character by simply editing entity.cfg

--- the new entity.cfg ---

[Entity conversions]

[Named Entities]

lsquo  `
rsquo  ´ ;ASCII 180
ldquo  "
rdquo  "
mdash  -
ndash  -
euro   ? ;ASCII 129 euro symbol
circ   ^
tilde  ~
brkbar ¦ ;ASCII 166
lsaquo ` ;lsaquo is proposed but not yet ISO standardized
rsaquo ´ ;rsaquo is proposed but not yet ISO standardized (ASCII 180)
OElig  ? ;ASCII 140
oelig  ? ;ASCII 156

[Numeric Entities]
[decimal format only... HEX is converted to DEC internally]
[this section also handles UTF-8 conversions]

145  `
146  ´ ;ASCII 180
150  -
151  -
8211 -
8212 -
8216 `
8217 ´ ;ASCII 180
8220 "
8221 "
8222 "
8249 `
8250 ´ ;ASCII 180
8254 ¯ ;ASCII 175 (overscore)
8364 ? ;ASCII 129 euro symbol
8482 ?
618  i
331  n ;simulate 'small nj'
233  é
338  ? ;OElig
339  ? ;oelig
__________________________________

____Here's_Werner's_original_code_and_my_mods_to_it___________________________________________

// werner scholz  begin  Nov 8,2006   ------  utf8-table  -------
// Helper for utf8table(): try to turn the decoded code point 'uc' into a
// single 8-bit character.
//  - Code points 32..127 and 160..255 are passed through unchanged.
//  - Anything else is looked up in entity.cfg under its decimal number;
//    the first character of the configured value is used.
// Returns 1 and stores the character in *out on success, 0 if no
// conversion is available (*out is left untouched in that case).
static int utf8entity(int uc,unsigned char *out)
{
 // itoa() writes its result into this buffer, so it must be writable
 // storage: sign + 10 digits + NUL covers any 32-bit int.
 char number[12];
 char *value; // assumes configvariable() returns char* -- TODO confirm

 if((uc>31 && uc<128) || (uc>159 && uc<256))
 {
  *out=(unsigned char)uc;
  return 1;
 }

 // single lookup; the result is reused instead of querying twice
 value=configvariable(&ENTITYcfg,itoa(uc,number,10),NULL);
 if(value)
 {
//  Piip();//during testing... 'beep' when a conversion takes place
  *out=(unsigned char)*value;
  return 1;
 }
 return 0;
}

// Convert one UTF-8 encoded character (lead byte x1, following bytes
// x2..x4) into a single 8-bit character for display.
//
// Order of evaluation:
//  1. lead bytes 194 (C2) and 195 (C3) are mapped arithmetically,
//  2. 2-byte (lead 192..223) and 3-byte (lead 224..239) sequences are
//     decoded to a number and resolved via utf8entity(); a hit returns
//     immediately,
//  3. otherwise the hard-coded tables for lead bytes 196, 197 and 226
//     are consulted.
// Returns 127 if no conversion was found. x4 is currently unused.
//
// NOTE(review): x2/x3 are not checked to be continuation bytes
// (128..191) before decoding; malformed input is decoded as-is.
unsigned char utf8table(unsigned char x1,unsigned char x2,unsigned char x3,unsigned char x4)
{
 unsigned char c=127;  // square is default if utf8 converting fails !
 unsigned char found;

 if(x1==194)     // C2
 {
  if((x2>160)&&(x2<192)) c=x2;
 }

 if(x1==195)     // C3
 {
  if((x2>127)&&(x2<192)) c=x2+64;
 }
 // Here ends charset ISO-8859-1

//!!glennmcc: Mar 27, 2008 -- convert UTF-8 into numerical equiv
//and then read its needed character from entity.cfg
 if(x1>=192 && x1<=223)   // 2-byte sequence
 {
  if(utf8entity((x1-192) * 64 + (x2-128),&found)) return found;
 }

 if(x1>=224 && x1<=239)   // 3-byte sequence
 {
  if(utf8entity((x1-224) * 4096 + (x2-128) * 64 + (x3-128),&found)) return found;
 }
//original code below will take-over if this code fails to find
//the needed conversion in entity.cfg
//!!glennmcc: end

 // More utf8 characters can be added here ....

 if(x1==196)     // C4
 {
  if(x2==141) c=135;
  if(x2==140) c=135;
  if(x2==184) c=227;
  if(x2==131) c=229;
 }

 if(x1==197)     // C5
 {
  if(x2==161) c=154;
 }

 if(x1==226)     // E2
 {
  if(x2==128)
  {
   if(x3==158) c=132;
   if(x3==156) c=148;
   if(x3==147) c=173;
   if(x3==148) c=45;
   if(x3==162) c=183;
   if(x3==153) c=39;//!!glennmcc: Mar 26, 2008 -- apostrophe
  }
 }
 return c;
}
// werner scholz end
______________________________________________________________________________________________

-- 
 Glenn
 http://www.delorie.com/listserv/mime/
 http://www.cisnet.com/glennmcc/
 http://www.law.cornell.edu/constitution/constitution.table.html
                  Arachne at FreeLists                  
-- Arachne, The Premier GPL Web Browser/Suite for DOS --

Other related posts: