Unicode and XML
- From: Deblauwe Gino <admin@xxxxxxxxxxxxxx>
- To: ftcdev@xxxxxxxxxxxxx
- Date: Fri, 28 Nov 2008 11:38:24 +0100
In XML it is not wise to include Unicode characters as raw characters.
Besides the 5 standard replacements:
'&' => '&amp;'
'>' => '&gt;'
'<' => '&lt;'
'"' => '&quot;'
''' => '&apos;'
there is, for each Unicode character, a proper replacement in XML that is
widely supported, so you don't run into heavy encoding problems.
I have included an old piece of VB6 code with an incomplete list (most used
codes only):
Private Function ReplaceCodesByUnicode(strTxt As String) As String
    ' Purpose : Turn the decimal XML character references this routine knows
    '           about (for example "&#233;") back into the characters they
    '           stand for.
    ' Input   : strTxt - text that may contain "&#NNN;" references.
    ' Returns : strTxt with every listed reference replaced by its character.
    '           References that are not in the list, and all other text, are
    '           returned unchanged.
    '
    ' The search strings are built as explicit "&#NNN;" text and the
    ' replacement characters come from ChrW$, so the routine does not depend
    ' on how this source file is encoded or displayed. (In the listing this
    ' was rebuilt from, search and replacement text were identical, which
    ' made every Replace call a no-op.)
    '
    ' NOTE(review): that listing paired o-slash with U+2298 and the circumflex
    ' with U+2218. Both now use their own code points (248 and 94). Confirm
    ' that no stored text relies on the old pairing.
    Dim vntCodes As Variant
    Dim lngIdx As Long
    Dim strOut As String

    ' Code points handled, in the order of the original list.
    vntCodes = Array( _
        225, 193, 226, 194, 224, 192, 229, 197, 227, 195, 228, 196, _
        230, 198, 231, 199, 208, 240, _
        233, 201, 234, 202, 232, 200, 235, 203, _
        237, 205, 238, 206, 236, 204, 239, 207, _
        241, 209, _
        243, 211, 244, 212, 242, 210, 248, 216, 245, 213, 246, 214, _
        223, 254, 222, _
        250, 218, 219, 251, 249, 217, 252, 220, _
        253, 221, 255, _
        8364, 180, 96, 126, 168, 169, 94, 167, 181, 163, 36, 176, 64, _
        178, 179, 171, 187)

    strOut = strTxt
    For lngIdx = LBound(vntCodes) To UBound(vntCodes)
        ' None of the replacement characters is "&" or "#", so one pass per
        ' code cannot create a new reference for a later pass to pick up.
        strOut = Replace(strOut, _
                         "&#" & CStr(vntCodes(lngIdx)) & ";", _
                         ChrW$(CLng(vntCodes(lngIdx))))
    Next lngIdx

    ReplaceCodesByUnicode = strOut
End Function
Private Function ReplaceUnicodeByCodes(strTxt As String) As String
    ' Purpose : Replace the characters in this routine's list with decimal
    '           XML character references ("&#NNN;"), so that the result can
    '           be written to XML without relying on the file encoding.
    ' Input   : strTxt - the text to encode.
    ' Returns : strTxt with each listed character replaced by its reference.
    '           Every other character is copied through unchanged.
    '
    ' Characters are selected by code point (AscW) rather than by comparing
    ' string literals, so the routine does not depend on how this source file
    ' is encoded or displayed. (In the listing this was rebuilt from, each
    ' Case appended the raw character itself, so nothing was encoded.)
    '
    ' NOTE(review): that listing emitted U+2298 for o-slash and U+2218 for the
    ' circumflex. Both now emit their own code points (248 and 94). Confirm
    ' that no consumer relies on the old output.
    Const FLUSH_EVERY As Long = 4096

    Dim lngPos As Long
    Dim lngCode As Long
    Dim strChar As String
    Dim strBuf As String
    Dim strOut As String

    For lngPos = 1 To Len(strTxt)
        strChar = Mid$(strTxt, lngPos, 1)

        ' AscW returns a signed Integer; the mask yields 0..65535.
        lngCode = AscW(strChar) And &HFFFF&

        Select Case lngCode
            Case 36, 64, 94, 96, 126, _
                 163, 167 To 169, 171, 176, 178 To 181, 187, _
                 192 To 214, 216 To 246, 248 To 255, _
                 8364
                ' Listed character: emit its own decimal reference.
                strBuf = strBuf & "&#" & CStr(lngCode) & ";"

            Case 8217
                ' The right single quotation mark is written with the same
                ' reference as the acute accent, as in the original list.
                strBuf = strBuf & "&#180;"

            Case Else
                strBuf = strBuf & strChar
        End Select

        ' Move the small working buffer into the result at a fixed interval,
        ' presumably to limit the cost of repeatedly appending to one long
        ' string.
        If (lngPos Mod FLUSH_EVERY) = 0 Then
            strOut = strOut & strBuf
            strBuf = ""
        End If
    Next lngPos

    ' Append whatever is left in the buffer after the final interval.
    ReplaceUnicodeByCodes = strOut & strBuf
End Function
More characters can be found at
http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
For XML the code is in the third column, so Yen (¥) => '&#165;'
Hopefully it helps you as much as it helped me
BTW v1.2 Beta 12 is for me release quality
Greetings
Gino Deblauwe
Chief programming
UseIt Group NV
FTC Website:
http://www.truenorthsoftware.com/FormattedTextControl/FormattedTextControl.html
Set List Options (digest and vacation modes): www.freelists.org/list/ftcdev
List Archive: www.freelists.org/archives/ftcdev
Unsubscribe: Send email to ftcdev-request@xxxxxxxxxxxxx with "unsubscribe" in
the subject field.
Other related posts: