[yunqa.de] Re: DIXML encoding

  • From: Delphi Inspiration <delphi@xxxxxxxx>
  • To: yunqa@xxxxxxxxxxxxx
  • Date: Tue, 23 Jul 2013 16:40:25 +0200

On 23.07.2013 14:02, Andy Zubov wrote:

How i can read xml file with win-1251(russian lang) encoding?

Win-1251 encoding is not built into DIXml but is easily added as a custom encoding.

Please take a look at the demo project located in

  DIXml\Demos\DIXml_Custom_Encodings\DIXml_Custom_Encodings.dpr

You will see that it registers a new character encoding, which is then used by the XML document.

I have added windows-1251 support to DIXmlConverters.pas (attached). With this, you can register the windows-1251 character encoding:

  { Register the windows-1251 encoding. }
  xmlNewCharEncodingHandler(
    'windows-1251', xmlWin1251ToUtf8, xmlUtf8ToWin1251);

Afterwards, parse your document(s) as usual and windows-1251 support will be automatic.

Ralf
{-------------------------------------------------------------------------------
 
 Copyright (c) 1999-2013 Ralf Junker, The Delphi Inspiration
 Internet: http://www.yunqa.de/delphi/
 E-Mail:   delphi@xxxxxxxx

-------------------------------------------------------------------------------}

unit DIXmlConverters;

{$I DICompilers.inc}

interface

uses
  DIXml;

{ Shared contract of the four converters declared below:
    Out_    - destination buffer.
    OutLen  - in: number of bytes available in Out_;
              out: number of bytes written to Out_.
    In_     - source buffer.
    InLen   - in: number of bytes available in In_;
              out: number of bytes consumed from In_.
    Result  - -1 if Out_, OutLen or InLen is nil,
              -2 if the shared wrapper flags a conversion failure,
              otherwise the number of bytes written to Out_.
  The signature is the one passed to xmlNewCharEncodingHandler. }

{ GB2312 -> UTF-8; implemented with euc_cn_mbtowc and utf8_wctomb. }
function xmlGB2312ToUtf8(
  Out_: C_char_ptr;
  OutLen: C_int_ptr;
  In_: C_char_ptr;
  InLen: C_int_ptr): C_int;

{ UTF-8 -> GB2312; implemented with utf8_mbtowc and euc_cn_wctomb. }
function xmlUtf8ToGB2312(
  Out_: C_char_ptr;
  OutLen: C_int_ptr;
  In_: C_char_ptr;
  InLen: C_int_ptr): C_int;

{ windows-1251 -> UTF-8; implemented with cp1251_mbtowc and utf8_wctomb. }
function xmlWin1251ToUtf8(
  Out_: C_char_ptr;
  OutLen: C_int_ptr;
  In_: C_char_ptr;
  InLen: C_int_ptr): C_int;

{ UTF-8 -> windows-1251; implemented with utf8_mbtowc and cp1251_wctomb. }
function xmlUtf8ToWin1251(
  Out_: C_char_ptr;
  OutLen: C_int_ptr;
  In_: C_char_ptr;
  InLen: C_int_ptr): C_int;

implementation

uses
  DIConverters;

{ Generic transcoding loop shared by all converters in this unit.

  Reads one character at a time from In_ with the mbtowc callback and writes
  it to Out_ with the wctomb callback until the input is used up, the output
  buffer is full, or one of the callbacks returns a non-positive value.

  Parameters:
    Out_    - destination buffer.
    OutLen  - in: bytes available in Out_; out: bytes written to Out_.
    In_     - source buffer (may be nil).
    InLen   - in: bytes available in In_; out: bytes consumed from In_.
    mbtowc  - decodes one character from the source encoding.
    wctomb  - encodes one character into the target encoding.

  Result:
    -1  if Out_, OutLen or InLen is nil (nothing is written back).
    -2  if mbtowc returned RET_ILSEQ or wctomb returned RET_ILUNI; InLen^ and
        OutLen^ still report what was consumed / produced before the failure.
     0  if In_ is nil or InLen^ <= 0; InLen^ and OutLen^ are both set to 0.
    otherwise the number of bytes written to Out_ (same value as OutLen^). }
function xmlEncodingWrapper(
  Out_: C_char_ptr;
  OutLen: C_int_ptr;
  In_: C_char_ptr;
  InLen: C_int_ptr;
  mbtowc: xxx_mbtowc;
  wctomb: xxx_wctomb): C_int;
var
  convIn, convOut: conv_struct;
  pIn, pOut: PAnsiChar;
  lIn, lOut, rIn, rOut: Integer;
  u: ucs4_t;
begin
  if Assigned(Out_) and Assigned(OutLen) and Assigned(InLen) then
    begin
      Result := 0;
      if Assigned(In_) then
        begin
          lIn := InLen^;
          if lIn > 0 then
            begin
              pOut := Out_; lOut := OutLen^; pIn := In_;
              { Both converters start from a reset state. }
              convIn.ioState := 0; convOut.ioState := 0;
              repeat

                { Decode one character; rIn is the number of input bytes used. }
                rIn := mbtowc(@convIn, u, pIn, lIn);
                if rIn > 0 then
                  begin
                    if lOut > 0 then
                      begin

                        { Encode it; rOut is the number of output bytes used. }
                        rOut := wctomb(@convOut, pOut, u, lOut);
                        if rOut > 0 then
                          begin
                            Inc(pOut, rOut); Dec(lOut, rOut);
                          end
                        else
                          begin
                            { Test the encoder's return code (rOut). The
                              previous code compared lOut, the remaining
                              output space, which is always > 0 here, so an
                              unencodable character was never reported. }
                            if rOut = RET_ILUNI then
                              Result := -2;
                            { Any other non-positive code (presumably "output
                              too small" - confirm in DIConverters) just stops
                              the loop; the current character is not consumed. }
                            Break;
                          end;
                      end
                    else
                      { Output buffer full: stop without consuming this character. }
                      Break;
                    { Only now mark the input bytes as consumed. }
                    Inc(pIn, rIn); Dec(lIn, rIn);
                  end
                else
                  begin
                    { Invalid input is an error; any other non-positive code
                      (presumably truncated input - confirm) just stops. }
                    if rIn = RET_ILSEQ then
                      Result := -2;
                    Break;
                  end;
              until lIn = 0;

              { Report consumed / produced byte counts, also on failure. }
              InLen^ := pIn - In_;
              OutLen^ := pOut - Out_;
              if Result = 0 then
                Result := OutLen^;
              Exit;
            end
        end;
      { No input: Result is 0 here, so both counts are reported as 0. }
      InLen^ := Result;
      OutLen^ := Result;
    end
  else
    Result := -1;
end;

{ GB2312 -> UTF-8 converter.
  Delegates to xmlEncodingWrapper, decoding with euc_cn_mbtowc and encoding
  with utf8_wctomb; the wrapper's result is returned unchanged. }
function xmlGB2312ToUtf8(
  Out_: C_char_ptr;
  OutLen: C_int_ptr;
  In_: C_char_ptr;
  InLen: C_int_ptr): C_int;
begin
  Result := xmlEncodingWrapper(
    Out_, OutLen,
    In_, InLen,
    euc_cn_mbtowc,
    utf8_wctomb);
end;

{ UTF-8 -> GB2312 converter.
  Delegates to xmlEncodingWrapper, decoding with utf8_mbtowc and encoding
  with euc_cn_wctomb; the wrapper's result is returned unchanged. }
function xmlUtf8ToGB2312(
  Out_: C_char_ptr;
  OutLen: C_int_ptr;
  In_: C_char_ptr;
  InLen: C_int_ptr): C_int;
begin
  Result := xmlEncodingWrapper(
    Out_, OutLen,
    In_, InLen,
    utf8_mbtowc,
    euc_cn_wctomb);
end;

{ windows-1251 -> UTF-8 converter.
  Delegates to xmlEncodingWrapper, decoding with cp1251_mbtowc and encoding
  with utf8_wctomb; the wrapper's result is returned unchanged. }
function xmlWin1251ToUtf8(
  Out_: C_char_ptr;
  OutLen: C_int_ptr;
  In_: C_char_ptr;
  InLen: C_int_ptr): C_int;
begin
  Result := xmlEncodingWrapper(
    Out_, OutLen,
    In_, InLen,
    cp1251_mbtowc,
    utf8_wctomb);
end;

{ UTF-8 -> windows-1251 converter.
  Delegates to xmlEncodingWrapper, decoding with utf8_mbtowc and encoding
  with cp1251_wctomb; the wrapper's result is returned unchanged. }
function xmlUtf8ToWin1251(
  Out_: C_char_ptr;
  OutLen: C_int_ptr;
  In_: C_char_ptr;
  InLen: C_int_ptr): C_int;
begin
  Result := xmlEncodingWrapper(
    Out_, OutLen,
    In_, InLen,
    utf8_mbtowc,
    cp1251_wctomb);
end;

end.

Other related posts: