[yunqa.de] Re: Parsing HTML-Tags detect HTML-Tags wrong

From: Delphi Inspiration <delphi@xxxxxxxx>
To: yunqa@xxxxxxxxxxxxx
Date: Fri, 07 Mar 2008 13:41:20 +0100

Simon Schaeberle wrote:

>I think there is a problem in the parser in combination with widestrings.

No, DIHtmlParser does not have any problems parsing Delphi WideStrings. See 
the compilable example below for details (same contents as the attached *.dpr
file).

Ralf

{ DIHtmlParser WideString 2 parsing example.

  Character codepages and encodings are confusing to many who want to parse
  Unicode HTML. By default, DIHtmlParser handles all input as Latin1
  (ISO-8859-1). This applies to AnsiStrings and WideStrings alike, since both
  string types can potentially store any kind of encoding.

  AnsiStrings can, for example, hold Latin1 or UTF-8 encoded text. While
  Delphi WideStrings usually contain UTF-16LE, they may also contain UTF-16BE.

  It is therefore important to set the TDIHtmlParser.ReadMethods to the correct
  encoding ahead of the parsing. In this demo we are parsing UTF-16LE
  WideStrings, which is made explicit by the following line of code:

    HtmlParser.ReadMethods := Read_UTF_16_LE;

  Visit the DIHtmlParser homepage for latest information and updates:

    http://www.yunqa.de/delphi/

  Copyright (c) 2002-2008 Ralf Junker, The Delphi Inspiration <delphi@xxxxxxxx>

------------------------------------------------------------------------------ }

program DIHtmlParser_WideString_Parsing_2;

{$APPTYPE Console}
{$I DI.inc}

uses
  DIUnicode, DIHtmlParser;

{ Feeds AInput to a freshly created TDIHtmlParser that reads its source as
  UTF-16LE, and returns the concatenation of every reported HTML tag's Code
  and every reported text piece, in the order the parser delivers them.
  Pieces of any other type contribute nothing to the result. }
function ParseHtml(const AInput: WideString): WideString;
var
  Parser: TDIHtmlParser;
begin
  Result := '';
  Parser := TDIHtmlParser.Create(nil);
  try
    { State the source encoding before handing over the buffer; the notes at
      the top of this program say the parser otherwise assumes Latin1. }
    Parser.ReadMethods := Read_UTF_16_LE;
    { The ...W variant is the one that accepts a WideString. }
    Parser.SetSourceBufferAsStrW(AInput);

    { Presumably fiShow makes the parser report text pieces and start/end
      tags - confirm against the DIHtmlParser documentation. }
    Parser.FilterText := fiShow;
    Parser.FilterHtmlTags.SetStartEnd(fiShow);

    while Parser.ParseNextPiece do
    begin
      case Parser.PieceType of
        ptText:
          Result := Result + Parser.DataAsStrW;
        ptHtmlTag:
          Result := Result + Parser.HtmlTag.Code;
      end;
    end;
  finally
    { Release the parser even if parsing raised an exception. }
    Parser.Free;
  end;
end;

const
  { Sample HTML fed to ParseHtml: a HEAD and a BODY element, each holding a
    comment, with the lines separated by CR LF (#13#10). }
  HTML_DATA: WideString =
    '<HEAD>'#13#10 +
    '  <!--Head Comment-->'#13#10 +
    '</HEAD>'#13#10 +
    '<BODY>'#13#10 +
    '  <!--Body Comment-->'#13#10 +
    'Body Text'#13#10 +
    '</BODY>';

begin
  { Optional one-time registration, done up front before any parsing. This
    example does not actually need these calls, but they are required for
    correct parsing if the HTML contains <PRE>, <TITLE>, <SCRIPT> or <STYLE>
    elements. }
  RegisterHtmlTags;
  RegisterHtmlAttribs;
  RegisterHtmlDecodingEntities;

  { Show the untouched input first ... }
  WriteLn('Original HTML:');
  WriteLn;
  WriteLn(HTML_DATA);
  WriteLn;

  { ... followed by what ParseHtml reassembles from the parsed pieces. }
  WriteLn('Parsed HTML:');
  WriteLn;
  WriteLn(ParseHtml(HTML_DATA));
  WriteLn;

  { Keep the console window open until the user confirms. }
  WriteLn('Done - Press ENTER to exit.');
  ReadLn;
end.

{ DIHtmlParser WideString 2 parsing example.

  Character codepages and encodings are confusing to many who want to parse
  Unicode HTML. By default, DIHtmlParser handles all input as Latin1
  (ISO-8859-1). This applies to AnsiStrings and WideStrings alike, since both
  string types can potentially store any kind of encoding.

  AnsiStrings can, for example, hold Latin1 or UTF-8 encoded text. While
  Delphi WideStrings usually contain UTF-16LE, they may also contain UTF-16BE.

  It is therefore important to set the TDIHtmlParser.ReadMethods to the correct
  encoding ahead of the parsing. In this demo we are parsing UTF-16LE
  WideStrings, which is made explicit by the following line of code:

    HtmlParser.ReadMethods := Read_UTF_16_LE;

  Visit the DIHtmlParser homepage for latest information and updates:

    http://www.yunqa.de/delphi/

  Copyright (c) 2002-2008 Ralf Junker, The Delphi Inspiration <delphi@xxxxxxxx>

------------------------------------------------------------------------------ }

program DIHtmlParser_WideString_Parsing_2;

{$APPTYPE Console}
{$I DI.inc}

uses
  DIUnicode, DIHtmlParser;

{ Feeds AInput to a freshly created TDIHtmlParser that reads its source as
  UTF-16LE, and returns the concatenation of every reported HTML tag's Code
  and every reported text piece, in the order the parser delivers them.
  Pieces of any other type contribute nothing to the result. }
function ParseHtml(const AInput: WideString): WideString;
var
  Parser: TDIHtmlParser;
begin
  Result := '';
  Parser := TDIHtmlParser.Create(nil);
  try
    { State the source encoding before handing over the buffer; the notes at
      the top of this program say the parser otherwise assumes Latin1. }
    Parser.ReadMethods := Read_UTF_16_LE;
    { The ...W variant is the one that accepts a WideString. }
    Parser.SetSourceBufferAsStrW(AInput);

    { Presumably fiShow makes the parser report text pieces and start/end
      tags - confirm against the DIHtmlParser documentation. }
    Parser.FilterText := fiShow;
    Parser.FilterHtmlTags.SetStartEnd(fiShow);

    while Parser.ParseNextPiece do
    begin
      case Parser.PieceType of
        ptText:
          Result := Result + Parser.DataAsStrW;
        ptHtmlTag:
          Result := Result + Parser.HtmlTag.Code;
      end;
    end;
  finally
    { Release the parser even if parsing raised an exception. }
    Parser.Free;
  end;
end;

const
  { Sample HTML fed to ParseHtml: a HEAD and a BODY element, each holding a
    comment, with the lines separated by CR LF (#13#10). }
  HTML_DATA: WideString =
    '<HEAD>'#13#10 +
    '  <!--Head Comment-->'#13#10 +
    '</HEAD>'#13#10 +
    '<BODY>'#13#10 +
    '  <!--Body Comment-->'#13#10 +
    'Body Text'#13#10 +
    '</BODY>';

begin
  { Optional one-time registration, done up front before any parsing. This
    example does not actually need these calls, but they are required for
    correct parsing if the HTML contains <PRE>, <TITLE>, <SCRIPT> or <STYLE>
    elements. }
  RegisterHtmlTags;
  RegisterHtmlAttribs;
  RegisterHtmlDecodingEntities;

  { Show the untouched input first ... }
  WriteLn('Original HTML:');
  WriteLn;
  WriteLn(HTML_DATA);
  WriteLn;

  { ... followed by what ParseHtml reassembles from the parsed pieces. }
  WriteLn('Parsed HTML:');
  WriteLn;
  WriteLn(ParseHtml(HTML_DATA));
  WriteLn;

  { Keep the console window open until the user confirms. }
  WriteLn('Done - Press ENTER to exit.');
  ReadLn;
end.

References:
- [yunqa.de] Re: Parsing HTML-Tags detect HTML-Tags wrong
  - From: Simon Schaeberle

[yunqa.de] Re: Parsing HTML-Tags detect HTML-Tags wrong

Other related posts: