[yunqa.de] Re: DIHtmlParser

  • From: Delphi Inspiration <delphi@xxxxxxxx>
  • To: yunqa@xxxxxxxxxxxxx
  • Date: Wed, 06 Jan 2010 16:46:59 +0100

At 14:57 06.01.2010, randy wrote:

>Hi ,  what i want is to extract the scripts using your DIHtmlParser , then 
>saving the extracted script into a new  html file named Ext_Script.html

Very simple. See the demo project below (also included as an attachment) for
an example implementation.

Ralf

{ DIHtmlParser example showing how to extract <SCRIPT> contents.

  Visit the DIHtmlParser homepage for latest information and updates:

    http://www.yunqa.de/delphi/

  Copyright (c) 2002-2010 Ralf Junker, The Delphi Inspiration <delphi@xxxxxxxx>

------------------------------------------------------------------------------ }

program DIHtmlParser_Extract_Scripts;

{$APPTYPE Console}
{$I DI.inc}

uses
  {$IFDEF FastMM}FastMM4, {$ENDIF}DIUtils, DIHtmlMisc, DIHtmlParser;

const
  { Sample document fed to the parser: a small page whose body holds a
    single <script> element. }
  HTML_DATA: AnsiString =
    '<html>' + CRLF +
    '<head>' + CRLF +
    '<title>' + CRLF +
    '  my web project' + CRLF +
    '</title>' + CRLF +
    '</head>' + CRLF +
    '<body>' + CRLF +
    '<script type="text/javascript">test test tes te t</script>  ' + CRLF +
    '</body>' + CRLF +
    '</html>';

{ Runs a TDIHtmlParser over Html and writes to the console, for every
  <SCRIPT> start tag, an empty line followed by the tag's code, and for
  every script piece its contents. The parser is created here and freed
  again before returning, even if parsing raises an exception. }
procedure PrintScripts(const Html: AnsiString);
var
  Parser: TDIHtmlParser;
begin
  Parser := TDIHtmlParser.Create(nil);
  try
    Parser.SourceBufferAsStrA := Html;

    { Ask the parser to hand HTML start tags and script contents to the
      application; both filters are switched to fiShow.
      NOTE(review): the default filter values are not visible in this
      example - see the DIHtmlParser documentation. }
    Parser.FilterHtmlTags.StartTags := fiShow;
    Parser.FilterScripts := fiShow;

    while Parser.ParseNextPiece do
      case Parser.PieceType of

        ptHtmlTag:
          { Only <SCRIPT> start tags are of interest; all other tags
            are ignored. }
          if (Parser.HtmlTag.TagID = TAG_SCRIPT_ID) and
            (Parser.HtmlTag.TagType = ttStartTag) then
            begin
              WriteLn;
              WriteLn(string(Parser.HtmlTag.Code));
            end;

        ptScript:
          { The text between <SCRIPT> and </SCRIPT> is now available as
            the parser's data; it is fetched as a string and printed.

            Alternatively it may be saved as UTF-16LE to file or stream:

            * Parser.SaveDataToFile('FileName.txt');
            * Parser.SaveDataToStream(StreamInstance); }
          WriteLn(string(Parser.DataAsStrW));

        { Any other piece type is skipped. }
      end;

  finally
    Parser.Free;
  end;
end;

begin
  { HTML tags and attributes are registered before any parsing. }
  RegisterHtmlTags;
  RegisterHtmlAttribs;

  { Echo the input document first. }
  WriteLn('Original HTML:');
  WriteLn;
  WriteLn(HTML_DATA);
  WriteLn;

  WriteLn('List of all script tags and contents:');
  PrintScripts(HTML_DATA);

  { Keep the console window open until the user confirms. }
  WriteLn;
  WriteLn('Done - Press ENTER to exit.');
  ReadLn;
end.
{ DIHtmlParser example showing how to extract <SCRIPT> contents.

  Visit the DIHtmlParser homepage for latest information and updates:

    http://www.yunqa.de/delphi/

  Copyright (c) 2002-2010 Ralf Junker, The Delphi Inspiration <delphi@xxxxxxxx>

------------------------------------------------------------------------------ }

program DIHtmlParser_Extract_Scripts;

{$APPTYPE Console}
{$I DI.inc}

uses
  {$IFDEF FastMM}FastMM4, {$ENDIF}DIUtils, DIHtmlMisc, DIHtmlParser;

const
  { Sample document fed to the parser: a small page whose body holds a
    single <script> element. }
  HTML_DATA: AnsiString =
    '<html>' + CRLF +
    '<head>' + CRLF +
    '<title>' + CRLF +
    '  my web project' + CRLF +
    '</title>' + CRLF +
    '</head>' + CRLF +
    '<body>' + CRLF +
    '<script type="text/javascript">test test tes te t</script>  ' + CRLF +
    '</body>' + CRLF +
    '</html>';

{ Runs a TDIHtmlParser over Html and writes to the console, for every
  <SCRIPT> start tag, an empty line followed by the tag's code, and for
  every script piece its contents. The parser is created here and freed
  again before returning, even if parsing raises an exception. }
procedure PrintScripts(const Html: AnsiString);
var
  Parser: TDIHtmlParser;
begin
  Parser := TDIHtmlParser.Create(nil);
  try
    Parser.SourceBufferAsStrA := Html;

    { Ask the parser to hand HTML start tags and script contents to the
      application; both filters are switched to fiShow.
      NOTE(review): the default filter values are not visible in this
      example - see the DIHtmlParser documentation. }
    Parser.FilterHtmlTags.StartTags := fiShow;
    Parser.FilterScripts := fiShow;

    while Parser.ParseNextPiece do
      case Parser.PieceType of

        ptHtmlTag:
          { Only <SCRIPT> start tags are of interest; all other tags
            are ignored. }
          if (Parser.HtmlTag.TagID = TAG_SCRIPT_ID) and
            (Parser.HtmlTag.TagType = ttStartTag) then
            begin
              WriteLn;
              WriteLn(string(Parser.HtmlTag.Code));
            end;

        ptScript:
          { The text between <SCRIPT> and </SCRIPT> is now available as
            the parser's data; it is fetched as a string and printed.

            Alternatively it may be saved as UTF-16LE to file or stream:

            * Parser.SaveDataToFile('FileName.txt');
            * Parser.SaveDataToStream(StreamInstance); }
          WriteLn(string(Parser.DataAsStrW));

        { Any other piece type is skipped. }
      end;

  finally
    Parser.Free;
  end;
end;

begin
  { HTML tags and attributes are registered before any parsing. }
  RegisterHtmlTags;
  RegisterHtmlAttribs;

  { Echo the input document first. }
  WriteLn('Original HTML:');
  WriteLn;
  WriteLn(HTML_DATA);
  WriteLn;

  WriteLn('List of all script tags and contents:');
  PrintScripts(HTML_DATA);

  { Keep the console window open until the user confirms. }
  WriteLn;
  WriteLn('Done - Press ENTER to exit.');
  ReadLn;
end.

Other related posts: