[yunqa.de] Re: How to TIDY HTML with unicode via DITidy?

  • From: Delphi Inspiration <delphi@xxxxxxxx>
  • To: yunqa@xxxxxxxxxxxxx
  • Date: Sun, 08 Feb 2009 12:56:50 +0100

Bear Xu wrote:

>1.
>I do not know the encoding of the HTML file (it may be windows-1251, 
>gb2312, utf8, or something else), e.g. when used in a crawler.
>How can I use DITidy to parse an HTML file with Unicode? Is it possible?

DITidy readily supports these encodings:

  raw
  ascii
  latin0
  latin1
  utf8
  iso2022
  mac
  win1252
  ibm858
  utf16le
  utf16be
  utf16
  big5
  shiftjis

DITidy also detects encoding markers present in HTML documents. If such a 
marker is missing, you can set the encoding manually before the parsing.

  tidySetCharEncoding(Doc, PAnsiChar(cboEncoding.Text));

See DITidy_Analyse.dpr for an example implementation.

Web servers often return the character encodings along with their response 
which you can pass to tidySetCharEncoding().

>2. 
>How can I parse WideString source code and return the cleaned-up and 
>repaired HTML source code:
> 
>function TidyHtml(HTML_Source:WideString) : WideString;
>begin
>  ???
>end;

Please see attached demo project for such a function.

Ralf 
{ DITidy "Hello World" example project. Writes the string 'Hello World' in HTML.

  Visit the DITidy homepage for latest information and updates:

    http://www.yunqa.de/delphi/

  Copyright (c) 2009 Ralf Junker, The Delphi Inspiration <delphi@xxxxxxxx>

------------------------------------------------------------------------------ }

program DITidy_Hello_World_Utf16;

{$APPTYPE CONSOLE}
{$I DI.inc}

uses
  {$IFDEF FastMM}FastMM4, {$ENDIF}DITidy;

{ Set up an UnicodeString / WideString allocator to optimize memory handling
  and usage when writing the output HTML. }

{ Allocator callback: allocates a new WideString large enough to hold nBytes
  bytes and returns it as an untyped pointer.

  Self is not used. nBytes is rounded up to a whole number of 2-byte
  characters. The contents of the new block are not initialized. }
function UnicodeString_alloc(
  Self: TidyAllocatorPtr;
  nBytes: Cardinal): Pointer;
begin
  { Result must be nil before it is treated as a WideString: SetString
    releases whatever its destination held before, and an uninitialized
    function result would hand a garbage pointer to that release. }
  Result := nil;
  SetString(WideString(Result), nil, Succ(nBytes) div 2);
end;

{ Allocator callback: resizes the WideString behind Block so that it can hold
  nBytes bytes and returns the pointer of the resized string.
  Self is not used. }
function UnicodeString_realloc(
  Self: TidyAllocatorPtr;
  Block: Pointer;
  nBytes: Cardinal): Pointer;
var
  CharCount: Cardinal;
begin
  { Round the byte count up to whole 2-byte characters. }
  CharCount := Succ(nBytes) div 2;
  { Work on Result so the resized string pointer is what gets returned. }
  Result := Block;
  SetLength(WideString(Result), CharCount);
end;

{ Allocator callback: releases the WideString that Block points to.
  Self is not used. }
procedure UnicodeString_Free(
  Self: TidyAllocatorPtr;
  Block: Pointer);
begin
  { Shrinking to zero length releases the string's memory. }
  SetLength(WideString(Block), 0);
end;

{ Allocator callback for the vtbl's panic slot. Intentionally does nothing:
  both Self and Msg are ignored.
  NOTE(review): presumably invoked by the library on allocation failure --
  confirm against the DITidy / libtidy allocator documentation. }
procedure UnicodeString_panic(
  Self: TidyAllocatorPtr;
  const Msg: ctmbstr);
begin
end;

{ Method table wiring the four UnicodeString_* callbacks into an allocator,
  and the allocator instance that refers to that table. The instance is what
  tidyBufInitUnicodeString assigns to a buffer. }
const
  TidyAllocatorUnicodeStringVtbl: TidyAllocatorVtbl = (
    alloc: UnicodeString_alloc;
    realloc: UnicodeString_realloc;
    Free: UnicodeString_Free;
    panic: UnicodeString_panic);

  TidyAllocatorUnicodeString: TidyAllocator = (
    vtbl: @TidyAllocatorUnicodeStringVtbl);

{ Initializes Buf^ as an empty buffer that uses the WideString-based
  allocator (TidyAllocatorUnicodeString) for its memory. }
procedure tidyBufInitUnicodeString(Buf: TidyBufferPtr);
begin
  { Zero every field first, then install the allocator. }
  FillChar(Buf^, SizeOf(Buf^), 0);
  with Buf^ do
    allocator := @TidyAllocatorUnicodeString;
end;

//------------------------------------------------------------------------------

{ Cleans up some HTML and returns the result. }
function TidyHtml(const Html: WideString): WideString;
{ Parses Html as UTF-16LE, runs tidy's clean-and-repair on it and returns the
  saved document as a WideString (UTF-16LE, without BOM).

  The input buffer borrows the memory of Html directly and is therefore never
  freed here. The output buffer uses the WideString allocator, so its memory
  can be read back as a WideString once saving is done.

  The tidy document and the output buffer are released in finally blocks so
  that neither leaks if an exception is raised along the way. Return codes of
  the tidy calls are not checked. }
const
  UnicodeStringEncoding: PAnsiChar = 'utf16le';
var
  InBuf, OutBuf: TidyBuffer;
  TidyHandle: TidyDoc;
begin
  { Prepare the output buffer first so the outer finally can always free it. }
  tidyBufInitUnicodeString(@OutBuf);
  try
    TidyHandle := tidyCreate;
    try
      { Prepare the input buffer: point it at the characters of Html. }
      FillChar(InBuf, SizeOf(InBuf), 0);
      InBuf.bp := Pointer(Html);
      InBuf.Size := Length(Html) * SizeOf(Html[1]);
      { Set the input encoding to match UnicodeString / WideString. }
      tidySetCharEncoding(TidyHandle, UnicodeStringEncoding);

      { Parse the HTML. }
      tidyParseBuffer(TidyHandle, @InBuf);

      { Clean up HTML. }
      tidyCleanAndRepair(TidyHandle);

      { Set the output encoding to match UnicodeString / WideString. }
      tidySetOutCharEncoding(TidyHandle, UnicodeStringEncoding);
      { Do not write BOM. }
      tidyOptSetBool(TidyHandle, TidyOutputBOM, 0);

      { Write cleaned-up HTML. }
      tidySaveBuffer(TidyHandle, @OutBuf);
    finally
      tidyRelease(TidyHandle);
    end;

    { Shorten output string to actual length ... }
    SetLength(WideString(Pointer(OutBuf.bp)), OutBuf.Size div 2);
    { ... and copy to result. }
    Result := WideString(Pointer(OutBuf.bp));
  finally
    tidyBufFree(@OutBuf);
  end;
end;

{ Sample input for TidyHtml. The <p> element is deliberately left unclosed so
  that there is something to repair. The DOCTYPE is split into two literals
  because a Pascal string literal cannot span source lines. }
const
  Html: WideString =
    '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" ' +
    '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' +
    '<html xmlns="http://www.w3.org/1999/xhtml">' +
    '<head>' +
    '<title>Hello World</title>' +
    '</head>' +
    '<body>' +
    '<p>Hello World' +
    '</body>' +
    '</html>';

{ Program entry point: prints the sample HTML, then the tidied version
  returned by TidyHtml, and waits for ENTER before exiting. }
begin
  WriteLn('Tidying HTML input:');
  WriteLn;
  WriteLn(Html);

  WriteLn;
  WriteLn('Result HTML Output:');
  WriteLn;
  WriteLn(TidyHtml(Html));

  WriteLn('Done - Press ENTER to exit');
  ReadLn;
end.

Other related posts: