Bear Xu wrote:

>1.
>I do not know the encoding of the html file (maybe windows 1251 or
>gb2312, or utf8 or others), e.g. when used in a crawler.
>How can I use DITidy to parse the html file with unicode? Is it possible?

DITidy readily supports these encodings: raw ascii latin0 latin1 utf8 iso2022 mac win1252 ibm858 utf16le utf16be utf16 big5 shiftjis

DITidy also detects encoding markers present in HTML documents. If such a marker is missing, you can set the encoding manually before parsing:

  tidySetCharEncoding(Doc, PAnsiChar(cboEncoding.Text));

See DITidy_Analyse.dpr for an example implementation. Web servers often return the character encoding along with their response, which you can pass to tidySetCharEncoding().

>2.
>How to parse WideString source code, and return the cleaned and repaired
>html source code:
>
>function TidyHtml(HTML_Source:WideString) : WideString;
>begin
>  ???
>end;

Please see the attached demo project for such a function.

Ralf
{ DITidy "Hello World" example project.

  Writes the string 'Hello World' in HTML.

  Demonstrates how to pass a WideString to DITidy as UTF-16 LE input and how
  to receive the cleaned-up HTML directly as a WideString, using a custom
  tidy allocator whose memory blocks are WideString payloads.

  Visit the DITidy homepage for latest information and updates:

    http://www.yunqa.de/delphi/

  Copyright (c) 2009 Ralf Junker, The Delphi Inspiration <delphi@xxxxxxxx>

------------------------------------------------------------------------------ }

program DITidy_Hello_World_Utf16;

{$APPTYPE CONSOLE}
{$I DI.inc}

uses
  {$IFDEF FastMM}FastMM4, {$ENDIF}DITidy;

{ Set up an UnicodeString / WideString allocator to optimize memory handling
  and usage when writing the output HTML.

  Every block handed out by this allocator is the character data pointer of a
  WideString, so the finished output buffer can be reinterpreted as a
  WideString without an additional conversion step. }

{ Allocates a new WideString large enough to hold nBytes bytes and returns
  its data pointer. The byte count is rounded up to whole WideChars. }
function UnicodeString_alloc(
  Self: TidyAllocatorPtr;
  nBytes: Cardinal): Pointer;
begin
  { Result is not initialized on entry. It must be nil before it is treated
    as a WideString, otherwise the string assignment would try to release a
    garbage pointer as the string's previous value. }
  Result := nil;
  SetString(WideString(Result), nil, Succ(nBytes) div 2);
end;

{ Resizes the WideString behind Block to hold nBytes bytes (rounded up to
  whole WideChars) and returns its, possibly moved, data pointer. }
function UnicodeString_realloc(
  Self: TidyAllocatorPtr;
  Block: Pointer;
  nBytes: Cardinal): Pointer;
begin
  Result := Block;
  SetLength(WideString(Result), Succ(nBytes) div 2);
end;

{ Releases the WideString behind Block. }
procedure UnicodeString_Free(
  Self: TidyAllocatorPtr;
  Block: Pointer);
begin
  WideString(Block) := '';
end;

{ Panic callback of the allocator. Intentionally does nothing: Msg is
  ignored and control returns to the caller. }
procedure UnicodeString_panic(
  Self: TidyAllocatorPtr;
  const Msg: ctmbstr);
begin
end;

const
  { Function table of the WideString based allocator. }
  TidyAllocatorUnicodeStringVtbl: TidyAllocatorVtbl = (
    alloc: UnicodeString_alloc;
    realloc: UnicodeString_realloc;
    Free: UnicodeString_Free;
    panic: UnicodeString_panic);

  { The allocator instance that buffers reference. }
  TidyAllocatorUnicodeString: TidyAllocator = (
    vtbl: @TidyAllocatorUnicodeStringVtbl);

{ Zero-initializes Buf and attaches the WideString allocator to it. }
procedure tidyBufInitUnicodeString(
  Buf: TidyBufferPtr);
begin
  FillChar(Buf^, SizeOf(Buf^), 0);
  Buf^.allocator := @TidyAllocatorUnicodeString;
end;

//------------------------------------------------------------------------------

{ Cleans up some HTML and returns the result.

  Html is handed to tidy as UTF-16 LE without copying it. The output is
  written as UTF-16 LE without BOM into a buffer backed by the WideString
  allocator and then copied into the function result.

  The status codes returned by the individual tidy calls are not inspected.
  The tidy document and the output buffer are released even if an exception
  is raised along the way. }
function TidyHtml(const Html: WideString): WideString;
const
  UnicodeStringEncoding: PAnsiChar = 'utf16le';
var
  InBuf, OutBuf: TidyBuffer;
  TidyHandle: TidyDoc;
begin
  TidyHandle := tidyCreate;
  try
    { Prepare the input buffer. It only borrows the memory of Html and has no
      allocator of its own, so it must not be freed. }
    FillChar(InBuf, SizeOf(InBuf), 0);
    InBuf.bp := Pointer(Html);
    InBuf.Size := Length(Html) * SizeOf(WideChar);

    { Set the input encoding to match UnicodeString / WideString. }
    tidySetCharEncoding(TidyHandle, UnicodeStringEncoding);

    { Parse the HTML. }
    tidyParseBuffer(TidyHandle, @InBuf);

    { Clean up HTML. }
    tidyCleanAndRepair(TidyHandle);

    { Prepare the output buffer. }
    tidyBufInitUnicodeString(@OutBuf);
    try
      { Set the output encoding to match UnicodeString / WideString. }
      tidySetOutCharEncoding(TidyHandle, UnicodeStringEncoding);

      { Do not write BOM. }
      tidyOptSetBool(TidyHandle, TidyOutputBOM, 0);

      { Write cleaned-up HTML. }
      tidySaveBuffer(TidyHandle, @OutBuf);

      { Shorten output string to actual length ... }
      SetLength(WideString(Pointer(OutBuf.bp)), OutBuf.Size div 2);

      { ... and copy to result. }
      Result := WideString(Pointer(OutBuf.bp));
    finally
      tidyBufFree(@OutBuf);
    end;
  finally
    tidyRelease(TidyHandle);
  end;
end;

const
  { Sample input. The missing </p> end tag is left for tidy to repair. }
  Html: WideString =
    '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' +
    '<html xmlns="http://www.w3.org/1999/xhtml">' +
    '<head>' +
    '<title>Hello World</title>' +
    '</head>' +
    '<body>' +
    '<p>Hello World' +
    '</body>' +
    '</html>';

begin
  WriteLn('Tidying HTML input:');
  WriteLn;
  WriteLn(Html);
  WriteLn;

  WriteLn('Result HTML Ouput:');
  WriteLn;
  WriteLn(TidyHtml(Html));

  WriteLn('Done - Press ENTER to exit');
  ReadLn;
end.