[yunqa.de] Re: DIRegEx - translating utf8 match info to utf16 - yunqa

[yunqa.de] Re: DIRegEx - translating utf8 match info to utf16

From: Delphi Inspiration <delphi@xxxxxxxx>
To: yunqa@xxxxxxxxxxxxx
Date: Fri, 16 Nov 2007 17:05:49 +0100
Hello Jim Bretti,

>I can display the matched string with StrDecodeUtf8(re.MatchedStr), but I'm
>having trouble with RE.MatchedStrLength and RE.MatchStrFirstCharPos.  These
>values are relative to the UTF8 encoded source ... is there any way to
>translate the values so they are relative to the original utf16 source?

You are looking for the number of UTF-8 characters (or code points) contained 
in the subject string up to RE.MatchedStrFirstCharPos. Since UTF-8 is a 
variable-length encoding, counting them is unfortunately unavoidable.

The BufCountUtf8Chars function in DIUtils.pas is available for the task. Please 
find below the modified DIRegEx_Simple_Unicode.dpr source code to demonstrate 
its usage. I have added a simple assertion to make sure it is working as 
expected.

Ralf

program DIRegEx_Simple_Unicode;

{ Console demo: runs a Unicode-aware regular expression over a UTF-8 encoded
  copy of a WideString and translates each match's UTF-8 byte position and
  byte length back into character position and character count relative to
  the original WideString. }

{$APPTYPE Console}
{$I DI.inc}

{ Increase the maximum stack size to reduce the potential of stack overflow
  when matching very demanding regular expressions. }
{$MAXSTACKSIZE $00200000}

uses
  DIUtils, DIRegEx;

const
  { The following string contains non-ASCII characters. It is encoded in the
    Windows-1252 character set and might be illegible if your Delphi uses a
    different locale. }
  SUBJECT_STRING: WideString =
    'DIRegEx supports Unicode via UTF-8: ÄÖÜ äöü ÁÉÍÓÚ áéíóú.';

var
  RegEx: TDIRegEx;
  { Hold UTF-8 representations. }
  SubjectStr8: AnsiString = '';
  s8: AnsiString = '';
  { Decoded (WideString) form of the current match. }
  w: WideString;
  {$IFOPT C+}
  { Only needed when assertions are compiled in. }
  w_Assert: WideString;
  {$ENDIF}
  MatchCount: Integer;
  Uni_MatchedStrFirstCharPos, Uni_MatchedStrLen: Cardinal;
begin
  WriteLn('CAUTION: This Unicode demo is a console application. Unfortunately, the console');
  WriteLn('         does not display Unicode characters properly. Therefore some');
  WriteLn('         characters might not look right on the console, but they are still');
  WriteLn('         handled correctly internally.');
  WriteLn;

  RegEx := TDIPerlRegEx.Create{$IFNDEF DI_No_RegEx_Component}(nil){$ENDIF};
  try
    { Switch on UTF-8 support for DIRegEx. With coUtf8 set, it treats all
      input and output as UTF-8 characters. Unless text is already in UTF-8
      format, you need to encode / decode it first before passing it to
      DIRegEx. }
    RegEx.CompileOptions := RegEx.CompileOptions + [coUtf8];

    { Encode the WideString to UTF-8 and set it as the subject string. }
    SubjectStr8 := StrEncodeUtf8(SUBJECT_STRING);
    RegEx.SetSubjectStr(SubjectStr8);

    (* Tell RegEx what we want to search for: The \p{L}+ pattern matches any
       word made up of a sequence of Unicode letters. *)
    RegEx.MatchPattern := StrEncodeUtf8('\p{L}+');

    { Loop to extract, count and write all matches. }
    MatchCount := 0;
    if RegEx.Match(0) > 0 then
      repeat
        Inc(MatchCount);

        { Retrieve the matched string (which is in UTF-8 format)
          and convert it to a WideString for display. }
        s8 := RegEx.MatchedStr;
        w := StrDecodeUtf8(s8);

        { Calculate Unicode / WideString match positions and length
          from the UTF-8 values: count the UTF-8 characters that precede
          the match, and those inside the match.
          NOTE(review): this presumes BufCountUtf8Chars counts code points.
          A code point above U+FFFF takes two WideChar units, so for such
          text the counts would not equal WideString indexes - confirm if
          non-BMP input must be supported. }
        Uni_MatchedStrFirstCharPos :=
          BufCountUtf8Chars(PAnsiChar(SubjectStr8), RegEx.MatchedStrFirstCharPos);
        Uni_MatchedStrLen :=
          BufCountUtf8Chars(RegEx.MatchedStrPtr, RegEx.MatchedStrLength);

        {$IFOPT C+}
        { Assert that the calculated Unicode match locations are correct.
          Copy is 1-based, hence the + 1 on the zero-based character count. }
        w_Assert := Copy(SUBJECT_STRING,
          Uni_MatchedStrFirstCharPos + 1, Uni_MatchedStrLen);
        Assert(w_Assert = w, w_Assert + ' - ' + w);
        {$ENDIF}

        { Write the matched string to the console. }
        WriteLn('# ', MatchCount,
          ' - Uni FirstCharPos: ', Uni_MatchedStrFirstCharPos: 2,
          ' - Uni Length: ', Uni_MatchedStrLen: 2,
          ' - ', {$IFDEF COMPILER_6_UP}w{$ELSE}AnsiString(w){$ENDIF});

      until RegEx.MatchNext < 0;

  finally
    RegEx.Free;
  end;

  WriteLn;
  WriteLn('Done - Press ENTER to exit');
  ReadLn;
end.

_______________________________________________
Delphi Inspiration mailing list
yunqa@xxxxxxxxxxxxx
https://www.freelists.org/list/yunqa
References:
- [yunqa.de] DIRegEx - translating utf8 match info to utf16
  - From: Jim Bretti
[yunqa.de] Re: DIRegEx - translating utf8 match info to utf16

Other related posts: