View Issue Details

IDProjectCategoryView StatusLast Update
0020405LazarusLCLpublic2011-10-08 01:48
ReportermalcomeAssigned ToFelipe Monteiro de Carvalho 
PrioritynormalSeverityminorReproducibilityalways
Status closedResolutionfixed 
Product Version0.9.31 (SVN)Product Build 
Target VersionFixed in Version0.9.31 (SVN) 
Summary0020405: TStringListUTF8.DoCompareText in lazutf8classes unit.
DescriptionIt seems that the

function TStringListUTF8.DoCompareText(const s1, s2: string): PtrInt;
begin
  if CaseSensitive then
    Result:= CompareStr(s1,s2)
  else
    Result:= CompareText(s1,s2);
end;

 is better.
It is an error of the patch which I created.
TagsNo tags attached.
Fixed in Revision32703, 32729, 32731
LazTarget0.99.0
WidgetsetWin32/Win64
Attached Files
  • UTF8Compare.patch (3,108 bytes)
    Index: components/lazutils/lazutf8.pas
    ===================================================================
    --- components/lazutils/lazutf8.pas	(revision 32717)
    +++ components/lazutils/lazutf8.pas	(working copy)
    @@ -1147,7 +1147,7 @@
      ------------------------------------------------------------------------------}
     function UTF8CompareStr(const S1, S2: utf8string): Integer;
     begin
    -  //Result := WideCompareStr(UTF8ToUTF16(S1), UTF8ToUTF16(S2));
    +  Result := CompareStr(S1, S2);
     end;
     
     {------------------------------------------------------------------------------
    @@ -1159,8 +1159,90 @@
       This function guarantees proper collation on all supported platforms.
      ------------------------------------------------------------------------------}
     function UTF8CompareText(const S1, S2: utf8string): Integer;
    +
    +  procedure _UTF8LowerCase(p: PChar; var c: Int64; var l: integer);
    +  begin
    +    c:= 0; l:= 0;
    +    case Byte(p^) of
    +      $00..$7F: begin
    +        c:= Byte(p^);
    +        case c of
    +          $61..$7A: c := c - $20;
    +        end;
    +        l:= 1;
    +      end;
    +      $C2..$DF: begin
    +        c:= (Byte(p^) shl 8) or Byte((p+1)^);
    +        case c of
    +          $CEB1..$CEBF: c := c - $20; // Greek Characters
    +          $CF80..$CF89: c := c - $E0; // Greek Characters
    +        end;
    +        l:= 2;
    +      end;
    +      $E0..$EF: begin
    +        c:= (Byte(p^) shl 16) or (Byte((p+1)^) shl 8) or Byte((p+2)^);
    +        case c of
    +          $EFBD81..$EFBD9A: begin // ZENKAKU alphabet
    +            c:= $EFBC00 or (Byte((p+2)^) + $20);
    +          end;
    +        end;
    +        l:= 3;
    +      end;
    +      $F0..$F7: begin
    +        c:= (Byte(p^) shl 24) or (Byte((p+1)^) shl 16) or (Byte((p+2)^) shl 8)
    +         or Byte((p+3)^);
    +        case c of
    +          $F0FFFFFF..$F0FFFFFF: ;
    +        end;
    +        l:= 4;
    +      end;
    +      $F8..$FB: begin
    +        c:= (Byte(p^) shl 32) or (Byte((p+1)^) shl 24) or (Byte((p+2)^) shl 16)
    +         or (Byte((p+3)^) shl 8) or Byte((p+4)^);
    +        case c of
    +          $F8FFFFFFFF..$F8FFFFFFFF: ;
    +        end;
    +        l:= 5;
    +      end;
    +      $FC..$FD: begin
    +        c:= (Byte(p^) shl 40) or (Byte((p+1)^) shl 32) or (Byte((p+2)^) shl 24)
    +         or (Byte((p+3)^) shl 16) or (Byte((p+4)^) shl 8) or Byte((p+5)^);
    +        case c of
    +           $FCFFFFFFFFFF..$FCFFFFFFFFFF: ;
    +        end;
    +        l:= 6;
    +      end;
    +    end;
    +  end;
    +
    +var
    +  i, l, l2, Count, Count1, Count2: Integer;
    +  c, c2: Int64;
    +  p1, p2: PChar;
     begin
    -  //Result := WideCompareText(UTF8ToUTF16(S1), UTF8ToUTF16(S2));
    +  Count1 := UTF8Length(S1);
    +  Count2 := UTF8Length(S2);
    +  if Count1 > Count2 then
    +    Count := Count2
    +  else
    +    Count := Count1;
    +  i := 0;
    +  if Count > 0 then
    +  begin
    +    p1 := @S1[1];
    +    p2 := @S2[1];
    +    while i < Count do
    +    begin
    +      _UTF8LowerCase(p1, c{%H-}, l{%H-});
    +      _UTF8LowerCase(p2, c2{%H-}, l2{%H-});
    +      if (l <> l2) or (c <> c2) then Break;
    +      Inc(p1, l); Inc(p2, l); Inc(i, l);
    +    end;
    +  end;
    +  if i < Count then
    +    Result := c - c2
    +  else
    +    Result := Count1 - Count2;
     end;
     
     procedure InternalInit;
    
    UTF8Compare.patch (3,108 bytes)
  • UTF8Compare.patch_v2 (2,547 bytes)
  • UTF8Compare_v3.patch (2,525 bytes)
    Index: lazutils/lazutf8.pas
    ===================================================================
    --- lazutils/lazutf8.pas	(revision 32719)
    +++ lazutils/lazutf8.pas	(working copy)
    @@ -1147,7 +1147,7 @@
      ------------------------------------------------------------------------------}
     function UTF8CompareStr(const S1, S2: utf8string): Integer;
     begin
    -  //Result := WideCompareStr(UTF8ToUTF16(S1), UTF8ToUTF16(S2));
    +  Result := CompareStr(S1, S2);
     end;
     
     {------------------------------------------------------------------------------
    @@ -1159,8 +1159,73 @@
       This function guarantees proper collation on all supported platforms.
      ------------------------------------------------------------------------------}
     function UTF8CompareText(const S1, S2: utf8string): Integer;
    +
    +  procedure _UTF8LowerCase(p: PChar; var c, l: integer);
    +  begin
    +    c:= 0; l:= 0;
    +    case Byte(p^) of
    +      $00..$7F: begin
    +        c:= Byte(p^);
    +        case c of
    +          $61..$7A: c := c - $20;
    +        end;
    +        l:= 1;
    +      end;
    +      $C2..$DF: begin
    +        c:= (Byte(p^) shl 8) or Byte((p+1)^);
    +        case c of
    +          $CEB1..$CEBF: c := c - $20; // Greek Characters
    +          $CF80..$CF89: c := c - $E0; // Greek Characters
    +        end;
    +        l:= 2;
    +      end;
    +      $E0..$EF: begin
    +        c:= (Byte(p^) shl 16) or (Byte((p+1)^) shl 8) or Byte((p+2)^);
    +        case c of
    +          $EFBD81..$EFBD9A: begin // ZENKAKU alphabet
    +            c:= $EFBC00 or (Byte((p+2)^) + $20);
    +          end;
    +        end;
    +        l:= 3;
    +      end;
    +      $F0..$F7: begin
    +        c:= (Byte(p^) shl 24) or (Byte((p+1)^) shl 16) or (Byte((p+2)^) shl 8)
    +         or Byte((p+3)^);
    +        case c of
    +          $F0FFFFFF..$F0FFFFFF: ;
    +        end;
    +        l:= 4;
    +      end;
    +    end;
    +  end;
    +
    +var
    +  i, c, c2, l, l2, Count, Count1, Count2: Integer;
    +  p1, p2: PChar;
     begin
    -  //Result := WideCompareText(UTF8ToUTF16(S1), UTF8ToUTF16(S2));
    +  Count1 := UTF8Length(S1);
    +  Count2 := UTF8Length(S2);
    +  if Count1 > Count2 then
    +    Count := Count2
    +  else
    +    Count := Count1;
    +  i := 0;
    +  if Count > 0 then
    +  begin
    +    p1 := @S1[1];
    +    p2 := @S2[1];
    +    while i < Count do
    +    begin
    +      _UTF8LowerCase(p1, c{%H-}, l{%H-});
    +      _UTF8LowerCase(p2, c2{%H-}, l2{%H-});
    +      if (l <> l2) or (c <> c2) or (l = 0) then Break;
    +      Inc(p1, l); Inc(p2, l); Inc(i, l);
    +    end;
    +  end;
    +  if i < Count then
    +    Result := c - c2
    +  else
    +    Result := Count1 - Count2;
     end;
     
     procedure InternalInit;
    
    UTF8Compare_v3.patch (2,525 bytes)
  • UTF8LowerCase&CompareText.patch (4,662 bytes)
    Index: components/lazutils/lazutf8.pas
    ===================================================================
    --- components/lazutils/lazutf8.pas	(revision 32724)
    +++ components/lazutils/lazutf8.pas	(working copy)
    @@ -1094,46 +1094,81 @@
       end;
     end;
     
    +function UTF8LowerCaseRaw(p: PChar; var c: LongWord; var l: integer): boolean;
    +begin
    +  c:= 0; l:= 0;
    +  Result:= True;
    +  case Byte(p^) of
    +    $00..$7F: begin
    +      c:= Byte(p^);
    +      case c of
    +        $41..$5A: c := c + $20;
    +        else Result:= False;
    +      end;
    +      l:= 1;
    +    end;
    +    $C2..$DF: begin
    +      c:= (Byte(p^) shl 8) or Byte((p+1)^);
    +      case c of
    +        $CE91..$CE9F: c := c + $20; // Greek Characters
    +        $CEA0..$CEA9: c := c + $E0; // Greek Characters
    +        $D090..$D09F: c := c + $20; // Cyrillic alphabet
    +        $D0A0..$D0AF: c := c + $E0; // Cyrillic alphabet
    +        else Result:= False;
    +      end;
    +      l:= 2;
    +    end;
    +    $E0..$EF: begin
    +      c:= (Byte(p^) shl 16) or (Byte((p+1)^) shl 8) or Byte((p+2)^);
    +      case c of
    +        $EFBCA1..$EFBCBA: c:= c + $E0; // ZENKAKU alphabet
    +        else Result:= False;
    +      end;
    +      l:= 3;
    +    end;
    +    $F0..$F7: begin
    +      c:= (Byte(p^) shl 24) or (Byte((p+1)^) shl 16) or (Byte((p+2)^) shl 8)
    +       or Byte((p+3)^);
    +      case c of
    +        $F0FFFFFF..$F0FFFFFF: ;
    +        else Result:= False;
    +      end;
    +      l:= 4;
    +    end;
    +  end;
    +end;
    +
     function UTF8LowerCase(const s: utf8string): utf8string;
     var
    -  i: PtrInt;
    -  CharLen: integer;
    -  OldCode: LongWord;
    -  NewCode: LongWord;
    -  NewCharLen: integer;
    +  i, l0, l: integer;
    +  c: LongWord;
    +  p: PChar;
     begin
    -  Result:=s;
    -  i:=1;
    -  while i<=length(Result) do begin
    -    case Result[i] of
    -    { First ASCII chars }
    -    'A'..'Z':
    -      begin
    -        Result[i]:=chr(ord(Result[i])+32);
    -        inc(i);
    +  Result := '';
    +  i := 1;
    +  l0:= Length(s);
    +  p := @s[1];
    +  while i <= l0 do begin
    +    if UTF8LowerCaseRaw(p, c{%H-}, l{%H-}) then begin
    +      case l of
    +        1: Result := Result + Char(c);
    +        2: Result := Result + Char(c shr 8) + Char(c and $FF);
    +        3: Result := Result + Char(c shr 16) + Char((c shr 8) and $FF) +
    +         Char(c and $FF);
    +        4: Result := Result + Char(c shr 24) + Char((c shr 16) and $FF) +
    +         Char((c shr 8) and $FF) + Char(c and $FF);
    +        else Break;
           end;
    -    { Now chars with multiple bytes }
    -    #192..#240:
    -      begin
    -        OldCode:=UTF8CharacterToUnicode(@Result[i],CharLen);
    -        NewCode:=UnicodeLowercase(OldCode);
    -        if NewCode=OldCode then begin
    -          inc(i,CharLen);
    -        end else begin
    -          UniqueString(Result);
    -          NewCharLen:=UnicodeToUTF8(NewCode,@Result[i]);
    -          if CharLen=NewCharLen then begin
    -            inc(i,NewCharLen);
    -          end else begin
    -            // string size changed => use slower function
    -            Result:=UTF8LowercaseDynLength(s);
    -            exit;
    -          end;
    -        end;
    +    end else begin
    +      case l of
    +        1: Result := Result + p^;
    +        2: Result := Result + p^ + (p+1)^;
    +        3: Result := Result + p^ + (p+1)^ + (p+2)^;
    +        4: Result := Result + p^ + (p+1)^ + (p+2)^ + (p+3)^;
    +        else Break;
           end;
    -    else
    -      inc(i);
         end;
    +    Inc(p, l); Inc(i, l);
       end;
     end;
     
    @@ -1147,7 +1182,7 @@
      ------------------------------------------------------------------------------}
     function UTF8CompareStr(const S1, S2: utf8string): Integer;
     begin
    -  //Result := WideCompareStr(UTF8ToUTF16(S1), UTF8ToUTF16(S2));
    +  Result := CompareStr(S1, S2);
     end;
     
     {------------------------------------------------------------------------------
    @@ -1159,8 +1194,33 @@
       This function guarantees proper collation on all supported platforms.
      ------------------------------------------------------------------------------}
     function UTF8CompareText(const S1, S2: utf8string): Integer;
    +var
    +  i, l, l2, Count, Count1, Count2: Integer;
    +  c, c2: LongWord;
    +  p1, p2: PChar;
     begin
    -  //Result := WideCompareText(UTF8ToUTF16(S1), UTF8ToUTF16(S2));
    +  Count1 := UTF8Length(S1);
    +  Count2 := UTF8Length(S2);
    +  if Count1 > Count2 then
    +    Count := Count2
    +  else
    +    Count := Count1;
    +  i := 0;
    +  if Count > 0 then begin
    +    p1 := @S1[1];
    +    p2 := @S2[1];
    +    while i < Count do begin
    +      UTF8LowerCaseRaw(p1, c{%H-}, l{%H-});
    +      UTF8LowerCaseRaw(p2, c2{%H-}, l2{%H-});
    +      if (l <> l2) or (c <> c2) or (l = 0) then Break;
    +      Inc(p1, l); Inc(p2, l);
    +      Inc(i);
    +    end;
    +  end;
    +  if i < Count then
    +    Result := c - c2
    +  else
    +    Result := Count1 - Count2;
     end;
     
     procedure InternalInit;
    
  • UTF8LowerCase&CompareText_v2.patch (4,566 bytes)
    Index: components/lazutils/lazutf8.pas
    ===================================================================
    --- components/lazutils/lazutf8.pas	(revision 32724)
    +++ components/lazutils/lazutf8.pas	(working copy)
    @@ -1094,46 +1094,81 @@
       end;
     end;
     
    +function UTF8LowerCaseRaw(p: PChar; var c: LongWord; var l: integer): boolean;
    +begin
    +  c:= 0; l:= 0;
    +  Result:= True;
    +  case Byte(p^) of
    +    $00..$7F: begin
    +      c:= Byte(p^);
    +      case c of
    +        $41..$5A: c := c + $20;
    +        else Result:= False;
    +      end;
    +      l:= 1;
    +    end;
    +    $C2..$DF: begin
    +      c:= (Byte(p^) shl 8) or Byte((p+1)^);
    +      case c of
    +        $CE91..$CE9F: c := c + $20; // Greek Characters
    +        $CEA0..$CEA9: c := c + $E0; // Greek Characters
    +        $D090..$D09F: c := c + $20; // Cyrillic alphabet
    +        $D0A0..$D0AF: c := c + $E0; // Cyrillic alphabet
    +        else Result:= False;
    +      end;
    +      l:= 2;
    +    end;
    +    $E0..$EF: begin
    +      c:= (Byte(p^) shl 16) or (Byte((p+1)^) shl 8) or Byte((p+2)^);
    +      case c of
    +        $EFBCA1..$EFBCBA: c:= c + $E0; // ZENKAKU alphabet
    +        else Result:= False;
    +      end;
    +      l:= 3;
    +    end;
    +    $F0..$F7: begin
    +      c:= (Byte(p^) shl 24) or (Byte((p+1)^) shl 16) or (Byte((p+2)^) shl 8)
    +       or Byte((p+3)^);
    +      case c of
    +        $F0FFFFFF..$F0FFFFFF: ;
    +        else Result:= False;
    +      end;
    +      l:= 4;
    +    end;
    +  end;
    +end;
    +
     function UTF8LowerCase(const s: utf8string): utf8string;
     var
    -  i: PtrInt;
    -  CharLen: integer;
    -  OldCode: LongWord;
    -  NewCode: LongWord;
    -  NewCharLen: integer;
    +  i, l0, l: integer;
    +  c: LongWord;
    +  p: PChar;
     begin
    -  Result:=s;
    -  i:=1;
    -  while i<=length(Result) do begin
    -    case Result[i] of
    -    { First ASCII chars }
    -    'A'..'Z':
    -      begin
    -        Result[i]:=chr(ord(Result[i])+32);
    -        inc(i);
    +  Result := '';
    +  i := 1;
    +  l0:= Length(s);
    +  p := @s[1];
    +  while i <= l0 do begin
    +    if UTF8LowerCaseRaw(p, c{%H-}, l{%H-}) then begin
    +      case l of
    +        1: Result := Result + Char(c);
    +        2: Result := Result + Char(c shr 8) + Char(c and $FF);
    +        3: Result := Result + Char(c shr 16) + Char((c shr 8) and $FF) +
    +         Char(c and $FF);
    +        4: Result := Result + Char(c shr 24) + Char((c shr 16) and $FF) +
    +         Char((c shr 8) and $FF) + Char(c and $FF);
    +        else Break;
           end;
    -    { Now chars with multiple bytes }
    -    #192..#240:
    -      begin
    -        OldCode:=UTF8CharacterToUnicode(@Result[i],CharLen);
    -        NewCode:=UnicodeLowercase(OldCode);
    -        if NewCode=OldCode then begin
    -          inc(i,CharLen);
    -        end else begin
    -          UniqueString(Result);
    -          NewCharLen:=UnicodeToUTF8(NewCode,@Result[i]);
    -          if CharLen=NewCharLen then begin
    -            inc(i,NewCharLen);
    -          end else begin
    -            // string size changed => use slower function
    -            Result:=UTF8LowercaseDynLength(s);
    -            exit;
    -          end;
    -        end;
    +    end else begin
    +      case l of
    +        1: Result := Result + p^;
    +        2: Result := Result + p^ + (p+1)^;
    +        3: Result := Result + p^ + (p+1)^ + (p+2)^;
    +        4: Result := Result + p^ + (p+1)^ + (p+2)^ + (p+3)^;
    +        else Break;
           end;
    -    else
    -      inc(i);
         end;
    +    Inc(p, l); Inc(i, l);
       end;
     end;
     
    @@ -1147,7 +1182,7 @@
      ------------------------------------------------------------------------------}
     function UTF8CompareStr(const S1, S2: utf8string): Integer;
     begin
    -  //Result := WideCompareStr(UTF8ToUTF16(S1), UTF8ToUTF16(S2));
    +  Result := CompareStr(S1, S2);
     end;
     
     {------------------------------------------------------------------------------
    @@ -1159,8 +1194,28 @@
       This function guarantees proper collation on all supported platforms.
      ------------------------------------------------------------------------------}
     function UTF8CompareText(const S1, S2: utf8string): Integer;
    +var
    +  l, l2: Integer;
    +  c, c2: LongWord;
    +  p1, p2: PChar;
     begin
    -  //Result := WideCompareText(UTF8ToUTF16(S1), UTF8ToUTF16(S2));
    +  p1 := @S1[1];
    +  p2 := @S2[1];
    +  c:= 0; c2:= 0; l:= 0; l2:= 0;
    +  while (p1^ <> #0) and (p2^ <> #0) do begin
    +    UTF8LowerCaseRaw(p1, c, l);
    +    UTF8LowerCaseRaw(p2, c2, l2);
    +    if (l <> l2) or (c <> c2) or (l = 0) then Break;
    +    Inc(p1, l); Inc(p2, l);
    +  end;
    +  if (l <> l2) or (c <> c2) or (l = 0) then
    +    Result := c - c2
    +  else if (p1^ = #0) and (p2^ = #0) then
    +    Result := 0
    +  else if p1^ = #0 then
    +    Result := -1
    +  else
    +    Result := 1;
     end;
     
     procedure InternalInit;
    
  • UTF8LowerCase&CompareText_v3.patch (4,577 bytes)
    Index: components/lazutils/lazutf8.pas
    ===================================================================
    --- components/lazutils/lazutf8.pas	(revision 32724)
    +++ components/lazutils/lazutf8.pas	(working copy)
    @@ -1094,46 +1094,81 @@
       end;
     end;
     
    +function UTF8LowerCaseRaw(p: PChar; var c: LongWord; var l: integer): boolean;
    +begin
    +  c:= 0; l:= 0;
    +  Result:= True;
    +  case Byte(p^) of
    +    $00..$7F: begin
    +      c:= Byte(p^);
    +      case c of
    +        $41..$5A: c := c + $20;
    +        else Result:= False;
    +      end;
    +      l:= 1;
    +    end;
    +    $C2..$DF: begin
    +      c:= (Byte(p^) shl 8) or Byte((p+1)^);
    +      case c of
    +        $CE91..$CE9F: c := c + $20; // Greek Characters
    +        $CEA0..$CEA9: c := c + $E0; // Greek Characters
    +        $D090..$D09F: c := c + $20; // Cyrillic alphabet
    +        $D0A0..$D0AF: c := c + $E0; // Cyrillic alphabet
    +        else Result:= False;
    +      end;
    +      l:= 2;
    +    end;
    +    $E0..$EF: begin
    +      c:= (Byte(p^) shl 16) or (Byte((p+1)^) shl 8) or Byte((p+2)^);
    +      case c of
    +        $EFBCA1..$EFBCBA: c:= c + $E0; // ZENKAKU alphabet
    +        else Result:= False;
    +      end;
    +      l:= 3;
    +    end;
    +    $F0..$F7: begin
    +      c:= (Byte(p^) shl 24) or (Byte((p+1)^) shl 16) or (Byte((p+2)^) shl 8)
    +       or Byte((p+3)^);
    +      case c of
    +        $F0FFFFFF..$F0FFFFFF: ;
    +        else Result:= False;
    +      end;
    +      l:= 4;
    +    end;
    +  end;
    +end;
    +
     function UTF8LowerCase(const s: utf8string): utf8string;
     var
    -  i: PtrInt;
    -  CharLen: integer;
    -  OldCode: LongWord;
    -  NewCode: LongWord;
    -  NewCharLen: integer;
    +  i, l0, l: integer;
    +  c: LongWord;
    +  p: PChar;
     begin
    -  Result:=s;
    -  i:=1;
    -  while i<=length(Result) do begin
    -    case Result[i] of
    -    { First ASCII chars }
    -    'A'..'Z':
    -      begin
    -        Result[i]:=chr(ord(Result[i])+32);
    -        inc(i);
    +  Result := '';
    +  i := 1;
    +  l0:= Length(s);
    +  p := @s[1];
    +  while i <= l0 do begin
    +    if UTF8LowerCaseRaw(p, c{%H-}, l{%H-}) then begin
    +      case l of
    +        1: Result := Result + Char(c);
    +        2: Result := Result + Char(c shr 8) + Char(c and $FF);
    +        3: Result := Result + Char(c shr 16) + Char((c shr 8) and $FF) +
    +         Char(c and $FF);
    +        4: Result := Result + Char(c shr 24) + Char((c shr 16) and $FF) +
    +         Char((c shr 8) and $FF) + Char(c and $FF);
    +        else Break;
           end;
    -    { Now chars with multiple bytes }
    -    #192..#240:
    -      begin
    -        OldCode:=UTF8CharacterToUnicode(@Result[i],CharLen);
    -        NewCode:=UnicodeLowercase(OldCode);
    -        if NewCode=OldCode then begin
    -          inc(i,CharLen);
    -        end else begin
    -          UniqueString(Result);
    -          NewCharLen:=UnicodeToUTF8(NewCode,@Result[i]);
    -          if CharLen=NewCharLen then begin
    -            inc(i,NewCharLen);
    -          end else begin
    -            // string size changed => use slower function
    -            Result:=UTF8LowercaseDynLength(s);
    -            exit;
    -          end;
    -        end;
    +    end else begin
    +      case l of
    +        1: Result := Result + p^;
    +        2: Result := Result + p^ + (p+1)^;
    +        3: Result := Result + p^ + (p+1)^ + (p+2)^;
    +        4: Result := Result + p^ + (p+1)^ + (p+2)^ + (p+3)^;
    +        else Break;
           end;
    -    else
    -      inc(i);
         end;
    +    Inc(p, l); Inc(i, l);
       end;
     end;
     
    @@ -1147,7 +1182,7 @@
      ------------------------------------------------------------------------------}
     function UTF8CompareStr(const S1, S2: utf8string): Integer;
     begin
    -  //Result := WideCompareStr(UTF8ToUTF16(S1), UTF8ToUTF16(S2));
    +  Result := CompareStr(S1, S2);
     end;
     
     {------------------------------------------------------------------------------
    @@ -1159,8 +1194,27 @@
       This function guarantees proper collation on all supported platforms.
      ------------------------------------------------------------------------------}
     function UTF8CompareText(const S1, S2: utf8string): Integer;
    +var
    +  l, l2: Integer;
    +  c, c2: LongWord;
    +  p1, p2: PChar;
     begin
    -  //Result := WideCompareText(UTF8ToUTF16(S1), UTF8ToUTF16(S2));
    +  p1 := @S1[1];
    +  p2 := @S2[1];
    +  if (p1 <> nil) and (p2 <> nil) then begin
    +    c:= 0; c2:= 0; l:= 0; l2:= 0;
    +    while (p1^ <> #0) and (p2^ <> #0) do begin
    +      UTF8LowerCaseRaw(p1, c, l);
    +      UTF8LowerCaseRaw(p2, c2, l2);
    +      if (l <> l2) or (c <> c2) or (l = 0) then Break;
    +      Inc(p1, l); Inc(p2, l);
    +    end;
    +    if (p1^ = #0) or (p2^ = #0) then
    +      Result := Byte(p1^) - Byte(p2^)
    +    else
    +      Result := c - c2
    +  end else
    +    Result := p1 - p2;
     end;
     
     procedure InternalInit;
    

Activities

Zeljan Rikalo

2011-10-04 11:00

developer   ~0052489

It must use UTF8CompareXXX otherwise collation won't work as expected.

function TStringListUTF8.DoCompareText(const s1, s2: string): PtrInt;
begin
  if CaseSensitive then
    Result:= UTF8CompareStr(s1,s2)
  else
    Result:= UTF8CompareText(s1,s2);
end;

malcome

2011-10-04 14:17

reporter   ~0052505

Last edited: 2011-10-04 14:30

@Zeljan

It is probably right. But it is very slow(Especially in East Asia).

Zeljan Rikalo

2011-10-05 21:56

developer   ~0052598

malcome, but without UTF8CompareText results are incorrect ;) so what's better ? correct or fast ?

malcome

2011-10-06 02:06

reporter   ~0052605

Last edited: 2011-10-06 02:32

The present code:
function UTF8CompareStr(const S1, S2: String): Integer;
begin
  Result := WideCompareStr(UTF8ToUTF16(S1), UTF8ToUTF16(S2));
end;

UTF8ToUTF16 is High cost. Especially in East Asia.
IMHO create real UTF8CompareStr and UTF8CompareText(no use UTF8ToUTF16) is the best method. .

Or create TStringListW and

function TStringListW.DoCompareText(const s1, s2: WideString): PtrInt;
begin
  if CaseSensitive then
    Result:= WideCompareStr(s1, s2)
  else
    Result:= WideCompareText(s1, s2);
end;

Probably current TStringListUTF8 with sorted is too slow to use.

Zeljan Rikalo

2011-10-06 07:56

developer   ~0052609

maybe your solution works, but I doubt about correct collation with different lanaguages (eg. I'm Croatian and under qtlcl-windows your solution have wrong collation, also there's issue about sorting (collation) of Hebrew strings in TStringGrid which is fixed by using UTF8CompareXX functions, and it does not work with your solution as expected). Maybe Felipe can add something like that for east-asian (or other languages) , like slow/fast compare switch ... dunno).

Felipe Monteiro de Carvalho

2011-10-06 08:02

developer   ~0052610

> IMHO create real UTF8CompareStr and UTF8CompareText(no use UTF8ToUTF16) is the best method.

Sure, will you create a patch for this?

We already have some upper/lower case tables, although they are in an ifdef. I would say it is time to remove the ifdefs and start using the tables.

Felipe Monteiro de Carvalho

2011-10-06 08:33

developer   ~0052611

Last edited: 2011-10-06 08:34

I prepared the environment for working on this in rev32713. UTF8CompareStr and Text should be fixed in this unit:

components/lazutils/lazutf8.pas

Not in LCLProc.

2011-10-06 11:26

 

UTF8Compare.patch (3,108 bytes)
Index: components/lazutils/lazutf8.pas
===================================================================
--- components/lazutils/lazutf8.pas	(revision 32717)
+++ components/lazutils/lazutf8.pas	(working copy)
@@ -1147,7 +1147,7 @@
  ------------------------------------------------------------------------------}
 function UTF8CompareStr(const S1, S2: utf8string): Integer;
 begin
-  //Result := WideCompareStr(UTF8ToUTF16(S1), UTF8ToUTF16(S2));
+  Result := CompareStr(S1, S2);
 end;
 
 {------------------------------------------------------------------------------
@@ -1159,8 +1159,90 @@
   This function guarantees proper collation on all supported platforms.
  ------------------------------------------------------------------------------}
 function UTF8CompareText(const S1, S2: utf8string): Integer;
+
+  procedure _UTF8LowerCase(p: PChar; var c: Int64; var l: integer);
+  begin
+    c:= 0; l:= 0;
+    case Byte(p^) of
+      $00..$7F: begin
+        c:= Byte(p^);
+        case c of
+          $61..$7A: c := c - $20;
+        end;
+        l:= 1;
+      end;
+      $C2..$DF: begin
+        c:= (Byte(p^) shl 8) or Byte((p+1)^);
+        case c of
+          $CEB1..$CEBF: c := c - $20; // Greek Characters
+          $CF80..$CF89: c := c - $E0; // Greek Characters
+        end;
+        l:= 2;
+      end;
+      $E0..$EF: begin
+        c:= (Byte(p^) shl 16) or (Byte((p+1)^) shl 8) or Byte((p+2)^);
+        case c of
+          $EFBD81..$EFBD9A: begin // ZENKAKU alphabet
+            c:= $EFBC00 or (Byte((p+2)^) + $20);
+          end;
+        end;
+        l:= 3;
+      end;
+      $F0..$F7: begin
+        c:= (Byte(p^) shl 24) or (Byte((p+1)^) shl 16) or (Byte((p+2)^) shl 8)
+         or Byte((p+3)^);
+        case c of
+          $F0FFFFFF..$F0FFFFFF: ;
+        end;
+        l:= 4;
+      end;
+      $F8..$FB: begin
+        c:= (Byte(p^) shl 32) or (Byte((p+1)^) shl 24) or (Byte((p+2)^) shl 16)
+         or (Byte((p+3)^) shl 8) or Byte((p+4)^);
+        case c of
+          $F8FFFFFFFF..$F8FFFFFFFF: ;
+        end;
+        l:= 5;
+      end;
+      $FC..$FD: begin
+        c:= (Byte(p^) shl 40) or (Byte((p+1)^) shl 32) or (Byte((p+2)^) shl 24)
+         or (Byte((p+3)^) shl 16) or (Byte((p+4)^) shl 8) or Byte((p+5)^);
+        case c of
+           $FCFFFFFFFFFF..$FCFFFFFFFFFF: ;
+        end;
+        l:= 6;
+      end;
+    end;
+  end;
+
+var
+  i, l, l2, Count, Count1, Count2: Integer;
+  c, c2: Int64;
+  p1, p2: PChar;
 begin
-  //Result := WideCompareText(UTF8ToUTF16(S1), UTF8ToUTF16(S2));
+  Count1 := UTF8Length(S1);
+  Count2 := UTF8Length(S2);
+  if Count1 > Count2 then
+    Count := Count2
+  else
+    Count := Count1;
+  i := 0;
+  if Count > 0 then
+  begin
+    p1 := @S1[1];
+    p2 := @S2[1];
+    while i < Count do
+    begin
+      _UTF8LowerCase(p1, c{%H-}, l{%H-});
+      _UTF8LowerCase(p2, c2{%H-}, l2{%H-});
+      if (l <> l2) or (c <> c2) then Break;
+      Inc(p1, l); Inc(p2, l); Inc(i, l);
+    end;
+  end;
+  if i < Count then
+    Result := c - c2
+  else
+    Result := Count1 - Count2;
 end;
 
 procedure InternalInit;
UTF8Compare.patch (3,108 bytes)

malcome

2011-10-06 11:31

reporter   ~0052622

Last edited: 2011-10-06 12:48

I created a patch. But I mounted locale which I know. Please add other locale.

The table was not used to avoid the cost of UTF8Char -> WideChar.

2011-10-06 13:32

 

UTF8Compare.patch_v2 (2,547 bytes)

malcome

2011-10-06 13:37

reporter   ~0052641

Last edited: 2011-10-06 13:48

I updated the patch.(Sorry, file name was mistaken. UTF8Compare.patch_v2 -> UTF8Compare_v2.patch)

.................

I create UTF8Compare_v3.patch for corrected ∞ loop.

2011-10-06 13:46

 

UTF8Compare_v3.patch (2,525 bytes)
Index: lazutils/lazutf8.pas
===================================================================
--- lazutils/lazutf8.pas	(revision 32719)
+++ lazutils/lazutf8.pas	(working copy)
@@ -1147,7 +1147,7 @@
  ------------------------------------------------------------------------------}
 function UTF8CompareStr(const S1, S2: utf8string): Integer;
 begin
-  //Result := WideCompareStr(UTF8ToUTF16(S1), UTF8ToUTF16(S2));
+  Result := CompareStr(S1, S2);
 end;
 
 {------------------------------------------------------------------------------
@@ -1159,8 +1159,73 @@
   This function guarantees proper collation on all supported platforms.
  ------------------------------------------------------------------------------}
 function UTF8CompareText(const S1, S2: utf8string): Integer;
+
+  procedure _UTF8LowerCase(p: PChar; var c, l: integer);
+  begin
+    c:= 0; l:= 0;
+    case Byte(p^) of
+      $00..$7F: begin
+        c:= Byte(p^);
+        case c of
+          $61..$7A: c := c - $20;
+        end;
+        l:= 1;
+      end;
+      $C2..$DF: begin
+        c:= (Byte(p^) shl 8) or Byte((p+1)^);
+        case c of
+          $CEB1..$CEBF: c := c - $20; // Greek Characters
+          $CF80..$CF89: c := c - $E0; // Greek Characters
+        end;
+        l:= 2;
+      end;
+      $E0..$EF: begin
+        c:= (Byte(p^) shl 16) or (Byte((p+1)^) shl 8) or Byte((p+2)^);
+        case c of
+          $EFBD81..$EFBD9A: begin // ZENKAKU alphabet
+            c:= $EFBC00 or (Byte((p+2)^) + $20);
+          end;
+        end;
+        l:= 3;
+      end;
+      $F0..$F7: begin
+        c:= (Byte(p^) shl 24) or (Byte((p+1)^) shl 16) or (Byte((p+2)^) shl 8)
+         or Byte((p+3)^);
+        case c of
+          $F0FFFFFF..$F0FFFFFF: ;
+        end;
+        l:= 4;
+      end;
+    end;
+  end;
+
+var
+  i, c, c2, l, l2, Count, Count1, Count2: Integer;
+  p1, p2: PChar;
 begin
-  //Result := WideCompareText(UTF8ToUTF16(S1), UTF8ToUTF16(S2));
+  Count1 := UTF8Length(S1);
+  Count2 := UTF8Length(S2);
+  if Count1 > Count2 then
+    Count := Count2
+  else
+    Count := Count1;
+  i := 0;
+  if Count > 0 then
+  begin
+    p1 := @S1[1];
+    p2 := @S2[1];
+    while i < Count do
+    begin
+      _UTF8LowerCase(p1, c{%H-}, l{%H-});
+      _UTF8LowerCase(p2, c2{%H-}, l2{%H-});
+      if (l <> l2) or (c <> c2) or (l = 0) then Break;
+      Inc(p1, l); Inc(p2, l); Inc(i, l);
+    end;
+  end;
+  if i < Count then
+    Result := c - c2
+  else
+    Result := Count1 - Count2;
 end;
 
 procedure InternalInit;
UTF8Compare_v3.patch (2,525 bytes)

Felipe Monteiro de Carvalho

2011-10-06 16:30

developer   ~0052661

Last edited: 2011-10-06 16:32

This cannot be applied like this ... if UTF8LowerCase needs to be improved, then it should be improved...

malcome

2011-10-06 17:34

reporter   ~0052666

Please upgrade in your style.
However, please hold speed ;-)

Felipe Monteiro de Carvalho

2011-10-06 18:09

developer   ~0052667

The changes to this are not trivial, I would prefer if you create a new patch which improves UTF8LowerCase.

2011-10-06 23:47

 

UTF8LowerCase&CompareText.patch (4,662 bytes)
Index: components/lazutils/lazutf8.pas
===================================================================
--- components/lazutils/lazutf8.pas	(revision 32724)
+++ components/lazutils/lazutf8.pas	(working copy)
@@ -1094,46 +1094,81 @@
   end;
 end;
 
+function UTF8LowerCaseRaw(p: PChar; var c: LongWord; var l: integer): boolean;
+begin
+  c:= 0; l:= 0;
+  Result:= True;
+  case Byte(p^) of
+    $00..$7F: begin
+      c:= Byte(p^);
+      case c of
+        $41..$5A: c := c + $20;
+        else Result:= False;
+      end;
+      l:= 1;
+    end;
+    $C2..$DF: begin
+      c:= (Byte(p^) shl 8) or Byte((p+1)^);
+      case c of
+        $CE91..$CE9F: c := c + $20; // Greek Characters
+        $CEA0..$CEA9: c := c + $E0; // Greek Characters
+        $D090..$D09F: c := c + $20; // Cyrillic alphabet
+        $D0A0..$D0AF: c := c + $E0; // Cyrillic alphabet
+        else Result:= False;
+      end;
+      l:= 2;
+    end;
+    $E0..$EF: begin
+      c:= (Byte(p^) shl 16) or (Byte((p+1)^) shl 8) or Byte((p+2)^);
+      case c of
+        $EFBCA1..$EFBCBA: c:= c + $E0; // ZENKAKU alphabet
+        else Result:= False;
+      end;
+      l:= 3;
+    end;
+    $F0..$F7: begin
+      c:= (Byte(p^) shl 24) or (Byte((p+1)^) shl 16) or (Byte((p+2)^) shl 8)
+       or Byte((p+3)^);
+      case c of
+        $F0FFFFFF..$F0FFFFFF: ;
+        else Result:= False;
+      end;
+      l:= 4;
+    end;
+  end;
+end;
+
 function UTF8LowerCase(const s: utf8string): utf8string;
 var
-  i: PtrInt;
-  CharLen: integer;
-  OldCode: LongWord;
-  NewCode: LongWord;
-  NewCharLen: integer;
+  i, l0, l: integer;
+  c: LongWord;
+  p: PChar;
 begin
-  Result:=s;
-  i:=1;
-  while i<=length(Result) do begin
-    case Result[i] of
-    { First ASCII chars }
-    'A'..'Z':
-      begin
-        Result[i]:=chr(ord(Result[i])+32);
-        inc(i);
+  Result := '';
+  i := 1;
+  l0:= Length(s);
+  p := @s[1];
+  while i <= l0 do begin
+    if UTF8LowerCaseRaw(p, c{%H-}, l{%H-}) then begin
+      case l of
+        1: Result := Result + Char(c);
+        2: Result := Result + Char(c shr 8) + Char(c and $FF);
+        3: Result := Result + Char(c shr 16) + Char((c shr 8) and $FF) +
+         Char(c and $FF);
+        4: Result := Result + Char(c shr 24) + Char((c shr 16) and $FF) +
+         Char((c shr 8) and $FF) + Char(c and $FF);
+        else Break;
       end;
-    { Now chars with multiple bytes }
-    #192..#240:
-      begin
-        OldCode:=UTF8CharacterToUnicode(@Result[i],CharLen);
-        NewCode:=UnicodeLowercase(OldCode);
-        if NewCode=OldCode then begin
-          inc(i,CharLen);
-        end else begin
-          UniqueString(Result);
-          NewCharLen:=UnicodeToUTF8(NewCode,@Result[i]);
-          if CharLen=NewCharLen then begin
-            inc(i,NewCharLen);
-          end else begin
-            // string size changed => use slower function
-            Result:=UTF8LowercaseDynLength(s);
-            exit;
-          end;
-        end;
+    end else begin
+      case l of
+        1: Result := Result + p^;
+        2: Result := Result + p^ + (p+1)^;
+        3: Result := Result + p^ + (p+1)^ + (p+2)^;
+        4: Result := Result + p^ + (p+1)^ + (p+2)^ + (p+3)^;
+        else Break;
       end;
-    else
-      inc(i);
     end;
+    Inc(p, l); Inc(i, l);
   end;
 end;
 
@@ -1147,7 +1182,7 @@
  ------------------------------------------------------------------------------}
 function UTF8CompareStr(const S1, S2: utf8string): Integer;
 begin
-  //Result := WideCompareStr(UTF8ToUTF16(S1), UTF8ToUTF16(S2));
+  Result := CompareStr(S1, S2);
 end;
 
 {------------------------------------------------------------------------------
@@ -1159,8 +1194,33 @@
   This function guarantees proper collation on all supported platforms.
  ------------------------------------------------------------------------------}
 function UTF8CompareText(const S1, S2: utf8string): Integer;
+var
+  i, l, l2, Count, Count1, Count2: Integer;
+  c, c2: LongWord;
+  p1, p2: PChar;
 begin
-  //Result := WideCompareText(UTF8ToUTF16(S1), UTF8ToUTF16(S2));
+  Count1 := UTF8Length(S1);
+  Count2 := UTF8Length(S2);
+  if Count1 > Count2 then
+    Count := Count2
+  else
+    Count := Count1;
+  i := 0;
+  if Count > 0 then begin
+    p1 := @S1[1];
+    p2 := @S2[1];
+    while i < Count do begin
+      UTF8LowerCaseRaw(p1, c{%H-}, l{%H-});
+      UTF8LowerCaseRaw(p2, c2{%H-}, l2{%H-});
+      if (l <> l2) or (c <> c2) or (l = 0) then Break;
+      Inc(p1, l); Inc(p2, l);
+      Inc(i);
+    end;
+  end;
+  if i < Count then
+    Result := c - c2
+  else
+    Result := Count1 - Count2;
 end;
 
 procedure InternalInit;

malcome

2011-10-06 23:49

reporter   ~0052683

I created a new patch. OK now?

2011-10-07 01:12

 

UTF8LowerCase&CompareText_v2.patch (4,566 bytes)
Index: components/lazutils/lazutf8.pas
===================================================================
--- components/lazutils/lazutf8.pas	(revision 32724)
+++ components/lazutils/lazutf8.pas	(working copy)
@@ -1094,46 +1094,81 @@
   end;
 end;
 
+function UTF8LowerCaseRaw(p: PChar; var c: LongWord; var l: integer): boolean;
+begin
+  c:= 0; l:= 0;
+  Result:= True;
+  case Byte(p^) of
+    $00..$7F: begin
+      c:= Byte(p^);
+      case c of
+        $41..$5A: c := c + $20;
+        else Result:= False;
+      end;
+      l:= 1;
+    end;
+    $C2..$DF: begin
+      c:= (Byte(p^) shl 8) or Byte((p+1)^);
+      case c of
+        $CE91..$CE9F: c := c + $20; // Greek Characters
+        $CEA0..$CEA9: c := c + $E0; // Greek Characters
+        $D090..$D09F: c := c + $20; // Cyrillic alphabet
+        $D0A0..$D0AF: c := c + $E0; // Cyrillic alphabet
+        else Result:= False;
+      end;
+      l:= 2;
+    end;
+    $E0..$EF: begin
+      c:= (Byte(p^) shl 16) or (Byte((p+1)^) shl 8) or Byte((p+2)^);
+      case c of
+        $EFBCA1..$EFBCBA: c:= c + $E0; // ZENKAKU alphabet
+        else Result:= False;
+      end;
+      l:= 3;
+    end;
+    $F0..$F7: begin
+      c:= (Byte(p^) shl 24) or (Byte((p+1)^) shl 16) or (Byte((p+2)^) shl 8)
+       or Byte((p+3)^);
+      case c of
+        $F0FFFFFF..$F0FFFFFF: ;
+        else Result:= False;
+      end;
+      l:= 4;
+    end;
+  end;
+end;
+
 function UTF8LowerCase(const s: utf8string): utf8string;
 var
-  i: PtrInt;
-  CharLen: integer;
-  OldCode: LongWord;
-  NewCode: LongWord;
-  NewCharLen: integer;
+  i, l0, l: integer;
+  c: LongWord;
+  p: PChar;
 begin
-  Result:=s;
-  i:=1;
-  while i<=length(Result) do begin
-    case Result[i] of
-    { First ASCII chars }
-    'A'..'Z':
-      begin
-        Result[i]:=chr(ord(Result[i])+32);
-        inc(i);
+  Result := '';
+  i := 1;
+  l0:= Length(s);
+  p := @s[1];
+  while i <= l0 do begin
+    if UTF8LowerCaseRaw(p, c{%H-}, l{%H-}) then begin
+      case l of
+        1: Result := Result + Char(c);
+        2: Result := Result + Char(c shr 8) + Char(c and $FF);
+        3: Result := Result + Char(c shr 16) + Char((c shr 8) and $FF) +
+         Char(c and $FF);
+        4: Result := Result + Char(c shr 24) + Char((c shr 16) and $FF) +
+         Char((c shr 8) and $FF) + Char(c and $FF);
+        else Break;
       end;
-    { Now chars with multiple bytes }
-    #192..#240:
-      begin
-        OldCode:=UTF8CharacterToUnicode(@Result[i],CharLen);
-        NewCode:=UnicodeLowercase(OldCode);
-        if NewCode=OldCode then begin
-          inc(i,CharLen);
-        end else begin
-          UniqueString(Result);
-          NewCharLen:=UnicodeToUTF8(NewCode,@Result[i]);
-          if CharLen=NewCharLen then begin
-            inc(i,NewCharLen);
-          end else begin
-            // string size changed => use slower function
-            Result:=UTF8LowercaseDynLength(s);
-            exit;
-          end;
-        end;
+    end else begin
+      case l of
+        1: Result := Result + p^;
+        2: Result := Result + p^ + (p+1)^;
+        3: Result := Result + p^ + (p+1)^ + (p+2)^;
+        4: Result := Result + p^ + (p+1)^ + (p+2)^ + (p+3)^;
+        else Break;
       end;
-    else
-      inc(i);
     end;
+    Inc(p, l); Inc(i, l);
   end;
 end;
 
@@ -1147,7 +1182,7 @@
  ------------------------------------------------------------------------------}
 function UTF8CompareStr(const S1, S2: utf8string): Integer;
 begin
-  //Result := WideCompareStr(UTF8ToUTF16(S1), UTF8ToUTF16(S2));
+  Result := CompareStr(S1, S2);
 end;
 
 {------------------------------------------------------------------------------
@@ -1159,8 +1194,28 @@
   This function guarantees proper collation on all supported platforms.
  ------------------------------------------------------------------------------}
 function UTF8CompareText(const S1, S2: utf8string): Integer;
+var
+  l, l2: Integer;
+  c, c2: LongWord;
+  p1, p2: PChar;
 begin
-  //Result := WideCompareText(UTF8ToUTF16(S1), UTF8ToUTF16(S2));
+  p1 := @S1[1];
+  p2 := @S2[1];
+  c:= 0; c2:= 0; l:= 0; l2:= 0;
+  while (p1^ <> #0) and (p2^ <> #0) do begin
+    UTF8LowerCaseRaw(p1, c, l);
+    UTF8LowerCaseRaw(p2, c2, l2);
+    if (l <> l2) or (c <> c2) or (l = 0) then Break;
+    Inc(p1, l); Inc(p2, l);
+  end;
+  if (l <> l2) or (c <> c2) or (l = 0) then
+    Result := c - c2
+  else if (p1^ = #0) and (p2^ = #0) then
+    Result := 0
+  else if p1^ = #0 then
+    Result := -1
+  else
+    Result := 1;
 end;
 
 procedure InternalInit;

malcome

2011-10-07 01:13

reporter   ~0052684

More speed up in UTF8LowerCase&CompareText_v2.patch.

2011-10-07 03:17

 

UTF8LowerCase&CompareText_v3.patch (4,577 bytes)
Index: components/lazutils/lazutf8.pas
===================================================================
--- components/lazutils/lazutf8.pas	(revision 32724)
+++ components/lazutils/lazutf8.pas	(working copy)
@@ -1094,46 +1094,81 @@
   end;
 end;
 
+function UTF8LowerCaseRaw(p: PChar; var c: LongWord; var l: integer): boolean;
+begin
+  c:= 0; l:= 0;
+  Result:= True;
+  case Byte(p^) of
+    $00..$7F: begin
+      c:= Byte(p^);
+      case c of
+        $41..$5A: c := c + $20;
+        else Result:= False;
+      end;
+      l:= 1;
+    end;
+    $C2..$DF: begin
+      c:= (Byte(p^) shl 8) or Byte((p+1)^);
+      case c of
+        $CE91..$CE9F: c := c + $20; // Greek Characters
+        $CEA0..$CEA9: c := c + $E0; // Greek Characters
+        $D090..$D09F: c := c + $20; // Cyrillic alphabet
+        $D0A0..$D0AF: c := c + $E0; // Cyrillic alphabet
+        else Result:= False;
+      end;
+      l:= 2;
+    end;
+    $E0..$EF: begin
+      c:= (Byte(p^) shl 16) or (Byte((p+1)^) shl 8) or Byte((p+2)^);
+      case c of
+        $EFBCA1..$EFBCBA: c:= c + $E0; // ZENKAKU alphabet
+        else Result:= False;
+      end;
+      l:= 3;
+    end;
+    $F0..$F7: begin
+      c:= (Byte(p^) shl 24) or (Byte((p+1)^) shl 16) or (Byte((p+2)^) shl 8)
+       or Byte((p+3)^);
+      case c of
+        $F0FFFFFF..$F0FFFFFF: ;
+        else Result:= False;
+      end;
+      l:= 4;
+    end;
+  end;
+end;
+
 function UTF8LowerCase(const s: utf8string): utf8string;
 var
-  i: PtrInt;
-  CharLen: integer;
-  OldCode: LongWord;
-  NewCode: LongWord;
-  NewCharLen: integer;
+  i, l0, l: integer;
+  c: LongWord;
+  p: PChar;
 begin
-  Result:=s;
-  i:=1;
-  while i<=length(Result) do begin
-    case Result[i] of
-    { First ASCII chars }
-    'A'..'Z':
-      begin
-        Result[i]:=chr(ord(Result[i])+32);
-        inc(i);
+  Result := '';
+  i := 1;
+  l0:= Length(s);
+  p := @s[1];
+  while i <= l0 do begin
+    if UTF8LowerCaseRaw(p, c{%H-}, l{%H-}) then begin
+      case l of
+        1: Result := Result + Char(c);
+        2: Result := Result + Char(c shr 8) + Char(c and $FF);
+        3: Result := Result + Char(c shr 16) + Char((c shr 8) and $FF) +
+         Char(c and $FF);
+        4: Result := Result + Char(c shr 24) + Char((c shr 16) and $FF) +
+         Char((c shr 8) and $FF) + Char(c and $FF);
+        else Break;
       end;
-    { Now chars with multiple bytes }
-    #192..#240:
-      begin
-        OldCode:=UTF8CharacterToUnicode(@Result[i],CharLen);
-        NewCode:=UnicodeLowercase(OldCode);
-        if NewCode=OldCode then begin
-          inc(i,CharLen);
-        end else begin
-          UniqueString(Result);
-          NewCharLen:=UnicodeToUTF8(NewCode,@Result[i]);
-          if CharLen=NewCharLen then begin
-            inc(i,NewCharLen);
-          end else begin
-            // string size changed => use slower function
-            Result:=UTF8LowercaseDynLength(s);
-            exit;
-          end;
-        end;
+    end else begin
+      case l of
+        1: Result := Result + p^;
+        2: Result := Result + p^ + (p+1)^;
+        3: Result := Result + p^ + (p+1)^ + (p+2)^;
+        4: Result := Result + p^ + (p+1)^ + (p+2)^ + (p+3)^;
+        else Break;
       end;
-    else
-      inc(i);
     end;
+    Inc(p, l); Inc(i, l);
   end;
 end;
 
@@ -1147,7 +1182,7 @@
  ------------------------------------------------------------------------------}
 function UTF8CompareStr(const S1, S2: utf8string): Integer;
 begin
-  //Result := WideCompareStr(UTF8ToUTF16(S1), UTF8ToUTF16(S2));
+  Result := CompareStr(S1, S2);
 end;
 
 {------------------------------------------------------------------------------
@@ -1159,8 +1194,27 @@
   This function guarantees proper collation on all supported platforms.
  ------------------------------------------------------------------------------}
 function UTF8CompareText(const S1, S2: utf8string): Integer;
+var
+  l, l2: Integer;
+  c, c2: LongWord;
+  p1, p2: PChar;
 begin
-  //Result := WideCompareText(UTF8ToUTF16(S1), UTF8ToUTF16(S2));
+  p1 := @S1[1];
+  p2 := @S2[1];
+  if (p1 <> nil) and (p2 <> nil) then begin
+    c:= 0; c2:= 0; l:= 0; l2:= 0;
+    while (p1^ <> #0) and (p2^ <> #0) do begin
+      UTF8LowerCaseRaw(p1, c, l);
+      UTF8LowerCaseRaw(p2, c2, l2);
+      if (l <> l2) or (c <> c2) or (l = 0) then Break;
+      Inc(p1, l); Inc(p2, l);
+    end;
+    if (p1^ = #0) or (p2^ = #0) then
+      Result := Byte(p1^) - Byte(p2^)
+    else
+      Result := c - c2
+  end else
+    Result := p1 - p2;
 end;
 
 procedure InternalInit;

malcome

2011-10-07 03:18

reporter   ~0052688

UTF8LowerCase&CompareText_v3.patch fix the problem string is null. Thanks FPC team.

Felipe Monteiro de Carvalho

2011-10-07 07:57

developer   ~0052696

You routine naming and parameter naming are not really good. Neither of them intuitively explain what they do. This:

function UTF8LowerCaseRaw(p: PChar; var c: LongWord; var l: integer): Boolean;

Should be:

{
  AInChar -> The input character in UTF-8
  AOutChar -> The output character in UTF-8
  AOutLen -> The length of the output character in bytes
}
function UTF8LowerCaseCharacter(AInChar: PChar; var AOutChar: LongWord; var AOutLen: integer): Boolean;

Your variable naming in UTF8LowerCase is also very bad. Compare your variables:

+ i, l0, l: integer;
+ c: LongWord;
+ p: PChar;

With the previous ones:

 var
- i: PtrInt;
- CharLen: integer;
- OldCode: LongWord;
- NewCode: LongWord;
- NewCharLen: integer;

The previous ones where much more intuitive ... you should use variable names which explain what they are.

The same for UTF8CompareText

malcome

2011-10-07 08:24

reporter   ~0052702

Last edited: 2011-10-07 08:27

Please upgrade in your style. My English & Naming sense is poor ;-)

Felipe Monteiro de Carvalho

2011-10-07 10:33

developer   ~0052717

What about now? Could you update to the latest revision and verify if it is fast enough for you?

The comparision still could be optimized, maybe by cutting the input string in blocks of 100 chars.

malcome

2011-10-07 10:58

reporter   ~0052719

Last edited: 2011-10-07 11:18

I leave it to you.
Regrettably your code is very slower than my code,
But I mounted TStringListUTF8_mod class in My App, So do not worried.
I use big StringList with sorted in my App.

malcome

2011-10-07 11:30

reporter   ~0052721

Last edited: 2011-10-07 12:24

S1Lower := UTF8LowerCase(S1);
  S2Lower := UTF8LowerCase(S2);
  Result := UTF8CompareStr(S1Lower, S2Lower);

and

  S1w := UTF8toUTF16(S1);
  S2w := UTF8toUTF16(S2);
  Result := WideCompareText(S1w, S2w);

I think that both have the problem similarly. Especially UTF8String in East Asia is long, so UTF8toUTF16(AllString) and UTF8LowerCase(AllString) is high cost.

Felipe Monteiro de Carvalho

2011-10-07 16:59

developer   ~0052742

Could you try again with the latest code from lazutils? I did a lot of speed optimizations and fixes today.

Note that utf8classes will not automatically use lazutils, by default it will use LCLProc. We are still working on this.

Felipe Monteiro de Carvalho

2011-10-07 17:01

developer   ~0052743

Just for you to know:

  lStartTime := Now;
  for i := 0 to 99999 do
  begin
    Str := UTF8LowerCase('名字叫嘉英,嘉陵江的嘉,英國的英 => 名字叫嘉英,嘉陵江的嘉,英國的英');
  end;
  lTimeDiff := Now - lStartTime;
  WriteLn('LowerCase Performance test took: ', DateTimeToMilliseconds(lTimeDiff), ' ms');

An iteration of 10000 times UTF8LowerString in this test string takes only 36ms in my computer =D

Felipe Monteiro de Carvalho

2011-10-07 17:28

developer   ~0052746

I will close the issue now, but it would be nice if you can answer my question about the performance of the new utf8lowercase in the forum for example.

malcome

2011-10-08 01:48

reporter   ~0052761

Does the time value in your computer have a meaning?
Is it a relative on many My customers.

Issue History

Date Modified Username Field Change
2011-10-04 06:11 malcome New Issue
2011-10-04 06:11 malcome Widgetset => Win32/Win64
2011-10-04 11:00 Zeljan Rikalo Note Added: 0052489
2011-10-04 12:31 Zeljan Rikalo Status new => assigned
2011-10-04 12:31 Zeljan Rikalo Assigned To => Felipe Monteiro de Carvalho
2011-10-04 14:17 malcome Note Added: 0052505
2011-10-04 14:22 malcome Note Edited: 0052505
2011-10-04 14:23 malcome Note Edited: 0052505
2011-10-04 14:30 malcome Note Edited: 0052505
2011-10-05 13:59 Felipe Monteiro de Carvalho Fixed in Revision => 32703
2011-10-05 13:59 Felipe Monteiro de Carvalho LazTarget => -
2011-10-05 13:59 Felipe Monteiro de Carvalho Status assigned => resolved
2011-10-05 13:59 Felipe Monteiro de Carvalho Fixed in Version => 0.9.31 (SVN)
2011-10-05 13:59 Felipe Monteiro de Carvalho Resolution open => fixed
2011-10-05 21:56 Zeljan Rikalo Note Added: 0052598
2011-10-06 02:06 malcome Status resolved => assigned
2011-10-06 02:06 malcome Resolution fixed => reopened
2011-10-06 02:06 malcome Note Added: 0052605
2011-10-06 02:10 malcome Note Edited: 0052605
2011-10-06 02:32 malcome Note Edited: 0052605
2011-10-06 07:56 Zeljan Rikalo Note Added: 0052609
2011-10-06 08:02 Felipe Monteiro de Carvalho Note Added: 0052610
2011-10-06 08:33 Felipe Monteiro de Carvalho Note Added: 0052611
2011-10-06 08:34 Felipe Monteiro de Carvalho Note Edited: 0052611
2011-10-06 11:26 malcome File Added: UTF8Compare.patch
2011-10-06 11:31 malcome Note Added: 0052622
2011-10-06 12:48 malcome Note Edited: 0052622
2011-10-06 13:32 malcome File Added: UTF8Compare.patch_v2
2011-10-06 13:37 malcome Note Added: 0052641
2011-10-06 13:46 malcome File Added: UTF8Compare_v3.patch
2011-10-06 13:48 malcome Note Edited: 0052641
2011-10-06 16:30 Felipe Monteiro de Carvalho Note Added: 0052661
2011-10-06 16:32 Felipe Monteiro de Carvalho Note Edited: 0052661
2011-10-06 17:08 Felipe Monteiro de Carvalho LazTarget - => 0.99.0
2011-10-06 17:08 Felipe Monteiro de Carvalho Status assigned => feedback
2011-10-06 17:34 malcome Note Added: 0052666
2011-10-06 18:09 Felipe Monteiro de Carvalho Note Added: 0052667
2011-10-06 23:47 malcome File Added: UTF8LowerCase&CompareText.patch
2011-10-06 23:49 malcome Note Added: 0052683
2011-10-07 01:12 malcome File Added: UTF8LowerCase&CompareText_v2.patch
2011-10-07 01:13 malcome Note Added: 0052684
2011-10-07 03:17 malcome File Added: UTF8LowerCase&CompareText_v3.patch
2011-10-07 03:18 malcome Note Added: 0052688
2011-10-07 07:57 Felipe Monteiro de Carvalho Note Added: 0052696
2011-10-07 08:24 malcome Note Added: 0052702
2011-10-07 08:26 malcome Note Edited: 0052702
2011-10-07 08:27 malcome Note Edited: 0052702
2011-10-07 10:28 Felipe Monteiro de Carvalho Fixed in Revision 32703 => 32703, 32729
2011-10-07 10:33 Felipe Monteiro de Carvalho Fixed in Revision 32703, 32729 => 32703, 32729, 32731
2011-10-07 10:33 Felipe Monteiro de Carvalho Note Added: 0052717
2011-10-07 10:58 malcome Note Added: 0052719
2011-10-07 11:18 malcome Note Edited: 0052719
2011-10-07 11:30 malcome Note Added: 0052721
2011-10-07 11:57 malcome Note Edited: 0052721
2011-10-07 11:58 malcome Note Edited: 0052721
2011-10-07 12:24 malcome Note Edited: 0052721
2011-10-07 16:59 Felipe Monteiro de Carvalho Note Added: 0052742
2011-10-07 17:01 Felipe Monteiro de Carvalho Note Added: 0052743
2011-10-07 17:28 Felipe Monteiro de Carvalho Status feedback => resolved
2011-10-07 17:28 Felipe Monteiro de Carvalho Resolution reopened => fixed
2011-10-07 17:28 Felipe Monteiro de Carvalho Note Added: 0052746
2011-10-08 01:48 malcome Status resolved => closed
2011-10-08 01:48 malcome Note Added: 0052761