View Issue Details

ID | Project | Category | View Status | Last Update
0028406 | Lazarus | LCL | public | 2015-12-02 20:28
Reporter: Michl | Assigned To: Bart Broersma
Priority: normal | Severity: minor | Reproducibility: always
Status: closed | Resolution: fixed
Platform: Windows | OS: 7 | OS Version: 64bit
Product Version: 1.5 (SVN) | Product Build: 49558
Target Version: 1.6 | Fixed in Version: 1.6
Summary: 0028406: possible Patch -> bug in TFileSearcher -> umlauts not found
Description: If you have umlauts in your directory, this directory is not found with FindAllDirectories or FindAllFiles.
Steps To Reproduce: Run attached project.
Additional Information: Possible patch attached.
Tags: patch
Fixed in Revision: r49888, r50567
LazTarget: 1.6
Widgetset: Win32/Win64
Attached Files
  • TestTSearchRec.zip (5,206 bytes)
  • withoutpatch.png (7,354 bytes)
    withoutpatch.png (7,354 bytes)
  • withpatch.png (7,751 bytes)
    withpatch.png (7,751 bytes)
  • fileutil.inc.patch (1,220 bytes)
    Index: components/lazutils/fileutil.inc
    ===================================================================
    --- components/lazutils/fileutil.inc	(revision 49558)
    +++ components/lazutils/fileutil.inc	(working copy)
    @@ -1118,7 +1118,7 @@
     
       procedure DoSearch(const APath: String; const ALevel: Integer);
       var
    -    P: String;
    +    P, Dummy: String;
         PathInfo: TSearchRec;
       begin
         P := APath + AllDirectoryEntriesMask;
    @@ -1126,6 +1126,9 @@
         if FindFirstUTF8(P, FileAttribute, PathInfo) = 0 then
         try
           repeat
    +        Dummy := '';
    +        Insert(PathInfo.Name, Dummy, 1);
    +        PathInfo.Name := Dummy;
             // skip special files
             if (PathInfo.Name = '.') or (PathInfo.Name = '..') or
               (PathInfo.Name = '') then Continue;
    @@ -1161,6 +1164,10 @@
           if FindFirstUTF8(P, DirectoryAttribute, PathInfo) = 0 then
           try
             repeat
    +          Dummy := '';
    +          Insert(PathInfo.Name, Dummy, 1);
    +          PathInfo.Name := Dummy;
    +
               if (PathInfo.Name = '.') or (PathInfo.Name = '..') or
                  (PathInfo.Name = '') or ((PathInfo.Attr and faDirectory) = 0) or
                  (not FFollowSymLink and FileIsSymlink(APath + PathInfo.Name))
    
    fileutil.inc.patch (1,220 bytes)
  • ChrisFCapture.png (16,595 bytes)
    ChrisFCapture.png (16,595 bytes)
  • winlazfileutils.inc.patch (907 bytes)
    Index: components/lazutils/winlazfileutils.inc
    ===================================================================
    --- components/lazutils/winlazfileutils.inc	(revision 49558)
    +++ components/lazutils/winlazfileutils.inc	(working copy)
    @@ -541,6 +541,10 @@
     
     
     function FindMatch(var f: TSearchRec) : Longint;
    +{$IFNDEF EnableUTF8RTL}
    +var
    +  Dummy: String;
    +{$ENDIF}
     begin
       { Find file with correct attribute }
       While (F.FindData.dwFileAttributes and cardinal(F.ExcludeAttr))<>0 do
    @@ -559,7 +563,13 @@
         in win32 it is the ansi structure with a utf-8 string
         in wince it is a wide structure }
       {$ifdef FindData_W}
    +  {$IFDEF EnableUTF8RTL}
       f.Name:=UTF8Encode(UnicodeString(F.FindData.cFileName));
    +  {$ELSE}
    +  Dummy := '';
    +  Insert(UTF8Encode(UnicodeString(F.FindData.cFileName)), Dummy, 1);
    +  f.Name := Dummy;
    +  {$ENDIF}
       {$else}
       f.Name:=F.FindData.cFileName;
       {$endif}
    
  • winlazfileutilspchar.inc.patch (840 bytes)
    Index: components/lazutils/winlazfileutils.inc
    ===================================================================
    --- components/lazutils/winlazfileutils.inc	(revision 50105)
    +++ components/lazutils/winlazfileutils.inc	(working copy)
    @@ -541,10 +541,6 @@
     
     
     function FindMatch(var f: TSearchRec) : Longint;
    -{$IFnDEF EnableUTF8RTL}
    -var
    -  Dummy: String;
    -{$ENDIF}
     begin
       { Find file with correct attribute }
       While (F.FindData.dwFileAttributes and cardinal(F.ExcludeAttr))<>0 do
    @@ -566,9 +562,7 @@
       {$IFDEF EnableUTF8RTL}
       f.Name:=UTF8Encode(UnicodeString(F.FindData.cFileName));
       {$ELSE}
    -  Dummy := '';
    -  Insert(UTF8Encode(UnicodeString(F.FindData.cFileName)), Dummy, 1);
    -  f.Name := Dummy;
    +  f.Name:=PChar(UTF8Encode(UnicodeString(F.FindData.cFileName)));
       {$ENDIF}
       {$else}
       f.Name:=F.FindData.cFileName;
    
  • compiler_rawstring_rules.patch (1,599 bytes)
    Index: compiler/defcmp.pas
    ===================================================================
    --- compiler/defcmp.pas	(revision 28814)
    +++ compiler/defcmp.pas	(working copy)
    @@ -522,7 +522,10 @@
                               (tstringdef(def_from).len=tstringdef(def_to).len)) and
                              { for ansi- and unicodestrings also the encoding must match }
                              (not(tstringdef(def_from).stringtype in [st_ansistring,st_unicodestring]) or
    -                          (tstringdef(def_from).encoding=tstringdef(def_to).encoding)) then
    +                          (tstringdef(def_from).encoding=tstringdef(def_to).encoding) or
    +						  (((tstringdef(def_to).encoding=globals.CP_NONE) or (tstringdef(def_to).encoding=0)) and
    +						   (tstringdef(def_from).stringtype=st_ansistring))
    +						 ) then
                             eq:=te_equal
                          else
                            begin
    Index: compiler/htypechk.pas
    ===================================================================
    --- compiler/htypechk.pas	(revision 28814)
    +++ compiler/htypechk.pas	(working copy)
    @@ -2770,6 +2770,7 @@
                         is_ansistring(def_from) and
                         is_ansistring(def_to) and
                         (tstringdef(def_from).encoding<>tstringdef(def_to).encoding) and
    +					((tstringdef(def_to).encoding<>globals.CP_NONE) and (tstringdef(def_to).encoding<>0)) and
                         (currpara.varspez in [vs_var,vs_out]) then
                         eq:=te_convert_l1 // don't allow to pass different ansistring types to each-other
                      else
    
  • defcmp.pas.patch (891 bytes)
    Index: compiler/defcmp.pas
    ===================================================================
    --- compiler/defcmp.pas	(revision 32093)
    +++ compiler/defcmp.pas	(working copy)
    @@ -497,7 +497,8 @@
                                ((tstringdef(def_to).encoding=0) and (tstringdef(def_from).encoding=getansistringcodepage)) or
                                ((tstringdef(def_to).encoding=getansistringcodepage) and (tstringdef(def_from).encoding=0)) or
                                (tstringdef(def_to).encoding=globals.CP_NONE) or
    -                           (tstringdef(def_from).encoding=globals.CP_NONE) then
    +                           (tstringdef(def_from).encoding=globals.CP_NONE) or
    +                           ((tstringdef(def_from).encoding=globals.CP_UTF8) {and flagforlazarus}) then
                              begin
                                eq:=te_equal;
                              end
    
    defcmp.pas.patch (891 bytes)
  • rawbytestring_no_encoding_concat_astrings.inc.patch (3,347 bytes)
    Index: rtl/inc/astrings.inc
    ===================================================================
    --- rtl/inc/astrings.inc	(revision 32099)
    +++ rtl/inc/astrings.inc	(working copy)
    @@ -215,8 +215,6 @@
     begin
     {$ifdef FPC_HAS_CPSTRING}
       DestCP:=cp;
    -  if DestCp=CP_NONE then
    -    DestCP:=DefaultSystemCodePage;
     {$else FPC_HAS_CPSTRING}
       DestCP:=StringCodePage(DestS);
     {$endif FPC_HAS_CPSTRING}
    @@ -225,23 +223,25 @@
         but avoid conversions if either addend is empty (StringCodePage will return
         DefaultSystemCodePage in that case, which may differ from other addend/dest) }
       if S1='' then
    +    {$ifdef FPC_HAS_CPSTRING}
    +    S1CP:=DefaultSystemCodePage
    +    {$else FPC_HAS_CPSTRING}
         S1CP:=DestCP
    +    {$endif FPC_HAS_CPSTRING}
       else
         S1CP:=StringCodePage(S1);
       S1CP:=TranslatePlaceholderCP(S1CP);
       if S2='' then
    +    {$ifdef FPC_HAS_CPSTRING}
    +    S2CP:=DefaultSystemCodePage
    +    {$else FPC_HAS_CPSTRING}
         S2CP:=DestCP
    +    {$endif FPC_HAS_CPSTRING}
       else
         S2CP:=StringCodePage(S2);
       S2CP:=TranslatePlaceholderCP(S2CP);
    -{$ifdef FPC_HAS_CPSTRING}
    -  { if the result is rawbytestring and both strings have the same code page,
    -    keep that code page }
    -  if (cp=CP_NONE) and
    -     (S1CP=S2CP) then
    -    DestCP:=S1CP;
    -{$endif FPC_HAS_CPSTRING}
    -  if (S1CP<>DestCP) or (S2CP<>DestCP) then
    +  { skip unicode conversion if dest encoding is CP_NONE }
    +  if (DestCP<>CP_NONE) and ((S1CP<>DestCP) or (S2CP<>DestCP)) then
         begin
           ansistr_concat_complex(DestS,S1,S2,DestCP);
           exit;
    @@ -281,6 +281,10 @@
           fpc_pchar_ansistr_intern_charmove(PAnsiChar(S1),0,DestS,0,S1Len);
           fpc_pchar_ansistr_intern_charmove(PAnsiChar(S2),0,DestS,S1Len,S2Len+1);
         end;
    +  {$ifdef FPC_HAS_CPSTRING}
    +  if DestCP=CP_NONE then
    +    DestCP:=DefaultSystemCodePage;
    +  {$endif FPC_HAS_CPSTRING}
       SetCodePage(DestS,DestCP,false);
     end;
     {$endif FPC_HAS_ANSISTR_CONCAT}
    @@ -309,8 +313,6 @@
         end;
     {$ifdef FPC_HAS_CPSTRING}
       DestCP:=cp;
    -  if DestCp=CP_NONE then
    -    DestCP:=DefaultSystemCodePage;
     {$else FPC_HAS_CPSTRING}
       DestCP:=StringCodePage(DestS);
     {$endif FPC_HAS_CPSTRING}
    @@ -326,6 +328,9 @@
         end;
       DestCP:=TranslatePlaceholderCP(DestCP);
       sameCP:=true;
    +  { skip unicode conversion if dest encoding is CP_NONE }
    +  if DestCP<>CP_NONE then
    +  begin
       tmpCP:=TranslatePlaceholderCP(StringCodePage(sarr[lowstart]));
       for i:=lowstart+1 to high(sarr) do
         begin
    @@ -338,6 +343,7 @@
               break;
             end;
         end;
    +  end;
       if not sameCP then
         begin
           U:='';
    @@ -349,12 +355,7 @@
           widestringmanager.Unicode2AnsiMoveProc(PUnicodeChar(Pointer(U)),DestS,DestCP,Length(U));
           exit;
         end;
    -{$ifdef FPC_HAS_CPSTRING}
    -  { if the result is rawbytestring and all strings have the same code page,
    -    keep that code page }
    -  if cp=CP_NONE then
    -    DestCP:=tmpCP;
    -{$endif FPC_HAS_CPSTRING}
    +
       destcopy:=nil;
       nonemptystart:=lowstart;
       { Check for another reuse, then we can't use
    @@ -401,7 +402,13 @@
               inc(pc,size);
             end;
         end;
    -  SetCodePage(DestS,tmpCP,False);
    +  if DestCP<>CP_NONE then
    +  SetCodePage(DestS,tmpCP,False)
    +  {$ifdef FPC_HAS_CPSTRING}
    +    else
    +      DestCP:=DefaultSystemCodePage
    +  {$endif FPC_HAS_CPSTRING}
    +    ;
       SetCodePage(DestS,DestCP,True);
       fpc_AnsiStr_Decr_Ref(destcopy);
     end;
    
  • SearchRecTest.zip (27,726 bytes)
  • stringmagic.patch (1,771 bytes)
    Index: components/lazutils/winlazfileutils.inc
    ===================================================================
    --- components/lazutils/winlazfileutils.inc	(revision 50203)
    +++ components/lazutils/winlazfileutils.inc	(working copy)
    @@ -538,15 +538,8 @@
     {$IF DEFINED(WinCE) OR (FPC_FULLVERSION>=30000)}
       {$define FindData_W}
     {$IFEND}
    -{$IF (FPC_FULLVERSION >= 30000) AND NOT DEFINED(DisableUTF8RTL)}
    -  {$DEFINE ReallyUseUTF8RTL}
    -{$IFEND}
     
     function FindMatch(var f: TSearchRec) : Longint;
    -{$IFnDEF ReallyUseUTF8RTL}
    -var
    -  Dummy: String;
    -{$ENDIF}
     begin
       { Find file with correct attribute }
       While (F.FindData.dwFileAttributes and cardinal(F.ExcludeAttr))<>0 do
    @@ -565,12 +558,10 @@
         in win32 it is the ansi structure with a utf-8 string
         in wince it is a wide structure }
       {$ifdef FindData_W}
    -  {$IFDEF ReallyUseUTF8RTL}
    +  {$IF (FPC_FULLVERSION>=30000)}
    +  f.Name:=String(UnicodeString(F.FindData.cFileName));
    +  {$ELSE}
       f.Name:=UTF8Encode(UnicodeString(F.FindData.cFileName));
    -  {$ELSE}
    -  Dummy := '';
    -  Insert(UTF8Encode(UnicodeString(F.FindData.cFileName)), Dummy, 1);
    -  f.Name := Dummy;
       {$ENDIF}
       {$else}
       f.Name:=F.FindData.cFileName;
    @@ -623,7 +614,11 @@
       Rslt.ExcludeAttr:=(not Attr) and ($1e);
                      { $1e = faHidden or faSysFile or faVolumeID or faDirectory }
       { FindFirstFile is a Win32 Call }
    -  Rslt.FindHandle:=Windows.FindFirstFileW( PWideChar(UTF8Decode(Path)),find{%H-});
    +  {$IF (FPC_FULLVERSION>=30000)}
    +  Rslt.FindHandle:=Windows.FindFirstFileW(PWideChar(WideString(Path)),find{%H-});
    +  {$ELSE}
    +  Rslt.FindHandle:=Windows.FindFirstFileW(PWideChar(UTF8Decode(Path)),find{%H-});
    +  {$ENDIF}
       If Rslt.FindHandle=Windows.Invalid_Handle_value then
       begin
         Result:=GetLastError;
    
    stringmagic.patch (1,771 bytes)
  • TestFindFirstUTF8.zip (10,679 bytes)
  • TestStringMagic.zip (2,130 bytes)
  • TestStringMagic2.zip (3,007 bytes)
  • 2015-11-25winlazfileutils.inc.patch (1,545 bytes)
    Index: components/lazutils/winlazfileutils.inc
    ===================================================================
    --- components/lazutils/winlazfileutils.inc	(revision 50507)
    +++ components/lazutils/winlazfileutils.inc	(working copy)
    @@ -544,10 +544,6 @@
     {$IFEND}
     
     function FindMatch(var f: TSearchRec) : Longint;
    -{$IFDEF ACP_RTL}
    -var
    -  Dummy: String;
    -{$ENDIF}
     begin
       { Find file with correct attribute }
       While (F.FindData.dwFileAttributes and cardinal(F.ExcludeAttr))<>0 do
    @@ -566,12 +562,10 @@
         in win32 it is the ansi structure with a utf-8 string
         in wince it is a wide structure }
       {$ifdef FindData_W}
    -  {$IFnDEF ACP_RTL}
    +  {$IFDEF ACP_RTL}
    +  f.Name:=String(UnicodeString(F.FindData.cFileName));
    +  {$ELSE}
       f.Name:=UTF8Encode(UnicodeString(F.FindData.cFileName));
    -  {$ELSE}
    -  Dummy := '';
    -  Insert(UTF8Encode(UnicodeString(F.FindData.cFileName)), Dummy, 1);
    -  f.Name := Dummy;
       {$ENDIF}
       {$else}
       f.Name:=F.FindData.cFileName;
    @@ -624,7 +618,11 @@
       Rslt.ExcludeAttr:=(not Attr) and ($1e);
                      { $1e = faHidden or faSysFile or faVolumeID or faDirectory }
       { FindFirstFile is a Win32 Call }
    -  Rslt.FindHandle:=Windows.FindFirstFileW( PWideChar(UTF8Decode(Path)),find{%H-});
    +  {$IFDEF ACP_RTL}
    +  Rslt.FindHandle:=Windows.FindFirstFileW(PWideChar(WideString(Path)),find{%H-});
    +  {$ELSE}
    +  Rslt.FindHandle:=Windows.FindFirstFileW(PWideChar(UTF8Decode(Path)),find{%H-});
    +  {$ENDIF}
       If Rslt.FindHandle=Windows.Invalid_Handle_value then
       begin
         Result:=GetLastError;
    
  • 2015-11-25alternativ.inc.patch (1,551 bytes)
    Index: components/lazutils/winlazfileutils.inc
    ===================================================================
    --- components/lazutils/winlazfileutils.inc	(revision 50507)
    +++ components/lazutils/winlazfileutils.inc	(working copy)
    @@ -544,10 +544,6 @@
     {$IFEND}
     
     function FindMatch(var f: TSearchRec) : Longint;
    -{$IFDEF ACP_RTL}
    -var
    -  Dummy: String;
    -{$ENDIF}
     begin
       { Find file with correct attribute }
       While (F.FindData.dwFileAttributes and cardinal(F.ExcludeAttr))<>0 do
    @@ -566,12 +562,10 @@
         in win32 it is the ansi structure with a utf-8 string
         in wince it is a wide structure }
       {$ifdef FindData_W}
    -  {$IFnDEF ACP_RTL}
    +  {$IFNDEF NO_CP_RTL}
    +  f.Name:=String(UnicodeString(F.FindData.cFileName));
    +  {$ELSE}
       f.Name:=UTF8Encode(UnicodeString(F.FindData.cFileName));
    -  {$ELSE}
    -  Dummy := '';
    -  Insert(UTF8Encode(UnicodeString(F.FindData.cFileName)), Dummy, 1);
    -  f.Name := Dummy;
       {$ENDIF}
       {$else}
       f.Name:=F.FindData.cFileName;
    @@ -624,7 +618,11 @@
       Rslt.ExcludeAttr:=(not Attr) and ($1e);
                      { $1e = faHidden or faSysFile or faVolumeID or faDirectory }
       { FindFirstFile is a Win32 Call }
    -  Rslt.FindHandle:=Windows.FindFirstFileW( PWideChar(UTF8Decode(Path)),find{%H-});
    +  {$IFNDEF NO_CP_RTL}
    +  Rslt.FindHandle:=Windows.FindFirstFileW(PWideChar(WideString(Path)),find{%H-});
    +  {$ELSE}
    +  Rslt.FindHandle:=Windows.FindFirstFileW(PWideChar(UTF8Decode(Path)),find{%H-});
    +  {$ENDIF}
       If Rslt.FindHandle=Windows.Invalid_Handle_value then
       begin
         Result:=GetLastError;
    
  • project1.lpr (8,233 bytes)
  • winlazfile.patch (1,443 bytes)
    Index: components/lazutils/lazutf8.pas
    ===================================================================
    --- components/lazutils/lazutf8.pas	(revision 50529)
    +++ components/lazutils/lazutf8.pas	(working copy)
    @@ -173,6 +173,8 @@
     procedure ReplaceSubstring(var s: string; StartPos, Count: SizeInt;
                                const Insertion: string);
     
    +function LazConcatStr(const s1, s2 : string):string;
    +
     implementation
     
     uses
    @@ -3556,6 +3558,19 @@
         System.Move(PByte(Insertion)^,(PByte(s)+StartPos-1)^,InsertionLen);
     end;
     
    +function LazConcatstr(const s1, s2: string): string;
    +var
    +  i, j : Integer;
    +begin
    +  i:=Length(s1);
    +  j:=Length(s2);
    +  SetLength(Result,i+j);
    +  if i<>0 then
    +    system.Move(s1[1],Result[1],i);
    +  if j<>0 then
    +    system.Move(s2[1],Result[i+1],j);
    +end;
    +
     procedure InitFPUpchars;
     var
       c: Char;
    Index: components/lazutils/winlazfileutils.inc
    ===================================================================
    --- components/lazutils/winlazfileutils.inc	(revision 50529)
    +++ components/lazutils/winlazfileutils.inc	(working copy)
    @@ -569,9 +569,7 @@
       {$IFnDEF ACP_RTL}
       f.Name:=UTF8Encode(UnicodeString(F.FindData.cFileName));
       {$ELSE}
    -  Dummy := '';
    -  Insert(UTF8Encode(UnicodeString(F.FindData.cFileName)), Dummy, 1);
    -  f.Name := Dummy;
    +  f.Name:=LazConcatStr(UTF8Encode(unicodestring(f.FindData.cFileName)),'');
       {$ENDIF}
       {$else}
       f.Name:=F.FindData.cFileName;
    
    winlazfile.patch (1,443 bytes)

Relationships

related to 0026453 closedJuha Manninen Memo.Lines and umlauts 
related to 0026449 resolvedBart Broersma valuelisteditor and umlauts 
related to 0028943 closedJuha Manninen Feature request: general switch FPC > 3.0.0 usage of UTF8 or ACP 
child of 0028857 closedBart Broersma Implicit Codepage Conversion meta issue 

Activities

Michl

2015-07-19 17:49

developer  

TestTSearchRec.zip (5,206 bytes)

Michl

2015-07-19 18:10

developer  

withoutpatch.png (7,354 bytes)
withoutpatch.png (7,354 bytes)

Michl

2015-07-19 18:10

developer  

withpatch.png (7,751 bytes)
withpatch.png (7,751 bytes)

Michl

2015-07-19 18:13

developer  

fileutil.inc.patch (1,220 bytes)
Index: components/lazutils/fileutil.inc
===================================================================
--- components/lazutils/fileutil.inc	(revision 49558)
+++ components/lazutils/fileutil.inc	(working copy)
@@ -1118,7 +1118,7 @@
 
   procedure DoSearch(const APath: String; const ALevel: Integer);
   var
-    P: String;
+    P, Dummy: String;
     PathInfo: TSearchRec;
   begin
     P := APath + AllDirectoryEntriesMask;
@@ -1126,6 +1126,9 @@
     if FindFirstUTF8(P, FileAttribute, PathInfo) = 0 then
     try
       repeat
+        Dummy := '';
+        Insert(PathInfo.Name, Dummy, 1);
+        PathInfo.Name := Dummy;
         // skip special files
         if (PathInfo.Name = '.') or (PathInfo.Name = '..') or
           (PathInfo.Name = '') then Continue;
@@ -1161,6 +1164,10 @@
       if FindFirstUTF8(P, DirectoryAttribute, PathInfo) = 0 then
       try
         repeat
+          Dummy := '';
+          Insert(PathInfo.Name, Dummy, 1);
+          PathInfo.Name := Dummy;
+
           if (PathInfo.Name = '.') or (PathInfo.Name = '..') or
              (PathInfo.Name = '') or ((PathInfo.Attr and faDirectory) = 0) or
              (not FFollowSymLink and FileIsSymlink(APath + PathInfo.Name))
fileutil.inc.patch (1,220 bytes)

Michl

2015-07-19 18:52

developer   ~0084987

Oh, I forgot to say, I'm using FPC 3.1.1 (Rev. 31215).

Bart Broersma

2015-07-20 01:04

developer   ~0084989

Maybe FindFirstUtf8 should not be used with fpc 3.x?
Did you use -dEnableUTF8RTL and -FcUTF8 when you built the program?

Michl

2015-07-20 09:03

developer   ~0084991

Last edited: 2015-07-20 09:21

View 3 revisions

I've not used -dEnableUTF8RTL and -FcUTF8 for my first example.

But I've now tested -dEnableUTF8RTL and -FcUTF8 and it doesn't work with or without the patch.

I've also tried replacing FindFirstUTF8 and FindNextUTF8 with FindFirst and FindNext in TFileSearcher.Search (fileutil.inc), and it doesn't work with/without -dEnableUTF8RTL and -FcUTF8 and with/without the patch.

But I'm not the master of that UTF8- and RawByteString conversion, so maybe there are better solutions?! For me, the patch does its job.

Edit:

Please notice, if you add in TFileSearcher.Search.DoSearch (fileutil.inc) in the first loop a simple writeln(UTF8ToSys(FileName)), without the patch you get a string with two different encodings, like:

...\Dir_ÄÖÜ\???.txt

The path is UTF8, the FFileInfo.Name not!

Bart Broersma

2015-07-20 12:21

developer   ~0084996

> writeln(UTF8ToSys(FileName))
That's wrong, it should be Utf8ToOEM IIRC

Bart Broersma

2015-07-20 12:29

developer   ~0084997

Curious: does your patch also work with filenames that use encoding outside the current codepage (e.g. chinese or cyrillic)?

Michl

2015-07-20 13:13

developer   ~0085000

> That's wrong, it should be Utf8ToOEM IIRC
Thank you, for that hint! I only wanted to show, that there are two different encodings in one string (one part is UTF8-encoded one part has the encoding dependent on codepage).

>Curious: does your patch also work with filenames that use encoding outside the current codepage (e.g. chinese or cyrillic)?
Interesting question, I once made a small test. The patch seems to work fine. See attached picture.

Michl

2015-07-20 13:13

developer  

ChrisF

2015-07-20 15:44

reporter   ~0085001

The test program is working correctly here (without any patch): see ChrisFCapture.png.

Tested also with Lazarus trunk version and Free Pascal trunk version (i.e. 3.1.1). They are not exactly the latest versions, but I've got them just a couple of weeks ago.

Of course, it's only working with the "UTF-8 in RTL" option, but AFAIK Lazarus doesn't support officially the other mode.

Note: UTF8ToConsole is not required in this sample, with FPC 2.7.1+. Just "writeln(FileIterator.FileName)" should do the trick, as it will be converted automatically to the OEM code page.

ChrisF

2015-07-20 15:44

reporter  

ChrisFCapture.png (16,595 bytes)
ChrisFCapture.png (16,595 bytes)

Michl

2015-07-20 21:36

developer   ~0085003

Thank you for your answer!

I made a mistake when testing -FcUTF8 -dEnableUTF8RTL. I wrote them into "Custom options", and that does not work the same way as the "UTF-8 in RTL" button (which generates the same command line parameters).

If I build such a UTF-8 application, the test program also works fine here without the patch.

>but AFAIK Lazarus doesn't support officially the other mode.
If that is the way forward, you can close this ticket as "no change required"; if not, something must be done.

Anyway, after thinking about it, I think the patch is not in the right place. It would be better implemented in FindFirstUTF8 and FindNextUTF8, because other methods call them as well.

Michl

2015-07-21 23:49

developer  

winlazfileutils.inc.patch (907 bytes)
Index: components/lazutils/winlazfileutils.inc
===================================================================
--- components/lazutils/winlazfileutils.inc	(revision 49558)
+++ components/lazutils/winlazfileutils.inc	(working copy)
@@ -541,6 +541,10 @@
 
 
 function FindMatch(var f: TSearchRec) : Longint;
+{$IFNDEF EnableUTF8RTL}
+var
+  Dummy: String;
+{$ENDIF}
 begin
   { Find file with correct attribute }
   While (F.FindData.dwFileAttributes and cardinal(F.ExcludeAttr))<>0 do
@@ -559,7 +563,13 @@
     in win32 it is the ansi structure with a utf-8 string
     in wince it is a wide structure }
   {$ifdef FindData_W}
+  {$IFDEF EnableUTF8RTL}
   f.Name:=UTF8Encode(UnicodeString(F.FindData.cFileName));
+  {$ELSE}
+  Dummy := '';
+  Insert(UTF8Encode(UnicodeString(F.FindData.cFileName)), Dummy, 1);
+  f.Name := Dummy;
+  {$ENDIF}
   {$else}
   f.Name:=F.FindData.cFileName;
   {$endif}

Michl

2015-07-21 23:50

developer   ~0085011

Last edited: 2015-08-09 19:49

View 2 revisions

Now, I created a patch, which has no downside for -dEnableUTF8RTL, but works also for normal mode. This patch/workaround looks better to me.

[Edit]
As far as I know, non dEnableUTF8RTL mode will be supported in the future. It is set as default. The last patch should be valid (and work fine for me).

Bart Broersma

2015-09-25 17:18

developer   ~0086073

Simple test with FindFirstUtf8/FindNextUtf8, the unpatched version.
Tested with 3.0.0rc1 compiler.
Tested with and without "Utf-8 in RTL"
Tested on Win7.


var
  Path, S: String;
  SR: TSearchRec;
  Err: LongInt;
begin
  Path := ExpandFilenameUtf8('.');

  Memo1.Lines.Clear;
  Memo1.Lines.Add('=================');
  Memo1.Lines.Add('Fpc: '+Fpc);
  Memo1.Lines.add(RTL);
  Err := FindFirstUtf8('*.utf8',faAnyFile, SR);
  while (Err = 0) do
  begin
    S := sR.Name + ' [' + AppendPathDelim(Path) + SR.Name + ']';
    Memo1.Lines.Add(S);
    Err := FindNextUtf8(SR);
  end;
  FindCloseUtf8(SR);
  Memo1.Lines.Add('=================');
end;

Output in memo:
=================
Fpc: 3.0.0rc1
Utf8 in RTL enabled
ascii.utf8 [C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\ascii.utf8]
ä.utf8 [C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\ä.utf8]
ë.utf8 [C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\ë.utf8]
บลูเบอ .utf8 [C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\บลูเบอ .utf8]
=================
=================
Fpc: 3.0.0rc1
Utf8 in RTL NOT enabed
ascii.utf8 [C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\ascii.utf8]
ä.utf8 [C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\ä.utf8]
ë.utf8 [C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\ë.utf8]
บลูเบอ .utf8 [C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\บลูเบอ .utf8]
=================

So, all filenames are displayed correctly for me.
3.0.0rc1 differs from trunk or am I missing the point here?

Michl

2015-09-28 14:28

developer   ~0086124

@Bart:

I've tested your code with FPC 32bit 3.1.1 revision 31875 and Lazarus 32bit 1.5 revision 49884 on Windows 7 64bit.

The files that should be found are:

test.utf8
äöü.utf8
κόσμεコン.utf8


Without -dEnableUTF8RTL and without the patch, the result is not correct:

test.utf8 [C:\04 UTF8 Test\test.utf8]
???.utf8 [C:\04 UTF8 Test\???.utf8]
??s?e??.utf8 [C:\04 UTF8 Test\??s?e??.utf8]


With -dEnableUTF8RTL and without the patch, the result is correct:

test.utf8 [C:\04 UTF8 Test\test.utf8]
äöü.utf8 [C:\04 UTF8 Test\äöü.utf8]
κόσμεコン.utf8 [C:\04 UTF8 Test\κόσμεコン.utf8]


Without -dEnableUTF8RTL and with the patch, the result is correct:

test.utf8 [C:\04 UTF8 Test\test.utf8]
äöü.utf8 [C:\04 UTF8 Test\äöü.utf8]
κόσμεコン.utf8 [C:\04 UTF8 Test\κόσμεコン.utf8]

For me, the patch is valid.


> 3.0.0rc1 differs from trunk or am I missing the point here?

I don't know, if there is a difference between 3.1.1 and 3.0.0rc1. I'll try 3.0.0rc1 this evening. I don't have installed it yet.

Juha Manninen

2015-09-28 21:58

developer   ~0086138

Applied, thanks.

Bart Broersma

2015-09-28 22:29

developer   ~0086141

@Juha: I feel a bit uncomfortable with this patch.
- Why the difference between 3.0.0rc1 and trunk?
- Why does the patch work (did anyone confirm the behaviour with trunk fpc)?
- A comment in the patch explaining why this works might be an option?

No offence intended.

Michl

2015-09-28 23:00

developer   ~0086143

I've built Lazarus 49887 with FPC 3.0.0rc1 on Windows 7 64bit. The bug described above occurs here as well. I've also tested the 64bit Lazarus trunk version, with the same bug.

@Bart: I don't know why it works for you. But I'm not the only one, who can see this bug.

> ChrisF: "Of course, it's only working with the "UTF-8 in RTL" option"

Juha Manninen

2015-09-28 23:50

developer   ~0086144

Assigning to Bart. I don't even have a working windows installation now.

Michl

2015-09-29 00:18

developer   ~0086145

> Why does the patch work (did anyone confirm the behaviour with trunk fpc)?

I've seen this solution used by someone else (I can't find that bug report), but 0026453 is the same problem, which I reported a while ago. 0026449 was a similar problem, which was resolved in a comparable way.

Bug 0026453 also occurs only with UTF-8 in RTL disabled.

Should I ask in the forum or mailing list for other testers or should I report it here http://wiki.lazarus.freepascal.org/Release_3.0.0 ?

Michl

2015-09-30 11:16

developer   ~0086179

I've tested different codepages with no success. Perhaps we use different compiler options.

I use:
 -MObjFPC -Scghi -O1 -g -gl -WG -l -vewnhibq -FiC:\Users\x\AppData\Local\Temp\lib\i386-win32 -FuC:\FreePascal\Laz\lcl\units\i386-win32\win32 -FuC:\FreePascal\Laz\lcl\units\i386-win32 -FuC:\FreePascal\Laz\components\lazutils\lib\i386-win32 -FuC:\FreePascal\Laz\packager\units\i386-win32 -Fu. -FUC:\Users\x\AppData\Local\Temp\lib\i386-win32\ -FEC:\Users\x\AppData\Local\Temp\ -dLCL -dLCLwin32

Bart Broersma

2015-10-14 18:24

developer   ~0086579

Using Utf16ToUtf8 instead of Utf8Encode also resolves the problem (as discussed in 0028850).

Michl

2015-10-14 18:38

developer   ~0086582

Should I create a new patch for it (or wait for Jonas Maebe possible work)?

Is using Utf16ToUtf8 instead of Utf8Encode the preferred method for Lazarus (should Utf8Encode be deprecated like in Delphi)?

Bart Broersma

2015-10-14 23:30

developer   ~0086589

I'm not sure.
Let's see what will be done (if anything) on fpc side.
As it is now, it's getting rather frustrating hunting all these bugs and solving them depending on fpc version and EnableUTF8RTL.

Michl

2015-10-19 00:19

developer   ~0086685

Last edited: 2015-10-19 00:24

View 2 revisions

As Do-wan Kim wrote in 0026453, in such special cases you can use PChar to get a non-converted string (the result is a string based on the default codepage).

I think that can be used here.

Patch added (the old hack no longer works anyway (0028850)), which is IMHO a really better solution (also with no speed loss with EnabledUtf8InRtl).

Tested with FPC 32092 and Lazarus 50105


[Edit]

You can of course delete the IFDEF and use PChar(...) in both cases (with/without EnabledUtf8InRtl). I don't know if there is a notable speed loss.

Michl

2015-10-19 00:20

developer  

winlazfileutilspchar.inc.patch (840 bytes)
Index: components/lazutils/winlazfileutils.inc
===================================================================
--- components/lazutils/winlazfileutils.inc	(revision 50105)
+++ components/lazutils/winlazfileutils.inc	(working copy)
@@ -541,10 +541,6 @@
 
 
 function FindMatch(var f: TSearchRec) : Longint;
-{$IFnDEF EnableUTF8RTL}
-var
-  Dummy: String;
-{$ENDIF}
 begin
   { Find file with correct attribute }
   While (F.FindData.dwFileAttributes and cardinal(F.ExcludeAttr))<>0 do
@@ -566,9 +562,7 @@
   {$IFDEF EnableUTF8RTL}
   f.Name:=UTF8Encode(UnicodeString(F.FindData.cFileName));
   {$ELSE}
-  Dummy := '';
-  Insert(UTF8Encode(UnicodeString(F.FindData.cFileName)), Dummy, 1);
-  f.Name := Dummy;
+  f.Name:=PChar(UTF8Encode(UnicodeString(F.FindData.cFileName)));
   {$ENDIF}
   {$else}
   f.Name:=F.FindData.cFileName;

Do-wan Kim

2015-10-19 01:16

reporter   ~0086693

Last edited: 2015-10-19 02:46

View 4 revisions

I think better solution is 'string = rawbytestring' in lazarus.
Or assign new compiler directive that ignore encoding between utf8 and ansi conversion.

(Edit)
with 'pchar' conversion, there is a little overhead in code generation.

unit1.pas:90 Memo1.Lines.Add(pchar(s));
00422865 8b45f4 mov -0xc(%ebp),%eax
00422868 85c0 test %eax,%eax
0042286A 7505 jne 0x422871 <TFORM1__BUTTON1CLICK+385>
0042286C b8e0c25b00 mov $0x5bc2e0,%eax
00422871 8d4dec lea -0x14(%ebp),%ecx
00422874 66ba0000 mov $0x0,%dx
00422878 e8a36dfeff call 0x409620 <fpc_pchar_to_ansistr>

0042287D 8b55ec mov -0x14(%ebp),%edx
00422880 8b45f8 mov -0x8(%ebp),%eax
00422883 8b807c040000 mov 0x47c(%eax),%eax
00422889 8b80a0030000 mov 0x3a0(%eax),%eax
0042288F 8b4df8 mov -0x8(%ebp),%ecx
00422892 8b897c040000 mov 0x47c(%ecx),%ecx
00422898 8b89a0030000 mov 0x3a0(%ecx),%ecx
0042289E 8b09 mov (%ecx),%ecx
004228A0 ff91a4000000 call *0xa4(%ecx)
unit1.pas:91 Memo1.Lines.Add(s);
004228A6 8b45f8 mov -0x8(%ebp),%eax
004228A9 8b807c040000 mov 0x47c(%eax),%eax
004228AF 8b80a0030000 mov 0x3a0(%eax),%eax
004228B5 8b55f4 mov -0xc(%ebp),%edx
004228B8 8b4df8 mov -0x8(%ebp),%ecx
004228BB 8b897c040000 mov 0x47c(%ecx),%ecx
004228C1 8b89a0030000 mov 0x3a0(%ecx),%ecx
004228C7 8b09 mov (%ecx),%ecx
004228C9 ff91a4000000 call *0xa4(%ecx)

I uploaded patch from previous bugreport.
I don't remember bugreport number :(
But it break rules and old revision based.

(edit)
New patch may works, but break rules(no UTF8->ansi conversion).

Do-wan Kim

2015-10-19 02:23

reporter  

compiler_rawstring_rules.patch (1,599 bytes)
Index: compiler/defcmp.pas
===================================================================
--- compiler/defcmp.pas	(revision 28814)
+++ compiler/defcmp.pas	(working copy)
@@ -522,7 +522,10 @@
                           (tstringdef(def_from).len=tstringdef(def_to).len)) and
                          { for ansi- and unicodestrings also the encoding must match }
                          (not(tstringdef(def_from).stringtype in [st_ansistring,st_unicodestring]) or
-                          (tstringdef(def_from).encoding=tstringdef(def_to).encoding)) then
+                          (tstringdef(def_from).encoding=tstringdef(def_to).encoding) or
+						  (((tstringdef(def_to).encoding=globals.CP_NONE) or (tstringdef(def_to).encoding=0)) and
+						   (tstringdef(def_from).stringtype=st_ansistring))
+						 ) then
                         eq:=te_equal
                      else
                        begin
Index: compiler/htypechk.pas
===================================================================
--- compiler/htypechk.pas	(revision 28814)
+++ compiler/htypechk.pas	(working copy)
@@ -2770,6 +2770,7 @@
                     is_ansistring(def_from) and
                     is_ansistring(def_to) and
                     (tstringdef(def_from).encoding<>tstringdef(def_to).encoding) and
+					((tstringdef(def_to).encoding<>globals.CP_NONE) and (tstringdef(def_to).encoding<>0)) and
                     (currpara.varspez in [vs_var,vs_out]) then
                     eq:=te_convert_l1 // don't allow to pass different ansistring types to each-other
                  else

Do-wan Kim

2015-10-19 02:43

reporter  

defcmp.pas.patch (891 bytes)
Index: compiler/defcmp.pas
===================================================================
--- compiler/defcmp.pas	(revision 32093)
+++ compiler/defcmp.pas	(working copy)
@@ -497,7 +497,8 @@
                            ((tstringdef(def_to).encoding=0) and (tstringdef(def_from).encoding=getansistringcodepage)) or
                            ((tstringdef(def_to).encoding=getansistringcodepage) and (tstringdef(def_from).encoding=0)) or
                            (tstringdef(def_to).encoding=globals.CP_NONE) or
-                           (tstringdef(def_from).encoding=globals.CP_NONE) then
+                           (tstringdef(def_from).encoding=globals.CP_NONE) or
+                           ((tstringdef(def_from).encoding=globals.CP_UTF8) {and flagforlazarus}) then
                          begin
                            eq:=te_equal;
                          end
defcmp.pas.patch (891 bytes)

Michl

2015-10-19 08:46

developer   ~0086702

The bug report number is 0028850.

Thank you for your patch, but I think this is not the right place for it: this bug report is a Lazarus issue, whereas your patch changes an FPC rule. I don't think any FPC core member will look at it here.

As Jonas Maebe wrote there, from the FPC side the conversions are correct, because Delphi does the same. I don't have Delphi to check the behaviour there, and I don't know enough to say which FPC rule is the best solution. If you think that your patch is valid and is a better solution than the current one, I think it is better to open a new FPC bug report for it.

Do-wan Kim

2015-10-19 09:42

reporter   ~0086705

Last edited: 2015-10-19 11:04

View 4 revisions

Yep :)

I tested my patch, it is also wrong.
It is 'fpc_ansi_concat' problem.
It calls 'ansistr_concat_complex' with parameter destcp assigned 0, it convert result string to CP_ACP encoding.

I can't find more better way on 'fpc_ansi_concat', 'pchar' cast is best way to do.

(another way)
A 'pchar' conversion copies the data into a new string, whereas this only changes the encoding.
It resembles 'SetCodePage', but it returns the same reference-counted string.

Type
  PAnsiRec = ^TAnsiRec;
  TAnsiRec = Record
    CodePage : TSystemCodePage;
    ElementSize : Word;
{$ifdef CPU64}
    { align fields }
    Dummy : DWord;
{$endif CPU64}
    Ref : SizeInt;
    Len : SizeInt;
  end;

Const
  AnsiFirstOff = SizeOf(TAnsiRec);

{ Returns S relabelled with DefaultSystemCodePage without converting or copying
  its bytes: the CodePage field of the string header is overwritten in place.

  NOTE(review): Result is assigned from S before the header is patched, and the
  header is reached through the data pointer of S itself, so the change is also
  visible through S (declared const) - presumably through every other reference
  to the same string data as well. Whether this is safe for string constants
  depends on where the RTL stores them; confirm before using this outside
  experiments. }
function MakeDefaultCodepage(const S:string):string;
begin
  Result:=S;
  // An empty string is a nil pointer and has no header to patch.
  if Pointer(S)<>nil then
    // The TAnsiRec header sits AnsiFirstOff bytes in front of the character data.
    PAnsiRec(Pointer(S)-AnsiFirstOff)^.CodePage:=DefaultSystemCodePage;
end;

Do-wan Kim

2015-10-20 04:48

reporter  

rawbytestring_no_encoding_concat_astrings.inc.patch (3,347 bytes)
Index: rtl/inc/astrings.inc
===================================================================
--- rtl/inc/astrings.inc	(revision 32099)
+++ rtl/inc/astrings.inc	(working copy)
@@ -215,8 +215,6 @@
 begin
 {$ifdef FPC_HAS_CPSTRING}
   DestCP:=cp;
-  if DestCp=CP_NONE then
-    DestCP:=DefaultSystemCodePage;
 {$else FPC_HAS_CPSTRING}
   DestCP:=StringCodePage(DestS);
 {$endif FPC_HAS_CPSTRING}
@@ -225,23 +223,25 @@
     but avoid conversions if either addend is empty (StringCodePage will return
     DefaultSystemCodePage in that case, which may differ from other addend/dest) }
   if S1='' then
+    {$ifdef FPC_HAS_CPSTRING}
+    S1CP:=DefaultSystemCodePage
+    {$else FPC_HAS_CPSTRING}
     S1CP:=DestCP
+    {$endif FPC_HAS_CPSTRING}
   else
     S1CP:=StringCodePage(S1);
   S1CP:=TranslatePlaceholderCP(S1CP);
   if S2='' then
+    {$ifdef FPC_HAS_CPSTRING}
+    S2CP:=DefaultSystemCodePage
+    {$else FPC_HAS_CPSTRING}
     S2CP:=DestCP
+    {$endif FPC_HAS_CPSTRING}
   else
     S2CP:=StringCodePage(S2);
   S2CP:=TranslatePlaceholderCP(S2CP);
-{$ifdef FPC_HAS_CPSTRING}
-  { if the result is rawbytestring and both strings have the same code page,
-    keep that code page }
-  if (cp=CP_NONE) and
-     (S1CP=S2CP) then
-    DestCP:=S1CP;
-{$endif FPC_HAS_CPSTRING}
-  if (S1CP<>DestCP) or (S2CP<>DestCP) then
+  { skip unicode conversion if dest encoding is CP_NONE }
+  if (DestCP<>CP_NONE) and ((S1CP<>DestCP) or (S2CP<>DestCP)) then
     begin
       ansistr_concat_complex(DestS,S1,S2,DestCP);
       exit;
@@ -281,6 +281,10 @@
       fpc_pchar_ansistr_intern_charmove(PAnsiChar(S1),0,DestS,0,S1Len);
       fpc_pchar_ansistr_intern_charmove(PAnsiChar(S2),0,DestS,S1Len,S2Len+1);
     end;
+  {$ifdef FPC_HAS_CPSTRING}
+  if DestCP=CP_NONE then
+    DestCP:=DefaultSystemCodePage;
+  {$endif FPC_HAS_CPSTRING}
   SetCodePage(DestS,DestCP,false);
 end;
 {$endif FPC_HAS_ANSISTR_CONCAT}
@@ -309,8 +313,6 @@
     end;
 {$ifdef FPC_HAS_CPSTRING}
   DestCP:=cp;
-  if DestCp=CP_NONE then
-    DestCP:=DefaultSystemCodePage;
 {$else FPC_HAS_CPSTRING}
   DestCP:=StringCodePage(DestS);
 {$endif FPC_HAS_CPSTRING}
@@ -326,6 +328,9 @@
     end;
   DestCP:=TranslatePlaceholderCP(DestCP);
   sameCP:=true;
+  { skip unicode conversion if dest encoding is CP_NONE }
+  if DestCP<>CP_NONE then
+  begin
   tmpCP:=TranslatePlaceholderCP(StringCodePage(sarr[lowstart]));
   for i:=lowstart+1 to high(sarr) do
     begin
@@ -338,6 +343,7 @@
           break;
         end;
     end;
+  end;
   if not sameCP then
     begin
       U:='';
@@ -349,12 +355,7 @@
       widestringmanager.Unicode2AnsiMoveProc(PUnicodeChar(Pointer(U)),DestS,DestCP,Length(U));
       exit;
     end;
-{$ifdef FPC_HAS_CPSTRING}
-  { if the result is rawbytestring and all strings have the same code page,
-    keep that code page }
-  if cp=CP_NONE then
-    DestCP:=tmpCP;
-{$endif FPC_HAS_CPSTRING}
+
   destcopy:=nil;
   nonemptystart:=lowstart;
   { Check for another reuse, then we can't use
@@ -401,7 +402,13 @@
           inc(pc,size);
         end;
     end;
-  SetCodePage(DestS,tmpCP,False);
+  if DestCP<>CP_NONE then
+  SetCodePage(DestS,tmpCP,False)
+  {$ifdef FPC_HAS_CPSTRING}
+    else
+      DestCP:=DefaultSystemCodePage
+  {$endif FPC_HAS_CPSTRING}
+    ;
   SetCodePage(DestS,DestCP,True);
   fpc_AnsiStr_Decr_Ref(destcopy);
 end;

Do-wan Kim

2015-10-20 04:52

reporter   ~0086721

Last edited: 2015-10-20 05:29

View 2 revisions

I make new patch.

If resultstring encoding is 'rawbytestring', just concat strings and no encoding conversions with small overhead.

it simply change to 'rawbytestring' in lazarus like this,
------------------------------------------------------------------
procedure TWin32MemoStrings.Insert(Index: integer; const S: string);
var
  LineStart: Integer;
  NewLine: rawbyteString;
begin
  LineStart := GetLineStart(Index);
  if Index < GetRealCount then
  begin
    //insert with LineEnding
    LineStart := GetLineStart(Index);
    NewLine := S+LineEnding;
   {$ifdef WindowsUnicodeSupport}


I also hope to find way in compiler,
's := rawbytestring(s1+s2)' and 's := rawbytestring(s1+s2+s3)' pattern.

Michl

2015-11-02 23:36

developer   ~0087078

Last edited: 2015-11-04 09:50

View 2 revisions

Strictly speaking, this bug report is no longer valid, because the issue has changed with the implementation of

{$IF (FPC_FULLVERSION >= 30000) AND NOT DEFINED(DisableUTF8RTL)}
initialization
  SetMultiByteConversionCodePage(CP_UTF8);
  // SetMultiByteFileSystemCodePage(CP_UTF8); not needed, this is the default under Windows
  SetMultiByteRTLFileSystemCodePage(CP_UTF8);
{$IFEND}


But of course there is still the problem that, if you use DisableUTF8RTL, files whose names contain codepoints > 127 aren't found.

The solution to that problem is quite simple. You can no longer use UTF8Decode / UTF8Encode for the conversions String -> WideString and back, but you can use the string magic of FPC.

Before FPC 3.0.0, all inner strings were UTF8 encoded:
aString := UTF8Encode(UnicodeString(PWideChar));
aPWideChar := PWideChar(UTF8Decode(String));

Now you have strings, which can be ACP or UTF8 strings, defined by the DefaultSystemCodePage and you can use the FPC string conversions:
aString := String(UnicodeString(PWideChar));
aPWideChar := PWideChar(WideString(String));

Very simple and works for ACP and UTF8 - Projects.


I add a test project for that issue and a patch. I think the last one here ;)

[Edit] Correct the typecast

Michl

2015-11-02 23:37

developer  

SearchRecTest.zip (27,726 bytes)

Michl

2015-11-02 23:39

developer  

stringmagic.patch (1,771 bytes)
Index: components/lazutils/winlazfileutils.inc
===================================================================
--- components/lazutils/winlazfileutils.inc	(revision 50203)
+++ components/lazutils/winlazfileutils.inc	(working copy)
@@ -538,15 +538,8 @@
 {$IF DEFINED(WinCE) OR (FPC_FULLVERSION>=30000)}
   {$define FindData_W}
 {$IFEND}
-{$IF (FPC_FULLVERSION >= 30000) AND NOT DEFINED(DisableUTF8RTL)}
-  {$DEFINE ReallyUseUTF8RTL}
-{$IFEND}
 
 function FindMatch(var f: TSearchRec) : Longint;
-{$IFnDEF ReallyUseUTF8RTL}
-var
-  Dummy: String;
-{$ENDIF}
 begin
   { Find file with correct attribute }
   While (F.FindData.dwFileAttributes and cardinal(F.ExcludeAttr))<>0 do
@@ -565,12 +558,10 @@
     in win32 it is the ansi structure with a utf-8 string
     in wince it is a wide structure }
   {$ifdef FindData_W}
-  {$IFDEF ReallyUseUTF8RTL}
+  {$IF (FPC_FULLVERSION>=30000)}
+  f.Name:=String(UnicodeString(F.FindData.cFileName));
+  {$ELSE}
   f.Name:=UTF8Encode(UnicodeString(F.FindData.cFileName));
-  {$ELSE}
-  Dummy := '';
-  Insert(UTF8Encode(UnicodeString(F.FindData.cFileName)), Dummy, 1);
-  f.Name := Dummy;
   {$ENDIF}
   {$else}
   f.Name:=F.FindData.cFileName;
@@ -623,7 +614,11 @@
   Rslt.ExcludeAttr:=(not Attr) and ($1e);
                  { $1e = faHidden or faSysFile or faVolumeID or faDirectory }
   { FindFirstFile is a Win32 Call }
-  Rslt.FindHandle:=Windows.FindFirstFileW( PWideChar(UTF8Decode(Path)),find{%H-});
+  {$IF (FPC_FULLVERSION>=30000)}
+  Rslt.FindHandle:=Windows.FindFirstFileW(PWideChar(WideString(Path)),find{%H-});
+  {$ELSE}
+  Rslt.FindHandle:=Windows.FindFirstFileW(PWideChar(UTF8Decode(Path)),find{%H-});
+  {$ENDIF}
   If Rslt.FindHandle=Windows.Invalid_Handle_value then
   begin
     Result:=GetLastError;
stringmagic.patch (1,771 bytes)

Michl

2015-11-02 23:43

developer   ~0087079

I have two build modes for the test project CP1252 (DisableUTF8RTL) and UTF8.

If you can't open the project (cause it is zipped with utf8 filenames) the program is here:

program Project1;

{$mode objfpc}{$H+}

uses
  {$IFDEF UNIX}{$IFDEF UseCThreads}
  cthreads,
  {$ENDIF}{$ENDIF}
  Classes, sysutils, FileUtil
  {$IfDef Windows}
  ,setdefaultcodepages
  {$EndIf} ;

type
  TDummy = class
    procedure FileFound(FileIterator: TFileIterator);
  end;

{ OnFileFound handler: prints a running, two-column-wide index followed by the
  file name reported by the iterator. }
procedure TDummy.FileFound(FileIterator: TFileIterator);
const
  // Writable typed constant: keeps its value between calls.
  FoundCount: Integer = 0;
begin
  WriteLn(FoundCount: 2, ': ', FileIterator.FileName);
  FoundCount := Succ(FoundCount);
end;

var
  FS: TFileSearcher;
  Dummy: TDummy;

{ Lists every *.txt file below the executable's directory (sub-directories
  included) through TFileSearcher, then waits for Enter. }
begin
  Dummy := TDummy.Create;
  try
    FS := TFileSearcher.Create;
    try
      FS.OnFileFound := @Dummy.FileFound;
      WriteLn('Files in project directory:');
      FS.Search(ExtractFilePath(ParamStr(0)), '*.txt', True);
    finally
      // Release the searcher even when Search raises.
      FS.Free;
    end;
  finally
    Dummy.Free;
  end;
  ReadLn;
end.

Bart Broersma

2015-11-03 00:00

developer   ~0087080

Can you please (if possible) reduce the test program so that it just uses FindFirstUtf8/FindNextUtf8 (without the overhead of TFileSearcher)?

Michl

2015-11-03 09:13

developer   ~0087087

Last edited: 2015-11-03 09:20

View 3 revisions

Here it is.

Without the patch, with CP1252 enabled, only names consisting of chars < 128 are found. With the patch, all chars in CP1252 are found. It should be documented that names containing Unicode chars outside the codepage will not be found (so a directory with such chars in its name will not be entered).

With enabled UTF8, all chars are found with and without the patch.


program Project1;

{$mode objfpc}{$H+}

uses
  {$IFDEF UNIX}{$IFDEF UseCThreads}
  cthreads,
  {$ENDIF}{$ENDIF}
  Classes, sysutils, LazFileUtils
  {$IfDef Windows}
  ,setdefaultcodepages
  {$EndIf} ;

{ Prints s prefixed with a running index (right-aligned in two columns). }
procedure Output(s: String);
const
  // Writable typed constant: keeps its value between calls.
  LineNo: Integer = 0;
begin
  WriteLn(LineNo: 2, ': ', s);
  LineNo := Succ(LineNo);
end;

{ Recursively walks APath (which must end with a path delimiter) using
  FindFirstUTF8/FindNextUTF8. Every directory and every file whose extension
  is exactly '.txt' is reported through Output. }
procedure Search(const APath: String);
var
  Mask: String;
  Entry: TSearchRec;
begin
  Mask := APath + '*';

  if FindFirstUTF8(Mask, faAnyFile, Entry) <> 0 then
    Exit;
  try
    repeat
      // Ignore the special entries and empty names.
      if (Entry.Name <> '.') and (Entry.Name <> '..') and (Entry.Name <> '') then
      begin
        if (Entry.Attr and faDirectory) <> 0 then
        begin // Directory: report it, then descend
          OutPut('Dir: ' + APath + Entry.Name);
          Search(AppendPathDelim(APath + Entry.Name));
        end
        else if ExtractFileExt(Entry.Name) = '.txt' then
          OutPut('File: ' + APath + Entry.Name); // File with matching extension
      end;
    until FindNextUTF8(Entry) <> 0;
  finally
    FindCloseUTF8(Entry);
  end;
end;

{ Entry point: walk the directory that contains the executable and list the
  directories and *.txt files found; ReadLn keeps the console open. }
begin
  WriteLn('Files in project directory:');
  Search(ExtractFilePath(ParamStr(0)));
  ReadLn;
end.

Michl

2015-11-03 09:13

developer  

TestFindFirstUTF8.zip (10,679 bytes)

Juha Manninen

2015-11-03 12:48

developer   ~0087090

stringmagic.patch looks very good.
I believe system codepage on Windows with FPC 3.0 can be used in a clean way after all!

Do-wan Kim

2015-11-04 01:43

reporter   ~0087104

+ {$IF (FPC_FULLVERSION>=30000)}
+ f.Name:=String(UnicodeString(F.FindData.cFileName));
+ {$ELSE}

Does the Unicode capability remain after the 'unicode -> ansi' conversion?

Michl

2015-11-04 09:27

developer   ~0087107

Last edited: 2015-11-04 10:44

View 10 revisions

> Is it unicode capability remains with 'unicode -> ansi' conversion?

In that case, yes! AFAIK, String(UnicodeString(PWideChar)) works because, once DefaultSystemCodePage is set, the system knows which encoding a string has and can handle it correctly.

You can make a simple test:

- create a new program "Simple Program"
- the source must be saved in normal mode with utf8 encoding
- the source must be saved with your system encoding (for me CP1252) if you compile your project with -dDisableUTF8RTL
- source:

program Project1;

{$mode objfpc}{$H+}

uses
  {$IFDEF UNIX}{$IFDEF UseCThreads}
  cthreads,
  {$ENDIF}{$ENDIF}
  Classes, sysutils, LazUtils
  {$IfDef Windows}
  ,setdefaultcodepages // See http://wiki.freepascal.org/Lazarus_with_FPC3.0_without_UTF-8_mode#Problem_System_encoding_and_Console_encoding_.28Windows.29
  {$EndIf} ;

{ Formats the zero-terminated sequence of 16-bit units starting at P as
  four-digit hex values, each followed by a space. P must not be nil. }
function PWideCharToHex(P: Pointer): String;
var
  CodeUnit: PWord;
begin
  Result := '';
  CodeUnit := PWord(P);
  while CodeUnit^ <> 0 do
  begin
    Result := Result + IntToHex(CodeUnit^, 4) + ' ';
    Inc(CodeUnit); // typed pointer: advances by one 16-bit unit
  end;
end;

{ Formats Cnt bytes starting at P as two-digit hex values, each followed by a
  space. Returns an empty string when Cnt is less than 1. }
function PStringToHex(P: Pointer; Cnt: Integer): String;
var
  Bytes: PByte;
  Offset: Integer;
begin
  Result := '';
  Bytes := PByte(P);
  for Offset := 0 to Cnt - 1 do
    Result := Result + IntToHex(Bytes[Offset], 2) + ' ';
end;

var
  s: String;
  pwc: PWideChar;

{ Demonstrates the String <-> UnicodeString typecasts discussed in this report
  by dumping the resulting code units / bytes as hex. }
begin
// DefaultSystemCodePage := cp_utf8; //If you don't insert LazUtils in your uses clause

  s := 'aä';
  // Case 1: cast a string literal directly (the message printed below calls this the failing case).
  // NOTE(review): pwc points into a temporary UnicodeString here and in case 2 - confirm its lifetime.
  pwc := PWideChar(UnicodeString('aä'));
  WriteLn(s, ': ', PWideCharToHex(pwc), '- PWideChar(UnicodeString(Const String)) does not work');

  // Case 2: cast the String variable instead of the literal.
  pwc := PWideChar(UnicodeString(s));
  WriteLn(s, ': ', PWideCharToHex(pwc), '- PWideChar(UnicodeString(String)) works');

  // Case 3: round trip back from PWideChar to String, then dump the raw bytes of s.
  s := String(UnicodeString(pwc));
  WriteLn(s, ': ', PStringToHex(Pointer(s), Length(s)), '- String(UnicodeString(PWideChar)) works');

  ReadLn;
end.

Michl

2015-11-04 10:37

developer  

TestStringMagic.zip (2,130 bytes)

Michl

2015-11-04 10:44

developer   ~0087108

Last edited: 2015-11-04 10:47

View 2 revisions

I've added a file "TestStringMagic.zip" with two files (ProjectCP1252.pas and ProjectUTF8.pas), the source code shown one post before. You can compile it on command line. There is a file "compile.bat" inside. You can edit it and change the location of your fpc.exe. Start it and than the two created projects. You can see the correct string conversions (tested with FPC 3.1.1 64bit and 32bit on Windows7 64bit).

Bart Broersma

2015-11-04 11:28

developer   ~0087110

@Michl: I will test your sample project as soon as I have the time (and energy) again (my daytime job, which has nothing to do with programming, is eating away both currently).

Since you seem to have tested this a lot, you seem to know the "place" where it goes wrong.
To me it would be interesting to know (especially since the problem did not occur in 3.0.0rc1) what the value is of StringCodePage() for each of the parts that are concatenated, for the result of the concatenation and for the string that this result eventually gets assigned to.

Bart Broersma

2015-11-04 12:14

developer   ~0087112

@Michl:I like your idea of altering the defines.
IIRC it was discussed somewhere else (IIRC you wanted to include an lcl include in lazuils, which was rejected), but I cannot find this discussion anymore.

Juha Manninen

2015-11-04 12:25

developer   ~0087113

Last edited: 2015-11-04 12:29

View 3 revisions

> IIRC it was discussed somewhere else (IIRC you wanted to include an lcl include in lazuils, which was rejected), but I cannot find this discussion anymore.

It happened in the related issue :
 http://bugs.freepascal.org/view.php?id=28943
I have nothing against such defines if they have no LazUtils -> LCL dependency.

Michl

2015-11-04 13:08

developer  

TestStringMagic2.zip (3,007 bytes)

Michl

2015-11-04 13:13

developer   ~0087114

Last edited: 2015-11-04 13:34

View 5 revisions

I've added the projects again (with some little changes), I've made a mistake before (the old DefaultSystemCodePage wasn't changed there) - please delete the file TestStringMagic.zip.

> To me it would be interesting to know (especially since the problem did not occur in 3.0.0rc1) what the value is of StringCodePage() for each of the parts that are concatenated, for the result of the concatenation and for the string that this result eventually gets assigned to.


The results are:

ProjectCP1252.pas:

StringCodePage: 0
aä: 0061 00E4 - PWideChar(UnicodeString(String))

StringCodePage: 1252
aä: 61 E4 - String(UnicodeString(PWideChar))

StringCodePage: 65001
aä: 61 C3 A4 - UTF8Encode(UnicodeString(PWideChar)) - the behavoiur before the patch "stringmagic.patch" in winlazfileutils.inc


ProjectUTF8.pas:

StringCodePage: 0
aä: 0061 00E4 - PWideChar(UnicodeString(String))

StringCodePage: 65001
aä: 61 C3 A4 - String(UnicodeString(PWideChar))

StringCodePage: 65001
aä: 61 C3 A4 - UTF8Encode(UnicodeString(PWideChar)) - the behavoiur before the patch "stringmagic.patch" in winlazfileutils.inc


> (my daytime job, which has nothing to do with programming, is eating away both currently)

Oh, I'm in the same club ;)

> I will test your sample project as soon as I have the time (and energy) again

Thank you very much!

Bart Broersma

2015-11-24 17:38

developer   ~0087550

@Michl: I started an attempt to unify the defines for the different (no) codepage situations in r50498.

Bart Broersma

2015-11-25 17:59

developer   ~0087587

@Michl: sorry to hassle you, but can you provide a new patch that uses the new defines (in lazutils_defines.inc)?
So where you now use $IF FPC_FULLVERSION >= 30000, use $IFnDEF NO_CP_RTL?

Michl

2015-11-25 23:03

developer  

2015-11-25winlazfileutils.inc.patch (1,545 bytes)
Index: components/lazutils/winlazfileutils.inc
===================================================================
--- components/lazutils/winlazfileutils.inc	(revision 50507)
+++ components/lazutils/winlazfileutils.inc	(working copy)
@@ -544,10 +544,6 @@
 {$IFEND}
 
 function FindMatch(var f: TSearchRec) : Longint;
-{$IFDEF ACP_RTL}
-var
-  Dummy: String;
-{$ENDIF}
 begin
   { Find file with correct attribute }
   While (F.FindData.dwFileAttributes and cardinal(F.ExcludeAttr))<>0 do
@@ -566,12 +562,10 @@
     in win32 it is the ansi structure with a utf-8 string
     in wince it is a wide structure }
   {$ifdef FindData_W}
-  {$IFnDEF ACP_RTL}
+  {$IFDEF ACP_RTL}
+  f.Name:=String(UnicodeString(F.FindData.cFileName));
+  {$ELSE}
   f.Name:=UTF8Encode(UnicodeString(F.FindData.cFileName));
-  {$ELSE}
-  Dummy := '';
-  Insert(UTF8Encode(UnicodeString(F.FindData.cFileName)), Dummy, 1);
-  f.Name := Dummy;
   {$ENDIF}
   {$else}
   f.Name:=F.FindData.cFileName;
@@ -624,7 +618,11 @@
   Rslt.ExcludeAttr:=(not Attr) and ($1e);
                  { $1e = faHidden or faSysFile or faVolumeID or faDirectory }
   { FindFirstFile is a Win32 Call }
-  Rslt.FindHandle:=Windows.FindFirstFileW( PWideChar(UTF8Decode(Path)),find{%H-});
+  {$IFDEF ACP_RTL}
+  Rslt.FindHandle:=Windows.FindFirstFileW(PWideChar(WideString(Path)),find{%H-});
+  {$ELSE}
+  Rslt.FindHandle:=Windows.FindFirstFileW(PWideChar(UTF8Decode(Path)),find{%H-});
+  {$ENDIF}
   If Rslt.FindHandle=Windows.Invalid_Handle_value then
   begin
     Result:=GetLastError;

Michl

2015-11-25 23:27

developer  

2015-11-25alternativ.inc.patch (1,551 bytes)
Index: components/lazutils/winlazfileutils.inc
===================================================================
--- components/lazutils/winlazfileutils.inc	(revision 50507)
+++ components/lazutils/winlazfileutils.inc	(working copy)
@@ -544,10 +544,6 @@
 {$IFEND}
 
 function FindMatch(var f: TSearchRec) : Longint;
-{$IFDEF ACP_RTL}
-var
-  Dummy: String;
-{$ENDIF}
 begin
   { Find file with correct attribute }
   While (F.FindData.dwFileAttributes and cardinal(F.ExcludeAttr))<>0 do
@@ -566,12 +562,10 @@
     in win32 it is the ansi structure with a utf-8 string
     in wince it is a wide structure }
   {$ifdef FindData_W}
-  {$IFnDEF ACP_RTL}
+  {$IFNDEF NO_CP_RTL}
+  f.Name:=String(UnicodeString(F.FindData.cFileName));
+  {$ELSE}
   f.Name:=UTF8Encode(UnicodeString(F.FindData.cFileName));
-  {$ELSE}
-  Dummy := '';
-  Insert(UTF8Encode(UnicodeString(F.FindData.cFileName)), Dummy, 1);
-  f.Name := Dummy;
   {$ENDIF}
   {$else}
   f.Name:=F.FindData.cFileName;
@@ -624,7 +618,11 @@
   Rslt.ExcludeAttr:=(not Attr) and ($1e);
                  { $1e = faHidden or faSysFile or faVolumeID or faDirectory }
   { FindFirstFile is a Win32 Call }
-  Rslt.FindHandle:=Windows.FindFirstFileW( PWideChar(UTF8Decode(Path)),find{%H-});
+  {$IFNDEF NO_CP_RTL}
+  Rslt.FindHandle:=Windows.FindFirstFileW(PWideChar(WideString(Path)),find{%H-});
+  {$ELSE}
+  Rslt.FindHandle:=Windows.FindFirstFileW(PWideChar(UTF8Decode(Path)),find{%H-});
+  {$ENDIF}
   If Rslt.FindHandle=Windows.Invalid_Handle_value then
   begin
     Result:=GetLastError;

Michl

2015-11-25 23:27

developer   ~0087597

Last edited: 2015-11-25 23:31

View 2 revisions

I'm currently on a business trip, so I only have my notebook and a slow wifi here. But I could load the latest trunks (FPC and Lazarus). I've added the patch 2015-11-25winlazfileutils.inc.patch with the new defines. There is only a change by the usage of a defined ACP_RTL.

It is also possible to assign a WideString to a String (and vice versa) directly with FPC 3.0.0, regardless of whether ACP_RTL or UTF8_RTL is defined. The warnings can be suppressed with a typecast (see 0024103). I've added a second patch, 2015-11-25alternativ.inc.patch, as an alternative.

To test the patches, you can test with testproject TestFindFirstUTF8.zip (you have to add the package LazUtils to the project). You can compile it with the two compile modes UTF8 and CP1252, selectable with the Button "Change build mode"

Thank you for that defines. They are helpful. I hope, I find some time when I come back home to test some other problems and solutions with the new defines :)

Do-wan Kim

2015-11-26 02:27

reporter   ~0087600

String casting may not work with an East Asian (CJK) language locale.

some code changes on example, adding utf8decode
...
  s := 'aä';
  pwc := PWideChar(UnicodeString(utf8decode('aä')));
  WriteLn(s, ': ', PWideCharToHex(pwc), '- PWideChar(UnicodeString(Const String)) does not work');

  pwc := PWideChar(UnicodeString(utf8decode(s)));
  WriteLn(s, ': ', PWideCharToHex(pwc), '- PWideChar(UnicodeString(String)) works');

and result,

a채: 0061 00E4 - PWideChar(UnicodeString(Const String)) does not work
a채: 0061 00E4 - PWideChar(UnicodeString(String)) works
aa: 61 61 - String(UnicodeString(PWideChar)) works

Michl

2015-11-26 07:51

developer   ~0087601

> string casting may not work eastern(CJK) language locale.

What is your DefaultSystemCodePage? If a "ä" isn't in your charset the result seems to be right.

Bart Broersma

2015-11-26 12:43

developer  

project1.lpr (8,233 bytes)

Bart Broersma

2015-11-26 12:45

developer   ~0087607

Last edited: 2015-11-26 12:46

View 2 revisions

I slightly altered the test program:
- not recurse into directories
- extended the output procedure (and renamed it, because CodeTools barfed on it)

My test results:

Expected to find files:

1_File_C__ÄÖO¥.txt
1_File_U_ß©éËÑȦߦä+É.txt
3_File_บลูเบอ.txt <<== nice one to test
cp1252.txt
cputf8.txt

FPC 3.0.0RC2

Unpatched

UTF8:
Files in project directory:
 1: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\1_File_A_Abcde.txt [$31 $5F $46 $69 $6C $65 $5F $41 $5F $41 $62 $63 $64 $65 $2E $74 $78 $74]
 2: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\1_File_C__ÄÖO¥.txt [$31 $5F $46 $69 $6C $65 $5F $43 $5F $5F $C3 $84 $C3 $96 $4F $C2 $A5 $2E $74 $78 $74]
 3: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\1_File_U_ß©éËÑȦߦä+É.txt [$31 $5F $46 $69 $6C $65 $5F $55 $5F $C3 $9F $C2 $A9 $C3 $A9 $C3 $8B $C3 $91 $C3 $88 $C2 $A6 $C3 $9F $C2 $A6 $C3 $A4 $2B $C3 $89 $2E $74 $78 $74]
 4: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\3_File_บลูเบอ.txt [$33 $5F $46 $69 $6C $65 $5F $E0 $B8 $9A $E0 $B8 $A5 $E0 $B8 $B9 $E0 $B9 $80 $E0 $B8 $9A $E0 $B8 $AD $2E $74 $78 $74]
 5: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\cp1252.txt [$63 $70 $31 $32 $35 $32 $2E $74 $78 $74]
 6: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\cputf8.txt [$63 $70 $75 $74 $66 $38 $2E $74 $78 $74]


CP1252:
Files in project directory:
 1: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\1_File_A_Abcde.txt [$31 $5F $46 $69 $6C $65 $5F $41 $5F $41 $62 $63 $64 $65 $2E $74 $78 $74]
 2: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\1_File_C__ÄÖO¥.txt [$31 $5F $46 $69 $6C $65 $5F $43 $5F $5F $C3 $84 $C3 $96 $4F $C2 $A5 $2E $74 $78 $74]
 3: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\1_File_U_ß©éËÑȦߦä+É.txt [$31 $5F $46 $69 $6C $65 $5F $55 $5F $C3 $9F $C2 $A9 $C3 $A9 $C3 $8B $C3 $91 $C3 $88 $C2 $A6 $C3 $9F $C2 $A6 $C3 $A4 $2B $C3 $89 $2E $74 $78 $74]
 4: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\3_File_บลูเบอ.txt [$33 $5F $46 $69 $6C $65 $5F $E0 $B8 $9A $E0 $B8 $A5 $E0 $B8 $B9 $E0 $B9 $80 $E0 $B8 $9A $E0 $B8 $AD $2E $74 $78 $74]
 5: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\cp1252.txt [$63 $70 $31 $32 $35 $32 $2E $74 $78 $74]
 6: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\cputf8.txt [$63 $70 $75 $74 $66 $38 $2E $74 $78 $74]

Note: CP1252 also seems to get the right filename for 3_File_บลูเบอ.txt,
even though it has characters outside the current codepage, which is kind of strange.



Patched (2015-11-25winlazfileutils.inc.patch):

UTF8:
Files in project directory:
 1: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\1_File_A_Abcde.txt [$31 $5F $46 $69 $6C $65 $5F $41 $5F $41 $62 $63 $64 $65 $2E $74 $78 $74]
 2: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\1_File_C__ÄÖO¥.txt [$31 $5F $46 $69 $6C $65 $5F $43 $5F $5F $C3 $84 $C3 $96 $4F $C2 $A5 $2E $74 $78 $74]
 3: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\1_File_U_ß©éËÑȦߦä+É.txt [$31 $5F $46 $69 $6C $65 $5F $55 $5F $C3 $9F $C2 $A9 $C3 $A9 $C3 $8B $C3 $91 $C3 $88 $C2 $A6 $C3 $9F $C2 $A6 $C3 $A4 $2B $C3 $89 $2E $74 $78 $74]
 4: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\3_File_บลูเบอ.txt [$33 $5F $46 $69 $6C $65 $5F $E0 $B8 $9A $E0 $B8 $A5 $E0 $B8 $B9 $E0 $B9 $80 $E0 $B8 $9A $E0 $B8 $AD $2E $74 $78 $74]
 5: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\cp1252.txt [$63 $70 $31 $32 $35 $32 $2E $74 $78 $74]
 6: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\cputf8.txt [$63 $70 $75 $74 $66 $38 $2E $74 $78 $74]

CP1252:
Files in project directory:
 1: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\1_File_A_Abcde.txt [$31 $5F $46 $69 $6C $65 $5F $41 $5F $41 $62 $63 $64 $65 $2E $74 $78 $74]
 2: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\1_File_C__ÄÖO¥.txt [$31 $5F $46 $69 $6C $65 $5F $43 $5F $5F $C4 $D6 $4F $A5 $2E $74 $78 $74]
 3: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\1_File_U_ß©éËÑȦߦä+É.txt [$31 $5F $46 $69 $6C $65 $5F $55 $5F $DF $A9 $E9 $CB $D1 $C8 $A6 $DF $A6 $E4 $2B $C9 $2E $74 $78 $74]
 4: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\3_File_??????.txt [$33 $5F $46 $69 $6C $65 $5F $3F $3F $3F $3F $3F $3F $2E $74 $78 $74]
 5: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\cp1252.txt [$63 $70 $31 $32 $35 $32 $2E $74 $78 $74]
 6: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\cputf8.txt [$63 $70 $75 $74 $66 $38 $2E $74 $78 $74]

Note: 3_File_??????.txt
This is what could be expected, since บลูเบอ is not in the current codepage.


Patched (2015-11-25alternativ.inc.patch):

UTF8:
Files in project directory:
 1: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\1_File_A_Abcde.txt [$31 $5F $46 $69 $6C $65 $5F $41 $5F $41 $62 $63 $64 $65 $2E $74 $78 $74]
 2: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\1_File_C__ÄÖO¥.txt [$31 $5F $46 $69 $6C $65 $5F $43 $5F $5F $C3 $84 $C3 $96 $4F $C2 $A5 $2E $74 $78 $74]
 3: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\1_File_U_ß©éËÑȦߦä+É.txt [$31 $5F $46 $69 $6C $65 $5F $55 $5F $C3 $9F $C2 $A9 $C3 $A9 $C3 $8B $C3 $91 $C3 $88 $C2 $A6 $C3 $9F $C2 $A6 $C3 $A4 $2B $C3 $89 $2E $74 $78 $74]
 4: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\3_File_บลูเบอ.txt [$33 $5F $46 $69 $6C $65 $5F $E0 $B8 $9A $E0 $B8 $A5 $E0 $B8 $B9 $E0 $B9 $80 $E0 $B8 $9A $E0 $B8 $AD $2E $74 $78 $74]
 5: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\cp1252.txt [$63 $70 $31 $32 $35 $32 $2E $74 $78 $74]
 6: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\cputf8.txt [$63 $70 $75 $74 $66 $38 $2E $74 $78 $74]

CP1252:
Files in project directory:
 1: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\1_File_A_Abcde.txt [$31 $5F $46 $69 $6C $65 $5F $41 $5F $41 $62 $63 $64 $65 $2E $74 $78 $74]
 2: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\1_File_C__ÄÖO¥.txt [$31 $5F $46 $69 $6C $65 $5F $43 $5F $5F $C4 $D6 $4F $A5 $2E $74 $78 $74]
 3: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\1_File_U_ß©éËÑȦߦä+É.txt [$31 $5F $46 $69 $6C $65 $5F $55 $5F $DF $A9 $E9 $CB $D1 $C8 $A6 $DF $A6 $E4 $2B $C9 $2E $74 $78 $74]
 4: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\3_File_??????.txt [$33 $5F $46 $69 $6C $65 $5F $3F $3F $3F $3F $3F $3F $2E $74 $78 $74]
 5: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\cp1252.txt [$63 $70 $31 $32 $35 $32 $2E $74 $78 $74]
 6: File: C:\Users\Bart\LazarusProjecten\bugs\FileUtf8\cptest\cputf8.txt [$63 $70 $75 $74 $66 $38 $2E $74 $78 $74]

Michl

2015-11-26 22:06

developer   ~0087629

Thank you for testing. As far as I can see, it is correct.

For the ACP test, can you try descending into a directory whose name contains ACP characters? That is where you can see the biggest difference: without the patch, the program doesn't enter that directory. That is the reason why compiling a program with ACP characters in the PATH failed, but that is another bug report.

@Do-wan Kim: Can you also test the test program, without and with the patch, using a file and a directory with ACP characters in your codepage? So far we have only seen that the patch works for CP1252 (the system codepage that Bart and I use).

Sorry for my English. Friendly greetings, today only by mobile phone :)

Bart Broersma

2015-11-26 22:34

developer   ~0087630

With 2015-11-25winlazfileutils.inc.patch applied:

Testing from within a folder "äëï" (copied the executables there) the cp1252 version works as expected.
Testing from within a folder "äëïบลูเบอ" the cp1252 version fails, it lists 0 files.
(The utf8 version works OK)

@Michl: which patch of you do you think is better (and why)?

Michl

2015-11-27 17:28

developer   ~0087649

I don't really know.

2015-11-25alternativ.inc.patch:

In the future, when FPC does its job right and no FPC version < 3.0.0 is supported any more, there will be no need to distinguish between CP_UTF8 and CP_ACP if you use SomeString := String(SomeUnicodeString), because the conversion would be done by FPC. So this definition could be removed at some point. This is also a tiny bit faster than the second possibility.


2015-11-25winlazfileutils.inc.patch:

There you explicitly activate the new FPC String Magic if you activate "Use system encoding". In all other cases (Lazarus with FPC 2.6.4 / default 3.0.0) you use SomeString := UTF8Encode(SomeUnicodeString). It is clearer what happens if you have to debug.

I would slightly prefer 2015-11-25alternativ.inc.patch, because once bug report 0024103 is resolved, we can simply assign SomeString := SomeUnicodeString without any warning and also remove the defines {$IFNDEF NO_CP_RTL}. However, I don't know in which cases data loss could occur; I have never noticed any.

Do-wan Kim

2015-12-01 13:14

reporter  

winlazfile.patch (1,443 bytes)
Index: components/lazutils/lazutf8.pas
===================================================================
--- components/lazutils/lazutf8.pas	(revision 50529)
+++ components/lazutils/lazutf8.pas	(working copy)
@@ -173,6 +173,8 @@
 procedure ReplaceSubstring(var s: string; StartPos, Count: SizeInt;
                            const Insertion: string);
 
+function LazConcatStr(const s1, s2 : string):string;
+
 implementation
 
 uses
@@ -3556,6 +3558,19 @@
     System.Move(PByte(Insertion)^,(PByte(s)+StartPos-1)^,InsertionLen);
 end;
 
+function LazConcatstr(const s1, s2: string): string;
+var
+  i, j : Integer;
+begin
+  i:=Length(s1);
+  j:=Length(s2);
+  SetLength(Result,i+j);
+  if i<>0 then
+    system.Move(s1[1],Result[1],i);
+  if j<>0 then
+    system.Move(s2[1],Result[i+1],j);
+end;
+
 procedure InitFPUpchars;
 var
   c: Char;
Index: components/lazutils/winlazfileutils.inc
===================================================================
--- components/lazutils/winlazfileutils.inc	(revision 50529)
+++ components/lazutils/winlazfileutils.inc	(working copy)
@@ -569,9 +569,7 @@
   {$IFnDEF ACP_RTL}
   f.Name:=UTF8Encode(UnicodeString(F.FindData.cFileName));
   {$ELSE}
-  Dummy := '';
-  Insert(UTF8Encode(UnicodeString(F.FindData.cFileName)), Dummy, 1);
-  f.Name := Dummy;
+  f.Name:=LazConcatStr(UTF8Encode(unicodestring(f.FindData.cFileName)),'');
   {$ENDIF}
   {$else}
   f.Name:=F.FindData.cFileName;
winlazfile.patch (1,443 bytes)

Do-wan Kim

2015-12-01 13:22

reporter   ~0087717

I think the best resolution is to copy the strings without any encoding conversion,
or to use a PChar cast on the UTF-8 string.

The concat encoding problem cannot be solved without FPC RTL changes.

Michl

2015-12-02 11:11

developer   ~0087738

@Do-wan Kim:

I've looked at your patch. IMHO it is wrong here, because this is no longer a concat problem.

In this bugreport we have to solve 3 situations.

1. Lazarus with FPC 2.6.4 (NO_CP_RTL)
2. Lazarus with FPC 3+ with ACP_RTL
3. Lazarus with FPC 3+ with UTF8_RTL

For points 1 and 3 it is correct to assign a UnicodeString to a String with UTF8Encode.

For point 2 your patch is IMHO not correct. You can assign a UnicodeString to an ACP string with UTF8Encode and change its encoding with a RawByteString typecast or something similar, but it isn't necessary. You can simply let the compiler do this job.
FPC does a fine job here. You can simply do SomeString := SomeUnicodeString and vice versa. With the typecast SomeString := String(SomeUnicodeString) you hide the warning "Implicit string type conversion with potential data loss from "UnicodeString" to "AnsiString"".
This automatic conversion also works for point 3. I've made some tests of it. But I think this shouldn't be discussed here, because this bug report is long enough ;)

For me the patches 2015-11-25winlazfileutils.inc.patch and 2015-11-25alternativ.inc.patch work fine (tested on two Windows 7 machines with CP1252). As far as I can see they also work for Bart. Can you also test the patches, especially 2015-11-25alternativ.inc.patch, on your system, as I asked 4 posts ago?

Thank you very much.

Bart Broersma

2015-12-02 13:45

developer   ~0087749

Applied 2015-11-25winlazfileutils.inc.patch from Michl.
Please test and close if OK.

Michl

2015-12-02 20:28

developer   ~0087763

Thank you for your patience!

Tested with Lazarus 1.7 r50547 on FPC 3.1.1 r32586 i386-win32

Issue History

Date Modified Username Field Change
2015-07-19 17:49 Michl New Issue
2015-07-19 17:49 Michl File Added: TestTSearchRec.zip
2015-07-19 18:10 Michl File Added: withoutpatch.png
2015-07-19 18:10 Michl File Added: withpatch.png
2015-07-19 18:13 Michl File Added: fileutil.inc.patch
2015-07-19 18:52 Michl Note Added: 0084987
2015-07-20 01:04 Bart Broersma Note Added: 0084989
2015-07-20 09:03 Michl Note Added: 0084991
2015-07-20 09:19 Michl Note Edited: 0084991 View Revisions
2015-07-20 09:21 Michl Note Edited: 0084991 View Revisions
2015-07-20 12:21 Bart Broersma Note Added: 0084996
2015-07-20 12:29 Bart Broersma Note Added: 0084997
2015-07-20 13:13 Michl Note Added: 0085000
2015-07-20 13:13 Michl File Added: outsidecurrentcodepage.png
2015-07-20 15:44 ChrisF Note Added: 0085001
2015-07-20 15:44 ChrisF File Added: ChrisFCapture.png
2015-07-20 21:36 Michl Note Added: 0085003
2015-07-21 23:49 Michl File Added: winlazfileutils.inc.patch
2015-07-21 23:50 Michl Note Added: 0085011
2015-08-09 19:49 Michl Note Edited: 0085011 View Revisions
2015-08-09 19:49 Michl Tag Attached: patch
2015-09-23 15:38 Juha Manninen Assigned To => Juha Manninen
2015-09-23 15:38 Juha Manninen Status new => assigned
2015-09-25 17:18 Bart Broersma Note Added: 0086073
2015-09-28 14:28 Michl Note Added: 0086124
2015-09-28 21:58 Juha Manninen Fixed in Revision => r49888
2015-09-28 21:58 Juha Manninen LazTarget => -
2015-09-28 21:58 Juha Manninen Note Added: 0086138
2015-09-28 21:58 Juha Manninen Status assigned => resolved
2015-09-28 21:58 Juha Manninen Resolution open => fixed
2015-09-28 22:29 Bart Broersma Note Added: 0086141
2015-09-28 22:29 Bart Broersma Status resolved => assigned
2015-09-28 22:29 Bart Broersma Resolution fixed => reopened
2015-09-28 23:00 Michl Note Added: 0086143
2015-09-28 23:48 Juha Manninen Assigned To Juha Manninen => Bart Broersma
2015-09-28 23:50 Juha Manninen Note Added: 0086144
2015-09-29 00:18 Michl Note Added: 0086145
2015-09-29 09:00 Juha Manninen Relationship added related to 0026453
2015-09-29 11:48 Juha Manninen Relationship added related to 0026449
2015-09-30 11:16 Michl Note Added: 0086179
2015-10-14 18:24 Bart Broersma Note Added: 0086579
2015-10-14 18:38 Michl Note Added: 0086582
2015-10-14 23:30 Bart Broersma Note Added: 0086589
2015-10-14 23:39 Bart Broersma Relationship added child of 0028857
2015-10-19 00:19 Michl Note Added: 0086685
2015-10-19 00:20 Michl File Added: winlazfileutilspchar.inc.patch
2015-10-19 00:24 Michl Note Edited: 0086685 View Revisions
2015-10-19 01:16 Do-wan Kim Note Added: 0086693
2015-10-19 01:54 Do-wan Kim Note Edited: 0086693 View Revisions
2015-10-19 02:23 Do-wan Kim File Added: compiler_rawstring_rules.patch
2015-10-19 02:25 Do-wan Kim Note Edited: 0086693 View Revisions
2015-10-19 02:43 Do-wan Kim File Added: defcmp.pas.patch
2015-10-19 02:46 Do-wan Kim Note Edited: 0086693 View Revisions
2015-10-19 08:46 Michl Note Added: 0086702
2015-10-19 09:42 Do-wan Kim Note Added: 0086705
2015-10-19 09:58 Do-wan Kim Note Edited: 0086705 View Revisions
2015-10-19 11:03 Do-wan Kim Note Edited: 0086705 View Revisions
2015-10-19 11:04 Do-wan Kim Note Edited: 0086705 View Revisions
2015-10-20 04:48 Do-wan Kim File Added: rawbytestring_no_encoding_concat_astrings.inc.patch
2015-10-20 04:52 Do-wan Kim Note Added: 0086721
2015-10-20 05:29 Do-wan Kim Note Edited: 0086721 View Revisions
2015-11-02 23:36 Michl Note Added: 0087078
2015-11-02 23:37 Michl File Added: SearchRecTest.zip
2015-11-02 23:39 Michl File Added: stringmagic.patch
2015-11-02 23:43 Michl Note Added: 0087079
2015-11-03 00:00 Bart Broersma Note Added: 0087080
2015-11-03 00:00 Bart Broersma Status assigned => feedback
2015-11-03 09:13 Michl Note Added: 0087087
2015-11-03 09:13 Michl Status feedback => assigned
2015-11-03 09:13 Michl File Added: TestFindFirstUTF8.zip
2015-11-03 09:14 Michl Note Edited: 0087087 View Revisions
2015-11-03 09:20 Michl Note Edited: 0087087 View Revisions
2015-11-03 12:29 Juha Manninen Relationship added related to 0028943
2015-11-03 12:48 Juha Manninen Note Added: 0087090
2015-11-04 01:43 Do-wan Kim Note Added: 0087104
2015-11-04 09:27 Michl Note Added: 0087107
2015-11-04 09:28 Michl Note Edited: 0087107 View Revisions
2015-11-04 09:30 Michl Note Edited: 0087107 View Revisions
2015-11-04 09:41 Michl Note Edited: 0087107 View Revisions
2015-11-04 09:41 Michl Note Edited: 0087107 View Revisions
2015-11-04 09:50 Michl Note Edited: 0087078 View Revisions
2015-11-04 09:57 Michl Note Edited: 0087107 View Revisions
2015-11-04 10:03 Michl Note Edited: 0087107 View Revisions
2015-11-04 10:13 Michl Note Edited: 0087107 View Revisions
2015-11-04 10:13 Michl Note Edited: 0087107 View Revisions
2015-11-04 10:37 Michl File Added: TestStringMagic.zip
2015-11-04 10:44 Michl Note Added: 0087108
2015-11-04 10:44 Michl Note Edited: 0087107 View Revisions
2015-11-04 10:47 Michl Note Edited: 0087108 View Revisions
2015-11-04 11:28 Bart Broersma Note Added: 0087110
2015-11-04 12:14 Bart Broersma Note Added: 0087112
2015-11-04 12:25 Juha Manninen Note Added: 0087113
2015-11-04 12:27 Juha Manninen Note Edited: 0087113 View Revisions
2015-11-04 12:29 Juha Manninen Note Edited: 0087113 View Revisions
2015-11-04 13:08 Michl File Added: TestStringMagic2.zip
2015-11-04 13:13 Michl Note Added: 0087114
2015-11-04 13:14 Michl Note Edited: 0087114 View Revisions
2015-11-04 13:14 Michl Note Edited: 0087114 View Revisions
2015-11-04 13:34 Michl Note Edited: 0087114 View Revisions
2015-11-04 13:34 Michl Note Edited: 0087114 View Revisions
2015-11-24 17:38 Bart Broersma Note Added: 0087550
2015-11-25 17:59 Bart Broersma Note Added: 0087587
2015-11-25 17:59 Bart Broersma Status assigned => feedback
2015-11-25 23:03 Michl File Added: 2015-11-25winlazfileutils.inc.patch
2015-11-25 23:27 Michl File Added: 2015-11-25alternativ.inc.patch
2015-11-25 23:27 Michl Note Added: 0087597
2015-11-25 23:27 Michl Status feedback => assigned
2015-11-25 23:31 Michl Note Edited: 0087597 View Revisions
2015-11-26 02:27 Do-wan Kim Note Added: 0087600
2015-11-26 07:51 Michl Note Added: 0087601
2015-11-26 12:43 Bart Broersma File Added: project1.lpr
2015-11-26 12:45 Bart Broersma Note Added: 0087607
2015-11-26 12:46 Bart Broersma Note Edited: 0087607 View Revisions
2015-11-26 22:06 Michl Note Added: 0087629
2015-11-26 22:34 Bart Broersma Note Added: 0087630
2015-11-27 17:28 Michl Note Added: 0087649
2015-12-01 13:14 Do-wan Kim File Added: winlazfile.patch
2015-12-01 13:22 Do-wan Kim Note Added: 0087717
2015-12-02 11:11 Michl Note Added: 0087738
2015-12-02 13:45 Bart Broersma Fixed in Revision r49888 => r49888, r50567
2015-12-02 13:45 Bart Broersma LazTarget - => 1.6
2015-12-02 13:45 Bart Broersma Note Added: 0087749
2015-12-02 13:45 Bart Broersma Status assigned => resolved
2015-12-02 13:45 Bart Broersma Fixed in Version => 1.6
2015-12-02 13:45 Bart Broersma Resolution reopened => fixed
2015-12-02 13:45 Bart Broersma Target Version => 1.6
2015-12-02 20:28 Michl Note Added: 0087763
2015-12-02 20:28 Michl Status resolved => closed