View Issue Details

IDProjectCategoryView StatusLast Update
0022501FPCRTLpublic2016-03-27 13:09
ReporteroceanAssigned ToMichael Van Canneyt 
PrioritynormalSeverityminorReproducibilityalways
Status resolvedResolutionno change required 
PlatformWin32OSOS Version
Product Version2.7.1Product Build 
Target VersionFixed in Version 
Summary0022501: Stringreplace corrupts my strings
DescriptionThis code shows message "Bug!". I have simplified it from larger program.

Tested 1.1.-37904-fpc-2.7.1-20120710-win32

Problem is NOT present 0.9.30.4 / 2.60

procedure test(s: string);
var i, j: integer;
begin
 i:=pos('€', s); //1
 s:=stringreplace(s, 'anything', 'anything2', []); // do nothing
 j:=pos('€', s); //0
 if i<>j then showmessage('Bug!');
end;

procedure TForm1.Button1Click(Sender: TObject);
var o: olevariant;
begin
 o:=UTF8Decode('€');
 test(UTF8Encode(o));
end;
TagsNo tags attached.
Fixed in Revision
FPCOldBugId
FPCTarget
Attached Files
  • test.pas (357 bytes)
    program test;
    
    {$mode objfpc}{$H+}
    
    uses
      Classes,sysutils;
    
    procedure test(s: string);
    var i, j: integer;
    begin
     i:=pos('€', s); //1
     s:=stringreplace(s, 'anything', 'anything2', []); // do nothing
     j:=pos('€', s); //0
     if i<>j then writeln('Bug!');
    end;
    
    var o: variant;
    begin
     o:=UTF8Decode('€');
     test(UTF8Encode(o));
    end.
    
    
    test.pas (357 bytes)
  • ignore_ansistring_encoding_somefunc.patch (2,761 bytes)
    Index: compiler/defcmp.pas
    ===================================================================
    --- compiler/defcmp.pas	(revision 28821)
    +++ compiler/defcmp.pas	(working copy)
    @@ -522,7 +522,10 @@
                               (tstringdef(def_from).len=tstringdef(def_to).len)) and
                              { for ansi- and unicodestrings also the encoding must match }
                              (not(tstringdef(def_from).stringtype in [st_ansistring,st_unicodestring]) or
    -                          (tstringdef(def_from).encoding=tstringdef(def_to).encoding)) then
    +                          (tstringdef(def_from).encoding=tstringdef(def_to).encoding) or
    +						  (((tstringdef(def_to).encoding=globals.CP_NONE) or (tstringdef(def_to).encoding=0)) and
    +						   (tstringdef(def_from).stringtype=st_ansistring))
    +						 ) then
                             eq:=te_equal
                          else
                            begin
    Index: compiler/htypechk.pas
    ===================================================================
    --- compiler/htypechk.pas	(revision 28821)
    +++ compiler/htypechk.pas	(working copy)
    @@ -2770,6 +2770,7 @@
                         is_ansistring(def_from) and
                         is_ansistring(def_to) and
                         (tstringdef(def_from).encoding<>tstringdef(def_to).encoding) and
    +					((tstringdef(def_to).encoding<>globals.CP_NONE) and (tstringdef(def_to).encoding<>0)) and
                         (currpara.varspez in [vs_var,vs_out]) then
                         eq:=te_convert_l1 // don't allow to pass different ansistring types to each-other
                      else
    Index: rtl/objpas/sysutils/sysstr.inc
    ===================================================================
    --- rtl/objpas/sysutils/sysstr.inc	(revision 28821)
    +++ rtl/objpas/sysutils/sysstr.inc	(working copy)
    @@ -76,7 +76,7 @@
     Dest := Dest + S;
     end ;
     
    -Function InternalChangeCase(Const S : AnsiString; const Chars: TSysCharSet; const Adjustment: Longint): AnsiString;
    +Function InternalChangeCase(Const S : string; const Chars: TSysCharSet; const Adjustment: Longint): string;
       var
         i : Integer;
         P : PChar;
    @@ -106,7 +106,7 @@
     
     {   UpperCase returns a copy of S where all lowercase characters ( from a to z )
         have been converted to uppercase   }
    -Function UpperCase(Const S : AnsiString) : AnsiString;
    +Function UpperCase(Const S : string) : string;
       begin
         Result:=InternalChangeCase(S,['a'..'z'],-32);
       end;
    @@ -114,7 +114,7 @@
     
     {   LowerCase returns a copy of S where all uppercase characters ( from A to Z )
         have been converted to lowercase  }
    -Function Lowercase(Const S : AnsiString) : AnsiString;
    +Function Lowercase(Const S : string) : string;
       begin
         Result:=InternalChangeCase(S,['A'..'Z'],32);
       end;
    
  • stringreplace_but_breaktherules.patch (4,812 bytes)
    Index: compiler/defcmp.pas
    ===================================================================
    --- compiler/defcmp.pas	(revision 28822)
    +++ compiler/defcmp.pas	(working copy)
    @@ -522,7 +522,10 @@
                               (tstringdef(def_from).len=tstringdef(def_to).len)) and
                              { for ansi- and unicodestrings also the encoding must match }
                              (not(tstringdef(def_from).stringtype in [st_ansistring,st_unicodestring]) or
    -                          (tstringdef(def_from).encoding=tstringdef(def_to).encoding)) then
    +                          (tstringdef(def_from).encoding=tstringdef(def_to).encoding) or
    +						  (((tstringdef(def_to).encoding=globals.CP_NONE) or (tstringdef(def_to).encoding=0)) and
    +						   (tstringdef(def_from).stringtype=st_ansistring))
    +						 ) then
                             eq:=te_equal
                          else
                            begin
    Index: compiler/htypechk.pas
    ===================================================================
    --- compiler/htypechk.pas	(revision 28822)
    +++ compiler/htypechk.pas	(working copy)
    @@ -2770,6 +2770,7 @@
                         is_ansistring(def_from) and
                         is_ansistring(def_to) and
                         (tstringdef(def_from).encoding<>tstringdef(def_to).encoding) and
    +					((tstringdef(def_to).encoding<>globals.CP_NONE) and (tstringdef(def_to).encoding<>0)) and
                         (currpara.varspez in [vs_var,vs_out]) then
                         eq:=te_convert_l1 // don't allow to pass different ansistring types to each-other
                      else
    Index: rtl/inc/astrings.inc
    ===================================================================
    --- rtl/inc/astrings.inc	(revision 28822)
    +++ rtl/inc/astrings.inc	(working copy)
    @@ -216,10 +216,12 @@
     {$ifdef FPC_HAS_CPSTRING}
       DestCP:=cp;
       if DestCp=CP_NONE then
    -    DestCP:=DefaultSystemCodePage;
    +    DestCP:=0; //DefaultSystemCodePage;
     {$else FPC_HAS_CPSTRING}
       DestCP:=StringCodePage(DestS);
     {$endif FPC_HAS_CPSTRING}
    +  if DestCP=0 then
    +    DestCP:=StringCodePage(DestS);
       DestCP:=TranslatePlaceholderCP(DestCP);
       { if codepages are different then concat using unicodestring,
         but avoid conversions if either addend is empty (StringCodePage will return
    Index: rtl/objpas/sysutils/sysstr.inc
    ===================================================================
    --- rtl/objpas/sysutils/sysstr.inc	(revision 28822)
    +++ rtl/objpas/sysutils/sysstr.inc	(working copy)
    @@ -76,7 +76,7 @@
     Dest := Dest + S;
     end ;
     
    -Function InternalChangeCase(Const S : AnsiString; const Chars: TSysCharSet; const Adjustment: Longint): AnsiString;
    +Function InternalChangeCase(Const S : string; const Chars: TSysCharSet; const Adjustment: Longint): string;
       var
         i : Integer;
         P : PChar;
    @@ -106,7 +106,7 @@
     
     {   UpperCase returns a copy of S where all lowercase characters ( from a to z )
         have been converted to uppercase   }
    -Function UpperCase(Const S : AnsiString) : AnsiString;
    +Function UpperCase(Const S : string) : string;
       begin
         Result:=InternalChangeCase(S,['a'..'z'],-32);
       end;
    @@ -114,7 +114,7 @@
     
     {   LowerCase returns a copy of S where all uppercase characters ( from A to Z )
         have been converted to lowercase  }
    -Function Lowercase(Const S : AnsiString) : AnsiString;
    +Function Lowercase(Const S : string) : string;
       begin
         Result:=InternalChangeCase(S,['A'..'Z'],32);
       end;
    @@ -2635,8 +2635,8 @@
     
     Function StringReplace(const S, OldPattern, NewPattern: string;  Flags: TReplaceFlags): string;
     var
    -  Srch,OldP,RemS: string; // Srch and Oldp can contain uppercase versions of S,OldPattern
    -  P : Integer;
    +  Srch,OldP,RemS,RetTemp: string; // Srch and Oldp can contain uppercase versions of S,OldPattern
    +  P, ENC : Integer;
     begin
       Srch:=S;
       OldP:=OldPattern;
    @@ -2647,22 +2647,25 @@
         end;
       RemS:=S;
       Result:='';
    +  RetTemp:=' ';
    +  ENC:=StringCodepage(S);
    +  SetCodePage(RetTemp,ENC,False);
       while (Length(Srch)<>0) do
         begin
         P:=AnsiPos(OldP, Srch);
         if P=0 then
           begin
    -      Result:=Result+RemS;
    +      RetTemp:=RetTemp+RemS;
           Srch:='';
           end
         else
           begin
    -      Result:=Result+Copy(RemS,1,P-1)+NewPattern;
    +      RetTemp:=RetTemp+Copy(RemS,1,P-1)+NewPattern;
           P:=P+Length(OldP);
           RemS:=Copy(RemS,P,Length(RemS)-P+1);
           if not (rfReplaceAll in Flags) then
             begin
    -        Result:=Result+RemS;
    +        RetTemp:=RetTemp+RemS;
             Srch:='';
             end
           else
    @@ -2669,6 +2672,9 @@
              Srch:=Copy(Srch,P,Length(Srch)-P+1);
           end;
         end;
    +  Result:=pchar(@RetTemp[2]);	// ignore first char, and pchar magic! Is it save memory? or Delete(Result,1,1)?
    +  SetCodePage(Result,ENC,False);
    +  UniqueString(Result);			// it need?
     end;
     
     
    

Activities

Michael Van Canneyt

2012-07-24 14:12

administrator   ~0061200

I cannot reprocude the problem.

The following program:

uses sysutils;

procedure test(s: string);
var i, j: integer;
begin
 i:=pos('€', s); //1
 s:=stringreplace(s, 'anything', 'anything2', []); // do nothing
 j:=pos('€', s); //0
  if i<>j then writeln('Bug!');
end;

begin
  test('€');
end.

does not print anything.
I suspect the bug is not in stringreplace, but in some conversion routine.

Tested with Free Pascal Compiler version 2.7.1 [2012/07/16] for x86_64,
with ansistring and shortstring.

Marco van de Voort

2012-07-24 14:48

manager   ~0061201

Maybe the widestring (?) return of utf8decode assigned to the variant goes wrong.

But first we must have a minimal (preferably, without lazarus dependencies) COMPILABLE program to test this.

ocean

2012-07-24 16:08

reporter   ~0061206

test('€'); works here too.

It was not related to variant, even this gives me the problem. Remove "stringreplace" line, and it works.

var s: string;
begin
 s:=UTF8Encode(UTF8Decode('€'));
 s:=stringreplace(s, 'a', 'b', []);
 if pos('€', s)=0 then showmessage('bug');
end;

Tested 3 computers

WinXP + some older 2.7.1 = Bug
Win7 + version in post0 = Bug
Win7 + 0.9.30/2.6.0 = Works

Michael Van Canneyt

2012-07-24 16:20

administrator   ~0061207

Tested your new program on Linux, it still works.

So it is either windows related, or you are not using the sysutils version of stringreplace, maybe Lazarus has a lazarus-specific version of this routine.
 
Please also check if you are using the FPC variant of the UTF8 routines, or the lazarus specific ones.

I tested without lazarus.

Bart Broersma

2012-07-24 18:25

reporter   ~0061208

This program tests both scenarios (with and without olevriants).

program sr;

{$mode objfpc}{$H+}

uses SysUtils, ActiveX;


function testsr(s: string): Boolean;
var i, j: integer;
begin
 i:=pos('€', s);
 if i <> 1 then writeln('i = ',i,' [should be 1]');
 s:=stringreplace(s, 'anything', 'anything2', []); // do nothing
 j:=pos('€', s);
 if j <> 1 then writeln('j = ',j,' [should be 1]');
 Result := (i=1) and (j=1);
end;

procedure testo;
var o: olevariant;
  b: Boolean;
begin
 o:=UTF8Decode('€');
 b := testsr(UTF8Encode(o));
 write('Test with olevariant = ');
 if b then writeln('Ok') else writeln('Fail');
end;

procedure tests;
var b: Boolean;
begin
 b := testsr(UTF8Encode(UTF8Decode('€')));
 write('Test with string = ');
 if b then writeln('Ok') else writeln('Fail');
end;

begin
  testo;
  tests;
end.

C:\Users\Bart\LazarusProjecten\ConsoleProjecten\bugs\StringReplace>fpc sr.lpr
Free Pascal Compiler version 2.6.0 [2011/12/25] for i386
Copyright (c) 1993-2011 by Florian Klaempfl and others
Target OS: Win32 for i386
Compiling sr.lpr
Linking sr.exe
41 lines compiled, 0.3 sec , 150128 bytes code, 22668 bytes data

C:\Users\Bart\LazarusProjecten\ConsoleProjecten\bugs\StringReplace>sr
Test with olevariant = Ok
Test with string = Ok

Tested on Win7

ocean

2012-07-24 19:28

reporter   ~0061210

I tested your program, Win7/32, locale finnish.

You used 2.6.0, that works here too.

C:\lazarus\fpc\2.7.1\bin\i386-win32>fpc c:/bug/sr.lpr
Free Pascal Compiler version 2.7.1 [2012/05/23] for i386
Copyright (c) 1993-2012 by Florian Klaempfl and others
Target OS: Win32 for i386
Compiling c:\bug\sr.lpr
Linking c:\bug\sr.exe
39 lines compiled, 3.8 sec, 149984 bytes code, 24972 bytes data

C:\lazarus\fpc\2.7.1\bin\i386-win32>c:/bug/sr
i = 0 [should be 1]
j = 0 [should be 1]
Test with olevariant = Fail
i = 0 [should be 1]
j = 0 [should be 1]
Test with string = Fail

ocean

2012-07-24 19:47

reporter   ~0061211

Try 2, (Coding broke, when I copied it from here to my texteditor, sorry)

C:\lazarus\fpc\2.7.1\bin\i386-win32>fpc c:/bug/sr2.lpr
Free Pascal Compiler version 2.7.1 [2012/05/23] for i386
Copyright (c) 1993-2012 by Florian Klaempfl and others
Target OS: Win32 for i386
Compiling c:\bug\sr2.lpr
Linking c:\bug\sr2.exe
40 lines compiled, 3.6 sec, 150000 bytes code, 24972 bytes data

C:\lazarus\fpc\2.7.1\bin\i386-win32>c:/bug/sr2
j = 0 [should be 1]
Test with olevariant = Fail
j = 0 [should be 1]
Test with string = Fail

Bart Broersma

2012-07-24 20:56

reporter   ~0061212

Last edited: 2012-07-24 21:07

This should fix any copy/paste problems, and it will tell us the content of S if things go wrong.

program sr;

{$mode objfpc}{$H+}

uses SysUtils, ActiveX;

const
  EUR = Chr(226) + Chr(130) + Chr(172); //UTF-8 sequnece for the Euro symbol


function testsr(s: string): Boolean;
var x, i, j: integer;
begin
 i:=pos(EUR, s);
 if i <> 1 then
 begin
   writeln('Before StringReplace: i = ',i,' [should be 1]');
   write('EUR = ');
   for x := 1 to length(EUR) do write('#',Ord(EUR[x]),' '); writeln;
   write('S = ');
   for x := 1 to length(s) do write('#',Ord(s[x]),' '); writeln;
 end;
 s:=stringreplace(s, 'anything', 'anything2', []); // do nothing
 j:=pos(EUR, s);
 if j <> 1 then
 begin
   writeln('Before StringReplace: j = ',j,' [should be 1]');
   write('EUR = ');
   for x := 1 to length(EUR) do write('#',Ord(EUR[x]),' '); writeln;
   write('S = ');
   for x := 1 to length(s) do write('#',Ord(s[x]),' '); writeln;
 end;
 Result := (i=1) and (j=1);
end;

procedure testo;
var o: olevariant;
  b: Boolean;
begin
 o:=UTF8Decode(EUR);
 b := testsr(UTF8Encode(o));
 write('Test with olevariant = ');
 if b then writeln('Ok') else writeln('Fail');
end;

procedure tests;
var b: Boolean;
begin
 b := testsr(UTF8Encode(UTF8Decode(EUR)));
 write('Test with string = ');
 if b then writeln('Ok') else writeln('Fail');
end;

begin
  testo;
  tests;
end.


Please re-test.

ocean

2012-07-24 21:15

reporter   ~0061213

C:\lazarus\fpc\2.7.1\bin\i386-win32>fpc c:/bug/sr.lpr
Free Pascal Compiler version 2.7.1 [2012/05/23] for i386
Copyright (c) 1993-2012 by Florian Klaempfl and others
Target OS: Win32 for i386
Compiling c:\bug\sr.lpr
Linking c:\bug\sr.exe
56 lines compiled, 3.7 sec, 150704 bytes code, 25036 bytes data

C:\lazarus\fpc\2.7.1\bin\i386-win32>c:/bug/sr
Before StringReplace: j = 0 [should be 1]
EUR = 0000226 0000130 0000172
S = 0000128
Test with olevariant = Fail
Before StringReplace: j = 0 [should be 1]
EUR = 0000226 0000130 0000172
S = 0000128
Test with string = Fail

Ludo Brands

2012-07-24 21:41

developer   ~0061214

Last edited: 2012-07-24 22:32

I can reproduce the problem with fpc 2.7.1 21643. Program attached. Replace var o: variant; with var o: olevariant; and the program will crash with a EAccessViolation. But that is probably yet another bug.
At i:=pos('€', s); s contains 0xe2 0x82 0xac 0x00 which is utf8 for €. At j:=pos('€', s); s contains 0x80 0x00.
Stringreplace is from sysstr.inc in the rtl.
Did some debugging and the problem is in fpc_AnsiStr_Concat. StringReplace does a Result:=Result+RemS; with result being ''. fpc_AnsiStr_Concat does
  if (Pointer(DestS)=nil) then
    DestCP:=cp

and forces the codepage to the system code page which is 1252 on windows.

Edit: Olevariant crash reported as 0022504

2012-07-24 21:42

 

test.pas (357 bytes)
program test;

{$mode objfpc}{$H+}

uses
  Classes,sysutils;

procedure test(s: string);
var i, j: integer;
begin
 i:=pos('€', s); //1
 s:=stringreplace(s, 'anything', 'anything2', []); // do nothing
 j:=pos('€', s); //0
 if i<>j then writeln('Bug!');
end;

var o: variant;
begin
 o:=UTF8Decode('€');
 test(UTF8Encode(o));
end.

test.pas (357 bytes)

Bart Broersma

2012-07-24 22:43

reporter   ~0061215

Does the program (my latest version from the notes above) still fail if you leave out the utf8Encode(utf8 decode()) part?
It should if your analysis is correct (which would mean that many string functions would suffer from this?).

Ludo Brands

2012-07-25 08:16

developer   ~0061229

@ Bart. Are you kidding? "we must have a minimal (preferably, without lazarus dependencies) COMPILABLE program to test this". Is 357 bytes not minimal enough? Why do we need to test your program when you have a program that reproduces the problem.

A shorter version if you have problems downloading the attached file and that shows the extend of the problem (line second bug):

program test2;

var s,snul:string;
  s1:string;
begin
  snul:='';
  s:=utf8encode(utf8decode('€'));
  s1:=snul+s;
  if s[1]<>s1[1] then
    begin
    writeln('bug');
    if s=s1 then
      writeln('a second bug');
    end;
end.

The s=s1 works because before doing the compare both sides are converted to the system code page. Reason why I did a s[1]<>s1[1] before. s1 does not contain a €.
The problem is linked to utf8Encode returning a RawByteString. To illustrate the problem with concatenating RawByteString and string try this one (even shorter and using your preferred € encoding):

program test2;

var sr:RawByteString;
  snul,s1:string;
begin
  snul:='';
  sr:=Chr(226) + Chr(130) + Chr(172);
  s1:=snul+sr;
  if sr<>s1 then
    writeln('bug');
end.

In this test s1 is empty which is even more surprising. Using an intermediate string for the concat doesn't change the result:

program test2;

var sr:RawByteString;
  snul,s,s1:string;
begin
  snul:='';
  sr:=Chr(226) + Chr(130) + Chr(172);
  s:=sr;
  s1:=snul+s;
  if sr<>s1 then
    writeln('bug');
end.

s contains € but s1 is '' again.

Sergei Gorelkin

2012-07-25 10:10

developer   ~0061235

Make no mistake, in 2.7.1 the generic 'string' type and all functions using it as arguments or result types are generally unusable in Windows if you deal with data outside of ANSI codepage. In will drop data outside ANSI codepage. Hacks like utf8encode which actually permit the result string to contain data in different encoding than its declaration soften some corners but don't help in general.

In D2009+ this model is usable because 'string' aliases to UnicodeString. It is also usable in Linux because of its default utf8 codepage.

To make it work properly, each and every procedure/function working with strings, excluding a few basic ones like concat/insert/delete (these are already fixed), has to be changed to accept RawByteString arguments, examine actual encoding of arguments and do necessary conversions. While in principle this can be done for RTL/packages codebase, I remain very pessimistic about possibility for people to write their own string functions using the new model.

Ludo Brands

2012-07-25 11:47

developer   ~0061238

> excluding a few basic ones like concat/insert/delete (these are already fixed)

except that in case of concat with mixed encodings the results goes through a unicode to system encoding conversion even if one side is null. Concat with an empty string should be a zero operation.

BTW The empty result string in the last tests is caused by Win32Ansi2UnicodeMove of a RawByteString that calls MultiByteToWideChar with cp=$ffff (CP_NONE) which is not supported.

theo

2012-07-25 12:26

reporter   ~0061239

Don't know if exactly the same, but I had some problems with StringReplace on Lazarus 1.1 r FPC 2.7.1 x86_64-win64-win32/win64 some minutes ago.

Everything works as I expect when using this code:

initialization
widestringmanager.Unicode2AnsiMoveProc:=@DefaultUnicode2AnsiMove;
widestringmanager.Ansi2UnicodeMoveProc:=@DefaultAnsi2UnicodeMove;

Sergei Gorelkin

2012-07-25 13:07

developer   ~0061240

Here you use RawByteString to hack a utf8 sequence into an ansistring which cannot contain one. This isn't going to work (and RawByteString is intended to be used only along with manual manipulations on codepage field -- but since nothing prevents its use as generic type, similar issues are going to pop up endlessly, I guess).

fpc_ansistr_concat routine assumes codepage of an empty argument equal to destination codepage, so it won't perform conversion if codepage of another argument matches destination too. If another argument has different codepage then conversion will still occur (and it will go through unicode because direct conversion from one non-unicode codepage to another is not possible). I believe this behavior is correct.

fpc_ansistr_concat_multi does not follow this pattern though, it likely needs a fix.

Bart Broersma

2012-07-25 14:11

reporter   ~0061241

@Sergei:
> Make no mistake, in 2.7.1 the generic 'string' type and all functions using it
> as arguments or result types are generally unusable in Windows if you deal with
> data outside of ANSI codepage.

Maybe I misunderstand this, but it seems to me that this would make all fpc string operations almost useless in Lazarus on Windows, since LCL is UTF-8 by nature and Windows isn't.

@Ludo: forgive my ignorance. Whay I mainly wanted to know was:

program test2;

var s,snul:string;
  s1:string;
begin
  snul:='';
  s:=utf8encode(utf8decode('€')); //<<-- ****
  s1:=snul+s;
  if s[1]<>s1[1] then
    begin
    writeln('bug');
    if s=s1 then
      writeln('a second bug');
    end;
end.


If you leave out the utf8encode(utf8decode('€')) part and simply use s := '€', does this bug still happen? IOW, what part does the utf8encode(utf8decode()) play in all of this.
(Alo my test offerd some insight into the content of the strings.)

Sergei Gorelkin

2012-07-25 14:58

developer   ~0061243

@Bart:
>Maybe I misunderstand this, but it seems to me that this would make all fpc >string operations almost useless in Lazarus on Windows, since LCL is UTF-8 by >nature and Windows isn't.

You understand correctly. Either LCL will have to change all 'string' declarations into 'utf8string' to work properly in utf8 encoding, or FPC will have to introduce means to redefine 'string' type globally as utf8string.

Jonas Maebe

2012-07-25 15:49

manager   ~0061245

Or the LCL (or the LCL program) could set DefaultSystemCodePage to CP_UTF8.

Do-wan Kim

2014-10-15 01:37

reporter   ~0078232

Last edited: 2014-10-15 09:09

View 6 revisions

It's simply fixed by ignore ansistring encoding type in parameter with some functions and procedures.

patch ignores string encoding defined type "string" and "rawbytestring" in paramaters in any func or proc. And ansi* named proc and func should be change "ansisting" in parameters, but it was not done in patch.

(edit) patch works no problem under linux.

oops, my patch is wrong. Sorry, forget it.

Do-wan Kim

2014-10-15 02:20

reporter  

ignore_ansistring_encoding_somefunc.patch (2,761 bytes)
Index: compiler/defcmp.pas
===================================================================
--- compiler/defcmp.pas	(revision 28821)
+++ compiler/defcmp.pas	(working copy)
@@ -522,7 +522,10 @@
                           (tstringdef(def_from).len=tstringdef(def_to).len)) and
                          { for ansi- and unicodestrings also the encoding must match }
                          (not(tstringdef(def_from).stringtype in [st_ansistring,st_unicodestring]) or
-                          (tstringdef(def_from).encoding=tstringdef(def_to).encoding)) then
+                          (tstringdef(def_from).encoding=tstringdef(def_to).encoding) or
+						  (((tstringdef(def_to).encoding=globals.CP_NONE) or (tstringdef(def_to).encoding=0)) and
+						   (tstringdef(def_from).stringtype=st_ansistring))
+						 ) then
                         eq:=te_equal
                      else
                        begin
Index: compiler/htypechk.pas
===================================================================
--- compiler/htypechk.pas	(revision 28821)
+++ compiler/htypechk.pas	(working copy)
@@ -2770,6 +2770,7 @@
                     is_ansistring(def_from) and
                     is_ansistring(def_to) and
                     (tstringdef(def_from).encoding<>tstringdef(def_to).encoding) and
+					((tstringdef(def_to).encoding<>globals.CP_NONE) and (tstringdef(def_to).encoding<>0)) and
                     (currpara.varspez in [vs_var,vs_out]) then
                     eq:=te_convert_l1 // don't allow to pass different ansistring types to each-other
                  else
Index: rtl/objpas/sysutils/sysstr.inc
===================================================================
--- rtl/objpas/sysutils/sysstr.inc	(revision 28821)
+++ rtl/objpas/sysutils/sysstr.inc	(working copy)
@@ -76,7 +76,7 @@
 Dest := Dest + S;
 end ;
 
-Function InternalChangeCase(Const S : AnsiString; const Chars: TSysCharSet; const Adjustment: Longint): AnsiString;
+Function InternalChangeCase(Const S : string; const Chars: TSysCharSet; const Adjustment: Longint): string;
   var
     i : Integer;
     P : PChar;
@@ -106,7 +106,7 @@
 
 {   UpperCase returns a copy of S where all lowercase characters ( from a to z )
     have been converted to uppercase   }
-Function UpperCase(Const S : AnsiString) : AnsiString;
+Function UpperCase(Const S : string) : string;
   begin
     Result:=InternalChangeCase(S,['a'..'z'],-32);
   end;
@@ -114,7 +114,7 @@
 
 {   LowerCase returns a copy of S where all uppercase characters ( from A to Z )
     have been converted to lowercase  }
-Function Lowercase(Const S : AnsiString) : AnsiString;
+Function Lowercase(Const S : string) : string;
   begin
     Result:=InternalChangeCase(S,['A'..'Z'],32);
   end;

Do-wan Kim

2014-10-15 13:16

reporter   ~0078241

"stringreplace" result type is used in "fpc_AnsiStr_Concat" "cp" parameter, if result type is "string", "cp" is always 0 that compiler read only defined not actual encoding from string variable. and then "Destcp" changed systemdefaultcodepage. if systemcodepage is utf-8, "cp(destcp)" is 65001, and no problem but it's buggy not noticed it.

I changed it, but I got another problem.

procedure fpc_AnsiStr_Concat (var DestS:RawByteString;const S1,S2 : RawByteString{$ifdef FPC_HAS_CPSTRING};cp : TSystemCodePage{$endif FPC_HAS_CPSTRING}); compilerproc;
Var
  S1Len, S2Len: SizeInt;
  same : boolean;
  S1CP, S2CP, DestCP: TSystemCodePage;
begin
{$ifdef FPC_HAS_CPSTRING}
  DestCP:=cp; // <- cp always defined value.
  if DestCp=CP_NONE then
    DestCP:=0; //DefaultSystemCodePage;
{$else FPC_HAS_CPSTRING}
  DestCP:=StringCodePage(DestS);
{$endif FPC_HAS_CPSTRING}
  if DestCP=0 then
    DestCP:=StringCodePage(DestS); // <--- get string variables acutual codepage.
  DestCP:=TranslatePlaceholderCP(DestCP);
  { if codepages are different then concat using unicodestring,
    but avoid conversions if either addend is empty (StringCodePage will return
    DefaultSystemCodePage in that case, which may differ from other addend/dest) }
...
  if (S1CP<>DestCP) or (S2CP<>DestCP) then
    begin
      ansistr_concat_complex(DestS,S1,S2,DestCP); // <--- now DestCP corrent encodig.
      exit;
    end;
  { only assign if s1 or s2 is empty }


Function StringReplace(const S, OldPattern, NewPattern: string; Flags: TReplaceFlags): string;
var
  Srch,OldP,RemS: string; // Srch and Oldp can contain uppercase versions of S,OldPattern
  P, c : Integer;
begin
  Srch:=S;
  OldP:=OldPattern;
  if rfIgnoreCase in Flags then
    begin
    Srch:=AnsiUpperCase(Srch);
    OldP:=AnsiUpperCase(OldP);
    end;
  RemS:=S;
  Result:=' '; // <--- if '', cannot get string encoding infomation.
  SetCodePage(Result,StringCodepage(S),False);
  while (Length(Srch)<>0) do
    begin
    P:=AnsiPos(OldP, Srch);
    if P=0 then
      begin
      Result:=Result+RemS;
      Srch:='';
      end
    else
      begin
      Result:=Result+Copy(RemS,1,P-1)+NewPattern;
      P:=P+Length(OldP);
      RemS:=Copy(RemS,P,Length(RemS)-P+1);
      if not (rfReplaceAll in Flags) then
        begin
        Result:=Result+RemS;
        Srch:='';
        end
      else
         Srch:=Copy(Srch,P,Length(Srch)-P+1);
      end;
    end;
  Delete(Result,1,1); // <-- delete first space value. but slow
end;

This problem also happen similar functions, returns result from source string encoding.

Jonas Maebe

2014-10-15 13:27

manager   ~0078242

You cannot change the implementation of fpc_AnsiStr_Concat() like that. See http://wiki.freepascal.org/FPC_Unicode_support#String_concatenations for details on how it must work.

It also won't help because the parameters to StringReplace are defined as Ansistring (= AnsiString(0)), and hence they will be converted to DefaultSystemCodePage on the calling side already.

The only way this can be solved is by changing the StringReplace() parameter types to RawByteString, inserting the necessary codepage conversions explicitly in the code inside that routine, and then using operations that are guaranteed not to insert any further code page conversions (when code page conversions are performed is documented on the aforementioned wiki page).

Do-wan Kim

2014-10-15 14:02

reporter  

stringreplace_but_breaktherules.patch (4,812 bytes)
Index: compiler/defcmp.pas
===================================================================
--- compiler/defcmp.pas	(revision 28822)
+++ compiler/defcmp.pas	(working copy)
@@ -522,7 +522,10 @@
                           (tstringdef(def_from).len=tstringdef(def_to).len)) and
                          { for ansi- and unicodestrings also the encoding must match }
                          (not(tstringdef(def_from).stringtype in [st_ansistring,st_unicodestring]) or
-                          (tstringdef(def_from).encoding=tstringdef(def_to).encoding)) then
+                          (tstringdef(def_from).encoding=tstringdef(def_to).encoding) or
+						  (((tstringdef(def_to).encoding=globals.CP_NONE) or (tstringdef(def_to).encoding=0)) and
+						   (tstringdef(def_from).stringtype=st_ansistring))
+						 ) then
                         eq:=te_equal
                      else
                        begin
Index: compiler/htypechk.pas
===================================================================
--- compiler/htypechk.pas	(revision 28822)
+++ compiler/htypechk.pas	(working copy)
@@ -2770,6 +2770,7 @@
                     is_ansistring(def_from) and
                     is_ansistring(def_to) and
                     (tstringdef(def_from).encoding<>tstringdef(def_to).encoding) and
+					((tstringdef(def_to).encoding<>globals.CP_NONE) and (tstringdef(def_to).encoding<>0)) and
                     (currpara.varspez in [vs_var,vs_out]) then
                     eq:=te_convert_l1 // don't allow to pass different ansistring types to each-other
                  else
Index: rtl/inc/astrings.inc
===================================================================
--- rtl/inc/astrings.inc	(revision 28822)
+++ rtl/inc/astrings.inc	(working copy)
@@ -216,10 +216,12 @@
 {$ifdef FPC_HAS_CPSTRING}
   DestCP:=cp;
   if DestCp=CP_NONE then
-    DestCP:=DefaultSystemCodePage;
+    DestCP:=0; //DefaultSystemCodePage;
 {$else FPC_HAS_CPSTRING}
   DestCP:=StringCodePage(DestS);
 {$endif FPC_HAS_CPSTRING}
+  if DestCP=0 then
+    DestCP:=StringCodePage(DestS);
   DestCP:=TranslatePlaceholderCP(DestCP);
   { if codepages are different then concat using unicodestring,
     but avoid conversions if either addend is empty (StringCodePage will return
Index: rtl/objpas/sysutils/sysstr.inc
===================================================================
--- rtl/objpas/sysutils/sysstr.inc	(revision 28822)
+++ rtl/objpas/sysutils/sysstr.inc	(working copy)
@@ -76,7 +76,7 @@
 Dest := Dest + S;
 end ;
 
-Function InternalChangeCase(Const S : AnsiString; const Chars: TSysCharSet; const Adjustment: Longint): AnsiString;
+Function InternalChangeCase(Const S : string; const Chars: TSysCharSet; const Adjustment: Longint): string;
   var
     i : Integer;
     P : PChar;
@@ -106,7 +106,7 @@
 
 {   UpperCase returns a copy of S where all lowercase characters ( from a to z )
     have been converted to uppercase   }
-Function UpperCase(Const S : AnsiString) : AnsiString;
+Function UpperCase(Const S : string) : string;
   begin
     Result:=InternalChangeCase(S,['a'..'z'],-32);
   end;
@@ -114,7 +114,7 @@
 
 {   LowerCase returns a copy of S where all uppercase characters ( from A to Z )
     have been converted to lowercase  }
-Function Lowercase(Const S : AnsiString) : AnsiString;
+Function Lowercase(Const S : string) : string;
   begin
     Result:=InternalChangeCase(S,['A'..'Z'],32);
   end;
@@ -2635,8 +2635,8 @@
 
 Function StringReplace(const S, OldPattern, NewPattern: string;  Flags: TReplaceFlags): string;
 var
-  Srch,OldP,RemS: string; // Srch and Oldp can contain uppercase versions of S,OldPattern
-  P : Integer;
+  Srch,OldP,RemS,RetTemp: string; // Srch and Oldp can contain uppercase versions of S,OldPattern
+  P, ENC : Integer;
 begin
   Srch:=S;
   OldP:=OldPattern;
@@ -2647,22 +2647,25 @@
     end;
   RemS:=S;
   Result:='';
+  RetTemp:=' ';
+  ENC:=StringCodepage(S);
+  SetCodePage(RetTemp,ENC,False);
   while (Length(Srch)<>0) do
     begin
     P:=AnsiPos(OldP, Srch);
     if P=0 then
       begin
-      Result:=Result+RemS;
+      RetTemp:=RetTemp+RemS;
       Srch:='';
       end
     else
       begin
-      Result:=Result+Copy(RemS,1,P-1)+NewPattern;
+      RetTemp:=RetTemp+Copy(RemS,1,P-1)+NewPattern;
       P:=P+Length(OldP);
       RemS:=Copy(RemS,P,Length(RemS)-P+1);
       if not (rfReplaceAll in Flags) then
         begin
-        Result:=Result+RemS;
+        RetTemp:=RetTemp+RemS;
         Srch:='';
         end
       else
@@ -2669,6 +2672,9 @@
          Srch:=Copy(Srch,P,Length(Srch)-P+1);
       end;
     end;
+  Result:=pchar(@RetTemp[2]);	// ignore first char, and pchar magic! Is it save memory? or Delete(Result,1,1)?
+  SetCodePage(Result,ENC,False);
+  UniqueString(Result);			// it need?
 end;
 
 

Do-wan Kim

2014-10-15 14:03

reporter   ~0078243

Last edited: 2014-10-16 02:42

View 3 revisions

yep, it break the rules xD

(edit) I guess fpc rtl have more functions like setstring and strcat with encoding parameter for relible string manupulations and remove strict type checking of rawbytestring on parameters. It's not only stringreplace problem.

setstring(..,.enc);
strcat(...,enc);

Michael Van Canneyt

2016-03-27 12:31

administrator   ~0091476

You must use the unicodestring version of stringreplace.
StringReplace uses ansistring, that will not work with UTF8, because it is not intended to work with UTF8.

Following works just fine:

program test;

{$mode objfpc}{$H+}
{$codepage utf8}

uses
  Classes,sysutils;

procedure test(s: unicodestring);
var i, j: integer;
begin
 i:=pos('€', s); //1
 s:=stringreplace(s, 'anything', 'anything2', []); // do nothing
 j:=pos('€', s); //0
 if i<>j then writeln('Bug!');
end;

var o: variant;
begin
 o:=UTF8Decode('€');
 test(UTF8Encode(o));
end.

Issue History

Date Modified Username Field Change
2012-07-24 13:42 ocean New Issue
2012-07-24 14:12 Michael Van Canneyt Note Added: 0061200
2012-07-24 14:48 Marco van de Voort Note Added: 0061201
2012-07-24 15:36 Marco van de Voort Status new => feedback
2012-07-24 16:08 ocean Note Added: 0061206
2012-07-24 16:20 Michael Van Canneyt Note Added: 0061207
2012-07-24 18:25 Bart Broersma Note Added: 0061208
2012-07-24 19:28 ocean Note Added: 0061210
2012-07-24 19:47 ocean Note Added: 0061211
2012-07-24 20:56 Bart Broersma Note Added: 0061212
2012-07-24 20:57 Bart Broersma Note Edited: 0061212
2012-07-24 20:58 Bart Broersma Note Edited: 0061212
2012-07-24 21:05 Bart Broersma Note Edited: 0061212
2012-07-24 21:06 Bart Broersma Note Edited: 0061212
2012-07-24 21:07 Bart Broersma Note Edited: 0061212
2012-07-24 21:15 ocean Note Added: 0061213
2012-07-24 21:41 Ludo Brands Note Added: 0061214
2012-07-24 21:42 Ludo Brands File Added: test.pas
2012-07-24 22:32 Ludo Brands Note Edited: 0061214
2012-07-24 22:43 Bart Broersma Note Added: 0061215
2012-07-25 08:16 Ludo Brands Note Added: 0061229
2012-07-25 10:10 Sergei Gorelkin Note Added: 0061235
2012-07-25 11:47 Ludo Brands Note Added: 0061238
2012-07-25 12:26 theo Note Added: 0061239
2012-07-25 13:07 Sergei Gorelkin Note Added: 0061240
2012-07-25 14:11 Bart Broersma Note Added: 0061241
2012-07-25 14:58 Sergei Gorelkin Note Added: 0061243
2012-07-25 15:27 Marco van de Voort Status feedback => confirmed
2012-07-25 15:49 Jonas Maebe Note Added: 0061245
2014-10-15 01:37 Do-wan Kim Note Added: 0078232
2014-10-15 02:20 Do-wan Kim File Added: ignore_ansistring_encoding_somefunc.patch
2014-10-15 02:25 Do-wan Kim Note Edited: 0078232 View Revisions
2014-10-15 02:25 Do-wan Kim Note Edited: 0078232 View Revisions
2014-10-15 02:26 Do-wan Kim Note Edited: 0078232 View Revisions
2014-10-15 03:54 Do-wan Kim Note Edited: 0078232 View Revisions
2014-10-15 09:09 Do-wan Kim Note Edited: 0078232 View Revisions
2014-10-15 13:16 Do-wan Kim Note Added: 0078241
2014-10-15 13:27 Jonas Maebe Note Added: 0078242
2014-10-15 14:02 Do-wan Kim File Added: stringreplace_but_breaktherules.patch
2014-10-15 14:03 Do-wan Kim Note Added: 0078243
2014-10-16 02:41 Do-wan Kim Note Edited: 0078243 View Revisions
2014-10-16 02:42 Do-wan Kim Note Edited: 0078243 View Revisions
2016-03-27 12:31 Michael Van Canneyt Note Added: 0091476
2016-03-27 12:31 Michael Van Canneyt Status confirmed => resolved
2016-03-27 12:31 Michael Van Canneyt Resolution open => no change required
2016-03-27 12:31 Michael Van Canneyt Assigned To => Michael Van Canneyt