View Issue Details

IDProjectCategoryView StatusLast Update
0026864FPCRTLpublic2017-07-28 01:08
ReporterJarto TarpioAssigned ToMichael Van Canneyt 
PrioritynormalSeverityminorReproducibilityalways
Status resolvedResolutionfixed 
Product Version2.6.4Product Build 
Target Version3.0.2Fixed in Version3.1.1 
Summary0026864: New, faster StringReplace
DescriptionRTL's StringReplace is extremely slow. It does a lot of string copy and concatenation - even with cases where the search and replace patterns are as long. I noticed this with Indy10, which uses StringReplace to replace '+' with ' ' while parsing http POST data.
Steps To ReproduceTry StringReplace(StringOfChar('a',200000),'a','b',[rfReplaceAll]);
Additional InformationAttached is a new, faster version, which I'd like to contribute to FPC's RTL. With a string of just 50K, it's up to 150 times faster than FPC's current implementation. When the string is 200K, it's up to 1000 times faster.

All string concatenations are eliminated. The general idea is to calculate the length of the result string, set the length once and only manipulate the contents of it.

The new implementation uses PosEx so StrUtils is needed. I wonder if it's a problem?
TagsNo tags attached.
Fixed in Revision33055
FPCOldBugId
FPCTarget
Attached Files
  • strreplace.txt (2,057 bytes)
    Function StringReplace(const S, OldPattern, NewPattern: string;  Flags: TReplaceFlags): string;
    var
      OldPat,Srch: string; // Srch and Oldp can contain uppercase versions of S,OldPattern
      PatLength,NewPatLength,P,Cnt,PatCount,PrevP: Integer;
      c,d: pchar;
    begin
      PatLength:=Length(OldPattern);
      if PatLength=0 then begin
        Result:=S;
        exit;
      end;
    
      if rfIgnoreCase in Flags then begin
        Srch:=AnsiUpperCase(S);
        OldPat:=AnsiUpperCase(OldPattern);
      end else begin
        Srch:=S;
        OldPat:=OldPattern;
      end;
    
      PatLength:=Length(OldPat);
      if Length(NewPattern)=PatLength then begin
        //Result length will not change
        Result:=S;
        P:=1;
        repeat
          P:=PosEx(OldPat,Srch,P);
          if P>0 then begin
            move(NewPattern[1],Result[P],PatLength);
            if not (rfReplaceAll in Flags) then exit;
            inc(P,PatLength);
          end;
        until p=0;
      end else begin
        //Different pattern length -> Result length will change
        //To avoid creating a lot of temporary strings, we count how many
        //replacements we're going to make.
        P:=1; PatCount:=0;
        repeat
          P:=PosEx(OldPat,Srch,P);
          if P>0 then begin
            inc(P,PatLength);
            inc(PatCount);
            if not (rfReplaceAll in Flags) then break;
          end;
        until p=0;
        if PatCount=0 then begin
          Result:=S;
          exit;
        end;
        NewPatLength:=Length(NewPattern);
        SetLength(Result,Length(S)+PatCount*(NewPatLength-PatLength));
        P:=1; PrevP:=0;
        c:=pchar(Result); d:=pchar(S);
        repeat
          P:=PosEx(OldPat,Srch,P);
          if P>0 then begin
            Cnt:=P-PrevP-1;
            if Cnt>0 then begin
              Move(d^,c^,Cnt);
              inc(c,Cnt);
              inc(d,Cnt);
            end;
            if NewPatLength>0 then begin
              Move(NewPattern[1],c^,NewPatLength);
              inc(c,NewPatLength);
            end;
            inc(P,PatLength);
            inc(d,PatLength);
            PrevP:=P-1;
            if not (rfReplaceAll in Flags) then break;
          end;
        until p=0;
        Cnt:=Length(S)-PrevP;
        if Cnt>0 then Move(d^,c^,Cnt);
      end;
    end;
    
    strreplace.txt (2,057 bytes)
  • compareString.diff (2,818 bytes)
    Index: sysstr.inc
    ===================================================================
    --- sysstr.inc	(revision 28884)
    +++ sysstr.inc	(working copy)
    @@ -2635,40 +2635,94 @@
     
     Function StringReplace(const S, OldPattern, NewPattern: string;  Flags: TReplaceFlags): string;
     var
    -  Srch,OldP,RemS: string; // Srch and Oldp can contain uppercase versions of S,OldPattern
    -  P : Integer;
    +  Srch,OldP: string; // Srch and Oldp can contain uppercase versions of S,OldPattern
    +  I: SizeInt;
    +  R: SizeInt;   // Result copy index
    +  SC: SizeInt;  // S copy index
    +  MaxLen: SizeInt;
    +  NewPatLen: SizeInt;
    +  Replaced: Boolean;
    +
    +  procedure CopySource(ToIdx: Integer; ReplacePat: Boolean);
    +  var
    +    SrcCnt: SizeInt;
    +    Sz : SizeInt;
    +    NeedSz: SizeInt;
    +    NeedCnt: SizeInt;
    +  begin
    +    SrcCnt:=ToIdx-SC;
    +    Sz:=length(Result);
    +
    +    NeedCnt:=SrcCnt;
    +    if ReplacePat then Inc(NeedCnt, NewPatLen);
    +
    +    If NeedCnt>0 then
    +      begin
    +      NeedSz:=R-1+NeedCnt;
    +      if Sz<NeedSz then
    +        begin
    +        while Sz<NeedSz do
    +          if Sz=0 then Sz:=4 else Sz:=Sz*2;
    +        SetLength(Result, Sz);
    +        end;
    +      Move(S[SC], Result[R], SrcCnt);
    +      inc(SC, SrcCnt);
    +      inc(R,SrcCnt);
    +      end;
    +
    +    if ReplacePat then
    +      begin
    +      inc(SC, Length(OldPattern));
    +      if (NewPatLen>0) then
    +        begin
    +        Move(NewPattern[1], Result[R], NewPatLen);
    +        inc(R, NewPatLen);
    +        end;
    +      end;
    +  end;
    +
     begin
    +  if OldPattern='' then
    +    begin
    +    Result:=S;
    +    Exit;
    +    end;
       Srch:=S;
       OldP:=OldPattern;
    +  NewPatLen:=length(NewPattern);
       if rfIgnoreCase in Flags then
         begin
         Srch:=AnsiUpperCase(Srch);
         OldP:=AnsiUpperCase(OldP);
         end;
    -  RemS:=S;
       Result:='';
    -  while (Length(Srch)<>0) do
    +  I:=1;
    +  SC:=1;
    +  R:=1;
    +  MaxLen:=length(Srch)-Length(OldP)+1;
    +  Replaced:=false;
    +  while I<=MaxLen do
         begin
    -    P:=AnsiPos(OldP, Srch);
    -    if P=0 then
    +    if (Srch[I]=OldP[1]) and (CompareByte(Srch[I], OldP[1], length(OldP))=0) then
           begin
    -      Result:=Result+RemS;
    -      Srch:='';
    +      Replaced:=true;
    +      CopySource(I, True);
    +      inc(I, Length(OldP));
    +      if not (rfReplaceAll in Flags) then Break;
           end
         else
    -      begin
    -      Result:=Result+Copy(RemS,1,P-1)+NewPattern;
    -      P:=P+Length(OldP);
    -      RemS:=Copy(RemS,P,Length(RemS)-P+1);
    -      if not (rfReplaceAll in Flags) then
    -        begin
    -        Result:=Result+RemS;
    -        Srch:='';
    -        end
    -      else
    -         Srch:=Copy(Srch,P,Length(Srch)-P+1);
    -      end;
    +      Inc(I);
         end;
    +
    +  if Replaced then
    +    begin
    +    // at least one replacement has occurred
    +    CopySource(Length(S)+1, False);
    +    SetLength(Result, R-1);
    +    end
    +  else
    +    // no old patterns found
    +    Result:=S;
     end;
     
     
    
    compareString.diff (2,818 bytes)
  • ustringreplace.pas (19,898 bytes)
    unit ustringreplace;
    
    {$mode objfpc}{$H+}
    
    interface
    
    uses
      SysUtils;
    
    function StringReplaceX(const S, OldPattern, NewPattern: ansistring; Flags: TReplaceFlags): ansistring;
    
    function StringReplaceBoyerMoore (const S, OldPattern, NewPattern: ansistring; Flags: TReplaceFlags): ansistring;
    
    implementation
    
    type
      SizeIntArray = array of SizeInt;
    
    procedure FindMatchesBoyerMooreCaseSensitive(const S,OldPattern: PChar; const SSize, OldPatternSize: SizeInt; out aMatches: SizeIntArray; const aMatchAll: Boolean); forward;
    procedure FindMatchesBoyerMooreCaseINSensitive(const S, OldPattern: PChar; const SSize, OldPatternSize: SizeInt; out aMatches: SizeIntArray; const aMatchAll: Boolean); forward;
    
    (*
      StringReplace
    
      Replaces one or many ocurrences of an ansistring in another ansistring by a new one.
      It can perform the compare ignoring case (ansi).
    
      * Parameters (Read only):
      S: The string to be searched in.
      OldPattern: The string to be searched.
      NewPattern: The string to replace OldPattern matches.
      Flags:
        rfReplaceAll: Replace all occurrences.
        rfIgnoreCase: Ignore case in OldPattern matching.
    
      * Returns:
        The modified string (if needed).
    
      The function is almost linear.
      It is memory conservative, just sizeof(SizeInt) per match in blocks off 100 matches
      plus Length(OldPattern)*2 in the case of ignoring case.
      Memory copies are the minimun necessary.
    *)
    
    function StringReplaceX(const S, OldPattern, NewPattern: ansistring;
      Flags: TReplaceFlags): ansistring;
    const
      MATCHESCOUNTRESIZER=100; //Arbitrary value. Memory used = MATCHESCOUNTRESIZER * sizeof(SizeInt)
    var
      //Stores where a replace will take place
      Matches: array of SizeInt;
      //Stores the amount of replaces that will take place
      MatchesCount: SizeInt;
      //Currently allocated space for matches.
      MatchesAllocatedLimit: SizeInt;
      //Uppercase version of pattern
      PatternUppercase: ansistring;
      //Lowercase version of pattern
      PatternLowerCase: ansistring;
      //Index
      MatchIndex: SizeInt;
      MatchLimit: SizeInt;
      MatchInternal: SizeInt;
      MatchTarget: SizeInt;
      AdvanceIndex: SizeInt;
    
      //Miscelanous variables
      OldPatternSize: SizeInt;
      NewPatternSize: SizeInt;
    
      //Resizes the allocated space for replacement index
      procedure ResizeAllocatedMatches;
      begin
        MatchesAllocatedLimit:=MatchesCount+MATCHESCOUNTRESIZER;
        SetLength(Matches,MatchesAllocatedLimit);
      end;
    
      //Add a match to be replaced
      procedure AddMatch(const aPosition: SizeInt); inline;
      begin
        if MatchesCount = MatchesAllocatedLimit then begin
          ResizeAllocatedMatches;
        end;
        Matches[MatchesCount]:=aPosition;
        inc(MatchesCount);
      end;
    begin
      if (OldPattern='') or (Length(OldPattern)>Length(S)) then begin
        //This cases will never match nothing.
        Result:=S;
        exit;
      end;
      Result:='';
      OldPatternSize:=Length(OldPattern);
      MatchesCount:=0;
      MatchesAllocatedLimit:=0;
      if rfIgnoreCase in Flags then begin
        //Different algorithm for case sensitive and insensitive
        //This is insensitive, so 2 new ansistrings are created for search pattern, one upper and one lower case.
        //It is easy, usually, to create 2 versions of the match pattern than uppercased and lowered case each
        //character in the "to be matched" string.
        PatternUppercase:=AnsiUpperCase(OldPattern);
        PatternLowerCase:=AnsiLowerCase(OldPattern);
        MatchIndex:=Length(OldPattern);
        MatchLimit:=Length(S);
        NewPatternSize:=Length(NewPattern);
        while MatchIndex <= MatchLimit do begin
          if (S[MatchIndex]=PatternLowerCase[OldPatternSize]) or (S[MatchIndex]=PatternUppercase[OldPatternSize]) then begin
            //Match backwards...
            MatchInternal:=OldPatternSize-1;
            MatchTarget:=MatchIndex-1;
            while MatchInternal>=1 do begin
              if (S[MatchTarget]=PatternLowerCase[MatchInternal]) or (S[MatchTarget]=PatternUppercase[MatchInternal]) then begin
                dec(MatchInternal);
                dec(MatchTarget);
              end else begin
                break;
              end;
            end;
            if MatchInternal=0 then begin
              //Match found, all char meet the sequence
              //MatchTarget points to char before, so matching is +1
              AddMatch(MatchTarget+1);
              inc(MatchIndex,OldPatternSize);
              if not (rfReplaceAll in Flags) then begin
                break;
              end;
            end else begin
              //Match not found
              inc(MatchIndex);
            end;
          end else begin
            inc(MatchIndex);
          end;
        end;
      end else begin
        //Different algorithm for case sensitive and insensitive
        //This is sensitive, so just 1 binary comprare
        MatchIndex:=Length(OldPattern);
        MatchLimit:=Length(S);
        NewPatternSize:=Length(NewPattern);
        while MatchIndex <= MatchLimit do begin
          if (S[MatchIndex]=OldPattern[OldPatternSize]) then begin
            //Match backwards...
            MatchInternal:=OldPatternSize-1;
            MatchTarget:=MatchIndex-1;
            while MatchInternal>=1 do begin
              if (S[MatchTarget]=OldPattern[MatchInternal]) then begin
                dec(MatchInternal);
                dec(MatchTarget);
              end else begin
                break;
              end;
            end;
            if MatchInternal=0 then begin
              //Match found, all char meet the sequence
              //MatchTarget points to char before, so matching is +1
              AddMatch(MatchTarget+1);
              inc(MatchIndex,OldPatternSize);
              if not (rfReplaceAll in Flags) then begin
                break;
              end;
            end else begin
              //Match not found
              inc(MatchIndex);
            end;
          end else begin
            inc(MatchIndex);
          end;
        end;
      end;
      //Create room enougth for the result string
      SetLength(Result,Length(S)-OldPatternSize*MatchesCount+NewPatternSize*MatchesCount);
      MatchIndex:=1;
      MatchTarget:=1;
      //Matches[x] are 1 based offsets
      for MatchInternal := 0 to Pred(MatchesCount) do begin
        //Copy information up to next match
        AdvanceIndex:=Matches[MatchInternal]-MatchIndex;
        if AdvanceIndex>0 then begin
          move(S[MatchIndex],Result[MatchTarget],AdvanceIndex);
          inc(MatchTarget,AdvanceIndex);
          inc(MatchIndex,AdvanceIndex);
        end;
        //Copy the new replace information string
        if NewPatternSize>0 then begin
          move(NewPattern[1],Result[MatchTarget],NewPatternSize);
          inc(MatchTarget,NewPatternSize);
        end;
        inc(MatchIndex,OldPatternSize);
      end;
      if MatchTarget<=Length(Result) then begin
        //Add remain data at the end of source.
        move(S[MatchIndex],Result[MatchTarget],Length(Result)-MatchTarget+1);
      end;
    end;
    
    (*
      StringReplaceBoyerMoore
    
      Replaces one or many ocurrences of an ansistring in another ansistring by a new one.
      It can perform the compare ignoring case (ansi).
    
      * Parameters (Read only):
      S: The string to be searched in.
      OldPattern: The string to be searched.
      NewPattern: The string to replace OldPattern matches.
      Flags:
        rfReplaceAll: Replace all occurrences.
        rfIgnoreCase: Ignore case in OldPattern matching.
    
      * Returns:
        The modified string (if needed).
    
      It is memory conservative, just sizeof(SizeInt) per match in blocks off 100 matches
      plus Length(OldPattern)*2 in the case of ignoring case.
      Memory copies are the minimun necessary.
      Algorithm based in the Boyer-Moore string search algorithm.
    
      It is faster when the "S" string is very long and the OldPattern is also
      very big. As much big the OldPattern is, faster the search is too.
    
      It uses 2 different helper versions of Boyer-Moore algorithm, one for case
      sensitive and one for case INsensitive for speed reasons.
    
    *)
    
    function StringReplaceBoyerMoore(const S, OldPattern, NewPattern: ansistring;Flags: TReplaceFlags): ansistring;
    var
      Matches: SizeIntArray;
      OldPatternSize: SizeInt;
      NewPatternSize: SizeInt;
      MatchesCount: SizeInt;
      MatchIndex: SizeInt;
      MatchTarget: SizeInt;
      MatchInternal: SizeInt;
      AdvanceIndex: SizeInt;
    begin
      OldPatternSize:=Length(OldPattern);
      NewPatternSize:=Length(NewPattern);
      if (OldPattern='') or (Length(OldPattern)>Length(S)) then begin
        Result:=S;
        exit;
      end;
    
      if rfIgnoreCase in Flags then begin
        FindMatchesBoyerMooreCaseINSensitive(@s[1],@OldPattern[1],Length(S),Length(OldPattern),Matches, rfReplaceAll in Flags);
      end else begin
        FindMatchesBoyerMooreCaseSensitive(@s[1],@OldPattern[1],Length(S),Length(OldPattern),Matches, rfReplaceAll in Flags);
      end;
    
      MatchesCount:=Length(Matches);
    
      //Create room enougth for the result string
      SetLength(Result,Length(S)-OldPatternSize*MatchesCount+NewPatternSize*MatchesCount);
      MatchIndex:=1;
      MatchTarget:=1;
      //Matches[x] are 0 based offsets
      for MatchInternal := 0 to Pred(MatchesCount) do begin
        //Copy information up to next match
        AdvanceIndex:=Matches[MatchInternal]+1-MatchIndex;
        if AdvanceIndex>0 then begin
          move(S[MatchIndex],Result[MatchTarget],AdvanceIndex);
          inc(MatchTarget,AdvanceIndex);
          inc(MatchIndex,AdvanceIndex);
        end;
        //Copy the new replace information string
        if NewPatternSize>0 then begin
          move(NewPattern[1],Result[MatchTarget],NewPatternSize);
          inc(MatchTarget,NewPatternSize);
        end;
        inc(MatchIndex,OldPatternSize);
      end;
      if MatchTarget<=Length(Result) then begin
        //Add remain data at the end of source.
        move(S[MatchIndex],Result[MatchTarget],Length(Result)-MatchTarget+1);
      end;
    end;
    
    (*
      FindMatchesBoyerMooreCaseSensitive
    
      Finds one or many ocurrences of an ansistring in another ansistring.
      It is case sensitive.
    
      * Parameters:
      S: The PChar to be searched in. (Read only).
      OldPattern: The PChar to be searched. (Read only).
      SSize: The size of S in Chars. (Read only).
      OldPatternSize: The size of OldPatter in chars. (Read only).
      aMatches: SizeInt array where match indexes are returned (zero based) (write only).
      aMatchAll: Finds all matches, not just the first one. (Read only).
    
      * Returns:
        Nothing, information returned in aMatches parameter.
    
      The function is based in the Boyer-Moore algorithm.
    *)
    
    procedure FindMatchesBoyerMooreCaseSensitive(const S, OldPattern: PChar;
      const SSize, OldPatternSize: SizeInt; out aMatches: SizeIntArray;
      const aMatchAll: Boolean);
    const
      ALPHABET_LENGHT=256;
      MATCHESCOUNTRESIZER=100; //Arbitrary value. Memory used = MATCHESCOUNTRESIZER * sizeof(SizeInt)
    var
      //Stores the amount of replaces that will take place
      MatchesCount: SizeInt;
      //Currently allocated space for matches.
      MatchesAllocatedLimit: SizeInt;
    type
      AlphabetArray=array [0..ALPHABET_LENGHT-1] of SizeInt;
    
      function Max(const a1,a2: SizeInt): SizeInt;
      begin
        if a1>a2 then Result:=a1 else Result:=a2;
      end;
    
      procedure MakeDeltaJumpTable1(out DeltaJumpTable1: AlphabetArray; const aPattern: PChar; const aPatternSize: SizeInt);
      var
        i: SizeInt;
      begin
        for i := 0 to ALPHABET_LENGHT-1 do begin
          DeltaJumpTable1[i]:=aPatternSize;
        end;
        //Last char do not enter in the equation
        for i := 0 to aPatternSize - 1 - 1 do begin
          DeltaJumpTable1[Ord(aPattern[i])]:=aPatternSize -1 - i;
        end;
      end;
    
      function IsPrefix(const aPattern: PChar; const aPatternSize, aPos: SizeInt): Boolean;
      var
        i: SizeInt;
        SuffixLength: SizeInt;
      begin
        SuffixLength:=aPatternSize-aPos;
        for i := 0 to SuffixLength-1 do begin
          if (aPattern[i] <> aPattern[aPos+i]) then begin
              exit(false);
          end;
        end;
        Result:=true;
      end;
    
      function SuffixLength(const aPattern: PChar; const aPatternSize, aPos: SizeInt): SizeInt;
      var
        i: SizeInt;
      begin
        i:=0;
        while (aPattern[aPos-i] = aPattern[aPatternSize-1-i]) and (i < aPos) do begin
          inc(i);
        end;
        Result:=i;
      end;
    
      procedure MakeDeltaJumpTable2(var DeltaJumpTable2: SizeIntArray; const aPattern: PChar; const aPatternSize: SizeInt);
      var
        Position: SizeInt;
        LastPrefixIndex: SizeInt;
        SuffixLengthValue: SizeInt;
      begin
        LastPrefixIndex:=aPatternSize-1;
        Position:=aPatternSize-1;
        while Position>=0 do begin
          if IsPrefix(aPattern,aPatternSize,Position+1) then begin
            LastPrefixIndex := Position+1;
          end;
          DeltaJumpTable2[Position] := LastPrefixIndex + (aPatternSize-1 - Position);
          Dec(Position);
        end;
        Position:=0;
        while Position<aPatternSize-1 do begin
          SuffixLengthValue:=SuffixLength(aPattern,aPatternSize,Position);
          if aPattern[Position-SuffixLengthValue] <> aPattern[aPatternSize-1 - SuffixLengthValue] then begin
            DeltaJumpTable2[aPatternSize - 1 - SuffixLengthValue] := aPatternSize - 1 - Position + SuffixLengthValue;
          end;
          Inc(Position);
        end;
      end;
    
      //Resizes the allocated space for replacement index
      procedure ResizeAllocatedMatches;
      begin
        MatchesAllocatedLimit:=MatchesCount+MATCHESCOUNTRESIZER;
        SetLength(aMatches,MatchesAllocatedLimit);
      end;
    
      //Add a match to be replaced
      procedure AddMatch(const aPosition: SizeInt); inline;
      begin
        if MatchesCount = MatchesAllocatedLimit then begin
          ResizeAllocatedMatches;
        end;
        aMatches[MatchesCount]:=aPosition;
        inc(MatchesCount);
      end;
    var
      i,j: SizeInt;
      DeltaJumpTable1: array [0..ALPHABET_LENGHT-1] of SizeInt;
      DeltaJumpTable2: SizeIntArray;
    begin
      MatchesCount:=0;
      MatchesAllocatedLimit:=0;
      SetLength(aMatches,MatchesCount);
      if OldPatternSize=0 then begin
        Exit;
      end;
      SetLength(DeltaJumpTable2,OldPatternSize);
    
      MakeDeltaJumpTable1(DeltaJumpTable1,OldPattern,OldPatternSize);
      MakeDeltaJumpTable2(DeltaJumpTable2,OldPattern,OldPatternSize);
    
      i:=OldPatternSize-1;
      while i < SSize do begin
        j:=OldPatternSize-1;
        while (j>=0) and (S[i] = OldPattern[j]) do begin
          dec(i);
          dec(j);
        end;
        if (j<0) then begin
          AddMatch(i+1);
          //Only first match ?
          if not aMatchAll then exit;
          inc(i,OldPatternSize);
          inc(i,OldPatternSize);
        end else begin
          i:=i + Max(DeltaJumpTable1[ord(s[i])],DeltaJumpTable2[j]);
        end;
      end;
      SetLength(aMatches,MatchesCount);
    end;
    
    (*
      FindMatchesBoyerMooreCaseINSensitive
    
      Finds one or many ocurrences of an ansistring in another ansistring.
      It is case INsensitive.
    
      * Parameters:
      S: The PChar to be searched in. (Read only).
      OldPattern: The PChar to be searched. (Read only).
      SSize: The size of S in Chars. (Read only).
      OldPatternSize: The size of OldPatter in chars. (Read only).
      aMatches: SizeInt array where match indexes are returned (zero based) (write only).
      aMatchAll: Finds all matches, not just the first one. (Read only).
    
      * Returns:
        Nothing, information returned in aMatches parameter.
    
      The function is based in the Boyer-Moore algorithm.
    *)
    
    procedure FindMatchesBoyerMooreCaseINSensitive(const S, OldPattern: PChar;
      const SSize, OldPatternSize: SizeInt; out aMatches: SizeIntArray;
      const aMatchAll: Boolean);
    const
      ALPHABET_LENGHT=256;
      MATCHESCOUNTRESIZER=100; //Arbitrary value. Memory used = MATCHESCOUNTRESIZER * sizeof(SizeInt)
    var
      //Lowercased OldPattern
      lPattern: ansistring;
      //Array of lowercased alphabet
      lCaseArray: array [0..ALPHABET_LENGHT-1] of char;
      //Stores the amount of replaces that will take place
      MatchesCount: SizeInt;
      //Currently allocated space for matches.
      MatchesAllocatedLimit: SizeInt;
    type
      AlphabetArray=array [0..ALPHABET_LENGHT-1] of SizeInt;
    
      function Max(const a1,a2: SizeInt): SizeInt;
      begin
        if a1>a2 then Result:=a1 else Result:=a2;
      end;
    
      procedure MakeDeltaJumpTable1(out DeltaJumpTable1: AlphabetArray; const aPattern: PChar; const aPatternSize: SizeInt);
      var
        i: SizeInt;
      begin
        for i := 0 to ALPHABET_LENGHT-1 do begin
          DeltaJumpTable1[i]:=aPatternSize;
        end;
        //Last char do not enter in the equation
        for i := 0 to aPatternSize - 1 - 1 do begin
          DeltaJumpTable1[Ord(aPattern[i])]:=aPatternSize - 1 - i;
        end;
      end;
    
      function IsPrefix(const aPattern: PChar; const aPatternSize, aPos: SizeInt): Boolean; inline;
      var
        i: SizeInt;
        SuffixLength: SizeInt;
      begin
        SuffixLength:=aPatternSize-aPos;
        for i := 0 to SuffixLength-1 do begin
          if (aPattern[i+1] <> aPattern[aPos+i]) then begin
            exit(false);
          end;
        end;
        Result:=true;
      end;
    
      function SuffixLength(const aPattern: PChar; const aPatternSize, aPos: SizeInt): SizeInt; inline;
      var
        i: SizeInt;
      begin
        i:=0;
        while (aPattern[aPos-i] = aPattern[aPatternSize-1-i]) and (i < aPos) do begin
          inc(i);
        end;
        Result:=i;
      end;
    
      procedure MakeDeltaJumpTable2(var DeltaJumpTable2: SizeIntArray; const aPattern: PChar; const aPatternSize: SizeInt);
      var
        Position: SizeInt;
        LastPrefixIndex: SizeInt;
        SuffixLengthValue: SizeInt;
      begin
        LastPrefixIndex:=aPatternSize-1;
        Position:=aPatternSize-1;
        while Position>=0 do begin
          if IsPrefix(aPattern,aPatternSize,Position+1) then begin
            LastPrefixIndex := Position+1;
          end;
          DeltaJumpTable2[Position] := LastPrefixIndex + (aPatternSize-1 - Position);
          Dec(Position);
        end;
        Position:=0;
        while Position<aPatternSize-1 do begin
          SuffixLengthValue:=SuffixLength(aPattern,aPatternSize,Position);
          if aPattern[Position-SuffixLengthValue] <> aPattern[aPatternSize-1 - SuffixLengthValue] then begin
            DeltaJumpTable2[aPatternSize - 1 - SuffixLengthValue] := aPatternSize - 1 - Position + SuffixLengthValue;
          end;
          Inc(Position);
        end;
      end;
    
      //Resizes the allocated space for replacement index
      procedure ResizeAllocatedMatches;
      begin
        MatchesAllocatedLimit:=MatchesCount+MATCHESCOUNTRESIZER;
        SetLength(aMatches,MatchesAllocatedLimit);
      end;
    
      //Add a match to be replaced
      procedure AddMatch(const aPosition: SizeInt); inline;
      begin
        if MatchesCount = MatchesAllocatedLimit then begin
          ResizeAllocatedMatches;
        end;
        aMatches[MatchesCount]:=aPosition;
        inc(MatchesCount);
      end;
    var
      i,j: SizeInt;
      DeltaJumpTable1: array [0..ALPHABET_LENGHT-1] of SizeInt;
      DeltaJumpTable2: SizeIntArray;
      //Pointer to lowered OldPattern
      plPattern: PChar;
    begin
      MatchesCount:=0;
      MatchesAllocatedLimit:=0;
      SetLength(aMatches,MatchesCount);
      if OldPatternSize=0 then begin
        Exit;
      end;
    
      //Build an internal array of lowercase version of every possible char.
      for j := 0 to Pred(ALPHABET_LENGHT) do begin
        lCaseArray[j]:=AnsiLowerCase(char(j))[1];
      end;
    
      //Create the new lowercased pattern
      SetLength(lPattern,OldPatternSize);
      for j := 0 to Pred(OldPatternSize) do begin
        lPattern[j+1]:=lCaseArray[ord(OldPattern[j])];
      end;
    
      SetLength(DeltaJumpTable2,OldPatternSize);
    
      MakeDeltaJumpTable1(DeltaJumpTable1,@lPattern[1],OldPatternSize);
      MakeDeltaJumpTable2(DeltaJumpTable2,@lPattern[1],OldPatternSize);
    
      plPattern:=@lPattern[1];
      i:=OldPatternSize-1;
      while i < SSize do begin
        j:=OldPatternSize-1;
        while (j>=0) and (lCaseArray[Ord(S[i])] = plPattern[j]) do begin
          dec(i);
          dec(j);
        end;
        if (j<0) then begin
          AddMatch(i+1);
          //Only first match ?
          if not aMatchAll then break;
          inc(i,OldPatternSize);
          inc(i,OldPatternSize);
        end else begin
          i:=i + Max(DeltaJumpTable1[Ord(lCaseArray[Ord(s[i])])],DeltaJumpTable2[j]);
        end;
      end;
      SetLength(aMatches,MatchesCount);
    end;
    
    end.
    
    
    ustringreplace.pas (19,898 bytes)
  • ustringreplace_2.pas (19,855 bytes)
    unit ustringreplace;
    
    {$mode objfpc}{$H+}
    
    interface
    
    uses
      SysUtils;
    
    function StringReplaceX(const S, OldPattern, NewPattern: string; Flags: TReplaceFlags): string;
    
    function StringReplaceBoyerMoore (const S, OldPattern, NewPattern: string; Flags: TReplaceFlags): string;
    
    implementation
    
    type
      SizeIntArray = array of SizeInt;
    
    procedure FindMatchesBoyerMooreCaseSensitive(const S,OldPattern: PChar; const SSize, OldPatternSize: SizeInt; out aMatches: SizeIntArray; const aMatchAll: Boolean); forward;
    procedure FindMatchesBoyerMooreCaseINSensitive(const S, OldPattern: PChar; const SSize, OldPatternSize: SizeInt; out aMatches: SizeIntArray; const aMatchAll: Boolean); forward;
    
    (*
      StringReplace
    
      Replaces one or many ocurrences of an ansistring in another ansistring by a new one.
      It can perform the compare ignoring case (ansi).
    
      * Parameters (Read only):
      S: The string to be searched in.
      OldPattern: The string to be searched.
      NewPattern: The string to replace OldPattern matches.
      Flags:
        rfReplaceAll: Replace all occurrences.
        rfIgnoreCase: Ignore case in OldPattern matching.
    
      * Returns:
        The modified string (if needed).
    
      The function is almost linear.
      It is memory conservative, just sizeof(SizeInt) per match in blocks off 100 matches
      plus Length(OldPattern)*2 in the case of ignoring case.
      Memory copies are the minimun necessary.
    *)
    
    function StringReplaceX(const S, OldPattern, NewPattern: string;
      Flags: TReplaceFlags): string;
    const
      MATCHESCOUNTRESIZER=100; //Arbitrary value. Memory used = MATCHESCOUNTRESIZER * sizeof(SizeInt)
    var
      //Stores where a replace will take place
      Matches: array of SizeInt;
      //Stores the amount of replaces that will take place
      MatchesCount: SizeInt;
      //Currently allocated space for matches.
      MatchesAllocatedLimit: SizeInt;
      //Uppercase version of pattern
      PatternUppercase: string;
      //Lowercase version of pattern
      PatternLowerCase: string;
      //Index
      MatchIndex: SizeInt;
      MatchLimit: SizeInt;
      MatchInternal: SizeInt;
      MatchTarget: SizeInt;
      AdvanceIndex: SizeInt;
    
      //Miscelanous variables
      OldPatternSize: SizeInt;
      NewPatternSize: SizeInt;
    
      //Resizes the allocated space for replacement index
      procedure ResizeAllocatedMatches;
      begin
        MatchesAllocatedLimit:=MatchesCount+MATCHESCOUNTRESIZER;
        SetLength(Matches,MatchesAllocatedLimit);
      end;
    
      //Add a match to be replaced
      procedure AddMatch(const aPosition: SizeInt); inline;
      begin
        if MatchesCount = MatchesAllocatedLimit then begin
          ResizeAllocatedMatches;
        end;
        Matches[MatchesCount]:=aPosition;
        inc(MatchesCount);
      end;
    begin
      if (OldPattern='') or (Length(OldPattern)>Length(S)) then begin
        //This cases will never match nothing.
        Result:=S;
        exit;
      end;
      Result:='';
      OldPatternSize:=Length(OldPattern);
      MatchesCount:=0;
      MatchesAllocatedLimit:=0;
      if rfIgnoreCase in Flags then begin
        //Different algorithm for case sensitive and insensitive
        //This is insensitive, so 2 new ansistrings are created for search pattern, one upper and one lower case.
        //It is easy, usually, to create 2 versions of the match pattern than uppercased and lowered case each
        //character in the "to be matched" string.
        PatternUppercase:=AnsiUpperCase(OldPattern);
        PatternLowerCase:=AnsiLowerCase(OldPattern);
        MatchIndex:=Length(OldPattern);
        MatchLimit:=Length(S);
        NewPatternSize:=Length(NewPattern);
        while MatchIndex <= MatchLimit do begin
          if (S[MatchIndex]=PatternLowerCase[OldPatternSize]) or (S[MatchIndex]=PatternUppercase[OldPatternSize]) then begin
            //Match backwards...
            MatchInternal:=OldPatternSize-1;
            MatchTarget:=MatchIndex-1;
            while MatchInternal>=1 do begin
              if (S[MatchTarget]=PatternLowerCase[MatchInternal]) or (S[MatchTarget]=PatternUppercase[MatchInternal]) then begin
                dec(MatchInternal);
                dec(MatchTarget);
              end else begin
                break;
              end;
            end;
            if MatchInternal=0 then begin
              //Match found, all char meet the sequence
              //MatchTarget points to char before, so matching is +1
              AddMatch(MatchTarget+1);
              inc(MatchIndex,OldPatternSize);
              if not (rfReplaceAll in Flags) then begin
                break;
              end;
            end else begin
              //Match not found
              inc(MatchIndex);
            end;
          end else begin
            inc(MatchIndex);
          end;
        end;
      end else begin
        //Different algorithm for case sensitive and insensitive
        //This is sensitive, so just 1 binary comprare
        MatchIndex:=Length(OldPattern);
        MatchLimit:=Length(S);
        NewPatternSize:=Length(NewPattern);
        while MatchIndex <= MatchLimit do begin
          if (S[MatchIndex]=OldPattern[OldPatternSize]) then begin
            //Match backwards...
            MatchInternal:=OldPatternSize-1;
            MatchTarget:=MatchIndex-1;
            while MatchInternal>=1 do begin
              if (S[MatchTarget]=OldPattern[MatchInternal]) then begin
                dec(MatchInternal);
                dec(MatchTarget);
              end else begin
                break;
              end;
            end;
            if MatchInternal=0 then begin
              //Match found, all char meet the sequence
              //MatchTarget points to char before, so matching is +1
              AddMatch(MatchTarget+1);
              inc(MatchIndex,OldPatternSize);
              if not (rfReplaceAll in Flags) then begin
                break;
              end;
            end else begin
              //Match not found
              inc(MatchIndex);
            end;
          end else begin
            inc(MatchIndex);
          end;
        end;
      end;
      //Create room enougth for the result string
      SetLength(Result,Length(S)-OldPatternSize*MatchesCount+NewPatternSize*MatchesCount);
      MatchIndex:=1;
      MatchTarget:=1;
      //Matches[x] are 1 based offsets
      for MatchInternal := 0 to Pred(MatchesCount) do begin
        //Copy information up to next match
        AdvanceIndex:=Matches[MatchInternal]-MatchIndex;
        if AdvanceIndex>0 then begin
          move(S[MatchIndex],Result[MatchTarget],AdvanceIndex);
          inc(MatchTarget,AdvanceIndex);
          inc(MatchIndex,AdvanceIndex);
        end;
        //Copy the new replace information string
        if NewPatternSize>0 then begin
          move(NewPattern[1],Result[MatchTarget],NewPatternSize);
          inc(MatchTarget,NewPatternSize);
        end;
        inc(MatchIndex,OldPatternSize);
      end;
      if MatchTarget<=Length(Result) then begin
        //Add remain data at the end of source.
        move(S[MatchIndex],Result[MatchTarget],Length(Result)-MatchTarget+1);
      end;
    end;
    
    (*
      StringReplaceBoyerMoore
    
      Replaces one or many ocurrences of an ansistring in another ansistring by a new one.
      It can perform the compare ignoring case (ansi).
    
      * Parameters (Read only):
      S: The string to be searched in.
      OldPattern: The string to be searched.
      NewPattern: The string to replace OldPattern matches.
      Flags:
        rfReplaceAll: Replace all occurrences.
        rfIgnoreCase: Ignore case in OldPattern matching.
    
      * Returns:
        The modified string (if needed).
    
      It is memory conservative, just sizeof(SizeInt) per match in blocks off 100 matches
      plus Length(OldPattern)*2 in the case of ignoring case.
      Memory copies are the minimun necessary.
      Algorithm based in the Boyer-Moore string search algorithm.
    
      It is faster when the "S" string is very long and the OldPattern is also
      very big. As much big the OldPattern is, faster the search is too.
    
      It uses 2 different helper versions of Boyer-Moore algorithm, one for case
      sensitive and one for case INsensitive for speed reasons.
    
    *)
    
    function StringReplaceBoyerMoore(const S, OldPattern, NewPattern: string;Flags: TReplaceFlags): string;
    var
      Matches: SizeIntArray;
      OldPatternSize: SizeInt;
      NewPatternSize: SizeInt;
      MatchesCount: SizeInt;
      MatchIndex: SizeInt;
      MatchTarget: SizeInt;
      MatchInternal: SizeInt;
      AdvanceIndex: SizeInt;
    begin
      OldPatternSize:=Length(OldPattern);
      NewPatternSize:=Length(NewPattern);
      if (OldPattern='') or (Length(OldPattern)>Length(S)) then begin
        Result:=S;
        exit;
      end;
    
      if rfIgnoreCase in Flags then begin
        FindMatchesBoyerMooreCaseINSensitive(@s[1],@OldPattern[1],Length(S),Length(OldPattern),Matches, rfReplaceAll in Flags);
      end else begin
        FindMatchesBoyerMooreCaseSensitive(@s[1],@OldPattern[1],Length(S),Length(OldPattern),Matches, rfReplaceAll in Flags);
      end;
    
      MatchesCount:=Length(Matches);
    
      //Create room enougth for the result string
      SetLength(Result,Length(S)-OldPatternSize*MatchesCount+NewPatternSize*MatchesCount);
      MatchIndex:=1;
      MatchTarget:=1;
      //Matches[x] are 0 based offsets
      for MatchInternal := 0 to Pred(MatchesCount) do begin
        //Copy information up to next match
        AdvanceIndex:=Matches[MatchInternal]+1-MatchIndex;
        if AdvanceIndex>0 then begin
          move(S[MatchIndex],Result[MatchTarget],AdvanceIndex);
          inc(MatchTarget,AdvanceIndex);
          inc(MatchIndex,AdvanceIndex);
        end;
        //Copy the new replace information string
        if NewPatternSize>0 then begin
          move(NewPattern[1],Result[MatchTarget],NewPatternSize);
          inc(MatchTarget,NewPatternSize);
        end;
        inc(MatchIndex,OldPatternSize);
      end;
      if MatchTarget<=Length(Result) then begin
        //Add remain data at the end of source.
        move(S[MatchIndex],Result[MatchTarget],Length(Result)-MatchTarget+1);
      end;
    end;
    
    (*
      FindMatchesBoyerMooreCaseSensitive
    
      Finds one or many ocurrences of an ansistring in another ansistring.
      It is case sensitive.
    
      * Parameters:
      S: The PChar to be searched in. (Read only).
      OldPattern: The PChar to be searched. (Read only).
      SSize: The size of S in Chars. (Read only).
      OldPatternSize: The size of OldPatter in chars. (Read only).
      aMatches: SizeInt array where match indexes are returned (zero based) (write only).
      aMatchAll: Finds all matches, not just the first one. (Read only).
    
      * Returns:
        Nothing, information returned in aMatches parameter.
    
      The function is based in the Boyer-Moore algorithm.
    *)
    
    procedure FindMatchesBoyerMooreCaseSensitive(const S, OldPattern: PChar;
      const SSize, OldPatternSize: SizeInt; out aMatches: SizeIntArray;
      const aMatchAll: Boolean);
    const
      ALPHABET_LENGHT=256;
      MATCHESCOUNTRESIZER=100; //Arbitrary value. Memory used = MATCHESCOUNTRESIZER * sizeof(SizeInt)
    var
      //Stores the amount of replaces that will take place
      MatchesCount: SizeInt;
      //Currently allocated space for matches.
      MatchesAllocatedLimit: SizeInt;
    type
      AlphabetArray=array [0..ALPHABET_LENGHT-1] of SizeInt;
    
      function Max(const a1,a2: SizeInt): SizeInt;
      begin
        if a1>a2 then Result:=a1 else Result:=a2;
      end;
    
      procedure MakeDeltaJumpTable1(out DeltaJumpTable1: AlphabetArray; const aPattern: PChar; const aPatternSize: SizeInt);
      var
        i: SizeInt;
      begin
        for i := 0 to ALPHABET_LENGHT-1 do begin
          DeltaJumpTable1[i]:=aPatternSize;
        end;
        //Last char do not enter in the equation
        for i := 0 to aPatternSize - 1 - 1 do begin
          DeltaJumpTable1[Ord(aPattern[i])]:=aPatternSize -1 - i;
        end;
      end;
    
      function IsPrefix(const aPattern: PChar; const aPatternSize, aPos: SizeInt): Boolean;
      var
        i: SizeInt;
        SuffixLength: SizeInt;
      begin
        SuffixLength:=aPatternSize-aPos;
        for i := 0 to SuffixLength-1 do begin
          if (aPattern[i] <> aPattern[aPos+i]) then begin
              exit(false);
          end;
        end;
        Result:=true;
      end;
    
      function SuffixLength(const aPattern: PChar; const aPatternSize, aPos: SizeInt): SizeInt;
      var
        i: SizeInt;
      begin
        i:=0;
        while (aPattern[aPos-i] = aPattern[aPatternSize-1-i]) and (i < aPos) do begin
          inc(i);
        end;
        Result:=i;
      end;
    
      procedure MakeDeltaJumpTable2(var DeltaJumpTable2: SizeIntArray; const aPattern: PChar; const aPatternSize: SizeInt);
      var
        Position: SizeInt;
        LastPrefixIndex: SizeInt;
        SuffixLengthValue: SizeInt;
      begin
        LastPrefixIndex:=aPatternSize-1;
        Position:=aPatternSize-1;
        while Position>=0 do begin
          if IsPrefix(aPattern,aPatternSize,Position+1) then begin
            LastPrefixIndex := Position+1;
          end;
          DeltaJumpTable2[Position] := LastPrefixIndex + (aPatternSize-1 - Position);
          Dec(Position);
        end;
        Position:=0;
        while Position<aPatternSize-1 do begin
          SuffixLengthValue:=SuffixLength(aPattern,aPatternSize,Position);
          if aPattern[Position-SuffixLengthValue] <> aPattern[aPatternSize-1 - SuffixLengthValue] then begin
            DeltaJumpTable2[aPatternSize - 1 - SuffixLengthValue] := aPatternSize - 1 - Position + SuffixLengthValue;
          end;
          Inc(Position);
        end;
      end;
    
      //Resizes the allocated space for replacement index
      procedure ResizeAllocatedMatches;
      begin
        MatchesAllocatedLimit:=MatchesCount+MATCHESCOUNTRESIZER;
        SetLength(aMatches,MatchesAllocatedLimit);
      end;
    
      //Add a match to be replaced
      procedure AddMatch(const aPosition: SizeInt); inline;
      begin
        if MatchesCount = MatchesAllocatedLimit then begin
          ResizeAllocatedMatches;
        end;
        aMatches[MatchesCount]:=aPosition;
        inc(MatchesCount);
      end;
    var
      i,j: SizeInt;
      DeltaJumpTable1: array [0..ALPHABET_LENGHT-1] of SizeInt;
      DeltaJumpTable2: SizeIntArray;
    begin
      MatchesCount:=0;
      MatchesAllocatedLimit:=0;
      SetLength(aMatches,MatchesCount);
      if OldPatternSize=0 then begin
        Exit;
      end;
      SetLength(DeltaJumpTable2,OldPatternSize);
    
      MakeDeltaJumpTable1(DeltaJumpTable1,OldPattern,OldPatternSize);
      MakeDeltaJumpTable2(DeltaJumpTable2,OldPattern,OldPatternSize);
    
      i:=OldPatternSize-1;
      while i < SSize do begin
        j:=OldPatternSize-1;
        while (j>=0) and (S[i] = OldPattern[j]) do begin
          dec(i);
          dec(j);
        end;
        if (j<0) then begin
          AddMatch(i+1);
          //Only first match ?
          if not aMatchAll then break;
          inc(i,OldPatternSize);
          inc(i,OldPatternSize);
        end else begin
          i:=i + Max(DeltaJumpTable1[ord(s[i])],DeltaJumpTable2[j]);
        end;
      end;
      SetLength(aMatches,MatchesCount);
    end;
    
    (*
      FindMatchesBoyerMooreCaseINSensitive
    
      Finds one or many ocurrences of an ansistring in another ansistring.
      It is case INsensitive.
    
      * Parameters:
      S: The PChar to be searched in. (Read only).
      OldPattern: The PChar to be searched. (Read only).
      SSize: The size of S in Chars. (Read only).
      OldPatternSize: The size of OldPatter in chars. (Read only).
      aMatches: SizeInt array where match indexes are returned (zero based) (write only).
      aMatchAll: Finds all matches, not just the first one. (Read only).
    
      * Returns:
        Nothing, information returned in aMatches parameter.
    
      The function is based in the Boyer-Moore algorithm.
    *)
    
    procedure FindMatchesBoyerMooreCaseINSensitive(const S, OldPattern: PChar;
      const SSize, OldPatternSize: SizeInt; out aMatches: SizeIntArray;
      const aMatchAll: Boolean);
    const
      ALPHABET_LENGHT=256;
      MATCHESCOUNTRESIZER=100; //Arbitrary value. Memory used = MATCHESCOUNTRESIZER * sizeof(SizeInt)
    var
      //Lowercased OldPattern
      lPattern: string;
      //Array of lowercased alphabet
      lCaseArray: array [0..ALPHABET_LENGHT-1] of char;
      //Stores the amount of replaces that will take place
      MatchesCount: SizeInt;
      //Currently allocated space for matches.
      MatchesAllocatedLimit: SizeInt;
    type
      AlphabetArray=array [0..ALPHABET_LENGHT-1] of SizeInt;
    
      function Max(const a1,a2: SizeInt): SizeInt;
      begin
        if a1>a2 then Result:=a1 else Result:=a2;
      end;
    
      procedure MakeDeltaJumpTable1(out DeltaJumpTable1: AlphabetArray; const aPattern: PChar; const aPatternSize: SizeInt);
      var
        i: SizeInt;
      begin
        for i := 0 to ALPHABET_LENGHT-1 do begin
          DeltaJumpTable1[i]:=aPatternSize;
        end;
        //Last char do not enter in the equation
        for i := 0 to aPatternSize - 1 - 1 do begin
          DeltaJumpTable1[Ord(aPattern[i])]:=aPatternSize - 1 - i;
        end;
      end;
    
      function IsPrefix(const aPattern: PChar; const aPatternSize, aPos: SizeInt): Boolean; inline;
      var
        i: SizeInt;
        SuffixLength: SizeInt;
      begin
        SuffixLength:=aPatternSize-aPos;
        for i := 0 to SuffixLength-1 do begin
          if (aPattern[i+1] <> aPattern[aPos+i]) then begin
            exit(false);
          end;
        end;
        Result:=true;
      end;
    
      function SuffixLength(const aPattern: PChar; const aPatternSize, aPos: SizeInt): SizeInt; inline;
      var
        i: SizeInt;
      begin
        i:=0;
        while (aPattern[aPos-i] = aPattern[aPatternSize-1-i]) and (i < aPos) do begin
          inc(i);
        end;
        Result:=i;
      end;
    
      procedure MakeDeltaJumpTable2(var DeltaJumpTable2: SizeIntArray; const aPattern: PChar; const aPatternSize: SizeInt);
      var
        Position: SizeInt;
        LastPrefixIndex: SizeInt;
        SuffixLengthValue: SizeInt;
      begin
        LastPrefixIndex:=aPatternSize-1;
        Position:=aPatternSize-1;
        while Position>=0 do begin
          if IsPrefix(aPattern,aPatternSize,Position+1) then begin
            LastPrefixIndex := Position+1;
          end;
          DeltaJumpTable2[Position] := LastPrefixIndex + (aPatternSize-1 - Position);
          Dec(Position);
        end;
        Position:=0;
        while Position<aPatternSize-1 do begin
          SuffixLengthValue:=SuffixLength(aPattern,aPatternSize,Position);
          if aPattern[Position-SuffixLengthValue] <> aPattern[aPatternSize-1 - SuffixLengthValue] then begin
            DeltaJumpTable2[aPatternSize - 1 - SuffixLengthValue] := aPatternSize - 1 - Position + SuffixLengthValue;
          end;
          Inc(Position);
        end;
      end;
    
      //Resizes the allocated space for replacement index
      procedure ResizeAllocatedMatches;
      begin
        MatchesAllocatedLimit:=MatchesCount+MATCHESCOUNTRESIZER;
        SetLength(aMatches,MatchesAllocatedLimit);
      end;
    
      //Add a match to be replaced
      procedure AddMatch(const aPosition: SizeInt); inline;
      begin
        if MatchesCount = MatchesAllocatedLimit then begin
          ResizeAllocatedMatches;
        end;
        aMatches[MatchesCount]:=aPosition;
        inc(MatchesCount);
      end;
    var
      i,j: SizeInt;
      DeltaJumpTable1: array [0..ALPHABET_LENGHT-1] of SizeInt;
      DeltaJumpTable2: SizeIntArray;
      //Pointer to lowered OldPattern
      plPattern: PChar;
    begin
      MatchesCount:=0;
      MatchesAllocatedLimit:=0;
      SetLength(aMatches,MatchesCount);
      if OldPatternSize=0 then begin
        Exit;
      end;
    
      //Build an internal array of lowercase version of every possible char.
      for j := 0 to Pred(ALPHABET_LENGHT) do begin
        lCaseArray[j]:=AnsiLowerCase(char(j))[1];
      end;
    
      //Create the new lowercased pattern
      SetLength(lPattern,OldPatternSize);
      for j := 0 to Pred(OldPatternSize) do begin
        lPattern[j+1]:=lCaseArray[ord(OldPattern[j])];
      end;
    
      SetLength(DeltaJumpTable2,OldPatternSize);
    
      MakeDeltaJumpTable1(DeltaJumpTable1,@lPattern[1],OldPatternSize);
      MakeDeltaJumpTable2(DeltaJumpTable2,@lPattern[1],OldPatternSize);
    
      plPattern:=@lPattern[1];
      i:=OldPatternSize-1;
      while i < SSize do begin
        j:=OldPatternSize-1;
        while (j>=0) and (lCaseArray[Ord(S[i])] = plPattern[j]) do begin
          dec(i);
          dec(j);
        end;
        if (j<0) then begin
          AddMatch(i+1);
          //Only first match ?
          if not aMatchAll then break;
          inc(i,OldPatternSize);
          inc(i,OldPatternSize);
        end else begin
          i:=i + Max(DeltaJumpTable1[Ord(lCaseArray[Ord(s[i])])],DeltaJumpTable2[j]);
        end;
      end;
      SetLength(aMatches,MatchesCount);
    end;
    
    end.
    
    
    ustringreplace_2.pas (19,855 bytes)

Activities

Jarto Tarpio

2014-10-15 17:52

reporter  

strreplace.txt (2,057 bytes)
Function StringReplace(const S, OldPattern, NewPattern: string;  Flags: TReplaceFlags): string;
var
  OldPat,Srch: string; // Srch and Oldp can contain uppercase versions of S,OldPattern
  PatLength,NewPatLength,P,Cnt,PatCount,PrevP: Integer;
  c,d: pchar;
begin
  PatLength:=Length(OldPattern);
  if PatLength=0 then begin
    Result:=S;
    exit;
  end;

  if rfIgnoreCase in Flags then begin
    Srch:=AnsiUpperCase(S);
    OldPat:=AnsiUpperCase(OldPattern);
  end else begin
    Srch:=S;
    OldPat:=OldPattern;
  end;

  PatLength:=Length(OldPat);
  if Length(NewPattern)=PatLength then begin
    //Result length will not change
    Result:=S;
    P:=1;
    repeat
      P:=PosEx(OldPat,Srch,P);
      if P>0 then begin
        move(NewPattern[1],Result[P],PatLength);
        if not (rfReplaceAll in Flags) then exit;
        inc(P,PatLength);
      end;
    until p=0;
  end else begin
    //Different pattern length -> Result length will change
    //To avoid creating a lot of temporary strings, we count how many
    //replacements we're going to make.
    P:=1; PatCount:=0;
    repeat
      P:=PosEx(OldPat,Srch,P);
      if P>0 then begin
        inc(P,PatLength);
        inc(PatCount);
        if not (rfReplaceAll in Flags) then break;
      end;
    until p=0;
    if PatCount=0 then begin
      Result:=S;
      exit;
    end;
    NewPatLength:=Length(NewPattern);
    SetLength(Result,Length(S)+PatCount*(NewPatLength-PatLength));
    P:=1; PrevP:=0;
    c:=pchar(Result); d:=pchar(S);
    repeat
      P:=PosEx(OldPat,Srch,P);
      if P>0 then begin
        Cnt:=P-PrevP-1;
        if Cnt>0 then begin
          Move(d^,c^,Cnt);
          inc(c,Cnt);
          inc(d,Cnt);
        end;
        if NewPatLength>0 then begin
          Move(NewPattern[1],c^,NewPatLength);
          inc(c,NewPatLength);
        end;
        inc(P,PatLength);
        inc(d,PatLength);
        PrevP:=P-1;
        if not (rfReplaceAll in Flags) then break;
      end;
    until p=0;
    Cnt:=Length(S)-PrevP;
    if Cnt>0 then Move(d^,c^,Cnt);
  end;
end;
strreplace.txt (2,057 bytes)

Dmitry Boyarintsev

2014-10-15 21:05

developer   ~0078255

Last edited: 2014-10-15 21:06

View 3 revisions

Double-search might be inefficient as well.
An alternative implementation - compareString.diff
Reallocate the result string only twice (first time for a worse case scenario, second time for the actual result).
Works 25% faster, than double-search for StringOfChar('a',20000000)

Jarto Tarpio

2014-10-16 17:52

reporter   ~0078278

Reallocating the result string with worst case scenario can potentially use up a lot of memory. Especially if the source string is long and NewPattern is a lot longer than OldPattern. It may cause more trouble than what it's worth.

Dmitry Boyarintsev

2014-10-16 20:48

developer   ~0078285

Agreed. Then, gradual memory allocation (multiply 2) is as affective as double-search in the "a lot of replacement" scenario.
As well as more effective in for a worst case of the search substring - obviously, twice effective.

Jarto Tarpio

2014-10-18 19:01

reporter   ~0078348

Dmitry, your code has a bug in it:

StringReplace(StringOfChar('a',20),'a','',[rfReplaceAll]) returns a string of 10 chars instead of ''

Dmitry Boyarintsev

2014-10-20 04:10

developer  

compareString.diff (2,818 bytes)
Index: sysstr.inc
===================================================================
--- sysstr.inc	(revision 28884)
+++ sysstr.inc	(working copy)
@@ -2635,40 +2635,94 @@
 
 Function StringReplace(const S, OldPattern, NewPattern: string;  Flags: TReplaceFlags): string;
 var
-  Srch,OldP,RemS: string; // Srch and Oldp can contain uppercase versions of S,OldPattern
-  P : Integer;
+  Srch,OldP: string; // Srch and Oldp can contain uppercase versions of S,OldPattern
+  I: SizeInt;
+  R: SizeInt;   // Result copy index
+  SC: SizeInt;  // S copy index
+  MaxLen: SizeInt;
+  NewPatLen: SizeInt;
+  Replaced: Boolean;
+
+  procedure CopySource(ToIdx: Integer; ReplacePat: Boolean);
+  var
+    SrcCnt: SizeInt;
+    Sz : SizeInt;
+    NeedSz: SizeInt;
+    NeedCnt: SizeInt;
+  begin
+    SrcCnt:=ToIdx-SC;
+    Sz:=length(Result);
+
+    NeedCnt:=SrcCnt;
+    if ReplacePat then Inc(NeedCnt, NewPatLen);
+
+    If NeedCnt>0 then
+      begin
+      NeedSz:=R-1+NeedCnt;
+      if Sz<NeedSz then
+        begin
+        while Sz<NeedSz do
+          if Sz=0 then Sz:=4 else Sz:=Sz*2;
+        SetLength(Result, Sz);
+        end;
+      Move(S[SC], Result[R], SrcCnt);
+      inc(SC, SrcCnt);
+      inc(R,SrcCnt);
+      end;
+
+    if ReplacePat then
+      begin
+      inc(SC, Length(OldPattern));
+      if (NewPatLen>0) then
+        begin
+        Move(NewPattern[1], Result[R], NewPatLen);
+        inc(R, NewPatLen);
+        end;
+      end;
+  end;
+
 begin
+  if OldPattern='' then
+    begin
+    Result:=S;
+    Exit;
+    end;
   Srch:=S;
   OldP:=OldPattern;
+  NewPatLen:=length(NewPattern);
   if rfIgnoreCase in Flags then
     begin
     Srch:=AnsiUpperCase(Srch);
     OldP:=AnsiUpperCase(OldP);
     end;
-  RemS:=S;
   Result:='';
-  while (Length(Srch)<>0) do
+  I:=1;
+  SC:=1;
+  R:=1;
+  MaxLen:=length(Srch)-Length(OldP)+1;
+  Replaced:=false;
+  while I<=MaxLen do
     begin
-    P:=AnsiPos(OldP, Srch);
-    if P=0 then
+    if (Srch[I]=OldP[1]) and (CompareByte(Srch[I], OldP[1], length(OldP))=0) then
       begin
-      Result:=Result+RemS;
-      Srch:='';
+      Replaced:=true;
+      CopySource(I, True);
+      inc(I, Length(OldP));
+      if not (rfReplaceAll in Flags) then Break;
       end
     else
-      begin
-      Result:=Result+Copy(RemS,1,P-1)+NewPattern;
-      P:=P+Length(OldP);
-      RemS:=Copy(RemS,P,Length(RemS)-P+1);
-      if not (rfReplaceAll in Flags) then
-        begin
-        Result:=Result+RemS;
-        Srch:='';
-        end
-      else
-         Srch:=Copy(Srch,P,Length(Srch)-P+1);
-      end;
+      Inc(I);
     end;
+
+  if Replaced then
+    begin
+    // at least one replacement has occurred
+    CopySource(Length(S)+1, False);
+    SetLength(Result, R-1);
+    end
+  else
+    // no old patterns found
+    Result:=S;
 end;
 
 
compareString.diff (2,818 bytes)

Dmitry Boyarintsev

2014-10-20 04:11

developer   ~0078407

Even two bugs, once the first one is fixed, the second appears. This new version handles both.

Jarto Tarpio

2014-10-20 16:16

reporter   ~0078422

Dmitry, this last one works better. It's faster than my version, except when OldPattern and NewPattern are as long.

José Mejuto

2014-10-24 14:22

reporter   ~0078586

I had added my own version of StringReplace (called StringReplaceX in the code) which is fast and memory conservative in usual situations, except in the case of replacing an string of the same character multiple times, replacing that character by other one (or the same). But I think this a big corner case situation.

Also to the unit I had added a compatible Boyen-Moore implementation which is faster for large matching strings.

The unit may need little adjusts like replacing "ansistring" by "string" (I like to use ansistring instead string, when using AnsiStrings.

Functions compatibility with original fpc's one has been tested with random strings and performing the same replace with original code and the new one. The test procedure is this one:

procedure TestStringReplace;
const
  RANGECHAR=128;
  RANGESTRINGSIZE=50;
var
  C1,C2,C3: ansistring;
  j,k: integer;
  r1,r2: ansistring;
  x: int64;
begin
  x:=GetTickCount64;
  for j := 0 to Pred(100000) do begin
    r1:='';
    r2:='';
    SetLength(C1,Random(100));
    for k := 1 to Length(C1) do begin
      c1[k]:=char(Random(RANGECHAR)+65);
    end;
    SetLength(C2,Random(RANGESTRINGSIZE));
    for k := 1 to Length(C2) do begin
      c2[k]:=Char(Random(RANGECHAR)+65);
    end;
    SetLength(C3,Random(RANGESTRINGSIZE));
    for k := 1 to Length(C3) do begin
      c3[k]:=Char(Random(RANGECHAR)+65);
    end;
    writeln(j);
    r1:=StringReplace(C1,C2,C3,[rfReplaceAll,rfIgnoreCase]);
    r2:=StringReplaceBoyerMoore(C1,C2,C3,[rfReplaceAll,rfIgnoreCase]);
    //r2:=StringReplaceX(C1,C2,C3,[rfReplaceAll,rfIgnoreCase]);
    if r1<>r2 then begin
      beep;
      Writeln('Fail ',Length(R1),' - ',Length(r2));
      break;
    end;
  end;
  writeln((GetTickCount64-x) / 1000);
end;

José Mejuto

2014-10-24 14:23

reporter  

ustringreplace.pas (19,898 bytes)
unit ustringreplace;

{$mode objfpc}{$H+}

interface

uses
  SysUtils;

function StringReplaceX(const S, OldPattern, NewPattern: ansistring; Flags: TReplaceFlags): ansistring;

function StringReplaceBoyerMoore (const S, OldPattern, NewPattern: ansistring; Flags: TReplaceFlags): ansistring;

implementation

type
  SizeIntArray = array of SizeInt;

procedure FindMatchesBoyerMooreCaseSensitive(const S,OldPattern: PChar; const SSize, OldPatternSize: SizeInt; out aMatches: SizeIntArray; const aMatchAll: Boolean); forward;
procedure FindMatchesBoyerMooreCaseINSensitive(const S, OldPattern: PChar; const SSize, OldPatternSize: SizeInt; out aMatches: SizeIntArray; const aMatchAll: Boolean); forward;

(*
  StringReplace

  Replaces one or many ocurrences of an ansistring in another ansistring by a new one.
  It can perform the compare ignoring case (ansi).

  * Parameters (Read only):
  S: The string to be searched in.
  OldPattern: The string to be searched.
  NewPattern: The string to replace OldPattern matches.
  Flags:
    rfReplaceAll: Replace all occurrences.
    rfIgnoreCase: Ignore case in OldPattern matching.

  * Returns:
    The modified string (if needed).

  The function is almost linear.
  It is memory conservative, just sizeof(SizeInt) per match in blocks off 100 matches
  plus Length(OldPattern)*2 in the case of ignoring case.
  Memory copies are the minimun necessary.
*)

function StringReplaceX(const S, OldPattern, NewPattern: ansistring;
  Flags: TReplaceFlags): ansistring;
const
  MATCHESCOUNTRESIZER=100; //Arbitrary value. Memory used = MATCHESCOUNTRESIZER * sizeof(SizeInt)
var
  //Stores where a replace will take place
  Matches: array of SizeInt;
  //Stores the amount of replaces that will take place
  MatchesCount: SizeInt;
  //Currently allocated space for matches.
  MatchesAllocatedLimit: SizeInt;
  //Uppercase version of pattern
  PatternUppercase: ansistring;
  //Lowercase version of pattern
  PatternLowerCase: ansistring;
  //Index
  MatchIndex: SizeInt;
  MatchLimit: SizeInt;
  MatchInternal: SizeInt;
  MatchTarget: SizeInt;
  AdvanceIndex: SizeInt;

  //Miscelanous variables
  OldPatternSize: SizeInt;
  NewPatternSize: SizeInt;

  //Resizes the allocated space for replacement index
  procedure ResizeAllocatedMatches;
  begin
    MatchesAllocatedLimit:=MatchesCount+MATCHESCOUNTRESIZER;
    SetLength(Matches,MatchesAllocatedLimit);
  end;

  //Add a match to be replaced
  procedure AddMatch(const aPosition: SizeInt); inline;
  begin
    if MatchesCount = MatchesAllocatedLimit then begin
      ResizeAllocatedMatches;
    end;
    Matches[MatchesCount]:=aPosition;
    inc(MatchesCount);
  end;
begin
  if (OldPattern='') or (Length(OldPattern)>Length(S)) then begin
    //This cases will never match nothing.
    Result:=S;
    exit;
  end;
  Result:='';
  OldPatternSize:=Length(OldPattern);
  MatchesCount:=0;
  MatchesAllocatedLimit:=0;
  if rfIgnoreCase in Flags then begin
    //Different algorithm for case sensitive and insensitive
    //This is insensitive, so 2 new ansistrings are created for search pattern, one upper and one lower case.
    //It is easy, usually, to create 2 versions of the match pattern than uppercased and lowered case each
    //character in the "to be matched" string.
    PatternUppercase:=AnsiUpperCase(OldPattern);
    PatternLowerCase:=AnsiLowerCase(OldPattern);
    MatchIndex:=Length(OldPattern);
    MatchLimit:=Length(S);
    NewPatternSize:=Length(NewPattern);
    while MatchIndex <= MatchLimit do begin
      if (S[MatchIndex]=PatternLowerCase[OldPatternSize]) or (S[MatchIndex]=PatternUppercase[OldPatternSize]) then begin
        //Match backwards...
        MatchInternal:=OldPatternSize-1;
        MatchTarget:=MatchIndex-1;
        while MatchInternal>=1 do begin
          if (S[MatchTarget]=PatternLowerCase[MatchInternal]) or (S[MatchTarget]=PatternUppercase[MatchInternal]) then begin
            dec(MatchInternal);
            dec(MatchTarget);
          end else begin
            break;
          end;
        end;
        if MatchInternal=0 then begin
          //Match found, all char meet the sequence
          //MatchTarget points to char before, so matching is +1
          AddMatch(MatchTarget+1);
          inc(MatchIndex,OldPatternSize);
          if not (rfReplaceAll in Flags) then begin
            break;
          end;
        end else begin
          //Match not found
          inc(MatchIndex);
        end;
      end else begin
        inc(MatchIndex);
      end;
    end;
  end else begin
    //Different algorithm for case sensitive and insensitive
    //This is sensitive, so just 1 binary comprare
    MatchIndex:=Length(OldPattern);
    MatchLimit:=Length(S);
    NewPatternSize:=Length(NewPattern);
    while MatchIndex <= MatchLimit do begin
      if (S[MatchIndex]=OldPattern[OldPatternSize]) then begin
        //Match backwards...
        MatchInternal:=OldPatternSize-1;
        MatchTarget:=MatchIndex-1;
        while MatchInternal>=1 do begin
          if (S[MatchTarget]=OldPattern[MatchInternal]) then begin
            dec(MatchInternal);
            dec(MatchTarget);
          end else begin
            break;
          end;
        end;
        if MatchInternal=0 then begin
          //Match found, all char meet the sequence
          //MatchTarget points to char before, so matching is +1
          AddMatch(MatchTarget+1);
          inc(MatchIndex,OldPatternSize);
          if not (rfReplaceAll in Flags) then begin
            break;
          end;
        end else begin
          //Match not found
          inc(MatchIndex);
        end;
      end else begin
        inc(MatchIndex);
      end;
    end;
  end;
  //Create room enougth for the result string
  SetLength(Result,Length(S)-OldPatternSize*MatchesCount+NewPatternSize*MatchesCount);
  MatchIndex:=1;
  MatchTarget:=1;
  //Matches[x] are 1 based offsets
  for MatchInternal := 0 to Pred(MatchesCount) do begin
    //Copy information up to next match
    AdvanceIndex:=Matches[MatchInternal]-MatchIndex;
    if AdvanceIndex>0 then begin
      move(S[MatchIndex],Result[MatchTarget],AdvanceIndex);
      inc(MatchTarget,AdvanceIndex);
      inc(MatchIndex,AdvanceIndex);
    end;
    //Copy the new replace information string
    if NewPatternSize>0 then begin
      move(NewPattern[1],Result[MatchTarget],NewPatternSize);
      inc(MatchTarget,NewPatternSize);
    end;
    inc(MatchIndex,OldPatternSize);
  end;
  if MatchTarget<=Length(Result) then begin
    //Add remain data at the end of source.
    move(S[MatchIndex],Result[MatchTarget],Length(Result)-MatchTarget+1);
  end;
end;

(*
  StringReplaceBoyerMoore

  Replaces one or many ocurrences of an ansistring in another ansistring by a new one.
  It can perform the compare ignoring case (ansi).

  * Parameters (Read only):
  S: The string to be searched in.
  OldPattern: The string to be searched.
  NewPattern: The string to replace OldPattern matches.
  Flags:
    rfReplaceAll: Replace all occurrences.
    rfIgnoreCase: Ignore case in OldPattern matching.

  * Returns:
    The modified string (if needed).

  It is memory conservative, just sizeof(SizeInt) per match in blocks off 100 matches
  plus Length(OldPattern)*2 in the case of ignoring case.
  Memory copies are the minimun necessary.
  Algorithm based in the Boyer-Moore string search algorithm.

  It is faster when the "S" string is very long and the OldPattern is also
  very big. As much big the OldPattern is, faster the search is too.

  It uses 2 different helper versions of Boyer-Moore algorithm, one for case
  sensitive and one for case INsensitive for speed reasons.

*)

function StringReplaceBoyerMoore(const S, OldPattern, NewPattern: ansistring;Flags: TReplaceFlags): ansistring;
var
  Matches: SizeIntArray;
  OldPatternSize: SizeInt;
  NewPatternSize: SizeInt;
  MatchesCount: SizeInt;
  MatchIndex: SizeInt;
  MatchTarget: SizeInt;
  MatchInternal: SizeInt;
  AdvanceIndex: SizeInt;
begin
  OldPatternSize:=Length(OldPattern);
  NewPatternSize:=Length(NewPattern);
  if (OldPattern='') or (Length(OldPattern)>Length(S)) then begin
    Result:=S;
    exit;
  end;

  if rfIgnoreCase in Flags then begin
    FindMatchesBoyerMooreCaseINSensitive(@s[1],@OldPattern[1],Length(S),Length(OldPattern),Matches, rfReplaceAll in Flags);
  end else begin
    FindMatchesBoyerMooreCaseSensitive(@s[1],@OldPattern[1],Length(S),Length(OldPattern),Matches, rfReplaceAll in Flags);
  end;

  MatchesCount:=Length(Matches);

  //Create room enougth for the result string
  SetLength(Result,Length(S)-OldPatternSize*MatchesCount+NewPatternSize*MatchesCount);
  MatchIndex:=1;
  MatchTarget:=1;
  //Matches[x] are 0 based offsets
  for MatchInternal := 0 to Pred(MatchesCount) do begin
    //Copy information up to next match
    AdvanceIndex:=Matches[MatchInternal]+1-MatchIndex;
    if AdvanceIndex>0 then begin
      move(S[MatchIndex],Result[MatchTarget],AdvanceIndex);
      inc(MatchTarget,AdvanceIndex);
      inc(MatchIndex,AdvanceIndex);
    end;
    //Copy the new replace information string
    if NewPatternSize>0 then begin
      move(NewPattern[1],Result[MatchTarget],NewPatternSize);
      inc(MatchTarget,NewPatternSize);
    end;
    inc(MatchIndex,OldPatternSize);
  end;
  if MatchTarget<=Length(Result) then begin
    //Add remain data at the end of source.
    move(S[MatchIndex],Result[MatchTarget],Length(Result)-MatchTarget+1);
  end;
end;

(*
  FindMatchesBoyerMooreCaseSensitive

  Finds one or many ocurrences of an ansistring in another ansistring.
  It is case sensitive.

  * Parameters:
  S: The PChar to be searched in. (Read only).
  OldPattern: The PChar to be searched. (Read only).
  SSize: The size of S in Chars. (Read only).
  OldPatternSize: The size of OldPatter in chars. (Read only).
  aMatches: SizeInt array where match indexes are returned (zero based) (write only).
  aMatchAll: Finds all matches, not just the first one. (Read only).

  * Returns:
    Nothing, information returned in aMatches parameter.

  The function is based in the Boyer-Moore algorithm.
*)

procedure FindMatchesBoyerMooreCaseSensitive(const S, OldPattern: PChar;
  const SSize, OldPatternSize: SizeInt; out aMatches: SizeIntArray;
  const aMatchAll: Boolean);
const
  ALPHABET_LENGHT=256;
  MATCHESCOUNTRESIZER=100; //Arbitrary value. Memory used = MATCHESCOUNTRESIZER * sizeof(SizeInt)
var
  //Stores the amount of replaces that will take place
  MatchesCount: SizeInt;
  //Currently allocated space for matches.
  MatchesAllocatedLimit: SizeInt;
type
  AlphabetArray=array [0..ALPHABET_LENGHT-1] of SizeInt;

  function Max(const a1,a2: SizeInt): SizeInt;
  begin
    if a1>a2 then Result:=a1 else Result:=a2;
  end;

  procedure MakeDeltaJumpTable1(out DeltaJumpTable1: AlphabetArray; const aPattern: PChar; const aPatternSize: SizeInt);
  var
    i: SizeInt;
  begin
    for i := 0 to ALPHABET_LENGHT-1 do begin
      DeltaJumpTable1[i]:=aPatternSize;
    end;
    //Last char do not enter in the equation
    for i := 0 to aPatternSize - 1 - 1 do begin
      DeltaJumpTable1[Ord(aPattern[i])]:=aPatternSize -1 - i;
    end;
  end;

  function IsPrefix(const aPattern: PChar; const aPatternSize, aPos: SizeInt): Boolean;
  var
    i: SizeInt;
    SuffixLength: SizeInt;
  begin
    SuffixLength:=aPatternSize-aPos;
    for i := 0 to SuffixLength-1 do begin
      if (aPattern[i] <> aPattern[aPos+i]) then begin
          exit(false);
      end;
    end;
    Result:=true;
  end;

  function SuffixLength(const aPattern: PChar; const aPatternSize, aPos: SizeInt): SizeInt;
  var
    i: SizeInt;
  begin
    i:=0;
    while (aPattern[aPos-i] = aPattern[aPatternSize-1-i]) and (i < aPos) do begin
      inc(i);
    end;
    Result:=i;
  end;

  procedure MakeDeltaJumpTable2(var DeltaJumpTable2: SizeIntArray; const aPattern: PChar; const aPatternSize: SizeInt);
  var
    Position: SizeInt;
    LastPrefixIndex: SizeInt;
    SuffixLengthValue: SizeInt;
  begin
    LastPrefixIndex:=aPatternSize-1;
    Position:=aPatternSize-1;
    while Position>=0 do begin
      if IsPrefix(aPattern,aPatternSize,Position+1) then begin
        LastPrefixIndex := Position+1;
      end;
      DeltaJumpTable2[Position] := LastPrefixIndex + (aPatternSize-1 - Position);
      Dec(Position);
    end;
    Position:=0;
    while Position<aPatternSize-1 do begin
      SuffixLengthValue:=SuffixLength(aPattern,aPatternSize,Position);
      if aPattern[Position-SuffixLengthValue] <> aPattern[aPatternSize-1 - SuffixLengthValue] then begin
        DeltaJumpTable2[aPatternSize - 1 - SuffixLengthValue] := aPatternSize - 1 - Position + SuffixLengthValue;
      end;
      Inc(Position);
    end;
  end;

  //Resizes the allocated space for replacement index
  procedure ResizeAllocatedMatches;
  begin
    MatchesAllocatedLimit:=MatchesCount+MATCHESCOUNTRESIZER;
    SetLength(aMatches,MatchesAllocatedLimit);
  end;

  //Add a match to be replaced
  procedure AddMatch(const aPosition: SizeInt); inline;
  begin
    if MatchesCount = MatchesAllocatedLimit then begin
      ResizeAllocatedMatches;
    end;
    aMatches[MatchesCount]:=aPosition;
    inc(MatchesCount);
  end;
var
  i,j: SizeInt;
  DeltaJumpTable1: array [0..ALPHABET_LENGHT-1] of SizeInt;
  DeltaJumpTable2: SizeIntArray;
begin
  MatchesCount:=0;
  MatchesAllocatedLimit:=0;
  SetLength(aMatches,MatchesCount);
  if OldPatternSize=0 then begin
    Exit;
  end;
  SetLength(DeltaJumpTable2,OldPatternSize);

  MakeDeltaJumpTable1(DeltaJumpTable1,OldPattern,OldPatternSize);
  MakeDeltaJumpTable2(DeltaJumpTable2,OldPattern,OldPatternSize);

  i:=OldPatternSize-1;
  while i < SSize do begin
    j:=OldPatternSize-1;
    while (j>=0) and (S[i] = OldPattern[j]) do begin
      dec(i);
      dec(j);
    end;
    if (j<0) then begin
      AddMatch(i+1);
      //Only first match ?
      if not aMatchAll then exit;
      inc(i,OldPatternSize);
      inc(i,OldPatternSize);
    end else begin
      i:=i + Max(DeltaJumpTable1[ord(s[i])],DeltaJumpTable2[j]);
    end;
  end;
  SetLength(aMatches,MatchesCount);
end;

(*
  FindMatchesBoyerMooreCaseINSensitive

  Finds one or many ocurrences of an ansistring in another ansistring.
  It is case INsensitive.

  * Parameters:
  S: The PChar to be searched in. (Read only).
  OldPattern: The PChar to be searched. (Read only).
  SSize: The size of S in Chars. (Read only).
  OldPatternSize: The size of OldPatter in chars. (Read only).
  aMatches: SizeInt array where match indexes are returned (zero based) (write only).
  aMatchAll: Finds all matches, not just the first one. (Read only).

  * Returns:
    Nothing, information returned in aMatches parameter.

  The function is based in the Boyer-Moore algorithm.
*)

procedure FindMatchesBoyerMooreCaseINSensitive(const S, OldPattern: PChar;
  const SSize, OldPatternSize: SizeInt; out aMatches: SizeIntArray;
  const aMatchAll: Boolean);
const
  ALPHABET_LENGHT=256;
  MATCHESCOUNTRESIZER=100; //Arbitrary value. Memory used = MATCHESCOUNTRESIZER * sizeof(SizeInt)
var
  //Lowercased OldPattern
  lPattern: ansistring;
  //Array of lowercased alphabet
  lCaseArray: array [0..ALPHABET_LENGHT-1] of char;
  //Stores the amount of replaces that will take place
  MatchesCount: SizeInt;
  //Currently allocated space for matches.
  MatchesAllocatedLimit: SizeInt;
type
  AlphabetArray=array [0..ALPHABET_LENGHT-1] of SizeInt;

  function Max(const a1,a2: SizeInt): SizeInt;
  begin
    if a1>a2 then Result:=a1 else Result:=a2;
  end;

  procedure MakeDeltaJumpTable1(out DeltaJumpTable1: AlphabetArray; const aPattern: PChar; const aPatternSize: SizeInt);
  var
    i: SizeInt;
  begin
    for i := 0 to ALPHABET_LENGHT-1 do begin
      DeltaJumpTable1[i]:=aPatternSize;
    end;
    //Last char do not enter in the equation
    for i := 0 to aPatternSize - 1 - 1 do begin
      DeltaJumpTable1[Ord(aPattern[i])]:=aPatternSize - 1 - i;
    end;
  end;

  function IsPrefix(const aPattern: PChar; const aPatternSize, aPos: SizeInt): Boolean; inline;
  var
    i: SizeInt;
    SuffixLength: SizeInt;
  begin
    SuffixLength:=aPatternSize-aPos;
    for i := 0 to SuffixLength-1 do begin
      if (aPattern[i+1] <> aPattern[aPos+i]) then begin
        exit(false);
      end;
    end;
    Result:=true;
  end;

  function SuffixLength(const aPattern: PChar; const aPatternSize, aPos: SizeInt): SizeInt; inline;
  var
    i: SizeInt;
  begin
    i:=0;
    while (aPattern[aPos-i] = aPattern[aPatternSize-1-i]) and (i < aPos) do begin
      inc(i);
    end;
    Result:=i;
  end;

  procedure MakeDeltaJumpTable2(var DeltaJumpTable2: SizeIntArray; const aPattern: PChar; const aPatternSize: SizeInt);
  var
    Position: SizeInt;
    LastPrefixIndex: SizeInt;
    SuffixLengthValue: SizeInt;
  begin
    LastPrefixIndex:=aPatternSize-1;
    Position:=aPatternSize-1;
    while Position>=0 do begin
      if IsPrefix(aPattern,aPatternSize,Position+1) then begin
        LastPrefixIndex := Position+1;
      end;
      DeltaJumpTable2[Position] := LastPrefixIndex + (aPatternSize-1 - Position);
      Dec(Position);
    end;
    Position:=0;
    while Position<aPatternSize-1 do begin
      SuffixLengthValue:=SuffixLength(aPattern,aPatternSize,Position);
      if aPattern[Position-SuffixLengthValue] <> aPattern[aPatternSize-1 - SuffixLengthValue] then begin
        DeltaJumpTable2[aPatternSize - 1 - SuffixLengthValue] := aPatternSize - 1 - Position + SuffixLengthValue;
      end;
      Inc(Position);
    end;
  end;

  //Resizes the allocated space for replacement index
  procedure ResizeAllocatedMatches;
  begin
    MatchesAllocatedLimit:=MatchesCount+MATCHESCOUNTRESIZER;
    SetLength(aMatches,MatchesAllocatedLimit);
  end;

  //Add a match to be replaced
  procedure AddMatch(const aPosition: SizeInt); inline;
  begin
    if MatchesCount = MatchesAllocatedLimit then begin
      ResizeAllocatedMatches;
    end;
    aMatches[MatchesCount]:=aPosition;
    inc(MatchesCount);
  end;
var
  i,j: SizeInt;
  DeltaJumpTable1: array [0..ALPHABET_LENGHT-1] of SizeInt;
  DeltaJumpTable2: SizeIntArray;
  //Pointer to lowered OldPattern
  plPattern: PChar;
begin
  MatchesCount:=0;
  MatchesAllocatedLimit:=0;
  SetLength(aMatches,MatchesCount);
  if OldPatternSize=0 then begin
    Exit;
  end;

  //Build an internal array of lowercase version of every possible char.
  for j := 0 to Pred(ALPHABET_LENGHT) do begin
    lCaseArray[j]:=AnsiLowerCase(char(j))[1];
  end;

  //Create the new lowercased pattern
  SetLength(lPattern,OldPatternSize);
  for j := 0 to Pred(OldPatternSize) do begin
    lPattern[j+1]:=lCaseArray[ord(OldPattern[j])];
  end;

  SetLength(DeltaJumpTable2,OldPatternSize);

  MakeDeltaJumpTable1(DeltaJumpTable1,@lPattern[1],OldPatternSize);
  MakeDeltaJumpTable2(DeltaJumpTable2,@lPattern[1],OldPatternSize);

  plPattern:=@lPattern[1];
  i:=OldPatternSize-1;
  while i < SSize do begin
    j:=OldPatternSize-1;
    while (j>=0) and (lCaseArray[Ord(S[i])] = plPattern[j]) do begin
      dec(i);
      dec(j);
    end;
    if (j<0) then begin
      AddMatch(i+1);
      //Only first match ?
      if not aMatchAll then break;
      inc(i,OldPatternSize);
      inc(i,OldPatternSize);
    end else begin
      i:=i + Max(DeltaJumpTable1[Ord(lCaseArray[Ord(s[i])])],DeltaJumpTable2[j]);
    end;
  end;
  SetLength(aMatches,MatchesCount);
end;

end.

ustringreplace.pas (19,898 bytes)

silvioprog

2015-02-18 06:56

reporter   ~0081203

The StringReplaceX is more fast than original StringReplace (thanks José! =)). You could changing the declaration from ansistring to string.

But I think that I found a bug in both (original StringReplace and StringReplaceX). Please test it:

program project1;

{$mode objfpc}{$H+}

uses
  ustringreplace,
  SysUtils;

const
  fmt = 'hh:nn:ss.zzz';
  Count = 100000;
  f = 'I çove free pascaÇ';
  o = 'ç';
  n = 'l';

  procedure TestStringReplace;
  var
    i: integer;
    s: string;
    b, e: TDateTime;
  begin
    b := now;
    for i := 1 to Count do
      s := StringReplace(f, o, n, [rfReplaceAll, rfIgnoreCase]);
    e := Now;
    WriteLn('StringReplace: ', s, ' - ', FormatDateTime(fmt, e - b));
  end;

  procedure TestStringReplaceX;
  var
    i: integer;
    s: string;
    b, e: TDateTime;
  begin
    b := now;
    for i := 1 to Count do
      s := StringReplaceX(f, o, n, [rfReplaceAll, rfIgnoreCase]);
    e := Now;
    WriteLn('StringReplaceX: ', s, ' - ', FormatDateTime(fmt, e - b));
  end;

begin
  TestStringReplace;
  TestStringReplaceX;

  TestStringReplaceX;
  TestStringReplace;

  ReadLn;
end.


The result is (Lazarus 1.5 rUnknown FPC 3.1.1 i386-win32-win32/win64):

StringReplace: I love free pascaA╪ - 00:00:00.111
StringReplaceX: I love free pascaA╪ - 00:00:00.060
StringReplaceX: I love free pascaA╪ - 00:00:00.060
StringReplace: I love free pascaA╪ - 00:00:00.110


And this code below in Delphi 7 is a bit slow, but replaces all chars:

program Project2;

{$APPTYPE CONSOLE}

uses
  SysUtils;

const
  fmt = 'hh:nn:ss.zzz';
  Count = 100000;
  f = 'I çove free pascaÇ';
  o = 'ç';
  n = 'l';
var
  i: integer;
  s: string;
  b, e: TDateTime;
begin
  b := now;
  for i := 1 to Count do
    s := StringReplace(f, o, n, [rfReplaceAll, rfIgnoreCase]);
  e := Now;
  WriteLn('StringReplace: ', s, ' - ', FormatDateTime(fmt, e - b));
  Readln;
end.


The result is:
StringReplace: I love free pascal - 00:00:00.209

silvioprog

2015-02-18 07:02

reporter   ~0081204

Last edited: 2015-02-18 07:05

View 3 revisions

The StringReplaceBoyerMoore is slow and return a AV in this test:

program project1;

{$mode objfpc}{$H+}

uses
  ustringreplace,
  SysUtils;

const
  fmt = 'hh:nn:ss.zzz';
  Count = 100000;
  f = 'I çove çççççfree pascaÇ{9A6798A4-BECD-4964-B7C5-ãçéñE8FA4B5CE262}ÇÇÇ';
  o = 'ãçéñE8FA4B5CE262';
  n = 'XXXXããXXX';

  procedure TestStringReplace;
  var
    i: integer;
    s: string;
    b, e: TDateTime;
  begin
    b := now;
    for i := 1 to Count do
      s := StringReplaceBoyerMoore(f, o, n, []);
    e := Now;
    WriteLn('StringReplaceBoyerMoore: ', s, ' - ', FormatDateTime(fmt, e - b));
  end;

begin
  TestStringReplace;
  TestStringReplace;
  TestStringReplace;
  ReadLn;
end.


(damn, mantis cutted my log =| ): http://pastebin.com/arDHQvLK

silvioprog

2015-02-18 07:20

reporter   ~0081206

Hm, after change the encoding (right click, File Settings | Encoding > CP1252) of my project and of the ustringreplace unit to CP1252, now the result is:

StringReplace: I love free pascal - 00:00:00.145
StringReplaceX: I love free pascaÇ - 00:00:00.033
StringReplaceX: I love free pascaÇ - 00:00:00.034
StringReplace: I love free pascal - 00:00:00.142


So original StringReplace seems working fine in correctly CP.

José Mejuto

2015-02-18 14:03

reporter   ~0081212

Thanks for the tests Silvio.

The Boyern-Moore code is slow for small strings (less than 1000 bytes +/-) and is faster for big "to be replaced" strings, it should not be used as general case. There is a bug in the code in the insensitive version, the line:

      //Only first match ?
      if not aMatchAll then exit;

Should be:

      //Only first match ?
      if not aMatchAll then break;

The differences in StringReplace and StringReplaceX are the "ansistring" used in StringReplaceX as conversions take place. Replacing all ocurrences of "ansistring" by "string" in ustringreplace.pas fixes the situation.

I had uploaded fixed bug version and with the ansistring by string replacement.

José Mejuto

2015-02-18 14:04

reporter  

ustringreplace_2.pas (19,855 bytes)
unit ustringreplace;

{$mode objfpc}{$H+}

interface

uses
  SysUtils;

function StringReplaceX(const S, OldPattern, NewPattern: string; Flags: TReplaceFlags): string;

function StringReplaceBoyerMoore (const S, OldPattern, NewPattern: string; Flags: TReplaceFlags): string;

implementation

type
  SizeIntArray = array of SizeInt;

procedure FindMatchesBoyerMooreCaseSensitive(const S,OldPattern: PChar; const SSize, OldPatternSize: SizeInt; out aMatches: SizeIntArray; const aMatchAll: Boolean); forward;
procedure FindMatchesBoyerMooreCaseINSensitive(const S, OldPattern: PChar; const SSize, OldPatternSize: SizeInt; out aMatches: SizeIntArray; const aMatchAll: Boolean); forward;

(*
  StringReplace

  Replaces one or many ocurrences of an ansistring in another ansistring by a new one.
  It can perform the compare ignoring case (ansi).

  * Parameters (Read only):
  S: The string to be searched in.
  OldPattern: The string to be searched.
  NewPattern: The string to replace OldPattern matches.
  Flags:
    rfReplaceAll: Replace all occurrences.
    rfIgnoreCase: Ignore case in OldPattern matching.

  * Returns:
    The modified string (if needed).

  The function is almost linear.
  It is memory conservative, just sizeof(SizeInt) per match in blocks off 100 matches
  plus Length(OldPattern)*2 in the case of ignoring case.
  Memory copies are the minimun necessary.
*)

function StringReplaceX(const S, OldPattern, NewPattern: string;
  Flags: TReplaceFlags): string;
const
  MATCHESCOUNTRESIZER=100; //Arbitrary value. Memory used = MATCHESCOUNTRESIZER * sizeof(SizeInt)
var
  //Stores where a replace will take place
  Matches: array of SizeInt;
  //Stores the amount of replaces that will take place
  MatchesCount: SizeInt;
  //Currently allocated space for matches.
  MatchesAllocatedLimit: SizeInt;
  //Uppercase version of pattern
  PatternUppercase: string;
  //Lowercase version of pattern
  PatternLowerCase: string;
  //Index
  MatchIndex: SizeInt;
  MatchLimit: SizeInt;
  MatchInternal: SizeInt;
  MatchTarget: SizeInt;
  AdvanceIndex: SizeInt;

  //Miscelanous variables
  OldPatternSize: SizeInt;
  NewPatternSize: SizeInt;

  //Resizes the allocated space for replacement index
  procedure ResizeAllocatedMatches;
  begin
    MatchesAllocatedLimit:=MatchesCount+MATCHESCOUNTRESIZER;
    SetLength(Matches,MatchesAllocatedLimit);
  end;

  //Add a match to be replaced
  procedure AddMatch(const aPosition: SizeInt); inline;
  begin
    if MatchesCount = MatchesAllocatedLimit then begin
      ResizeAllocatedMatches;
    end;
    Matches[MatchesCount]:=aPosition;
    inc(MatchesCount);
  end;
begin
  if (OldPattern='') or (Length(OldPattern)>Length(S)) then begin
    //This cases will never match nothing.
    Result:=S;
    exit;
  end;
  Result:='';
  OldPatternSize:=Length(OldPattern);
  MatchesCount:=0;
  MatchesAllocatedLimit:=0;
  if rfIgnoreCase in Flags then begin
    //Different algorithm for case sensitive and insensitive
    //This is insensitive, so 2 new ansistrings are created for search pattern, one upper and one lower case.
    //It is easy, usually, to create 2 versions of the match pattern than uppercased and lowered case each
    //character in the "to be matched" string.
    PatternUppercase:=AnsiUpperCase(OldPattern);
    PatternLowerCase:=AnsiLowerCase(OldPattern);
    MatchIndex:=Length(OldPattern);
    MatchLimit:=Length(S);
    NewPatternSize:=Length(NewPattern);
    while MatchIndex <= MatchLimit do begin
      if (S[MatchIndex]=PatternLowerCase[OldPatternSize]) or (S[MatchIndex]=PatternUppercase[OldPatternSize]) then begin
        //Match backwards...
        MatchInternal:=OldPatternSize-1;
        MatchTarget:=MatchIndex-1;
        while MatchInternal>=1 do begin
          if (S[MatchTarget]=PatternLowerCase[MatchInternal]) or (S[MatchTarget]=PatternUppercase[MatchInternal]) then begin
            dec(MatchInternal);
            dec(MatchTarget);
          end else begin
            break;
          end;
        end;
        if MatchInternal=0 then begin
          //Match found, all char meet the sequence
          //MatchTarget points to char before, so matching is +1
          AddMatch(MatchTarget+1);
          inc(MatchIndex,OldPatternSize);
          if not (rfReplaceAll in Flags) then begin
            break;
          end;
        end else begin
          //Match not found
          inc(MatchIndex);
        end;
      end else begin
        inc(MatchIndex);
      end;
    end;
  end else begin
    //Different algorithm for case sensitive and insensitive
    //This is sensitive, so just 1 binary comprare
    MatchIndex:=Length(OldPattern);
    MatchLimit:=Length(S);
    NewPatternSize:=Length(NewPattern);
    while MatchIndex <= MatchLimit do begin
      if (S[MatchIndex]=OldPattern[OldPatternSize]) then begin
        //Match backwards...
        MatchInternal:=OldPatternSize-1;
        MatchTarget:=MatchIndex-1;
        while MatchInternal>=1 do begin
          if (S[MatchTarget]=OldPattern[MatchInternal]) then begin
            dec(MatchInternal);
            dec(MatchTarget);
          end else begin
            break;
          end;
        end;
        if MatchInternal=0 then begin
          //Match found, all char meet the sequence
          //MatchTarget points to char before, so matching is +1
          AddMatch(MatchTarget+1);
          inc(MatchIndex,OldPatternSize);
          if not (rfReplaceAll in Flags) then begin
            break;
          end;
        end else begin
          //Match not found
          inc(MatchIndex);
        end;
      end else begin
        inc(MatchIndex);
      end;
    end;
  end;
  //Create room enougth for the result string
  SetLength(Result,Length(S)-OldPatternSize*MatchesCount+NewPatternSize*MatchesCount);
  MatchIndex:=1;
  MatchTarget:=1;
  //Matches[x] are 1 based offsets
  for MatchInternal := 0 to Pred(MatchesCount) do begin
    //Copy information up to next match
    AdvanceIndex:=Matches[MatchInternal]-MatchIndex;
    if AdvanceIndex>0 then begin
      move(S[MatchIndex],Result[MatchTarget],AdvanceIndex);
      inc(MatchTarget,AdvanceIndex);
      inc(MatchIndex,AdvanceIndex);
    end;
    //Copy the new replace information string
    if NewPatternSize>0 then begin
      move(NewPattern[1],Result[MatchTarget],NewPatternSize);
      inc(MatchTarget,NewPatternSize);
    end;
    inc(MatchIndex,OldPatternSize);
  end;
  if MatchTarget<=Length(Result) then begin
    //Add remain data at the end of source.
    move(S[MatchIndex],Result[MatchTarget],Length(Result)-MatchTarget+1);
  end;
end;

(*
  StringReplaceBoyerMoore

  Replaces one or many ocurrences of an ansistring in another ansistring by a new one.
  It can perform the compare ignoring case (ansi).

  * Parameters (Read only):
  S: The string to be searched in.
  OldPattern: The string to be searched.
  NewPattern: The string to replace OldPattern matches.
  Flags:
    rfReplaceAll: Replace all occurrences.
    rfIgnoreCase: Ignore case in OldPattern matching.

  * Returns:
    The modified string (if needed).

  It is memory conservative, just sizeof(SizeInt) per match in blocks off 100 matches
  plus Length(OldPattern)*2 in the case of ignoring case.
  Memory copies are the minimun necessary.
  Algorithm based in the Boyer-Moore string search algorithm.

  It is faster when the "S" string is very long and the OldPattern is also
  very big. As much big the OldPattern is, faster the search is too.

  It uses 2 different helper versions of Boyer-Moore algorithm, one for case
  sensitive and one for case INsensitive for speed reasons.

*)

function StringReplaceBoyerMoore(const S, OldPattern, NewPattern: string;Flags: TReplaceFlags): string;
var
  Matches: SizeIntArray;
  OldPatternSize: SizeInt;
  NewPatternSize: SizeInt;
  MatchesCount: SizeInt;
  MatchIndex: SizeInt;
  MatchTarget: SizeInt;
  MatchInternal: SizeInt;
  AdvanceIndex: SizeInt;
begin
  OldPatternSize:=Length(OldPattern);
  NewPatternSize:=Length(NewPattern);
  if (OldPattern='') or (Length(OldPattern)>Length(S)) then begin
    Result:=S;
    exit;
  end;

  if rfIgnoreCase in Flags then begin
    FindMatchesBoyerMooreCaseINSensitive(@s[1],@OldPattern[1],Length(S),Length(OldPattern),Matches, rfReplaceAll in Flags);
  end else begin
    FindMatchesBoyerMooreCaseSensitive(@s[1],@OldPattern[1],Length(S),Length(OldPattern),Matches, rfReplaceAll in Flags);
  end;

  MatchesCount:=Length(Matches);

  //Create room enougth for the result string
  SetLength(Result,Length(S)-OldPatternSize*MatchesCount+NewPatternSize*MatchesCount);
  MatchIndex:=1;
  MatchTarget:=1;
  //Matches[x] are 0 based offsets
  for MatchInternal := 0 to Pred(MatchesCount) do begin
    //Copy information up to next match
    AdvanceIndex:=Matches[MatchInternal]+1-MatchIndex;
    if AdvanceIndex>0 then begin
      move(S[MatchIndex],Result[MatchTarget],AdvanceIndex);
      inc(MatchTarget,AdvanceIndex);
      inc(MatchIndex,AdvanceIndex);
    end;
    //Copy the new replace information string
    if NewPatternSize>0 then begin
      move(NewPattern[1],Result[MatchTarget],NewPatternSize);
      inc(MatchTarget,NewPatternSize);
    end;
    inc(MatchIndex,OldPatternSize);
  end;
  if MatchTarget<=Length(Result) then begin
    //Add remain data at the end of source.
    move(S[MatchIndex],Result[MatchTarget],Length(Result)-MatchTarget+1);
  end;
end;

(*
  FindMatchesBoyerMooreCaseSensitive

  Finds one or many ocurrences of an ansistring in another ansistring.
  It is case sensitive.

  * Parameters:
  S: The PChar to be searched in. (Read only).
  OldPattern: The PChar to be searched. (Read only).
  SSize: The size of S in Chars. (Read only).
  OldPatternSize: The size of OldPatter in chars. (Read only).
  aMatches: SizeInt array where match indexes are returned (zero based) (write only).
  aMatchAll: Finds all matches, not just the first one. (Read only).

  * Returns:
    Nothing, information returned in aMatches parameter.

  The function is based in the Boyer-Moore algorithm.
*)

procedure FindMatchesBoyerMooreCaseSensitive(const S, OldPattern: PChar;
  const SSize, OldPatternSize: SizeInt; out aMatches: SizeIntArray;
  const aMatchAll: Boolean);
const
  ALPHABET_LENGHT=256;
  MATCHESCOUNTRESIZER=100; //Arbitrary value. Memory used = MATCHESCOUNTRESIZER * sizeof(SizeInt)
var
  //Stores the amount of replaces that will take place
  MatchesCount: SizeInt;
  //Currently allocated space for matches.
  MatchesAllocatedLimit: SizeInt;
type
  AlphabetArray=array [0..ALPHABET_LENGHT-1] of SizeInt;

  function Max(const a1,a2: SizeInt): SizeInt;
  begin
    if a1>a2 then Result:=a1 else Result:=a2;
  end;

  procedure MakeDeltaJumpTable1(out DeltaJumpTable1: AlphabetArray; const aPattern: PChar; const aPatternSize: SizeInt);
  var
    i: SizeInt;
  begin
    for i := 0 to ALPHABET_LENGHT-1 do begin
      DeltaJumpTable1[i]:=aPatternSize;
    end;
    //Last char do not enter in the equation
    for i := 0 to aPatternSize - 1 - 1 do begin
      DeltaJumpTable1[Ord(aPattern[i])]:=aPatternSize -1 - i;
    end;
  end;

  function IsPrefix(const aPattern: PChar; const aPatternSize, aPos: SizeInt): Boolean;
  var
    i: SizeInt;
    SuffixLength: SizeInt;
  begin
    SuffixLength:=aPatternSize-aPos;
    for i := 0 to SuffixLength-1 do begin
      if (aPattern[i] <> aPattern[aPos+i]) then begin
          exit(false);
      end;
    end;
    Result:=true;
  end;

  function SuffixLength(const aPattern: PChar; const aPatternSize, aPos: SizeInt): SizeInt;
  var
    i: SizeInt;
  begin
    i:=0;
    while (aPattern[aPos-i] = aPattern[aPatternSize-1-i]) and (i < aPos) do begin
      inc(i);
    end;
    Result:=i;
  end;

  procedure MakeDeltaJumpTable2(var DeltaJumpTable2: SizeIntArray; const aPattern: PChar; const aPatternSize: SizeInt);
  var
    Position: SizeInt;
    LastPrefixIndex: SizeInt;
    SuffixLengthValue: SizeInt;
  begin
    LastPrefixIndex:=aPatternSize-1;
    Position:=aPatternSize-1;
    while Position>=0 do begin
      if IsPrefix(aPattern,aPatternSize,Position+1) then begin
        LastPrefixIndex := Position+1;
      end;
      DeltaJumpTable2[Position] := LastPrefixIndex + (aPatternSize-1 - Position);
      Dec(Position);
    end;
    Position:=0;
    while Position<aPatternSize-1 do begin
      SuffixLengthValue:=SuffixLength(aPattern,aPatternSize,Position);
      if aPattern[Position-SuffixLengthValue] <> aPattern[aPatternSize-1 - SuffixLengthValue] then begin
        DeltaJumpTable2[aPatternSize - 1 - SuffixLengthValue] := aPatternSize - 1 - Position + SuffixLengthValue;
      end;
      Inc(Position);
    end;
  end;

  //Resizes the allocated space for replacement index
  procedure ResizeAllocatedMatches;
  begin
    MatchesAllocatedLimit:=MatchesCount+MATCHESCOUNTRESIZER;
    SetLength(aMatches,MatchesAllocatedLimit);
  end;

  //Add a match to be replaced
  procedure AddMatch(const aPosition: SizeInt); inline;
  begin
    if MatchesCount = MatchesAllocatedLimit then begin
      ResizeAllocatedMatches;
    end;
    aMatches[MatchesCount]:=aPosition;
    inc(MatchesCount);
  end;
var
  i,j: SizeInt;
  DeltaJumpTable1: array [0..ALPHABET_LENGHT-1] of SizeInt;
  DeltaJumpTable2: SizeIntArray;
begin
  MatchesCount:=0;
  MatchesAllocatedLimit:=0;
  SetLength(aMatches,MatchesCount);
  if OldPatternSize=0 then begin
    Exit;
  end;
  SetLength(DeltaJumpTable2,OldPatternSize);

  MakeDeltaJumpTable1(DeltaJumpTable1,OldPattern,OldPatternSize);
  MakeDeltaJumpTable2(DeltaJumpTable2,OldPattern,OldPatternSize);

  i:=OldPatternSize-1;
  while i < SSize do begin
    j:=OldPatternSize-1;
    while (j>=0) and (S[i] = OldPattern[j]) do begin
      dec(i);
      dec(j);
    end;
    if (j<0) then begin
      AddMatch(i+1);
      //Only first match ?
      if not aMatchAll then break;
      inc(i,OldPatternSize);
      inc(i,OldPatternSize);
    end else begin
      i:=i + Max(DeltaJumpTable1[ord(s[i])],DeltaJumpTable2[j]);
    end;
  end;
  SetLength(aMatches,MatchesCount);
end;

(*
  FindMatchesBoyerMooreCaseINSensitive

  Finds one or many ocurrences of an ansistring in another ansistring.
  It is case INsensitive.

  * Parameters:
  S: The PChar to be searched in. (Read only).
  OldPattern: The PChar to be searched. (Read only).
  SSize: The size of S in Chars. (Read only).
  OldPatternSize: The size of OldPatter in chars. (Read only).
  aMatches: SizeInt array where match indexes are returned (zero based) (write only).
  aMatchAll: Finds all matches, not just the first one. (Read only).

  * Returns:
    Nothing, information returned in aMatches parameter.

  The function is based in the Boyer-Moore algorithm.
*)

procedure FindMatchesBoyerMooreCaseINSensitive(const S, OldPattern: PChar;
  const SSize, OldPatternSize: SizeInt; out aMatches: SizeIntArray;
  const aMatchAll: Boolean);
const
  ALPHABET_LENGHT=256;
  MATCHESCOUNTRESIZER=100; //Arbitrary value. Memory used = MATCHESCOUNTRESIZER * sizeof(SizeInt)
var
  //Lowercased OldPattern
  lPattern: string;
  //Array of lowercased alphabet
  lCaseArray: array [0..ALPHABET_LENGHT-1] of char;
  //Stores the amount of replaces that will take place
  MatchesCount: SizeInt;
  //Currently allocated space for matches.
  MatchesAllocatedLimit: SizeInt;
type
  AlphabetArray=array [0..ALPHABET_LENGHT-1] of SizeInt;

  function Max(const a1,a2: SizeInt): SizeInt;
  begin
    if a1>a2 then Result:=a1 else Result:=a2;
  end;

  procedure MakeDeltaJumpTable1(out DeltaJumpTable1: AlphabetArray; const aPattern: PChar; const aPatternSize: SizeInt);
  var
    i: SizeInt;
  begin
    for i := 0 to ALPHABET_LENGHT-1 do begin
      DeltaJumpTable1[i]:=aPatternSize;
    end;
    //Last char do not enter in the equation
    for i := 0 to aPatternSize - 1 - 1 do begin
      DeltaJumpTable1[Ord(aPattern[i])]:=aPatternSize - 1 - i;
    end;
  end;

  function IsPrefix(const aPattern: PChar; const aPatternSize, aPos: SizeInt): Boolean; inline;
  var
    i: SizeInt;
    SuffixLength: SizeInt;
  begin
    SuffixLength:=aPatternSize-aPos;
    for i := 0 to SuffixLength-1 do begin
      if (aPattern[i+1] <> aPattern[aPos+i]) then begin
        exit(false);
      end;
    end;
    Result:=true;
  end;

  function SuffixLength(const aPattern: PChar; const aPatternSize, aPos: SizeInt): SizeInt; inline;
  var
    i: SizeInt;
  begin
    i:=0;
    while (aPattern[aPos-i] = aPattern[aPatternSize-1-i]) and (i < aPos) do begin
      inc(i);
    end;
    Result:=i;
  end;

  procedure MakeDeltaJumpTable2(var DeltaJumpTable2: SizeIntArray; const aPattern: PChar; const aPatternSize: SizeInt);
  var
    Position: SizeInt;
    LastPrefixIndex: SizeInt;
    SuffixLengthValue: SizeInt;
  begin
    LastPrefixIndex:=aPatternSize-1;
    Position:=aPatternSize-1;
    while Position>=0 do begin
      if IsPrefix(aPattern,aPatternSize,Position+1) then begin
        LastPrefixIndex := Position+1;
      end;
      DeltaJumpTable2[Position] := LastPrefixIndex + (aPatternSize-1 - Position);
      Dec(Position);
    end;
    Position:=0;
    while Position<aPatternSize-1 do begin
      SuffixLengthValue:=SuffixLength(aPattern,aPatternSize,Position);
      if aPattern[Position-SuffixLengthValue] <> aPattern[aPatternSize-1 - SuffixLengthValue] then begin
        DeltaJumpTable2[aPatternSize - 1 - SuffixLengthValue] := aPatternSize - 1 - Position + SuffixLengthValue;
      end;
      Inc(Position);
    end;
  end;

  //Resizes the allocated space for replacement index
  procedure ResizeAllocatedMatches;
  begin
    MatchesAllocatedLimit:=MatchesCount+MATCHESCOUNTRESIZER;
    SetLength(aMatches,MatchesAllocatedLimit);
  end;

  //Add a match to be replaced
  procedure AddMatch(const aPosition: SizeInt); inline;
  begin
    if MatchesCount = MatchesAllocatedLimit then begin
      ResizeAllocatedMatches;
    end;
    aMatches[MatchesCount]:=aPosition;
    inc(MatchesCount);
  end;
var
  i,j: SizeInt;
  DeltaJumpTable1: array [0..ALPHABET_LENGHT-1] of SizeInt;
  DeltaJumpTable2: SizeIntArray;
  //Pointer to lowered OldPattern
  plPattern: PChar;
begin
  MatchesCount:=0;
  MatchesAllocatedLimit:=0;
  SetLength(aMatches,MatchesCount);
  if OldPatternSize=0 then begin
    Exit;
  end;

  //Build an internal array of lowercase version of every possible char.
  for j := 0 to Pred(ALPHABET_LENGHT) do begin
    lCaseArray[j]:=AnsiLowerCase(char(j))[1];
  end;

  //Create the new lowercased pattern
  SetLength(lPattern,OldPatternSize);
  for j := 0 to Pred(OldPatternSize) do begin
    lPattern[j+1]:=lCaseArray[ord(OldPattern[j])];
  end;

  SetLength(DeltaJumpTable2,OldPatternSize);

  MakeDeltaJumpTable1(DeltaJumpTable1,@lPattern[1],OldPatternSize);
  MakeDeltaJumpTable2(DeltaJumpTable2,@lPattern[1],OldPatternSize);

  plPattern:=@lPattern[1];
  i:=OldPatternSize-1;
  while i < SSize do begin
    j:=OldPatternSize-1;
    while (j>=0) and (lCaseArray[Ord(S[i])] = plPattern[j]) do begin
      dec(i);
      dec(j);
    end;
    if (j<0) then begin
      AddMatch(i+1);
      //Only first match ?
      if not aMatchAll then break;
      inc(i,OldPatternSize);
      inc(i,OldPatternSize);
    end else begin
      i:=i + Max(DeltaJumpTable1[Ord(lCaseArray[Ord(s[i])])],DeltaJumpTable2[j]);
    end;
  end;
  SetLength(aMatches,MatchesCount);
end;

end.

ustringreplace_2.pas (19,855 bytes)

silvioprog

2015-02-18 17:44

reporter   ~0081218

Last edited: 2015-02-18 17:56

View 2 revisions

Good! =)

Now the result is (even ustringreplace_2 as UTF-8 encoding):

StringReplace: I love free pascal - 00:00:00.124
StringReplaceX: I love free pascal - 00:00:00.065
StringReplaceX: I love free pascal - 00:00:00.065
StringReplace: I love free pascal - 00:00:00.123


Plans to add it in SysUtils? This function is awesome.

Michael Van Canneyt

2016-01-14 08:59

administrator   ~0088840

I added the routine(s) to strutils, not to sysutils.
Reason is that all algorithms have their pros and cons, so I prefer to add all of them. But adding all this to sysutils will only blow up its size.

So I added all algorithms to strutils instead.
I added a parameter 'Algorithm' to be able to select the mechanism.
The default is the current implementation, for backwards compatibility.

The routines for Boyer-Moore I have exposed, and added a plain string version.
Please test and close if OK.

Jarto Tarpio

2016-01-31 19:17

reporter   ~0089632

This bug was originally submitted because the default StringReplace was extremely slow. It's a real disappointment, that the old version was not replaced and is in fact used as default.

Michael Van Canneyt

2016-01-31 22:58

administrator   ~0089635

I am aware that the original is slower than yours.
Yours has also drawbacks.
Now people can choose whatever they feel is best.

Jarto Tarpio

2016-02-01 20:42

reporter   ~0089660

As a matter of fact, there are at least four new implementations here:

- my original version (basically two pass)
- Dmitry's version
- sraManySmall
- sraBoyerMoore

Only the last two were merged. But that's really not the reason why I'm complaining. Michael, if you could tell what the drawbacks are in each of these compared to the original one, we could maybe make one new version, that would be fast _and_ 100% backwards compatible.

I'm not arguing, that it's a bad idea to have the freedom to choose the algorithm, but the default one should be at least decent. For example, in most ported code from Delphi no-one will notice the optional argument and use it.

user268

2016-02-04 23:49

  ~0089775

Last edited: 2016-02-05 00:06

View 10 revisions

This is not 2 years old issue, but from beginning, started with Turbo Pascal and continued with Delphi. Who ever made this routine at beginning should get fired immediately from Borland... Sadly, you have just copied that...

Basically, here is made few bad approaches.

1. Any other algorithm is better instead childish concating strings by s:=s+t.
2. Get real test cases. No one need to replace 1 mil. of 'a' to 'b' or similar. Inseted:
   - Usually is replaced short (less than 256) filenames: space to underscore. Here speed of algorithm is irrelevant.
   - Replacing CRLF to LF or vice-versa
   - Replacing specific tags in text with real value - formulars, reports and similar.
   - Replacing specific entities in HTML page when display (as &amp and similar)
   - Replacing some words including sensitive or case insensitive search.

That means, text is up to some 100K the most and any non trivial optimized algorithm is sufficient.

If it is required speed in any situation, several algorithms must exist. For instance, if there is one or few words found in text of 100K in the middle or end, simple position keeping due first pass (in order to determinate exact resulting string), replacing is faster than searching it again in second pass.

If there is many strings to replace, creating list (or using primitive version of Boyer-Moore algorithm) is not optimal. etc.

My version of StringReplace I have made 15 years ago if faster 25% in real situations than any other here posted, however that is irrelevant as in reality text is very short and difference of few or few dozen ms is a joke.

I'm quite suprised this issue required any discussion!
Simple implementation of any correctly optimized algorithm is far better than original.

user268

2016-02-06 08:32

  ~0089802

Last edited: 2016-02-06 09:12

View 12 revisions

The only real problem here is how to handle utf-8 case insensitive search. As is known, utf-8 characters have variable length between 1-4 bytes.

Both possible approaches:

1. Direct conversion of each utf-8 character in code
2. Initial conversion to utf-16 and use utf-16 version of the function.

Either version could be equally bad decision, depending on how many utf-8 characters which length exceed 1b exists. It is possible to count how many these characters exist, however text itself does not have to contain printable characters at all...

If start with original assumption that the initial text is relatively short, there is no need to complicate anything more than necessary and for utf-8 case insensitive version make initial conversion and use utf-16 version of the function.

It is unclear however does FPC have or distinguish ANSI and utf-8 string types in order to have several overloaded versions of the function (ANSI, utf-8 and utf-16).

Lowering utf-8 main text and searching string and using ANSI version of the function is as well possible solution, however this require additional info to the main function in order not to get lower string as result. As well it may produce additional complication if some lower and upper utf-8 character does not have the same length...

That is all about the issue.

Michael Van Canneyt

2016-02-06 11:26

administrator   ~0089805

In revision 33055, the routine by Jarto Tarpio was implemented in the sysutils unit.

There were 2 problems with his original routine:

1. It uses PChar. This is ansistring only.
   StringReplace also needs to work with UnicodeString.
   I had to modify the routine for this.

2. It used PosEx, which was only in strutils. So it was unusable.
   Luckily, meanwhile the regular Pos() has been made Delphi XE 10 compatible
   so it also accepts a start pos indicator.

So now the default sysutils.stringreplace is faster, works correctly for all types of strings (I made some test routines).
The one in strutils remains in place so people can optimize when needed.

Maybe an overloaded version with single chars for new/old pattern can be implemented since that is a common usecase which can be implemented faster.
But that is for a different bug report.

Issue History

Date Modified Username Field Change
2014-10-15 17:52 Jarto Tarpio New Issue
2014-10-15 17:52 Jarto Tarpio File Added: strreplace.txt
2014-10-15 21:03 Dmitry Boyarintsev File Added: compareString.diff
2014-10-15 21:05 Dmitry Boyarintsev Note Added: 0078255
2014-10-15 21:06 Dmitry Boyarintsev Note Edited: 0078255 View Revisions
2014-10-15 21:06 Dmitry Boyarintsev Note Edited: 0078255 View Revisions
2014-10-16 13:50 Michael Van Canneyt Assigned To => Michael Van Canneyt
2014-10-16 13:50 Michael Van Canneyt Status new => assigned
2014-10-16 17:52 Jarto Tarpio Note Added: 0078278
2014-10-16 18:27 Dmitry Boyarintsev File Deleted: compareString.diff
2014-10-16 18:27 Dmitry Boyarintsev File Added: compareString.diff
2014-10-16 18:29 Dmitry Boyarintsev File Deleted: compareString.diff
2014-10-16 18:29 Dmitry Boyarintsev File Added: compareString.diff
2014-10-16 20:42 Dmitry Boyarintsev File Deleted: compareString.diff
2014-10-16 20:42 Dmitry Boyarintsev File Added: compareString.diff
2014-10-16 20:48 Dmitry Boyarintsev Note Added: 0078285
2014-10-18 19:01 Jarto Tarpio Note Added: 0078348
2014-10-20 04:10 Dmitry Boyarintsev File Deleted: compareString.diff
2014-10-20 04:10 Dmitry Boyarintsev File Added: compareString.diff
2014-10-20 04:11 Dmitry Boyarintsev Note Added: 0078407
2014-10-20 16:16 Jarto Tarpio Note Added: 0078422
2014-10-24 14:22 José Mejuto Note Added: 0078586
2014-10-24 14:23 José Mejuto File Added: ustringreplace.pas
2015-02-18 06:56 silvioprog Note Added: 0081203
2015-02-18 07:02 silvioprog Note Added: 0081204
2015-02-18 07:02 silvioprog Note Edited: 0081204 View Revisions
2015-02-18 07:05 silvioprog Note Edited: 0081204 View Revisions
2015-02-18 07:20 silvioprog Note Added: 0081206
2015-02-18 14:03 José Mejuto Note Added: 0081212
2015-02-18 14:04 José Mejuto File Added: ustringreplace_2.pas
2015-02-18 17:44 silvioprog Note Added: 0081218
2015-02-18 17:56 silvioprog Note Edited: 0081218 View Revisions
2016-01-14 08:59 Michael Van Canneyt Fixed in Revision => 32939
2016-01-14 08:59 Michael Van Canneyt Note Added: 0088840
2016-01-14 08:59 Michael Van Canneyt Status assigned => resolved
2016-01-14 08:59 Michael Van Canneyt Fixed in Version => 3.1.1
2016-01-14 08:59 Michael Van Canneyt Resolution open => fixed
2016-01-14 08:59 Michael Van Canneyt Target Version => 3.0.2
2016-01-31 19:17 Jarto Tarpio Note Added: 0089632
2016-01-31 19:17 Jarto Tarpio Status resolved => feedback
2016-01-31 19:17 Jarto Tarpio Resolution fixed => reopened
2016-01-31 22:58 Michael Van Canneyt Note Added: 0089635
2016-01-31 22:58 Michael Van Canneyt Status feedback => resolved
2016-01-31 22:58 Michael Van Canneyt Resolution reopened => fixed
2016-02-01 20:42 Jarto Tarpio Note Added: 0089660
2016-02-01 20:42 Jarto Tarpio Status resolved => feedback
2016-02-01 20:42 Jarto Tarpio Resolution fixed => reopened
2016-02-04 23:49 user268 Note Added: 0089775
2016-02-04 23:51 user268 Note Edited: 0089775 View Revisions
2016-02-04 23:51 user268 Note Edited: 0089775 View Revisions
2016-02-04 23:52 user268 Note Edited: 0089775 View Revisions
2016-02-04 23:55 user268 Note Edited: 0089775 View Revisions
2016-02-04 23:59 user268 Note Edited: 0089775 View Revisions
2016-02-05 00:00 user268 Note Edited: 0089775 View Revisions
2016-02-05 00:03 user268 Note Edited: 0089775 View Revisions
2016-02-05 00:04 user268 Note Edited: 0089775 View Revisions
2016-02-05 00:06 user268 Note Edited: 0089775 View Revisions
2016-02-06 08:32 user268 Note Added: 0089802
2016-02-06 08:33 user268 Note Edited: 0089802 View Revisions
2016-02-06 08:35 user268 Note Edited: 0089802 View Revisions
2016-02-06 08:38 user268 Note Edited: 0089802 View Revisions
2016-02-06 08:41 user268 Note Edited: 0089802 View Revisions
2016-02-06 08:57 user268 Note Edited: 0089802 View Revisions
2016-02-06 08:59 user268 Note Edited: 0089802 View Revisions
2016-02-06 09:09 user268 Note Edited: 0089802 View Revisions
2016-02-06 09:09 user268 Note Edited: 0089802 View Revisions
2016-02-06 09:10 user268 Note Edited: 0089802 View Revisions
2016-02-06 09:11 user268 Note Edited: 0089802 View Revisions
2016-02-06 09:12 user268 Note Edited: 0089802 View Revisions
2016-02-06 11:26 Michael Van Canneyt Fixed in Revision 32939 => 33055
2016-02-06 11:26 Michael Van Canneyt Note Added: 0089805
2016-02-06 11:26 Michael Van Canneyt Status feedback => resolved
2016-02-06 11:26 Michael Van Canneyt Resolution reopened => fixed