View Issue Details

IDProjectCategoryView StatusLast Update
0037580FPCCompilerpublic2020-09-16 00:05
ReporterJ. Gareth Moreton Assigned To 
PrioritynormalSeverityfeatureReproducibilityN/A
Status newResolutionopen 
Platformaarch64OSLinux (Raspberry Pi OS) 
Product Version3.3.1 
Summary0037580: [Patch] LDR/STR pairing optimisation for AArch64
DescriptionThis patch looks at groups of LDR and STR instructions and attempts to pair them into LDP and STP instructions if they read from contiguous memory, while making sure it doesn't make combinations that are undefined (e.g. an LDP instruction where both destination registers are the same) or incorrect (the address register is modified in between).
Steps To ReproduceApply patch and confirm correct compilation and improved optimisation.
Additional InformationThis optimisation is run in pass 2 so future optimisations that affect LDR and STR instructions on an individual basis are more efficient. Additionally, I cannot make too much progress just yet on pass 1 optimisations until 0037526 is approved or rejected, so I know what to branch off of.

Test suite has been run under aarch64-linux and shows no regressions.
Tagsaarch64, optimization, patch
Fixed in Revision
FPCOldBugId
FPCTarget-
Attached Files

Activities

J. Gareth Moreton

2020-08-20 08:47

developer   ~0125004

Fixed a bug that caused the optimisation to malfunction on -O3 and -O4.

J. Gareth Moreton

2020-08-20 13:51

developer   ~0125014

Another bug fix. I forgot to check the index registers. A lot of these bugs don't show up under -O2 or even through running the test suite- requires additional options or some deep thought!
load-store-pair.patch (8,907 bytes)   
Index: compiler/aarch64/aoptcpu.pas
===================================================================
--- compiler/aarch64/aoptcpu.pas	(revision 46481)
+++ compiler/aarch64/aoptcpu.pas	(working copy)
@@ -39,6 +39,7 @@
       TCpuAsmOptimizer = class(TARMAsmOptimizer)
         { uses the same constructor as TAopObj }
         function PeepHoleOptPass1Cpu(var p: tai): boolean; override;
+        function PeepHoleOptPass2Cpu(var p: tai): boolean; override;
         function PostPeepHoleOptsCpu(var p: tai): boolean; override;
         function RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;override;
         function InstructionLoadsFromReg(const reg: TRegister; const hp: tai): boolean;override;
@@ -51,6 +52,8 @@
         function OptPass1STP(var p: tai): boolean;
         function OptPass1Mov(var p: tai): boolean;
         function OptPass1FMov(var p: tai): Boolean;
+
+        function OptPass2LDRSTR(var p: tai): boolean;
       End;
 
 Implementation
@@ -526,6 +529,163 @@
     end;
 
 
+  function TCpuAsmOptimizer.OptPass2LDRSTR(var p: tai): boolean;
+    var
+      hp1, hp1_last: tai;
+      ThisRegister: TRegister;
+      OffsetVal, ValidOffset, MinOffset, MaxOffset: asizeint;
+      TargetOpcode: TAsmOp;
+      Breakout: Boolean;
+    begin
+      Result := False;
+      ThisRegister := taicpu(p).oper[0]^.reg;
+
+      case taicpu(p).opcode of
+        A_LDR:
+          TargetOpcode := A_LDP;
+        A_STR:
+          TargetOpcode := A_STP;
+        else
+          InternalError(2020081501);
+      end;
+
+      { reg appearing in ref invalidates these optimisations }
+      if (TargetOpcode = A_STP) or not RegInRef(ThisRegister, taicpu(p).oper[1]^.ref^) then
+        begin
+          { LDP/STP has a smaller permitted offset range than LDR/STR.
+
+            TODO: For a group of out-of-range LDR/STR instructions, can
+            we declare a temporary register equal to the offset base
+            address, modify the STR instructions to use that register
+            and then convert them to STP instructions?  Note that STR
+            generally takes 2 cycles (on top of the memory latency),
+            while LDP/STP takes 3.
+          }
+
+          if (getsubreg(ThisRegister) = R_SUBQ) then
+            begin
+              ValidOffset := 8;
+              MinOffset := -512;
+              MaxOffset := 504;
+            end
+          else
+            begin
+              ValidOffset := 4;
+              MinOffset := -256;
+              MaxOffset := 252;
+            end;
+
+          hp1_last := p;
+
+          { Look for nearby LDR/STR instructions }
+          if (taicpu(p).oppostfix = PF_NONE) and
+            (taicpu(p).oper[1]^.ref^.addressmode = AM_OFFSET) then
+            { If SkipGetNext is True, GextNextInstruction isn't called }
+            while GetNextInstruction(hp1_last, hp1) do
+              begin
+                if (hp1.typ <> ait_instruction) then
+                  Break;
+
+                if (taicpu(hp1).opcode = taicpu(p).opcode) then
+                  begin
+                    Breakout := False;
+
+                    if (taicpu(hp1).oppostfix = PF_NONE) and
+                      { Registers need to be the same size }
+                      (getsubreg(ThisRegister) = getsubreg(taicpu(hp1).oper[0]^.reg)) and
+                      (
+                        (TargetOpcode = A_STP) or
+                        { LDP x0, x0, [sp, #imm] is undefined behaviour, even
+                          though such an LDR pair should have been optimised
+                          out by now. STP is okay }
+                        (ThisRegister <> taicpu(hp1).oper[0]^.reg)
+                      ) and
+                      (taicpu(hp1).oper[1]^.ref^.addressmode = AM_OFFSET) and
+                      (taicpu(p).oper[1]^.ref^.base = taicpu(hp1).oper[1]^.ref^.base) and
+                      (taicpu(p).oper[1]^.ref^.index = taicpu(hp1).oper[1]^.ref^.index) and
+                      { Make sure the address registers haven't changed }
+                      not RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.base, p, hp1) and
+                      (
+                        (taicpu(hp1).oper[1]^.ref^.index = NR_NO) or
+                        not RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.index, p, hp1)
+                      ) and
+                      { Don't need to check "RegInRef" because the base registers are identical,
+                        and the first one was checked already. [Kit] }
+                      not RegModifiedBetween(taicpu(hp1).oper[0]^.reg, p, hp1) then
+                      begin
+                        { Can we convert these two LDR/STR instructions into a
+                          single LDR/STP? }
+
+                        OffsetVal := taicpu(hp1).oper[1]^.ref^.offset - taicpu(p).oper[1]^.ref^.offset;
+                        if (OffsetVal = ValidOffset) then
+                          begin
+                            if  (taicpu(p).oper[1]^.ref^.offset >= MinOffset) and (taicpu(hp1).oper[1]^.ref^.offset <= MaxOffset) then
+                              begin
+                                { Convert:
+                                    LDR/STR reg0, [reg2, #ofs]
+                                    ...
+                                    LDR/STR reg1. [reg2, #ofs + 8] // 4 if registers are 32-bit
+                                  To:
+                                    LDP/STP reg0, reg1, [reg2, #ofs]
+                                }
+                                taicpu(p).opcode := TargetOpcode;
+                                if TargetOpcode = A_STP then
+                                  DebugMsg('Peephole Optimization: StrStr2Stp', p)
+                                else
+                                  DebugMsg('Peephole Optimization: LdrLdr2Ldp', p);
+                                taicpu(p).ops := 3;
+                                taicpu(p).loadref(2, taicpu(p).oper[1]^.ref^);
+                                taicpu(p).loadreg(1, taicpu(hp1).oper[0]^.reg);
+
+                                asml.Remove(hp1);
+                                hp1.Free;
+                                Result := True;
+                                Exit;
+                              end;
+                          end
+                        else if (OffsetVal = -ValidOffset) then
+                          begin
+                            if (taicpu(hp1).oper[1]^.ref^.offset >= MinOffset) and (taicpu(p).oper[1]^.ref^.offset <= MaxOffset) then
+                              begin
+                                { Convert:
+                                    LDR/STR reg0, [reg2, #ofs + 8] // 4 if registers are 32-bit
+                                    ...
+                                    LDR/STR reg1. [reg2, #ofs]
+                                  To:
+                                    LDP/STP reg1, reg0, [reg2, #ofs]
+                                }
+                                taicpu(p).opcode := TargetOpcode;
+                                if TargetOpcode = A_STP then
+                                  DebugMsg('Peephole Optimization: StrStr2Stp (reverse)', p)
+                                else
+                                  DebugMsg('Peephole Optimization: LdrLdr2Ldp (reverse)', p);
+                                taicpu(p).ops := 3;
+                                taicpu(p).loadref(2, taicpu(hp1).oper[1]^.ref^);
+                                taicpu(p).loadreg(1, taicpu(p).oper[0]^.reg);
+                                taicpu(p).loadreg(0, taicpu(hp1).oper[0]^.reg);
+
+                                asml.Remove(hp1);
+                                hp1.Free;
+                                Result := True;
+                                Exit;
+                              end;
+                          end;
+                      end;
+                  end
+                else
+                  Break;
+
+                { Don't continue looking for LDR/STR pairs if the address register
+                  gets modified }
+                if RegModifiedByInstruction(taicpu(p).oper[1]^.ref^.base, hp1) then
+                  Break;
+
+                hp1_last := hp1;
+              end;
+        end;
+    end;
+
+
   function TCpuAsmOptimizer.OptPostCMP(var p : tai): boolean;
     var
      hp1,hp2: tai;
@@ -626,6 +786,24 @@
     end;
 
 
+  function TCpuAsmOptimizer.PeepHoleOptPass2Cpu(var p: tai): boolean;
+    var
+      hp1: tai;
+    begin
+      result := false;
+      if p.typ=ait_instruction then
+        begin
+          case taicpu(p).opcode of
+            A_LDR,
+            A_STR:
+              Result:=OptPass2LDRSTR(p);
+            else
+              ;
+          end;
+        end;
+    end;
+
+
   function TCpuAsmOptimizer.PostPeepHoleOptsCpu(var p: tai): boolean;
     begin
       result := false;
load-store-pair.patch (8,907 bytes)   

J. Gareth Moreton

2020-09-16 00:05

developer   ~0125563

Any updates/opinions on this?

Issue History

Date Modified Username Field Change
2020-08-16 04:47 J. Gareth Moreton New Issue
2020-08-16 04:47 J. Gareth Moreton File Added: load-store-pair.patch
2020-08-16 04:48 J. Gareth Moreton Tag Attached: patch
2020-08-16 04:48 J. Gareth Moreton Tag Attached: aarch64
2020-08-16 04:48 J. Gareth Moreton Tag Attached: optimization
2020-08-16 04:49 J. Gareth Moreton Severity minor => feature
2020-08-16 04:49 J. Gareth Moreton Additional Information Updated View Revisions
2020-08-16 04:49 J. Gareth Moreton FPCTarget => -
2020-08-20 08:46 J. Gareth Moreton File Deleted: load-store-pair.patch
2020-08-20 08:47 J. Gareth Moreton Note Added: 0125004
2020-08-20 08:47 J. Gareth Moreton File Added: load-store-pair.patch
2020-08-20 13:50 J. Gareth Moreton File Deleted: load-store-pair.patch
2020-08-20 13:51 J. Gareth Moreton Note Added: 0125014
2020-08-20 13:51 J. Gareth Moreton File Added: load-store-pair.patch
2020-09-16 00:05 J. Gareth Moreton Note Added: 0125563