View Issue Details

ID: 0036095
Project: FPC
Category: Compiler
View Status: public
Last Update: 2019-10-12 16:42
Reporter: Christo Crause
Assigned To: Florian
Priority: normal
Severity: minor
Reproducibility: N/A
Status: closed
Resolution: fixed
Product Version: 3.3.1
Product Build:
Target Version:
Fixed in Version: 3.3.1
Summary: 0036095: AVR [patch] Optimizing code generation for shift with compile time constant
Description: Please find attached a patch that optimizes code generation for expressions involving a shift by a compile-time constant. A discussion of the trade-off between a bit-shift loop (the existing general algorithm), loop unrolling, and byte moving, in terms of speed and code size as implemented in the patch, can be found here: https://github.com/ccrause/freepascal/wiki/Optimizing-code-generation-for-shift-with-compile-time-constant
Tags: AVR
Fixed in Revision: 43136, 43169
FPCOldBugId:
FPCTarget: -
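
For illustration only (an editorial sketch, not part of the original report): the Pascal fragment below shows the kind of constant-shift expression the patch targets, and its comments outline, assuming a 32-bit operand on an AVR target, the code shape the patch aims for instead of the generic shift loop described above.

  program shiftsketch;
  // Editorial sketch (hypothetical example, not part of the attached patch):
  // a constant shift on a 32-bit operand, the case the patch optimizes.
  var
    x, y: longword;
  begin
    x := $12345678;
    y := x shr 10;   // 10 bits = 1 whole byte + 2 residual bits
    // Intended code shape on AVR, per the patch:
    //   copy source bytes 1..3 down into destination bytes 0..2,
    //   run two LSR/ROR passes over the remaining three bytes,
    //   clear the top destination byte from the zero register (R1),
    // instead of a generic 10-iteration shift loop over all four bytes.
    writeln(y);
  end.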
Attached Files
  • shiftbyconst.patch (5,078 bytes)
    diff --git compiler/avr/cgcpu.pas compiler/avr/cgcpu.pas
    index 661c300a9d..a379068a67 100644
    --- compiler/avr/cgcpu.pas
    +++ compiler/avr/cgcpu.pas
    @@ -438,6 +438,11 @@ unit cgcpu;
     
     
          procedure tcgavr.a_op_const_reg_reg(list: TAsmList; op: TOpCg; size: tcgsize; a: tcgint; src, dst: tregister);
    +       var
    +         tmpSrc, tmpDst, countreg: TRegister;
    +         b, b2, i, j: byte;
    +         s1, s2, t1: integer;
    +         l1: TAsmLabel;
            begin
              if (op in [OP_MUL,OP_IMUL]) and (size in [OS_16,OS_S16]) and (a in [2,4,8]) then
                begin
    @@ -451,6 +456,98 @@ unit cgcpu;
                      a:=a shr 1;
                    end;
                end
    +         else if (op in [OP_SHL, OP_SHR]) and (a > 0)  then // a = 0 get eliminated later by tcg.optimize_op_const
    +           begin
    +             b := a div 8;  // number of bytes to shift
    +
    +             // Ensure that b is never larger than base type
    +             if b > tcgsize2size[size] then
    +               begin
    +                 b := tcgsize2size[size];
    +                 b2 := 0;
    +               end
    +             else
    +               b2 := a mod 8;
    +
    +             if b < tcgsize2size[size] then
    +               // copy from src to dst accounting for shift offset
    +               for i := 0 to (tcgsize2size[size]-b-1) do
    +                 if op = OP_SHL then
    +                   a_load_reg_reg(list, OS_8, OS_8,
    +                     GetOffsetReg64(src, NR_NO, i),
    +                     GetOffsetReg64(dst, NR_NO, i+b))
    +                 else
    +                   a_load_reg_reg(list, OS_8, OS_8,
    +                     GetOffsetReg64(src, NR_NO, i+b),
    +                     GetOffsetReg64(dst, NR_NO, i));
    +
    +             // remaining bit shifts
    +             if b2 > 0 then
    +               begin
    +                 // Cost of loop
    +                 s1 := 3 + tcgsize2size[size] - b;
    +                 t1 := b2*(tcgsize2size[size] - b + 3);
    +                 //Cost of loop unrolling, t2 = s2
    +                 s2 := b2*(tcgsize2size[size] - b);
    +
    +                 if ((cs_opt_size in current_settings.optimizerswitches) and (s1 < s2)) or
    +                    (((s2 - s1) - t1/s2) > 0) then
    +                   begin
    +                     // Shift non-moved bytes in loop
    +                     current_asmdata.getjumplabel(l1);
    +                     countreg:=getintregister(list,OS_8);
    +                     a_load_const_reg(list,OS_8,b2,countreg);
    +                     cg.a_label(list,l1);
    +                     if op = OP_SHL then
    +                       list.concat(taicpu.op_reg(A_LSL,GetOffsetReg64(dst, NR_NO, b)))
    +                     else
    +                       list.concat(taicpu.op_reg(A_LSR,GetOffsetReg64(dst, NR_NO, tcgsize2size[size]-1-b)));
    +
    +                     if size in [OS_S16,OS_16,OS_S32,OS_32,OS_S64,OS_64] then
    +                       begin
    +                         for i:=2+b to tcgsize2size[size] do
    +                           if op = OP_SHL then
    +                             list.concat(taicpu.op_reg(A_ROL,GetOffsetReg64(dst,NR_NO,i-1)))
    +                           else
    +                             list.concat(taicpu.op_reg(A_ROR,GetOffsetReg64(dst, NR_NO,tcgsize2size[size]-i-b)));
    +                       end;
    +                     list.concat(taicpu.op_reg(A_DEC,countreg));
    +                     a_jmp_flags(list,F_NE,l1);
    +                     // keep registers alive
    +                     list.concat(taicpu.op_reg_reg(A_MOV,countreg,countreg));
    +                   end
    +                 else
    +                   begin
    +                     // Unroll shift loop over non-moved bytes
    +                     for j := 1 to b2 do
    +                     begin
    +                       if op = OP_SHL then
    +                         list.concat(taicpu.op_reg(A_LSL,
    +                         GetOffsetReg64(dst, NR_NO, b)))
    +                       else
    +                         list.concat(taicpu.op_reg(A_LSR,
    +                           GetOffsetReg64(dst, NR_NO, tcgsize2size[size]-b-1)));
    +
    +                       if not(size in [OS_8, OS_S8]) then
    +                         for i := 2 to tcgsize2size[size]-b do
    +                           if op = OP_SHL then
    +                             list.concat(taicpu.op_reg(A_ROL,
    +                               GetOffsetReg64(dst, NR_NO, b+i-1)))
    +                           else
    +                             list.concat(taicpu.op_reg(A_ROR,
    +                               GetOffsetReg64(dst, NR_NO, tcgsize2size[size]-b-i)));
    +                     end;
    +                   end;
    +               end;
    +
    +               // fill skipped destination registers with 0
    +               // Do last, then optimizer can optimize register moves
    +               for i := 1 to b do
    +                 if op = OP_SHL then
    +                   emit_mov(list, GetOffsetReg64(dst, NR_NO, i-1), NR_R1)
    +                 else
    +                   emit_mov(list, GetOffsetReg64(dst, NR_NO, tcgsize2size[size]-i), NR_R1);
    +           end
              else
                inherited a_op_const_reg_reg(list,op,size,a,src,dst);
            end;
    
    shiftbyconst.patch (5,078 bytes)
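
Editorial note on the heuristic in the patch above (a worked sketch, not from the report): s1 and t1 estimate the code size and cycle count of the residual bit-shift loop, while s2 estimates both size and time of the unrolled version. The small program below mirrors the patch's variable names but is otherwise hypothetical; it evaluates the formulas for a 32-bit shift by 10.

  program costsketch;
  // Editorial sketch of the size/speed heuristic used in the patch above.
  // The variable names mirror the patch; the program itself is hypothetical.
  var
    opsize, b, b2: integer;   // operand size in bytes, whole-byte shift, residual bit shift
    s1, t1, s2: integer;      // loop size, loop time, unrolled size (equal to unrolled time)
  begin
    opsize := 4;               // 32-bit operand
    b := 10 div 8;             // shift by 10 bits: move 1 whole byte ...
    b2 := 10 mod 8;            // ... plus 2 residual single-bit shifts
    s1 := 3 + opsize - b;      // instructions in the residual shift loop
    t1 := b2*(opsize - b + 3); // cycles spent in that loop
    s2 := b2*(opsize - b);     // instructions (and cycles) when unrolled
    writeln('s1=', s1, ' t1=', t1, ' s2=', s2);   // prints s1=6 t1=12 s2=6
    // (s2 - s1) - t1/s2 = 0 - 2 = -2, which is not > 0, and s1 < s2 is false,
    // so for this case the patch emits the unrolled sequence.
  end.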

Activities

Christo Crause

2019-09-20 17:23

reporter  

shiftbyconst.patch (5,078 bytes)

Florian

2019-10-05 22:50

administrator   ~0118359

Thanks, committed in r43136 with the formatting adapted to the surrounding code. What I noticed is that tests/test/cg/tmoddiv3 now fails. As mod/div depend on shifting, this might indicate a problem?

Christo Crause

2019-10-06 16:29

reporter   ~0118380

My apologies, Florian. There is indeed a typo in the patch I provided, which leads to the failed test. On line 516, b should not be subtracted from the register offset. Please refer to the patch below:

diff --git a/compiler/avr/cgcpu.pas b/compiler/avr/cgcpu.pas
index dfd3eeecf6..731386be0b 100644
--- a/compiler/avr/cgcpu.pas
+++ b/compiler/avr/cgcpu.pas
@@ -513,7 +513,7 @@ unit cgcpu;
                            if op=OP_SHL then
                              list.concat(taicpu.op_reg(A_ROL,GetOffsetReg64(dst,NR_NO,i-1)))
                            else
-                             list.concat(taicpu.op_reg(A_ROR,GetOffsetReg64(dst,NR_NO,tcgsize2size[size]-i-b)));
+                             list.concat(taicpu.op_reg(A_ROR,GetOffsetReg64(dst,NR_NO,tcgsize2size[size]-i)));
                        end;
                      list.concat(taicpu.op_reg(A_DEC,countreg));
                      a_jmp_flags(list,F_NE,l1);
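
To make the off-by-b concrete (an editorial worked example, not part of the note): for a 32-bit right shift by 10 (b = 1, b2 = 2) the loop body should apply LSR to byte 2 and then ROR to bytes 1 and 0; with the extra "- b" the ROR indices came out as 0 and -1. A small hypothetical Pascal sketch of the index arithmetic:

  program indexsketch;
  // Editorial sketch of the register-offset arithmetic fixed above
  // (hypothetical program; only the index formulas come from the patch).
  var
    opsize, b, i: integer;
  begin
    opsize := 4;   // 32-bit operand
    b := 1;        // one whole byte already moved (shift by 10)
    writeln('LSR byte ', opsize - 1 - b);      // byte 2
    for i := 2 + b to opsize do
      writeln('ROR byte ', opsize - i);        // bytes 1 and 0 (corrected)
      // with the old "opsize - i - b" these would have been bytes 0 and -1
  end.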

Florian

2019-10-12 13:52

administrator   ~0118511

Thanks, applied in r43169.

Christo Crause

2019-10-12 16:42

reporter   ~0118517

Thank you Florian.

Issue History

Date Modified Username Field Change
2019-09-20 17:23 Christo Crause New Issue
2019-09-20 17:23 Christo Crause File Added: shiftbyconst.patch
2019-09-20 17:24 Christo Crause Tag Attached: AVR
2019-10-05 22:50 Florian Assigned To => Florian
2019-10-05 22:50 Florian Status new => resolved
2019-10-05 22:50 Florian Resolution open => fixed
2019-10-05 22:50 Florian Fixed in Version => 3.3.1
2019-10-05 22:50 Florian Fixed in Revision => 43136
2019-10-05 22:50 Florian FPCTarget => -
2019-10-05 22:50 Florian Note Added: 0118359
2019-10-06 16:29 Christo Crause Status resolved => feedback
2019-10-06 16:29 Christo Crause Resolution fixed => reopened
2019-10-06 16:29 Christo Crause Note Added: 0118380
2019-10-12 13:52 Florian Status feedback => resolved
2019-10-12 13:52 Florian Resolution reopened => fixed
2019-10-12 13:52 Florian Fixed in Revision 43136 => 43136, 43169
2019-10-12 13:52 Florian Note Added: 0118511
2019-10-12 16:42 Christo Crause Status resolved => closed
2019-10-12 16:42 Christo Crause Note Added: 0118517