View Issue Details

ID: 0027870
Project: FPC
Category: Compiler
View Status: public
Last Update: 2019-05-02 14:21
Reporter: Adriaan van Os
Assigned To: J. Gareth Moreton
Priority: normal
Severity: minor
Reproducibility: always
Status: resolved
Resolution: fixed
Platform: Darwin i386
OS: Mac OS X
OS Version: 10,6
Product Version: 2.6.4
Product Build:
Target Version: 3.1.1
Fixed in Version: 3.1.1
Summary: 0027870: SSE vector-of-single tests fail
Description:
program ssetest;
var v1, v2, v3: array[ 0..3] of single; { global 4-element single-precision arrays: v1, v2 are operands, v3 receives each result }
begin
  writeln;
  v1[ 0] := 0.2; { fill every element of v1 with 0.2 }
  v1[ 1] := 0.2;
  v1[ 2] := 0.2;
  v1[ 3] := 0.2;
  writeln( 'v1[ 0] = ', v1[ 0]); { print v1 so the log shows the operands were stored correctly }
  writeln( 'v1[ 1] = ', v1[ 1]);
  writeln( 'v1[ 2] = ', v1[ 2]);
  writeln( 'v1[ 3] = ', v1[ 3]);
  writeln;
  v2[ 0] := 0.3; { fill every element of v2 with 0.3 }
  v2[ 1] := 0.3;
  v2[ 2] := 0.3;
  v2[ 3] := 0.3;
  writeln( 'v2[ 0] = ', v2[ 0]); { print v2; this is the last output seen in the failing run log }
  writeln( 'v2[ 1] = ', v2[ 1]);
  writeln( 'v2[ 2] = ', v2[ 2]);
  writeln( 'v2[ 3] = ', v2[ 3]);
  writeln;
  v3 := v1 + v2; { whole-array addition (program is built with -Sv); the run log reports runtime error 216 before the next writeln prints }
  writeln( 'v3 := v1 + v2;');
  writeln( 'v3[ 0] = ', v3[ 0]); { print each element of the sum }
  writeln( 'v3[ 1] = ', v3[ 1]);
  writeln( 'v3[ 2] = ', v3[ 2]);
  writeln( 'v3[ 3] = ', v3[ 3]);
  writeln;
  v3 := v1 - v2; { whole-array subtraction }
  writeln( 'v3 := v1 - v2;');
  writeln( 'v3[ 0] = ', v3[ 0]);
  writeln( 'v3[ 1] = ', v3[ 1]);
  writeln( 'v3[ 2] = ', v3[ 2]);
  writeln( 'v3[ 3] = ', v3[ 3]);
  writeln;
  v3 := v1 * v2; { whole-array multiplication }
  writeln( 'v3 := v1 * v2;');
  writeln( 'v3[ 0] = ', v3[ 0]);
  writeln( 'v3[ 1] = ', v3[ 1]);
  writeln( 'v3[ 2] = ', v3[ 2]);
  writeln( 'v3[ 3] = ', v3[ 3]);
  writeln;
  v3 := v1 / v2; { whole-array division }
  writeln( 'v3 := v1 / v2;');
  writeln( 'v3[ 0] = ', v3[ 0]);
  writeln( 'v3[ 1] = ', v3[ 1]);
  writeln( 'v3[ 2] = ', v3[ 2]);
  writeln( 'v3[ 3] = ', v3[ 3]);
  writeln;
end.

adriaan% fpc -Sv -CfSSE2 ssetest.pas
Free Pascal Compiler version 2.6.4 [2014/02/26] for i386
Copyright (c) 1993-2014 by Florian Klaempfl and others
Target OS: Darwin for i386
Compiling ssetest.pas
Assembling (pipe) ssetest.s
Linking ssetest
52 lines compiled, 0.1 sec

adriaan% ./ssetest

v1[ 0] = 2.000000030E-01
v1[ 1] = 2.000000030E-01
v1[ 2] = 2.000000030E-01
v1[ 3] = 2.000000030E-01

v2[ 0] = 3.000000119E-01
v2[ 1] = 3.000000119E-01
v2[ 2] = 3.000000119E-01
v2[ 3] = 3.000000119E-01

Runtime error 216 at $0001106C
  $0001106C
  $0002F746
  $00010CB9
  $00010BE8
  $00000001



Tags: No tags attached.
Fixed in Revision: 38206
FPCOldBugId: 0
FPCTarget:
Attached Files
  • VECTORIZATION.patch (26,949 bytes)
    Index: compiler/cgbase.pas
    ===================================================================
    --- compiler/cgbase.pas	(revision 37727)
    +++ compiler/cgbase.pas	(working copy)
    @@ -165,13 +165,18 @@
              not be loaded in a register directly }
            TCgSize = (OS_NO,
                      { integer registers }
    -                  OS_8,OS_16,OS_32,OS_64,OS_128,OS_S8,OS_S16,OS_S32,OS_S64,OS_S128,
    -                 { single,double,extended,comp,float128 }
    -                  OS_F32,OS_F64,OS_F80,OS_C64,OS_F128,
    +                  OS_8,   OS_16,   OS_32,   OS_64,   OS_128,
    +                  OS_S8,  OS_S16,  OS_S32,  OS_S64,  OS_S128,
    +                 { single, double, extended, comp, float128 }
    +                  OS_F32, OS_F64,  OS_F80,  OS_C64,  OS_F128,
                      { multi-media sizes: split in byte, word, dword, ... }
                      { entities, then the signed counterparts             }
    -                  OS_M8,OS_M16,OS_M32,OS_M64,OS_M128,OS_M256,  
    -                  OS_MS8,OS_MS16,OS_MS32,OS_MS64,OS_MS128,OS_MS256 );  
    +                  OS_M8,  OS_M16,  OS_M32,  OS_M64,  OS_M128,  OS_M256,  OS_M512,
    +                  OS_MS8, OS_MS16, OS_MS32, OS_MS64, OS_MS128, OS_MS256, OS_MS512,
    +                 { multi-media sizes: single-precision floating-point }
    +                  OS_MF32, OS_MF128, OS_MF256, OS_MF512,
    +                 { multi-media sizes: double-precision floating-point }
    +                  OS_MD64, OS_MD128, OS_MD256, OS_MD512);
     
           { Register types }
           TRegisterType = (
    @@ -307,12 +312,20 @@
            NR_INVALID    = tregister($fffffffff);
     
            tcgsize2size : Array[tcgsize] of integer =
    +
    +        (0,
              { integer values }
    -        (0,1,2,4,8,16,1,2,4,8,16,
    +         1,  2,  4,  8, 16,
    +         1,  2,  4,  8, 16,
              { floating point values }
    -         4,8,10,8,16,
    +         4,  8, 10,  8, 16,
              { multimedia values }
    -         1,2,4,8,16,32,1,2,4,8,16,32); 
    +         1,  2,  4,  8, 16, 32, 64,
    +         1,  2,  4,  8, 16, 32, 64,
    +         { single-precision multimedia values }
    +         4, 16, 32, 64,
    +         { double-precision multimedia values }
    +         8, 16, 32, 64);
     
            tfloat2tcgsize: array[tfloattype] of tcgsize =
              (OS_F32,OS_F64,OS_F80,OS_F80,OS_C64,OS_C64,OS_F128);
    @@ -348,17 +361,24 @@
            { Table to convert tcgsize variables to the correspondending
              unsigned types }
            tcgsize2unsigned : array[tcgsize] of tcgsize = (OS_NO,
    -          OS_8,OS_16,OS_32,OS_64,OS_128,OS_8,OS_16,OS_32,OS_64,OS_128,
    -          OS_F32,OS_F64,OS_F80,OS_C64,OS_F128,
    -          OS_M8,OS_M16,OS_M32,OS_M64,OS_M128,OS_M256,OS_M8,OS_M16,OS_M32,
    -          OS_M64,OS_M128,OS_M256);
    +         OS_8,    OS_16,   OS_32,   OS_64,   OS_128,
    +         OS_8,    OS_16,   OS_32,   OS_64,   OS_128,
     
    +         OS_F32,  OS_F64,  OS_F80,  OS_C64,  OS_F128,
    +         OS_M8,   OS_M16,  OS_M32,  OS_M64,  OS_M128, OS_M256, OS_M512,
    +         OS_M8,   OS_M16,  OS_M32,  OS_M64,  OS_M128, OS_M256, OS_M512,
    +         OS_MF32, OS_MF128,OS_MF256,OS_MF512,
    +         OS_MD64, OS_MD128,OS_MD256,OS_MD512);
    +
            tcgsize2signed : array[tcgsize] of tcgsize = (OS_NO,
    -          OS_S8,OS_S16,OS_S32,OS_S64,OS_S128,OS_S8,OS_S16,OS_S32,OS_S64,OS_S128,
    -          OS_F32,OS_F64,OS_F80,OS_C64,OS_F128,
    -          OS_M8,OS_M16,OS_M32,OS_M64,OS_M128,OS_M256,OS_M8,OS_M16,OS_M32,
    -          OS_M64,OS_M128,OS_M256);
    +         OS_S8,   OS_S16,  OS_S32,  OS_S64,  OS_S128,
    +         OS_S8,   OS_S16,  OS_S32,  OS_S64,  OS_S128,
     
    +         OS_F32,  OS_F64,  OS_F80,  OS_C64,  OS_F128,
    +         OS_MS8,  OS_MS16, OS_MS32, OS_MS64, OS_MS128,OS_MS256,OS_MS512,
    +         OS_MS8,  OS_MS16, OS_MS32, OS_MS64, OS_MS128,OS_MS256,OS_MS512,
    +         OS_MF32, OS_MF128,OS_MF256,OS_MF512,
    +         OS_MD64, OS_MD128,OS_MD256,OS_MD512);
     
            tcgloc2str : array[TCGLoc] of string[12] = (
                 'LOC_INVALID',
    @@ -404,6 +424,8 @@
         }
         function int_cgsize(const a: tcgint): tcgsize;{$ifdef USEINLINE}inline;{$endif}
         function int_float_cgsize(const a: tcgint): tcgsize;
    +    function float_array_cgsize(const a: tcgint): tcgsize;{$ifdef USEINLINE}inline;{$endif}
    +    function double_array_cgsize(const a: tcgint): tcgsize;{$ifdef USEINLINE}inline;{$endif}
     
         function tcgsize2str(cgsize: tcgsize):string;
     
    @@ -685,22 +707,53 @@
     
     
         function int_float_cgsize(const a: tcgint): tcgsize;
    -      begin
    -        case a of
    -          4 :
    -            result:=OS_F32;
    -          8 :
    -            result:=OS_F64;
    -          10 :
    -            result:=OS_F80;
    -          16 :
    -            result:=OS_F128;
    -          else
    -            internalerror(200603211);
    -        end;
    +    begin
    +      case a of
    +      4:
    +        result := OS_F32;
    +      8:
    +        result := OS_F64;
    +      10:
    +        result := OS_F80;
    +      16:
    +        result := OS_F128;
    +      else
    +        internalerror(200603211);
           end;
    +    end;
     
    +    function float_array_cgsize(const a: tcgint): tcgsize;{$ifdef USEINLINE}inline;{$endif}
    +    begin
    +      case a of
    +      4:
    +        result := OS_MF32;
    +      16:
    +        result := OS_MF128;
    +      32:
    +        result := OS_MF256;
    +      64:
    +        result := OS_MF512;
    +      else
    +        result := int_cgsize(a);
    +      end;
    +    end;
     
    +    function double_array_cgsize(const a: tcgint): tcgsize;{$ifdef USEINLINE}inline;{$endif}
    +    begin
    +      case a of
    +      8:
    +        result := OS_MD64;
    +      16:
    +        result := OS_MD128;
    +      32:
    +        result := OS_MD256;
    +      64:
    +        result := OS_MD512;
    +      else
    +        result := int_cgsize(a);
    +      end;
    +    end;
    +
         function tcgsize2str(cgsize: tcgsize):string;
           begin
             Str(cgsize, Result);
    Index: compiler/defutil.pas
    ===================================================================
    --- compiler/defutil.pas	(revision 37727)
    +++ compiler/defutil.pas	(working copy)
    @@ -1268,8 +1268,23 @@
               arraydef :
                 begin
                   if is_dynamic_array(def) or not is_special_array(def) then
    -                result := int_cgsize(def.size)
    -              else
    +              begin
    +                if (cs_support_vectors in current_settings.globalswitches) and is_vector(def) and ((TArrayDef(def).elementdef.typ = floatdef) and not (cs_fp_emulation in current_settings.moduleswitches)) then
    +                begin
    +                  { Pick a vector size class from the ELEMENT's float type and the
    +                    array's total size; def itself is the array def, not a float def }
    +                  case TFloatDef(TArrayDef(def).elementdef).floattype of
    +                  s32real:
    +                    result := float_array_cgsize(def.size);
    +                  s64real:
    +                    result := double_array_cgsize(def.size);
    +                  else
    +                    { Other float element types: fall back to the integer size class }
    +                    result := int_cgsize(def.size);
    +                  end;
    +                end else
    +                  result := int_cgsize(def.size);
    +              end else
                     result := OS_NO;
                 end;
               else
    @@ -1309,7 +1324,8 @@
             case def.typ of
               arraydef:
                 begin
    -              if tarraydef(def).elementdef.typ in [orddef,floatdef] then
    +              case tarraydef(def).elementdef.typ of
    +              orddef:
                     begin
                       { this is not correct, OS_MX normally mean that the vector
                         contains elements of size X. However, vectors themselves
    @@ -1322,12 +1338,39 @@
                         8: result:=OS_M64;
                         16: result:=OS_M128;
                         32: result:=OS_M256;
    +                    64: result:=OS_M512;
                         else
                           internalerror(2013060103);
                       end;
    -                end
    +                end;
    +              floatdef:
    +                begin
    +                  case TFloatDef(tarraydef(def).elementdef).floattype of
    +                    s32real:
    +                      case def.size of
    +                      4: result:=OS_MF32;
    +                      16: result:=OS_MF128;
    +                      32: result:=OS_MF256;
    +                      64: result:=OS_MF512;
    +                      else
    +                        internalerror(2017121400);
    +                      end;
    +                    s64real:
    +                      case def.size of
    +                        8: result:=OS_MD64;
    +                        16: result:=OS_MD128;
    +                        32: result:=OS_MD256;
    +                        64: result:=OS_MD512;
    +                        else
    +                          internalerror(2017121401);
    +                      end;
    +                    else
    +                      internalerror(2017121402);
    +                  end;
    +                end;
                   else
                     result:=def_cgsize(def);
    +              end;
                 end
               else
                 result:=def_cgsize(def);
    Index: compiler/i386/cpubase.inc
    ===================================================================
    --- compiler/i386/cpubase.inc	(revision 37727)
    +++ compiler/i386/cpubase.inc	(working copy)
    @@ -35,7 +35,8 @@
             S_NEAR,S_FAR,S_SHORT,
             S_T,
             S_XMM,
    -        S_YMM
    +        S_YMM,
    +        S_ZMM
           );
     
           TOpSizes = set of topsize;
    Index: compiler/i8086/cpubase.inc
    ===================================================================
    --- compiler/i8086/cpubase.inc	(revision 37727)
    +++ compiler/i8086/cpubase.inc	(working copy)
    @@ -35,7 +35,8 @@
             S_NEAR,S_FAR,S_SHORT,
             S_T,
             S_XMM,
    -        S_YMM
    +        S_YMM,
    +        S_ZMM
           );
     
           TOpSizes = set of topsize;
    Index: compiler/ncgld.pas
    ===================================================================
    --- compiler/ncgld.pas	(revision 37727)
    +++ compiler/ncgld.pas	(working copy)
    @@ -682,6 +682,7 @@
     
         procedure tcgassignmentnode.pass_generate_code;
           var
    +         shuffle : pmmshuffle;
              hlabel : tasmlabel;
              href : treference;
              releaseright : boolean;
    @@ -968,22 +969,21 @@
                   LOC_MMREGISTER,
                   LOC_CMMREGISTER:
                     begin
    -                  if left.resultdef.typ=arraydef then
    -                    begin
    -                    end
    +                  if (is_vector(left.resultdef)) then
    +                    shuffle := nil
                       else
    -                    begin
    -                      case left.location.loc of
    -                        LOC_CMMREGISTER,
    -                        LOC_MMREGISTER:
    -                          hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,right.resultdef,left.resultdef,right.location.register,left.location.register,mms_movescalar);
    -                        LOC_REFERENCE,
    -                        LOC_CREFERENCE:
    -                          hlcg.a_loadmm_reg_ref(current_asmdata.CurrAsmList,right.resultdef,left.resultdef,right.location.register,left.location.reference,mms_movescalar);
    -                        else
    -                          internalerror(2009112601);
    -                      end;
    -                    end;
    +                    shuffle := mms_movescalar;
    +
    +                  case left.location.loc of
    +                    LOC_CMMREGISTER,
    +                    LOC_MMREGISTER:
    +                      hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,right.resultdef,left.resultdef,right.location.register,left.location.register, shuffle);
    +                    LOC_REFERENCE,
    +                    LOC_CREFERENCE:
    +                      hlcg.a_loadmm_reg_ref(current_asmdata.CurrAsmList,right.resultdef,left.resultdef,right.location.register,left.location.reference, shuffle);
    +                    else
    +                      internalerror(2009112601);
    +                  end;
                     end;
                   LOC_REGISTER,
                   LOC_CREGISTER :
    Index: compiler/x86/aasmcpu.pas
    ===================================================================
    --- compiler/x86/aasmcpu.pas	(revision 37727)
    +++ compiler/x86/aasmcpu.pas	(working copy)
    @@ -52,6 +52,7 @@
           OT_BITS64    = $00000008;  { x86_64 and FPU }
           OT_BITS128   = $10000000;  { 16 byte SSE }
           OT_BITS256   = $20000000;  { 32 byte AVX }
    +      OT_BITS512   = $40000000;  { 64 byte AVX512 }
           OT_BITS80    = $00000010;  { FPU only  }
           OT_FAR       = $00000020;  { this means 16:16 or 16:32, like in CALL/JMP }
           OT_NEAR      = $00000040;
    @@ -608,7 +609,8 @@
               OT_NEAR,OT_FAR,OT_SHORT,
               OT_NONE,
               OT_BITS128,
    -          OT_BITS256
    +          OT_BITS256,
    +          OT_BITS512
              ),
              (OT_NONE,
               OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_BITS8,OT_BITS8,OT_BITS16,OT_BITS8,OT_BITS16,OT_BITS32,
    @@ -618,7 +620,8 @@
               OT_NEAR,OT_FAR,OT_SHORT,
               OT_NONE,
               OT_BITS128,
    -          OT_BITS256
    +          OT_BITS256,
    +          OT_BITS512
              ),
              (OT_NONE,
               OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_NONE,OT_NONE,OT_NONE,OT_NONE,OT_NONE,OT_NONE,
    @@ -628,7 +631,8 @@
               OT_NEAR,OT_FAR,OT_SHORT,
               OT_NONE,
               OT_BITS128,
    -          OT_BITS256
    +          OT_BITS256,
    +          OT_BITS512
              )
            );
     
    @@ -646,7 +650,8 @@
               OT_NEAR,OT_FAR,OT_SHORT,
               OT_NONE,
               OT_BITS128,
    -          OT_BITS256
    +          OT_BITS256,
    +          OT_BITS512
              ),
              (OT_NONE,
               OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_BITS8,OT_BITS8,OT_BITS16,
    @@ -656,7 +661,8 @@
               OT_NEAR,OT_FAR,OT_SHORT,
               OT_NONE,
               OT_BITS128,
    -          OT_BITS256
    +          OT_BITS256,
    +          OT_BITS512
              ),
              (OT_NONE,
               OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_NONE,OT_NONE,OT_NONE,
    @@ -666,7 +672,8 @@
               OT_NEAR,OT_FAR,OT_SHORT,
               OT_NONE,
               OT_BITS128,
    -          OT_BITS256
    +          OT_BITS256,
    +          OT_BITS512
              )
           );
     
    @@ -684,7 +691,8 @@
               OT_NEAR,OT_FAR,OT_SHORT,
               OT_NONE,
               OT_BITS128,
    -          OT_BITS256
    +          OT_BITS256,
    +          OT_BITS512
              ),
              (OT_NONE,
               OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_BITS8,OT_BITS8,OT_BITS16,
    @@ -694,7 +702,8 @@
               OT_NEAR,OT_FAR,OT_SHORT,
               OT_NONE,
               OT_BITS128,
    -          OT_BITS256
    +          OT_BITS256,
    +          OT_BITS512
              ),
              (OT_NONE,
               OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_NONE,OT_NONE,OT_NONE,
    @@ -704,7 +713,8 @@
               OT_NEAR,OT_FAR,OT_SHORT,
               OT_NONE,
               OT_BITS128,
    -          OT_BITS256
    +          OT_BITS256,
    +          OT_BITS512
              )
           );
     
    Index: compiler/x86/cgx86.pas
    ===================================================================
    --- compiler/x86/cgx86.pas	(revision 37727)
    +++ compiler/x86/cgx86.pas	(working copy)
    @@ -158,20 +158,26 @@
           TCGSize2OpSize: Array[tcgsize] of topsize =
             (S_NO,S_B,S_W,S_L,S_Q,S_XMM,S_B,S_W,S_L,S_Q,S_XMM,
              S_FS,S_FL,S_FX,S_IQ,S_FXX,
    -         S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,
    -         S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM);
    +         S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,S_ZMM,
    +         S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM,
    +         S_NO,S_XMM,S_YMM,S_ZMM,
    +         S_NO,S_XMM,S_YMM,S_ZMM);
     {$elseif defined(i386)}
           TCGSize2OpSize: Array[tcgsize] of topsize =
             (S_NO,S_B,S_W,S_L,S_L,S_T,S_B,S_W,S_L,S_L,S_L,
              S_FS,S_FL,S_FX,S_IQ,S_FXX,
    -         S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,
    -         S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM);
    +         S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,S_ZMM,
    +         S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM,
    +         S_NO,S_XMM,S_YMM,S_ZMM,
    +         S_NO,S_XMM,S_YMM,S_ZMM);
     {$elseif defined(i8086)}
           TCGSize2OpSize: Array[tcgsize] of topsize =
             (S_NO,S_B,S_W,S_W,S_W,S_T,S_B,S_W,S_W,S_W,S_W,
              S_FS,S_FL,S_FX,S_IQ,S_FXX,
    -         S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,
    -         S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM);
    +         S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,S_ZMM,
    +         S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM,
    +         S_NO,S_XMM,S_YMM,S_ZMM,
    +         S_NO,S_XMM,S_YMM,S_ZMM);
     {$endif}
     
     {$ifndef NOTARGETWIN}
    @@ -1385,15 +1391,77 @@
              make_simple_ref(list,tmpref);
              if shuffle=nil then
                begin
    -             if fromsize=OS_M64 then
    -               list.concat(taicpu.op_ref_reg(A_MOVQ,S_NO,tmpref,reg))
    +             case fromsize of
    +             OS_F32:
    +               if UseAVX then
    +                 op := A_VMOVSS
    +               else
    +                 op := A_MOVSS;
    +             OS_F64:
    +               if UseAVX then
    +                 op := A_VMOVSD
    +               else
    +                 op := A_MOVSD;
    +             OS_M32, OS_32, OS_S32:
    +               if UseAVX then
    +                 op := A_VMOVD
    +               else
    +                 op := A_MOVD;
    +             OS_M64, OS_64, OS_S64:
    +               if UseAVX then
    +                 op := A_VMOVQ
    +               else
    +                 op := A_MOVQ;
    +             OS_MF128:
    +               { Use XMM transfer of packed singles }
    +               if UseAVX then
    +               begin
    +                 if tmpref.alignment = 16 then
    +                   op := A_VMOVAPS
    +                 else
    +                   op := A_VMOVUPS
    +               end else
    +               begin
    +                 if tmpref.alignment = 16 then
    +                   op := A_MOVAPS
    +                 else
    +                   op := A_MOVUPS
    +               end;
    +             OS_MD128:
    +               { Use XMM transfer of packed doubles }
    +               if UseAVX then
    +               begin
    +                 if tmpref.alignment = 16 then
    +                   op := A_VMOVAPD
    +                 else
    +                   op := A_VMOVUPD
    +               end else
    +               begin
    +                 if tmpref.alignment = 16 then
    +                   op := A_MOVAPD
    +                 else
    +                   op := A_MOVUPD
    +               end;
    +             OS_M128, OS_MS128:
    +               { Use XMM integer transfer }
    +               if UseAVX then
    +               begin
    +                 if tmpref.alignment = 16 then
    +                   op := A_VMOVDQA
    +                 else
    +                   op := A_VMOVDQU
    +               end else
    +               begin
    +                 if tmpref.alignment = 16 then
    +                   op := A_MOVDQA
    +                 else
    +                   op := A_MOVDQU
    +               end;
                  else
    -{$ifdef x86_64}
    -               { x86-64 has always properly aligned data }
    -               list.concat(taicpu.op_ref_reg(A_MOVDQA,S_NO,tmpref,reg));
    -{$else x86_64}
    -               list.concat(taicpu.op_ref_reg(A_MOVDQU,S_NO,tmpref,reg));
    -{$endif x86_64}
    +               { No valid transfer command available }
    +               internalerror(2017121410);
    +             end;
    +             list.concat(taicpu.op_ref_reg(op,S_NO,tmpref,reg));
                end
              else if shufflescalar(shuffle) then
                begin
    @@ -1420,15 +1488,77 @@
              make_simple_ref(list,tmpref);
              if shuffle=nil then
                begin
    -             if fromsize=OS_M64 then
    -               list.concat(taicpu.op_reg_ref(A_MOVQ,S_NO,reg,tmpref))
    +             case fromsize of
    +             OS_F32:
    +               if UseAVX then
    +                 op := A_VMOVSS
    +               else
    +                 op := A_MOVSS;
    +             OS_F64:
    +               if UseAVX then
    +                 op := A_VMOVSD
    +               else
    +                 op := A_MOVSD;
    +             OS_M32, OS_32, OS_S32:
    +               if UseAVX then
    +                 op := A_VMOVD
    +               else
    +                 op := A_MOVD;
    +             OS_M64, OS_64, OS_S64:
    +               if UseAVX then
    +                 op := A_VMOVQ
    +               else
    +                 op := A_MOVQ;
    +             OS_MF128:
    +               { Use XMM transfer of packed singles }
    +               if UseAVX then
    +               begin
    +                 if tmpref.alignment = 16 then
    +                   op := A_VMOVAPS
    +                 else
    +                   op := A_VMOVUPS
    +               end else
    +               begin
    +                 if tmpref.alignment = 16 then
    +                   op := A_MOVAPS
    +                 else
    +                   op := A_MOVUPS
    +               end;
    +             OS_MD128:
    +               { Use XMM transfer of packed doubles }
    +               if UseAVX then
    +               begin
    +                 if tmpref.alignment = 16 then
    +                   op := A_VMOVAPD
    +                 else
    +                   op := A_VMOVUPD
    +               end else
    +               begin
    +                 if tmpref.alignment = 16 then
    +                   op := A_MOVAPD
    +                 else
    +                   op := A_MOVUPD
    +               end;
    +             OS_M128, OS_MS128:
    +               { Use XMM integer transfer }
    +               if UseAVX then
    +               begin
    +                 if tmpref.alignment = 16 then
    +                   op := A_VMOVDQA
    +                 else
    +                   op := A_VMOVDQU
    +               end else
    +               begin
    +                 if tmpref.alignment = 16 then
    +                   op := A_MOVDQA
    +                 else
    +                   op := A_MOVDQU
    +               end;
                  else
    -{$ifdef x86_64}
    -               { x86-64 has always properly aligned data }
    -               list.concat(taicpu.op_reg_ref(A_MOVDQA,S_NO,reg,tmpref))
    -{$else x86_64}
    -               list.concat(taicpu.op_reg_ref(A_MOVDQU,S_NO,reg,tmpref))
    -{$endif x86_64}
    +               { No valid transfer command available }
    +               internalerror(2017121411);
    +             end;
    +             list.concat(taicpu.op_reg_ref(op,S_NO,reg,tmpref));
                end
              else if shufflescalar(shuffle) then
                begin
    Index: compiler/x86/itcpugas.pas
    ===================================================================
    --- compiler/x86/itcpugas.pas	(revision 37727)
    +++ compiler/x86/itcpugas.pas	(working copy)
    @@ -52,27 +52,28 @@
            'd',
            '','','',
            't',
    -        'x',
    -        'y'
    +       'x',
    +       'y',
    +       'z'
          );
          { suffix-to-opsize conversion tables, used in asmreadrer }
          { !! S_LQ excluded: movzlq does not exist, movslq is processed
            as a separate instruction w/o suffix (aka movsxd), and there are
            no more instructions needing it. }
    -     att_sizesuffixstr : array[0..13] of string[2] = (
    -       '','BW','BL','WL','BQ','WQ',{'LQ',}'B','W','L','S','Q','T','X','Y'
    +     att_sizesuffixstr : array[0..14] of string[2] = (
    +       '','BW','BL','WL','BQ','WQ',{'LQ',}'B','W','L','S','Q','T','X','Y','Z'
          );
    -     att_sizesuffix : array[0..13] of topsize = (
    -       S_NO,S_BW,S_BL,S_WL,S_BQ,S_WQ,{S_LQ,}S_B,S_W,S_L,S_NO,S_Q,S_NO,S_NO,S_NO
    +     att_sizesuffix : array[0..14] of topsize = (
    +       S_NO,S_BW,S_BL,S_WL,S_BQ,S_WQ,{S_LQ,}S_B,S_W,S_L,S_NO,S_Q,S_NO,S_NO,S_NO,S_NO
          );
    -     att_sizefpusuffix : array[0..13] of topsize = (
    -       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_FL,S_FS,S_NO,S_FX,S_NO,S_NO
    +     att_sizefpusuffix : array[0..14] of topsize = (
    +       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_FL,S_FS,S_NO,S_FX,S_NO,S_NO,S_NO
          );
    -     att_sizefpuintsuffix : array[0..13] of topsize = (
    -       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_IL,S_IS,S_IQ,S_NO,S_NO,S_NO
    +     att_sizefpuintsuffix : array[0..14] of topsize = (
    +       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_IL,S_IS,S_IQ,S_NO,S_NO,S_NO,S_NO
          );
    -     att_sizemmsuffix : array[0..13] of topsize = (
    -       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM
    +     att_sizemmsuffix : array[0..14] of topsize = (
    +       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM
          );
     {$else x86_64}
          gas_opsize2str : array[topsize] of string[2] = ('',
    @@ -82,24 +83,25 @@
            'd',
            '','','',
            't',
    -        'x',
    -        'y'
    +       'x',
    +       'y',
    +       'z'
          );
          { suffix-to-opsize conversion tables, used in asmreadrer }
    -     att_sizesuffixstr : array[0..11] of string[2] = (
    -       '','BW','BL','WL','B','W','L','S','Q','T','X','Y'
    +     att_sizesuffixstr : array[0..12] of string[2] = (
    +       '','BW','BL','WL','B','W','L','S','Q','T','X','Y','Z'
          );
    -     att_sizesuffix : array[0..11] of topsize = (
    -       S_NO,S_BW,S_BL,S_WL,S_B,S_W,S_L,S_NO,S_NO,S_NO,S_NO,S_NO
    +     att_sizesuffix : array[0..12] of topsize = (
    +       S_NO,S_BW,S_BL,S_WL,S_B,S_W,S_L,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO
          );
    -     att_sizefpusuffix : array[0..11] of topsize = (
    -       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_FL,S_FS,S_NO,S_FX,S_NO,S_NO
    +     att_sizefpusuffix : array[0..12] of topsize = (
    +       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_FL,S_FS,S_NO,S_FX,S_NO,S_NO,S_NO
          );
    -     att_sizefpuintsuffix : array[0..11] of topsize = (
    -       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_IL,S_IS,S_IQ,S_NO,S_NO,S_NO
    +     att_sizefpuintsuffix : array[0..12] of topsize = (
    +       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_IL,S_IS,S_IQ,S_NO,S_NO,S_NO,S_NO
          );
    -     att_sizemmsuffix : array[0..11] of topsize = (
    -       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM
    +     att_sizemmsuffix : array[0..12] of topsize = (
    +       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM
          );
     
     {$endif x86_64}
    Index: compiler/x86/rax86.pas
    ===================================================================
    --- compiler/x86/rax86.pas	(revision 37727)
    +++ compiler/x86/rax86.pas	(working copy)
    @@ -343,7 +343,8 @@
          0,0,0,
          80,
          128,
    -     256
    +     256,
    +     512
         );
     {$else}
     topsize2memsize: array[topsize] of integer =
    @@ -354,7 +355,8 @@
        0,0,0,
        80,
        128,
    -   256
    +   256,
    +   512
       );
     {$endif}
     
    Index: compiler/x86_64/aoptcpu.pas
    ===================================================================
    --- compiler/x86_64/aoptcpu.pas	(revision 37727)
    +++ compiler/x86_64/aoptcpu.pas	(working copy)
    @@ -74,10 +74,14 @@
                   A_MOVZX:
                     Result:=OptPass1Movx(p);
                   A_VMOVAPS,
    -              A_VMOVAPD:
    +              A_VMOVAPD,
    +              A_VMOVUPS,
    +              A_VMOVUPD:
                     result:=OptPass1VMOVAP(p);
                   A_MOVAPD,
    -              A_MOVAPS:
    +              A_MOVAPS,
    +              A_MOVUPD,
    +              A_MOVUPS:
                     result:=OptPass1MOVAP(p);
                   A_VDIVSD,
                   A_VDIVSS,
    Index: compiler/x86_64/cpubase.inc
    ===================================================================
    --- compiler/x86_64/cpubase.inc	(revision 37727)
    +++ compiler/x86_64/cpubase.inc	(working copy)
    @@ -35,7 +35,8 @@
         S_NEAR,S_FAR,S_SHORT,
         S_T,
         S_XMM,
    -    S_YMM
    +    S_YMM,
    +    S_ZMM
       );
     
       TOpSizes = set of topsize;
    
    VECTORIZATION.patch (26,949 bytes)

Relationships

related to 0032781 (resolved, Florian): [Feature request] "vectorcall"
related to 0028037 (closed, Jonas Maebe): $codealign localmin=16 doesn't work

Activities

Jonas Maebe

2015-04-16 11:05

manager   ~0082955

SSE requires 16 byte alignment. While unaligned load/store SSE instructions exist, FPC does not generate them because they are significantly slower and the only point of using vector operations is to make things as fast as possible.

Use {$codealign varmin=x} to specify the minimum alignment for global/static variables: http://www.freepascal.org/docs-html/prog/progsu9.html

The natural alignment of single (and hence array of single) on Darwin i386 is 4. Automatically using the correct alignment for vector operations will only be done when/if we introduce a dedicated vector type.

Adriaan van Os

2015-04-16 11:32

developer   ~0082957

Thanks for the info !

I added
{$codealign varmin=16}

The crash disappears, but the results are wrong.

adriaan% fpc -Sv -CfSSE3 -alnrt ssetest.pas
Free Pascal Compiler version 2.6.4 [2014/02/26] for i386
Copyright (c) 1993-2014 by Florian Klaempfl and others
Target OS: Darwin for i386
Compiling ssetest.pas
Assembling ssetest
Linking ssetest
53 lines compiled, 0.1 sec

adriaan% ./ssetest

v1[ 0] = 2.000000030E-01
v1[ 1] = 2.000000030E-01
v1[ 2] = 2.000000030E-01
v1[ 3] = 2.000000030E-01

v2[ 0] = 3.000000119E-01
v2[ 1] = 3.000000119E-01
v2[ 2] = 3.000000119E-01
v2[ 3] = 3.000000119E-01

v3 := v1 + v2;
v3[ 0] = 0.000000000E+00
v3[ 1] = 0.000000000E+00
v3[ 2] = 0.000000000E+00
v3[ 3] = 0.000000000E+00

v3 := v1 - v2;
v3[ 0] = 0.000000000E+00
v3[ 1] = 0.000000000E+00
v3[ 2] = 0.000000000E+00
v3[ 3] = 0.000000000E+00

v3 := v1 * v2;
v3[ 0] = 0.000000000E+00
v3[ 1] = 0.000000000E+00
v3[ 2] = 0.000000000E+00
v3[ 3] = 0.000000000E+00

v3 := v1 / v2;
v3[ 0] = 0.000000000E+00
v3[ 1] = 0.000000000E+00
v3[ 2] = 0.000000000E+00
v3[ 3] = 0.000000000E+00

Jonas Maebe

2015-04-16 11:39

manager   ~0082958

I can't help with that, I'm not a maintainer of this functionality.

Adriaan van Os

2015-04-16 12:31

developer   ~0082959

And when using local variables, the test still crashes.

{$codealign varmin=16} { minimum alignment of 16 for global/static variables, as suggested earlier in this report }
{$codealign localmin=16} { NOTE(review): intended to request 16-byte minimum alignment for locals; the reporter observes it appears not to take effect }
program ssetest;

procedure test; { same arithmetic test as the original program, but with the arrays declared as local variables }
var v1, v2, v3: array[ 0..3] of single; { local 4-element single-precision arrays: v1, v2 are operands, v3 receives each result }
begin
  writeln;
  v1[ 0] := 0.2; { fill every element of v1 with 0.2 }
  v1[ 1] := 0.2;
  v1[ 2] := 0.2;
  v1[ 3] := 0.2;
  writeln( 'v1[ 0] = ', v1[ 0]); { print v1 so the log shows the operands were stored correctly }
  writeln( 'v1[ 1] = ', v1[ 1]);
  writeln( 'v1[ 2] = ', v1[ 2]);
  writeln( 'v1[ 3] = ', v1[ 3]);
  writeln;
  v2[ 0] := 0.3; { fill every element of v2 with 0.3 }
  v2[ 1] := 0.3;
  v2[ 2] := 0.3;
  v2[ 3] := 0.3;
  writeln( 'v2[ 0] = ', v2[ 0]); { print v2; this is the last output seen in the failing run log }
  writeln( 'v2[ 1] = ', v2[ 1]);
  writeln( 'v2[ 2] = ', v2[ 2]);
  writeln( 'v2[ 3] = ', v2[ 3]);
  writeln;
  v3 := v1 + v2; { whole-array addition; the run log reports runtime error 216 before the next writeln prints }
  writeln( 'v3 := v1 + v2;');
  writeln( 'v3[ 0] = ', v3[ 0]); { print each element of the sum }
  writeln( 'v3[ 1] = ', v3[ 1]);
  writeln( 'v3[ 2] = ', v3[ 2]);
  writeln( 'v3[ 3] = ', v3[ 3]);
  writeln;
  v3 := v1 - v2; { whole-array subtraction }
  writeln( 'v3 := v1 - v2;');
  writeln( 'v3[ 0] = ', v3[ 0]);
  writeln( 'v3[ 1] = ', v3[ 1]);
  writeln( 'v3[ 2] = ', v3[ 2]);
  writeln( 'v3[ 3] = ', v3[ 3]);
  writeln;
  v3 := v1 * v2; { whole-array multiplication }
  writeln( 'v3 := v1 * v2;');
  writeln( 'v3[ 0] = ', v3[ 0]);
  writeln( 'v3[ 1] = ', v3[ 1]);
  writeln( 'v3[ 2] = ', v3[ 2]);
  writeln( 'v3[ 3] = ', v3[ 3]);
  writeln;
  v3 := v1 / v2; { whole-array division }
  writeln( 'v3 := v1 / v2;');
  writeln( 'v3[ 0] = ', v3[ 0]);
  writeln( 'v3[ 1] = ', v3[ 1]);
  writeln( 'v3[ 2] = ', v3[ 2]);
  writeln( 'v3[ 3] = ', v3[ 3]);
  writeln;
end;

begin
  test { run the local-variable variant of the test }
end.


adriaan% fpc -Sv -CfSSE3 ssetest.pas
Free Pascal Compiler version 2.6.4 [2014/02/26] for i386
Copyright (c) 1993-2014 by Florian Klaempfl and others
Target OS: Darwin for i386
Compiling ssetest.pas
Assembling (pipe) ssetest.s
Linking ssetest
60 lines compiled, 0.1 sec

adriaan% ./ssetest

v1[ 0] = 2.000000030E-01
v1[ 1] = 2.000000030E-01
v1[ 2] = 2.000000030E-01
v1[ 3] = 2.000000030E-01

v2[ 0] = 3.000000119E-01
v2[ 1] = 3.000000119E-01
v2[ 2] = 3.000000119E-01
v2[ 3] = 3.000000119E-01

Runtime error 216 at $000110D0
  $000110D0
  $000116F0
  $0002F746
  $00010D89
  $00010CB8
  $00000001

Adriaan van Os

2015-04-16 12:43

developer   ~0082960

Apparently {$codealign localmin=16} doesn't work, as shown by this test:

{$codealign varmin=16}
{$codealign localmin=16}
program ssetest;

procedure test;
var v1, v2, v3: array[ 0..3] of single;
begin
  writeln;
  writeln( 'v1 located at address ', HexStr( @v1));
  writeln( 'v2 located at address ', HexStr( @v2));
  writeln( 'v3 located at address ', HexStr( @v3));
  writeln;
  v1[ 0] := 0.2;
  v1[ 1] := 0.2;
  v1[ 2] := 0.2;
  v1[ 3] := 0.2;
  writeln( 'v1[ 0] = ', v1[ 0]);
  writeln( 'v1[ 1] = ', v1[ 1]);
  writeln( 'v1[ 2] = ', v1[ 2]);
  writeln( 'v1[ 3] = ', v1[ 3]);
  writeln;
  v2[ 0] := 0.3;
  v2[ 1] := 0.3;
  v2[ 2] := 0.3;
  v2[ 3] := 0.3;
  writeln( 'v2[ 0] = ', v2[ 0]);
  writeln( 'v2[ 1] = ', v2[ 1]);
  writeln( 'v2[ 2] = ', v2[ 2]);
  writeln( 'v2[ 3] = ', v2[ 3]);
  writeln;
  v3 := v1 + v2;
  writeln( 'v3 := v1 + v2;');
  writeln( 'v3[ 0] = ', v3[ 0]);
  writeln( 'v3[ 1] = ', v3[ 1]);
  writeln( 'v3[ 2] = ', v3[ 2]);
  writeln( 'v3[ 3] = ', v3[ 3]);
  writeln;
  v3 := v1 - v2;
  writeln( 'v3 := v1 - v2;');
  writeln( 'v3[ 0] = ', v3[ 0]);
  writeln( 'v3[ 1] = ', v3[ 1]);
  writeln( 'v3[ 2] = ', v3[ 2]);
  writeln( 'v3[ 3] = ', v3[ 3]);
  writeln;
  v3 := v1 * v2;
  writeln( 'v3 := v1 * v2;');
  writeln( 'v3[ 0] = ', v3[ 0]);
  writeln( 'v3[ 1] = ', v3[ 1]);
  writeln( 'v3[ 2] = ', v3[ 2]);
  writeln( 'v3[ 3] = ', v3[ 3]);
  writeln;
  v3 := v1 / v2;
  writeln( 'v3 := v1 / v2;');
  writeln( 'v3[ 0] = ', v3[ 0]);
  writeln( 'v3[ 1] = ', v3[ 1]);
  writeln( 'v3[ 2] = ', v3[ 2]);
  writeln( 'v3[ 3] = ', v3[ 3]);
  writeln;
end;

begin
  test
end.


adriaan% fpc -Sv -CfSSE3 ssetest.pas
Free Pascal Compiler version 2.6.4 [2014/02/26] for i386
Copyright (c) 1993-2014 by Florian Klaempfl and others
Target OS: Darwin for i386
Compiling ssetest.pas
Assembling (pipe) ssetest.s
Linking ssetest
64 lines compiled, 0.1 sec

adriaan% ./ssetest

v1 located at address BFFFF918
v2 located at address BFFFF908
v3 located at address BFFFF8F8

v1[ 0] = 2.000000030E-01
v1[ 1] = 2.000000030E-01
v1[ 2] = 2.000000030E-01
v1[ 3] = 2.000000030E-01

v2[ 0] = 3.000000119E-01
v2[ 1] = 3.000000119E-01
v2[ 2] = 3.000000119E-01
v2[ 3] = 3.000000119E-01

Runtime error 216 at $00011079
  $00011079
  $000116A0
  $0002F6F6
  $00010C29
  $00010B58
  $00000001

Thaddy de Koning

2015-04-16 19:05

reporter   ~0082965

Can you reproduce this with a maintained version of the compiler? 2.6.4 is eol. Can you use 3.0.1 or trunk?
I can't test this on i386 darwin.

Nitorami

2015-04-16 22:07

reporter   ~0082971

Same behaviour on Windows XP (i386-win32) with FPC 3.1.1: the code compiles, but the results of the vector operations are all zero. I'm surprised that this compiles in the first place; I never knew of the -Sv switch.

Nitorami

2015-04-16 22:32

reporter   ~0082972

Is this really a supported function ? I know little about assembler, but when I look at the dump, the vectors are aligned as expected at 16-byte boundaries

v1: 0040E000
v2: 0040E010
v3: 0040E020

but v3 := v1 + v2 is translated as

$00401711: movdqu 0x40e000,%xmm0
$00401719: addps 0x40e010,%xmm0

i.e. v3 is never written.

Jonas Maebe

2015-04-16 22:49

manager   ~0082977

> Is this really a supported function ?

It was implemented at some point and presumably worked, but
1) it was never widely advertised
2) there are no regression tests for it

So it's very likely that it got broken without anyone noticing.

Nitorami

2015-04-16 23:08

reporter   ~0082979

Pity. Any plans to resurrect this ? Wouldn't native support of float vector operations be a good argument for the compiler ?

Jonas Maebe

2015-04-16 23:16

manager   ~0082980

Features never get added to FPC because they are a good argument in favour of the compiler, but only because someone is interested in working on that feature. Maybe Florian, who added the original support, is interested in fixing it.

Fully featured vector support would be nice to have, but it would take a lot of work.

Thaddy de Koning

2017-12-12 09:29

reporter   ~0104657

Last edited: 2017-12-12 09:32

View 3 revisions

Then -Sv should be disabled for now. The behaviour is similar on armhf, but there the compiler raises an abstract error:
 $ fpc -CX -XX -Xs -CfVFPv3 -O4 -Sv -a ssetest.pas
Free Pascal Compiler version 3.1.1-r37709 [2017/12/11] for arm
Copyright (c) 1993-2017 by Florian Klaempfl and others
Note: Switching assembler to default source writing assembler
Target OS: Linux for ARMHF
Compiling ssetest.pas
ssetest.pas(24,12) Error: Compilation raised exception internally
Fatal: Compilation aborted
An unhandled exception occurred at $0024C2F8:
EAbstractError: Abstract method called
  $0024C2F8
  $001ACDAC
  $00223E34
  $000425EC

-CfVFPv4 does the same.

In this case it is probably because there are stubs but no implementation for the operators.

J. Gareth Moreton

2017-12-14 21:24

developer  

VECTORIZATION.patch (26,949 bytes)
Index: compiler/cgbase.pas
===================================================================
--- compiler/cgbase.pas	(revision 37727)
+++ compiler/cgbase.pas	(working copy)
@@ -165,13 +165,18 @@
          not be loaded in a register directly }
        TCgSize = (OS_NO,
                  { integer registers }
-                  OS_8,OS_16,OS_32,OS_64,OS_128,OS_S8,OS_S16,OS_S32,OS_S64,OS_S128,
-                 { single,double,extended,comp,float128 }
-                  OS_F32,OS_F64,OS_F80,OS_C64,OS_F128,
+                  OS_8,   OS_16,   OS_32,   OS_64,   OS_128,
+                  OS_S8,  OS_S16,  OS_S32,  OS_S64,  OS_S128,
+                 { single, double, extended, comp, float128 }
+                  OS_F32, OS_F64,  OS_F80,  OS_C64,  OS_F128,
                  { multi-media sizes: split in byte, word, dword, ... }
                  { entities, then the signed counterparts             }
-                  OS_M8,OS_M16,OS_M32,OS_M64,OS_M128,OS_M256,  
-                  OS_MS8,OS_MS16,OS_MS32,OS_MS64,OS_MS128,OS_MS256 );  
+                  OS_M8,  OS_M16,  OS_M32,  OS_M64,  OS_M128,  OS_M256,  OS_M512,
+                  OS_MS8, OS_MS16, OS_MS32, OS_MS64, OS_MS128, OS_MS256, OS_MS512,
+                 { multi-media sizes: single-precision floating-point }
+                  OS_MF32, OS_MF128, OS_MF256, OS_MF512,
+                 { multi-media sizes: double-precision floating-point }
+                  OS_MD64, OS_MD128, OS_MD256, OS_MD512);
 
       { Register types }
       TRegisterType = (
@@ -307,12 +312,20 @@
        NR_INVALID    = tregister($fffffffff);
 
        tcgsize2size : Array[tcgsize] of integer =
+
+        (0,
          { integer values }
-        (0,1,2,4,8,16,1,2,4,8,16,
+         1,  2,  4,  8, 16,
+         1,  2,  4,  8, 16,
          { floating point values }
-         4,8,10,8,16,
+         4,  8, 10,  8, 16,
          { multimedia values }
-         1,2,4,8,16,32,1,2,4,8,16,32); 
+         1,  2,  4,  8, 16, 32, 64,
+         1,  2,  4,  8, 16, 32, 64,
+         { single-precision multimedia values }
+         4, 16, 32, 64,
+         { double-precision multimedia values }
+         8, 16, 32, 64);
 
        tfloat2tcgsize: array[tfloattype] of tcgsize =
          (OS_F32,OS_F64,OS_F80,OS_F80,OS_C64,OS_C64,OS_F128);
@@ -348,17 +361,24 @@
        { Table to convert tcgsize variables to the correspondending
          unsigned types }
        tcgsize2unsigned : array[tcgsize] of tcgsize = (OS_NO,
-          OS_8,OS_16,OS_32,OS_64,OS_128,OS_8,OS_16,OS_32,OS_64,OS_128,
-          OS_F32,OS_F64,OS_F80,OS_C64,OS_F128,
-          OS_M8,OS_M16,OS_M32,OS_M64,OS_M128,OS_M256,OS_M8,OS_M16,OS_M32,
-          OS_M64,OS_M128,OS_M256);
+         OS_8,    OS_16,   OS_32,   OS_64,   OS_128,
+         OS_8,    OS_16,   OS_32,   OS_64,   OS_128,
 
+         OS_F32,  OS_F64,  OS_F80,  OS_C64,  OS_F128,
+         OS_M8,   OS_M16,  OS_M32,  OS_M64,  OS_M128, OS_M256, OS_M512,
+         OS_M8,   OS_M16,  OS_M32,  OS_M64,  OS_M128, OS_M256, OS_M512,
+         OS_MF32, OS_MF128,OS_MF256,OS_MF512,
+         OS_MD64, OS_MD128,OS_MD256,OS_MD512);
+
        tcgsize2signed : array[tcgsize] of tcgsize = (OS_NO,
-          OS_S8,OS_S16,OS_S32,OS_S64,OS_S128,OS_S8,OS_S16,OS_S32,OS_S64,OS_S128,
-          OS_F32,OS_F64,OS_F80,OS_C64,OS_F128,
-          OS_M8,OS_M16,OS_M32,OS_M64,OS_M128,OS_M256,OS_M8,OS_M16,OS_M32,
-          OS_M64,OS_M128,OS_M256);
+         OS_S8,   OS_S16,  OS_S32,  OS_S64,  OS_S128,
+         OS_S8,   OS_S16,  OS_S32,  OS_S64,  OS_S128,
 
+         OS_F32,  OS_F64,  OS_F80,  OS_C64,  OS_F128,
+         OS_MS8,  OS_MS16, OS_MS32, OS_MS64, OS_MS128,OS_MS256,OS_MS512,
+         OS_MS8,  OS_MS16, OS_MS32, OS_MS64, OS_MS128,OS_MS256,OS_MS512,
+         OS_MF32, OS_MF128,OS_MF256,OS_MF512,
+         OS_MD64, OS_MD128,OS_MD256,OS_MD512);
 
        tcgloc2str : array[TCGLoc] of string[12] = (
             'LOC_INVALID',
@@ -404,6 +424,8 @@
     }
     function int_cgsize(const a: tcgint): tcgsize;{$ifdef USEINLINE}inline;{$endif}
     function int_float_cgsize(const a: tcgint): tcgsize;
+    function float_array_cgsize(const a: tcgint): tcgsize;{$ifdef USEINLINE}inline;{$endif}
+    function double_array_cgsize(const a: tcgint): tcgsize;{$ifdef USEINLINE}inline;{$endif}
 
     function tcgsize2str(cgsize: tcgsize):string;
 
@@ -685,22 +707,53 @@
 
 
     function int_float_cgsize(const a: tcgint): tcgsize;
-      begin
-        case a of
-          4 :
-            result:=OS_F32;
-          8 :
-            result:=OS_F64;
-          10 :
-            result:=OS_F80;
-          16 :
-            result:=OS_F128;
-          else
-            internalerror(200603211);
-        end;
+    begin
+      case a of
+      4:
+        result := OS_F32;
+      8:
+        result := OS_F64;
+      10:
+        result := OS_F80;
+      16:
+        result := OS_F128;
+      else
+        internalerror(200603211);
       end;
+    end;
 
+    function float_array_cgsize(const a: tcgint): tcgsize;{$ifdef USEINLINE}inline;{$endif}
+    begin
+      case a of
+      4:
+        result := OS_MF32;
+      16:
+        result := OS_MF128;
+      32:
+        result := OS_MF256;
+      64:
+        result := OS_MF512;
+      else
+        result := int_cgsize(a);
+      end;
+    end;
 
+    function double_array_cgsize(const a: tcgint): tcgsize;{$ifdef USEINLINE}inline;{$endif}
+    begin
+      case a of
+      8:
+        result := OS_MD64;
+      16:
+        result := OS_MD128;
+      32:
+        result := OS_MD256;
+      64:
+        result := OS_MD512;
+      else
+        result := int_cgsize(a);
+      end;
+    end;
+
     function tcgsize2str(cgsize: tcgsize):string;
       begin
         Str(cgsize, Result);
Index: compiler/defutil.pas
===================================================================
--- compiler/defutil.pas	(revision 37727)
+++ compiler/defutil.pas	(working copy)
@@ -1268,8 +1268,23 @@
           arraydef :
             begin
               if is_dynamic_array(def) or not is_special_array(def) then
-                result := int_cgsize(def.size)
-              else
+              begin
+                if (cs_support_vectors in current_settings.globalswitches) and is_vector(def) and ((TArrayDef(def).elementdef.typ = floatdef) and not (cs_fp_emulation in current_settings.moduleswitches)) then
+                begin
+                  { Determine if, based on the floating-point type and the size
+                    of the array, if it can be made into a vector }
+                  case TFloatDef(def).floattype of
+                  s32real:
+                    result := float_array_cgsize(def.size);
+                  s64real:
+                    result := double_array_cgsize(def.size);
+                  else
+                    { If not, fall back }
+                    result := int_cgsize(def.size);
+                  end;
+                end else
+                  result := int_cgsize(def.size);
+              end else
                 result := OS_NO;
             end;
           else
@@ -1309,7 +1324,8 @@
         case def.typ of
           arraydef:
             begin
-              if tarraydef(def).elementdef.typ in [orddef,floatdef] then
+              case tarraydef(def).elementdef.typ of
+              orddef:
                 begin
                   { this is not correct, OS_MX normally mean that the vector
                     contains elements of size X. However, vectors themselves
@@ -1322,12 +1338,39 @@
                     8: result:=OS_M64;
                     16: result:=OS_M128;
                     32: result:=OS_M256;
+                    64: result:=OS_M512;
                     else
                       internalerror(2013060103);
                   end;
-                end
+                end;
+              floatdef:
+                begin
+                  case TFloatDef(tarraydef(def).elementdef).floattype of
+                    s32real:
+                      case def.size of
+                      4: result:=OS_MF32;
+                      16: result:=OS_MF128;
+                      32: result:=OS_MF256;
+                      64: result:=OS_MF512;
+                      else
+                        internalerror(2017121400);
+                      end;
+                    s64real:
+                      case def.size of
+                        8: result:=OS_MD64;
+                        16: result:=OS_MD128;
+                        32: result:=OS_MD256;
+                        64: result:=OS_MD512;
+                        else
+                          internalerror(2017121401);
+                      end;
+                    else
+                      internalerror(2017121402);
+                  end;
+                end;
               else
                 result:=def_cgsize(def);
+              end;
             end
           else
             result:=def_cgsize(def);
Index: compiler/i386/cpubase.inc
===================================================================
--- compiler/i386/cpubase.inc	(revision 37727)
+++ compiler/i386/cpubase.inc	(working copy)
@@ -35,7 +35,8 @@
         S_NEAR,S_FAR,S_SHORT,
         S_T,
         S_XMM,
-        S_YMM
+        S_YMM,
+        S_ZMM
       );
 
       TOpSizes = set of topsize;
Index: compiler/i8086/cpubase.inc
===================================================================
--- compiler/i8086/cpubase.inc	(revision 37727)
+++ compiler/i8086/cpubase.inc	(working copy)
@@ -35,7 +35,8 @@
         S_NEAR,S_FAR,S_SHORT,
         S_T,
         S_XMM,
-        S_YMM
+        S_YMM,
+        S_ZMM
       );
 
       TOpSizes = set of topsize;
Index: compiler/ncgld.pas
===================================================================
--- compiler/ncgld.pas	(revision 37727)
+++ compiler/ncgld.pas	(working copy)
@@ -682,6 +682,7 @@
 
     procedure tcgassignmentnode.pass_generate_code;
       var
+         shuffle : pmmshuffle;
          hlabel : tasmlabel;
          href : treference;
          releaseright : boolean;
@@ -968,22 +969,21 @@
               LOC_MMREGISTER,
               LOC_CMMREGISTER:
                 begin
-                  if left.resultdef.typ=arraydef then
-                    begin
-                    end
+                  if (is_vector(left.resultdef)) then
+                    shuffle := nil
                   else
-                    begin
-                      case left.location.loc of
-                        LOC_CMMREGISTER,
-                        LOC_MMREGISTER:
-                          hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,right.resultdef,left.resultdef,right.location.register,left.location.register,mms_movescalar);
-                        LOC_REFERENCE,
-                        LOC_CREFERENCE:
-                          hlcg.a_loadmm_reg_ref(current_asmdata.CurrAsmList,right.resultdef,left.resultdef,right.location.register,left.location.reference,mms_movescalar);
-                        else
-                          internalerror(2009112601);
-                      end;
-                    end;
+                    shuffle := mms_movescalar;
+
+                  case left.location.loc of
+                    LOC_CMMREGISTER,
+                    LOC_MMREGISTER:
+                      hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,right.resultdef,left.resultdef,right.location.register,left.location.register, shuffle);
+                    LOC_REFERENCE,
+                    LOC_CREFERENCE:
+                      hlcg.a_loadmm_reg_ref(current_asmdata.CurrAsmList,right.resultdef,left.resultdef,right.location.register,left.location.reference, shuffle);
+                    else
+                      internalerror(2009112601);
+                  end;
                 end;
               LOC_REGISTER,
               LOC_CREGISTER :
Index: compiler/x86/aasmcpu.pas
===================================================================
--- compiler/x86/aasmcpu.pas	(revision 37727)
+++ compiler/x86/aasmcpu.pas	(working copy)
@@ -52,6 +52,7 @@
       OT_BITS64    = $00000008;  { x86_64 and FPU }
       OT_BITS128   = $10000000;  { 16 byte SSE }
       OT_BITS256   = $20000000;  { 32 byte AVX }
+      OT_BITS512   = $40000000;  { 64 byte AVX512 }
       OT_BITS80    = $00000010;  { FPU only  }
       OT_FAR       = $00000020;  { this means 16:16 or 16:32, like in CALL/JMP }
       OT_NEAR      = $00000040;
@@ -608,7 +609,8 @@
           OT_NEAR,OT_FAR,OT_SHORT,
           OT_NONE,
           OT_BITS128,
-          OT_BITS256
+          OT_BITS256,
+          OT_BITS512
          ),
          (OT_NONE,
           OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_BITS8,OT_BITS8,OT_BITS16,OT_BITS8,OT_BITS16,OT_BITS32,
@@ -618,7 +620,8 @@
           OT_NEAR,OT_FAR,OT_SHORT,
           OT_NONE,
           OT_BITS128,
-          OT_BITS256
+          OT_BITS256,
+          OT_BITS512
          ),
          (OT_NONE,
           OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_NONE,OT_NONE,OT_NONE,OT_NONE,OT_NONE,OT_NONE,
@@ -628,7 +631,8 @@
           OT_NEAR,OT_FAR,OT_SHORT,
           OT_NONE,
           OT_BITS128,
-          OT_BITS256
+          OT_BITS256,
+          OT_BITS512
          )
        );
 
@@ -646,7 +650,8 @@
           OT_NEAR,OT_FAR,OT_SHORT,
           OT_NONE,
           OT_BITS128,
-          OT_BITS256
+          OT_BITS256,
+          OT_BITS512
          ),
          (OT_NONE,
           OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_BITS8,OT_BITS8,OT_BITS16,
@@ -656,7 +661,8 @@
           OT_NEAR,OT_FAR,OT_SHORT,
           OT_NONE,
           OT_BITS128,
-          OT_BITS256
+          OT_BITS256,
+          OT_BITS512
          ),
          (OT_NONE,
           OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_NONE,OT_NONE,OT_NONE,
@@ -666,7 +672,8 @@
           OT_NEAR,OT_FAR,OT_SHORT,
           OT_NONE,
           OT_BITS128,
-          OT_BITS256
+          OT_BITS256,
+          OT_BITS512
          )
       );
 
@@ -684,7 +691,8 @@
           OT_NEAR,OT_FAR,OT_SHORT,
           OT_NONE,
           OT_BITS128,
-          OT_BITS256
+          OT_BITS256,
+          OT_BITS512
          ),
          (OT_NONE,
           OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_BITS8,OT_BITS8,OT_BITS16,
@@ -694,7 +702,8 @@
           OT_NEAR,OT_FAR,OT_SHORT,
           OT_NONE,
           OT_BITS128,
-          OT_BITS256
+          OT_BITS256,
+          OT_BITS512
          ),
          (OT_NONE,
           OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_NONE,OT_NONE,OT_NONE,
@@ -704,7 +713,8 @@
           OT_NEAR,OT_FAR,OT_SHORT,
           OT_NONE,
           OT_BITS128,
-          OT_BITS256
+          OT_BITS256,
+          OT_BITS512
          )
       );
 
Index: compiler/x86/cgx86.pas
===================================================================
--- compiler/x86/cgx86.pas	(revision 37727)
+++ compiler/x86/cgx86.pas	(working copy)
@@ -158,20 +158,26 @@
       TCGSize2OpSize: Array[tcgsize] of topsize =
         (S_NO,S_B,S_W,S_L,S_Q,S_XMM,S_B,S_W,S_L,S_Q,S_XMM,
          S_FS,S_FL,S_FX,S_IQ,S_FXX,
-         S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,
-         S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM);
+         S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,S_ZMM,
+         S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM,
+         S_NO,S_XMM,S_YMM,S_ZMM,
+         S_NO,S_XMM,S_YMM,S_ZMM);
 {$elseif defined(i386)}
       TCGSize2OpSize: Array[tcgsize] of topsize =
         (S_NO,S_B,S_W,S_L,S_L,S_T,S_B,S_W,S_L,S_L,S_L,
          S_FS,S_FL,S_FX,S_IQ,S_FXX,
-         S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,
-         S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM);
+         S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,S_ZMM,
+         S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM,
+         S_NO,S_XMM,S_YMM,S_ZMM,
+         S_NO,S_XMM,S_YMM,S_ZMM);
 {$elseif defined(i8086)}
       TCGSize2OpSize: Array[tcgsize] of topsize =
         (S_NO,S_B,S_W,S_W,S_W,S_T,S_B,S_W,S_W,S_W,S_W,
          S_FS,S_FL,S_FX,S_IQ,S_FXX,
-         S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,
-         S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM);
+         S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,S_ZMM,
+         S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM,
+         S_NO,S_XMM,S_YMM,S_ZMM,
+         S_NO,S_XMM,S_YMM,S_ZMM);
 {$endif}
 
 {$ifndef NOTARGETWIN}
@@ -1385,15 +1391,77 @@
          make_simple_ref(list,tmpref);
          if shuffle=nil then
            begin
-             if fromsize=OS_M64 then
-               list.concat(taicpu.op_ref_reg(A_MOVQ,S_NO,tmpref,reg))
+             case fromsize of
+             OS_F32:
+               if UseAVX then
+                 op := A_VMOVSS
+               else
+                 op := A_MOVSS;
+             OS_F64:
+               if UseAVX then
+                 op := A_VMOVSD
+               else
+                 op := A_MOVSD;
+             OS_M32, OS_32, OS_S32:
+               if UseAVX then
+                 op := A_VMOVD
+               else
+                 op := A_MOVD;
+             OS_M64, OS_64, OS_S64:
+               if UseAVX then
+                 op := A_VMOVQ
+               else
+                 op := A_MOVQ;
+             OS_MF128:
+               { Use XMM transfer of packed singles }
+               if UseAVX then
+               begin
+                 if tmpref.alignment = 16 then
+                   op := A_VMOVAPS
+                 else
+                   op := A_VMOVUPS
+               end else
+               begin
+                 if tmpref.alignment = 16 then
+                   op := A_MOVAPS
+                 else
+                   op := A_MOVUPS
+               end;
+             OS_MD128:
+               { Use XMM transfer of packed doubles }
+               if UseAVX then
+               begin
+                 if tmpref.alignment = 16 then
+                   op := A_VMOVAPD
+                 else
+                   op := A_VMOVUPD
+               end else
+               begin
+                 if tmpref.alignment = 16 then
+                   op := A_MOVAPD
+                 else
+                   op := A_MOVUPD
+               end;
+             OS_M128, OS_MS128:
+               { Use XMM integer transfer }
+               if UseAVX then
+               begin
+                 if tmpref.alignment = 16 then
+                   op := A_VMOVDQA
+                 else
+                   op := A_VMOVDQU
+               end else
+               begin
+                 if tmpref.alignment = 16 then
+                   op := A_MOVDQA
+                 else
+                   op := A_MOVDQU
+               end;
              else
-{$ifdef x86_64}
-               { x86-64 has always properly aligned data }
-               list.concat(taicpu.op_ref_reg(A_MOVDQA,S_NO,tmpref,reg));
-{$else x86_64}
-               list.concat(taicpu.op_ref_reg(A_MOVDQU,S_NO,tmpref,reg));
-{$endif x86_64}
+               { No valid transfer command available }
+               internalerror(2017121410);
+             end;
+             list.concat(taicpu.op_ref_reg(op,S_NO,tmpref,reg));
            end
          else if shufflescalar(shuffle) then
            begin
@@ -1420,15 +1488,77 @@
          make_simple_ref(list,tmpref);
          if shuffle=nil then
            begin
-             if fromsize=OS_M64 then
-               list.concat(taicpu.op_reg_ref(A_MOVQ,S_NO,reg,tmpref))
+             case fromsize of
+             OS_F32:
+               if UseAVX then
+                 op := A_VMOVSS
+               else
+                 op := A_MOVSS;
+             OS_F64:
+               if UseAVX then
+                 op := A_VMOVSD
+               else
+                 op := A_MOVSD;
+             OS_M32, OS_32, OS_S32:
+               if UseAVX then
+                 op := A_VMOVD
+               else
+                 op := A_MOVD;
+             OS_M64, OS_64, OS_S64:
+               if UseAVX then
+                 op := A_VMOVQ
+               else
+                 op := A_MOVQ;
+             OS_MF128:
+               { Use XMM transfer of packed singles }
+               if UseAVX then
+               begin
+                 if tmpref.alignment = 16 then
+                   op := A_VMOVAPS
+                 else
+                   op := A_VMOVUPS
+               end else
+               begin
+                 if tmpref.alignment = 16 then
+                   op := A_MOVAPS
+                 else
+                   op := A_MOVUPS
+               end;
+             OS_MD128:
+               { Use XMM transfer of packed doubles }
+               if UseAVX then
+               begin
+                 if tmpref.alignment = 16 then
+                   op := A_VMOVAPD
+                 else
+                   op := A_VMOVUPD
+               end else
+               begin
+                 if tmpref.alignment = 16 then
+                   op := A_MOVAPD
+                 else
+                   op := A_MOVUPD
+               end;
+             OS_M128, OS_MS128:
+               { Use XMM integer transfer }
+               if UseAVX then
+               begin
+                 if tmpref.alignment = 16 then
+                   op := A_VMOVDQA
+                 else
+                   op := A_VMOVDQU
+               end else
+               begin
+                 if tmpref.alignment = 16 then
+                   op := A_MOVDQA
+                 else
+                   op := A_MOVDQU
+               end;
              else
-{$ifdef x86_64}
-               { x86-64 has always properly aligned data }
-               list.concat(taicpu.op_reg_ref(A_MOVDQA,S_NO,reg,tmpref))
-{$else x86_64}
-               list.concat(taicpu.op_reg_ref(A_MOVDQU,S_NO,reg,tmpref))
-{$endif x86_64}
+               { No valid transfer command available }
+               internalerror(2017121411);
+             end;
+             list.concat(taicpu.op_reg_ref(op,S_NO,reg,tmpref));
            end
          else if shufflescalar(shuffle) then
            begin
Index: compiler/x86/itcpugas.pas
===================================================================
--- compiler/x86/itcpugas.pas	(revision 37727)
+++ compiler/x86/itcpugas.pas	(working copy)
@@ -52,27 +52,28 @@
        'd',
        '','','',
        't',
-        'x',
-        'y'
+       'x',
+       'y',
+       'z'
      );
      { suffix-to-opsize conversion tables, used in asmreadrer }
      { !! S_LQ excluded: movzlq does not exist, movslq is processed
        as a separate instruction w/o suffix (aka movsxd), and there are
        no more instructions needing it. }
-     att_sizesuffixstr : array[0..13] of string[2] = (
-       '','BW','BL','WL','BQ','WQ',{'LQ',}'B','W','L','S','Q','T','X','Y'
+     att_sizesuffixstr : array[0..14] of string[2] = (
+       '','BW','BL','WL','BQ','WQ',{'LQ',}'B','W','L','S','Q','T','X','Y','Z'
      );
-     att_sizesuffix : array[0..13] of topsize = (
-       S_NO,S_BW,S_BL,S_WL,S_BQ,S_WQ,{S_LQ,}S_B,S_W,S_L,S_NO,S_Q,S_NO,S_NO,S_NO
+     att_sizesuffix : array[0..14] of topsize = (
+       S_NO,S_BW,S_BL,S_WL,S_BQ,S_WQ,{S_LQ,}S_B,S_W,S_L,S_NO,S_Q,S_NO,S_NO,S_NO,S_NO
      );
-     att_sizefpusuffix : array[0..13] of topsize = (
-       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_FL,S_FS,S_NO,S_FX,S_NO,S_NO
+     att_sizefpusuffix : array[0..14] of topsize = (
+       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_FL,S_FS,S_NO,S_FX,S_NO,S_NO,S_NO
      );
-     att_sizefpuintsuffix : array[0..13] of topsize = (
-       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_IL,S_IS,S_IQ,S_NO,S_NO,S_NO
+     att_sizefpuintsuffix : array[0..14] of topsize = (
+       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_IL,S_IS,S_IQ,S_NO,S_NO,S_NO,S_NO
      );
-     att_sizemmsuffix : array[0..13] of topsize = (
-       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM
+     att_sizemmsuffix : array[0..14] of topsize = (
+       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM
      );
 {$else x86_64}
      gas_opsize2str : array[topsize] of string[2] = ('',
@@ -82,24 +83,25 @@
        'd',
        '','','',
        't',
-        'x',
-        'y'
+       'x',
+       'y',
+       'z'
      );
      { suffix-to-opsize conversion tables, used in asmreadrer }
-     att_sizesuffixstr : array[0..11] of string[2] = (
-       '','BW','BL','WL','B','W','L','S','Q','T','X','Y'
+     att_sizesuffixstr : array[0..12] of string[2] = (
+       '','BW','BL','WL','B','W','L','S','Q','T','X','Y','Z'
      );
-     att_sizesuffix : array[0..11] of topsize = (
-       S_NO,S_BW,S_BL,S_WL,S_B,S_W,S_L,S_NO,S_NO,S_NO,S_NO,S_NO
+     att_sizesuffix : array[0..12] of topsize = (
+       S_NO,S_BW,S_BL,S_WL,S_B,S_W,S_L,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO
      );
-     att_sizefpusuffix : array[0..11] of topsize = (
-       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_FL,S_FS,S_NO,S_FX,S_NO,S_NO
+     att_sizefpusuffix : array[0..12] of topsize = (
+       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_FL,S_FS,S_NO,S_FX,S_NO,S_NO,S_NO
      );
-     att_sizefpuintsuffix : array[0..11] of topsize = (
-       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_IL,S_IS,S_IQ,S_NO,S_NO,S_NO
+     att_sizefpuintsuffix : array[0..12] of topsize = (
+       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_IL,S_IS,S_IQ,S_NO,S_NO,S_NO,S_NO
      );
-     att_sizemmsuffix : array[0..11] of topsize = (
-       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM
+     att_sizemmsuffix : array[0..12] of topsize = (
+       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM
      );
 
 {$endif x86_64}
Index: compiler/x86/rax86.pas
===================================================================
--- compiler/x86/rax86.pas	(revision 37727)
+++ compiler/x86/rax86.pas	(working copy)
@@ -343,7 +343,8 @@
      0,0,0,
      80,
      128,
-     256
+     256,
+     512
     );
 {$else}
 topsize2memsize: array[topsize] of integer =
@@ -354,7 +355,8 @@
    0,0,0,
    80,
    128,
-   256
+   256,
+   512
   );
 {$endif}
 
Index: compiler/x86_64/aoptcpu.pas
===================================================================
--- compiler/x86_64/aoptcpu.pas	(revision 37727)
+++ compiler/x86_64/aoptcpu.pas	(working copy)
@@ -74,10 +74,14 @@
               A_MOVZX:
                 Result:=OptPass1Movx(p);
               A_VMOVAPS,
-              A_VMOVAPD:
+              A_VMOVAPD,
+              A_VMOVUPS,
+              A_VMOVUPD:
                 result:=OptPass1VMOVAP(p);
               A_MOVAPD,
-              A_MOVAPS:
+              A_MOVAPS,
+              A_MOVUPD,
+              A_MOVUPS:
                 result:=OptPass1MOVAP(p);
               A_VDIVSD,
               A_VDIVSS,
Index: compiler/x86_64/cpubase.inc
===================================================================
--- compiler/x86_64/cpubase.inc	(revision 37727)
+++ compiler/x86_64/cpubase.inc	(working copy)
@@ -35,7 +35,8 @@
     S_NEAR,S_FAR,S_SHORT,
     S_T,
     S_XMM,
-    S_YMM
+    S_YMM,
+    S_ZMM
   );
 
   TOpSizes = set of topsize;
VECTORIZATION.patch (26,949 bytes)

J. Gareth Moreton

2017-12-14 21:25

developer   ~0104720

I've built a patch based on the latest version of FPC from the repository. There are some extra bits of code to help with future expansion when it comes to vectorization. Let me know how it goes for you.

J. Gareth Moreton

2018-02-15 00:38

developer   ~0106393

I might consider this one fixed now, unless bugs still appear, since the additions for feature 0032781 also include the code in the patch for this issue.

J. Gareth Moreton

2018-04-06 17:14

developer   ~0107649

This should work now that vectorcall and all of its supporting code have been implemented (even if the procedure in question doesn't actually use the vectorcall calling convention).

Issue History

Date Modified Username Field Change
2015-04-16 08:45 Adriaan van Os New Issue
2015-04-16 11:05 Jonas Maebe Note Added: 0082955
2015-04-16 11:05 Jonas Maebe Status new => resolved
2015-04-16 11:05 Jonas Maebe Resolution open => no change required
2015-04-16 11:05 Jonas Maebe Assigned To => Jonas Maebe
2015-04-16 11:32 Adriaan van Os Note Added: 0082957
2015-04-16 11:32 Adriaan van Os Status resolved => feedback
2015-04-16 11:32 Adriaan van Os Resolution no change required => reopened
2015-04-16 11:39 Jonas Maebe Note Added: 0082958
2015-04-16 11:39 Jonas Maebe Assigned To Jonas Maebe =>
2015-04-16 11:39 Jonas Maebe Status feedback => new
2015-04-16 11:39 Jonas Maebe Description Updated View Revisions
2015-04-16 12:31 Adriaan van Os Note Added: 0082959
2015-04-16 12:43 Adriaan van Os Note Added: 0082960
2015-04-16 19:05 Thaddy de Koning Note Added: 0082965
2015-04-16 22:07 Nitorami Note Added: 0082971
2015-04-16 22:32 Nitorami Note Added: 0082972
2015-04-16 22:49 Jonas Maebe Note Added: 0082977
2015-04-16 23:08 Nitorami Note Added: 0082979
2015-04-16 23:16 Jonas Maebe Note Added: 0082980
2015-05-06 11:54 Adriaan van Os Relationship added related to 0028037
2017-12-12 09:29 Thaddy de Koning Note Added: 0104657
2017-12-12 09:31 Thaddy de Koning Note Edited: 0104657 View Revisions
2017-12-12 09:32 Thaddy de Koning Note Edited: 0104657 View Revisions
2017-12-14 21:24 J. Gareth Moreton File Added: VECTORIZATION.patch
2017-12-14 21:25 J. Gareth Moreton Note Added: 0104720
2018-02-15 00:38 J. Gareth Moreton Note Added: 0106393
2018-04-06 17:11 J. Gareth Moreton Relationship added related to 0032781
2018-04-06 17:14 J. Gareth Moreton Fixed in Revision => 38206
2018-04-06 17:14 J. Gareth Moreton Note Added: 0107649
2018-04-06 17:14 J. Gareth Moreton Status new => resolved
2018-04-06 17:14 J. Gareth Moreton Fixed in Version => 3.1.1
2018-04-06 17:14 J. Gareth Moreton Resolution reopened => fixed
2018-04-06 17:14 J. Gareth Moreton Assigned To => J. Gareth Moreton
2018-04-06 17:14 J. Gareth Moreton Target Version => 3.1.1