View Issue Details

ID: 0037526 | Project: FPC | Category: Compiler | View Status: public | Last Update: 2020-09-27 23:06
Reporter: J. Gareth Moreton | Assigned To: Florian
Priority: normal | Severity: minor | Reproducibility: N/A
Status: resolved | Resolution: fixed
Platform: arm and aarch64 | OS: Linux (Raspberry Pi OS)
Product Version: 3.3.1
Fixed in Version: 3.3.1
Summary: 0037526: [Patch / Refactor] Maintenance of ARM/AARCH64 Peephole Optimizer
Description: This patch refactors the Peephole Optimizer for the ARM and AARCH64 platforms, especially the former, where individual optimisations in Pass 1 are moved to separate subroutines. This will aid future maintenance and expansion of the Peephole Optimizer for these two platforms.
Steps To Reproduce: Apply the patch and confirm identical compilation.
Additional Information: Some minor efficiency changes were made, such as attempting to reduce the number of calls to GetNextInstruction by rearranging conditions in if-statements or by factoring out common conditions.
Tags: aarch64, arm, patch, refactor
Fixed in Revision: 46975, 46976
FPCOldBugId
FPCTarget-
Attached Files

Activities

Florian

2020-08-16 17:19

administrator   ~0124925

After the patch, some optimizations are missing; see the diffs.
lineinfo.arm-linux.diff (3,323 bytes)   
*** rtl/units.1/arm-linux/lineinfo.s	2020-08-16 17:10:32.528524611 +0200
--- rtl/units.2/arm-linux/lineinfo.s	2020-08-16 17:14:22.338013124 +0200
***************
*** 76,88 ****
  	ldr	r1,.Lj5
  	ldr	r0,.Lj21
  	bl	EXEINFO_$$_OPENEXEFILE$TEXEFILE$SHORTSTRING$$BOOLEAN
! 	ands	r0,r0,#255
  	beq	.Lj3
  # [137] if ReadDebugLink(e,dbgfn) then
  	ldr	r1,.Lj24
  	ldr	r0,.Lj21
  	bl	EXEINFO_$$_READDEBUGLINK$TEXEFILE$SHORTSTRING$$BOOLEAN
! 	ands	r0,r0,#255
  	beq	.Lj27
  # [139] CloseExeFile(e);
  	ldr	r0,.Lj21
--- 76,90 ----
  	ldr	r1,.Lj5
  	ldr	r0,.Lj21
  	bl	EXEINFO_$$_OPENEXEFILE$TEXEFILE$SHORTSTRING$$BOOLEAN
! 	and	r0,r0,#255
! 	cmp	r0,#0
  	beq	.Lj3
  # [137] if ReadDebugLink(e,dbgfn) then
  	ldr	r1,.Lj24
  	ldr	r0,.Lj21
  	bl	EXEINFO_$$_READDEBUGLINK$TEXEFILE$SHORTSTRING$$BOOLEAN
! 	and	r0,r0,#255
! 	cmp	r0,#0
  	beq	.Lj27
  # [139] CloseExeFile(e);
  	ldr	r0,.Lj21
***************
*** 91,97 ****
  	ldr	r1,.Lj24
  	ldr	r0,.Lj21
  	bl	EXEINFO_$$_OPENEXEFILE$TEXEFILE$SHORTSTRING$$BOOLEAN
! 	ands	r0,r0,#255
  	beq	.Lj3
  .Lj27:
  	ldr	r0,.Lj21
--- 93,100 ----
  	ldr	r1,.Lj24
  	ldr	r0,.Lj21
  	bl	EXEINFO_$$_OPENEXEFILE$TEXEFILE$SHORTSTRING$$BOOLEAN
! 	and	r0,r0,#255
! 	cmp	r0,#0
  	beq	.Lj3
  .Lj27:
  	ldr	r0,.Lj21
***************
*** 111,117 ****
  	ldr	r0,.Lj21
  	ldr	r1,.Lj39
  	bl	EXEINFO_$$_FINDEXESECTION$TEXEFILE$SHORTSTRING$LONGINT$LONGINT$$BOOLEAN
! 	ands	r0,r0,#255
  	beq	.Lj41
  # [153] FindExeSection(e,'.stabstr',stabstrofs,stabstrlen) then
  	ldr	r3,.Lj43
--- 114,121 ----
  	ldr	r0,.Lj21
  	ldr	r1,.Lj39
  	bl	EXEINFO_$$_FINDEXESECTION$TEXEFILE$SHORTSTRING$LONGINT$LONGINT$$BOOLEAN
! 	and	r0,r0,#255
! 	cmp	r0,#0
  	beq	.Lj41
  # [153] FindExeSection(e,'.stabstr',stabstrofs,stabstrlen) then
  	ldr	r3,.Lj43
***************
*** 119,125 ****
  	ldr	r0,.Lj21
  	ldr	r1,.Lj46
  	bl	EXEINFO_$$_FINDEXESECTION$TEXEFILE$SHORTSTRING$LONGINT$LONGINT$$BOOLEAN
! 	ands	r0,r0,#255
  	beq	.Lj41
  # [155] stabcnt:=stablen div sizeof(tstab);
  	ldr	r1,.Lj36
--- 123,130 ----
  	ldr	r0,.Lj21
  	ldr	r1,.Lj46
  	bl	EXEINFO_$$_FINDEXESECTION$TEXEFILE$SHORTSTRING$LONGINT$LONGINT$$BOOLEAN
! 	and	r0,r0,#255
! 	cmp	r0,#0
  	beq	.Lj41
  # [155] stabcnt:=stablen div sizeof(tstab);
  	ldr	r1,.Lj36
***************
*** 254,260 ****
  # [190] if not OpenStabs(pointer(addr)) then
  	mov	r0,r8
  	bl	LINEINFO_$$_OPENSTABS$POINTER$$BOOLEAN
! 	ands	r0,r0,#255
  	beq	.Lj60
  # [195] addr := dword(addr - e.processaddress);
  	ldr	r0,.Lj64
--- 259,266 ----
  # [190] if not OpenStabs(pointer(addr)) then
  	mov	r0,r8
  	bl	LINEINFO_$$_OPENSTABS$POINTER$$BOOLEAN
! 	and	r0,r0,#255
! 	cmp	r0,#0
  	beq	.Lj60
  # [195] addr := dword(addr - e.processaddress);
  	ldr	r0,.Lj64
***************
*** 360,366 ****
  	movne	r0,#0
  	ldr	r1,.Lj92
  	ldrb	r1,[r1]
! 	ands	r0,r1,r0
  	beq	.Lj94
  # [224] inc(stabs[i].nvalue,lastfunc.nvalue);
  	mov	r0,#12
--- 366,373 ----
  	movne	r0,#0
  	ldr	r1,.Lj92
  	ldrb	r1,[r1]
! 	and	r0,r1,r0
! 	cmp	r0,#0
  	beq	.Lj94
  # [224] inc(stabs[i].nvalue,lastfunc.nvalue);
  	mov	r0,#12
***************
*** 542,548 ****
  	cmp	r5,#0
  	moveq	r0,#1
  	movne	r0,#0
! 	orrs	r0,r7,r0
  	beq	.Lj72
  # [274] line:=linestab.ndesc;
  	ldr	r0,.Lj137
--- 549,556 ----
  	cmp	r5,#0
  	moveq	r0,#1
  	movne	r0,#0
! 	orr	r0,r7,r0
! 	cmp	r0,#0
  	beq	.Lj72
  # [274] line:=linestab.ndesc;
  	ldr	r0,.Lj137
lineinfo.arm-linux.diff (3,323 bytes)   
system.aarch64-linux.diff (17,191 bytes)   
*** rtl/units.1/aarch64-linux/system.s	2020-08-16 17:10:37.660557972 +0200
--- rtl/units.2/aarch64-linux/system.s	2020-08-16 17:14:31.330071177 +0200
***************
*** 9092,9098 ****
  .Lj1226:
  // [360] x := x*2;
  	fmov	d1,# 2.0000000000000000E+000
! 	fmul	d0,d0,d1
  // [361] Dec(exponent);
  	ldr	w2,[x1]
  	sub	w2,w2,#1
--- 9092,9099 ----
  .Lj1226:
  // [360] x := x*2;
  	fmov	d1,# 2.0000000000000000E+000
! 	fmul	d1,d0,d1
! 	fmov	d0,d1
  // [361] Dec(exponent);
  	ldr	w2,[x1]
  	sub	w2,w2,#1
***************
*** 9110,9116 ****
  .Lj1230:
  // [366] x := x/2;
  	fmov	d1,# 5.0000000000000000E-001
! 	fmul	d0,d0,d1
  // [367] Inc(exponent);
  	ldr	w2,[x1]
  	add	w2,w2,#1
--- 9111,9118 ----
  .Lj1230:
  // [366] x := x/2;
  	fmov	d1,# 5.0000000000000000E-001
! 	fmul	d1,d0,d1
! 	fmov	d0,d1
  // [367] Inc(exponent);
  	ldr	w2,[x1]
  	add	w2,w2,#1
***************
*** 9204,9210 ****
  	adrp	x0,:got:TC_$SYSTEM_$$_HUGE
  	ldr	x0,[x0, :got_lo12:TC_$SYSTEM_$$_HUGE]
  	ldur	d2,[x0]
! 	fmul	d1,d0,d2
  	b	.Lj1239
  .Lj1246:
  // [396] ldexp := (-2.0)*huge;
--- 9206,9213 ----
  	adrp	x0,:got:TC_$SYSTEM_$$_HUGE
  	ldr	x0,[x0, :got_lo12:TC_$SYSTEM_$$_HUGE]
  	ldur	d2,[x0]
! 	fmul	d0,d0,d2
! 	fmov	d1,d0
  	b	.Lj1239
  .Lj1246:
  // [396] ldexp := (-2.0)*huge;
***************
*** 9212,9218 ****
  	adrp	x0,:got:TC_$SYSTEM_$$_HUGE
  	ldr	x0,[x0, :got_lo12:TC_$SYSTEM_$$_HUGE]
  	ldur	d2,[x0]
! 	fmul	d1,d0,d2
  .Lj1247:
  	b	.Lj1239
  .Lj1244:
--- 9215,9222 ----
  	adrp	x0,:got:TC_$SYSTEM_$$_HUGE
  	ldr	x0,[x0, :got_lo12:TC_$SYSTEM_$$_HUGE]
  	ldur	d2,[x0]
! 	fmul	d0,d0,d2
! 	fmov	d1,d0
  .Lj1247:
  	b	.Lj1239
  .Lj1244:
***************
*** 9244,9250 ****
  	adrp	x0,:got:TC_$SYSTEM_$$_H2_54
  	ldr	x0,[x0, :got_lo12:TC_$SYSTEM_$$_H2_54]
  	ldur	d2,[x0]
! 	fdiv	d1,d0,d2
  .Lj1253:
  	b	.Lj1239
  .Lj1250:
--- 9248,9255 ----
  	adrp	x0,:got:TC_$SYSTEM_$$_H2_54
  	ldr	x0,[x0, :got_lo12:TC_$SYSTEM_$$_H2_54]
  	ldur	d2,[x0]
! 	fdiv	d0,d0,d2
! 	fmov	d1,d0
  .Lj1253:
  	b	.Lj1239
  .Lj1250:
***************
*** 9382,9388 ****
  .Lj1273:
  // [514] floord := t - 1.0;
  	fmov	d2,# 1.0000000000000000E+000
! 	fsub	d1,d0,d2
  .Lj1274:
  // [515] end;
  	fmov	d0,d1
--- 9387,9394 ----
  .Lj1273:
  // [514] floord := t - 1.0;
  	fmov	d2,# 1.0000000000000000E+000
! 	fsub	d0,d0,d2
! 	fmov	d1,d0
  .Lj1274:
  // [515] end;
  	fmov	d0,d1
***************
*** 9593,9599 ****
  	sxtw	x0,w22
  	add	x0,sp,x0,lsl #3
  	ldr	d0,[x0, #392]
! 	fadd	d9,d0,d8
  // [699] inc(i);
  	ldr	x0,[sp, #568]
  	add	w0,w0,#1
--- 9599,9606 ----
  	sxtw	x0,w22
  	add	x0,sp,x0,lsl #3
  	ldr	d0,[x0, #392]
! 	fadd	d0,d0,d8
! 	fmov	d9,d0
  // [699] inc(i);
  	ldr	x0,[sp, #568]
  	add	w0,w0,#1
***************
*** 9623,9629 ****
  	str	x0,[sp, #584]
  	ldr	x0,[sp, #584]
  	scvtf	d0,w0
! 	fsub	d9,d9,d0
  // [707] ih := 0;
  	movz	w0,#0
  	str	x0,[sp, #600]
--- 9630,9637 ----
  	str	x0,[sp, #584]
  	ldr	x0,[sp, #584]
  	scvtf	d0,w0
! 	fsub	d0,d9,d0
! 	fmov	d9,d0
  // [707] ih := 0;
  	movz	w0,#0
  	str	x0,[sp, #600]
***************
*** 9774,9787 ****
  	b.ne	.Lj1314
  // [750] z := 1.0 - z;
  	fmov	d0,# 1.0000000000000000E+000
! 	fsub	d9,d0,d9
  // [751] if carry<>0 then
  	cbz	w21,.Lj1314
  // [752] z := z - ldexp(1.0,q0);
  	sxth	w0,w27
  	fmov	d0,# 1.0000000000000000E+000
  	bl	SYSTEM_$$_LDEXP$REAL$SMALLINT$$REAL
! 	fsub	d9,d9,d0
  .Lj1333:
  .Lj1331:
  .Lj1314:
--- 9782,9797 ----
  	b.ne	.Lj1314
  // [750] z := 1.0 - z;
  	fmov	d0,# 1.0000000000000000E+000
! 	fsub	d0,d0,d9
! 	fmov	d9,d0
  // [751] if carry<>0 then
  	cbz	w21,.Lj1314
  // [752] z := z - ldexp(1.0,q0);
  	sxth	w0,w27
  	fmov	d0,# 1.0000000000000000E+000
  	bl	SYSTEM_$$_LDEXP$REAL$SMALLINT$$REAL
! 	fsub	d0,d9,d0
! 	fmov	d9,d0
  .Lj1333:
  .Lj1331:
  .Lj1314:
***************
*** 9984,9990 ****
  	adrp	x0,:got:TC_$SYSTEM_$$_TWON24
  	ldr	x0,[x0, :got_lo12:TC_$SYSTEM_$$_TWON24]
  	ldur	d1,[x0]
! 	fmul	d0,d0,d1
  	ldr	x0,[sp, #568]
  	cmp	w0,#0
  	b.gt	.Lj1367
--- 9994,10001 ----
  	adrp	x0,:got:TC_$SYSTEM_$$_TWON24
  	ldr	x0,[x0, :got_lo12:TC_$SYSTEM_$$_TWON24]
  	ldur	d1,[x0]
! 	fmul	d1,d0,d1
! 	fmov	d0,d1
  	ldr	x0,[sp, #568]
  	cmp	w0,#0
  	b.gt	.Lj1367
***************
*** 10076,10082 ****
  	mov	w1,w0
  	add	x0,sp,w1,uxtw #3
  	ldur	d1,[x0, #240]
! 	fadd	d0,d0,d1
  	ldr	x0,[sp, #568]
  	cmp	w0,#0
  	b.gt	.Lj1387
--- 10087,10094 ----
  	mov	w1,w0
  	add	x0,sp,w1,uxtw #3
  	ldur	d1,[x0, #240]
! 	fadd	d1,d0,d1
! 	fmov	d0,d1
  	ldr	x0,[sp, #568]
  	cmp	w0,#0
  	b.gt	.Lj1387
***************
*** 10113,10119 ****
  	mov	w0,w1
  	add	x0,sp,w0,uxtw #3
  	ldur	d1,[x0, #240]
! 	fadd	d0,d0,d1
  	ldr	x0,[sp, #568]
  	cmp	w0,#0
  	b.gt	.Lj1395
--- 10125,10132 ----
  	mov	w0,w1
  	add	x0,sp,w0,uxtw #3
  	ldur	d1,[x0, #240]
! 	fadd	d1,d0,d1
! 	fmov	d0,d1
  	ldr	x0,[sp, #568]
  	cmp	w0,#0
  	b.gt	.Lj1395
***************
*** 10133,10139 ****
  .Lj1400:
  // [848] fw := fq[0]-fw;
  	ldr	d1,[sp, #240]
! 	fsub	d0,d1,d0
  // [849] for i:=1 to jz do
  	mov	w0,w28
  	cmp	w0,#1
--- 10146,10153 ----
  .Lj1400:
  // [848] fw := fq[0]-fw;
  	ldr	d1,[sp, #240]
! 	fsub	d1,d1,d0
! 	fmov	d0,d1
  // [849] for i:=1 to jz do
  	mov	w0,w28
  	cmp	w0,#1
***************
*** 10149,10155 ****
  	mov	w2,w1
  	add	x1,sp,w2,uxtw #3
  	ldur	d1,[x1, #240]
! 	fadd	d0,d0,d1
  	ldr	x1,[sp, #568]
  	cmp	w0,w1
  	b.gt	.Lj1403
--- 10163,10170 ----
  	mov	w2,w1
  	add	x1,sp,w2,uxtw #3
  	ldur	d1,[x1, #240]
! 	fadd	d1,d0,d1
! 	fmov	d0,d1
  	ldr	x1,[sp, #568]
  	cmp	w0,w1
  	b.gt	.Lj1403
***************
*** 10187,10193 ****
  	ldur	d2,[x0, #232]
  	add	x0,sp,w1,uxtw #3
  	ldur	d1,[x0, #240]
! 	fadd	d0,d2,d1
  // [863] fq[i]  := fq[i]+(fq[i-1]-fw);
  	ldr	x1,[sp, #568]
  	mov	w0,w1
--- 10202,10209 ----
  	ldur	d2,[x0, #232]
  	add	x0,sp,w1,uxtw #3
  	ldur	d1,[x0, #240]
! 	fadd	d1,d2,d1
! 	fmov	d0,d1
  // [863] fq[i]  := fq[i]+(fq[i-1]-fw);
  	ldr	x1,[sp, #568]
  	mov	w0,w1
***************
*** 10230,10236 ****
  	ldur	d1,[x0, #232]
  	add	x0,sp,w1,uxtw #3
  	ldur	d2,[x0, #240]
! 	fadd	d0,d1,d2
  // [869] fq[i]  := fq[i]+(fq[i-1]-fw);
  	ldr	x1,[sp, #568]
  	mov	w0,w1
--- 10246,10253 ----
  	ldur	d1,[x0, #232]
  	add	x0,sp,w1,uxtw #3
  	ldur	d2,[x0, #240]
! 	fadd	d1,d1,d2
! 	fmov	d0,d1
  // [869] fq[i]  := fq[i]+(fq[i-1]-fw);
  	ldr	x1,[sp, #568]
  	mov	w0,w1
***************
*** 10272,10278 ****
  	mov	w1,w0
  	add	x0,sp,w1,uxtw #3
  	ldur	d1,[x0, #240]
! 	fadd	d0,d0,d1
  	ldr	x0,[sp, #568]
  	cmp	w0,#2
  	b.gt	.Lj1421
--- 10289,10296 ----
  	mov	w1,w0
  	add	x0,sp,w1,uxtw #3
  	ldur	d1,[x0, #240]
! 	fadd	d1,d0,d1
! 	fmov	d0,d1
  	ldr	x0,[sp, #568]
  	cmp	w0,#2
  	b.gt	.Lj1421
***************
*** 10396,10402 ****
  	add	w0,w0,#1
  // [920] y := y + 1.0;
  	fmov	d0,# 1.0000000000000000E+000
! 	fadd	d9,d9,d0
  .Lj1435:
  // [922] z := ((x - y * DP1) - y * DP2) - y * DP3;
  	adrp	x1,:got:_$SYSTEM$_Ld7
--- 10414,10421 ----
  	add	w0,w0,#1
  // [920] y := y + 1.0;
  	fmov	d0,# 1.0000000000000000E+000
! 	fadd	d0,d9,d0
! 	fmov	d9,d0
  .Lj1435:
  // [922] z := ((x - y * DP1) - y * DP2) - y * DP3;
  	adrp	x1,:got:_$SYSTEM$_Ld7
***************
*** 10622,10628 ****
  	adrp	x2,:got:TC_$SYSTEM_$$_HUGE
  	ldr	x2,[x2, :got_lo12:TC_$SYSTEM_$$_HUGE]
  	ldur	d4,[x2]
! 	fmul	d1,d4,d4
  // [1157] exit;
  	b	.Lj1446
  .Lj1459:
--- 10641,10648 ----
  	adrp	x2,:got:TC_$SYSTEM_$$_HUGE
  	ldr	x2,[x2, :got_lo12:TC_$SYSTEM_$$_HUGE]
  	ldur	d4,[x2]
! 	fmul	d4,d4,d4
! 	fmov	d1,d4
  // [1157] exit;
  	b	.Lj1446
  .Lj1459:
***************
*** 10636,10642 ****
  	adrp	x2,:got:TC_$SYSTEM$_$fpc_exp_real$DOUBLE$$DOUBLE_$$_TWOM1000
  	ldr	x2,[x2, :got_lo12:TC_$SYSTEM$_$fpc_exp_real$DOUBLE$$DOUBLE_$$_TWOM1000]
  	ldur	d4,[x2]
! 	fmul	d1,d4,d4
  // [1161] exit;
  	b	.Lj1446
  .Lj1461:
--- 10656,10663 ----
  	adrp	x2,:got:TC_$SYSTEM$_$fpc_exp_real$DOUBLE$$DOUBLE_$$_TWOM1000
  	ldr	x2,[x2, :got_lo12:TC_$SYSTEM$_$fpc_exp_real$DOUBLE$$DOUBLE_$$_TWOM1000]
  	ldur	d4,[x2]
! 	fmul	d4,d4,d4
! 	fmov	d1,d4
  // [1161] exit;
  	b	.Lj1446
  .Lj1461:
***************
*** 10656,10662 ****
  	adrp	x3,:got:TC_$SYSTEM$_$fpc_exp_real$DOUBLE$$DOUBLE_$$_LN2HI
  	ldr	x3,[x3, :got_lo12:TC_$SYSTEM$_$fpc_exp_real$DOUBLE$$DOUBLE_$$_LN2HI]
  	ldr	d4,[x3, w2, uxtw #3]
! 	fsub	d2,d0,d4
  // [1170] lo:=ln2LO[xsb];
  	mov	w2,w1
  	adrp	x3,:got:TC_$SYSTEM$_$fpc_exp_real$DOUBLE$$DOUBLE_$$_LN2LO
--- 10677,10684 ----
  	adrp	x3,:got:TC_$SYSTEM$_$fpc_exp_real$DOUBLE$$DOUBLE_$$_LN2HI
  	ldr	x3,[x3, :got_lo12:TC_$SYSTEM$_$fpc_exp_real$DOUBLE$$DOUBLE_$$_LN2HI]
  	ldr	d4,[x3, w2, uxtw #3]
! 	fsub	d4,d0,d4
! 	fmov	d2,d4
  // [1170] lo:=ln2LO[xsb];
  	mov	w2,w1
  	adrp	x3,:got:TC_$SYSTEM$_$fpc_exp_real$DOUBLE$$DOUBLE_$$_LN2LO
***************
*** 10691,10700 ****
  	adrp	x1,:got:TC_$SYSTEM$_$fpc_exp_real$DOUBLE$$DOUBLE_$$_LN2LO
  	ldr	x1,[x1, :got_lo12:TC_$SYSTEM$_$fpc_exp_real$DOUBLE$$DOUBLE_$$_LN2LO]
  	ldur	d5,[x1]
! 	fmul	d3,d4,d5
  .Lj1466:
  // [1180] d  := hi - lo;
! 	fsub	d0,d2,d3
  	b	.Lj1467
  .Lj1463:
  // [1182] else if hx < $3e300000 then
--- 10713,10724 ----
  	adrp	x1,:got:TC_$SYSTEM$_$fpc_exp_real$DOUBLE$$DOUBLE_$$_LN2LO
  	ldr	x1,[x1, :got_lo12:TC_$SYSTEM$_$fpc_exp_real$DOUBLE$$DOUBLE_$$_LN2LO]
  	ldur	d5,[x1]
! 	fmul	d4,d4,d5
! 	fmov	d3,d4
  .Lj1466:
  // [1180] d  := hi - lo;
! 	fsub	d4,d2,d3
! 	fmov	d0,d4
  	b	.Lj1467
  .Lj1463:
  // [1182] else if hx < $3e300000 then
***************
*** 10715,10721 ****
  	adrp	x0,:got:TC_$SYSTEM_$$_ONE
  	ldr	x0,[x0, :got_lo12:TC_$SYSTEM_$$_ONE]
  	ldur	d4,[x0]
! 	fadd	d1,d4,d0
  // [1187] exit;
  	b	.Lj1446
  .Lj1471:
--- 10739,10746 ----
  	adrp	x0,:got:TC_$SYSTEM_$$_ONE
  	ldr	x0,[x0, :got_lo12:TC_$SYSTEM_$$_ONE]
  	ldur	d4,[x0]
! 	fadd	d4,d4,d0
! 	fmov	d1,d4
  // [1187] exit;
  	b	.Lj1446
  .Lj1471:
***************
*** 10761,10767 ****
  	adrp	x0,:got:TC_$SYSTEM_$$_ONE
  	ldr	x0,[x0, :got_lo12:TC_$SYSTEM_$$_ONE]
  	ldur	d5,[x0]
! 	fsub	d1,d5,d6
  // [1199] exit;
  	b	.Lj1446
  .Lj1474:
--- 10786,10793 ----
  	adrp	x0,:got:TC_$SYSTEM_$$_ONE
  	ldr	x0,[x0, :got_lo12:TC_$SYSTEM_$$_ONE]
  	ldur	d5,[x0]
! 	fsub	d5,d5,d6
! 	fmov	d1,d5
  // [1199] exit;
  	b	.Lj1446
  .Lj1474:
***************
*** 10809,10815 ****
  	adrp	x0,:got:TC_$SYSTEM$_$fpc_exp_real$DOUBLE$$DOUBLE_$$_TWOM1000
  	ldr	x0,[x0, :got_lo12:TC_$SYSTEM$_$fpc_exp_real$DOUBLE$$DOUBLE_$$_TWOM1000]
  	ldur	d2,[x0]
! 	fmul	d1,d0,d2
  .Lj1478:
  .Lj1446:
  // [1216] end;
--- 10835,10842 ----
  	adrp	x0,:got:TC_$SYSTEM$_$fpc_exp_real$DOUBLE$$DOUBLE_$$_TWOM1000
  	ldr	x0,[x0, :got_lo12:TC_$SYSTEM$_$fpc_exp_real$DOUBLE$$DOUBLE_$$_TWOM1000]
  	ldur	d2,[x0]
! 	fmul	d0,d0,d2
! 	fmov	d1,d0
  .Lj1478:
  .Lj1446:
  // [1216] end;
***************
*** 10897,10903 ****
  // [1453] exit(d+d);
  	ldr	d1,[sp]
  	ldr	d2,[sp]
! 	fadd	d0,d1,d2
  	b	.Lj1479
  .Lj1488:
  // [1454] inc(k, (hx shr 20)-1023);
--- 10924,10931 ----
  // [1453] exit(d+d);
  	ldr	d1,[sp]
  	ldr	d2,[sp]
! 	fadd	d1,d1,d2
! 	fmov	d0,d1
  	b	.Lj1479
  .Lj1488:
  // [1454] inc(k, (hx shr 20)-1023);
***************
*** 10955,10961 ****
  	ldr	x1,[x1, :got_lo12:TC_$SYSTEM$_$fpc_ln_real$DOUBLE$$DOUBLE_$$_LN2_LO]
  	ldur	d4,[x1]
  	fmul	d4,d2,d4
! 	fadd	d0,d3,d4
  	b	.Lj1479
  .Lj1495:
  .Lj1492:
--- 10983,10990 ----
  	ldr	x1,[x1, :got_lo12:TC_$SYSTEM$_$fpc_ln_real$DOUBLE$$DOUBLE_$$_LN2_LO]
  	ldur	d4,[x1]
  	fmul	d4,d2,d4
! 	fadd	d3,d3,d4
! 	fmov	d0,d3
  	b	.Lj1479
  .Lj1495:
  .Lj1492:
***************
*** 10970,10976 ****
  // [1473] if (k=0) then
  	cbnz	w3,.Lj1497
  // [1474] exit(f-R)
! 	fsub	d0,d1,d3
  	b	.Lj1479
  .Lj1497:
  // [1477] dk := k;
--- 10999,11007 ----
  // [1473] if (k=0) then
  	cbnz	w3,.Lj1497
  // [1474] exit(f-R)
! 	fsub	d4,d1,d3
! // [1475] else
! 	fmov	d0,d4
  	b	.Lj1479
  .Lj1497:
  // [1477] dk := k;
***************
*** 10986,10992 ****
  	ldr	x1,[x1, :got_lo12:TC_$SYSTEM$_$fpc_ln_real$DOUBLE$$DOUBLE_$$_LN2_HI]
  	ldur	d4,[x1]
  	fmul	d2,d2,d4
! 	fsub	d0,d2,d3
  	b	.Lj1479
  .Lj1498:
  .Lj1490:
--- 11017,11024 ----
  	ldr	x1,[x1, :got_lo12:TC_$SYSTEM$_$fpc_ln_real$DOUBLE$$DOUBLE_$$_LN2_HI]
  	ldur	d4,[x1]
  	fmul	d2,d2,d4
! 	fsub	d2,d2,d3
! 	fmov	d0,d2
  	b	.Lj1479
  .Lj1498:
  .Lj1490:
***************
*** 11063,11069 ****
  // [1495] result := f-(hfsq-s*(hfsq+R))
  	fadd	d6,d5,d4
  	fmsub	d6,d2,d6,d5
! 	fsub	d0,d1,d6
  	b	.Lj1479
  .Lj1502:
  // [1497] result := dk*ln2_hi-((hfsq-(s*(hfsq+R)+dk*ln2_lo))-f);
--- 11095,11102 ----
  // [1495] result := f-(hfsq-s*(hfsq+R))
  	fadd	d6,d5,d4
  	fmsub	d6,d2,d6,d5
! 	fsub	d6,d1,d6
! 	fmov	d0,d6
  	b	.Lj1479
  .Lj1502:
  // [1497] result := dk*ln2_hi-((hfsq-(s*(hfsq+R)+dk*ln2_lo))-f);
***************
*** 11080,11086 ****
  	ldr	x0,[x0, :got_lo12:TC_$SYSTEM$_$fpc_ln_real$DOUBLE$$DOUBLE_$$_LN2_HI]
  	ldur	d6,[x0]
  	fmul	d6,d3,d6
! 	fsub	d0,d6,d5
  .Lj1503:
  	b	.Lj1479
  .Lj1500:
--- 11113,11120 ----
  	ldr	x0,[x0, :got_lo12:TC_$SYSTEM$_$fpc_ln_real$DOUBLE$$DOUBLE_$$_LN2_HI]
  	ldur	d6,[x0]
  	fmul	d6,d3,d6
! 	fsub	d5,d6,d5
! 	fmov	d0,d5
  .Lj1503:
  	b	.Lj1479
  .Lj1500:
***************
*** 11105,11111 ****
  	ldr	x0,[x0, :got_lo12:TC_$SYSTEM$_$fpc_ln_real$DOUBLE$$DOUBLE_$$_LN2_HI]
  	ldur	d2,[x0]
  	fmul	d2,d3,d2
! 	fsub	d0,d2,d1
  .Lj1507:
  .Lj1504:
  .Lj1479:
--- 11139,11146 ----
  	ldr	x0,[x0, :got_lo12:TC_$SYSTEM$_$fpc_ln_real$DOUBLE$$DOUBLE_$$_LN2_HI]
  	ldur	d2,[x0]
  	fmul	d2,d3,d2
! 	fsub	d1,d2,d1
! 	fmov	d0,d1
  .Lj1507:
  .Lj1504:
  .Lj1479:
***************
*** 11191,11197 ****
  	cmp	x19,#1
  	b.le	.Lj1516
  // [1552] result := -y
! 	fneg	d8,d10
  	b	.Lj1508
  .Lj1516:
  // [1554] result := y;
--- 11226,11233 ----
  	cmp	x19,#1
  	b.le	.Lj1516
  // [1552] result := -y
! 	fneg	d0,d10
! 	fmov	d8,d0
  	b	.Lj1508
  .Lj1516:
  // [1554] result := y;
***************
*** 11268,11274 ****
  	fmov	d0,d8
  	bl	SYSTEM_$$_POLEVL$REAL$PREAL$INT64$$REAL
  	fmul	d0,d11,d0
! 	fadd	d9,d10,d0
  .Lj1522:
  // [1599] if (j = 1) or (j = 2) then
  	cmp	x19,#1
--- 11304,11311 ----
  	fmov	d0,d8
  	bl	SYSTEM_$$_POLEVL$REAL$PREAL$INT64$$REAL
  	fmul	d0,d11,d0
! 	fadd	d0,d10,d0
! 	fmov	d9,d0
  .Lj1522:
  // [1599] if (j = 1) or (j = 2) then
  	cmp	x19,#1
***************
*** 11350,11356 ****
  	adrp	x2,:got:TC_$SYSTEM$_$fpc_arctan_real$DOUBLE$$DOUBLE_$$_ATANLO
  	ldr	x2,[x2, :got_lo12:TC_$SYSTEM$_$fpc_arctan_real$DOUBLE$$DOUBLE_$$_ATANLO]
  	ldur	d2,[x2, #24]
! 	fadd	d1,d3,d2
  	b	.Lj1526
  .Lj1534:
  // [1696] exit(-atanhi[3]-atanlo[3]);
--- 11387,11395 ----
  	adrp	x2,:got:TC_$SYSTEM$_$fpc_arctan_real$DOUBLE$$DOUBLE_$$_ATANLO
  	ldr	x2,[x2, :got_lo12:TC_$SYSTEM$_$fpc_arctan_real$DOUBLE$$DOUBLE_$$_ATANLO]
  	ldur	d2,[x2, #24]
! 	fadd	d2,d3,d2
! // [1695] else
! 	fmov	d1,d2
  	b	.Lj1526
  .Lj1534:
  // [1696] exit(-atanhi[3]-atanlo[3]);
***************
*** 11361,11367 ****
  	adrp	x2,:got:TC_$SYSTEM$_$fpc_arctan_real$DOUBLE$$DOUBLE_$$_ATANLO
  	ldr	x2,[x2, :got_lo12:TC_$SYSTEM$_$fpc_arctan_real$DOUBLE$$DOUBLE_$$_ATANLO]
  	ldur	d3,[x2, #24]
! 	fsub	d1,d2,d3
  	b	.Lj1526
  .Lj1535:
  .Lj1529:
--- 11400,11407 ----
  	adrp	x2,:got:TC_$SYSTEM$_$fpc_arctan_real$DOUBLE$$DOUBLE_$$_ATANLO
  	ldr	x2,[x2, :got_lo12:TC_$SYSTEM$_$fpc_arctan_real$DOUBLE$$DOUBLE_$$_ATANLO]
  	ldur	d3,[x2, #24]
! 	fsub	d2,d2,d3
! 	fmov	d1,d2
  	b	.Lj1526
  .Lj1535:
  .Lj1529:
***************
*** 11392,11398 ****
  	b	.Lj1542
  .Lj1537:
  // [1708] d := abs(d);
! 	fabs	d0,d0
  // [1709] if (ix < $3ff30000) then    { |x| < 1.1875 }
  	movz	w3,#16371,lsl #16
  	cmp	w1,w3
--- 11432,11439 ----
  	b	.Lj1542
  .Lj1537:
  // [1708] d := abs(d);
! 	fabs	d2,d0
! 	fmov	d0,d2
  // [1709] if (ix < $3ff30000) then    { |x| < 1.1875 }
  	movz	w3,#16371,lsl #16
  	cmp	w1,w3
***************
*** 11410,11416 ****
  	fnmsub	d2,d3,d0,d2
  	fmov	d3,# 2.0000000000000000E+000
  	fadd	d3,d3,d0
! 	fdiv	d0,d2,d3
  	b	.Lj1542
  .Lj1546:
  // [1717] id := 1; d := (d-one)/(d+one);
--- 11451,11458 ----
  	fnmsub	d2,d3,d0,d2
  	fmov	d3,# 2.0000000000000000E+000
  	fadd	d3,d3,d0
! 	fdiv	d2,d2,d3
! 	fmov	d0,d2
  	b	.Lj1542
  .Lj1546:
  // [1717] id := 1; d := (d-one)/(d+one);
***************
*** 11420,11426 ****
  	ldur	d3,[x3]
  	fsub	d2,d0,d3
  	fadd	d3,d0,d3
! 	fdiv	d0,d2,d3
  .Lj1547:
  	b	.Lj1542
  .Lj1544:
--- 11462,11469 ----
  	ldur	d3,[x3]
  	fsub	d2,d0,d3
  	fadd	d3,d0,d3
! 	fdiv	d2,d2,d3
! 	fmov	d0,d2
  .Lj1547:
  	b	.Lj1542
  .Lj1544:
***************
*** 11438,11450 ****
  	fmadd	d2,d3,d0,d2
  	fmov	d3,# 1.5000000000000000E+000
  	fsub	d3,d0,d3
! 	fdiv	d0,d3,d2
  	b	.Lj1542
  .Lj1550:
  // [1728] id := 3; d := -1.0/d;
  	movz	w2,#3
  	fmov	d2,#-1.0000000000000000E+000
! 	fdiv	d0,d2,d0
  .Lj1551:
  .Lj1548:
  .Lj1542:
--- 11481,11495 ----
  	fmadd	d2,d3,d0,d2
  	fmov	d3,# 1.5000000000000000E+000
  	fsub	d3,d0,d3
! 	fdiv	d2,d3,d2
! 	fmov	d0,d2
  	b	.Lj1542
  .Lj1550:
  // [1728] id := 3; d := -1.0/d;
  	movz	w2,#3
  	fmov	d2,#-1.0000000000000000E+000
! 	fdiv	d2,d2,d0
! 	fmov	d0,d2
  .Lj1551:
  .Lj1548:
  .Lj1542:
***************
*** 11522,11533 ****
  	fnmsub	d3,d0,d4,d3
  	fsub	d3,d3,d0
  	ldr	d0,[x3, w1, uxtw #3]
! 	fsub	d2,d0,d3
  // [1743] if hx<0 then
  	cmp	w0,#0
  	b.ge	.Lj1556
  // [1744] result := -z
! 	fneg	d1,d2
  	b	.Lj1526
  .Lj1556:
  // [1746] result := z;
--- 11567,11580 ----
  	fnmsub	d3,d0,d4,d3
  	fsub	d3,d3,d0
  	ldr	d0,[x3, w1, uxtw #3]
! 	fsub	d0,d0,d3
! 	fmov	d2,d0
  // [1743] if hx<0 then
  	cmp	w0,#0
  	b.ge	.Lj1556
  // [1744] result := -z
! 	fneg	d0,d2
! 	fmov	d1,d0
  	b	.Lj1526
  .Lj1556:
  // [1746] result := z;
system.aarch64-linux.diff (17,191 bytes)   

J. Gareth Moreton

2020-08-16 21:58

developer   ~0124933

Hmmm, I'm starting to get lax! The System unit looks like a far better unit to monitor for changes, since Lazarus lacks a lot of floating-point stuff. I'll start making corrections.

Florian

2020-08-16 23:03

administrator   ~0124934

Normally, I use the rtl as a reference.

J. Gareth Moreton

2020-08-17 00:59

developer   ~0124938

Possibly a better option, and faster too, when it comes to looking for new optimisations. Anyhow, I'm getting to work on fixing it up now.

J. Gareth Moreton

2020-08-18 16:41

developer   ~0124966

Last edited: 2020-08-19 08:33

View 5 revisions

Fixed it, hopefully. aarch64 was easy - I called the wrong method in the new OptPass1FData routine. arm-32 was a bit harder because I got careless with dangling pointers in a couple of places, but all should be well now.

The second patch I'm supplying, arm-gniur-upgrade.patch, is not technically a refactor, since it changes the behaviour of the GetNextInstructionUsingReg method for ARM: the method now returns False if it exits before finding an instruction that uses the given register. This saves time by allowing optimisation routines to drop out earlier. The change is failsafe: if callers don't check the result of GetNextInstructionUsingReg, the output instruction will still be valid. If they do crash because GetNextInstruction returned a null instruction (result is False), then they would have crashed using the current version of GetNextInstructionUsingReg too.
arm-aarch64-refactor.patch (137,509 bytes)   
Index: compiler/aarch64/aoptcpu.pas
===================================================================
--- compiler/aarch64/aoptcpu.pas	(revision 46481)
+++ compiler/aarch64/aoptcpu.pas	(working copy)
@@ -42,12 +42,13 @@
         function PostPeepHoleOptsCpu(var p: tai): boolean; override;
         function RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;override;
         function InstructionLoadsFromReg(const reg: TRegister; const hp: tai): boolean;override;
-        function LookForPostindexedPattern(p : taicpu) : boolean;
+        function LookForPostindexedPattern(var p : tai) : boolean;
       private
+        function RemoveSuperfluousFMov(const p: tai; movp: tai; const optimizer: string): boolean;
         function OptPass1Shift(var p: tai): boolean;
         function OptPostCMP(var p: tai): boolean;
         function OptPass1Data(var p: tai): boolean;
-        function RemoveSuperfluousFMov(const p: tai; movp: tai; const optimizer: string): boolean;
+        function OptPass1FData(var p: tai): Boolean;
         function OptPass1STP(var p: tai): boolean;
         function OptPass1Mov(var p: tai): boolean;
         function OptPass1FMov(var p: tai): Boolean;
@@ -169,20 +170,20 @@
 
       ldr/str regX,[reg1], regY/const
   }
-  function TCpuAsmOptimizer.LookForPostindexedPattern(p: taicpu) : boolean;
+  function TCpuAsmOptimizer.LookForPostindexedPattern(var p: tai) : boolean;
     var
       hp1 : tai;
     begin
       Result:=false;
-      if (p.oper[1]^.typ = top_ref) and
-        (p.oper[1]^.ref^.addressmode=AM_OFFSET) and
-        (p.oper[1]^.ref^.index=NR_NO) and
-        (p.oper[1]^.ref^.offset=0) and
-        GetNextInstructionUsingReg(p, hp1, p.oper[1]^.ref^.base) and
+      if (taicpu(p).oper[1]^.typ = top_ref) and
+        (taicpu(p).oper[1]^.ref^.addressmode=AM_OFFSET) and
+        (taicpu(p).oper[1]^.ref^.index=NR_NO) and
+        (taicpu(p).oper[1]^.ref^.offset=0) and
+        GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[1]^.ref^.base) and
         { we cannot check NR_DEFAULTFLAGS for modification yet so don't allow a condition }
         MatchInstruction(hp1, [A_ADD, A_SUB], [PF_None]) and
-        (taicpu(hp1).oper[0]^.reg=p.oper[1]^.ref^.base) and
-        (taicpu(hp1).oper[1]^.reg=p.oper[1]^.ref^.base) and
+        (taicpu(hp1).oper[0]^.reg=taicpu(p).oper[1]^.ref^.base) and
+        (taicpu(hp1).oper[1]^.reg=taicpu(p).oper[1]^.ref^.base) and
         (
          { valid offset? }
          (taicpu(hp1).oper[2]^.typ=top_const) and
@@ -190,16 +191,20 @@
          (abs(taicpu(hp1).oper[2]^.val)<256)
         ) and
         { don't apply the optimization if the base register is loaded }
-        (getsupreg(p.oper[0]^.reg)<>getsupreg(p.oper[1]^.ref^.base)) and
+        (getsupreg(taicpu(p).oper[0]^.reg)<>getsupreg(taicpu(p).oper[1]^.ref^.base)) and
         not(RegModifiedBetween(taicpu(hp1).oper[0]^.reg,p,hp1)) and
         not(RegModifiedBetween(taicpu(hp1).oper[2]^.reg,p,hp1)) then
         begin
-          DebugMsg('Peephole Str/LdrAdd/Sub2Str/Ldr Postindex done', p);
-          p.oper[1]^.ref^.addressmode:=AM_POSTINDEXED;
+          if taicpu(p).opcode = A_LDR then
+            DebugMsg('Peephole LdrAdd/Sub2Ldr Postindex done', p)
+          else
+            DebugMsg('Peephole StrAdd/Sub2Str Postindex done', p);
+
+          taicpu(p).oper[1]^.ref^.addressmode:=AM_POSTINDEXED;
           if taicpu(hp1).opcode=A_ADD then
-            p.oper[1]^.ref^.offset:=taicpu(hp1).oper[2]^.val
+            taicpu(p).oper[1]^.ref^.offset:=taicpu(hp1).oper[2]^.val
           else
-            p.oper[1]^.ref^.offset:=-taicpu(hp1).oper[2]^.val;
+            taicpu(p).oper[1]^.ref^.offset:=-taicpu(hp1).oper[2]^.val;
           asml.Remove(hp1);
           hp1.Free;
           Result:=true;
@@ -395,13 +400,20 @@
     var
       hp1: tai;
     begin
-      result:=false;
-      if GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
-        RemoveSuperfluousMove(p, hp1, 'DataMov2Data') then
-        Result:=true;
+      Result := GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
+        RemoveSuperfluousMove(p, hp1, 'DataMov2Data');
     end;
 
 
+  function TCpuAsmOptimizer.OptPass1FData(var p: tai): Boolean;
+    var
+      hp1: tai;
+    begin
+      Result := GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
+        RemoveSuperfluousFMov(p, hp1, 'FOpFMov2FOp');
+    end;
+
+
   function TCpuAsmOptimizer.OptPass1STP(var p : tai): boolean;
     var
       hp1, hp2, hp3, hp4: tai;
@@ -428,21 +440,20 @@
         (taicpu(p).oper[2]^.ref^.index=NR_NO) and
         (taicpu(p).oper[2]^.ref^.offset=-16) and
         (taicpu(p).oper[2]^.ref^.addressmode=AM_PREINDEXED) and
+
         GetNextInstruction(p, hp1) and
-        GetNextInstruction(hp1, hp2) and
-        SkipEntryExitMarker(hp2, hp2) and
-        GetNextInstruction(hp2, hp3) and
-        SkipEntryExitMarker(hp3, hp3) and
-        GetNextInstruction(hp3, hp4) and
-
         MatchInstruction(hp1, A_MOV, [C_None], [PF_NONE]) and
         MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[0]^) and
         (taicpu(hp1).oper[1]^.typ = top_reg) and
         (taicpu(hp1).oper[1]^.reg = NR_STACK_POINTER_REG) and
 
+        GetNextInstruction(hp1, hp2) and
+        SkipEntryExitMarker(hp2, hp2) and
         MatchInstruction(hp2, A_BL, [C_None], [PF_NONE]) and
         (taicpu(hp2).oper[0]^.typ = top_ref) and
 
+        GetNextInstruction(hp2, hp3) and
+        SkipEntryExitMarker(hp3, hp3) and
         MatchInstruction(hp3, A_LDP, [C_None], [PF_NONE]) and
         MatchOpType(taicpu(hp3),top_reg,top_reg,top_ref) and
         (taicpu(hp3).oper[0]^.reg = NR_X29) and
@@ -452,6 +463,7 @@
         (taicpu(hp3).oper[2]^.ref^.offset=16) and
         (taicpu(hp3).oper[2]^.ref^.addressmode=AM_POSTINDEXED) and
 
+        GetNextInstruction(hp3, hp4) and
         MatchInstruction(hp4, A_RET, [C_None], [PF_None]) and
         (taicpu(hp4).ops = 0) then
         begin
@@ -567,14 +579,9 @@
       if p.typ=ait_instruction then
         begin
           case taicpu(p).opcode of
-            A_LDR:
-              begin
-                Result:=LookForPostindexedPattern(taicpu(p));
-              end;
+            A_LDR,
             A_STR:
-              begin
-                Result:=LookForPostindexedPattern(taicpu(p));
-              end;
+              Result:=LookForPostindexedPattern(p);
             A_MOV:
               Result:=OptPass1Mov(p);
             A_STP:
@@ -612,11 +619,7 @@
             A_FNEG,
             A_FCVT,
             A_FABS:
-              begin
-                if GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
-                  RemoveSuperfluousFMov(p, hp1, 'FOpFMov2FOp') then
-                  Result:=true;
-              end;
+              Result:=OptPass1FData(p);
             A_FMOV:
               Result:=OptPass1FMov(p);
             else
Index: compiler/arm/aoptcpu.pas
===================================================================
--- compiler/arm/aoptcpu.pas	(revision 46481)
+++ compiler/arm/aoptcpu.pas	(working copy)
@@ -62,6 +62,21 @@
   protected
     function LookForPreindexedPattern(p: taicpu): boolean;
     function LookForPostindexedPattern(p: taicpu): boolean;
+
+
+    { Individual optimisation routines }
+    function OptPass1DataCheckMov(var p: tai): Boolean;
+    function OptPass1ADDSUB(var p: tai): Boolean;
+    function OptPass1And(var p: tai): Boolean; override; { There's optimisation code that's general for all ARM platforms }
+    function OptPass1CMP(var p: tai): Boolean;
+    function OptPass1LDR(var p: tai): Boolean;
+    function OptPass1STM(var p: tai): Boolean;
+    function OptPass1STR(var p: tai): Boolean;
+    function OptPass1MOV(var p: tai): Boolean;
+    function OptPass1MUL(var p: tai): Boolean;
+    function OptPass1MVN(var p: tai): Boolean;
+    function OptPass1VMov(var p: tai): Boolean;
+    function OptPass1VOp(var p: tai): Boolean;
   End;
 
   TCpuPreRegallocScheduler = class(TAsmScheduler)
@@ -117,7 +132,7 @@
          (taicpu(cmpp).oper[0]^.reg = taicpu(movp).oper[0]^.reg) and
          (taicpu(cmpp).oper[1]^.val = taicpu(movp).oper[1]^.val) then
       begin
-        asml.insertafter(tai_comment.Create(strpnew('Peephole CmpMovMov - Removed redundant moveq')), movp);
+        asml.insertafter(tai_comment.Create(strpnew('Peephole Optimization: CmpMovMov - Removed redundant moveq')), movp);
         asml.remove(movp);
         movp.free;
         Result:=true;
@@ -355,7 +370,7 @@
           dealloc:=FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(movp.Next));
           if assigned(dealloc) then
             begin
-              DebugMsg('Peephole '+optimizer+' removed superfluous vmov', movp);
+              DebugMsg('Peephole Optimization: '+optimizer+' removed superfluous vmov', movp);
               result:=true;
 
               { taicpu(p).oper[0]^.reg is not used anymore, try to find its allocation
@@ -498,7 +513,7 @@
         not(RegModifiedBetween(taicpu(hp1).oper[2]^.reg,p,hp1)) and
         GenerateARMCode then
         begin
-          DebugMsg('Peephole Str/LdrAdd/Sub2Str/Ldr Postindex done', p);
+          DebugMsg('Peephole Optimization: Str/LdrAdd/Sub2Str/Ldr Postindex done', p);
           p.oper[1]^.ref^.addressmode:=AM_POSTINDEXED;
           if taicpu(hp1).oper[2]^.typ=top_const then
             begin
@@ -522,1300 +537,1410 @@
     end;
 
 
-  function TCpuAsmOptimizer.PeepHoleOptPass1Cpu(var p: tai): boolean;
+  function TCpuAsmOptimizer.OptPass1ADDSUB(var p: tai): Boolean;
     var
-      hp1,hp2,hp3,hp4: tai;
-      i, i2: longint;
-      tempop: tasmop;
+      hp1,hp2: tai;
       oldreg: tregister;
-      dealloc: tai_regalloc;
+    begin
+      Result := OptPass1DataCheckMov(p);
 
-    function IsPowerOf2(const value: DWord): boolean; inline;
-      begin
-        Result:=(value and (value - 1)) = 0;
-      end;
+      {
+        change
+        add/sub reg2,reg1,const1
+        str/ldr reg3,[reg2,const2]
+        dealloc reg2
+        to
+        str/ldr reg3,[reg1,const2+/-const1]
+      }
+      if (not GenerateThumbCode) and
+         (taicpu(p).ops>2) and
+         (taicpu(p).oper[1]^.typ = top_reg) and
+         (taicpu(p).oper[2]^.typ = top_const) then
+        begin
+          hp1:=p;
+          while GetNextInstructionUsingReg(hp1, hp1, taicpu(p).oper[0]^.reg) and
+            { we cannot check NR_DEFAULTFLAGS for modification yet so don't allow a condition }
+            MatchInstruction(hp1, [A_LDR, A_STR], [C_None], []) and
+            (taicpu(hp1).oper[1]^.typ = top_ref) and
+            (taicpu(hp1).oper[1]^.ref^.base=taicpu(p).oper[0]^.reg) and
+            { don't optimize if the register is stored/overwritten }
+            (taicpu(hp1).oper[0]^.reg<>taicpu(p).oper[1]^.reg) and
+            (taicpu(hp1).oper[1]^.ref^.index=NR_NO) and
+            (taicpu(hp1).oper[1]^.ref^.addressmode=AM_OFFSET) and
+            { new offset must be valid: within the 8 or 12 bit range, depending on the
+              ldr/str postfix }
+            (((taicpu(p).opcode=A_ADD) and
+             isValidConstLoadStoreOffset(taicpu(hp1).oper[1]^.ref^.offset+taicpu(p).oper[2]^.val, taicpu(hp1).oppostfix)
+             ) or
+             ((taicpu(p).opcode=A_SUB) and
+              isValidConstLoadStoreOffset(taicpu(hp1).oper[1]^.ref^.offset-taicpu(p).oper[2]^.val, taicpu(hp1).oppostfix)
+             )
+            ) do
+            begin
+              { neither reg1 nor reg2 may be modified in between }
+              if RegModifiedBetween(taicpu(p).oper[0]^.reg,p,hp1) or
+                RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1) then
+                break;
+              { reg2 must either be overwritten by the ldr or be deallocated afterwards }
+              if ((taicpu(hp1).opcode=A_LDR) and (taicpu(p).oper[0]^.reg=taicpu(hp1).oper[0]^.reg)) or
+                assigned(FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(hp1.Next))) then
+                begin
+                  { remember last instruction }
+                  hp2:=hp1;
+                  DebugMsg('Peephole Optimization: Add/SubLdr2Ldr done', p);
+                  hp1:=p;
+                  { fix all ldr/str }
+                  while GetNextInstructionUsingReg(hp1, hp1, taicpu(p).oper[0]^.reg) do
+                    begin
+                      taicpu(hp1).oper[1]^.ref^.base:=taicpu(p).oper[1]^.reg;
+                      if taicpu(p).opcode=A_ADD then
+                        inc(taicpu(hp1).oper[1]^.ref^.offset,taicpu(p).oper[2]^.val)
+                      else
+                        dec(taicpu(hp1).oper[1]^.ref^.offset,taicpu(p).oper[2]^.val);
+                      if hp1=hp2 then
+                        break;
+                    end;
+                  RemoveCurrentP(p);
+                  result:=true;
+                  Exit;
+                end;
+            end;
+        end;
 
+      if (taicpu(p).condition = C_None) and
+        (taicpu(p).oppostfix = PF_None) and
+        LookForPreindexedPattern(taicpu(p)) then
+        begin
+          DebugMsg('Peephole Optimization: Add/Sub to Preindexed done', p);
+          RemoveCurrentP(p);
+          Result:=true;
+          Exit;
+        end;
+    end;
+
+
+  function TCpuAsmOptimizer.OptPass1MUL(var p: tai): Boolean;
+    var
+      hp1,hp2: tai;
+      oldreg: tregister;
     begin
-      result := false;
-      case p.typ of
-        ait_instruction:
-          begin
-            {
-              change
-              <op> reg,x,y
-              cmp reg,#0
-              into
-              <op>s reg,x,y
-            }
-            { this optimization can applied only to the currently enabled operations because
-              the other operations do not update all flags and FPC does not track flag usage }
-            if MatchInstruction(p, [A_ADC,A_ADD,A_BIC,A_SUB,A_MUL,A_MVN,A_MOV,A_ORR,A_EOR,A_AND,
-                                 A_RSB,A_RSC,A_SBC,A_MLA], [C_None], [PF_None]) and
-              GetNextInstruction(p, hp1) and
-              { mlas is only allowed in arm mode }
-              ((taicpu(p).opcode<>A_MLA) or
-               (current_settings.instructionset<>is_thumb)) and
-              MatchInstruction(hp1, A_CMP, [C_None], [PF_None]) and
-              (taicpu(hp1).oper[1]^.typ = top_const) and
-              (taicpu(p).oper[0]^.reg = taicpu(hp1).oper[0]^.reg) and
-              (taicpu(hp1).oper[1]^.val = 0) and
-              GetNextInstruction(hp1, hp2) and
-              { be careful here, following instructions could use other flags
-                however after a jump fpc never depends on the value of flags }
-              { All above instructions set Z and N according to the following
-                Z := result = 0;
-                N := result[31];
-                EQ = Z=1; NE = Z=0;
-                MI = N=1; PL = N=0; }
-              (MatchInstruction(hp2, A_B, [C_EQ,C_NE,C_MI,C_PL], []) or
-               { mov is also possible, but only if there is no shifter operand, it could be an rxx,
-                 we are too lazy to check if it is rxx or something else }
-               (MatchInstruction(hp2, A_MOV, [C_EQ,C_NE,C_MI,C_PL], []) and (taicpu(hp2).ops=2))) and
-              assigned(FindRegDealloc(NR_DEFAULTFLAGS,tai(hp2.Next))) then
-             begin
-               DebugMsg('Peephole OpCmp2OpS done', p);
+      Result := OptPass1DataCheckMov(p);
+      {
+       Turn
+       mul reg0, z,w
+       sub/add x, y, reg0
+       dealloc reg0
 
-               taicpu(p).oppostfix:=PF_S;
+       into
 
-               { move flag allocation if possible }
-               GetLastInstruction(hp1, hp2);
-               hp2:=FindRegAlloc(NR_DEFAULTFLAGS,tai(hp2.Next));
-               if assigned(hp2) then
-                 begin
-                   asml.Remove(hp2);
-                   asml.insertbefore(hp2, p);
-                 end;
+       mls/mla x,z,w,y
+       }
+      if (taicpu(p).condition = C_None) and
+        (taicpu(p).oppostfix = PF_None) and
+        (taicpu(p).ops=3) and
+        (taicpu(p).oper[0]^.typ = top_reg) and
+        (taicpu(p).oper[1]^.typ = top_reg) and
+        (taicpu(p).oper[2]^.typ = top_reg) and
+        GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
+        MatchInstruction(hp1,[A_ADD,A_SUB],[C_None],[PF_None]) and
+        (not RegModifiedBetween(taicpu(p).oper[1]^.reg, p, hp1)) and
+        (not RegModifiedBetween(taicpu(p).oper[2]^.reg, p, hp1)) and
 
-               asml.remove(hp1);
-               hp1.free;
-               Result:=true;
-             end
-           else
-              case taicpu(p).opcode of
-                A_STR:
-                  begin
-                    { change
-                      str reg1,ref
-                      ldr reg2,ref
-                      into
-                      str reg1,ref
-                      mov reg2,reg1
-                    }
-                    if (taicpu(p).oper[1]^.typ = top_ref) and
-                       (taicpu(p).oper[1]^.ref^.addressmode=AM_OFFSET) and
-                       (taicpu(p).oppostfix=PF_None) and
-                       (taicpu(p).condition=C_None) and
-                       GetNextInstructionUsingRef(p,hp1,taicpu(p).oper[1]^.ref^) and
-                       MatchInstruction(hp1, A_LDR, [taicpu(p).condition], [PF_None]) and
-                       (taicpu(hp1).oper[1]^.typ=top_ref) and
-                       (taicpu(hp1).oper[1]^.ref^.addressmode=AM_OFFSET) and
-                       not(RegModifiedBetween(taicpu(p).oper[0]^.reg, p, hp1)) and
-                       ((taicpu(hp1).oper[1]^.ref^.index=NR_NO) or not (RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.index, p, hp1))) and
-                       ((taicpu(hp1).oper[1]^.ref^.base=NR_NO) or not (RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.base, p, hp1))) then
-                      begin
-                        if taicpu(hp1).oper[0]^.reg=taicpu(p).oper[0]^.reg then
-                          begin
-                            DebugMsg('Peephole StrLdr2StrMov 1 done', hp1);
-                            asml.remove(hp1);
-                            hp1.free;
-                          end
-                        else
-                          begin
-                            taicpu(hp1).opcode:=A_MOV;
-                            taicpu(hp1).oppostfix:=PF_None;
-                            taicpu(hp1).loadreg(1,taicpu(p).oper[0]^.reg);
-                            DebugMsg('Peephole StrLdr2StrMov 2 done', hp1);
-                          end;
-                        result := true;
-                      end
-                    { change
-                      str reg1,ref
-                      str reg2,ref
-                      into
-                      strd reg1,reg2,ref
-                    }
-                    else if (GenerateARMCode or GenerateThumb2Code) and
-                       (CPUARM_HAS_EDSP in cpu_capabilities[current_settings.cputype]) and
-                       (taicpu(p).oppostfix=PF_None) and
-                       (taicpu(p).oper[1]^.ref^.addressmode=AM_OFFSET) and
-                       GetNextInstruction(p,hp1) and
-                       MatchInstruction(hp1, A_STR, [taicpu(p).condition, C_None], [PF_None]) and
-                       not(odd(getsupreg(taicpu(p).oper[0]^.reg))) and
-                      (getsupreg(taicpu(p).oper[0]^.reg)+1=getsupreg(taicpu(hp1).oper[0]^.reg)) and
-                      { str ensures that either base or index contain no register, else ldr wouldn't
-                        use an offset either
-                      }
-                      (taicpu(p).oper[1]^.ref^.base=taicpu(hp1).oper[1]^.ref^.base) and
-                      (taicpu(p).oper[1]^.ref^.index=taicpu(hp1).oper[1]^.ref^.index) and
-                      (taicpu(p).oper[1]^.ref^.offset+4=taicpu(hp1).oper[1]^.ref^.offset) and
-                      (abs(taicpu(p).oper[1]^.ref^.offset)<256) and
-                      AlignedToQWord(taicpu(p).oper[1]^.ref^) then
-                      begin
-                        DebugMsg('Peephole StrStr2Strd done', p);
-                        taicpu(p).oppostfix:=PF_D;
-                        taicpu(p).loadref(2,taicpu(p).oper[1]^.ref^);
-                        taicpu(p).loadreg(1, taicpu(hp1).oper[0]^.reg);
-                        taicpu(p).ops:=3;
-                        asml.remove(hp1);
-                        hp1.free;
-                        result:=true;
-                      end;
-                    Result:=LookForPostindexedPattern(taicpu(p)) or Result;
-                  end;
-                A_LDR:
-                  begin
-                    { change
-                      ldr reg1,ref
-                      ldr reg2,ref
-                      into ...
-                    }
-                    if (taicpu(p).oper[1]^.typ = top_ref) and
-                       (taicpu(p).oper[1]^.ref^.addressmode=AM_OFFSET) and
-                       GetNextInstruction(p,hp1) and
-                       { ldrd is not allowed here }
-                       MatchInstruction(hp1, A_LDR, [taicpu(p).condition, C_None], [taicpu(p).oppostfix,PF_None]-[PF_D]) then
-                      begin
-                        {
-                          ...
-                          ldr reg1,ref
-                          mov reg2,reg1
-                        }
-                        if (taicpu(p).oppostfix=taicpu(hp1).oppostfix) and
-                           RefsEqual(taicpu(p).oper[1]^.ref^,taicpu(hp1).oper[1]^.ref^) and
-                           (taicpu(p).oper[0]^.reg<>taicpu(hp1).oper[1]^.ref^.index) and
-                           (taicpu(p).oper[0]^.reg<>taicpu(hp1).oper[1]^.ref^.base) and
-                           (taicpu(hp1).oper[1]^.ref^.addressmode=AM_OFFSET) then
-                          begin
-                            if taicpu(hp1).oper[0]^.reg=taicpu(p).oper[0]^.reg then
-                              begin
-                                DebugMsg('Peephole LdrLdr2Ldr done', hp1);
-                                asml.remove(hp1);
-                                hp1.free;
-                              end
-                            else
-                              begin
-                                DebugMsg('Peephole LdrLdr2LdrMov done', hp1);
-                                taicpu(hp1).opcode:=A_MOV;
-                                taicpu(hp1).oppostfix:=PF_None;
-                                taicpu(hp1).loadreg(1,taicpu(p).oper[0]^.reg);
-                              end;
-                            result := true;
-                          end
-                        {
-                           ...
-                           ldrd reg1,reg1+1,ref
-                        }
-                        else if (GenerateARMCode or GenerateThumb2Code) and
-                          (CPUARM_HAS_EDSP in cpu_capabilities[current_settings.cputype]) and
-                          { ldrd does not allow any postfixes ... }
-                          (taicpu(p).oppostfix=PF_None) and
-                          not(odd(getsupreg(taicpu(p).oper[0]^.reg))) and
-                          (getsupreg(taicpu(p).oper[0]^.reg)+1=getsupreg(taicpu(hp1).oper[0]^.reg)) and
-                          { ldr ensures that either base or index contain no register, else ldr wouldn't
-                            use an offset either
-                          }
-                          (taicpu(p).oper[1]^.ref^.base=taicpu(hp1).oper[1]^.ref^.base) and
-                          (taicpu(p).oper[1]^.ref^.index=taicpu(hp1).oper[1]^.ref^.index) and
-                          (taicpu(p).oper[1]^.ref^.offset+4=taicpu(hp1).oper[1]^.ref^.offset) and
-                          (abs(taicpu(p).oper[1]^.ref^.offset)<256) and
-                          AlignedToQWord(taicpu(p).oper[1]^.ref^) then
-                          begin
-                            DebugMsg('Peephole LdrLdr2Ldrd done', p);
-                            taicpu(p).loadref(2,taicpu(p).oper[1]^.ref^);
-                            taicpu(p).loadreg(1, taicpu(hp1).oper[0]^.reg);
-                            taicpu(p).ops:=3;
-                            taicpu(p).oppostfix:=PF_D;
-                            asml.remove(hp1);
-                            hp1.free;
-                            result:=true;
-                          end;
-                      end;
+        (((taicpu(hp1).opcode=A_ADD) and (current_settings.cputype>=cpu_armv4)) or
+         ((taicpu(hp1).opcode=A_SUB) and (current_settings.cputype in [cpu_armv6t2,cpu_armv7,cpu_armv7a,cpu_armv7r,cpu_armv7m,cpu_armv7em]))) and
 
-                    {
-                      Change
+        // On CPUs before ARMv6, using the same register for Rd and Rm in MLA is not recommended.
+        // TODO: A workaround would be to swap Rm and Rs
+        (not ((taicpu(hp1).opcode=A_ADD) and (current_settings.cputype<=cpu_armv6) and MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^))) and
 
-                        ldrb dst1, [REF]
-                        and  dst2, dst1, #255
+        (((taicpu(hp1).ops=3) and
+          (taicpu(hp1).oper[2]^.typ=top_reg) and
+          ((MatchOperand(taicpu(hp1).oper[2]^, taicpu(p).oper[0]^.reg) and
+            (not RegModifiedBetween(taicpu(hp1).oper[1]^.reg, p, hp1))) or
+           ((MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
+             (taicpu(hp1).opcode=A_ADD) and
+             (not RegModifiedBetween(taicpu(hp1).oper[2]^.reg, p, hp1)))))) or
+         ((taicpu(hp1).ops=2) and
+          (taicpu(hp1).oper[1]^.typ=top_reg) and
+          MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg))) and
+        (RegEndOfLife(taicpu(p).oper[0]^.reg,taicpu(hp1))) then
+        begin
+          if taicpu(hp1).opcode=A_ADD then
+            begin
+              taicpu(hp1).opcode:=A_MLA;
 
-                      into
+              if taicpu(hp1).ops=3 then
+                begin
+                  if MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^) then
+                    oldreg:=taicpu(hp1).oper[2]^.reg
+                  else
+                    oldreg:=taicpu(hp1).oper[1]^.reg;
+                end
+              else
+                oldreg:=taicpu(hp1).oper[0]^.reg;
 
-                        ldrb dst2, [ref]
-                    }
-                    if not(GenerateThumbCode) and
-                       (taicpu(p).oppostfix=PF_B) and
-                       GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
-                       MatchInstruction(hp1, A_AND, [taicpu(p).condition], [PF_NONE]) and
-                       (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[0]^.reg) and
-                       (taicpu(hp1).oper[2]^.typ = top_const) and
-                       (taicpu(hp1).oper[2]^.val = $FF) and
-                       not(RegUsedBetween(taicpu(hp1).oper[0]^.reg, p, hp1)) and
-                       RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) then
-                       begin
-                         DebugMsg('Peephole LdrbAnd2Ldrb done', p);
-                         taicpu(p).oper[0]^.reg := taicpu(hp1).oper[0]^.reg;
-                         asml.remove(hp1);
-                         hp1.free;
-                         result:=true;
-                       end;
-                    Result:=LookForPostindexedPattern(taicpu(p)) or Result;
-                    { Remove superfluous mov after ldr
-                      changes
-                      ldr reg1, ref
-                      mov reg2, reg1
-                      to
-                      ldr reg2, ref
+              taicpu(hp1).loadreg(1,taicpu(p).oper[1]^.reg);
+              taicpu(hp1).loadreg(2,taicpu(p).oper[2]^.reg);
+              taicpu(hp1).loadreg(3,oldreg);
 
-                      conditions are:
-                        * no ldrd usage
-                        * reg1 must be released after mov
-                        * mov can not contain shifterops
-                        * ldr+mov have the same conditions
-                        * mov does not set flags
-                    }
-                    if (taicpu(p).oppostfix<>PF_D) and
-                       GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
-                       RemoveSuperfluousMove(p, hp1, 'LdrMov2Ldr') then
-                      Result:=true;
-                  end;
-                A_MOV:
+              DebugMsg('Peephole Optimization: MulAdd2MLA done', p);
+            end
+          else
+            begin
+              taicpu(hp1).opcode:=A_MLS;
+
+              taicpu(hp1).loadreg(3,taicpu(hp1).oper[1]^.reg);
+
+              if taicpu(hp1).ops=2 then
+                taicpu(hp1).loadreg(1,taicpu(hp1).oper[0]^.reg)
+              else
+                taicpu(hp1).loadreg(1,taicpu(p).oper[2]^.reg);
+
+              taicpu(hp1).loadreg(2,taicpu(p).oper[1]^.reg);
+
+              DebugMsg('Peephole Optimization: MulSub2MLS done', p);
+              AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,UsedRegs);
+              AllocRegBetween(taicpu(hp1).oper[2]^.reg,p,hp1,UsedRegs);
+              AllocRegBetween(taicpu(hp1).oper[3]^.reg,p,hp1,UsedRegs);
+
+            end;
+
+          taicpu(hp1).ops:=4;
+          RemoveCurrentP(p);
+          Result := True;
+          Exit;
+        end
+    end;
+
+
+  function TCpuAsmOptimizer.OptPass1And(var p: tai): Boolean;
+    begin
+      Result := OptPass1DataCheckMov(p);
+      Result := inherited OptPass1And(p) or Result;
+    end;
+
+
+  function TCpuAsmOptimizer.OptPass1DataCheckMov(var p: tai): Boolean;
+    var
+      hp1: tai;
+    begin
+      {
+        change
+        op  reg1, ...
+        mov reg2, reg1
+        to
+        op  reg2, ...
+      }
+      Result := (taicpu(p).ops >= 3) and
+        GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
+        RemoveSuperfluousMove(p, hp1, 'DataMov2Data');
+    end;
+
+
+  function TCpuAsmOptimizer.OptPass1CMP(var p: tai): Boolean;
+    var
+      hp1, hp2, hp_last: tai;
+      MovRem1, MovRem2: Boolean;
+    begin
+      Result := False;
+
+      { These optimizations can be applied only to the currently enabled operations because
+        the other operations do not update all flags and FPC does not track flag usage }
+      if (taicpu(p).condition = C_None) and
+        (taicpu(p).oper[1]^.typ = top_const) and
+        GetNextInstruction(p, hp1) then
+        begin
+          {
+            change
+            cmp   reg,const1
+            moveq reg,const1
+            movne reg,const2
+            to
+            cmp   reg,const1
+            movne reg,const2
+          }
+          if MatchInstruction(hp1, A_MOV, [C_EQ, C_NE], [PF_NONE]) and
+            (taicpu(hp1).oper[1]^.typ = top_const) and
+            GetNextInstruction(hp1, hp2) and
+            MatchInstruction(hp2, A_MOV, [C_EQ, C_NE], [PF_NONE]) and
+            (taicpu(hp2).oper[1]^.typ = top_const) then
+            begin
+              MovRem1 := RemoveRedundantMove(p, hp1, asml);
+              MovRem2 := RemoveRedundantMove(p, hp2, asml);
+
+              Result:= MovRem1 or MovRem2;
+
+              { Make sure that hp1 is still the next instruction after p }
+              if MovRem1 then
+                if MovRem2 then
                   begin
-                    { fold
-                      mov reg1,reg0, shift imm1
-                      mov reg1,reg1, shift imm2
-                    }
-                    if (taicpu(p).ops=3) and
-                       (taicpu(p).oper[2]^.typ = top_shifterop) and
-                       (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and
-                       getnextinstruction(p,hp1) and
-                       MatchInstruction(hp1, A_MOV, [taicpu(p).condition], [PF_None]) and
-                       (taicpu(hp1).ops=3) and
-                       MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^.reg) and
-                       MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
-                       (taicpu(hp1).oper[2]^.typ = top_shifterop) and
-                       (taicpu(hp1).oper[2]^.shifterop^.rs = NR_NO) then
-                      begin
-                        { fold
-                          mov reg1,reg0, lsl 16
-                          mov reg1,reg1, lsr 16
-                          strh reg1, ...
-                          dealloc reg1
-                          to
-                          strh reg1, ...
-                          dealloc reg1
-                        }
-                        if (taicpu(p).oper[2]^.shifterop^.shiftmode=SM_LSL) and
-                          (taicpu(p).oper[2]^.shifterop^.shiftimm=16) and
-                          (taicpu(hp1).oper[2]^.shifterop^.shiftmode in [SM_LSR,SM_ASR]) and
-                          (taicpu(hp1).oper[2]^.shifterop^.shiftimm=16) and
-                          getnextinstruction(hp1,hp2) and
-                          MatchInstruction(hp2, A_STR, [taicpu(p).condition], [PF_H]) and
-                          MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[0]^.reg) then
-                          begin
-                            TransferUsedRegs(TmpUsedRegs);
-                            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
-                            UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
-                            if not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg,hp2,TmpUsedRegs)) then
-                              begin
-                                DebugMsg('Peephole optimizer removed superfluous 16 Bit zero extension', hp1);
-                                taicpu(hp2).loadreg(0,taicpu(p).oper[1]^.reg);
-                                asml.remove(p);
-                                asml.remove(hp1);
-                                p.free;
-                                hp1.free;
-                                p:=hp2;
-                                Result:=true;
-                              end;
-                          end
-                        { fold
-                          mov reg1,reg0, shift imm1
-                          mov reg1,reg1, shift imm2
-                          to
-                          mov reg1,reg0, shift imm1+imm2
-                        }
-                        else if (taicpu(p).oper[2]^.shifterop^.shiftmode=taicpu(hp1).oper[2]^.shifterop^.shiftmode) or
-                          { asr makes no use after a lsr, the asr can be foled into the lsr }
-                           ((taicpu(p).oper[2]^.shifterop^.shiftmode=SM_LSR) and (taicpu(hp1).oper[2]^.shifterop^.shiftmode=SM_ASR) ) then
-                          begin
-                            inc(taicpu(p).oper[2]^.shifterop^.shiftimm,taicpu(hp1).oper[2]^.shifterop^.shiftimm);
-                            { avoid overflows }
-                            if taicpu(p).oper[2]^.shifterop^.shiftimm>31 then
-                              case taicpu(p).oper[2]^.shifterop^.shiftmode of
-                                SM_ROR:
-                                  taicpu(p).oper[2]^.shifterop^.shiftimm:=taicpu(p).oper[2]^.shifterop^.shiftimm and 31;
-                                SM_ASR:
-                                  taicpu(p).oper[2]^.shifterop^.shiftimm:=31;
-                                SM_LSR,
-                                SM_LSL:
-                                  begin
-                                    hp2:=taicpu.op_reg_const(A_MOV,taicpu(p).oper[0]^.reg,0);
-                                    InsertLLItem(p.previous, p.next, hp2);
-                                    p.free;
-                                    p:=hp2;
-                                  end;
-                                else
-                                  internalerror(2008072803);
-                              end;
-                            DebugMsg('Peephole ShiftShift2Shift 1 done', p);
-                            asml.remove(hp1);
-                            hp1.free;
-                            result := true;
-                          end
-                        { fold
-                          mov reg1,reg0, shift imm1
-                          mov reg1,reg1, shift imm2
-                          mov reg1,reg1, shift imm3 ...
-                          mov reg2,reg1, shift imm3 ...
-                        }
-                        else if GetNextInstructionUsingReg(hp1,hp2, taicpu(hp1).oper[0]^.reg) and
-                          MatchInstruction(hp2, A_MOV, [taicpu(p).condition], [PF_None]) and
-                          (taicpu(hp2).ops=3) and
-                          MatchOperand(taicpu(hp2).oper[1]^, taicpu(hp1).oper[0]^.reg) and
-                          RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp2)) and
-                          (taicpu(hp2).oper[2]^.typ = top_shifterop) and
-                          (taicpu(hp2).oper[2]^.shifterop^.rs = NR_NO) then
-                          begin
-                            { mov reg1,reg0, lsl imm1
-                              mov reg1,reg1, lsr/asr imm2
-                              mov reg2,reg1, lsl imm3 ...
-                              to
-                              mov reg1,reg0, lsl imm1
-                              mov reg2,reg1, lsr/asr imm2-imm3
-                              if
-                              imm1>=imm2
-                            }
-                            if (taicpu(p).oper[2]^.shifterop^.shiftmode=SM_LSL) and (taicpu(hp2).oper[2]^.shifterop^.shiftmode=SM_LSL) and
-                              (taicpu(hp1).oper[2]^.shifterop^.shiftmode in [SM_ASR,SM_LSR]) and
-                              (taicpu(p).oper[2]^.shifterop^.shiftimm>=taicpu(hp1).oper[2]^.shifterop^.shiftimm) then
-                              begin
-                                if (taicpu(hp2).oper[2]^.shifterop^.shiftimm>=taicpu(hp1).oper[2]^.shifterop^.shiftimm) then
-                                  begin
-                                    if not(RegUsedBetween(taicpu(hp2).oper[0]^.reg,p,hp1)) and
-                                      not(RegUsedBetween(taicpu(hp2).oper[0]^.reg,hp1,hp2)) then
-                                      begin
-                                        DebugMsg('Peephole ShiftShiftShift2ShiftShift 1a done', p);
-                                        inc(taicpu(p).oper[2]^.shifterop^.shiftimm,taicpu(hp2).oper[2]^.shifterop^.shiftimm-taicpu(hp1).oper[2]^.shifterop^.shiftimm);
-                                        taicpu(p).oper[0]^.reg:=taicpu(hp2).oper[0]^.reg;
-                                        asml.remove(hp1);
-                                        asml.remove(hp2);
-                                        hp1.free;
-                                        hp2.free;
+                    if not GetNextInstruction(p, hp1) then
+                      Exit;
+                  end
+                else
+                  hp1 := hp2;
+            end;
 
-                                        if taicpu(p).oper[2]^.shifterop^.shiftimm>=32 then
-                                          begin
-                                            taicpu(p).freeop(1);
-                                            taicpu(p).freeop(2);
-                                            taicpu(p).loadconst(1,0);
-                                          end;
-                                        result := true;
-                                      end;
-                                  end
-                                else if not(RegUsedBetween(taicpu(hp2).oper[0]^.reg,hp1,hp2)) then
-                                  begin
-                                    DebugMsg('Peephole ShiftShiftShift2ShiftShift 1b done', p);
+          {
+            change
+            <op> reg,x,y
+            cmp reg,#0
+            into
+            <op>s reg,x,y
+          }
+          if (taicpu(p).oppostfix = PF_None) and
+            (taicpu(p).oper[1]^.val = 0) and
+            { be careful here, following instructions could use other flags
+              however after a jump fpc never depends on the value of flags }
+            { All instructions accepted for hp_last below set Z and N according to the following
+              Z := result = 0;
+              N := result[31];
+              EQ = Z=1; NE = Z=0;
+              MI = N=1; PL = N=0; }
+            (MatchInstruction(hp1, A_B, [C_EQ,C_NE,C_MI,C_PL], []) or
+            { mov is also possible, but only if there is no shifter operand, it could be an rrx,
+              we are too lazy to check if it is rrx or something else }
+            (MatchInstruction(hp1, A_MOV, [C_EQ,C_NE,C_MI,C_PL], []) and (taicpu(hp1).ops=2))) and
+            GetLastInstruction(p, hp_last) and
+            MatchInstruction(hp_last, [A_ADC,A_ADD,A_BIC,A_SUB,A_MUL,A_MVN,A_MOV,A_ORR,
+              A_EOR,A_AND,A_RSB,A_RSC,A_SBC,A_MLA], [C_None], [PF_None]) and
+            (
+              { mlas is only allowed in arm mode }
+              (taicpu(hp_last).opcode<>A_MLA) or
+              (current_settings.instructionset<>is_thumb)
+            ) and
+            (taicpu(hp_last).oper[0]^.reg = taicpu(p).oper[0]^.reg) and
+            assigned(FindRegDealloc(NR_DEFAULTFLAGS,tai(hp1.Next))) then
+            begin
+              DebugMsg('Peephole Optimization: OpCmp2OpS done', hp_last);
 
-                                    dec(taicpu(hp1).oper[2]^.shifterop^.shiftimm,taicpu(hp2).oper[2]^.shifterop^.shiftimm);
-                                    taicpu(hp1).oper[0]^.reg:=taicpu(hp2).oper[0]^.reg;
-                                    asml.remove(hp2);
-                                    hp2.free;
-                                    result := true;
-                                  end;
-                              end
-                            { mov reg1,reg0, lsr/asr imm1
-                              mov reg1,reg1, lsl imm2
-                              mov reg1,reg1, lsr/asr imm3 ...
+              taicpu(hp_last).oppostfix:=PF_S;
 
-                              if imm3>=imm1 and imm2>=imm1
-                              to
-                              mov reg1,reg0, lsl imm2-imm1
-                              mov reg1,reg1, lsr/asr imm3 ...
-                            }
-                            else if (taicpu(p).oper[2]^.shifterop^.shiftmode in [SM_ASR,SM_LSR]) and (taicpu(hp2).oper[2]^.shifterop^.shiftmode in [SM_ASR,SM_LSR]) and
-                              (taicpu(hp1).oper[2]^.shifterop^.shiftmode=SM_LSL) and
-                              (taicpu(hp2).oper[2]^.shifterop^.shiftimm>=taicpu(p).oper[2]^.shifterop^.shiftimm) and
-                              (taicpu(hp1).oper[2]^.shifterop^.shiftimm>=taicpu(p).oper[2]^.shifterop^.shiftimm) then
-                              begin
-                                dec(taicpu(hp1).oper[2]^.shifterop^.shiftimm,taicpu(p).oper[2]^.shifterop^.shiftimm);
-                                taicpu(hp1).oper[1]^.reg:=taicpu(p).oper[1]^.reg;
-                                DebugMsg('Peephole ShiftShiftShift2ShiftShift 2 done', p);
-                                asml.remove(p);
-                                p.free;
-                                p:=hp2;
-                                if taicpu(hp1).oper[2]^.shifterop^.shiftimm=0 then
-                                  begin
-                                    taicpu(hp2).oper[1]^.reg:=taicpu(hp1).oper[1]^.reg;
-                                    asml.remove(hp1);
-                                    hp1.free;
-                                    p:=hp2;
-                                  end;
-                                result := true;
-                              end;
-                          end;
-                      end;
-                    { Change the common
-                      mov r0, r0, lsr #xxx
-                      and r0, r0, #yyy/bic r0, r0, #xxx
+              { move flag allocation if possible }
+              hp1:=FindRegAlloc(NR_DEFAULTFLAGS,tai(hp_last.Next));
+              if assigned(hp1) then
+                begin
+                  asml.Remove(hp1);
+                  asml.insertbefore(hp1, hp_last);
+                end;
 
-                      and remove the superfluous and/bic if possible
+              RemoveCurrentP(p);
+              Result:=true;
+            end;
+        end;
+    end;
 
-                      This could be extended to handle more cases.
-                    }
-                    if (taicpu(p).ops=3) and
-                       (taicpu(p).oper[2]^.typ = top_shifterop) and
-                       (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and
-                       (taicpu(p).oper[2]^.shifterop^.shiftmode = SM_LSR) and
-                       GetNextInstructionUsingReg(p,hp1, taicpu(p).oper[0]^.reg) and
-                       (hp1.typ=ait_instruction) and
-                       (taicpu(hp1).ops>=1) and
-                       (taicpu(hp1).oper[0]^.typ=top_reg) and
-                       (not RegModifiedBetween(taicpu(hp1).oper[0]^.reg, p, hp1)) and
-                       RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) then
-                       begin
-                         if (taicpu(p).oper[2]^.shifterop^.shiftimm >= 24 ) and
-                           MatchInstruction(hp1, A_AND, [taicpu(p).condition], [taicpu(p).oppostfix]) and
-                           (taicpu(hp1).ops=3) and
-                           MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[1]^) and
-                           (taicpu(hp1).oper[2]^.typ = top_const) and
-                           { Check if the AND actually would only mask out bits being already zero because of the shift
-                           }
-                           ((($ffffffff shr taicpu(p).oper[2]^.shifterop^.shiftimm) and taicpu(hp1).oper[2]^.val) =
-                             ($ffffffff shr taicpu(p).oper[2]^.shifterop^.shiftimm)) then
-                           begin
-                             DebugMsg('Peephole LsrAnd2Lsr done', hp1);
-                             taicpu(p).oper[0]^.reg:=taicpu(hp1).oper[0]^.reg;
-                             asml.remove(hp1);
-                             hp1.free;
-                             result:=true;
-                           end
-                         else if MatchInstruction(hp1, A_BIC, [taicpu(p).condition], [taicpu(p).oppostfix]) and
-                           (taicpu(hp1).ops=3) and
-                           MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[1]^) and
-                           (taicpu(hp1).oper[2]^.typ = top_const) and
-                           { Check if the BIC actually would only mask out bits beeing already zero because of the shift }
-                           (taicpu(hp1).oper[2]^.val<>0) and
-                           (BsfDWord(taicpu(hp1).oper[2]^.val)>=32-taicpu(p).oper[2]^.shifterop^.shiftimm) then
-                           begin
-                             DebugMsg('Peephole LsrBic2Lsr done', hp1);
-                             taicpu(p).oper[0]^.reg:=taicpu(hp1).oper[0]^.reg;
-                             asml.remove(hp1);
-                             hp1.free;
-                             result:=true;
-                           end;
-                       end;
-                    { Change
-                      mov rx, ry, lsr/ror #xxx
-                      uxtb/uxth rz,rx/and rz,rx,0xFF
-                      dealloc rx
 
-                      to
+  function TCpuAsmOptimizer.OptPass1LDR(var p: tai): Boolean; { pass 1 peephole rewrites for a load at p (the opcode dispatch presumably happens in the caller); True when anything was changed }
+    var
+      hp1: tai;
+    begin
+      Result := False;
 
-                      uxtb/uxth rz,ry,ror #xxx
-                    }
-                    if (taicpu(p).ops=3) and
-                       (taicpu(p).oper[2]^.typ = top_shifterop) and
-                       (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and
-                       (taicpu(p).oper[2]^.shifterop^.shiftmode in [SM_LSR,SM_ROR]) and
-                       (GenerateThumb2Code) and
-                       GetNextInstructionUsingReg(p,hp1, taicpu(p).oper[0]^.reg) and
-                       RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) then
-                       begin
-                         if MatchInstruction(hp1, A_UXTB, [C_None], [PF_None]) and
-                           (taicpu(hp1).ops = 2) and
-                           (taicpu(p).oper[2]^.shifterop^.shiftimm in [8,16,24]) and
-                           MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) then
-                           begin
-                             taicpu(hp1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
-                             taicpu(hp1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
-                             taicpu(hp1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
-                             taicpu(hp1).ops := 3;
+      { change two adjacent loads (p directly followed by hp1)
+        ldr reg1,ref
+        ldr reg2,ref       (same ref for the mov case, ref+4 for the ldrd case)
+        into ...
+      }
+      if (taicpu(p).oper[1]^.typ = top_ref) and
+         (taicpu(p).oper[1]^.ref^.addressmode=AM_OFFSET) and
+         GetNextInstruction(p,hp1) and
+         { ldrd is not allowed here }
+         MatchInstruction(hp1, A_LDR, [taicpu(p).condition, C_None], [taicpu(p).oppostfix,PF_None]-[PF_D]) then
+        begin
+          {
+            ...
+            ldr reg1,ref
+            mov reg2,reg1        (the second ldr is simply removed when reg2 = reg1)
+          }
+          if (taicpu(p).oppostfix=taicpu(hp1).oppostfix) and
+             RefsEqual(taicpu(p).oper[1]^.ref^,taicpu(hp1).oper[1]^.ref^) and
+             (taicpu(p).oper[0]^.reg<>taicpu(hp1).oper[1]^.ref^.index) and
+             (taicpu(p).oper[0]^.reg<>taicpu(hp1).oper[1]^.ref^.base) and
+             (taicpu(hp1).oper[1]^.ref^.addressmode=AM_OFFSET) then
+            begin
+              if taicpu(hp1).oper[0]^.reg=taicpu(p).oper[0]^.reg then
+                begin
+                  DebugMsg('Peephole Optimization: LdrLdr2Ldr done', hp1);
+                  asml.remove(hp1);
+                  hp1.free;
+                end
+              else
+                begin
+                  DebugMsg('Peephole Optimization: LdrLdr2LdrMov done', hp1);
+                  taicpu(hp1).opcode:=A_MOV;
+                  taicpu(hp1).oppostfix:=PF_None;
+                  taicpu(hp1).loadreg(1,taicpu(p).oper[0]^.reg);
+                end;
+              result := true;
+            end
+          {
+             ...
+             ldrd reg1,reg1+1,ref    (reg1 must be even; the second ldr read reg1+1 from ref+4)
+          }
+          else if (GenerateARMCode or GenerateThumb2Code) and
+            (CPUARM_HAS_EDSP in cpu_capabilities[current_settings.cputype]) and
+            { ldrd does not allow any postfixes ... }
+            (taicpu(p).oppostfix=PF_None) and
+            not(odd(getsupreg(taicpu(p).oper[0]^.reg))) and
+            (getsupreg(taicpu(p).oper[0]^.reg)+1=getsupreg(taicpu(hp1).oper[0]^.reg)) and
+            { ldr ensures that either base or index contain no register, else ldr wouldn't
+              use an offset either
+            }
+            (taicpu(p).oper[1]^.ref^.base=taicpu(hp1).oper[1]^.ref^.base) and
+            (taicpu(p).oper[1]^.ref^.index=taicpu(hp1).oper[1]^.ref^.index) and
+            (taicpu(p).oper[1]^.ref^.offset+4=taicpu(hp1).oper[1]^.ref^.offset) and
+            (abs(taicpu(p).oper[1]^.ref^.offset)<256) and
+            AlignedToQWord(taicpu(p).oper[1]^.ref^) then
+            begin
+              DebugMsg('Peephole Optimization: LdrLdr2Ldrd done', p);
+              taicpu(p).loadref(2,taicpu(p).oper[1]^.ref^);
+              taicpu(p).loadreg(1, taicpu(hp1).oper[0]^.reg);
+              taicpu(p).ops:=3;
+              taicpu(p).oppostfix:=PF_D;
+              asml.remove(hp1);
+              hp1.free;
+              result:=true;
+            end;
+        end;
 
-                             GetNextInstruction(p,hp1);
+      {
+        Change
 
-                             asml.Remove(p);
-                             p.Free;
+          ldrb dst1, [REF]
+          and  dst2, dst1, #255
 
-                             p:=hp1;
+        into
 
-                             result:=true;
-                             exit;
-                           end
-                         else if MatchInstruction(hp1, A_UXTH, [C_None], [PF_None]) and
-                           (taicpu(hp1).ops=2) and
-                           (taicpu(p).oper[2]^.shifterop^.shiftimm in [16]) and
-                           MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) then
-                           begin
-                             taicpu(hp1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
-                             taicpu(hp1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
-                             taicpu(hp1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
-                             taicpu(hp1).ops := 3;
+          ldrb dst2, [REF]
+      }
+      if not(GenerateThumbCode) and
+         (taicpu(p).oppostfix=PF_B) and
+         GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
+         MatchInstruction(hp1, A_AND, [taicpu(p).condition], [PF_NONE]) and
+         (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[0]^.reg) and
+         (taicpu(hp1).oper[2]^.typ = top_const) and
+         (taicpu(hp1).oper[2]^.val = $FF) and
+         not(RegUsedBetween(taicpu(hp1).oper[0]^.reg, p, hp1)) and
+         RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) then
+         begin
+           DebugMsg('Peephole Optimization: LdrbAnd2Ldrb done', p);
+           taicpu(p).oper[0]^.reg := taicpu(hp1).oper[0]^.reg;
+           asml.remove(hp1);
+           hp1.free;
+           result:=true;
+         end;
+      Result:=LookForPostindexedPattern(taicpu(p)) or Result;
+      { Remove superfluous mov after ldr
+        changes
+        ldr reg1, ref
+        mov reg2, reg1
+        to
+        ldr reg2, ref
 
-                             GetNextInstruction(p,hp1);
+        conditions are:
+          * no ldrd usage
+          * reg1 must be released after mov
+          * mov can not contain shifterops
+          * ldr+mov have the same conditions
+          * mov does not set flags
+      }
+      if (taicpu(p).oppostfix<>PF_D) and
+         GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
+         RemoveSuperfluousMove(p, hp1, 'LdrMov2Ldr') then
+        Result:=true;
+    end;
 
-                             asml.Remove(p);
-                             p.Free;
 
-                             p:=hp1;
+  function TCpuAsmOptimizer.OptPass1STM(var p: tai): Boolean; { pass 1 peephole rewrite for the stmfd at p; True when the sequence below was replaced }
+    var
+      hp1, hp2, hp3, hp4: tai; { hp1 = sub, hp2 = bl/blx, hp3 = add, hp4 = ldm of the matched sequence }
+    begin
+      Result := False;
 
-                             result:=true;
-                             exit;
-                           end
-                         else if MatchInstruction(hp1, A_AND, [C_None], [PF_None]) and
-                           (taicpu(hp1).ops = 3) and
-                           (taicpu(hp1).oper[2]^.typ = top_const) and
-                           (taicpu(hp1).oper[2]^.val = $FF) and
-                           (taicpu(p).oper[2]^.shifterop^.shiftimm in [8,16,24]) and
-                           MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) then
-                           begin
-                             taicpu(hp1).ops := 3;
-                             taicpu(hp1).opcode := A_UXTB;
-                             taicpu(hp1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
-                             taicpu(hp1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
-                             taicpu(hp1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
+      {
+        change (a call that is immediately followed by the return becomes a plain branch)
+        stmfd	r13!,[r14]
+        sub	r13,r13,#4
+        bl	abc
+        add	r13,r13,#4
+        ldmfd	r13!,[r15]
+        into
+        b         abc
+      }
+      if not(ts_thumb_interworking in current_settings.targetswitches) and
+        (taicpu(p).condition = C_None) and
+        (taicpu(p).oppostfix = PF_FD) and
+        (taicpu(p).oper[0]^.typ = top_ref) and
+        (taicpu(p).oper[0]^.ref^.index=NR_STACK_POINTER_REG) and
+        (taicpu(p).oper[0]^.ref^.base=NR_NO) and
+        (taicpu(p).oper[0]^.ref^.offset=0) and
+        (taicpu(p).oper[0]^.ref^.addressmode=AM_PREINDEXED) and
+        (taicpu(p).oper[1]^.typ = top_regset) and
+        (taicpu(p).oper[1]^.regset^ = [RS_R14]) and
+        GetNextInstruction(p, hp1) and
+        MatchInstruction(hp1, A_SUB, [C_None], [PF_NONE]) and
+        (taicpu(hp1).oper[0]^.typ = top_reg) and
+        (taicpu(hp1).oper[0]^.reg = NR_STACK_POINTER_REG) and
+        MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp1).oper[1]^) and
+        (taicpu(hp1).oper[2]^.typ = top_const) and
 
-                             GetNextInstruction(p,hp1);
+        GetNextInstruction(hp1, hp2) and
+        SkipEntryExitMarker(hp2, hp2) and
 
-                             asml.Remove(p);
-                             p.Free;
+        MatchInstruction(hp2, [A_BL,A_BLX], [C_None], [PF_NONE]) and
+        (taicpu(hp2).oper[0]^.typ = top_ref) and
 
-                             p:=hp1;
+        GetNextInstruction(hp2, hp3) and
+        SkipEntryExitMarker(hp3, hp3) and
+        MatchInstruction(hp3, A_ADD, [C_None], [PF_NONE]) and
+        MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp3).oper[0]^) and
+        MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp3).oper[1]^) and
+        MatchOperand(taicpu(hp1).oper[2]^,taicpu(hp3).oper[2]^) and
 
-                             result:=true;
-                             exit;
-                           end;
-                       end;
-                    {
-                      optimize
-                      mov rX, yyyy
-                      ....
-                    }
-                    if (taicpu(p).ops = 2) and
-                       GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
-                       (tai(hp1).typ = ait_instruction) then
-                      begin
-                        {
-                          This removes the mul from
-                          mov rX,0
-                          ...
-                          mul ...,rX,...
-                        }
-                        if false and (taicpu(p).oper[1]^.typ = top_const) and
-                          (taicpu(p).oper[1]^.val=0) and
-                          MatchInstruction(hp1, [A_MUL,A_MLA], [taicpu(p).condition], [taicpu(p).oppostfix]) and
-                          (((taicpu(hp1).oper[1]^.typ=top_reg) and MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[1]^)) or
-                           ((taicpu(hp1).oper[2]^.typ=top_reg) and MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[2]^))) then
-                            begin
-                              TransferUsedRegs(TmpUsedRegs);
-                              UpdateUsedRegs(TmpUsedRegs, tai(p.next));
-                              UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
-                              DebugMsg('Peephole MovMUL/MLA2Mov0 done', p);
-                              if taicpu(hp1).opcode=A_MUL then
-                                taicpu(hp1).loadconst(1,0)
-                              else
-                                taicpu(hp1).loadreg(1,taicpu(hp1).oper[3]^.reg);
-                              taicpu(hp1).ops:=2;
-                              taicpu(hp1).opcode:=A_MOV;
-                              if not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg,hp1,TmpUsedRegs)) then
-                                RemoveCurrentP(p);
-                              Result:=true;
-                              exit;
-                            end
-                        else if (taicpu(p).oper[1]^.typ = top_const) and
-                          (taicpu(p).oper[1]^.val=0) and
-                          MatchInstruction(hp1, A_MLA, [taicpu(p).condition], [taicpu(p).oppostfix]) and
-                          MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[3]^) then
-                            begin
-                              TransferUsedRegs(TmpUsedRegs);
-                              UpdateUsedRegs(TmpUsedRegs, tai(p.next));
-                              UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
-                              DebugMsg('Peephole MovMLA2MUL 1 done', p);
-                              taicpu(hp1).ops:=3;
-                              taicpu(hp1).opcode:=A_MUL;
-                              if not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg,hp1,TmpUsedRegs)) then
-                                RemoveCurrentP(p);
-                              Result:=true;
-                              exit;
-                            end
-                        {
-                          This changes the very common
-                          mov r0, #0
-                          str r0, [...]
-                          mov r0, #0
-                          str r0, [...]
+        GetNextInstruction(hp3, hp4) and
+        MatchInstruction(hp4, A_LDM, [C_None], [PF_FD]) and
+        MatchOperand(taicpu(p).oper[0]^,taicpu(hp4).oper[0]^) and
+        (taicpu(hp4).oper[1]^.typ = top_regset) and
+        (taicpu(hp4).oper[1]^.regset^ = [RS_R15]) then
+        begin
+          asml.Remove(hp1);
+          asml.Remove(hp3);
+          asml.Remove(hp4);
+          taicpu(hp2).opcode:=A_B;
+          hp1.free;
+          hp3.free;
+          hp4.free;
+          RemoveCurrentp(p, hp2); { presumably removes the stm and continues at hp2 - confirm in RemoveCurrentP }
+          DebugMsg('Peephole Optimization: Bl2B done', p);
+          Result := True;
+        end;
+    end;
 
-                          and removes all superfluous mov instructions
-                        }
-                        else if (taicpu(p).oper[1]^.typ = top_const) and
-                           (taicpu(hp1).opcode=A_STR) then
-                          while MatchInstruction(hp1, A_STR, [taicpu(p).condition], []) and
-                                MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[0]^) and
-                                GetNextInstruction(hp1, hp2) and
-                                MatchInstruction(hp2, A_MOV, [taicpu(p).condition], [PF_None]) and
-                                (taicpu(hp2).ops = 2) and
-                                MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[0]^) and
-                                MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^) do
-                            begin
-                              DebugMsg('Peephole MovStrMov done', hp2);
-                              GetNextInstruction(hp2,hp1);
-                              asml.remove(hp2);
-                              hp2.free;
-                              result:=true;
-                              if not assigned(hp1) then break;
-                            end
-                        {
-                          This removes the first mov from
-                          mov rX,...
-                          mov rX,...
-                        }
-                        else if taicpu(hp1).opcode=A_MOV then
-                          while MatchInstruction(hp1, A_MOV, [taicpu(p).condition], [taicpu(p).oppostfix]) and
-                                (taicpu(hp1).ops = 2) and
-                                MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[0]^) and
-                                { don't remove the first mov if the second is a mov rX,rX }
-                                not(MatchOperand(taicpu(hp1).oper[0]^, taicpu(hp1).oper[1]^)) do
-                            begin
-                              DebugMsg('Peephole MovMov done', p);
-                              asml.remove(p);
-                              p.free;
-                              p:=hp1;
-                              GetNextInstruction(hp1,hp1);
-                              result:=true;
-                              if not assigned(hp1) then
-                                break;
-                            end;
-                         if RedundantMovProcess(p,hp1) then
-                           begin
-                             Result:=true;
-                             { p might not point at a mov anymore }
-                             exit;
-                           end;
-                      end;
 
-                    { Fold the very common sequence
-                        mov  regA, regB
-                        ldr* regA, [regA]
-                      to
-                        ldr* regA, [regB]
-                      CAUTION! If this one is successful p might not be a mov instruction anymore!
-                    }
-                    if (taicpu(p).opcode = A_MOV) and
-                       (taicpu(p).ops = 2) and
-                       (taicpu(p).oper[1]^.typ = top_reg) and
-                       (taicpu(p).oppostfix = PF_NONE) and
-                       GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
-                       MatchInstruction(hp1, [A_LDR, A_STR], [taicpu(p).condition], []) and
-                       (taicpu(hp1).oper[1]^.typ = top_ref) and
-                       { We can change the base register only when the instruction uses AM_OFFSET }
-                       ((taicpu(hp1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) or
-                         ((taicpu(hp1).oper[1]^.ref^.addressmode = AM_OFFSET) and
-                          (taicpu(hp1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg))
-                       ) and
-                       not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) and
 
-                       // Make sure that Thumb code doesn't propagate a high register into a reference
-                       ((GenerateThumbCode and
-                         (getsupreg(taicpu(p).oper[1]^.reg) < RS_R8)) or
-                        (not GenerateThumbCode)) and
+  function TCpuAsmOptimizer.OptPass1STR(var p: tai): Boolean; { pass 1 peephole rewrites for a store at p (the opcode dispatch presumably happens in the caller); True when anything was changed }
+    var
+      hp1: tai;
+    begin
+      Result := False;
 
-                       RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) then
-                      begin
-                        DebugMsg('Peephole MovLdr2Ldr done', hp1);
-                        if (taicpu(hp1).oper[1]^.ref^.addressmode = AM_OFFSET) and
-                           (taicpu(hp1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) then
-                          taicpu(hp1).oper[1]^.ref^.base := taicpu(p).oper[1]^.reg;
+      { Common conditions: a plain str (no postfix) whose second operand is an offset-mode reference }
+      if (taicpu(p).oper[1]^.typ = top_ref) and
+        (taicpu(p).oper[1]^.ref^.addressmode=AM_OFFSET) and
+        (taicpu(p).oppostfix=PF_None) then
+        begin
+          { change
+            str reg1,ref
+            ldr reg2,ref
+            into
+            str reg1,ref
+            mov reg2,reg1        (the ldr is simply removed when reg2 = reg1)
+          }
+          if (taicpu(p).condition=C_None) and
+             GetNextInstructionUsingRef(p,hp1,taicpu(p).oper[1]^.ref^) and
+             MatchInstruction(hp1, A_LDR, [taicpu(p).condition], [PF_None]) and
+             (taicpu(hp1).oper[1]^.typ=top_ref) and
+             (taicpu(hp1).oper[1]^.ref^.addressmode=AM_OFFSET) and
+             not(RegModifiedBetween(taicpu(p).oper[0]^.reg, p, hp1)) and
+             ((taicpu(hp1).oper[1]^.ref^.index=NR_NO) or not (RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.index, p, hp1))) and
+             ((taicpu(hp1).oper[1]^.ref^.base=NR_NO) or not (RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.base, p, hp1))) then
+            begin
+              if taicpu(hp1).oper[0]^.reg=taicpu(p).oper[0]^.reg then
+                begin
+                  DebugMsg('Peephole Optimization: StrLdr2StrMov 1 done', hp1);
+                  asml.remove(hp1);
+                  hp1.free;
+                end
+              else
+                begin
+                  taicpu(hp1).opcode:=A_MOV;
+                  taicpu(hp1).oppostfix:=PF_None;
+                  taicpu(hp1).loadreg(1,taicpu(p).oper[0]^.reg);
+                  DebugMsg('Peephole Optimization: StrLdr2StrMov 2 done', hp1);
+                end;
+              result := True;
+            end
+          { change
+            str reg1,ref
+            str reg2,ref+4
+            into
+            strd reg1,reg2,ref   (reg1 must be even and reg2 = reg1+1)
+          }
+          else if (GenerateARMCode or GenerateThumb2Code) and
+             (CPUARM_HAS_EDSP in cpu_capabilities[current_settings.cputype]) and
+             not(odd(getsupreg(taicpu(p).oper[0]^.reg))) and
+             (abs(taicpu(p).oper[1]^.ref^.offset)<256) and
+             AlignedToQWord(taicpu(p).oper[1]^.ref^) and
+             GetNextInstruction(p,hp1) and
+             MatchInstruction(hp1, A_STR, [taicpu(p).condition, C_None], [PF_None]) and
+            (getsupreg(taicpu(p).oper[0]^.reg)+1=getsupreg(taicpu(hp1).oper[0]^.reg)) and
+            { str ensures that either base or index contains no register, else str wouldn't
+              use an offset either
+            }
+            (taicpu(p).oper[1]^.ref^.base=taicpu(hp1).oper[1]^.ref^.base) and
+            (taicpu(p).oper[1]^.ref^.index=taicpu(hp1).oper[1]^.ref^.index) and
+            (taicpu(p).oper[1]^.ref^.offset+4=taicpu(hp1).oper[1]^.ref^.offset) then
+            begin
+              DebugMsg('Peephole Optimization: StrStr2Strd done', p);
+              taicpu(p).oppostfix:=PF_D;
+              taicpu(p).loadref(2,taicpu(p).oper[1]^.ref^);
+              taicpu(p).loadreg(1, taicpu(hp1).oper[0]^.reg);
+              taicpu(p).ops:=3;
+              asml.remove(hp1);
+              hp1.free;
+              result:=true;
+            end;
+        end;
 
-                        if taicpu(hp1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg then
-                          taicpu(hp1).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg;
+      Result:=LookForPostindexedPattern(taicpu(p)) or Result;
+    end;
 
-                        dealloc:=FindRegDeAlloc(taicpu(p).oper[1]^.reg, tai(p.Next));
-                        if Assigned(dealloc) then
-                          begin
-                            asml.remove(dealloc);
-                            asml.InsertAfter(dealloc,hp1);
-                          end;
 
-                        GetNextInstruction(p, hp1);
-                        asml.remove(p);
-                        p.free;
-                        p:=hp1;
-                        result:=true;
-                      end;
+  function TCpuAsmOptimizer.OptPass1MOV(var p: tai): Boolean;
+    var
+      hp1, hpfar1, hp2, hp3: tai;
+      i, i2: longint;
+      tempop: tasmop;
+      dealloc: tai_regalloc;
+    begin
+      Result := False;
+      hp1 := nil;
 
-                    { This folds shifterops into following instructions
-                      mov r0, r1, lsl #8
-                      add r2, r3, r0
+      { fold
+        mov reg1,reg0, shift imm1
+        mov reg1,reg1, shift imm2
+      }
+      if (taicpu(p).ops=3) and
+         (taicpu(p).oper[2]^.typ = top_shifterop) and
+         (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and
+         getnextinstruction(p,hp1) and
+         MatchInstruction(hp1, A_MOV, [taicpu(p).condition], [PF_None]) and
+         (taicpu(hp1).ops=3) and
+         MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^.reg) and
+         MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
+         (taicpu(hp1).oper[2]^.typ = top_shifterop) and
+         (taicpu(hp1).oper[2]^.shifterop^.rs = NR_NO) then
+        begin
+          { fold
+            mov reg1,reg0, lsl 16
+            mov reg1,reg1, lsr 16
+            strh reg1, ...
+            dealloc reg1
+            to
+            strh reg0, ...
+            dealloc reg1
+          }
+          if (taicpu(p).oper[2]^.shifterop^.shiftmode=SM_LSL) and
+            (taicpu(p).oper[2]^.shifterop^.shiftimm=16) and
+            (taicpu(hp1).oper[2]^.shifterop^.shiftmode in [SM_LSR,SM_ASR]) and
+            (taicpu(hp1).oper[2]^.shifterop^.shiftimm=16) and
+            getnextinstruction(hp1,hp2) and
+            MatchInstruction(hp2, A_STR, [taicpu(p).condition], [PF_H]) and
+            MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[0]^.reg) then
+            begin
+              TransferUsedRegs(TmpUsedRegs);
+              UpdateUsedRegs(TmpUsedRegs, tai(p.next));
+              UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
+              if not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg,hp2,TmpUsedRegs)) then
+                begin
+                  DebugMsg('Peephole Optimization: removed superfluous 16 Bit zero extension', hp1);
+                  taicpu(hp2).loadreg(0,taicpu(p).oper[1]^.reg);
+                  asml.remove(hp1);
+                  hp1.free;
 
-                      to
+                  RemoveCurrentP(p, hp2);
+                  Result:=true;
+                  Exit;
+                end;
+            end
+          { fold
+            mov reg1,reg0, shift imm1
+            mov reg1,reg1, shift imm2
+            to
+            mov reg1,reg0, shift imm1+imm2
+          }
+          else if (taicpu(p).oper[2]^.shifterop^.shiftmode=taicpu(hp1).oper[2]^.shifterop^.shiftmode) or
+            { asr is of no use after a lsr, the asr can be folded into the lsr }
+             ((taicpu(p).oper[2]^.shifterop^.shiftmode=SM_LSR) and (taicpu(hp1).oper[2]^.shifterop^.shiftmode=SM_ASR) ) then
+            begin
+              inc(taicpu(p).oper[2]^.shifterop^.shiftimm,taicpu(hp1).oper[2]^.shifterop^.shiftimm);
+              { avoid overflows }
+              if taicpu(p).oper[2]^.shifterop^.shiftimm>31 then
+                case taicpu(p).oper[2]^.shifterop^.shiftmode of
+                  SM_ROR:
+                    taicpu(p).oper[2]^.shifterop^.shiftimm:=taicpu(p).oper[2]^.shifterop^.shiftimm and 31;
+                  SM_ASR:
+                    taicpu(p).oper[2]^.shifterop^.shiftimm:=31;
+                  SM_LSR,
+                  SM_LSL:
+                    begin
+                      hp2:=taicpu.op_reg_const(A_MOV,taicpu(p).oper[0]^.reg,0);
+                      InsertLLItem(p.previous, p.next, hp2);
+                      p.free;
+                      p:=hp2;
+                    end;
+                  else
+                    internalerror(2008072803);
+                end;
+              DebugMsg('Peephole Optimization: ShiftShift2Shift 1 done', p);
+              asml.remove(hp1);
+              hp1.free;
+              hp1 := nil;
+              result := true;
+            end
+          { fold
+            mov reg1,reg0, shift imm1
+            mov reg1,reg1, shift imm2
+            mov reg1,reg1, shift imm3 ...
+            mov reg2,reg1, shift imm3 ...
+          }
+          else if GetNextInstructionUsingReg(hp1,hp2, taicpu(hp1).oper[0]^.reg) and
+            MatchInstruction(hp2, A_MOV, [taicpu(p).condition], [PF_None]) and
+            (taicpu(hp2).ops=3) and
+            MatchOperand(taicpu(hp2).oper[1]^, taicpu(hp1).oper[0]^.reg) and
+            RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp2)) and
+            (taicpu(hp2).oper[2]^.typ = top_shifterop) and
+            (taicpu(hp2).oper[2]^.shifterop^.rs = NR_NO) then
+            begin
+              { mov reg1,reg0, lsl imm1
+                mov reg1,reg1, lsr/asr imm2
+                mov reg2,reg1, lsl imm3 ...
+                to
+                mov reg1,reg0, lsl imm1
+                mov reg2,reg1, lsr/asr imm2-imm3
+                if
+                imm1>=imm2
+              }
+              if (taicpu(p).oper[2]^.shifterop^.shiftmode=SM_LSL) and (taicpu(hp2).oper[2]^.shifterop^.shiftmode=SM_LSL) and
+                (taicpu(hp1).oper[2]^.shifterop^.shiftmode in [SM_ASR,SM_LSR]) and
+                (taicpu(p).oper[2]^.shifterop^.shiftimm>=taicpu(hp1).oper[2]^.shifterop^.shiftimm) then
+                begin
+                  if (taicpu(hp2).oper[2]^.shifterop^.shiftimm>=taicpu(hp1).oper[2]^.shifterop^.shiftimm) then
+                    begin
+                      if not(RegUsedBetween(taicpu(hp2).oper[0]^.reg,p,hp1)) and
+                        not(RegUsedBetween(taicpu(hp2).oper[0]^.reg,hp1,hp2)) then
+                        begin
+                          DebugMsg('Peephole Optimization: ShiftShiftShift2ShiftShift 1a done', p);
+                          inc(taicpu(p).oper[2]^.shifterop^.shiftimm,taicpu(hp2).oper[2]^.shifterop^.shiftimm-taicpu(hp1).oper[2]^.shifterop^.shiftimm);
+                          taicpu(p).oper[0]^.reg:=taicpu(hp2).oper[0]^.reg;
+                          asml.remove(hp1);
+                          asml.remove(hp2);
+                          hp1.free;
+                          hp2.free;
 
-                      add r2, r3, r1, lsl #8
-                      CAUTION! If this one is successful p might not be a mov instruction anymore!
-                    }
-                    if (taicpu(p).opcode = A_MOV) and
-                       (taicpu(p).ops = 3) and
-                       (taicpu(p).oper[1]^.typ = top_reg) and
-                       (taicpu(p).oper[2]^.typ = top_shifterop) and
-                       (taicpu(p).oppostfix = PF_NONE) and
-                       GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
-                       MatchInstruction(hp1, [A_ADD, A_ADC, A_RSB, A_RSC, A_SUB, A_SBC,
-                                              A_AND, A_BIC, A_EOR, A_ORR, A_TEQ, A_TST,
-                                              A_CMP, A_CMN],
-                                        [taicpu(p).condition], [PF_None]) and
-                       (not ((GenerateThumb2Code) and
-                             (taicpu(hp1).opcode in [A_SBC]) and
-                             (((taicpu(hp1).ops=3) and
-                               MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[1]^.reg)) or
-                              ((taicpu(hp1).ops=2) and
-                               MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[0]^.reg))))) and
-                       RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) and
-                       (taicpu(hp1).ops >= 2) and
-                       {Currently we can't fold into another shifterop}
-                       (taicpu(hp1).oper[taicpu(hp1).ops-1]^.typ = top_reg) and
-                       {Folding rrx is problematic because of the C-Flag, as we currently can't check
-                        NR_DEFAULTFLAGS for modification}
-                       (
-                         {Everything is fine if we don't use RRX}
-                         (taicpu(p).oper[2]^.shifterop^.shiftmode <> SM_RRX) or
-                         (
-                           {If it is RRX, then check if we're just accessing the next instruction}
-                           GetNextInstruction(p, hp2) and
-                           (hp1 = hp2)
-                         )
-                       ) and
-                       { reg1 might not be modified inbetween }
-                       not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) and
-                       { The shifterop can contain a register, might not be modified}
-                       (
-                         (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) or
-                         not(RegModifiedBetween(taicpu(p).oper[2]^.shifterop^.rs, p, hp1))
-                       ) and
-                       (
-                         {Only ONE of the two src operands is allowed to match}
-                         MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-2]^) xor
-                         MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-1]^)
-                       ) then
-                      begin
-                        if taicpu(hp1).opcode in [A_TST, A_TEQ, A_CMN] then
-                          I2:=0
-                        else
-                          I2:=1;
-                        for I:=I2 to taicpu(hp1).ops-1 do
-                          if MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[I]^.reg) then
+                          if taicpu(p).oper[2]^.shifterop^.shiftimm>=32 then
                             begin
-                              { If the parameter matched on the second op from the RIGHT
-                                we have to switch the parameters, this will not happen for CMP
-                                were we're only evaluating the most right parameter
-                              }
-                              if I <> taicpu(hp1).ops-1 then
-                                begin
-                                  {The SUB operators need to be changed when we swap parameters}
-                                  case taicpu(hp1).opcode of
-                                    A_SUB: tempop:=A_RSB;
-                                    A_SBC: tempop:=A_RSC;
-                                    A_RSB: tempop:=A_SUB;
-                                    A_RSC: tempop:=A_SBC;
-                                    else tempop:=taicpu(hp1).opcode;
-                                  end;
-                                  if taicpu(hp1).ops = 3 then
-                                    hp2:=taicpu.op_reg_reg_reg_shifterop(tempop,
-                                         taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[2]^.reg,
-                                         taicpu(p).oper[1]^.reg, taicpu(p).oper[2]^.shifterop^)
-                                  else
-                                    hp2:=taicpu.op_reg_reg_shifterop(tempop,
-                                         taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
-                                         taicpu(p).oper[2]^.shifterop^);
-                                end
-                              else
-                                if taicpu(hp1).ops = 3 then
-                                  hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hp1).opcode,
-                                       taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg,
-                                       taicpu(p).oper[1]^.reg, taicpu(p).oper[2]^.shifterop^)
-                                else
-                                  hp2:=taicpu.op_reg_reg_shifterop(taicpu(hp1).opcode,
-                                       taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
-                                       taicpu(p).oper[2]^.shifterop^);
-                              if taicpu(p).oper[2]^.shifterop^.rs<>NR_NO then
-                                AllocRegBetween(taicpu(p).oper[2]^.shifterop^.rs,p,hp1,UsedRegs);
-                              AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp1,UsedRegs);
-                              asml.insertbefore(hp2, hp1);
-                              GetNextInstruction(p, hp2);
-                              asml.remove(p);
-                              asml.remove(hp1);
-                              p.free;
-                              hp1.free;
-                              p:=hp2;
-                              DebugMsg('Peephole FoldShiftProcess done', p);
-                              Result:=true;
-                              break;
+                              taicpu(p).freeop(1);
+                              taicpu(p).freeop(2);
+                              taicpu(p).loadconst(1,0);
                             end;
-                      end;
-                    {
-                      Fold
-                        mov r1, r1, lsl #2
-                        ldr/ldrb r0, [r0, r1]
-                      to
-                        ldr/ldrb r0, [r0, r1, lsl #2]
+                          result := true;
+                          Exit;
+                        end;
+                    end
+                  else if not(RegUsedBetween(taicpu(hp2).oper[0]^.reg,hp1,hp2)) then
+                    begin
+                      DebugMsg('Peephole Optimization: ShiftShiftShift2ShiftShift 1b done', p);
 
-                      XXX: This still needs some work, as we quite often encounter something like
-                             mov r1, r2, lsl #2
-                             add r2, r3, #imm
-                             ldr r0, [r2, r1]
-                           which can't be folded because r2 is overwritten between the shift and the ldr.
-                           We could try to shuffle the registers around and fold it into.
-                             add r1, r3, #imm
-                             ldr r0, [r1, r2, lsl #2]
-                    }
-                    if (not(GenerateThumbCode)) and
-                       (taicpu(p).opcode = A_MOV) and
-                       (taicpu(p).ops = 3) and
-                       (taicpu(p).oper[1]^.typ = top_reg) and
-                       (taicpu(p).oper[2]^.typ = top_shifterop) and
-                       { RRX is tough to handle, because it requires tracking the C-Flag,
-                         it is also extremly unlikely to be emitted this way}
-                       (taicpu(p).oper[2]^.shifterop^.shiftmode <> SM_RRX) and
-                       (taicpu(p).oper[2]^.shifterop^.shiftimm <> 0) and
-                       { thumb2 allows only lsl #0..#3 }
-                       (not(GenerateThumb2Code) or
-                        ((taicpu(p).oper[2]^.shifterop^.shiftimm in [0..3]) and
-                         (taicpu(p).oper[2]^.shifterop^.shiftmode=SM_LSL)
-                        )
-                       ) and
-                       (taicpu(p).oppostfix = PF_NONE) and
-                       GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
-                       {Only LDR, LDRB, STR, STRB can handle scaled register indexing}
-                       (MatchInstruction(hp1, [A_LDR, A_STR], [taicpu(p).condition], [PF_None, PF_B]) or
-                        (GenerateThumb2Code and
-                         MatchInstruction(hp1, [A_LDR, A_STR], [taicpu(p).condition], [PF_None, PF_B, PF_SB, PF_H, PF_SH]))
-                       ) and
-                       (
-                         {If this is address by offset, one of the two registers can be used}
-                         ((taicpu(hp1).oper[1]^.ref^.addressmode=AM_OFFSET) and
-                           (
-                             (taicpu(hp1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) xor
-                             (taicpu(hp1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg)
-                           )
-                         ) or
-                         {For post and preindexed only the index register can be used}
-                         ((taicpu(hp1).oper[1]^.ref^.addressmode in [AM_POSTINDEXED, AM_PREINDEXED]) and
-                           (
-                             (taicpu(hp1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) and
-                             (taicpu(hp1).oper[1]^.ref^.base <> taicpu(p).oper[0]^.reg)
-                           ) and
-                           (not GenerateThumb2Code)
-                         )
-                       ) and
-                       { Only fold if both registers are used. Otherwise we are folding p with itself }
-                       (taicpu(hp1).oper[1]^.ref^.index<>NR_NO) and
-                       (taicpu(hp1).oper[1]^.ref^.base<>NR_NO) and
-                       { Only fold if there isn't another shifterop already, and offset is zero. }
-                       (taicpu(hp1).oper[1]^.ref^.offset = 0) and
-                       (taicpu(hp1).oper[1]^.ref^.shiftmode = SM_None) and
-                       not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) and
-                       RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) then
-                       begin
-                         { If the register we want to do the shift for resides in base, we need to swap that}
-                         if (taicpu(hp1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) then
-                           taicpu(hp1).oper[1]^.ref^.base := taicpu(hp1).oper[1]^.ref^.index;
-                         taicpu(hp1).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg;
-                         taicpu(hp1).oper[1]^.ref^.shiftmode := taicpu(p).oper[2]^.shifterop^.shiftmode;
-                         taicpu(hp1).oper[1]^.ref^.shiftimm := taicpu(p).oper[2]^.shifterop^.shiftimm;
-                         DebugMsg('Peephole FoldShiftLdrStr done', hp1);
-                         GetNextInstruction(p, hp1);
-                         asml.remove(p);
-                         p.free;
-                         p:=hp1;
-                         Result:=true;
-                       end;
-                    {
-                      Often we see shifts and then a superfluous mov to another register
-                      In the future this might be handled in RedundantMovProcess when it uses RegisterTracking
-                    }
-                    if (taicpu(p).opcode = A_MOV) and
-                       GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
-                       RemoveSuperfluousMove(p, hp1, 'MovMov2Mov') then
-                      Result:=true;
-                  end;
-                A_ADD,
-                A_ADC,
-                A_RSB,
-                A_RSC,
-                A_SUB,
-                A_SBC,
-                A_BIC,
-                A_EOR,
-                A_ORR,
-                A_MLA,
-                A_MLS,
-                A_MUL,
-                A_QADD,A_QADD16,A_QADD8,
-                A_QSUB,A_QSUB16,A_QSUB8,
-                A_QDADD,A_QDSUB,A_QASX,A_QSAX,
-                A_SHADD16,A_SHADD8,A_UHADD16,A_UHADD8,
-                A_SHSUB16,A_SHSUB8,A_UHSUB16,A_UHSUB8,
-                A_PKHTB,A_PKHBT,
-                A_SMUAD,A_SMUSD:
-                  begin
-                    {
-                      change
-                      add/sub reg2,reg1,const1
-                      str/ldr reg3,[reg2,const2]
-                      dealloc reg2
-                      to
-                      str/ldr reg3,[reg1,const2+/-const1]
-                    }
-                    if (not GenerateThumbCode) and
-                       (taicpu(p).opcode in [A_ADD,A_SUB]) and
-                       (taicpu(p).ops>2) and
-                       (taicpu(p).oper[1]^.typ = top_reg) and
-                       (taicpu(p).oper[2]^.typ = top_const) then
-                      begin
-                        hp1:=p;
-                        while GetNextInstructionUsingReg(hp1, hp1, taicpu(p).oper[0]^.reg) and
-                          { we cannot check NR_DEFAULTFLAGS for modification yet so don't allow a condition }
-                          MatchInstruction(hp1, [A_LDR, A_STR], [C_None], []) and
-                          (taicpu(hp1).oper[1]^.typ = top_ref) and
-                          (taicpu(hp1).oper[1]^.ref^.base=taicpu(p).oper[0]^.reg) and
-                          { don't optimize if the register is stored/overwritten }
-                          (taicpu(hp1).oper[0]^.reg<>taicpu(p).oper[1]^.reg) and
-                          (taicpu(hp1).oper[1]^.ref^.index=NR_NO) and
-                          (taicpu(hp1).oper[1]^.ref^.addressmode=AM_OFFSET) and
-                          { new offset must be valid: either in the range of 8 or 12 bit, depend on the
-                            ldr postfix }
-                          (((taicpu(p).opcode=A_ADD) and
-                           isValidConstLoadStoreOffset(taicpu(hp1).oper[1]^.ref^.offset+taicpu(p).oper[2]^.val, taicpu(hp1).oppostfix)
-                           ) or
-                           ((taicpu(p).opcode=A_SUB) and
-                            isValidConstLoadStoreOffset(taicpu(hp1).oper[1]^.ref^.offset-taicpu(p).oper[2]^.val, taicpu(hp1).oppostfix)
-                           )
-                          ) do
-                          begin
-                            { neither reg1 nor reg2 might be changed inbetween }
-                            if RegModifiedBetween(taicpu(p).oper[0]^.reg,p,hp1) or
-                              RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1) then
-                              break;
-                            { reg2 must be either overwritten by the ldr or it is deallocated afterwards }
-                            if ((taicpu(hp1).opcode=A_LDR) and (taicpu(p).oper[0]^.reg=taicpu(hp1).oper[0]^.reg)) or
-                              assigned(FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(hp1.Next))) then
-                              begin
-                                { remember last instruction }
-                                hp2:=hp1;
-                                DebugMsg('Peephole Add/SubLdr2Ldr done', p);
-                                hp1:=p;
-                                { fix all ldr/str }
-                                while GetNextInstructionUsingReg(hp1, hp1, taicpu(p).oper[0]^.reg) do
-                                  begin
-                                    taicpu(hp1).oper[1]^.ref^.base:=taicpu(p).oper[1]^.reg;
-                                    if taicpu(p).opcode=A_ADD then
-                                      inc(taicpu(hp1).oper[1]^.ref^.offset,taicpu(p).oper[2]^.val)
-                                    else
-                                      dec(taicpu(hp1).oper[1]^.ref^.offset,taicpu(p).oper[2]^.val);
-                                    if hp1=hp2 then
-                                      break;
-                                  end;
-                                GetNextInstruction(p,hp1);
-                                asml.remove(p);
-                                p.free;
-                                p:=hp1;
-                                result:=true;
-                                break;
-                              end;
-                          end;
-                      end;
-                    {
-                      change
-                      add reg1, ...
-                      mov reg2, reg1
-                      to
-                      add reg2, ...
-                    }
-                    if GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
-                       (taicpu(p).ops>=3) and
-                       RemoveSuperfluousMove(p, hp1, 'DataMov2Data') then
-                      Result:=true;
+                      dec(taicpu(hp1).oper[2]^.shifterop^.shiftimm,taicpu(hp2).oper[2]^.shifterop^.shiftimm);
+                      taicpu(hp1).oper[0]^.reg:=taicpu(hp2).oper[0]^.reg;
+                      asml.remove(hp2);
+                      hp2.free;
+                      result := true;
+                      Exit;
+                    end;
+                end
+              { mov reg1,reg0, lsr/asr imm1
+                mov reg1,reg1, lsl imm2
+                mov reg1,reg1, lsr/asr imm3 ...
 
-                    if MatchInstruction(p, [A_ADD,A_SUB], [C_None], [PF_None]) and
-                      LookForPreindexedPattern(taicpu(p)) then
-                      begin
-                        GetNextInstruction(p,hp1);
-                        DebugMsg('Peephole Add/Sub to Preindexed done', p);
-                        asml.remove(p);
-                        p.free;
-                        p:=hp1;
-                        Result:=true;
-                      end;
-                    {
-                     Turn
-                     mul reg0, z,w
-                     sub/add x, y, reg0
-                     dealloc reg0
+                if imm3>=imm1 and imm2>=imm1
+                to
+                mov reg1,reg0, lsl imm2-imm1
+                mov reg1,reg1, lsr/asr imm3 ...
+              }
+              else if (taicpu(p).oper[2]^.shifterop^.shiftmode in [SM_ASR,SM_LSR]) and (taicpu(hp2).oper[2]^.shifterop^.shiftmode in [SM_ASR,SM_LSR]) and
+                (taicpu(hp1).oper[2]^.shifterop^.shiftmode=SM_LSL) and
+                (taicpu(hp2).oper[2]^.shifterop^.shiftimm>=taicpu(p).oper[2]^.shifterop^.shiftimm) and
+                (taicpu(hp1).oper[2]^.shifterop^.shiftimm>=taicpu(p).oper[2]^.shifterop^.shiftimm) then
+                begin
+                  dec(taicpu(hp1).oper[2]^.shifterop^.shiftimm,taicpu(p).oper[2]^.shifterop^.shiftimm);
+                  taicpu(hp1).oper[1]^.reg:=taicpu(p).oper[1]^.reg;
+                  DebugMsg('Peephole Optimization: ShiftShiftShift2ShiftShift 2 done', p);
+                  if taicpu(hp1).oper[2]^.shifterop^.shiftimm=0 then
+                    begin
+                      taicpu(hp2).oper[1]^.reg:=taicpu(hp1).oper[1]^.reg;
+                      asml.remove(hp1);
+                      hp1.free;
+                    end;
 
-                     into
+                  RemoveCurrentp(p);
+                  result := true;
+                  Exit;
+                end;
+            end;
+        end;
 
-                     mls/mla x,z,w,y
-                     }
-                    if MatchInstruction(p, [A_MUL], [C_None], [PF_None]) and
-                      (taicpu(p).ops=3) and
-                      (taicpu(p).oper[0]^.typ = top_reg) and
-                      (taicpu(p).oper[1]^.typ = top_reg) and
-                      (taicpu(p).oper[2]^.typ = top_reg) and
-                      GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
-                      MatchInstruction(hp1,[A_ADD,A_SUB],[C_None],[PF_None]) and
-                      (not RegModifiedBetween(taicpu(p).oper[1]^.reg, p, hp1)) and
-                      (not RegModifiedBetween(taicpu(p).oper[2]^.reg, p, hp1)) and
+      { All the optimisations from this point on require GetNextInstructionUsingReg
+        to return True }
+      if not (
+        GetNextInstructionUsingReg(p, hpfar1, taicpu(p).oper[0]^.reg) and
+        (hpfar1.typ = ait_instruction)
+      ) then
+        Exit;
 
-                      (((taicpu(hp1).opcode=A_ADD) and (current_settings.cputype>=cpu_armv4)) or
-                       ((taicpu(hp1).opcode=A_SUB) and (current_settings.cputype in [cpu_armv6t2,cpu_armv7,cpu_armv7a,cpu_armv7r,cpu_armv7m,cpu_armv7em]))) and
+      { Change the common
+        mov r0, r0, lsr #xxx
+        and r0, r0, #yyy/bic r0, r0, #xxx
 
-                      // CPUs before ARMv6 don't recommend having the same Rd and Rm for MLA.
-                      // TODO: A workaround would be to swap Rm and Rs
-                      (not ((taicpu(hp1).opcode=A_ADD) and (current_settings.cputype<=cpu_armv6) and MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^))) and
+        and remove the superfluous and/bic if possible
 
-                      (((taicpu(hp1).ops=3) and
-                        (taicpu(hp1).oper[2]^.typ=top_reg) and
-                        ((MatchOperand(taicpu(hp1).oper[2]^, taicpu(p).oper[0]^.reg) and
-                          (not RegModifiedBetween(taicpu(hp1).oper[1]^.reg, p, hp1))) or
-                         ((MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
-                           (taicpu(hp1).opcode=A_ADD) and
-                           (not RegModifiedBetween(taicpu(hp1).oper[2]^.reg, p, hp1)))))) or
-                       ((taicpu(hp1).ops=2) and
-                        (taicpu(hp1).oper[1]^.typ=top_reg) and
-                        MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg))) and
-                      (RegEndOfLife(taicpu(p).oper[0]^.reg,taicpu(hp1))) then
-                      begin
-                        if taicpu(hp1).opcode=A_ADD then
-                          begin
-                            taicpu(hp1).opcode:=A_MLA;
+        This could be extended to handle more cases.
+      }
 
-                            if taicpu(hp1).ops=3 then
-                              begin
-                                if MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^) then
-                                  oldreg:=taicpu(hp1).oper[2]^.reg
-                                else
-                                  oldreg:=taicpu(hp1).oper[1]^.reg;
-                              end
-                            else
-                              oldreg:=taicpu(hp1).oper[0]^.reg;
+      { Change
+        mov rx, ry, lsr/ror #xxx
+        uxtb/uxth rz,rx/and rz,rx,0xFF
+        dealloc rx
 
-                            taicpu(hp1).loadreg(1,taicpu(p).oper[1]^.reg);
-                            taicpu(hp1).loadreg(2,taicpu(p).oper[2]^.reg);
-                            taicpu(hp1).loadreg(3,oldreg);
+        to
 
-                            DebugMsg('MulAdd2MLA done', p);
+        uxtb/uxth rz,ry,ror #xxx
+      }
+      if (GenerateThumb2Code) and
+         (taicpu(p).ops=3) and
+         (taicpu(p).oper[2]^.typ = top_shifterop) and
+         (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and
+         (taicpu(p).oper[2]^.shifterop^.shiftmode in [SM_LSR,SM_ROR]) and
+         RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then
+         begin
+           if MatchInstruction(hpfar1, A_UXTB, [C_None], [PF_None]) and
+             (taicpu(hpfar1).ops = 2) and
+             (taicpu(p).oper[2]^.shifterop^.shiftimm in [8,16,24]) and
+             MatchOperand(taicpu(hpfar1).oper[1]^, taicpu(p).oper[0]^.reg) then
+             begin
+               taicpu(hpfar1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
+               taicpu(hpfar1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
+               taicpu(hpfar1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
+               taicpu(hpfar1).ops := 3;
 
-                            taicpu(hp1).ops:=4;
+               if not Assigned(hp1) then
+                 GetNextInstruction(p,hp1);
 
-                            asml.remove(p);
-                            p.free;
-                            p:=hp1;
-                          end
-                        else
-                          begin
-                            taicpu(hp1).opcode:=A_MLS;
+               RemoveCurrentP(p, hp1);
 
+               result:=true;
+               exit;
+             end
+           else if MatchInstruction(hpfar1, A_UXTH, [C_None], [PF_None]) and
+             (taicpu(hpfar1).ops=2) and
+             (taicpu(p).oper[2]^.shifterop^.shiftimm in [16]) and
+             MatchOperand(taicpu(hpfar1).oper[1]^, taicpu(p).oper[0]^.reg) then
+             begin
+               taicpu(hpfar1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
+               taicpu(hpfar1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
+               taicpu(hpfar1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
+               taicpu(hpfar1).ops := 3;
 
-                            taicpu(hp1).loadreg(3,taicpu(hp1).oper[1]^.reg);
+               if not Assigned(hp1) then
+                 GetNextInstruction(p,hp1);
 
-                            if taicpu(hp1).ops=2 then
-                              taicpu(hp1).loadreg(1,taicpu(hp1).oper[0]^.reg)
-                            else
-                              taicpu(hp1).loadreg(1,taicpu(p).oper[2]^.reg);
+               RemoveCurrentP(p, hp1);
 
-                            taicpu(hp1).loadreg(2,taicpu(p).oper[1]^.reg);
+               result:=true;
+               exit;
+             end
+           else if MatchInstruction(hpfar1, A_AND, [C_None], [PF_None]) and
+             (taicpu(hpfar1).ops = 3) and
+             (taicpu(hpfar1).oper[2]^.typ = top_const) and
+             (taicpu(hpfar1).oper[2]^.val = $FF) and
+             (taicpu(p).oper[2]^.shifterop^.shiftimm in [8,16,24]) and
+             MatchOperand(taicpu(hpfar1).oper[1]^, taicpu(p).oper[0]^.reg) then
+             begin
+               taicpu(hpfar1).ops := 3;
+               taicpu(hpfar1).opcode := A_UXTB;
+               taicpu(hpfar1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
+               taicpu(hpfar1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
+               taicpu(hpfar1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
 
-                            DebugMsg('MulSub2MLS done', p);
-                            AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,UsedRegs);
-                            AllocRegBetween(taicpu(hp1).oper[2]^.reg,p,hp1,UsedRegs);
-                            AllocRegBetween(taicpu(hp1).oper[3]^.reg,p,hp1,UsedRegs);
+               if not Assigned(hp1) then
+                 GetNextInstruction(p,hp1);
 
-                            taicpu(hp1).ops:=4;
-                            RemoveCurrentP(p, hp1); // <-- Is this actually safe? hp1 is not necessarily the next instruction. [Kit]
-                          end;
+               RemoveCurrentP(p, hp1);
 
-                        result:=true;
-                      end
-                  end;
-{$ifdef dummy}
-                A_MVN:
-                  begin
-                    {
-                      change
-                      mvn reg2,reg1
-                      and reg3,reg4,reg2
-                      dealloc reg2
-                      to
-                      bic reg3,reg4,reg1
-                    }
-                    if (taicpu(p).oper[1]^.typ = top_reg) and
-                      GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
-                      MatchInstruction(hp1,A_AND,[],[]) and
-                      (((taicpu(hp1).ops=3) and
-                        (taicpu(hp1).oper[2]^.typ=top_reg) and
-                        (MatchOperand(taicpu(hp1).oper[2]^, taicpu(p).oper[0]^.reg) or
-                         MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg))) or
-                       ((taicpu(hp1).ops=2) and
-                        (taicpu(hp1).oper[1]^.typ=top_reg) and
-                        MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg))) and
-                      assigned(FindRegDealloc(taicpu(p).oper[0]^.reg,tai(hp1.Next))) and
-                      { reg1 might not be modified inbetween }
-                      not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
-                      begin
-                        DebugMsg('Peephole MvnAnd2Bic done', p);
-                        taicpu(hp1).opcode:=A_BIC;
+               result:=true;
+               exit;
+             end;
+         end;
 
-                        if taicpu(hp1).ops=3 then
-                          begin
-                            if MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) then
-                              taicpu(hp1).loadReg(1,taicpu(hp1).oper[2]^.reg); // Swap operands
+      { 2-operand mov optimisations }
+      if (taicpu(p).ops = 2) then
+        begin
+          {
+            This removes the mul from
+            mov rX,0
+            ...
+            mul ...,rX,...
+          }
+          if (taicpu(p).oper[1]^.typ = top_const) then
+            begin
+(*          if false and
+            (taicpu(p).oper[1]^.val=0) and
+            MatchInstruction(hpfar1, [A_MUL,A_MLA], [taicpu(p).condition], [taicpu(p).oppostfix]) and
+            (((taicpu(hpfar1).oper[1]^.typ=top_reg) and MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^)) or
+             ((taicpu(hpfar1).oper[2]^.typ=top_reg) and MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[2]^))) then
+              begin
+                TransferUsedRegs(TmpUsedRegs);
+                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
+                UpdateUsedRegs(TmpUsedRegs, tai(hpfar1.next));
+                DebugMsg('Peephole Optimization: MovMUL/MLA2Mov0 done', p);
+                if taicpu(hpfar1).opcode=A_MUL then
+                  taicpu(hpfar1).loadconst(1,0)
+                else
+                  taicpu(hpfar1).loadreg(1,taicpu(hpfar1).oper[3]^.reg);
+                taicpu(hpfar1).ops:=2;
+                taicpu(hpfar1).opcode:=A_MOV;
+                if not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg,hpfar1,TmpUsedRegs)) then
+                  RemoveCurrentP(p);
+                Result:=true;
+                exit;
+              end
+          else*) if (taicpu(p).oper[1]^.val=0) and
+              MatchInstruction(hpfar1, A_MLA, [taicpu(p).condition], [taicpu(p).oppostfix]) and
+              MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[3]^) then
+                begin
+                  TransferUsedRegs(TmpUsedRegs);
+                  UpdateUsedRegs(TmpUsedRegs, tai(p.next));
+                  UpdateUsedRegs(TmpUsedRegs, tai(hpfar1.next));
+                  DebugMsg('Peephole Optimization: MovMLA2MUL 1 done', p);
+                  taicpu(hpfar1).ops:=3;
+                  taicpu(hpfar1).opcode:=A_MUL;
+                  if not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg,hpfar1,TmpUsedRegs)) then
+                    begin
+                      RemoveCurrentP(p);
+                      Result:=true;
+                    end;
+                  exit;
+                end
+            {
+              This changes the very common
+              mov r0, #0
+              str r0, [...]
+              mov r0, #0
+              str r0, [...]
 
-                            taicpu(hp1).loadReg(2,taicpu(p).oper[1]^.reg);
-                          end
-                        else
-                          taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
-                        GetNextInstruction(p, hp1);
-                        asml.remove(p);
-                        p.free;
-                        p:=hp1;
-                      end;
-                  end;
-{$endif dummy}
-                A_UXTB:
-                  Result:=OptPass1UXTB(p);
-                A_UXTH:
-                  Result:=OptPass1UXTH(p);
-                A_SXTB:
-                  Result:=OptPass1SXTB(p);
-                A_SXTH:
-                  Result:=OptPass1SXTH(p);
-                A_CMP:
+              and removes all superfluous mov instructions
+            }
+            else if (taicpu(hpfar1).opcode=A_STR) then
+              begin
+                hp1 := hpfar1;
+                while MatchInstruction(hp1, A_STR, [taicpu(p).condition], []) and
+                      MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[0]^) and
+                      GetNextInstruction(hp1, hp2) and
+                      MatchInstruction(hp2, A_MOV, [taicpu(p).condition], [PF_None]) and
+                      (taicpu(hp2).ops = 2) and
+                      MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[0]^) and
+                      MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^) do
                   begin
-                    {
-                      change
-                      cmp   reg,const1
-                      moveq reg,const1
-                      movne reg,const2
-                      to
-                      cmp   reg,const1
-                      movne reg,const2
-                    }
-                    if (taicpu(p).oper[1]^.typ = top_const) and
-                       GetNextInstruction(p, hp1) and
-                       MatchInstruction(hp1, A_MOV, [C_EQ, C_NE], [PF_NONE]) and
-                       (taicpu(hp1).oper[1]^.typ = top_const) and
-                       GetNextInstruction(hp1, hp2) and
-                       MatchInstruction(hp2, A_MOV, [C_EQ, C_NE], [PF_NONE]) and
-                       (taicpu(hp1).oper[1]^.typ = top_const) then
-                      begin
-                        Result:=RemoveRedundantMove(p, hp1, asml) or Result;
-                        Result:=RemoveRedundantMove(p, hp2, asml) or Result;
-                      end;
+                    DebugMsg('Peephole Optimization: MovStrMov done', hp2);
+                    GetNextInstruction(hp2,hp1);
+                    asml.remove(hp2);
+                    hp2.free;
+                    result:=true;
+                    if not assigned(hp1) then break;
                   end;
-                A_STM:
-                  begin
-                    {
-                      change
-	              stmfd	r13!,[r14]
-	              sub	r13,r13,#4
-	              bl	abc
-	              add	r13,r13,#4
-	              ldmfd	r13!,[r15]
-                      into
-                      b         abc
-                    }
-                    if not(ts_thumb_interworking in current_settings.targetswitches) and
-                       MatchInstruction(p, A_STM, [C_None], [PF_FD]) and
-                      GetNextInstruction(p, hp1) and
-                      GetNextInstruction(hp1, hp2) and
-                      SkipEntryExitMarker(hp2, hp2) and
-                      GetNextInstruction(hp2, hp3) and
-                      SkipEntryExitMarker(hp3, hp3) and
-                      GetNextInstruction(hp3, hp4) and
-                      (taicpu(p).oper[0]^.typ = top_ref) and
-                      (taicpu(p).oper[0]^.ref^.index=NR_STACK_POINTER_REG) and
-                      (taicpu(p).oper[0]^.ref^.base=NR_NO) and
-                      (taicpu(p).oper[0]^.ref^.offset=0) and
-                      (taicpu(p).oper[0]^.ref^.addressmode=AM_PREINDEXED) and
-                      (taicpu(p).oper[1]^.typ = top_regset) and
-                      (taicpu(p).oper[1]^.regset^ = [RS_R14]) and
 
-                      MatchInstruction(hp1, A_SUB, [C_None], [PF_NONE]) and
-                      (taicpu(hp1).oper[0]^.typ = top_reg) and
-                      (taicpu(hp1).oper[0]^.reg = NR_STACK_POINTER_REG) and
-                      MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp1).oper[1]^) and
-                      (taicpu(hp1).oper[2]^.typ = top_const) and
+                if Result then
+                  Exit;
+              end;
+            end;
+          {
+            This removes the first mov from
+            mov rX,...
+            mov rX,...
+          }
+          if taicpu(hpfar1).opcode=A_MOV then
+            begin
+              hp1 := p;
+              while MatchInstruction(hpfar1, A_MOV, [taicpu(hp1).condition], [taicpu(hp1).oppostfix]) and
+                    (taicpu(hpfar1).ops = 2) and
+                    MatchOperand(taicpu(hp1).oper[0]^, taicpu(hpfar1).oper[0]^) and
+                    { don't remove the first mov if the second is a mov rX,rX }
+                    not(MatchOperand(taicpu(hpfar1).oper[0]^, taicpu(hpfar1).oper[1]^)) do
+                begin
+                  { Defer removing the first p until after the while loop }
+                  if p <> hp1 then
+                    begin
+                      DebugMsg('Peephole Optimization: MovMov done', hp1);
+                      asml.remove(hp1);
+                      hp1.free;
+                    end;
+                  hp1:=hpfar1;
+                  GetNextInstruction(hpfar1,hpfar1);
+                  result:=true;
+                  if not assigned(hpfar1) then
+                    Break;
+                end;
 
-                      MatchInstruction(hp3, A_ADD, [C_None], [PF_NONE]) and
-                      MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp3).oper[0]^) and
-                      MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp3).oper[1]^) and
-                      MatchOperand(taicpu(hp1).oper[2]^,taicpu(hp3).oper[2]^) and
+              if Result then
+                begin
+                  DebugMsg('Peephole Optimization: MovMov done', p);
+                  RemoveCurrentp(p);
+                  Exit;
+                end;
+            end;
 
-                      MatchInstruction(hp2, [A_BL,A_BLX], [C_None], [PF_NONE]) and
-                      (taicpu(hp2).oper[0]^.typ = top_ref) and
+          if RedundantMovProcess(p,hpfar1) then
+            begin
+              Result:=true;
+              { p might not point at a mov anymore }
+              exit;
+            end;
 
-                      MatchInstruction(hp4, A_LDM, [C_None], [PF_FD]) and
-                      MatchOperand(taicpu(p).oper[0]^,taicpu(hp4).oper[0]^) and
-                      (taicpu(hp4).oper[1]^.typ = top_regset) and
-                      (taicpu(hp4).oper[1]^.regset^ = [RS_R15]) then
-                      begin
-                        asml.Remove(p);
-                        asml.Remove(hp1);
-                        asml.Remove(hp3);
-                        asml.Remove(hp4);
-                        taicpu(hp2).opcode:=A_B;
-                        p.free;
-                        hp1.free;
-                        hp3.free;
-                        hp4.free;
-                        p:=hp2;
-                        DebugMsg('Peephole Bl2B done', p);
+          { Fold the very common sequence
+              mov  regA, regB
+              ldr* regA, [regA]
+            to
+              ldr* regA, [regB]
+            CAUTION! If this one is successful p might not be a mov instruction anymore!
+          }
+          if
+             // Make sure that Thumb code doesn't propagate a high register into a reference
+             (
+               (
+                 GenerateThumbCode and
+                 (getsupreg(taicpu(p).oper[1]^.reg) < RS_R8)
+               ) or (not GenerateThumbCode)
+             ) and
+             (taicpu(p).oper[1]^.typ = top_reg) and
+             (taicpu(p).oppostfix = PF_NONE) and
+             MatchInstruction(hpfar1, [A_LDR, A_STR], [taicpu(p).condition], []) and
+             (taicpu(hpfar1).oper[1]^.typ = top_ref) and
+             { We can change the base register only when the instruction uses AM_OFFSET }
+             ((taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) or
+               ((taicpu(hpfar1).oper[1]^.ref^.addressmode = AM_OFFSET) and
+                (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg))
+             ) and
+             not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hpfar1)) and
+             RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then
+            begin
+              DebugMsg('Peephole Optimization: MovLdr2Ldr done', hpfar1);
+              if (taicpu(hpfar1).oper[1]^.ref^.addressmode = AM_OFFSET) and
+                 (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) then
+                taicpu(hpfar1).oper[1]^.ref^.base := taicpu(p).oper[1]^.reg;
+
+              if taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg then
+                taicpu(hpfar1).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg;
+
+              dealloc:=FindRegDeAlloc(taicpu(p).oper[1]^.reg, tai(p.Next));
+              if Assigned(dealloc) then
+                begin
+                  asml.remove(dealloc);
+                  asml.InsertAfter(dealloc,hpfar1);
+                end;
+
+              if not Assigned(hp1) then
+                GetNextInstruction(p, hp1);
+
+              RemoveCurrentP(p, hp1);
+
+              result:=true;
+              Exit;
+            end
+        end
+
+      { 3-operand mov optimisations }
+      else if (taicpu(p).ops = 3) then
+        begin
+
+          if (taicpu(p).oper[2]^.typ = top_shifterop) and
+            (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and
+            (taicpu(p).oper[2]^.shifterop^.shiftmode = SM_LSR) and
+            (taicpu(hpfar1).ops>=1) and
+            (taicpu(hpfar1).oper[0]^.typ=top_reg) and
+            (not RegModifiedBetween(taicpu(hpfar1).oper[0]^.reg, p, hpfar1)) and
+            RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then
+            begin
+              if (taicpu(p).oper[2]^.shifterop^.shiftimm >= 24 ) and
+                MatchInstruction(hpfar1, A_AND, [taicpu(p).condition], [taicpu(p).oppostfix]) and
+                (taicpu(hpfar1).ops=3) and
+                MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^) and
+                (taicpu(hpfar1).oper[2]^.typ = top_const) and
+                { Check if the AND actually would only mask out bits being already zero because of the shift
+                }
+                ((($ffffffff shr taicpu(p).oper[2]^.shifterop^.shiftimm) and taicpu(hpfar1).oper[2]^.val) =
+                  ($ffffffff shr taicpu(p).oper[2]^.shifterop^.shiftimm)) then
+                begin
+                  DebugMsg('Peephole Optimization: LsrAnd2Lsr done', hpfar1);
+                  taicpu(p).oper[0]^.reg:=taicpu(hpfar1).oper[0]^.reg;
+                  asml.remove(hpfar1);
+                  hpfar1.free;
+                  result:=true;
+                  Exit;
+                end
+              else if MatchInstruction(hpfar1, A_BIC, [taicpu(p).condition], [taicpu(p).oppostfix]) and
+                (taicpu(hpfar1).ops=3) and
+                MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^) and
+                (taicpu(hpfar1).oper[2]^.typ = top_const) and
+                { Check if the BIC actually would only mask out bits being already zero because of the shift }
+                (taicpu(hpfar1).oper[2]^.val<>0) and
+                (BsfDWord(taicpu(hpfar1).oper[2]^.val)>=32-taicpu(p).oper[2]^.shifterop^.shiftimm) then
+                begin
+                  DebugMsg('Peephole Optimization: LsrBic2Lsr done', hpfar1);
+                  taicpu(p).oper[0]^.reg:=taicpu(hpfar1).oper[0]^.reg;
+                  asml.remove(hpfar1);
+                  hpfar1.free;
+                  result:=true;
+                  Exit;
+                end;
+            end;
+          { This folds shifterops into following instructions
+            mov r0, r1, lsl #8
+            add r2, r3, r0
+
+            to
+
+            add r2, r3, r1, lsl #8
+            CAUTION! If this one is successful p might not be a mov instruction anymore!
+          }
+          if (taicpu(p).oper[1]^.typ = top_reg) and
+           (taicpu(p).oper[2]^.typ = top_shifterop) and
+           (taicpu(p).oppostfix = PF_NONE) and
+           MatchInstruction(hpfar1, [A_ADD, A_ADC, A_RSB, A_RSC, A_SUB, A_SBC,
+                                  A_AND, A_BIC, A_EOR, A_ORR, A_TEQ, A_TST,
+                                  A_CMP, A_CMN],
+                            [taicpu(p).condition], [PF_None]) and
+           (not ((GenerateThumb2Code) and
+                 (taicpu(hpfar1).opcode in [A_SBC]) and
+                 (((taicpu(hpfar1).ops=3) and
+                   MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^.reg)) or
+                  ((taicpu(hpfar1).ops=2) and
+                   MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[0]^.reg))))) and
+           RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) and
+           (taicpu(hpfar1).ops >= 2) and
+           {Currently we can't fold into another shifterop}
+           (taicpu(hpfar1).oper[taicpu(hpfar1).ops-1]^.typ = top_reg) and
+           {Folding rrx is problematic because of the C-Flag, as we currently can't check
+            NR_DEFAULTFLAGS for modification}
+           (
+             {Everything is fine if we don't use RRX}
+             (taicpu(p).oper[2]^.shifterop^.shiftmode <> SM_RRX) or
+             (
+               {If it is RRX, then check if we're just accessing the next instruction}
+               Assigned(hp1) and
+               (hpfar1 = hp1)
+             )
+           ) and
+           { reg1 might not be modified inbetween }
+           not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hpfar1)) and
+           { The shifterop can contain a register, might not be modified}
+           (
+             (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) or
+             not(RegModifiedBetween(taicpu(p).oper[2]^.shifterop^.rs, p, hpfar1))
+           ) and
+           (
+             {Only ONE of the two src operands is allowed to match}
+             MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[taicpu(hpfar1).ops-2]^) xor
+             MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[taicpu(hpfar1).ops-1]^)
+           ) then
+          begin
+            if taicpu(hpfar1).opcode in [A_TST, A_TEQ, A_CMN] then
+              I2:=0
+            else
+              I2:=1;
+            for I:=I2 to taicpu(hpfar1).ops-1 do
+              if MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[I]^.reg) then
+                begin
+                  { If the parameter matched on the second op from the RIGHT
+                    we have to switch the parameters, this will not happen for CMP
+                    where we're only evaluating the rightmost parameter
+                  }
+                  if I <> taicpu(hpfar1).ops-1 then
+                    begin
+                      {The SUB operators need to be changed when we swap parameters}
+                      case taicpu(hpfar1).opcode of
+                        A_SUB: tempop:=A_RSB;
+                        A_SBC: tempop:=A_RSC;
+                        A_RSB: tempop:=A_SUB;
+                        A_RSC: tempop:=A_SBC;
+                        else tempop:=taicpu(hpfar1).opcode;
                       end;
-                  end;
-                A_VMOV:
-                  begin
-                    {
-                      change
-                      vmov reg0,reg1,reg2
-                      vmov reg1,reg2,reg0
-                      into
-                      vmov reg0,reg1,reg2
+                      if taicpu(hpfar1).ops = 3 then
+                        hp2:=taicpu.op_reg_reg_reg_shifterop(tempop,
+                             taicpu(hpfar1).oper[0]^.reg, taicpu(hpfar1).oper[2]^.reg,
+                             taicpu(p).oper[1]^.reg, taicpu(p).oper[2]^.shifterop^)
+                      else
+                        hp2:=taicpu.op_reg_reg_shifterop(tempop,
+                             taicpu(hpfar1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
+                             taicpu(p).oper[2]^.shifterop^);
+                    end
+                  else
+                    if taicpu(hpfar1).ops = 3 then
+                      hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hpfar1).opcode,
+                           taicpu(hpfar1).oper[0]^.reg, taicpu(hpfar1).oper[1]^.reg,
+                           taicpu(p).oper[1]^.reg, taicpu(p).oper[2]^.shifterop^)
+                    else
+                      hp2:=taicpu.op_reg_reg_shifterop(taicpu(hpfar1).opcode,
+                           taicpu(hpfar1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
+                           taicpu(p).oper[2]^.shifterop^);
+                  if taicpu(p).oper[2]^.shifterop^.rs<>NR_NO then
+                    AllocRegBetween(taicpu(p).oper[2]^.shifterop^.rs,p,hpfar1,UsedRegs);
+                  AllocRegBetween(taicpu(p).oper[1]^.reg,p,hpfar1,UsedRegs);
+                  asml.insertbefore(hp2, hpfar1);
+                  asml.remove(hpfar1);
+                  hpfar1.free;
+                  DebugMsg('Peephole Optimization: FoldShiftProcess done', hp2);
 
-                      can be applied regardless if reg0 or reg2 is the vfp register
-                    }
-                    if (taicpu(p).ops = 3) and
-                      GetNextInstruction(p, hp1) and
-                      MatchInstruction(hp1, A_VMOV, [taicpu(p).condition], [taicpu(p).oppostfix]) and
-                      (taicpu(hp1).ops = 3) and
-                      MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[2]^) and
-                      MatchOperand(taicpu(p).oper[1]^, taicpu(hp1).oper[0]^) and
-                      MatchOperand(taicpu(p).oper[2]^, taicpu(hp1).oper[1]^) then
-                      begin
-                        asml.Remove(hp1);
-                        hp1.free;
-                        DebugMsg('Peephole VMovVMov2VMov done', p);
-                      end;
-                  end;
-                A_AND:
-                  Result:=OptPass1And(p);
-                A_VLDR,
-                A_VADD,
-                A_VMUL,
-                A_VDIV,
-                A_VSUB,
-                A_VSQRT,
-                A_VNEG,
-                A_VCVT,
-                A_VABS:
-                  begin
-                    if GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
-                      RemoveSuperfluousVMov(p, hp1, 'VOpVMov2VOp') then
-                      Result:=true;
-                  end
-                else
-                  ;
-              end;
+                  if not Assigned(hp1) then
+                    GetNextInstruction(p, hp1)
+                  else if hp1 = hpfar1 then
+                    { If hp1 = hpfar1, then it's a dangling pointer }
+                    hp1 := hp2;
+
+                  RemoveCurrentP(p, hp1);
+                  Result:=true;
+                  Exit;
+                end;
           end;
-        else
-          ;
-      end;
+        {
+          Fold
+            mov r1, r1, lsl #2
+            ldr/ldrb r0, [r0, r1]
+          to
+            ldr/ldrb r0, [r0, r1, lsl #2]
+
+          XXX: This still needs some work, as we quite often encounter something like
+                 mov r1, r2, lsl #2
+                 add r2, r3, #imm
+                 ldr r0, [r2, r1]
+               which can't be folded because r2 is overwritten between the shift and the ldr.
+               We could try to shuffle the registers around and fold it into.
+                 add r1, r3, #imm
+                 ldr r0, [r1, r2, lsl #2]
+        }
+        if (not(GenerateThumbCode)) and
+          { thumb2 allows only lsl #0..#3 }
+          (not(GenerateThumb2Code) or
+           ((taicpu(p).oper[2]^.shifterop^.shiftimm in [0..3]) and
+            (taicpu(p).oper[2]^.shifterop^.shiftmode=SM_LSL)
+           )
+          ) and
+           (taicpu(p).oper[1]^.typ = top_reg) and
+           (taicpu(p).oper[2]^.typ = top_shifterop) and
+           { RRX is tough to handle, because it requires tracking the C-Flag,
+             it is also extremely unlikely to be emitted this way}
+           (taicpu(p).oper[2]^.shifterop^.shiftmode <> SM_RRX) and
+           (taicpu(p).oper[2]^.shifterop^.shiftimm <> 0) and
+           (taicpu(p).oppostfix = PF_NONE) and
+           {Only LDR, LDRB, STR, STRB can handle scaled register indexing}
+           (MatchInstruction(hpfar1, [A_LDR, A_STR], [taicpu(p).condition], [PF_None, PF_B]) or
+            (GenerateThumb2Code and
+             MatchInstruction(hpfar1, [A_LDR, A_STR], [taicpu(p).condition], [PF_None, PF_B, PF_SB, PF_H, PF_SH]))
+           ) and
+           (
+             {If this is address by offset, one of the two registers can be used}
+             ((taicpu(hpfar1).oper[1]^.ref^.addressmode=AM_OFFSET) and
+               (
+                 (taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) xor
+                 (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg)
+               )
+             ) or
+             {For post and preindexed only the index register can be used}
+             ((taicpu(hpfar1).oper[1]^.ref^.addressmode in [AM_POSTINDEXED, AM_PREINDEXED]) and
+               (
+                 (taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) and
+                 (taicpu(hpfar1).oper[1]^.ref^.base <> taicpu(p).oper[0]^.reg)
+               ) and
+               (not GenerateThumb2Code)
+             )
+           ) and
+           { Only fold if both registers are used. Otherwise we are folding p with itself }
+           (taicpu(hpfar1).oper[1]^.ref^.index<>NR_NO) and
+           (taicpu(hpfar1).oper[1]^.ref^.base<>NR_NO) and
+           { Only fold if there isn't another shifterop already, and offset is zero. }
+           (taicpu(hpfar1).oper[1]^.ref^.offset = 0) and
+           (taicpu(hpfar1).oper[1]^.ref^.shiftmode = SM_None) and
+           not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hpfar1)) and
+           RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then
+           begin
+             { If the register we want to do the shift for resides in base, we need to swap that}
+             if (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) then
+               taicpu(hpfar1).oper[1]^.ref^.base := taicpu(hpfar1).oper[1]^.ref^.index;
+             taicpu(hpfar1).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg;
+             taicpu(hpfar1).oper[1]^.ref^.shiftmode := taicpu(p).oper[2]^.shifterop^.shiftmode;
+             taicpu(hpfar1).oper[1]^.ref^.shiftimm := taicpu(p).oper[2]^.shifterop^.shiftimm;
+             DebugMsg('Peephole Optimization: FoldShiftLdrStr done', hpfar1);
+             RemoveCurrentP(p);
+             Result:=true;
+             Exit;
+           end;
+        end;
+      {
+        Often we see shifts and then a superfluous mov to another register
+        In the future this might be handled in RedundantMovProcess when it uses RegisterTracking
+      }
+      if RemoveSuperfluousMove(p, hpfar1, 'MovMov2Mov') then
+        Result:=true;
     end;
 
 
+  function TCpuAsmOptimizer.OptPass1MVN(var p: tai): Boolean;
+    var
+      hp1: tai; { next instruction that references the MVN destination register }
+    begin
+      { Returns True only when the rewrite below was applied, False otherwise.
+        change
+          mvn reg2,reg1
+          and reg3,reg4,reg2
+          dealloc reg2
+        to
+          bic reg3,reg4,reg1
+      }
+      Result := False; { Pascal leaves Result undefined unless it is assigned on every path }
+      if (taicpu(p).oper[1]^.typ = top_reg) and
+        GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
+        MatchInstruction(hp1,A_AND,[],[]) and
+        (((taicpu(hp1).ops=3) and
+          (taicpu(hp1).oper[2]^.typ=top_reg) and
+          (MatchOperand(taicpu(hp1).oper[2]^, taicpu(p).oper[0]^.reg) or
+           MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg))) or
+         ((taicpu(hp1).ops=2) and
+          (taicpu(hp1).oper[1]^.typ=top_reg) and
+          MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg))) and
+        assigned(FindRegDealloc(taicpu(p).oper[0]^.reg,tai(hp1.Next))) and
+        { reg1 must not be modified in between }
+        not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
+        begin
+          DebugMsg('Peephole Optimization: MvnAnd2Bic done', p);
+          taicpu(hp1).opcode:=A_BIC;
+
+          if taicpu(hp1).ops=3 then
+            begin
+              if MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) then
+                taicpu(hp1).loadReg(1,taicpu(hp1).oper[2]^.reg); // Swap operands
+
+              taicpu(hp1).loadReg(2,taicpu(p).oper[1]^.reg);
+            end
+          else
+            taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
+          { the mvn is folded into the bic now, so remove it; control then falls through to the end }
+          RemoveCurrentp(p);
+          Result := True;
+        end;
+    end;
+
+
+  function TCpuAsmOptimizer.OptPass1VMov(var p: tai): Boolean;
+    var
+      hp1: tai; { candidate second vmov, fetched with GetNextInstruction(p, hp1) }
+    begin
+      {
+        change
+        vmov reg0,reg1,reg2
+        vmov reg1,reg2,reg0
+        into
+        vmov reg0,reg1,reg2
+        (the second vmov is deleted - presumably because it only moves the value back)
+        can be applied regardless if reg0 or reg2 is the vfp register
+      }
+      Result := False; { never set to True here: p is kept, only the following vmov is deleted }
+      if (taicpu(p).ops = 3) then
+        while GetNextInstruction(p, hp1) and
+          MatchInstruction(hp1, A_VMOV, [taicpu(p).condition], [taicpu(p).oppostfix]) and
+          (taicpu(hp1).ops = 3) and
+          MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[2]^) and
+          MatchOperand(taicpu(p).oper[1]^, taicpu(hp1).oper[0]^) and
+          MatchOperand(taicpu(p).oper[2]^, taicpu(hp1).oper[1]^) do
+          begin
+            asml.Remove(hp1);
+            hp1.free;
+            DebugMsg('Peephole Optimization: VMovVMov2VMov done', p);
+            { Loop: the while condition re-fetches the instruction after p and tests the same pattern again }
+          end;
+    end;
+
+
+  function TCpuAsmOptimizer.OptPass1VOp(var p: tai): Boolean;
+    var
+      hp1: tai; { next instruction that references the register in p's operand 0 }
+    begin
+      Result := GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and { no such instruction -> False }
+        RemoveSuperfluousVMov(p, hp1, 'VOpVMov2VOp'); { otherwise the helper's own result is returned }
+    end;
+
+
+  function TCpuAsmOptimizer.PeepHoleOptPass1Cpu(var p: tai): boolean;
+    begin { dispatches p, by opcode, to the matching OptPass1* routine and returns that routine's result }
+      result := false; { kept for non-instructions and for opcodes without a handler below }
+      if p.typ = ait_instruction then
+        begin
+          case taicpu(p).opcode of
+            A_CMP:
+              Result := OptPass1CMP(p);
+            A_STR:
+              Result := OptPass1STR(p);
+            A_LDR:
+              Result := OptPass1LDR(p);
+            A_MOV:
+              Result := OptPass1MOV(p);
+            A_AND:
+              Result := OptPass1And(p);
+            A_ADD,
+            A_SUB:
+              Result := OptPass1ADDSUB(p);
+            A_MUL:
+              Result := OptPass1MUL(p);
+            A_ADC,
+            A_RSB,
+            A_RSC,
+            A_SBC,
+            A_BIC,
+            A_EOR,
+            A_ORR,
+            A_MLA,
+            A_MLS,
+            A_QADD,A_QADD16,A_QADD8,
+            A_QSUB,A_QSUB16,A_QSUB8,
+            A_QDADD,A_QDSUB,A_QASX,A_QSAX,
+            A_SHADD16,A_SHADD8,A_UHADD16,A_UHADD8,
+            A_SHSUB16,A_SHSUB8,A_UHSUB16,A_UHSUB8,
+            A_PKHTB,A_PKHBT,
+            A_SMUAD,A_SMUSD:
+              Result := OptPass1DataCheckMov(p); { all of these opcodes share one handler }
+{$ifdef dummy}
+            A_MVN: { this arm is only compiled when the symbol 'dummy' is defined }
+              Result := OptPass1MVN(p);
+{$endif dummy}
+            A_UXTB:
+              Result := OptPass1UXTB(p);
+            A_UXTH:
+              Result := OptPass1UXTH(p);
+            A_SXTB:
+              Result := OptPass1SXTB(p);
+            A_SXTH:
+              Result := OptPass1SXTH(p);
+            A_STM:
+              Result := OptPass1STM(p);
+            A_VMOV:
+              Result := OptPass1VMov(p);
+            A_VLDR,
+            A_VADD,
+            A_VMUL,
+            A_VDIV,
+            A_VSUB,
+            A_VSQRT,
+            A_VNEG,
+            A_VCVT,
+            A_VABS:
+              Result := OptPass1VOp(p);
+            else
+              ; { any other opcode: leave Result at False }
+          end;
+        end;
+    end;
+
+
   { instructions modifying the CPSR can be only the last instruction }
   function MustBeLast(p : tai) : boolean;
     begin
Index: compiler/armgen/aoptarm.pas
===================================================================
--- compiler/armgen/aoptarm.pas	(revision 46481)
+++ compiler/armgen/aoptarm.pas	(working copy)
@@ -47,7 +47,7 @@
     function OptPass1UXTH(var p: tai): Boolean;
     function OptPass1SXTB(var p: tai): Boolean;
     function OptPass1SXTH(var p: tai): Boolean;
-    function OptPass1And(var p: tai): Boolean;
+    function OptPass1And(var p: tai): Boolean; virtual;
   End;
 
   function MatchInstruction(const instr: tai; const op: TCommonAsmOps; const cond: TAsmConds; const postfix: TOpPostfixes): boolean;
arm-aarch64-refactor.patch (137,509 bytes)   

J. Gareth Moreton

2020-08-18 16:42

developer   ~0124967

arm-gniur-upgrade.patch (1,231 bytes)   
Index: compiler/armgen/aoptarm.pas
===================================================================
--- compiler/armgen/aoptarm.pas	(revision 46481)
+++ compiler/armgen/aoptarm.pas	(working copy)
@@ -170,18 +170,26 @@
 
   function TARMAsmOptimizer.GetNextInstructionUsingReg(Current: tai;
     Out Next: tai; reg: TRegister): Boolean;
+    var
+      gniResult: Boolean; { outcome of the most recent GetNextInstruction call }
     begin
       Next:=Current;
+      Result := False; { stays False unless the loop leaves through Exit(True) below }
       repeat
-        Result:=GetNextInstruction(Next,Next);
-      until not (Result) or
-            not(cs_opt_level3 in current_settings.optimizerswitches) or
-            (Next.typ<>ait_instruction) or
-            RegInInstruction(reg,Next) or
-            is_calljmp(taicpu(Next).opcode)
+
+        gniResult:=GetNextInstruction(Next,Next);
+        if gniResult and RegInInstruction(reg,Next) then
+          { Success: an instruction was fetched and RegInInstruction reports reg in it }
+          Exit(True);
+
+      until not gniResult or
+        not(cs_opt_level3 in current_settings.optimizerswitches) or
+        (Next.typ<>ait_instruction) or
+        is_calljmp(taicpu(Next).opcode)
 {$ifdef ARM}
-            or RegModifiedByInstruction(NR_PC,Next);
+        or RegModifiedByInstruction(NR_PC,Next)
 {$endif ARM}
+        ; { closes the until-expression; now placed after the ARM-only conditional section }
     end;
 
 
arm-gniur-upgrade.patch (1,231 bytes)   

J. Gareth Moreton

2020-09-16 00:05

developer   ~0125562

Any updates/opinions on this?

Florian

2020-09-16 21:48

administrator   ~0125581

Not yet, I'll look into it ASAP.

Florian

2020-09-27 23:06

administrator   ~0125911

Thanks, finally applied (with a small modification).

Issue History

Date Modified Username Field Change
2020-08-09 10:50 J. Gareth Moreton New Issue
2020-08-09 10:50 J. Gareth Moreton File Added: arm-aarch64-refactor.patch
2020-08-09 10:51 J. Gareth Moreton Tag Attached: patch
2020-08-09 10:51 J. Gareth Moreton Tag Attached: refactor
2020-08-09 10:51 J. Gareth Moreton Tag Attached: arm
2020-08-09 10:51 J. Gareth Moreton Tag Attached: aarch64
2020-08-16 17:19 Florian Note Added: 0124925
2020-08-16 17:19 Florian File Added: lineinfo.arm-linux.diff
2020-08-16 17:19 Florian File Added: system.aarch64-linux.diff
2020-08-16 21:58 J. Gareth Moreton Note Added: 0124933
2020-08-16 23:03 Florian Note Added: 0124934
2020-08-17 00:59 J. Gareth Moreton Note Added: 0124938
2020-08-18 16:37 J. Gareth Moreton File Deleted: arm-aarch64-refactor.patch
2020-08-18 16:41 J. Gareth Moreton Note Added: 0124966
2020-08-18 16:41 J. Gareth Moreton File Added: arm-aarch64-refactor.patch
2020-08-18 16:42 J. Gareth Moreton Note Added: 0124967
2020-08-18 16:42 J. Gareth Moreton File Added: arm-gniur-upgrade.patch
2020-08-18 16:42 J. Gareth Moreton Note Edited: 0124966 View Revisions
2020-08-18 16:42 J. Gareth Moreton Note Edited: 0124966 View Revisions
2020-08-19 08:33 J. Gareth Moreton Note Edited: 0124966 View Revisions
2020-08-19 08:33 J. Gareth Moreton Note Edited: 0124966 View Revisions
2020-09-16 00:05 J. Gareth Moreton Note Added: 0125562
2020-09-16 21:48 Florian Note Added: 0125581
2020-09-27 23:06 Florian Assigned To => Florian
2020-09-27 23:06 Florian Status new => resolved
2020-09-27 23:06 Florian Resolution open => fixed
2020-09-27 23:06 Florian Fixed in Version => 3.3.1
2020-09-27 23:06 Florian Fixed in Revision => 46975, 46976
2020-09-27 23:06 Florian FPCTarget => -
2020-09-27 23:06 Florian Note Added: 0125911