Commit 8eed3289 authored by Andy Polyakov

bn/asm/armv4-mont.pl: boost NEON performance.

Close the performance gap on Cortex-A9, which resulted in further improvement
even on other processors.

Reviewed-by: Richard Levitte <levitte@openssl.org>
parent 75f648aa
+328 −294
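
For orientation before the patch body: bn_mul_mont computes a Montgomery product, a*b*R^-1 mod n. Below is a minimal, illustrative Perl sketch of the scalar word-by-word (CIOS) algorithm that the NEON code vectorizes. It is a hedged reference model with names of my choosing, not code from the module, and it uses 16-bit limbs so that Perl's native integers never overflow; the assembly itself works on 32-bit limbs, and the NEON path further splits them into 16-bit halves so carries fit in 64-bit accumulators.

#!/usr/bin/env perl
# Reference model of word-by-word (CIOS) Montgomery multiplication.
# Illustrative only -- not OpenSSL code.
use strict;
use warnings;

my $BITS = 16;                  # limb width; bn_mul_mont uses 32
my $MASK = (1 << $BITS) - 1;

# mul_mont(\@a, \@b, \@n, $n0) returns a*b*R^-1 mod n (up to one
# pending conditional subtraction), limbs little-endian,
# R = 2^($BITS*num), $n0 = -n^-1 mod 2^$BITS precomputed by the caller.
sub mul_mont {
    my ($a, $b, $n, $n0) = @_;
    my $num = @$n;
    my @t = (0) x ($num + 2);
    for my $i (0 .. $num - 1) {
        my $c = 0;                               # t += a*b[i]
        for my $j (0 .. $num - 1) {
            my $v = $t[$j] + $a->[$j] * $b->[$i] + $c;
            ($t[$j], $c) = ($v & $MASK, $v >> $BITS);
        }
        my $v = $t[$num] + $c;
        ($t[$num], $t[$num + 1]) = ($v & $MASK, $v >> $BITS);

        my $m = ($t[0] * $n0) & $MASK;           # t[0]+m*n[0] = 0 mod 2^BITS
        $c = ($t[0] + $m * $n->[0]) >> $BITS;
        for my $j (1 .. $num - 1) {              # t = (t + m*n) >> BITS
            $v = $t[$j] + $m * $n->[$j] + $c;
            ($t[$j - 1], $c) = ($v & $MASK, $v >> $BITS);
        }
        $v = $t[$num] + $c;
        ($t[$num - 1], $t[$num]) = ($v & $MASK, $t[$num + 1] + ($v >> $BITS));
    }
    return @t[0 .. $num];   # subtract n once more if result >= n
}

The rewritten .LNEON_8n path in the patch computes the same thing, but consumes eight words of b per outer pass and parks the "smashed" (16-bit-split) b[8*i+j] and m[8*i+j] values on the stack, as the diff's own comments note, so the inner loop can stay register-resident.
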
@@ -38,6 +38,15 @@
# for execution on all NEON-capable processors, because gain on
# others outweighs the marginal loss on Cortex-A9.

# September 2015
#
# Align Cortex-A9 performance with November 2013 improvements, i.e.
# NEON code is now ~20-105% faster than integer-only one on this
# processor. But this optimization further improved performance even
# on other processors: NEON code path is ~45-180% faster than original
# integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on
# Snapdragon S4.

$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
@@ -272,19 +281,16 @@ bn_mul_mont:
.size	bn_mul_mont,.-bn_mul_mont
___
{
sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
my @ACC=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero=&Dlo($Z);
my $temp=&Dlo($Temp);
my $zero="$Z#lo";
my $temp="$Temp#lo";

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
@@ -300,59 +306,58 @@ bn_mul8x_mont_neon:
	ldmia	ip,{r4-r5}		@ load rest of parameter block
	mov	ip,sp

	sub		$toutptr,sp,#16
	cmp	$num,#8
	bhi	.LNEON_8n

	@ special case for $num==8, everything is in register bank...

	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	sub		$toutptr,$toutptr,$num,lsl#4
	veor		$zero,$zero,$zero
	sub		$toutptr,sp,$num,lsl#4
	vld1.32		{$A0-$A3},  [$aptr]!		@ can't specify :32 :-(
	and		$toutptr,$toutptr,#-64
	vld1.32		{${M0}[0]}, [$n0,:32]
	mov		sp,$toutptr			@ alloca
	veor		$zero,$zero,$zero
	subs		$inner,$num,#8
	vzip.16		$Bi,$zero

	vmull.u32	$A0xB,$Bi,${A0}[0]
	vmull.u32	$A1xB,$Bi,${A0}[1]
	vmull.u32	$A2xB,$Bi,${A1}[0]
	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	vmull.u32	$A3xB,$Bi,${A1}[1]
	vmull.u32	@ACC[0],$Bi,${A0}[0]
	vmull.u32	@ACC[1],$Bi,${A0}[1]
	vmull.u32	@ACC[2],$Bi,${A1}[0]
	vshl.i64	$Ni,@ACC[0]#hi,#16
	vmull.u32	@ACC[3],$Bi,${A1}[1]

	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	vadd.u64	$Ni,$Ni,@ACC[0]#lo
	veor		$zero,$zero,$zero
	vmul.u32	$Ni,$temp,$M0
	vmul.u32	$Ni,$Ni,$M0

	vmull.u32	$A4xB,$Bi,${A2}[0]
	vmull.u32	@ACC[4],$Bi,${A2}[0]
	 vld1.32	{$N0-$N3}, [$nptr]!
	vmull.u32	$A5xB,$Bi,${A2}[1]
	vmull.u32	$A6xB,$Bi,${A3}[0]
	vmull.u32	@ACC[5],$Bi,${A2}[1]
	vmull.u32	@ACC[6],$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmull.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_1st

	@ special case for num=8, everything is in register bank...
	vmull.u32	@ACC[7],$Bi,${A3}[1]

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	sub		$outer,$num,#1
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vmov		$Temp,$A0xB
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vmov		$A0xB,$A1xB
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmov		$A1xB,$A2xB
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vmov		$A2xB,$A3xB
	vmov		$A3xB,$A4xB
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vmlal.u32	@ACC[3],$Ni,${N1}[1]

	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmov		$Temp,@ACC[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmov		@ACC[0],@ACC[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmov		@ACC[1],@ACC[2]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vmov		@ACC[2],@ACC[3]
	vmov		@ACC[3],@ACC[4]
	vshr.u64	$temp,$temp,#16
	vmov		$A4xB,$A5xB
	vmov		$A5xB,$A6xB
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vmov		$A6xB,$A7xB
	veor		$A7xB,$A7xB
	vmov		@ACC[4],@ACC[5]
	vmov		@ACC[5],@ACC[6]
	vadd.u64	$temp,$temp,$Temp#hi
	vmov		@ACC[6],@ACC[7]
	veor		@ACC[7],@ACC[7]
	vshr.u64	$temp,$temp,#16

	b	.LNEON_outer8
@@ -362,279 +367,302 @@ bn_mul8x_mont_neon:
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	veor		$zero,$zero,$zero
	vzip.16		$Bi,$zero
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	vmlal.u32	$A3xB,$Bi,${A1}[1]
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	vshl.i64	$Ni,@ACC[0]#hi,#16
	vmlal.u32	@ACC[3],$Bi,${A1}[1]

	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	vadd.u64	$Ni,$Ni,@ACC[0]#lo
	veor		$zero,$zero,$zero
	subs		$outer,$outer,#1
	vmul.u32	$Ni,$temp,$M0
	vmul.u32	$Ni,$Ni,$M0

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmlal.u32	$A7xB,$Bi,${A3}[1]

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vmov		$Temp,$A0xB
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vmov		$A0xB,$A1xB
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmov		$A1xB,$A2xB
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vmov		$A2xB,$A3xB
	vmov		$A3xB,$A4xB
	vmlal.u32	@ACC[7],$Bi,${A3}[1]

	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vmlal.u32	@ACC[3],$Ni,${N1}[1]

	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmov		$Temp,@ACC[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmov		@ACC[0],@ACC[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmov		@ACC[1],@ACC[2]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vmov		@ACC[2],@ACC[3]
	vmov		@ACC[3],@ACC[4]
	vshr.u64	$temp,$temp,#16
	vmov		$A4xB,$A5xB
	vmov		$A5xB,$A6xB
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vmov		$A6xB,$A7xB
	veor		$A7xB,$A7xB
	vmov		@ACC[4],@ACC[5]
	vmov		@ACC[5],@ACC[6]
	vadd.u64	$temp,$temp,$Temp#hi
	vmov		@ACC[6],@ACC[7]
	veor		@ACC[7],@ACC[7]
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer8

	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp
	mov		$toutptr,sp
	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
	vshr.u64	$temp,@ACC[0]#lo,#16
	mov		$inner,$num
	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
	add		$tinptr,sp,#16
	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	add		$tinptr,sp,#96
	vshr.u64	$temp,@ACC[0]#hi,#16
	vzip.16		@ACC[0]#lo,@ACC[0]#hi

	b	.LNEON_tail2
	b	.LNEON_tail_entry

.align	4
.LNEON_1st:
	vmlal.u32	$A0xB,$Ni,${N0}[0]
	 vld1.32	{$A0-$A3}, [$aptr]!
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	subs		$inner,$inner,#8
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	 vld1.32	{$N0-$N1}, [$nptr]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	 vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	 vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!

	vmull.u32	$A0xB,$Bi,${A0}[0]
	 vld1.32	{$N2-$N3}, [$nptr]!
	vmull.u32	$A1xB,$Bi,${A0}[1]
	 vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!
	vmull.u32	$A2xB,$Bi,${A1}[0]
	vmull.u32	$A3xB,$Bi,${A1}[1]
	 vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!

	vmull.u32	$A4xB,$Bi,${A2}[0]
	vmull.u32	$A5xB,$Bi,${A2}[1]
	vmull.u32	$A6xB,$Bi,${A3}[0]
	vmull.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_1st

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	add		$tinptr,sp,#16
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	 vld1.64	{$Temp}, [sp,:128]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	sub		$outer,$num,#1
.LNEON_8n:
	veor		@ACC[0],@ACC[0],@ACC[0]
	 sub		$toutptr,sp,#128
	veor		@ACC[1],@ACC[1],@ACC[1]
	 sub		$toutptr,$toutptr,$num,lsl#4
	veor		@ACC[2],@ACC[2],@ACC[2]
	 and		$toutptr,$toutptr,#-64
	veor		@ACC[3],@ACC[3],@ACC[3]
	 mov		sp,$toutptr			@ alloca
	veor		@ACC[4],@ACC[4],@ACC[4]
	 add		$toutptr,$toutptr,#256
	veor		@ACC[5],@ACC[5],@ACC[5]
	 sub		$inner,$num,#8
	veor		@ACC[6],@ACC[6],@ACC[6]
	veor		@ACC[7],@ACC[7],@ACC[7]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vshr.u64	$temp,$temp,#16
	 vld1.64	{$A0xB},       [$tinptr, :128]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]

	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	veor		$Z,$Z,$Z
	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vst1.64		{$Z},          [$toutptr,:128]
	vshr.u64	$temp,$temp,#16
.LNEON_8n_init:
	vst1.64		{@ACC[0]-@ACC[1]},[$toutptr,:256]!
	subs		$inner,$inner,#8
	vst1.64		{@ACC[2]-@ACC[3]},[$toutptr,:256]!
	vst1.64		{@ACC[4]-@ACC[5]},[$toutptr,:256]!
	vst1.64		{@ACC[6]-@ACC[7]},[$toutptr,:256]!
	bne		.LNEON_8n_init

	b		.LNEON_outer
	add		$tinptr,sp,#256
	vld1.32		{$A0-$A3},[$aptr]!
	add		$bnptr,sp,#8
	vld1.32		{${M0}[0]},[$n0,:32]
	mov		$outer,$num
	b		.LNEON_8n_outer

.align	4
.LNEON_outer:
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	sub		$nptr,$nptr,$num,lsl#2		@ rewind $nptr
	vld1.32		{$A0-$A3},  [$aptr]!
.LNEON_8n_outer:
	vld1.32		{${Bi}[0]},[$bptr,:32]!	@ *b++
	veor		$zero,$zero,$zero
	mov		$toutptr,sp
	vzip.16		$Bi,$zero
	sub		$inner,$num,#8
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	 vld1.64	{$A3xB-$A4xB},[$tinptr,:256]!
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	 vld1.64	{$A5xB-$A6xB},[$tinptr,:256]!
	vmlal.u32	$A3xB,$Bi,${A1}[1]
	add		$toutptr,sp,#128
	vld1.32		{$N0-$N3},[$nptr]!

	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	 veor		$zero,$zero,$zero
	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	 vld1.64	{$A7xB},[$tinptr,:128]!
	vmul.u32	$Ni,$temp,$M0

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	 vld1.32	{$N0-$N3}, [$nptr]!
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	 vshl.i64	$Ni,@ACC[0]#hi,#16
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	 vadd.u64	$Ni,$Ni,@ACC[0]#lo
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	 vmul.u32	$Ni,$Ni,$M0
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vst1.32		{$Bi},[sp,:64]		@ put aside smashed b[8*i+0]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	 vzip.16	$Ni,$zero
	vmlal.u32	$A7xB,$Bi,${A3}[1]

.LNEON_inner:
	vmlal.u32	$A0xB,$Ni,${N0}[0]
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
for ($i=0; $i<7;) {
$code.=<<___;
	vld1.32		{${Bi}[0]},[$bptr,:32]!	@ *b++
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	veor		$temp,$temp,$temp
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vzip.16		$Bi,$temp
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	 vadd.u64	@ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	 vadd.u64	@ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
	vst1.32		{$Ni},[$bnptr,:64]!	@ put aside smashed m[8*i+$i]
___
	push(@ACC,shift(@ACC));	$i++;
$code.=<<___;
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64		{@ACC[7]},[$tinptr,:128]!
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	 veor		$zero,$zero,$zero
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	 vshl.i64	$Ni,@ACC[0]#hi,#16
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	 vadd.u64	$Ni,$Ni,@ACC[0]#lo
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	 vmul.u32	$Ni,$Ni,$M0
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vst1.32		{$Bi},[$bnptr,:64]!	@ put aside smashed b[8*i+$i]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	 vzip.16	$Ni,$zero
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	vld1.32		{$Bi},[sp,:64]		@ pull smashed b[8*i+0]
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vld1.32		{$A0-$A3},[$aptr]!
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	 vadd.u64	@ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	 vadd.u64	@ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
	vst1.32		{$Ni},[$bnptr,:64]	@ put aside smashed m[8*i+$i]
	add		$bnptr,sp,#8		@ rewind
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	sub		$inner,$num,#8
	b		.LNEON_8n_inner

.align	4
.LNEON_8n_inner:
	subs		$inner,$inner,#8
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	 vld1.64	{$A0xB},       [$tinptr, :128]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	 vld1.64	{$A3xB-$A4xB}, [$tinptr, :256]!
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	 vld1.64	{$A5xB-$A6xB}, [$tinptr, :256]!
	vmlal.u32	$A3xB,$Bi,${A1}[1]
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64		{@ACC[7]},[$tinptr,:128]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vld1.32		{$Ni},[$bnptr,:64]!	@ pull smashed m[8*i+0]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	vld1.32		{$N0-$N3},[$nptr]!
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	it		ne
	addne		$tinptr,$tinptr,#16	@ don't advance in last iteration
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
	vld1.32		{$Bi},[$bnptr,:64]!	@ pull smashed b[8*i+$i]
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vst1.64		{@ACC[0]},[$toutptr,:128]!
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64		{@ACC[7]},[$tinptr,:128]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vld1.32		{$Ni},[$bnptr,:64]!	@ pull smashed m[8*i+$i]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	it		ne
	addne		$tinptr,$tinptr,#16	@ don't advance in last iteration
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	it		eq
	subeq		$aptr,$aptr,$num,lsl#2	@ rewind
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vld1.32		{$Bi},[sp,:64]		@ pull smashed b[8*i+0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vld1.32		{$A0-$A3},[$aptr]!
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	add		$bnptr,sp,#8		@ rewind
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vst1.64		{@ACC[0]},[$toutptr,:128]!
	vmlal.u32	@ACC[7],$Ni,${N3}[1]

	bne		.LNEON_8n_inner
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	add		$tinptr,sp,#128
	vst1.64		{@ACC[0]-@ACC[1]},[$toutptr,:256]!
	veor		q2,q2,q2		@ $N0-$N1
	vst1.64		{@ACC[2]-@ACC[3]},[$toutptr,:256]!
	veor		q3,q3,q3		@ $N2-$N3
	vst1.64		{@ACC[4]-@ACC[5]},[$toutptr,:256]!
	vst1.64		{@ACC[6]},[$toutptr,:128]

	subs		$outer,$outer,#8
	vld1.64		{@ACC[0]-@ACC[1]},[$tinptr,:256]!
	vld1.64		{@ACC[2]-@ACC[3]},[$tinptr,:256]!
	vld1.64		{@ACC[4]-@ACC[5]},[$tinptr,:256]!
	vld1.64		{@ACC[6]-@ACC[7]},[$tinptr,:256]!

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	 vld1.64	{$A7xB},       [$tinptr, :128]!
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vmlal.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_inner

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	add		$tinptr,sp,#16
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	 vld1.64	{$Temp}, [sp,:128]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	subs		$outer,$outer,#1

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	 vld1.64	{$A0xB},       [$tinptr, :128]!
	vshr.u64	$temp,$temp,#16
	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]

	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer
	itt		ne
	subne		$nptr,$nptr,$num,lsl#2	@ rewind
	bne		.LNEON_8n_outer

	add		$toutptr,sp,#128
	vst1.64		{q2-q3}, [sp,:256]!	@ start wiping stack frame
	vshr.u64	$temp,@ACC[0]#lo,#16
	vst1.64		{q2-q3},[sp,:256]!
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	vst1.64		{q2-q3}, [sp,:256]!
	vshr.u64	$temp,@ACC[0]#hi,#16
	vst1.64		{q2-q3}, [sp,:256]!
	vzip.16		@ACC[0]#lo,@ACC[0]#hi

	mov		$toutptr,sp
	mov		$inner,$num
	b		.LNEON_tail_entry

.align	4
.LNEON_tail:
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
	vld1.64		{$A3xB-$A4xB}, [$tinptr, :256]!
	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
	vld1.64		{$A5xB-$A6xB}, [$tinptr, :256]!
	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
	vld1.64		{$A7xB},       [$tinptr, :128]!
	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`

.LNEON_tail2:
	vadd.u64	`&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
	vst1.32		{`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A1xB")`,#16
	vadd.u64	`&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A1xB")`,#16
	vzip.16		`&Dlo("$A1xB")`,`&Dhi("$A1xB")`

	vadd.u64	`&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
	vst1.32		{`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A2xB")`,#16
	vadd.u64	`&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A2xB")`,#16
	vzip.16		`&Dlo("$A2xB")`,`&Dhi("$A2xB")`

	vadd.u64	`&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
	vst1.32		{`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A3xB")`,#16
	vadd.u64	`&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A3xB")`,#16
	vzip.16		`&Dlo("$A3xB")`,`&Dhi("$A3xB")`

	vadd.u64	`&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
	vst1.32		{`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A4xB")`,#16
	vadd.u64	`&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A4xB")`,#16
	vzip.16		`&Dlo("$A4xB")`,`&Dhi("$A4xB")`

	vadd.u64	`&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
	vst1.32		{`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A5xB")`,#16
	vadd.u64	`&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A5xB")`,#16
	vzip.16		`&Dlo("$A5xB")`,`&Dhi("$A5xB")`

	vadd.u64	`&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
	vst1.32		{`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A6xB")`,#16
	vadd.u64	`&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
	vld1.64		{$A0xB}, [$tinptr, :128]!
	vshr.u64	$temp,`&Dhi("$A6xB")`,#16
	vzip.16		`&Dlo("$A6xB")`,`&Dhi("$A6xB")`

	vadd.u64	`&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
	vst1.32		{`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A7xB")`,#16
	vadd.u64	`&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
	vld1.64		{$A1xB-$A2xB},	[$tinptr, :256]!
	vshr.u64	$temp,`&Dhi("$A7xB")`,#16
	vzip.16		`&Dlo("$A7xB")`,`&Dhi("$A7xB")`
	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp
	vshr.u64	$temp,@ACC[0]#lo,#16
	vld1.64		{@ACC[2]-@ACC[3]}, [$tinptr, :256]!
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	vld1.64		{@ACC[4]-@ACC[5]}, [$tinptr, :256]!
	vshr.u64	$temp,@ACC[0]#hi,#16
	vld1.64		{@ACC[6]-@ACC[7]}, [$tinptr, :256]!
	vzip.16		@ACC[0]#lo,@ACC[0]#hi

.LNEON_tail_entry:
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
	vadd.u64	@ACC[1]#lo,@ACC[1]#lo,$temp
	vst1.32		{@ACC[0]#lo[0]}, [$toutptr, :32]!
	vshr.u64	$temp,@ACC[1]#lo,#16
	vadd.u64	@ACC[1]#hi,@ACC[1]#hi,$temp
	vshr.u64	$temp,@ACC[1]#hi,#16
	vzip.16		@ACC[1]#lo,@ACC[1]#hi
___
	push(@ACC,shift(@ACC));
}
	push(@ACC,shift(@ACC));
$code.=<<___;
	vld1.64		{@ACC[0]-@ACC[1]}, [$tinptr, :256]!
	subs		$inner,$inner,#8
	vst1.32		{`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!

	vst1.32		{@ACC[7]#lo[0]},   [$toutptr, :32]!
	bne	.LNEON_tail

	vst1.32	{${temp}[0]}, [$toutptr, :32]		@ top-most bit
@@ -708,8 +736,14 @@ $code.=<<___;
#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx	lr/gm;
print $code;
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge	or
	s/\bret\b/bx    lr/g						or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/g;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT;
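
A note on the output pass above: the patch replaces the old &Dlo()/&Dhi() helper subs with a q<n>#lo/q<n>#hi notation inside the code template, and the new line-by-line output loop rewrites it into the aliased 64-bit registers (NEON qN overlaps d(2N) as its low half and d(2N+1) as its high half). A small self-contained sketch of that substitution, using sample lines of my own rather than lines from the module:

#!/usr/bin/env perl
use strict;
use warnings;

# qN#lo -> d(2N), qN#hi -> d(2N+1); ($2 eq "hi") numifies to 1 or 0,
# exactly as in the s/\bq([0-9]+)#(lo|hi)/.../ge pass above.
for my $line ("vadd.u64\tq6#lo,q6#lo,q5#lo",     # hypothetical inputs
              "vshl.i64\tq14#lo,q6#hi,#16") {
    (my $out = $line) =~
        s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge;
    print "$out\n";
}
# prints:
#   vadd.u64    d12,d12,d10
#   vshl.i64    d28,d13,#16

This keeps the perlasm template readable while the emitted assembly uses only architectural D-register names.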