crypto/bn/asm/armv4-mont.pl  (+328 −294)

@@ -38,6 +38,15 @@
 # for execution on all NEON-capable processors, because gain on
 # others outweighs the marginal loss on Cortex-A9.
 
+# September 2015
+#
+# Align Cortex-A9 performance with November 2013 improvements, i.e.
+# NEON code is now ~20-105% faster than integer-only one on this
+# processor. But this optimization further improved performance even
+# on other processors: NEON code path is ~45-180% faster than original
+# integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on
+# Snapdragon S4.
+
 $flavour = shift;
 if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
 else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
@@ -272,19 +281,16 @@ bn_mul_mont:
 .size	bn_mul_mont,.-bn_mul_mont
 ___
 {
-sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
-sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
-
 my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
 my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
 my ($Z,$Temp)=("q4","q5");
-my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
+my @ACC=map("q$_",(6..13));
 my ($Bi,$Ni,$M0)=map("d$_",(28..31));
-my $zero=&Dlo($Z);
-my $temp=&Dlo($Temp);
+my $zero="$Z#lo";
+my $temp="$Temp#lo";
 
 my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
-my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));
+my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));
 
 $code.=<<___;
 #if __ARM_MAX_ARCH__>=7
@@ -300,59 +306,58 @@ bn_mul8x_mont_neon:
 	ldmia	ip,{r4-r5}		@ load rest of parameter block
+	mov	ip,sp
 
-	sub		$toutptr,sp,#16
+	cmp	$num,#8
+	bhi	.LNEON_8n
+
+	@ special case for $num==8, everything is in register bank...
+
 	vld1.32		{${Bi}[0]}, [$bptr,:32]!
-	sub		$toutptr,$toutptr,$num,lsl#4
+	veor		$zero,$zero,$zero
+	sub		$toutptr,sp,$num,lsl#4
 	vld1.32		{$A0-$A3},  [$aptr]!		@ can't specify :32 :-(
 	and		$toutptr,$toutptr,#-64
 	vld1.32		{${M0}[0]}, [$n0,:32]
 	mov		sp,$toutptr			@ alloca
-	veor		$zero,$zero,$zero
-	subs		$inner,$num,#8
 	vzip.16		$Bi,$zero
 
-	vmull.u32	$A0xB,$Bi,${A0}[0]
-	vmull.u32	$A1xB,$Bi,${A0}[1]
-	vmull.u32	$A2xB,$Bi,${A1}[0]
-	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
-	vmull.u32	$A3xB,$Bi,${A1}[1]
+	vmull.u32	@ACC[0],$Bi,${A0}[0]
+	vmull.u32	@ACC[1],$Bi,${A0}[1]
+	vmull.u32	@ACC[2],$Bi,${A1}[0]
+	vshl.i64	$Ni,@ACC[0]#hi,#16
+	vmull.u32	@ACC[3],$Bi,${A1}[1]
 
-	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
+	vadd.u64	$Ni,$Ni,@ACC[0]#lo
 	veor		$zero,$zero,$zero
-	vmul.u32	$Ni,$temp,$M0
+	vmul.u32	$Ni,$Ni,$M0
 
-	vmull.u32	$A4xB,$Bi,${A2}[0]
+	vmull.u32	@ACC[4],$Bi,${A2}[0]
 	 vld1.32	{$N0-$N3}, [$nptr]!
-	vmull.u32	$A5xB,$Bi,${A2}[1]
-	vmull.u32	$A6xB,$Bi,${A3}[0]
+	vmull.u32	@ACC[5],$Bi,${A2}[1]
+	vmull.u32	@ACC[6],$Bi,${A3}[0]
 	vzip.16		$Ni,$zero
-	vmull.u32	$A7xB,$Bi,${A3}[1]
-
-	bne	.LNEON_1st
-
-	@ special case for num=8, everything is in register bank...
+	vmull.u32	@ACC[7],$Bi,${A3}[1]
 
-	vmlal.u32	$A0xB,$Ni,${N0}[0]
+	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	 sub		$outer,$num,#1
-	vmlal.u32	$A1xB,$Ni,${N0}[1]
-	vmlal.u32	$A2xB,$Ni,${N1}[0]
-	vmlal.u32	$A3xB,$Ni,${N1}[1]
-
-	vmlal.u32	$A4xB,$Ni,${N2}[0]
-	vmov		$Temp,$A0xB
-	vmlal.u32	$A5xB,$Ni,${N2}[1]
-	vmov		$A0xB,$A1xB
-	vmlal.u32	$A6xB,$Ni,${N3}[0]
-	vmov		$A1xB,$A2xB
-	vmlal.u32	$A7xB,$Ni,${N3}[1]
-	vmov		$A2xB,$A3xB
-	vmov		$A3xB,$A4xB
+	vmlal.u32	@ACC[1],$Ni,${N0}[1]
+	vmlal.u32	@ACC[2],$Ni,${N1}[0]
+	vmlal.u32	@ACC[3],$Ni,${N1}[1]
+
+	vmlal.u32	@ACC[4],$Ni,${N2}[0]
+	vmov		$Temp,@ACC[0]
+	vmlal.u32	@ACC[5],$Ni,${N2}[1]
+	vmov		@ACC[0],@ACC[1]
+	vmlal.u32	@ACC[6],$Ni,${N3}[0]
+	vmov		@ACC[1],@ACC[2]
+	vmlal.u32	@ACC[7],$Ni,${N3}[1]
+	vmov		@ACC[2],@ACC[3]
+	vmov		@ACC[3],@ACC[4]
 	vshr.u64	$temp,$temp,#16
-	vmov		$A4xB,$A5xB
-	vmov		$A5xB,$A6xB
-	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
-	vmov		$A6xB,$A7xB
-	veor		$A7xB,$A7xB
+	vmov		@ACC[4],@ACC[5]
+	vmov		@ACC[5],@ACC[6]
+	vadd.u64	$temp,$temp,$Temp#hi
+	vmov		@ACC[6],@ACC[7]
+	veor		@ACC[7],@ACC[7]
 	vshr.u64	$temp,$temp,#16
 
 	b	.LNEON_outer8
@@ -362,279 +367,302 @@ bn_mul8x_mont_neon:
 	vld1.32		{${Bi}[0]}, [$bptr,:32]!
 	veor		$zero,$zero,$zero
 	vzip.16		$Bi,$zero
-	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp
 
-	vmlal.u32	$A0xB,$Bi,${A0}[0]
-	vmlal.u32	$A1xB,$Bi,${A0}[1]
-	vmlal.u32	$A2xB,$Bi,${A1}[0]
-	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
-	vmlal.u32	$A3xB,$Bi,${A1}[1]
+	vmlal.u32	@ACC[0],$Bi,${A0}[0]
+	vmlal.u32	@ACC[1],$Bi,${A0}[1]
+	vmlal.u32	@ACC[2],$Bi,${A1}[0]
+	vshl.i64	$Ni,@ACC[0]#hi,#16
+	vmlal.u32	@ACC[3],$Bi,${A1}[1]
 
-	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
+	vadd.u64	$Ni,$Ni,@ACC[0]#lo
 	veor		$zero,$zero,$zero
 	subs		$outer,$outer,#1
-	vmul.u32	$Ni,$temp,$M0
+	vmul.u32	$Ni,$Ni,$M0
 
-	vmlal.u32	$A4xB,$Bi,${A2}[0]
-	vmlal.u32	$A5xB,$Bi,${A2}[1]
-	vmlal.u32	$A6xB,$Bi,${A3}[0]
+	vmlal.u32	@ACC[4],$Bi,${A2}[0]
+	vmlal.u32	@ACC[5],$Bi,${A2}[1]
+	vmlal.u32	@ACC[6],$Bi,${A3}[0]
 	vzip.16		$Ni,$zero
-	vmlal.u32	$A7xB,$Bi,${A3}[1]
-
-	vmlal.u32	$A0xB,$Ni,${N0}[0]
-	vmlal.u32	$A1xB,$Ni,${N0}[1]
-	vmlal.u32	$A2xB,$Ni,${N1}[0]
-	vmlal.u32	$A3xB,$Ni,${N1}[1]
-
-	vmlal.u32	$A4xB,$Ni,${N2}[0]
-	vmov		$Temp,$A0xB
-	vmlal.u32	$A5xB,$Ni,${N2}[1]
-	vmov		$A0xB,$A1xB
-	vmlal.u32	$A6xB,$Ni,${N3}[0]
-	vmov		$A1xB,$A2xB
-	vmlal.u32	$A7xB,$Ni,${N3}[1]
-	vmov		$A2xB,$A3xB
-	vmov		$A3xB,$A4xB
+	vmlal.u32	@ACC[7],$Bi,${A3}[1]
+
+	vmlal.u32	@ACC[0],$Ni,${N0}[0]
+	vmlal.u32	@ACC[1],$Ni,${N0}[1]
+	vmlal.u32	@ACC[2],$Ni,${N1}[0]
+	vmlal.u32	@ACC[3],$Ni,${N1}[1]
+
+	vmlal.u32	@ACC[4],$Ni,${N2}[0]
+	vmov		$Temp,@ACC[0]
+	vmlal.u32	@ACC[5],$Ni,${N2}[1]
+	vmov		@ACC[0],@ACC[1]
+	vmlal.u32	@ACC[6],$Ni,${N3}[0]
+	vmov		@ACC[1],@ACC[2]
+	vmlal.u32	@ACC[7],$Ni,${N3}[1]
+	vmov		@ACC[2],@ACC[3]
+	vmov		@ACC[3],@ACC[4]
 	vshr.u64	$temp,$temp,#16
-	vmov		$A4xB,$A5xB
-	vmov		$A5xB,$A6xB
-	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
-	vmov		$A6xB,$A7xB
-	veor		$A7xB,$A7xB
+	vmov		@ACC[4],@ACC[5]
+	vmov		@ACC[5],@ACC[6]
+	vadd.u64	$temp,$temp,$Temp#hi
+	vmov		@ACC[6],@ACC[7]
+	veor		@ACC[7],@ACC[7]
 	vshr.u64	$temp,$temp,#16
 
 	bne	.LNEON_outer8
 
-	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp
 	mov		$toutptr,sp
-	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
+	vshr.u64	$temp,@ACC[0]#lo,#16
 	mov		$inner,$num
-	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
-	add		$tinptr,sp,#16
-	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
-	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`
+	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
+	add		$tinptr,sp,#96
+	vshr.u64	$temp,@ACC[0]#hi,#16
+	vzip.16		@ACC[0]#lo,@ACC[0]#hi
 
-	b	.LNEON_tail2
+	b	.LNEON_tail_entry
 
 .align	4
-.LNEON_1st:
-	vmlal.u32	$A0xB,$Ni,${N0}[0]
-	 vld1.32	{$A0-$A3}, [$aptr]!
-	vmlal.u32	$A1xB,$Ni,${N0}[1]
-	 subs		$inner,$inner,#8
-	vmlal.u32	$A2xB,$Ni,${N1}[0]
-	vmlal.u32	$A3xB,$Ni,${N1}[1]
-
-	vmlal.u32	$A4xB,$Ni,${N2}[0]
-	 vld1.32	{$N0-$N1}, [$nptr]!
-	vmlal.u32	$A5xB,$Ni,${N2}[1]
-	 vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!
-	vmlal.u32	$A6xB,$Ni,${N3}[0]
-	vmlal.u32	$A7xB,$Ni,${N3}[1]
-	 vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!
-
-	vmull.u32	$A0xB,$Bi,${A0}[0]
-	 vld1.32	{$N2-$N3}, [$nptr]!
-	vmull.u32	$A1xB,$Bi,${A0}[1]
-	 vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!
-	vmull.u32	$A2xB,$Bi,${A1}[0]
-	vmull.u32	$A3xB,$Bi,${A1}[1]
-	 vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!
-
-	vmull.u32	$A4xB,$Bi,${A2}[0]
-	vmull.u32	$A5xB,$Bi,${A2}[1]
-	vmull.u32	$A6xB,$Bi,${A3}[0]
-	vmull.u32	$A7xB,$Bi,${A3}[1]
-
-	bne	.LNEON_1st
-
-	vmlal.u32	$A0xB,$Ni,${N0}[0]
-	 add		$tinptr,sp,#16
-	vmlal.u32	$A1xB,$Ni,${N0}[1]
-	 sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
-	vmlal.u32	$A2xB,$Ni,${N1}[0]
-	 vld1.64	{$Temp}, [sp,:128]
-	vmlal.u32	$A3xB,$Ni,${N1}[1]
-	 sub		$outer,$num,#1
-
-	vmlal.u32	$A4xB,$Ni,${N2}[0]
-	 vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!
-	vmlal.u32	$A5xB,$Ni,${N2}[1]
-	 vshr.u64	$temp,$temp,#16
-	 vld1.64	{$A0xB}, [$tinptr, :128]!
-	vmlal.u32	$A6xB,$Ni,${N3}[0]
-	 vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!
-	vmlal.u32	$A7xB,$Ni,${N3}[1]
-	 vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!
-	 vadd.u64	$temp,$temp,`&Dhi("$Temp")`
-	 veor		$Z,$Z,$Z
-	 vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!
-	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
-	 vst1.64	{$Z}, [$toutptr,:128]
-	 vshr.u64	$temp,$temp,#16
-
-	b		.LNEON_outer
+.LNEON_8n:
+	veor		@ACC[0],@ACC[0],@ACC[0]
+	 sub		$toutptr,sp,#128
+	veor		@ACC[1],@ACC[1],@ACC[1]
+	 sub		$toutptr,$toutptr,$num,lsl#4
+	veor		@ACC[2],@ACC[2],@ACC[2]
+	 and		$toutptr,$toutptr,#-64
+	veor		@ACC[3],@ACC[3],@ACC[3]
+	 mov		sp,$toutptr			@ alloca
+	veor		@ACC[4],@ACC[4],@ACC[4]
+	 add		$toutptr,$toutptr,#256
+	veor		@ACC[5],@ACC[5],@ACC[5]
+	 sub		$inner,$num,#8
+	veor		@ACC[6],@ACC[6],@ACC[6]
+	veor		@ACC[7],@ACC[7],@ACC[7]
+
+.LNEON_8n_init:
+	vst1.64		{@ACC[0]-@ACC[1]},[$toutptr,:256]!
+	subs		$inner,$inner,#8
+	vst1.64		{@ACC[2]-@ACC[3]},[$toutptr,:256]!
+	vst1.64		{@ACC[4]-@ACC[5]},[$toutptr,:256]!
+	vst1.64		{@ACC[6]-@ACC[7]},[$toutptr,:256]!
+	bne		.LNEON_8n_init
+
+	add		$tinptr,sp,#256
+	vld1.32		{$A0-$A3},[$aptr]!
+	add		$bnptr,sp,#8
+	vld1.32		{${M0}[0]},[$n0,:32]
+	mov		$outer,$num
+	b		.LNEON_8n_outer
 
 .align	4
-.LNEON_outer:
-	vld1.32		{${Bi}[0]}, [$bptr,:32]!
-	sub		$nptr,$nptr,$num,lsl#2		@ rewind $nptr
-	vld1.32		{$A0-$A3}, [$aptr]!
+.LNEON_8n_outer:
+	vld1.32		{${Bi}[0]},[$bptr,:32]!	@ *b++
 	veor		$zero,$zero,$zero
-	mov		$toutptr,sp
 	vzip.16		$Bi,$zero
-	sub		$inner,$num,#8
-	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
-
-	vmlal.u32	$A0xB,$Bi,${A0}[0]
-	 vld1.64	{$A3xB-$A4xB},[$tinptr,:256]!
-	vmlal.u32	$A1xB,$Bi,${A0}[1]
-	vmlal.u32	$A2xB,$Bi,${A1}[0]
-	 vld1.64	{$A5xB-$A6xB},[$tinptr,:256]!
-	vmlal.u32	$A3xB,$Bi,${A1}[1]
-
-	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
-	veor		$zero,$zero,$zero
-	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
-	 vld1.64	{$A7xB},[$tinptr,:128]!
-	vmul.u32	$Ni,$temp,$M0
-
-	vmlal.u32	$A4xB,$Bi,${A2}[0]
-	 vld1.32	{$N0-$N3}, [$nptr]!
-	vmlal.u32	$A5xB,$Bi,${A2}[1]
-	vmlal.u32	$A6xB,$Bi,${A3}[0]
-	vzip.16		$Ni,$zero
-	vmlal.u32	$A7xB,$Bi,${A3}[1]
-
-.LNEON_inner:
-	vmlal.u32	$A0xB,$Ni,${N0}[0]
-	 vld1.32	{$A0-$A3}, [$aptr]!
-	vmlal.u32	$A1xB,$Ni,${N0}[1]
-	 subs		$inner,$inner,#8
-	vmlal.u32	$A2xB,$Ni,${N1}[0]
-	vmlal.u32	$A3xB,$Ni,${N1}[1]
-	 vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!
-
-	vmlal.u32	$A4xB,$Ni,${N2}[0]
-	 vld1.64	{$A0xB}, [$tinptr, :128]!
-	vmlal.u32	$A5xB,$Ni,${N2}[1]
-	 vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!
-	vmlal.u32	$A6xB,$Ni,${N3}[0]
-	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
-	vmlal.u32	$A7xB,$Ni,${N3}[1]
-	 vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!
-
-	vmlal.u32	$A0xB,$Bi,${A0}[0]
-	 vld1.64	{$A3xB-$A4xB}, [$tinptr, :256]!
-	vmlal.u32	$A1xB,$Bi,${A0}[1]
-	 vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!
-	vmlal.u32	$A2xB,$Bi,${A1}[0]
-	 vld1.64	{$A5xB-$A6xB}, [$tinptr, :256]!
-	vmlal.u32	$A3xB,$Bi,${A1}[1]
-	 vld1.32	{$N0-$N3}, [$nptr]!
-
-	vmlal.u32	$A4xB,$Bi,${A2}[0]
-	 vld1.64	{$A7xB}, [$tinptr, :128]!
-	vmlal.u32	$A5xB,$Bi,${A2}[1]
-	vmlal.u32	$A6xB,$Bi,${A3}[0]
-	vmlal.u32	$A7xB,$Bi,${A3}[1]
-
-	bne	.LNEON_inner
-
-	vmlal.u32	$A0xB,$Ni,${N0}[0]
-	 add		$tinptr,sp,#16
-	vmlal.u32	$A1xB,$Ni,${N0}[1]
-	 sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
-	vmlal.u32	$A2xB,$Ni,${N1}[0]
-	 vld1.64	{$Temp}, [sp,:128]
-	vmlal.u32	$A3xB,$Ni,${N1}[1]
-	 subs		$outer,$outer,#1
-
-	vmlal.u32	$A4xB,$Ni,${N2}[0]
-	 vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!
-	vmlal.u32	$A5xB,$Ni,${N2}[1]
-	 vld1.64	{$A0xB}, [$tinptr, :128]!
-	 vshr.u64	$temp,$temp,#16
-	 vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!
-	vmlal.u32	$A6xB,$Ni,${N3}[0]
-	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
-	vmlal.u32	$A7xB,$Ni,${N3}[1]
-	 vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!
-	 vadd.u64	$temp,$temp,`&Dhi("$Temp")`
-	 vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!
-	 vshr.u64	$temp,$temp,#16
-
-	bne	.LNEON_outer
+	add		$toutptr,sp,#128
+	vld1.32		{$N0-$N3},[$nptr]!
+
+	vmlal.u32	@ACC[0],$Bi,${A0}[0]
+	vmlal.u32	@ACC[1],$Bi,${A0}[1]
+	 veor		$zero,$zero,$zero
+	vmlal.u32	@ACC[2],$Bi,${A1}[0]
+	 vshl.i64	$Ni,@ACC[0]#hi,#16
+	vmlal.u32	@ACC[3],$Bi,${A1}[1]
+	 vadd.u64	$Ni,$Ni,@ACC[0]#lo
+	vmlal.u32	@ACC[4],$Bi,${A2}[0]
+	 vmul.u32	$Ni,$Ni,$M0
+	vmlal.u32	@ACC[5],$Bi,${A2}[1]
+	vst1.32		{$Bi},[sp,:64]		@ put aside smashed b[8*i+0]
+	vmlal.u32	@ACC[6],$Bi,${A3}[0]
+	 vzip.16	$Ni,$zero
+	vmlal.u32	@ACC[7],$Bi,${A3}[1]
+___
+for ($i=0; $i<7;) {
+$code.=<<___;
+	vld1.32		{${Bi}[0]},[$bptr,:32]!	@ *b++
+	vmlal.u32	@ACC[0],$Ni,${N0}[0]
+	veor		$temp,$temp,$temp
+	vmlal.u32	@ACC[1],$Ni,${N0}[1]
+	vzip.16		$Bi,$temp
+	vmlal.u32	@ACC[2],$Ni,${N1}[0]
+	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
+	vmlal.u32	@ACC[3],$Ni,${N1}[1]
+	vmlal.u32	@ACC[4],$Ni,${N2}[0]
+	 vadd.u64	@ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
+	vmlal.u32	@ACC[5],$Ni,${N2}[1]
+	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
+	vmlal.u32	@ACC[6],$Ni,${N3}[0]
+	vmlal.u32	@ACC[7],$Ni,${N3}[1]
+	 vadd.u64	@ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
+	vst1.32		{$Ni},[$bnptr,:64]!	@ put aside smashed m[8*i+$i]
+___
+	push(@ACC,shift(@ACC));	$i++;
+$code.=<<___;
+	vmlal.u32	@ACC[0],$Bi,${A0}[0]
+	vld1.64		{@ACC[7]},[$tinptr,:128]!
+	vmlal.u32	@ACC[1],$Bi,${A0}[1]
+	veor		$zero,$zero,$zero
+	vmlal.u32	@ACC[2],$Bi,${A1}[0]
+	 vshl.i64	$Ni,@ACC[0]#hi,#16
+	vmlal.u32	@ACC[3],$Bi,${A1}[1]
+	 vadd.u64	$Ni,$Ni,@ACC[0]#lo
+	vmlal.u32	@ACC[4],$Bi,${A2}[0]
+	 vmul.u32	$Ni,$Ni,$M0
+	vmlal.u32	@ACC[5],$Bi,${A2}[1]
+	vst1.32		{$Bi},[$bnptr,:64]!	@ put aside smashed b[8*i+$i]
+	vmlal.u32	@ACC[6],$Bi,${A3}[0]
+	 vzip.16	$Ni,$zero
+	vmlal.u32	@ACC[7],$Bi,${A3}[1]
+___
+}
+$code.=<<___;
+	vld1.32		{$Bi},[sp,:64]		@ pull smashed b[8*i+0]
+	vmlal.u32	@ACC[0],$Ni,${N0}[0]
+	vld1.32		{$A0-$A3},[$aptr]!
+	vmlal.u32	@ACC[1],$Ni,${N0}[1]
+	vmlal.u32	@ACC[2],$Ni,${N1}[0]
+	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
+	vmlal.u32	@ACC[3],$Ni,${N1}[1]
+	vmlal.u32	@ACC[4],$Ni,${N2}[0]
+	 vadd.u64	@ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
+	vmlal.u32	@ACC[5],$Ni,${N2}[1]
+	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
+	vmlal.u32	@ACC[6],$Ni,${N3}[0]
+	vmlal.u32	@ACC[7],$Ni,${N3}[1]
+	 vadd.u64	@ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
+	vst1.32		{$Ni},[$bnptr,:64]	@ put aside smashed m[8*i+$i]
+	add		$bnptr,sp,#8		@ rewind
+___
+	push(@ACC,shift(@ACC));
+$code.=<<___;
+	sub		$inner,$num,#8
+	b		.LNEON_8n_inner
+
+.align	4
+.LNEON_8n_inner:
+	subs		$inner,$inner,#8
+	vmlal.u32	@ACC[0],$Bi,${A0}[0]
+	vld1.64		{@ACC[7]},[$tinptr,:128]
+	vmlal.u32	@ACC[1],$Bi,${A0}[1]
+	vld1.32		{$Ni},[$bnptr,:64]!	@ pull smashed m[8*i+0]
+	vmlal.u32	@ACC[2],$Bi,${A1}[0]
+	vld1.32		{$N0-$N3},[$nptr]!
+	vmlal.u32	@ACC[3],$Bi,${A1}[1]
+	it		ne
+	addne		$tinptr,$tinptr,#16	@ don't advance in last iteration
+	vmlal.u32	@ACC[4],$Bi,${A2}[0]
+	vmlal.u32	@ACC[5],$Bi,${A2}[1]
+	vmlal.u32	@ACC[6],$Bi,${A3}[0]
+	vmlal.u32	@ACC[7],$Bi,${A3}[1]
+___
+for ($i=1; $i<8; $i++) {
+$code.=<<___;
+	vld1.32		{$Bi},[$bnptr,:64]!	@ pull smashed b[8*i+$i]
+	vmlal.u32	@ACC[0],$Ni,${N0}[0]
+	vmlal.u32	@ACC[1],$Ni,${N0}[1]
+	vmlal.u32	@ACC[2],$Ni,${N1}[0]
+	vmlal.u32	@ACC[3],$Ni,${N1}[1]
+	vmlal.u32	@ACC[4],$Ni,${N2}[0]
+	vmlal.u32	@ACC[5],$Ni,${N2}[1]
+	vmlal.u32	@ACC[6],$Ni,${N3}[0]
+	vmlal.u32	@ACC[7],$Ni,${N3}[1]
+	vst1.64		{@ACC[0]},[$toutptr,:128]!
+___
+	push(@ACC,shift(@ACC));
+$code.=<<___;
+	vmlal.u32	@ACC[0],$Bi,${A0}[0]
+	vld1.64		{@ACC[7]},[$tinptr,:128]
+	vmlal.u32	@ACC[1],$Bi,${A0}[1]
+	vld1.32		{$Ni},[$bnptr,:64]!	@ pull smashed m[8*i+$i]
+	vmlal.u32	@ACC[2],$Bi,${A1}[0]
+	it		ne
+	addne		$tinptr,$tinptr,#16	@ don't advance in last iteration
+	vmlal.u32	@ACC[3],$Bi,${A1}[1]
+	vmlal.u32	@ACC[4],$Bi,${A2}[0]
+	vmlal.u32	@ACC[5],$Bi,${A2}[1]
+	vmlal.u32	@ACC[6],$Bi,${A3}[0]
+	vmlal.u32	@ACC[7],$Bi,${A3}[1]
+___
+}
+$code.=<<___;
+	it		eq
+	subeq		$aptr,$aptr,$num,lsl#2	@ rewind
+	vmlal.u32	@ACC[0],$Ni,${N0}[0]
+	vld1.32		{$Bi},[sp,:64]		@ pull smashed b[8*i+0]
+	vmlal.u32	@ACC[1],$Ni,${N0}[1]
+	vld1.32		{$A0-$A3},[$aptr]!
+	vmlal.u32	@ACC[2],$Ni,${N1}[0]
+	add		$bnptr,sp,#8		@ rewind
+	vmlal.u32	@ACC[3],$Ni,${N1}[1]
+	vmlal.u32	@ACC[4],$Ni,${N2}[0]
+	vmlal.u32	@ACC[5],$Ni,${N2}[1]
+	vmlal.u32	@ACC[6],$Ni,${N3}[0]
+	vst1.64		{@ACC[0]},[$toutptr,:128]!
+	vmlal.u32	@ACC[7],$Ni,${N3}[1]
+
+	bne		.LNEON_8n_inner
+___
+	push(@ACC,shift(@ACC));
+$code.=<<___;
+	add		$tinptr,sp,#128
+	vst1.64		{@ACC[0]-@ACC[1]},[$toutptr,:256]!
+	veor		q2,q2,q2		@ $N0-$N1
+	vst1.64		{@ACC[2]-@ACC[3]},[$toutptr,:256]!
+	veor		q3,q3,q3		@ $N2-$N3
+	vst1.64		{@ACC[4]-@ACC[5]},[$toutptr,:256]!
+	vst1.64		{@ACC[6]},[$toutptr,:128]
+
+	subs		$outer,$outer,#8
+	vld1.64		{@ACC[0]-@ACC[1]},[$tinptr,:256]!
+	vld1.64		{@ACC[2]-@ACC[3]},[$tinptr,:256]!
+	vld1.64		{@ACC[4]-@ACC[5]},[$tinptr,:256]!
+	vld1.64		{@ACC[6]-@ACC[7]},[$tinptr,:256]!
+
+	itt		ne
+	subne		$nptr,$nptr,$num,lsl#2	@ rewind
+	bne		.LNEON_8n_outer
+
+	add		$toutptr,sp,#128
+	vst1.64		{q2-q3}, [sp,:256]!	@ start wiping stack frame
+	vshr.u64	$temp,@ACC[0]#lo,#16
+	vst1.64		{q2-q3},[sp,:256]!
+	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
+	vst1.64		{q2-q3}, [sp,:256]!
+	vshr.u64	$temp,@ACC[0]#hi,#16
+	vst1.64		{q2-q3}, [sp,:256]!
+	vzip.16		@ACC[0]#lo,@ACC[0]#hi
 
 	mov		$toutptr,sp
 	mov		$inner,$num
+	b		.LNEON_tail_entry
 
+.align	4
 .LNEON_tail:
-	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
-	vld1.64		{$A3xB-$A4xB}, [$tinptr, :256]!
-	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
-	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
-	vld1.64		{$A5xB-$A6xB}, [$tinptr, :256]!
-	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
-	vld1.64		{$A7xB}, [$tinptr, :128]!
-	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`
-
-.LNEON_tail2:
-	vadd.u64	`&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
-	vst1.32		{`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
-	vshr.u64	$temp,`&Dlo("$A1xB")`,#16
-	vadd.u64	`&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
-	vshr.u64	$temp,`&Dhi("$A1xB")`,#16
-	vzip.16		`&Dlo("$A1xB")`,`&Dhi("$A1xB")`
-
-	vadd.u64	`&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
-	vst1.32		{`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
-	vshr.u64	$temp,`&Dlo("$A2xB")`,#16
-	vadd.u64	`&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
-	vshr.u64	$temp,`&Dhi("$A2xB")`,#16
-	vzip.16		`&Dlo("$A2xB")`,`&Dhi("$A2xB")`
-
-	vadd.u64	`&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
-	vst1.32		{`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
-	vshr.u64	$temp,`&Dlo("$A3xB")`,#16
-	vadd.u64	`&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
-	vshr.u64	$temp,`&Dhi("$A3xB")`,#16
-	vzip.16		`&Dlo("$A3xB")`,`&Dhi("$A3xB")`
-
-	vadd.u64	`&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
-	vst1.32		{`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
-	vshr.u64	$temp,`&Dlo("$A4xB")`,#16
-	vadd.u64	`&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
-	vshr.u64	$temp,`&Dhi("$A4xB")`,#16
-	vzip.16		`&Dlo("$A4xB")`,`&Dhi("$A4xB")`
-
-	vadd.u64	`&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
-	vst1.32		{`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
-	vshr.u64	$temp,`&Dlo("$A5xB")`,#16
-	vadd.u64	`&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
-	vshr.u64	$temp,`&Dhi("$A5xB")`,#16
-	vzip.16		`&Dlo("$A5xB")`,`&Dhi("$A5xB")`
-
-	vadd.u64	`&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
-	vst1.32		{`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
-	vshr.u64	$temp,`&Dlo("$A6xB")`,#16
-	vadd.u64	`&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
-	 vld1.64	{$A0xB}, [$tinptr, :128]!
-	vshr.u64	$temp,`&Dhi("$A6xB")`,#16
-	vzip.16		`&Dlo("$A6xB")`,`&Dhi("$A6xB")`
-
-	vadd.u64	`&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
-	vst1.32		{`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
-	vshr.u64	$temp,`&Dlo("$A7xB")`,#16
-	vadd.u64	`&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
-	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
-	vshr.u64	$temp,`&Dhi("$A7xB")`,#16
-	vzip.16		`&Dlo("$A7xB")`,`&Dhi("$A7xB")`
+	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp
+	vshr.u64	$temp,@ACC[0]#lo,#16
+	 vld1.64	{@ACC[2]-@ACC[3]}, [$tinptr, :256]!
+	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
+	 vld1.64	{@ACC[4]-@ACC[5]}, [$tinptr, :256]!
+	vshr.u64	$temp,@ACC[0]#hi,#16
+	 vld1.64	{@ACC[6]-@ACC[7]}, [$tinptr, :256]!
+	vzip.16		@ACC[0]#lo,@ACC[0]#hi
+
+.LNEON_tail_entry:
+___
+for ($i=1; $i<8; $i++) {
+$code.=<<___;
+	vadd.u64	@ACC[1]#lo,@ACC[1]#lo,$temp
+	vst1.32		{@ACC[0]#lo[0]}, [$toutptr, :32]!
+	vshr.u64	$temp,@ACC[1]#lo,#16
+	vadd.u64	@ACC[1]#hi,@ACC[1]#hi,$temp
+	vshr.u64	$temp,@ACC[1]#hi,#16
+	vzip.16		@ACC[1]#lo,@ACC[1]#hi
+___
+	push(@ACC,shift(@ACC));
+}
+	push(@ACC,shift(@ACC));
+$code.=<<___;
+	vld1.64		{@ACC[0]-@ACC[1]}, [$tinptr, :256]!
 	subs		$inner,$inner,#8
-	vst1.32		{`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!
-
+-	vst1.32		{`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!
+	vst1.32		{@ACC[7]#lo[0]}, [$toutptr, :32]!
 	bne	.LNEON_tail
 
 	vst1.32	{${temp}[0]}, [$toutptr, :32]		@ top-most bit
@@ -708,8 +736,14 @@ $code.=<<___;
 #endif
 ___
 
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
-$code =~ s/\bret\b/bx	lr/gm;
-print $code;
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/ge;
+
+	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge	or
+	s/\bret\b/bx	lr/g	or
+	s/\bbx\s+lr\b/.word\t0xe12fff1e/g;	# make it possible to compile with -march=armv4
+
+	print $_,"\n";
+}
 close STDOUT;
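Note on the q<N>#lo/q<N>#hi notation this change introduces: it replaces the old Dlo()/Dhi() helper subs with a symbolic suffix that the new line-by-line postprocessing loop rewrites into real register names, exploiting the fact that each 128-bit NEON register qN aliases the two 64-bit registers d(2N) (low half) and d(2N+1) (high half). The sketch below is illustration only, not part of the patch; it feeds two symbolic lines through the same substitution the patch uses, assuming nothing beyond core Perl.

    #!/usr/bin/env perl
    # q<N>#lo maps to d(2N), q<N>#hi maps to d(2N+1) -- same regex as
    # the patch's postprocessing loop.
    foreach my $line ("vshl.i64   d31,q6#hi,#16",
                      "vadd.u64   d31,d31,q6#lo") {
        (my $out = $line) =~ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge;
        print "$line  =>  $out\n";   # q6#hi => d13, q6#lo => d12
    }

Because the substitution runs once per output line, the symbolic form can appear anywhere the generator emits text, which is why the declarations above can simply set $zero="$Z#lo" instead of calling a helper at generation time.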
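The other recurring device in the new code path is the @ACC register ring. Reading the patch, the generator rotates the array of accumulator names between unrolled steps instead of emitting vmov chains the way the retained num==8 special case does. A minimal standalone sketch of the idiom (illustration only, plain Perl):

    #!/usr/bin/env perl
    # @ACC names the eight accumulator registers q6-q13.  Rotating the
    # array after each unrolled step re-labels the registers at code
    # generation time, so @ACC[0] always denotes the limb about to be
    # retired, with no shuffle instructions in the emitted assembly.
    my @ACC = map("q$_", (6..13));
    foreach my $step (1..3) {
        push(@ACC, shift(@ACC));        # rotate left by one position
        print "step $step: ACC[0]=$ACC[0] ... ACC[7]=$ACC[7]\n";
    }
    # step 1: ACC[0]=q7 ... ACC[7]=q6
    # step 2: ACC[0]=q8 ... ACC[7]=q7
    # step 3: ACC[0]=q9 ... ACC[7]=q8

After eight rotations the array is back in its starting order, which is what lets the 8x-unrolled inner loop in the patch end each pass with the same register assignment it started with.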