Loading crypto/aes/asm/aesv8-armx.pl +389 −6 Original line number Diff line number Diff line Loading @@ -27,19 +27,34 @@ # CBC encrypt case. On Cortex-A57 parallelizable mode performance # seems to be limited by sheer amount of NEON instructions... # # April 2019 # # Key to performance of parallelize-able modes is round instruction # interleaving. But which factor to use? There is optimal one for # each combination of instruction latency and issue rate, beyond # which increasing interleave factor doesn't pay off. While on cons # side we have code size increase and resource waste on platforms for # which interleave factor is too high. In other words you want it to # be just right. So far interleave factor of 3x was serving well all # platforms. But for ThunderX2 optimal interleave factor was measured # to be 5x... # # Performance in cycles per byte processed with 128-bit key: # # CBC enc CBC dec CTR # Apple A7 2.39 1.20 1.20 # Cortex-A53 1.32 1.29 1.46 # Cortex-A57(*) 1.95 0.85 0.93 # Denver 1.96 0.86 0.80 # Mongoose 1.33 1.20 1.20 # Kryo 1.26 0.94 1.00 # ThunderX2 5.95 1.53 1.55 # Cortex-A53 1.32 1.17/1.29(**) 1.36/1.46 # Cortex-A57(*) 1.95 0.82/0.85 0.89/0.93 # Cortex-A72 1.33 0.85/0.88 0.92/0.96 # Denver 1.96 0.65/0.86 0.76/0.80 # Mongoose 1.33 1.23/1.20 1.30/1.20 # Kryo 1.26 0.87/0.94 1.00/1.00 # ThunderX2 5.95 1.25 1.30 # # (*) original 3.64/1.34/1.32 results were for r0p0 revision # and are still same even for updated module; # (**) numbers after slash are for 32-bit code, which is 3x- # interleaved; $flavour = shift; $output = shift; Loading Loading @@ -524,6 +539,13 @@ $code.=<<___; ___ { my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); my ($dat3,$in3,$tmp3); # used only in 64-bit mode my ($dat4,$in4,$tmp4); if ($flavour =~ /64/) { ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); } $code.=<<___; .align 5 .Lcbc_dec: Loading @@ -540,7 +562,196 @@ $code.=<<___; vorr $in0,$dat,$dat vorr $in1,$dat1,$dat1 vorr $in2,$dat2,$dat2 ___ $code.=<<___ if ($flavour =~ /64/); cmp $len,#32 b.lo .Loop3x_cbc_dec vld1.8 {$dat3},[$inp],#16 vld1.8 {$dat4},[$inp],#16 sub $len,$len,#32 // bias mov $cnt,$rounds vorr $in3,$dat3,$dat3 vorr $in4,$dat4,$dat4 .Loop5x_cbc_dec: aesd $dat0,q8 aesimc $dat0,$dat0 aesd $dat1,q8 aesimc $dat1,$dat1 aesd $dat2,q8 aesimc $dat2,$dat2 aesd $dat3,q8 aesimc $dat3,$dat3 aesd $dat4,q8 aesimc $dat4,$dat4 vld1.32 {q8},[$key_],#16 subs $cnt,$cnt,#2 aesd $dat0,q9 aesimc $dat0,$dat0 aesd $dat1,q9 aesimc $dat1,$dat1 aesd $dat2,q9 aesimc $dat2,$dat2 aesd $dat3,q9 aesimc $dat3,$dat3 aesd $dat4,q9 aesimc $dat4,$dat4 vld1.32 {q9},[$key_],#16 b.gt .Loop5x_cbc_dec aesd $dat0,q8 aesimc $dat0,$dat0 aesd $dat1,q8 aesimc $dat1,$dat1 aesd $dat2,q8 aesimc $dat2,$dat2 aesd $dat3,q8 aesimc $dat3,$dat3 aesd $dat4,q8 aesimc $dat4,$dat4 cmp $len,#0x40 // because .Lcbc_tail4x sub $len,$len,#0x50 aesd $dat0,q9 aesimc $dat0,$dat0 aesd $dat1,q9 aesimc $dat1,$dat1 aesd $dat2,q9 aesimc $dat2,$dat2 aesd $dat3,q9 aesimc $dat3,$dat3 aesd $dat4,q9 aesimc $dat4,$dat4 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo mov $key_,$key aesd $dat0,q10 aesimc $dat0,$dat0 aesd $dat1,q10 aesimc $dat1,$dat1 aesd $dat2,q10 aesimc $dat2,$dat2 aesd $dat3,q10 aesimc $dat3,$dat3 aesd $dat4,q10 aesimc $dat4,$dat4 add $inp,$inp,x6 // $inp is adjusted in such way that // at exit from the loop $dat1-$dat4 // are loaded with last "words" add x6,$len,#0x60 // because .Lcbc_tail4x aesd $dat0,q11 aesimc $dat0,$dat0 aesd $dat1,q11 aesimc $dat1,$dat1 aesd $dat2,q11 aesimc $dat2,$dat2 aesd $dat3,q11 aesimc $dat3,$dat3 aesd $dat4,q11 aesimc $dat4,$dat4 aesd $dat0,q12 aesimc $dat0,$dat0 aesd $dat1,q12 aesimc $dat1,$dat1 aesd $dat2,q12 aesimc $dat2,$dat2 aesd $dat3,q12 aesimc $dat3,$dat3 aesd $dat4,q12 aesimc $dat4,$dat4 aesd $dat0,q13 aesimc $dat0,$dat0 aesd $dat1,q13 aesimc $dat1,$dat1 aesd $dat2,q13 aesimc $dat2,$dat2 aesd $dat3,q13 aesimc $dat3,$dat3 aesd $dat4,q13 aesimc $dat4,$dat4 aesd $dat0,q14 aesimc $dat0,$dat0 aesd $dat1,q14 aesimc $dat1,$dat1 aesd $dat2,q14 aesimc $dat2,$dat2 aesd $dat3,q14 aesimc $dat3,$dat3 aesd $dat4,q14 aesimc $dat4,$dat4 veor $tmp0,$ivec,$rndlast aesd $dat0,q15 veor $tmp1,$in0,$rndlast vld1.8 {$in0},[$inp],#16 aesd $dat1,q15 veor $tmp2,$in1,$rndlast vld1.8 {$in1},[$inp],#16 aesd $dat2,q15 veor $tmp3,$in2,$rndlast vld1.8 {$in2},[$inp],#16 aesd $dat3,q15 veor $tmp4,$in3,$rndlast vld1.8 {$in3},[$inp],#16 aesd $dat4,q15 vorr $ivec,$in4,$in4 vld1.8 {$in4},[$inp],#16 cbz x6,.Lcbc_tail4x vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] veor $tmp0,$tmp0,$dat0 vorr $dat0,$in0,$in0 veor $tmp1,$tmp1,$dat1 vorr $dat1,$in1,$in1 veor $tmp2,$tmp2,$dat2 vorr $dat2,$in2,$in2 veor $tmp3,$tmp3,$dat3 vorr $dat3,$in3,$in3 veor $tmp4,$tmp4,$dat4 vst1.8 {$tmp0},[$out],#16 vorr $dat4,$in4,$in4 vst1.8 {$tmp1},[$out],#16 mov $cnt,$rounds vst1.8 {$tmp2},[$out],#16 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] vst1.8 {$tmp3},[$out],#16 vst1.8 {$tmp4},[$out],#16 b.hs .Loop5x_cbc_dec add $len,$len,#0x50 cbz $len,.Lcbc_done add $cnt,$rounds,#2 subs $len,$len,#0x30 vorr $dat0,$in2,$in2 vorr $in0,$in2,$in2 vorr $dat1,$in3,$in3 vorr $in1,$in3,$in3 vorr $dat2,$in4,$in4 vorr $in2,$in4,$in4 b.lo .Lcbc_dec_tail b .Loop3x_cbc_dec .align 4 .Lcbc_tail4x: veor $tmp1,$tmp0,$dat1 veor $tmp2,$tmp2,$dat2 veor $tmp3,$tmp3,$dat3 veor $tmp4,$tmp4,$dat4 vst1.8 {$tmp1},[$out],#16 vst1.8 {$tmp2},[$out],#16 vst1.8 {$tmp3},[$out],#16 vst1.8 {$tmp4},[$out],#16 b .Lcbc_done .align 4 ___ $code.=<<___; .Loop3x_cbc_dec: aesd $dat0,q8 aesimc $dat0,$dat0 Loading Loading @@ -701,6 +912,9 @@ my $step="x12"; # aliases with $tctr2 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); # used only in 64-bit mode... my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23)); my ($dat,$tmp)=($dat0,$tmp0); ### q8-q15 preloaded key schedule Loading Loading @@ -753,6 +967,175 @@ $code.=<<___; rev $tctr2, $ctr sub $len,$len,#3 // bias vmov.32 ${dat2}[3],$tctr2 ___ $code.=<<___ if ($flavour =~ /64/); cmp $len,#2 b.lo .Loop3x_ctr32 add w13,$ctr,#1 add w14,$ctr,#2 vorr $dat3,$dat0,$dat0 rev w13,w13 vorr $dat4,$dat0,$dat0 rev w14,w14 vmov.32 ${dat3}[3],w13 sub $len,$len,#2 // bias vmov.32 ${dat4}[3],w14 add $ctr,$ctr,#2 b .Loop5x_ctr32 .align 4 .Loop5x_ctr32: aese $dat0,q8 aesmc $dat0,$dat0 aese $dat1,q8 aesmc $dat1,$dat1 aese $dat2,q8 aesmc $dat2,$dat2 aese $dat3,q8 aesmc $dat3,$dat3 aese $dat4,q8 aesmc $dat4,$dat4 vld1.32 {q8},[$key_],#16 subs $cnt,$cnt,#2 aese $dat0,q9 aesmc $dat0,$dat0 aese $dat1,q9 aesmc $dat1,$dat1 aese $dat2,q9 aesmc $dat2,$dat2 aese $dat3,q9 aesmc $dat3,$dat3 aese $dat4,q9 aesmc $dat4,$dat4 vld1.32 {q9},[$key_],#16 b.gt .Loop5x_ctr32 mov $key_,$key aese $dat0,q8 aesmc $dat0,$dat0 aese $dat1,q8 aesmc $dat1,$dat1 aese $dat2,q8 aesmc $dat2,$dat2 aese $dat3,q8 aesmc $dat3,$dat3 aese $dat4,q8 aesmc $dat4,$dat4 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] aese $dat0,q9 aesmc $dat0,$dat0 aese $dat1,q9 aesmc $dat1,$dat1 aese $dat2,q9 aesmc $dat2,$dat2 aese $dat3,q9 aesmc $dat3,$dat3 aese $dat4,q9 aesmc $dat4,$dat4 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] aese $dat0,q12 aesmc $dat0,$dat0 add $tctr0,$ctr,#1 add $tctr1,$ctr,#2 aese $dat1,q12 aesmc $dat1,$dat1 add $tctr2,$ctr,#3 add w13,$ctr,#4 aese $dat2,q12 aesmc $dat2,$dat2 add w14,$ctr,#5 rev $tctr0,$tctr0 aese $dat3,q12 aesmc $dat3,$dat3 rev $tctr1,$tctr1 rev $tctr2,$tctr2 aese $dat4,q12 aesmc $dat4,$dat4 rev w13,w13 rev w14,w14 aese $dat0,q13 aesmc $dat0,$dat0 aese $dat1,q13 aesmc $dat1,$dat1 aese $dat2,q13 aesmc $dat2,$dat2 aese $dat3,q13 aesmc $dat3,$dat3 aese $dat4,q13 aesmc $dat4,$dat4 aese $dat0,q14 aesmc $dat0,$dat0 vld1.8 {$in0},[$inp],#16 aese $dat1,q14 aesmc $dat1,$dat1 vld1.8 {$in1},[$inp],#16 aese $dat2,q14 aesmc $dat2,$dat2 vld1.8 {$in2},[$inp],#16 aese $dat3,q14 aesmc $dat3,$dat3 vld1.8 {$in3},[$inp],#16 aese $dat4,q14 aesmc $dat4,$dat4 vld1.8 {$in4},[$inp],#16 aese $dat0,q15 veor $in0,$in0,$rndlast aese $dat1,q15 veor $in1,$in1,$rndlast aese $dat2,q15 veor $in2,$in2,$rndlast aese $dat3,q15 veor $in3,$in3,$rndlast aese $dat4,q15 veor $in4,$in4,$rndlast veor $in0,$in0,$dat0 vorr $dat0,$ivec,$ivec veor $in1,$in1,$dat1 vorr $dat1,$ivec,$ivec veor $in2,$in2,$dat2 vorr $dat2,$ivec,$ivec veor $in3,$in3,$dat3 vorr $dat3,$ivec,$ivec veor $in4,$in4,$dat4 vorr $dat4,$ivec,$ivec vst1.8 {$in0},[$out],#16 vmov.32 ${dat0}[3],$tctr0 vst1.8 {$in1},[$out],#16 vmov.32 ${dat1}[3],$tctr1 vst1.8 {$in2},[$out],#16 vmov.32 ${dat2}[3],$tctr2 vst1.8 {$in3},[$out],#16 vmov.32 ${dat3}[3],w13 vst1.8 {$in4},[$out],#16 vmov.32 ${dat4}[3],w14 mov $cnt,$rounds cbz $len,.Lctr32_done add $ctr,$ctr,#5 subs $len,$len,#5 b.hs .Loop5x_ctr32 add $len,$len,#5 sub $ctr,$ctr,#5 cmp $len,#2 mov $step,#16 cclr $step,lo b.ls .Lctr32_tail sub $len,$len,#3 // bias add $ctr,$ctr,#3 ___ $code.=<<___; b .Loop3x_ctr32 .align 4 Loading Loading
crypto/aes/asm/aesv8-armx.pl +389 −6 Original line number Diff line number Diff line Loading @@ -27,19 +27,34 @@ # CBC encrypt case. On Cortex-A57 parallelizable mode performance # seems to be limited by sheer amount of NEON instructions... # # April 2019 # # Key to performance of parallelize-able modes is round instruction # interleaving. But which factor to use? There is optimal one for # each combination of instruction latency and issue rate, beyond # which increasing interleave factor doesn't pay off. While on cons # side we have code size increase and resource waste on platforms for # which interleave factor is too high. In other words you want it to # be just right. So far interleave factor of 3x was serving well all # platforms. But for ThunderX2 optimal interleave factor was measured # to be 5x... # # Performance in cycles per byte processed with 128-bit key: # # CBC enc CBC dec CTR # Apple A7 2.39 1.20 1.20 # Cortex-A53 1.32 1.29 1.46 # Cortex-A57(*) 1.95 0.85 0.93 # Denver 1.96 0.86 0.80 # Mongoose 1.33 1.20 1.20 # Kryo 1.26 0.94 1.00 # ThunderX2 5.95 1.53 1.55 # Cortex-A53 1.32 1.17/1.29(**) 1.36/1.46 # Cortex-A57(*) 1.95 0.82/0.85 0.89/0.93 # Cortex-A72 1.33 0.85/0.88 0.92/0.96 # Denver 1.96 0.65/0.86 0.76/0.80 # Mongoose 1.33 1.23/1.20 1.30/1.20 # Kryo 1.26 0.87/0.94 1.00/1.00 # ThunderX2 5.95 1.25 1.30 # # (*) original 3.64/1.34/1.32 results were for r0p0 revision # and are still same even for updated module; # (**) numbers after slash are for 32-bit code, which is 3x- # interleaved; $flavour = shift; $output = shift; Loading Loading @@ -524,6 +539,13 @@ $code.=<<___; ___ { my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); my ($dat3,$in3,$tmp3); # used only in 64-bit mode my ($dat4,$in4,$tmp4); if ($flavour =~ /64/) { ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); } $code.=<<___; .align 5 .Lcbc_dec: Loading @@ -540,7 +562,196 @@ $code.=<<___; vorr $in0,$dat,$dat vorr $in1,$dat1,$dat1 vorr $in2,$dat2,$dat2 ___ $code.=<<___ if ($flavour =~ /64/); cmp $len,#32 b.lo .Loop3x_cbc_dec vld1.8 {$dat3},[$inp],#16 vld1.8 {$dat4},[$inp],#16 sub $len,$len,#32 // bias mov $cnt,$rounds vorr $in3,$dat3,$dat3 vorr $in4,$dat4,$dat4 .Loop5x_cbc_dec: aesd $dat0,q8 aesimc $dat0,$dat0 aesd $dat1,q8 aesimc $dat1,$dat1 aesd $dat2,q8 aesimc $dat2,$dat2 aesd $dat3,q8 aesimc $dat3,$dat3 aesd $dat4,q8 aesimc $dat4,$dat4 vld1.32 {q8},[$key_],#16 subs $cnt,$cnt,#2 aesd $dat0,q9 aesimc $dat0,$dat0 aesd $dat1,q9 aesimc $dat1,$dat1 aesd $dat2,q9 aesimc $dat2,$dat2 aesd $dat3,q9 aesimc $dat3,$dat3 aesd $dat4,q9 aesimc $dat4,$dat4 vld1.32 {q9},[$key_],#16 b.gt .Loop5x_cbc_dec aesd $dat0,q8 aesimc $dat0,$dat0 aesd $dat1,q8 aesimc $dat1,$dat1 aesd $dat2,q8 aesimc $dat2,$dat2 aesd $dat3,q8 aesimc $dat3,$dat3 aesd $dat4,q8 aesimc $dat4,$dat4 cmp $len,#0x40 // because .Lcbc_tail4x sub $len,$len,#0x50 aesd $dat0,q9 aesimc $dat0,$dat0 aesd $dat1,q9 aesimc $dat1,$dat1 aesd $dat2,q9 aesimc $dat2,$dat2 aesd $dat3,q9 aesimc $dat3,$dat3 aesd $dat4,q9 aesimc $dat4,$dat4 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo mov $key_,$key aesd $dat0,q10 aesimc $dat0,$dat0 aesd $dat1,q10 aesimc $dat1,$dat1 aesd $dat2,q10 aesimc $dat2,$dat2 aesd $dat3,q10 aesimc $dat3,$dat3 aesd $dat4,q10 aesimc $dat4,$dat4 add $inp,$inp,x6 // $inp is adjusted in such way that // at exit from the loop $dat1-$dat4 // are loaded with last "words" add x6,$len,#0x60 // because .Lcbc_tail4x aesd $dat0,q11 aesimc $dat0,$dat0 aesd $dat1,q11 aesimc $dat1,$dat1 aesd $dat2,q11 aesimc $dat2,$dat2 aesd $dat3,q11 aesimc $dat3,$dat3 aesd $dat4,q11 aesimc $dat4,$dat4 aesd $dat0,q12 aesimc $dat0,$dat0 aesd $dat1,q12 aesimc $dat1,$dat1 aesd $dat2,q12 aesimc $dat2,$dat2 aesd $dat3,q12 aesimc $dat3,$dat3 aesd $dat4,q12 aesimc $dat4,$dat4 aesd $dat0,q13 aesimc $dat0,$dat0 aesd $dat1,q13 aesimc $dat1,$dat1 aesd $dat2,q13 aesimc $dat2,$dat2 aesd $dat3,q13 aesimc $dat3,$dat3 aesd $dat4,q13 aesimc $dat4,$dat4 aesd $dat0,q14 aesimc $dat0,$dat0 aesd $dat1,q14 aesimc $dat1,$dat1 aesd $dat2,q14 aesimc $dat2,$dat2 aesd $dat3,q14 aesimc $dat3,$dat3 aesd $dat4,q14 aesimc $dat4,$dat4 veor $tmp0,$ivec,$rndlast aesd $dat0,q15 veor $tmp1,$in0,$rndlast vld1.8 {$in0},[$inp],#16 aesd $dat1,q15 veor $tmp2,$in1,$rndlast vld1.8 {$in1},[$inp],#16 aesd $dat2,q15 veor $tmp3,$in2,$rndlast vld1.8 {$in2},[$inp],#16 aesd $dat3,q15 veor $tmp4,$in3,$rndlast vld1.8 {$in3},[$inp],#16 aesd $dat4,q15 vorr $ivec,$in4,$in4 vld1.8 {$in4},[$inp],#16 cbz x6,.Lcbc_tail4x vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] veor $tmp0,$tmp0,$dat0 vorr $dat0,$in0,$in0 veor $tmp1,$tmp1,$dat1 vorr $dat1,$in1,$in1 veor $tmp2,$tmp2,$dat2 vorr $dat2,$in2,$in2 veor $tmp3,$tmp3,$dat3 vorr $dat3,$in3,$in3 veor $tmp4,$tmp4,$dat4 vst1.8 {$tmp0},[$out],#16 vorr $dat4,$in4,$in4 vst1.8 {$tmp1},[$out],#16 mov $cnt,$rounds vst1.8 {$tmp2},[$out],#16 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] vst1.8 {$tmp3},[$out],#16 vst1.8 {$tmp4},[$out],#16 b.hs .Loop5x_cbc_dec add $len,$len,#0x50 cbz $len,.Lcbc_done add $cnt,$rounds,#2 subs $len,$len,#0x30 vorr $dat0,$in2,$in2 vorr $in0,$in2,$in2 vorr $dat1,$in3,$in3 vorr $in1,$in3,$in3 vorr $dat2,$in4,$in4 vorr $in2,$in4,$in4 b.lo .Lcbc_dec_tail b .Loop3x_cbc_dec .align 4 .Lcbc_tail4x: veor $tmp1,$tmp0,$dat1 veor $tmp2,$tmp2,$dat2 veor $tmp3,$tmp3,$dat3 veor $tmp4,$tmp4,$dat4 vst1.8 {$tmp1},[$out],#16 vst1.8 {$tmp2},[$out],#16 vst1.8 {$tmp3},[$out],#16 vst1.8 {$tmp4},[$out],#16 b .Lcbc_done .align 4 ___ $code.=<<___; .Loop3x_cbc_dec: aesd $dat0,q8 aesimc $dat0,$dat0 Loading Loading @@ -701,6 +912,9 @@ my $step="x12"; # aliases with $tctr2 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); # used only in 64-bit mode... my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23)); my ($dat,$tmp)=($dat0,$tmp0); ### q8-q15 preloaded key schedule Loading Loading @@ -753,6 +967,175 @@ $code.=<<___; rev $tctr2, $ctr sub $len,$len,#3 // bias vmov.32 ${dat2}[3],$tctr2 ___ $code.=<<___ if ($flavour =~ /64/); cmp $len,#2 b.lo .Loop3x_ctr32 add w13,$ctr,#1 add w14,$ctr,#2 vorr $dat3,$dat0,$dat0 rev w13,w13 vorr $dat4,$dat0,$dat0 rev w14,w14 vmov.32 ${dat3}[3],w13 sub $len,$len,#2 // bias vmov.32 ${dat4}[3],w14 add $ctr,$ctr,#2 b .Loop5x_ctr32 .align 4 .Loop5x_ctr32: aese $dat0,q8 aesmc $dat0,$dat0 aese $dat1,q8 aesmc $dat1,$dat1 aese $dat2,q8 aesmc $dat2,$dat2 aese $dat3,q8 aesmc $dat3,$dat3 aese $dat4,q8 aesmc $dat4,$dat4 vld1.32 {q8},[$key_],#16 subs $cnt,$cnt,#2 aese $dat0,q9 aesmc $dat0,$dat0 aese $dat1,q9 aesmc $dat1,$dat1 aese $dat2,q9 aesmc $dat2,$dat2 aese $dat3,q9 aesmc $dat3,$dat3 aese $dat4,q9 aesmc $dat4,$dat4 vld1.32 {q9},[$key_],#16 b.gt .Loop5x_ctr32 mov $key_,$key aese $dat0,q8 aesmc $dat0,$dat0 aese $dat1,q8 aesmc $dat1,$dat1 aese $dat2,q8 aesmc $dat2,$dat2 aese $dat3,q8 aesmc $dat3,$dat3 aese $dat4,q8 aesmc $dat4,$dat4 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] aese $dat0,q9 aesmc $dat0,$dat0 aese $dat1,q9 aesmc $dat1,$dat1 aese $dat2,q9 aesmc $dat2,$dat2 aese $dat3,q9 aesmc $dat3,$dat3 aese $dat4,q9 aesmc $dat4,$dat4 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] aese $dat0,q12 aesmc $dat0,$dat0 add $tctr0,$ctr,#1 add $tctr1,$ctr,#2 aese $dat1,q12 aesmc $dat1,$dat1 add $tctr2,$ctr,#3 add w13,$ctr,#4 aese $dat2,q12 aesmc $dat2,$dat2 add w14,$ctr,#5 rev $tctr0,$tctr0 aese $dat3,q12 aesmc $dat3,$dat3 rev $tctr1,$tctr1 rev $tctr2,$tctr2 aese $dat4,q12 aesmc $dat4,$dat4 rev w13,w13 rev w14,w14 aese $dat0,q13 aesmc $dat0,$dat0 aese $dat1,q13 aesmc $dat1,$dat1 aese $dat2,q13 aesmc $dat2,$dat2 aese $dat3,q13 aesmc $dat3,$dat3 aese $dat4,q13 aesmc $dat4,$dat4 aese $dat0,q14 aesmc $dat0,$dat0 vld1.8 {$in0},[$inp],#16 aese $dat1,q14 aesmc $dat1,$dat1 vld1.8 {$in1},[$inp],#16 aese $dat2,q14 aesmc $dat2,$dat2 vld1.8 {$in2},[$inp],#16 aese $dat3,q14 aesmc $dat3,$dat3 vld1.8 {$in3},[$inp],#16 aese $dat4,q14 aesmc $dat4,$dat4 vld1.8 {$in4},[$inp],#16 aese $dat0,q15 veor $in0,$in0,$rndlast aese $dat1,q15 veor $in1,$in1,$rndlast aese $dat2,q15 veor $in2,$in2,$rndlast aese $dat3,q15 veor $in3,$in3,$rndlast aese $dat4,q15 veor $in4,$in4,$rndlast veor $in0,$in0,$dat0 vorr $dat0,$ivec,$ivec veor $in1,$in1,$dat1 vorr $dat1,$ivec,$ivec veor $in2,$in2,$dat2 vorr $dat2,$ivec,$ivec veor $in3,$in3,$dat3 vorr $dat3,$ivec,$ivec veor $in4,$in4,$dat4 vorr $dat4,$ivec,$ivec vst1.8 {$in0},[$out],#16 vmov.32 ${dat0}[3],$tctr0 vst1.8 {$in1},[$out],#16 vmov.32 ${dat1}[3],$tctr1 vst1.8 {$in2},[$out],#16 vmov.32 ${dat2}[3],$tctr2 vst1.8 {$in3},[$out],#16 vmov.32 ${dat3}[3],w13 vst1.8 {$in4},[$out],#16 vmov.32 ${dat4}[3],w14 mov $cnt,$rounds cbz $len,.Lctr32_done add $ctr,$ctr,#5 subs $len,$len,#5 b.hs .Loop5x_ctr32 add $len,$len,#5 sub $ctr,$ctr,#5 cmp $len,#2 mov $step,#16 cclr $step,lo b.ls .Lctr32_tail sub $len,$len,#3 // bias add $ctr,$ctr,#3 ___ $code.=<<___; b .Loop3x_ctr32 .align 4 Loading