Loading crypto/aes/asm/aesv8-armx.pl +206 −245 Original line number Diff line number Diff line Loading @@ -11,16 +11,21 @@ # module is endian-agnostic in sense that it supports both big- and # little-endian cases. As does it support both 32- and 64-bit modes # of operation. Latter is achieved by limiting amount of utilized # registers to 16, which implies additional instructions. This has # no effect on mighty Apple A7, as results are literally equal to # the theoretical estimates based on instruction latencies and issue # rate. It remains to be seen how does it affect other platforms... # registers to 16, which implies additional NEON load and integer # instructions. This has no effect on mighty Apple A7, where results # are literally equal to the theoretical estimates based on AES # instruction latencies and issue rates. On Cortex-A53, an in-order # execution core, this costs up to 10-15%, which is partially # compensated by implementing dedicated code path for 128-bit # CBC encrypt case. On Cortex-A57 parallelizable mode performance # seems to be limited by sheer amount of NEON instructions... # # Performance in cycles per byte processed with 128-bit key: # # CBC enc CBC dec CTR # Apple A7 2.39 1.20 1.20 # Cortex-A5x n/a n/a n/a # Cortex-A53 2.45 1.87 1.94 # Cortex-A57 3.64 1.34 1.32 $flavour = shift; open STDOUT,">".shift; Loading Loading @@ -435,189 +440,166 @@ $code.=<<___; vst1.8 {$ivec},[$out],#16 b .Lcbc_done .align 5 .Lcbc_dec128: vld1.32 {$tmp0-$tmp1},[$key_] veor $ivec,$ivec,$rndlast veor $in0,$dat0,$rndlast mov $step1,$step .Loop2x_cbc_dec128: aesd $dat0,q8 aesd $dat1,q8 aesimc $dat0,$dat0 aesimc $dat1,$dat1 subs $len,$len,#32 aesd $dat0,q9 aesd $dat1,q9 aesimc $dat0,$dat0 aesimc $dat1,$dat1 cclr $step,lo aesd $dat0,$tmp0 aesd $dat1,$tmp0 aesimc $dat0,$dat0 aesimc $dat1,$dat1 cclr $step1,ls aesd $dat0,$tmp1 aesd $dat1,$tmp1 aesimc $dat0,$dat0 aesimc $dat1,$dat1 aesd $dat0,q10 aesd $dat1,q10 aesimc $dat0,$dat0 aesimc $dat1,$dat1 aesd $dat0,q11 aesd $dat1,q11 aesimc $dat0,$dat0 aesimc $dat1,$dat1 aesd $dat0,q12 aesd $dat1,q12 aesimc $dat0,$dat0 aesimc $dat1,$dat1 aesd $dat0,q13 aesd $dat1,q13 aesimc $dat0,$dat0 aesimc $dat1,$dat1 aesd $dat0,q14 aesd $dat1,q14 aesimc $dat0,$dat0 aesimc $dat1,$dat1 aesd $dat0,q15 aesd $dat1,q15 veor $ivec,$ivec,$dat0 vld1.8 {$dat0},[$inp],$step veor $in0,$in0,$dat1 vld1.8 {$dat1},[$inp],$step1 vst1.8 {$ivec},[$out],#16 veor $ivec,$in1,$rndlast vst1.8 {$in0},[$out],#16 veor $in0,$dat0,$rndlast vorr $in1,$dat1,$dat1 b.hs .Loop2x_cbc_dec128 adds $len,$len,#32 veor $ivec,$ivec,$rndlast b.eq .Lcbc_done veor $in0,$in0,$rndlast b .Lcbc_dec_tail ___ { my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); $code.=<<___; .align 5 .Lcbc_dec: subs $len,$len,#16 vorr $in0,$dat,$dat vld1.8 {$dat2},[$inp],#16 subs $len,$len,#32 // bias add $cnt,$rounds,#2 vorr $in1,$dat,$dat vorr $dat1,$dat,$dat vorr $in2,$dat2,$dat2 b.lo .Lcbc_dec_tail cclr $step,eq cmp $rounds,#2 vld1.8 {$dat1},[$inp],$step vorr $dat1,$dat2,$dat2 vld1.8 {$dat2},[$inp],#16 vorr $in0,$dat,$dat vorr $in1,$dat1,$dat1 b.eq .Lcbc_dec128 vorr $in2,$dat2,$dat2 .Loop2x_cbc_dec: .Loop3x_cbc_dec: aesd $dat0,q8 aesd $dat1,q8 aesd $dat2,q8 vld1.32 {q8},[$key_],#16 aesimc $dat0,$dat0 aesimc $dat1,$dat1 aesimc $dat2,$dat2 subs $cnt,$cnt,#2 aesd $dat0,q9 aesd $dat1,q9 aesd $dat2,q9 vld1.32 {q9},[$key_],#16 aesimc $dat0,$dat0 aesimc $dat1,$dat1 b.gt .Loop2x_cbc_dec aesimc $dat2,$dat2 b.gt .Loop3x_cbc_dec aesd $dat0,q8 aesd $dat1,q8 aesd $dat2,q8 veor $tmp0,$ivec,$rndlast aesimc $dat0,$dat0 aesimc $dat1,$dat1 veor $tmp0,$ivec,$rndlast aesimc $dat2,$dat2 veor $tmp1,$in0,$rndlast aesd $dat0,q9 aesd $dat1,q9 aesd $dat2,q9 veor $tmp2,$in1,$rndlast subs $len,$len,#0x30 aesimc $dat0,$dat0 aesimc $dat1,$dat1 vorr $ivec,$in1,$in1 subs $len,$len,#32 aesd $dat0,q10 aesd $dat1,q10 aesimc $dat0,$dat0 cclr $step,lo aesimc $dat1,$dat1 mov $key_,$key aesd $dat0,q11 aesd $dat1,q11 aesimc $dat0,$dat0 vld1.8 {$in0},[$inp],$step aesimc $dat1,$dat1 cclr $step,ls aesimc $dat2,$dat2 vorr $ivec,$in2,$in2 mov.lo x6,$len // x6, $cnt, is zero at this point aesd $dat0,q12 aesd $dat1,q12 aesd $dat2,q12 add $inp,$inp,x6 // $inp is adjusted in such way that // at exit from the loop $dat1-$dat2 // are loaded with last "words" aesimc $dat0,$dat0 aesimc $dat1,$dat1 vld1.8 {$in1},[$inp],$step aesimc $dat2,$dat2 mov $key_,$key aesd $dat0,q13 aesd $dat1,q13 aesd $dat2,q13 vld1.8 {$in0},[$inp],#16 aesimc $dat0,$dat0 aesimc $dat1,$dat1 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] aesimc $dat2,$dat2 vld1.8 {$in1},[$inp],#16 aesd $dat0,q14 aesd $dat1,q14 aesd $dat2,q14 vld1.8 {$in2},[$inp],#16 aesimc $dat0,$dat0 aesimc $dat1,$dat1 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] aesimc $dat2,$dat2 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] aesd $dat0,q15 aesd $dat1,q15 aesd $dat2,q15 mov $cnt,$rounds add $cnt,$rounds,#2 veor $tmp0,$tmp0,$dat0 veor $tmp1,$tmp1,$dat1 veor $dat2,$dat2,$tmp2 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] vorr $dat0,$in0,$in0 vst1.8 {$tmp0},[$out],#16 vorr $dat1,$in1,$in1 vst1.8 {$tmp1},[$out],#16 b.hs .Loop2x_cbc_dec vst1.8 {$dat2},[$out],#16 vorr $dat2,$in2,$in2 b.hs .Loop3x_cbc_dec adds $len,$len,#32 cmn $len,#0x30 b.eq .Lcbc_done nop .Lcbc_dec_tail: aesd $dat,q8 aesd $dat1,q8 aesd $dat2,q8 vld1.32 {q8},[$key_],#16 aesimc $dat,$dat aesimc $dat1,$dat1 aesimc $dat2,$dat2 subs $cnt,$cnt,#2 aesd $dat,q9 aesd $dat1,q9 aesd $dat2,q9 vld1.32 {q9},[$key_],#16 aesimc $dat,$dat aesimc $dat1,$dat1 aesimc $dat2,$dat2 b.gt .Lcbc_dec_tail aesd $dat,q8 aesimc $dat,$dat aesd $dat,q9 aesimc $dat,$dat veor $tmp,$ivec,$rndlast aesd $dat,q10 aesimc $dat,$dat vorr $ivec,$in0,$in0 aesd $dat,q11 aesimc $dat,$dat aesd $dat,q12 aesimc $dat,$dat aesd $dat,q13 aesimc $dat,$dat aesd $dat,q14 aesimc $dat,$dat aesd $dat,q15 veor $tmp,$tmp,$dat vst1.8 {$tmp},[$out],#16 aesd $dat1,q8 aesd $dat2,q8 aesimc $dat1,$dat1 aesimc $dat2,$dat2 aesd $dat1,q9 aesd $dat2,q9 aesimc $dat1,$dat1 aesimc $dat2,$dat2 aesd $dat1,q12 aesd $dat2,q12 aesimc $dat1,$dat1 aesimc $dat2,$dat2 cmn $len,#0x20 aesd $dat1,q13 aesd $dat2,q13 aesimc $dat1,$dat1 aesimc $dat2,$dat2 veor $tmp1,$ivec,$rndlast aesd $dat1,q14 aesd $dat2,q14 aesimc $dat1,$dat1 aesimc $dat2,$dat2 veor $tmp2,$in1,$rndlast aesd $dat1,q15 aesd $dat2,q15 b.eq .Lcbc_dec_one veor $tmp1,$tmp1,$dat1 veor $tmp2,$tmp2,$dat2 vorr $ivec,$in2,$in2 vst1.8 {$tmp1},[$out],#16 vst1.8 {$tmp2},[$out],#16 b .Lcbc_done .Lcbc_dec_one: veor $tmp1,$tmp1,$dat2 vorr $ivec,$in2,$in2 vst1.8 {$tmp1},[$out],#16 .Lcbc_done: vst1.8 {$ivec},[$ivp] .Lcbc_abort: ___ } $code.=<<___ if ($flavour !~ /64/); vldmia sp!,{d8-d15} ldmia sp!,{r4-r8,pc} Loading @@ -632,8 +614,12 @@ ___ }}} {{{ my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my ($rounds,$cnt,$key_,$ctr,$tctr,$tctr1)=("w5","w6","x7","w8","w9","w10"); my ($rounds,$cnt,$key_)=("w5","w6","x7"); my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12)); my $step="x12"; # aliases with $tctr2 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); my ($dat,$tmp)=($dat0,$tmp0); Loading Loading @@ -662,139 +648,149 @@ $code.=<<___; vld1.32 {$dat0},[$ivp] vld1.32 {q8-q9},[$key] // load key schedule... sub $rounds,$rounds,#6 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys sub $rounds,$rounds,#4 mov $step,#16 cmp $len,#2 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys sub $rounds,$rounds,#2 vld1.32 {q10-q11},[$key_],#32 vld1.32 {q12-q13},[$key_],#32 vld1.32 {q14-q15},[$key_],#32 vld1.32 {$rndlast},[$key_] add $key_,$key,#32 mov $cnt,$rounds subs $len,$len,#2 b.lo .Lctr32_tail cclr $step,lo #ifndef __ARMEB__ rev $ctr, $ctr #endif vorr $dat1,$dat0,$dat0 add $ctr, $ctr, #1 add $tctr1, $ctr, #1 vorr $dat2,$dat0,$dat0 add $ctr, $ctr, #2 vorr $ivec,$dat0,$dat0 rev $tctr1, $ctr cmp $rounds,#2 rev $tctr1, $tctr1 vmov.32 ${dat1}[3],$tctr1 b.eq .Lctr32_128 b.ls .Lctr32_tail rev $tctr2, $ctr sub $len,$len,#3 // bias vmov.32 ${dat2}[3],$tctr2 b .Loop3x_ctr32 .Loop2x_ctr32: .align 4 .Loop3x_ctr32: aese $dat0,q8 aese $dat1,q8 aese $dat2,q8 vld1.32 {q8},[$key_],#16 aesmc $dat0,$dat0 aesmc $dat1,$dat1 aesmc $dat2,$dat2 subs $cnt,$cnt,#2 aese $dat0,q9 aese $dat1,q9 aese $dat2,q9 vld1.32 {q9},[$key_],#16 aesmc $dat0,$dat0 aesmc $dat1,$dat1 b.gt .Loop2x_ctr32 aesmc $dat2,$dat2 b.gt .Loop3x_ctr32 aese $dat0,q8 aese $dat1,q8 aese $dat2,q8 mov $key_,$key aesmc $tmp0,$dat0 vorr $dat0,$ivec,$ivec vld1.8 {$in0},[$inp],#16 aesmc $tmp1,$dat1 vorr $dat1,$ivec,$ivec aesmc $dat2,$dat2 vorr $dat0,$ivec,$ivec aese $tmp0,q9 aese $tmp1,q9 vld1.8 {$in0},[$inp],#16 aesmc $tmp0,$tmp0 vld1.8 {$in1},[$inp],#16 aesmc $tmp1,$tmp1 add $ctr,$ctr,#1 aese $tmp0,q10 aese $tmp1,q10 rev $tctr,$ctr aesmc $tmp0,$tmp0 aesmc $tmp1,$tmp1 add $ctr,$ctr,#1 aese $tmp0,q11 aese $tmp1,q11 veor $in0,$in0,$rndlast rev $tctr1,$ctr aese $tmp1,q9 aese $dat2,q9 vorr $dat1,$ivec,$ivec aesmc $tmp0,$tmp0 vld1.8 {$in2},[$inp],#16 aesmc $tmp1,$tmp1 veor $in1,$in1,$rndlast mov $key_,$key aesmc $tmp2,$dat2 vorr $dat2,$ivec,$ivec add $tctr0,$ctr,#1 aese $tmp0,q12 aese $tmp1,q12 subs $len,$len,#2 aese $tmp2,q12 veor $in0,$in0,$rndlast add $tctr1,$ctr,#2 aesmc $tmp0,$tmp0 aesmc $tmp1,$tmp1 vld1.32 {q8-q9},[$key_],#32 // re-pre-load rndkey[0-1] aesmc $tmp2,$tmp2 veor $in1,$in1,$rndlast add $ctr,$ctr,#3 aese $tmp0,q13 aese $tmp1,q13 aese $tmp2,q13 veor $in2,$in2,$rndlast rev $tctr0,$tctr0 aesmc $tmp0,$tmp0 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] aesmc $tmp1,$tmp1 aesmc $tmp2,$tmp2 vmov.32 ${dat0}[3], $tctr0 rev $tctr1,$tctr1 aese $tmp0,q14 aese $tmp1,q14 vmov.32 ${dat0}[3], $tctr aesmc $tmp0,$tmp0 aese $tmp2,q14 vmov.32 ${dat1}[3], $tctr1 rev $tctr2,$ctr aesmc $tmp0,$tmp0 aesmc $tmp1,$tmp1 aesmc $tmp2,$tmp2 vmov.32 ${dat2}[3], $tctr2 subs $len,$len,#3 aese $tmp0,q15 aese $tmp1,q15 aese $tmp2,q15 mov $cnt,$rounds veor $in0,$in0,$tmp0 veor $in1,$in1,$tmp1 veor $in2,$in2,$tmp2 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] vst1.8 {$in0},[$out],#16 vst1.8 {$in1},[$out],#16 b.hs .Loop2x_ctr32 vst1.8 {$in2},[$out],#16 b.hs .Loop3x_ctr32 adds $len,$len,#2 adds $len,$len,#3 b.eq .Lctr32_done b .Lctr32_tail .Lctr32_128: vld1.32 {$tmp0-$tmp1},[$key_] cmp $len,#1 mov $step,#16 cclr $step,eq .Loop2x_ctr32_128: .Lctr32_tail: aese $dat0,q8 aese $dat1,q8 vld1.32 {q8},[$key_],#16 aesmc $dat0,$dat0 vld1.8 {$in0},[$inp],#16 aesmc $dat1,$dat1 vld1.8 {$in1},[$inp],#16 subs $cnt,$cnt,#2 aese $dat0,q9 aese $dat1,q9 add $ctr,$ctr,#1 aesmc $dat0,$dat0 aesmc $dat1,$dat1 rev $tctr,$ctr aese $dat0,$tmp0 aese $dat1,$tmp0 add $ctr,$ctr,#1 aesmc $dat0,$dat0 aesmc $dat1,$dat1 rev $tctr1,$ctr aese $dat0,$tmp1 aese $dat1,$tmp1 subs $len,$len,#2 vld1.32 {q9},[$key_],#16 aesmc $dat0,$dat0 aesmc $dat1,$dat1 aese $dat0,q10 aese $dat1,q10 b.gt .Lctr32_tail aese $dat0,q8 aese $dat1,q8 aesmc $dat0,$dat0 aesmc $dat1,$dat1 aese $dat0,q11 aese $dat1,q11 aese $dat0,q9 aese $dat1,q9 aesmc $dat0,$dat0 aesmc $dat1,$dat1 vld1.8 {$in0},[$inp],$step aese $dat0,q12 aese $dat1,q12 vld1.8 {$in1},[$inp] aesmc $dat0,$dat0 aesmc $dat1,$dat1 aese $dat0,q13 Loading @@ -803,56 +799,19 @@ $code.=<<___; aesmc $dat1,$dat1 aese $dat0,q14 aese $dat1,q14 veor $in0,$in0,$rndlast aesmc $dat0,$dat0 aesmc $dat1,$dat1 veor $in0,$in0,$rndlast aese $dat0,q15 veor $in1,$in1,$rndlast aese $dat0,q15 aese $dat1,q15 cmp $len,#1 veor $in0,$in0,$dat0 vorr $dat0,$ivec,$ivec veor $in1,$in1,$dat1 vorr $dat1,$ivec,$ivec vst1.8 {$in0},[$out],#16 vmov.32 ${dat0}[3], $tctr vst1.8 {$in1},[$out],#16 vmov.32 ${dat1}[3], $tctr1 b.hs .Loop2x_ctr32_128 adds $len,$len,#2 b.eq .Lctr32_done .Lctr32_tail: aese $dat,q8 vld1.32 {q8},[$key_],#16 aesmc $dat,$dat subs $cnt,$cnt,#2 aese $dat,q9 vld1.32 {q9},[$key_],#16 aesmc $dat,$dat b.gt .Lctr32_tail aese $dat,q8 aesmc $dat,$dat aese $dat,q9 aesmc $dat,$dat vld1.8 {$in0},[$inp] aese $dat,q10 aesmc $dat,$dat aese $dat,q11 aesmc $dat,$dat aese $dat,q12 aesmc $dat,$dat aese $dat,q13 aesmc $dat,$dat aese $dat,q14 aesmc $dat,$dat veor $in0,$in0,$rndlast aese $dat,q15 veor $in0,$in0,$dat vst1.8 {$in0},[$out] vst1.8 {$in1},[$out] .Lctr32_done: ___ Loading Loading @@ -894,6 +853,7 @@ if ($flavour =~ /64/) { ######## 64-bit code #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or s/vmov\.i8/movi/o or # fix up legacy mnemonics s/vext\.8/ext/o or s/vrev32\.8/rev32/o or Loading Loading @@ -971,6 +931,7 @@ if ($flavour =~ /64/) { ######## 64-bit code s/vdup\.32\s+(.*)/unvdup32($1)/geo or s/vmov\.32\s+(.*)/unvmov32($1)/geo or s/^(\s+)b\./$1b/o or s/^(\s+)mov\./$1mov/o or s/^(\s+)ret/$1bx\tlr/o; print $_,"\n"; Loading Loading
crypto/aes/asm/aesv8-armx.pl +206 −245 Original line number Diff line number Diff line Loading @@ -11,16 +11,21 @@ # module is endian-agnostic in sense that it supports both big- and # little-endian cases. As does it support both 32- and 64-bit modes # of operation. Latter is achieved by limiting amount of utilized # registers to 16, which implies additional instructions. This has # no effect on mighty Apple A7, as results are literally equal to # the theoretical estimates based on instruction latencies and issue # rate. It remains to be seen how does it affect other platforms... # registers to 16, which implies additional NEON load and integer # instructions. This has no effect on mighty Apple A7, where results # are literally equal to the theoretical estimates based on AES # instruction latencies and issue rates. On Cortex-A53, an in-order # execution core, this costs up to 10-15%, which is partially # compensated by implementing dedicated code path for 128-bit # CBC encrypt case. On Cortex-A57 parallelizable mode performance # seems to be limited by sheer amount of NEON instructions... # # Performance in cycles per byte processed with 128-bit key: # # CBC enc CBC dec CTR # Apple A7 2.39 1.20 1.20 # Cortex-A5x n/a n/a n/a # Cortex-A53 2.45 1.87 1.94 # Cortex-A57 3.64 1.34 1.32 $flavour = shift; open STDOUT,">".shift; Loading Loading @@ -435,189 +440,166 @@ $code.=<<___; vst1.8 {$ivec},[$out],#16 b .Lcbc_done .align 5 .Lcbc_dec128: vld1.32 {$tmp0-$tmp1},[$key_] veor $ivec,$ivec,$rndlast veor $in0,$dat0,$rndlast mov $step1,$step .Loop2x_cbc_dec128: aesd $dat0,q8 aesd $dat1,q8 aesimc $dat0,$dat0 aesimc $dat1,$dat1 subs $len,$len,#32 aesd $dat0,q9 aesd $dat1,q9 aesimc $dat0,$dat0 aesimc $dat1,$dat1 cclr $step,lo aesd $dat0,$tmp0 aesd $dat1,$tmp0 aesimc $dat0,$dat0 aesimc $dat1,$dat1 cclr $step1,ls aesd $dat0,$tmp1 aesd $dat1,$tmp1 aesimc $dat0,$dat0 aesimc $dat1,$dat1 aesd $dat0,q10 aesd $dat1,q10 aesimc $dat0,$dat0 aesimc $dat1,$dat1 aesd $dat0,q11 aesd $dat1,q11 aesimc $dat0,$dat0 aesimc $dat1,$dat1 aesd $dat0,q12 aesd $dat1,q12 aesimc $dat0,$dat0 aesimc $dat1,$dat1 aesd $dat0,q13 aesd $dat1,q13 aesimc $dat0,$dat0 aesimc $dat1,$dat1 aesd $dat0,q14 aesd $dat1,q14 aesimc $dat0,$dat0 aesimc $dat1,$dat1 aesd $dat0,q15 aesd $dat1,q15 veor $ivec,$ivec,$dat0 vld1.8 {$dat0},[$inp],$step veor $in0,$in0,$dat1 vld1.8 {$dat1},[$inp],$step1 vst1.8 {$ivec},[$out],#16 veor $ivec,$in1,$rndlast vst1.8 {$in0},[$out],#16 veor $in0,$dat0,$rndlast vorr $in1,$dat1,$dat1 b.hs .Loop2x_cbc_dec128 adds $len,$len,#32 veor $ivec,$ivec,$rndlast b.eq .Lcbc_done veor $in0,$in0,$rndlast b .Lcbc_dec_tail ___ { my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); $code.=<<___; .align 5 .Lcbc_dec: subs $len,$len,#16 vorr $in0,$dat,$dat vld1.8 {$dat2},[$inp],#16 subs $len,$len,#32 // bias add $cnt,$rounds,#2 vorr $in1,$dat,$dat vorr $dat1,$dat,$dat vorr $in2,$dat2,$dat2 b.lo .Lcbc_dec_tail cclr $step,eq cmp $rounds,#2 vld1.8 {$dat1},[$inp],$step vorr $dat1,$dat2,$dat2 vld1.8 {$dat2},[$inp],#16 vorr $in0,$dat,$dat vorr $in1,$dat1,$dat1 b.eq .Lcbc_dec128 vorr $in2,$dat2,$dat2 .Loop2x_cbc_dec: .Loop3x_cbc_dec: aesd $dat0,q8 aesd $dat1,q8 aesd $dat2,q8 vld1.32 {q8},[$key_],#16 aesimc $dat0,$dat0 aesimc $dat1,$dat1 aesimc $dat2,$dat2 subs $cnt,$cnt,#2 aesd $dat0,q9 aesd $dat1,q9 aesd $dat2,q9 vld1.32 {q9},[$key_],#16 aesimc $dat0,$dat0 aesimc $dat1,$dat1 b.gt .Loop2x_cbc_dec aesimc $dat2,$dat2 b.gt .Loop3x_cbc_dec aesd $dat0,q8 aesd $dat1,q8 aesd $dat2,q8 veor $tmp0,$ivec,$rndlast aesimc $dat0,$dat0 aesimc $dat1,$dat1 veor $tmp0,$ivec,$rndlast aesimc $dat2,$dat2 veor $tmp1,$in0,$rndlast aesd $dat0,q9 aesd $dat1,q9 aesd $dat2,q9 veor $tmp2,$in1,$rndlast subs $len,$len,#0x30 aesimc $dat0,$dat0 aesimc $dat1,$dat1 vorr $ivec,$in1,$in1 subs $len,$len,#32 aesd $dat0,q10 aesd $dat1,q10 aesimc $dat0,$dat0 cclr $step,lo aesimc $dat1,$dat1 mov $key_,$key aesd $dat0,q11 aesd $dat1,q11 aesimc $dat0,$dat0 vld1.8 {$in0},[$inp],$step aesimc $dat1,$dat1 cclr $step,ls aesimc $dat2,$dat2 vorr $ivec,$in2,$in2 mov.lo x6,$len // x6, $cnt, is zero at this point aesd $dat0,q12 aesd $dat1,q12 aesd $dat2,q12 add $inp,$inp,x6 // $inp is adjusted in such way that // at exit from the loop $dat1-$dat2 // are loaded with last "words" aesimc $dat0,$dat0 aesimc $dat1,$dat1 vld1.8 {$in1},[$inp],$step aesimc $dat2,$dat2 mov $key_,$key aesd $dat0,q13 aesd $dat1,q13 aesd $dat2,q13 vld1.8 {$in0},[$inp],#16 aesimc $dat0,$dat0 aesimc $dat1,$dat1 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] aesimc $dat2,$dat2 vld1.8 {$in1},[$inp],#16 aesd $dat0,q14 aesd $dat1,q14 aesd $dat2,q14 vld1.8 {$in2},[$inp],#16 aesimc $dat0,$dat0 aesimc $dat1,$dat1 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] aesimc $dat2,$dat2 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] aesd $dat0,q15 aesd $dat1,q15 aesd $dat2,q15 mov $cnt,$rounds add $cnt,$rounds,#2 veor $tmp0,$tmp0,$dat0 veor $tmp1,$tmp1,$dat1 veor $dat2,$dat2,$tmp2 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] vorr $dat0,$in0,$in0 vst1.8 {$tmp0},[$out],#16 vorr $dat1,$in1,$in1 vst1.8 {$tmp1},[$out],#16 b.hs .Loop2x_cbc_dec vst1.8 {$dat2},[$out],#16 vorr $dat2,$in2,$in2 b.hs .Loop3x_cbc_dec adds $len,$len,#32 cmn $len,#0x30 b.eq .Lcbc_done nop .Lcbc_dec_tail: aesd $dat,q8 aesd $dat1,q8 aesd $dat2,q8 vld1.32 {q8},[$key_],#16 aesimc $dat,$dat aesimc $dat1,$dat1 aesimc $dat2,$dat2 subs $cnt,$cnt,#2 aesd $dat,q9 aesd $dat1,q9 aesd $dat2,q9 vld1.32 {q9},[$key_],#16 aesimc $dat,$dat aesimc $dat1,$dat1 aesimc $dat2,$dat2 b.gt .Lcbc_dec_tail aesd $dat,q8 aesimc $dat,$dat aesd $dat,q9 aesimc $dat,$dat veor $tmp,$ivec,$rndlast aesd $dat,q10 aesimc $dat,$dat vorr $ivec,$in0,$in0 aesd $dat,q11 aesimc $dat,$dat aesd $dat,q12 aesimc $dat,$dat aesd $dat,q13 aesimc $dat,$dat aesd $dat,q14 aesimc $dat,$dat aesd $dat,q15 veor $tmp,$tmp,$dat vst1.8 {$tmp},[$out],#16 aesd $dat1,q8 aesd $dat2,q8 aesimc $dat1,$dat1 aesimc $dat2,$dat2 aesd $dat1,q9 aesd $dat2,q9 aesimc $dat1,$dat1 aesimc $dat2,$dat2 aesd $dat1,q12 aesd $dat2,q12 aesimc $dat1,$dat1 aesimc $dat2,$dat2 cmn $len,#0x20 aesd $dat1,q13 aesd $dat2,q13 aesimc $dat1,$dat1 aesimc $dat2,$dat2 veor $tmp1,$ivec,$rndlast aesd $dat1,q14 aesd $dat2,q14 aesimc $dat1,$dat1 aesimc $dat2,$dat2 veor $tmp2,$in1,$rndlast aesd $dat1,q15 aesd $dat2,q15 b.eq .Lcbc_dec_one veor $tmp1,$tmp1,$dat1 veor $tmp2,$tmp2,$dat2 vorr $ivec,$in2,$in2 vst1.8 {$tmp1},[$out],#16 vst1.8 {$tmp2},[$out],#16 b .Lcbc_done .Lcbc_dec_one: veor $tmp1,$tmp1,$dat2 vorr $ivec,$in2,$in2 vst1.8 {$tmp1},[$out],#16 .Lcbc_done: vst1.8 {$ivec},[$ivp] .Lcbc_abort: ___ } $code.=<<___ if ($flavour !~ /64/); vldmia sp!,{d8-d15} ldmia sp!,{r4-r8,pc} Loading @@ -632,8 +614,12 @@ ___ }}} {{{ my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my ($rounds,$cnt,$key_,$ctr,$tctr,$tctr1)=("w5","w6","x7","w8","w9","w10"); my ($rounds,$cnt,$key_)=("w5","w6","x7"); my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12)); my $step="x12"; # aliases with $tctr2 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); my ($dat,$tmp)=($dat0,$tmp0); Loading Loading @@ -662,139 +648,149 @@ $code.=<<___; vld1.32 {$dat0},[$ivp] vld1.32 {q8-q9},[$key] // load key schedule... sub $rounds,$rounds,#6 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys sub $rounds,$rounds,#4 mov $step,#16 cmp $len,#2 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys sub $rounds,$rounds,#2 vld1.32 {q10-q11},[$key_],#32 vld1.32 {q12-q13},[$key_],#32 vld1.32 {q14-q15},[$key_],#32 vld1.32 {$rndlast},[$key_] add $key_,$key,#32 mov $cnt,$rounds subs $len,$len,#2 b.lo .Lctr32_tail cclr $step,lo #ifndef __ARMEB__ rev $ctr, $ctr #endif vorr $dat1,$dat0,$dat0 add $ctr, $ctr, #1 add $tctr1, $ctr, #1 vorr $dat2,$dat0,$dat0 add $ctr, $ctr, #2 vorr $ivec,$dat0,$dat0 rev $tctr1, $ctr cmp $rounds,#2 rev $tctr1, $tctr1 vmov.32 ${dat1}[3],$tctr1 b.eq .Lctr32_128 b.ls .Lctr32_tail rev $tctr2, $ctr sub $len,$len,#3 // bias vmov.32 ${dat2}[3],$tctr2 b .Loop3x_ctr32 .Loop2x_ctr32: .align 4 .Loop3x_ctr32: aese $dat0,q8 aese $dat1,q8 aese $dat2,q8 vld1.32 {q8},[$key_],#16 aesmc $dat0,$dat0 aesmc $dat1,$dat1 aesmc $dat2,$dat2 subs $cnt,$cnt,#2 aese $dat0,q9 aese $dat1,q9 aese $dat2,q9 vld1.32 {q9},[$key_],#16 aesmc $dat0,$dat0 aesmc $dat1,$dat1 b.gt .Loop2x_ctr32 aesmc $dat2,$dat2 b.gt .Loop3x_ctr32 aese $dat0,q8 aese $dat1,q8 aese $dat2,q8 mov $key_,$key aesmc $tmp0,$dat0 vorr $dat0,$ivec,$ivec vld1.8 {$in0},[$inp],#16 aesmc $tmp1,$dat1 vorr $dat1,$ivec,$ivec aesmc $dat2,$dat2 vorr $dat0,$ivec,$ivec aese $tmp0,q9 aese $tmp1,q9 vld1.8 {$in0},[$inp],#16 aesmc $tmp0,$tmp0 vld1.8 {$in1},[$inp],#16 aesmc $tmp1,$tmp1 add $ctr,$ctr,#1 aese $tmp0,q10 aese $tmp1,q10 rev $tctr,$ctr aesmc $tmp0,$tmp0 aesmc $tmp1,$tmp1 add $ctr,$ctr,#1 aese $tmp0,q11 aese $tmp1,q11 veor $in0,$in0,$rndlast rev $tctr1,$ctr aese $tmp1,q9 aese $dat2,q9 vorr $dat1,$ivec,$ivec aesmc $tmp0,$tmp0 vld1.8 {$in2},[$inp],#16 aesmc $tmp1,$tmp1 veor $in1,$in1,$rndlast mov $key_,$key aesmc $tmp2,$dat2 vorr $dat2,$ivec,$ivec add $tctr0,$ctr,#1 aese $tmp0,q12 aese $tmp1,q12 subs $len,$len,#2 aese $tmp2,q12 veor $in0,$in0,$rndlast add $tctr1,$ctr,#2 aesmc $tmp0,$tmp0 aesmc $tmp1,$tmp1 vld1.32 {q8-q9},[$key_],#32 // re-pre-load rndkey[0-1] aesmc $tmp2,$tmp2 veor $in1,$in1,$rndlast add $ctr,$ctr,#3 aese $tmp0,q13 aese $tmp1,q13 aese $tmp2,q13 veor $in2,$in2,$rndlast rev $tctr0,$tctr0 aesmc $tmp0,$tmp0 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] aesmc $tmp1,$tmp1 aesmc $tmp2,$tmp2 vmov.32 ${dat0}[3], $tctr0 rev $tctr1,$tctr1 aese $tmp0,q14 aese $tmp1,q14 vmov.32 ${dat0}[3], $tctr aesmc $tmp0,$tmp0 aese $tmp2,q14 vmov.32 ${dat1}[3], $tctr1 rev $tctr2,$ctr aesmc $tmp0,$tmp0 aesmc $tmp1,$tmp1 aesmc $tmp2,$tmp2 vmov.32 ${dat2}[3], $tctr2 subs $len,$len,#3 aese $tmp0,q15 aese $tmp1,q15 aese $tmp2,q15 mov $cnt,$rounds veor $in0,$in0,$tmp0 veor $in1,$in1,$tmp1 veor $in2,$in2,$tmp2 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] vst1.8 {$in0},[$out],#16 vst1.8 {$in1},[$out],#16 b.hs .Loop2x_ctr32 vst1.8 {$in2},[$out],#16 b.hs .Loop3x_ctr32 adds $len,$len,#2 adds $len,$len,#3 b.eq .Lctr32_done b .Lctr32_tail .Lctr32_128: vld1.32 {$tmp0-$tmp1},[$key_] cmp $len,#1 mov $step,#16 cclr $step,eq .Loop2x_ctr32_128: .Lctr32_tail: aese $dat0,q8 aese $dat1,q8 vld1.32 {q8},[$key_],#16 aesmc $dat0,$dat0 vld1.8 {$in0},[$inp],#16 aesmc $dat1,$dat1 vld1.8 {$in1},[$inp],#16 subs $cnt,$cnt,#2 aese $dat0,q9 aese $dat1,q9 add $ctr,$ctr,#1 aesmc $dat0,$dat0 aesmc $dat1,$dat1 rev $tctr,$ctr aese $dat0,$tmp0 aese $dat1,$tmp0 add $ctr,$ctr,#1 aesmc $dat0,$dat0 aesmc $dat1,$dat1 rev $tctr1,$ctr aese $dat0,$tmp1 aese $dat1,$tmp1 subs $len,$len,#2 vld1.32 {q9},[$key_],#16 aesmc $dat0,$dat0 aesmc $dat1,$dat1 aese $dat0,q10 aese $dat1,q10 b.gt .Lctr32_tail aese $dat0,q8 aese $dat1,q8 aesmc $dat0,$dat0 aesmc $dat1,$dat1 aese $dat0,q11 aese $dat1,q11 aese $dat0,q9 aese $dat1,q9 aesmc $dat0,$dat0 aesmc $dat1,$dat1 vld1.8 {$in0},[$inp],$step aese $dat0,q12 aese $dat1,q12 vld1.8 {$in1},[$inp] aesmc $dat0,$dat0 aesmc $dat1,$dat1 aese $dat0,q13 Loading @@ -803,56 +799,19 @@ $code.=<<___; aesmc $dat1,$dat1 aese $dat0,q14 aese $dat1,q14 veor $in0,$in0,$rndlast aesmc $dat0,$dat0 aesmc $dat1,$dat1 veor $in0,$in0,$rndlast aese $dat0,q15 veor $in1,$in1,$rndlast aese $dat0,q15 aese $dat1,q15 cmp $len,#1 veor $in0,$in0,$dat0 vorr $dat0,$ivec,$ivec veor $in1,$in1,$dat1 vorr $dat1,$ivec,$ivec vst1.8 {$in0},[$out],#16 vmov.32 ${dat0}[3], $tctr vst1.8 {$in1},[$out],#16 vmov.32 ${dat1}[3], $tctr1 b.hs .Loop2x_ctr32_128 adds $len,$len,#2 b.eq .Lctr32_done .Lctr32_tail: aese $dat,q8 vld1.32 {q8},[$key_],#16 aesmc $dat,$dat subs $cnt,$cnt,#2 aese $dat,q9 vld1.32 {q9},[$key_],#16 aesmc $dat,$dat b.gt .Lctr32_tail aese $dat,q8 aesmc $dat,$dat aese $dat,q9 aesmc $dat,$dat vld1.8 {$in0},[$inp] aese $dat,q10 aesmc $dat,$dat aese $dat,q11 aesmc $dat,$dat aese $dat,q12 aesmc $dat,$dat aese $dat,q13 aesmc $dat,$dat aese $dat,q14 aesmc $dat,$dat veor $in0,$in0,$rndlast aese $dat,q15 veor $in0,$in0,$dat vst1.8 {$in0},[$out] vst1.8 {$in1},[$out] .Lctr32_done: ___ Loading Loading @@ -894,6 +853,7 @@ if ($flavour =~ /64/) { ######## 64-bit code #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or s/vmov\.i8/movi/o or # fix up legacy mnemonics s/vext\.8/ext/o or s/vrev32\.8/rev32/o or Loading Loading @@ -971,6 +931,7 @@ if ($flavour =~ /64/) { ######## 64-bit code s/vdup\.32\s+(.*)/unvdup32($1)/geo or s/vmov\.32\s+(.*)/unvmov32($1)/geo or s/^(\s+)b\./$1b/o or s/^(\s+)mov\./$1mov/o or s/^(\s+)ret/$1bx\tlr/o; print $_,"\n"; Loading