Loading crypto/aes/asm/aesni-x86.pl +47 −16 Original line number Diff line number Diff line Loading @@ -207,12 +207,45 @@ sub aesni_generate1 # fully unrolled loop # every *2nd* cycle. Thus 3x interleave was the one providing optimal # utilization, i.e. when subroutine's throughput is virtually same as # of non-interleaved subroutine [for number of input blocks up to 3]. # This is why it makes no sense to implement 2x subroutine. # aes[enc|dec] latency in next processor generation is 8, but the # instructions can be scheduled every cycle. Optimal interleave for # new processor is therefore 8x, but it's unfeasible to accommodate it # in XMM registers addressable in 32-bit mode and therefore 6x is # used instead... # This is why it originally made no sense to implement 2x subroutine. # But times change and it became appropriate to spend an extra 192 bytes # on a 2x subroutine on account of Atom Silvermont. For processors that # can schedule aes[enc|dec] every cycle, the optimal interleave factor # equals the corresponding instruction's latency. 8x is optimal for # * Bridge, but it's unfeasible to accommodate such implementation # in XMM registers addressable in 32-bit mode and therefore maximum # of 6x is used instead... 
sub aesni_generate2 { my $p=shift; &function_begin_B("_aesni_${p}rypt2"); &$movekey ($rndkey0,&QWP(0,$key)); &shl ($rounds,4); &$movekey ($rndkey1,&QWP(16,$key)); &xorps ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); &$movekey ($rndkey0,&QWP(32,$key)); &lea ($key,&DWP(32,$key,$rounds)); &neg ($rounds); &add ($rounds,16); &set_label("${p}2_loop"); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; &$movekey ($rndkey1,&QWP(0,$key,$rounds)); &add ($rounds,32); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout1,$rndkey0)"; &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); &jnz (&label("${p}2_loop")); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; eval"&aes${p}last ($inout0,$rndkey0)"; eval"&aes${p}last ($inout1,$rndkey0)"; &ret(); &function_end_B("_aesni_${p}rypt2"); } sub aesni_generate3 { my $p=shift; Loading Loading @@ -357,6 +390,8 @@ sub aesni_generate6 &ret(); &function_end_B("_aesni_${p}rypt6"); } &aesni_generate2("enc") if ($PREFIX eq "aesni"); &aesni_generate2("dec"); &aesni_generate3("enc") if ($PREFIX eq "aesni"); &aesni_generate3("dec"); &aesni_generate4("enc") if ($PREFIX eq "aesni"); Loading Loading @@ -460,8 +495,7 @@ if ($PREFIX eq "aesni") { &jmp (&label("ecb_ret")); &set_label("ecb_enc_two",16); &xorps ($inout2,$inout2); &call ("_aesni_encrypt3"); &call ("_aesni_encrypt2"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &jmp (&label("ecb_ret")); Loading Loading @@ -561,8 +595,7 @@ if ($PREFIX eq "aesni") { &jmp (&label("ecb_ret")); &set_label("ecb_dec_two",16); &xorps ($inout2,$inout2); &call ("_aesni_decrypt3"); &call ("_aesni_decrypt2"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &jmp (&label("ecb_ret")); Loading Loading @@ -982,7 +1015,7 @@ if ($PREFIX eq "aesni") { &jmp (&label("ctr32_ret")); &set_label("ctr32_two",16); &call ("_aesni_encrypt3"); &call ("_aesni_encrypt2"); &movups ($inout3,&QWP(0,$inp)); &movups ($inout4,&QWP(0x10,$inp)); &xorps 
($inout0,$inout3); Loading Loading @@ -1253,9 +1286,8 @@ if ($PREFIX eq "aesni") { &lea ($inp,&DWP(16*2,$inp)); &xorps ($inout0,$inout3); # input^=tweak &xorps ($inout1,$inout4); &xorps ($inout2,$inout2); &call ("_aesni_encrypt3"); &call ("_aesni_encrypt2"); &xorps ($inout0,$inout3); # output^=tweak &xorps ($inout1,$inout4); Loading Loading @@ -1596,7 +1628,7 @@ if ($PREFIX eq "aesni") { &xorps ($inout0,$inout3); # input^=tweak &xorps ($inout1,$inout4); &call ("_aesni_decrypt3"); &call ("_aesni_decrypt2"); &xorps ($inout0,$inout3); # output^=tweak &xorps ($inout1,$inout4); Loading Loading @@ -1896,8 +1928,7 @@ if ($PREFIX eq "aesni") { &jmp (&label("cbc_dec_tail_collected")); &set_label("cbc_dec_two",16); &xorps ($inout2,$inout2); &call ("_aesni_decrypt3"); &call ("_aesni_decrypt2"); &xorps ($inout0,$ivec); &xorps ($inout1,$in0); &movups (&QWP(0,$out),$inout0); Loading crypto/aes/asm/aesni-x86_64.pl +50 −12 Original line number Diff line number Diff line Loading @@ -288,10 +288,49 @@ ___ # every *2nd* cycle. Thus 3x interleave was the one providing optimal # utilization, i.e. when subroutine's throughput is virtually same as # of non-interleaved subroutine [for number of input blocks up to 3]. # This is why it makes no sense to implement 2x subroutine. # aes[enc|dec] latency in next processor generation is 8, but the # instructions can be scheduled every cycle. Optimal interleave for # new processor is therefore 8x... # This is why it originally made no sense to implement 2x subroutine. # But times change and it became appropriate to spend an extra 192 bytes # on a 2x subroutine on account of Atom Silvermont. For processors that # can schedule aes[enc|dec] every cycle, the optimal interleave factor # equals the corresponding instruction's latency. 8x is optimal for # * Bridge and "super-optimal" for other Intel CPUs... sub aesni_generate2 { my $dir=shift; # As already mentioned it takes in $key and $rounds, which are *not* # preserved. $inout[0-1] is cipher/clear text... 
$code.=<<___; .type _aesni_${dir}rypt2,\@abi-omnipotent .align 16 _aesni_${dir}rypt2: $movkey ($key),$rndkey0 shl \$4,$rounds $movkey 16($key),$rndkey1 xorps $rndkey0,$inout0 xorps $rndkey0,$inout1 $movkey 32($key),$rndkey0 lea 32($key,$rounds),$key neg %rax # $rounds add \$16,%rax .L${dir}_loop2: aes${dir} $rndkey1,$inout0 aes${dir} $rndkey1,$inout1 $movkey ($key,%rax),$rndkey1 add \$32,%rax aes${dir} $rndkey0,$inout0 aes${dir} $rndkey0,$inout1 $movkey -16($key,%rax),$rndkey0 jnz .L${dir}_loop2 aes${dir} $rndkey1,$inout0 aes${dir} $rndkey1,$inout1 aes${dir}last $rndkey0,$inout0 aes${dir}last $rndkey0,$inout1 ret .size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2 ___ } sub aesni_generate3 { my $dir=shift; # As already mentioned it takes in $key and $rounds, which are *not* Loading Loading @@ -524,6 +563,8 @@ _aesni_${dir}rypt8: .size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8 ___ } &aesni_generate2("enc") if ($PREFIX eq "aesni"); &aesni_generate2("dec"); &aesni_generate3("enc") if ($PREFIX eq "aesni"); &aesni_generate3("dec"); &aesni_generate4("enc") if ($PREFIX eq "aesni"); Loading Loading @@ -645,8 +686,7 @@ $code.=<<___; jmp .Lecb_ret .align 16 .Lecb_enc_two: xorps $inout2,$inout2 call _aesni_encrypt3 call _aesni_encrypt2 movups $inout0,($out) movups $inout1,0x10($out) jmp .Lecb_ret Loading Loading @@ -782,8 +822,7 @@ $code.=<<___; jmp .Lecb_ret .align 16 .Lecb_dec_two: xorps $inout2,$inout2 call _aesni_decrypt3 call _aesni_decrypt2 movups $inout0,($out) movups $inout1,0x10($out) jmp .Lecb_ret Loading Loading @@ -1875,7 +1914,7 @@ $code.=<<___; xorps @tweak[0],$inout0 xorps @tweak[1],$inout1 call _aesni_encrypt3 call _aesni_encrypt2 xorps @tweak[0],$inout0 movdqa @tweak[2],@tweak[0] Loading Loading @@ -2322,7 +2361,7 @@ $code.=<<___; xorps @tweak[0],$inout0 xorps @tweak[1],$inout1 call _aesni_decrypt3 call _aesni_decrypt2 xorps @tweak[0],$inout0 movdqa @tweak[2],@tweak[0] Loading Loading @@ -2831,8 +2870,7 @@ $code.=<<___; .align 16 .Lcbc_dec_two: movaps $inout1,$in1 
xorps $inout2,$inout2 call _aesni_decrypt3 call _aesni_decrypt2 pxor $iv,$inout0 movaps $in1,$iv pxor $in0,$inout1 Loading Loading
crypto/aes/asm/aesni-x86.pl +47 −16 Original line number Diff line number Diff line Loading @@ -207,12 +207,45 @@ sub aesni_generate1 # fully unrolled loop # every *2nd* cycle. Thus 3x interleave was the one providing optimal # utilization, i.e. when subroutine's throughput is virtually same as # of non-interleaved subroutine [for number of input blocks up to 3]. # This is why it makes no sense to implement 2x subroutine. # aes[enc|dec] latency in next processor generation is 8, but the # instructions can be scheduled every cycle. Optimal interleave for # new processor is therefore 8x, but it's unfeasible to accommodate it # in XMM registers addressable in 32-bit mode and therefore 6x is # used instead... # This is why it originally made no sense to implement 2x subroutine. # But times change and it became appropriate to spend an extra 192 bytes # on a 2x subroutine on account of Atom Silvermont. For processors that # can schedule aes[enc|dec] every cycle, the optimal interleave factor # equals the corresponding instruction's latency. 8x is optimal for # * Bridge, but it's unfeasible to accommodate such implementation # in XMM registers addressable in 32-bit mode and therefore maximum # of 6x is used instead... 
sub aesni_generate2 { my $p=shift; &function_begin_B("_aesni_${p}rypt2"); &$movekey ($rndkey0,&QWP(0,$key)); &shl ($rounds,4); &$movekey ($rndkey1,&QWP(16,$key)); &xorps ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); &$movekey ($rndkey0,&QWP(32,$key)); &lea ($key,&DWP(32,$key,$rounds)); &neg ($rounds); &add ($rounds,16); &set_label("${p}2_loop"); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; &$movekey ($rndkey1,&QWP(0,$key,$rounds)); &add ($rounds,32); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout1,$rndkey0)"; &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); &jnz (&label("${p}2_loop")); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; eval"&aes${p}last ($inout0,$rndkey0)"; eval"&aes${p}last ($inout1,$rndkey0)"; &ret(); &function_end_B("_aesni_${p}rypt2"); } sub aesni_generate3 { my $p=shift; Loading Loading @@ -357,6 +390,8 @@ sub aesni_generate6 &ret(); &function_end_B("_aesni_${p}rypt6"); } &aesni_generate2("enc") if ($PREFIX eq "aesni"); &aesni_generate2("dec"); &aesni_generate3("enc") if ($PREFIX eq "aesni"); &aesni_generate3("dec"); &aesni_generate4("enc") if ($PREFIX eq "aesni"); Loading Loading @@ -460,8 +495,7 @@ if ($PREFIX eq "aesni") { &jmp (&label("ecb_ret")); &set_label("ecb_enc_two",16); &xorps ($inout2,$inout2); &call ("_aesni_encrypt3"); &call ("_aesni_encrypt2"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &jmp (&label("ecb_ret")); Loading Loading @@ -561,8 +595,7 @@ if ($PREFIX eq "aesni") { &jmp (&label("ecb_ret")); &set_label("ecb_dec_two",16); &xorps ($inout2,$inout2); &call ("_aesni_decrypt3"); &call ("_aesni_decrypt2"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &jmp (&label("ecb_ret")); Loading Loading @@ -982,7 +1015,7 @@ if ($PREFIX eq "aesni") { &jmp (&label("ctr32_ret")); &set_label("ctr32_two",16); &call ("_aesni_encrypt3"); &call ("_aesni_encrypt2"); &movups ($inout3,&QWP(0,$inp)); &movups ($inout4,&QWP(0x10,$inp)); &xorps 
($inout0,$inout3); Loading Loading @@ -1253,9 +1286,8 @@ if ($PREFIX eq "aesni") { &lea ($inp,&DWP(16*2,$inp)); &xorps ($inout0,$inout3); # input^=tweak &xorps ($inout1,$inout4); &xorps ($inout2,$inout2); &call ("_aesni_encrypt3"); &call ("_aesni_encrypt2"); &xorps ($inout0,$inout3); # output^=tweak &xorps ($inout1,$inout4); Loading Loading @@ -1596,7 +1628,7 @@ if ($PREFIX eq "aesni") { &xorps ($inout0,$inout3); # input^=tweak &xorps ($inout1,$inout4); &call ("_aesni_decrypt3"); &call ("_aesni_decrypt2"); &xorps ($inout0,$inout3); # output^=tweak &xorps ($inout1,$inout4); Loading Loading @@ -1896,8 +1928,7 @@ if ($PREFIX eq "aesni") { &jmp (&label("cbc_dec_tail_collected")); &set_label("cbc_dec_two",16); &xorps ($inout2,$inout2); &call ("_aesni_decrypt3"); &call ("_aesni_decrypt2"); &xorps ($inout0,$ivec); &xorps ($inout1,$in0); &movups (&QWP(0,$out),$inout0); Loading
crypto/aes/asm/aesni-x86_64.pl +50 −12 Original line number Diff line number Diff line Loading @@ -288,10 +288,49 @@ ___ # every *2nd* cycle. Thus 3x interleave was the one providing optimal # utilization, i.e. when subroutine's throughput is virtually same as # of non-interleaved subroutine [for number of input blocks up to 3]. # This is why it makes no sense to implement 2x subroutine. # aes[enc|dec] latency in next processor generation is 8, but the # instructions can be scheduled every cycle. Optimal interleave for # new processor is therefore 8x... # This is why it originally made no sense to implement 2x subroutine. # But times change and it became appropriate to spend an extra 192 bytes # on a 2x subroutine on account of Atom Silvermont. For processors that # can schedule aes[enc|dec] every cycle, the optimal interleave factor # equals the corresponding instruction's latency. 8x is optimal for # * Bridge and "super-optimal" for other Intel CPUs... sub aesni_generate2 { my $dir=shift; # As already mentioned it takes in $key and $rounds, which are *not* # preserved. $inout[0-1] is cipher/clear text... 
$code.=<<___; .type _aesni_${dir}rypt2,\@abi-omnipotent .align 16 _aesni_${dir}rypt2: $movkey ($key),$rndkey0 shl \$4,$rounds $movkey 16($key),$rndkey1 xorps $rndkey0,$inout0 xorps $rndkey0,$inout1 $movkey 32($key),$rndkey0 lea 32($key,$rounds),$key neg %rax # $rounds add \$16,%rax .L${dir}_loop2: aes${dir} $rndkey1,$inout0 aes${dir} $rndkey1,$inout1 $movkey ($key,%rax),$rndkey1 add \$32,%rax aes${dir} $rndkey0,$inout0 aes${dir} $rndkey0,$inout1 $movkey -16($key,%rax),$rndkey0 jnz .L${dir}_loop2 aes${dir} $rndkey1,$inout0 aes${dir} $rndkey1,$inout1 aes${dir}last $rndkey0,$inout0 aes${dir}last $rndkey0,$inout1 ret .size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2 ___ } sub aesni_generate3 { my $dir=shift; # As already mentioned it takes in $key and $rounds, which are *not* Loading Loading @@ -524,6 +563,8 @@ _aesni_${dir}rypt8: .size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8 ___ } &aesni_generate2("enc") if ($PREFIX eq "aesni"); &aesni_generate2("dec"); &aesni_generate3("enc") if ($PREFIX eq "aesni"); &aesni_generate3("dec"); &aesni_generate4("enc") if ($PREFIX eq "aesni"); Loading Loading @@ -645,8 +686,7 @@ $code.=<<___; jmp .Lecb_ret .align 16 .Lecb_enc_two: xorps $inout2,$inout2 call _aesni_encrypt3 call _aesni_encrypt2 movups $inout0,($out) movups $inout1,0x10($out) jmp .Lecb_ret Loading Loading @@ -782,8 +822,7 @@ $code.=<<___; jmp .Lecb_ret .align 16 .Lecb_dec_two: xorps $inout2,$inout2 call _aesni_decrypt3 call _aesni_decrypt2 movups $inout0,($out) movups $inout1,0x10($out) jmp .Lecb_ret Loading Loading @@ -1875,7 +1914,7 @@ $code.=<<___; xorps @tweak[0],$inout0 xorps @tweak[1],$inout1 call _aesni_encrypt3 call _aesni_encrypt2 xorps @tweak[0],$inout0 movdqa @tweak[2],@tweak[0] Loading Loading @@ -2322,7 +2361,7 @@ $code.=<<___; xorps @tweak[0],$inout0 xorps @tweak[1],$inout1 call _aesni_decrypt3 call _aesni_decrypt2 xorps @tweak[0],$inout0 movdqa @tweak[2],@tweak[0] Loading Loading @@ -2831,8 +2870,7 @@ $code.=<<___; .align 16 .Lcbc_dec_two: movaps $inout1,$in1 
xorps $inout2,$inout2 call _aesni_decrypt3 call _aesni_decrypt2 pxor $iv,$inout0 movaps $in1,$iv pxor $in0,$inout1 Loading