Loading crypto/aes/asm/aesni-x86.pl +344 −143 Original line number Diff line number Diff line Loading @@ -11,6 +11,23 @@ # OpenSSL context it's used with Intel engine, but can also be used as # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for # details]. # # Performance. # # To start with see corresponding paragraph in aesni-x86_64.pl... # Instead of filling table similar to one found there I've chosen to # summarize *comparison* results for raw ECB, CTR and CBC benchmarks. # The simplified table below represents 32-bit performance relative # to 64-bit one in every given point. Ratios vary for different # encryption modes, therefore interval values. # # 16-byte 64-byte 256-byte 1-KB 8-KB # 53-67% 67-84% 91-94% 95-98% 97-99.5% # # Lower ratios for smaller block sizes are perfectly understandable, # because function call overhead is higher in 32-bit mode. Largest # 8-KB block performance is virtually same: 32-bit code is less than # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise. $PREFIX="aesni"; # if $PREFIX is set to "AES", the script # generates drop-in replacement for Loading Loading @@ -44,64 +61,66 @@ $in0="xmm6"; $in1="xmm7"; $inout3="xmm7"; # Inline version of internal aesni_[en|de]crypt1 { my $sn; sub aesni_inline_generate1 { my $p=shift; { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout)); $sn++; &$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(16,$key)); &lea ($key,&DWP(32,$key)); &pxor ($inout0,$rndkey0); &set_label("${p}1_loop"); eval"&aes${p} ($inout0,$rndkey1)"; &pxor ($inout,$rndkey0); &set_label("${p}1_loop_$sn"); eval"&aes${p} ($inout,$rndkey1)"; &dec ($rounds); &$movekey ($rndkey1,&QWP(0,$key)); &lea ($key,&DWP(16,$key)); &jnz (&label("${p}1_loop")); eval"&aes${p}last ($inout0,$rndkey1)"; } &jnz (&label("${p}1_loop_$sn")); eval"&aes${p}last ($inout,$rndkey1)"; }} sub aesni_generate1 # fully unrolled loop { my $p=shift; { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout)); &function_begin_B("_aesni_${p}rypt1"); &$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(0x10,$key)); &cmp ($rounds,11); &pxor ($inout0,$rndkey0); &pxor ($inout,$rndkey0); &$movekey ($rndkey0,&QWP(0x20,$key)); &lea ($key,&DWP(0x30,$key)); &cmp ($rounds,11); &jb (&label("${p}128")); &lea ($key,&DWP(0x20,$key)); &je (&label("${p}192")); &lea ($key,&DWP(0x20,$key)); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout,$rndkey1)"; &$movekey ($rndkey1,&QWP(-0x40,$key)); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout,$rndkey0)"; &$movekey ($rndkey0,&QWP(-0x30,$key)); &set_label("${p}192"); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout,$rndkey1)"; &$movekey ($rndkey1,&QWP(-0x20,$key)); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout,$rndkey0)"; &$movekey ($rndkey0,&QWP(-0x10,$key)); &set_label("${p}128"); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout,$rndkey1)"; &$movekey ($rndkey1,&QWP(0,$key)); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout,$rndkey0)"; &$movekey ($rndkey0,&QWP(0x10,$key)); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout,$rndkey1)"; &$movekey ($rndkey1,&QWP(0x20,$key)); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout,$rndkey0)"; &$movekey ($rndkey0,&QWP(0x30,$key)); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout,$rndkey1)"; &$movekey ($rndkey1,&QWP(0x40,$key)); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout,$rndkey0)"; &$movekey ($rndkey0,&QWP(0x50,$key)); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout,$rndkey1)"; &$movekey ($rndkey1,&QWP(0x60,$key)); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout,$rndkey0)"; &$movekey ($rndkey0,&QWP(0x70,$key)); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p}last ($inout0,$rndkey0)"; eval"&aes${p} ($inout,$rndkey1)"; eval"&aes${p}last ($inout,$rndkey0)"; &ret(); &function_end_B("_aesni_${p}rypt1"); } Loading Loading @@ -159,21 +178,21 @@ sub aesni_generate3 &pxor ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); &pxor ($inout2,$rndkey0); &jmp (&label("${p}3_loop")); &set_label("${p}3_loop",16); eval"&aes${p} ($inout0,$rndkey1)"; &$movekey ($rndkey0,&QWP(0,$key)); &set_label("${p}3_loop"); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; &dec ($rounds); eval"&aes${p} ($inout2,$rndkey1)"; &$movekey ($rndkey1,&QWP(16,$key)); eval"&aes${p} ($inout0,$rndkey0)"; &lea ($key,&DWP(32,$key)); eval"&aes${p} ($inout1,$rndkey0)"; &lea ($key,&DWP(32,$key)); eval"&aes${p} ($inout2,$rndkey0)"; &$movekey ($rndkey0,&QWP(0,$key)); &jnz (&label("${p}3_loop")); eval"&aes${p} ($inout0,$rndkey1)"; &$movekey ($rndkey0,&QWP(0,$key)); eval"&aes${p} ($inout1,$rndkey1)"; eval"&aes${p} ($inout2,$rndkey1)"; eval"&aes${p}last ($inout0,$rndkey0)"; Loading @@ -199,23 +218,24 @@ sub aesni_generate4 &pxor ($inout1,$rndkey0); &pxor ($inout2,$rndkey0); &pxor ($inout3,$rndkey0); &jmp (&label("${p}3_loop")); &set_label("${p}3_loop",16); eval"&aes${p} ($inout0,$rndkey1)"; &$movekey ($rndkey0,&QWP(0,$key)); &set_label("${p}3_loop"); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; &dec ($rounds); eval"&aes${p} ($inout2,$rndkey1)"; eval"&aes${p} ($inout3,$rndkey1)"; &$movekey ($rndkey1,&QWP(16,$key)); eval"&aes${p} ($inout0,$rndkey0)"; &lea ($key,&DWP(32,$key)); eval"&aes${p} ($inout1,$rndkey0)"; &lea ($key,&DWP(32,$key)); eval"&aes${p} ($inout2,$rndkey0)"; eval"&aes${p} ($inout3,$rndkey0)"; &$movekey ($rndkey0,&QWP(0,$key)); &jnz (&label("${p}3_loop")); eval"&aes${p} ($inout0,$rndkey1)"; &$movekey ($rndkey0,&QWP(0,$key)); eval"&aes${p} ($inout1,$rndkey1)"; eval"&aes${p} ($inout2,$rndkey1)"; eval"&aes${p} ($inout3,$rndkey1)"; Loading Loading @@ -261,27 +281,25 @@ if ($PREFIX eq "aesni") { &movups ($inout1,&QWP(0x10,$inp)); &movups ($inout2,&QWP(0x20,$inp)); &call ("_aesni_encrypt3"); &sub ($len,0x30); &lea ($inp,&DWP(0x30,$inp)); &lea ($out,&DWP(0x30,$out)); &movups (&QWP(-0x30,$out),$inout0); &movups (&QWP(0,$out),$inout0); &mov ($key,$key_); # restore $key &movups (&QWP(-0x20,$out),$inout1); &movups (&QWP(0x10,$out),$inout1); &mov ($rounds,$rounds_); # restore $rounds &movups (&QWP(-0x10,$out),$inout2); &movups (&QWP(0x20,$out),$inout2); &lea ($out,&DWP(0x30,$out)); &sub ($len,0x30); &ja (&label("ecb_enc_loop3")); &add ($len,0x40); &jz (&label("ecb_ret")); &set_label("ecb_enc_tail"); &cmp ($len,0x20); &movups ($inout0,&QWP(0,$inp)); &cmp ($len,0x20); &jb (&label("ecb_enc_one")); &movups ($inout1,&QWP(0x10,$inp)); &je (&label("ecb_enc_two")); &cmp ($len,0x30); &movups ($inout2,&QWP(0x20,$inp)); &cmp ($len,0x30); &je (&label("ecb_enc_three")); &movups ($inout3,&QWP(0x30,$inp)); &call ("_aesni_encrypt4"); Loading @@ -300,6 +318,7 @@ if ($PREFIX eq "aesni") { &jmp (&label("ecb_ret")); &set_label("ecb_enc_two",16); &pxor ($inout2,$inout2); &call ("_aesni_encrypt3"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); Loading @@ -323,27 +342,25 @@ if ($PREFIX eq "aesni") { &movups ($inout1,&QWP(0x10,$inp)); &movups ($inout2,&QWP(0x20,$inp)); &call ("_aesni_decrypt3"); &sub ($len,0x30); &lea ($inp,&DWP(0x30,$inp)); &lea ($out,&DWP(0x30,$out)); &movups (&QWP(-0x30,$out),$inout0); &movups (&QWP(0,$out),$inout0); &mov ($key,$key_); # restore $key &movups (&QWP(-0x20,$out),$inout1); &movups (&QWP(0x10,$out),$inout1); &mov ($rounds,$rounds_); # restore $rounds &movups (&QWP(-0x10,$out),$inout2); &movups (&QWP(0x20,$out),$inout2); &lea ($out,&DWP(0x30,$out)); &sub ($len,0x30); &ja (&label("ecb_dec_loop3")); &add ($len,0x40); &jz (&label("ecb_ret")); &set_label("ecb_dec_tail"); &cmp ($len,0x20); &movups ($inout0,&QWP(0,$inp)); &cmp ($len,0x20); &jb (&label("ecb_dec_one")); &movups ($inout1,&QWP(0x10,$inp)); &je (&label("ecb_dec_two")); &cmp ($len,0x30); &movups ($inout2,&QWP(0x20,$inp)); &cmp ($len,0x30); &je (&label("ecb_dec_three")); &movups ($inout3,&QWP(0x30,$inp)); &call ("_aesni_decrypt4"); Loading @@ -362,6 +379,7 @@ if ($PREFIX eq "aesni") { &jmp (&label("ecb_ret")); &set_label("ecb_dec_two",16); &pxor ($inout2,$inout2); &call ("_aesni_decrypt3"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); Loading @@ -377,12 +395,156 @@ if ($PREFIX eq "aesni") { &function_end("aesni_ecb_encrypt"); ###################################################################### # handles only complete blocks, operates on 32-bit counter and # does not update *ivec! (see engine/eng_aesni.c for details) # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, # size_t blocks, const AES_KEY *key, # const char *ivec,char *cmac); # # Handles only complete blocks, operates on 64-bit counter and # does not update *ivec! Nor does it finalize CMAC value # (see engine/eng_aesni.c for details) # &function_begin("aesni_ccm64_encrypt_blocks"); &mov ($inp,&wparam(0)); &mov ($out,&wparam(1)); &mov ($len,&wparam(2)); &mov ($key,&wparam(3)); &mov ($rounds_,&wparam(4)); &mov ($rounds,&wparam(5)); &mov ($key_,"esp"); &sub ("esp",60); &and ("esp",-16); # align stack &mov (&DWP(48,"esp"),$key_); &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec &movdqu ($inout1,&QWP(0,$rounds)); # load cmac # compose byte-swap control mask for pshufb on stack &mov (&DWP(0,"esp"),0x0c0d0e0f); &mov (&DWP(4,"esp"),0x08090a0b); &mov (&DWP(8,"esp"),0x04050607); &mov (&DWP(12,"esp"),0x00010203); # compose counter increment vector on stack &mov ($rounds,1); &xor ($key_,$key_); &mov (&DWP(16,"esp"),$rounds); &mov (&DWP(20,"esp"),$key_); &mov (&DWP(24,"esp"),$key_); &mov (&DWP(28,"esp"),$key_); &movdqa ($inout3,&QWP(0,"esp")); &pshufb ($ivec,$inout3); # keep iv in reverse order &mov ($rounds,&DWP(240,$key)); &mov ($key_,$key); &mov ($rounds_,$rounds); &movdqa ($inout0,$ivec); &set_label("ccm64_enc_outer"); &movdqu ($in0,&QWP(0,$inp)); &pshufb ($inout0,$inout3); &mov ($key,$key_); &mov ($rounds,$rounds_); &pxor ($inout1,$in0); # cmac^=inp &pxor ($inout2,$inout2); &call ("_aesni_encrypt3"); &paddq ($ivec,&QWP(16,"esp")); &dec ($len); &lea ($inp,&DWP(16,$inp)); &pxor ($in0,$inout0); # inp^=E(ivec) &movdqa ($inout0,$ivec); &movdqu (&QWP(0,$out),$in0); &lea ($out,&DWP(16,$out)); &jnz (&label("ccm64_enc_outer")); &mov ("esp",&DWP(48,"esp")); &mov ($out,&wparam(5)); &movdqu (&QWP(0,$out),$inout1); &function_end("aesni_ccm64_encrypt_blocks"); &function_begin("aesni_ccm64_decrypt_blocks"); &mov ($inp,&wparam(0)); &mov ($out,&wparam(1)); &mov ($len,&wparam(2)); &mov ($key,&wparam(3)); &mov ($rounds_,&wparam(4)); &mov ($rounds,&wparam(5)); &mov ($key_,"esp"); &sub ("esp",60); &and ("esp",-16); # align stack &mov (&DWP(48,"esp"),$key_); &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec &movdqu ($inout1,&QWP(0,$rounds)); # load cmac # compose byte-swap control mask for pshufb on stack &mov (&DWP(0,"esp"),0x0c0d0e0f); &mov (&DWP(4,"esp"),0x08090a0b); &mov (&DWP(8,"esp"),0x04050607); &mov (&DWP(12,"esp"),0x00010203); # compose counter increment vector on stack &mov ($rounds,1); &xor ($key_,$key_); &mov (&DWP(16,"esp"),$rounds); &mov (&DWP(20,"esp"),$key_); &mov (&DWP(24,"esp"),$key_); &mov (&DWP(28,"esp"),$key_); &movdqa ($inout3,&QWP(0,"esp")); # bswap mask &movdqa ($inout0,$ivec); &pshufb ($ivec,$inout3); # keep iv in reverse order &mov ($rounds,&DWP(240,$key)); &mov ($key_,$key); &mov ($rounds_,$rounds); if ($inline) { &aesni_inline_generate1("enc"); } else { &call ("_aesni_encrypt1"); } &set_label("ccm64_dec_outer"); &movdqu ($in0,&QWP(0,$inp)); &paddq ($ivec,&QWP(16,"esp")); &dec ($len); &lea ($inp,&QWP(16,$inp)); &pxor ($in0,$inout0); &movdqa ($inout0,$ivec); &mov ($key,$key_); &mov ($rounds,$rounds_); &pshufb ($inout0,$inout3); &movdqu (&QWP(0,$out),$in0); &lea ($out,&DWP(16,$out)); &jz (&label("ccm64_dec_break")); &pxor ($inout2,$inout2); &call ("_aesni_encrypt3"); &jmp (&label("ccm64_dec_outer")); &set_label("ccm64_dec_break",16); if ($inline) { &aesni_inline_generate1("enc",$inout1); } else { &call ("_aesni_encrypt1",$inout1); } &mov ("esp",&DWP(48,"esp")); &mov ($out,&wparam(5)); &movdqu (&QWP(0,$out),$inout1); &function_end("aesni_ccm64_decrypt_blocks"); ###################################################################### # void aesni_ctr32_encrypt_blocks (const void *in, void *out, # size_t blocks, const AES_KEY *key, # const char *ivec); # # Handles only complete blocks, operates on 32-bit counter and # does not update *ivec! (see engine/eng_aesni.c for details) # &function_begin("aesni_ctr32_encrypt_blocks"); &mov ($inp,&wparam(0)); &mov ($out,&wparam(1)); Loading @@ -394,6 +556,9 @@ if ($PREFIX eq "aesni") { &and ("esp",-16); # align stack &mov (&DWP(48,"esp"),$key_); &cmp ($len,1); &je (&label("ctr32_one_shortcut")); &movups ($inout3,&QWP(0,$rounds_)); # load ivec # compose byte-swap control mask for pshufb on stack Loading @@ -414,7 +579,7 @@ if ($PREFIX eq "aesni") { &pinsrd ($inout3,$key_,3); # wipe 32-bit counter &mov ($rounds,&DWP(240,$key)); # key->rounds &movaps ($rndkey0,&QWP(0,"esp")); # load byte-swap mask &movdqa ($rndkey0,&QWP(0,"esp")); # load byte-swap mask # $ivec is vector of 3 32-bit counters &pxor ($ivec,$ivec); Loading @@ -424,11 +589,11 @@ if ($PREFIX eq "aesni") { &pinsrd ($ivec,$rounds_,1); &inc ($rounds_); &pinsrd ($ivec,$rounds_,2); &pshufb ($ivec,$rndkey0); # byte swap &cmp ($len,4); &pshufb ($ivec,$rndkey0); # byte swap &jbe (&label("ctr32_tail")); &movaps (&QWP(32,"esp"),$inout3); # save counter-less ivec &movdqa (&QWP(32,"esp"),$inout3); # save counter-less ivec &mov ($rounds_,$rounds); &mov ($key_,$key); &sub ($len,4); Loading @@ -437,104 +602,139 @@ if ($PREFIX eq "aesni") { &set_label("ctr32_loop3",16); &pshufd ($inout0,$ivec,3<<6); # place counter to upper dword &pshufd ($inout1,$ivec,2<<6); &pshufd ($inout2,$ivec,1<<6); &por ($inout0,$inout3); # merge counter-less ivec &pshufd ($inout2,$ivec,1<<6); &por ($inout1,$inout3); &por ($inout2,$inout3); &call ("_aesni_encrypt3"); # inline _aesni_encrypt3 and interleave last round # with own code... &movaps($rndkey0,&QWP(0,"esp")); # load byte-swap mask &movups ($in0,&QWP(0,$inp)); &movups ($in1,&QWP(0x10,$inp)); &movups ($rndkey1,&QWP(0x20,$inp)); &pshufb($ivec,$rndkey0); # byte swap &$movekey ($rndkey0,&QWP(0,$key)); &shr ($rounds,1); &$movekey ($rndkey1,&QWP(16,$key)); &lea ($key,&DWP(32,$key)); &pxor ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); &pxor ($inout2,$rndkey0); &$movekey ($rndkey0,&QWP(0,$key)); &set_label("ctr32_enc_loop3"); &aesenc ($inout0,$rndkey1); &aesenc ($inout1,$rndkey1); &dec ($rounds); &aesenc ($inout2,$rndkey1); &$movekey ($rndkey1,&QWP(16,$key)); &aesenc ($inout0,$rndkey0); &aesenc ($inout1,$rndkey0); &lea ($key,&DWP(32,$key)); &aesenc ($inout2,$rndkey0); &$movekey ($rndkey0,&QWP(0,$key)); &jnz (&label("ctr32_enc_loop3")); &aesenc ($inout0,$rndkey1); &aesenc ($inout1,$rndkey1); &aesenc ($inout2,$rndkey1); &movdqa ($rndkey1,&QWP(0,"esp")); # load byte-swap mask &aesenclast ($inout0,$rndkey0); &pshufb ($ivec,$rndkey1); # byte swap &movdqu ($in0,&QWP(0,$inp)); &aesenclast ($inout1,$rndkey0); &paddd ($ivec,&QWP(16,"esp")); # counter increment &movdqu ($in1,&QWP(0x10,$inp)); &aesenclast ($inout2,$rndkey0); &pshufb ($ivec,$rndkey1); # byte swap &movdqu ($rndkey0,&QWP(0x20,$inp)); &lea ($inp,&DWP(0x30,$inp)); &pxor ($in0,$inout0); &mov ($key,$key_); &pxor ($in1,$inout1); &pxor ($rndkey1,$inout2); &movups (&QWP(0,$out),$in0); &movups (&QWP(0x10,$out),$in1); &movups (&QWP(0x20,$out),$rndkey1); &movaps ($inout3,&QWP(32,"esp")); # load counter-less ivec &pshufb($ivec,$rndkey0); # byte swap &movdqu (&QWP(0,$out),$in0); &pxor ($rndkey0,$inout2); &movdqu (&QWP(0x10,$out),$in1); &movdqu (&QWP(0x20,$out),$rndkey0); &movdqa ($inout3,&QWP(32,"esp")); # load counter-less ivec &sub ($len,3); &lea ($inp,&DWP(0x30,$inp)); &lea ($out,&DWP(0x30,$out)); &mov ($key,$key_); &mov ($rounds,$rounds_); &ja (&label("ctr32_loop3")); &add ($len,4); &pextrd ($rounds_,$ivec,1); # might need last counter value &jz (&label("ctr32_ret")); &add ($len,4); &bswap ($rounds_); &set_label("ctr32_tail"); &cmp ($len,2); &pshufd ($inout0,$ivec,3<<6); &pshufd ($inout1,$ivec,2<<6); &pshufd ($inout2,$ivec,1<<6); &por ($inout0,$inout3); &cmp ($len,2); &jb (&label("ctr32_one")); &lea ($rounds_,&DWP(1,$rounds_)); &pshufd ($inout2,$ivec,1<<6); &por ($inout1,$inout3); &je (&label("ctr32_two")); &cmp ($len,3); &bswap ($rounds_); &por ($inout2,$inout3); &cmp ($len,3); &je (&label("ctr32_three")); &inc ($rounds_); # compose last counter value &bswap ($rounds_); &pinsrd ($inout3,$rounds_,3); &pinsrd ($inout3,$rounds_,3); # compose last counter value &call ("_aesni_encrypt4"); &movups ($in0,&QWP(0,$inp)); &movups ($rndkey1,&QWP(0x10,$inp)); &movups ($rndkey0,&QWP(0x20,$inp)); &movups ($ivec,&QWP(0x30,$inp)); &movdqu ($in0,&QWP(0,$inp)); &movdqu ($rndkey1,&QWP(0x10,$inp)); &pxor ($in0,$inout0); &movdqu ($rndkey0,&QWP(0x20,$inp)); &pxor ($rndkey1,$inout1); &movdqu ($ivec,&QWP(0x30,$inp)); &pxor ($rndkey0,$inout2); &movdqu (&QWP(0,$out),$in0); &pxor ($ivec,$inout3); &movups (&QWP(0,$out),$in0); &movups (&QWP(0x10,$out),$rndkey1); &movups (&QWP(0x20,$out),$rndkey0); &movups (&QWP(0x30,$out),$ivec); &movdqu (&QWP(0x10,$out),$rndkey1); &movdqu (&QWP(0x20,$out),$rndkey0); &movdqu (&QWP(0x30,$out),$ivec); &jmp (&label("ctr32_ret")); &set_label("ctr32_one",16); &set_label("ctr32_one_shortcut",16); &movdqu ($inout0,&QWP(0,$rounds_)); # load ivec &mov ($rounds,&DWP(240,$key)); &set_label("ctr32_one"); if ($inline) { &aesni_inline_generate1("enc"); } else { &call ("_aesni_encrypt1"); } &movups ($in0,&QWP(0,$inp)); &movdqu ($in0,&QWP(0,$inp)); &pxor ($in0,$inout0); &movups (&QWP(0,$out),$in0); &movdqu (&QWP(0,$out),$in0); &jmp (&label("ctr32_ret")); &set_label("ctr32_two",16); &pxor ($inout2,$inout2); &call ("_aesni_encrypt3"); &movups ($in0,&QWP(0,$inp)); &movups ($in1,&QWP(0x10,$inp)); &movdqu ($in0,&QWP(0,$inp)); &movdqu ($in1,&QWP(0x10,$inp)); &pxor ($in0,$inout0); &pxor ($in1,$inout1); &movups (&QWP(0,$out),$in0); &movups (&QWP(0x10,$out),$in1); &movdqu (&QWP(0,$out),$in0); &movdqu (&QWP(0x10,$out),$in1); &jmp (&label("ctr32_ret")); &set_label("ctr32_three",16); &call ("_aesni_encrypt3"); &movups ($in0,&QWP(0,$inp)); &movups ($in1,&QWP(0x10,$inp)); &movups ($rndkey1,&QWP(0x20,$inp)); &movdqu ($in0,&QWP(0,$inp)); &movdqu ($in1,&QWP(0x10,$inp)); &movdqu ($rndkey1,&QWP(0x20,$inp)); &pxor ($in0,$inout0); &pxor ($in1,$inout1); &movdqu (&QWP(0,$out),$in0); &pxor ($rndkey1,$inout2); &movups (&QWP(0,$out),$in0); &movups (&QWP(0x10,$out),$in1); &movups (&QWP(0x20,$out),$rndkey1); &movdqu (&QWP(0x10,$out),$in1); &movdqu (&QWP(0x20,$out),$rndkey1); &set_label("ctr32_ret"); &mov ("esp",&DWP(48,"esp")); Loading @@ -550,36 +750,36 @@ if ($PREFIX eq "aesni") { &mov ($out,&wparam(1)); &mov ($len,&wparam(2)); &mov ($key,&wparam(3)); &test ($len,$len); &mov ($key_,&wparam(4)); &test ($len,$len); &jz (&label("cbc_ret")); &cmp (&wparam(5),0); &movups ($ivec,&QWP(0,$key_)); # load IV &movdqu ($ivec,&QWP(0,$key_)); # load IV &mov ($rounds,&DWP(240,$key)); &mov ($key_,$key); # backup $key &mov ($rounds_,$rounds); # backup $rounds &je (&label("cbc_decrypt")); &movaps ($inout0,$ivec); &movdqa ($inout0,$ivec); &cmp ($len,16); &jb (&label("cbc_enc_tail")); &sub ($len,16); &jmp (&label("cbc_enc_loop")); &set_label("cbc_enc_loop",16); &movups ($ivec,&QWP(0,$inp)); &movdqu ($ivec,&QWP(0,$inp)); &lea ($inp,&DWP(16,$inp)); &pxor ($inout0,$ivec); if ($inline) { &aesni_inline_generate1("enc"); } else { &call ("_aesni_encrypt1"); } &sub ($len,16); &lea ($out,&DWP(16,$out)); &mov ($rounds,$rounds_); # restore $rounds &mov ($key,$key_); # restore $key &movups (&QWP(-16,$out),$inout0); &movups (&QWP(0,$out),$inout0); # store output &lea ($out,&DWP(16,$out)); &sub ($len,16); &jnc (&label("cbc_enc_loop")); &add ($len,16); &jnz (&label("cbc_enc_tail")); Loading Loading @@ -611,93 +811,94 @@ if ($PREFIX eq "aesni") { &movups ($inout2,&QWP(0x20,$inp)); &movaps ($in0,$inout0); &movaps ($in1,$inout1); &call ("_aesni_decrypt3"); &sub ($len,0x30); &lea ($inp,&DWP(0x30,$inp)); &lea ($out,&DWP(0x30,$out)); &pxor ($inout0,$ivec); &pxor ($inout1,$in0); &movups ($ivec,&QWP(-0x10,$inp)); &movdqu ($ivec,&QWP(0x20,$inp)); &lea ($inp,&DWP(0x30,$inp)); &pxor ($inout2,$in1); &movups (&QWP(-0x30,$out),$inout0); &movdqu (&QWP(0,$out),$inout0); &mov ($rounds,$rounds_) # restore $rounds &movups (&QWP(-0x20,$out),$inout1); &movdqu (&QWP(0x10,$out),$inout1); &mov ($key,$key_); # restore $key &movups (&QWP(-0x10,$out),$inout2); &movdqu (&QWP(0x20,$out),$inout2); &lea ($out,&DWP(0x30,$out)); &sub ($len,0x30); &ja (&label("cbc_dec_loop3")); &add ($len,0x40); &jz (&label("cbc_ret")); &set_label("cbc_dec_tail"); &movups ($inout0,&QWP(0,$inp)); &cmp ($len,0x10); &movaps ($in0,$inout0); &cmp ($len,0x10); &jbe (&label("cbc_dec_one")); &movups ($inout1,&QWP(0x10,$inp)); &cmp ($len,0x20); &movaps ($in1,$inout1); &cmp ($len,0x20); &jbe (&label("cbc_dec_two")); &movups ($inout2,&QWP(0x20,$inp)); &cmp ($len,0x30); &jbe (&label("cbc_dec_three")); &movups ($inout3,&QWP(0x30,$inp)); &call ("_aesni_decrypt4"); &movups ($rndkey0,&QWP(0x10,$inp)); &movups ($rndkey1,&QWP(0x20,$inp)); &movdqu ($rndkey0,&QWP(0x10,$inp)); &movdqu ($rndkey1,&QWP(0x20,$inp)); &pxor ($inout0,$ivec); &pxor ($inout1,$in0); &movups ($ivec,&QWP(0x30,$inp)); &movups (&QWP(0,$out),$inout0); &movdqu ($ivec,&QWP(0x30,$inp)); &movdqu (&QWP(0,$out),$inout0); &pxor ($inout2,$rndkey0); &pxor ($inout3,$rndkey1); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &movaps ($inout0,$inout3); &movdqu (&QWP(0x10,$out),$inout1); &movdqu (&QWP(0x20,$out),$inout2); &movdqa ($inout0,$inout3); &lea ($out,&DWP(0x30,$out)); &jmp (&label("cbc_dec_tail_collected")); &set_label("cbc_dec_one"); &set_label("cbc_dec_one",16); if ($inline) { &aesni_inline_generate1("dec"); } else { &call ("_aesni_decrypt1"); } &pxor ($inout0,$ivec); &movaps ($ivec,$in0); &movdqa ($ivec,$in0); &jmp (&label("cbc_dec_tail_collected")); &set_label("cbc_dec_two"); &set_label("cbc_dec_two",16); &pxor ($inout2,$inout2); &call ("_aesni_decrypt3"); &pxor ($inout0,$ivec); &pxor ($inout1,$in0); &movups (&QWP(0,$out),$inout0); &movaps ($inout0,$inout1); &movaps ($ivec,$in1); &movdqu (&QWP(0,$out),$inout0); &movdqa ($inout0,$inout1); &movdqa ($ivec,$in1); &lea ($out,&DWP(0x10,$out)); &jmp (&label("cbc_dec_tail_collected")); &set_label("cbc_dec_three"); &set_label("cbc_dec_three",16); &call ("_aesni_decrypt3"); &pxor ($inout0,$ivec); &pxor ($inout1,$in0); &pxor ($inout2,$in1); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movaps ($inout0,$inout2); &movups ($ivec,&QWP(0x20,$inp)); &movdqu (&QWP(0,$out),$inout0); &movdqu (&QWP(0x10,$out),$inout1); &movdqa ($inout0,$inout2); &movdqu ($ivec,&QWP(0x20,$inp)); &lea ($out,&DWP(0x20,$out)); &set_label("cbc_dec_tail_collected"); &and ($len,15); &jnz (&label("cbc_dec_tail_partial")); &movups (&QWP(0,$out),$inout0); &movdqu (&QWP(0,$out),$inout0); &jmp (&label("cbc_ret")); &set_label("cbc_dec_tail_partial"); &set_label("cbc_dec_tail_partial",16); &mov ($key_,"esp"); &sub ("esp",16); &and ("esp",-16); &movaps (&QWP(0,"esp"),$inout0); &movdqa (&QWP(0,"esp"),$inout0); &mov ($inp,"esp"); &mov ("ecx",$len); &data_word(0xA4F3F689); # rep movsb Loading Loading @@ -935,9 +1136,9 @@ if ($PREFIX eq "aesni") { &aesimc ("xmm1","xmm1"); &lea ($key,&DWP(16,$key)); &lea ("eax",&DWP(-16,"eax")); &cmp ("eax",$key); &$movekey (&QWP(16,"eax"),"xmm0"); &$movekey (&QWP(-16,$key),"xmm1"); &cmp ("eax",$key); &ja (&label("dec_key_inverse")); &$movekey ("xmm0",&QWP(0,$key)); # inverse middle Loading crypto/aes/asm/aesni-x86_64.pl +425 −112 File changed.Preview size limit exceeded, changes collapsed. Show changes Loading
crypto/aes/asm/aesni-x86.pl +344 −143 Original line number Diff line number Diff line Loading @@ -11,6 +11,23 @@ # OpenSSL context it's used with Intel engine, but can also be used as # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for # details]. # # Performance. # # To start with see corresponding paragraph in aesni-x86_64.pl... # Instead of filling table similar to one found there I've chosen to # summarize *comparison* results for raw ECB, CTR and CBC benchmarks. # The simplified table below represents 32-bit performance relative # to 64-bit one in every given point. Ratios vary for different # encryption modes, therefore interval values. # # 16-byte 64-byte 256-byte 1-KB 8-KB # 53-67% 67-84% 91-94% 95-98% 97-99.5% # # Lower ratios for smaller block sizes are perfectly understandable, # because function call overhead is higher in 32-bit mode. Largest # 8-KB block performance is virtually same: 32-bit code is less than # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise. $PREFIX="aesni"; # if $PREFIX is set to "AES", the script # generates drop-in replacement for Loading Loading @@ -44,64 +61,66 @@ $in0="xmm6"; $in1="xmm7"; $inout3="xmm7"; # Inline version of internal aesni_[en|de]crypt1 { my $sn; sub aesni_inline_generate1 { my $p=shift; { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout)); $sn++; &$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(16,$key)); &lea ($key,&DWP(32,$key)); &pxor ($inout0,$rndkey0); &set_label("${p}1_loop"); eval"&aes${p} ($inout0,$rndkey1)"; &pxor ($inout,$rndkey0); &set_label("${p}1_loop_$sn"); eval"&aes${p} ($inout,$rndkey1)"; &dec ($rounds); &$movekey ($rndkey1,&QWP(0,$key)); &lea ($key,&DWP(16,$key)); &jnz (&label("${p}1_loop")); eval"&aes${p}last ($inout0,$rndkey1)"; } &jnz (&label("${p}1_loop_$sn")); eval"&aes${p}last ($inout,$rndkey1)"; }} sub aesni_generate1 # fully unrolled loop { my $p=shift; { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout)); &function_begin_B("_aesni_${p}rypt1"); &$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(0x10,$key)); &cmp ($rounds,11); &pxor ($inout0,$rndkey0); &pxor ($inout,$rndkey0); &$movekey ($rndkey0,&QWP(0x20,$key)); &lea ($key,&DWP(0x30,$key)); &cmp ($rounds,11); &jb (&label("${p}128")); &lea ($key,&DWP(0x20,$key)); &je (&label("${p}192")); &lea ($key,&DWP(0x20,$key)); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout,$rndkey1)"; &$movekey ($rndkey1,&QWP(-0x40,$key)); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout,$rndkey0)"; &$movekey ($rndkey0,&QWP(-0x30,$key)); &set_label("${p}192"); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout,$rndkey1)"; &$movekey ($rndkey1,&QWP(-0x20,$key)); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout,$rndkey0)"; &$movekey ($rndkey0,&QWP(-0x10,$key)); &set_label("${p}128"); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout,$rndkey1)"; &$movekey ($rndkey1,&QWP(0,$key)); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout,$rndkey0)"; &$movekey ($rndkey0,&QWP(0x10,$key)); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout,$rndkey1)"; &$movekey ($rndkey1,&QWP(0x20,$key)); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout,$rndkey0)"; &$movekey ($rndkey0,&QWP(0x30,$key)); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout,$rndkey1)"; &$movekey ($rndkey1,&QWP(0x40,$key)); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout,$rndkey0)"; &$movekey ($rndkey0,&QWP(0x50,$key)); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout,$rndkey1)"; &$movekey ($rndkey1,&QWP(0x60,$key)); eval"&aes${p} ($inout0,$rndkey0)"; eval"&aes${p} ($inout,$rndkey0)"; &$movekey ($rndkey0,&QWP(0x70,$key)); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p}last ($inout0,$rndkey0)"; eval"&aes${p} ($inout,$rndkey1)"; eval"&aes${p}last ($inout,$rndkey0)"; &ret(); &function_end_B("_aesni_${p}rypt1"); } Loading Loading @@ -159,21 +178,21 @@ sub aesni_generate3 &pxor ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); &pxor ($inout2,$rndkey0); &jmp (&label("${p}3_loop")); &set_label("${p}3_loop",16); eval"&aes${p} ($inout0,$rndkey1)"; &$movekey ($rndkey0,&QWP(0,$key)); &set_label("${p}3_loop"); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; &dec ($rounds); eval"&aes${p} ($inout2,$rndkey1)"; &$movekey ($rndkey1,&QWP(16,$key)); eval"&aes${p} ($inout0,$rndkey0)"; &lea ($key,&DWP(32,$key)); eval"&aes${p} ($inout1,$rndkey0)"; &lea ($key,&DWP(32,$key)); eval"&aes${p} ($inout2,$rndkey0)"; &$movekey ($rndkey0,&QWP(0,$key)); &jnz (&label("${p}3_loop")); eval"&aes${p} ($inout0,$rndkey1)"; &$movekey ($rndkey0,&QWP(0,$key)); eval"&aes${p} ($inout1,$rndkey1)"; eval"&aes${p} ($inout2,$rndkey1)"; eval"&aes${p}last ($inout0,$rndkey0)"; Loading @@ -199,23 +218,24 @@ sub aesni_generate4 &pxor ($inout1,$rndkey0); &pxor ($inout2,$rndkey0); &pxor ($inout3,$rndkey0); &jmp (&label("${p}3_loop")); &set_label("${p}3_loop",16); eval"&aes${p} ($inout0,$rndkey1)"; &$movekey ($rndkey0,&QWP(0,$key)); &set_label("${p}3_loop"); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; &dec ($rounds); eval"&aes${p} ($inout2,$rndkey1)"; eval"&aes${p} ($inout3,$rndkey1)"; &$movekey ($rndkey1,&QWP(16,$key)); eval"&aes${p} ($inout0,$rndkey0)"; &lea ($key,&DWP(32,$key)); eval"&aes${p} ($inout1,$rndkey0)"; &lea ($key,&DWP(32,$key)); eval"&aes${p} ($inout2,$rndkey0)"; eval"&aes${p} ($inout3,$rndkey0)"; &$movekey ($rndkey0,&QWP(0,$key)); &jnz (&label("${p}3_loop")); eval"&aes${p} ($inout0,$rndkey1)"; &$movekey ($rndkey0,&QWP(0,$key)); eval"&aes${p} ($inout1,$rndkey1)"; eval"&aes${p} ($inout2,$rndkey1)"; eval"&aes${p} ($inout3,$rndkey1)"; Loading Loading @@ -261,27 +281,25 @@ if ($PREFIX eq "aesni") { &movups ($inout1,&QWP(0x10,$inp)); &movups ($inout2,&QWP(0x20,$inp)); &call ("_aesni_encrypt3"); &sub ($len,0x30); &lea ($inp,&DWP(0x30,$inp)); &lea ($out,&DWP(0x30,$out)); &movups (&QWP(-0x30,$out),$inout0); &movups (&QWP(0,$out),$inout0); &mov ($key,$key_); # restore $key &movups (&QWP(-0x20,$out),$inout1); &movups (&QWP(0x10,$out),$inout1); &mov ($rounds,$rounds_); # restore $rounds &movups (&QWP(-0x10,$out),$inout2); &movups (&QWP(0x20,$out),$inout2); &lea ($out,&DWP(0x30,$out)); &sub ($len,0x30); &ja (&label("ecb_enc_loop3")); &add ($len,0x40); &jz (&label("ecb_ret")); &set_label("ecb_enc_tail"); &cmp ($len,0x20); &movups ($inout0,&QWP(0,$inp)); &cmp ($len,0x20); &jb (&label("ecb_enc_one")); &movups ($inout1,&QWP(0x10,$inp)); &je (&label("ecb_enc_two")); &cmp ($len,0x30); &movups ($inout2,&QWP(0x20,$inp)); &cmp ($len,0x30); &je (&label("ecb_enc_three")); &movups ($inout3,&QWP(0x30,$inp)); &call ("_aesni_encrypt4"); Loading @@ -300,6 +318,7 @@ if ($PREFIX eq "aesni") { &jmp (&label("ecb_ret")); &set_label("ecb_enc_two",16); &pxor ($inout2,$inout2); &call ("_aesni_encrypt3"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); Loading @@ -323,27 +342,25 @@ if ($PREFIX eq "aesni") { &movups ($inout1,&QWP(0x10,$inp)); &movups ($inout2,&QWP(0x20,$inp)); &call ("_aesni_decrypt3"); &sub ($len,0x30); &lea ($inp,&DWP(0x30,$inp)); &lea ($out,&DWP(0x30,$out)); &movups (&QWP(-0x30,$out),$inout0); &movups (&QWP(0,$out),$inout0); &mov ($key,$key_); # restore $key &movups (&QWP(-0x20,$out),$inout1); &movups (&QWP(0x10,$out),$inout1); &mov ($rounds,$rounds_); # restore $rounds &movups (&QWP(-0x10,$out),$inout2); &movups (&QWP(0x20,$out),$inout2); &lea ($out,&DWP(0x30,$out)); &sub ($len,0x30); &ja (&label("ecb_dec_loop3")); &add ($len,0x40); &jz (&label("ecb_ret")); &set_label("ecb_dec_tail"); &cmp ($len,0x20); &movups ($inout0,&QWP(0,$inp)); &cmp ($len,0x20); &jb (&label("ecb_dec_one")); &movups ($inout1,&QWP(0x10,$inp)); &je (&label("ecb_dec_two")); &cmp ($len,0x30); &movups ($inout2,&QWP(0x20,$inp)); &cmp ($len,0x30); &je (&label("ecb_dec_three")); &movups ($inout3,&QWP(0x30,$inp)); &call ("_aesni_decrypt4"); Loading @@ -362,6 +379,7 @@ if ($PREFIX eq "aesni") { &jmp (&label("ecb_ret")); &set_label("ecb_dec_two",16); &pxor ($inout2,$inout2); &call ("_aesni_decrypt3"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); Loading @@ -377,12 +395,156 @@ if ($PREFIX eq "aesni") { &function_end("aesni_ecb_encrypt"); ###################################################################### # handles only complete blocks, operates on 32-bit counter and # does not update *ivec! (see engine/eng_aesni.c for details) # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, # size_t blocks, const AES_KEY *key, # const char *ivec,char *cmac); # # Handles only complete blocks, operates on 64-bit counter and # does not update *ivec! Nor does it finalize CMAC value # (see engine/eng_aesni.c for details) # &function_begin("aesni_ccm64_encrypt_blocks"); &mov ($inp,&wparam(0)); &mov ($out,&wparam(1)); &mov ($len,&wparam(2)); &mov ($key,&wparam(3)); &mov ($rounds_,&wparam(4)); &mov ($rounds,&wparam(5)); &mov ($key_,"esp"); &sub ("esp",60); &and ("esp",-16); # align stack &mov (&DWP(48,"esp"),$key_); &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec &movdqu ($inout1,&QWP(0,$rounds)); # load cmac # compose byte-swap control mask for pshufb on stack &mov (&DWP(0,"esp"),0x0c0d0e0f); &mov (&DWP(4,"esp"),0x08090a0b); &mov (&DWP(8,"esp"),0x04050607); &mov (&DWP(12,"esp"),0x00010203); # compose counter increment vector on stack &mov ($rounds,1); &xor ($key_,$key_); &mov (&DWP(16,"esp"),$rounds); &mov (&DWP(20,"esp"),$key_); &mov (&DWP(24,"esp"),$key_); &mov (&DWP(28,"esp"),$key_); &movdqa ($inout3,&QWP(0,"esp")); &pshufb ($ivec,$inout3); # keep iv in reverse order &mov ($rounds,&DWP(240,$key)); &mov ($key_,$key); &mov ($rounds_,$rounds); &movdqa ($inout0,$ivec); &set_label("ccm64_enc_outer"); &movdqu ($in0,&QWP(0,$inp)); &pshufb ($inout0,$inout3); &mov ($key,$key_); &mov ($rounds,$rounds_); &pxor ($inout1,$in0); # cmac^=inp &pxor ($inout2,$inout2); &call ("_aesni_encrypt3"); &paddq ($ivec,&QWP(16,"esp")); &dec ($len); &lea ($inp,&DWP(16,$inp)); &pxor ($in0,$inout0); # inp^=E(ivec) &movdqa ($inout0,$ivec); &movdqu (&QWP(0,$out),$in0); &lea ($out,&DWP(16,$out)); &jnz (&label("ccm64_enc_outer")); &mov ("esp",&DWP(48,"esp")); &mov ($out,&wparam(5)); &movdqu (&QWP(0,$out),$inout1); &function_end("aesni_ccm64_encrypt_blocks"); &function_begin("aesni_ccm64_decrypt_blocks"); &mov ($inp,&wparam(0)); &mov ($out,&wparam(1)); &mov ($len,&wparam(2)); &mov ($key,&wparam(3)); &mov ($rounds_,&wparam(4)); &mov ($rounds,&wparam(5)); &mov ($key_,"esp"); &sub ("esp",60); &and ("esp",-16); # align stack &mov (&DWP(48,"esp"),$key_); &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec &movdqu ($inout1,&QWP(0,$rounds)); # load cmac # compose byte-swap control mask for pshufb on stack &mov (&DWP(0,"esp"),0x0c0d0e0f); &mov (&DWP(4,"esp"),0x08090a0b); &mov (&DWP(8,"esp"),0x04050607); &mov (&DWP(12,"esp"),0x00010203); # compose counter increment vector on stack &mov ($rounds,1); &xor ($key_,$key_); &mov (&DWP(16,"esp"),$rounds); &mov (&DWP(20,"esp"),$key_); &mov (&DWP(24,"esp"),$key_); &mov (&DWP(28,"esp"),$key_); &movdqa ($inout3,&QWP(0,"esp")); # bswap mask &movdqa ($inout0,$ivec); &pshufb ($ivec,$inout3); # keep iv in reverse order &mov ($rounds,&DWP(240,$key)); &mov ($key_,$key); &mov ($rounds_,$rounds); if ($inline) { &aesni_inline_generate1("enc"); } else { &call ("_aesni_encrypt1"); } &set_label("ccm64_dec_outer"); &movdqu ($in0,&QWP(0,$inp)); &paddq ($ivec,&QWP(16,"esp")); &dec ($len); &lea ($inp,&QWP(16,$inp)); &pxor ($in0,$inout0); &movdqa ($inout0,$ivec); &mov ($key,$key_); &mov ($rounds,$rounds_); &pshufb ($inout0,$inout3); &movdqu (&QWP(0,$out),$in0); &lea ($out,&DWP(16,$out)); &jz (&label("ccm64_dec_break")); &pxor ($inout2,$inout2); &call ("_aesni_encrypt3"); &jmp (&label("ccm64_dec_outer")); &set_label("ccm64_dec_break",16); if ($inline) { &aesni_inline_generate1("enc",$inout1); } else { &call ("_aesni_encrypt1",$inout1); } &mov ("esp",&DWP(48,"esp")); &mov ($out,&wparam(5)); &movdqu (&QWP(0,$out),$inout1); &function_end("aesni_ccm64_decrypt_blocks"); ###################################################################### # void aesni_ctr32_encrypt_blocks (const void *in, void *out, # size_t blocks, const AES_KEY *key, # const char *ivec); # # Handles only complete blocks, operates on 32-bit counter and # does not update *ivec! (see engine/eng_aesni.c for details) # &function_begin("aesni_ctr32_encrypt_blocks"); &mov ($inp,&wparam(0)); &mov ($out,&wparam(1)); Loading @@ -394,6 +556,9 @@ if ($PREFIX eq "aesni") { &and ("esp",-16); # align stack &mov (&DWP(48,"esp"),$key_); &cmp ($len,1); &je (&label("ctr32_one_shortcut")); &movups ($inout3,&QWP(0,$rounds_)); # load ivec # compose byte-swap control mask for pshufb on stack Loading @@ -414,7 +579,7 @@ if ($PREFIX eq "aesni") { &pinsrd ($inout3,$key_,3); # wipe 32-bit counter &mov ($rounds,&DWP(240,$key)); # key->rounds &movaps ($rndkey0,&QWP(0,"esp")); # load byte-swap mask &movdqa ($rndkey0,&QWP(0,"esp")); # load byte-swap mask # $ivec is vector of 3 32-bit counters &pxor ($ivec,$ivec); Loading @@ -424,11 +589,11 @@ if ($PREFIX eq "aesni") { &pinsrd ($ivec,$rounds_,1); &inc ($rounds_); &pinsrd ($ivec,$rounds_,2); &pshufb ($ivec,$rndkey0); # byte swap &cmp ($len,4); &pshufb ($ivec,$rndkey0); # byte swap &jbe (&label("ctr32_tail")); &movaps (&QWP(32,"esp"),$inout3); # save counter-less ivec &movdqa (&QWP(32,"esp"),$inout3); # save counter-less ivec &mov ($rounds_,$rounds); &mov ($key_,$key); &sub ($len,4); Loading @@ -437,104 +602,139 @@ if ($PREFIX eq "aesni") { &set_label("ctr32_loop3",16); &pshufd ($inout0,$ivec,3<<6); # place counter to upper dword &pshufd ($inout1,$ivec,2<<6); &pshufd ($inout2,$ivec,1<<6); &por ($inout0,$inout3); # merge counter-less ivec &pshufd ($inout2,$ivec,1<<6); &por ($inout1,$inout3); &por ($inout2,$inout3); &call ("_aesni_encrypt3"); # inline _aesni_encrypt3 and interleave last round # with own code... &movaps($rndkey0,&QWP(0,"esp")); # load byte-swap mask &movups ($in0,&QWP(0,$inp)); &movups ($in1,&QWP(0x10,$inp)); &movups ($rndkey1,&QWP(0x20,$inp)); &pshufb($ivec,$rndkey0); # byte swap &$movekey ($rndkey0,&QWP(0,$key)); &shr ($rounds,1); &$movekey ($rndkey1,&QWP(16,$key)); &lea ($key,&DWP(32,$key)); &pxor ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); &pxor ($inout2,$rndkey0); &$movekey ($rndkey0,&QWP(0,$key)); &set_label("ctr32_enc_loop3"); &aesenc ($inout0,$rndkey1); &aesenc ($inout1,$rndkey1); &dec ($rounds); &aesenc ($inout2,$rndkey1); &$movekey ($rndkey1,&QWP(16,$key)); &aesenc ($inout0,$rndkey0); &aesenc ($inout1,$rndkey0); &lea ($key,&DWP(32,$key)); &aesenc ($inout2,$rndkey0); &$movekey ($rndkey0,&QWP(0,$key)); &jnz (&label("ctr32_enc_loop3")); &aesenc ($inout0,$rndkey1); &aesenc ($inout1,$rndkey1); &aesenc ($inout2,$rndkey1); &movdqa ($rndkey1,&QWP(0,"esp")); # load byte-swap mask &aesenclast ($inout0,$rndkey0); &pshufb ($ivec,$rndkey1); # byte swap &movdqu ($in0,&QWP(0,$inp)); &aesenclast ($inout1,$rndkey0); &paddd ($ivec,&QWP(16,"esp")); # counter increment &movdqu ($in1,&QWP(0x10,$inp)); &aesenclast ($inout2,$rndkey0); &pshufb ($ivec,$rndkey1); # byte swap &movdqu ($rndkey0,&QWP(0x20,$inp)); &lea ($inp,&DWP(0x30,$inp)); &pxor ($in0,$inout0); &mov ($key,$key_); &pxor ($in1,$inout1); &pxor ($rndkey1,$inout2); &movups (&QWP(0,$out),$in0); &movups (&QWP(0x10,$out),$in1); &movups (&QWP(0x20,$out),$rndkey1); &movaps ($inout3,&QWP(32,"esp")); # load counter-less ivec &pshufb($ivec,$rndkey0); # byte swap &movdqu (&QWP(0,$out),$in0); &pxor ($rndkey0,$inout2); &movdqu (&QWP(0x10,$out),$in1); &movdqu (&QWP(0x20,$out),$rndkey0); &movdqa ($inout3,&QWP(32,"esp")); # load counter-less ivec &sub ($len,3); &lea ($inp,&DWP(0x30,$inp)); &lea ($out,&DWP(0x30,$out)); &mov ($key,$key_); &mov ($rounds,$rounds_); &ja (&label("ctr32_loop3")); &add ($len,4); &pextrd ($rounds_,$ivec,1); # might need last counter value &jz (&label("ctr32_ret")); &add ($len,4); &bswap ($rounds_); &set_label("ctr32_tail"); &cmp ($len,2); &pshufd ($inout0,$ivec,3<<6); &pshufd ($inout1,$ivec,2<<6); &pshufd ($inout2,$ivec,1<<6); &por ($inout0,$inout3); &cmp ($len,2); &jb (&label("ctr32_one")); &lea ($rounds_,&DWP(1,$rounds_)); &pshufd ($inout2,$ivec,1<<6); &por ($inout1,$inout3); &je (&label("ctr32_two")); &cmp ($len,3); &bswap ($rounds_); &por ($inout2,$inout3); &cmp ($len,3); &je (&label("ctr32_three")); &inc ($rounds_); # compose last counter value &bswap ($rounds_); &pinsrd ($inout3,$rounds_,3); &pinsrd ($inout3,$rounds_,3); # compose last counter value &call ("_aesni_encrypt4"); &movups ($in0,&QWP(0,$inp)); &movups ($rndkey1,&QWP(0x10,$inp)); &movups ($rndkey0,&QWP(0x20,$inp)); &movups ($ivec,&QWP(0x30,$inp)); &movdqu ($in0,&QWP(0,$inp)); &movdqu ($rndkey1,&QWP(0x10,$inp)); &pxor ($in0,$inout0); &movdqu ($rndkey0,&QWP(0x20,$inp)); &pxor ($rndkey1,$inout1); &movdqu ($ivec,&QWP(0x30,$inp)); &pxor ($rndkey0,$inout2); &movdqu (&QWP(0,$out),$in0); &pxor ($ivec,$inout3); &movups (&QWP(0,$out),$in0); &movups (&QWP(0x10,$out),$rndkey1); &movups (&QWP(0x20,$out),$rndkey0); &movups (&QWP(0x30,$out),$ivec); &movdqu (&QWP(0x10,$out),$rndkey1); &movdqu (&QWP(0x20,$out),$rndkey0); &movdqu (&QWP(0x30,$out),$ivec); &jmp (&label("ctr32_ret")); &set_label("ctr32_one",16); &set_label("ctr32_one_shortcut",16); &movdqu ($inout0,&QWP(0,$rounds_)); # load ivec &mov ($rounds,&DWP(240,$key)); &set_label("ctr32_one"); if ($inline) { &aesni_inline_generate1("enc"); } else { &call ("_aesni_encrypt1"); } &movups ($in0,&QWP(0,$inp)); &movdqu ($in0,&QWP(0,$inp)); &pxor ($in0,$inout0); &movups (&QWP(0,$out),$in0); &movdqu (&QWP(0,$out),$in0); &jmp (&label("ctr32_ret")); &set_label("ctr32_two",16); &pxor ($inout2,$inout2); &call ("_aesni_encrypt3"); &movups ($in0,&QWP(0,$inp)); &movups ($in1,&QWP(0x10,$inp)); &movdqu ($in0,&QWP(0,$inp)); &movdqu ($in1,&QWP(0x10,$inp)); &pxor ($in0,$inout0); &pxor ($in1,$inout1); &movups (&QWP(0,$out),$in0); &movups (&QWP(0x10,$out),$in1); &movdqu (&QWP(0,$out),$in0); &movdqu (&QWP(0x10,$out),$in1); &jmp (&label("ctr32_ret")); &set_label("ctr32_three",16); &call ("_aesni_encrypt3"); &movups ($in0,&QWP(0,$inp)); &movups ($in1,&QWP(0x10,$inp)); &movups ($rndkey1,&QWP(0x20,$inp)); &movdqu ($in0,&QWP(0,$inp)); &movdqu ($in1,&QWP(0x10,$inp)); &movdqu ($rndkey1,&QWP(0x20,$inp)); &pxor ($in0,$inout0); &pxor ($in1,$inout1); &movdqu (&QWP(0,$out),$in0); &pxor ($rndkey1,$inout2); &movups (&QWP(0,$out),$in0); &movups (&QWP(0x10,$out),$in1); &movups (&QWP(0x20,$out),$rndkey1); &movdqu (&QWP(0x10,$out),$in1); &movdqu (&QWP(0x20,$out),$rndkey1); &set_label("ctr32_ret"); &mov ("esp",&DWP(48,"esp")); Loading @@ -550,36 +750,36 @@ if ($PREFIX eq "aesni") { &mov ($out,&wparam(1)); &mov ($len,&wparam(2)); &mov ($key,&wparam(3)); &test ($len,$len); &mov ($key_,&wparam(4)); &test ($len,$len); &jz (&label("cbc_ret")); &cmp (&wparam(5),0); &movups ($ivec,&QWP(0,$key_)); # load IV &movdqu ($ivec,&QWP(0,$key_)); # load IV &mov ($rounds,&DWP(240,$key)); &mov ($key_,$key); # backup $key &mov ($rounds_,$rounds); # backup $rounds &je (&label("cbc_decrypt")); &movaps ($inout0,$ivec); &movdqa ($inout0,$ivec); &cmp ($len,16); &jb (&label("cbc_enc_tail")); &sub ($len,16); &jmp (&label("cbc_enc_loop")); &set_label("cbc_enc_loop",16); &movups ($ivec,&QWP(0,$inp)); &movdqu ($ivec,&QWP(0,$inp)); &lea ($inp,&DWP(16,$inp)); &pxor ($inout0,$ivec); if ($inline) { &aesni_inline_generate1("enc"); } else { &call ("_aesni_encrypt1"); } &sub ($len,16); &lea ($out,&DWP(16,$out)); &mov ($rounds,$rounds_); # restore $rounds &mov ($key,$key_); # restore $key &movups (&QWP(-16,$out),$inout0); &movups (&QWP(0,$out),$inout0); # store output &lea ($out,&DWP(16,$out)); &sub ($len,16); &jnc (&label("cbc_enc_loop")); &add ($len,16); &jnz (&label("cbc_enc_tail")); Loading Loading @@ -611,93 +811,94 @@ if ($PREFIX eq "aesni") { &movups ($inout2,&QWP(0x20,$inp)); &movaps ($in0,$inout0); &movaps ($in1,$inout1); &call ("_aesni_decrypt3"); &sub ($len,0x30); &lea ($inp,&DWP(0x30,$inp)); &lea ($out,&DWP(0x30,$out)); &pxor ($inout0,$ivec); &pxor ($inout1,$in0); &movups ($ivec,&QWP(-0x10,$inp)); &movdqu ($ivec,&QWP(0x20,$inp)); &lea ($inp,&DWP(0x30,$inp)); &pxor ($inout2,$in1); &movups (&QWP(-0x30,$out),$inout0); &movdqu (&QWP(0,$out),$inout0); &mov ($rounds,$rounds_) # restore $rounds &movups (&QWP(-0x20,$out),$inout1); &movdqu (&QWP(0x10,$out),$inout1); &mov ($key,$key_); # restore $key &movups (&QWP(-0x10,$out),$inout2); &movdqu (&QWP(0x20,$out),$inout2); &lea ($out,&DWP(0x30,$out)); &sub ($len,0x30); &ja (&label("cbc_dec_loop3")); &add ($len,0x40); &jz (&label("cbc_ret")); &set_label("cbc_dec_tail"); &movups ($inout0,&QWP(0,$inp)); &cmp ($len,0x10); &movaps ($in0,$inout0); &cmp ($len,0x10); &jbe (&label("cbc_dec_one")); &movups ($inout1,&QWP(0x10,$inp)); &cmp ($len,0x20); &movaps ($in1,$inout1); &cmp ($len,0x20); &jbe (&label("cbc_dec_two")); &movups ($inout2,&QWP(0x20,$inp)); &cmp ($len,0x30); &jbe (&label("cbc_dec_three")); &movups ($inout3,&QWP(0x30,$inp)); &call ("_aesni_decrypt4"); &movups ($rndkey0,&QWP(0x10,$inp)); &movups ($rndkey1,&QWP(0x20,$inp)); &movdqu ($rndkey0,&QWP(0x10,$inp)); &movdqu ($rndkey1,&QWP(0x20,$inp)); &pxor ($inout0,$ivec); &pxor ($inout1,$in0); &movups ($ivec,&QWP(0x30,$inp)); &movups (&QWP(0,$out),$inout0); &movdqu ($ivec,&QWP(0x30,$inp)); &movdqu (&QWP(0,$out),$inout0); &pxor ($inout2,$rndkey0); &pxor ($inout3,$rndkey1); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &movaps ($inout0,$inout3); &movdqu (&QWP(0x10,$out),$inout1); &movdqu (&QWP(0x20,$out),$inout2); &movdqa ($inout0,$inout3); &lea ($out,&DWP(0x30,$out)); &jmp (&label("cbc_dec_tail_collected")); &set_label("cbc_dec_one"); &set_label("cbc_dec_one",16); if ($inline) { &aesni_inline_generate1("dec"); } else { &call ("_aesni_decrypt1"); } &pxor ($inout0,$ivec); &movaps ($ivec,$in0); &movdqa ($ivec,$in0); &jmp (&label("cbc_dec_tail_collected")); &set_label("cbc_dec_two"); &set_label("cbc_dec_two",16); &pxor ($inout2,$inout2); &call ("_aesni_decrypt3"); &pxor ($inout0,$ivec); &pxor ($inout1,$in0); &movups (&QWP(0,$out),$inout0); &movaps ($inout0,$inout1); &movaps ($ivec,$in1); &movdqu (&QWP(0,$out),$inout0); &movdqa ($inout0,$inout1); &movdqa ($ivec,$in1); &lea ($out,&DWP(0x10,$out)); &jmp (&label("cbc_dec_tail_collected")); &set_label("cbc_dec_three"); &set_label("cbc_dec_three",16); &call ("_aesni_decrypt3"); &pxor ($inout0,$ivec); &pxor ($inout1,$in0); &pxor ($inout2,$in1); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movaps ($inout0,$inout2); &movups ($ivec,&QWP(0x20,$inp)); &movdqu (&QWP(0,$out),$inout0); &movdqu (&QWP(0x10,$out),$inout1); &movdqa ($inout0,$inout2); &movdqu ($ivec,&QWP(0x20,$inp)); &lea ($out,&DWP(0x20,$out)); &set_label("cbc_dec_tail_collected"); &and ($len,15); &jnz (&label("cbc_dec_tail_partial")); &movups (&QWP(0,$out),$inout0); &movdqu (&QWP(0,$out),$inout0); &jmp (&label("cbc_ret")); &set_label("cbc_dec_tail_partial"); &set_label("cbc_dec_tail_partial",16); &mov ($key_,"esp"); &sub ("esp",16); &and ("esp",-16); &movaps (&QWP(0,"esp"),$inout0); &movdqa (&QWP(0,"esp"),$inout0); &mov ($inp,"esp"); &mov ("ecx",$len); &data_word(0xA4F3F689); # rep movsb Loading Loading @@ -935,9 +1136,9 @@ if ($PREFIX eq "aesni") { &aesimc ("xmm1","xmm1"); &lea ($key,&DWP(16,$key)); &lea ("eax",&DWP(-16,"eax")); &cmp ("eax",$key); &$movekey (&QWP(16,"eax"),"xmm0"); &$movekey (&QWP(-16,$key),"xmm1"); &cmp ("eax",$key); &ja (&label("dec_key_inverse")); &$movekey ("xmm0",&QWP(0,$key)); # inverse middle Loading
crypto/aes/asm/aesni-x86_64.pl +425 −112 File changed.Preview size limit exceeded, changes collapsed. Show changes