Loading crypto/aes/asm/aesni-x86.pl +186 −84 Original line number Diff line number Diff line Loading @@ -29,8 +29,8 @@ $rounds="ecx"; $key="edx"; $inp="esi"; $out="edi"; $rounds_="ebx"; $key_="ebp"; $rounds_="ebx"; # backup copy for $rounds $key_="ebp"; # backup copy for $key $inout0="xmm0"; $inout1="xmm1"; Loading @@ -39,26 +39,23 @@ $rndkey0="xmm3"; $rndkey1="xmm4"; $ivec="xmm5"; $in0="xmm6"; $in1="xmm7"; $in1="xmm7"; $inout3="xmm7"; sub _aesni_generate1 # folded loop # Inline version of internal aesni_[en|de]crypt1 sub aesni_inline_generate1 { my $p=shift; &function_begin_B("_aesni_${p}rypt1"); &$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(16,$key)); &lea ($key,&DWP(16,$key)); &lea ($key,&DWP(32,$key)); &pxor ($inout0,$rndkey0); &dec ($rounds); &set_label("${p}1_loop",16); &set_label("${p}1_loop"); eval"&aes${p} ($inout0,$rndkey1)"; &dec ($rounds); &lea ($key,&DWP(16,$key)); &$movekey ($rndkey1,&QWP(0,$key)); &lea ($key,&DWP(16,$key)); &jnz (&label("${p}1_loop")); eval"&aes${p}last ($inout0,$rndkey1)"; &ret(); &function_end_B("_aesni_${p}rypt1"); } sub aesni_generate1 # fully unrolled loop Loading @@ -67,7 +64,7 @@ sub aesni_generate1 # fully unrolled loop &function_begin_B("_aesni_${p}rypt1"); &$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(0x10,$key)); &cmp ($rounds,12); &cmp ($rounds,11); &pxor ($inout0,$rndkey0); &$movekey ($rndkey0,&QWP(0x20,$key)); &lea ($key,&DWP(0x30,$key)); Loading Loading @@ -107,52 +104,52 @@ sub aesni_generate1 # fully unrolled loop &function_end_B("_aesni_${p}rypt1"); } &aesni_generate1("enc"); # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key); # &aesni_generate1("dec"); &function_begin_B("${PREFIX}_encrypt"); &mov ("eax",&wparam(0)); &mov ($key,&wparam(2)); &movups ($inout0,&QWP(0,"eax")); &mov ($rounds,&DWP(240,$key)); &mov ("eax",&wparam(1)); &call ("_aesni_encrypt1"); &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1"); &movups (&QWP(0,"eax"),$inout0); &ret (); &function_end_B("${PREFIX}_encrypt"); &aesni_generate1("dec"); # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key); # &aesni_generate1("dec"); &function_begin_B("${PREFIX}_decrypt"); &mov ("eax",&wparam(0)); &mov ($key,&wparam(2)); &movups ($inout0,&QWP(0,"eax")); &mov ($rounds,&DWP(240,$key)); &mov ("eax",&wparam(1)); &call ("_aesni_decrypt1"); &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt1"); &movups (&QWP(0,"eax"),$inout0); &ret (); &function_end_B("${PREFIX}_decrypt"); # _aesni_[en|de]crypt3 are private interfaces, 3 denotes interleave # factor. Why 3x? Even though aes[enc|dec] latency is 6, it turned # out that it can be scheduled only every *second* cycle. Thus 3x # interleave is the one providing optimal utilization, i.e. when # subroutine's throughput is virtually same as of non-interleaved # subroutine for number of input blocks up to 3. This is why it # handles even double-block inputs. Larger interleave factor would # perform suboptimally on shorter inputs... # _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave # factor. Why 3x subroutine is used in loops? Even though aes[enc|dec] # latency is 6, it turned out that it can be scheduled only every # *second* cycle. Thus 3x interleave is the one providing optimal # utilization, i.e. when subroutine's throughput is virtually same as # of non-interleaved subroutine [for number of input blocks up to 3]. # This is why it makes no sense to implement 2x subroutine. As soon # as/if Intel improves throughput by making it possible to schedule # the instructions in question *every* cycles I would have to # implement 6x interleave and use it in loop... sub aesni_generate3 { my $p=shift; &function_begin_B("_aesni_${p}rypt3"); &$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(16,$key)); &shr ($rounds,1); &$movekey ($rndkey1,&QWP(16,$key)); &lea ($key,&DWP(32,$key)); &pxor ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); &dec ($rounds); &pxor ($inout2,$rndkey0); &jmp (&label("${p}3_loop")); &set_label("${p}3_loop",16); Loading @@ -177,14 +174,59 @@ sub aesni_generate3 &ret(); &function_end_B("_aesni_${p}rypt3"); } # 4x interleave is implemented to improve small block performance, # most notably [and naturally] 4 block by ~30%. One can argue that one # should have implemented 5x as well, but improvement would be <20%, # so it's not worth it... sub aesni_generate4 { my $p=shift; &function_begin_B("_aesni_${p}rypt4"); &$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(16,$key)); &shr ($rounds,1); &lea ($key,&DWP(32,$key)); &pxor ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); &pxor ($inout2,$rndkey0); &pxor ($inout3,$rndkey0); &jmp (&label("${p}3_loop")); &set_label("${p}3_loop",16); eval"&aes${p} ($inout0,$rndkey1)"; &$movekey ($rndkey0,&QWP(0,$key)); eval"&aes${p} ($inout1,$rndkey1)"; &dec ($rounds); eval"&aes${p} ($inout2,$rndkey1)"; eval"&aes${p} ($inout3,$rndkey1)"; &$movekey ($rndkey1,&QWP(16,$key)); eval"&aes${p} ($inout0,$rndkey0)"; &lea ($key,&DWP(32,$key)); eval"&aes${p} ($inout1,$rndkey0)"; eval"&aes${p} ($inout2,$rndkey0)"; eval"&aes${p} ($inout3,$rndkey0)"; &jnz (&label("${p}3_loop")); eval"&aes${p} ($inout0,$rndkey1)"; &$movekey ($rndkey0,&QWP(0,$key)); eval"&aes${p} ($inout1,$rndkey1)"; eval"&aes${p} ($inout2,$rndkey1)"; eval"&aes${p} ($inout3,$rndkey1)"; eval"&aes${p}last ($inout0,$rndkey0)"; eval"&aes${p}last ($inout1,$rndkey0)"; eval"&aes${p}last ($inout2,$rndkey0)"; eval"&aes${p}last ($inout3,$rndkey0)"; &ret(); &function_end_B("_aesni_${p}rypt4"); } &aesni_generate3("enc") if ($PREFIX eq "aesni"); &aesni_generate3("dec"); &aesni_generate4("enc") if ($PREFIX eq "aesni"); &aesni_generate4("dec"); if ($PREFIX eq "aesni") { # void aesni_ecb_encrypt (const void *in, void *out, # size_t length, const AES_KEY *key, # int enc); &function_begin("aesni_ecb_encrypt"); &mov ($inp,&wparam(0)); &mov ($out,&wparam(1)); Loading @@ -200,79 +242,121 @@ if ($PREFIX eq "aesni") { &mov ($rounds_,$rounds); # backup $rounds &jz (&label("ecb_decrypt")); &sub ($len,0x30); &jc (&label("ecb_enc_tail")); jmp (&label("ecb_enc_loop3")); &sub ($len,0x40); &jbe (&label("ecb_enc_tail")); &jmp (&label("ecb_enc_loop3")); &set_label("ecb_enc_loop3",16); &movups ($inout0,&QWP(0,$inp)); &movups ($inout1,&QWP(0x10,$inp)); &movups ($inout2,&QWP(0x20,$inp)); &lea ($inp,&DWP(0x30,$inp)); &call ("_aesni_encrypt3"); &movups (&QWP(0,$out),$inout0); &sub ($len,0x30); &movups (&QWP(0x10,$out),$inout1); &lea ($inp,&DWP(0x30,$inp)); &lea ($out,&DWP(0x30,$out)); &movups (&QWP(-0x30,$out),$inout0); &mov ($key,$key_); # restore $key &movups (&QWP(0x20,$out),$inout2); &movups (&QWP(-0x20,$out),$inout1); &mov ($rounds,$rounds_); # restore $rounds &lea ($out,&DWP(0x30,$out)); &jnc (&label("ecb_enc_loop3")); &movups (&QWP(-0x10,$out),$inout2); &ja (&label("ecb_enc_loop3")); &set_label("ecb_enc_tail"); &add ($len,0x30); &add ($len,0x40); &jz (&label("ecb_ret")); &cmp ($len,0x10); &movups ($inout0,&QWP(0,$inp)); je (&label("ecb_enc_one")); &je (&label("ecb_enc_one")); &cmp ($len,0x20); &movups ($inout1,&QWP(0x10,$inp)); &call ("_aesni_encrypt3"); &je (&label("ecb_enc_two")); &cmp ($len,0x30); &movups ($inout2,&QWP(0x20,$inp)); &je (&label("ecb_enc_three")); &movups ($inout3,&QWP(0x30,$inp)); &call ("_aesni_encrypt4"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &movups (&QWP(0x30,$out),$inout3); jmp (&label("ecb_ret")); &set_label("ecb_enc_one",16); &call ("_aesni_encrypt1"); &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1"); &movups (&QWP(0,$out),$inout0); &jmp (&label("ecb_ret")); &set_label("ecb_enc_two",16); &call ("_aesni_encrypt3"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &jmp (&label("ecb_ret")); &set_label("ecb_enc_three",16); &call ("_aesni_encrypt3"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &jmp (&label("ecb_ret")); &set_label("ecb_decrypt",16); &sub ($len,0x30); &jc (&label("ecb_dec_tail")); jmp (&label("ecb_dec_loop3")); &sub ($len,0x40); &jbe (&label("ecb_dec_tail")); &jmp (&label("ecb_dec_loop3")); &set_label("ecb_dec_loop3",16); &movups ($inout0,&QWP(0,$inp)); &movups ($inout1,&QWP(0x10,$inp)); &movups ($inout2,&QWP(0x20,$inp)); &call ("_aesni_decrypt3"); &movups (&QWP(0,$out),$inout0); &sub ($len,0x30); &lea ($inp,&DWP(0x30,$inp)); &movups (&QWP(0x10,$out),$inout1); &lea ($out,&DWP(0x30,$out)); &movups (&QWP(-0x30,$out),$inout0); &mov ($key,$key_); # restore $key &movups (&QWP(0x20,$out),$inout2); &movups (&QWP(-0x20,$out),$inout1); &mov ($rounds,$rounds_); # restore $rounds &lea ($out,&DWP(0x30,$out)); &jnc (&label("ecb_dec_loop3")); &movups (&QWP(-0x10,$out),$inout2); &ja (&label("ecb_dec_loop3")); &set_label("ecb_dec_tail"); &add ($len,0x30); &add ($len,0x40); &jz (&label("ecb_ret")); &cmp ($len,0x10); &movups ($inout0,&QWP(0,$inp)); je (&label("ecb_dec_one")); &je (&label("ecb_dec_one")); &cmp ($len,0x20); &movups ($inout1,&QWP(0x10,$inp)); &call ("_aesni_decrypt3"); &je (&label("ecb_dec_two")); &cmp ($len,0x30); &movups ($inout2,&QWP(0x20,$inp)); &je (&label("ecb_dec_three")); &movups ($inout3,&QWP(0x30,$inp)); &call ("_aesni_decrypt4"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); jmp (&label("ecb_ret")); &movups (&QWP(0x20,$out),$inout2); &movups (&QWP(0x30,$out),$inout3); &jmp (&label("ecb_ret")); &set_label("ecb_dec_one",16); &call ("_aesni_decrypt1"); &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt3"); &movups (&QWP(0,$out),$inout0); &jmp (&label("ecb_ret")); &set_label("ecb_dec_two",16); &call ("_aesni_decrypt3"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &jmp (&label("ecb_ret")); &set_label("ecb_dec_three",16); &call ("_aesni_decrypt3"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &set_label("ecb_ret"); &function_end("aesni_ecb_encrypt"); Loading @@ -288,7 +372,7 @@ if ($PREFIX eq "aesni") { &mov ($key,&wparam(3)); &test ($len,$len); &mov ($key_,&wparam(4)); &je (&label("cbc_ret")); &jz (&label("cbc_ret")); &cmp (&wparam(5),0); &movups ($ivec,&QWP(0,$key_)); # load IV Loading @@ -307,12 +391,12 @@ if ($PREFIX eq "aesni") { &movups ($ivec,&QWP(0,$inp)); &lea ($inp,&DWP(16,$inp)); &pxor ($inout0,$ivec); &call ("_aesni_encrypt1"); &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt3"); &sub ($len,16); &lea ($out,&DWP(16,$out)); &mov ($rounds,$rounds_); # restore $rounds &mov ($key,$key_); # restore $key &movups (&QWP(0,$out),$inout0); &lea ($out,&DWP(16,$out)); &movups (&QWP(-16,$out),$inout0); &jnc (&label("cbc_enc_loop")); &add ($len,16); &jnz (&label("cbc_enc_tail")); Loading @@ -333,8 +417,8 @@ if ($PREFIX eq "aesni") { &jmp (&label("cbc_enc_loop")); &set_label("cbc_decrypt",16); &sub ($len,0x30); &jc (&label("cbc_dec_tail")); &sub ($len,0x40); &jbe (&label("cbc_dec_tail")); &jmp (&label("cbc_dec_loop3")); &set_label("cbc_dec_loop3",16); Loading @@ -346,20 +430,20 @@ if ($PREFIX eq "aesni") { &call ("_aesni_decrypt3"); &sub ($len,0x30); &lea ($inp,&DWP(0x30,$inp)); &lea ($out,&DWP(0x30,$out)); &pxor ($inout0,$ivec); &pxor ($inout1,$in0); &movups ($ivec,&QWP(-0x10,$inp)); &pxor ($inout2,$in1); &movups (&QWP(0,$out),$inout0); &movups (&QWP(-0x30,$out),$inout0); &mov ($rounds,$rounds_) # restore $rounds &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(-0x20,$out),$inout1); &mov ($key,$key_); # restore $key &movups (&QWP(0x20,$out),$inout2); &lea ($out,&DWP(0x30,$out)); &jnc (&label("cbc_dec_loop3")); &movups (&QWP(-0x10,$out),$inout2); &ja (&label("cbc_dec_loop3")); &set_label("cbc_dec_tail"); &add ($len,0x30); &add ($len,0x40); &jz (&label("cbc_ret")); &movups ($inout0,&QWP(0,$inp)); Loading @@ -371,19 +455,26 @@ if ($PREFIX eq "aesni") { &movaps ($in1,$inout1); &jbe (&label("cbc_dec_two")); &movups ($inout2,&QWP(0x20,$inp)); &call ("_aesni_decrypt3"); &cmp ($len,0x30); &jbe (&label("cbc_dec_three")); &movups ($inout3,&QWP(0x30,$inp)); &call ("_aesni_decrypt4"); &movups ($rndkey0,&QWP(0x10,$inp)); &movups ($rndkey1,&QWP(0x20,$inp)); &pxor ($inout0,$ivec); &movups ($ivec,&QWP(0x20,$inp)); &pxor ($inout1,$in0); &pxor ($inout2,$in1); &movups ($ivec,&QWP(0x30,$inp)); &movups (&QWP(0,$out),$inout0); &pxor ($inout2,$rndkey0); &pxor ($inout3,$rndkey1); &movups (&QWP(0x10,$out),$inout1); &movaps ($inout0,$inout2); &lea ($out,&DWP(0x20,$out)); &movups (&QWP(0x20,$out),$inout2); &movaps ($inout0,$inout3); &lea ($out,&DWP(0x30,$out)); &jmp (&label("cbc_dec_tail_collected")); &set_label("cbc_dec_one"); &call ("_aesni_decrypt1"); &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt3"); &pxor ($inout0,$ivec); &movaps ($ivec,$in0); &jmp (&label("cbc_dec_tail_collected")); Loading @@ -396,6 +487,18 @@ if ($PREFIX eq "aesni") { &movaps ($inout0,$inout1); &movaps ($ivec,$in1); &lea ($out,&DWP(0x10,$out)); &jmp (&label("cbc_dec_tail_collected")); &set_label("cbc_dec_three"); &call ("_aesni_decrypt3"); &pxor ($inout0,$ivec); &pxor ($inout1,$in0); &pxor ($inout2,$in1); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movaps ($inout0,$inout2); &movups ($ivec,&QWP(0x20,$inp)); &lea ($out,&DWP(0x20,$out)); &set_label("cbc_dec_tail_collected"); &and ($len,15); Loading Loading @@ -446,7 +549,7 @@ if ($PREFIX eq "aesni") { &jne (&label("bad_keybits")); &set_label("10rounds",16); &mov ($rounds,10); &mov ($rounds,9); &$movekey (&QWP(-16,$key),"xmm0"); # round 0 &aeskeygenassist("xmm1","xmm0",0x01); # round 1 &call (&label("key_128_cold")); Loading Loading @@ -487,7 +590,7 @@ if ($PREFIX eq "aesni") { &set_label("12rounds",16); &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey &mov ($rounds,12); &mov ($rounds,11); &$movekey (&QWP(-16,$key),"xmm0") # round 0 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2 &call (&label("key_192a_cold")); Loading Loading @@ -540,7 +643,7 @@ if ($PREFIX eq "aesni") { &set_label("14rounds",16); &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey &mov ($rounds,14); &mov ($rounds,13); &lea ($key,&DWP(16,$key)); &$movekey (&QWP(-32,$key),"xmm0"); # round 0 &$movekey (&QWP(-16,$key),"xmm2"); # round 1 Loading Loading @@ -625,10 +728,10 @@ if ($PREFIX eq "aesni") { &mov ($key,&wparam(2)); &call ("_aesni_set_encrypt_key"); &mov ($key,&wparam(2)); &shl ($rounds,4) # actually rounds after _aesni_set_encrypt_key &shl ($rounds,4) # rounds-1 after _aesni_set_encrypt_key &test ("eax","eax"); &jnz (&label("dec_key_ret")); &lea ("eax",&DWP(0,$key,$rounds)); # end of key schedule &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule &$movekey ("xmm0",&QWP(0,$key)); # just swap &$movekey ("xmm1",&QWP(0,"eax")); Loading @@ -636,9 +739,8 @@ if ($PREFIX eq "aesni") { &$movekey (&QWP(0,$key),"xmm1"); &lea ($key,&DWP(16,$key)); &lea ("eax",&DWP(-16,"eax")); &jmp (&label("dec_key_inverse")); &set_label("dec_key_inverse",16); &set_label("dec_key_inverse"); &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse &$movekey ("xmm1",&QWP(0,"eax")); &aesimc ("xmm0","xmm0"); Loading crypto/aes/asm/aesni-x86_64.pl +432 −404 File changed.Preview size limit exceeded, changes collapsed. Show changes crypto/engine/eng_aesni.c +9 −8 Original line number Diff line number Diff line Loading @@ -147,8 +147,9 @@ static int aesni_ciphers(ENGINE *e, const EVP_CIPHER **cipher, ((void *)(((unsigned long)(x)+AESNI_MIN_ALIGN-1)&~(AESNI_MIN_ALIGN-1))) /* Engine names */ static const char *aesni_id = "aesni"; static const char *aesni_name = "Intel AES-NI engine"; static const char aesni_id[] = "aesni", aesni_name[] = "Intel AES-NI engine", no_aesni_name[] = "Intel AES-NI engine (no-aesni)"; /* ===== Engine "management" functions ===== */ Loading @@ -156,15 +157,15 @@ static const char *aesni_name = "Intel AES-NI engine"; static int aesni_bind_helper(ENGINE *e) { if (!(OPENSSL_ia32cap_P[1] & (1UL << (57-32)))) return 0; int engage = (OPENSSL_ia32cap_P[1] & (1 << (57-32))) != 0; /* Register everything or return with an error */ if (!ENGINE_set_id(e, aesni_id) || !ENGINE_set_name(e, aesni_name) || !ENGINE_set_name(e, engage ? aesni_name : no_aesni_name) || !ENGINE_set_init_function(e, aesni_init) || !ENGINE_set_ciphers (e, aesni_ciphers)) (engage && !ENGINE_set_ciphers (e, aesni_ciphers)) ) return 0; /* Everything looks good */ Loading Loading @@ -286,14 +287,14 @@ static int aesni_cipher_cfb(EVP_CIPHER_CTX *ctx, unsigned char *out, { AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); CRYPTO_cfb128_encrypt(in, out, inl, key, ctx->iv, &ctx->num, ctx->encrypt, aesni_encrypt); (block128_f)aesni_encrypt); return 1; } static int aesni_cipher_ofb(EVP_CIPHER_CTX *ctx, unsigned char *out, const unsigned char *in, size_t inl) { AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); CRYPTO_ofb128_encrypt(in, out, inl, key, ctx->iv, &ctx->num, aesni_encrypt); &ctx->num, (block128_f)aesni_encrypt); return 1; } Loading test/test_aesni +4 −8 Original line number Diff line number Diff line Loading @@ -14,23 +14,19 @@ else exit 1; fi if $PROG engine aesni | grep aesni; then if $PROG engine aesni | grep -v no-aesni; then HASH=`cat $PROG | $PROG dgst -hex` ACE_ALGS=" aes-128-ecb aes-192-ecb aes-256-ecb \ AES_ALGS=" aes-128-ecb aes-192-ecb aes-256-ecb \ aes-128-cbc aes-192-cbc aes-256-cbc \ aes-128-cfb aes-192-cfb aes-256-cfb \ aes-128-ofb aes-192-ofb aes-256-ofb" BUFSIZE="16 32 48 64 80 96 128 999" ACE_ALGS=" aes-128-cbc aes-192-cbc aes-256-cbc \ aes-128-cfb aes-192-cfb aes-256-cfb \ aes-128-ofb aes-192-ofb aes-256-ofb" BUFSIZE="48 64 80 96 128 999" BUFSIZE="16 32 48 64 80 96 128 144 999" nerr=0 for alg in $ACE_ALGS; do for alg in $AES_ALGS; do echo $alg for bufsize in $BUFSIZE; do TEST=`( cat $PROG | \ Loading Loading
crypto/aes/asm/aesni-x86.pl +186 −84 Original line number Diff line number Diff line Loading @@ -29,8 +29,8 @@ $rounds="ecx"; $key="edx"; $inp="esi"; $out="edi"; $rounds_="ebx"; $key_="ebp"; $rounds_="ebx"; # backup copy for $rounds $key_="ebp"; # backup copy for $key $inout0="xmm0"; $inout1="xmm1"; Loading @@ -39,26 +39,23 @@ $rndkey0="xmm3"; $rndkey1="xmm4"; $ivec="xmm5"; $in0="xmm6"; $in1="xmm7"; $in1="xmm7"; $inout3="xmm7"; sub _aesni_generate1 # folded loop # Inline version of internal aesni_[en|de]crypt1 sub aesni_inline_generate1 { my $p=shift; &function_begin_B("_aesni_${p}rypt1"); &$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(16,$key)); &lea ($key,&DWP(16,$key)); &lea ($key,&DWP(32,$key)); &pxor ($inout0,$rndkey0); &dec ($rounds); &set_label("${p}1_loop",16); &set_label("${p}1_loop"); eval"&aes${p} ($inout0,$rndkey1)"; &dec ($rounds); &lea ($key,&DWP(16,$key)); &$movekey ($rndkey1,&QWP(0,$key)); &lea ($key,&DWP(16,$key)); &jnz (&label("${p}1_loop")); eval"&aes${p}last ($inout0,$rndkey1)"; &ret(); &function_end_B("_aesni_${p}rypt1"); } sub aesni_generate1 # fully unrolled loop Loading @@ -67,7 +64,7 @@ sub aesni_generate1 # fully unrolled loop &function_begin_B("_aesni_${p}rypt1"); &$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(0x10,$key)); &cmp ($rounds,12); &cmp ($rounds,11); &pxor ($inout0,$rndkey0); &$movekey ($rndkey0,&QWP(0x20,$key)); &lea ($key,&DWP(0x30,$key)); Loading Loading @@ -107,52 +104,52 @@ sub aesni_generate1 # fully unrolled loop &function_end_B("_aesni_${p}rypt1"); } &aesni_generate1("enc"); # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key); # &aesni_generate1("dec"); &function_begin_B("${PREFIX}_encrypt"); &mov ("eax",&wparam(0)); &mov ($key,&wparam(2)); &movups ($inout0,&QWP(0,"eax")); &mov ($rounds,&DWP(240,$key)); &mov ("eax",&wparam(1)); &call ("_aesni_encrypt1"); &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1"); &movups (&QWP(0,"eax"),$inout0); &ret (); &function_end_B("${PREFIX}_encrypt"); &aesni_generate1("dec"); # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key); # &aesni_generate1("dec"); &function_begin_B("${PREFIX}_decrypt"); &mov ("eax",&wparam(0)); &mov ($key,&wparam(2)); &movups ($inout0,&QWP(0,"eax")); &mov ($rounds,&DWP(240,$key)); &mov ("eax",&wparam(1)); &call ("_aesni_decrypt1"); &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt1"); &movups (&QWP(0,"eax"),$inout0); &ret (); &function_end_B("${PREFIX}_decrypt"); # _aesni_[en|de]crypt3 are private interfaces, 3 denotes interleave # factor. Why 3x? Even though aes[enc|dec] latency is 6, it turned # out that it can be scheduled only every *second* cycle. Thus 3x # interleave is the one providing optimal utilization, i.e. when # subroutine's throughput is virtually same as of non-interleaved # subroutine for number of input blocks up to 3. This is why it # handles even double-block inputs. Larger interleave factor would # perform suboptimally on shorter inputs... # _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave # factor. Why 3x subroutine is used in loops? Even though aes[enc|dec] # latency is 6, it turned out that it can be scheduled only every # *second* cycle. Thus 3x interleave is the one providing optimal # utilization, i.e. when subroutine's throughput is virtually same as # of non-interleaved subroutine [for number of input blocks up to 3]. # This is why it makes no sense to implement 2x subroutine. As soon # as/if Intel improves throughput by making it possible to schedule # the instructions in question *every* cycles I would have to # implement 6x interleave and use it in loop... sub aesni_generate3 { my $p=shift; &function_begin_B("_aesni_${p}rypt3"); &$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(16,$key)); &shr ($rounds,1); &$movekey ($rndkey1,&QWP(16,$key)); &lea ($key,&DWP(32,$key)); &pxor ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); &dec ($rounds); &pxor ($inout2,$rndkey0); &jmp (&label("${p}3_loop")); &set_label("${p}3_loop",16); Loading @@ -177,14 +174,59 @@ sub aesni_generate3 &ret(); &function_end_B("_aesni_${p}rypt3"); } # 4x interleave is implemented to improve small block performance, # most notably [and naturally] 4 block by ~30%. One can argue that one # should have implemented 5x as well, but improvement would be <20%, # so it's not worth it... sub aesni_generate4 { my $p=shift; &function_begin_B("_aesni_${p}rypt4"); &$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(16,$key)); &shr ($rounds,1); &lea ($key,&DWP(32,$key)); &pxor ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); &pxor ($inout2,$rndkey0); &pxor ($inout3,$rndkey0); &jmp (&label("${p}3_loop")); &set_label("${p}3_loop",16); eval"&aes${p} ($inout0,$rndkey1)"; &$movekey ($rndkey0,&QWP(0,$key)); eval"&aes${p} ($inout1,$rndkey1)"; &dec ($rounds); eval"&aes${p} ($inout2,$rndkey1)"; eval"&aes${p} ($inout3,$rndkey1)"; &$movekey ($rndkey1,&QWP(16,$key)); eval"&aes${p} ($inout0,$rndkey0)"; &lea ($key,&DWP(32,$key)); eval"&aes${p} ($inout1,$rndkey0)"; eval"&aes${p} ($inout2,$rndkey0)"; eval"&aes${p} ($inout3,$rndkey0)"; &jnz (&label("${p}3_loop")); eval"&aes${p} ($inout0,$rndkey1)"; &$movekey ($rndkey0,&QWP(0,$key)); eval"&aes${p} ($inout1,$rndkey1)"; eval"&aes${p} ($inout2,$rndkey1)"; eval"&aes${p} ($inout3,$rndkey1)"; eval"&aes${p}last ($inout0,$rndkey0)"; eval"&aes${p}last ($inout1,$rndkey0)"; eval"&aes${p}last ($inout2,$rndkey0)"; eval"&aes${p}last ($inout3,$rndkey0)"; &ret(); &function_end_B("_aesni_${p}rypt4"); } &aesni_generate3("enc") if ($PREFIX eq "aesni"); &aesni_generate3("dec"); &aesni_generate4("enc") if ($PREFIX eq "aesni"); &aesni_generate4("dec"); if ($PREFIX eq "aesni") { # void aesni_ecb_encrypt (const void *in, void *out, # size_t length, const AES_KEY *key, # int enc); &function_begin("aesni_ecb_encrypt"); &mov ($inp,&wparam(0)); &mov ($out,&wparam(1)); Loading @@ -200,79 +242,121 @@ if ($PREFIX eq "aesni") { &mov ($rounds_,$rounds); # backup $rounds &jz (&label("ecb_decrypt")); &sub ($len,0x30); &jc (&label("ecb_enc_tail")); jmp (&label("ecb_enc_loop3")); &sub ($len,0x40); &jbe (&label("ecb_enc_tail")); &jmp (&label("ecb_enc_loop3")); &set_label("ecb_enc_loop3",16); &movups ($inout0,&QWP(0,$inp)); &movups ($inout1,&QWP(0x10,$inp)); &movups ($inout2,&QWP(0x20,$inp)); &lea ($inp,&DWP(0x30,$inp)); &call ("_aesni_encrypt3"); &movups (&QWP(0,$out),$inout0); &sub ($len,0x30); &movups (&QWP(0x10,$out),$inout1); &lea ($inp,&DWP(0x30,$inp)); &lea ($out,&DWP(0x30,$out)); &movups (&QWP(-0x30,$out),$inout0); &mov ($key,$key_); # restore $key &movups (&QWP(0x20,$out),$inout2); &movups (&QWP(-0x20,$out),$inout1); &mov ($rounds,$rounds_); # restore $rounds &lea ($out,&DWP(0x30,$out)); &jnc (&label("ecb_enc_loop3")); &movups (&QWP(-0x10,$out),$inout2); &ja (&label("ecb_enc_loop3")); &set_label("ecb_enc_tail"); &add ($len,0x30); &add ($len,0x40); &jz (&label("ecb_ret")); &cmp ($len,0x10); &movups ($inout0,&QWP(0,$inp)); je (&label("ecb_enc_one")); &je (&label("ecb_enc_one")); &cmp ($len,0x20); &movups ($inout1,&QWP(0x10,$inp)); &call ("_aesni_encrypt3"); &je (&label("ecb_enc_two")); &cmp ($len,0x30); &movups ($inout2,&QWP(0x20,$inp)); &je (&label("ecb_enc_three")); &movups ($inout3,&QWP(0x30,$inp)); &call ("_aesni_encrypt4"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &movups (&QWP(0x30,$out),$inout3); jmp (&label("ecb_ret")); &set_label("ecb_enc_one",16); &call ("_aesni_encrypt1"); &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1"); &movups (&QWP(0,$out),$inout0); &jmp (&label("ecb_ret")); &set_label("ecb_enc_two",16); &call ("_aesni_encrypt3"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &jmp (&label("ecb_ret")); &set_label("ecb_enc_three",16); &call ("_aesni_encrypt3"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &jmp (&label("ecb_ret")); &set_label("ecb_decrypt",16); &sub ($len,0x30); &jc (&label("ecb_dec_tail")); jmp (&label("ecb_dec_loop3")); &sub ($len,0x40); &jbe (&label("ecb_dec_tail")); &jmp (&label("ecb_dec_loop3")); &set_label("ecb_dec_loop3",16); &movups ($inout0,&QWP(0,$inp)); &movups ($inout1,&QWP(0x10,$inp)); &movups ($inout2,&QWP(0x20,$inp)); &call ("_aesni_decrypt3"); &movups (&QWP(0,$out),$inout0); &sub ($len,0x30); &lea ($inp,&DWP(0x30,$inp)); &movups (&QWP(0x10,$out),$inout1); &lea ($out,&DWP(0x30,$out)); &movups (&QWP(-0x30,$out),$inout0); &mov ($key,$key_); # restore $key &movups (&QWP(0x20,$out),$inout2); &movups (&QWP(-0x20,$out),$inout1); &mov ($rounds,$rounds_); # restore $rounds &lea ($out,&DWP(0x30,$out)); &jnc (&label("ecb_dec_loop3")); &movups (&QWP(-0x10,$out),$inout2); &ja (&label("ecb_dec_loop3")); &set_label("ecb_dec_tail"); &add ($len,0x30); &add ($len,0x40); &jz (&label("ecb_ret")); &cmp ($len,0x10); &movups ($inout0,&QWP(0,$inp)); je (&label("ecb_dec_one")); &je (&label("ecb_dec_one")); &cmp ($len,0x20); &movups ($inout1,&QWP(0x10,$inp)); &call ("_aesni_decrypt3"); &je (&label("ecb_dec_two")); &cmp ($len,0x30); &movups ($inout2,&QWP(0x20,$inp)); &je (&label("ecb_dec_three")); &movups ($inout3,&QWP(0x30,$inp)); &call ("_aesni_decrypt4"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); jmp (&label("ecb_ret")); &movups (&QWP(0x20,$out),$inout2); &movups (&QWP(0x30,$out),$inout3); &jmp (&label("ecb_ret")); &set_label("ecb_dec_one",16); &call ("_aesni_decrypt1"); &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt3"); &movups (&QWP(0,$out),$inout0); &jmp (&label("ecb_ret")); &set_label("ecb_dec_two",16); &call ("_aesni_decrypt3"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &jmp (&label("ecb_ret")); &set_label("ecb_dec_three",16); &call ("_aesni_decrypt3"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &set_label("ecb_ret"); &function_end("aesni_ecb_encrypt"); Loading @@ -288,7 +372,7 @@ if ($PREFIX eq "aesni") { &mov ($key,&wparam(3)); &test ($len,$len); &mov ($key_,&wparam(4)); &je (&label("cbc_ret")); &jz (&label("cbc_ret")); &cmp (&wparam(5),0); &movups ($ivec,&QWP(0,$key_)); # load IV Loading @@ -307,12 +391,12 @@ if ($PREFIX eq "aesni") { &movups ($ivec,&QWP(0,$inp)); &lea ($inp,&DWP(16,$inp)); &pxor ($inout0,$ivec); &call ("_aesni_encrypt1"); &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt3"); &sub ($len,16); &lea ($out,&DWP(16,$out)); &mov ($rounds,$rounds_); # restore $rounds &mov ($key,$key_); # restore $key &movups (&QWP(0,$out),$inout0); &lea ($out,&DWP(16,$out)); &movups (&QWP(-16,$out),$inout0); &jnc (&label("cbc_enc_loop")); &add ($len,16); &jnz (&label("cbc_enc_tail")); Loading @@ -333,8 +417,8 @@ if ($PREFIX eq "aesni") { &jmp (&label("cbc_enc_loop")); &set_label("cbc_decrypt",16); &sub ($len,0x30); &jc (&label("cbc_dec_tail")); &sub ($len,0x40); &jbe (&label("cbc_dec_tail")); &jmp (&label("cbc_dec_loop3")); &set_label("cbc_dec_loop3",16); Loading @@ -346,20 +430,20 @@ if ($PREFIX eq "aesni") { &call ("_aesni_decrypt3"); &sub ($len,0x30); &lea ($inp,&DWP(0x30,$inp)); &lea ($out,&DWP(0x30,$out)); &pxor ($inout0,$ivec); &pxor ($inout1,$in0); &movups ($ivec,&QWP(-0x10,$inp)); &pxor ($inout2,$in1); &movups (&QWP(0,$out),$inout0); &movups (&QWP(-0x30,$out),$inout0); &mov ($rounds,$rounds_) # restore $rounds &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(-0x20,$out),$inout1); &mov ($key,$key_); # restore $key &movups (&QWP(0x20,$out),$inout2); &lea ($out,&DWP(0x30,$out)); &jnc (&label("cbc_dec_loop3")); &movups (&QWP(-0x10,$out),$inout2); &ja (&label("cbc_dec_loop3")); &set_label("cbc_dec_tail"); &add ($len,0x30); &add ($len,0x40); &jz (&label("cbc_ret")); &movups ($inout0,&QWP(0,$inp)); Loading @@ -371,19 +455,26 @@ if ($PREFIX eq "aesni") { &movaps ($in1,$inout1); &jbe (&label("cbc_dec_two")); &movups ($inout2,&QWP(0x20,$inp)); &call ("_aesni_decrypt3"); &cmp ($len,0x30); &jbe (&label("cbc_dec_three")); &movups ($inout3,&QWP(0x30,$inp)); &call ("_aesni_decrypt4"); &movups ($rndkey0,&QWP(0x10,$inp)); &movups ($rndkey1,&QWP(0x20,$inp)); &pxor ($inout0,$ivec); &movups ($ivec,&QWP(0x20,$inp)); &pxor ($inout1,$in0); &pxor ($inout2,$in1); &movups ($ivec,&QWP(0x30,$inp)); &movups (&QWP(0,$out),$inout0); &pxor ($inout2,$rndkey0); &pxor ($inout3,$rndkey1); &movups (&QWP(0x10,$out),$inout1); &movaps ($inout0,$inout2); &lea ($out,&DWP(0x20,$out)); &movups (&QWP(0x20,$out),$inout2); &movaps ($inout0,$inout3); &lea ($out,&DWP(0x30,$out)); &jmp (&label("cbc_dec_tail_collected")); &set_label("cbc_dec_one"); &call ("_aesni_decrypt1"); &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt3"); &pxor ($inout0,$ivec); &movaps ($ivec,$in0); &jmp (&label("cbc_dec_tail_collected")); Loading @@ -396,6 +487,18 @@ if ($PREFIX eq "aesni") { &movaps ($inout0,$inout1); &movaps ($ivec,$in1); &lea ($out,&DWP(0x10,$out)); &jmp (&label("cbc_dec_tail_collected")); &set_label("cbc_dec_three"); &call ("_aesni_decrypt3"); &pxor ($inout0,$ivec); &pxor ($inout1,$in0); &pxor ($inout2,$in1); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movaps ($inout0,$inout2); &movups ($ivec,&QWP(0x20,$inp)); &lea ($out,&DWP(0x20,$out)); &set_label("cbc_dec_tail_collected"); &and ($len,15); Loading Loading @@ -446,7 +549,7 @@ if ($PREFIX eq "aesni") { &jne (&label("bad_keybits")); &set_label("10rounds",16); &mov ($rounds,10); &mov ($rounds,9); &$movekey (&QWP(-16,$key),"xmm0"); # round 0 &aeskeygenassist("xmm1","xmm0",0x01); # round 1 &call (&label("key_128_cold")); Loading Loading @@ -487,7 +590,7 @@ if ($PREFIX eq "aesni") { &set_label("12rounds",16); &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey &mov ($rounds,12); &mov ($rounds,11); &$movekey (&QWP(-16,$key),"xmm0") # round 0 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2 &call (&label("key_192a_cold")); Loading Loading @@ -540,7 +643,7 @@ if ($PREFIX eq "aesni") { &set_label("14rounds",16); &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey &mov ($rounds,14); &mov ($rounds,13); &lea ($key,&DWP(16,$key)); &$movekey (&QWP(-32,$key),"xmm0"); # round 0 &$movekey (&QWP(-16,$key),"xmm2"); # round 1 Loading Loading @@ -625,10 +728,10 @@ if ($PREFIX eq "aesni") { &mov ($key,&wparam(2)); &call ("_aesni_set_encrypt_key"); &mov ($key,&wparam(2)); &shl ($rounds,4) # actually rounds after _aesni_set_encrypt_key &shl ($rounds,4) # rounds-1 after _aesni_set_encrypt_key &test ("eax","eax"); &jnz (&label("dec_key_ret")); &lea ("eax",&DWP(0,$key,$rounds)); # end of key schedule &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule &$movekey ("xmm0",&QWP(0,$key)); # just swap &$movekey ("xmm1",&QWP(0,"eax")); Loading @@ -636,9 +739,8 @@ if ($PREFIX eq "aesni") { &$movekey (&QWP(0,$key),"xmm1"); &lea ($key,&DWP(16,$key)); &lea ("eax",&DWP(-16,"eax")); &jmp (&label("dec_key_inverse")); &set_label("dec_key_inverse",16); &set_label("dec_key_inverse"); &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse &$movekey ("xmm1",&QWP(0,"eax")); &aesimc ("xmm0","xmm0"); Loading
crypto/aes/asm/aesni-x86_64.pl +432 −404 File changed.Preview size limit exceeded, changes collapsed. Show changes
crypto/engine/eng_aesni.c +9 −8 Original line number Diff line number Diff line Loading @@ -147,8 +147,9 @@ static int aesni_ciphers(ENGINE *e, const EVP_CIPHER **cipher, ((void *)(((unsigned long)(x)+AESNI_MIN_ALIGN-1)&~(AESNI_MIN_ALIGN-1))) /* Engine names */ static const char *aesni_id = "aesni"; static const char *aesni_name = "Intel AES-NI engine"; static const char aesni_id[] = "aesni", aesni_name[] = "Intel AES-NI engine", no_aesni_name[] = "Intel AES-NI engine (no-aesni)"; /* ===== Engine "management" functions ===== */ Loading @@ -156,15 +157,15 @@ static const char *aesni_name = "Intel AES-NI engine"; static int aesni_bind_helper(ENGINE *e) { if (!(OPENSSL_ia32cap_P[1] & (1UL << (57-32)))) return 0; int engage = (OPENSSL_ia32cap_P[1] & (1 << (57-32))) != 0; /* Register everything or return with an error */ if (!ENGINE_set_id(e, aesni_id) || !ENGINE_set_name(e, aesni_name) || !ENGINE_set_name(e, engage ? aesni_name : no_aesni_name) || !ENGINE_set_init_function(e, aesni_init) || !ENGINE_set_ciphers (e, aesni_ciphers)) (engage && !ENGINE_set_ciphers (e, aesni_ciphers)) ) return 0; /* Everything looks good */ Loading Loading @@ -286,14 +287,14 @@ static int aesni_cipher_cfb(EVP_CIPHER_CTX *ctx, unsigned char *out, { AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); CRYPTO_cfb128_encrypt(in, out, inl, key, ctx->iv, &ctx->num, ctx->encrypt, aesni_encrypt); (block128_f)aesni_encrypt); return 1; } static int aesni_cipher_ofb(EVP_CIPHER_CTX *ctx, unsigned char *out, const unsigned char *in, size_t inl) { AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); CRYPTO_ofb128_encrypt(in, out, inl, key, ctx->iv, &ctx->num, aesni_encrypt); &ctx->num, (block128_f)aesni_encrypt); return 1; } Loading
test/test_aesni +4 −8 Original line number Diff line number Diff line Loading @@ -14,23 +14,19 @@ else exit 1; fi if $PROG engine aesni | grep aesni; then if $PROG engine aesni | grep -v no-aesni; then HASH=`cat $PROG | $PROG dgst -hex` ACE_ALGS=" aes-128-ecb aes-192-ecb aes-256-ecb \ AES_ALGS=" aes-128-ecb aes-192-ecb aes-256-ecb \ aes-128-cbc aes-192-cbc aes-256-cbc \ aes-128-cfb aes-192-cfb aes-256-cfb \ aes-128-ofb aes-192-ofb aes-256-ofb" BUFSIZE="16 32 48 64 80 96 128 999" ACE_ALGS=" aes-128-cbc aes-192-cbc aes-256-cbc \ aes-128-cfb aes-192-cfb aes-256-cfb \ aes-128-ofb aes-192-ofb aes-256-ofb" BUFSIZE="48 64 80 96 128 999" BUFSIZE="16 32 48 64 80 96 128 144 999" nerr=0 for alg in $ACE_ALGS; do for alg in $AES_ALGS; do echo $alg for bufsize in $BUFSIZE; do TEST=`( cat $PROG | \ Loading