Loading crypto/aes/asm/aesni-x86.pl +190 −21 Original line number Diff line number Diff line Loading @@ -23,7 +23,8 @@ require "x86asm.pl"; &asm_init($ARGV[0],$0); $movekey = eval($RREFIX eq "aseni" ? "*movaps" : "*movups"); if ($PREFIX eq "aesni") { $movekey=*movaps; } else { $movekey=*movups; } $len="eax"; $rounds="ecx"; Loading @@ -41,7 +42,7 @@ $rndkey1="xmm4"; $ivec="xmm5"; $in0="xmm6"; $in1="xmm7"; $inout3="xmm7"; # Inline version of internal aesni_[en|de]crypt1 sub aesni_inline_generate1 { my $p=shift; Loading Loading @@ -104,7 +105,7 @@ sub aesni_generate1 # fully unrolled loop &ret(); &function_end_B("_aesni_${p}rypt1"); } # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key); &aesni_generate1("enc") if (!$inline); &function_begin_B("${PREFIX}_encrypt"); Loading Loading @@ -136,7 +137,7 @@ sub aesni_generate1 # fully unrolled loop &movups (&QWP(0,"eax"),$inout0); &ret (); &function_end_B("${PREFIX}_decrypt"); # _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave # factor. Why 3x subroutine is used in loops? Even though aes[enc|dec] # latency is 6, it turned out that it can be scheduled only every Loading Loading @@ -229,8 +230,9 @@ sub aesni_generate4 &aesni_generate3("dec"); &aesni_generate4("enc") if ($PREFIX eq "aesni"); &aesni_generate4("dec"); if ($PREFIX eq "aesni") { ###################################################################### # void aesni_ecb_encrypt (const void *in, void *out, # size_t length, const AES_KEY *key, # int enc); Loading @@ -249,8 +251,9 @@ if ($PREFIX eq "aesni") { &mov ($rounds_,$rounds); # backup $rounds &jz (&label("ecb_decrypt")); &sub ($len,0x40); &cmp ($len,0x40); &jbe (&label("ecb_enc_tail")); &sub ($len,0x40); &jmp (&label("ecb_enc_loop3")); &set_label("ecb_enc_loop3",16); Loading @@ -268,14 +271,13 @@ if ($PREFIX eq "aesni") { &movups (&QWP(-0x10,$out),$inout2); &ja (&label("ecb_enc_loop3")); &set_label("ecb_enc_tail"); &add ($len,0x40); &jz (&label("ecb_ret")); &cmp ($len,0x10); &movups ($inout0,&QWP(0,$inp)); &je (&label("ecb_enc_one")); &set_label("ecb_enc_tail"); &cmp ($len,0x20); &movups ($inout0,&QWP(0,$inp)); &jb (&label("ecb_enc_one")); &movups ($inout1,&QWP(0x10,$inp)); &je (&label("ecb_enc_two")); &cmp ($len,0x30); Loading Loading @@ -309,10 +311,11 @@ if ($PREFIX eq "aesni") { &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &jmp (&label("ecb_ret")); ###################################################################### &set_label("ecb_decrypt",16); &sub ($len,0x40); &cmp ($len,0x40); &jbe (&label("ecb_dec_tail")); &sub ($len,0x40); &jmp (&label("ecb_dec_loop3")); &set_label("ecb_dec_loop3",16); Loading @@ -330,14 +333,13 @@ if ($PREFIX eq "aesni") { &movups (&QWP(-0x10,$out),$inout2); &ja (&label("ecb_dec_loop3")); &set_label("ecb_dec_tail"); &add ($len,0x40); &jz (&label("ecb_ret")); &cmp ($len,0x10); &movups ($inout0,&QWP(0,$inp)); &je (&label("ecb_dec_one")); &set_label("ecb_dec_tail"); &cmp ($len,0x20); &movups ($inout0,&QWP(0,$inp)); &jb (&label("ecb_dec_one")); &movups ($inout1,&QWP(0x10,$inp)); &je (&label("ecb_dec_two")); &cmp ($len,0x30); Loading Loading @@ -373,8 +375,173 @@ if ($PREFIX eq "aesni") { &set_label("ecb_ret"); &function_end("aesni_ecb_encrypt"); } ###################################################################### # handles only complete blocks, operates on 32-bit counter and # does not update *ivec! (see engine/eng_aesni.c for details) # # void aesni_ctr32_encrypt_blocks (const void *in, void *out, # size_t blocks, const AES_KEY *key, # const char *ivec); &function_begin("aesni_ctr32_encrypt_blocks"); &mov ($inp,&wparam(0)); &mov ($out,&wparam(1)); &mov ($len,&wparam(2)); &mov ($key,&wparam(3)); &mov ($rounds_,&wparam(4)); &mov ($key_,"esp"); &sub ("esp",60); &and ("esp",-16); # align stack &mov (&DWP(48,"esp"),$key_); &movups ($inout3,&QWP(0,$rounds_)); # load ivec # compose byte-swap control mask for pshufb on stack &mov (&DWP(0,"esp"),0x0c0d0e0f); &mov (&DWP(4,"esp"),0x08090a0b); &mov (&DWP(8,"esp"),0x04050607); &mov (&DWP(12,"esp"),0x00010203); # compose counter increment vector on stack &mov ($rounds,3); &xor ($key_,$key_); &mov (&DWP(16,"esp"),$rounds); &mov (&DWP(20,"esp"),$rounds); &mov (&DWP(24,"esp"),$rounds); &mov (&DWP(28,"esp"),$key_); &pextrd ($rounds_,$inout3,3); # pull 32-bit counter &pinsrd ($inout3,$key_,3); # wipe 32-bit counter &mov ($rounds,&DWP(240,$key)); # key->rounds &movaps ($rndkey0,&QWP(0,"esp")); # load byte-swap mask # $ivec is vector of 3 32-bit counters &pxor ($ivec,$ivec); &bswap ($rounds_); &pinsrd ($ivec,$rounds_,0); &inc ($rounds_); &pinsrd ($ivec,$rounds_,1); &inc ($rounds_); &pinsrd ($ivec,$rounds_,2); &cmp ($len,4); &pshufb ($ivec,$rndkey0); # byte swap &jbe (&label("ctr32_tail")); &movaps (&QWP(32,"esp"),$inout3); # save counter-less ivec &mov ($rounds_,$rounds); &mov ($key_,$key); &sub ($len,4); &jmp (&label("ctr32_loop3")); &set_label("ctr32_loop3",16); &pshufd ($inout0,$ivec,3<<6); # place counter to upper dword &pshufd ($inout1,$ivec,2<<6); &pshufd ($inout2,$ivec,1<<6); &por ($inout0,$inout3); # merge counter-less ivec &por ($inout1,$inout3); &por ($inout2,$inout3); &call ("_aesni_encrypt3"); &movaps($rndkey0,&QWP(0,"esp")); # load byte-swap mask &movups ($in0,&QWP(0,$inp)); &movups ($in1,&QWP(0x10,$inp)); &movups ($rndkey1,&QWP(0x20,$inp)); &pshufb($ivec,$rndkey0); # byte swap &paddd ($ivec,&QWP(16,"esp")); # counter increment &pxor ($in0,$inout0); &pxor ($in1,$inout1); &pxor ($rndkey1,$inout2); &movups (&QWP(0,$out),$in0); &movups (&QWP(0x10,$out),$in1); &movups (&QWP(0x20,$out),$rndkey1); &movaps ($inout3,&QWP(32,"esp")); # load counter-less ivec &pshufb($ivec,$rndkey0); # byte swap &sub ($len,3); &lea ($inp,&DWP(0x30,$inp)); &lea ($out,&DWP(0x30,$out)); &mov ($key,$key_); &mov ($rounds,$rounds_); &ja (&label("ctr32_loop3")); &add ($len,4); &pextrd ($rounds_,$ivec,1); # might need last counter value &jz (&label("ctr32_ret")); &bswap ($rounds_); &set_label("ctr32_tail"); &cmp ($len,2); &pshufd ($inout0,$ivec,3<<6); &pshufd ($inout1,$ivec,2<<6); &pshufd ($inout2,$ivec,1<<6); &por ($inout0,$inout3); &jb (&label("ctr32_one")); &por ($inout1,$inout3); &je (&label("ctr32_two")); &cmp ($len,3); &por ($inout2,$inout3); &je (&label("ctr32_three")); &inc ($rounds_); # compose last counter value &bswap ($rounds_); &pinsrd ($inout3,$rounds_,3); &call ("_aesni_encrypt4"); &movups ($in0,&QWP(0,$inp)); &movups ($rndkey1,&QWP(0x10,$inp)); &movups ($rndkey0,&QWP(0x20,$inp)); &movups ($ivec,&QWP(0x30,$inp)); &pxor ($in0,$inout0); &pxor ($rndkey1,$inout1); &pxor ($rndkey0,$inout2); &pxor ($ivec,$inout3); &movups (&QWP(0,$out),$in0); &movups (&QWP(0x10,$out),$rndkey1); &movups (&QWP(0x20,$out),$rndkey0); &movups (&QWP(0x30,$out),$ivec); &jmp (&label("ctr32_ret")); &set_label("ctr32_one",16); if ($inline) { &aesni_inline_generate1("enc"); } else { &call ("_aesni_encrypt1"); } &movups ($in0,&QWP(0,$inp)); &pxor ($in0,$inout0); &movups (&QWP(0,$out),$in0); &jmp (&label("ctr32_ret")); &set_label("ctr32_two",16); &call ("_aesni_encrypt3"); &movups ($in0,&QWP(0,$inp)); &movups ($in1,&QWP(0x10,$inp)); &pxor ($in0,$inout0); &pxor ($in1,$inout1); &movups (&QWP(0,$out),$in0); &movups (&QWP(0x10,$out),$in1); &jmp (&label("ctr32_ret")); &set_label("ctr32_three",16); &call ("_aesni_encrypt3"); &movups ($in0,&QWP(0,$inp)); &movups ($in1,&QWP(0x10,$inp)); &movups ($rndkey1,&QWP(0x20,$inp)); &pxor ($in0,$inout0); &pxor ($in1,$inout1); &pxor ($rndkey1,$inout2); &movups (&QWP(0,$out),$in0); &movups (&QWP(0x10,$out),$in1); &movups (&QWP(0x20,$out),$rndkey1); &set_label("ctr32_ret"); &mov ("esp",&DWP(48,"esp")); &function_end("aesni_ctr32_encrypt_blocks"); } ###################################################################### # void $PREFIX_cbc_encrypt (const void *inp, void *out, # size_t length, const AES_KEY *key, # unsigned char *ivp,const int enc); Loading Loading @@ -431,10 +598,11 @@ if ($PREFIX eq "aesni") { &mov ($inp,$out); # $inp and $out are the same &mov ($key,$key_); # restore $key &jmp (&label("cbc_enc_loop")); ###################################################################### &set_label("cbc_decrypt",16); &sub ($len,0x40); &cmp ($len,0x40); &jbe (&label("cbc_dec_tail")); &sub ($len,0x40); &jmp (&label("cbc_dec_loop3")); &set_label("cbc_dec_loop3",16); Loading @@ -458,10 +626,10 @@ if ($PREFIX eq "aesni") { &movups (&QWP(-0x10,$out),$inout2); &ja (&label("cbc_dec_loop3")); &set_label("cbc_dec_tail"); &add ($len,0x40); &jz (&label("cbc_ret")); &set_label("cbc_dec_tail"); &movups ($inout0,&QWP(0,$inp)); &cmp ($len,0x10); &movaps ($in0,$inout0); Loading Loading @@ -539,7 +707,8 @@ if ($PREFIX eq "aesni") { &mov ($key_,&wparam(4)); &movups (&QWP(0,$key_),$ivec); # output IV &function_end("${PREFIX}_cbc_encrypt"); ###################################################################### # Mechanical port from aesni-x86_64.pl. # # _aesni_set_encrypt_key is private interface, Loading crypto/aes/asm/aesni-x86_64.pl +270 −36 Original line number Diff line number Diff line Loading @@ -41,7 +41,7 @@ $inp="%rdi"; $out="%rsi"; $len="%rdx"; $key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!! $ivp="%r8"; # cbc $ivp="%r8"; # cbc, ctr $rnds_="%r10d"; # backup copy for $rounds $key_="%r11"; # backup copy for $key Loading @@ -51,7 +51,7 @@ $inout0="%xmm0"; $inout1="%xmm1"; $inout2="%xmm2"; $inout3="%xmm3"; $rndkey0="%xmm4"; $rndkey1="%xmm5"; $iv="%xmm6"; $in0="%xmm7"; # used in CBC decrypt $iv="%xmm6"; $in0="%xmm7"; # used in CBC decrypt, CTR $in1="%xmm8"; $in2="%xmm9"; # Inline version of internal aesni_[en|de]crypt1. Loading Loading @@ -214,6 +214,7 @@ ___ &aesni_generate4("dec"); if ($PREFIX eq "aesni") { ######################################################################## # void aesni_ecb_encrypt (const void *in, void *out, # size_t length, const AES_KEY *key, # int enc); Loading @@ -232,8 +233,9 @@ aesni_ecb_encrypt: mov $rounds,$rnds_ # backup $rounds jz .Lecb_decrypt #--------------------------- ECB ENCRYPT ------------------------------# sub \$0x40,$len cmp \$0x40,$len jbe .Lecb_enc_tail sub \$0x40,$len jmp .Lecb_enc_loop3 .align 16 .Lecb_enc_loop3: Loading @@ -251,14 +253,13 @@ aesni_ecb_encrypt: movups $inout2,-0x10($out) ja .Lecb_enc_loop3 .Lecb_enc_tail: add \$0x40,$len jz .Lecb_ret cmp \$0x10,$len movups ($inp),$inout0 je .Lecb_enc_one .Lecb_enc_tail: cmp \$0x20,$len movups ($inp),$inout0 jb .Lecb_enc_one movups 0x10($inp),$inout1 je .Lecb_enc_two cmp \$0x30,$len Loading Loading @@ -294,8 +295,9 @@ $code.=<<___; #--------------------------- ECB DECRYPT ------------------------------# .align 16 .Lecb_decrypt: sub \$0x40,$len cmp \$0x40,$len jbe .Lecb_dec_tail sub \$0x40,$len jmp .Lecb_dec_loop3 .align 16 .Lecb_dec_loop3: Loading @@ -313,14 +315,13 @@ $code.=<<___; movups $inout2,-0x10($out) ja .Lecb_dec_loop3 .Lecb_dec_tail: add \$0x40,$len jz .Lecb_ret cmp \$0x10,$len movups ($inp),$inout0 je .Lecb_dec_one .Lecb_dec_tail: cmp \$0x20,$len movups ($inp),$inout0 jb .Lecb_dec_one movups 0x10($inp),$inout1 je .Lecb_dec_two cmp \$0x30,$len Loading Loading @@ -357,8 +358,175 @@ $code.=<<___; ret .size aesni_ecb_encrypt,.-aesni_ecb_encrypt ___ ###################################################################### # handles only complete blocks, operates on 32-bit counter and # does not update *ivec! (see engine/eng_aesni.c for details) # # void aesni_ctr32_encrypt_blocks (const void *in, void *out, # size_t blocks, const AES_KEY *key, # const char *ivec); $increment="%xmm10"; $bswap_mask="%xmm11"; $code.=<<___; .globl aesni_ctr32_encrypt_blocks .type aesni_ctr32_encrypt_blocks,\@function,5 .align 16 aesni_ctr32_encrypt_blocks: ___ $code.=<<___ if ($win64); lea -0x68(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) movaps %xmm10,0x40(%rsp) movaps %xmm11,0x50(%rsp) .Lctr32_body: ___ $code.=<<___; movups ($ivp),$inout3 movaps .Lincrement(%rip),$increment movaps .Lbswap_mask(%rip),$bswap_mask xor $rounds,$rounds pextrd \$3,$inout3,$rnds_ # pull 32-bit counter pinsrd \$3,$rounds,$inout3 # wipe 32-bit counter mov 240($key),$rounds # key->rounds pxor $iv,$iv # vector of 3 32-bit counters bswap $rnds_ pinsrd \$0,$rnds_,$iv inc $rnds_ pinsrd \$1,$rnds_,$iv inc $rnds_ pinsrd \$2,$rnds_,$iv cmp \$4,$len pshufb $bswap_mask,$iv jbe .Lctr32_tail mov $rounds,$rnds_ mov $key,$key_ sub \$4,$len jmp .Lctr32_loop3 .align 16 .Lctr32_loop3: pshufd \$`3<<6`,$iv,$inout0 # place counter to upper dword pshufd \$`2<<6`,$iv,$inout1 pshufd \$`1<<6`,$iv,$inout2 movups ($inp),$in0 movups 0x10($inp),$in1 movups 0x20($inp),$in2 por $inout3,$inout0 # merge counter-less ivec por $inout3,$inout1 por $inout3,$inout2 pshufb $bswap_mask,$iv call _aesni_encrypt3 paddd $increment,$iv pxor $inout0,$in0 pxor $inout1,$in1 pxor $inout2,$in2 pshufb $bswap_mask,$iv movups $in0,($out) movups $in1,0x10($out) movups $in2,0x20($out) sub \$3,$len lea 0x30($inp),$inp lea 0x30($out),$out mov $key_,$key mov $rnds_,$rounds ja .Lctr32_loop3 add \$4,$len pextrd \$1,$iv,$rnds_ # migh need last counter value jz .Lctr32_done bswap $rnds_ .Lctr32_tail: cmp \$2,$len pshufd \$`3<<6`,$iv,$inout0 pshufd \$`2<<6`,$iv,$inout1 pshufd \$`1<<6`,$iv,$inout2 por $inout3,$inout0 movups ($inp),$in0 jb .Lctr32_one por $inout3,$inout1 movups 0x10($inp),$in1 je .Lctr32_two cmp \$3,$len por $inout3,$inout2 movups 0x20($inp),$in2 je .Lctr32_three inc $rnds_ # compose last counter value bswap $rnds_ pinsrd \$3,$rnds_,$inout3 movups 0x30($inp),$iv call _aesni_encrypt4 pxor $inout0,$in0 pxor $inout1,$in1 pxor $inout2,$in2 pxor $inout3,$iv movups $in0,($out) movups $in1,0x10($out) movups $in2,0x20($out) movups $iv,0x30($out) jmp .Lctr32_done .align 16 .Lctr32_one: ___ &aesni_generate1("enc",$key,$rounds); $code.=<<___; pxor $inout0,$in0 movups $in0,($out) jmp .Lctr32_done .align 16 .Lctr32_two: call _aesni_encrypt3 pxor $inout0,$in0 pxor $inout1,$in1 movups $in0,($out) movups $in1,0x10($out) jmp .Lctr32_done .align 16 .Lctr32_three: call _aesni_encrypt3 pxor $inout0,$in0 pxor $inout1,$in1 pxor $inout2,$in2 movups $in0,($out) movups $in1,0x10($out) movups $in2,0x20($out) .Lctr32_done: ___ $code.=<<___ if ($win64); movaps (%rsp),%xmm6 movaps 0x10(%rsp),%xmm7 movaps 0x20(%rsp),%xmm8 movaps 0x30(%rsp),%xmm9 movaps 0x40(%rsp),%xmm10 movaps 0x50(%rsp),%xmm11 lea 0x68(%rsp),%rsp ___ $code.=<<___; .Lctr32_ret: ret .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks ___ } ######################################################################## # void $PREFIX_cbc_encrypt (const void *inp, void *out, # size_t length, const AES_KEY *key, # unsigned char *ivp,const int enc); Loading Loading @@ -429,9 +597,10 @@ $code.=<<___ if ($win64); ___ $code.=<<___; movups ($ivp),$iv sub \$0x40,$len cmp \$0x40,$len mov $rnds_,$rounds jbe .Lcbc_dec_tail sub \$0x40,$len jmp .Lcbc_dec_loop3 .align 16 .Lcbc_dec_loop3: Loading @@ -456,11 +625,11 @@ $code.=<<___; movups $inout2,-0x10($out) ja .Lcbc_dec_loop3 .Lcbc_dec_tail: add \$0x40,$len movups $iv,($ivp) jz .Lcbc_dec_ret .Lcbc_dec_tail: movups ($inp),$inout0 cmp \$0x10,$len movaps $inout0,$in0 Loading Loading @@ -796,6 +965,11 @@ ___ } $code.=<<___; .align 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .Lincrement: .long 3,3,3,0 .asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>" .align 64 ___ Loading @@ -810,9 +984,11 @@ $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type cbc_se_handler,\@abi-omnipotent ___ $code.=<<___ if ($PREFIX eq "aesni"); .type ecb_se_handler,\@abi-omnipotent .align 16 cbc_se_handler: ecb_se_handler: push %rsi push %rdi push %rbx Loading @@ -825,30 +1001,48 @@ cbc_se_handler: sub \$64,%rsp mov 152($context),%rax # pull context->Rsp mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi jmp .Lcommon_seh_exit .size ecb_se_handler,.-ecb_se_handler .type ctr32_se_handler,\@abi-omnipotent .align 16 ctr32_se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip lea .Lcbc_decrypt(%rip),%r10 lea .Lctr32_body(%rip),%r10 cmp %r10,%rbx # context->Rip<"prologue" label jb .Lin_prologue jb .Lin_ctr32_prologue lea .Lcbc_decrypt_body(%rip),%r10 cmp %r10,%rbx # context->Rip<cbc_decrypt_body jb .Lrestore_rax mov 152($context),%rax # pull context->Rsp lea .Lcbc_ret(%rip),%r10 cmp %r10,%rbx # context->Rip>="epilogue" label jae .Lin_prologue lea .Lctr32_ret(%rip),%r10 cmp %r10,%rbx jae .Lin_ctr32_prologue lea 0(%rax),%rsi # top of stack lea 512($context),%rdi # &context.Xmm6 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) mov \$12,%ecx # 6*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq lea 0x58(%rax),%rax # adjust stack pointer jmp .Lin_prologue lea 0x68(%rax),%rax # adjust stack pointer .Lrestore_rax: mov 120($context),%rax .Lin_prologue: .Lin_ctr32_prologue: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp Loading @@ -856,11 +1050,12 @@ cbc_se_handler: mov %rdi,176($context) # restore context->Rdi jmp .Lcommon_seh_exit .size cbc_se_handler,.-cbc_se_handler .type ecb_se_handler,\@abi-omnipotent .size ctr32_se_handler,.-ctr32_se_handler ___ $code.=<<___; .type cbc_se_handler,\@abi-omnipotent .align 16 ecb_se_handler: cbc_se_handler: push %rsi push %rdi push %rbx Loading @@ -873,8 +1068,33 @@ ecb_se_handler: sub \$64,%rsp mov 152($context),%rax # pull context->Rsp mov 248($context),%rbx # pull context->Rip lea .Lcbc_decrypt(%rip),%r10 cmp %r10,%rbx # context->Rip<"prologue" label jb .Lin_cbc_prologue lea .Lcbc_decrypt_body(%rip),%r10 cmp %r10,%rbx # context->Rip<cbc_decrypt_body jb .Lrestore_cbc_rax lea .Lcbc_ret(%rip),%r10 cmp %r10,%rbx # context->Rip>="epilogue" label jae .Lin_cbc_prologue lea 0(%rax),%rsi # top of stack lea 512($context),%rdi # &context.Xmm6 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq lea 0x58(%rax),%rax # adjust stack pointer jmp .Lin_cbc_prologue .Lrestore_cbc_rax: mov 120($context),%rax .Lin_cbc_prologue: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi Loading Loading @@ -915,10 +1135,17 @@ ecb_se_handler: .section .pdata .align 4 .rva .LSEH_begin_${PREFIX}_ecb_encrypt .rva .LSEH_end_${PREFIX}_ecb_encrypt ___ $code.=<<___ if ($PREFIX eq "aesni"); .rva .LSEH_begin_aesni_ecb_encrypt .rva .LSEH_end_aesni_ecb_encrypt .rva .LSEH_info_ecb .rva .LSEH_begin_aesni_ctr32_encrypt_blocks .rva .LSEH_end_aesni_ctr32_encrypt_blocks .rva .LSEH_info_ctr32 ___ $code.=<<___; .rva .LSEH_begin_${PREFIX}_cbc_encrypt .rva .LSEH_end_${PREFIX}_cbc_encrypt .rva .LSEH_info_cbc Loading @@ -932,9 +1159,16 @@ ecb_se_handler: .rva .LSEH_info_key .section .xdata .align 8 ___ $code.=<<___ if ($PREFIX eq "aesni"); .LSEH_info_ecb: .byte 9,0,0,0 .rva ecb_se_handler .LSEH_info_ctr32: .byte 9,0,0,0 .rva ctr32_se_handler ___ $code.=<<___; .LSEH_info_cbc: .byte 9,0,0,0 .rva cbc_se_handler Loading crypto/engine/eng_aesni.c +173 −5 Original line number Diff line number Diff line Loading @@ -111,6 +111,35 @@ void ENGINE_load_aesni (void) } #ifdef COMPILE_HW_AESNI typedef unsigned int u32; typedef unsigned char u8; #if defined(__GNUC__) && __GNUC__>=2 # define BSWAP4(x) ({ u32 ret=(x); \ asm volatile ("bswapl %0" \ : "+r"(ret)); ret; }) #elif defined(_MSC_VER) # if _MSC_VER>=1300 # pragma intrinsic(_byteswap_ulong) # define BSWAP4(x) _byteswap_ulong((u32)(x)) # elif defined(_M_IX86) __inline u32 _bswap4(u32 val) { _asm mov eax,val _asm bswap eax } # define BSWAP4(x) _bswap4(x) # endif #endif #ifdef BSWAP4 #define GETU32(p) BSWAP4(*(const u32 *)(p)) #define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v) #else #define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3]) #define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v)) #endif int aesni_set_encrypt_key(const unsigned char *userKey, int bits, AES_KEY *key); int aesni_set_decrypt_key(const unsigned char *userKey, int bits, Loading @@ -132,6 +161,12 @@ void aesni_cbc_encrypt(const unsigned char *in, const AES_KEY *key, unsigned char *ivec, int enc); void aesni_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, size_t blocks, const AES_KEY *key, const unsigned char *ivec); /* Function for ENGINE detection and control */ static int aesni_init(ENGINE *e); Loading Loading @@ -224,16 +259,19 @@ static int aesni_cipher_nids[] = { NID_aes_128_cbc, NID_aes_128_cfb, NID_aes_128_ofb, NID_aes_128_ctr, NID_aes_192_ecb, NID_aes_192_cbc, NID_aes_192_cfb, NID_aes_192_ofb, NID_aes_192_ctr, NID_aes_256_ecb, NID_aes_256_cbc, NID_aes_256_cfb, NID_aes_256_ofb, NID_aes_256_ctr, }; static int aesni_cipher_nids_num = (sizeof(aesni_cipher_nids)/sizeof(aesni_cipher_nids[0])); Loading @@ -251,18 +289,28 @@ aesni_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *user_key, int ret; AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); if ((ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_CFB_MODE || (ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_OFB_MODE || enc) ret=aesni_set_encrypt_key(user_key, ctx->key_len * 8, key); else if (((ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_ECB_MODE || (ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_CBC_MODE) && !enc) ret=aesni_set_decrypt_key(user_key, ctx->key_len * 8, key); else ret=aesni_set_encrypt_key(user_key, ctx->key_len * 8, key); if(ret < 0) { EVPerr(EVP_F_AESNI_INIT_KEY,EVP_R_AES_KEY_SETUP_FAILED); return 0; } if (ctx->cipher->flags&EVP_CIPH_CUSTOM_IV) { if (iv!=NULL) memcpy (ctx->iv,iv,ctx->cipher->iv_len); else { EVPerr(EVP_F_AESNI_INIT_KEY,EVP_R_AES_IV_SETUP_FAILED); return 0; } } return 1; } Loading Loading @@ -336,6 +384,117 @@ DECLARE_AES_EVP(256,cbc,CBC); DECLARE_AES_EVP(256,cfb,CFB); DECLARE_AES_EVP(256,ofb,OFB); static void ctr96_inc(unsigned char *counter) { u32 n=12; u8 c; do { --n; c = counter[n]; ++c; counter[n] = c; if (c) return; } while (n); } static int aesni_counter(EVP_CIPHER_CTX *ctx, unsigned char *out, const unsigned char *in, size_t len) { AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); u32 n, ctr32; n = ctx->num; while (n && len) { *(out++) = *(in++) ^ ctx->buf[n]; --len; n = (n+1) % 16; } ctr32 = GETU32(ctx->iv+12); while (len>=16) { size_t blocks = len/16; /* * 1<<24 is just a not-so-small yet not-so-large number... */ if (blocks > (1U<<24)) blocks = (1U<<24); /* * As aesni_ctr32 operates on 32-bit counter, caller * has to handle overflow. 'if' below detects the * overflow, which is then handled by limiting the * amount of blocks to the exact overflow point... */ ctr32 += (u32)blocks; if (ctr32 < blocks) { blocks -= ctr32; ctr32 = 0; } aesni_ctr32_encrypt_blocks(in,out,blocks,key,ctx->iv); /* aesni_ctr32 does not update ctx->iv, caller does: */ PUTU32(ctx->iv+12,ctr32); /* ... overflow was detected, propogate carry. */ if (ctr32 == 0) ctr96_inc(ctx->iv); blocks *= 16; len -= blocks; out += blocks; in += blocks; } if (len) { aesni_encrypt(ctx->iv,ctx->buf,key); ++ctr32; PUTU32(ctx->iv+12,ctr32); if (ctr32 == 0) ctr96_inc(ctx->iv); while (len--) { out[n] = in[n] ^ ctx->buf[n]; ++n; } } ctx->num = n; return 1; } static const EVP_CIPHER aesni_128_ctr= { NID_aes_128_ctr,1,16,16, EVP_CIPH_CUSTOM_IV, aesni_init_key, aesni_counter, NULL, sizeof(AESNI_KEY), NULL, NULL, NULL, NULL }; static const EVP_CIPHER aesni_192_ctr= { NID_aes_192_ctr,1,24,16, EVP_CIPH_CUSTOM_IV, aesni_init_key, aesni_counter, NULL, sizeof(AESNI_KEY), NULL, NULL, NULL, NULL }; static const EVP_CIPHER aesni_256_ctr= { NID_aes_256_ctr,1,32,16, EVP_CIPH_CUSTOM_IV, aesni_init_key, aesni_counter, NULL, sizeof(AESNI_KEY), NULL, NULL, NULL, NULL }; static int aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher, const int **nids, int nid) Loading @@ -360,6 +519,9 @@ aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher, case NID_aes_128_ofb: *cipher = &aesni_128_ofb; break; case NID_aes_128_ctr: *cipher = &aesni_128_ctr; break; case NID_aes_192_ecb: *cipher = &aesni_192_ecb; Loading @@ -373,6 +535,9 @@ aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher, case NID_aes_192_ofb: *cipher = &aesni_192_ofb; break; case NID_aes_192_ctr: *cipher = &aesni_192_ctr; break; case NID_aes_256_ecb: *cipher = &aesni_256_ecb; Loading @@ -386,6 +551,9 @@ aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher, case NID_aes_256_ofb: *cipher = &aesni_256_ofb; break; case NID_aes_256_ctr: *cipher = &aesni_256_ctr; break; default: /* Sorry, we don't support this NID */ Loading Loading
crypto/aes/asm/aesni-x86.pl +190 −21 Original line number Diff line number Diff line Loading @@ -23,7 +23,8 @@ require "x86asm.pl"; &asm_init($ARGV[0],$0); $movekey = eval($RREFIX eq "aseni" ? "*movaps" : "*movups"); if ($PREFIX eq "aesni") { $movekey=*movaps; } else { $movekey=*movups; } $len="eax"; $rounds="ecx"; Loading @@ -41,7 +42,7 @@ $rndkey1="xmm4"; $ivec="xmm5"; $in0="xmm6"; $in1="xmm7"; $inout3="xmm7"; # Inline version of internal aesni_[en|de]crypt1 sub aesni_inline_generate1 { my $p=shift; Loading Loading @@ -104,7 +105,7 @@ sub aesni_generate1 # fully unrolled loop &ret(); &function_end_B("_aesni_${p}rypt1"); } # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key); &aesni_generate1("enc") if (!$inline); &function_begin_B("${PREFIX}_encrypt"); Loading Loading @@ -136,7 +137,7 @@ sub aesni_generate1 # fully unrolled loop &movups (&QWP(0,"eax"),$inout0); &ret (); &function_end_B("${PREFIX}_decrypt"); # _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave # factor. Why 3x subroutine is used in loops? Even though aes[enc|dec] # latency is 6, it turned out that it can be scheduled only every Loading Loading @@ -229,8 +230,9 @@ sub aesni_generate4 &aesni_generate3("dec"); &aesni_generate4("enc") if ($PREFIX eq "aesni"); &aesni_generate4("dec"); if ($PREFIX eq "aesni") { ###################################################################### # void aesni_ecb_encrypt (const void *in, void *out, # size_t length, const AES_KEY *key, # int enc); Loading @@ -249,8 +251,9 @@ if ($PREFIX eq "aesni") { &mov ($rounds_,$rounds); # backup $rounds &jz (&label("ecb_decrypt")); &sub ($len,0x40); &cmp ($len,0x40); &jbe (&label("ecb_enc_tail")); &sub ($len,0x40); &jmp (&label("ecb_enc_loop3")); &set_label("ecb_enc_loop3",16); Loading @@ -268,14 +271,13 @@ if ($PREFIX eq "aesni") { &movups (&QWP(-0x10,$out),$inout2); &ja (&label("ecb_enc_loop3")); &set_label("ecb_enc_tail"); &add ($len,0x40); &jz (&label("ecb_ret")); &cmp ($len,0x10); &movups ($inout0,&QWP(0,$inp)); &je (&label("ecb_enc_one")); &set_label("ecb_enc_tail"); &cmp ($len,0x20); &movups ($inout0,&QWP(0,$inp)); &jb (&label("ecb_enc_one")); &movups ($inout1,&QWP(0x10,$inp)); &je (&label("ecb_enc_two")); &cmp ($len,0x30); Loading Loading @@ -309,10 +311,11 @@ if ($PREFIX eq "aesni") { &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &jmp (&label("ecb_ret")); ###################################################################### &set_label("ecb_decrypt",16); &sub ($len,0x40); &cmp ($len,0x40); &jbe (&label("ecb_dec_tail")); &sub ($len,0x40); &jmp (&label("ecb_dec_loop3")); &set_label("ecb_dec_loop3",16); Loading @@ -330,14 +333,13 @@ if ($PREFIX eq "aesni") { &movups (&QWP(-0x10,$out),$inout2); &ja (&label("ecb_dec_loop3")); &set_label("ecb_dec_tail"); &add ($len,0x40); &jz (&label("ecb_ret")); &cmp ($len,0x10); &movups ($inout0,&QWP(0,$inp)); &je (&label("ecb_dec_one")); &set_label("ecb_dec_tail"); &cmp ($len,0x20); &movups ($inout0,&QWP(0,$inp)); &jb (&label("ecb_dec_one")); &movups ($inout1,&QWP(0x10,$inp)); &je (&label("ecb_dec_two")); &cmp ($len,0x30); Loading Loading @@ -373,8 +375,173 @@ if ($PREFIX eq "aesni") { &set_label("ecb_ret"); &function_end("aesni_ecb_encrypt"); } ###################################################################### # handles only complete blocks, operates on 32-bit counter and # does not update *ivec! (see engine/eng_aesni.c for details) # # void aesni_ctr32_encrypt_blocks (const void *in, void *out, # size_t blocks, const AES_KEY *key, # const char *ivec); &function_begin("aesni_ctr32_encrypt_blocks"); &mov ($inp,&wparam(0)); &mov ($out,&wparam(1)); &mov ($len,&wparam(2)); &mov ($key,&wparam(3)); &mov ($rounds_,&wparam(4)); &mov ($key_,"esp"); &sub ("esp",60); &and ("esp",-16); # align stack &mov (&DWP(48,"esp"),$key_); &movups ($inout3,&QWP(0,$rounds_)); # load ivec # compose byte-swap control mask for pshufb on stack &mov (&DWP(0,"esp"),0x0c0d0e0f); &mov (&DWP(4,"esp"),0x08090a0b); &mov (&DWP(8,"esp"),0x04050607); &mov (&DWP(12,"esp"),0x00010203); # compose counter increment vector on stack &mov ($rounds,3); &xor ($key_,$key_); &mov (&DWP(16,"esp"),$rounds); &mov (&DWP(20,"esp"),$rounds); &mov (&DWP(24,"esp"),$rounds); &mov (&DWP(28,"esp"),$key_); &pextrd ($rounds_,$inout3,3); # pull 32-bit counter &pinsrd ($inout3,$key_,3); # wipe 32-bit counter &mov ($rounds,&DWP(240,$key)); # key->rounds &movaps ($rndkey0,&QWP(0,"esp")); # load byte-swap mask # $ivec is vector of 3 32-bit counters &pxor ($ivec,$ivec); &bswap ($rounds_); &pinsrd ($ivec,$rounds_,0); &inc ($rounds_); &pinsrd ($ivec,$rounds_,1); &inc ($rounds_); &pinsrd ($ivec,$rounds_,2); &cmp ($len,4); &pshufb ($ivec,$rndkey0); # byte swap &jbe (&label("ctr32_tail")); &movaps (&QWP(32,"esp"),$inout3); # save counter-less ivec &mov ($rounds_,$rounds); &mov ($key_,$key); &sub ($len,4); &jmp (&label("ctr32_loop3")); &set_label("ctr32_loop3",16); &pshufd ($inout0,$ivec,3<<6); # place counter to upper dword &pshufd ($inout1,$ivec,2<<6); &pshufd ($inout2,$ivec,1<<6); &por ($inout0,$inout3); # merge counter-less ivec &por ($inout1,$inout3); &por ($inout2,$inout3); &call ("_aesni_encrypt3"); &movaps($rndkey0,&QWP(0,"esp")); # load byte-swap mask &movups ($in0,&QWP(0,$inp)); &movups ($in1,&QWP(0x10,$inp)); &movups ($rndkey1,&QWP(0x20,$inp)); &pshufb($ivec,$rndkey0); # byte swap &paddd ($ivec,&QWP(16,"esp")); # counter increment &pxor ($in0,$inout0); &pxor ($in1,$inout1); &pxor ($rndkey1,$inout2); &movups (&QWP(0,$out),$in0); &movups (&QWP(0x10,$out),$in1); &movups (&QWP(0x20,$out),$rndkey1); &movaps ($inout3,&QWP(32,"esp")); # load counter-less ivec &pshufb($ivec,$rndkey0); # byte swap &sub ($len,3); &lea ($inp,&DWP(0x30,$inp)); &lea ($out,&DWP(0x30,$out)); &mov ($key,$key_); &mov ($rounds,$rounds_); &ja (&label("ctr32_loop3")); &add ($len,4); &pextrd ($rounds_,$ivec,1); # might need last counter value &jz (&label("ctr32_ret")); &bswap ($rounds_); &set_label("ctr32_tail"); &cmp ($len,2); &pshufd ($inout0,$ivec,3<<6); &pshufd ($inout1,$ivec,2<<6); &pshufd ($inout2,$ivec,1<<6); &por ($inout0,$inout3); &jb (&label("ctr32_one")); &por ($inout1,$inout3); &je (&label("ctr32_two")); &cmp ($len,3); &por ($inout2,$inout3); &je (&label("ctr32_three")); &inc ($rounds_); # compose last counter value &bswap ($rounds_); &pinsrd ($inout3,$rounds_,3); &call ("_aesni_encrypt4"); &movups ($in0,&QWP(0,$inp)); &movups ($rndkey1,&QWP(0x10,$inp)); &movups ($rndkey0,&QWP(0x20,$inp)); &movups ($ivec,&QWP(0x30,$inp)); &pxor ($in0,$inout0); &pxor ($rndkey1,$inout1); &pxor ($rndkey0,$inout2); &pxor ($ivec,$inout3); &movups (&QWP(0,$out),$in0); &movups (&QWP(0x10,$out),$rndkey1); &movups (&QWP(0x20,$out),$rndkey0); &movups (&QWP(0x30,$out),$ivec); &jmp (&label("ctr32_ret")); &set_label("ctr32_one",16); if ($inline) { &aesni_inline_generate1("enc"); } else { &call ("_aesni_encrypt1"); } &movups ($in0,&QWP(0,$inp)); &pxor ($in0,$inout0); &movups (&QWP(0,$out),$in0); &jmp (&label("ctr32_ret")); &set_label("ctr32_two",16); &call ("_aesni_encrypt3"); &movups ($in0,&QWP(0,$inp)); &movups ($in1,&QWP(0x10,$inp)); &pxor ($in0,$inout0); &pxor ($in1,$inout1); &movups (&QWP(0,$out),$in0); &movups (&QWP(0x10,$out),$in1); &jmp (&label("ctr32_ret")); &set_label("ctr32_three",16); &call ("_aesni_encrypt3"); &movups ($in0,&QWP(0,$inp)); &movups ($in1,&QWP(0x10,$inp)); &movups ($rndkey1,&QWP(0x20,$inp)); &pxor ($in0,$inout0); &pxor ($in1,$inout1); &pxor ($rndkey1,$inout2); &movups (&QWP(0,$out),$in0); &movups (&QWP(0x10,$out),$in1); &movups (&QWP(0x20,$out),$rndkey1); &set_label("ctr32_ret"); &mov ("esp",&DWP(48,"esp")); &function_end("aesni_ctr32_encrypt_blocks"); } ###################################################################### # void $PREFIX_cbc_encrypt (const void *inp, void *out, # size_t length, const AES_KEY *key, # unsigned char *ivp,const int enc); Loading Loading @@ -431,10 +598,11 @@ if ($PREFIX eq "aesni") { &mov ($inp,$out); # $inp and $out are the same &mov ($key,$key_); # restore $key &jmp (&label("cbc_enc_loop")); ###################################################################### &set_label("cbc_decrypt",16); &sub ($len,0x40); &cmp ($len,0x40); &jbe (&label("cbc_dec_tail")); &sub ($len,0x40); &jmp (&label("cbc_dec_loop3")); &set_label("cbc_dec_loop3",16); Loading @@ -458,10 +626,10 @@ if ($PREFIX eq "aesni") { &movups (&QWP(-0x10,$out),$inout2); &ja (&label("cbc_dec_loop3")); &set_label("cbc_dec_tail"); &add ($len,0x40); &jz (&label("cbc_ret")); &set_label("cbc_dec_tail"); &movups ($inout0,&QWP(0,$inp)); &cmp ($len,0x10); &movaps ($in0,$inout0); Loading Loading @@ -539,7 +707,8 @@ if ($PREFIX eq "aesni") { &mov ($key_,&wparam(4)); &movups (&QWP(0,$key_),$ivec); # output IV &function_end("${PREFIX}_cbc_encrypt"); ###################################################################### # Mechanical port from aesni-x86_64.pl. # # _aesni_set_encrypt_key is private interface, Loading
crypto/aes/asm/aesni-x86_64.pl +270 −36 Original line number Diff line number Diff line Loading @@ -41,7 +41,7 @@ $inp="%rdi"; $out="%rsi"; $len="%rdx"; $key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!! $ivp="%r8"; # cbc $ivp="%r8"; # cbc, ctr $rnds_="%r10d"; # backup copy for $rounds $key_="%r11"; # backup copy for $key Loading @@ -51,7 +51,7 @@ $inout0="%xmm0"; $inout1="%xmm1"; $inout2="%xmm2"; $inout3="%xmm3"; $rndkey0="%xmm4"; $rndkey1="%xmm5"; $iv="%xmm6"; $in0="%xmm7"; # used in CBC decrypt $iv="%xmm6"; $in0="%xmm7"; # used in CBC decrypt, CTR $in1="%xmm8"; $in2="%xmm9"; # Inline version of internal aesni_[en|de]crypt1. Loading Loading @@ -214,6 +214,7 @@ ___ &aesni_generate4("dec"); if ($PREFIX eq "aesni") { ######################################################################## # void aesni_ecb_encrypt (const void *in, void *out, # size_t length, const AES_KEY *key, # int enc); Loading @@ -232,8 +233,9 @@ aesni_ecb_encrypt: mov $rounds,$rnds_ # backup $rounds jz .Lecb_decrypt #--------------------------- ECB ENCRYPT ------------------------------# sub \$0x40,$len cmp \$0x40,$len jbe .Lecb_enc_tail sub \$0x40,$len jmp .Lecb_enc_loop3 .align 16 .Lecb_enc_loop3: Loading @@ -251,14 +253,13 @@ aesni_ecb_encrypt: movups $inout2,-0x10($out) ja .Lecb_enc_loop3 .Lecb_enc_tail: add \$0x40,$len jz .Lecb_ret cmp \$0x10,$len movups ($inp),$inout0 je .Lecb_enc_one .Lecb_enc_tail: cmp \$0x20,$len movups ($inp),$inout0 jb .Lecb_enc_one movups 0x10($inp),$inout1 je .Lecb_enc_two cmp \$0x30,$len Loading Loading @@ -294,8 +295,9 @@ $code.=<<___; #--------------------------- ECB DECRYPT ------------------------------# .align 16 .Lecb_decrypt: sub \$0x40,$len cmp \$0x40,$len jbe .Lecb_dec_tail sub \$0x40,$len jmp .Lecb_dec_loop3 .align 16 .Lecb_dec_loop3: Loading @@ -313,14 +315,13 @@ $code.=<<___; movups $inout2,-0x10($out) ja .Lecb_dec_loop3 .Lecb_dec_tail: add \$0x40,$len jz .Lecb_ret cmp \$0x10,$len movups ($inp),$inout0 je .Lecb_dec_one .Lecb_dec_tail: cmp \$0x20,$len movups ($inp),$inout0 jb .Lecb_dec_one movups 0x10($inp),$inout1 je .Lecb_dec_two cmp \$0x30,$len Loading Loading @@ -357,8 +358,175 @@ $code.=<<___; ret .size aesni_ecb_encrypt,.-aesni_ecb_encrypt ___ ###################################################################### # handles only complete blocks, operates on 32-bit counter and # does not update *ivec! (see engine/eng_aesni.c for details) # # void aesni_ctr32_encrypt_blocks (const void *in, void *out, # size_t blocks, const AES_KEY *key, # const char *ivec); $increment="%xmm10"; $bswap_mask="%xmm11"; $code.=<<___; .globl aesni_ctr32_encrypt_blocks .type aesni_ctr32_encrypt_blocks,\@function,5 .align 16 aesni_ctr32_encrypt_blocks: ___ $code.=<<___ if ($win64); lea -0x68(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) movaps %xmm10,0x40(%rsp) movaps %xmm11,0x50(%rsp) .Lctr32_body: ___ $code.=<<___; movups ($ivp),$inout3 movaps .Lincrement(%rip),$increment movaps .Lbswap_mask(%rip),$bswap_mask xor $rounds,$rounds pextrd \$3,$inout3,$rnds_ # pull 32-bit counter pinsrd \$3,$rounds,$inout3 # wipe 32-bit counter mov 240($key),$rounds # key->rounds pxor $iv,$iv # vector of 3 32-bit counters bswap $rnds_ pinsrd \$0,$rnds_,$iv inc $rnds_ pinsrd \$1,$rnds_,$iv inc $rnds_ pinsrd \$2,$rnds_,$iv cmp \$4,$len pshufb $bswap_mask,$iv jbe .Lctr32_tail mov $rounds,$rnds_ mov $key,$key_ sub \$4,$len jmp .Lctr32_loop3 .align 16 .Lctr32_loop3: pshufd \$`3<<6`,$iv,$inout0 # place counter to upper dword pshufd \$`2<<6`,$iv,$inout1 pshufd \$`1<<6`,$iv,$inout2 movups ($inp),$in0 movups 0x10($inp),$in1 movups 0x20($inp),$in2 por $inout3,$inout0 # merge counter-less ivec por $inout3,$inout1 por $inout3,$inout2 pshufb $bswap_mask,$iv call _aesni_encrypt3 paddd $increment,$iv pxor $inout0,$in0 pxor $inout1,$in1 pxor $inout2,$in2 pshufb $bswap_mask,$iv movups $in0,($out) movups $in1,0x10($out) movups $in2,0x20($out) sub \$3,$len lea 0x30($inp),$inp lea 0x30($out),$out mov $key_,$key mov $rnds_,$rounds ja .Lctr32_loop3 add \$4,$len pextrd \$1,$iv,$rnds_ # migh need last counter value jz .Lctr32_done bswap $rnds_ .Lctr32_tail: cmp \$2,$len pshufd \$`3<<6`,$iv,$inout0 pshufd \$`2<<6`,$iv,$inout1 pshufd \$`1<<6`,$iv,$inout2 por $inout3,$inout0 movups ($inp),$in0 jb .Lctr32_one por $inout3,$inout1 movups 0x10($inp),$in1 je .Lctr32_two cmp \$3,$len por $inout3,$inout2 movups 0x20($inp),$in2 je .Lctr32_three inc $rnds_ # compose last counter value bswap $rnds_ pinsrd \$3,$rnds_,$inout3 movups 0x30($inp),$iv call _aesni_encrypt4 pxor $inout0,$in0 pxor $inout1,$in1 pxor $inout2,$in2 pxor $inout3,$iv movups $in0,($out) movups $in1,0x10($out) movups $in2,0x20($out) movups $iv,0x30($out) jmp .Lctr32_done .align 16 .Lctr32_one: ___ &aesni_generate1("enc",$key,$rounds); $code.=<<___; pxor $inout0,$in0 movups $in0,($out) jmp .Lctr32_done .align 16 .Lctr32_two: call _aesni_encrypt3 pxor $inout0,$in0 pxor $inout1,$in1 movups $in0,($out) movups $in1,0x10($out) jmp .Lctr32_done .align 16 .Lctr32_three: call _aesni_encrypt3 pxor $inout0,$in0 pxor $inout1,$in1 pxor $inout2,$in2 movups $in0,($out) movups $in1,0x10($out) movups $in2,0x20($out) .Lctr32_done: ___ $code.=<<___ if ($win64); movaps (%rsp),%xmm6 movaps 0x10(%rsp),%xmm7 movaps 0x20(%rsp),%xmm8 movaps 0x30(%rsp),%xmm9 movaps 0x40(%rsp),%xmm10 movaps 0x50(%rsp),%xmm11 lea 0x68(%rsp),%rsp ___ $code.=<<___; .Lctr32_ret: ret .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks ___ } ######################################################################## # void $PREFIX_cbc_encrypt (const void *inp, void *out, # size_t length, const AES_KEY *key, # unsigned char *ivp,const int enc); Loading Loading @@ -429,9 +597,10 @@ $code.=<<___ if ($win64); ___ $code.=<<___; movups ($ivp),$iv sub \$0x40,$len cmp \$0x40,$len mov $rnds_,$rounds jbe .Lcbc_dec_tail sub \$0x40,$len jmp .Lcbc_dec_loop3 .align 16 .Lcbc_dec_loop3: Loading @@ -456,11 +625,11 @@ $code.=<<___; movups $inout2,-0x10($out) ja .Lcbc_dec_loop3 .Lcbc_dec_tail: add \$0x40,$len movups $iv,($ivp) jz .Lcbc_dec_ret .Lcbc_dec_tail: movups ($inp),$inout0 cmp \$0x10,$len movaps $inout0,$in0 Loading Loading @@ -796,6 +965,11 @@ ___ } $code.=<<___; .align 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .Lincrement: .long 3,3,3,0 .asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>" .align 64 ___ Loading @@ -810,9 +984,11 @@ $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type cbc_se_handler,\@abi-omnipotent ___ $code.=<<___ if ($PREFIX eq "aesni"); .type ecb_se_handler,\@abi-omnipotent .align 16 cbc_se_handler: ecb_se_handler: push %rsi push %rdi push %rbx Loading @@ -825,30 +1001,48 @@ cbc_se_handler: sub \$64,%rsp mov 152($context),%rax # pull context->Rsp mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi jmp .Lcommon_seh_exit .size ecb_se_handler,.-ecb_se_handler .type ctr32_se_handler,\@abi-omnipotent .align 16 ctr32_se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip lea .Lcbc_decrypt(%rip),%r10 lea .Lctr32_body(%rip),%r10 cmp %r10,%rbx # context->Rip<"prologue" label jb .Lin_prologue jb .Lin_ctr32_prologue lea .Lcbc_decrypt_body(%rip),%r10 cmp %r10,%rbx # context->Rip<cbc_decrypt_body jb .Lrestore_rax mov 152($context),%rax # pull context->Rsp lea .Lcbc_ret(%rip),%r10 cmp %r10,%rbx # context->Rip>="epilogue" label jae .Lin_prologue lea .Lctr32_ret(%rip),%r10 cmp %r10,%rbx jae .Lin_ctr32_prologue lea 0(%rax),%rsi # top of stack lea 512($context),%rdi # &context.Xmm6 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) mov \$12,%ecx # 6*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq lea 0x58(%rax),%rax # adjust stack pointer jmp .Lin_prologue lea 0x68(%rax),%rax # adjust stack pointer .Lrestore_rax: mov 120($context),%rax .Lin_prologue: .Lin_ctr32_prologue: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp Loading @@ -856,11 +1050,12 @@ cbc_se_handler: mov %rdi,176($context) # restore context->Rdi jmp .Lcommon_seh_exit .size cbc_se_handler,.-cbc_se_handler .type ecb_se_handler,\@abi-omnipotent .size ctr32_se_handler,.-ctr32_se_handler ___ $code.=<<___; .type cbc_se_handler,\@abi-omnipotent .align 16 ecb_se_handler: cbc_se_handler: push %rsi push %rdi push %rbx Loading @@ -873,8 +1068,33 @@ ecb_se_handler: sub \$64,%rsp mov 152($context),%rax # pull context->Rsp mov 248($context),%rbx # pull context->Rip lea .Lcbc_decrypt(%rip),%r10 cmp %r10,%rbx # context->Rip<"prologue" label jb .Lin_cbc_prologue lea .Lcbc_decrypt_body(%rip),%r10 cmp %r10,%rbx # context->Rip<cbc_decrypt_body jb .Lrestore_cbc_rax lea .Lcbc_ret(%rip),%r10 cmp %r10,%rbx # context->Rip>="epilogue" label jae .Lin_cbc_prologue lea 0(%rax),%rsi # top of stack lea 512($context),%rdi # &context.Xmm6 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq lea 0x58(%rax),%rax # adjust stack pointer jmp .Lin_cbc_prologue .Lrestore_cbc_rax: mov 120($context),%rax .Lin_cbc_prologue: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi Loading Loading @@ -915,10 +1135,17 @@ ecb_se_handler: .section .pdata .align 4 .rva .LSEH_begin_${PREFIX}_ecb_encrypt .rva .LSEH_end_${PREFIX}_ecb_encrypt ___ $code.=<<___ if ($PREFIX eq "aesni"); .rva .LSEH_begin_aesni_ecb_encrypt .rva .LSEH_end_aesni_ecb_encrypt .rva .LSEH_info_ecb .rva .LSEH_begin_aesni_ctr32_encrypt_blocks .rva .LSEH_end_aesni_ctr32_encrypt_blocks .rva .LSEH_info_ctr32 ___ $code.=<<___; .rva .LSEH_begin_${PREFIX}_cbc_encrypt .rva .LSEH_end_${PREFIX}_cbc_encrypt .rva .LSEH_info_cbc Loading @@ -932,9 +1159,16 @@ ecb_se_handler: .rva .LSEH_info_key .section .xdata .align 8 ___ $code.=<<___ if ($PREFIX eq "aesni"); .LSEH_info_ecb: .byte 9,0,0,0 .rva ecb_se_handler .LSEH_info_ctr32: .byte 9,0,0,0 .rva ctr32_se_handler ___ $code.=<<___; .LSEH_info_cbc: .byte 9,0,0,0 .rva cbc_se_handler Loading
crypto/engine/eng_aesni.c +173 −5 Original line number Diff line number Diff line Loading @@ -111,6 +111,35 @@ void ENGINE_load_aesni (void) } #ifdef COMPILE_HW_AESNI typedef unsigned int u32; typedef unsigned char u8; #if defined(__GNUC__) && __GNUC__>=2 # define BSWAP4(x) ({ u32 ret=(x); \ asm volatile ("bswapl %0" \ : "+r"(ret)); ret; }) #elif defined(_MSC_VER) # if _MSC_VER>=1300 # pragma intrinsic(_byteswap_ulong) # define BSWAP4(x) _byteswap_ulong((u32)(x)) # elif defined(_M_IX86) __inline u32 _bswap4(u32 val) { _asm mov eax,val _asm bswap eax } # define BSWAP4(x) _bswap4(x) # endif #endif #ifdef BSWAP4 #define GETU32(p) BSWAP4(*(const u32 *)(p)) #define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v) #else #define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3]) #define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v)) #endif int aesni_set_encrypt_key(const unsigned char *userKey, int bits, AES_KEY *key); int aesni_set_decrypt_key(const unsigned char *userKey, int bits, Loading @@ -132,6 +161,12 @@ void aesni_cbc_encrypt(const unsigned char *in, const AES_KEY *key, unsigned char *ivec, int enc); void aesni_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, size_t blocks, const AES_KEY *key, const unsigned char *ivec); /* Function for ENGINE detection and control */ static int aesni_init(ENGINE *e); Loading Loading @@ -224,16 +259,19 @@ static int aesni_cipher_nids[] = { NID_aes_128_cbc, NID_aes_128_cfb, NID_aes_128_ofb, NID_aes_128_ctr, NID_aes_192_ecb, NID_aes_192_cbc, NID_aes_192_cfb, NID_aes_192_ofb, NID_aes_192_ctr, NID_aes_256_ecb, NID_aes_256_cbc, NID_aes_256_cfb, NID_aes_256_ofb, NID_aes_256_ctr, }; static int aesni_cipher_nids_num = (sizeof(aesni_cipher_nids)/sizeof(aesni_cipher_nids[0])); Loading @@ -251,18 +289,28 @@ aesni_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *user_key, int ret; AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); if ((ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_CFB_MODE || (ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_OFB_MODE || enc) ret=aesni_set_encrypt_key(user_key, ctx->key_len * 8, key); else if (((ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_ECB_MODE || (ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_CBC_MODE) && !enc) ret=aesni_set_decrypt_key(user_key, ctx->key_len * 8, key); else ret=aesni_set_encrypt_key(user_key, ctx->key_len * 8, key); if(ret < 0) { EVPerr(EVP_F_AESNI_INIT_KEY,EVP_R_AES_KEY_SETUP_FAILED); return 0; } if (ctx->cipher->flags&EVP_CIPH_CUSTOM_IV) { if (iv!=NULL) memcpy (ctx->iv,iv,ctx->cipher->iv_len); else { EVPerr(EVP_F_AESNI_INIT_KEY,EVP_R_AES_IV_SETUP_FAILED); return 0; } } return 1; } Loading Loading @@ -336,6 +384,117 @@ DECLARE_AES_EVP(256,cbc,CBC); DECLARE_AES_EVP(256,cfb,CFB); DECLARE_AES_EVP(256,ofb,OFB); static void ctr96_inc(unsigned char *counter) { u32 n=12; u8 c; do { --n; c = counter[n]; ++c; counter[n] = c; if (c) return; } while (n); } static int aesni_counter(EVP_CIPHER_CTX *ctx, unsigned char *out, const unsigned char *in, size_t len) { AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); u32 n, ctr32; n = ctx->num; while (n && len) { *(out++) = *(in++) ^ ctx->buf[n]; --len; n = (n+1) % 16; } ctr32 = GETU32(ctx->iv+12); while (len>=16) { size_t blocks = len/16; /* * 1<<24 is just a not-so-small yet not-so-large number... */ if (blocks > (1U<<24)) blocks = (1U<<24); /* * As aesni_ctr32 operates on 32-bit counter, caller * has to handle overflow. 'if' below detects the * overflow, which is then handled by limiting the * amount of blocks to the exact overflow point... */ ctr32 += (u32)blocks; if (ctr32 < blocks) { blocks -= ctr32; ctr32 = 0; } aesni_ctr32_encrypt_blocks(in,out,blocks,key,ctx->iv); /* aesni_ctr32 does not update ctx->iv, caller does: */ PUTU32(ctx->iv+12,ctr32); /* ... overflow was detected, propogate carry. */ if (ctr32 == 0) ctr96_inc(ctx->iv); blocks *= 16; len -= blocks; out += blocks; in += blocks; } if (len) { aesni_encrypt(ctx->iv,ctx->buf,key); ++ctr32; PUTU32(ctx->iv+12,ctr32); if (ctr32 == 0) ctr96_inc(ctx->iv); while (len--) { out[n] = in[n] ^ ctx->buf[n]; ++n; } } ctx->num = n; return 1; } static const EVP_CIPHER aesni_128_ctr= { NID_aes_128_ctr,1,16,16, EVP_CIPH_CUSTOM_IV, aesni_init_key, aesni_counter, NULL, sizeof(AESNI_KEY), NULL, NULL, NULL, NULL }; static const EVP_CIPHER aesni_192_ctr= { NID_aes_192_ctr,1,24,16, EVP_CIPH_CUSTOM_IV, aesni_init_key, aesni_counter, NULL, sizeof(AESNI_KEY), NULL, NULL, NULL, NULL }; static const EVP_CIPHER aesni_256_ctr= { NID_aes_256_ctr,1,32,16, EVP_CIPH_CUSTOM_IV, aesni_init_key, aesni_counter, NULL, sizeof(AESNI_KEY), NULL, NULL, NULL, NULL }; static int aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher, const int **nids, int nid) Loading @@ -360,6 +519,9 @@ aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher, case NID_aes_128_ofb: *cipher = &aesni_128_ofb; break; case NID_aes_128_ctr: *cipher = &aesni_128_ctr; break; case NID_aes_192_ecb: *cipher = &aesni_192_ecb; Loading @@ -373,6 +535,9 @@ aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher, case NID_aes_192_ofb: *cipher = &aesni_192_ofb; break; case NID_aes_192_ctr: *cipher = &aesni_192_ctr; break; case NID_aes_256_ecb: *cipher = &aesni_256_ecb; Loading @@ -386,6 +551,9 @@ aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher, case NID_aes_256_ofb: *cipher = &aesni_256_ofb; break; case NID_aes_256_ctr: *cipher = &aesni_256_ctr; break; default: /* Sorry, we don't support this NID */ Loading