Loading crypto/aes/asm/aesni-x86.pl +35 −33 Original line number Diff line number Diff line Loading @@ -594,6 +594,7 @@ if ($PREFIX eq "aesni") { &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec &movdqu ($cmac,&QWP(0,$rounds)); # load cmac &mov ($rounds,&DWP(240,$key)); # compose byte-swap control mask for pshufb on stack &mov (&DWP(0,"esp"),0x0c0d0e0f); Loading @@ -602,34 +603,30 @@ if ($PREFIX eq "aesni") { &mov (&DWP(12,"esp"),0x00010203); # compose counter increment vector on stack &mov ($rounds,1); &mov ($rounds_,1); &xor ($key_,$key_); &mov (&DWP(16,"esp"),$rounds); &mov (&DWP(16,"esp"),$rounds_); &mov (&DWP(20,"esp"),$key_); &mov (&DWP(24,"esp"),$key_); &mov (&DWP(28,"esp"),$key_); &shr ($rounds,1); &lea ($key_,&DWP(0,$key)); &movdqa ($inout3,&QWP(0,"esp")); &pshufb ($ivec,$inout3); # keep iv in reverse order &mov ($rounds,&DWP(240,$key)); &mov ($key_,$key); &mov ($rounds_,$rounds); &movdqa ($inout0,$ivec); &mov ($rounds_,$rounds); &pshufb ($ivec,$inout3); &set_label("ccm64_enc_outer"); &movups ($in0,&QWP(0,$inp)); &pshufb ($inout0,$inout3); &mov ($key,$key_); &$movekey ($rndkey0,&QWP(0,$key_)); &mov ($rounds,$rounds_); &movups ($in0,&QWP(0,$inp)); &$movekey ($rndkey0,&QWP(0,$key)); &shr ($rounds,1); &$movekey ($rndkey1,&QWP(16,$key)); &xorps ($in0,$rndkey0); &lea ($key,&DWP(32,$key)); &xorps ($inout0,$rndkey0); &xorps ($cmac,$in0); # cmac^=inp &$movekey ($rndkey1,&QWP(16,$key_)); &xorps ($rndkey0,$in0); &lea ($key,&DWP(32,$key_)); &xorps ($cmac,$rndkey0); # cmac^=inp &$movekey ($rndkey0,&QWP(0,$key)); &set_label("ccm64_enc2_loop"); Loading @@ -644,16 +641,17 @@ if ($PREFIX eq "aesni") { &jnz (&label("ccm64_enc2_loop")); &aesenc ($inout0,$rndkey1); &aesenc ($cmac,$rndkey1); &paddq ($ivec,&QWP(16,"esp")); &aesenclast ($inout0,$rndkey0); &aesenclast ($cmac,$rndkey0); &paddq ($ivec,&QWP(16,"esp")); &dec ($len); &lea ($inp,&DWP(16,$inp)); &xorps ($in0,$inout0); # inp^=E(ivec) &movdqa ($inout0,$ivec); &movups (&QWP(0,$out),$in0); &movups (&QWP(0,$out),$in0); # save output &lea ($out,&DWP(16,$out)); &pshufb ($inout0,$inout3); &jnz (&label("ccm64_enc_outer")); &mov ("esp",&DWP(48,"esp")); Loading @@ -675,6 +673,7 @@ if ($PREFIX eq "aesni") { &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec &movdqu ($cmac,&QWP(0,$rounds)); # load cmac &mov ($rounds,&DWP(240,$key)); # compose byte-swap control mask for pshufb on stack &mov (&DWP(0,"esp"),0x0c0d0e0f); Loading @@ -683,46 +682,45 @@ if ($PREFIX eq "aesni") { &mov (&DWP(12,"esp"),0x00010203); # compose counter increment vector on stack &mov ($rounds,1); &mov ($rounds_,1); &xor ($key_,$key_); &mov (&DWP(16,"esp"),$rounds); &mov (&DWP(16,"esp"),$rounds_); &mov (&DWP(20,"esp"),$key_); &mov (&DWP(24,"esp"),$key_); &mov (&DWP(28,"esp"),$key_); &movdqa ($inout3,&QWP(0,"esp")); # bswap mask &movdqa ($inout0,$ivec); &pshufb ($ivec,$inout3); # keep iv in reverse order &mov ($rounds,&DWP(240,$key)); &mov ($key_,$key); &mov ($rounds_,$rounds); &pshufb ($ivec,$inout3); if ($inline) { &aesni_inline_generate1("enc"); } else { &call ("_aesni_encrypt1"); } &set_label("ccm64_dec_outer"); &paddq ($ivec,&QWP(16,"esp")); &movups ($in0,&QWP(0,$inp)); # load inp &xorps ($in0,$inout0); &movdqa ($inout0,$ivec); &paddq ($ivec,&QWP(16,"esp")); &lea ($inp,&QWP(16,$inp)); &pshufb ($inout0,$inout3); &mov ($key,$key_); &jmp (&label("ccm64_dec_outer")); &set_label("ccm64_dec_outer",16); &xorps ($in0,$inout0); # inp ^= E(ivec) &movdqa ($inout0,$ivec); &mov ($rounds,$rounds_); &movups (&QWP(0,$out),$in0); &movups (&QWP(0,$out),$in0); # save output &lea ($out,&DWP(16,$out)); &pshufb ($inout0,$inout3); &sub ($len,1); &jz (&label("ccm64_dec_break")); &$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey0,&QWP(0,$key_)); &shr ($rounds,1); &$movekey ($rndkey1,&QWP(16,$key)); &$movekey ($rndkey1,&QWP(16,$key_)); &xorps ($in0,$rndkey0); &lea ($key,&DWP(32,$key)); &lea ($key,&DWP(32,$key_)); &xorps ($inout0,$rndkey0); &xorps ($cmac,$in0); # cmac^=out &$movekey ($rndkey0,&QWP(0,$key)); Loading @@ -737,13 +735,17 @@ if ($PREFIX eq "aesni") { &aesenc ($cmac,$rndkey0); &$movekey ($rndkey0,&QWP(0,$key)); &jnz (&label("ccm64_dec2_loop")); &movups ($in0,&QWP(0,$inp)); # load inp &paddq ($ivec,&QWP(16,"esp")); &aesenc ($inout0,$rndkey1); &aesenc ($cmac,$rndkey1); &lea ($inp,&QWP(16,$inp)); &aesenclast ($inout0,$rndkey0); &aesenclast ($cmac,$rndkey0); &jmp (&label("ccm64_dec_outer")); &set_label("ccm64_dec_break",16); &mov ($key,$key_); if ($inline) { &aesni_inline_generate1("enc",$cmac,$in0); } else Loading crypto/aes/asm/aesni-x86_64.pl +40 −34 Original line number Diff line number Diff line Loading @@ -821,8 +821,8 @@ ___ { my $cmac="%r9"; # 6th argument my $increment="%xmm8"; my $bswap_mask="%xmm9"; my $increment="%xmm6"; my $bswap_mask="%xmm7"; $code.=<<___; .globl aesni_ccm64_encrypt_blocks Loading @@ -839,30 +839,29 @@ $code.=<<___ if ($win64); .Lccm64_enc_body: ___ $code.=<<___; mov 240($key),$rounds # key->rounds movdqu ($ivp),$iv movdqu ($cmac),$inout1 movdqa .Lincrement64(%rip),$increment movdqa .Lbswap_mask(%rip),$bswap_mask pshufb $bswap_mask,$iv # keep iv in reverse order mov 240($key),$rounds # key->rounds mov $key,$key_ mov $rounds,$rnds_ shr \$1,$rounds lea 0($key),$key_ movdqu ($cmac),$inout1 movdqa $iv,$inout0 mov $rounds,$rnds_ pshufb $bswap_mask,$iv jmp .Lccm64_enc_outer .align 16 .Lccm64_enc_outer: movups ($inp),$in0 # load inp pshufb $bswap_mask,$inout0 mov $key_,$key $movkey ($key_),$rndkey0 mov $rnds_,$rounds movups ($inp),$in0 # load inp $movkey ($key),$rndkey0 shr \$1,$rounds $movkey 16($key),$rndkey1 xorps $rndkey0,$in0 lea 32($key),$key xorps $rndkey0,$inout0 xorps $inout1,$in0 # cmac^=inp xorps $rndkey0,$inout0 # counter $movkey 16($key_),$rndkey1 xorps $in0,$rndkey0 lea 32($key_),$key xorps $rndkey0,$inout1 # cmac^=inp $movkey ($key),$rndkey0 .Lccm64_enc2_loop: Loading @@ -877,16 +876,17 @@ $code.=<<___; jnz .Lccm64_enc2_loop aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 paddq $increment,$iv aesenclast $rndkey0,$inout0 aesenclast $rndkey0,$inout1 paddq $increment,$iv dec $len lea 16($inp),$inp xorps $inout0,$in0 # inp ^= E(iv) movdqa $iv,$inout0 movups $in0,($out) # save output lea 16($out),$out pshufb $bswap_mask,$inout0 jnz .Lccm64_enc_outer movups $inout1,($cmac) Loading Loading @@ -919,39 +919,40 @@ $code.=<<___ if ($win64); .Lccm64_dec_body: ___ $code.=<<___; movdqu ($ivp),$iv mov 240($key),$rounds # key->rounds movups ($ivp),$iv movdqu ($cmac),$inout1 movdqa .Lincrement64(%rip),$increment movdqa .Lbswap_mask(%rip),$bswap_mask mov 240($key),$rounds # key->rounds movdqa $iv,$inout0 pshufb $bswap_mask,$iv # keep iv in reverse order movaps $iv,$inout0 mov $rounds,$rnds_ mov $key,$key_ pshufb $bswap_mask,$iv ___ &aesni_generate1("enc",$key,$rounds); $code.=<<___; .Lccm64_dec_outer: paddq $increment,$iv movups ($inp),$in0 # load inp xorps $inout0,$in0 movdqa $iv,$inout0 paddq $increment,$iv lea 16($inp),$inp pshufb $bswap_mask,$inout0 mov $key_,$key jmp .Lccm64_dec_outer .align 16 .Lccm64_dec_outer: xorps $inout0,$in0 # inp ^= E(iv) movdqa $iv,$inout0 mov $rnds_,$rounds movups $in0,($out) movups $in0,($out) # save output lea 16($out),$out pshufb $bswap_mask,$inout0 sub \$1,$len jz .Lccm64_dec_break $movkey ($key),$rndkey0 $movkey ($key_),$rndkey0 shr \$1,$rounds $movkey 16($key),$rndkey1 $movkey 16($key_),$rndkey1 xorps $rndkey0,$in0 lea 32($key),$key lea 32($key_),$key xorps $rndkey0,$inout0 xorps $in0,$inout1 # cmac^=out $movkey ($key),$rndkey0 Loading @@ -966,15 +967,20 @@ $code.=<<___; aesenc $rndkey0,$inout1 $movkey 0($key),$rndkey0 jnz .Lccm64_dec2_loop movups ($inp),$in0 # load inp paddq $increment,$iv aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 lea 16($inp),$inp aesenclast $rndkey0,$inout0 aesenclast $rndkey0,$inout1 jmp .Lccm64_dec_outer .align 16 .Lccm64_dec_break: #xorps $in0,$inout1 # cmac^=out ___ &aesni_generate1("enc",$key,$rounds,$inout1); &aesni_generate1("enc",$key_,$rounds,$inout1,$in0); $code.=<<___; movups $inout1,($cmac) ___ Loading Loading
crypto/aes/asm/aesni-x86.pl +35 −33 Original line number Diff line number Diff line Loading @@ -594,6 +594,7 @@ if ($PREFIX eq "aesni") { &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec &movdqu ($cmac,&QWP(0,$rounds)); # load cmac &mov ($rounds,&DWP(240,$key)); # compose byte-swap control mask for pshufb on stack &mov (&DWP(0,"esp"),0x0c0d0e0f); Loading @@ -602,34 +603,30 @@ if ($PREFIX eq "aesni") { &mov (&DWP(12,"esp"),0x00010203); # compose counter increment vector on stack &mov ($rounds,1); &mov ($rounds_,1); &xor ($key_,$key_); &mov (&DWP(16,"esp"),$rounds); &mov (&DWP(16,"esp"),$rounds_); &mov (&DWP(20,"esp"),$key_); &mov (&DWP(24,"esp"),$key_); &mov (&DWP(28,"esp"),$key_); &shr ($rounds,1); &lea ($key_,&DWP(0,$key)); &movdqa ($inout3,&QWP(0,"esp")); &pshufb ($ivec,$inout3); # keep iv in reverse order &mov ($rounds,&DWP(240,$key)); &mov ($key_,$key); &mov ($rounds_,$rounds); &movdqa ($inout0,$ivec); &mov ($rounds_,$rounds); &pshufb ($ivec,$inout3); &set_label("ccm64_enc_outer"); &movups ($in0,&QWP(0,$inp)); &pshufb ($inout0,$inout3); &mov ($key,$key_); &$movekey ($rndkey0,&QWP(0,$key_)); &mov ($rounds,$rounds_); &movups ($in0,&QWP(0,$inp)); &$movekey ($rndkey0,&QWP(0,$key)); &shr ($rounds,1); &$movekey ($rndkey1,&QWP(16,$key)); &xorps ($in0,$rndkey0); &lea ($key,&DWP(32,$key)); &xorps ($inout0,$rndkey0); &xorps ($cmac,$in0); # cmac^=inp &$movekey ($rndkey1,&QWP(16,$key_)); &xorps ($rndkey0,$in0); &lea ($key,&DWP(32,$key_)); &xorps ($cmac,$rndkey0); # cmac^=inp &$movekey ($rndkey0,&QWP(0,$key)); &set_label("ccm64_enc2_loop"); Loading @@ -644,16 +641,17 @@ if ($PREFIX eq "aesni") { &jnz (&label("ccm64_enc2_loop")); &aesenc ($inout0,$rndkey1); &aesenc ($cmac,$rndkey1); &paddq ($ivec,&QWP(16,"esp")); &aesenclast ($inout0,$rndkey0); &aesenclast ($cmac,$rndkey0); &paddq ($ivec,&QWP(16,"esp")); &dec ($len); &lea ($inp,&DWP(16,$inp)); &xorps ($in0,$inout0); # inp^=E(ivec) &movdqa ($inout0,$ivec); &movups (&QWP(0,$out),$in0); &movups (&QWP(0,$out),$in0); # save output &lea ($out,&DWP(16,$out)); &pshufb ($inout0,$inout3); &jnz (&label("ccm64_enc_outer")); &mov ("esp",&DWP(48,"esp")); Loading @@ -675,6 +673,7 @@ if ($PREFIX eq "aesni") { &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec &movdqu ($cmac,&QWP(0,$rounds)); # load cmac &mov ($rounds,&DWP(240,$key)); # compose byte-swap control mask for pshufb on stack &mov (&DWP(0,"esp"),0x0c0d0e0f); Loading @@ -683,46 +682,45 @@ if ($PREFIX eq "aesni") { &mov (&DWP(12,"esp"),0x00010203); # compose counter increment vector on stack &mov ($rounds,1); &mov ($rounds_,1); &xor ($key_,$key_); &mov (&DWP(16,"esp"),$rounds); &mov (&DWP(16,"esp"),$rounds_); &mov (&DWP(20,"esp"),$key_); &mov (&DWP(24,"esp"),$key_); &mov (&DWP(28,"esp"),$key_); &movdqa ($inout3,&QWP(0,"esp")); # bswap mask &movdqa ($inout0,$ivec); &pshufb ($ivec,$inout3); # keep iv in reverse order &mov ($rounds,&DWP(240,$key)); &mov ($key_,$key); &mov ($rounds_,$rounds); &pshufb ($ivec,$inout3); if ($inline) { &aesni_inline_generate1("enc"); } else { &call ("_aesni_encrypt1"); } &set_label("ccm64_dec_outer"); &paddq ($ivec,&QWP(16,"esp")); &movups ($in0,&QWP(0,$inp)); # load inp &xorps ($in0,$inout0); &movdqa ($inout0,$ivec); &paddq ($ivec,&QWP(16,"esp")); &lea ($inp,&QWP(16,$inp)); &pshufb ($inout0,$inout3); &mov ($key,$key_); &jmp (&label("ccm64_dec_outer")); &set_label("ccm64_dec_outer",16); &xorps ($in0,$inout0); # inp ^= E(ivec) &movdqa ($inout0,$ivec); &mov ($rounds,$rounds_); &movups (&QWP(0,$out),$in0); &movups (&QWP(0,$out),$in0); # save output &lea ($out,&DWP(16,$out)); &pshufb ($inout0,$inout3); &sub ($len,1); &jz (&label("ccm64_dec_break")); &$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey0,&QWP(0,$key_)); &shr ($rounds,1); &$movekey ($rndkey1,&QWP(16,$key)); &$movekey ($rndkey1,&QWP(16,$key_)); &xorps ($in0,$rndkey0); &lea ($key,&DWP(32,$key)); &lea ($key,&DWP(32,$key_)); &xorps ($inout0,$rndkey0); &xorps ($cmac,$in0); # cmac^=out &$movekey ($rndkey0,&QWP(0,$key)); Loading @@ -737,13 +735,17 @@ if ($PREFIX eq "aesni") { &aesenc ($cmac,$rndkey0); &$movekey ($rndkey0,&QWP(0,$key)); &jnz (&label("ccm64_dec2_loop")); &movups ($in0,&QWP(0,$inp)); # load inp &paddq ($ivec,&QWP(16,"esp")); &aesenc ($inout0,$rndkey1); &aesenc ($cmac,$rndkey1); &lea ($inp,&QWP(16,$inp)); &aesenclast ($inout0,$rndkey0); &aesenclast ($cmac,$rndkey0); &jmp (&label("ccm64_dec_outer")); &set_label("ccm64_dec_break",16); &mov ($key,$key_); if ($inline) { &aesni_inline_generate1("enc",$cmac,$in0); } else Loading
crypto/aes/asm/aesni-x86_64.pl +40 −34 Original line number Diff line number Diff line Loading @@ -821,8 +821,8 @@ ___ { my $cmac="%r9"; # 6th argument my $increment="%xmm8"; my $bswap_mask="%xmm9"; my $increment="%xmm6"; my $bswap_mask="%xmm7"; $code.=<<___; .globl aesni_ccm64_encrypt_blocks Loading @@ -839,30 +839,29 @@ $code.=<<___ if ($win64); .Lccm64_enc_body: ___ $code.=<<___; mov 240($key),$rounds # key->rounds movdqu ($ivp),$iv movdqu ($cmac),$inout1 movdqa .Lincrement64(%rip),$increment movdqa .Lbswap_mask(%rip),$bswap_mask pshufb $bswap_mask,$iv # keep iv in reverse order mov 240($key),$rounds # key->rounds mov $key,$key_ mov $rounds,$rnds_ shr \$1,$rounds lea 0($key),$key_ movdqu ($cmac),$inout1 movdqa $iv,$inout0 mov $rounds,$rnds_ pshufb $bswap_mask,$iv jmp .Lccm64_enc_outer .align 16 .Lccm64_enc_outer: movups ($inp),$in0 # load inp pshufb $bswap_mask,$inout0 mov $key_,$key $movkey ($key_),$rndkey0 mov $rnds_,$rounds movups ($inp),$in0 # load inp $movkey ($key),$rndkey0 shr \$1,$rounds $movkey 16($key),$rndkey1 xorps $rndkey0,$in0 lea 32($key),$key xorps $rndkey0,$inout0 xorps $inout1,$in0 # cmac^=inp xorps $rndkey0,$inout0 # counter $movkey 16($key_),$rndkey1 xorps $in0,$rndkey0 lea 32($key_),$key xorps $rndkey0,$inout1 # cmac^=inp $movkey ($key),$rndkey0 .Lccm64_enc2_loop: Loading @@ -877,16 +876,17 @@ $code.=<<___; jnz .Lccm64_enc2_loop aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 paddq $increment,$iv aesenclast $rndkey0,$inout0 aesenclast $rndkey0,$inout1 paddq $increment,$iv dec $len lea 16($inp),$inp xorps $inout0,$in0 # inp ^= E(iv) movdqa $iv,$inout0 movups $in0,($out) # save output lea 16($out),$out pshufb $bswap_mask,$inout0 jnz .Lccm64_enc_outer movups $inout1,($cmac) Loading Loading @@ -919,39 +919,40 @@ $code.=<<___ if ($win64); .Lccm64_dec_body: ___ $code.=<<___; movdqu ($ivp),$iv mov 240($key),$rounds # key->rounds movups ($ivp),$iv movdqu ($cmac),$inout1 movdqa .Lincrement64(%rip),$increment movdqa .Lbswap_mask(%rip),$bswap_mask mov 240($key),$rounds # key->rounds movdqa $iv,$inout0 pshufb $bswap_mask,$iv # keep iv in reverse order movaps $iv,$inout0 mov $rounds,$rnds_ mov $key,$key_ pshufb $bswap_mask,$iv ___ &aesni_generate1("enc",$key,$rounds); $code.=<<___; .Lccm64_dec_outer: paddq $increment,$iv movups ($inp),$in0 # load inp xorps $inout0,$in0 movdqa $iv,$inout0 paddq $increment,$iv lea 16($inp),$inp pshufb $bswap_mask,$inout0 mov $key_,$key jmp .Lccm64_dec_outer .align 16 .Lccm64_dec_outer: xorps $inout0,$in0 # inp ^= E(iv) movdqa $iv,$inout0 mov $rnds_,$rounds movups $in0,($out) movups $in0,($out) # save output lea 16($out),$out pshufb $bswap_mask,$inout0 sub \$1,$len jz .Lccm64_dec_break $movkey ($key),$rndkey0 $movkey ($key_),$rndkey0 shr \$1,$rounds $movkey 16($key),$rndkey1 $movkey 16($key_),$rndkey1 xorps $rndkey0,$in0 lea 32($key),$key lea 32($key_),$key xorps $rndkey0,$inout0 xorps $in0,$inout1 # cmac^=out $movkey ($key),$rndkey0 Loading @@ -966,15 +967,20 @@ $code.=<<___; aesenc $rndkey0,$inout1 $movkey 0($key),$rndkey0 jnz .Lccm64_dec2_loop movups ($inp),$in0 # load inp paddq $increment,$iv aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 lea 16($inp),$inp aesenclast $rndkey0,$inout0 aesenclast $rndkey0,$inout1 jmp .Lccm64_dec_outer .align 16 .Lccm64_dec_break: #xorps $in0,$inout1 # cmac^=out ___ &aesni_generate1("enc",$key,$rounds,$inout1); &aesni_generate1("enc",$key_,$rounds,$inout1,$in0); $code.=<<___; movups $inout1,($cmac) ___ Loading