Commit d8ba0dc9 authored by Andy Polyakov
Browse files

crypto/aes/asm/aesni-x86[_64].pl update, up to 14% improvement on
Atom Silvermont. On other CPUs one can observe 1% loss on some
algorithms.
parent b217ca63
Loading
Loading
Loading
Loading
+124 −123
Original line number Original line Diff line number Diff line
#!/usr/bin/env perl
#!/usr/bin/env perl


# ====================================================================
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -208,25 +208,26 @@ sub aesni_generate3


    &function_begin_B("_aesni_${p}rypt3");
    &function_begin_B("_aesni_${p}rypt3");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey0,&QWP(0,$key));
	&shr		($rounds,1);
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&lea		($key,&DWP(32,$key));
	&xorps		($inout0,$rndkey0);
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);


    &set_label("${p}3_loop");
    &set_label("${p}3_loop");
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	&dec		($rounds);
	eval"&aes${p}	($inout2,$rndkey1)";
	eval"&aes${p}	($inout2,$rndkey1)";
	&$movekey	($rndkey1,&QWP(16,$key));
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	eval"&aes${p}	($inout0,$rndkey0)";
	eval"&aes${p}	($inout0,$rndkey0)";
	eval"&aes${p}	($inout1,$rndkey0)";
	eval"&aes${p}	($inout1,$rndkey0)";
	&lea		($key,&DWP(32,$key));
	eval"&aes${p}	($inout2,$rndkey0)";
	eval"&aes${p}	($inout2,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}3_loop"));
	&jnz		(&label("${p}3_loop"));
    eval"&aes${p}	($inout0,$rndkey1)";
    eval"&aes${p}	($inout0,$rndkey1)";
    eval"&aes${p}	($inout1,$rndkey1)";
    eval"&aes${p}	($inout1,$rndkey1)";
@@ -248,27 +249,29 @@ sub aesni_generate4
    &function_begin_B("_aesni_${p}rypt4");
    &function_begin_B("_aesni_${p}rypt4");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&shr		($rounds,1);
	&shl		($rounds,4);
	&lea		($key,&DWP(32,$key));
	&xorps		($inout0,$rndkey0);
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&data_byte	(0x0f,0x1f,0x40,0x00);
	&add		($rounds,16);


    &set_label("${p}4_loop");
    &set_label("${p}4_loop");
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	&dec		($rounds);
	eval"&aes${p}	($inout2,$rndkey1)";
	eval"&aes${p}	($inout2,$rndkey1)";
	eval"&aes${p}	($inout3,$rndkey1)";
	eval"&aes${p}	($inout3,$rndkey1)";
	&$movekey	($rndkey1,&QWP(16,$key));
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	eval"&aes${p}	($inout0,$rndkey0)";
	eval"&aes${p}	($inout0,$rndkey0)";
	eval"&aes${p}	($inout1,$rndkey0)";
	eval"&aes${p}	($inout1,$rndkey0)";
	&lea		($key,&DWP(32,$key));
	eval"&aes${p}	($inout2,$rndkey0)";
	eval"&aes${p}	($inout2,$rndkey0)";
	eval"&aes${p}	($inout3,$rndkey0)";
	eval"&aes${p}	($inout3,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}4_loop"));
    &jnz		(&label("${p}4_loop"));


    eval"&aes${p}	($inout0,$rndkey1)";
    eval"&aes${p}	($inout0,$rndkey1)";
@@ -289,43 +292,43 @@ sub aesni_generate6
    &function_begin_B("_aesni_${p}rypt6");
    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
    &static_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey0,&QWP(0,$key));
	&shr		($rounds,1);
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&lea		($key,&DWP(32,$key));
	&xorps		($inout0,$rndkey0);
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);	# pxor does better here
	&pxor		($inout1,$rndkey0);	# pxor does better here
	eval"&aes${p}	($inout0,$rndkey1)";
	&pxor		($inout2,$rndkey0);
	&pxor		($inout2,$rndkey0);
	eval"&aes${p}	($inout1,$rndkey1)";
	eval"&aes${p}	($inout0,$rndkey1)";
	&pxor		($inout3,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&dec		($rounds);
	eval"&aes${p}	($inout2,$rndkey1)";
	&pxor		($inout4,$rndkey0);
	&pxor		($inout4,$rndkey0);
	eval"&aes${p}	($inout3,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	eval"&aes${p}	($inout2,$rndkey1)";
	&pxor		($inout5,$rndkey0);
	&pxor		($inout5,$rndkey0);
	&add		($rounds,16);
	eval"&aes${p}	($inout3,$rndkey1)";
	eval"&aes${p}	($inout4,$rndkey1)";
	eval"&aes${p}	($inout4,$rndkey1)";
	&$movekey	($rndkey0,&QWP(0,$key));
	eval"&aes${p}	($inout5,$rndkey1)";
	eval"&aes${p}	($inout5,$rndkey1)";
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jmp		(&label("_aesni_${p}rypt6_enter"));
	&jmp		(&label("_aesni_${p}rypt6_enter"));


    &set_label("${p}6_loop",16);
    &set_label("${p}6_loop",16);
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	&dec		($rounds);
	eval"&aes${p}	($inout2,$rndkey1)";
	eval"&aes${p}	($inout2,$rndkey1)";
	eval"&aes${p}	($inout3,$rndkey1)";
	eval"&aes${p}	($inout3,$rndkey1)";
	eval"&aes${p}	($inout4,$rndkey1)";
	eval"&aes${p}	($inout4,$rndkey1)";
	eval"&aes${p}	($inout5,$rndkey1)";
	eval"&aes${p}	($inout5,$rndkey1)";
    &set_label("_aesni_${p}rypt6_enter",16);
    &set_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey1,&QWP(16,$key));
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	eval"&aes${p}	($inout0,$rndkey0)";
	eval"&aes${p}	($inout0,$rndkey0)";
	eval"&aes${p}	($inout1,$rndkey0)";
	eval"&aes${p}	($inout1,$rndkey0)";
	&lea		($key,&DWP(32,$key));
	eval"&aes${p}	($inout2,$rndkey0)";
	eval"&aes${p}	($inout2,$rndkey0)";
	eval"&aes${p}	($inout3,$rndkey0)";
	eval"&aes${p}	($inout3,$rndkey0)";
	eval"&aes${p}	($inout4,$rndkey0)";
	eval"&aes${p}	($inout4,$rndkey0)";
	eval"&aes${p}	($inout5,$rndkey0)";
	eval"&aes${p}	($inout5,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}6_loop"));
    &jnz		(&label("${p}6_loop"));


    eval"&aes${p}	($inout0,$rndkey1)";
    eval"&aes${p}	($inout0,$rndkey1)";
@@ -610,11 +613,13 @@ if ($PREFIX eq "aesni") {
	&mov	(&DWP(24,"esp"),$key_);
	&mov	(&DWP(24,"esp"),$key_);
	&mov	(&DWP(28,"esp"),$key_);
	&mov	(&DWP(28,"esp"),$key_);


	&shr	($rounds,1);
	&shl	($rounds,4);
	&mov	($rounds_,16);
	&lea	($key_,&DWP(0,$key));
	&lea	($key_,&DWP(0,$key));
	&movdqa	($inout3,&QWP(0,"esp"));
	&movdqa	($inout3,&QWP(0,"esp"));
	&movdqa	($inout0,$ivec);
	&movdqa	($inout0,$ivec);
	&mov	($rounds_,$rounds);
	&lea	($key,&DWP(32,$key,$rounds));
	&sub	($rounds_,$rounds);
	&pshufb	($ivec,$inout3);
	&pshufb	($ivec,$inout3);


&set_label("ccm64_enc_outer");
&set_label("ccm64_enc_outer");
@@ -625,33 +630,31 @@ if ($PREFIX eq "aesni") {
	&xorps		($inout0,$rndkey0);
	&xorps		($inout0,$rndkey0);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($rndkey0,$in0);
	&xorps		($rndkey0,$in0);
	&lea		($key,&DWP(32,$key_));
	&xorps		($cmac,$rndkey0);		# cmac^=inp
	&xorps		($cmac,$rndkey0);		# cmac^=inp
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey0,&QWP(32,$key_));


&set_label("ccm64_enc2_loop");
&set_label("ccm64_enc2_loop");
	&aesenc		($inout0,$rndkey1);
	&aesenc		($inout0,$rndkey1);
	&dec		($rounds);
	&aesenc		($cmac,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(16,$key));
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&aesenc		($inout0,$rndkey0);
	&aesenc		($inout0,$rndkey0);
	&lea		($key,&DWP(32,$key));
	&aesenc		($cmac,$rndkey0);
	&aesenc		($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("ccm64_enc2_loop"));
	&jnz		(&label("ccm64_enc2_loop"));
	&aesenc		($inout0,$rndkey1);
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&paddq		($ivec,&QWP(16,"esp"));
	&paddq		($ivec,&QWP(16,"esp"));
	&dec		($len);
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);
	&aesenclast	($cmac,$rndkey0);


	&dec	($len);
	&lea	($inp,&DWP(16,$inp));
	&lea	($inp,&DWP(16,$inp));
	&xorps	($in0,$inout0);			# inp^=E(ivec)
	&xorps	($in0,$inout0);			# inp^=E(ivec)
	&movdqa	($inout0,$ivec);
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&movups	(&QWP(0,$out),$in0);		# save output
	&lea	($out,&DWP(16,$out));
	&pshufb	($inout0,$inout3);
	&pshufb	($inout0,$inout3);
	&lea	($out,&DWP(16,$out));
	&jnz	(&label("ccm64_enc_outer"));
	&jnz	(&label("ccm64_enc_outer"));


	&mov	("esp",&DWP(48,"esp"));
	&mov	("esp",&DWP(48,"esp"));
@@ -700,15 +703,19 @@ if ($PREFIX eq "aesni") {
	{   &aesni_inline_generate1("enc");	}
	{   &aesni_inline_generate1("enc");	}
	else
	else
	{   &call	("_aesni_encrypt1");	}
	{   &call	("_aesni_encrypt1");	}
	&shl	($rounds_,4);
	&mov	($rounds,16);
	&movups	($in0,&QWP(0,$inp));		# load inp
	&movups	($in0,&QWP(0,$inp));		# load inp
	&paddq	($ivec,&QWP(16,"esp"));
	&paddq	($ivec,&QWP(16,"esp"));
	&lea	($inp,&QWP(16,$inp));
	&lea	($inp,&QWP(16,$inp));
	&sub	($rounds,$rounds_);
	&lea	($key,&DWP(32,$key_,$rounds_));
	&mov	($rounds_,$rounds);
	&jmp	(&label("ccm64_dec_outer"));
	&jmp	(&label("ccm64_dec_outer"));


&set_label("ccm64_dec_outer",16);
&set_label("ccm64_dec_outer",16);
	&xorps	($in0,$inout0);			# inp ^= E(ivec)
	&xorps	($in0,$inout0);			# inp ^= E(ivec)
	&movdqa	($inout0,$ivec);
	&movdqa	($inout0,$ivec);
	&mov	($rounds,$rounds_);
	&movups	(&QWP(0,$out),$in0);		# save output
	&movups	(&QWP(0,$out),$in0);		# save output
	&lea	($out,&DWP(16,$out));
	&lea	($out,&DWP(16,$out));
	&pshufb	($inout0,$inout3);
	&pshufb	($inout0,$inout3);
@@ -717,34 +724,33 @@ if ($PREFIX eq "aesni") {
	&jz	(&label("ccm64_dec_break"));
	&jz	(&label("ccm64_dec_break"));


	&$movekey	($rndkey0,&QWP(0,$key_));
	&$movekey	($rndkey0,&QWP(0,$key_));
	&shr		($rounds,1);
	&mov		($rounds,$rounds_);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($in0,$rndkey0);
	&xorps		($in0,$rndkey0);
	&lea		($key,&DWP(32,$key_));
	&xorps		($inout0,$rndkey0);
	&xorps		($inout0,$rndkey0);
	&xorps		($cmac,$in0);		# cmac^=out
	&xorps		($cmac,$in0);		# cmac^=out
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey0,&QWP(32,$key_));


&set_label("ccm64_dec2_loop");
&set_label("ccm64_dec2_loop");
	&aesenc		($inout0,$rndkey1);
	&aesenc		($inout0,$rndkey1);
	&dec		($rounds);
	&aesenc		($cmac,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(16,$key));
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&aesenc		($inout0,$rndkey0);
	&aesenc		($inout0,$rndkey0);
	&lea		($key,&DWP(32,$key));
	&aesenc		($cmac,$rndkey0);
	&aesenc		($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("ccm64_dec2_loop"));
	&jnz		(&label("ccm64_dec2_loop"));
	&movups		($in0,&QWP(0,$inp));	# load inp
	&movups		($in0,&QWP(0,$inp));	# load inp
	&paddq		($ivec,&QWP(16,"esp"));
	&paddq		($ivec,&QWP(16,"esp"));
	&aesenc		($inout0,$rndkey1);
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&lea		($inp,&QWP(16,$inp));
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);
	&aesenclast	($cmac,$rndkey0);
	&lea		($inp,&QWP(16,$inp));
	&jmp	(&label("ccm64_dec_outer"));
	&jmp	(&label("ccm64_dec_outer"));


&set_label("ccm64_dec_break",16);
&set_label("ccm64_dec_break",16);
	&mov	($rounds,&DWP(240,$key_));
	&mov	($key,$key_);
	&mov	($key,$key_);
	if ($inline)
	if ($inline)
	{   &aesni_inline_generate1("enc",$cmac,$in0);	}
	{   &aesni_inline_generate1("enc",$cmac,$in0);	}
@@ -763,7 +769,7 @@ if ($PREFIX eq "aesni") {
#                         const char *ivec);
#                         const char *ivec);
#
#
# Handles only complete blocks, operates on 32-bit counter and
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see engine/eng_aesni.c for details)
# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
#
# stack layout:
# stack layout:
#	0	pshufb mask
#	0	pshufb mask
@@ -810,66 +816,61 @@ if ($PREFIX eq "aesni") {


	# compose 2 vectors of 3x32-bit counters
	# compose 2 vectors of 3x32-bit counters
	&bswap	($rounds_);
	&bswap	($rounds_);
	&pxor	($rndkey1,$rndkey1);
	&pxor	($rndkey0,$rndkey0);
	&pxor	($rndkey0,$rndkey0);
	&pxor	($rndkey1,$rndkey1);
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
	&pinsrd	($rndkey1,$rounds_,0);
	&pinsrd	($rndkey0,$rounds_,0);
	&lea	($key_,&DWP(3,$rounds_));
	&lea	($key_,&DWP(3,$rounds_));
	&pinsrd	($rndkey0,$key_,0);
	&pinsrd	($rndkey1,$key_,0);
	&inc	($rounds_);
	&inc	($rounds_);
	&pinsrd	($rndkey1,$rounds_,1);
	&pinsrd	($rndkey0,$rounds_,1);
	&inc	($key_);
	&inc	($key_);
	&pinsrd	($rndkey0,$key_,1);
	&pinsrd	($rndkey1,$key_,1);
	&inc	($rounds_);
	&inc	($rounds_);
	&pinsrd	($rndkey1,$rounds_,2);
	&pinsrd	($rndkey0,$rounds_,2);
	&inc	($key_);
	&inc	($key_);
	&pinsrd	($rndkey0,$key_,2);
	&pinsrd	($rndkey1,$key_,2);
	&movdqa	(&QWP(48,"esp"),$rndkey1);	# save 1st triplet
	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
	&pshufb	($rndkey1,$inout0);		# byte swap
	&movdqa	(&QWP(64,"esp"),$rndkey0);	# save 2nd triplet
	&pshufb	($rndkey0,$inout0);		# byte swap
	&pshufb	($rndkey0,$inout0);		# byte swap
	&movdqu	($inout4,&QWP(0,$key));		# key[0]
	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
	&pshufb	($rndkey1,$inout0);		# byte swap


	&pshufd	($inout0,$rndkey1,3<<6);	# place counter to upper dword
	&pshufd	($inout0,$rndkey0,3<<6);	# place counter to upper dword
	&pshufd	($inout1,$rndkey1,2<<6);
	&pshufd	($inout1,$rndkey0,2<<6);
	&cmp	($len,6);
	&cmp	($len,6);
	&jb	(&label("ctr32_tail"));
	&jb	(&label("ctr32_tail"));
	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec
	&pxor	($inout5,$inout4);		# counter-less ivec^key[0]
	&shr	($rounds,1);
	&shl	($rounds,4);
	&mov	($rounds_,16);
	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec^key[0]
	&mov	($key_,$key);			# backup $key
	&mov	($key_,$key);			# backup $key
	&mov	($rounds_,$rounds);		# backup $rounds
	&sub	($rounds_,$rounds);		# backup twisted $rounds
	&lea	($key,&DWP(32,$key,$rounds));
	&sub	($len,6);
	&sub	($len,6);
	&jmp	(&label("ctr32_loop6"));
	&jmp	(&label("ctr32_loop6"));


&set_label("ctr32_loop6",16);
&set_label("ctr32_loop6",16);
	&pshufd	($inout2,$rndkey1,1<<6);
	# inlining _aesni_encrypt6's prologue gives ~6% improvement...
	&movdqa	($rndkey1,&QWP(32,"esp"));	# pull counter-less ivec
	&pshufd	($inout2,$rndkey0,1<<6);
	&pshufd	($inout3,$rndkey0,3<<6);
	&movdqa	($rndkey0,&QWP(32,"esp"));	# pull counter-less ivec
	&por	($inout0,$rndkey1);		# merge counter-less ivec
	&pshufd	($inout3,$rndkey1,3<<6);
	&pshufd	($inout4,$rndkey0,2<<6);
	&pxor		($inout0,$rndkey0);	# merge counter-less ivec
	&por	($inout1,$rndkey1);
	&pshufd	($inout4,$rndkey1,2<<6);
	&pshufd	($inout5,$rndkey0,1<<6);
	&por	($inout2,$rndkey1);
	&por	($inout3,$rndkey1);
	&por	($inout4,$rndkey1);
	&por	($inout5,$rndkey1);

	# inlining _aesni_encrypt6's prologue gives ~4% improvement...
	&$movekey	($rndkey0,&QWP(0,$key_));
	&$movekey	($rndkey1,&QWP(16,$key_));
	&lea		($key,&DWP(32,$key_));
	&dec		($rounds);
	&pxor		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&aesenc		($inout0,$rndkey1);
	&pshufd	($inout5,$rndkey1,1<<6);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&pxor		($inout2,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&aesenc		($inout1,$rndkey1);
	&pxor		($inout3,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&aesenc		($inout2,$rndkey1);
	&aesenc		($inout0,$rndkey1);
	&pxor		($inout4,$rndkey0);
	&pxor		($inout4,$rndkey0);
	&aesenc		($inout3,$rndkey1);
	&pxor		($inout5,$rndkey0);
	&pxor		($inout5,$rndkey0);
	&aesenc		($inout1,$rndkey1);
	&$movekey	($rndkey0,&QWP(32,$key_));
	&mov		($rounds,$rounds_);
	&aesenc		($inout2,$rndkey1);
	&aesenc		($inout3,$rndkey1);
	&aesenc		($inout4,$rndkey1);
	&aesenc		($inout4,$rndkey1);
	&$movekey	($rndkey0,&QWP(0,$key));
	&aesenc		($inout5,$rndkey1);
	&aesenc		($inout5,$rndkey1);


	&call		(&label("_aesni_encrypt6_enter"));
	&call		(&label("_aesni_encrypt6_enter"));
@@ -882,12 +883,12 @@ if ($PREFIX eq "aesni") {
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0,$out),$inout0);
	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
	&xorps	($inout2,$rndkey1);
	&xorps	($inout2,$rndkey1);
	&movdqa	($rndkey1,&QWP(48,"esp"));	# load 1st triplet
	&movdqa	($rndkey1,&QWP(64,"esp"));	# load 2nd triplet
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x20,$out),$inout2);


	&paddd	($rndkey1,$rndkey0);		# 1st triplet increment
	&paddd	($rndkey1,$rndkey0);		# 2nd triplet increment
	&paddd	($rndkey0,&QWP(64,"esp"));	# 2nd triplet increment
	&paddd	($rndkey0,&QWP(48,"esp"));	# 1st triplet increment
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask


	&movups	($inout1,&QWP(0x30,$inp));
	&movups	($inout1,&QWP(0x30,$inp));
@@ -895,44 +896,44 @@ if ($PREFIX eq "aesni") {
	&xorps	($inout3,$inout1);
	&xorps	($inout3,$inout1);
	&movups	($inout1,&QWP(0x50,$inp));
	&movups	($inout1,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&movdqa	(&QWP(48,"esp"),$rndkey1);	# save 1st triplet
	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
	&pshufb	($rndkey1,$inout0);		# byte swap
	&pshufb	($rndkey0,$inout0);		# byte swap
	&xorps	($inout4,$inout2);
	&xorps	($inout4,$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x30,$out),$inout3);
	&xorps	($inout5,$inout1);
	&xorps	($inout5,$inout1);
	&movdqa	(&QWP(64,"esp"),$rndkey0);	# save 2nd triplet
	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
	&pshufb	($rndkey0,$inout0);		# byte swap
	&pshufb	($rndkey1,$inout0);		# byte swap
	&movups	(&QWP(0x40,$out),$inout4);
	&movups	(&QWP(0x40,$out),$inout4);
	&pshufd	($inout0,$rndkey1,3<<6);
	&pshufd	($inout0,$rndkey0,3<<6);
	&movups	(&QWP(0x50,$out),$inout5);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&lea	($out,&DWP(0x60,$out));


	&mov	($rounds,$rounds_);
	&pshufd	($inout1,$rndkey0,2<<6);
	&pshufd	($inout1,$rndkey1,2<<6);
	&sub	($len,6);
	&sub	($len,6);
	&jnc	(&label("ctr32_loop6"));
	&jnc	(&label("ctr32_loop6"));


	&add	($len,6);
	&add	($len,6);
	&jz	(&label("ctr32_ret"));
	&jz	(&label("ctr32_ret"));
	&movdqu	($inout5,&QWP(0,$key_));
	&mov	($key,$key_);
	&mov	($key,$key_);
	&lea	($rounds,&DWP(1,"",$rounds,2));	# restore $rounds
	&pxor	($inout5,&QWP(32,"esp"));	# restore count-less ivec
	&movdqa	($inout5,&QWP(32,"esp"));	# pull count-less ivec
	&mov	($rounds,&DWP(240,$key_));	# restore $rounds


&set_label("ctr32_tail");
&set_label("ctr32_tail");
	&por	($inout0,$inout5);
	&por	($inout0,$inout5);
	&cmp	($len,2);
	&cmp	($len,2);
	&jb	(&label("ctr32_one"));
	&jb	(&label("ctr32_one"));


	&pshufd	($inout2,$rndkey1,1<<6);
	&pshufd	($inout2,$rndkey0,1<<6);
	&por	($inout1,$inout5);
	&por	($inout1,$inout5);
	&je	(&label("ctr32_two"));
	&je	(&label("ctr32_two"));


	&pshufd	($inout3,$rndkey0,3<<6);
	&pshufd	($inout3,$rndkey1,3<<6);
	&por	($inout2,$inout5);
	&por	($inout2,$inout5);
	&cmp	($len,4);
	&cmp	($len,4);
	&jb	(&label("ctr32_three"));
	&jb	(&label("ctr32_three"));


	&pshufd	($inout4,$rndkey0,2<<6);
	&pshufd	($inout4,$rndkey1,2<<6);
	&por	($inout3,$inout5);
	&por	($inout3,$inout5);
	&je	(&label("ctr32_four"));
	&je	(&label("ctr32_four"));


@@ -1057,8 +1058,10 @@ if ($PREFIX eq "aesni") {
	&sub	($len,16*6);
	&sub	($len,16*6);
	&jc	(&label("xts_enc_short"));
	&jc	(&label("xts_enc_short"));


	&shr	($rounds,1);
	&shl	($rounds,4);
	&mov	($rounds_,$rounds);
	&mov	($rounds_,16);
	&sub	($rounds_,$rounds);
	&lea	($key,&DWP(32,$key,$rounds));
	&jmp	(&label("xts_enc_loop6"));
	&jmp	(&label("xts_enc_loop6"));


&set_label("xts_enc_loop6",16);
&set_label("xts_enc_loop6",16);
@@ -1080,6 +1083,7 @@ if ($PREFIX eq "aesni") {
	&pxor	($inout5,$tweak);
	&pxor	($inout5,$tweak);


	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
	&mov	($rounds,$rounds_);		# restore $rounds
	&movdqu	($inout1,&QWP(16*1,$inp));
	&movdqu	($inout1,&QWP(16*1,$inp));
	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
	&movdqu	($inout2,&QWP(16*2,$inp));
	&movdqu	($inout2,&QWP(16*2,$inp));
@@ -1096,19 +1100,17 @@ if ($PREFIX eq "aesni") {
	&pxor	($inout5,$rndkey1);
	&pxor	($inout5,$rndkey1);


	 &$movekey	($rndkey1,&QWP(16,$key_));
	 &$movekey	($rndkey1,&QWP(16,$key_));
	 &lea		($key,&DWP(32,$key_));
	&pxor	($inout1,&QWP(16*1,"esp"));
	&pxor	($inout1,&QWP(16*1,"esp"));
	 &aesenc	($inout0,$rndkey1);
	&pxor	($inout2,&QWP(16*2,"esp"));
	&pxor	($inout2,&QWP(16*2,"esp"));
	 &aesenc	($inout1,$rndkey1);
	 &aesenc	($inout0,$rndkey1);
	&pxor	($inout3,&QWP(16*3,"esp"));
	&pxor	($inout3,&QWP(16*3,"esp"));
	 &dec		($rounds);
	 &aesenc	($inout2,$rndkey1);
	&pxor	($inout4,&QWP(16*4,"esp"));
	&pxor	($inout4,&QWP(16*4,"esp"));
	 &aesenc	($inout3,$rndkey1);
	 &aesenc	($inout1,$rndkey1);
	&pxor		($inout5,$rndkey0);
	&pxor		($inout5,$rndkey0);
	 &$movekey	($rndkey0,&QWP(32,$key_));
	 &aesenc	($inout2,$rndkey1);
	 &aesenc	($inout3,$rndkey1);
	 &aesenc	($inout4,$rndkey1);
	 &aesenc	($inout4,$rndkey1);
	 &$movekey	($rndkey0,&QWP(0,$key));
	 &aesenc	($inout5,$rndkey1);
	 &aesenc	($inout5,$rndkey1);
	&call		(&label("_aesni_encrypt6_enter"));
	&call		(&label("_aesni_encrypt6_enter"));


@@ -1135,13 +1137,12 @@ if ($PREFIX eq "aesni") {
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($twres,$twmask);		# isolate carry and residue
	&pand	($twres,$twmask);		# isolate carry and residue
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&mov	($rounds,$rounds_);		# restore $rounds
	&pxor	($tweak,$twres);
	&pxor	($tweak,$twres);


	&sub	($len,16*6);
	&sub	($len,16*6);
	&jnc	(&label("xts_enc_loop6"));
	&jnc	(&label("xts_enc_loop6"));


	&lea	($rounds,&DWP(1,"",$rounds,2));	# restore $rounds
	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
	&mov	($key,$key_);			# restore $key
	&mov	($key,$key_);			# restore $key
	&mov	($rounds_,$rounds);
	&mov	($rounds_,$rounds);


@@ -1399,8 +1400,10 @@ if ($PREFIX eq "aesni") {
	&sub	($len,16*6);
	&sub	($len,16*6);
	&jc	(&label("xts_dec_short"));
	&jc	(&label("xts_dec_short"));


	&shr	($rounds,1);
	&shl	($rounds,4);
	&mov	($rounds_,$rounds);
	&mov	($rounds_,16);
	&sub	($rounds_,$rounds);
	&lea	($key,&DWP(32,$key,$rounds));
	&jmp	(&label("xts_dec_loop6"));
	&jmp	(&label("xts_dec_loop6"));


&set_label("xts_dec_loop6",16);
&set_label("xts_dec_loop6",16);
@@ -1422,6 +1425,7 @@ if ($PREFIX eq "aesni") {
	&pxor	($inout5,$tweak);
	&pxor	($inout5,$tweak);


	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
	&mov	($rounds,$rounds_);
	&movdqu	($inout1,&QWP(16*1,$inp));
	&movdqu	($inout1,&QWP(16*1,$inp));
	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
	&movdqu	($inout2,&QWP(16*2,$inp));
	&movdqu	($inout2,&QWP(16*2,$inp));
@@ -1438,19 +1442,17 @@ if ($PREFIX eq "aesni") {
	&pxor	($inout5,$rndkey1);
	&pxor	($inout5,$rndkey1);


	 &$movekey	($rndkey1,&QWP(16,$key_));
	 &$movekey	($rndkey1,&QWP(16,$key_));
	 &lea		($key,&DWP(32,$key_));
	&pxor	($inout1,&QWP(16*1,"esp"));
	&pxor	($inout1,&QWP(16*1,"esp"));
	 &aesdec	($inout0,$rndkey1);
	&pxor	($inout2,&QWP(16*2,"esp"));
	&pxor	($inout2,&QWP(16*2,"esp"));
	 &aesdec	($inout1,$rndkey1);
	 &aesdec	($inout0,$rndkey1);
	&pxor	($inout3,&QWP(16*3,"esp"));
	&pxor	($inout3,&QWP(16*3,"esp"));
	 &dec		($rounds);
	 &aesdec	($inout2,$rndkey1);
	&pxor	($inout4,&QWP(16*4,"esp"));
	&pxor	($inout4,&QWP(16*4,"esp"));
	 &aesdec	($inout3,$rndkey1);
	 &aesdec	($inout1,$rndkey1);
	&pxor		($inout5,$rndkey0);
	&pxor		($inout5,$rndkey0);
	 &$movekey	($rndkey0,&QWP(32,$key_));
	 &aesdec	($inout2,$rndkey1);
	 &aesdec	($inout3,$rndkey1);
	 &aesdec	($inout4,$rndkey1);
	 &aesdec	($inout4,$rndkey1);
	 &$movekey	($rndkey0,&QWP(0,$key));
	 &aesdec	($inout5,$rndkey1);
	 &aesdec	($inout5,$rndkey1);
	&call		(&label("_aesni_decrypt6_enter"));
	&call		(&label("_aesni_decrypt6_enter"));


@@ -1477,13 +1479,12 @@ if ($PREFIX eq "aesni") {
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($twres,$twmask);		# isolate carry and residue
	&pand	($twres,$twmask);		# isolate carry and residue
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&mov	($rounds,$rounds_);		# restore $rounds
	&pxor	($tweak,$twres);
	&pxor	($tweak,$twres);


	&sub	($len,16*6);
	&sub	($len,16*6);
	&jnc	(&label("xts_dec_loop6"));
	&jnc	(&label("xts_dec_loop6"));


	&lea	($rounds,&DWP(1,"",$rounds,2));	# restore $rounds
	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
	&mov	($key,$key_);			# restore $key
	&mov	($key,$key_);			# restore $key
	&mov	($rounds_,$rounds);
	&mov	($rounds_,$rounds);


+215 −178

File changed.

Preview size limit exceeded, changes collapsed.