aes/asm/aesni-x86[_64].pl: minor Atom-specific performance tweak. (214368ff) · Commits · CYBER - Cyber Security / TS 103 523 MSP / TLMSP / TLMSP OpenSSL

crypto/aes/asm/aesni-x86.pl

+47 −16

Original line number	Diff line number	Diff line
		@@ -207,12 +207,45 @@ sub aesni_generate1 # fully unrolled loop
		# every 2nd cycle. Thus 3x interleave was the one providing optimal
		# utilization, i.e. when subroutine's throughput is virtually same as
		# of non-interleaved subroutine [for number of input blocks up to 3].
		# This is why it makes no sense to implement 2x subroutine.
		# aes[enc|dec] latency in next processor generation is 8, but the
		# instructions can be scheduled every cycle. Optimal interleave for
		# new processor is therefore 8x, but it's unfeasible to accommodate it
		# in XMM registers addressable in 32-bit mode and therefore 6x is
		# used instead...
		# This is why it originally made no sense to implement 2x subroutine.
		# But times change and it became appropriate to spend extra 192 bytes
		# on 2x subroutine on account of Atom Silvermont. For processors that
		# can schedule aes[enc|dec] every cycle, the optimal interleave factor
		# equals the corresponding instruction's latency. 8x is optimal for
		# * Bridge, but it's unfeasible to accommodate such implementation
		# in XMM registers addressable in 32-bit mode and therefore maximum
		# of 6x is used instead...

		# Emits the 2x-interleaved helper "_aesni_${p}rypt2", which pushes two
		# blocks ($inout0 and $inout1) through the aes${p} rounds side by side.
		#
		# Argument:
		#   $p - instruction/name infix; the callers in this file pass "enc" or
		#        "dec", producing _aesni_encrypt2 or _aesni_decrypt2.
		#
		# The generated routine overwrites $key, $rounds, $rndkey0 and $rndkey1;
		# the results are left in $inout0 and $inout1.
		sub aesni_generate2
		{ my $p=shift;

		&function_begin_B("_aesni_${p}rypt2");
		# Load the first round key and fold it into both blocks.
		&$movekey ($rndkey0,&QWP(0,$key));
		# Scale $rounds by 16, matching the 16-byte stride of the key loads.
		&shl ($rounds,4);
		&$movekey ($rndkey1,&QWP(16,$key));
		&xorps ($inout0,$rndkey0);
		&pxor ($inout1,$rndkey0);
		&$movekey ($rndkey0,&QWP(32,$key));
		# Advance $key by 32 plus the scaled round count, then turn $rounds
		# into a negative offset from that point; the loop counts it up
		# towards zero.
		&lea ($key,&DWP(32,$key,$rounds));
		&neg ($rounds);
		&add ($rounds,16);

		# Each iteration applies two rounds to both blocks, alternating
		# between $rndkey1 and $rndkey0 and reloading each key register right
		# after its last use.
		&set_label("${p}2_loop");
		eval"&aes${p} ($inout0,$rndkey1)";
		eval"&aes${p} ($inout1,$rndkey1)";
		&$movekey ($rndkey1,&QWP(0,$key,$rounds));
		&add ($rounds,32);
		eval"&aes${p} ($inout0,$rndkey0)";
		eval"&aes${p} ($inout1,$rndkey0)";
		&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
		# The branch consumes the flags set by the add above.
		# NOTE(review): this relies on the instructions emitted in between
		# ($movekey load, aes${p}) leaving the flags untouched - confirm.
		&jnz (&label("${p}2_loop"));
		# Tail: one more regular round with $rndkey1, then the final
		# aes${p}last round with $rndkey0.
		eval"&aes${p} ($inout0,$rndkey1)";
		eval"&aes${p} ($inout1,$rndkey1)";
		eval"&aes${p}last ($inout0,$rndkey0)";
		eval"&aes${p}last ($inout1,$rndkey0)";
		&ret();
		&function_end_B("_aesni_${p}rypt2");
		}

		sub aesni_generate3
		{ my $p=shift;
		@@ -357,6 +390,8 @@ sub aesni_generate6
		&ret();
		&function_end_B("_aesni_${p}rypt6");
		}
		# Instantiate the interleaved helpers. The "enc" variants are emitted
		# only when $PREFIX is "aesni"; the "dec" variants of the 2x and 3x
		# helpers shown here are emitted unconditionally.
		&aesni_generate2("enc") if ($PREFIX eq "aesni");
		&aesni_generate2("dec");
		&aesni_generate3("enc") if ($PREFIX eq "aesni");
		&aesni_generate3("dec");
		&aesni_generate4("enc") if ($PREFIX eq "aesni");
		@@ -460,8 +495,7 @@ if ($PREFIX eq "aesni") {
		&jmp (&label("ecb_ret"));

		&set_label("ecb_enc_two",16);
		&xorps ($inout2,$inout2);
		&call ("_aesni_encrypt3");
		&call ("_aesni_encrypt2");
		&movups (&QWP(0,$out),$inout0);
		&movups (&QWP(0x10,$out),$inout1);
		&jmp (&label("ecb_ret"));
		@@ -561,8 +595,7 @@ if ($PREFIX eq "aesni") {
		&jmp (&label("ecb_ret"));

		&set_label("ecb_dec_two",16);
		&xorps ($inout2,$inout2);
		&call ("_aesni_decrypt3");
		&call ("_aesni_decrypt2");
		&movups (&QWP(0,$out),$inout0);
		&movups (&QWP(0x10,$out),$inout1);
		&jmp (&label("ecb_ret"));
		@@ -982,7 +1015,7 @@ if ($PREFIX eq "aesni") {
		&jmp (&label("ctr32_ret"));

		&set_label("ctr32_two",16);
		&call ("_aesni_encrypt3");
		&call ("_aesni_encrypt2");
		&movups ($inout3,&QWP(0,$inp));
		&movups ($inout4,&QWP(0x10,$inp));
		&xorps ($inout0,$inout3);
		@@ -1253,9 +1286,8 @@ if ($PREFIX eq "aesni") {
		&lea ($inp,&DWP(16*2,$inp));
		&xorps ($inout0,$inout3); # input^=tweak
		&xorps ($inout1,$inout4);
		&xorps ($inout2,$inout2);

		&call ("_aesni_encrypt3");
		&call ("_aesni_encrypt2");

		&xorps ($inout0,$inout3); # output^=tweak
		&xorps ($inout1,$inout4);
		@@ -1596,7 +1628,7 @@ if ($PREFIX eq "aesni") {
		&xorps ($inout0,$inout3); # input^=tweak
		&xorps ($inout1,$inout4);

		&call ("_aesni_decrypt3");
		&call ("_aesni_decrypt2");

		&xorps ($inout0,$inout3); # output^=tweak
		&xorps ($inout1,$inout4);
		@@ -1896,8 +1928,7 @@ if ($PREFIX eq "aesni") {
		&jmp (&label("cbc_dec_tail_collected"));

		&set_label("cbc_dec_two",16);
		&xorps ($inout2,$inout2);
		&call ("_aesni_decrypt3");
		&call ("_aesni_decrypt2");
		&xorps ($inout0,$ivec);
		&xorps ($inout1,$in0);
		&movups (&QWP(0,$out),$inout0);

crypto/aes/asm/aesni-x86_64.pl

+50 −12

Original line number	Diff line number	Diff line
		@@ -288,10 +288,49 @@ ___
		# every 2nd cycle. Thus 3x interleave was the one providing optimal
		# utilization, i.e. when subroutine's throughput is virtually same as
		# of non-interleaved subroutine [for number of input blocks up to 3].
		# This is why it makes no sense to implement 2x subroutine.
		# aes[enc|dec] latency in next processor generation is 8, but the
		# instructions can be scheduled every cycle. Optimal interleave for
		# new processor is therefore 8x...
		# This is why it originally made no sense to implement 2x subroutine.
		# But times change and it became appropriate to spend extra 192 bytes
		# on 2x subroutine on account of Atom Silvermont. For processors that
		# can schedule aes[enc|dec] every cycle, the optimal interleave factor
		# equals the corresponding instruction's latency. 8x is optimal for
		# * Bridge and "super-optimal" for other Intel CPUs...

		# Appends to $code the assembly text of the 2x-interleaved helper
		# "_aesni_${dir}rypt2", which pushes two blocks ($inout0 and $inout1)
		# through the aes${dir} rounds side by side.
		#
		# Argument:
		#   $dir - instruction/name infix; the callers in this file pass "enc"
		#          or "dec", producing _aesni_encrypt2 or _aesni_decrypt2.
		sub aesni_generate2 {
		my $dir=shift;
		# As already mentioned it takes in $key and $rounds, which are not
		# preserved. $inout[0-1] is cipher/clear text...
		#
		# Shape of the template below: the first round key is XORed into both
		# blocks, $rounds is scaled by 16 and negated so it can serve as an
		# offset from the advanced $key pointer, and the loop then applies two
		# rounds per iteration, alternating $rndkey1 and $rndkey0, until the
		# "add \$32" brings the counter to zero. One more aes${dir} round and
		# the aes${dir}last round follow the loop.
		#
		# NOTE(review): the template scales the counter through $rounds but
		# negates and indexes it as %rax; this presumes $rounds names the same
		# register (its 32-bit view) - confirm against the register
		# assignments at the top of the file.
		$code.=<<___;
		.type _aesni_${dir}rypt2,\@abi-omnipotent
		.align 16
		_aesni_${dir}rypt2:
		$movkey ($key),$rndkey0
		shl \$4,$rounds
		$movkey 16($key),$rndkey1
		xorps $rndkey0,$inout0
		xorps $rndkey0,$inout1
		$movkey 32($key),$rndkey0
		lea 32($key,$rounds),$key
		neg %rax # $rounds
		add \$16,%rax

		.L${dir}_loop2:
		aes${dir} $rndkey1,$inout0
		aes${dir} $rndkey1,$inout1
		$movkey ($key,%rax),$rndkey1
		add \$32,%rax
		aes${dir} $rndkey0,$inout0
		aes${dir} $rndkey0,$inout1
		$movkey -16($key,%rax),$rndkey0
		jnz .L${dir}_loop2

		aes${dir} $rndkey1,$inout0
		aes${dir} $rndkey1,$inout1
		aes${dir}last $rndkey0,$inout0
		aes${dir}last $rndkey0,$inout1
		ret
		.size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2
		___
		}
		sub aesni_generate3 {
		my $dir=shift;
		# As already mentioned it takes in $key and $rounds, which are not
		@@ -524,6 +563,8 @@ _aesni_${dir}rypt8:
		.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
		___
		}
		# Instantiate the interleaved helpers. The "enc" variants are emitted
		# only when $PREFIX is "aesni"; the "dec" variants of the 2x and 3x
		# helpers shown here are emitted unconditionally.
		&aesni_generate2("enc") if ($PREFIX eq "aesni");
		&aesni_generate2("dec");
		&aesni_generate3("enc") if ($PREFIX eq "aesni");
		&aesni_generate3("dec");
		&aesni_generate4("enc") if ($PREFIX eq "aesni");
		@@ -645,8 +686,7 @@ $code.=<<___;
		jmp .Lecb_ret
		.align 16
		.Lecb_enc_two:
		xorps $inout2,$inout2
		call _aesni_encrypt3
		call _aesni_encrypt2
		movups $inout0,($out)
		movups $inout1,0x10($out)
		jmp .Lecb_ret
		@@ -782,8 +822,7 @@ $code.=<<___;
		jmp .Lecb_ret
		.align 16
		.Lecb_dec_two:
		xorps $inout2,$inout2
		call _aesni_decrypt3
		call _aesni_decrypt2
		movups $inout0,($out)
		movups $inout1,0x10($out)
		jmp .Lecb_ret
		@@ -1875,7 +1914,7 @@ $code.=<<___;
		xorps @tweak[0],$inout0
		xorps @tweak[1],$inout1

		call _aesni_encrypt3
		call _aesni_encrypt2

		xorps @tweak[0],$inout0
		movdqa @tweak[2],@tweak[0]
		@@ -2322,7 +2361,7 @@ $code.=<<___;
		xorps @tweak[0],$inout0
		xorps @tweak[1],$inout1

		call _aesni_decrypt3
		call _aesni_decrypt2

		xorps @tweak[0],$inout0
		movdqa @tweak[2],@tweak[0]
		@@ -2831,8 +2870,7 @@ $code.=<<___;
		.align 16
		.Lcbc_dec_two:
		movaps $inout1,$in1
		xorps $inout2,$inout2
		call _aesni_decrypt3
		call _aesni_decrypt2
		pxor $iv,$inout0
		movaps $in1,$iv
		pxor $in0,$inout1