Commit 214368ff authored by Andy Polyakov's avatar Andy Polyakov
Browse files

aes/asm/aesni-x86[_64].pl: minor Atom-specific performance tweak.

parent 47739161
Loading
Loading
Loading
Loading
+47 −16
Original line number Diff line number Diff line
@@ -207,12 +207,45 @@ sub aesni_generate1 # fully unrolled loop
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it makes no sense to implement 2x subroutine.
# aes[enc|dec] latency in next processor generation is 8, but the
# instructions can be scheduled every cycle. Optimal interleave for
# new processor is therefore 8x, but it's unfeasible to accommodate it
# in XMM registers addreassable in 32-bit mode and therefore 6x is
# used instead...
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge, but it's unfeasible to accommodate such implementation
# in XMM registers addreassable in 32-bit mode and therefore maximum
# of 6x is used instead...

sub aesni_generate2
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt2");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}2_loop");
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	eval"&aes${p}	($inout0,$rndkey0)";
	eval"&aes${p}	($inout1,$rndkey0)";
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}2_loop"));
    eval"&aes${p}	($inout0,$rndkey1)";
    eval"&aes${p}	($inout1,$rndkey1)";
    eval"&aes${p}last	($inout0,$rndkey0)";
    eval"&aes${p}last	($inout1,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt2");
}

sub aesni_generate3
{ my $p=shift;
@@ -357,6 +390,8 @@ sub aesni_generate6
    &ret();
    &function_end_B("_aesni_${p}rypt6");
}
&aesni_generate2("enc") if ($PREFIX eq "aesni");
&aesni_generate2("dec");
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
@@ -460,8 +495,7 @@ if ($PREFIX eq "aesni") {
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_two",16);
	&xorps	($inout2,$inout2);
	&call	("_aesni_encrypt3");
	&call	("_aesni_encrypt2");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ecb_ret"));
@@ -561,8 +595,7 @@ if ($PREFIX eq "aesni") {
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_two",16);
	&xorps	($inout2,$inout2);
	&call	("_aesni_decrypt3");
	&call	("_aesni_decrypt2");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ecb_ret"));
@@ -982,7 +1015,7 @@ if ($PREFIX eq "aesni") {
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_two",16);
	&call	("_aesni_encrypt3");
	&call	("_aesni_encrypt2");
	&movups	($inout3,&QWP(0,$inp));
	&movups	($inout4,&QWP(0x10,$inp));
	&xorps	($inout0,$inout3);
@@ -1253,9 +1286,8 @@ if ($PREFIX eq "aesni") {
	&lea	($inp,&DWP(16*2,$inp));
	&xorps	($inout0,$inout3);		# input^=tweak
	&xorps	($inout1,$inout4);
	&xorps	($inout2,$inout2);

	&call	("_aesni_encrypt3");
	&call	("_aesni_encrypt2");

	&xorps	($inout0,$inout3);		# output^=tweak
	&xorps	($inout1,$inout4);
@@ -1596,7 +1628,7 @@ if ($PREFIX eq "aesni") {
	&xorps	($inout0,$inout3);		# input^=tweak
	&xorps	($inout1,$inout4);

	&call	("_aesni_decrypt3");
	&call	("_aesni_decrypt2");

	&xorps	($inout0,$inout3);		# output^=tweak
	&xorps	($inout1,$inout4);
@@ -1896,8 +1928,7 @@ if ($PREFIX eq "aesni") {
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_two",16);
	&xorps	($inout2,$inout2);
	&call	("_aesni_decrypt3");
	&call	("_aesni_decrypt2");
	&xorps	($inout0,$ivec);
	&xorps	($inout1,$in0);
	&movups	(&QWP(0,$out),$inout0);
+50 −12
Original line number Diff line number Diff line
@@ -288,10 +288,49 @@ ___
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it makes no sense to implement 2x subroutine.
# aes[enc|dec] latency in next processor generation is 8, but the
# instructions can be scheduled every cycle. Optimal interleave for
# new processor is therefore 8x...
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge and "super-optimal" for other Intel CPUs... 

sub aesni_generate2 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-1] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt2,\@abi-omnipotent
.align	16
_aesni_${dir}rypt2:
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	add	\$16,%rax

.L${dir}_loop2:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop2

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	ret
.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
___
}
sub aesni_generate3 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
@@ -524,6 +563,8 @@ _aesni_${dir}rypt8:
.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
___
}
&aesni_generate2("enc") if ($PREFIX eq "aesni");
&aesni_generate2("dec");
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
@@ -645,8 +686,7 @@ $code.=<<___;
	jmp	.Lecb_ret
.align	16
.Lecb_enc_two:
	xorps	$inout2,$inout2
	call	_aesni_encrypt3
	call	_aesni_encrypt2
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	jmp	.Lecb_ret
@@ -782,8 +822,7 @@ $code.=<<___;
	jmp	.Lecb_ret
.align	16
.Lecb_dec_two:
	xorps	$inout2,$inout2
	call	_aesni_decrypt3
	call	_aesni_decrypt2
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	jmp	.Lecb_ret
@@ -1875,7 +1914,7 @@ $code.=<<___;
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1

	call	_aesni_encrypt3
	call	_aesni_encrypt2

	xorps	@tweak[0],$inout0
	movdqa	@tweak[2],@tweak[0]
@@ -2322,7 +2361,7 @@ $code.=<<___;
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1

	call	_aesni_decrypt3
	call	_aesni_decrypt2

	xorps	@tweak[0],$inout0
	movdqa	@tweak[2],@tweak[0]
@@ -2831,8 +2870,7 @@ $code.=<<___;
.align	16
.Lcbc_dec_two:
	movaps	$inout1,$in1
	xorps	$inout2,$inout2
	call	_aesni_decrypt3
	call	_aesni_decrypt2
	pxor	$iv,$inout0
	movaps	$in1,$iv
	pxor	$in0,$inout1