Commit 5599c733 authored by Andy Polyakov
Browse files

aes/asm/aesni-x86_64.pl: further optimization for Atom Silvermont.

Improve CBC decrypt and CTR by ~13/16%, which adds up to ~25/33%
improvement over "pre-Silvermont" version. [Add performance table to
aesni-x86.pl].
parent 385b3486
Loading
Loading
Loading
Loading
+11 −0
Original line number Diff line number Diff line
@@ -43,6 +43,17 @@
# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.

######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#		CBC en-/decrypt	CTR	XTS	ECB
# Westmere	3.77/1.37	1.37	1.52	1.27
# * Bridge	5.07/0.98	0.99	1.09	0.91
# Haswell	4.44/0.80	0.97	1.03	0.72
# Atom		5.77/3.56	3.67	4.03	3.46
# Bulldozer	5.80/0.98	1.05	1.24	0.93

$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-586.pl:-)
+168 −21
Original line number Diff line number Diff line
@@ -158,25 +158,19 @@
# in CTR mode AES instruction interleave factor was chosen to be 6x.

######################################################################
# For reference, AMD Bulldozer spends 5.77 cycles per byte processed
# with 128-bit key in CBC encrypt and 0.70 cycles in CBC decrypt, 0.70
# in ECB, 0.71 in CTR, 0.90 in XTS... This means that aes[enc|dec]
# instruction latency is 9 cycles and that they can be issued every
# cycle.

######################################################################
# Haswell spends 4.44 cycles per byte in CBC encrypt, 0.63 in CBC
# decrypt, CTR and ECB, 0.73 in XTS.

######################################################################
# Atom Silvermont spends 5.77/4.0 cycles per byte in CBC en-/decrypt,
# 3.87 in ECB, 4.15 in CTR, 4.12 in XTS. Results for parallelizable
# modes [other than XTS] are actually suboptimal, because of penalties
# incurred by operations on %xmm8-15, which are inevitable with such
# high instruction interleave factors. This means that performance can
# be improved by decreasing the interleave factor, but then it would
# negatively affect other platforms to a relatively larger degree.
# Run-time detection would solve the dilemma...
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#		CBC en-/decrypt	CTR	XTS	ECB
# Westmere	3.77/1.25	1.25	1.25	1.26
# * Bridge	5.07/0.74	0.75	0.90	0.85
# Haswell	4.44/0.63	0.63	0.73	0.63
# Atom		5.75/3.54	3.56	4.12	3.87(*)
# Bulldozer	5.77/0.70	0.72	0.90	0.70
#
# (*)	Atom ECB result is suboptimal because of penalties incurred
#	by operations on %xmm8-15. As ECB is not considered
#	critical, nothing was done to mitigate the problem.

$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
@@ -201,6 +195,7 @@ $movkey = $PREFIX eq "aesni" ? "movups" : "movups";
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

$code=".text\n";
$code.=".extern	OPENSSL_ia32cap_P\n";

$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
@@ -1119,7 +1114,9 @@ $code.=<<___;
	lea	7($ctr),%r9
	 mov	%r10d,0x60+12(%rsp)
	bswap	%r9d
	 mov	OPENSSL_ia32cap_P+4(%rip),%r10d 
	xor	$key0,%r9d
	 and	\$`1<<26|1<<22`,%r10d		# isolate XSAVE+MOVBE
	mov	%r9d,0x70+12(%rsp)

	$movkey	0x10($key),$rndkey1
@@ -1130,10 +1127,104 @@ $code.=<<___;
	cmp	\$8,$len
	jb	.Lctr32_tail

	sub	\$6,$len
	cmp	\$`1<<22`,%r10d		# check for MOVBE without XSAVE
	je	.Lctr32_6x

	lea	0x80($key),$key		# size optimization
	sub	\$8,$len
	sub	\$2,$len
	jmp	.Lctr32_loop8

.align	16
.Lctr32_6x:
	shl	\$4,$rounds
	mov	\$48,$rnds_
	bswap	$key0
	lea	32($key,$rounds),$key	# end of key schedule
	sub	%rax,%r10		# twisted $rounds
	jmp	.Lctr32_loop6

.align	16
.Lctr32_loop6:
	 add	\$6,$ctr
	$movkey	-48($key,$rnds_),$rndkey0
	aesenc	$rndkey1,$inout0
	 mov	$ctr,%eax
	 xor	$key0,%eax
	aesenc	$rndkey1,$inout1
	 movbe	%eax,`0x00+12`(%rsp)
	 lea	1($ctr),%eax
	aesenc	$rndkey1,$inout2
	 xor	$key0,%eax
	 movbe	%eax,`0x10+12`(%rsp)
	aesenc	$rndkey1,$inout3
	 lea	2($ctr),%eax
	 xor	$key0,%eax
	aesenc	$rndkey1,$inout4
	 movbe	%eax,`0x20+12`(%rsp)
	 lea	3($ctr),%eax
	aesenc	$rndkey1,$inout5
	$movkey	-32($key,$rnds_),$rndkey1
	 xor	$key0,%eax

	aesenc	$rndkey0,$inout0
	 movbe	%eax,`0x30+12`(%rsp)
	 lea	4($ctr),%eax
	aesenc	$rndkey0,$inout1
	 xor	$key0,%eax
	 movbe	%eax,`0x40+12`(%rsp)
	aesenc	$rndkey0,$inout2
	 lea	5($ctr),%eax
	 xor	$key0,%eax
	aesenc	$rndkey0,$inout3
	 movbe	%eax,`0x50+12`(%rsp)
	 mov	%r10,%rax		# mov	$rnds_,$rounds
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	$movkey	-16($key,$rnds_),$rndkey0

	call	.Lenc_loop6

	movdqu	($inp),$inout6
	movdqu	0x10($inp),$inout7
	movdqu	0x20($inp),$in0
	movdqu	0x30($inp),$in1
	movdqu	0x40($inp),$in2
	movdqu	0x50($inp),$in3
	lea	0x60($inp),$inp
	$movkey	-64($key,$rnds_),$rndkey1
	pxor	$inout0,$inout6
	movaps	0x00(%rsp),$inout0
	pxor	$inout1,$inout7
	movaps	0x10(%rsp),$inout1
	pxor	$inout2,$in0
	movaps	0x20(%rsp),$inout2
	pxor	$inout3,$in1
	movaps	0x30(%rsp),$inout3
	pxor	$inout4,$in2
	movaps	0x40(%rsp),$inout4
	pxor	$inout5,$in3
	movaps	0x50(%rsp),$inout5
	movdqu	$inout6,($out)
	movdqu	$inout7,0x10($out)
	movdqu	$in0,0x20($out)
	movdqu	$in1,0x30($out)
	movdqu	$in2,0x40($out)
	movdqu	$in3,0x50($out)
	lea	0x60($out),$out
	
	sub	\$6,$len
	jnc	.Lctr32_loop6

	add	\$6,$len
	jz	.Lctr32_done

	lea	-48($rnds_),$rounds
	lea	-80($key,$rnds_),$key	# restore $key
	neg	$rounds
	shr	\$4,$rounds		# restore $rounds
	jmp	.Lctr32_tail

.align	32
.Lctr32_loop8:
	 add		\$8,$ctr
@@ -2455,10 +2546,15 @@ $code.=<<___;
	movdqa	$inout3,$in3
	movdqu	0x50($inp),$inout5
	movdqa	$inout4,$in4
	mov	OPENSSL_ia32cap_P+4(%rip),%r9d
	cmp	\$0x70,$len
	jbe	.Lcbc_dec_six_or_seven

	sub	\$0x70,$len
	and	\$`1<<26|1<<22`,%r9d	# isolate XSAVE+MOVBE	
	sub	\$0x50,$len
	cmp	\$`1<<22`,%r9d		# check for MOVBE without XSAVE
	je	.Lcbc_dec_loop6_enter
	sub	\$0x20,$len
	lea	0x70($key),$key		# size optimization
	jmp	.Lcbc_dec_loop8_enter
.align	16
@@ -2638,6 +2734,51 @@ $code.=<<___;
	movdqa	$inout6,$inout0
	jmp	.Lcbc_dec_tail_collected

.align	16
.Lcbc_dec_loop6:
	movups	$inout5,($out)
	lea	0x10($out),$out
	movdqu	0x00($inp),$inout0	# load input
	movdqu	0x10($inp),$inout1
	movdqa	$inout0,$in0
	movdqu	0x20($inp),$inout2
	movdqa	$inout1,$in1
	movdqu	0x30($inp),$inout3
	movdqa	$inout2,$in2
	movdqu	0x40($inp),$inout4
	movdqa	$inout3,$in3
	movdqu	0x50($inp),$inout5
	movdqa	$inout4,$in4
.Lcbc_dec_loop6_enter:
	lea	0x60($inp),$inp
	movdqa	$inout5,$inout6

	call	_aesni_decrypt6

	pxor	$iv,$inout0		# ^= IV
	movdqa	$inout6,$iv
	pxor	$in0,$inout1
	movdqu	$inout0,($out)
	pxor	$in1,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$in2,$inout3
	movdqu	$inout2,0x20($out)
	pxor	$in3,$inout4
	mov	$key_,$key
	movdqu	$inout3,0x30($out)
	pxor	$in4,$inout5
	mov	$rnds_,$rounds
	movdqu	$inout4,0x40($out)
	lea	0x50($out),$out
	sub	\$0x60,$len
	ja	.Lcbc_dec_loop6

	movdqa	$inout5,$inout0
	add	\$0x50,$len
	jle	.Lcbc_dec_tail_collected
	movups	$inout5,($out)
	lea	0x10($out),$out

.Lcbc_dec_tail:
	movups	($inp),$inout0
	sub	\$0x10,$len
@@ -3360,8 +3501,14 @@ sub aesni {
    return $line;
}

# Hand-encode "movbe %eax,disp8(%rsp)" as a raw .byte directive
# (presumably so the output still assembles with tools that do not
# know the MOVBE mnemonic -- confirm against supported assemblers).
#
# Argument: the displacement from %rsp, appended verbatim as the last
# byte of the directive.
# Returns:  the .byte line as a string.
#
# Encoding:
#   0x0f,0x38,0xf1	MOVBE, register-to-memory form
#   0x44		ModRM: mod=01 (8-bit displacement), reg=%eax,
#			r/m=100 (SIB byte follows)
#   0x24		SIB: base=%rsp, no index
# Because the ModRM byte selects an 8-bit displacement, the argument
# has to fit in a signed byte; no range check is performed here.
sub movbe {
	my ($disp) = @_;
	return ".byte	0x0f,0x38,0xf1,0x44,0x24,$disp";
}

# Post-process the accumulated assembly text in $code, then emit it.
# The order of the substitutions matters: backtick expressions are
# evaluated first, so the movbe rewrite further down sees plain decimal
# displacements (it only matches [0-9]+ in front of "(%rsp)").

# Replace every `...` span with the result of eval'ing its contents
# as a Perl expression.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Hand each aes* instruction, captured up to the last %xmmN operand on
# the line, to aesni() and substitute its return value; whatever follows
# that operand on the same line is dropped.
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm;	# debugging artefact
# Replace "movbe %eax,N(%rsp)" (N a decimal number) with the string
# returned by movbe(N).
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;

# Write the finished assembly text to the currently selected output handle.
print $code;