Commit 558ff0f0 authored by Andy Polyakov
Browse files

aes/asm/bsaes-x86_64.pl: Atom-specific optimization.

parent 94d1f4b0
Loading
Loading
Loading
Loading
+32 −40
Original line number Diff line number Diff line
@@ -38,8 +38,8 @@
#		Emilia's	this(*)		difference
#
# Core 2    	9.30		8.69		+7%
# Nehalem(**) 	7.63		6.98		+9%
# Atom	    	17.1		17.4		-2%(***)
# Nehalem(**) 	7.63		6.88		+11%
# Atom	    	17.1		16.4		+4%
#
# (*)	The comparison is not completely fair, because "this" is ECB,
#	i.e. no extra processing such as counter value calculation
@@ -50,14 +50,6 @@
# (**)	Results were collected on Westmere, which is considered to
#	be equivalent to Nehalem for this code.
#
# (***)	The slowdown on Atom is rather strange per se, because the
#	original implementation has a number of 9+-byte instructions,
#	which are bad for the Atom front-end, and which I eliminated
#	completely. In an attempt to address the deterioration, sbox()
#	was tested in the FP SIMD "domain" (movaps instead of movdqa,
#	xorps instead of pxor, etc.). While it resulted in a nominal 4%
#	improvement on Atom, it hurt Westmere by more than a factor of 2.
#
# As for the key schedule conversion subroutine: the interface to
# OpenSSL relies on per-invocation on-the-fly conversion. This naturally
# has an impact on performance, especially for short inputs. Conversion
@@ -67,7 +59,7 @@
# 		conversion	conversion/8x block
# Core 2	240		0.22
# Nehalem	180		0.20
# Atom		430		0.19
# Atom		430		0.20
#
# The ratio values mean that 128-byte blocks will be processed
# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
@@ -83,9 +75,9 @@
# Add decryption procedure. Performance in CPU cycles spent to decrypt
# one byte out of a 4096-byte buffer with a 128-bit key is:
#
# Core 2	9.83
# Nehalem	7.74
# Atom		19.0
# Core 2	9.98
# Nehalem	7.80
# Atom		17.9
#
# November 2011.
#
@@ -434,21 +426,21 @@ my $mask=pop;
$code.=<<___;
	pxor	0x00($key),@x[0]
	pxor	0x10($key),@x[1]
	pshufb	$mask,@x[0]
	pxor	0x20($key),@x[2]
	pshufb	$mask,@x[1]
	pxor	0x30($key),@x[3]
	pshufb	$mask,@x[2]
	pshufb	$mask,@x[0]
	pshufb	$mask,@x[1]
	pxor	0x40($key),@x[4]
	pshufb	$mask,@x[3]
	pxor	0x50($key),@x[5]
	pshufb	$mask,@x[4]
	pshufb	$mask,@x[2]
	pshufb	$mask,@x[3]
	pxor	0x60($key),@x[6]
	pshufb	$mask,@x[5]
	pxor	0x70($key),@x[7]
	pshufb	$mask,@x[4]
	pshufb	$mask,@x[5]
	pshufb	$mask,@x[6]
	lea	0x80($key),$key
	pshufb	$mask,@x[7]
	lea	0x80($key),$key
___
}

@@ -820,18 +812,18 @@ _bsaes_encrypt8:
	movdqa	0x50($const), @XMM[8]	# .LM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	 pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	 pshufb	@XMM[8], @XMM[2]
	 pshufb	@XMM[8], @XMM[0]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	 pshufb	@XMM[8], @XMM[4]
	 pshufb	@XMM[8], @XMM[2]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	 pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	 pshufb	@XMM[8], @XMM[4]
	 pshufb	@XMM[8], @XMM[5]
	 pshufb	@XMM[8], @XMM[6]
	 pshufb	@XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
@@ -884,18 +876,18 @@ _bsaes_decrypt8:
	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	 pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	 pshufb	@XMM[8], @XMM[2]
	 pshufb	@XMM[8], @XMM[0]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	 pshufb	@XMM[8], @XMM[4]
	 pshufb	@XMM[8], @XMM[2]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	 pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	 pshufb	@XMM[8], @XMM[4]
	 pshufb	@XMM[8], @XMM[5]
	 pshufb	@XMM[8], @XMM[6]
	 pshufb	@XMM[8], @XMM[7]
___
@@ -1937,21 +1929,21 @@ $code.=<<___;
	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	 pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	 pshufb	@XMM[8], @XMM[2]
	 pshufb	@XMM[8], @XMM[0]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	 pshufb	@XMM[8], @XMM[4]
	 pshufb	@XMM[8], @XMM[2]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	 pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	 pshufb	@XMM[8], @XMM[4]
	 pshufb	@XMM[8], @XMM[5]
	 pshufb	@XMM[8], @XMM[6]
	lea	.LBS0(%rip), %r11	# constants table
	 pshufb	@XMM[8], @XMM[7]
	lea	.LBS0(%rip), %r11	# constants table
	mov	%ebx,%r10d		# pass rounds

	call	_bsaes_encrypt8_bitslice