Commit 988d11b6 authored by Andy Polyakov
Browse files

vpaes-x86[_64].pl: minor Atom-specific optimization.

parent 8a97a330
Loading
Loading
Loading
Loading
+23 −23
Original line number Diff line number Diff line
@@ -29,7 +29,7 @@
#
# Core 2(**)	28.1/41.4/18.3		21.9/25.2(***)
# Nehalem	27.9/40.4/18.1		10.2/11.9
# Atom		70.7/92.1/60.1		61.1/81.0(***)
# Atom		70.7/92.1/60.1		61.1/75.4(***)
#
# (*)	"Hyper-threading" in the context refers rather to cache shared
#	among multiple cores, than to specifically Intel HTT. As vast
@@ -295,43 +295,43 @@ $k_dsbo=0x2c0; # decryption sbox final output
	&movdqa	("xmm1",&QWP(-0x10,$base));	# 0 : sb9t
	&pshufb	("xmm4","xmm2");		# 4 = sb9u
	&pshufb	("xmm1","xmm3");		# 0 = sb9t
	&pxor	("xmm4","xmm0");
	&add	($key,16);			# next round key
	&pxor	("xmm1","xmm4");		# 0 = ch

	&pxor	("xmm0","xmm4");
	&movdqa	("xmm4",&QWP(0,$base));		# 4 : sbdu
	&pshufb	("xmm1","xmm5");		# MC ch
	&pshufb	("xmm4","xmm2");		# 4 = sbdu
	&movdqa	("xmm0",&QWP(0x10,$base));	# 0 : sbdt
	&pxor	("xmm4","xmm1");		# 4 = ch
	&pshufb	("xmm0","xmm3");		# 0 = sbdt
	&sub	($round,1);			# nr--
	&pxor	("xmm0","xmm4");		# 0 = ch
	&pxor	("xmm0","xmm1");		# 0 = ch
	&movdqa	("xmm1",&QWP(0x10,$base));	# 0 : sbdt

	&movdqa	("xmm4",&QWP(0x20,$base));	# 4 : sbbu
	&pshufb	("xmm4","xmm2");		# 4 = sbdu
	&pshufb	("xmm0","xmm5");		# MC ch
	&pshufb	("xmm1","xmm3");		# 0 = sbdt
	&pxor	("xmm0","xmm4");		# 4 = ch
	&movdqa	("xmm4",&QWP(0x20,$base));	# 4 : sbbu
	&pxor	("xmm0","xmm1");		# 0 = ch
	&movdqa	("xmm1",&QWP(0x30,$base));	# 0 : sbbt

	&pshufb	("xmm4","xmm2");		# 4 = sbbu
	&pshufb	("xmm0","xmm5");		# MC ch
	&pshufb	("xmm1","xmm3");		# 0 = sbbt
	&pxor	("xmm4","xmm0");		# 4 = ch
	&pxor	("xmm1","xmm4");		# 0 = ch

	&pxor	("xmm0","xmm4");		# 4 = ch
	&movdqa	("xmm4",&QWP(0x40,$base));	# 4 : sbeu
	&pshufb	("xmm1","xmm5");		# MC ch
	&movdqa	("xmm0",&QWP(0x50,$base));	# 0 : sbet
	&pxor	("xmm0","xmm1");		# 0 = ch
	&movdqa	("xmm1",&QWP(0x50,$base));	# 0 : sbet

	&pshufb	("xmm4","xmm2");		# 4 = sbeu
	&pshufb	("xmm0","xmm3");		# 0 = sbet
	&pshufb	("xmm0","xmm5");		# MC ch
	&pshufb	("xmm1","xmm3");		# 0 = sbet
	&pxor	("xmm0","xmm4");		# 4 = ch
	&add	($key,16);			# next round key
	&palignr("xmm5","xmm5",12);
	&pxor	("xmm4","xmm1");		# 4 = ch
	&pxor	("xmm0","xmm4");		# 0 = ch
	&pxor	("xmm0","xmm1");		# 0 = ch
	&sub	($round,1);			# nr--

&set_label("dec_entry");
	# top of round
	&movdqa	("xmm1","xmm6");		# 1 : i
	&pandn	("xmm1","xmm0");		# 1 = i<<4
	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
	&psrld	("xmm1",4);			# 1 = i
	&pandn	("xmm1","xmm0");		# 1 = i<<4
	&pand	("xmm0","xmm6");		# 0 = k
	&psrld	("xmm1",4);			# 1 = i
	&pshufb	("xmm2","xmm0");		# 2 = a/k
	&movdqa	("xmm3","xmm7");		# 3 : 1/i
	&pxor	("xmm0","xmm1");		# 0 = j
+21 −21
Original line number Diff line number Diff line
@@ -29,7 +29,7 @@
#
# Core 2(**)	29.6/41.1/14.3		21.9/25.2(***)
# Nehalem	29.6/40.3/14.6		10.0/11.8
# Atom		57.3/74.2/32.1		60.9/82.3(***)
# Atom		57.3/74.2/32.1		60.9/77.2(***)
#
# (*)	"Hyper-threading" in the context refers rather to cache shared
#	among multiple cores, than to specifically Intel HTT. As vast
@@ -204,35 +204,35 @@ _vpaes_decrypt_core:
	movdqa  -0x10(%r10),%xmm1	# 0 : sb9t
	pshufb	%xmm2,	%xmm4		# 4 = sb9u
	pshufb	%xmm3,	%xmm1		# 0 = sb9t
	pxor	%xmm0,	%xmm4
	add	\$16, %r9		# next round key
	pxor	%xmm4,	%xmm1		# 0 = ch

	pxor	%xmm4,	%xmm0
	movdqa  0x00(%r10),%xmm4	# 4 : sbdu
	pshufb	%xmm5,	%xmm1		# MC ch
	pshufb	%xmm2,	%xmm4		# 4 = sbdu
	movdqa  0x10(%r10),%xmm0	# 0 : sbdt
	pxor	%xmm1,	%xmm4		# 4 = ch
	pshufb	%xmm3,	%xmm0		# 0 = sbdt
	sub	\$1,%rax		# nr--
	pxor	%xmm4,	%xmm0		# 0 = ch
	pxor	%xmm1,	%xmm0		# 0 = ch
	movdqa  0x10(%r10),%xmm1	# 0 : sbdt

	movdqa  0x20(%r10),%xmm4	# 4 : sbbu
	pshufb	%xmm2,	%xmm4		# 4 = sbdu
	pshufb	%xmm5,	%xmm0		# MC ch
	pshufb	%xmm3,	%xmm1		# 0 = sbdt
	pxor	%xmm4,	%xmm0		# 4 = ch
	movdqa  0x20(%r10),%xmm4	# 4 : sbbu
	pxor	%xmm1,	%xmm0		# 0 = ch
	movdqa  0x30(%r10),%xmm1	# 0 : sbbt

	pshufb	%xmm2,	%xmm4		# 4 = sbbu
	pshufb	%xmm5,	%xmm0		# MC ch
	pshufb	%xmm3,	%xmm1		# 0 = sbbt
	pxor	%xmm0,	%xmm4		# 4 = ch
	pxor	%xmm4,	%xmm1		# 0 = ch

	pxor	%xmm4,	%xmm0		# 4 = ch
	movdqa  0x40(%r10),%xmm4	# 4 : sbeu
	pshufb	%xmm5,	%xmm1		# MC ch
	movdqa  0x50(%r10),%xmm0	# 0 : sbet
	pxor	%xmm1,	%xmm0		# 0 = ch
	movdqa  0x50(%r10),%xmm1	# 0 : sbet

	pshufb	%xmm2,	%xmm4		# 4 = sbeu
	pshufb	%xmm3,	%xmm0		# 0 = sbet
	pshufb	%xmm5,	%xmm0		# MC ch
	pshufb	%xmm3,	%xmm1		# 0 = sbet
	pxor	%xmm4,	%xmm0		# 4 = ch
	add	\$16, %r9		# next round key
	palignr	\$12,	%xmm5,	%xmm5
	pxor	%xmm1,	%xmm4		# 4 = ch
	pxor	%xmm4,	%xmm0		# 0 = ch
	pxor	%xmm1,	%xmm0		# 0 = ch
	sub	\$1,%rax		# nr--

.Ldec_entry:
	# top of round