Commit 1cf8f57b authored by Andy Polyakov's avatar Andy Polyakov
Browse files

ghash-x86_64.pl: optimize for upcoming Atom.

parent 619b9466
Loading
Loading
Loading
Loading
+24 −26
Original line number Diff line number Diff line
@@ -59,11 +59,11 @@
# longer. A CPU with higher pclmulqdq issue rate would also benefit
# from higher aggregate factor...
#
# Westmere	1.76(+14%)
# Sandy Bridge	1.79(+9%)
# Ivy Bridge	1.79(+8%)
# Westmere	1.78(+13%)
# Sandy Bridge	1.80(+8%)
# Ivy Bridge	1.80(+7%)
# Haswell	0.55(+93%) (if system doesn't support AVX)
# Bulldozer	1.52(+25%)
# Bulldozer	1.49(+27%)

# March 2013
#
@@ -673,8 +673,8 @@ $code.=<<___;
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey2,$Xl
	pclmulqdq	\$0x11,$Hkey2,$Xh
	xorps		$Xl,$Xln
	pclmulqdq	\$0x10,$HK,$Xm
	xorps		$Xl,$Xln
	xorps		$Xh,$Xhn
	movups		0x50($Htbl),$HK
	xorps		$Xm,$Xmn
@@ -692,8 +692,8 @@ $code.=<<___;
	 pshufd		\$0b01001110,$Xi,$T1
	 pxor		$Xi,$T1
	pclmulqdq	\$0x11,$Hkey3,$Xh
	xorps		$Xl,$Xln
	pclmulqdq	\$0x00,$HK,$Xm
	xorps		$Xl,$Xln
	xorps		$Xh,$Xhn

	lea	0x40($inp),$inp
@@ -711,23 +711,23 @@ $code.=<<___;
	xorps		$Xln,$Xi
	 movdqu		0x20($inp),$Xln
	 movdqa		$Xl,$Xh
	 pshufd		\$0b01001110,$Xl,$Xm
	pclmulqdq	\$0x10,$HK,$T1
	 pshufd		\$0b01001110,$Xl,$Xm
	xorps		$Xhn,$Xhi
	 pxor		$Xl,$Xm
	 pshufb		$T3,$Xln
	movups		0x20($Htbl),$HK
	 pclmulqdq	\$0x00,$Hkey,$Xl
	xorps		$Xmn,$T1
	 movdqa		$Xln,$Xhn
	 pclmulqdq	\$0x00,$Hkey,$Xl
	 pshufd		\$0b01001110,$Xln,$Xmn

	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	 pxor		$Xln,$Xmn
	 movdqa		$Xln,$Xhn
	pxor		$Xhi,$T1		#
	 pxor		$Xln,$Xmn
	movdqa		$T1,$T2			#
	pslldq		\$8,$T1
	 pclmulqdq	\$0x11,$Hkey,$Xh
	pslldq		\$8,$T1
	psrldq		\$8,$T2			#
	pxor		$T1,$Xi
	movdqa		.L7_mask(%rip),$T1
@@ -736,8 +736,8 @@ $code.=<<___;

	pand		$Xi,$T1			# 1st phase
	pshufb		$T1,$T2			#
	 pclmulqdq	\$0x00,$HK,$Xm
	pxor		$Xi,$T2			#
	 pclmulqdq	\$0x00,$HK,$Xm
	psllq		\$57,$T2		#
	movdqa		$T2,$T1			#
	pslldq		\$8,$T2
@@ -764,32 +764,31 @@ $code.=<<___;
	 movdqa		$Xl,$Xh
	 pxor		$Xm,$Xmn
	 pshufd		\$0b01001110,$Xl,$Xm
	 pxor		$Xl,$Xm
	 pclmulqdq	\$0x00,$Hkey3,$Xl
	pxor		$T2,$Xi			#
	pxor		$T1,$Xhi
	 pxor		$Xl,$Xm
	 pclmulqdq	\$0x00,$Hkey3,$Xl
	psrlq		\$1,$Xi			#
	pxor		$Xhi,$Xi		#
	movdqa		$Xi,$Xhi
	 pclmulqdq	\$0x11,$Hkey3,$Xh
	 xorps		$Xl,$Xln
	pxor		$Xhi,$Xi		#
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1

	 pclmulqdq	\$0x00,$HK,$Xm
	 xorps		$Xh,$Xhn

	movdqa		$Xi,$Xhi
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1

	lea	0x40($inp),$inp
	sub	\$0x40,$len
	jnc	.Lmod4_loop

.Ltail4x:
	pclmulqdq	\$0x00,$Hkey4,$Xi
	xorps		$Xm,$Xmn
	pclmulqdq	\$0x11,$Hkey4,$Xhi
	xorps		$Xln,$Xi
	pclmulqdq	\$0x10,$HK,$T1
	xorps		$Xm,$Xmn
	xorps		$Xln,$Xi
	xorps		$Xhn,$Xhi
	pxor		$Xi,$Xhi		# aggregated Karatsuba post-processing
	pxor		$Xmn,$T1
@@ -852,13 +851,13 @@ $code.=<<___;

	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor		$Xhn,$Xhi
	  movdqu	($inp),$Xhn		# Ii
	  movdqu	($inp),$T2		# Ii
	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	  pshufb	$T3,$Xhn
	  pshufb	$T3,$T2
	  movdqu	16($inp),$Xln		# Ii+1

	pxor		$Xhi,$T1
	  pxor		$Xhn,$Xhi		# "Ii+Xi", consume early
	  pxor		$T2,$Xhi		# "Ii+Xi", consume early
	pxor		$T1,$Xmn
	 pshufb		$T3,$Xln
	movdqa		$Xmn,$T1		#
@@ -885,9 +884,9 @@ $code.=<<___;
	  pxor		$T1,$Xhi		#
	pxor		$Xhn,$Xmn		#

	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
	  movdqa	$Xi,$T2			# 2nd phase
	  psrlq		\$1,$Xi
	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
	  pxor		$T2,$Xhi		#
	  pxor		$Xi,$T2
	  psrlq		\$5,$Xi
@@ -896,7 +895,6 @@ $code.=<<___;
	  psrlq		\$1,$Xi			#
	pclmulqdq	\$0x00,$HK,$Xmn		#######
	  pxor		$Xhi,$Xi		#
	  .byte		0x66,0x90

	sub		\$0x20,$len
	ja		.Lmod_loop