Commit 98e143f1 authored by Andy Polyakov's avatar Andy Polyakov
Browse files

ghash-x86[_64].pl: ~15% improvement on Atom Silvermont

(other processors unaffected).
parent fc923969
Loading
Loading
Loading
Loading
+9 −7
Original line number Diff line number Diff line
@@ -1021,13 +1021,14 @@ my ($Xhi,$Xi) = @_;
	&pshufd		($T1,$Xn,0b01001110);	# H*Ii+1
	&movdqa		($Xhn,$Xn);
	&pxor		($T1,$Xn);		#
	&lea		($inp,&DWP(32,$inp));	# i+=2

	&pclmulqdq	($Xn,$Hkey,0x00);	#######
	&pclmulqdq	($Xhn,$Hkey,0x11);	#######
	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2
	&pclmulqdq	($T1,$T3,0x00);		#######
	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2
	&nop		();

	&lea		($inp,&DWP(32,$inp));	# i+=2
	&sub		($len,0x20);
	&jbe		(&label("even_tail"));
	&jmp		(&label("mod_loop"));
@@ -1036,22 +1037,23 @@ my ($Xhi,$Xi) = @_;
	&pshufd		($T2,$Xi,0b01001110);	# H^2*(Ii+Xi)
	&movdqa		($Xhi,$Xi);
	&pxor		($T2,$Xi);		#
	&nop		();

	&pclmulqdq	($Xi,$Hkey,0x00);	#######
	&pclmulqdq	($Xhi,$Hkey,0x11);	#######
	&movups		($Hkey,&QWP(0,$Htbl));	# load H
	&pclmulqdq	($T2,$T3,0x10);		#######
	&movdqa		($T3,&QWP(0,$const));
	&movups		($Hkey,&QWP(0,$Htbl));	# load H

	&xorps		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
	&movdqa		($T3,&QWP(0,$const));
	&xorps		($Xhi,$Xhn);
	 &movdqu	($Xhn,&QWP(0,$inp));	# Ii
	&pxor		($T1,$Xi);		# aggregated Karatsuba post-processing
	 &movdqu	($Xn,&QWP(16,$inp));	# Ii+1
	&pxor		($T1,$Xhi);		#

	&pxor		($T2,$T1);		#
	 &pshufb	($Xhn,$T3);
	&pxor		($T2,$T1);		#

	&movdqa		($T1,$T2);		#
	&psrldq		($T2,8);
@@ -1068,8 +1070,8 @@ my ($Xhi,$Xi) = @_;
	  &pxor		($T1,$Xi);		#
	  &psllq	($Xi,1);
	  &pxor		($Xi,$T1);		#
	&movups		($T3,&QWP(32,$Htbl));
	&pclmulqdq	($Xn,$Hkey,0x00);	#######
	&movups		($T3,&QWP(32,$Htbl));
	  &psllq	($Xi,57);		#
	  &movdqa	($T1,$Xi);		#
	  &pslldq	($Xi,8);
@@ -1080,9 +1082,9 @@ my ($Xhi,$Xi) = @_;
	  &movdqa	($T2,$Xi);		# 2nd phase
	  &psrlq	($Xi,1);
	&pxor		($T1,$Xhn);
	  &pxor		($Xhi,$T2);		#
	&pclmulqdq	($Xhn,$Hkey,0x11);	#######
	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2
	  &pxor		($Xhi,$T2);		#
	  &pxor		($T2,$Xi);
	  &psrlq	($Xi,5);
	  &pxor		($Xi,$T2);		#
+37 −25
Original line number Diff line number Diff line
@@ -214,6 +214,7 @@ ___

$code=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	gcm_gmult_4bit
.type	gcm_gmult_4bit,\@function,2
@@ -597,7 +598,8 @@ ___
}

{ my ($Xip,$Htbl,$inp,$len)=@_4args;
  my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(6..10));
  my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
  my ($T1,$T2,$T3)=map("%xmm$_",(8..10));

$code.=<<___;
.globl	gcm_ghash_clmul
@@ -624,7 +626,6 @@ $code.=<<___ if ($win64);
___
$code.=<<___;
	movdqa		.Lbswap_mask(%rip),$T3
	mov		\$0xA040608020C0E000,%rax	# ((7..0)0xE0)&0xff

	movdqu		($Xip),$Xi
	movdqu		($Htbl),$Hkey
@@ -640,10 +641,16 @@ if ($do4xaggr) {
my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));

$code.=<<___;
	mov		OPENSSL_ia32cap_P+4(%rip),%eax
	cmp		\$0x30,$len
	jb		.Lskip4x

	and		\$`1<<26|1<<22`,%eax	# isolate MOVBE+XSAVE
	cmp		\$`1<<22`,%eax		# check for MOVBE without XSAVE
	je		.Lskip4x

	sub		\$0x30,$len
	mov		\$0xA040608020C0E000,%rax	# ((7..0)0xE0)&0xff
	movdqu		0x30($Htbl),$Hkey3
	movdqu		0x40($Htbl),$Hkey4

@@ -819,51 +826,54 @@ $code.=<<___;
	pxor		$T1,$Xi			# Ii+Xi

	movdqa		$Xln,$Xhn
	pshufd		\$0b01001110,$Xln,$T1
	pxor		$Xln,$T1
	pshufd		\$0b01001110,$Xln,$Xmn
	pxor		$Xln,$Xmn
	pclmulqdq	\$0x00,$Hkey,$Xln
	pclmulqdq	\$0x11,$Hkey,$Xhn
	pclmulqdq	\$0x00,$HK,$T1
	pclmulqdq	\$0x00,$HK,$Xmn

	lea		32($inp),$inp		# i+=2
	nop
	sub		\$0x20,$len
	jbe		.Leven_tail
	nop
	jmp		.Lmod_loop

.align	32
.Lmod_loop:
	movdqa		$Xi,$Xhi
	pshufd		\$0b01001110,$Xi,$T2	#
	pxor		$Xi,$T2			#
	movdqa		$Xmn,$T1
	pshufd		\$0b01001110,$Xi,$Xmn	#
	pxor		$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$T2
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor		$Xhn,$Xhi
	  movdqu	($inp),$Xhn		# Ii
	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	  pshufb	$T3,$Xhn
	  movdqu	16($inp),$Xln		# Ii+1

	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	pxor		$Xhi,$T1
	  pxor		$Xhn,$Xhi		# "Ii+Xi", consume early
	pxor		$T1,$T2
	pxor		$T1,$Xmn
	 pshufb		$T3,$Xln
	movdqa		$T2,$T1			#
	movdqa		$Xmn,$T1		#
	psrldq		\$8,$T1
	pslldq		\$8,$T2			#
	pslldq		\$8,$Xmn		#
	pxor		$T1,$Xhi
	pxor		$T2,$Xi			#
	pxor		$Xmn,$Xi		#

	movdqa		$Xln,$Xhn		#

	  movdqa	$Xi,$T2			# 1st phase
	  movdqa	$Xi,$T1
	  psllq		\$5,$Xi
	pclmulqdq	\$0x00,$Hkey,$Xln	#######
	  pxor		$Xi,$T1			#
	pclmulqdq	\$0x00,$Hkey,$Xln	#######
	  psllq		\$1,$Xi
	  pxor		$T1,$Xi			#
	  psllq		\$57,$Xi		#
@@ -871,9 +881,9 @@ $code.=<<___;
	  pslldq	\$8,$Xi
	  psrldq	\$8,$T1			#	
	  pxor		$T2,$Xi
	pshufd		\$0b01001110,$Xhn,$Xmn
	  pxor		$T1,$Xhi		#
	pshufd		\$0b01001110,$Xhn,$T1
	pxor		$Xhn,$T1		#
	pxor		$Xhn,$Xmn		#

	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
	  movdqa	$Xi,$T2			# 2nd phase
@@ -882,33 +892,35 @@ $code.=<<___;
	  pxor		$Xi,$T2
	  psrlq		\$5,$Xi
	  pxor		$T2,$Xi			#
	lea		32($inp),$inp
	  psrlq		\$1,$Xi			#
	pclmulqdq	\$0x00,$HK,$T1		#######
	pclmulqdq	\$0x00,$HK,$Xmn		#######
	  pxor		$Xhi,$Xi		#
	  .byte		0x66,0x90

	lea		32($inp),$inp
	sub		\$0x20,$len
	ja		.Lmod_loop

.Leven_tail:
	 movdqa		$Xi,$Xhi
	 pshufd		\$0b01001110,$Xi,$T2	#
	 pxor		$Xi,$T2			#
	 movdqa		$Xmn,$T1
	 pshufd		\$0b01001110,$Xi,$Xmn	#
	 pxor		$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$T2
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor		$Xhn,$Xhi
	pxor		$Xi,$T1
	pxor		$Xhi,$T1
	pxor		$T1,$T2
	movdqa		$T2,$T1			#
	pxor		$T1,$Xmn
	movdqa		$Xmn,$T1		#
	psrldq		\$8,$T1
	pslldq		\$8,$T2			#
	pslldq		\$8,$Xmn		#
	pxor		$T1,$Xhi
	pxor		$T2,$Xi			#
	pxor		$Xmn,$Xi		#
___
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;