Commit 73e8a5c8 authored by Andy Polyakov

poly1305/asm/poly1305-x86_64.pl: switch to vpermd in table expansion.



Effectively it's a minor size optimization: 5-6% per affected subroutine.

Reviewed-by: Rich Salz <rsalz@openssl.org>
parent c1e1fc50
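The gist of the change: each pre-computed table row is loaded as a 128-bit vector in the "00003412" dword layout noted in the comments and has to be expanded to the "14243444" layout before use. The old code spent two shuffles per row, vpermq \$0x15 (00003412 -> 12343434) followed by vpshufd \$0xc8 (12343434 -> 14243444); the new code does the same with a single vpermd driven by the index vector .Lpermd_avx2 = {2,2,2,3,2,0,2,1}, i.e. one instruction per row plus a one-time load of the indices. Below is a minimal standalone check of the equivalence in C with AVX2 intrinsics; the file name permd_check.c and the sample dword values are illustrative only, not part of the patch.

/* permd_check.c: sanity check that one vpermd with the .Lpermd_avx2
 * index vector produces the same dword arrangement as the retired
 * vpermq+vpshufd pair.  Illustrative sketch, not part of the patch.
 * Build: gcc -mavx2 -O2 permd_check.c && ./a.out
 */
#include <immintrin.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    /* a table row as loaded into an xmm register: dwords "3412"
     * in the low 128 bits, upper lane zero */
    __m256i src = _mm256_setr_epi32(2, 1, 4, 3, 0, 0, 0, 0);

    /* old sequence: vpermq $0x15 (00003412 -> 12343434),
     * then vpshufd $0xc8 (12343434 -> 14243444) */
    __m256i old_way = _mm256_permute4x64_epi64(src, 0x15);
    old_way = _mm256_shuffle_epi32(old_way, 0xc8);

    /* new sequence: a single vpermd with the .Lpermd_avx2 indices */
    __m256i idx = _mm256_setr_epi32(2, 2, 2, 3, 2, 0, 2, 1);
    __m256i new_way = _mm256_permutevar8x32_epi32(src, idx);

    puts(memcmp(&old_way, &new_way, sizeof(old_way)) ? "differ" : "match");
    return 0;
}

Built as above, this prints "match". Note that both sequences read only the low 128 bits of the source (vpermq \$0x15 selects only qwords 0-1, and every .Lpermd_avx2 index is <= 3), so the equivalence does not depend on the upper lane being zero.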
poly1305/asm/poly1305-x86_64.pl: +37 −56
@@ -1631,8 +1631,9 @@ $code.=<<___ if ($win64);
 .Ldo_avx2_body:
 ___
 $code.=<<___;
-	lea		48+64($ctx),$ctx	# size optimization
 	lea		.Lconst(%rip),%rcx
+	lea		48+64($ctx),$ctx	# size optimization
+	vmovdqa		96(%rcx),$T0		# .Lpermd_avx2
 
 	# expand and copy pre-calculated table to stack
 	vmovdqu		`16*0-64`($ctx),%x#$T2
@@ -1642,36 +1643,28 @@ $code.=<<___;
 	vmovdqu		`16*3-64`($ctx),%x#$D0
 	vmovdqu		`16*4-64`($ctx),%x#$D1
 	vmovdqu		`16*5-64`($ctx),%x#$D2
+	lea		0x90(%rsp),%rax		# size optimization
 	vmovdqu		`16*6-64`($ctx),%x#$D3
-	vpermq		\$0x15,$T2,$T2		# 00003412 -> 12343434
+	vpermd		$T2,$T0,$T2		# 00003412 -> 14243444
 	vmovdqu		`16*7-64`($ctx),%x#$D4
-	vpermq		\$0x15,$T3,$T3
-	vpshufd		\$0xc8,$T2,$T2		# 12343434 -> 14243444
+	vpermd		$T3,$T0,$T3
 	vmovdqu		`16*8-64`($ctx),%x#$MASK
-	vpermq		\$0x15,$T4,$T4
-	vpshufd		\$0xc8,$T3,$T3
+	vpermd		$T4,$T0,$T4
 	vmovdqa		$T2,0x00(%rsp)
-	vpermq		\$0x15,$D0,$D0
-	vpshufd		\$0xc8,$T4,$T4
-	vmovdqa		$T3,0x20(%rsp)
-	vpermq		\$0x15,$D1,$D1
-	vpshufd		\$0xc8,$D0,$D0
-	vmovdqa		$T4,0x40(%rsp)
-	vpermq		\$0x15,$D2,$D2
-	vpshufd		\$0xc8,$D1,$D1
-	vmovdqa		$D0,0x60(%rsp)
-	vpermq		\$0x15,$D3,$D3
-	vpshufd		\$0xc8,$D2,$D2
-	vmovdqa		$D1,0x80(%rsp)
-	vpermq		\$0x15,$D4,$D4
-	vpshufd		\$0xc8,$D3,$D3
-	vmovdqa		$D2,0xa0(%rsp)
-	vpermq		\$0x15,$MASK,$MASK
-	vpshufd		\$0xc8,$D4,$D4
-	vmovdqa		$D3,0xc0(%rsp)
-	vpshufd		\$0xc8,$MASK,$MASK
-	vmovdqa		$D4,0xe0(%rsp)
-	vmovdqa		$MASK,0x100(%rsp)
+	vpermd		$D0,$T0,$D0
+	vmovdqa		$T3,0x20-0x90(%rax)
+	vpermd		$D1,$T0,$D1
+	vmovdqa		$T4,0x40-0x90(%rax)
+	vpermd		$D2,$T0,$D2
+	vmovdqa		$D0,0x60-0x90(%rax)
+	vpermd		$D3,$T0,$D3
+	vmovdqa		$D1,0x80-0x90(%rax)
+	vpermd		$D4,$T0,$D4
+	vmovdqa		$D2,0xa0-0x90(%rax)
+	vpermd		$MASK,$T0,$MASK
+	vmovdqa		$D3,0xc0-0x90(%rax)
+	vmovdqa		$D4,0xe0-0x90(%rax)
+	vmovdqa		$MASK,0x100-0x90(%rax)
 	vmovdqa		64(%rcx),$MASK		# .Lmask26
 
 	################################################################
@@ -1698,7 +1691,6 @@ $code.=<<___;
 	vpand		$MASK,$T3,$T3		# 3
 	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
 
-	lea		0x90(%rsp),%rax		# size optimization
 	vpaddq		$H2,$T2,$H2		# accumulate input
 	sub		\$64,$len
 	jz		.Ltail_avx2
@@ -2055,8 +2047,9 @@ $code.=<<___ if ($win64);
 .Ldo_avx512_body:
 ___
 $code.=<<___;
-	lea		48+64($ctx),$ctx	# size optimization
 	lea		.Lconst(%rip),%rcx
+	lea		48+64($ctx),$ctx	# size optimization
+	vmovdqa		96(%rcx),$T2		# .Lpermd_avx2
 
 	# expand pre-calculated table
 	vmovdqu32	`16*0-64`($ctx),%x#$R0
@@ -2069,33 +2062,23 @@ $code.=<<___;
 	vmovdqu32	`16*6-64`($ctx),%x#$S3
 	vmovdqu32	`16*7-64`($ctx),%x#$R4
 	vmovdqu32	`16*8-64`($ctx),%x#$S4
-	vpermq		\$0x15,$R0,$R0		# 00003412 -> 12343434
+	vpermd		$R0,$T2,$R0		# 00003412 -> 14243444
 	vmovdqa64	64(%rcx),$MASK		# .Lmask26
-	vpermq		\$0x15,$R1,$R1
-	vmovdqa32	128(%rcx),$GATHER	# .Lgather
-	vpermq		\$0x15,$S1,$S1
-	vpshufd		\$0xc8,$R0,$R0		# 12343434 -> 14243444
-	vpermq		\$0x15,$R2,$R2
-	vpshufd		\$0xc8,$R1,$R1
+	vpermd		$R1,$T2,$R1
+	vpermd		$S1,$T2,$S1
+	vpermd		$R2,$T2,$R2
 	vmovdqa32	$R0,0x00(%rsp)		# save in case $len%128 != 0
 	 vpsrlq		\$32,$R0,$T0		# 14243444 -> 01020304
-	vpermq		\$0x15,$S2,$S2
-	vpshufd		\$0xc8,$S1,$S1
+	vpermd		$S2,$T2,$S2
 	vmovdqa32	$R1,0x20(%rsp)
 	 vpsrlq		\$32,$R1,$T1
-	vpermq		\$0x15,$R3,$R3
-	vpshufd		\$0xc8,$R2,$R2
+	vpermd		$R3,$T2,$R3
 	vmovdqa32	$S1,0x40(%rsp)
-	vpermq		\$0x15,$S3,$S3
-	vpshufd		\$0xc8,$S2,$S2
-	vpermq		\$0x15,$R4,$R4
-	vpshufd		\$0xc8,$R3,$R3
+	vpermd		$S3,$T2,$S3
+	vpermd		$R4,$T2,$R4
 	vmovdqa32	$R2,0x60(%rsp)
-	vpermq		\$0x15,$S4,$S4
-	vpshufd		\$0xc8,$S3,$S3
+	vpermd		$S4,$T2,$S4
 	vmovdqa32	$S2,0x80(%rsp)
-	vpshufd		\$0xc8,$R4,$R4
-	vpshufd		\$0xc8,$S4,$S4
 	vmovdqa32	$R3,0xa0(%rsp)
 	vmovdqa32	$S3,0xc0(%rsp)
 	vmovdqa32	$R4,0xe0(%rsp)
@@ -2275,14 +2258,16 @@ $code.=<<___;
 	vpandq		$MASK,$T2,$T2		# 2
 	vpandq		$MASK,$T0,$T0		# 0
 	vpandq		$MASK,$T1,$T1		# 1
-	#vpandq		$MASK,$T3,$T3		# 3
+	vpandq		$MASK,$T3,$T3		# 3
 	#vporq		$PADBIT,$T4,$T4		# padbit, yes, always
 
 	vpaddq		$H2,$T2,$H2		# accumulate input
 	mov		\$0x0f,%eax
 	sub		\$192,$len
 	jbe		.Ltail_avx512
+	jmp		.Loop_avx512
 
+.align	32
 .Loop_avx512:
 	################################################################
 	# ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
@@ -2316,7 +2301,6 @@ $code.=<<___;
 	 vpaddq		$H0,$T0,$H0
 	vpmuludq	$H2,$R2,$D4		# d4 = h2*r2
 	vpmuludq	$H2,$S3,$D0		# d0 = h2*s3
-	 vpandq		$MASK,$T3,$T3		# 3, module-scheduled
 	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
 	 vporq		$PADBIT,$T4,$T4		# padbit, yes, always
 	vpmuludq	$H2,$R0,$D2		# d2 = h2*r0
@@ -2425,7 +2409,7 @@ $code.=<<___;

 	 vpandq		$MASK,$T0,$T0		# 0
 	 vpandq		$MASK,$T1,$T1		# 1
-	 #vpandq	$MASK,$T3,$T3		# 3
+	 vpandq		$MASK,$T3,$T3		# 3
 	 #vporq		$PADBIT,$T4,$T4		# padbit, yes, always
 
 	sub		\$128,$len
@@ -2459,7 +2443,6 @@ $code.=<<___;
 	vpmuludq	$H2,$S3,$D0		# d0 = h2*s3
 	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
 	vpmuludq	$H2,$R0,$D2		# d2 = h2*r0
-	 vpandq		$MASK,$T3,$T3		# 3, module-scheduled
 	 vporq		$PADBIT,$T4,$T4		# padbit, yes, always
 	 vpaddq		$H1,$T1,$H1		# accumulate input
 	 vpaddq		$H3,$T3,$H3
@@ -2647,10 +2630,8 @@ $code.=<<___;
 .long	`1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
 .Lmask26:
 .long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
-.Lfive:
-.long	5,0,5,0,5,0,5,0
-.Lgather:
-.long	0,8, 32,40, 64,72, 96,104
+.Lpermd_avx2:
+.long	2,2,2,3,2,0,2,1
 ___
 }