Commit fd910ef9 authored by Andy Polyakov's avatar Andy Polyakov
Browse files

poly1305/asm/poly1305-x86_64.pl: add VPMADD52 code path.



This is initial and minimal single-block implementation.

Reviewed-by: default avatarRich Salz <rsalz@openssl.org>
parent 73e8a5c8
Loading
Loading
Loading
Loading
+215 −4
Original line number Diff line number Diff line
@@ -62,13 +62,13 @@ die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
	$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
	$avx += 1 if ($1==2.11 && $2>=8);
	$avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
	$avx += 2 if ($1==2.11 && $2>=8);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
@@ -178,6 +178,13 @@ $code.=<<___ if ($avx>1);
	bt	\$`5+32`,%r9		# AVX2?
	cmovc	%rax,%r10
___
$code.=<<___	if ($avx>3);
	mov	\$`(1<<31|1<<21|1<<16)`,%rax
	shr	\$32,%r9
	and	%rax,%r9
	cmp	%rax,%r9
	je	.Linit_base2_44
___
$code.=<<___;
	mov	\$0x0ffffffc0fffffff,%rax
	mov	\$0x0ffffffc0ffffffc,%rcx
@@ -2620,7 +2627,200 @@ $code.=<<___;
	ret
.size	poly1305_blocks_avx512,.-poly1305_blocks_avx512
___
}	}
if ($avx>3) {
########################################################################
# VPMADD52 version using 2^44 radix.
#
# One can argue that base 2^52 would be more natural. Well, even though
# some operations would be more natural, one has to recognize couple of
# things. Base 2^52 doesn't provide advantage over base 2^44 if you look
# at amount of multiply-n-accumulate operations. Secondly, it makes it
# impossible to pre-compute multiples of 5 [referred to as s[]/sN in
# reference implementations], which means that more such operations
# would have to be performed in inner loop, which in turn makes critical
# path longer. In other words, even though base 2^44 reduction might
# look less elegant, overall critical path is actually shorter...

$code.=<<___;
.type	poly1305_init_base2_44,\@function,3
.align	32
poly1305_init_base2_44:
	xor	%rax,%rax
	mov	%rax,0($ctx)		# initialize hash value
	mov	%rax,8($ctx)
	mov	%rax,16($ctx)

.Linit_base2_44:
	lea	poly1305_blocks_vpmadd52(%rip),%r10
	lea	poly1305_emit_base2_44(%rip),%r11

	mov	\$0x0ffffffc0fffffff,%rax
	mov	\$0x0ffffffc0ffffffc,%rcx
	and	0($inp),%rax
	mov	\$0x00000fffffffffff,%r8
	and	8($inp),%rcx
	mov	\$0x00000fffffffffff,%r9
	and	%rax,%r8
	shrd	\$44,%rcx,%rax
	mov	%r8,40($ctx)		# r0
	and	%r9,%rax
	shr	\$24,%rcx
	mov	%rax,48($ctx)		# r1
	lea	(%rax,%rax,4),%rax	# *5
	mov	%rcx,56($ctx)		# r2
	shl	\$2,%rax		# magic <<2
	lea	(%rcx,%rcx,4),%rcx	# *5
	shl	\$2,%rcx		# magic <<2
	mov	%rax,24($ctx)		# s1
	mov	%rcx,32($ctx)		# s2
___
$code.=<<___	if ($flavour !~ /elf32/);
	mov	%r10,0(%rdx)
	mov	%r11,8(%rdx)
___
$code.=<<___	if ($flavour =~ /elf32/);
	mov	%r10d,0(%rdx)
	mov	%r11d,4(%rdx)
___
$code.=<<___;
	mov	\$1,%eax
	ret
.size	poly1305_init_base2_44,.-poly1305_init_base2_44
___
{
my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));

$code.=<<___;
.type	poly1305_blocks_vpmadd52,\@function,4
.align	32
poly1305_blocks_vpmadd52:
	shr	\$4,$len
	jz	.Lno_data_vpmadd52		# too short

	mov		\$7,%r10d
	mov		\$1,%r11d
	kmovw		%r10d,%k7
	lea		.L2_44_inp_permd(%rip),%r10
	shl		\$40,$padbit
	kmovw		%r11d,%k1

	vmovq		$padbit,%x#$PAD
	vmovdqa64	0(%r10),$inp_permd	# .L2_44_inp_permd
	vmovdqa64	32(%r10),$inp_shift	# .L2_44_inp_shift
	vpermq		\$0xcf,$PAD,$PAD
	vmovdqa64	64(%r10),$reduc_mask	# .L2_44_mask

	vmovdqu64	0($ctx),${Dlo}{%k7}{z}		# load hash value
	vmovdqu64	40($ctx),${r2r1r0}{%k7}{z}	# load keys
	vmovdqu64	32($ctx),${r1r0s2}{%k7}{z}
	vmovdqu64	24($ctx),${r0s2s1}{%k7}{z}

	vmovdqa64	96(%r10),$reduc_rght	# .L2_44_shift_rgt
	vmovdqa64	128(%r10),$reduc_left	# .L2_44_shift_lft

	jmp		.Loop_vpmadd52

.align	32
.Loop_vpmadd52:
	vmovdqu32	0($inp),%x#$T0		# load input as ----3210
	lea		16($inp),$inp

	vpermd		$T0,$inp_permd,$T0	# ----3210 -> --322110
	vpsrlvq		$inp_shift,$T0,$T0
	vpandq		$reduc_mask,$T0,$T0
	vporq		$PAD,$T0,$T0

	vpaddq		$T0,$Dlo,$Dlo		# accumulate input

	vpermq		\$0,$Dlo,${H0}{%k7}{z}	# smash hash value
	vpermq		\$0b01010101,$Dlo,${H1}{%k7}{z}
	vpermq		\$0b10101010,$Dlo,${H2}{%k7}{z}

	vpxord		$Dlo,$Dlo,$Dlo
	vpxord		$Dhi,$Dhi,$Dhi

	vpmadd52luq	$r2r1r0,$H0,$Dlo
	vpmadd52huq	$r2r1r0,$H0,$Dhi

	vpmadd52luq	$r1r0s2,$H1,$Dlo
	vpmadd52huq	$r1r0s2,$H1,$Dhi

	vpmadd52luq	$r0s2s1,$H2,$Dlo
	vpmadd52huq	$r0s2s1,$H2,$Dhi

	vpsrlvq		$reduc_rght,$Dlo,$T0	# 0 in topmost qword
	vpsllvq		$reduc_left,$Dhi,$Dhi	# 0 in topmost qword
	vpandq		$reduc_mask,$Dlo,$Dlo

	vpaddq		$T0,$Dhi,$Dhi

	vpermq		\$0b10010011,$Dhi,$Dhi	# 0 in lowest qword

	vpaddq		$Dhi,$Dlo,$Dlo		# note topmost qword :-)

	vpsrlvq		$reduc_rght,$Dlo,$T0	# 0 in topmost word
	vpandq		$reduc_mask,$Dlo,$Dlo

	vpermq		\$0b10010011,$T0,$T0

	vpaddq		$T0,$Dlo,$Dlo

	vpermq		\$0b10010011,$Dlo,${T0}{%k1}{z}

	vpaddq		$T0,$Dlo,$Dlo
	vpsllq		\$2,$T0,$T0

	vpaddq		$T0,$Dlo,$Dlo

	dec		$len			# len-=16
	jnz		.Loop_vpmadd52

	vmovdqu64	$Dlo,0($ctx){%k7}	# store hash value

.Lno_data_vpmadd52:
	ret
.size	poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
___
}
$code.=<<___;
.type	poly1305_emit_base2_44,\@function,3
.align	32
poly1305_emit_base2_44:
	mov	0($ctx),%r8	# load hash value
	mov	8($ctx),%r9
	mov	16($ctx),%r10

	mov	%r9,%rax
	shr	\$20,%r9
	shl	\$44,%rax
	mov	%r10,%rcx
	shr	\$40,%r10
	shl	\$24,%rcx

	add	%rax,%r8
	adc	%rcx,%r9
	adc	\$0,%r10

	mov	%r8,%rax
	add	\$5,%r8		# compare to modulus
	mov	%r9,%rcx
	adc	\$0,%r9
	adc	\$0,%r10
	shr	\$2,%r10	# did 130-bit value overfow?
	cmovnz	%r8,%rax
	cmovnz	%r9,%rcx

	add	0($nonce),%rax	# accumulate nonce
	adc	8($nonce),%rcx
	mov	%rax,0($mac)	# write result
	mov	%rcx,8($mac)

	ret
.size	poly1305_emit_base2_44,.-poly1305_emit_base2_44
___
}	}	}
$code.=<<___;
.align	64
.Lconst:
@@ -2632,6 +2832,17 @@ $code.=<<___;
.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long	2,2,2,3,2,0,2,1

.L2_44_inp_permd:
.long	0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad	0,12,24,64
.L2_44_mask:
.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad	44,44,42,64
.L2_44_shift_lft:
.quad	8,8,10,64
___
}