Commit abb8c44f authored by Andy Polyakov
Browse files

x86_64 assembly pack: add AVX512 ChaCha20 and Poly1305 code paths.



Reviewed-by: Rich Salz <rsalz@openssl.org>
parent f2d78649
Loading
Loading
Loading
Loading
+519 −3
Original line number Diff line number Diff line
@@ -18,6 +18,10 @@
#
# ChaCha20 for x86_64.
#
# December 2016
#
# Add AVX512F code path.
#
# Performance in cycles per byte out of large buffer.
#
#		IALU/gcc 4.8(i)	1xSSSE3/SSE2	4xSSSE3	    8xAVX2
@@ -58,12 +62,13 @@ die "can't locate x86_64-xlate.pl";

# Probe the GNU assembler (through the compiler driver named in $ENV{CC}) and
# derive $avx from its version: $avx is the number of version thresholds the
# assembler meets, and the code paths further down are emitted according to it.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);	# two-threshold value, overwritten by the next statement
	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);	# 2.25 and newer gives 3, which enables the ($avx>2) AVX512 paths
}

# NASM probe: only consulted for Win64 builds that ask for nasm, and only when
# $avx has not been set to a non-zero value already.  As with the other probes,
# $avx counts how many version thresholds the assembler meets.
#
# The pattern captures "major.minor" in $1 and, when present, the patch level
# in $2 ("2.11.08" gives $1="2.11", $2="08").  The patch level has to be
# captured explicitly: without the second group $2 is never set and the
# 2.11.08 special case below could not trigger.
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
	$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
	# 2.11 patch level 8 and later is promoted by one level, which lifts
	# it to 3 and thereby enables the ($avx>2) code paths.
	$avx += 1 if ($1==2.11 && $2>=8);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
@@ -105,6 +110,11 @@ $code.=<<___;
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Lsigma:
.asciz	"expand 32-byte k"
.align	64
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.asciz	"ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

@@ -1721,6 +1731,12 @@ $code.=<<___;
.align	32
ChaCha20_8x:
.LChaCha20_8x:
___
# Prepended to the ChaCha20_8x body only when the assembler can handle the
# AVX512 code path ($avx>2): if bit 16 of %r10d is set, control is handed over
# to .LChaCha20_16x before the 8x stack frame is set up.
# NOTE(review): %r10d presumably holds a CPU capability word loaded by the
# code that branches to .LChaCha20_8x - confirm against the dispatch code.
$code.=<<___		if ($avx>2);
	test		\$`1<<16`,%r10d			# check for AVX512F
	jnz		.LChaCha20_16x
___
$code.=<<___;
	mov		%rsp,%r10
	sub		\$0x280+$xframe,%rsp
	and		\$-32,%rsp
@@ -2212,7 +2228,7 @@ $code.=<<___;
	jnz		.Loop_tail8x

.Ldone8x:
	vzeroall
	vzeroupper
___
$code.=<<___	if ($win64);
	lea		0x290+0x30(%rsp),%r11
@@ -2234,6 +2250,506 @@ $code.=<<___;
___
}

########################################################################
# AVX512 code paths
if ($avx>2) {
# Working state: %zmm0-%zmm15 named after the ChaCha state words they hold
# (rows a, b, c, d with four words each).  The names are plain Perl strings
# that get interpolated into the assembly heredocs below.
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
# Same registers as an indexable list; AVX512_lane_ROUND addresses the state
# by index 0..15 through this array.
my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
# %zmm16-%zmm31 keep a copy of the input state ("key") for the final addition.
my @key=map("%zmm$_",(16..31));
# Temporaries are aliases of @key[0..3], not extra registers: using them
# destroys the saved first row, which is why .Loop_outer16x rebuilds
# @key[0..3] before every iteration.
my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];

sub AVX512_lane_ROUND {
# Build the instruction list for one ChaCha round, with its four
# quarter-rounds (Q1..Q4) interleaved instruction by instruction.
#
# Arguments: indices into @xx of the a, b, c and d words of Q1.  The words
# of Q2..Q4 are derived by stepping each index to the next column of its
# row, wrapping inside the row, so (0,4,8,12) yields a column round and
# (0,5,10,15) a diagonal round.
#
# Returns a list of 48 strings of the form '&vpaddd ("%zmmA","%zmmA","%zmmB")'
# that the caller is expected to eval one by one.
my @quad=([@_[0..3]]);			# $quad[0] = (a,b,c,d) indices of Q1
my @reg=map("\"$_\"",@xx);		# register names, already quoted for eval
my @insns;

	# Q2..Q4: keep the row (index & ~3), advance the column modulo 4
	push(@quad,[map(($_&~3)+(($_+1)&3),@{$quad[-1]})]) foreach (1..3);

	# A quarter-round is four steps of the same shape:
	#	x[sum] += x[add]; x[rot] ^= x[sum]; x[rot] <<<= bits
	# Positions refer to the (a,b,c,d) tuple, i.e. 0=a, 1=b, 2=c, 3=d.
	foreach my $step ([0,1,3,16], [2,3,1,12], [0,1,3,8], [2,3,1,7]) {
		my ($sum,$add,$rot,$bits)=@$step;
		my @s=map($reg[$_->[$sum]],@quad);
		my @t=map($reg[$_->[$add]],@quad);
		my @r=map($reg[$_->[$rot]],@quad);

		# emit each instruction for Q1..Q4 before moving on to the next one
		push(@insns,map("&vpaddd\t($s[$_],$s[$_],$t[$_])",0..3));
		push(@insns,map("&vpxord\t($r[$_],$r[$_],$s[$_])",0..3));
		push(@insns,map("&vprold\t($r[$_],$r[$_],$bits)",0..3));
	}

	# returned as a slice so that scalar context yields the last
	# instruction rather than the element count
	return @insns[0..$#insns];
}

# ChaCha20_16x: AVX512F code path.  Every outer iteration produces 16 blocks
# (64*16 bytes) of key stream, one block per 32-bit lane of a %zmm register,
# and xors them with the input.  The five arguments are referred to as
# $out, $inp, $len, $key and $counter.
#
# Stack frame, relative to the 64-byte aligned %rsp:
#	0x00..0x3f	scratch copy of the key stream for a final partial block
#	0x40..0xdf	Win64 only: save area for %xmm6-%xmm15, which the
#			Win64 calling convention requires a callee to preserve
# $xframe covers the part above the scratch block.  %r11 keeps the caller's
# %rsp from the first instruction to the epilogue and therefore must not be
# used for anything else in between; the Win64 save area is addressed through
# %rsp for that reason, which also guarantees the 16-byte alignment that
# movaps needs and keeps every store inside the allocated frame.
my $xframe = $win64 ? 0xb0 : 8;

$code.=<<___;
.type	ChaCha20_16x,\@function,5
.align	32
ChaCha20_16x:
.LChaCha20_16x:
	mov		%rsp,%r11		# frame register, restored in epilogue
	sub		\$64+$xframe,%rsp
	and		\$-64,%rsp
___
$code.=<<___	if ($win64);
	movaps		%xmm6,0x40(%rsp)
	movaps		%xmm7,0x50(%rsp)
	movaps		%xmm8,0x60(%rsp)
	movaps		%xmm9,0x70(%rsp)
	movaps		%xmm10,0x80(%rsp)
	movaps		%xmm11,0x90(%rsp)
	movaps		%xmm12,0xa0(%rsp)
	movaps		%xmm13,0xb0(%rsp)
	movaps		%xmm14,0xc0(%rsp)
	movaps		%xmm15,0xd0(%rsp)
___
$code.=<<___;
	vzeroupper

	lea		.Lsigma(%rip),%r10
	vbroadcasti32x4	(%r10),$xa3		# key[0]
	vbroadcasti32x4	($key),$xb3		# key[1]
	vbroadcasti32x4	16($key),$xc3		# key[2]
	vbroadcasti32x4	($counter),$xd3		# key[3]

	vpshufd		\$0x00,$xa3,$xa0	# smash key by lanes...
	vpshufd		\$0x55,$xa3,$xa1
	vpshufd		\$0xaa,$xa3,$xa2
	vpshufd		\$0xff,$xa3,$xa3
	vmovdqa64	$xa0,@key[0]
	vmovdqa64	$xa1,@key[1]
	vmovdqa64	$xa2,@key[2]
	vmovdqa64	$xa3,@key[3]

	vpshufd		\$0x00,$xb3,$xb0
	vpshufd		\$0x55,$xb3,$xb1
	vpshufd		\$0xaa,$xb3,$xb2
	vpshufd		\$0xff,$xb3,$xb3
	vmovdqa64	$xb0,@key[4]
	vmovdqa64	$xb1,@key[5]
	vmovdqa64	$xb2,@key[6]
	vmovdqa64	$xb3,@key[7]

	vpshufd		\$0x00,$xc3,$xc0
	vpshufd		\$0x55,$xc3,$xc1
	vpshufd		\$0xaa,$xc3,$xc2
	vpshufd		\$0xff,$xc3,$xc3
	vmovdqa64	$xc0,@key[8]
	vmovdqa64	$xc1,@key[9]
	vmovdqa64	$xc2,@key[10]
	vmovdqa64	$xc3,@key[11]

	vpshufd		\$0x00,$xd3,$xd0
	vpshufd		\$0x55,$xd3,$xd1
	vpshufd		\$0xaa,$xd3,$xd2
	vpshufd		\$0xff,$xd3,$xd3
	vpaddd		.Lincz(%rip),$xd0,$xd0	# don't save counters yet
	vmovdqa64	$xd0,@key[12]
	vmovdqa64	$xd1,@key[13]
	vmovdqa64	$xd2,@key[14]
	vmovdqa64	$xd3,@key[15]

	mov		\$10,%eax
	jmp		.Loop16x

.align	32
.Loop_outer16x:
	vpbroadcastd	0(%r10),$xa0		# reload key
	vpbroadcastd	4(%r10),$xa1
	vpbroadcastd	8(%r10),$xa2
	vpbroadcastd	12(%r10),$xa3
	vpaddd		.Lsixteen(%rip),@key[12],@key[12]	# next SIMD counters
	vmovdqa64	@key[4],$xb0
	vmovdqa64	@key[5],$xb1
	vmovdqa64	@key[6],$xb2
	vmovdqa64	@key[7],$xb3
	vmovdqa64	@key[8],$xc0
	vmovdqa64	@key[9],$xc1
	vmovdqa64	@key[10],$xc2
	vmovdqa64	@key[11],$xc3
	vmovdqa64	@key[12],$xd0
	vmovdqa64	@key[13],$xd1
	vmovdqa64	@key[14],$xd2
	vmovdqa64	@key[15],$xd3

	vmovdqa64	$xa0,@key[0]
	vmovdqa64	$xa1,@key[1]
	vmovdqa64	$xa2,@key[2]
	vmovdqa64	$xa3,@key[3]

	mov		\$10,%eax
	jmp		.Loop16x

.align	32
.Loop16x:
___
	# loop body: one column round followed by one diagonal round; the
	# counter in %eax runs it 10 times
	foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	dec		%eax
	jnz		.Loop16x

	vpaddd		@key[0],$xa0,$xa0	# accumulate key
	vpaddd		@key[1],$xa1,$xa1
	vpaddd		@key[2],$xa2,$xa2
	vpaddd		@key[3],$xa3,$xa3

	vpunpckldq	$xa1,$xa0,$xt2		# "de-interlace" data
	vpunpckldq	$xa3,$xa2,$xt3
	vpunpckhdq	$xa1,$xa0,$xa0
	vpunpckhdq	$xa3,$xa2,$xa2
	vpunpcklqdq	$xt3,$xt2,$xa1		# "a0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "a1"
	vpunpcklqdq	$xa2,$xa0,$xa3		# "a2"
	vpunpckhqdq	$xa2,$xa0,$xa0		# "a3"
___
	# No data moves here: the Perl-side names are permuted so that they
	# follow the values to the registers the shuffles left them in.  The
	# same is done after each of the following stages.
	($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
$code.=<<___;
	vpaddd		@key[4],$xb0,$xb0
	vpaddd		@key[5],$xb1,$xb1
	vpaddd		@key[6],$xb2,$xb2
	vpaddd		@key[7],$xb3,$xb3

	vpunpckldq	$xb1,$xb0,$xt2
	vpunpckldq	$xb3,$xb2,$xt3
	vpunpckhdq	$xb1,$xb0,$xb0
	vpunpckhdq	$xb3,$xb2,$xb2
	vpunpcklqdq	$xt3,$xt2,$xb1		# "b0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "b1"
	vpunpcklqdq	$xb2,$xb0,$xb3		# "b2"
	vpunpckhqdq	$xb2,$xb0,$xb0		# "b3"
___
	($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
$code.=<<___;
	vshufi32x4	\$0x44,$xb0,$xa0,$xt3	# "de-interlace" further
	vshufi32x4	\$0xee,$xb0,$xa0,$xb0
	vshufi32x4	\$0x44,$xb1,$xa1,$xa0
	vshufi32x4	\$0xee,$xb1,$xa1,$xb1
	vshufi32x4	\$0x44,$xb2,$xa2,$xa1
	vshufi32x4	\$0xee,$xb2,$xa2,$xb2
	vshufi32x4	\$0x44,$xb3,$xa3,$xa2
	vshufi32x4	\$0xee,$xb3,$xa3,$xb3
___
	($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
$code.=<<___;
	vpaddd		@key[8],$xc0,$xc0
	vpaddd		@key[9],$xc1,$xc1
	vpaddd		@key[10],$xc2,$xc2
	vpaddd		@key[11],$xc3,$xc3

	vpunpckldq	$xc1,$xc0,$xt2
	vpunpckldq	$xc3,$xc2,$xt3
	vpunpckhdq	$xc1,$xc0,$xc0
	vpunpckhdq	$xc3,$xc2,$xc2
	vpunpcklqdq	$xt3,$xt2,$xc1		# "c0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "c1"
	vpunpcklqdq	$xc2,$xc0,$xc3		# "c2"
	vpunpckhqdq	$xc2,$xc0,$xc0		# "c3"
___
	($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
$code.=<<___;
	vpaddd		@key[12],$xd0,$xd0
	vpaddd		@key[13],$xd1,$xd1
	vpaddd		@key[14],$xd2,$xd2
	vpaddd		@key[15],$xd3,$xd3

	vpunpckldq	$xd1,$xd0,$xt2
	vpunpckldq	$xd3,$xd2,$xt3
	vpunpckhdq	$xd1,$xd0,$xd0
	vpunpckhdq	$xd3,$xd2,$xd2
	vpunpcklqdq	$xt3,$xt2,$xd1		# "d0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "d1"
	vpunpcklqdq	$xd2,$xd0,$xd3		# "d2"
	vpunpckhqdq	$xd2,$xd0,$xd0		# "d3"
___
	($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
$code.=<<___;
	vshufi32x4	\$0x44,$xd0,$xc0,$xt3	# "de-interlace" further
	vshufi32x4	\$0xee,$xd0,$xc0,$xd0
	vshufi32x4	\$0x44,$xd1,$xc1,$xc0
	vshufi32x4	\$0xee,$xd1,$xc1,$xd1
	vshufi32x4	\$0x44,$xd2,$xc2,$xc1
	vshufi32x4	\$0xee,$xd2,$xc2,$xd2
	vshufi32x4	\$0x44,$xd3,$xc3,$xc2
	vshufi32x4	\$0xee,$xd3,$xc3,$xd3
___
	($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
$code.=<<___;
	vshufi32x4	\$0x88,$xc0,$xa0,$xt0	# "de-interlace" further
	vshufi32x4	\$0xdd,$xc0,$xa0,$xa0
	 vshufi32x4	\$0x88,$xd0,$xb0,$xc0
	 vshufi32x4	\$0xdd,$xd0,$xb0,$xd0
	vshufi32x4	\$0x88,$xc1,$xa1,$xt1
	vshufi32x4	\$0xdd,$xc1,$xa1,$xa1
	 vshufi32x4	\$0x88,$xd1,$xb1,$xc1
	 vshufi32x4	\$0xdd,$xd1,$xb1,$xd1
	vshufi32x4	\$0x88,$xc2,$xa2,$xt2
	vshufi32x4	\$0xdd,$xc2,$xa2,$xa2
	 vshufi32x4	\$0x88,$xd2,$xb2,$xc2
	 vshufi32x4	\$0xdd,$xd2,$xb2,$xd2
	vshufi32x4	\$0x88,$xc3,$xa3,$xt3
	vshufi32x4	\$0xdd,$xc3,$xa3,$xa3
	 vshufi32x4	\$0x88,$xd3,$xb3,$xc3
	 vshufi32x4	\$0xdd,$xd3,$xb3,$xd3
___
	($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
	($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);

	# final renaming: the code below uses the registers in the order
	# a0,b0,c0,d0, a1,b1,c1,d1, ... for consecutive 64-byte chunks
	($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
	 $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
	($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
$code.=<<___;
	cmp		\$64*16,$len
	jb		.Ltail16x

	vpxord		0x00($inp),$xa0,$xa0	# xor with input
	vpxord		0x40($inp),$xb0,$xb0
	vpxord		0x80($inp),$xc0,$xc0
	vpxord		0xc0($inp),$xd0,$xd0
	vmovdqu32	$xa0,0x00($out)
	vmovdqu32	$xb0,0x40($out)
	vmovdqu32	$xc0,0x80($out)
	vmovdqu32	$xd0,0xc0($out)

	vpxord		0x100($inp),$xa1,$xa1
	vpxord		0x140($inp),$xb1,$xb1
	vpxord		0x180($inp),$xc1,$xc1
	vpxord		0x1c0($inp),$xd1,$xd1
	vmovdqu32	$xa1,0x100($out)
	vmovdqu32	$xb1,0x140($out)
	vmovdqu32	$xc1,0x180($out)
	vmovdqu32	$xd1,0x1c0($out)

	vpxord		0x200($inp),$xa2,$xa2
	vpxord		0x240($inp),$xb2,$xb2
	vpxord		0x280($inp),$xc2,$xc2
	vpxord		0x2c0($inp),$xd2,$xd2
	vmovdqu32	$xa2,0x200($out)
	vmovdqu32	$xb2,0x240($out)
	vmovdqu32	$xc2,0x280($out)
	vmovdqu32	$xd2,0x2c0($out)

	vpxord		0x300($inp),$xa3,$xa3
	vpxord		0x340($inp),$xb3,$xb3
	vpxord		0x380($inp),$xc3,$xc3
	vpxord		0x3c0($inp),$xd3,$xd3
	lea		0x400($inp),$inp
	vmovdqu32	$xa3,0x300($out)
	vmovdqu32	$xb3,0x340($out)
	vmovdqu32	$xc3,0x380($out)
	vmovdqu32	$xd3,0x3c0($out)
	lea		0x400($out),$out

	sub		\$64*16,$len
	jnz		.Loop_outer16x

	jmp		.Ldone16x

.align	32
.Ltail16x:
	xor		%r10,%r10
	sub		$inp,$out
	cmp		\$64*1,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xa0,$xa0	# xor with input
	vmovdqu32	$xa0,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xb0,$xa0
	lea		64($inp),$inp

	cmp		\$64*2,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xb0,$xb0
	vmovdqu32	$xb0,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xc0,$xa0
	lea		64($inp),$inp

	cmp		\$64*3,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xc0,$xc0
	vmovdqu32	$xc0,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xd0,$xa0
	lea		64($inp),$inp

	cmp		\$64*4,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xd0,$xd0
	vmovdqu32	$xd0,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xa1,$xa0
	lea		64($inp),$inp

	cmp		\$64*5,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xa1,$xa1
	vmovdqu32	$xa1,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xb1,$xa0
	lea		64($inp),$inp

	cmp		\$64*6,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xb1,$xb1
	vmovdqu32	$xb1,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xc1,$xa0
	lea		64($inp),$inp

	cmp		\$64*7,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xc1,$xc1
	vmovdqu32	$xc1,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xd1,$xa0
	lea		64($inp),$inp

	cmp		\$64*8,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xd1,$xd1
	vmovdqu32	$xd1,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xa2,$xa0
	lea		64($inp),$inp

	cmp		\$64*9,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xa2,$xa2
	vmovdqu32	$xa2,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xb2,$xa0
	lea		64($inp),$inp

	cmp		\$64*10,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xb2,$xb2
	vmovdqu32	$xb2,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xc2,$xa0
	lea		64($inp),$inp

	cmp		\$64*11,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xc2,$xc2
	vmovdqu32	$xc2,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xd2,$xa0
	lea		64($inp),$inp

	cmp		\$64*12,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xd2,$xd2
	vmovdqu32	$xd2,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xa3,$xa0
	lea		64($inp),$inp

	cmp		\$64*13,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xa3,$xa3
	vmovdqu32	$xa3,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xb3,$xa0
	lea		64($inp),$inp

	cmp		\$64*14,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xb3,$xb3
	vmovdqu32	$xb3,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xc3,$xa0
	lea		64($inp),$inp

	cmp		\$64*15,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xc3,$xc3
	vmovdqu32	$xc3,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xd3,$xa0
	lea		64($inp),$inp

.Less_than_64_16x:
	vmovdqa32	$xa0,0x00(%rsp)
	lea		($out,$inp),$out
	and		\$63,$len

.Loop_tail16x:
	movzb		($inp,%r10),%eax
	movzb		(%rsp,%r10),%ecx
	lea		1(%r10),%r10
	xor		%ecx,%eax
	mov		%al,-1($out,%r10)
	dec		$len
	jnz		.Loop_tail16x

.Ldone16x:
	vzeroupper
___
# Win64: reload the preserved registers from the slots the prologue used.
# %rsp still has its prologue value here, as nothing in between changes it.
$code.=<<___	if ($win64);
	movaps		0x40(%rsp),%xmm6
	movaps		0x50(%rsp),%xmm7
	movaps		0x60(%rsp),%xmm8
	movaps		0x70(%rsp),%xmm9
	movaps		0x80(%rsp),%xmm10
	movaps		0x90(%rsp),%xmm11
	movaps		0xa0(%rsp),%xmm12
	movaps		0xb0(%rsp),%xmm13
	movaps		0xc0(%rsp),%xmm14
	movaps		0xd0(%rsp),%xmm15
___
$code.=<<___;
	mov		%r11,%rsp
	ret
.size	ChaCha20_16x,.-ChaCha20_16x
___
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

+669 −9

File changed.

Preview size limit exceeded, changes collapsed.

+25 −0
Original line number Diff line number Diff line
@@ -3437,6 +3437,31 @@ Ciphertext = 64a0861575861af460f062c79be643bd5e805cfd345cf389f108670ac76c8cb24c6
Operation = DECRYPT
Result = CIPHERFINAL_ERROR

# self-generated vectors
Cipher = chacha20-poly1305
Key = 1c9240a5eb55d38af333888604f6b5f0473917c1402b80099dca5cbc207075c0
IV = 000000000102030405060708
AAD = f33388860000000000004e91
Tag = d96119a40cd17f2527306866a3ef0413
Plaintext = 496e7465726e65742d4472616674732061726520647261667420646f63756d656e74732076616c696420666f722061206d6178696d756d206f6620736978206d6f6e74687320616e64206d617920626520757064617465642c207265706c616365642c206f72206f62736f6c65746564206279206f7468657220646f63756d656e747320617420616e792074696d652e20497420697320696e617070726f70726961746520746f2075736520496e7465726e65742d4472616674732061732072
Ciphertext = 64a0861575861af460f062c79be643bd5e805cfd345cf389f108670ac76c8cb24c6cfc18755d43eea09ee94e382d26b0bdb7b73c321b0100d4f03b7f355894cf332f830e710b97ce98c8a84abd0b948114ad176e008d33bd60f982b1ff37c8559797a06ef4f0ef61c186324e2b3506383606907b6a7c02b0f9f6157b53c867e4b9166c767b804d46a59b5216cde7a4e99040c5a40433225ee282a1b0a06c523eaf4534d7f83fa1155b0047718cbc546a0d072b04b3564eea1b422273f548271a

Cipher = chacha20-poly1305
Key = 1c9240a5eb55d38af333888604f6b5f0473917c1402b80099dca5cbc207075c0
IV = 000000000102030405060708
AAD = f33388860000000000004e91
Tag = 53aee3189d2b747032378a6186feb43f
Plaintext = 496e7465726e65742d4472616674732061726520647261667420646f63756d656e74732076616c696420666f722061206d6178696d756d206f6620736978206d6f6e74687320616e64206d617920626520757064617465642c207265706c616365642c206f72206f62736f6c65746564206279206f7468657220646f63756d656e747320617420616e792074696d652e20497420697320696e617070726f70726961746520746f2075736520496e7465726e65742d447261667473206173207265666572656e6365206d6174657269616c206f7220746f2063697465207468656d206f74686572207468616e206173202fe2809c776f726b20696e2070726f67496e7465726e65742d4472616674732061726520647261667420646f63756d656e74732076616c696420666f722061206d6178696d756d206f6620736978206d6f6e74687320616e64206d617920626520757064617465642c207265706c616365642c206f72206f62736f6c65746564206279206f7468657220646f63756d656e747320617420616e792074696d652e20497420697320696e617070726f70726961746520746f2075736520496e7465726e65742d447261667473206173207265666572656e6365206d6174657269616c206f7220746f2063697465207468656d206f74686572207468616e206173202fe2809c776f726b20696e2070726f67
Ciphertext = 64a0861575861af460f062c79be643bd5e805cfd345cf389f108670ac76c8cb24c6cfc18755d43eea09ee94e382d26b0bdb7b73c321b0100d4f03b7f355894cf332f830e710b97ce98c8a84abd0b948114ad176e008d33bd60f982b1ff37c8559797a06ef4f0ef61c186324e2b3506383606907b6a7c02b0f9f6157b53c867e4b9166c767b804d46a59b5216cde7a4e99040c5a40433225ee282a1b0a06c523eaf4534d7f83fa1155b0047718cbc546a0d072b04b3564eea1b422273f548271a0bb2316053fa76991955ebd63159434ecebb4e466dae5a1073a6727627097a1049e617d91d361094fa68f0ff77987130305beaba2eda04df997b714d6c6f2c299da65ba25e6a85842bf0440fd98a9a2266b061c4b3a13327c090f9a0789f58aad805275e4378a525f19232bfbfb749ede38480f405cf43ec2f1f8619ebcbc80a89e92a859c7911e674977ab17d4a7126a6b8a477358ff14a344d276ef6e504e10268ac3619fcf90c2d6c03fc2e3d1f290d9bf26c1fa1495dd8f97eec6229a55c2354e4524143551a5cc370a1c622c9390530cff21c3e1ed50c5e3daf97518ccce34156bdbd7eafab8bd417aef25c6c927301731bd319d247a1d5c3186ed10bfd9a7a24bac30e3e4503ed9204154d338b79ea276e7058e7f20f4d4fd1ac93d63f611af7b6d006c2a72add0eedc497b19cb30a198816664f0da00155f2e2d6ac61

Cipher = chacha20-poly1305
Key = 1c9240a5eb55d38af333888604f6b5f0473917c1402b80099dca5cbc207075c0
IV = 000000000102030405060708
AAD = f33388860000000000004e91
Tag = e0723bce23528ce6ccb10ff9627038bf
Plaintext = 496e7465726e65742d4472616674732061726520647261667420646f63756d656e74732076616c696420666f722061206d6178696d756d206f6620736978206d6f6e74687320616e64206d617920626520757064617465642c207265706c616365642c206f72206f62736f6c65746564206279206f7468657220646f63756d656e747320617420616e792074696d652e20497420697320696e617070726f70726961746520746f2075736520496e7465726e65742d447261667473206173207265666572656e6365206d6174657269616c206f7220746f2063697465207468656d206f74686572207468616e206173202fe2809c776f726b20696e2070726f67496e7465726e65742d4472616674732061726520647261667420646f63756d656e74732076616c696420666f722061206d6178696d756d206f6620736978206d6f6e74687320616e64206d617920626520757064617465642c207265706c616365642c206f72206f62736f6c65746564206279206f7468657220646f63756d656e747320617420616e792074696d652e20497420697320696e617070726f70726961746520746f2075736520496e7465726e65742d447261667473206173207265666572656e6365206d6174657269616c206f7220746f2063697465207468656d206f74686572207468616e206173202fe2809c776f726b20696e2070726f67496e7465726e65742d4472616674732061726520647261667420646f63756d656e74732076616c696420666f722061206d6178696d756d206f6620736978206d
Ciphertext = 64a0861575861af460f062c79be643bd5e805cfd345cf389f108670ac76c8cb24c6cfc18755d43eea09ee94e382d26b0bdb7b73c321b0100d4f03b7f355894cf332f830e710b97ce98c8a84abd0b948114ad176e008d33bd60f982b1ff37c8559797a06ef4f0ef61c186324e2b3506383606907b6a7c02b0f9f6157b53c867e4b9166c767b804d46a59b5216cde7a4e99040c5a40433225ee282a1b0a06c523eaf4534d7f83fa1155b0047718cbc546a0d072b04b3564eea1b422273f548271a0bb2316053fa76991955ebd63159434ecebb4e466dae5a1073a6727627097a1049e617d91d361094fa68f0ff77987130305beaba2eda04df997b714d6c6f2c299da65ba25e6a85842bf0440fd98a9a2266b061c4b3a13327c090f9a0789f58aad805275e4378a525f19232bfbfb749ede38480f405cf43ec2f1f8619ebcbc80a89e92a859c7911e674977ab17d4a7126a6b8a477358ff14a344d276ef6e504e10268ac3619fcf90c2d6c03fc2e3d1f290d9bf26c1fa1495dd8f97eec6229a55c2354e4524143551a5cc370a1c622c9390530cff21c3e1ed50c5e3daf97518ccce34156bdbd7eafab8bd417aef25c6c927301731bd319d247a1d5c3186ed10bfd9a7a24bac30e3e4503ed9204154d338b79ea276e7058e7f20f4d4fd1ac93d63f611af7b6d006c2a72add0eedc497b19cb30a198816664f0da00155f2e2d6ac61045b296d614301e0ad4983308028850dd4feffe3a8163970306e4047f5a165cb4befbc129729cd2e286e837e9b606486d402acc3dec5bf8b92387f6e486f2140

# TLS1 PRF tests, from NIST test vectors

KDF=TLS1-PRF