Commit d5487a45 authored by Andy Polyakov's avatar Andy Polyakov
Browse files

chacha/asm/chacha-x86_64.pl: add dedicated path for 128-byte inputs.



The 128-byte vectors are extensively used in chacha20_poly1305_tls_cipher
and dedicated code path is ~30-50% faster on most platforms.

Reviewed-by: default avatarRich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/6626)
parent b068a9b9
Loading
Loading
Loading
Loading
+221 −74
Original line number Diff line number Diff line
#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
@@ -28,33 +28,32 @@
#
# Performance in cycles per byte out of large buffer.
#
#		IALU/gcc 4.8(i)	1xSSSE3/SSE2	4xSSSE3	    NxAVX(v)
#		IALU/gcc 4.8(i)	1x/2xSSSE3(ii)	4xSSSE3	    NxAVX(v)
#
# P4		9.48/+99%	-/22.7(ii)	-
# Core2		7.83/+55%	7.90/8.08	4.35
# Westmere	7.19/+50%	5.60/6.70	3.00
# Sandy Bridge	8.31/+42%	5.45/6.76	2.72
# Ivy Bridge	6.71/+46%	5.40/6.49	2.41
# Haswell	5.92/+43%	5.20/6.45	2.42	    1.23
# Skylake[-X]	5.87/+39%	4.70/-		2.31	    1.19[0.80(vi)]
# Silvermont	12.0/+33%	7.75/7.40	7.03(iii)
# Knights L	11.7/-		-		9.60(iii)   0.80
# Goldmont	10.6/+17%	5.10/-		3.28
# Sledgehammer	7.28/+52%	-/14.2(ii)	-
# Bulldozer	9.66/+28%	9.85/11.1	3.06(iv)
# Ryzen		5.96/+50%	5.19/-		2.40        2.09
# VIA Nano	10.5/+46%	6.72/8.60	6.05
# P4		9.48/+99%	-		-
# Core2		7.83/+55%	7.90/5.76	4.35
# Westmere	7.19/+50%	5.60/4.50	3.00
# Sandy Bridge	8.31/+42%	5.45/4.00	2.72
# Ivy Bridge	6.71/+46%	5.40/?		2.41
# Haswell	5.92/+43%	5.20/3.45	2.42        1.23
# Skylake[-X]	5.87/+39%	4.70/3.22	2.31        1.19[0.80(vi)]
# Silvermont	12.0/+33%	7.75/6.90	7.03(iii)
# Knights L	11.7/-		?		9.60(iii)   0.80
# Goldmont	10.6/+17%	5.10/3.52	3.28
# Sledgehammer	7.28/+52%	-		-
# Bulldozer	9.66/+28%	9.85/5.35(iv)	3.06(iv)
# Ryzen		5.96/+50%	5.19/3.00	2.40        2.09
# VIA Nano	10.5/+46%	6.72/6.88	6.05
#
# (i)	compared to older gcc 3.x one can observe >2x improvement on
#	most platforms;
# (ii)	as it can be seen, SSE2 performance is too low on legacy
#	processors; NxSSE2 results are naturally better, but not
#	impressively better than IALU ones, which is why you won't
#	find SSE2 code below;
# (ii)	2xSSSE3 is code path optimized specifically for 128 bytes used
#	by chacha20_poly1305_tls_cipher, results are EVP-free;
# (iii)	this is not optimal result for Atom because of MSROM
#	limitations, SSE2 can do better, but gain is considered too
#	low to justify the [maintenance] effort;
# (iv)	Bulldozer actually executes 4xXOP code path that delivers 2.20;
# (iv)	Bulldozer actually executes 4xXOP code path that delivers 2.20
#	and 4.85 for 128-byte inputs;
# (v)	8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applicable;
# (vi)	even though Skylake-X can execute AVX512F code and deliver 0.57
#	cpb in single thread, the corresponding capability is suppressed;
@@ -489,6 +488,7 @@ $code.=<<___ if ($avx);
___
$code.=<<___;
	cmp	\$128,$len		# we might throw away some data,
	je	.LChaCha20_128
	ja	.LChaCha20_4x		# but overall it won't be slower

.Ldo_sse3_after_all:
@@ -605,6 +605,172 @@ $code.=<<___;
___
}

########################################################################
# SSSE3 code path that handles 128-byte inputs
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7));
my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1));

sub SSSE3ROUND_2x {
	&paddd	($a,$b);
	&pxor	($d,$a);
	 &paddd	($a1,$b1);
	 &pxor	($d1,$a1);
	&pshufb	($d,$rot16);
	 &pshufb($d1,$rot16);

	&paddd	($c,$d);
	 &paddd	($c1,$d1);
	&pxor	($b,$c);
	 &pxor	($b1,$c1);
	&movdqa	($t,$b);
	&psrld	($b,20);
	 &movdqa($t1,$b1);
	&pslld	($t,12);
	 &psrld	($b1,20);
	&por	($b,$t);
	 &pslld	($t1,12);
	 &por	($b1,$t1);

	&paddd	($a,$b);
	&pxor	($d,$a);
	 &paddd	($a1,$b1);
	 &pxor	($d1,$a1);
	&pshufb	($d,$rot24);
	 &pshufb($d1,$rot24);

	&paddd	($c,$d);
	 &paddd	($c1,$d1);
	&pxor	($b,$c);
	 &pxor	($b1,$c1);
	&movdqa	($t,$b);
	&psrld	($b,25);
	 &movdqa($t1,$b1);
	&pslld	($t,7);
	 &psrld	($b1,25);
	&por	($b,$t);
	 &pslld	($t1,7);
	 &por	($b1,$t1);
}

my $xframe = $win64 ? 0x68 : 8;

$code.=<<___;
.type	ChaCha20_128,\@function,5
.align	32
ChaCha20_128:
.cfi_startproc
.LChaCha20_128:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	sub	\$64+$xframe,%rsp
___
$code.=<<___	if ($win64);
	movaps	%xmm6,-0x68(%r9)
	movaps	%xmm7,-0x58(%r9)
	movaps	%xmm8,-0x48(%r9)
	movaps	%xmm9,-0x38(%r9)
	movaps	%xmm10,-0x28(%r9)
	movaps	%xmm11,-0x18(%r9)
.L128_body:
___
$code.=<<___;
	movdqa	.Lsigma(%rip),$a
	movdqu	($key),$b
	movdqu	16($key),$c
	movdqu	($counter),$d
	movdqa	.Lone(%rip),$d1
	movdqa	.Lrot16(%rip),$rot16
	movdqa	.Lrot24(%rip),$rot24

	movdqa	$a,$a1
	movdqa	$a,0x00(%rsp)
	movdqa	$b,$b1
	movdqa	$b,0x10(%rsp)
	movdqa	$c,$c1
	movdqa	$c,0x20(%rsp)
	paddd	$d,$d1
	movdqa	$d,0x30(%rsp)
	mov	\$10,$counter		# reuse $counter
	jmp	.Loop_128

.align	32
.Loop_128:
___
	&SSSE3ROUND_2x();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b00111001);
	&pshufd	($d,$d,0b10010011);
	&pshufd	($c1,$c1,0b01001110);
	&pshufd	($b1,$b1,0b00111001);
	&pshufd	($d1,$d1,0b10010011);

	&SSSE3ROUND_2x();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b10010011);
	&pshufd	($d,$d,0b00111001);
	&pshufd	($c1,$c1,0b01001110);
	&pshufd	($b1,$b1,0b10010011);
	&pshufd	($d1,$d1,0b00111001);

	&dec	($counter);
	&jnz	(".Loop_128");

$code.=<<___;
	paddd	0x00(%rsp),$a
	paddd	0x10(%rsp),$b
	paddd	0x20(%rsp),$c
	paddd	0x30(%rsp),$d
	paddd	.Lone(%rip),$d1
	paddd	0x00(%rsp),$a1
	paddd	0x10(%rsp),$b1
	paddd	0x20(%rsp),$c1
	paddd	0x30(%rsp),$d1

	movdqu	0x00($inp),$t
	movdqu	0x10($inp),$t1
	pxor	$t,$a			# xor with input
	movdqu	0x20($inp),$t
	pxor	$t1,$b
	movdqu	0x30($inp),$t1
	pxor	$t,$c
	movdqu	0x40($inp),$t
	pxor	$t1,$d
	movdqu	0x50($inp),$t1
	pxor	$t,$a1
	movdqu	0x60($inp),$t
	pxor	$t1,$b1
	movdqu	0x70($inp),$t1
	pxor	$t,$c1
	pxor	$t1,$d1

	movdqu	$a,0x00($out)		# write output
	movdqu	$b,0x10($out)
	movdqu	$c,0x20($out)
	movdqu	$d,0x30($out)
	movdqu	$a1,0x40($out)
	movdqu	$b1,0x50($out)
	movdqu	$c1,0x60($out)
	movdqu	$d1,0x70($out)
___
$code.=<<___	if ($win64);
	movaps	-0x68(%r9),%xmm6
	movaps	-0x58(%r9),%xmm7
	movaps	-0x48(%r9),%xmm8
	movaps	-0x38(%r9),%xmm9
	movaps	-0x28(%r9),%xmm10
	movaps	-0x18(%r9),%xmm11
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L128_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_128,.-ChaCha20_128
___
}

########################################################################
# SSSE3 code path that handles longer messages.
{
@@ -3674,9 +3840,9 @@ se_handler:
	ret
.size	se_handler,.-se_handler

.type	ssse3_handler,\@abi-omnipotent
.type	simd_handler,\@abi-omnipotent
.align	16
ssse3_handler:
simd_handler:
	push	%rsi
	push	%rdi
	push	%rbx
@@ -3702,57 +3868,20 @@ ssse3_handler:
	mov	192($context),%rax	# pull context->R9

	mov	4(%r11),%r10d		# HandlerData[1]
	mov	8(%r11),%ecx		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	-0x28(%rax),%rsi
	neg	%rcx
	lea	-8(%rax,%rcx),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$4,%ecx
	neg	%ecx
	shr	\$3,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_seh_tail
.size	ssse3_handler,.-ssse3_handler

.type	full_handler,\@abi-omnipotent
.align	16
full_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	192($context),%rax	# pull context->R9

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	-0xa8(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_seh_tail
.size	full_handler,.-full_handler
.size	simd_handler,.-simd_handler

.section	.pdata
.align	4
@@ -3764,6 +3893,10 @@ full_handler:
	.rva	.LSEH_end_ChaCha20_ssse3
	.rva	.LSEH_info_ChaCha20_ssse3

	.rva	.LSEH_begin_ChaCha20_128
	.rva	.LSEH_end_ChaCha20_128
	.rva	.LSEH_info_ChaCha20_128

	.rva	.LSEH_begin_ChaCha20_4x
	.rva	.LSEH_end_ChaCha20_4x
	.rva	.LSEH_info_ChaCha20_4x
@@ -3804,46 +3937,60 @@ $code.=<<___;

.LSEH_info_ChaCha20_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	simd_handler
	.rva	.Lssse3_body,.Lssse3_epilogue
	.long	0x20,0

.LSEH_info_ChaCha20_128:
	.byte	9,0,0,0
	.rva	simd_handler
	.rva	.L128_body,.L128_epilogue
	.long	0x60,0

.LSEH_info_ChaCha20_4x:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	simd_handler
	.rva	.L4x_body,.L4x_epilogue
	.long	0xa0,0
___
$code.=<<___ if ($avx);
.LSEH_info_ChaCha20_4xop:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	simd_handler
	.rva	.L4xop_body,.L4xop_epilogue		# HandlerData[]
	.long	0xa0,0
___
$code.=<<___ if ($avx>1);
.LSEH_info_ChaCha20_8x:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	simd_handler
	.rva	.L8x_body,.L8x_epilogue			# HandlerData[]
	.long	0xa0,0
___
$code.=<<___ if ($avx>2);
.LSEH_info_ChaCha20_avx512:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	simd_handler
	.rva	.Lavx512_body,.Lavx512_epilogue		# HandlerData[]
	.long	0x20,0

.LSEH_info_ChaCha20_avx512vl:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	simd_handler
	.rva	.Lavx512vl_body,.Lavx512vl_epilogue	# HandlerData[]
	.long	0x20,0

.LSEH_info_ChaCha20_16x:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	simd_handler
	.rva	.L16x_body,.L16x_epilogue		# HandlerData[]
	.long	0xa0,0

.LSEH_info_ChaCha20_8xvl:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	simd_handler
	.rva	.L8xvl_body,.L8xvl_epilogue		# HandlerData[]
	.long	0xa0,0
___
}