Commit 866e505e authored by Andy Polyakov's avatar Andy Polyakov
Browse files

sha/asm/sha512-armv8.pl: add NEON version of SHA256.



This provides up to 30% better performance on some of recent processors.

Reviewed-by: default avatarRichard Levitte <levitte@openssl.org>
parent 79dfc3dd
Loading
Loading
Loading
Loading
+313 −4
Original line number Diff line number Diff line
@@ -37,6 +37,20 @@
#	indication of some compiler "pathology", most notably code
#	generated with -mgeneral-regs-only is significanty faster
#	and the gap is only 40-90%.
#
# October 2016.
#
# Originally it was reckoned that it makes no sense to implement NEON
# version of SHA256 for 64-bit processors. This is because performance
# improvement on most wide-spread Cortex-A5x processors was observed
# to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
# observed that 32-bit NEON SHA256 performs significantly better than
# 64-bit scalar version on *some* of the more recent processors. As
# result 64-bit NEON version of SHA256 was added to provide best
# all-round performance. For example it executes ~30% faster on X-Gene
# and Mongoose. [For reference, NEON version of SHA512 is bound to
# deliver much less improvement, likely *negative* on Cortex-A5x.
# Which is why NEON support is limited to SHA256.]

$output=pop;
$flavour=pop;
@@ -195,6 +209,8 @@ $code.=<<___ if ($SZ==4);
	ldr	w16,[x16]
	tst	w16,#ARMV8_SHA256
	b.ne	.Lv8_entry
	tst	w16,#ARMV7_NEON
	b.ne	.Lneon_entry
#endif
___
$code.=<<___;
@@ -425,6 +441,296 @@ $code.=<<___;
___
}

if ($SZ==4) {	######################################### NEON stuff #
# You'll surely note a lot of similarities with sha256-armv4 module,
# and of course it's not a coincidence. sha256-armv4 was used as
# initial template, but was adapted for ARMv8 instruction set and
# extensively re-tuned for all-round performance.

my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10));
my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15));
my $Ktbl="x16";
my $Xfer="x17";
my @X = map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19));
my $j=0;

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; }
sub Dlo     { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; }
sub Dhi     { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; }

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&ext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&ext_8		($T3,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&mov		(&Dscalar($T7),&Dhi(@X[3]));	# X[14..15]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&ushr_32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	&ushr_32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	&add_32 	(@X[0],@X[0],$T3);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	&sli_32		($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&ushr_32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&eor_8		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&sli_32		($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &ushr_32	($T4,$T7,$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&eor_8		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &sli_32	($T4,$T7,32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &ushr_32	($T5,$T7,$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &ushr_32	($T3,$T7,$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&add_32		(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &sli_u32	($T3,$T7,32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &eor_8	($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &eor_8	($T5,$T5,$T3);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&add_32		(@X[0],@X[0],$T5);	# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &ushr_32	($T6,@X[0],$sigma1[0]);
	 eval(shift(@insns));
	  &ushr_32	($T7,@X[0],$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &sli_32	($T6,@X[0],32-$sigma1[0]);
	 eval(shift(@insns));
	  &ushr_32	($T5,@X[0],$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &eor_8	($T7,$T7,$T6);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &sli_32	($T5,@X[0],32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&ld1_32		("{$T0}","[$Ktbl], #16");
	 eval(shift(@insns));
	  &eor_8	($T7,$T7,$T5);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&eor_8		($T5,$T5,$T5);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&mov		(&Dhi($T5), &Dlo($T7));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&add_32		(@X[0],@X[0],$T5);	# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&add_32		($T0,$T0,@X[0]);
	 while($#insns>=1) { eval(shift(@insns)); }
	&st1_32		("{$T0}","[$Xfer], #16");
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&ld1_8		("{@X[0]}","[$inp],#16");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&ld1_32		("{$T0}","[$Ktbl],#16");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&rev32		(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&add_32		($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&st1_32		("{$T0}","[$Xfer], #16");

	push(@X,shift(@X));		# "rotate" X[]
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&add	($a,$a,$t4);'.			# h+=Sigma0(a) from the past
	'&and	($t1,$f,$e)',
	'&bic	($t4,$g,$e)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&orr	($t1,$t1,$t4)',			# Ch(e,f,g)
	'&eor	($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ror	($t0,$t0,"#$Sigma1[0]")',
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t0)',			# h+=Sigma1(e)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&ror	($t4,$t4,"#$Sigma0[0]")',
	'&add	($d,$d,$h)',			# d+=h
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

$code.=<<___;
#ifdef	__KERNEL__
.globl	sha256_block_neon
#endif
.type	sha256_block_neon,%function
.align	4
sha256_block_neon:
.Lneon_entry:
	stp	x29, x30, [sp, #-16]!
	mov	x29, sp
	sub	sp,sp,#16*4

	adr	$Ktbl,.LK256
	add	$num,$inp,$num,lsl#6	// len to point at the end of inp

	ld1.8	{@X[0]},[$inp], #16
	ld1.8	{@X[1]},[$inp], #16
	ld1.8	{@X[2]},[$inp], #16
	ld1.8	{@X[3]},[$inp], #16
	ld1.32	{$T0},[$Ktbl], #16
	ld1.32	{$T1},[$Ktbl], #16
	ld1.32	{$T2},[$Ktbl], #16
	ld1.32	{$T3},[$Ktbl], #16
	rev32	@X[0],@X[0]		// yes, even on
	rev32	@X[1],@X[1]		// big-endian
	rev32	@X[2],@X[2]
	rev32	@X[3],@X[3]
	mov	$Xfer,sp
	add.32	$T0,$T0,@X[0]
	add.32	$T1,$T1,@X[1]
	add.32	$T2,$T2,@X[2]
	st1.32	{$T0-$T1},[$Xfer], #32
	add.32	$T3,$T3,@X[3]
	st1.32	{$T2-$T3},[$Xfer]
	sub	$Xfer,$Xfer,#32

	ldp	$A,$B,[$ctx]
	ldp	$C,$D,[$ctx,#8]
	ldp	$E,$F,[$ctx,#16]
	ldp	$G,$H,[$ctx,#24]
	ldr	$t1,[sp,#0]
	mov	$t2,wzr
	eor	$t3,$B,$C
	mov	$t4,wzr
	b	.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	cmp	$t1,#0				// check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	sub	$Ktbl,$Ktbl,#256		// rewind $Ktbl
	cmp	$inp,$num
	mov	$Xfer, #64
	csel	$Xfer, $Xfer, xzr, eq
	sub	$inp,$inp,$Xfer			// avoid SEGV
	mov	$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	add	$A,$A,$t4			// h+=Sigma0(a) from the past
	ldp	$t0,$t1,[$ctx,#0]
	add	$A,$A,$t2			// h+=Maj(a,b,c) from the past
	ldp	$t2,$t3,[$ctx,#8]
	add	$A,$A,$t0			// accumulate
	add	$B,$B,$t1
	ldp	$t0,$t1,[$ctx,#16]
	add	$C,$C,$t2
	add	$D,$D,$t3
	ldp	$t2,$t3,[$ctx,#24]
	add	$E,$E,$t0
	add	$F,$F,$t1
	 ldr	$t1,[sp,#0]
	stp	$A,$B,[$ctx,#0]
	add	$G,$G,$t2
	 mov	$t2,wzr
	stp	$C,$D,[$ctx,#8]
	add	$H,$H,$t3
	stp	$E,$F,[$ctx,#16]
	 eor	$t3,$B,$C
	stp	$G,$H,[$ctx,#24]
	 mov	$t4,wzr
	 mov	$Xfer,sp
	b.ne	.L_00_48

	ldr	x29,[x29]
	add	sp,sp,#16*4+16
	ret
.size	sha256_block_neon,.-sha256_block_neon
___
}

$code.=<<___;
#ifndef	__KERNEL__
.comm	OPENSSL_armcap_P,4,4
@@ -456,12 +762,15 @@ close SELF;

foreach(split("\n",$code)) {

	s/\`([^\`]*)\`/eval($1)/geo;
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;

	s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo;
	s/\bq([0-9]+)\b/v$1.16b/g;		# old->new registers

	s/\.\w?32\b//o		and s/\.16b/\.4s/go;
	m/(ld|st)1[^\[]+\[0\]/o	and s/\.4s/\.s/go;
	s/\.[ui]?8(\s)/$1/;
	s/\.\w?32\b//		and s/\.16b/\.4s/g;
	m/(ld|st)1[^\[]+\[0\]/	and s/\.4s/\.s/g;

	print $_,"\n";
}