sha/asm/sha512-armv8.pl: add NEON version of SHA256. (866e505e) · Commits · CYBER - Cyber Security / TS 103 523 MSP / TLMSP / TLMSP OpenSSL

crypto/sha/asm/sha512-armv8.pl

+313 −4

Original line number	Diff line number	Diff line
		@@ -37,6 +37,20 @@
		# indication of some compiler "pathology", most notably code
		# generated with -mgeneral-regs-only is significantly faster
		# and the gap is only 40-90%.
		#
		# October 2016.
		#
		# Originally it was reckoned that it makes no sense to implement NEON
		# version of SHA256 for 64-bit processors. This is because performance
		# improvement on most wide-spread Cortex-A5x processors was observed
		# to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
		# observed that 32-bit NEON SHA256 performs significantly better than
		# 64-bit scalar version on some of the more recent processors. As a
		# result, a 64-bit NEON version of SHA256 was added to provide the best
		# all-round performance. For example it executes ~30% faster on X-Gene
		# and Mongoose. [For reference, NEON version of SHA512 is bound to
		# deliver much less improvement, likely negative on Cortex-A5x.
		# Which is why NEON support is limited to SHA256.]

		# Arguments are consumed from the end of the argument list: the last one
		# goes to $output, the one before it to $flavour.
		# NOTE(review): presumably these run at file scope, where a bare pop works
		# on @ARGV, and $output names the file to generate - confirm against the
		# full script, which is not visible in this hunk.
		$output=pop;
		$flavour=pop;
		@@ -195,6 +209,8 @@ $code.=<<___ if ($SZ==4);
		ldr w16,[x16]
		tst w16,#ARMV8_SHA256
		b.ne .Lv8_entry
		tst w16,#ARMV7_NEON
		b.ne .Lneon_entry
		#endif
		___
		$code.=<<___;
		@@ -425,6 +441,296 @@ $code.=<<___;
		___
		}

		if ($SZ==4) { ######################################### NEON stuff #
		# You'll surely note a lot of similarities with sha256-armv4 module,
		# and of course it's not a coincidence. sha256-armv4 was used as
		# initial template, but was adapted for ARMv8 instruction set and
		# extensively re-tuned for all-round performance.

		# Register allocation for the NEON code path, spelled out explicitly.
		# Working variables a..h occupy w3-w10; @V is the view of them that the
		# round snippets rotate.
		my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = qw(w3 w4 w5 w6 w7 w8 w9 w10);
		# Scalar temporaries used by the round snippets.
		my ($t0,$t1,$t2,$t3,$t4) = qw(w11 w12 w13 w14 w15);
		# Address registers: constant-table cursor and stack transfer-area cursor.
		my ($Ktbl,$Xfer) = ("x16","x17");
		# Four message-schedule vectors followed by eight vector temporaries.
		my @X = qw(q0 q1 q2 q3);
		my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = qw(q4 q5 q6 q7 q16 q17 q18 q19);
		# Round counter advanced by the round snippets.
		my $j = 0;

		# thunk [simplified] x86-style perlasm
		#
		# Any call to an otherwise undefined sub lands here and is turned into one
		# line of assembly appended to $code:
		#   - the mnemonic is the called name with its package prefix removed and
		#     its first "_" replaced by "." (ext_8 becomes ext.8);
		#   - the operands are the call arguments joined with ",";
		#   - a last operand that survives a numeric round trip unchanged is
		#     treated as an immediate and prefixed with "#".
		sub AUTOLOAD()
		{
			(my $mnemonic = $AUTOLOAD) =~ s/.*:://;
			$mnemonic =~ s/_/\./;

			my @operands = @_;
			my $last = pop(@operands);
			$last = "#$last" if ($last*1 eq $last);

			$code .= "\t$mnemonic\t".join(',',@operands,$last)."\n";
		}

		# Dscalar(REG): map a NEON register name containing "qN" or "vN" to "dN",
		# the 64-bit scalar name of the same register number.  Returns "" when the
		# argument carries no q/v register number.
		# The match uses brace delimiters; the previous "m\|...\|" form did not
		# parse as the intended match-then-ternary expression.
		sub Dscalar { my ($reg) = @_; return $reg =~ m{[qv]([0-9]+)} ? "d$1" : ""; }
		# Dlo(REG): map a NEON register name containing "qN" or "vN" to "vN.d[0]",
		# the low 64-bit lane of that register.  Returns "" when the argument
		# carries no q/v register number.
		# The match uses brace delimiters; the previous "m\|...\|" form did not
		# parse as the intended match-then-ternary expression.
		sub Dlo { my ($reg) = @_; return $reg =~ m{[qv]([0-9]+)} ? "v$1.d[0]" : ""; }
		# Dhi(REG): map a NEON register name containing "qN" or "vN" to "vN.d[1]",
		# the high 64-bit lane of that register.  Returns "" when the argument
		# carries no q/v register number.
		# The match uses brace delimiters; the previous "m\|...\|" form did not
		# parse as the intended match-then-ternary expression.
		sub Dhi { my ($reg) = @_; return $reg =~ m{[qv]([0-9]+)} ? "v$1.d[1]" : ""; }

		# Xupdate(BODY): emit the code for four rounds together with the NEON
		# message-schedule update that produces the next four schedule words.
		#
		# BODY is a code reference which, each time it is called, returns the list
		# of Perl snippets (strings) that emit one round; body_00_15 is the one
		# passed in by the callers in this file.  Four rounds' worth of snippets
		# are queued in @insns and eval'ed one or a few at a time between the NEON
		# instructions, so the scalar and vector instruction streams end up
		# interleaved in the generated assembly.  The exact placement of every
		# eval below is the instruction schedule - do not reorder.
		#
		# The vector side updates X[0] in place (see the per-line notes), loads
		# four constants from [$Ktbl] with post-increment, adds them to the new
		# X[0] and stores the sums to [$Xfer] with post-increment.  Finally @X is
		# rotated so that the next call operates on the following four words.
		# Shift amounts come from @sigma0/@sigma1, which are defined outside this
		# section.
		sub Xupdate()
		{ use integer;
		my $body = shift;
		my @insns = (&$body,&$body,&$body,&$body);
		# Lexicals that the eval'ed snippets assign from @V and then use as
		# operand names; declaring them here keeps the evals off the globals.
		my ($a,$b,$c,$d,$e,$f,$g,$h);

		&ext_8 ($T0,@X[0],@X[1],4); # X[1..4]
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
		&ext_8 ($T3,@X[2],@X[3],4); # X[9..12]
		eval(shift(@insns));
		eval(shift(@insns));
		&mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15]
		eval(shift(@insns));
		eval(shift(@insns));
		&ushr_32 ($T2,$T0,$sigma0[0]);
		eval(shift(@insns));
		&ushr_32 ($T1,$T0,$sigma0[2]);
		eval(shift(@insns));
		&add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12]
		eval(shift(@insns));
		# Each ushr/sli pair with amounts n and 32-n forms a 32-bit rotate by n.
		&sli_32 ($T2,$T0,32-$sigma0[0]);
		eval(shift(@insns));
		eval(shift(@insns));
		&ushr_32 ($T3,$T0,$sigma0[1]);
		eval(shift(@insns));
		eval(shift(@insns));
		&eor_8 ($T1,$T1,$T2);
		eval(shift(@insns));
		eval(shift(@insns));
		&sli_32 ($T3,$T0,32-$sigma0[1]);
		eval(shift(@insns));
		eval(shift(@insns));
		&ushr_32 ($T4,$T7,$sigma1[0]);
		eval(shift(@insns));
		eval(shift(@insns));
		&eor_8 ($T1,$T1,$T3); # sigma0(X[1..4])
		eval(shift(@insns));
		eval(shift(@insns));
		&sli_32 ($T4,$T7,32-$sigma1[0]);
		eval(shift(@insns));
		eval(shift(@insns));
		&ushr_32 ($T5,$T7,$sigma1[2]);
		eval(shift(@insns));
		eval(shift(@insns));
		&ushr_32 ($T3,$T7,$sigma1[1]);
		eval(shift(@insns));
		eval(shift(@insns));
		&add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
		eval(shift(@insns));
		eval(shift(@insns));
		# Emitted as "sli.u32"; the output filter at the end of the file removes
		# ".u32" the same way it removes ".32", so this is a plain sli as well.
		&sli_u32 ($T3,$T7,32-$sigma1[1]);
		eval(shift(@insns));
		eval(shift(@insns));
		&eor_8 ($T5,$T5,$T4);
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
		&eor_8 ($T5,$T5,$T3); # sigma1(X[14..15])
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
		&add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15])
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
		# Second half: sigma1 of the two words just completed feeds X[2..3].
		&ushr_32 ($T6,@X[0],$sigma1[0]);
		eval(shift(@insns));
		&ushr_32 ($T7,@X[0],$sigma1[2]);
		eval(shift(@insns));
		eval(shift(@insns));
		&sli_32 ($T6,@X[0],32-$sigma1[0]);
		eval(shift(@insns));
		&ushr_32 ($T5,@X[0],$sigma1[1]);
		eval(shift(@insns));
		eval(shift(@insns));
		&eor_8 ($T7,$T7,$T6);
		eval(shift(@insns));
		eval(shift(@insns));
		&sli_32 ($T5,@X[0],32-$sigma1[1]);
		eval(shift(@insns));
		eval(shift(@insns));
		# Fetch the next four round constants while the sigma1 result settles.
		&ld1_32 ("{$T0}","[$Ktbl], #16");
		eval(shift(@insns));
		&eor_8 ($T7,$T7,$T5); # sigma1(X[16..17])
		eval(shift(@insns));
		eval(shift(@insns));
		# Zero $T5, then place the low half of $T7 in its high half so that the
		# add below only affects the upper two words of X[0].
		&eor_8 ($T5,$T5,$T5);
		eval(shift(@insns));
		eval(shift(@insns));
		&mov (&Dhi($T5), &Dlo($T7));
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
		&add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17])
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
		&add_32 ($T0,$T0,@X[0]);
		# Drain all but the last queued snippet, so that the store is emitted
		# before the final one.
		while($#insns>=1) { eval(shift(@insns)); }
		&st1_32 ("{$T0}","[$Xfer], #16");
		eval(shift(@insns));

		push(@X,shift(@X)); # "rotate" X[]
		}

		# Xpreload(BODY): emit the code for four rounds interleaved with the load
		# of the next 16 input bytes.
		#
		# BODY is a code reference which returns, per call, the list of Perl
		# snippets (strings) that emit one round; four rounds' worth are queued in
		# @insns and eval'ed between the NEON instructions.  The vector side loads
		# 16 bytes from [$inp] into X[0] and four constants from [$Ktbl] into $T0
		# (both post-incremented), applies rev32 to X[0], adds the two and stores
		# the sums to [$Xfer] with post-increment.  @X is rotated afterwards so the
		# next call fills the following vector.  No schedule update is done here.
		sub Xpreload()
		{ use integer;
		my $body = shift;
		my @insns = (&$body,&$body,&$body,&$body);
		# Lexicals that the eval'ed snippets assign from @V and then use as
		# operand names.
		my ($a,$b,$c,$d,$e,$f,$g,$h);

		eval(shift(@insns));
		eval(shift(@insns));
		&ld1_8 ("{@X[0]}","[$inp],#16");
		eval(shift(@insns));
		eval(shift(@insns));
		&ld1_32 ("{$T0}","[$Ktbl],#16");
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
		&rev32 (@X[0],@X[0]);
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
		&add_32 ($T0,$T0,@X[0]);
		# Unlike Xupdate, every remaining snippet is emitted before the store.
		foreach (@insns) { eval; } # remaining instructions
		&st1_32 ("{$T0}","[$Xfer], #16");

		push(@X,shift(@X)); # "rotate" X[]
		}

		# body_00_15: return the list of Perl snippets that emit one round.
		#
		# Each list element is a string of Perl code which the caller evals on its
		# own, possibly with vector instructions emitted in between.  Strings
		# joined with "." form a single element and are therefore always emitted
		# together.  The snippets are evaluated in the caller's scope: the first
		# one binds $a..$h to the current order of @V, the last one advances the
		# round counter $j, rotates @V by one position and swaps $t2/$t3.
		#
		# Two additions are deferred to the following round ("from the past" in
		# the notes below): Sigma0(a) in $t4 and Maj(a,b,c) in $t2 are added at
		# the start of the next round, to the register that is then named $a.
		sub body_00_15 () {
		(
		'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
		'&add ($h,$h,$t1)', # h+=X[i]+K[i]
		'&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past
		'&and ($t1,$f,$e)',
		'&bic ($t4,$g,$e)',
		'&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
		'&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
		'&orr ($t1,$t1,$t4)', # Ch(e,f,g)
		'&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
		'&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
		'&add ($h,$h,$t1)', # h+=Ch(e,f,g)
		'&ror ($t0,$t0,"#$Sigma1[0]")',
		'&eor ($t2,$a,$b)', # a^b, b^c in next round
		'&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
		'&add ($h,$h,$t0)', # h+=Sigma1(e)
		# Reload $t1 for the next round from stack slot ($j+1)&15, except in
		# rounds where ($j&15)==15; in round 15 only, $t1 is loaded from
		# [$Ktbl] instead.
		'&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
		'&ldr ($t1,"[$Ktbl]") if ($j==15);'.
		'&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
		'&ror ($t4,$t4,"#$Sigma0[0]")',
		'&add ($d,$d,$h)', # d+=h
		'&eor ($t3,$t3,$b)', # Maj(a,b,c)
		'$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
		)
		}

		# Entry of sha256_block_neon (also reached through .Lneon_entry).
		# The emitted code:
		#   - saves x29/x30 and reserves 16*4 bytes of stack as the transfer area;
		#   - points $Ktbl at .LK256 and turns $num into an end-of-input pointer
		#     ($inp + $num*64);
		#   - loads 64 input bytes into X[0..3] and 16 constants into $T0..$T3,
		#     applies rev32 to the input and stores the four sums on the stack,
		#     leaving $Xfer pointing at the second half of the area;
		#   - loads the eight 32-bit state words from [$ctx], the first stack word
		#     into $t1, and primes $t2/$t4 with zero and $t3 with b^c.
		# $ctx, $inp and $num are defined outside this section.
		$code.=<<___;
		#ifdef __KERNEL__
		.globl sha256_block_neon
		#endif
		.type sha256_block_neon,%function
		.align 4
		sha256_block_neon:
		.Lneon_entry:
		stp x29, x30, [sp, #-16]!
		mov x29, sp
		sub sp,sp,#16*4

		adr $Ktbl,.LK256
		add $num,$inp,$num,lsl#6 // len to point at the end of inp

		ld1.8 {@X[0]},[$inp], #16
		ld1.8 {@X[1]},[$inp], #16
		ld1.8 {@X[2]},[$inp], #16
		ld1.8 {@X[3]},[$inp], #16
		ld1.32 {$T0},[$Ktbl], #16
		ld1.32 {$T1},[$Ktbl], #16
		ld1.32 {$T2},[$Ktbl], #16
		ld1.32 {$T3},[$Ktbl], #16
		rev32 @X[0],@X[0] // yes, even on
		rev32 @X[1],@X[1] // big-endian
		rev32 @X[2],@X[2]
		rev32 @X[3],@X[3]
		mov $Xfer,sp
		add.32 $T0,$T0,@X[0]
		add.32 $T1,$T1,@X[1]
		add.32 $T2,$T2,@X[2]
		st1.32 {$T0-$T1},[$Xfer], #32
		add.32 $T3,$T3,@X[3]
		st1.32 {$T2-$T3},[$Xfer]
		sub $Xfer,$Xfer,#32

		ldp $A,$B,[$ctx]
		ldp $C,$D,[$ctx,#8]
		ldp $E,$F,[$ctx,#16]
		ldp $G,$H,[$ctx,#24]
		ldr $t1,[sp,#0]
		mov $t2,wzr
		eor $t3,$B,$C
		mov $t4,wzr
		b .L_00_48

		.align 4
		.L_00_48:
		___
		# Body of .L_00_48: sixteen rounds, each group of four interleaved with a
		# schedule update.  The generated code is emitted once and looped at run
		# time.
		&Xupdate(\&body_00_15);
		&Xupdate(\&body_00_15);
		&Xupdate(\&body_00_15);
		&Xupdate(\&body_00_15);
		# Loop control and block-boundary fix-up.  The emitted code:
		#   - repeats .L_00_48 until the word that round 15 loaded from [$Ktbl]
		#     is zero, reloading $t1 from the stack and moving $Xfer back by 64
		#     on the way;
		#   - then rewinds $Ktbl by 256 bytes and, when $inp has reached $num,
		#     steps $inp back by 64 so that the preloads which follow do not read
		#     past the end of the input;
		#   - resets $Xfer to the start of the stack transfer area.
		$code.=<<___;
		cmp $t1,#0 // check for K256 terminator
		ldr $t1,[sp,#0]
		sub $Xfer,$Xfer,#64
		bne .L_00_48

		sub $Ktbl,$Ktbl,#256 // rewind $Ktbl
		cmp $inp,$num
		mov $Xfer, #64
		csel $Xfer, $Xfer, xzr, eq
		sub $inp,$inp,$Xfer // avoid SEGV
		mov $Xfer,sp
		___
		# Sixteen more rounds, each group of four interleaved with the load of 16
		# bytes of the next input block and the store of its sums to the stack.
		&Xpreload(\&body_00_15);
		&Xpreload(\&body_00_15);
		&Xpreload(\&body_00_15);
		&Xpreload(\&body_00_15);
		# End of block.  The emitted code applies the two additions still pending
		# from the last round, adds the previous state words from [$ctx] to the
		# working variables and stores the result back, re-primes $t1..$t4 and
		# $Xfer as on entry, and branches back to .L_00_48 on "ne".
		# None of the instructions emitted between the earlier "cmp $inp,$num"
		# and this branch set the condition flags, so the branch tests that
		# comparison.  On exit x29 is reloaded from the frame and the stack area
		# plus the 16-byte frame are released.
		$code.=<<___;
		add $A,$A,$t4 // h+=Sigma0(a) from the past
		ldp $t0,$t1,[$ctx,#0]
		add $A,$A,$t2 // h+=Maj(a,b,c) from the past
		ldp $t2,$t3,[$ctx,#8]
		add $A,$A,$t0 // accumulate
		add $B,$B,$t1
		ldp $t0,$t1,[$ctx,#16]
		add $C,$C,$t2
		add $D,$D,$t3
		ldp $t2,$t3,[$ctx,#24]
		add $E,$E,$t0
		add $F,$F,$t1
		ldr $t1,[sp,#0]
		stp $A,$B,[$ctx,#0]
		add $G,$G,$t2
		mov $t2,wzr
		stp $C,$D,[$ctx,#8]
		add $H,$H,$t3
		stp $E,$F,[$ctx,#16]
		eor $t3,$B,$C
		stp $G,$H,[$ctx,#24]
		mov $t4,wzr
		mov $Xfer,sp
		b.ne .L_00_48

		ldr x29,[x29]
		add sp,sp,#16*4+16
		ret
		.size sha256_block_neon,.-sha256_block_neon
		___
		}

		$code.=<<___;
		#ifndef __KERNEL__
		.comm OPENSSL_armcap_P,4,4
		@@ -456,12 +762,15 @@ close SELF;

		# Output filter: post-process the accumulated $code line by line and print
		# the final assembly.
		#
		# Each rewrite is applied exactly once per line.  The duplicated older
		# variants of these statements (the ones carrying the /o flag) are gone:
		# they repeated work the statements below already do, and a second pass of
		# the sha256 rewrite could have re-matched its own output.
		# NOTE(review): the latter depends on what unsha256 returns, which is
		# defined outside this section - confirm.
		foreach(split("\n",$code)) {

		# Expand `...` spans by evaluating their contents as Perl.
		s/\`([^\`]*)\`/eval($1)/ge;

		# Hand sha256* instructions with vector operands to unsha256.
		s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;

		s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers

		# Drop the ".8"/".u8"/".i8" mnemonic suffix; the operands keep .16b.
		s/\.[ui]?8(\s)/$1/;
		# Drop a ".32"/".u32"-style suffix and switch the operands to .4s.
		s/\.\w?32\b// and s/\.16b/\.4s/g;
		# Single-lane ld1/st1 forms use ".s".  The alternation must be a bare
		# "|": an escaped "\|" would only match the literal text "ld|st".
		m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g;

		print $_,"\n";
		}