Commit c4d9ef4c authored by Andy Polyakov
Browse files

sha/asm/sha512p8-ppc.pl: improve POWER9 performance by ~10%.



The biggest part of the improvement, ~7%, resulted from omitting the
constants' table index increment in each round, and a minor part from
rescheduling instructions. Apparently POWER9 (and POWER8) manage to
dispatch instructions more efficiently if they are laid down as if they
have no latency...

Reviewed-by: Rich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/6406)
parent 1a467bd1
Loading
Loading
Loading
Loading
+55 −67
Original line number Diff line number Diff line
@@ -37,8 +37,8 @@
# build of sha512-ppc.pl, presented for reference.
#
#		POWER8		POWER9
# SHA256	9.9 [15.8]	12.2 [12.5]
# SHA512	6.3 [10.3]	7.7 [7.9]
# SHA256	9.7 [15.8]	11.2 [12.5]
# SHA512	6.1 [10.3]	7.0 [7.9]

$flavour=shift;
$output =shift;
@@ -79,7 +79,8 @@ if ($output =~ /512/) {
}

$func="sha${bits}_block_p8";
$FRAME=8*$SIZE_T;
$LOCALS=8*$SIZE_T+8*16;
$FRAME=$LOCALS+9*16+6*$SIZE_T;

$sp ="r1";
$toc="r2";
@@ -91,16 +92,17 @@ $idx="r7";
$lrsave="r8";
$offload="r11";
$vrsave="r12";
($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
@I = ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
      $x00=0 if ($flavour =~ /osx/);

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));
@X=map("v$_",(8..23));
($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31));
@X=map("v$_",(8..19,24..27));
($Ki,$Func,$Sigma,$lemask)=map("v$_",(28..31));

sub ROUND {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)%16;
my $k=($i+2)%8;

$code.=<<___		if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));
	lvx_u		@X[$i+1],0,$inp		; load X[i] in advance
@@ -112,26 +114,30 @@ ___
$code.=<<___		if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);
	vperm		@X[$i],@X[$i],@X[$i],$lemask
___
$code.=<<___		if ($i>=15);
	vshasigma${sz}	$Sigma,@X[($j+1)%16],0,0
	vaddu${sz}m	@X[$j],@X[$j],$Sigma
	vshasigma${sz}	$Sigma,@X[($j+14)%16],0,15
	vaddu${sz}m	@X[$j],@X[$j],$Sigma
	vaddu${sz}m	@X[$j],@X[$j],@X[($j+9)%16]
___
$code.=<<___;
	`"vshasigma${sz}	$s0,@X[($j+1)%16],0,0"		if ($i>=15)`
	vsel		$Func,$g,$f,$e		; Ch(e,f,g)
	vshasigma${sz}	$S1,$e,1,15		; Sigma1(e)
	vaddu${sz}m	$h,$h,@X[$i%16]		; h+=X[i]
	vshasigma${sz}	$S0,$a,1,0		; Sigma0(a)
	`"vshasigma${sz}	$s1,@X[($j+14)%16],0,15"	if ($i>=15)`
	vsel		$Func,$g,$f,$e		; Ch(e,f,g)
	vaddu${sz}m	$g,$g,$Ki		; future h+=K[i]
	vaddu${sz}m	$h,$h,$Func		; h+=Ch(e,f,g)
	vshasigma${sz}	$Sigma,$e,1,15		; Sigma1(e)
	vaddu${sz}m	$h,$h,$Sigma		; h+=Sigma1(e)
	vxor		$Func,$a,$b
	`"vaddu${sz}m		@X[$j],@X[$j],@X[($j+9)%16]"	if ($i>=15)`
	vaddu${sz}m	$h,$h,$S1		; h+=Sigma1(e)
	vsel		$Func,$b,$c,$Func	; Maj(a,b,c)
	vaddu${sz}m	$g,$g,$Ki		; future h+=K[i]
	vaddu${sz}m	$d,$d,$h		; d+=h
	vaddu${sz}m	$S0,$S0,$Func		; Sigma0(a)+Maj(a,b,c)
	`"vaddu${sz}m		@X[$j],@X[$j],$s0"		if ($i>=15)`
	lvx		$Ki,$idx,$Tbl		; load next K[i]
	addi		$idx,$idx,16
	vaddu${sz}m	$h,$h,$S0		; h+=Sigma0(a)+Maj(a,b,c)
	`"vaddu${sz}m		@X[$j],@X[$j],$s1"		if ($i>=15)`
	vshasigma${sz}	$Sigma,$a,1,0		; Sigma0(a)
	vaddu${sz}m	$Sigma,$Sigma,$Func	; Sigma0(a)+Maj(a,b,c)
	vaddu${sz}m	$h,$h,$Sigma		; h+=Sigma0(a)+Maj(a,b,c)
	lvx		$Ki,@I[$k],$idx		; load next K[i]
___
$code.=<<___		if ($k == 7);
	addi		$idx,$idx,0x80
___
}

@@ -142,21 +148,13 @@ $code=<<___;
.globl	$func
.align	6
$func:
	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
	$STU		$sp,-$FRAME($sp)
	mflr		$lrsave
	li		r10,`$FRAME+8*16+15`
	li		r11,`$FRAME+8*16+31`
	stvx		v20,r10,$sp		# ABI says so
	li		r10,`$LOCALS+15`
	li		r11,`$LOCALS+31`
	stvx		v24,r10,$sp		# ABI says so
	addi		r10,r10,32
	mfspr		$vrsave,256
	stvx		v21,r11,$sp
	addi		r11,r11,32
	stvx		v22,r10,$sp
	addi		r10,r10,32
	stvx		v23,r11,$sp
	addi		r11,r11,32
	stvx		v24,r10,$sp
	addi		r10,r10,32
	stvx		v25,r11,$sp
	addi		r11,r11,32
	stvx		v26,r10,$sp
@@ -169,26 +167,26 @@ $func:
	addi		r11,r11,32
	stvx		v30,r10,$sp
	stvx		v31,r11,$sp
	li		r11,-1
	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
	li		r11,-4096+255
	stw		$vrsave,`$FRAME+6*$SIZE_T-4`($sp)	# save vrsave
	li		$x10,0x10
	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	$PUSH		r26,`$FRAME-6*$SIZE_T`($sp)
	li		$x20,0x20
	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	$PUSH		r27,`$FRAME-5*$SIZE_T`($sp)
	li		$x30,0x30
	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	$PUSH		r28,`$FRAME-4*$SIZE_T`($sp)
	li		$x40,0x40
	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	$PUSH		r29,`$FRAME-3*$SIZE_T`($sp)
	li		$x50,0x50
	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	$PUSH		r30,`$FRAME-2*$SIZE_T`($sp)
	li		$x60,0x60
	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	$PUSH		r31,`$FRAME-1*$SIZE_T`($sp)
	li		$x70,0x70
	$PUSH		$lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
	$PUSH		$lrsave,`$FRAME+$LRSAVE`($sp)
	mtspr		256,r11

	bl		LPICmeup
	addi		$offload,$sp,$FRAME+15
	addi		$offload,$sp,`8*$SIZE_T+15`
___
$code.=<<___		if ($LENDIAN);
	li		$idx,8
@@ -222,9 +220,9 @@ $code.=<<___;
.align	5
Loop:
	lvx		$Ki,$x00,$Tbl
	li		$idx,16
	lvx_u		@X[0],0,$inp
	addi		$inp,$inp,16
	mr		$idx,$Tbl		# copy $Tbl
	stvx		$A,$x00,$offload	# offload $A-$H
	stvx		$B,$x10,$offload
	stvx		$C,$x20,$offload
@@ -234,8 +232,7 @@ Loop:
	stvx		$G,$x60,$offload
	stvx		$H,$x70,$offload
	vaddu${sz}m	$H,$H,$Ki		# h+K[i]
	lvx		$Ki,$idx,$Tbl
	addi		$idx,$idx,16
	lvx		$Ki,$x10,$Tbl
___
for ($i=0;$i<16;$i++)	{ &ROUND($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
@@ -268,10 +265,9 @@ $code.=<<___;
	bne		Loop
___
$code.=<<___		if ($SZ==4);
	lvx		@X[0],$idx,$Tbl
	addi		$idx,$idx,16
	lvx		@X[0],$x20,$idx
	vperm		$A,$A,$B,$Ki		# pack the answer
	lvx		@X[1],$idx,$Tbl
	lvx		@X[1],$x30,$idx
	vperm		$E,$E,$F,$Ki
	vperm		$A,$A,$C,@X[0]
	vperm		$E,$E,$G,@X[0]
@@ -291,19 +287,11 @@ $code.=<<___ if ($SZ==8);
	stvx_u		$G,$x30,$ctx
___
$code.=<<___;
	li		r10,`$FRAME+8*16+15`
	li		r10,`$LOCALS+15`
	mtlr		$lrsave
	li		r11,`$FRAME+8*16+31`
	li		r11,`$LOCALS+31`
	mtspr		256,$vrsave
	lvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	lvx		v21,r11,$sp
	addi		r11,r11,32
	lvx		v22,r10,$sp
	addi		r10,r10,32
	lvx		v23,r11,$sp
	addi		r11,r11,32
	lvx		v24,r10,$sp
	lvx		v24,r10,$sp		# ABI says so
	addi		r10,r10,32
	lvx		v25,r11,$sp
	addi		r11,r11,32
@@ -317,13 +305,13 @@ $code.=<<___;
	addi		r11,r11,32
	lvx		v30,r10,$sp
	lvx		v31,r11,$sp
	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
	$POP		r26,`$FRAME-6*$SIZE_T`($sp)
	$POP		r27,`$FRAME-5*$SIZE_T`($sp)
	$POP		r28,`$FRAME-4*$SIZE_T`($sp)
	$POP		r29,`$FRAME-3*$SIZE_T`($sp)
	$POP		r30,`$FRAME-2*$SIZE_T`($sp)
	$POP		r31,`$FRAME-1*$SIZE_T`($sp)
	addi		$sp,$sp,$FRAME
	blr
	.long		0
	.byte		0,12,4,1,0x80,6,3,0