Commit 9986bfef authored by Andy Polyakov
Browse files

sha/asm/keccak1600-armv8.pl: halve the size of hw-assisted subroutine.



Yes, it's the second halving, i.e. the subroutine (or, more specifically,
its inner loop) is now 1/4 of its original size. The challenge with Keccak
is that you need more temporary registers than there are available. By
reversing the order in which columns are assigned in Chi, it's possible to
use three of the A[][] registers as temporaries prior to their assignment.

Reviewed-by: Richard Levitte <levitte@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/7294)
parent 03ad7c00
Loading
Loading
Loading
Loading
+69 −77
Original line number Diff line number Diff line
@@ -533,30 +533,28 @@ my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b",
            (0, 5, 10, 15, 20));

# @C names the seven vector registers v25..v31, each with the .16b
# arrangement suffix; the generated code indexes them as $C[0]..$C[6].
my @C = map("v$_.16b", (25..31));
# @D is a re-indexed view of five of the @C registers (no new registers):
#   D[0]=C[4], D[1]=C[5], D[2]=C[6], D[3]=C[2], D[4]=C[3].
# This matches the "// D[n]" annotations on the rax1 instructions that
# write those registers in the Theta step.
my @D = @C[4,5,6,2,3];

$code.=<<___;
.type	KeccakF1600_ce,%function
.align	5
KeccakF1600_ce:
	mov	x9,#12
	mov	x9,#24
	adr	x10,iotas
	b	.Loop_ce
.align	4
.Loop_ce:
___
for($i=0; $i<2; $i++) {
$code.=<<___;
	////////////////////////////////////////////////// Theta
	eor3	$C[0],$A[0][0],$A[1][0],$A[2][0]
	eor3	$C[1],$A[0][1],$A[1][1],$A[2][1]
	eor3	$C[2],$A[0][2],$A[1][2],$A[2][2]
	eor3	$C[3],$A[0][3],$A[1][3],$A[2][3]
	eor3	$C[4],$A[0][4],$A[1][4],$A[2][4]
	eor3	$C[0],$C[0],   $A[3][0],$A[4][0]
	eor3	$C[1],$C[1],   $A[3][1],$A[4][1]
	eor3	$C[2],$C[2],   $A[3][2],$A[4][2]
	eor3	$C[3],$C[3],   $A[3][3],$A[4][3]
	eor3	$C[4],$C[4],   $A[3][4],$A[4][4]
	eor3	$C[0],$A[4][0],$A[3][0],$A[2][0]
	eor3	$C[1],$A[4][1],$A[3][1],$A[2][1]
	eor3	$C[2],$A[4][2],$A[3][2],$A[2][2]
	eor3	$C[3],$A[4][3],$A[3][3],$A[2][3]
	eor3	$C[4],$A[4][4],$A[3][4],$A[2][4]
	eor3	$C[0],$C[0],   $A[1][0],$A[0][0]
	eor3	$C[1],$C[1],   $A[1][1],$A[0][1]
	eor3	$C[2],$C[2],   $A[1][2],$A[0][2]
	eor3	$C[3],$C[3],   $A[1][3],$A[0][3]
	eor3	$C[4],$C[4],   $A[1][4],$A[0][4]

	rax1	$C[5],$C[0],$C[2]			// D[1]
	rax1	$C[6],$C[1],$C[3]			// D[2]
@@ -565,81 +563,75 @@ $code.=<<___;
	rax1	$C[4],$C[4],$C[1]			// D[0]

	////////////////////////////////////////////////// Theta+Rho+Pi
	xar	$C[0],   $A[1][1],$C[5],#64-$rhotates[1][1]	// C[0]=A[0][1]
	xar	$A[1][1],$A[1][4],$C[3],#64-$rhotates[1][4]
	xar	$A[1][4],$A[4][2],$C[6],#64-$rhotates[4][2]
	xar	$A[4][2],$A[2][4],$C[3],#64-$rhotates[2][4]
	xar	$A[2][4],$A[4][0],$C[4],#64-$rhotates[4][0]
	xar	$C[0],   $A[0][1],$D[1],#64-$rhotates[0][1] // C[0]=A[2][0]

	xar	$A[4][0],$A[0][2],$C[6],#64-$rhotates[0][2]
	xar	$A[0][1],$A[1][1],$D[1],#64-$rhotates[1][1]
	xar	$A[1][1],$A[1][4],$D[4],#64-$rhotates[1][4]
	xar	$A[1][4],$A[4][2],$D[2],#64-$rhotates[4][2]
	xar	$A[4][2],$A[2][4],$D[4],#64-$rhotates[2][4]
	xar	$A[2][4],$A[4][0],$D[0],#64-$rhotates[4][0]

	xar	$A[0][2],$A[2][2],$C[6],#64-$rhotates[2][2]
	xar	$A[2][2],$A[2][3],$C[2],#64-$rhotates[2][3]
	xar	$A[2][3],$A[3][4],$C[3],#64-$rhotates[3][4]
	xar	$A[3][4],$A[4][3],$C[2],#64-$rhotates[4][3]
	xar	$A[4][3],$A[3][0],$C[4],#64-$rhotates[3][0]
	xar	$C[1],   $A[0][2],$D[2],#64-$rhotates[0][2] // C[1]=A[4][0]

	xar	$A[3][0],$A[0][4],$C[3],#64-$rhotates[0][4]
	xar	$A[0][2],$A[2][2],$D[2],#64-$rhotates[2][2]
	xar	$A[2][2],$A[2][3],$D[3],#64-$rhotates[2][3]
	xar	$A[2][3],$A[3][4],$D[4],#64-$rhotates[3][4]
	xar	$A[3][4],$A[4][3],$D[3],#64-$rhotates[4][3]
	xar	$A[4][3],$A[3][0],$D[0],#64-$rhotates[3][0]

	eor	$A[0][0],$A[0][0],$C[4]
	ldr	x11,[x10],#8
	xar	$A[3][0],$A[0][4],$D[4],#64-$rhotates[0][4]

	xar	$C[1],   $A[3][3],$C[2],#64-$rhotates[3][3]	// C[1]=A[0][3]
	xar	$A[3][3],$A[3][2],$C[6],#64-$rhotates[3][2]
	xar	$A[3][2],$A[2][1],$C[5],#64-$rhotates[2][1]
	xar	$A[2][1],$A[1][2],$C[6],#64-$rhotates[1][2]
	xar	$A[1][2],$A[2][0],$C[4],#64-$rhotates[2][0]
	xar	$D[4],   $A[4][4],$D[4],#64-$rhotates[4][4] // D[4]=A[0][4]
	xar	$A[4][4],$A[4][1],$D[1],#64-$rhotates[4][1]
	xar	$A[1][3],$A[1][3],$D[3],#64-$rhotates[1][3] // A[1][3]=A[4][1]
	xar	$A[0][4],$A[3][1],$D[1],#64-$rhotates[3][1] // A[0][4]=A[1][3]
	xar	$A[3][1],$A[1][0],$D[0],#64-$rhotates[1][0]

	xar	$A[2][0],$A[0][1],$C[5],#64-$rhotates[0][1]	// *
	xar	$A[1][0],$A[0][3],$D[3],#64-$rhotates[0][3]

	xar	$A[0][4],$A[4][4],$C[3],#64-$rhotates[4][4]
	xar	$A[4][4],$A[4][1],$C[5],#64-$rhotates[4][1]
	xar	$A[4][1],$A[1][3],$C[2],#64-$rhotates[1][3]
	xar	$A[1][3],$A[3][1],$C[5],#64-$rhotates[3][1]
	xar	$A[3][1],$A[1][0],$C[4],#64-$rhotates[1][0]
	eor	$A[0][0],$A[0][0],$D[0]

	xar	$C[2],   $A[0][3],$C[2],#64-$rhotates[0][3]	// C[2]=A[1][0]
	xar	$D[3],   $A[3][3],$D[3],#64-$rhotates[3][3] // D[3]=A[0][3]
	xar	$A[0][3],$A[3][2],$D[2],#64-$rhotates[3][2] // A[0][3]=A[3][3]
	xar	$D[1],   $A[2][1],$D[1],#64-$rhotates[2][1] // D[1]=A[3][2]
	xar	$D[2],   $A[1][2],$D[2],#64-$rhotates[1][2] // D[2]=A[2][1]
	xar	$D[0],   $A[2][0],$D[0],#64-$rhotates[2][0] // D[0]=A[1][2]

	////////////////////////////////////////////////// Chi+Iota
	dup	$C[6],x11				// borrow C[6]
	bcax	$C[3],   $A[0][0],$A[0][2],$C[0]	// *
	bcax	$A[0][1],$C[0],   $C[1],   $A[0][2]	// *
	bcax	$A[0][2],$A[0][2],$A[0][4],$C[1]
	bcax	$A[0][3],$C[1],   $A[0][0],$A[0][4]
	bcax	$A[0][4],$A[0][4],$C[0],   $A[0][0]

	bcax	$A[1][0],$C[2],   $A[1][2],$A[1][1]	// *
	bcax	$C[0],   $A[1][1],$A[1][3],$A[1][2]	// *
	bcax	$A[1][2],$A[1][2],$A[1][4],$A[1][3]
	bcax	$A[1][3],$A[1][3],$C[2],   $A[1][4]
	bcax	$A[1][4],$A[1][4],$A[1][1],$C[2]

	eor	$A[0][0],$C[3],$C[6]			// Iota

	bcax	$C[1],   $A[2][0],$A[2][2],$A[2][1]	// *
	bcax	$C[2],   $A[2][1],$A[2][3],$A[2][2]	// *
	bcax	$A[2][2],$A[2][2],$A[2][4],$A[2][3]
	bcax	$A[2][3],$A[2][3],$A[2][0],$A[2][4]
	bcax	$A[2][4],$A[2][4],$A[2][1],$A[2][0]
	bcax	$A[4][0],$C[1],   $A[4][2],$A[1][3]	// A[1][3]=A[4][1]
	bcax	$A[4][1],$A[1][3],$A[4][3],$A[4][2]	// A[1][3]=A[4][1]
	bcax	$A[4][2],$A[4][2],$A[4][4],$A[4][3]
	bcax	$A[4][3],$A[4][3],$C[1],   $A[4][4]
	bcax	$A[4][4],$A[4][4],$A[1][3],$C[1]	// A[1][3]=A[4][1]

	ld1r	{$C[1]},[x10],#8

	bcax	$C[3],   $A[3][0],$A[3][2],$A[3][1]	// *
	bcax	$C[4],   $A[3][1],$A[3][3],$A[3][2]	// *
	bcax	$A[3][2],$A[3][2],$A[3][4],$A[3][3]
	bcax	$A[3][3],$A[3][3],$A[3][0],$A[3][4]
	bcax	$A[3][2],$D[1],   $A[3][4],$A[0][3]	// A[0][3]=A[3][3]
	bcax	$A[3][3],$A[0][3],$A[3][0],$A[3][4]	// A[0][3]=A[3][3]
	bcax	$A[3][4],$A[3][4],$A[3][1],$A[3][0]
	bcax	$A[3][0],$A[3][0],$D[1],   $A[3][1]
	bcax	$A[3][1],$A[3][1],$A[0][3],$D[1]	// A[0][3]=A[3][3]

	bcax	$A[2][0],$C[0],   $A[2][2],$D[2]
	bcax	$A[2][1],$D[2],   $A[2][3],$A[2][2]
	bcax	$A[2][2],$A[2][2],$A[2][4],$A[2][3]
	bcax	$A[2][3],$A[2][3],$C[0],   $A[2][4]
	bcax	$A[2][4],$A[2][4],$D[2],   $C[0]

	bcax	$A[1][2],$D[0],   $A[1][4],$A[0][4]	// A[0][4]=A[1][3]
	bcax	$A[1][3],$A[0][4],$A[1][0],$A[1][4]	// A[0][4]=A[1][3]
	bcax	$A[1][4],$A[1][4],$A[1][1],$A[1][0]
	bcax	$A[1][0],$A[1][0],$D[0],   $A[1][1]
	bcax	$A[1][1],$A[1][1],$A[0][4],$D[0]	// A[0][4]=A[1][3]

	bcax	$A[0][3],$D[3],   $A[0][0],$D[4]
	bcax	$A[0][4],$D[4],   $A[0][1],$A[0][0]
	bcax	$A[0][0],$A[0][0],$A[0][2],$A[0][1]
	bcax	$A[0][1],$A[0][1],$D[3],   $A[0][2]
	bcax	$A[0][2],$A[0][2],$D[4],   $D[3]

	eor	$A[0][0],$A[0][0],$C[1]

	bcax	$C[5],   $A[4][0],$A[4][2],$A[4][1]	// *
	bcax	$C[6],   $A[4][1],$A[4][3],$A[4][2]	// *
	bcax	$A[4][2],$A[4][2],$A[4][4],$A[4][3]
	bcax	$A[4][3],$A[4][3],$A[4][0],$A[4][4]
	bcax	$A[4][4],$A[4][4],$A[4][1],$A[4][0]
___
	(         $A[1][1],       $C[0]) = (      $C[0],          $A[1][1]);
	($A[2][0],$A[2][1], $C[1],$C[2]) = ($C[1],$C[2], $A[2][0],$A[2][1]);
	($A[3][0],$A[3][1], $C[3],$C[4]) = ($C[3],$C[4], $A[3][0],$A[3][1]);
	($A[4][0],$A[4][1], $C[5],$C[6]) = ($C[5],$C[6], $A[4][0],$A[4][1]);
}
$code.=<<___;
	subs	x9,x9,#1
	bne	.Loop_ce

@@ -857,7 +849,7 @@ foreach(split("\n",$code)) {

	s/\`([^\`]*)\`/eval($1)/ge;

	m/\bdup\b/ and s/\.16b/.2d/g	or
	m/\bld1r\b/ and s/\.16b/.2d/g	or
	s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;

	print $_,"\n";