Commit 7ff2fa4b authored by Andy Polyakov's avatar Andy Polyakov
Browse files

modes/asm/ghashv8-armx.pl: implement 4x aggregate factor.



This initial commit is unoptimized reference version that handles
input lengths divisible by 4 blocks.

Reviewed-by: default avatarRich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/4830)
parent a00cceb2
Loading
Loading
Loading
Loading
+171 −12
Original line number Diff line number Diff line
@@ -17,17 +17,24 @@
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
# Initial version was developed in tight cooperation with Ard Biesheuvel
# of Linaro from bits-n-pieces from other assembly modules. Just like
# aesv8-armx.pl this module supports both AArch32 and AArch64 execution modes.
#
# Initial version was developed in tight cooperation with Ard
# Biesheuvel of Linaro from bits-n-pieces from other assembly modules.
# Just like aesv8-armx.pl this module supports both AArch32 and
# AArch64 execution modes.
#
# July 2014
#
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
# November 2017
#
# AArch64 register bank to "accommodate" 4x aggregated reduction...
#
# Current performance in cycles per processed byte:
#
#		PMULL[2]	32-bit NEON(*)
#		64-bit PMULL	32-bit PMULL	32-bit NEON(*)
# Apple A7			0.92		5.62
# Cortex-A53			1.01		8.39
# Cortex-A57			1.17		7.61
@@ -128,8 +135,56 @@ gcm_init_v8:
	vext.8		$t1,$H2,$H2,#8		@ Karatsuba pre-processing
	veor		$t1,$t1,$H2
	vext.8		$Hhl,$t0,$t1,#8		@ pack Karatsuba pre-processed
	vst1.64		{$Hhl-$H2},[x0]		@ store Htable[1..2]
	vst1.64		{$Hhl-$H2},[x0],#32	@ store Htable[1..2]
___
if ($flavour =~ /64/) {
my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));

$code.=<<___;
	@ calculate H^3 and H^4
	vpmull.p64	$Xl,$H, $H2
	 vpmull.p64	$Yl,$H2,$H2
	vpmull2.p64	$Xh,$H, $H2
	 vpmull2.p64	$Yh,$H2,$H2
	vpmull.p64	$Xm,$t0,$t1
	 vpmull.p64	$Ym,$t1,$t1

	vext.8		$t0,$Xl,$Xh,#8		@ Karatsuba post-processing
	 vext.8		$t1,$Yl,$Yh,#8
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t0
	 veor		$t3,$Yl,$Yh
	 veor		$Ym,$Ym,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase
	 veor		$Ym,$Ym,$t3
	 vpmull.p64	$t3,$Yl,$xC2

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	 vmov		$Yh#lo,$Ym#hi
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	 vmov		$Ym#hi,$Yl#lo
	veor		$Xl,$Xm,$t2
	 veor		$Yl,$Ym,$t3

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
	 vext.8		$t3,$Yl,$Yl,#8
	vpmull.p64	$Xl,$Xl,$xC2
	 vpmull.p64	$Yl,$Yl,$xC2
	veor		$t2,$t2,$Xh
	 veor		$t3,$t3,$Yh
	veor		$H, $Xl,$t2		@ H^3
	 veor		$H2,$Yl,$t3		@ H^4

	vext.8		$t0,$H, $H,#8		@ Karatsuba pre-processing
	 vext.8		$t1,$H2,$H2,#8
	veor		$t0,$t0,$H
	 veor		$t1,$t1,$H2
	vext.8		$Hhl,$t0,$t1,#8		@ pack Karatsuba pre-processed
	vst1.64		{$H-$H2},[x0]		@ store Htable[3..5]
___
}
$code.=<<___;
	ret
.size	gcm_init_v8,.-gcm_init_v8
___
@@ -198,6 +253,11 @@ $code.=<<___;
.align	4
gcm_ghash_v8:
___
$code.=<<___	if ($flavour =~ /64/);
	bic		$inc,$len,#63
	cmp		$len,$inc
	b.eq		.Lgcm_ghash_v8_4x
___
$code.=<<___		if ($flavour !~ /64/);
	vstmdb		sp!,{d8-d15}		@ 32-bit ABI says so
___
@@ -345,7 +405,105 @@ $code.=<<___;
	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8
___

if ($flavour =~ /64/) {				# 4x subroutine
my ($I0,$j1,$j2,$j3,
    $I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23));

$code.=<<___;
.type	gcm_ghash_v8_4x,%function
.align	4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
	vld1.64		{$Xl},[$Xi]		@ load [rotated] Xi
	vld1.64		{$H-$H2},[$Htbl],#48	@ load twisted H, ..., H^2
	vmov.i8		$xC2,#0xe1
	vld1.64		{$H3-$H4},[$Htbl]	@ load twisted H^3, ..., H^4
	vshl.u64	$xC2,$xC2,#57		@ compose 0xc2.0 constant
#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
#endif
	b		.Loop4x

.align	4
.Loop4x:
	vld1.64		{$I0-$j3},[$inp],#64
#ifndef __ARMEB__
	vrev64.8	$j1,$j1
	vrev64.8	$j2,$j2
	vrev64.8	$j3,$j3
	vrev64.8	$I0,$I0
#endif
	vext.8		$I3,$j3,$j3,#8
	vext.8		$I2,$j2,$j2,#8
	vext.8		$I1,$j1,$j1,#8

	vpmull.p64	$Yl,$H,$I3		@ H·Ii+3
	veor		$j3,$j3,$I3
	vpmull2.p64	$Yh,$H,$I3
	vpmull.p64	$Ym,$Hhl,$j3

	vpmull.p64	$t0,$H2,$I2		@ H^2·Ii+2
	veor		$j2,$j2,$I2
	vpmull2.p64	$I2,$H2,$I2
	vpmull2.p64	$j2,$Hhl,$j2

	veor		$Yl,$Yl,$t0
	veor		$Yh,$Yh,$I2
	veor		$Ym,$Ym,$j2

	vpmull.p64	$j3,$H3,$I1		@ H^3·Ii+1
	veor		$j1,$j1,$I1
	vpmull2.p64	$I1,$H3,$I1
	vpmull.p64	$j1,$H34,$j1

	veor		$Yl,$Yl,$j3
	veor		$Yh,$Yh,$I1
	veor		$Ym,$Ym,$j1

	veor		$t0,$I0,$Xl
	vext.8		$IN,$t0,$t0,#8

	vpmull.p64	$Xl,$H4,$IN		@ H^4·(Xi+Ii)
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H4,$IN
	vpmull2.p64	$Xm,$H34,$t0

	veor		$Xl,$Xl,$Yl
	veor		$Xh,$Xh,$Yh
	veor		$Xm,$Xm,$Ym

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2
	vext.8		$Xl,$Xl,$Xl,#8

	subs		$len,$len,#64
	b.ne		.Loop4x

#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
#endif
	vst1.64		{$Xl},[$Xi]		@ write out Xi

	ret
.size	gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
___

}
}

$code.=<<___;
.asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
@@ -356,7 +514,8 @@ if ($flavour =~ /64/) { ######## 64-bit code
	my $arg=shift;

	$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
	sprintf	"ins	v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
	sprintf	"ins	v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
					     $3<8?$3:$3+8,($4 eq "lo")?0:1;
    }
    foreach(split("\n",$code)) {
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or