Commit d6e4287c authored by Andy Polyakov's avatar Andy Polyakov
Browse files

aes/asm/aesv8-armx.pl: ~20% improvement on ThunderX2.

parent 6465321e
Loading
Loading
Loading
Loading
+389 −6
Original line number Diff line number Diff line
@@ -27,19 +27,34 @@
# CBC encrypt case. On Cortex-A57 parallelizable mode performance
# seems to be limited by sheer amount of NEON instructions...
#
# April 2019
#
# Key to performance of parallelize-able modes is round instruction
# interleaving. But which factor to use? There is optimal one for
# each combination of instruction latency and issue rate, beyond
# which increasing interleave factor doesn't pay off. While on cons
# side we have code size increase and resource waste on platforms for
# which interleave factor is too high. In other words you want it to
# be just right. So far interleave factor of 3x was serving well all
# platforms. But for ThunderX2 optimal interleave factor was measured
# to be 5x...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.29		1.46
# Cortex-A57(*)	1.95		0.85		0.93
# Denver	1.96		0.86		0.80
# Mongoose	1.33		1.20		1.20
# Kryo		1.26		0.94		1.00
# ThunderX2	5.95		1.53		1.55
# Cortex-A53	1.32		1.17/1.29(**)	1.36/1.46
# Cortex-A57(*)	1.95		0.82/0.85	0.89/0.93
# Cortex-A72	1.33		0.85/0.88	0.92/0.96
# Denver	1.96		0.65/0.86	0.76/0.80
# Mongoose	1.33		1.23/1.20	1.30/1.20
# Kryo		1.26		0.87/0.94	1.00/1.00
# ThunderX2	5.95		1.25		1.30
#
# (*)	original 3.64/1.34/1.32 results were for r0p0 revision
#	and are still same even for updated module;
# (**)	numbers after slash are for 32-bit code, which is 3x-
#	interleaved;

$flavour = shift;
$output  = shift;
@@ -524,6 +539,13 @@ $code.=<<___;
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

$code.=<<___;
.align	5
.Lcbc_dec:
@@ -540,7 +562,196 @@ $code.=<<___;
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2
___
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_cbc_dec

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32		// bias
	mov	$cnt,$rounds
	vorr	$in3,$dat3,$dat3
	vorr	$in4,$dat4,$dat4

.Loop5x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	 cmp	$len,#0x40		// because .Lcbc_tail4x
	 sub	$len,$len,#0x50

	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	 csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
	 mov	$key_,$key

	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	 add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat4
					// are loaded with last "words"
	 add	x6,$len,#0x60		// because .Lcbc_tail4x

	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4

	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4

	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4

	 veor	$tmp0,$ivec,$rndlast
	aesd	$dat0,q15
	 veor	$tmp1,$in0,$rndlast
	 vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	 veor	$tmp2,$in1,$rndlast
	 vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	 veor	$tmp3,$in2,$rndlast
	 vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	 veor	$tmp4,$in3,$rndlast
	 vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15
	 vorr	$ivec,$in4,$in4
	 vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lcbc_tail4x
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	veor	$tmp0,$tmp0,$dat0
	 vorr	$dat0,$in0,$in0
	veor	$tmp1,$tmp1,$dat1
	 vorr	$dat1,$in1,$in1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$dat2,$in2,$in2
	veor	$tmp3,$tmp3,$dat3
	 vorr	$dat3,$in3,$in3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	 mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_cbc_dec

	add	$len,$len,#0x50
	cbz	$len,.Lcbc_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$in0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$in1,$in3,$in3
	vorr	$dat2,$in4,$in4
	vorr	$in2,$in4,$in4
	b.lo	.Lcbc_dec_tail

	b	.Loop3x_cbc_dec

.align	4
.Lcbc_tail4x:
	veor	$tmp1,$tmp0,$dat1
	veor	$tmp2,$tmp2,$dat2
	veor	$tmp3,$tmp3,$dat3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lcbc_done
.align	4
___
$code.=<<___;
.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
@@ -701,6 +912,9 @@ my $step="x12"; # aliases with $tctr2
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

# used only in 64-bit mode...
my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule
@@ -753,6 +967,175 @@ $code.=<<___;
	rev		$tctr2, $ctr
	sub		$len,$len,#3		// bias
	vmov.32		${dat2}[3],$tctr2
___
$code.=<<___	if ($flavour =~ /64/);
	cmp		$len,#2
	b.lo		.Loop3x_ctr32

	add		w13,$ctr,#1
	add		w14,$ctr,#2
	vorr		$dat3,$dat0,$dat0
	rev		w13,w13
	vorr		$dat4,$dat0,$dat0
	rev		w14,w14
	vmov.32		${dat3}[3],w13
	sub		$len,$len,#2		// bias
	vmov.32		${dat4}[3],w14
	add		$ctr,$ctr,#2
	b		.Loop5x_ctr32

.align	4
.Loop5x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	aese		$dat3,q8
	aesmc		$dat3,$dat3
	aese		$dat4,q8
	aesmc		$dat4,$dat4
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	aese		$dat3,q9
	aesmc		$dat3,$dat3
	aese		$dat4,q9
	aesmc		$dat4,$dat4
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop5x_ctr32

	mov		$key_,$key
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	aese		$dat3,q8
	aesmc		$dat3,$dat3
	aese		$dat4,q8
	aesmc		$dat4,$dat4
	vld1.32	 	{q8},[$key_],#16	// re-pre-load rndkey[0]

	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	aese		$dat3,q9
	aesmc		$dat3,$dat3
	aese		$dat4,q9
	aesmc		$dat4,$dat4
	vld1.32	 	{q9},[$key_],#16	// re-pre-load rndkey[1]

	aese		$dat0,q12
	aesmc		$dat0,$dat0
	 add		$tctr0,$ctr,#1
	 add		$tctr1,$ctr,#2
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 add		$tctr2,$ctr,#3
	 add		w13,$ctr,#4
	aese		$dat2,q12
	aesmc		$dat2,$dat2
	 add		w14,$ctr,#5
	 rev		$tctr0,$tctr0
	aese		$dat3,q12
	aesmc		$dat3,$dat3
	 rev		$tctr1,$tctr1
	 rev		$tctr2,$tctr2
	aese		$dat4,q12
	aesmc		$dat4,$dat4
	 rev		w13,w13
	 rev		w14,w14

	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	aese		$dat2,q13
	aesmc		$dat2,$dat2
	aese		$dat3,q13
	aesmc		$dat3,$dat3
	aese		$dat4,q13
	aesmc		$dat4,$dat4

	aese		$dat0,q14
	aesmc		$dat0,$dat0
	 vld1.8		{$in0},[$inp],#16
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp],#16
	aese		$dat2,q14
	aesmc		$dat2,$dat2
	 vld1.8		{$in2},[$inp],#16
	aese		$dat3,q14
	aesmc		$dat3,$dat3
	 vld1.8		{$in3},[$inp],#16
	aese		$dat4,q14
	aesmc		$dat4,$dat4
	 vld1.8		{$in4},[$inp],#16

	aese		$dat0,q15
	 veor		$in0,$in0,$rndlast
	aese		$dat1,q15
	 veor		$in1,$in1,$rndlast
	aese		$dat2,q15
	 veor		$in2,$in2,$rndlast
	aese		$dat3,q15
	 veor		$in3,$in3,$rndlast
	aese		$dat4,q15
	 veor		$in4,$in4,$rndlast

	veor		$in0,$in0,$dat0
	 vorr		$dat0,$ivec,$ivec
	veor		$in1,$in1,$dat1
	 vorr		$dat1,$ivec,$ivec
	veor		$in2,$in2,$dat2
	 vorr		$dat2,$ivec,$ivec
	veor		$in3,$in3,$dat3
	 vorr		$dat3,$ivec,$ivec
	veor		$in4,$in4,$dat4
	 vorr		$dat4,$ivec,$ivec

	vst1.8		{$in0},[$out],#16
	 vmov.32	${dat0}[3],$tctr0
	vst1.8		{$in1},[$out],#16
	 vmov.32	${dat1}[3],$tctr1
	vst1.8		{$in2},[$out],#16
	 vmov.32	${dat2}[3],$tctr2
	vst1.8		{$in3},[$out],#16
	 vmov.32	${dat3}[3],w13
	vst1.8		{$in4},[$out],#16
	 vmov.32	${dat4}[3],w14

	mov		$cnt,$rounds
	cbz		$len,.Lctr32_done

	add		$ctr,$ctr,#5
	subs		$len,$len,#5
	b.hs		.Loop5x_ctr32

	add		$len,$len,#5
	sub		$ctr,$ctr,#5

	cmp		$len,#2
	mov		$step,#16
	cclr		$step,lo
	b.ls		.Lctr32_tail

	sub		$len,$len,#3		// bias
	add		$ctr,$ctr,#3
___
$code.=<<___;
	b		.Loop3x_ctr32

.align	4