Commit 4b8736a2 authored by Andy Polyakov
Browse files

crypto/poly1305: don't break carry chains.



RT#4483

[poly1305-armv4.pl: remove redundant #ifdef __thumb2__]
[poly1305-ppc*.pl: presumably more accurate benchmark results]

Reviewed-by: Richard Levitte <levitte@openssl.org>
parent 1400f013
Loading
Loading
Loading
Loading
+8 −26
Original line number Diff line number Diff line
@@ -10,10 +10,10 @@
#			IALU(*)/gcc-4.4		NEON
#
# ARM11xx(ARMv6)	7.78/+100%		-
# Cortex-A5		6.30/+130%		2.96
# Cortex-A5		6.35/+130%		2.96
# Cortex-A8		6.25/+115%		2.36
# Cortex-A9		5.10/+95%		2.55
# Cortex-A15		3.79/+85%		1.25(**)
# Cortex-A15		3.85/+85%		1.25(**)
# Snapdragon S4		5.70/+100%		1.48(**)
#
# (*)	this is for -march=armv6, i.e. with bunch of ldrb loading data;
@@ -313,7 +313,8 @@ poly1305_blocks:
	adds	$h0,$h0,r1
	adcs	$h1,$h1,#0
	adcs	$h2,$h2,#0
	adc	$h3,$h3,#0
	adcs	$h3,$h3,#0
	adc	$h4,$h4,#0

	cmp	r0,lr			@ done yet?
	bhi	.Loop
@@ -735,9 +736,7 @@ poly1305_blocks_neon:
.align	4
.Leven:
	subs		$len,$len,#64
# ifdef	__thumb2__
	it		lo
# endif
	movlo		$in2,$zeros

	vmov.i32	$H4,#1<<24		@ padbit, yes, always
@@ -745,9 +744,7 @@ poly1305_blocks_neon:
	add		$inp,$inp,#64
	vld4.32		{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
	add		$in2,$in2,#64
# ifdef	__thumb2__
	itt		hi
# endif
	addhi		$tbl1,$ctx,#(48+1*9*4)
	addhi		$tbl0,$ctx,#(48+3*9*4)

@@ -817,9 +814,7 @@ poly1305_blocks_neon:
	vmull.u32	$D4,$H4#hi,${R0}[1]
	subs		$len,$len,#64
	vmlal.u32	$D0,$H4#hi,${S1}[1]
# ifdef	__thumb2__
	it		lo
# endif
	movlo		$in2,$zeros
	vmlal.u32	$D3,$H2#hi,${R1}[1]
	vld1.32		${S4}[1],[$tbl1,:32]
@@ -946,9 +941,7 @@ poly1305_blocks_neon:
	add		$tbl1,$ctx,#(48+0*9*4)
	add		$tbl0,$ctx,#(48+1*9*4)
	adds		$len,$len,#32
# ifdef	__thumb2__
	it		ne
# endif
	movne		$len,#0
	bne		.Long_tail

@@ -990,14 +983,10 @@ poly1305_blocks_neon:
	vmlal.u32	$D2,$H0#hi,$R2

	vmlal.u32	$D3,$H0#hi,$R3
# ifdef	__thumb2__
	 it		ne
# endif
	 addne		$tbl1,$ctx,#(48+2*9*4)
	vmlal.u32	$D0,$H2#hi,$S3
# ifdef	__thumb2__
	 it		ne
# endif
	 addne		$tbl0,$ctx,#(48+3*9*4)
	vmlal.u32	$D4,$H1#hi,$R3
	vmlal.u32	$D1,$H3#hi,$S3
@@ -1138,7 +1127,8 @@ poly1305_emit_neon:
	adds	$h0,$h0,$g0
	adcs	$h1,$h1,#0
	adcs	$h2,$h2,#0
	adc	$h3,$h3,#0
	adcs	$h3,$h3,#0
	adc	$h4,$h4,#0

	adds	$g0,$h0,#5		@ compare to modulus
	adcs	$g1,$h1,#0
@@ -1147,24 +1137,16 @@ poly1305_emit_neon:
	adc	$g4,$h4,#0
	tst	$g4,#4			@ did it carry/borrow?

# ifdef	__thumb2__
	it	ne
# endif
	movne	$h0,$g0
	ldr	$g0,[$nonce,#0]
# ifdef	__thumb2__
	it	ne
# endif
	movne	$h1,$g1
	ldr	$g1,[$nonce,#4]
# ifdef	__thumb2__
	it	ne
# endif
	movne	$h2,$g2
	ldr	$g2,[$nonce,#8]
# ifdef	__thumb2__
	it	ne
# endif
	movne	$h3,$g3
	ldr	$g3,[$nonce,#12]

+11 −7
Original line number Diff line number Diff line
@@ -16,10 +16,10 @@
#		IALU/gcc-4.9	NEON
#
# Apple A7	1.86/+5%	0.72
# Cortex-A53	2.63/+58%	1.47
# Cortex-A53	2.69/+58%	1.47
# Cortex-A57	2.70/+7%	1.14
# Denver	1.39/+50%	1.18(*)
# X-Gene	2.00/+68%	2.19
# Denver	1.64/+50%	1.18(*)
# X-Gene	2.13/+68%	2.19
#
# (*)	estimate based on resources availability is less than 1.0,
#	i.e. measured result is worse than expected, presumably binary
@@ -151,7 +151,8 @@ poly1305_blocks:
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adc	$h1,$d1,xzr
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	cbnz	$len,.Loop

@@ -235,7 +236,8 @@ poly1305_mult:
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adc	$h1,$d1,xzr
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	ret
.size	poly1305_mult,.-poly1305_mult
@@ -310,7 +312,8 @@ poly1305_blocks_neon:
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$h0,$t0
	adc	$h1,$h1,xzr
	adcs	$h1,$h1,xzr
	adc	$h2,$h2,xzr

#ifdef	__ARMEB__
	rev	$d0,$d0
@@ -870,7 +873,8 @@ poly1305_emit_neon:
	add	$d0,$d0,$h2,lsr#2
	and	$h2,$h2,#3
	adds	$h0,$h0,$d0
	adc	$h1,$h1,xzr
	adcs	$h1,$h1,xzr
	adc	$h2,$h2,xzr

	adds	$d0,$h0,#5		// compare to modulus
	adcs	$d1,$h1,xzr
+8 −7
Original line number Diff line number Diff line
@@ -11,7 +11,7 @@
#
# October 2015
#
# Performance is [incredible for a 32-bit processor] 1.76 cycles per
# Performance is [incredible for a 32-bit processor] 1.82 cycles per
# processed byte. Comparison to compiler-generated code is problematic,
# because results were observed to vary from 2.1 to 7.6 cpb depending
# on compiler's ability to inline small functions. Compiler also
@@ -128,7 +128,7 @@ _poly1305_blocks:
||	SWAP2	$D1,$D1

	ADDU	$D0,B24,$D0:$H0		; h0+=inp[0]
||	ADD	$D0,B24,B31		; B-copy of h0+inp[0]
||	ADD	$D0,B24,B27		; B-copy of h0+inp[0]
||	SWAP4	$D1,$D1
	ADDU	$D1,B25,$D1:$H1		; h1+=inp[1]
||	MVK	3,$THREE
@@ -140,12 +140,12 @@ _poly1305_blocks:

loop?:
	MPY32U	$H0,$R0,A17:A16
||	MPY32U	B31,$R1,B17:B16		; MPY32U	$H0,$R1,B17:B16
||	MPY32U	B27,$R1,B17:B16		; MPY32U	$H0,$R1,B17:B16
||	ADDU	$D0,$D1:$H1,B25:B24	; ADDU		$D0,$D1:$H1,$D1:$H1
||	ADDU	$D2,B28,$D2:$H2		; h2+=inp[2]
||	SWAP2	$D3,$D3
	MPY32U	$H0,$R2,A19:A18
||	MPY32U	B31,$R3,B19:B18		; MPY32U	$H0,$R3,B19:B18
||	MPY32U	B27,$R3,B19:B18		; MPY32U	$H0,$R3,B19:B18
||	ADD	$D0,$H1,A24		; A-copy of B24
||	SWAP4	$D3,$D3
|| [A2]	SUB	A2,1,A2			; decrement loop counter
@@ -227,8 +227,8 @@ loop?:

	SHRU	$H4,2,B16		; last reduction step
||	AND	$H4,$THREE,$H4
|| [A2]	BNOP	loop?
	ADDAW	B16,B16,B16		; 5*(h4>>2)
|| [A2]	BNOP	loop?

	ADDU	B24,B16,B25:B24		; B24 is h0
|| [A2]	SWAP2	$D2,$D2
@@ -236,8 +236,9 @@ loop?:
|| [A2]	SWAP4	$D2,$D2
	ADDU	B28,B27,B29:B28		; B28 is h2
|| [A2]	ADDU	$D0,B24,$D0:$H0		; h0+=inp[0]
|| [A2]	ADD	$D0,B24,B31		; B-copy of h0+inp[0]
	ADD	B30,B29,B30		; B30 is h3
|| [A2]	ADD	$D0,B24,B27		; B-copy of h0+inp[0]
	ADDU	B30,B29,B31:B30		; B30 is h3
	ADD	B31,$H4,$H4
|| [A2]	ADDU	$D1,B26,$D1:$H1		; h1+=inp[1]
;;===== branch to loop? is taken here

+6 −5
Original line number Diff line number Diff line
@@ -17,11 +17,10 @@
#			-m32		-m64
#
# Freescale e300	14.8/+80%	-
# PPC74x0		7.40/+60%	-
# PPC970		7.20/+114%	3.51/+205%
# POWER6		3.96/+250%	2.02/+170%
# POWER7		3.67/+260%	1.87/+100%
# POWER8		-		2.13/+200%
# PPC74x0		7.60/+60%	-
# PPC970		7.00/+114%	3.51/+205%
# POWER7		3.75/+260%	1.93/+100%
# POWER8		-		2.03/+200%
#
# Do we need floating-point implementation for PPC? Results presented
# in poly1305_ieee754.c are tricky to compare to, because they are for
@@ -212,6 +211,7 @@ $code.=<<___;
	add	$t0,$t0,$t1
	addc	$h0,$d0,$t0
	addze	$h1,$d1
	addze	$h2,$h2

	bdnz	Loop

@@ -518,6 +518,7 @@ $code.=<<___;
	addze	$h1,$h1
	addze	$h2,$h2
	addze	$h3,$h3
	addze	$h4,$h4

	bdnz	Loop

+2 −2
Original line number Diff line number Diff line
@@ -15,8 +15,8 @@
# and improvement coefficients relative to gcc-generated code.
#
# Freescale e300	9.78/+30%
# PPC74x0		7.08/+50%
# PPC970		6.24/+80%
# PPC74x0		6.92/+50%
# PPC970		6.03/+80%
# POWER7		3.50/+30%
# POWER8		3.75/+10%

Loading