crypto/poly1305: don't break carry chains. (4b8736a2) · Commits · CYBER - Cyber Security / TS 103 523 MSP / ETS / ETS OpenSSL

crypto/poly1305/asm/poly1305-armv4.pl

+8 −26

Original line number	Diff line number	Diff line
		@@ -10,10 +10,10 @@
		# IALU(*)/gcc-4.4 NEON
		#
		# ARM11xx(ARMv6) 7.78/+100% -
		-# Cortex-A5 6.30/+130% 2.96
		+# Cortex-A5 6.35/+130% 2.96
		# Cortex-A8 6.25/+115% 2.36
		# Cortex-A9 5.10/+95% 2.55
		-# Cortex-A15 3.79/+85% 1.25(**)
		+# Cortex-A15 3.85/+85% 1.25(**)
		# Snapdragon S4 5.70/+100% 1.48(**)
		#
		# (*) this is for -march=armv6, i.e. with bunch of ldrb loading data;
		@@ -313,7 +313,8 @@ poly1305_blocks:
		adds $h0,$h0,r1
		adcs $h1,$h1,#0
		adcs $h2,$h2,#0
		-adc $h3,$h3,#0
		+adcs $h3,$h3,#0
		+adc $h4,$h4,#0

		cmp r0,lr @ done yet?
		bhi .Loop
		@@ -735,9 +736,7 @@ poly1305_blocks_neon:
		.align 4
		.Leven:
		subs $len,$len,#64
		-# ifdef __thumb2__
		it lo
		-# endif
		movlo $in2,$zeros

		vmov.i32 $H4,#1<<24 @ padbit, yes, always
		@@ -745,9 +744,7 @@ poly1305_blocks_neon:
		add $inp,$inp,#64
		vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
		add $in2,$in2,#64
		-# ifdef __thumb2__
		itt hi
		-# endif
		addhi $tbl1,$ctx,#(48+194)
		addhi $tbl0,$ctx,#(48+394)

		@@ -817,9 +814,7 @@ poly1305_blocks_neon:
		vmull.u32 $D4,$H4#hi,${R0}[1]
		subs $len,$len,#64
		vmlal.u32 $D0,$H4#hi,${S1}[1]
		-# ifdef __thumb2__
		it lo
		-# endif
		movlo $in2,$zeros
		vmlal.u32 $D3,$H2#hi,${R1}[1]
		vld1.32 ${S4}[1],[$tbl1,:32]
		@@ -946,9 +941,7 @@ poly1305_blocks_neon:
		add $tbl1,$ctx,#(48+094)
		add $tbl0,$ctx,#(48+194)
		adds $len,$len,#32
		-# ifdef __thumb2__
		it ne
		-# endif
		movne $len,#0
		bne .Long_tail

		@@ -990,14 +983,10 @@ poly1305_blocks_neon:
		vmlal.u32 $D2,$H0#hi,$R2

		vmlal.u32 $D3,$H0#hi,$R3
		-# ifdef __thumb2__
		it ne
		-# endif
		addne $tbl1,$ctx,#(48+294)
		vmlal.u32 $D0,$H2#hi,$S3
		-# ifdef __thumb2__
		it ne
		-# endif
		addne $tbl0,$ctx,#(48+394)
		vmlal.u32 $D4,$H1#hi,$R3
		vmlal.u32 $D1,$H3#hi,$S3
		@@ -1138,7 +1127,8 @@ poly1305_emit_neon:
		adds $h0,$h0,$g0
		adcs $h1,$h1,#0
		adcs $h2,$h2,#0
		-adc $h3,$h3,#0
		+adcs $h3,$h3,#0
		+adc $h4,$h4,#0

		adds $g0,$h0,#5 @ compare to modulus
		adcs $g1,$h1,#0
		@@ -1147,24 +1137,16 @@ poly1305_emit_neon:
		adc $g4,$h4,#0
		tst $g4,#4 @ did it carry/borrow?

		-# ifdef __thumb2__
		it ne
		-# endif
		movne $h0,$g0
		ldr $g0,[$nonce,#0]
		-# ifdef __thumb2__
		it ne
		-# endif
		movne $h1,$g1
		ldr $g1,[$nonce,#4]
		-# ifdef __thumb2__
		it ne
		-# endif
		movne $h2,$g2
		ldr $g2,[$nonce,#8]
		-# ifdef __thumb2__
		it ne
		-# endif
		movne $h3,$g3
		ldr $g3,[$nonce,#12]

crypto/poly1305/asm/poly1305-armv8.pl

+11 −7

Original line number	Diff line number	Diff line
		@@ -16,10 +16,10 @@
		# IALU/gcc-4.9 NEON
		#
		# Apple A7 1.86/+5% 0.72
		-# Cortex-A53 2.63/+58% 1.47
		+# Cortex-A53 2.69/+58% 1.47
		# Cortex-A57 2.70/+7% 1.14
		-# Denver 1.39/+50% 1.18(*)
		-# X-Gene 2.00/+68% 2.19
		+# Denver 1.64/+50% 1.18(*)
		+# X-Gene 2.13/+68% 2.19
		#
		# (*) estimate based on resources availability is less than 1.0,
		# i.e. measured result is worse than expected, presumably binary
		@@ -151,7 +151,8 @@ poly1305_blocks:
		and $h2,$d2,#3
		add $t0,$t0,$d2,lsr#2
		adds $h0,$d0,$t0
		-adc $h1,$d1,xzr
		+adcs $h1,$d1,xzr
		+adc $h2,$h2,xzr

		cbnz $len,.Loop

		@@ -235,7 +236,8 @@ poly1305_mult:
		and $h2,$d2,#3
		add $t0,$t0,$d2,lsr#2
		adds $h0,$d0,$t0
		-adc $h1,$d1,xzr
		+adcs $h1,$d1,xzr
		+adc $h2,$h2,xzr

		ret
		.size poly1305_mult,.-poly1305_mult
		@@ -310,7 +312,8 @@ poly1305_blocks_neon:
		and $h2,$d2,#3
		add $t0,$t0,$d2,lsr#2
		adds $h0,$h0,$t0
		-adc $h1,$h1,xzr
		+adcs $h1,$h1,xzr
		+adc $h2,$h2,xzr

		#ifdef __ARMEB__
		rev $d0,$d0
		@@ -870,7 +873,8 @@ poly1305_emit_neon:
		add $d0,$d0,$h2,lsr#2
		and $h2,$h2,#3
		adds $h0,$h0,$d0
		-adc $h1,$h1,xzr
		+adcs $h1,$h1,xzr
		+adc $h2,$h2,xzr

		adds $d0,$h0,#5 // compare to modulus
		adcs $d1,$h1,xzr

crypto/poly1305/asm/poly1305-c64xplus.pl

+8 −7

Original line number	Diff line number	Diff line
		@@ -11,7 +11,7 @@
		#
		# October 2015
		#
		-# Performance is [incredible for a 32-bit processor] 1.76 cycles per
		+# Performance is [incredible for a 32-bit processor] 1.82 cycles per
		# processed byte. Comparison to compiler-generated code is problematic,
		# because results were observed to vary from 2.1 to 7.6 cpb depending
		# on compiler's ability to inline small functions. Compiler also
		@@ -128,7 +128,7 @@ _poly1305_blocks:
		|| SWAP2 $D1,$D1

		ADDU $D0,B24,$D0:$H0 ; h0+=inp[0]
		-|| ADD $D0,B24,B31 ; B-copy of h0+inp[0]
		+|| ADD $D0,B24,B27 ; B-copy of h0+inp[0]
		|| SWAP4 $D1,$D1
		ADDU $D1,B25,$D1:$H1 ; h1+=inp[1]
		|| MVK 3,$THREE
		@@ -140,12 +140,12 @@ _poly1305_blocks:

		loop?:
		MPY32U $H0,$R0,A17:A16
		-|| MPY32U B31,$R1,B17:B16 ; MPY32U $H0,$R1,B17:B16
		+|| MPY32U B27,$R1,B17:B16 ; MPY32U $H0,$R1,B17:B16
		|| ADDU $D0,$D1:$H1,B25:B24 ; ADDU $D0,$D1:$H1,$D1:$H1
		|| ADDU $D2,B28,$D2:$H2 ; h2+=inp[2]
		|| SWAP2 $D3,$D3
		MPY32U $H0,$R2,A19:A18
		-|| MPY32U B31,$R3,B19:B18 ; MPY32U $H0,$R3,B19:B18
		+|| MPY32U B27,$R3,B19:B18 ; MPY32U $H0,$R3,B19:B18
		|| ADD $D0,$H1,A24 ; A-copy of B24
		|| SWAP4 $D3,$D3
		|| [A2] SUB A2,1,A2 ; decrement loop counter
		@@ -227,8 +227,8 @@ loop?:

		SHRU $H4,2,B16 ; last reduction step
		|| AND $H4,$THREE,$H4
		-|| [A2] BNOP loop?
		ADDAW B16,B16,B16 ; 5*(h4>>2)
		+|| [A2] BNOP loop?

		ADDU B24,B16,B25:B24 ; B24 is h0
		|| [A2] SWAP2 $D2,$D2
		@@ -236,8 +236,9 @@ loop?:
		|| [A2] SWAP4 $D2,$D2
		ADDU B28,B27,B29:B28 ; B28 is h2
		|| [A2] ADDU $D0,B24,$D0:$H0 ; h0+=inp[0]
		-|| [A2] ADD $D0,B24,B31 ; B-copy of h0+inp[0]
		-ADD B30,B29,B30 ; B30 is h3
		+|| [A2] ADD $D0,B24,B27 ; B-copy of h0+inp[0]
		+ADDU B30,B29,B31:B30 ; B30 is h3
		+ADD B31,$H4,$H4
		|| [A2] ADDU $D1,B26,$D1:$H1 ; h1+=inp[1]
		;;===== branch to loop? is taken here

crypto/poly1305/asm/poly1305-ppc.pl

+6 −5

Original line number	Diff line number	Diff line
		@@ -17,11 +17,10 @@
		# -m32 -m64
		#
		# Freescale e300 14.8/+80% -
		-# PPC74x0 7.40/+60% -
		-# PPC970 7.20/+114% 3.51/+205%
		-# POWER6 3.96/+250% 2.02/+170%
		-# POWER7 3.67/+260% 1.87/+100%
		-# POWER8 - 2.13/+200%
		+# PPC74x0 7.60/+60% -
		+# PPC970 7.00/+114% 3.51/+205%
		+# POWER7 3.75/+260% 1.93/+100%
		+# POWER8 - 2.03/+200%
		#
		# Do we need floating-point implementation for PPC? Results presented
		# in poly1305_ieee754.c are tricky to compare to, because they are for
		@@ -212,6 +211,7 @@ $code.=<<___;
		add $t0,$t0,$t1
		addc $h0,$d0,$t0
		addze $h1,$d1
		+addze $h2,$h2

		bdnz Loop

		@@ -518,6 +518,7 @@ $code.=<<___;
		addze $h1,$h1
		addze $h2,$h2
		addze $h3,$h3
		+addze $h4,$h4

		bdnz Loop

crypto/poly1305/asm/poly1305-ppcfp.pl

+2 −2

Original line number	Diff line number	Diff line
		@@ -15,8 +15,8 @@
		# and improvement coefficients relative to gcc-generated code.
		#
		# Freescale e300 9.78/+30%
		-# PPC74x0 7.08/+50%
		-# PPC970 6.24/+80%
		+# PPC74x0 6.92/+50%
		+# PPC970 6.03/+80%
		# POWER7 3.50/+30%
		# POWER8 3.75/+10%