Commit 9d0e4dc6 authored by Andy Polyakov
Browse files

bn/asm/s390x.S: improve performance on z196 and z13 by up to 26%. [even z10 is...


bn/asm/s390x.S: improve performance on z196 and z13 by up to 26% [even z10 is a couple of percent faster]. Triggered by RT#4128, but solves the problem with real modulo-scheduling.

Reviewed-by: Rich Salz <rsalz@openssl.org>
parent a5fd24d1
Loading
Loading
Loading
Loading
+72 −37
Original line number Diff line number Diff line
@@ -18,71 +18,106 @@
// NOTE(review): this listing looks like a rendered diff whose removed and
// added lines are interleaved without +/- markers (several instructions
// appear twice with different registers, and .Loop4_madd is defined twice),
// so it is not assemblable as shown -- confirm against the real file.
//
// bn_mul_add_words: going by the inline comments, computes
// rp[i] += ap[i]*w with carry propagation over len words and returns the
// final carry in %r2 (0 when len<=0).
// Register roles as stated by the comments below:
//   %r2 = rp on entry (copied to %r1), then reused for the return value
//         and for len%4
//   %r3 = ap, %r4 = len (then cnt=len/4), %r5 = w
//   `zero` is a symbol defined outside this excerpt -- presumably a
//   register alias; it is loaded with 0 on entry.
.align	4
bn_mul_add_words:
	lghi	zero,0		// zero = 0
	la	%r1,0(%r2)	// put rp aside
	lghi	%r2,0		// i=0;
	la	%r1,0(%r2)	// put rp aside [to give way to]
	lghi	%r2,0		// return value
	ltgfr	%r4,%r4		// test len; sets condition code for the branch below
	bler	%r14		// if (len<=0) return 0;

	stmg	%r6,%r10,48(%r15)	// save %r6-%r10 at 48(%r15)
	lghi	%r10,3
	lghi	%r8,0		// carry = 0
	nr	%r10,%r4	// len%4
	stmg	%r6,%r13,48(%r15)	// save %r6-%r13 at 48(%r15); reloaded by lmg before return
	lghi	%r2,3		// mask used for len%4 below
	lghi	%r12,0		// carry = 0
	slgr	%r1,%r3		// rp-=ap
	nr	%r2,%r4		// len%4
	sra	%r4,2		// cnt=len/4
	jz	.Loop1_madd	// carry is incidentally cleared if branch taken
	algr	zero,zero	// clear carry

.Loop4_madd:
	lg	%r7,0(%r2,%r3)	// ap[i]
	lg	%r7,0(%r3)	// ap[0]
	lg	%r9,8(%r3)	// ap[1]
	mlgr	%r6,%r5		// *=w
	alcgr	%r7,%r8		// +=carry
	alcgr	%r6,zero	// fold carry flag into the high word
	alg	%r7,0(%r2,%r1)	// +=rp[i]
	stg	%r7,0(%r2,%r1)	// rp[i]=
	brct	%r4,.Loop4_madd	// --cnt; branch while cnt is non-zero
	j	.Loop4_madd_tail	// cnt reached zero: go to the tail code

	lg	%r9,8(%r2,%r3)
.Loop4_madd:
	mlgr	%r8,%r5
	lg	%r11,16(%r3)	// ap[i+2]
	alcgr	%r7,%r12	// +=carry
	alcgr	%r6,zero
	alg	%r7,0(%r3,%r1)	// +=rp[i]
	stg	%r7,0(%r3,%r1)	// rp[i]=

	mlgr	%r10,%r5
	lg	%r13,24(%r3)
	alcgr	%r9,%r6
	alcgr	%r8,zero
	alg	%r9,8(%r2,%r1)
	stg	%r9,8(%r2,%r1)
	alg	%r9,8(%r3,%r1)
	stg	%r9,8(%r3,%r1)

	mlgr	%r12,%r5
	lg	%r7,32(%r3)
	alcgr	%r11,%r8
	alcgr	%r10,zero
	alg	%r11,16(%r3,%r1)
	stg	%r11,16(%r3,%r1)

	lg	%r7,16(%r2,%r3)
	mlgr	%r6,%r5
	alcgr	%r7,%r8
	alcgr	%r6,zero
	alg	%r7,16(%r2,%r1)
	stg	%r7,16(%r2,%r1)
	lg	%r9,40(%r3)
	alcgr	%r13,%r10
	alcgr	%r12,zero
	alg	%r13,24(%r3,%r1)
	stg	%r13,24(%r3,%r1)

	lg	%r9,24(%r2,%r3)
	la	%r3,32(%r3)	// i+=4
	brct	%r4,.Loop4_madd

// Tail: same add/store steps as the loop body above, minus the two loads
// at offsets 32 and 40.
.Loop4_madd_tail:
	mlgr	%r8,%r5
	lg	%r11,16(%r3)
	alcgr	%r7,%r12	// +=carry
	alcgr	%r6,zero
	alg	%r7,0(%r3,%r1)	// +=rp[i]
	stg	%r7,0(%r3,%r1)	// rp[i]=

	mlgr	%r10,%r5
	lg	%r13,24(%r3)
	alcgr	%r9,%r6
	alcgr	%r8,zero
	alg	%r9,24(%r2,%r1)
	stg	%r9,24(%r2,%r1)
	alg	%r9,8(%r3,%r1)
	stg	%r9,8(%r3,%r1)

	la	%r2,32(%r2)	// i+=4
	brct	%r4,.Loop4_madd
	mlgr	%r12,%r5
	alcgr	%r11,%r8
	alcgr	%r10,zero
	alg	%r11,16(%r3,%r1)
	stg	%r11,16(%r3,%r1)

	la	%r10,1(%r10)		// see if len%4 is zero ...
	brct	%r10,.Loop1_madd	// without touching condition code:-)
	alcgr	%r13,%r10
	alcgr	%r12,zero
	alg	%r13,24(%r3,%r1)
	stg	%r13,24(%r3,%r1)

	la	%r3,32(%r3)	// i+=4

	la	%r2,1(%r2)	// see if len%4 is zero ...
	brct	%r2,.Loop1_madd	// without touching condition code:-)

// Common exit: fold the pending carry flag into the carry word, restore the
// saved registers and return.
.Lend_madd:
	alcgr	%r8,zero	// collect carry bit
	lgr	%r2,%r8
	lmg	%r6,%r10,48(%r15)	// reload %r6-%r10 from 48(%r15)
	lgr	%r2,zero	// return value
	alcgr	%r2,%r12	// collect even carry bit
	lmg	%r6,%r13,48(%r15)	// reload %r6-%r13 from 48(%r15)
	br	%r14		// return to the address in %r14

// One-word-at-a-time loop; entered directly when cnt=len/4 is zero, or after
// the 4-way code when len%4 is non-zero.
.Loop1_madd:
	lg	%r7,0(%r2,%r3)	// ap[i]
	lg	%r7,0(%r3)	// ap[i]
	mlgr	%r6,%r5		// *=w
	alcgr	%r7,%r8		// +=carry
	alcgr	%r7,%r12	// +=carry
	alcgr	%r6,zero
	alg	%r7,0(%r2,%r1)	// +=rp[i]
	stg	%r7,0(%r2,%r1)	// rp[i]=
	alg	%r7,0(%r3,%r1)	// +=rp[i]
	stg	%r7,0(%r3,%r1)	// rp[i]=

	lgr	%r8,%r6
	la	%r2,8(%r2)	// i++
	brct	%r10,.Loop1_madd
	lgr	%r12,%r6	// high word becomes the next carry
	la	%r3,8(%r3)	// i++
	brct	%r2,.Loop1_madd

	j	.Lend_madd
.size	bn_mul_add_words,.-bn_mul_add_words