Commit 0dd25e36 authored by Andy Polyakov's avatar Andy Polyakov
Browse files

Bignum division tune-up. Idea is to move multiplications in front of

loop body and replace 'em with addition/subtraction.
parent a40f6dce
Loading
Loading
Loading
Loading
+132 −56
Original line number Diff line number Diff line
.rdata
.asciiz "mips3.s, Version 1.0 (prerelease)"
.asciiz "mips3.s, Version 1.0"
.asciiz	"MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"

/*
@@ -19,19 +19,26 @@
 * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c
 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
 *
 * The module is designed to work with "new" IRIX ABI(5), namely
 * N32 and N64. But it was tested only with MIPSpro 7.2.x assembler,
 * i.e. depends on preprocessor options set up by MIPSspro 7.2.x
 * driver. Another neat gadget offered by MIPSpro 7.2.x assembler is
 * an peep-hole(?) optimization pass. This gave me the opportunity
 * to make the code looking more regular as all those architecture
 * dependent(!) instruction rescheduling details were left to the
 * assembler. Cool, huh? Do note that I have no idea if GNU assembler
 * does anything similar nor how GNU C will do with this module.
 * Feedback on the matter is therefore very much appreciated:-)
 * The module is designed to work with either of the "new" MIPS ABI(5),
 * namely N32 or N64, offered by IRIX 6.x. It's not ment to work under
 * IRIX 5.x not only because it doesn't support new ABIs but also
 * because 5.x kernels put R4x00 CPU into 32-bit mode and all those
 * 64-bit instructions (daddu, dmultu, etc.) found below gonna only
 * cause illegal instruction exception:-(
 *
 * In addition the code depends on preprocessor flags set up by MIPSpro
 * compiler driver (either as or cc) and therefore (probably?) can't be
 * compiled by the GNU assembler. GNU C driver manages fine though...
 * I mean as long as -mmips-as is specified or is the default option,
 * because then it simply invokes /usr/bin/as which in turn takes
 * perfect care of the preprocessor definitions. Another neat feature
 * offered by the MIPSpro assembler is an optimization pass. This gave
 * me the opportunity to have the code looking more regular as all those
 * architecture dependent instruction rescheduling details were left to
 * the assembler. Cool, huh?
 *
 * Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
 * exhibits 3-3.5-3.7 times improvement!
 * goes way over 3 times faster!
 *
 *					<appro@fy.chalmers.se>
 */
@@ -56,8 +63,8 @@

#define	MINUS4	v1

LEAF(bn_mul_add_words)
.align	5
LEAF(bn_mul_add_words)
	.set	noreorder
	bgtzl	a2,.L_bn_mul_add_words_proceed
	ld	t0,0(a1)
@@ -185,8 +192,8 @@ LEAF(bn_mul_add_words)
	jr	ra
END(bn_mul_add_words)

LEAF(bn_mul_words)
.align	5
LEAF(bn_mul_words)
	.set	noreorder
	bgtzl	a2,.L_bn_mul_words_proceed
	ld	t0,0(a1)
@@ -284,8 +291,8 @@ LEAF(bn_mul_words)
	jr	ra
END(bn_mul_words)

LEAF(bn_sqr_words)
.align	5
LEAF(bn_sqr_words)
	.set	noreorder
	bgtzl	a2,.L_bn_sqr_words_proceed
	ld	t0,0(a1)
@@ -371,8 +378,8 @@ LEAF(bn_sqr_words)
	jr	ra
END(bn_sqr_words)

LEAF(bn_add_words)
.align	5
LEAF(bn_add_words)
	.set	noreorder
	bgtzl	a3,.L_bn_add_words_proceed
	ld	t0,0(a1)
@@ -471,8 +478,8 @@ LEAF(bn_add_words)
	jr	ra
END(bn_add_words)

LEAF(bn_sub_words)
.align	5
LEAF(bn_sub_words)
	.set	noreorder
	bgtzl	a3,.L_bn_sub_words_proceed
	ld	t0,0(a1)
@@ -567,24 +574,24 @@ END(bn_sub_words)

#undef	MINUS4

LEAF(bn_div_words)
.align	5
LEAF(bn_div_words)
	.set	noreorder
	bnezl	a2,.L_bn_div_words_proceed
	move	t0,zero
	move	v1,zero
	jr	ra
	li	v0,-1		/* I'd rather signal div-by-zero
				 * which can be done with 'break 7' */
	.set	reorder

.L_bn_div_words_proceed:
	bltz	a2,.L_bn_div_words_body
	.set	noreorder
	move	t9,v1
	dsll	a2,1
	bgtz	a2,.-4
	addu	t0,1
	addu	t9,1

	.set	reorder
	negu	t1,t0
	negu	t1,t9
	li	t2,-1
	dsll	t2,t1
	and	t2,a0
@@ -593,65 +600,135 @@ LEAF(bn_div_words)
	bnezl	t2,.+8
	break	6		/* signal overflow */
	.set	reorder
	dsll	a0,t0
	dsll	a1,t0
	dsll	a0,t9
	dsll	a1,t9
	or	a0,AT

#define	QT	ta0
#define	DH	ta1
#define	HH	ta2
#define	MINUS1	ta3
#define	HH	ta1
#define	DH	v1
.L_bn_div_words_body:
	dsrl	DH,a2,32
	li	v1,2
	sgeu	AT,a0,a2
	li	MINUS1,-1
	.set	noreorder
	bnezl	AT,.+8
	dsubu	a0,a2
	.set	reorder

.L_bn_div_words_outer_loop:
	li	QT,-1
	dsrl	HH,a0,32
	subu	v1,1
	dsrl	QT,MINUS1,32	/* q=0xffffffff */
	beq	DH,HH,.L_bn_div_words_inner_loop
	dsrl	QT,32	/* q=0xffffffff */
	beq	DH,HH,.L_bn_div_words_skip_div1
	ddivu	zero,a0,DH
	mflo	QT
.L_bn_div_words_inner_loop:
.L_bn_div_words_skip_div1:
	dmultu	a2,QT
	dsll	t3,a0,32
	dsrl	AT,a1,32
	or	t3,AT
	mflo	t0
	mfhi	t1
.L_bn_div_words_inner_loop1:
	sltu	t2,t3,t0
	seq	t8,HH,t1
	sltu	AT,HH,t1
	and	t2,t8
	or	AT,t2
	.set	noreorder
	bnezl	AT,.L_bn_div_words_inner_loop
	dsubu	QT,1
	beqz	AT,.L_bn_div_words_inner_loop1_done
	sltu	t2,t0,a2
	.set	reorder
	
	dsubu	a0,t3,t0
	beqz	v1,.L_bn_div_words_outer_loop_done
	dsubu	QT,1
	dsubu	t0,a2
	dsubu	t1,t2
	b	.L_bn_div_words_inner_loop1
.L_bn_div_words_inner_loop1_done:	

	dsll	a1,32
	dsubu	a0,t3,t0
	dsll	v0,QT,32
	b	.L_bn_div_words_outer_loop

.L_bn_div_words_outer_loop_done:
	li	QT,-1
	dsrl	HH,a0,32
	dsrl	QT,32	/* q=0xffffffff */
	beq	DH,HH,.L_bn_div_words_skip_div2
	ddivu	zero,a0,DH
	mflo	QT
.L_bn_div_words_skip_div2:
	dmultu	a2,QT
	dsll	t3,a0,32
	dsrl	AT,a1,32
	or	t3,AT
	mflo	t0
	mfhi	t1
.L_bn_div_words_inner_loop2:
	sltu	t2,t3,t0
	seq	t8,HH,t1
	sltu	AT,HH,t1
	and	t2,t8
	or	AT,t2
	.set	noreorder
	beqz	AT,.L_bn_div_words_inner_loop2_done
	sltu	t2,t0,a2
	.set	reorder
	dsubu	QT,1
	dsubu	t0,a2
	dsubu	t1,t2
	b	.L_bn_div_words_inner_loop2
.L_bn_div_words_inner_loop2_done:	

	dsubu	a0,t3,t0
	or	v0,QT
	move	v1,a0	/* v1 contains remainder if one wants it */
	dsrl	v1,a0,t9	/* v1 contains remainder if anybody wants it */
	dsrl	a2,t9		/* restore a2 */
	jr	ra
#undef	MINUS1
#undef	HH
#undef	DH
#undef	QT
END(bn_div_words)

.align 5
LEAF(bn_div_3_words)
	.set	reorder
	move	a3,a0		/* we know that bn_div_words doesn't
				 * touch a3, ta2, ta3 and preserves a2
				 * so that we can save two arguments
				 * and return address in registers
				 * instead of stack:-)
				 */
	ld	a0,(a3)
	move	ta2,a2
	move	a2,a1
	ld	a1,-8(a3)
	move	ta3,ra
	move	v1,zero
	li	v0,-1
	beq	a0,a2,.L_bn_div_3_words_skip_div
	jal	bn_div_words
	move	ra,ta3
.L_bn_div_3_words_skip_div:
	dmultu	ta2,v0
	ld	t2,-16(a3)
	mflo	t0
	mfhi	t1
.L_bn_div_3_words_inner_loop:
	sgeu	AT,t2,t0
	seq	t9,t1,v1
	sltu	t8,t1,v1
	and	AT,t9
	or	AT,t8
	bnez	AT,.L_bn_div_3_words_inner_loop_done
	daddu	v1,a2
	sltu	t3,t0,ta2
	sltu	AT,v1,a2
	dsubu	v0,1
	dsubu	t0,ta2
	dsubu	t1,t3
	beqz	AT,.L_bn_div_3_words_inner_loop
.L_bn_div_3_words_inner_loop_done:
	jr	ra
END(bn_div_3_words)

#define	a_0	t0
#define	a_1	t1
#define	a_2	t2
@@ -679,20 +756,19 @@ END(bn_div_words)

#define	FRAME_SIZE	48

LEAF(bn_mul_comba8)
.align	5
LEAF(bn_mul_comba8)
	.set	noreorder
	PTR_SUB	sp,FRAME_SIZE
	.frame	sp,64,ra
	.set	reorder
	ld	a_0,0(a1)	/* If compiled with -mips3 options
				 * assembler barks on this line with
				 * "shouldn't have mult/div as last
				 * instruction in bb (R10K bug)"
				 * warning. If anybody out there has
				 * a clue on what does "bb" mean and
				 * how to circumvent this do send me
				 * a note.
	ld	a_0,0(a1)	/* If compiled with -mips3 option on
				 * R5000 box assembler barks on this
				 * line with "shouldn't have mult/div
				 * as last instruction in bb (R10K
				 * bug)" warning. If anybody out there
				 * has a clue about how to circumvent
				 * this do send me a note.
				 *		<appro@fy.chalmers.se>
				 */
	ld	b_0,0(a2)
@@ -1286,8 +1362,8 @@ LEAF(bn_mul_comba8)
	jr	ra
END(bn_mul_comba8)

LEAF(bn_mul_comba4)
.align	5
LEAF(bn_mul_comba4)
	.set	reorder
	ld	a_0,0(a1)
	ld	b_0,0(a2)
@@ -1444,8 +1520,8 @@ END(bn_mul_comba4)
#define	a_6	b_2
#define	a_7	b_3

LEAF(bn_sqr_comba8)
.align	5
LEAF(bn_sqr_comba8)
	.set	reorder
	ld	a_0,0(a1)
	ld	a_1,8(a1)
@@ -1934,8 +2010,8 @@ LEAF(bn_sqr_comba8)
	jr	ra
END(bn_sqr_comba8)

LEAF(bn_sqr_comba4)
.align	5
LEAF(bn_sqr_comba4)
	.set	reorder
	ld	a_0,0(a1)
	ld	a_1,8(a1)
+6 −4
Original line number Diff line number Diff line
@@ -264,18 +264,20 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
		else
			q=h/dh;

		th=q*dh;
		tl=dl*q;
		for (;;)
			{
			t=(h-(th=q*dh));
			tl=BN_MASK2;
			t=h-th;
			if ((t&BN_MASK2h) ||
				((tl=dl*q) <= (
				((tl) <= (
					(t<<BN_BITS4)|
					((l&BN_MASK2h)>>BN_BITS4))))
				break;
			q--;
			th-=dh;
			tl-=dl;
			}
		if (tl==BN_MASK2) tl=q*dl;
		t=(tl>>BN_BITS4);
		tl=(tl<<BN_BITS4)&BN_MASK2h;
		th+=t;
+42 −29
Original line number Diff line number Diff line
@@ -200,56 +200,69 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,

	for (i=0; i<loop-1; i++)
		{
		BN_ULONG q,n0,n1;
		BN_ULONG l0;
		BN_ULONG q,l0;
#ifdef BN_DIV3W
		q=bn_div_3_words(wnump,d0,d1);
#else
		BN_ULONG n0,n1,rem;

		wnum.d--; wnum.top++;
		n0=wnump[0];
		n1=wnump[-1];
		if (n0 == d0)
			q=BN_MASK2;
		else
#if defined(BN_LLONG) && defined(BN_DIV2W)
			q=((((BN_ULLONG)n0)<<BN_BITS2)|n1)/((BN_ULLONG)d0);
#else
			q=bn_div_words(n0,n1,d0);
#endif
		{
#ifdef BN_LLONG
		BN_ULLONG t1,t2,rem;
		t1=((BN_ULLONG)n0<<BN_BITS2)|n1;
		BN_ULLONG t2;

		/*
		 * rem doesn't have to be BN_ULLONG. The least we
		 * know it's less that d0, isn't it?
		 */
		rem=(n1-q*d0)&BN_MASK2;

		t2=(BN_ULLONG)d1*q;
		for (;;)
			{
			rem=t1-(BN_ULLONG)q*d0;
			t2=(BN_ULLONG)d1*q;
                        if ((rem>>BN_BITS2) ||
				(t2 <= ((rem<<BN_BITS2)|wnump[-2])))
                        if (t2 <= ((((BN_ULLONG)rem)<<BN_BITS2)|wnump[-2]))
				break;
			q--;
			rem += d0;
			if (rem < d0) break; /* don't let rem overflow */
			t2 -= d1;
			}
#else
		BN_ULONG t1l,t1h,t2l,t2h,t3l,t3h,ql,qh,t3t;
		t1h=n0;
		t1l=n1;
		for (;;)
			{
		BN_ULONG t2l,t2h,ql,qh;

		/*
		 * It's more than enough with the only multiplication.
		 * See the comment above in BN_LLONG section...
		 */
		rem=(n1-q*d0)&BN_MASK2;

		t2l=LBITS(d1); t2h=HBITS(d1);
		ql =LBITS(q);  qh =HBITS(q);
		mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */

			t3t=LBITS(d0); t3h=HBITS(d0);
			mul64(t3t,t3h,ql,qh); /* t3=t1-(BN_ULLONG)q*d0; */
			t3l=(t1l-t3t)&BN_MASK2;
			if (t3l > t1l) t3h++;
			t3h=(t1h-t3h)&BN_MASK2;

			/*if ((t3>>BN_BITS2) ||
				(t2 <= ((t3<<BN_BITS2)+wnump[-2])))
				break; */
			if (t3h) break;
			if (t2h < t3l) break;
			if ((t2h == t3l) && (t2l <= wnump[-2])) break;

		for (;;)
			{
			if ((t2h < rem) ||
				((t2h == rem) && (t2l <= wnump[-2])))
				break;
			q--;
			rem += d0;
			if (rem < d0) break; /* don't let rem overflow */
			if (t2l < d1) t2h--; t2l -= d1;
			}
#endif
		}
#endif /* BN_DIV3W */
		wnum.d--; wnum.top++;
		l0=bn_mul_words(tmp->d,sdiv->d,div_n,q);
		tmp->d[div_n]=l0;
		for (j=div_n+1; j>0; j--)