Commit fb81ac5e authored by Andy Polyakov's avatar Andy Polyakov
Browse files

Support for "multiply high" instruction, see BN_UMULT_HIGH comment in

crypto/bn/bn_lcl.h for further details. It should be noted that for
the moment of this writing the code was tested only on Alpha. If
compiled with DEC C the C implementation exhibits 12% performance
improvement over the crypto/bn/asm/alpha.s (on EV56 box running
AlphaLinux). GNU C is (unfortunately) 8% behind the assembler
implementation. But it's OpenVMS Alpha users who *may* benefit most
as 'apps/openssl speed rsa' exhibits 6 (six) times performance
improvement over the original VMS bignum implementation. Where "*may*"
means "as soon as code is enabled though #define SIXTY_FOUR_BIT and
crypto/bn/asm/vms.mar is skipped."
parent 54a34aec
Loading
Loading
Loading
Loading
+61 −37
Original line number Diff line number Diff line
@@ -60,7 +60,7 @@
#include "cryptlib.h"
#include "bn_lcl.h"

#ifdef BN_LLONG 
#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)

BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
	{
@@ -69,18 +69,19 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
	bn_check_num(num);
	if (num <= 0) return(c1);

	for (;;)
	while (num&~3)
		{
		mul_add(rp[0],ap[0],w,c1);
		if (--num == 0) break;
		mul_add(rp[1],ap[1],w,c1);
		if (--num == 0) break;
		mul_add(rp[2],ap[2],w,c1);
		if (--num == 0) break;
		mul_add(rp[3],ap[3],w,c1);
		if (--num == 0) break;
		ap+=4;
		rp+=4;
		ap+=4; rp+=4; num-=4;
		}
	if (num)
		{
		mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
		mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
		mul_add(rp[2],ap[2],w,c1); return c1;
		}
	
	return(c1);
@@ -93,19 +94,19 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
	bn_check_num(num);
	if (num <= 0) return(c1);

	/* for (;;) */
	while (1) /* circumvent egcs-1.1.2 bug */
	while (num&~3)
		{
		mul(rp[0],ap[0],w,c1);
		if (--num == 0) break;
		mul(rp[1],ap[1],w,c1);
		if (--num == 0) break;
		mul(rp[2],ap[2],w,c1);
		if (--num == 0) break;
		mul(rp[3],ap[3],w,c1);
		if (--num == 0) break;
		ap+=4;
		rp+=4;
		ap+=4; rp+=4; num-=4;
		}
	if (num)
		{
		mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
		mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
		mul(rp[2],ap[2],w,c1);
		}
	return(c1);
	} 
@@ -114,28 +115,19 @@ void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
        {
	bn_check_num(n);
	if (n <= 0) return;
	for (;;)
	while (n&~3)
		{
		BN_ULLONG t;

		t=(BN_ULLONG)(a[0])*(a[0]);
		r[0]=Lw(t); r[1]=Hw(t);
		if (--n == 0) break;

		t=(BN_ULLONG)(a[1])*(a[1]);
		r[2]=Lw(t); r[3]=Hw(t);
		if (--n == 0) break;

		t=(BN_ULLONG)(a[2])*(a[2]);
		r[4]=Lw(t); r[5]=Hw(t);
		if (--n == 0) break;

		t=(BN_ULLONG)(a[3])*(a[3]);
		r[6]=Lw(t); r[7]=Hw(t);
		if (--n == 0) break;

		a+=4;
		r+=8;
		sqr(r[0],r[1],a[0]);
		sqr(r[2],r[3],a[1]);
		sqr(r[4],r[5],a[2]);
		sqr(r[6],r[7],a[3]);
		a+=4; r+=8; n-=4;
		}
	if (n)
		{
		sqr(r[0],r[1],a[0]); if (--n == 0) return;
		sqr(r[2],r[3],a[1]); if (--n == 0) return;
		sqr(r[4],r[5],a[2]);
		}
	}

@@ -460,6 +452,38 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)

#define sqr_add_c2(a,i,j,c0,c1,c2) \
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)

#elif defined(BN_UMULT_HIGH)

#define mul_add_c(a,b,c0,c1,c2)	{	\
	BN_ULONG ta=(a),tb=(b);		\
	t1 = ta * tb;			\
	t2 = BN_UMULT_HIGH(ta,tb);	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

#define mul_add_c2(a,b,c0,c1,c2) {	\
	BN_ULONG ta=(a),tb=(b),t0;	\
	t1 = BN_UMULT_HIGH(ta,tb);	\
	t0 = ta * tb;			\
	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

#define sqr_add_c(a,i,c0,c1,c2)	{	\
	BN_ULONG ta=(a)[i];		\
	t1 = ta * ta;			\
	t2 = BN_UMULT_HIGH(ta,ta);	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

#define sqr_add_c2(a,i,j,c0,c1,c2)	\
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)

#else
#define mul_add_c(a,b,c0,c1,c2) \
	t1=LBITS(a); t2=HBITS(a); \
+5 −0
Original line number Diff line number Diff line
@@ -280,9 +280,14 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
		 */
		rem=(n1-q*d0)&BN_MASK2;
#endif
#ifdef BN_UMULT_HIGH
		t2l = d1 * q;
		t2h = BN_UMULT_HIGH(d1,q);
#else
		t2l=LBITS(d1); t2h=HBITS(d1);
		ql =LBITS(q);  qh =HBITS(q);
		mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */
#endif

		for (;;)
			{
+85 −0
Original line number Diff line number Diff line
@@ -86,6 +86,54 @@ extern "C" {
#endif
#endif

#if !defined(NO_ASM) && !defined(PEDANTIC)
/*
 * BN_UMULT_HIGH section.
 *
 * No, I'm not trying to overwhelm you when stating that the
 * product of N-bit numbers is 2*N bits wide:-) No, I don't expect
 * you to be impressed when I say that if the compiler doesn't
 * support 2*N integer type, then you have to replace every N*N
 * multiplication with 4 (N/2)*(N/2) accompanied by some shifts
 * and additions which unavoidably results in severe performance
 * penalties. Of course provided that the hardware is capable of
 * producing 2*N result... That's when you normally start
 * considering assembler implementation. However! It should be
 * pointed out that some CPUs (most notably Alpha, PowerPC and
 * upcoming IA-64 family:-) provide *separate* instruction
 * calculating the upper half of the product placing the result
 * into a general purpose register. Now *if* the compiler supports
 * inline assembler, then it's not impossible to implement the
 * "bignum" routines (and have the compiler optimize 'em)
 * exhibiting "native" performance in C. That's what BN_UMULT_HIGH
 * macro is about:-)
 *
 *					<appro@fy.chalmers.se>
 */
# if defined(__alpha) && (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
#  if defined(__DECC)
#   include <c_asm.h>
#   define BN_UMULT_HIGH(a,b)	(BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b))
#  elif defined(__GNUC__)
#   define BN_UMULT_HIGH(a,b)	({	\
	register BN_ULONG ret;		\
	asm ("umulh	%1,%2,%0"	\
	     : "=r"(ret)		\
	     : "r"(a), "r"(b));		\
	ret;			})
#  endif	/* compiler */
# elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG)
#  if defined(__GNUC__)
#   define BN_UMULT_HIGH(a,b)	({	\
	register BN_ULONG ret;		\
	asm ("mulhdu	%0,%1,%2"	\
	     : "=r"(ret)		\
	     : "r"(a), "r"(b));		\
	ret;			})
#  endif	/* compiler */
# endif		/* cpu */
#endif		/* NO_ASM */

/*************************************************************
 * Using the long long type
 */
@@ -151,6 +199,43 @@ extern "C" {
	(c)= Hw(t); \
	}

#define sqr(r0,r1,a) { \
	BN_ULLONG t; \
	t=(BN_ULLONG)(a)*(a); \
	(r0)=Lw(t); \
	(r1)=Hw(t); ]
	}

#elif defined(BN_UMULT_HIGH)
#define mul_add(r,a,w,c) {		\
	BN_ULONG high,low,ret,tmp=(a);	\
	ret =  (r);			\
	high=  BN_UMULT_HIGH(w,tmp);	\
	ret += (c);			\
	low =  (w) * tmp;		\
	(c) =  (ret<(c))?1:0;		\
	(c) += high;			\
	ret += low;			\
	(c) += (ret<low)?1:0;		\
	(r) =  ret;			\
	}

#define mul(r,a,w,c)	{		\
	BN_ULONG high,low,ret,ta=(a);	\
	low =  (w) * ta;		\
	high=  BN_UMULT_HIGH(w,ta);	\
	ret =  low + (c);		\
	(c) =  high;			\
	(c) += (ret<low)?1:0;		\
	(r) =  ret;			\
	}

#define sqr(r0,r1,a)	{		\
	BN_ULONG tmp=(a);		\
	(r0) = tmp * tmp;		\
	(r1) = BN_UMULT_HIGH(tmp,tmp);	\
	}

#else
/*************************************************************
 * No long long type