Commit 11de71b0 authored by Andy Polyakov's avatar Andy Polyakov
Browse files

3-4 times better RSA/DSA performance on WIN64A target. Well, on AMD64 CPU,

EM64T will hardly exhibit better performance...
parent 19bd66fe
Loading
Loading
Loading
Loading
+28 −0
Original line number Diff line number Diff line
@@ -459,6 +459,34 @@ BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
/*
 * sqr_add_c2(a,i,j,c0,c1,c2) -- hand the array elements a[i] and a[j],
 * together with the accumulator words, to whichever mul_add_c2 is in
 * effect at the point of use.
 */
#define sqr_add_c2(a,i,j,c0,c1,c2) mul_add_c2((a)[(i)],(a)[(j)],c0,c1,c2)

#elif defined(BN_UMULT_LOHI)

/*
 * mul_add_c(a,b,c0,c1,c2) -- add the double-word product a*b to the
 * three-word accumulator (c2:c1:c0).
 *
 * t1 and t2 are scratch words that this macro does not declare; they
 * must be in scope wherever it is expanded.
 */
#define mul_add_c(a,b,c0,c1,c2)	{				\
	BN_ULONG ta=(a);					\
	BN_ULONG tb=(b);					\
	BN_UMULT_LOHI(t1,t2,ta,tb);	/* t1=low, t2=high */	\
	c0 += t1;						\
	t2 += (c0<t1);			/* carry out of c0 */	\
	c1 += t2;						\
	c2 += (c1<t2);			/* carry out of c1 */	\
	}

/*
 * mul_add_c2(a,b,c0,c1,c2) -- add 2*a*b to the three-word accumulator
 * (c2:c1:c0).
 *
 * The double-word product (hi:lo) is added twice instead of being
 * doubled first.  Doubling first leaves the high word as 2*hi+carry,
 * which can be all ones; folding the carry out of c0 into it then wraps
 * it to zero and that carry is silently lost.  Adding (hi:lo) twice is
 * safe because the high word of a product is at most the maximum word
 * value minus one, so absorbing a single carry cannot overflow it.
 *
 * t1 and t2 are scratch words that this macro does not declare; they
 * must be in scope wherever it is expanded.
 */
#define mul_add_c2(a,b,c0,c1,c2) {	\
	BN_ULONG ta=(a),tb=(b),t0;	\
	BN_UMULT_LOHI(t0,t1,ta,tb);	\
	c0 += t0; t2 = t1+((c0<t0)?1:0);\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	c0 += t0; t1 += (c0<t0)?1:0;	\
	c1 += t1; c2 += (c1<t1)?1:0;	\
	}

/*
 * sqr_add_c(a,i,c0,c1,c2) -- add the double-word square of a[i] to the
 * three-word accumulator (c2:c1:c0).
 *
 * t1 and t2 are scratch words that this macro does not declare; they
 * must be in scope wherever it is expanded.
 */
#define sqr_add_c(a,i,c0,c1,c2)	{				\
	BN_ULONG ta;						\
	ta = (a)[i];						\
	BN_UMULT_LOHI(t1,t2,ta,ta);	/* t1=low, t2=high */	\
	c0 += t1;						\
	t2 += (c0<t1);			/* carry out of c0 */	\
	c1 += t2;						\
	c2 += (c1<t2);			/* carry out of c1 */	\
	}

/*
 * sqr_add_c2(a,i,j,c0,c1,c2) -- forward the array elements a[i] and a[j],
 * together with the accumulator words, to mul_add_c2.
 */
#define sqr_add_c2(a,i,j,c0,c1,c2) mul_add_c2((a)[(i)],(a)[(j)],c0,c1,c2)

#elif defined(BN_UMULT_HIGH)

#define mul_add_c(a,b,c0,c1,c2)	{	\
+36 −0
Original line number Diff line number Diff line
@@ -270,6 +270,15 @@ extern "C" {
		: "a"(a),"g"(b)		\
		: "cc");
#  endif
# elif (defined(_M_AMD64) || defined(_M_X64)) && defined(SIXTY_FOUR_BIT)
/*
 * Taken when _M_AMD64 or _M_X64 is defined together with SIXTY_FOUR_BIT.
 * The multiply helpers below are only provided for _MSC_VER >= 1400;
 * for any other compiler this branch defines nothing.
 */
#  if defined(_MSC_VER) && _MSC_VER>=1400
    /*
     * Hand-written prototypes for the two compiler intrinsics, followed by
     * the pragma that asks the compiler to treat them as intrinsics.
     * NOTE(review): per the macro argument names below, __umulh is expected
     * to yield the high word of a*b, and _umul128 to return the low word
     * while storing the high word through h -- confirm against the
     * compiler's intrinsic documentation.
     */
    unsigned __int64 __umulh	(unsigned __int64 a,unsigned __int64 b);
    unsigned __int64 _umul128	(unsigned __int64 a,unsigned __int64 b,
				 unsigned __int64 *h);
#   pragma intrinsic(__umulh,_umul128)
/* BN_UMULT_HIGH(a,b): expression whose value is __umulh(a,b). */
#   define BN_UMULT_HIGH(a,b)		__umulh((a),(b))
/*
 * BN_UMULT_LOHI(low,high,a,b): assigns _umul128's return value to low and
 * passes &high as the output pointer, so high must be an addressable
 * lvalue.  The expansion is an expression with no trailing semicolon.
 */
#   define BN_UMULT_LOHI(low,high,a,b)	((low)=_umul128((a),(b),&(high)))
#  endif
# endif		/* cpu */
#endif		/* OPENSSL_NO_ASM */

@@ -313,6 +322,33 @@ extern "C" {
	(r1)=Hw(t); \
	}

#elif defined(BN_UMULT_LOHI)
/*
 * mul_add(r,a,w,c) -- compute r + a*w + c as a double-word value, store
 * its low word back in r and its high word in c.  r and c are both read
 * and written, so they must be lvalues.
 */
#define mul_add(r,a,w,c) {					\
	BN_ULONG high;						\
	BN_ULONG low;						\
	BN_ULONG ret;						\
	BN_ULONG tmp=(a);					\
	ret = (r);						\
	BN_UMULT_LOHI(low,high,w,tmp);				\
	ret += (c);			/* add carry-in */	\
	(c) = (ret<(c));		/* did that wrap? */	\
	(c) += high;						\
	ret += low;						\
	(c) += (ret<low);		/* did that wrap? */	\
	(r) = ret;						\
	}

/*
 * mul(r,a,w,c) -- compute a*w + c as a double-word value, store its low
 * word in r and its high word in c.  c is read and written; r is only
 * written.
 */
#define mul(r,a,w,c)	{					\
	BN_ULONG high;						\
	BN_ULONG low;						\
	BN_ULONG ret;						\
	BN_ULONG ta=(a);					\
	BN_UMULT_LOHI(low,high,w,ta);				\
	ret = low + (c);					\
	(c) = high + (ret<low);		/* carry from low */	\
	(r) = ret;						\
	}

/*
 * sqr(r0,r1,a) -- square a, leaving the low word of the result in r0 and
 * the high word in r1.  a is evaluated once.
 */
#define sqr(r0,r1,a)	{			\
	BN_ULONG tmp;				\
	tmp = (a);				\
	BN_UMULT_LOHI(r0,r1,tmp,tmp);		\
	}

#elif defined(BN_UMULT_HIGH)
#define mul_add(r,a,w,c) {		\
	BN_ULONG high,low,ret,tmp=(a);	\