crypto/bn/bn.h  +2 −0

@@ -727,6 +727,8 @@ int RAND_pseudo_bytes(unsigned char *buf,int num);
 	bn_pollute(a); \
 	}
 
+void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,BN_ULONG n0, int num);
+void bn_sqr_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *np,BN_ULONG n0, int num);
 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
 void bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num);

crypto/bn/bn_asm.c  +124 −2

@@ -820,18 +820,95 @@ void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
 	r[6]=c1;
 	r[7]=c2;
 	}
 
+#ifdef OPENSSL_BN_ASM_MONT
+/*
+ * This is essentially a reference implementation, which may or may not
+ * give a performance improvement.  E.g. on IA-32 it yields 40% faster
+ * rsa1024 private-key operations and 10% faster rsa4096 ones, while on
+ * AMD64 it improves rsa1024 sign by only 10% and *worsens* rsa4096
+ * sign by 15%.  Once again, it is a reference implementation, meant to
+ * be used as a starting point for platform-specific assembler.
+ */
+void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,BN_ULONG n0, int num)
+	{
+	BN_ULONG c0,c1,ml,*tp;
+#ifdef mul64
+	BN_ULONG mh;
+#endif
+	volatile BN_ULONG *vp;
+	int i=0,j;
+
+	vp = tp = alloca((num+2)*sizeof(BN_ULONG));
+
+	tp[num]   = bn_mul_words(tp,ap,num,bp[0]);
+	tp[num+1] = 0;
+	goto enter;
+
+	for(i=0;i<num;i++)
+		{
+		c0 = bn_mul_add_words(tp,ap,num,bp[i]);
+		c1 = (tp[num] + c0)&BN_MASK2;
+		tp[num]   = c1;
+		tp[num+1] = (c1<c0?1:0);
+	enter:
+		c1 = tp[0];
+		ml = (c1*n0)&BN_MASK2;
+		c0 = 0;
+#ifdef mul64
+		mh = HBITS(ml);
+		ml = LBITS(ml);
+		mul_add(c1,np[0],ml,mh,c0);
+#else
+		mul_add(c1,ml,np[0],c0);
+#endif
+		for(j=1;j<num;j++)
+			{
+			c1 = tp[j];
+#ifdef mul64
+			mul_add(c1,np[j],ml,mh,c0);
+#else
+			mul_add(c1,ml,np[j],c0);
+#endif
+			tp[j-1] = c1&BN_MASK2;
+			}
+		c1        = (tp[num] + c0)&BN_MASK2;
+		tp[num-1] = c1;
+		tp[num]   = tp[num+1] + (c1<c0?1:0);
+		}
+
+	if (tp[num]!=0 || tp[num-1]>=np[num-1])
+		{
+		c0 = bn_sub_words(rp,tp,np,num);
+		if (tp[num]!=0 || c0==0)
+			{
+			for(i=0;i<num+2;i++)	vp[i] = 0;
+			return;
+			}
+		}
+	for(i=0;i<num;i++)	rp[i] = tp[i],	vp[i] = 0;
+	vp[num]   = 0;
+	vp[num+1] = 0;
+	}
+
+void bn_sqr_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *np,BN_ULONG n0, int num)
+	{
+	bn_mul_mont(rp,ap,ap,np,n0,num);
+	}
+#endif	/* OPENSSL_BN_ASM_MONT */
 #else /* !BN_MUL_COMBA */
 
 /* hmm... is it faster just to do a multiply? */
 
 #undef bn_sqr_comba4
-void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
 	{
 	BN_ULONG t[8];
 	bn_sqr_normal(r,a,4,t);
 	}
 
 #undef bn_sqr_comba8
-void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
 	{
 	BN_ULONG t[16];
 	bn_sqr_normal(r,a,8,t);
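Both bn_mul_mont variants in this patch implement the same word-serial Montgomery multiplication (CIOS in Koç's taxonomy): each outer-loop round adds one partial product a*bp[i] into an accumulator, then adds the multiple of N that cancels the accumulator's low word (the multiplier is tp[0]*n0 mod 2^w, where n0 = -N^(-1) mod 2^w and w is the word width), and finally shifts the accumulator down one word. The `goto enter` above merely peels the first round, using bn_mul_words instead of accumulating into a zeroed buffer. After num rounds the accumulator holds a*b*2^(-w*num) mod N, up to one conditional subtraction of N. Below is a minimal standalone sketch with fixed 32-bit limbs; mont_mul_demo and NUM are made-up names for this illustration, and the volatile stack-wiping of the real code is omitted.

#include <stdint.h>

#define NUM 2	/* limb count; the real code takes it as a parameter */

/*
 * mont_mul_demo: rp = ap*bp*2^(-32*NUM) mod np, limbs little-endian,
 * n0 = -np[0]^(-1) mod 2^32.  Same accumulate / reduce / shift rhythm
 * as bn_mul_mont above, with uint64_t standing in for mul_add().
 */
static void mont_mul_demo(uint32_t *rp, const uint32_t *ap,
                          const uint32_t *bp, const uint32_t *np,
                          uint32_t n0)
{
    uint32_t t[NUM + 2] = {0};
    int i, j;

    for (i = 0; i < NUM; i++) {
        uint64_t v, carry = 0;

        /* t += ap * bp[i] */
        for (j = 0; j < NUM; j++) {
            v = (uint64_t)ap[j] * bp[i] + t[j] + carry;
            t[j] = (uint32_t)v;
            carry = v >> 32;
        }
        v = (uint64_t)t[NUM] + carry;
        t[NUM] = (uint32_t)v;
        t[NUM + 1] = (uint32_t)(v >> 32);

        /* t += np * m, with m chosen so the low limb of t cancels */
        uint32_t m = t[0] * n0;
        carry = 0;
        for (j = 0; j < NUM; j++) {
            v = (uint64_t)np[j] * m + t[j] + carry;
            t[j] = (uint32_t)v;
            carry = v >> 32;
        }
        v = (uint64_t)t[NUM] + carry;
        t[NUM] = (uint32_t)v;
        t[NUM + 1] += (uint32_t)(v >> 32);

        /* t is now divisible by 2^32: shift down one limb */
        for (j = 0; j <= NUM; j++)
            t[j] = t[j + 1];
        t[NUM + 1] = 0;
    }

    /* final reduction: t < 2*np here, so subtract np at most once */
    int ge = (t[NUM] != 0);
    if (!ge) {
        ge = 1;				/* t == np also gets subtracted */
        for (j = NUM - 1; j >= 0; j--) {
            if (t[j] < np[j]) { ge = 0; break; }
            if (t[j] > np[j]) break;
        }
    }
    if (ge) {
        uint64_t borrow = 0;
        for (j = 0; j < NUM; j++) {
            uint64_t d = (uint64_t)t[j] - np[j] - borrow;
            rp[j] = (uint32_t)d;
            borrow = (d >> 32) & 1;
        }
    } else {
        for (j = 0; j < NUM; j++)
            rp[j] = t[j];
    }
}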
@@ -857,4 +934,49 @@ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 	r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
 	}
 
+#ifdef OPENSSL_BN_ASM_MONT
+void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,BN_ULONG n0, int num)
+	{
+	BN_ULONG c0,c1,*tp;
+	volatile BN_ULONG *vp;
+	int i=0,j;
+
+	vp = tp = alloca((num+2)*sizeof(BN_ULONG));
+
+	for(i=0;i<=num;i++)	tp[i]=0;
+
+	for(i=0;i<num;i++)
+		{
+		c0 = bn_mul_add_words(tp,ap,num,bp[i]);
+		c1 = tp[num] + c0;
+		tp[num]   = c1;
+		tp[num+1] = (c1<c0?1:0);
+
+		c0 = bn_mul_add_words(tp,np,num,tp[0]*n0);
+		c1 = tp[num] + c0;
+		tp[num]   = c1;
+		tp[num+1] += (c1<c0?1:0);
+		for(j=0;j<=num;j++)	tp[j]=tp[j+1];
+		}
+
+	if (tp[num]!=0 || tp[num-1]>=np[num-1])
+		{
+		c0 = bn_sub_words(rp,tp,np,num);
+		if (tp[num]!=0 || c0==0)
+			{
+			for(i=0;i<num+2;i++)	vp[i] = 0;
+			return;
+			}
+		}
+	for(i=0;i<num;i++)	rp[i] = tp[i],	vp[i] = 0;
+	vp[num]   = 0;
+	vp[num+1] = 0;
+	}
+
+void bn_sqr_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *np,BN_ULONG n0, int num)
+	{
+	bn_mul_mont(rp,ap,ap,np,n0,num);
+	}
+#endif	/* OPENSSL_BN_ASM_MONT */
 #endif /* !BN_MUL_COMBA */

crypto/bn/bn_mont.c  +16 −0

@@ -74,6 +74,22 @@ int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
 	{
 	BIGNUM *tmp;
 	int ret=0;
+#ifdef OPENSSL_BN_ASM_MONT
+	int num = mont->N.top;
+
+	if (num>1 && a->top==num && b->top==num)
+		{
+		if (bn_wexpand(r,num) == NULL) return 0;
+		r->neg = a->neg^b->neg;
+		r->top = num;
+		if (a==b)
+			bn_sqr_mont(r->d,a->d,mont->N.d,mont->n0,num);
+		else
+			bn_mul_mont(r->d,a->d,b->d,mont->N.d,mont->n0,num);
+		bn_fix_top(r);
+		return 1;
+		}
+#endif
 
 	BN_CTX_start(ctx);
 	tmp = BN_CTX_get(ctx);
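The bn_mont.c hook above takes effect transparently: callers keep using the public BIGNUM API, and BN_mod_mul_montgomery dispatches to bn_mul_mont/bn_sqr_mont whenever both operands occupy exactly num = N.top words, which is the common case for values produced by BN_to_montgomery. A minimal usage sketch follows; the modulus and operands are arbitrary test values (the modulus only has to be odd), and error checks are omitted for brevity.

#include <stdio.h>
#include <openssl/bn.h>

int main(void)
{
    BN_CTX *ctx = BN_CTX_new();
    BIGNUM *a = BN_new(), *b = BN_new(), *n = BN_new(), *r = BN_new();
    BN_MONT_CTX *mont = BN_MONT_CTX_new();
    char *s;

    BN_dec2bn(&n, "100000000000000000000000000000067");	/* odd modulus */
    BN_dec2bn(&a, "12345678901234567890123456789012");
    BN_dec2bn(&b, "98765432109876543210987654321098");
    BN_MONT_CTX_set(mont, n, ctx);		/* precomputes RR and n0 */

    BN_to_montgomery(a, a, mont, ctx);		/* a*R mod n */
    BN_to_montgomery(b, b, mont, ctx);		/* b*R mod n */
    BN_mod_mul_montgomery(r, a, b, mont, ctx);	/* a*b*R mod n */
    BN_from_montgomery(r, r, mont, ctx);	/* a*b mod n */

    s = BN_bn2dec(r);
    printf("a*b mod n = %s\n", s);
    OPENSSL_free(s);

    BN_MONT_CTX_free(mont);
    BN_free(a); BN_free(b); BN_free(n); BN_free(r);
    BN_CTX_free(ctx);
    return 0;
}

Build with cc demo.c -lcrypto.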
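A closing note on the n0 argument threaded through these functions: it is the single-word Montgomery constant -N^(-1) mod 2^w (w = BN_BITS2), read out of mont->n0, which BN_MONT_CTX_set precomputes when the context is initialized. For an odd modulus it can also be obtained with a few Newton steps on the low word alone; the sketch below is illustrative (compute_n0_demo is a made-up name, and OpenSSL derives the same value by a different route inside BN_MONT_CTX_set).

#include <assert.h>
#include <stdint.h>

/*
 * compute_n0_demo: -np0^(-1) mod 2^32 for odd np0.  Each Newton step
 * x <- x*(2 - np0*x) doubles the number of correct low bits; an odd
 * np0 is its own inverse mod 8, so four steps reach >= 32 bits.
 */
static uint32_t compute_n0_demo(uint32_t np0)
{
    uint32_t x = np0;		/* inverse mod 2^3  */
    x *= 2 - np0 * x;		/* inverse mod 2^6  */
    x *= 2 - np0 * x;		/* inverse mod 2^12 */
    x *= 2 - np0 * x;		/* inverse mod 2^24 */
    x *= 2 - np0 * x;		/* inverse mod 2^48, so mod 2^32 too */
    return 0u - x;		/* negate: n0 = -inverse mod 2^32 */
}

int main(void)
{
    uint32_t np0 = 0xFFFFFFFBu;			/* any odd low limb */
    uint32_t n0  = compute_n0_demo(np0);
    assert((uint32_t)(np0 * n0 + 1u) == 0u);	/* np0*n0 == -1 mod 2^32 */
    return 0;
}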