Commit 034688ec authored by Andy Polyakov's avatar Andy Polyakov
Browse files

bn_gf2m.c: optimized BN_GF2m_mod_inv delivers sometimes 2x of ECDSA sign.

Exact improvement coefficients vary from one benchmark and platform to
another: e.g. it performs 70%-33% better on ARM (the gain decreasing for
longer keys) and 100%-90% better on x86_64.
parent d1676591
Loading
Loading
Loading
Loading
+80 −16
Original line number Diff line number Diff line
@@ -364,21 +364,17 @@ int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[])
/*
 * BN_GF2m_mod: compute r = a mod p in GF(2)[x], where p is the field's
 * reduction polynomial given as a BIGNUM.  Converts p to its sparse
 * exponent-array representation and delegates the work to
 * BN_GF2m_mod_arr.  Returns 1 on success, 0 on error.
 *
 * NOTE(review): this resolves a conflated diff that left both the old
 * heap-allocated array and the new fixed array in place; the intended
 * post-change version (fixed stack array, no malloc/free) is kept.
 */
int	BN_GF2m_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *p)
	{
	int ret = 0;
	/* All reduction polynomials in practical use are trinomials or
	 * pentanomials, so 6 slots (up to 5 exponents plus the -1
	 * terminator) suffice; no heap allocation is needed. */
	int arr[6];
	bn_check_top(a);
	bn_check_top(p);
	ret = BN_GF2m_poly2arr(p, arr, sizeof(arr)/sizeof(arr[0]));
	/* poly2arr returns the number of entries needed; 0 or a value
	 * larger than the array means p is not a supported polynomial.
	 * Cast avoids a signed/unsigned comparison warning (ret is a
	 * non-negative int, sizeof yields size_t). */
	if (!ret || ret > (int)(sizeof(arr)/sizeof(arr[0])))
		{
		BNerr(BN_F_BN_GF2M_MOD,BN_R_INVALID_LENGTH);
		return 0;
		}
	ret = BN_GF2m_mod_arr(r, a, arr);
	bn_check_top(r);
	return ret;
	}

@@ -533,18 +529,18 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)

	BN_CTX_start(ctx);
	
	b = BN_CTX_get(ctx);
	c = BN_CTX_get(ctx);
	u = BN_CTX_get(ctx);
	v = BN_CTX_get(ctx);
	if (v == NULL) goto err;
	if ((b = BN_CTX_get(ctx))==NULL) goto err;
	if ((c = BN_CTX_get(ctx))==NULL) goto err;
	if ((u = BN_CTX_get(ctx))==NULL) goto err;
	if ((v = BN_CTX_get(ctx))==NULL) goto err;

	if (!BN_one(b)) goto err;
	if (!BN_GF2m_mod(u, a, p)) goto err;
	if (!BN_copy(v, p)) goto err;

	if (BN_is_zero(u)) goto err;

	if (!BN_copy(v, p)) goto err;
#if 0
	if (!BN_one(b)) goto err;

	while (1)
		{
		while (!BN_is_odd(u))
@@ -568,7 +564,75 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
		if (!BN_GF2m_add(u, u, v)) goto err;
		if (!BN_GF2m_add(b, b, c)) goto err;
		}
#else
	{
	int i,	ubits = BN_num_bits(u),
		vbits = BN_num_bits(v),	/* v is copy of p */
		top = p->top;
	BN_ULONG *udp,*bdp,*vdp,*cdp;

	bn_wexpand(u,top);	udp = u->d;
				for (i=u->top;i<top;i++) udp[i] = 0;
				u->top = top;
	bn_wexpand(b,top);	bdp = b->d;
				bdp[0] = 1;
				for (i=1;i<top;i++) bdp[i] = 0;
				b->top = top;
	bn_wexpand(c,top);	cdp = c->d;
				for (i=0;i<top;i++) cdp[i] = 0;
				c->top = top;
	vdp = v->d;	/* It pays off to "cache" *->d pointers, because
			 * it allows optimizer to be more aggressive.
			 * But we don't have to "cache" p->d, because *p
			 * is declared 'const'... */
	while (1)
		{
		while (ubits && !(udp[0]&1))
			{
			BN_ULONG u0,u1,b0,b1,mask;

			u0   = udp[0];
			b0   = bdp[0];
			mask = (BN_ULONG)0-(b0&1);
			b0  ^= p->d[0]&mask;
			for (i=0;i<top-1;i++)
				{
				u1 = udp[i+1];
				udp[i] = ((u0>>1)|(u1<<(BN_BITS2-1)))&BN_MASK2;
				u0 = u1;
				b1 = bdp[i+1]^(p->d[i+1]&mask);
				bdp[i] = ((b0>>1)|(b1<<(BN_BITS2-1)))&BN_MASK2;
				b0 = b1;
				}
			udp[i] = u0>>1;
			bdp[i] = b0>>1;
			ubits--;
			}

		if (ubits<=BN_BITS2 && udp[0]==1) break;

		if (ubits<vbits)
			{
			i = ubits; ubits = vbits; vbits = i;
			tmp = u; u = v; v = tmp;
			tmp = b; b = c; c = tmp;
			udp = vdp; vdp = v->d;
			bdp = cdp; cdp = c->d;
			}
		for(i=0;i<top;i++)
			{
			udp[i] ^= vdp[i];
			bdp[i] ^= cdp[i];
			}
		if (ubits==vbits)
			{
			bn_fix_top(u);
			ubits = BN_num_bits(u);
			}
		}
	bn_fix_top(b);
	}
#endif

	if (!BN_copy(r, b)) goto err;
	bn_check_top(r);