Loading crypto/bn/bn_div.c +34 −238 Original line number Diff line number Diff line Loading @@ -169,15 +169,13 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d, #endif /* OPENSSL_NO_ASM */ /* BN_div[_no_branch] computes dv := num / divisor, rounding towards /* BN_div computes dv := num / divisor, rounding towards * zero, and sets up rm such that dv*divisor + rm = num holds. * Thus: * dv->neg == num->neg ^ divisor->neg (unless the result is zero) * rm->neg == num->neg (unless the remainder is zero) * If 'dv' or 'rm' is NULL, the respective value is not returned. */ static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, BN_CTX *ctx); int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, BN_CTX *ctx) { Loading @@ -186,6 +184,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, BN_ULONG *resp,*wnump; BN_ULONG d0,d1; int num_n,div_n; int no_branch=0; /* Invalid zero-padding would have particularly bad consequences * in the case of 'num', so don't just rely on bn_check_top() for this one Loading @@ -200,7 +199,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, if ((BN_get_flags(num, BN_FLG_CONSTTIME) != 0) || (BN_get_flags(divisor, BN_FLG_CONSTTIME) != 0)) { return BN_div_no_branch(dv, rm, num, divisor, ctx); no_branch=1; } bn_check_top(dv); Loading @@ -214,7 +213,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, return(0); } if (BN_ucmp(num,divisor) < 0) if (!no_branch && BN_ucmp(num,divisor) < 0) { if (rm != NULL) { if (BN_copy(rm,num) == NULL) return(0); } Loading @@ -239,227 +238,9 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, norm_shift+=BN_BITS2; if (!(BN_lshift(snum,num,norm_shift))) goto err; snum->neg=0; div_n=sdiv->top; num_n=snum->top; loop=num_n-div_n; /* Lets setup a 'window' into snum * This is the part that corresponds to the current * 'area' being divided */ wnum.neg = 0; wnum.d = &(snum->d[loop]); wnum.top = div_n; /* only needed when BN_ucmp messes up the values between top and max */ wnum.dmax = snum->dmax - loop; /* so we don't step out of bounds */ /* Get the top 2 words of sdiv */ /* div_n=sdiv->top; */ d0=sdiv->d[div_n-1]; d1=(div_n == 1)?0:sdiv->d[div_n-2]; /* pointer to the 'top' of snum */ wnump= &(snum->d[num_n-1]); /* Setup to 'res' */ res->neg= (num->neg^divisor->neg); if (!bn_wexpand(res,(loop+1))) goto err; res->top=loop; resp= &(res->d[loop-1]); /* space for temp */ if (!bn_wexpand(tmp,(div_n+1))) goto err; if (BN_ucmp(&wnum,sdiv) >= 0) { /* If BN_DEBUG_RAND is defined BN_ucmp changes (via * bn_pollute) the const bignum arguments => * clean the values between top and max again */ bn_clear_top2max(&wnum); bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n); *resp=1; } else res->top--; /* if res->top == 0 then clear the neg value otherwise decrease * the resp pointer */ if (res->top == 0) res->neg = 0; else resp--; for (i=0; i<loop-1; i++, wnump--, resp--) { BN_ULONG q,l0; /* the first part of the loop uses the top two words of * snum and sdiv to calculate a BN_ULONG q such that * | wnum - sdiv * q | < sdiv */ #if defined(BN_DIV3W) && !defined(OPENSSL_NO_ASM) BN_ULONG bn_div_3_words(BN_ULONG*,BN_ULONG,BN_ULONG); q=bn_div_3_words(wnump,d1,d0); #else BN_ULONG n0,n1,rem=0; n0=wnump[0]; n1=wnump[-1]; if (n0 == d0) q=BN_MASK2; else /* n0 < d0 */ { #ifdef BN_LLONG BN_ULLONG t2; #if defined(BN_LLONG) && defined(BN_DIV2W) && !defined(bn_div_words) q=(BN_ULONG)(((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0); #else q=bn_div_words(n0,n1,d0); #ifdef BN_DEBUG_LEVITTE fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\ X) -> 0x%08X\n", n0, n1, d0, q); #endif #endif #ifndef REMAINDER_IS_ALREADY_CALCULATED /* * rem doesn't have to be BN_ULLONG. The least we * know it's less that d0, isn't it? */ rem=(n1-q*d0)&BN_MASK2; #endif t2=(BN_ULLONG)d1*q; for (;;) { if (t2 <= ((((BN_ULLONG)rem)<<BN_BITS2)|wnump[-2])) break; q--; rem += d0; if (rem < d0) break; /* don't let rem overflow */ t2 -= d1; } #else /* !BN_LLONG */ BN_ULONG t2l,t2h; q=bn_div_words(n0,n1,d0); #ifdef BN_DEBUG_LEVITTE fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\ X) -> 0x%08X\n", n0, n1, d0, q); #endif #ifndef REMAINDER_IS_ALREADY_CALCULATED rem=(n1-q*d0)&BN_MASK2; #endif #if defined(BN_UMULT_LOHI) BN_UMULT_LOHI(t2l,t2h,d1,q); #elif defined(BN_UMULT_HIGH) t2l = d1 * q; t2h = BN_UMULT_HIGH(d1,q); #else if (no_branch) { BN_ULONG ql, qh; t2l=LBITS(d1); t2h=HBITS(d1); ql =LBITS(q); qh =HBITS(q); mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */ } #endif for (;;) { if ((t2h < rem) || ((t2h == rem) && (t2l <= wnump[-2]))) break; q--; rem += d0; if (rem < d0) break; /* don't let rem overflow */ if (t2l < d1) t2h--; t2l -= d1; } #endif /* !BN_LLONG */ } #endif /* !BN_DIV3W */ l0=bn_mul_words(tmp->d,sdiv->d,div_n,q); tmp->d[div_n]=l0; wnum.d--; /* ingore top values of the bignums just sub the two * BN_ULONG arrays with bn_sub_words */ if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n+1)) { /* Note: As we have considered only the leading * two BN_ULONGs in the calculation of q, sdiv * q * might be greater than wnum (but then (q-1) * sdiv * is less or equal than wnum) */ q--; if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n)) /* we can't have an overflow here (assuming * that q != 0, but if q == 0 then tmp is * zero anyway) */ (*wnump)++; } /* store part of the result */ *resp = q; } bn_correct_top(snum); if (rm != NULL) { /* Keep a copy of the neg flag in num because if rm==num * BN_rshift() will overwrite it. */ int neg = num->neg; BN_rshift(rm,snum,norm_shift); if (!BN_is_zero(rm)) rm->neg = neg; bn_check_top(rm); } BN_CTX_end(ctx); return(1); err: bn_check_top(rm); BN_CTX_end(ctx); return(0); } /* BN_div_no_branch is a special version of BN_div. It does not contain * branches that may leak sensitive information. */ static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, BN_CTX *ctx) { int norm_shift,i,loop; BIGNUM *tmp,wnum,*snum,*sdiv,*res; BN_ULONG *resp,*wnump; BN_ULONG d0,d1; int num_n,div_n; bn_check_top(dv); bn_check_top(rm); /* bn_check_top(num); */ /* 'num' has been checked in BN_div() */ bn_check_top(divisor); if (BN_is_zero(divisor)) { BNerr(BN_F_BN_DIV_NO_BRANCH,BN_R_DIV_BY_ZERO); return(0); } BN_CTX_start(ctx); tmp=BN_CTX_get(ctx); snum=BN_CTX_get(ctx); sdiv=BN_CTX_get(ctx); if (dv == NULL) res=BN_CTX_get(ctx); else res=dv; if (sdiv == NULL || res == NULL) goto err; /* First we normalise the numbers */ norm_shift=BN_BITS2-((BN_num_bits(divisor))%BN_BITS2); if (!(BN_lshift(sdiv,divisor,norm_shift))) goto err; sdiv->neg=0; norm_shift+=BN_BITS2; if (!(BN_lshift(snum,num,norm_shift))) goto err; snum->neg=0; /* Since we don't know whether snum is larger than sdiv, * we pad snum with enough zeroes without changing its * value. Loading @@ -476,6 +257,7 @@ static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, snum->d[snum->top] = 0; snum->top ++; } } div_n=sdiv->top; num_n=snum->top; Loading @@ -500,12 +282,27 @@ static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, /* Setup to 'res' */ res->neg= (num->neg^divisor->neg); if (!bn_wexpand(res,(loop+1))) goto err; res->top=loop-1; res->top=loop-no_branch; resp= &(res->d[loop-1]); /* space for temp */ if (!bn_wexpand(tmp,(div_n+1))) goto err; if (!no_branch) { if (BN_ucmp(&wnum,sdiv) >= 0) { /* If BN_DEBUG_RAND is defined BN_ucmp changes (via * bn_pollute) the const bignum arguments => * clean the values between top and max again */ bn_clear_top2max(&wnum); bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n); *resp=1; } else res->top--; } /* if res->top == 0 then clear the neg value otherwise decrease * the resp pointer */ if (res->top == 0) Loading Loading @@ -638,7 +435,7 @@ X) -> 0x%08X\n", rm->neg = neg; bn_check_top(rm); } bn_correct_top(res); if (no_branch) bn_correct_top(res); BN_CTX_end(ctx); return(1); err: Loading @@ -646,5 +443,4 @@ err: BN_CTX_end(ctx); return(0); } #endif crypto/bn/bn_exp.c +173 −67 Original line number Diff line number Diff line Loading @@ -113,6 +113,18 @@ #include "cryptlib.h" #include "bn_lcl.h" #include <stdlib.h> #ifdef _WIN32 # include <malloc.h> # ifndef alloca # define alloca _alloca # endif #elif defined(__GNUC__) # ifndef alloca # define alloca(s) __builtin_alloca((s)) # endif #endif /* maximum precomputation table size for *variable* sliding windows */ #define TABLE_SIZE 32 Loading Loading @@ -522,23 +534,17 @@ err: * as cache lines are concerned. The following functions are used to transfer a BIGNUM * from/to that table. */ static int MOD_EXP_CTIME_COPY_TO_PREBUF(BIGNUM *b, int top, unsigned char *buf, int idx, int width) static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, unsigned char *buf, int idx, int width) { size_t i, j; if (bn_wexpand(b, top) == NULL) return 0; while (b->top < top) { b->d[b->top++] = 0; } if (top > b->top) top = b->top; /* this works because 'buf' is explicitly zeroed */ for (i = 0, j=idx; i < top * sizeof b->d[0]; i++, j+=width) { buf[j] = ((unsigned char*)b->d)[i]; } bn_correct_top(b); return 1; } Loading @@ -561,7 +567,7 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf /* Given a pointer value, compute the next address that is a cache line multiple. */ #define MOD_EXP_CTIME_ALIGN(x_) \ ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((BN_ULONG)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK)))) ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK)))) /* This variant of BN_mod_exp_mont() uses fixed windows and the special * precomputation memory layout to limit data-dependency to a minimum Loading @@ -572,17 +578,15 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont) { int i,bits,ret=0,idx,window,wvalue; int i,bits,ret=0,window,wvalue; int top; BIGNUM *r; const BIGNUM *aa; BN_MONT_CTX *mont=NULL; int numPowers; unsigned char *powerbufFree=NULL; int powerbufLen = 0; unsigned char *powerbuf=NULL; BIGNUM *computeTemp=NULL, *am=NULL; BIGNUM tmp, am; bn_check_top(a); bn_check_top(p); Loading @@ -602,10 +606,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, return ret; } /* Initialize BIGNUM context and allocate intermediate result */ BN_CTX_start(ctx); r = BN_CTX_get(ctx); if (r == NULL) goto err; /* Allocate a montgomery context if it was not supplied by the caller. * If this is not done, things will break in the montgomery part. Loading @@ -620,40 +621,154 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, /* Get the window size to use with size of p. */ window = BN_window_bits_for_ctime_exponent_size(bits); #if defined(OPENSSL_BN_ASM_MONT5) if (window==6 && bits<=1024) window=5; /* ~5% improvement of 2048-bit RSA sign */ #endif /* Allocate a buffer large enough to hold all of the pre-computed * powers of a. * powers of am, am itself and tmp. */ numPowers = 1 << window; powerbufLen = sizeof(m->d[0])*top*numPowers; powerbufLen = sizeof(m->d[0])*(top*numPowers + ((2*top)>numPowers?(2*top):numPowers)); #ifdef alloca if (powerbufLen < 3072) powerbufFree = alloca(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH); else #endif if ((powerbufFree=(unsigned char*)OPENSSL_malloc(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL) goto err; powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree); memset(powerbuf, 0, powerbufLen); /* Initialize the intermediate result. Do this early to save double conversion, * once each for a^0 and intermediate result. */ if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err; if (!MOD_EXP_CTIME_COPY_TO_PREBUF(r, top, powerbuf, 0, numPowers)) goto err; #ifdef alloca if (powerbufLen < 3072) powerbufFree = NULL; #endif /* Initialize computeTemp as a^1 with montgomery precalcs */ computeTemp = BN_CTX_get(ctx); am = BN_CTX_get(ctx); if (computeTemp==NULL || am==NULL) goto err; /* lay down tmp and am right after powers table */ tmp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0])*top*numPowers); am.d = tmp.d + top; tmp.top = am.top = 0; tmp.dmax = am.dmax = top; tmp.neg = am.neg = 0; tmp.flags = am.flags = BN_FLG_STATIC_DATA; /* prepare a^0 in Montgomery domain */ #if 1 if (!BN_to_montgomery(&tmp,BN_value_one(),mont,ctx)) goto err; #else tmp.d[0] = (0-m->d[0])&BN_MASK2; /* 2^(top*BN_BITS2) - m */ for (i=1;i<top;i++) tmp.d[i] = (~m->d[i])&BN_MASK2; tmp.top = top; #endif /* prepare a^1 in Montgomery domain */ if (a->neg || BN_ucmp(a,m) >= 0) { if (!BN_mod(am,a,m,ctx)) goto err; aa= am; if (!BN_mod(&am,a,m,ctx)) goto err; if (!BN_to_montgomery(&am,&am,mont,ctx)) goto err; } else if (!BN_to_montgomery(&am,a,mont,ctx)) goto err; #if defined(OPENSSL_BN_ASM_MONT5) /* This optimization uses ideas from http://eprint.iacr.org/2011/239, * specifically optimization of cache-timing attack countermeasures * and pre-computation optimization. */ /* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as * 512-bit RSA is hardly relevant, we omit it to spare size... */ if (window==5) { void bn_mul_mont_gather5(BN_ULONG *rp,const BN_ULONG *ap, const void *table,const BN_ULONG *np, const BN_ULONG *n0,int num,int power); void bn_scatter5(const BN_ULONG *inp,size_t num, void *table,size_t power); void bn_gather5(BN_ULONG *out,size_t num, void *table,size_t power); BN_ULONG *np=mont->N.d, *n0=mont->n0; /* BN_to_montgomery can contaminate words above .top * [in BN_DEBUG[_DEBUG] build]... */ for (i=am.top; i<top; i++) am.d[i]=0; for (i=tmp.top; i<top; i++) tmp.d[i]=0; bn_scatter5(tmp.d,top,powerbuf,0); bn_scatter5(am.d,am.top,powerbuf,1); bn_mul_mont(tmp.d,am.d,am.d,np,n0,top); bn_scatter5(tmp.d,top,powerbuf,2); #if 0 for (i=3; i<32; i++) { /* Calculate a^i = a^(i-1) * a */ bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1); bn_scatter5(tmp.d,top,powerbuf,i); } #else /* same as above, but uses squaring for 1/2 of operations */ for (i=4; i<32; i*=2) { bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); bn_scatter5(tmp.d,top,powerbuf,i); } for (i=3; i<8; i+=2) { int j; bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1); bn_scatter5(tmp.d,top,powerbuf,i); for (j=2*i; j<32; j*=2) { bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); bn_scatter5(tmp.d,top,powerbuf,j); } } for (; i<16; i+=2) { bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1); bn_scatter5(tmp.d,top,powerbuf,i); bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); bn_scatter5(tmp.d,top,powerbuf,2*i); } for (; i<32; i+=2) { bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1); bn_scatter5(tmp.d,top,powerbuf,i); } #endif bits--; for (wvalue=0, i=bits%5; i>=0; i--,bits--) wvalue = (wvalue<<1)+BN_is_bit_set(p,bits); bn_gather5(tmp.d,top,powerbuf,wvalue); /* Scan the exponent one window at a time starting from the most * significant bits. */ while (bits >= 0) { for (wvalue=0, i=0; i<5; i++,bits--) wvalue = (wvalue<<1)+BN_is_bit_set(p,bits); bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); bn_mul_mont_gather5(tmp.d,tmp.d,powerbuf,np,n0,top,wvalue); } tmp.top=top; bn_correct_top(&tmp); } else aa=a; if (!BN_to_montgomery(am,aa,mont,ctx)) goto err; if (!BN_copy(computeTemp, am)) goto err; if (!MOD_EXP_CTIME_COPY_TO_PREBUF(am, top, powerbuf, 1, numPowers)) goto err; #endif { if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, numPowers)) goto err; if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, numPowers)) goto err; /* If the window size is greater than 1, then calculate * val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1) Loading @@ -662,62 +777,54 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, */ if (window > 1) { for (i=2; i<numPowers; i++) if (!BN_mod_mul_montgomery(&tmp,&am,&am,mont,ctx)) goto err; if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 2, numPowers)) goto err; for (i=3; i<numPowers; i++) { /* Calculate a^i = a^(i-1) * a */ if (!BN_mod_mul_montgomery(computeTemp,am,computeTemp,mont,ctx)) if (!BN_mod_mul_montgomery(&tmp,&am,&tmp,mont,ctx)) goto err; if (!MOD_EXP_CTIME_COPY_TO_PREBUF(computeTemp, top, powerbuf, i, numPowers)) goto err; if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, i, numPowers)) goto err; } } /* Adjust the number of bits up to a multiple of the window size. * If the exponent length is not a multiple of the window size, then * this pads the most significant bits with zeros to normalize the * scanning loop to there's no special cases. * * * NOTE: Making the window size a power of two less than the native * * word size ensures that the padded bits won't go past the last * * word in the internal BIGNUM structure. Going past the end will * * still produce the correct result, but causes a different branch * * to be taken in the BN_is_bit_set function. */ bits = ((bits+window-1)/window)*window; idx=bits-1; /* The top bit of the window */ bits--; for (wvalue=0, i=bits%window; i>=0; i--,bits--) wvalue = (wvalue<<1)+BN_is_bit_set(p,bits); if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp,top,powerbuf,wvalue,numPowers)) goto err; /* Scan the exponent one window at a time starting from the most * significant bits. */ while (idx >= 0) while (bits >= 0) { wvalue=0; /* The 'value' of the window */ /* Scan the window, squaring the result as we go */ for (i=0; i<window; i++,idx--) for (i=0; i<window; i++,bits--) { if (!BN_mod_mul_montgomery(r,r,r,mont,ctx)) goto err; wvalue = (wvalue<<1)+BN_is_bit_set(p,idx); if (!BN_mod_mul_montgomery(&tmp,&tmp,&tmp,mont,ctx)) goto err; wvalue = (wvalue<<1)+BN_is_bit_set(p,bits); } /* Fetch the appropriate pre-computed value from the pre-buf */ if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(computeTemp, top, powerbuf, wvalue, numPowers)) goto err; if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf, wvalue, numPowers)) goto err; /* Multiply the result into the intermediate result */ if (!BN_mod_mul_montgomery(r,r,computeTemp,mont,ctx)) goto err; if (!BN_mod_mul_montgomery(&tmp,&tmp,&am,mont,ctx)) goto err; } } /* Convert the final result from montgomery to standard format */ if (!BN_from_montgomery(rr,r,mont,ctx)) goto err; if (!BN_from_montgomery(rr,&tmp,mont,ctx)) goto err; ret=1; err: if ((in_mont == NULL) && (mont != NULL)) BN_MONT_CTX_free(mont); if (powerbuf!=NULL) { OPENSSL_cleanse(powerbuf,powerbufLen); OPENSSL_free(powerbufFree); if (powerbufFree) OPENSSL_free(powerbufFree); } if (am!=NULL) BN_clear(am); if (computeTemp!=NULL) BN_clear(computeTemp); BN_CTX_end(ctx); return(ret); } Loading Loading @@ -988,4 +1095,3 @@ err: bn_check_top(r); return(ret); } crypto/bn/bn_gf2m.c +89 −17 Original line number Diff line number Diff line Loading @@ -124,6 +124,7 @@ static const BN_ULONG SQR_tb[16] = SQR_tb[(w) >> 4 & 0xF] << 8 | SQR_tb[(w) & 0xF] #endif #if !defined(OPENSSL_BN_ASM_GF2m) /* Product of two polynomials a, b each with degree < BN_BITS2 - 1, * result is a polynomial r with degree < 2 * BN_BITS - 1 * The caller MUST ensure that the variables have the right amount Loading Loading @@ -218,7 +219,9 @@ static void bn_GF2m_mul_2x2(BN_ULONG *r, const BN_ULONG a1, const BN_ULONG a0, c r[2] ^= m1 ^ r[1] ^ r[3]; /* h0 ^= m1 ^ l1 ^ h1; */ r[1] = r[3] ^ r[2] ^ r[0] ^ m1 ^ m0; /* l1 ^= l0 ^ h0 ^ m0; */ } #else void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0); #endif /* Add polynomials a and b and store result in r; r could be a or b, a and b * could be equal; r is the bitwise XOR of a and b. Loading Loading @@ -362,21 +365,17 @@ int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[]) int BN_GF2m_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *p) { int ret = 0; const int max = BN_num_bits(p) + 1; int *arr=NULL; int arr[6]; bn_check_top(a); bn_check_top(p); if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err; ret = BN_GF2m_poly2arr(p, arr, max); if (!ret || ret > max) ret = BN_GF2m_poly2arr(p, arr, sizeof(arr)/sizeof(arr[0])); if (!ret || ret > (int)(sizeof(arr)/sizeof(arr[0]))) { BNerr(BN_F_BN_GF2M_MOD,BN_R_INVALID_LENGTH); goto err; return 0; } ret = BN_GF2m_mod_arr(r, a, arr); bn_check_top(r); err: if (arr) OPENSSL_free(arr); return ret; } Loading Loading @@ -531,18 +530,18 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) BN_CTX_start(ctx); b = BN_CTX_get(ctx); c = BN_CTX_get(ctx); u = BN_CTX_get(ctx); v = BN_CTX_get(ctx); if (v == NULL) goto err; if ((b = BN_CTX_get(ctx))==NULL) goto err; if ((c = BN_CTX_get(ctx))==NULL) goto err; if ((u = BN_CTX_get(ctx))==NULL) goto err; if ((v = BN_CTX_get(ctx))==NULL) goto err; if (!BN_one(b)) goto err; if (!BN_GF2m_mod(u, a, p)) goto err; if (!BN_copy(v, p)) goto err; if (BN_is_zero(u)) goto err; if (!BN_copy(v, p)) goto err; #if 0 if (!BN_one(b)) goto err; while (1) { while (!BN_is_odd(u)) Loading @@ -567,13 +566,86 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) if (!BN_GF2m_add(u, u, v)) goto err; if (!BN_GF2m_add(b, b, c)) goto err; } #else { int i, ubits = BN_num_bits(u), vbits = BN_num_bits(v), /* v is copy of p */ top = p->top; BN_ULONG *udp,*bdp,*vdp,*cdp; bn_wexpand(u,top); udp = u->d; for (i=u->top;i<top;i++) udp[i] = 0; u->top = top; bn_wexpand(b,top); bdp = b->d; bdp[0] = 1; for (i=1;i<top;i++) bdp[i] = 0; b->top = top; bn_wexpand(c,top); cdp = c->d; for (i=0;i<top;i++) cdp[i] = 0; c->top = top; vdp = v->d; /* It pays off to "cache" *->d pointers, because * it allows optimizer to be more aggressive. * But we don't have to "cache" p->d, because *p * is declared 'const'... */ while (1) { while (ubits && !(udp[0]&1)) { BN_ULONG u0,u1,b0,b1,mask; u0 = udp[0]; b0 = bdp[0]; mask = (BN_ULONG)0-(b0&1); b0 ^= p->d[0]&mask; for (i=0;i<top-1;i++) { u1 = udp[i+1]; udp[i] = ((u0>>1)|(u1<<(BN_BITS2-1)))&BN_MASK2; u0 = u1; b1 = bdp[i+1]^(p->d[i+1]&mask); bdp[i] = ((b0>>1)|(b1<<(BN_BITS2-1)))&BN_MASK2; b0 = b1; } udp[i] = u0>>1; bdp[i] = b0>>1; ubits--; } if (ubits<=BN_BITS2 && udp[0]==1) break; if (ubits<vbits) { i = ubits; ubits = vbits; vbits = i; tmp = u; u = v; v = tmp; tmp = b; b = c; c = tmp; udp = vdp; vdp = v->d; bdp = cdp; cdp = c->d; } for(i=0;i<top;i++) { udp[i] ^= vdp[i]; bdp[i] ^= cdp[i]; } if (ubits==vbits) { bn_correct_top(u); ubits = BN_num_bits(u); } } bn_correct_top(b); } #endif if (!BN_copy(r, b)) goto err; bn_check_top(r); ret = 1; err: #ifdef BN_DEBUG /* BN_CTX_end would complain about the expanded form */ bn_correct_top(c); bn_correct_top(u); bn_correct_top(v); #endif BN_CTX_end(ctx); return ret; } Loading crypto/bn/bn_lcl.h +16 −3 Original line number Diff line number Diff line Loading @@ -238,7 +238,7 @@ extern "C" { # if defined(__DECC) # include <c_asm.h> # define BN_UMULT_HIGH(a,b) (BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b)) # elif defined(__GNUC__) # elif defined(__GNUC__) && __GNUC__>=2 # define BN_UMULT_HIGH(a,b) ({ \ register BN_ULONG ret; \ asm ("umulh %1,%2,%0" \ Loading @@ -247,7 +247,7 @@ extern "C" { ret; }) # endif /* compiler */ # elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG) # if defined(__GNUC__) # if defined(__GNUC__) && __GNUC__>=2 # define BN_UMULT_HIGH(a,b) ({ \ register BN_ULONG ret; \ asm ("mulhdu %0,%1,%2" \ Loading @@ -257,7 +257,7 @@ extern "C" { # endif /* compiler */ # elif (defined(__x86_64) || defined(__x86_64__)) && \ (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT)) # if defined(__GNUC__) # if defined(__GNUC__) && __GNUC__>=2 # define BN_UMULT_HIGH(a,b) ({ \ register BN_ULONG ret,discard; \ asm ("mulq %3" \ Loading @@ -280,6 +280,19 @@ extern "C" { # define BN_UMULT_HIGH(a,b) __umulh((a),(b)) # define BN_UMULT_LOHI(low,high,a,b) ((low)=_umul128((a),(b),&(high))) # endif # elif defined(__mips) && (defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG)) # if defined(__GNUC__) && __GNUC__>=2 # define BN_UMULT_HIGH(a,b) ({ \ register BN_ULONG ret; \ asm ("dmultu %1,%2" \ : "=h"(ret) \ : "r"(a), "r"(b) : "l"); \ ret; }) # define BN_UMULT_LOHI(low,high,a,b) \ asm ("dmultu %2,%3" \ : "=l"(low),"=h"(high) \ : "r"(a), "r"(b)); # endif # endif /* cpu */ #endif /* OPENSSL_NO_ASM */ Loading crypto/bn/bn_mont.c +29 −87 File changed.Preview size limit exceeded, changes collapsed. Show changes Loading
crypto/bn/bn_div.c +34 −238 Original line number Diff line number Diff line Loading @@ -169,15 +169,13 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d, #endif /* OPENSSL_NO_ASM */ /* BN_div[_no_branch] computes dv := num / divisor, rounding towards /* BN_div computes dv := num / divisor, rounding towards * zero, and sets up rm such that dv*divisor + rm = num holds. * Thus: * dv->neg == num->neg ^ divisor->neg (unless the result is zero) * rm->neg == num->neg (unless the remainder is zero) * If 'dv' or 'rm' is NULL, the respective value is not returned. */ static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, BN_CTX *ctx); int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, BN_CTX *ctx) { Loading @@ -186,6 +184,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, BN_ULONG *resp,*wnump; BN_ULONG d0,d1; int num_n,div_n; int no_branch=0; /* Invalid zero-padding would have particularly bad consequences * in the case of 'num', so don't just rely on bn_check_top() for this one Loading @@ -200,7 +199,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, if ((BN_get_flags(num, BN_FLG_CONSTTIME) != 0) || (BN_get_flags(divisor, BN_FLG_CONSTTIME) != 0)) { return BN_div_no_branch(dv, rm, num, divisor, ctx); no_branch=1; } bn_check_top(dv); Loading @@ -214,7 +213,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, return(0); } if (BN_ucmp(num,divisor) < 0) if (!no_branch && BN_ucmp(num,divisor) < 0) { if (rm != NULL) { if (BN_copy(rm,num) == NULL) return(0); } Loading @@ -239,227 +238,9 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, norm_shift+=BN_BITS2; if (!(BN_lshift(snum,num,norm_shift))) goto err; snum->neg=0; div_n=sdiv->top; num_n=snum->top; loop=num_n-div_n; /* Lets setup a 'window' into snum * This is the part that corresponds to the current * 'area' being divided */ wnum.neg = 0; wnum.d = &(snum->d[loop]); wnum.top = div_n; /* only needed when BN_ucmp messes up the values between top and max */ wnum.dmax = snum->dmax - loop; /* so we don't step out of bounds */ /* Get the top 2 words of sdiv */ /* div_n=sdiv->top; */ d0=sdiv->d[div_n-1]; d1=(div_n == 1)?0:sdiv->d[div_n-2]; /* pointer to the 'top' of snum */ wnump= &(snum->d[num_n-1]); /* Setup to 'res' */ res->neg= (num->neg^divisor->neg); if (!bn_wexpand(res,(loop+1))) goto err; res->top=loop; resp= &(res->d[loop-1]); /* space for temp */ if (!bn_wexpand(tmp,(div_n+1))) goto err; if (BN_ucmp(&wnum,sdiv) >= 0) { /* If BN_DEBUG_RAND is defined BN_ucmp changes (via * bn_pollute) the const bignum arguments => * clean the values between top and max again */ bn_clear_top2max(&wnum); bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n); *resp=1; } else res->top--; /* if res->top == 0 then clear the neg value otherwise decrease * the resp pointer */ if (res->top == 0) res->neg = 0; else resp--; for (i=0; i<loop-1; i++, wnump--, resp--) { BN_ULONG q,l0; /* the first part of the loop uses the top two words of * snum and sdiv to calculate a BN_ULONG q such that * | wnum - sdiv * q | < sdiv */ #if defined(BN_DIV3W) && !defined(OPENSSL_NO_ASM) BN_ULONG bn_div_3_words(BN_ULONG*,BN_ULONG,BN_ULONG); q=bn_div_3_words(wnump,d1,d0); #else BN_ULONG n0,n1,rem=0; n0=wnump[0]; n1=wnump[-1]; if (n0 == d0) q=BN_MASK2; else /* n0 < d0 */ { #ifdef BN_LLONG BN_ULLONG t2; #if defined(BN_LLONG) && defined(BN_DIV2W) && !defined(bn_div_words) q=(BN_ULONG)(((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0); #else q=bn_div_words(n0,n1,d0); #ifdef BN_DEBUG_LEVITTE fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\ X) -> 0x%08X\n", n0, n1, d0, q); #endif #endif #ifndef REMAINDER_IS_ALREADY_CALCULATED /* * rem doesn't have to be BN_ULLONG. The least we * know it's less that d0, isn't it? */ rem=(n1-q*d0)&BN_MASK2; #endif t2=(BN_ULLONG)d1*q; for (;;) { if (t2 <= ((((BN_ULLONG)rem)<<BN_BITS2)|wnump[-2])) break; q--; rem += d0; if (rem < d0) break; /* don't let rem overflow */ t2 -= d1; } #else /* !BN_LLONG */ BN_ULONG t2l,t2h; q=bn_div_words(n0,n1,d0); #ifdef BN_DEBUG_LEVITTE fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\ X) -> 0x%08X\n", n0, n1, d0, q); #endif #ifndef REMAINDER_IS_ALREADY_CALCULATED rem=(n1-q*d0)&BN_MASK2; #endif #if defined(BN_UMULT_LOHI) BN_UMULT_LOHI(t2l,t2h,d1,q); #elif defined(BN_UMULT_HIGH) t2l = d1 * q; t2h = BN_UMULT_HIGH(d1,q); #else if (no_branch) { BN_ULONG ql, qh; t2l=LBITS(d1); t2h=HBITS(d1); ql =LBITS(q); qh =HBITS(q); mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */ } #endif for (;;) { if ((t2h < rem) || ((t2h == rem) && (t2l <= wnump[-2]))) break; q--; rem += d0; if (rem < d0) break; /* don't let rem overflow */ if (t2l < d1) t2h--; t2l -= d1; } #endif /* !BN_LLONG */ } #endif /* !BN_DIV3W */ l0=bn_mul_words(tmp->d,sdiv->d,div_n,q); tmp->d[div_n]=l0; wnum.d--; /* ingore top values of the bignums just sub the two * BN_ULONG arrays with bn_sub_words */ if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n+1)) { /* Note: As we have considered only the leading * two BN_ULONGs in the calculation of q, sdiv * q * might be greater than wnum (but then (q-1) * sdiv * is less or equal than wnum) */ q--; if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n)) /* we can't have an overflow here (assuming * that q != 0, but if q == 0 then tmp is * zero anyway) */ (*wnump)++; } /* store part of the result */ *resp = q; } bn_correct_top(snum); if (rm != NULL) { /* Keep a copy of the neg flag in num because if rm==num * BN_rshift() will overwrite it. */ int neg = num->neg; BN_rshift(rm,snum,norm_shift); if (!BN_is_zero(rm)) rm->neg = neg; bn_check_top(rm); } BN_CTX_end(ctx); return(1); err: bn_check_top(rm); BN_CTX_end(ctx); return(0); } /* BN_div_no_branch is a special version of BN_div. It does not contain * branches that may leak sensitive information. */ static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, BN_CTX *ctx) { int norm_shift,i,loop; BIGNUM *tmp,wnum,*snum,*sdiv,*res; BN_ULONG *resp,*wnump; BN_ULONG d0,d1; int num_n,div_n; bn_check_top(dv); bn_check_top(rm); /* bn_check_top(num); */ /* 'num' has been checked in BN_div() */ bn_check_top(divisor); if (BN_is_zero(divisor)) { BNerr(BN_F_BN_DIV_NO_BRANCH,BN_R_DIV_BY_ZERO); return(0); } BN_CTX_start(ctx); tmp=BN_CTX_get(ctx); snum=BN_CTX_get(ctx); sdiv=BN_CTX_get(ctx); if (dv == NULL) res=BN_CTX_get(ctx); else res=dv; if (sdiv == NULL || res == NULL) goto err; /* First we normalise the numbers */ norm_shift=BN_BITS2-((BN_num_bits(divisor))%BN_BITS2); if (!(BN_lshift(sdiv,divisor,norm_shift))) goto err; sdiv->neg=0; norm_shift+=BN_BITS2; if (!(BN_lshift(snum,num,norm_shift))) goto err; snum->neg=0; /* Since we don't know whether snum is larger than sdiv, * we pad snum with enough zeroes without changing its * value. Loading @@ -476,6 +257,7 @@ static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, snum->d[snum->top] = 0; snum->top ++; } } div_n=sdiv->top; num_n=snum->top; Loading @@ -500,12 +282,27 @@ static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, /* Setup to 'res' */ res->neg= (num->neg^divisor->neg); if (!bn_wexpand(res,(loop+1))) goto err; res->top=loop-1; res->top=loop-no_branch; resp= &(res->d[loop-1]); /* space for temp */ if (!bn_wexpand(tmp,(div_n+1))) goto err; if (!no_branch) { if (BN_ucmp(&wnum,sdiv) >= 0) { /* If BN_DEBUG_RAND is defined BN_ucmp changes (via * bn_pollute) the const bignum arguments => * clean the values between top and max again */ bn_clear_top2max(&wnum); bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n); *resp=1; } else res->top--; } /* if res->top == 0 then clear the neg value otherwise decrease * the resp pointer */ if (res->top == 0) Loading Loading @@ -638,7 +435,7 @@ X) -> 0x%08X\n", rm->neg = neg; bn_check_top(rm); } bn_correct_top(res); if (no_branch) bn_correct_top(res); BN_CTX_end(ctx); return(1); err: Loading @@ -646,5 +443,4 @@ err: BN_CTX_end(ctx); return(0); } #endif
crypto/bn/bn_exp.c +173 −67 Original line number Diff line number Diff line Loading @@ -113,6 +113,18 @@ #include "cryptlib.h" #include "bn_lcl.h" #include <stdlib.h> #ifdef _WIN32 # include <malloc.h> # ifndef alloca # define alloca _alloca # endif #elif defined(__GNUC__) # ifndef alloca # define alloca(s) __builtin_alloca((s)) # endif #endif /* maximum precomputation table size for *variable* sliding windows */ #define TABLE_SIZE 32 Loading Loading @@ -522,23 +534,17 @@ err: * as cache lines are concerned. The following functions are used to transfer a BIGNUM * from/to that table. */ static int MOD_EXP_CTIME_COPY_TO_PREBUF(BIGNUM *b, int top, unsigned char *buf, int idx, int width) static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, unsigned char *buf, int idx, int width) { size_t i, j; if (bn_wexpand(b, top) == NULL) return 0; while (b->top < top) { b->d[b->top++] = 0; } if (top > b->top) top = b->top; /* this works because 'buf' is explicitly zeroed */ for (i = 0, j=idx; i < top * sizeof b->d[0]; i++, j+=width) { buf[j] = ((unsigned char*)b->d)[i]; } bn_correct_top(b); return 1; } Loading @@ -561,7 +567,7 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf /* Given a pointer value, compute the next address that is a cache line multiple. */ #define MOD_EXP_CTIME_ALIGN(x_) \ ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((BN_ULONG)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK)))) ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK)))) /* This variant of BN_mod_exp_mont() uses fixed windows and the special * precomputation memory layout to limit data-dependency to a minimum Loading @@ -572,17 +578,15 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont) { int i,bits,ret=0,idx,window,wvalue; int i,bits,ret=0,window,wvalue; int top; BIGNUM *r; const BIGNUM *aa; BN_MONT_CTX *mont=NULL; int numPowers; unsigned char *powerbufFree=NULL; int powerbufLen = 0; unsigned char *powerbuf=NULL; BIGNUM *computeTemp=NULL, *am=NULL; BIGNUM tmp, am; bn_check_top(a); bn_check_top(p); Loading @@ -602,10 +606,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, return ret; } /* Initialize BIGNUM context and allocate intermediate result */ BN_CTX_start(ctx); r = BN_CTX_get(ctx); if (r == NULL) goto err; /* Allocate a montgomery context if it was not supplied by the caller. * If this is not done, things will break in the montgomery part. Loading @@ -620,40 +621,154 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, /* Get the window size to use with size of p. */ window = BN_window_bits_for_ctime_exponent_size(bits); #if defined(OPENSSL_BN_ASM_MONT5) if (window==6 && bits<=1024) window=5; /* ~5% improvement of 2048-bit RSA sign */ #endif /* Allocate a buffer large enough to hold all of the pre-computed * powers of a. * powers of am, am itself and tmp. */ numPowers = 1 << window; powerbufLen = sizeof(m->d[0])*top*numPowers; powerbufLen = sizeof(m->d[0])*(top*numPowers + ((2*top)>numPowers?(2*top):numPowers)); #ifdef alloca if (powerbufLen < 3072) powerbufFree = alloca(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH); else #endif if ((powerbufFree=(unsigned char*)OPENSSL_malloc(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL) goto err; powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree); memset(powerbuf, 0, powerbufLen); /* Initialize the intermediate result. Do this early to save double conversion, * once each for a^0 and intermediate result. */ if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err; if (!MOD_EXP_CTIME_COPY_TO_PREBUF(r, top, powerbuf, 0, numPowers)) goto err; #ifdef alloca if (powerbufLen < 3072) powerbufFree = NULL; #endif /* Initialize computeTemp as a^1 with montgomery precalcs */ computeTemp = BN_CTX_get(ctx); am = BN_CTX_get(ctx); if (computeTemp==NULL || am==NULL) goto err; /* lay down tmp and am right after powers table */ tmp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0])*top*numPowers); am.d = tmp.d + top; tmp.top = am.top = 0; tmp.dmax = am.dmax = top; tmp.neg = am.neg = 0; tmp.flags = am.flags = BN_FLG_STATIC_DATA; /* prepare a^0 in Montgomery domain */ #if 1 if (!BN_to_montgomery(&tmp,BN_value_one(),mont,ctx)) goto err; #else tmp.d[0] = (0-m->d[0])&BN_MASK2; /* 2^(top*BN_BITS2) - m */ for (i=1;i<top;i++) tmp.d[i] = (~m->d[i])&BN_MASK2; tmp.top = top; #endif /* prepare a^1 in Montgomery domain */ if (a->neg || BN_ucmp(a,m) >= 0) { if (!BN_mod(am,a,m,ctx)) goto err; aa= am; if (!BN_mod(&am,a,m,ctx)) goto err; if (!BN_to_montgomery(&am,&am,mont,ctx)) goto err; } else if (!BN_to_montgomery(&am,a,mont,ctx)) goto err; #if defined(OPENSSL_BN_ASM_MONT5) /* This optimization uses ideas from http://eprint.iacr.org/2011/239, * specifically optimization of cache-timing attack countermeasures * and pre-computation optimization. */ /* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as * 512-bit RSA is hardly relevant, we omit it to spare size... */ if (window==5) { void bn_mul_mont_gather5(BN_ULONG *rp,const BN_ULONG *ap, const void *table,const BN_ULONG *np, const BN_ULONG *n0,int num,int power); void bn_scatter5(const BN_ULONG *inp,size_t num, void *table,size_t power); void bn_gather5(BN_ULONG *out,size_t num, void *table,size_t power); BN_ULONG *np=mont->N.d, *n0=mont->n0; /* BN_to_montgomery can contaminate words above .top * [in BN_DEBUG[_DEBUG] build]... */ for (i=am.top; i<top; i++) am.d[i]=0; for (i=tmp.top; i<top; i++) tmp.d[i]=0; bn_scatter5(tmp.d,top,powerbuf,0); bn_scatter5(am.d,am.top,powerbuf,1); bn_mul_mont(tmp.d,am.d,am.d,np,n0,top); bn_scatter5(tmp.d,top,powerbuf,2); #if 0 for (i=3; i<32; i++) { /* Calculate a^i = a^(i-1) * a */ bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1); bn_scatter5(tmp.d,top,powerbuf,i); } #else /* same as above, but uses squaring for 1/2 of operations */ for (i=4; i<32; i*=2) { bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); bn_scatter5(tmp.d,top,powerbuf,i); } for (i=3; i<8; i+=2) { int j; bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1); bn_scatter5(tmp.d,top,powerbuf,i); for (j=2*i; j<32; j*=2) { bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); bn_scatter5(tmp.d,top,powerbuf,j); } } for (; i<16; i+=2) { bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1); bn_scatter5(tmp.d,top,powerbuf,i); bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); bn_scatter5(tmp.d,top,powerbuf,2*i); } for (; i<32; i+=2) { bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1); bn_scatter5(tmp.d,top,powerbuf,i); } #endif bits--; for (wvalue=0, i=bits%5; i>=0; i--,bits--) wvalue = (wvalue<<1)+BN_is_bit_set(p,bits); bn_gather5(tmp.d,top,powerbuf,wvalue); /* Scan the exponent one window at a time starting from the most * significant bits. */ while (bits >= 0) { for (wvalue=0, i=0; i<5; i++,bits--) wvalue = (wvalue<<1)+BN_is_bit_set(p,bits); bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); bn_mul_mont_gather5(tmp.d,tmp.d,powerbuf,np,n0,top,wvalue); } tmp.top=top; bn_correct_top(&tmp); } else aa=a; if (!BN_to_montgomery(am,aa,mont,ctx)) goto err; if (!BN_copy(computeTemp, am)) goto err; if (!MOD_EXP_CTIME_COPY_TO_PREBUF(am, top, powerbuf, 1, numPowers)) goto err; #endif { if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, numPowers)) goto err; if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, numPowers)) goto err; /* If the window size is greater than 1, then calculate * val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1) Loading @@ -662,62 +777,54 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, */ if (window > 1) { for (i=2; i<numPowers; i++) if (!BN_mod_mul_montgomery(&tmp,&am,&am,mont,ctx)) goto err; if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 2, numPowers)) goto err; for (i=3; i<numPowers; i++) { /* Calculate a^i = a^(i-1) * a */ if (!BN_mod_mul_montgomery(computeTemp,am,computeTemp,mont,ctx)) if (!BN_mod_mul_montgomery(&tmp,&am,&tmp,mont,ctx)) goto err; if (!MOD_EXP_CTIME_COPY_TO_PREBUF(computeTemp, top, powerbuf, i, numPowers)) goto err; if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, i, numPowers)) goto err; } } /* Adjust the number of bits up to a multiple of the window size. * If the exponent length is not a multiple of the window size, then * this pads the most significant bits with zeros to normalize the * scanning loop to there's no special cases. * * * NOTE: Making the window size a power of two less than the native * * word size ensures that the padded bits won't go past the last * * word in the internal BIGNUM structure. Going past the end will * * still produce the correct result, but causes a different branch * * to be taken in the BN_is_bit_set function. */ bits = ((bits+window-1)/window)*window; idx=bits-1; /* The top bit of the window */ bits--; for (wvalue=0, i=bits%window; i>=0; i--,bits--) wvalue = (wvalue<<1)+BN_is_bit_set(p,bits); if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp,top,powerbuf,wvalue,numPowers)) goto err; /* Scan the exponent one window at a time starting from the most * significant bits. */ while (idx >= 0) while (bits >= 0) { wvalue=0; /* The 'value' of the window */ /* Scan the window, squaring the result as we go */ for (i=0; i<window; i++,idx--) for (i=0; i<window; i++,bits--) { if (!BN_mod_mul_montgomery(r,r,r,mont,ctx)) goto err; wvalue = (wvalue<<1)+BN_is_bit_set(p,idx); if (!BN_mod_mul_montgomery(&tmp,&tmp,&tmp,mont,ctx)) goto err; wvalue = (wvalue<<1)+BN_is_bit_set(p,bits); } /* Fetch the appropriate pre-computed value from the pre-buf */ if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(computeTemp, top, powerbuf, wvalue, numPowers)) goto err; if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf, wvalue, numPowers)) goto err; /* Multiply the result into the intermediate result */ if (!BN_mod_mul_montgomery(r,r,computeTemp,mont,ctx)) goto err; if (!BN_mod_mul_montgomery(&tmp,&tmp,&am,mont,ctx)) goto err; } } /* Convert the final result from montgomery to standard format */ if (!BN_from_montgomery(rr,r,mont,ctx)) goto err; if (!BN_from_montgomery(rr,&tmp,mont,ctx)) goto err; ret=1; err: if ((in_mont == NULL) && (mont != NULL)) BN_MONT_CTX_free(mont); if (powerbuf!=NULL) { OPENSSL_cleanse(powerbuf,powerbufLen); OPENSSL_free(powerbufFree); if (powerbufFree) OPENSSL_free(powerbufFree); } if (am!=NULL) BN_clear(am); if (computeTemp!=NULL) BN_clear(computeTemp); BN_CTX_end(ctx); return(ret); } Loading Loading @@ -988,4 +1095,3 @@ err: bn_check_top(r); return(ret); }
crypto/bn/bn_gf2m.c +89 −17 Original line number Diff line number Diff line Loading @@ -124,6 +124,7 @@ static const BN_ULONG SQR_tb[16] = SQR_tb[(w) >> 4 & 0xF] << 8 | SQR_tb[(w) & 0xF] #endif #if !defined(OPENSSL_BN_ASM_GF2m) /* Product of two polynomials a, b each with degree < BN_BITS2 - 1, * result is a polynomial r with degree < 2 * BN_BITS - 1 * The caller MUST ensure that the variables have the right amount Loading Loading @@ -218,7 +219,9 @@ static void bn_GF2m_mul_2x2(BN_ULONG *r, const BN_ULONG a1, const BN_ULONG a0, c r[2] ^= m1 ^ r[1] ^ r[3]; /* h0 ^= m1 ^ l1 ^ h1; */ r[1] = r[3] ^ r[2] ^ r[0] ^ m1 ^ m0; /* l1 ^= l0 ^ h0 ^ m0; */ } #else void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0); #endif /* Add polynomials a and b and store result in r; r could be a or b, a and b * could be equal; r is the bitwise XOR of a and b. Loading Loading @@ -362,21 +365,17 @@ int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[]) int BN_GF2m_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *p) { int ret = 0; const int max = BN_num_bits(p) + 1; int *arr=NULL; int arr[6]; bn_check_top(a); bn_check_top(p); if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err; ret = BN_GF2m_poly2arr(p, arr, max); if (!ret || ret > max) ret = BN_GF2m_poly2arr(p, arr, sizeof(arr)/sizeof(arr[0])); if (!ret || ret > (int)(sizeof(arr)/sizeof(arr[0]))) { BNerr(BN_F_BN_GF2M_MOD,BN_R_INVALID_LENGTH); goto err; return 0; } ret = BN_GF2m_mod_arr(r, a, arr); bn_check_top(r); err: if (arr) OPENSSL_free(arr); return ret; } Loading Loading @@ -531,18 +530,18 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) BN_CTX_start(ctx); b = BN_CTX_get(ctx); c = BN_CTX_get(ctx); u = BN_CTX_get(ctx); v = BN_CTX_get(ctx); if (v == NULL) goto err; if ((b = BN_CTX_get(ctx))==NULL) goto err; if ((c = BN_CTX_get(ctx))==NULL) goto err; if ((u = BN_CTX_get(ctx))==NULL) goto err; if ((v = BN_CTX_get(ctx))==NULL) goto err; if (!BN_one(b)) goto err; if (!BN_GF2m_mod(u, a, p)) goto err; if (!BN_copy(v, p)) goto err; if (BN_is_zero(u)) goto err; if (!BN_copy(v, p)) goto err; #if 0 if (!BN_one(b)) goto err; while (1) { while (!BN_is_odd(u)) Loading @@ -567,13 +566,86 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) if (!BN_GF2m_add(u, u, v)) goto err; if (!BN_GF2m_add(b, b, c)) goto err; } #else { int i, ubits = BN_num_bits(u), vbits = BN_num_bits(v), /* v is copy of p */ top = p->top; BN_ULONG *udp,*bdp,*vdp,*cdp; bn_wexpand(u,top); udp = u->d; for (i=u->top;i<top;i++) udp[i] = 0; u->top = top; bn_wexpand(b,top); bdp = b->d; bdp[0] = 1; for (i=1;i<top;i++) bdp[i] = 0; b->top = top; bn_wexpand(c,top); cdp = c->d; for (i=0;i<top;i++) cdp[i] = 0; c->top = top; vdp = v->d; /* It pays off to "cache" *->d pointers, because * it allows optimizer to be more aggressive. * But we don't have to "cache" p->d, because *p * is declared 'const'... */ while (1) { while (ubits && !(udp[0]&1)) { BN_ULONG u0,u1,b0,b1,mask; u0 = udp[0]; b0 = bdp[0]; mask = (BN_ULONG)0-(b0&1); b0 ^= p->d[0]&mask; for (i=0;i<top-1;i++) { u1 = udp[i+1]; udp[i] = ((u0>>1)|(u1<<(BN_BITS2-1)))&BN_MASK2; u0 = u1; b1 = bdp[i+1]^(p->d[i+1]&mask); bdp[i] = ((b0>>1)|(b1<<(BN_BITS2-1)))&BN_MASK2; b0 = b1; } udp[i] = u0>>1; bdp[i] = b0>>1; ubits--; } if (ubits<=BN_BITS2 && udp[0]==1) break; if (ubits<vbits) { i = ubits; ubits = vbits; vbits = i; tmp = u; u = v; v = tmp; tmp = b; b = c; c = tmp; udp = vdp; vdp = v->d; bdp = cdp; cdp = c->d; } for(i=0;i<top;i++) { udp[i] ^= vdp[i]; bdp[i] ^= cdp[i]; } if (ubits==vbits) { bn_correct_top(u); ubits = BN_num_bits(u); } } bn_correct_top(b); } #endif if (!BN_copy(r, b)) goto err; bn_check_top(r); ret = 1; err: #ifdef BN_DEBUG /* BN_CTX_end would complain about the expanded form */ bn_correct_top(c); bn_correct_top(u); bn_correct_top(v); #endif BN_CTX_end(ctx); return ret; } Loading
crypto/bn/bn_lcl.h +16 −3 Original line number Diff line number Diff line Loading @@ -238,7 +238,7 @@ extern "C" { # if defined(__DECC) # include <c_asm.h> # define BN_UMULT_HIGH(a,b) (BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b)) # elif defined(__GNUC__) # elif defined(__GNUC__) && __GNUC__>=2 # define BN_UMULT_HIGH(a,b) ({ \ register BN_ULONG ret; \ asm ("umulh %1,%2,%0" \ Loading @@ -247,7 +247,7 @@ extern "C" { ret; }) # endif /* compiler */ # elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG) # if defined(__GNUC__) # if defined(__GNUC__) && __GNUC__>=2 # define BN_UMULT_HIGH(a,b) ({ \ register BN_ULONG ret; \ asm ("mulhdu %0,%1,%2" \ Loading @@ -257,7 +257,7 @@ extern "C" { # endif /* compiler */ # elif (defined(__x86_64) || defined(__x86_64__)) && \ (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT)) # if defined(__GNUC__) # if defined(__GNUC__) && __GNUC__>=2 # define BN_UMULT_HIGH(a,b) ({ \ register BN_ULONG ret,discard; \ asm ("mulq %3" \ Loading @@ -280,6 +280,19 @@ extern "C" { # define BN_UMULT_HIGH(a,b) __umulh((a),(b)) # define BN_UMULT_LOHI(low,high,a,b) ((low)=_umul128((a),(b),&(high))) # endif # elif defined(__mips) && (defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG)) # if defined(__GNUC__) && __GNUC__>=2 # define BN_UMULT_HIGH(a,b) ({ \ register BN_ULONG ret; \ asm ("dmultu %1,%2" \ : "=h"(ret) \ : "r"(a), "r"(b) : "l"); \ ret; }) # define BN_UMULT_LOHI(low,high,a,b) \ asm ("dmultu %2,%3" \ : "=l"(low),"=h"(high) \ : "r"(a), "r"(b)); # endif # endif /* cpu */ #endif /* OPENSSL_NO_ASM */ Loading
crypto/bn/bn_mont.c +29 −87 File changed.Preview size limit exceeded, changes collapsed. Show changes