crypto/bn/bn.h  +2 −0

@@ -727,6 +727,8 @@ int RAND_pseudo_bytes(unsigned char *buf,int num);
 	bn_pollute(a); \
 	}
 
+void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,BN_ULONG n0, int num);
+void bn_sqr_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *np,BN_ULONG n0, int num);
 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
 void bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num);

crypto/bn/bn_asm.c  +124 −2

@@ -820,18 +820,95 @@ void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
 	r[6]=c1;
 	r[7]=c2;
 	}
 
+#ifdef OPENSSL_BN_ASM_MONT
+/*
+ * This is essentially a reference implementation, which may or may not
+ * give a performance improvement.  E.g. on IA-32 it yields 40% faster
+ * rsa1024 private-key operations and 10% faster rsa4096 ones, while on
+ * AMD64 it improves rsa1024 sign by only 10% and *worsens* rsa4096
+ * sign by 15%.  Once again, it is a reference implementation, meant to
+ * be used as a starting point for platform-specific assembler.
+ */
+void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,BN_ULONG n0, int num)
+	{
+	BN_ULONG c0,c1,ml,*tp;
+#ifdef mul64
+	BN_ULONG mh;
+#endif
+	volatile BN_ULONG *vp;
+	int i=0,j;
+
+	vp = tp = alloca((num+2)*sizeof(BN_ULONG));
+
+	tp[num]   = bn_mul_words(tp,ap,num,bp[0]);
+	tp[num+1] = 0;
+	goto enter;
+
+	for(i=0;i<num;i++)
+		{
+		c0 = bn_mul_add_words(tp,ap,num,bp[i]);
+		c1 = (tp[num] + c0)&BN_MASK2;
+		tp[num]   = c1;
+		tp[num+1] = (c1<c0?1:0);
+	enter:
+		c1 = tp[0];
+		ml = (c1*n0)&BN_MASK2;
+		c0 = 0;
+#ifdef mul64
+		mh = HBITS(ml);
+		ml = LBITS(ml);
+		mul_add(c1,np[0],ml,mh,c0);
+#else
+		mul_add(c1,ml,np[0],c0);
+#endif
+		for(j=1;j<num;j++)
+			{
+			c1 = tp[j];
+#ifdef mul64
+			mul_add(c1,np[j],ml,mh,c0);
+#else
+			mul_add(c1,ml,np[j],c0);
+#endif
+			tp[j-1] = c1&BN_MASK2;
+			}
+		c1        = (tp[num] + c0)&BN_MASK2;
+		tp[num-1] = c1;
+		tp[num]   = tp[num+1] + (c1<c0?1:0);
+		}
+
+	if (tp[num]!=0 || tp[num-1]>=np[num-1])
+		{
+		c0 = bn_sub_words(rp,tp,np,num);
+		if (tp[num]!=0 || c0==0)
+			{
+			for(i=0;i<num+2;i++)	vp[i] = 0;
+			return;
+			}
+		}
+	for(i=0;i<num;i++)	rp[i] = tp[i],	vp[i] = 0;
+	vp[num]   = 0;
+	vp[num+1] = 0;
+	}
+
+void bn_sqr_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *np,BN_ULONG n0, int num)
+	{
+	bn_mul_mont(rp,ap,ap,np,n0,num);
+	}
+#endif	/* OPENSSL_BN_ASM_MONT */
 #else /* !BN_MUL_COMBA */
 
 /* hmm... is it faster just to do a multiply? */
 
 #undef bn_sqr_comba4
-void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
 	{
 	BN_ULONG t[8];
 	bn_sqr_normal(r,a,4,t);
 	}
 
 #undef bn_sqr_comba8
-void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
 	{
 	BN_ULONG t[16];
 	bn_sqr_normal(r,a,8,t);
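Both bn_mul_mont variants in this patch implement the same word-serial Montgomery multiplication (CIOS in Koç's taxonomy): each outer-loop round adds one partial product a*bp[i] into an accumulator, then adds the multiple of N that cancels the accumulator's low word (the multiplier is tp[0]*n0 mod 2^w, where n0 = -N^(-1) mod 2^w and w is the word width), and finally shifts the accumulator down one word. The `goto enter` above merely peels the first round, using bn_mul_words instead of accumulating into a zeroed buffer. After num rounds the accumulator holds a*b*2^(-w*num) mod N, up to one conditional subtraction of N. Below is a minimal standalone sketch with fixed 32-bit limbs; mont_mul_demo and NUM are made-up names for this illustration, and the volatile stack-wiping of the real code is omitted.

#include <stdint.h>

#define NUM 2	/* limb count; the real code takes it as a parameter */

/*
 * mont_mul_demo: rp = ap*bp*2^(-32*NUM) mod np, limbs little-endian,
 * n0 = -np[0]^(-1) mod 2^32.  Same accumulate / reduce / shift rhythm
 * as bn_mul_mont above, with uint64_t standing in for mul_add().
 */
static void mont_mul_demo(uint32_t *rp, const uint32_t *ap,
                          const uint32_t *bp, const uint32_t *np,
                          uint32_t n0)
{
    uint32_t t[NUM + 2] = {0};
    int i, j;

    for (i = 0; i < NUM; i++) {
        uint64_t v, carry = 0;

        /* t += ap * bp[i] */
        for (j = 0; j < NUM; j++) {
            v = (uint64_t)ap[j] * bp[i] + t[j] + carry;
            t[j] = (uint32_t)v;
            carry = v >> 32;
        }
        v = (uint64_t)t[NUM] + carry;
        t[NUM] = (uint32_t)v;
        t[NUM + 1] = (uint32_t)(v >> 32);

        /* t += np * m, with m chosen so the low limb of t cancels */
        uint32_t m = t[0] * n0;
        carry = 0;
        for (j = 0; j < NUM; j++) {
            v = (uint64_t)np[j] * m + t[j] + carry;
            t[j] = (uint32_t)v;
            carry = v >> 32;
        }
        v = (uint64_t)t[NUM] + carry;
        t[NUM] = (uint32_t)v;
        t[NUM + 1] += (uint32_t)(v >> 32);

        /* t is now divisible by 2^32: shift down one limb */
        for (j = 0; j <= NUM; j++)
            t[j] = t[j + 1];
        t[NUM + 1] = 0;
    }

    /* final reduction: t < 2*np here, so subtract np at most once */
    int ge = (t[NUM] != 0);
    if (!ge) {
        ge = 1;				/* t == np also gets subtracted */
        for (j = NUM - 1; j >= 0; j--) {
            if (t[j] < np[j]) { ge = 0; break; }
            if (t[j] > np[j]) break;
        }
    }
    if (ge) {
        uint64_t borrow = 0;
        for (j = 0; j < NUM; j++) {
            uint64_t d = (uint64_t)t[j] - np[j] - borrow;
            rp[j] = (uint32_t)d;
            borrow = (d >> 32) & 1;
        }
    } else {
        for (j = 0; j < NUM; j++)
            rp[j] = t[j];
    }
}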
@@ -857,4 +934,49 @@ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 	r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
 	}
 
+#ifdef OPENSSL_BN_ASM_MONT
+void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,BN_ULONG n0, int num)
+	{
+	BN_ULONG c0,c1,*tp;
+	volatile BN_ULONG *vp;
+	int i=0,j;
+
+	vp = tp = alloca((num+2)*sizeof(BN_ULONG));
+
+	for(i=0;i<=num;i++)	tp[i]=0;
+
+	for(i=0;i<num;i++)
+		{
+		c0 = bn_mul_add_words(tp,ap,num,bp[i]);
+		c1 = tp[num] + c0;
+		tp[num]   = c1;
+		tp[num+1] = (c1<c0?1:0);
+
+		c0 = bn_mul_add_words(tp,np,num,tp[0]*n0);
+		c1 = tp[num] + c0;
+		tp[num]   = c1;
+		tp[num+1] += (c1<c0?1:0);
+		for(j=0;j<=num;j++)	tp[j]=tp[j+1];
+		}
+
+	if (tp[num]!=0 || tp[num-1]>=np[num-1])
+		{
+		c0 = bn_sub_words(rp,tp,np,num);
+		if (tp[num]!=0 || c0==0)
+			{
+			for(i=0;i<num+2;i++)	vp[i] = 0;
+			return;
+			}
+		}
+	for(i=0;i<num;i++)	rp[i] = tp[i],	vp[i] = 0;
+	vp[num]   = 0;
+	vp[num+1] = 0;
+	}
+
+void bn_sqr_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *np,BN_ULONG n0, int num)
+	{
+	bn_mul_mont(rp,ap,ap,np,n0,num);
+	}
+#endif	/* OPENSSL_BN_ASM_MONT */
 #endif /* !BN_MUL_COMBA */

crypto/bn/bn_mont.c  +16 −0

@@ -74,6 +74,22 @@ int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
 	{
 	BIGNUM *tmp;
 	int ret=0;
+#ifdef OPENSSL_BN_ASM_MONT
+	int num = mont->N.top;
+
+	if (num>1 && a->top==num && b->top==num)
+		{
+		if (bn_wexpand(r,num) == NULL) return 0;
+		r->neg = a->neg^b->neg;
+		r->top = num;
+		if (a==b)
+			bn_sqr_mont(r->d,a->d,mont->N.d,mont->n0,num);
+		else
+			bn_mul_mont(r->d,a->d,b->d,mont->N.d,mont->n0,num);
+		bn_fix_top(r);
+		return 1;
+		}
+#endif
 
 	BN_CTX_start(ctx);
 	tmp = BN_CTX_get(ctx);
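The bn_mont.c hook above takes effect transparently: callers keep using the public BIGNUM API, and BN_mod_mul_montgomery dispatches to bn_mul_mont/bn_sqr_mont whenever both operands occupy exactly num = N.top words, which is the common case for values produced by BN_to_montgomery. A minimal usage sketch follows; the modulus and operands are arbitrary test values (the modulus only has to be odd), and error checks are omitted for brevity.

#include <stdio.h>
#include <openssl/bn.h>

int main(void)
{
    BN_CTX *ctx = BN_CTX_new();
    BIGNUM *a = BN_new(), *b = BN_new(), *n = BN_new(), *r = BN_new();
    BN_MONT_CTX *mont = BN_MONT_CTX_new();
    char *s;

    BN_dec2bn(&n, "100000000000000000000000000000067");	/* odd modulus */
    BN_dec2bn(&a, "12345678901234567890123456789012");
    BN_dec2bn(&b, "98765432109876543210987654321098");
    BN_MONT_CTX_set(mont, n, ctx);		/* precomputes RR and n0 */

    BN_to_montgomery(a, a, mont, ctx);		/* a*R mod n */
    BN_to_montgomery(b, b, mont, ctx);		/* b*R mod n */
    BN_mod_mul_montgomery(r, a, b, mont, ctx);	/* a*b*R mod n */
    BN_from_montgomery(r, r, mont, ctx);	/* a*b mod n */

    s = BN_bn2dec(r);
    printf("a*b mod n = %s\n", s);
    OPENSSL_free(s);

    BN_MONT_CTX_free(mont);
    BN_free(a); BN_free(b); BN_free(n); BN_free(r);
    BN_CTX_free(ctx);
    return 0;
}

Build with cc demo.c -lcrypto.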
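A closing note on the n0 argument threaded through these functions: it is the single-word Montgomery constant -N^(-1) mod 2^w (w = BN_BITS2), read out of mont->n0, which BN_MONT_CTX_set precomputes when the context is initialized. For an odd modulus it can also be obtained with a few Newton steps on the low word alone; the sketch below is illustrative (compute_n0_demo is a made-up name, and OpenSSL derives the same value by a different route inside BN_MONT_CTX_set).

#include <assert.h>
#include <stdint.h>

/*
 * compute_n0_demo: -np0^(-1) mod 2^32 for odd np0.  Each Newton step
 * x <- x*(2 - np0*x) doubles the number of correct low bits; an odd
 * np0 is its own inverse mod 8, so four steps reach >= 32 bits.
 */
static uint32_t compute_n0_demo(uint32_t np0)
{
    uint32_t x = np0;		/* inverse mod 2^3  */
    x *= 2 - np0 * x;		/* inverse mod 2^6  */
    x *= 2 - np0 * x;		/* inverse mod 2^12 */
    x *= 2 - np0 * x;		/* inverse mod 2^24 */
    x *= 2 - np0 * x;		/* inverse mod 2^48, so mod 2^32 too */
    return 0u - x;		/* negate: n0 = -inverse mod 2^32 */
}

int main(void)
{
    uint32_t np0 = 0xFFFFFFFBu;			/* any odd low limb */
    uint32_t n0  = compute_n0_demo(np0);
    assert((uint32_t)(np0 * n0 + 1u) == 0u);	/* np0*n0 == -1 mod 2^32 */
    return 0;
}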