Loading Configure +1 −1 Original line number Diff line number Diff line Loading @@ -128,7 +128,7 @@ my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o x86-gf2m.o:des-586.o crypt5 my $x86_elf_asm="$x86_asm:elf"; my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o aesni-gcm-x86_64.o:e_padlock-x86_64.o"; my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o aesni-gcm-x86_64.o:e_padlock-x86_64.o"; my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void"; my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o:des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o::::::camellia.o cmll_misc.o cmll_cbc.o cmllt4-sparcv9.o:ghash-sparcv9.o::void"; my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void"; Loading TABLE +16 −16 Original line number Diff line number Diff line Loading @@ -306,7 +306,7 @@ $sys_id = $lflags = $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -801,7 +801,7 @@ $sys_id = WIN64A $lflags = $bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN $cpuid_obj = x86_64cpuid.o $bn_obj = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -1494,7 +1494,7 @@ $sys_id = MACOSX $lflags = -Wl,-search_paths_first% $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -1659,7 +1659,7 @@ $sys_id = WIN64A $lflags = $bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN $cpuid_obj = x86_64cpuid.o $bn_obj = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -1758,7 +1758,7 @@ $sys_id = MACOSX $lflags = -Wl,-search_paths_first% $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -1824,7 +1824,7 @@ $sys_id = $lflags = $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -2022,7 +2022,7 @@ $sys_id = $lflags = -ldl $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -2550,7 +2550,7 @@ $sys_id = $lflags = -ldl $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -2748,7 +2748,7 @@ $sys_id = $lflags = -ldl $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -2814,7 +2814,7 @@ $sys_id = $lflags = -ldl $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -4464,7 +4464,7 @@ $sys_id = $lflags = -ldl $bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -4497,7 +4497,7 @@ $sys_id = $lflags = -ldl $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -4530,7 +4530,7 @@ $sys_id = $lflags = -ldl -no_cpprt $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -4728,7 +4728,7 @@ $sys_id = MINGW64 $lflags = -lws2_32 -lgdi32 -lcrypt32 $bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -5718,7 +5718,7 @@ $sys_id = $lflags = -lsocket -lnsl -ldl $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -5751,7 +5751,7 @@ $sys_id = $lflags = -lsocket -lnsl -ldl $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading crypto/bn/Makefile +4 −0 Original line number Diff line number Diff line Loading @@ -110,6 +110,10 @@ x86_64-gf2m.s: asm/x86_64-gf2m.pl $(PERL) asm/x86_64-gf2m.pl $(PERLASM_SCHEME) > $@ modexp512-x86_64.s: asm/modexp512-x86_64.pl $(PERL) asm/modexp512-x86_64.pl $(PERLASM_SCHEME) > $@ rsaz-x86_64.s: asm/rsaz-x86_64.pl $(PERL) asm/rsaz-x86_64.pl $(PERLASM_SCHEME) > $@ rsaz-avx2.s: asm/rsaz-avx2.pl $(PERL) asm/rsaz-avx2.pl $(PERLASM_SCHEME) > $@ bn-ia64.s: asm/ia64.S $(CC) $(CFLAGS) -E asm/ia64.S > $@ Loading crypto/bn/bn_exp.c +37 −0 Original line number Diff line number Diff line Loading @@ -128,6 +128,14 @@ # include <alloca.h> #endif #undef RSAZ_ENABLED #if defined(OPENSSL_BN_ASM_MONT) && \ (defined(__x86_64) || defined(__x86_64__) || \ defined(_M_AMD64) || defined(_M_X64)) # include "rsaz_exp.h" # define RSAZ_ENABLED #endif #undef SPARC_T4_MONT #if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc)) # include "sparc_arch.h" Loading Loading @@ -677,6 +685,35 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, if (!BN_MONT_CTX_set(mont,m,ctx)) goto err; } #ifdef RSAZ_ENABLED /* * If the size of the operands allow it, perform the optimized * RSAZ exponentiation. For further information see * crypto/bn/rsaz_exp.c and accompanying assembly modules. */ if ((16 == a->top) && (16 == p->top) && (BN_num_bits(m) == 1024) && rsaz_avx2_eligible()) { if (NULL == bn_wexpand(rr, 16)) goto err; RSAZ_1024_mod_exp_avx2(rr->d, a->d, p->d, m->d, mont->RR.d, mont->n0[0]); rr->top = 16; rr->neg = 0; bn_correct_top(rr); ret = 1; goto err; } else if ((8 == a->top) && (8 == p->top) && (BN_num_bits(m) == 512)) { if (NULL == bn_wexpand(rr,8)) goto err; RSAZ_512_mod_exp(rr->d, a->d, p->d, m->d, mont->n0[0], mont->RR.d); rr->top = 8; rr->neg = 0; bn_correct_top(rr); ret = 1; goto err; } #endif /* Get the window size to use with size of p. */ window = BN_window_bits_for_ctime_exponent_size(bits); #if defined(SPARC_T4_MONT) Loading crypto/bn/rsaz_exp.c 0 → 100644 +306 −0 Original line number Diff line number Diff line /****************************************************************************** * Copyright(c) 2012, Intel Corp. * Developers and authors: * Shay Gueron (1, 2), and Vlad Krasnov (1) * (1) Intel Corporation, Israel Development Center, Haifa, Israel * (2) University of Haifa, Israel ****************************************************************************** * LICENSE: * This submission to OpenSSL is to be made available under the OpenSSL * license, and only to the OpenSSL project, in order to allow integration * into the publicly distributed code. * The use of this code, or portions of this code, or concepts embedded in * this code, or modification of this code and/or algorithm(s) in it, or the * use of this code for any other purpose than stated above, requires special * licensing. ****************************************************************************** * DISCLAIMER: * THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS AND THE COPYRIGHT OWNERS * ``AS IS''. ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS OR THE COPYRIGHT * OWNERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. ******************************************************************************/ #include "rsaz_exp.h" /* * See crypto/bn/asm/rsaz-avx2.pl for further details. */ void rsaz_1024_norm2red_avx2(void *red,const void *norm); void rsaz_1024_mul_avx2(void *ret,const void *a,const void *b,const void *n,unsigned long k); void rsaz_1024_sqr_avx2(void *ret,const void *a,const void *n,unsigned long k,int cnt); void rsaz_1024_scatter5_avx2(void *tbl,const void *val,int i); void rsaz_1024_gather5_avx2(void *val,const void *tbl,int i); void rsaz_1024_red2norm_avx2(void *norm,const void *red); #if defined(__GNUC__) # define ALIGN64 __attribute__((aligned(64))) #elif defined(_MSC_VER) # define ALIGN64 __declspec(align(64)) #elif defined(__SUNPRO_C) # define ALIGN64 # pragma align 64(one,two80) #else # define ALIGN64 /* not fatal, might hurt performance a little */ #endif ALIGN64 static const unsigned long one[40] = {1,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; ALIGN64 static const unsigned long two80[40] = {0,0,1<<22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16], const BN_ULONG base_norm[16], const BN_ULONG exponent[16], const BN_ULONG m_norm[16], const BN_ULONG RR[16], BN_ULONG k0) { unsigned char storage[320*3+32*9*16+64]; /* 5.5KB */ unsigned char *p_str = storage + (64-((size_t)storage%64)); unsigned char *a_inv, *m, *result, *table_s = p_str+320*3, *R2 = table_s; /* borrow */ int index; int wvalue; if ((((size_t)p_str&4095)+320)>>12) { result = p_str; a_inv = p_str + 320; m = p_str + 320*2; /* should not cross page */ } else { m = p_str; /* should not cross page */ result = p_str + 320; a_inv = p_str + 320*2; } rsaz_1024_norm2red_avx2(m, m_norm); rsaz_1024_norm2red_avx2(a_inv, base_norm); rsaz_1024_norm2red_avx2(R2, RR); rsaz_1024_mul_avx2(R2, R2, R2, m, k0); rsaz_1024_mul_avx2(R2, R2, two80, m, k0); /* table[0] = 1 */ rsaz_1024_mul_avx2(result, R2, one, m, k0); /* table[1] = a_inv^1 */ rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0); rsaz_1024_scatter5_avx2(table_s,result,0); rsaz_1024_scatter5_avx2(table_s,a_inv,1); /* table[2] = a_inv^2 */ rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,2); #if 0 /* this is almost 2x smaller and less than 1% slower */ for (index=3; index<32; index++) { rsaz_1024_mul_avx2(result, result, a_inv, m, k0); rsaz_1024_scatter5_avx2(table_s,result,index); } #else /* table[4] = a_inv^4 */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,4); /* table[8] = a_inv^8 */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,8); /* table[16] = a_inv^16 */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,16); /* table[17] = a_inv^17 */ rsaz_1024_mul_avx2(result, result, a_inv, m, k0); rsaz_1024_scatter5_avx2(table_s,result,17); /* table[3] */ rsaz_1024_gather5_avx2(result,table_s,2); rsaz_1024_mul_avx2(result,result,a_inv,m,k0); rsaz_1024_scatter5_avx2(table_s,result,3); /* table[6] */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,6); /* table[12] */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,12); /* table[24] */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,24); /* table[25] */ rsaz_1024_mul_avx2(result, result, a_inv, m, k0); rsaz_1024_scatter5_avx2(table_s,result,25); /* table[5] */ rsaz_1024_gather5_avx2(result,table_s,4); rsaz_1024_mul_avx2(result,result,a_inv,m,k0); rsaz_1024_scatter5_avx2(table_s,result,5); /* table[10] */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,10); /* table[20] */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,20); /* table[21] */ rsaz_1024_mul_avx2(result, result, a_inv, m, k0); rsaz_1024_scatter5_avx2(table_s,result,21); /* table[7] */ rsaz_1024_gather5_avx2(result,table_s,6); rsaz_1024_mul_avx2(result,result,a_inv,m,k0); rsaz_1024_scatter5_avx2(table_s,result,7); /* table[14] */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,14); /* table[28] */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,28); /* table[29] */ rsaz_1024_mul_avx2(result, result, a_inv, m, k0); rsaz_1024_scatter5_avx2(table_s,result,29); /* table[9] */ rsaz_1024_gather5_avx2(result,table_s,8); rsaz_1024_mul_avx2(result,result,a_inv,m,k0); rsaz_1024_scatter5_avx2(table_s,result,9); /* table[18] */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,18); /* table[19] */ rsaz_1024_mul_avx2(result, result, a_inv, m, k0); rsaz_1024_scatter5_avx2(table_s,result,19); /* table[11] */ rsaz_1024_gather5_avx2(result,table_s,10); rsaz_1024_mul_avx2(result,result,a_inv,m,k0); rsaz_1024_scatter5_avx2(table_s,result,11); /* table[22] */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,22); /* table[23] */ rsaz_1024_mul_avx2(result, result, a_inv, m, k0); rsaz_1024_scatter5_avx2(table_s,result,23); /* table[13] */ rsaz_1024_gather5_avx2(result,table_s,12); rsaz_1024_mul_avx2(result,result,a_inv,m,k0); rsaz_1024_scatter5_avx2(table_s,result,13); /* table[26] */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,26); /* table[27] */ rsaz_1024_mul_avx2(result, result, a_inv, m, k0); rsaz_1024_scatter5_avx2(table_s,result,27); /* table[15] */ rsaz_1024_gather5_avx2(result,table_s,14); rsaz_1024_mul_avx2(result,result,a_inv,m,k0); rsaz_1024_scatter5_avx2(table_s,result,15); /* table[30] */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,30); /* table[31] */ rsaz_1024_mul_avx2(result, result, a_inv, m, k0); rsaz_1024_scatter5_avx2(table_s,result,31); #endif /* load first window */ p_str = (unsigned char*)exponent; wvalue = p_str[127] >> 3; rsaz_1024_gather5_avx2(result,table_s,wvalue); index = 1014; while(index > -1) { /* loop for the remaining 127 windows */ rsaz_1024_sqr_avx2(result, result, m, k0, 5); wvalue = *((unsigned short*)&p_str[index/8]); wvalue = (wvalue>> (index%8)) & 31; index-=5; rsaz_1024_gather5_avx2(a_inv,table_s,wvalue); /* borrow a_inv */ rsaz_1024_mul_avx2(result, result, a_inv, m, k0); } /* square four times */ rsaz_1024_sqr_avx2(result, result, m, k0, 4); wvalue = p_str[0] & 15; rsaz_1024_gather5_avx2(a_inv,table_s,wvalue); /* borrow a_inv */ rsaz_1024_mul_avx2(result, result, a_inv, m, k0); /* from Montgomery */ rsaz_1024_mul_avx2(result, result, one, m, k0); rsaz_1024_red2norm_avx2(result_norm, result); OPENSSL_cleanse(storage,sizeof(storage)); } /* * See crypto/bn/rsaz-x86_64.pl for further details. */ void rsaz_512_mul(void *ret,const void *a,const void *b,const void *n,unsigned long k); void rsaz_512_mul_scatter4(void *ret,const void *a,const void *n,unsigned long k,const void *tbl,unsigned int power); void rsaz_512_mul_gather4(void *ret,const void *a,const void *tbl,const void *n,unsigned long k,unsigned int power); void rsaz_512_mul_by_one(void *ret,const void *a,const void *n,unsigned long k); void rsaz_512_sqr(void *ret,const void *a,const void *n,unsigned long k,int cnt); void rsaz_512_scatter4(void *tbl, const unsigned long *val, int power); void rsaz_512_gather4(unsigned long *val, const void *tbl, int power); void RSAZ_512_mod_exp(BN_ULONG result[8], const BN_ULONG base[8], const BN_ULONG exponent[8], const BN_ULONG m[8], BN_ULONG k0, const BN_ULONG RR[8]) { unsigned char storage[16*8*8+64*2+64]; /* 1.2KB */ unsigned char *table = storage + (64-((size_t)storage%64)); unsigned long *a_inv = (unsigned long *)(table+16*8*8), *temp = (unsigned long *)(table+16*8*8+8*8); unsigned char *p_str = (unsigned char*)exponent; int index; unsigned int wvalue; /* table[0] = 1_inv */ temp[0] = 0-m[0]; temp[1] = ~m[1]; temp[2] = ~m[2]; temp[3] = ~m[3]; temp[4] = ~m[4]; temp[5] = ~m[5]; temp[6] = ~m[6]; temp[7] = ~m[7]; rsaz_512_scatter4(table, temp, 0); /* table [1] = a_inv^1 */ rsaz_512_mul(a_inv, base, RR, m, k0); rsaz_512_scatter4(table, a_inv, 1); /* table [2] = a_inv^2 */ rsaz_512_sqr(temp, a_inv, m, k0, 1); rsaz_512_scatter4(table, temp, 2); for (index=3; index<16; index++) rsaz_512_mul_scatter4(temp, a_inv, m, k0, table, index); /* load first window */ wvalue = p_str[63]; rsaz_512_gather4(temp, table, wvalue>>4); rsaz_512_sqr(temp, temp, m, k0, 4); rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue&0xf); for (index=62; index>=0; index--) { wvalue = p_str[index]; rsaz_512_sqr(temp, temp, m, k0, 4); rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue>>4); rsaz_512_sqr(temp, temp, m, k0, 4); rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue&0x0f); } /* from Montgomery */ rsaz_512_mul_by_one(result, temp, m, k0); OPENSSL_cleanse(storage,sizeof(storage)); } Loading
Configure +1 −1 Original line number Diff line number Diff line Loading @@ -128,7 +128,7 @@ my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o x86-gf2m.o:des-586.o crypt5 my $x86_elf_asm="$x86_asm:elf"; my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o aesni-gcm-x86_64.o:e_padlock-x86_64.o"; my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o aesni-gcm-x86_64.o:e_padlock-x86_64.o"; my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void"; my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o:des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o::::::camellia.o cmll_misc.o cmll_cbc.o cmllt4-sparcv9.o:ghash-sparcv9.o::void"; my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void"; Loading
TABLE +16 −16 Original line number Diff line number Diff line Loading @@ -306,7 +306,7 @@ $sys_id = $lflags = $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -801,7 +801,7 @@ $sys_id = WIN64A $lflags = $bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN $cpuid_obj = x86_64cpuid.o $bn_obj = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -1494,7 +1494,7 @@ $sys_id = MACOSX $lflags = -Wl,-search_paths_first% $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -1659,7 +1659,7 @@ $sys_id = WIN64A $lflags = $bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN $cpuid_obj = x86_64cpuid.o $bn_obj = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -1758,7 +1758,7 @@ $sys_id = MACOSX $lflags = -Wl,-search_paths_first% $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -1824,7 +1824,7 @@ $sys_id = $lflags = $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -2022,7 +2022,7 @@ $sys_id = $lflags = -ldl $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -2550,7 +2550,7 @@ $sys_id = $lflags = -ldl $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -2748,7 +2748,7 @@ $sys_id = $lflags = -ldl $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -2814,7 +2814,7 @@ $sys_id = $lflags = -ldl $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -4464,7 +4464,7 @@ $sys_id = $lflags = -ldl $bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -4497,7 +4497,7 @@ $sys_id = $lflags = -ldl $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -4530,7 +4530,7 @@ $sys_id = $lflags = -ldl -no_cpprt $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -4728,7 +4728,7 @@ $sys_id = MINGW64 $lflags = -lws2_32 -lgdi32 -lcrypt32 $bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -5718,7 +5718,7 @@ $sys_id = $lflags = -lsocket -lnsl -ldl $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading Loading @@ -5751,7 +5751,7 @@ $sys_id = $lflags = -lsocket -lnsl -ldl $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o $bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o $des_obj = $aes_obj = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o $bf_obj = Loading
crypto/bn/Makefile +4 −0 Original line number Diff line number Diff line Loading @@ -110,6 +110,10 @@ x86_64-gf2m.s: asm/x86_64-gf2m.pl $(PERL) asm/x86_64-gf2m.pl $(PERLASM_SCHEME) > $@ modexp512-x86_64.s: asm/modexp512-x86_64.pl $(PERL) asm/modexp512-x86_64.pl $(PERLASM_SCHEME) > $@ rsaz-x86_64.s: asm/rsaz-x86_64.pl $(PERL) asm/rsaz-x86_64.pl $(PERLASM_SCHEME) > $@ rsaz-avx2.s: asm/rsaz-avx2.pl $(PERL) asm/rsaz-avx2.pl $(PERLASM_SCHEME) > $@ bn-ia64.s: asm/ia64.S $(CC) $(CFLAGS) -E asm/ia64.S > $@ Loading
crypto/bn/bn_exp.c +37 −0 Original line number Diff line number Diff line Loading @@ -128,6 +128,14 @@ # include <alloca.h> #endif #undef RSAZ_ENABLED #if defined(OPENSSL_BN_ASM_MONT) && \ (defined(__x86_64) || defined(__x86_64__) || \ defined(_M_AMD64) || defined(_M_X64)) # include "rsaz_exp.h" # define RSAZ_ENABLED #endif #undef SPARC_T4_MONT #if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc)) # include "sparc_arch.h" Loading Loading @@ -677,6 +685,35 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, if (!BN_MONT_CTX_set(mont,m,ctx)) goto err; } #ifdef RSAZ_ENABLED /* * If the size of the operands allow it, perform the optimized * RSAZ exponentiation. For further information see * crypto/bn/rsaz_exp.c and accompanying assembly modules. */ if ((16 == a->top) && (16 == p->top) && (BN_num_bits(m) == 1024) && rsaz_avx2_eligible()) { if (NULL == bn_wexpand(rr, 16)) goto err; RSAZ_1024_mod_exp_avx2(rr->d, a->d, p->d, m->d, mont->RR.d, mont->n0[0]); rr->top = 16; rr->neg = 0; bn_correct_top(rr); ret = 1; goto err; } else if ((8 == a->top) && (8 == p->top) && (BN_num_bits(m) == 512)) { if (NULL == bn_wexpand(rr,8)) goto err; RSAZ_512_mod_exp(rr->d, a->d, p->d, m->d, mont->n0[0], mont->RR.d); rr->top = 8; rr->neg = 0; bn_correct_top(rr); ret = 1; goto err; } #endif /* Get the window size to use with size of p. */ window = BN_window_bits_for_ctime_exponent_size(bits); #if defined(SPARC_T4_MONT) Loading
crypto/bn/rsaz_exp.c 0 → 100644 +306 −0 Original line number Diff line number Diff line /****************************************************************************** * Copyright(c) 2012, Intel Corp. * Developers and authors: * Shay Gueron (1, 2), and Vlad Krasnov (1) * (1) Intel Corporation, Israel Development Center, Haifa, Israel * (2) University of Haifa, Israel ****************************************************************************** * LICENSE: * This submission to OpenSSL is to be made available under the OpenSSL * license, and only to the OpenSSL project, in order to allow integration * into the publicly distributed code. * The use of this code, or portions of this code, or concepts embedded in * this code, or modification of this code and/or algorithm(s) in it, or the * use of this code for any other purpose than stated above, requires special * licensing. ****************************************************************************** * DISCLAIMER: * THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS AND THE COPYRIGHT OWNERS * ``AS IS''. ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS OR THE COPYRIGHT * OWNERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. ******************************************************************************/ #include "rsaz_exp.h" /* * See crypto/bn/asm/rsaz-avx2.pl for further details. */ void rsaz_1024_norm2red_avx2(void *red,const void *norm); void rsaz_1024_mul_avx2(void *ret,const void *a,const void *b,const void *n,unsigned long k); void rsaz_1024_sqr_avx2(void *ret,const void *a,const void *n,unsigned long k,int cnt); void rsaz_1024_scatter5_avx2(void *tbl,const void *val,int i); void rsaz_1024_gather5_avx2(void *val,const void *tbl,int i); void rsaz_1024_red2norm_avx2(void *norm,const void *red); #if defined(__GNUC__) # define ALIGN64 __attribute__((aligned(64))) #elif defined(_MSC_VER) # define ALIGN64 __declspec(align(64)) #elif defined(__SUNPRO_C) # define ALIGN64 # pragma align 64(one,two80) #else # define ALIGN64 /* not fatal, might hurt performance a little */ #endif ALIGN64 static const unsigned long one[40] = {1,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; ALIGN64 static const unsigned long two80[40] = {0,0,1<<22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16], const BN_ULONG base_norm[16], const BN_ULONG exponent[16], const BN_ULONG m_norm[16], const BN_ULONG RR[16], BN_ULONG k0) { unsigned char storage[320*3+32*9*16+64]; /* 5.5KB */ unsigned char *p_str = storage + (64-((size_t)storage%64)); unsigned char *a_inv, *m, *result, *table_s = p_str+320*3, *R2 = table_s; /* borrow */ int index; int wvalue; if ((((size_t)p_str&4095)+320)>>12) { result = p_str; a_inv = p_str + 320; m = p_str + 320*2; /* should not cross page */ } else { m = p_str; /* should not cross page */ result = p_str + 320; a_inv = p_str + 320*2; } rsaz_1024_norm2red_avx2(m, m_norm); rsaz_1024_norm2red_avx2(a_inv, base_norm); rsaz_1024_norm2red_avx2(R2, RR); rsaz_1024_mul_avx2(R2, R2, R2, m, k0); rsaz_1024_mul_avx2(R2, R2, two80, m, k0); /* table[0] = 1 */ rsaz_1024_mul_avx2(result, R2, one, m, k0); /* table[1] = a_inv^1 */ rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0); rsaz_1024_scatter5_avx2(table_s,result,0); rsaz_1024_scatter5_avx2(table_s,a_inv,1); /* table[2] = a_inv^2 */ rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,2); #if 0 /* this is almost 2x smaller and less than 1% slower */ for (index=3; index<32; index++) { rsaz_1024_mul_avx2(result, result, a_inv, m, k0); rsaz_1024_scatter5_avx2(table_s,result,index); } #else /* table[4] = a_inv^4 */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,4); /* table[8] = a_inv^8 */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,8); /* table[16] = a_inv^16 */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,16); /* table[17] = a_inv^17 */ rsaz_1024_mul_avx2(result, result, a_inv, m, k0); rsaz_1024_scatter5_avx2(table_s,result,17); /* table[3] */ rsaz_1024_gather5_avx2(result,table_s,2); rsaz_1024_mul_avx2(result,result,a_inv,m,k0); rsaz_1024_scatter5_avx2(table_s,result,3); /* table[6] */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,6); /* table[12] */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,12); /* table[24] */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,24); /* table[25] */ rsaz_1024_mul_avx2(result, result, a_inv, m, k0); rsaz_1024_scatter5_avx2(table_s,result,25); /* table[5] */ rsaz_1024_gather5_avx2(result,table_s,4); rsaz_1024_mul_avx2(result,result,a_inv,m,k0); rsaz_1024_scatter5_avx2(table_s,result,5); /* table[10] */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,10); /* table[20] */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,20); /* table[21] */ rsaz_1024_mul_avx2(result, result, a_inv, m, k0); rsaz_1024_scatter5_avx2(table_s,result,21); /* table[7] */ rsaz_1024_gather5_avx2(result,table_s,6); rsaz_1024_mul_avx2(result,result,a_inv,m,k0); rsaz_1024_scatter5_avx2(table_s,result,7); /* table[14] */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,14); /* table[28] */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,28); /* table[29] */ rsaz_1024_mul_avx2(result, result, a_inv, m, k0); rsaz_1024_scatter5_avx2(table_s,result,29); /* table[9] */ rsaz_1024_gather5_avx2(result,table_s,8); rsaz_1024_mul_avx2(result,result,a_inv,m,k0); rsaz_1024_scatter5_avx2(table_s,result,9); /* table[18] */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,18); /* table[19] */ rsaz_1024_mul_avx2(result, result, a_inv, m, k0); rsaz_1024_scatter5_avx2(table_s,result,19); /* table[11] */ rsaz_1024_gather5_avx2(result,table_s,10); rsaz_1024_mul_avx2(result,result,a_inv,m,k0); rsaz_1024_scatter5_avx2(table_s,result,11); /* table[22] */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,22); /* table[23] */ rsaz_1024_mul_avx2(result, result, a_inv, m, k0); rsaz_1024_scatter5_avx2(table_s,result,23); /* table[13] */ rsaz_1024_gather5_avx2(result,table_s,12); rsaz_1024_mul_avx2(result,result,a_inv,m,k0); rsaz_1024_scatter5_avx2(table_s,result,13); /* table[26] */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,26); /* table[27] */ rsaz_1024_mul_avx2(result, result, a_inv, m, k0); rsaz_1024_scatter5_avx2(table_s,result,27); /* table[15] */ rsaz_1024_gather5_avx2(result,table_s,14); rsaz_1024_mul_avx2(result,result,a_inv,m,k0); rsaz_1024_scatter5_avx2(table_s,result,15); /* table[30] */ rsaz_1024_sqr_avx2(result, result, m, k0, 1); rsaz_1024_scatter5_avx2(table_s,result,30); /* table[31] */ rsaz_1024_mul_avx2(result, result, a_inv, m, k0); rsaz_1024_scatter5_avx2(table_s,result,31); #endif /* load first window */ p_str = (unsigned char*)exponent; wvalue = p_str[127] >> 3; rsaz_1024_gather5_avx2(result,table_s,wvalue); index = 1014; while(index > -1) { /* loop for the remaining 127 windows */ rsaz_1024_sqr_avx2(result, result, m, k0, 5); wvalue = *((unsigned short*)&p_str[index/8]); wvalue = (wvalue>> (index%8)) & 31; index-=5; rsaz_1024_gather5_avx2(a_inv,table_s,wvalue); /* borrow a_inv */ rsaz_1024_mul_avx2(result, result, a_inv, m, k0); } /* square four times */ rsaz_1024_sqr_avx2(result, result, m, k0, 4); wvalue = p_str[0] & 15; rsaz_1024_gather5_avx2(a_inv,table_s,wvalue); /* borrow a_inv */ rsaz_1024_mul_avx2(result, result, a_inv, m, k0); /* from Montgomery */ rsaz_1024_mul_avx2(result, result, one, m, k0); rsaz_1024_red2norm_avx2(result_norm, result); OPENSSL_cleanse(storage,sizeof(storage)); } /* * See crypto/bn/rsaz-x86_64.pl for further details. */ void rsaz_512_mul(void *ret,const void *a,const void *b,const void *n,unsigned long k); void rsaz_512_mul_scatter4(void *ret,const void *a,const void *n,unsigned long k,const void *tbl,unsigned int power); void rsaz_512_mul_gather4(void *ret,const void *a,const void *tbl,const void *n,unsigned long k,unsigned int power); void rsaz_512_mul_by_one(void *ret,const void *a,const void *n,unsigned long k); void rsaz_512_sqr(void *ret,const void *a,const void *n,unsigned long k,int cnt); void rsaz_512_scatter4(void *tbl, const unsigned long *val, int power); void rsaz_512_gather4(unsigned long *val, const void *tbl, int power); void RSAZ_512_mod_exp(BN_ULONG result[8], const BN_ULONG base[8], const BN_ULONG exponent[8], const BN_ULONG m[8], BN_ULONG k0, const BN_ULONG RR[8]) { unsigned char storage[16*8*8+64*2+64]; /* 1.2KB */ unsigned char *table = storage + (64-((size_t)storage%64)); unsigned long *a_inv = (unsigned long *)(table+16*8*8), *temp = (unsigned long *)(table+16*8*8+8*8); unsigned char *p_str = (unsigned char*)exponent; int index; unsigned int wvalue; /* table[0] = 1_inv */ temp[0] = 0-m[0]; temp[1] = ~m[1]; temp[2] = ~m[2]; temp[3] = ~m[3]; temp[4] = ~m[4]; temp[5] = ~m[5]; temp[6] = ~m[6]; temp[7] = ~m[7]; rsaz_512_scatter4(table, temp, 0); /* table [1] = a_inv^1 */ rsaz_512_mul(a_inv, base, RR, m, k0); rsaz_512_scatter4(table, a_inv, 1); /* table [2] = a_inv^2 */ rsaz_512_sqr(temp, a_inv, m, k0, 1); rsaz_512_scatter4(table, temp, 2); for (index=3; index<16; index++) rsaz_512_mul_scatter4(temp, a_inv, m, k0, table, index); /* load first window */ wvalue = p_str[63]; rsaz_512_gather4(temp, table, wvalue>>4); rsaz_512_sqr(temp, temp, m, k0, 4); rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue&0xf); for (index=62; index>=0; index--) { wvalue = p_str[index]; rsaz_512_sqr(temp, temp, m, k0, 4); rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue>>4); rsaz_512_sqr(temp, temp, m, k0, 4); rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue&0x0f); } /* from Montgomery */ rsaz_512_mul_by_one(result, temp, m, k0); OPENSSL_cleanse(storage,sizeof(storage)); }