Commit 68c06bf6 authored by Andy Polyakov
Browse files

Support for SPARC T4 MONT[MUL|SQR] instructions.

Submitted by: David Miller, Andy Polyakov
parent c7b7984a
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -130,7 +130,7 @@ my $x86_elf_asm="$x86_asm:elf";

my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o:e_padlock-x86_64.o";
my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void";
my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparcv9-gf2m.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o::::::camellia.o cmll_misc.o cmll_cbc.o cmllt4-sparcv9.o:ghash-sparcv9.o::void";
my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o::::::camellia.o cmll_misc.o cmll_cbc.o cmllt4-sparcv9.o:ghash-sparcv9.o::void";
my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void";
my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void";
my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::";
+42 −9
Original line number Diff line number Diff line
@@ -174,7 +174,7 @@ $sys_id =
$lflags       = 
$bn_ops       = BN_LLONG RC2_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC2 BF_PTR
$cpuid_obj    = sparcv9cap.o sparccpuid.o
$bn_obj       = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparcv9-gf2m.o
$bn_obj       = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o
$des_obj      = des_enc-sparc.o fcrypt_b.o
$aes_obj      = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o
$bf_obj       = 
@@ -1782,6 +1782,39 @@ $ranlib =
$arflags      = 
$multilib     = 

*** debug-ben-debug-64
$cc           = gcc
$cflags       = -Wall -pedantic -DPEDANTIC -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wshadow -Wformat -Werror -DCRYPTO_MDEBUG_ALL -DCRYPTO_MDEBUG_ABORT -DREF_CHECK -DOPENSSL_NO_DEPRECATED -DL_ENDIAN -DTERMIOS -g3 -O3
$unistd       = 
$thread_cflag = -pthread -D_THREAD_SAFE -D_REENTRANT
$sys_id       = 
$lflags       = 
$bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj    = x86_64cpuid.o
$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj      = 
$aes_obj      = aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
$bf_obj       = 
$md5_obj      = md5-x86_64.o
$sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
$cast_obj     = 
$rc4_obj      = rc4-x86_64.o rc4-md5-x86_64.o
$rmd160_obj   = 
$rc5_obj      = 
$wp_obj       = wp-x86_64.o
$cmll_obj     = cmll-x86_64.o cmll_misc.o
$modes_obj    = ghash-x86_64.o
$engines_obj  = e_padlock-x86_64.o
$perlasm_scheme = elf
$dso_scheme   = dlfcn
$shared_target= bsd-gcc-shared
$shared_cflag = -fPIC
$shared_ldflag = 
$shared_extension = .so.$(SHLIB_MAJOR).$(SHLIB_MINOR)
$ranlib       = 
$arflags      = 
$multilib     = 

*** debug-ben-macos
$cc           = cc
$cflags       = -Wall -pedantic -DPEDANTIC -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wshadow -Wformat -Werror -DCRYPTO_MDEBUG_ALL -DCRYPTO_MDEBUG_ABORT -DREF_CHECK -DOPENSSL_NO_DEPRECATED -DOPENSSL_NO_ASM -DBN_DEBUG -DCONF_DEBUG -DDEBUG_SAFESTACK -DDEBUG_UNUSED -DOPENSSL_THREADS -D_REENTRANT -DDSO_DLFCN -DHAVE_DLFCN_H -arch i386 -O3 -DL_ENDIAN -g3 -pipe
@@ -2616,7 +2649,7 @@ $sys_id = ULTRASPARC
$lflags       = -lsocket -lnsl -ldl
$bn_ops       = BN_LLONG RC4_CHAR RC4_CHUNK_LL DES_PTR DES_RISC1 DES_UNROLL BF_PTR
$cpuid_obj    = sparcv9cap.o sparccpuid.o
$bn_obj       = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparcv9-gf2m.o
$bn_obj       = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o
$des_obj      = des_enc-sparc.o fcrypt_b.o
$aes_obj      = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o
$bf_obj       = 
@@ -2649,7 +2682,7 @@ $sys_id = ULTRASPARC
$lflags       = -lsocket -lnsl -ldl
$bn_ops       = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR
$cpuid_obj    = sparcv9cap.o sparccpuid.o
$bn_obj       = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparcv9-gf2m.o
$bn_obj       = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o
$des_obj      = des_enc-sparc.o fcrypt_b.o
$aes_obj      = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o
$bf_obj       = 
@@ -4398,7 +4431,7 @@ $sys_id = ULTRASPARC
$lflags       = -ldl
$bn_ops       = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR
$cpuid_obj    = sparcv9cap.o sparccpuid.o
$bn_obj       = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparcv9-gf2m.o
$bn_obj       = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o
$des_obj      = des_enc-sparc.o fcrypt_b.o
$aes_obj      = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o
$bf_obj       = 
@@ -4596,7 +4629,7 @@ $sys_id = ULTRASPARC
$lflags       = -ldl
$bn_ops       = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR
$cpuid_obj    = sparcv9cap.o sparccpuid.o
$bn_obj       = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparcv9-gf2m.o
$bn_obj       = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o
$des_obj      = des_enc-sparc.o fcrypt_b.o
$aes_obj      = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o
$bf_obj       = 
@@ -5454,7 +5487,7 @@ $sys_id = ULTRASPARC
$lflags       = -lsocket -lnsl -ldl
$bn_ops       = BN_LLONG RC4_CHAR RC4_CHUNK_LL DES_PTR DES_RISC1 DES_UNROLL BF_PTR
$cpuid_obj    = sparcv9cap.o sparccpuid.o
$bn_obj       = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparcv9-gf2m.o
$bn_obj       = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o
$des_obj      = des_enc-sparc.o fcrypt_b.o
$aes_obj      = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o
$bf_obj       = 
@@ -5487,7 +5520,7 @@ $sys_id = ULTRASPARC
$lflags       = -lsocket -lnsl -ldl
$bn_ops       = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR
$cpuid_obj    = sparcv9cap.o sparccpuid.o
$bn_obj       = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparcv9-gf2m.o
$bn_obj       = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o
$des_obj      = des_enc-sparc.o fcrypt_b.o
$aes_obj      = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o
$bf_obj       = 
@@ -5586,7 +5619,7 @@ $sys_id = ULTRASPARC
$lflags       = -lsocket -lnsl -ldl
$bn_ops       = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR
$cpuid_obj    = sparcv9cap.o sparccpuid.o
$bn_obj       = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparcv9-gf2m.o
$bn_obj       = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o
$des_obj      = des_enc-sparc.o fcrypt_b.o
$aes_obj      = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o
$bf_obj       = 
@@ -5619,7 +5652,7 @@ $sys_id = ULTRASPARC
$lflags       = -lsocket -lnsl -ldl
$bn_ops       = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR
$cpuid_obj    = sparcv9cap.o sparccpuid.o
$bn_obj       = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparcv9-gf2m.o
$bn_obj       = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o
$des_obj      = des_enc-sparc.o fcrypt_b.o
$aes_obj      = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o
$bf_obj       = 
+2 −0
Original line number Diff line number Diff line
@@ -79,6 +79,8 @@ sparcv9-mont.s: asm/sparcv9-mont.pl
	$(PERL) asm/sparcv9-mont.pl $(CFLAGS) > $@
vis3-mont.s:		asm/vis3-mont.pl
	$(PERL) asm/vis3-mont.pl $(CFLAGS) > $@
sparct4-mont.S:	asm/sparct4-mont.pl
	$(PERL) asm/sparct4-mont.pl $(CFLAGS) > $@
sparcv9-gf2m.S:	asm/sparcv9-gf2m.pl
	$(PERL) asm/sparcv9-gf2m.pl $(CFLAGS) > $@

+1201 −0

File added.

Preview size limit exceeded, changes collapsed.

+105 −0
Original line number Diff line number Diff line
@@ -126,6 +126,11 @@
# endif
#endif

#if defined(OPENSSL_BN_ASM_MONT) && defined(__sparc__)
# include "sparc_arch.h"
extern unsigned int OPENSSL_sparcv9cap_P[];
#endif

/* maximum precomputation table size for *variable* sliding windows */
#define TABLE_SIZE	32

@@ -588,6 +593,9 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
	int powerbufLen = 0;
	unsigned char *powerbuf=NULL;
	BIGNUM tmp, am;
#if defined(OPENSSL_BN_ASM_MONT) && defined(__sparc__)
	unsigned int t4=0;
#endif

	bn_check_top(a);
	bn_check_top(p);
@@ -622,9 +630,18 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,

	/* Get the window size to use with size of p. */
	window = BN_window_bits_for_ctime_exponent_size(bits);
#if defined(OPENSSL_BN_ASM_MONT) && defined(__sparc__)
	/* Select the SPARC T4 path only when all of the following hold:
	 *  - the computed window is at least 5 bits;
	 *  - top is a multiple of 16 and no larger than 64;
	 *  - both CFR_MONTMUL and CFR_MONTSQR are set in
	 *    OPENSSL_sparcv9cap_P[1];
	 *  - OPENSSL_sparcv9cap_P[0] is non-zero (its value is latched
	 *    into t4, which later gates the T4 code path).
	 * In that case the window is forced to 5. */
	if (window>=5 && (top&15)==0 && top<=64 &&
	    (OPENSSL_sparcv9cap_P[1]&(CFR_MONTMUL|CFR_MONTSQR))==
	    			     (CFR_MONTMUL|CFR_MONTSQR) &&
	    (t4=OPENSSL_sparcv9cap_P[0]))
		window=5;
	else
#endif
#if defined(OPENSSL_BN_ASM_MONT5)
	if (window==6 && bits<=1024) window=5;	/* ~5% improvement of 2048-bit RSA sign */
#endif
	/* Empty statement: gives the dangling "else" above something to
	 * bind to when OPENSSL_BN_ASM_MONT5 is not defined. */
	(void)0;

	/* Allocate a buffer large enough to hold all of the pre-computed
	 * powers of am, am itself and tmp.
@@ -674,6 +691,94 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
		}
	else	if (!BN_to_montgomery(&am,a,mont,ctx))		goto err;

#if defined(OPENSSL_BN_ASM_MONT) && defined(__sparc__)
    /* SPARC T4 path: fixed 5-bit window exponentiation driven by the
     * *_t4 routines declared below. Taken only when t4 is non-zero;
     * otherwise control falls to the "else" at the end of this region. */
    if (t4)
	{
	/* Block-scope declarations of the external *_t4 routines
	 * (presumably provided by the sparct4-mont assembly module --
	 * confirm against the build). */
	typedef int (*bn_pwr5_mont_f)(BN_ULONG *tp,const BN_ULONG *np,
			const BN_ULONG *n0,const void *table,int power);
	int bn_pwr5_mont_t4_8(BN_ULONG *tp,const BN_ULONG *np,
			const BN_ULONG *n0,const void *table,int power);
	int bn_pwr5_mont_t4_16(BN_ULONG *tp,const BN_ULONG *np,
			const BN_ULONG *n0,const void *table,int power);
	int bn_pwr5_mont_t4_24(BN_ULONG *tp,const BN_ULONG *np,
			const BN_ULONG *n0,const void *table,int power);
	int bn_pwr5_mont_t4_32(BN_ULONG *tp,const BN_ULONG *np,
			const BN_ULONG *n0,const void *table,int power);
	static const bn_pwr5_mont_f funcs[4] = {
			bn_pwr5_mont_t4_8,	bn_pwr5_mont_t4_16,
			bn_pwr5_mont_t4_24,	bn_pwr5_mont_t4_32 };
	/* Pick the worker by operand size: top/16-1 indexes funcs[0..3].
	 * NOTE(review): this relies on top being a non-zero multiple of
	 * 16 not exceeding 64, as checked where t4 is set; the _8/_16/
	 * _24/_32 suffixes appear to be the length in 64-bit words. */
	bn_pwr5_mont_f worker = funcs[top/16-1];

	void bn_mul_mont_t4(BN_ULONG *rp,const BN_ULONG *ap,
			const void *bp,const BN_ULONG *np,
			const BN_ULONG *n0,int num);
	void bn_mul_mont_gather5_t4(BN_ULONG *rp,const BN_ULONG *ap,
			const void *table,const BN_ULONG *np,
			const BN_ULONG *n0,int num,int power);
	void bn_scatter5_t4(const BN_ULONG *inp,size_t num,
			void *table,size_t power);
	void bn_gather5_t4(BN_ULONG *out,size_t num,
			void *table,size_t power);
	void bn_flip_t4(BN_ULONG *dst,BN_ULONG *src,size_t num);

	/* np: stack (alloca) scratch of top words that receives the
	 * converted modulus; it is wiped with OPENSSL_cleanse below.
	 * n0 points at mont->n0. */
	BN_ULONG *np=alloca(top*sizeof(BN_ULONG)), *n0=mont->n0;

	/* BN_to_montgomery can contaminate words above .top
	 * [in BN_DEBUG[_DEBUG] build]... */
	/* ...so zero the unused high words of am and tmp up to top. */
	for (i=am.top; i<top; i++)	am.d[i]=0;
	for (i=tmp.top; i<top; i++)	tmp.d[i]=0;

	/* switch to 64-bit domain */ 
	/* top now counts half as many units; bn_flip_t4 presumably
	 * rearranges the 32-bit words into the 64-bit layout the T4
	 * routines expect -- confirm against the assembly. */
	top /= 2;
	bn_flip_t4(np,mont->N.d,top);
	bn_flip_t4(tmp.d,tmp.d,top);
	bn_flip_t4(am.d,am.d,top);

	/* Fill table slots 0..2 in powerbuf: tmp as held on entry,
	 * am, and am*am (computed into tmp). */
	bn_scatter5_t4(tmp.d,top,powerbuf,0);
	bn_scatter5_t4(am.d,top,powerbuf,1);
	bn_mul_mont_t4(tmp.d,am.d,am.d,np,n0,top);
	bn_scatter5_t4(tmp.d,top,powerbuf,2);

	/* Fill the remaining slots 3..31, giving 32 table entries. */
	for (i=3; i<32; i++)
		{
		/* Calculate a^i = a^(i-1) * a */
		bn_mul_mont_gather5_t4(tmp.d,am.d,powerbuf,np,n0,top,i-1);
		bn_scatter5_t4(tmp.d,top,powerbuf,i);
		}

	/* Consume the leading (bits%5)+1 exponent bits, counted after
	 * the decrement, so the bits that remain form whole 5-bit
	 * windows; load the matching table entry into tmp as the
	 * initial accumulator. */
	bits--;
	for (wvalue=0, i=bits%5; i>=0; i--,bits--)
		wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
	bn_gather5_t4(tmp.d,top,powerbuf,wvalue);

	/* Scan the exponent one window at a time starting from the most
	 * significant bits.
	 */
	while (bits >= 0)
		{
		/* Collect the next 5 exponent bits into wvalue. */
		for (wvalue=0, i=0; i<5; i++,bits--)
			wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);

		/* A non-zero return from the worker ends this iteration
		 * (presumably meaning the combined 5 squarings plus table
		 * multiply completed -- confirm against the assembly). */
		if ((*worker)(tmp.d,np,n0,powerbuf,wvalue)) continue;
		/* retry once and fall back */
		if ((*worker)(tmp.d,np,n0,powerbuf,wvalue)) continue;
		/* Fallback: five squarings of tmp followed by a multiply
		 * with table entry wvalue. */
		bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
		bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
		bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
		bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
		bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
		bn_mul_mont_gather5_t4(tmp.d,tmp.d,powerbuf,np,n0,top,wvalue);
		}

	/* Convert the result back and restore top to its original
	 * value before using it as tmp.top and as the cleanse length. */
	bn_flip_t4(tmp.d,tmp.d,top);
	top *= 2;
	/* back to 32-bit domain */
	tmp.top=top;
	bn_correct_top(&tmp);
	/* Wipe the whole alloca'd modulus copy (top is back to the
	 * value used for the allocation). */
	OPENSSL_cleanse(np,top*sizeof(BN_ULONG));
	}
    else
#endif
#if defined(OPENSSL_BN_ASM_MONT5)
    /* This optimization uses ideas from http://eprint.iacr.org/2011/239,
     * specifically optimization of cache-timing attack countermeasures
Loading