x86_64 performance patch. (2f98abbc) · Commits · CYBER - Cyber Security / TS 103 523 MSP / TLMSP / TLMSP OpenSSL

Configure

+1 −1

Original line number	Diff line number	Diff line
		@@ -391,7 +391,7 @@ my %table=(
		"linux-s390", "gcc:-DB_ENDIAN -DTERMIO -DNO_ASM -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG::::::::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
		"linux-s390x", "gcc:-DB_ENDIAN -DTERMIO -DNO_ASM -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG::::::::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
		"linux-ia64", "gcc:-DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR:asm/ia64.o:::::::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
		"linux-x86_64", "gcc:-DL_ENDIAN -DNO_ASM ::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG::::::::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
		"linux-x86_64", "gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall -DMD32_REG_T=int::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR BF_PTR2 DES_INT DES_UNROLL:asm/x86_64-gcc.o:::::::::dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
		"NetBSD-sparc", "gcc:-DTERMIOS -O3 -fomit-frame-pointer -mv8 -Wall -DB_ENDIAN::(unknown):::BN_LLONG MD2_CHAR RC4_INDEX DES_UNROLL::::::::::dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
		"NetBSD-m68", "gcc:-DTERMIOS -O3 -fomit-frame-pointer -Wall -DB_ENDIAN::(unknown):::BN_LLONG MD2_CHAR RC4_INDEX DES_UNROLL::::::::::dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
		"NetBSD-x86", "gcc:-DTERMIOS -O3 -fomit-frame-pointer -m486 -Wall::(unknown):::BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}::::::::::dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",

crypto/bn/Makefile.ssl

+2 −0

Original line number	Diff line number	Diff line
		@@ -138,6 +138,8 @@ asm/ia64-cpp.o: asm/ia64.S
		$(CC) $(ASFLAGS) -c -o asm/ia64-cpp.o /tmp/ia64.$$$$.s; \
		rm -f /tmp/ia64.$$$$.s

		asm/x86_64-gcc.o: asm/x86_64-gcc.c

		files:
		$(PERL) $(TOP)/util/files.pl Makefile.ssl >> $(TOP)/MINFO

crypto/bn/asm/x86_64-gcc.c

0 → 100644

+575 −0

Original line number	Diff line number	Diff line
		/*
		* x86_64 BIGNUM accelerator version 0.1, December 2002.
		*
		* Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
		* project.
		*
		* Rights for redistribution and usage in source and binary forms are
		* granted according to the OpenSSL license. Warranty of any kind is
		* disclaimed.
		*
		* Q. Version 0.1? It doesn't sound like Andy, he used to assign real
		* versions, like 1.0...
		* A. Well, that's because this code is basically a quick-n-dirty
		* proof-of-concept hack. As you can see it's implemented with
		* inline assembler, which means that you're bound to GCC and that
		* there must be a room for fine-tuning.
		*
		* Q. Why inline assembler?
		* A. x86_64 features own ABI I'm not familiar with. Which is why
		* I decided to let the compiler take care of subroutine
		* prologue/epilogue as well as register allocation.
		*
		* Q. How much faster does it get?
		* A. Unfortunately people sitting on x86_64 hardware are prohibited
		* to disclose the performance numbers, so they (SuSE labs to be
		* specific) wouldn't tell me. However! Very similar coding technique
		* (reaching out for 128-bit result from 64x64-bit multiplication)
		* results in >3 times performance improvement on MIPS and I see no
		* reason why gain on x86_64 would be so much different:-)
		*/

		#define BN_ULONG unsigned long

		/*
		 * mul_add(r,a,word,carry): r += a*word + carry, with the high word
		 * of the result left in carry.
		 *
		 * 1. mulq places the 128-bit product a*word in high:low (the "d"
		 *    and "a" register constraints).
		 * 2. low is added to carry; the resulting carry flag is folded into
		 *    high by the "adcq 0".
		 * 3. carry (now the low word of the sum) is added to r in memory;
		 *    the carry flag is folded into high again.
		 * 4. high becomes the outgoing carry.
		 *
		 * "m"(a), "+m"(r) is the way to favor DirectPath code, so both r and
		 * a must be lvalues; "g"(0) lets the compiler decide where it wants
		 * to keep the value of zero.
		 */
		#define mul_add(r,a,word,carry) do { \
		register BN_ULONG high,low; \
		asm ("mulq %3" \
		: "=a"(low),"=d"(high) \
		: "a"(word),"m"(a) \
		: "cc"); \
		asm ("addq %2,%0; adcq %3,%1" \
		: "+r"(carry),"+d"(high)\
		: "a"(low),"g"(0) \
		: "cc"); \
		asm ("addq %2,%0; adcq %3,%1" \
		: "+m"(r),"+d"(high) \
		: "r"(carry),"g"(0) \
		: "cc"); \
		carry=high; \
		} while (0)

		/*
		 * mul(r,a,word,carry): r = low word of (a*word + carry), and carry is
		 * replaced by the high word.
		 *
		 * mulq places the product in high:low; low is then added to carry and
		 * the carry flag folded into high. The sum held in carry is stored to
		 * r, and high becomes the outgoing carry.
		 *
		 * NOTE(review): the multiplicand uses the "g" constraint, which also
		 * admits an immediate; every use in this file passes an array element.
		 */
		#define mul(r,a,word,carry) do { \
		register BN_ULONG high,low; \
		asm ("mulq %3" \
		: "=a"(low),"=d"(high) \
		: "a"(word),"g"(a) \
		: "cc"); \
		asm ("addq %2,%0; adcq %3,%1" \
		: "+r"(carry),"+d"(high)\
		: "a"(low),"g"(0) \
		: "cc"); \
		(r)=carry, carry=high; \
		} while (0)

		/*
		 * sqr(r0,r1,a): r1:r0 = a*a. a is loaded into the "a" register and
		 * "mulq %2" multiplies that register by itself; the low word lands in
		 * r0 and the high word in r1.
		 *
		 * The expansion already ends in ';', so it is a complete statement
		 * and is not wrapped in do { } while (0).
		 */
		#define sqr(r0,r1,a) \
		asm ("mulq %2" \
		: "=a"(r0),"=d"(r1) \
		: "a"(a) \
		: "cc");

		/*
		 * rp[i] += ap[i]*w for i in [0,num), propagating the carry from word
		 * to word.
		 *
		 * rp and ap are word arrays of at least num elements (both are
		 * subscripted below, so they must be pointers).
		 * Returns the carry out of the last word, or 0 when num <= 0.
		 */
		BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
		{
			BN_ULONG c1=0;

			if (num <= 0) return(c1);

			/* main loop, four words per iteration */
			while (num&~3)
				{
				mul_add(rp[0],ap[0],w,c1);
				mul_add(rp[1],ap[1],w,c1);
				mul_add(rp[2],ap[2],w,c1);
				mul_add(rp[3],ap[3],w,c1);
				ap+=4; rp+=4; num-=4;
				}
			/* tail: one to three remaining words */
			if (num)
				{
				mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
				mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
				mul_add(rp[2],ap[2],w,c1); return c1;
				}

			return(c1);
		}

		/*
		 * rp[i] = low word of (ap[i]*w + carry) for i in [0,num), where the
		 * carry is the high word left over from the previous element.
		 *
		 * rp and ap are word arrays of at least num elements (both are
		 * subscripted below, so they must be pointers).
		 * Returns the carry out of the last word, or 0 when num <= 0.
		 */
		BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
		{
			BN_ULONG c1=0;

			if (num <= 0) return(c1);

			/* main loop, four words per iteration */
			while (num&~3)
				{
				mul(rp[0],ap[0],w,c1);
				mul(rp[1],ap[1],w,c1);
				mul(rp[2],ap[2],w,c1);
				mul(rp[3],ap[3],w,c1);
				ap+=4; rp+=4; num-=4;
				}
			/* tail: one to three remaining words */
			if (num)
				{
				mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
				mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
				mul(rp[2],ap[2],w,c1);
				}
			return(c1);
		}

		/*
		 * For each i in [0,n): r[2*i] and r[2*i+1] receive the low and high
		 * words of a[i]*a[i].
		 *
		 * a holds n words and r holds 2*n words (both are subscripted below,
		 * so they must be pointers). Does nothing when n <= 0.
		 */
		void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
		{
			if (n <= 0) return;

			/* main loop, four input words per iteration */
			while (n&~3)
				{
				sqr(r[0],r[1],a[0]);
				sqr(r[2],r[3],a[1]);
				sqr(r[4],r[5],a[2]);
				sqr(r[6],r[7],a[3]);
				a+=4; r+=8; n-=4;
				}
			/* tail: one to three remaining input words */
			if (n)
				{
				sqr(r[0],r[1],a[0]); if (--n == 0) return;
				sqr(r[2],r[3],a[1]); if (--n == 0) return;
				sqr(r[4],r[5],a[2]);
				}
		}

		/*
		 * Divide the 128-bit value h:l by d and return the 64-bit quotient;
		 * the remainder is computed by the instruction but discarded.
		 *
		 * Operand numbering: %0=ret(rax) %1=waste(rdx) %2=l(rax) %3=h(rdx)
		 * %4=d. The divisor is therefore %4; "%3" would name rdx, i.e. the
		 * high half of the dividend itself.
		 *
		 * The divisor is constrained to "rm" because divq has no immediate
		 * form. divq raises a divide error when d is zero or when the
		 * quotient does not fit in 64 bits (h >= d); callers must avoid both.
		 */
		BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
		{
			BN_ULONG ret,waste;

			__asm__ ("divq %4"
				: "=a"(ret),"=d"(waste)
				: "a"(l),"d"(h),"rm"(d)
				: "cc");

			return ret;
		}

		/*
		 * rp[i] = ap[i] + bp[i] + carry for i in [0,n).
		 * Returns the final carry (0 or 1), or 0 when n <= 0.
		 *
		 * Operands: %0=ret (scratch word, then carry mask), %1=cnt (rcx, loop
		 * counter), %2=i (word index), %3=rp, %4=ap, %5=bp.
		 *
		 * - "subq %2,%2" zeroes the index and clears the carry flag; leaq and
		 *   loop leave the flags alone, so the carry chain survives from one
		 *   iteration to the next.
		 * - ret and i are write-only, early-clobber outputs: they are written
		 *   before the inputs are consumed and need no C-level initial value.
		 * - "loop" decrements the full 64-bit rcx, so the count is widened
		 *   into a BN_ULONG rather than passed as a 32-bit int.
		 * - the asm stores through rp and loads through ap/bp, hence the
		 *   "memory" clobber and volatile.
		 */
		BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
		{
			BN_ULONG ret,i,cnt;

			if (n <= 0) return 0;

			cnt=(BN_ULONG)n;

			__asm__ __volatile__ (
			" subq %2,%2 \n"
			".align 16 \n"
			"1: movq (%4,%2,8),%0 \n"
			" adcq (%5,%2,8),%0 \n"
			" movq %0,(%3,%2,8) \n"
			" leaq 1(%2),%2 \n"
			" loop 1b \n"
			" sbbq %0,%0 \n"
				: "=&a"(ret),"+c"(cnt),"=&r"(i)
				: "r"(rp),"r"(ap),"r"(bp)
				: "cc","memory"
			);

			/* sbbq left 0 or all-ones in ret; reduce to 0/1 */
			return ret&1;
		}

		#ifndef SIMICS
		/*
		 * rp[i] = ap[i] - bp[i] - borrow for i in [0,n).
		 * Returns the final borrow (0 or 1), or 0 when n <= 0.
		 *
		 * Operands: %0=ret (scratch word, then borrow mask), %1=cnt (rcx,
		 * loop counter), %2=i (word index), %3=rp, %4=ap, %5=bp.
		 *
		 * - "subq %2,%2" zeroes the index and clears the carry flag; leaq and
		 *   loop leave the flags alone, so the borrow chain survives from one
		 *   iteration to the next.
		 * - ret and i are write-only, early-clobber outputs: they are written
		 *   before the inputs are consumed and need no C-level initial value.
		 * - "loop" decrements the full 64-bit rcx, so the count is widened
		 *   into a BN_ULONG rather than passed as a 32-bit int.
		 * - the asm stores through rp and loads through ap/bp, hence the
		 *   "memory" clobber and volatile.
		 */
		BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
		{
			BN_ULONG ret,i,cnt;

			if (n <= 0) return 0;

			cnt=(BN_ULONG)n;

			__asm__ __volatile__ (
			" subq %2,%2 \n"
			".align 16 \n"
			"1: movq (%4,%2,8),%0 \n"
			" sbbq (%5,%2,8),%0 \n"
			" movq %0,(%3,%2,8) \n"
			" leaq 1(%2),%2 \n"
			" loop 1b \n"
			" sbbq %0,%0 \n"
				: "=&a"(ret),"+c"(cnt),"=&r"(i)
				: "r"(rp),"r"(ap),"r"(bp)
				: "cc","memory"
			);

			/* sbbq left 0 or all-ones in ret; reduce to 0/1 */
			return ret&1;
		}
		#else
		/* Simics 1.4<7 has buggy sbbq:-( */
		#define BN_MASK2 0xffffffffffffffffL
		/*
		 * Portable C fallback: r[i] = a[i] - b[i] - borrow, unrolled four
		 * words per pass. The borrow c is only recomputed when the two words
		 * differ; when they are equal the incoming borrow simply propagates.
		 * Returns the final borrow, or 0 when n <= 0.
		 */
		BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
		{
			BN_ULONG t1,t2;
			int c=0;

			if (n <= 0) return((BN_ULONG)0);

			for (;;)
				{
				t1=a[0]; t2=b[0];
				r[0]=(t1-t2-c)&BN_MASK2;
				if (t1 != t2) c=(t1 < t2);
				if (--n <= 0) break;

				t1=a[1]; t2=b[1];
				r[1]=(t1-t2-c)&BN_MASK2;
				if (t1 != t2) c=(t1 < t2);
				if (--n <= 0) break;

				t1=a[2]; t2=b[2];
				r[2]=(t1-t2-c)&BN_MASK2;
				if (t1 != t2) c=(t1 < t2);
				if (--n <= 0) break;

				t1=a[3]; t2=b[3];
				r[3]=(t1-t2-c)&BN_MASK2;
				if (t1 != t2) c=(t1 < t2);
				if (--n <= 0) break;

				a+=4;
				b+=4;
				r+=4;
				}
			return(c);
		}
		#endif

		/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
		/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
		/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
		/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */

		#if 0
		/*
		 * Original portable macros, kept for reference purposes only.
		 * NOTE: the reference mul_add_c2 can lose a carry. After the two
		 * doublings t2 may be all ones, so "t2 += (c0<t1)?1:0" can wrap to
		 * zero. The active assembler version below does not have this flaw.
		 */
		#define mul_add_c(a,b,c0,c1,c2) { \
		BN_ULONG ta=(a),tb=(b); \
		t1 = ta * tb; \
		t2 = BN_UMULT_HIGH(ta,tb); \
		c0 += t1; t2 += (c0<t1)?1:0; \
		c1 += t2; c2 += (c1<t2)?1:0; \
		}

		#define mul_add_c2(a,b,c0,c1,c2) { \
		BN_ULONG ta=(a),tb=(b),t0; \
		t1 = BN_UMULT_HIGH(ta,tb); \
		t0 = ta * tb; \
		t2 = t1+t1; c2 += (t2<t1)?1:0; \
		t1 = t0+t0; t2 += (t1<t0)?1:0; \
		c0 += t1; t2 += (c0<t1)?1:0; \
		c1 += t2; c2 += (c1<t2)?1:0; \
		}
		#else
		/*
		 * All macros below use t1 and t2 as scratch words; the expanding
		 * function must declare them. c0, c1, c2 must be modifiable lvalues
		 * and b must be an lvalue (it is a memory operand).
		 *
		 * __asm__ is used instead of asm so the macros also compile in
		 * strict ISO modes (-std=c11 and friends).
		 */

		/*
		 * (c2,c1,c0) += a*b.
		 * mulq leaves the product in t2:t1. The high word of a 64x64 product
		 * is at most 2^64-2, so folding one carry into t2 cannot overflow.
		 */
		#define mul_add_c(a,b,c0,c1,c2) do { \
			__asm__ ("mulq %3" \
				: "=a"(t1),"=d"(t2) \
				: "a"(a),"m"(b) \
				: "cc"); \
			__asm__ ("addq %2,%0; adcq %3,%1" \
				: "+r"(c0),"+d"(t2) \
				: "a"(t1),"g"(0) \
				: "cc"); \
			__asm__ ("addq %2,%0; adcq %3,%1" \
				: "+r"(c1),"+r"(c2) \
				: "d"(t2),"g"(0) \
				: "cc"); \
			} while (0)

		/* (c2,c1,c0) += a[i]*a[i]; same carry reasoning as mul_add_c. */
		#define sqr_add_c(a,i,c0,c1,c2) do { \
			__asm__ ("mulq %2" \
				: "=a"(t1),"=d"(t2) \
				: "a"((a)[i]) \
				: "cc"); \
			__asm__ ("addq %2,%0; adcq %3,%1" \
				: "+r"(c0),"+d"(t2) \
				: "a"(t1),"g"(0) \
				: "cc"); \
			__asm__ ("addq %2,%0; adcq %3,%1" \
				: "+r"(c1),"+r"(c2) \
				: "d"(t2),"g"(0) \
				: "cc"); \
			} while (0)

		/*
		 * (c2,c1,c0) += 2*a*b.
		 * The product t2:t1 is added to the accumulator twice, each time
		 * with one add/adc/adc chain across all three words. The product is
		 * deliberately NOT doubled in place first: a doubled high word can
		 * be all ones, and folding the carry from the c0 addition into it
		 * would wrap to zero and silently drop that carry.
		 */
		#define mul_add_c2(a,b,c0,c1,c2) do { \
			__asm__ ("mulq %3" \
				: "=a"(t1),"=d"(t2) \
				: "a"(a),"m"(b) \
				: "cc"); \
			__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
				: "+r"(c0),"+r"(c1),"+r"(c2) \
				: "r"(t1),"r"(t2),"g"(0) \
				: "cc"); \
			__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
				: "+r"(c0),"+r"(c1),"+r"(c2) \
				: "r"(t1),"r"(t2),"g"(0) \
				: "cc"); \
			} while (0)
		#endif

		/* (c2,c1,c0) += 2*a[i]*a[j] */
		#define sqr_add_c2(a,i,j,c0,c1,c2) \
			mul_add_c2((a)[i],(a)[j],c0,c1,c2)

		/*
		 * r[0..15] = a[0..7] * b[0..7], computed column by column.
		 *
		 * For each result word k, every product a[i]*b[j] with i+j == k is
		 * accumulated into a three-word accumulator. c1, c2, c3 take turns
		 * being the low, middle and high word of that accumulator: once the
		 * low word is stored to r[k] it is zeroed and becomes the high word
		 * for the next column.
		 *
		 * r, a and b are subscripted below, so they must be pointers.
		 * t1 and t2 are scratch words used by the mul_add_c macro.
		 */
		void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
		{
			BN_ULONG t1,t2;
			BN_ULONG c1,c2,c3;

			c1=0;
			c2=0;
			c3=0;
			mul_add_c(a[0],b[0],c1,c2,c3);
			r[0]=c1;
			c1=0;
			mul_add_c(a[0],b[1],c2,c3,c1);
			mul_add_c(a[1],b[0],c2,c3,c1);
			r[1]=c2;
			c2=0;
			mul_add_c(a[2],b[0],c3,c1,c2);
			mul_add_c(a[1],b[1],c3,c1,c2);
			mul_add_c(a[0],b[2],c3,c1,c2);
			r[2]=c3;
			c3=0;
			mul_add_c(a[0],b[3],c1,c2,c3);
			mul_add_c(a[1],b[2],c1,c2,c3);
			mul_add_c(a[2],b[1],c1,c2,c3);
			mul_add_c(a[3],b[0],c1,c2,c3);
			r[3]=c1;
			c1=0;
			mul_add_c(a[4],b[0],c2,c3,c1);
			mul_add_c(a[3],b[1],c2,c3,c1);
			mul_add_c(a[2],b[2],c2,c3,c1);
			mul_add_c(a[1],b[3],c2,c3,c1);
			mul_add_c(a[0],b[4],c2,c3,c1);
			r[4]=c2;
			c2=0;
			mul_add_c(a[0],b[5],c3,c1,c2);
			mul_add_c(a[1],b[4],c3,c1,c2);
			mul_add_c(a[2],b[3],c3,c1,c2);
			mul_add_c(a[3],b[2],c3,c1,c2);
			mul_add_c(a[4],b[1],c3,c1,c2);
			mul_add_c(a[5],b[0],c3,c1,c2);
			r[5]=c3;
			c3=0;
			mul_add_c(a[6],b[0],c1,c2,c3);
			mul_add_c(a[5],b[1],c1,c2,c3);
			mul_add_c(a[4],b[2],c1,c2,c3);
			mul_add_c(a[3],b[3],c1,c2,c3);
			mul_add_c(a[2],b[4],c1,c2,c3);
			mul_add_c(a[1],b[5],c1,c2,c3);
			mul_add_c(a[0],b[6],c1,c2,c3);
			r[6]=c1;
			c1=0;
			mul_add_c(a[0],b[7],c2,c3,c1);
			mul_add_c(a[1],b[6],c2,c3,c1);
			mul_add_c(a[2],b[5],c2,c3,c1);
			mul_add_c(a[3],b[4],c2,c3,c1);
			mul_add_c(a[4],b[3],c2,c3,c1);
			mul_add_c(a[5],b[2],c2,c3,c1);
			mul_add_c(a[6],b[1],c2,c3,c1);
			mul_add_c(a[7],b[0],c2,c3,c1);
			r[7]=c2;
			c2=0;
			mul_add_c(a[7],b[1],c3,c1,c2);
			mul_add_c(a[6],b[2],c3,c1,c2);
			mul_add_c(a[5],b[3],c3,c1,c2);
			mul_add_c(a[4],b[4],c3,c1,c2);
			mul_add_c(a[3],b[5],c3,c1,c2);
			mul_add_c(a[2],b[6],c3,c1,c2);
			mul_add_c(a[1],b[7],c3,c1,c2);
			r[8]=c3;
			c3=0;
			mul_add_c(a[2],b[7],c1,c2,c3);
			mul_add_c(a[3],b[6],c1,c2,c3);
			mul_add_c(a[4],b[5],c1,c2,c3);
			mul_add_c(a[5],b[4],c1,c2,c3);
			mul_add_c(a[6],b[3],c1,c2,c3);
			mul_add_c(a[7],b[2],c1,c2,c3);
			r[9]=c1;
			c1=0;
			mul_add_c(a[7],b[3],c2,c3,c1);
			mul_add_c(a[6],b[4],c2,c3,c1);
			mul_add_c(a[5],b[5],c2,c3,c1);
			mul_add_c(a[4],b[6],c2,c3,c1);
			mul_add_c(a[3],b[7],c2,c3,c1);
			r[10]=c2;
			c2=0;
			mul_add_c(a[4],b[7],c3,c1,c2);
			mul_add_c(a[5],b[6],c3,c1,c2);
			mul_add_c(a[6],b[5],c3,c1,c2);
			mul_add_c(a[7],b[4],c3,c1,c2);
			r[11]=c3;
			c3=0;
			mul_add_c(a[7],b[5],c1,c2,c3);
			mul_add_c(a[6],b[6],c1,c2,c3);
			mul_add_c(a[5],b[7],c1,c2,c3);
			r[12]=c1;
			c1=0;
			mul_add_c(a[6],b[7],c2,c3,c1);
			mul_add_c(a[7],b[6],c2,c3,c1);
			r[13]=c2;
			c2=0;
			mul_add_c(a[7],b[7],c3,c1,c2);
			r[14]=c3;
			r[15]=c1;
		}

		/*
		 * r[0..7] = a[0..3] * b[0..3], computed column by column.
		 *
		 * For each result word k, every product a[i]*b[j] with i+j == k is
		 * accumulated into a three-word accumulator whose roles rotate
		 * through c1, c2, c3 after each column is stored.
		 *
		 * r, a and b are subscripted below, so they must be pointers.
		 * t1 and t2 are scratch words used by the mul_add_c macro.
		 */
		void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
		{
			BN_ULONG t1,t2;
			BN_ULONG c1,c2,c3;

			c1=0;
			c2=0;
			c3=0;
			mul_add_c(a[0],b[0],c1,c2,c3);
			r[0]=c1;
			c1=0;
			mul_add_c(a[0],b[1],c2,c3,c1);
			mul_add_c(a[1],b[0],c2,c3,c1);
			r[1]=c2;
			c2=0;
			mul_add_c(a[2],b[0],c3,c1,c2);
			mul_add_c(a[1],b[1],c3,c1,c2);
			mul_add_c(a[0],b[2],c3,c1,c2);
			r[2]=c3;
			c3=0;
			mul_add_c(a[0],b[3],c1,c2,c3);
			mul_add_c(a[1],b[2],c1,c2,c3);
			mul_add_c(a[2],b[1],c1,c2,c3);
			mul_add_c(a[3],b[0],c1,c2,c3);
			r[3]=c1;
			c1=0;
			mul_add_c(a[3],b[1],c2,c3,c1);
			mul_add_c(a[2],b[2],c2,c3,c1);
			mul_add_c(a[1],b[3],c2,c3,c1);
			r[4]=c2;
			c2=0;
			mul_add_c(a[2],b[3],c3,c1,c2);
			mul_add_c(a[3],b[2],c3,c1,c2);
			r[5]=c3;
			c3=0;
			mul_add_c(a[3],b[3],c1,c2,c3);
			r[6]=c1;
			r[7]=c2;
		}

		/*
		 * r[0..15] = a[0..7] squared, computed column by column.
		 *
		 * For each result word k the diagonal term a[k/2]^2 (even k only) is
		 * added once via sqr_add_c, and every off-diagonal pair a[i]*a[j]
		 * with i+j == k, i > j is added twice via sqr_add_c2. The
		 * three-word accumulator rotates through c1, c2, c3 after each
		 * column is stored.
		 *
		 * r and a are subscripted below, so they must be pointers.
		 * t1 and t2 are scratch words used by the macros.
		 */
		void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
		{
			BN_ULONG t1,t2;
			BN_ULONG c1,c2,c3;

			c1=0;
			c2=0;
			c3=0;
			sqr_add_c(a,0,c1,c2,c3);
			r[0]=c1;
			c1=0;
			sqr_add_c2(a,1,0,c2,c3,c1);
			r[1]=c2;
			c2=0;
			sqr_add_c(a,1,c3,c1,c2);
			sqr_add_c2(a,2,0,c3,c1,c2);
			r[2]=c3;
			c3=0;
			sqr_add_c2(a,3,0,c1,c2,c3);
			sqr_add_c2(a,2,1,c1,c2,c3);
			r[3]=c1;
			c1=0;
			sqr_add_c(a,2,c2,c3,c1);
			sqr_add_c2(a,3,1,c2,c3,c1);
			sqr_add_c2(a,4,0,c2,c3,c1);
			r[4]=c2;
			c2=0;
			sqr_add_c2(a,5,0,c3,c1,c2);
			sqr_add_c2(a,4,1,c3,c1,c2);
			sqr_add_c2(a,3,2,c3,c1,c2);
			r[5]=c3;
			c3=0;
			sqr_add_c(a,3,c1,c2,c3);
			sqr_add_c2(a,4,2,c1,c2,c3);
			sqr_add_c2(a,5,1,c1,c2,c3);
			sqr_add_c2(a,6,0,c1,c2,c3);
			r[6]=c1;
			c1=0;
			sqr_add_c2(a,7,0,c2,c3,c1);
			sqr_add_c2(a,6,1,c2,c3,c1);
			sqr_add_c2(a,5,2,c2,c3,c1);
			sqr_add_c2(a,4,3,c2,c3,c1);
			r[7]=c2;
			c2=0;
			sqr_add_c(a,4,c3,c1,c2);
			sqr_add_c2(a,5,3,c3,c1,c2);
			sqr_add_c2(a,6,2,c3,c1,c2);
			sqr_add_c2(a,7,1,c3,c1,c2);
			r[8]=c3;
			c3=0;
			sqr_add_c2(a,7,2,c1,c2,c3);
			sqr_add_c2(a,6,3,c1,c2,c3);
			sqr_add_c2(a,5,4,c1,c2,c3);
			r[9]=c1;
			c1=0;
			sqr_add_c(a,5,c2,c3,c1);
			sqr_add_c2(a,6,4,c2,c3,c1);
			sqr_add_c2(a,7,3,c2,c3,c1);
			r[10]=c2;
			c2=0;
			sqr_add_c2(a,7,4,c3,c1,c2);
			sqr_add_c2(a,6,5,c3,c1,c2);
			r[11]=c3;
			c3=0;
			sqr_add_c(a,6,c1,c2,c3);
			sqr_add_c2(a,7,5,c1,c2,c3);
			r[12]=c1;
			c1=0;
			sqr_add_c2(a,7,6,c2,c3,c1);
			r[13]=c2;
			c2=0;
			sqr_add_c(a,7,c3,c1,c2);
			r[14]=c3;
			r[15]=c1;
		}

		/*
		 * r[0..7] = a[0..3] squared, computed column by column.
		 *
		 * Diagonal terms a[i]^2 are added once via sqr_add_c; off-diagonal
		 * pairs a[i]*a[j] are added twice via sqr_add_c2. The three-word
		 * accumulator rotates through c1, c2, c3 after each column is
		 * stored.
		 *
		 * r and a are subscripted below, so they must be pointers.
		 * t1 and t2 are scratch words used by the macros.
		 */
		void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
		{
			BN_ULONG t1,t2;
			BN_ULONG c1,c2,c3;

			c1=0;
			c2=0;
			c3=0;
			sqr_add_c(a,0,c1,c2,c3);
			r[0]=c1;
			c1=0;
			sqr_add_c2(a,1,0,c2,c3,c1);
			r[1]=c2;
			c2=0;
			sqr_add_c(a,1,c3,c1,c2);
			sqr_add_c2(a,2,0,c3,c1,c2);
			r[2]=c3;
			c3=0;
			sqr_add_c2(a,3,0,c1,c2,c3);
			sqr_add_c2(a,2,1,c1,c2,c3);
			r[3]=c1;
			c1=0;
			sqr_add_c(a,2,c2,c3,c1);
			sqr_add_c2(a,3,1,c2,c3,c1);
			r[4]=c2;
			c2=0;
			sqr_add_c2(a,3,2,c3,c1,c2);
			r[5]=c3;
			c3=0;
			sqr_add_c(a,3,c1,c2,c3);
			r[6]=c1;
			r[7]=c2;
		}

crypto/bn/bn_div.c

+17 −1

Original line number	Diff line number	Diff line
		@@ -150,6 +150,20 @@ int BN_div(BIGNUM dv, BIGNUM rem, const BIGNUM m, const BIGNUM d,
		q; \
		})
		# define REMAINDER_IS_ALREADY_CALCULATED
		# elif defined(__x86_64) && defined(SIXTY_FOUR_BIT_LONG)
		/*
		 * Same story here, but it's 128-bit by 64-bit division. Wow!
		 * <appro@fy.chalmers.se>
		 *
		 * GCC statement expression: divides the two-word value n0:n1 (n0 is
		 * loaded into the "d" register as the high half, n1 into the "a"
		 * register as the low half) by d0, which is operand %4.
		 *
		 * The quotient is written to q and the remainder to rem. Neither is
		 * a macro parameter, so both names must be in scope at the point of
		 * expansion. The expression evaluates to q.
		 */
		# define bn_div_words(n0,n1,d0) \
		({ asm volatile ( \
		"divq %4" \
		: "=a"(q), "=d"(rem) \
		: "a"(n1), "d"(n0), "g"(d0) \
		: "cc"); \
		q; \
		})
		# define REMAINDER_IS_ALREADY_CALCULATED
		# endif /* __<cpu> */
		# endif /* __GNUC__ */
		#endif /* OPENSSL_NO_ASM */
		@@ -296,7 +310,9 @@ int BN_div(BIGNUM dv, BIGNUM rm, const BIGNUM num, const BIGNUM divisor,
		rem=(n1-q*d0)&BN_MASK2;
		#endif

		#ifdef BN_UMULT_HIGH
		#if defined(BN_UMULT_LOHI)
		BN_UMULT_LOHI(t2l,t2h,d1,q);
		#elif defined(BN_UMULT_HIGH)
		t2l = d1 * q;
		t2h = BN_UMULT_HIGH(d1,q);
		#else

crypto/bn/bn_lcl.h

+16 −1

Original line number	Diff line number	Diff line
		@@ -230,6 +230,21 @@ struct bignum_ctx
		: "r"(a), "r"(b)); \
		ret; })
		# endif /* compiler */
		# elif defined(__x86_64) && defined(SIXTY_FOUR_BIT_LONG)
		# if defined(__GNUC__)
		/*
		 * BN_UMULT_HIGH(a,b): GCC statement expression yielding the high
		 * word of the double-width product a*b. a is loaded into the "a"
		 * register and multiplied by b with mulq; the low word is discarded
		 * and the "d" register result is returned.
		 *
		 * NOTE(review): b uses the "g" constraint, which also admits an
		 * immediate operand -- confirm callers never pass a compile-time
		 * constant here.
		 */
		# define BN_UMULT_HIGH(a,b) ({ \
		register BN_ULONG ret,discard; \
		asm ("mulq %3" \
		: "=a"(discard),"=d"(ret) \
		: "a"(a), "g"(b) \
		: "cc"); \
		ret; })
		/*
		 * BN_UMULT_LOHI(low,high,a,b): one mulq computes a*b and stores the
		 * low word in low and the high word in high; both must be lvalues.
		 *
		 * The expansion already ends in ';', so it is a complete statement
		 * rather than an expression.
		 */
		# define BN_UMULT_LOHI(low,high,a,b) \
		asm ("mulq %3" \
		: "=a"(low),"=d"(high) \
		: "a"(a),"g"(b) \
		: "cc");
		# endif
		# endif /* cpu */
		#endif /* OPENSSL_NO_ASM */

		@@ -347,7 +362,7 @@ struct bignum_ctx

		#define LBITS(a) ((a)&BN_MASK2l)
		#define HBITS(a) (((a)>>BN_BITS4)&BN_MASK2l)
		#define L2HBITS(a) ((BN_ULONG)((a)&BN_MASK2l)<<BN_BITS4)
		#define L2HBITS(a) (((a)<<BN_BITS4)&BN_MASK2)

		#define LLBITS(a) ((a)&BN_MASKl)
		#define LHBITS(a) (((a)>>BN_BITS2)&BN_MASKl)