Commit 361512da authored by Andy Polyakov's avatar Andy Polyakov
Browse files

This commit completes recent modular exponentiation optimizations on

x86_64 platform. It targets specifically RSA1024 sign (using ideas
from http://eprint.iacr.org/2011/239) and adds more than 10% on most
platforms. Overall performance improvement relative to 1.0.0 is ~40%
in average, with best result of 54% on Westmere. Incidentally ~40%
is average improvement even for longer key lengths.
parent 20735f4c
Loading
Loading
Loading
Loading
+4 −3
Original line number Diff line number Diff line
@@ -127,7 +127,7 @@ my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o x86-gf2m.o:des-586.o crypt5

my $x86_elf_asm="$x86_asm:elf";

my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o";
my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o";
my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o:void";
my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o:void";
my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o::::::::::::void";
@@ -513,9 +513,9 @@ my %table=(
#
# Win64 targets, WIN64I denotes IA-64 and WIN64A - AMD64
"VC-WIN64I","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o:::::::ghash-ia64.o:ias:win32",
"VC-WIN64A","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:x86_64cpuid.o:bn_asm.o x86_64-mont.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o:auto:win32",
"VC-WIN64A","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:".eval{my $asm=$x86_64_asm;$asm=~s/x86_64-gcc\.o/bn_asm.o/;$asm}.":auto:win32",
"debug-VC-WIN64I","cl:-W3 -Gs0 -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o:::::::ghash-ia64.o:ias:win32",
"debug-VC-WIN64A","cl:-W3 -Gs0 -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:x86_64cpuid.o:bn_asm.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o:auto:win32",
"debug-VC-WIN64A","cl:-W3 -Gs0 -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:".eval{my $asm=$x86_64_asm;$asm=~s/x86_64-gcc\.o/bn_asm.o/;$asm}.":auto:win32",
# x86 Win32 target defaults to ANSI API, if you want UNICODE, complement
# 'perl Configure VC-WIN32' with '-DUNICODE -D_UNICODE'
"VC-WIN32","cl:-W3 -Gs0 -GF -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -D_CRT_SECURE_NO_DEPRECATE:::WIN32::BN_LLONG RC4_INDEX EXPORT_VAR_AS_FN ${x86_gcc_opts}:${x86_asm}:win32n:win32",
@@ -1510,6 +1510,7 @@ $cflags.=" -DOPENSSL_BN_ASM_PART_WORDS" if ($bn_obj =~ /bn-586/);
$cflags.=" -DOPENSSL_IA32_SSE2" if (!$no_sse2 && $bn_obj =~ /86/);

$cflags.=" -DOPENSSL_BN_ASM_MONT" if ($bn_obj =~ /-mont/);
$cflags.=" -DOPENSSL_BN_ASM_MONT5" if ($bn_obj =~ /-mont5/);
$cflags.=" -DOPENSSL_BN_ASM_GF2m" if ($bn_obj =~ /-gf2m/);

if ($fips)
+12 −12
Original line number Diff line number Diff line
@@ -297,7 +297,7 @@ $sys_id =
$lflags       = 
$bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj    = x86_64cpuid.o
$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj      = 
$aes_obj      = aes-x86_64.o aesni-x86_64.o
$bf_obj       = 
@@ -777,7 +777,7 @@ $sys_id = WIN64A
$lflags       = 
$bn_ops       = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
$cpuid_obj    = x86_64cpuid.o
$bn_obj       = bn_asm.o x86_64-mont.o
$bn_obj       = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj      = 
$aes_obj      = aes-x86_64.o aesni-x86_64.o
$bf_obj       = 
@@ -1385,7 +1385,7 @@ $sys_id = MACOSX
$lflags       = -Wl,-search_paths_first%
$bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj    = x86_64cpuid.o
$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj      = 
$aes_obj      = aes-x86_64.o aesni-x86_64.o
$bf_obj       = 
@@ -1545,9 +1545,9 @@ $sys_id = WIN64A
$lflags       = 
$bn_ops       = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
$cpuid_obj    = x86_64cpuid.o
$bn_obj       = bn_asm.o x86_64-mont.o
$bn_obj       = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj      = 
$aes_obj      = aes-x86_64.o
$aes_obj      = aes-x86_64.o aesni-x86_64.o
$bf_obj       = 
$md5_obj      = md5-x86_64.o
$sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
@@ -2313,7 +2313,7 @@ $sys_id =
$lflags       = -ldl
$bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj    = x86_64cpuid.o
$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj      = 
$aes_obj      = aes-x86_64.o aesni-x86_64.o
$bf_obj       = 
@@ -2505,7 +2505,7 @@ $sys_id =
$lflags       = -ldl
$bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj    = x86_64cpuid.o
$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj      = 
$aes_obj      = aes-x86_64.o aesni-x86_64.o
$bf_obj       = 
@@ -2569,7 +2569,7 @@ $sys_id =
$lflags       = -ldl
$bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj    = x86_64cpuid.o
$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj      = 
$aes_obj      = aes-x86_64.o aesni-x86_64.o
$bf_obj       = 
@@ -4073,7 +4073,7 @@ $sys_id =
$lflags       = -ldl
$bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj    = x86_64cpuid.o
$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj      = 
$aes_obj      = aes-x86_64.o aesni-x86_64.o
$bf_obj       = 
@@ -4233,7 +4233,7 @@ $sys_id = MINGW64
$lflags       = -lws2_32 -lgdi32 -lcrypt32
$bn_ops       = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
$cpuid_obj    = x86_64cpuid.o
$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj      = 
$aes_obj      = aes-x86_64.o aesni-x86_64.o
$bf_obj       = 
@@ -5193,7 +5193,7 @@ $sys_id =
$lflags       = -lsocket -lnsl -ldl
$bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj    = x86_64cpuid.o
$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj      = 
$aes_obj      = aes-x86_64.o aesni-x86_64.o
$bf_obj       = 
@@ -5225,7 +5225,7 @@ $sys_id =
$lflags       = -lsocket -lnsl -ldl
$bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj    = x86_64cpuid.o
$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
$des_obj      = 
$aes_obj      = aes-x86_64.o aesni-x86_64.o
$bf_obj       = 
+2 −0
Original line number Diff line number Diff line
@@ -98,6 +98,8 @@ x86_64-gcc.o: asm/x86_64-gcc.c
	$(CC) $(CFLAGS) -c -o $@ asm/x86_64-gcc.c
x86_64-mont.s:	asm/x86_64-mont.pl
	$(PERL) asm/x86_64-mont.pl $(PERLASM_SCHEME) > $@
x86_64-mont5.s:	asm/x86_64-mont5.pl
	$(PERL) asm/x86_64-mont5.pl $(PERLASM_SCHEME) > $@
x86_64-gf2m.s:  asm/x86_64-gf2m.pl
	$(PERL) asm/x86_64-gf2m.pl $(PERLASM_SCHEME) > $@
modexp512-x86_64.s:	asm/modexp512-x86_64.pl
+3 −2
Original line number Diff line number Diff line
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -142,6 +142,7 @@ $code.=<<___;
	jne	.L1st

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
@@ -161,7 +162,6 @@ $code.=<<___;
.Louter:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	($ap),%rax		# ap[0]
	mov	$n0,$m1
	mov	(%rsp),$lo0
	mulq	$m0			# ap[0]*bp[i]
@@ -208,6 +208,7 @@ $code.=<<___;
	jne	.Linner

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
+834 −0

File added.

Preview size limit exceeded, changes collapsed.

Loading