Loading Configure +3 −3 Original line number Diff line number Diff line Loading @@ -135,8 +135,8 @@ my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o::::::::::::void"; my $mips3_asm=":bn-mips3.o::::::::::::void"; my $s390x_asm="s390xcpuid.o:bn-s390x.o s390x-mont.o::aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::void"; my $armv4_asm=":bn_asm.o armv4-mont.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::void"; my $ppc32_asm="ppccpuid.o:bn-ppc.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::"; my $ppc64_asm="ppccpuid.o:bn-ppc.o ppc-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o::::::"; my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::"; my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o::::::"; my $no_asm=":::::::::::::void"; # As for $BSDthreads. Idea is to maintain "collective" set of flags, Loading Loading @@ -547,7 +547,7 @@ my %table=( ##### MacOS X (a.k.a. 
Rhapsody or Darwin) setup "rhapsody-ppc-cc","cc:-O3 -DB_ENDIAN::(unknown):MACOSX_RHAPSODY::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${no_asm}::", "darwin-ppc-cc","cc:-arch ppc -O3 -DB_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${ppc32_asm}:osx32:dlfcn:darwin-shared:-fPIC -fno-common:-arch ppc -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", "darwin-ppc-cc","cc:-arch ppc -O3 -DB_ENDIAN -Wa,-force_cpusubtype_ALL::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${ppc32_asm}:osx32:dlfcn:darwin-shared:-fPIC -fno-common:-arch ppc -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", "darwin64-ppc-cc","cc:-arch ppc64 -O3 -DB_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${ppc64_asm}:osx64:dlfcn:darwin-shared:-fPIC -fno-common:-arch ppc64 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", "darwin-i386-cc","cc:-arch i386 -O3 -fomit-frame-pointer -DL_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:BN_LLONG RC4_INT RC4_CHUNK DES_UNROLL BF_PTR:${x86_asm}:macosx:dlfcn:darwin-shared:-fPIC -fno-common:-arch i386 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", "debug-darwin-i386-cc","cc:-arch i386 -g3 -DL_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:BN_LLONG RC4_INT RC4_CHUNK DES_UNROLL BF_PTR:${x86_asm}:macosx:dlfcn:darwin-shared:-fPIC -fno-common:-arch i386 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", Loading TABLE +21 −21 Original line number Diff line number Diff line Loading @@ -814,8 +814,8 @@ $thread_cflag = -qthreaded $sys_id = AIX $lflags = $bn_ops = BN_LLONG RC4_CHAR $cpuid_obj = ppccpuid.o $bn_obj = bn-ppc.o $cpuid_obj = ppccpuid.o ppccap.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o $des_obj = $aes_obj = aes_core.o aes_cbc.o aes-ppc.o $bf_obj = Loading Loading @@ -845,8 +845,8 @@ $thread_cflag = -pthread $sys_id = AIX $lflags = $bn_ops = BN_LLONG RC4_CHAR $cpuid_obj = 
ppccpuid.o $bn_obj = bn-ppc.o $cpuid_obj = ppccpuid.o ppccap.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o $des_obj = $aes_obj = aes_core.o aes_cbc.o aes-ppc.o $bf_obj = Loading Loading @@ -907,8 +907,8 @@ $thread_cflag = -qthreaded $sys_id = AIX $lflags = $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR $cpuid_obj = ppccpuid.o $bn_obj = bn-ppc.o ppc-mont.o $cpuid_obj = ppccpuid.o ppccap.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o $des_obj = $aes_obj = aes_core.o aes_cbc.o aes-ppc.o $bf_obj = Loading Loading @@ -938,8 +938,8 @@ $thread_cflag = -pthread $sys_id = AIX $lflags = $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR $cpuid_obj = ppccpuid.o $bn_obj = bn-ppc.o ppc-mont.o $cpuid_obj = ppccpuid.o ppccap.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o $des_obj = $aes_obj = aes_core.o aes_cbc.o aes-ppc.o $bf_obj = Loading Loading @@ -1211,14 +1211,14 @@ $multilib = *** darwin-ppc-cc $cc = cc $cflags = -arch ppc -O3 -DB_ENDIAN $cflags = -arch ppc -O3 -DB_ENDIAN -Wa,-force_cpusubtype_ALL $unistd = $thread_cflag = -D_REENTRANT $sys_id = MACOSX $lflags = -Wl,-search_paths_first% $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR $cpuid_obj = ppccpuid.o $bn_obj = bn-ppc.o $cpuid_obj = ppccpuid.o ppccap.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o $des_obj = $aes_obj = aes_core.o aes_cbc.o aes-ppc.o $bf_obj = Loading Loading @@ -1248,8 +1248,8 @@ $thread_cflag = -D_REENTRANT $sys_id = MACOSX $lflags = -Wl,-search_paths_first% $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR $cpuid_obj = ppccpuid.o $bn_obj = bn-ppc.o ppc-mont.o $cpuid_obj = ppccpuid.o ppccap.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o $des_obj = $aes_obj = aes_core.o aes_cbc.o aes-ppc.o $bf_obj = Loading Loading @@ -1682,8 +1682,8 @@ $thread_cflag = -D_REENTRANT $sys_id = MACOSX $lflags = $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR $cpuid_obj = ppccpuid.o $bn_obj = bn-ppc.o $cpuid_obj = ppccpuid.o ppccap.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o $des_obj = $aes_obj = aes_core.o aes_cbc.o 
aes-ppc.o $bf_obj = Loading Loading @@ -2327,7 +2327,7 @@ $multilib = *** debug-steve32 $cc = gcc $cflags = -Wall -pedantic -DPEDANTIC -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wshadow -Wformat -Werror -DCRYPTO_MDEBUG_ALL -DCRYPTO_MDEBUG_ABORT -DREF_CHECK -DOPENSSL_NO_DEPRECATED -m32 -DL_ENDIAN -DCONF_DEBUG -DDEBUG_SAFESTACK -DDEBUG_UNUSED -g -pipe $cflags = -Wall -pedantic -DPEDANTIC -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wshadow -Wformat -Werror -DCRYPTO_MDEBUG_ALL -DCRYPTO_MDEBUG_ABORT -DREF_CHECK -DOPENSSL_NO_DEPRECATED -m32 -DL_ENDIAN -DCONF_DEBUG -DDEBUG_SAFESTACK -g -pipe $unistd = $thread_cflag = -D_REENTRANT $sys_id = Loading Loading @@ -2358,7 +2358,7 @@ $multilib = *** debug-steve64 $cc = gcc $cflags = -Wall -pedantic -DPEDANTIC -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wshadow -Wformat -Werror -DCRYPTO_MDEBUG_ALL -DCRYPTO_MDEBUG_ABORT -DREF_CHECK -DOPENSSL_NO_DEPRECATED -m64 -DL_ENDIAN -DTERMIO -DCONF_DEBUG -DDEBUG_SAFESTACK -DDEBUG_UNUSED -g -DMD32_REG_T=int $cflags = -Wall -pedantic -DPEDANTIC -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wshadow -Wformat -Werror -DCRYPTO_MDEBUG_ALL -DCRYPTO_MDEBUG_ABORT -DREF_CHECK -DOPENSSL_NO_DEPRECATED -m64 -DL_ENDIAN -DTERMIO -DCONF_DEBUG -DDEBUG_SAFESTACK -g -DMD32_REG_T=int $unistd = $thread_cflag = -D_REENTRANT $sys_id = Loading Loading @@ -3666,8 +3666,8 @@ $thread_cflag = -D_REENTRANT $sys_id = $lflags = -ldl $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL $cpuid_obj = ppccpuid.o $bn_obj = bn-ppc.o $cpuid_obj = ppccpuid.o ppccap.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o $des_obj = $aes_obj = aes_core.o aes_cbc.o aes-ppc.o $bf_obj = Loading Loading @@ -3697,8 +3697,8 @@ $thread_cflag = -D_REENTRANT $sys_id = $lflags = -ldl $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL $cpuid_obj = ppccpuid.o $bn_obj = bn-ppc.o ppc-mont.o $cpuid_obj = ppccpuid.o ppccap.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o $des_obj = $aes_obj = aes_core.o 
aes_cbc.o aes-ppc.o $bf_obj = Loading crypto/bn/Makefile +1 −0 Original line number Diff line number Diff line Loading @@ -103,6 +103,7 @@ pa-risc2.o: asm/pa-risc2.s # ppc - AIX, Linux, MacOS X... bn-ppc.s: asm/ppc.pl; $(PERL) asm/ppc.pl $(PERLASM_SCHEME) $@ ppc-mont.s: asm/ppc-mont.pl;$(PERL) asm/ppc-mont.pl $(PERLASM_SCHEME) $@ ppc64-mont.s: asm/ppc64-mont.pl;$(PERL) asm/ppc64-mont.pl $(PERLASM_SCHEME) $@ alpha-mont.s: asm/alpha-mont.pl $(PERL) $< | $(CC) -E - | tee $@ > /dev/null Loading crypto/bn/asm/ppc-mont.pl +8 −3 Original line number Diff line number Diff line Loading @@ -108,14 +108,19 @@ $code=<<___; .machine "any" .text .globl .bn_mul_mont .globl .bn_mul_mont_int .align 4 .bn_mul_mont: .bn_mul_mont_int: cmpwi $num,4 mr $rp,r3 ; $rp is reassigned li r3,0 bltlr ___ $code.=<<___ if ($BNSZ==4); cmpwi $num,32 ; longer key performance is not better bgelr ___ $code.=<<___; slwi $num,$num,`log($BNSZ)/log(2)` li $tj,-4096 addi $ovf,$num,`$FRAME+$RZONE` Loading crypto/bn/asm/ppc64-mont.pl +175 −8 Original line number Diff line number Diff line Loading @@ -45,23 +45,41 @@ # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive # in absolute terms, but it's apparently the way Power 6 is... # December 2009 # Adapted for 32-bit build this module delivers 25-120%, more for # longer keys, performance improvement on 1.8GHz PPC970. However! # This implementation utilizes even 64-bit integer operations and # trouble is that most PPC operating systems don't preserve upper # halves of general purpose registers upon signal delivery. They do # preserve them upon context switch, but not signalling:-( This means # that asynchronous signals have to be blocked upon entry to this # subroutine. Signal masking (and complementary unmasking) has quite # an impact on performance, naturally larger for shorter keys. It's # so severe that shorter key performance is as low as 1/3 of expected # one. 
This is why this routine should be engaged for longer key # operations only, see crypto/ppccap.c for further details. # Alternative is to break dependence on upper halves of GPRs... # MacOS X is an exception from this and doesn't require signal # masking, and that's where above improvement coefficients were # collected. $flavour = shift; if ($flavour =~ /32/) { $SIZE_T=4; $RZONE= 224; $FRAME= $SIZE_T*12+8*12; $fname= "bn_mul_mont_ppc64"; $fname= "bn_mul_mont_fpu64"; $STUX= "stwux"; # store indexed and update $PUSH= "stw"; $POP= "lwz"; die "not implemented yet"; } elsif ($flavour =~ /64/) { $SIZE_T=8; $RZONE= 288; $FRAME= $SIZE_T*12+8*12; $fname= "bn_mul_mont"; $fname= "bn_mul_mont_fpu64"; # same as above, but 64-bit mnemonics... $STUX= "stdux"; # store indexed and update Loading Loading @@ -181,14 +199,14 @@ $code=<<___; .globl .$fname .align 5 .$fname: cmpwi $num,4 cmpwi $num,`3*8/$SIZE_T` mr $rp,r3 ; $rp is reassigned li r3,0 ; possible "not handled" return code bltlr- andi. r0,$num,1 ; $num has to be even andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even" bnelr- slwi $num,$num,3 ; num*=8 slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG) li $i,-4096 slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 
4*num add $tp,$tp,$num ; place for tp[num+1] Loading Loading @@ -220,11 +238,25 @@ $code=<<___; stfd f23,`12*$SIZE_T+72`($sp) stfd f24,`12*$SIZE_T+80`($sp) stfd f25,`12*$SIZE_T+88`($sp) ___ $code.=<<___ if ($SIZE_T==8); ld $a0,0($ap) ; pull ap[0] value ld $n0,0($n0) ; pull n0[0] value ld $t3,0($bp) ; bp[0] ___ $code.=<<___ if ($SIZE_T==4); mr $t1,$n0 lwz $a0,0($ap) ; pull ap[0,1] value lwz $t0,4($ap) lwz $n0,0($t1) ; pull n0[0,1] value lwz $t1,4($t1) lwz $t3,0($bp) ; bp[0,1] lwz $t2,4($bp) insrdi $a0,$t0,32,0 insrdi $n0,$t1,32,0 insrdi $t3,$t2,32,0 ___ $code.=<<___; addi $tp,$sp,`$FRAME+$TRANSFER+8+64` li $i,-64 add $nap_d,$tp,$num Loading Loading @@ -258,6 +290,8 @@ $code=<<___; std $t5,`$FRAME+40`($sp) std $t6,`$FRAME+48`($sp) std $t7,`$FRAME+56`($sp) ___ $code.=<<___ if ($SIZE_T==8); lwz $t0,4($ap) ; load a[j] as 32-bit word pair lwz $t1,0($ap) lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair Loading @@ -266,6 +300,18 @@ $code=<<___; lwz $t5,0($np) lwz $t6,12($np) ; load n[j+1] as 32-bit word pair lwz $t7,8($np) ___ $code.=<<___ if ($SIZE_T==4); lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs lwz $t1,4($ap) lwz $t2,8($ap) lwz $t3,12($ap) lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs lwz $t5,4($np) lwz $t6,8($np) lwz $t7,12($np) ___ $code.=<<___; lfd $ba,`$FRAME+0`($sp) lfd $bb,`$FRAME+8`($sp) lfd $bc,`$FRAME+16`($sp) Loading Loading @@ -374,6 +420,8 @@ $code=<<___; .align 5 L1st: ___ $code.=<<___ if ($SIZE_T==8); lwz $t0,4($ap) ; load a[j] as 32-bit word pair lwz $t1,0($ap) lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair Loading @@ -382,6 +430,18 @@ L1st: lwz $t5,0($np) lwz $t6,12($np) ; load n[j+1] as 32-bit word pair lwz $t7,8($np) ___ $code.=<<___ if ($SIZE_T==4); lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs lwz $t1,4($ap) lwz $t2,8($ap) lwz $t3,12($ap) lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs lwz $t5,4($np) lwz $t6,8($np) lwz $t7,12($np) ___ $code.=<<___; std $t0,`$FRAME+64`($sp) std $t1,`$FRAME+72`($sp) std 
$t2,`$FRAME+80`($sp) Loading Loading @@ -559,7 +619,17 @@ L1st: li $i,8 ; i=1 .align 5 Louter: ___ $code.=<<___ if ($SIZE_T==8); ldx $t3,$bp,$i ; bp[i] ___ $code.=<<___ if ($SIZE_T==4); add $t0,$bp,$i lwz $t3,0($t0) ; bp[i,i+1] lwz $t0,4($t0) insrdi $t3,$t0,32,0 ___ $code.=<<___; ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] mulld $t7,$a0,$t3 ; ap[0]*bp[i] Loading Loading @@ -761,6 +831,13 @@ Linner: stfd $T0b,`$FRAME+8`($sp) add $t7,$t7,$carry addc $t3,$t0,$t1 ___ $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] extrdi $t0,$t0,32,0 extrdi $t1,$t1,32,0 adde $t0,$t0,$t1 ___ $code.=<<___; stfd $T1a,`$FRAME+16`($sp) stfd $T1b,`$FRAME+24`($sp) insrdi $t4,$t7,16,0 ; 64..127 bits Loading @@ -768,6 +845,13 @@ Linner: stfd $T2a,`$FRAME+32`($sp) stfd $T2b,`$FRAME+40`($sp) adde $t5,$t4,$t2 ___ $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] extrdi $t4,$t4,32,0 extrdi $t2,$t2,32,0 adde $t4,$t4,$t2 ___ $code.=<<___; stfd $T3a,`$FRAME+48`($sp) stfd $T3b,`$FRAME+56`($sp) addze $carry,$carry Loading Loading @@ -816,7 +900,21 @@ Linner: ld $t7,`$FRAME+72`($sp) addc $t3,$t0,$t1 ___ $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] extrdi $t0,$t0,32,0 extrdi $t1,$t1,32,0 adde $t0,$t0,$t1 ___ $code.=<<___; adde $t5,$t4,$t2 ___ $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] extrdi $t4,$t4,32,0 extrdi $t2,$t2,32,0 adde $t4,$t4,$t2 ___ $code.=<<___; addze $carry,$carry std $t3,-16($tp) ; tp[j-1] Loading @@ -835,7 +933,9 @@ Linner: subf $nap_d,$t7,$nap_d ; rewind pointer cmpw $i,$num blt- Louter ___ $code.=<<___ if ($SIZE_T==8); subf $np,$num,$np ; rewind np addi $j,$j,1 ; restore counter subfc $i,$i,$i ; j=0 and "clear" XER[CA] Loading Loading @@ -883,7 +983,74 @@ Lcopy: ; copy or in-place refresh stdx $i,$t4,$i addi $i,$i,16 bdnz- Lcopy ___ $code.=<<___ if ($SIZE_T==4); subf $np,$num,$np ; rewind np addi $j,$j,1 ; restore counter subfc $i,$i,$i ; j=0 and "clear" XER[CA] addi $tp,$sp,`$FRAME+$TRANSFER` addi $np,$np,-4 addi $rp,$rp,-4 addi $ap,$sp,`$FRAME+$TRANSFER+4` mtctr $j .align 4 Lsub: ld 
$t0,8($tp) ; load tp[j..j+3] in 64-bit word order ldu $t2,16($tp) lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order lwz $t5,8($np) lwz $t6,12($np) lwzu $t7,16($np) extrdi $t1,$t0,32,0 extrdi $t3,$t2,32,0 subfe $t4,$t4,$t0 ; tp[j]-np[j] stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1] stw $t1,8($ap) subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2] stw $t2,12($ap) subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3] stwu $t3,16($ap) stw $t4,4($rp) stw $t5,8($rp) stw $t6,12($rp) stwu $t7,16($rp) bdnz- Lsub li $i,0 subfe $ovf,$i,$ovf ; handle upmost overflow bit addi $tp,$sp,`$FRAME+$TRANSFER+4` subf $rp,$num,$rp ; rewind rp and $ap,$tp,$ovf andc $np,$rp,$ovf or $ap,$ap,$np ; ap=borrow?tp:rp addi $tp,$sp,`$FRAME+$TRANSFER` mtctr $j .align 4 Lcopy: ; copy or in-place refresh lwz $t0,4($ap) lwz $t1,8($ap) lwz $t2,12($ap) lwzu $t3,16($ap) std $i,8($nap_d) ; zap nap_d std $i,16($nap_d) std $i,24($nap_d) std $i,32($nap_d) std $i,40($nap_d) std $i,48($nap_d) std $i,56($nap_d) stdu $i,64($nap_d) stw $t0,4($rp) stw $t1,8($rp) stw $t2,12($rp) stwu $t3,16($rp) std $i,8($tp) ; zap tp at once stdu $i,16($tp) bdnz- Lcopy ___ $code.=<<___; $POP r14,`2*$SIZE_T`($sp) $POP r15,`3*$SIZE_T`($sp) $POP r16,`4*$SIZE_T`($sp) Loading Loading
Configure +3 −3 Original line number Diff line number Diff line Loading @@ -135,8 +135,8 @@ my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o::::::::::::void"; my $mips3_asm=":bn-mips3.o::::::::::::void"; my $s390x_asm="s390xcpuid.o:bn-s390x.o s390x-mont.o::aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::void"; my $armv4_asm=":bn_asm.o armv4-mont.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::void"; my $ppc32_asm="ppccpuid.o:bn-ppc.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::"; my $ppc64_asm="ppccpuid.o:bn-ppc.o ppc-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o::::::"; my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::"; my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o::::::"; my $no_asm=":::::::::::::void"; # As for $BSDthreads. Idea is to maintain "collective" set of flags, Loading Loading @@ -547,7 +547,7 @@ my %table=( ##### MacOS X (a.k.a. 
Rhapsody or Darwin) setup "rhapsody-ppc-cc","cc:-O3 -DB_ENDIAN::(unknown):MACOSX_RHAPSODY::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${no_asm}::", "darwin-ppc-cc","cc:-arch ppc -O3 -DB_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${ppc32_asm}:osx32:dlfcn:darwin-shared:-fPIC -fno-common:-arch ppc -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", "darwin-ppc-cc","cc:-arch ppc -O3 -DB_ENDIAN -Wa,-force_cpusubtype_ALL::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${ppc32_asm}:osx32:dlfcn:darwin-shared:-fPIC -fno-common:-arch ppc -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", "darwin64-ppc-cc","cc:-arch ppc64 -O3 -DB_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${ppc64_asm}:osx64:dlfcn:darwin-shared:-fPIC -fno-common:-arch ppc64 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", "darwin-i386-cc","cc:-arch i386 -O3 -fomit-frame-pointer -DL_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:BN_LLONG RC4_INT RC4_CHUNK DES_UNROLL BF_PTR:${x86_asm}:macosx:dlfcn:darwin-shared:-fPIC -fno-common:-arch i386 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", "debug-darwin-i386-cc","cc:-arch i386 -g3 -DL_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:BN_LLONG RC4_INT RC4_CHUNK DES_UNROLL BF_PTR:${x86_asm}:macosx:dlfcn:darwin-shared:-fPIC -fno-common:-arch i386 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", Loading
TABLE +21 −21 Original line number Diff line number Diff line Loading @@ -814,8 +814,8 @@ $thread_cflag = -qthreaded $sys_id = AIX $lflags = $bn_ops = BN_LLONG RC4_CHAR $cpuid_obj = ppccpuid.o $bn_obj = bn-ppc.o $cpuid_obj = ppccpuid.o ppccap.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o $des_obj = $aes_obj = aes_core.o aes_cbc.o aes-ppc.o $bf_obj = Loading Loading @@ -845,8 +845,8 @@ $thread_cflag = -pthread $sys_id = AIX $lflags = $bn_ops = BN_LLONG RC4_CHAR $cpuid_obj = ppccpuid.o $bn_obj = bn-ppc.o $cpuid_obj = ppccpuid.o ppccap.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o $des_obj = $aes_obj = aes_core.o aes_cbc.o aes-ppc.o $bf_obj = Loading Loading @@ -907,8 +907,8 @@ $thread_cflag = -qthreaded $sys_id = AIX $lflags = $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR $cpuid_obj = ppccpuid.o $bn_obj = bn-ppc.o ppc-mont.o $cpuid_obj = ppccpuid.o ppccap.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o $des_obj = $aes_obj = aes_core.o aes_cbc.o aes-ppc.o $bf_obj = Loading Loading @@ -938,8 +938,8 @@ $thread_cflag = -pthread $sys_id = AIX $lflags = $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR $cpuid_obj = ppccpuid.o $bn_obj = bn-ppc.o ppc-mont.o $cpuid_obj = ppccpuid.o ppccap.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o $des_obj = $aes_obj = aes_core.o aes_cbc.o aes-ppc.o $bf_obj = Loading Loading @@ -1211,14 +1211,14 @@ $multilib = *** darwin-ppc-cc $cc = cc $cflags = -arch ppc -O3 -DB_ENDIAN $cflags = -arch ppc -O3 -DB_ENDIAN -Wa,-force_cpusubtype_ALL $unistd = $thread_cflag = -D_REENTRANT $sys_id = MACOSX $lflags = -Wl,-search_paths_first% $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR $cpuid_obj = ppccpuid.o $bn_obj = bn-ppc.o $cpuid_obj = ppccpuid.o ppccap.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o $des_obj = $aes_obj = aes_core.o aes_cbc.o aes-ppc.o $bf_obj = Loading Loading @@ -1248,8 +1248,8 @@ $thread_cflag = -D_REENTRANT $sys_id = MACOSX $lflags = -Wl,-search_paths_first% $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR $cpuid_obj = ppccpuid.o 
$bn_obj = bn-ppc.o ppc-mont.o $cpuid_obj = ppccpuid.o ppccap.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o $des_obj = $aes_obj = aes_core.o aes_cbc.o aes-ppc.o $bf_obj = Loading Loading @@ -1682,8 +1682,8 @@ $thread_cflag = -D_REENTRANT $sys_id = MACOSX $lflags = $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR $cpuid_obj = ppccpuid.o $bn_obj = bn-ppc.o $cpuid_obj = ppccpuid.o ppccap.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o $des_obj = $aes_obj = aes_core.o aes_cbc.o aes-ppc.o $bf_obj = Loading Loading @@ -2327,7 +2327,7 @@ $multilib = *** debug-steve32 $cc = gcc $cflags = -Wall -pedantic -DPEDANTIC -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wshadow -Wformat -Werror -DCRYPTO_MDEBUG_ALL -DCRYPTO_MDEBUG_ABORT -DREF_CHECK -DOPENSSL_NO_DEPRECATED -m32 -DL_ENDIAN -DCONF_DEBUG -DDEBUG_SAFESTACK -DDEBUG_UNUSED -g -pipe $cflags = -Wall -pedantic -DPEDANTIC -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wshadow -Wformat -Werror -DCRYPTO_MDEBUG_ALL -DCRYPTO_MDEBUG_ABORT -DREF_CHECK -DOPENSSL_NO_DEPRECATED -m32 -DL_ENDIAN -DCONF_DEBUG -DDEBUG_SAFESTACK -g -pipe $unistd = $thread_cflag = -D_REENTRANT $sys_id = Loading Loading @@ -2358,7 +2358,7 @@ $multilib = *** debug-steve64 $cc = gcc $cflags = -Wall -pedantic -DPEDANTIC -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wshadow -Wformat -Werror -DCRYPTO_MDEBUG_ALL -DCRYPTO_MDEBUG_ABORT -DREF_CHECK -DOPENSSL_NO_DEPRECATED -m64 -DL_ENDIAN -DTERMIO -DCONF_DEBUG -DDEBUG_SAFESTACK -DDEBUG_UNUSED -g -DMD32_REG_T=int $cflags = -Wall -pedantic -DPEDANTIC -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wshadow -Wformat -Werror -DCRYPTO_MDEBUG_ALL -DCRYPTO_MDEBUG_ABORT -DREF_CHECK -DOPENSSL_NO_DEPRECATED -m64 -DL_ENDIAN -DTERMIO -DCONF_DEBUG -DDEBUG_SAFESTACK -g -DMD32_REG_T=int $unistd = $thread_cflag = -D_REENTRANT $sys_id = Loading Loading @@ -3666,8 +3666,8 @@ $thread_cflag = -D_REENTRANT $sys_id = $lflags = -ldl $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL $cpuid_obj = ppccpuid.o $bn_obj 
= bn-ppc.o $cpuid_obj = ppccpuid.o ppccap.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o $des_obj = $aes_obj = aes_core.o aes_cbc.o aes-ppc.o $bf_obj = Loading Loading @@ -3697,8 +3697,8 @@ $thread_cflag = -D_REENTRANT $sys_id = $lflags = -ldl $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL $cpuid_obj = ppccpuid.o $bn_obj = bn-ppc.o ppc-mont.o $cpuid_obj = ppccpuid.o ppccap.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o $des_obj = $aes_obj = aes_core.o aes_cbc.o aes-ppc.o $bf_obj = Loading
crypto/bn/Makefile +1 −0 Original line number Diff line number Diff line Loading @@ -103,6 +103,7 @@ pa-risc2.o: asm/pa-risc2.s # ppc - AIX, Linux, MacOS X... bn-ppc.s: asm/ppc.pl; $(PERL) asm/ppc.pl $(PERLASM_SCHEME) $@ ppc-mont.s: asm/ppc-mont.pl;$(PERL) asm/ppc-mont.pl $(PERLASM_SCHEME) $@ ppc64-mont.s: asm/ppc64-mont.pl;$(PERL) asm/ppc64-mont.pl $(PERLASM_SCHEME) $@ alpha-mont.s: asm/alpha-mont.pl $(PERL) $< | $(CC) -E - | tee $@ > /dev/null Loading
crypto/bn/asm/ppc-mont.pl +8 −3 Original line number Diff line number Diff line Loading @@ -108,14 +108,19 @@ $code=<<___; .machine "any" .text .globl .bn_mul_mont .globl .bn_mul_mont_int .align 4 .bn_mul_mont: .bn_mul_mont_int: cmpwi $num,4 mr $rp,r3 ; $rp is reassigned li r3,0 bltlr ___ $code.=<<___ if ($BNSZ==4); cmpwi $num,32 ; longer key performance is not better bgelr ___ $code.=<<___; slwi $num,$num,`log($BNSZ)/log(2)` li $tj,-4096 addi $ovf,$num,`$FRAME+$RZONE` Loading
crypto/bn/asm/ppc64-mont.pl +175 −8 Original line number Diff line number Diff line Loading @@ -45,23 +45,41 @@ # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive # in absolute terms, but it's apparently the way Power 6 is... # December 2009 # Adapted for 32-bit build this module delivers 25-120%, more for # longer keys, performance improvement on 1.8GHz PPC970. However! # This implementation utilizes even 64-bit integer operations and # trouble is that most PPC operating systems don't preserve upper # halves of general purpose registers upon signal delivery. They do # preserve them upon context switch, but not signalling:-( This means # that asynchronous signals have to be blocked upon entry to this # subroutine. Signal masking (and complementary unmasking) has quite # an impact on performance, naturally larger for shorter keys. It's # so severe that shorter key performance is as low as 1/3 of expected # one. This is why this routine should be engaged for longer key # operations only, see crypto/ppccap.c for further details. # Alternative is to break dependence on upper halves of GPRs... # MacOS X is an exception from this and doesn't require signal # masking, and that's where above improvement coefficients were # collected. $flavour = shift; if ($flavour =~ /32/) { $SIZE_T=4; $RZONE= 224; $FRAME= $SIZE_T*12+8*12; $fname= "bn_mul_mont_ppc64"; $fname= "bn_mul_mont_fpu64"; $STUX= "stwux"; # store indexed and update $PUSH= "stw"; $POP= "lwz"; die "not implemented yet"; } elsif ($flavour =~ /64/) { $SIZE_T=8; $RZONE= 288; $FRAME= $SIZE_T*12+8*12; $fname= "bn_mul_mont"; $fname= "bn_mul_mont_fpu64"; # same as above, but 64-bit mnemonics... $STUX= "stdux"; # store indexed and update Loading Loading @@ -181,14 +199,14 @@ $code=<<___; .globl .$fname .align 5 .$fname: cmpwi $num,4 cmpwi $num,`3*8/$SIZE_T` mr $rp,r3 ; $rp is reassigned li r3,0 ; possible "not handled" return code bltlr- andi. r0,$num,1 ; $num has to be even andi. 
r0,$num,`16/$SIZE_T-1` ; $num has to be "even" bnelr- slwi $num,$num,3 ; num*=8 slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG) li $i,-4096 slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num add $tp,$tp,$num ; place for tp[num+1] Loading Loading @@ -220,11 +238,25 @@ $code=<<___; stfd f23,`12*$SIZE_T+72`($sp) stfd f24,`12*$SIZE_T+80`($sp) stfd f25,`12*$SIZE_T+88`($sp) ___ $code.=<<___ if ($SIZE_T==8); ld $a0,0($ap) ; pull ap[0] value ld $n0,0($n0) ; pull n0[0] value ld $t3,0($bp) ; bp[0] ___ $code.=<<___ if ($SIZE_T==4); mr $t1,$n0 lwz $a0,0($ap) ; pull ap[0,1] value lwz $t0,4($ap) lwz $n0,0($t1) ; pull n0[0,1] value lwz $t1,4($t1) lwz $t3,0($bp) ; bp[0,1] lwz $t2,4($bp) insrdi $a0,$t0,32,0 insrdi $n0,$t1,32,0 insrdi $t3,$t2,32,0 ___ $code.=<<___; addi $tp,$sp,`$FRAME+$TRANSFER+8+64` li $i,-64 add $nap_d,$tp,$num Loading Loading @@ -258,6 +290,8 @@ $code=<<___; std $t5,`$FRAME+40`($sp) std $t6,`$FRAME+48`($sp) std $t7,`$FRAME+56`($sp) ___ $code.=<<___ if ($SIZE_T==8); lwz $t0,4($ap) ; load a[j] as 32-bit word pair lwz $t1,0($ap) lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair Loading @@ -266,6 +300,18 @@ $code=<<___; lwz $t5,0($np) lwz $t6,12($np) ; load n[j+1] as 32-bit word pair lwz $t7,8($np) ___ $code.=<<___ if ($SIZE_T==4); lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs lwz $t1,4($ap) lwz $t2,8($ap) lwz $t3,12($ap) lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs lwz $t5,4($np) lwz $t6,8($np) lwz $t7,12($np) ___ $code.=<<___; lfd $ba,`$FRAME+0`($sp) lfd $bb,`$FRAME+8`($sp) lfd $bc,`$FRAME+16`($sp) Loading Loading @@ -374,6 +420,8 @@ $code=<<___; .align 5 L1st: ___ $code.=<<___ if ($SIZE_T==8); lwz $t0,4($ap) ; load a[j] as 32-bit word pair lwz $t1,0($ap) lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair Loading @@ -382,6 +430,18 @@ L1st: lwz $t5,0($np) lwz $t6,12($np) ; load n[j+1] as 32-bit word pair lwz $t7,8($np) ___ $code.=<<___ if ($SIZE_T==4); lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs lwz $t1,4($ap) lwz $t2,8($ap) 
lwz $t3,12($ap) lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs lwz $t5,4($np) lwz $t6,8($np) lwz $t7,12($np) ___ $code.=<<___; std $t0,`$FRAME+64`($sp) std $t1,`$FRAME+72`($sp) std $t2,`$FRAME+80`($sp) Loading Loading @@ -559,7 +619,17 @@ L1st: li $i,8 ; i=1 .align 5 Louter: ___ $code.=<<___ if ($SIZE_T==8); ldx $t3,$bp,$i ; bp[i] ___ $code.=<<___ if ($SIZE_T==4); add $t0,$bp,$i lwz $t3,0($t0) ; bp[i,i+1] lwz $t0,4($t0) insrdi $t3,$t0,32,0 ___ $code.=<<___; ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] mulld $t7,$a0,$t3 ; ap[0]*bp[i] Loading Loading @@ -761,6 +831,13 @@ Linner: stfd $T0b,`$FRAME+8`($sp) add $t7,$t7,$carry addc $t3,$t0,$t1 ___ $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] extrdi $t0,$t0,32,0 extrdi $t1,$t1,32,0 adde $t0,$t0,$t1 ___ $code.=<<___; stfd $T1a,`$FRAME+16`($sp) stfd $T1b,`$FRAME+24`($sp) insrdi $t4,$t7,16,0 ; 64..127 bits Loading @@ -768,6 +845,13 @@ Linner: stfd $T2a,`$FRAME+32`($sp) stfd $T2b,`$FRAME+40`($sp) adde $t5,$t4,$t2 ___ $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] extrdi $t4,$t4,32,0 extrdi $t2,$t2,32,0 adde $t4,$t4,$t2 ___ $code.=<<___; stfd $T3a,`$FRAME+48`($sp) stfd $T3b,`$FRAME+56`($sp) addze $carry,$carry Loading Loading @@ -816,7 +900,21 @@ Linner: ld $t7,`$FRAME+72`($sp) addc $t3,$t0,$t1 ___ $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] extrdi $t0,$t0,32,0 extrdi $t1,$t1,32,0 adde $t0,$t0,$t1 ___ $code.=<<___; adde $t5,$t4,$t2 ___ $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] extrdi $t4,$t4,32,0 extrdi $t2,$t2,32,0 adde $t4,$t4,$t2 ___ $code.=<<___; addze $carry,$carry std $t3,-16($tp) ; tp[j-1] Loading @@ -835,7 +933,9 @@ Linner: subf $nap_d,$t7,$nap_d ; rewind pointer cmpw $i,$num blt- Louter ___ $code.=<<___ if ($SIZE_T==8); subf $np,$num,$np ; rewind np addi $j,$j,1 ; restore counter subfc $i,$i,$i ; j=0 and "clear" XER[CA] Loading Loading @@ -883,7 +983,74 @@ Lcopy: ; copy or in-place refresh stdx $i,$t4,$i addi $i,$i,16 bdnz- Lcopy ___ $code.=<<___ if ($SIZE_T==4); subf $np,$num,$np ; rewind np addi $j,$j,1 
; restore counter subfc $i,$i,$i ; j=0 and "clear" XER[CA] addi $tp,$sp,`$FRAME+$TRANSFER` addi $np,$np,-4 addi $rp,$rp,-4 addi $ap,$sp,`$FRAME+$TRANSFER+4` mtctr $j .align 4 Lsub: ld $t0,8($tp) ; load tp[j..j+3] in 64-bit word order ldu $t2,16($tp) lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order lwz $t5,8($np) lwz $t6,12($np) lwzu $t7,16($np) extrdi $t1,$t0,32,0 extrdi $t3,$t2,32,0 subfe $t4,$t4,$t0 ; tp[j]-np[j] stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1] stw $t1,8($ap) subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2] stw $t2,12($ap) subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3] stwu $t3,16($ap) stw $t4,4($rp) stw $t5,8($rp) stw $t6,12($rp) stwu $t7,16($rp) bdnz- Lsub li $i,0 subfe $ovf,$i,$ovf ; handle upmost overflow bit addi $tp,$sp,`$FRAME+$TRANSFER+4` subf $rp,$num,$rp ; rewind rp and $ap,$tp,$ovf andc $np,$rp,$ovf or $ap,$ap,$np ; ap=borrow?tp:rp addi $tp,$sp,`$FRAME+$TRANSFER` mtctr $j .align 4 Lcopy: ; copy or in-place refresh lwz $t0,4($ap) lwz $t1,8($ap) lwz $t2,12($ap) lwzu $t3,16($ap) std $i,8($nap_d) ; zap nap_d std $i,16($nap_d) std $i,24($nap_d) std $i,32($nap_d) std $i,40($nap_d) std $i,48($nap_d) std $i,56($nap_d) stdu $i,64($nap_d) stw $t0,4($rp) stw $t1,8($rp) stw $t2,12($rp) stwu $t3,16($rp) std $i,8($tp) ; zap tp at once stdu $i,16($tp) bdnz- Lcopy ___ $code.=<<___; $POP r14,`2*$SIZE_T`($sp) $POP r15,`3*$SIZE_T`($sp) $POP r16,`4*$SIZE_T`($sp) Loading