Loading Configure +16 −2 Original line number Diff line number Diff line Loading @@ -134,7 +134,7 @@ my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o::::::::::::void"; my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o:void"; my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o:::::::"; my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o:::::::"; my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:void"; my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o"; my $armv4_asm=":bn_asm.o armv4-mont.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o:void"; my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:32"; my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:64"; Loading Loading @@ -356,7 +356,21 @@ my %table=( "linux-ia64-ecc","ecc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-ia64-icc","icc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-x86_64", "gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64", "linux-s390x", "gcc:-m64 -DB_ENDIAN -DTERMIO 
-O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:${s390x_asm}:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64", "linux64-s390x", "gcc:-m64 -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:${s390x_asm}:64:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64", #### So called "highgprs" target for z/Architecture CPUs # "Highgprs" is kernel feature first implemented in Linux 2.6.32, see # /proc/cpuinfo. The idea is to preserve most significant bits of # general purpose registers not only upon 32-bit process context # switch, but even on asynchronous signal delivery to such process. # This makes it possible to deploy 64-bit instructions even in legacy # application context and achieve better [or should we say adequate] # performance. The build is binary compatible with linux-generic32, # and the idea is to be able to install the resulting libcrypto.so # alongside generic one, e.g. as /lib/highgprs/libcrypto.so.x.y, for # ldconfig and run-time linker to autodiscover. Unfortunately it # doesn't work just yet, because of couple of bugs in glibc # sysdep/s390/dl-procinfo.c affecting ldconfig and ld.so.1... "linux32-s390x", "gcc:-m31 -Wa,-mzarch -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:s390xcap.o s390xcpuid.o:bn_asm.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:31:dlfcn:linux-shared:-fPIC:-m31:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/highgprs", #### SPARC Linux setups # Ray Miller <ray.miller@computing-services.oxford.ac.uk> has patiently # assisted with debugging of following two configs. 
Loading config +12 −1 Original line number Diff line number Diff line Loading @@ -629,7 +629,18 @@ case "$GUESSOS" in sh*-*-linux2) OUT="linux-generic32"; options="$options -DL_ENDIAN" ;; m68k*-*-linux2) OUT="linux-generic32"; options="$options -DB_ENDIAN" ;; s390-*-linux2) OUT="linux-generic32"; options="$options -DB_ENDIAN" ;; s390x-*-linux2) OUT="linux-s390x" ;; s390x-*-linux2) # To be uncommented when glibc bug is fixed, see Configure... #if egrep -e '^features.* highgprs' /proc/cpuinfo >/dev/null ; then # echo "WARNING! If you wish to build \"highgprs\" 32-bit library, then you" # echo " have to invoke './Configure linux32-s390x' *manually*." # if [ "$TEST" = "false" -a -t -1 ]; then # echo " You have about 5 seconds to press Ctrl-C to abort." # (trap "stty `stty -g`" 2 0; stty -icanon min 0 time 50; read waste) <&1 # fi #fi OUT="linux64-s390x" ;; x86_64-*-linux?) OUT="linux-x86_64" ;; *86-*-linux2) OUT="linux-elf" if [ "$GCCVER" -gt 28 ]; then Loading crypto/aes/asm/aes-s390x.pl +95 −72 Original line number Diff line number Diff line Loading @@ -60,6 +60,26 @@ # maximum, but *on average* it would be as much as ~98%. Meaning that # worst case is unlike, it's like hitting ravine on plateau. # November 2010. # # Adapt for -m31 build. If kernel supports what's called "highgprs" # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit # instructions and achieve "64-bit" performance even in 31-bit legacy # application context. The feature is not specific to any particular # processor, as long as it's "z-CPU". Latter implies that the code # remains z/Architecture specific. On z990 it was measured to perform # 2x better than code generated by gcc 4.3. 
$flavour = shift; if ($flavour =~ /3[12]/) { $SIZE_T=4; $g=""; } else { $SIZE_T=8; $g="g"; } while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; Loading @@ -82,6 +102,8 @@ $rounds="%r13"; $ra="%r14"; $sp="%r15"; $stdframe=16*$SIZE_T+4*8; sub _data_word() { my $i; while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } Loading Loading @@ -223,7 +245,7 @@ $code.=<<___ if (!$softonly); .Lesoft: ___ $code.=<<___; stmg %r3,$ra,24($sp) stm${g} %r3,$ra,3*$SIZE_T($sp) llgf $s0,0($inp) llgf $s1,4($inp) Loading @@ -233,20 +255,20 @@ $code.=<<___; larl $tbl,AES_Te bras $ra,_s390x_AES_encrypt lg $out,24($sp) l${g} $out,3*$SIZE_T($sp) st $s0,0($out) st $s1,4($out) st $s2,8($out) st $s3,12($out) lmg %r6,$ra,48($sp) lm${g} %r6,$ra,6*$SIZE_T($sp) br $ra .size AES_encrypt,.-AES_encrypt .type _s390x_AES_encrypt,\@function .align 16 _s390x_AES_encrypt: stg $ra,152($sp) st${g} $ra,`$stdframe-$SIZE_T`($sp) x $s0,0($key) x $s1,4($key) x $s2,8($key) Loading Loading @@ -410,7 +432,7 @@ _s390x_AES_encrypt: or $s2,$i3 or $s3,$t3 lg $ra,152($sp) l${g} $ra,`$stdframe-$SIZE_T`($sp) xr $s0,$t0 xr $s1,$t2 x $s2,24($key) Loading Loading @@ -549,7 +571,7 @@ $code.=<<___ if (!$softonly); .Ldsoft: ___ $code.=<<___; stmg %r3,$ra,24($sp) stm${g} %r3,$ra,3*$SIZE_T($sp) llgf $s0,0($inp) llgf $s1,4($inp) Loading @@ -559,20 +581,20 @@ $code.=<<___; larl $tbl,AES_Td bras $ra,_s390x_AES_decrypt lg $out,24($sp) l${g} $out,3*$SIZE_T($sp) st $s0,0($out) st $s1,4($out) st $s2,8($out) st $s3,12($out) lmg %r6,$ra,48($sp) lm${g} %r6,$ra,6*$SIZE_T($sp) br $ra .size AES_decrypt,.-AES_decrypt .type _s390x_AES_decrypt,\@function .align 16 _s390x_AES_decrypt: stg $ra,152($sp) st${g} $ra,`$stdframe-$SIZE_T`($sp) x $s0,0($key) x $s1,4($key) x $s2,8($key) Loading Loading @@ -716,7 +738,7 @@ _s390x_AES_decrypt: nr $i1,$mask nr $i2,$mask lg $ra,152($sp) l${g} $ra,`$stdframe-$SIZE_T`($sp) or $s1,$t1 l $t0,16($key) l $t1,20($key) Loading Loading @@ -750,9 +772,9 @@ 
$code.=<<___; .align 16 AES_set_encrypt_key: lghi $t0,0 clgr $inp,$t0 cl${g}r $inp,$t0 je .Lminus1 clgr $key,$t0 cl${g}r $key,$t0 je .Lminus1 lghi $t0,128 Loading Loading @@ -810,7 +832,7 @@ ___ $code.=<<___; .align 16 .Lekey_internal: stmg %r6,%r13,48($sp) # all non-volatile regs stm${g} %r6,%r13,6*$SIZE_T($sp) # all non-volatile regs larl $tbl,AES_Te+2048 Loading Loading @@ -871,7 +893,7 @@ $code.=<<___; la $t3,4($t3) # i++ brct $rounds,.L128_loop lghi %r2,0 lmg %r6,%r13,48($sp) lm${g} %r6,%r13,6*$SIZE_T($sp) br $ra .align 16 Loading Loading @@ -919,7 +941,7 @@ $code.=<<___; st $s3,36($key) brct $rounds,.L192_continue lghi %r2,0 lmg %r6,%r13,48($sp) lm${g} %r6,%r13,6*$SIZE_T($sp) br $ra .align 16 Loading Loading @@ -981,7 +1003,7 @@ $code.=<<___; st $s3,44($key) brct $rounds,.L256_continue lghi %r2,0 lmg %r6,%r13,48($sp) lm${g} %r6,%r13,6*$SIZE_T($sp) br $ra .align 16 Loading Loading @@ -1032,11 +1054,11 @@ $code.=<<___; .type AES_set_decrypt_key,\@function .align 16 AES_set_decrypt_key: stg $key,32($sp) # I rely on AES_set_encrypt_key to stg $ra,112($sp) # save non-volatile registers! st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers! bras $ra,AES_set_encrypt_key lg $key,32($sp) lg $ra,112($sp) l${g} $key,4*$SIZE_T($sp) l${g} $ra,14*$SIZE_T($sp) ltgr %r2,%r2 bnzr $ra ___ Loading @@ -1051,11 +1073,11 @@ $code.=<<___ if (!$softonly); .align 16 .Ldkey_internal: stg $key,32($sp) stg $ra,40($sp) st${g} $key,4*$SIZE_T($sp) st${g} $ra,14*$SIZE_T($sp) bras $ra,.Lekey_internal lg $key,32($sp) lg $ra,40($sp) l${g} $key,4*$SIZE_T($sp) l${g} $ra,14*$SIZE_T($sp) ___ $code.=<<___; Loading Loading @@ -1136,7 +1158,7 @@ $code.=<<___; la $key,4($key) brct $rounds,.Lmix lmg %r6,%r13,48($sp)# as was saved by AES_set_encrypt_key! lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key! 
lghi %r2,0 br $ra .size AES_set_decrypt_key,.-AES_set_decrypt_key Loading Loading @@ -1176,7 +1198,7 @@ $code.=<<___ if (!$softonly); l %r0,240($key) # load kmc code lghi $key,15 # res=len%16, len-=res; ngr $key,$len slgr $len,$key sl${g}r $len,$key la %r1,16($sp) # parameter block - ivec || key jz .Lkmc_truncated .long 0xb92f0042 # kmc %r4,%r2 Loading @@ -1194,34 +1216,34 @@ $code.=<<___ if (!$softonly); tmll %r0,0x80 jnz .Lkmc_truncated_dec lghi %r1,0 stg %r1,128($sp) stg %r1,136($sp) stg %r1,16*$SIZE_T($sp) stg %r1,16*$SIZE_T+8($sp) bras %r1,1f mvc 128(1,$sp),0($inp) mvc 16*$SIZE_T(1,$sp),0($inp) 1: ex $key,0(%r1) la %r1,16($sp) # restore parameter block la $inp,128($sp) la $inp,16*$SIZE_T($sp) lghi $len,16 .long 0xb92f0042 # kmc %r4,%r2 j .Lkmc_done .align 16 .Lkmc_truncated_dec: stg $out,64($sp) la $out,128($sp) st${g} $out,4*$SIZE_T($sp) la $out,16*$SIZE_T($sp) lghi $len,16 .long 0xb92f0042 # kmc %r4,%r2 lg $out,64($sp) l${g} $out,4*$SIZE_T($sp) bras %r1,2f mvc 0(1,$out),128($sp) mvc 0(1,$out),16*$SIZE_T($sp) 2: ex $key,0(%r1) j .Lkmc_done .align 16 .Lcbc_software: ___ $code.=<<___; stmg $key,$ra,40($sp) stm${g} $key,$ra,5*$SIZE_T($sp) lhi %r0,0 cl %r0,164($sp) cl %r0,`$stdframe+$SIZE_T-4`($sp) je .Lcbc_decrypt larl $tbl,AES_Te Loading @@ -1232,10 +1254,10 @@ $code.=<<___; llgf $s3,12($ivp) lghi $t0,16 slgr $len,$t0 sl${g}r $len,$t0 brc 4,.Lcbc_enc_tail # if borrow .Lcbc_enc_loop: stmg $inp,$out,16($sp) stm${g} $inp,$out,2*$SIZE_T($sp) x $s0,0($inp) x $s1,4($inp) x $s2,8($inp) Loading @@ -1244,7 +1266,7 @@ $code.=<<___; bras $ra,_s390x_AES_encrypt lmg $inp,$key,16($sp) lm${g} $inp,$key,2*$SIZE_T($sp) st $s0,0($out) st $s1,4($out) st $s2,8($out) Loading @@ -1253,33 +1275,33 @@ $code.=<<___; la $inp,16($inp) la $out,16($out) lghi $t0,16 ltgr $len,$len lt${g}r $len,$len jz .Lcbc_enc_done slgr $len,$t0 sl${g}r $len,$t0 brc 4,.Lcbc_enc_tail # if borrow j .Lcbc_enc_loop .align 16 .Lcbc_enc_done: lg $ivp,48($sp) l${g} $ivp,6*$SIZE_T($sp) st $s0,0($ivp) st 
$s1,4($ivp) st $s2,8($ivp) st $s3,12($ivp) lmg %r7,$ra,56($sp) lm${g} %r7,$ra,7*$SIZE_T($sp) br $ra .align 16 .Lcbc_enc_tail: aghi $len,15 lghi $t0,0 stg $t0,128($sp) stg $t0,136($sp) stg $t0,16*$SIZE_T($sp) stg $t0,16*$SIZE_T+8($sp) bras $t1,3f mvc 128(1,$sp),0($inp) mvc 16*$SIZE_T(1,$sp),0($inp) 3: ex $len,0($t1) lghi $len,0 la $inp,128($sp) la $inp,16*$SIZE_T($sp) j .Lcbc_enc_loop .align 16 Loading @@ -1288,10 +1310,10 @@ $code.=<<___; lg $t0,0($ivp) lg $t1,8($ivp) stmg $t0,$t1,128($sp) stmg $t0,$t1,16*$SIZE_T($sp) .Lcbc_dec_loop: stmg $inp,$out,16($sp) stm${g} $inp,$out,2*$SIZE_T($sp) llgf $s0,0($inp) llgf $s1,4($inp) llgf $s2,8($inp) Loading @@ -1300,7 +1322,7 @@ $code.=<<___; bras $ra,_s390x_AES_decrypt lmg $inp,$key,16($sp) lm${g} $inp,$key,2*$SIZE_T($sp) sllg $s0,$s0,32 sllg $s2,$s2,32 lr $s0,$s1 Loading @@ -1308,15 +1330,15 @@ $code.=<<___; lg $t0,0($inp) lg $t1,8($inp) xg $s0,128($sp) xg $s2,136($sp) xg $s0,16*$SIZE_T($sp) xg $s2,16*$SIZE_T+8($sp) lghi $s1,16 slgr $len,$s1 sl${g}r $len,$s1 brc 4,.Lcbc_dec_tail # if borrow brc 2,.Lcbc_dec_done # if zero stg $s0,0($out) stg $s2,8($out) stmg $t0,$t1,128($sp) stmg $t0,$t1,16*$SIZE_T($sp) la $inp,16($inp) la $out,16($out) Loading @@ -1326,7 +1348,7 @@ $code.=<<___; stg $s0,0($out) stg $s2,8($out) .Lcbc_dec_exit: lmg $ivp,$ra,48($sp) lm${g} %r6,$ra,6*$SIZE_T($sp) stmg $t0,$t1,0($ivp) br $ra Loading @@ -1334,10 +1356,10 @@ $code.=<<___; .align 16 .Lcbc_dec_tail: aghi $len,15 stg $s0,128($sp) stg $s2,136($sp) stg $s0,16*$SIZE_T($sp) stg $s2,16*$SIZE_T+8($sp) bras $s1,4f mvc 0(1,$out),128($sp) mvc 0(1,$out),16*$SIZE_T($sp) 4: ex $len,0($s1) j .Lcbc_dec_exit .size AES_cbc_encrypt,.-AES_cbc_encrypt Loading @@ -1359,6 +1381,7 @@ $code.=<<___; .type AES_ctr32_encrypt,\@function .align 16 AES_ctr32_encrypt: llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case ___ $code.=<<___ if (!$softonly); l %r0,240($key) Loading @@ -1366,7 +1389,7 @@ $code.=<<___ if (!$softonly); clr %r0,%r1 jl .Lctr32_software stmg 
%r6,$s3,48($sp) stm${g} %r6,$s3,6*$SIZE_T($sp) slgr $out,$inp la %r1,0($key) # %r1 is permanent copy of $key Loading @@ -1388,14 +1411,14 @@ $code.=<<___ if (!$softonly); la $sp,1024($s0) # alloca srlg $fp,$fp,4 # convert bytes to blocks, minimum 16 stg $s2,0($sp) # back-chain stg $fp,8($sp) st${g} $s2,0($sp) # back-chain st${g} $fp,$SIZE_T($sp) slgr $len,$fp brc 1,.Lctr32_hw_loop # not zero, no borrow algr $fp,$len # input is shorter than allocated buffer lghi $len,0 stg $fp,8($sp) st${g} $fp,$SIZE_T($sp) .Lctr32_hw_loop: la $s2,16($sp) Loading Loading @@ -1432,8 +1455,8 @@ $code.=<<___ if (!$softonly); lghi $len,0 brc 4+1,.Lctr32_hw_loop # not zero lg $s0,0($sp) lg $s1,8($sp) l${g} $s0,0($sp) l${g} $s1,$SIZE_T($sp) la $s2,16($sp) .Lctr32_hw_zap: stg $s0,0($s2) Loading @@ -1442,30 +1465,30 @@ $code.=<<___ if (!$softonly); brct $s1,.Lctr32_hw_zap la $sp,0($s0) lmg %r6,$s3,48($sp) lm${g} %r6,$s3,6*$SIZE_T($sp) br $ra .align 16 .Lctr32_software: ___ $code.=<<___; stmg $key,$ra,40($sp) slgr $out,$inp stm${g} $key,$ra,5*$SIZE_T($sp) sl${g}r $out,$inp larl $tbl,AES_Te llgf $t1,12($ivp) .Lctr32_loop: stmg $inp,$len,16($sp) stm${g} $inp,$len,2*$SIZE_T($sp) llgf $s0,0($ivp) llgf $s1,4($ivp) llgf $s2,8($ivp) lgr $s3,$t1 st $t1,128($sp) st $t1,16*$SIZE_T($sp) lgr %r4,$key bras $ra,_s390x_AES_encrypt lmg $inp,$ivp,16($sp) llgf $t1,128($sp) lm${g} $inp,$ivp,2*$SIZE_T($sp) llgf $t1,16*$SIZE_T($sp) x $s0,0($inp) x $s1,4($inp) x $s2,8($inp) Loading @@ -1479,7 +1502,7 @@ $code.=<<___; ahi $t1,1 # 32-bit increment brct $len,.Lctr32_loop lmg %r6,$ra,48($sp) lm${g} %r6,$ra,6*$SIZE_T($sp) br $ra .size AES_ctr32_encrypt,.-AES_ctr32_encrypt ___ Loading crypto/bn/asm/s390x-mont.pl +74 −25 Original line number Diff line number Diff line Loading @@ -32,9 +32,33 @@ # Reschedule to minimize/avoid Address Generation Interlock hazard, # make inner loops counter-based. # November 2010. # # Adapt for -m31 build. 
If kernel supports what's called "highgprs" # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit # instructions and achieve "64-bit" performance even in 31-bit legacy # application context. The feature is not specific to any particular # processor, as long as it's "z-CPU". Latter implies that the code # remains z/Architecture specific. Compatibility with 32-bit BN_ULONG # is achieved by swapping words after 64-bit loads, follow _dswap-s. # On z990 it was measured to perform 2.6-2.2 times better, less for # longer keys... $flavour = shift; if ($flavour =~ /3[12]/) { $SIZE_T=4; $g=""; } else { $SIZE_T=8; $g="g"; } while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; $stdframe=16*$SIZE_T+4*8; $mn0="%r0"; $num="%r1"; Loading Loading @@ -63,34 +87,44 @@ $code.=<<___; .globl bn_mul_mont .type bn_mul_mont,\@function bn_mul_mont: lgf $num,164($sp) # pull $num sla $num,3 # $num to enumerate bytes lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes la $bp,0($num,$bp) stg %r2,16($sp) st${g} %r2,2*$SIZE_T($sp) cghi $num,16 # lghi %r2,0 # blr %r14 # if($num<16) return 0; ___ $code.=<<___ if ($flavour =~ /3[12]/); tmll $num,4 bnzr %r14 # if ($num&1) return 0; ___ $code.=<<___ if ($flavour !~ /3[12]/); cghi $num,128 # bhr %r14 # if($num>128) return 0; ___ $code.=<<___; stm${g} %r3,%r15,3*$SIZE_T($sp) stmg %r3,%r15,24($sp) lghi $rp,-160-8 # leave room for carry bit lghi $rp,-$stdframe-8 # leave room for carry bit lcgr $j,$num # -$num lgr %r0,$sp la $rp,0($rp,$sp) la $sp,0($j,$rp) # alloca stg %r0,0($sp) # back chain st${g} %r0,0($sp) # back chain sra $num,3 # restore $num la $bp,0($j,$bp) # restore $bp ahi $num,-1 # adjust $num for inner loop lg $n0,0($n0) # pull n0 _dswap $n0 lg $bi,0($bp) _dswap $bi lg $alo,0($ap) _dswap $alo mlgr $ahi,$bi # ap[0]*bp[0] lgr $AHI,$ahi Loading @@ -98,6 +132,7 @@ bn_mul_mont: msgr $mn0,$n0 lg $nlo,0($np) # _dswap $nlo mlgr $nhi,$mn0 # np[0]*m1 algr 
$nlo,$alo # +="tp[0]" lghi $NHI,0 Loading @@ -109,12 +144,14 @@ bn_mul_mont: .align 16 .L1st: lg $alo,0($j,$ap) _dswap $alo mlgr $ahi,$bi # ap[j]*bp[0] algr $alo,$AHI lghi $AHI,0 alcgr $AHI,$ahi lg $nlo,0($j,$np) _dswap $nlo mlgr $nhi,$mn0 # np[j]*m1 algr $nlo,$NHI lghi $NHI,0 Loading @@ -122,22 +159,24 @@ bn_mul_mont: algr $nlo,$alo alcgr $NHI,$nhi stg $nlo,160-8($j,$sp) # tp[j-1]= stg $nlo,$stdframe-8($j,$sp) # tp[j-1]= la $j,8($j) # j++ brct $count,.L1st algr $NHI,$AHI lghi $AHI,0 alcgr $AHI,$AHI # upmost overflow bit stg $NHI,160-8($j,$sp) stg $AHI,160($j,$sp) stg $NHI,$stdframe-8($j,$sp) stg $AHI,$stdframe($j,$sp) la $bp,8($bp) # bp++ .Louter: lg $bi,0($bp) # bp[i] _dswap $bi lg $alo,0($ap) _dswap $alo mlgr $ahi,$bi # ap[0]*bp[i] alg $alo,160($sp) # +=tp[0] alg $alo,$stdframe($sp) # +=tp[0] lghi $AHI,0 alcgr $AHI,$ahi Loading @@ -145,6 +184,7 @@ bn_mul_mont: msgr $mn0,$n0 # tp[0]*n0 lg $nlo,0($np) # np[0] _dswap $nlo mlgr $nhi,$mn0 # np[0]*m1 algr $nlo,$alo # +="tp[0]" lghi $NHI,0 Loading @@ -156,14 +196,16 @@ bn_mul_mont: .align 16 .Linner: lg $alo,0($j,$ap) _dswap $alo mlgr $ahi,$bi # ap[j]*bp[i] algr $alo,$AHI lghi $AHI,0 alcgr $ahi,$AHI alg $alo,160($j,$sp)# +=tp[j] alg $alo,$stdframe($j,$sp)# +=tp[j] alcgr $AHI,$ahi lg $nlo,0($j,$np) _dswap $nlo mlgr $nhi,$mn0 # np[j]*m1 algr $nlo,$NHI lghi $NHI,0 Loading @@ -171,31 +213,33 @@ bn_mul_mont: algr $nlo,$alo # +="tp[j]" alcgr $NHI,$nhi stg $nlo,160-8($j,$sp) # tp[j-1]= stg $nlo,$stdframe-8($j,$sp) # tp[j-1]= la $j,8($j) # j++ brct $count,.Linner algr $NHI,$AHI lghi $AHI,0 alcgr $AHI,$AHI alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit lghi $ahi,0 alcgr $AHI,$ahi # new upmost overflow bit stg $NHI,160-8($j,$sp) stg $AHI,160($j,$sp) stg $NHI,$stdframe-8($j,$sp) stg $AHI,$stdframe($j,$sp) la $bp,8($bp) # bp++ clg $bp,160+8+32($j,$sp) # compare to &bp[num] cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num] jne .Louter 
lg $rp,160+8+16($j,$sp) # reincarnate rp la $ap,160($sp) l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp la $ap,$stdframe($sp) ahi $num,1 # restore $num, incidentally clears "borrow" la $j,0(%r0) lr $count,$num .Lsub: lg $alo,0($j,$ap) slbg $alo,0($j,$np) lg $nlo,0($j,$np) _dswap $nlo slbgr $alo,$nlo stg $alo,0($j,$rp) la $j,8($j) brct $count,.Lsub Loading @@ -211,18 +255,23 @@ bn_mul_mont: la $j,0(%r0) lgr $count,$num .Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh stg $j,160($j,$sp) # zap tp _dswap $alo stg $j,$stdframe($j,$sp) # zap tp stg $alo,0($j,$rp) la $j,8($j) brct $count,.Lcopy la %r1,160+8+48($j,$sp) lmg %r6,%r15,0(%r1) la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp) lm${g} %r6,%r15,0(%r1) lghi %r2,1 # signal "processed" br %r14 .size bn_mul_mont,.-bn_mul_mont .string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" ___ print $code; foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/ge; s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e; print $_,"\n"; } close STDOUT; crypto/modes/asm/ghash-s390x.pl +26 −3 Original line number Diff line number Diff line Loading @@ -18,6 +18,26 @@ # and the result should be close to 12. In the lack of instruction- # level profiling data it's impossible to tell why... # November 2010. # # Adapt for -m31 build. If kernel supports what's called "highgprs" # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit # instructions and achieve "64-bit" performance even in 31-bit legacy # application context. The feature is not specific to any particular # processor, as long as it's "z-CPU". Latter implies that the code # remains z/Architecture specific. On z990 it was measured to perform # 2.8x better than 32-bit code generated by gcc 4.3. 
$flavour = shift; if ($flavour =~ /3[12]/) { $SIZE_T=4; $g=""; } else { $SIZE_T=8; $g="g"; } while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; Loading Loading @@ -74,7 +94,7 @@ $code.=<<___ if(!$softonly); .Lsoft_gmult: ___ $code.=<<___; stmg %r6,%r14,48($sp) stm${g} %r6,%r14,6*$SIZE_T($sp) aghi $Xi,-1 lghi $len,1 Loading Loading @@ -109,8 +129,11 @@ $code.=<<___ if(!$softonly); .align 32 .Lsoft_ghash: ___ $code.=<<___ if ($flavour =~ /3[12]/); llgfr $len,$len ___ $code.=<<___; stmg %r6,%r14,48($sp) stm${g} %r6,%r14,6*$SIZE_T($sp) aghi $Xi,-1 srlg $len,$len,4 Loading Loading @@ -209,7 +232,7 @@ $code.=<<___; xgr $Zhi,$tmp stg $Zlo,8+1($Xi) stg $Zhi,0+1($Xi) lmg %r6,%r14,48($sp) lm${g} %r6,%r14,6*$SIZE_T($sp) br %r14 .type gcm_ghash_4bit,\@function .size gcm_ghash_4bit,(.-gcm_ghash_4bit) Loading Loading
Configure +16 −2 Original line number Diff line number Diff line Loading @@ -134,7 +134,7 @@ my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o::::::::::::void"; my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o:void"; my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o:::::::"; my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o:::::::"; my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:void"; my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o"; my $armv4_asm=":bn_asm.o armv4-mont.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o:void"; my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:32"; my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:64"; Loading Loading @@ -356,7 +356,21 @@ my %table=( "linux-ia64-ecc","ecc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-ia64-icc","icc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-x86_64", "gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64", "linux-s390x", "gcc:-m64 -DB_ENDIAN -DTERMIO -O3 
-Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:${s390x_asm}:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64", "linux64-s390x", "gcc:-m64 -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:${s390x_asm}:64:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64", #### So called "highgprs" target for z/Architecture CPUs # "Highgprs" is kernel feature first implemented in Linux 2.6.32, see # /proc/cpuinfo. The idea is to preserve most significant bits of # general purpose registers not only upon 32-bit process context # switch, but even on asynchronous signal delivery to such process. # This makes it possible to deploy 64-bit instructions even in legacy # application context and achieve better [or should we say adequate] # performance. The build is binary compatible with linux-generic32, # and the idea is to be able to install the resulting libcrypto.so # alongside generic one, e.g. as /lib/highgprs/libcrypto.so.x.y, for # ldconfig and run-time linker to autodiscover. Unfortunately it # doesn't work just yet, because of couple of bugs in glibc # sysdep/s390/dl-procinfo.c affecting ldconfig and ld.so.1... "linux32-s390x", "gcc:-m31 -Wa,-mzarch -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:s390xcap.o s390xcpuid.o:bn_asm.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:31:dlfcn:linux-shared:-fPIC:-m31:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/highgprs", #### SPARC Linux setups # Ray Miller <ray.miller@computing-services.oxford.ac.uk> has patiently # assisted with debugging of following two configs. Loading
config +12 −1 Original line number Diff line number Diff line Loading @@ -629,7 +629,18 @@ case "$GUESSOS" in sh*-*-linux2) OUT="linux-generic32"; options="$options -DL_ENDIAN" ;; m68k*-*-linux2) OUT="linux-generic32"; options="$options -DB_ENDIAN" ;; s390-*-linux2) OUT="linux-generic32"; options="$options -DB_ENDIAN" ;; s390x-*-linux2) OUT="linux-s390x" ;; s390x-*-linux2) # To be uncommented when glibc bug is fixed, see Configure... #if egrep -e '^features.* highgprs' /proc/cpuinfo >/dev/null ; then # echo "WARNING! If you wish to build \"highgprs\" 32-bit library, then you" # echo " have to invoke './Configure linux32-s390x' *manually*." # if [ "$TEST" = "false" -a -t -1 ]; then # echo " You have about 5 seconds to press Ctrl-C to abort." # (trap "stty `stty -g`" 2 0; stty -icanon min 0 time 50; read waste) <&1 # fi #fi OUT="linux64-s390x" ;; x86_64-*-linux?) OUT="linux-x86_64" ;; *86-*-linux2) OUT="linux-elf" if [ "$GCCVER" -gt 28 ]; then Loading
crypto/aes/asm/aes-s390x.pl +95 −72 Original line number Diff line number Diff line Loading @@ -60,6 +60,26 @@ # maximum, but *on average* it would be as much as ~98%. Meaning that # worst case is unlike, it's like hitting ravine on plateau. # November 2010. # # Adapt for -m31 build. If kernel supports what's called "highgprs" # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit # instructions and achieve "64-bit" performance even in 31-bit legacy # application context. The feature is not specific to any particular # processor, as long as it's "z-CPU". Latter implies that the code # remains z/Architecture specific. On z990 it was measured to perform # 2x better than code generated by gcc 4.3. $flavour = shift; if ($flavour =~ /3[12]/) { $SIZE_T=4; $g=""; } else { $SIZE_T=8; $g="g"; } while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; Loading @@ -82,6 +102,8 @@ $rounds="%r13"; $ra="%r14"; $sp="%r15"; $stdframe=16*$SIZE_T+4*8; sub _data_word() { my $i; while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } Loading Loading @@ -223,7 +245,7 @@ $code.=<<___ if (!$softonly); .Lesoft: ___ $code.=<<___; stmg %r3,$ra,24($sp) stm${g} %r3,$ra,3*$SIZE_T($sp) llgf $s0,0($inp) llgf $s1,4($inp) Loading @@ -233,20 +255,20 @@ $code.=<<___; larl $tbl,AES_Te bras $ra,_s390x_AES_encrypt lg $out,24($sp) l${g} $out,3*$SIZE_T($sp) st $s0,0($out) st $s1,4($out) st $s2,8($out) st $s3,12($out) lmg %r6,$ra,48($sp) lm${g} %r6,$ra,6*$SIZE_T($sp) br $ra .size AES_encrypt,.-AES_encrypt .type _s390x_AES_encrypt,\@function .align 16 _s390x_AES_encrypt: stg $ra,152($sp) st${g} $ra,`$stdframe-$SIZE_T`($sp) x $s0,0($key) x $s1,4($key) x $s2,8($key) Loading Loading @@ -410,7 +432,7 @@ _s390x_AES_encrypt: or $s2,$i3 or $s3,$t3 lg $ra,152($sp) l${g} $ra,`$stdframe-$SIZE_T`($sp) xr $s0,$t0 xr $s1,$t2 x $s2,24($key) Loading Loading @@ -549,7 +571,7 @@ $code.=<<___ if (!$softonly); .Ldsoft: ___ $code.=<<___; stmg %r3,$ra,24($sp) stm${g} 
%r3,$ra,3*$SIZE_T($sp) llgf $s0,0($inp) llgf $s1,4($inp) Loading @@ -559,20 +581,20 @@ $code.=<<___; larl $tbl,AES_Td bras $ra,_s390x_AES_decrypt lg $out,24($sp) l${g} $out,3*$SIZE_T($sp) st $s0,0($out) st $s1,4($out) st $s2,8($out) st $s3,12($out) lmg %r6,$ra,48($sp) lm${g} %r6,$ra,6*$SIZE_T($sp) br $ra .size AES_decrypt,.-AES_decrypt .type _s390x_AES_decrypt,\@function .align 16 _s390x_AES_decrypt: stg $ra,152($sp) st${g} $ra,`$stdframe-$SIZE_T`($sp) x $s0,0($key) x $s1,4($key) x $s2,8($key) Loading Loading @@ -716,7 +738,7 @@ _s390x_AES_decrypt: nr $i1,$mask nr $i2,$mask lg $ra,152($sp) l${g} $ra,`$stdframe-$SIZE_T`($sp) or $s1,$t1 l $t0,16($key) l $t1,20($key) Loading Loading @@ -750,9 +772,9 @@ $code.=<<___; .align 16 AES_set_encrypt_key: lghi $t0,0 clgr $inp,$t0 cl${g}r $inp,$t0 je .Lminus1 clgr $key,$t0 cl${g}r $key,$t0 je .Lminus1 lghi $t0,128 Loading Loading @@ -810,7 +832,7 @@ ___ $code.=<<___; .align 16 .Lekey_internal: stmg %r6,%r13,48($sp) # all non-volatile regs stm${g} %r6,%r13,6*$SIZE_T($sp) # all non-volatile regs larl $tbl,AES_Te+2048 Loading Loading @@ -871,7 +893,7 @@ $code.=<<___; la $t3,4($t3) # i++ brct $rounds,.L128_loop lghi %r2,0 lmg %r6,%r13,48($sp) lm${g} %r6,%r13,6*$SIZE_T($sp) br $ra .align 16 Loading Loading @@ -919,7 +941,7 @@ $code.=<<___; st $s3,36($key) brct $rounds,.L192_continue lghi %r2,0 lmg %r6,%r13,48($sp) lm${g} %r6,%r13,6*$SIZE_T($sp) br $ra .align 16 Loading Loading @@ -981,7 +1003,7 @@ $code.=<<___; st $s3,44($key) brct $rounds,.L256_continue lghi %r2,0 lmg %r6,%r13,48($sp) lm${g} %r6,%r13,6*$SIZE_T($sp) br $ra .align 16 Loading Loading @@ -1032,11 +1054,11 @@ $code.=<<___; .type AES_set_decrypt_key,\@function .align 16 AES_set_decrypt_key: stg $key,32($sp) # I rely on AES_set_encrypt_key to stg $ra,112($sp) # save non-volatile registers! st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers! 
bras $ra,AES_set_encrypt_key lg $key,32($sp) lg $ra,112($sp) l${g} $key,4*$SIZE_T($sp) l${g} $ra,14*$SIZE_T($sp) ltgr %r2,%r2 bnzr $ra ___ Loading @@ -1051,11 +1073,11 @@ $code.=<<___ if (!$softonly); .align 16 .Ldkey_internal: stg $key,32($sp) stg $ra,40($sp) st${g} $key,4*$SIZE_T($sp) st${g} $ra,14*$SIZE_T($sp) bras $ra,.Lekey_internal lg $key,32($sp) lg $ra,40($sp) l${g} $key,4*$SIZE_T($sp) l${g} $ra,14*$SIZE_T($sp) ___ $code.=<<___; Loading Loading @@ -1136,7 +1158,7 @@ $code.=<<___; la $key,4($key) brct $rounds,.Lmix lmg %r6,%r13,48($sp)# as was saved by AES_set_encrypt_key! lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key! lghi %r2,0 br $ra .size AES_set_decrypt_key,.-AES_set_decrypt_key Loading Loading @@ -1176,7 +1198,7 @@ $code.=<<___ if (!$softonly); l %r0,240($key) # load kmc code lghi $key,15 # res=len%16, len-=res; ngr $key,$len slgr $len,$key sl${g}r $len,$key la %r1,16($sp) # parameter block - ivec || key jz .Lkmc_truncated .long 0xb92f0042 # kmc %r4,%r2 Loading @@ -1194,34 +1216,34 @@ $code.=<<___ if (!$softonly); tmll %r0,0x80 jnz .Lkmc_truncated_dec lghi %r1,0 stg %r1,128($sp) stg %r1,136($sp) stg %r1,16*$SIZE_T($sp) stg %r1,16*$SIZE_T+8($sp) bras %r1,1f mvc 128(1,$sp),0($inp) mvc 16*$SIZE_T(1,$sp),0($inp) 1: ex $key,0(%r1) la %r1,16($sp) # restore parameter block la $inp,128($sp) la $inp,16*$SIZE_T($sp) lghi $len,16 .long 0xb92f0042 # kmc %r4,%r2 j .Lkmc_done .align 16 .Lkmc_truncated_dec: stg $out,64($sp) la $out,128($sp) st${g} $out,4*$SIZE_T($sp) la $out,16*$SIZE_T($sp) lghi $len,16 .long 0xb92f0042 # kmc %r4,%r2 lg $out,64($sp) l${g} $out,4*$SIZE_T($sp) bras %r1,2f mvc 0(1,$out),128($sp) mvc 0(1,$out),16*$SIZE_T($sp) 2: ex $key,0(%r1) j .Lkmc_done .align 16 .Lcbc_software: ___ $code.=<<___; stmg $key,$ra,40($sp) stm${g} $key,$ra,5*$SIZE_T($sp) lhi %r0,0 cl %r0,164($sp) cl %r0,`$stdframe+$SIZE_T-4`($sp) je .Lcbc_decrypt larl $tbl,AES_Te Loading @@ -1232,10 +1254,10 @@ $code.=<<___; llgf $s3,12($ivp) lghi $t0,16 slgr $len,$t0 
sl${g}r $len,$t0 brc 4,.Lcbc_enc_tail # if borrow .Lcbc_enc_loop: stmg $inp,$out,16($sp) stm${g} $inp,$out,2*$SIZE_T($sp) x $s0,0($inp) x $s1,4($inp) x $s2,8($inp) Loading @@ -1244,7 +1266,7 @@ $code.=<<___; bras $ra,_s390x_AES_encrypt lmg $inp,$key,16($sp) lm${g} $inp,$key,2*$SIZE_T($sp) st $s0,0($out) st $s1,4($out) st $s2,8($out) Loading @@ -1253,33 +1275,33 @@ $code.=<<___; la $inp,16($inp) la $out,16($out) lghi $t0,16 ltgr $len,$len lt${g}r $len,$len jz .Lcbc_enc_done slgr $len,$t0 sl${g}r $len,$t0 brc 4,.Lcbc_enc_tail # if borrow j .Lcbc_enc_loop .align 16 .Lcbc_enc_done: lg $ivp,48($sp) l${g} $ivp,6*$SIZE_T($sp) st $s0,0($ivp) st $s1,4($ivp) st $s2,8($ivp) st $s3,12($ivp) lmg %r7,$ra,56($sp) lm${g} %r7,$ra,7*$SIZE_T($sp) br $ra .align 16 .Lcbc_enc_tail: aghi $len,15 lghi $t0,0 stg $t0,128($sp) stg $t0,136($sp) stg $t0,16*$SIZE_T($sp) stg $t0,16*$SIZE_T+8($sp) bras $t1,3f mvc 128(1,$sp),0($inp) mvc 16*$SIZE_T(1,$sp),0($inp) 3: ex $len,0($t1) lghi $len,0 la $inp,128($sp) la $inp,16*$SIZE_T($sp) j .Lcbc_enc_loop .align 16 Loading @@ -1288,10 +1310,10 @@ $code.=<<___; lg $t0,0($ivp) lg $t1,8($ivp) stmg $t0,$t1,128($sp) stmg $t0,$t1,16*$SIZE_T($sp) .Lcbc_dec_loop: stmg $inp,$out,16($sp) stm${g} $inp,$out,2*$SIZE_T($sp) llgf $s0,0($inp) llgf $s1,4($inp) llgf $s2,8($inp) Loading @@ -1300,7 +1322,7 @@ $code.=<<___; bras $ra,_s390x_AES_decrypt lmg $inp,$key,16($sp) lm${g} $inp,$key,2*$SIZE_T($sp) sllg $s0,$s0,32 sllg $s2,$s2,32 lr $s0,$s1 Loading @@ -1308,15 +1330,15 @@ $code.=<<___; lg $t0,0($inp) lg $t1,8($inp) xg $s0,128($sp) xg $s2,136($sp) xg $s0,16*$SIZE_T($sp) xg $s2,16*$SIZE_T+8($sp) lghi $s1,16 slgr $len,$s1 sl${g}r $len,$s1 brc 4,.Lcbc_dec_tail # if borrow brc 2,.Lcbc_dec_done # if zero stg $s0,0($out) stg $s2,8($out) stmg $t0,$t1,128($sp) stmg $t0,$t1,16*$SIZE_T($sp) la $inp,16($inp) la $out,16($out) Loading @@ -1326,7 +1348,7 @@ $code.=<<___; stg $s0,0($out) stg $s2,8($out) .Lcbc_dec_exit: lmg $ivp,$ra,48($sp) lm${g} %r6,$ra,6*$SIZE_T($sp) stmg 
$t0,$t1,0($ivp) br $ra Loading @@ -1334,10 +1356,10 @@ $code.=<<___; .align 16 .Lcbc_dec_tail: aghi $len,15 stg $s0,128($sp) stg $s2,136($sp) stg $s0,16*$SIZE_T($sp) stg $s2,16*$SIZE_T+8($sp) bras $s1,4f mvc 0(1,$out),128($sp) mvc 0(1,$out),16*$SIZE_T($sp) 4: ex $len,0($s1) j .Lcbc_dec_exit .size AES_cbc_encrypt,.-AES_cbc_encrypt Loading @@ -1359,6 +1381,7 @@ $code.=<<___; .type AES_ctr32_encrypt,\@function .align 16 AES_ctr32_encrypt: llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case ___ $code.=<<___ if (!$softonly); l %r0,240($key) Loading @@ -1366,7 +1389,7 @@ $code.=<<___ if (!$softonly); clr %r0,%r1 jl .Lctr32_software stmg %r6,$s3,48($sp) stm${g} %r6,$s3,6*$SIZE_T($sp) slgr $out,$inp la %r1,0($key) # %r1 is permanent copy of $key Loading @@ -1388,14 +1411,14 @@ $code.=<<___ if (!$softonly); la $sp,1024($s0) # alloca srlg $fp,$fp,4 # convert bytes to blocks, minimum 16 stg $s2,0($sp) # back-chain stg $fp,8($sp) st${g} $s2,0($sp) # back-chain st${g} $fp,$SIZE_T($sp) slgr $len,$fp brc 1,.Lctr32_hw_loop # not zero, no borrow algr $fp,$len # input is shorter than allocated buffer lghi $len,0 stg $fp,8($sp) st${g} $fp,$SIZE_T($sp) .Lctr32_hw_loop: la $s2,16($sp) Loading Loading @@ -1432,8 +1455,8 @@ $code.=<<___ if (!$softonly); lghi $len,0 brc 4+1,.Lctr32_hw_loop # not zero lg $s0,0($sp) lg $s1,8($sp) l${g} $s0,0($sp) l${g} $s1,$SIZE_T($sp) la $s2,16($sp) .Lctr32_hw_zap: stg $s0,0($s2) Loading @@ -1442,30 +1465,30 @@ $code.=<<___ if (!$softonly); brct $s1,.Lctr32_hw_zap la $sp,0($s0) lmg %r6,$s3,48($sp) lm${g} %r6,$s3,6*$SIZE_T($sp) br $ra .align 16 .Lctr32_software: ___ $code.=<<___; stmg $key,$ra,40($sp) slgr $out,$inp stm${g} $key,$ra,5*$SIZE_T($sp) sl${g}r $out,$inp larl $tbl,AES_Te llgf $t1,12($ivp) .Lctr32_loop: stmg $inp,$len,16($sp) stm${g} $inp,$len,2*$SIZE_T($sp) llgf $s0,0($ivp) llgf $s1,4($ivp) llgf $s2,8($ivp) lgr $s3,$t1 st $t1,128($sp) st $t1,16*$SIZE_T($sp) lgr %r4,$key bras $ra,_s390x_AES_encrypt lmg $inp,$ivp,16($sp) llgf 
$t1,128($sp) lm${g} $inp,$ivp,2*$SIZE_T($sp) llgf $t1,16*$SIZE_T($sp) x $s0,0($inp) x $s1,4($inp) x $s2,8($inp) Loading @@ -1479,7 +1502,7 @@ $code.=<<___; ahi $t1,1 # 32-bit increment brct $len,.Lctr32_loop lmg %r6,$ra,48($sp) lm${g} %r6,$ra,6*$SIZE_T($sp) br $ra .size AES_ctr32_encrypt,.-AES_ctr32_encrypt ___ Loading
crypto/bn/asm/s390x-mont.pl +74 −25 Original line number Diff line number Diff line Loading @@ -32,9 +32,33 @@ # Reschedule to minimize/avoid Address Generation Interlock hazard, # make inner loops counter-based. # November 2010. # # Adapt for -m31 build. If kernel supports what's called "highgprs" # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit # instructions and achieve "64-bit" performance even in 31-bit legacy # application context. The feature is not specific to any particular # processor, as long as it's "z-CPU". Latter implies that the code # remains z/Architecture specific. Compatibility with 32-bit BN_ULONG # is achieved by swapping words after 64-bit loads, follow _dswap-s. # On z990 it was measured to perform 2.6-2.2 times better, less for # longer keys... $flavour = shift; if ($flavour =~ /3[12]/) { $SIZE_T=4; $g=""; } else { $SIZE_T=8; $g="g"; } while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; $stdframe=16*$SIZE_T+4*8; $mn0="%r0"; $num="%r1"; Loading Loading @@ -63,34 +87,44 @@ $code.=<<___; .globl bn_mul_mont .type bn_mul_mont,\@function bn_mul_mont: lgf $num,164($sp) # pull $num sla $num,3 # $num to enumerate bytes lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes la $bp,0($num,$bp) stg %r2,16($sp) st${g} %r2,2*$SIZE_T($sp) cghi $num,16 # lghi %r2,0 # blr %r14 # if($num<16) return 0; ___ $code.=<<___ if ($flavour =~ /3[12]/); tmll $num,4 bnzr %r14 # if ($num&1) return 0; ___ $code.=<<___ if ($flavour !~ /3[12]/); cghi $num,128 # bhr %r14 # if($num>128) return 0; ___ $code.=<<___; stm${g} %r3,%r15,3*$SIZE_T($sp) stmg %r3,%r15,24($sp) lghi $rp,-160-8 # leave room for carry bit lghi $rp,-$stdframe-8 # leave room for carry bit lcgr $j,$num # -$num lgr %r0,$sp la $rp,0($rp,$sp) la $sp,0($j,$rp) # alloca stg %r0,0($sp) # back chain st${g} %r0,0($sp) # back chain sra $num,3 # restore $num la $bp,0($j,$bp) # restore $bp ahi $num,-1 # adjust $num for 
inner loop lg $n0,0($n0) # pull n0 _dswap $n0 lg $bi,0($bp) _dswap $bi lg $alo,0($ap) _dswap $alo mlgr $ahi,$bi # ap[0]*bp[0] lgr $AHI,$ahi Loading @@ -98,6 +132,7 @@ bn_mul_mont: msgr $mn0,$n0 lg $nlo,0($np) # _dswap $nlo mlgr $nhi,$mn0 # np[0]*m1 algr $nlo,$alo # +="tp[0]" lghi $NHI,0 Loading @@ -109,12 +144,14 @@ bn_mul_mont: .align 16 .L1st: lg $alo,0($j,$ap) _dswap $alo mlgr $ahi,$bi # ap[j]*bp[0] algr $alo,$AHI lghi $AHI,0 alcgr $AHI,$ahi lg $nlo,0($j,$np) _dswap $nlo mlgr $nhi,$mn0 # np[j]*m1 algr $nlo,$NHI lghi $NHI,0 Loading @@ -122,22 +159,24 @@ bn_mul_mont: algr $nlo,$alo alcgr $NHI,$nhi stg $nlo,160-8($j,$sp) # tp[j-1]= stg $nlo,$stdframe-8($j,$sp) # tp[j-1]= la $j,8($j) # j++ brct $count,.L1st algr $NHI,$AHI lghi $AHI,0 alcgr $AHI,$AHI # upmost overflow bit stg $NHI,160-8($j,$sp) stg $AHI,160($j,$sp) stg $NHI,$stdframe-8($j,$sp) stg $AHI,$stdframe($j,$sp) la $bp,8($bp) # bp++ .Louter: lg $bi,0($bp) # bp[i] _dswap $bi lg $alo,0($ap) _dswap $alo mlgr $ahi,$bi # ap[0]*bp[i] alg $alo,160($sp) # +=tp[0] alg $alo,$stdframe($sp) # +=tp[0] lghi $AHI,0 alcgr $AHI,$ahi Loading @@ -145,6 +184,7 @@ bn_mul_mont: msgr $mn0,$n0 # tp[0]*n0 lg $nlo,0($np) # np[0] _dswap $nlo mlgr $nhi,$mn0 # np[0]*m1 algr $nlo,$alo # +="tp[0]" lghi $NHI,0 Loading @@ -156,14 +196,16 @@ bn_mul_mont: .align 16 .Linner: lg $alo,0($j,$ap) _dswap $alo mlgr $ahi,$bi # ap[j]*bp[i] algr $alo,$AHI lghi $AHI,0 alcgr $ahi,$AHI alg $alo,160($j,$sp)# +=tp[j] alg $alo,$stdframe($j,$sp)# +=tp[j] alcgr $AHI,$ahi lg $nlo,0($j,$np) _dswap $nlo mlgr $nhi,$mn0 # np[j]*m1 algr $nlo,$NHI lghi $NHI,0 Loading @@ -171,31 +213,33 @@ bn_mul_mont: algr $nlo,$alo # +="tp[j]" alcgr $NHI,$nhi stg $nlo,160-8($j,$sp) # tp[j-1]= stg $nlo,$stdframe-8($j,$sp) # tp[j-1]= la $j,8($j) # j++ brct $count,.Linner algr $NHI,$AHI lghi $AHI,0 alcgr $AHI,$AHI alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit lghi $ahi,0 alcgr $AHI,$ahi # new upmost 
overflow bit stg $NHI,160-8($j,$sp) stg $AHI,160($j,$sp) stg $NHI,$stdframe-8($j,$sp) stg $AHI,$stdframe($j,$sp) la $bp,8($bp) # bp++ clg $bp,160+8+32($j,$sp) # compare to &bp[num] cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num] jne .Louter lg $rp,160+8+16($j,$sp) # reincarnate rp la $ap,160($sp) l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp la $ap,$stdframe($sp) ahi $num,1 # restore $num, incidentally clears "borrow" la $j,0(%r0) lr $count,$num .Lsub: lg $alo,0($j,$ap) slbg $alo,0($j,$np) lg $nlo,0($j,$np) _dswap $nlo slbgr $alo,$nlo stg $alo,0($j,$rp) la $j,8($j) brct $count,.Lsub Loading @@ -211,18 +255,23 @@ bn_mul_mont: la $j,0(%r0) lgr $count,$num .Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh stg $j,160($j,$sp) # zap tp _dswap $alo stg $j,$stdframe($j,$sp) # zap tp stg $alo,0($j,$rp) la $j,8($j) brct $count,.Lcopy la %r1,160+8+48($j,$sp) lmg %r6,%r15,0(%r1) la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp) lm${g} %r6,%r15,0(%r1) lghi %r2,1 # signal "processed" br %r14 .size bn_mul_mont,.-bn_mul_mont .string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" ___ print $code; foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/ge; s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e; print $_,"\n"; } close STDOUT;
crypto/modes/asm/ghash-s390x.pl +26 −3 Original line number Diff line number Diff line Loading @@ -18,6 +18,26 @@ # and the result should be close to 12. In the lack of instruction- # level profiling data it's impossible to tell why... # November 2010. # # Adapt for -m31 build. If kernel supports what's called "highgprs" # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit # instructions and achieve "64-bit" performance even in 31-bit legacy # application context. The feature is not specific to any particular # processor, as long as it's "z-CPU". Latter implies that the code # remains z/Architecture specific. On z990 it was measured to perform # 2.8x better than 32-bit code generated by gcc 4.3. $flavour = shift; if ($flavour =~ /3[12]/) { $SIZE_T=4; $g=""; } else { $SIZE_T=8; $g="g"; } while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; Loading Loading @@ -74,7 +94,7 @@ $code.=<<___ if(!$softonly); .Lsoft_gmult: ___ $code.=<<___; stmg %r6,%r14,48($sp) stm${g} %r6,%r14,6*$SIZE_T($sp) aghi $Xi,-1 lghi $len,1 Loading Loading @@ -109,8 +129,11 @@ $code.=<<___ if(!$softonly); .align 32 .Lsoft_ghash: ___ $code.=<<___ if ($flavour =~ /3[12]/); llgfr $len,$len ___ $code.=<<___; stmg %r6,%r14,48($sp) stm${g} %r6,%r14,6*$SIZE_T($sp) aghi $Xi,-1 srlg $len,$len,4 Loading Loading @@ -209,7 +232,7 @@ $code.=<<___; xgr $Zhi,$tmp stg $Zlo,8+1($Xi) stg $Zhi,0+1($Xi) lmg %r6,%r14,48($sp) lm${g} %r6,%r14,6*$SIZE_T($sp) br %r14 .type gcm_ghash_4bit,\@function .size gcm_ghash_4bit,(.-gcm_ghash_4bit) Loading