Loading Configure +2 −2 Original line number Diff line number Diff line Loading @@ -134,7 +134,7 @@ my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o::::::::::::void"; my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o:void"; my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o:::::::"; my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o:::::::"; my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o"; my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o"; my $armv4_asm=":bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o:void"; my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:32"; my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:64"; Loading Loading @@ -370,7 +370,7 @@ my %table=( # ldconfig and run-time linker to autodiscover. Unfortunately it # doesn't work just yet, because of couple of bugs in glibc # sysdep/s390/dl-procinfo.c affecting ldconfig and ld.so.1... 
"linux32-s390x", "gcc:-m31 -Wa,-mzarch -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:s390xcap.o s390xcpuid.o:bn_asm.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:31:dlfcn:linux-shared:-fPIC:-m31:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/highgprs", "linux32-s390x", "gcc:-m31 -Wa,-mzarch -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:s390xcap.o s390xcpuid.o:bn_asm.o s390x-mont.o s390x-gf2m.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:31:dlfcn:linux-shared:-fPIC:-m31:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/highgprs", #### SPARC Linux setups # Ray Miller <ray.miller@computing-services.oxford.ac.uk> has patiently # assisted with debugging of following two configs. Loading TABLE +2 −2 Original line number Diff line number Diff line Loading @@ -4105,7 +4105,7 @@ $sys_id = $lflags = -ldl $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = s390xcap.o s390xcpuid.o $bn_obj = bn_asm.o s390x-mont.o $bn_obj = bn_asm.o s390x-mont.o s390x-gf2m.o $des_obj = $aes_obj = aes_ctr.o aes-s390x.o $bf_obj = Loading Loading @@ -4137,7 +4137,7 @@ $sys_id = $lflags = -ldl $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = s390xcap.o s390xcpuid.o $bn_obj = bn-s390x.o s390x-mont.o $bn_obj = bn-s390x.o s390x-mont.o s390x-gf2m.o $des_obj = $aes_obj = aes_ctr.o aes-s390x.o $bf_obj = Loading crypto/aes/asm/aes-s390x.pl +12 −11 Original line number Diff line number Diff line Loading @@ -78,9 +78,9 @@ # February 2011. # # Add AES_xts_[en|de]crypt. This includes support for z196 # km-xts-aes instructions, which deliver ~70% improvement at 8KB # block size over vanilla km-based code. # Add AES_xts_[en|de]crypt. 
This includes support for z196 km-xts-aes # instructions, which deliver ~70% improvement at 8KB block size over # vanilla km-based code, 37% - at most like 512-bytes block size. $flavour = shift; Loading Loading @@ -1579,7 +1579,8 @@ ___ ######################################################################## # void AES_xts_encrypt(const char *inp,char *out,size_t len, # const AES_KEY *key1, const AES_KEY *key2,u64 secno); # const AES_KEY *key1, const AES_KEY *key2, # const unsigned char iv[16]); # { my $inp="%r2"; Loading @@ -1595,7 +1596,7 @@ $code.=<<___; .align 16 _s390x_xts_km: ___ $code.=<<___ if(0); $code.=<<___ if(1); llgfr $s0,%r0 # put aside the function code lghi $s1,0x7f nr $s1,%r0 Loading Loading @@ -1789,9 +1790,10 @@ $code.=<<___ if (!$softonly); sllg $len,$len,4 # $len&=~15 slgr $out,$inp lrvg $s0,$stdframe($sp) # load secno lghi $s1,0 # generate the tweak value l${g} $s3,$stdframe($sp) # pointer to iv la $s2,$tweak($sp) lmg $s0,$s1,0($s3) lghi $s3,16 stmg $s0,$s1,0($s2) la %r1,0($key2) # $key2 is not needed anymore Loading Loading @@ -1996,12 +1998,11 @@ $code.=<<___ if (!$softonly); slgr $out,$inp # generate the tweak value lrvg $s0,$stdframe($sp) # load secno lghi $s1,0 l${g} $s3,$stdframe($sp) # pointer to iv la $s2,$tweak($sp) lmg $s0,$s1,0($s3) lghi $s3,16 stg $s0,0($s2) stg $s1,8($s2) stmg $s0,$s1,0($s2) la %r1,0($key2) # $key2 is not needed past this point .long 0xb92e00aa # km $s2,$s2, generate the tweak brc 1,.-4 # can this happen? 
Loading crypto/bn/Makefile +2 −0 Original line number Diff line number Diff line Loading @@ -91,6 +91,8 @@ mips-mont.s: asm/mips-mont.pl bn-s390x.o: asm/s390x.S $(CC) $(CFLAGS) -c -o $@ asm/s390x.S s390x-gf2m.s: asm/s390x-gf2m.pl $(PERL) asm/s390x-gf2m.pl $(PERLASM_SCHEME) $@ x86_64-gcc.o: asm/x86_64-gcc.c $(CC) $(CFLAGS) -c -o $@ asm/x86_64-gcc.c Loading crypto/bn/asm/s390x-gf2m.pl 0 → 100644 +220 −0 Original line number Diff line number Diff line #!/usr/bin/env perl # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # May 2011 # # The module implements bn_GF2m_mul_2x2 polynomial multiplication used # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for # the time being... gcc 4.3 appeared to generate poor code, therefore # the effort. The module delivers 55%-90% improvement on heaviest ECDSA # verify and ECDH benchmarks for 163- and 571-bit keys on z990, and # 25%-30% - on z196(*). This is for 64-bit build. In 32-bit "highgprs" # case improvement is even higher, for example on z990 it was measured # 80%-150%. ECDSA sign is modest 9%-12% faster. Keep in mind that # these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not # all CPU time is burnt in it... # # (*) Though no improvement could be measured if compared to code # generated by gcc 4.1. Keep in mind that z196 is out-of-order # execution core and is better at executing poor code. 
# Command line: <flavour> [ignored arguments ...] <output-file>.
# A flavour containing "31" or "32" selects the 32-bit ABI (4-byte
# size_t, plain stm/lm/st); anything else selects the 64-bit ABI
# (8-byte size_t, "g"-suffixed stmg/lmg/stg).
$flavour = shift;

if ($flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

# Skip arguments until one looks like a file name ("name.ext"); that
# one becomes the output file.  If none is found, $output is left false
# and the generated code goes to the inherited STDOUT.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
if ($output) {
	# Fail loudly: an unwritable output file must not result in the
	# assembly being sent somewhere else while the build carries on.
	open STDOUT,'>',$output or die "can't open $output: $!";
}

# Base offset of the 16-entry, 8-bytes-per-entry lookup table that
# _mul_1x1 keeps on the stack; bn_GF2m_mul_2x2 allocates $stdframe+128
# bytes for it.
$stdframe=16*$SIZE_T+4*8;

# Incoming arguments of bn_GF2m_mul_2x2: result pointer in %r2, then
# a1, a0, b1, b0 in %r3, %r4, %r5, %r6.
$rp="%r2";

$ra="%r14";	# return address
$sp="%r15";	# stack pointer

@T=("%r0","%r1");	# scratch pair, rotated after every step
@i=("%r12","%r13");	# table-index pair, rotated after every step

# Working registers of _mul_1x1.  $a aliases $lo: the multiplicand
# arrives in the register that later receives the low result half.
# $mask aliases $a8: it is loaded only after tab[8] has been stored.
($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11));
($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8;

# _mul_1x1: multiply $a by $b as polynomials over GF(2), leaving the
# 128-bit product in $hi:$lo.  The three top bits of $a are masked off
# and accounted for separately (the "broadcast" sequences); the rest is
# expanded into tab[0..15] and then combined with 4-bit windows of $b.
$code.=<<___;
.text

.type	_mul_1x1,\@function
.align	16
_mul_1x1:
	lgr	$a1,$a
	sllg	$a2,$a,1
	sllg	$a4,$a,2
	sllg	$a8,$a,3

	srag	$lo,$a1,63			# broadcast 63rd bit
	nihh	$a1,0x1fff
	srag	@i[0],$a2,63			# broadcast 62nd bit
	nihh	$a2,0x3fff
	srag	@i[1],$a4,63			# broadcast 61st bit
	nihh	$a4,0x7fff
	ngr	$lo,$b
	ngr	@i[0],$b
	ngr	@i[1],$b

	lghi	@T[0],0
	lgr	$a12,$a1
	stg	@T[0],`$stdframe+0*8`($sp)	# tab[0]=0
	xgr	$a12,$a2
	stg	$a1,`$stdframe+1*8`($sp)	# tab[1]=a1
	lgr	$a48,$a4
	stg	$a2,`$stdframe+2*8`($sp)	# tab[2]=a2
	xgr	$a48,$a8
	stg	$a12,`$stdframe+3*8`($sp)	# tab[3]=a1^a2
	xgr	$a1,$a4

	stg	$a4,`$stdframe+4*8`($sp)	# tab[4]=a4
	xgr	$a2,$a4
	stg	$a1,`$stdframe+5*8`($sp)	# tab[5]=a1^a4
	xgr	$a12,$a4
	stg	$a2,`$stdframe+6*8`($sp)	# tab[6]=a2^a4
	xgr	$a1,$a48
	stg	$a12,`$stdframe+7*8`($sp)	# tab[7]=a1^a2^a4
	xgr	$a2,$a48

	stg	$a8,`$stdframe+8*8`($sp)	# tab[8]=a8
	xgr	$a12,$a48
	stg	$a1,`$stdframe+9*8`($sp)	# tab[9]=a1^a8
	xgr	$a1,$a4
	stg	$a2,`$stdframe+10*8`($sp)	# tab[10]=a2^a8
	xgr	$a2,$a4
	stg	$a12,`$stdframe+11*8`($sp)	# tab[11]=a1^a2^a8

	xgr	$a12,$a4
	stg	$a48,`$stdframe+12*8`($sp)	# tab[12]=a4^a8
	srlg	$hi,$lo,1
	stg	$a1,`$stdframe+13*8`($sp)	# tab[13]=a1^a4^a8
	sllg	$lo,$lo,63
	stg	$a2,`$stdframe+14*8`($sp)	# tab[14]=a2^a4^a8
	srlg	@T[0],@i[0],2
	stg	$a12,`$stdframe+15*8`($sp)	# tab[15]=a1^a2^a4^a8

	lghi	$mask,`0xf<<3`
	sllg	$a1,@i[0],62
	sllg	@i[0],$b,3
	srlg	@T[1],@i[1],3
	ngr	@i[0],$mask
	sllg	$a2,@i[1],61
	srlg	@i[1],$b,4-3
	xgr	$hi,@T[0]
	ngr	@i[1],$mask
	xgr	$lo,$a1
	xgr	$hi,@T[1]
	xgr	$lo,$a2

	xg	$lo,$stdframe(@i[0],$sp)
	srlg	@i[0],$b,8-3
	ngr	@i[0],$mask
___
# Steps 1..13: look up the table entry for the current 4-bit window of
# $b, fold it into $hi:$lo shifted by $n*4 bits, and extract the index
# for the window two steps ahead.  @i and @T are rotated after each
# step so that consecutive steps use alternating registers.
for($n=1;$n<14;$n++) {
$code.=<<___;
	lg	@T[1],$stdframe(@i[1],$sp)
	srlg	@i[1],$b,`($n+2)*4`-3
	sllg	@T[0],@T[1],`$n*4`
	ngr	@i[1],$mask
	srlg	@T[1],@T[1],`64-$n*4`
	xgr	$lo,@T[0]
	xgr	$hi,@T[1]
___
	push(@i,shift(@i)); push(@T,shift(@T));
}
# $n is 14 here: the last two windows (14 and 15) need no further index
# extraction.  The same heredoc then opens bn_GF2m_mul_2x2, which saves
# %r3-%r15 in the caller's frame and allocates $stdframe+128 bytes.
$code.=<<___;
	lg	@T[1],$stdframe(@i[1],$sp)
	sllg	@T[0],@T[1],`$n*4`
	srlg	@T[1],@T[1],`64-$n*4`
	xgr	$lo,@T[0]
	xgr	$hi,@T[1]

	lg	@T[0],$stdframe(@i[0],$sp)
	sllg	@T[1],@T[0],`($n+1)*4`
	srlg	@T[0],@T[0],`64-($n+1)*4`
	xgr	$lo,@T[1]
	xgr	$hi,@T[0]

	br	$ra
.size	_mul_1x1,.-_mul_1x1

.globl	bn_GF2m_mul_2x2
.type	bn_GF2m_mul_2x2,\@function
.align	16
bn_GF2m_mul_2x2:
	stm${g}	%r3,%r15,3*$SIZE_T($sp)

	lghi	%r1,-$stdframe-128
	la	%r0,0($sp)
	la	$sp,0(%r1,$sp)			# alloca
	st${g}	%r0,0($sp)			# back chain
___
if ($SIZE_T==8) {
# 64-bit build: three 1x1 products (a1*b1, a0*b0, (a0+a1)*(b0+b1)) are
# computed and xor-combined into the four result words at $rp.  The
# operands for the second and third call are reloaded from the register
# save area written by the stmg above.
my @r=map("%r$_",(6..9));
$code.=<<___;
	bras	$ra,_mul_1x1			# a1b1
	stmg	$lo,$hi,16($rp)

	lg	$a,`$stdframe+128+4*$SIZE_T`($sp)
	lg	$b,`$stdframe+128+6*$SIZE_T`($sp)
	bras	$ra,_mul_1x1			# a0b0
	stmg	$lo,$hi,0($rp)

	lg	$a,`$stdframe+128+3*$SIZE_T`($sp)
	lg	$b,`$stdframe+128+5*$SIZE_T`($sp)
	xg	$a,`$stdframe+128+4*$SIZE_T`($sp)
	xg	$b,`$stdframe+128+6*$SIZE_T`($sp)
	bras	$ra,_mul_1x1			# (a0+a1)(b0+b1)
	lmg	@r[0],@r[3],0($rp)

	xgr	$lo,$hi
	xgr	$hi,@r[1]
	xgr	$lo,@r[0]
	xgr	$hi,@r[2]
	xgr	$lo,@r[3]
	xgr	$hi,@r[3]
	xgr	$lo,$hi
	stg	$hi,16($rp)
	stg	$lo,8($rp)
___
} else {
# 32-bit build: each pair of 32-bit argument words is merged into one
# 64-bit register, a single 1x1 product is computed, and both result
# halves are rotated by 32 bits before being stored.
$code.=<<___;
	sllg	%r3,%r3,32
	sllg	%r5,%r5,32
	or	%r3,%r4
	or	%r5,%r6
	bras	$ra,_mul_1x1
	rllg	$lo,$lo,32
	rllg	$hi,$hi,32
	stmg	$lo,$hi,0($rp)
___
}
# Common epilogue: restore %r6-%r15 (including the stack pointer and
# the return address) from the caller's save area and return.
$code.=<<___;
	lm${g}	%r6,%r15,`$stdframe+128+6*$SIZE_T`($sp)
	br	$ra
.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.string	"GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___

# Evaluate the arithmetic expressions enclosed in backticks, then emit.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
# Buffered write errors only surface at close; report them so that a
# truncated assembly file is never mistaken for a successful run.
close STDOUT or die "error closing STDOUT: $!";
Configure +2 −2 Original line number Diff line number Diff line Loading @@ -134,7 +134,7 @@ my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o::::::::::::void"; my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o:void"; my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o:::::::"; my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o:::::::"; my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o"; my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o"; my $armv4_asm=":bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o:void"; my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:32"; my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:64"; Loading Loading @@ -370,7 +370,7 @@ my %table=( # ldconfig and run-time linker to autodiscover. Unfortunately it # doesn't work just yet, because of couple of bugs in glibc # sysdep/s390/dl-procinfo.c affecting ldconfig and ld.so.1... 
"linux32-s390x", "gcc:-m31 -Wa,-mzarch -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:s390xcap.o s390xcpuid.o:bn_asm.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:31:dlfcn:linux-shared:-fPIC:-m31:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/highgprs", "linux32-s390x", "gcc:-m31 -Wa,-mzarch -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:s390xcap.o s390xcpuid.o:bn_asm.o s390x-mont.o s390x-gf2m.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:31:dlfcn:linux-shared:-fPIC:-m31:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/highgprs", #### SPARC Linux setups # Ray Miller <ray.miller@computing-services.oxford.ac.uk> has patiently # assisted with debugging of following two configs. Loading
TABLE +2 −2 Original line number Diff line number Diff line Loading @@ -4105,7 +4105,7 @@ $sys_id = $lflags = -ldl $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = s390xcap.o s390xcpuid.o $bn_obj = bn_asm.o s390x-mont.o $bn_obj = bn_asm.o s390x-mont.o s390x-gf2m.o $des_obj = $aes_obj = aes_ctr.o aes-s390x.o $bf_obj = Loading Loading @@ -4137,7 +4137,7 @@ $sys_id = $lflags = -ldl $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = s390xcap.o s390xcpuid.o $bn_obj = bn-s390x.o s390x-mont.o $bn_obj = bn-s390x.o s390x-mont.o s390x-gf2m.o $des_obj = $aes_obj = aes_ctr.o aes-s390x.o $bf_obj = Loading
crypto/aes/asm/aes-s390x.pl +12 −11 Original line number Diff line number Diff line Loading @@ -78,9 +78,9 @@ # February 2011. # # Add AES_xts_[en|de]crypt. This includes support for z196 # km-xts-aes instructions, which deliver ~70% improvement at 8KB # block size over vanilla km-based code. # Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes # instructions, which deliver ~70% improvement at 8KB block size over # vanilla km-based code, 37% - at most like 512-bytes block size. $flavour = shift; Loading Loading @@ -1579,7 +1579,8 @@ ___ ######################################################################## # void AES_xts_encrypt(const char *inp,char *out,size_t len, # const AES_KEY *key1, const AES_KEY *key2,u64 secno); # const AES_KEY *key1, const AES_KEY *key2, # const unsigned char iv[16]); # { my $inp="%r2"; Loading @@ -1595,7 +1596,7 @@ $code.=<<___; .align 16 _s390x_xts_km: ___ $code.=<<___ if(0); $code.=<<___ if(1); llgfr $s0,%r0 # put aside the function code lghi $s1,0x7f nr $s1,%r0 Loading Loading @@ -1789,9 +1790,10 @@ $code.=<<___ if (!$softonly); sllg $len,$len,4 # $len&=~15 slgr $out,$inp lrvg $s0,$stdframe($sp) # load secno lghi $s1,0 # generate the tweak value l${g} $s3,$stdframe($sp) # pointer to iv la $s2,$tweak($sp) lmg $s0,$s1,0($s3) lghi $s3,16 stmg $s0,$s1,0($s2) la %r1,0($key2) # $key2 is not needed anymore Loading Loading @@ -1996,12 +1998,11 @@ $code.=<<___ if (!$softonly); slgr $out,$inp # generate the tweak value lrvg $s0,$stdframe($sp) # load secno lghi $s1,0 l${g} $s3,$stdframe($sp) # pointer to iv la $s2,$tweak($sp) lmg $s0,$s1,0($s3) lghi $s3,16 stg $s0,0($s2) stg $s1,8($s2) stmg $s0,$s1,0($s2) la %r1,0($key2) # $key2 is not needed past this point .long 0xb92e00aa # km $s2,$s2, generate the tweak brc 1,.-4 # can this happen? Loading
# crypto/bn/Makefile +2 −0, hunk @@ -91,6 +91,8 @@ (hunk context: "mips-mont.s: asm/mips-mont.pl")
bn-s390x.o:	asm/s390x.S
	$(CC) $(CFLAGS) -c -o $@ asm/s390x.S
# Generate the s390x GF(2^m) multiplication assembly from its perlasm
# source.  Both the prerequisite and the script that is run must name
# asm/s390x-gf2m.pl, the generator added by this change; the transposed
# spelling "s390x-gfm2.pl" refers to a file that does not exist.
s390x-gf2m.s:	asm/s390x-gf2m.pl
	$(PERL) asm/s390x-gf2m.pl $(PERLASM_SCHEME) $@
x86_64-gcc.o:	asm/x86_64-gcc.c
	$(CC) $(CFLAGS) -c -o $@ asm/x86_64-gcc.c
crypto/bn/asm/s390x-gf2m.pl 0 → 100644 +220 −0 Original line number Diff line number Diff line #!/usr/bin/env perl # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # May 2011 # # The module implements bn_GF2m_mul_2x2 polynomial multiplication used # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for # the time being... gcc 4.3 appeared to generate poor code, therefore # the effort. The module delivers 55%-90% improvement on heaviest ECDSA # verify and ECDH benchmarks for 163- and 571-bit keys on z990, and # 25%-30% - on z196(*). This is for 64-bit build. In 32-bit "highgprs" # case improvement is even higher, for example on z990 it was measured # 80%-150%. ECDSA sign is modest 9%-12% faster. Keep in mind that # these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not # all CPU time is burnt in it... # # (*) Though no improvement could be measured if compared to code # generated by gcc 4.1. Keep in mind that z196 is out-of-order # execution core and is better at executing poor code. 
# Command line: <flavour> [ignored arguments ...] <output-file>.
# A flavour containing "31" or "32" selects the 32-bit ABI (4-byte
# size_t, plain stm/lm/st); anything else selects the 64-bit ABI
# (8-byte size_t, "g"-suffixed stmg/lmg/stg).
$flavour = shift;

if ($flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

# Skip arguments until one looks like a file name ("name.ext"); that
# one becomes the output file.  If none is found, $output is left false
# and the generated code goes to the inherited STDOUT.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
if ($output) {
	# Fail loudly: an unwritable output file must not result in the
	# assembly being sent somewhere else while the build carries on.
	open STDOUT,'>',$output or die "can't open $output: $!";
}

# Base offset of the 16-entry, 8-bytes-per-entry lookup table that
# _mul_1x1 keeps on the stack; bn_GF2m_mul_2x2 allocates $stdframe+128
# bytes for it.
$stdframe=16*$SIZE_T+4*8;

# Incoming arguments of bn_GF2m_mul_2x2: result pointer in %r2, then
# a1, a0, b1, b0 in %r3, %r4, %r5, %r6.
$rp="%r2";

$ra="%r14";	# return address
$sp="%r15";	# stack pointer

@T=("%r0","%r1");	# scratch pair, rotated after every step
@i=("%r12","%r13");	# table-index pair, rotated after every step

# Working registers of _mul_1x1.  $a aliases $lo: the multiplicand
# arrives in the register that later receives the low result half.
# $mask aliases $a8: it is loaded only after tab[8] has been stored.
($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11));
($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8;

# _mul_1x1: multiply $a by $b as polynomials over GF(2), leaving the
# 128-bit product in $hi:$lo.  The three top bits of $a are masked off
# and accounted for separately (the "broadcast" sequences); the rest is
# expanded into tab[0..15] and then combined with 4-bit windows of $b.
$code.=<<___;
.text

.type	_mul_1x1,\@function
.align	16
_mul_1x1:
	lgr	$a1,$a
	sllg	$a2,$a,1
	sllg	$a4,$a,2
	sllg	$a8,$a,3

	srag	$lo,$a1,63			# broadcast 63rd bit
	nihh	$a1,0x1fff
	srag	@i[0],$a2,63			# broadcast 62nd bit
	nihh	$a2,0x3fff
	srag	@i[1],$a4,63			# broadcast 61st bit
	nihh	$a4,0x7fff
	ngr	$lo,$b
	ngr	@i[0],$b
	ngr	@i[1],$b

	lghi	@T[0],0
	lgr	$a12,$a1
	stg	@T[0],`$stdframe+0*8`($sp)	# tab[0]=0
	xgr	$a12,$a2
	stg	$a1,`$stdframe+1*8`($sp)	# tab[1]=a1
	lgr	$a48,$a4
	stg	$a2,`$stdframe+2*8`($sp)	# tab[2]=a2
	xgr	$a48,$a8
	stg	$a12,`$stdframe+3*8`($sp)	# tab[3]=a1^a2
	xgr	$a1,$a4

	stg	$a4,`$stdframe+4*8`($sp)	# tab[4]=a4
	xgr	$a2,$a4
	stg	$a1,`$stdframe+5*8`($sp)	# tab[5]=a1^a4
	xgr	$a12,$a4
	stg	$a2,`$stdframe+6*8`($sp)	# tab[6]=a2^a4
	xgr	$a1,$a48
	stg	$a12,`$stdframe+7*8`($sp)	# tab[7]=a1^a2^a4
	xgr	$a2,$a48

	stg	$a8,`$stdframe+8*8`($sp)	# tab[8]=a8
	xgr	$a12,$a48
	stg	$a1,`$stdframe+9*8`($sp)	# tab[9]=a1^a8
	xgr	$a1,$a4
	stg	$a2,`$stdframe+10*8`($sp)	# tab[10]=a2^a8
	xgr	$a2,$a4
	stg	$a12,`$stdframe+11*8`($sp)	# tab[11]=a1^a2^a8

	xgr	$a12,$a4
	stg	$a48,`$stdframe+12*8`($sp)	# tab[12]=a4^a8
	srlg	$hi,$lo,1
	stg	$a1,`$stdframe+13*8`($sp)	# tab[13]=a1^a4^a8
	sllg	$lo,$lo,63
	stg	$a2,`$stdframe+14*8`($sp)	# tab[14]=a2^a4^a8
	srlg	@T[0],@i[0],2
	stg	$a12,`$stdframe+15*8`($sp)	# tab[15]=a1^a2^a4^a8

	lghi	$mask,`0xf<<3`
	sllg	$a1,@i[0],62
	sllg	@i[0],$b,3
	srlg	@T[1],@i[1],3
	ngr	@i[0],$mask
	sllg	$a2,@i[1],61
	srlg	@i[1],$b,4-3
	xgr	$hi,@T[0]
	ngr	@i[1],$mask
	xgr	$lo,$a1
	xgr	$hi,@T[1]
	xgr	$lo,$a2

	xg	$lo,$stdframe(@i[0],$sp)
	srlg	@i[0],$b,8-3
	ngr	@i[0],$mask
___
# Steps 1..13: look up the table entry for the current 4-bit window of
# $b, fold it into $hi:$lo shifted by $n*4 bits, and extract the index
# for the window two steps ahead.  @i and @T are rotated after each
# step so that consecutive steps use alternating registers.
for($n=1;$n<14;$n++) {
$code.=<<___;
	lg	@T[1],$stdframe(@i[1],$sp)
	srlg	@i[1],$b,`($n+2)*4`-3
	sllg	@T[0],@T[1],`$n*4`
	ngr	@i[1],$mask
	srlg	@T[1],@T[1],`64-$n*4`
	xgr	$lo,@T[0]
	xgr	$hi,@T[1]
___
	push(@i,shift(@i)); push(@T,shift(@T));
}
# $n is 14 here: the last two windows (14 and 15) need no further index
# extraction.  The same heredoc then opens bn_GF2m_mul_2x2, which saves
# %r3-%r15 in the caller's frame and allocates $stdframe+128 bytes.
$code.=<<___;
	lg	@T[1],$stdframe(@i[1],$sp)
	sllg	@T[0],@T[1],`$n*4`
	srlg	@T[1],@T[1],`64-$n*4`
	xgr	$lo,@T[0]
	xgr	$hi,@T[1]

	lg	@T[0],$stdframe(@i[0],$sp)
	sllg	@T[1],@T[0],`($n+1)*4`
	srlg	@T[0],@T[0],`64-($n+1)*4`
	xgr	$lo,@T[1]
	xgr	$hi,@T[0]

	br	$ra
.size	_mul_1x1,.-_mul_1x1

.globl	bn_GF2m_mul_2x2
.type	bn_GF2m_mul_2x2,\@function
.align	16
bn_GF2m_mul_2x2:
	stm${g}	%r3,%r15,3*$SIZE_T($sp)

	lghi	%r1,-$stdframe-128
	la	%r0,0($sp)
	la	$sp,0(%r1,$sp)			# alloca
	st${g}	%r0,0($sp)			# back chain
___
if ($SIZE_T==8) {
# 64-bit build: three 1x1 products (a1*b1, a0*b0, (a0+a1)*(b0+b1)) are
# computed and xor-combined into the four result words at $rp.  The
# operands for the second and third call are reloaded from the register
# save area written by the stmg above.
my @r=map("%r$_",(6..9));
$code.=<<___;
	bras	$ra,_mul_1x1			# a1b1
	stmg	$lo,$hi,16($rp)

	lg	$a,`$stdframe+128+4*$SIZE_T`($sp)
	lg	$b,`$stdframe+128+6*$SIZE_T`($sp)
	bras	$ra,_mul_1x1			# a0b0
	stmg	$lo,$hi,0($rp)

	lg	$a,`$stdframe+128+3*$SIZE_T`($sp)
	lg	$b,`$stdframe+128+5*$SIZE_T`($sp)
	xg	$a,`$stdframe+128+4*$SIZE_T`($sp)
	xg	$b,`$stdframe+128+6*$SIZE_T`($sp)
	bras	$ra,_mul_1x1			# (a0+a1)(b0+b1)
	lmg	@r[0],@r[3],0($rp)

	xgr	$lo,$hi
	xgr	$hi,@r[1]
	xgr	$lo,@r[0]
	xgr	$hi,@r[2]
	xgr	$lo,@r[3]
	xgr	$hi,@r[3]
	xgr	$lo,$hi
	stg	$hi,16($rp)
	stg	$lo,8($rp)
___
} else {
# 32-bit build: each pair of 32-bit argument words is merged into one
# 64-bit register, a single 1x1 product is computed, and both result
# halves are rotated by 32 bits before being stored.
$code.=<<___;
	sllg	%r3,%r3,32
	sllg	%r5,%r5,32
	or	%r3,%r4
	or	%r5,%r6
	bras	$ra,_mul_1x1
	rllg	$lo,$lo,32
	rllg	$hi,$hi,32
	stmg	$lo,$hi,0($rp)
___
}
# Common epilogue: restore %r6-%r15 (including the stack pointer and
# the return address) from the caller's save area and return.
$code.=<<___;
	lm${g}	%r6,%r15,`$stdframe+128+6*$SIZE_T`($sp)
	br	$ra
.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.string	"GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___

# Evaluate the arithmetic expressions enclosed in backticks, then emit.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
# Buffered write errors only surface at close; report them so that a
# truncated assembly file is never mistaken for a successful run.
close STDOUT or die "error closing STDOUT: $!";