Loading crypto/chacha/Makefile.in +2 −0 Original line number Diff line number Diff line Loading @@ -36,6 +36,8 @@ lib: $(LIBOBJ) $(RANLIB) $(LIB) || echo Never mind. @touch lib chacha-%.S: asm/chacha-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@ files: $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO Loading crypto/chacha/asm/chacha-s390x.pl 0 → 100755 +317 −0 Original line number Diff line number Diff line #!/usr/bin/env perl # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # December 2015 # # ChaCha20 for s390x. # # 3 times faster than compiler-generated code. $flavour = shift; if ($flavour =~ /3[12]/) { $SIZE_T=4; $g=""; } else { $SIZE_T=8; $g="g"; } while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; sub AUTOLOAD() # thunk [simplified] x86-style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $code .= "\t$opcode\t".join(',',@_)."\n"; } my $sp="%r15"; my $stdframe=16*$SIZE_T+4*8; my $frame=$stdframe+4*20; my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6)); my @x=map("%r$_",(0..7,"x","x","x","x",(10..13))); my @t=map("%r$_",(8,9)); sub ROUND { my ($a0,$b0,$c0,$d0)=@_; my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); my ($xc,$xc_)=map("\"$_\"",@t); my @x=map("\"$_\"",@x); # Consider order in which variables are addressed by their # index: # # a b c d # # 0 4 8 12 < even round # 1 5 9 13 # 2 6 10 14 # 3 7 11 15 # 0 5 10 15 < odd round # 1 6 11 12 # 2 7 8 13 # 3 4 9 14 # # 'a', 'b' and 'd's are permanently allocated in registers, # 
@x[0..7,12..15], while 'c's are maintained in memory. If # you observe 'c' column, you'll notice that pair of 'c's is # invariant between rounds. This means that we have to reload # them once per round, in the middle. This is why you'll see # 'c' stores and loads in the middle, but none in the beginning # or end. ( "&alr (@x[$a0],@x[$b0])", # Q1 "&alr (@x[$a1],@x[$b1])", # Q2 "&xr (@x[$d0],@x[$a0])", "&xr (@x[$d1],@x[$a1])", "&rll (@x[$d0],@x[$d0],16)", "&rll (@x[$d1],@x[$d1],16)", "&alr ($xc,@x[$d0])", "&alr ($xc_,@x[$d1])", "&xr (@x[$b0],$xc)", "&xr (@x[$b1],$xc_)", "&rll (@x[$b0],@x[$b0],12)", "&rll (@x[$b1],@x[$b1],12)", "&alr (@x[$a0],@x[$b0])", "&alr (@x[$a1],@x[$b1])", "&xr (@x[$d0],@x[$a0])", "&xr (@x[$d1],@x[$a1])", "&rll (@x[$d0],@x[$d0],8)", "&rll (@x[$d1],@x[$d1],8)", "&alr ($xc,@x[$d0])", "&alr ($xc_,@x[$d1])", "&xr (@x[$b0],$xc)", "&xr (@x[$b1],$xc_)", "&rll (@x[$b0],@x[$b0],7)", "&rll (@x[$b1],@x[$b1],7)", "&stm ($xc,$xc_,'$stdframe+4*8+4*$c0($sp)')", # reload pair of 'c's "&lm ($xc,$xc_,'$stdframe+4*8+4*$c2($sp)')", "&alr (@x[$a2],@x[$b2])", # Q3 "&alr (@x[$a3],@x[$b3])", # Q4 "&xr (@x[$d2],@x[$a2])", "&xr (@x[$d3],@x[$a3])", "&rll (@x[$d2],@x[$d2],16)", "&rll (@x[$d3],@x[$d3],16)", "&alr ($xc,@x[$d2])", "&alr ($xc_,@x[$d3])", "&xr (@x[$b2],$xc)", "&xr (@x[$b3],$xc_)", "&rll (@x[$b2],@x[$b2],12)", "&rll (@x[$b3],@x[$b3],12)", "&alr (@x[$a2],@x[$b2])", "&alr (@x[$a3],@x[$b3])", "&xr (@x[$d2],@x[$a2])", "&xr (@x[$d3],@x[$a3])", "&rll (@x[$d2],@x[$d2],8)", "&rll (@x[$d3],@x[$d3],8)", "&alr ($xc,@x[$d2])", "&alr ($xc_,@x[$d3])", "&xr (@x[$b2],$xc)", "&xr (@x[$b3],$xc_)", "&rll (@x[$b2],@x[$b2],7)", "&rll (@x[$b3],@x[$b3],7)" ); } $code.=<<___; .text .globl ChaCha20_ctr32 .type ChaCha20_ctr32,\@function .align 32 ChaCha20_ctr32: a${g}hi $len,-64 l${g}hi %r1,-$frame stm${g} %r6,%r15,`6*$SIZE_T`($sp) sl${g}r $out,$inp # difference la $len,0($inp,$len) # end of input minus 64 larl %r7,.Lsigma lgr %r0,$sp la $sp,0(%r1,$sp) st${g} %r0,0($sp) lmg 
%r8,%r11,0($key) # load key lmg %r12,%r13,0($counter) # load counter lmg %r6,%r7,0(%r7) # load sigma constant la %r14,0($inp) st${g} $out,$frame+3*$SIZE_T($sp) st${g} $len,$frame+4*$SIZE_T($sp) stmg %r6,%r13,$stdframe($sp) # copy key schedule to stack srlg @x[12],%r12,32 # 32-bit counter value j .Loop_outer .align 16 .Loop_outer: lm @x[0],@x[7],$stdframe+4*0($sp) # load x[0]-x[7] lm @t[0],@t[1],$stdframe+4*10($sp) # load x[10]-x[11] lm @x[13],@x[15],$stdframe+4*13($sp) # load x[13]-x[15] stm @t[0],@t[1],$stdframe+4*8+4*10($sp) # offload x[10]-x[11] lm @t[0],@t[1],$stdframe+4*8($sp) # load x[8]-x[9] st @x[12],$stdframe+4*12($sp) # save counter st${g} %r14,$frame+2*$SIZE_T($sp) # save input pointer lhi %r14,10 j .Loop .align 4 .Loop: ___ foreach (&ROUND(0, 4, 8,12)) { eval; } foreach (&ROUND(0, 5,10,15)) { eval; } $code.=<<___; brct %r14,.Loop l${g} %r14,$frame+2*$SIZE_T($sp) # pull input pointer stm @t[0],@t[1],$stdframe+4*8+4*8($sp) # offload x[8]-x[9] lm${g} @t[0],@t[1],$frame+3*$SIZE_T($sp) al @x[0],$stdframe+4*0($sp) # accumulate key schedule al @x[1],$stdframe+4*1($sp) al @x[2],$stdframe+4*2($sp) al @x[3],$stdframe+4*3($sp) al @x[4],$stdframe+4*4($sp) al @x[5],$stdframe+4*5($sp) al @x[6],$stdframe+4*6($sp) al @x[7],$stdframe+4*7($sp) lrvr @x[0],@x[0] lrvr @x[1],@x[1] lrvr @x[2],@x[2] lrvr @x[3],@x[3] lrvr @x[4],@x[4] lrvr @x[5],@x[5] lrvr @x[6],@x[6] lrvr @x[7],@x[7] al @x[12],$stdframe+4*12($sp) al @x[13],$stdframe+4*13($sp) al @x[14],$stdframe+4*14($sp) al @x[15],$stdframe+4*15($sp) lrvr @x[12],@x[12] lrvr @x[13],@x[13] lrvr @x[14],@x[14] lrvr @x[15],@x[15] la @t[0],0(@t[0],%r14) # reconstruct output pointer cl${g}r %r14,@t[1] jh .Ltail x @x[0],4*0(%r14) # xor with input x @x[1],4*1(%r14) st @x[0],4*0(@t[0]) # store output x @x[2],4*2(%r14) st @x[1],4*1(@t[0]) x @x[3],4*3(%r14) st @x[2],4*2(@t[0]) x @x[4],4*4(%r14) st @x[3],4*3(@t[0]) lm @x[0],@x[3],$stdframe+4*8+4*8($sp) # load x[8]-x[11] x @x[5],4*5(%r14) st @x[4],4*4(@t[0]) x @x[6],4*6(%r14) al 
@x[0],$stdframe+4*8($sp) st @x[5],4*5(@t[0]) x @x[7],4*7(%r14) al @x[1],$stdframe+4*9($sp) st @x[6],4*6(@t[0]) x @x[12],4*12(%r14) al @x[2],$stdframe+4*10($sp) st @x[7],4*7(@t[0]) x @x[13],4*13(%r14) al @x[3],$stdframe+4*11($sp) st @x[12],4*12(@t[0]) x @x[14],4*14(%r14) st @x[13],4*13(@t[0]) x @x[15],4*15(%r14) st @x[14],4*14(@t[0]) lrvr @x[0],@x[0] st @x[15],4*15(@t[0]) lrvr @x[1],@x[1] lrvr @x[2],@x[2] lrvr @x[3],@x[3] lhi @x[12],1 x @x[0],4*8(%r14) al @x[12],$stdframe+4*12($sp) # increment counter x @x[1],4*9(%r14) st @x[0],4*8(@t[0]) x @x[2],4*10(%r14) st @x[1],4*9(@t[0]) x @x[3],4*11(%r14) st @x[2],4*10(@t[0]) la %r14,64(%r14) st @x[3],4*11(@t[0]) cl${g}r %r14,@t[1] # done yet? jle .Loop_outer .Ldone: xgr %r0,%r0 xgr %r1,%r1 xgr %r2,%r2 xgr %r3,%r3 stmg %r0,%r3,$stdframe+4*4($sp) # wipe key copy stmg %r0,%r3,$stdframe+4*12($sp) lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp) br %r14 .align 16 .Ltail: la @t[1],64($t[1]) stm @x[0],@x[7],$stdframe+4*0($sp) sl${g}r @t[1],%r14 lm @x[0],@x[3],$stdframe+4*8+4*8($sp) l${g}hi @x[6],0 stm @x[12],@x[15],$stdframe+4*12($sp) al @x[0],$stdframe+4*8($sp) al @x[1],$stdframe+4*9($sp) al @x[2],$stdframe+4*10($sp) al @x[3],$stdframe+4*11($sp) lrvr @x[0],@x[0] lrvr @x[1],@x[1] lrvr @x[2],@x[2] lrvr @x[3],@x[3] stm @x[0],@x[3],$stdframe+4*8+4*8($sp) .Loop_tail: llgc @x[4],0(@x[6],%r14) llgc @x[5],$stdframe(@x[6],$sp) xr @x[5],@x[4] stc @x[5],0(@x[6],@t[0]) la @x[6],1(@x[6]) brct @t[1],.Loop_tail j .Ldone .size ChaCha20_ctr32,.-ChaCha20_ctr32 .align 32 .Lsigma: .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral .asciz "ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>" .align 4 ___ foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/ge; print $_,"\n"; } close STDOUT; crypto/poly1305/Makefile.in +2 −0 Original line number Diff line number Diff line Loading @@ -39,6 +39,8 @@ lib: $(LIBOBJ) poly1305-sparcv9.S: asm/poly1305-sparcv9.pl $(PERL) asm/poly1305-sparcv9.pl > $@ poly1305-%.S: asm/poly1305-%.pl; $(PERL) $< 
$(PERLASM_SCHEME) $@ files: $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO Loading crypto/poly1305/asm/poly1305-s390x.pl 0 → 100755 +216 −0 Original line number Diff line number Diff line #!/usr/bin/env perl # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # This module implements Poly1305 hash for s390x. # # June 2015 # # ~6.4/2.2 cpb on z10/z196+, >2x improvement over compiler-generated # code. For older compiler improvement coefficient is >3x, because # then base 2^64 and base 2^32 implementations are compared. # # On side note, z13 enables vector base 2^26 implementation... $flavour = shift; if ($flavour =~ /3[12]/) { $SIZE_T=4; $g=""; } else { $SIZE_T=8; $g="g"; } while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; $sp="%r15"; my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5)); $code.=<<___; .text .globl poly1305_init .type poly1305_init,\@function .align 16 poly1305_init: lghi %r0,0 lghi %r1,-1 stg %r0,0($ctx) # zero hash value stg %r0,8($ctx) stg %r0,16($ctx) cl${g}r $inp,%r0 je .Lno_key lrvg %r4,0($inp) # load little-endian key lrvg %r5,8($inp) nihl %r1,0xffc0 # 0xffffffc0ffffffff srlg %r0,%r1,4 # 0x0ffffffc0fffffff srlg %r1,%r1,4 nill %r1,0xfffc # 0x0ffffffc0ffffffc ngr %r4,%r0 ngr %r5,%r1 stg %r4,32($ctx) stg %r5,40($ctx) .Lno_key: lghi %r2,0 br %r14 .size poly1305_init,.-poly1305_init ___ { my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14)); my ($r0,$r1,$s1) = map("%r$_",(0..2)); $code.=<<___; .globl poly1305_blocks .type poly1305_blocks,\@function .align 16 poly1305_blocks: srl${g} $len,$len,4 lghi %r0,0 cl${g}r $len,%r0 je .Lno_data stm${g} 
%r6,%r14,`6*$SIZE_T`($sp) lg $r0,32($ctx) # load key lg $r1,40($ctx) lg $h0,0($ctx) # load hash value lg $h1,8($ctx) lg $h2,16($ctx) st$g $ctx,`2*$SIZE_T`($sp) # off-load $ctx srlg $s1,$r1,2 algr $s1,$r1 # s1 = r1 + r1>>2 j .Loop .align 16 .Loop: lrvg $d0lo,0($inp) # load little-endian input lrvg $d1lo,8($inp) la $inp,16($inp) algr $d0lo,$h0 # accumulate input alcgr $d1lo,$h1 lgr $h0,$d0lo mlgr $d0hi,$r0 # h0*r0 -> $d0hi:$d0lo lgr $h1,$d1lo mlgr $d1hi,$s1 # h1*5*r1 -> $d1hi:$d1lo mlgr $t0,$r1 # h0*r1 -> $t0:$h0 mlgr $t1,$r0 # h1*r0 -> $t1:$h1 alcgr $h2,$padbit algr $d0lo,$d1lo lgr $d1lo,$h2 alcgr $d0hi,$d1hi lghi $d1hi,0 algr $h1,$h0 alcgr $t1,$t0 msgr $d1lo,$s1 # h2*s1 msgr $h2,$r0 # h2*r0 algr $h1,$d1lo alcgr $t1,$d1hi # $d1hi is zero algr $h1,$d0hi alcgr $h2,$t1 lghi $h0,-4 # final reduction step ngr $h0,$h2 srlg $t0,$h2,2 algr $h0,$t0 algr $h0,$d0lo lghi $t1,3 alcgr $h1,$d1hi # $d1hi is still zero ngr $h2,$t1 brct$g $len,.Loop l$g $ctx,`2*$SIZE_T`($sp) # restore $ctx stg $h0,0($ctx) # store hash value stg $h1,8($ctx) stg $h2,16($ctx) lm${g} %r6,%r14,`6*$SIZE_T`($sp) .Lno_data: br %r14 .size poly1305_blocks,.-poly1305_blocks ___ } { my ($mac,$nonce)=($inp,$len); my ($h0,$h1,$h2,$d0,$d1)=map("%r$_",(5..9)); $code.=<<___; .globl poly1305_emit .type poly1305_emit,\@function .align 16 poly1305_emit: stm${g} %r6,%r9,`6*$SIZE_T`($sp) lg $h0,0($ctx) lg $h1,8($ctx) lg $h2,16($ctx) lghi %r0,5 lghi %r1,0 lgr $d0,$h0 lgr $d1,$h1 algr $h0,%r0 # compare to modulus alcgr $h1,%r1 alcgr $h2,%r1 srlg $h2,$h2,2 # did it borrow/carry? 
slgr %r1,$h2 # 0-$h2>>2 lg $h2,0($nonce) # load nonce lghi %r0,-1 lg $ctx,8($nonce) xgr %r0,%r1 # ~%r1 ngr $h0,%r1 ngr $d0,%r0 ngr $h1,%r1 ngr $d1,%r0 ogr $h0,$d0 rllg $d0,$h2,32 # flip nonce words ogr $h1,$d1 rllg $d1,$ctx,32 algr $h0,$d0 # accumulate nonce alcgr $h1,$d1 strvg $h0,0($mac) # write little-endian result strvg $h1,8($mac) lm${g} %r6,%r9,`6*$SIZE_T`($sp) br %r14 .size poly1305_emit,.-poly1305_emit .string "Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>" ___ } $code =~ s/\`([^\`]*)\`/eval $1/gem; print $code; close STDOUT; Loading
crypto/chacha/Makefile.in +2 −0
@@ -36,6 +36,8 @@ lib: $(LIBOBJ)
	$(RANLIB) $(LIB) || echo Never mind.
	@touch lib

chacha-%.S:	asm/chacha-%.pl;	$(PERL) $< $(PERLASM_SCHEME) $@

files:
	$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2015
#
# ChaCha20 for s390x.
#
# 3 times faster than compiler-generated code.
#
# Usage: chacha-s390x.pl <flavour> [<output-file>]
#
# <flavour> selects the ABI: anything matching /3[12]/ generates code
# for 4-byte size_t/pointers, everything else for 8-byte ones. The
# first following argument that looks like "name.ext" is taken as the
# output file; without one the assembly goes to the inherited STDOUT.

use strict;

my $flavour = shift;
my ($SIZE_T, $g);

if ($flavour =~ /3[12]/) {
    $SIZE_T = 4;
    $g      = "";       # 32-bit forms of load/store/compare mnemonics
} else {
    $SIZE_T = 8;
    $g      = "g";      # 64-bit ("grande") forms
}

my $output;
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Only redirect when an output name was actually found, and never
# continue silently if the file cannot be created.
if (defined $output && length $output) {
    open STDOUT, ">", $output or die "can't open $output: $!";
}

my $code = "";

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{
    # Any call to an undefined sub, e.g. &alr("%r0","%r4"), lands here
    # and is appended to $code as "\talr\t%r0,%r4\n".
    our $AUTOLOAD;
    (my $opcode = $AUTOLOAD) =~ s/.*:://;
    $code .= "\t$opcode\t".join(',',@_)."\n";
}

my $sp="%r15";

my $stdframe=16*$SIZE_T+4*8;    # register save area + 4*8 bytes
my $frame=$stdframe+4*20;       # + 16-word key schedule + 4 off-loaded words

my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));

# State words 8..11 ('c' column) have no permanent register, hence "%rx".
my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));

# ROUND(a0,b0,c0,d0) returns a list of perl statements (strings) which,
# once eval-ed, emit four interleaved quarter-rounds through AUTOLOAD.
# The arguments are the state indices of the first quarter-round; the
# remaining three are derived by rotating each index within its row.
sub ROUND {
    my ($a0,$b0,$c0,$d0)=@_;
    my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
    my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
    my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
    my ($xc,$xc_)=map("\"$_\"",@t);
    my @x=map("\"$_\"",@x);     # quoted copies, the outer @x is untouched

	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# 'c' stores and loads in the middle, but none in the beginning
	# or end.

    return (
	"&alr	(@x[$a0],@x[$b0])",	# Q1
	 "&alr	(@x[$a1],@x[$b1])",	# Q2
	"&xr	(@x[$d0],@x[$a0])",
	 "&xr	(@x[$d1],@x[$a1])",
	"&rll	(@x[$d0],@x[$d0],16)",
	 "&rll	(@x[$d1],@x[$d1],16)",

	"&alr	($xc,@x[$d0])",
	 "&alr	($xc_,@x[$d1])",
	"&xr	(@x[$b0],$xc)",
	 "&xr	(@x[$b1],$xc_)",
	"&rll	(@x[$b0],@x[$b0],12)",
	 "&rll	(@x[$b1],@x[$b1],12)",

	"&alr	(@x[$a0],@x[$b0])",
	 "&alr	(@x[$a1],@x[$b1])",
	"&xr	(@x[$d0],@x[$a0])",
	 "&xr	(@x[$d1],@x[$a1])",
	"&rll	(@x[$d0],@x[$d0],8)",
	 "&rll	(@x[$d1],@x[$d1],8)",

	"&alr	($xc,@x[$d0])",
	 "&alr	($xc_,@x[$d1])",
	"&xr	(@x[$b0],$xc)",
	 "&xr	(@x[$b1],$xc_)",
	"&rll	(@x[$b0],@x[$b0],7)",
	 "&rll	(@x[$b1],@x[$b1],7)",

	"&stm	($xc,$xc_,'$stdframe+4*8+4*$c0($sp)')",	# reload pair of 'c's
	"&lm	($xc,$xc_,'$stdframe+4*8+4*$c2($sp)')",

	"&alr	(@x[$a2],@x[$b2])",	# Q3
	 "&alr	(@x[$a3],@x[$b3])",	# Q4
	"&xr	(@x[$d2],@x[$a2])",
	 "&xr	(@x[$d3],@x[$a3])",
	"&rll	(@x[$d2],@x[$d2],16)",
	 "&rll	(@x[$d3],@x[$d3],16)",

	"&alr	($xc,@x[$d2])",
	 "&alr	($xc_,@x[$d3])",
	"&xr	(@x[$b2],$xc)",
	 "&xr	(@x[$b3],$xc_)",
	"&rll	(@x[$b2],@x[$b2],12)",
	 "&rll	(@x[$b3],@x[$b3],12)",

	"&alr	(@x[$a2],@x[$b2])",
	 "&alr	(@x[$a3],@x[$b3])",
	"&xr	(@x[$d2],@x[$a2])",
	 "&xr	(@x[$d3],@x[$a3])",
	"&rll	(@x[$d2],@x[$d2],8)",
	 "&rll	(@x[$d3],@x[$d3],8)",

	"&alr	($xc,@x[$d2])",
	 "&alr	($xc_,@x[$d3])",
	"&xr	(@x[$b2],$xc)",
	 "&xr	(@x[$b3],$xc_)",
	"&rll	(@x[$b2],@x[$b2],7)",
	 "&rll	(@x[$b3],@x[$b3],7)"
    );
}

$code.=<<___;
.text

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,\@function
.align	32
ChaCha20_ctr32:
	lt${g}r	$len,$len			# nothing to do for empty input
	bzr	%r14
	a${g}hi	$len,-64
	l${g}hi	%r1,-$frame
	stm${g}	%r6,%r15,`6*$SIZE_T`($sp)
	sl${g}r	$out,$inp			# difference
	la	$len,0($inp,$len)		# end of input minus 64
	larl	%r7,.Lsigma
	lgr	%r0,$sp
	la	$sp,0(%r1,$sp)
	st${g}	%r0,0($sp)

	lmg	%r8,%r11,0($key)		# load key
	lmg	%r12,%r13,0($counter)		# load counter
	lmg	%r6,%r7,0(%r7)			# load sigma constant

	la	%r14,0($inp)
	st${g}	$out,$frame+3*$SIZE_T($sp)
	st${g}	$len,$frame+4*$SIZE_T($sp)
	stmg	%r6,%r13,$stdframe($sp)		# copy key schedule to stack
	srlg	@x[12],%r12,32			# 32-bit counter value
	j	.Loop_outer

.align	16
.Loop_outer:
	lm	@x[0],@x[7],$stdframe+4*0($sp)		# load x[0]-x[7]
	lm	@t[0],@t[1],$stdframe+4*10($sp)		# load x[10]-x[11]
	lm	@x[13],@x[15],$stdframe+4*13($sp)	# load x[13]-x[15]
	stm	@t[0],@t[1],$stdframe+4*8+4*10($sp)	# offload x[10]-x[11]
	lm	@t[0],@t[1],$stdframe+4*8($sp)		# load x[8]-x[9]
	st	@x[12],$stdframe+4*12($sp)		# save counter
	st${g}	%r14,$frame+2*$SIZE_T($sp)		# save input pointer
	lhi	%r14,10
	j	.Loop

.align	4
.Loop:
___
    # One even (column) plus one odd (diagonal) round per .Loop pass.
    # A failing eval would silently drop instructions, so treat it as
    # fatal.
    foreach my $insn (ROUND(0, 4, 8,12), ROUND(0, 5,10,15)) {
	eval $insn;
	die "perlasm: '$insn' failed: $@" if $@;
    }
$code.=<<___;
	brct	%r14,.Loop

	l${g}	%r14,$frame+2*$SIZE_T($sp)		# pull input pointer
	stm	@t[0],@t[1],$stdframe+4*8+4*8($sp)	# offload x[8]-x[9]
	lm${g}	@t[0],@t[1],$frame+3*$SIZE_T($sp)

	al	@x[0],$stdframe+4*0($sp)	# accumulate key schedule
	al	@x[1],$stdframe+4*1($sp)
	al	@x[2],$stdframe+4*2($sp)
	al	@x[3],$stdframe+4*3($sp)
	al	@x[4],$stdframe+4*4($sp)
	al	@x[5],$stdframe+4*5($sp)
	al	@x[6],$stdframe+4*6($sp)
	al	@x[7],$stdframe+4*7($sp)
	lrvr	@x[0],@x[0]
	lrvr	@x[1],@x[1]
	lrvr	@x[2],@x[2]
	lrvr	@x[3],@x[3]
	lrvr	@x[4],@x[4]
	lrvr	@x[5],@x[5]
	lrvr	@x[6],@x[6]
	lrvr	@x[7],@x[7]
	al	@x[12],$stdframe+4*12($sp)
	al	@x[13],$stdframe+4*13($sp)
	al	@x[14],$stdframe+4*14($sp)
	al	@x[15],$stdframe+4*15($sp)
	lrvr	@x[12],@x[12]
	lrvr	@x[13],@x[13]
	lrvr	@x[14],@x[14]
	lrvr	@x[15],@x[15]

	la	@t[0],0(@t[0],%r14)		# reconstruct output pointer
	cl${g}r	%r14,@t[1]
	jh	.Ltail

	x	@x[0],4*0(%r14)			# xor with input
	x	@x[1],4*1(%r14)
	st	@x[0],4*0(@t[0])		# store output
	x	@x[2],4*2(%r14)
	st	@x[1],4*1(@t[0])
	x	@x[3],4*3(%r14)
	st	@x[2],4*2(@t[0])
	x	@x[4],4*4(%r14)
	st	@x[3],4*3(@t[0])
	 lm	@x[0],@x[3],$stdframe+4*8+4*8($sp)	# load x[8]-x[11]
	x	@x[5],4*5(%r14)
	st	@x[4],4*4(@t[0])
	x	@x[6],4*6(%r14)
	 al	@x[0],$stdframe+4*8($sp)
	st	@x[5],4*5(@t[0])
	x	@x[7],4*7(%r14)
	 al	@x[1],$stdframe+4*9($sp)
	st	@x[6],4*6(@t[0])
	x	@x[12],4*12(%r14)
	 al	@x[2],$stdframe+4*10($sp)
	st	@x[7],4*7(@t[0])
	x	@x[13],4*13(%r14)
	 al	@x[3],$stdframe+4*11($sp)
	st	@x[12],4*12(@t[0])
	x	@x[14],4*14(%r14)
	st	@x[13],4*13(@t[0])
	x	@x[15],4*15(%r14)
	st	@x[14],4*14(@t[0])
	 lrvr	@x[0],@x[0]
	st	@x[15],4*15(@t[0])
	 lrvr	@x[1],@x[1]
	 lrvr	@x[2],@x[2]
	 lrvr	@x[3],@x[3]
	lhi	@x[12],1
	 x	@x[0],4*8(%r14)
	al	@x[12],$stdframe+4*12($sp)	# increment counter
	 x	@x[1],4*9(%r14)
	 st	@x[0],4*8(@t[0])
	 x	@x[2],4*10(%r14)
	 st	@x[1],4*9(@t[0])
	 x	@x[3],4*11(%r14)
	 st	@x[2],4*10(@t[0])
	la	%r14,64(%r14)
	 st	@x[3],4*11(@t[0])

	# Loop while any input is left. Comparing against "end minus 64"
	# here would skip a partial block that follows full ones.
	la	@t[1],64(@t[1])			# end of input
	cl${g}r	%r14,@t[1]			# done yet?
	jl	.Loop_outer

.Ldone:
	xgr	%r0,%r0
	xgr	%r1,%r1
	xgr	%r2,%r2
	xgr	%r3,%r3
	stmg	%r0,%r3,$stdframe+4*4($sp)	# wipe key copy
	stmg	%r0,%r3,$stdframe+4*12($sp)

	lm${g}	%r6,%r15,`$frame+6*$SIZE_T`($sp)
	br	%r14

.align	16
.Ltail:
	la	@t[1],64(@t[1])
	stm	@x[0],@x[7],$stdframe+4*0($sp)
	sl${g}r	@t[1],%r14			# remaining byte count
	lm	@x[0],@x[3],$stdframe+4*8+4*8($sp)
	l${g}hi	@x[6],0
	stm	@x[12],@x[15],$stdframe+4*12($sp)
	al	@x[0],$stdframe+4*8($sp)
	al	@x[1],$stdframe+4*9($sp)
	al	@x[2],$stdframe+4*10($sp)
	al	@x[3],$stdframe+4*11($sp)
	lrvr	@x[0],@x[0]
	lrvr	@x[1],@x[1]
	lrvr	@x[2],@x[2]
	lrvr	@x[3],@x[3]
	# Words 8..11 must land right behind words 0..7, over the key
	# copy, because .Loop_tail reads the key stream as 64 contiguous
	# bytes starting at $stdframe.
	stm	@x[0],@x[3],$stdframe+4*8($sp)

.Loop_tail:
	llgc	@x[4],0(@x[6],%r14)
	llgc	@x[5],$stdframe(@x[6],$sp)
	xr	@x[5],@x[4]
	stc	@x[5],0(@x[6],@t[0])
	la	@x[6],1(@x[6])
	brct	@t[1],.Loop_tail

	j	.Ldone
.size	ChaCha20_ctr32,.-ChaCha20_ctr32

.align	32
.Lsigma:
.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	# endian-neutral
.asciz	"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

# Expand `expr` place-holders (constant arithmetic) line by line.
foreach (split("\n",$code)) {
    s/\`([^\`]*)\`/eval $1/ge;

    print $_,"\n";
}
# Buffered write errors only surface on close.
close STDOUT or die "error closing STDOUT: $!";
crypto/poly1305/Makefile.in +2 −0
@@ -39,6 +39,8 @@ lib: $(LIBOBJ)
poly1305-sparcv9.S:	asm/poly1305-sparcv9.pl
	$(PERL) asm/poly1305-sparcv9.pl > $@

poly1305-%.S:	asm/poly1305-%.pl;	$(PERL) $< $(PERLASM_SCHEME) $@

files:
	$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for s390x.
#
# June 2015
#
# ~6.4/2.2 cpb on z10/z196+, >2x improvement over compiler-generated
# code. For older compiler improvement coefficient is >3x, because
# then base 2^64 and base 2^32 implementations are compared.
#
# On side note, z13 enables vector base 2^26 implementation...
#
# Usage: poly1305-s390x.pl <flavour> [<output-file>]
#
# <flavour> selects the ABI: anything matching /3[12]/ generates code
# for 4-byte size_t/pointers, everything else for 8-byte ones. The
# first following argument that looks like "name.ext" is taken as the
# output file; without one the assembly goes to the inherited STDOUT.

use strict;

my $flavour = shift;
my ($SIZE_T, $g);

if ($flavour =~ /3[12]/) {
    $SIZE_T = 4;
    $g      = "";       # 32-bit forms of load/store/compare mnemonics
} else {
    $SIZE_T = 8;
    $g      = "g";      # 64-bit ("grande") forms
}

my $output;
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Only redirect when an output name was actually found, and never
# continue silently if the file cannot be created.
if (defined $output && length $output) {
    open STDOUT, ">", $output or die "can't open $output: $!";
}

my $code = "";
my $sp="%r15";

my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5));

$code.=<<___;
.text

.globl	poly1305_init
.type	poly1305_init,\@function
.align	16
poly1305_init:
	lghi	%r0,0
	lghi	%r1,-1
	stg	%r0,0($ctx)		# zero hash value
	stg	%r0,8($ctx)
	stg	%r0,16($ctx)

	cl${g}r	$inp,%r0
	je	.Lno_key

	lrvg	%r4,0($inp)		# load little-endian key
	lrvg	%r5,8($inp)

	nihl	%r1,0xffc0		# 0xffffffc0ffffffff
	srlg	%r0,%r1,4		# 0x0ffffffc0fffffff
	srlg	%r1,%r1,4
	nill	%r1,0xfffc		# 0x0ffffffc0ffffffc

	ngr	%r4,%r0
	ngr	%r5,%r1

	stg	%r4,32($ctx)
	stg	%r5,40($ctx)

.Lno_key:
	lghi	%r2,0
	br	%r14
.size	poly1305_init,.-poly1305_init
___
{
my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14));
my ($r0,$r1,$s1) = map("%r$_",(0..2));

# Byte count -> 16-byte block count. The 32-bit SRL takes only two
# operands (register, shift amount), whereas SRLG takes three, so the
# instruction cannot be produced by merely appending $g to the mnemonic.
my $len_to_blocks = $g ? "srlg\t$len,$len,4" : "srl\t$len,4";

$code.=<<___;
.globl	poly1305_blocks
.type	poly1305_blocks,\@function
.align	16
poly1305_blocks:
	$len_to_blocks
	lghi	%r0,0
	cl${g}r	$len,%r0
	je	.Lno_data

	stm${g}	%r6,%r14,`6*$SIZE_T`($sp)

	llgfr	$padbit,$padbit		# clear upper half, it is added as
					# 64-bit value below and is not
					# defined by a non-64-bit ABI
	lg	$r0,32($ctx)		# load key
	lg	$r1,40($ctx)

	lg	$h0,0($ctx)		# load hash value
	lg	$h1,8($ctx)
	lg	$h2,16($ctx)

	st$g	$ctx,`2*$SIZE_T`($sp)	# off-load $ctx
	srlg	$s1,$r1,2
	algr	$s1,$r1			# s1 = r1 + r1>>2
	j	.Loop

.align	16
.Loop:
	lrvg	$d0lo,0($inp)		# load little-endian input
	lrvg	$d1lo,8($inp)
	la	$inp,16($inp)

	algr	$d0lo,$h0		# accumulate input
	alcgr	$d1lo,$h1

	lgr	$h0,$d0lo
	mlgr	$d0hi,$r0		# h0*r0	  -> $d0hi:$d0lo
	lgr	$h1,$d1lo
	mlgr	$d1hi,$s1		# h1*5*r1 -> $d1hi:$d1lo

	mlgr	$t0,$r1			# h0*r1   -> $t0:$h0
	mlgr	$t1,$r0			# h1*r0   -> $t1:$h1
	alcgr	$h2,$padbit

	algr	$d0lo,$d1lo
	lgr	$d1lo,$h2
	alcgr	$d0hi,$d1hi
	lghi	$d1hi,0

	algr	$h1,$h0
	alcgr	$t1,$t0

	msgr	$d1lo,$s1		# h2*s1
	msgr	$h2,$r0			# h2*r0

	algr	$h1,$d1lo
	alcgr	$t1,$d1hi		# $d1hi is zero

	algr	$h1,$d0hi
	alcgr	$h2,$t1

	lghi	$h0,-4			# final reduction step
	ngr	$h0,$h2
	srlg	$t0,$h2,2
	algr	$h0,$t0

	algr	$h0,$d0lo
	lghi	$t1,3
	alcgr	$h1,$d1hi		# $d1hi is still zero
	ngr	$h2,$t1

	brct$g	$len,.Loop

	l$g	$ctx,`2*$SIZE_T`($sp)	# restore $ctx

	stg	$h0,0($ctx)		# store hash value
	stg	$h1,8($ctx)
	stg	$h2,16($ctx)

	lm${g}	%r6,%r14,`6*$SIZE_T`($sp)
.Lno_data:
	br	%r14
.size	poly1305_blocks,.-poly1305_blocks
___
}
{
my ($mac,$nonce)=($inp,$len);
my ($h0,$h1,$h2,$d0,$d1)=map("%r$_",(5..9));

$code.=<<___;
.globl	poly1305_emit
.type	poly1305_emit,\@function
.align	16
poly1305_emit:
	stm${g}	%r6,%r9,`6*$SIZE_T`($sp)

	lg	$h0,0($ctx)
	lg	$h1,8($ctx)
	lg	$h2,16($ctx)

	lghi	%r0,5
	lghi	%r1,0
	lgr	$d0,$h0
	lgr	$d1,$h1

	algr	$h0,%r0			# compare to modulus
	alcgr	$h1,%r1
	alcgr	$h2,%r1

	srlg	$h2,$h2,2		# did it borrow/carry?
	slgr	%r1,$h2			# 0-$h2>>2
	lg	$h2,0($nonce)		# load nonce
	lghi	%r0,-1
	lg	$ctx,8($nonce)
	xgr	%r0,%r1			# ~%r1

	ngr	$h0,%r1
	ngr	$d0,%r0
	ngr	$h1,%r1
	ngr	$d1,%r0
	ogr	$h0,$d0
	rllg	$d0,$h2,32		# flip nonce words
	ogr	$h1,$d1
	rllg	$d1,$ctx,32

	algr	$h0,$d0			# accumulate nonce
	alcgr	$h1,$d1

	strvg	$h0,0($mac)		# write little-endian result
	strvg	$h1,8($mac)

	lm${g}	%r6,%r9,`6*$SIZE_T`($sp)
	br	%r14
.size	poly1305_emit,.-poly1305_emit

.string	"Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
}

# Expand `expr` place-holders (constant arithmetic).
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
# Buffered write errors only surface on close.
close STDOUT or die "error closing STDOUT: $!";