Loading crypto/chacha/asm/chacha-s390x.pl +557 −259 Original line number Diff line number Diff line #! /usr/bin/env perl # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. # Copyright 2016-2019 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy Loading @@ -20,41 +20,46 @@ # # 3 times faster than compiler-generated code. $flavour = shift; # # August 2018 # # Add vx code path. # # Copyright IBM Corp. 2018 # Author: Patrick Steuer <patrick.steuer@de.ibm.com> use strict; use FindBin qw($Bin); use lib "$Bin/../.."; use perlasm::s390x qw(:DEFAULT :VX AUTOLOAD LABEL INCLUDE); my $flavour = shift; my ($z,$SIZE_T); if ($flavour =~ /3[12]/) { $z=0; # S/390 ABI $SIZE_T=4; $g=""; } else { $z=1; # zSeries ABI $SIZE_T=8; $g="g"; } my $output; while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; sub AUTOLOAD() # thunk [simplified] x86-style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $code .= "\t$opcode\t".join(',',@_)."\n"; } my $sp="%r15"; my $stdframe=16*$SIZE_T+4*8; my $frame=$stdframe+4*20; my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6)); my @x=map("%r$_",(0..7,"x","x","x","x",(10..13))); my @t=map("%r$_",(8,9)); my @v=map("%v$_",(16..31)); sub ROUND { my ($a0,$b0,$c0,$d0)=@_; my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); my ($xc,$xc_)=map("\"$_\"",@t); my @x=map("\"$_\"",@x); my ($xc,$xc_)=map("$_",@t); # Consider order in which variables are addressed by their # index: Loading @@ -78,249 +83,542 @@ my @x=map("\"$_\"",@x); # 'c' stores and loads in the middle, but none in the beginning # or end. ( "&alr (@x[$a0],@x[$b0])", # Q1 "&alr (@x[$a1],@x[$b1])", # Q2 "&xr (@x[$d0],@x[$a0])", "&xr (@x[$d1],@x[$a1])", "&rll (@x[$d0],@x[$d0],16)", "&rll (@x[$d1],@x[$d1],16)", "&alr ($xc,@x[$d0])", "&alr ($xc_,@x[$d1])", "&xr (@x[$b0],$xc)", "&xr (@x[$b1],$xc_)", "&rll (@x[$b0],@x[$b0],12)", "&rll (@x[$b1],@x[$b1],12)", "&alr (@x[$a0],@x[$b0])", "&alr (@x[$a1],@x[$b1])", "&xr (@x[$d0],@x[$a0])", "&xr (@x[$d1],@x[$a1])", "&rll (@x[$d0],@x[$d0],8)", "&rll (@x[$d1],@x[$d1],8)", "&alr ($xc,@x[$d0])", "&alr ($xc_,@x[$d1])", "&xr (@x[$b0],$xc)", "&xr (@x[$b1],$xc_)", "&rll (@x[$b0],@x[$b0],7)", "&rll (@x[$b1],@x[$b1],7)", "&stm ($xc,$xc_,'$stdframe+4*8+4*$c0($sp)')", # reload pair of 'c's "&lm ($xc,$xc_,'$stdframe+4*8+4*$c2($sp)')", "&alr (@x[$a2],@x[$b2])", # Q3 "&alr (@x[$a3],@x[$b3])", # Q4 "&xr (@x[$d2],@x[$a2])", "&xr (@x[$d3],@x[$a3])", "&rll (@x[$d2],@x[$d2],16)", "&rll (@x[$d3],@x[$d3],16)", "&alr ($xc,@x[$d2])", "&alr ($xc_,@x[$d3])", "&xr (@x[$b2],$xc)", "&xr (@x[$b3],$xc_)", "&rll (@x[$b2],@x[$b2],12)", "&rll (@x[$b3],@x[$b3],12)", "&alr (@x[$a2],@x[$b2])", "&alr (@x[$a3],@x[$b3])", "&xr (@x[$d2],@x[$a2])", "&xr (@x[$d3],@x[$a3])", "&rll (@x[$d2],@x[$d2],8)", "&rll (@x[$d3],@x[$d3],8)", "&alr ($xc,@x[$d2])", "&alr ($xc_,@x[$d3])", "&xr (@x[$b2],$xc)", "&xr (@x[$b3],$xc_)", "&rll (@x[$b2],@x[$b2],7)", "&rll (@x[$b3],@x[$b3],7)" ); alr (@x[$a0],@x[$b0]); # Q1 alr (@x[$a1],@x[$b1]); # Q2 xr (@x[$d0],@x[$a0]); xr (@x[$d1],@x[$a1]); rll (@x[$d0],@x[$d0],16); rll (@x[$d1],@x[$d1],16); alr ($xc,@x[$d0]); alr ($xc_,@x[$d1]); xr (@x[$b0],$xc); xr (@x[$b1],$xc_); rll (@x[$b0],@x[$b0],12); rll (@x[$b1],@x[$b1],12); alr (@x[$a0],@x[$b0]); alr (@x[$a1],@x[$b1]); xr (@x[$d0],@x[$a0]); xr (@x[$d1],@x[$a1]); rll (@x[$d0],@x[$d0],8); rll (@x[$d1],@x[$d1],8); alr ($xc,@x[$d0]); alr ($xc_,@x[$d1]); xr (@x[$b0],$xc); xr (@x[$b1],$xc_); rll (@x[$b0],@x[$b0],7); rll (@x[$b1],@x[$b1],7); stm ($xc,$xc_,"$stdframe+4*8+4*$c0($sp)"); # reload pair of 'c's lm ($xc,$xc_,"$stdframe+4*8+4*$c2($sp)"); alr (@x[$a2],@x[$b2]); # Q3 alr (@x[$a3],@x[$b3]); # Q4 xr (@x[$d2],@x[$a2]); xr (@x[$d3],@x[$a3]); rll (@x[$d2],@x[$d2],16); rll (@x[$d3],@x[$d3],16); alr ($xc,@x[$d2]); alr ($xc_,@x[$d3]); xr (@x[$b2],$xc); xr (@x[$b3],$xc_); rll (@x[$b2],@x[$b2],12); rll (@x[$b3],@x[$b3],12); alr (@x[$a2],@x[$b2]); alr (@x[$a3],@x[$b3]); xr (@x[$d2],@x[$a2]); xr (@x[$d3],@x[$a3]); rll (@x[$d2],@x[$d2],8); rll (@x[$d3],@x[$d3],8); alr ($xc,@x[$d2]); alr ($xc_,@x[$d3]); xr (@x[$b2],$xc); xr (@x[$b3],$xc_); rll (@x[$b2],@x[$b2],7); rll (@x[$b3],@x[$b3],7); } sub VX_ROUND { my ($a0,$b0,$c0,$d0)=@_; my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); vaf (@v[$a0],@v[$a0],@v[$b0]); vaf (@v[$a1],@v[$a1],@v[$b1]); vaf (@v[$a2],@v[$a2],@v[$b2]); vaf (@v[$a3],@v[$a3],@v[$b3]); vx (@v[$d0],@v[$d0],@v[$a0]); vx (@v[$d1],@v[$d1],@v[$a1]); vx (@v[$d2],@v[$d2],@v[$a2]); vx (@v[$d3],@v[$d3],@v[$a3]); verllf (@v[$d0],@v[$d0],16); verllf (@v[$d1],@v[$d1],16); verllf (@v[$d2],@v[$d2],16); verllf (@v[$d3],@v[$d3],16); vaf (@v[$c0],@v[$c0],@v[$d0]); vaf (@v[$c1],@v[$c1],@v[$d1]); vaf (@v[$c2],@v[$c2],@v[$d2]); vaf (@v[$c3],@v[$c3],@v[$d3]); vx (@v[$b0],@v[$b0],@v[$c0]); vx (@v[$b1],@v[$b1],@v[$c1]); vx (@v[$b2],@v[$b2],@v[$c2]); vx (@v[$b3],@v[$b3],@v[$c3]); verllf (@v[$b0],@v[$b0],12); verllf (@v[$b1],@v[$b1],12); verllf (@v[$b2],@v[$b2],12); verllf (@v[$b3],@v[$b3],12); vaf (@v[$a0],@v[$a0],@v[$b0]); vaf (@v[$a1],@v[$a1],@v[$b1]); vaf (@v[$a2],@v[$a2],@v[$b2]); vaf (@v[$a3],@v[$a3],@v[$b3]); vx (@v[$d0],@v[$d0],@v[$a0]); vx (@v[$d1],@v[$d1],@v[$a1]); vx (@v[$d2],@v[$d2],@v[$a2]); vx (@v[$d3],@v[$d3],@v[$a3]); verllf (@v[$d0],@v[$d0],8); verllf (@v[$d1],@v[$d1],8); verllf (@v[$d2],@v[$d2],8); verllf (@v[$d3],@v[$d3],8); vaf (@v[$c0],@v[$c0],@v[$d0]); vaf (@v[$c1],@v[$c1],@v[$d1]); vaf (@v[$c2],@v[$c2],@v[$d2]); vaf (@v[$c3],@v[$c3],@v[$d3]); vx (@v[$b0],@v[$b0],@v[$c0]); vx (@v[$b1],@v[$b1],@v[$c1]); vx (@v[$b2],@v[$b2],@v[$c2]); vx (@v[$b3],@v[$b3],@v[$c3]); verllf (@v[$b0],@v[$b0],7); verllf (@v[$b1],@v[$b1],7); verllf (@v[$b2],@v[$b2],7); verllf (@v[$b3],@v[$b3],7); } PERLASM_BEGIN($output); INCLUDE ("s390x_arch.h"); TEXT (); ################ # void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len, # const unsigned int key[8], const unsigned int counter[4]) { my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6)); # VX CODE PATH { my $off=$z*8*16+8; # offset(initial state) my $frame=$stdframe+4*16+$off; GLOBL ("ChaCha20_ctr32"); TYPE ("ChaCha20_ctr32","\@function"); ALIGN (32); LABEL ("ChaCha20_ctr32"); larl ("%r1","OPENSSL_s390xcap_P"); lghi ("%r0",64); &{$z? \&cgr:\&cr} ($len,"%r0"); jle ("_s390x_chacha_novx"); lg ("%r0","S390X_STFLE+16(%r1)"); tmhh ("%r0",0x4000); # check for vector facility jz ("_s390x_chacha_novx"); if (!$z) { llgfr ($len,$len); std ("%f4","16*$SIZE_T+2*8($sp)"); std ("%f6","16*$SIZE_T+3*8($sp)"); } &{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)"); lghi ("%r1",-$frame); lgr ("%r0",$sp); la ($sp,"0(%r1,$sp)"); # allocate stack frame larl ("%r7",".Lsigma"); &{$z? \&stg:\&st} ("%r0","0($sp)"); # backchain vstm ("%v8","%v15","8($sp)") if ($z); vlm ("%v1","%v2","0($key)"); # load key vl ("%v0","0(%r7)"); # load sigma constant vl ("%v3","0($counter)"); # load iv (counter||nonce) l ("%r0","0($counter)"); # load counter vstm ("%v0","%v3","$off($sp)"); # copy initial state to stack srlg ("%r1",$len,8); ltgr ("%r1","%r1"); jz (".Lvx_4x_done"); ALIGN (16); # process 4 64-byte blocks LABEL (".Lvx_4x"); vlrepf ("%v$_",($_*4)."+$off($sp)") for (0..15); # load initial # state vl ("%v31","16(%r7)"); vaf ("%v12","%v12","%v31"); # increment counter vlr (@v[$_],"%v$_") for (0..15); # copy initial state lhi ("%r6",10); j (".Loop_vx_4x"); ALIGN (16); LABEL (".Loop_vx_4x"); VX_ROUND( 0, 4, 8,12); # column round VX_ROUND( 0, 5,10,15); # diagonal round brct ("%r6",".Loop_vx_4x"); vaf (@v[$_],@v[$_],"%v$_") for (0..15); # state += initial # state (mod 32) vlm ("%v6","%v7","32(%r7)"); # load vperm operands for (0..3) { # blocks 1,2 vmrhf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state) vmrhf ("%v1",@v[$_*4+2],@v[$_*4+3]); vperm ("%v".($_+ 8),"%v0","%v1","%v6"); vperm ("%v".($_+12),"%v0","%v1","%v7"); } vlm ("%v0","%v7","0($inp)"); # load in vx ("%v$_","%v$_","%v".($_+8)) for (0..7); # out = in ^ ks vstm ("%v0","%v7","0($out)"); # store out vlm ("%v6","%v7","32(%r7)"); # restore vperm operands for (0..3) { # blocks 2,3 vmrlf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state) vmrlf ("%v1",@v[$_*4+2],@v[$_*4+3]); vperm ("%v".($_+ 8),"%v0","%v1","%v6"); vperm ("%v".($_+12),"%v0","%v1","%v7"); } vlm ("%v0","%v7","128($inp)"); # load in vx ("%v$_","%v$_","%v".($_+8)) for (0..7); # out = in ^ ks vstm ("%v0","%v7","128($out)"); # store out ahi ("%r0",4); st ("%r0","48+$off($sp)"); # update initial state la ($inp,"256($inp)"); la ($out,"256($out)"); brctg ("%r1",".Lvx_4x"); ALIGN (16); LABEL (".Lvx_4x_done"); lghi ("%r1",0xff); ngr ($len,"%r1"); jnz (".Lvx_rem"); ALIGN (16); LABEL (".Lvx_done"); vzero ("%v$_") for (16..31); # wipe ks and key copy vstm ("%v16","%v17","16+$off($sp)"); vlm ("%v8","%v15","8($sp)") if ($z); la ($sp,"$frame($sp)"); &{$z? \&lmg:\&lm} ("%r6","%r7","6*$SIZE_T($sp)"); if (!$z) { ld ("%f4","16*$SIZE_T+2*8($sp)"); ld ("%f6","16*$SIZE_T+3*8($sp)"); vzero ("%v$_") for (8..15); } br ("%r14"); ALIGN (16); LABEL (".Lvx_rem"); lhi ("%r0",64); sr ($len,"%r0"); brc (2,".Lvx_rem_g64"); # cc==2? lghi ("%r1",-$stdframe); la ($counter,"48+$off($sp)"); # load updated iv ar ($len,"%r0"); # restore len lgr ("%r7",$counter); &{$z? \&stg:\&st} ("%r14","14*$SIZE_T+$frame($sp)"); la ($sp,"0(%r1,$sp)"); bras ("%r14","_s390x_chacha_novx"); la ($sp,"$stdframe($sp)"); &{$z? \&lg:\&l} ("%r14","14*$SIZE_T+$frame($sp)"); lgr ($counter,"%r7"); j (".Lvx_done"); ALIGN (16); LABEL (".Lvx_rem_g64"); vlrepf ("%v$_",($_*4)."+$off($sp)") for (0..15); # load initial # state vl ("%v31","16(%r7)"); vaf ("%v12","%v12","%v31"); # increment counter $code.=<<___; .text .globl ChaCha20_ctr32 .type ChaCha20_ctr32,\@function .align 32 ChaCha20_ctr32: lt${g}r $len,$len # $len==0? bzr %r14 a${g}hi $len,-64 l${g}hi %r1,-$frame stm${g} %r6,%r15,`6*$SIZE_T`($sp) sl${g}r $out,$inp # difference la $len,0($inp,$len) # end of input minus 64 larl %r7,.Lsigma lgr %r0,$sp la $sp,0(%r1,$sp) st${g} %r0,0($sp) lmg %r8,%r11,0($key) # load key lmg %r12,%r13,0($counter) # load counter lmg %r6,%r7,0(%r7) # load sigma constant la %r14,0($inp) st${g} $out,$frame+3*$SIZE_T($sp) st${g} $len,$frame+4*$SIZE_T($sp) stmg %r6,%r13,$stdframe($sp) # copy key schedule to stack srlg @x[12],%r12,32 # 32-bit counter value j .Loop_outer .align 16 .Loop_outer: lm @x[0],@x[7],$stdframe+4*0($sp) # load x[0]-x[7] lm @t[0],@t[1],$stdframe+4*10($sp) # load x[10]-x[11] lm @x[13],@x[15],$stdframe+4*13($sp) # load x[13]-x[15] stm @t[0],@t[1],$stdframe+4*8+4*10($sp) # offload x[10]-x[11] lm @t[0],@t[1],$stdframe+4*8($sp) # load x[8]-x[9] st @x[12],$stdframe+4*12($sp) # save counter st${g} %r14,$frame+2*$SIZE_T($sp) # save input pointer lhi %r14,10 j .Loop .align 4 .Loop: ___ foreach (&ROUND(0, 4, 8,12)) { eval; } foreach (&ROUND(0, 5,10,15)) { eval; } $code.=<<___; brct %r14,.Loop l${g} %r14,$frame+2*$SIZE_T($sp) # pull input pointer stm @t[0],@t[1],$stdframe+4*8+4*8($sp) # offload x[8]-x[9] lm${g} @t[0],@t[1],$frame+3*$SIZE_T($sp) al @x[0],$stdframe+4*0($sp) # accumulate key schedule al @x[1],$stdframe+4*1($sp) al @x[2],$stdframe+4*2($sp) al @x[3],$stdframe+4*3($sp) al @x[4],$stdframe+4*4($sp) al @x[5],$stdframe+4*5($sp) al @x[6],$stdframe+4*6($sp) al @x[7],$stdframe+4*7($sp) lrvr @x[0],@x[0] lrvr @x[1],@x[1] lrvr @x[2],@x[2] lrvr @x[3],@x[3] lrvr @x[4],@x[4] lrvr @x[5],@x[5] lrvr @x[6],@x[6] lrvr @x[7],@x[7] al @x[12],$stdframe+4*12($sp) al @x[13],$stdframe+4*13($sp) al @x[14],$stdframe+4*14($sp) al @x[15],$stdframe+4*15($sp) lrvr @x[12],@x[12] lrvr @x[13],@x[13] lrvr @x[14],@x[14] lrvr @x[15],@x[15] la @t[0],0(@t[0],%r14) # reconstruct output pointer cl${g}r %r14,@t[1] jh .Ltail x @x[0],4*0(%r14) # xor with input x @x[1],4*1(%r14) st @x[0],4*0(@t[0]) # store output x @x[2],4*2(%r14) st @x[1],4*1(@t[0]) x @x[3],4*3(%r14) st @x[2],4*2(@t[0]) x @x[4],4*4(%r14) st @x[3],4*3(@t[0]) lm @x[0],@x[3],$stdframe+4*8+4*8($sp) # load x[8]-x[11] x @x[5],4*5(%r14) st @x[4],4*4(@t[0]) x @x[6],4*6(%r14) al @x[0],$stdframe+4*8($sp) st @x[5],4*5(@t[0]) x @x[7],4*7(%r14) al @x[1],$stdframe+4*9($sp) st @x[6],4*6(@t[0]) x @x[12],4*12(%r14) al @x[2],$stdframe+4*10($sp) st @x[7],4*7(@t[0]) x @x[13],4*13(%r14) al @x[3],$stdframe+4*11($sp) st @x[12],4*12(@t[0]) x @x[14],4*14(%r14) st @x[13],4*13(@t[0]) x @x[15],4*15(%r14) st @x[14],4*14(@t[0]) lrvr @x[0],@x[0] st @x[15],4*15(@t[0]) lrvr @x[1],@x[1] lrvr @x[2],@x[2] lrvr @x[3],@x[3] lhi @x[12],1 x @x[0],4*8(%r14) al @x[12],$stdframe+4*12($sp) # increment counter x @x[1],4*9(%r14) st @x[0],4*8(@t[0]) x @x[2],4*10(%r14) st @x[1],4*9(@t[0]) x @x[3],4*11(%r14) st @x[2],4*10(@t[0]) st @x[3],4*11(@t[0]) cl${g}r %r14,@t[1] # done yet? la %r14,64(%r14) jl .Loop_outer .Ldone: xgr %r0,%r0 xgr %r1,%r1 xgr %r2,%r2 xgr %r3,%r3 stmg %r0,%r3,$stdframe+4*4($sp) # wipe key copy stmg %r0,%r3,$stdframe+4*12($sp) lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp) br %r14 .align 16 .Ltail: la @t[1],64($t[1]) stm @x[0],@x[7],$stdframe+4*0($sp) sl${g}r @t[1],%r14 lm @x[0],@x[3],$stdframe+4*8+4*8($sp) l${g}hi @x[6],0 stm @x[12],@x[15],$stdframe+4*12($sp) al @x[0],$stdframe+4*8($sp) al @x[1],$stdframe+4*9($sp) al @x[2],$stdframe+4*10($sp) al @x[3],$stdframe+4*11($sp) lrvr @x[0],@x[0] lrvr @x[1],@x[1] lrvr @x[2],@x[2] lrvr @x[3],@x[3] stm @x[0],@x[3],$stdframe+4*8($sp) .Loop_tail: llgc @x[4],0(@x[6],%r14) llgc @x[5],$stdframe(@x[6],$sp) xr @x[5],@x[4] stc @x[5],0(@x[6],@t[0]) la @x[6],1(@x[6]) brct @t[1],.Loop_tail j .Ldone .size ChaCha20_ctr32,.-ChaCha20_ctr32 .align 32 .Lsigma: .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral .asciz "ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>" .align 4 ___ foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/ge; print $_,"\n"; vlr (@v[$_],"%v$_") for (0..15); # state = initial state lhi ("%r6",10); j (".Loop_vx_rem"); ALIGN (16); LABEL (".Loop_vx_rem"); VX_ROUND( 0, 4, 8,12); # column round VX_ROUND( 0, 5,10,15); # diagonal round brct ("%r6",".Loop_vx_rem"); vaf (@v[$_],@v[$_],"%v$_") for (0..15); # state += initial # state (mod 32) vlm ("%v6","%v7","32(%r7)"); # load vperm operands for (0..3) { # blocks 1,2 vmrhf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state) vmrhf ("%v1",@v[$_*4+2],@v[$_*4+3]); vperm ("%v".($_+8),"%v0","%v1","%v6"); vperm ("%v".($_+12),"%v0","%v1","%v7"); } vlm ("%v0","%v3","0($inp)"); # load in vx ("%v$_","%v$_","%v".($_+8)) for (0..3); # out = in ^ ks vstm ("%v0","%v3","0($out)"); # store out la ($inp,"64($inp)"); la ($out,"64($out)"); sr ($len,"%r0"); brc (4,".Lvx_tail"); # cc==4? vlm ("%v0","%v3","0($inp)"); # load in vx ("%v$_","%v$_","%v".($_+12)) for (0..3); # out = in ^ ks vstm ("%v0","%v3","0($out)"); # store out jz (".Lvx_done"); for (0..3) { # blocks 3,4 vmrlf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state) vmrlf ("%v1",@v[$_*4+2],@v[$_*4+3]); vperm ("%v".($_+12),"%v0","%v1","%v6"); vperm ("%v".($_+8),"%v0","%v1","%v7"); } la ($inp,"64($inp)"); la ($out,"64($out)"); sr ($len,"%r0"); brc (4,".Lvx_tail"); # cc==4? vlm ("%v0","%v3","0($inp)"); # load in vx ("%v$_","%v$_","%v".($_+12)) for (0..3); # out = in ^ ks vstm ("%v0","%v3","0($out)"); # store out jz (".Lvx_done"); la ($inp,"64($inp)"); la ($out,"64($out)"); sr ($len,"%r0"); vlr ("%v".($_+4),"%v$_") for (8..11); j (".Lvx_tail"); ALIGN (16); LABEL (".Lvx_tail"); ar ($len,"%r0"); # restore $len ahi ($len,-1); lhi ("%r0",16); for (0..2) { vll ("%v0",$len,($_*16)."($inp)"); vx ("%v0","%v0","%v".($_+12)); vstl ("%v0",$len,($_*16)."($out)"); sr ($len,"%r0"); brc (4,".Lvx_done"); # cc==4? } vll ("%v0",$len,"3*16($inp)"); vx ("%v0","%v0","%v15"); vstl ("%v0",$len,"3*16($out)"); j (".Lvx_done"); SIZE ("ChaCha20_ctr32",".-ChaCha20_ctr32"); } # NOVX CODE PATH { my $frame=$stdframe+4*20; TYPE ("_s390x_chacha_novx","\@function"); ALIGN (32); LABEL ("_s390x_chacha_novx"); &{$z? \<gr:\<r} ($len,$len); # $len==0? bzr ("%r14"); &{$z? \&aghi:\&ahi} ($len,-64); &{$z? \&lghi:\&lhi} ("%r1",-$frame); &{$z? \&stmg:\&stm} ("%r6","%r15","6*$SIZE_T($sp)"); &{$z? \&slgr:\&slr} ($out,$inp); # difference la ($len,"0($inp,$len)"); # end of input minus 64 larl ("%r7",".Lsigma"); lgr ("%r0",$sp); la ($sp,"0(%r1,$sp)"); &{$z? \&stg:\&st} ("%r0","0($sp)"); lmg ("%r8","%r11","0($key)"); # load key lmg ("%r12","%r13","0($counter)"); # load counter lmg ("%r6","%r7","0(%r7)"); # load sigma constant la ("%r14","0($inp)"); &{$z? \&stg:\&st} ($out,"$frame+3*$SIZE_T($sp)"); &{$z? \&stg:\&st} ($len,"$frame+4*$SIZE_T($sp)"); stmg ("%r6","%r13","$stdframe($sp)");# copy key schedule to stack srlg (@x[12],"%r12",32); # 32-bit counter value j (".Loop_outer"); ALIGN (16); LABEL (".Loop_outer"); lm (@x[0],@x[7],"$stdframe+4*0($sp)"); # load x[0]-x[7] lm (@t[0],@t[1],"$stdframe+4*10($sp)"); # load x[10]-x[11] lm (@x[13],@x[15],"$stdframe+4*13($sp)"); # load x[13]-x[15] stm (@t[0],@t[1],"$stdframe+4*8+4*10($sp)");# offload x[10]-x[11] lm (@t[0],@t[1],"$stdframe+4*8($sp)"); # load x[8]-x[9] st (@x[12],"$stdframe+4*12($sp)"); # save counter &{$z? \&stg:\&st} ("%r14","$frame+2*$SIZE_T($sp)");# save input pointer lhi ("%r14",10); j (".Loop"); ALIGN (4); LABEL (".Loop"); ROUND (0, 4, 8,12); ROUND (0, 5,10,15); brct ("%r14",".Loop"); &{$z? \&lg:\&l} ("%r14","$frame+2*$SIZE_T($sp)");# pull input pointer stm (@t[0],@t[1],"$stdframe+4*8+4*8($sp)"); # offload x[8]-x[9] &{$z? \&lmg:\&lm} (@t[0],@t[1],"$frame+3*$SIZE_T($sp)"); al (@x[0],"$stdframe+4*0($sp)"); # accumulate key schedule al (@x[1],"$stdframe+4*1($sp)"); al (@x[2],"$stdframe+4*2($sp)"); al (@x[3],"$stdframe+4*3($sp)"); al (@x[4],"$stdframe+4*4($sp)"); al (@x[5],"$stdframe+4*5($sp)"); al (@x[6],"$stdframe+4*6($sp)"); al (@x[7],"$stdframe+4*7($sp)"); lrvr (@x[0],@x[0]); lrvr (@x[1],@x[1]); lrvr (@x[2],@x[2]); lrvr (@x[3],@x[3]); lrvr (@x[4],@x[4]); lrvr (@x[5],@x[5]); lrvr (@x[6],@x[6]); lrvr (@x[7],@x[7]); al (@x[12],"$stdframe+4*12($sp)"); al (@x[13],"$stdframe+4*13($sp)"); al (@x[14],"$stdframe+4*14($sp)"); al (@x[15],"$stdframe+4*15($sp)"); lrvr (@x[12],@x[12]); lrvr (@x[13],@x[13]); lrvr (@x[14],@x[14]); lrvr (@x[15],@x[15]); la (@t[0],"0(@t[0],%r14)"); # reconstruct output pointer &{$z? \&clgr:\&clr} ("%r14",@t[1]); jh (".Ltail"); x (@x[0],"4*0(%r14)"); # xor with input x (@x[1],"4*1(%r14)"); st (@x[0],"4*0(@t[0])"); # store output x (@x[2],"4*2(%r14)"); st (@x[1],"4*1(@t[0])"); x (@x[3],"4*3(%r14)"); st (@x[2],"4*2(@t[0])"); x (@x[4],"4*4(%r14)"); st (@x[3],"4*3(@t[0])"); lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)"); # load x[8]-x[11] x (@x[5],"4*5(%r14)"); st (@x[4],"4*4(@t[0])"); x (@x[6],"4*6(%r14)"); al (@x[0],"$stdframe+4*8($sp)"); st (@x[5],"4*5(@t[0])"); x (@x[7],"4*7(%r14)"); al (@x[1],"$stdframe+4*9($sp)"); st (@x[6],"4*6(@t[0])"); x (@x[12],"4*12(%r14)"); al (@x[2],"$stdframe+4*10($sp)"); st (@x[7],"4*7(@t[0])"); x (@x[13],"4*13(%r14)"); al (@x[3],"$stdframe+4*11($sp)"); st (@x[12],"4*12(@t[0])"); x (@x[14],"4*14(%r14)"); st (@x[13],"4*13(@t[0])"); x (@x[15],"4*15(%r14)"); st (@x[14],"4*14(@t[0])"); lrvr (@x[0],@x[0]); st (@x[15],"4*15(@t[0])"); lrvr (@x[1],@x[1]); lrvr (@x[2],@x[2]); lrvr (@x[3],@x[3]); lhi (@x[12],1); x (@x[0],"4*8(%r14)"); al (@x[12],"$stdframe+4*12($sp)"); # increment counter x (@x[1],"4*9(%r14)"); st (@x[0],"4*8(@t[0])"); x (@x[2],"4*10(%r14)"); st (@x[1],"4*9(@t[0])"); x (@x[3],"4*11(%r14)"); st (@x[2],"4*10(@t[0])"); st (@x[3],"4*11(@t[0])"); &{$z? \&clgr:\&clr} ("%r14",@t[1]); # done yet? la ("%r14","64(%r14)"); jl (".Loop_outer"); LABEL (".Ldone"); xgr ("%r0","%r0"); xgr ("%r1","%r1"); xgr ("%r2","%r2"); xgr ("%r3","%r3"); stmg ("%r0","%r3","$stdframe+4*4($sp)"); # wipe key copy stmg ("%r0","%r3","$stdframe+4*12($sp)"); &{$z? \&lmg:\&lm} ("%r6","%r15","$frame+6*$SIZE_T($sp)"); br ("%r14"); ALIGN (16); LABEL (".Ltail"); la (@t[1],"64($t[1])"); stm (@x[0],@x[7],"$stdframe+4*0($sp)"); &{$z? \&slgr:\&slr} (@t[1],"%r14"); lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)"); &{$z? \&lghi:\&lhi} (@x[6],0); stm (@x[12],@x[15],"$stdframe+4*12($sp)"); al (@x[0],"$stdframe+4*8($sp)"); al (@x[1],"$stdframe+4*9($sp)"); al (@x[2],"$stdframe+4*10($sp)"); al (@x[3],"$stdframe+4*11($sp)"); lrvr (@x[0],@x[0]); lrvr (@x[1],@x[1]); lrvr (@x[2],@x[2]); lrvr (@x[3],@x[3]); stm (@x[0],@x[3],"$stdframe+4*8($sp)"); LABEL (".Loop_tail"); llgc (@x[4],"0(@x[6],%r14)"); llgc (@x[5],"$stdframe(@x[6],$sp)"); xr (@x[5],@x[4]); stc (@x[5],"0(@x[6],@t[0])"); la (@x[6],"1(@x[6])"); brct (@t[1],".Loop_tail"); j (".Ldone"); SIZE ("_s390x_chacha_novx",".-_s390x_chacha_novx"); } } close STDOUT; ################ ALIGN (64); LABEL (".Lsigma"); LONG (0x61707865,0x3320646e,0x79622d32,0x6b206574); # endian-neutral sigma LONG (0x00000000,0x00000001,0x00000002,0x00000003); # vaf counter increment LONG (0x03020100,0x07060504,0x13121110,0x17161514); # vperm serialization LONG (0x0b0a0908,0x0f0e0d0c,0x1b1a1918,0x1f1e1d1c); # vperm serialization ASCIZ ("\"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>\""); ALIGN (4); PERLASM_END(); crypto/chacha/build.info +1 −0 Original line number Diff line number Diff line Loading @@ -9,6 +9,7 @@ GENERATE[chacha-armv4.S]=asm/chacha-armv4.pl $(PERLASM_SCHEME) INCLUDE[chacha-armv4.o]=.. GENERATE[chacha-armv8.S]=asm/chacha-armv8.pl $(PERLASM_SCHEME) INCLUDE[chacha-armv8.o]=.. INCLUDE[chacha-s390x.o]=.. BEGINRAW[Makefile(unix)] ##### CHACHA assembler implementations Loading Loading
crypto/chacha/asm/chacha-s390x.pl +557 −259 Original line number Diff line number Diff line #! /usr/bin/env perl # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. # Copyright 2016-2019 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy Loading @@ -20,41 +20,46 @@ # # 3 times faster than compiler-generated code. $flavour = shift; # # August 2018 # # Add vx code path. # # Copyright IBM Corp. 2018 # Author: Patrick Steuer <patrick.steuer@de.ibm.com> use strict; use FindBin qw($Bin); use lib "$Bin/../.."; use perlasm::s390x qw(:DEFAULT :VX AUTOLOAD LABEL INCLUDE); my $flavour = shift; my ($z,$SIZE_T); if ($flavour =~ /3[12]/) { $z=0; # S/390 ABI $SIZE_T=4; $g=""; } else { $z=1; # zSeries ABI $SIZE_T=8; $g="g"; } my $output; while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; sub AUTOLOAD() # thunk [simplified] x86-style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $code .= "\t$opcode\t".join(',',@_)."\n"; } my $sp="%r15"; my $stdframe=16*$SIZE_T+4*8; my $frame=$stdframe+4*20; my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6)); my @x=map("%r$_",(0..7,"x","x","x","x",(10..13))); my @t=map("%r$_",(8,9)); my @v=map("%v$_",(16..31)); sub ROUND { my ($a0,$b0,$c0,$d0)=@_; my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); my ($xc,$xc_)=map("\"$_\"",@t); my @x=map("\"$_\"",@x); my ($xc,$xc_)=map("$_",@t); # Consider order in which variables are addressed by their # index: Loading @@ -78,249 +83,542 @@ my @x=map("\"$_\"",@x); # 'c' stores and loads in the middle, but none in the beginning # or end. ( "&alr (@x[$a0],@x[$b0])", # Q1 "&alr (@x[$a1],@x[$b1])", # Q2 "&xr (@x[$d0],@x[$a0])", "&xr (@x[$d1],@x[$a1])", "&rll (@x[$d0],@x[$d0],16)", "&rll (@x[$d1],@x[$d1],16)", "&alr ($xc,@x[$d0])", "&alr ($xc_,@x[$d1])", "&xr (@x[$b0],$xc)", "&xr (@x[$b1],$xc_)", "&rll (@x[$b0],@x[$b0],12)", "&rll (@x[$b1],@x[$b1],12)", "&alr (@x[$a0],@x[$b0])", "&alr (@x[$a1],@x[$b1])", "&xr (@x[$d0],@x[$a0])", "&xr (@x[$d1],@x[$a1])", "&rll (@x[$d0],@x[$d0],8)", "&rll (@x[$d1],@x[$d1],8)", "&alr ($xc,@x[$d0])", "&alr ($xc_,@x[$d1])", "&xr (@x[$b0],$xc)", "&xr (@x[$b1],$xc_)", "&rll (@x[$b0],@x[$b0],7)", "&rll (@x[$b1],@x[$b1],7)", "&stm ($xc,$xc_,'$stdframe+4*8+4*$c0($sp)')", # reload pair of 'c's "&lm ($xc,$xc_,'$stdframe+4*8+4*$c2($sp)')", "&alr (@x[$a2],@x[$b2])", # Q3 "&alr (@x[$a3],@x[$b3])", # Q4 "&xr (@x[$d2],@x[$a2])", "&xr (@x[$d3],@x[$a3])", "&rll (@x[$d2],@x[$d2],16)", "&rll (@x[$d3],@x[$d3],16)", "&alr ($xc,@x[$d2])", "&alr ($xc_,@x[$d3])", "&xr (@x[$b2],$xc)", "&xr (@x[$b3],$xc_)", "&rll (@x[$b2],@x[$b2],12)", "&rll (@x[$b3],@x[$b3],12)", "&alr (@x[$a2],@x[$b2])", "&alr (@x[$a3],@x[$b3])", "&xr (@x[$d2],@x[$a2])", "&xr (@x[$d3],@x[$a3])", "&rll (@x[$d2],@x[$d2],8)", "&rll (@x[$d3],@x[$d3],8)", "&alr ($xc,@x[$d2])", "&alr ($xc_,@x[$d3])", "&xr (@x[$b2],$xc)", "&xr (@x[$b3],$xc_)", "&rll (@x[$b2],@x[$b2],7)", "&rll (@x[$b3],@x[$b3],7)" ); alr (@x[$a0],@x[$b0]); # Q1 alr (@x[$a1],@x[$b1]); # Q2 xr (@x[$d0],@x[$a0]); xr (@x[$d1],@x[$a1]); rll (@x[$d0],@x[$d0],16); rll (@x[$d1],@x[$d1],16); alr ($xc,@x[$d0]); alr ($xc_,@x[$d1]); xr (@x[$b0],$xc); xr (@x[$b1],$xc_); rll (@x[$b0],@x[$b0],12); rll (@x[$b1],@x[$b1],12); alr (@x[$a0],@x[$b0]); alr (@x[$a1],@x[$b1]); xr (@x[$d0],@x[$a0]); xr (@x[$d1],@x[$a1]); rll (@x[$d0],@x[$d0],8); rll (@x[$d1],@x[$d1],8); alr ($xc,@x[$d0]); alr ($xc_,@x[$d1]); xr (@x[$b0],$xc); xr (@x[$b1],$xc_); rll (@x[$b0],@x[$b0],7); rll (@x[$b1],@x[$b1],7); stm ($xc,$xc_,"$stdframe+4*8+4*$c0($sp)"); # reload pair of 'c's lm ($xc,$xc_,"$stdframe+4*8+4*$c2($sp)"); alr (@x[$a2],@x[$b2]); # Q3 alr (@x[$a3],@x[$b3]); # Q4 xr (@x[$d2],@x[$a2]); xr (@x[$d3],@x[$a3]); rll (@x[$d2],@x[$d2],16); rll (@x[$d3],@x[$d3],16); alr ($xc,@x[$d2]); alr ($xc_,@x[$d3]); xr (@x[$b2],$xc); xr (@x[$b3],$xc_); rll (@x[$b2],@x[$b2],12); rll (@x[$b3],@x[$b3],12); alr (@x[$a2],@x[$b2]); alr (@x[$a3],@x[$b3]); xr (@x[$d2],@x[$a2]); xr (@x[$d3],@x[$a3]); rll (@x[$d2],@x[$d2],8); rll (@x[$d3],@x[$d3],8); alr ($xc,@x[$d2]); alr ($xc_,@x[$d3]); xr (@x[$b2],$xc); xr (@x[$b3],$xc_); rll (@x[$b2],@x[$b2],7); rll (@x[$b3],@x[$b3],7); } sub VX_ROUND { my ($a0,$b0,$c0,$d0)=@_; my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); vaf (@v[$a0],@v[$a0],@v[$b0]); vaf (@v[$a1],@v[$a1],@v[$b1]); vaf (@v[$a2],@v[$a2],@v[$b2]); vaf (@v[$a3],@v[$a3],@v[$b3]); vx (@v[$d0],@v[$d0],@v[$a0]); vx (@v[$d1],@v[$d1],@v[$a1]); vx (@v[$d2],@v[$d2],@v[$a2]); vx (@v[$d3],@v[$d3],@v[$a3]); verllf (@v[$d0],@v[$d0],16); verllf (@v[$d1],@v[$d1],16); verllf (@v[$d2],@v[$d2],16); verllf (@v[$d3],@v[$d3],16); vaf (@v[$c0],@v[$c0],@v[$d0]); vaf (@v[$c1],@v[$c1],@v[$d1]); vaf (@v[$c2],@v[$c2],@v[$d2]); vaf (@v[$c3],@v[$c3],@v[$d3]); vx (@v[$b0],@v[$b0],@v[$c0]); vx (@v[$b1],@v[$b1],@v[$c1]); vx (@v[$b2],@v[$b2],@v[$c2]); vx (@v[$b3],@v[$b3],@v[$c3]); verllf (@v[$b0],@v[$b0],12); verllf (@v[$b1],@v[$b1],12); verllf (@v[$b2],@v[$b2],12); verllf (@v[$b3],@v[$b3],12); vaf (@v[$a0],@v[$a0],@v[$b0]); vaf (@v[$a1],@v[$a1],@v[$b1]); vaf (@v[$a2],@v[$a2],@v[$b2]); vaf (@v[$a3],@v[$a3],@v[$b3]); vx (@v[$d0],@v[$d0],@v[$a0]); vx (@v[$d1],@v[$d1],@v[$a1]); vx (@v[$d2],@v[$d2],@v[$a2]); vx (@v[$d3],@v[$d3],@v[$a3]); verllf (@v[$d0],@v[$d0],8); verllf (@v[$d1],@v[$d1],8); verllf (@v[$d2],@v[$d2],8); verllf (@v[$d3],@v[$d3],8); vaf (@v[$c0],@v[$c0],@v[$d0]); vaf (@v[$c1],@v[$c1],@v[$d1]); vaf (@v[$c2],@v[$c2],@v[$d2]); vaf (@v[$c3],@v[$c3],@v[$d3]); vx (@v[$b0],@v[$b0],@v[$c0]); vx (@v[$b1],@v[$b1],@v[$c1]); vx (@v[$b2],@v[$b2],@v[$c2]); vx (@v[$b3],@v[$b3],@v[$c3]); verllf (@v[$b0],@v[$b0],7); verllf (@v[$b1],@v[$b1],7); verllf (@v[$b2],@v[$b2],7); verllf (@v[$b3],@v[$b3],7); } PERLASM_BEGIN($output); INCLUDE ("s390x_arch.h"); TEXT (); ################ # void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len, # const unsigned int key[8], const unsigned int counter[4]) { my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6)); # VX CODE PATH { my $off=$z*8*16+8; # offset(initial state) my $frame=$stdframe+4*16+$off; GLOBL ("ChaCha20_ctr32"); TYPE ("ChaCha20_ctr32","\@function"); ALIGN (32); LABEL ("ChaCha20_ctr32"); larl ("%r1","OPENSSL_s390xcap_P"); lghi ("%r0",64); &{$z? \&cgr:\&cr} ($len,"%r0"); jle ("_s390x_chacha_novx"); lg ("%r0","S390X_STFLE+16(%r1)"); tmhh ("%r0",0x4000); # check for vector facility jz ("_s390x_chacha_novx"); if (!$z) { llgfr ($len,$len); std ("%f4","16*$SIZE_T+2*8($sp)"); std ("%f6","16*$SIZE_T+3*8($sp)"); } &{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)"); lghi ("%r1",-$frame); lgr ("%r0",$sp); la ($sp,"0(%r1,$sp)"); # allocate stack frame larl ("%r7",".Lsigma"); &{$z? \&stg:\&st} ("%r0","0($sp)"); # backchain vstm ("%v8","%v15","8($sp)") if ($z); vlm ("%v1","%v2","0($key)"); # load key vl ("%v0","0(%r7)"); # load sigma constant vl ("%v3","0($counter)"); # load iv (counter||nonce) l ("%r0","0($counter)"); # load counter vstm ("%v0","%v3","$off($sp)"); # copy initial state to stack srlg ("%r1",$len,8); ltgr ("%r1","%r1"); jz (".Lvx_4x_done"); ALIGN (16); # process 4 64-byte blocks LABEL (".Lvx_4x"); vlrepf ("%v$_",($_*4)."+$off($sp)") for (0..15); # load initial # state vl ("%v31","16(%r7)"); vaf ("%v12","%v12","%v31"); # increment counter vlr (@v[$_],"%v$_") for (0..15); # copy initial state lhi ("%r6",10); j (".Loop_vx_4x"); ALIGN (16); LABEL (".Loop_vx_4x"); VX_ROUND( 0, 4, 8,12); # column round VX_ROUND( 0, 5,10,15); # diagonal round brct ("%r6",".Loop_vx_4x"); vaf (@v[$_],@v[$_],"%v$_") for (0..15); # state += initial # state (mod 32) vlm ("%v6","%v7","32(%r7)"); # load vperm operands for (0..3) { # blocks 1,2 vmrhf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state) vmrhf ("%v1",@v[$_*4+2],@v[$_*4+3]); vperm ("%v".($_+ 8),"%v0","%v1","%v6"); vperm ("%v".($_+12),"%v0","%v1","%v7"); } vlm ("%v0","%v7","0($inp)"); # load in vx ("%v$_","%v$_","%v".($_+8)) for (0..7); # out = in ^ ks vstm ("%v0","%v7","0($out)"); # store out vlm ("%v6","%v7","32(%r7)"); # restore vperm operands for (0..3) { # blocks 2,3 vmrlf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state) vmrlf ("%v1",@v[$_*4+2],@v[$_*4+3]); vperm ("%v".($_+ 8),"%v0","%v1","%v6"); vperm ("%v".($_+12),"%v0","%v1","%v7"); } vlm ("%v0","%v7","128($inp)"); # load in vx ("%v$_","%v$_","%v".($_+8)) for (0..7); # out = in ^ ks vstm ("%v0","%v7","128($out)"); # store out ahi ("%r0",4); st ("%r0","48+$off($sp)"); # update initial state la ($inp,"256($inp)"); la ($out,"256($out)"); brctg ("%r1",".Lvx_4x"); ALIGN (16); LABEL (".Lvx_4x_done"); lghi ("%r1",0xff); ngr ($len,"%r1"); jnz (".Lvx_rem"); ALIGN (16); LABEL (".Lvx_done"); vzero ("%v$_") for (16..31); # wipe ks and key copy vstm ("%v16","%v17","16+$off($sp)"); vlm ("%v8","%v15","8($sp)") if ($z); la ($sp,"$frame($sp)"); &{$z? \&lmg:\&lm} ("%r6","%r7","6*$SIZE_T($sp)"); if (!$z) { ld ("%f4","16*$SIZE_T+2*8($sp)"); ld ("%f6","16*$SIZE_T+3*8($sp)"); vzero ("%v$_") for (8..15); } br ("%r14"); ALIGN (16); LABEL (".Lvx_rem"); lhi ("%r0",64); sr ($len,"%r0"); brc (2,".Lvx_rem_g64"); # cc==2? lghi ("%r1",-$stdframe); la ($counter,"48+$off($sp)"); # load updated iv ar ($len,"%r0"); # restore len lgr ("%r7",$counter); &{$z? \&stg:\&st} ("%r14","14*$SIZE_T+$frame($sp)"); la ($sp,"0(%r1,$sp)"); bras ("%r14","_s390x_chacha_novx"); la ($sp,"$stdframe($sp)"); &{$z? \&lg:\&l} ("%r14","14*$SIZE_T+$frame($sp)"); lgr ($counter,"%r7"); j (".Lvx_done"); ALIGN (16); LABEL (".Lvx_rem_g64"); vlrepf ("%v$_",($_*4)."+$off($sp)") for (0..15); # load initial # state vl ("%v31","16(%r7)"); vaf ("%v12","%v12","%v31"); # increment counter $code.=<<___; .text .globl ChaCha20_ctr32 .type ChaCha20_ctr32,\@function .align 32 ChaCha20_ctr32: lt${g}r $len,$len # $len==0? bzr %r14 a${g}hi $len,-64 l${g}hi %r1,-$frame stm${g} %r6,%r15,`6*$SIZE_T`($sp) sl${g}r $out,$inp # difference la $len,0($inp,$len) # end of input minus 64 larl %r7,.Lsigma lgr %r0,$sp la $sp,0(%r1,$sp) st${g} %r0,0($sp) lmg %r8,%r11,0($key) # load key lmg %r12,%r13,0($counter) # load counter lmg %r6,%r7,0(%r7) # load sigma constant la %r14,0($inp) st${g} $out,$frame+3*$SIZE_T($sp) st${g} $len,$frame+4*$SIZE_T($sp) stmg %r6,%r13,$stdframe($sp) # copy key schedule to stack srlg @x[12],%r12,32 # 32-bit counter value j .Loop_outer .align 16 .Loop_outer: lm @x[0],@x[7],$stdframe+4*0($sp) # load x[0]-x[7] lm @t[0],@t[1],$stdframe+4*10($sp) # load x[10]-x[11] lm @x[13],@x[15],$stdframe+4*13($sp) # load x[13]-x[15] stm @t[0],@t[1],$stdframe+4*8+4*10($sp) # offload x[10]-x[11] lm @t[0],@t[1],$stdframe+4*8($sp) # load x[8]-x[9] st @x[12],$stdframe+4*12($sp) # save counter st${g} %r14,$frame+2*$SIZE_T($sp) # save input pointer lhi %r14,10 j .Loop .align 4 .Loop: ___ foreach (&ROUND(0, 4, 8,12)) { eval; } foreach (&ROUND(0, 5,10,15)) { eval; } $code.=<<___; brct %r14,.Loop l${g} %r14,$frame+2*$SIZE_T($sp) # pull input pointer stm @t[0],@t[1],$stdframe+4*8+4*8($sp) # offload x[8]-x[9] lm${g} @t[0],@t[1],$frame+3*$SIZE_T($sp) al @x[0],$stdframe+4*0($sp) # accumulate key schedule al @x[1],$stdframe+4*1($sp) al @x[2],$stdframe+4*2($sp) al @x[3],$stdframe+4*3($sp) al @x[4],$stdframe+4*4($sp) al @x[5],$stdframe+4*5($sp) al @x[6],$stdframe+4*6($sp) al @x[7],$stdframe+4*7($sp) lrvr @x[0],@x[0] lrvr @x[1],@x[1] lrvr @x[2],@x[2] lrvr @x[3],@x[3] lrvr @x[4],@x[4] lrvr @x[5],@x[5] lrvr @x[6],@x[6] lrvr @x[7],@x[7] al @x[12],$stdframe+4*12($sp) al @x[13],$stdframe+4*13($sp) al @x[14],$stdframe+4*14($sp) al @x[15],$stdframe+4*15($sp) lrvr @x[12],@x[12] lrvr @x[13],@x[13] lrvr @x[14],@x[14] lrvr @x[15],@x[15] la @t[0],0(@t[0],%r14) # reconstruct output pointer cl${g}r %r14,@t[1] jh .Ltail x @x[0],4*0(%r14) # xor with input x @x[1],4*1(%r14) st @x[0],4*0(@t[0]) # store output x @x[2],4*2(%r14) st @x[1],4*1(@t[0]) x @x[3],4*3(%r14) st @x[2],4*2(@t[0]) x @x[4],4*4(%r14) st @x[3],4*3(@t[0]) lm @x[0],@x[3],$stdframe+4*8+4*8($sp) # load x[8]-x[11] x @x[5],4*5(%r14) st @x[4],4*4(@t[0]) x @x[6],4*6(%r14) al @x[0],$stdframe+4*8($sp) st @x[5],4*5(@t[0]) x @x[7],4*7(%r14) al @x[1],$stdframe+4*9($sp) st @x[6],4*6(@t[0]) x @x[12],4*12(%r14) al @x[2],$stdframe+4*10($sp) st @x[7],4*7(@t[0]) x @x[13],4*13(%r14) al @x[3],$stdframe+4*11($sp) st @x[12],4*12(@t[0]) x @x[14],4*14(%r14) st @x[13],4*13(@t[0]) x @x[15],4*15(%r14) st @x[14],4*14(@t[0]) lrvr @x[0],@x[0] st @x[15],4*15(@t[0]) lrvr @x[1],@x[1] lrvr @x[2],@x[2] lrvr @x[3],@x[3] lhi @x[12],1 x @x[0],4*8(%r14) al @x[12],$stdframe+4*12($sp) # increment counter x @x[1],4*9(%r14) st @x[0],4*8(@t[0]) x @x[2],4*10(%r14) st @x[1],4*9(@t[0]) x @x[3],4*11(%r14) st @x[2],4*10(@t[0]) st @x[3],4*11(@t[0]) cl${g}r %r14,@t[1] # done yet? la %r14,64(%r14) jl .Loop_outer .Ldone: xgr %r0,%r0 xgr %r1,%r1 xgr %r2,%r2 xgr %r3,%r3 stmg %r0,%r3,$stdframe+4*4($sp) # wipe key copy stmg %r0,%r3,$stdframe+4*12($sp) lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp) br %r14 .align 16 .Ltail: la @t[1],64($t[1]) stm @x[0],@x[7],$stdframe+4*0($sp) sl${g}r @t[1],%r14 lm @x[0],@x[3],$stdframe+4*8+4*8($sp) l${g}hi @x[6],0 stm @x[12],@x[15],$stdframe+4*12($sp) al @x[0],$stdframe+4*8($sp) al @x[1],$stdframe+4*9($sp) al @x[2],$stdframe+4*10($sp) al @x[3],$stdframe+4*11($sp) lrvr @x[0],@x[0] lrvr @x[1],@x[1] lrvr @x[2],@x[2] lrvr @x[3],@x[3] stm @x[0],@x[3],$stdframe+4*8($sp) .Loop_tail: llgc @x[4],0(@x[6],%r14) llgc @x[5],$stdframe(@x[6],$sp) xr @x[5],@x[4] stc @x[5],0(@x[6],@t[0]) la @x[6],1(@x[6]) brct @t[1],.Loop_tail j .Ldone .size ChaCha20_ctr32,.-ChaCha20_ctr32 .align 32 .Lsigma: .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral .asciz "ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>" .align 4 ___ foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/ge; print $_,"\n"; vlr (@v[$_],"%v$_") for (0..15); # state = initial state lhi ("%r6",10); j (".Loop_vx_rem"); ALIGN (16); LABEL (".Loop_vx_rem"); VX_ROUND( 0, 4, 8,12); # column round VX_ROUND( 0, 5,10,15); # diagonal round brct ("%r6",".Loop_vx_rem"); vaf (@v[$_],@v[$_],"%v$_") for (0..15); # state += initial # state (mod 32) vlm ("%v6","%v7","32(%r7)"); # load vperm operands for (0..3) { # blocks 1,2 vmrhf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state) vmrhf ("%v1",@v[$_*4+2],@v[$_*4+3]); vperm ("%v".($_+8),"%v0","%v1","%v6"); vperm ("%v".($_+12),"%v0","%v1","%v7"); } vlm ("%v0","%v3","0($inp)"); # load in vx ("%v$_","%v$_","%v".($_+8)) for (0..3); # out = in ^ ks vstm ("%v0","%v3","0($out)"); # store out la ($inp,"64($inp)"); la ($out,"64($out)"); sr ($len,"%r0"); brc (4,".Lvx_tail"); # cc==4? vlm ("%v0","%v3","0($inp)"); # load in vx ("%v$_","%v$_","%v".($_+12)) for (0..3); # out = in ^ ks vstm ("%v0","%v3","0($out)"); # store out jz (".Lvx_done"); for (0..3) { # blocks 3,4 vmrlf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state) vmrlf ("%v1",@v[$_*4+2],@v[$_*4+3]); vperm ("%v".($_+12),"%v0","%v1","%v6"); vperm ("%v".($_+8),"%v0","%v1","%v7"); } la ($inp,"64($inp)"); la ($out,"64($out)"); sr ($len,"%r0"); brc (4,".Lvx_tail"); # cc==4? vlm ("%v0","%v3","0($inp)"); # load in vx ("%v$_","%v$_","%v".($_+12)) for (0..3); # out = in ^ ks vstm ("%v0","%v3","0($out)"); # store out jz (".Lvx_done"); la ($inp,"64($inp)"); la ($out,"64($out)"); sr ($len,"%r0"); vlr ("%v".($_+4),"%v$_") for (8..11); j (".Lvx_tail"); ALIGN (16); LABEL (".Lvx_tail"); ar ($len,"%r0"); # restore $len ahi ($len,-1); lhi ("%r0",16); for (0..2) { vll ("%v0",$len,($_*16)."($inp)"); vx ("%v0","%v0","%v".($_+12)); vstl ("%v0",$len,($_*16)."($out)"); sr ($len,"%r0"); brc (4,".Lvx_done"); # cc==4? } vll ("%v0",$len,"3*16($inp)"); vx ("%v0","%v0","%v15"); vstl ("%v0",$len,"3*16($out)"); j (".Lvx_done"); SIZE ("ChaCha20_ctr32",".-ChaCha20_ctr32"); } # NOVX CODE PATH { my $frame=$stdframe+4*20; TYPE ("_s390x_chacha_novx","\@function"); ALIGN (32); LABEL ("_s390x_chacha_novx"); &{$z? \<gr:\<r} ($len,$len); # $len==0? bzr ("%r14"); &{$z? \&aghi:\&ahi} ($len,-64); &{$z? \&lghi:\&lhi} ("%r1",-$frame); &{$z? \&stmg:\&stm} ("%r6","%r15","6*$SIZE_T($sp)"); &{$z? \&slgr:\&slr} ($out,$inp); # difference la ($len,"0($inp,$len)"); # end of input minus 64 larl ("%r7",".Lsigma"); lgr ("%r0",$sp); la ($sp,"0(%r1,$sp)"); &{$z? \&stg:\&st} ("%r0","0($sp)"); lmg ("%r8","%r11","0($key)"); # load key lmg ("%r12","%r13","0($counter)"); # load counter lmg ("%r6","%r7","0(%r7)"); # load sigma constant la ("%r14","0($inp)"); &{$z? \&stg:\&st} ($out,"$frame+3*$SIZE_T($sp)"); &{$z? \&stg:\&st} ($len,"$frame+4*$SIZE_T($sp)"); stmg ("%r6","%r13","$stdframe($sp)");# copy key schedule to stack srlg (@x[12],"%r12",32); # 32-bit counter value j (".Loop_outer"); ALIGN (16); LABEL (".Loop_outer"); lm (@x[0],@x[7],"$stdframe+4*0($sp)"); # load x[0]-x[7] lm (@t[0],@t[1],"$stdframe+4*10($sp)"); # load x[10]-x[11] lm (@x[13],@x[15],"$stdframe+4*13($sp)"); # load x[13]-x[15] stm (@t[0],@t[1],"$stdframe+4*8+4*10($sp)");# offload x[10]-x[11] lm (@t[0],@t[1],"$stdframe+4*8($sp)"); # load x[8]-x[9] st (@x[12],"$stdframe+4*12($sp)"); # save counter &{$z? \&stg:\&st} ("%r14","$frame+2*$SIZE_T($sp)");# save input pointer lhi ("%r14",10); j (".Loop"); ALIGN (4); LABEL (".Loop"); ROUND (0, 4, 8,12); ROUND (0, 5,10,15); brct ("%r14",".Loop"); &{$z? \&lg:\&l} ("%r14","$frame+2*$SIZE_T($sp)");# pull input pointer stm (@t[0],@t[1],"$stdframe+4*8+4*8($sp)"); # offload x[8]-x[9] &{$z? \&lmg:\&lm} (@t[0],@t[1],"$frame+3*$SIZE_T($sp)"); al (@x[0],"$stdframe+4*0($sp)"); # accumulate key schedule al (@x[1],"$stdframe+4*1($sp)"); al (@x[2],"$stdframe+4*2($sp)"); al (@x[3],"$stdframe+4*3($sp)"); al (@x[4],"$stdframe+4*4($sp)"); al (@x[5],"$stdframe+4*5($sp)"); al (@x[6],"$stdframe+4*6($sp)"); al (@x[7],"$stdframe+4*7($sp)"); lrvr (@x[0],@x[0]); lrvr (@x[1],@x[1]); lrvr (@x[2],@x[2]); lrvr (@x[3],@x[3]); lrvr (@x[4],@x[4]); lrvr (@x[5],@x[5]); lrvr (@x[6],@x[6]); lrvr (@x[7],@x[7]); al (@x[12],"$stdframe+4*12($sp)"); al (@x[13],"$stdframe+4*13($sp)"); al (@x[14],"$stdframe+4*14($sp)"); al (@x[15],"$stdframe+4*15($sp)"); lrvr (@x[12],@x[12]); lrvr (@x[13],@x[13]); lrvr (@x[14],@x[14]); lrvr (@x[15],@x[15]); la (@t[0],"0(@t[0],%r14)"); # reconstruct output pointer &{$z? \&clgr:\&clr} ("%r14",@t[1]); jh (".Ltail"); x (@x[0],"4*0(%r14)"); # xor with input x (@x[1],"4*1(%r14)"); st (@x[0],"4*0(@t[0])"); # store output x (@x[2],"4*2(%r14)"); st (@x[1],"4*1(@t[0])"); x (@x[3],"4*3(%r14)"); st (@x[2],"4*2(@t[0])"); x (@x[4],"4*4(%r14)"); st (@x[3],"4*3(@t[0])"); lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)"); # load x[8]-x[11] x (@x[5],"4*5(%r14)"); st (@x[4],"4*4(@t[0])"); x (@x[6],"4*6(%r14)"); al (@x[0],"$stdframe+4*8($sp)"); st (@x[5],"4*5(@t[0])"); x (@x[7],"4*7(%r14)"); al (@x[1],"$stdframe+4*9($sp)"); st (@x[6],"4*6(@t[0])"); x (@x[12],"4*12(%r14)"); al (@x[2],"$stdframe+4*10($sp)"); st (@x[7],"4*7(@t[0])"); x (@x[13],"4*13(%r14)"); al (@x[3],"$stdframe+4*11($sp)"); st (@x[12],"4*12(@t[0])"); x (@x[14],"4*14(%r14)"); st (@x[13],"4*13(@t[0])"); x (@x[15],"4*15(%r14)"); st (@x[14],"4*14(@t[0])"); lrvr (@x[0],@x[0]); st (@x[15],"4*15(@t[0])"); lrvr (@x[1],@x[1]); lrvr (@x[2],@x[2]); lrvr (@x[3],@x[3]); lhi (@x[12],1); x (@x[0],"4*8(%r14)"); al (@x[12],"$stdframe+4*12($sp)"); # increment counter x (@x[1],"4*9(%r14)"); st (@x[0],"4*8(@t[0])"); x (@x[2],"4*10(%r14)"); st (@x[1],"4*9(@t[0])"); x (@x[3],"4*11(%r14)"); st (@x[2],"4*10(@t[0])"); st (@x[3],"4*11(@t[0])"); &{$z? \&clgr:\&clr} ("%r14",@t[1]); # done yet? la ("%r14","64(%r14)"); jl (".Loop_outer"); LABEL (".Ldone"); xgr ("%r0","%r0"); xgr ("%r1","%r1"); xgr ("%r2","%r2"); xgr ("%r3","%r3"); stmg ("%r0","%r3","$stdframe+4*4($sp)"); # wipe key copy stmg ("%r0","%r3","$stdframe+4*12($sp)"); &{$z? \&lmg:\&lm} ("%r6","%r15","$frame+6*$SIZE_T($sp)"); br ("%r14"); ALIGN (16); LABEL (".Ltail"); la (@t[1],"64($t[1])"); stm (@x[0],@x[7],"$stdframe+4*0($sp)"); &{$z? \&slgr:\&slr} (@t[1],"%r14"); lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)"); &{$z? \&lghi:\&lhi} (@x[6],0); stm (@x[12],@x[15],"$stdframe+4*12($sp)"); al (@x[0],"$stdframe+4*8($sp)"); al (@x[1],"$stdframe+4*9($sp)"); al (@x[2],"$stdframe+4*10($sp)"); al (@x[3],"$stdframe+4*11($sp)"); lrvr (@x[0],@x[0]); lrvr (@x[1],@x[1]); lrvr (@x[2],@x[2]); lrvr (@x[3],@x[3]); stm (@x[0],@x[3],"$stdframe+4*8($sp)"); LABEL (".Loop_tail"); llgc (@x[4],"0(@x[6],%r14)"); llgc (@x[5],"$stdframe(@x[6],$sp)"); xr (@x[5],@x[4]); stc (@x[5],"0(@x[6],@t[0])"); la (@x[6],"1(@x[6])"); brct (@t[1],".Loop_tail"); j (".Ldone"); SIZE ("_s390x_chacha_novx",".-_s390x_chacha_novx"); } } close STDOUT; ################ ALIGN (64); LABEL (".Lsigma"); LONG (0x61707865,0x3320646e,0x79622d32,0x6b206574); # endian-neutral sigma LONG (0x00000000,0x00000001,0x00000002,0x00000003); # vaf counter increment LONG (0x03020100,0x07060504,0x13121110,0x17161514); # vperm serialization LONG (0x0b0a0908,0x0f0e0d0c,0x1b1a1918,0x1f1e1d1c); # vperm serialization ASCIZ ("\"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>\""); ALIGN (4); PERLASM_END();
crypto/chacha/build.info +1 −0 Original line number Diff line number Diff line Loading @@ -9,6 +9,7 @@ GENERATE[chacha-armv4.S]=asm/chacha-armv4.pl $(PERLASM_SCHEME) INCLUDE[chacha-armv4.o]=.. GENERATE[chacha-armv8.S]=asm/chacha-armv8.pl $(PERLASM_SCHEME) INCLUDE[chacha-armv8.o]=.. INCLUDE[chacha-s390x.o]=.. BEGINRAW[Makefile(unix)] ##### CHACHA assembler implementations Loading