Loading crypto/aes/asm/aes-s390x.pl +764 −25 Original line number Diff line number Diff line Loading @@ -70,6 +70,18 @@ # remains z/Architecture specific. On z990 it was measured to perform # 2x better than code generated by gcc 4.3. # December 2010. # # Add support for z196 "cipher message with counter" instruction. # Note however that it's disengaged, because it was measured to # perform ~12% worse than vanilla km-based code... # February 2011. # # Add AES_xts_[en|de]crypt. This includes support for z196 # km-xts-aes instructions, which deliver ~70% improvement at 8KB # block size over vanilla km-based code. $flavour = shift; if ($flavour =~ /3[12]/) { Loading Loading @@ -268,7 +280,7 @@ $code.=<<___; .type _s390x_AES_encrypt,\@function .align 16 _s390x_AES_encrypt: st${g} $ra,`$stdframe-$SIZE_T`($sp) st${g} $ra,15*$SIZE_T($sp) x $s0,0($key) x $s1,4($key) x $s2,8($key) Loading Loading @@ -432,7 +444,7 @@ _s390x_AES_encrypt: or $s2,$i3 or $s3,$t3 l${g} $ra,`$stdframe-$SIZE_T`($sp) l${g} $ra,15*$SIZE_T($sp) xr $s0,$t0 xr $s1,$t2 x $s2,24($key) Loading Loading @@ -594,7 +606,7 @@ $code.=<<___; .type _s390x_AES_decrypt,\@function .align 16 _s390x_AES_decrypt: st${g} $ra,`$stdframe-$SIZE_T`($sp) st${g} $ra,15*$SIZE_T($sp) x $s0,0($key) x $s1,4($key) x $s2,8($key) Loading Loading @@ -738,7 +750,7 @@ _s390x_AES_decrypt: nr $i1,$mask nr $i2,$mask l${g} $ra,`$stdframe-$SIZE_T`($sp) l${g} $ra,15*$SIZE_T($sp) or $s1,$t1 l $t0,16($key) l $t1,20($key) Loading Loading @@ -1164,6 +1176,7 @@ $code.=<<___; .size AES_set_decrypt_key,.-AES_set_decrypt_key ___ ######################################################################## # void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, # size_t length, const AES_KEY *key, # unsigned char *ivec, const int enc) Loading Loading @@ -1365,13 +1378,14 @@ $code.=<<___; .size AES_cbc_encrypt,.-AES_cbc_encrypt ___ } ######################################################################## # void AES_ctr32_encrypt(const unsigned 
char *in, unsigned char *out, # size_t blocks, const AES_KEY *key, # const unsigned char *ivec) { my $inp="%r2"; my $out="%r3"; my $len="%r4"; my $out="%r4"; # blocks and out are swapped my $len="%r3"; my $key="%r5"; my $iv0="%r5"; my $ivp="%r6"; my $fp ="%r7"; Loading @@ -1381,6 +1395,9 @@ $code.=<<___; .type AES_ctr32_encrypt,\@function .align 16 AES_ctr32_encrypt: xgr %r3,%r4 # flip %r3 and %r4, $out and $len xgr %r4,%r3 xgr %r3,%r4 llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case ___ $code.=<<___ if (!$softonly); Loading Loading @@ -1415,20 +1432,75 @@ $code.=<<___ if (!$softonly); st${g} $fp,$SIZE_T($sp) slgr $len,$fp brc 1,.Lctr32_hw_loop # not zero, no borrow brc 1,.Lctr32_hw_switch # not zero, no borrow algr $fp,$len # input is shorter than allocated buffer lghi $len,0 st${g} $fp,$SIZE_T($sp) .Lctr32_hw_loop: .Lctr32_hw_switch: ___ $code.=<<___ if (0); ######### kmctr code was measured to be ~12% slower larl $s0,OPENSSL_s390xcap_P lg $s0,8($s0) tmhh $s0,0x0004 # check for message_security-assist-4 jz .Lctr32_km_loop llgfr $s0,%r0 lgr $s1,%r1 lghi %r0,0 la %r1,16($sp) .long 0xb92d2042 # kmctr %r4,%r2,%r2 llihh %r0,0x8000 # check if kmctr supports the function code srlg %r0,%r0,0($s0) ng %r0,16($sp) lgr %r0,$s0 lgr %r1,$s1 jz .Lctr32_km_loop ####### kmctr code algr $out,$inp # restore $out lgr $s1,$len # $s1 undertakes $len j .Lctr32_kmctr_loop .align 16 .Lctr32_kmctr_loop: la $s2,16($sp) lgr $s3,$fp .Lctr32_hw_prepare: .Lctr32_kmctr_prepare: stg $iv0,0($s2) stg $ivp,8($s2) la $s2,16($s2) ahi $ivp,1 # 32-bit increment, preserves upper half brct $s3,.Lctr32_hw_prepare brct $s3,.Lctr32_kmctr_prepare #la $inp,0($inp) # inp sllg $len,$fp,4 # len #la $out,0($out) # out la $s2,16($sp) # iv .long 0xb92da042 # kmctr $out,$s2,$inp brc 1,.-4 # pay attention to "partial completion" slgr $s1,$fp brc 1,.Lctr32_kmctr_loop # not zero, no borrow algr $fp,$s1 lghi $s1,0 brc 4+1,.Lctr32_kmctr_loop # not zero l${g} $sp,0($sp) lm${g} %r6,$s3,6*$SIZE_T($sp) br $ra 
.align 16 ___ $code.=<<___; .Lctr32_km_loop: la $s2,16($sp) lgr $s3,$fp .Lctr32_km_prepare: stg $iv0,0($s2) stg $ivp,8($s2) la $s2,16($s2) ahi $ivp,1 # 32-bit increment, preserves upper half brct $s3,.Lctr32_km_prepare la $s0,16($sp) # inp sllg $s1,$fp,4 # len Loading @@ -1439,7 +1511,7 @@ $code.=<<___ if (!$softonly); la $s2,16($sp) lgr $s3,$fp slgr $s2,$inp .Lctr32_hw_xor: .Lctr32_km_xor: lg $s0,0($inp) lg $s1,8($inp) xg $s0,0($s2,$inp) Loading @@ -1447,22 +1519,22 @@ $code.=<<___ if (!$softonly); stg $s0,0($out,$inp) stg $s1,8($out,$inp) la $inp,16($inp) brct $s3,.Lctr32_hw_xor brct $s3,.Lctr32_km_xor slgr $len,$fp brc 1,.Lctr32_hw_loop # not zero, no borrow brc 1,.Lctr32_km_loop # not zero, no borrow algr $fp,$len lghi $len,0 brc 4+1,.Lctr32_hw_loop # not zero brc 4+1,.Lctr32_km_loop # not zero l${g} $s0,0($sp) l${g} $s1,$SIZE_T($sp) la $s2,16($sp) .Lctr32_hw_zap: .Lctr32_km_zap: stg $s0,0($s2) stg $s0,8($s2) la $s2,16($s2) brct $s1,.Lctr32_hw_zap brct $s1,.Lctr32_km_zap la $sp,0($s0) lm${g} %r6,$s3,6*$SIZE_T($sp) Loading @@ -1472,12 +1544,12 @@ $code.=<<___ if (!$softonly); ___ $code.=<<___; stm${g} $key,$ra,5*$SIZE_T($sp) sl${g}r $out,$inp sl${g}r $inp,$out larl $tbl,AES_Te llgf $t1,12($ivp) .Lctr32_loop: stm${g} $inp,$len,2*$SIZE_T($sp) stm${g} $inp,$out,2*$SIZE_T($sp) llgf $s0,0($ivp) llgf $s1,4($ivp) llgf $s2,8($ivp) Loading @@ -1489,27 +1561,694 @@ $code.=<<___; lm${g} $inp,$ivp,2*$SIZE_T($sp) llgf $t1,16*$SIZE_T($sp) x $s0,0($inp) x $s0,0($inp,$out) x $s1,4($inp,$out) x $s2,8($inp,$out) x $s3,12($inp,$out) stm $s0,$s3,0($out) la $out,16($out) ahi $t1,1 # 32-bit increment brct $len,.Lctr32_loop lm${g} %r6,$ra,6*$SIZE_T($sp) br $ra .size AES_ctr32_encrypt,.-AES_ctr32_encrypt ___ } ######################################################################## # void AES_xts_encrypt(const char *inp,char *out,size_t len, # const AES_KEY *key1, const AES_KEY *key2,u64 secno); # { my $inp="%r2"; my $out="%r4"; # len and out are swapped my $len="%r3"; my $key1="%r5"; 
# $i1 my $key2="%r6"; # $i2 my $fp="%r7"; # $i3 my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame... $code.=<<___; .type _s390x_xts_km,\@function .align 16 _s390x_xts_km: ___ $code.=<<___ if(0); llgfr $s0,%r0 # put aside the function code lghi $s1,0x7f nr $s1,%r0 lghi %r0,0 # query capability vector la %r1,2*$SIZE_T($sp) .long 0xb92e0042 # km %r4,%r2 llihh %r1,0x8000 srlg %r1,%r1,32($s1) # check for 32+function code ng %r1,2*$SIZE_T($sp) lgr %r0,$s0 # restore the function code la %r1,0($key1) # restore $key1 jz .Lxts_km_vanilla lmg $i2,$i3,$tweak($sp) # put aside the tweak value algr $out,$inp oill %r0,32 # switch to xts function code aghi $s1,-18 # sllg $s1,$s1,3 # (function code - 18)*8, 0 or 16 la %r1,$tweak-16($sp) slgr %r1,$s1 # parameter block position lmg $s0,$s3,0($key1) # load 256 bits of key material, stmg $s0,$s3,0(%r1) # and copy it to parameter block. # yes, it contains junk and overlaps # with the tweak in 128-bit case. # it's done to avoid conditional # branch. 
stmg $i2,$i3,$tweak($sp) # "re-seat" the tweak value .long 0xb92e0042 # km %r4,%r2 brc 1,.-4 # pay attention to "partial completion" lrvg $s0,$tweak+0($sp) # load the last tweak lrvg $s1,$tweak+8($sp) stmg %r0,%r3,$tweak-32(%r1) # wipe copy of the key nill %r0,0xffdf # switch back to original function code la %r1,0($key1) # restore pointer to $key1 slgr $out,$inp llgc $len,2*$SIZE_T-1($sp) nill $len,0x0f # $len%=16 br $ra .align 16 .Lxts_km_vanilla: ___ $code.=<<___; # prepare and allocate stack frame at the top of 4K page # with 1K reserved for eventual signal handling lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer lghi $s1,-4096 algr $s0,$sp lgr $fp,$sp ngr $s0,$s1 # align at page boundary slgr $fp,$s0 # total buffer size lgr $s2,$sp lghi $s1,1024+16 # sl[g]fi is extended-immediate facility slgr $fp,$s1 # deduct reservation to get usable buffer size # buffer size is at lest 256 and at most 3072+256-16 la $sp,1024($s0) # alloca nill $fp,0xfff0 # round to 16*n st${g} $s2,0($sp) # back-chain nill $len,0xfff0 # redundant st${g} $fp,$SIZE_T($sp) slgr $len,$fp brc 1,.Lxts_km_go # not zero, no borrow algr $fp,$len # input is shorter than allocated buffer lghi $len,0 st${g} $fp,$SIZE_T($sp) .Lxts_km_go: lrvg $s0,$tweak+0($s2) # load the tweak value in little-endian lrvg $s1,$tweak+8($s2) la $s2,16($sp) # vector of ascending tweak values slgr $s2,$inp srlg $s3,$fp,4 j .Lxts_km_start .Lxts_km_loop: la $s2,16($sp) slgr $s2,$inp srlg $s3,$fp,4 .Lxts_km_prepare: lghi $i1,0x87 srag $i2,$s1,63 # broadcast upper bit ngr $i1,$i2 # rem srlg $i2,$s0,63 # carry bit from lower half sllg $s0,$s0,1 sllg $s1,$s1,1 xgr $s0,$i1 ogr $s1,$i2 .Lxts_km_start: lrvgr $i1,$s0 # flip byte order lrvgr $i2,$s1 stg $i1,0($s2,$inp) stg $i2,8($s2,$inp) xg $i1,0($inp) xg $i2,8($inp) stg $i1,0($out,$inp) stg $i2,8($out,$inp) la $inp,16($inp) brct $s3,.Lxts_km_prepare slgr $inp,$fp # rewind $inp la $s2,0($out,$inp) lgr $s3,$fp .long 0xb92e00aa # km $s2,$s2 brc 1,.-4 # pay attention to 
"partial completion" la $s2,16($sp) slgr $s2,$inp srlg $s3,$fp,4 .Lxts_km_xor: lg $i1,0($out,$inp) lg $i2,8($out,$inp) xg $i1,0($s2,$inp) xg $i2,8($s2,$inp) stg $i1,0($out,$inp) stg $i2,8($out,$inp) la $inp,16($inp) brct $s3,.Lxts_km_xor slgr $len,$fp brc 1,.Lxts_km_loop # not zero, no borrow algr $fp,$len lghi $len,0 brc 4+1,.Lxts_km_loop # not zero l${g} $i1,0($sp) # back-chain llgf $fp,`2*$SIZE_T-4`($sp) # bytes used la $i2,16($sp) srlg $fp,$fp,4 .Lxts_km_zap: stg $i1,0($i2) stg $i1,8($i2) la $i2,16($i2) brct $fp,.Lxts_km_zap la $sp,0($i1) llgc $len,2*$SIZE_T-1($i1) nill $len,0x0f # $len%=16 bzr $ra # generate one more tweak... lghi $i1,0x87 srag $i2,$s1,63 # broadcast upper bit ngr $i1,$i2 # rem srlg $i2,$s0,63 # carry bit from lower half sllg $s0,$s0,1 sllg $s1,$s1,1 xgr $s0,$i1 ogr $s1,$i2 ltr $len,$len # clear zero flag br $ra .size _s390x_xts_km,.-_s390x_xts_km .globl AES_xts_encrypt .type AES_xts_encrypt,\@function .align 16 AES_xts_encrypt: xgr %r3,%r4 # flip %r3 and %r4, $out and $len xgr %r4,%r3 xgr %r3,%r4 ___ $code.=<<___ if ($SIZE_T==4); llgfr $len,$len ___ $code.=<<___; st${g} $len,1*$SIZE_T($sp) # save copy of $len srag $len,$len,4 # formally wrong, because it expands # sign byte, but who can afford asking # to process more than 2^63-1 bytes? # I use it, because it sets condition # code... bcr 8,$ra # abort if zero (i.e. less than 16) ___ $code.=<<___ if (!$softonly); llgf %r0,240($key2) lhi %r1,16 clr %r0,%r1 jl .Lxts_enc_software stm${g} %r6,$s3,6*$SIZE_T($sp) st${g} $ra,14*$SIZE_T($sp) sllg $len,$len,4 # $len&=~15 slgr $out,$inp lrvg $s0,$stdframe($sp) # load secno lghi $s1,0 la $s2,$tweak($sp) lghi $s3,16 stmg $s0,$s1,0($s2) la %r1,0($key2) # $key2 is not needed anymore .long 0xb92e00aa # km $s2,$s2, generate the tweak brc 1,.-4 # can this happen? 
l %r0,240($key1) la %r1,0($key1) # $key1 is not needed anymore bras $ra,_s390x_xts_km jz .Lxts_enc_km_done aghi $inp,-16 # take one step back la $i3,0($out,$inp) # put aside real $out .Lxts_enc_km_steal: llgc $i1,16($inp) llgc $i2,0($out,$inp) stc $i1,0($out,$inp) stc $i2,16($out,$inp) la $inp,1($inp) brct $len,.Lxts_enc_km_steal la $s2,0($i3) lghi $s3,16 lrvgr $i1,$s0 # flip byte order lrvgr $i2,$s1 xg $i1,0($s2) xg $i2,8($s2) stg $i1,0($s2) stg $i2,8($s2) .long 0xb92e00aa # km $s2,$s2 brc 1,.-4 # can this happen? lrvgr $i1,$s0 # flip byte order lrvgr $i2,$s1 xg $i1,0($i3) xg $i2,8($i3) stg $i1,0($i3) stg $i2,8($i3) .Lxts_enc_km_done: l${g} $ra,14*$SIZE_T($sp) stg $sp,$tweak+0($sp) # wipe tweak stg $sp,$tweak+8($sp) lm${g} %r6,$s3,6*$SIZE_T($sp) br $ra .align 16 .Lxts_enc_software: ___ $code.=<<___; stm${g} %r6,$ra,6*$SIZE_T($sp) slgr $out,$inp xgr $s0,$s0 # clear upper half xgr $s1,$s1 lrv $s0,$stdframe+4($sp) # load secno lrv $s1,$stdframe+0($sp) xgr $s2,$s2 xgr $s3,$s3 stm${g} %r2,%r5,2*$SIZE_T($sp) la $key,0($key2) larl $tbl,AES_Te bras $ra,_s390x_AES_encrypt # generate the tweak lm${g} %r2,%r5,2*$SIZE_T($sp) stm $s0,$s3,$tweak($sp) # save the tweak j .Lxts_enc_enter .align 16 .Lxts_enc_loop: lrvg $s1,$tweak+0($sp) # load the tweak in little-endian lrvg $s3,$tweak+8($sp) lghi %r1,0x87 srag %r0,$s3,63 # broadcast upper bit ngr %r1,%r0 # rem srlg %r0,$s1,63 # carry bit from lower half sllg $s1,$s1,1 sllg $s3,$s3,1 xgr $s1,%r1 ogr $s3,%r0 lrvgr $s1,$s1 # flip byte order lrvgr $s3,$s3 srlg $s0,$s1,32 # smash the tweak to 4x32-bits stg $s1,$tweak+0($sp) # save the tweak llgfr $s1,$s1 srlg $s2,$s3,32 stg $s3,$tweak+8($sp) llgfr $s3,$s3 la $inp,16($inp) # $inp+=16 .Lxts_enc_enter: x $s0,0($inp) # ^=*($inp) x $s1,4($inp) x $s2,8($inp) x $s3,12($inp) stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing la $key,0($key1) bras $ra,_s390x_AES_encrypt lm${g} %r2,%r5,2*$SIZE_T($sp) x $s0,$tweak+0($sp) # ^=tweak x $s1,$tweak+4($sp) x $s2,$tweak+8($sp) x
$s3,$tweak+12($sp) st $s0,0($out,$inp) st $s1,4($out,$inp) st $s2,8($out,$inp) st $s3,12($out,$inp) brct${g} $len,.Lxts_enc_loop llgc $len,`2*$SIZE_T-1`($sp) nill $len,0x0f # $len%16 jz .Lxts_enc_done la $i3,0($inp,$out) # put aside real $out .Lxts_enc_steal: llgc %r0,16($inp) llgc %r1,0($out,$inp) stc %r0,0($out,$inp) stc %r1,16($out,$inp) la $inp,1($inp) brct $len,.Lxts_enc_steal la $out,0($i3) # restore real $out # generate last tweak... lrvg $s1,$tweak+0($sp) # load the tweak in little-endian lrvg $s3,$tweak+8($sp) lghi %r1,0x87 srag %r0,$s3,63 # broadcast upper bit ngr %r1,%r0 # rem srlg %r0,$s1,63 # carry bit from lower half sllg $s1,$s1,1 sllg $s3,$s3,1 xgr $s1,%r1 ogr $s3,%r0 lrvgr $s1,$s1 # flip byte order lrvgr $s3,$s3 srlg $s0,$s1,32 # smash the tweak to 4x32-bits stg $s1,$tweak+0($sp) # save the tweak llgfr $s1,$s1 srlg $s2,$s3,32 stg $s3,$tweak+8($sp) llgfr $s3,$s3 x $s0,0($out) # ^=*(inp)|stolen cipher-text x $s1,4($out) x $s2,8($out) x $s3,12($out) st${g} $out,4*$SIZE_T($sp) la $key,0($key1) bras $ra,_s390x_AES_encrypt l${g} $out,4*$SIZE_T($sp) x $s0,`$tweak+0`($sp) # ^=tweak x $s1,`$tweak+4`($sp) x $s2,`$tweak+8`($sp) x $s3,`$tweak+12`($sp) st $s0,0($out) st $s1,4($out) st $s2,8($out) st $s3,12($out) .Lxts_enc_done: stg $sp,$tweak+0($sp) # wipe tweak stg $sp,$tweak+8($sp) lm${g} %r6,$ra,6*$SIZE_T($sp) br $ra .size AES_xts_encrypt,.-AES_xts_encrypt ___ # void AES_xts_decrypt(const char *inp,char *out,size_t len, # const AES_KEY *key1, const AES_KEY *key2,u64 secno); # $code.=<<___; .globl AES_xts_decrypt .type AES_xts_decrypt,\@function .align 16 AES_xts_decrypt: xgr %r3,%r4 # flip %r3 and %r4, $out and $len xgr %r4,%r3 xgr %r3,%r4 ___ $code.=<<___ if ($SIZE_T==4); llgfr $len,$len ___ $code.=<<___; st${g} $len,1*$SIZE_T($sp) # save copy of $len aghi $len,-16 bcr 4,$ra # abort if less than zero. formally # wrong, because $len is unsigned, # but who can afford asking to # process more than 2^63-1 bytes?
tmll $len,0x0f jnz .Lxts_dec_proceed aghi $len,16 .Lxts_dec_proceed: ___ $code.=<<___ if (!$softonly); llgf %r0,240($key2) lhi %r1,16 clr %r0,%r1 jl .Lxts_dec_software stm${g} %r6,$s3,6*$SIZE_T($sp) st${g} $ra,14*$SIZE_T($sp) nill $len,0xfff0 # $len&=~15 slgr $out,$inp # generate the tweak value lrvg $s0,$stdframe($sp) # load secno lghi $s1,0 la $s2,$tweak($sp) lghi $s3,16 stg $s0,0($s2) stg $s1,8($s2) la %r1,0($key2) # $key2 is not needed past this point .long 0xb92e00aa # km $s2,$s2, generate the tweak brc 1,.-4 # can this happen? l %r0,240($key1) la %r1,0($key1) # $key1 is not needed anymore ltgr $len,$len jz .Lxts_dec_km_short bras $ra,_s390x_xts_km jz .Lxts_dec_km_done lrvgr $s2,$s0 # make copy in reverse byte order lrvgr $s3,$s1 j .Lxts_dec_km_2ndtweak .Lxts_dec_km_short: llgc $len,`2*$SIZE_T-1`($sp) nill $len,0x0f # $len%=16 lrvg $s0,$tweak+0($sp) # load the tweak lrvg $s1,$tweak+8($sp) lrvgr $s2,$s0 # make copy in reverse byte order lrvgr $s3,$s1 .Lxts_dec_km_2ndtweak: lghi $i1,0x87 srag $i2,$s1,63 # broadcast upper bit ngr $i1,$i2 # rem srlg $i2,$s0,63 # carry bit from lower half sllg $s0,$s0,1 sllg $s1,$s1,1 xgr $s0,$i1 ogr $s1,$i2 lrvgr $i1,$s0 # flip byte order lrvgr $i2,$s1 xg $i1,0($inp) xg $i2,8($inp) stg $i1,0($out,$inp) stg $i2,8($out,$inp) la $i2,0($out,$inp) lghi $i3,16 .long 0xb92e0066 # km $i2,$i2 brc 1,.-4 # can this happen? lrvgr $i1,$s0 lrvgr $i2,$s1 xg $i1,0($out,$inp) xg $i2,8($out,$inp) stg $i1,0($out,$inp) stg $i2,8($out,$inp) la $i3,0($out,$inp) # put aside real $out .Lxts_dec_km_steal: llgc $i1,16($inp) llgc $i2,0($out,$inp) stc $i1,0($out,$inp) stc $i2,16($out,$inp) la $inp,1($inp) brct $len,.Lxts_dec_km_steal lgr $s0,$s2 lgr $s1,$s3 xg $s0,0($i3) xg $s1,8($i3) stg $s0,0($i3) stg $s1,8($i3) la $s0,0($i3) lghi $s1,16 .long 0xb92e0088 # km $s0,$s0 brc 1,.-4 # can this happen? 
xg $s2,0($i3) xg $s3,8($i3) stg $s2,0($i3) stg $s3,8($i3) .Lxts_dec_km_done: l${g} $ra,14*$SIZE_T($sp) stg $sp,$tweak+0($sp) # wipe tweak stg $sp,$tweak+8($sp) lm${g} %r6,$s3,6*$SIZE_T($sp) br $ra .align 16 .Lxts_dec_software: ___ $code.=<<___; stm${g} %r6,$ra,6*$SIZE_T($sp) srlg $len,$len,4 slgr $out,$inp xgr $s0,$s0 # clear upper half xgr $s1,$s1 lrv $s0,$stdframe+4($sp) # load secno lrv $s1,$stdframe+0($sp) xgr $s2,$s2 xgr $s3,$s3 stm${g} %r2,%r5,2*$SIZE_T($sp) la $key,0($key2) larl $tbl,AES_Te bras $ra,_s390x_AES_encrypt # generate the tweak lm${g} %r2,%r5,2*$SIZE_T($sp) larl $tbl,AES_Td lt${g}r $len,$len stm $s0,$s3,$tweak($sp) # save the tweak jz .Lxts_dec_short j .Lxts_dec_enter .align 16 .Lxts_dec_loop: lrvg $s1,$tweak+0($sp) # load the tweak in little-endian lrvg $s3,$tweak+8($sp) lghi %r1,0x87 srag %r0,$s3,63 # broadcast upper bit ngr %r1,%r0 # rem srlg %r0,$s1,63 # carry bit from lower half sllg $s1,$s1,1 sllg $s3,$s3,1 xgr $s1,%r1 ogr $s3,%r0 lrvgr $s1,$s1 # flip byte order lrvgr $s3,$s3 srlg $s0,$s1,32 # smash the tweak to 4x32-bits stg $s1,$tweak+0($sp) # save the tweak llgfr $s1,$s1 srlg $s2,$s3,32 stg $s3,$tweak+8($sp) llgfr $s3,$s3 .Lxts_dec_enter: x $s0,0($inp) # tweak^=*(inp) x $s1,4($inp) x $s2,8($inp) x $s3,12($inp) stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing la $key,0($key1) bras $ra,_s390x_AES_decrypt lm${g} %r2,%r5,2*$SIZE_T($sp) x $s0,$tweak+0($sp) # ^=tweak x $s1,$tweak+4($sp) x $s2,$tweak+8($sp) x $s3,$tweak+12($sp) st $s0,0($out,$inp) st $s1,4($out,$inp) st $s2,8($out,$inp) st $s3,12($out,$inp) la $inp,16($inp) ahi $t1,1 # 32-bit increment brct $len,.Lctr32_loop brct${g} $len,.Lxts_dec_loop llgc $len,`2*$SIZE_T-1`($sp) nill $len,0x0f # $len%16 jz .Lxts_dec_done # generate pair of tweaks...
lrvg $s1,$tweak+0($sp) # load the tweak in little-endian lrvg $s3,$tweak+8($sp) lghi %r1,0x87 srag %r0,$s3,63 # broadcast upper bit ngr %r1,%r0 # rem srlg %r0,$s1,63 # carry bit from lower half sllg $s1,$s1,1 sllg $s3,$s3,1 xgr $s1,%r1 ogr $s3,%r0 lrvgr $i2,$s1 # flip byte order lrvgr $i3,$s3 stmg $i2,$i3,$tweak($sp) # save the 1st tweak j .Lxts_dec_2ndtweak .align 16 .Lxts_dec_short: llgc $len,`2*$SIZE_T-1`($sp) nill $len,0x0f # $len%16 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian lrvg $s3,$tweak+8($sp) .Lxts_dec_2ndtweak: lghi %r1,0x87 srag %r0,$s3,63 # broadcast upper bit ngr %r1,%r0 # rem srlg %r0,$s1,63 # carry bit from lower half sllg $s1,$s1,1 sllg $s3,$s3,1 xgr $s1,%r1 ogr $s3,%r0 lrvgr $s1,$s1 # flip byte order lrvgr $s3,$s3 srlg $s0,$s1,32 # smash the tweak to 4x32-bits stg $s1,$tweak-16+0($sp) # save the 2nd tweak llgfr $s1,$s1 srlg $s2,$s3,32 stg $s3,$tweak-16+8($sp) llgfr $s3,$s3 x $s0,0($inp) # tweak_the_2nd^=*(inp) x $s1,4($inp) x $s2,8($inp) x $s3,12($inp) stm${g} %r2,%r3,2*$SIZE_T($sp) la $key,0($key1) bras $ra,_s390x_AES_decrypt lm${g} %r2,%r5,2*$SIZE_T($sp) x $s0,$tweak-16+0($sp) # ^=tweak_the_2nd x $s1,$tweak-16+4($sp) x $s2,$tweak-16+8($sp) x $s3,$tweak-16+12($sp) st $s0,0($out,$inp) st $s1,4($out,$inp) st $s2,8($out,$inp) st $s3,12($out,$inp) la $i3,0($out,$inp) # put aside real $out .Lxts_dec_steal: llgc %r0,16($inp) llgc %r1,0($out,$inp) stc %r0,0($out,$inp) stc %r1,16($out,$inp) la $inp,1($inp) brct $len,.Lxts_dec_steal la $out,0($i3) # restore real $out lm $s0,$s3,$tweak($sp) # load the 1st tweak x $s0,0($out) # tweak^=*(inp)|stolen cipher-text x $s1,4($out) x $s2,8($out) x $s3,12($out) st${g} $out,4*$SIZE_T($sp) la $key,0($key1) bras $ra,_s390x_AES_decrypt l${g} $out,4*$SIZE_T($sp) x $s0,$tweak+0($sp) # ^=tweak x $s1,$tweak+4($sp) x $s2,$tweak+8($sp) x $s3,$tweak+12($sp) st $s0,0($out) st $s1,4($out) st $s2,8($out) st $s3,12($out) stg $sp,$tweak-16+0($sp) # wipe 2nd tweak stg $sp,$tweak-16+8($sp) .Lxts_dec_done: stg 
$sp,$tweak+0($sp) # wipe tweak stg $sp,$tweak+8($sp) lm${g} %r6,$ra,6*$SIZE_T($sp) br $ra .size AES_ctr32_encrypt,.-AES_ctr32_encrypt .size AES_xts_decrypt,.-AES_xts_decrypt ___ } $code.=<<___; .comm OPENSSL_s390xcap_P,16,8 .string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>" .comm OPENSSL_s390xcap_P,16,8 ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; Loading crypto/bn/asm/s390x-mont.pl +4 −4 Original line number Diff line number Diff line Loading @@ -41,8 +41,8 @@ # processor, as long as it's "z-CPU". Latter implies that the code # remains z/Architecture specific. Compatibility with 32-bit BN_ULONG # is achieved by swapping words after 64-bit loads, follow _dswap-s. # On z990 it was measured to perform 2.6-2.2 times better, less for # longer keys... # On z990 it was measured to perform 2.6-2.2 times better than # compiler-generated code, less for longer keys... $flavour = shift; Loading Loading @@ -102,8 +102,8 @@ $code.=<<___ if ($flavour =~ /3[12]/); bnzr %r14 # if ($num&1) return 0; ___ $code.=<<___ if ($flavour !~ /3[12]/); cghi $num,128 # bhr %r14 # if($num>128) return 0; cghi $num,96 # bhr %r14 # if($num>96) return 0; ___ $code.=<<___; stm${g} %r3,%r15,3*$SIZE_T($sp) Loading crypto/modes/asm/ghash-s390x.pl +11 −2 Original line number Diff line number Diff line Loading @@ -28,6 +28,15 @@ # remains z/Architecture specific. On z990 it was measured to perform # 2.8x better than 32-bit code generated by gcc 4.3. # March 2011. # # Support for hardware KIMD-GHASH is verified to produce correct # result and therefore is engaged. On z196 it was measured to process # 8KB buffer ~7 faster than software implementation. It's not as # impressive for smaller buffer sizes and for smallest 16-bytes buffer # it's actually almost 2 times slower. Which is the reason why # KIMD-GHASH is not used in gcm_gmult_4bit.
$flavour = shift; if ($flavour =~ /3[12]/) { Loading @@ -41,7 +50,7 @@ if ($flavour =~ /3[12]/) { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; $softonly=1; # disable hardware support for now $softonly=0; $Zhi="%r0"; $Zlo="%r1"; Loading Loading @@ -70,7 +79,7 @@ $code.=<<___; .align 32 gcm_gmult_4bit: ___ $code.=<<___ if(!$softonly); $code.=<<___ if(!$softonly && 0); # hardware is slow for single block... larl %r1,OPENSSL_s390xcap_P lg %r0,0(%r1) tmhl %r0,0x4000 # check for message-security-assist Loading Loading
crypto/aes/asm/aes-s390x.pl +764 −25 Original line number Diff line number Diff line Loading @@ -70,6 +70,18 @@ # remains z/Architecture specific. On z990 it was measured to perform # 2x better than code generated by gcc 4.3. # December 2010. # # Add support for z196 "cipher message with counter" instruction. # Note however that it's disengaged, because it was measured to # perform ~12% worse than vanilla km-based code... # February 2011. # # Add AES_xts_[en|de]crypt. This includes support for z196 # km-xts-aes instructions, which deliver ~70% improvement at 8KB # block size over vanilla km-based code. $flavour = shift; if ($flavour =~ /3[12]/) { Loading Loading @@ -268,7 +280,7 @@ $code.=<<___; .type _s390x_AES_encrypt,\@function .align 16 _s390x_AES_encrypt: st${g} $ra,`$stdframe-$SIZE_T`($sp) st${g} $ra,15*$SIZE_T($sp) x $s0,0($key) x $s1,4($key) x $s2,8($key) Loading Loading @@ -432,7 +444,7 @@ _s390x_AES_encrypt: or $s2,$i3 or $s3,$t3 l${g} $ra,`$stdframe-$SIZE_T`($sp) l${g} $ra,15*$SIZE_T($sp) xr $s0,$t0 xr $s1,$t2 x $s2,24($key) Loading Loading @@ -594,7 +606,7 @@ $code.=<<___; .type _s390x_AES_decrypt,\@function .align 16 _s390x_AES_decrypt: st${g} $ra,`$stdframe-$SIZE_T`($sp) st${g} $ra,15*$SIZE_T($sp) x $s0,0($key) x $s1,4($key) x $s2,8($key) Loading Loading @@ -738,7 +750,7 @@ _s390x_AES_decrypt: nr $i1,$mask nr $i2,$mask l${g} $ra,`$stdframe-$SIZE_T`($sp) l${g} $ra,15*$SIZE_T($sp) or $s1,$t1 l $t0,16($key) l $t1,20($key) Loading Loading @@ -1164,6 +1176,7 @@ $code.=<<___; .size AES_set_decrypt_key,.-AES_set_decrypt_key ___ ######################################################################## # void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, # size_t length, const AES_KEY *key, # unsigned char *ivec, const int enc) Loading Loading @@ -1365,13 +1378,14 @@ $code.=<<___; .size AES_cbc_encrypt,.-AES_cbc_encrypt ___ } ######################################################################## # void AES_ctr32_encrypt(const unsigned char *in, 
unsigned char *out, # size_t blocks, const AES_KEY *key, # const unsigned char *ivec) { my $inp="%r2"; my $out="%r3"; my $len="%r4"; my $out="%r4"; # blocks and out are swapped my $len="%r3"; my $key="%r5"; my $iv0="%r5"; my $ivp="%r6"; my $fp ="%r7"; Loading @@ -1381,6 +1395,9 @@ $code.=<<___; .type AES_ctr32_encrypt,\@function .align 16 AES_ctr32_encrypt: xgr %r3,%r4 # flip %r3 and %r4, $out and $len xgr %r4,%r3 xgr %r3,%r4 llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case ___ $code.=<<___ if (!$softonly); Loading Loading @@ -1415,20 +1432,75 @@ $code.=<<___ if (!$softonly); st${g} $fp,$SIZE_T($sp) slgr $len,$fp brc 1,.Lctr32_hw_loop # not zero, no borrow brc 1,.Lctr32_hw_switch # not zero, no borrow algr $fp,$len # input is shorter than allocated buffer lghi $len,0 st${g} $fp,$SIZE_T($sp) .Lctr32_hw_loop: .Lctr32_hw_switch: ___ $code.=<<___ if (0); ######### kmctr code was measured to be ~12% slower larl $s0,OPENSSL_s390xcap_P lg $s0,8($s0) tmhh $s0,0x0004 # check for message_security-assist-4 jz .Lctr32_km_loop llgfr $s0,%r0 lgr $s1,%r1 lghi %r0,0 la %r1,16($sp) .long 0xb92d2042 # kmctr %r4,%r2,%r2 llihh %r0,0x8000 # check if kmctr supports the function code srlg %r0,%r0,0($s0) ng %r0,16($sp) lgr %r0,$s0 lgr %r1,$s1 jz .Lctr32_km_loop ####### kmctr code algr $out,$inp # restore $out lgr $s1,$len # $s1 undertakes $len j .Lctr32_kmctr_loop .align 16 .Lctr32_kmctr_loop: la $s2,16($sp) lgr $s3,$fp .Lctr32_hw_prepare: .Lctr32_kmctr_prepare: stg $iv0,0($s2) stg $ivp,8($s2) la $s2,16($s2) ahi $ivp,1 # 32-bit increment, preserves upper half brct $s3,.Lctr32_hw_prepare brct $s3,.Lctr32_kmctr_prepare #la $inp,0($inp) # inp sllg $len,$fp,4 # len #la $out,0($out) # out la $s2,16($sp) # iv .long 0xb92da042 # kmctr $out,$s2,$inp brc 1,.-4 # pay attention to "partial completion" slgr $s1,$fp brc 1,.Lctr32_kmctr_loop # not zero, no borrow algr $fp,$s1 lghi $s1,0 brc 4+1,.Lctr32_kmctr_loop # not zero l${g} $sp,0($sp) lm${g} %r6,$s3,6*$SIZE_T($sp) br $ra .align 16 
___ $code.=<<___; .Lctr32_km_loop: la $s2,16($sp) lgr $s3,$fp .Lctr32_km_prepare: stg $iv0,0($s2) stg $ivp,8($s2) la $s2,16($s2) ahi $ivp,1 # 32-bit increment, preserves upper half brct $s3,.Lctr32_km_prepare la $s0,16($sp) # inp sllg $s1,$fp,4 # len Loading @@ -1439,7 +1511,7 @@ $code.=<<___ if (!$softonly); la $s2,16($sp) lgr $s3,$fp slgr $s2,$inp .Lctr32_hw_xor: .Lctr32_km_xor: lg $s0,0($inp) lg $s1,8($inp) xg $s0,0($s2,$inp) Loading @@ -1447,22 +1519,22 @@ $code.=<<___ if (!$softonly); stg $s0,0($out,$inp) stg $s1,8($out,$inp) la $inp,16($inp) brct $s3,.Lctr32_hw_xor brct $s3,.Lctr32_km_xor slgr $len,$fp brc 1,.Lctr32_hw_loop # not zero, no borrow brc 1,.Lctr32_km_loop # not zero, no borrow algr $fp,$len lghi $len,0 brc 4+1,.Lctr32_hw_loop # not zero brc 4+1,.Lctr32_km_loop # not zero l${g} $s0,0($sp) l${g} $s1,$SIZE_T($sp) la $s2,16($sp) .Lctr32_hw_zap: .Lctr32_km_zap: stg $s0,0($s2) stg $s0,8($s2) la $s2,16($s2) brct $s1,.Lctr32_hw_zap brct $s1,.Lctr32_km_zap la $sp,0($s0) lm${g} %r6,$s3,6*$SIZE_T($sp) Loading @@ -1472,12 +1544,12 @@ $code.=<<___ if (!$softonly); ___ $code.=<<___; stm${g} $key,$ra,5*$SIZE_T($sp) sl${g}r $out,$inp sl${g}r $inp,$out larl $tbl,AES_Te llgf $t1,12($ivp) .Lctr32_loop: stm${g} $inp,$len,2*$SIZE_T($sp) stm${g} $inp,$out,2*$SIZE_T($sp) llgf $s0,0($ivp) llgf $s1,4($ivp) llgf $s2,8($ivp) Loading @@ -1489,27 +1561,694 @@ $code.=<<___; lm${g} $inp,$ivp,2*$SIZE_T($sp) llgf $t1,16*$SIZE_T($sp) x $s0,0($inp) x $s0,0($inp,$out) x $s1,4($inp,$out) x $s2,8($inp,$out) x $s3,12($inp,$out) stm $s0,$s3,0($out) la $out,16($out) ahi $t1,1 # 32-bit increment brct $len,.Lctr32_loop lm${g} %r6,$ra,6*$SIZE_T($sp) br $ra .size AES_ctr32_encrypt,.-AES_ctr32_encrypt ___ } ######################################################################## # void AES_xts_encrypt(const char *inp,char *out,size_t len, # const AES_KEY *key1, const AES_KEY *key2,u64 secno); # { my $inp="%r2"; my $out="%r4"; # len and out are swapped my $len="%r3"; my $key1="%r5"; # $i1 my 
$key2="%r6"; # $i2 my $fp="%r7"; # $i3 my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame... $code.=<<___; .type _s390x_xts_km,\@function .align 16 _s390x_xts_km: ___ $code.=<<___ if(0); llgfr $s0,%r0 # put aside the function code lghi $s1,0x7f nr $s1,%r0 lghi %r0,0 # query capability vector la %r1,2*$SIZE_T($sp) .long 0xb92e0042 # km %r4,%r2 llihh %r1,0x8000 srlg %r1,%r1,32($s1) # check for 32+function code ng %r1,2*$SIZE_T($sp) lgr %r0,$s0 # restore the function code la %r1,0($key1) # restore $key1 jz .Lxts_km_vanilla lmg $i2,$i3,$tweak($sp) # put aside the tweak value algr $out,$inp oill %r0,32 # switch to xts function code aghi $s1,-18 # sllg $s1,$s1,3 # (function code - 18)*8, 0 or 16 la %r1,$tweak-16($sp) slgr %r1,$s1 # parameter block position lmg $s0,$s3,0($key1) # load 256 bits of key material, stmg $s0,$s3,0(%r1) # and copy it to parameter block. # yes, it contains junk and overlaps # with the tweak in 128-bit case. # it's done to avoid conditional # branch. stmg $i2,$i3,$tweak($sp) # "re-seat" the tweak value .long 0xb92e0042 # km %r4,%r2 brc 1,.-4 # pay attention to "partial completion" lrvg $s0,$tweak+0($sp) # load the last tweak lrvg $s1,$tweak+8($sp) stmg %r0,%r3,$tweak-32(%r1) # wipe copy of the key nill %r0,0xffdf # switch back to original function code la %r1,0($key1) # restore pointer to $key1 slgr $out,$inp llgc $len,2*$SIZE_T-1($sp) nill $len,0x0f # $len%=16 br $ra .align 16 .Lxts_km_vanilla: ___ $code.=<<___; # prepare and allocate stack frame at the top of 4K page # with 1K reserved for eventual signal handling lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer lghi $s1,-4096 algr $s0,$sp lgr $fp,$sp ngr $s0,$s1 # align at page boundary slgr $fp,$s0 # total buffer size lgr $s2,$sp lghi $s1,1024+16 # sl[g]fi is extended-immediate facility slgr $fp,$s1 # deduct reservation to get usable buffer size # buffer size is at lest 256 and at most 3072+256-16 la $sp,1024($s0) # alloca nill $fp,0xfff0 # round to 16*n st${g} 
$s2,0($sp) # back-chain nill $len,0xfff0 # redundant st${g} $fp,$SIZE_T($sp) slgr $len,$fp brc 1,.Lxts_km_go # not zero, no borrow algr $fp,$len # input is shorter than allocated buffer lghi $len,0 st${g} $fp,$SIZE_T($sp) .Lxts_km_go: lrvg $s0,$tweak+0($s2) # load the tweak value in little-endian lrvg $s1,$tweak+8($s2) la $s2,16($sp) # vector of ascending tweak values slgr $s2,$inp srlg $s3,$fp,4 j .Lxts_km_start .Lxts_km_loop: la $s2,16($sp) slgr $s2,$inp srlg $s3,$fp,4 .Lxts_km_prepare: lghi $i1,0x87 srag $i2,$s1,63 # broadcast upper bit ngr $i1,$i2 # rem srlg $i2,$s0,63 # carry bit from lower half sllg $s0,$s0,1 sllg $s1,$s1,1 xgr $s0,$i1 ogr $s1,$i2 .Lxts_km_start: lrvgr $i1,$s0 # flip byte order lrvgr $i2,$s1 stg $i1,0($s2,$inp) stg $i2,8($s2,$inp) xg $i1,0($inp) xg $i2,8($inp) stg $i1,0($out,$inp) stg $i2,8($out,$inp) la $inp,16($inp) brct $s3,.Lxts_km_prepare slgr $inp,$fp # rewind $inp la $s2,0($out,$inp) lgr $s3,$fp .long 0xb92e00aa # km $s2,$s2 brc 1,.-4 # pay attention to "partial completion" la $s2,16($sp) slgr $s2,$inp srlg $s3,$fp,4 .Lxts_km_xor: lg $i1,0($out,$inp) lg $i2,8($out,$inp) xg $i1,0($s2,$inp) xg $i2,8($s2,$inp) stg $i1,0($out,$inp) stg $i2,8($out,$inp) la $inp,16($inp) brct $s3,.Lxts_km_xor slgr $len,$fp brc 1,.Lxts_km_loop # not zero, no borrow algr $fp,$len lghi $len,0 brc 4+1,.Lxts_km_loop # not zero l${g} $i1,0($sp) # back-chain llgf $fp,`2*$SIZE_T-4`($sp) # bytes used la $i2,16($sp) srlg $fp,$fp,4 .Lxts_km_zap: stg $i1,0($i2) stg $i1,8($i2) la $i2,16($i2) brct $fp,.Lxts_km_zap la $sp,0($i1) llgc $len,2*$SIZE_T-1($i1) nill $len,0x0f # $len%=16 bzr $ra # generate one more tweak... 
lghi $i1,0x87 srag $i2,$s1,63 # broadcast upper bit ngr $i1,$i2 # rem srlg $i2,$s0,63 # carry bit from lower half sllg $s0,$s0,1 sllg $s1,$s1,1 xgr $s0,$i1 ogr $s1,$i2 ltr $len,$len # clear zero flag br $ra .size _s390x_xts_km,.-_s390x_xts_km .globl AES_xts_encrypt .type AES_xts_encrypt,\@function .align 16 AES_xts_encrypt: xgr %r3,%r4 # flip %r3 and %r4, $out and $len xgr %r4,%r3 xgr %r3,%r4 ___ $code.=<<___ if ($SIZE_T==4); llgfr $len,$len ___ $code.=<<___; st${g} $len,1*$SIZE_T($sp) # save copy of $len srag $len,$len,4 # formally wrong, because it expands # sign byte, but who can afford asking # to process more than 2^63-1 bytes? # I use it, because it sets condition # code... bcr 8,$ra # abort if zero (i.e. less than 16) ___ $code.=<<___ if (!$softonly); llgf %r0,240($key2) lhi %r1,16 clr %r0,%r1 jl .Lxts_enc_software stm${g} %r6,$s3,6*$SIZE_T($sp) st${g} $ra,14*$SIZE_T($sp) sllg $len,$len,4 # $len&=~15 slgr $out,$inp lrvg $s0,$stdframe($sp) # load secno lghi $s1,0 la $s2,$tweak($sp) lghi $s3,16 stmg $s0,$s1,0($s2) la %r1,0($key2) # $key2 is not needed anymore .long 0xb92e00aa # km $s2,$s2, generate the tweak brc 1,.-4 # can this happen? l %r0,240($key1) la %r1,0($key1) # $key1 is not needed anymore bras $ra,_s390x_xts_km jz .Lxts_enc_km_done aghi $inp,-16 # take one step back la $i3,0($out,$inp) # put aside real $out .Lxts_enc_km_steal: llgc $i1,16($inp) llgc $i2,0($out,$inp) stc $i1,0($out,$inp) stc $i2,16($out,$inp) la $inp,1($inp) brct $len,.Lxts_enc_km_steal la $s2,0($i3) lghi $s3,16 lrvgr $i1,$s0 # flip byte order lrvgr $i2,$s1 xg $i1,0($s2) xg $i2,8($s2) stg $i1,0($s2) stg $i2,8($s2) .long 0xb92e00aa # km $s2,$s2 brc 1,.-4 # can this happen? 
lrvgr $i1,$s0 # flip byte order lrvgr $i2,$s1 xg $i1,0($i3) xg $i2,8($i3) stg $i1,0($i3) stg $i2,8($i3) .Lxts_enc_km_done: l${g} $ra,14*$SIZE_T($sp) stg $sp,$tweak+0($sp) # wipe both halves of the tweak stg $sp,$tweak+8($sp) lm${g} %r6,$s3,6*$SIZE_T($sp) br $ra .align 16 .Lxts_enc_software: ___ $code.=<<___; stm${g} %r6,$ra,6*$SIZE_T($sp) slgr $out,$inp xgr $s0,$s0 # clear upper half xgr $s1,$s1 lrv $s0,$stdframe+4($sp) # load secno lrv $s1,$stdframe+0($sp) xgr $s2,$s2 xgr $s3,$s3 stm${g} %r2,%r5,2*$SIZE_T($sp) la $key,0($key2) larl $tbl,AES_Te bras $ra,_s390x_AES_encrypt # generate the tweak lm${g} %r2,%r5,2*$SIZE_T($sp) stm $s0,$s3,$tweak($sp) # save the tweak j .Lxts_enc_enter .align 16 .Lxts_enc_loop: lrvg $s1,$tweak+0($sp) # load the tweak in little-endian lrvg $s3,$tweak+8($sp) lghi %r1,0x87 srag %r0,$s3,63 # broadcast upper bit ngr %r1,%r0 # rem srlg %r0,$s1,63 # carry bit from lower half sllg $s1,$s1,1 sllg $s3,$s3,1 xgr $s1,%r1 ogr $s3,%r0 lrvgr $s1,$s1 # flip byte order lrvgr $s3,$s3 srlg $s0,$s1,32 # smash the tweak to 4x32-bits stg $s1,$tweak+0($sp) # save the tweak llgfr $s1,$s1 srlg $s2,$s3,32 stg $s3,$tweak+8($sp) llgfr $s3,$s3 la $inp,16($inp) # $inp+=16 .Lxts_enc_enter: x $s0,0($inp) # ^=*($inp) x $s1,4($inp) x $s2,8($inp) x $s3,12($inp) stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing la $key,0($key1) bras $ra,_s390x_AES_encrypt lm${g} %r2,%r5,2*$SIZE_T($sp) x $s0,$tweak+0($sp) # ^=tweak x $s1,$tweak+4($sp) x $s2,$tweak+8($sp) x $s3,$tweak+12($sp) st $s0,0($out,$inp) st $s1,4($out,$inp) st $s2,8($out,$inp) st $s3,12($out,$inp) brct${g} $len,.Lxts_enc_loop llgc $len,`2*$SIZE_T-1`($sp) nill $len,0x0f # $len%16 jz .Lxts_enc_done la $i3,0($inp,$out) # put aside real $out .Lxts_enc_steal: llgc %r0,16($inp) llgc %r1,0($out,$inp) stc %r0,0($out,$inp) stc %r1,16($out,$inp) la $inp,1($inp) brct $len,.Lxts_enc_steal la $out,0($i3) # restore real $out # generate last tweak...
lrvg $s1,$tweak+0($sp) # load the tweak in little-endian lrvg $s3,$tweak+8($sp) lghi %r1,0x87 srag %r0,$s3,63 # broadcast upper bit ngr %r1,%r0 # rem srlg %r0,$s1,63 # carry bit from lower half sllg $s1,$s1,1 sllg $s3,$s3,1 xgr $s1,%r1 ogr $s3,%r0 lrvgr $s1,$s1 # flip byte order lrvgr $s3,$s3 srlg $s0,$s1,32 # smash the tweak to 4x32-bits stg $s1,$tweak+0($sp) # save the tweak llgfr $s1,$s1 srlg $s2,$s3,32 stg $s3,$tweak+8($sp) llgfr $s3,$s3 x $s0,0($out) # ^=*(inp)|stolen cipher-text x $s1,4($out) x $s2,8($out) x $s3,12($out) st${g} $out,4*$SIZE_T($sp) la $key,0($key1) bras $ra,_s390x_AES_encrypt l${g} $out,4*$SIZE_T($sp) x $s0,`$tweak+0`($sp) # ^=tweak x $s1,`$tweak+4`($sp) x $s2,`$tweak+8`($sp) x $s3,`$tweak+12`($sp) st $s0,0($out) st $s1,4($out) st $s2,8($out) st $s3,12($out) .Lxts_enc_done: stg $sp,$tweak+0($sp) # wipe both halves of the tweak stg $sp,$tweak+8($sp) lm${g} %r6,$ra,6*$SIZE_T($sp) br $ra .size AES_xts_encrypt,.-AES_xts_encrypt ___ # void AES_xts_decrypt(const char *inp,char *out,size_t len, # const AES_KEY *key1, const AES_KEY *key2,u64 secno); # $code.=<<___; .globl AES_xts_decrypt .type AES_xts_decrypt,\@function .align 16 AES_xts_decrypt: xgr %r3,%r4 # flip %r3 and %r4, $out and $len xgr %r4,%r3 xgr %r3,%r4 ___ $code.=<<___ if ($SIZE_T==4); llgfr $len,$len ___ $code.=<<___; st${g} $len,1*$SIZE_T($sp) # save copy of $len aghi $len,-16 bcr 4,$ra # abort if less than zero. formally # wrong, because $len is unsigned, # but who can afford asking to # process more than 2^63-1 bytes?
tmll $len,0x0f jnz .Lxts_dec_proceed aghi $len,16 .Lxts_dec_proceed: ___ $code.=<<___ if (!$softonly); llgf %r0,240($key2) lhi %r1,16 clr %r0,%r1 jl .Lxts_dec_software stm${g} %r6,$s3,6*$SIZE_T($sp) st${g} $ra,14*$SIZE_T($sp) nill $len,0xfff0 # $len&=~15 slgr $out,$inp # generate the tweak value lrvg $s0,$stdframe($sp) # load secno lghi $s1,0 la $s2,$tweak($sp) lghi $s3,16 stg $s0,0($s2) stg $s1,8($s2) la %r1,0($key2) # $key2 is not needed past this point .long 0xb92e00aa # km $s2,$s2, generate the tweak brc 1,.-4 # can this happen? l %r0,240($key1) la %r1,0($key1) # $key1 is not needed anymore ltgr $len,$len jz .Lxts_dec_km_short bras $ra,_s390x_xts_km jz .Lxts_dec_km_done lrvgr $s2,$s0 # make copy in reverse byte order lrvgr $s3,$s1 j .Lxts_dec_km_2ndtweak .Lxts_dec_km_short: llgc $len,`2*$SIZE_T-1`($sp) nill $len,0x0f # $len%=16 lrvg $s0,$tweak+0($sp) # load the tweak lrvg $s1,$tweak+8($sp) lrvgr $s2,$s0 # make copy in reverse byte order lrvgr $s3,$s1 .Lxts_dec_km_2ndtweak: lghi $i1,0x87 srag $i2,$s1,63 # broadcast upper bit ngr $i1,$i2 # rem srlg $i2,$s0,63 # carry bit from lower half sllg $s0,$s0,1 sllg $s1,$s1,1 xgr $s0,$i1 ogr $s1,$i2 lrvgr $i1,$s0 # flip byte order lrvgr $i2,$s1 xg $i1,0($inp) xg $i2,8($inp) stg $i1,0($out,$inp) stg $i2,8($out,$inp) la $i2,0($out,$inp) lghi $i3,16 .long 0xb92e0066 # km $i2,$i2 brc 1,.-4 # can this happen? lrvgr $i1,$s0 lrvgr $i2,$s1 xg $i1,0($out,$inp) xg $i2,8($out,$inp) stg $i1,0($out,$inp) stg $i2,8($out,$inp) la $i3,0($out,$inp) # put aside real $out .Lxts_dec_km_steal: llgc $i1,16($inp) llgc $i2,0($out,$inp) stc $i1,0($out,$inp) stc $i2,16($out,$inp) la $inp,1($inp) brct $len,.Lxts_dec_km_steal lgr $s0,$s2 lgr $s1,$s3 xg $s0,0($i3) xg $s1,8($i3) stg $s0,0($i3) stg $s1,8($i3) la $s0,0($i3) lghi $s1,16 .long 0xb92e0088 # km $s0,$s0 brc 1,.-4 # can this happen? 
xg $s2,0($i3) xg $s3,8($i3) stg $s2,0($i3) stg $s3,8($i3) .Lxts_dec_km_done: l${g} $ra,14*$SIZE_T($sp) stg $sp,$tweak+0($sp) # wipe both halves of the tweak stg $sp,$tweak+8($sp) lm${g} %r6,$s3,6*$SIZE_T($sp) br $ra .align 16 .Lxts_dec_software: ___ $code.=<<___; stm${g} %r6,$ra,6*$SIZE_T($sp) srlg $len,$len,4 slgr $out,$inp xgr $s0,$s0 # clear upper half xgr $s1,$s1 lrv $s0,$stdframe+4($sp) # load secno lrv $s1,$stdframe+0($sp) xgr $s2,$s2 xgr $s3,$s3 stm${g} %r2,%r5,2*$SIZE_T($sp) la $key,0($key2) larl $tbl,AES_Te bras $ra,_s390x_AES_encrypt # generate the tweak lm${g} %r2,%r5,2*$SIZE_T($sp) larl $tbl,AES_Td lt${g}r $len,$len stm $s0,$s3,$tweak($sp) # save the tweak jz .Lxts_dec_short j .Lxts_dec_enter .align 16 .Lxts_dec_loop: lrvg $s1,$tweak+0($sp) # load the tweak in little-endian lrvg $s3,$tweak+8($sp) lghi %r1,0x87 srag %r0,$s3,63 # broadcast upper bit ngr %r1,%r0 # rem srlg %r0,$s1,63 # carry bit from lower half sllg $s1,$s1,1 sllg $s3,$s3,1 xgr $s1,%r1 ogr $s3,%r0 lrvgr $s1,$s1 # flip byte order lrvgr $s3,$s3 srlg $s0,$s1,32 # smash the tweak to 4x32-bits stg $s1,$tweak+0($sp) # save the tweak llgfr $s1,$s1 srlg $s2,$s3,32 stg $s3,$tweak+8($sp) llgfr $s3,$s3 .Lxts_dec_enter: x $s0,0($inp) # tweak^=*(inp) x $s1,4($inp) x $s2,8($inp) x $s3,12($inp) stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing la $key,0($key1) bras $ra,_s390x_AES_decrypt lm${g} %r2,%r5,2*$SIZE_T($sp) x $s0,$tweak+0($sp) # ^=tweak x $s1,$tweak+4($sp) x $s2,$tweak+8($sp) x $s3,$tweak+12($sp) st $s0,0($out,$inp) st $s1,4($out,$inp) st $s2,8($out,$inp) st $s3,12($out,$inp) la $inp,16($inp) brct${g} $len,.Lxts_dec_loop llgc $len,`2*$SIZE_T-1`($sp) nill $len,0x0f # $len%16 jz .Lxts_dec_done # generate pair of tweaks...
lrvg $s1,$tweak+0($sp) # load the tweak in little-endian lrvg $s3,$tweak+8($sp) lghi %r1,0x87 srag %r0,$s3,63 # broadcast upper bit ngr %r1,%r0 # rem srlg %r0,$s1,63 # carry bit from lower half sllg $s1,$s1,1 sllg $s3,$s3,1 xgr $s1,%r1 ogr $s3,%r0 lrvgr $i2,$s1 # flip byte order lrvgr $i3,$s3 stmg $i2,$i3,$tweak($sp) # save the 1st tweak j .Lxts_dec_2ndtweak .align 16 .Lxts_dec_short: llgc $len,`2*$SIZE_T-1`($sp) nill $len,0x0f # $len%16 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian lrvg $s3,$tweak+8($sp) .Lxts_dec_2ndtweak: lghi %r1,0x87 srag %r0,$s3,63 # broadcast upper bit ngr %r1,%r0 # rem srlg %r0,$s1,63 # carry bit from lower half sllg $s1,$s1,1 sllg $s3,$s3,1 xgr $s1,%r1 ogr $s3,%r0 lrvgr $s1,$s1 # flip byte order lrvgr $s3,$s3 srlg $s0,$s1,32 # smash the tweak to 4x32-bits stg $s1,$tweak-16+0($sp) # save the 2nd tweak llgfr $s1,$s1 srlg $s2,$s3,32 stg $s3,$tweak-16+8($sp) llgfr $s3,$s3 x $s0,0($inp) # tweak_the_2nd^=*(inp) x $s1,4($inp) x $s2,8($inp) x $s3,12($inp) stm${g} %r2,%r3,2*$SIZE_T($sp) la $key,0($key1) bras $ra,_s390x_AES_decrypt lm${g} %r2,%r5,2*$SIZE_T($sp) x $s0,$tweak-16+0($sp) # ^=tweak_the_2nd x $s1,$tweak-16+4($sp) x $s2,$tweak-16+8($sp) x $s3,$tweak-16+12($sp) st $s0,0($out,$inp) st $s1,4($out,$inp) st $s2,8($out,$inp) st $s3,12($out,$inp) la $i3,0($out,$inp) # put aside real $out .Lxts_dec_steal: llgc %r0,16($inp) llgc %r1,0($out,$inp) stc %r0,0($out,$inp) stc %r1,16($out,$inp) la $inp,1($inp) brct $len,.Lxts_dec_steal la $out,0($i3) # restore real $out lm $s0,$s3,$tweak($sp) # load the 1st tweak x $s0,0($out) # tweak^=*(inp)|stolen cipher-text x $s1,4($out) x $s2,8($out) x $s3,12($out) st${g} $out,4*$SIZE_T($sp) la $key,0($key1) bras $ra,_s390x_AES_decrypt l${g} $out,4*$SIZE_T($sp) x $s0,$tweak+0($sp) # ^=tweak x $s1,$tweak+4($sp) x $s2,$tweak+8($sp) x $s3,$tweak+12($sp) st $s0,0($out) st $s1,4($out) st $s2,8($out) st $s3,12($out) stg $sp,$tweak-16+0($sp) # wipe 2nd tweak stg $sp,$tweak-16+8($sp) .Lxts_dec_done: stg 
$sp,$tweak+0($sp) # wipe both halves of the tweak stg $sp,$tweak+8($sp) lm${g} %r6,$ra,6*$SIZE_T($sp) br $ra .size AES_ctr32_encrypt,.-AES_ctr32_encrypt .size AES_xts_decrypt,.-AES_xts_decrypt ___ } $code.=<<___; .comm OPENSSL_s390xcap_P,16,8 .string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>" .comm OPENSSL_s390xcap_P,16,8 ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; Loading
crypto/bn/asm/s390x-mont.pl +4 −4 Original line number Diff line number Diff line Loading @@ -41,8 +41,8 @@ # processor, as long as it's "z-CPU". Latter implies that the code # remains z/Architecture specific. Compatibility with 32-bit BN_ULONG # is achieved by swapping words after 64-bit loads, follow _dswap-s. # On z990 it was measured to perform 2.6-2.2 times better, less for # longer keys... # On z990 it was measured to perform 2.6-2.2 times better than # compiler-generated code, less for longer keys... $flavour = shift; Loading Loading @@ -102,8 +102,8 @@ $code.=<<___ if ($flavour =~ /3[12]/); bnzr %r14 # if ($num&1) return 0; ___ $code.=<<___ if ($flavour !~ /3[12]/); cghi $num,128 # bhr %r14 # if($num>128) return 0; cghi $num,96 # bhr %r14 # if($num>96) return 0; ___ $code.=<<___; stm${g} %r3,%r15,3*$SIZE_T($sp) Loading
crypto/modes/asm/ghash-s390x.pl +11 −2 Original line number Diff line number Diff line Loading @@ -28,6 +28,15 @@ # remains z/Architecture specific. On z990 it was measured to perform # 2.8x better than 32-bit code generated by gcc 4.3. # March 2011. # # Support for hardware KIMD-GHASH is verified to produce correct # result and therefore is engaged. On z196 it was measured to process # 8KB buffer ~7 times faster than software implementation. It's not as # impressive for smaller buffer sizes and for smallest 16-bytes buffer # it's actually almost 2 times slower. Which is the reason why # KIMD-GHASH is not used in gcm_gmult_4bit. $flavour = shift; if ($flavour =~ /3[12]/) { Loading @@ -41,7 +50,7 @@ if ($flavour =~ /3[12]/) { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; $softonly=1; # disable hardware support for now $softonly=0; $Zhi="%r0"; $Zlo="%r1"; Loading Loading @@ -70,7 +79,7 @@ $code.=<<___; .align 32 gcm_gmult_4bit: ___ $code.=<<___ if(!$softonly); $code.=<<___ if(!$softonly && 0); # hardware is slow for single block... larl %r1,OPENSSL_s390xcap_P lg %r0,0(%r1) tmhl %r0,0x4000 # check for message-security-assist Loading