#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# February 2009
#
# Performance is 2x of gcc 3.4.6 on z10. Coding "secret" is to
# "cluster" Address Generation Interlocks, so that one pipeline stall
# resolves several dependencies.
#
# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". Latter implies that the code
# remains z/Architecture specific. On z990 it was measured to perform
# 50% better than code generated by gcc 4.3.
# Generator for the s390x RC4 assembly module.
#
# Builds the assembly text for three symbols -- RC4, private_RC4_set_key and
# RC4_options -- in $code and prints it once at the end.
#
# Command line:
#   1st argument  "flavour".  When it matches /3[12]/ the generated code
#                 saves/restores registers with stm/lm in 4-byte slots and
#                 zero-extends the length argument; otherwise stmg/lmg with
#                 8-byte slots are used.
#   remaining     scanned left to right; the first one that looks like
#                 "name.ext" is taken as the output file.  When no output
#                 file is given the assembly goes to the inherited STDOUT.
#
# The RC4_KEY layout assumed by the generated code is visible in the loads
# and stores below: byte 0 holds x, byte 1 holds y, and the 256 one-byte
# table entries start at offset 2.

use strict;
use warnings;

my $flavour = shift;
$flavour = '' unless defined $flavour;    # no argument behaves like a non-31/32 flavour

my ($SIZE_T, $g);
if ($flavour =~ /3[12]/) {
    $SIZE_T = 4;
    $g      = '';
}
else {
    $SIZE_T = 8;
    $g      = 'g';
}

# Skip arguments until one looks like a file name (or the list runs out).
my $output;
while (($output = shift) && ($output !~ /^\w[\w\-]*\.\w+$/)) {}

# Only redirect when an output name was actually supplied; otherwise keep
# writing to the STDOUT we were started with.  A named file that cannot be
# opened is a hard error rather than a silent fallback.
if (defined($output) && $output ne '') {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}

my $rp = "%r14";    # return address register
my $sp = "%r15";    # stack pointer register

my $code = <<___;
.text

___

# void RC4(RC4_KEY *key,size_t len,const void *inp,void *out)
{
    my $acc = "%r0";    # accumulates key-stream bytes / holds the data byte
    my $cnt = "%r1";    # number of 8-byte chunks (len >> 3)
    my $key = "%r2";
    my $len = "%r3";
    my $inp = "%r4";
    my $out = "%r5";

    # Two-element register pairs that are swapped ("rotated") after every
    # unrolled step, so element [0] is always "current" and [1] is "next".
    my @XX = ("%r6", "%r7");    # x index
    my @TX = ("%r8", "%r9");    # table byte at x
    my $YY = "%r10";            # y index
    my $TY = "%r11";            # table byte at y, then index of the output byte

    # Prologue: save %r6-%r11 in the caller-provided save area.
    $code .= <<___;
.globl	RC4
.type	RC4,\@function
.align	64
RC4:
	stm${g}	%r6,%r11,6*$SIZE_T($sp)
___

    # 31/32-bit flavours: zero-extend len to 64 bits before the 64-bit
    # shift/test instructions below use it.
    $code .= <<___ if ($flavour =~ /3[12]/);
	llgfr	$len,$len
___

    # Load x and y from the key, pre-increment x (mod 256), compute the
    # number of 8-byte chunks and preload the table byte at x.  If there
    # are no full chunks go straight to the byte-at-a-time tail.
    $code .= <<___;
	llgc	$XX[0],0($key)
	llgc	$YY,1($key)
	la	$XX[0],1($XX[0])
	nill	$XX[0],0xff
	srlg	$cnt,$len,3
	ltgr	$cnt,$cnt
	llgc	$TX[0],2($XX[0],$key)
	jz	.Lshort
	j	.Loop8

.align	64
.Loop8:
___

    # Eight unrolled RC4 steps.  Each step updates y, swaps the table
    # entries at x and y, preloads the table byte for the next x, and
    # leaves in $TY the index of the key-stream byte, which is fetched at
    # the start of the *following* step (hence nothing is fetched for
    # step 0, a plain load is used for step 1, and shift+insert after that).
    for my $i (0 .. 7) {
        $code .= <<___;
	la	$YY,0($YY,$TX[0])	# $i
	nill	$YY,255
	la	$XX[1],1($XX[0])
	nill	$XX[1],255
___
        $code .= <<___ if ($i == 1);
	llgc	$acc,2($TY,$key)
___
        $code .= <<___ if ($i > 1);
	sllg	$acc,$acc,8
	ic	$acc,2($TY,$key)
___

        # When the next x equals y, the preloaded "next" table byte is
        # replaced by the value just stored at y (.Lcmov<i> skips that).
        $code .= <<___;
	llgc	$TY,2($YY,$key)
	stc	$TX[0],2($YY,$key)
	llgc	$TX[1],2($XX[1],$key)
	stc	$TY,2($XX[0],$key)
	cr	$XX[1],$YY
	jne	.Lcmov$i
	la	$TX[1],0($TX[0])
.Lcmov$i:
	la	$TY,0($TY,$TX[0])
	nill	$TY,255
___

        # "rotate" registers: next becomes current for the following step.
        # After an even number of steps the pairs are back in their
        # original order, which the code after the loop relies on.
        push(@TX, shift(@TX));
        push(@XX, shift(@XX));
    }

    # Finish the 8-byte chunk: insert the eighth key-stream byte, XOR the
    # accumulated 8 bytes with 8 input bytes ($TX[1] is used as scratch),
    # store, advance and loop.  Then the tail loop handles len & 7 bytes
    # one at a time, and the epilogue undoes the x pre-increment, writes x
    # and y back to the key, restores registers and returns.
    $code .= <<___;
	lg	$TX[1],0($inp)
	sllg	$acc,$acc,8
	la	$inp,8($inp)
	ic	$acc,2($TY,$key)
	xgr	$acc,$TX[1]
	stg	$acc,0($out)
	la	$out,8($out)
	brctg	$cnt,.Loop8

.Lshort:
	lghi	$acc,7
	ngr	$len,$acc
	jz	.Lexit
	j	.Loop1

.align	16
.Loop1:
	la	$YY,0($YY,$TX[0])
	nill	$YY,255
	llgc	$TY,2($YY,$key)
	stc	$TX[0],2($YY,$key)
	stc	$TY,2($XX[0],$key)
	ar	$TY,$TX[0]
	ahi	$XX[0],1
	nill	$TY,255
	nill	$XX[0],255
	llgc	$acc,0($inp)
	la	$inp,1($inp)
	llgc	$TY,2($TY,$key)
	llgc	$TX[0],2($XX[0],$key)
	xr	$acc,$TY
	stc	$acc,0($out)
	la	$out,1($out)
	brct	$len,.Loop1

.Lexit:
	ahi	$XX[0],-1
	stc	$XX[0],0($key)
	stc	$YY,1($key)
	lm${g}	%r6,%r11,6*$SIZE_T($sp)
	br	$rp
.size	RC4,.-RC4
.string	"RC4 for s390x, CRYPTOGAMS by "

___
}

# void RC4_set_key(RC4_KEY *key,unsigned int len,const void *inp)
{
    my $cnt  = "%r0";    # loop counter (256 entries, then remaining key bytes)
    my $idx  = "%r1";    # table index: i in the first loop, j in the second
    my $key  = "%r2";
    my $len  = "%r3";
    my $inp  = "%r4";
    my $acc  = "%r5";    # table byte at i
    my $dat  = "%r6";    # key byte, then table byte at j
    my $ikey = "%r7";    # i - 256, runs from -256 up to 0
    my $iinp = "%r8";    # offset into the user key, wraps to 0 after len bytes

    # .L1stloop: zero x and y (one halfword store) and fill the table with
    #            0..255.
    # .L2ndloop: for each i: j = (j + table[i] + key[i mod len]) & 255, then
    #            swap table[i] and table[j].  "tml" tests the low 8 bits of
    #            the already-incremented $ikey; the "jz" after the two
    #            stores acts on that result and ends the loop once $ikey
    #            has reached 0.  When the user key is exhausted, $cnt and
    #            $iinp are reset so the key bytes repeat.
    $code .= <<___;
.globl	private_RC4_set_key
.type	private_RC4_set_key,\@function
.align	64
private_RC4_set_key:
	stm${g}	%r6,%r8,6*$SIZE_T($sp)
	lhi	$cnt,256
	la	$idx,0(%r0)
	sth	$idx,0($key)
.align	4
.L1stloop:
	stc	$idx,2($idx,$key)
	la	$idx,1($idx)
	brct	$cnt,.L1stloop

	lghi	$ikey,-256
	lr	$cnt,$len
	la	$iinp,0(%r0)
	la	$idx,0(%r0)
.align	16
.L2ndloop:
	llgc	$acc,2+256($ikey,$key)
	llgc	$dat,0($iinp,$inp)
	la	$idx,0($idx,$acc)
	la	$ikey,1($ikey)
	la	$idx,0($idx,$dat)
	nill	$idx,255
	la	$iinp,1($iinp)
	tml	$ikey,255
	llgc	$dat,2($idx,$key)
	stc	$dat,2+256-1($ikey,$key)
	stc	$acc,2($idx,$key)
	jz	.Ldone
	brct	$cnt,.L2ndloop
	lr	$cnt,$len
	la	$iinp,0(%r0)
	j	.L2ndloop
.Ldone:
	lm${g}	%r6,%r8,6*$SIZE_T($sp)
	br	$rp
.size	private_RC4_set_key,.-private_RC4_set_key

___
}

# const char *RC4_options()
# Returns the address of a constant string placed in .rodata.
$code .= <<___;
.globl	RC4_options
.type	RC4_options,\@function
.align	16
RC4_options:
	larl	%r2,.Loptions
	br	%r14
.size	RC4_options,.-RC4_options
.section	.rodata
.Loptions:
.align	8
.string	"rc4(8x,char)"
___

# Emit everything in one go.  Buffered write errors only surface at close,
# so both the print and the close are checked.
print($code) or die "error writing output: $!";
close(STDOUT) or die "error closing STDOUT: $!";    # force flush