Loading crypto/sha/asm/sha1-586.pl +56 −46 Original line number Diff line number Diff line Loading @@ -12,6 +12,8 @@ # commentary below], and in 2006 the rest was rewritten in order to # gain freedom to liberate licensing terms. # January, September 2004. # # It was noted that Intel IA-32 C compiler generates code which # performs ~30% *faster* on P4 CPU than original *hand-coded* # SHA1 assembler implementation. To address this problem (and Loading @@ -31,6 +33,17 @@ # ---------------------------------------------------------------- # <appro@fy.chalmers.se> # August 2009. # # George Spelvin has tipped that F_40_59(b,c,d) can be rewritten as # '(c&d) + (b&(c^d))', which allows to accumulate partial results # and lighten "pressure" on scratch registers. This resulted in # >12% performance improvement on contemporary AMD cores (with no # degradation on other CPUs:-). Also, the code was revised to maximize # "distance" between instructions producing input to 'lea' instruction # and the 'lea' instruction itself, which is essential for Intel Atom # core. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; Loading Loading @@ -59,15 +72,16 @@ sub BODY_00_15 &rotl($tmp1,5); # tmp1=ROTATE(a,5) &xor($f,$d); &add($tmp1,$e); # tmp1+=e; &and($f,$b); &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded # with xi, also note that e becomes # f in next round... &xor($f,$d); # f holds F_00_19(b,c,d) &and($f,$b); &rotr($b,2); # b=ROTATE(b,30) &xor($f,$d); # f holds F_00_19(b,c,d) &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi if ($n==15) { &add($f,$tmp1); } # f+=tmp1 if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round &add($f,$tmp1); } # f+=tmp1 else { &add($tmp1,$f); } # f becomes a in next round } Loading @@ -77,22 +91,22 @@ sub BODY_16_19 &comment("16_19 $n"); &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d) &xor($f,&swtmp(($n+2)%16)); &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &xor($tmp1,$d); &xor($f,&swtmp(($n+8)%16)); &and($tmp1,$b); # tmp1 holds F_00_19(b,c,d) &rotr($b,2); # b=ROTATE(b,30) &and($tmp1,$b); &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd &rotl($f,1); # f=ROTATE(f,1) &xor($tmp1,$d); # tmp1=F_00_19(b,c,d) &add($e,$tmp1); # e+=F_00_19(b,c,d) &mov($tmp1,$a); &rotr($b,2); # b=ROTATE(b,30) &mov(&swtmp($n%16),$f); # xi=f &lea($f,&DWP(0x5a827999,$f,$e));# f+=K_00_19+e &mov($e,$a); # e becomes volatile &rotl($e,5); # e=ROTATE(a,5) &add($f,$tmp1); # f+=F_00_19(b,c,d) &add($f,$e); # f+=ROTATE(a,5) &rotl($tmp1,5); # ROTATE(a,5) &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round &add($f,$tmp1); # f+=ROTATE(a,5) } sub BODY_20_39 Loading @@ -103,20 +117,20 @@ sub BODY_20_39 &comment("20_39 $n"); &mov($tmp1,$b); # tmp1 to hold F_20_39(b,c,d) &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &rotr($b,2); # b=ROTATE(b,30) &xor($f,&swtmp(($n+2)%16)); &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &xor($tmp1,$c); &xor($f,&swtmp(($n+8)%16)); &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d) &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd &rotl($f,1); # f=ROTATE(f,1) &add($tmp1,$e); &mov(&swtmp($n%16),$f); # xi=f &mov($e,$a); # e becomes volatile &rotl($e,5); # e=ROTATE(a,5) &lea($f,&DWP($K,$f,$tmp1)); # f+=K_20_39+e &add($f,$e); # f+=ROTATE(a,5) &add($e,$tmp1); # e+=F_20_39(b,c,d) &rotr($b,2); # b=ROTATE(b,30) &mov($tmp1,$a); &rotl($tmp1,5); # ROTATE(a,5) &mov(&swtmp($n%16),$f) if($n<77);# xi=f &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round &add($f,$tmp1); # f+=ROTATE(a,5) } sub BODY_40_59 Loading @@ -125,28 +139,24 @@ sub BODY_40_59 &comment("40_59 $n"); &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &mov($tmp1,&swtmp(($n+2)%16)); &xor($f,$tmp1); &mov($tmp1,&swtmp(($n+8)%16)); &xor($f,$tmp1); &mov($tmp1,&swtmp(($n+13)%16)); &xor($f,$tmp1); # f holds xa^xb^xc^xd &mov($tmp1,$b); # tmp1 to hold F_40_59(b,c,d) &mov($tmp1,$c); # tmp1 to hold F_40_59(b,c,d) &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &xor($tmp1,$d); &xor($f,&swtmp(($n+8)%16)); &and($tmp1,$b); &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd &rotl($f,1); # f=ROTATE(f,1) &or($tmp1,$c); &mov(&swtmp($n%16),$f); # xi=f &and($tmp1,$d); &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e &mov($e,$b); # e becomes volatile and is used # to calculate F_40_59(b,c,d) &add($tmp1,$e); # b&(c^d)+=e &rotr($b,2); # b=ROTATE(b,30) &and($e,$c); &or($tmp1,$e); # tmp1 holds F_40_59(b,c,d) &mov($e,$a); &rotl($e,5); # e=ROTATE(a,5) &add($f,$tmp1); # f+=tmp1; &mov($e,$a); # e becomes volatile &rotl($e,5); # ROTATE(a,5) &mov(&swtmp($n%16),$f); # xi=f &lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d)) &mov($tmp1,$c); &add($f,$e); # f+=ROTATE(a,5) &and($tmp1,$d); &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round &add($f,$tmp1); # f+=c&d } &function_begin("sha1_block_data_order"); Loading crypto/sha/asm/sha1-x86_64.pl +133 −134 Original line number Diff line number Diff line Loading @@ -16,7 +16,7 @@ # There was suggestion to mechanically translate 32-bit code, but I # dismissed it, reasoning that x86_64 offers enough register bank # capacity to fully utilize SHA-1 parallelism. Therefore this fresh # implementation:-) However! While 64-bit code does performs better # implementation:-) However! While 64-bit code does perform better # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well, # x86_64 does offer larger *addressable* bank, but out-of-order core # reaches for even more registers through dynamic aliasing, and EM64T Loading @@ -29,6 +29,13 @@ # Xeon P4 +65% +0% 9.9 # Core2 +60% +10% 7.0 # August 2009. # # The code was revised to minimize code size and to maximize # "distance" between instructions producing input to 'lea' # instruction and the 'lea' instruction itself, which is essential # for Intel Atom core. $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } Loading @@ -51,194 +58,184 @@ $ctx="%r8"; $inp="%r9"; $num="%r10"; $xi="%eax"; $t0="%ebx"; $t1="%ecx"; $A="%edx"; $B="%esi"; $C="%edi"; $D="%ebp"; $E="%r11d"; $T="%r12d"; $t0="%eax"; $t1="%ebx"; $t2="%ecx"; @xi=("%edx","%ebp"); $A="%esi"; $B="%edi"; $C="%r11d"; $D="%r12d"; $E="%r13d"; @V=($A,$B,$C,$D,$E,$T); sub PROLOGUE { my $func=shift; $code.=<<___; .globl $func .type $func,\@function,3 .align 16 $func: push %rbx push %rbp push %r12 mov %rsp,%r11 mov %rdi,$ctx # reassigned argument sub \$`8+16*4`,%rsp mov %rsi,$inp # reassigned argument and \$-64,%rsp mov %rdx,$num # reassigned argument mov %r11,`16*4`(%rsp) .Lprologue: mov 0($ctx),$A mov 4($ctx),$B mov 8($ctx),$C mov 12($ctx),$D mov 16($ctx),$E ___ } sub EPILOGUE { my $func=shift; $code.=<<___; mov `16*4`(%rsp),%rsi mov (%rsi),%r12 mov 8(%rsi),%rbp mov 16(%rsi),%rbx lea 24(%rsi),%rsp .Lepilogue: ret .size $func,.-$func ___ } @V=($A,$B,$C,$D,$E); sub BODY_00_19 { my ($i,$a,$b,$c,$d,$e,$f,$host)=@_; my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; $code.=<<___ if ($i==0); mov `4*$i`($inp),$xi `"bswap $xi" if(!defined($host))` mov $xi,`4*$i`(%rsp) mov `4*$i`($inp),$xi[0] bswap $xi[0] mov $xi[0],`4*$i`(%rsp) ___ $code.=<<___ if ($i<15); lea 0x5a827999($xi,$e),$f mov $c,$t0 mov `4*$j`($inp),$xi mov $a,$e mov `4*$j`($inp),$xi[1] mov $a,$t2 xor $d,$t0 `"bswap $xi" if(!defined($host))` rol \$5,$e bswap $xi[1] rol \$5,$t2 lea 0x5a827999($xi[0],$e),$e and $b,$t0 mov $xi,`4*$j`(%rsp) add $e,$f mov $xi[1],`4*$j`(%rsp) add $t2,$e xor $d,$t0 rol \$30,$b add $t0,$f add $t0,$e ___ $code.=<<___ if ($i>=15); lea 0x5a827999($xi,$e),$f mov `4*($j%16)`(%rsp),$xi mov `4*($j%16)`(%rsp),$xi[1] mov $c,$t0 mov $a,$e xor `4*(($j+2)%16)`(%rsp),$xi mov $a,$t2 xor `4*(($j+2)%16)`(%rsp),$xi[1] xor $d,$t0 rol \$5,$e xor `4*(($j+8)%16)`(%rsp),$xi rol \$5,$t2 xor `4*(($j+8)%16)`(%rsp),$xi[1] and $b,$t0 add $e,$f xor `4*(($j+13)%16)`(%rsp),$xi lea 0x5a827999($xi[0],$e),$e xor `4*(($j+13)%16)`(%rsp),$xi[1] xor $d,$t0 rol \$1,$xi[1] add $t2,$e rol \$30,$b add $t0,$f rol \$1,$xi mov $xi,`4*($j%16)`(%rsp) mov $xi[1],`4*($j%16)`(%rsp) add $t0,$e ___ unshift(@xi,pop(@xi)); } sub BODY_20_39 { my ($i,$a,$b,$c,$d,$e,$f)=@_; my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; my $K=($i<40)?0x6ed9eba1:0xca62c1d6; $code.=<<___ if ($i<79); lea $K($xi,$e),$f mov `4*($j%16)`(%rsp),$xi mov `4*($j%16)`(%rsp),$xi[1] mov $c,$t0 mov $a,$e xor `4*(($j+2)%16)`(%rsp),$xi mov $a,$t2 xor `4*(($j+2)%16)`(%rsp),$xi[1] xor $b,$t0 rol \$5,$e xor `4*(($j+8)%16)`(%rsp),$xi rol \$5,$t2 lea $K($xi[0],$e),$e xor `4*(($j+8)%16)`(%rsp),$xi[1] xor $d,$t0 add $e,$f xor `4*(($j+13)%16)`(%rsp),$xi add $t2,$e xor `4*(($j+13)%16)`(%rsp),$xi[1] rol \$30,$b add $t0,$f rol \$1,$xi add $t0,$e rol \$1,$xi[1] ___ $code.=<<___ if ($i<76); mov $xi,`4*($j%16)`(%rsp) mov $xi[1],`4*($j%16)`(%rsp) ___ $code.=<<___ if ($i==79); lea $K($xi,$e),$f mov $c,$t0 mov $a,$e mov $a,$t2 xor $b,$t0 rol \$5,$e lea $K($xi[0],$e),$e rol \$5,$t2 xor $d,$t0 add $e,$f add $t2,$e rol \$30,$b add $t0,$f add $t0,$e ___ unshift(@xi,pop(@xi)); } sub BODY_40_59 { my ($i,$a,$b,$c,$d,$e,$f)=@_; my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; $code.=<<___; lea 0x8f1bbcdc($xi,$e),$f mov `4*($j%16)`(%rsp),$xi mov $b,$t0 mov $b,$t1 xor `4*(($j+2)%16)`(%rsp),$xi mov $a,$e and $c,$t0 xor `4*(($j+8)%16)`(%rsp),$xi or $c,$t1 rol \$5,$e xor `4*(($j+13)%16)`(%rsp),$xi and $d,$t1 add $e,$f rol \$1,$xi or $t1,$t0 mov `4*($j%16)`(%rsp),$xi[1] mov $c,$t0 mov $c,$t1 xor `4*(($j+2)%16)`(%rsp),$xi[1] and $d,$t0 mov $a,$t2 xor `4*(($j+8)%16)`(%rsp),$xi[1] xor $d,$t1 lea 0x8f1bbcdc($xi[0],$e),$e rol \$5,$t2 xor `4*(($j+13)%16)`(%rsp),$xi[1] add $t0,$e and $b,$t1 rol \$1,$xi[1] add $t1,$e rol \$30,$b mov $xi,`4*($j%16)`(%rsp) add $t0,$f mov $xi[1],`4*($j%16)`(%rsp) add $t2,$e ___ unshift(@xi,pop(@xi)); } $code=".text\n"; $code.=<<___; .text &PROLOGUE("sha1_block_data_order"); $code.=".align 4\n.Lloop:\n"; .globl sha1_block_data_order .type sha1_block_data_order,\@function,3 .align 16 sha1_block_data_order: push %rbx push %rbp push %r12 push %r13 mov %rsp,%r11 mov %rdi,$ctx # reassigned argument sub \$`8+16*4`,%rsp mov %rsi,$inp # reassigned argument and \$-64,%rsp mov %rdx,$num # reassigned argument mov %r11,`16*4`(%rsp) .Lprologue: mov 0($ctx),$A mov 4($ctx),$B mov 8($ctx),$C mov 12($ctx),$D mov 16($ctx),$E .align 4 .Lloop: ___ for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } $code.=<<___; add 0($ctx),$E add 4($ctx),$T add 8($ctx),$A add 12($ctx),$B add 16($ctx),$C mov $E,0($ctx) mov $T,4($ctx) mov $A,8($ctx) mov $B,12($ctx) mov $C,16($ctx) xchg $E,$A # mov $E,$A xchg $T,$B # mov $T,$B xchg $E,$C # mov $A,$C xchg $T,$D # mov $B,$D # mov $C,$E lea `16*4`($inp),$inp add 0($ctx),$A add 4($ctx),$B add 8($ctx),$C add 12($ctx),$D add 16($ctx),$E mov $A,0($ctx) mov $B,4($ctx) mov $C,8($ctx) mov $D,12($ctx) mov $E,16($ctx) sub \$1,$num lea `16*4`($inp),$inp jnz .Lloop ___ &EPILOGUE("sha1_block_data_order"); $code.=<<___; mov `16*4`(%rsp),%rsi mov (%rsi),%r13 mov 8(%rsi),%r12 mov 16(%rsi),%rbp mov 24(%rsi),%rbx lea 32(%rsi),%rsp .Lepilogue: ret .size sha1_block_data_order,.-sha1_block_data_order .asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" .align 16 ___ Loading Loading @@ -281,14 +278,16 @@ se_handler: jae .Lin_prologue mov `16*4`(%rax),%rax # pull saved stack pointer lea 24(%rax),%rax lea 32(%rax),%rax mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 .Lin_prologue: mov 8(%rax),%rdi Loading Loading
crypto/sha/asm/sha1-586.pl +56 −46 Original line number Diff line number Diff line Loading @@ -12,6 +12,8 @@ # commentary below], and in 2006 the rest was rewritten in order to # gain freedom to liberate licensing terms. # January, September 2004. # # It was noted that Intel IA-32 C compiler generates code which # performs ~30% *faster* on P4 CPU than original *hand-coded* # SHA1 assembler implementation. To address this problem (and Loading @@ -31,6 +33,17 @@ # ---------------------------------------------------------------- # <appro@fy.chalmers.se> # August 2009. # # George Spelvin has tipped that F_40_59(b,c,d) can be rewritten as # '(c&d) + (b&(c^d))', which allows to accumulate partial results # and lighten "pressure" on scratch registers. This resulted in # >12% performance improvement on contemporary AMD cores (with no # degradation on other CPUs:-). Also, the code was revised to maximize # "distance" between instructions producing input to 'lea' instruction # and the 'lea' instruction itself, which is essential for Intel Atom # core. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; Loading Loading @@ -59,15 +72,16 @@ sub BODY_00_15 &rotl($tmp1,5); # tmp1=ROTATE(a,5) &xor($f,$d); &add($tmp1,$e); # tmp1+=e; &and($f,$b); &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded # with xi, also note that e becomes # f in next round... &xor($f,$d); # f holds F_00_19(b,c,d) &and($f,$b); &rotr($b,2); # b=ROTATE(b,30) &xor($f,$d); # f holds F_00_19(b,c,d) &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi if ($n==15) { &add($f,$tmp1); } # f+=tmp1 if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round &add($f,$tmp1); } # f+=tmp1 else { &add($tmp1,$f); } # f becomes a in next round } Loading @@ -77,22 +91,22 @@ sub BODY_16_19 &comment("16_19 $n"); &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d) &xor($f,&swtmp(($n+2)%16)); &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &xor($tmp1,$d); &xor($f,&swtmp(($n+8)%16)); &and($tmp1,$b); # tmp1 holds F_00_19(b,c,d) &rotr($b,2); # b=ROTATE(b,30) &and($tmp1,$b); &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd &rotl($f,1); # f=ROTATE(f,1) &xor($tmp1,$d); # tmp1=F_00_19(b,c,d) &add($e,$tmp1); # e+=F_00_19(b,c,d) &mov($tmp1,$a); &rotr($b,2); # b=ROTATE(b,30) &mov(&swtmp($n%16),$f); # xi=f &lea($f,&DWP(0x5a827999,$f,$e));# f+=K_00_19+e &mov($e,$a); # e becomes volatile &rotl($e,5); # e=ROTATE(a,5) &add($f,$tmp1); # f+=F_00_19(b,c,d) &add($f,$e); # f+=ROTATE(a,5) &rotl($tmp1,5); # ROTATE(a,5) &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round &add($f,$tmp1); # f+=ROTATE(a,5) } sub BODY_20_39 Loading @@ -103,20 +117,20 @@ sub BODY_20_39 &comment("20_39 $n"); &mov($tmp1,$b); # tmp1 to hold F_20_39(b,c,d) &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &rotr($b,2); # b=ROTATE(b,30) &xor($f,&swtmp(($n+2)%16)); &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &xor($tmp1,$c); &xor($f,&swtmp(($n+8)%16)); &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d) &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd &rotl($f,1); # f=ROTATE(f,1) &add($tmp1,$e); &mov(&swtmp($n%16),$f); # xi=f &mov($e,$a); # e becomes volatile &rotl($e,5); # e=ROTATE(a,5) &lea($f,&DWP($K,$f,$tmp1)); # f+=K_20_39+e &add($f,$e); # f+=ROTATE(a,5) &add($e,$tmp1); # e+=F_20_39(b,c,d) &rotr($b,2); # b=ROTATE(b,30) &mov($tmp1,$a); &rotl($tmp1,5); # ROTATE(a,5) &mov(&swtmp($n%16),$f) if($n<77);# xi=f &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round &add($f,$tmp1); # f+=ROTATE(a,5) } sub BODY_40_59 Loading @@ -125,28 +139,24 @@ sub BODY_40_59 &comment("40_59 $n"); &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &mov($tmp1,&swtmp(($n+2)%16)); &xor($f,$tmp1); &mov($tmp1,&swtmp(($n+8)%16)); &xor($f,$tmp1); &mov($tmp1,&swtmp(($n+13)%16)); &xor($f,$tmp1); # f holds xa^xb^xc^xd &mov($tmp1,$b); # tmp1 to hold F_40_59(b,c,d) &mov($tmp1,$c); # tmp1 to hold F_40_59(b,c,d) &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &xor($tmp1,$d); &xor($f,&swtmp(($n+8)%16)); &and($tmp1,$b); &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd &rotl($f,1); # f=ROTATE(f,1) &or($tmp1,$c); &mov(&swtmp($n%16),$f); # xi=f &and($tmp1,$d); &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e &mov($e,$b); # e becomes volatile and is used # to calculate F_40_59(b,c,d) &add($tmp1,$e); # b&(c^d)+=e &rotr($b,2); # b=ROTATE(b,30) &and($e,$c); &or($tmp1,$e); # tmp1 holds F_40_59(b,c,d) &mov($e,$a); &rotl($e,5); # e=ROTATE(a,5) &add($f,$tmp1); # f+=tmp1; &mov($e,$a); # e becomes volatile &rotl($e,5); # ROTATE(a,5) &mov(&swtmp($n%16),$f); # xi=f &lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d)) &mov($tmp1,$c); &add($f,$e); # f+=ROTATE(a,5) &and($tmp1,$d); &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round &add($f,$tmp1); # f+=c&d } &function_begin("sha1_block_data_order"); Loading
crypto/sha/asm/sha1-x86_64.pl +133 −134 Original line number Diff line number Diff line Loading @@ -16,7 +16,7 @@ # There was suggestion to mechanically translate 32-bit code, but I # dismissed it, reasoning that x86_64 offers enough register bank # capacity to fully utilize SHA-1 parallelism. Therefore this fresh # implementation:-) However! While 64-bit code does performs better # implementation:-) However! While 64-bit code does perform better # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well, # x86_64 does offer larger *addressable* bank, but out-of-order core # reaches for even more registers through dynamic aliasing, and EM64T Loading @@ -29,6 +29,13 @@ # Xeon P4 +65% +0% 9.9 # Core2 +60% +10% 7.0 # August 2009. # # The code was revised to minimize code size and to maximize # "distance" between instructions producing input to 'lea' # instruction and the 'lea' instruction itself, which is essential # for Intel Atom core. $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } Loading @@ -51,194 +58,184 @@ $ctx="%r8"; $inp="%r9"; $num="%r10"; $xi="%eax"; $t0="%ebx"; $t1="%ecx"; $A="%edx"; $B="%esi"; $C="%edi"; $D="%ebp"; $E="%r11d"; $T="%r12d"; $t0="%eax"; $t1="%ebx"; $t2="%ecx"; @xi=("%edx","%ebp"); $A="%esi"; $B="%edi"; $C="%r11d"; $D="%r12d"; $E="%r13d"; @V=($A,$B,$C,$D,$E,$T); sub PROLOGUE { my $func=shift; $code.=<<___; .globl $func .type $func,\@function,3 .align 16 $func: push %rbx push %rbp push %r12 mov %rsp,%r11 mov %rdi,$ctx # reassigned argument sub \$`8+16*4`,%rsp mov %rsi,$inp # reassigned argument and \$-64,%rsp mov %rdx,$num # reassigned argument mov %r11,`16*4`(%rsp) .Lprologue: mov 0($ctx),$A mov 4($ctx),$B mov 8($ctx),$C mov 12($ctx),$D mov 16($ctx),$E ___ } sub EPILOGUE { my $func=shift; $code.=<<___; mov `16*4`(%rsp),%rsi mov (%rsi),%r12 mov 8(%rsi),%rbp mov 16(%rsi),%rbx lea 24(%rsi),%rsp .Lepilogue: ret .size $func,.-$func ___ } @V=($A,$B,$C,$D,$E); sub BODY_00_19 { my ($i,$a,$b,$c,$d,$e,$f,$host)=@_; my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; $code.=<<___ if ($i==0); mov `4*$i`($inp),$xi `"bswap $xi" if(!defined($host))` mov $xi,`4*$i`(%rsp) mov `4*$i`($inp),$xi[0] bswap $xi[0] mov $xi[0],`4*$i`(%rsp) ___ $code.=<<___ if ($i<15); lea 0x5a827999($xi,$e),$f mov $c,$t0 mov `4*$j`($inp),$xi mov $a,$e mov `4*$j`($inp),$xi[1] mov $a,$t2 xor $d,$t0 `"bswap $xi" if(!defined($host))` rol \$5,$e bswap $xi[1] rol \$5,$t2 lea 0x5a827999($xi[0],$e),$e and $b,$t0 mov $xi,`4*$j`(%rsp) add $e,$f mov $xi[1],`4*$j`(%rsp) add $t2,$e xor $d,$t0 rol \$30,$b add $t0,$f add $t0,$e ___ $code.=<<___ if ($i>=15); lea 0x5a827999($xi,$e),$f mov `4*($j%16)`(%rsp),$xi mov `4*($j%16)`(%rsp),$xi[1] mov $c,$t0 mov $a,$e xor `4*(($j+2)%16)`(%rsp),$xi mov $a,$t2 xor `4*(($j+2)%16)`(%rsp),$xi[1] xor $d,$t0 rol \$5,$e xor `4*(($j+8)%16)`(%rsp),$xi rol \$5,$t2 xor `4*(($j+8)%16)`(%rsp),$xi[1] and $b,$t0 add $e,$f xor `4*(($j+13)%16)`(%rsp),$xi lea 0x5a827999($xi[0],$e),$e xor `4*(($j+13)%16)`(%rsp),$xi[1] xor $d,$t0 rol \$1,$xi[1] add $t2,$e rol \$30,$b add $t0,$f rol \$1,$xi mov $xi,`4*($j%16)`(%rsp) mov $xi[1],`4*($j%16)`(%rsp) add $t0,$e ___ unshift(@xi,pop(@xi)); } sub BODY_20_39 { my ($i,$a,$b,$c,$d,$e,$f)=@_; my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; my $K=($i<40)?0x6ed9eba1:0xca62c1d6; $code.=<<___ if ($i<79); lea $K($xi,$e),$f mov `4*($j%16)`(%rsp),$xi mov `4*($j%16)`(%rsp),$xi[1] mov $c,$t0 mov $a,$e xor `4*(($j+2)%16)`(%rsp),$xi mov $a,$t2 xor `4*(($j+2)%16)`(%rsp),$xi[1] xor $b,$t0 rol \$5,$e xor `4*(($j+8)%16)`(%rsp),$xi rol \$5,$t2 lea $K($xi[0],$e),$e xor `4*(($j+8)%16)`(%rsp),$xi[1] xor $d,$t0 add $e,$f xor `4*(($j+13)%16)`(%rsp),$xi add $t2,$e xor `4*(($j+13)%16)`(%rsp),$xi[1] rol \$30,$b add $t0,$f rol \$1,$xi add $t0,$e rol \$1,$xi[1] ___ $code.=<<___ if ($i<76); mov $xi,`4*($j%16)`(%rsp) mov $xi[1],`4*($j%16)`(%rsp) ___ $code.=<<___ if ($i==79); lea $K($xi,$e),$f mov $c,$t0 mov $a,$e mov $a,$t2 xor $b,$t0 rol \$5,$e lea $K($xi[0],$e),$e rol \$5,$t2 xor $d,$t0 add $e,$f add $t2,$e rol \$30,$b add $t0,$f add $t0,$e ___ unshift(@xi,pop(@xi)); } sub BODY_40_59 { my ($i,$a,$b,$c,$d,$e,$f)=@_; my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; $code.=<<___; lea 0x8f1bbcdc($xi,$e),$f mov `4*($j%16)`(%rsp),$xi mov $b,$t0 mov $b,$t1 xor `4*(($j+2)%16)`(%rsp),$xi mov $a,$e and $c,$t0 xor `4*(($j+8)%16)`(%rsp),$xi or $c,$t1 rol \$5,$e xor `4*(($j+13)%16)`(%rsp),$xi and $d,$t1 add $e,$f rol \$1,$xi or $t1,$t0 mov `4*($j%16)`(%rsp),$xi[1] mov $c,$t0 mov $c,$t1 xor `4*(($j+2)%16)`(%rsp),$xi[1] and $d,$t0 mov $a,$t2 xor `4*(($j+8)%16)`(%rsp),$xi[1] xor $d,$t1 lea 0x8f1bbcdc($xi[0],$e),$e rol \$5,$t2 xor `4*(($j+13)%16)`(%rsp),$xi[1] add $t0,$e and $b,$t1 rol \$1,$xi[1] add $t1,$e rol \$30,$b mov $xi,`4*($j%16)`(%rsp) add $t0,$f mov $xi[1],`4*($j%16)`(%rsp) add $t2,$e ___ unshift(@xi,pop(@xi)); } $code=".text\n"; $code.=<<___; .text &PROLOGUE("sha1_block_data_order"); $code.=".align 4\n.Lloop:\n"; .globl sha1_block_data_order .type sha1_block_data_order,\@function,3 .align 16 sha1_block_data_order: push %rbx push %rbp push %r12 push %r13 mov %rsp,%r11 mov %rdi,$ctx # reassigned argument sub \$`8+16*4`,%rsp mov %rsi,$inp # reassigned argument and \$-64,%rsp mov %rdx,$num # reassigned argument mov %r11,`16*4`(%rsp) .Lprologue: mov 0($ctx),$A mov 4($ctx),$B mov 8($ctx),$C mov 12($ctx),$D mov 16($ctx),$E .align 4 .Lloop: ___ for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } $code.=<<___; add 0($ctx),$E add 4($ctx),$T add 8($ctx),$A add 12($ctx),$B add 16($ctx),$C mov $E,0($ctx) mov $T,4($ctx) mov $A,8($ctx) mov $B,12($ctx) mov $C,16($ctx) xchg $E,$A # mov $E,$A xchg $T,$B # mov $T,$B xchg $E,$C # mov $A,$C xchg $T,$D # mov $B,$D # mov $C,$E lea `16*4`($inp),$inp add 0($ctx),$A add 4($ctx),$B add 8($ctx),$C add 12($ctx),$D add 16($ctx),$E mov $A,0($ctx) mov $B,4($ctx) mov $C,8($ctx) mov $D,12($ctx) mov $E,16($ctx) sub \$1,$num lea `16*4`($inp),$inp jnz .Lloop ___ &EPILOGUE("sha1_block_data_order"); $code.=<<___; mov `16*4`(%rsp),%rsi mov (%rsi),%r13 mov 8(%rsi),%r12 mov 16(%rsi),%rbp mov 24(%rsi),%rbx lea 32(%rsi),%rsp .Lepilogue: ret .size sha1_block_data_order,.-sha1_block_data_order .asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" .align 16 ___ Loading Loading @@ -281,14 +278,16 @@ se_handler: jae .Lin_prologue mov `16*4`(%rax),%rax # pull saved stack pointer lea 24(%rax),%rax lea 32(%rax),%rax mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 .Lin_prologue: mov 8(%rax),%rdi Loading