Loading crypto/aes/asm/aesni-sha1-x86_64.pl +606 −82 File changed.Preview size limit exceeded, changes collapsed. Show changes crypto/sha/asm/sha1-586.pl +105 −25 Original line number Diff line number Diff line Loading @@ -93,8 +93,9 @@ # Westmere 7.3 5.5/+33% - # Sandy Bridge 8.8 6.2/+40% 5.1(**)/+73% # Ivy Bridge 7.2 4.8/+51% 4.7(**)/+53% # Haswell 6.5 4.3/+51% 4.1(**)/+58% # Bulldozer 11.6 6.0/+92% # VIA Nano 10.6 7.4/+43% # VIA Nano 10.6 7.5/+41% # # (*) Loop is 1056 instructions long and expected result is ~8.25. # It remains mystery [to me] why ILP is limited to 1.7. Loading Loading @@ -512,7 +513,7 @@ my $_ror=sub { &ror(@_) }; &mov (@T[1],$C); &psubd (@X[-2&7],@X[3]); &xor (@T[1],$D); &movdqa (@X[0],@X[-3&7]); &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]); &and (@T[0],@T[1]); &jmp (&label("loop")); Loading @@ -539,76 +540,77 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); # ror eval(shift(@insns)); eval(shift(@insns)); &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8); &movdqa (@X[2],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[3],@X[-1&7]); &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); &psrldq (@X[2],4); # "X[-3]", 3 dwords eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror &pxor (@X[2],@X[-2&7]); # "X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); &movdqa (@X[4],@X[0]); &movdqa (@X[2],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror &movdqa (@X[2],@X[0]); eval(shift(@insns)); &pslldq (@X[4],12); # "X[0]"<<96, extract one dword &paddd (@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &psrld (@X[2],31); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol &movdqa (@X[3],@X[4]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &psrld (@X[4],30); &por (@X[0],@X[2]); # "X[0]"<<<=1 eval(shift(@insns)); eval(shift(@insns)); # ror &por (@X[0],@X[2]); # "X[0]"<<<=1 eval(shift(@insns)); &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer eval(shift(@insns)); eval(shift(@insns)); &pslld (@X[3],2); &pxor (@X[0],@X[4]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol &pxor (@X[0],@X[4]); &movdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@X[3]); # "X[0]"^=("X[0]"<<96)<<<2 &movdqa (@X[1],@X[-2&7]) if ($Xi<7); &pshufd (@X[1],@X[-3&7],0xee) if ($Xi<7); # was &movdqa (@X[1],@X[-2&7]) &pshufd (@X[3],@X[-1&7],0xee) if ($Xi==7); eval(shift(@insns)); eval(shift(@insns)); Loading @@ -623,10 +625,9 @@ sub Xupdate_ssse3_32_79() my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions my ($a,$b,$c,$d,$e); &movdqa (@X[2],@X[-1&7]) if ($Xi==8); eval(shift(@insns)); # body_20_39 &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" &palignr(@X[2],@X[-2&7],8); # compose "X[-6]" &punpcklqdq(@X[2],@X[-1&7]); # compose "X[-6]", was &palignr(@X[2],@X[-2&7],8) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol Loading @@ -635,13 +636,14 @@ sub Xupdate_ssse3_32_79() &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)) if (@insns[0] =~ /_rol/); if ($Xi%5) { &movdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX... } else { # ... or load next one &movdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp")); } &paddd (@X[3],@X[-1&7]); eval(shift(@insns)); # ror &paddd (@X[3],@X[-1&7]); eval(shift(@insns)); &pxor (@X[0],@X[2]); # "X[0]"^="X[-6]" Loading @@ -656,6 +658,7 @@ sub Xupdate_ssse3_32_79() eval(shift(@insns)); eval(shift(@insns)); # ror eval(shift(@insns)); eval(shift(@insns)) if (@insns[0] =~ /_rol/); &pslld (@X[0],2); eval(shift(@insns)); # body_20_39 Loading @@ -667,6 +670,8 @@ sub Xupdate_ssse3_32_79() eval(shift(@insns)); eval(shift(@insns)); # ror eval(shift(@insns)); eval(shift(@insns)) if (@insns[1] =~ /_rol/); eval(shift(@insns)) if (@insns[0] =~ /_rol/); &por (@X[0],@X[2]); # "X[0]"<<<=2 eval(shift(@insns)); # body_20_39 Loading @@ -677,7 +682,7 @@ sub Xupdate_ssse3_32_79() eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror &movdqa (@X[3],@X[0]) if ($Xi<19); &pshufd (@X[3],@X[-1],0xee) if ($Xi<19); # was &movdqa (@X[3],@X[0]) eval(shift(@insns)); foreach (@insns) { eval; } # remaining instructions Loading @@ -691,6 +696,12 @@ sub Xuplast_ssse3_80() my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[3],@X[-1&7]); eval(shift(@insns)); Loading Loading @@ -728,9 +739,16 @@ sub Xloop_ssse3() eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pshufb (@X[($Xi-3)&7],@X[2]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[($Xi-4)&7],@X[3]); eval(shift(@insns)); eval(shift(@insns)); Loading @@ -739,6 +757,8 @@ sub Xloop_ssse3() &movdqa (&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &psubd (@X[($Xi-4)&7],@X[3]); foreach (@insns) { eval; } Loading Loading @@ -816,6 +836,64 @@ sub body_40_59 () { # ((b^c)&(c^d))^c '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } ###### sub bodyx_00_19 () { # ((c^d)&b)^d # on start @T[0]=(b&c)^(~b&d), $e+=X[]+K return &bodyx_20_39() if ($rx==19); $rx++; ( '($a,$b,$c,$d,$e)=@V;'. '&rorx ($b,$b,2) if ($j==0);'. # $b>>>2 '&rorx ($b,@T[1],7) if ($j!=0);', # $b>>>2 '&lea ($e,&DWP(0,$e,@T[0]));', '&rorx (@T[0],$a,5);', '&andn (@T[1],$a,$c);', '&and ($a,$b)', '&add ($d,&DWP(4*(($j+1)&15),"esp"));', # X[]+K xfer '&xor (@T[1],$a)', '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } sub bodyx_20_39 () { # b^d^c # on start $b=b^c^d return &bodyx_40_59() if ($rx==39); $rx++; ( '($a,$b,$c,$d,$e)=@V;'. '&add ($e,($j==19?@T[0]:$b))', '&rorx ($b,@T[1],7);', # $b>>>2 '&rorx (@T[0],$a,5);', '&xor ($a,$b) if ($j<79);', '&add ($d,&DWP(4*(($j+1)&15),"esp")) if ($j<79);', # X[]+K xfer '&xor ($a,$c) if ($j<79);', '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } sub bodyx_40_59 () { # ((b^c)&(c^d))^c # on start $b=((b^c)&(c^d))^c return &bodyx_20_39() if ($rx==59); $rx++; ( '($a,$b,$c,$d,$e)=@V;'. '&rorx (@T[0],$a,5)', '&lea ($e,&DWP(0,$e,$b))', '&rorx ($b,@T[1],7)', # $b>>>2 '&add ($d,&DWP(4*(($j+1)&15),"esp"))', # X[]+K xfer '&mov (@T[1],$c)', '&xor ($a,$b)', # b^c for next round '&xor (@T[1],$b)', # c^d for next round '&and ($a,@T[1])', '&add ($e,@T[0])', '&xor ($a,$b)' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } &set_label("loop",16); &Xupdate_ssse3_16_31(\&body_00_19); Loading Loading @@ -855,9 +933,10 @@ sub body_40_59 () { # ((b^c)&(c^d))^c &mov (&DWP(12,@T[1]),$D); &xor ($B,$D); &mov (&DWP(16,@T[1]),$E); &and ($B,@T[0]); &movdqa (@X[0],@X[-3&7]); &xchg ($B,@T[0]); &mov (@T[1],@T[0]); &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]); &and (@T[0],$B); &mov ($B,$T[1]); &jmp (&label("loop")); Loading Loading @@ -1226,9 +1305,10 @@ sub Xtail_avx() &mov (&DWP(8,@T[1]),$C); &xor ($B,$D); &mov (&DWP(12,@T[1]),$D); &and ($B,@T[0]); &mov (&DWP(16,@T[1]),$E); &xchg ($B,@T[0]); &mov (@T[1],@T[0]); &and (@T[0],$B); &mov ($B,@T[1]); &jmp (&label("loop")); Loading crypto/sha/asm/sha1-x86_64.pl +97 −85 Original line number Diff line number Diff line Loading @@ -62,16 +62,20 @@ # CPU clock cycles spent to process single byte (less is better). # # x86_64 SSSE3 AVX[2] # P4 9.8 - # Opteron 6.65 - # Core2 6.70 6.05/+11% - # Westmere 7.08 5.44/+30% - # Sandy Bridge 7.93 6.16/+28% 4.99/+59% # Ivy Bridge 6.30 4.63/+36% 4.60/+37% # Haswell 5.98 4.12/+45% 3.57/+67% # Bulldozer 10.9 5.95/+82% # VIA Nano 10.2 7.46/+37% # Atom 11.0 9.61/+14% # P4 9.05 - # Opteron 6.26 - # Core2 6.55 6.05/+8% - # Westmere 6.73 5.30/+27% - # Sandy Bridge 7.70 6.10/+26% 4.99/+54% # Ivy Bridge 6.06 4.67/+30% 4.60/+32% # Haswell 5.45 4.15/+31% 3.57/+53% # Bulldozer 9.11 5.95/+53% # VIA Nano 9.32 7.15/+30% # Atom [10.5?] [9.23?]/+14% # Silvermont 13.1(*) 9.37/+40% # # (*) obviously suboptimal result, nothing was done about it, # because SSSE3 code is compiled unconditionally; $flavour = shift; $output = shift; Loading Loading @@ -114,7 +118,7 @@ $num="%r10"; $t0="%eax"; $t1="%ebx"; $t2="%ecx"; @xi=("%edx","%ebp"); @xi=("%edx","%ebp","%r14d"); $A="%esi"; $B="%edi"; $C="%r11d"; Loading @@ -129,42 +133,40 @@ my $j=$i+1; $code.=<<___ if ($i==0); mov `4*$i`($inp),$xi[0] bswap $xi[0] mov $xi[0],`4*$i`(%rsp) ___ $code.=<<___ if ($i<15); mov $c,$t0 mov `4*$j`($inp),$xi[1] mov $d,$t0 mov $xi[0],`4*$i`(%rsp) mov $a,$t2 xor $d,$t0 bswap $xi[1] xor $c,$t0 rol \$5,$t2 lea 0x5a827999($xi[0],$e),$e and $b,$t0 mov $xi[1],`4*$j`(%rsp) lea 0x5a827999($xi[0],$e),$e add $t2,$e xor $d,$t0 rol \$30,$b add $t0,$e ___ $code.=<<___ if ($i>=15); mov `4*($j%16)`(%rsp),$xi[1] mov $c,$t0 xor `4*($j%16)`(%rsp),$xi[1] mov $d,$t0 mov $xi[0],`4*($i%16)`(%rsp) mov $a,$t2 xor `4*(($j+2)%16)`(%rsp),$xi[1] xor $d,$t0 xor $c,$t0 rol \$5,$t2 xor `4*(($j+8)%16)`(%rsp),$xi[1] and $b,$t0 lea 0x5a827999($xi[0],$e),$e xor `4*(($j+13)%16)`(%rsp),$xi[1] rol \$30,$b xor $d,$t0 rol \$1,$xi[1] add $t2,$e rol \$30,$b mov $xi[1],`4*($j%16)`(%rsp) rol \$1,$xi[1] add $t0,$e ___ unshift(@xi,pop(@xi)); push(@xi,shift(@xi)); } sub BODY_20_39 { Loading @@ -172,62 +174,58 @@ my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; my $K=($i<40)?0x6ed9eba1:0xca62c1d6; $code.=<<___ if ($i<79); mov `4*($j%16)`(%rsp),$xi[1] mov $c,$t0 xor `4*($j%16)`(%rsp),$xi[1] mov $b,$t0 `"mov $xi[0],".4*($i%16)."(%rsp)" if ($i<72)` mov $a,$t2 xor `4*(($j+2)%16)`(%rsp),$xi[1] xor $b,$t0 xor $d,$t0 rol \$5,$t2 lea $K($xi[0],$e),$e xor `4*(($j+8)%16)`(%rsp),$xi[1] xor $d,$t0 lea $K($xi[0],$e),$e xor $c,$t0 add $t2,$e xor `4*(($j+13)%16)`(%rsp),$xi[1] rol \$30,$b add $t0,$e rol \$1,$xi[1] ___ $code.=<<___ if ($i<76); mov $xi[1],`4*($j%16)`(%rsp) ___ $code.=<<___ if ($i==79); mov $c,$t0 mov $b,$t0 mov $a,$t2 xor $b,$t0 xor $d,$t0 lea $K($xi[0],$e),$e rol \$5,$t2 xor $d,$t0 xor $c,$t0 add $t2,$e rol \$30,$b add $t0,$e ___ unshift(@xi,pop(@xi)); push(@xi,shift(@xi)); } sub BODY_40_59 { my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; $code.=<<___; mov `4*($j%16)`(%rsp),$xi[1] mov $c,$t0 mov $c,$t1 xor `4*($j%16)`(%rsp),$xi[1] mov $d,$t0 mov $xi[0],`4*($i%16)`(%rsp) mov $d,$t1 xor `4*(($j+2)%16)`(%rsp),$xi[1] and $d,$t0 and $c,$t0 mov $a,$t2 xor `4*(($j+8)%16)`(%rsp),$xi[1] xor $d,$t1 lea 0x8f1bbcdc($xi[0],$e),$e xor $c,$t1 rol \$5,$t2 xor `4*(($j+13)%16)`(%rsp),$xi[1] add $t0,$e and $b,$t1 rol \$1,$xi[1] add $t1,$e rol \$30,$b mov $xi[1],`4*($j%16)`(%rsp) and $b,$t1 add $t2,$e rol \$30,$b add $t1,$e ___ unshift(@xi,pop(@xi)); push(@xi,shift(@xi)); } $code.=<<___; Loading Loading @@ -261,17 +259,18 @@ $code.=<<___; .align 16 .Lialu: mov %rsp,%rax push %rbx push %rbp push %r12 push %r13 mov %rsp,%r11 push %r14 mov %rdi,$ctx # reassigned argument sub \$`8+16*4`,%rsp mov %rsi,$inp # reassigned argument and \$-64,%rsp mov %rdx,$num # reassigned argument mov %r11,`16*4`(%rsp) mov %rax,`16*4`(%rsp) .Lprologue: mov 0($ctx),$A Loading Loading @@ -305,11 +304,12 @@ $code.=<<___; jnz .Lloop mov `16*4`(%rsp),%rsi mov (%rsi),%r13 mov 8(%rsi),%r12 mov 16(%rsi),%rbp mov 24(%rsi),%rbx lea 32(%rsi),%rsp mov -40(%rsi),%r14 mov -32(%rsi),%r13 mov -24(%rsi),%r12 mov -16(%rsi),%rbp mov -8(%rsi),%rbx lea (%rsi),%rsp .Lepilogue: ret .size sha1_block_data_order,.-sha1_block_data_order Loading Loading @@ -389,11 +389,11 @@ $code.=<<___; movdqu 32($inp),@X[-2&7] movdqu 48($inp),@X[-1&7] pshufb @X[2],@X[-4&7] # byte swap add \$64,$inp pshufb @X[2],@X[-3&7] pshufb @X[2],@X[-2&7] pshufb @X[2],@X[-1&7] add \$64,$inp paddd @Tx[1],@X[-4&7] # add K_00_19 pshufb @X[2],@X[-1&7] paddd @Tx[1],@X[-3&7] paddd @Tx[1],@X[-2&7] movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU Loading @@ -418,74 +418,75 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions my ($a,$b,$c,$d,$e); &movdqa (@X[0],@X[-3&7]); eval(shift(@insns)); eval(shift(@insns)); # ror &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]); eval(shift(@insns)); &movdqa (@Tx[0],@X[-1&7]); &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); &paddd (@Tx[1],@X[-1&7]); &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); &psrldq (@Tx[0],4); # "X[-3]", 3 dwords eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); &movdqa (@Tx[2],@X[0]); &movdqa (@Tx[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror &movdqa (@Tx[0],@X[0]); eval(shift(@insns)); &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword &paddd (@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &psrld (@Tx[0],31); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); &movdqa (@Tx[1],@Tx[2]); eval(shift(@insns)); eval(shift(@insns)); &psrld (@Tx[2],30); &por (@X[0],@Tx[0]); # "X[0]"<<<=1 eval(shift(@insns)); eval(shift(@insns)); # ror &por (@X[0],@Tx[0]); # "X[0]"<<<=1 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pslld (@Tx[1],2); &pxor (@X[0],@Tx[2]); eval(shift(@insns)); eval(shift(@insns)); &movdqa (@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)"); # K_XX_XX eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 &pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79 foreach (@insns) { eval; } # remaining instructions [if any] Loading @@ -499,24 +500,27 @@ sub Xupdate_ssse3_32_79() my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions my ($a,$b,$c,$d,$e); &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8); eval(shift(@insns)); # body_20_39 eval(shift(@insns)) if ($Xi==8); &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]" eval(shift(@insns)) if ($Xi==8); eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)) if (@insns[1] =~ /_ror/); eval(shift(@insns)) if (@insns[0] =~ /_ror/); &punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8); eval(shift(@insns)); eval(shift(@insns)); # rol &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" eval(shift(@insns)); eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); eval(shift(@insns)); if ($Xi%5) { &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... } else { # ... or load next one &movdqa (@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)"); } &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); # ror &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]" Loading @@ -524,29 +528,31 @@ sub Xupdate_ssse3_32_79() eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)) if (@insns[0] =~ /_ror/); &movdqa (@Tx[0],@X[0]); &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); # ror eval(shift(@insns)); eval(shift(@insns)); # body_20_39 &pslld (@X[0],2); eval(shift(@insns)); # body_20_39 eval(shift(@insns)); &psrld (@Tx[0],30); eval(shift(@insns)); eval(shift(@insns)); # rol &psrld (@Tx[0],30); eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror eval(shift(@insns)); &por (@X[0],@Tx[0]); # "X[0]"<<<=2 eval(shift(@insns)); # body_20_39 eval(shift(@insns)); &movdqa (@Tx[1],@X[0]) if ($Xi<19); eval(shift(@insns)); # body_20_39 eval(shift(@insns)) if (@insns[1] =~ /_rol/); eval(shift(@insns)) if (@insns[0] =~ /_rol/); &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0]) eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); Loading @@ -567,9 +573,10 @@ sub Xuplast_ssse3_80() my ($a,$b,$c,$d,$e); eval(shift(@insns)); &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); Loading Loading @@ -602,10 +609,12 @@ sub Xloop_ssse3() eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pshufb (@X[($Xi-3)&7],@X[2]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[($Xi-4)&7],@Tx[1]); eval(shift(@insns)); eval(shift(@insns)); Loading @@ -614,6 +623,8 @@ sub Xloop_ssse3() &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &psubd (@X[($Xi-4)&7],@Tx[1]); foreach (@insns) { eval; } Loading Loading @@ -1680,16 +1691,17 @@ se_handler: jae .Lcommon_seh_tail mov `16*4`(%rax),%rax # pull saved stack pointer lea 32(%rax),%rax mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 jmp .Lcommon_seh_tail .size se_handler,.-se_handler Loading Loading
crypto/aes/asm/aesni-sha1-x86_64.pl +606 −82 File changed.Preview size limit exceeded, changes collapsed. Show changes
crypto/sha/asm/sha1-586.pl +105 −25 Original line number Diff line number Diff line Loading @@ -93,8 +93,9 @@ # Westmere 7.3 5.5/+33% - # Sandy Bridge 8.8 6.2/+40% 5.1(**)/+73% # Ivy Bridge 7.2 4.8/+51% 4.7(**)/+53% # Haswell 6.5 4.3/+51% 4.1(**)/+58% # Bulldozer 11.6 6.0/+92% # VIA Nano 10.6 7.4/+43% # VIA Nano 10.6 7.5/+41% # # (*) Loop is 1056 instructions long and expected result is ~8.25. # It remains mystery [to me] why ILP is limited to 1.7. Loading Loading @@ -512,7 +513,7 @@ my $_ror=sub { &ror(@_) }; &mov (@T[1],$C); &psubd (@X[-2&7],@X[3]); &xor (@T[1],$D); &movdqa (@X[0],@X[-3&7]); &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]); &and (@T[0],@T[1]); &jmp (&label("loop")); Loading @@ -539,76 +540,77 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); # ror eval(shift(@insns)); eval(shift(@insns)); &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8); &movdqa (@X[2],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[3],@X[-1&7]); &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); &psrldq (@X[2],4); # "X[-3]", 3 dwords eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror &pxor (@X[2],@X[-2&7]); # "X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); &movdqa (@X[4],@X[0]); &movdqa (@X[2],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror &movdqa (@X[2],@X[0]); eval(shift(@insns)); &pslldq (@X[4],12); # "X[0]"<<96, extract one dword &paddd (@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &psrld (@X[2],31); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol &movdqa (@X[3],@X[4]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &psrld (@X[4],30); &por (@X[0],@X[2]); # "X[0]"<<<=1 eval(shift(@insns)); eval(shift(@insns)); # ror &por (@X[0],@X[2]); # "X[0]"<<<=1 eval(shift(@insns)); &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer eval(shift(@insns)); eval(shift(@insns)); &pslld (@X[3],2); &pxor (@X[0],@X[4]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol &pxor (@X[0],@X[4]); &movdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@X[3]); # "X[0]"^=("X[0]"<<96)<<<2 &movdqa (@X[1],@X[-2&7]) if ($Xi<7); &pshufd (@X[1],@X[-3&7],0xee) if ($Xi<7); # was &movdqa (@X[1],@X[-2&7]) &pshufd (@X[3],@X[-1&7],0xee) if ($Xi==7); eval(shift(@insns)); eval(shift(@insns)); Loading @@ -623,10 +625,9 @@ sub Xupdate_ssse3_32_79() my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions my ($a,$b,$c,$d,$e); &movdqa (@X[2],@X[-1&7]) if ($Xi==8); eval(shift(@insns)); # body_20_39 &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" &palignr(@X[2],@X[-2&7],8); # compose "X[-6]" &punpcklqdq(@X[2],@X[-1&7]); # compose "X[-6]", was &palignr(@X[2],@X[-2&7],8) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol Loading @@ -635,13 +636,14 @@ sub Xupdate_ssse3_32_79() &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)) if (@insns[0] =~ /_rol/); if ($Xi%5) { &movdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX... } else { # ... or load next one &movdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp")); } &paddd (@X[3],@X[-1&7]); eval(shift(@insns)); # ror &paddd (@X[3],@X[-1&7]); eval(shift(@insns)); &pxor (@X[0],@X[2]); # "X[0]"^="X[-6]" Loading @@ -656,6 +658,7 @@ sub Xupdate_ssse3_32_79() eval(shift(@insns)); eval(shift(@insns)); # ror eval(shift(@insns)); eval(shift(@insns)) if (@insns[0] =~ /_rol/); &pslld (@X[0],2); eval(shift(@insns)); # body_20_39 Loading @@ -667,6 +670,8 @@ sub Xupdate_ssse3_32_79() eval(shift(@insns)); eval(shift(@insns)); # ror eval(shift(@insns)); eval(shift(@insns)) if (@insns[1] =~ /_rol/); eval(shift(@insns)) if (@insns[0] =~ /_rol/); &por (@X[0],@X[2]); # "X[0]"<<<=2 eval(shift(@insns)); # body_20_39 Loading @@ -677,7 +682,7 @@ sub Xupdate_ssse3_32_79() eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror &movdqa (@X[3],@X[0]) if ($Xi<19); &pshufd (@X[3],@X[-1],0xee) if ($Xi<19); # was &movdqa (@X[3],@X[0]) eval(shift(@insns)); foreach (@insns) { eval; } # remaining instructions Loading @@ -691,6 +696,12 @@ sub Xuplast_ssse3_80() my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[3],@X[-1&7]); eval(shift(@insns)); Loading Loading @@ -728,9 +739,16 @@ sub Xloop_ssse3() eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pshufb (@X[($Xi-3)&7],@X[2]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[($Xi-4)&7],@X[3]); eval(shift(@insns)); eval(shift(@insns)); Loading @@ -739,6 +757,8 @@ sub Xloop_ssse3() &movdqa (&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &psubd (@X[($Xi-4)&7],@X[3]); foreach (@insns) { eval; } Loading Loading @@ -816,6 +836,64 @@ sub body_40_59 () { # ((b^c)&(c^d))^c '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } ###### sub bodyx_00_19 () { # ((c^d)&b)^d # on start @T[0]=(b&c)^(~b&d), $e+=X[]+K return &bodyx_20_39() if ($rx==19); $rx++; ( '($a,$b,$c,$d,$e)=@V;'. '&rorx ($b,$b,2) if ($j==0);'. # $b>>>2 '&rorx ($b,@T[1],7) if ($j!=0);', # $b>>>2 '&lea ($e,&DWP(0,$e,@T[0]));', '&rorx (@T[0],$a,5);', '&andn (@T[1],$a,$c);', '&and ($a,$b)', '&add ($d,&DWP(4*(($j+1)&15),"esp"));', # X[]+K xfer '&xor (@T[1],$a)', '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } sub bodyx_20_39 () { # b^d^c # on start $b=b^c^d return &bodyx_40_59() if ($rx==39); $rx++; ( '($a,$b,$c,$d,$e)=@V;'. '&add ($e,($j==19?@T[0]:$b))', '&rorx ($b,@T[1],7);', # $b>>>2 '&rorx (@T[0],$a,5);', '&xor ($a,$b) if ($j<79);', '&add ($d,&DWP(4*(($j+1)&15),"esp")) if ($j<79);', # X[]+K xfer '&xor ($a,$c) if ($j<79);', '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } sub bodyx_40_59 () { # ((b^c)&(c^d))^c # on start $b=((b^c)&(c^d))^c return &bodyx_20_39() if ($rx==59); $rx++; ( '($a,$b,$c,$d,$e)=@V;'. '&rorx (@T[0],$a,5)', '&lea ($e,&DWP(0,$e,$b))', '&rorx ($b,@T[1],7)', # $b>>>2 '&add ($d,&DWP(4*(($j+1)&15),"esp"))', # X[]+K xfer '&mov (@T[1],$c)', '&xor ($a,$b)', # b^c for next round '&xor (@T[1],$b)', # c^d for next round '&and ($a,@T[1])', '&add ($e,@T[0])', '&xor ($a,$b)' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } &set_label("loop",16); &Xupdate_ssse3_16_31(\&body_00_19); Loading Loading @@ -855,9 +933,10 @@ sub body_40_59 () { # ((b^c)&(c^d))^c &mov (&DWP(12,@T[1]),$D); &xor ($B,$D); &mov (&DWP(16,@T[1]),$E); &and ($B,@T[0]); &movdqa (@X[0],@X[-3&7]); &xchg ($B,@T[0]); &mov (@T[1],@T[0]); &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]); &and (@T[0],$B); &mov ($B,$T[1]); &jmp (&label("loop")); Loading Loading @@ -1226,9 +1305,10 @@ sub Xtail_avx() &mov (&DWP(8,@T[1]),$C); &xor ($B,$D); &mov (&DWP(12,@T[1]),$D); &and ($B,@T[0]); &mov (&DWP(16,@T[1]),$E); &xchg ($B,@T[0]); &mov (@T[1],@T[0]); &and (@T[0],$B); &mov ($B,@T[1]); &jmp (&label("loop")); Loading
crypto/sha/asm/sha1-x86_64.pl +97 −85 Original line number Diff line number Diff line Loading @@ -62,16 +62,20 @@ # CPU clock cycles spent to process single byte (less is better). # # x86_64 SSSE3 AVX[2] # P4 9.8 - # Opteron 6.65 - # Core2 6.70 6.05/+11% - # Westmere 7.08 5.44/+30% - # Sandy Bridge 7.93 6.16/+28% 4.99/+59% # Ivy Bridge 6.30 4.63/+36% 4.60/+37% # Haswell 5.98 4.12/+45% 3.57/+67% # Bulldozer 10.9 5.95/+82% # VIA Nano 10.2 7.46/+37% # Atom 11.0 9.61/+14% # P4 9.05 - # Opteron 6.26 - # Core2 6.55 6.05/+8% - # Westmere 6.73 5.30/+27% - # Sandy Bridge 7.70 6.10/+26% 4.99/+54% # Ivy Bridge 6.06 4.67/+30% 4.60/+32% # Haswell 5.45 4.15/+31% 3.57/+53% # Bulldozer 9.11 5.95/+53% # VIA Nano 9.32 7.15/+30% # Atom [10.5?] [9.23?]/+14% # Silvermont 13.1(*) 9.37/+40% # # (*) obviously suboptimal result, nothing was done about it, # because SSSE3 code is compiled unconditionally; $flavour = shift; $output = shift; Loading Loading @@ -114,7 +118,7 @@ $num="%r10"; $t0="%eax"; $t1="%ebx"; $t2="%ecx"; @xi=("%edx","%ebp"); @xi=("%edx","%ebp","%r14d"); $A="%esi"; $B="%edi"; $C="%r11d"; Loading @@ -129,42 +133,40 @@ my $j=$i+1; $code.=<<___ if ($i==0); mov `4*$i`($inp),$xi[0] bswap $xi[0] mov $xi[0],`4*$i`(%rsp) ___ $code.=<<___ if ($i<15); mov $c,$t0 mov `4*$j`($inp),$xi[1] mov $d,$t0 mov $xi[0],`4*$i`(%rsp) mov $a,$t2 xor $d,$t0 bswap $xi[1] xor $c,$t0 rol \$5,$t2 lea 0x5a827999($xi[0],$e),$e and $b,$t0 mov $xi[1],`4*$j`(%rsp) lea 0x5a827999($xi[0],$e),$e add $t2,$e xor $d,$t0 rol \$30,$b add $t0,$e ___ $code.=<<___ if ($i>=15); mov `4*($j%16)`(%rsp),$xi[1] mov $c,$t0 xor `4*($j%16)`(%rsp),$xi[1] mov $d,$t0 mov $xi[0],`4*($i%16)`(%rsp) mov $a,$t2 xor `4*(($j+2)%16)`(%rsp),$xi[1] xor $d,$t0 xor $c,$t0 rol \$5,$t2 xor `4*(($j+8)%16)`(%rsp),$xi[1] and $b,$t0 lea 0x5a827999($xi[0],$e),$e xor `4*(($j+13)%16)`(%rsp),$xi[1] rol \$30,$b xor $d,$t0 rol \$1,$xi[1] add $t2,$e rol \$30,$b mov $xi[1],`4*($j%16)`(%rsp) rol \$1,$xi[1] add $t0,$e ___ unshift(@xi,pop(@xi)); push(@xi,shift(@xi)); } sub BODY_20_39 { Loading @@ -172,62 +174,58 @@ my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; my $K=($i<40)?0x6ed9eba1:0xca62c1d6; $code.=<<___ if ($i<79); mov `4*($j%16)`(%rsp),$xi[1] mov $c,$t0 xor `4*($j%16)`(%rsp),$xi[1] mov $b,$t0 `"mov $xi[0],".4*($i%16)."(%rsp)" if ($i<72)` mov $a,$t2 xor `4*(($j+2)%16)`(%rsp),$xi[1] xor $b,$t0 xor $d,$t0 rol \$5,$t2 lea $K($xi[0],$e),$e xor `4*(($j+8)%16)`(%rsp),$xi[1] xor $d,$t0 lea $K($xi[0],$e),$e xor $c,$t0 add $t2,$e xor `4*(($j+13)%16)`(%rsp),$xi[1] rol \$30,$b add $t0,$e rol \$1,$xi[1] ___ $code.=<<___ if ($i<76); mov $xi[1],`4*($j%16)`(%rsp) ___ $code.=<<___ if ($i==79); mov $c,$t0 mov $b,$t0 mov $a,$t2 xor $b,$t0 xor $d,$t0 lea $K($xi[0],$e),$e rol \$5,$t2 xor $d,$t0 xor $c,$t0 add $t2,$e rol \$30,$b add $t0,$e ___ unshift(@xi,pop(@xi)); push(@xi,shift(@xi)); } sub BODY_40_59 { my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; $code.=<<___; mov `4*($j%16)`(%rsp),$xi[1] mov $c,$t0 mov $c,$t1 xor `4*($j%16)`(%rsp),$xi[1] mov $d,$t0 mov $xi[0],`4*($i%16)`(%rsp) mov $d,$t1 xor `4*(($j+2)%16)`(%rsp),$xi[1] and $d,$t0 and $c,$t0 mov $a,$t2 xor `4*(($j+8)%16)`(%rsp),$xi[1] xor $d,$t1 lea 0x8f1bbcdc($xi[0],$e),$e xor $c,$t1 rol \$5,$t2 xor `4*(($j+13)%16)`(%rsp),$xi[1] add $t0,$e and $b,$t1 rol \$1,$xi[1] add $t1,$e rol \$30,$b mov $xi[1],`4*($j%16)`(%rsp) and $b,$t1 add $t2,$e rol \$30,$b add $t1,$e ___ unshift(@xi,pop(@xi)); push(@xi,shift(@xi)); } $code.=<<___; Loading Loading @@ -261,17 +259,18 @@ $code.=<<___; .align 16 .Lialu: mov %rsp,%rax push %rbx push %rbp push %r12 push %r13 mov %rsp,%r11 push %r14 mov %rdi,$ctx # reassigned argument sub \$`8+16*4`,%rsp mov %rsi,$inp # reassigned argument and \$-64,%rsp mov %rdx,$num # reassigned argument mov %r11,`16*4`(%rsp) mov %rax,`16*4`(%rsp) .Lprologue: mov 0($ctx),$A Loading Loading @@ -305,11 +304,12 @@ $code.=<<___; jnz .Lloop mov `16*4`(%rsp),%rsi mov (%rsi),%r13 mov 8(%rsi),%r12 mov 16(%rsi),%rbp mov 24(%rsi),%rbx lea 32(%rsi),%rsp mov -40(%rsi),%r14 mov -32(%rsi),%r13 mov -24(%rsi),%r12 mov -16(%rsi),%rbp mov -8(%rsi),%rbx lea (%rsi),%rsp .Lepilogue: ret .size sha1_block_data_order,.-sha1_block_data_order Loading Loading @@ -389,11 +389,11 @@ $code.=<<___; movdqu 32($inp),@X[-2&7] movdqu 48($inp),@X[-1&7] pshufb @X[2],@X[-4&7] # byte swap add \$64,$inp pshufb @X[2],@X[-3&7] pshufb @X[2],@X[-2&7] pshufb @X[2],@X[-1&7] add \$64,$inp paddd @Tx[1],@X[-4&7] # add K_00_19 pshufb @X[2],@X[-1&7] paddd @Tx[1],@X[-3&7] paddd @Tx[1],@X[-2&7] movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU Loading @@ -418,74 +418,75 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions my ($a,$b,$c,$d,$e); &movdqa (@X[0],@X[-3&7]); eval(shift(@insns)); eval(shift(@insns)); # ror &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]); eval(shift(@insns)); &movdqa (@Tx[0],@X[-1&7]); &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); &paddd (@Tx[1],@X[-1&7]); &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); &psrldq (@Tx[0],4); # "X[-3]", 3 dwords eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); &movdqa (@Tx[2],@X[0]); &movdqa (@Tx[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror &movdqa (@Tx[0],@X[0]); eval(shift(@insns)); &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword &paddd (@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &psrld (@Tx[0],31); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); &movdqa (@Tx[1],@Tx[2]); eval(shift(@insns)); eval(shift(@insns)); &psrld (@Tx[2],30); &por (@X[0],@Tx[0]); # "X[0]"<<<=1 eval(shift(@insns)); eval(shift(@insns)); # ror &por (@X[0],@Tx[0]); # "X[0]"<<<=1 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pslld (@Tx[1],2); &pxor (@X[0],@Tx[2]); eval(shift(@insns)); eval(shift(@insns)); &movdqa (@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)"); # K_XX_XX eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 &pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79 foreach (@insns) { eval; } # remaining instructions [if any] Loading @@ -499,24 +500,27 @@ sub Xupdate_ssse3_32_79() my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions my ($a,$b,$c,$d,$e); &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8); eval(shift(@insns)); # body_20_39 eval(shift(@insns)) if ($Xi==8); &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]" eval(shift(@insns)) if ($Xi==8); eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)) if (@insns[1] =~ /_ror/); eval(shift(@insns)) if (@insns[0] =~ /_ror/); &punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8); eval(shift(@insns)); eval(shift(@insns)); # rol &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" eval(shift(@insns)); eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); eval(shift(@insns)); if ($Xi%5) { &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... } else { # ... or load next one &movdqa (@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)"); } &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); # ror &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]" Loading @@ -524,29 +528,31 @@ sub Xupdate_ssse3_32_79() eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)) if (@insns[0] =~ /_ror/); &movdqa (@Tx[0],@X[0]); &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); # ror eval(shift(@insns)); eval(shift(@insns)); # body_20_39 &pslld (@X[0],2); eval(shift(@insns)); # body_20_39 eval(shift(@insns)); &psrld (@Tx[0],30); eval(shift(@insns)); eval(shift(@insns)); # rol &psrld (@Tx[0],30); eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror eval(shift(@insns)); &por (@X[0],@Tx[0]); # "X[0]"<<<=2 eval(shift(@insns)); # body_20_39 eval(shift(@insns)); &movdqa (@Tx[1],@X[0]) if ($Xi<19); eval(shift(@insns)); # body_20_39 eval(shift(@insns)) if (@insns[1] =~ /_rol/); eval(shift(@insns)) if (@insns[0] =~ /_rol/); &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0]) eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); Loading @@ -567,9 +573,10 @@ sub Xuplast_ssse3_80() my ($a,$b,$c,$d,$e); eval(shift(@insns)); &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); Loading Loading @@ -602,10 +609,12 @@ sub Xloop_ssse3() eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pshufb (@X[($Xi-3)&7],@X[2]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[($Xi-4)&7],@Tx[1]); eval(shift(@insns)); eval(shift(@insns)); Loading @@ -614,6 +623,8 @@ sub Xloop_ssse3() &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &psubd (@X[($Xi-4)&7],@Tx[1]); foreach (@insns) { eval; } Loading Loading @@ -1680,16 +1691,17 @@ se_handler: jae .Lcommon_seh_tail mov `16*4`(%rax),%rax # pull saved stack pointer lea 32(%rax),%rax mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 jmp .Lcommon_seh_tail .size se_handler,.-se_handler Loading