Loading crypto/aes/asm/aesni-mb-x86_64.pl +44 −8 Original line number Diff line number Diff line Loading @@ -15,8 +15,8 @@ # asymptotic measured # --------------------------- # Westmere 5.00/4=1.25 5.13/4=1.28 # Atom 15.0/4=3.75 15.7/4=3.93 # Sandy Bridge 5.06/4=1.27 5.15/4=1.29 # Atom 15.0/4=3.75 ?15.7/4=3.93 # Sandy Bridge 5.06/4=1.27 5.18/4=1.29 # Ivy Bridge 5.06/4=1.27 5.14/4=1.29 # Haswell 4.44/4=1.11 4.44/4=1.11 # Bulldozer 5.75/4=1.44 5.76/4=1.44 Loading @@ -27,8 +27,8 @@ # # asymptotic measured # --------------------------- # Sandy Bridge 5.06/8=0.64 7.05/8=0.88(*) # Ivy Bridge 5.06/8=0.64 7.02/8=0.88(*) # Sandy Bridge 5.06/8=0.64 7.10/8=0.89(*) # Ivy Bridge 5.06/8=0.64 7.14/8=0.89(*) # Haswell 5.00/8=0.63 5.00/8=0.63 # Bulldozer 5.75/8=0.72 5.77/8=0.72 # Loading Loading @@ -188,7 +188,11 @@ $code.=<<___; sub $offset,$sink aesenc $rndkey1,@out[0] prefetcht0 31(@inptr[0],$offset) # prefetch input prefetcht0 31(@inptr[1],$offset) aesenc $rndkey1,@out[1] prefetcht0 31(@inptr[2],$offset) prefetcht0 31(@inptr[2],$offset) aesenc $rndkey1,@out[2] aesenc $rndkey1,@out[3] movups 0x30-0x78($key),$rndkey1 Loading @@ -199,8 +203,8 @@ $code.=<<___; cmp `32+4*$i`(%rsp),$one aesenc $rndkey,@out[0] aesenc $rndkey,@out[1] cmovge $sink,@inptr[$i] # cancel input aesenc $rndkey,@out[2] cmovge $sink,@inptr[$i] # cancel input cmovg $sink,@outptr[$i] # sink output aesenc $rndkey,@out[3] movups `0x40+16*$i-0x78`($key),$rndkey Loading @@ -209,7 +213,11 @@ ___ $code.=<<___; movdqa $counters,$mask aesenc $rndkey0,@out[0] prefetcht0 15(@outptr[0],$offset) # prefetch output prefetcht0 15(@outptr[1],$offset) aesenc $rndkey0,@out[1] prefetcht0 15(@outptr[2],$offset) prefetcht0 15(@outptr[3],$offset) aesenc $rndkey0,@out[2] aesenc $rndkey0,@out[3] movups 0x80-0x78($key),$rndkey0 Loading Loading @@ -260,13 +268,15 @@ $code.=<<___; aesenc $rndkey0,@out[2] aesenc $rndkey0,@out[3] movups 0xe0-0x78($key),$rndkey0 jmp .Lenc4x_tail .align 32 .Lenc4x_tail: aesenc $rndkey1,@out[0] aesenc 
$rndkey1,@out[1] aesenc $rndkey1,@out[2] movdqu (@inptr[0],$offset),@inp[0] aesenc $rndkey1,@out[3] movdqu (@inptr[0],$offset),@inp[0] movdqu 0x10-0x78($key),$rndkey1 aesenclast $rndkey0,@out[0] Loading Loading @@ -426,7 +436,11 @@ $code.=<<___; sub $offset,$sink aesdec $rndkey1,@out[0] prefetcht0 31(@inptr[0],$offset) # prefetch input prefetcht0 31(@inptr[1],$offset) aesdec $rndkey1,@out[1] prefetcht0 31(@inptr[2],$offset) prefetcht0 31(@inptr[3],$offset) aesdec $rndkey1,@out[2] aesdec $rndkey1,@out[3] movups 0x30-0x78($key),$rndkey1 Loading @@ -447,7 +461,11 @@ ___ $code.=<<___; movdqa $counters,$mask aesdec $rndkey0,@out[0] prefetcht0 15(@outptr[0],$offset) # prefetch output prefetcht0 15(@outptr[1],$offset) aesdec $rndkey0,@out[1] prefetcht0 15(@outptr[2],$offset) prefetcht0 15(@outptr[3],$offset) aesdec $rndkey0,@out[2] aesdec $rndkey0,@out[3] movups 0x80-0x78($key),$rndkey0 Loading Loading @@ -498,7 +516,9 @@ $code.=<<___; aesdec $rndkey0,@out[2] aesdec $rndkey0,@out[3] movups 0xe0-0x78($key),$rndkey0 jmp .Ldec4x_tail .align 32 .Ldec4x_tail: aesdec $rndkey1,@out[0] aesdec $rndkey1,@out[1] Loading @@ -512,12 +532,12 @@ $code.=<<___; movdqu 0x20-0x78($key),$rndkey0 aesdeclast @inp[0],@out[0] movdqu -16(@inptr[0],$offset),@inp[0] # load next IV aesdeclast @inp[1],@out[1] movdqu -16(@inptr[0],$offset),@inp[0] # load next IV movdqu -16(@inptr[1],$offset),@inp[1] aesdeclast @inp[2],@out[2] movdqu -16(@inptr[2],$offset),@inp[2] aesdeclast @inp[3],@out[3] movdqu -16(@inptr[2],$offset),@inp[2] movdqu -16(@inptr[3],$offset),@inp[3] movups @out[0],-16(@outptr[0],$offset) Loading Loading @@ -682,7 +702,13 @@ $code.=<<___ if ($i); ___ $code.=<<___; vaesenc $rndkey,@out[1],@out[1] prefetcht0 31(@ptr[$i]) # prefetch input vaesenc $rndkey,@out[2],@out[2] ___ $code.=<<___ if ($i>1); prefetcht0 15(@ptr[$i-2]) # prefetch output ___ $code.=<<___; vaesenc $rndkey,@out[3],@out[3] lea (@ptr[$i],$offset),$offset cmovge %rsp,@ptr[$i] # cancel input Loading @@ -703,6 +729,8 @@ ___ } 
$code.=<<___; vmovdqu 32(%rsp),$counters prefetcht0 15(@ptr[$i-2]) # prefetch output prefetcht0 15(@ptr[$i-1]) cmp \$11,$rounds jb .Lenc8x_tail Loading Loading @@ -958,7 +986,13 @@ $code.=<<___ if ($i); ___ $code.=<<___; vaesdec $rndkey,@out[1],@out[1] prefetcht0 31(@ptr[$i]) # prefetch input vaesdec $rndkey,@out[2],@out[2] ___ $code.=<<___ if ($i>1); prefetcht0 15(@ptr[$i-2]) # prefetch output ___ $code.=<<___; vaesdec $rndkey,@out[3],@out[3] lea (@ptr[$i],$offset),$offset cmovge %rsp,@ptr[$i] # cancel input Loading @@ -979,6 +1013,8 @@ ___ } $code.=<<___; vmovdqu 32(%rsp),$counters prefetcht0 15(@ptr[$i-2]) # prefetch output prefetcht0 15(@ptr[$i-1]) cmp \$11,$rounds jb .Ldec8x_tail Loading crypto/sha/asm/sha1-mb-x86_64.pl +31 −9 Original line number Diff line number Diff line Loading @@ -14,20 +14,21 @@ # # this +aesni(i) sha1 aesni-sha1 gain(iv) # ------------------------------------------------------------------- # Westmere(ii) 10.4/n +1.28=3.88(n=4) 5.44 6.58 +70% # Atom(ii) 18.9/n +3.93=8.66(n=4) 10.0 14.0 +62% # Westmere(ii) 10.7/n +1.28=3.96(n=4) 5.30 6.66 +68% # Atom(ii) 18.9?/n +3.93=8.66(n=4) 10.0 14.0 +62% # Sandy Bridge (8.16 +5.15=13.3)/n 4.99 5.98 +80% # Ivy Bridge (8.03 +5.14=13.2)/n 4.60 5.54 +68% # Ivy Bridge (8.08 +5.14=13.2)/n 4.60 5.54 +68% # Haswell(iii) (8.96 +5.00=14.0)/n 3.57 4.55 +160% # Bulldozer (9.75 +5.76=15.5)/n 5.95 6.37 +64% # Bulldozer (9.76 +5.76=15.5)/n 5.95 6.37 +64% # # (i) multi-block CBC encrypt with 128-bit key; # (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom, # because of lower AES-NI instruction throughput; # (iii) "this" is for n=8, when we gather twice as much data, result # for n=4 is 7.98+4.44=12.4; # (iv) improvement coefficients in real-life application are somewhat # lower and range from 30% to 100% (on Haswell); # for n=4 is 8.00+4.44=12.4; # (iv) presented improvement coefficients are asymptotic limits and # in real-life application are somewhat lower, e.g. 
for 2KB # fragments they range from 30% to 100% (on Haswell); $flavour = shift; $output = shift; Loading Loading @@ -80,6 +81,14 @@ $Tbl="%rbp"; @Xi=map("%xmm$_",(10..14)); $K="%xmm15"; if (1) { # Atom-specific optimization aiming to eliminate pshufb with high # registers [and thus get rid of 48 cycles accumulated penalty] @Xi=map("%xmm$_",(0..4)); ($tx,$t0,$t1,$t2,$t3)=map("%xmm$_",(5..9)); @V=($A,$B,$C,$D,$E)=map("%xmm$_",(10..14)); } $REG_SZ=16; sub Xi_off { Loading Loading @@ -139,8 +148,8 @@ $code.=<<___ if ($i<14); # just load input psrld \$2,$b paddd $t2,$e # e+=rol(a,5) movd `4*$j-16*4`(@ptr[2]),$t2 pshufb $tx,@Xi[1] movd `4*$j-16*4`(@ptr[2]),$t2 por $t1,$b # b=rol(b,30) ___ $code.=<<___ if ($i==14); # just load input Loading @@ -152,6 +161,7 @@ $code.=<<___ if ($i==14); # just load input movdqa $b,$t1 movdqa $b,$t0 pslld \$5,$t2 prefetcht0 63(@ptr[0]) pandn $d,$t1 pand $c,$t0 punpckldq $t3,@Xi[1] Loading @@ -162,14 +172,17 @@ $code.=<<___ if ($i==14); # just load input psrld \$27,$t3 pxor $t1,$t0 # Ch(b,c,d) movdqa $b,$t1 prefetcht0 63(@ptr[1]) por $t3,$t2 # rol(a,5) pslld \$30,$t1 paddd $t0,$e # e+=Ch(b,c,d) prefetcht0 63(@ptr[2]) psrld \$2,$b paddd $t2,$e # e+=rol(a,5) pshufb $tx,@Xi[1] prefetcht0 63(@ptr[3]) por $t1,$b # b=rol(b,30) ___ $code.=<<___ if ($i>=13 && $i<15); Loading Loading @@ -382,12 +395,12 @@ $code.=<<___; movdqu 0x60($ctx),$D movdqu 0x80($ctx),$E movdqa 0x60($Tbl),$tx # pbswap_mask movdqa -0x20($Tbl),$K # K_00_19 jmp .Loop .align 32 .Loop: ___ $code.=" movdqa -0x20($Tbl),$K\n"; # K_00_19 for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } $code.=" movdqa 0x00($Tbl),$K\n"; # K_20_39 for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } Loading Loading @@ -434,6 +447,7 @@ $code.=<<___; movdqa @Xi[0],(%rbx) # save counters movdqa 0x60($Tbl),$tx # pbswap_mask movdqa -0x20($Tbl),$K # K_00_19 dec $num jnz .Loop Loading Loading @@ -551,6 +565,7 @@ $code.=<<___ if ($i<14); ___ $code.=<<___ if ($i==14); vpaddd $K,$e,$e # 
e+=K_00_19 prefetcht0 63(@ptr[0]) vpslld \$5,$a,$t2 vpandn $d,$b,$t1 vpand $c,$b,$t0 Loading @@ -559,14 +574,17 @@ $code.=<<___ if ($i==14); vpaddd @Xi[0],$e,$e # e+=X[i] $vpack $t3,@Xi[1],@Xi[1] vpsrld \$27,$a,$t3 prefetcht0 63(@ptr[1]) vpxor $t1,$t0,$t0 # Ch(b,c,d) vpslld \$30,$b,$t1 vpor $t3,$t2,$t2 # rol(a,5) prefetcht0 63(@ptr[2]) vpaddd $t0,$e,$e # e+=Ch(b,c,d) vpsrld \$2,$b,$b vpaddd $t2,$e,$e # e+=rol(a,5) prefetcht0 63(@ptr[3]) vpshufb $tx,@Xi[1],@Xi[1] vpor $t1,$b,$b # b=rol(b,30) ___ Loading @@ -580,6 +598,7 @@ $code.=<<___ if ($i>=15); # apply Xupdate vpaddd $K,$e,$e # e+=K_00_19 vpslld \$5,$a,$t2 vpandn $d,$b,$t1 `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)` vpand $c,$b,$t0 vmovdqa @Xi[0],`&Xi_off($i)` Loading @@ -588,14 +607,17 @@ $code.=<<___ if ($i>=15); # apply Xupdate vpsrld \$27,$a,$t3 vpxor $t1,$t0,$t0 # Ch(b,c,d) vpxor @Xi[3],@Xi[1],@Xi[1] `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)` vpslld \$30,$b,$t1 vpor $t3,$t2,$t2 # rol(a,5) vpaddd $t0,$e,$e # e+=Ch(b,c,d) `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)` vpsrld \$31,@Xi[1],$tx vpaddd @Xi[1],@Xi[1],@Xi[1] vpsrld \$2,$b,$b `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)` vpaddd $t2,$e,$e # e+=rol(a,5) vpor $tx,@Xi[1],@Xi[1] # rol \$1,@Xi[1] vpor $t1,$b,$b # b=rol(b,30) Loading crypto/sha/asm/sha256-mb-x86_64.pl +16 −3 Original line number Diff line number Diff line Loading @@ -15,7 +15,7 @@ # this +aesni(i) sha256 aesni-sha256 gain(iv) # ------------------------------------------------------------------- # Westmere(ii) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126% # Atom(ii) 39.1/n +3.93=13.7(n=4) 20.8 +5.69=26.5 +93% # Atom(ii) ?39.1/n +3.93=13.7(n=4) 20.8 +5.69=26.5 +93% # Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103% # Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82% # Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170% Loading @@ -27,8 +27,9 @@ # AES-NI-SHA256 stitch for these processors; # (iii) "this" is for n=8, when we gather twice as much data, result # for 
n=4 is 20.3+4.44=24.7; # (iv) improvement coefficients in real-life application are somewhat # lower and range from 75% to 130% (on Haswell); # (iv) presented improvement coefficients are asymptotic limits and # in real-life application are somewhat lower, e.g. for 2KB # fragments they range from 75% to 130% (on Haswell); $flavour = shift; $output = shift; Loading Loading @@ -135,6 +136,7 @@ $code.=<<___; psrld \$25-11,$t2 movdqa $e,$t1 `"prefetch 63(@ptr[0])" if ($i==15)` pxor $t3,$sigma movdqa $e,$axb # borrow $axb pslld \$26-21,$t3 Loading @@ -142,6 +144,7 @@ $code.=<<___; pand $f,$axb pxor $t2,$sigma `"prefetch 63(@ptr[1])" if ($i==15)` movdqa $a,$t2 pxor $t3,$sigma # Sigma1(e) movdqa $a,$t3 Loading @@ -153,6 +156,7 @@ $code.=<<___; pslld \$10,$t3 pxor $a,$axb # a^b, b^c in next round `"prefetch 63(@ptr[2])" if ($i==15)` psrld \$13,$sigma pxor $t3,$t2 paddd $t1,$Xi # Xi+=Ch(e,f,g) Loading @@ -160,6 +164,7 @@ $code.=<<___; pand $axb,$bxc pxor $sigma,$t2 `"prefetch 63(@ptr[3])" if ($i==15)` psrld \$22-13,$sigma pxor $t3,$t2 movdqa $b,$h Loading Loading @@ -465,30 +470,38 @@ $code.=<<___; vpsrld \$25,$e,$t2 vpxor $t3,$sigma,$sigma `"prefetch 63(@ptr[0])" if ($i==15)` vpslld \$7,$e,$t3 vpandn $g,$e,$t1 vpand $f,$e,$axb # borrow $axb `"prefetch 63(@ptr[1])" if ($i==15)` vpxor $t2,$sigma,$sigma vpsrld \$2,$a,$h # borrow $h vpxor $t3,$sigma,$sigma # Sigma1(e) `"prefetch 63(@ptr[2])" if ($i==15)` vpslld \$30,$a,$t2 vpxor $axb,$t1,$t1 # Ch(e,f,g) vpxor $a,$b,$axb # a^b, b^c in next round `"prefetch 63(@ptr[3])" if ($i==15)` vpxor $t2,$h,$h vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e) vpsrld \$13,$a,$t2 `"prefetch 63(@ptr[4])" if ($i==15 && $REG_SZ==32)` vpslld \$19,$a,$t3 vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g) vpand $axb,$bxc,$bxc `"prefetch 63(@ptr[5])" if ($i==15 && $REG_SZ==32)` vpxor $t2,$h,$sigma vpsrld \$22,$a,$t2 vpxor $t3,$sigma,$sigma `"prefetch 63(@ptr[6])" if ($i==15 && $REG_SZ==32)` vpslld \$10,$a,$t3 vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b) vpaddd $Xi,$d,$d # d+=Xi 
`"prefetch 63(@ptr[7])" if ($i==15 && $REG_SZ==32)` vpxor $t2,$sigma,$sigma vpxor $t3,$sigma,$sigma # Sigma0(a) Loading Loading
crypto/aes/asm/aesni-mb-x86_64.pl +44 −8 Original line number Diff line number Diff line Loading @@ -15,8 +15,8 @@ # asymptotic measured # --------------------------- # Westmere 5.00/4=1.25 5.13/4=1.28 # Atom 15.0/4=3.75 15.7/4=3.93 # Sandy Bridge 5.06/4=1.27 5.15/4=1.29 # Atom 15.0/4=3.75 ?15.7/4=3.93 # Sandy Bridge 5.06/4=1.27 5.18/4=1.29 # Ivy Bridge 5.06/4=1.27 5.14/4=1.29 # Haswell 4.44/4=1.11 4.44/4=1.11 # Bulldozer 5.75/4=1.44 5.76/4=1.44 Loading @@ -27,8 +27,8 @@ # # asymptotic measured # --------------------------- # Sandy Bridge 5.06/8=0.64 7.05/8=0.88(*) # Ivy Bridge 5.06/8=0.64 7.02/8=0.88(*) # Sandy Bridge 5.06/8=0.64 7.10/8=0.89(*) # Ivy Bridge 5.06/8=0.64 7.14/8=0.89(*) # Haswell 5.00/8=0.63 5.00/8=0.63 # Bulldozer 5.75/8=0.72 5.77/8=0.72 # Loading Loading @@ -188,7 +188,11 @@ $code.=<<___; sub $offset,$sink aesenc $rndkey1,@out[0] prefetcht0 31(@inptr[0],$offset) # prefetch input prefetcht0 31(@inptr[1],$offset) aesenc $rndkey1,@out[1] prefetcht0 31(@inptr[2],$offset) prefetcht0 31(@inptr[2],$offset) aesenc $rndkey1,@out[2] aesenc $rndkey1,@out[3] movups 0x30-0x78($key),$rndkey1 Loading @@ -199,8 +203,8 @@ $code.=<<___; cmp `32+4*$i`(%rsp),$one aesenc $rndkey,@out[0] aesenc $rndkey,@out[1] cmovge $sink,@inptr[$i] # cancel input aesenc $rndkey,@out[2] cmovge $sink,@inptr[$i] # cancel input cmovg $sink,@outptr[$i] # sink output aesenc $rndkey,@out[3] movups `0x40+16*$i-0x78`($key),$rndkey Loading @@ -209,7 +213,11 @@ ___ $code.=<<___; movdqa $counters,$mask aesenc $rndkey0,@out[0] prefetcht0 15(@outptr[0],$offset) # prefetch output prefetcht0 15(@outptr[1],$offset) aesenc $rndkey0,@out[1] prefetcht0 15(@outptr[2],$offset) prefetcht0 15(@outptr[3],$offset) aesenc $rndkey0,@out[2] aesenc $rndkey0,@out[3] movups 0x80-0x78($key),$rndkey0 Loading Loading @@ -260,13 +268,15 @@ $code.=<<___; aesenc $rndkey0,@out[2] aesenc $rndkey0,@out[3] movups 0xe0-0x78($key),$rndkey0 jmp .Lenc4x_tail .align 32 .Lenc4x_tail: aesenc $rndkey1,@out[0] aesenc $rndkey1,@out[1] 
aesenc $rndkey1,@out[2] movdqu (@inptr[0],$offset),@inp[0] aesenc $rndkey1,@out[3] movdqu (@inptr[0],$offset),@inp[0] movdqu 0x10-0x78($key),$rndkey1 aesenclast $rndkey0,@out[0] Loading Loading @@ -426,7 +436,11 @@ $code.=<<___; sub $offset,$sink aesdec $rndkey1,@out[0] prefetcht0 31(@inptr[0],$offset) # prefetch input prefetcht0 31(@inptr[1],$offset) aesdec $rndkey1,@out[1] prefetcht0 31(@inptr[2],$offset) prefetcht0 31(@inptr[3],$offset) aesdec $rndkey1,@out[2] aesdec $rndkey1,@out[3] movups 0x30-0x78($key),$rndkey1 Loading @@ -447,7 +461,11 @@ ___ $code.=<<___; movdqa $counters,$mask aesdec $rndkey0,@out[0] prefetcht0 15(@outptr[0],$offset) # prefetch output prefetcht0 15(@outptr[1],$offset) aesdec $rndkey0,@out[1] prefetcht0 15(@outptr[2],$offset) prefetcht0 15(@outptr[3],$offset) aesdec $rndkey0,@out[2] aesdec $rndkey0,@out[3] movups 0x80-0x78($key),$rndkey0 Loading Loading @@ -498,7 +516,9 @@ $code.=<<___; aesdec $rndkey0,@out[2] aesdec $rndkey0,@out[3] movups 0xe0-0x78($key),$rndkey0 jmp .Ldec4x_tail .align 32 .Ldec4x_tail: aesdec $rndkey1,@out[0] aesdec $rndkey1,@out[1] Loading @@ -512,12 +532,12 @@ $code.=<<___; movdqu 0x20-0x78($key),$rndkey0 aesdeclast @inp[0],@out[0] movdqu -16(@inptr[0],$offset),@inp[0] # load next IV aesdeclast @inp[1],@out[1] movdqu -16(@inptr[0],$offset),@inp[0] # load next IV movdqu -16(@inptr[1],$offset),@inp[1] aesdeclast @inp[2],@out[2] movdqu -16(@inptr[2],$offset),@inp[2] aesdeclast @inp[3],@out[3] movdqu -16(@inptr[2],$offset),@inp[2] movdqu -16(@inptr[3],$offset),@inp[3] movups @out[0],-16(@outptr[0],$offset) Loading Loading @@ -682,7 +702,13 @@ $code.=<<___ if ($i); ___ $code.=<<___; vaesenc $rndkey,@out[1],@out[1] prefetcht0 31(@ptr[$i]) # prefetch input vaesenc $rndkey,@out[2],@out[2] ___ $code.=<<___ if ($i>1); prefetcht0 15(@ptr[$i-2]) # prefetch output ___ $code.=<<___; vaesenc $rndkey,@out[3],@out[3] lea (@ptr[$i],$offset),$offset cmovge %rsp,@ptr[$i] # cancel input Loading @@ -703,6 +729,8 @@ ___ } $code.=<<___; 
vmovdqu 32(%rsp),$counters prefetcht0 15(@ptr[$i-2]) # prefetch output prefetcht0 15(@ptr[$i-1]) cmp \$11,$rounds jb .Lenc8x_tail Loading Loading @@ -958,7 +986,13 @@ $code.=<<___ if ($i); ___ $code.=<<___; vaesdec $rndkey,@out[1],@out[1] prefetcht0 31(@ptr[$i]) # prefetch input vaesdec $rndkey,@out[2],@out[2] ___ $code.=<<___ if ($i>1); prefetcht0 15(@ptr[$i-2]) # prefetch output ___ $code.=<<___; vaesdec $rndkey,@out[3],@out[3] lea (@ptr[$i],$offset),$offset cmovge %rsp,@ptr[$i] # cancel input Loading @@ -979,6 +1013,8 @@ ___ } $code.=<<___; vmovdqu 32(%rsp),$counters prefetcht0 15(@ptr[$i-2]) # prefetch output prefetcht0 15(@ptr[$i-1]) cmp \$11,$rounds jb .Ldec8x_tail Loading
crypto/sha/asm/sha1-mb-x86_64.pl +31 −9 Original line number Diff line number Diff line Loading @@ -14,20 +14,21 @@ # # this +aesni(i) sha1 aesni-sha1 gain(iv) # ------------------------------------------------------------------- # Westmere(ii) 10.4/n +1.28=3.88(n=4) 5.44 6.58 +70% # Atom(ii) 18.9/n +3.93=8.66(n=4) 10.0 14.0 +62% # Westmere(ii) 10.7/n +1.28=3.96(n=4) 5.30 6.66 +68% # Atom(ii) 18.9?/n +3.93=8.66(n=4) 10.0 14.0 +62% # Sandy Bridge (8.16 +5.15=13.3)/n 4.99 5.98 +80% # Ivy Bridge (8.03 +5.14=13.2)/n 4.60 5.54 +68% # Ivy Bridge (8.08 +5.14=13.2)/n 4.60 5.54 +68% # Haswell(iii) (8.96 +5.00=14.0)/n 3.57 4.55 +160% # Bulldozer (9.75 +5.76=15.5)/n 5.95 6.37 +64% # Bulldozer (9.76 +5.76=15.5)/n 5.95 6.37 +64% # # (i) multi-block CBC encrypt with 128-bit key; # (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom, # because of lower AES-NI instruction throughput; # (iii) "this" is for n=8, when we gather twice as much data, result # for n=4 is 7.98+4.44=12.4; # (iv) improvement coefficients in real-life application are somewhat # lower and range from 30% to 100% (on Haswell); # for n=4 is 8.00+4.44=12.4; # (iv) presented improvement coefficients are asymptotic limits and # in real-life application are somewhat lower, e.g. 
for 2KB # fragments they range from 30% to 100% (on Haswell); $flavour = shift; $output = shift; Loading Loading @@ -80,6 +81,14 @@ $Tbl="%rbp"; @Xi=map("%xmm$_",(10..14)); $K="%xmm15"; if (1) { # Atom-specific optimization aiming to eliminate pshufb with high # registers [and thus get rid of 48 cycles accumulated penalty] @Xi=map("%xmm$_",(0..4)); ($tx,$t0,$t1,$t2,$t3)=map("%xmm$_",(5..9)); @V=($A,$B,$C,$D,$E)=map("%xmm$_",(10..14)); } $REG_SZ=16; sub Xi_off { Loading Loading @@ -139,8 +148,8 @@ $code.=<<___ if ($i<14); # just load input psrld \$2,$b paddd $t2,$e # e+=rol(a,5) movd `4*$j-16*4`(@ptr[2]),$t2 pshufb $tx,@Xi[1] movd `4*$j-16*4`(@ptr[2]),$t2 por $t1,$b # b=rol(b,30) ___ $code.=<<___ if ($i==14); # just load input Loading @@ -152,6 +161,7 @@ $code.=<<___ if ($i==14); # just load input movdqa $b,$t1 movdqa $b,$t0 pslld \$5,$t2 prefetcht0 63(@ptr[0]) pandn $d,$t1 pand $c,$t0 punpckldq $t3,@Xi[1] Loading @@ -162,14 +172,17 @@ $code.=<<___ if ($i==14); # just load input psrld \$27,$t3 pxor $t1,$t0 # Ch(b,c,d) movdqa $b,$t1 prefetcht0 63(@ptr[1]) por $t3,$t2 # rol(a,5) pslld \$30,$t1 paddd $t0,$e # e+=Ch(b,c,d) prefetcht0 63(@ptr[2]) psrld \$2,$b paddd $t2,$e # e+=rol(a,5) pshufb $tx,@Xi[1] prefetcht0 63(@ptr[3]) por $t1,$b # b=rol(b,30) ___ $code.=<<___ if ($i>=13 && $i<15); Loading Loading @@ -382,12 +395,12 @@ $code.=<<___; movdqu 0x60($ctx),$D movdqu 0x80($ctx),$E movdqa 0x60($Tbl),$tx # pbswap_mask movdqa -0x20($Tbl),$K # K_00_19 jmp .Loop .align 32 .Loop: ___ $code.=" movdqa -0x20($Tbl),$K\n"; # K_00_19 for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } $code.=" movdqa 0x00($Tbl),$K\n"; # K_20_39 for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } Loading Loading @@ -434,6 +447,7 @@ $code.=<<___; movdqa @Xi[0],(%rbx) # save counters movdqa 0x60($Tbl),$tx # pbswap_mask movdqa -0x20($Tbl),$K # K_00_19 dec $num jnz .Loop Loading Loading @@ -551,6 +565,7 @@ $code.=<<___ if ($i<14); ___ $code.=<<___ if ($i==14); vpaddd $K,$e,$e # 
e+=K_00_19 prefetcht0 63(@ptr[0]) vpslld \$5,$a,$t2 vpandn $d,$b,$t1 vpand $c,$b,$t0 Loading @@ -559,14 +574,17 @@ $code.=<<___ if ($i==14); vpaddd @Xi[0],$e,$e # e+=X[i] $vpack $t3,@Xi[1],@Xi[1] vpsrld \$27,$a,$t3 prefetcht0 63(@ptr[1]) vpxor $t1,$t0,$t0 # Ch(b,c,d) vpslld \$30,$b,$t1 vpor $t3,$t2,$t2 # rol(a,5) prefetcht0 63(@ptr[2]) vpaddd $t0,$e,$e # e+=Ch(b,c,d) vpsrld \$2,$b,$b vpaddd $t2,$e,$e # e+=rol(a,5) prefetcht0 63(@ptr[3]) vpshufb $tx,@Xi[1],@Xi[1] vpor $t1,$b,$b # b=rol(b,30) ___ Loading @@ -580,6 +598,7 @@ $code.=<<___ if ($i>=15); # apply Xupdate vpaddd $K,$e,$e # e+=K_00_19 vpslld \$5,$a,$t2 vpandn $d,$b,$t1 `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)` vpand $c,$b,$t0 vmovdqa @Xi[0],`&Xi_off($i)` Loading @@ -588,14 +607,17 @@ $code.=<<___ if ($i>=15); # apply Xupdate vpsrld \$27,$a,$t3 vpxor $t1,$t0,$t0 # Ch(b,c,d) vpxor @Xi[3],@Xi[1],@Xi[1] `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)` vpslld \$30,$b,$t1 vpor $t3,$t2,$t2 # rol(a,5) vpaddd $t0,$e,$e # e+=Ch(b,c,d) `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)` vpsrld \$31,@Xi[1],$tx vpaddd @Xi[1],@Xi[1],@Xi[1] vpsrld \$2,$b,$b `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)` vpaddd $t2,$e,$e # e+=rol(a,5) vpor $tx,@Xi[1],@Xi[1] # rol \$1,@Xi[1] vpor $t1,$b,$b # b=rol(b,30) Loading
crypto/sha/asm/sha256-mb-x86_64.pl +16 −3 Original line number Diff line number Diff line Loading @@ -15,7 +15,7 @@ # this +aesni(i) sha256 aesni-sha256 gain(iv) # ------------------------------------------------------------------- # Westmere(ii) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126% # Atom(ii) 39.1/n +3.93=13.7(n=4) 20.8 +5.69=26.5 +93% # Atom(ii) ?39.1/n +3.93=13.7(n=4) 20.8 +5.69=26.5 +93% # Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103% # Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82% # Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170% Loading @@ -27,8 +27,9 @@ # AES-NI-SHA256 stitch for these processors; # (iii) "this" is for n=8, when we gather twice as much data, result # for n=4 is 20.3+4.44=24.7; # (iv) improvement coefficients in real-life application are somewhat # lower and range from 75% to 130% (on Haswell); # (iv) presented improvement coefficients are asymptotic limits and # in real-life application are somewhat lower, e.g. for 2KB # fragments they range from 75% to 130% (on Haswell); $flavour = shift; $output = shift; Loading Loading @@ -135,6 +136,7 @@ $code.=<<___; psrld \$25-11,$t2 movdqa $e,$t1 `"prefetch 63(@ptr[0])" if ($i==15)` pxor $t3,$sigma movdqa $e,$axb # borrow $axb pslld \$26-21,$t3 Loading @@ -142,6 +144,7 @@ $code.=<<___; pand $f,$axb pxor $t2,$sigma `"prefetch 63(@ptr[1])" if ($i==15)` movdqa $a,$t2 pxor $t3,$sigma # Sigma1(e) movdqa $a,$t3 Loading @@ -153,6 +156,7 @@ $code.=<<___; pslld \$10,$t3 pxor $a,$axb # a^b, b^c in next round `"prefetch 63(@ptr[2])" if ($i==15)` psrld \$13,$sigma pxor $t3,$t2 paddd $t1,$Xi # Xi+=Ch(e,f,g) Loading @@ -160,6 +164,7 @@ $code.=<<___; pand $axb,$bxc pxor $sigma,$t2 `"prefetch 63(@ptr[3])" if ($i==15)` psrld \$22-13,$sigma pxor $t3,$t2 movdqa $b,$h Loading Loading @@ -465,30 +470,38 @@ $code.=<<___; vpsrld \$25,$e,$t2 vpxor $t3,$sigma,$sigma `"prefetch 63(@ptr[0])" if ($i==15)` vpslld \$7,$e,$t3 vpandn $g,$e,$t1 vpand $f,$e,$axb # borrow $axb `"prefetch 63(@ptr[1])" if ($i==15)` vpxor 
$t2,$sigma,$sigma vpsrld \$2,$a,$h # borrow $h vpxor $t3,$sigma,$sigma # Sigma1(e) `"prefetch 63(@ptr[2])" if ($i==15)` vpslld \$30,$a,$t2 vpxor $axb,$t1,$t1 # Ch(e,f,g) vpxor $a,$b,$axb # a^b, b^c in next round `"prefetch 63(@ptr[3])" if ($i==15)` vpxor $t2,$h,$h vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e) vpsrld \$13,$a,$t2 `"prefetch 63(@ptr[4])" if ($i==15 && $REG_SZ==32)` vpslld \$19,$a,$t3 vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g) vpand $axb,$bxc,$bxc `"prefetch 63(@ptr[5])" if ($i==15 && $REG_SZ==32)` vpxor $t2,$h,$sigma vpsrld \$22,$a,$t2 vpxor $t3,$sigma,$sigma `"prefetch 63(@ptr[6])" if ($i==15 && $REG_SZ==32)` vpslld \$10,$a,$t3 vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b) vpaddd $Xi,$d,$d # d+=Xi `"prefetch 63(@ptr[7])" if ($i==15 && $REG_SZ==32)` vpxor $t2,$sigma,$sigma vpxor $t3,$sigma,$sigma # Sigma0(a) Loading