Loading crypto/aes/asm/aesni-sha1-x86_64.pl +8 −0 Original line number Diff line number Diff line Loading @@ -23,14 +23,20 @@ # AES-128-CBC +SHA1 stitch gain # Westmere 3.77[+5.6] 9.37 6.65 +41% # Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%) # Ivy Bridge 5.05[+4.7] 9.75 5.59 +74% # Bulldozer 5.77[+6.1] 11.87 6.47 +83% # # AES-192-CBC # Westmere 4.51 10.11 6.97 +45% # Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%) # Ivy Bridge 6.05 10.75 6.07 +77% # Bulldozer 6.89 12.99 7.02 +85% # # AES-256-CBC # Westmere 5.25 10.85 7.25 +50% # Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%) # Ivy Bridge 7.05 11.75 7.12 +65% # Bulldozer 8.00 14.10 8.24 +71% # # (*) There are two code paths: SSSE3 and AVX. See sha1-586.pl for # background information. Above numbers in parentheses are SSSE3 Loading @@ -47,6 +53,8 @@ # AES-128-CBC AES-192-CBC AES-256-CBC # Westmere 1.31 1.55 1.80 # Sandy Bridge 0.93 1.06 1.22 # Ivy Bridge 0.92 1.06 1.21 # Bulldozer 0.76 0.90 1.04 $flavour = shift; $output = shift; Loading crypto/aes/asm/aesni-x86_64.pl +7 −0 Original line number Diff line number Diff line Loading @@ -157,6 +157,13 @@ # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like # in CTR mode AES instruction interleave factor was chosen to be 6x. ###################################################################### # For reference, AMD Bulldozer spends 5.77 cycles per byte processed # with 128-bit key in CBC encrypt and 0.76 cycles in CBC decrypt, 0.70 # in ECB, 0.94 in CTR, 0.95 in XTS... This means that aes[enc|dec] # instruction latency is 9 cycles and that they can be issued every # cycle. 
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script # generates drop-in replacement for # crypto/aes/asm/aes-x86_64.pl:-) Loading crypto/modes/asm/ghash-x86.pl +6 −0 Original line number Diff line number Diff line Loading @@ -26,6 +26,8 @@ # P4 125/125 17.8 84(***) # Opteron 66 /70 10.1 30 # Core2 54 /67 8.4 18 # Atom 105/105 16.8 53 # VIA Nano 69 /71 13.0 27 # # (*) gcc 3.4.x was observed to generate a few percent slower code, # which is one of the reasons why 2.95.3 results were chosen, Loading Loading @@ -113,6 +115,10 @@ # similar manner resulted in almost 20% degradation on Sandy Bridge, # where original 64-bit code processes one byte in 1.95 cycles. ##################################################################### # For reference, AMD Bulldozer processes one byte in 1.98 cycles in # 32-bit mode and 1.89 in 64-bit. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; Loading crypto/modes/asm/ghash-x86_64.pl +2 −0 Original line number Diff line number Diff line Loading @@ -22,6 +22,8 @@ # P4 28.6 14.0 +100% # Opteron 19.3 7.7 +150% # Core2 17.8 8.1(**) +120% # Atom 31.6 16.8 +88% # VIA Nano 21.8 10.1 +115% # # (*) comparison is not completely fair, because C results are # for vanilla "256B" implementation, while assembler results Loading crypto/rc4/asm/rc4-586.pl +3 −0 Original line number Diff line number Diff line Loading @@ -43,6 +43,9 @@ # Westmere 5.1/+94%(**) # Sandy Bridge 5.0/+8% # Atom 12.6/+6% # VIA Nano 6.4/+9% # Ivy Bridge 4.9/0% # Bulldozer 4.9/+15% # # (*) PIII can actually deliver 6.6 cycles per byte with MMX code, # but this specific code performs poorly on Core2. And vice Loading Loading
crypto/aes/asm/aesni-sha1-x86_64.pl +8 −0 Original line number Diff line number Diff line Loading @@ -23,14 +23,20 @@ # AES-128-CBC +SHA1 stitch gain # Westmere 3.77[+5.6] 9.37 6.65 +41% # Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%) # Ivy Bridge 5.05[+4.7] 9.75 5.59 +74% # Bulldozer 5.77[+6.1] 11.87 6.47 +83% # # AES-192-CBC # Westmere 4.51 10.11 6.97 +45% # Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%) # Ivy Bridge 6.05 10.75 6.07 +77% # Bulldozer 6.89 12.99 7.02 +85% # # AES-256-CBC # Westmere 5.25 10.85 7.25 +50% # Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%) # Ivy Bridge 7.05 11.75 7.12 +65% # Bulldozer 8.00 14.10 8.24 +71% # # (*) There are two code paths: SSSE3 and AVX. See sha1-586.pl for # background information. Above numbers in parentheses are SSSE3 Loading @@ -47,6 +53,8 @@ # AES-128-CBC AES-192-CBC AES-256-CBC # Westmere 1.31 1.55 1.80 # Sandy Bridge 0.93 1.06 1.22 # Ivy Bridge 0.92 1.06 1.21 # Bulldozer 0.76 0.90 1.04 $flavour = shift; $output = shift; Loading
crypto/aes/asm/aesni-x86_64.pl +7 −0 Original line number Diff line number Diff line Loading @@ -157,6 +157,13 @@ # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like # in CTR mode AES instruction interleave factor was chosen to be 6x. ###################################################################### # For reference, AMD Bulldozer spends 5.77 cycles per byte processed # with 128-bit key in CBC encrypt and 0.76 cycles in CBC decrypt, 0.70 # in ECB, 0.94 in CTR, 0.95 in XTS... This means that aes[enc|dec] # instruction latency is 9 cycles and that they can be issued every # cycle. $PREFIX="aesni"; # if $PREFIX is set to "AES", the script # generates drop-in replacement for # crypto/aes/asm/aes-x86_64.pl:-) Loading
crypto/modes/asm/ghash-x86.pl +6 −0 Original line number Diff line number Diff line Loading @@ -26,6 +26,8 @@ # P4 125/125 17.8 84(***) # Opteron 66 /70 10.1 30 # Core2 54 /67 8.4 18 # Atom 105/105 16.8 53 # VIA Nano 69 /71 13.0 27 # # (*) gcc 3.4.x was observed to generate a few percent slower code, # which is one of the reasons why 2.95.3 results were chosen, Loading Loading @@ -113,6 +115,10 @@ # similar manner resulted in almost 20% degradation on Sandy Bridge, # where original 64-bit code processes one byte in 1.95 cycles. ##################################################################### # For reference, AMD Bulldozer processes one byte in 1.98 cycles in # 32-bit mode and 1.89 in 64-bit. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; Loading
crypto/modes/asm/ghash-x86_64.pl +2 −0 Original line number Diff line number Diff line Loading @@ -22,6 +22,8 @@ # P4 28.6 14.0 +100% # Opteron 19.3 7.7 +150% # Core2 17.8 8.1(**) +120% # Atom 31.6 16.8 +88% # VIA Nano 21.8 10.1 +115% # # (*) comparison is not completely fair, because C results are # for vanilla "256B" implementation, while assembler results Loading
crypto/rc4/asm/rc4-586.pl +3 −0 Original line number Diff line number Diff line Loading @@ -43,6 +43,9 @@ # Westmere 5.1/+94%(**) # Sandy Bridge 5.0/+8% # Atom 12.6/+6% # VIA Nano 6.4/+9% # Ivy Bridge 4.9/0% # Bulldozer 4.9/+15% # # (*) PIII can actually deliver 6.6 cycles per byte with MMX code, # but this specific code performs poorly on Core2. And vice Loading