x86[_64] assembly pack: update benchmark results. (d2e18031) · Commits · CYBER - Cyber Security / TS 103 523 MSP / TLMSP / TLMSP OpenSSL

crypto/aes/asm/aesni-sha1-x86_64.pl

+8 −0

Original line number	Diff line number	Diff line
		@@ -23,14 +23,20 @@
		# AES-128-CBC +SHA1 stitch gain
		# Westmere 3.77[+5.6] 9.37 6.65 +41%
		# Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%)
		# Ivy Bridge 5.05[+4.7] 9.75 5.59 +74%
		# Bulldozer 5.77[+6.1] 11.87 6.47 +83%
		#
		# AES-192-CBC
		# Westmere 4.51 10.11 6.97 +45%
		# Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%)
		# Ivy Bridge 6.05 10.75 6.07 +77%
		# Bulldozer 6.89 12.99 7.02 +85%
		#
		# AES-256-CBC
		# Westmere 5.25 10.85 7.25 +50%
		# Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%)
		# Ivy Bridge 7.05 11.75 7.12 +65%
		# Bulldozer 8.00 14.10 8.24 +71%
		#
		# (*) There are two code paths: SSSE3 and AVX. See sha1-586.pl for
		# background information. Above numbers in parentheses are SSSE3
		@@ -47,6 +53,8 @@
		# AES-128-CBC AES-192-CBC AES-256-CBC
		# Westmere 1.31 1.55 1.80
		# Sandy Bridge 0.93 1.06 1.22
		# Ivy Bridge 0.92 1.06 1.21
		# Bulldozer 0.76 0.90 1.04

		# Take the next two positional arguments, in order. With no explicit
		# array, shift reads from @ARGV at file scope (or @_ inside a sub).
		# NOTE(review): presumably the perlasm flavour and the output file
		# name -- confirm against the rest of the script.
		$flavour = shift;
		$output = shift;

+7 −0

Original line number	Diff line number	Diff line
		@@ -157,6 +157,13 @@
		# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
		# in CTR mode, the AES instruction interleave factor was chosen to be 6x.

		######################################################################
		# For reference, AMD Bulldozer spends 5.77 cycles per byte processed
		# with 128-bit key in CBC encrypt and 0.76 cycles in CBC decrypt, 0.70
		# in ECB, 0.94 in CTR, 0.95 in XTS... This means that aes[enc|dec]
		# instruction latency is 9 cycles and that they can be issued every
		# cycle.

		$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
		# generates a drop-in replacement for
		# crypto/aes/asm/aes-x86_64.pl :-)

+6 −0

Original line number	Diff line number	Diff line
		@@ -26,6 +26,8 @@
		# P4 125/125 17.8 84(***)
		# Opteron 66 /70 10.1 30
		# Core2 54 /67 8.4 18
		# Atom 105/105 16.8 53
		# VIA Nano 69 /71 13.0 27
		#
		# (*) gcc 3.4.x was observed to generate a few percent slower code,
		# which is one of the reasons why 2.95.3 results were chosen,
		@@ -113,6 +115,10 @@
		# similar manner resulted in almost 20% degradation on Sandy Bridge,
		# where original 64-bit code processes one byte in 1.95 cycles.

		#####################################################################
		# For reference, AMD Bulldozer processes one byte in 1.98 cycles in
		# 32-bit mode and 1.89 in 64-bit.

		# Capture the directory part of the script path ($0) -- everything up
		# to and including the last '/' or '\' -- and keep it in $dir.
		# NOTE(review): if $0 contains no path separator the match fails and
		# $1 is not set by this statement; confirm how the script is invoked.
		$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
		# Append the script's own directory and its ../../perlasm sibling to
		# the module search path so the require below can be resolved.
		push(@INC,"${dir}","${dir}../../perlasm");
		# Load x86asm.pl, located through @INC.
		require "x86asm.pl";

+2 −0

Original line number	Diff line number	Diff line
		@@ -22,6 +22,8 @@
		# P4 28.6 14.0 +100%
		# Opteron 19.3 7.7 +150%
		# Core2 17.8 8.1(**) +120%
		# Atom 31.6 16.8 +88%
		# VIA Nano 21.8 10.1 +115%
		#
		# (*) comparison is not completely fair, because C results are
		# for vanilla "256B" implementation, while assembler results

+3 −0

Original line number	Diff line number	Diff line
		@@ -43,6 +43,9 @@
		# Westmere 5.1/+94%(**)
		# Sandy Bridge 5.0/+8%
		# Atom 12.6/+6%
		# VIA Nano 6.4/+9%
		# Ivy Bridge 4.9/0%
		# Bulldozer 4.9/+15%
		#
		# (*) PIII can actually deliver 6.6 cycles per byte with MMX code,
		# but this specific code performs poorly on Core2. And vice