Commit 251718e4 authored May 02, 2007 by Andy Polyakov

Fix s390x bugs and correct performance coefficients.

parent c504a5e7

crypto/aes/asm/aes-s390x.pl

+1 −1

Original line number	Diff line number	Diff line
		@@ -23,7 +23,7 @@
		# for CBC is not utilized, nor multiple blocks are ever processed.
		# Then software key schedule can be postponed till hardware support
		# detection... Performance improvement over assembler is reportedly
		-# ~2.5x, but can reach >15x [naturally on larger chunks] if proper
		+# ~2.5x, but can reach >8x [naturally on larger chunks] if proper
		# support is implemented.

		$t1="%r0";

crypto/sha/asm/sha1-s390x.pl

+2 −1

Original line number	Diff line number	Diff line
		@@ -13,7 +13,7 @@
		#
		# Performance is >30% better than gcc 3.3 generated code. But the real
		# twist is that SHA1 hardware support is detected and utilized. In
		-# which case performance can reach further >8x for larger chunks.
		+# which case performance can reach further >4.5x for larger chunks.

		$kimdfunc=1; # magic function code for kimd instruction

		@@ -160,6 +160,7 @@ $code.=<<___ if ($kimdfunc);
		lgr %r2,$inp
		sllg %r3,$len,6
		.long 0xb93e0002 # kimd %r0,%r2
		+brc 1,.-4 # pay attention to "partial completion"
		br %r14
		.Lsoftware:
		___

crypto/sha/asm/sha512-s390x.pl

+2 −1

Original line number	Diff line number	Diff line
		@@ -16,7 +16,7 @@
		# "pathologically" high, in particular in comparison to other SHA
		# modules). But the real twist is that it detects if hardware support
		# for SHA256 is available and in such case utilizes it. Then the
		-# performance can reach >12x of assembler one for larger chunks.
		+# performance can reach >6.5x of assembler one for larger chunks.
		#
		# sha512_block_data_order is ~70% faster than gcc 3.3 generated code.

		@@ -219,6 +219,7 @@ $code.=<<___ if ($kimdfunc);
		lgr %r2,$inp
		sllg %r3,$len,`log(16*$SZ)/log(2)`
		.long 0xb93e0002 # kimd %r0,%r2
		+brc 1,.-4 # pay attention to "partial completion"
		br %r14
		.Lsoftware:
		___