md5-sparcv9.pl: avoid %asi modifications, improve short input performance (d17b59e4) · Commits · CYBER - Cyber Security / TS 103 523 MSP / TLMSP / TLMSP OpenSSL

crypto/md5/asm/md5-sparcv9.pl

+14 −13

Original line number	Diff line number	Diff line
		@@ -12,7 +12,7 @@
		# MD5 for SPARCv9, 6.9 cycles per byte on UltraSPARC, >40% faster than
		# code generated by Sun C 5.2.

		-# SPARC T4 MD5 hardware achieves 3.24 cycles per byte, which is 2.1x
		+# SPARC T4 MD5 hardware achieves 3.20 cycles per byte, which is 2.1x
		# faster than software. Multi-process benchmark saturates at 12x
		# single-process result on 8-core processor, or ~11GBps per 2.85GHz
		# socket.
		@@ -221,15 +221,15 @@ md5_block_asm_data_order:
		be .Lsoftware
		nop

		-rd %asi, %g5
		-wr %g0, 0x88, %asi ! ASI_PRIMARY_LITTLE
		-
		-lda [%o0 + 0x00] %asi, %f0 ! load context
		-lda [%o0 + 0x04] %asi, %f1
		+mov 4, %g1
		andcc %o1, 0x7, %g0
		-lda [%o0 + 0x08] %asi, %f2
		+lda [%o0 + %g0]0x88, %f0 ! load context
		+lda [%o0 + %g1]0x88, %f1
		+add %o0, 8, %o0
		+lda [%o0 + %g0]0x88, %f2
		+lda [%o0 + %g1]0x88, %f3
		bne,pn %icc, .Lhwunaligned
		-lda [%o0 + 0x0c] %asi, %f3
		+sub %o0, 8, %o0

		.Lhw_loop:
		ldd [%o1 + 0x00], %f8
		@@ -250,12 +250,13 @@ md5_block_asm_data_order:
		nop

		.Lhwfinish:
		-sta %f0, [%o0 + 0x00] %asi ! store context
		-sta %f1, [%o0 + 0x04] %asi
		-sta %f2, [%o0 + 0x08] %asi
		-sta %f3, [%o0 + 0x0c] %asi
		+sta %f0, [%o0 + %g0]0x88 ! store context
		+sta %f1, [%o0 + %g1]0x88
		+add %o0, 8, %o0
		+sta %f2, [%o0 + %g0]0x88
		+sta %f3, [%o0 + %g1]0x88
		retl
		-wr %g5, 0x0, %asi ! restore %asi
		+nop

		.align 8
		.Lhwunaligned: