ghash-s390x.pl: reschedule instructions for better performance. (8986e372) · Commits · CYBER - Cyber Security / TS 103 523 MSP / TLMSP / TLMSP OpenSSL

crypto/modes/asm/ghash-s390x.pl

+103 −44

Original line number	Diff line number	Diff line
		@@ -8,10 +8,21 @@
		# ====================================================================

		# September 2010.
		#
		# The module implements "4-bit" GCM GHASH function and underlying
		# single multiplication operation in GF(2^128). "4-bit" means that it
		# uses 256 bytes per-key table [+128 bytes shared table]. Performance
		# was measured to be ~18 cycles per processed byte on z10, which is
		# almost 40% better than gcc-generated code. It should be noted that
		# 18 cycles is worse result than expected: loop is scheduled for 12
		# and the result should be close to 12. In the lack of instruction-
		# level profiling data it's impossible to tell why...

		while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
		open STDOUT,">$output";

		$softonly=1; # disable hardware support for now

		$Zhi="%r0";
		$Zlo="%r1";

		@@ -38,6 +49,31 @@ $code.=<<___;
		.globl gcm_gmult_4bit
		.align 32
		gcm_gmult_4bit:
		___
		$code.=<<___ if(!$softonly);
		larl %r1,OPENSSL_s390xcap_P
		lg %r0,0(%r1)
		tmhl %r0,0x4000 # check for message-security-assist
		jz .Lsoft_gmult
		lghi %r0,0
		la %r1,16($sp)
		.long 0xb93e0004 # kimd %r0,%r4
		lg %r1,24($sp)
		tmhh %r1,0x4000 # check for function 65
		jz .Lsoft_gmult
		stg %r0,16($sp) # arrange 16 bytes of zero input
		stg %r0,24($sp)
		lghi %r0,65 # function 65
		la %r1,0($Xi) # H lies right after Xi in gcm128_context
		la $inp,16($sp)
		lghi $len,16
		.long 0xb93e0004 # kimd %r0,$inp
		brc 1,.-4 # pay attention to "partial completion"
		br %r14
		.align 32
		.Lsoft_gmult:
		___
		$code.=<<___;
		stmg %r6,%r14,48($sp)

		aghi $Xi,-1
		@@ -53,6 +89,27 @@ gcm_gmult_4bit:
		.globl gcm_ghash_4bit
		.align 32
		gcm_ghash_4bit:
		___
		$code.=<<___ if(!$softonly);
		larl %r1,OPENSSL_s390xcap_P
		lg %r0,0(%r1)
		tmhl %r0,0x4000 # check for message-security-assist
		jz .Lsoft_ghash
		lghi %r0,0
		la %r1,16($sp)
		.long 0xb93e0004 # kimd %r0,%r4
		lg %r1,24($sp)
		tmhh %r1,0x4000 # check for function 65
		jz .Lsoft_ghash
		lghi %r0,65 # function 65
		la %r1,0($Xi) # H lies right after Xi in gcm128_context
		.long 0xb93e0004 # kimd %r0,$inp
		brc 1,.-4 # pay attention to "partial completion"
		br %r14
		.align 32
		.Lsoft_ghash:
		___
		$code.=<<___;
		stmg %r6,%r14,48($sp)

		aghi $Xi,-1
		@@ -62,92 +119,94 @@ gcm_ghash_4bit:

		lg $Zlo,8+1($Xi) # Xi
		lg $Zhi,0+1($Xi)
		lghi $tmp,0
		.Louter:
		xg $Zlo,8($inp) # Xi ^= inp
		xg $Zhi,0($inp)
		xg $Zhi,0($inp) # Xi ^= inp
		xg $Zlo,8($inp)
		xgr $Zhi,$tmp
		stg $Zlo,8+1($Xi)
		stg $Zhi,0+1($Xi)

		.Lgmult_shortcut:
		lghi $tmp,0xff
		srlg $xi,$Zlo,8 # extract first two bytes
		lghi $tmp,0xf0
		sllg $nlo,$Zlo,4
		srlg $xi,$Zlo,8 # extract second byte
		ngr $nlo,$tmp
		lgr $nhi,$Zlo
		ngr $xi,$tmp
		ngr $nhi,$tmp

		sllg $nlo,$nhi,4
		nill $nhi,0xf0
		nill $nlo,0xf0
		lghi $cnt,14
		ngr $nhi,$tmp

		lg $Zlo,8($nlo,$Htbl)
		lg $Zhi,0($nlo,$Htbl)

		sllg $nlo,$xi,4
		nill $xi,0xf0
		sllg $rem0,$Zlo,3
		nill $nlo,0xf0

		srlg $Zlo,$Zlo,4
		ngr $nlo,$tmp
		ngr $rem0,$x78
		ngr $xi,$tmp

		sllg $tmp,$Zhi,60
		xg $Zlo,8($nhi,$Htbl)
		srlg $Zlo,$Zlo,4
		srlg $Zhi,$Zhi,4
		xgr $Zlo,$tmp
		xg $Zlo,8($nhi,$Htbl)
		xg $Zhi,0($nhi,$Htbl)
		lgr $nhi,$xi
		sllg $rem1,$Zlo,3

		xgr $Zlo,$tmp
		ngr $rem1,$x78
		j .Lghash_inner
		.align 16
		.Lghash_inner:
		srlg $Zlo,$Zlo,4
		ngr $rem1,$x78
		xg $Zlo,8($nlo,$Htbl)
		sllg $tmp,$Zhi,60
		xg $Zhi,0($rem0,$rem_4bit)
		xgr $Zlo,$tmp
		xg $Zlo,8($nlo,$Htbl)
		srlg $Zhi,$Zhi,4
		llgc $xi,0($cnt,$Xi)
		sllg $rem0,$Zlo,3
		xg $Zhi,0($nlo,$Htbl)
		sllg $nlo,$xi,4
		nill $xi,0xf0
		xg $Zhi,0($rem0,$rem_4bit)
		nill $nlo,0xf0

		srlg $Zlo,$Zlo,4
		sllg $rem0,$Zlo,3
		xgr $Zlo,$tmp
		ngr $rem0,$x78
		xg $Zlo,8($nhi,$Htbl)
		nill $xi,0xf0

		sllg $tmp,$Zhi,60
		xg $Zhi,0($rem1,$rem_4bit)
		xgr $Zlo,$tmp
		srlg $Zlo,$Zlo,4
		srlg $Zhi,$Zhi,4
		sllg $rem1,$Zlo,3
		xg $Zlo,8($nhi,$Htbl)
		xg $Zhi,0($nhi,$Htbl)
		lgr $nhi,$xi
		xg $Zhi,0($rem1,$rem_4bit)
		sllg $rem1,$Zlo,3
		xgr $Zlo,$tmp
		ngr $rem1,$x78
		brct $cnt,.Lghash_inner

		sllg $tmp,$Zhi,60
		srlg $Zlo,$Zlo,4
		ngr $rem1,$x78
		srlg $Zhi,$Zhi,4
		xg $Zlo,8($nlo,$Htbl)
		sllg $tmp,$Zhi,60
		xg $Zhi,0($nlo,$Htbl)
		sllg $xi,$Zlo,3
		xg $Zhi,0($rem0,$rem_4bit)
		xgr $Zlo,$tmp
		srlg $Zhi,$Zhi,4
		sllg $rem0,$Zlo,3
		xg $Zhi,0($nlo,$Htbl)
		ngr $xi,$x78

		srlg $Zlo,$Zlo,4
		ngr $rem0,$x78
		xg $Zhi,0($rem1,$rem_4bit)
		sllg $tmp,$Zhi,60
		xg $Zlo,8($nhi,$Htbl)
		srlg $Zlo,$Zlo,4
		srlg $Zhi,$Zhi,4
		xgr $Zlo,$tmp
		xg $Zlo,8($nhi,$Htbl)
		xg $Zhi,0($nhi,$Htbl)
		xgr $Zlo,$tmp
		xg $Zhi,0($rem1,$rem_4bit)

		lg $tmp,0($xi,$rem_4bit)
		la $inp,16($inp)
		xg $Zhi,0($rem0,$rem_4bit)
		sllg $tmp,$tmp,4 # correct last rem_4bit[rem]
		brctg $len,.Louter

		xgr $Zhi,$tmp
		stg $Zlo,8+1($Xi)
		stg $Zhi,0+1($Xi)
		lmg %r6,%r14,48($sp)
		@@ -157,10 +216,10 @@ gcm_ghash_4bit:

		.align 64
		rem_4bit:
		.long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
		.long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
		.long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
		.long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
		.long `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
		.long `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
		.long `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
		.long `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
		.type rem_4bit,\@object
		.size rem_4bit,(.-rem_4bit)
		.string "GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"