Commit e822c756 authored by Andy Polyakov's avatar Andy Polyakov
Browse files

s390x assembler pack: adapt for -m31 build, see commentary in Configure

for more details.
parent 300b1d76
Loading
Loading
Loading
Loading
+16 −2
Original line number Diff line number Diff line
@@ -134,7 +134,7 @@ my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o::::::::::::void";
my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o:void";
my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o:::::::";
my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o:::::::";
my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:void";
my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o";
my $armv4_asm=":bn_asm.o armv4-mont.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o:void";
my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:32";
my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:64";
@@ -356,7 +356,21 @@ my %table=(
"linux-ia64-ecc","ecc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-ia64-icc","icc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-x86_64",	"gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64",
"linux-s390x",	"gcc:-m64 -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:${s390x_asm}:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64",
"linux64-s390x",	"gcc:-m64 -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:${s390x_asm}:64:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64",
#### So called "highgprs" target for z/Architecture CPUs
# "Highgprs" is kernel feature first implemented in Linux 2.6.32, see
# /proc/cpuinfo. The idea is to preserve most significant bits of
# general purpose registers not only upon 32-bit process context
# switch, but even on asynchronous signal delivery to such process.
# This makes it possible to deploy 64-bit instructions even in legacy
# application context and achieve better [or should we say adequate]
# performance. The build is binary compatible with linux-generic32,
# and the idea is to be able to install the resulting libcrypto.so
# alongside generic one, e.g. as /lib/highgprs/libcrypto.so.x.y, for
# ldconfig and run-time linker to autodiscover. Unfortunately it
# doesn't work just yet, because of a couple of bugs in glibc
# sysdeps/s390/dl-procinfo.c affecting ldconfig and ld.so.1...
"linux32-s390x",	"gcc:-m31 -Wa,-mzarch -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:s390xcap.o s390xcpuid.o:bn_asm.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:31:dlfcn:linux-shared:-fPIC:-m31:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/highgprs",
#### SPARC Linux setups
# Ray Miller <ray.miller@computing-services.oxford.ac.uk> has patiently
# assisted with debugging of following two configs.
+12 −1
Original line number Diff line number Diff line
@@ -629,7 +629,18 @@ case "$GUESSOS" in
  sh*-*-linux2)  OUT="linux-generic32"; options="$options -DL_ENDIAN" ;;
  m68k*-*-linux2) OUT="linux-generic32"; options="$options -DB_ENDIAN" ;;
  s390-*-linux2) OUT="linux-generic32"; options="$options -DB_ENDIAN" ;;
  s390x-*-linux2) OUT="linux-s390x" ;;
  s390x-*-linux2)
	# To be uncommented when glibc bug is fixed, see Configure...
	#if egrep -e '^features.* highgprs' /proc/cpuinfo >/dev/null ; then
	#  echo "WARNING! If you wish to build \"highgprs\" 32-bit library, then you"
	#  echo "         have to invoke './Configure linux32-s390x' *manually*."
	#  if [ "$TEST" = "false" -a -t -1 ]; then
	#    echo "         You have about 5 seconds to press Ctrl-C to abort."
	#    (trap "stty `stty -g`" 2 0; stty -icanon min 0 time 50; read waste) <&1
	#  fi
	#fi
	OUT="linux64-s390x"
	;;
  x86_64-*-linux?) OUT="linux-x86_64" ;;
  *86-*-linux2) OUT="linux-elf"
	if [ "$GCCVER" -gt 28 ]; then
+95 −72
Original line number Diff line number Diff line
@@ -60,6 +60,26 @@
# maximum, but *on average* it would be as much as ~98%. Meaning that
# worst case is unlikely, it's like hitting ravine on plateau.

# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". Latter implies that the code
# remains z/Architecture specific. On z990 it was measured to perform
# 2x better than code generated by gcc 4.3.

# First command-line argument names the target flavour.
$flavour = shift;

# A flavour containing "31" or "32" selects the 31-bit build:
#   $SIZE_T - multiplier used for stack-slot offsets (e.g. 3*$SIZE_T($sp))
#   $g      - mnemonic suffix spliced into instructions (e.g. stm${g}, l${g})
($SIZE_T,$g) = ($flavour =~ /3[12]/) ? (4,"") : (8,"g");

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

@@ -82,6 +102,8 @@ $rounds="%r13";
$ra="%r14";
$sp="%r15";

# Frame size used for stack offsets below: 16 slots of $SIZE_T bytes plus
# 4*8 bytes, i.e. 160 when $SIZE_T is 8 and 96 when $SIZE_T is 4.
$stdframe=16*$SIZE_T+4*8;

sub _data_word()
{ my $i;
    while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
@@ -223,7 +245,7 @@ $code.=<<___ if (!$softonly);
.Lesoft:
___
$code.=<<___;
	stmg	%r3,$ra,24($sp)
	stm${g}	%r3,$ra,3*$SIZE_T($sp)

	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
@@ -233,20 +255,20 @@ $code.=<<___;
	larl	$tbl,AES_Te
	bras	$ra,_s390x_AES_encrypt

	lg	$out,24($sp)
	l${g}	$out,3*$SIZE_T($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

	lmg	%r6,$ra,48($sp)
	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_encrypt,.-AES_encrypt

.type   _s390x_AES_encrypt,\@function
.align	16
_s390x_AES_encrypt:
	stg	$ra,152($sp)
	st${g}	$ra,`$stdframe-$SIZE_T`($sp)
	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
@@ -410,7 +432,7 @@ _s390x_AES_encrypt:
	or	$s2,$i3
	or	$s3,$t3

	lg	$ra,152($sp)
	l${g}	$ra,`$stdframe-$SIZE_T`($sp)
	xr	$s0,$t0
	xr	$s1,$t2
	x	$s2,24($key)
@@ -549,7 +571,7 @@ $code.=<<___ if (!$softonly);
.Ldsoft:
___
$code.=<<___;
	stmg	%r3,$ra,24($sp)
	stm${g}	%r3,$ra,3*$SIZE_T($sp)

	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
@@ -559,20 +581,20 @@ $code.=<<___;
	larl	$tbl,AES_Td
	bras	$ra,_s390x_AES_decrypt

	lg	$out,24($sp)
	l${g}	$out,3*$SIZE_T($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

	lmg	%r6,$ra,48($sp)
	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_decrypt,.-AES_decrypt

.type   _s390x_AES_decrypt,\@function
.align	16
_s390x_AES_decrypt:
	stg	$ra,152($sp)
	st${g}	$ra,`$stdframe-$SIZE_T`($sp)
	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
@@ -716,7 +738,7 @@ _s390x_AES_decrypt:
	nr	$i1,$mask
	nr	$i2,$mask

	lg	$ra,152($sp)
	l${g}	$ra,`$stdframe-$SIZE_T`($sp)
	or	$s1,$t1
	l	$t0,16($key)
	l	$t1,20($key)
@@ -750,9 +772,9 @@ $code.=<<___;
.align	16
AES_set_encrypt_key:
	lghi	$t0,0
	clgr	$inp,$t0
	cl${g}r	$inp,$t0
	je	.Lminus1
	clgr	$key,$t0
	cl${g}r	$key,$t0
	je	.Lminus1

	lghi	$t0,128
@@ -810,7 +832,7 @@ ___
$code.=<<___;
.align	16
.Lekey_internal:
	stmg	%r6,%r13,48($sp)	# all non-volatile regs
	stm${g}	%r6,%r13,6*$SIZE_T($sp)	# all non-volatile regs

	larl	$tbl,AES_Te+2048

@@ -871,7 +893,7 @@ $code.=<<___;
	la	$t3,4($t3)		# i++
	brct	$rounds,.L128_loop
	lghi	%r2,0
	lmg	%r6,%r13,48($sp)
	lm${g}	%r6,%r13,6*$SIZE_T($sp)
	br	$ra

.align	16
@@ -919,7 +941,7 @@ $code.=<<___;
	st	$s3,36($key)
	brct	$rounds,.L192_continue
	lghi	%r2,0
	lmg	%r6,%r13,48($sp)
	lm${g}	%r6,%r13,6*$SIZE_T($sp)
	br	$ra

.align	16
@@ -981,7 +1003,7 @@ $code.=<<___;
	st	$s3,44($key)
	brct	$rounds,.L256_continue
	lghi	%r2,0
	lmg	%r6,%r13,48($sp)
	lm${g}	%r6,%r13,6*$SIZE_T($sp)
	br	$ra

.align	16
@@ -1032,11 +1054,11 @@ $code.=<<___;
.type	AES_set_decrypt_key,\@function
.align	16
AES_set_decrypt_key:
	stg	$key,32($sp)		# I rely on AES_set_encrypt_key to
	stg	$ra,112($sp)		# save non-volatile registers!
	st${g}	$key,4*$SIZE_T($sp)	# I rely on AES_set_encrypt_key to
	st${g}	$ra,14*$SIZE_T($sp)	# save non-volatile registers!
	bras	$ra,AES_set_encrypt_key
	lg	$key,32($sp)
	lg	$ra,112($sp)
	l${g}	$key,4*$SIZE_T($sp)
	l${g}	$ra,14*$SIZE_T($sp)
	ltgr	%r2,%r2
	bnzr	$ra
___
@@ -1051,11 +1073,11 @@ $code.=<<___ if (!$softonly);

.align	16
.Ldkey_internal:
	stg	$key,32($sp)
	stg	$ra,40($sp)
	st${g}	$key,4*$SIZE_T($sp)
	st${g}	$ra,14*$SIZE_T($sp)
	bras	$ra,.Lekey_internal
	lg	$key,32($sp)
	lg	$ra,40($sp)
	l${g}	$key,4*$SIZE_T($sp)
	l${g}	$ra,14*$SIZE_T($sp)
___
$code.=<<___;

@@ -1136,7 +1158,7 @@ $code.=<<___;
	la	$key,4($key)
	brct	$rounds,.Lmix

	lmg	%r6,%r13,48($sp)# as was saved by AES_set_encrypt_key!
	lm${g}	%r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
	lghi	%r2,0
	br	$ra
.size	AES_set_decrypt_key,.-AES_set_decrypt_key
@@ -1176,7 +1198,7 @@ $code.=<<___ if (!$softonly);
	l	%r0,240($key)	# load kmc code
	lghi	$key,15		# res=len%16, len-=res;
	ngr	$key,$len
	slgr	$len,$key
	sl${g}r	$len,$key
	la	%r1,16($sp)	# parameter block - ivec || key
	jz	.Lkmc_truncated
	.long	0xb92f0042	# kmc %r4,%r2
@@ -1194,34 +1216,34 @@ $code.=<<___ if (!$softonly);
	tmll	%r0,0x80
	jnz	.Lkmc_truncated_dec
	lghi	%r1,0
	stg	%r1,128($sp)
	stg	%r1,136($sp)
	stg	%r1,16*$SIZE_T($sp)
	stg	%r1,16*$SIZE_T+8($sp)
	bras	%r1,1f
	mvc	128(1,$sp),0($inp)
	mvc	16*$SIZE_T(1,$sp),0($inp)
1:	ex	$key,0(%r1)
	la	%r1,16($sp)	# restore parameter block
	la	$inp,128($sp)
	la	$inp,16*$SIZE_T($sp)
	lghi	$len,16
	.long	0xb92f0042	# kmc %r4,%r2
	j	.Lkmc_done
.align	16
.Lkmc_truncated_dec:
	stg	$out,64($sp)
	la	$out,128($sp)
	st${g}	$out,4*$SIZE_T($sp)
	la	$out,16*$SIZE_T($sp)
	lghi	$len,16
	.long	0xb92f0042	# kmc %r4,%r2
	lg	$out,64($sp)
	l${g}	$out,4*$SIZE_T($sp)
	bras	%r1,2f
	mvc	0(1,$out),128($sp)
	mvc	0(1,$out),16*$SIZE_T($sp)
2:	ex	$key,0(%r1)
	j	.Lkmc_done
.align	16
.Lcbc_software:
___
$code.=<<___;
	stmg	$key,$ra,40($sp)
	stm${g}	$key,$ra,5*$SIZE_T($sp)
	lhi	%r0,0
	cl	%r0,164($sp)
	cl	%r0,`$stdframe+$SIZE_T-4`($sp)
	je	.Lcbc_decrypt

	larl	$tbl,AES_Te
@@ -1232,10 +1254,10 @@ $code.=<<___;
	llgf	$s3,12($ivp)

	lghi	$t0,16
	slgr	$len,$t0
	sl${g}r	$len,$t0
	brc	4,.Lcbc_enc_tail	# if borrow
.Lcbc_enc_loop:
	stmg	$inp,$out,16($sp)
	stm${g}	$inp,$out,2*$SIZE_T($sp)
	x	$s0,0($inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
@@ -1244,7 +1266,7 @@ $code.=<<___;

	bras	$ra,_s390x_AES_encrypt

	lmg	$inp,$key,16($sp)
	lm${g}	$inp,$key,2*$SIZE_T($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
@@ -1253,33 +1275,33 @@ $code.=<<___;
	la	$inp,16($inp)
	la	$out,16($out)
	lghi	$t0,16
	ltgr	$len,$len
	lt${g}r	$len,$len
	jz	.Lcbc_enc_done
	slgr	$len,$t0
	sl${g}r	$len,$t0
	brc	4,.Lcbc_enc_tail	# if borrow
	j	.Lcbc_enc_loop
.align	16
.Lcbc_enc_done:
	lg	$ivp,48($sp)
	l${g}	$ivp,6*$SIZE_T($sp)
	st	$s0,0($ivp)
	st	$s1,4($ivp)	
	st	$s2,8($ivp)
	st	$s3,12($ivp)

	lmg	%r7,$ra,56($sp)
	lm${g}	%r7,$ra,7*$SIZE_T($sp)
	br	$ra

.align	16
.Lcbc_enc_tail:
	aghi	$len,15
	lghi	$t0,0
	stg	$t0,128($sp)
	stg	$t0,136($sp)
	stg	$t0,16*$SIZE_T($sp)
	stg	$t0,16*$SIZE_T+8($sp)
	bras	$t1,3f
	mvc	128(1,$sp),0($inp)
	mvc	16*$SIZE_T(1,$sp),0($inp)
3:	ex	$len,0($t1)
	lghi	$len,0
	la	$inp,128($sp)
	la	$inp,16*$SIZE_T($sp)
	j	.Lcbc_enc_loop

.align	16
@@ -1288,10 +1310,10 @@ $code.=<<___;

	lg	$t0,0($ivp)
	lg	$t1,8($ivp)
	stmg	$t0,$t1,128($sp)
	stmg	$t0,$t1,16*$SIZE_T($sp)

.Lcbc_dec_loop:
	stmg	$inp,$out,16($sp)
	stm${g}	$inp,$out,2*$SIZE_T($sp)
	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
	llgf	$s2,8($inp)
@@ -1300,7 +1322,7 @@ $code.=<<___;

	bras	$ra,_s390x_AES_decrypt

	lmg	$inp,$key,16($sp)
	lm${g}	$inp,$key,2*$SIZE_T($sp)
	sllg	$s0,$s0,32
	sllg	$s2,$s2,32
	lr	$s0,$s1
@@ -1308,15 +1330,15 @@ $code.=<<___;

	lg	$t0,0($inp)
	lg	$t1,8($inp)
	xg	$s0,128($sp)
	xg	$s2,136($sp)
	xg	$s0,16*$SIZE_T($sp)
	xg	$s2,16*$SIZE_T+8($sp)
	lghi	$s1,16
	slgr	$len,$s1
	sl${g}r	$len,$s1
	brc	4,.Lcbc_dec_tail	# if borrow
	brc	2,.Lcbc_dec_done	# if zero
	stg	$s0,0($out)
	stg	$s2,8($out)
	stmg	$t0,$t1,128($sp)
	stmg	$t0,$t1,16*$SIZE_T($sp)

	la	$inp,16($inp)
	la	$out,16($out)
@@ -1326,7 +1348,7 @@ $code.=<<___;
	stg	$s0,0($out)
	stg	$s2,8($out)
.Lcbc_dec_exit:
	lmg	$ivp,$ra,48($sp)
	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	stmg	$t0,$t1,0($ivp)

	br	$ra
@@ -1334,10 +1356,10 @@ $code.=<<___;
.align	16
.Lcbc_dec_tail:
	aghi	$len,15
	stg	$s0,128($sp)
	stg	$s2,136($sp)
	stg	$s0,16*$SIZE_T($sp)
	stg	$s2,16*$SIZE_T+8($sp)
	bras	$s1,4f
	mvc	0(1,$out),128($sp)
	mvc	0(1,$out),16*$SIZE_T($sp)
4:	ex	$len,0($s1)
	j	.Lcbc_dec_exit
.size	AES_cbc_encrypt,.-AES_cbc_encrypt
@@ -1359,6 +1381,7 @@ $code.=<<___;
.type	AES_ctr32_encrypt,\@function
.align	16
AES_ctr32_encrypt:
	llgfr	$len,$len	# safe in ctr32 subroutine even in 64-bit case
___
$code.=<<___ if (!$softonly);
	l	%r0,240($key)
@@ -1366,7 +1389,7 @@ $code.=<<___ if (!$softonly);
	clr	%r0,%r1
	jl	.Lctr32_software

	stmg	%r6,$s3,48($sp)
	stm${g}	%r6,$s3,6*$SIZE_T($sp)

	slgr	$out,$inp
	la	%r1,0($key)	# %r1 is permanent copy of $key
@@ -1388,14 +1411,14 @@ $code.=<<___ if (!$softonly);

	la	$sp,1024($s0)	# alloca
	srlg	$fp,$fp,4	# convert bytes to blocks, minimum 16
	stg	$s2,0($sp)	# back-chain
	stg	$fp,8($sp)
	st${g}	$s2,0($sp)	# back-chain
	st${g}	$fp,$SIZE_T($sp)

	slgr	$len,$fp
	brc	1,.Lctr32_hw_loop	# not zero, no borrow
	algr	$fp,$len	# input is shorter than allocated buffer
	lghi	$len,0
	stg	$fp,8($sp)
	st${g}	$fp,$SIZE_T($sp)

.Lctr32_hw_loop:
	la	$s2,16($sp)
@@ -1432,8 +1455,8 @@ $code.=<<___ if (!$softonly);
	lghi	$len,0
	brc	4+1,.Lctr32_hw_loop	# not zero

	lg	$s0,0($sp)
	lg	$s1,8($sp)
	l${g}	$s0,0($sp)
	l${g}	$s1,$SIZE_T($sp)
	la	$s2,16($sp)
.Lctr32_hw_zap:
	stg	$s0,0($s2)
@@ -1442,30 +1465,30 @@ $code.=<<___ if (!$softonly);
	brct	$s1,.Lctr32_hw_zap

	la	$sp,0($s0)
	lmg	%r6,$s3,48($sp)
	lm${g}	%r6,$s3,6*$SIZE_T($sp)
	br	$ra
.align	16
.Lctr32_software:
___
$code.=<<___;
	stmg	$key,$ra,40($sp)
	slgr	$out,$inp
	stm${g}	$key,$ra,5*$SIZE_T($sp)
	sl${g}r	$out,$inp
	larl	$tbl,AES_Te
	llgf	$t1,12($ivp)

.Lctr32_loop:
	stmg	$inp,$len,16($sp)
	stm${g}	$inp,$len,2*$SIZE_T($sp)
	llgf	$s0,0($ivp)
	llgf	$s1,4($ivp)
	llgf	$s2,8($ivp)
	lgr	$s3,$t1
	st	$t1,128($sp)
	st	$t1,16*$SIZE_T($sp)
	lgr	%r4,$key

	bras	$ra,_s390x_AES_encrypt

	lmg	$inp,$ivp,16($sp)
	llgf	$t1,128($sp)
	lm${g}	$inp,$ivp,2*$SIZE_T($sp)
	llgf	$t1,16*$SIZE_T($sp)
	x	$s0,0($inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
@@ -1479,7 +1502,7 @@ $code.=<<___;
	ahi	$t1,1		# 32-bit increment
	brct	$len,.Lctr32_loop

	lmg	%r6,$ra,48($sp)
	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_ctr32_encrypt,.-AES_ctr32_encrypt
___
+74 −25
Original line number Diff line number Diff line
@@ -32,9 +32,33 @@
# Reschedule to minimize/avoid Address Generation Interlock hazard,
# make inner loops counter-based.

# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". Latter implies that the code
# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG
# is achieved by swapping words after 64-bit loads, follow _dswap-s.
# On z990 it was measured to perform 2.6-2.2 times better, less for
# longer keys...

# First command-line argument names the target flavour.
$flavour = shift;

# A flavour containing "31" or "32" selects the 31-bit build:
#   $SIZE_T - multiplier used for stack-slot offsets (e.g. 2*$SIZE_T($sp))
#   $g      - mnemonic suffix spliced into instructions (e.g. st${g}, lm${g})
($SIZE_T,$g) = ($flavour =~ /3[12]/) ? (4,"") : (8,"g");

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

# Frame size used for stack offsets below: 16 slots of $SIZE_T bytes plus
# 4*8 bytes, i.e. 160 when $SIZE_T is 8 and 96 when $SIZE_T is 4.
$stdframe=16*$SIZE_T+4*8;

$mn0="%r0";
$num="%r1";

@@ -63,34 +87,44 @@ $code.=<<___;
.globl	bn_mul_mont
.type	bn_mul_mont,\@function
bn_mul_mont:
	lgf	$num,164($sp)	# pull $num
	sla	$num,3		# $num to enumerate bytes
	lgf	$num,`$stdframe+$SIZE_T-4`($sp)	# pull $num
	sla	$num,`log($SIZE_T)/log(2)`	# $num to enumerate bytes
	la	$bp,0($num,$bp)

	stg	%r2,16($sp)
	st${g}	%r2,2*$SIZE_T($sp)

	cghi	$num,16		#
	lghi	%r2,0		#
	blr	%r14		# if($num<16) return 0;
___
$code.=<<___ if ($flavour =~ /3[12]/);
	tmll	$num,4
	bnzr	%r14		# if ($num&1) return 0;
___
$code.=<<___ if ($flavour !~ /3[12]/);
	cghi	$num,128	#
	bhr	%r14		# if($num>128) return 0;
___
$code.=<<___;
	stm${g}	%r3,%r15,3*$SIZE_T($sp)

	stmg	%r3,%r15,24($sp)

	lghi	$rp,-160-8	# leave room for carry bit
	lghi	$rp,-$stdframe-8	# leave room for carry bit
	lcgr	$j,$num		# -$num
	lgr	%r0,$sp
	la	$rp,0($rp,$sp)
	la	$sp,0($j,$rp)	# alloca
	stg	%r0,0($sp)	# back chain
	st${g}	%r0,0($sp)	# back chain

	sra	$num,3		# restore $num
	la	$bp,0($j,$bp)	# restore $bp
	ahi	$num,-1		# adjust $num for inner loop
	lg	$n0,0($n0)	# pull n0
	_dswap	$n0

	lg	$bi,0($bp)
	_dswap	$bi
	lg	$alo,0($ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[0]*bp[0]
	lgr	$AHI,$ahi

@@ -98,6 +132,7 @@ bn_mul_mont:
	msgr	$mn0,$n0

	lg	$nlo,0($np)	#
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[0]*m1
	algr	$nlo,$alo	# +="tp[0]"
	lghi	$NHI,0
@@ -109,12 +144,14 @@ bn_mul_mont:
.align	16
.L1st:
	lg	$alo,0($j,$ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[j]*bp[0]
	algr	$alo,$AHI
	lghi	$AHI,0
	alcgr	$AHI,$ahi

	lg	$nlo,0($j,$np)
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[j]*m1
	algr	$nlo,$NHI
	lghi	$NHI,0
@@ -122,22 +159,24 @@ bn_mul_mont:
	algr	$nlo,$alo
	alcgr	$NHI,$nhi

	stg	$nlo,160-8($j,$sp)	# tp[j-1]=
	stg	$nlo,$stdframe-8($j,$sp)	# tp[j-1]=
	la	$j,8($j)	# j++
	brct	$count,.L1st

	algr	$NHI,$AHI
	lghi	$AHI,0
	alcgr	$AHI,$AHI	# upmost overflow bit
	stg	$NHI,160-8($j,$sp)
	stg	$AHI,160($j,$sp)
	stg	$NHI,$stdframe-8($j,$sp)
	stg	$AHI,$stdframe($j,$sp)
	la	$bp,8($bp)	# bp++

.Louter:
	lg	$bi,0($bp)	# bp[i]
	_dswap	$bi
	lg	$alo,0($ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[0]*bp[i]
	alg	$alo,160($sp)	# +=tp[0]
	alg	$alo,$stdframe($sp)	# +=tp[0]
	lghi	$AHI,0
	alcgr	$AHI,$ahi

@@ -145,6 +184,7 @@ bn_mul_mont:
	msgr	$mn0,$n0	# tp[0]*n0

	lg	$nlo,0($np)	# np[0]
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[0]*m1
	algr	$nlo,$alo	# +="tp[0]"
	lghi	$NHI,0
@@ -156,14 +196,16 @@ bn_mul_mont:
.align	16
.Linner:
	lg	$alo,0($j,$ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[j]*bp[i]
	algr	$alo,$AHI
	lghi	$AHI,0
	alcgr	$ahi,$AHI
	alg	$alo,160($j,$sp)# +=tp[j]
	alg	$alo,$stdframe($j,$sp)# +=tp[j]
	alcgr	$AHI,$ahi

	lg	$nlo,0($j,$np)
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[j]*m1
	algr	$nlo,$NHI
	lghi	$NHI,0
@@ -171,31 +213,33 @@ bn_mul_mont:
	algr	$nlo,$alo	# +="tp[j]"
	alcgr	$NHI,$nhi

	stg	$nlo,160-8($j,$sp)	# tp[j-1]=
	stg	$nlo,$stdframe-8($j,$sp)	# tp[j-1]=
	la	$j,8($j)	# j++
	brct	$count,.Linner

	algr	$NHI,$AHI
	lghi	$AHI,0
	alcgr	$AHI,$AHI
	alg	$NHI,160($j,$sp)# accumulate previous upmost overflow bit
	alg	$NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
	lghi	$ahi,0
	alcgr	$AHI,$ahi	# new upmost overflow bit
	stg	$NHI,160-8($j,$sp)
	stg	$AHI,160($j,$sp)
	stg	$NHI,$stdframe-8($j,$sp)
	stg	$AHI,$stdframe($j,$sp)

	la	$bp,8($bp)	# bp++
	clg	$bp,160+8+32($j,$sp)	# compare to &bp[num]
	cl${g}	$bp,`$stdframe+8+4*$SIZE_T`($j,$sp)	# compare to &bp[num]
	jne	.Louter

	lg	$rp,160+8+16($j,$sp)	# reincarnate rp
	la	$ap,160($sp)
	l${g}	$rp,`$stdframe+8+2*$SIZE_T`($j,$sp)	# reincarnate rp
	la	$ap,$stdframe($sp)
	ahi	$num,1		# restore $num, incidentally clears "borrow"

	la	$j,0(%r0)
	lr	$count,$num
.Lsub:	lg	$alo,0($j,$ap)
	slbg	$alo,0($j,$np)
	lg	$nlo,0($j,$np)
	_dswap	$nlo
	slbgr	$alo,$nlo
	stg	$alo,0($j,$rp)
	la	$j,8($j)
	brct	$count,.Lsub
@@ -211,18 +255,23 @@ bn_mul_mont:
	la	$j,0(%r0)
	lgr	$count,$num
.Lcopy:	lg	$alo,0($j,$ap)		# copy or in-place refresh
	stg	$j,160($j,$sp)	# zap tp
	_dswap	$alo
	stg	$j,$stdframe($j,$sp)	# zap tp
	stg	$alo,0($j,$rp)
	la	$j,8($j)
	brct	$count,.Lcopy

	la	%r1,160+8+48($j,$sp)
	lmg	%r6,%r15,0(%r1)
	la	%r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
	lm${g}	%r6,%r15,0(%r1)
	lghi	%r2,1		# signal "processed"
	br	%r14
.size	bn_mul_mont,.-bn_mul_mont
.string	"Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___

print $code;
# Post-process the accumulated assembly in $code one line at a time and
# write each resulting line to STDOUT.
foreach (split("\n",$code)) {
	# Replace every `...` span with the result of evaluating its contents
	# as Perl, e.g. `$stdframe+$SIZE_T-4` becomes a plain number.
	s/\`([^\`]*)\`/eval $1/ge;
	# Expand the "_dswap %rN" pseudo-op: when $SIZE_T is 4 it becomes
	# "rllg %rN,%rN,32" (rotate by 32 bits, swapping the two 32-bit words
	# after a 64-bit load); otherwise the replacement evaluates to an
	# empty string, so the pseudo-op is removed from the output.
	s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e;
	print $_,"\n";
}
close STDOUT;
+26 −3
Original line number Diff line number Diff line
@@ -18,6 +18,26 @@
# and the result should be close to 12. In the lack of instruction-
# level profiling data it's impossible to tell why...

# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". Latter implies that the code
# remains z/Architecture specific. On z990 it was measured to perform
# 2.8x better than 32-bit code generated by gcc 4.3.

# First command-line argument names the target flavour.
$flavour = shift;

# A flavour containing "31" or "32" selects the 31-bit build:
#   $SIZE_T - multiplier used for stack-slot offsets (e.g. 6*$SIZE_T($sp))
#   $g      - mnemonic suffix spliced into instructions (e.g. stm${g}, lm${g})
($SIZE_T,$g) = ($flavour =~ /3[12]/) ? (4,"") : (8,"g");

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

@@ -74,7 +94,7 @@ $code.=<<___ if(!$softonly);
.Lsoft_gmult:
___
$code.=<<___;
	stmg	%r6,%r14,48($sp)
	stm${g}	%r6,%r14,6*$SIZE_T($sp)

	aghi	$Xi,-1
	lghi	$len,1
@@ -109,8 +129,11 @@ $code.=<<___ if(!$softonly);
.align	32
.Lsoft_ghash:
___
# For 31/32-bit flavours only: zero-extend the low 32 bits of $len to the
# full 64-bit register before the 64-bit shift (srlg) that follows.
# This must be appended to $code, the same buffer that every other
# fragment here is appended to, or the instruction never reaches the
# generated assembly.
$code.=<<___ if ($flavour =~ /3[12]/);
	llgfr	$len,$len
___
$code.=<<___;
	stmg	%r6,%r14,48($sp)
	stm${g}	%r6,%r14,6*$SIZE_T($sp)

	aghi	$Xi,-1
	srlg	$len,$len,4
@@ -209,7 +232,7 @@ $code.=<<___;
	xgr	$Zhi,$tmp
	stg	$Zlo,8+1($Xi)
	stg	$Zhi,0+1($Xi)
	lmg	%r6,%r14,48($sp)
	lm${g}	%r6,%r14,6*$SIZE_T($sp)
	br	%r14
.type	gcm_ghash_4bit,\@function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
Loading