Commit 0c237e42 authored by Andy Polyakov's avatar Andy Polyakov
Browse files

s390x assembler pack: add s390x-gf2m.pl and harmonize AES_xts_[en|de]crypt.

parent 0772f3b4
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -134,7 +134,7 @@ my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o::::::::::::void";
my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o:void";
my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o:::::::";
my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o:::::::";
my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o";
my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o";
my $armv4_asm=":bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o:void";
my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:32";
my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:64";
@@ -370,7 +370,7 @@ my %table=(
# ldconfig and run-time linker to autodiscover. Unfortunately it
# doesn't work just yet, because of couple of bugs in glibc
# sysdep/s390/dl-procinfo.c affecting ldconfig and ld.so.1...
"linux32-s390x",	"gcc:-m31 -Wa,-mzarch -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:s390xcap.o s390xcpuid.o:bn_asm.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:31:dlfcn:linux-shared:-fPIC:-m31:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/highgprs",
"linux32-s390x",	"gcc:-m31 -Wa,-mzarch -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:s390xcap.o s390xcpuid.o:bn_asm.o s390x-mont.o s390x-gf2m.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:31:dlfcn:linux-shared:-fPIC:-m31:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/highgprs",
#### SPARC Linux setups
# Ray Miller <ray.miller@computing-services.oxford.ac.uk> has patiently
# assisted with debugging of following two configs.
+2 −2
Original line number Diff line number Diff line
@@ -4105,7 +4105,7 @@ $sys_id =
$lflags       = -ldl
$bn_ops       = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj    = s390xcap.o s390xcpuid.o
$bn_obj       = bn_asm.o s390x-mont.o
$bn_obj       = bn_asm.o s390x-mont.o s390x-gf2m.o
$des_obj      = 
$aes_obj      = aes_ctr.o aes-s390x.o
$bf_obj       = 
@@ -4137,7 +4137,7 @@ $sys_id =
$lflags       = -ldl
$bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj    = s390xcap.o s390xcpuid.o
$bn_obj       = bn-s390x.o s390x-mont.o
$bn_obj       = bn-s390x.o s390x-mont.o s390x-gf2m.o
$des_obj      = 
$aes_obj      = aes_ctr.o aes-s390x.o
$bf_obj       = 
+12 −11
Original line number Diff line number Diff line
@@ -78,9 +78,9 @@

# February 2011.
#
# Add AES_xts_[en|de]crypt. This includes support for z196
# km-xts-aes instructions, which deliver ~70% improvement at 8KB
# block size over vanilla km-based code.
# Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
# instructions, which deliver ~70% improvement at 8KB block size over
# vanilla km-based code, 37% - at most like 512-bytes block size.

$flavour = shift;

@@ -1579,7 +1579,8 @@ ___

########################################################################
# void AES_xts_encrypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,u64 secno);
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
{
my $inp="%r2";
@@ -1595,7 +1596,7 @@ $code.=<<___;
.align	16
_s390x_xts_km:
___
$code.=<<___ if(0);
$code.=<<___ if(1);
	llgfr	$s0,%r0			# put aside the function code
	lghi	$s1,0x7f
	nr	$s1,%r0
@@ -1789,9 +1790,10 @@ $code.=<<___ if (!$softonly);
	sllg	$len,$len,4		# $len&=~15
	slgr	$out,$inp

	lrvg	$s0,$stdframe($sp)	# load secno
	lghi	$s1,0
	# generate the tweak value
	l${g}	$s3,$stdframe($sp)	# pointer to iv
	la	$s2,$tweak($sp)
	lmg	$s0,$s1,0($s3)
	lghi	$s3,16
	stmg	$s0,$s1,0($s2)
	la	%r1,0($key2)		# $key2 is not needed anymore
@@ -1996,12 +1998,11 @@ $code.=<<___ if (!$softonly);
	slgr	$out,$inp

	# generate the tweak value
	lrvg	$s0,$stdframe($sp)	# load secno
	lghi	$s1,0
	l${g}	$s3,$stdframe($sp)	# pointer to iv
	la	$s2,$tweak($sp)
	lmg	$s0,$s1,0($s3)
	lghi	$s3,16
	stg	$s0,0($s2)
	stg	$s1,8($s2)
	stmg	$s0,$s1,0($s2)
	la	%r1,0($key2)		# $key2 is not needed past this point
	.long	0xb92e00aa		# km $s2,$s2, generate the tweak
	brc	1,.-4			# can this happen?
+2 −0
Original line number Diff line number Diff line
@@ -91,6 +91,8 @@ mips-mont.s: asm/mips-mont.pl

bn-s390x.o:	asm/s390x.S
	$(CC) $(CFLAGS) -c -o $@ asm/s390x.S
s390x-gf2m.s:	asm/s390x-gfm2.pl
	$(PERL) asm/s390x-gfm2.pl $(PERLASM_SCHEME) $@

x86_64-gcc.o:	asm/x86_64-gcc.c
	$(CC) $(CFLAGS) -c -o $@ asm/x86_64-gcc.c
+220 −0
Original line number Diff line number Diff line
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... gcc 4.3 appeared to generate poor code, therefore
# the effort. The module delivers 55%-90% improvement on haviest ECDSA
# verify and ECDH benchmarks for 163- and 571-bit keys on z990, and
# 25%-30% - on z196(*). This is for 64-bit build. In 32-bit "highgprs"
# case improvement is even higher, for example on z990 it was measured
# 80%-150%. ECDSA sign is modest 9%-12% faster. Keep in mind that
# these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not
# all CPU time is burnt in it...
#
# (*)	Though no improvement could be measured if compared to code
#	generated by gcc 4.1. Keep in mind that z196 is out-of-order
#	execution core and is better at executing poor code.

$flavour = shift;

if ($flavour =~ /3[12]/) {
        $SIZE_T=4;
        $g="";
} else {
        $SIZE_T=8;
        $g="g";
}

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$stdframe=16*$SIZE_T+4*8;

$rp="%r2";
$a1="%r3";
$a0="%r4";
$b1="%r5";
$b0="%r6";

$ra="%r14";
$sp="%r15";

@T=("%r0","%r1");
@i=("%r12","%r13");

($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11));
($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8;

$code.=<<___;
.text

.type	_mul_1x1,\@function
.align	16
_mul_1x1:
	lgr	$a1,$a
	sllg	$a2,$a,1
	sllg	$a4,$a,2
	sllg	$a8,$a,3

	srag	$lo,$a1,63			# broadcast 63rd bit
	nihh	$a1,0x1fff
	srag	@i[0],$a2,63			# broadcast 62nd bit
	nihh	$a2,0x3fff
	srag	@i[1],$a4,63			# broadcast 61st bit
	nihh	$a4,0x7fff
	ngr	$lo,$b
	ngr	@i[0],$b
	ngr	@i[1],$b

	lghi	@T[0],0
	lgr	$a12,$a1
	stg	@T[0],`$stdframe+0*8`($sp)	# tab[0]=0
	xgr	$a12,$a2
	stg	$a1,`$stdframe+1*8`($sp)	# tab[1]=a1
	 lgr	$a48,$a4
	stg	$a2,`$stdframe+2*8`($sp)	# tab[2]=a2
	 xgr	$a48,$a8
	stg	$a12,`$stdframe+3*8`($sp)	# tab[3]=a1^a2
	 xgr	$a1,$a4

	stg	$a4,`$stdframe+4*8`($sp)	# tab[4]=a4
	xgr	$a2,$a4
	stg	$a1,`$stdframe+5*8`($sp)	# tab[5]=a1^a4
	xgr	$a12,$a4
	stg	$a2,`$stdframe+6*8`($sp)	# tab[6]=a2^a4
	 xgr	$a1,$a48
	stg	$a12,`$stdframe+7*8`($sp)	# tab[7]=a1^a2^a4
	 xgr	$a2,$a48

	stg	$a8,`$stdframe+8*8`($sp)	# tab[8]=a8
	xgr	$a12,$a48
	stg	$a1,`$stdframe+9*8`($sp)	# tab[9]=a1^a8
	 xgr	$a1,$a4
	stg	$a2,`$stdframe+10*8`($sp)	# tab[10]=a2^a8
	 xgr	$a2,$a4
	stg	$a12,`$stdframe+11*8`($sp)	# tab[11]=a1^a2^a8

	xgr	$a12,$a4
	stg	$a48,`$stdframe+12*8`($sp)	# tab[12]=a4^a8
	 srlg	$hi,$lo,1
	stg	$a1,`$stdframe+13*8`($sp)	# tab[13]=a1^a4^a8
	 sllg	$lo,$lo,63
	stg	$a2,`$stdframe+14*8`($sp)	# tab[14]=a2^a4^a8
	 srlg	@T[0],@i[0],2
	stg	$a12,`$stdframe+15*8`($sp)	# tab[15]=a1^a2^a4^a8

	lghi	$mask,`0xf<<3`
	sllg	$a1,@i[0],62
	 sllg	@i[0],$b,3
	srlg	@T[1],@i[1],3
	 ngr	@i[0],$mask
	sllg	$a2,@i[1],61
	 srlg	@i[1],$b,4-3
	xgr	$hi,@T[0]
	 ngr	@i[1],$mask
	xgr	$lo,$a1
	xgr	$hi,@T[1]
	xgr	$lo,$a2

	xg	$lo,$stdframe(@i[0],$sp)
	srlg	@i[0],$b,8-3
	ngr	@i[0],$mask
___
for($n=1;$n<14;$n++) {
$code.=<<___;
	lg	@T[1],$stdframe(@i[1],$sp)
	srlg	@i[1],$b,`($n+2)*4`-3
	sllg	@T[0],@T[1],`$n*4`
	ngr	@i[1],$mask
	srlg	@T[1],@T[1],`64-$n*4`
	xgr	$lo,@T[0]
	xgr	$hi,@T[1]
___
	push(@i,shift(@i)); push(@T,shift(@T));
}
$code.=<<___;
	lg	@T[1],$stdframe(@i[1],$sp)
	sllg	@T[0],@T[1],`$n*4`
	srlg	@T[1],@T[1],`64-$n*4`
	xgr	$lo,@T[0]
	xgr	$hi,@T[1]

	lg	@T[0],$stdframe(@i[0],$sp)
	sllg	@T[1],@T[0],`($n+1)*4`
	srlg	@T[0],@T[0],`64-($n+1)*4`
	xgr	$lo,@T[1]
	xgr	$hi,@T[0]

	br	$ra
.size	_mul_1x1,.-_mul_1x1

.globl	bn_GF2m_mul_2x2
.type	bn_GF2m_mul_2x2,\@function
.align	16
bn_GF2m_mul_2x2:
	stm${g}	%r3,%r15,3*$SIZE_T($sp)

	lghi	%r1,-$stdframe-128
	la	%r0,0($sp)
	la	$sp,0(%r1,$sp)			# alloca
	st${g}	%r0,0($sp)			# back chain
___
if ($SIZE_T==8) {
my @r=map("%r$_",(6..9));
$code.=<<___;
	bras	$ra,_mul_1x1			# a1b1
	stmg	$lo,$hi,16($rp)

	lg	$a,`$stdframe+128+4*$SIZE_T`($sp)
	lg	$b,`$stdframe+128+6*$SIZE_T`($sp)
	bras	$ra,_mul_1x1			# a0b0
	stmg	$lo,$hi,0($rp)

	lg	$a,`$stdframe+128+3*$SIZE_T`($sp)
	lg	$b,`$stdframe+128+5*$SIZE_T`($sp)
	xg	$a,`$stdframe+128+4*$SIZE_T`($sp)
	xg	$b,`$stdframe+128+6*$SIZE_T`($sp)
	bras	$ra,_mul_1x1			# (a0+a1)(b0+b1)
	lmg	@r[0],@r[3],0($rp)

	xgr	$lo,$hi
	xgr	$hi,@r[1]
	xgr	$lo,@r[0]
	xgr	$hi,@r[2]
	xgr	$lo,@r[3]	
	xgr	$hi,@r[3]
	xgr	$lo,$hi
	stg	$hi,16($rp)
	stg	$lo,8($rp)
___
} else {
$code.=<<___;
	sllg	%r3,%r3,32
	sllg	%r5,%r5,32
	or	%r3,%r4
	or	%r5,%r6
	bras	$ra,_mul_1x1
	rllg	$lo,$lo,32
	rllg	$hi,$hi,32
	stmg	$lo,$hi,0($rp)
___
}
$code.=<<___;
	lm${g}	%r6,%r15,`$stdframe+128+6*$SIZE_T`($sp)
	br	$ra
.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.string	"GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___

$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;