Commit 0ab8fd58 authored by Andy Polyakov's avatar Andy Polyakov
Browse files

s390x assembler pack: tune-up and support for new z196 hardware.

parent 8aa6cff4
Loading
Loading
Loading
Loading
+764 −25
Original line number Diff line number Diff line
@@ -70,6 +70,18 @@
# remains z/Architecture specific. On z990 it was measured to perform
# 2x better than code generated by gcc 4.3.

# December 2010.
#
# Add support for z196 "cipher message with counter" instruction.
# Note however that it's disengaged, because it was measured to
# perform ~12% worse than vanilla km-based code...

# February 2011.
#
# Add AES_xts_[en|de]crypt. This includes support for z196
# km-xts-aes instructions, which deliver ~70% improvement at 8KB
# block size over vanilla km-based code.

$flavour = shift;

if ($flavour =~ /3[12]/) {
@@ -268,7 +280,7 @@ $code.=<<___;
.type   _s390x_AES_encrypt,\@function
.align	16
_s390x_AES_encrypt:
	st${g}	$ra,`$stdframe-$SIZE_T`($sp)
	st${g}	$ra,15*$SIZE_T($sp)
	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
@@ -432,7 +444,7 @@ _s390x_AES_encrypt:
	or	$s2,$i3
	or	$s3,$t3

	l${g}	$ra,`$stdframe-$SIZE_T`($sp)
	l${g}	$ra,15*$SIZE_T($sp)
	xr	$s0,$t0
	xr	$s1,$t2
	x	$s2,24($key)
@@ -594,7 +606,7 @@ $code.=<<___;
.type   _s390x_AES_decrypt,\@function
.align	16
_s390x_AES_decrypt:
	st${g}	$ra,`$stdframe-$SIZE_T`($sp)
	st${g}	$ra,15*$SIZE_T($sp)
	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
@@ -738,7 +750,7 @@ _s390x_AES_decrypt:
	nr	$i1,$mask
	nr	$i2,$mask

	l${g}	$ra,`$stdframe-$SIZE_T`($sp)
	l${g}	$ra,15*$SIZE_T($sp)
	or	$s1,$t1
	l	$t0,16($key)
	l	$t1,20($key)
@@ -1164,6 +1176,7 @@ $code.=<<___;
.size	AES_set_decrypt_key,.-AES_set_decrypt_key
___

########################################################################
# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
#                     size_t length, const AES_KEY *key,
#                     unsigned char *ivec, const int enc)
@@ -1365,13 +1378,14 @@ $code.=<<___;
.size	AES_cbc_encrypt,.-AES_cbc_encrypt
___
}
########################################################################
# void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
#                     size_t blocks, const AES_KEY *key,
#                     const unsigned char *ivec)
{
my $inp="%r2";
my $out="%r3";
my $len="%r4";
my $out="%r4";	# blocks and out are swapped
my $len="%r3";
my $key="%r5";	my $iv0="%r5";
my $ivp="%r6";
my $fp ="%r7";
@@ -1381,6 +1395,9 @@ $code.=<<___;
.type	AES_ctr32_encrypt,\@function
.align	16
AES_ctr32_encrypt:
	xgr	%r3,%r4		# flip %r3 and %r4, $out and $len
	xgr	%r4,%r3
	xgr	%r3,%r4
	llgfr	$len,$len	# safe in ctr32 subroutine even in 64-bit case
___
$code.=<<___ if (!$softonly);
@@ -1415,20 +1432,75 @@ $code.=<<___ if (!$softonly);
	st${g}	$fp,$SIZE_T($sp)

	slgr	$len,$fp
	brc	1,.Lctr32_hw_loop	# not zero, no borrow
	brc	1,.Lctr32_hw_switch	# not zero, no borrow
	algr	$fp,$len	# input is shorter than allocated buffer
	lghi	$len,0
	st${g}	$fp,$SIZE_T($sp)

.Lctr32_hw_loop:
.Lctr32_hw_switch:
___
$code.=<<___ if (0);	######### kmctr code was measured to be ~12% slower
	larl	$s0,OPENSSL_s390xcap_P
	lg	$s0,8($s0)
	tmhh	$s0,0x0004	# check for message_security-assist-4
	jz	.Lctr32_km_loop

	llgfr	$s0,%r0
	lgr	$s1,%r1
	lghi	%r0,0
	la	%r1,16($sp)
	.long	0xb92d2042	# kmctr %r4,%r2,%r2

	llihh	%r0,0x8000	# check if kmctr supports the function code
	srlg	%r0,%r0,0($s0)
	ng	%r0,16($sp)
	lgr	%r0,$s0
	lgr	%r1,$s1
	jz	.Lctr32_km_loop

####### kmctr code
	algr	$out,$inp	# restore $out
	lgr	$s1,$len	# $s1 undertakes $len
	j	.Lctr32_kmctr_loop
.align	16
.Lctr32_kmctr_loop:
	la	$s2,16($sp)
	lgr	$s3,$fp
.Lctr32_hw_prepare:
.Lctr32_kmctr_prepare:
	stg	$iv0,0($s2)
	stg	$ivp,8($s2)
	la	$s2,16($s2)
	ahi	$ivp,1		# 32-bit increment, preserves upper half
	brct	$s3,.Lctr32_hw_prepare
	brct	$s3,.Lctr32_kmctr_prepare

	#la	$inp,0($inp)	# inp
	sllg	$len,$fp,4	# len
	#la	$out,0($out)	# out
	la	$s2,16($sp)	# iv
	.long	0xb92da042	# kmctr $out,$s2,$inp
	brc	1,.-4		# pay attention to "partial completion"

	slgr	$s1,$fp
	brc	1,.Lctr32_kmctr_loop	# not zero, no borrow
	algr	$fp,$s1
	lghi	$s1,0
	brc	4+1,.Lctr32_kmctr_loop	# not zero

	l${g}	$sp,0($sp)
	lm${g}	%r6,$s3,6*$SIZE_T($sp)
	br	$ra
.align	16
___
$code.=<<___;
.Lctr32_km_loop:
	la	$s2,16($sp)
	lgr	$s3,$fp
.Lctr32_km_prepare:
	stg	$iv0,0($s2)
	stg	$ivp,8($s2)
	la	$s2,16($s2)
	ahi	$ivp,1		# 32-bit increment, preserves upper half
	brct	$s3,.Lctr32_km_prepare

	la	$s0,16($sp)	# inp
	sllg	$s1,$fp,4	# len
@@ -1439,7 +1511,7 @@ $code.=<<___ if (!$softonly);
	la	$s2,16($sp)
	lgr	$s3,$fp
	slgr	$s2,$inp
.Lctr32_hw_xor:
.Lctr32_km_xor:
	lg	$s0,0($inp)
	lg	$s1,8($inp)
	xg	$s0,0($s2,$inp)
@@ -1447,22 +1519,22 @@ $code.=<<___ if (!$softonly);
	stg	$s0,0($out,$inp)
	stg	$s1,8($out,$inp)
	la	$inp,16($inp)
	brct	$s3,.Lctr32_hw_xor
	brct	$s3,.Lctr32_km_xor

	slgr	$len,$fp
	brc	1,.Lctr32_hw_loop	# not zero, no borrow
	brc	1,.Lctr32_km_loop	# not zero, no borrow
	algr	$fp,$len
	lghi	$len,0
	brc	4+1,.Lctr32_hw_loop	# not zero
	brc	4+1,.Lctr32_km_loop	# not zero

	l${g}	$s0,0($sp)
	l${g}	$s1,$SIZE_T($sp)
	la	$s2,16($sp)
.Lctr32_hw_zap:
.Lctr32_km_zap:
	stg	$s0,0($s2)
	stg	$s0,8($s2)
	la	$s2,16($s2)
	brct	$s1,.Lctr32_hw_zap
	brct	$s1,.Lctr32_km_zap

	la	$sp,0($s0)
	lm${g}	%r6,$s3,6*$SIZE_T($sp)
@@ -1472,12 +1544,12 @@ $code.=<<___ if (!$softonly);
___
$code.=<<___;
	stm${g}	$key,$ra,5*$SIZE_T($sp)
	sl${g}r	$out,$inp
	sl${g}r	$inp,$out
	larl	$tbl,AES_Te
	llgf	$t1,12($ivp)

.Lctr32_loop:
	stm${g}	$inp,$len,2*$SIZE_T($sp)
	stm${g}	$inp,$out,2*$SIZE_T($sp)
	llgf	$s0,0($ivp)
	llgf	$s1,4($ivp)
	llgf	$s2,8($ivp)
@@ -1489,27 +1561,694 @@ $code.=<<___;

	lm${g}	$inp,$ivp,2*$SIZE_T($sp)
	llgf	$t1,16*$SIZE_T($sp)
	x	$s0,0($inp)
	x	$s0,0($inp,$out)
	x	$s1,4($inp,$out)
	x	$s2,8($inp,$out)
	x	$s3,12($inp,$out)
	stm	$s0,$s3,0($out)

	la	$out,16($out)
	ahi	$t1,1		# 32-bit increment
	brct	$len,.Lctr32_loop

	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_ctr32_encrypt,.-AES_ctr32_encrypt
___
}

########################################################################
# void AES_xts_encrypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,u64 secno);
#
{
my $inp="%r2";
my $out="%r4";	# len and out are swapped
my $len="%r3";
my $key1="%r5";	# $i1
my $key2="%r6";	# $i2
my $fp="%r7";	# $i3
my $tweak=16*$SIZE_T+16;	# or $stdframe-16, bottom of the frame...

$code.=<<___;
.type	_s390x_xts_km,\@function
.align	16
_s390x_xts_km:
___
$code.=<<___ if(0);
	llgfr	$s0,%r0			# put aside the function code
	lghi	$s1,0x7f
	nr	$s1,%r0
	lghi	%r0,0			# query capability vector
	la	%r1,2*$SIZE_T($sp)
	.long	0xb92e0042		# km %r4,%r2
	llihh	%r1,0x8000
	srlg	%r1,%r1,32($s1)		# check for 32+function code
	ng	%r1,2*$SIZE_T($sp)
	lgr	%r0,$s0			# restore the function code
	la	%r1,0($key1)		# restore $key1
	jz	.Lxts_km_vanilla

	lmg	$i2,$i3,$tweak($sp)	# put aside the tweak value
	algr	$out,$inp

	oill	%r0,32			# switch to xts function code
	aghi	$s1,-18			#
	sllg	$s1,$s1,3		# (function code - 18)*8, 0 or 16
	la	%r1,$tweak-16($sp)
	slgr	%r1,$s1			# parameter block position
	lmg	$s0,$s3,0($key1)	# load 256 bits of key material,
	stmg	$s0,$s3,0(%r1)		# and copy it to parameter block.
					# yes, it contains junk and overlaps
					# with the tweak in 128-bit case.
					# it's done to avoid conditional
					# branch.
	stmg	$i2,$i3,$tweak($sp)	# "re-seat" the tweak value

	.long	0xb92e0042		# km %r4,%r2
	brc	1,.-4			# pay attention to "partial completion"

	lrvg	$s0,$tweak+0($sp)	# load the last tweak
	lrvg	$s1,$tweak+8($sp)
	stmg	%r0,%r3,$tweak-32(%r1)	# wipe copy of the key

	nill	%r0,0xffdf		# switch back to original function code
	la	%r1,0($key1)		# restore pointer to $key1
	slgr	$out,$inp

	llgc	$len,2*$SIZE_T-1($sp)
	nill	$len,0x0f		# $len%=16
	br	$ra
	
.align	16
.Lxts_km_vanilla:
___
$code.=<<___;
	# prepare and allocate stack frame at the top of 4K page
	# with 1K reserved for eventual signal handling
	lghi	$s0,-1024-256-16# guarantee at least 256-bytes buffer
	lghi	$s1,-4096
	algr	$s0,$sp
	lgr	$fp,$sp
	ngr	$s0,$s1		# align at page boundary
	slgr	$fp,$s0		# total buffer size
	lgr	$s2,$sp
	lghi	$s1,1024+16	# sl[g]fi is extended-immediate facility
	slgr	$fp,$s1		# deduct reservation to get usable buffer size
	# buffer size is at least 256 and at most 3072+256-16

	la	$sp,1024($s0)	# alloca
	nill	$fp,0xfff0	# round to 16*n
	st${g}	$s2,0($sp)	# back-chain
	nill	$len,0xfff0	# redundant
	st${g}	$fp,$SIZE_T($sp)

	slgr	$len,$fp
	brc	1,.Lxts_km_go	# not zero, no borrow
	algr	$fp,$len	# input is shorter than allocated buffer
	lghi	$len,0
	st${g}	$fp,$SIZE_T($sp)

.Lxts_km_go:
	lrvg	$s0,$tweak+0($s2)	# load the tweak value in little-endian
	lrvg	$s1,$tweak+8($s2)

	la	$s2,16($sp)		# vector of ascending tweak values
	slgr	$s2,$inp
	srlg	$s3,$fp,4
	j	.Lxts_km_start

.Lxts_km_loop:
	la	$s2,16($sp)
	slgr	$s2,$inp
	srlg	$s3,$fp,4
.Lxts_km_prepare:
	lghi	$i1,0x87
	srag	$i2,$s1,63		# broadcast upper bit
	ngr	$i1,$i2			# rem
	srlg	$i2,$s0,63		# carry bit from lower half
	sllg	$s0,$s0,1
	sllg	$s1,$s1,1
	xgr	$s0,$i1
	ogr	$s1,$i2
.Lxts_km_start:
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1
	stg	$i1,0($s2,$inp)
	stg	$i2,8($s2,$inp)
	xg	$i1,0($inp)
	xg	$i2,8($inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)
	la	$inp,16($inp)
	brct	$s3,.Lxts_km_prepare

	slgr	$inp,$fp		# rewind $inp
	la	$s2,0($out,$inp)
	lgr	$s3,$fp
	.long	0xb92e00aa		# km $s2,$s2
	brc	1,.-4			# pay attention to "partial completion"

	la	$s2,16($sp)
	slgr	$s2,$inp
	srlg	$s3,$fp,4
.Lxts_km_xor:
	lg	$i1,0($out,$inp)
	lg	$i2,8($out,$inp)
	xg	$i1,0($s2,$inp)
	xg	$i2,8($s2,$inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)
	la	$inp,16($inp)
	brct	$s3,.Lxts_km_xor

	slgr	$len,$fp
	brc	1,.Lxts_km_loop		# not zero, no borrow
	algr	$fp,$len
	lghi	$len,0
	brc	4+1,.Lxts_km_loop	# not zero

	l${g}	$i1,0($sp)		# back-chain
	llgf	$fp,`2*$SIZE_T-4`($sp)	# bytes used
	la	$i2,16($sp)
	srlg	$fp,$fp,4
.Lxts_km_zap:
	stg	$i1,0($i2)
	stg	$i1,8($i2)
	la	$i2,16($i2)
	brct	$fp,.Lxts_km_zap

	la	$sp,0($i1)
	llgc	$len,2*$SIZE_T-1($i1)
	nill	$len,0x0f		# $len%=16
	bzr	$ra

	# generate one more tweak...
	lghi	$i1,0x87
	srag	$i2,$s1,63		# broadcast upper bit
	ngr	$i1,$i2			# rem
	srlg	$i2,$s0,63		# carry bit from lower half
	sllg	$s0,$s0,1
	sllg	$s1,$s1,1
	xgr	$s0,$i1
	ogr	$s1,$i2

	ltr	$len,$len		# clear zero flag
	br	$ra
.size	_s390x_xts_km,.-_s390x_xts_km

.globl	AES_xts_encrypt
.type	AES_xts_encrypt,\@function
.align	16
AES_xts_encrypt:
	xgr	%r3,%r4			# flip %r3 and %r4, $out and $len
	xgr	%r4,%r3
	xgr	%r3,%r4
___
$code.=<<___ if ($SIZE_T==4);
	llgfr	$len,$len
___
$code.=<<___;
	st${g}	$len,1*$SIZE_T($sp)	# save copy of $len
	srag	$len,$len,4		# formally wrong, because it expands
					# sign byte, but who can afford asking
					# to process more than 2^63-1 bytes?
					# I use it, because it sets condition
					# code...
	bcr	8,$ra			# abort if zero (i.e. less than 16)
___
$code.=<<___ if (!$softonly);
	llgf	%r0,240($key2)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Lxts_enc_software

	stm${g}	%r6,$s3,6*$SIZE_T($sp)
	st${g}	$ra,14*$SIZE_T($sp)

	sllg	$len,$len,4		# $len&=~15
	slgr	$out,$inp

	lrvg	$s0,$stdframe($sp)	# load secno
	lghi	$s1,0
	la	$s2,$tweak($sp)
	lghi	$s3,16
	stmg	$s0,$s1,0($s2)
	la	%r1,0($key2)		# $key2 is not needed anymore
	.long	0xb92e00aa		# km $s2,$s2, generate the tweak
	brc	1,.-4			# can this happen?

	l	%r0,240($key1)
	la	%r1,0($key1)		# $key1 is not needed anymore
	bras	$ra,_s390x_xts_km
	jz	.Lxts_enc_km_done

	aghi	$inp,-16		# take one step back
	la	$i3,0($out,$inp)	# put aside real $out
.Lxts_enc_km_steal:
	llgc	$i1,16($inp)
	llgc	$i2,0($out,$inp)
	stc	$i1,0($out,$inp)
	stc	$i2,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_enc_km_steal

	la	$s2,0($i3)
	lghi	$s3,16
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1
	xg	$i1,0($s2)
	xg	$i2,8($s2)
	stg	$i1,0($s2)
	stg	$i2,8($s2)
	.long	0xb92e00aa		# km $s2,$s2
	brc	1,.-4			# can this happen?
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1
	xg	$i1,0($i3)
	xg	$i2,8($i3)
	stg	$i1,0($i3)
	stg	$i2,8($i3)

.Lxts_enc_km_done:
	# exit of the hardware (km) encrypt path: reload the return address
	# stored at entry, scrub the tweak and restore the saved registers
	l${g}	$ra,14*$SIZE_T($sp)
	stg	$sp,$tweak+0($sp)	# wipe tweak, all 16 bytes of it
	stg	$sp,$tweak+8($sp)
	lm${g}	%r6,$s3,6*$SIZE_T($sp)
	br	$ra
.align	16
.Lxts_enc_software:
___
$code.=<<___;
	stm${g}	%r6,$ra,6*$SIZE_T($sp)

	slgr	$out,$inp

	xgr	$s0,$s0			# clear upper half
	xgr	$s1,$s1
	lrv	$s0,$stdframe+4($sp)	# load secno
	lrv	$s1,$stdframe+0($sp)
	xgr	$s2,$s2
	xgr	$s3,$s3
	stm${g}	%r2,%r5,2*$SIZE_T($sp)
	la	$key,0($key2)
	larl	$tbl,AES_Te
	bras	$ra,_s390x_AES_encrypt	# generate the tweak
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	stm	$s0,$s3,$tweak($sp)	# save the tweak
	j	.Lxts_enc_enter

.align	16
.Lxts_enc_loop:
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	srlg	%r0,$s1,63		# carry bit from lower half
	sllg	$s1,$s1,1
	sllg	$s3,$s3,1
	xgr	$s1,%r1
	ogr	$s3,%r0
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits 
	stg	$s1,$tweak+0($sp)	# save the tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak+8($sp)
	llgfr	$s3,$s3
	la	$inp,16($inp)		# $inp+=16
.Lxts_enc_enter:
	x	$s0,0($inp)		# ^=*($inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	stm${g}	%r2,%r3,2*$SIZE_T($sp)	# only two registers are changing
	la	$key,0($key1)
	bras	$ra,_s390x_AES_encrypt
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	x	$s0,$tweak+0($sp)	# ^=tweak
	x	$s1,$tweak+4($sp)
	x	$s2,$tweak+8($sp)
	x	$s3,$tweak+12($sp)
	st	$s0,0($out,$inp)
	st	$s1,4($out,$inp)
	st	$s2,8($out,$inp)
	st	$s3,12($out,$inp)
	brct${g}	$len,.Lxts_enc_loop

	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%16
	jz	.Lxts_enc_done

	la	$i3,0($inp,$out)	# put aside real $out
.Lxts_enc_steal:
	llgc	%r0,16($inp)
	llgc	%r1,0($out,$inp)
	stc	%r0,0($out,$inp)
	stc	%r1,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_enc_steal
	la	$out,0($i3)		# restore real $out

	# generate last tweak...
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	srlg	%r0,$s1,63		# carry bit from lower half
	sllg	$s1,$s1,1
	sllg	$s3,$s3,1
	xgr	$s1,%r1
	ogr	$s3,%r0
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits 
	stg	$s1,$tweak+0($sp)	# save the tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak+8($sp)
	llgfr	$s3,$s3

	x	$s0,0($out)		# ^=*(inp)|stolen cipther-text
	x	$s1,4($out)
	x	$s2,8($out)
	x	$s3,12($out)
	st${g}	$out,4*$SIZE_T($sp)
	la	$key,0($key1)
	bras	$ra,_s390x_AES_encrypt
	l${g}	$out,4*$SIZE_T($sp)
	x	$s0,`$tweak+0`($sp)	# ^=tweak
	x	$s1,`$tweak+4`($sp)
	x	$s2,`$tweak+8`($sp)
	x	$s3,`$tweak+12`($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

.Lxts_enc_done:
	# exit of the software encrypt path: overwrite both 8-byte halves
	# of the tweak kept in the frame, then restore registers and return
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_xts_encrypt,.-AES_xts_encrypt
___
# void AES_xts_decrypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,u64 secno);
#
$code.=<<___;
.globl	AES_xts_decrypt
.type	AES_xts_decrypt,\@function
.align	16
AES_xts_decrypt:
	xgr	%r3,%r4			# flip %r3 and %r4, $out and $len
	xgr	%r4,%r3
	xgr	%r3,%r4
___
$code.=<<___ if ($SIZE_T==4);
	llgfr	$len,$len
___
$code.=<<___;
	st${g}	$len,1*$SIZE_T($sp)	# save copy of $len
	aghi	$len,-16
	bcr	4,$ra			# abort if less than zero. formally
					# wrong, because $len is unsigned,
					# but who can afford asking to
					# process more than 2^63-1 bytes?
	tmll	$len,0x0f
	jnz	.Lxts_dec_proceed
	aghi	$len,16
.Lxts_dec_proceed:
___
$code.=<<___ if (!$softonly);
	llgf	%r0,240($key2)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Lxts_dec_software

	stm${g}	%r6,$s3,6*$SIZE_T($sp)
	st${g}	$ra,14*$SIZE_T($sp)

	nill	$len,0xfff0		# $len&=~15
	slgr	$out,$inp

	# generate the tweak value
	lrvg	$s0,$stdframe($sp)	# load secno
	lghi	$s1,0
	la	$s2,$tweak($sp)
	lghi	$s3,16
	stg	$s0,0($s2)
	stg	$s1,8($s2)
	la	%r1,0($key2)		# $key2 is not needed past this point
	.long	0xb92e00aa		# km $s2,$s2, generate the tweak
	brc	1,.-4			# can this happen?

	l	%r0,240($key1)
	la	%r1,0($key1)		# $key1 is not needed anymore

	ltgr	$len,$len
	jz	.Lxts_dec_km_short
	bras	$ra,_s390x_xts_km
	jz	.Lxts_dec_km_done

	lrvgr	$s2,$s0			# make copy in reverse byte order
	lrvgr	$s3,$s1
	j	.Lxts_dec_km_2ndtweak

.Lxts_dec_km_short:
	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%=16
	lrvg	$s0,$tweak+0($sp)	# load the tweak
	lrvg	$s1,$tweak+8($sp)
	lrvgr	$s2,$s0			# make copy in reverse byte order
	lrvgr	$s3,$s1

.Lxts_dec_km_2ndtweak:
	lghi	$i1,0x87
	srag	$i2,$s1,63		# broadcast upper bit
	ngr	$i1,$i2			# rem
	srlg	$i2,$s0,63		# carry bit from lower half
	sllg	$s0,$s0,1
	sllg	$s1,$s1,1
	xgr	$s0,$i1
	ogr	$s1,$i2
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1

	xg	$i1,0($inp)
	xg	$i2,8($inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)
	la	$i2,0($out,$inp)
	lghi	$i3,16
	.long	0xb92e0066		# km $i2,$i2
	brc	1,.-4			# can this happen?
	lrvgr	$i1,$s0
	lrvgr	$i2,$s1
	xg	$i1,0($out,$inp)
	xg	$i2,8($out,$inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)

	la	$i3,0($out,$inp)	# put aside real $out
.Lxts_dec_km_steal:
	llgc	$i1,16($inp)
	llgc	$i2,0($out,$inp)
	stc	$i1,0($out,$inp)
	stc	$i2,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_dec_km_steal

	lgr	$s0,$s2
	lgr	$s1,$s3
	xg	$s0,0($i3)
	xg	$s1,8($i3)
	stg	$s0,0($i3)
	stg	$s1,8($i3)
	la	$s0,0($i3)
	lghi	$s1,16
	.long	0xb92e0088		# km $s0,$s0
	brc	1,.-4			# can this happen?
	xg	$s2,0($i3)
	xg	$s3,8($i3)
	stg	$s2,0($i3)
	stg	$s3,8($i3)
.Lxts_dec_km_done:
	# exit of the hardware (km) decrypt path: reload the return address
	# stored at entry, scrub the tweak and restore the saved registers
	l${g}	$ra,14*$SIZE_T($sp)
	stg	$sp,$tweak+0($sp)	# wipe tweak, all 16 bytes of it
	stg	$sp,$tweak+8($sp)
	lm${g}	%r6,$s3,6*$SIZE_T($sp)
	br	$ra
.align	16
.Lxts_dec_software:
___
$code.=<<___;
	stm${g}	%r6,$ra,6*$SIZE_T($sp)

	srlg	$len,$len,4
	slgr	$out,$inp

	xgr	$s0,$s0			# clear upper half
	xgr	$s1,$s1
	lrv	$s0,$stdframe+4($sp)	# load secno
	lrv	$s1,$stdframe+0($sp)
	xgr	$s2,$s2
	xgr	$s3,$s3
	stm${g}	%r2,%r5,2*$SIZE_T($sp)
	la	$key,0($key2)
	larl	$tbl,AES_Te
	bras	$ra,_s390x_AES_encrypt	# generate the tweak
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	larl	$tbl,AES_Td
	lt${g}r	$len,$len
	stm	$s0,$s3,$tweak($sp)	# save the tweak
	jz	.Lxts_dec_short
	j	.Lxts_dec_enter

.align	16
.Lxts_dec_loop:
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	srlg	%r0,$s1,63		# carry bit from lower half
	sllg	$s1,$s1,1
	sllg	$s3,$s3,1
	xgr	$s1,%r1
	ogr	$s3,%r0
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits 
	stg	$s1,$tweak+0($sp)	# save the tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak+8($sp)
	llgfr	$s3,$s3
.Lxts_dec_enter:
	# one XTS block: xor the input with the tweak held in s0-s3,
	# decrypt, xor with the tweak saved in the frame and store
	x	$s0,0($inp)		# tweak^=*(inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	stm${g}	%r2,%r3,2*$SIZE_T($sp)	# only two registers are changing
	la	$key,0($key1)
	bras	$ra,_s390x_AES_decrypt
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	x	$s0,$tweak+0($sp)	# ^=tweak
	x	$s1,$tweak+4($sp)
	x	$s2,$tweak+8($sp)
	x	$s3,$tweak+12($sp)
	st	$s0,0($out,$inp)	# out register holds out-inp distance
	st	$s1,4($out,$inp)
	st	$s2,8($out,$inp)
	st	$s3,12($out,$inp)
	la	$inp,16($inp)		# advance to the next block
	brct${g}	$len,.Lxts_dec_loop	# single decrement of the block counter

	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%16
	jz	.Lxts_dec_done

	# generate pair of tweaks...
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	srlg	%r0,$s1,63		# carry bit from lower half
	sllg	$s1,$s1,1
	sllg	$s3,$s3,1
	xgr	$s1,%r1
	ogr	$s3,%r0
	lrvgr	$i2,$s1			# flip byte order
	lrvgr	$i3,$s3
	stmg	$i2,$i3,$tweak($sp)	# save the 1st tweak
	j	.Lxts_dec_2ndtweak

.align	16
.Lxts_dec_short:
	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%16
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
.Lxts_dec_2ndtweak:
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	srlg	%r0,$s1,63		# carry bit from lower half
	sllg	$s1,$s1,1
	sllg	$s3,$s3,1
	xgr	$s1,%r1
	ogr	$s3,%r0
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak-16+0($sp)	# save the 2nd tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak-16+8($sp)
	llgfr	$s3,$s3

	x	$s0,0($inp)		# tweak_the_2nd^=*(inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	stm${g}	%r2,%r3,2*$SIZE_T($sp)
	la	$key,0($key1)
	bras	$ra,_s390x_AES_decrypt
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	x	$s0,$tweak-16+0($sp)	# ^=tweak_the_2nd
	x	$s1,$tweak-16+4($sp)
	x	$s2,$tweak-16+8($sp)
	x	$s3,$tweak-16+12($sp)
	st	$s0,0($out,$inp)
	st	$s1,4($out,$inp)
	st	$s2,8($out,$inp)
	st	$s3,12($out,$inp)

	la	$i3,0($out,$inp)	# put aside real $out
.Lxts_dec_steal:
	llgc	%r0,16($inp)
	llgc	%r1,0($out,$inp)
	stc	%r0,0($out,$inp)
	stc	%r1,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_dec_steal
	la	$out,0($i3)		# restore real $out

	lm	$s0,$s3,$tweak($sp)	# load the 1st tweak
	x	$s0,0($out)		# tweak^=*(inp)|stolen cipher-text
	x	$s1,4($out)
	x	$s2,8($out)
	x	$s3,12($out)
	st${g}	$out,4*$SIZE_T($sp)
	la	$key,0($key1)
	bras	$ra,_s390x_AES_decrypt
	l${g}	$out,4*$SIZE_T($sp)
	x	$s0,$tweak+0($sp)	# ^=tweak
	x	$s1,$tweak+4($sp)
	x	$s2,$tweak+8($sp)
	x	$s3,$tweak+12($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)
	stg	$sp,$tweak-16+0($sp)	# wipe 2nd tweak
	stg	$sp,$tweak-16+8($sp)
.Lxts_dec_done:
	# exit of the software decrypt path: overwrite both 8-byte halves
	# of the tweak kept in the frame, then restore registers and return
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_xts_decrypt,.-AES_xts_decrypt
___
}
# Trailer of the generated module: identification string and a single
# declaration of the capability vector as a 16-byte, 8-byte-aligned
# common symbol.
$code.=<<___;
.string	"AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
.comm	OPENSSL_s390xcap_P,16,8
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
+4 −4
Original line number Diff line number Diff line
@@ -41,8 +41,8 @@
# processor, as long as it's "z-CPU". Latter implies that the code
# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG
# is achieved by swapping words after 64-bit loads, follow _dswap-s.
# On z990 it was measured to perform 2.6-2.2 times better, less for
# longer keys...
# On z990 it was measured to perform 2.6-2.2 times better than
# compiler-generated code, less for longer keys...

$flavour = shift;

@@ -102,8 +102,8 @@ $code.=<<___ if ($flavour =~ /3[12]/);
	bnzr	%r14		# if ($num&1) return 0;
___
$code.=<<___ if ($flavour !~ /3[12]/);
	cghi	$num,128	#
	bhr	%r14		# if($num>128) return 0;
	cghi	$num,96		#
	bhr	%r14		# if($num>96) return 0;
___
$code.=<<___;
	stm${g}	%r3,%r15,3*$SIZE_T($sp)
+11 −2
Original line number Diff line number Diff line
@@ -28,6 +28,15 @@
# remains z/Architecture specific. On z990 it was measured to perform
# 2.8x better than 32-bit code generated by gcc 4.3.

# March 2011.
#
# Support for hardware KIMD-GHASH is verified to produce correct
# result and therefore is engaged. On z196 it was measured to process
# 8KB buffer ~7 times faster than software implementation. It's not as
# impressive for smaller buffer sizes and for smallest 16-bytes buffer
# it's actually almost 2 times slower. Which is the reason why
# KIMD-GHASH is not used in gcm_gmult_4bit.

$flavour = shift;

if ($flavour =~ /3[12]/) {
@@ -41,7 +50,7 @@ if ($flavour =~ /3[12]/) {
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$softonly=1;	# disable hardware support for now
$softonly=0;

$Zhi="%r0";
$Zlo="%r1";
@@ -70,7 +79,7 @@ $code.=<<___;
.align	32
gcm_gmult_4bit:
___
$code.=<<___ if(!$softonly);
$code.=<<___ if(!$softonly && 0);	# hardware is slow for single block...
	larl	%r1,OPENSSL_s390xcap_P
	lg	%r0,0(%r1)
	tmhl	%r0,0x4000	# check for message-security-assist