Commit 7cbdb975 authored by Andy Polyakov
Browse files

aesp8-ppc.pl: optimize CBC decrypt even further.

10-19% improvement depending on key length and endianness.
parent 028bac06
Loading
Loading
Loading
Loading
+231 −334
Original line number Diff line number Diff line
@@ -10,11 +10,16 @@
# This module implements support for AES instructions as per PowerISA
# specification version 2.07, first implemented by POWER8 processor.
# The module is endian-agnostic in the sense that it supports both big-
# and little-endian cases. As well as alignment-agnostic, and it is
# guaranteed not to cause alignment exceptions. [One of options was
# to use VSX loads and stores, which tolerate unaligned references,
# but even then specification doesn't prohibit exceptions on page
# boundaries.]
# and little-endian cases. Data alignment in parallelizable modes is
# handled with VSX loads and stores, which implies the MSR.VSX flag
# being set. It should also be noted that the ISA specification doesn't
# prohibit alignment exceptions for these instructions on page
# boundaries. Initially alignment was handled in pure AltiVec/VMX way
# [when data is aligned programmatically, which in turn guarantees
# exception-free execution], but it turned out to hamper performance
# when vcipher instructions are interleaved. It's reckoned that
# eventual misalignment penalties at page boundaries are on average
# lower than the additional overhead of the pure AltiVec approach.

$flavour = shift;

@@ -49,7 +54,8 @@ $prefix="AES";
$sp="r1";
$vrsave="r12";

{{{
#########################################################################
{{{	Key setup procedures						#
my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
@@ -96,13 +102,9 @@ Lset_encrypt_key:
	li		r8,0x20
	cmpwi		$bits,192
	lvx		$in1,0,$inp
___
$code.=<<___		if ($LITTLE_ENDIAN);
	vspltisb	$mask,0x0f		# borrow $mask
	vxor		$key,$key,$mask		# adjust for byte swap
___
$code.=<<___;
	le?vspltisb	$mask,0x0f		# borrow $mask
	lvx		$rcon,0,$ptr
	le?vxor		$key,$key,$mask		# adjust for byte swap
	lvx		$mask,r8,$ptr
	addi		$ptr,$ptr,0x10
	vperm		$in0,$in0,$in1,$key	# align [and byte swap in LE]
@@ -324,6 +326,7 @@ Ldone:
	blr
	.long		0
	.byte		0,12,0x14,1,0,0,3,0
	.long		0
.size	.${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key

.globl	.${prefix}_set_decrypt_key
@@ -367,10 +370,12 @@ Ldeckey:
	blr
	.long		0
	.byte		0,12,4,1,0x80,0,3,0
	.long		0
.size	.${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
___
}}}
{{{
#########################################################################
{{{	Single block en- and decrypt procedures				#
sub gen_block () {
my $dir = shift;
my $n   = $dir eq "de" ? "n" : "";
@@ -390,9 +395,9 @@ $code.=<<___;
	neg		r11,$out
	lvx		v1,$idx,$inp
	lvsl		v2,0,$inp		# inpperm
	`"vspltisb	v4,0x0f"		if ($LITTLE_ENDIAN)`
	le?vspltisb	v4,0x0f
	?lvsl		v3,0,r11		# outperm
	`"vxor		v2,v2,v4"		if ($LITTLE_ENDIAN)`
	le?vxor		v2,v2,v4
	li		$idx,16
	vperm		v0,v0,v1,v2		# align [and byte swap in LE]
	lvx		v1,0,$key
@@ -429,7 +434,7 @@ Loop_${dir}c:
	vxor		v1,v1,v1
	li		$idx,15			# 15 is not typo
	?vperm		v2,v1,v2,v3		# outmask
	`"vxor		v3,v3,v4"		if ($LITTLE_ENDIAN)`
	le?vxor		v3,v3,v4
	lvx		v1,0,$out		# outhead
	vperm		v0,v0,v0,v3		# rotate [and byte swap in LE]
	vsel		v1,v1,v0,v2
@@ -442,17 +447,19 @@ Loop_${dir}c:
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,3,0
	.long		0
.size	.${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
#########################################################################
{{{	CBC en- and decrypt procedures					#
my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=map("v$_",(4..10));

my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
						map("v$_",(4..10));
$code.=<<___;
.globl	.${prefix}_cbc_encrypt
.align	5
@@ -467,12 +474,12 @@ $code.=<<___;

	li		$idx,15
	vxor		$rndkey0,$rndkey0,$rndkey0
	`"vspltisb	$tmp,0x0f"		if ($LITTLE_ENDIAN)`
	le?vspltisb	$tmp,0x0f

	lvx		$ivec,0,$ivp		# load [unaligned] iv
	lvsl		$inpperm,0,$ivp
	lvx		$inptail,$idx,$ivp
	`"vxor		$inpperm,$inpperm,$tmp"	if ($LITTLE_ENDIAN)`
	le?vxor		$inpperm,$inpperm,$tmp
	vperm		$ivec,$ivec,$inptail,$inpperm

	neg		r11,$inp
@@ -482,13 +489,13 @@ $code.=<<___;
	lvsr		$inpperm,0,r11		# prepare for unaligned load
	lvx		$inptail,0,$inp
	addi		$inp,$inp,15		# 15 is not typo
	`"vxor		$inpperm,$inpperm,$tmp"	if ($LITTLE_ENDIAN)`
	le?vxor		$inpperm,$inpperm,$tmp

	?lvsr		$outperm,0,$out		# prepare for unaligned store
	vspltisb	$outmask,-1
	lvx		$outhead,0,$out
	?vperm		$outmask,$rndkey0,$outmask,$outperm
	`"vxor		$outperm,$outperm,$tmp"	if ($LITTLE_ENDIAN)`
	le?vxor		$outperm,$outperm,$tmp

	srwi		$rounds,$rounds,1
	li		$idx,16
@@ -597,10 +604,10 @@ Lcbc_done:
	li		$idx,15			# 15 is not typo
	vxor		$rndkey0,$rndkey0,$rndkey0
	vspltisb	$outmask,-1
	`"vspltisb	$tmp,0x0f"		if ($LITTLE_ENDIAN)`
	le?vspltisb	$tmp,0x0f
	?lvsl		$outperm,0,$enc
	?vperm		$outmask,$rndkey0,$outmask,$outperm
	`"vxor		$outperm,$outperm,$tmp"	if ($LITTLE_ENDIAN)`
	le?vxor		$outperm,$outperm,$tmp
	lvx		$outhead,0,$ivp
	vperm		$ivec,$ivec,$ivec,$outperm
	vsel		$inout,$outhead,$ivec,$outmask
@@ -613,9 +620,12 @@ Lcbc_done:
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,6,0
	.long		0
___
{{
#########################################################################
{{	Optimized CBC decrypt procedure					#
my $key_="r11";
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
my $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
@@ -625,7 +635,7 @@ my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment
$code.=<<___;
.align	5
_aesp8_cbc_decrypt8x:
	$STU		$sp,-`($FRAME+21*16)`($sp)
	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
	li		r10,`$FRAME+8*16+15`
	li		r11,`$FRAME+8*16+31`
	stvx		v20,r10,$sp		# ABI says so
@@ -650,104 +660,103 @@ _aesp8_cbc_decrypt8x:
	addi		r11,r11,32
	stvx		v30,r10,$sp
	stvx		v31,r11,$sp
	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
	li		r0,-1
	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
	li		$x10,0x10
	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	li		$x20,0x20
	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	li		$x30,0x30
	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	li		$x40,0x40
	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	li		$x50,0x50
	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	li		$x60,0x60
	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	li		$x70,0x70
	mtspr		256,r0

	subi		$rounds,$rounds,3	# -4 in total
	subi		$len,$len,128		# bias

	addi		$idx,$key,16		# load key schedule
	lvx		$rndkey0,0,$key
	addi		$key,$key,32
	lvx		v30,0,$idx
	addi		$idx,$idx,32
	lvx		v31,0,$key
	addi		$key,$key,32
	lvx		$rndkey0,$x00,$key	# load key schedule
	lvx		v30,$x10,$key
	addi		$key,$key,0x20
	lvx		v31,$x00,$key
	?vperm		$rndkey0,$rndkey0,v30,$keyperm
	addi		$key_,$sp,$FRAME+15
	mtctr		$rounds

Load_cbc_dec_key:
	?vperm		v24,v30,v31,$keyperm
	lvx		v30,0,$idx
	addi		$idx,$idx,32
	stvx		v24,0,$key_		# off-load round[1]
	addi		$key_,$key_,16
	lvx		v30,$x10,$key
	addi		$key,$key,0x20
	stvx		v24,$x00,$key_		# off-load round[1]
	?vperm		v25,v31,v30,$keyperm
	lvx		v31,0,$key
	addi		$key,$key,32
	stvx		v25,0,$key_		# off-load round[2]
	addi		$key_,$key_,16
	lvx		v31,$x00,$key
	stvx		v25,$x10,$key_		# off-load round[2]
	addi		$key_,$key_,0x20
	bdnz		Load_cbc_dec_key

	lvx		v26,0,$idx
	addi		$idx,$idx,32
	lvx		v26,$x10,$key
	?vperm		v24,v30,v31,$keyperm
	lvx		v27,0,$key
	addi		$key,$key,32
	stvx		v24,0,$key_		# off-load round[3]
	addi		$key_,$key_,16
	lvx		v27,$x20,$key
	stvx		v24,$x00,$key_		# off-load round[3]
	?vperm		v25,v31,v26,$keyperm
	lvx		v28,0,$idx
	addi		$idx,$idx,32
	stvx		v25,0,$key_		# off-load round[4]
	lvx		v28,$x30,$key
	stvx		v25,$x10,$key_		# off-load round[4]
	addi		$key_,$sp,$FRAME+15	# rewind $key_
	?vperm		v26,v26,v27,$keyperm
	lvx		v29,0,$key
	addi		$key,$key,32
	lvx		v29,$x40,$key
	?vperm		v27,v27,v28,$keyperm
	lvx		v30,0,$idx
	addi		$idx,$idx,32
	lvx		v30,$x50,$key
	?vperm		v28,v28,v29,$keyperm
	lvx		v31,0,$key
	lvx		v31,$x60,$key
	?vperm		v29,v29,v30,$keyperm
	lvx		$out0,0,$idx		# borrow $out0
	lvx		$out0,$x70,$key		# borrow $out0
	?vperm		v30,v30,v31,$keyperm
	lvx		v24,0,$key_		# pre-load round[1]
	addi		$key_,$key_,16
	lvx		v24,$x00,$key_		# pre-load round[1]
	?vperm		v31,v31,$out0,$keyperm
	lvx		v25,0,$key_		# pre-load round[2]
	addi		$key_,$key_,16

	lvx		v25,$x10,$key_		# pre-load round[2]

	#lvx		$inptail,0,$inp		# "caller" already did this
	#addi		$inp,$inp,15		# 15 is not typo

	lvx		$in1,0,$inp		# load first 8 "words"
	addi		$inp,$inp,16
	lvx		$in2,0,$inp
	addi		$inp,$inp,16
	lvx		$in3,0,$inp
	addi		$inp,$inp,16
	vperm		$in0,$inptail,$in1,$inpperm
	lvx		$in4,0,$inp
	addi		$inp,$inp,16
	vperm		$in1,$in1,$in2,$inpperm
	lvx		$in5,0,$inp
	addi		$inp,$inp,16
	vperm		$in2,$in2,$in3,$inpperm
	subi		$inp,$inp,15		# undo "caller"

	 le?li		$idx,8
	lvx_u		$in0,$x00,$inp		# load first 8 "words"
	 le?lvsl	$inpperm,0,$idx
	 le?vspltisb	$tmp,0x0f
	lvx_u		$in1,$x10,$inp
	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
	lvx_u		$in2,$x20,$inp
	 le?vperm	$in0,$in0,$in0,$inpperm
	lvx_u		$in3,$x30,$inp
	 le?vperm	$in1,$in1,$in1,$inpperm
	lvx_u		$in4,$x40,$inp
	 le?vperm	$in2,$in2,$in2,$inpperm
	vxor		$out0,$in0,$rndkey0
	lvx		$in6,0,$inp
	addi		$inp,$inp,16
	vperm		$in3,$in3,$in4,$inpperm
	lvx_u		$in5,$x50,$inp
	 le?vperm	$in3,$in3,$in3,$inpperm
	vxor		$out1,$in1,$rndkey0
	lvx		$in7,0,$inp
	addi		$inp,$inp,16
	vperm		$in4,$in4,$in5,$inpperm
	lvx_u		$in6,$x60,$inp
	 le?vperm	$in4,$in4,$in4,$inpperm
	vxor		$out2,$in2,$rndkey0
	lvx		$inptail,0,$inp
	addi		$inp,$inp,16
	vperm		$in5,$in5,$in6,$inpperm
	lvx_u		$in7,$x70,$inp
	addi		$inp,$inp,0x80
	 le?vperm	$in5,$in5,$in5,$inpperm
	vxor		$out3,$in3,$rndkey0
	vperm		$in6,$in6,$in7,$inpperm
	 le?vperm	$in6,$in6,$in6,$inpperm
	vxor		$out4,$in4,$rndkey0
	vperm		$in7,$in7,$inptail,$inpperm
	 le?vperm	$in7,$in7,$in7,$inpperm
	vxor		$out5,$in5,$rndkey0
	vxor		$out6,$in6,$rndkey0
	vxor		$out7,$in7,$rndkey0

	mtctr		$rounds
	b		Loop_cbc_dec8x
.align	5
Loop_cbc_dec8x:
	vncipher	$out0,$out0,v24
	vncipher	$out1,$out1,v24
@@ -757,8 +766,8 @@ Loop_cbc_dec8x:
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24
	lvx		v24,0,$key_		# round[3]
	addi		$key_,$key_,16
	lvx		v24,$x20,$key_		# round[3]
	addi		$key_,$key_,0x20

	vncipher	$out0,$out0,v25
	vncipher	$out1,$out1,v25
@@ -768,8 +777,7 @@ Loop_cbc_dec8x:
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25
	lvx		v25,0,$key_		# round[4]
	addi		$key_,$key_,16
	lvx		v25,$x10,$key_		# round[4]
	bdnz		Loop_cbc_dec8x

	subic		$len,$len,128		# $len-=128
@@ -824,8 +832,7 @@ Loop_cbc_dec8x:
	vncipher	$out5,$out5,v28
	vncipher	$out6,$out6,v28
	vncipher	$out7,$out7,v28
	lvx		v24,0,$key_		# re-pre-load round[1]
	addi		$key_,$key_,16
	lvx		v24,$x00,$key_		# re-pre-load round[1]

	vncipher	$out0,$out0,v29
	vncipher	$out1,$out1,v29
@@ -835,8 +842,7 @@ Loop_cbc_dec8x:
	vncipher	$out5,$out5,v29
	vncipher	$out6,$out6,v29
	vncipher	$out7,$out7,v29
	lvx		v25,0,$key_		# re-pre-load round[2]
	addi		$key_,$key_,16
	lvx		v25,$x10,$key_		# re-pre-load round[2]

	vncipher	$out0,$out0,v30
	 vxor		$ivec,$ivec,v31		# xor with last round key
@@ -857,86 +863,55 @@ Loop_cbc_dec8x:

	vncipherlast	$out0,$out0,$ivec
	vncipherlast	$out1,$out1,$in0
	 lvx_u		$in0,$x00,$inp		# load next input block
	vncipherlast	$out2,$out2,$in1
	 lvx		$in1,0,$inp		# load next input block
	 addi		$inp,$inp,16
	 lvx_u		$in1,$x10,$inp
	vncipherlast	$out3,$out3,$in2
	 lvx		$in2,0,$inp
	 addi		$inp,$inp,16
	 le?vperm	$in0,$in0,$in0,$inpperm
	 lvx_u		$in2,$x20,$inp
	vncipherlast	$out4,$out4,$in3
	 lvx		$in3,0,$inp
	 addi		$inp,$inp,16
	 vperm		$in0,$inptail,$in1,$inpperm
	 le?vperm	$in1,$in1,$in1,$inpperm
	 lvx_u		$in3,$x30,$inp
	vncipherlast	$out5,$out5,$in4
	 lvx		$in4,0,$inp
	 addi		$inp,$inp,16
	 vperm		$in1,$in1,$in2,$inpperm
	 le?vperm	$in2,$in2,$in2,$inpperm
	 lvx_u		$in4,$x40,$inp
	vncipherlast	$out6,$out6,$in5
	 lvx		$in5,0,$inp
	 addi		$inp,$inp,16
	 vperm		$in2,$in2,$in3,$inpperm
	 le?vperm	$in3,$in3,$in3,$inpperm
	 lvx_u		$in5,$x50,$inp
	vncipherlast	$out7,$out7,$in6
	 lvx		$in6,0,$inp
	 addi		$inp,$inp,16
	 vperm		$in3,$in3,$in4,$inpperm
	 le?vperm	$in4,$in4,$in4,$inpperm
	 lvx_u		$in6,$x60,$inp
	vmr		$ivec,$in7


	vperm		$out0,$out0,$out0,$outperm
	 lvx		$in7,0,$inp
	 addi		$inp,$inp,16
	vperm		$out1,$out1,$out1,$outperm
	vsel		$outhead,$outhead,$out0,$outmask
	 vperm		$in4,$in4,$in5,$inpperm
	 lvx		$inptail,0,$inp
	 addi		$inp,$inp,16
	vsel		$out0,$out0,$out1,$outmask
	stvx		$outhead,0,$out
	addi		$out,$out,16

	vperm		$out2,$out2,$out2,$outperm
	 vperm		$in5,$in5,$in6,$inpperm
	vsel		$out1,$out1,$out2,$outmask
	stvx		$out0,0,$out
	addi		$out,$out,16

	vperm		$out3,$out3,$out3,$outperm
	 le?vperm	$in5,$in5,$in5,$inpperm
	 lvx_u		$in7,$x70,$inp
	 addi		$inp,$inp,0x80

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	 le?vperm	$in6,$in6,$in6,$inpperm
	 vxor		$out0,$in0,$rndkey0
	 vperm		$in6,$in6,$in7,$inpperm
	vsel		$out2,$out2,$out3,$outmask
	stvx		$out1,0,$out
	addi		$out,$out,16

	vperm		$out4,$out4,$out4,$outperm
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	 le?vperm	$in7,$in7,$in7,$inpperm
	 vxor		$out1,$in1,$rndkey0
	 vperm		$in7,$in7,$inptail,$inpperm
	vsel		$out3,$out3,$out4,$outmask
	stvx		$out2,0,$out
	addi		$out,$out,16

	vperm		$out5,$out5,$out5,$outperm
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	 vxor		$out2,$in2,$rndkey0
	vsel		$out4,$out4,$out5,$outmask
	stvx		$out3,0,$out
	addi		$out,$out,16

	vperm		$out6,$out6,$out6,$outperm
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	 vxor		$out3,$in3,$rndkey0
	vsel		$out5,$out5,$out6,$outmask
	stvx		$out4,0,$out
	addi		$out,$out,16

	vperm		$outhead,$out7,$out7,$outperm
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x40,$out
	 vxor		$out4,$in4,$rndkey0
	vsel		$out6,$out6,$outhead,$outmask
	stvx		$out5,0,$out
	addi		$out,$out,16

	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x50,$out
	 vxor		$out5,$in5,$rndkey0
	stvx		$out6,0,$out
	addi		$out,$out,16

	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x60,$out
	 vxor		$out6,$in6,$rndkey0
	stvx_u		$out7,$x70,$out
	addi		$out,$out,0x80
	 vxor		$out7,$in7,$rndkey0

	mtctr		$rounds
@@ -945,6 +920,7 @@ Loop_cbc_dec8x:
	addic.		$len,$len,128
	beq		Lcbc_dec8x_done
	nop
	nop

Loop_cbc_dec8x_tail:				# up to 7 "words" tail...
	vncipher	$out1,$out1,v24
@@ -954,8 +930,8 @@ Loop_cbc_dec8x_tail: # up to 7 "words" tail...
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24
	lvx		v24,0,$key_		# round[3]
	addi		$key_,$key_,16
	lvx		v24,$x20,$key_		# round[3]
	addi		$key_,$key_,0x20

	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
@@ -964,8 +940,7 @@ Loop_cbc_dec8x_tail: # up to 7 "words" tail...
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25
	lvx		v25,0,$key_		# round[4]
	addi		$key_,$key_,16
	lvx		v25,$x10,$key_		# round[4]
	bdnz		Loop_cbc_dec8x_tail

	vncipher	$out1,$out1,v24
@@ -1044,6 +1019,7 @@ Loop_cbc_dec8x_tail: # up to 7 "words" tail...
	nop
	beq		Lcbc_dec8x_six

Lcbc_dec8x_seven:
	vncipherlast	$out1,$out1,$ivec
	vncipherlast	$out2,$out2,$in1
	vncipherlast	$out3,$out3,$in2
@@ -1053,40 +1029,21 @@ Loop_cbc_dec8x_tail: # up to 7 "words" tail...
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	vperm		$out1,$out1,$out1,$outperm
	vsel		$outhead,$outhead,$out1,$outmask
	stvx		$outhead,0,$out
	addi		$out,$out,16

	vperm		$out2,$out2,$out2,$outperm
	vsel		$out1,$out1,$out2,$outmask
	stvx		$out1,0,$out
	addi		$out,$out,16

	vperm		$out3,$out3,$out3,$outperm
	vsel		$out2,$out2,$out3,$outmask
	stvx		$out2,0,$out
	addi		$out,$out,16

	vperm		$out4,$out4,$out4,$outperm
	vsel		$out3,$out3,$out4,$outmask
	stvx		$out3,0,$out
	addi		$out,$out,16

	vperm		$out5,$out5,$out5,$outperm
	vsel		$out4,$out4,$out5,$outmask
	stvx		$out4,0,$out
	addi		$out,$out,16

	vperm		$out6,$out6,$out6,$outperm
	vsel		$out5,$out5,$out6,$outmask
	stvx		$out5,0,$out
	addi		$out,$out,16

	vperm		$outhead,$out7,$out7,$outperm
	vsel		$out6,$out6,$outhead,$outmask
	stvx		$out6,0,$out
	addi		$out,$out,16
	le?vperm	$out1,$out1,$out1,$inpperm
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x00,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x10,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x20,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x30,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x40,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x50,$out
	stvx_u		$out7,$x60,$out
	addi		$out,$out,0x70
	b		Lcbc_dec8x_done

.align	5
@@ -1099,35 +1056,19 @@ Lcbc_dec8x_six:
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	vperm		$out2,$out2,$out2,$outperm
	vsel		$outhead,$outhead,$out2,$outmask
	stvx		$outhead,0,$out
	addi		$out,$out,16

	vperm		$out3,$out3,$out3,$outperm
	vsel		$out2,$out2,$out3,$outmask
	stvx		$out2,0,$out
	addi		$out,$out,16

	vperm		$out4,$out4,$out4,$outperm
	vsel		$out3,$out3,$out4,$outmask
	stvx		$out3,0,$out
	addi		$out,$out,16

	vperm		$out5,$out5,$out5,$outperm
	vsel		$out4,$out4,$out5,$outmask
	stvx		$out4,0,$out
	addi		$out,$out,16

	vperm		$out6,$out6,$out6,$outperm
	vsel		$out5,$out5,$out6,$outmask
	stvx		$out5,0,$out
	addi		$out,$out,16

	vperm		$outhead,$out7,$out7,$outperm
	vsel		$out6,$out6,$outhead,$outmask
	stvx		$out6,0,$out
	addi		$out,$out,16
	le?vperm	$out2,$out2,$out2,$inpperm
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x00,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x10,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x20,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x30,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x40,$out
	stvx_u		$out7,$x50,$out
	addi		$out,$out,0x60
	b		Lcbc_dec8x_done

.align	5
@@ -1139,30 +1080,17 @@ Lcbc_dec8x_five:
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	vperm		$out3,$out3,$out3,$outperm
	vsel		$outhead,$outhead,$out3,$outmask
	stvx		$outhead,0,$out
	addi		$out,$out,16

	vperm		$out4,$out4,$out4,$outperm
	vsel		$out3,$out3,$out4,$outmask
	stvx		$out3,0,$out
	addi		$out,$out,16

	vperm		$out5,$out5,$out5,$outperm
	vsel		$out4,$out4,$out5,$outmask
	stvx		$out4,0,$out
	addi		$out,$out,16

	vperm		$out6,$out6,$out6,$outperm
	vsel		$out5,$out5,$out6,$outmask
	stvx		$out5,0,$out
	addi		$out,$out,16

	vperm		$outhead,$out7,$out7,$outperm
	vsel		$out6,$out6,$outhead,$outmask
	stvx		$out6,0,$out
	addi		$out,$out,16
	le?vperm	$out3,$out3,$out3,$inpperm
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x00,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x10,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x20,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x30,$out
	stvx_u		$out7,$x40,$out
	addi		$out,$out,0x50
	b		Lcbc_dec8x_done

.align	5
@@ -1173,25 +1101,15 @@ Lcbc_dec8x_four:
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	vperm		$out4,$out4,$out4,$outperm
	vsel		$outhead,$outhead,$out4,$outmask
	stvx		$outhead,0,$out
	addi		$out,$out,16

	vperm		$out5,$out5,$out5,$outperm
	vsel		$out4,$out4,$out5,$outmask
	stvx		$out4,0,$out
	addi		$out,$out,16

	vperm		$out6,$out6,$out6,$outperm
	vsel		$out5,$out5,$out6,$outmask
	stvx		$out5,0,$out
	addi		$out,$out,16

	vperm		$outhead,$out7,$out7,$outperm
	vsel		$out6,$out6,$outhead,$outmask
	stvx		$out6,0,$out
	addi		$out,$out,16
	le?vperm	$out4,$out4,$out4,$inpperm
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x00,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x10,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x20,$out
	stvx_u		$out7,$x30,$out
	addi		$out,$out,0x40
	b		Lcbc_dec8x_done

.align	5
@@ -1201,20 +1119,13 @@ Lcbc_dec8x_three:
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	vperm		$out5,$out5,$out5,$outperm
	vsel		$outhead,$outhead,$out5,$outmask
	stvx		$outhead,0,$out
	addi		$out,$out,16

	vperm		$out6,$out6,$out6,$outperm
	vsel		$out5,$out5,$out6,$outmask
	stvx		$out5,0,$out
	addi		$out,$out,16

	vperm		$outhead,$out7,$out7,$outperm
	vsel		$out6,$out6,$outhead,$outmask
	stvx		$out6,0,$out
	addi		$out,$out,16
	le?vperm	$out5,$out5,$out5,$inpperm
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x00,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x10,$out
	stvx_u		$out7,$x20,$out
	addi		$out,$out,0x30
	b		Lcbc_dec8x_done

.align	5
@@ -1223,15 +1134,11 @@ Lcbc_dec8x_two:
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	vperm		$out6,$out6,$out6,$outperm
	vsel		$outhead,$outhead,$out6,$outmask
	stvx		$outhead,0,$out
	addi		$out,$out,16

	vperm		$outhead,$out7,$out7,$outperm
	vsel		$out6,$out6,$outhead,$outmask
	stvx		$out6,0,$out
	addi		$out,$out,16
	le?vperm	$out6,$out6,$out6,$inpperm
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x00,$out
	stvx_u		$out7,$x10,$out
	addi		$out,$out,0x20
	b		Lcbc_dec8x_done

.align	5
@@ -1239,52 +1146,31 @@ Lcbc_dec8x_one:
	vncipherlast	$out7,$out7,$ivec
	vmr		$ivec,$in7

	vperm		$out7,$out7,$out7,$outperm
	vsel		$outhead,$outhead,$out7,$outmask
	stvx		$outhead,0,$out
	addi		$out,$out,16
	vmr		$outhead,$out7
	nop
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out7,0,$out
	addi		$out,$out,0x10

Lcbc_dec8x_done:
	addi		$out,$out,-1
	lvx		$out7,0,$out		# redundant in aligned case
	vsel		$out7,$outhead,$out7,$outmask
	stvx		$out7,0,$out

	neg		$enc,$ivp		# write [unaligned] iv
	li		$idx,15			# 15 is not typo
	vxor		$rndkey0,$rndkey0,$rndkey0
	vspltisb	$outmask,-1
	`"vspltisb	$tmp,0x0f"		if ($LITTLE_ENDIAN)`
	?lvsl		$outperm,0,$enc
	?vperm		$outmask,$rndkey0,$outmask,$outperm
	`"vxor		$outperm,$outperm,$tmp"	if ($LITTLE_ENDIAN)`
	lvx		$outhead,0,$ivp
	vperm		$ivec,$ivec,$ivec,$outperm
	vsel		$in0,$outhead,$ivec,$outmask
	lvx		$inptail,$idx,$ivp
	stvx		$in0,0,$ivp
	vsel		$in0,$ivec,$inptail,$outmask
	stvx		$in0,$idx,$ivp
	le?vperm	$ivec,$ivec,$ivec,$inpperm
	stvx_u		$ivec,0,$ivp		# write [unaligned] iv

	li		r10,`$FRAME+15`
	li		r11,`$FRAME+31`
	stvx		$outmask,r10,$sp	# wipe copies of rounds keys
	stvx		$inpperm,r10,$sp	# wipe copies of round keys
	addi		r10,r10,32
	stvx		$outmask,r11,$sp
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$outmask,r10,$sp
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$outmask,r11,$sp
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$outmask,r10,$sp
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$outmask,r11,$sp
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$outmask,r10,$sp
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$outmask,r11,$sp
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32

	mtspr		256,$vrsave
@@ -1310,10 +1196,17 @@ Lcbc_dec8x_done:
	addi		r11,r11,32
	lvx		v30,r10,$sp
	lvx		v31,r11,$sp
	addi		$sp,$sp,`$FRAME+21*16`
	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
	blr
	.long		0
	.byte		0,12,0x14,0,0x80,0,6,0
	.byte		0,12,0x14,0,0x80,6,6,0
	.long		0
.size	.${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
___
}}	}}}
@@ -1354,12 +1247,16 @@ foreach(split("\n",$code)) {
	# instructions prefixed with '?' are endian-specific and need
	# to be adjusted accordingly...
	if ($flavour =~ /le$/o) {	# little-endian
	    s/le\?//o		or
	    s/be\?/#be#/o	or
	    s/\?lvsr/lvsl/o	or
	    s/\?lvsl/lvsr/o	or
	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
	} else {			# big-endian
	    s/le\?/#le#/o	or
	    s/be\?//o		or
	    s/\?([a-z]+)/$1/o;
	}