Commit 1fb83a3b authored by Andy Polyakov's avatar Andy Polyakov
Browse files

aes/asm/vpaes-ppc.pl: add little-endian support.

parent f0170ebb
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -365,7 +365,7 @@ my %table=(
####
"linux-generic64","gcc:-DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-ppc64",	"gcc:-m64 -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL:${ppc64_asm}:linux64:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64",
"linux-ppc64le","gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL:".eval{my $asm=$ppc64_asm;$asm=~s/vpaes\-ppc\.o//;$asm}.":linux64le:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::",
"linux-ppc64le","gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL:$ppc64_asm:linux64le:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::",
"linux-ia64",	"gcc:-DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_UNROLL DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-ia64-icc","icc:-DL_ENDIAN -DTERMIO -O2 -Wall::-D_REENTRANT::-ldl -no_cpprt:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-x86_64",	"gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64",
+1 −1
Original line number Diff line number Diff line
@@ -4532,7 +4532,7 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL
$cpuid_obj    = ppccpuid.o ppccap.o
$bn_obj       = bn-ppc.o ppc-mont.o ppc64-mont.o
$des_obj      = 
$aes_obj      = aes_core.o aes_cbc.o aes-ppc.o
$aes_obj      = aes_core.o aes_cbc.o aes-ppc.o vpaes-ppc.o
$bf_obj       = 
$md5_obj      = 
$sha1_obj     = sha1-ppc.o sha256-ppc.o sha512-ppc.o
+160 −130
Original line number Diff line number Diff line
@@ -61,89 +61,89 @@ $code.=<<___;
.align	7	# totally strategic alignment
_vpaes_consts:
Lk_mc_forward:	# mc_forward
	.long	0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c
	.long	0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300
	.long	0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704
	.long	0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08
	.long	0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c	?inv
	.long	0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300	?inv
	.long	0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704	?inv
	.long	0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08	?inv
Lk_mc_backward:	# mc_backward
	.long	0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e
	.long	0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a
	.long	0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506
	.long	0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102
	.long	0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e	?inv
	.long	0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a	?inv
	.long	0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506	?inv
	.long	0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102	?inv
Lk_sr:		# sr
	.long	0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
	.long	0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b
	.long	0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07
	.long	0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603
	.long	0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f	?inv
	.long	0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b	?inv
	.long	0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07	?inv
	.long	0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603	?inv

##
## "Hot" constants
##
Lk_inv:		# inv, inva
	.long	0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704
	.long	0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03
	.long	0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704	?rev
	.long	0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03	?rev
Lk_ipt:		# input transform (lo, hi)
	.long	0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca
	.long	0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd
	.long	0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca	?rev
	.long	0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd	?rev
Lk_sbo:		# sbou, sbot
	.long	0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15
	.long	0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e
	.long	0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15	?rev
	.long	0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e	?rev
Lk_sb1:		# sb1u, sb1t
	.long	0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b
	.long	0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5
	.long	0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b	?rev
	.long	0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5	?rev
Lk_sb2:		# sb2u, sb2t
	.long	0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2
	.long	0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e
	.long	0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2	?rev
	.long	0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e	?rev

##
##  Decryption stuff
##
Lk_dipt:	# decryption input transform
	.long	0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15
	.long	0x00650560, 0xe683e386, 0x94f191f4, 0x72177712
	.long	0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15	?rev
	.long	0x00650560, 0xe683e386, 0x94f191f4, 0x72177712	?rev
Lk_dsbo:	# decryption sbox final output
	.long	0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7
	.long	0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca
	.long	0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7	?rev
	.long	0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca	?rev
Lk_dsb9:	# decryption sbox output *9*u, *9*t
	.long	0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca
	.long	0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72
	.long	0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca	?rev
	.long	0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72	?rev
Lk_dsbd:	# decryption sbox output *D*u, *D*t
	.long	0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5
	.long	0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129
	.long	0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5	?rev
	.long	0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129	?rev
Lk_dsbb:	# decryption sbox output *B*u, *B*t
	.long	0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660
	.long	0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3
	.long	0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660	?rev
	.long	0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3	?rev
Lk_dsbe:	# decryption sbox output *E*u, *E*t
	.long	0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222
	.long	0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794
	.long	0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222	?rev
	.long	0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794	?rev

##
##  Key schedule constants
##
Lk_dksd:	# decryption key schedule: invskew x*D
	.long	0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007
	.long	0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f
	.long	0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007	?rev
	.long	0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f	?rev
Lk_dksb:	# decryption key schedule: invskew x*B
	.long	0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603
	.long	0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9
	.long	0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603	?rev
	.long	0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9	?rev
Lk_dkse:	# decryption key schedule: invskew x*E + 0x63
	.long	0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553
	.long	0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd
	.long	0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553	?rev
	.long	0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd	?rev
Lk_dks9:	# decryption key schedule: invskew x*9
	.long	0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a
	.long	0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b
	.long	0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a	?rev
	.long	0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b	?rev

Lk_rcon:	# rcon
	.long	0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70
	.long	0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70	?asis
Lk_s63:
	.long	0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b
	.long	0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b	?asis

Lk_opt:		# output transform
	.long	0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7
	.long	0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1
	.long	0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7	?rev
	.long	0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1	?rev
Lk_deskew:	# deskew tables: inverts the sbox's "skew"
	.long	0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d
	.long	0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128
	.long	0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d	?rev
	.long	0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128	?rev
.align	5
Lconsts:
	mflr	r0
@@ -227,7 +227,7 @@ _vpaes_encrypt_core:
	li	r11, 0x10
	lvx	v6, r9, $key
	addi	r9, r9, 16
	vperm	v5, v5, v6, $keyperm	# align round key
	?vperm	v5, v5, v6, $keyperm	# align round key
	addi	r10, r11, 0x40
	vsrb	v1, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0
	vperm	v0, $iptlo, $iptlo, v0	# vpshufb	%xmm1,	%xmm2,	%xmm1
@@ -275,7 +275,7 @@ Lenc_entry:
	vperm	v3, $invlo, v7, v4	# vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	addi	r9, r9, 16
	vxor	v2, v2, v0		# vpxor		%xmm1,	%xmm2,	%xmm2  	# 2 = io
	vperm	v5, v5, v6, $keyperm	# align round key
	?vperm	v5, v5, v6, $keyperm	# align round key
	vxor	v3, v3, v1		# vpxor		%xmm0,	%xmm3,	%xmm3	# 3 = jo
	bdnz	Lenc_loop

@@ -330,25 +330,20 @@ Lenc_entry:

	bl	_vpaes_encrypt_preheat

	neg	r8, $inp		# prepare for unaligned access
	lvsl	$keyperm, 0, $key
	 lvsr	$outperm, 0, $out
	lvsr	$inpperm, 0, r8		# -$inp
	 vnor	$outmask, v7, v7	# 0xff..ff
	lvx	$inptail, 0, $inp
	 vperm	$outmask, v7, $outmask, $outperm
	?lvsl	$inpperm, 0, $inp	# prepare for unaligned access
	lvx	v0, 0, $inp
	addi	$inp, $inp, 15		# 15 is not a typo
	 lvx	$outhead, 0, $out

	########
	vmr	v0, $inptail
	 ?lvsr	$outperm, 0, $out
	?lvsl	$keyperm, 0, $key	# prepare for unaligned access
	 vnor	$outmask, v7, v7	# 0xff..ff
	lvx	$inptail, 0, $inp	# redundant in aligned case
	addi	$inp, $inp, 16
	vperm	v0, v0, $inptail, $inpperm
	 ?vperm	$outmask, v7, $outmask, $outperm
	 lvx	$outhead, 0, $out
	?vperm	v0, v0, $inptail, $inpperm

	bl	_vpaes_encrypt_core

	vperm	v0, v0, v0, $outperm	# rotate left
	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v1, $outhead, v0, $outmask
	vmr	$outhead, v0
	stvx	v1, 0, $out
@@ -445,7 +440,7 @@ _vpaes_decrypt_core:
	li	r11, 0x30
	lvx	v6, r9, $key
	addi	r9, r9, 16
	vperm	v5, v5, v6, $keyperm	# align round key
	?vperm	v5, v5, v6, $keyperm	# align round key
	vsrb	v1, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0
	vperm	v0, $iptlo, $iptlo, v0	# vpshufb	%xmm1,	%xmm2,	%xmm2
	vperm	v1, $ipthi, $ipthi, v1	# vpshufb	%xmm0,	%xmm1,	%xmm0
@@ -509,7 +504,7 @@ Ldec_entry:
	vperm	v3, $invlo, v7, v4	# vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
	addi	r9, r9, 16
	vxor	v2, v2, v0		# vpxor		%xmm1,	%xmm2,	%xmm2	# 2 = io
	vperm	v5, v5, v6, $keyperm	# align round key
	?vperm	v5, v5, v6, $keyperm	# align round key
	vxor	v3, v3, v1		# vpxor		%xmm0,  %xmm3,	%xmm3	# 3 = jo
	bdnz	Ldec_loop

@@ -564,25 +559,20 @@ Ldec_entry:

	bl	_vpaes_decrypt_preheat

	neg	r8, $inp		# prepare for unaligned access
	lvsl	$keyperm, 0, $key
	 lvsr	$outperm, 0, $out
	lvsr	$inpperm, 0, r8		# -$inp
	 vnor	$outmask, v7, v7	# 0xff..ff
	lvx	$inptail, 0, $inp
	 vperm	$outmask, v7, $outmask, $outperm
	?lvsl	$inpperm, 0, $inp	# prepare for unaligned access
	lvx	v0, 0, $inp
	addi	$inp, $inp, 15		# 15 is not a typo
	 lvx	$outhead, 0, $out

	########
	vmr	v0, $inptail
	 ?lvsr	$outperm, 0, $out
	?lvsl	$keyperm, 0, $key
	 vnor	$outmask, v7, v7	# 0xff..ff
	lvx	$inptail, 0, $inp	# redundant in aligned case
	addi	$inp, $inp, 16
	vperm	v0, v0, $inptail, $inpperm
	 ?vperm	$outmask, v7, $outmask, $outperm
	 lvx	$outhead, 0, $out
	?vperm	v0, v0, $inptail, $inpperm

	bl	_vpaes_decrypt_core

	vperm	v0, v0, v0, $outperm	# rotate left
	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v1, $outhead, v0, $outmask
	vmr	$outhead, v0
	stvx	v1, 0, $out
@@ -673,18 +663,18 @@ Ldec_entry:

	lvx	v24, 0, r31		# load [potentially unaligned] iv
	li	r9, 15
	lvsl	$inpperm, 0, r31
	?lvsl	$inpperm, 0, r31
	lvx	v25, r9, r31
	vperm	v24, v24, v25, $inpperm
	?vperm	v24, v24, v25, $inpperm

	neg	r8, $inp		# prepare for unaligned access
	 vxor	v7, v7, v7
	lvsl	$keyperm, 0, $key
	 lvsr	$outperm, 0, $out
	lvsr	$inpperm, 0, r8		# -$inp
	?lvsl	$keyperm, 0, $key
	 ?lvsr	$outperm, 0, $out
	?lvsr	$inpperm, 0, r8		# -$inp
	 vnor	$outmask, v7, v7	# 0xff..ff
	lvx	$inptail, 0, $inp
	 vperm	$outmask, v7, $outmask, $outperm
	 ?vperm	$outmask, v7, $outmask, $outperm
	addi	$inp, $inp, 15		# 15 is not a typo
	 lvx	$outhead, 0, $out

@@ -697,14 +687,14 @@ Lcbc_enc_loop:
	vmr	v0, $inptail
	lvx	$inptail, 0, $inp
	addi	$inp, $inp, 16
	vperm	v0, v0, $inptail, $inpperm
	?vperm	v0, v0, $inptail, $inpperm
	vxor	v0, v0, v24		# ^= iv

	bl	_vpaes_encrypt_core

	vmr	v24, v0			# put aside iv
	sub.	r30, r30, r0		# len -= 16
	vperm	v0, v0, v0, $outperm	# rotate left
	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v1, $outhead, v0, $outmask
	vmr	$outhead, v0
	stvx	v1, 0, $out
@@ -722,7 +712,7 @@ Lcbc_dec_loop:
	vmr	v0, $inptail
	lvx	$inptail, 0, $inp
	addi	$inp, $inp, 16
	vperm	v0, v0, $inptail, $inpperm
	?vperm	v0, v0, $inptail, $inpperm
	vmr	v25, v0			# put aside input

	bl	_vpaes_decrypt_core
@@ -730,7 +720,7 @@ Lcbc_dec_loop:
	vxor	v0, v0, v24		# ^= iv
	vmr	v24, v25
	sub.	r30, r30, r0		# len -= 16
	vperm	v0, v0, v0, $outperm	# rotate left
	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v1, $outhead, v0, $outmask
	vmr	$outhead, v0
	stvx	v1, 0, $out
@@ -744,12 +734,12 @@ Lcbc_done:
	stvx	v1, 0, $out

	neg	r8, r31			# write [potentially unaligned] iv
	lvsl	$outperm, 0, r8
	?lvsl	$outperm, 0, r8
	li	r6, 15
	vnor	$outmask, v7, v7	# 0xff..ff
	vperm	$outmask, v7, $outmask, $outperm
	?vperm	$outmask, v7, $outmask, $outperm
	lvx	$outhead, 0, r31
	vperm	v24, v24, v24, $outperm	# rotate
	vperm	v24, v24, v24, $outperm	# rotate right/left
	vsel	v0, $outhead, v24, $outmask
	lvx	v1, r6, r31
	stvx	v0, 0, r31
@@ -863,10 +853,10 @@ _vpaes_schedule_core:
	neg	r8, $inp		# prepare for unaligned access
	lvx	v0, 0, $inp
	addi	$inp, $inp, 15		# 15 is not typo
	lvsr	$inpperm, 0, r8		# -$inp
	?lvsr	$inpperm, 0, r8		# -$inp
	lvx	v6, 0, $inp		# v6 serves as inptail
	addi	$inp, $inp, 8
	vperm	v0, v0, v6, $inpperm
	?vperm	v0, v0, v6, $inpperm

	# input transform
	vmr	v3, v0			# vmovdqa	%xmm0,	%xmm3
@@ -879,13 +869,13 @@ _vpaes_schedule_core:
	li	r8, 0x30		# mov	\$0x30,%r8d
	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10

	lvsr	$outperm, 0, $out	# prepare for unaligned access
	vspltisb	$outmask, -1	# 0xff..ff
	?lvsr	$outperm, 0, $out	# prepare for unaligned access
	vnor	$outmask, v9, v9	# 0xff..ff
	lvx	$outhead, 0, $out
	vperm	$outmask, v9, $outmask, $outperm
	?vperm	$outmask, v9, $outmask, $outperm

	#stvx	v0, 0, $out		# vmovdqu	%xmm0,	(%rdx)
	vperm	v1, v0, v0, $outperm	# rotate left
	vperm	v1, v0, v0, $outperm	# rotate right/left
	vsel	v2, $outhead, v1, $outmask
	vmr	$outhead, v1
	stvx	v2, 0, $out
@@ -901,14 +891,14 @@ Lschedule_am_decrypting:
	vperm	v4, v3, v3, v1		# vpshufb	%xmm1,	%xmm3,	%xmm3

	neg	r0, $out		# prepare for unaligned access
	lvsl	$outperm, 0, r0
	?lvsl	$outperm, 0, r0
	addi	$out, $out, 15		# 15 is not typo
	vspltisb	$outmask, -1	# 0xff..ff
	vnor	$outmask, v9, v9	# 0xff..ff
	lvx	$outhead, 0, $out
	vperm	$outmask, $outmask, v9, $outperm
	?vperm	$outmask, $outmask, v9, $outperm

	#stvx	v4, 0, $out		# vmovdqu	%xmm3,	(%rdx)
	vperm	v4, v4, v4, $outperm	# rotate left
	vperm	v4, v4, v4, $outperm	# rotate right/left
	vsel	v2, $outhead, v4, $outmask
	vmr	$outhead, v4
	stvx	v2, 0, $out
@@ -957,16 +947,16 @@ Loop_schedule_128:
Lschedule_192:
	li	r0, 4			# mov	\$4,	%esi
	lvx	v0, 0, $inp
	vperm	v0, v6, v0, $inpperm
	vsldoi	v0, v3, v0, 8		# vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
	?vperm	v0, v6, v0, $inpperm
	?vsldoi	v0, v3, v0, 8		# vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
	bl	_vpaes_schedule_transform	# input transform
	vsldoi	v6, v0, v9, 8
	vsldoi	v6, v9, v6, 8		# clobber "low" side with zeros
	?vsldoi	v6, v0, v9, 8
	?vsldoi	v6, v9, v6, 8		# clobber "low" side with zeros
	mtctr	r0

Loop_schedule_192:
	bl	_vpaes_schedule_round
	vsldoi	v0, v6, v0, 8		# vpalignr	\$8,%xmm6,%xmm0,%xmm0
	?vsldoi	v0, v6, v0, 8		# vpalignr	\$8,%xmm6,%xmm0,%xmm0
	bl	_vpaes_schedule_mangle	# save key n
	bl	_vpaes_schedule_192_smear
	bl	_vpaes_schedule_mangle	# save key n+1
@@ -991,7 +981,7 @@ Lschedule_256:
	li	r0, 7			# mov	\$7, %esi
	addi	$inp, $inp, 8
	lvx	v0, 0, $inp		# vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
	vperm	v0, v6, v0, $inpperm
	?vperm	v0, v6, v0, $inpperm
	bl	_vpaes_schedule_transform	# input transform
	mtctr	r0

@@ -1005,7 +995,7 @@ Loop_schedule_256:
	bl	_vpaes_schedule_mangle	

	# low round. swap xmm7 and xmm6
	vspltw	v0, v0, 3		# vpshufd	\$0xFF,	%xmm0,	%xmm0
	?vspltw	v0, v0, 3		# vpshufd	\$0xFF,	%xmm0,	%xmm0
	vmr	v5, v7			# vmovdqa	%xmm7,	%xmm5
	vmr	v7, v6			# vmovdqa	%xmm6,	%xmm7
	bl	_vpaes_schedule_low_round
@@ -1042,7 +1032,7 @@ Lschedule_mangle_last:
	bl	_vpaes_schedule_transform	# output transform

	#stvx	v0, r0, $out		# vmovdqu	%xmm0,	(%rdx)		# save last key
	vperm	v0, v0, v0, $outperm	# rotate left
	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v2, $outhead, v0, $outmask
	vmr	$outhead, v0
	stvx	v2, 0, $out
@@ -1062,7 +1052,7 @@ Lschedule_mangle_last_dec:
	bl	_vpaes_schedule_transform	# output transform

	#stvx	v0, r0, $out		# vmovdqu	%xmm0,	(%rdx)		# save last key
	vperm	v0, v0, v0, $outperm	# rotate left
	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v2, $outhead, v0, $outmask
	vmr	$outhead, v0
	stvx	v2, 0, $out
@@ -1104,14 +1094,14 @@ Lschedule_mangle_done:
##
.align	4
_vpaes_schedule_192_smear:
	vspltw	v0, v7, 3
	vsldoi	v1, v9, v6, 12		# vpshufd	\$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
	vsldoi	v0, v7, v0, 8		# vpshufd	\$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
	?vspltw	v0, v7, 3
	?vsldoi	v1, v9, v6, 12		# vpshufd	\$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
	?vsldoi	v0, v7, v0, 8		# vpshufd	\$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
	vxor	v6, v6, v1		# vpxor		%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
	vxor	v6, v6, v0		# vpxor		%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
	vmr	v0, v6
	vsldoi	v6, v6, v9, 8
	vsldoi	v6, v9, v6, 8		# clobber low side with zeros
	?vsldoi	v6, v6, v9, 8
	?vsldoi	v6, v9, v6, 8		# clobber low side with zeros
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
@@ -1138,23 +1128,23 @@ _vpaes_schedule_192_smear:
_vpaes_schedule_round:
	# extract rcon from xmm8
	#vxor	v4, v4, v4		# vpxor		%xmm4,	%xmm4,	%xmm4
	vsldoi	v1, $rcon, v9, 15	# vpalignr	\$15,	%xmm8,	%xmm4,	%xmm1
	vsldoi	$rcon, $rcon, $rcon, 15	# vpalignr	\$15,	%xmm8,	%xmm8,	%xmm8
	?vsldoi	v1, $rcon, v9, 15	# vpalignr	\$15,	%xmm8,	%xmm4,	%xmm1
	?vsldoi	$rcon, $rcon, $rcon, 15	# vpalignr	\$15,	%xmm8,	%xmm8,	%xmm8
	vxor	v7, v7, v1		# vpxor		%xmm1,	%xmm7,	%xmm7

	# rotate
	vspltw	v0, v0, 3		# vpshufd	\$0xFF,	%xmm0,	%xmm0
	vsldoi	v0, v0, v0, 1		# vpalignr	\$1,	%xmm0,	%xmm0,	%xmm0
	?vspltw	v0, v0, 3		# vpshufd	\$0xFF,	%xmm0,	%xmm0
	?vsldoi	v0, v0, v0, 1		# vpalignr	\$1,	%xmm0,	%xmm0,	%xmm0

	# fall through...

	# low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	# smear xmm7
	vsldoi	v1, v9, v7, 12		# vpslldq	\$4,	%xmm7,	%xmm1
	?vsldoi	v1, v9, v7, 12		# vpslldq	\$4,	%xmm7,	%xmm1
	vxor	v7, v7, v1		# vpxor		%xmm1,	%xmm7,	%xmm7
	vspltisb	v1, 0x0f	# 0x0f..0f
	vsldoi	v4, v9, v7, 8		# vpslldq	\$8,	%xmm7,	%xmm4
	?vsldoi	v4, v9, v7, 8		# vpslldq	\$8,	%xmm7,	%xmm4

	# subbytes
	vand	v1, v1, v0		# vpand		%xmm9,	%xmm0,	%xmm1		# 0 = k
@@ -1248,7 +1238,7 @@ _vpaes_schedule_mangle:
	andi.	r8, r8, 0x30		# and	\$0x30,	%r8

	#stvx	v3, 0, $out		# vmovdqu	%xmm3,	(%rdx)
	vperm	v1, v3, v3, $outperm	# rotate left
	vperm	v1, v3, v3, $outperm	# rotate right/left
	vsel	v2, $outhead, v1, $outmask
	vmr	$outhead, v1
	stvx	v2, 0, $out
@@ -1299,7 +1289,7 @@ Lschedule_mangle_dec:
	andi.	r8, r8, 0x30		# and	\$0x30,	%r8

	#stvx	v3, 0, $out		# vmovdqu	%xmm3,	(%rdx)
	vperm	v1, v3, v3, $outperm	# rotate left
	vperm	v1, v3, v3, $outperm	# rotate right/left
	vsel	v2, $outhead, v1, $outmask
	vmr	$outhead, v1
	stvx	v2, 0, $out
@@ -1346,7 +1336,7 @@ Lschedule_mangle_dec:
	addi	r9, r9, 6		# add	\$5,%eax
	stw	r9, 240($out)		# mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;

	cmplw	$dir, $bits, $bits
	cmplw	$dir, $bits, $bits	# set encrypt direction
	li	r8, 0x30		# mov	\$0x30,%r8d
	bl	_vpaes_schedule_core

@@ -1427,7 +1417,7 @@ Lschedule_mangle_dec:
	slwi	r9, r9, 4		# shl	\$4,%eax
	add	$out, $out, r9		# lea	(%rdx,%rax),%rdx

	cmplwi	$dir, $bits, 0
	cmplwi	$dir, $bits, 0		# set decrypt direction
	srwi	r8, $bits, 1		# shr	\$1,%r8d
	andi.	r8, r8, 32		# and	\$32,%r8d
	xori	r8, r8, 32		# xor	\$32,%r8d	# nbits==192?0:32
@@ -1470,8 +1460,48 @@ Lschedule_mangle_dec:
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;
my $consts=1;
foreach  (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	# constants table endian-specific conversion
	if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) {
	    my $conv=$2;
	    my @bytes=();

	    # convert to endian-agnostic format
	    foreach (split(/,\s+/,$1)) {
		my $l = /^0/?oct:int;
		push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
	    }

	    # little-endian conversion
	    if ($flavour =~ /le$/o) {
		SWITCH: for($conv)  {
		    /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
		    /\?rev/ && do   { @bytes=reverse(@bytes);    last; }; 
		}
	    }

	    #emit
	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
	    next;
	}
	$consts=0 if (m/Lconsts:/o);	# end of table

	# instructions prefixed with '?' are endian-specific and need
	# to be adjusted accordingly...
	if ($flavour =~ /le$/o) {	# little-endian
	    s/\?lvsr/lvsl/o or
	    s/\?lvsl/lvsr/o or
	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
	} else {			# big-endian
	    s/\?([a-z]+)/$1/o;
	}

print $code;
	print $_,"\n";
}

close STDOUT;