Commit 98dc1784 authored by Andy Polyakov

aes-x86_64.pl: Atom-specific optimizations, +10%.

vpaes-x86_64.pl: minor performance squeeze.
parent 89f1eb82
aes-x86_64.pl: +122 −124
@@ -19,9 +19,10 @@
# Performance in number of cycles per processed byte for 128-bit key:
#
#		ECB encrypt	ECB decrypt	CBC large chunk
-# AMD64		33		41		13.0
-# EM64T		38		59		18.6(*)
-# Core 2	30		43		14.5(*)
+# AMD64		33		43		13.0
+# EM64T		38		56		18.6(*)
+# Core 2	30		42		14.5(*)
+# Atom		65		86		32.1(*)
#
# (*) with hyper-threading off

@@ -365,68 +366,66 @@ $code.=<<___;
	movzb	`&lo("$s0")`,$t0
	movzb	`&lo("$s1")`,$t1
	movzb	`&lo("$s2")`,$t2
	movzb	($sbox,$t0,1),$t0
	movzb	($sbox,$t1,1),$t1
	movzb	($sbox,$t2,1),$t2

	movzb	`&lo("$s3")`,$t3
	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	shr	\$16,$s2
	movzb	`&hi("$s3")`,$acc2
	movzb	($sbox,$t0,1),$t0
	movzb	($sbox,$t1,1),$t1
	movzb	($sbox,$t2,1),$t2
	movzb	($sbox,$t3,1),$t3
	movzb	($sbox,$acc0,1),$t4	#$t0
	movzb	($sbox,$acc1,1),$t5	#$t1

	movzb	`&hi("$s3")`,$acc2
	movzb	($sbox,$acc0,1),$t4	#$t0
	movzb	`&hi("$s0")`,$acc0
	shr	\$16,$s2
	movzb	($sbox,$acc1,1),$t5	#$t1
	movzb	`&lo("$s2")`,$acc1
	movzb	($sbox,$acc2,1),$acc2	#$t2
	movzb	($sbox,$acc0,1),$acc0	#$t3
	shr	\$16,$s3

	movzb	`&lo("$s2")`,$acc1
	shl	\$8,$t4
	shr	\$16,$s3
	shl	\$8,$t5
	movzb	($sbox,$acc1,1),$acc1	#$t0
	xor	$t4,$t0
	xor	$t5,$t1

	movzb	`&lo("$s3")`,$t4
	shr	\$16,$s0
	movzb	`&lo("$s3")`,$t4
	shr	\$16,$s1
	movzb	`&lo("$s0")`,$t5
	xor	$t5,$t1
	shl	\$8,$acc2
	shl	\$8,$acc0
	movzb	($sbox,$t4,1),$t4	#$t1
	movzb	($sbox,$t5,1),$t5	#$t2
	movzb	`&lo("$s0")`,$t5
	movzb	($sbox,$acc1,1),$acc1	#$t0
	xor	$acc2,$t2
	xor	$acc0,$t3

	shl	\$8,$acc0
	movzb	`&lo("$s1")`,$acc2
	movzb	`&hi("$s3")`,$acc0
	shl	\$16,$acc1
	movzb	($sbox,$acc2,1),$acc2	#$t3
	movzb	($sbox,$acc0,1),$acc0	#$t0
	xor	$acc0,$t3
	movzb	($sbox,$t4,1),$t4	#$t1
	movzb	`&hi("$s3")`,$acc0
	movzb	($sbox,$t5,1),$t5	#$t2
	xor	$acc1,$t0

	movzb	`&hi("$s0")`,$acc1
	shr	\$8,$s2
	movzb	`&hi("$s0")`,$acc1
	shl	\$16,$t4
	shr	\$8,$s1
	shl	\$16,$t5
	xor	$t4,$t1
	movzb	($sbox,$acc2,1),$acc2	#$t3
	movzb	($sbox,$acc0,1),$acc0	#$t0
	movzb	($sbox,$acc1,1),$acc1	#$t1
	movzb	($sbox,$s2,1),$s3	#$t3
	movzb	($sbox,$s1,1),$s2	#$t2
	shl	\$16,$t4
	shl	\$16,$t5

	shl	\$16,$acc2
	xor	$t4,$t1
	xor	$t5,$t2
	xor	$acc2,$t3

	shl	\$24,$acc0
	xor	$acc2,$t3
	shl	\$24,$acc1
	shl	\$24,$s3
	xor	$acc0,$t0
	shl	\$24,$s2
	shl	\$24,$s3
	xor	$acc1,$t1
	shl	\$24,$s2
	mov	$t0,$s0
	mov	$t1,$s1
	xor	$t2,$s2
@@ -465,12 +464,12 @@ sub enctransform()
{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");

$code.=<<___;
	mov	$s0,$acc0
	mov	$s1,$acc1
	and	\$0x80808080,$acc0
	and	\$0x80808080,$acc1
	mov	$acc0,$t0
	mov	$acc1,$t1
	mov	\$0x80808080,$t0
	mov	\$0x80808080,$t1
	and	$s0,$t0
	and	$s1,$t1
	mov	$t0,$acc0
	mov	$t1,$acc1
	shr	\$7,$t0
	lea	($s0,$s0),$r20
	shr	\$7,$t1
@@ -488,25 +487,25 @@ $code.=<<___;

	xor	$r20,$s0
	xor	$r21,$s1
	 mov	$s2,$acc0
	 mov	$s3,$acc1
	 mov	\$0x80808080,$t2
	rol	\$24,$s0
	 mov	\$0x80808080,$t3
	rol	\$24,$s1
	 and	\$0x80808080,$acc0
	 and	\$0x80808080,$acc1
	 and	$s2,$t2
	 and	$s3,$t3
	xor	$r20,$s0
	xor	$r21,$s1
	 mov	$acc0,$t2
	 mov	$acc1,$t3
	 mov	$t2,$acc0
	ror	\$16,$t0
	 mov	$t3,$acc1
	ror	\$16,$t1
	 shr	\$7,$t2
	 lea	($s2,$s2),$r20
	 shr	\$7,$t2
	xor	$t0,$s0
	xor	$t1,$s1
	 shr	\$7,$t3
	 lea	($s3,$s3),$r21
	xor	$t1,$s1
	ror	\$8,$t0
	 lea	($s3,$s3),$r21
	ror	\$8,$t1
	 sub	$t2,$acc0
	 sub	$t3,$acc1
@@ -522,23 +521,23 @@ $code.=<<___;
	xor	$acc0,$r20
	xor	$acc1,$r21

	ror	\$16,$t2
	xor	$r20,$s2
	ror	\$16,$t3
	xor	$r21,$s3
	rol	\$24,$s2
	mov	0($sbox),$acc0			# prefetch Te4
	rol	\$24,$s3
	xor	$r20,$s2
	xor	$r21,$s3
	mov	0($sbox),$acc0			# prefetch Te4
	ror	\$16,$t2
	ror	\$16,$t3
	mov	64($sbox),$acc1
	xor	$t2,$s2
	xor	$t3,$s3
	xor	$r21,$s3
	mov	128($sbox),$r20
	xor	$t2,$s2
	ror	\$8,$t2
	xor	$t3,$s3
	ror	\$8,$t3
	mov	192($sbox),$r21
	xor	$t2,$s2
	mov	192($sbox),$r21
	xor	$t3,$s3
___
}
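
The enctransform() hunks above all make the same substitution: instead of copying the state and AND-ing the copy with an immediate (`mov $s0,$acc0; and \$0x80808080,$acc0`), the new code loads the mask first and ANDs the state into it (`mov \$0x80808080,$t0; and $s0,$t0`), presumably because the immediate load has no input dependency and an in-order core like Atom can issue it early. The masks themselves implement packed GF(2^8) doubling ("xtime") of four state bytes at once; a minimal C sketch of that trick, with illustrative names (the .pl file expresses it only as the mov/and/shr/lea/sub sequence above):

```c
#include <stdint.h>

/* Packed "xtime": multiply each of the four bytes in a 32-bit word by 2
 * in GF(2^8), reducing overflowing bytes by the AES polynomial 0x1b.
 * Mirrors the 0x80808080 / 0xfefefefe / 0x1b1b1b1b mask sequence in
 * enctransform(); function and variable names are mine. */
static uint32_t xtime4(uint32_t s)
{
    uint32_t hi  = s & 0x80808080u;                /* high bit of every byte       */
    uint32_t red = (hi - (hi >> 7)) & 0x1b1b1b1bu; /* 0x1b where a byte overflows  */
    return ((s << 1) & 0xfefefefeu) ^ red;         /* shift, drop carries, reduce  */
}
```

MixColumns then reduces to xtime4() plus XORs and byte rotations of the state words, which is what the rol/ror/xor traffic in this subroutine computes.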
@@ -935,70 +934,69 @@ $code.=<<___;
	movzb	`&lo("$s0")`,$t0
	movzb	`&lo("$s1")`,$t1
	movzb	`&lo("$s2")`,$t2
	movzb	($sbox,$t0,1),$t0
	movzb	($sbox,$t1,1),$t1
	movzb	($sbox,$t2,1),$t2

	movzb	`&lo("$s3")`,$t3
	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	shr	\$16,$s3
	movzb	`&hi("$s1")`,$acc2
	movzb	($sbox,$t0,1),$t0
	movzb	($sbox,$t1,1),$t1
	movzb	($sbox,$t2,1),$t2
	movzb	($sbox,$t3,1),$t3
	movzb	($sbox,$acc0,1),$t4	#$t0
	movzb	($sbox,$acc1,1),$t5	#$t1

	movzb	`&hi("$s1")`,$acc2
	movzb	($sbox,$acc0,1),$t4	#$t0
	movzb	`&hi("$s2")`,$acc0
	shr	\$16,$s2
	movzb	($sbox,$acc1,1),$t5	#$t1
	movzb	($sbox,$acc2,1),$acc2	#$t2
	movzb	($sbox,$acc0,1),$acc0	#$t3
	shr	\$16,$s3

	movzb	`&lo("$s2")`,$acc1
	shl	\$8,$t4
	shr	\$16,$s2
	shl	\$8,$t5
	movzb	($sbox,$acc1,1),$acc1	#$t0
	xor	$t4,$t0
	xor	$t5,$t1

	movzb	`&lo("$s3")`,$t4
	shl	\$8,$t4
	movzb	`&lo("$s2")`,$acc1
	shr	\$16,$s0
	xor	$t4,$t0
	shr	\$16,$s1
	movzb	`&lo("$s0")`,$t5
	movzb	`&lo("$s3")`,$t4

	shl	\$8,$acc2
	xor	$t5,$t1
	shl	\$8,$acc0
	movzb	($sbox,$t4,1),$t4	#$t1
	movzb	($sbox,$t5,1),$t5	#$t2
	movzb	`&lo("$s0")`,$t5
	movzb	($sbox,$acc1,1),$acc1	#$t0
	xor	$acc2,$t2
	xor	$acc0,$t3

	movzb	`&lo("$s1")`,$acc2
	movzb	`&hi("$s1")`,$acc0

	shl	\$16,$acc1
	xor	$acc0,$t3
	movzb	($sbox,$t4,1),$t4	#$t1
	movzb	`&hi("$s1")`,$acc0
	movzb	($sbox,$acc2,1),$acc2	#$t3
	movzb	($sbox,$acc0,1),$acc0	#$t0
	xor	$acc1,$t0

	movzb	($sbox,$t5,1),$t5	#$t2
	movzb	`&hi("$s2")`,$acc1

	shl	\$16,$acc2
	shl	\$16,$t4
	shl	\$16,$t5
	movzb	($sbox,$acc1,1),$s1	#$t1
	xor	$acc2,$t3
	movzb	`&hi("$s3")`,$acc2
	xor	$t4,$t1
	shr	\$8,$s0
	xor	$t5,$t2

	movzb	`&hi("$s3")`,$acc1
	shr	\$8,$s0
	shl	\$16,$acc2
	movzb	($sbox,$acc1,1),$s2	#$t2
	movzb	($sbox,$acc0,1),$acc0	#$t0
	movzb	($sbox,$acc1,1),$s1	#$t1
	movzb	($sbox,$acc2,1),$s2	#$t2
	movzb	($sbox,$s0,1),$s3	#$t3
	xor	$acc2,$t3

	mov	$t0,$s0
	shl	\$24,$acc0
	shl	\$24,$s1
	shl	\$24,$s2
	xor	$acc0,$t0
	xor	$acc0,$s0
	shl	\$24,$s3
	xor	$t1,$s1
	mov	$t0,$s0
	xor	$t2,$s2
	xor	$t3,$s3
___
@@ -1013,12 +1011,12 @@ sub dectransform()
  my $prefetch = shift;

$code.=<<___;
	mov	$tp10,$acc0
	mov	$tp18,$acc8
	and	$mask80,$acc0
	and	$mask80,$acc8
	mov	$acc0,$tp40
	mov	$acc8,$tp48
	mov	$mask80,$tp40
	mov	$mask80,$tp48
	and	$tp10,$tp40
	and	$tp18,$tp48
	mov	$tp40,$acc0
	mov	$tp48,$acc8
	shr	\$7,$tp40
	lea	($tp10,$tp10),$tp20
	shr	\$7,$tp48
@@ -1029,15 +1027,15 @@ $code.=<<___;
	and	$maskfe,$tp28
	and	$mask1b,$acc0
	and	$mask1b,$acc8
	xor	$tp20,$acc0
	xor	$tp28,$acc8
	mov	$acc0,$tp20
	mov	$acc8,$tp28

	and	$mask80,$acc0
	and	$mask80,$acc8
	mov	$acc0,$tp80
	mov	$acc8,$tp88
	xor	$acc0,$tp20
	xor	$acc8,$tp28
	mov	$mask80,$tp80
	mov	$mask80,$tp88

	and	$tp20,$tp80
	and	$tp28,$tp88
	mov	$tp80,$acc0
	mov	$tp88,$acc8
	shr	\$7,$tp80
	lea	($tp20,$tp20),$tp40
	shr	\$7,$tp88
@@ -1048,15 +1046,15 @@ $code.=<<___;
	and	$maskfe,$tp48
	and	$mask1b,$acc0
	and	$mask1b,$acc8
	xor	$tp40,$acc0
	xor	$tp48,$acc8
	mov	$acc0,$tp40
	mov	$acc8,$tp48

	and	$mask80,$acc0
	and	$mask80,$acc8
	mov	$acc0,$tp80
	mov	$acc8,$tp88
	xor	$acc0,$tp40
	xor	$acc8,$tp48
	mov	$mask80,$tp80
	mov	$mask80,$tp88

	and	$tp40,$tp80
	and	$tp48,$tp88
	mov	$tp80,$acc0
	mov	$tp88,$acc8
	shr	\$7,$tp80
	 xor	$tp10,$tp20		# tp2^=tp1
	shr	\$7,$tp88
@@ -1081,51 +1079,51 @@ $code.=<<___;
	mov	$tp10,$acc0
	mov	$tp18,$acc8
	xor	$tp80,$tp40		# tp4^tp1^=tp8
	xor	$tp88,$tp48		# tp4^tp1^=tp8
	shr	\$32,$acc0
	xor	$tp88,$tp48		# tp4^tp1^=tp8
	shr	\$32,$acc8
	xor	$tp20,$tp80		# tp8^=tp8^tp2^tp1=tp2^tp1
	xor	$tp28,$tp88		# tp8^=tp8^tp2^tp1=tp2^tp1
	rol	\$8,`&LO("$tp10")`	# ROTATE(tp1^tp8,8)
	xor	$tp28,$tp88		# tp8^=tp8^tp2^tp1=tp2^tp1
	rol	\$8,`&LO("$tp18")`	# ROTATE(tp1^tp8,8)
	xor	$tp40,$tp80		# tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
	rol	\$8,`&LO("$acc0")`	# ROTATE(tp1^tp8,8)
	xor	$tp48,$tp88		# tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2

	rol	\$8,`&LO("$acc0")`	# ROTATE(tp1^tp8,8)
	rol	\$8,`&LO("$acc8")`	# ROTATE(tp1^tp8,8)
	xor	`&LO("$tp80")`,`&LO("$tp10")`
	xor	`&LO("$tp88")`,`&LO("$tp18")`
	shr	\$32,$tp80
	xor	`&LO("$tp88")`,`&LO("$tp18")`
	shr	\$32,$tp88
	xor	`&LO("$tp80")`,`&LO("$acc0")`
	xor	`&LO("$tp88")`,`&LO("$acc8")`

	mov	$tp20,$tp80
	mov	$tp28,$tp88
	shr	\$32,$tp80
	shr	\$32,$tp88
	rol	\$24,`&LO("$tp20")`	# ROTATE(tp2^tp1^tp8,24)
	mov	$tp28,$tp88
	rol	\$24,`&LO("$tp28")`	# ROTATE(tp2^tp1^tp8,24)
	rol	\$24,`&LO("$tp80")`	# ROTATE(tp2^tp1^tp8,24)
	rol	\$24,`&LO("$tp88")`	# ROTATE(tp2^tp1^tp8,24)
	shr	\$32,$tp80
	xor	`&LO("$tp20")`,`&LO("$tp10")`
	shr	\$32,$tp88
	xor	`&LO("$tp28")`,`&LO("$tp18")`
	rol	\$24,`&LO("$tp80")`	# ROTATE(tp2^tp1^tp8,24)
	mov	$tp40,$tp20
	rol	\$24,`&LO("$tp88")`	# ROTATE(tp2^tp1^tp8,24)
	mov	$tp48,$tp28
	shr	\$32,$tp20
	xor	`&LO("$tp80")`,`&LO("$acc0")`
	shr	\$32,$tp28
	xor	`&LO("$tp88")`,`&LO("$acc8")`

	`"mov	0($sbox),$mask80"	if ($prefetch)`
	shr	\$32,$tp20
	shr	\$32,$tp28
	`"mov	64($sbox),$maskfe"	if ($prefetch)`
	rol	\$16,`&LO("$tp40")`	# ROTATE(tp4^tp1^tp8,16)
	`"mov	64($sbox),$maskfe"	if ($prefetch)`
	rol	\$16,`&LO("$tp48")`	# ROTATE(tp4^tp1^tp8,16)
	`"mov	128($sbox),$mask1b"	if ($prefetch)`
	rol	\$16,`&LO("$tp20")`	# ROTATE(tp4^tp1^tp8,16)
	rol	\$16,`&LO("$tp28")`	# ROTATE(tp4^tp1^tp8,16)
	`"mov	192($sbox),$tp80"	if ($prefetch)`
	xor	`&LO("$tp40")`,`&LO("$tp10")`
	rol	\$16,`&LO("$tp28")`	# ROTATE(tp4^tp1^tp8,16)
	xor	`&LO("$tp48")`,`&LO("$tp18")`
	`"mov	256($sbox),$tp88"	if ($prefetch)`
	xor	`&LO("$tp20")`,`&LO("$acc0")`
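
dectransform() runs that doubling chain three times per word (tp2 = 2·tp1, tp4 = 4·tp1, tp8 = 8·tp1) and folds the results together with byte rotations, per the # comments in the hunk above. A hedged sketch of the same algebra on one little-endian column, reusing xtime4() from the earlier sketch (rol32 and the function name are mine):

```c
static uint32_t rol32(uint32_t w, int n)
{
    return (w << n) | (w >> (32 - n));
}

/* InvMixColumns on one column: coefficients 0e/0b/0d/09 decomposed into
 * the tp2/tp4/tp8 doublings plus the three rotated terms named in the
 * diff's comments (tp1^tp8, tp2^tp1^tp8, tp4^tp1^tp8). */
static uint32_t inv_mix_column(uint32_t tp1)
{
    uint32_t tp2 = xtime4(tp1);       /* 02·a */
    uint32_t tp4 = xtime4(tp2);       /* 04·a */
    uint32_t tp8 = xtime4(tp4);       /* 08·a */
    uint32_t tp9 = tp1 ^ tp8;         /* 09·a = tp1^tp8                     */
    return (tp2 ^ tp4 ^ tp8)          /* 0e·a, unrotated                    */
         ^ rol32(tp9,        8)       /* ROTATE(tp1^tp8,8)                  */
         ^ rol32(tp4 ^ tp9, 16)       /* 0d·a = tp4^tp1^tp8, ROTATE(..,16)  */
         ^ rol32(tp2 ^ tp9, 24);      /* 0b·a = tp2^tp1^tp8, ROTATE(..,24)  */
}
```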
vpaes-x86_64.pl: +42 −43
@@ -27,9 +27,9 @@
#
#		aes-x86_64.pl		vpaes-x86_64.pl
#
-# Core 2(**)	30.5/43.7/14.3		21.8/25.7(***)
-# Nehalem	30.5/42.2/14.6		 9.8/11.8
-# Atom		63.9/79.0/32.1		64.0/84.8(***)
+# Core 2(**)	29.6/41.1/14.3		21.9/25.2(***)
+# Nehalem	29.6/40.3/14.6		10.0/11.8
+# Atom		57.3/74.2/32.1		60.9/82.3(***)
#
# (*)	"Hyper-threading" in the context refers rather to cache shared
#	among multiple cores, than to specifically Intel HTT. As vast
@@ -40,7 +40,7 @@
# (**)	"Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***)	Less impressive improvement on Core 2 and Atom is due to slow
-#	pshufb,	yet it's respectable +40%/78% improvement on Core 2
+#	pshufb,	yet it's respectable +36%/62% improvement on Core 2
#	(as implied, over "hyper-threading-safe" code path).
#
#						<appro@openssl.org>
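
vpaes (Mike Hamburg's constant-time AES) avoids cache-timing-prone table loads by doing all S-box work as pshufb lookups into 16-byte tables, which is why slow pshufb penalizes Core 2 and Atom above. The .Lenc_entry/.Ldec_entry code below splits each byte into nibbles exactly as in this SSSE3 sketch (tab_lo/tab_hi are hypothetical stand-ins for the real .Lk_* constants):

```c
#include <tmmintrin.h>  /* SSSE3: _mm_shuffle_epi8 (pshufb) */

/* Sixteen parallel 4-bit table lookups: split each byte of x into its
 * high nibble i and low nibble k, look each up in a 16-byte table, and
 * combine.  This is the pandn/psrld/pand/pshufb pattern at the top of
 * every vpaes round. */
static __m128i nibble_lookup(__m128i x, __m128i tab_lo, __m128i tab_hi)
{
    const __m128i m0f = _mm_set1_epi8(0x0f);
    __m128i k = _mm_and_si128(x, m0f);                    /* k = low nibble  */
    __m128i i = _mm_and_si128(_mm_srli_epi32(x, 4), m0f); /* i = high nibble */
    return _mm_xor_si128(_mm_shuffle_epi8(tab_lo, k),
                         _mm_shuffle_epi8(tab_hi, i));
}
```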
@@ -94,8 +94,8 @@ _vpaes_encrypt_core:
	movdqa	.Lk_ipt+16(%rip), %xmm0	# ipthi
	pshufb	%xmm1,	%xmm0
	pxor	%xmm5,	%xmm2
	pxor	%xmm2,	%xmm0
	add	\$16,	%r9
	pxor	%xmm2,	%xmm0
	lea	.Lk_mc_backward(%rip),%r10
	jmp	.Lenc_entry

@@ -103,19 +103,19 @@ _vpaes_encrypt_core:
.Lenc_loop:
	# middle of middle round
	movdqa  %xmm13,	%xmm4	# 4 : sb1u
	pshufb  %xmm2,	%xmm4	# 4 = sb1u
	pxor	%xmm5,	%xmm4	# 4 = sb1u + k
	movdqa  %xmm12,	%xmm0	# 0 : sb1t
	pshufb  %xmm2,	%xmm4	# 4 = sb1u
	pshufb  %xmm3,	%xmm0	# 0 = sb1t
	pxor	%xmm4,	%xmm0	# 0 = A
	pxor	%xmm5,	%xmm4	# 4 = sb1u + k
	movdqa  %xmm15,	%xmm5	# 4 : sb2u
	pshufb	%xmm2,	%xmm5	# 4 = sb2u
	pxor	%xmm4,	%xmm0	# 0 = A
	movdqa	-0x40(%r11,%r10), %xmm1		# .Lk_mc_forward[]
	pshufb	%xmm2,	%xmm5	# 4 = sb2u
	movdqa	(%r11,%r10), %xmm4		# .Lk_mc_backward[]
	movdqa	%xmm14, %xmm2	# 2 : sb2t
	pshufb	%xmm3,  %xmm2	# 2 = sb2t
	pxor	%xmm5,	%xmm2	# 2 = 2A
	movdqa	(%r11,%r10), %xmm4		# .Lk_mc_backward[]
	movdqa	%xmm0,  %xmm3	# 3 = A
	pxor	%xmm5,	%xmm2	# 2 = 2A
	pshufb  %xmm1,  %xmm0	# 0 = B
	add	\$16,	%r9	# next key
	pxor	%xmm2,  %xmm0	# 0 = 2A+B
@@ -124,30 +124,30 @@ _vpaes_encrypt_core:
	pxor	%xmm0,	%xmm3	# 3 = 2A+B+D
	pshufb  %xmm1,	%xmm0	# 0 = 2B+C
	and	\$0x30,	%r11	# ... mod 4
	pxor	%xmm3,	%xmm0	# 0 = 2A+3B+C+D
	sub	\$1,%rax	# nr--
	pxor	%xmm3,	%xmm0	# 0 = 2A+3B+C+D

.Lenc_entry:
	# top of round
	movdqa  %xmm9, 	%xmm1	# 1 : i
	movdqa	%xmm11, %xmm5	# 2 : a/k
	pandn	%xmm0, 	%xmm1	# 1 = i<<4
	psrld	\$4,   	%xmm1   # 1 = i
	pand	%xmm9, 	%xmm0   # 0 = k
	movdqa	%xmm11, %xmm5	# 2 : a/k
	pshufb  %xmm0,  %xmm5	# 2 = a/k
	pxor	%xmm1,	%xmm0	# 0 = j
	movdqa	%xmm10,	%xmm3  	# 3 : 1/i
	pxor	%xmm1,	%xmm0	# 0 = j
	pshufb  %xmm1, 	%xmm3  	# 3 = 1/i
	pxor	%xmm5, 	%xmm3  	# 3 = iak = 1/i + a/k
	movdqa	%xmm10,	%xmm4  	# 4 : 1/j
	pxor	%xmm5, 	%xmm3  	# 3 = iak = 1/i + a/k
	pshufb	%xmm0, 	%xmm4  	# 4 = 1/j
	pxor	%xmm5, 	%xmm4  	# 4 = jak = 1/j + a/k
	movdqa	%xmm10,	%xmm2  	# 2 : 1/iak
	pxor	%xmm5, 	%xmm4  	# 4 = jak = 1/j + a/k
	pshufb  %xmm3,	%xmm2  	# 2 = 1/iak
	pxor	%xmm0, 	%xmm2  	# 2 = io
	movdqa	%xmm10, %xmm3   # 3 : 1/jak
	movdqu	(%r9),	%xmm5
	pxor	%xmm0, 	%xmm2  	# 2 = io
	pshufb  %xmm4,  %xmm3   # 3 = 1/jak
	movdqu	(%r9),	%xmm5
	pxor	%xmm1,  %xmm3   # 3 = jo
	jnz	.Lenc_loop

@@ -200,62 +200,61 @@ _vpaes_decrypt_core:
##  Inverse mix columns
##
	movdqa  -0x20(%r10),%xmm4	# 4 : sb9u
	movdqa  -0x10(%r10),%xmm1	# 0 : sb9t
	pshufb	%xmm2,	%xmm4		# 4 = sb9u
	pshufb	%xmm3,	%xmm1		# 0 = sb9t
	pxor	%xmm0,	%xmm4
	movdqa  -0x10(%r10),%xmm0	# 0 : sb9t
	pshufb	%xmm3,	%xmm0		# 0 = sb9t
	pxor	%xmm4,	%xmm0		# 0 = ch
	add	\$16, %r9		# next round key
	pxor	%xmm4,	%xmm1		# 0 = ch

	pshufb	%xmm5,	%xmm0		# MC ch
	movdqa  0x00(%r10),%xmm4	# 4 : sbdu
	pshufb	%xmm5,	%xmm1		# MC ch
	pshufb	%xmm2,	%xmm4		# 4 = sbdu
	pxor	%xmm0,	%xmm4		# 4 = ch
	movdqa  0x10(%r10),%xmm0	# 0 : sbdt
	pxor	%xmm1,	%xmm4		# 4 = ch
	pshufb	%xmm3,	%xmm0		# 0 = sbdt
	pxor	%xmm4,	%xmm0		# 0 = ch
	sub	\$1,%rax		# nr--
	pxor	%xmm4,	%xmm0		# 0 = ch

	pshufb	%xmm5,	%xmm0		# MC ch
	movdqa  0x20(%r10),%xmm4	# 4 : sbbu
	pshufb	%xmm5,	%xmm0		# MC ch
	movdqa  0x30(%r10),%xmm1	# 0 : sbbt
	pshufb	%xmm2,	%xmm4		# 4 = sbbu
	pshufb	%xmm3,	%xmm1		# 0 = sbbt
	pxor	%xmm0,	%xmm4		# 4 = ch
	movdqa  0x30(%r10),%xmm0	# 0 : sbbt
	pshufb	%xmm3,	%xmm0		# 0 = sbbt
	pxor	%xmm4,	%xmm0		# 0 = ch
	pxor	%xmm4,	%xmm1		# 0 = ch

	pshufb	%xmm5,	%xmm0		# MC ch
	movdqa  0x40(%r10),%xmm4	# 4 : sbeu
	pshufb	%xmm2,	%xmm4		# 4 = sbeu
	pxor	%xmm0,	%xmm4		# 4 = ch
	pshufb	%xmm5,	%xmm1		# MC ch
	movdqa  0x50(%r10),%xmm0	# 0 : sbet
	pshufb	%xmm2,	%xmm4		# 4 = sbeu
	pshufb	%xmm3,	%xmm0		# 0 = sbet
	pxor	%xmm4,	%xmm0		# 0 = ch

	palignr	\$12,	%xmm5,	%xmm5
	pxor	%xmm1,	%xmm4		# 4 = ch
	pxor	%xmm4,	%xmm0		# 0 = ch

.Ldec_entry:
	# top of round
	movdqa  %xmm9, 	%xmm1	# 1 : i
	pandn	%xmm0, 	%xmm1	# 1 = i<<4
	movdqa	%xmm11, %xmm2	# 2 : a/k
	psrld	\$4,    %xmm1	# 1 = i
	pand	%xmm9, 	%xmm0	# 0 = k
	movdqa	%xmm11, %xmm2	# 2 : a/k
	pshufb  %xmm0,  %xmm2	# 2 = a/k
	pxor	%xmm1,	%xmm0	# 0 = j
	movdqa	%xmm10,	%xmm3	# 3 : 1/i
	pxor	%xmm1,	%xmm0	# 0 = j
	pshufb  %xmm1, 	%xmm3	# 3 = 1/i
	pxor	%xmm2, 	%xmm3	# 3 = iak = 1/i + a/k
	movdqa	%xmm10,	%xmm4	# 4 : 1/j
	pxor	%xmm2, 	%xmm3	# 3 = iak = 1/i + a/k
	pshufb	%xmm0, 	%xmm4	# 4 = 1/j
	pxor	%xmm2, 	%xmm4	# 4 = jak = 1/j + a/k
	movdqa	%xmm10,	%xmm2	# 2 : 1/iak
	pshufb  %xmm3,	%xmm2	# 2 = 1/iak
	pxor	%xmm0, 	%xmm2	# 2 = io
	movdqa	%xmm10, %xmm3	# 3 : 1/jak
	pxor	%xmm0, 	%xmm2	# 2 = io
	pshufb  %xmm4,  %xmm3	# 3 = 1/jak
	pxor	%xmm1,  %xmm3	# 3 = jo
	movdqu	(%r9),	%xmm0
	pxor	%xmm1,  %xmm3	# 3 = jo
	jnz	.Ldec_loop

	# middle of last round
@@ -463,12 +462,12 @@ _vpaes_schedule_core:
.type	_vpaes_schedule_192_smear,\@abi-omnipotent
.align	16
_vpaes_schedule_192_smear:
	pshufd	\$0x80,	%xmm6,	%xmm0	# d c 0 0 -> c 0 0 0
	pxor	%xmm0,	%xmm6		# -> c+d c 0 0
	pshufd	\$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
	pshufd	\$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
	pxor	%xmm1,	%xmm6		# -> c+d c 0 0
	pxor	%xmm1,	%xmm1
	pxor	%xmm0,	%xmm6		# -> b+c+d b+c b a
	movdqa	%xmm6,	%xmm0
	pxor	%xmm1,	%xmm1
	movhlps	%xmm1,	%xmm6		# clobber low side with zeros
	ret
.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
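
For reference, the 192-bit smear this reordering touches is plain dword algebra; a hedged intrinsics sketch following the function's own comments (dwords written high-to-low as there, names mine):

```c
#include <emmintrin.h>  /* SSE2 */

/* _vpaes_schedule_192_smear in C: x6 = "d c 0 0", x7 = "b a _ _"
 * (high-to-low dwords, as in the assembly comments).  Returns the
 * smeared word with its low half cleared, as movhlps against a zeroed
 * register does; the full smear is what the code keeps in %xmm0. */
static __m128i schedule_192_smear(__m128i x6, __m128i x7)
{
    __m128i t;
    t  = _mm_shuffle_epi32(x6, 0x80);   /* d c 0 0 -> c 0 0 0   */
    x6 = _mm_xor_si128(x6, t);          /* -> c+d c 0 0         */
    t  = _mm_shuffle_epi32(x7, 0xFE);   /* b a _ _ -> b b b a   */
    x6 = _mm_xor_si128(x6, t);          /* -> b+c+d b+c b a     */
    /* clobber low side with zeros (movhlps %xmm1,%xmm6, %xmm1 = 0) */
    return _mm_unpackhi_epi64(_mm_setzero_si128(), x6);
}
```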