Commit 89f1eb82 authored by Andy Polyakov
Browse files

aes-586.pl: Atom-specific optimization, +44/29%, minor improvement on others.

vpaes-x86.pl: minor performance squeeze.
parent f717abd7
Loading
Loading
Loading
Loading
+135 −128
Original line number Diff line number Diff line
@@ -103,11 +103,12 @@
# byte for 128-bit key.
#
#		ECB encrypt	ECB decrypt	CBC large chunk
# P4		56[60]		84[100]		23
# AMD K8	48[44]		70[79]		18
# PIII		41[50]		61[91]		24
# Core 2	32[38]		45[70]		18.5
# Pentium	120		160		77
# P4		52[54]		83[95]		23
# AMD K8	46[41]		66[70]		18
# PIII		41[50]		60[77]		24
# Core 2	31[36]		45[64]		18.5
# Atom		76[100]		96[138]		60
# Pentium	115		150		77
#
# Version 4.1 switches to compact S-box even in key schedule setup.
#
@@ -476,24 +477,25 @@ sub enctransform()
  my $tmp = $tbl;
  my $r2  = $key ;

	&mov	($acc,$s[$i]);
	&and	($acc,0x80808080);
	&mov	($tmp,$acc);
	&shr	($tmp,7);
	&and	($tmp,$s[$i]);
	&lea	($r2,&DWP(0,$s[$i],$s[$i]));
	&sub	($acc,$tmp);
	&mov	($acc,$tmp);
	&shr	($tmp,7);
	&and	($r2,0xfefefefe);
	&and	($acc,0x1b1b1b1b);
	&sub	($acc,$tmp);
	&mov	($tmp,$s[$i]);
	&and	($acc,0x1b1b1b1b);
	&rotr	($tmp,16);
	&xor	($acc,$r2);	# r2
	&mov	($r2,$s[$i]);

	&xor	($s[$i],$acc);	# r0 ^ r2
	&rotr	($r2,16+8);
	&xor	($acc,$tmp);
	&rotl	($s[$i],24);
	&xor	($acc,$r2);
	&mov	($tmp,0x80808080)	if ($i!=1);
	&xor	($s[$i],$acc);	# ROTATE(r2^r0,24) ^ r2
	&rotr	($tmp,16);
	&xor	($s[$i],$tmp);
	&rotr	($tmp,8);
	&xor	($s[$i],$tmp);
}

&function_begin_B("_x86_AES_encrypt_compact");
@@ -526,6 +528,7 @@ sub enctransform()
		&enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
		&enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
		&enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
		&mov	($tbl,0x80808080);
		&enctransform(2);
		&enctransform(3);
		&enctransform(0);
@@ -607,82 +610,84 @@ sub sse_enccompact()
	&pshufw	("mm5","mm4",0x0d);		# 15,14,11,10
	&movd	("eax","mm1");			#  5, 4, 1, 0
	&movd	("ebx","mm5");			# 15,14,11,10
	&mov	($__key,$key);

	&movz	($acc,&LB("eax"));		#  0
	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  0
	&pshufw	("mm2","mm0",0x0d);		#  7, 6, 3, 2
	&movz	("edx",&HB("eax"));		#  1
	&pshufw	("mm2","mm0",0x0d);		#  7, 6, 3, 2
	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  0
	&movz	($key,&LB("ebx"));		# 10
	&movz	("edx",&BP(-128,$tbl,"edx",1));	#  1
	&shl	("edx",8);			#  1
	&shr	("eax",16);			#  5, 4
	&shl	("edx",8);			#  1

	&movz	($acc,&LB("ebx"));		# 10
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 10
	&movz	($acc,&BP(-128,$tbl,$key,1));	# 10
	&movz	($key,&HB("ebx"));		# 11
	&shl	($acc,16);			# 10
	&or	("ecx",$acc);			# 10
	&pshufw	("mm6","mm4",0x08);		# 13,12, 9, 8
	&movz	($acc,&HB("ebx"));		# 11
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 11
	&or	("ecx",$acc);			# 10
	&movz	($acc,&BP(-128,$tbl,$key,1));	# 11
	&movz	($key,&HB("eax"));		#  5
	&shl	($acc,24);			# 11
	&or	("edx",$acc);			# 11
	&shr	("ebx",16);			# 15,14
	&or	("edx",$acc);			# 11

	&movz	($acc,&HB("eax"));		#  5
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  5
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  5
	&movz	($key,&HB("ebx"));		# 15
	&shl	($acc,8);			#  5
	&or	("ecx",$acc);			#  5
	&movz	($acc,&HB("ebx"));		# 15
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 15
	&movz	($acc,&BP(-128,$tbl,$key,1));	# 15
	&movz	($key,&LB("eax"));		#  4
	&shl	($acc,24);			# 15
	&or	("ecx",$acc);			# 15
	&movd	("mm0","ecx");			# t[0] collected

	&movz	($acc,&LB("eax"));		#  4
	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  4
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  4
	&movz	($key,&LB("ebx"));		# 14
	&movd	("eax","mm2");			#  7, 6, 3, 2
	&movz	($acc,&LB("ebx"));		# 14
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 14
	&shl	($acc,16);			# 14
	&movd	("mm0","ecx");			# t[0] collected
	&movz	("ecx",&BP(-128,$tbl,$key,1));	# 14
	&movz	($key,&HB("eax"));		#  3
	&shl	("ecx",16);			# 14
	&movd	("ebx","mm6");			# 13,12, 9, 8
	&or	("ecx",$acc);			# 14

	&movd	("ebx","mm6");			# 13,12, 9, 8
	&movz	($acc,&HB("eax"));		#  3
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  3
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  3
	&movz	($key,&HB("ebx"));		#  9
	&shl	($acc,24);			#  3
	&or	("ecx",$acc);			#  3
	&movz	($acc,&HB("ebx"));		#  9
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  9
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  9
	&movz	($key,&LB("ebx"));		#  8
	&shl	($acc,8);			#  9
	&shr	("ebx",16);			# 13,12
	&or	("ecx",$acc);			#  9
	&movd	("mm1","ecx");			# t[1] collected

	&movz	($acc,&LB("ebx"));		#  8
	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  8
	&shr	("ebx",16);			# 13,12
	&movz	($acc,&LB("eax"));		#  2
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  2
	&shl	($acc,16);			#  2
	&or	("ecx",$acc);			#  2
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  8
	&movz	($key,&LB("eax"));		#  2
	&shr	("eax",16);			#  7, 6
	&movd	("mm1","ecx");			# t[1] collected
	&movz	("ecx",&BP(-128,$tbl,$key,1));	#  2
	&movz	($key,&HB("eax"));		#  7
	&shl	("ecx",16);			#  2
	&and	("eax",0xff);			#  6
	&or	("ecx",$acc);			#  2

	&punpckldq	("mm0","mm1");		# t[0,1] collected

	&movz	($acc,&HB("eax"));		#  7
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  7
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  7
	&movz	($key,&HB("ebx"));		# 13
	&shl	($acc,24);			#  7
	&or	("ecx",$acc);			#  7
	&and	("eax",0xff);			#  6
	&and	("ebx",0xff);			# 12
	&movz	("eax",&BP(-128,$tbl,"eax",1));	#  6
	&or	("ecx",$acc);			#  7
	&shl	("eax",16);			#  6
	&movz	($acc,&BP(-128,$tbl,$key,1));	# 13
	&or	("edx","eax");			#  6
	&movz	($acc,&HB("ebx"));		# 13
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 13
	&shl	($acc,8);			# 13
	&or	("ecx",$acc);			# 13
	&movd	("mm4","ecx");			# t[2] collected
	&and	("ebx",0xff);			# 12
	&movz	("ebx",&BP(-128,$tbl,"ebx",1));	# 12
	&or	("ecx",$acc);			# 13
	&or	("edx","ebx");			# 12
	&mov	($key,$__key);
	&movd	("mm4","ecx");			# t[2] collected
	&movd	("mm5","edx");			# t[3] collected

	&punpckldq	("mm4","mm5");		# t[2,3] collected
@@ -1270,30 +1275,30 @@ sub dectransform()
  my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1);
  my $tp8 = $tbl;

	&mov	($acc,$s[$i]);
	&and	($acc,0x80808080);
	&mov	($tmp,$acc);
	&mov	($tmp,0x80808080);
	&and	($tmp,$s[$i]);
	&mov	($acc,$tmp);
	&shr	($tmp,7);
	&lea	($tp2,&DWP(0,$s[$i],$s[$i]));
	&sub	($acc,$tmp);
	&and	($tp2,0xfefefefe);
	&and	($acc,0x1b1b1b1b);
	&xor	($acc,$tp2);
	&mov	($tp2,$acc);
	&xor	($tp2,$acc);
	&mov	($tmp,0x80808080);

	&and	($acc,0x80808080);
	&mov	($tmp,$acc);
	&and	($tmp,$tp2);
	&mov	($acc,$tmp);
	&shr	($tmp,7);
	&lea	($tp4,&DWP(0,$tp2,$tp2));
	&sub	($acc,$tmp);
	&and	($tp4,0xfefefefe);
	&and	($acc,0x1b1b1b1b);
	 &xor	($tp2,$s[$i]);	# tp2^tp1
	&xor	($acc,$tp4);
	&mov	($tp4,$acc);
	&xor	($tp4,$acc);
	&mov	($tmp,0x80808080);

	&and	($acc,0x80808080);
	&mov	($tmp,$acc);
	&and	($tmp,$tp4);
	&mov	($acc,$tmp);
	&shr	($tmp,7);
	&lea	($tp8,&DWP(0,$tp4,$tp4));
	&sub	($acc,$tmp);
@@ -1305,13 +1310,13 @@ sub dectransform()

	&xor	($s[$i],$tp2);
	&xor	($tp2,$tp8);
	&rotl	($tp2,24);
	&xor	($s[$i],$tp4);
	&xor	($tp4,$tp8);
	&rotl	($tp4,16);
	&rotl	($tp2,24);
	&xor	($s[$i],$tp8);	# ^= tp8^(tp4^tp1)^(tp2^tp1)
	&rotl	($tp8,8);
	&rotl	($tp4,16);
	&xor	($s[$i],$tp2);	# ^= ROTATE(tp8^tp2^tp1,24)
	&rotl	($tp8,8);
	&xor	($s[$i],$tp4);	# ^= ROTATE(tp8^tp4^tp1,16)
	 &mov	($s[0],$__s0)			if($i==2); #prefetch $s0
	 &mov	($s[1],$__s1)			if($i==3); #prefetch $s1
@@ -1389,85 +1394,87 @@ sub dectransform()
sub sse_deccompact()
{
	&pshufw	("mm1","mm0",0x0c);		#  7, 6, 1, 0
	&pshufw	("mm5","mm4",0x09);		# 13,12,11,10
	&movd	("eax","mm1");			#  7, 6, 1, 0
	&movd	("ebx","mm5");			# 13,12,11,10
	&mov	($__key,$key);

	&pshufw	("mm5","mm4",0x09);		# 13,12,11,10
	&movz	($acc,&LB("eax"));		#  0
	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  0
	&movd	("ebx","mm5");			# 13,12,11,10
	&movz	("edx",&HB("eax"));		#  1
	&pshufw	("mm2","mm0",0x06);		#  3, 2, 5, 4
	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  0
	&movz	($key,&LB("ebx"));		# 10
	&movz	("edx",&BP(-128,$tbl,"edx",1));	#  1
	&shr	("eax",16);			#  7, 6
	&shl	("edx",8);			#  1

	&pshufw	("mm2","mm0",0x06);		#  3, 2, 5, 4
	&movz	($acc,&LB("ebx"));		# 10
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 10
	&movz	($acc,&BP(-128,$tbl,$key,1));	# 10
	&movz	($key,&HB("ebx"));		# 11
	&shl	($acc,16);			# 10
	&pshufw	("mm6","mm4",0x03);		# 9, 8,15,14
	&or	("ecx",$acc);			# 10
	&shr	("eax",16);			#  7, 6
	&movz	($acc,&HB("ebx"));		# 11
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 11
	&movz	($acc,&BP(-128,$tbl,$key,1));	# 11
	&movz	($key,&HB("eax"));		#  7
	&shl	($acc,24);			# 11
	&or	("edx",$acc);			# 11
	&shr	("ebx",16);			# 13,12
	&or	("edx",$acc);			# 11

	&pshufw	("mm6","mm4",0x03);		# 9, 8,15,14
	&movz	($acc,&HB("eax"));		#  7
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  7
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  7
	&movz	($key,&HB("ebx"));		# 13
	&shl	($acc,24);			#  7
	&or	("ecx",$acc);			#  7
	&movz	($acc,&HB("ebx"));		# 13
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 13
	&movz	($acc,&BP(-128,$tbl,$key,1));	# 13
	&movz	($key,&LB("eax"));		#  6
	&shl	($acc,8);			# 13
	&movd	("eax","mm2");			#  3, 2, 5, 4
	&or	("ecx",$acc);			# 13
	&movd	("mm0","ecx");			# t[0] collected

	&movz	($acc,&LB("eax"));		#  6
	&movd	("eax","mm2");			#  3, 2, 5, 4
	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  6
	&shl	("ecx",16);			#  6
	&movz	($acc,&LB("ebx"));		# 12
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  6
	&movz	($key,&LB("ebx"));		# 12
	&shl	($acc,16);			#  6
	&movd	("ebx","mm6");			#  9, 8,15,14
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 12
	&movd	("mm0","ecx");			# t[0] collected
	&movz	("ecx",&BP(-128,$tbl,$key,1));	# 12
	&movz	($key,&LB("eax"));		#  4
	&or	("ecx",$acc);			# 12

	&movz	($acc,&LB("eax"));		#  4
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  4
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  4
	&movz	($key,&LB("ebx"));		# 14
	&or	("edx",$acc);			#  4
	&movz	($acc,&LB("ebx"));		# 14
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 14
	&movz	($acc,&BP(-128,$tbl,$key,1));	# 14
	&movz	($key,&HB("eax"));		#  5
	&shl	($acc,16);			# 14
	&shr	("eax",16);			#  3, 2
	&or	("edx",$acc);			# 14
	&movd	("mm1","edx");			# t[1] collected

	&movz	($acc,&HB("eax"));		#  5
	&movz	("edx",&BP(-128,$tbl,$acc,1));	#  5
	&shl	("edx",8);			#  5
	&movz	($acc,&HB("ebx"));		# 15
	&shr	("eax",16);			#  3, 2
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 15
	&shl	($acc,24);			# 15
	&or	("edx",$acc);			# 15
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  5
	&movz	($key,&HB("ebx"));		# 15
	&shr	("ebx",16);			#  9, 8
	&shl	($acc,8);			#  5
	&movd	("mm1","edx");			# t[1] collected
	&movz	("edx",&BP(-128,$tbl,$key,1));	# 15
	&movz	($key,&HB("ebx"));		#  9
	&shl	("edx",24);			# 15
	&and	("ebx",0xff);			#  8
	&or	("edx",$acc);			# 15

	&punpckldq	("mm0","mm1");		# t[0,1] collected

	&movz	($acc,&HB("ebx"));		#  9
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  9
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  9
	&movz	($key,&LB("eax"));		#  2
	&shl	($acc,8);			#  9
	&or	("ecx",$acc);			#  9
	&and	("ebx",0xff);			#  8
	&movz	("eax",&HB("eax"));		#  3
	&movz	("ebx",&BP(-128,$tbl,"ebx",1));	#  8
	&or	("ecx",$acc);			#  9
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  2
	&or	("edx","ebx");			#  8
	&movz	($acc,&LB("eax"));		#  2
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  2
	&shl	($acc,16);			#  2
	&or	("edx",$acc);			#  2
	&movd	("mm4","edx");			# t[2] collected
	&movz	("eax",&HB("eax"));		#  3
	&movz	("eax",&BP(-128,$tbl,"eax",1));	#  3
	&or	("edx",$acc);			#  2
	&shl	("eax",24);			#  3
	&or	("ecx","eax");			#  3
	&mov	($key,$__key);
	&movd	("mm4","edx");			# t[2] collected
	&movd	("mm5","ecx");			# t[3] collected

	&punpckldq	("mm4","mm5");		# t[2,3] collected
@@ -2865,32 +2872,32 @@ sub deckey()
{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
  my $tmp = $tbl;

	&mov	($acc,$tp1);
	&and	($acc,0x80808080);
	&mov	($tmp,$acc);
	&shr	($tmp,7);
	&mov	($tmp,0x80808080);
	&and	($tmp,$tp1);
	&lea	($tp2,&DWP(0,$tp1,$tp1));
	&mov	($acc,$tmp);
	&shr	($tmp,7);
	&sub	($acc,$tmp);
	&and	($tp2,0xfefefefe);
	&and	($acc,0x1b1b1b1b);
	&xor	($acc,$tp2);
	&mov	($tp2,$acc);
	&xor	($tp2,$acc);
	&mov	($tmp,0x80808080);

	&and	($acc,0x80808080);
	&mov	($tmp,$acc);
	&shr	($tmp,7);
	&and	($tmp,$tp2);
	&lea	($tp4,&DWP(0,$tp2,$tp2));
	&mov	($acc,$tmp);
	&shr	($tmp,7);
	&sub	($acc,$tmp);
	&and	($tp4,0xfefefefe);
	&and	($acc,0x1b1b1b1b);
	 &xor	($tp2,$tp1);	# tp2^tp1
	&xor	($acc,$tp4);
	&mov	($tp4,$acc);
	&xor	($tp4,$acc);
	&mov	($tmp,0x80808080);

	&and	($acc,0x80808080);
	&mov	($tmp,$acc);
	&shr	($tmp,7);
	&and	($tmp,$tp4);
	&lea	($tp8,&DWP(0,$tp4,$tp4));
	&mov	($acc,$tmp);
	&shr	($tmp,7);
	 &xor	($tp4,$tp1);	# tp4^tp1
	&sub	($acc,$tmp);
	&and	($tp8,0xfefefefe);
+45 −46
Original line number Diff line number Diff line
@@ -27,9 +27,9 @@
#
#		aes-586.pl		vpaes-x86.pl
#
# Core 2(**)	29.1/42.3/18.3		22.0/25.6(***)
# Nehalem	27.9/40.4/18.1		10.3/12.0
# Atom		102./119./60.1		64.5/85.3(***)
# Core 2(**)	28.1/41.4/18.3		21.9/25.2(***)
# Nehalem	27.9/40.4/18.1		10.2/11.9
# Atom		70.7/92.1/60.1		61.1/81.0(***)
#
# (*)	"Hyper-threading" in the context refers rather to cache shared
#	among multiple cores, than to specifically Intel HTT. As vast
@@ -40,8 +40,8 @@
# (**)	"Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***)	Less impressive improvement on Core 2 and Atom is due to slow
#	pshufb,	yet it's respectable +32%/65%  improvement on Core 2
#	and +58%/40% on Atom (as implied, over "hyper-threading-safe"
#	pshufb,	yet it's respectable +28%/64%  improvement on Core 2
#	and +15% on Atom (as implied, over "hyper-threading-safe"
#	code path).
#
#						<appro@openssl.org>
@@ -183,35 +183,35 @@ $k_dsbo=0x2c0; # decryption sbox final output
	&movdqa	("xmm1","xmm6")
	&movdqa	("xmm2",&QWP($k_ipt,$const));
	&pandn	("xmm1","xmm0");
	&movdqu	("xmm5",&QWP(0,$key));
	&psrld	("xmm1",4);
	&pand	("xmm0","xmm6");
	&movdqu	("xmm5",&QWP(0,$key));
	&pshufb	("xmm2","xmm0");
	&movdqa	("xmm0",&QWP($k_ipt+16,$const));
	&pshufb	("xmm0","xmm1");
	&pxor	("xmm2","xmm5");
	&pxor	("xmm0","xmm2");
	&psrld	("xmm1",4);
	&add	($key,16);
	&pshufb	("xmm0","xmm1");
	&lea	($base,&DWP($k_mc_backward,$const));
	&pxor	("xmm0","xmm2");
	&jmp	(&label("enc_entry"));


&set_label("enc_loop",16);
	# middle of middle round
	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sb1u
	&pshufb	("xmm4","xmm2");		# 4 = sb1u
	&pxor	("xmm4","xmm5");		# 4 = sb1u + k
	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
	&pshufb	("xmm4","xmm2");		# 4 = sb1u
	&pshufb	("xmm0","xmm3");		# 0 = sb1t
	&pxor	("xmm0","xmm4");		# 0 = A
	&pxor	("xmm4","xmm5");		# 4 = sb1u + k
	&movdqa	("xmm5",&QWP($k_sb2,$const));	# 4 : sb2u
	&pshufb	("xmm5","xmm2");		# 4 = sb2u
	&pxor	("xmm0","xmm4");		# 0 = A
	&movdqa	("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
	&pshufb	("xmm5","xmm2");		# 4 = sb2u
	&movdqa	("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
	&pshufb	("xmm2","xmm3");		# 2 = sb2t
	&pxor	("xmm2","xmm5");		# 2 = 2A
	&movdqa	("xmm4",&QWP(0,$base,$magic));	# .Lk_mc_backward[]
	&pshufb	("xmm2","xmm3");		# 2 = sb2t
	&movdqa	("xmm3","xmm0");		# 3 = A
	&pxor	("xmm2","xmm5");		# 2 = 2A
	&pshufb	("xmm0","xmm1");		# 0 = B
	&add	($key,16);			# next key
	&pxor	("xmm0","xmm2");		# 0 = 2A+B
@@ -220,30 +220,30 @@ $k_dsbo=0x2c0; # decryption sbox final output
	&pxor	("xmm3","xmm0");		# 3 = 2A+B+D
	&pshufb	("xmm0","xmm1");		# 0 = 2B+C
	&and	($magic,0x30);			# ... mod 4
	&pxor	("xmm0","xmm3");		# 0 = 2A+3B+C+D
	&sub	($round,1);			# nr--
	&pxor	("xmm0","xmm3");		# 0 = 2A+3B+C+D

&set_label("enc_entry");
	# top of round
	&movdqa	("xmm1","xmm6");		# 1 : i
	&movdqa	("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
	&pandn	("xmm1","xmm0");		# 1 = i<<4
	&psrld	("xmm1",4);			# 1 = i
	&pand	("xmm0","xmm6");		# 0 = k
	&movdqa	("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
	&pshufb	("xmm5","xmm0");		# 2 = a/k
	&pxor	("xmm0","xmm1");		# 0 = j
	&movdqa	("xmm3","xmm7");		# 3 : 1/i
	&pxor	("xmm0","xmm1");		# 0 = j
	&pshufb	("xmm3","xmm1");		# 3 = 1/i
	&pxor	("xmm3","xmm5");		# 3 = iak = 1/i + a/k
	&movdqa	("xmm4","xmm7");		# 4 : 1/j
	&pxor	("xmm3","xmm5");		# 3 = iak = 1/i + a/k
	&pshufb	("xmm4","xmm0");		# 4 = 1/j
	&pxor	("xmm4","xmm5");		# 4 = jak = 1/j + a/k
	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
	&pxor	("xmm4","xmm5");		# 4 = jak = 1/j + a/k
	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
	&pxor	("xmm2","xmm0");		# 2 = io
	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
	&movdqu	("xmm5",&QWP(0,$key));
	&pxor	("xmm2","xmm0");		# 2 = io
	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
	&movdqu	("xmm5",&QWP(0,$key));
	&pxor	("xmm3","xmm1");		# 3 = jo
	&jnz	(&label("enc_loop"));

@@ -265,8 +265,8 @@ $k_dsbo=0x2c0; # decryption sbox final output
##  Same API as encryption core.
##
&function_begin_B("_vpaes_decrypt_core");
	&mov	($round,&DWP(240,$key));
	&lea	($base,&DWP($k_dsbd,$const));
	&mov	($round,&DWP(240,$key));
	&movdqa	("xmm1","xmm6");
	&movdqa	("xmm2",&QWP($k_dipt-$k_dsbd,$base));
	&pandn	("xmm1","xmm0");
@@ -292,62 +292,61 @@ $k_dsbo=0x2c0; # decryption sbox final output
##  Inverse mix columns
##
	&movdqa	("xmm4",&QWP(-0x20,$base));	# 4 : sb9u
	&movdqa	("xmm1",&QWP(-0x10,$base));	# 0 : sb9t
	&pshufb	("xmm4","xmm2");		# 4 = sb9u
	&pshufb	("xmm1","xmm3");		# 0 = sb9t
	&pxor	("xmm4","xmm0");
	&movdqa	("xmm0",&QWP(-0x10,$base));	# 0 : sb9t
	&pshufb	("xmm0","xmm3");		# 0 = sb9t
	&pxor	("xmm0","xmm4");		# 0 = ch
	&add	($key,16);			# next round key
	&pxor	("xmm1","xmm4");		# 0 = ch

	&pshufb	("xmm0","xmm5");		# MC ch
	&movdqa	("xmm4",&QWP(0,$base));		# 4 : sbdu
	&pshufb	("xmm1","xmm5");		# MC ch
	&pshufb	("xmm4","xmm2");		# 4 = sbdu
	&pxor	("xmm4","xmm0");		# 4 = ch
	&movdqa	("xmm0",&QWP(0x10,$base));	# 0 : sbdt
	&pxor	("xmm4","xmm1");		# 4 = ch
	&pshufb	("xmm0","xmm3");		# 0 = sbdt
	&pxor	("xmm0","xmm4");		# 0 = ch
	&sub	($round,1);			# nr--
	&pxor	("xmm0","xmm4");		# 0 = ch

	&pshufb	("xmm0","xmm5");		# MC ch
	&movdqa	("xmm4",&QWP(0x20,$base));	# 4 : sbbu
	&pshufb	("xmm0","xmm5");		# MC ch
	&movdqa	("xmm1",&QWP(0x30,$base));	# 0 : sbbt
	&pshufb	("xmm4","xmm2");		# 4 = sbbu
	&pshufb	("xmm1","xmm3");		# 0 = sbbt
	&pxor	("xmm4","xmm0");		# 4 = ch
	&movdqa	("xmm0",&QWP(0x30,$base));	# 0 : sbbt
	&pshufb	("xmm0","xmm3");		# 0 = sbbt
	&pxor	("xmm0","xmm4");		# 0 = ch
	&pxor	("xmm1","xmm4");		# 0 = ch

	&pshufb	("xmm0","xmm5");		# MC ch
	&movdqa	("xmm4",&QWP(0x40,$base));	# 4 : sbeu
	&pshufb	("xmm4","xmm2");		# 4 = sbeu
	&pxor	("xmm4","xmm0");		# 4 = ch
	&pshufb	("xmm1","xmm5");		# MC ch
	&movdqa	("xmm0",&QWP(0x50,$base));	# 0 : sbet
	&pshufb	("xmm4","xmm2");		# 4 = sbeu
	&pshufb	("xmm0","xmm3");		# 0 = sbet
	&pxor	("xmm0","xmm4");		# 0 = ch

	&palignr("xmm5","xmm5",12);
	&pxor	("xmm4","xmm1");		# 4 = ch
	&pxor	("xmm0","xmm4");		# 0 = ch

&set_label("dec_entry");
	# top of round
	&movdqa	("xmm1","xmm6");		# 1 : i
	&pandn	("xmm1","xmm0");		# 1 = i<<4
	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
	&psrld	("xmm1",4);			# 1 = i
	&pand	("xmm0","xmm6");		# 0 = k
	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
	&pshufb	("xmm2","xmm0");		# 2 = a/k
	&pxor	("xmm0","xmm1");		# 0 = j
	&movdqa	("xmm3","xmm7");		# 3 : 1/i
	&pxor	("xmm0","xmm1");		# 0 = j
	&pshufb	("xmm3","xmm1");		# 3 = 1/i
	&pxor	("xmm3","xmm2");		# 3 = iak = 1/i + a/k
	&movdqa	("xmm4","xmm7");		# 4 : 1/j
	&pxor	("xmm3","xmm2");		# 3 = iak = 1/i + a/k
	&pshufb	("xmm4","xmm0");		# 4 = 1/j
	&pxor	("xmm4","xmm2");		# 4 = jak = 1/j + a/k
	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
	&pxor	("xmm2","xmm0");		# 2 = io
	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
	&pxor	("xmm2","xmm0");		# 2 = io
	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
	&pxor	("xmm3","xmm1");		# 3 = jo
	&movdqu	("xmm0",&QWP(0,$key));
	&pxor	("xmm3","xmm1");		# 3 = jo
	&jnz	(&label("dec_loop"));

	# middle of last round
@@ -542,12 +541,12 @@ $k_dsbo=0x2c0; # decryption sbox final output
##    %xmm0: b+c+d  b+c  b  a
##
&function_begin_B("_vpaes_schedule_192_smear");
	&pshufd	("xmm0","xmm6",0x80);		# d c 0 0 -> c 0 0 0
	&pxor	("xmm6","xmm0");		# -> c+d c 0 0
	&pshufd	("xmm1","xmm6",0x80);		# d c 0 0 -> c 0 0 0
	&pshufd	("xmm0","xmm7",0xFE);		# b a _ _ -> b b b a
	&pxor	("xmm6","xmm1");		# -> c+d c 0 0
	&pxor	("xmm1","xmm1");
	&pxor	("xmm6","xmm0");		# -> b+c+d b+c b a
	&movdqa	("xmm0","xmm6");
	&pxor	("xmm1","xmm1");
	&movhlps("xmm6","xmm1");		# clobber low side with zeros
	&ret	();
&function_end_B("_vpaes_schedule_192_smear");