Commit d90bf2ab authored by Andy Polyakov
Browse files

[vp]aes-x86[_64].pl: update from HEAD.

parent 02620cfc
Loading
Loading
Loading
Loading
+145 −138
Original line number Diff line number Diff line
@@ -39,7 +39,7 @@
# but exhibits up to 10% improvement on other cores.
#
# Second version is "monolithic" replacement for aes_core.c, which in
# addition to AES_[de|en]crypt implements private_AES_set_[de|en]cryption_key.
# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
# This made it possible to implement little-endian variant of the
# algorithm without modifying the base C code. Motivating factor for
# the undertaken effort was that it appeared that in tight IA-32
@@ -103,11 +103,12 @@
# byte for 128-bit key.
#
#		ECB encrypt	ECB decrypt	CBC large chunk
# P4		56[60]		84[100]		23
# AMD K8	48[44]		70[79]		18
# PIII		41[50]		61[91]		24
# Core 2	32[38]		45[70]		18.5
# Pentium	120		160		77
# P4		52[54]		83[95]		23
# AMD K8	46[41]		66[70]		18
# PIII		41[50]		60[77]		24
# Core 2	31[36]		45[64]		18.5
# Atom		76[100]		96[138]		60
# Pentium	115		150		77
#
# Version 4.1 switches to compact S-box even in key schedule setup.
#
@@ -242,7 +243,7 @@ $vertical_spin=0; # shift "vertically" defaults to 0, because of

sub encvert()
{ my ($te,@s) = @_;
  my $v0 = $acc, $v1 = $key;
  my ($v0,$v1) = ($acc,$key);

	&mov	($v0,$s[3]);				# copy s3
	&mov	(&DWP(4,"esp"),$s[2]);			# save s2
@@ -299,7 +300,7 @@ sub encvert()
# Another experimental routine, which features "horizontal spin," but
# eliminates one reference to stack. Strangely enough runs slower...
sub enchoriz()
{ my $v0 = $key, $v1 = $acc;
{ my ($v0,$v1) = ($key,$acc);

	&movz	($v0,&LB($s0));			#  3, 2, 1, 0*
	&rotr	($s2,8);			#  8,11,10, 9
@@ -427,7 +428,7 @@ sub sse_encbody()
######################################################################

sub enccompact()
{ my $Fn = mov;
{ my $Fn = \&mov;
  while ($#_>5) { pop(@_); $Fn=sub{}; }
  my ($i,$te,@s)=@_;
  my $tmp = $key;
@@ -476,24 +477,25 @@ sub enctransform()
  my $tmp = $tbl;
  my $r2  = $key ;

	&mov	($acc,$s[$i]);
	&and	($acc,0x80808080);
	&mov	($tmp,$acc);
	&shr	($tmp,7);
	&and	($tmp,$s[$i]);
	&lea	($r2,&DWP(0,$s[$i],$s[$i]));
	&sub	($acc,$tmp);
	&mov	($acc,$tmp);
	&shr	($tmp,7);
	&and	($r2,0xfefefefe);
	&and	($acc,0x1b1b1b1b);
	&sub	($acc,$tmp);
	&mov	($tmp,$s[$i]);
	&and	($acc,0x1b1b1b1b);
	&rotr	($tmp,16);
	&xor	($acc,$r2);	# r2
	&mov	($r2,$s[$i]);

	&xor	($s[$i],$acc);	# r0 ^ r2
	&rotr	($r2,16+8);
	&xor	($acc,$tmp);
	&rotl	($s[$i],24);
	&xor	($s[$i],$acc)	# ROTATE(r2^r0,24) ^ r2
	&rotr	($tmp,16);
	&xor	($s[$i],$tmp);
	&rotr	($tmp,8);
	&xor	($s[$i],$tmp);
	&xor	($acc,$r2);
	&mov	($tmp,0x80808080)	if ($i!=1);
	&xor	($s[$i],$acc);	# ROTATE(r2^r0,24) ^ r2
}

&function_begin_B("_x86_AES_encrypt_compact");
@@ -526,6 +528,7 @@ sub enctransform()
		&enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
		&enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
		&enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
		&mov	($tbl,0x80808080);
		&enctransform(2);
		&enctransform(3);
		&enctransform(0);
@@ -607,82 +610,84 @@ sub sse_enccompact()
	&pshufw	("mm5","mm4",0x0d);		# 15,14,11,10
	&movd	("eax","mm1");			#  5, 4, 1, 0
	&movd	("ebx","mm5");			# 15,14,11,10
	&mov	($__key,$key);

	&movz	($acc,&LB("eax"));		#  0
	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  0
	&pshufw	("mm2","mm0",0x0d);		#  7, 6, 3, 2
	&movz	("edx",&HB("eax"));		#  1
	&pshufw	("mm2","mm0",0x0d);		#  7, 6, 3, 2
	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  0
	&movz	($key,&LB("ebx"));		# 10
	&movz	("edx",&BP(-128,$tbl,"edx",1));	#  1
	&shl	("edx",8);			#  1
	&shr	("eax",16);			#  5, 4
	&shl	("edx",8);			#  1

	&movz	($acc,&LB("ebx"));		# 10
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 10
	&movz	($acc,&BP(-128,$tbl,$key,1));	# 10
	&movz	($key,&HB("ebx"));		# 11
	&shl	($acc,16);			# 10
	&or	("ecx",$acc);			# 10
	&pshufw	("mm6","mm4",0x08);		# 13,12, 9, 8
	&movz	($acc,&HB("ebx"));		# 11
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 11
	&or	("ecx",$acc);			# 10
	&movz	($acc,&BP(-128,$tbl,$key,1));	# 11
	&movz	($key,&HB("eax"));		#  5
	&shl	($acc,24);			# 11
	&or	("edx",$acc);			# 11
	&shr	("ebx",16);			# 15,14
	&or	("edx",$acc);			# 11

	&movz	($acc,&HB("eax"));		#  5
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  5
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  5
	&movz	($key,&HB("ebx"));		# 15
	&shl	($acc,8);			#  5
	&or	("ecx",$acc);			#  5
	&movz	($acc,&HB("ebx"));		# 15
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 15
	&movz	($acc,&BP(-128,$tbl,$key,1));	# 15
	&movz	($key,&LB("eax"));		#  4
	&shl	($acc,24);			# 15
	&or	("ecx",$acc);			# 15
	&movd	("mm0","ecx");			# t[0] collected

	&movz	($acc,&LB("eax"));		#  4
	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  4
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  4
	&movz	($key,&LB("ebx"));		# 14
	&movd	("eax","mm2");			#  7, 6, 3, 2
	&movz	($acc,&LB("ebx"));		# 14
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 14
	&shl	($acc,16);			# 14
	&movd	("mm0","ecx");			# t[0] collected
	&movz	("ecx",&BP(-128,$tbl,$key,1));	# 14
	&movz	($key,&HB("eax"));		#  3
	&shl	("ecx",16);			# 14
	&movd	("ebx","mm6");			# 13,12, 9, 8
	&or	("ecx",$acc);			# 14

	&movd	("ebx","mm6");			# 13,12, 9, 8
	&movz	($acc,&HB("eax"));		#  3
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  3
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  3
	&movz	($key,&HB("ebx"));		#  9
	&shl	($acc,24);			#  3
	&or	("ecx",$acc);			#  3
	&movz	($acc,&HB("ebx"));		#  9
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  9
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  9
	&movz	($key,&LB("ebx"));		#  8
	&shl	($acc,8);			#  9
	&shr	("ebx",16);			# 13,12
	&or	("ecx",$acc);			#  9
	&movd	("mm1","ecx");			# t[1] collected

	&movz	($acc,&LB("ebx"));		#  8
	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  8
	&shr	("ebx",16);			# 13,12
	&movz	($acc,&LB("eax"));		#  2
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  2
	&shl	($acc,16);			#  2
	&or	("ecx",$acc);			#  2
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  8
	&movz	($key,&LB("eax"));		#  2
	&shr	("eax",16);			#  7, 6
	&movd	("mm1","ecx");			# t[1] collected
	&movz	("ecx",&BP(-128,$tbl,$key,1));	#  2
	&movz	($key,&HB("eax"));		#  7
	&shl	("ecx",16);			#  2
	&and	("eax",0xff);			#  6
	&or	("ecx",$acc);			#  2

	&punpckldq	("mm0","mm1");		# t[0,1] collected

	&movz	($acc,&HB("eax"));		#  7
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  7
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  7
	&movz	($key,&HB("ebx"));		# 13
	&shl	($acc,24);			#  7
	&or	("ecx",$acc);			#  7
	&and	("eax",0xff);			#  6
	&and	("ebx",0xff);			# 12
	&movz	("eax",&BP(-128,$tbl,"eax",1));	#  6
	&or	("ecx",$acc);			#  7
	&shl	("eax",16);			#  6
	&movz	($acc,&BP(-128,$tbl,$key,1));	# 13
	&or	("edx","eax");			#  6
	&movz	($acc,&HB("ebx"));		# 13
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 13
	&shl	($acc,8);			# 13
	&or	("ecx",$acc);			# 13
	&movd	("mm4","ecx");			# t[2] collected
	&and	("ebx",0xff);			# 12
	&movz	("ebx",&BP(-128,$tbl,"ebx",1));	# 12
	&or	("ecx",$acc);			# 13
	&or	("edx","ebx");			# 12
	&mov	($key,$__key);
	&movd	("mm4","ecx");			# t[2] collected
	&movd	("mm5","edx");			# t[3] collected

	&punpckldq	("mm4","mm5");		# t[2,3] collected
@@ -1222,7 +1227,7 @@ sub enclast()
######################################################################

sub deccompact()
{ my $Fn = mov;
{ my $Fn = \&mov;
  while ($#_>5) { pop(@_); $Fn=sub{}; }
  my ($i,$td,@s)=@_;
  my $tmp = $key;
@@ -1270,30 +1275,30 @@ sub dectransform()
  my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1);
  my $tp8 = $tbl;

	&mov	($acc,$s[$i]);
	&and	($acc,0x80808080);
	&mov	($tmp,$acc);
	&mov	($tmp,0x80808080);
	&and	($tmp,$s[$i]);
	&mov	($acc,$tmp);
	&shr	($tmp,7);
	&lea	($tp2,&DWP(0,$s[$i],$s[$i]));
	&sub	($acc,$tmp);
	&and	($tp2,0xfefefefe);
	&and	($acc,0x1b1b1b1b);
	&xor	($acc,$tp2);
	&mov	($tp2,$acc);
	&xor	($tp2,$acc);
	&mov	($tmp,0x80808080);

	&and	($acc,0x80808080);
	&mov	($tmp,$acc);
	&and	($tmp,$tp2);
	&mov	($acc,$tmp);
	&shr	($tmp,7);
	&lea	($tp4,&DWP(0,$tp2,$tp2));
	&sub	($acc,$tmp);
	&and	($tp4,0xfefefefe);
	&and	($acc,0x1b1b1b1b);
	 &xor	($tp2,$s[$i]);	# tp2^tp1
	&xor	($acc,$tp4);
	&mov	($tp4,$acc);
	&xor	($tp4,$acc);
	&mov	($tmp,0x80808080);

	&and	($acc,0x80808080);
	&mov	($tmp,$acc);
	&and	($tmp,$tp4);
	&mov	($acc,$tmp);
	&shr	($tmp,7);
	&lea	($tp8,&DWP(0,$tp4,$tp4));
	&sub	($acc,$tmp);
@@ -1305,13 +1310,13 @@ sub dectransform()

	&xor	($s[$i],$tp2);
	&xor	($tp2,$tp8);
	&rotl	($tp2,24);
	&xor	($s[$i],$tp4);
	&xor	($tp4,$tp8);
	&rotl	($tp4,16);
	&rotl	($tp2,24);
	&xor	($s[$i],$tp8);	# ^= tp8^(tp4^tp1)^(tp2^tp1)
	&rotl	($tp8,8);
	&rotl	($tp4,16);
	&xor	($s[$i],$tp2);	# ^= ROTATE(tp8^tp2^tp1,24)
	&rotl	($tp8,8);
	&xor	($s[$i],$tp4);	# ^= ROTATE(tp8^tp4^tp1,16)
	 &mov	($s[0],$__s0)			if($i==2); #prefetch $s0
	 &mov	($s[1],$__s1)			if($i==3); #prefetch $s1
@@ -1389,85 +1394,87 @@ sub dectransform()
sub sse_deccompact()
{
	&pshufw	("mm1","mm0",0x0c);		#  7, 6, 1, 0
	&pshufw	("mm5","mm4",0x09);		# 13,12,11,10
	&movd	("eax","mm1");			#  7, 6, 1, 0
	&movd	("ebx","mm5");			# 13,12,11,10
	&mov	($__key,$key);

	&pshufw	("mm5","mm4",0x09);		# 13,12,11,10
	&movz	($acc,&LB("eax"));		#  0
	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  0
	&movd	("ebx","mm5");			# 13,12,11,10
	&movz	("edx",&HB("eax"));		#  1
	&pshufw	("mm2","mm0",0x06);		#  3, 2, 5, 4
	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  0
	&movz	($key,&LB("ebx"));		# 10
	&movz	("edx",&BP(-128,$tbl,"edx",1));	#  1
	&shr	("eax",16);			#  7, 6
	&shl	("edx",8);			#  1

	&pshufw	("mm2","mm0",0x06);		#  3, 2, 5, 4
	&movz	($acc,&LB("ebx"));		# 10
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 10
	&movz	($acc,&BP(-128,$tbl,$key,1));	# 10
	&movz	($key,&HB("ebx"));		# 11
	&shl	($acc,16);			# 10
	&pshufw	("mm6","mm4",0x03);		# 9, 8,15,14
	&or	("ecx",$acc);			# 10
	&shr	("eax",16);			#  7, 6
	&movz	($acc,&HB("ebx"));		# 11
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 11
	&movz	($acc,&BP(-128,$tbl,$key,1));	# 11
	&movz	($key,&HB("eax"));		#  7
	&shl	($acc,24);			# 11
	&or	("edx",$acc);			# 11
	&shr	("ebx",16);			# 13,12
	&or	("edx",$acc);			# 11

	&pshufw	("mm6","mm4",0x03);		# 9, 8,15,14
	&movz	($acc,&HB("eax"));		#  7
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  7
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  7
	&movz	($key,&HB("ebx"));		# 13
	&shl	($acc,24);			#  7
	&or	("ecx",$acc);			#  7
	&movz	($acc,&HB("ebx"));		# 13
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 13
	&movz	($acc,&BP(-128,$tbl,$key,1));	# 13
	&movz	($key,&LB("eax"));		#  6
	&shl	($acc,8);			# 13
	&movd	("eax","mm2");			#  3, 2, 5, 4
	&or	("ecx",$acc);			# 13
	&movd	("mm0","ecx");			# t[0] collected

	&movz	($acc,&LB("eax"));		#  6
	&movd	("eax","mm2");			#  3, 2, 5, 4
	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  6
	&shl	("ecx",16);			#  6
	&movz	($acc,&LB("ebx"));		# 12
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  6
	&movz	($key,&LB("ebx"));		# 12
	&shl	($acc,16);			#  6
	&movd	("ebx","mm6");			#  9, 8,15,14
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 12
	&movd	("mm0","ecx");			# t[0] collected
	&movz	("ecx",&BP(-128,$tbl,$key,1));	# 12
	&movz	($key,&LB("eax"));		#  4
	&or	("ecx",$acc);			# 12

	&movz	($acc,&LB("eax"));		#  4
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  4
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  4
	&movz	($key,&LB("ebx"));		# 14
	&or	("edx",$acc);			#  4
	&movz	($acc,&LB("ebx"));		# 14
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 14
	&movz	($acc,&BP(-128,$tbl,$key,1));	# 14
	&movz	($key,&HB("eax"));		#  5
	&shl	($acc,16);			# 14
	&shr	("eax",16);			#  3, 2
	&or	("edx",$acc);			# 14
	&movd	("mm1","edx");			# t[1] collected

	&movz	($acc,&HB("eax"));		#  5
	&movz	("edx",&BP(-128,$tbl,$acc,1));	#  5
	&shl	("edx",8);			#  5
	&movz	($acc,&HB("ebx"));		# 15
	&shr	("eax",16);			#  3, 2
	&movz	($acc,&BP(-128,$tbl,$acc,1));	# 15
	&shl	($acc,24);			# 15
	&or	("edx",$acc);			# 15
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  5
	&movz	($key,&HB("ebx"));		# 15
	&shr	("ebx",16);			#  9, 8
	&shl	($acc,8);			#  5
	&movd	("mm1","edx");			# t[1] collected
	&movz	("edx",&BP(-128,$tbl,$key,1));	# 15
	&movz	($key,&HB("ebx"));		#  9
	&shl	("edx",24);			# 15
	&and	("ebx",0xff);			#  8
	&or	("edx",$acc);			# 15

	&punpckldq	("mm0","mm1");		# t[0,1] collected

	&movz	($acc,&HB("ebx"));		#  9
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  9
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  9
	&movz	($key,&LB("eax"));		#  2
	&shl	($acc,8);			#  9
	&or	("ecx",$acc);			#  9
	&and	("ebx",0xff);			#  8
	&movz	("eax",&HB("eax"));		#  3
	&movz	("ebx",&BP(-128,$tbl,"ebx",1));	#  8
	&or	("ecx",$acc);			#  9
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  2
	&or	("edx","ebx");			#  8
	&movz	($acc,&LB("eax"));		#  2
	&movz	($acc,&BP(-128,$tbl,$acc,1));	#  2
	&shl	($acc,16);			#  2
	&or	("edx",$acc);			#  2
	&movd	("mm4","edx");			# t[2] collected
	&movz	("eax",&HB("eax"));		#  3
	&movz	("eax",&BP(-128,$tbl,"eax",1));	#  3
	&or	("edx",$acc);			#  2
	&shl	("eax",24);			#  3
	&or	("ecx","eax");			#  3
	&mov	($key,$__key);
	&movd	("mm4","edx");			# t[2] collected
	&movd	("mm5","ecx");			# t[3] collected

	&punpckldq	("mm4","mm5");		# t[2,3] collected
@@ -2182,7 +2189,7 @@ my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds
	&xor	("eax","eax");
	&align	(4);
	&data_word(0xABF3F689);		# rep stosd
	&set_label("skip_ezero")
	&set_label("skip_ezero");
	&mov	("esp",$_esp);
	&popf	();
    &set_label("drop_out");
@@ -2302,7 +2309,7 @@ my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds
	&xor	("eax","eax");
	&align	(4);
	&data_word(0xABF3F689);		# rep stosd
	&set_label("skip_dzero")
	&set_label("skip_dzero");
	&mov	("esp",$_esp);
	&popf	();
	&function_end_A();
@@ -2865,32 +2872,32 @@ sub deckey()
{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
  my $tmp = $tbl;

	&mov	($acc,$tp1);
	&and	($acc,0x80808080);
	&mov	($tmp,$acc);
	&shr	($tmp,7);
	&mov	($tmp,0x80808080);
	&and	($tmp,$tp1);
	&lea	($tp2,&DWP(0,$tp1,$tp1));
	&mov	($acc,$tmp);
	&shr	($tmp,7);
	&sub	($acc,$tmp);
	&and	($tp2,0xfefefefe);
	&and	($acc,0x1b1b1b1b);
	&xor	($acc,$tp2);
	&mov	($tp2,$acc);
	&xor	($tp2,$acc);
	&mov	($tmp,0x80808080);

	&and	($acc,0x80808080);
	&mov	($tmp,$acc);
	&shr	($tmp,7);
	&and	($tmp,$tp2);
	&lea	($tp4,&DWP(0,$tp2,$tp2));
	&mov	($acc,$tmp);
	&shr	($tmp,7);
	&sub	($acc,$tmp);
	&and	($tp4,0xfefefefe);
	&and	($acc,0x1b1b1b1b);
	 &xor	($tp2,$tp1);	# tp2^tp1
	&xor	($acc,$tp4);
	&mov	($tp4,$acc);
	&xor	($tp4,$acc);
	&mov	($tmp,0x80808080);

	&and	($acc,0x80808080);
	&mov	($tmp,$acc);
	&shr	($tmp,7);
	&and	($tmp,$tp4);
	&lea	($tp8,&DWP(0,$tp4,$tp4));
	&mov	($acc,$tmp);
	&shr	($tmp,7);
	 &xor	($tp4,$tp1);	# tp4^tp1
	&sub	($acc,$tmp);
	&and	($tp8,0xfefefefe);
+122 −128
Original line number Diff line number Diff line
@@ -19,9 +19,10 @@
# Performance in number of cycles per processed byte for 128-bit key:
#
#		ECB encrypt	ECB decrypt	CBC large chunk
# AMD64		33		41		13.0
# EM64T		38		59		18.6(*)
# Core 2	30		43		14.5(*)
# AMD64		33		43		13.0
# EM64T		38		56		18.6(*)
# Core 2	30		42		14.5(*)
# Atom		65		86		32.1(*)
#
# (*) with hyper-threading off

@@ -365,68 +366,66 @@ $code.=<<___;
	movzb	`&lo("$s0")`,$t0
	movzb	`&lo("$s1")`,$t1
	movzb	`&lo("$s2")`,$t2
	movzb	($sbox,$t0,1),$t0
	movzb	($sbox,$t1,1),$t1
	movzb	($sbox,$t2,1),$t2

	movzb	`&lo("$s3")`,$t3
	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	shr	\$16,$s2
	movzb	`&hi("$s3")`,$acc2
	movzb	($sbox,$t0,1),$t0
	movzb	($sbox,$t1,1),$t1
	movzb	($sbox,$t2,1),$t2
	movzb	($sbox,$t3,1),$t3
	movzb	($sbox,$acc0,1),$t4	#$t0
	movzb	($sbox,$acc1,1),$t5	#$t1

	movzb	`&hi("$s3")`,$acc2
	movzb	($sbox,$acc0,1),$t4	#$t0
	movzb	`&hi("$s0")`,$acc0
	shr	\$16,$s2
	movzb	($sbox,$acc1,1),$t5	#$t1
	movzb	`&lo("$s2")`,$acc1
	movzb	($sbox,$acc2,1),$acc2	#$t2
	movzb	($sbox,$acc0,1),$acc0	#$t3
	shr	\$16,$s3

	movzb	`&lo("$s2")`,$acc1
	shl	\$8,$t4
	shr	\$16,$s3
	shl	\$8,$t5
	movzb	($sbox,$acc1,1),$acc1	#$t0
	xor	$t4,$t0
	xor	$t5,$t1

	movzb	`&lo("$s3")`,$t4
	shr	\$16,$s0
	movzb	`&lo("$s3")`,$t4
	shr	\$16,$s1
	movzb	`&lo("$s0")`,$t5
	xor	$t5,$t1
	shl	\$8,$acc2
	shl	\$8,$acc0
	movzb	($sbox,$t4,1),$t4	#$t1
	movzb	($sbox,$t5,1),$t5	#$t2
	movzb	`&lo("$s0")`,$t5
	movzb	($sbox,$acc1,1),$acc1	#$t0
	xor	$acc2,$t2
	xor	$acc0,$t3

	shl	\$8,$acc0
	movzb	`&lo("$s1")`,$acc2
	movzb	`&hi("$s3")`,$acc0
	shl	\$16,$acc1
	movzb	($sbox,$acc2,1),$acc2	#$t3
	movzb	($sbox,$acc0,1),$acc0	#$t0
	xor	$acc0,$t3
	movzb	($sbox,$t4,1),$t4	#$t1
	movzb	`&hi("$s3")`,$acc0
	movzb	($sbox,$t5,1),$t5	#$t2
	xor	$acc1,$t0

	movzb	`&hi("$s0")`,$acc1
	shr	\$8,$s2
	movzb	`&hi("$s0")`,$acc1
	shl	\$16,$t4
	shr	\$8,$s1
	shl	\$16,$t5
	xor	$t4,$t1
	movzb	($sbox,$acc2,1),$acc2	#$t3
	movzb	($sbox,$acc0,1),$acc0	#$t0
	movzb	($sbox,$acc1,1),$acc1	#$t1
	movzb	($sbox,$s2,1),$s3	#$t3
	movzb	($sbox,$s1,1),$s2	#$t2
	shl	\$16,$t4
	shl	\$16,$t5

	shl	\$16,$acc2
	xor	$t4,$t1
	xor	$t5,$t2
	xor	$acc2,$t3

	shl	\$24,$acc0
	xor	$acc2,$t3
	shl	\$24,$acc1
	shl	\$24,$s3
	xor	$acc0,$t0
	shl	\$24,$s2
	shl	\$24,$s3
	xor	$acc1,$t1
	shl	\$24,$s2
	mov	$t0,$s0
	mov	$t1,$s1
	xor	$t2,$s2
@@ -465,12 +464,12 @@ sub enctransform()
{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");

$code.=<<___;
	mov	$s0,$acc0
	mov	$s1,$acc1
	and	\$0x80808080,$acc0
	and	\$0x80808080,$acc1
	mov	$acc0,$t0
	mov	$acc1,$t1
	mov	\$0x80808080,$t0
	mov	\$0x80808080,$t1
	and	$s0,$t0
	and	$s1,$t1
	mov	$t0,$acc0
	mov	$t1,$acc1
	shr	\$7,$t0
	lea	($s0,$s0),$r20
	shr	\$7,$t1
@@ -488,25 +487,25 @@ $code.=<<___;

	xor	$r20,$s0
	xor	$r21,$s1
	 mov	$s2,$acc0
	 mov	$s3,$acc1
	 mov	\$0x80808080,$t2
	rol	\$24,$s0
	 mov	\$0x80808080,$t3
	rol	\$24,$s1
	 and	\$0x80808080,$acc0
	 and	\$0x80808080,$acc1
	 and	$s2,$t2
	 and	$s3,$t3
	xor	$r20,$s0
	xor	$r21,$s1
	 mov	$acc0,$t2
	 mov	$acc1,$t3
	 mov	$t2,$acc0
	ror	\$16,$t0
	 mov	$t3,$acc1
	ror	\$16,$t1
	 shr	\$7,$t2
	 lea	($s2,$s2),$r20
	 shr	\$7,$t2
	xor	$t0,$s0
	xor	$t1,$s1
	 shr	\$7,$t3
	 lea	($s3,$s3),$r21
	xor	$t1,$s1
	ror	\$8,$t0
	 lea	($s3,$s3),$r21
	ror	\$8,$t1
	 sub	$t2,$acc0
	 sub	$t3,$acc1
@@ -522,23 +521,23 @@ $code.=<<___;
	xor	$acc0,$r20
	xor	$acc1,$r21

	ror	\$16,$t2
	xor	$r20,$s2
	ror	\$16,$t3
	xor	$r21,$s3
	rol	\$24,$s2
	mov	0($sbox),$acc0			# prefetch Te4
	rol	\$24,$s3
	xor	$r20,$s2
	xor	$r21,$s3
	mov	0($sbox),$acc0			# prefetch Te4
	ror	\$16,$t2
	ror	\$16,$t3
	mov	64($sbox),$acc1
	xor	$t2,$s2
	xor	$t3,$s3
	xor	$r21,$s3
	mov	128($sbox),$r20
	xor	$t2,$s2
	ror	\$8,$t2
	xor	$t3,$s3
	ror	\$8,$t3
	mov	192($sbox),$r21
	xor	$t2,$s2
	mov	192($sbox),$r21
	xor	$t3,$s3
___
}
@@ -935,70 +934,69 @@ $code.=<<___;
	movzb	`&lo("$s0")`,$t0
	movzb	`&lo("$s1")`,$t1
	movzb	`&lo("$s2")`,$t2
	movzb	($sbox,$t0,1),$t0
	movzb	($sbox,$t1,1),$t1
	movzb	($sbox,$t2,1),$t2

	movzb	`&lo("$s3")`,$t3
	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	shr	\$16,$s3
	movzb	`&hi("$s1")`,$acc2
	movzb	($sbox,$t0,1),$t0
	movzb	($sbox,$t1,1),$t1
	movzb	($sbox,$t2,1),$t2
	movzb	($sbox,$t3,1),$t3
	movzb	($sbox,$acc0,1),$t4	#$t0
	movzb	($sbox,$acc1,1),$t5	#$t1

	movzb	`&hi("$s1")`,$acc2
	movzb	($sbox,$acc0,1),$t4	#$t0
	movzb	`&hi("$s2")`,$acc0
	shr	\$16,$s2
	movzb	($sbox,$acc1,1),$t5	#$t1
	movzb	($sbox,$acc2,1),$acc2	#$t2
	movzb	($sbox,$acc0,1),$acc0	#$t3
	shr	\$16,$s3

	movzb	`&lo("$s2")`,$acc1
	shl	\$8,$t4
	shr	\$16,$s2
	shl	\$8,$t5
	movzb	($sbox,$acc1,1),$acc1	#$t0
	xor	$t4,$t0
	xor	$t5,$t1

	movzb	`&lo("$s3")`,$t4
	shl	\$8,$t4
	movzb	`&lo("$s2")`,$acc1
	shr	\$16,$s0
	xor	$t4,$t0
	shr	\$16,$s1
	movzb	`&lo("$s0")`,$t5
	movzb	`&lo("$s3")`,$t4

	shl	\$8,$acc2
	xor	$t5,$t1
	shl	\$8,$acc0
	movzb	($sbox,$t4,1),$t4	#$t1
	movzb	($sbox,$t5,1),$t5	#$t2
	movzb	`&lo("$s0")`,$t5
	movzb	($sbox,$acc1,1),$acc1	#$t0
	xor	$acc2,$t2
	xor	$acc0,$t3

	movzb	`&lo("$s1")`,$acc2
	movzb	`&hi("$s1")`,$acc0

	shl	\$16,$acc1
	xor	$acc0,$t3
	movzb	($sbox,$t4,1),$t4	#$t1
	movzb	`&hi("$s1")`,$acc0
	movzb	($sbox,$acc2,1),$acc2	#$t3
	movzb	($sbox,$acc0,1),$acc0	#$t0
	xor	$acc1,$t0

	movzb	($sbox,$t5,1),$t5	#$t2
	movzb	`&hi("$s2")`,$acc1

	shl	\$16,$acc2
	shl	\$16,$t4
	shl	\$16,$t5
	movzb	($sbox,$acc1,1),$s1	#$t1
	xor	$acc2,$t3
	movzb	`&hi("$s3")`,$acc2
	xor	$t4,$t1
	shr	\$8,$s0
	xor	$t5,$t2

	movzb	`&hi("$s3")`,$acc1
	shr	\$8,$s0
	shl	\$16,$acc2
	movzb	($sbox,$acc1,1),$s2	#$t2
	movzb	($sbox,$acc0,1),$acc0	#$t0
	movzb	($sbox,$acc1,1),$s1	#$t1
	movzb	($sbox,$acc2,1),$s2	#$t2
	movzb	($sbox,$s0,1),$s3	#$t3
	xor	$acc2,$t3

	mov	$t0,$s0
	shl	\$24,$acc0
	shl	\$24,$s1
	shl	\$24,$s2
	xor	$acc0,$t0
	xor	$acc0,$s0
	shl	\$24,$s3
	xor	$t1,$s1
	mov	$t0,$s0
	xor	$t2,$s2
	xor	$t3,$s3
___
@@ -1013,12 +1011,12 @@ sub dectransform()
  my $prefetch = shift;

$code.=<<___;
	mov	$tp10,$acc0
	mov	$tp18,$acc8
	and	$mask80,$acc0
	and	$mask80,$acc8
	mov	$acc0,$tp40
	mov	$acc8,$tp48
	mov	$mask80,$tp40
	mov	$mask80,$tp48
	and	$tp10,$tp40
	and	$tp18,$tp48
	mov	$tp40,$acc0
	mov	$tp48,$acc8
	shr	\$7,$tp40
	lea	($tp10,$tp10),$tp20
	shr	\$7,$tp48
@@ -1029,15 +1027,15 @@ $code.=<<___;
	and	$maskfe,$tp28
	and	$mask1b,$acc0
	and	$mask1b,$acc8
	xor	$tp20,$acc0
	xor	$tp28,$acc8
	mov	$acc0,$tp20
	mov	$acc8,$tp28

	and	$mask80,$acc0
	and	$mask80,$acc8
	mov	$acc0,$tp80
	mov	$acc8,$tp88
	xor	$acc0,$tp20
	xor	$acc8,$tp28
	mov	$mask80,$tp80
	mov	$mask80,$tp88

	and	$tp20,$tp80
	and	$tp28,$tp88
	mov	$tp80,$acc0
	mov	$tp88,$acc8
	shr	\$7,$tp80
	lea	($tp20,$tp20),$tp40
	shr	\$7,$tp88
@@ -1048,15 +1046,15 @@ $code.=<<___;
	and	$maskfe,$tp48
	and	$mask1b,$acc0
	and	$mask1b,$acc8
	xor	$tp40,$acc0
	xor	$tp48,$acc8
	mov	$acc0,$tp40
	mov	$acc8,$tp48

	and	$mask80,$acc0
	and	$mask80,$acc8
	mov	$acc0,$tp80
	mov	$acc8,$tp88
	xor	$acc0,$tp40
	xor	$acc8,$tp48
	mov	$mask80,$tp80
	mov	$mask80,$tp88

	and	$tp40,$tp80
	and	$tp48,$tp88
	mov	$tp80,$acc0
	mov	$tp88,$acc8
	shr	\$7,$tp80
	 xor	$tp10,$tp20		# tp2^=tp1
	shr	\$7,$tp88
@@ -1081,51 +1079,51 @@ $code.=<<___;
	mov	$tp10,$acc0
	mov	$tp18,$acc8
	xor	$tp80,$tp40		# tp4^tp1^=tp8
	xor	$tp88,$tp48		# tp4^tp1^=tp8
	shr	\$32,$acc0
	xor	$tp88,$tp48		# tp4^tp1^=tp8
	shr	\$32,$acc8
	xor	$tp20,$tp80		# tp8^=tp8^tp2^tp1=tp2^tp1
	xor	$tp28,$tp88		# tp8^=tp8^tp2^tp1=tp2^tp1
	rol	\$8,`&LO("$tp10")`	# ROTATE(tp1^tp8,8)
	xor	$tp28,$tp88		# tp8^=tp8^tp2^tp1=tp2^tp1
	rol	\$8,`&LO("$tp18")`	# ROTATE(tp1^tp8,8)
	xor	$tp40,$tp80		# tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
	rol	\$8,`&LO("$acc0")`	# ROTATE(tp1^tp8,8)
	xor	$tp48,$tp88		# tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2

	rol	\$8,`&LO("$acc0")`	# ROTATE(tp1^tp8,8)
	rol	\$8,`&LO("$acc8")`	# ROTATE(tp1^tp8,8)
	xor	`&LO("$tp80")`,`&LO("$tp10")`
	xor	`&LO("$tp88")`,`&LO("$tp18")`
	shr	\$32,$tp80
	xor	`&LO("$tp88")`,`&LO("$tp18")`
	shr	\$32,$tp88
	xor	`&LO("$tp80")`,`&LO("$acc0")`
	xor	`&LO("$tp88")`,`&LO("$acc8")`

	mov	$tp20,$tp80
	mov	$tp28,$tp88
	shr	\$32,$tp80
	shr	\$32,$tp88
	rol	\$24,`&LO("$tp20")`	# ROTATE(tp2^tp1^tp8,24)
	mov	$tp28,$tp88
	rol	\$24,`&LO("$tp28")`	# ROTATE(tp2^tp1^tp8,24)
	rol	\$24,`&LO("$tp80")`	# ROTATE(tp2^tp1^tp8,24)
	rol	\$24,`&LO("$tp88")`	# ROTATE(tp2^tp1^tp8,24)
	shr	\$32,$tp80
	xor	`&LO("$tp20")`,`&LO("$tp10")`
	shr	\$32,$tp88
	xor	`&LO("$tp28")`,`&LO("$tp18")`
	rol	\$24,`&LO("$tp80")`	# ROTATE(tp2^tp1^tp8,24)
	mov	$tp40,$tp20
	rol	\$24,`&LO("$tp88")`	# ROTATE(tp2^tp1^tp8,24)
	mov	$tp48,$tp28
	shr	\$32,$tp20
	xor	`&LO("$tp80")`,`&LO("$acc0")`
	shr	\$32,$tp28
	xor	`&LO("$tp88")`,`&LO("$acc8")`

	`"mov	0($sbox),$mask80"	if ($prefetch)`
	shr	\$32,$tp20
	shr	\$32,$tp28
	`"mov	64($sbox),$maskfe"	if ($prefetch)`
	rol	\$16,`&LO("$tp40")`	# ROTATE(tp4^tp1^tp8,16)
	`"mov	64($sbox),$maskfe"	if ($prefetch)`
	rol	\$16,`&LO("$tp48")`	# ROTATE(tp4^tp1^tp8,16)
	`"mov	128($sbox),$mask1b"	if ($prefetch)`
	rol	\$16,`&LO("$tp20")`	# ROTATE(tp4^tp1^tp8,16)
	rol	\$16,`&LO("$tp28")`	# ROTATE(tp4^tp1^tp8,16)
	`"mov	192($sbox),$tp80"	if ($prefetch)`
	xor	`&LO("$tp40")`,`&LO("$tp10")`
	rol	\$16,`&LO("$tp28")`	# ROTATE(tp4^tp1^tp8,16)
	xor	`&LO("$tp48")`,`&LO("$tp18")`
	`"mov	256($sbox),$tp88"	if ($prefetch)`
	xor	`&LO("$tp20")`,`&LO("$acc0")`
@@ -1301,10 +1299,6 @@ private_AES_set_encrypt_key:

	call	_x86_64_AES_set_encrypt_key

	mov	8(%rsp),%r15
	mov	16(%rsp),%r14
	mov	24(%rsp),%r13
	mov	32(%rsp),%r12
	mov	40(%rsp),%rbp
	mov	48(%rsp),%rbx
	add	\$56,%rsp
+45 −46

File changed.

Preview size limit exceeded, changes collapsed.

+42 −43

File changed.

Preview size limit exceeded, changes collapsed.