[vp]aes-x86[_64].pl: update from HEAD. (d90bf2ab) · Commits · CYBER - Cyber Security / TS 103 523 MSP / TLMSP / TLMSP OpenSSL

crypto/aes/asm/aes-586.pl

+145 −138

Original line number	Diff line number	Diff line
		@@ -39,7 +39,7 @@
		# but exhibits up to 10% improvement on other cores.
		#
		# Second version is "monolithic" replacement for aes_core.c, which in
		# addition to AES_[de|en]crypt implements private_AES_set_[de|en]cryption_key.
		# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
		# This made it possible to implement little-endian variant of the
		# algorithm without modifying the base C code. Motivating factor for
		# the undertaken effort was that it appeared that in tight IA-32
		@@ -103,11 +103,12 @@
		# byte for 128-bit key.
		#
		# ECB encrypt ECB decrypt CBC large chunk
		# P4 56[60] 84[100] 23
		# AMD K8 48[44] 70[79] 18
		# PIII 41[50] 61[91] 24
		# Core 2 32[38] 45[70] 18.5
		# Pentium 120 160 77
		# P4 52[54] 83[95] 23
		# AMD K8 46[41] 66[70] 18
		# PIII 41[50] 60[77] 24
		# Core 2 31[36] 45[64] 18.5
		# Atom 76[100] 96[138] 60
		# Pentium 115 150 77
		#
		# Version 4.1 switches to compact S-box even in key schedule setup.
		#
		@@ -242,7 +243,7 @@ $vertical_spin=0; # shift "vertically" defaults to 0, because of

		sub encvert()
		{ my ($te,@s) = @_;
		my $v0 = $acc, $v1 = $key;
		my ($v0,$v1) = ($acc,$key);

		&mov ($v0,$s[3]); # copy s3
		&mov (&DWP(4,"esp"),$s[2]); # save s2
		@@ -299,7 +300,7 @@ sub encvert()
		# Another experimental routine, which features "horizontal spin," but
		# eliminates one reference to stack. Strangely enough runs slower...
		sub enchoriz()
		{ my $v0 = $key, $v1 = $acc;
		{ my ($v0,$v1) = ($key,$acc);

		&movz ($v0,&LB($s0)); # 3, 2, 1, 0*
		&rotr ($s2,8); # 8,11,10, 9
		@@ -427,7 +428,7 @@ sub sse_encbody()
		######################################################################

		sub enccompact()
		{ my $Fn = mov;
		{ my $Fn = \&mov;
		while ($#_>5) { pop(@_); $Fn=sub{}; }
		my ($i,$te,@s)=@_;
		my $tmp = $key;
		@@ -476,24 +477,25 @@ sub enctransform()
		my $tmp = $tbl;
		my $r2 = $key ;

		&mov ($acc,$s[$i]);
		&and ($acc,0x80808080);
		&mov ($tmp,$acc);
		&shr ($tmp,7);
		&and ($tmp,$s[$i]);
		&lea ($r2,&DWP(0,$s[$i],$s[$i]));
		&sub ($acc,$tmp);
		&mov ($acc,$tmp);
		&shr ($tmp,7);
		&and ($r2,0xfefefefe);
		&and ($acc,0x1b1b1b1b);
		&sub ($acc,$tmp);
		&mov ($tmp,$s[$i]);
		&and ($acc,0x1b1b1b1b);
		&rotr ($tmp,16);
		&xor ($acc,$r2); # r2
		&mov ($r2,$s[$i]);

		&xor ($s[$i],$acc); # r0 ^ r2
		&rotr ($r2,16+8);
		&xor ($acc,$tmp);
		&rotl ($s[$i],24);
		&xor ($s[$i],$acc) # ROTATE(r2^r0,24) ^ r2
		&rotr ($tmp,16);
		&xor ($s[$i],$tmp);
		&rotr ($tmp,8);
		&xor ($s[$i],$tmp);
		&xor ($acc,$r2);
		&mov ($tmp,0x80808080) if ($i!=1);
		&xor ($s[$i],$acc); # ROTATE(r2^r0,24) ^ r2
		}

		&function_begin_B("_x86_AES_encrypt_compact");
		@@ -526,6 +528,7 @@ sub enctransform()
		&enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
		&enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
		&enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
		&mov ($tbl,0x80808080);
		&enctransform(2);
		&enctransform(3);
		&enctransform(0);
		@@ -607,82 +610,84 @@ sub sse_enccompact()
		&pshufw ("mm5","mm4",0x0d); # 15,14,11,10
		&movd ("eax","mm1"); # 5, 4, 1, 0
		&movd ("ebx","mm5"); # 15,14,11,10
		&mov ($__key,$key);

		&movz ($acc,&LB("eax")); # 0
		&movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
		&pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
		&movz ("edx",&HB("eax")); # 1
		&pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
		&movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
		&movz ($key,&LB("ebx")); # 10
		&movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
		&shl ("edx",8); # 1
		&shr ("eax",16); # 5, 4
		&shl ("edx",8); # 1

		&movz ($acc,&LB("ebx")); # 10
		&movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
		&movz ($acc,&BP(-128,$tbl,$key,1)); # 10
		&movz ($key,&HB("ebx")); # 11
		&shl ($acc,16); # 10
		&or ("ecx",$acc); # 10
		&pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
		&movz ($acc,&HB("ebx")); # 11
		&movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
		&or ("ecx",$acc); # 10
		&movz ($acc,&BP(-128,$tbl,$key,1)); # 11
		&movz ($key,&HB("eax")); # 5
		&shl ($acc,24); # 11
		&or ("edx",$acc); # 11
		&shr ("ebx",16); # 15,14
		&or ("edx",$acc); # 11

		&movz ($acc,&HB("eax")); # 5
		&movz ($acc,&BP(-128,$tbl,$acc,1)); # 5
		&movz ($acc,&BP(-128,$tbl,$key,1)); # 5
		&movz ($key,&HB("ebx")); # 15
		&shl ($acc,8); # 5
		&or ("ecx",$acc); # 5
		&movz ($acc,&HB("ebx")); # 15
		&movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
		&movz ($acc,&BP(-128,$tbl,$key,1)); # 15
		&movz ($key,&LB("eax")); # 4
		&shl ($acc,24); # 15
		&or ("ecx",$acc); # 15
		&movd ("mm0","ecx"); # t[0] collected

		&movz ($acc,&LB("eax")); # 4
		&movz ("ecx",&BP(-128,$tbl,$acc,1)); # 4
		&movz ($acc,&BP(-128,$tbl,$key,1)); # 4
		&movz ($key,&LB("ebx")); # 14
		&movd ("eax","mm2"); # 7, 6, 3, 2
		&movz ($acc,&LB("ebx")); # 14
		&movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
		&shl ($acc,16); # 14
		&movd ("mm0","ecx"); # t[0] collected
		&movz ("ecx",&BP(-128,$tbl,$key,1)); # 14
		&movz ($key,&HB("eax")); # 3
		&shl ("ecx",16); # 14
		&movd ("ebx","mm6"); # 13,12, 9, 8
		&or ("ecx",$acc); # 14

		&movd ("ebx","mm6"); # 13,12, 9, 8
		&movz ($acc,&HB("eax")); # 3
		&movz ($acc,&BP(-128,$tbl,$acc,1)); # 3
		&movz ($acc,&BP(-128,$tbl,$key,1)); # 3
		&movz ($key,&HB("ebx")); # 9
		&shl ($acc,24); # 3
		&or ("ecx",$acc); # 3
		&movz ($acc,&HB("ebx")); # 9
		&movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
		&movz ($acc,&BP(-128,$tbl,$key,1)); # 9
		&movz ($key,&LB("ebx")); # 8
		&shl ($acc,8); # 9
		&shr ("ebx",16); # 13,12
		&or ("ecx",$acc); # 9
		&movd ("mm1","ecx"); # t[1] collected

		&movz ($acc,&LB("ebx")); # 8
		&movz ("ecx",&BP(-128,$tbl,$acc,1)); # 8
		&shr ("ebx",16); # 13,12
		&movz ($acc,&LB("eax")); # 2
		&movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
		&shl ($acc,16); # 2
		&or ("ecx",$acc); # 2
		&movz ($acc,&BP(-128,$tbl,$key,1)); # 8
		&movz ($key,&LB("eax")); # 2
		&shr ("eax",16); # 7, 6
		&movd ("mm1","ecx"); # t[1] collected
		&movz ("ecx",&BP(-128,$tbl,$key,1)); # 2
		&movz ($key,&HB("eax")); # 7
		&shl ("ecx",16); # 2
		&and ("eax",0xff); # 6
		&or ("ecx",$acc); # 2

		&punpckldq ("mm0","mm1"); # t[0,1] collected

		&movz ($acc,&HB("eax")); # 7
		&movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
		&movz ($acc,&BP(-128,$tbl,$key,1)); # 7
		&movz ($key,&HB("ebx")); # 13
		&shl ($acc,24); # 7
		&or ("ecx",$acc); # 7
		&and ("eax",0xff); # 6
		&and ("ebx",0xff); # 12
		&movz ("eax",&BP(-128,$tbl,"eax",1)); # 6
		&or ("ecx",$acc); # 7
		&shl ("eax",16); # 6
		&movz ($acc,&BP(-128,$tbl,$key,1)); # 13
		&or ("edx","eax"); # 6
		&movz ($acc,&HB("ebx")); # 13
		&movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
		&shl ($acc,8); # 13
		&or ("ecx",$acc); # 13
		&movd ("mm4","ecx"); # t[2] collected
		&and ("ebx",0xff); # 12
		&movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12
		&or ("ecx",$acc); # 13
		&or ("edx","ebx"); # 12
		&mov ($key,$__key);
		&movd ("mm4","ecx"); # t[2] collected
		&movd ("mm5","edx"); # t[3] collected

		&punpckldq ("mm4","mm5"); # t[2,3] collected
		@@ -1222,7 +1227,7 @@ sub enclast()
		######################################################################

		sub deccompact()
		{ my $Fn = mov;
		{ my $Fn = \&mov;
		while ($#_>5) { pop(@_); $Fn=sub{}; }
		my ($i,$td,@s)=@_;
		my $tmp = $key;
		@@ -1270,30 +1275,30 @@ sub dectransform()
		my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1);
		my $tp8 = $tbl;

		&mov ($acc,$s[$i]);
		&and ($acc,0x80808080);
		&mov ($tmp,$acc);
		&mov ($tmp,0x80808080);
		&and ($tmp,$s[$i]);
		&mov ($acc,$tmp);
		&shr ($tmp,7);
		&lea ($tp2,&DWP(0,$s[$i],$s[$i]));
		&sub ($acc,$tmp);
		&and ($tp2,0xfefefefe);
		&and ($acc,0x1b1b1b1b);
		&xor ($acc,$tp2);
		&mov ($tp2,$acc);
		&xor ($tp2,$acc);
		&mov ($tmp,0x80808080);

		&and ($acc,0x80808080);
		&mov ($tmp,$acc);
		&and ($tmp,$tp2);
		&mov ($acc,$tmp);
		&shr ($tmp,7);
		&lea ($tp4,&DWP(0,$tp2,$tp2));
		&sub ($acc,$tmp);
		&and ($tp4,0xfefefefe);
		&and ($acc,0x1b1b1b1b);
		&xor ($tp2,$s[$i]); # tp2^tp1
		&xor ($acc,$tp4);
		&mov ($tp4,$acc);
		&xor ($tp4,$acc);
		&mov ($tmp,0x80808080);

		&and ($acc,0x80808080);
		&mov ($tmp,$acc);
		&and ($tmp,$tp4);
		&mov ($acc,$tmp);
		&shr ($tmp,7);
		&lea ($tp8,&DWP(0,$tp4,$tp4));
		&sub ($acc,$tmp);
		@@ -1305,13 +1310,13 @@ sub dectransform()

		&xor ($s[$i],$tp2);
		&xor ($tp2,$tp8);
		&rotl ($tp2,24);
		&xor ($s[$i],$tp4);
		&xor ($tp4,$tp8);
		&rotl ($tp4,16);
		&rotl ($tp2,24);
		&xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
		&rotl ($tp8,8);
		&rotl ($tp4,16);
		&xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
		&rotl ($tp8,8);
		&xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
		&mov ($s[0],$__s0) if($i==2); #prefetch $s0
		&mov ($s[1],$__s1) if($i==3); #prefetch $s1
		@@ -1389,85 +1394,87 @@ sub dectransform()
		sub sse_deccompact()
		{
		&pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0
		&pshufw ("mm5","mm4",0x09); # 13,12,11,10
		&movd ("eax","mm1"); # 7, 6, 1, 0
		&movd ("ebx","mm5"); # 13,12,11,10
		&mov ($__key,$key);

		&pshufw ("mm5","mm4",0x09); # 13,12,11,10
		&movz ($acc,&LB("eax")); # 0
		&movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
		&movd ("ebx","mm5"); # 13,12,11,10
		&movz ("edx",&HB("eax")); # 1
		&pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
		&movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
		&movz ($key,&LB("ebx")); # 10
		&movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
		&shr ("eax",16); # 7, 6
		&shl ("edx",8); # 1

		&pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
		&movz ($acc,&LB("ebx")); # 10
		&movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
		&movz ($acc,&BP(-128,$tbl,$key,1)); # 10
		&movz ($key,&HB("ebx")); # 11
		&shl ($acc,16); # 10
		&pshufw ("mm6","mm4",0x03); # 9, 8,15,14
		&or ("ecx",$acc); # 10
		&shr ("eax",16); # 7, 6
		&movz ($acc,&HB("ebx")); # 11
		&movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
		&movz ($acc,&BP(-128,$tbl,$key,1)); # 11
		&movz ($key,&HB("eax")); # 7
		&shl ($acc,24); # 11
		&or ("edx",$acc); # 11
		&shr ("ebx",16); # 13,12
		&or ("edx",$acc); # 11

		&pshufw ("mm6","mm4",0x03); # 9, 8,15,14
		&movz ($acc,&HB("eax")); # 7
		&movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
		&movz ($acc,&BP(-128,$tbl,$key,1)); # 7
		&movz ($key,&HB("ebx")); # 13
		&shl ($acc,24); # 7
		&or ("ecx",$acc); # 7
		&movz ($acc,&HB("ebx")); # 13
		&movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
		&movz ($acc,&BP(-128,$tbl,$key,1)); # 13
		&movz ($key,&LB("eax")); # 6
		&shl ($acc,8); # 13
		&movd ("eax","mm2"); # 3, 2, 5, 4
		&or ("ecx",$acc); # 13
		&movd ("mm0","ecx"); # t[0] collected

		&movz ($acc,&LB("eax")); # 6
		&movd ("eax","mm2"); # 3, 2, 5, 4
		&movz ("ecx",&BP(-128,$tbl,$acc,1)); # 6
		&shl ("ecx",16); # 6
		&movz ($acc,&LB("ebx")); # 12
		&movz ($acc,&BP(-128,$tbl,$key,1)); # 6
		&movz ($key,&LB("ebx")); # 12
		&shl ($acc,16); # 6
		&movd ("ebx","mm6"); # 9, 8,15,14
		&movz ($acc,&BP(-128,$tbl,$acc,1)); # 12
		&movd ("mm0","ecx"); # t[0] collected
		&movz ("ecx",&BP(-128,$tbl,$key,1)); # 12
		&movz ($key,&LB("eax")); # 4
		&or ("ecx",$acc); # 12

		&movz ($acc,&LB("eax")); # 4
		&movz ($acc,&BP(-128,$tbl,$acc,1)); # 4
		&movz ($acc,&BP(-128,$tbl,$key,1)); # 4
		&movz ($key,&LB("ebx")); # 14
		&or ("edx",$acc); # 4
		&movz ($acc,&LB("ebx")); # 14
		&movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
		&movz ($acc,&BP(-128,$tbl,$key,1)); # 14
		&movz ($key,&HB("eax")); # 5
		&shl ($acc,16); # 14
		&shr ("eax",16); # 3, 2
		&or ("edx",$acc); # 14
		&movd ("mm1","edx"); # t[1] collected

		&movz ($acc,&HB("eax")); # 5
		&movz ("edx",&BP(-128,$tbl,$acc,1)); # 5
		&shl ("edx",8); # 5
		&movz ($acc,&HB("ebx")); # 15
		&shr ("eax",16); # 3, 2
		&movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
		&shl ($acc,24); # 15
		&or ("edx",$acc); # 15
		&movz ($acc,&BP(-128,$tbl,$key,1)); # 5
		&movz ($key,&HB("ebx")); # 15
		&shr ("ebx",16); # 9, 8
		&shl ($acc,8); # 5
		&movd ("mm1","edx"); # t[1] collected
		&movz ("edx",&BP(-128,$tbl,$key,1)); # 15
		&movz ($key,&HB("ebx")); # 9
		&shl ("edx",24); # 15
		&and ("ebx",0xff); # 8
		&or ("edx",$acc); # 15

		&punpckldq ("mm0","mm1"); # t[0,1] collected

		&movz ($acc,&HB("ebx")); # 9
		&movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
		&movz ($acc,&BP(-128,$tbl,$key,1)); # 9
		&movz ($key,&LB("eax")); # 2
		&shl ($acc,8); # 9
		&or ("ecx",$acc); # 9
		&and ("ebx",0xff); # 8
		&movz ("eax",&HB("eax")); # 3
		&movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8
		&or ("ecx",$acc); # 9
		&movz ($acc,&BP(-128,$tbl,$key,1)); # 2
		&or ("edx","ebx"); # 8
		&movz ($acc,&LB("eax")); # 2
		&movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
		&shl ($acc,16); # 2
		&or ("edx",$acc); # 2
		&movd ("mm4","edx"); # t[2] collected
		&movz ("eax",&HB("eax")); # 3
		&movz ("eax",&BP(-128,$tbl,"eax",1)); # 3
		&or ("edx",$acc); # 2
		&shl ("eax",24); # 3
		&or ("ecx","eax"); # 3
		&mov ($key,$__key);
		&movd ("mm4","edx"); # t[2] collected
		&movd ("mm5","ecx"); # t[3] collected

		&punpckldq ("mm4","mm5"); # t[2,3] collected
		@@ -2182,7 +2189,7 @@ my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds
		&xor ("eax","eax");
		&align (4);
		&data_word(0xABF3F689); # rep stosd
		&set_label("skip_ezero")
		&set_label("skip_ezero");
		&mov ("esp",$_esp);
		&popf ();
		&set_label("drop_out");
		@@ -2302,7 +2309,7 @@ my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds
		&xor ("eax","eax");
		&align (4);
		&data_word(0xABF3F689); # rep stosd
		&set_label("skip_dzero")
		&set_label("skip_dzero");
		&mov ("esp",$_esp);
		&popf ();
		&function_end_A();
		@@ -2865,32 +2872,32 @@ sub deckey()
		{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
		my $tmp = $tbl;

		&mov ($acc,$tp1);
		&and ($acc,0x80808080);
		&mov ($tmp,$acc);
		&shr ($tmp,7);
		&mov ($tmp,0x80808080);
		&and ($tmp,$tp1);
		&lea ($tp2,&DWP(0,$tp1,$tp1));
		&mov ($acc,$tmp);
		&shr ($tmp,7);
		&sub ($acc,$tmp);
		&and ($tp2,0xfefefefe);
		&and ($acc,0x1b1b1b1b);
		&xor ($acc,$tp2);
		&mov ($tp2,$acc);
		&xor ($tp2,$acc);
		&mov ($tmp,0x80808080);

		&and ($acc,0x80808080);
		&mov ($tmp,$acc);
		&shr ($tmp,7);
		&and ($tmp,$tp2);
		&lea ($tp4,&DWP(0,$tp2,$tp2));
		&mov ($acc,$tmp);
		&shr ($tmp,7);
		&sub ($acc,$tmp);
		&and ($tp4,0xfefefefe);
		&and ($acc,0x1b1b1b1b);
		&xor ($tp2,$tp1); # tp2^tp1
		&xor ($acc,$tp4);
		&mov ($tp4,$acc);
		&xor ($tp4,$acc);
		&mov ($tmp,0x80808080);

		&and ($acc,0x80808080);
		&mov ($tmp,$acc);
		&shr ($tmp,7);
		&and ($tmp,$tp4);
		&lea ($tp8,&DWP(0,$tp4,$tp4));
		&mov ($acc,$tmp);
		&shr ($tmp,7);
		&xor ($tp4,$tp1); # tp4^tp1
		&sub ($acc,$tmp);
		&and ($tp8,0xfefefefe);

crypto/aes/asm/aes-x86_64.pl

+122 −128

Original line number	Diff line number	Diff line
		@@ -19,9 +19,10 @@
		# Performance in number of cycles per processed byte for 128-bit key:
		#
		# ECB encrypt ECB decrypt CBC large chunk
		# AMD64 33 41 13.0
		# EM64T 38 59 18.6(*)
		# Core 2 30 43 14.5(*)
		# AMD64 33 43 13.0
		# EM64T 38 56 18.6(*)
		# Core 2 30 42 14.5(*)
		# Atom 65 86 32.1(*)
		#
		# (*) with hyper-threading off

		@@ -365,68 +366,66 @@ $code.=<<___;
		movzb `&lo("$s0")`,$t0
		movzb `&lo("$s1")`,$t1
		movzb `&lo("$s2")`,$t2
		movzb ($sbox,$t0,1),$t0
		movzb ($sbox,$t1,1),$t1
		movzb ($sbox,$t2,1),$t2

		movzb `&lo("$s3")`,$t3
		movzb `&hi("$s1")`,$acc0
		movzb `&hi("$s2")`,$acc1
		shr \$16,$s2
		movzb `&hi("$s3")`,$acc2
		movzb ($sbox,$t0,1),$t0
		movzb ($sbox,$t1,1),$t1
		movzb ($sbox,$t2,1),$t2
		movzb ($sbox,$t3,1),$t3
		movzb ($sbox,$acc0,1),$t4 #$t0
		movzb ($sbox,$acc1,1),$t5 #$t1

		movzb `&hi("$s3")`,$acc2
		movzb ($sbox,$acc0,1),$t4 #$t0
		movzb `&hi("$s0")`,$acc0
		shr \$16,$s2
		movzb ($sbox,$acc1,1),$t5 #$t1
		movzb `&lo("$s2")`,$acc1
		movzb ($sbox,$acc2,1),$acc2 #$t2
		movzb ($sbox,$acc0,1),$acc0 #$t3
		shr \$16,$s3

		movzb `&lo("$s2")`,$acc1
		shl \$8,$t4
		shr \$16,$s3
		shl \$8,$t5
		movzb ($sbox,$acc1,1),$acc1 #$t0
		xor $t4,$t0
		xor $t5,$t1

		movzb `&lo("$s3")`,$t4
		shr \$16,$s0
		movzb `&lo("$s3")`,$t4
		shr \$16,$s1
		movzb `&lo("$s0")`,$t5
		xor $t5,$t1
		shl \$8,$acc2
		shl \$8,$acc0
		movzb ($sbox,$t4,1),$t4 #$t1
		movzb ($sbox,$t5,1),$t5 #$t2
		movzb `&lo("$s0")`,$t5
		movzb ($sbox,$acc1,1),$acc1 #$t0
		xor $acc2,$t2
		xor $acc0,$t3

		shl \$8,$acc0
		movzb `&lo("$s1")`,$acc2
		movzb `&hi("$s3")`,$acc0
		shl \$16,$acc1
		movzb ($sbox,$acc2,1),$acc2 #$t3
		movzb ($sbox,$acc0,1),$acc0 #$t0
		xor $acc0,$t3
		movzb ($sbox,$t4,1),$t4 #$t1
		movzb `&hi("$s3")`,$acc0
		movzb ($sbox,$t5,1),$t5 #$t2
		xor $acc1,$t0

		movzb `&hi("$s0")`,$acc1
		shr \$8,$s2
		movzb `&hi("$s0")`,$acc1
		shl \$16,$t4
		shr \$8,$s1
		shl \$16,$t5
		xor $t4,$t1
		movzb ($sbox,$acc2,1),$acc2 #$t3
		movzb ($sbox,$acc0,1),$acc0 #$t0
		movzb ($sbox,$acc1,1),$acc1 #$t1
		movzb ($sbox,$s2,1),$s3 #$t3
		movzb ($sbox,$s1,1),$s2 #$t2
		shl \$16,$t4
		shl \$16,$t5

		shl \$16,$acc2
		xor $t4,$t1
		xor $t5,$t2
		xor $acc2,$t3

		shl \$24,$acc0
		xor $acc2,$t3
		shl \$24,$acc1
		shl \$24,$s3
		xor $acc0,$t0
		shl \$24,$s2
		shl \$24,$s3
		xor $acc1,$t1
		shl \$24,$s2
		mov $t0,$s0
		mov $t1,$s1
		xor $t2,$s2
		@@ -465,12 +464,12 @@ sub enctransform()
		{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");

		$code.=<<___;
		mov $s0,$acc0
		mov $s1,$acc1
		and \$0x80808080,$acc0
		and \$0x80808080,$acc1
		mov $acc0,$t0
		mov $acc1,$t1
		mov \$0x80808080,$t0
		mov \$0x80808080,$t1
		and $s0,$t0
		and $s1,$t1
		mov $t0,$acc0
		mov $t1,$acc1
		shr \$7,$t0
		lea ($s0,$s0),$r20
		shr \$7,$t1
		@@ -488,25 +487,25 @@ $code.=<<___;

		xor $r20,$s0
		xor $r21,$s1
		mov $s2,$acc0
		mov $s3,$acc1
		mov \$0x80808080,$t2
		rol \$24,$s0
		mov \$0x80808080,$t3
		rol \$24,$s1
		and \$0x80808080,$acc0
		and \$0x80808080,$acc1
		and $s2,$t2
		and $s3,$t3
		xor $r20,$s0
		xor $r21,$s1
		mov $acc0,$t2
		mov $acc1,$t3
		mov $t2,$acc0
		ror \$16,$t0
		mov $t3,$acc1
		ror \$16,$t1
		shr \$7,$t2
		lea ($s2,$s2),$r20
		shr \$7,$t2
		xor $t0,$s0
		xor $t1,$s1
		shr \$7,$t3
		lea ($s3,$s3),$r21
		xor $t1,$s1
		ror \$8,$t0
		lea ($s3,$s3),$r21
		ror \$8,$t1
		sub $t2,$acc0
		sub $t3,$acc1
		@@ -522,23 +521,23 @@ $code.=<<___;
		xor $acc0,$r20
		xor $acc1,$r21

		ror \$16,$t2
		xor $r20,$s2
		ror \$16,$t3
		xor $r21,$s3
		rol \$24,$s2
		mov 0($sbox),$acc0 # prefetch Te4
		rol \$24,$s3
		xor $r20,$s2
		xor $r21,$s3
		mov 0($sbox),$acc0 # prefetch Te4
		ror \$16,$t2
		ror \$16,$t3
		mov 64($sbox),$acc1
		xor $t2,$s2
		xor $t3,$s3
		xor $r21,$s3
		mov 128($sbox),$r20
		xor $t2,$s2
		ror \$8,$t2
		xor $t3,$s3
		ror \$8,$t3
		mov 192($sbox),$r21
		xor $t2,$s2
		mov 192($sbox),$r21
		xor $t3,$s3
		___
		}
		@@ -935,70 +934,69 @@ $code.=<<___;
		movzb `&lo("$s0")`,$t0
		movzb `&lo("$s1")`,$t1
		movzb `&lo("$s2")`,$t2
		movzb ($sbox,$t0,1),$t0
		movzb ($sbox,$t1,1),$t1
		movzb ($sbox,$t2,1),$t2

		movzb `&lo("$s3")`,$t3
		movzb `&hi("$s3")`,$acc0
		movzb `&hi("$s0")`,$acc1
		shr \$16,$s3
		movzb `&hi("$s1")`,$acc2
		movzb ($sbox,$t0,1),$t0
		movzb ($sbox,$t1,1),$t1
		movzb ($sbox,$t2,1),$t2
		movzb ($sbox,$t3,1),$t3
		movzb ($sbox,$acc0,1),$t4 #$t0
		movzb ($sbox,$acc1,1),$t5 #$t1

		movzb `&hi("$s1")`,$acc2
		movzb ($sbox,$acc0,1),$t4 #$t0
		movzb `&hi("$s2")`,$acc0
		shr \$16,$s2
		movzb ($sbox,$acc1,1),$t5 #$t1
		movzb ($sbox,$acc2,1),$acc2 #$t2
		movzb ($sbox,$acc0,1),$acc0 #$t3
		shr \$16,$s3

		movzb `&lo("$s2")`,$acc1
		shl \$8,$t4
		shr \$16,$s2
		shl \$8,$t5
		movzb ($sbox,$acc1,1),$acc1 #$t0
		xor $t4,$t0
		xor $t5,$t1

		movzb `&lo("$s3")`,$t4
		shl \$8,$t4
		movzb `&lo("$s2")`,$acc1
		shr \$16,$s0
		xor $t4,$t0
		shr \$16,$s1
		movzb `&lo("$s0")`,$t5
		movzb `&lo("$s3")`,$t4

		shl \$8,$acc2
		xor $t5,$t1
		shl \$8,$acc0
		movzb ($sbox,$t4,1),$t4 #$t1
		movzb ($sbox,$t5,1),$t5 #$t2
		movzb `&lo("$s0")`,$t5
		movzb ($sbox,$acc1,1),$acc1 #$t0
		xor $acc2,$t2
		xor $acc0,$t3

		movzb `&lo("$s1")`,$acc2
		movzb `&hi("$s1")`,$acc0

		shl \$16,$acc1
		xor $acc0,$t3
		movzb ($sbox,$t4,1),$t4 #$t1
		movzb `&hi("$s1")`,$acc0
		movzb ($sbox,$acc2,1),$acc2 #$t3
		movzb ($sbox,$acc0,1),$acc0 #$t0
		xor $acc1,$t0

		movzb ($sbox,$t5,1),$t5 #$t2
		movzb `&hi("$s2")`,$acc1

		shl \$16,$acc2
		shl \$16,$t4
		shl \$16,$t5
		movzb ($sbox,$acc1,1),$s1 #$t1
		xor $acc2,$t3
		movzb `&hi("$s3")`,$acc2
		xor $t4,$t1
		shr \$8,$s0
		xor $t5,$t2

		movzb `&hi("$s3")`,$acc1
		shr \$8,$s0
		shl \$16,$acc2
		movzb ($sbox,$acc1,1),$s2 #$t2
		movzb ($sbox,$acc0,1),$acc0 #$t0
		movzb ($sbox,$acc1,1),$s1 #$t1
		movzb ($sbox,$acc2,1),$s2 #$t2
		movzb ($sbox,$s0,1),$s3 #$t3
		xor $acc2,$t3

		mov $t0,$s0
		shl \$24,$acc0
		shl \$24,$s1
		shl \$24,$s2
		xor $acc0,$t0
		xor $acc0,$s0
		shl \$24,$s3
		xor $t1,$s1
		mov $t0,$s0
		xor $t2,$s2
		xor $t3,$s3
		___
		@@ -1013,12 +1011,12 @@ sub dectransform()
		my $prefetch = shift;

		$code.=<<___;
		mov $tp10,$acc0
		mov $tp18,$acc8
		and $mask80,$acc0
		and $mask80,$acc8
		mov $acc0,$tp40
		mov $acc8,$tp48
		mov $mask80,$tp40
		mov $mask80,$tp48
		and $tp10,$tp40
		and $tp18,$tp48
		mov $tp40,$acc0
		mov $tp48,$acc8
		shr \$7,$tp40
		lea ($tp10,$tp10),$tp20
		shr \$7,$tp48
		@@ -1029,15 +1027,15 @@ $code.=<<___;
		and $maskfe,$tp28
		and $mask1b,$acc0
		and $mask1b,$acc8
		xor $tp20,$acc0
		xor $tp28,$acc8
		mov $acc0,$tp20
		mov $acc8,$tp28

		and $mask80,$acc0
		and $mask80,$acc8
		mov $acc0,$tp80
		mov $acc8,$tp88
		xor $acc0,$tp20
		xor $acc8,$tp28
		mov $mask80,$tp80
		mov $mask80,$tp88

		and $tp20,$tp80
		and $tp28,$tp88
		mov $tp80,$acc0
		mov $tp88,$acc8
		shr \$7,$tp80
		lea ($tp20,$tp20),$tp40
		shr \$7,$tp88
		@@ -1048,15 +1046,15 @@ $code.=<<___;
		and $maskfe,$tp48
		and $mask1b,$acc0
		and $mask1b,$acc8
		xor $tp40,$acc0
		xor $tp48,$acc8
		mov $acc0,$tp40
		mov $acc8,$tp48

		and $mask80,$acc0
		and $mask80,$acc8
		mov $acc0,$tp80
		mov $acc8,$tp88
		xor $acc0,$tp40
		xor $acc8,$tp48
		mov $mask80,$tp80
		mov $mask80,$tp88

		and $tp40,$tp80
		and $tp48,$tp88
		mov $tp80,$acc0
		mov $tp88,$acc8
		shr \$7,$tp80
		xor $tp10,$tp20 # tp2^=tp1
		shr \$7,$tp88
		@@ -1081,51 +1079,51 @@ $code.=<<___;
		mov $tp10,$acc0
		mov $tp18,$acc8
		xor $tp80,$tp40 # tp4^tp1^=tp8
		xor $tp88,$tp48 # tp4^tp1^=tp8
		shr \$32,$acc0
		xor $tp88,$tp48 # tp4^tp1^=tp8
		shr \$32,$acc8
		xor $tp20,$tp80 # tp8^=tp8^tp2^tp1=tp2^tp1
		xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1
		rol \$8,`&LO("$tp10")` # ROTATE(tp1^tp8,8)
		xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1
		rol \$8,`&LO("$tp18")` # ROTATE(tp1^tp8,8)
		xor $tp40,$tp80 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
		rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8)
		xor $tp48,$tp88 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2

		rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8)
		rol \$8,`&LO("$acc8")` # ROTATE(tp1^tp8,8)
		xor `&LO("$tp80")`,`&LO("$tp10")`
		xor `&LO("$tp88")`,`&LO("$tp18")`
		shr \$32,$tp80
		xor `&LO("$tp88")`,`&LO("$tp18")`
		shr \$32,$tp88
		xor `&LO("$tp80")`,`&LO("$acc0")`
		xor `&LO("$tp88")`,`&LO("$acc8")`

		mov $tp20,$tp80
		mov $tp28,$tp88
		shr \$32,$tp80
		shr \$32,$tp88
		rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24)
		mov $tp28,$tp88
		rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24)
		rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24)
		rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24)
		shr \$32,$tp80
		xor `&LO("$tp20")`,`&LO("$tp10")`
		shr \$32,$tp88
		xor `&LO("$tp28")`,`&LO("$tp18")`
		rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24)
		mov $tp40,$tp20
		rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24)
		mov $tp48,$tp28
		shr \$32,$tp20
		xor `&LO("$tp80")`,`&LO("$acc0")`
		shr \$32,$tp28
		xor `&LO("$tp88")`,`&LO("$acc8")`

		`"mov 0($sbox),$mask80" if ($prefetch)`
		shr \$32,$tp20
		shr \$32,$tp28
		`"mov 64($sbox),$maskfe" if ($prefetch)`
		rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16)
		`"mov 64($sbox),$maskfe" if ($prefetch)`
		rol \$16,`&LO("$tp48")` # ROTATE(tp4^tp1^tp8,16)
		`"mov 128($sbox),$mask1b" if ($prefetch)`
		rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16)
		rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16)
		`"mov 192($sbox),$tp80" if ($prefetch)`
		xor `&LO("$tp40")`,`&LO("$tp10")`
		rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16)
		xor `&LO("$tp48")`,`&LO("$tp18")`
		`"mov 256($sbox),$tp88" if ($prefetch)`
		xor `&LO("$tp20")`,`&LO("$acc0")`
		@@ -1301,10 +1299,6 @@ private_AES_set_encrypt_key:

		call _x86_64_AES_set_encrypt_key

		mov 8(%rsp),%r15
		mov 16(%rsp),%r14
		mov 24(%rsp),%r13
		mov 32(%rsp),%r12
		mov 40(%rsp),%rbp
		mov 48(%rsp),%rbx
		add \$56,%rsp

crypto/aes/asm/vpaes-x86.pl

+45 −46

File changed.

Preview size limit exceeded, changes collapsed.

crypto/aes/asm/vpaes-x86_64.pl

+42 −43

File changed.

Preview size limit exceeded, changes collapsed.