aes-586.pl

#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# You might fail to appreciate this module's performance at the first
# try. If compared to "vanilla" linux-ia32-icc target, i.e. Intel C
# without -KPIC, performance appears to be virtually identical... But
# try to configure with shared library support... Aha! Intel compiler
# "suddenly" lags behind by 30% [on P4]:-) And if compared to
# position-independent code generated by GNU C, this code performs
# more than *twice* as fast! Yes, all this buzz about PIC means that
# [unlike other implementations] this module was explicitly designed
# to be safe to use even in shared library context...
#
# Special note about instruction choice. Do you recall RC4_INT code
# performing poorly on P4? It might be the time to figure out why.
# RC4_INT code implies effective address calculations in base+offset*4
# form. The trouble is that offset scaling seems to have turned out to
# be on the critical path... At least eliminating scaling resulted in
# a 2.8x RC4 performance improvement [as you might recall]. As AES code
# is hungry for scaling too, I [try to] avoid the latter by favoring
# off-by-2 shifts and masking the result with 0xFF<<2 instead of
# "boring" 0xFF.

push(@INC,"perlasm","../../perlasm");
require "x86asm.pl";

# Initialise the perlasm backend loaded from x86asm.pl above. Arguments:
# the first command-line argument, this script's name, and a flag that is
# true when the last command-line argument is the string "386".
# NOTE(review): presumably the first argument selects the assembler
# flavour and the flag restricts output to 386-compatible instructions --
# confirm against x86asm.pl.
&asm_init($ARGV[0],"aes-586.pl",$ARGV[$#ARGV] eq "386");

# Code-size switch: with $small_footprint=1 the generated code is 5-9%
# slower, but 5 times smaller, so compact code is the default.
$small_footprint = 1;

# Registers holding the four 32-bit words the generator works on.
($s0, $s1, $s2, $s3) = ("eax", "ebx", "ecx", "edx");

# encstep($i,$te,@s) - emit the instructions that compute output word $i
# (0..3) of one table-driven round.
#
#   $i   index of the output word being produced
#   $te  register holding the base address of the lookup tables; four
#        tables are addressed at byte offsets 0, 1024, 2048 and 3072
#   @s   the four input word registers, already rotated by the caller so
#        that $s[0] supplies bits 24-31, $s[1] bits 16-23, $s[2] bits
#        8-15 and $s[3] bits 0-7 of the table indices
#
# Register/stack protocol of the emitted code:
#   $i==0,1  result is built in esi and stored to 4*$i(%esp); edi is
#            used as scratch (and is therefore clobbered)
#   $i==2    result is left in esi; edi is scratch
#   $i==3    result is built in place in $s[0]; edi is reloaded from
#            12(%esp), $s[1] from 0(%esp), $s[2] from 4(%esp) and $s[3]
#            from esi, i.e. the words produced by steps 0..2 are moved
#            back into the working registers
#
# Nothing is returned; the sub only emits code through the perlasm
# helpers (&mov, &shr, &and, &xor, &movz, &DWP, &HB).
#
# Note: the empty "()" prototype was dropped. It was misleading for a sub
# that takes arguments and only worked because every caller uses the
# "&encstep(...)" form, which bypasses prototypes.
sub encstep
{ my ($i,$te,@s) = @_;
  my ($tmp,$out);	# parenthesised on purpose: "my $tmp,$out;" would
			# declare only $tmp and leave $out a package global

	# bits 24-31 of $s[0] -> table at offset 0
	# (shift by 24-2 and mask with 0xFF<<2 yields index*4 directly,
	# so no scaled addressing is needed for this lookup)
	if ($i==3)  {	$out=$s[0]; &mov ("edi",&DWP(12,"esp"));}
	else        {	$out="esi"; &mov ($out,$s[0]);		}
			&shr	($out,24-2);
			&and	($out,0xFF<<2);
			&mov	($out,&DWP(1024*0,$te,$out));

	# bits 16-23 of $s[1] -> table at offset 1024, same pre-scaled trick
	if ($i==3)  {	$tmp=$s[1];				}
	else        {	$tmp="edi"; &mov ($tmp,$s[1]);		}
			&shr	($tmp,16-2);
			&and	($tmp,0xFF<<2);
			&xor	($out,&DWP(1024*1,$te,$tmp));

	# bits 8-15 of $s[2] -> table at offset 2048; &HB() is presumably
	# the high-byte alias (ah/bh/ch/dh) of the register, so the index
	# is unscaled and the lookup uses a *4 scale factor
	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],&DWP(0,"esp"));	}
	else        {	$tmp="edi";				}
			&movz	($tmp,&HB($s[2]));
			&xor	($out,&DWP(1024*2,$te,$tmp,4));

	# bits 0-7 of $s[3] -> table at offset 3072, again with *4 scale
	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],&DWP(4,"esp"));	}
	else        {	$tmp="edi"; &mov ($tmp,$s[3]);		}
			&and	($tmp,0xFF);
			&xor	($out,&DWP(1024*3,$te,$tmp,4));

	# park words 0 and 1 on the stack; word 2 stays in esi and is
	# picked up by the final step
	if ($i<2)   {	&mov	(&DWP(4*$i,"esp"),$out);	}
	if ($i==3)  {	&mov	($s[3],"esi");			}
}

# enclast($i,$te,@s) - emit the instructions that compute output word $i
# (0..3) of the round variant that combines masked table entries.
#
#   $i   index of the output word being produced
#   $te  register holding the base address of the lookup table; unlike
#        encstep() all four lookups read from offset 0 of this one table
#   @s   the four input word registers, already rotated by the caller so
#        that $s[0] supplies bits 24-31, $s[1] bits 16-23, $s[2] bits
#        8-15 and $s[3] bits 0-7 of the table indices
#
# Each lookup fetches a 32-bit entry and keeps a single byte lane of it
# (0xff000000, 0x00ff0000, 0x0000ff00, 0x000000ff in turn); the four
# lanes are XORed together to form the output word.
# NOTE(review): this presumably expects $te to point at a table whose
# entries carry the S-box byte in every lane -- confirm at the call site.
#
# Register/stack protocol of the emitted code (same as encstep()):
#   $i==0,1  result is built in esi and stored to 4*$i(%esp); edi is
#            used as scratch (and is therefore clobbered)
#   $i==2    result is left in esi; edi is scratch
#   $i==3    result is built in place in $s[0]; edi is reloaded from
#            12(%esp), $s[1] from 0(%esp), $s[2] from 4(%esp) and $s[3]
#            from esi
#
# Nothing is returned; the sub only emits code through the perlasm
# helpers (&mov, &shr, &and, &xor, &movz, &DWP, &HB).
#
# Note: the empty "()" prototype was dropped. It was misleading for a sub
# that takes arguments and only worked because every caller uses the
# "&enclast(...)" form, which bypasses prototypes.
sub enclast
{ my ($i,$te,@s)=@_;
  my ($tmp,$out);	# parenthesised on purpose: "my $tmp,$out;" would
			# declare only $tmp and leave $out a package global

	# bits 24-31 of $s[0] select the entry; keep its top byte lane
	if ($i==3)  {	$out=$s[0]; &mov ("edi",&DWP(12,"esp"));}
	else        {	$out="esi"; &mov ($out,$s[0]);		}
			&shr	($out,24-2);
			&and	($out,0xFF<<2);
			&mov	($out,&DWP(0,$te,$out));
			&and	($out,0xff000000);

	# bits 16-23 of $s[1] select the entry; keep byte lane 2
	if ($i==3)  {	$tmp=$s[1];				}
	else        {	$tmp="edi"; &mov ($tmp,$s[1]);		}
			&shr	($tmp,16-2);
			&and	($tmp,0xFF<<2);
			&mov	($tmp,&DWP(0,$te,$tmp));
			&and	($tmp,0x00ff0000);
			&xor	($out,$tmp);

	# bits 8-15 of $s[2] select the entry (unscaled index, *4 scale
	# in the address); keep byte lane 1
	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],&DWP(0,"esp"));	}
	else        {	$tmp="edi"; 				}
			&movz	($tmp,&HB($s[2]));
			&mov	($tmp,&DWP(0,$te,$tmp,4));
			&and	($tmp,0x0000ff00);
			&xor	($out,$tmp);

	# bits 0-7 of $s[3] select the entry; keep the low byte lane
	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],&DWP(4,"esp"));	}
	else        {	$tmp="edi"; &mov ($tmp,$s[3]);		}
			&and	($tmp,0xFF);
			&mov	($tmp,&DWP(0,$te,$tmp,4));
			&and	($tmp,0x000000ff);
			&xor	($out,$tmp);

	# park words 0 and 1 on the stack; word 2 stays in esi and is
	# picked up by the final step
	if ($i<2)   {	&mov	(&DWP(4*$i,"esp"),$out);	}
	if ($i==3)  {	&mov	($s[3],"esi");			}
}

# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
&public_label("AES_Te");
&function_begin("AES_encrypt");
	&mov	("esi",&wparam(0));		# load inp
	&mov	("edi",&wparam(2));		# load key

        &call   (&label("pic_point"));          # make it PIC!
&set_label("pic_point");
        &blindpop("ebp");
        &lea    ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp"));

	# allocate aligned stack frame
	&mov	("eax","esp");
	&sub	("esp",20);
	&and	("esp",-16);

	&mov	(&DWP(12,"esp"),"edi");		# save key
	&mov	(&DWP(16,"esp"),"eax");		# save %esp

	&mov	($s0,&DWP(0,"esi"));		# load input data
	&mov	($s1,&DWP(4,"esi"));
	&mov	($s2,&DWP(8,"esi"));
	&mov	($s3,&DWP(12,"esi"));
	#
	# It's perfectly possible to implement the algorithm as
	# little-endian and get rid of the bswaps... It would give
	# less than 1% performance improvement, so I judge it
	# isn't worth the trouble...
	#
	&bswap	($s0);
	&bswap	($s1);
	&bswap	($s2);
	&bswap	($s3);
	&xor	($s0,&DWP(0,"edi"));
	&xor	($s1,&DWP(4,"edi"));
	&xor	($s2,&DWP(8,"edi"));
	&xor	($s3,&DWP(12,"edi"));

	&mov	("esi",&DWP(240,"edi"));	# load key->rounds

	if ($small_footprint) {
	    &lea	("esi",&DWP(-2,"esi","esi"));
	    &lea	("esi",&DWP(0,"edi","esi",8));
	    &mov	(&DWP(8,"esp"),"esi");	# end of key schedule
	    &align	(4);
	    &set_label("loop");
		&encstep(0,"ebp",$s0,$s1,$s2,$s3);
		&encstep(1,"ebp",$s1,$s2,$s3,$s0);
		&encstep(2,"ebp",$s2,$s3,$s0,$s1);
		&encstep(3,"ebp",$s3,$s0,$s1,$s2);
		&add	("edi",16);			# advance rd_key
		&xor	($s0,&DWP(0,"edi"));
		&xor	($s1,&DWP(4,"edi"));
		&xor	($s2,&DWP(8,"edi"));
		&xor	($s3,&DWP(12,"edi"));
	    &cmp	("edi",&DWP(8,"esp"));
	    &mov	(&DWP(12,"esp"),"edi");
	    &jb		(&label("loop"));
	}
	else {
	    &cmp	("esi",10);
	    &jle	(&label("10rounds"));
	    &cmp	("esi",12);
	    &jle	(&label("12rounds"));

	&set_label("14rounds");
	    for ($i=1;$i<3;$i++) {
		&encstep(0,"ebp",$s0,$s1,$s2,$s3);
		&encstep(1,"ebp",$s1,$s2,$s3,$s0);
		&encstep(2,"ebp",$s2,$s3,$s0,$s1);
		&encstep(3,"ebp",$s3,$s0,$s1,$s2);
		&xor	($s0,&DWP(16*$i+0,"edi"));
		&xor	($s1,&DWP(16*$i+4,"edi"));
		&xor	($s2,&DWP(16*$i+8,"edi"));
		&xor	($s3,&DWP(16*$i+12,"edi"));
	    }
	    &add	("edi",32);
	    &mov	(&DWP(12,"esp"),"edi");		# advance rd_key
	&set_label("12rounds");
	    for ($i=1;$i<3;$i++) {
		&encstep(0,"ebp",$s0,$s1,$s2,$s3);
		&encstep(1,"ebp",$s1,$s2,$s3,$s0);
		&encstep(2,"ebp",$s2,$s3,$s0,$s1);
		&encstep(3,"ebp",$s3,$s0,$s1,$s2);
		&xor	($s0,&DWP(16*$i+0,"edi"));
		&xor	($s1,&DWP(16*$i+4,"edi"));
		&xor	($s2,&DWP(16*$i+8,"edi"));
		&xor	($s3,&DWP(16*$i+12,"edi"));
	    }
	    &add	("edi",32);
	    &mov	(&DWP(12,"esp"),"edi");		# advance rd_key
	&set_label("10rounds");
	    for ($i=1;$i<10;$i++) {
		&encstep(0,"ebp",$s0,$s1,$s2,$s3);
		&encstep(1,"ebp",$s1,$s2,$s3,$s0);
		&encstep(2,"ebp",$s2,$s3,$s0,$s1);