aes-586.pl

#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# Version 3.5.
#
# You might fail to appreciate this module performance from the first
# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
# to be *the* best Intel C compiler without -KPIC, performance appears
# to be virtually identical... But try to re-configure with shared
# library support... Aha! Intel compiler "suddenly" lags behind by 30%
# [on P4, more on others]:-) And if compared to position-independent
# code generated by GNU C, this code performs *more* than *twice* as
# fast! Yes, all this buzz about PIC means that unlike other hand-
# coded implementations, this one was explicitly designed to be safe
# to use even in shared library context... This also means that this
# code isn't necessarily absolutely fastest "ever," because in order
# to achieve position independence an extra register has to be
# off-loaded to stack, which affects the benchmark result.
#
# Special note about instruction choice. Do you recall RC4_INT code
# performing poorly on P4? It might be the time to figure out why.
# RC4_INT code implies effective address calculations in base+offset*4
# form. Trouble is that it seems that offset scaling turned to be
# critical path... At least eliminating scaling resulted in 2.8x RC4
# performance improvement [as you might recall]. As AES code is hungry
# for scaling too, I [try to] avoid the latter by favoring off-by-2
# shifts and masking the result with 0xFF<<2 instead of "boring" 0xFF.
#
# As was shown by Dean Gaudet <dean@arctic.org>, the above note turned
# void. Performance improvement with off-by-2 shifts was observed on
# intermediate implementation, which was spilling yet another register
# to stack... Final offset*4 code below runs just a tad faster on P4,
# but exhibits up to 10% improvement on other cores.
#
# Second version is "monolithic" replacement for aes_core.c, which in
# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
# This made it possible to implement little-endian variant of the
# algorithm without modifying the base C code. Motivating factor for
# the undertaken effort was that it appeared that in tight IA-32
# register window little-endian flavor could achieve slightly higher
# Instruction Level Parallelism, and it indeed resulted in up to 15%
# better performance on most recent µ-archs...
#
# Third version adds AES_cbc_encrypt implementation, which resulted in
# up to 40% performance imrovement of CBC benchmark results. 40% was
# observed on P4 core, where "overall" imrovement coefficient, i.e. if
# compared to PIC generated by GCC and in CBC mode, was observed to be
# as large as 4x:-) CBC performance is virtually identical to ECB now
# and on some platforms even better, e.g. 17.6 "small" cycles/byte on
# Opteron, because certain function prologues and epilogues are
# effectively taken out of the loop...
#
# Version 3.2 implements compressed tables and prefetch of these tables
# in CBC[!] mode. Former means that 3/4 of table references are now
# misaligned, which unfortunately has negative impact on elder IA-32
# implementations, Pentium suffered 30% penalty, PIII - 10%.
#
# Version 3.3 avoids L1 cache aliasing between stack frame and
# S-boxes, and 3.4 - L1 cache aliasing even between key schedule. The
# latter is achieved by copying the key schedule to controlled place in
# stack. This unfortunately has rather strong impact on small block CBC
# performance, ~2x deterioration on 16-byte block if compared to 3.3.
#
# Version 3.5 checks if there L1 cache aliasing between user-supplied
# key schedule and S-boxes and abstains from copying the former if
# there no. This allows end-user to consciously retain small block
# performance by aligning key schedule in specific manner.
#
# Current ECB performance numbers for 128-bit key in CPU cycles per
# processed byte [measure commonly used by AES benchmarkers] are:
#
#		small footprint		fully unrolled
# P4		24			22
# AMD K8	20			19
# PIII		25			23
# Pentium	81			78

push(@INC,"perlasm","../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],"aes-586.pl",$ARGV[$#ARGV] eq "386");

$s0="eax";
$s1="ebx";
$s2="ecx";
$s3="edx";
$key="edi";
$acc="esi";

$compromise=0;		# $compromise=128 abstains from copying key
			# schedule to stack when encrypting inputs
			# shorter than 128 bytes at the cost of
			# risksing aliasing with S-boxes. In return
			# you get way better, up to +70%, small block
			# performance.
$small_footprint=1;	# $small_footprint=1 code is ~5% slower [on
			# recent µ-archs], but ~5 times smaller!
			# I favor compact code to minimize cache
			# contention and in hope to "collect" 5% back
			# in real-life applications...
$vertical_spin=0;	# shift "verticaly" defaults to 0, because of
			# its proof-of-concept status...

# Note that there is no decvert(), as well as last encryption round is
# performed with "horizontal" shifts. This is because this "vertical"
# implementation [one which groups shifts on a given $s[i] to form a
# "column," unlike "horizontal" one, which groups shifts on different
# $s[i] to form a "row"] is work in progress. It was observed to run
# few percents faster on Intel cores, but not AMD. On AMD K8 core it's
# whole 12% slower:-( So we face a trade-off... Shall it be resolved
# some day? Till then the code is considered experimental and by
# default remains dormant...

sub encvert()
{ my ($te,@s) = @_;
  my $v0 = $acc, $v1 = $key;

	&mov	($v0,$s[3]);				# copy s3
	&mov	(&DWP(4,"esp"),$s[2]);			# save s2
	&mov	($v1,$s[0]);				# copy s0
	&mov	(&DWP(8,"esp"),$s[1]);			# save s1

	&movz	($s[2],&HB($s[0]));
	&and	($s[0],0xFF);
	&mov	($s[0],&DWP(0,$te,$s[0],8));		# s0>>0
	&shr	($v1,16);
	&mov	($s[3],&DWP(3,$te,$s[2],8));		# s0>>8
	&movz	($s[1],&HB($v1));
	&and	($v1,0xFF);
	&mov	($s[2],&DWP(2,$te,$v1,8));		# s0>>16
	 &mov	($v1,$v0);
	&mov	($s[1],&DWP(1,$te,$s[1],8));		# s0>>24

	&and	($v0,0xFF);
	&xor	($s[3],&DWP(0,$te,$v0,8));		# s3>>0
	&movz	($v0,&HB($v1));
	&shr	($v1,16);
	&xor	($s[2],&DWP(3,$te,$v0,8));		# s3>>8
	&movz	($v0,&HB($v1));
	&and	($v1,0xFF);
	&xor	($s[1],&DWP(2,$te,$v1,8));		# s3>>16
	 &mov	($v1,&DWP(4,"esp"));			# restore s2
	&xor	($s[0],&DWP(1,$te,$v0,8));		# s3>>24

	&mov	($v0,$v1);
	&and	($v1,0xFF);
	&xor	($s[2],&DWP(0,$te,$v1,8));		# s2>>0
	&movz	($v1,&HB($v0));
	&shr	($v0,16);
	&xor	($s[1],&DWP(3,$te,$v1,8));		# s2>>8
	&movz	($v1,&HB($v0));
	&and	($v0,0xFF);
	&xor	($s[0],&DWP(2,$te,$v0,8));		# s2>>16
	 &mov	($v0,&DWP(8,"esp"));			# restore s1
	&xor	($s[3],&DWP(1,$te,$v1,8));		# s2>>24

	&mov	($v1,$v0);
	&and	($v0,0xFF);
	&xor	($s[1],&DWP(0,$te,$v0,8));		# s1>>0
	&movz	($v0,&HB($v1));
	&shr	($v1,16);
	&xor	($s[0],&DWP(3,$te,$v0,8));		# s1>>8
	&movz	($v0,&HB($v1));
	&and	($v1,0xFF);
	&xor	($s[3],&DWP(2,$te,$v1,8));		# s1>>16
	 &mov	($key,&DWP(12,"esp"));			# reincarnate v1 as key
	&xor	($s[2],&DWP(1,$te,$v0,8));		# s1>>24
}

sub encstep()
{ my ($i,$te,@s) = @_;
  my $tmp = $key;
  my $out = $i==3?$s[0]:$acc;

	# lines marked with #%e?x[i] denote "reordered" instructions...
	if ($i==3)  {	&mov	($key,&DWP(12,"esp"));		}##%edx
	else        {	&mov	($out,$s[0]);
			&and	($out,0xFF);			}
	if ($i==1)  {	&shr	($s[0],16);			}#%ebx[1]
	if ($i==2)  {	&shr	($s[0],24);			}#%ecx[2]
			&mov	($out,&DWP(0,$te,$out,8));

	if ($i==3)  {	$tmp=$s[1];				}##%eax
			&movz	($tmp,&HB($s[1]));
			&xor	($out,&DWP(3,$te,$tmp,8));

	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],&DWP(4,"esp"));	}##%ebx
	else        {	&mov	($tmp,$s[2]);
			&shr	($tmp,16);			}
	if ($i==2)  {	&and	($s[1],0xFF);			}#%edx[2]
			&and	($tmp,0xFF);
			&xor	($out,&DWP(2,$te,$tmp,8));

	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],&DWP(8,"esp"));	}##%ecx
	elsif($i==2){	&movz	($tmp,&HB($s[3]));		}#%ebx[2]