aes-x86_64.pl

#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Version 2.1.
#
# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
# [you'll notice a lot of resemblance], such as compressed S-boxes
# in little-endian byte order, prefetch of these tables in CBC mode,
# as well as avoiding L1 cache aliasing between stack frame and key
# schedule and already mentioned tables, compressed Td4...
#
# Performance in number of cycles per processed byte for 128-bit key:
#
#		ECB encrypt	ECB decrypt	CBC large chunk
# AMD64		33		41		13.0
# EM64T		38		59		18.6(*)
# Core 2	30		43		14.5(*)
#
# (*) with hyper-threading off

$verticalspin=1;	# unlike 32-bit version $verticalspin performs
			# ~15% better on both AMD and Intel cores
$speed_limit=512;	# see aes-586.pl for details
$output=shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| $^X $xlate $output";

$code=".text\n";

$s0="%eax";
$s1="%ebx";
$s2="%ecx";
$s3="%edx";
$acc0="%esi";	$mask80="%rsi";
$acc1="%edi";	$maskfe="%rdi";
$acc2="%ebp";	$mask1b="%rbp";
$inp="%r8";
$out="%r9";
$t0="%r10d";
$t1="%r11d";
$t2="%r12d";
$rnds="%r13d";
$sbox="%r14";
$key="%r15";

sub hi() { my $r=shift;	$r =~ s/%[er]([a-d])x/%\1h/;	$r; }
sub lo() { my $r=shift;	$r =~ s/%[er]([a-d])x/%\1l/;
			$r =~ s/%[er]([sd]i)/%\1l/;
			$r =~ s/%(r[0-9]+)[d]?/%\1b/;	$r; }
sub LO() { my $r=shift; $r =~ s/%r([a-z]+)/%e\1/;
			$r =~ s/%r([0-9]+)/%r\1d/;	$r; }
sub _data_word()
{ my $i;
    while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
}
sub data_word()
{ my $i;
  my $last=pop(@_);
    $code.=".long\t";
    while(defined($i=shift)) { $code.=sprintf"0x%08x,",$i; }
    $code.=sprintf"0x%08x\n",$last;
}

sub data_byte()
{ my $i;
  my $last=pop(@_);
    $code.=".byte\t";
    while(defined($i=shift)) { $code.=sprintf"0x%02x,",$i&0xff; }
    $code.=sprintf"0x%02x\n",$last&0xff;
}

sub encvert()
{ my $t3="%r8d";	# zaps $inp!

$code.=<<___;
	# favor 3-way issue Opteron pipeline...
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	mov	0($sbox,$acc0,8),$t0
	mov	0($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t2

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	movzb	`&lo("$s3")`,$acc2
	xor	3($sbox,$acc0,8),$t0
	xor	3($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t3

	movzb	`&hi("$s3")`,$acc0
	shr	\$16,$s2
	movzb	`&hi("$s0")`,$acc2
	xor	3($sbox,$acc0,8),$t2
	shr	\$16,$s3
	xor	3($sbox,$acc2,8),$t3

	shr	\$16,$s1
	lea	16($key),$key
	shr	\$16,$s0

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	xor	2($sbox,$acc0,8),$t0
	xor	2($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t2

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	movzb	`&lo("$s1")`,$acc2
	xor	1($sbox,$acc0,8),$t0
	xor	1($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t3

	mov	12($key),$s3
	movzb	`&hi("$s1")`,$acc1
	movzb	`&hi("$s2")`,$acc2
	mov	0($key),$s0
	xor	1($sbox,$acc1,8),$t2
	xor	1($sbox,$acc2,8),$t3

	mov	4($key),$s1
	mov	8($key),$s2
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}

sub enclastvert()
{ my $t3="%r8d";	# zaps $inp!

$code.=<<___;
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	movzb	2($sbox,$acc0,8),$t0
	movzb	2($sbox,$acc1,8),$t1
	movzb	2($sbox,$acc2,8),$t2

	movzb	`&lo("$s3")`,$acc0
	movzb	`&hi("$s1")`,$acc1
	movzb	`&hi("$s2")`,$acc2
	movzb	2($sbox,$acc0,8),$t3
	mov	0($sbox,$acc1,8),$acc1	#$t0
	mov	0($sbox,$acc2,8),$acc2	#$t1

	and	\$0x0000ff00,$acc1
	and	\$0x0000ff00,$acc2

	xor	$acc1,$t0
	xor	$acc2,$t1
	shr	\$16,$s2

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	shr	\$16,$s3
	mov	0($sbox,$acc0,8),$acc0	#$t2
	mov	0($sbox,$acc1,8),$acc1	#$t3

	and	\$0x0000ff00,$acc0
	and	\$0x0000ff00,$acc1
	shr	\$16,$s1
	xor	$acc0,$t2
	xor	$acc1,$t3
	shr	\$16,$s0

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	mov	0($sbox,$acc0,8),$acc0	#$t0
	mov	0($sbox,$acc1,8),$acc1	#$t1
	mov	0($sbox,$acc2,8),$acc2	#$t2

	and	\$0x00ff0000,$acc0
	and	\$0x00ff0000,$acc1
	and	\$0x00ff0000,$acc2

	xor	$acc0,$t0
	xor	$acc1,$t1
	xor	$acc2,$t2

	movzb	`&lo("$s1")`,$acc0
	movzb	`&hi("$s3")`,$acc1
	movzb	`&hi("$s0")`,$acc2
	mov	0($sbox,$acc0,8),$acc0	#$t3
	mov	2($sbox,$acc1,8),$acc1	#$t0
	mov	2($sbox,$acc2,8),$acc2	#$t1

	and	\$0x00ff0000,$acc0
	and	\$0xff000000,$acc1
	and	\$0xff000000,$acc2

	xor	$acc0,$t3
	xor	$acc1,$t0
	xor	$acc2,$t1

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	mov	16+12($key),$s3
	mov	2($sbox,$acc0,8),$acc0	#$t2
	mov	2($sbox,$acc1,8),$acc1	#$t3
	mov	16+0($key),$s0

	and	\$0xff000000,$acc0
	and	\$0xff000000,$acc1

	xor	$acc0,$t2
	xor	$acc1,$t3

	mov	16+4($key),$s1
	mov	16+8($key),$s2
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}

sub encstep()
{ my ($i,@s) = @_;
  my $tmp0=$acc0;
  my $tmp1=$acc1;
  my $tmp2=$acc2;
  my $out=($t0,$t1,$t2,$s[0])[$i];

	if ($i==3) {
		$tmp0=$s[1];
		$tmp1=$s[2];
		$tmp2=$s[3];
	}
	$code.="	movzb	".&lo($s[0]).",$out\n";
	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
	$code.="	lea	16($key),$key\n"	if ($i==0);

	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
	$code.="	mov	0($sbox,$out,8),$out\n";

	$code.="	shr	\$16,$tmp1\n";
	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
	$code.="	xor	3($sbox,$tmp0,8),$out\n";

	$code.="	movzb	".&lo($tmp1).",$tmp1\n";
	$code.="	shr	\$24,$tmp2\n";
	$code.="	xor	4*$i($key),$out\n";

	$code.="	xor	2($sbox,$tmp1,8),$out\n";
	$code.="	xor	1($sbox,$tmp2,8),$out\n";

	$code.="	mov	$t0,$s[1]\n"		if ($i==3);
	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
	$code.="	mov	$t2,$s[3]\n"		if ($i==3);
	$code.="\n";
}

sub enclast()
{ my ($i,@s)=@_;
  my $tmp0=$acc0;
  my $tmp1=$acc1;
  my $tmp2=$acc2;
  my $out=($t0,$t1,$t2,$s[0])[$i];

	if ($i==3) {
		$tmp0=$s[1];
		$tmp1=$s[2];
		$tmp2=$s[3];
	}
	$code.="	movzb	".&lo($s[0]).",$out\n";
	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);

	$code.="	mov	2($sbox,$out,8),$out\n";
	$code.="	shr	\$16,$tmp1\n";
	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);

	$code.="	and	\$0x000000ff,$out\n";
	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
	$code.="	movzb	".&lo($tmp1).",$tmp1\n";
	$code.="	shr	\$24,$tmp2\n";

	$code.="	mov	0($sbox,$tmp0,8),$tmp0\n";
	$code.="	mov	0($sbox,$tmp1,8),$tmp1\n";
	$code.="	mov	2($sbox,$tmp2,8),$tmp2\n";

	$code.="	and	\$0x0000ff00,$tmp0\n";
	$code.="	and	\$0x00ff0000,$tmp1\n";
	$code.="	and	\$0xff000000,$tmp2\n";

	$code.="	xor	$tmp0,$out\n";
	$code.="	mov	$t0,$s[1]\n"		if ($i==3);
	$code.="	xor	$tmp1,$out\n";
	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
	$code.="	xor	$tmp2,$out\n";
	$code.="	mov	$t2,$s[3]\n"		if ($i==3);
	$code.="\n";
}

$code.=<<___;
.type	_x86_64_AES_encrypt,\@abi-omnipotent
.align	16
_x86_64_AES_encrypt:
	xor	0($key),$s0			# xor with key
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3

	mov	240($key),$rnds			# load key->rounds
	sub	\$1,$rnds
	jmp	.Lenc_loop
.align	16
.Lenc_loop:
___
	if ($verticalspin) { &encvert(); }
	else {	&encstep(0,$s0,$s1,$s2,$s3);
		&encstep(1,$s1,$s2,$s3,$s0);
		&encstep(2,$s2,$s3,$s0,$s1);
		&encstep(3,$s3,$s0,$s1,$s2);
	}
$code.=<<___;
	sub	\$1,$rnds
	jnz	.Lenc_loop
___
	if ($verticalspin) { &enclastvert(); }
	else {	&enclast(0,$s0,$s1,$s2,$s3);
		&enclast(1,$s1,$s2,$s3,$s0);
		&enclast(2,$s2,$s3,$s0,$s1);
		&enclast(3,$s3,$s0,$s1,$s2);
		$code.=<<___;
		xor	16+0($key),$s0		# xor with key
		xor	16+4($key),$s1
		xor	16+8($key),$s2
		xor	16+12($key),$s3
___
	}
$code.=<<___;
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_encrypt,.-_x86_64_AES_encrypt
___

# it's possible to implement this by shifting tN by 8, filling least
# significant byte with byte load and finally bswap-ing at the end,
# but such partial register load kills Core 2...
sub enccompactvert()
{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");

$code.=<<___;
	movzb	`&lo("$s0")`,$t0
	movzb	`&lo("$s1")`,$t1
	movzb	`&lo("$s2")`,$t2
	movzb	($sbox,$t0,1),$t0
	movzb	($sbox,$t1,1),$t1
	movzb	($sbox,$t2,1),$t2

	movzb	`&lo("$s3")`,$t3
	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	movzb	($sbox,$t3,1),$t3
	movzb	($sbox,$acc0,1),$t4	#$t0
	movzb	($sbox,$acc1,1),$t5	#$t1

	movzb	`&hi("$s3")`,$acc2
	movzb	`&hi("$s0")`,$acc0
	shr	\$16,$s2
	movzb	($sbox,$acc2,1),$acc2	#$t2
	movzb	($sbox,$acc0,1),$acc0	#$t3
	shr	\$16,$s3

	movzb	`&lo("$s2")`,$acc1
	shl	\$8,$t4
	shl	\$8,$t5
	movzb	($sbox,$acc1,1),$acc1	#$t0
	xor	$t4,$t0
	xor	$t5,$t1

	movzb	`&lo("$s3")`,$t4
	shr	\$16,$s0
	shr	\$16,$s1
	movzb	`&lo("$s0")`,$t5
	shl	\$8,$acc2
	shl	\$8,$acc0
	movzb	($sbox,$t4,1),$t4	#$t1
	movzb	($sbox,$t5,1),$t5	#$t2
	xor	$acc2,$t2
	xor	$acc0,$t3

	movzb	`&lo("$s1")`,$acc2
	movzb	`&hi("$s3")`,$acc0
	shl	\$16,$acc1
	movzb	($sbox,$acc2,1),$acc2	#$t3
	movzb	($sbox,$acc0,1),$acc0	#$t0
	xor	$acc1,$t0

	movzb	`&hi("$s0")`,$acc1
	shr	\$8,$s2
	shr	\$8,$s1
	movzb	($sbox,$acc1,1),$acc1	#$t1
	movzb	($sbox,$s2,1),$s3	#$t3
	movzb	($sbox,$s1,1),$s2	#$t2
	shl	\$16,$t4
	shl	\$16,$t5
	shl	\$16,$acc2
	xor	$t4,$t1
	xor	$t5,$t2
	xor	$acc2,$t3

	shl	\$24,$acc0
	shl	\$24,$acc1
	shl	\$24,$s3
	xor	$acc0,$t0
	shl	\$24,$s2
	xor	$acc1,$t1
	mov	$t0,$s0
	mov	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}

sub enctransform_ref()
{ my $sn = shift;
  my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");

$code.=<<___;
	mov	$sn,$acc
	and	\$0x80808080,$acc
	mov	$acc,$tmp
	shr	\$7,$tmp
	lea	($sn,$sn),$r2
	sub	$tmp,$acc
	and	\$0xfefefefe,$r2
	and	\$0x1b1b1b1b,$acc
	mov	$sn,$tmp
	xor	$acc,$r2

	xor	$r2,$sn
	rol	\$24,$sn
	xor	$r2,$sn
	ror	\$16,$tmp
	xor	$tmp,$sn
	ror	\$8,$tmp
	xor	$tmp,$sn
___
}

# unlike decrypt case it does not pay off to parallelize enctransform
sub enctransform()
{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");

$code.=<<___;
	mov	$s0,$acc0
	mov	$s1,$acc1
	and	\$0x80808080,$acc0
	and	\$0x80808080,$acc1
	mov	$acc0,$t0
	mov	$acc1,$t1
	shr	\$7,$t0
	lea	($s0,$s0),$r20
	shr	\$7,$t1
	lea	($s1,$s1),$r21
	sub	$t0,$acc0
	sub	$t1,$acc1
	and	\$0xfefefefe,$r20
	and	\$0xfefefefe,$r21
	and	\$0x1b1b1b1b,$acc0
	and	\$0x1b1b1b1b,$acc1
	mov	$s0,$t0
	mov	$s1,$t1
	xor	$acc0,$r20
	xor	$acc1,$r21

	xor	$r20,$s0
	xor	$r21,$s1
	 mov	$s2,$acc0
	 mov	$s3,$acc1
	rol	\$24,$s0
	rol	\$24,$s1
	 and	\$0x80808080,$acc0
	 and	\$0x80808080,$acc1
	xor	$r20,$s0
	xor	$r21,$s1
	 mov	$acc0,$t2
	 mov	$acc1,$t3
	ror	\$16,$t0
	ror	\$16,$t1
	 shr	\$7,$t2
	 lea	($s2,$s2),$r20
	xor	$t0,$s0
	xor	$t1,$s1
	 shr	\$7,$t3
	 lea	($s3,$s3),$r21
	ror	\$8,$t0
	ror	\$8,$t1
	 sub	$t2,$acc0
	 sub	$t3,$acc1
	xor	$t0,$s0
	xor	$t1,$s1

	and	\$0xfefefefe,$r20
	and	\$0xfefefefe,$r21
	and	\$0x1b1b1b1b,$acc0
	and	\$0x1b1b1b1b,$acc1
	mov	$s2,$t2
	mov	$s3,$t3
	xor	$acc0,$r20
	xor	$acc1,$r21

	xor	$r20,$s2
	xor	$r21,$s3
	rol	\$24,$s2
	rol	\$24,$s3
	xor	$r20,$s2
	xor	$r21,$s3
	mov	0($sbox),$acc0			# prefetch Te4
	ror	\$16,$t2
	ror	\$16,$t3
	mov	64($sbox),$acc1
	xor	$t2,$s2
	xor	$t3,$s3
	mov	128($sbox),$r20
	ror	\$8,$t2
	ror	\$8,$t3
	mov	192($sbox),$r21
	xor	$t2,$s2
	xor	$t3,$s3
___
}

$code.=<<___;
.type	_x86_64_AES_encrypt_compact,\@abi-omnipotent
.align	16
_x86_64_AES_encrypt_compact:
	lea	128($sbox),$inp			# size optimization
	mov	0-128($inp),$acc1		# prefetch Te4
	mov	32-128($inp),$acc2
	mov	64-128($inp),$t0
	mov	96-128($inp),$t1
	mov	128-128($inp),$acc1
	mov	160-128($inp),$acc2
	mov	192-128($inp),$t0
	mov	224-128($inp),$t1
	jmp	.Lenc_loop_compact
.align	16
.Lenc_loop_compact:
		xor	0($key),$s0		# xor with key
		xor	4($key),$s1
		xor	8($key),$s2
		xor	12($key),$s3
		lea	16($key),$key
___
		&enccompactvert();
$code.=<<___;
		cmp	16(%rsp),$key
		je	.Lenc_compact_done
___
		&enctransform();
$code.=<<___;
	jmp	.Lenc_loop_compact
.align	16
.Lenc_compact_done:
	xor	0($key),$s0
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
___

# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
$code.=<<___;
.globl	AES_encrypt
.type	AES_encrypt,\@function,3
.align	16
AES_encrypt:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	# allocate frame "above" key schedule
	mov	%rsp,%rax
	mov	%rdx,$key
	lea	-63(%rdx),%rcx
	and	\$-64,%rsp
	sub	%rsp,%rcx
	neg	%rcx
	and	\$0x3c0,%rcx
	sub	%rcx,%rsp

	push	%rax		# save real stack pointer
	push	%rsi		# save out

	mov	240($key),$rnds	# load rounds

	mov	0(%rdi),$s0	# load input vector
	mov	4(%rdi),$s1
	mov	8(%rdi),$s2
	mov	12(%rdi),$s3

	shl	\$4,$rnds
	lea	($key,$rnds),%rbp
	push	%rbp
	push	$key

	# pick Te4 copy which can't "overlap" with stack frame or key schedule
	.picmeup	$sbox
	lea	AES_Te+2048-.($sbox),$sbox
	lea	768(%rsp),%rbp
	sub	$sbox,%rbp
	and	\$0x300,%rbp
	lea	($sbox,%rbp),$sbox

	call	_x86_64_AES_encrypt_compact

	mov	16(%rsp),$out	# restore out
	mov	24(%rsp),%rsp
	mov	$s0,0($out)	# write output vector
	mov	$s1,4($out)
	mov	$s2,8($out)
	mov	$s3,12($out)

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
.size	AES_encrypt,.-AES_encrypt
___

#------------------------------------------------------------------#

sub decvert()
{ my $t3="%r8d";	# zaps $inp!

$code.=<<___;
	# favor 3-way issue Opteron pipeline...
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	mov	0($sbox,$acc0,8),$t0
	mov	0($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t2

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	movzb	`&lo("$s3")`,$acc2
	xor	3($sbox,$acc0,8),$t0
	xor	3($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t3

	movzb	`&hi("$s1")`,$acc0
	shr	\$16,$s0
	movzb	`&hi("$s2")`,$acc2
	xor	3($sbox,$acc0,8),$t2
	shr	\$16,$s3
	xor	3($sbox,$acc2,8),$t3

	shr	\$16,$s1
	lea	16($key),$key
	shr	\$16,$s2

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	xor	2($sbox,$acc0,8),$t0
	xor	2($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t2

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	movzb	`&lo("$s1")`,$acc2
	xor	1($sbox,$acc0,8),$t0
	xor	1($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t3

	movzb	`&hi("$s3")`,$acc0
	mov	12($key),$s3
	movzb	`&hi("$s0")`,$acc2
	xor	1($sbox,$acc0,8),$t2
	mov	0($key),$s0
	xor	1($sbox,$acc2,8),$t3

	xor	$t0,$s0
	mov	4($key),$s1
	mov	8($key),$s2
	xor	$t2,$s2
	xor	$t1,$s1
	xor	$t3,$s3
___
}

sub declastvert()
{ my $t3="%r8d";	# zaps $inp!

$code.=<<___;
	lea	2048($sbox),$sbox	# size optimization
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	movzb	($sbox,$acc0,1),$t0
	movzb	($sbox,$acc1,1),$t1
	movzb	($sbox,$acc2,1),$t2

	movzb	`&lo("$s3")`,$acc0
	movzb	`&hi("$s3")`,$acc1
	movzb	`&hi("$s0")`,$acc2
	movzb	($sbox,$acc0,1),$t3
	movzb	($sbox,$acc1,1),$acc1	#$t0
	movzb	($sbox,$acc2,1),$acc2	#$t1

	shl	\$8,$acc1
	shl	\$8,$acc2

	xor	$acc1,$t0
	xor	$acc2,$t1
	shr	\$16,$s3

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	shr	\$16,$s0
	movzb	($sbox,$acc0,1),$acc0	#$t2
	movzb	($sbox,$acc1,1),$acc1	#$t3

	shl	\$8,$acc0
	shl	\$8,$acc1
	shr	\$16,$s1
	xor	$acc0,$t2
	xor	$acc1,$t3
	shr	\$16,$s2

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	movzb	($sbox,$acc0,1),$acc0	#$t0
	movzb	($sbox,$acc1,1),$acc1	#$t1
	movzb	($sbox,$acc2,1),$acc2	#$t2

	shl	\$16,$acc0
	shl	\$16,$acc1
	shl	\$16,$acc2

	xor	$acc0,$t0
	xor	$acc1,$t1
	xor	$acc2,$t2

	movzb	`&lo("$s1")`,$acc0
	movzb	`&hi("$s1")`,$acc1
	movzb	`&hi("$s2")`,$acc2
	movzb	($sbox,$acc0,1),$acc0	#$t3
	movzb	($sbox,$acc1,1),$acc1	#$t0
	movzb	($sbox,$acc2,1),$acc2	#$t1

	shl	\$16,$acc0
	shl	\$24,$acc1
	shl	\$24,$acc2

	xor	$acc0,$t3
	xor	$acc1,$t0
	xor	$acc2,$t1

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	mov	16+12($key),$s3
	movzb	($sbox,$acc0,1),$acc0	#$t2
	movzb	($sbox,$acc1,1),$acc1	#$t3
	mov	16+0($key),$s0

	shl	\$24,$acc0
	shl	\$24,$acc1

	xor	$acc0,$t2
	xor	$acc1,$t3

	mov	16+4($key),$s1
	mov	16+8($key),$s2
	lea	-2048($sbox),$sbox
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}

sub decstep()
{ my ($i,@s) = @_;
  my $tmp0=$acc0;
  my $tmp1=$acc1;
  my $tmp2=$acc2;
  my $out=($t0,$t1,$t2,$s[0])[$i];

	$code.="	mov	$s[0],$out\n"		if ($i!=3);
			$tmp1=$s[2]			if ($i==3);
	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
	$code.="	and	\$0xFF,$out\n";

	$code.="	mov	0($sbox,$out,8),$out\n";
	$code.="	shr	\$16,$tmp1\n";
			$tmp2=$s[3]			if ($i==3);
	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);

			$tmp0=$s[1]			if ($i==3);
	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
	$code.="	and	\$0xFF,$tmp1\n";
	$code.="	shr	\$24,$tmp2\n";

	$code.="	xor	3($sbox,$tmp0,8),$out\n";
	$code.="	xor	2($sbox,$tmp1,8),$out\n";
	$code.="	xor	1($sbox,$tmp2,8),$out\n";

	$code.="	mov	$t2,$s[1]\n"		if ($i==3);
	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
	$code.="	mov	$t0,$s[3]\n"		if ($i==3);
	$code.="\n";
}

sub declast()
{ my ($i,@s)=@_;
  my $tmp0=$acc0;
  my $tmp1=$acc1;
  my $tmp2=$acc2;
  my $out=($t0,$t1,$t2,$s[0])[$i];

	$code.="	mov	$s[0],$out\n"		if ($i!=3);
			$tmp1=$s[2]			if ($i==3);
	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
	$code.="	and	\$0xFF,$out\n";

	$code.="	movzb	2048($sbox,$out,1),$out\n";
	$code.="	shr	\$16,$tmp1\n";
			$tmp2=$s[3]			if ($i==3);
	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);

			$tmp0=$s[1]			if ($i==3);
	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
	$code.="	and	\$0xFF,$tmp1\n";
	$code.="	shr	\$24,$tmp2\n";

	$code.="	movzb	2048($sbox,$tmp0,1),$tmp0\n";
	$code.="	movzb	2048($sbox,$tmp1,1),$tmp1\n";
	$code.="	movzb	2048($sbox,$tmp2,1),$tmp2\n";

	$code.="	shl	\$8,$tmp0\n";
	$code.="	shl	\$16,$tmp1\n";
	$code.="	shl	\$24,$tmp2\n";

	$code.="	xor	$tmp0,$out\n";
	$code.="	mov	$t2,$s[1]\n"		if ($i==3);
	$code.="	xor	$tmp1,$out\n";
	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
	$code.="	xor	$tmp2,$out\n";
	$code.="	mov	$t0,$s[3]\n"		if ($i==3);
	$code.="\n";
}

$code.=<<___;
.type	_x86_64_AES_decrypt,\@abi-omnipotent
.align	16
_x86_64_AES_decrypt:
	xor	0($key),$s0			# xor with key
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3

	mov	240($key),$rnds			# load key->rounds
	sub	\$1,$rnds
	jmp	.Ldec_loop
.align	16
.Ldec_loop:
___
	if ($verticalspin) { &decvert(); }
	else {	&decstep(0,$s0,$s3,$s2,$s1);
		&decstep(1,$s1,$s0,$s3,$s2);
		&decstep(2,$s2,$s1,$s0,$s3);
		&decstep(3,$s3,$s2,$s1,$s0);
		$code.=<<___;
		lea	16($key),$key
		xor	0($key),$s0			# xor with key
		xor	4($key),$s1
		xor	8($key),$s2
		xor	12($key),$s3
___
	}
$code.=<<___;
	sub	\$1,$rnds
	jnz	.Ldec_loop
___
	if ($verticalspin) { &declastvert(); }
	else {	&declast(0,$s0,$s3,$s2,$s1);
		&declast(1,$s1,$s0,$s3,$s2);
		&declast(2,$s2,$s1,$s0,$s3);
		&declast(3,$s3,$s2,$s1,$s0);
		$code.=<<___;
		xor	16+0($key),$s0			# xor with key
		xor	16+4($key),$s1
		xor	16+8($key),$s2
		xor	16+12($key),$s3
___
	}
$code.=<<___;
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_decrypt,.-_x86_64_AES_decrypt
___

sub deccompactvert()
{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");

$code.=<<___;
	movzb	`&lo("$s0")`,$t0
	movzb	`&lo("$s1")`,$t1
	movzb	`&lo("$s2")`,$t2
	movzb	($sbox,$t0,1),$t0
	movzb	($sbox,$t1,1),$t1
	movzb	($sbox,$t2,1),$t2

	movzb	`&lo("$s3")`,$t3
	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	movzb	($sbox,$t3,1),$t3
	movzb	($sbox,$acc0,1),$t4	#$t0
	movzb	($sbox,$acc1,1),$t5	#$t1

	movzb	`&hi("$s1")`,$acc2
	movzb	`&hi("$s2")`,$acc0
	shr	\$16,$s2
	movzb	($sbox,$acc2,1),$acc2	#$t2
	movzb	($sbox,$acc0,1),$acc0	#$t3
	shr	\$16,$s3

	movzb	`&lo("$s2")`,$acc1
	shl	\$8,$t4
	shl	\$8,$t5
	movzb	($sbox,$acc1,1),$acc1	#$t0
	xor	$t4,$t0
	xor	$t5,$t1

	movzb	`&lo("$s3")`,$t4
	shr	\$16,$s0
	shr	\$16,$s1
	movzb	`&lo("$s0")`,$t5
	shl	\$8,$acc2
	shl	\$8,$acc0
	movzb	($sbox,$t4,1),$t4	#$t1
	movzb	($sbox,$t5,1),$t5	#$t2
	xor	$acc2,$t2
	xor	$acc0,$t3

	movzb	`&lo("$s1")`,$acc2
	movzb	`&hi("$s1")`,$acc0
	shl	\$16,$acc1
	movzb	($sbox,$acc2,1),$acc2	#$t3
	movzb	($sbox,$acc0,1),$acc0	#$t0
	xor	$acc1,$t0

	movzb	`&hi("$s2")`,$acc1
	shl	\$16,$t4
	shl	\$16,$t5
	movzb	($sbox,$acc1,1),$s1	#$t1
	xor	$t4,$t1
	xor	$t5,$t2

	movzb	`&hi("$s3")`,$acc1
	shr	\$8,$s0
	shl	\$16,$acc2
	movzb	($sbox,$acc1,1),$s2	#$t2
	movzb	($sbox,$s0,1),$s3	#$t3
	xor	$acc2,$t3

	shl	\$24,$acc0
	shl	\$24,$s1
	shl	\$24,$s2
	xor	$acc0,$t0
	shl	\$24,$s3
	xor	$t1,$s1
	mov	$t0,$s0
	xor	$t2,$s2
	xor	$t3,$s3
___
}

# parallelized version! input is pair of 64-bit values: %rax=s1.s0
# and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1,
# %ecx=s2 and %edx=s3.
sub dectransform()
{ my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx");