Commit afbe674e authored by Andy Polyakov's avatar Andy Polyakov
Browse files

~15% better AES x86_64 assembler.

parent f42e6d24
Loading
Loading
Loading
Loading
+517 −173
Original line number Diff line number Diff line
@@ -6,9 +6,9 @@
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# Version 1.0.
# Version 1.1.
#
# aes-*-cbc benchmarks are improved by 50% [compared to gcc 3.3.2 on
# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
# [you'll notice a lot of resemblance], such as compressed S-boxes
# in little-endian byte order, prefetch of these tables in CBC mode,
@@ -18,14 +18,14 @@
# Performance in number of cycles per processed byte for 128-bit key:
#
#		ECB		CBC encrypt
# AMD64		15.6		14.6(*)
# EM64T		23.3(**)	21.4(*)
# AMD64		13.7		13.0(*)
# EM64T		20.2		18.6(*)
#
# (*)	CBC benchmarks are better than ECB thanks to custom ABI used
#	by the private block encryption function.
# (**)	This module exhibits virtually same ECB performance as 32-bit
#	counterpart on [current] Intel CPU.

$verticalspin=1;	# unlike 32-bit version $verticalspin performs
			# ~15% better on both AMD and Intel cores
$output=shift;
open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";

@@ -35,20 +35,22 @@ $s0="%eax";
$s1="%ebx";
$s2="%ecx";
$s3="%edx";
$inp="%rdi";
$out="%rsi";
$acc0="%ebp";
$acc1="%r8d";
$acc2="%r9d";
$acc0="%esi";
$acc1="%edi";
$acc2="%ebp";
$inp="%r8";
$out="%r9";
$t0="%r10d";
$t1="%r11d";
$t2="%r12d";
$cnt="%r13d";
$tbl="%r14";
$rnds="%r13d";
$sbox="%r14";
$key="%r15";

sub hi() { my $r=shift;	$r =~ s/%[er]([a-d])x/%\1h/;	$r; }
sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/; $r; }
sub lo() { my $r=shift;	$r =~ s/%[er]([a-d])x/%\1l/;
			$r =~ s/%[er]([sd]i)/%\1l/;
			$r =~ s/%(r[0-9]+)[d]?/%\1b/;	$r; }
sub _data_word()
{ my $i;
    while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
@@ -61,6 +63,169 @@ sub data_word()
    $code.=sprintf"0x%08x\n",$last;
}

sub encvert()
{ my $t3="%r8d";	# zaps $inp!
  my $qs0='"$s0"';
  my $qs1='"$s1"';
  my $qs2='"$s2"';
  my $qs3='"$s3"';

$code.=<<___;
	# favor 3-way issue Opteron pipeline...
	movzb	`&lo($qs0)`,$acc0
	movzb	`&lo($qs1)`,$acc1
	movzb	`&lo($qs2)`,$acc2
	mov	0($sbox,$acc0,8),$t0
	mov	0($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t2

	movzb	`&hi($qs1)`,$acc0
	movzb	`&hi($qs2)`,$acc1
	movzb	`&lo($qs3)`,$acc2
	xor	3($sbox,$acc0,8),$t0
	xor	3($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t3

	movzb	`&hi($qs3)`,$acc0
	shr	\$16,$s2
	movzb	`&hi($qs0)`,$acc2
	xor	3($sbox,$acc0,8),$t2
	shr	\$16,$s3
	xor	3($sbox,$acc2,8),$t3

	shr	\$16,$s1
	lea	16($key),$key
	shr	\$16,$s0

	movzb	`&lo($qs2)`,$acc0
	movzb	`&lo($qs3)`,$acc1
	movzb	`&lo($qs0)`,$acc2
	xor	2($sbox,$acc0,8),$t0
	xor	2($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t2

	movzb	`&hi($qs3)`,$acc0
	movzb	`&hi($qs0)`,$acc1
	movzb	`&lo($qs1)`,$acc2
	xor	1($sbox,$acc0,8),$t0
	xor	1($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t3

	mov	12($key),$s3
	movzb	`&hi($qs1)`,$acc1
	movzb	`&hi($qs2)`,$acc2
	mov	0($key),$s0
	xor	1($sbox,$acc1,8),$t2
	xor	1($sbox,$acc2,8),$t3

	mov	4($key),$s1
	mov	8($key),$s2
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}

sub enclastvert()
{ my $t3="%r8d";	# zaps $inp!
  my $qs0='"$s0"';
  my $qs1='"$s1"';
  my $qs2='"$s2"';
  my $qs3='"$s3"';

$code.=<<___;
	movzb	`&lo($qs0)`,$acc0
	movzb	`&lo($qs1)`,$acc1
	movzb	`&lo($qs2)`,$acc2
	mov	2($sbox,$acc0,8),$t0
	mov	2($sbox,$acc1,8),$t1
	mov	2($sbox,$acc2,8),$t2

	and	\$0x000000ff,$t0
	and	\$0x000000ff,$t1
	and	\$0x000000ff,$t2

	movzb	`&lo($qs3)`,$acc0
	movzb	`&hi($qs1)`,$acc1
	movzb	`&hi($qs2)`,$acc2
	mov	2($sbox,$acc0,8),$t3
	mov	0($sbox,$acc1,8),$acc1	#$t0
	mov	0($sbox,$acc2,8),$acc2	#$t1

	and	\$0x000000ff,$t3
	and	\$0x0000ff00,$acc1
	and	\$0x0000ff00,$acc2

	xor	$acc1,$t0
	xor	$acc2,$t1
	shr	\$16,$s2

	movzb	`&hi($qs3)`,$acc0
	movzb	`&hi($qs0)`,$acc1
	shr	\$16,$s3
	mov	0($sbox,$acc0,8),$acc0	#$t2
	mov	0($sbox,$acc1,8),$acc1	#$t3

	and	\$0x0000ff00,$acc0
	and	\$0x0000ff00,$acc1
	shr	\$16,$s1
	xor	$acc0,$t2
	xor	$acc1,$t3
	shr	\$16,$s0

	movzb	`&lo($qs2)`,$acc0
	movzb	`&lo($qs3)`,$acc1
	movzb	`&lo($qs0)`,$acc2
	mov	0($sbox,$acc0,8),$acc0	#$t0
	mov	0($sbox,$acc1,8),$acc1	#$t1
	mov	0($sbox,$acc2,8),$acc2	#$t2

	and	\$0x00ff0000,$acc0
	and	\$0x00ff0000,$acc1
	and	\$0x00ff0000,$acc2

	xor	$acc0,$t0
	xor	$acc1,$t1
	xor	$acc2,$t2

	movzb	`&lo($qs1)`,$acc0
	movzb	`&hi($qs3)`,$acc1
	movzb	`&hi($qs0)`,$acc2
	mov	0($sbox,$acc0,8),$acc0	#$t3
	mov	2($sbox,$acc1,8),$acc1	#$t0
	mov	2($sbox,$acc2,8),$acc2	#$t1

	and	\$0x00ff0000,$acc0
	and	\$0xff000000,$acc1
	and	\$0xff000000,$acc2

	xor	$acc0,$t3
	xor	$acc1,$t0
	xor	$acc2,$t1

	movzb	`&hi($qs1)`,$acc0
	movzb	`&hi($qs2)`,$acc1
	mov	16+12($key),$s3
	mov	2($sbox,$acc0,8),$acc0	#$t2
	mov	2($sbox,$acc1,8),$acc1	#$t3
	mov	16+0($key),$s0

	and	\$0xff000000,$acc0
	and	\$0xff000000,$acc1

	xor	$acc0,$t2
	xor	$acc1,$t3

	mov	16+4($key),$s1
	mov	16+8($key),$s2
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}

sub encstep()
{ my ($i,@s) = @_;
  my $tmp0=$acc0;
@@ -68,24 +233,28 @@ sub encstep()
  my $tmp2=$acc2;
  my $out=($t0,$t1,$t2,$s[0])[$i];

	$code.="	mov	$s[0],$out\n"		if ($i!=3);
			$tmp1=$s[2]			if ($i==3);
	if ($i==3) {
		$tmp0=$s[1];
		$tmp1=$s[2];
		$tmp2=$s[3];
	}
	$code.="	movzb	".&lo($s[0]).",$out\n";
	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
	$code.="	and	\$0xFF,$out\n";
	$code.="	lea	16($key),$key\n"	if ($i==0);

	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
	$code.="	mov	0($sbox,$out,8),$out\n";

	$code.="	mov	0($tbl,$out,8),$out\n";
	$code.="	shr	\$16,$tmp1\n";
			$tmp2=$s[3]			if ($i==3);
	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
	$code.="	xor	3($sbox,$tmp0,8),$out\n";

			$tmp0=$s[1]			if ($i==3);
	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
	$code.="	and	\$0xFF,$tmp1\n";
	$code.="	movzb	".&lo($tmp1).",$tmp1\n";
	$code.="	shr	\$24,$tmp2\n";
	$code.="	xor	4*$i($key),$out\n";

	$code.="	xor	3($tbl,$tmp0,8),$out\n";
	$code.="	xor	2($tbl,$tmp1,8),$out\n";
	$code.="	xor	1($tbl,$tmp2,8),$out\n";
	$code.="	xor	2($sbox,$tmp1,8),$out\n";
	$code.="	xor	1($sbox,$tmp2,8),$out\n";

	$code.="	mov	$t0,$s[1]\n"		if ($i==3);
	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
@@ -100,25 +269,26 @@ sub enclast()
  my $tmp2=$acc2;
  my $out=($t0,$t1,$t2,$s[0])[$i];

	$code.="	mov	$s[0],$out\n"		if ($i!=3);
			$tmp1=$s[2]			if ($i==3);
	if ($i==3) {
		$tmp0=$s[1];
		$tmp1=$s[2];
		$tmp2=$s[3];
	}
	$code.="	movzb	".&lo($s[0]).",$out\n";
	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
	$code.="	and	\$0xFF,$out\n";

	$code.="	mov	2($tbl,$out,8),$out\n";
	$code.="	mov	2($sbox,$out,8),$out\n";
	$code.="	shr	\$16,$tmp1\n";
			$tmp2=$s[3]			if ($i==3);
	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);

	$code.="	and	\$0x000000ff,$out\n";
			$tmp0=$s[1]			if ($i==3);
	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
	$code.="	and	\$0xFF,$tmp1\n";
	$code.="	movzb	".&lo($tmp1).",$tmp1\n";
	$code.="	shr	\$24,$tmp2\n";

	$code.="	mov	0($tbl,$tmp0,8),$tmp0\n";
	$code.="	mov	0($tbl,$tmp1,8),$tmp1\n";
	$code.="	mov	2($tbl,$tmp2,8),$tmp2\n";
	$code.="	mov	0($sbox,$tmp0,8),$tmp0\n";
	$code.="	mov	0($sbox,$tmp1,8),$tmp1\n";
	$code.="	mov	2($sbox,$tmp2,8),$tmp2\n";

	$code.="	and	\$0x0000ff00,$tmp0\n";
	$code.="	and	\$0x00ff0000,$tmp1\n";
@@ -142,36 +312,35 @@ _x86_64_AES_encrypt:
	xor	8($key),$s2
	xor	12($key),$s3

	mov	240($key),$cnt			# load key->rounds
	sub	\$1,$cnt
.align	4
	mov	240($key),$rnds			# load key->rounds
	sub	\$1,$rnds
	jmp	.Lenc_loop
.align	16
.Lenc_loop:
___
	&encstep(0,$s0,$s1,$s2,$s3);
	if ($verticalspin) { &encvert(); }
	else {	&encstep(0,$s0,$s1,$s2,$s3);
		&encstep(1,$s1,$s2,$s3,$s0);
		&encstep(2,$s2,$s3,$s0,$s1);
		&encstep(3,$s3,$s0,$s1,$s2);
	}
$code.=<<___;
	lea	16($key),$key
	xor	0($key),$s0			# xor with key
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3

	sub	\$1,$cnt
	sub	\$1,$rnds
	jnz	.Lenc_loop
___
	&enclast(0,$s0,$s1,$s2,$s3);
	if ($verticalspin) { &enclastvert(); }
	else {	&enclast(0,$s0,$s1,$s2,$s3);
		&enclast(1,$s1,$s2,$s3,$s0);
		&enclast(2,$s2,$s3,$s0,$s1);
		&enclast(3,$s3,$s0,$s1,$s2);
		$code.=<<___;
	lea	16($key),$key
	xor	0($key),$s0			# xor with key
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3

		xor	16+0($key),$s0		# xor with key
		xor	16+4($key),$s1
		xor	16+8($key),$s2
		xor	16+12($key),$s3
___
	}
$code.=<<___;
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_encrypt,.-_x86_64_AES_encrypt
___
@@ -190,9 +359,11 @@ AES_encrypt:
	push	%r15

	mov	%rdx,$key
	mov	%rdi,$inp
	mov	%rsi,$out

	.picmeup	$tbl
	lea	AES_Te-.($tbl),$tbl
	.picmeup	$sbox
	lea	AES_Te-.($sbox),$sbox

	mov	0($inp),$s0
	mov	4($inp),$s1
@@ -218,6 +389,169 @@ ___

#------------------------------------------------------------------#

sub decvert()
{ my $t3="%r8d";	# zaps $inp!
  my $qs0='"$s0"';
  my $qs1='"$s1"';
  my $qs2='"$s2"';
  my $qs3='"$s3"';

$code.=<<___;
	# favor 3-way issue Opteron pipeline...
	movzb	`&lo($qs0)`,$acc0
	movzb	`&lo($qs1)`,$acc1
	movzb	`&lo($qs2)`,$acc2
	mov	0($sbox,$acc0,8),$t0
	mov	0($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t2

	movzb	`&hi($qs3)`,$acc0
	movzb	`&hi($qs0)`,$acc1
	movzb	`&lo($qs3)`,$acc2
	xor	3($sbox,$acc0,8),$t0
	xor	3($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t3

	movzb	`&hi($qs1)`,$acc0
	shr	\$16,$s0
	movzb	`&hi($qs2)`,$acc2
	xor	3($sbox,$acc0,8),$t2
	shr	\$16,$s3
	xor	3($sbox,$acc2,8),$t3

	shr	\$16,$s1
	lea	16($key),$key
	shr	\$16,$s2

	movzb	`&lo($qs2)`,$acc0
	movzb	`&lo($qs3)`,$acc1
	movzb	`&lo($qs0)`,$acc2
	xor	2($sbox,$acc0,8),$t0
	xor	2($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t2

	movzb	`&hi($qs1)`,$acc0
	movzb	`&hi($qs2)`,$acc1
	movzb	`&lo($qs1)`,$acc2
	xor	1($sbox,$acc0,8),$t0
	xor	1($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t3

	movzb	`&hi($qs3)`,$acc0
	mov	12($key),$s3
	movzb	`&hi($qs0)`,$acc2
	xor	1($sbox,$acc0,8),$t2
	mov	0($key),$s0
	xor	1($sbox,$acc2,8),$t3

	xor	$t0,$s0
	mov	4($key),$s1
	mov	8($key),$s2
	xor	$t2,$s2
	xor	$t1,$s1
	xor	$t3,$s3
___
}

sub declastvert()
{ my $t3="%r8d";	# zaps $inp!
  my $qs0='"$s0"';
  my $qs1='"$s1"';
  my $qs2='"$s2"';
  my $qs3='"$s3"';

$code.=<<___;
	movzb	`&lo($qs0)`,$acc0
	movzb	`&lo($qs1)`,$acc1
	movzb	`&lo($qs2)`,$acc2
	mov	2048($sbox,$acc0,4),$t0
	mov	2048($sbox,$acc1,4),$t1
	mov	2048($sbox,$acc2,4),$t2

	and	\$0x000000ff,$t0
	and	\$0x000000ff,$t1
	and	\$0x000000ff,$t2

	movzb	`&lo($qs3)`,$acc0
	movzb	`&hi($qs3)`,$acc1
	movzb	`&hi($qs0)`,$acc2
	mov	2048($sbox,$acc0,4),$t3
	mov	2048($sbox,$acc1,4),$acc1	#$t0
	mov	2048($sbox,$acc2,4),$acc2	#$t1

	and	\$0x000000ff,$t3
	and	\$0x0000ff00,$acc1
	and	\$0x0000ff00,$acc2

	xor	$acc1,$t0
	xor	$acc2,$t1
	shr	\$16,$s3

	movzb	`&hi($qs1)`,$acc0
	movzb	`&hi($qs2)`,$acc1
	shr	\$16,$s0
	mov	2048($sbox,$acc0,4),$acc0	#$t2
	mov	2048($sbox,$acc1,4),$acc1	#$t3

	and	\$0x0000ff00,$acc0
	and	\$0x0000ff00,$acc1
	shr	\$16,$s1
	xor	$acc0,$t2
	xor	$acc1,$t3
	shr	\$16,$s2

	movzb	`&lo($qs2)`,$acc0
	movzb	`&lo($qs3)`,$acc1
	movzb	`&lo($qs0)`,$acc2
	mov	2048($sbox,$acc0,4),$acc0	#$t0
	mov	2048($sbox,$acc1,4),$acc1	#$t1
	mov	2048($sbox,$acc2,4),$acc2	#$t2

	and	\$0x00ff0000,$acc0
	and	\$0x00ff0000,$acc1
	and	\$0x00ff0000,$acc2

	xor	$acc0,$t0
	xor	$acc1,$t1
	xor	$acc2,$t2

	movzb	`&lo($qs1)`,$acc0
	movzb	`&hi($qs1)`,$acc1
	movzb	`&hi($qs2)`,$acc2
	mov	2048($sbox,$acc0,4),$acc0	#$t3
	mov	2048($sbox,$acc1,4),$acc1	#$t0
	mov	2048($sbox,$acc2,4),$acc2	#$t1

	and	\$0x00ff0000,$acc0
	and	\$0xff000000,$acc1
	and	\$0xff000000,$acc2

	xor	$acc0,$t3
	xor	$acc1,$t0
	xor	$acc2,$t1

	movzb	`&hi($qs3)`,$acc0
	movzb	`&hi($qs0)`,$acc1
	mov	16+12($key),$s3
	mov	2048($sbox,$acc0,4),$acc0	#$t2
	mov	2048($sbox,$acc1,4),$acc1	#$t3
	mov	16+0($key),$s0

	and	\$0xff000000,$acc0
	and	\$0xff000000,$acc1

	xor	$acc0,$t2
	xor	$acc1,$t3

	mov	16+4($key),$s1
	mov	16+8($key),$s2
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}

sub decstep()
{ my ($i,@s) = @_;
  my $tmp0=$acc0;
@@ -230,7 +564,7 @@ sub decstep()
	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
	$code.="	and	\$0xFF,$out\n";

	$code.="	mov	0($tbl,$out,8),$out\n";
	$code.="	mov	0($sbox,$out,8),$out\n";
	$code.="	shr	\$16,$tmp1\n";
			$tmp2=$s[3]			if ($i==3);
	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
@@ -240,9 +574,9 @@ sub decstep()
	$code.="	and	\$0xFF,$tmp1\n";
	$code.="	shr	\$24,$tmp2\n";

	$code.="	xor	3($tbl,$tmp0,8),$out\n";
	$code.="	xor	2($tbl,$tmp1,8),$out\n";
	$code.="	xor	1($tbl,$tmp2,8),$out\n";
	$code.="	xor	3($sbox,$tmp0,8),$out\n";
	$code.="	xor	2($sbox,$tmp1,8),$out\n";
	$code.="	xor	1($sbox,$tmp2,8),$out\n";

	$code.="	mov	$t2,$s[1]\n"		if ($i==3);
	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
@@ -262,7 +596,7 @@ sub declast()
	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
	$code.="	and	\$0xFF,$out\n";

	$code.="	mov	2048($tbl,$out,4),$out\n";
	$code.="	mov	2048($sbox,$out,4),$out\n";
	$code.="	shr	\$16,$tmp1\n";
			$tmp2=$s[3]			if ($i==3);
	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
@@ -273,9 +607,9 @@ sub declast()
	$code.="	and	\$0xFF,$tmp1\n";
	$code.="	shr	\$24,$tmp2\n";

	$code.="	mov	2048($tbl,$tmp0,4),$tmp0\n";
	$code.="	mov	2048($tbl,$tmp1,4),$tmp1\n";
	$code.="	mov	2048($tbl,$tmp2,4),$tmp2\n";
	$code.="	mov	2048($sbox,$tmp0,4),$tmp0\n";
	$code.="	mov	2048($sbox,$tmp1,4),$tmp1\n";
	$code.="	mov	2048($sbox,$tmp2,4),$tmp2\n";

	$code.="	and	\$0x0000ff00,$tmp0\n";
	$code.="	and	\$0x00ff0000,$tmp1\n";
@@ -299,12 +633,14 @@ _x86_64_AES_decrypt:
	xor	8($key),$s2
	xor	12($key),$s3

	mov	240($key),$cnt			# load key->rounds
	sub	\$1,$cnt
.align	4
	mov	240($key),$rnds			# load key->rounds
	sub	\$1,$rnds
	jmp	.Ldec_loop
.align	16
.Ldec_loop:
___
	&decstep(0,$s0,$s3,$s2,$s1);
	if ($verticalspin) { &decvert(); }
	else {	&decstep(0,$s0,$s3,$s2,$s1);
		&decstep(1,$s1,$s0,$s3,$s2);
		&decstep(2,$s2,$s1,$s0,$s3);
		&decstep(3,$s3,$s2,$s1,$s0);
@@ -314,21 +650,25 @@ $code.=<<___;
		xor	4($key),$s1
		xor	8($key),$s2
		xor	12($key),$s3

	sub	\$1,$cnt
___
	}
$code.=<<___;
	sub	\$1,$rnds
	jnz	.Ldec_loop
___
	&declast(0,$s0,$s3,$s2,$s1);
	if ($verticalspin) { &declastvert(); }
	else {	&declast(0,$s0,$s3,$s2,$s1);
		&declast(1,$s1,$s0,$s3,$s2);
		&declast(2,$s2,$s1,$s0,$s3);
		&declast(3,$s3,$s2,$s1,$s0);
		$code.=<<___;
	lea	16($key),$key
	xor	0($key),$s0			# xor with key
		xor	16+0($key),$s0			# xor with key
		xor	4($key),$s1
		xor	8($key),$s2
		xor	12($key),$s3

___
	}
$code.=<<___;
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_decrypt,.-_x86_64_AES_decrypt
___
@@ -347,9 +687,11 @@ AES_decrypt:
	push	%r15

	mov	%rdx,$key
	mov	%rdi,$inp
	mov	%rsi,$out

	.picmeup	$tbl
	lea	AES_Td-.($tbl),$tbl
	.picmeup	$sbox
	lea	AES_Td-.($sbox),$sbox

	mov	0($inp),$s0
	mov	4($inp),$s1
@@ -719,24 +1061,24 @@ AES_cbc_encrypt:
	pushfq
	cld

	.picmeup $tbl
	.picmeup $sbox
.Lcbc_pic_point:

	cmp	\$0,%r9
	je	.LDECRYPT

	lea	AES_Te-.Lcbc_pic_point($tbl),$tbl
	lea	AES_Te-.Lcbc_pic_point($sbox),$sbox

	# allocate aligned stack frame...
	lea	-64-248(%rsp),$key
	and	\$-64,$key

	# ... and make it doesn't alias with AES_Te modulo 4096
	mov	$tbl,%r10
	lea	2048($tbl),%r11
	mov	$sbox,%r10
	lea	2048($sbox),%r11
	mov	$key,%r12
	and	\$0xFFF,%r10	# s = $tbl&0xfff
	and	\$0xFFF,%r11	# e = ($tbl+2048)&0xfff
	and	\$0xFFF,%r10	# s = $sbox&0xfff
	and	\$0xFFF,%r11	# e = ($sbox+2048)&0xfff
	and	\$0xFFF,%r12	# p = %rsp&0xfff

	cmp	%r11,%r12	# if (p=>e) %rsp =- (p-e);
@@ -758,13 +1100,15 @@ AES_cbc_encrypt:
	mov	%rdx,$_len	# save copy of len
	mov	%rcx,$_key	# save copy of key
	mov	%r8,$_ivp	# save copy of ivp

	movl	\$0,$mark	# copy of aes_key->rounds = 0;
	mov	%r8,%rbp	# rearrange input arguments
	mov	%rsi,$out
	mov	%rdi,$inp
	mov	%rcx,$key

	# do we copy key schedule to stack?
	mov	$key,%r10
	sub	$tbl,%r10
	sub	$sbox,%r10
	and	\$0xfff,%r10
	cmp	\$2048,%r10
	jb	.Lcbc_do_ecopy
@@ -772,8 +1116,6 @@ AES_cbc_encrypt:
	jb	.Lcbc_skip_ecopy
.align	4
.Lcbc_do_ecopy:
		mov	%rsi,%r10	# backup $inp,$out
		mov	%rdi,%r11
		mov	$key,%rsi
		lea	$aes_key,%rdi
		lea	$aes_key,$key
@@ -781,29 +1123,27 @@ AES_cbc_encrypt:
		.long	0x90A548F3	# rep movsq
		mov	(%rsi),%eax	# copy aes_key->rounds
		mov	%eax,(%rdi)
		mov	%r10,%rsi	# restore $inp,$out
		mov	%r11,%rdi
.Lcbc_skip_ecopy:
	mov	$key,$keyp	# save key pointer

	mov	\$16,%ecx
.align	4
.Lcbc_prefetch_te:
		mov	0($tbl),%r10
		mov	32($tbl),%r11
		mov	64($tbl),%r12
		mov	96($tbl),%r13
		lea	128($tbl),$tbl
		mov	0($sbox),%r10
		mov	32($sbox),%r11
		mov	64($sbox),%r12
		mov	96($sbox),%r13
		lea	128($sbox),$sbox
		sub	\$1,%ecx
	jnz	.Lcbc_prefetch_te
	sub	\$2048,$tbl
	sub	\$2048,$sbox

	test	\$-16,%rdx		# check upon length
	mov	%rdx,%r10
	mov	0(%r8),$s0		# load iv
	mov	4(%r8),$s1
	mov	8(%r8),$s2
	mov	12(%r8),$s3
	mov	0(%rbp),$s0		# load iv
	mov	4(%rbp),$s1
	mov	8(%rbp),$s2
	mov	12(%rbp),$s3
	jz	.Lcbc_enc_tail		# short input...

.align	4
@@ -812,10 +1152,12 @@ AES_cbc_encrypt:
		xor	4($inp),$s1
		xor	8($inp),$s2
		xor	12($inp),$s3
		mov	$inp,$ivec	# if ($verticalspin) save inp

		mov	$keyp,$key	# restore key
		call	_x86_64_AES_encrypt

		mov	$ivec,$inp	# if ($verticalspin) restore inp
		mov	$s0,0($out)
		mov	$s1,4($out)
		mov	$s2,8($out)
@@ -830,11 +1172,11 @@ AES_cbc_encrypt:
	jnz	.Lcbc_enc_loop
	test	\$15,%r10
	jnz	.Lcbc_enc_tail
	mov	$_ivp,%r10	# restore ivp
	mov	$s0,0(%r10)	# save ivec
	mov	$s1,4(%r10)
	mov	$s2,8(%r10)
	mov	$s3,12(%r10)
	mov	$_ivp,%rbp	# restore ivp
	mov	$s0,0(%rbp)	# save ivec
	mov	$s1,4(%rbp)
	mov	$s2,8(%rbp)
	mov	$s3,12(%rbp)

.align	4
.Lcbc_cleanup:
@@ -858,36 +1200,34 @@ AES_cbc_encrypt:
.align	4
.Lcbc_enc_tail:
	cmp	$inp,$out
	mov	$inp,%r11
	mov	$out,%r12
	je	.Lcbc_enc_in_place
	mov	%r10,%rcx
	xchg	%rsi,%rdi
	mov	$inp,%rsi
	mov	$out,%rdi
	.long	0xF689A4F3		# rep movsb
.Lcbc_enc_in_place:
	mov	\$16,%rcx		# zero tail
	sub	%r10,%rcx
	xor	%rax,%rax
	.long	0xF689AAF3		# rep stosb
	mov	%r12,$inp		# this is not a mistake!
	mov	%r12,$out
	mov	$out,$inp		# this is not a mistake!
	movq	\$16,$_len		# len=16
	jmp	.Lcbc_enc_loop		# one more spin...
#----------------------------- DECRYPT -----------------------------#
.align	16
.LDECRYPT:
	lea	AES_Td-.Lcbc_pic_point($tbl),$tbl
	lea	AES_Td-.Lcbc_pic_point($sbox),$sbox

	# allocate aligned stack frame...
	lea	-64-248(%rsp),$key
	and	\$-64,$key

	# ... and make it doesn't alias with AES_Td modulo 4096
	mov	$tbl,%r10
	lea	3072($tbl),%r11
	mov	$sbox,%r10
	lea	3072($sbox),%r11
	mov	$key,%r12
	and	\$0xFFF,%r10	# s = $tbl&0xfff
	and	\$0xFFF,%r11	# e = ($tbl+2048)&0xfff
	and	\$0xFFF,%r10	# s = $sbox&0xfff
	and	\$0xFFF,%r11	# e = ($sbox+2048)&0xfff
	and	\$0xFFF,%r12	# p = %rsp&0xfff

	cmp	%r11,%r12	# if (p=>e) %rsp =- (p-e);
@@ -909,13 +1249,15 @@ AES_cbc_encrypt:
	mov	%rdx,$_len	# save copy of len
	mov	%rcx,$_key	# save copy of key
	mov	%r8,$_ivp	# save copy of ivp

	movl	\$0,$mark	# copy of aes_key->rounds = 0;
	mov	%r8,%rbp	# rearrange input arguments
	mov	%rsi,$out
	mov	%rdi,$inp
	mov	%rcx,$key

	# do we copy key schedule to stack?
	mov	$key,%r10
	sub	$tbl,%r10
	sub	$sbox,%r10
	and	\$0xfff,%r10
	cmp	\$3072,%r10
	jb	.Lcbc_do_dcopy
@@ -923,8 +1265,6 @@ AES_cbc_encrypt:
	jb	.Lcbc_skip_dcopy
.align	4
.Lcbc_do_dcopy:
		mov	%rsi,%r10	# backup $inp,$out
		mov	%rdi,%r11
		mov	$key,%rsi
		lea	$aes_key,%rdi
		lea	$aes_key,$key
@@ -932,49 +1272,49 @@ AES_cbc_encrypt:
		.long	0x90A548F3	# rep movsq
		mov	(%rsi),%eax	# copy aes_key->rounds
		mov	%eax,(%rdi)
		mov	%r10,%rsi	# restore $inp,$out
		mov	%r11,%rdi
.Lcbc_skip_dcopy:
	mov	$key,$keyp	# save key pointer

	mov	\$24,%ecx
.align	4
.Lcbc_prefetch_td:
		mov	0($tbl),%r10
		mov	32($tbl),%r11
		mov	64($tbl),%r12
		mov	96($tbl),%r13
		lea	128($tbl),$tbl
		mov	0($sbox),%r10
		mov	32($sbox),%r11
		mov	64($sbox),%r12
		mov	96($sbox),%r13
		lea	128($sbox),$sbox
		sub	\$1,%ecx
	jnz	.Lcbc_prefetch_td
	sub	\$3072,$tbl
	sub	\$3072,$sbox

	cmp	$inp,$out
	je	.Lcbc_dec_in_place

	mov	%r8,$ivec
	mov	%rbp,$ivec
.align	4
.Lcbc_dec_loop:
		mov	0($inp),$s0		# read input
		mov	4($inp),$s1
		mov	8($inp),$s2
		mov	12($inp),$s3
		mov	$inp,8+$ivec	# if ($verticalspin) save inp

		mov	$keyp,$key		# load key
		mov	$keyp,$key	# restore key
		call	_x86_64_AES_decrypt

		mov	$ivec,%r8		# load ivp
		xor	0(%r8),$s0		# xor iv
		xor	4(%r8),$s1
		xor	8(%r8),$s2
		xor	12(%r8),$s3
		mov	$inp,%r8		# current input, next iv
		mov	$ivec,%rbp	# load ivp
		mov	8+$ivec,$inp	# if ($verticalspin) restore inp
		xor	0(%rbp),$s0	# xor iv
		xor	4(%rbp),$s1
		xor	8(%rbp),$s2
		xor	12(%rbp),$s3
		mov	$inp,%rbp	# current input, next iv

		mov	$_len,%r10	# load len
		sub	\$16,%r10
		jc	.Lcbc_dec_partial
		mov	%r10,$_len	# update len
		mov	%r8,$ivec		# update ivp
		mov	%rbp,$ivec	# update ivp

		mov	$s0,0($out)	# write output
		mov	$s1,4($out)
@@ -985,11 +1325,11 @@ AES_cbc_encrypt:
		lea	16($out),$out
	jnz	.Lcbc_dec_loop
.Lcbc_dec_end:
	mov	$_ivp,%r9		# load user ivp
	mov	0(%r8),%r10		# load iv
	mov	8(%r8),%r11
	mov	%r10,0(%r9)		# copy back to user
	mov	%r11,8(%r9)
	mov	$_ivp,%r12		# load user ivp
	mov	0(%rbp),%r10		# load iv
	mov	8(%rbp),%r11
	mov	%r10,0(%r12)		# copy back to user
	mov	%r11,8(%r12)
	jmp	.Lcbc_cleanup

.align	4
@@ -1012,19 +1352,21 @@ AES_cbc_encrypt:
		mov	8($inp),$s2
		mov	12($inp),$s3

		mov	$inp,$ivec	# if ($verticalspin) save inp
		mov	$keyp,$key
		call	_x86_64_AES_decrypt

		mov	$_ivp,%r8
		xor	0(%r8),$s0
		xor	4(%r8),$s1
		xor	8(%r8),$s2
		xor	12(%r8),$s3
		mov	$ivec,$inp	# if ($verticalspin) restore inp
		mov	$_ivp,%rbp
		xor	0(%rbp),$s0
		xor	4(%rbp),$s1
		xor	8(%rbp),$s2
		xor	12(%rbp),$s3

		mov	0($inp),%r10	# copy input to iv
		mov	8($inp),%r11
		mov	%r10,0(%r8)
		mov	%r11,8(%r8)
		mov	%r10,0(%rbp)
		mov	%r11,8(%rbp)

		mov	$s0,0($out)	# save output [zaps input]
		mov	$s1,4($out)
@@ -1044,7 +1386,7 @@ AES_cbc_encrypt:
.Lcbc_dec_in_place_partial:
	# one can argue if this is actually required
	lea	($out,%rcx),%rdi
	lea	(%r8,%rcx),%rsi
	lea	(%rbp,%rcx),%rsi
	neg	%rcx
	.long	0xF689A4F3	# rep movsb	# restore tail
	jmp	.Lcbc_cleanup
@@ -1262,6 +1604,8 @@ ___
	&data_word(0xe1e1e1e1, 0x69696969, 0x14141414, 0x63636363);
	&data_word(0x55555555, 0x21212121, 0x0c0c0c0c, 0x7d7d7d7d);

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;
+6 −4
Original line number Diff line number Diff line
@@ -167,10 +167,12 @@ my $current_function;
    	my $self = shift;
	my $sz = shift;

	# silently convert all EAs to 64-bit, required for elder GNU
	# assembler and results in more compact code
	$self->{index} =~ s/^[er](.?[0-9xp])[d]?$/r\1/;
	$self->{base}  =~ s/^[er](.?[0-9xp])[d]?$/r\1/;
	# Silently convert all EAs to 64-bit. This is required for
	# elder GNU assembler and results in more compact code,
	# *but* most importantly AES module depends on this feature!
	$self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
	$self->{base}  =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;

	if (!$masm) {
	    # Solaris /usr/ccs/bin/as can't handle multiplications
	    # in $self->{label}