#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
# [you'll notice a lot of resemblance], such as compressed S-boxes
# in little-endian byte order, prefetch of these tables in CBC mode,
# as well as avoiding L1 cache aliasing between stack frame and key
# schedule and already mentioned tables, compressed Td4...
# Performance in number of cycles per processed byte for 128-bit key:
#
#               ECB encrypt     ECB decrypt     CBC large chunk
# AMD64         33              43              13.0
# EM64T         38              56              18.6(*)
# Core 2        30              42              14.5(*)
# Atom          65              86              32.1(*)
#
# (*) with hyper-threading off
# Command line: [flavour] output.  A first argument that contains a dot
# is treated as the output file name, i.e. no flavour was given.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# $win64 is set for the nasm/masm/mingw64 flavours or a *.asm output file.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator: first next to this script, then in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator.  Fail loudly if the pipe cannot be started instead of
# silently producing no output.
open OUT,"| \"$^X\" $xlate $flavour $output"
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

$verticalspin=1; # unlike 32-bit version $verticalspin performs
# ~15% better on both AMD and Intel cores
$speed_limit=512; # see aes-586.pl for details

# Accumulated assembly text; every emitter below appends to it.
$code=".text\n";

# Register allocation shared by all emitters.
$s0="%eax";
$s1="%ebx";
$s2="%ecx";
$s3="%edx";
$acc0="%esi"; $mask80="%rsi";
$acc1="%edi"; $maskfe="%rdi";
$acc2="%ebp"; $mask1b="%rbp";
# The emitters interpolate $inp, $out, $rnds, $sbox and $key; without an
# assignment they expand to empty strings and the generated operands are
# malformed.  %r8 for $inp follows from the "zaps $inp" notes on %r8d.
$inp="%r8";
$out="%r9";
$t0="%r10d";
$t1="%r11d";
$t2="%r12d";
$rnds="%r13d";
$sbox="%r14";
$key="%r15";
# Rewrite an a/b/c/d register name (%eax, %rbx, ...) to its high-byte
# form (%ah, %bh, ...).  Any other operand is returned unchanged.
sub hi()
{   my $reg = shift;
    $reg =~ s/%[er]([a-d])x/%${1}h/;
    return $reg;
}
# Rewrite a register name to its low-byte form.  Three families are
# handled in turn; an operand matching none of them is returned as is.
sub lo()
{   my $reg = shift;
    $reg =~ s/%[er]([a-d])x/%${1}l/;      # %eax, %rbx   -> %al, %bl
    $reg =~ s/%[er]([sd]i)/%${1}l/;       # %esi, %rdi   -> %sil, %dil
    $reg =~ s/%(r[0-9]+)[d]?/%${1}b/;     # %r10, %r10d  -> %r10b
    return $reg;
}
# Rewrite a 64-bit register name to its 32-bit form: %rax -> %eax,
# %rsi -> %esi, %r10 -> %r10d.
sub LO()
{   my $reg = shift;
    $reg =~ s/%r([a-z]+)/%e${1}/;         # legacy registers
    $reg =~ s/%r([0-9]+)/%r${1}d/;        # numbered registers
    return $reg;
}
# Append one ".long" line per argument to $code, with the value written
# twice on that line.  Processing stops at the first undefined argument.
sub _data_word()
{   foreach my $word (@_) {
        last unless defined($word);
        $code .= sprintf(".long\t0x%08x,0x%08x\n", $word, $word);
    }
}
# Append a single ".long" line to $code holding all arguments as
# comma-separated 32-bit hex values.  The final argument closes the line;
# the preceding ones are emitted up to the first undefined value.
sub data_word()
{   my @words = @_;
    my $tail  = pop(@words);
    my $line  = ".long\t";
    foreach my $word (@words) {
        last unless defined($word);
        $line .= sprintf("0x%08x,", $word);
    }
    $line .= sprintf("0x%08x\n", $tail);
    $code .= $line;
}
# Append a single ".byte" line to $code holding all arguments as
# comma-separated two-digit hex values; each value is reduced to its low
# eight bits.  The final argument closes the line; the preceding ones
# are emitted up to the first undefined value.
sub data_byte()
{   my @bytes = @_;
    my $tail  = pop(@bytes);
    my $line  = ".byte\t";
    foreach my $byte (@bytes) {
        last unless defined($byte);
        $line .= sprintf("0x%02x,", $byte & 0xff);
    }
    $line .= sprintf("0x%02x\n", $tail & 0xff);
    $code .= $line;
}
# Append one table-driven encryption round to $code, processing the four
# state words side by side ("vertical" scheduling).
#
# Each output word $t0..$t3 is the xor of four 32-bit loads from the
# 8-byte-stride table at $sbox (byte offsets 0, 3, 2 and 1), indexed by
# byte 0, 1, 2 and 3 of four different state words.  The key pointer is
# advanced by 16 and the result is xored with the 16 bytes it then
# points to, leaving the new state in $s0..$s3.
#
# Bytes 2 and 3 are reached through the low/high byte registers after
# shifting the word right by 16, so every state word must be shifted
# once, after its last byte-0/byte-1 use and before its first
# byte-2/byte-3 use.
sub encvert()
{ my $t3="%r8d"; # zaps $inp!
$code.=<<___;
# favor 3-way issue Opteron pipeline...
movzb `&lo("$s0")`,$acc0
movzb `&lo("$s1")`,$acc1
movzb `&lo("$s2")`,$acc2
mov 0($sbox,$acc0,8),$t0
mov 0($sbox,$acc1,8),$t1
mov 0($sbox,$acc2,8),$t2
movzb `&hi("$s1")`,$acc0
movzb `&hi("$s2")`,$acc1
movzb `&lo("$s3")`,$acc2
xor 3($sbox,$acc0,8),$t0
xor 3($sbox,$acc1,8),$t1
mov 0($sbox,$acc2,8),$t3
movzb `&hi("$s3")`,$acc0
shr \$16,$s2
movzb `&hi("$s0")`,$acc2
xor 3($sbox,$acc0,8),$t2
shr \$16,$s3
xor 3($sbox,$acc2,8),$t3
shr \$16,$s1
lea 16($key),$key
shr \$16,$s0
movzb `&lo("$s2")`,$acc0
movzb `&lo("$s3")`,$acc1
movzb `&lo("$s0")`,$acc2
xor 2($sbox,$acc0,8),$t0
xor 2($sbox,$acc1,8),$t1
xor 2($sbox,$acc2,8),$t2
movzb `&hi("$s3")`,$acc0
movzb `&hi("$s0")`,$acc1
movzb `&lo("$s1")`,$acc2
xor 1($sbox,$acc0,8),$t0
xor 1($sbox,$acc1,8),$t1
xor 2($sbox,$acc2,8),$t3
mov 12($key),$s3
movzb `&hi("$s1")`,$acc1
movzb `&hi("$s2")`,$acc2
mov 0($key),$s0
xor 1($sbox,$acc1,8),$t2
xor 1($sbox,$acc2,8),$t3
mov 4($key),$s1
mov 8($key),$s2
xor $t0,$s0
xor $t1,$s1
xor $t2,$s2
xor $t3,$s3
___
}
# Append the "vertical" variant of the last encryption round (judging by
# the name and its use after the round loop) to $code.
#
# Each output word $t0..$t3 is assembled from four single bytes: byte 0
# comes from a zero-extended byte load at offset 2 of the 8-byte-stride
# table at $sbox, and the remaining three positions come from 32-bit
# loads masked with 0x0000ff00, 0x00ff0000 and 0xff000000.  The state
# words are shifted right by 16 once their byte-0/byte-1 lookups are
# done so that bytes 2 and 3 become addressable.  The result is xored
# with the 16 bytes at 16($key) and left in $s0..$s3; $key itself is not
# advanced here.
sub enclastvert()
{ my $t3="%r8d"; # zaps $inp!
$code.=<<___;
movzb `&lo("$s0")`,$acc0
movzb `&lo("$s1")`,$acc1
movzb `&lo("$s2")`,$acc2
movzb 2($sbox,$acc0,8),$t0
movzb 2($sbox,$acc1,8),$t1
movzb 2($sbox,$acc2,8),$t2
movzb `&lo("$s3")`,$acc0
movzb `&hi("$s1")`,$acc1
movzb `&hi("$s2")`,$acc2
movzb 2($sbox,$acc0,8),$t3
mov 0($sbox,$acc1,8),$acc1 #$t0
mov 0($sbox,$acc2,8),$acc2 #$t1
and \$0x0000ff00,$acc1
and \$0x0000ff00,$acc2
xor $acc1,$t0
xor $acc2,$t1
shr \$16,$s2
movzb `&hi("$s3")`,$acc0
movzb `&hi("$s0")`,$acc1
shr \$16,$s3
mov 0($sbox,$acc0,8),$acc0 #$t2
mov 0($sbox,$acc1,8),$acc1 #$t3
and \$0x0000ff00,$acc0
and \$0x0000ff00,$acc1
shr \$16,$s1
xor $acc0,$t2
xor $acc1,$t3
shr \$16,$s0
movzb `&lo("$s2")`,$acc0
movzb `&lo("$s3")`,$acc1
movzb `&lo("$s0")`,$acc2
mov 0($sbox,$acc0,8),$acc0 #$t0
mov 0($sbox,$acc1,8),$acc1 #$t1
mov 0($sbox,$acc2,8),$acc2 #$t2
and \$0x00ff0000,$acc0
and \$0x00ff0000,$acc1
and \$0x00ff0000,$acc2
xor $acc0,$t0
xor $acc1,$t1
xor $acc2,$t2
movzb `&lo("$s1")`,$acc0
movzb `&hi("$s3")`,$acc1
movzb `&hi("$s0")`,$acc2
mov 0($sbox,$acc0,8),$acc0 #$t3
mov 2($sbox,$acc1,8),$acc1 #$t0
mov 2($sbox,$acc2,8),$acc2 #$t1
and \$0x00ff0000,$acc0
and \$0xff000000,$acc1
and \$0xff000000,$acc2
xor $acc0,$t3
xor $acc1,$t0
xor $acc2,$t1
movzb `&hi("$s1")`,$acc0
movzb `&hi("$s2")`,$acc1
mov 16+12($key),$s3
mov 2($sbox,$acc0,8),$acc0 #$t2
mov 2($sbox,$acc1,8),$acc1 #$t3
mov 16+0($key),$s0
and \$0xff000000,$acc0
and \$0xff000000,$acc1
xor $acc0,$t2
xor $acc1,$t3
mov 16+4($key),$s1
mov 16+8($key),$s2
xor $t0,$s0
xor $t1,$s1
xor $t2,$s2
xor $t3,$s3
___
}
# Append the code for one output word of a table-driven encryption round
# to $code (the "horizontal" alternative to encvert).
#
#   $i  - index of the output word, 0..3
#   @s  - the four state registers, rotated so that $s[0] supplies
#         byte 0, $s[1] byte 1, $s[2] byte 2 and $s[3] byte 3
#
# Words 0..2 are built in $t0..$t2 using $acc0..$acc2 as scratch.  For
# word 3 the state registers themselves serve as output and scratch, and
# $t0..$t2 are copied back into them at the end.  Step 0 also advances
# $key by 16, so the round-key word xored in is the one at 4*$i($key)
# after that advance.
sub encstep()
{ my ($i,@s) = @_;
my $tmp0=$acc0;
my $tmp1=$acc1;
my $tmp2=$acc2;
my $out=($t0,$t1,$t2,$s[0])[$i];
if ($i==3) {
$tmp0=$s[1];
$tmp1=$s[2];
$tmp2=$s[3];
}
$code.=" movzb ".&lo($s[0]).",$out\n";
$code.=" mov $s[2],$tmp1\n" if ($i!=3);
$code.=" lea 16($key),$key\n" if ($i==0);
$code.=" movzb ".&hi($s[1]).",$tmp0\n";
$code.=" mov 0($sbox,$out,8),$out\n";
$code.=" shr \$16,$tmp1\n";
$code.=" mov $s[3],$tmp2\n" if ($i!=3);
$code.=" xor 3($sbox,$tmp0,8),$out\n";
$code.=" movzb ".&lo($tmp1).",$tmp1\n";
# $tmp2 still holds a whole state word; reduce it to byte 3 before it is
# used as a table index.
$code.=" shr \$24,$tmp2\n";
# The round loop adds no key material of its own on this path, so the
# round-key word is folded in here.
$code.=" xor 4*$i($key),$out\n";
$code.=" xor 2($sbox,$tmp1,8),$out\n";
$code.=" xor 1($sbox,$tmp2,8),$out\n";
$code.=" mov $t0,$s[1]\n" if ($i==3);
$code.=" mov $t1,$s[2]\n" if ($i==3);
$code.=" mov $t2,$s[3]\n" if ($i==3);
$code.="\n";
}
# Append the code for one output word of the last encryption round to
# $code (the "horizontal" alternative to enclastvert).
#
#   $i  - index of the output word, 0..3
#   @s  - the four state registers, rotated so that $s[0] supplies
#         byte 0, $s[1] byte 1, $s[2] byte 2 and $s[3] byte 3
#
# Each of the four bytes is looked up in the table at $sbox and masked
# into its position: offset 2 with 0x000000ff and 0xff000000, offset 0
# with 0x0000ff00 and 0x00ff0000 (the same offsets and masks that
# enclastvert uses).  No round key is applied here; the caller xors it in
# afterwards.
sub enclast()
{ my ($i,@s)=@_;
my $tmp0=$acc0;
my $tmp1=$acc1;
my $tmp2=$acc2;
my $out=($t0,$t1,$t2,$s[0])[$i];
if ($i==3) {
$tmp0=$s[1];
$tmp1=$s[2];
$tmp2=$s[3];
}
$code.=" movzb ".&lo($s[0]).",$out\n";
$code.=" mov $s[2],$tmp1\n" if ($i!=3);
# Byte 0 needs a table lookup like the other three; without it the raw
# state byte would pass straight through to the output.
$code.=" mov 2($sbox,$out,8),$out\n";
$code.=" shr \$16,$tmp1\n";
$code.=" mov $s[3],$tmp2\n" if ($i!=3);
$code.=" and \$0x000000ff,$out\n";
$code.=" movzb ".&hi($s[1]).",$tmp0\n";
$code.=" movzb ".&lo($tmp1).",$tmp1\n";
# $tmp2 still holds a whole state word; reduce it to byte 3 before it is
# used as a table index.
$code.=" shr \$24,$tmp2\n";
$code.=" mov 0($sbox,$tmp0,8),$tmp0\n";
$code.=" mov 0($sbox,$tmp1,8),$tmp1\n";
$code.=" mov 2($sbox,$tmp2,8),$tmp2\n";
$code.=" and \$0x0000ff00,$tmp0\n";
$code.=" and \$0x00ff0000,$tmp1\n";
$code.=" and \$0xff000000,$tmp2\n";
$code.=" xor $tmp0,$out\n";
$code.=" mov $t0,$s[1]\n" if ($i==3);
$code.=" xor $tmp1,$out\n";
$code.=" mov $t1,$s[2]\n" if ($i==3);
$code.=" xor $tmp2,$out\n";
$code.=" mov $t2,$s[3]\n" if ($i==3);
$code.="\n";
}
# Emit _x86_64_AES_encrypt: initial key xor, a round loop counted down in
# $rnds, and the last round.  The round body comes from encvert or from
# four encstep calls, depending on $verticalspin.
#
# Each heredoc must be closed before the Perl code that selects the round
# body, otherwise that code is emitted as assembly text instead of being
# executed.  The loop also needs its .Lenc_loop label (the jmp target)
# and the decrement-and-branch that closes it.
$code.=<<___;
.type _x86_64_AES_encrypt,\@abi-omnipotent
.align 16
_x86_64_AES_encrypt:
xor 0($key),$s0 # xor with key
xor 4($key),$s1
xor 8($key),$s2
xor 12($key),$s3
mov 240($key),$rnds # load key->rounds
sub \$1,$rnds
jmp .Lenc_loop
.align 16
.Lenc_loop:
___
if ($verticalspin) { &encvert(); }
else { &encstep(0,$s0,$s1,$s2,$s3);
&encstep(1,$s1,$s2,$s3,$s0);
&encstep(2,$s2,$s3,$s0,$s1);
&encstep(3,$s3,$s0,$s1,$s2);
}
$code.=<<___;
sub \$1,$rnds
jnz .Lenc_loop
___
if ($verticalspin) { &enclastvert(); }
else { &enclast(0,$s0,$s1,$s2,$s3);
&enclast(1,$s1,$s2,$s3,$s0);
&enclast(2,$s2,$s3,$s0,$s1);
&enclast(3,$s3,$s0,$s1,$s2);
$code.=<<___;
xor 16+0($key),$s0 # xor with key
xor 16+4($key),$s1
xor 16+8($key),$s2
xor 16+12($key),$s3
___
}
$code.=<<___;
.byte 0xf3,0xc3 # rep ret
.size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt
___
# it's possible to implement this by shifting tN by 8, filling least
# significant byte with byte load and finally bswap-ing at the end,
# but such partial register load kills Core 2...
# Append one "compact" encryption round step to $code: sixteen byte-wide
# lookups in the 1-byte-stride table at $sbox, combined into four words.
#
# Output word n takes byte 0 of state word n, byte 1 of word n+1, byte 2
# of word n+2 and byte 3 of word n+3 (indices mod 4), each replaced by
# its table entry and shifted back into position.  Partial results are
# collected in $t0..$t3; the byte-3 lookups for words 2 and 3 are loaded
# directly into $s2/$s3, so those two are finished with an xor while
# words 0 and 1 are moved from $t0/$t1.
#
# Uses %r8d, %r9d and %r13d as extra temporaries ($t3..$t5).
sub enccompactvert()
{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
$code.=<<___;
movzb `&lo("$s0")`,$t0
movzb `&lo("$s1")`,$t1
movzb `&lo("$s2")`,$t2
movzb `&lo("$s3")`,$t3
movzb `&hi("$s1")`,$acc0
movzb `&hi("$s2")`,$acc1
shr \$16,$s2
movzb `&hi("$s3")`,$acc2
movzb ($sbox,$t0,1),$t0
movzb ($sbox,$t1,1),$t1
movzb ($sbox,$t2,1),$t2
movzb ($sbox,$t3,1),$t3
movzb ($sbox,$acc0,1),$t4 #$t0
movzb `&hi("$s0")`,$acc0
movzb ($sbox,$acc1,1),$t5 #$t1
movzb `&lo("$s2")`,$acc1
movzb ($sbox,$acc2,1),$acc2 #$t2
movzb ($sbox,$acc0,1),$acc0 #$t3
shl \$8,$t4
shr \$16,$s3
shl \$8,$t5
xor $t4,$t0
shr \$16,$s0
movzb `&lo("$s3")`,$t4
shr \$16,$s1
xor $t5,$t1
shl \$8,$acc2
movzb `&lo("$s0")`,$t5
movzb ($sbox,$acc1,1),$acc1 #$t0
xor $acc2,$t2
shl \$8,$acc0
movzb `&lo("$s1")`,$acc2
shl \$16,$acc1
xor $acc0,$t3
movzb ($sbox,$t4,1),$t4 #$t1
movzb `&hi("$s3")`,$acc0
movzb ($sbox,$t5,1),$t5 #$t2
xor $acc1,$t0
shr \$8,$s2
movzb `&hi("$s0")`,$acc1
shl \$16,$t4
shr \$8,$s1
shl \$16,$t5
xor $t4,$t1
movzb ($sbox,$acc2,1),$acc2 #$t3
movzb ($sbox,$acc0,1),$acc0 #$t0
movzb ($sbox,$acc1,1),$acc1 #$t1
movzb ($sbox,$s2,1),$s3 #$t3
movzb ($sbox,$s1,1),$s2 #$t2
shl \$16,$acc2
xor $t5,$t2
shl \$24,$acc0
xor $acc2,$t3
shl \$24,$acc1
xor $acc0,$t0
shl \$24,$s3
xor $acc1,$t1
shl \$24,$s2
mov $t0,$s0
mov $t1,$s1
xor $t2,$s2
xor $t3,$s3
___
}
# Append to $code a straightforward, one-word-at-a-time transform of the
# state register given as the only argument (by its name, presumably the
# reference version of enctransform; it is not called in the visible
# part of the file).
#
# With s the input word, the emitted code computes
#   r2 = ((s + s) & 0xfefefefe) ^ (mask & 0x1b1b1b1b)
# where mask = (s & 0x80808080) - ((s & 0x80808080) >> 7), i.e. each byte
# is doubled separately and 0x1b is folded in for bytes whose top bit
# was set.  The result left in the register is
#   rol(s ^ r2, 24) ^ r2 ^ ror(s, 16) ^ ror(s, 24).
# Clobbers %r8d, %r9d and %r13d.
sub enctransform_ref()
{ my $sn = shift;
my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");
$code.=<<___;
mov $sn,$acc
and \$0x80808080,$acc
mov $acc,$tmp
shr \$7,$tmp
lea ($sn,$sn),$r2
sub $tmp,$acc
and \$0xfefefefe,$r2
and \$0x1b1b1b1b,$acc
mov $sn,$tmp
xor $acc,$r2
xor $r2,$sn
rol \$24,$sn
xor $r2,$sn
ror \$16,$tmp
xor $tmp,$sn
ror \$8,$tmp
xor $tmp,$sn
___
}
# unlike decrypt case it does not pay off to parallelize enctransform
# Append to $code the column transform for all four state words
# $s0..$s3, handled as two interleaved pairs ($s0/$s1, then $s2/$s3).
#
# Every word gets exactly the sequence that enctransform_ref spells out
# for a single word: r2 = per-byte doubling with 0x1b folded in where
# the top bit was set, then
#   s = rol(s ^ r2, 24) ^ r2 ^ ror(orig, 16) ^ ror(orig, 24).
# $t0..$t3 first hold the 0x80808080 masks and later the copies of the
# original words that get rotated; $acc0/$acc1 hold the 0x1b masks and
# $r20/$r21 the doubled values.  $t3 shares a register with $acc2.  The
# loads from 0/64/128/192($sbox) near the end only prefetch the table and
# are placed after the last use of the register they overwrite.
sub enctransform()
{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");
$code.=<<___;
mov \$0x80808080,$t0
mov \$0x80808080,$t1
and $s0,$t0
and $s1,$t1
mov $t0,$acc0
mov $t1,$acc1
shr \$7,$t0
lea ($s0,$s0),$r20
shr \$7,$t1
lea ($s1,$s1),$r21
sub $t0,$acc0
sub $t1,$acc1
and \$0xfefefefe,$r20
and \$0xfefefefe,$r21
and \$0x1b1b1b1b,$acc0
and \$0x1b1b1b1b,$acc1
mov $s0,$t0
mov $s1,$t1
xor $acc0,$r20
xor $acc1,$r21
xor $r20,$s0
xor $r21,$s1
mov \$0x80808080,$t2
rol \$24,$s0
mov \$0x80808080,$t3
rol \$24,$s1
and $s2,$t2
and $s3,$t3
xor $r20,$s0
xor $r21,$s1
mov $t2,$acc0
ror \$16,$t0
mov $t3,$acc1
ror \$16,$t1
lea ($s2,$s2),$r20
shr \$7,$t2
xor $t0,$s0
shr \$7,$t3
xor $t1,$s1
ror \$8,$t0
lea ($s3,$s3),$r21
ror \$8,$t1
sub $t2,$acc0
sub $t3,$acc1
xor $t0,$s0
xor $t1,$s1
and \$0xfefefefe,$r20
and \$0xfefefefe,$r21
and \$0x1b1b1b1b,$acc0
and \$0x1b1b1b1b,$acc1
mov $s2,$t2
mov $s3,$t3
xor $acc0,$r20
xor $acc1,$r21
ror \$16,$t2
xor $r20,$s2
ror \$16,$t3
xor $r21,$s3
rol \$24,$s2
mov 0($sbox),$acc0 # prefetch Te4
rol \$24,$s3
xor $r20,$s2
mov 64($sbox),$acc1
xor $r21,$s3
mov 128($sbox),$r20
xor $t2,$s2
ror \$8,$t2
xor $t3,$s3
ror \$8,$t3
xor $t2,$s2
mov 192($sbox),$r21
xor $t3,$s3
___
}
# Emit _x86_64_AES_encrypt_compact: a loop of key xor + enccompactvert,
# with enctransform applied between rounds but not after the last one.
#
# The loop ends when $key reaches the value the caller stored as "end of
# key schedule"; AES_encrypt keeps it at 8(%rsp), which is 16(%rsp) here
# once the return address has been pushed.  The heredocs are closed
# around the two emitter calls so that they run as Perl instead of being
# copied into the assembly output.
$code.=<<___;
.type _x86_64_AES_encrypt_compact,\@abi-omnipotent
.align 16
_x86_64_AES_encrypt_compact:
lea 128($sbox),$inp # size optimization
mov 0-128($inp),$acc1 # prefetch Te4
mov 32-128($inp),$acc2
mov 64-128($inp),$t0
mov 96-128($inp),$t1
mov 128-128($inp),$acc1
mov 160-128($inp),$acc2
mov 192-128($inp),$t0
mov 224-128($inp),$t1
jmp .Lenc_loop_compact
.align 16
.Lenc_loop_compact:
xor 0($key),$s0 # xor with key
xor 4($key),$s1
xor 8($key),$s2
xor 12($key),$s3
lea 16($key),$key
___
&enccompactvert();
$code.=<<___;
cmp 16(%rsp),$key
je .Lenc_compact_done
___
&enctransform();
$code.=<<___;
jmp .Lenc_loop_compact
.align 16
.Lenc_compact_done:
xor 0($key),$s0
xor 4($key),$s1
xor 8($key),$s2
xor 12($key),$s3
.byte 0xf3,0xc3 # rep ret
.size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
___
# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
# Emit the public AES_encrypt entry point (inp in %rdi, out in %rsi, key
# in %rdx, per the register comments below).
#
# Frame layout after the prologue, relative to %rsp:
#    0  key schedule pointer
#    8  end of key schedule (loop bound for the compact routine)
#   16  saved out pointer
#   24  original stack pointer, used to restore the callee-saved
#       registers pushed at entry
# The 32 bytes must be reserved explicitly: the alignment arithmetic
# alone can leave %rsp pointing at the registers that were just pushed.
# $key and $rnds are loaded from the key argument before they are used
# to compute the end of the key schedule.
$code.=<<___;
.globl AES_encrypt
.type AES_encrypt,\@function,3
.align 16
.globl asm_AES_encrypt
.hidden asm_AES_encrypt
asm_AES_encrypt:
AES_encrypt:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
# allocate frame "above" key schedule
mov %rsp,%r10
lea -63(%rdx),%rcx # %rdx is key argument
and \$-64,%rsp
sub %rsp,%rcx
neg %rcx
and \$0x3c0,%rcx
sub %rcx,%rsp
sub \$32,%rsp
mov %rsi,16(%rsp) # save out
mov %r10,24(%rsp) # save real stack pointer
.Lenc_prologue:
mov %rdx,$key
mov 240($key),$rnds # load rounds
mov 0(%rdi),$s0 # load input vector
mov 4(%rdi),$s1
mov 8(%rdi),$s2
mov 12(%rdi),$s3
shl \$4,$rnds
lea ($key,$rnds),%rbp
mov $key,(%rsp) # key schedule
mov %rbp,8(%rsp) # end of key schedule
# pick Te4 copy which can't "overlap" with stack frame or key schedule
lea .LAES_Te+2048(%rip),$sbox
lea 768(%rsp),%rbp
sub $sbox,%rbp
and \$0x300,%rbp
lea ($sbox,%rbp),$sbox
call _x86_64_AES_encrypt_compact
mov 16(%rsp),$out # restore out
mov 24(%rsp),%rsi # restore saved stack pointer
mov $s0,0($out) # write output vector
mov $s1,4($out)
mov $s2,8($out)
mov $s3,12($out)
mov (%rsi),%r15
mov 8(%rsi),%r14
mov 16(%rsi),%r13
mov 24(%rsi),%r12
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
.Lenc_epilogue:
ret
.size AES_encrypt,.-AES_encrypt
___
#------------------------------------------------------------------#
# Append one table-driven decryption round to $code, processing the four
# state words side by side ("vertical" scheduling).
#
# Each output word $t0..$t3 is the xor of four 32-bit loads from the
# 8-byte-stride table at $sbox (byte offsets 0, 3, 2 and 1).  Output
# word n takes byte 0 of state word n, byte 1 of word n-1, byte 2 of
# word n-2 and byte 3 of word n-3 (indices mod 4).  The key pointer is
# advanced by 16 and the result is xored with the 16 bytes it then
# points to, leaving the new state in $s0..$s3.
#
# Bytes 2 and 3 are reached after shifting a word right by 16, so every
# state word is shifted once between its byte-0/1 and byte-2/3 uses.
# All four state registers are reloaded from the round key before the
# final xors; $s3 is reloaded only after its last use as a byte source.
sub decvert()
{ my $t3="%r8d"; # zaps $inp!
$code.=<<___;
# favor 3-way issue Opteron pipeline...
movzb `&lo("$s0")`,$acc0
movzb `&lo("$s1")`,$acc1
movzb `&lo("$s2")`,$acc2
mov 0($sbox,$acc0,8),$t0
mov 0($sbox,$acc1,8),$t1
mov 0($sbox,$acc2,8),$t2
movzb `&hi("$s3")`,$acc0
movzb `&hi("$s0")`,$acc1
movzb `&lo("$s3")`,$acc2
xor 3($sbox,$acc0,8),$t0
xor 3($sbox,$acc1,8),$t1
mov 0($sbox,$acc2,8),$t3
movzb `&hi("$s1")`,$acc0
shr \$16,$s0
movzb `&hi("$s2")`,$acc2
xor 3($sbox,$acc0,8),$t2
shr \$16,$s3
xor 3($sbox,$acc2,8),$t3
shr \$16,$s1
lea 16($key),$key
shr \$16,$s2
movzb `&lo("$s2")`,$acc0
movzb `&lo("$s3")`,$acc1
movzb `&lo("$s0")`,$acc2
xor 2($sbox,$acc0,8),$t0
xor 2($sbox,$acc1,8),$t1
xor 2($sbox,$acc2,8),$t2
movzb `&hi("$s1")`,$acc0
movzb `&hi("$s2")`,$acc1
movzb `&lo("$s1")`,$acc2
xor 1($sbox,$acc0,8),$t0
xor 1($sbox,$acc1,8),$t1
xor 2($sbox,$acc2,8),$t3
movzb `&hi("$s3")`,$acc0
mov 12($key),$s3
movzb `&hi("$s0")`,$acc2
xor 1($sbox,$acc0,8),$t2
mov 0($key),$s0
xor 1($sbox,$acc2,8),$t3
xor $t0,$s0
mov 4($key),$s1
mov 8($key),$s2
xor $t2,$s2
xor $t1,$s1
xor $t3,$s3
___
}
# Append the "vertical" variant of the last decryption round to $code.
#
# $sbox is biased by 2048 for the duration of the round so that the
# byte-wide table stored behind the 8-byte-stride one can be addressed
# without a displacement; the bias is removed again before the result is
# produced so callers see $sbox unchanged.
#
# Output word n takes byte 0 of state word n, byte 1 of word n-1, byte 2
# of word n-2 and byte 3 of word n-3 (indices mod 4); each byte is
# replaced by its table entry and shifted back into position.  Every
# state word is shifted right by 16 once between its byte-0/1 and
# byte-2/3 uses.  The result is xored with the 16 bytes at 16($key) and
# left in $s0..$s3; $s0 and $s3 are reloaded from the key only after
# their last use as byte sources.
sub declastvert()
{ my $t3="%r8d"; # zaps $inp!
$code.=<<___;
lea 2048($sbox),$sbox # size optimization
movzb `&lo("$s0")`,$acc0
movzb `&lo("$s1")`,$acc1
movzb `&lo("$s2")`,$acc2
movzb ($sbox,$acc0,1),$t0
movzb ($sbox,$acc1,1),$t1
movzb ($sbox,$acc2,1),$t2
movzb `&lo("$s3")`,$acc0
movzb `&hi("$s3")`,$acc1
movzb `&hi("$s0")`,$acc2
movzb ($sbox,$acc0,1),$t3
movzb ($sbox,$acc1,1),$acc1 #$t0
movzb ($sbox,$acc2,1),$acc2 #$t1
shl \$8,$acc1
shl \$8,$acc2
xor $acc1,$t0
xor $acc2,$t1
shr \$16,$s3
movzb `&hi("$s1")`,$acc0
movzb `&hi("$s2")`,$acc1
shr \$16,$s0
movzb ($sbox,$acc0,1),$acc0 #$t2
movzb ($sbox,$acc1,1),$acc1 #$t3
shl \$8,$acc0
shl \$8,$acc1
shr \$16,$s1
xor $acc0,$t2
xor $acc1,$t3
shr \$16,$s2
movzb `&lo("$s2")`,$acc0
movzb `&lo("$s3")`,$acc1
movzb `&lo("$s0")`,$acc2
movzb ($sbox,$acc0,1),$acc0 #$t0
movzb ($sbox,$acc1,1),$acc1 #$t1
movzb ($sbox,$acc2,1),$acc2 #$t2
shl \$16,$acc0
shl \$16,$acc1
shl \$16,$acc2
xor $acc0,$t0
xor $acc1,$t1
xor $acc2,$t2
movzb `&lo("$s1")`,$acc0
movzb `&hi("$s1")`,$acc1
movzb `&hi("$s2")`,$acc2
movzb ($sbox,$acc0,1),$acc0 #$t3
movzb ($sbox,$acc1,1),$acc1 #$t0
movzb ($sbox,$acc2,1),$acc2 #$t1
shl \$16,$acc0
shl \$24,$acc1
shl \$24,$acc2
xor $acc0,$t3
xor $acc1,$t0
xor $acc2,$t1
movzb `&hi("$s3")`,$acc0
movzb `&hi("$s0")`,$acc1
mov 16+12($key),$s3
movzb ($sbox,$acc0,1),$acc0 #$t2
movzb ($sbox,$acc1,1),$acc1 #$t3
mov 16+0($key),$s0
shl \$24,$acc0
shl \$24,$acc1
xor $acc0,$t2
xor $acc1,$t3
mov 16+4($key),$s1
mov 16+8($key),$s2
lea -2048($sbox),$sbox
xor $t0,$s0
xor $t1,$s1
xor $t2,$s2
xor $t3,$s3
___
}
# Append the code for one output word of a table-driven decryption round
# to $code (the "horizontal" alternative to decvert).
#
#   $i  - index of the output word, 0..3
#   @s  - the four state registers, rotated so that $s[0] supplies
#         byte 0, $s[1] byte 1, $s[2] byte 2 and $s[3] byte 3
#
# Words 0..2 are built in $t0..$t2 using $acc0..$acc2 as scratch.  For
# word 3 the state registers themselves serve as output and scratch, and
# $t2,$t1,$t0 are copied back into $s[1..3] at the end.  No round key is
# applied here; the caller advances $key and xors it in afterwards.
sub decstep()
{ my ($i,@s) = @_;
my $tmp0=$acc0;
my $tmp1=$acc1;
my $tmp2=$acc2;
my $out=($t0,$t1,$t2,$s[0])[$i];
$code.=" mov $s[0],$out\n" if ($i!=3);
$tmp1=$s[2] if ($i==3);
$code.=" mov $s[2],$tmp1\n" if ($i!=3);
$code.=" and \$0xFF,$out\n";
# Byte 0 needs its table lookup like the other three; the xors below
# accumulate onto this value.
$code.=" mov 0($sbox,$out,8),$out\n";
$code.=" shr \$16,$tmp1\n";
$tmp2=$s[3] if ($i==3);
$code.=" mov $s[3],$tmp2\n" if ($i!=3);
$tmp0=$s[1] if ($i==3);
$code.=" movzb ".&hi($s[1]).",$tmp0\n";
$code.=" and \$0xFF,$tmp1\n";
$code.=" shr \$24,$tmp2\n";
$code.=" xor 3($sbox,$tmp0,8),$out\n";
$code.=" xor 2($sbox,$tmp1,8),$out\n";
$code.=" xor 1($sbox,$tmp2,8),$out\n";
$code.=" mov $t2,$s[1]\n" if ($i==3);
$code.=" mov $t1,$s[2]\n" if ($i==3);
$code.=" mov $t0,$s[3]\n" if ($i==3);
$code.="\n";
}
# Append the code for one output word of the last decryption round to
# $code (the "horizontal" alternative to declastvert).
#
#   first argument  - index of the output word, 0..3
#   remaining four  - the state registers, rotated so that the first
#                     supplies byte 0, the second byte 1, and so on
#
# Each byte is replaced by its entry in the byte-wide table at
# 2048($sbox) and shifted back into position.  Words 0..2 are built in
# $t0..$t2 with $acc0..$acc2 as scratch; for word 3 the state registers
# are used directly and $t2,$t1,$t0 are copied back into them between
# the final xors.
sub declast()
{   my ($word, @st) = @_;
    my $final = ($word == 3);
    my $dst   = ($t0, $t1, $t2, $st[0])[$word];
    my ($b1, $b2, $b3) = $final ? ($st[1], $st[2], $st[3])
                                : ($acc0, $acc1, $acc2);
    my @asm;

    unless ($final) {
        push @asm, " mov $st[0],$dst\n";
        push @asm, " mov $st[2],$b2\n";
    }
    push @asm, " and \$0xFF,$dst\n";
    push @asm, " movzb 2048($sbox,$dst,1),$dst\n";
    push @asm, " shr \$16,$b2\n";
    push @asm, " mov $st[3],$b3\n" unless $final;
    push @asm, " movzb " . &hi($st[1]) . ",$b1\n";
    push @asm, " and \$0xFF,$b2\n";
    push @asm, " shr \$24,$b3\n";
    foreach my $idx ($b1, $b2, $b3) {
        push @asm, " movzb 2048($sbox,$idx,1),$idx\n";
    }
    push @asm, " shl \$8,$b1\n";
    push @asm, " shl \$16,$b2\n";
    push @asm, " shl \$24,$b3\n";
    push @asm, " xor $b1,$dst\n";
    push @asm, " mov $t2,$st[1]\n" if $final;
    push @asm, " xor $b2,$dst\n";
    push @asm, " mov $t1,$st[2]\n" if $final;
    push @asm, " xor $b3,$dst\n";
    push @asm, " mov $t0,$st[3]\n" if $final;
    push @asm, "\n";

    $code .= join("", @asm);
}
# Emit _x86_64_AES_decrypt: initial key xor, a round loop counted down in
# $rnds, and the last round.  The round body comes from decvert or from
# four decstep calls followed by an explicit key advance and xor,
# depending on $verticalspin.
#
# Each heredoc must be closed before the Perl code that selects the round
# body, otherwise that code is emitted as assembly text instead of being
# executed.  The loop also needs its .Ldec_loop label (the jmp target)
# and the decrement-and-branch that closes it.
$code.=<<___;
.type _x86_64_AES_decrypt,\@abi-omnipotent
.align 16
_x86_64_AES_decrypt:
xor 0($key),$s0 # xor with key
xor 4($key),$s1
xor 8($key),$s2
xor 12($key),$s3
mov 240($key),$rnds # load key->rounds
sub \$1,$rnds
jmp .Ldec_loop
.align 16
.Ldec_loop:
___
if ($verticalspin) { &decvert(); }
else { &decstep(0,$s0,$s3,$s2,$s1);
&decstep(1,$s1,$s0,$s3,$s2);
&decstep(2,$s2,$s1,$s0,$s3);
&decstep(3,$s3,$s2,$s1,$s0);
$code.=<<___;
lea 16($key),$key
xor 0($key),$s0 # xor with key
xor 4($key),$s1
xor 8($key),$s2
xor 12($key),$s3
___
}
$code.=<<___;
sub \$1,$rnds
jnz .Ldec_loop
___
if ($verticalspin) { &declastvert(); }
else { &declast(0,$s0,$s3,$s2,$s1);
&declast(1,$s1,$s0,$s3,$s2);
&declast(2,$s2,$s1,$s0,$s3);
&declast(3,$s3,$s2,$s1,$s0);
$code.=<<___;
xor 16+0($key),$s0 # xor with key
xor 16+4($key),$s1
xor 16+8($key),$s2
xor 16+12($key),$s3
___
}
$code.=<<___;
.byte 0xf3,0xc3 # rep ret
.size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt
___
sub deccompactvert()
{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
$code.=<<___;
movzb `&lo("$s0")`,$t0
movzb `&lo("$s1")`,$t1
movzb `&lo("$s2")`,$t2
movzb `&lo("$s3")`,$t3
movzb `&hi("$s3")`,$acc0
movzb `&hi("$s0")`,$acc1
shr \$16,$s3
movzb `&hi("$s1")`,$acc2
movzb ($sbox,$t0,1),$t0
movzb ($sbox,$t1,1),$t1
movzb ($sbox,$t2,1),$t2
movzb ($sbox,$acc0,1),$t4 #$t0
movzb ($sbox,$acc1,1),$t5 #$t1
movzb ($sbox,$acc2,1),$acc2 #$t2
movzb ($sbox,$acc0,1),$acc0 #$t3
shl \$8,$t4
movzb `&lo("$s2")`,$acc1
movzb `&lo("$s3")`,$t4
movzb `&lo("$s0")`,$t5
movzb ($sbox,$acc1,1),$acc1 #$t0
xor $acc2,$t2
movzb `&lo("$s1")`,$acc2
xor $acc0,$t3
movzb ($sbox,$t4,1),$t4 #$t1
movzb `&hi("$s1")`,$acc0
movzb ($sbox,$acc2,1),$acc2 #$t3
xor $acc1,$t0
movzb ($sbox,$t5,1),$t5 #$t2
xor $acc2,$t3
movzb `&hi("$s3")`,$acc2
movzb ($sbox,$acc0,1),$acc0 #$t0
movzb ($sbox,$acc1,1),$s1 #$t1
movzb ($sbox,$acc2,1),$s2 #$t2
shl \$24,$acc0
shl \$24,$s1
shl \$24,$s2