Newer
Older
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
# [you'll notice a lot of resemblance], such as compressed S-boxes
# in little-endian byte order, prefetch of these tables in CBC mode,
# as well as avoiding L1 cache aliasing between stack frame and key
# schedule and already mentioned tables, compressed Td4...
# Performance in number of cycles per processed byte for 128-bit key:
#
#		ECB encrypt	ECB decrypt	CBC large chunk
# AMD64		33		41		13.0
# EM64T		38		59		18.6(*)
# Core 2	30		43		14.5(*)
#
# (*) with hyper-threading off
# ---- generator configuration and register allocation ---------------------
$verticalspin=1;	# unlike 32-bit version $verticalspin performs
			# ~15% better on both AMD and Intel cores
$speed_limit=512;	# see aes-586.pl for details

# Output file name for the translator; it was interpolated into the pipe
# command below without ever being assigned.
$output=shift;

# Directory this script lives in.  Fall back to "" when $0 carries no
# directory component instead of picking up a stale $1.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed from here on is piped through the translator.
open STDOUT,"| $^X $xlate $output" or die "can't call $xlate: $!";

$code=".text\n";

# AES state words.
$s0="%eax";
$s1="%ebx";
$s2="%ecx";
$s3="%edx";
# Scratch registers; the 64-bit aliases hold masks in the transform code.
$acc0="%esi";	$mask80="%rsi";
$acc1="%edi";	$maskfe="%rdi";
$acc2="%ebp";	$mask1b="%rbp";
# $inp, $out, $rnds, $sbox and $key are interpolated by every routine
# below but were never defined.  The "zaps $inp!" notes next to "%r8d"
# fix $inp; the others follow the callee-saved registers AES_encrypt
# pushes.  NOTE(review): confirm against the upstream register map.
$inp="%r8";
$out="%r9";
$t0="%r10d";
$t1="%r11d";
$t2="%r12d";
$rnds="%r13d";
$sbox="%r14";
$key="%r15";
# hi(REG): map %eax/%rax .. %edx/%rdx to the matching high-byte register
# (%ah .. %dh).  Any other operand is returned unchanged.
sub hi()
{ my ($reg) = @_;
  $reg =~ s/%[er]([a-d])x/%${1}h/;
  return $reg;
}
# lo(REG): name of the low-byte alias of a general purpose register:
#	%eax/%rax .. %edx/%rdx	-> %al .. %dl
#	%esi/%rsi, %edi/%rdi	-> %sil, %dil
#	%rN / %rNd		-> %rNb
# Operands matching none of the patterns are returned unchanged.
sub lo()
{ my ($reg) = @_;
  $reg =~ s/%[er]([a-d])x/%${1}l/;
  $reg =~ s/%[er]([sd]i)/%${1}l/;
  $reg =~ s/%(r[0-9]+)[d]?/%${1}b/;
  return $reg;
}
# LO(REG): 32-bit alias of a 64-bit register name: %rax -> %eax,
# %rsi -> %esi, %r10 -> %r10d.
sub LO()
{ my ($reg) = @_;
  $reg =~ s/%r([a-z]+)/%e${1}/;
  $reg =~ s/%r([0-9]+)/%r${1}d/;
  return $reg;
}
# _data_word(LIST): append one ".long" line per argument to $code, with
# the value written twice (two identical 32-bit words per entry).
# An undefined argument ends the list.
sub _data_word()
{ foreach my $word (@_) {
	last unless defined($word);
	$code .= sprintf(".long\t0x%08x,0x%08x\n", $word, $word);
  }
}
# data_word(LIST): append a single ".long" directive listing every
# argument as a zero-padded 32-bit hex constant to $code.
#
# An empty argument list emits nothing.  Previously the unconditional
# pop()/sprintf pair produced a spurious ".long 0x00000000" in that case.
sub data_word()
{ return unless @_;
  $code .= ".long\t" . join(",", map { sprintf("0x%08x", $_) } @_) . "\n";
}
# data_byte(LIST): append a single ".byte" directive to $code; every
# argument is truncated to its low 8 bits and written as two hex digits.
#
# An empty argument list emits nothing.  Previously the unconditional
# pop()/sprintf pair produced a spurious ".byte 0x00" in that case.
sub data_byte()
{ return unless @_;
  $code .= ".byte\t" . join(",", map { sprintf("0x%02x", $_ & 0xff) } @_) . "\n";
}
# encvert(): append one full encryption round in the "vertical"
# (interleaved) schedule to $code.
#
# Every new state word is the xor of four lookups into the 8-byte-stride
# table at $sbox, one byte taken from each state word:
#	t0 = T0[s0.b0] ^ T3[s1.b1] ^ T2[s2.b2] ^ T1[s3.b3]
#	t1 = T0[s1.b0] ^ T3[s2.b1] ^ T2[s3.b2] ^ T1[s0.b3]
#	t2 = T0[s2.b0] ^ T3[s3.b1] ^ T2[s0.b2] ^ T1[s1.b3]
#	t3 = T0[s3.b0] ^ T3[s0.b1] ^ T2[s1.b2] ^ T1[s2.b3]
# (Tn = table read at byte offset n).  Bytes 2 and 3 of a state word are
# only reachable after the word has been shifted right by 16, so each of
# $s0..$s3 has to be shifted exactly once, after its bytes 0/1 have been
# consumed.  The shift of $s2 was missing, which made the b2/b3 lookups
# of $s2 read bytes 0/1 a second time.
#
# $key is advanced by 16 and the four words found there are xored into
# the result, which is left in $s0..$s3.  %r8d serves as fourth temporary.
sub encvert()
{ my $t3="%r8d";	# zaps $inp!

$code.=<<___;
	# favor 3-way issue Opteron pipeline...
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	mov	0($sbox,$acc0,8),$t0
	mov	0($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t2

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	movzb	`&lo("$s3")`,$acc2
	xor	3($sbox,$acc0,8),$t0
	xor	3($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t3

	movzb	`&hi("$s3")`,$acc0
	shr	\$16,$s2
	movzb	`&hi("$s0")`,$acc2
	xor	3($sbox,$acc0,8),$t2
	shr	\$16,$s3
	xor	3($sbox,$acc2,8),$t3

	shr	\$16,$s1
	lea	16($key),$key
	shr	\$16,$s0

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	xor	2($sbox,$acc0,8),$t0
	xor	2($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t2

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	movzb	`&lo("$s1")`,$acc2
	xor	1($sbox,$acc0,8),$t0
	xor	1($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t3

	mov	12($key),$s3
	movzb	`&hi("$s1")`,$acc1
	movzb	`&hi("$s2")`,$acc2
	mov	0($key),$s0
	xor	1($sbox,$acc1,8),$t2
	xor	1($sbox,$acc2,8),$t3

	mov	4($key),$s1
	mov	8($key),$s2
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}
# enclastvert(): append the final encryption round ("vertical" schedule)
# to $code.
#
# Unlike encvert() no whole table words are combined: each output word is
# assembled from four single byte lanes.  Lane 0 is a byte load at offset
# 2 of the 8-byte-stride $sbox entry; lanes 1..3 are 32-bit loads at
# offset 0 or 2 masked with 0x0000ff00, 0x00ff0000 and 0xff000000.
# Source bytes are taken as
#	t0: s0.b0 s1.b1 s2.b2 s3.b3	t1: s1.b0 s2.b1 s3.b2 s0.b3
#	t2: s2.b0 s3.b1 s0.b2 s1.b3	t3: s3.b0 s0.b1 s1.b2 s2.b3
# Every state word is shifted right by 16 once its bytes 0/1 are consumed.
# The result is xored with the four words at 16($key)..16+12($key) and
# left in $s0..$s3; $key itself is not modified.
# %r8d is used as the fourth temporary (the note below says this
# clobbers $inp).
sub enclastvert()
{ my $t3="%r8d"; # zaps $inp!
$code.=<<___;
movzb `&lo("$s0")`,$acc0
movzb `&lo("$s1")`,$acc1
movzb `&lo("$s2")`,$acc2
movzb 2($sbox,$acc0,8),$t0
movzb 2($sbox,$acc1,8),$t1
movzb 2($sbox,$acc2,8),$t2
movzb `&lo("$s3")`,$acc0
movzb `&hi("$s1")`,$acc1
movzb `&hi("$s2")`,$acc2
movzb 2($sbox,$acc0,8),$t3
mov 0($sbox,$acc1,8),$acc1 #$t0
mov 0($sbox,$acc2,8),$acc2 #$t1
and \$0x0000ff00,$acc1
and \$0x0000ff00,$acc2
xor $acc1,$t0
xor $acc2,$t1
shr \$16,$s2
movzb `&hi("$s3")`,$acc0
movzb `&hi("$s0")`,$acc1
shr \$16,$s3
mov 0($sbox,$acc0,8),$acc0 #$t2
mov 0($sbox,$acc1,8),$acc1 #$t3
and \$0x0000ff00,$acc0
and \$0x0000ff00,$acc1
shr \$16,$s1
xor $acc0,$t2
xor $acc1,$t3
shr \$16,$s0
movzb `&lo("$s2")`,$acc0
movzb `&lo("$s3")`,$acc1
movzb `&lo("$s0")`,$acc2
mov 0($sbox,$acc0,8),$acc0 #$t0
mov 0($sbox,$acc1,8),$acc1 #$t1
mov 0($sbox,$acc2,8),$acc2 #$t2
and \$0x00ff0000,$acc0
and \$0x00ff0000,$acc1
and \$0x00ff0000,$acc2
xor $acc0,$t0
xor $acc1,$t1
xor $acc2,$t2
movzb `&lo("$s1")`,$acc0
movzb `&hi("$s3")`,$acc1
movzb `&hi("$s0")`,$acc2
mov 0($sbox,$acc0,8),$acc0 #$t3
mov 2($sbox,$acc1,8),$acc1 #$t0
mov 2($sbox,$acc2,8),$acc2 #$t1
and \$0x00ff0000,$acc0
and \$0xff000000,$acc1
and \$0xff000000,$acc2
xor $acc0,$t3
xor $acc1,$t0
xor $acc2,$t1
movzb `&hi("$s1")`,$acc0
movzb `&hi("$s2")`,$acc1
mov 16+12($key),$s3
mov 2($sbox,$acc0,8),$acc0 #$t2
mov 2($sbox,$acc1,8),$acc1 #$t3
mov 16+0($key),$s0
and \$0xff000000,$acc0
and \$0xff000000,$acc1
xor $acc0,$t2
xor $acc1,$t3
mov 16+4($key),$s1
mov 16+8($key),$s2
xor $t0,$s0
xor $t1,$s1
xor $t2,$s2
xor $t3,$s3
___
}
# encstep(I, S0, S1, S2, S3): append the code computing output word I
# (0..3) of one encryption round to $code ("horizontal" schedule).
#
#   I	index of the word being produced; it also selects the round key
#	word that is xored in (4*I bytes into the key at $key).
#   S0..S3	state registers, already rotated by the caller so that S0
#	supplies byte 0, S1 byte 1, S2 byte 2 and S3 byte 3.
#
# Words 0..2 are collected in $t0..$t2 using $acc0..$acc2 as scratch.
# For I==3 the three other state registers are dead, so they are used as
# scratch, the result is built in place in S0, and $t0..$t2 are finally
# copied back into S1..S3.  $key is advanced by 16 when I==0.
#
# Two instructions had been lost and are restored here:
#   - "shr $24,tmp2": tmp2 holds a full copy of S3 and must be reduced
#     to its top byte before it can index the table (decstep() shows the
#     same step);
#   - "xor 4*I($key),out": without it the horizontal path advanced $key
#     but never mixed any round key into the state.
sub encstep()
{ my ($i,@s) = @_;
  my $tmp0=$acc0;
  my $tmp1=$acc1;
  my $tmp2=$acc2;
  my $out=($t0,$t1,$t2,$s[0])[$i];

	if ($i==3) {
		$tmp0=$s[1];
		$tmp1=$s[2];
		$tmp2=$s[3];
	}
	$code.="	movzb	".&lo($s[0]).",$out\n";
	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
	$code.="	lea	16($key),$key\n"	if ($i==0);

	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
	$code.="	mov	0($sbox,$out,8),$out\n";

	$code.="	shr	\$16,$tmp1\n";
	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
	$code.="	xor	3($sbox,$tmp0,8),$out\n";

	$code.="	movzb	".&lo($tmp1).",$tmp1\n";
	$code.="	shr	\$24,$tmp2\n";
	$code.="	xor	4*$i($key),$out\n";

	$code.="	xor	2($sbox,$tmp1,8),$out\n";
	$code.="	xor	1($sbox,$tmp2,8),$out\n";

	$code.="	mov	$t0,$s[1]\n"		if ($i==3);
	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
	$code.="	mov	$t2,$s[3]\n"		if ($i==3);
	$code.="\n";
}
# enclast(I, S0, S1, S2, S3): append the code computing output word I
# (0..3) of the final encryption round to $code ("horizontal" schedule).
#
# Arguments and register usage are the same as for encstep().  Each of
# the four bytes of the result is a masked table word:
#	lane 0: 2(table,S0.b0) & 0x000000ff
#	lane 1: 0(table,S1.b1) & 0x0000ff00
#	lane 2: 0(table,S2.b2) & 0x00ff0000
#	lane 3: 2(table,S3.b3) & 0xff000000
# The round key is NOT applied here; the caller xors it in afterwards.
#
# Two instructions had been lost and are restored here:
#   - the table load for lane 0: "and $0xff,out" was being applied to
#     the raw index byte, so lane 0 was never substituted;
#   - "shr $24,tmp2": tmp2 is a full copy of S3 and must be reduced to
#     its top byte before it is used as an index (declast() shows the
#     same step).
sub enclast()
{ my ($i,@s)=@_;
  my $tmp0=$acc0;
  my $tmp1=$acc1;
  my $tmp2=$acc2;
  my $out=($t0,$t1,$t2,$s[0])[$i];

	if ($i==3) {
		$tmp0=$s[1];
		$tmp1=$s[2];
		$tmp2=$s[3];
	}
	$code.="	movzb	".&lo($s[0]).",$out\n";
	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);

	$code.="	mov	2($sbox,$out,8),$out\n";
	$code.="	shr	\$16,$tmp1\n";
	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);

	$code.="	and	\$0x000000ff,$out\n";
	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
	$code.="	movzb	".&lo($tmp1).",$tmp1\n";
	$code.="	shr	\$24,$tmp2\n";

	$code.="	mov	0($sbox,$tmp0,8),$tmp0\n";
	$code.="	mov	0($sbox,$tmp1,8),$tmp1\n";
	$code.="	mov	2($sbox,$tmp2,8),$tmp2\n";

	$code.="	and	\$0x0000ff00,$tmp0\n";
	$code.="	and	\$0x00ff0000,$tmp1\n";
	$code.="	and	\$0xff000000,$tmp2\n";

	$code.="	xor	$tmp0,$out\n";
	$code.="	mov	$t0,$s[1]\n"		if ($i==3);
	$code.="	xor	$tmp1,$out\n";
	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
	$code.="	xor	$tmp2,$out\n";
	$code.="	mov	$t2,$s[3]\n"		if ($i==3);
	$code.="\n";
}
# Emit _x86_64_AES_encrypt, the table-driven encryption core.
#
# Prologue: xor the state in $s0..$s3 with the first round key, load the
# round count from 240($key) and decrement it.  Loop body: one full round
# (encvert() or four encstep() calls, selected by $verticalspin), repeated
# until $rnds reaches zero.  Epilogue: the final round.
#
# The heredocs around the Perl dispatch code had lost their terminators,
# so the "if ($verticalspin)" statements were emitted as assembler text
# and the trailing "}" was a syntax error.  The ".Lenc_loop" label
# targeted by the jmp and the "sub/jnz" that closes the loop were missing
# as well; without them $rnds was loaded but never tested.
$code.=<<___;
.type	_x86_64_AES_encrypt,\@abi-omnipotent
.align	16
_x86_64_AES_encrypt:
	xor	0($key),$s0			# xor with key
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3

	mov	240($key),$rnds			# load key->rounds
	sub	\$1,$rnds
	jmp	.Lenc_loop
.align	16
.Lenc_loop:
___
	if ($verticalspin) { &encvert(); }
	else {	&encstep(0,$s0,$s1,$s2,$s3);
		&encstep(1,$s1,$s2,$s3,$s0);
		&encstep(2,$s2,$s3,$s0,$s1);
		&encstep(3,$s3,$s0,$s1,$s2);
	}
$code.=<<___;
		sub	\$1,$rnds
		jnz	.Lenc_loop
___
	if ($verticalspin) { &enclastvert(); }
	else {	&enclast(0,$s0,$s1,$s2,$s3);
		&enclast(1,$s1,$s2,$s3,$s0);
		&enclast(2,$s2,$s3,$s0,$s1);
		&enclast(3,$s3,$s0,$s1,$s2);
		# enclast() does not apply the round key itself
		$code.=<<___;
		xor	16+0($key),$s0		# xor with key
		xor	16+4($key),$s1
		xor	16+8($key),$s2
		xor	16+12($key),$s3
___
	}
$code.=<<___;
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_encrypt,.-_x86_64_AES_encrypt
___
# it's possible to implement this by shifting tN by 8, filling least
# significant byte with byte load and finally bswap-ing at the end,
# but such partial register load kills Core 2...
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
# enccompactvert(): append one byte-substitution step of the "compact"
# encryption path to $code.
#
# All lookups are single byte loads with scale 1 from the 256-byte table
# at $sbox (presumably the Te4 copy the callers prefetch -- confirm at
# the call site).  Each output word is put together from one substituted
# byte of every state word, shifted into lane 0..3:
#	t0: s0.b0 s1.b1 s2.b2 s3.b3	t1: s1.b0 s2.b1 s3.b2 s0.b3
#	t2: s2.b0 s3.b1 s0.b2 s1.b3	t3: s3.b0 s0.b1 s1.b2 s2.b3
# Towards the end $s1 and $s2 are shifted down to their top byte and
# $s2/$s3 are reused as destinations for the last two lane-3 lookups.
# The result is left in $s0..$s3; no round key is applied here.
# Temporaries: $t0..$t2, $acc0..$acc2, %r8d, %r9d and %r13d.
sub enccompactvert()
{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
$code.=<<___;
movzb `&lo("$s0")`,$t0
movzb `&lo("$s1")`,$t1
movzb `&lo("$s2")`,$t2
movzb ($sbox,$t0,1),$t0
movzb ($sbox,$t1,1),$t1
movzb ($sbox,$t2,1),$t2
movzb `&lo("$s3")`,$t3
movzb `&hi("$s1")`,$acc0
movzb `&hi("$s2")`,$acc1
movzb ($sbox,$t3,1),$t3
movzb ($sbox,$acc0,1),$t4 #$t0
movzb ($sbox,$acc1,1),$t5 #$t1
movzb `&hi("$s3")`,$acc2
movzb `&hi("$s0")`,$acc0
shr \$16,$s2
movzb ($sbox,$acc2,1),$acc2 #$t2
movzb ($sbox,$acc0,1),$acc0 #$t3
shr \$16,$s3
movzb `&lo("$s2")`,$acc1
shl \$8,$t4
shl \$8,$t5
movzb ($sbox,$acc1,1),$acc1 #$t0
xor $t4,$t0
xor $t5,$t1
movzb `&lo("$s3")`,$t4
shr \$16,$s0
shr \$16,$s1
movzb `&lo("$s0")`,$t5
shl \$8,$acc2
shl \$8,$acc0
movzb ($sbox,$t4,1),$t4 #$t1
movzb ($sbox,$t5,1),$t5 #$t2
xor $acc2,$t2
xor $acc0,$t3
movzb `&lo("$s1")`,$acc2
movzb `&hi("$s3")`,$acc0
shl \$16,$acc1
movzb ($sbox,$acc2,1),$acc2 #$t3
movzb ($sbox,$acc0,1),$acc0 #$t0
xor $acc1,$t0
movzb `&hi("$s0")`,$acc1
shr \$8,$s2
shr \$8,$s1
movzb ($sbox,$acc1,1),$acc1 #$t1
movzb ($sbox,$s2,1),$s3 #$t3
movzb ($sbox,$s1,1),$s2 #$t2
shl \$16,$t4
shl \$16,$t5
shl \$16,$acc2
xor $t4,$t1
xor $t5,$t2
xor $acc2,$t3
shl \$24,$acc0
shl \$24,$acc1
shl \$24,$s3
xor $acc0,$t0
shl \$24,$s2
xor $acc1,$t1
mov $t0,$s0
mov $t1,$s1
xor $t2,$s2
xor $t3,$s3
___
}
# enctransform_ref(SN): append the straightforward, one-word-at-a-time
# column transform for the 32-bit register SN to $code.
#
# Step 1 builds r2, the per-byte "doubled" value of SN: bytes whose top
# bit is set (mask 0x80808080) are turned into 0x1b via (m - (m>>7)) &
# 0x1b1b1b1b, and that is xored into (SN+SN) & 0xfefefefe.
# Step 2 combines it with rotations of the original word:
#	SN = rol(SN ^ r2, 24) ^ r2 ^ ror(SN, 16) ^ ror(SN, 24)
# Temporaries: %r8d, %r9d and %r13d.
sub enctransform_ref()
{ my $sn = shift;
my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");
$code.=<<___;
mov $sn,$acc
and \$0x80808080,$acc
mov $acc,$tmp
shr \$7,$tmp
lea ($sn,$sn),$r2
sub $tmp,$acc
and \$0xfefefefe,$r2
and \$0x1b1b1b1b,$acc
mov $sn,$tmp
xor $acc,$r2
xor $r2,$sn
rol \$24,$sn
xor $r2,$sn
ror \$16,$tmp
xor $tmp,$sn
ror \$8,$tmp
xor $tmp,$sn
___
}
# unlike decrypt case it does not pay off to parallelize enctransform
# enctransform(): append the column transform for all four state words
# to $code, processing $s0/$s1 and then $s2/$s3 as interleaved pairs.
#
# For every word s the same computation as in enctransform_ref() is
# performed:
#	r2 = per-byte doubling of s (0x80808080/0xfefefefe/0x1b1b1b1b masks)
#	s  = rol(s ^ r2, 24) ^ r2 ^ ror(s_orig, 16) ^ ror(s_orig, 24)
# $t0..$t3 keep the original words ($t3 lives in $acc2), %r8d/%r9d hold
# the two r2 values.  Four loads from $sbox at the end only serve to
# prefetch the table for the following substitution step; the registers
# they write ($acc0, $acc1, %r8d, %r9d) are dead at that point.
#
# The $s2/$s3 half had lost its two rotations and one of its two xor
# pairs, so the saved words were xored in once and unrotated, unlike the
# $s0/$s1 half and enctransform_ref().  They are restored here and
# interleaved with the prefetch loads.
sub enctransform()
{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");

$code.=<<___;
	mov	$s0,$acc0
	mov	$s1,$acc1
	and	\$0x80808080,$acc0
	and	\$0x80808080,$acc1
	mov	$acc0,$t0
	mov	$acc1,$t1
	shr	\$7,$t0
	lea	($s0,$s0),$r20
	shr	\$7,$t1
	lea	($s1,$s1),$r21
	sub	$t0,$acc0
	sub	$t1,$acc1
	and	\$0xfefefefe,$r20
	and	\$0xfefefefe,$r21
	and	\$0x1b1b1b1b,$acc0
	and	\$0x1b1b1b1b,$acc1
	mov	$s0,$t0
	mov	$s1,$t1
	xor	$acc0,$r20
	xor	$acc1,$r21

	xor	$r20,$s0
	xor	$r21,$s1
	 mov	$s2,$acc0
	 mov	$s3,$acc1
	rol	\$24,$s0
	rol	\$24,$s1
	 and	\$0x80808080,$acc0
	 and	\$0x80808080,$acc1
	xor	$r20,$s0
	xor	$r21,$s1
	 mov	$acc0,$t2
	 mov	$acc1,$t3
	ror	\$16,$t0
	ror	\$16,$t1
	 shr	\$7,$t2
	 lea	($s2,$s2),$r20
	xor	$t0,$s0
	xor	$t1,$s1
	 shr	\$7,$t3
	 lea	($s3,$s3),$r21
	ror	\$8,$t0
	ror	\$8,$t1
	 sub	$t2,$acc0
	 sub	$t3,$acc1
	xor	$t0,$s0
	xor	$t1,$s1

	and	\$0xfefefefe,$r20
	and	\$0xfefefefe,$r21
	and	\$0x1b1b1b1b,$acc0
	and	\$0x1b1b1b1b,$acc1
	mov	$s2,$t2
	mov	$s3,$t3
	xor	$acc0,$r20
	xor	$acc1,$r21

	xor	$r20,$s2
	xor	$r21,$s3
	rol	\$24,$s2
	rol	\$24,$s3
	xor	$r20,$s2
	xor	$r21,$s3
	mov	0($sbox),$acc0			# prefetch Te4
	ror	\$16,$t2
	ror	\$16,$t3
	mov	64($sbox),$acc1
	xor	$t2,$s2
	xor	$t3,$s3
	mov	128($sbox),$r20
	ror	\$8,$t2
	ror	\$8,$t3
	mov	192($sbox),$r21
	xor	$t2,$s2
	xor	$t3,$s3
___
}
# Emit _x86_64_AES_encrypt_compact, the small-table encryption core.
#
# Prologue: touch the 256-byte table at $sbox in 32-byte steps (prefetch).
# Loop: xor the round key at $key into $s0..$s3, advance $key by 16, run
# the byte substitution step (enccompactvert()), and -- unless $key has
# reached the end-of-schedule pointer -- the column transform
# (enctransform()).  Epilogue: xor the last round key.
#
# The end pointer is the value AES_encrypt pushes just before $key, which
# is found at 16(%rsp) once the return address is on the stack.
#
# The "cmp" that feeds the "je" had been lost, so the branch tested the
# flags left by the last xor of enccompactvert(), and the missing heredoc
# terminator caused "&enctransform();" to be emitted as assembler text
# instead of being called.
$code.=<<___;
.type	_x86_64_AES_encrypt_compact,\@abi-omnipotent
.align	16
_x86_64_AES_encrypt_compact:
	lea	128($sbox),$inp			# size optimization
	mov	0-128($inp),$acc1		# prefetch Te4
	mov	32-128($inp),$acc2
	mov	64-128($inp),$t0
	mov	96-128($inp),$t1
	mov	128-128($inp),$acc1
	mov	160-128($inp),$acc2
	mov	192-128($inp),$t0
	mov	224-128($inp),$t1
	jmp	.Lenc_loop_compact
.align	16
.Lenc_loop_compact:
		xor	0($key),$s0		# xor with key
		xor	4($key),$s1
		xor	8($key),$s2
		xor	12($key),$s3
		lea	16($key),$key
___
		&enccompactvert();
$code.=<<___;
		cmp	16(%rsp),$key
		je	.Lenc_compact_done
___
		&enctransform();
$code.=<<___;
	jmp	.Lenc_loop_compact
.align	16
.Lenc_compact_done:
	xor	0($key),$s0
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
___
# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
# Emit the public entry point.  It saves the callee-saved registers,
# carves out a 64-byte aligned frame positioned relative to the key
# schedule, loads the 16-byte input block into $s0..$s3, pushes the
# end-of-schedule pointer ($key + 16*rounds) and $key for the core
# routine, selects one of the table copies at .LAES_Te+2048 and calls
# _x86_64_AES_encrypt_compact.  Afterwards the output pointer and the
# original %rsp are fetched back from the frame and the block is stored.
#
# The code used $key and $rnds without ever loading them: the key
# schedule arrives in %rdx (third argument) and the round count sits at
# offset 240 of it, as in _x86_64_AES_encrypt.  Both loads are restored;
# the copy of %rdx has to happen before $s3 (%edx) receives input data.
$code.=<<___;
.globl	AES_encrypt
.type	AES_encrypt,\@function,3
.align	16
AES_encrypt:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	# allocate frame "above" key schedule
	mov	%rsp,%rax
	lea	-63(%rdx),%rcx
	and	\$-64,%rsp
	sub	%rsp,%rcx
	neg	%rcx
	and	\$0x3c0,%rcx
	sub	%rcx,%rsp

	push	%rax		# save real stack pointer
	push	%rsi		# save out

	mov	%rdx,$key
	mov	240($key),$rnds	# load key->rounds

	mov	0(%rdi),$s0	# load input vector
	mov	4(%rdi),$s1
	mov	8(%rdi),$s2
	mov	12(%rdi),$s3

	shl	\$4,$rnds
	lea	($key,$rnds),%rbp
	push	%rbp
	push	$key

	# pick Te4 copy which can't "overlap" with stack frame or key schedule
	lea	.LAES_Te+2048(%rip),$sbox
	sub	$sbox,%rbp
	and	\$0x300,%rbp
	lea	($sbox,%rbp),$sbox

	call	_x86_64_AES_encrypt_compact

	mov	16(%rsp),$out	# restore out
	mov	24(%rsp),%rsp
	mov	$s0,0($out)	# write output vector
	mov	$s1,4($out)
	mov	$s2,8($out)
	mov	$s3,12($out)

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
.size	AES_encrypt,.-AES_encrypt
___
#------------------------------------------------------------------#
# decvert(): append one full decryption round in the "vertical"
# (interleaved) schedule to $code.
#
# Every new state word is the xor of four lookups into the 8-byte-stride
# table at $sbox, one byte taken from each state word:
#	t0 = T0[s0.b0] ^ T3[s3.b1] ^ T2[s2.b2] ^ T1[s1.b3]
#	t1 = T0[s1.b0] ^ T3[s0.b1] ^ T2[s3.b2] ^ T1[s2.b3]
#	t2 = T0[s2.b0] ^ T3[s1.b1] ^ T2[s0.b2] ^ T1[s3.b3]
#	t3 = T0[s3.b0] ^ T3[s2.b1] ^ T2[s1.b2] ^ T1[s0.b3]
# (Tn = table read at byte offset n).  Each state word is shifted right
# by 16 once its bytes 0/1 are consumed; $key is advanced by 16 and the
# four words found there are xored into the result, left in $s0..$s3.
# %r8d serves as fourth temporary.
#
# Two instructions had been lost and are restored here:
#   - "shr $16,$s0": without it the b2/b3 lookups of $s0 read its bytes
#     0/1 a second time;
#   - "mov 12($key),$s3": $s0..$s2 were reloaded from the round key but
#     $s3 was not, so t3 got xored into the shifted remains of $s3.
sub decvert()
{ my $t3="%r8d";	# zaps $inp!

$code.=<<___;
	# favor 3-way issue Opteron pipeline...
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	mov	0($sbox,$acc0,8),$t0
	mov	0($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t2

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	movzb	`&lo("$s3")`,$acc2
	xor	3($sbox,$acc0,8),$t0
	xor	3($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t3

	movzb	`&hi("$s1")`,$acc0
	shr	\$16,$s0
	movzb	`&hi("$s2")`,$acc2
	xor	3($sbox,$acc0,8),$t2
	shr	\$16,$s3
	xor	3($sbox,$acc2,8),$t3

	shr	\$16,$s1
	lea	16($key),$key
	shr	\$16,$s2

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	xor	2($sbox,$acc0,8),$t0
	xor	2($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t2

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	movzb	`&lo("$s1")`,$acc2
	xor	1($sbox,$acc0,8),$t0
	xor	1($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t3

	movzb	`&hi("$s3")`,$acc0
	mov	12($key),$s3
	movzb	`&hi("$s0")`,$acc2
	xor	1($sbox,$acc0,8),$t2
	mov	0($key),$s0
	xor	1($sbox,$acc2,8),$t3

	xor	$t0,$s0
	mov	4($key),$s1
	mov	8($key),$s2
	xor	$t2,$s2
	xor	$t1,$s1
	xor	$t3,$s3
___
}
# declastvert(): append the final decryption round ("vertical" schedule)
# to $code.
#
# $sbox is temporarily advanced by 2048 so that the byte table stored
# there can be addressed without a displacement.  Each output word is
# assembled from four byte lookups shifted into lane 0..3:
#	t0: s0.b0 s3.b1 s2.b2 s1.b3	t1: s1.b0 s0.b1 s3.b2 s2.b3
#	t2: s2.b0 s1.b1 s0.b2 s3.b3	t3: s3.b0 s2.b1 s1.b2 s0.b3
# Every state word is shifted right by 16 once its bytes 0/1 are
# consumed.  The result is xored with the four words at
# 16($key)..16+12($key) and left in $s0..$s3.  %r8d serves as fourth
# temporary.
#
# Four instructions had been lost and are restored here:
#   - "shr $16,$s0": without it the b2/b3 lookups of $s0 re-read its
#     bytes 0/1;
#   - "mov 16+12($key),$s3" and "mov 16+0($key),$s0": only $s1/$s2 were
#     reloaded from the round key, so t0/t3 were xored into leftovers of
#     the old state;
#   - "lea -2048($sbox),$sbox": undoes the adjustment made on entry, so
#     that $sbox is preserved exactly as in the declast() code path,
#     which addresses the table as 2048($sbox,...) without modifying it.
sub declastvert()
{ my $t3="%r8d";	# zaps $inp!

$code.=<<___;
	lea	2048($sbox),$sbox	# size optimization
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	movzb	($sbox,$acc0,1),$t0
	movzb	($sbox,$acc1,1),$t1
	movzb	($sbox,$acc2,1),$t2

	movzb	`&lo("$s3")`,$acc0
	movzb	`&hi("$s3")`,$acc1
	movzb	`&hi("$s0")`,$acc2
	movzb	($sbox,$acc0,1),$t3
	movzb	($sbox,$acc1,1),$acc1	#$t0
	movzb	($sbox,$acc2,1),$acc2	#$t1

	shl	\$8,$acc1
	shl	\$8,$acc2

	xor	$acc1,$t0
	xor	$acc2,$t1
	shr	\$16,$s3

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	shr	\$16,$s0
	movzb	($sbox,$acc0,1),$acc0	#$t2
	movzb	($sbox,$acc1,1),$acc1	#$t3

	shl	\$8,$acc0
	shl	\$8,$acc1
	shr	\$16,$s1
	xor	$acc0,$t2
	xor	$acc1,$t3
	shr	\$16,$s2

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	movzb	($sbox,$acc0,1),$acc0	#$t0
	movzb	($sbox,$acc1,1),$acc1	#$t1
	movzb	($sbox,$acc2,1),$acc2	#$t2

	shl	\$16,$acc0
	shl	\$16,$acc1
	shl	\$16,$acc2

	xor	$acc0,$t0
	xor	$acc1,$t1
	xor	$acc2,$t2

	movzb	`&lo("$s1")`,$acc0
	movzb	`&hi("$s1")`,$acc1
	movzb	`&hi("$s2")`,$acc2
	movzb	($sbox,$acc0,1),$acc0	#$t3
	movzb	($sbox,$acc1,1),$acc1	#$t0
	movzb	($sbox,$acc2,1),$acc2	#$t1

	shl	\$16,$acc0
	shl	\$24,$acc1
	shl	\$24,$acc2

	xor	$acc0,$t3
	xor	$acc1,$t0
	xor	$acc2,$t1

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	mov	16+12($key),$s3
	movzb	($sbox,$acc0,1),$acc0	#$t2
	movzb	($sbox,$acc1,1),$acc1	#$t3
	mov	16+0($key),$s0

	shl	\$24,$acc0
	shl	\$24,$acc1

	xor	$acc0,$t2
	xor	$acc1,$t3

	mov	16+4($key),$s1
	mov	16+8($key),$s2
	lea	-2048($sbox),$sbox
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}
# decstep(I, S0, S1, S2, S3): append the code computing output word I
# (0..3) of one decryption round to $code ("horizontal" schedule).
#
#   I	index of the word being produced.
#   S0..S3	state registers, already rotated by the caller so that S0
#	supplies byte 0, S1 byte 1, S2 byte 2 and S3 byte 3.
#
# Words 0..2 are collected in $t0..$t2 using $acc0..$acc2 as scratch.
# For I==3 the other three state registers are dead, so they serve as
# scratch, the result is built in place in S0, and $t2,$t1,$t0 are
# finally copied into S1,S2,S3.  The round key is NOT applied here; the
# caller xors it in after the fourth step.
#
# The offset-0 table load for byte 0 had been lost: "out" still held the
# raw index byte when the other three table words were xored into it.
# declast() shows the corresponding load at the same position.
sub decstep()
{ my ($i,@s) = @_;
  my $tmp0=$acc0;
  my $tmp1=$acc1;
  my $tmp2=$acc2;
  my $out=($t0,$t1,$t2,$s[0])[$i];

	$code.="	mov	$s[0],$out\n"		if ($i!=3);
			$tmp1=$s[2]			if ($i==3);
	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
	$code.="	and	\$0xFF,$out\n";

	$code.="	mov	0($sbox,$out,8),$out\n";
	$code.="	shr	\$16,$tmp1\n";
			$tmp2=$s[3]			if ($i==3);
	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);

			$tmp0=$s[1]			if ($i==3);
	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
	$code.="	and	\$0xFF,$tmp1\n";
	$code.="	shr	\$24,$tmp2\n";

	$code.="	xor	3($sbox,$tmp0,8),$out\n";
	$code.="	xor	2($sbox,$tmp1,8),$out\n";
	$code.="	xor	1($sbox,$tmp2,8),$out\n";

	$code.="	mov	$t2,$s[1]\n"		if ($i==3);
	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
	$code.="	mov	$t0,$s[3]\n"		if ($i==3);
	$code.="\n";
}
# declast(I, S0, S1, S2, S3): append the code computing output word I
# (0..3) of the final decryption round to $code.
#
# The four source bytes (byte 0 of S0, byte 1 of S1, byte 2 of S2 and
# byte 3 of S3) are each replaced by a byte load from the table at
# 2048($sbox), shifted into lane 0..3 and xored together.  Words 0..2 go
# to $t0..$t2 with $acc0..$acc2 as scratch; for I==3 the remaining state
# registers are used as scratch, the word is built in place in S0, and
# $t2,$t1,$t0 are then copied into S1,S2,S3.  No round key is applied.
sub declast()
{ my ($i,@s)=@_;
  my $inplace = ($i==3);
  my $out = ($t0,$t1,$t2,$s[0])[$i];
  my ($tmp0,$tmp1,$tmp2) = $inplace ? ($s[1],$s[2],$s[3])
				    : ($acc0,$acc1,$acc2);
  my $byte1 = &hi($s[1]);
  my @insn;

  # lane 0, plus copies of the registers that must stay intact
  push(@insn, "mov $s[0],$out",
	      "mov $s[2],$tmp1")		unless ($inplace);
  push(@insn, "and \$0xFF,$out",
	      "movzb 2048($sbox,$out,1),$out",
	      "shr \$16,$tmp1");
  push(@insn, "mov $s[3],$tmp2")		unless ($inplace);

  # isolate the index bytes of lanes 1..3 and substitute them
  push(@insn, "movzb $byte1,$tmp0",
	      "and \$0xFF,$tmp1",
	      "shr \$24,$tmp2",
	      "movzb 2048($sbox,$tmp0,1),$tmp0",
	      "movzb 2048($sbox,$tmp1,1),$tmp1",
	      "movzb 2048($sbox,$tmp2,1),$tmp2",
	      "shl \$8,$tmp0",
	      "shl \$16,$tmp1",
	      "shl \$24,$tmp2");

  # merge the lanes; in the last step each scratch register is refilled
  # with its finished word as soon as it has been consumed
  my @refill = ([$tmp0,$t2,$s[1]], [$tmp1,$t1,$s[2]], [$tmp2,$t0,$s[3]]);
  foreach my $lane (@refill) {
	my ($src,$done,$dst) = @$lane;
	push(@insn, "xor $src,$out");
	push(@insn, "mov $done,$dst")		if ($inplace);
  }

  $code .= join("", map { " $_\n" } @insn) . "\n";
}
# Emit _x86_64_AES_decrypt, the table-driven decryption core.
#
# Prologue: xor the state in $s0..$s3 with the first round key, load the
# round count from 240($key) and decrement it.  Loop body: one full round
# (decvert(), or four decstep() calls followed by the round key xor,
# selected by $verticalspin), repeated until $rnds reaches zero.
# Epilogue: the final round.
#
# The heredocs around the Perl dispatch code had lost their terminators
# (and the last else-branch its closing brace), so the Perl statements
# were emitted as assembler text.  The ".Ldec_loop" label targeted by the
# jmp and the "sub/jnz" that closes the loop were missing as well;
# without them $rnds was loaded but never tested.
$code.=<<___;
.type	_x86_64_AES_decrypt,\@abi-omnipotent
.align	16
_x86_64_AES_decrypt:
	xor	0($key),$s0			# xor with key
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3

	mov	240($key),$rnds			# load key->rounds
	sub	\$1,$rnds
	jmp	.Ldec_loop
.align	16
.Ldec_loop:
___
	if ($verticalspin) { &decvert(); }
	else {	&decstep(0,$s0,$s3,$s2,$s1);
		&decstep(1,$s1,$s0,$s3,$s2);
		&decstep(2,$s2,$s1,$s0,$s3);
		&decstep(3,$s3,$s2,$s1,$s0);
		# decstep() does not apply the round key itself
		$code.=<<___;
		lea	16($key),$key
		xor	0($key),$s0			# xor with key
		xor	4($key),$s1
		xor	8($key),$s2
		xor	12($key),$s3
___
	}
$code.=<<___;
		sub	\$1,$rnds
		jnz	.Ldec_loop
___
	if ($verticalspin) { &declastvert(); }
	else {	&declast(0,$s0,$s3,$s2,$s1);
		&declast(1,$s1,$s0,$s3,$s2);
		&declast(2,$s2,$s1,$s0,$s3);
		&declast(3,$s3,$s2,$s1,$s0);
		# declast() does not apply the round key itself
		$code.=<<___;
		xor	16+0($key),$s0			# xor with key
		xor	16+4($key),$s1
		xor	16+8($key),$s2
		xor	16+12($key),$s3
___
	}
$code.=<<___;
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_decrypt,.-_x86_64_AES_decrypt
___
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# deccompactvert(): append one byte-substitution step of the "compact"
# decryption path to $code.
#
# All lookups are single byte loads with scale 1 from the 256-byte table
# at $sbox (presumably the inverse S-box copy chosen by the caller --
# confirm at the call site).  Each output word is put together from one
# substituted byte of every state word, shifted into lane 0..3:
#	t0: s0.b0 s3.b1 s2.b2 s1.b3	t1: s1.b0 s0.b1 s3.b2 s2.b3
#	t2: s2.b0 s1.b1 s0.b2 s3.b3	t3: s3.b0 s2.b1 s1.b2 s0.b3
# Towards the end $s1, $s2 and $s3 are reused as destinations for lane-3
# lookups once their own bytes have been consumed, and $s0 is shifted
# down to its top byte for the last lookup.  The result is left in
# $s0..$s3; no round key is applied here.
# Temporaries: $t0..$t2, $acc0..$acc2, %r8d, %r9d and %r13d.
sub deccompactvert()
{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
$code.=<<___;
movzb `&lo("$s0")`,$t0
movzb `&lo("$s1")`,$t1
movzb `&lo("$s2")`,$t2
movzb ($sbox,$t0,1),$t0
movzb ($sbox,$t1,1),$t1
movzb ($sbox,$t2,1),$t2
movzb `&lo("$s3")`,$t3
movzb `&hi("$s3")`,$acc0
movzb `&hi("$s0")`,$acc1
movzb ($sbox,$t3,1),$t3
movzb ($sbox,$acc0,1),$t4 #$t0
movzb ($sbox,$acc1,1),$t5 #$t1
movzb `&hi("$s1")`,$acc2
movzb `&hi("$s2")`,$acc0
shr \$16,$s2
movzb ($sbox,$acc2,1),$acc2 #$t2
movzb ($sbox,$acc0,1),$acc0 #$t3
shr \$16,$s3
movzb `&lo("$s2")`,$acc1
shl \$8,$t4
shl \$8,$t5
movzb ($sbox,$acc1,1),$acc1 #$t0
xor $t4,$t0
xor $t5,$t1
movzb `&lo("$s3")`,$t4
shr \$16,$s0
shr \$16,$s1
movzb `&lo("$s0")`,$t5
shl \$8,$acc2
shl \$8,$acc0
movzb ($sbox,$t4,1),$t4 #$t1
movzb ($sbox,$t5,1),$t5 #$t2
xor $acc2,$t2
xor $acc0,$t3
movzb `&lo("$s1")`,$acc2
movzb `&hi("$s1")`,$acc0
shl \$16,$acc1
movzb ($sbox,$acc2,1),$acc2 #$t3
movzb ($sbox,$acc0,1),$acc0 #$t0
xor $acc1,$t0
movzb `&hi("$s2")`,$acc1
shl \$16,$t4
shl \$16,$t5
movzb ($sbox,$acc1,1),$s1 #$t1
xor $t4,$t1
xor $t5,$t2
movzb `&hi("$s3")`,$acc1
shr \$8,$s0
shl \$16,$acc2
movzb ($sbox,$acc1,1),$s2 #$t2
movzb ($sbox,$s0,1),$s3 #$t3
xor $acc2,$t3
shl \$24,$acc0
shl \$24,$s1
shl \$24,$s2
xor $acc0,$t0
shl \$24,$s3
xor $t1,$s1
mov $t0,$s0
xor $t2,$s2
xor $t3,$s3
___
}
# parallelized version! input is pair of 64-bit values: %rax=s1.s0
# and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1,
# %ecx=s2 and %edx=s3.
sub dectransform()
{ my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx");
my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx");