aesni-x86_64.pl

#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
# details].
#
# Performance.
#
# Given aes(enc|dec) instructions' latency asymptotic performance for
# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
# processed with 128-bit key. And given their throughput asymptotic
# performance for parallelizable modes is 1.25 cycles per byte. Being
# asymptotic limit it's not something you commonly achieve in reality,
# but how close does one get? Below are results collected for
# different modes and block sized. Pairs of numbers are for en-/
# decryption.
#
#	16-byte     64-byte     256-byte    1-KB        8-KB
# ECB	4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26	1.26/1.26
# CTR	5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
# CBC	4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
# CCM	5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07   
# OFB	5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
# CFB	5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
#
# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
# The results were collected with specially crafted speed.c benchmark
# in order to compare them with results reported in "Intel Advanced
# Encryption Standard (AES) New Instruction Set" White Paper Revision
# 3.0 dated May 2010. All above results are consistently better. This
# module also provides better performance for block sizes smaller than
# 128 bytes in points *not* represented in the above table.
#
# Looking at the results for 8-KB buffer.
#
# CFB and OFB results are far from the limit, because implementation
# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
# single-block aesni_encrypt, which is not the most optimal way to go.
# CBC encrypt result is unexpectedly high and there is no documented
# explanation for it. Seemingly there is a small penalty for feeding
# the result back to AES unit the way it's done in CBC mode. There is
# nothing one can do and the result appears optimal. CCM result is
# identical to CBC, because CBC-MAC is essentially CBC encrypt without
# saving output. CCM CTR "stays invisible," because it's neatly
# interleaved wih CBC-MAC. This provides ~30% improvement over
# "straghtforward" CCM implementation with CTR and CBC-MAC performed
# disjointly. Parallelizable modes practically achieve the theoretical
# limit.
#
# Looking at how results vary with buffer size.
#
# Curves are practically saturated at 1-KB buffer size. In most cases
# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
# CTR curve doesn't follow this pattern and is "slowest" changing one
# with "256-byte" result being 87% of "8-KB." This is because overhead
# in CTR mode is most computationally intensive. Small-block CCM
# decrypt is slower than encrypt, because first CTR and last CBC-MAC
# iterations can't be interleaved.
#
# Results for 192- and 256-bit keys.
#
# EVP-free results were observed to scale perfectly with number of
# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
# are a tad smaller, because the above mentioned penalty biases all
# results by same constant value. In similar way function call
# overhead affects small-block performance, as well as OFB and CFB
# results. Differences are not large, most common coefficients are
# 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one
# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...

# January 2011
#
# While Westmere processor features 6 cycles latency for aes[enc|dec]
# instructions, which can be scheduled every second cycle, Sandy
# Bridge spends 8 cycles per instruction, but it can schedule them
# every cycle. This means that code targeting Westmere would perform
# suboptimally on Sandy Bridge. Therefore this update.
#
# In addition, non-parallelizable CBC encrypt (as well as CCM) is
# optimized. Relative improvement might appear modest, 8% on Westmere,
# but in absolute terms it's 3.77 cycles per byte encrypted with
# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
# should be compared to asymptotic limits of 3.75 for Westmere and
# 5.00 for Sandy Bridge. Actually, the fact that they get this close
# to asymptotic limits is quite amazing. Indeed, the limit is
# calculated as latency times number of rounds, 10 for 128-bit key,
# and divided by 16, the number of bytes in block, or in other words
# it accounts *solely* for aesenc instructions. But there are extra
# instructions, and numbers so close to the asymptotic limits mean
# that it's as if it takes as little as *one* additional cycle to
# execute all of them. How is it possible? It is possible thanks to
# out-of-order execution logic, which manages to overlap post-
# processing of previous block, things like saving the output, with
# actual encryption of current block, as well as pre-processing of
# current block, things like fetching input and xor-ing it with
# 0-round element of the key schedule, with actual encryption of
# previous block. Keep this in mind...
#
# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
# performance is achieved by interleaving instructions working on
# independent blocks. In which case asymptotic limit for such modes
# can be obtained by dividing above mentioned numbers by AES
# instructions' interleave factor. Westmere can execute at most 3 
# instructions at a time, meaning that optimal interleave factor is 3,
# and that's where the "magic" number of 1.25 come from. "Optimal
# interleave factor" means that increase of interleave factor does
# not improve performance. The formula has proven to reflect reality
# pretty well on Westmere... Sandy Bridge on the other hand can
# execute up to 8 AES instructions at a time, so how does varying
# interleave factor affect the performance? Here is table for ECB
# (numbers are cycles per byte processed with 128-bit key):
#
# instruction interleave factor		3x	6x	8x
# theoretical asymptotic limit		1.67	0.83	0.625
# measured performance for 8KB block	1.05	0.86	0.84
#
# "as if" interleave factor		4.7x	5.8x	6.0x
#
# Further data for other parallelizable modes:
#
# CBC decrypt				1.16	0.93	0.93
# CTR					1.14	0.91	n/a
#
# Well, given 3x column it's probably inappropriate to call the limit
# asymptotic, if it can be surpassed, isn't it? What happens there?
# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
# magic is responsible for this. Processor overlaps not only the
# additional instructions with AES ones, but even AES instuctions
# processing adjacent triplets of independent blocks. In the 6x case
# additional instructions  still claim disproportionally small amount
# of additional cycles, but in 8x case number of instructions must be
# a tad too high for out-of-order logic to cope with, and AES unit
# remains underutilized... As you can see 8x interleave is hardly
# justifiable, so there no need to feel bad that 32-bit aesni-x86.pl
# utilizies 6x interleave because of limited register bank capacity.
#
# Higher interleave factors do have negative impact on Westmere
# performance. While for ECB mode it's negligible ~1.5%, other
# parallelizables perform ~5% worse, which is outweighed by ~25%
# improvement on Sandy Bridge. To balance regression on Westmere
# CTR mode was implemented with 6x aesenc interleave factor.

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
# in CTR mode AES instruction interleave factor was chosen to be 6x.

$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-x86_64.pl:-)

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

$code=".text\n";

$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
$inp="%rdi";
$out="%rsi";
$len="%rdx";
$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
$ivp="%r8";	# cbc, ctr, ...

$rnds_="%r10d";	# backup copy for $rounds
$key_="%r11";	# backup copy for $key

# %xmm register layout
$rndkey0="%xmm0";	$rndkey1="%xmm1";
$inout0="%xmm2";	$inout1="%xmm3";
$inout2="%xmm4";	$inout3="%xmm5";
$inout4="%xmm6";	$inout5="%xmm7";
$inout6="%xmm8";	$inout7="%xmm9";

$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
$in0="%xmm8";		$iv="%xmm9";

# Inline version of internal aesni_[en|de]crypt1.
#
# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
# cycles which take care of loop variables...
{ my $sn;
sub aesni_generate1 {
my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
++$sn;
$code.=<<___;
	$movkey	($key),$rndkey0
	$movkey	16($key),$rndkey1
___
$code.=<<___ if (defined($ivec));
	xorps	$rndkey0,$ivec
	lea	32($key),$key
	xorps	$ivec,$inout
___
$code.=<<___ if (!defined($ivec));
	lea	32($key),$key
	xorps	$rndkey0,$inout
___
$code.=<<___;
.Loop_${p}1_$sn:
	aes${p}	$rndkey1,$inout
	dec	$rounds
	$movkey	($key),$rndkey1
	lea	16($key),$key
	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
	aes${p}last	$rndkey1,$inout
___
}}
# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
#
{ my ($inp,$out,$key) = @_4args;

$code.=<<___;
.globl	${PREFIX}_encrypt
.type	${PREFIX}_encrypt,\@abi-omnipotent
.align	16
${PREFIX}_encrypt:
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	movups	$inout0,($out)		# output
	ret
.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt

.globl	${PREFIX}_decrypt
.type	${PREFIX}_decrypt,\@abi-omnipotent
.align	16
${PREFIX}_decrypt:
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	movups	$inout0,($out)		# output
	ret
.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
___
}

# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why 3x subroutine were originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it makes no sense to implement 2x subroutine.
# aes[enc|dec] latency in next processor generation is 8, but the
# instructions can be scheduled every cycle. Optimal interleave for
# new processor is therefore 8x...
sub aesni_generate3 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-2] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt3,\@abi-omnipotent
.align	16
_aesni_${dir}rypt3:
	$movkey	($key),$rndkey0
	shr	\$1,$rounds
	$movkey	16($key),$rndkey1
	lea	32($key),$key
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	$movkey		($key),$rndkey0

.L${dir}_loop3:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	dec		$rounds
	aes${dir}	$rndkey1,$inout2
	$movkey		16($key),$rndkey1
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	lea		32($key),$key
	aes${dir}	$rndkey0,$inout2
	$movkey		($key),$rndkey0
	jnz		.L${dir}_loop3

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	ret
.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
___
}
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
sub aesni_generate4 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-3] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt4,\@abi-omnipotent
.align	16
_aesni_${dir}rypt4:
	$movkey	($key),$rndkey0
	shr	\$1,$rounds
	$movkey	16($key),$rndkey1
	lea	32($key),$key
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	xorps	$rndkey0,$inout3
	$movkey	($key),$rndkey0

.L${dir}_loop4:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	dec		$rounds
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	$movkey		16($key),$rndkey1
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	lea		32($key),$key
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	$movkey		($key),$rndkey0
	jnz		.L${dir}_loop4

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	ret
.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
___
}
sub aesni_generate6 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-5] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt6,\@abi-omnipotent
.align	16
_aesni_${dir}rypt6:
	$movkey		($key),$rndkey0
	shr		\$1,$rounds
	$movkey		16($key),$rndkey1
	lea		32($key),$key
	xorps		$rndkey0,$inout0
	pxor		$rndkey0,$inout1
	aes${dir}	$rndkey1,$inout0
	pxor		$rndkey0,$inout2
	aes${dir}	$rndkey1,$inout1
	pxor		$rndkey0,$inout3
	aes${dir}	$rndkey1,$inout2
	pxor		$rndkey0,$inout4
	aes${dir}	$rndkey1,$inout3
	pxor		$rndkey0,$inout5
	dec		$rounds
	aes${dir}	$rndkey1,$inout4
	$movkey		($key),$rndkey0
	aes${dir}	$rndkey1,$inout5
	jmp		.L${dir}_loop6_enter
.align	16
.L${dir}_loop6:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	dec		$rounds
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
.L${dir}_loop6_enter:				# happens to be 16-byte aligned
	$movkey		16($key),$rndkey1
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	lea		32($key),$key
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	$movkey		($key),$rndkey0
	jnz		.L${dir}_loop6

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	ret
.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
___
}
sub aesni_generate8 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-7] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt8,\@abi-omnipotent
.align	16
_aesni_${dir}rypt8:
	$movkey		($key),$rndkey0
	shr		\$1,$rounds
	$movkey		16($key),$rndkey1
	lea		32($key),$key
	xorps		$rndkey0,$inout0
	xorps		$rndkey0,$inout1
	aes${dir}	$rndkey1,$inout0
	pxor		$rndkey0,$inout2
	aes${dir}	$rndkey1,$inout1
	pxor		$rndkey0,$inout3
	aes${dir}	$rndkey1,$inout2
	pxor		$rndkey0,$inout4
	aes${dir}	$rndkey1,$inout3
	pxor		$rndkey0,$inout5
	dec		$rounds
	aes${dir}	$rndkey1,$inout4
	pxor		$rndkey0,$inout6
	aes${dir}	$rndkey1,$inout5
	pxor		$rndkey0,$inout7
	$movkey		($key),$rndkey0
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
	$movkey		16($key),$rndkey1
	jmp		.L${dir}_loop8_enter
.align	16
.L${dir}_loop8:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	dec		$rounds
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
	$movkey		16($key),$rndkey1
.L${dir}_loop8_enter:				# happens to be 16-byte aligned
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	lea		32($key),$key
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	aes${dir}	$rndkey0,$inout6
	aes${dir}	$rndkey0,$inout7
	$movkey		($key),$rndkey0
	jnz		.L${dir}_loop8

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	aes${dir}last	$rndkey0,$inout6
	aes${dir}last	$rndkey0,$inout7
	ret
.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
___
}
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
&aesni_generate4("dec");
&aesni_generate6("enc") if ($PREFIX eq "aesni");
&aesni_generate6("dec");
&aesni_generate8("enc") if ($PREFIX eq "aesni");
&aesni_generate8("dec");

if ($PREFIX eq "aesni") {
########################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
#			  size_t length, const AES_KEY *key,
#			  int enc);
$code.=<<___;
.globl	aesni_ecb_encrypt
.type	aesni_ecb_encrypt,\@function,5
.align	16
aesni_ecb_encrypt:
___
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
.Lecb_enc_body:
___
$code.=<<___;
	and	\$-16,$len
	jz	.Lecb_ret

	mov	240($key),$rounds	# key->rounds
	$movkey	($key),$rndkey0
	mov	$key,$key_		# backup $key
	mov	$rounds,$rnds_		# backup $rounds
	test	%r8d,%r8d		# 5th argument
	jz	.Lecb_decrypt
#--------------------------- ECB ENCRYPT ------------------------------#
	cmp	\$0x80,$len
	jb	.Lecb_enc_tail

	movdqu	($inp),$inout0
	movdqu	0x10($inp),$inout1
	movdqu	0x20($inp),$inout2
	movdqu	0x30($inp),$inout3
	movdqu	0x40($inp),$inout4
	movdqu	0x50($inp),$inout5
	movdqu	0x60($inp),$inout6
	movdqu	0x70($inp),$inout7
	lea	0x80($inp),$inp
	sub	\$0x80,$len
	jmp	.Lecb_enc_loop8_enter
.align 16
.Lecb_enc_loop8:
	movups	$inout0,($out)
	mov	$key_,$key		# restore $key
	movdqu	($inp),$inout0
	mov	$rnds_,$rounds		# restore $rounds
	movups	$inout1,0x10($out)
	movdqu	0x10($inp),$inout1
	movups	$inout2,0x20($out)
	movdqu	0x20($inp),$inout2
	movups	$inout3,0x30($out)
	movdqu	0x30($inp),$inout3
	movups	$inout4,0x40($out)
	movdqu	0x40($inp),$inout4
	movups	$inout5,0x50($out)
	movdqu	0x50($inp),$inout5
	movups	$inout6,0x60($out)
	movdqu	0x60($inp),$inout6
	movups	$inout7,0x70($out)
	lea	0x80($out),$out
	movdqu	0x70($inp),$inout7
	lea	0x80($inp),$inp
.Lecb_enc_loop8_enter:

	call	_aesni_encrypt8

	sub	\$0x80,$len
	jnc	.Lecb_enc_loop8

	movups	$inout0,($out)
	mov	$key_,$key		# restore $key
	movups	$inout1,0x10($out)
	mov	$rnds_,$rounds		# restore $rounds
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)
	movups	$inout6,0x60($out)
	movups	$inout7,0x70($out)
	lea	0x80($out),$out
	add	\$0x80,$len
	jz	.Lecb_ret

.Lecb_enc_tail:
	movups	($inp),$inout0
	cmp	\$0x20,$len
	jb	.Lecb_enc_one
	movups	0x10($inp),$inout1
	je	.Lecb_enc_two
	movups	0x20($inp),$inout2
	cmp	\$0x40,$len
	jb	.Lecb_enc_three
	movups	0x30($inp),$inout3
	je	.Lecb_enc_four
	movups	0x40($inp),$inout4
	cmp	\$0x60,$len
	jb	.Lecb_enc_five
	movups	0x50($inp),$inout5
	je	.Lecb_enc_six
	movdqu	0x60($inp),$inout6
	call	_aesni_encrypt8
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)
	movups	$inout6,0x60($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_one:
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	movups	$inout0,($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_two:
	xorps	$inout2,$inout2
	call	_aesni_encrypt3
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_three:
	call	_aesni_encrypt3
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_four:
	call	_aesni_encrypt4
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_five:
	xorps	$inout5,$inout5
	call	_aesni_encrypt6
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_six:
	call	_aesni_encrypt6
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)
	jmp	.Lecb_ret
#--------------------------- ECB DECRYPT ------------------------------#
.align	16
.Lecb_decrypt:
	cmp	\$0x80,$len
	jb	.Lecb_dec_tail

	movdqu	($inp),$inout0
	movdqu	0x10($inp),$inout1
	movdqu	0x20($inp),$inout2
	movdqu	0x30($inp),$inout3
	movdqu	0x40($inp),$inout4
	movdqu	0x50($inp),$inout5
	movdqu	0x60($inp),$inout6
	movdqu	0x70($inp),$inout7
	lea	0x80($inp),$inp
	sub	\$0x80,$len
	jmp	.Lecb_dec_loop8_enter
.align 16
.Lecb_dec_loop8:
	movups	$inout0,($out)
	mov	$key_,$key		# restore $key
	movdqu	($inp),$inout0
	mov	$rnds_,$rounds		# restore $rounds
	movups	$inout1,0x10($out)
	movdqu	0x10($inp),$inout1
	movups	$inout2,0x20($out)
	movdqu	0x20($inp),$inout2
	movups	$inout3,0x30($out)
	movdqu	0x30($inp),$inout3
	movups	$inout4,0x40($out)
	movdqu	0x40($inp),$inout4
	movups	$inout5,0x50($out)
	movdqu	0x50($inp),$inout5
	movups	$inout6,0x60($out)
	movdqu	0x60($inp),$inout6
	movups	$inout7,0x70($out)
	lea	0x80($out),$out
	movdqu	0x70($inp),$inout7
	lea	0x80($inp),$inp
.Lecb_dec_loop8_enter:

	call	_aesni_decrypt8

	$movkey	($key_),$rndkey0
	sub	\$0x80,$len
	jnc	.Lecb_dec_loop8

	movups	$inout0,($out)
	mov	$key_,$key		# restore $key
	movups	$inout1,0x10($out)
	mov	$rnds_,$rounds		# restore $rounds
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)
	movups	$inout6,0x60($out)
	movups	$inout7,0x70($out)
	lea	0x80($out),$out
	add	\$0x80,$len
	jz	.Lecb_ret

.Lecb_dec_tail:
	movups	($inp),$inout0
	cmp	\$0x20,$len
	jb	.Lecb_dec_one
	movups	0x10($inp),$inout1
	je	.Lecb_dec_two
	movups	0x20($inp),$inout2
	cmp	\$0x40,$len
	jb	.Lecb_dec_three
	movups	0x30($inp),$inout3
	je	.Lecb_dec_four
	movups	0x40($inp),$inout4
	cmp	\$0x60,$len
	jb	.Lecb_dec_five
	movups	0x50($inp),$inout5
	je	.Lecb_dec_six
	movups	0x60($inp),$inout6
	$movkey	($key),$rndkey0
	call	_aesni_decrypt8
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)
	movups	$inout6,0x60($out)
	jmp	.Lecb_ret
.align	16
.Lecb_dec_one:
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	movups	$inout0,($out)
	jmp	.Lecb_ret
.align	16
.Lecb_dec_two:
	xorps	$inout2,$inout2
	call	_aesni_decrypt3
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	jmp	.Lecb_ret
.align	16
.Lecb_dec_three:
	call	_aesni_decrypt3
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	jmp	.Lecb_ret
.align	16
.Lecb_dec_four:
	call	_aesni_decrypt4
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	jmp	.Lecb_ret
.align	16
.Lecb_dec_five:
	xorps	$inout5,$inout5
	call	_aesni_decrypt6
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	jmp	.Lecb_ret
.align	16
.Lecb_dec_six:
	call	_aesni_decrypt6
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)

.Lecb_ret:
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	lea	0x58(%rsp),%rsp
.Lecb_enc_ret:
___
$code.=<<___;
	ret
.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
___

{
######################################################################
# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec,char *cmac);
#
# Handles only complete blocks, operates on 64-bit counter and
# does not update *ivec! Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
{
my $cmac="%r9";	# 6th argument

my $increment="%xmm6";
my $bswap_mask="%xmm7";

$code.=<<___;
.globl	aesni_ccm64_encrypt_blocks
.type	aesni_ccm64_encrypt_blocks,\@function,6
.align	16
aesni_ccm64_encrypt_blocks:
___
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
.Lccm64_enc_body:
___
$code.=<<___;
	mov	240($key),$rounds		# key->rounds
	movdqu	($ivp),$iv
	movdqa	.Lincrement64(%rip),$increment
	movdqa	.Lbswap_mask(%rip),$bswap_mask

	shr	\$1,$rounds
	lea	0($key),$key_
	movdqu	($cmac),$inout1
	movdqa	$iv,$inout0
	mov	$rounds,$rnds_
	pshufb	$bswap_mask,$iv
	jmp	.Lccm64_enc_outer
.align	16
.Lccm64_enc_outer:
	$movkey	($key_),$rndkey0
	mov	$rnds_,$rounds
	movups	($inp),$in0			# load inp

	xorps	$rndkey0,$inout0		# counter
	$movkey	16($key_),$rndkey1
	xorps	$in0,$rndkey0
	lea	32($key_),$key
	xorps	$rndkey0,$inout1		# cmac^=inp
	$movkey	($key),$rndkey0

.Lccm64_enc2_loop:
	aesenc	$rndkey1,$inout0
	dec	$rounds
	aesenc	$rndkey1,$inout1
	$movkey	16($key),$rndkey1
	aesenc	$rndkey0,$inout0
	lea	32($key),$key
	aesenc	$rndkey0,$inout1
	$movkey	0($key),$rndkey0
	jnz	.Lccm64_enc2_loop
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	paddq	$increment,$iv
	aesenclast	$rndkey0,$inout0
	aesenclast	$rndkey0,$inout1

	dec	$len
	lea	16($inp),$inp
	xorps	$inout0,$in0			# inp ^= E(iv)
	movdqa	$iv,$inout0
	movups	$in0,($out)			# save output
	lea	16($out),$out
	pshufb	$bswap_mask,$inout0
	jnz	.Lccm64_enc_outer

	movups	$inout1,($cmac)
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	lea	0x58(%rsp),%rsp
.Lccm64_enc_ret:
___
$code.=<<___;
	ret
.size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
___
######################################################################
$code.=<<___;
.globl	aesni_ccm64_decrypt_blocks
.type	aesni_ccm64_decrypt_blocks,\@function,6
.align	16
aesni_ccm64_decrypt_blocks:
___
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
.Lccm64_dec_body:
___
$code.=<<___;
	mov	240($key),$rounds		# key->rounds
	movups	($ivp),$iv
	movdqu	($cmac),$inout1
	movdqa	.Lincrement64(%rip),$increment
	movdqa	.Lbswap_mask(%rip),$bswap_mask

	movaps	$iv,$inout0
	mov	$rounds,$rnds_
	mov	$key,$key_
	pshufb	$bswap_mask,$iv
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	movups	($inp),$in0			# load inp
	paddq	$increment,$iv
	lea	16($inp),$inp
	jmp	.Lccm64_dec_outer
.align	16
.Lccm64_dec_outer:
	xorps	$inout0,$in0			# inp ^= E(iv)
	movdqa	$iv,$inout0
	mov	$rnds_,$rounds
	movups	$in0,($out)			# save output
	lea	16($out),$out
	pshufb	$bswap_mask,$inout0

	sub	\$1,$len
	jz	.Lccm64_dec_break

	$movkey	($key_),$rndkey0
	shr	\$1,$rounds
	$movkey	16($key_),$rndkey1
	xorps	$rndkey0,$in0
	lea	32($key_),$key
	xorps	$rndkey0,$inout0
	xorps	$in0,$inout1			# cmac^=out
	$movkey	($key),$rndkey0

.Lccm64_dec2_loop:
	aesenc	$rndkey1,$inout0
	dec	$rounds
	aesenc	$rndkey1,$inout1
	$movkey	16($key),$rndkey1
	aesenc	$rndkey0,$inout0
	lea	32($key),$key
	aesenc	$rndkey0,$inout1
	$movkey	0($key),$rndkey0
	jnz	.Lccm64_dec2_loop
	movups	($inp),$in0			# load inp
	paddq	$increment,$iv
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	lea	16($inp),$inp
	aesenclast	$rndkey0,$inout0
	aesenclast	$rndkey0,$inout1
	jmp	.Lccm64_dec_outer

.align	16