Commit 6c83629b authored by Andy Polyakov
Browse files

AESNI engine: add counter mode.

parent fead2539
Loading
Loading
Loading
Loading
+190 −21
Original line number Diff line number Diff line
@@ -23,7 +23,8 @@ require "x86asm.pl";

&asm_init($ARGV[0],$0);

$movekey = eval($RREFIX eq "aseni" ? "*movaps" : "*movups");
if ($PREFIX eq "aesni")	{ $movekey=*movaps; }
else			{ $movekey=*movups; }

$len="eax";
$rounds="ecx";
@@ -41,7 +42,7 @@ $rndkey1="xmm4";
$ivec="xmm5";
$in0="xmm6";
$in1="xmm7";	$inout3="xmm7";


# Inline version of internal aesni_[en|de]crypt1
sub aesni_inline_generate1
{ my $p=shift;
@@ -104,7 +105,7 @@ sub aesni_generate1 # fully unrolled loop
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}


# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
&aesni_generate1("enc") if (!$inline);
&function_begin_B("${PREFIX}_encrypt");
@@ -136,7 +137,7 @@ sub aesni_generate1 # fully unrolled loop
	&movups	(&QWP(0,"eax"),$inout0);
	&ret	();
&function_end_B("${PREFIX}_decrypt");


# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec]
# latency is 6, it turned out that it can be scheduled only every
@@ -229,8 +230,9 @@ sub aesni_generate4
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
&aesni_generate4("dec");


if ($PREFIX eq "aesni") {
######################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
#                         size_t length, const AES_KEY *key,
#                         int enc);
@@ -249,8 +251,9 @@ if ($PREFIX eq "aesni") {
	&mov	($rounds_,$rounds);	# backup $rounds
	&jz	(&label("ecb_decrypt"));

	&sub	($len,0x40);
	&cmp	($len,0x40);
	&jbe	(&label("ecb_enc_tail"));
	&sub	($len,0x40);
	&jmp	(&label("ecb_enc_loop3"));

&set_label("ecb_enc_loop3",16);
@@ -268,14 +271,13 @@ if ($PREFIX eq "aesni") {
	&movups	(&QWP(-0x10,$out),$inout2);
	&ja	(&label("ecb_enc_loop3"));

&set_label("ecb_enc_tail");
	&add	($len,0x40);
	&jz	(&label("ecb_ret"));

	&cmp	($len,0x10);
	&movups	($inout0,&QWP(0,$inp));
	&je	(&label("ecb_enc_one"));
&set_label("ecb_enc_tail");
	&cmp	($len,0x20);
	&movups	($inout0,&QWP(0,$inp));
	&jb	(&label("ecb_enc_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_enc_two"));
	&cmp	($len,0x30);
@@ -309,10 +311,11 @@ if ($PREFIX eq "aesni") {
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ecb_ret"));

######################################################################
&set_label("ecb_decrypt",16);
	&sub	($len,0x40);
	&cmp	($len,0x40);
	&jbe	(&label("ecb_dec_tail"));
	&sub	($len,0x40);
	&jmp	(&label("ecb_dec_loop3"));

&set_label("ecb_dec_loop3",16);
@@ -330,14 +333,13 @@ if ($PREFIX eq "aesni") {
	&movups	(&QWP(-0x10,$out),$inout2);
	&ja	(&label("ecb_dec_loop3"));

&set_label("ecb_dec_tail");
	&add	($len,0x40);
	&jz	(&label("ecb_ret"));

	&cmp	($len,0x10);
	&movups	($inout0,&QWP(0,$inp));
	&je	(&label("ecb_dec_one"));
&set_label("ecb_dec_tail");
	&cmp	($len,0x20);
	&movups	($inout0,&QWP(0,$inp));
	&jb	(&label("ecb_dec_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_dec_two"));
	&cmp	($len,0x30);
@@ -373,8 +375,173 @@ if ($PREFIX eq "aesni") {

&set_label("ecb_ret");
&function_end("aesni_ecb_encrypt");
}

######################################################################
# handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see engine/eng_aesni.c for details)
#
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec);
#
# A scratch frame is carved out below the caller's stack pointer and
# aligned to 16 bytes so that it can be addressed with movaps/paddd:
#
#	 0(esp)	byte-swap control mask for pshufb
#	16(esp)	counter increment vector {3,3,3,0}
#	32(esp)	"counter-less" ivec (ivec with its last dword wiped)
#	48(esp)	caller's esp, reloaded at ctr32_ret
#
# The main loop runs three blocks per iteration through
# _aesni_encrypt3; whatever is left is dispatched at ctr32_tail on the
# block count (below 2, exactly 2, exactly 3, otherwise the 4-block path).
&function_begin("aesni_ctr32_encrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	# $rounds_ and $key_ are borrowed as temporaries here: the former
	# holds the ivec pointer, the latter the caller's stack pointer.
	&mov	($rounds_,&wparam(4));
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(48,"esp"),$key_);

	&movups	($inout3,&QWP(0,$rounds_));	# load ivec

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	# ($key_ is zeroed here and doubles as the zero source for the
	# fourth lane and for wiping the counter below)
	&mov	($rounds,3);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds);
	&mov	(&DWP(20,"esp"),$rounds);
	&mov	(&DWP(24,"esp"),$rounds);
	&mov	(&DWP(28,"esp"),$key_);

	# from here on $rounds_ carries the 32-bit counter value
	&pextrd	($rounds_,$inout3,3);		# pull 32-bit counter
	&pinsrd	($inout3,$key_,3);		# wipe 32-bit counter

	&mov	($rounds,&DWP(240,$key));	# key->rounds
	&movaps	($rndkey0,&QWP(0,"esp"));	# load byte-swap mask

	# $ivec is vector of 3 32-bit counters
	# (counter, counter+1, counter+2 in lanes 0..2; lane 3 stays zero)
	&pxor	($ivec,$ivec);
	&bswap	($rounds_);
	&pinsrd	($ivec,$rounds_,0);
	&inc	($rounds_);
	&pinsrd	($ivec,$rounds_,1);
	&inc	($rounds_);
	&pinsrd	($ivec,$rounds_,2);

	&cmp	($len,4);
	&pshufb	($ivec,$rndkey0);		# byte swap
	&jbe	(&label("ctr32_tail"));
	# $inout3 shares xmm7 with $in1, which the loop body overwrites
	# with input data, so the counter-less ivec is parked on the stack.
	&movaps	(&QWP(32,"esp"),$inout3);	# save counter-less ivec
	# keep copies of $rounds and $key; both are reloaded from these
	# copies after every _aesni_encrypt3 call in the loop
	&mov	($rounds_,$rounds);
	&mov	($key_,$key);
	&sub	($len,4);
	&jmp	(&label("ctr32_loop3"));

&set_label("ctr32_loop3",16);
	&pshufd	($inout0,$ivec,3<<6);		# place counter to upper dword
	&pshufd	($inout1,$ivec,2<<6);
	&pshufd	($inout2,$ivec,1<<6);
	&por	($inout0,$inout3);		# merge counter-less ivec
	&por	($inout1,$inout3);
	&por	($inout2,$inout3);

	&call	("_aesni_encrypt3");

	# The counter vector is swapped back, bumped by 3 per lane and
	# swapped again; these steps are interleaved with the data
	# loads/xors/stores below.
	 &movaps($rndkey0,&QWP(0,"esp"));	# load byte-swap mask
	&movups	($in0,&QWP(0,$inp));
	&movups	($in1,&QWP(0x10,$inp));
	&movups	($rndkey1,&QWP(0x20,$inp));
	 &pshufb($ivec,$rndkey0);		# byte swap
	 &paddd	($ivec,&QWP(16,"esp"));		# counter increment
	&pxor	($in0,$inout0);
	&pxor	($in1,$inout1);
	&pxor	($rndkey1,$inout2);
	&movups	(&QWP(0,$out),$in0);
	&movups	(&QWP(0x10,$out),$in1);
	&movups	(&QWP(0x20,$out),$rndkey1);
	&movaps	($inout3,&QWP(32,"esp"));	# load counter-less ivec
	 &pshufb($ivec,$rndkey0);		# byte swap

	# lea does not touch flags, so the ja below still tests the
	# result of the sub
	&sub	($len,3);
	&lea	($inp,&DWP(0x30,$inp));
	&lea	($out,&DWP(0x30,$out));
	&mov	($key,$key_);
	&mov	($rounds,$rounds_);
	&ja	(&label("ctr32_loop3"));

	# undo the bias applied before entering the loop; pextrd leaves
	# the flags from the add intact for the jz
	&add	($len,4);
	&pextrd	($rounds_,$ivec,1);		# might need last counter value
	&jz	(&label("ctr32_ret"));
	&bswap	($rounds_);

&set_label("ctr32_tail");
	# Counter blocks are prepared speculatively while the flags from
	# the cmp select the one/two/three/four-block path.
	&cmp	($len,2);
	&pshufd	($inout0,$ivec,3<<6);
	&pshufd	($inout1,$ivec,2<<6);
	&pshufd	($inout2,$ivec,1<<6);
	&por	($inout0,$inout3);
	&jb	(&label("ctr32_one"));
	&por	($inout1,$inout3);
	&je	(&label("ctr32_two"));
	&cmp	($len,3);
	&por	($inout2,$inout3);
	&je	(&label("ctr32_three"));

	# four blocks: the fourth counter block is built directly in
	# $inout3 by inserting the next counter value into its top dword
	&inc	($rounds_);			# compose last counter value
	&bswap	($rounds_);
	&pinsrd	($inout3,$rounds_,3);

	&call	("_aesni_encrypt4");

	&movups	($in0,&QWP(0,$inp));
	&movups	($rndkey1,&QWP(0x10,$inp));
	&movups	($rndkey0,&QWP(0x20,$inp));
	&movups	($ivec,&QWP(0x30,$inp));
	&pxor	($in0,$inout0);
	&pxor	($rndkey1,$inout1);
	&pxor	($rndkey0,$inout2);
	&pxor	($ivec,$inout3);
	&movups	(&QWP(0,$out),$in0);
	&movups	(&QWP(0x10,$out),$rndkey1);
	&movups	(&QWP(0x20,$out),$rndkey0);
	&movups	(&QWP(0x30,$out),$ivec);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_one",16);
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	($in0,&QWP(0,$inp));
	&pxor	($in0,$inout0);
	&movups	(&QWP(0,$out),$in0);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_two",16);
	# the 3-block routine is reused; only the first two results are
	# consumed
	&call	("_aesni_encrypt3");
	&movups	($in0,&QWP(0,$inp));
	&movups	($in1,&QWP(0x10,$inp));
	&pxor	($in0,$inout0);
	&pxor	($in1,$inout1);
	&movups	(&QWP(0,$out),$in0);
	&movups	(&QWP(0x10,$out),$in1);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_three",16);
	&call	("_aesni_encrypt3");
	&movups	($in0,&QWP(0,$inp));
	&movups	($in1,&QWP(0x10,$inp));
	&movups	($rndkey1,&QWP(0x20,$inp));
	&pxor	($in0,$inout0);
	&pxor	($in1,$inout1);
	&pxor	($rndkey1,$inout2);
	&movups	(&QWP(0,$out),$in0);
	&movups	(&QWP(0x10,$out),$in1);
	&movups	(&QWP(0x20,$out),$rndkey1);

&set_label("ctr32_ret");
	# drop the aligned scratch frame by reloading the saved esp
	&mov	("esp",&DWP(48,"esp"));
&function_end("aesni_ctr32_encrypt_blocks");
}

######################################################################
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
#                           size_t length, const AES_KEY *key,
#                           unsigned char *ivp,const int enc);
@@ -431,10 +598,11 @@ if ($PREFIX eq "aesni") {
	&mov	($inp,$out);		# $inp and $out are the same
	&mov	($key,$key_);		# restore $key
	&jmp	(&label("cbc_enc_loop"));

######################################################################
&set_label("cbc_decrypt",16);
	&sub	($len,0x40);
	&cmp	($len,0x40);
	&jbe	(&label("cbc_dec_tail"));
	&sub	($len,0x40);
	&jmp	(&label("cbc_dec_loop3"));

&set_label("cbc_dec_loop3",16);
@@ -458,10 +626,10 @@ if ($PREFIX eq "aesni") {
	&movups	(&QWP(-0x10,$out),$inout2);
	&ja	(&label("cbc_dec_loop3"));

&set_label("cbc_dec_tail");
	&add	($len,0x40);
	&jz	(&label("cbc_ret"));

&set_label("cbc_dec_tail");
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x10);
	&movaps	($in0,$inout0);
@@ -539,7 +707,8 @@ if ($PREFIX eq "aesni") {
	&mov	($key_,&wparam(4));
	&movups	(&QWP(0,$key_),$ivec);	# output IV
&function_end("${PREFIX}_cbc_encrypt");


######################################################################
# Mechanical port from aesni-x86_64.pl.
#
# _aesni_set_encrypt_key is private interface,
+270 −36
Original line number Diff line number Diff line
@@ -41,7 +41,7 @@ $inp="%rdi";
$out="%rsi";
$len="%rdx";
$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
$ivp="%r8";	# cbc
$ivp="%r8";	# cbc, ctr

$rnds_="%r10d";	# backup copy for $rounds
$key_="%r11";	# backup copy for $key
@@ -51,7 +51,7 @@ $inout0="%xmm0"; $inout1="%xmm1";
$inout2="%xmm2";	$inout3="%xmm3";
$rndkey0="%xmm4";	$rndkey1="%xmm5";

$iv="%xmm6";		$in0="%xmm7";	# used in CBC decrypt
$iv="%xmm6";		$in0="%xmm7";	# used in CBC decrypt, CTR
$in1="%xmm8";		$in2="%xmm9";

# Inline version of internal aesni_[en|de]crypt1.
@@ -214,6 +214,7 @@ ___
&aesni_generate4("dec");

if ($PREFIX eq "aesni") {
########################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
#			  size_t length, const AES_KEY *key,
#			  int enc);
@@ -232,8 +233,9 @@ aesni_ecb_encrypt:
	mov	$rounds,$rnds_		# backup $rounds
	jz	.Lecb_decrypt
#--------------------------- ECB ENCRYPT ------------------------------#
	sub	\$0x40,$len
	cmp	\$0x40,$len
	jbe	.Lecb_enc_tail
	sub	\$0x40,$len
	jmp	.Lecb_enc_loop3
.align 16
.Lecb_enc_loop3:
@@ -251,14 +253,13 @@ aesni_ecb_encrypt:
	movups	$inout2,-0x10($out)
	ja	.Lecb_enc_loop3

.Lecb_enc_tail:
	add	\$0x40,$len
	jz	.Lecb_ret

	cmp	\$0x10,$len
	movups	($inp),$inout0
	je	.Lecb_enc_one
.Lecb_enc_tail:
	cmp	\$0x20,$len
	movups	($inp),$inout0
	jb	.Lecb_enc_one
	movups	0x10($inp),$inout1
	je	.Lecb_enc_two
	cmp	\$0x30,$len
@@ -294,8 +295,9 @@ $code.=<<___;
#--------------------------- ECB DECRYPT ------------------------------#
.align	16
.Lecb_decrypt:
	sub	\$0x40,$len
	cmp	\$0x40,$len
	jbe	.Lecb_dec_tail
	sub	\$0x40,$len
	jmp	.Lecb_dec_loop3
.align 16
.Lecb_dec_loop3:
@@ -313,14 +315,13 @@ $code.=<<___;
	movups	$inout2,-0x10($out)
	ja	.Lecb_dec_loop3

.Lecb_dec_tail:
	add	\$0x40,$len
	jz	.Lecb_ret

	cmp	\$0x10,$len
	movups	($inp),$inout0
	je	.Lecb_dec_one
.Lecb_dec_tail:
	cmp	\$0x20,$len
	movups	($inp),$inout0
	jb	.Lecb_dec_one
	movups	0x10($inp),$inout1
	je	.Lecb_dec_two
	cmp	\$0x30,$len
@@ -357,8 +358,175 @@ $code.=<<___;
	ret
.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
___
######################################################################
# handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see engine/eng_aesni.c for details)
#
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec);
#
# Register use specific to this routine:
#	$increment	per-lane counter step, loaded from .Lincrement
#	$bswap_mask	pshufb control mask, loaded from .Lbswap_mask
#	$iv		vector of three 32-bit counters
#	$inout3		ivec with its 32-bit counter wiped; merged into
#			every counter block with por
#	$rnds_/$key_	counter scratch before the loop, then backup
#			copies of $rounds/$key restored after each call
#
# The main loop runs three blocks per iteration through
# _aesni_encrypt3; the remainder is dispatched at .Lctr32_tail to the
# one-, two-, three- or four-block path.
$increment="%xmm10";
$bswap_mask="%xmm11";

$code.=<<___;
.globl	aesni_ctr32_encrypt_blocks
.type	aesni_ctr32_encrypt_blocks,\@function,5
.align	16
aesni_ctr32_encrypt_blocks:
___
# Win64 only: reserve a 0x68-byte frame and save %xmm6-%xmm11 in it.
# The .Lctr32_body label emitted at the end of this prologue is the
# boundary that ctr32_se_handler compares context->Rip against.
$code.=<<___ if ($win64);
	lea	-0x68(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)

.Lctr32_body:
___
# Set-up, three-blocks-per-iteration main loop, tail dispatch and the
# four-block tail. The heredoc ends right after the .Lctr32_one label
# so that the single-block code can be generated in its place.
$code.=<<___;
	movups	($ivp),$inout3
	movaps	.Lincrement(%rip),$increment
	movaps	.Lbswap_mask(%rip),$bswap_mask
	xor	$rounds,$rounds
	pextrd	\$3,$inout3,$rnds_		# pull 32-bit counter
	pinsrd	\$3,$rounds,$inout3		# wipe 32-bit counter

	mov	240($key),$rounds		# key->rounds
	pxor	$iv,$iv				# vector of 3 32-bit counters
	bswap	$rnds_
	pinsrd	\$0,$rnds_,$iv
	inc	$rnds_
	pinsrd	\$1,$rnds_,$iv
	inc	$rnds_
	pinsrd	\$2,$rnds_,$iv

	cmp	\$4,$len
	pshufb	$bswap_mask,$iv
	jbe	.Lctr32_tail
	mov	$rounds,$rnds_
	mov	$key,$key_
	sub	\$4,$len
	jmp	.Lctr32_loop3

.align	16
.Lctr32_loop3:
	pshufd	\$`3<<6`,$iv,$inout0		# place counter to upper dword
	pshufd	\$`2<<6`,$iv,$inout1
	pshufd	\$`1<<6`,$iv,$inout2
	movups	($inp),$in0
	movups	0x10($inp),$in1
	movups	0x20($inp),$in2
	por	$inout3,$inout0			# merge counter-less ivec
	por	$inout3,$inout1
	por	$inout3,$inout2
	pshufb	$bswap_mask,$iv

	call	_aesni_encrypt3

	paddd	$increment,$iv
	pxor	$inout0,$in0
	pxor	$inout1,$in1
	pxor	$inout2,$in2
	pshufb	$bswap_mask,$iv
	movups	$in0,($out)
	movups	$in1,0x10($out)
	movups	$in2,0x20($out)

	sub	\$3,$len
	lea	0x30($inp),$inp
	lea	0x30($out),$out
	mov	$key_,$key
	mov	$rnds_,$rounds
	ja	.Lctr32_loop3

	add	\$4,$len
	pextrd	\$1,$iv,$rnds_			# migh need last counter value
	jz	.Lctr32_done
	bswap	$rnds_

.Lctr32_tail:
	cmp	\$2,$len
	pshufd	\$`3<<6`,$iv,$inout0
	pshufd	\$`2<<6`,$iv,$inout1
	pshufd	\$`1<<6`,$iv,$inout2
	por	$inout3,$inout0
	movups	($inp),$in0
	jb	.Lctr32_one
	por	$inout3,$inout1
	movups	0x10($inp),$in1
	je	.Lctr32_two
	cmp	\$3,$len
	por	$inout3,$inout2
	movups	0x20($inp),$in2
	je	.Lctr32_three

	inc	$rnds_				# compose last counter value
	bswap	$rnds_
	pinsrd	\$3,$rnds_,$inout3
	movups	0x30($inp),$iv

	call	_aesni_encrypt4

	pxor	$inout0,$in0
	pxor	$inout1,$in1
	pxor	$inout2,$in2
	pxor	$inout3,$iv
	movups	$in0,($out)
	movups	$in1,0x10($out)
	movups	$in2,0x20($out)
	movups	$iv,0x30($out)
	jmp	.Lctr32_done

.align	16
.Lctr32_one:
___
	# single remaining block: one-block encryption code is generated
	# here by aesni_generate1 instead of being written in the heredoc
	&aesni_generate1("enc",$key,$rounds);
# Completion of the one-block path, then the two- and three-block
# paths; both of the latter go through _aesni_encrypt3.
$code.=<<___;
	pxor	$inout0,$in0
	movups	$in0,($out)
	jmp	.Lctr32_done

.align	16
.Lctr32_two:
	call	_aesni_encrypt3
	pxor	$inout0,$in0
	pxor	$inout1,$in1
	movups	$in0,($out)
	movups	$in1,0x10($out)
	jmp	.Lctr32_done

.align	16
.Lctr32_three:
	call	_aesni_encrypt3
	pxor	$inout0,$in0
	pxor	$inout1,$in1
	pxor	$inout2,$in2
	movups	$in0,($out)
	movups	$in1,0x10($out)
	movups	$in2,0x20($out)

.Lctr32_done:
___

# Win64 only: mirror of the prologue, reloading %xmm6-%xmm11 and
# releasing the 0x68-byte frame.
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	lea	0x68(%rsp),%rsp
___
# .Lctr32_ret is the other boundary ctr32_se_handler tests
# context->Rip against.
$code.=<<___;
.Lctr32_ret:
	ret
.size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
___
}

########################################################################
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
#			    size_t length, const AES_KEY *key,
#			    unsigned char *ivp,const int enc);
@@ -429,9 +597,10 @@ $code.=<<___ if ($win64);
___
$code.=<<___;
	movups	($ivp),$iv
	sub	\$0x40,$len
	cmp	\$0x40,$len
	mov	$rnds_,$rounds
	jbe	.Lcbc_dec_tail
	sub	\$0x40,$len
	jmp	.Lcbc_dec_loop3
.align 16
.Lcbc_dec_loop3:
@@ -456,11 +625,11 @@ $code.=<<___;
	movups	$inout2,-0x10($out)
	ja	.Lcbc_dec_loop3

.Lcbc_dec_tail:
	add	\$0x40,$len
	movups	$iv,($ivp)
	jz	.Lcbc_dec_ret

.Lcbc_dec_tail:
	movups	($inp),$inout0
	cmp	\$0x10,$len
	movaps	$inout0,$in0
@@ -796,6 +965,11 @@ ___
}

$code.=<<___;
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lincrement:
	.long	3,3,3,0
.asciz  "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
@@ -810,9 +984,11 @@ $disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	cbc_se_handler,\@abi-omnipotent
___
$code.=<<___ if ($PREFIX eq "aesni");
.type	ecb_se_handler,\@abi-omnipotent
.align	16
cbc_se_handler:
ecb_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
@@ -825,30 +1001,48 @@ cbc_se_handler:
	sub	\$64,%rsp

	mov	152($context),%rax	# pull context->Rsp
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	jmp	.Lcommon_seh_exit
.size	ecb_se_handler,.-ecb_se_handler

.type	ctr32_se_handler,\@abi-omnipotent
.align	16
ctr32_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lcbc_decrypt(%rip),%r10
	lea	.Lctr32_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<"prologue" label
	jb	.Lin_prologue
	jb	.Lin_ctr32_prologue

	lea	.Lcbc_decrypt_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
	jb	.Lrestore_rax
	mov	152($context),%rax	# pull context->Rsp

	lea	.Lcbc_ret(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>="epilogue" label
	jae	.Lin_prologue
	lea	.Lctr32_ret(%rip),%r10
	cmp	%r10,%rbx
	jae	.Lin_ctr32_prologue

	lea	0(%rax),%rsi		# top of stack
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
	mov	\$12,%ecx		# 6*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0x58(%rax),%rax		# adjust stack pointer
	jmp	.Lin_prologue
	lea	0x68(%rax),%rax		# adjust stack pointer

.Lrestore_rax:
	mov	120($context),%rax
.Lin_prologue:
.Lin_ctr32_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
@@ -856,11 +1050,12 @@ cbc_se_handler:
	mov	%rdi,176($context)	# restore context->Rdi

	jmp	.Lcommon_seh_exit
.size	cbc_se_handler,.-cbc_se_handler

.type	ecb_se_handler,\@abi-omnipotent
.size	ctr32_se_handler,.-ctr32_se_handler
___
$code.=<<___;
.type	cbc_se_handler,\@abi-omnipotent
.align	16
ecb_se_handler:
cbc_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
@@ -873,8 +1068,33 @@ ecb_se_handler:
	sub	\$64,%rsp

	mov	152($context),%rax	# pull context->Rsp
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lcbc_decrypt(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<"prologue" label
	jb	.Lin_cbc_prologue

	lea	.Lcbc_decrypt_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
	jb	.Lrestore_cbc_rax

	lea	.Lcbc_ret(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>="epilogue" label
	jae	.Lin_cbc_prologue

	lea	0(%rax),%rsi		# top of stack
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0x58(%rax),%rax		# adjust stack pointer
	jmp	.Lin_cbc_prologue

.Lrestore_cbc_rax:
	mov	120($context),%rax
.Lin_cbc_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

@@ -915,10 +1135,17 @@ ecb_se_handler:

.section	.pdata
.align	4
	.rva	.LSEH_begin_${PREFIX}_ecb_encrypt
	.rva	.LSEH_end_${PREFIX}_ecb_encrypt
___
$code.=<<___ if ($PREFIX eq "aesni");
	.rva	.LSEH_begin_aesni_ecb_encrypt
	.rva	.LSEH_end_aesni_ecb_encrypt
	.rva	.LSEH_info_ecb

	.rva	.LSEH_begin_aesni_ctr32_encrypt_blocks
	.rva	.LSEH_end_aesni_ctr32_encrypt_blocks
	.rva	.LSEH_info_ctr32
___
$code.=<<___;
	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
	.rva	.LSEH_info_cbc
@@ -932,9 +1159,16 @@ ecb_se_handler:
	.rva	.LSEH_info_key
.section	.xdata
.align	8
___
$code.=<<___ if ($PREFIX eq "aesni");
.LSEH_info_ecb:
	.byte	9,0,0,0
	.rva	ecb_se_handler
.LSEH_info_ctr32:
	.byte	9,0,0,0
	.rva	ctr32_se_handler
___
$code.=<<___;
.LSEH_info_cbc:
	.byte	9,0,0,0
	.rva	cbc_se_handler
+173 −5
Original line number Diff line number Diff line
@@ -111,6 +111,35 @@ void ENGINE_load_aesni (void)
}

#ifdef COMPILE_HW_AESNI

typedef unsigned int u32;
typedef unsigned char u8;

/*
 * BSWAP4(x) reverses the byte order of a 32-bit value. It is defined
 * only where a cheap native implementation is known; otherwise it is
 * left undefined and GETU32/PUTU32 below fall back to portable
 * byte-wise code.
 */
#if defined(__GNUC__) && __GNUC__>=2 && \
    (defined(__i386) || defined(__i386__) || \
     defined(__x86_64) || defined(__x86_64__))
/*
 * __asm__ is used instead of asm because the plain keyword is not
 * recognized in strict ISO modes (-ansi, -std=c99, ...). The target
 * check keeps the x86-only bswapl away from other GCC targets. The
 * temporary has a macro-specific name so that an argument expression
 * mentioning a variable called 'ret' is not captured.
 */
#  define BSWAP4(x) ({	u32 bswap4_tmp_=(x);		\
			__asm__ volatile ("bswapl %0"	\
			: "+r"(bswap4_tmp_));		\
			bswap4_tmp_;			})
#elif defined(_MSC_VER)
# if _MSC_VER>=1300
#  pragma intrinsic(_byteswap_ulong)
#  define BSWAP4(x)	_byteswap_ulong((u32)(x))
# elif defined(_M_IX86)
   /* no return statement: the result is left in eax by the inline asm */
   __inline u32 _bswap4(u32 val) {
	_asm mov eax,val
	_asm bswap eax
   }
#  define BSWAP4(x)	_bswap4(x)
# endif
#endif

/*
 * GETU32(p) loads and PUTU32(p,v) stores a 32-bit big-endian value at
 * byte pointer p. The BSWAP4-based variants access *p as a u32 in one
 * go (NOTE(review): this relies on the platform tolerating such an
 * access through a byte pointer, as x86 does); the fallback variants
 * work one byte at a time. Both forms expand to a single parenthesized
 * expression.
 */
#ifdef BSWAP4
#define GETU32(p)	BSWAP4(*(const u32 *)(p))
#define PUTU32(p,v)	(*(u32 *)(p) = BSWAP4(v))
#else
#define GETU32(p)	((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
#define PUTU32(p,v)	((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
#endif

int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
			      AES_KEY *key);
int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
@@ -132,6 +161,12 @@ void aesni_cbc_encrypt(const unsigned char *in,
			   const AES_KEY *key,
			   unsigned char *ivec, int enc);

/*
 * CTR-mode bulk routine implemented in the aesni perlasm modules.
 * 'blocks' is a count of complete 16-byte blocks, not a byte length.
 * The routine works on the 32-bit counter held in the last four bytes
 * of 'ivec' and does not write the advanced counter back, so the
 * caller must update the counter (and handle 32-bit wrap-around)
 * itself; see aesni_counter().
 */
void aesni_ctr32_encrypt_blocks(const unsigned char *in,
			   unsigned char *out,
			   size_t blocks,
			   const AES_KEY *key,
			   const unsigned char *ivec);

/* Function for ENGINE detection and control */
static int aesni_init(ENGINE *e);

@@ -224,16 +259,19 @@ static int aesni_cipher_nids[] = {
	NID_aes_128_cbc,
	NID_aes_128_cfb,
	NID_aes_128_ofb,
	NID_aes_128_ctr,

	NID_aes_192_ecb,
	NID_aes_192_cbc,
	NID_aes_192_cfb,
	NID_aes_192_ofb,
	NID_aes_192_ctr,

	NID_aes_256_ecb,
	NID_aes_256_cbc,
	NID_aes_256_cfb,
	NID_aes_256_ofb,
	NID_aes_256_ctr,
};
static int aesni_cipher_nids_num =
	(sizeof(aesni_cipher_nids)/sizeof(aesni_cipher_nids[0]));
@@ -251,18 +289,28 @@ aesni_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *user_key,
	int ret;
	AES_KEY *key = AESNI_ALIGN(ctx->cipher_data);

	if ((ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_CFB_MODE
	    || (ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_OFB_MODE
	    || enc)
		ret=aesni_set_encrypt_key(user_key, ctx->key_len * 8, key);
	else
	if (((ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_ECB_MODE
	    || (ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_CBC_MODE)
	    && !enc)
		ret=aesni_set_decrypt_key(user_key, ctx->key_len * 8, key);
	else
		ret=aesni_set_encrypt_key(user_key, ctx->key_len * 8, key);

	if(ret < 0) {
		EVPerr(EVP_F_AESNI_INIT_KEY,EVP_R_AES_KEY_SETUP_FAILED);
		return 0;
	}

	if (ctx->cipher->flags&EVP_CIPH_CUSTOM_IV)
		{
		if (iv!=NULL)
			memcpy (ctx->iv,iv,ctx->cipher->iv_len);
		else	{
			EVPerr(EVP_F_AESNI_INIT_KEY,EVP_R_AES_IV_SETUP_FAILED);
			return 0;
			}
		}

	return 1;
}

@@ -336,6 +384,117 @@ DECLARE_AES_EVP(256,cbc,CBC);
DECLARE_AES_EVP(256,cfb,CFB);
DECLARE_AES_EVP(256,ofb,OFB);

/*
 * Add one to the 96-bit big-endian integer stored in counter[0..11].
 * The carry ripples from byte 11 towards byte 0 and stops at the first
 * byte that does not wrap to zero; bytes 12..15 are never touched.
 */
static void ctr96_inc(unsigned char *counter) {
	unsigned int i;

	for (i = 12; i > 0; --i) {
		/* a non-zero result means no carry into the next byte */
		if (++counter[i-1] != 0)
			return;
	}
}

/*
 * CTR-mode cipher callback: xors 'len' bytes from 'in' with the AES
 * key stream and writes the result to 'out'.
 *
 * State kept in ctx between calls:
 *   ctx->iv   the counter block; its last four bytes are a big-endian
 *             32-bit counter, the first twelve are bumped via
 *             ctr96_inc() whenever that counter wraps to zero;
 *   ctx->buf  the most recently generated key stream block;
 *   ctx->num  index of the first unused byte in ctx->buf (0 means
 *             nothing is left over).
 *
 * Always returns 1.
 */
static int aesni_counter(EVP_CIPHER_CTX *ctx, unsigned char *out,
		const unsigned char *in, size_t len)
{
	AES_KEY *ks = AESNI_ALIGN(ctx->cipher_data);
	u32 used = ctx->num;
	u32 ctr;

	/* First consume key stream left over from the previous call. */
	for (; used != 0 && len != 0; --len) {
		*out++ = *in++ ^ ctx->buf[used];
		used = (used+1) % 16;
	}

	ctr = GETU32(ctx->iv+12);

	/* Bulk part: whole blocks go through the assembly routine. */
	while (len >= 16) {
		size_t nblocks = len/16;
		size_t nbytes;

		/*
		 * Cap the work done per call; 1<<24 is an arbitrary
		 * neither-small-nor-large limit.
		 */
		if (nblocks > (1U<<24))
			nblocks = (1U<<24);

		/*
		 * aesni_ctr32_encrypt_blocks only counts in 32 bits, so
		 * a wrap of the counter has to be caught here. If the
		 * addition wrapped, trim this chunk so that it ends
		 * exactly where the counter reaches zero.
		 */
		ctr += (u32)nblocks;
		if (ctr < nblocks) {
			nblocks -= ctr;
			ctr = 0;
		}

		aesni_ctr32_encrypt_blocks(in,out,nblocks,ks,ctx->iv);

		/* The assembly leaves ctx->iv alone; store the new counter. */
		PUTU32(ctx->iv+12,ctr);
		/* A zero counter means it wrapped: propagate the carry. */
		if (ctr == 0)
			ctr96_inc(ctx->iv);

		nbytes = nblocks*16;
		len -= nbytes;
		out += nbytes;
		in  += nbytes;
	}

	/*
	 * Trailing partial block: generate one more key stream block
	 * into ctx->buf, advance the counter, and use only the first
	 * 'len' bytes. 'used' is zero whenever this branch is taken,
	 * since the first loop only stops with used!=0 once len is 0.
	 */
	if (len != 0) {
		aesni_encrypt(ctx->iv,ctx->buf,ks);
		++ctr;
		PUTU32(ctx->iv+12,ctr);
		if (ctr == 0)
			ctr96_inc(ctx->iv);
		for (; len != 0; --len, ++used)
			out[used] = in[used] ^ ctx->buf[used];
	}

	ctx->num = used;

	return 1;
}

/*
 * EVP_CIPHER descriptors for AES-CTR, one per key size. The three
 * initializers differ only in the NID and in the key length in bytes
 * (ksize/8), so they are generated from a single template.
 *
 * EVP_CIPH_CUSTOM_IV makes aesni_init_key() copy the caller's IV into
 * ctx->iv itself; aesni_counter() does the actual en/decryption. The
 * remaining positional values (1, 16, the NULL slots) follow the
 * EVP_CIPHER member order - presumably block size 1 and IV length 16;
 * confirm against the EVP_CIPHER definition in evp.h.
 */
#define DECLARE_AES_CTR(ksize)			\
static const EVP_CIPHER aesni_##ksize##_ctr=	\
	{					\
	NID_aes_##ksize##_ctr,1,ksize/8,16,	\
	EVP_CIPH_CUSTOM_IV,			\
	aesni_init_key,				\
	aesni_counter,				\
	NULL,					\
	sizeof(AESNI_KEY),			\
	NULL,					\
	NULL,					\
	NULL,					\
	NULL					\
	}

DECLARE_AES_CTR(128);

DECLARE_AES_CTR(192);

DECLARE_AES_CTR(256);

static int
aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher,
		      const int **nids, int nid)
@@ -360,6 +519,9 @@ aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher,
	case NID_aes_128_ofb:
		*cipher = &aesni_128_ofb;
		break;
	case NID_aes_128_ctr:
		*cipher = &aesni_128_ctr;
		break;

	case NID_aes_192_ecb:
		*cipher = &aesni_192_ecb;
@@ -373,6 +535,9 @@ aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher,
	case NID_aes_192_ofb:
		*cipher = &aesni_192_ofb;
		break;
	case NID_aes_192_ctr:
		*cipher = &aesni_192_ctr;
		break;

	case NID_aes_256_ecb:
		*cipher = &aesni_256_ecb;
@@ -386,6 +551,9 @@ aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher,
	case NID_aes_256_ofb:
		*cipher = &aesni_256_ofb;
		break;
	case NID_aes_256_ctr:
		*cipher = &aesni_256_ctr;
		break;

	default:
		/* Sorry, we don't support this NID */