Commit 3d5fd312 authored by Andy Polyakov's avatar Andy Polyakov
Browse files

Avoid L1 cache aliasing even between key and S-boxes.

parent c7199e62
Loading
Loading
Loading
Loading
+108 −32
Original line number Diff line number Diff line
@@ -6,7 +6,7 @@
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# Version 3.3.
# Version 3.4.
#
# You might fail to appreciate this module performance from the first
# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
@@ -60,6 +60,12 @@
# misaligned, which unfortunately has negative impact on elder IA-32
# implementations, Pentium suffered 30% penalty, PIII - 10%.
#
# Version 3.3 avoids L1 cache aliasing between stack frame and
# S-boxes, and 3.4 - L1 cache aliasing even between key schedule. The
# latter is achieved by copying the key schedule to controlled place in
# stack. This unfortunately has rather strong impact on small block CBC
# performance, ~2x deterioration on 16-byte block if compared to 3.3.
#
# Current ECB performance numbers for 128-bit key in CPU cycles per
# processed byte [measure commonly used by AES benchmarkers] are:
#
@@ -81,6 +87,12 @@ $s3="edx";
$key="edi";
$acc="esi";

$compromise=0;		# $compromise=128 abstains from copying key
			# schedule to stack when encrypting inputs
			# shorter than 128 bytes at the cost of
			# risksing aliasing with S-boxes. In return
			# you get way better, up to +70%, small block
			# performance.
$small_footprint=1;	# $small_footprint=1 code is ~5% slower [on
			# recent µ-archs], but ~5 times smaller!
			# I favor compact code to minimize cache
@@ -792,6 +804,7 @@ my $_key=&DWP(32,"esp"); #copy of wparam(3)
my $_ivp=&DWP(36,"esp");	#copy of wparam(4)
my $_tmp=&DWP(40,"esp");	#volatile variable
my $ivec=&DWP(44,"esp");	#ivec[16]
my $aes_key=&DWP(60,"esp");	#copy of aes_key

&public_label("AES_Te");
&public_label("AES_Td");
@@ -804,28 +817,37 @@ my $ivec=&DWP(44,"esp"); #ivec[16]
	&set_label("pic_point");
	&blindpop("ebp");

	&pushf	();
	&cld	();

	&cmp	(&wparam(5),0);
	&je	(&label("DECRYPT"));

	&lea    ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp"));

	# allocate aligned stack frame...
	&lea	($key,&DWP(-44,"esp"));
	&lea	($key,&DWP(-64-260,"esp"));
	&and	($key,-64);

	# ... and make sure it doesn't alias with AES_Te modulo 4096
	&mov	($s1,"ebp");
	&mov	($s0,"ebp");
	&lea	($s1,&DWP(2048,"ebp"));
	&mov	($s3,$key);
	&and	($s1,0xfff);		# t = %ebp&0xfff
	&and	($s0,0xfff);		# s = %ebp&0xfff
	&and	($s1,0xfff);		# e = (%ebp+2048)&0xfff
	&and	($s3,0xfff);		# p = %esp&0xfff

	&cmp	($s3,$s1);		# if (p<t) goto ok
	&jb	(&label("te_ok"));
	&lea	($acc,&DWP(2048,$s1));
	&cmp	($s3,$acc);		# if (p>=(t+2048)) goto ok
	&jae	(&label("te_ok"));
	&sub	($s1,$s3);		# t -= p
	&lea	($key,&DWP(-64,$key,$s1));# %esp -= (p-t)+64
	&cmp	($s3,$s1);		# if (p>=e) %esp =- (p-e);
	&jb	(&label("te_break_out"));
	&sub	($s3,$s1);
	&sub	($key,$s3);
	&jmp	(&label("te_ok"));
	&set_label("te_break_out");	# else %esp -= (p-s)&0xfff + framesz;
	&sub	($s3,$s0);
	&and	($s3,0xfff);
	&add	($s3,64+320);
	&sub	($key,$s3);
	&align	(4);
	&set_label("te_ok");

	&mov	($s0,&wparam(0));	# load inp
@@ -843,6 +865,19 @@ my $ivec=&DWP(44,"esp"); #ivec[16]
	&mov	($_key,$s3);		# save copy of key
	&mov	($_ivp,$acc);		# save copy of ivp

	if ($compromise) {
		&cmp	($s2,$compromise);
		&jb	(&label("skip_ecopy"));
	}
	# copy key schedule to stack
	&mov	("ecx",260/4);
	&mov	("esi",$s3);
	&lea	("edi",$aes_key);
	&mov	($_key,"edi");
	&align	(4);
	&data_word(0xF689A5F3);	# rep movsd
	&set_label("skip_ecopy") if ($compromise);

	&mov	($acc,$s0);
	&mov	($key,16);
	&align	(4);
@@ -906,28 +941,42 @@ my $ivec=&DWP(44,"esp"); #ivec[16]
	&mov	(&DWP(4,$acc),$s1);
	&mov	(&DWP(8,$acc),$s2);
	&mov	(&DWP(12,$acc),$s3);

	&mov	("edi",$_key);
	&mov	("esp",$_esp);
	if ($compromise) {
		&cmp	(&wparam(2),$compromise);
		&jb	(&label("skip_ezero"));
	}
	# zero copy of key schedule
	&mov	("ecx",256/4);
	&xor	("eax","eax");
	&align	(4);
	&data_word(0xF689ABF3);	# rep stosd
	&set_label("skip_ezero") if ($compromise);
	&popf	();
    &set_label("enc_out");
	&function_end_A();
	&pushf	();			# kludge, never executed

    &align	(4);
    &set_label("enc_tail");
	&push	($key eq "edi" ? $key : "");	# push ivp
	&pushf	();
	&mov	($key,$_out);			# load out
	&mov	($s1,16);
	&sub	($s1,$s2);
	&cmp	($key,$acc);			# compare with inp
	&je	(&label("enc_in_place"));
	&data_word(0x90A4F3FC);	# cld; rep movsb; nop	# copy input
	&align	(4);
	&data_word(0xF689A4F3);	# rep movsb	# copy input
	&jmp	(&label("enc_skip_in_place"));
    &set_label("enc_in_place");
	&lea	($key,&DWP(0,$key,$s2));
    &set_label("enc_skip_in_place");
	&mov	($s2,$s1);
	&xor	($s0,$s0);
	&data_word(0x90AAF3FC);	# cld; rep stosb; nop	# zero tail
	&popf	();
	&align	(4);
	&data_word(0xF689AAF3);	# rep stosb	# zero tail
	&pop	($key);				# pop ivp

	&mov	($acc,$_out);			# output as input
@@ -942,22 +991,28 @@ my $ivec=&DWP(44,"esp"); #ivec[16]
	&lea    ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp"));

	# allocate aligned stack frame...
	&lea	($key,&DWP(-64,"esp"));
	&lea	($key,&DWP(-64-260,"esp"));
	&and	($key,-64);

	# ... and make sure it doesn't alias with AES_Td modulo 4096
	&mov	($s1,"ebp");
	&mov	($s0,"ebp");
	&lea	($s1,&DWP(3072,"ebp"));
	&mov	($s3,$key);
	&and	($s1,0xfff);		# t = %ebp&0xfff
	&and	($s0,0xfff);		# s = %ebp&0xfff
	&and	($s1,0xfff);		# e = (%ebp+3072)&0xfff
	&and	($s3,0xfff);		# p = %esp&0xfff

	&cmp	($s3,$s1);		# if (p<t) goto ok
	&jb	(&label("td_ok"));
	&lea	($acc,&DWP(3072,$s1));
	&cmp	($s3,$acc);		# if (p>=(t+3072)) goto ok
	&jae	(&label("td_ok"));
	&sub	($s1,$s3);		# t -= p
	&lea	($key,&DWP(-64,$key,$s1));# %esp -= (p-t)+64
	&cmp	($s3,$s1);		# if (p>=e) %esp =- (p-e);
	&jb	(&label("td_break_out"));
	&sub	($s3,$s1);
	&sub	($key,$s3);
	&jmp	(&label("td_ok"));
	&set_label("td_break_out");	# else %esp -= (p-s)&0xfff + framesz;
	&sub	($s3,$s0);
	&and	($s3,0xfff);
	&add	($s3,64+320);
	&sub	($key,$s3);
	&align	(4);
	&set_label("td_ok");

	&mov	($s0,&wparam(0));	# load inp
@@ -975,6 +1030,19 @@ my $ivec=&DWP(44,"esp"); #ivec[16]
	&mov	($_key,$s3);		# save copy of key
	&mov	($_ivp,$acc);		# save copy of ivp

	if ($compromise) {
		&cmp	($s2,$compromise);
		&jb	(&label("skip_dcopy"));
	}
	# copy key schedule to stack
	&mov	("ecx",260/4);
	&mov	("esi",$s3);
	&lea	("edi",$aes_key);
	&mov	($_key,"edi");
	&align	(4);
	&data_word(0xF689A5F3);	# rep movsd
	&set_label("skip_dcopy") if ($compromise);

	&mov	($acc,$s0);
	&mov	($key,24);
	&align	(4);
@@ -1053,9 +1121,7 @@ my $ivec=&DWP(44,"esp"); #ivec[16]
	&lea	($s2 eq "ecx" ? $s2 : "",&DWP(16,$acc));
	&mov	($acc eq "esi" ? $acc : "",$key);
	&mov	($key eq "edi" ? $key : "",$_out);	# load out
	&pushf	();
	&data_word(0x90A4F3FC);	# cld; rep movsb; nop	# copy output
	&popf	();
	&data_word(0xF689A4F3);	# rep movsb		# copy output
	&mov	($key,$_inp);				# use inp as temp ivp
	&jmp	(&label("dec_end"));

@@ -1122,13 +1188,23 @@ my $ivec=&DWP(44,"esp"); #ivec[16]
	&lea	($key,&DWP(0,$key,$s2));
	&lea	($acc,&DWP(16,$acc,$s2));
	&neg	($s2 eq "ecx" ? $s2 : "");
	&pushf	();
	&data_word(0x90A4F3FC);	# cld; rep movsb; nop	# restore tail
	&popf	();
	&data_word(0xF689A4F3);	# rep movsb	# restore tail

    &align	(4);
    &set_label("dec_out");
    &mov	("edi",$_key);
    &mov	("esp",$_esp);
    if ($compromise) {
	&cmp	(&wparam(2),$compromise);
	&jb	(&label("skip_dzero"));
    }
    # zero copy of key schedule
    &mov	("ecx",256/4);
    &xor	("eax","eax");
    &align	(4);
    &data_word(0xF689ABF3);	# rep stosd
    &set_label("skip_dzero") if ($compromise);
    &popf	();
&function_end("AES_cbc_encrypt");
}

+1 −0
Original line number Diff line number Diff line
@@ -176,6 +176,7 @@ sub main'rdtsc { &out0("DW\t0310Fh"); }
sub main'halt	{ &out0("hlt"); }
sub main'movz	{ &out2("movzx",@_); }
sub main'neg	{ &out1("neg",@_); }
sub main'cld	{ &out0("cld"); }

# SSE2
sub main'emms	{ &out0("emms"); }
+1 −0
Original line number Diff line number Diff line
@@ -194,6 +194,7 @@ sub main'rdtsc { &out0("rdtsc"); }
sub main'halt	{ &out0("hlt"); }
sub main'movz	{ &out2("movzx",@_); }
sub main'neg	{ &out1("neg",@_); }
sub main'cld	{ &out0("cld"); }

# SSE2
sub main'emms	{ &out0("emms"); }
+1 −0
Original line number Diff line number Diff line
@@ -210,6 +210,7 @@ sub main'rdtsc { &out0(".byte\t0x0f,0x31"); }
sub main'halt	{ &out0("hlt"); }
sub main'movz	{ &out2("movzbl",@_); }
sub main'neg	{ &out1("negl",@_); }
sub main'cld	{ &out0("cld"); }

# SSE2
sub main'emms	{ &out0("emms"); }