Commit ed998634 authored by Andy Polyakov
Browse files

e_padlock-x86[_64].pl: better understanding of prefetch errata and proper workaround.
parent 884c580e
Loading
Loading
Loading
Loading
+81 −23
Original line number Diff line number Diff line
@@ -37,7 +37,7 @@ require "x86asm.pl";

&asm_init($ARGV[0],$0);

%PADLOCK_MARGIN=(ecb=>128, cbc=>64); # prefetch errata
%PADLOCK_PREFETCH=(ecb=>128, cbc=>64);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 larger than 16

$ctx="edx";
@@ -188,10 +188,6 @@ my ($mode,$opcode) = @_;
	&movq	("mm0",&QWP(-16,$ctx));	# load [upper part of] counter
					} else {
	&xor	("ebx","ebx");
    if ($PADLOCK_MARGIN{$mode}) {
	&cmp	($len,$PADLOCK_MARGIN{$mode});
	&jbe	(&label("${mode}_short"));
    }
	&test	(&DWP(0,$ctx),1<<5);	# align bit in control word
	&jnz	(&label("${mode}_aligned"));
	&test	($out,0x0f);
@@ -212,7 +208,27 @@ my ($mode,$opcode) = @_;
	&neg	("eax");
	&and	($chunk,$PADLOCK_CHUNK-1);	# chunk=len%PADLOCK_CHUNK
	&lea	("esp",&DWP(0,"eax","ebp"));	# alloca
	&mov	("eax",$PADLOCK_CHUNK);
	&cmovz	($chunk,"eax");			# chunk=chunk?:PADLOCK_CHUNK
	&mov	("eax","ebp");
	&and	("ebp",-16);
	&and	("esp",-16);
	&mov	(&DWP(16,"ebp"),"eax");
    if ($PADLOCK_PREFETCH{$mode}) {
	&cmp	($len,$chunk);
	&ja	(&label("${mode}_loop"));
	&mov	("eax",$inp);		# check if prefetch crosses page
	&cmp	("ebp","esp");
	&cmove	("eax",$out);
	&add	("eax",$len);
	&neg	("eax");
	&and	("eax",0xfff);		# distance to page boundary
	&cmp	("eax",$PADLOCK_PREFETCH{$mode});
	&mov	("eax",-$PADLOCK_PREFETCH{$mode});
	&cmovae	("eax",$chunk);		# mask=distance<prefetch?-prefetch:-1
	&and	($chunk,"eax");
	&jz	(&label("${mode}_unaligned_tail"));
    }
	&jmp	(&label("${mode}_loop"));

&set_label("${mode}_loop",16);
@@ -276,8 +292,8 @@ my ($mode,$opcode) = @_;
	&test	($out,0x0f);
	&jz	(&label("${mode}_out_aligned"));
	&mov	($len,$chunk);
	&shr	($len,2);
	&lea	($inp,&DWP(0,"esp"));
	&shr	($len,2);
	&data_byte(0xf3,0xa5);			# rep movsl
	&sub	($out,$chunk);
&set_label("${mode}_out_aligned");
@@ -288,7 +304,30 @@ my ($mode,$opcode) = @_;
	&add	($inp,$chunk);
	&sub	($len,$chunk);
	&mov	($chunk,$PADLOCK_CHUNK);
    if (!$PADLOCK_PREFETCH{$mode}) {
	&jnz	(&label("${mode}_loop"));
    } else {
	&jz	(&label("${mode}_break"));
	&cmp	($len,$chunk);
	&jae	(&label("${mode}_loop"));

&set_label("${mode}_unaligned_tail");
	&xor	("eax","eax");
	&cmp	("esp","ebp");
	&cmove	("eax",$len);
	&sub	("esp","eax");			# alloca
	&mov	("eax", $out);			# save parameters
	&mov	($chunk,$len);
	&shr	($len,2);
	&lea	($out,&DWP(0,"esp"));
	&data_byte(0xf3,0xa5);			# rep movsl
	&mov	($inp,"esp");
	&mov	($out,"eax");			# restore parameters
	&mov	($len,$chunk);
	&jmp	(&label("${mode}_loop"));

&set_label("${mode}_break",16);
    }
						if ($mode ne "ctr32") {
	&cmp	("esp","ebp");
	&je	(&label("${mode}_done"));
@@ -302,28 +341,24 @@ my ($mode,$opcode) = @_;
	&ja	(&label("${mode}_bzero"));

&set_label("${mode}_done");
	&mov	("ebp",&DWP(16,"ebp"));
	&lea	("esp",&DWP(24,"ebp"));
						if ($mode ne "ctr32") {
	&jmp	(&label("${mode}_exit"));

&set_label("${mode}_short",16);
	&xor	("eax","eax");
	&lea	("ebp",&DWP(-24,"esp"));
	&sub	("eax",$len);
	&lea	("esp",&DWP(0,"eax","ebp"));
	&and	("esp",-16);
	&xor	($chunk,$chunk);
&set_label("${mode}_short_copy");
	&movups	("xmm0",&QWP(0,$inp,$chunk));
	&lea	($chunk,&DWP(16,$chunk));
	&cmp	($len,$chunk);
	&movaps	(&QWP(-16,"esp",$chunk),"xmm0");
	&ja	(&label("${mode}_short_copy"));
	&mov	($inp,"esp");
	&mov	($chunk,$len);
	&jmp	(&label("${mode}_loop"));

&set_label("${mode}_aligned",16);
    if ($PADLOCK_PREFETCH{$mode}) {
	&lea	("ebp",&DWP(0,$inp,$len));
	&neg	("ebp");
	&and	("ebp",0xfff);			# distance to page boundary
	&xor	("eax","eax");
	&cmp	("ebp",$PADLOCK_PREFETCH{$mode});
	&mov	("ebp",$PADLOCK_PREFETCH{$mode}-1);
	&cmovae	("ebp","eax");
	&and	("ebp",$len);			# remainder
	&sub	($len,"ebp");
	&jz	(&label("${mode}_aligned_tail"));
    }
	&lea	("eax",&DWP(-16,$ctx));		# ivp
	&lea	("ebx",&DWP(16,$ctx));		# key
	&shr	($len,4);			# len/=AES_BLOCK_SIZE
@@ -332,6 +367,29 @@ my ($mode,$opcode) = @_;
	&movaps	("xmm0",&QWP(0,"eax"));
	&movaps	(&QWP(-16,$ctx),"xmm0");	# copy [or refresh] iv
						}
    if ($PADLOCK_PREFETCH{$mode}) {
	&test	("ebp","ebp");
	&jz	(&label("${mode}_exit"));

&set_label("${mode}_aligned_tail");
	&mov	($len,"ebp");
	&lea	("ebp",&DWP(-24,"esp"));
	&mov	("esp","ebp");
	&mov	("eax","ebp");
	&sub	("esp",$len);
	&and	("ebp",-16);
	&and	("esp",-16);
	&mov	(&DWP(16,"ebp"),"eax");
	&mov	("eax", $out);			# save parameters
	&mov	($chunk,$len);
	&shr	($len,2);
	&lea	($out,&DWP(0,"esp"));
	&data_byte(0xf3,0xa5);			# rep movsl
	&mov	($inp,"esp");
	&mov	($out,"eax");			# restore parameters
	&mov	($len,$chunk);
	&jmp	(&label("${mode}_loop"));
    }
&set_label("${mode}_exit");			}
	&mov	("eax",1);
	&lea	("esp",&DWP(4,"esp"));		# popf
+123 −55
Original line number Diff line number Diff line
@@ -27,7 +27,7 @@ open STDOUT,"| $^X $xlate $flavour $output";

$code=".text\n";

%PADLOCK_MARGIN=(ecb=>128, cbc=>64, ctr32=>64);	# prefetch errata
%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 between 32 and 2^20

$ctx="%rdx";
@@ -285,17 +285,6 @@ padlock_${mode}_encrypt:
	lea	16($ctx),$ctx		# control word
	xor	%eax,%eax
	xor	%ebx,%ebx
___
# Formally speaking, the correct condition is $len<=$margin and $inp+$margin
# crosses a page boundary [and the next page is unreadable]. But $inp can
# be unaligned, in which case data can be copied to $out if the latter is
# aligned, in which case $out+$margin has to be checked. Covering all
# cases appears more complicated than just copying short input...
$code.=<<___	if ($PADLOCK_MARGIN{$mode});
	cmp	\$$PADLOCK_MARGIN{$mode},$len
	jbe	.L${mode}_short
___
$code.=<<___;
	testl	\$`1<<5`,($ctx)		# align bit in control word
	jnz	.L${mode}_aligned
	test	\$0x0f,$out
@@ -315,6 +304,8 @@ $code.=<<___;
	neg	%rax
	and	\$$PADLOCK_CHUNK-1,$chunk	# chunk%=PADLOCK_CHUNK
	lea	(%rax,%rbp),%rsp
	mov	\$$PADLOCK_CHUNK,%rax
	cmovz	%rax,$chunk			# chunk=chunk?:PADLOCK_CHUNK
___
$code.=<<___				if ($mode eq "ctr32");
.L${mode}_reenter:
@@ -322,10 +313,27 @@ $code.=<<___ if ($mode eq "ctr32");
	bswap	%eax
	neg	%eax
	and	\$`$PADLOCK_CHUNK/16-1`,%eax
	jz	.L${mode}_loop
	mov	\$$PADLOCK_CHUNK,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross PADLOCK_CHUNK
	cmovbe	$len,$chunk
___
$code.=<<___				if ($PADLOCK_PREFETCH{$mode});
	cmp	$chunk,$len
	ja	.L${mode}_loop
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax		# mask=distance<prefetch?-prefetch:-1
	and	%rax,$chunk
	jz	.L${mode}_unaligned_tail
___
$code.=<<___;
	jmp	.L${mode}_loop
@@ -360,12 +368,12 @@ ___
$code.=<<___				if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	test	\$0xffff0000,%eax
	jnz	.L${mode}_no_corr
	jnz	.L${mode}_no_carry
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)
.L${mode}_no_corr:
.L${mode}_no_carry:
___
$code.=<<___;
	mov	%r8,$out		# restore paramters
@@ -373,8 +381,8 @@ $code.=<<___;
	test	\$0x0f,$out
	jz	.L${mode}_out_aligned
	mov	$chunk,$len
	shr	\$3,$len
	lea	(%rsp),$inp
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
.L${mode}_out_aligned:
@@ -384,9 +392,52 @@ $code.=<<___;
	add	$chunk,$inp
	sub	$chunk,$len
	mov	\$$PADLOCK_CHUNK,$chunk
___
					if (!$PADLOCK_PREFETCH{$mode}) {
$code.=<<___;
	jnz	.L${mode}_loop

___
					} else {
$code.=<<___;
	jz	.L${mode}_break
	cmp	$chunk,$len
	jae	.L${mode}_loop
___
$code.=<<___				if ($mode eq "ctr32");
	mov	$len,$chunk
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax
	and	%rax,$chunk
	jnz	.L${mode}_loop
___
$code.=<<___;
.L${mode}_unaligned_tail:
	xor	%eax,%eax
	cmp	%rsp,%rbp
	cmove	$len,%rax
	mov	$out,%r8		# save parameters
	mov	$len,$chunk
	sub	%rax,%rsp		# alloca
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	mov	%rsp,$inp
	mov	%r8, $out		# restore parameters
	mov	$chunk,$len
	jmp	.L${mode}_loop
.align	16
.L${mode}_break:
___
					}
$code.=<<___;
	cmp	%rbp,%rsp
	je	.L${mode}_done

	pxor	%xmm0,%xmm0
@@ -400,70 +451,87 @@ $code.=<<___;
.L${mode}_done:
	lea	(%rbp),%rsp
	jmp	.L${mode}_exit
___
$code.=<<___ if ($PADLOCK_MARGIN{$mode});
.align	16
.L${mode}_short:
	mov	%rsp,%rbp
	sub	$len,%rsp
	xor	$chunk,$chunk
.L${mode}_short_copy:
	movups	($inp,$chunk),%xmm0
	lea	16($chunk),$chunk
	cmp	$chunk,$len
	movaps	%xmm0,-16(%rsp,$chunk)
	ja	.L${mode}_short_copy
	mov	%rsp,$inp
	mov	$len,$chunk
	jmp	.L${mode}_`${mode} eq "ctr32"?"reenter":"loop"`
___
$code.=<<___;

.align	16
.L${mode}_aligned:
___
$code.=<<___				if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	mov	\$`16*0x10000`,$chunk
	bswap	%eax
	cmp	$len,$chunk
	cmova	$len,$chunk
	neg	%eax
	and	\$0xffff,%eax
	jz	.L${mode}_aligned_loop
	mov	\$`16*0x10000`,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross 2^16
	jmp	.L${mode}_aligned_loop
.align	16
	cmovbe	$len,$chunk
	jbe	.L${mode}_aligned_skip

.L${mode}_aligned_loop:
	cmp	$len,$chunk
	cmova	$len,$chunk
	mov	$len,%r10		# save parameters
	mov	$chunk,$len
	mov	$chunk,%r11
___
$code.=<<___;

	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___				if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___				if ($mode eq "ctr32");

	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)

	mov	%r11,$chunk		# restore paramters
	mov	%r10,$len
	sub	$chunk,$len
	mov	%r10,$len		# restore paramters
	sub	%r11,$len
	mov	\$`16*0x10000`,$chunk
	jnz	.L${mode}_aligned_loop
	jz	.L${mode}_exit
	cmp	$chunk,$len
	jae	.L${mode}_aligned_loop

.L${mode}_aligned_skip:
___
$code.=<<___				if ($PADLOCK_PREFETCH{$mode});
	lea	($inp,$len),%rbp
	neg	%rbp
	and	\$0xfff,%rbp		# distance to page boundary
	xor	%eax,%eax
	cmp	\$$PADLOCK_PREFETCH{$mode},%rbp
	mov	\$$PADLOCK_PREFETCH{$mode}-1,%rbp
	cmovae	%rax,%rbp
	and	$len,%rbp		# remainder
	sub	%rbp,$len
	jz	.L${mode}_aligned_tail
___
$code.=<<___;
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___				if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___				if ($PADLOCK_PREFETCH{$mode});
	test	%rbp,%rbp		# check remainder
	jz	.L${mode}_exit

.L${mode}_aligned_tail:
	mov	$out,%r8
	mov	%rbp,$chunk
	mov	%rbp,$len
	lea	(%rsp),%rbp
	sub	$len,%rsp
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq	
	lea	(%r8),$out
	lea	(%rsp),$inp
	mov	$chunk,$len
	jmp	.L${mode}_loop
___
$code.=<<___;
.L${mode}_exit: