Commit a1597194 authored by Andy Polyakov
Browse files

bn/asm/x86[_64]-mont*.pl: complement alloca with page-walking.



Some OSes, *cough*-dows, insist on the stack being "wired" to
physical memory in a strictly sequential manner, i.e. if a stack
allocation spans two pages, then a reference to the farther one can
be punishable by SEGV. But page walking can do good even on
other OSes, because it guarantees that a villain thread hits
the guard page before it can do damage to an innocent one...

Reviewed-by: Rich Salz <rsalz@openssl.org>
(cherry picked from commit adc4f1fc)

Resolved conflicts:
	crypto/bn/asm/x86_64-mont.pl
	crypto/bn/asm/x86_64-mont5.pl

Reviewed-by: Richard Levitte <levitte@openssl.org>
parent 6e7a1f35
Loading
Loading
Loading
Loading
+15 −0
Original line number Diff line number Diff line
@@ -85,6 +85,21 @@ $frame=32; # size of above frame rounded up to 16n

	&and	("esp",-64);		# align to cache line

	# Some OSes, *cough*-dows, insist on stack being "wired" to
	# physical memory in strictly sequential manner, i.e. if stack
	# allocation spans two pages, then reference to farmost one can
	# be punishable by SEGV. But page walking can do good even on
	# other OSes, because it guarantees that villain thread hits
	# the guard page before it can make damage to innocent one...
	&mov	("eax","ebp");
	&sub	("eax","esp");
	&and	("eax",-4096);
&set_label("page_walk");
	&mov	("edx",&DWP(0,"esp","eax"));
	&sub	("eax",4096);
	&data_byte(0x2e);
	&jnc	(&label("page_walk"));

	################################# load argument block...
	&mov	("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
	&mov	("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
+37 −3
Original line number Diff line number Diff line
@@ -91,6 +91,20 @@ bn_mul_mont:

	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	# Some OSes, *cough*-dows, insist on stack being "wired" to
	# physical memory in strictly sequential manner, i.e. if stack
	# allocation spans two pages, then reference to farmost one can
	# be punishable by SEGV. But page walking can do good even on
	# other OSes, because it guarantees that villain thread hits
	# the guard page before it can make damage to innocent one...
	sub	%rsp,%r11
	and	\$-4096,%r11
.Lmul_page_walk:
	mov	(%rsp,%r11),%r10
	sub	\$4096,%r11
	.byte	0x66,0x2e		# predict non-taken
	jnc	.Lmul_page_walk

	mov	$bp,%r12		# reassign $bp
___
		$bp="%r12";
@@ -296,6 +310,14 @@ bn_mul4x_mont:

	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul4x_body:
	sub	%rsp,%r11
	and	\$-4096,%r11
.Lmul4x_page_walk:
	mov	(%rsp,%r11),%r10
	sub	\$4096,%r11
	.byte	0x2e			# predict non-taken
	jnc	.Lmul4x_page_walk

	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
	mov	%rdx,%r12		# reassign $bp
___
@@ -707,6 +729,7 @@ $code.=<<___;
.align	16
bn_sqr4x_mont:
.Lsqr4x_enter:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
@@ -715,12 +738,23 @@ bn_sqr4x_mont:
	push	%r15

	shl	\$3,${num}d		# convert $num to bytes
	xor	%r10,%r10
	mov	%rsp,%r11		# put aside %rsp
	sub	$num,%r10		# -$num
	neg	$num			# -$num
	mov	($n0),$n0		# *n0
	lea	-72(%rsp,%r10,2),%rsp	# alloca(frame+2*$num)
	lea	-72(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
	and	\$-1024,%rsp		# minimize TLB usage

	sub	%rsp,%r11
	and	\$-4096,%r11
.Lsqr4x_page_walk:
	mov	(%rsp,%r11),%r10
	sub	\$4096,%r11
	.byte	0x2e			# predict non-taken
	jnc	.Lsqr4x_page_walk

	mov	$num,%r10
	neg	$num			# restore $num
	lea	-48(%rax),%r11		# restore saved %rsp
	##############################################################
	# Stack layout
	#
+22 −0
Original line number Diff line number Diff line
@@ -84,6 +84,20 @@ bn_mul_mont_gather5:

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	# Some OSes, *cough*-dows, insist on stack being "wired" to
	# physical memory in strictly sequential manner, i.e. if stack
	# allocation spans two pages, then reference to farmost one can
	# be punishable by SEGV. But page walking can do good even on
	# other OSes, because it guarantees that villain thread hits
	# the guard page before it can make damage to innocent one...
	sub	%rsp,%rax
	and	\$-4096,%rax
.Lmul_page_walk:
	mov	(%rsp,%rax),%r11
	sub	\$4096,%rax
	.byte	0x2e			# predict non-taken
	jnc	.Lmul_page_walk

	lea	128($bp),%r12		# reassign $bp (+size optimization)
___
		$bp="%r12";
@@ -407,6 +421,14 @@ bn_mul4x_mont_gather5:

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul4x_body:
	sub	%rsp,%rax
	and	\$-4096,%rax
.Lmul4x_page_walk:
	mov	(%rsp,%rax),%r11
	sub	\$4096,%rax
	.byte	0x2e			# predict non-taken
	jnc	.Lmul4x_page_walk

	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
	lea	128(%rdx),%r12		# reassign $bp (+size optimization)
___