Commit adc4f1fc authored by Andy Polyakov

bn/asm/x86[_64]-mont*.pl: complement alloca with page-walking.



Some OSes, *cough*-dows, insist on the stack being "wired" to
physical memory in a strictly sequential manner, i.e. if a stack
allocation spans two pages, then a reference to the farther one
is punishable by SEGV. But page walking can do good even on other
OSes, because it guarantees that a villain thread hits the guard
page before it can do damage to an innocent one...

Reviewed-by: Rich Salz <rsalz@openssl.org>
parent 56cd71b4
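
To make the technique concrete, here is a minimal C sketch of the same
page-walk idea (illustration only, not code from this commit; the
page_walk name and the fixed 4096-byte page size are assumptions, the
latter matching the constant used in the assembly below). After a stack
reservation that may span several pages, one word per page is touched,
starting at the end nearest the already-committed stack and stepping
down toward the new stack pointer, so every probe lands on the guard
page before skipping past it:

#include <stddef.h>

#define PAGE_SIZE 4096		/* assumed page size; matches the 4096 in the asm */

/*
 * Probe one byte per page of a freshly reserved region, highest offset
 * first. This mirrors the .L*_page_walk loops below: the offset starts
 * at the page-aligned size of the reservation and drops by one page per
 * iteration until the subtraction would borrow (the asm's jnc test).
 */
static void page_walk(volatile unsigned char *base, size_t len)
{
	size_t off = len & ~(size_t)(PAGE_SIZE - 1);

	for (;;) {
		(void)base[off];	/* the probing load, cf. mov (%rsp,%r11),%r10 */
		if (off == 0)
			break;		/* offset 0 probed; the asm loop exits on carry */
		off -= PAGE_SIZE;
	}
}

In the assembly, base corresponds to the new stack pointer and len to
its distance from the old one; the 0x2e (and 0x66,0x2e) prefix bytes in
front of jnc are only "predict non-taken" branch hints and do not change
what the loop does.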
bn/asm/x86-mont.pl +15 −0
@@ -85,6 +85,21 @@ $frame=32; # size of above frame rounded up to 16n

	&and	("esp",-64);		# align to cache line

	# Some OSes, *cough*-dows, insist on the stack being "wired" to
	# physical memory in a strictly sequential manner, i.e. if a stack
	# allocation spans two pages, then a reference to the farther one
	# is punishable by SEGV. But page walking can do good even on
	# other OSes, because it guarantees that a villain thread hits
	# the guard page before it can do damage to an innocent one...
	&mov	("eax","ebp");
	&sub	("eax","esp");
	&and	("eax",-4096);
&set_label("page_walk");
	&mov	("edx",&DWP(0,"esp","eax"));
	&sub	("eax",4096);
	&data_byte(0x2e);		# predict non-taken
	&jnc	(&label("page_walk"));

	################################# load argument block...
	&mov	("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
	&mov	("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
bn/asm/x86_64-mont.pl +41 −1
@@ -130,6 +130,20 @@ $code.=<<___;

	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	# Some OSes, *cough*-dows, insist on the stack being "wired" to
	# physical memory in a strictly sequential manner, i.e. if a stack
	# allocation spans two pages, then a reference to the farther one
	# is punishable by SEGV. But page walking can do good even on
	# other OSes, because it guarantees that a villain thread hits
	# the guard page before it can do damage to an innocent one...
	sub	%rsp,%r11
	and	\$-4096,%r11
.Lmul_page_walk:
	mov	(%rsp,%r11),%r10
	sub	\$4096,%r11
	.byte	0x66,0x2e		# predict non-taken
	jnc	.Lmul_page_walk

	mov	$bp,%r12		# reassign $bp
___
		$bp="%r12";
@@ -342,6 +356,14 @@ $code.=<<___;

	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul4x_body:
	sub	%rsp,%r11
	and	\$-4096,%r11
.Lmul4x_page_walk:
	mov	(%rsp,%r11),%r10
	sub	\$4096,%r11
	.byte	0x2e			# predict non-taken
	jnc	.Lmul4x_page_walk

	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
	mov	%rdx,%r12		# reassign $bp
___
@@ -795,6 +817,15 @@ bn_sqr8x_mont:
	sub	%r11,%rsp
.Lsqr8x_sp_done:
	and	\$-64,%rsp
	mov	%rax,%r11
	sub	%rsp,%r11
	and	\$-4096,%r11
.Lsqr8x_page_walk:
	mov	(%rsp,%r11),%r10
	sub	\$4096,%r11
	.byte	0x2e			# predict non-taken
	jnc	.Lsqr8x_page_walk

	mov	$num,%r10
	neg	$num

@@ -932,8 +963,17 @@ bn_mulx4x_mont:
	sub	$num,%r10		# -$num
	mov	($n0),$n0		# *n0
	lea	-72(%rsp,%r10),%rsp	# alloca(frame+$num+8)
	lea	($bp,$num),%r10
	and	\$-128,%rsp
	mov	%rax,%r11
	sub	%rsp,%r11
	and	\$-4096,%r11
.Lmulx4x_page_walk:
	mov	(%rsp,%r11),%r10
	sub	\$4096,%r11
	.byte	0x66,0x2e		# predict non-taken
	jnc	.Lmulx4x_page_walk

	lea	($bp,$num),%r10
	##############################################################
	# Stack layout
	# +0	num
bn/asm/x86_64-mont5.pl +60 −1
@@ -115,6 +115,20 @@ $code.=<<___;

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	# Some OSes, *cough*-dows, insist on the stack being "wired" to
	# physical memory in a strictly sequential manner, i.e. if a stack
	# allocation spans two pages, then a reference to the farther one
	# is punishable by SEGV. But page walking can do good even on
	# other OSes, because it guarantees that a villain thread hits
	# the guard page before it can do damage to an innocent one...
	sub	%rsp,%rax
	and	\$-4096,%rax
.Lmul_page_walk:
	mov	(%rsp,%rax),%r11
	sub	\$4096,%rax
	.byte	0x2e			# predict non-taken
	jnc	.Lmul_page_walk

	lea	128($bp),%r12		# reassign $bp (+size optimization)
___
		$bp="%r12";
@@ -469,6 +483,15 @@ $code.=<<___;
	sub	%r11,%rsp
.Lmul4xsp_done:
	and	\$-64,%rsp
	mov	%rax,%r11
	sub	%rsp,%r11
	and	\$-4096,%r11
.Lmul4x_page_walk:
	mov	(%rsp,%r11),%r10
	sub	\$4096,%r11
	.byte	0x2e			# predict non-taken
	jnc	.Lmul4x_page_walk

	neg	$num

	mov	%rax,40(%rsp)
@@ -1058,6 +1081,15 @@ $code.=<<___;
	sub	%r11,%rsp
.Lpwr_sp_done:
	and	\$-64,%rsp
	mov	%rax,%r11
	sub	%rsp,%r11
	and	\$-4096,%r11
.Lpwr_page_walk:
	mov	(%rsp,%r11),%r10
	sub	\$4096,%r11
	.byte	0x2e			# predict non-taken
	jnc	.Lpwr_page_walk

	mov	$num,%r10	
	neg	$num

@@ -2028,6 +2060,15 @@ bn_from_mont8x:
	sub	%r11,%rsp
.Lfrom_sp_done:
	and	\$-64,%rsp
	mov	%rax,%r11
	sub	%rsp,%r11
	and	\$-4096,%r11
.Lfrom_page_walk:
	mov	(%rsp,%r11),%r10
	sub	\$4096,%r11
	.byte	0x2e			# predict non-taken
	jnc	.Lfrom_page_walk

	mov	$num,%r10
	neg	$num

@@ -2173,6 +2214,15 @@ bn_mulx4x_mont_gather5:
	sub	%r11,%rsp
.Lmulx4xsp_done:	
	and	\$-64,%rsp		# ensure alignment
	mov	%rax,%r11
	sub	%rsp,%r11
	and	\$-4096,%r11
.Lmulx4x_page_walk:
	mov	(%rsp,%r11),%r10
	sub	\$4096,%r11
	.byte	0x2e			# predict non-taken
	jnc	.Lmulx4x_page_walk

	##############################################################
	# Stack layout
	# +0	-num
@@ -2619,6 +2669,15 @@ bn_powerx5:
	sub	%r11,%rsp
.Lpwrx_sp_done:
	and	\$-64,%rsp
	mov	%rax,%r11
	sub	%rsp,%r11
	and	\$-4096,%r11
.Lpwrx_page_walk:
	mov	(%rsp,%r11),%r10
	sub	\$4096,%r11
	.byte	0x2e			# predict non-taken
	jnc	.Lpwrx_page_walk

	mov	$num,%r10	
	neg	$num