Commit 9c8bca1c authored by Andy Polyakov's avatar Andy Polyakov
Browse files

bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.



Original strategy for page-walking was adjust stack pointer and then
touch pages in order. This kind of asks for double-fault, because
if touch fails, then signal will be delivered to frame above adjusted
stack pointer. But touching pages prior adjusting stack pointer would
upset valgrind. As compromise let's adjust stack pointer in pages,
touching top of the stack. This still asks for double-fault, but at
least prevents corruption of neighbour stack if allocation is to
overstep the guard page.

Also omit predict-non-taken hints as they reportedly trigger illegal
instructions in some VM setups.

Reviewed-by: default avatarRichard Levitte <levitte@openssl.org>
(cherry picked from commit 3ba1ef82)
parent 91dc6054
Loading
Loading
Loading
Loading
+24 −17
Original line number Diff line number Diff line
@@ -63,27 +63,26 @@ $frame=32; # size of above frame rounded up to 16n

	&lea	("esi",&wparam(0));	# put aside pointer to argument block
	&lea	("edx",&wparam(1));	# load ap
	&mov	("ebp","esp");		# saved stack pointer!
	&add	("edi",2);		# extra two words on top of tp
	&neg	("edi");
	&lea	("esp",&DWP(-$frame,"esp","edi",4));	# alloca($frame+4*(num+2))
	&lea	("ebp",&DWP(-$frame,"esp","edi",4));	# future alloca($frame+4*(num+2))
	&neg	("edi");

	# minimize cache contention by arraning 2K window between stack
	# pointer and ap argument [np is also position sensitive vector,
	# but it's assumed to be near ap, as it's allocated at ~same
	# time].
	&mov	("eax","esp");
	&mov	("eax","ebp");
	&sub	("eax","edx");
	&and	("eax",2047);
	&sub	("esp","eax");		# this aligns sp and ap modulo 2048
	&sub	("ebp","eax");		# this aligns sp and ap modulo 2048

	&xor	("edx","esp");
	&xor	("edx","ebp");
	&and	("edx",2048);
	&xor	("edx",2048);
	&sub	("esp","edx");		# this splits them apart modulo 4096
	&sub	("ebp","edx");		# this splits them apart modulo 4096

	&and	("esp",-64);		# align to cache line
	&and	("ebp",-64);		# align to cache line

	# Some OSes, *cough*-dows, insist on stack being "wired" to
	# physical memory in strictly sequential manner, i.e. if stack
@@ -91,20 +90,28 @@ $frame=32; # size of above frame rounded up to 16n
	# be punishable by SEGV. But page walking can do good even on
	# other OSes, because it guarantees that villain thread hits
	# the guard page before it can make damage to innocent one...
	&mov	("eax","ebp");
	&sub	("eax","esp");
	&mov	("eax","esp");
	&sub	("eax","ebp");
	&and	("eax",-4096);
&set_label("page_walk");
	&mov	("edx",&DWP(0,"esp","eax"));
	&sub	("eax",4096);
	&data_byte(0x2e);
	&jnc	(&label("page_walk"));
	&mov	("edx","esp");		# saved stack pointer!
	&lea	("esp",&DWP(0,"ebp","eax"));
	&mov	("eax",&DWP(0,"esp"));
	&cmp	("esp","ebp");
	&ja	(&label("page_walk"));
	&jmp	(&label("page_walk_done"));

&set_label("page_walk",16);
	&lea	("esp",&DWP(-4096,"esp"));
	&mov	("eax",&DWP(0,"esp"));
	&cmp	("esp","ebp");
	&ja	(&label("page_walk"));
&set_label("page_walk_done");

	################################# load argument block...
	&mov	("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
	&mov	("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
	&mov	("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
	&mov	("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
	&mov	("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
	&mov	("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
	#&mov	("edi",&DWP(5*4,"esi"));# int num

@@ -112,11 +119,11 @@ $frame=32; # size of above frame rounded up to 16n
	&mov	($_rp,"eax");		# ... save a copy of argument block
	&mov	($_ap,"ebx");
	&mov	($_bp,"ecx");
	&mov	($_np,"edx");
	&mov	($_np,"ebp");
	&mov	($_n0,"esi");
	&lea	($num,&DWP(-3,"edi"));	# num=num-1 to assist modulo-scheduling
	#&mov	($_num,$num);		# redundant as $num is not reused
	&mov	($_sp,"ebp");		# saved stack pointer!
	&mov	($_sp,"edx");		# saved stack pointer!

if($sse2) {
$acc0="mm0";	# mmx register bank layout
+107 −78
Original line number Diff line number Diff line
@@ -97,6 +97,8 @@ $code=<<___;
.type	bn_mul_mont,\@function,6
.align	16
bn_mul_mont:
	mov	${num}d,${num}d
	mov	%rsp,%rax
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
@@ -121,29 +123,36 @@ $code.=<<___;
	push	%r14
	push	%r15

	mov	${num}d,${num}d
	lea	2($num),%r10
	neg	$num
	mov	%rsp,%r11
	neg	%r10
	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+2))
	and	\$-1024,%rsp		# minimize TLB usage
	lea	-16(%rsp,$num,8),%r10	# future alloca(8*(num+2))
	neg	$num			# restore $num
	and	\$-1024,%r10		# minimize TLB usage

	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	# Some OSes, *cough*-dows, insist on stack being "wired" to
	# physical memory in strictly sequential manner, i.e. if stack
	# allocation spans two pages, then reference to farmost one can
	# be punishable by SEGV. But page walking can do good even on
	# other OSes, because it guarantees that villain thread hits
	# the guard page before it can make damage to innocent one...
	sub	%rsp,%r11
	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:
	mov	(%rsp,%r11),%r10
	sub	\$4096,%r11
	.byte	0x66,0x2e		# predict non-taken
	jnc	.Lmul_page_walk
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	mov	$bp,%r12		# reassign $bp
___
		$bp="%r12";
@@ -314,13 +323,13 @@ $code.=<<___;

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lmul_epilogue:
	ret
.size	bn_mul_mont,.-bn_mul_mont
@@ -332,6 +341,8 @@ $code.=<<___;
.type	bn_mul4x_mont,\@function,6
.align	16
bn_mul4x_mont:
	mov	${num}d,${num}d
	mov	%rsp,%rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
@@ -347,23 +358,29 @@ $code.=<<___;
	push	%r14
	push	%r15

	mov	${num}d,${num}d
	lea	4($num),%r10
	neg	$num
	mov	%rsp,%r11
	neg	%r10
	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+4))
	and	\$-1024,%rsp		# minimize TLB usage
	lea	-32(%rsp,$num,8),%r10	# future alloca(8*(num+4))
	neg	$num			# restore
	and	\$-1024,%r10		# minimize TLB usage

	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul4x_body:
	sub	%rsp,%r11
	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	mov	(%rsp,%r11),%r10
	sub	\$4096,%r11
	.byte	0x2e			# predict non-taken
	jnc	.Lmul4x_page_walk
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul4x_body:
	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
	mov	%rdx,%r12		# reassign $bp
___
@@ -742,13 +759,13 @@ ___
$code.=<<___;
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lmul4x_epilogue:
	ret
.size	bn_mul4x_mont,.-bn_mul4x_mont
@@ -778,14 +795,15 @@ $code.=<<___;
.type	bn_sqr8x_mont,\@function,6
.align	32
bn_sqr8x_mont:
.Lsqr8x_enter:
	mov	%rsp,%rax
.Lsqr8x_enter:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
.Lsqr8x_prologue:

	mov	${num}d,%r10d
	shl	\$3,${num}d		# convert $num to bytes
@@ -798,33 +816,42 @@ bn_sqr8x_mont:
	# do its job.
	#
	lea	-64(%rsp,$num,2),%r11
	mov	%rsp,%rbp
	mov	($n0),$n0		# *n0
	sub	$aptr,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lsqr8x_sp_alt
	sub	%r11,%rsp		# align with $aptr
	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
	sub	%r11,%rbp		# align with $aptr
	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rsp
	sub	%r11,%rbp
.Lsqr8x_sp_done:
	and	\$-64,%rsp
	mov	%rax,%r11
	sub	%rsp,%r11
	and	\$-64,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align	16
.Lsqr8x_page_walk:
	mov	(%rsp,%r11),%r10
	sub	\$4096,%r11
	.byte	0x2e			# predict non-taken
	jnc	.Lsqr8x_page_walk
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	mov	$num,%r10
	neg	$num
@@ -948,30 +975,38 @@ $code.=<<___;
.type	bn_mulx4x_mont,\@function,6
.align	32
bn_mulx4x_mont:
.Lmulx4x_enter:
	mov	%rsp,%rax
.Lmulx4x_enter:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
.Lmulx4x_prologue:

	shl	\$3,${num}d		# convert $num to bytes
	.byte	0x67
	xor	%r10,%r10
	sub	$num,%r10		# -$num
	mov	($n0),$n0		# *n0
	lea	-72(%rsp,%r10),%rsp	# alloca(frame+$num+8)
	and	\$-128,%rsp
	mov	%rax,%r11
	sub	%rsp,%r11
	lea	-72(%rsp,%r10),%rbp	# future alloca(frame+$num+8)
	and	\$-128,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.align	16
.Lmulx4x_page_walk:
	mov	(%rsp,%r11),%r10
	sub	\$4096,%r11
	.byte	0x66,0x2e		# predict non-taken
	jnc	.Lmulx4x_page_walk
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	lea	($bp,$num),%r10
	##############################################################
@@ -1332,22 +1367,8 @@ mul_handler:

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	jmp	.Lcommon_seh_tail
	jmp	.Lcommon_pop_regs
.size	mul_handler,.-mul_handler

.type	sqr_handler,\@abi-omnipotent
@@ -1375,15 +1396,21 @@ sqr_handler:
	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
	jb	.Lcommon_seh_tail

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# body label
	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
	jb	.Lcommon_pop_regs

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
	jae	.Lcommon_seh_tail

	mov	40(%rax),%rax		# pull saved stack pointer

.Lcommon_pop_regs:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
@@ -1470,13 +1497,15 @@ $code.=<<___;
.LSEH_info_bn_sqr8x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lsqr8x_body,.Lsqr8x_epilogue	# HandlerData[]
	.rva	.Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue		# HandlerData[]
.align	8
___
$code.=<<___ if ($addx);
.LSEH_info_bn_mulx4x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
	.rva	.Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
.align	8
___
}

+143 −84
Original line number Diff line number Diff line
@@ -86,6 +86,8 @@ $code=<<___;
.type	bn_mul_mont_gather5,\@function,6
.align	64
bn_mul_mont_gather5:
	mov	${num}d,${num}d
	mov	%rsp,%rax
	test	\$7,${num}d
	jnz	.Lmul_enter
___
@@ -97,10 +99,7 @@ $code.=<<___;

.align	16
.Lmul_enter:
	mov	${num}d,${num}d
	mov	%rsp,%rax
	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
	lea	.Linc(%rip),%r10
	push	%rbx
	push	%rbp
	push	%r12
@@ -108,26 +107,36 @@ $code.=<<___;
	push	%r14
	push	%r15

	lea	2($num),%r11
	neg	%r11
	lea	-264(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2)+256+8)
	and	\$-1024,%rsp		# minimize TLB usage
	neg	$num
	mov	%rsp,%r11
	lea	-280(%rsp,$num,8),%r10	# future alloca(8*(num+2)+256+8)
	neg	$num			# restore $num
	and	\$-1024,%r10		# minimize TLB usage

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	# Some OSes, *cough*-dows, insist on stack being "wired" to
	# physical memory in strictly sequential manner, i.e. if stack
	# allocation spans two pages, then reference to farmost one can
	# be punishable by SEGV. But page walking can do good even on
	# other OSes, because it guarantees that villain thread hits
	# the guard page before it can make damage to innocent one...
	sub	%rsp,%rax
	and	\$-4096,%rax
	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.Lmul_page_walk:
	mov	(%rsp,%rax),%r11
	sub	\$4096,%rax
	.byte	0x2e			# predict non-taken
	jnc	.Lmul_page_walk
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	lea	.Linc(%rip),%r10
	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:

	lea	128($bp),%r12		# reassign $bp (+size optimization)
___
@@ -433,6 +442,8 @@ $code.=<<___;
.type	bn_mul4x_mont_gather5,\@function,6
.align	32
bn_mul4x_mont_gather5:
	.byte	0x67
	mov	%rsp,%rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
@@ -441,14 +452,13 @@ $code.=<<___ if ($addx);
	je	.Lmulx4x_enter
___
$code.=<<___;
	.byte	0x67
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
.Lmul4x_prologue:

	.byte	0x67
	shl	\$3,${num}d		# convert $num to bytes
@@ -465,32 +475,40 @@ $code.=<<___;
	# calculated from 7th argument, the index.]
	#
	lea	-320(%rsp,$num,2),%r11
	mov	%rsp,%rbp
	sub	$rp,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lmul4xsp_alt
	sub	%r11,%rsp		# align with $rp
	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*num*8+256)
	sub	%r11,%rbp		# align with $rp
	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
	jmp	.Lmul4xsp_done

.align	32
.Lmul4xsp_alt:
	lea	4096-320(,$num,2),%r10
	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*num*8+256)
	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rsp
	sub	%r11,%rbp
.Lmul4xsp_done:
	and	\$-64,%rsp
	mov	%rax,%r11
	sub	%rsp,%r11
	and	\$-64,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	mov	(%rsp,%r11),%r10
	sub	\$4096,%r11
	.byte	0x2e			# predict non-taken
	jnc	.Lmul4x_page_walk
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	neg	$num

@@ -1034,6 +1052,7 @@ $code.=<<___;
.type	bn_power5,\@function,6
.align	32
bn_power5:
	mov	%rsp,%rax
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
@@ -1042,13 +1061,13 @@ $code.=<<___ if ($addx);
	je	.Lpowerx5_enter
___
$code.=<<___;
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
.Lpower5_prologue:

	shl	\$3,${num}d		# convert $num to bytes
	lea	($num,$num,2),%r10d	# 3*$num
@@ -1063,32 +1082,40 @@ $code.=<<___;
	# calculated from 7th argument, the index.]
	#
	lea	-320(%rsp,$num,2),%r11
	mov	%rsp,%rbp
	sub	$rptr,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lpwr_sp_alt
	sub	%r11,%rsp		# align with $aptr
	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*num*8+256)
	sub	%r11,%rbp		# align with $aptr
	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
	jmp	.Lpwr_sp_done

.align	32
.Lpwr_sp_alt:
	lea	4096-320(,$num,2),%r10
	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*num*8+256)
	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rsp
	sub	%r11,%rbp
.Lpwr_sp_done:
	and	\$-64,%rsp
	mov	%rax,%r11
	sub	%rsp,%r11
	and	\$-64,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lpwr_page_walk
	jmp	.Lpwr_page_walk_done

.Lpwr_page_walk:
	mov	(%rsp,%r11),%r10
	sub	\$4096,%r11
	.byte	0x2e			# predict non-taken
	jnc	.Lpwr_page_walk
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lpwr_page_walk
.Lpwr_page_walk_done:

	mov	$num,%r10	
	neg	$num
@@ -2028,6 +2055,7 @@ bn_from_mont8x:
	push	%r13
	push	%r14
	push	%r15
.Lfrom_prologue:

	shl	\$3,${num}d		# convert $num to bytes
	lea	($num,$num,2),%r10	# 3*$num in bytes
@@ -2042,32 +2070,40 @@ bn_from_mont8x:
	# last operation, we use the opportunity to cleanse it.
	#
	lea	-320(%rsp,$num,2),%r11
	mov	%rsp,%rbp
	sub	$rptr,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lfrom_sp_alt
	sub	%r11,%rsp		# align with $aptr
	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
	sub	%r11,%rbp		# align with $aptr
	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	lea	4096-320(,$num,2),%r10
	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rsp
	sub	%r11,%rbp
.Lfrom_sp_done:
	and	\$-64,%rsp
	mov	%rax,%r11
	sub	%rsp,%r11
	and	\$-64,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lfrom_page_walk
	jmp	.Lfrom_page_walk_done

.Lfrom_page_walk:
	mov	(%rsp,%r11),%r10
	sub	\$4096,%r11
	.byte	0x2e			# predict non-taken
	jnc	.Lfrom_page_walk
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lfrom_page_walk
.Lfrom_page_walk_done:

	mov	$num,%r10
	neg	$num
@@ -2173,14 +2209,15 @@ $code.=<<___;
.type	bn_mulx4x_mont_gather5,\@function,6
.align	32
bn_mulx4x_mont_gather5:
.Lmulx4x_enter:
	mov	%rsp,%rax
.Lmulx4x_enter:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
.Lmulx4x_prologue:

	shl	\$3,${num}d		# convert $num to bytes
	lea	($num,$num,2),%r10	# 3*$num in bytes
@@ -2197,31 +2234,39 @@ bn_mulx4x_mont_gather5:
	# calculated from 7th argument, the index.]
	#
	lea	-320(%rsp,$num,2),%r11
	mov	%rsp,%rbp
	sub	$rp,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lmulx4xsp_alt
	sub	%r11,%rsp		# align with $aptr
	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
	sub	%r11,%rbp		# align with $aptr
	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
	jmp	.Lmulx4xsp_done

.Lmulx4xsp_alt:
	lea	4096-320(,$num,2),%r10
	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rsp
	sub	%r11,%rbp
.Lmulx4xsp_done:	
	and	\$-64,%rsp		# ensure alignment
	mov	%rax,%r11
	sub	%rsp,%r11
	and	\$-64,%rbp		# ensure alignment
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.Lmulx4x_page_walk:
	mov	(%rsp,%r11),%r10
	sub	\$4096,%r11
	.byte	0x2e			# predict non-taken
	jnc	.Lmulx4x_page_walk
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	##############################################################
	# Stack layout
@@ -2629,14 +2674,15 @@ $code.=<<___;
.type	bn_powerx5,\@function,6
.align	32
bn_powerx5:
.Lpowerx5_enter:
	mov	%rsp,%rax
.Lpowerx5_enter:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
.Lpowerx5_prologue:

	shl	\$3,${num}d		# convert $num to bytes
	lea	($num,$num,2),%r10	# 3*$num in bytes
@@ -2651,32 +2697,40 @@ bn_powerx5:
	# calculated from 7th argument, the index.]
	#
	lea	-320(%rsp,$num,2),%r11
	mov	%rsp,%rbp
	sub	$rptr,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lpwrx_sp_alt
	sub	%r11,%rsp		# align with $aptr
	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
	sub	%r11,%rbp		# align with $aptr
	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
	jmp	.Lpwrx_sp_done

.align	32
.Lpwrx_sp_alt:
	lea	4096-320(,$num,2),%r10
	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
	lea	-320(%rbp,$num,2),%rbp	# alloca(frame+2*$num*8+256)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rsp
	sub	%r11,%rbp
.Lpwrx_sp_done:
	and	\$-64,%rsp
	mov	%rax,%r11
	sub	%rsp,%r11
	and	\$-64,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lpwrx_page_walk
	jmp	.Lpwrx_page_walk_done

.Lpwrx_page_walk:
	mov	(%rsp,%r11),%r10
	sub	\$4096,%r11
	.byte	0x2e			# predict non-taken
	jnc	.Lpwrx_page_walk
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lpwrx_page_walk
.Lpwrx_page_walk_done:

	mov	$num,%r10	
	neg	$num
@@ -3607,9 +3661,14 @@ mul_handler:
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jb	.Lcommon_pop_regs

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail
@@ -3621,11 +3680,11 @@ mul_handler:
	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer

	jmp	.Lbody_proceed
	jmp	.Lcommon_pop_regs

.Lbody_40:
	mov	40(%rax),%rax		# pull saved stack pointer
.Lbody_proceed:
.Lcommon_pop_regs:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
@@ -3716,34 +3775,34 @@ $code.=<<___;
.LSEH_info_bn_mul_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
	.rva	.Lmul_body,.Lmul_body,.Lmul_epilogue		# HandlerData[]
.align	8
.LSEH_info_bn_mul4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_body,.Lmul4x_epilogue		# HandlerData[]
	.rva	.Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue		# HandlerData[]
.align	8
.LSEH_info_bn_power5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lpower5_body,.Lpower5_epilogue		# HandlerData[]
	.rva	.Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_from_mont8x:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lfrom_body,.Lfrom_epilogue		# HandlerData[]
	.rva	.Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue		# HandlerData[]
___
$code.=<<___ if ($addx);
.align	8
.LSEH_info_bn_mulx4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmulx4x_body,.Lmulx4x_epilogue		# HandlerData[]
	.rva	.Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_powerx5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lpowerx5_body,.Lpowerx5_epilogue	# HandlerData[]
	.rva	.Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue	# HandlerData[]
___
$code.=<<___;
.align	8