Commit 4eeb750d authored by Andy Polyakov
Browse files

bn/asm/x86_64-mont.pl: minor optimization [for Decoded ICache].

parent ed77017b
Loading
Loading
Loading
Loading
+151 −138
Columns: original line number | diff line number | diff line content
@@ -1685,6 +1685,7 @@ bn_mulx4x_mont:
	push	%r15

	shl	\$3,${num}d		# convert $num to bytes
	.byte	0x67
	xor	%r10,%r10
	mov	%rsp,%r11		# put aside %rsp
	sub	$num,%r10		# -$num
@@ -1725,15 +1726,14 @@ $code.=<<___;
	mov	($bp),%rdx		# b[0], $bp==%rdx actually
	lea	64+32(%rsp),$tptr
	mov	%rdx,$bi
	xor	$zero,$zero		# of=0,cf=0

	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
	mulx	1*8($aptr),%r11,%r14	# a[1]*b[0]
	adcx	%rax,%r11
	add	%rax,%r11
	mov	$bptr,8(%rsp)		# off-load &b[i]
	mulx	2*8($aptr),%r12,%r13	# ...
	adcx	%r14,%r12
	adcx	$zero,%r13
	adc	%r14,%r12
	adc	\$0,%r13

	mov	$mi,$bptr		# borrow $bptr
	imulq	24(%rsp),$mi		# "t[0]"*n0
@@ -1751,13 +1751,12 @@ $code.=<<___;
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	mulx	2*8($nptr),%rax,%r12
	.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	# mulx	2*8($nptr),%rax,%r12
	mov	48(%rsp),$bptr		# counter value
	mov	%r10,-4*8($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	 .byte	0x66,0x66
	 mov	$bi,%rdx
	mov	%r11,-3*8($tptr)
	adcx	%rax,%r12
@@ -1765,7 +1764,7 @@ $code.=<<___;
	lea	4*8($nptr),$nptr
	mov	%r12,-2*8($tptr)

	#jmp	.Lmulx4x_1st
	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
@@ -1863,7 +1862,6 @@ $code.=<<___;
	adox	%r12,%r11
	mulx	2*8($nptr),%rax,%r12
	mov	%r10,-4*8($tptr)
	mov	0*8($tptr),%r10
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
@@ -1872,23 +1870,22 @@ $code.=<<___;
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	mov	48(%rsp),$bptr		# counter value
	.byte	0x66,0x3e
	mov	%r12,-2*8($tptr)
	.byte	0x66
	lea	4*8($nptr),$nptr

	jmp	.Lmulx4x_inner
	#jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	adox	%r10,%r14
	adox	0*8($tptr),%r14
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
	mov	1*8($tptr),%r13
	adcx	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
	adox	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	%r13,%r11
	adcx	1*8($tptr),%r11
	adox	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	 mov	$mi,%rdx
@@ -1896,8 +1893,8 @@ $code.=<<___;
	adox	%rax,%r13
	adcx	3*8($tptr),%r13
	adox	$zero,%r14		# of=0
	.byte	0x48,0x8d,0xb6,0x20,0x00,0x00,0x00	# lea	4*8($aptr),$aptr
	.byte	0x48,0x8d,0x9b,0x20,0x00,0x00,0x00	# lea	4*8($tptr),$tptr
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr
	adcx	$zero,%r14		# cf=0

	adox	%r15,%r10
@@ -1909,7 +1906,6 @@ $code.=<<___;
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	mov	0*8($tptr),%r10
	adcx	%rax,%r12
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
@@ -1927,7 +1923,7 @@ $code.=<<___;
	mov	0(%rsp),$num		# load num
	mov	8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,%r15		# modulo-scheduled
	sub	%r10,$zero		# pull top-most carry
	sub	0*8($tptr),$zero	# pull top-most carry
	adc	%r15,%r14
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)
@@ -1936,10 +1932,10 @@ $code.=<<___;
	jne	.Lmulx4x_outer

	neg	$num
	xor	%rdx,%rdx
	mov	32(%rsp),$rptr		# restore rp
	lea	64(%rsp),$tptr

	xor	%rdx,%rdx
	pxor	%xmm0,%xmm0
	mov	0*8($nptr,$num),%r8
	mov	1*8($nptr,$num),%r9
@@ -2022,6 +2018,7 @@ bn_sqrx8x_mont:
	push	%r15

	shl	\$3,${num}d		# convert $num to bytes
	.byte	0x67
	xor	%r10,%r10
	mov	%rsp,%r11		# put aside %rsp
	sub	$num,%r10		# -$num
@@ -2043,6 +2040,12 @@ bn_sqrx8x_mont:
	movq	%r10, %xmm3		# -$num
	movq	%r11, %xmm4		# save original %rsp
	mov	$n0,  32(%rsp)
___
$code.=<<___ if ($win64);
	jmp	.Lsqrx8x_body
.align	32
___
$code.=<<___;
.Lsqrx8x_body:
	##################################################################
	# Squaring part:
@@ -2096,12 +2099,15 @@ $code.=<<___;
	mov	$aaptr,8(%rsp)		# save end of $aptr
	jmp	.Lsqr8x_zero_start

.align	32
.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
.Lsqrx8x_zero:
	.byte	0x3e
	movdqa	%xmm0,0*8($tptr)
	movdqa	%xmm0,2*8($tptr)
	movdqa	%xmm0,4*8($tptr)
	movdqa	%xmm0,6*8($tptr)
.Lsqr8x_zero_start:
.Lsqr8x_zero_start:			# aligned at 32
	movdqa	%xmm0,8*8($tptr)
	movdqa	%xmm0,10*8($tptr)
	movdqa	%xmm0,12*8($tptr)
@@ -2111,47 +2117,47 @@ $code.=<<___;
	jnz	.Lsqrx8x_zero

	mov	0*8($aptr),%rdx		# a[0], modulo-scheduled
	xor	%r8,%r8
	xor	%r9,%r9
	#xor	%r9,%r9			# t[1], ex-$num, zero already
	xor	%r10,%r10
	xor	%r11,%r11
	xor	%r12,%r12
	xor	%r13,%r13
	xor	%r14,%r14
	xor	%r15,%r15
	lea	48(%rsp),$tptr
	xor	$zero,$zero		# cf=0, cf=0
	jmp	.Lsqrx8x_outer_loop

.align	32
.Lsqrx8x_outer_loop:
	mulx	1*8($aptr),%rax,%rbx	# a[1]*a[0]
	adcx	%rax,%r8		# a[1]*a[0]+=t[1]
	adox	%rbx,%r9
	mulx	2*8($aptr),%rax,%rbx	# a[2]*a[0]
	adcx	%rax,%r9
	adox	%rbx,%r10
	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00	# mulx	3*8($aptr),%rax,%rbx	# ...
	adcx	%rax,%r10
	adox	%rbx,%r11
	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00	# mulx	4*8($aptr),%rax,%rbx
	adcx	%rax,%r11
	adox	%rbx,%r12
	mulx	5*8($aptr),%rax,%rbx
	adcx	%rax,%r12
	adox	%rbx,%r13
	mulx	6*8($aptr),%rax,%rbx
	adcx	%rax,%r13
	adox	%rbx,%r14
	mulx	7*8($aptr),%rax,%r15
	mulx	1*8($aptr),%r8,%rax	# a[1]*a[0]
	adcx	%r9,%r8			# a[1]*a[0]+=t[1]
	adox	%rax,%r10
	mulx	2*8($aptr),%r9,%rax	# a[2]*a[0]
	adcx	%r10,%r9
	adox	%rax,%r11
	.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx	3*8($aptr),%r10,%rax	# ...
	adcx	%r11,%r10
	adox	%rax,%r12
	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00	# mulx	4*8($aptr),%r11,%rax
	adcx	%r12,%r11
	adox	%rax,%r13
	mulx	5*8($aptr),%r12,%rax
	adcx	%r13,%r12
	adox	%rax,%r14
	mulx	6*8($aptr),%r13,%rax
	adcx	%r14,%r13
	adox	%r15,%rax
	mulx	7*8($aptr),%r14,%r15
	 mov	1*8($aptr),%rdx		# a[1]
	adcx	%rax,%r14
	adox	$zero,%r15
	adc	8*8($tptr),%r15
	mov	%r8,1*8($tptr)		# t[1]
	mov	%r9,2*8($tptr)		# t[2]
	sbb	$carry,$carry		# mov %cf,$carry
	xor	$zero,$zero		# cf=0, of=0

	mov	%r8,1*8($tptr)		# t[1]
	mov	%r9,2*8($tptr)		# t[2]

	mulx	2*8($aptr),%r8,%rbx	# a[2]*a[1]
	mulx	3*8($aptr),%r9,%rax	# a[3]*a[1]
@@ -2193,13 +2199,12 @@ $code.=<<___;
	adcx	%rbx,%r11
	adox	%rax,%r12
	adcx	%r14,%r12
	adox	$zero,%r13		# of=0
	adcx	$zero,%r13		# cf=0

	mov	%r8,5*8($tptr)		# t[5]
	mov	%r9,6*8($tptr)		# t[6]

	 mulx	4*8($aptr),%r8,%rax	# a[4]*a[3]
	adox	$zero,%r13		# of=0
	adcx	$zero,%r13		# cf=0

	mulx	5*8($aptr),%r9,%rbx	# a[5]*a[3]
	adcx	%r10,%r8
	adox	%rax,%r9
@@ -2239,9 +2244,9 @@ $code.=<<___;
	adcx	%r14,%r11
	adox	%rbx,%r12
	adcx	%rax,%r12
	.byte	0x66,0x66
	adox	$zero,%r13

	.byte	0x67,0x67
	mulx	%r8,%r8,%r14		# a[7]*a[6]
	adcx	%r8,%r13
	adcx	$zero,%r14
@@ -2250,26 +2255,26 @@ $code.=<<___;
	je	.Lsqrx8x_outer_break

	neg	$carry			# mov $carry,%cf
	mov	\$-8,%rcx
	mov	$zero,%r15
	mov	8*8($tptr),%r8
	adc	9*8($tptr),%r9		# +=t[9]
	adc	10*8($tptr),%r10	# ...
	adc	11*8($tptr),%r11
	adcx	9*8($tptr),%r9		# +=t[9]
	adcx	10*8($tptr),%r10	# ...
	adcx	11*8($tptr),%r11
	adc	12*8($tptr),%r12
	adc	13*8($tptr),%r13
	adc	14*8($tptr),%r14
	adc	15*8($tptr),%r15
	lea	8*8($tptr),$tptr
	sbb	$carry,$carry		# mov %cf,$carry
	lea	($aptr),$aaptr
	lea	2*8*8($tptr),$tptr
	sbb	%rax,%rax		# mov %cf,$carry

	mov	-64($aptr),%rdx		# a[0]
	lea	($aptr),$aaptr
	mov	$carry,16(%rsp)		# offload $carry
	mov	%rax,16(%rsp)		# offload $carry
	mov	$tptr,24(%rsp)

	lea	8*8($tptr),$tptr
	#lea	8*8($tptr),$tptr	# see 2*8*8($tptr) above
	xor	%eax,%eax		# cf=0, of=0
	mov	\$-8,%rcx
	jmp	.Lsqrx8x_loop

.align	32
@@ -2311,17 +2316,20 @@ $code.=<<___;
	adox	%rbx,%r15		# %rbx is 0, of=0
	adcx	%rbx,%r15		# cf=0

	.byte	0x67
	inc	%rcx			# of=0
	jnz	.Lsqrx8x_loop

	lea	8*8($aaptr),$aaptr
	mov	\$-8,%rcx
	cmp	8(%rsp),$aaptr		# done?
	je	.Lsqrx8x_break

	sub	16(%rsp),%rbx		# mov 16(%rsp),%cf
	.byte	0x66
	mov	-64($aptr),%rdx
	adc	0*8($tptr),%r8
	adc	1*8($tptr),%r9
	adcx	0*8($tptr),%r8
	adcx	1*8($tptr),%r9
	adc	2*8($tptr),%r10
	adc	3*8($tptr),%r11
	adc	4*8($tptr),%r12
@@ -2329,35 +2337,37 @@ $code.=<<___;
	adc	6*8($tptr),%r14
	adc	7*8($tptr),%r15
	lea	8*8($tptr),$tptr
	sbb	%rbx,%rbx		# mov %cf,%rbx
	xor	%eax,%eax		# cf=0, of=0
	mov	%rbx,16(%rsp)		# offload carry
	mov	\$-8,%rcx
	.byte	0x67
	sbb	%rax,%rax		# mov %cf,%rax
	xor	%ebx,%ebx		# cf=0, of=0
	mov	%rax,16(%rsp)		# offload carry
	jmp	.Lsqrx8x_loop

.align	32
.Lsqrx8x_break:
	sub	16(%rsp),%r8		# consume last carry
	mov	24(%rsp),$aaptr		# initial $tptr
	mov	24(%rsp),$carry		# initial $tptr, borrow $carry
	mov	0*8($aptr),%rdx		# a[8], modulo-scheduled
	xor	%ebp,%ebp		# xor	$zero,$zero
	mov	%r8,0*8($tptr)
	lea	8*8($aaptr),$aaptr
	cmp	$carry,$tptr		# cf=0, of=0
	je	.Lsqrx8x_outer_loop

	mov	%r9,1*8($tptr)
	 mov	1*8($aaptr),%r8		# potentially forwarded store
	 mov	1*8($carry),%r9
	mov	%r10,2*8($tptr)
	 mov	2*8($aaptr),%r9		# ...
	 mov	2*8($carry),%r10
	mov	%r11,3*8($tptr)
	 mov	3*8($aaptr),%r10
	 mov	3*8($carry),%r11
	mov	%r12,4*8($tptr)
	 mov	4*8($aaptr),%r11
	 mov	4*8($carry),%r12
	mov	%r13,5*8($tptr)
	 mov	5*8($aaptr),%r12
	 mov	5*8($carry),%r13
	mov	%r14,6*8($tptr)
	 mov	6*8($aaptr),%r13
	 mov	6*8($carry),%r14
	mov	%r15,7*8($tptr)
	 mov	7*8($aaptr),%r14
	mov	$aaptr,$tptr
	xor	$zero,$zero		# cf=0, cf=0
	 mov	7*8($carry),%r15
	mov	$carry,$tptr
	jmp	.Lsqrx8x_outer_loop

.align	32
@@ -2373,13 +2383,12 @@ ___
}{
my $i="%rcx";
$code.=<<___;
	mov	(%rsp),$num		# restore $num

	lea	48(%rsp),$tptr
	mov	($aptr,$i),%rdx		# a[0]

	mov	8($tptr),$A0[1]		# t[1]
	xor	$A0[0],$A0[0]		# t[0], of=0, cf=0
	mov	(%rsp),$num		# restore $num
	adox	$A0[1],$A0[1]
	 mov	16($tptr),$A1[0]	# t[2]	# prefetch
	 mov	24($tptr),$A1[1]	# t[3]	# prefetch
@@ -2440,9 +2449,9 @@ $code.=<<___;
.align	32
.Lsqrx4x_shift_n_add_break:
	adcx	$A1[1],%rbx
	.byte	0x48,0x89,0x87,0x30,0x00,0x00,0x00	# mov	%rax,48($tptr)
	.byte	0x48,0x89,0x9f,0x38,0x00,0x00,0x00	# mov	%rbx,56($tptr)
	.byte	0x48,0x8d,0xbf,0x40,0x00,0x00,0x00	# lea	64($tptr),$tptr
	mov	%rax,48($tptr)
	mov	%rbx,56($tptr)
	lea	64($tptr),$tptr		# end of t[] buffer
___
}
######################################################################
@@ -2456,17 +2465,16 @@ my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");

$code.=<<___;
	movq	%xmm2,$nptr
	xor	%eax,%eax		# initial top-most carry bit
	mov	32(%rsp),%rbx		# n0
	mov	48(%rsp),%rdx		# "%r8", 8*0($tptr)
	lea	($nptr,$num),%rax	# end of n[]
	lea	-64($nptr,$num),%rcx	# end of n[]
	#lea	48(%rsp,$num,2),$tptr	# end of t[] buffer
	mov	%rax, 0(%rsp)		# save end of n[]
	mov	%rcx, 0(%rsp)		# save end of n[]
	mov	$tptr,8(%rsp)		# save end of t[]

	lea	48(%rsp),$tptr		# initial t[] window
	xor	%rax,%rax
	nop
	#jmp	.Lsqrx8x_reduction_loop
	jmp	.Lsqrx8x_reduction_loop

.align	32
.Lsqrx8x_reduction_loop:
@@ -2529,29 +2537,31 @@ $code.=<<___;
	adox	$carry,%r15		# $carry is 0
	adcx	$carry,%r15		# cf=0

	.byte	0x67
	inc	%rcx			# of=0
	jnz	.Lsqrx8x_reduce

	lea	8*8($nptr),$nptr
	xor	%rax,%rax
	.byte	0x66,0x67
	mov	$carry,%rax		# xor	%rax,%rax
	cmp	0(%rsp),$nptr		# end of n[]?
	jae	.Lsqrx8x_no_tail

	mov	48(%rsp),%rdx		# pull n0*a[0]
	add	8*0($tptr),%r8
	adcx	8*1($tptr),%r9
	adcx	8*2($tptr),%r10
	adcx	8*3($tptr),%r11
	adcx	8*4($tptr),%r12
	adcx	8*5($tptr),%r13
	adcx	8*6($tptr),%r14
	adcx	8*7($tptr),%r15
	lea	8*8($nptr),$nptr
	mov	\$-8,%rcx
	adc	8*1($tptr),%r9
	adc	8*2($tptr),%r10
	adc	8*3($tptr),%r11
	adc	8*4($tptr),%r12
	adc	8*5($tptr),%r13
	adc	8*6($tptr),%r14
	adc	8*7($tptr),%r15
	lea	8*8($tptr),$tptr
	sbb	$carry,$carry		# top carry
	sbb	%rax,%rax		# top carry

	mov	\$-8,%rcx
	mov	$carry,16(%rsp)
	xor	$carry,$carry		# of=0, cf=0
	mov	%rax,16(%rsp)
	jmp	.Lsqrx8x_tail

.align	32
@@ -2588,7 +2598,7 @@ $code.=<<___;
	mulx	8*7($nptr),%rax,%r15
	 mov	48+72(%rsp,%rcx,8),%rdx	# pull n0*a[i]
	adcx	%rax,%r14
	.byte	0x66
	.byte	0x67
	adox	$carry,%r15
	 mov	%rbx,($tptr,%rcx,8)	# save result
	 mov	%r8,%rbx
@@ -2597,35 +2607,35 @@ $code.=<<___;
	inc	%rcx			# of=0
	jnz	.Lsqrx8x_tail

	lea	8*8($nptr),$nptr
	cmp	0(%rsp),$nptr		# end of n[]?
	jae	.Lsqrx8x_tail_done	# break out of loop

	sub	16(%rsp),$carry		# neg	$carry
	sub	16(%rsp),$carry		# mov 16(%rsp),%cf
	 mov	48(%rsp),%rdx		# pull n0*a[0]
	adcx	8*0($tptr),%r8
	adcx	8*1($tptr),%r9
	adcx	8*2($tptr),%r10
	adcx	8*3($tptr),%r11
	adcx	8*4($tptr),%r12
	adcx	8*5($tptr),%r13
	adcx	8*6($tptr),%r14
	adcx	8*7($tptr),%r15
	 lea	8*8($nptr),$nptr
	adc	8*0($tptr),%r8
	adc	8*1($tptr),%r9
	adc	8*2($tptr),%r10
	adc	8*3($tptr),%r11
	adc	8*4($tptr),%r12
	adc	8*5($tptr),%r13
	adc	8*6($tptr),%r14
	adc	8*7($tptr),%r15
	lea	8*8($tptr),$tptr
	sbb	$carry,$carry

	mov	\$-8,%rcx
	mov	$carry,16(%rsp)
	sbb	%rax,%rax

	xor	$carry,$carry		# of=0, cf=0
	mov	%rax,16(%rsp)
	jmp	.Lsqrx8x_tail

.align	32
.Lsqrx8x_tail_done:
	add	24(%rsp),%r8		# can this overflow?
	xor	%rax,%rax
	mov	$carry,%rax		# xor	%rax,%rax

	sub	16(%rsp),$carry		# neg $carry
.Lsqrx8x_no_tail:			# carry flag is 0
	sub	16(%rsp),$carry		# mov 16(%rsp),%cf
.Lsqrx8x_no_tail:			# %cf is 0 if jumped here
	adc	8*0($tptr),%r8
	 movq	%xmm3,%rcx
	adc	8*1($tptr),%r9
@@ -2639,24 +2649,24 @@ $code.=<<___;
	adc	8*7($tptr),%r15
	adc	%rax,%rax		# top-most carry

	cmp	8(%rsp),$carry		# end of t[]?
	mov	32(%rsp),%rbx		# n0
	mov	8*8($tptr,%rcx),%rdx	# modulo-scheduled "%r8"

	lea	8*8($tptr,%rcx),$tptr	# start of current t[] window
	mov	%r8,-8*8($carry)	# store top 512 bits
	mov	%r9,-8*7($carry)
	mov	%r10,-8*6($carry)
	mov	%r11,-8*5($carry)
	mov	%r12,-8*4($carry)
	mov	%r13,-8*3($carry)
	mov	%r14,-8*2($carry)
	mov	%r15,-8*1($carry)
	mov	%r8,8*0($tptr)		# store top 512 bits
	mov	%r9,8*1($tptr)
	mov	%r10,8*2($tptr)
	mov	%r11,8*3($tptr)
	mov	%r12,8*4($tptr)
	mov	%r13,8*5($tptr)
	mov	%r14,8*6($tptr)
	mov	%r15,8*7($tptr)

	lea	8*8($tptr,%rcx),$tptr	# start of current t[] window
	cmp	8(%rsp),$carry		# end of t[]?
	jb	.Lsqrx8x_reduction_loop

	mov	%rcx,$num
	neg	$num			# restore $num
	mov	%rcx,%rdx		# -$num
	jmp	.Lsqrx8x_post
___
}
##############################################################
@@ -2667,24 +2677,28 @@ my ($rptr,$nptr,$lptr,$i)=($aptr,"%rbp","%rbx","%rcx");
my @ri=map("%r$_",(10..13));
my @ni=map("%r$_",(14..15));
$code.=<<___;
	lea	($nptr,$num),$nptr	# end of $nptr
	lea	48(%rsp,$num),$lptr	# end of lower half of t[2*num]
	lea	48(%rsp,$num),$tptr
.align	32
.Lsqrx8x_post:
	neg	%rdx			# restore $num
	neg	%rax			# top-most carry as mask
	mov	0*8($nptr),%r8
	mov	1*8($nptr),%r9
	lea	($nptr,%rdx),$nptr	# end of $nptr
	lea	48(%rsp,%rdx),$lptr	# end of lower half of t[2*num]
	lea	48(%rsp,%rdx),$tptr
	.byte	0x67
	xor	%rdx,%rdx
	movq	%xmm1,$rptr		# restore $rptr

	mov	0*8($nptr,$i),%r8
	mov	1*8($nptr,$i),%r9
	neg	%r8
	jmp	.Lsqrx8x_sub_entry

.align	32
.byte	0x66,0x66,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
.Lsqrx8x_sub:
	mov	0*8($nptr,$i),%r8
	mov	1*8($nptr,$i),%r9
	not	%r8
.Lsqrx8x_sub_entry:
.Lsqrx8x_sub_entry:			# aligned at 32
	mov	2*8($nptr,$i),%r10
	not	%r9
	and	%rax,%r8
@@ -2709,28 +2723,27 @@ $code.=<<___;
	movdqa	%xmm0,2*8($lptr,$i)
	and	%rax,%r15

	neg	%rdx			# mov %rdx,%cf
	neg	%edx			# mov %edx,%cf
	movdqa	%xmm0,4*8($lptr,$i)
	adc	0*8($tptr),%r8
	mov	%r8,0*8($rptr)		# result
	adc	1*8($tptr),%r9
	movdqa	%xmm0,6*8($lptr,$i)
	adc	2*8($tptr),%r10
	mov	%r9,1*8($rptr)
	adc	3*8($tptr),%r11
	movdqa	%xmm0,0*8($tptr)	# zap upper half
	adc	4*8($tptr),%r12
	mov	%r10,2*8($rptr)
	adc	5*8($tptr),%r13
	movdqa	%xmm0,2*8($tptr)
	adc	6*8($tptr),%r14
	mov	%r11,3*8($rptr)
	adc	7*8($tptr),%r15
	sbb	%edx,%edx		# mov %cf,%edx
	movdqa	%xmm0,4*8($tptr)
	sbb	%rdx,%rdx		# mov %cf,%rdx
	movdqa	%xmm0,6*8($tptr)
	lea	8*8($tptr),$tptr

	mov	%r8,0*8($rptr)
	mov	%r9,1*8($rptr)
	mov	%r10,2*8($rptr)
	mov	%r11,3*8($rptr)
	mov	%r12,4*8($rptr)
	mov	%r13,5*8($rptr)
	mov	%r14,6*8($rptr)