Commit 8fc8f486 authored by Andy Polyakov, committed by Matt Caswell
Browse files

crypto/bn/x86_64-mont5.pl: constant-time gather procedure.



At the same time remove minuscule bias in final subtraction.
Performance penalty varies from platform to platform, and even with
key length. For rsa2048 sign it was observed to be 4% for Sandy
Bridge and 7% on Broadwell.

CVE-2016-0702

Reviewed-by: Richard Levitte <levitte@openssl.org>
Reviewed-by: Rich Salz <rsalz@openssl.org>
parent d6d422e1
Loading
Loading
Loading
Loading
+6 −33
Original line number Diff line number Diff line
@@ -775,20 +775,20 @@ bn_sqr8x_mont:
	# 4096. this is done to allow memory disambiguation logic
	# do its job.
	#
	lea	-64(%rsp,$num,4),%r11
	lea	-64(%rsp,$num,2),%r11
	mov	($n0),$n0		# *n0
	sub	$aptr,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lsqr8x_sp_alt
	sub	%r11,%rsp		# align with $aptr
	lea	-64(%rsp,$num,4),%rsp	# alloca(frame+4*$num)
	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	lea	4096-64(,$num,4),%r10	# 4096-frame-4*$num
	lea	-64(%rsp,$num,4),%rsp	# alloca(frame+4*$num)
	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
@@ -798,37 +798,17 @@ bn_sqr8x_mont:
	mov	$num,%r10	
	neg	$num

	lea	64(%rsp,$num,2),%r11	# copy of modulus
	mov	$n0,  32(%rsp)
	mov	%rax, 40(%rsp)		# save original %rsp
.Lsqr8x_body:

	mov	$num,$i
	movq	%r11, %xmm2		# save pointer to modulus copy
	shr	\$3+2,$i
	mov	OPENSSL_ia32cap_P+8(%rip),%eax
	jmp	.Lsqr8x_copy_n

.align	32
.Lsqr8x_copy_n:
	movq	8*0($nptr),%xmm0
	movq	8*1($nptr),%xmm1
	movq	8*2($nptr),%xmm3
	movq	8*3($nptr),%xmm4
	lea	8*4($nptr),$nptr
	movdqa	%xmm0,16*0(%r11)
	movdqa	%xmm1,16*1(%r11)
	movdqa	%xmm3,16*2(%r11)
	movdqa	%xmm4,16*3(%r11)
	lea	16*4(%r11),%r11
	dec	$i
	jnz	.Lsqr8x_copy_n

	movq	$nptr, %xmm2		# save pointer to modulus
	pxor	%xmm0,%xmm0
	movq	$rptr,%xmm1		# save $rptr
	movq	%r10, %xmm3		# -$num
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%eax
	and	\$0x80100,%eax
	cmp	\$0x80100,%eax
	jne	.Lsqr8x_nox
@@ -837,7 +817,6 @@ $code.=<<___ if ($addx);

	pxor	%xmm0,%xmm0
	lea	48(%rsp),%rax
	lea	64(%rsp,$num,2),%rdx
	shr	\$3+2,$num
	mov	40(%rsp),%rsi		# restore %rsp
	jmp	.Lsqr8x_zero
@@ -850,7 +829,6 @@ $code.=<<___;

	pxor	%xmm0,%xmm0
	lea	48(%rsp),%rax
	lea	64(%rsp,$num,2),%rdx
	shr	\$3+2,$num
	mov	40(%rsp),%rsi		# restore %rsp
	jmp	.Lsqr8x_zero
@@ -862,11 +840,6 @@ $code.=<<___;
	movdqa	%xmm0,16*2(%rax)
	movdqa	%xmm0,16*3(%rax)
	lea	16*4(%rax),%rax
	movdqa	%xmm0,16*0(%rdx)	# wipe n
	movdqa	%xmm0,16*1(%rdx)
	movdqa	%xmm0,16*2(%rdx)
	movdqa	%xmm0,16*3(%rdx)
	lea	16*4(%rdx),%rdx
	dec	$num
	jnz	.Lsqr8x_zero

+658 −511

File changed.

Preview size limit exceeded, changes collapsed.

+14 −14
Original line number Diff line number Diff line
@@ -787,8 +787,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
    if (window >= 5) {
        window = 5;             /* ~5% improvement for RSA2048 sign, and even
                                 * for RSA4096 */
        if ((top & 7) == 0)
            powerbufLen += 2 * top * sizeof(m->d[0]);
        /* reserve space for mont->N.d[] copy */
        powerbufLen += top * sizeof(mont->N.d[0]);
    }
#endif
    (void)0;
@@ -1008,7 +1008,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
                               const BN_ULONG *not_used, const BN_ULONG *np,
                               const BN_ULONG *n0, int num);

        BN_ULONG *np = mont->N.d, *n0 = mont->n0, *np2;
        BN_ULONG *n0 = mont->n0, *np;

        /*
         * BN_to_montgomery can contaminate words above .top [in
@@ -1019,11 +1019,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
        for (i = tmp.top; i < top; i++)
            tmp.d[i] = 0;

        if (top & 7)
            np2 = np;
        else
            for (np2 = am.d + top, i = 0; i < top; i++)
                np2[2 * i] = np[i];
        /*
         * copy mont->N.d[] to improve cache locality
         */
        for (np = am.d + top, i = 0; i < top; i++)
            np[i] = mont->N.d[i];

        bn_scatter5(tmp.d, top, powerbuf, 0);
        bn_scatter5(am.d, am.top, powerbuf, 1);
@@ -1033,7 +1033,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
# if 0
        for (i = 3; i < 32; i++) {
            /* Calculate a^i = a^(i-1) * a */
            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
            bn_scatter5(tmp.d, top, powerbuf, i);
        }
# else
@@ -1044,7 +1044,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
        }
        for (i = 3; i < 8; i += 2) {
            int j;
            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
            bn_scatter5(tmp.d, top, powerbuf, i);
            for (j = 2 * i; j < 32; j *= 2) {
                bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
@@ -1052,13 +1052,13 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
            }
        }
        for (; i < 16; i += 2) {
            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
            bn_scatter5(tmp.d, top, powerbuf, i);
            bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
            bn_scatter5(tmp.d, top, powerbuf, 2 * i);
        }
        for (; i < 32; i += 2) {
            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
            bn_scatter5(tmp.d, top, powerbuf, i);
        }
# endif
@@ -1087,11 +1087,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
            while (bits >= 0) {
                wvalue = bn_get_bits5(p->d, bits - 4);
                bits -= 5;
                bn_power5(tmp.d, tmp.d, powerbuf, np2, n0, top, wvalue);
                bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
            }
        }

        ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np2, n0, top);
        ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np, n0, top);
        tmp.top = top;
        bn_correct_top(&tmp);
        if (ret) {