Commit eb791696 authored by Andy Polyakov

ec/ecp_nistz256.c: improve ECDSA sign by 30-40%.



This is based on RT#3810, which added dedicated modular inversion.
ECDSA verify results improve as well, but not as much.
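
For context: RT#3810's inversion computes k^-1 mod ord(P-256) as k^(ord-2) mod ord (Fermat's little theorem), built entirely from the two Montgomery primitives this commit adds. Below is a minimal C sketch of that idea; it is not the code in this commit. The actual ecp_nistz256_inv_mod_ord (see the error strings further down) uses a hand-tuned addition chain for speed, while this straightforward ladder only shows the data flow. Since the exponent is a public constant, even the ladder's operation sequence is fixed.

#include <stdint.h>
#include <string.h>

/* prototypes of the primitives added by this commit */
void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], int rep);

/* ord(P-256) - 2, least significant word first */
static const uint64_t ORD_M2[4] = {
    0xf3b9cac2fc63254fULL, 0xbce6faada7179e84ULL,
    0xffffffffffffffffULL, 0xffffffff00000000ULL
};

/* out = in^(ord-2) = in^-1 mod ord; in/out are in Montgomery form */
static void ord_inverse(uint64_t out[4], uint64_t in[4])
{
    uint64_t acc[4];
    int started = 0;

    for (int i = 255; i >= 0; i--) {            /* left-to-right binary exp. */
        if (started)
            ecp_nistz256_ord_sqr_mont(acc, acc, 1);
        if ((ORD_M2[i / 64] >> (i % 64)) & 1) {
            if (started) {
                ecp_nistz256_ord_mul_mont(acc, acc, in);
            } else {
                memcpy(acc, in, sizeof(acc));   /* first set bit seeds acc */
                started = 1;
            }
        }
    }
    memcpy(out, acc, sizeof(acc));
}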

Reviewed-by: Rich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/5001)
parent 617b49db
+1031 −13
#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2014, Intel Corporation. All Rights Reserved.
# Copyright (c) 2015 CloudFlare, Inc.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
-# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
+# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
# (1) Intel Corporation, Israel Development Center, Haifa, Israel
# (2) University of Haifa, Israel
# (3) CloudFlare, Inc.
#
# Reference:
# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
@@ -18,23 +20,25 @@
# Further optimization by <appro@openssl.org>:
#
#		this/original	with/without -DECP_NISTZ256_ASM(*)
-# Opteron	+12-49%		+110-150%
-# Bulldozer	+14-45%		+175-210%
-# P4		+18-46%		n/a :-(
-# Westmere	+12-34%		+80-87%
-# Sandy Bridge	+9-35%		+110-120%
-# Ivy Bridge	+9-35%		+110-125%
-# Haswell	+8-37%		+140-160%
-# Broadwell	+18-58%		+145-210%
-# Atom		+15-50%		+130-180%
-# VIA Nano	+43-160%	+300-480%
+# Opteron	+15-49%		+150-195%
+# Bulldozer	+18-45%		+175-240%
+# P4		+24-46%		+100-150%
+# Westmere	+18-34%		+87-160%
+# Sandy Bridge	+14-35%		+120-185%
+# Ivy Bridge	+11-35%		+125-180%
+# Haswell	+10-37%		+160-200%
+# Broadwell	+24-58%		+210-270%
+# Atom		+20-50%		+180-240%
+# VIA Nano	+50-160%	+480-480%
#
# (*)	"without -DECP_NISTZ256_ASM" refers to build with
#	"enable-ec_nistp_64_gcc_128";
#
# Ranges denote minimum and maximum improvement coefficients depending
-# on benchmark. Lower coefficients are for ECDSA sign, relatively fastest
-# server-side operation. Keep in mind that +100% means 2x improvement.
+# on benchmark. In "this/original" column lower coefficient is for
+# ECDSA sign, while in "with/without" - for ECDH key agreement, and
+# higher - for ECDSA sign, relatively fastest server-side operation.
+# Keep in mind that +100% means 2x improvement.

$flavour = shift;
$output  = shift;
@@ -95,6 +99,12 @@ $code.=<<___;
.long 3,3,3,3,3,3,3,3
.LONE_mont:
.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe

# Constants for computations modulo ord(p256)
.Lord:
.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
.LordK:
.quad 0xccd1c8aaee00bc4f
___
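
.LordK is the standard Montgomery constant for the order: the unique 64-bit value satisfying .LordK * ord[0] == -1 (mod 2^64), which is what lets every reduction step below zero out the low accumulator word (the "guaranteed to be zero" comments). A quick standalone check, assuming nothing beyond the two constants above:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    uint64_t ord0 = 0xf3b9cac2fc632551ULL;  /* least significant word of .Lord */
    uint64_t k = 1;                         /* ord0 is odd, so k is right mod 2 */

    /* Newton-Hensel iteration doubles the number of correct low bits per
     * step, so six steps yield the full 64-bit inverse (all mod 2^64) */
    for (int i = 0; i < 6; i++)
        k *= 2 - ord0 * k;

    printf("%016" PRIx64 "\n", 0 - k);      /* -ord0^-1: prints ccd1c8aaee00bc4f */
    return 0;
}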

{
@@ -481,6 +491,1014 @@ my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
my ($poly1,$poly3)=($acc6,$acc7);

$code.=<<___;
################################################################################
# void ecp_nistz256_ord_mul_mont(
#   uint64_t res[4],
#   uint64_t a[4],
#   uint64_t b[4]);

.globl	ecp_nistz256_ord_mul_mont
.type	ecp_nistz256_ord_mul_mont,\@function,3
.align	32
ecp_nistz256_ord_mul_mont:
___
$code.=<<___	if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
	cmp	\$0x80100, %ecx
	je	.Lecp_nistz256_ord_mul_montx
___
$code.=<<___;
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	8*0($b_org), %rax
	mov	$b_org, $b_ptr
	lea	.Lord(%rip), %r14
	mov	.LordK(%rip), %r15

	################################# * b[0]
	mov	%rax, $t0
	mulq	8*0($a_ptr)
	mov	%rax, $acc0
	mov	$t0, %rax
	mov	%rdx, $acc1

	mulq	8*1($a_ptr)
	add	%rax, $acc1
	mov	$t0, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc2

	mulq	8*2($a_ptr)
	add	%rax, $acc2
	mov	$t0, %rax
	adc	\$0, %rdx

	 mov	$acc0, $acc5
	 imulq	%r15,$acc0

	mov	%rdx, $acc3
	mulq	8*3($a_ptr)
	add	%rax, $acc3
	 mov	$acc0, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc4

	################################# First reduction step
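	# Note: ord[0] and ord[1] below require real multiplications, but
	# ord[2] = 2^64-1 and ord[3] = 2^64-2^32 have special form, so their
	# products are folded in with sub/sbb and a 32-bit shl/shr split
	# instead of two more mulq instructions.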
	mulq	8*0(%r14)
	mov	$acc0, $t1
	add	%rax, $acc5		# guaranteed to be zero
	mov	$acc0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	sub	$acc0, $acc2
	sbb	\$0, $acc0		# can't borrow

	mulq	8*1(%r14)
	add	$t0, $acc1
	adc	\$0, %rdx
	add	%rax, $acc1
	mov	$t1, %rax
	adc	%rdx, $acc2
	mov	$t1, %rdx
	adc	\$0, $acc0		# can't overflow

	shl	\$32, %rax
	shr	\$32, %rdx
	sub	%rax, $acc3
	 mov	8*1($b_ptr), %rax
	sbb	%rdx, $t1		# can't borrow

	add	$acc0, $acc3
	adc	$t1, $acc4
	adc	\$0, $acc5

	################################# * b[1]
	mov	%rax, $t0
	mulq	8*0($a_ptr)
	add	%rax, $acc1
	mov	$t0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mulq	8*1($a_ptr)
	add	$t1, $acc2
	adc	\$0, %rdx
	add	%rax, $acc2
	mov	$t0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mulq	8*2($a_ptr)
	add	$t1, $acc3
	adc	\$0, %rdx
	add	%rax, $acc3
	mov	$t0, %rax
	adc	\$0, %rdx

	 mov	$acc1, $t0
	 imulq	%r15, $acc1

	mov	%rdx, $t1
	mulq	8*3($a_ptr)
	add	$t1, $acc4
	adc	\$0, %rdx
	xor	$acc0, $acc0
	add	%rax, $acc4
	 mov	$acc1, %rax
	adc	%rdx, $acc5
	adc	\$0, $acc0

	################################# Second reduction step
	mulq	8*0(%r14)
	mov	$acc1, $t1
	add	%rax, $t0		# guaranteed to be zero
	mov	$acc1, %rax
	adc	%rdx, $t0

	sub	$acc1, $acc3
	sbb	\$0, $acc1		# can't borrow

	mulq	8*1(%r14)
	add	$t0, $acc2
	adc	\$0, %rdx
	add	%rax, $acc2
	mov	$t1, %rax
	adc	%rdx, $acc3
	mov	$t1, %rdx
	adc	\$0, $acc1		# can't overflow

	shl	\$32, %rax
	shr	\$32, %rdx
	sub	%rax, $acc4
	 mov	8*2($b_ptr), %rax
	sbb	%rdx, $t1		# can't borrow

	add	$acc1, $acc4
	adc	$t1, $acc5
	adc	\$0, $acc0

	################################## * b[2]
	mov	%rax, $t0
	mulq	8*0($a_ptr)
	add	%rax, $acc2
	mov	$t0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mulq	8*1($a_ptr)
	add	$t1, $acc3
	adc	\$0, %rdx
	add	%rax, $acc3
	mov	$t0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mulq	8*2($a_ptr)
	add	$t1, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$t0, %rax
	adc	\$0, %rdx

	 mov	$acc2, $t0
	 imulq	%r15, $acc2

	mov	%rdx, $t1
	mulq	8*3($a_ptr)
	add	$t1, $acc5
	adc	\$0, %rdx
	xor	$acc1, $acc1
	add	%rax, $acc5
	 mov	$acc2, %rax
	adc	%rdx, $acc0
	adc	\$0, $acc1

	################################# Third reduction step
	mulq	8*0(%r14)
	mov	$acc2, $t1
	add	%rax, $t0		# guaranteed to be zero
	mov	$acc2, %rax
	adc	%rdx, $t0

	sub	$acc2, $acc4
	sbb	\$0, $acc2		# can't borrow

	mulq	8*1(%r14)
	add	$t0, $acc3
	adc	\$0, %rdx
	add	%rax, $acc3
	mov	$t1, %rax
	adc	%rdx, $acc4
	mov	$t1, %rdx
	adc	\$0, $acc2		# can't overflow

	shl	\$32, %rax
	shr	\$32, %rdx
	sub	%rax, $acc5
	 mov	8*3($b_ptr), %rax
	sbb	%rdx, $t1		# can't borrow

	add	$acc2, $acc5
	adc	$t1, $acc0
	adc	\$0, $acc1

	################################# * b[3]
	mov	%rax, $t0
	mulq	8*0($a_ptr)
	add	%rax, $acc3
	mov	$t0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mulq	8*1($a_ptr)
	add	$t1, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$t0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mulq	8*2($a_ptr)
	add	$t1, $acc5
	adc	\$0, %rdx
	add	%rax, $acc5
	mov	$t0, %rax
	adc	\$0, %rdx

	 mov	$acc3, $t0
	 imulq	%r15, $acc3

	mov	%rdx, $t1
	mulq	8*3($a_ptr)
	add	$t1, $acc0
	adc	\$0, %rdx
	xor	$acc2, $acc2
	add	%rax, $acc0
	 mov	$acc3, %rax
	adc	%rdx, $acc1
	adc	\$0, $acc2

	################################# Last reduction step
	mulq	8*0(%r14)
	mov	$acc3, $t1
	add	%rax, $t0		# guaranteed to be zero
	mov	$acc3, %rax
	adc	%rdx, $t0

	sub	$acc3, $acc5
	sbb	\$0, $acc3		# can't borrow

	mulq	8*1(%r14)
	add	$t0, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$t1, %rax
	adc	%rdx, $acc5
	mov	$t1, %rdx
	adc	\$0, $acc3		# can't overflow

	shl	\$32, %rax
	shr	\$32, %rdx
	sub	%rax, $acc0
	sbb	%rdx, $t1		# can't borrow

	add	$acc3, $acc0
	adc	$t1, $acc1
	adc	\$0, $acc2

	################################# Subtract ord
	 mov	$acc4, $a_ptr
	sub	8*0(%r14), $acc4
	 mov	$acc5, $acc3
	sbb	8*1(%r14), $acc5
	 mov	$acc0, $t0
	sbb	8*2(%r14), $acc0
	 mov	$acc1, $t1
	sbb	8*3(%r14), $acc1
	sbb	\$0, $acc2

	cmovc	$a_ptr, $acc4
	cmovc	$acc3, $acc5
	cmovc	$t0, $acc0
	cmovc	$t1, $acc1

	mov	$acc4, 8*0($r_ptr)
	mov	$acc5, 8*1($r_ptr)
	mov	$acc0, 8*2($r_ptr)
	mov	$acc1, 8*3($r_ptr)

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

################################################################################
# void ecp_nistz256_ord_sqr_mont(
#   uint64_t res[4],
#   uint64_t a[4],
#   int rep);

.globl	ecp_nistz256_ord_sqr_mont
.type	ecp_nistz256_ord_sqr_mont,\@function,3
.align	32
ecp_nistz256_ord_sqr_mont:
___
$code.=<<___	if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
	cmp	\$0x80100, %ecx
	je	.Lecp_nistz256_ord_sqr_montx
___
$code.=<<___;
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	8*0($a_ptr), $acc0
	mov	8*1($a_ptr), %rax
	mov	8*2($a_ptr), $acc6
	mov	8*3($a_ptr), $acc7
	lea	.Lord(%rip), $a_ptr	# pointer to modulus
	mov	$b_org, $b_ptr
	jmp	.Loop_ord_sqr

.align	32
.Loop_ord_sqr:
	################################# a[1:] * a[0]
	mov	%rax, $t1		# put aside a[1]
	mul	$acc0			# a[1] * a[0]
	mov	%rax, $acc1
	movq	$t1, %xmm1		# offload a[1]
	mov	$acc6, %rax
	mov	%rdx, $acc2

	mul	$acc0			# a[2] * a[0]
	add	%rax, $acc2
	mov	$acc7, %rax
	movq	$acc6, %xmm2		# offload a[2]
	adc	\$0, %rdx
	mov	%rdx, $acc3

	mul	$acc0			# a[3] * a[0]
	add	%rax, $acc3
	mov	$acc7, %rax
	movq	$acc7, %xmm3		# offload a[3]
	adc	\$0, %rdx
	mov	%rdx, $acc4

	################################# a[3] * a[2]
	mul	$acc6			# a[3] * a[2]
	mov	%rax, $acc5
	mov	$acc6, %rax
	mov	%rdx, $acc6

	################################# a[2:] * a[1]
	mul	$t1			# a[2] * a[1]
	add	%rax, $acc3
	mov	$acc7, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc7

	mul	$t1			# a[3] * a[1]
	add	%rax, $acc4
	adc	\$0, %rdx

	add	$acc7, $acc4
	adc	%rdx, $acc5
	adc	\$0, $acc6		# can't overflow

	################################# *2
	xor	$acc7, $acc7
	mov	$acc0, %rax
	add	$acc1, $acc1
	adc	$acc2, $acc2
	adc	$acc3, $acc3
	adc	$acc4, $acc4
	adc	$acc5, $acc5
	adc	$acc6, $acc6
	adc	\$0, $acc7

	################################# Missing products
	mul	%rax			# a[0] * a[0]
	mov	%rax, $acc0
	movq	%xmm1, %rax
	mov	%rdx, $t1

	mul	%rax			# a[1] * a[1]
	add	$t1, $acc1
	adc	%rax, $acc2
	movq	%xmm2, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mul	%rax			# a[2] * a[2]
	add	$t1, $acc3
	adc	%rax, $acc4
	movq	%xmm3, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	 mov	$acc0, $t0
	 imulq	8*4($a_ptr), $acc0	# *= .LordK

	mul	%rax			# a[3] * a[3]
	add	$t1, $acc5
	adc	%rax, $acc6
	 mov	8*0($a_ptr), %rax	# modulus[0]
	adc	%rdx, $acc7		# can't overflow

	################################# First reduction step
	mul	$acc0
	mov	$acc0, $t1
	add	%rax, $t0		# guaranteed to be zero
	mov	8*1($a_ptr), %rax	# modulus[1]
	adc	%rdx, $t0

	sub	$acc0, $acc2
	sbb	\$0, $t1		# can't borrow

	mul	$acc0
	add	$t0, $acc1
	adc	\$0, %rdx
	add	%rax, $acc1
	mov	$acc0, %rax
	adc	%rdx, $acc2
	mov	$acc0, %rdx
	adc	\$0, $t1		# can't overflow

	 mov	$acc1, $t0
	 imulq	8*4($a_ptr), $acc1	# *= .LordK

	shl	\$32, %rax
	shr	\$32, %rdx
	sub	%rax, $acc3
	 mov	8*0($a_ptr), %rax
	sbb	%rdx, $acc0		# can't borrow

	add	$t1, $acc3
	adc	\$0, $acc0		# can't overflow

	################################# Second reduction step
	mul	$acc1
	mov	$acc1, $t1
	add	%rax, $t0		# guaranteed to be zero
	mov	8*1($a_ptr), %rax
	adc	%rdx, $t0

	sub	$acc1, $acc3
	sbb	\$0, $t1		# can't borrow

	mul	$acc1
	add	$t0, $acc2
	adc	\$0, %rdx
	add	%rax, $acc2
	mov	$acc1, %rax
	adc	%rdx, $acc3
	mov	$acc1, %rdx
	adc	\$0, $t1		# can't overflow

	 mov	$acc2, $t0
	 imulq	8*4($a_ptr), $acc2	# *= .LordK

	shl	\$32, %rax
	shr	\$32, %rdx
	sub	%rax, $acc0
	 mov	8*0($a_ptr), %rax
	sbb	%rdx, $acc1		# can't borrow

	add	$t1, $acc0
	adc	\$0, $acc1		# can't overflow

	################################# Third reduction step
	mul	$acc2
	mov	$acc2, $t1
	add	%rax, $t0		# guaranteed to be zero
	mov	8*1($a_ptr), %rax
	adc	%rdx, $t0

	sub	$acc2, $acc0
	sbb	\$0, $t1		# can't borrow

	mul	$acc2
	add	$t0, $acc3
	adc	\$0, %rdx
	add	%rax, $acc3
	mov	$acc2, %rax
	adc	%rdx, $acc0
	mov	$acc2, %rdx
	adc	\$0, $t1		# can't overflow

	 mov	$acc3, $t0
	 imulq	8*4($a_ptr), $acc3	# *= .LordK

	shl	\$32, %rax
	shr	\$32, %rdx
	sub	%rax, $acc1
	 mov	8*0($a_ptr), %rax
	sbb	%rdx, $acc2		# can't borrow

	add	$t1, $acc1
	adc	\$0, $acc2		# can't overflow

	################################# Last reduction step
	mul	$acc3
	mov	$acc3, $t1
	add	%rax, $t0		# guaranteed to be zero
	mov	8*1($a_ptr), %rax
	adc	%rdx, $t0

	sub	$acc3, $acc1
	sbb	\$0, $t1		# can't borrow

	mul	$acc3
	add	$t0, $acc0
	adc	\$0, %rdx
	add	%rax, $acc0
	mov	$acc3, %rax
	adc	%rdx, $acc1
	mov	$acc3, %rdx
	adc	\$0, $t1		# can't overflow

	shl	\$32, %rax
	shr	\$32, %rdx
	sub	%rax, $acc2
	sbb	%rdx, $acc3		# can't borrow

	add	$t1, $acc2
	adc	\$0, $acc3		# can't overflow

	################################# Add bits [511:256] of the sqr result
	xor	%rdx, %rdx
	add	$acc4, $acc0
	adc	$acc5, $acc1
	 mov	$acc0, $acc4
	adc	$acc6, $acc2
	adc	$acc7, $acc3
	 mov	$acc1, %rax
	adc	\$0, %rdx

	################################# Compare to modulus
	sub	8*0($a_ptr), $acc0
	 mov	$acc2, $acc6
	sbb	8*1($a_ptr), $acc1
	sbb	8*2($a_ptr), $acc2
	 mov	$acc3, $acc7
	sbb	8*3($a_ptr), $acc3
	sbb	\$0, %rdx

	cmovc	$acc4, $acc0
	cmovnc	$acc1, %rax
	cmovnc	$acc2, $acc6
	cmovnc	$acc3, $acc7

	dec	$b_ptr
	jnz	.Loop_ord_sqr

	mov	$acc0, 8*0($r_ptr)
	mov	%rax,  8*1($r_ptr)
	pxor	%xmm1, %xmm1
	mov	$acc6, 8*2($r_ptr)
	pxor	%xmm2, %xmm2
	mov	$acc7, 8*3($r_ptr)
	pxor	%xmm3, %xmm3

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
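
Behaviourally, ecp_nistz256_ord_mul_mont above is a word-by-word (CIOS) Montgomery multiplication, res = a * b * 2^-256 mod ord; ecp_nistz256_ord_sqr_mont(res, a, rep) acts like rep back-to-back squarings, the rep argument existing so an inversion addition chain can request a whole run of squarings in one call. Here is a hedged C model of the multiplication, assuming a compiler with unsigned __int128; the assembly exploits the special form of ord[2]/ord[3] and uses cmov where this model branches, but the results should agree:

#include <stdint.h>

static const uint64_t ORD[4] = {
    0xf3b9cac2fc632551ULL, 0xbce6faada7179e84ULL,
    0xffffffffffffffffULL, 0xffffffff00000000ULL
};
static const uint64_t ORD_K = 0xccd1c8aaee00bc4fULL; /* -ORD[0]^-1 mod 2^64 */

static void ord_mul_mont_ref(uint64_t res[4], const uint64_t a[4],
                             const uint64_t b[4])
{
    uint64_t t[6] = { 0 };                     /* 4 result words + 2 carry words */

    for (int i = 0; i < 4; i++) {
        unsigned __int128 c = 0;

        for (int j = 0; j < 4; j++) {          /* t += a * b[i] */
            c += (unsigned __int128)a[j] * b[i] + t[j];
            t[j] = (uint64_t)c;
            c >>= 64;
        }
        c += t[4];
        t[4] = (uint64_t)c;
        t[5] = (uint64_t)(c >> 64);

        /* t += m*ord with m chosen so t[0] cancels, then shift t one word */
        uint64_t m = t[0] * ORD_K;
        c = ((unsigned __int128)m * ORD[0] + t[0]) >> 64;  /* low word is 0 */
        for (int j = 1; j < 4; j++) {
            c += (unsigned __int128)m * ORD[j] + t[j];
            t[j - 1] = (uint64_t)c;
            c >>= 64;
        }
        c += t[4];
        t[3] = (uint64_t)c;
        t[4] = t[5] + (uint64_t)(c >> 64);
    }

    /* final conditional subtraction of ord (the asm does this with cmov) */
    uint64_t r[4], borrow = 0;
    for (int j = 0; j < 4; j++) {
        unsigned __int128 d = (unsigned __int128)t[j] - ORD[j] - borrow;
        r[j] = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;
    }
    int keep_t = (t[4] == 0 && borrow != 0);   /* t was already < ord */
    for (int j = 0; j < 4; j++)
        res[j] = keep_t ? t[j] : r[j];
}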

$code.=<<___	if ($addx);
################################################################################
.type	ecp_nistz256_ord_mul_montx,\@function,3
.align	32
ecp_nistz256_ord_mul_montx:
.Lecp_nistz256_ord_mul_montx:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	$b_org, $b_ptr
	mov	8*0($b_org), %rdx
	mov	8*0($a_ptr), $acc1
	mov	8*1($a_ptr), $acc2
	mov	8*2($a_ptr), $acc3
	mov	8*3($a_ptr), $acc4
	lea	-128($a_ptr), $a_ptr	# control u-op density
	lea	.Lord-128(%rip), %r14
	mov	.LordK(%rip), %r15

	################################# Multiply by b[0]
	mulx	$acc1, $acc0, $acc1
	mulx	$acc2, $t0, $acc2
	mulx	$acc3, $t1, $acc3
	add	$t0, $acc1
	mulx	$acc4, $t0, $acc4
	 mov	$acc0, %rdx
	 mulx	%r15, %rdx, %rax
	adc	$t1, $acc2
	adc	$t0, $acc3
	adc	\$0, $acc4

	################################# reduction
	xor	$acc5, $acc5		# $acc5=0, cf=0, of=0
	mulx	8*0+128(%r14), $t0, $t1
	adcx	$t0, $acc0		# guaranteed to be zero
	adox	$t1, $acc1

	mulx	8*1+128(%r14), $t0, $t1
	adcx	$t0, $acc1
	adox	$t1, $acc2

	mulx	8*2+128(%r14), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3

	mulx	8*3+128(%r14), $t0, $t1
	 mov	8*1($b_ptr), %rdx
	adcx	$t0, $acc3
	adox	$t1, $acc4
	adcx	$acc0, $acc4
	adox	$acc0, $acc5
	adc	\$0, $acc5		# cf=0, of=0

	################################# Multiply by b[1]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc1
	adox	$t1, $acc2

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*3+128($a_ptr), $t0, $t1
	 mov	$acc1, %rdx
	 mulx	%r15, %rdx, %rax
	adcx	$t0, $acc4
	adox	$t1, $acc5

	adcx	$acc0, $acc5
	adox	$acc0, $acc0
	adc	\$0, $acc0		# cf=0, of=0

	################################# reduction
	mulx	8*0+128(%r14), $t0, $t1
	adcx	$t0, $acc1		# guaranteed to be zero
	adox	$t1, $acc2

	mulx	8*1+128(%r14), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3

	mulx	8*2+128(%r14), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*3+128(%r14), $t0, $t1
	 mov	8*2($b_ptr), %rdx
	adcx	$t0, $acc4
	adox	$t1, $acc5
	adcx	$acc1, $acc5
	adox	$acc1, $acc0
	adc	\$0, $acc0		# cf=0, of=0

	################################# Multiply by b[2]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc4
	adox	$t1, $acc5

	mulx	8*3+128($a_ptr), $t0, $t1
	 mov	$acc2, %rdx
	 mulx	%r15, %rdx, %rax
	adcx	$t0, $acc5
	adox	$t1, $acc0

	adcx	$acc1, $acc0
	adox	$acc1, $acc1
	adc	\$0, $acc1		# cf=0, of=0

	################################# reduction
	mulx	8*0+128(%r14), $t0, $t1
	adcx	$t0, $acc2		# guaranteed to be zero
	adox	$t1, $acc3

	mulx	8*1+128(%r14), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*2+128(%r14), $t0, $t1
	adcx	$t0, $acc4
	adox	$t1, $acc5

	mulx	8*3+128(%r14), $t0, $t1
	 mov	8*3($b_ptr), %rdx
	adcx	$t0, $acc5
	adox	$t1, $acc0
	adcx	$acc2, $acc0
	adox	$acc2, $acc1
	adc	\$0, $acc1		# cf=0, of=0

	################################# Multiply by b[3]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc4
	adox	$t1, $acc5

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc5
	adox	$t1, $acc0

	mulx	8*3+128($a_ptr), $t0, $t1
	 mov	$acc3, %rdx
	 mulx	%r15, %rdx, %rax
	adcx	$t0, $acc0
	adox	$t1, $acc1

	adcx	$acc2, $acc1
	adox	$acc2, $acc2
	adc	\$0, $acc2		# cf=0, of=0

	################################# reduction
	mulx	8*0+128(%r14), $t0, $t1
	adcx	$t0, $acc3		# guaranteed to be zero
	adox	$t1, $acc4

	mulx	8*1+128(%r14), $t0, $t1
	adcx	$t0, $acc4
	adox	$t1, $acc5

	mulx	8*2+128(%r14), $t0, $t1
	adcx	$t0, $acc5
	adox	$t1, $acc0

	mulx	8*3+128(%r14), $t0, $t1
	lea	128(%r14),%r14
	 mov	$acc4, $t2
	adcx	$t0, $acc0
	adox	$t1, $acc1
	 mov	$acc5, $t3
	adcx	$acc3, $acc1
	adox	$acc3, $acc2
	adc	\$0, $acc2

	#################################
	# Branch-less conditional subtraction of P
	 mov	$acc0, $t0
	sub	8*0(%r14), $acc4
	sbb	8*1(%r14), $acc5
	sbb	8*2(%r14), $acc0
	 mov	$acc1, $t1
	sbb	8*3(%r14), $acc1
	sbb	\$0, $acc2

	cmovc	$t2, $acc4
	cmovc	$t3, $acc5
	cmovc	$t0, $acc0
	cmovc	$t1, $acc1

	mov	$acc4, 8*0($r_ptr)
	mov	$acc5, 8*1($r_ptr)
	mov	$acc0, 8*2($r_ptr)
	mov	$acc1, 8*3($r_ptr)

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx

.type	ecp_nistz256_ord_sqr_montx,\@function,3
.align	32
ecp_nistz256_ord_sqr_montx:
.Lecp_nistz256_ord_sqr_montx:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	$b_org, $b_ptr
	mov	8*0($a_ptr), %rdx
	mov	8*1($a_ptr), $acc6
	mov	8*2($a_ptr), $acc7
	mov	8*3($a_ptr), $acc0
	lea	.Lord(%rip), $a_ptr
	jmp	.Loop_ord_sqrx

.align	32
.Loop_ord_sqrx:
	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
	 mov	%rdx, %rax		# offload a[0]
	 movq	$acc6, %xmm1		# offload a[1]
	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
	 mov	$acc6, %rdx
	add	$t0, $acc2
	 movq	$acc7, %xmm2		# offload a[2]
	adc	$t1, $acc3
	adc	\$0, $acc4
	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
	#################################
	mulx	$acc7, $t0, $t1		# a[1]*a[2]
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	$acc0, $t0, $t1		# a[1]*a[3]
	 mov	$acc7, %rdx
	adcx	$t0, $acc4
	adox	$t1, $acc5
	adc	\$0, $acc5
	#################################
	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
	mov	%rax, %rdx
	 movq	$acc0, %xmm3		# offload a[3]
	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
	 adcx	$acc1, $acc1		# acc1:6<<1
	adox	$t0, $acc5
	 adcx	$acc2, $acc2
	adox	$acc7, $acc6		# of=0

	################################# a[i]*a[i]
	mulx	%rdx, $acc0, $t1
	movq	%xmm1, %rdx
	 adcx	$acc3, $acc3
	adox	$t1, $acc1
	 adcx	$acc4, $acc4
	mulx	%rdx, $t0, $t4
	movq	%xmm2, %rdx
	 adcx	$acc5, $acc5
	adox	$t0, $acc2
	 adcx	$acc6, $acc6
	mulx	%rdx, $t0, $t1
	.byte	0x67
	movq	%xmm3, %rdx
	adox	$t4, $acc3
	 adcx	$acc7, $acc7
	adox	$t0, $acc4
	adox	$t1, $acc5
	mulx	%rdx, $t0, $t4
	adox	$t0, $acc6
	adox	$t4, $acc7

	################################# reduction
	mov	$acc0, %rdx
	mulx	8*4($a_ptr), %rdx, $t0

	xor	%rax, %rax		# cf=0, of=0
	mulx	8*0($a_ptr), $t0, $t1
	adcx	$t0, $acc0		# guaranteed to be zero
	adox	$t1, $acc1
	mulx	8*1($a_ptr), $t0, $t1
	adcx	$t0, $acc1
	adox	$t1, $acc2
	mulx	8*2($a_ptr), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3
	mulx	8*3($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc0		# of=0
	adcx	%rax, $acc0		# cf=0

	#################################
	mov	$acc1, %rdx
	mulx	8*4($a_ptr), %rdx, $t0

	mulx	8*0($a_ptr), $t0, $t1
	adox	$t0, $acc1		# guaranteed to be zero
	adcx	$t1, $acc2
	mulx	8*1($a_ptr), $t0, $t1
	adox	$t0, $acc2
	adcx	$t1, $acc3
	mulx	8*2($a_ptr), $t0, $t1
	adox	$t0, $acc3
	adcx	$t1, $acc0
	mulx	8*3($a_ptr), $t0, $t1
	adox	$t0, $acc0
	adcx	$t1, $acc1		# cf=0
	adox	%rax, $acc1		# of=0

	#################################
	mov	$acc2, %rdx
	mulx	8*4($a_ptr), %rdx, $t0

	mulx	8*0($a_ptr), $t0, $t1
	adcx	$t0, $acc2		# guaranteed to be zero
	adox	$t1, $acc3
	mulx	8*1($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc0
	mulx	8*2($a_ptr), $t0, $t1
	adcx	$t0, $acc0
	adox	$t1, $acc1
	mulx	8*3($a_ptr), $t0, $t1
	adcx	$t0, $acc1
	adox	$t1, $acc2		# of=0
	adcx	%rax, $acc2		# cf=0

	#################################
	mov	$acc3, %rdx
	mulx	8*4($a_ptr), %rdx, $t0

	mulx	8*0($a_ptr), $t0, $t1
	adox	$t0, $acc3		# guaranteed to be zero
	adcx	$t1, $acc0
	mulx	8*1($a_ptr), $t0, $t1
	adox	$t0, $acc0
	adcx	$t1, $acc1
	mulx	8*2($a_ptr), $t0, $t1
	adox	$t0, $acc1
	adcx	$t1, $acc2
	mulx	8*3($a_ptr), $t0, $t1
	adox	$t0, $acc2
	adcx	$t1, $acc3
	adox	%rax, $acc3

	################################# accumulate upper half
	add	$acc0, $acc4		# add	$acc4, $acc0
	adc	$acc5, $acc1
	 mov	$acc4, %rdx
	adc	$acc6, $acc2
	adc	$acc7, $acc3
	 mov	$acc1, $acc6
	adc	\$0, %rax

	################################# compare to modulus
	sub	8*0($a_ptr), $acc4
	 mov	$acc2, $acc7
	sbb	8*1($a_ptr), $acc1
	sbb	8*2($a_ptr), $acc2
	 mov	$acc3, $acc0
	sbb	8*3($a_ptr), $acc3
	sbb	\$0, %rax

	cmovnc	$acc4, %rdx
	cmovnc	$acc1, $acc6
	cmovnc	$acc2, $acc7
	cmovnc	$acc3, $acc0

	dec	$b_ptr
	jnz	.Loop_ord_sqrx

	mov	%rdx, 8*0($r_ptr)
	mov	$acc6, 8*1($r_ptr)
	pxor	%xmm1, %xmm1
	mov	$acc7, 8*2($r_ptr)
	pxor	%xmm2, %xmm2
	mov	$acc0, 8*3($r_ptr)
	pxor	%xmm3, %xmm3

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret

.size	ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
___
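
The _montx paths lean on three instructions: mulx forms a full 64x64->128 product without touching the flags, while adcx and adox perform add-with-carry on two independent carry chains (CF and OF respectively), so the low-half and high-half accumulations of a row can be interleaved without serializing on one flags register. A hedged intrinsics sketch of a single accumulation row, assuming a compiler targeting ADX and BMI2 (e.g. gcc -madx -mbmi2); the helper name and six-word accumulator are illustrative, and whether a compiler really emits adcx/adox here is its own decision; the point is the data flow:

#include <immintrin.h>  /* _mulx_u64 (BMI2), _addcarryx_u64 (ADX) */

/* One row of acc += a[0..3] * b_i: lo halves ride the CF chain (adcx),
 * hi halves ride the OF chain (adox).  Assumes acc[5] has headroom so
 * the final carry-out is zero. */
static void muladd_row(unsigned long long acc[6],
                       const unsigned long long a[4], unsigned long long b_i)
{
    unsigned char cf = 0, of = 0;
    unsigned long long lo, hi;

    for (int j = 0; j < 4; j++) {
        lo = _mulx_u64(a[j], b_i, &hi);                       /* mulx      */
        cf = _addcarryx_u64(cf, acc[j], lo, &acc[j]);         /* adcx, CF  */
        of = _addcarryx_u64(of, acc[j + 1], hi, &acc[j + 1]); /* adox, OF  */
    }
    /* fold the leftover CF carry at word 4, then both into word 5 */
    cf = _addcarryx_u64(cf, acc[4], 0, &acc[4]);
    (void)_addcarryx_u64(of, acc[5], cf, &acc[5]);
}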

$code.=<<___;
################################################################################
# void ecp_nistz256_to_mont(
+2 −0
@@ -48,6 +48,8 @@ static const ERR_STRING_DATA EC_str_functs[] = {
     "ECPKParameters_print_fp"},
    {ERR_PACK(ERR_LIB_EC, EC_F_ECP_NISTZ256_GET_AFFINE, 0),
     "ecp_nistz256_get_affine"},
    {ERR_PACK(ERR_LIB_EC, EC_F_ECP_NISTZ256_INV_MOD_ORD, 0),
     "ecp_nistz256_inv_mod_ord"},
    {ERR_PACK(ERR_LIB_EC, EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, 0),
     "ecp_nistz256_mult_precompute"},
    {ERR_PACK(ERR_LIB_EC, EC_F_ECP_NISTZ256_POINTS_MUL, 0),
+6 −1
@@ -155,6 +155,9 @@ struct ec_method_st {
    /* custom ECDH operation */
    int (*ecdh_compute_key)(unsigned char **pout, size_t *poutlen,
                            const EC_POINT *pub_key, const EC_KEY *ecdh);
    /* Inverse modulo order */
    int (*field_inverse_mod_ord)(const EC_GROUP *, BIGNUM *r, BIGNUM *x,
                                 BN_CTX *ctx);
};

/*
@@ -520,7 +523,6 @@ void ec_GFp_nistp_points_make_affine_internal(size_t num, void *point_array,
void ec_GFp_nistp_recode_scalar_bits(unsigned char *sign,
                                     unsigned char *digit, unsigned char in);
#endif
-int ec_precompute_mont_data(EC_GROUP *);
int ec_group_simple_order_bits(const EC_GROUP *group);

#ifdef ECP_NISTZ256_ASM
@@ -604,3 +606,6 @@ int X25519(uint8_t out_shared_key[32], const uint8_t private_key[32],
           const uint8_t peer_public_value[32]);
void X25519_public_from_private(uint8_t out_public_value[32],
                                const uint8_t private_key[32]);

int EC_GROUP_do_inverse_ord(const EC_GROUP *group, BIGNUM *res,
                            BIGNUM *x, BN_CTX *ctx);
+12 −1
@@ -261,6 +261,8 @@ int EC_METHOD_get_field_type(const EC_METHOD *meth)
    return meth->field_type;
}

static int ec_precompute_mont_data(EC_GROUP *);

int EC_GROUP_set_generator(EC_GROUP *group, const EC_POINT *generator,
                           const BIGNUM *order, const BIGNUM *cofactor)
{
@@ -961,7 +963,7 @@ int EC_GROUP_have_precompute_mult(const EC_GROUP *group)
 * ec_precompute_mont_data sets |group->mont_data| from |group->order| and
 * returns one on success. On error it returns zero.
 */
-int ec_precompute_mont_data(EC_GROUP *group)
+static int ec_precompute_mont_data(EC_GROUP *group)
{
    BN_CTX *ctx = BN_CTX_new();
    int ret = 0;
@@ -1006,3 +1008,12 @@ int ec_group_simple_order_bits(const EC_GROUP *group)
        return 0;
    return BN_num_bits(group->order);
}

int EC_GROUP_do_inverse_ord(const EC_GROUP *group, BIGNUM *res,
                            BIGNUM *x, BN_CTX *ctx)
{
    if (group->meth->field_inverse_mod_ord != NULL)
        return group->meth->field_inverse_mod_ord(group, res, x, ctx);
    else
        return 0;
}
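
A hedged sketch of how a signing path inside libcrypto might use this hook, preferring the dedicated routine and falling back to generic BIGNUM inversion when a method does not implement field_inverse_mod_ord (the helper name is illustrative):

#include <openssl/bn.h>
#include <openssl/ec.h>
#include "ec_lcl.h"     /* internal header declaring EC_GROUP_do_inverse_ord */

/* kinv = k^-1 mod order */
static int inverse_mod_order(const EC_GROUP *group, BIGNUM *kinv,
                             BIGNUM *k, BN_CTX *ctx)
{
    if (EC_GROUP_do_inverse_ord(group, kinv, k, ctx))
        return 1;
    return BN_mod_inverse(kinv, k, EC_GROUP_get0_order(group), ctx) != NULL;
}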
+33 −27

Preview size limit exceeded, changes collapsed.