crypto/ec/asm/ecp_nistz256-x86_64.pl  +191 −290

@@ -31,15 +31,16 @@
 # Further optimization by <appro@openssl.org>:
 #
 #		this/original
-# Opteron	+8-33%
-# Bulldozer	+10-30%
-# P4		+14-38%
-# Westmere	+8-23%
-# Sandy Bridge	+8-24%
-# Ivy Bridge	+7-25%
-# Haswell	+5-25%
-# Atom		+10-32%
-# VIA Nano	+37-130%
+# Opteron	+12-49%
+# Bulldozer	+14-45%
+# P4		+18-46%
+# Westmere	+12-34%
+# Sandy Bridge	+9-35%
+# Ivy Bridge	+9-35%
+# Haswell	+8-37%
+# Broadwell	+18-58%
+# Atom		+15-50%
+# VIA Nano	+43-160%
 #
 # Ranges denote minimum and maximum improvement coefficients depending
 # on benchmark. Lower coefficients are for ECDSA sign, relatively

@@ -550,28 +551,20 @@ __ecp_nistz256_mul_montq:
 	# and add the result to the acc.
 	# Due to the special form of p256 we do some optimizations
 	#
-	# acc[0] x p256[0] = acc[0] x 2^64 - acc[0]
-	# then we add acc[0] and get acc[0] x 2^64
-
-	mulq	$poly1
-	xor	$t0, $t0
-	add	$acc0, $acc1		# +=acc[0]*2^64
-	adc	\$0, %rdx
-	add	%rax, $acc1
-	mov	$acc0, %rax
-
-	# acc[0] x p256[2] = 0
-	adc	%rdx, $acc2
-	adc	\$0, $t0
+	# acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
+	# then we add acc[0] and get acc[0] x 2^96
+	mov	$acc0, $t1
+	shl	\$32, $acc0
 	mulq	$poly3
-	xor	$acc0, $acc0
-	add	$t0, $acc3
-	adc	\$0, %rdx
-	add	%rax, $acc3
+	shr	\$32, $t1
+	add	$acc0, $acc1		# +=acc[0]<<96
+	adc	$t1, $acc2
+	adc	%rax, $acc3
 	mov	8*1($b_ptr), %rax
 	adc	%rdx, $acc4
 	adc	\$0, $acc5
+	xor	$acc0, $acc0

 	########################################################################
 	# Multiply by b[1]

@@ -608,23 +601,17 @@ __ecp_nistz256_mul_montq:
 	########################################################################
 	# Second reduction step
-	mulq	$poly1
-	xor	$t0, $t0
-	add	$acc1, $acc2
-	adc	\$0, %rdx
-	add	%rax, $acc2
-	mov	$acc1, %rax
-	adc	%rdx, $acc3
-	adc	\$0, $t0
+	mov	$acc1, $t1
+	shl	\$32, $acc1
 	mulq	$poly3
-	xor	$acc1, $acc1
-	add	$t0, $acc4
-	adc	\$0, %rdx
-	add	%rax, $acc4
+	shr	\$32, $t1
+	add	$acc1, $acc2
+	adc	$t1, $acc3
+	adc	%rax, $acc4
 	mov	8*2($b_ptr), %rax
 	adc	%rdx, $acc5
 	adc	\$0, $acc0
+	xor	$acc1, $acc1

 	########################################################################
 	# Multiply by b[2]

@@ -661,23 +648,17 @@ __ecp_nistz256_mul_montq:
 	########################################################################
 	# Third reduction step
-	mulq	$poly1
-	xor	$t0, $t0
-	add	$acc2, $acc3
-	adc	\$0, %rdx
-	add	%rax, $acc3
-	mov	$acc2, %rax
-	adc	%rdx, $acc4
-	adc	\$0, $t0
+	mov	$acc2, $t1
+	shl	\$32, $acc2
 	mulq	$poly3
-	xor	$acc2, $acc2
-	add	$t0, $acc5
-	adc	\$0, %rdx
-	add	%rax, $acc5
+	shr	\$32, $t1
+	add	$acc2, $acc3
+	adc	$t1, $acc4
+	adc	%rax, $acc5
 	mov	8*3($b_ptr), %rax
 	adc	%rdx, $acc0
 	adc	\$0, $acc1
+	xor	$acc2, $acc2

 	########################################################################
 	# Multiply by b[3]

@@ -714,20 +695,14 @@ __ecp_nistz256_mul_montq:
 	########################################################################
 	# Final reduction step
-	mulq	$poly1
-	#xor	$t0, $t0
-	add	$acc3, $acc4
-	adc	\$0, %rdx
-	add	%rax, $acc4
-	mov	$acc3, %rax
-	adc	%rdx, $acc5
-	#adc	\$0, $t0		# doesn't overflow
+	mov	$acc3, $t1
+	shl	\$32, $acc3
 	mulq	$poly3
-	#add	$t0, $acc0
-	#adc	\$0, %rdx
+	shr	\$32, $t1
+	add	$acc3, $acc4
+	adc	$t1, $acc5
 	mov	$acc4, $t0
-	add	%rax, $acc0
+	adc	%rax, $acc0
 	adc	%rdx, $acc1
 	mov	$acc5, $t1
 	adc	\$0, $acc2

@@ -740,14 +715,14 @@ __ecp_nistz256_mul_montq:
 	sbb	\$0, $acc0		# .Lpoly[2]
 	mov	$acc1, $t3
 	sbb	$poly3, $acc1		# .Lpoly[3]
-	neg	$acc2
+	sbb	\$0, $acc2

-	cmovnc	$t0, $acc4
-	cmovnc	$t1, $acc5
+	cmovc	$t0, $acc4
+	cmovc	$t1, $acc5
 	mov	$acc4, 8*0($r_ptr)
-	cmovnc	$t2, $acc0
+	cmovc	$t2, $acc0
 	mov	$acc5, 8*1($r_ptr)
-	cmovnc	$t3, $acc1
+	cmovc	$t3, $acc1
 	mov	$acc0, 8*2($r_ptr)
 	mov	$acc1, 8*3($r_ptr)
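What the montq hunks above change: p256 = 2^256 - 2^224 + 2^192 + 2^96 - 1, so its two low words satisfy p[0] + 2^64*p[1] = 2^96 - 1, and the multiplier of each reduction step is u = acc[0] exactly (since -p^-1 mod 2^64 == 1). Adding u*p therefore cancels acc[0] outright, and the old mulq $poly1 sequence collapses into two shifts that add u<<96, leaving a single mulq by p[3]. A minimal C sketch of one such step, with a hypothetical helper name and unsigned __int128 standing in for the double-width arithmetic (not OpenSSL code):

#include <stdint.h>

typedef unsigned __int128 u128;

/* One word-wise Montgomery reduction step for P-256 (sketch, hypothetical
 * helper).  acc[0..3] is the current window, acc[4] the running carry word.
 * Folds u = acc[0] into the higher limbs; the perlasm then shifts the
 * window down by renaming registers. */
static void p256_reduce_step(uint64_t acc[5])
{
    uint64_t u = acc[0];                         /* -p^-1 mod 2^64 == 1  */
    u128 m = (u128)u * 0xffffffff00000001ULL;    /* u * p[3]             */
    u128 t;

    /* u*(p[0] + p[1]*2^64) == u*(2^96 - 1); adding back u zeroes acc[0],
     * so only u<<96 remains: two shifts replace the old mulq $poly1.    */
    t = (u128)acc[1] + (u << 32);                acc[1] = (uint64_t)t;
    t = (u128)acc[2] + (u >> 32) + (t >> 64);    acc[2] = (uint64_t)t;
    t = (u128)acc[3] + (uint64_t)m + (t >> 64);  acc[3] = (uint64_t)t;
    acc[4] += (uint64_t)(m >> 64) + (uint64_t)(t >> 64);
    acc[0] = 0;                                  /* cancelled exactly    */
}

A full chain also propagates the final carry into the next limb, as the adc \$0, $acc5 in the diff does; the sketch folds it into acc[4] for brevity.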
@@ -897,89 +872,62 @@ __ecp_nistz256_sqr_montq:
 	##########################################
 	# Now the reduction
 	# First iteration
-	mulq	$a_ptr
-	#xor	$t0, $t0
-	add	$acc0, $acc1
-	adc	\$0, %rdx
-	add	%rax, $acc1
-	mov	$acc0, %rax
-	adc	%rdx, $acc2		# doesn't overflow
-	#adc	\$0, $t0
+	mov	$acc0, $t0
+	shl	\$32, $acc0
 	mulq	$t1
-	xor	$acc0, $acc0
-	#add	$t0, $acc3
-	#adc	\$0, %rdx
-	add	%rax, $acc3
+	shr	\$32, $t0
+	add	$acc0, $acc1		# +=acc[0]<<96
+	adc	$t0, $acc2
+	adc	%rax, $acc3
 	mov	$acc1, %rax
-	adc	%rdx, $acc4
-	adc	\$0, $acc0
+	adc	\$0, %rdx

 	##########################################
 	# Second iteration
-	mulq	$a_ptr
-	#xor	$t0, $t0
-	add	$acc1, $acc2
-	adc	\$0, %rdx
-	add	%rax, $acc2
-	mov	$acc1, %rax
-	adc	%rdx, $acc3		# doesn't overflow
-	#adc	\$0, $t0
+	mov	$acc1, $t0
+	shl	\$32, $acc1
+	mov	%rdx, $acc0
 	mulq	$t1
-	xor	$acc1, $acc1
-	#add	$t0, $acc4
-	#adc	\$0, %rdx
-	add	%rax, $acc4
+	shr	\$32, $t0
+	add	$acc1, $acc2
+	adc	$t0, $acc3
+	adc	%rax, $acc0
 	mov	$acc2, %rax
-	adc	%rdx, $acc0
-	adc	\$0, $acc1
+	adc	\$0, %rdx

 	##########################################
 	# Third iteration
-	mulq	$a_ptr
-	#xor	$t0, $t0
-	add	$acc2, $acc3
-	adc	\$0, %rdx
-	add	%rax, $acc3
-	mov	$acc2, %rax
-	adc	%rdx, $acc4		# doesn't overflow
-	#adc	\$0, $t0
+	mov	$acc2, $t0
+	shl	\$32, $acc2
+	mov	%rdx, $acc1
 	mulq	$t1
-	xor	$acc2, $acc2
-	#add	$t0, $acc0
-	#adc	\$0, %rdx
-	add	%rax, $acc0
+	shr	\$32, $t0
+	add	$acc2, $acc3
+	adc	$t0, $acc0
+	adc	%rax, $acc1
 	mov	$acc3, %rax
-	adc	%rdx, $acc1
-	adc	\$0, $acc2
+	adc	\$0, %rdx

 	###########################################
 	# Last iteration
-	mulq	$a_ptr
-	#xor	$t0, $t0
-	add	$acc3, $acc4
-	adc	\$0, %rdx
-	add	%rax, $acc4
-	mov	$acc3, %rax
-	adc	%rdx, $acc0		# doesn't overflow
-	#adc	\$0, $t0
+	mov	$acc3, $t0
+	shl	\$32, $acc3
+	mov	%rdx, $acc2
 	mulq	$t1
+	shr	\$32, $t0
+	add	$acc3, $acc0
+	adc	$t0, $acc1
+	adc	%rax, $acc2
+	adc	\$0, %rdx
 	xor	$acc3, $acc3
-	#add	$t0, $acc1
-	#adc	\$0, %rdx
-	add	%rax, $acc1
-	adc	%rdx, $acc2
-	adc	\$0, $acc3

 	############################################
 	# Add the rest of the acc
-	add	$acc0, $acc5
+	add	$acc0, $acc4
+	adc	$acc1, $acc5
+	mov	$acc4, $acc0
-	adc	$acc1, $acc6
-	adc	$acc2, $acc7
+	adc	$acc2, $acc6
+	adc	%rdx, $acc7
+	mov	$acc5, $acc1
 	adc	\$0, $acc3

@@ -989,14 +937,14 @@ __ecp_nistz256_sqr_montq:
 	sbb	\$0, $acc6		# .Lpoly[2]
 	mov	$acc7, $t0
 	sbb	$t1, $acc7		# .Lpoly[3]
-	neg	$acc3
+	sbb	\$0, $acc3

-	cmovnc	$acc0, $acc4
-	cmovnc	$acc1, $acc5
+	cmovc	$acc0, $acc4
+	cmovc	$acc1, $acc5
 	mov	$acc4, 8*0($r_ptr)
-	cmovnc	$acc2, $acc6
+	cmovc	$acc2, $acc6
 	mov	$acc5, 8*1($r_ptr)
-	cmovnc	$t0, $acc7
+	cmovc	$t0, $acc7
 	mov	$acc6, 8*2($r_ptr)
 	mov	$acc7, 8*3($r_ptr)

@@ -1028,18 +976,15 @@ __ecp_nistz256_mul_montx:
 	########################################################################
 	# First reduction step
-	xor	$acc0, $acc0		# $acc0=0,cf=0,of=0
-	adox	$t1, $acc1
-	adox	$t0, $acc2
+	add	$t1, $acc1
+	adc	$t0, $acc2

 	mulx	$poly3, $t0, $t1
 	mov	8*1($b_ptr), %rdx
-	adox	$t0, $acc3
-	adcx	$t1, $acc4
-	adox	$acc0, $acc4
-	adcx	$acc0, $acc5		# cf=0
-	adox	$acc0, $acc5		# of=0
+	adc	$t0, $acc3
+	adc	$t1, $acc4
+	adc	\$0, $acc5
+	xor	$acc0, $acc0		# $acc0=0,cf=0,of=0

 	########################################################################
 	# Multiply by b[1]

@@ -1068,18 +1013,15 @@ __ecp_nistz256_mul_montx:
 	########################################################################
 	# Second reduction step
-	xor	$acc1, $acc1		# $acc1=0,cf=0,of=0
-	adox	$t0, $acc2
-	adox	$t1, $acc3
+	add	$t0, $acc2
+	adc	$t1, $acc3

 	mulx	$poly3, $t0, $t1
 	mov	8*2($b_ptr), %rdx
-	adox	$t0, $acc4
-	adcx	$t1, $acc5
-	adox	$acc1, $acc5
-	adcx	$acc1, $acc0		# cf=0
-	adox	$acc1, $acc0		# of=0
+	adc	$t0, $acc4
+	adc	$t1, $acc5
+	adc	\$0, $acc0
+	xor	$acc1, $acc1		# $acc1=0,cf=0,of=0

 	########################################################################
 	# Multiply by b[2]
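The montx hunks make the same simplification in the BMI2/ADX path: the interleaved adcx/adox double carry chains become one ordinary add/adc chain, the trailing carry is absorbed with adc \$0, and the flag-clearing xor moves to the end of the step. Roughly the same step written with compiler intrinsics (a sketch under assumed names, compiled with -mbmi2 -madx; not OpenSSL's API):

#include <stdint.h>
#include <immintrin.h>

/* Single-carry-chain reduction step, mirroring the add/adc sequence that
 * replaced the adcx/adox version (sketch, hypothetical helper).
 * a[0..4]: accumulator window, a[5] the next carry word. */
static void p256_reduce_step_bmi2(unsigned long long a[6])
{
    unsigned long long hi, lo, u = a[0];
    unsigned char c;

    lo = _mulx_u64(u, 0xffffffff00000001ULL, &hi);  /* u * p[3] */

    /* one chain: u<<96 into limbs 1..2, u*p[3] into limbs 3..4 */
    c = _addcarry_u64(0, a[1], u << 32, &a[1]);
    c = _addcarry_u64(c, a[2], u >> 32, &a[2]);
    c = _addcarry_u64(c, a[3], lo, &a[3]);
    c = _addcarry_u64(c, a[4], hi, &a[4]);
    (void)_addcarry_u64(c, a[5], 0, &a[5]);         /* adc $0, acc5 */
    a[0] = 0;
}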
@@ -1108,18 +1050,15 @@ __ecp_nistz256_mul_montx:
 	########################################################################
 	# Third reduction step
-	xor	$acc2, $acc2		# $acc2=0,cf=0,of=0
-	adox	$t0, $acc3
-	adox	$t1, $acc4
+	add	$t0, $acc3
+	adc	$t1, $acc4

 	mulx	$poly3, $t0, $t1
 	mov	8*3($b_ptr), %rdx
-	adox	$t0, $acc5
-	adcx	$t1, $acc0
-	adox	$acc2, $acc0
-	adcx	$acc2, $acc1		# cf=0
-	adox	$acc2, $acc1		# of=0
+	adc	$t0, $acc5
+	adc	$t1, $acc0
+	adc	\$0, $acc1
+	xor	$acc2, $acc2		# $acc2=0,cf=0,of=0

 	########################################################################
 	# Multiply by b[3]

@@ -1148,38 +1087,34 @@ __ecp_nistz256_mul_montx:
 	########################################################################
 	# Fourth reduction step
-	xor	$acc3, $acc3		# $acc3=0,cf=0,of=0
-	adox	$t0, $acc4
-	adox	$t1, $acc5
+	add	$t0, $acc4
+	adc	$t1, $acc5

 	mulx	$poly3, $t0, $t1
 	mov	$acc4, $t2
 	mov	.Lpoly+8*1(%rip), $poly1
-	adcx	$t0, $acc0
-	adox	$t1, $acc1
+	adc	$t0, $acc0
 	mov	$acc5, $t3
-	adcx	$acc3, $acc1
-	adox	$acc3, $acc2
+	adc	$t1, $acc1
+	adc	\$0, $acc2
+	mov	$acc0, $t0

 	########################################################################
 	# Branch-less conditional subtraction of P
-	xor	%eax, %eax
-	mov	$acc0, $t0
 	sbb	\$-1, $acc4		# .Lpoly[0]
 	sbb	$poly1, $acc5		# .Lpoly[1]
 	sbb	\$0, $acc0		# .Lpoly[2]
 	mov	$acc1, $t1
 	sbb	$poly3, $acc1		# .Lpoly[3]
 	sbb	\$0, $acc2

-	bt	\$0, $acc2
-	cmovnc	$t2, $acc4
-	cmovnc	$t3, $acc5
+	cmovc	$t2, $acc4
+	cmovc	$t3, $acc5
 	mov	$acc4, 8*0($r_ptr)
-	cmovnc	$t0, $acc0
+	cmovc	$t0, $acc0
 	mov	$acc5, 8*1($r_ptr)
-	cmovnc	$t1, $acc1
+	cmovc	$t1, $acc1
 	mov	$acc0, 8*2($r_ptr)
 	mov	$acc1, 8*3($r_ptr)

@@ -1247,52 +1182,44 @@ __ecp_nistz256_sqr_montx:
 	mov	.Lpoly+8*3(%rip), $t1

 	# reduction step 1
-	xor	$acc0, $acc0
-	adcx	$t0, $acc1
-	adcx	$t4, $acc2
+	add	$t0, $acc1
+	adc	$t4, $acc2

-	mulx	$t1, $t0, $t4
+	mulx	$t1, $t0, $acc0
 	mov	$acc1, %rdx
-	adcx	$t0, $acc3
+	adc	$t0, $acc3
 	shlx	$a_ptr, $acc1, $t0
-	adox	$t4, $acc0
-	shrx	$a_ptr, $acc1, $t4
+	adc	\$0, $acc0
+	shrx	$a_ptr, $acc1, $t4

 	# reduction step 2
-	xor	$acc1, $acc1
-	adcx	$t0, $acc2
-	adcx	$t4, $acc3
+	add	$t0, $acc2
+	adc	$t4, $acc3

-	mulx	$t1, $t0, $t4
+	mulx	$t1, $t0, $acc1
 	mov	$acc2, %rdx
-	adcx	$t0, $acc0
+	adc	$t0, $acc0
 	shlx	$a_ptr, $acc2, $t0
-	adox	$t4, $acc1
-	shrx	$a_ptr, $acc2, $t4
+	adc	\$0, $acc1
+	shrx	$a_ptr, $acc2, $t4

 	# reduction step 3
-	xor	$acc2, $acc2
-	adcx	$t0, $acc3
-	adcx	$t4, $acc0
+	add	$t0, $acc3
+	adc	$t4, $acc0

-	mulx	$t1, $t0, $t4
+	mulx	$t1, $t0, $acc2
 	mov	$acc3, %rdx
-	adcx	$t0, $acc1
+	adc	$t0, $acc1
 	shlx	$a_ptr, $acc3, $t0
-	adox	$t4, $acc2
-	shrx	$a_ptr, $acc3, $t4
+	adc	\$0, $acc2
+	shrx	$a_ptr, $acc3, $t4

 	# reduction step 4
-	xor	$acc3, $acc3
-	adcx	$t0, $acc0
-	adcx	$t4, $acc1
+	add	$t0, $acc0
+	adc	$t4, $acc1

-	mulx	$t1, $t0, $t4
-	adcx	$t0, $acc2
-	adox	$t4, $acc3
+	mulx	$t1, $t0, $acc3
+	adc	$t0, $acc2
+	adc	\$0, $acc3

 	xor	$t3, $t3		# cf=0

@@ -1312,14 +1239,14 @@ __ecp_nistz256_sqr_montx:
 	sbb	\$0, $acc6		# .Lpoly[2]
 	mov	$acc7, $acc3
 	sbb	$t1, $acc7		# .Lpoly[3]
 	sbb	\$0, $t3

-	bt	\$0, $t3
-	cmovnc	$acc0, $acc4
-	cmovnc	$acc1, $acc5
+	cmovc	$acc0, $acc4
+	cmovc	$acc1, $acc5
 	mov	$acc4, 8*0($r_ptr)
-	cmovnc	$acc2, $acc6
+	cmovc	$acc2, $acc6
 	mov	$acc5, 8*1($r_ptr)
-	cmovnc	$acc3, $acc7
+	cmovc	$acc3, $acc7
 	mov	$acc6, 8*2($r_ptr)
 	mov	$acc7, 8*3($r_ptr)

@@ -1330,8 +1257,8 @@ ___
 }
 {
 my ($r_ptr,$in_ptr)=("%rdi","%rsi");
-my ($acc0,$acc1,$acc2,$acc3,$acc4)=map("%r$_",(8..12));
-my ($t0,$t1)=("%rcx","%rsi");
+my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
+my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
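The tails of all four routines switch to the same select idiom: instead of deriving the condition from the top word with neg or bt, the sbb chain is simply extended through the carry word, and cmovc restores the saved, pre-subtraction limbs when the subtraction borrowed, i.e. when the value was already below p. A constant-time C sketch of that final subtraction (hypothetical helper; masking stands in for cmov):

#include <stdint.h>

/* Branch-less "subtract p if result >= p" (sketch of the cmovc tail).
 * val[0..3] plus one carry bit is known to be < 2p. */
static void p256_final_sub(uint64_t r[4], const uint64_t val[4],
                           uint64_t carry /* 0 or 1 */)
{
    static const uint64_t p[4] = {
        0xffffffffffffffffULL, 0x00000000ffffffffULL,
        0x0000000000000000ULL, 0xffffffff00000001ULL
    };
    uint64_t sub[4], borrow = 0;

    for (int i = 0; i < 4; i++) {   /* the sbb chain over the four limbs */
        unsigned __int128 d = (unsigned __int128)val[i] - p[i] - borrow;
        sub[i] = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;
    }
    /* final "sbb $0, carry": CF set  <=>  val + carry*2^256 < p */
    uint64_t mask = 0 - (uint64_t)(carry < borrow);  /* all-ones if CF */

    for (int i = 0; i < 4; i++)     /* cmovc: keep val when CF is set */
        r[i] = (val[i] & mask) | (sub[i] & ~mask);
}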
 $code.=<<___;
 ################################################################################

@@ -1348,109 +1275,83 @@ ecp_nistz256_from_mont:
 	push	%r13

 	mov	8*0($in_ptr), %rax
+	mov	.Lpoly+8*3(%rip), $t2
 	mov	8*1($in_ptr), $acc1
 	mov	8*2($in_ptr), $acc2
 	mov	8*3($in_ptr), $acc3
-	lea	.Lpoly(%rip), $in_ptr
-	xor	$acc4, $acc4
 	mov	%rax, $acc0
+	mov	.Lpoly+8*1(%rip), $t1

 	#########################################
 	# First iteration
-	mulq	1*8($in_ptr)
-	xor	$t0, $t0
+	mov	%rax, $t0
+	shl	\$32, $acc0
+	mulq	$t2
+	shr	\$32, $t0
 	add	$acc0, $acc1
-	adc	\$0, %rdx
-	add	%rax, $acc1
-	mov	$acc0, %rax
-	adc	%rdx, $acc2
-	adc	\$0, $t0
-	mulq	3*8($in_ptr)
-	xor	$acc0, $acc0
-	add	$t0, $acc3
-	adc	\$0, %rdx
-	add	%rax, $acc3
+	adc	$t0, $acc2
+	adc	%rax, $acc3
 	mov	$acc1, %rax
-	adc	%rdx, $acc4
-	adc	\$0, $acc0
+	adc	\$0, %rdx

 	#########################################
 	# Second iteration
-	mulq	1*8($in_ptr)
-	xor	$t0, $t0
+	mov	$acc1, $t0
+	shl	\$32, $acc1
+	mov	%rdx, $acc0
+	mulq	$t2
+	shr	\$32, $t0
 	add	$acc1, $acc2
-	adc	\$0, %rdx
-	add	%rax, $acc2
-	mov	$acc1, %rax
-	adc	%rdx, $acc3
-	adc	\$0, $t0
-	mulq	3*8($in_ptr)
-	xor	$acc1, $acc1
-	add	$t0, $acc4
-	adc	\$0, %rdx
-	add	%rax, $acc4
+	adc	$t0, $acc3
+	adc	%rax, $acc0
 	mov	$acc2, %rax
-	adc	%rdx, $acc0
-	adc	\$0, $acc1
+	adc	\$0, %rdx

 	##########################################
 	# Third iteration
-	mulq	1*8($in_ptr)
-	xor	$t0, $t0
+	mov	$acc2, $t0
+	shl	\$32, $acc2
+	mov	%rdx, $acc1
+	mulq	$t2
+	shr	\$32, $t0
 	add	$acc2, $acc3
-	adc	\$0, %rdx
-	add	%rax, $acc3
-	mov	$acc2, %rax
-	adc	%rdx, $acc4
-	adc	\$0, $t0
-	mulq	3*8($in_ptr)
-	xor	$acc2, $acc2
-	add	$t0, $acc0
-	adc	\$0, %rdx
-	add	%rax, $acc0
+	adc	$t0, $acc0
+	adc	%rax, $acc1
 	mov	$acc3, %rax
-	adc	%rdx, $acc1
-	adc	\$0, $acc2
+	adc	\$0, %rdx

 	###########################################
 	# Last iteration
-	mulq	1*8($in_ptr)
-	xor	$t0, $t0
-	add	$acc3, $acc4
-	adc	\$0, %rdx
-	add	%rax, $acc4
-	mov	$acc3, %rax
-	adc	%rdx, $acc0
-	adc	\$0, $t0
-	mulq	3*8($in_ptr)
-	add	$t0, $acc1
+	mov	$acc3, $t0
+	shl	\$32, $acc3
+	mov	%rdx, $acc2
+	mulq	$t2
+	shr	\$32, $t0
+	add	$acc3, $acc0
+	adc	$t0, $acc1
+	mov	$acc0, $t0
+	adc	%rax, $acc2
+	mov	$acc1, $in_ptr
 	adc	\$0, %rdx
-	add	%rax, $acc1
-	adc	%rdx, $acc2
-	sbb	$acc3, $acc3

-	mov	0*8($in_ptr), %rax
-	mov	1*8($in_ptr), %rdx
-	mov	2*8($in_ptr), $t0
-	mov	3*8($in_ptr), $t1

-	and	$acc3, %rax
-	and	$acc3, %rdx
-	and	$acc3, $t0
-	and	$acc3, $t1

-	sub	%rax, $acc4
-	sbb	%rdx, $acc0
-	mov	$acc4, 8*0($r_ptr)
-	sbb	$t0, $acc1
-	mov	$acc0, 8*1($r_ptr)
-	sbb	$t1, $acc2
-	mov	$acc1, 8*2($r_ptr)
-	mov	$acc2, 8*3($r_ptr)
+	###########################################
+	# Branch-less conditional subtraction
+	sub	\$-1, $acc0
+	mov	$acc2, %rax
+	sbb	$t1, $acc1
+	sbb	\$0, $acc2
+	mov	%rdx, $acc3
+	sbb	$t2, %rdx
+	sbb	$t2, $t2

+	cmovnz	$t0, $acc0
+	cmovnz	$in_ptr, $acc1
+	mov	$acc0, 8*0($r_ptr)
+	cmovnz	%rax, $acc2
+	mov	$acc1, 8*1($r_ptr)
+	cmovz	%rdx, $acc3
+	mov	$acc2, 8*2($r_ptr)
+	mov	$acc3, 8*3($r_ptr)

 	pop	%r13
 	pop	%r12
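Putting the pieces together, the rewritten ecp_nistz256_from_mont is four of the shift-based reduction steps followed by one branch-less subtraction; the old mask-and-subtract tail disappears. A self-contained C model of the whole conversion, handy for cross-checking the hunks above on random inputs (illustrative code, not the OpenSSL entry point):

#include <stdint.h>

typedef unsigned __int128 u128;

/* C model of ecp_nistz256_from_mont's algorithm (sketch): computes
 * in * 2^-256 mod p by four word-wise reduction steps, then one
 * conditional subtraction of p. Little-endian 64-bit limbs. */
static void p256_from_mont(uint64_t r[4], const uint64_t in[4])
{
    static const uint64_t p1 = 0x00000000ffffffffULL;
    static const uint64_t p3 = 0xffffffff00000001ULL;
    uint64_t a[5] = { in[0], in[1], in[2], in[3], 0 };

    for (int i = 0; i < 4; i++) {          /* one step per limb */
        uint64_t u = a[0];
        u128 t, m = (u128)u * p3;          /* the single multiply */

        /* shift the window down one word while folding in u*p */
        t = (u128)a[1] + (u << 32);               a[0] = (uint64_t)t;
        t = (u128)a[2] + (u >> 32) + (t >> 64);   a[1] = (uint64_t)t;
        t = (u128)a[3] + (uint64_t)m + (t >> 64); a[2] = (uint64_t)t;
        t = (u128)a[4] + (uint64_t)(m >> 64) + (t >> 64);
        a[3] = (uint64_t)t;
        a[4] = (uint64_t)(t >> 64);
    }

    /* branch-less conditional subtraction, as in the new tail */
    const uint64_t p[4] = { ~0ULL, p1, 0, p3 };
    uint64_t sub[4], borrow = 0;
    for (int i = 0; i < 4; i++) {
        u128 d = (u128)a[i] - p[i] - borrow;
        sub[i] = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;
    }
    uint64_t keep = 0 - (uint64_t)(a[4] < borrow);   /* all-ones: keep a */
    for (int i = 0; i < 4; i++)
        r[i] = (a[i] & keep) | (sub[i] & ~keep);
}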