Commit 120a9e1a authored by Andy Polyakov's avatar Andy Polyakov
Browse files

bn/asm/sparcv9-mont.pl: fix squaring code path.



This module is used only with odd input lengths, i.e. not used in normal
PKI cases, on contemporary processors. The problem was "illuminated" by
fuzzing tests.

Reviewed-by: default avatarRichard Levitte <levitte@openssl.org>
parent 3ba4dac6
Loading
Loading
Loading
Loading
+16 −13
Original line number Diff line number Diff line
@@ -300,7 +300,7 @@ ___
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## code without following dedicated squaring procedure.
########
$sbit="%i2";		# re-use $bp!
$sbit="%o5";

$code.=<<___;
.align	32
@@ -413,7 +413,7 @@ $code.=<<___;
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$car1,$car1
	add	$tpj,$sbit,$sbit
	ld	[$ap+$j],$apj			! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj			! np[j]
@@ -422,7 +422,7 @@ $code.=<<___;
	ld	[$tp+8],$tpj			! tp[j]
	add	$acc0,$acc0,$acc0
	add	$j,4,$j				! j++
	or	$sbit,$acc0,$acc0
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
@@ -436,12 +436,12 @@ $code.=<<___;
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$car1,$car1
	add	$tpj,$sbit,$sbit
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
@@ -449,7 +449,7 @@ $code.=<<___;
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	or	$sbit,$car0,$car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
@@ -509,7 +509,7 @@ $code.=<<___;
.Lsqr_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$tpj,$sbit,$sbit
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj			! ap[j]
	and	$car0,$mask,$acc0
@@ -517,7 +517,7 @@ $code.=<<___;
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	ld	[$tp+8],$tpj			! tp[j]
	or	$sbit,$acc0,$acc0
	add	$sbit,$acc0,$acc0
	add	$j,4,$j				! j++
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
@@ -532,12 +532,12 @@ $code.=<<___;
.Lsqr_no_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$tpj,$sbit,$sbit
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
@@ -546,7 +546,7 @@ $code.=<<___;
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	or	$sbit,$car0,$car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
@@ -591,14 +591,17 @@ $code.=<<___;
!.Lsqr_last

	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$tpj,$acc0,$acc0
	srlx	$acc0,32,$tmp0
	and	$acc0,$mask,$acc0
	add	$tmp0,$sbit,$sbit
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0		! recover $car0
	or	$sbit,$car0,$car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]