Loading crypto/bn/asm/sparcv9-mont.pl +16 −13 Original line number Diff line number Diff line Loading @@ -300,7 +300,7 @@ ___ ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over ######## code without following dedicated squaring procedure. ######## $sbit="%i2"; # re-use $bp! $sbit="%o5"; $code.=<<___; .align 32 Loading Loading @@ -413,7 +413,7 @@ $code.=<<___; mulx $apj,$mul0,$acc0 mulx $npj,$mul1,$acc1 add $acc0,$car0,$car0 add $tpj,$car1,$car1 add $tpj,$sbit,$sbit ld [$ap+$j],$apj ! ap[j] and $car0,$mask,$acc0 ld [$np+$j],$npj ! np[j] Loading @@ -422,7 +422,7 @@ $code.=<<___; ld [$tp+8],$tpj ! tp[j] add $acc0,$acc0,$acc0 add $j,4,$j ! j++ or $sbit,$acc0,$acc0 add $sbit,$acc0,$acc0 srlx $acc0,32,$sbit and $acc0,$mask,$acc0 cmp $j,$num Loading @@ -436,12 +436,12 @@ $code.=<<___; mulx $apj,$mul0,$acc0 mulx $npj,$mul1,$acc1 add $acc0,$car0,$car0 add $tpj,$car1,$car1 add $tpj,$sbit,$sbit and $car0,$mask,$acc0 srlx $car0,32,$car0 add $acc1,$car1,$car1 add $acc0,$acc0,$acc0 or $sbit,$acc0,$acc0 add $sbit,$acc0,$acc0 srlx $acc0,32,$sbit and $acc0,$mask,$acc0 add $acc0,$car1,$car1 Loading @@ -449,7 +449,7 @@ $code.=<<___; srlx $car1,32,$car1 add $car0,$car0,$car0 or $sbit,$car0,$car0 add $sbit,$car0,$car0 add $car0,$car1,$car1 add $car2,$car1,$car1 st $car1,[$tp+4] Loading Loading @@ -509,7 +509,7 @@ $code.=<<___; .Lsqr_inner2: mulx $apj,$mul0,$acc0 mulx $npj,$mul1,$acc1 add $tpj,$car1,$car1 add $tpj,$sbit,$sbit add $acc0,$car0,$car0 ld [$ap+$j],$apj ! ap[j] and $car0,$mask,$acc0 Loading @@ -517,7 +517,7 @@ $code.=<<___; srlx $car0,32,$car0 add $acc0,$acc0,$acc0 ld [$tp+8],$tpj ! tp[j] or $sbit,$acc0,$acc0 add $sbit,$acc0,$acc0 add $j,4,$j ! j++ srlx $acc0,32,$sbit and $acc0,$mask,$acc0 Loading @@ -532,12 +532,12 @@ $code.=<<___; .Lsqr_no_inner2: mulx $apj,$mul0,$acc0 mulx $npj,$mul1,$acc1 add $tpj,$car1,$car1 add $tpj,$sbit,$sbit add $acc0,$car0,$car0 and $car0,$mask,$acc0 srlx $car0,32,$car0 add $acc0,$acc0,$acc0 or $sbit,$acc0,$acc0 add $sbit,$acc0,$acc0 srlx $acc0,32,$sbit and $acc0,$mask,$acc0 add $acc0,$car1,$car1 Loading @@ -546,7 +546,7 @@ $code.=<<___; srlx $car1,32,$car1 add $car0,$car0,$car0 or $sbit,$car0,$car0 add $sbit,$car0,$car0 add $car0,$car1,$car1 add $car2,$car1,$car1 st $car1,[$tp+4] Loading Loading @@ -591,14 +591,17 @@ $code.=<<___; !.Lsqr_last mulx $npj,$mul1,$acc1 add $tpj,$car1,$car1 add $tpj,$acc0,$acc0 srlx $acc0,32,$tmp0 and $acc0,$mask,$acc0 add $tmp0,$sbit,$sbit add $acc0,$car1,$car1 add $acc1,$car1,$car1 st $car1,[$tp] srlx $car1,32,$car1 add $car0,$car0,$car0 ! recover $car0 or $sbit,$car0,$car0 add $sbit,$car0,$car0 add $car0,$car1,$car1 add $car2,$car1,$car1 st $car1,[$tp+4] Loading Loading
crypto/bn/asm/sparcv9-mont.pl +16 −13 Original line number Diff line number Diff line Loading @@ -300,7 +300,7 @@ ___ ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over ######## code without following dedicated squaring procedure. ######## $sbit="%i2"; # re-use $bp! $sbit="%o5"; $code.=<<___; .align 32 Loading Loading @@ -413,7 +413,7 @@ $code.=<<___; mulx $apj,$mul0,$acc0 mulx $npj,$mul1,$acc1 add $acc0,$car0,$car0 add $tpj,$car1,$car1 add $tpj,$sbit,$sbit ld [$ap+$j],$apj ! ap[j] and $car0,$mask,$acc0 ld [$np+$j],$npj ! np[j] Loading @@ -422,7 +422,7 @@ $code.=<<___; ld [$tp+8],$tpj ! tp[j] add $acc0,$acc0,$acc0 add $j,4,$j ! j++ or $sbit,$acc0,$acc0 add $sbit,$acc0,$acc0 srlx $acc0,32,$sbit and $acc0,$mask,$acc0 cmp $j,$num Loading @@ -436,12 +436,12 @@ $code.=<<___; mulx $apj,$mul0,$acc0 mulx $npj,$mul1,$acc1 add $acc0,$car0,$car0 add $tpj,$car1,$car1 add $tpj,$sbit,$sbit and $car0,$mask,$acc0 srlx $car0,32,$car0 add $acc1,$car1,$car1 add $acc0,$acc0,$acc0 or $sbit,$acc0,$acc0 add $sbit,$acc0,$acc0 srlx $acc0,32,$sbit and $acc0,$mask,$acc0 add $acc0,$car1,$car1 Loading @@ -449,7 +449,7 @@ $code.=<<___; srlx $car1,32,$car1 add $car0,$car0,$car0 or $sbit,$car0,$car0 add $sbit,$car0,$car0 add $car0,$car1,$car1 add $car2,$car1,$car1 st $car1,[$tp+4] Loading Loading @@ -509,7 +509,7 @@ $code.=<<___; .Lsqr_inner2: mulx $apj,$mul0,$acc0 mulx $npj,$mul1,$acc1 add $tpj,$car1,$car1 add $tpj,$sbit,$sbit add $acc0,$car0,$car0 ld [$ap+$j],$apj ! ap[j] and $car0,$mask,$acc0 Loading @@ -517,7 +517,7 @@ $code.=<<___; srlx $car0,32,$car0 add $acc0,$acc0,$acc0 ld [$tp+8],$tpj ! tp[j] or $sbit,$acc0,$acc0 add $sbit,$acc0,$acc0 add $j,4,$j ! j++ srlx $acc0,32,$sbit and $acc0,$mask,$acc0 Loading @@ -532,12 +532,12 @@ $code.=<<___; .Lsqr_no_inner2: mulx $apj,$mul0,$acc0 mulx $npj,$mul1,$acc1 add $tpj,$car1,$car1 add $tpj,$sbit,$sbit add $acc0,$car0,$car0 and $car0,$mask,$acc0 srlx $car0,32,$car0 add $acc0,$acc0,$acc0 or $sbit,$acc0,$acc0 add $sbit,$acc0,$acc0 srlx $acc0,32,$sbit and $acc0,$mask,$acc0 add $acc0,$car1,$car1 Loading @@ -546,7 +546,7 @@ $code.=<<___; srlx $car1,32,$car1 add $car0,$car0,$car0 or $sbit,$car0,$car0 add $sbit,$car0,$car0 add $car0,$car1,$car1 add $car2,$car1,$car1 st $car1,[$tp+4] Loading Loading @@ -591,14 +591,17 @@ $code.=<<___; !.Lsqr_last mulx $npj,$mul1,$acc1 add $tpj,$car1,$car1 add $tpj,$acc0,$acc0 srlx $acc0,32,$tmp0 and $acc0,$mask,$acc0 add $tmp0,$sbit,$sbit add $acc0,$car1,$car1 add $acc1,$car1,$car1 st $car1,[$tp] srlx $car1,32,$car1 add $car0,$car0,$car0 ! recover $car0 or $sbit,$car0,$car0 add $sbit,$car0,$car0 add $car0,$car1,$car1 add $car2,$car1,$car1 st $car1,[$tp+4] Loading