Loading crypto/bn/asm/mips.pl +131 −480 Original line number Diff line number Diff line Loading @@ -1872,6 +1872,41 @@ ___ ($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3); sub add_c2 () { my ($hi,$lo,$c0,$c1,$c2, $warm, # !$warm denotes first call with specific sequence of # $c_[XYZ] when there is no Z-carry to accumulate yet; $an,$bn # these two are arguments for multiplication which # result is used in *next* step [which is why it's # commented as "forward multiplication" below]; )=@_; $code.=<<___; mflo $lo mfhi $hi $ADDU $c0,$lo sltu $at,$c0,$lo $MULTU $an,$bn # forward multiplication $ADDU $c0,$lo $ADDU $at,$hi sltu $lo,$c0,$lo $ADDU $c1,$at $ADDU $hi,$lo ___ $code.=<<___ if (!$warm); sltu $c2,$c1,$at $ADDU $c1,$hi sltu $hi,$c1,$hi $ADDU $c2,$hi ___ $code.=<<___ if ($warm); sltu $at,$c1,$at $ADDU $c1,$hi $ADDU $c2,$at sltu $hi,$c1,$hi $ADDU $c2,$hi ___ } $code.=<<___; .align 5 Loading Loading @@ -1920,21 +1955,10 @@ $code.=<<___; sltu $at,$c_2,$t_1 $ADDU $c_3,$t_2,$at $ST $c_2,$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_2,$t_2,$zero $SLL $t_2,1 $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at ___ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2); $code.=<<___; mflo $t_1 mfhi $t_2 $ADDU $c_3,$t_1 Loading @@ -1945,67 +1969,19 @@ $code.=<<___; sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,2*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_3,$t_2,$zero $SLL $t_2,1 $MULTU $a_1,$a_2 # mul_add_c2(a[1],b[2],c1,c2,c3); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_3,$at $MULTU $a_4,$a_0 # mul_add_c2(a[4],b[0],c2,c3,c1); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at ___ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3); &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, $a_4,$a_0); # mul_add_c2(a[4],b[0],c2,c3,c1); $code.=<<___; $ST $c_1,3*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_1,$t_2,$zero $SLL $t_2,1 $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_1,$at $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at ___ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1); &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1); $code.=<<___; mflo $t_1 mfhi $t_2 $ADDU $c_2,$t_1 Loading @@ -2016,97 +1992,23 @@ $code.=<<___; sltu $at,$c_3,$t_2 $ADDU $c_1,$at $ST $c_2,4*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_2,$t_2,$zero $SLL $t_2,1 $MULTU $a_1,$a_4 # mul_add_c2(a[1],b[4],c3,c1,c2); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_2,$at $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $MULTU $a_6,$a_0 # mul_add_c2(a[6],b[0],c1,c2,c3); $ADDU $c_2,$at $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at ___ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_1,$a_4); # mul_add_c2(a[1],b[4],c3,c1,c2); &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, $a_2,$a_3); # mul_add_c2(a[2],b[3],c3,c1,c2); &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, $a_6,$a_0); # mul_add_c2(a[6],b[0],c1,c2,c3); $code.=<<___; $ST $c_3,5*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_3,$t_2,$zero $SLL $t_2,1 $MULTU $a_5,$a_1 # mul_add_c2(a[5],b[1],c1,c2,c3); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_3,$at $MULTU $a_4,$a_2 # mul_add_c2(a[4],b[2],c1,c2,c3); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_3,$at $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at ___ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, $a_5,$a_1); # mul_add_c2(a[5],b[1],c1,c2,c3); &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, $a_4,$a_2); # mul_add_c2(a[4],b[2],c1,c2,c3); &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3); $code.=<<___; mflo $t_1 mfhi $t_2 $ADDU $c_1,$t_1 Loading @@ -2117,112 +2019,25 @@ $code.=<<___; sltu $at,$c_2,$t_2 $ADDU $c_3,$at $ST $c_1,6*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_1,$t_2,$zero $SLL $t_2,1 $MULTU $a_1,$a_6 # mul_add_c2(a[1],b[6],c2,c3,c1); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_1,$at $MULTU $a_2,$a_5 # mul_add_c2(a[2],b[5],c2,c3,c1); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_1,$at $MULTU $a_3,$a_4 # mul_add_c2(a[3],b[4],c2,c3,c1); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_1,$at $MULTU $a_7,$a_1 # mul_add_c2(a[7],b[1],c3,c1,c2); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at ___ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, $a_1,$a_6); # mul_add_c2(a[1],b[6],c2,c3,c1); &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, $a_2,$a_5); # mul_add_c2(a[2],b[5],c2,c3,c1); &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, $a_3,$a_4); # mul_add_c2(a[3],b[4],c2,c3,c1); &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, $a_7,$a_1); # mul_add_c2(a[7],b[1],c3,c1,c2); $code.=<<___; $ST $c_2,7*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_2,$t_2,$zero $SLL $t_2,1 $MULTU $a_6,$a_2 # mul_add_c2(a[6],b[2],c3,c1,c2); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_2,$at $MULTU $a_5,$a_3 # mul_add_c2(a[5],b[3],c3,c1,c2); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_2,$at $MULTU $a_4,$a_4 # mul_add_c(a[4],b[4],c3,c1,c2); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at ___ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_6,$a_2); # mul_add_c2(a[6],b[2],c3,c1,c2); &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, $a_5,$a_3); # mul_add_c2(a[5],b[3],c3,c1,c2); &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, $a_4,$a_4); # mul_add_c(a[4],b[4],c3,c1,c2); $code.=<<___; mflo $t_1 mfhi $t_2 $ADDU $c_3,$t_1 Loading @@ -2233,82 +2048,21 @@ $code.=<<___; sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,8*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_3,$t_2,$zero $SLL $t_2,1 $MULTU $a_3,$a_6 # mul_add_c2(a[3],b[6],c1,c2,c3); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_3,$at $MULTU $a_4,$a_5 # mul_add_c2(a[4],b[5],c1,c2,c3); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_3,$at $MULTU $a_7,$a_3 # mul_add_c2(a[7],b[3],c2,c3,c1); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at ___ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, $a_3,$a_6); # mul_add_c2(a[3],b[6],c1,c2,c3); &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, $a_4,$a_5); # mul_add_c2(a[4],b[5],c1,c2,c3); &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, $a_7,$a_3); # mul_add_c2(a[7],b[3],c2,c3,c1); $code.=<<___; $ST $c_1,9*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_1,$t_2,$zero $SLL $t_2,1 $MULTU $a_6,$a_4 # mul_add_c2(a[6],b[4],c2,c3,c1); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_1,$at $MULTU $a_5,$a_5 # mul_add_c(a[5],b[5],c2,c3,c1); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at ___ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, $a_6,$a_4); # mul_add_c2(a[6],b[4],c2,c3,c1); &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, $a_5,$a_5); # mul_add_c(a[5],b[5],c2,c3,c1); $code.=<<___; mflo $t_1 mfhi $t_2 $ADDU $c_2,$t_1 Loading @@ -2319,52 +2073,17 @@ $code.=<<___; sltu $at,$c_3,$t_2 $ADDU $c_1,$at $ST $c_2,10*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_2,$t_2,$zero $SLL $t_2,1 $MULTU $a_5,$a_6 # mul_add_c2(a[5],b[6],c3,c1,c2); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_2,$at $MULTU $a_7,$a_5 # mul_add_c2(a[7],b[5],c1,c2,c3); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at ___ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_5,$a_6); # mul_add_c2(a[5],b[6],c3,c1,c2); &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, $a_7,$a_5); # mul_add_c2(a[7],b[5],c1,c2,c3); $code.=<<___; $ST $c_3,11*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_3,$t_2,$zero $SLL $t_2,1 $MULTU $a_6,$a_6 # mul_add_c(a[6],b[6],c1,c2,c3); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at ___ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, $a_6,$a_6); # mul_add_c(a[6],b[6],c1,c2,c3); $code.=<<___; mflo $t_1 mfhi $t_2 $ADDU $c_1,$t_1 Loading @@ -2375,21 +2094,10 @@ $code.=<<___; sltu $at,$c_2,$t_2 $ADDU $c_3,$at $ST $c_1,12*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_1,$t_2,$zero $SLL $t_2,1 $MULTU $a_7,$a_7 # mul_add_c(a[7],b[7],c3,c1,c2); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at ___ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, $a_7,$a_7); # mul_add_c(a[7],b[7],c3,c1,c2); $code.=<<___; $ST $c_2,13*$BNSZ($a0) mflo $t_1 Loading Loading @@ -2457,21 +2165,10 @@ $code.=<<___; sltu $at,$c_2,$t_1 $ADDU $c_3,$t_2,$at $ST $c_2,$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_2,$t_2,$zero $SLL $t_2,1 $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at ___ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2); $code.=<<___; mflo $t_1 mfhi $t_2 $ADDU $c_3,$t_1 Loading @@ -2482,52 +2179,17 @@ $code.=<<___; sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,2*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_3,$t_2,$zero $SLL $t_2,1 $MULTU $a_1,$a_2 # mul_add_c(a2[1],b[2],c1,c2,c3); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_3,$at $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at ___ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, $a_1,$a_2); # mul_add_c2(a2[1],b[2],c1,c2,c3); &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1); $code.=<<___; $ST $c_1,3*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_1,$t_2,$zero $SLL $t_2,1 $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at ___ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1); $code.=<<___; mflo $t_1 mfhi $t_2 $ADDU $c_2,$t_1 Loading @@ -2538,21 +2200,10 @@ $code.=<<___; sltu $at,$c_3,$t_2 $ADDU $c_1,$at $ST $c_2,4*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_2,$t_2,$zero $SLL $t_2,1 $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at ___ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3); $code.=<<___; $ST $c_3,5*$BNSZ($a0) mflo $t_1 Loading crypto/bn/asm/x86_64-gcc.c +48 −53 Original line number Diff line number Diff line Loading @@ -276,76 +276,75 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */ /* * Keep in mind that carrying into high part of multiplication result * can not overflow, because it cannot be all-ones. */ #if 0 /* original macros are kept for reference purposes */ #define mul_add_c(a,b,c0,c1,c2) { \ #define mul_add_c(a,b,c0,c1,c2) do { \ BN_ULONG ta = (a), tb = (b); \ t1 = ta * tb; \ t2 = BN_UMULT_HIGH(ta,tb); \ c0 += t1; t2 += (c0<t1)?1:0; \ c1 += t2; c2 += (c1<t2)?1:0; \ } BN_ULONG lo, hi; \ BN_UMULT_LOHI(lo,hi,ta,tb); \ c0 += lo; hi += (c0<lo)?1:0; \ c1 += hi; c2 += (c1<hi)?1:0; \ } while(0) #define mul_add_c2(a,b,c0,c1,c2) { \ BN_ULONG ta=(a),tb=(b),t0; \ t1 = BN_UMULT_HIGH(ta,tb); \ t0 = ta * tb; \ t2 = t1+t1; c2 += (t2<t1)?1:0; \ t1 = t0+t0; t2 += (t1<t0)?1:0; \ c0 += t1; t2 += (c0<t1)?1:0; \ c1 += t2; c2 += (c1<t2)?1:0; \ } #define mul_add_c2(a,b,c0,c1,c2) do { \ BN_ULONG ta = (a), tb = (b); \ BN_ULONG lo, hi, tt; \ BN_UMULT_LOHI(lo,hi,ta,tb); \ c0 += lo; tt = hi+((c0<lo)?1:0); \ c1 += tt; c2 += (c1<tt)?1:0; \ c0 += lo; hi += (c0<lo)?1:0; \ c1 += hi; c2 += (c1<hi)?1:0; \ } while(0) #define sqr_add_c(a,i,c0,c1,c2) do { \ BN_ULONG ta = (a)[i]; \ BN_ULONG lo, hi; \ BN_UMULT_LOHI(lo,hi,ta,ta); \ c0 += lo; hi += (c0<lo)?1:0; \ c1 += hi; c2 += (c1<hi)?1:0; \ } while(0) #else #define mul_add_c(a,b,c0,c1,c2) do { \ BN_ULONG t1,t2; \ asm ("mulq %3" \ : "=a"(t1),"=d"(t2) \ : "a"(a),"m"(b) \ : "cc"); \ asm ("addq %2,%0; adcq %3,%1" \ : "+r"(c0),"+d"(t2) \ : "a"(t1),"g"(0) \ : "cc"); \ asm ("addq %2,%0; adcq %3,%1" \ : "+r"(c1),"+r"(c2) \ : "d"(t2),"g"(0) \ asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ : "+r"(c0),"+r"(c1),"+r"(c2) \ : "r"(t1),"r"(t2),"g"(0) \ : "cc"); \ } while (0) #define sqr_add_c(a,i,c0,c1,c2) do { \ BN_ULONG t1,t2; \ asm ("mulq %2" \ : "=a"(t1),"=d"(t2) \ : "a"(a[i]) \ : "cc"); \ asm ("addq %2,%0; adcq %3,%1" \ : "+r"(c0),"+d"(t2) \ : "a"(t1),"g"(0) \ : "cc"); \ asm ("addq %2,%0; adcq %3,%1" \ : "+r"(c1),"+r"(c2) \ : "d"(t2),"g"(0) \ asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ : "+r"(c0),"+r"(c1),"+r"(c2) \ : "r"(t1),"r"(t2),"g"(0) \ : "cc"); \ } while (0) #define mul_add_c2(a,b,c0,c1,c2) do { \ BN_ULONG t1,t2; \ asm ("mulq %3" \ : "=a"(t1),"=d"(t2) \ : "a"(a),"m"(b) \ : "cc"); \ asm ("addq %0,%0; adcq %2,%1" \ : "+d"(t2),"+r"(c2) \ : "g"(0) \ : "cc"); \ asm ("addq %0,%0; adcq %2,%1" \ : "+a"(t1),"+d"(t2) \ : "g"(0) \ : "cc"); \ asm ("addq %2,%0; adcq %3,%1" \ : "+r"(c0),"+d"(t2) \ : "a"(t1),"g"(0) \ asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ : "+r"(c0),"+r"(c1),"+r"(c2) \ : "r"(t1),"r"(t2),"g"(0) \ : "cc"); \ asm ("addq %2,%0; adcq %3,%1" \ : "+r"(c1),"+r"(c2) \ : "d"(t2),"g"(0) \ asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ : "+r"(c0),"+r"(c1),"+r"(c2) \ : "r"(t1),"r"(t2),"g"(0) \ : "cc"); \ } while (0) #endif Loading @@ -355,7 +354,6 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) { BN_ULONG t1,t2; BN_ULONG c1,c2,c3; c1=0; Loading Loading @@ -459,7 +457,6 @@ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) { BN_ULONG t1,t2; BN_ULONG c1,c2,c3; c1=0; Loading Loading @@ -499,7 +496,6 @@ void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) { BN_ULONG t1,t2; BN_ULONG c1,c2,c3; c1=0; Loading Loading @@ -575,7 +571,6 @@ void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) { BN_ULONG t1,t2; BN_ULONG c1,c2,c3; c1=0; Loading crypto/bn/bn_asm.c +122 −119 Original line number Diff line number Diff line Loading @@ -439,116 +439,143 @@ BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */ #ifdef BN_LLONG #define mul_add_c(a,b,c0,c1,c2) \ t=(BN_ULLONG)a*b; \ t1=(BN_ULONG)Lw(t); \ t2=(BN_ULONG)Hw(t); \ c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; #define mul_add_c2(a,b,c0,c1,c2) \ t=(BN_ULLONG)a*b; \ tt=(t+t)&BN_MASK; \ if (tt < t) c2++; \ t1=(BN_ULONG)Lw(tt); \ t2=(BN_ULONG)Hw(tt); \ c0=(c0+t1)&BN_MASK2; \ if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \ c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; #define sqr_add_c(a,i,c0,c1,c2) \ t=(BN_ULLONG)a[i]*a[i]; \ t1=(BN_ULONG)Lw(t); \ t2=(BN_ULONG)Hw(t); \ c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; /* * Keep in mind that additions to multiplication result can not * overflow, because its high half cannot be all-ones. */ #define mul_add_c(a,b,c0,c1,c2) do { \ BN_ULONG hi; \ BN_ULLONG t = (BN_ULLONG)(a)*(b); \ t += c0; /* no carry */ \ c0 = (BN_ULONG)Lw(t); \ hi = (BN_ULONG)Hw(t); \ c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ } while(0) #define mul_add_c2(a,b,c0,c1,c2) do { \ BN_ULONG hi; \ BN_ULLONG t = (BN_ULLONG)(a)*(b); \ BN_ULLONG tt = t+c0; /* no carry */ \ c0 = (BN_ULONG)Lw(tt); \ hi = (BN_ULONG)Hw(tt); \ c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ t += c0; /* no carry */ \ c0 = (BN_ULONG)Lw(t); \ hi = (BN_ULONG)Hw(t); \ c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ } while(0) #define sqr_add_c(a,i,c0,c1,c2) do { \ BN_ULONG hi; \ BN_ULLONG t = (BN_ULLONG)a[i]*a[i]; \ t += c0; /* no carry */ \ c0 = (BN_ULONG)Lw(t); \ hi = (BN_ULONG)Hw(t); \ c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ } while(0) #define sqr_add_c2(a,i,j,c0,c1,c2) \ mul_add_c2((a)[i],(a)[j],c0,c1,c2) #elif defined(BN_UMULT_LOHI) #define mul_add_c(a,b,c0,c1,c2) { \ /* * Keep in mind that additions to hi can not overflow, because * the high word of a multiplication result cannot be all-ones. */ #define mul_add_c(a,b,c0,c1,c2) do { \ BN_ULONG ta = (a), tb = (b); \ BN_UMULT_LOHI(t1,t2,ta,tb); \ c0 += t1; t2 += (c0<t1)?1:0; \ c1 += t2; c2 += (c1<t2)?1:0; \ } #define mul_add_c2(a,b,c0,c1,c2) { \ BN_ULONG ta=(a),tb=(b),t0; \ BN_UMULT_LOHI(t0,t1,ta,tb); \ t2 = t1+t1; c2 += (t2<t1)?1:0; \ t1 = t0+t0; t2 += (t1<t0)?1:0; \ c0 += t1; t2 += (c0<t1)?1:0; \ c1 += t2; c2 += (c1<t2)?1:0; \ } BN_ULONG lo, hi; \ BN_UMULT_LOHI(lo,hi,ta,tb); \ c0 += lo; hi += (c0<lo)?1:0; \ c1 += hi; c2 += (c1<hi)?1:0; \ } while(0) #define sqr_add_c(a,i,c0,c1,c2) { \ #define mul_add_c2(a,b,c0,c1,c2) do { \ BN_ULONG ta = (a), tb = (b); \ BN_ULONG lo, hi, tt; \ BN_UMULT_LOHI(lo,hi,ta,tb); \ c0 += lo; tt = hi+((c0<lo)?1:0); \ c1 += tt; c2 += (c1<tt)?1:0; \ c0 += lo; hi += (c0<lo)?1:0; \ c1 += hi; c2 += (c1<hi)?1:0; \ } while(0) #define sqr_add_c(a,i,c0,c1,c2) do { \ BN_ULONG ta = (a)[i]; \ BN_UMULT_LOHI(t1,t2,ta,ta); \ c0 += t1; t2 += (c0<t1)?1:0; \ c1 += t2; c2 += (c1<t2)?1:0; \ } BN_ULONG lo, hi; \ BN_UMULT_LOHI(lo,hi,ta,ta); \ c0 += lo; hi += (c0<lo)?1:0; \ c1 += hi; c2 += (c1<hi)?1:0; \ } while(0) #define sqr_add_c2(a,i,j,c0,c1,c2) \ mul_add_c2((a)[i],(a)[j],c0,c1,c2) #elif defined(BN_UMULT_HIGH) #define mul_add_c(a,b,c0,c1,c2) { \ /* * Keep in mind that additions to hi can not overflow, because * the high word of a multiplication result cannot be all-ones. */ #define mul_add_c(a,b,c0,c1,c2) do { \ BN_ULONG ta = (a), tb = (b); \ t1 = ta * tb; \ t2 = BN_UMULT_HIGH(ta,tb); \ c0 += t1; t2 += (c0<t1)?1:0; \ c1 += t2; c2 += (c1<t2)?1:0; \ } #define mul_add_c2(a,b,c0,c1,c2) { \ BN_ULONG ta=(a),tb=(b),t0; \ t1 = BN_UMULT_HIGH(ta,tb); \ t0 = ta * tb; \ t2 = t1+t1; c2 += (t2<t1)?1:0; \ t1 = t0+t0; t2 += (t1<t0)?1:0; \ c0 += t1; t2 += (c0<t1)?1:0; \ c1 += t2; c2 += (c1<t2)?1:0; \ } #define sqr_add_c(a,i,c0,c1,c2) { \ BN_ULONG lo = ta * tb; \ BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \ c0 += lo; hi += (c0<lo)?1:0; \ c1 += hi; c2 += (c1<hi)?1:0; \ } while(0) #define mul_add_c2(a,b,c0,c1,c2) do { \ BN_ULONG ta = (a), tb = (b), tt; \ BN_ULONG lo = ta * tb; \ BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \ c0 += lo; tt = hi + ((c0<lo)?1:0); \ c1 += tt; c2 += (c1<tt)?1:0; \ c0 += lo; hi += (c0<lo)?1:0; \ c1 += hi; c2 += (c1<hi)?1:0; \ } while(0) #define sqr_add_c(a,i,c0,c1,c2) do { \ BN_ULONG ta = (a)[i]; \ t1 = ta * ta; \ t2 = BN_UMULT_HIGH(ta,ta); \ c0 += t1; t2 += (c0<t1)?1:0; \ c1 += t2; c2 += (c1<t2)?1:0; \ } BN_ULONG lo = ta * ta; \ BN_ULONG hi = BN_UMULT_HIGH(ta,ta); \ c0 += lo; hi += (c0<lo)?1:0; \ c1 += hi; c2 += (c1<hi)?1:0; \ } while(0) #define sqr_add_c2(a,i,j,c0,c1,c2) \ mul_add_c2((a)[i],(a)[j],c0,c1,c2) #else /* !BN_LLONG */ #define mul_add_c(a,b,c0,c1,c2) \ t1=LBITS(a); t2=HBITS(a); \ bl=LBITS(b); bh=HBITS(b); \ mul64(t1,t2,bl,bh); \ c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; #define mul_add_c2(a,b,c0,c1,c2) \ t1=LBITS(a); t2=HBITS(a); \ bl=LBITS(b); bh=HBITS(b); \ mul64(t1,t2,bl,bh); \ if (t2 & BN_TBIT) c2++; \ t2=(t2+t2)&BN_MASK2; \ if (t1 & BN_TBIT) t2++; \ t1=(t1+t1)&BN_MASK2; \ c0=(c0+t1)&BN_MASK2; \ if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \ c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; #define sqr_add_c(a,i,c0,c1,c2) \ sqr64(t1,t2,(a)[i]); \ c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; /* * Keep in mind that additions to hi can not overflow, because * the high word of a multiplication result cannot be all-ones. */ #define mul_add_c(a,b,c0,c1,c2) do { \ BN_ULONG lo = LBITS(a), hi = HBITS(a); \ BN_ULONG bl = LBITS(b), bh = HBITS(b); \ mul64(lo,hi,bl,bh); \ c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \ c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ } while(0) #define mul_add_c2(a,b,c0,c1,c2) do { \ BN_ULONG tt; \ BN_ULONG lo = LBITS(a), hi = HBITS(a); \ BN_ULONG bl = LBITS(b), bh = HBITS(b); \ mul64(lo,hi,bl,bh); \ tt = hi; \ c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \ c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \ c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \ c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ } while(0) #define sqr_add_c(a,i,c0,c1,c2) do { \ BN_ULONG lo, hi; \ sqr64(lo,hi,(a)[i]); \ c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \ c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ } while(0) #define sqr_add_c2(a,i,j,c0,c1,c2) \ mul_add_c2((a)[i],(a)[j],c0,c1,c2) Loading @@ -556,12 +583,6 @@ BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) { #ifdef BN_LLONG BN_ULLONG t; #else BN_ULONG bl,bh; #endif BN_ULONG t1,t2; BN_ULONG c1,c2,c3; c1=0; Loading Loading @@ -665,12 +686,6 @@ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) { #ifdef BN_LLONG BN_ULLONG t; #else BN_ULONG bl,bh; #endif BN_ULONG t1,t2; BN_ULONG c1,c2,c3; c1=0; Loading Loading @@ -710,12 +725,6 @@ void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) { #ifdef BN_LLONG BN_ULLONG t,tt; #else BN_ULONG bl,bh; #endif BN_ULONG t1,t2; BN_ULONG c1,c2,c3; c1=0; Loading Loading @@ -791,12 +800,6 @@ void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) { #ifdef BN_LLONG BN_ULLONG t,tt; #else BN_ULONG bl,bh; #endif BN_ULONG t1,t2; BN_ULONG c1,c2,c3; c1=0; Loading crypto/bn/bntest.c +78 −24 File changed.Preview size limit exceeded, changes collapsed. Show changes Loading
crypto/bn/asm/mips.pl +131 −480 Original line number Diff line number Diff line Loading @@ -1872,6 +1872,41 @@ ___ ($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3); sub add_c2 () { my ($hi,$lo,$c0,$c1,$c2, $warm, # !$warm denotes first call with specific sequence of # $c_[XYZ] when there is no Z-carry to accumulate yet; $an,$bn # these two are arguments for multiplication which # result is used in *next* step [which is why it's # commented as "forward multiplication" below]; )=@_; $code.=<<___; mflo $lo mfhi $hi $ADDU $c0,$lo sltu $at,$c0,$lo $MULTU $an,$bn # forward multiplication $ADDU $c0,$lo $ADDU $at,$hi sltu $lo,$c0,$lo $ADDU $c1,$at $ADDU $hi,$lo ___ $code.=<<___ if (!$warm); sltu $c2,$c1,$at $ADDU $c1,$hi sltu $hi,$c1,$hi $ADDU $c2,$hi ___ $code.=<<___ if ($warm); sltu $at,$c1,$at $ADDU $c1,$hi $ADDU $c2,$at sltu $hi,$c1,$hi $ADDU $c2,$hi ___ } $code.=<<___; .align 5 Loading Loading @@ -1920,21 +1955,10 @@ $code.=<<___; sltu $at,$c_2,$t_1 $ADDU $c_3,$t_2,$at $ST $c_2,$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_2,$t_2,$zero $SLL $t_2,1 $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at ___ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2); $code.=<<___; mflo $t_1 mfhi $t_2 $ADDU $c_3,$t_1 Loading @@ -1945,67 +1969,19 @@ $code.=<<___; sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,2*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_3,$t_2,$zero $SLL $t_2,1 $MULTU $a_1,$a_2 # mul_add_c2(a[1],b[2],c1,c2,c3); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_3,$at $MULTU $a_4,$a_0 # mul_add_c2(a[4],b[0],c2,c3,c1); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at ___ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3); &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, $a_4,$a_0); # mul_add_c2(a[4],b[0],c2,c3,c1); $code.=<<___; $ST $c_1,3*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_1,$t_2,$zero $SLL $t_2,1 $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_1,$at $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at ___ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1); &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1); $code.=<<___; mflo $t_1 mfhi $t_2 $ADDU $c_2,$t_1 Loading @@ -2016,97 +1992,23 @@ $code.=<<___; sltu $at,$c_3,$t_2 $ADDU $c_1,$at $ST $c_2,4*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_2,$t_2,$zero $SLL $t_2,1 $MULTU $a_1,$a_4 # mul_add_c2(a[1],b[4],c3,c1,c2); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_2,$at $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $MULTU $a_6,$a_0 # mul_add_c2(a[6],b[0],c1,c2,c3); $ADDU $c_2,$at $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at ___ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_1,$a_4); # mul_add_c2(a[1],b[4],c3,c1,c2); &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, $a_2,$a_3); # mul_add_c2(a[2],b[3],c3,c1,c2); &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, $a_6,$a_0); # mul_add_c2(a[6],b[0],c1,c2,c3); $code.=<<___; $ST $c_3,5*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_3,$t_2,$zero $SLL $t_2,1 $MULTU $a_5,$a_1 # mul_add_c2(a[5],b[1],c1,c2,c3); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_3,$at $MULTU $a_4,$a_2 # mul_add_c2(a[4],b[2],c1,c2,c3); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_3,$at $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at ___ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, $a_5,$a_1); # mul_add_c2(a[5],b[1],c1,c2,c3); &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, $a_4,$a_2); # mul_add_c2(a[4],b[2],c1,c2,c3); &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3); $code.=<<___; mflo $t_1 mfhi $t_2 $ADDU $c_1,$t_1 Loading @@ -2117,112 +2019,25 @@ $code.=<<___; sltu $at,$c_2,$t_2 $ADDU $c_3,$at $ST $c_1,6*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_1,$t_2,$zero $SLL $t_2,1 $MULTU $a_1,$a_6 # mul_add_c2(a[1],b[6],c2,c3,c1); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_1,$at $MULTU $a_2,$a_5 # mul_add_c2(a[2],b[5],c2,c3,c1); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_1,$at $MULTU $a_3,$a_4 # mul_add_c2(a[3],b[4],c2,c3,c1); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_1,$at $MULTU $a_7,$a_1 # mul_add_c2(a[7],b[1],c3,c1,c2); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at ___ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, $a_1,$a_6); # mul_add_c2(a[1],b[6],c2,c3,c1); &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, $a_2,$a_5); # mul_add_c2(a[2],b[5],c2,c3,c1); &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, $a_3,$a_4); # mul_add_c2(a[3],b[4],c2,c3,c1); &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, $a_7,$a_1); # mul_add_c2(a[7],b[1],c3,c1,c2); $code.=<<___; $ST $c_2,7*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_2,$t_2,$zero $SLL $t_2,1 $MULTU $a_6,$a_2 # mul_add_c2(a[6],b[2],c3,c1,c2); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_2,$at $MULTU $a_5,$a_3 # mul_add_c2(a[5],b[3],c3,c1,c2); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_2,$at $MULTU $a_4,$a_4 # mul_add_c(a[4],b[4],c3,c1,c2); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at ___ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_6,$a_2); # mul_add_c2(a[6],b[2],c3,c1,c2); &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, $a_5,$a_3); # mul_add_c2(a[5],b[3],c3,c1,c2); &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, $a_4,$a_4); # mul_add_c(a[4],b[4],c3,c1,c2); $code.=<<___; mflo $t_1 mfhi $t_2 $ADDU $c_3,$t_1 Loading @@ -2233,82 +2048,21 @@ $code.=<<___; sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,8*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_3,$t_2,$zero $SLL $t_2,1 $MULTU $a_3,$a_6 # mul_add_c2(a[3],b[6],c1,c2,c3); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_3,$at $MULTU $a_4,$a_5 # mul_add_c2(a[4],b[5],c1,c2,c3); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_3,$at $MULTU $a_7,$a_3 # mul_add_c2(a[7],b[3],c2,c3,c1); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at ___ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, $a_3,$a_6); # mul_add_c2(a[3],b[6],c1,c2,c3); &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, $a_4,$a_5); # mul_add_c2(a[4],b[5],c1,c2,c3); &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, $a_7,$a_3); # mul_add_c2(a[7],b[3],c2,c3,c1); $code.=<<___; $ST $c_1,9*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_1,$t_2,$zero $SLL $t_2,1 $MULTU $a_6,$a_4 # mul_add_c2(a[6],b[4],c2,c3,c1); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_1,$at $MULTU $a_5,$a_5 # mul_add_c(a[5],b[5],c2,c3,c1); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at ___ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, $a_6,$a_4); # mul_add_c2(a[6],b[4],c2,c3,c1); &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, $a_5,$a_5); # mul_add_c(a[5],b[5],c2,c3,c1); $code.=<<___; mflo $t_1 mfhi $t_2 $ADDU $c_2,$t_1 Loading @@ -2319,52 +2073,17 @@ $code.=<<___; sltu $at,$c_3,$t_2 $ADDU $c_1,$at $ST $c_2,10*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_2,$t_2,$zero $SLL $t_2,1 $MULTU $a_5,$a_6 # mul_add_c2(a[5],b[6],c3,c1,c2); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_2,$at $MULTU $a_7,$a_5 # mul_add_c2(a[7],b[5],c1,c2,c3); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at ___ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_5,$a_6); # mul_add_c2(a[5],b[6],c3,c1,c2); &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, $a_7,$a_5); # mul_add_c2(a[7],b[5],c1,c2,c3); $code.=<<___; $ST $c_3,11*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_3,$t_2,$zero $SLL $t_2,1 $MULTU $a_6,$a_6 # mul_add_c(a[6],b[6],c1,c2,c3); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at ___ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, $a_6,$a_6); # mul_add_c(a[6],b[6],c1,c2,c3); $code.=<<___; mflo $t_1 mfhi $t_2 $ADDU $c_1,$t_1 Loading @@ -2375,21 +2094,10 @@ $code.=<<___; sltu $at,$c_2,$t_2 $ADDU $c_3,$at $ST $c_1,12*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_1,$t_2,$zero $SLL $t_2,1 $MULTU $a_7,$a_7 # mul_add_c(a[7],b[7],c3,c1,c2); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at ___ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, $a_7,$a_7); # mul_add_c(a[7],b[7],c3,c1,c2); $code.=<<___; $ST $c_2,13*$BNSZ($a0) mflo $t_1 Loading Loading @@ -2457,21 +2165,10 @@ $code.=<<___; sltu $at,$c_2,$t_1 $ADDU $c_3,$t_2,$at $ST $c_2,$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_2,$t_2,$zero $SLL $t_2,1 $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at ___ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2); $code.=<<___; mflo $t_1 mfhi $t_2 $ADDU $c_3,$t_1 Loading @@ -2482,52 +2179,17 @@ $code.=<<___; sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,2*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_3,$t_2,$zero $SLL $t_2,1 $MULTU $a_1,$a_2 # mul_add_c(a2[1],b[2],c1,c2,c3); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at mflo $t_1 mfhi $t_2 slt $at,$t_2,$zero $ADDU $c_3,$at $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1); $SLL $t_2,1 slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at ___ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, $a_1,$a_2); # mul_add_c2(a2[1],b[2],c1,c2,c3); &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1); $code.=<<___; $ST $c_1,3*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_1,$t_2,$zero $SLL $t_2,1 $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at ___ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1); $code.=<<___; mflo $t_1 mfhi $t_2 $ADDU $c_2,$t_1 Loading @@ -2538,21 +2200,10 @@ $code.=<<___; sltu $at,$c_3,$t_2 $ADDU $c_1,$at $ST $c_2,4*$BNSZ($a0) mflo $t_1 mfhi $t_2 slt $c_2,$t_2,$zero $SLL $t_2,1 $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at ___ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3); $code.=<<___; $ST $c_3,5*$BNSZ($a0) mflo $t_1 Loading
crypto/bn/asm/x86_64-gcc.c +48 −53 Original line number Diff line number Diff line Loading @@ -276,76 +276,75 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */ /* * Keep in mind that carrying into high part of multiplication result * can not overflow, because it cannot be all-ones. */ #if 0 /* original macros are kept for reference purposes */ #define mul_add_c(a,b,c0,c1,c2) { \ #define mul_add_c(a,b,c0,c1,c2) do { \ BN_ULONG ta = (a), tb = (b); \ t1 = ta * tb; \ t2 = BN_UMULT_HIGH(ta,tb); \ c0 += t1; t2 += (c0<t1)?1:0; \ c1 += t2; c2 += (c1<t2)?1:0; \ } BN_ULONG lo, hi; \ BN_UMULT_LOHI(lo,hi,ta,tb); \ c0 += lo; hi += (c0<lo)?1:0; \ c1 += hi; c2 += (c1<hi)?1:0; \ } while(0) #define mul_add_c2(a,b,c0,c1,c2) { \ BN_ULONG ta=(a),tb=(b),t0; \ t1 = BN_UMULT_HIGH(ta,tb); \ t0 = ta * tb; \ t2 = t1+t1; c2 += (t2<t1)?1:0; \ t1 = t0+t0; t2 += (t1<t0)?1:0; \ c0 += t1; t2 += (c0<t1)?1:0; \ c1 += t2; c2 += (c1<t2)?1:0; \ } #define mul_add_c2(a,b,c0,c1,c2) do { \ BN_ULONG ta = (a), tb = (b); \ BN_ULONG lo, hi, tt; \ BN_UMULT_LOHI(lo,hi,ta,tb); \ c0 += lo; tt = hi+((c0<lo)?1:0); \ c1 += tt; c2 += (c1<tt)?1:0; \ c0 += lo; hi += (c0<lo)?1:0; \ c1 += hi; c2 += (c1<hi)?1:0; \ } while(0) #define sqr_add_c(a,i,c0,c1,c2) do { \ BN_ULONG ta = (a)[i]; \ BN_ULONG lo, hi; \ BN_UMULT_LOHI(lo,hi,ta,ta); \ c0 += lo; hi += (c0<lo)?1:0; \ c1 += hi; c2 += (c1<hi)?1:0; \ } while(0) #else #define mul_add_c(a,b,c0,c1,c2) do { \ BN_ULONG t1,t2; \ asm ("mulq %3" \ : "=a"(t1),"=d"(t2) \ : "a"(a),"m"(b) \ : "cc"); \ asm ("addq %2,%0; adcq %3,%1" \ : "+r"(c0),"+d"(t2) \ : "a"(t1),"g"(0) \ : "cc"); \ asm ("addq %2,%0; adcq %3,%1" \ : "+r"(c1),"+r"(c2) \ : "d"(t2),"g"(0) \ asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ : "+r"(c0),"+r"(c1),"+r"(c2) \ : "r"(t1),"r"(t2),"g"(0) \ : "cc"); \ } while (0) #define sqr_add_c(a,i,c0,c1,c2) do { \ BN_ULONG t1,t2; \ asm ("mulq %2" \ : "=a"(t1),"=d"(t2) \ : "a"(a[i]) \ : "cc"); \ asm ("addq %2,%0; adcq %3,%1" \ : "+r"(c0),"+d"(t2) \ : "a"(t1),"g"(0) \ : "cc"); \ asm ("addq %2,%0; adcq %3,%1" \ : "+r"(c1),"+r"(c2) \ : "d"(t2),"g"(0) \ asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ : "+r"(c0),"+r"(c1),"+r"(c2) \ : "r"(t1),"r"(t2),"g"(0) \ : "cc"); \ } while (0) #define mul_add_c2(a,b,c0,c1,c2) do { \ BN_ULONG t1,t2; \ asm ("mulq %3" \ : "=a"(t1),"=d"(t2) \ : "a"(a),"m"(b) \ : "cc"); \ asm ("addq %0,%0; adcq %2,%1" \ : "+d"(t2),"+r"(c2) \ : "g"(0) \ : "cc"); \ asm ("addq %0,%0; adcq %2,%1" \ : "+a"(t1),"+d"(t2) \ : "g"(0) \ : "cc"); \ asm ("addq %2,%0; adcq %3,%1" \ : "+r"(c0),"+d"(t2) \ : "a"(t1),"g"(0) \ asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ : "+r"(c0),"+r"(c1),"+r"(c2) \ : "r"(t1),"r"(t2),"g"(0) \ : "cc"); \ asm ("addq %2,%0; adcq %3,%1" \ : "+r"(c1),"+r"(c2) \ : "d"(t2),"g"(0) \ asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ : "+r"(c0),"+r"(c1),"+r"(c2) \ : "r"(t1),"r"(t2),"g"(0) \ : "cc"); \ } while (0) #endif Loading @@ -355,7 +354,6 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) { BN_ULONG t1,t2; BN_ULONG c1,c2,c3; c1=0; Loading Loading @@ -459,7 +457,6 @@ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) { BN_ULONG t1,t2; BN_ULONG c1,c2,c3; c1=0; Loading Loading @@ -499,7 +496,6 @@ void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) { BN_ULONG t1,t2; BN_ULONG c1,c2,c3; c1=0; Loading Loading @@ -575,7 +571,6 @@ void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) { BN_ULONG t1,t2; BN_ULONG c1,c2,c3; c1=0; Loading
crypto/bn/bn_asm.c +122 −119 Original line number Diff line number Diff line Loading @@ -439,116 +439,143 @@ BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */ #ifdef BN_LLONG #define mul_add_c(a,b,c0,c1,c2) \ t=(BN_ULLONG)a*b; \ t1=(BN_ULONG)Lw(t); \ t2=(BN_ULONG)Hw(t); \ c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; #define mul_add_c2(a,b,c0,c1,c2) \ t=(BN_ULLONG)a*b; \ tt=(t+t)&BN_MASK; \ if (tt < t) c2++; \ t1=(BN_ULONG)Lw(tt); \ t2=(BN_ULONG)Hw(tt); \ c0=(c0+t1)&BN_MASK2; \ if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \ c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; #define sqr_add_c(a,i,c0,c1,c2) \ t=(BN_ULLONG)a[i]*a[i]; \ t1=(BN_ULONG)Lw(t); \ t2=(BN_ULONG)Hw(t); \ c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; /* * Keep in mind that additions to multiplication result can not * overflow, because its high half cannot be all-ones. */ #define mul_add_c(a,b,c0,c1,c2) do { \ BN_ULONG hi; \ BN_ULLONG t = (BN_ULLONG)(a)*(b); \ t += c0; /* no carry */ \ c0 = (BN_ULONG)Lw(t); \ hi = (BN_ULONG)Hw(t); \ c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ } while(0) #define mul_add_c2(a,b,c0,c1,c2) do { \ BN_ULONG hi; \ BN_ULLONG t = (BN_ULLONG)(a)*(b); \ BN_ULLONG tt = t+c0; /* no carry */ \ c0 = (BN_ULONG)Lw(tt); \ hi = (BN_ULONG)Hw(tt); \ c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ t += c0; /* no carry */ \ c0 = (BN_ULONG)Lw(t); \ hi = (BN_ULONG)Hw(t); \ c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ } while(0) #define sqr_add_c(a,i,c0,c1,c2) do { \ BN_ULONG hi; \ BN_ULLONG t = (BN_ULLONG)a[i]*a[i]; \ t += c0; /* no carry */ \ c0 = (BN_ULONG)Lw(t); \ hi = (BN_ULONG)Hw(t); \ c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ } while(0) #define sqr_add_c2(a,i,j,c0,c1,c2) \ mul_add_c2((a)[i],(a)[j],c0,c1,c2) #elif defined(BN_UMULT_LOHI) #define mul_add_c(a,b,c0,c1,c2) { \ /* * Keep in mind that additions to hi can not overflow, because * the high word of a multiplication result cannot be all-ones. */ #define mul_add_c(a,b,c0,c1,c2) do { \ BN_ULONG ta = (a), tb = (b); \ BN_UMULT_LOHI(t1,t2,ta,tb); \ c0 += t1; t2 += (c0<t1)?1:0; \ c1 += t2; c2 += (c1<t2)?1:0; \ } #define mul_add_c2(a,b,c0,c1,c2) { \ BN_ULONG ta=(a),tb=(b),t0; \ BN_UMULT_LOHI(t0,t1,ta,tb); \ t2 = t1+t1; c2 += (t2<t1)?1:0; \ t1 = t0+t0; t2 += (t1<t0)?1:0; \ c0 += t1; t2 += (c0<t1)?1:0; \ c1 += t2; c2 += (c1<t2)?1:0; \ } BN_ULONG lo, hi; \ BN_UMULT_LOHI(lo,hi,ta,tb); \ c0 += lo; hi += (c0<lo)?1:0; \ c1 += hi; c2 += (c1<hi)?1:0; \ } while(0) #define sqr_add_c(a,i,c0,c1,c2) { \ #define mul_add_c2(a,b,c0,c1,c2) do { \ BN_ULONG ta = (a), tb = (b); \ BN_ULONG lo, hi, tt; \ BN_UMULT_LOHI(lo,hi,ta,tb); \ c0 += lo; tt = hi+((c0<lo)?1:0); \ c1 += tt; c2 += (c1<tt)?1:0; \ c0 += lo; hi += (c0<lo)?1:0; \ c1 += hi; c2 += (c1<hi)?1:0; \ } while(0) #define sqr_add_c(a,i,c0,c1,c2) do { \ BN_ULONG ta = (a)[i]; \ BN_UMULT_LOHI(t1,t2,ta,ta); \ c0 += t1; t2 += (c0<t1)?1:0; \ c1 += t2; c2 += (c1<t2)?1:0; \ } BN_ULONG lo, hi; \ BN_UMULT_LOHI(lo,hi,ta,ta); \ c0 += lo; hi += (c0<lo)?1:0; \ c1 += hi; c2 += (c1<hi)?1:0; \ } while(0) #define sqr_add_c2(a,i,j,c0,c1,c2) \ mul_add_c2((a)[i],(a)[j],c0,c1,c2) #elif defined(BN_UMULT_HIGH) #define mul_add_c(a,b,c0,c1,c2) { \ /* * Keep in mind that additions to hi can not overflow, because * the high word of a multiplication result cannot be all-ones. */ #define mul_add_c(a,b,c0,c1,c2) do { \ BN_ULONG ta = (a), tb = (b); \ t1 = ta * tb; \ t2 = BN_UMULT_HIGH(ta,tb); \ c0 += t1; t2 += (c0<t1)?1:0; \ c1 += t2; c2 += (c1<t2)?1:0; \ } #define mul_add_c2(a,b,c0,c1,c2) { \ BN_ULONG ta=(a),tb=(b),t0; \ t1 = BN_UMULT_HIGH(ta,tb); \ t0 = ta * tb; \ t2 = t1+t1; c2 += (t2<t1)?1:0; \ t1 = t0+t0; t2 += (t1<t0)?1:0; \ c0 += t1; t2 += (c0<t1)?1:0; \ c1 += t2; c2 += (c1<t2)?1:0; \ } #define sqr_add_c(a,i,c0,c1,c2) { \ BN_ULONG lo = ta * tb; \ BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \ c0 += lo; hi += (c0<lo)?1:0; \ c1 += hi; c2 += (c1<hi)?1:0; \ } while(0) #define mul_add_c2(a,b,c0,c1,c2) do { \ BN_ULONG ta = (a), tb = (b), tt; \ BN_ULONG lo = ta * tb; \ BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \ c0 += lo; tt = hi + ((c0<lo)?1:0); \ c1 += tt; c2 += (c1<tt)?1:0; \ c0 += lo; hi += (c0<lo)?1:0; \ c1 += hi; c2 += (c1<hi)?1:0; \ } while(0) #define sqr_add_c(a,i,c0,c1,c2) do { \ BN_ULONG ta = (a)[i]; \ t1 = ta * ta; \ t2 = BN_UMULT_HIGH(ta,ta); \ c0 += t1; t2 += (c0<t1)?1:0; \ c1 += t2; c2 += (c1<t2)?1:0; \ } BN_ULONG lo = ta * ta; \ BN_ULONG hi = BN_UMULT_HIGH(ta,ta); \ c0 += lo; hi += (c0<lo)?1:0; \ c1 += hi; c2 += (c1<hi)?1:0; \ } while(0) #define sqr_add_c2(a,i,j,c0,c1,c2) \ mul_add_c2((a)[i],(a)[j],c0,c1,c2) #else /* !BN_LLONG */ #define mul_add_c(a,b,c0,c1,c2) \ t1=LBITS(a); t2=HBITS(a); \ bl=LBITS(b); bh=HBITS(b); \ mul64(t1,t2,bl,bh); \ c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; #define mul_add_c2(a,b,c0,c1,c2) \ t1=LBITS(a); t2=HBITS(a); \ bl=LBITS(b); bh=HBITS(b); \ mul64(t1,t2,bl,bh); \ if (t2 & BN_TBIT) c2++; \ t2=(t2+t2)&BN_MASK2; \ if (t1 & BN_TBIT) t2++; \ t1=(t1+t1)&BN_MASK2; \ c0=(c0+t1)&BN_MASK2; \ if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \ c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; #define sqr_add_c(a,i,c0,c1,c2) \ sqr64(t1,t2,(a)[i]); \ c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; /* * Keep in mind that additions to hi can not overflow, because * the high word of a multiplication result cannot be all-ones. */ #define mul_add_c(a,b,c0,c1,c2) do { \ BN_ULONG lo = LBITS(a), hi = HBITS(a); \ BN_ULONG bl = LBITS(b), bh = HBITS(b); \ mul64(lo,hi,bl,bh); \ c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \ c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ } while(0) #define mul_add_c2(a,b,c0,c1,c2) do { \ BN_ULONG tt; \ BN_ULONG lo = LBITS(a), hi = HBITS(a); \ BN_ULONG bl = LBITS(b), bh = HBITS(b); \ mul64(lo,hi,bl,bh); \ tt = hi; \ c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \ c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \ c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \ c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ } while(0) #define sqr_add_c(a,i,c0,c1,c2) do { \ BN_ULONG lo, hi; \ sqr64(lo,hi,(a)[i]); \ c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \ c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ } while(0) #define sqr_add_c2(a,i,j,c0,c1,c2) \ mul_add_c2((a)[i],(a)[j],c0,c1,c2) Loading @@ -556,12 +583,6 @@ BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) { #ifdef BN_LLONG BN_ULLONG t; #else BN_ULONG bl,bh; #endif BN_ULONG t1,t2; BN_ULONG c1,c2,c3; c1=0; Loading Loading @@ -665,12 +686,6 @@ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) { #ifdef BN_LLONG BN_ULLONG t; #else BN_ULONG bl,bh; #endif BN_ULONG t1,t2; BN_ULONG c1,c2,c3; c1=0; Loading Loading @@ -710,12 +725,6 @@ void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) { #ifdef BN_LLONG BN_ULLONG t,tt; #else BN_ULONG bl,bh; #endif BN_ULONG t1,t2; BN_ULONG c1,c2,c3; c1=0; Loading Loading @@ -791,12 +800,6 @@ void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) { #ifdef BN_LLONG BN_ULLONG t,tt; #else BN_ULONG bl,bh; #endif BN_ULONG t1,t2; BN_ULONG c1,c2,c3; c1=0; Loading