Commit 2e691f8f authored by Andy Polyakov
Browse files

FIPS MIPS assembly pack refresh.



Backport CVE-2014-3570 bug fix and portability fixes.

Reviewed-by: Rich Salz <rsalz@openssl.org>
(cherry picked from commit 10fa6736)
parent 894c04aa
Loading
Loading
Loading
Loading
+3 −3
Original line number Diff line number Diff line
@@ -47,7 +47,7 @@
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64

if ($flavour =~ /64|n32/i) {
	$PTR_ADD="dadd";	# incidentally works even on n32
@@ -70,7 +70,7 @@ $pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
#
######################################################################

$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0;

for (@ARGV) {	$output=$_ if (/^\w[\w\-]*\.\w+$/);	}
open STDOUT,">$output";
@@ -89,7 +89,7 @@ $code.=<<___;
# include <openssl/fipssyms.h>
#endif

#if !defined(__vxworks) || defined(__pic__)
#if !defined(__mips_eabi) && (!defined(__vxworks) || defined(__pic__))
.option	pic2
#endif
.set	noat
+2 −2
Original line number Diff line number Diff line
@@ -46,7 +46,7 @@
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64

if ($flavour =~ /64|n32/i) {
	$PTR_ADD="dadd";	# incidentally works even on n32
@@ -133,7 +133,7 @@ $code.=<<___;
	bnez	$at,1f
	li	$t0,0
	slt	$at,$num,17	# on in-order CPU
	bnezl	$at,bn_mul_mont_internal
	bnez	$at,bn_mul_mont_internal
	nop
1:	jr	$ra
	li	$a0,0
+154 −505
Original line number Diff line number Diff line
@@ -48,7 +48,7 @@
# has to content with 40-85% improvement depending on benchmark and
# key length, more for longer keys.

$flavour = shift;
$flavour = shift || "o32";
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

@@ -140,10 +140,10 @@ $code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	$LD	$t0,0($a1)
	beqz	$ta0,.L_bn_mul_add_words_tail

.L_bn_mul_add_words_loop:
	$LD	$t0,0($a1)
	$MULTU	$t0,$a3
	$LD	$t1,0($a0)
	$LD	$t2,$BNSZ($a1)
@@ -200,10 +200,9 @@ $code.=<<___;
	$ADDU	$v0,$ta2
	sltu	$at,$ta3,$at
	$ST	$ta3,-$BNSZ($a0)
	$ADDU	$v0,$at
	.set	noreorder
	bgtzl	$ta0,.L_bn_mul_add_words_loop
	$LD	$t0,0($a1)
	bgtz	$ta0,.L_bn_mul_add_words_loop
	$ADDU	$v0,$at

	beqz	$a2,.L_bn_mul_add_words_return
	nop
@@ -300,10 +299,10 @@ $code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	$LD	$t0,0($a1)
	beqz	$ta0,.L_bn_mul_words_tail

.L_bn_mul_words_loop:
	$LD	$t0,0($a1)
	$MULTU	$t0,$a3
	$LD	$t2,$BNSZ($a1)
	$LD	$ta0,2*$BNSZ($a1)
@@ -341,10 +340,9 @@ $code.=<<___;
	$ADDU	$v0,$at
	sltu	$ta3,$v0,$at
	$ST	$v0,-$BNSZ($a0)
	$ADDU	$v0,$ta3,$ta2
	.set	noreorder
	bgtzl	$ta0,.L_bn_mul_words_loop
	$LD	$t0,0($a1)
	bgtz	$ta0,.L_bn_mul_words_loop
	$ADDU	$v0,$ta3,$ta2

	beqz	$a2,.L_bn_mul_words_return
	nop
@@ -429,10 +427,10 @@ $code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	$LD	$t0,0($a1)
	beqz	$ta0,.L_bn_sqr_words_tail

.L_bn_sqr_words_loop:
	$LD	$t0,0($a1)
	$MULTU	$t0,$t0
	$LD	$t2,$BNSZ($a1)
	$LD	$ta0,2*$BNSZ($a1)
@@ -463,11 +461,10 @@ $code.=<<___;
	mflo	$ta3
	mfhi	$ta2
	$ST	$ta3,-2*$BNSZ($a0)
	$ST	$ta2,-$BNSZ($a0)

	.set	noreorder
	bgtzl	$ta0,.L_bn_sqr_words_loop
	$LD	$t0,0($a1)
	bgtz	$ta0,.L_bn_sqr_words_loop
	$ST	$ta2,-$BNSZ($a0)

	beqz	$a2,.L_bn_sqr_words_return
	nop
@@ -547,10 +544,10 @@ $code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$at,$a3,$minus4
	$LD	$t0,0($a1)
	beqz	$at,.L_bn_add_words_tail

.L_bn_add_words_loop:
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,4
	$LD	$t1,$BNSZ($a1)
@@ -589,11 +586,10 @@ $code.=<<___;
	$ADDU	$t3,$ta3,$v0
	sltu	$v0,$t3,$ta3
	$ST	$t3,-$BNSZ($a0)
	$ADDU	$v0,$t9
	
	.set	noreorder
	bgtzl	$at,.L_bn_add_words_loop
	$LD	$t0,0($a1)
	bgtz	$at,.L_bn_add_words_loop
	$ADDU	$v0,$t9

	beqz	$a3,.L_bn_add_words_return
	nop
@@ -679,10 +675,10 @@ $code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$at,$a3,$minus4
	$LD	$t0,0($a1)
	beqz	$at,.L_bn_sub_words_tail

.L_bn_sub_words_loop:
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,4
	$LD	$t1,$BNSZ($a1)
@@ -722,11 +718,10 @@ $code.=<<___;
	$SUBU	$t3,$ta3,$v0
	sgtu	$v0,$t3,$ta3
	$ST	$t3,-$BNSZ($a0)
	$ADDU	$v0,$t9

	.set	noreorder
	bgtzl	$at,.L_bn_sub_words_loop
	$LD	$t0,0($a1)
	bgtz	$at,.L_bn_sub_words_loop
	$ADDU	$v0,$t9

	beqz	$a3,.L_bn_sub_words_return
	nop
@@ -819,7 +814,7 @@ ___
$code.=<<___;
	.set	reorder
	move	$ta3,$ra
	bal	bn_div_words
	bal	bn_div_words_internal
	move	$ra,$ta3
	$MULTU	$ta2,$v0
	$LD	$t2,-2*$BNSZ($a3)
@@ -840,8 +835,9 @@ $code.=<<___;
	sltu	$ta0,$a1,$a2
	or	$t8,$ta0
	.set	noreorder
	beqzl	$at,.L_bn_div_3_words_inner_loop
	beqz	$at,.L_bn_div_3_words_inner_loop
	$SUBU	$v0,1
	$ADDU	$v0,1
	.set	reorder
.L_bn_div_3_words_inner_loop_done:
	.set	noreorder
@@ -902,7 +898,8 @@ $code.=<<___;
	and	$t2,$a0
	$SRL	$at,$a1,$t1
	.set	noreorder
	bnezl	$t2,.+8
	beqz	$t2,.+12
	nop
	break	6		# signal overflow
	.set	reorder
	$SLL	$a0,$t9
@@ -917,7 +914,8 @@ $code.=<<___;
	$SRL	$DH,$a2,4*$BNSZ	# bits
	sgeu	$at,$a0,$a2
	.set	noreorder
	bnezl	$at,.+8
	beqz	$at,.+12
	nop
	$SUBU	$a0,$a2
	.set	reorder

@@ -1874,6 +1872,41 @@ ___

($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);

# add_c2 -- append to $code the instruction sequence that folds the
# product currently held in HI:LO into the three-word accumulator
# ($c2:$c1:$c0) *twice*, and at the same time issues the multiplication
# whose result the next step will consume.
#
# The low word is added to $c0 two times and the high word to $c1 two
# times, with every carry-out captured by sltu and propagated upwards.
#
# Arguments (all but $warm are register names interpolated into the
# emitted assembly):
#   $hi,$lo      scratch registers that receive HI and LO; both are
#                overwritten by the emitted code;
#   $c0,$c1,$c2  accumulator words, least significant first;
#   $warm        see below;
#   $an,$bn      operands of the "forward" multiplication.
#
# The emitted code also uses the file-level $at as scratch and relies on
# the file-level $ADDU and $MULTU mnemonics. Nothing meaningful is
# returned; the only effect is the text appended to $code.
#
# No prototype is declared: the routine takes eight arguments, and the
# call sites use the &add_c2(...) form, which works with or without one.
sub add_c2 {
my ($hi,$lo,$c0,$c1,$c2,
    $warm,      # !$warm denotes first call with specific sequence of
                # $c_[XYZ] when there is no Z-carry to accumulate yet;
    $an,$bn     # these two are arguments for multiplication which
                # result is used in *next* step [which is why it's
                # commented as "forward multiplication" below];
    )=@_;

# Part shared by both variants: fetch the product, add its low word to
# $c0 twice and fold the first copy of the high word (plus the first
# carry, via $at) into $c1.
$code.=<<___;
	mflo	$lo
	mfhi	$hi
	$ADDU	$c0,$lo
	sltu	$at,$c0,$lo
	 $MULTU	$an,$bn			# forward multiplication
	$ADDU	$c0,$lo
	$ADDU	$at,$hi
	sltu	$lo,$c0,$lo
	$ADDU	$c1,$at
	$ADDU	$hi,$lo
___

if ($warm) {
# $c2 already holds carries from earlier steps: accumulate into it.
$code.=<<___;
	sltu	$at,$c1,$at
	$ADDU	$c1,$hi
	$ADDU	$c2,$at
	sltu	$hi,$c1,$hi
	$ADDU	$c2,$hi
___
} else {
# First step of a column: $c2 is written (not added to) by the first
# sltu, so its previous contents are discarded.
$code.=<<___;
	sltu	$c2,$c1,$at
	$ADDU	$c1,$hi
	sltu	$hi,$c1,$hi
	$ADDU	$c2,$hi
___
}
return;
}

$code.=<<___;

.align	5
@@ -1922,21 +1955,10 @@ $code.=<<___;
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_2,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_1,$a_1		# mul_add_c(a[1],b[1],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
@@ -1947,67 +1969,19 @@ $code.=<<___;
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_3,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_1,$a_2		# mul_add_c2(a[1],b[2],c1,c2,c3);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_3,$at
	 $MULTU	$a_4,$a_0		# mul_add_c2(a[4],b[0],c2,c3,c1);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_0);		# mul_add_c2(a[4],b[0],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,3*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_3,$a_1		# mul_add_c2(a[3],b[1],c2,c3,c1);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_1,$at
	$MULTU	$a_2,$a_2		# mul_add_c(a[2],b[2],c2,c3,c1);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
@@ -2018,97 +1992,23 @@ $code.=<<___;
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_2,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_1,$a_4		# mul_add_c2(a[1],b[4],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_2,$at
	$MULTU	$a_2,$a_3		# mul_add_c2(a[2],b[3],c3,c1,c2);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	 $MULTU	$a_6,$a_0		# mul_add_c2(a[6],b[0],c1,c2,c3);
	$ADDU	$c_2,$at
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_4);		# mul_add_c2(a[1],b[4],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_2,$a_3);		# mul_add_c2(a[2],b[3],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_6,$a_0);		# mul_add_c2(a[6],b[0],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,5*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_3,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_5,$a_1		# mul_add_c2(a[5],b[1],c1,c2,c3);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_3,$at
	$MULTU	$a_4,$a_2		# mul_add_c2(a[4],b[2],c1,c2,c3);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_3,$at
	$MULTU	$a_3,$a_3		# mul_add_c(a[3],b[3],c1,c2,c3);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_5,$a_1);		# mul_add_c2(a[5],b[1],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_2);		# mul_add_c2(a[4],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
@@ -2119,112 +2019,25 @@ $code.=<<___;
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,6*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_1,$a_6		# mul_add_c2(a[1],b[6],c2,c3,c1);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_1,$at
	$MULTU	$a_2,$a_5		# mul_add_c2(a[2],b[5],c2,c3,c1);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_1,$at
	$MULTU	$a_3,$a_4		# mul_add_c2(a[3],b[4],c2,c3,c1);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_1,$at
	 $MULTU	$a_7,$a_1		# mul_add_c2(a[7],b[1],c3,c1,c2);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_1,$a_6);		# mul_add_c2(a[1],b[6],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_2,$a_5);		# mul_add_c2(a[2],b[5],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_3,$a_4);		# mul_add_c2(a[3],b[4],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_7,$a_1);		# mul_add_c2(a[7],b[1],c3,c1,c2);
$code.=<<___;
	$ST	$c_2,7*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_2,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_6,$a_2		# mul_add_c2(a[6],b[2],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_2,$at
	$MULTU	$a_5,$a_3		# mul_add_c2(a[5],b[3],c3,c1,c2);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_2,$at
	$MULTU	$a_4,$a_4		# mul_add_c(a[4],b[4],c3,c1,c2);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_6,$a_2);		# mul_add_c2(a[6],b[2],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_5,$a_3);		# mul_add_c2(a[5],b[3],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_4,$a_4);		# mul_add_c(a[4],b[4],c3,c1,c2);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
@@ -2235,82 +2048,21 @@ $code.=<<___;
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,8*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_3,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_3,$a_6		# mul_add_c2(a[3],b[6],c1,c2,c3);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_3,$at
	$MULTU	$a_4,$a_5		# mul_add_c2(a[4],b[5],c1,c2,c3);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_3,$at
	 $MULTU	$a_7,$a_3		# mul_add_c2(a[7],b[3],c2,c3,c1);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_3,$a_6);		# mul_add_c2(a[3],b[6],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_5);		# mul_add_c2(a[4],b[5],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_7,$a_3);		# mul_add_c2(a[7],b[3],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,9*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_6,$a_4		# mul_add_c2(a[6],b[4],c2,c3,c1);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_1,$at
	$MULTU	$a_5,$a_5		# mul_add_c(a[5],b[5],c2,c3,c1);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_6,$a_4);		# mul_add_c2(a[6],b[4],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_5,$a_5);		# mul_add_c(a[5],b[5],c2,c3,c1);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
@@ -2321,52 +2073,17 @@ $code.=<<___;
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,10*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_2,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_5,$a_6		# mul_add_c2(a[5],b[6],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_2,$at
	 $MULTU	$a_7,$a_5		# mul_add_c2(a[7],b[5],c1,c2,c3);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_5,$a_6);		# mul_add_c2(a[5],b[6],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_7,$a_5);		# mul_add_c2(a[7],b[5],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,11*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_3,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_6,$a_6		# mul_add_c(a[6],b[6],c1,c2,c3);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_6,$a_6);		# mul_add_c(a[6],b[6],c1,c2,c3);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
@@ -2377,21 +2094,10 @@ $code.=<<___;
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,12*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	 $MULTU	$a_7,$a_7		# mul_add_c(a[7],b[7],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_7,$a_7);		# mul_add_c(a[7],b[7],c3,c1,c2);
$code.=<<___;
	$ST	$c_2,13*$BNSZ($a0)

	mflo	$t_1
@@ -2459,21 +2165,10 @@ $code.=<<___;
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_2,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_1,$a_1		# mul_add_c(a[1],b[1],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
@@ -2484,52 +2179,17 @@ $code.=<<___;
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_3,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_1,$a_2		# mul_add_c(a2[1],b[2],c1,c2,c3);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_3,$at
	 $MULTU	$a_3,$a_1		# mul_add_c2(a[3],b[1],c2,c3,c1);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_1,$a_2);		# mul_add_c2(a2[1],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,3*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_2,$a_2		# mul_add_c(a[2],b[2],c2,c3,c1);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
$code.=<<___;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
@@ -2540,21 +2200,10 @@ $code.=<<___;
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_2,$t_2,$zero
	$SLL	$t_2,1
	 $MULTU	$a_3,$a_3		# mul_add_c(a[3],b[3],c1,c2,c3);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,5*$BNSZ($a0)

	mflo	$t_1
+2 −2
Original line number Diff line number Diff line
@@ -42,7 +42,7 @@
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64

if ($flavour =~ /64|n32/i) {
	$PTR_ADD="dadd";	# incidentally works even on n32
@@ -64,7 +64,7 @@ if ($flavour =~ /64|n32/i) {
#
######################################################################

$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0;

for (@ARGV) {	$output=$_ if (/^\w[\w\-]*\.\w+$/);   }
open STDOUT,">$output";
+4 −4
Original line number Diff line number Diff line
@@ -45,7 +45,7 @@
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64

if ($flavour =~ /64|n32/i) {
	$PTR_ADD="dadd";	# incidentally works even on n32
@@ -68,7 +68,7 @@ $pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
#
######################################################################

$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0;

for (@ARGV) {	$output=$_ if (/^\w[\w\-]*\.\w+$/);	}
open STDOUT,">$output";
@@ -244,7 +244,7 @@ $code.=<<___;

.text
.set	noat
#if !defined(__vxworks) || defined(__pic__)
#if !defined(__mips_eabi) && (!defined(__vxworks) || defined(__pic__))
.option	pic2
#endif

@@ -351,7 +351,7 @@ $code.=<<___;
	$ST	$G,6*$SZ($ctx)
	$ST	$H,7*$SZ($ctx)

	bnel	$inp,@X[15],.Loop
	bne	$inp,@X[15],.Loop
	$PTR_SUB $Ktbl,`($rounds-16)*$SZ`	# rewind $Ktbl

	$REG_L	$ra,$FRAMESIZE-1*$SZREG($sp)