Loading crypto/bn/asm/mips-mont.pl 0 → 100644 +419 −0 Original line number Diff line number Diff line #!/usr/bin/env perl # # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # This module doesn't present direct interest for OpenSSL, because it # doesn't provide better performance for longer keys. While 512-bit # RSA private key operations are 40% faster, 1024-bit ones are hardly # faster at all, while longer key operations are slower by up to 20%. # It might be of interest to embedded system developers though, as # it's smaller than 1KB, yet offers ~3x improvement over compiler # generated code. ###################################################################### # There is a number of MIPS ABI in use, O32 and N32/64 are most # widely used. Then there is a new contender: NUBI. It appears that if # one picks the latter, it's possible to arrange code in ABI neutral # manner. Therefore let's stick to NUBI register layout: # ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); # # The return value is placed in $a0. 
Following coding rules facilitate # interoperability: # # - never ever touch $tp, "thread pointer", former $gp; # - copy return value to $t0, former $v0 [or to $a0 if you're adapting # old code]; # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; # # For reference here is register layout for N32/64 MIPS ABIs: # # ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); # $flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 if ($flavour =~ /64|n32/i) { $PTR_ADD="dadd"; # incidentally works even on n32 $PTR_SUB="dsub"; # incidentally works even on n32 $REG_S="sd"; $REG_L="ld"; $SZREG=8; } else { $PTR_ADD="add"; $PTR_SUB="sub"; $REG_S="sw"; $REG_L="lw"; $SZREG=4; } $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000; # # <appro@openssl.org> # ###################################################################### while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; if ($flavour =~ /64|n32/i) { $LD="ld"; $ST="sd"; $MULTU="dmultu"; $ADDU="daddu"; $SUBU="dsubu"; $BNSZ=8; } else { $LD="lw"; $ST="sw"; $MULTU="multu"; $ADDU="addu"; $SUBU="subu"; $BNSZ=4; } # int bn_mul_mont( $rp=$a0; # BN_ULONG *rp, $ap=$a1; # const BN_ULONG *ap, $bp=$a2; # const BN_ULONG *bp, $np=$a3; # const BN_ULONG *np, $n0=$a4; # const BN_ULONG *n0, $num=$a5; # int num); $lo0=$a6; $hi0=$a7; $lo1=$t1; $hi1=$t2; $aj=$s0; $bi=$s1; $nj=$s2; $tp=$s3; $alo=$s4; $ahi=$s5; $nlo=$s6; $nhi=$s7; $tj=$s8; $i=$s9; $j=$s10; $m1=$s11; $FRAMESIZE=14; $code=<<___; .text .set noat .set noreorder .align 5 .globl bn_mul_mont .ent bn_mul_mont bn_mul_mont: ___ $code.=<<___ if ($flavour =~ /o32/i); lw $n0,16($sp) lw $num,20($sp) ___ $code.=<<___; slt $at,$num,4 beqzl $at,bn_mul_mont_internal li $t0,0 jr $ra li $a0,0 .end bn_mul_mont .align 5 .ent 
bn_mul_mont_internal bn_mul_mont_internal: .frame $fp,$FRAMESIZE*$SZREG,$ra .mask 0x40000000|$SAVED_REGS_MASK,-$SZREG $PTR_SUB $sp,$FRAMESIZE*$SZREG $REG_S $fp,($FRAMESIZE-1)*$SZREG($sp) $REG_S $s11,($FRAMESIZE-2)*$SZREG($sp) $REG_S $s10,($FRAMESIZE-3)*$SZREG($sp) $REG_S $s9,($FRAMESIZE-4)*$SZREG($sp) $REG_S $s8,($FRAMESIZE-5)*$SZREG($sp) $REG_S $s7,($FRAMESIZE-6)*$SZREG($sp) $REG_S $s6,($FRAMESIZE-7)*$SZREG($sp) $REG_S $s5,($FRAMESIZE-8)*$SZREG($sp) $REG_S $s4,($FRAMESIZE-9)*$SZREG($sp) ___ $code.=<<___ if ($flavour =~ /nubi/i); $REG_S $s3,($FRAMESIZE-10)*$SZREG($sp) $REG_S $s2,($FRAMESIZE-11)*$SZREG($sp) $REG_S $s1,($FRAMESIZE-12)*$SZREG($sp) $REG_S $s0,($FRAMESIZE-13)*$SZREG($sp) ___ $code.=<<___; move $fp,$sp .set reorder $LD $n0,0($n0) $LD $bi,0($bp) # bp[0] $LD $aj,0($ap) # ap[0] $LD $nj,0($np) # np[0] $PTR_SUB $sp,2*$BNSZ # place for two extra words sll $num,`log($BNSZ)/log(2)` li $at,-4096 $PTR_SUB $sp,$num and $sp,$at $MULTU $aj,$bi $LD $alo,$BNSZ($ap) $LD $nlo,$BNSZ($np) mflo $lo0 mfhi $hi0 $MULTU $lo0,$n0 mflo $m1 $MULTU $alo,$bi mflo $alo mfhi $ahi $MULTU $nj,$m1 mflo $lo1 mfhi $hi1 $MULTU $nlo,$m1 $ADDU $lo1,$lo0 sltu $at,$lo1,$lo0 $ADDU $hi1,$at mflo $nlo mfhi $nhi move $tp,$sp li $j,2*$BNSZ .align 4 .L1st: .set noreorder $PTR_ADD $aj,$ap,$j $PTR_ADD $nj,$np,$j $LD $aj,($aj) $LD $nj,($nj) $MULTU $aj,$bi $ADDU $lo0,$alo,$hi0 $ADDU $lo1,$nlo,$hi1 sltu $at,$lo0,$hi0 sltu $t0,$lo1,$hi1 $ADDU $hi0,$ahi,$at $ADDU $hi1,$nhi,$t0 mflo $alo mfhi $ahi $ADDU $lo1,$lo0 sltu $at,$lo1,$lo0 $MULTU $nj,$m1 $ADDU $hi1,$at addu $j,$BNSZ $ST $lo1,($tp) sltu $t0,$j,$num mflo $nlo mfhi $nhi bnez $t0,.L1st $PTR_ADD $tp,$BNSZ .set reorder $ADDU $lo0,$alo,$hi0 sltu $at,$lo0,$hi0 $ADDU $hi0,$ahi,$at $ADDU $lo1,$nlo,$hi1 sltu $t0,$lo1,$hi1 $ADDU $hi1,$nhi,$t0 $ADDU $lo1,$lo0 sltu $at,$lo1,$lo0 $ADDU $hi1,$at $ST $lo1,($tp) $ADDU $hi1,$hi0 sltu $at,$hi1,$hi0 $ST $hi1,$BNSZ($tp) $ST $at,2*$BNSZ($tp) li $i,$BNSZ .align 4 .Louter: $PTR_ADD $bi,$bp,$i $LD $bi,($bi) $LD $aj,($ap) $LD 
$alo,$BNSZ($ap) $LD $tj,($sp) $MULTU $aj,$bi $LD $nj,($np) $LD $nlo,$BNSZ($np) mflo $lo0 mfhi $hi0 $ADDU $lo0,$tj $MULTU $lo0,$n0 sltu $at,$lo0,$tj $ADDU $hi0,$at mflo $m1 $MULTU $alo,$bi mflo $alo mfhi $ahi $MULTU $nj,$m1 mflo $lo1 mfhi $hi1 $MULTU $nlo,$m1 $ADDU $lo1,$lo0 sltu $at,$lo1,$lo0 $ADDU $hi1,$at mflo $nlo mfhi $nhi move $tp,$sp li $j,2*$BNSZ $LD $tj,$BNSZ($tp) .align 4 .Linner: .set noreorder $PTR_ADD $aj,$ap,$j $PTR_ADD $nj,$np,$j $LD $aj,($aj) $LD $nj,($nj) $MULTU $aj,$bi $ADDU $lo0,$alo,$hi0 $ADDU $lo1,$nlo,$hi1 sltu $at,$lo0,$hi0 sltu $t0,$lo1,$hi1 $ADDU $hi0,$ahi,$at $ADDU $hi1,$nhi,$t0 mflo $alo mfhi $ahi $ADDU $lo0,$tj addu $j,$BNSZ $MULTU $nj,$m1 sltu $at,$lo0,$tj $ADDU $lo1,$lo0 $ADDU $hi0,$at sltu $t0,$lo1,$lo0 $LD $tj,2*$BNSZ($tp) $ADDU $hi1,$t0 sltu $at,$j,$num mflo $nlo mfhi $nhi $ST $lo1,($tp) bnez $at,.Linner $PTR_ADD $tp,$BNSZ .set reorder $ADDU $lo0,$alo,$hi0 sltu $at,$lo0,$hi0 $ADDU $hi0,$ahi,$at $ADDU $lo0,$tj sltu $t0,$lo0,$tj $ADDU $hi0,$t0 $LD $tj,2*$BNSZ($tp) $ADDU $lo1,$nlo,$hi1 sltu $at,$lo1,$hi1 $ADDU $hi1,$nhi,$at $ADDU $lo1,$lo0 sltu $t0,$lo1,$lo0 $ADDU $hi1,$t0 $ST $lo1,($tp) $ADDU $lo1,$hi1,$hi0 sltu $hi1,$lo1,$hi0 $ADDU $lo1,$tj sltu $at,$lo1,$tj $ADDU $hi1,$at $ST $lo1,$BNSZ($tp) $ST $hi1,2*$BNSZ($tp) addu $i,$BNSZ sltu $t0,$i,$num bnez $t0,.Louter .set noreorder $PTR_ADD $tj,$sp,$num # &tp[num] move $tp,$sp move $ap,$sp li $hi0,0 # clear borrow bit .align 4 .Lsub: $LD $lo0,($tp) $LD $lo1,($np) $PTR_ADD $tp,$BNSZ $PTR_ADD $np,$BNSZ $SUBU $lo1,$lo0,$lo1 # tp[i]-np[i] sgtu $at,$lo1,$lo0 $SUBU $lo0,$lo1,$hi0 sgtu $hi0,$lo0,$lo1 $ST $lo0,($rp) or $hi0,$at sltu $at,$tp,$tj bnez $at,.Lsub $PTR_ADD $rp,$BNSZ $SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit move $tp,$sp $PTR_SUB $rp,$num # restore rp not $hi1,$hi0 and $ap,$hi0,$sp and $bp,$hi1,$rp or $ap,$ap,$bp # ap=borrow?tp:rp .align 4 .Lcopy: $LD $aj,($ap) $PTR_ADD $ap,$BNSZ $ST $zero,($tp) $PTR_ADD $tp,$BNSZ sltu $at,$tp,$tj $ST $aj,($rp) bnez $at,.Lcopy $PTR_ADD $rp,$BNSZ 
li $a0,1 li $t0,1 .set noreorder move $sp,$fp $REG_L $fp,($FRAMESIZE-1)*$SZREG($sp) $REG_L $s11,($FRAMESIZE-2)*$SZREG($sp) $REG_L $s10,($FRAMESIZE-3)*$SZREG($sp) $REG_L $s9,($FRAMESIZE-4)*$SZREG($sp) $REG_L $s8,($FRAMESIZE-5)*$SZREG($sp) $REG_L $s7,($FRAMESIZE-6)*$SZREG($sp) $REG_L $s6,($FRAMESIZE-7)*$SZREG($sp) $REG_L $s5,($FRAMESIZE-8)*$SZREG($sp) $REG_L $s4,($FRAMESIZE-9)*$SZREG($sp) ___ $code.=<<___ if ($flavour =~ /nubi/i); $REG_L $s3,($FRAMESIZE-10)*$SZREG($sp) $REG_L $s2,($FRAMESIZE-11)*$SZREG($sp) $REG_L $s1,($FRAMESIZE-12)*$SZREG($sp) $REG_L $s0,($FRAMESIZE-13)*$SZREG($sp) ___ $code.=<<___; jr $ra $PTR_ADD $sp,$FRAMESIZE*$SZREG .end bn_mul_mont_internal .rdata .asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>" ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; print $code; close STDOUT; crypto/sha/asm/sha1-mips.pl +110 −41 Original line number Diff line number Diff line Loading @@ -14,12 +14,62 @@ # vectorized Xupdate on MIPSIII/IV, but the goal was to code MIPS32- # compatible subroutine. There is room for minor optimization on # little-endian platforms... ###################################################################### # There is a number of MIPS ABI in use, O32 and N32/64 are most # widely used. Then there is a new contender: NUBI. It appears that if # one picks the latter, it's possible to arrange code in ABI neutral # manner. Therefore let's stick to NUBI register layout: # ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); # # The return value is placed in $a0. 
Following coding rules facilitate # interoperability: # # - never ever touch $tp, "thread pointer", former $gp; # - copy return value to $t0, former $v0 [or to $a0 if you're adapting # old code]; # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; # # For reference here is register layout for N32/64 MIPS ABIs: # # ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); # $flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 if ($flavour =~ /64|n32/i) { $PTR_ADD="dadd"; # incidentally works even on n32 $PTR_SUB="dsub"; # incidentally works even on n32 $REG_S="sd"; $REG_L="ld"; $PTR_SLL="dsll"; # incidentally works even on n32 $SZREG=8; } else { $PTR_ADD="add"; $PTR_SUB="sub"; $REG_S="sw"; $REG_L="lw"; $PTR_SLL="sll"; $SZREG=4; } $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff008 : 0x00ff0000; # # <appro@openssl.org> # # The code is somewhat IRIX-centric, i.e. is likely to require minor # adaptations for other OSes... 
###################################################################### for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/); $big_endian=0 if (/\-DL_ENDIAN/); } $big_endian=0 if (/\-DL_ENDIAN/); $output=$_ if (/^\w[\w\-]*\.\w+$/); } open STDOUT,">$output"; if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); } Loading @@ -37,9 +87,9 @@ $B="\$2"; $C="\$3"; $D="\$7"; $E="\$24"; @V=($A,$B,$C,$D,$E); $t0="\$25"; # jp,t9 $t1="\$28"; # gp $t2="\$30"; # fp,s8 $t0="\$25"; $t1=$num; # $num is offloaded to stack $t2="\$30"; # fp $K="\$31"; # ra $FRAMESIZE=16; Loading Loading @@ -187,9 +237,6 @@ ___ } $code=<<___; #include <asm.h> #include <regdef.h> .text .set noat Loading @@ -198,22 +245,32 @@ $code=<<___; .globl sha1_block_data_order .ent sha1_block_data_order sha1_block_data_order: .frame sp,$FRAMESIZE*SZREG,zero .mask 0xd0ff0000,-$FRAMESIZE*SZREG .frame $sp,$FRAMESIZE*$SZREG,$ra .mask 0xd0000000|$SAVED_REGS_MASK,-$SZREG .set noreorder PTR_SUB sp,$FRAMESIZE*SZREG REG_S \$31,($FRAMESIZE-1)*SZREG(sp) REG_S \$30,($FRAMESIZE-2)*SZREG(sp) REG_S \$28,($FRAMESIZE-3)*SZREG(sp) REG_S \$23,($FRAMESIZE-4)*SZREG(sp) REG_S \$22,($FRAMESIZE-5)*SZREG(sp) REG_S \$21,($FRAMESIZE-6)*SZREG(sp) REG_S \$20,($FRAMESIZE-7)*SZREG(sp) REG_S \$19,($FRAMESIZE-8)*SZREG(sp) REG_S \$18,($FRAMESIZE-9)*SZREG(sp) REG_S \$17,($FRAMESIZE-10)*SZREG(sp) REG_S \$16,($FRAMESIZE-11)*SZREG(sp) $PTR_SUB $sp,$FRAMESIZE*$SZREG $REG_S $ra,($FRAMESIZE-1)*$SZREG($sp) $REG_S $fp,($FRAMESIZE-2)*$SZREG($sp) $REG_S $s11,($FRAMESIZE-3)*$SZREG($sp) $REG_S $s10,($FRAMESIZE-4)*$SZREG($sp) $REG_S $s9,($FRAMESIZE-5)*$SZREG($sp) $REG_S $s8,($FRAMESIZE-6)*$SZREG($sp) $REG_S $s7,($FRAMESIZE-7)*$SZREG($sp) $REG_S $s6,($FRAMESIZE-8)*$SZREG($sp) $REG_S $s5,($FRAMESIZE-9)*$SZREG($sp) $REG_S $s4,($FRAMESIZE-10)*$SZREG($sp) ___ $code.=<<___ if ($flavour =~ /nubi/i); $REG_S $s3,($FRAMESIZE-11)*$SZREG($sp) $REG_S $s2,($FRAMESIZE-12)*$SZREG($sp) $REG_S $s1,($FRAMESIZE-13)*$SZREG($sp) $REG_S $s0,($FRAMESIZE-14)*$SZREG($sp) 
$REG_S $gp,($FRAMESIZE-15)*$SZREG($sp) ___ $code.=<<___; $PTR_SLL $num,6 $PTR_ADD $num,$inp $REG_S $num,0($sp) lw $A,0($ctx) lw $B,4($ctx) lw $C,8($ctx) Loading Loading @@ -246,6 +303,9 @@ $code.=<<___; ___ for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } $code.=<<___; $PTR_ADD $inp,64 $REG_L $num,0($sp) addu $A,$X[0] addu $B,$X[1] sw $A,0($ctx) Loading @@ -253,29 +313,38 @@ $code.=<<___; addu $D,$X[3] sw $B,4($ctx) addu $E,$X[4] PTR_SUB $num,1 sw $C,8($ctx) sw $D,12($ctx) sw $E,16($ctx) .set noreorder bnez $num,.Loop PTR_ADD $inp,64 bne $inp,$num,.Loop nop .set noreorder REG_L \$31,($FRAMESIZE-1)*SZREG(sp) REG_L \$30,($FRAMESIZE-2)*SZREG(sp) REG_L \$28,($FRAMESIZE-3)*SZREG(sp) REG_L \$23,($FRAMESIZE-4)*SZREG(sp) REG_L \$22,($FRAMESIZE-5)*SZREG(sp) REG_L \$21,($FRAMESIZE-6)*SZREG(sp) REG_L \$20,($FRAMESIZE-7)*SZREG(sp) REG_L \$19,($FRAMESIZE-8)*SZREG(sp) REG_L \$18,($FRAMESIZE-9)*SZREG(sp) REG_L \$17,($FRAMESIZE-10)*SZREG(sp) REG_L \$16,($FRAMESIZE-11)*SZREG(sp) jr ra PTR_ADD sp,$FRAMESIZE*SZREG $REG_L $ra,($FRAMESIZE-1)*$SZREG($sp) $REG_L $fp,($FRAMESIZE-2)*$SZREG($sp) $REG_L $s11,($FRAMESIZE-3)*$SZREG($sp) $REG_L $s10,($FRAMESIZE-4)*$SZREG($sp) $REG_L $s9,($FRAMESIZE-5)*$SZREG($sp) $REG_L $s8,($FRAMESIZE-6)*$SZREG($sp) $REG_L $s7,($FRAMESIZE-7)*$SZREG($sp) $REG_L $s6,($FRAMESIZE-8)*$SZREG($sp) $REG_L $s5,($FRAMESIZE-9)*$SZREG($sp) $REG_L $s4,($FRAMESIZE-10)*$SZREG($sp) ___ $code.=<<___ if ($flavour =~ /nubi/i); $REG_L $s3,($FRAMESIZE-11)*$SZREG($sp) $REG_L $s2,($FRAMESIZE-12)*$SZREG($sp) $REG_L $s1,($FRAMESIZE-13)*$SZREG($sp) $REG_L $s0,($FRAMESIZE-14)*$SZREG($sp) $REG_L $gp,($FRAMESIZE-15)*$SZREG($sp) ___ $code.=<<___; jr $ra $PTR_ADD $sp,$FRAMESIZE*$SZREG .end sha1_block_data_order .rdata .asciiz "SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>" ___ print $code; close STDOUT; Loading
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# This module doesn't present direct interest for OpenSSL, because it
# doesn't provide better performance for longer keys. While 512-bit
# RSA private key operations are 40% faster, 1024-bit ones are hardly
# faster at all, while longer key operations are slower by up to 20%.
# It might be of interest to embedded system developers though, as
# it's smaller than 1KB, yet offers ~3x improvement over compiler
# generated code.

# Usage: perl mips-mont.pl <flavour> [other args...] [<output-file>]
#
# The first argument selects the ABI flavour. The remaining arguments
# are scanned in order and the first one that looks like a file name
# (word characters, a dot, an extension) becomes the output file. When
# no such argument is present the generated assembly goes to the
# inherited standard output.

######################################################################
# There is a number of MIPS ABI in use, O32 and N32/64 are most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in ABI neutral
# manner. Therefore let's stick to NUBI register layout:
#
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. Following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp;
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
#   old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64

# Pointer/register-save mnemonics and the register slot size depend on
# whether the flavour uses 64-bit registers.
if ($flavour =~ /64|n32/i) {
	$PTR_ADD="dadd";	# incidentally works even on n32
	$PTR_SUB="dsub";	# incidentally works even on n32
	$REG_S="sd";
	$REG_L="ld";
	$SZREG=8;
} else {
	$PTR_ADD="add";
	$PTR_SUB="sub";
	$REG_S="sw";
	$REG_L="lw";
	$SZREG=4;
}
# Bit mask for the .mask directive: NUBI additionally saves $s0-$s3
# (registers 12..15), the other flavours only registers 16..23.
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
#
# <appro@openssl.org>
#
######################################################################

# Skip arguments until one looks like an output file name.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output name was actually found; a failed
# reopen would otherwise leave STDOUT closed and silently discard the
# generated code.
if ($output) {
	open(STDOUT,'>',$output) or die "can't open $output: $!";
}

# Big-number word size and the matching load/store/arithmetic mnemonics.
if ($flavour =~ /64|n32/i) {
	$LD="ld";
	$ST="sd";
	$MULTU="dmultu";
	$ADDU="daddu";
	$SUBU="dsubu";
	$BNSZ=8;
} else {
	$LD="lw";
	$ST="sw";
	$MULTU="multu";
	$ADDU="addu";
	$SUBU="subu";
	$BNSZ=4;
}

# int bn_mul_mont(
$rp=$a0;	# BN_ULONG *rp,
$ap=$a1;	# const BN_ULONG *ap,
$bp=$a2;	# const BN_ULONG *bp,
$np=$a3;	# const BN_ULONG *np,
$n0=$a4;	# const BN_ULONG *n0,
$num=$a5;	# int num);

# Working registers. Note that $tp is re-bound here from the "thread
# pointer" register to $s3 and is used as the cursor into the
# temporary vector kept on the stack.
$lo0=$a6;
$hi0=$a7;
$lo1=$t1;
$hi1=$t2;
$aj=$s0;
$bi=$s1;
$nj=$s2;
$tp=$s3;
$alo=$s4;
$ahi=$s5;
$nlo=$s6;
$nhi=$s7;
$tj=$s8;
$i=$s9;
$j=$s10;
$m1=$s11;

$FRAMESIZE=14;	# register save area, in $SZREG-sized slots

$code=<<___;
.text

.set	noat
.set	noreorder

.align	5
.globl	bn_mul_mont
.ent	bn_mul_mont
bn_mul_mont:
___
# On O32 only four arguments travel in registers, the 5th and 6th are
# fetched from the caller's stack frame.
$code.=<<___ if ($flavour =~ /o32/i);
	lw	$n0,16($sp)
	lw	$num,20($sp)
___
# Entry stub: for num<4 return 0, otherwise continue in
# bn_mul_mont_internal. A plain (not "likely") branch is used so that
# the delay slot clearing $t0 executes on the fall-through path as well;
# with a branch-likely the slot is nullified when the branch is not
# taken and $t0 (former $v0) would be returned uninitialized.
$code.=<<___;
	slt	$at,$num,4
	beqz	$at,bn_mul_mont_internal
	li	$t0,0			# delay slot, always executed
	jr	$ra
	li	$a0,0
.end	bn_mul_mont

.align	5
.ent	bn_mul_mont_internal
bn_mul_mont_internal:
	.frame	$fp,$FRAMESIZE*$SZREG,$ra
	.mask	0x40000000|$SAVED_REGS_MASK,-$SZREG
	$PTR_SUB $sp,$FRAMESIZE*$SZREG
	$REG_S	$fp,($FRAMESIZE-1)*$SZREG($sp)
	$REG_S	$s11,($FRAMESIZE-2)*$SZREG($sp)
	$REG_S	$s10,($FRAMESIZE-3)*$SZREG($sp)
	$REG_S	$s9,($FRAMESIZE-4)*$SZREG($sp)
	$REG_S	$s8,($FRAMESIZE-5)*$SZREG($sp)
	$REG_S	$s7,($FRAMESIZE-6)*$SZREG($sp)
	$REG_S	$s6,($FRAMESIZE-7)*$SZREG($sp)
	$REG_S	$s5,($FRAMESIZE-8)*$SZREG($sp)
	$REG_S	$s4,($FRAMESIZE-9)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_S	$s3,($FRAMESIZE-10)*$SZREG($sp)
	$REG_S	$s2,($FRAMESIZE-11)*$SZREG($sp)
	$REG_S	$s1,($FRAMESIZE-12)*$SZREG($sp)
	$REG_S	$s0,($FRAMESIZE-13)*$SZREG($sp)
___
# Main body, by label:
#   (prologue) load *n0, reserve num+2 words below the frame for the
#              temporary vector and round \$sp down to a 4096 boundary;
#   .L1st      first pass over ap[]/np[] using bp[0], storing into tp[];
#   .Louter    one iteration per remaining bp[i];
#   .Linner    pass over ap[]/np[] accumulating into tp[];
#   .Lsub      rp[i] = tp[i]-np[i] with borrow propagation;
#   .Lcopy     copy either tp[] or rp[] to rp[] depending on the borrow,
#              zeroing tp[] on the way;
#   (epilogue) return 1 in both \$a0 and \$t0, restore saved registers.
$code.=<<___;
	move	$fp,$sp

	.set	reorder
	$LD	$n0,0($n0)
	$LD	$bi,0($bp)	# bp[0]
	$LD	$aj,0($ap)	# ap[0]
	$LD	$nj,0($np)	# np[0]

	$PTR_SUB $sp,2*$BNSZ	# place for two extra words
	sll	$num,`log($BNSZ)/log(2)`
	li	$at,-4096
	$PTR_SUB $sp,$num
	and	$sp,$at

	$MULTU	$aj,$bi
	$LD	$alo,$BNSZ($ap)
	$LD	$nlo,$BNSZ($np)
	mflo	$lo0
	mfhi	$hi0
	$MULTU	$lo0,$n0
	mflo	$m1

	$MULTU	$alo,$bi
	mflo	$alo
	mfhi	$ahi

	$MULTU	$nj,$m1
	mflo	$lo1
	mfhi	$hi1
	$MULTU	$nlo,$m1
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at
	mflo	$nlo
	mfhi	$nhi

	move	$tp,$sp
	li	$j,2*$BNSZ
.align	4
.L1st:
	.set	noreorder
	$PTR_ADD $aj,$ap,$j
	$PTR_ADD $nj,$np,$j
	$LD	$aj,($aj)
	$LD	$nj,($nj)

	$MULTU	$aj,$bi
	$ADDU	$lo0,$alo,$hi0
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo0,$hi0
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi0,$ahi,$at
	$ADDU	$hi1,$nhi,$t0
	mflo	$alo
	mfhi	$ahi

	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$MULTU	$nj,$m1
	$ADDU	$hi1,$at
	addu	$j,$BNSZ
	$ST	$lo1,($tp)
	sltu	$t0,$j,$num
	mflo	$nlo
	mfhi	$nhi

	bnez	$t0,.L1st
	$PTR_ADD $tp,$BNSZ
	.set	reorder

	$ADDU	$lo0,$alo,$hi0
	sltu	$at,$lo0,$hi0
	$ADDU	$hi0,$ahi,$at

	$ADDU	$lo1,$nlo,$hi1
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi1,$nhi,$t0
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at

	$ST	$lo1,($tp)

	$ADDU	$hi1,$hi0
	sltu	$at,$hi1,$hi0
	$ST	$hi1,$BNSZ($tp)
	$ST	$at,2*$BNSZ($tp)

	li	$i,$BNSZ
.align	4
.Louter:
	$PTR_ADD $bi,$bp,$i
	$LD	$bi,($bi)
	$LD	$aj,($ap)
	$LD	$alo,$BNSZ($ap)
	$LD	$tj,($sp)

	$MULTU	$aj,$bi
	$LD	$nj,($np)
	$LD	$nlo,$BNSZ($np)
	mflo	$lo0
	mfhi	$hi0
	$ADDU	$lo0,$tj
	$MULTU	$lo0,$n0
	sltu	$at,$lo0,$tj
	$ADDU	$hi0,$at
	mflo	$m1

	$MULTU	$alo,$bi
	mflo	$alo
	mfhi	$ahi

	$MULTU	$nj,$m1
	mflo	$lo1
	mfhi	$hi1

	$MULTU	$nlo,$m1
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at
	mflo	$nlo
	mfhi	$nhi

	move	$tp,$sp
	li	$j,2*$BNSZ
	$LD	$tj,$BNSZ($tp)
.align	4
.Linner:
	.set	noreorder
	$PTR_ADD $aj,$ap,$j
	$PTR_ADD $nj,$np,$j
	$LD	$aj,($aj)
	$LD	$nj,($nj)

	$MULTU	$aj,$bi
	$ADDU	$lo0,$alo,$hi0
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo0,$hi0
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi0,$ahi,$at
	$ADDU	$hi1,$nhi,$t0
	mflo	$alo
	mfhi	$ahi

	$ADDU	$lo0,$tj
	addu	$j,$BNSZ
	$MULTU	$nj,$m1
	sltu	$at,$lo0,$tj
	$ADDU	$lo1,$lo0
	$ADDU	$hi0,$at
	sltu	$t0,$lo1,$lo0
	$LD	$tj,2*$BNSZ($tp)
	$ADDU	$hi1,$t0
	sltu	$at,$j,$num
	mflo	$nlo
	mfhi	$nhi
	$ST	$lo1,($tp)
	bnez	$at,.Linner
	$PTR_ADD $tp,$BNSZ
	.set	reorder

	$ADDU	$lo0,$alo,$hi0
	sltu	$at,$lo0,$hi0
	$ADDU	$hi0,$ahi,$at
	$ADDU	$lo0,$tj
	sltu	$t0,$lo0,$tj
	$ADDU	$hi0,$t0

	$LD	$tj,2*$BNSZ($tp)
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo1,$hi1
	$ADDU	$hi1,$nhi,$at
	$ADDU	$lo1,$lo0
	sltu	$t0,$lo1,$lo0
	$ADDU	$hi1,$t0
	$ST	$lo1,($tp)

	$ADDU	$lo1,$hi1,$hi0
	sltu	$hi1,$lo1,$hi0
	$ADDU	$lo1,$tj
	sltu	$at,$lo1,$tj
	$ADDU	$hi1,$at
	$ST	$lo1,$BNSZ($tp)
	$ST	$hi1,2*$BNSZ($tp)

	addu	$i,$BNSZ
	sltu	$t0,$i,$num
	bnez	$t0,.Louter

	.set	noreorder
	$PTR_ADD $tj,$sp,$num	# &tp[num]
	move	$tp,$sp
	move	$ap,$sp
	li	$hi0,0		# clear borrow bit

.align	4
.Lsub:
	$LD	$lo0,($tp)
	$LD	$lo1,($np)
	$PTR_ADD $tp,$BNSZ
	$PTR_ADD $np,$BNSZ
	$SUBU	$lo1,$lo0,$lo1	# tp[i]-np[i]
	sgtu	$at,$lo1,$lo0
	$SUBU	$lo0,$lo1,$hi0
	sgtu	$hi0,$lo0,$lo1
	$ST	$lo0,($rp)
	or	$hi0,$at
	sltu	$at,$tp,$tj
	bnez	$at,.Lsub
	$PTR_ADD $rp,$BNSZ

	$SUBU	$hi0,$hi1,$hi0	# handle upmost overflow bit
	move	$tp,$sp
	$PTR_SUB $rp,$num	# restore rp
	not	$hi1,$hi0

	and	$ap,$hi0,$sp
	and	$bp,$hi1,$rp
	or	$ap,$ap,$bp	# ap=borrow?tp:rp

.align	4
.Lcopy:
	$LD	$aj,($ap)
	$PTR_ADD $ap,$BNSZ
	$ST	$zero,($tp)
	$PTR_ADD $tp,$BNSZ
	sltu	$at,$tp,$tj
	$ST	$aj,($rp)
	bnez	$at,.Lcopy
	$PTR_ADD $rp,$BNSZ

	li	$a0,1
	li	$t0,1

	.set	noreorder
	move	$sp,$fp
	$REG_L	$fp,($FRAMESIZE-1)*$SZREG($sp)
	$REG_L	$s11,($FRAMESIZE-2)*$SZREG($sp)
	$REG_L	$s10,($FRAMESIZE-3)*$SZREG($sp)
	$REG_L	$s9,($FRAMESIZE-4)*$SZREG($sp)
	$REG_L	$s8,($FRAMESIZE-5)*$SZREG($sp)
	$REG_L	$s7,($FRAMESIZE-6)*$SZREG($sp)
	$REG_L	$s6,($FRAMESIZE-7)*$SZREG($sp)
	$REG_L	$s5,($FRAMESIZE-8)*$SZREG($sp)
	$REG_L	$s4,($FRAMESIZE-9)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s3,($FRAMESIZE-10)*$SZREG($sp)
	$REG_L	$s2,($FRAMESIZE-11)*$SZREG($sp)
	$REG_L	$s1,($FRAMESIZE-12)*$SZREG($sp)
	$REG_L	$s0,($FRAMESIZE-13)*$SZREG($sp)
___
$code.=<<___;
	jr	$ra
	$PTR_ADD $sp,$FRAMESIZE*$SZREG
.end	bn_mul_mont_internal
.rdata
.asciiz	"Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
___

# Evaluate the back-quoted Perl expressions embedded in the template
# (e.g. the shift count derived from $BNSZ).
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
# Buffered write errors only surface at close, so check it.
close(STDOUT) or die "error closing output: $!";
crypto/sha/asm/sha1-mips.pl +110 −41 Original line number Diff line number Diff line Loading @@ -14,12 +14,62 @@ # vectorized Xupdate on MIPSIII/IV, but the goal was to code MIPS32- # compatible subroutine. There is room for minor optimization on # little-endian platforms... ###################################################################### # There is a number of MIPS ABI in use, O32 and N32/64 are most # widely used. Then there is a new contender: NUBI. It appears that if # one picks the latter, it's possible to arrange code in ABI neutral # manner. Therefore let's stick to NUBI register layout: # ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); # # The return value is placed in $a0. Following coding rules facilitate # interoperability: # # - never ever touch $tp, "thread pointer", former $gp; # - copy return value to $t0, former $v0 [or to $a0 if you're adapting # old code]; # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; # # For reference here is register layout for N32/64 MIPS ABIs: # # ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); # $flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 if ($flavour =~ /64|n32/i) { $PTR_ADD="dadd"; # incidentally works even on n32 $PTR_SUB="dsub"; # incidentally works even on n32 $REG_S="sd"; $REG_L="ld"; $PTR_SLL="dsll"; # incidentally works even on n32 $SZREG=8; } else { $PTR_ADD="add"; $PTR_SUB="sub"; $REG_S="sw"; $REG_L="lw"; $PTR_SLL="sll"; $SZREG=4; } $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff008 : 0x00ff0000; # # <appro@openssl.org> # # The code is somewhat IRIX-centric, i.e. 
is likely to require minor # adaptations for other OSes... ###################################################################### for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/); $big_endian=0 if (/\-DL_ENDIAN/); } $big_endian=0 if (/\-DL_ENDIAN/); $output=$_ if (/^\w[\w\-]*\.\w+$/); } open STDOUT,">$output"; if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); } Loading @@ -37,9 +87,9 @@ $B="\$2"; $C="\$3"; $D="\$7"; $E="\$24"; @V=($A,$B,$C,$D,$E); $t0="\$25"; # jp,t9 $t1="\$28"; # gp $t2="\$30"; # fp,s8 $t0="\$25"; $t1=$num; # $num is offloaded to stack $t2="\$30"; # fp $K="\$31"; # ra $FRAMESIZE=16; Loading Loading @@ -187,9 +237,6 @@ ___ } $code=<<___; #include <asm.h> #include <regdef.h> .text .set noat Loading @@ -198,22 +245,32 @@ $code=<<___; .globl sha1_block_data_order .ent sha1_block_data_order sha1_block_data_order: .frame sp,$FRAMESIZE*SZREG,zero .mask 0xd0ff0000,-$FRAMESIZE*SZREG .frame $sp,$FRAMESIZE*$SZREG,$ra .mask 0xd0000000|$SAVED_REGS_MASK,-$SZREG .set noreorder PTR_SUB sp,$FRAMESIZE*SZREG REG_S \$31,($FRAMESIZE-1)*SZREG(sp) REG_S \$30,($FRAMESIZE-2)*SZREG(sp) REG_S \$28,($FRAMESIZE-3)*SZREG(sp) REG_S \$23,($FRAMESIZE-4)*SZREG(sp) REG_S \$22,($FRAMESIZE-5)*SZREG(sp) REG_S \$21,($FRAMESIZE-6)*SZREG(sp) REG_S \$20,($FRAMESIZE-7)*SZREG(sp) REG_S \$19,($FRAMESIZE-8)*SZREG(sp) REG_S \$18,($FRAMESIZE-9)*SZREG(sp) REG_S \$17,($FRAMESIZE-10)*SZREG(sp) REG_S \$16,($FRAMESIZE-11)*SZREG(sp) $PTR_SUB $sp,$FRAMESIZE*$SZREG $REG_S $ra,($FRAMESIZE-1)*$SZREG($sp) $REG_S $fp,($FRAMESIZE-2)*$SZREG($sp) $REG_S $s11,($FRAMESIZE-3)*$SZREG($sp) $REG_S $s10,($FRAMESIZE-4)*$SZREG($sp) $REG_S $s9,($FRAMESIZE-5)*$SZREG($sp) $REG_S $s8,($FRAMESIZE-6)*$SZREG($sp) $REG_S $s7,($FRAMESIZE-7)*$SZREG($sp) $REG_S $s6,($FRAMESIZE-8)*$SZREG($sp) $REG_S $s5,($FRAMESIZE-9)*$SZREG($sp) $REG_S $s4,($FRAMESIZE-10)*$SZREG($sp) ___ $code.=<<___ if ($flavour =~ /nubi/i); $REG_S $s3,($FRAMESIZE-11)*$SZREG($sp) $REG_S $s2,($FRAMESIZE-12)*$SZREG($sp) $REG_S 
$s1,($FRAMESIZE-13)*$SZREG($sp) $REG_S $s0,($FRAMESIZE-14)*$SZREG($sp) $REG_S $gp,($FRAMESIZE-15)*$SZREG($sp) ___ $code.=<<___; $PTR_SLL $num,6 $PTR_ADD $num,$inp $REG_S $num,0($sp) lw $A,0($ctx) lw $B,4($ctx) lw $C,8($ctx) Loading Loading @@ -246,6 +303,9 @@ $code.=<<___; ___ for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } $code.=<<___; $PTR_ADD $inp,64 $REG_L $num,0($sp) addu $A,$X[0] addu $B,$X[1] sw $A,0($ctx) Loading @@ -253,29 +313,38 @@ $code.=<<___; addu $D,$X[3] sw $B,4($ctx) addu $E,$X[4] PTR_SUB $num,1 sw $C,8($ctx) sw $D,12($ctx) sw $E,16($ctx) .set noreorder bnez $num,.Loop PTR_ADD $inp,64 bne $inp,$num,.Loop nop .set noreorder REG_L \$31,($FRAMESIZE-1)*SZREG(sp) REG_L \$30,($FRAMESIZE-2)*SZREG(sp) REG_L \$28,($FRAMESIZE-3)*SZREG(sp) REG_L \$23,($FRAMESIZE-4)*SZREG(sp) REG_L \$22,($FRAMESIZE-5)*SZREG(sp) REG_L \$21,($FRAMESIZE-6)*SZREG(sp) REG_L \$20,($FRAMESIZE-7)*SZREG(sp) REG_L \$19,($FRAMESIZE-8)*SZREG(sp) REG_L \$18,($FRAMESIZE-9)*SZREG(sp) REG_L \$17,($FRAMESIZE-10)*SZREG(sp) REG_L \$16,($FRAMESIZE-11)*SZREG(sp) jr ra PTR_ADD sp,$FRAMESIZE*SZREG $REG_L $ra,($FRAMESIZE-1)*$SZREG($sp) $REG_L $fp,($FRAMESIZE-2)*$SZREG($sp) $REG_L $s11,($FRAMESIZE-3)*$SZREG($sp) $REG_L $s10,($FRAMESIZE-4)*$SZREG($sp) $REG_L $s9,($FRAMESIZE-5)*$SZREG($sp) $REG_L $s8,($FRAMESIZE-6)*$SZREG($sp) $REG_L $s7,($FRAMESIZE-7)*$SZREG($sp) $REG_L $s6,($FRAMESIZE-8)*$SZREG($sp) $REG_L $s5,($FRAMESIZE-9)*$SZREG($sp) $REG_L $s4,($FRAMESIZE-10)*$SZREG($sp) ___ $code.=<<___ if ($flavour =~ /nubi/i); $REG_L $s3,($FRAMESIZE-11)*$SZREG($sp) $REG_L $s2,($FRAMESIZE-12)*$SZREG($sp) $REG_L $s1,($FRAMESIZE-13)*$SZREG($sp) $REG_L $s0,($FRAMESIZE-14)*$SZREG($sp) $REG_L $gp,($FRAMESIZE-15)*$SZREG($sp) ___ $code.=<<___; jr $ra $PTR_ADD $sp,$FRAMESIZE*$SZREG .end sha1_block_data_order .rdata .asciiz "SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>" ___ print $code; close STDOUT;