Commit 5727f1f7 authored by Andy Polyakov's avatar Andy Polyakov
Browse files

SHA1 assembler show off: minor performance updates and new modules for

forgotten CPUs.
parent 53f73afc
Loading
Loading
Loading
Loading
+314 −0
Original line number Diff line number Diff line
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA1 block procedure for Alpha.

# On 21264 performance is 33% better than code generated by vendor
# compiler, and 75% better than GCC [3.4]. Implementation features
# vectorized byte swap, but not Xupdate.

# Register allocation for the Alpha module.
# @X is the 16-entry message-schedule window, kept in $0..$15.
@X = map('$' . $_, 0 .. 15);

# Function arguments; a0 is $16.
($ctx, $inp, $num) = ("a0", "a1", "a2");

# The five SHA-1 working variables; a4 is $20.  @V is the list that the
# round loops rotate between rounds.
($A, $B, $C, $D, $E) = ("a3", "a4", "a5", "t8", "t9");
@V = ($A, $B, $C, $D, $E);

# Scratch registers; t10 is $24.
($t0, $t1, $t2, $t3) = ("t10", "t11", "ra", "t12");

# Round-constant register; AT is $28.
$K = "AT";

# BODY_00_19 - append the Alpha code for one SHA-1 round of the 0..19
# group to $code.
#
# Arguments: ($i,$a,$b,$c,$d,$e) - the round index followed by the names
# of the registers that hold the five working variables in this round.
#
# Which text is emitted depends on $i:
#   $i==0        additionally loads the first pair of input words;
#   even $i<14   additionally loads the pair needed two rounds later;
#   even $i<15   merges and byte-swaps the current pair, then does the
#                round and splits the upper word off into @X[$i+1];
#   odd  $i<15   does the round only (its word was prepared by the
#                preceding even round);
#   $i>=15       does the round and also computes the schedule word for
#                round $i+1 in place in @X[($i+1)%16].
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
# $j is the index of the next round, used by the forward Xupdate below.
my $j=$i+1;
# Very first round: ldq_u at offsets 0 and 7 fetches the two quadwords
# that the extql/extqh pair below combines.
$code.=<<___ if ($i==0);
	ldq_u	@X[0],0+0($inp)
	ldq_u	@X[1],0+7($inp)
___
# Issue the loads for input words $i+2 and $i+3 (byte offset ($i+2)*4)
# ahead of the round that consumes them.
$code.=<<___ if (!($i&1) && $i<14);
	ldq_u	@X[$i+2],($i+2)*4+0($inp)
	ldq_u	@X[$i+3],($i+2)*4+7($inp)
___
# Even round with input: both 32-bit words are byte-swapped together in
# one 64-bit register, interleaved with the round arithmetic.
$code.=<<___ if (!($i&1) && $i<15);
	extql	@X[$i],$inp,@X[$i]
	extqh	@X[$i+1],$inp,@X[$i+1]

	or	@X[$i+1],@X[$i],@X[$i]	# pair of 32-bit values are fetched

	srl	@X[$i],24,$t0		# vectorized byte swap
	srl	@X[$i],8,$t2

	sll	@X[$i],8,$t3
	sll	@X[$i],24,@X[$i]
	zapnot	$t0,0x11,$t0
	zapnot	$t2,0x22,$t2

	zapnot	@X[$i],0x88,@X[$i]
	or	$t0,$t2,$t0
	zapnot	$t3,0x44,$t3
	sll	$a,5,$t1

	or	@X[$i],$t0,@X[$i]
	addl	$K,$e,$e
	and	$b,$c,$t2
	zapnot	$a,0xf,$a

	or	@X[$i],$t3,@X[$i]
	srl	$a,27,$t0
	bic	$d,$b,$t3
	sll	$b,30,$b

	extll	@X[$i],4,@X[$i+1]	# extract upper half
	or	$t2,$t3,$t2
	addl	@X[$i],$e,$e

	addl	$t1,$e,$e
	srl	$b,32,$t3
	zapnot	@X[$i],0xf,@X[$i]

	addl	$t0,$e,$e
	addl	$t2,$e,$e
	or	$t3,$b,$b
___
# Odd round with input: plain round arithmetic on the already prepared
# word @X[$i].
$code.=<<___ if (($i&1) && $i<15);
	sll	$a,5,$t1
	addl	$K,$e,$e
	and	$b,$c,$t2
	zapnot	$a,0xf,$a

	srl	$a,27,$t0
	addl	@X[$i%16],$e,$e
	bic	$d,$b,$t3
	sll	$b,30,$b

	or	$t2,$t3,$t2
	addl	$t1,$e,$e
	srl	$b,32,$t3
	zapnot	@X[$i],0xf,@X[$i]

	addl	$t0,$e,$e
	addl	$t2,$e,$e
	or	$t3,$b,$b
___
# Rounds 15..19: the xor/srl/addl/or sequence on @X[$j%16] folds in the
# words at $j+2, $j+8 and $j+13 and rotates the result left by one bit.
$code.=<<___ if ($i>=15);	# with forward Xupdate
	sll	$a,5,$t1
	addl	$K,$e,$e
	and	$b,$c,$t2
	xor	@X[($j+2)%16],@X[$j%16],@X[$j%16]

	zapnot	$a,0xf,$a
	addl	@X[$i%16],$e,$e
	bic	$d,$b,$t3
	xor	@X[($j+8)%16],@X[$j%16],@X[$j%16]

	srl	$a,27,$t0
	addl	$t1,$e,$e
	or	$t2,$t3,$t2
	xor	@X[($j+13)%16],@X[$j%16],@X[$j%16]

	sll	$b,30,$b
	addl	$t0,$e,$e
	srl	@X[$j%16],31,$t1

	addl	$t2,$e,$e
	srl	$b,32,$t3
	addl	@X[$j%16],@X[$j%16],@X[$j%16]

	or	$t3,$b,$b
	zapnot	@X[$i%16],0xf,@X[$i%16]
	or	$t1,@X[$j%16],@X[$j%16]
___
}

# BODY_20_39 - append the Alpha code for one SHA-1 round that uses the
# b^c^d function to $code.  The driver calls it for rounds 20..39 and
# again for rounds 60..79.
#
# Arguments: ($i,$a,$b,$c,$d,$e) - round index and the register names
# of the five working variables for this round.
#
# Rounds below 79 also compute the schedule word for round $i+1; rounds
# below 77 additionally clear the upper half of the word just consumed.
# Round 79 instead loads the five context words into @X[0..4].
sub BODY_20_39 {
    my ($i,$a,$b,$c,$d,$e) = @_;
    my $j = $i + 1;

    # Schedule registers touched by this round.
    my $Xi   = $X[$i % 16];		# word consumed now
    my $Xn   = $X[$j % 16];		# word produced for the next round
    my $Xn2  = $X[($j + 2) % 16];
    my $Xn8  = $X[($j + 8) % 16];
    my $Xn13 = $X[($j + 13) % 16];

    if ($i < 79) {
	$code .= <<___;	# with forward Xupdate
	sll	$a,5,$t1
	addl	$K,$e,$e
	zapnot	$a,0xf,$a
	xor	$Xn2,$Xn,$Xn

	sll	$b,30,$t3
	addl	$t1,$e,$e
	xor	$b,$c,$t2
	xor	$Xn8,$Xn,$Xn

	srl	$b,2,$b
	addl	$Xi,$e,$e
	xor	$d,$t2,$t2
	xor	$Xn13,$Xn,$Xn

	srl	$Xn,31,$t1
	addl	$t2,$e,$e
	srl	$a,27,$t0
	addl	$Xn,$Xn,$Xn

	or	$t3,$b,$b
	addl	$t0,$e,$e
	or	$t1,$Xn,$Xn
___
	if ($i < 77) {
	    $code .= <<___;
	zapnot	$Xi,0xf,$Xi
___
	}
    }
    elsif ($i == 79) {
	$code .= <<___;	# with context fetch
	sll	$a,5,$t1
	addl	$K,$e,$e
	zapnot	$a,0xf,$a
	ldl	$X[0],0($ctx)

	sll	$b,30,$t3
	addl	$t1,$e,$e
	xor	$b,$c,$t2
	ldl	$X[1],4($ctx)

	srl	$b,2,$b
	addl	$Xi,$e,$e
	xor	$d,$t2,$t2
	ldl	$X[2],8($ctx)

	srl	$a,27,$t0
	addl	$t2,$e,$e
	ldl	$X[3],12($ctx)

	or	$t3,$b,$b
	addl	$t0,$e,$e
	ldl	$X[4],16($ctx)
___
    }
}

# BODY_40_59 - append the Alpha code for one SHA-1 round of the 40..59
# group to $code.  The boolean function is assembled from the three
# pairwise ANDs (b&c, b&d, c&d) OR-ed together.
#
# Arguments: ($i,$a,$b,$c,$d,$e) - round index and the register names
# of the five working variables for this round.
#
# Every round also computes the schedule word for round $i+1 in place
# and clears the upper half of the word it has just consumed.
sub BODY_40_59 {
    my ($i,$a,$b,$c,$d,$e) = @_;
    my $j = $i + 1;

    # Schedule registers touched by this round.
    my $Xi   = $X[$i % 16];		# word consumed now
    my $Xn   = $X[$j % 16];		# word produced for the next round
    my $Xn2  = $X[($j + 2) % 16];
    my $Xn8  = $X[($j + 8) % 16];
    my $Xn13 = $X[($j + 13) % 16];

    $code .= <<___;	# with forward Xupdate
	sll	$a,5,$t1
	addl	$K,$e,$e
	zapnot	$a,0xf,$a
	xor	$Xn2,$Xn,$Xn

	srl	$a,27,$t0
	and	$b,$c,$t2
	and	$b,$d,$t3
	xor	$Xn8,$Xn,$Xn

	sll	$b,30,$b
	addl	$t1,$e,$e
	xor	$Xn13,$Xn,$Xn

	srl	$Xn,31,$t1
	addl	$t0,$e,$e
	or	$t2,$t3,$t2
	and	$c,$d,$t3

	or	$t2,$t3,$t2
	srl	$b,32,$t3
	addl	$Xi,$e,$e
	addl	$Xn,$Xn,$Xn

	or	$t3,$b,$b
	addl	$t2,$e,$e
	or	$t1,$Xn,$Xn
	zapnot	$Xi,0xf,$Xi
___
}

# Function prologue.  The emitted code allocates a 64-byte frame and
# saves ra, s0-s5 and fp in it, loads the five context words into
# $A..$E, and turns $num from a block count into an end-of-input pointer
# ($inp + $num*64).  The text after .Lloop starts the per-block loop and
# loads the constant labelled K_00_19.
$code=<<___;
#include <asm.h>
#include <regdef.h>

.text

.set	noat
.set	noreorder
.globl	sha1_block_data_order
.align	5
.ent	sha1_block_data_order
sha1_block_data_order:
	lda	sp,-64(sp)
	stq	ra,0(sp)
	stq	s0,8(sp)
	stq	s1,16(sp)
	stq	s2,24(sp)
	stq	s3,32(sp)
	stq	s4,40(sp)
	stq	s5,48(sp)
	stq	fp,56(sp)
	.mask	0x0400fe00,-64
	.frame	sp,64,ra
	.prologue 0

	ldl	$A,0($ctx)
	ldl	$B,4($ctx)
	sll	$num,6,$num
	ldl	$C,8($ctx)
	ldl	$D,12($ctx)
	ldl	$E,16($ctx)
	addq	$inp,$num,$num

.Lloop:
	.set	noreorder
	ldah	$K,23170(zero)
	zapnot	$B,0xf,$B
	lda	$K,31129($K)	# K_00_19
___
# Unroll rounds 0..19.  After each round @V is rotated right by one so
# that the next round sees the working variables under their new roles.
# The global $i deliberately carries over into the following loops.
for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }

# Switch $K to the constant for rounds 20..39.
$code.=<<___;
	ldah	$K,28378(zero)
	lda	$K,-5215($K)	# K_20_39
___
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }

# Switch $K to the constant for rounds 40..59.
$code.=<<___;
	ldah	$K,-28900(zero)
	lda	$K,-17188($K)	# K_40_59
___
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }

# Switch $K to the constant for rounds 60..79, which reuse BODY_20_39.
$code.=<<___;
	ldah	$K,-13725(zero)
	lda	$K,-15914($K)	# K_60_79
___
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }

# Block epilogue.  @X[0..4] were loaded with the previous context words
# by round 79; they are added to $A..$E and the sums stored back.  $inp
# advances by 64 bytes and the loop repeats while $inp is below the end
# pointer in $num.  Afterwards the saved registers are restored and the
# frame is released.
$code.=<<___;
	addl	@X[0],$A,$A
	addl	@X[1],$B,$B
	addl	@X[2],$C,$C
	addl	@X[3],$D,$D
	addl	@X[4],$E,$E
	stl	$A,0($ctx)
	stl	$B,4($ctx)
	addq	$inp,64,$inp
	stl	$C,8($ctx)
	stl	$D,12($ctx)
	stl	$E,16($ctx)
	cmpult	$inp,$num,$t1
	bne	$t1,.Lloop

	.set	noreorder
	ldq	ra,0(sp)
	ldq	s0,8(sp)
	ldq	s1,16(sp)
	ldq	s2,24(sp)
	ldq	s3,32(sp)
	ldq	s4,40(sp)
	ldq	s5,48(sp)
	ldq	fp,56(sp)
	lda	sp,64(sp)
	ret	(ra)
.end	sha1_block_data_order
___
# Write the generated assembly to STDOUT.  Output is buffered, so a
# write failure (full disk, closed pipe) may only be reported when the
# handle is closed; abort in that case so that a build does not carry on
# with a truncated assembly file.
print $code;
close STDOUT or die "error closing STDOUT: $!";
+11 −1
Original line number Diff line number Diff line
@@ -86,8 +86,8 @@ $code.=<<___;
	ldr	$t3,[$Xi,#2*4]
	add	$e,$K,$e,ror#2			@ E+=K_xx_xx
	eor	$t0,$t0,$t1
	eor	$t2,$t2,$t3
	eor	$t0,$t0,$t2
	eor	$t0,$t0,$t3
	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
___
$code.=<<___ if (!defined($flag));
@@ -131,6 +131,15 @@ ___

sub BODY_40_59 {
my ($a,$b,$c,$d,$e)=@_;
if (1) {
	&Xupdate(@_);
$code.=<<___;
	and	$t2,$c,$d
	and	$t1,$b,$t1,ror#2
	add	$e,$e,$t2,ror#2
	add	$e,$e,$t1			@ E+=F_40_59(B,C,D)
___
} else {
	&Xupdate(@_,1);
$code.=<<___;
	and	$t1,$b,$c,ror#2
@@ -140,6 +149,7 @@ $code.=<<___;
	add	$e,$e,$t1			@ E+=F_40_59(B,C,D)
___
}
}

$code=<<___;
.text
+95 −97
Original line number Diff line number Diff line
@@ -15,7 +15,7 @@
# is >50% better than HP C and >2x better than gcc.

$code=<<___;
.ident  \"sha1-ia64.s, version 1.2\"
.ident  \"sha1-ia64.s, version 1.3\"
.ident  \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
.explicit

@@ -26,14 +26,10 @@ if ($^O eq "hpux") {
    $ADDP="addp4";
    for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
} else { $ADDP="add"; }
for (@ARGV) {	$big_endian=1 if (/\-DB_ENDIAN/);
		$big_endian=0 if (/\-DL_ENDIAN/);   }
if (!defined($big_endian))
	    {	$big_endian=(unpack('L',pack('N',1))==1);   }

#$human=1;
if ($human) {	# useful for visual code auditing...
	($A,$B,$C,$D,$E,$T)   = ("A","B","C","D","E","T");
	($A,$B,$C,$D,$E)   = ("A","B","C","D","E");
	($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4");
	($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
	    (	"K_00_19","K_20_39","K_40_59","K_60_79"	);
@@ -41,47 +37,50 @@ if ($human) { # useful for visual code auditing...
		"X8", "X9","X10","X11","X12","X13","X14","X15"	);
}
else {
	($A,$B,$C,$D,$E,$T)   = ("loc0","loc1","loc2","loc3","loc4","loc5");
	($h0,$h1,$h2,$h3,$h4) = ("loc6","loc7","loc8","loc9","loc10");
	($A,$B,$C,$D,$E)   =    ("loc0","loc1","loc2","loc3","loc4");
	($h0,$h1,$h2,$h3,$h4) = ("loc5","loc6","loc7","loc8","loc9");
	($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
	    (	"r14", "r15", "loc11", "loc12"	);
	    (	"r14", "r15", "loc10", "loc11"	);
	@X= (	"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
		"r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31"	);
}

sub BODY_00_15 {
local	*code=shift;
local	($i,$a,$b,$c,$d,$e,$f)=@_;
my	($i,$a,$b,$c,$d,$e)=@_;
my	$j=$i+1;
my	$Xn=@X[$j%16];

$code.=<<___ if ($i==0);
{ .mmi;	ld1	$X[$i&0xf]=[inp],2	    // MSB
{ .mmi;	ld1	$X[$i]=[inp],2		    // MSB
	ld1	tmp2=[tmp3],2		};;
{ .mmi;	ld1	tmp0=[inp],2
	ld1	tmp4=[tmp3],2		    // LSB
	dep	$X[$i&0xf]=$X[$i&0xf],tmp2,8,8	};;
	dep	$X[$i]=$X[$i],tmp2,8,8	};;
___
if ($i<15) {
	$code.=<<___;
{ .mmi;	ld1	$X[($i+1)&0xf]=[inp],2	    // +1
{ .mmi;	ld1	$Xn=[inp],2		    // forward Xload
	nop.m	0x0
	dep	tmp1=tmp0,tmp4,8,8	};;
{ .mmi;	ld1	tmp2=[tmp3],2		    // +1
{ .mmi;	ld1	tmp2=[tmp3],2		    // forward Xload
	and	tmp4=$c,$b
	dep	$X[$i&0xf]=$X[$i&0xf],tmp1,16,16	} //;;
{ .mmi;	andcm	tmp1=$d,$b
	add	tmp0=$e,$K_00_19
	dep	$X[$i]=$X[$i],tmp1,16,16} //;;
{ .mmi;	add	$e=$e,$K_00_19		    // e+=K_00_19
	andcm	tmp1=$d,$b
	dep.z	tmp5=$a,5,27		};; // a<<5
{ .mmi;	or	tmp4=tmp4,tmp1		    // F_00_19(b,c,d)=(b&c)|(~b&d)
	add	$f=tmp0,$X[$i&0xf]	    // f=xi+e+K_00_19
{ .mmi;	add	$e=$e,$X[$i]		    // e+=Xload
	or	tmp4=tmp4,tmp1		    // F_00_19(b,c,d)=(b&c)|(~b&d)
	extr.u	tmp1=$a,27,5		};; // a>>27
{ .mmi;	ld1	tmp0=[inp],2		    // +1
	add	$f=$f,tmp4		    // f+=F_00_19(b,c,d)
{ .mmi;	ld1	tmp0=[inp],2		    // forward Xload
	add	$e=$e,tmp4		    // e+=F_00_19(b,c,d)
	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30)
{ .mmi;	ld1	tmp4=[tmp3],2		    // +1
{ .mmi;	ld1	tmp4=[tmp3],2		    // forward Xload
	or	tmp5=tmp1,tmp5		    // ROTATE(a,5)
	mux2	tmp6=$a,0x44		};; // see b in next iteration
{ .mii;	add	$f=$f,tmp5		    // f+=ROTATE(a,5)
	dep	$X[($i+1)&0xf]=$X[($i+1)&0xf],tmp2,8,8	// +1
	mux2	$X[$i&0xf]=$X[$i&0xf],0x44	} //;;
{ .mii;	add	$e=$e,tmp5		    // e+=ROTATE(a,5)
	dep	$Xn=$Xn,tmp2,8,8	    // forward Xload
	mux2	$X[$i]=$X[$i],0x44	} //;;

___
	}
@@ -89,24 +88,24 @@ else {
	$code.=<<___;
{ .mii;	and	tmp3=$c,$b
	dep	tmp1=tmp0,tmp4,8,8;;
	dep	$X[$i&0xf]=$X[$i&0xf],tmp1,16,16	} //;;
{ .mmi;	andcm	tmp1=$d,$b
	add	tmp0=$e,$K_00_19
	dep	$X[$i]=$X[$i],tmp1,16,16} //;;
{ .mmi;	add	$e=$e,$K_00_19		    // e+=K_00_19
	andcm	tmp1=$d,$b
	dep.z	tmp5=$a,5,27		};; // a<<5
{ .mmi;	or	tmp4=tmp3,tmp1		    // F_00_19(b,c,d)=(b&c)|(~b&d)
	add	$f=tmp0,$X[$i&0xf]	    // f=xi+e+K_00_19
{ .mmi;	add	$e=$e,$X[$i]		    // e+=Xupdate
	or	tmp4=tmp3,tmp1		    // F_00_19(b,c,d)=(b&c)|(~b&d)
	extr.u	tmp1=$a,27,5		}   // a>>27
{ .mmi;	xor	tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf]	// +1
	xor	tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
{ .mmi;	xor	$Xn=$Xn,$X[($j+2)%16]	    // forward Xupdate
	xor	tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate
	nop.i	0			};;
{ .mmi;	add	$f=$f,tmp4		    // f+=F_00_19(b,c,d)
	xor	tmp2=tmp2,tmp3		    // +1
{ .mmi;	add	$e=$e,tmp4		    // e+=F_00_19(b,c,d)
	xor	$Xn=$Xn,tmp3		    // forward Xupdate
	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30)
{ .mmi; or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
	mux2	tmp6=$a,0x44		};; // see b in next iteration
{ .mii;	add	$f=$f,tmp1		    // f+=ROTATE(a,5)
	shrp	$e=tmp2,tmp2,31		    // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
	mux2	$X[$i&0xf]=$X[$i&0xf],0x44  };;
{ .mii;	add	$e=$e,tmp1		    // e+=ROTATE(a,5)
	shrp	$Xn=$Xn,$Xn,31		    // ROTATE(x[0]^x[2]^x[8]^x[13],1)
	mux2	$X[$i]=$X[$i],0x44	};;

___
	}
@@ -114,27 +113,28 @@ ___

sub BODY_16_19 {
local	*code=shift;
local	($i,$a,$b,$c,$d,$e,$f)=@_;
my	($i,$a,$b,$c,$d,$e)=@_;
my	$j=$i+1;
my	$Xn=@X[$j%16];

$code.=<<___;
{ .mmi;	mov	$X[$i&0xf]=$f		    // Xupdate
	and	tmp0=$c,$b
{ .mib;	add	$e=$e,$K_00_19		    // e+=K_00_19
	dep.z	tmp5=$a,5,27		}   // a<<5
{ .mmi;	andcm	tmp1=$d,$b
	add	tmp4=$e,$K_00_19	};;
{ .mmi;	or	tmp0=tmp0,tmp1		    // F_00_19(b,c,d)=(b&c)|(~b&d)
	add	$f=$f,tmp4		    // f+=e+K_00_19
{ .mib;	andcm	tmp1=$d,$b
	and	tmp0=$c,$b		};;
{ .mmi;	add	$e=$e,$X[$i%16]		    // e+=Xupdate
	or	tmp0=tmp0,tmp1		    // F_00_19(b,c,d)=(b&c)|(~b&d)
	extr.u	tmp1=$a,27,5		}   // a>>27
{ .mmi;	xor	tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf]	// +1
	xor	tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf]	// +1
{ .mmi;	xor	$Xn=$Xn,$X[($j+2)%16]	    // forward Xupdate
	xor	tmp3=$X[($j+8)%16],$X[($j+13)%16]	// forward Xupdate
	nop.i	0			};;
{ .mmi;	add	$f=$f,tmp0		    // f+=F_00_19(b,c,d)
	xor	tmp2=tmp2,tmp3		    // +1
{ .mmi;	add	$e=$e,tmp0		    // f+=F_00_19(b,c,d)
	xor	$Xn=$Xn,tmp3		    // forward Xupdate
	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30)
{ .mmi;	or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
	mux2	tmp6=$a,0x44		};; // see b in next iteration
{ .mii;	add	$f=$f,tmp1		    // f+=ROTATE(a,5)
	shrp	$e=tmp2,tmp2,31		    // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
{ .mii;	add	$e=$e,tmp1		    // e+=ROTATE(a,5)
	shrp	$Xn=$Xn,$Xn,31		    // ROTATE(x[0]^x[2]^x[8]^x[13],1)
	nop.i	0			};;

___
@@ -142,49 +142,47 @@ ___

sub BODY_20_39 {
local	*code=shift;
local	($i,$a,$b,$c,$d,$e,$f,$Konst)=@_;
my	($i,$a,$b,$c,$d,$e,$Konst)=@_;
	$Konst = $K_20_39 if (!defined($Konst));
my	$j=$i+1;
my	$Xn=@X[$j%16];

if ($i<79) {
$code.=<<___;
{ .mib;	mov	$X[$i&0xf]=$f		    // Xupdate
{ .mib;	add	$e=$e,$Konst		    // e+=K_XX_XX
	dep.z	tmp5=$a,5,27		}   // a<<5
{ .mib;	xor	tmp0=$c,$b
	add	tmp4=$e,$Konst		};;
{ .mmi;	xor	tmp0=tmp0,$d		    // F_20_39(b,c,d)=b^c^d
	add	$f=$f,tmp4		    // f+=e+K_20_39
	xor	$Xn=$Xn,$X[($j+2)%16]	};; // forward Xupdate
{ .mib;	add	$e=$e,$X[$i%16]		    // e+=Xupdate
	extr.u	tmp1=$a,27,5		}   // a>>27
{ .mmi;	xor	tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf]	// +1
	xor	tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf]	// +1
	nop.i	0			};;
{ .mmi;	add	$f=$f,tmp0		    // f+=F_20_39(b,c,d)
	xor	tmp2=tmp2,tmp3		    // +1
{ .mib;	xor	tmp0=tmp0,$d		    // F_20_39(b,c,d)=b^c^d
	xor	$Xn=$Xn,$X[($j+8)%16]	};; // forward Xupdate
{ .mmi;	add	$e=$e,tmp0		    // e+=F_20_39(b,c,d)
	xor	$Xn=$Xn,$X[($j+13)%16]	    // forward Xupdate
	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30)
{ .mmi;	or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
	mux2	tmp6=$a,0x44		};; // see b in next iteration
{ .mii;	add	$f=$f,tmp1		    // f+=ROTATE(a,5)
	shrp	$e=tmp2,tmp2,31		    // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
{ .mii;	add	$e=$e,tmp1		    // e+=ROTATE(a,5)
	shrp	$Xn=$Xn,$Xn,31		    // ROTATE(x[0]^x[2]^x[8]^x[13],1)
	nop.i	0			};;

___
}
else {
$code.=<<___;
{ .mib;	mov	$X[$i&0xf]=$f		    // Xupdate
{ .mib;	add	$e=$e,$Konst		    // e+=K_60_79
	dep.z	tmp5=$a,5,27		}   // a<<5
{ .mib;	xor	tmp0=$c,$b
	add	tmp4=$e,$Konst		};;
{ .mib;	xor	tmp0=tmp0,$d		    // F_20_39(b,c,d)=b^c^d
	extr.u	tmp1=$a,27,5		}   // a>>27
{ .mib;	add	$f=$f,tmp4		    // f+=e+K_20_39
	add	$h1=$h1,$a		};; // wrap up
{ .mmi;	add	$f=$f,tmp0		    // f+=F_20_39(b,c,d)
	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30) ;;?
{ .mmi;	or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
{ .mib;	add	$e=$e,$X[$i%16]		    // e+=Xupdate
	extr.u	tmp1=$a,27,5		}   // a>>27
{ .mib;	xor	tmp0=tmp0,$d		    // F_20_39(b,c,d)=b^c^d
	add	$h3=$h3,$c		};; // wrap up
{ .mib;	add	tmp3=1,inp		    // used in unaligned codepath
	add	$f=$f,tmp1		}   // f+=ROTATE(a,5)
{ .mib;	add	$h2=$h2,$b		    // wrap up
{ .mmi;	add	$e=$e,tmp0		    // e+=F_20_39(b,c,d)
	or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
	shrp	$b=tmp6,tmp6,2		};; // b=ROTATE(b,30) ;;?
{ .mmi;	add	$e=$e,tmp1		    // e+=ROTATE(a,5)
	add	tmp3=1,inp		    // used in unaligned codepath
	add	$h4=$h4,$d		};; // wrap up

___
@@ -193,29 +191,29 @@ ___

sub BODY_40_59 {
local	*code=shift;
local	($i,$a,$b,$c,$d,$e,$f)=@_;
my	($i,$a,$b,$c,$d,$e)=@_;
my	$j=$i+1;
my	$Xn=@X[$j%16];

$code.=<<___;
{ .mmi;	mov	$X[$i&0xf]=$f		    // Xupdate
	and	tmp0=$c,$b
{ .mib;	add	$e=$e,$K_40_59		    // e+=K_40_59
	dep.z	tmp5=$a,5,27		}   // a<<5
{ .mmi;	and	tmp1=$d,$b
	add	tmp4=$e,$K_40_59	};;
{ .mmi;	or	tmp0=tmp0,tmp1		    // (b&c)|(b&d)
	add	$f=$f,tmp4		    // f+=e+K_40_59
{ .mib;	and	tmp1=$c,$d
	xor	tmp0=$c,$d		};;
{ .mmi;	add	$e=$e,$X[$i%16]		    // e+=Xupdate
	add	tmp5=tmp5,tmp1		    // a<<5+(c&d)
	extr.u	tmp1=$a,27,5		}   // a>>27
{ .mmi;	and	tmp4=$c,$d
	xor	tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf]	// +1
	xor	tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf]	// +1
	};;
{ .mmi;	or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
	xor	tmp2=tmp2,tmp3		    // +1
{ .mmi;	and	tmp0=tmp0,$b
	xor	$Xn=$Xn,$X[($j+2)%16]	    // forward Xupdate
	xor	tmp3=$X[($j+8)%16],$X[($j+13)%16] };;	// forward Xupdate
{ .mmi;	add	$e=$e,tmp0		    // e+=b&(c^d)
	add	tmp5=tmp5,tmp1		    // ROTATE(a,5)+(c&d)
	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30)
{ .mmi;	or	tmp0=tmp0,tmp4		    // F_40_59(b,c,d)=(b&c)|(b&d)|(c&d)
{ .mmi;	xor	$Xn=$Xn,tmp3
	mux2	tmp6=$a,0x44		};; // see b in next iteration
{ .mii;	add	$f=$f,tmp0		    // f+=F_40_59(b,c,d)
	shrp	$e=tmp2,tmp2,31;;	    // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
	add	$f=$f,tmp1		};; // f+=ROTATE(a,5)
{ .mii;	add	$e=$e,tmp5		    // e+=ROTATE(a,5)+(c&d)
	shrp	$Xn=$Xn,$Xn,31		    // ROTATE(x[0]^x[2]^x[8]^x[13],1)
	nop.i	0x0			};;

___
}
@@ -237,7 +235,7 @@ inp=r33; // in1
.align	32
sha1_block_data_order:
	.prologue
{ .mmi;	alloc	tmp1=ar.pfs,3,15,0,0
{ .mmi;	alloc	tmp1=ar.pfs,3,14,0,0
	$ADDP	tmp0=4,ctx
	.save	ar.lc,r3
	mov	r3=ar.lc		}
@@ -245,8 +243,8 @@ sha1_block_data_order:
	$ADDP	inp=0,inp
	mov	r2=pr			};;
tmp4=in2;
tmp5=loc13;
tmp6=loc14;
tmp5=loc12;
tmp6=loc13;
	.body
{ .mlx;	ld4	$h0=[ctx],8
	movl	$K_00_19=0x5a827999	}
@@ -273,7 +271,7 @@ tmp6=loc14;

___

{ my $i,@V=($A,$B,$C,$D,$E,$T);
{ my $i,@V=($A,$B,$C,$D,$E);

	for($i=0;$i<16;$i++)	{ &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); }
	for(;$i<20;$i++)	{ &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); }
@@ -281,12 +279,12 @@ ___
	for(;$i<60;$i++)	{ &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); }
	for(;$i<80;$i++)	{ &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); }

	(($V[5] eq $D) and ($V[0] eq $E)) or die;	# double-check
	(($V[0] eq $A) and ($V[4] eq $E)) or die;	# double-check
}

$code.=<<___;
{ .mmb;	add	$h0=$h0,$E
	nop.m	0
{ .mmb;	add	$h0=$h0,$A
	add	$h2=$h2,$C
	br.ctop.dptk.many	.Ldtop	};;
.Ldend:
{ .mmi;	add	tmp0=4,ctx
+281 −0
Original line number Diff line number Diff line
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA1 block procedure for MIPS.

# Performance improvement is 30% on unaligned input. The "secret" is
# to use an lwl/lwr pair to load unaligned input. One could have
# vectorized Xupdate on MIPSIII/IV, but the goal was to write a
# MIPS32-compatible subroutine. There is room for minor optimization
# on little-endian platforms...
#
# The code is somewhat IRIX-centric, i.e. is likely to require minor
# adaptations for other OSes...

# Target byte order.  -DB_ENDIAN / -DL_ENDIAN on the command line select
# it explicitly (the last matching flag wins); without either flag the
# byte order of the machine running this script is used.
foreach my $flag (@ARGV) {
    $big_endian = 1 if ($flag =~ /\-DB_ENDIAN/);
    $big_endian = 0 if ($flag =~ /\-DL_ENDIAN/);
}
if (!defined($big_endian)) {
    # Pack 1 in network (big-endian) order and read it back natively.
    $big_endian = (unpack('L', pack('N', 1)) == 1);
}

# offsets of the Most and Least Significant Bytes
($MSB, $LSB) = $big_endian ? (0, 3) : (3, 0);

# Register allocation for the MIPS module.
# @X is the 16-entry message-schedule window, kept in $8..$23.
@X = map('$' . $_, 8 .. 23);

# Function arguments.
$ctx = '$4';	# a0
$inp = '$5';	# a1
$num = '$6';	# a2

# The five SHA-1 working variables.  @V is the list that the round
# loops rotate between rounds.
($A, $B, $C, $D, $E) = ('$1', '$2', '$3', '$7', '$24');
@V = ($A, $B, $C, $D, $E);

# Scratch registers and the round-constant register.
$t0 = '$25';	# jp,t9
$t1 = '$28';	# gp
$t2 = '$30';	# fp,s8
$K  = '$31';	# ra

# Stack frame size, counted in SZREG-sized slots.
$FRAMESIZE = 16;

# BODY_00_14 - append the MIPS code for one of SHA-1 rounds 0..14 to
# $code.
#
# Arguments: ($i,$a,$b,$c,$d,$e) - round index and the register names
# of the five working variables for this round.
#
# On little-endian targets the input word for this round is byte-swapped
# first.  The round itself is interleaved with an lwl/lwr pair that
# loads the input word for round $i+1.
sub BODY_00_14 {
    my ($i,$a,$b,$c,$d,$e) = @_;
    my $j  = $i + 1;
    my $Xi = $X[$i];		# word consumed now
    my $Xn = $X[$j];		# word being loaded for the next round

    if (!$big_endian) {
	$code .= <<___;
	srl	$t0,$Xi,24	# byte swap($i)
	srl	$t1,$Xi,8
	andi	$t2,$Xi,0xFF00
	sll	$Xi,$Xi,24
	andi	$t1,0xFF00
	sll	$t2,$t2,8
	or	$Xi,$t0
	or	$Xi,$t1
	or	$Xi,$t2
___
    }

    $code .= <<___;
	 lwl	$Xn,$j*4+$MSB($inp)
	sll	$t0,$a,5	# $i
	addu	$e,$K
	 lwr	$Xn,$j*4+$LSB($inp)
	srl	$t1,$a,27
	addu	$e,$t0
	xor	$t0,$c,$d
	addu	$e,$t1
	sll	$t2,$b,30
	and	$t0,$b
	srl	$b,$b,2
	xor	$t0,$d
	addu	$e,$Xi
	or	$b,$t2
	addu	$e,$t0
___
}

# BODY_15_19 - append the MIPS code for one of SHA-1 rounds 15..19 to
# $code.
#
# Arguments: ($i,$a,$b,$c,$d,$e) - round index and the register names
# of the five working variables for this round.
#
# Round 15 still consumes a freshly loaded input word, so on
# little-endian targets it is byte-swapped first.  Every round in this
# group also computes the schedule word for round $i+1 in place (the
# lines indented by an extra space in the emitted text).
sub BODY_15_19 {
    my ($i,$a,$b,$c,$d,$e) = @_;
    my $j = $i + 1;

    if (!$big_endian && $i == 15) {
	my $Xin = $X[$i];	# last word taken straight from the input
	$code .= <<___;
	srl	$t0,$Xin,24	# byte swap($i)
	srl	$t1,$Xin,8
	andi	$t2,$Xin,0xFF00
	sll	$Xin,$Xin,24
	andi	$t1,0xFF00
	sll	$t2,$t2,8
	or	$Xin,$t0
	or	$Xin,$t1
	or	$Xin,$t2
___
    }

    # Schedule registers touched by this round.
    my $Xi   = $X[$i % 16];		# word consumed now
    my $Xn   = $X[$j % 16];		# word produced for the next round
    my $Xn2  = $X[($j + 2) % 16];
    my $Xn8  = $X[($j + 8) % 16];
    my $Xn13 = $X[($j + 13) % 16];

    $code .= <<___;
	 xor	$Xn,$Xn2
	sll	$t0,$a,5	# $i
	addu	$e,$K
	srl	$t1,$a,27
	addu	$e,$t0
	 xor	$Xn,$Xn8
	xor	$t0,$c,$d
	addu	$e,$t1
	 xor	$Xn,$Xn13
	sll	$t2,$b,30
	and	$t0,$b
	 srl	$t1,$Xn,31
	 addu	$Xn,$Xn
	srl	$b,$b,2
	xor	$t0,$d
	 or	$Xn,$t1
	addu	$e,$Xi
	or	$b,$t2
	addu	$e,$t0
___
}

# BODY_20_39 - append the MIPS code for one SHA-1 round that uses the
# b^c^d function to $code.  The driver calls it for rounds 20..39 and
# again for rounds 60..79.
#
# Arguments: ($i,$a,$b,$c,$d,$e) - round index and the register names
# of the five working variables for this round.
#
# Rounds below 79 also compute the schedule word for round $i+1 in
# place.  Round 79 has no successor and instead loads the five context
# words into @X[0..4].
sub BODY_20_39 {
    my ($i,$a,$b,$c,$d,$e) = @_;
    my $j = $i + 1;

    # Schedule registers touched by this round.
    my $Xi   = $X[$i % 16];		# word consumed now
    my $Xn   = $X[$j % 16];		# word produced for the next round
    my $Xn2  = $X[($j + 2) % 16];
    my $Xn8  = $X[($j + 8) % 16];
    my $Xn13 = $X[($j + 13) % 16];

    if ($i < 79) {
	$code .= <<___;
	 xor	$Xn,$Xn2
	sll	$t0,$a,5	# $i
	addu	$e,$K
	srl	$t1,$a,27
	addu	$e,$t0
	 xor	$Xn,$Xn8
	xor	$t0,$c,$d
	addu	$e,$t1
	 xor	$Xn,$Xn13
	sll	$t2,$b,30
	xor	$t0,$b
	 srl	$t1,$Xn,31
	 addu	$Xn,$Xn
	srl	$b,$b,2
	addu	$e,$Xi
	 or	$Xn,$t1
	or	$b,$t2
	addu	$e,$t0
___
    }
    elsif ($i == 79) {
	$code .= <<___;
	 lw	$X[0],0($ctx)
	sll	$t0,$a,5	# $i
	addu	$e,$K
	 lw	$X[1],4($ctx)
	srl	$t1,$a,27
	addu	$e,$t0
	 lw	$X[2],8($ctx)
	xor	$t0,$c,$d
	addu	$e,$t1
	 lw	$X[3],12($ctx)
	sll	$t2,$b,30
	xor	$t0,$b
	 lw	$X[4],16($ctx)
	srl	$b,$b,2
	addu	$e,$Xi
	or	$b,$t2
	addu	$e,$t0
___
    }
}

# BODY_40_59 - append the MIPS code for one SHA-1 round of the 40..59
# group to $code.  The boolean function is added to $e in two parts:
# first c&d, then b&(c^d).
#
# Arguments: ($i,$a,$b,$c,$d,$e) - round index and the register names
# of the five working variables for this round.
#
# The round also computes the schedule word for round $i+1 in place.
# Nothing is emitted for $i >= 79.
sub BODY_40_59 {
    my ($i,$a,$b,$c,$d,$e) = @_;
    my $j = $i + 1;

    # Schedule registers touched by this round.
    my $Xi   = $X[$i % 16];		# word consumed now
    my $Xn   = $X[$j % 16];		# word produced for the next round
    my $Xn2  = $X[($j + 2) % 16];
    my $Xn8  = $X[($j + 8) % 16];
    my $Xn13 = $X[($j + 13) % 16];

    if ($i < 79) {
	$code .= <<___;
	 xor	$Xn,$Xn2
	sll	$t0,$a,5	# $i
	addu	$e,$K
	srl	$t1,$a,27
	addu	$e,$t0
	 xor	$Xn,$Xn8
	and	$t0,$c,$d
	addu	$e,$t1
	 xor	$Xn,$Xn13
	sll	$t2,$b,30
	addu	$e,$t0
	 srl	$t1,$Xn,31
	xor	$t0,$c,$d
	 addu	$Xn,$Xn
	and	$t0,$b
	srl	$b,$b,2
	 or	$Xn,$t1
	addu	$e,$Xi
	or	$b,$t2
	addu	$e,$t0
___
    }
}

# Function prologue.  The emitted code allocates a frame of $FRAMESIZE
# SZREG-sized slots, saves $31, $30, $28 and $16..$23 in it, and loads
# the five context words into $A..$E (the load of $E sits right after
# the branch to .Loop under .set noreorder).  The text after .Loop loads
# the first input word with lwl/lwr and builds the constant labelled
# K_00_19 in $K.
$code=<<___;
#include <asm.h>
#include <regdef.h>

.text

.set	noat
.set	noreorder
.align	5
.globl	sha1_block_data_order
.ent	sha1_block_data_order
sha1_block_data_order:
	.frame	sp,$FRAMESIZE*SZREG,zero
	.mask	0xd0ff0000,-$FRAMESIZE*SZREG
	.set	noreorder
	PTR_SUB	sp,$FRAMESIZE*SZREG
	REG_S	\$31,($FRAMESIZE-1)*SZREG(sp)
	REG_S	\$30,($FRAMESIZE-2)*SZREG(sp)
	REG_S	\$28,($FRAMESIZE-3)*SZREG(sp)
	REG_S	\$23,($FRAMESIZE-4)*SZREG(sp)
	REG_S	\$22,($FRAMESIZE-5)*SZREG(sp)
	REG_S	\$21,($FRAMESIZE-6)*SZREG(sp)
	REG_S	\$20,($FRAMESIZE-7)*SZREG(sp)
	REG_S	\$19,($FRAMESIZE-8)*SZREG(sp)
	REG_S	\$18,($FRAMESIZE-9)*SZREG(sp)
	REG_S	\$17,($FRAMESIZE-10)*SZREG(sp)
	REG_S	\$16,($FRAMESIZE-11)*SZREG(sp)

	lw	$A,0($ctx)
	lw	$B,4($ctx)
	lw	$C,8($ctx)
	lw	$D,12($ctx)
	b	.Loop
	lw	$E,16($ctx)
.align	4
.Loop:
	.set	reorder
	lwl	@X[0],$MSB($inp)
	lui	$K,0x5a82
	lwr	@X[0],$LSB($inp)
	ori	$K,0x7999	# K_00_19
___
# Unroll rounds 0..19.  After each round @V is rotated right by one so
# that the next round sees the working variables under their new roles.
# The global $i deliberately carries over into the following loops.
for ($i=0;$i<15;$i++)	{ &BODY_00_14($i,@V); unshift(@V,pop(@V)); }
for (;$i<20;$i++)	{ &BODY_15_19($i,@V); unshift(@V,pop(@V)); }
# Switch $K to the constant for rounds 20..39.
$code.=<<___;
	lui	$K,0x6ed9
	ori	$K,0xeba1	# K_20_39
___
for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Switch $K to the constant for rounds 40..59.
$code.=<<___;
	lui	$K,0x8f1b
	ori	$K,0xbcdc	# K_40_59
___
for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
# Switch $K to the constant for rounds 60..79, which reuse BODY_20_39.
$code.=<<___;
	lui	$K,0xca62
	ori	$K,0xc1d6	# K_60_79
___
for (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Block epilogue.  @X[0..4] were loaded with the previous context words
# by round 79; they are added to $A..$E and the sums stored back.  $num
# is decremented and the loop repeats while it is non-zero, with the
# 64-byte advance of $inp placed right after the branch.  Afterwards the
# saved registers are restored and the frame is released.
$code.=<<___;
	addu	$A,$X[0]
	addu	$B,$X[1]
	sw	$A,0($ctx)
	addu	$C,$X[2]
	addu	$D,$X[3]
	sw	$B,4($ctx)
	addu	$E,$X[4]
	PTR_SUB	$num,1
	sw	$C,8($ctx)
	sw	$D,12($ctx)
	sw	$E,16($ctx)
	.set	noreorder
	bnez	$num,.Loop
	PTR_ADD	$inp,64

	.set	noreorder
	REG_L	\$31,($FRAMESIZE-1)*SZREG(sp)
	REG_L	\$30,($FRAMESIZE-2)*SZREG(sp)
	REG_L	\$28,($FRAMESIZE-3)*SZREG(sp)
	REG_L	\$23,($FRAMESIZE-4)*SZREG(sp)
	REG_L	\$22,($FRAMESIZE-5)*SZREG(sp)
	REG_L	\$21,($FRAMESIZE-6)*SZREG(sp)
	REG_L	\$20,($FRAMESIZE-7)*SZREG(sp)
	REG_L	\$19,($FRAMESIZE-8)*SZREG(sp)
	REG_L	\$18,($FRAMESIZE-9)*SZREG(sp)
	REG_L	\$17,($FRAMESIZE-10)*SZREG(sp)
	REG_L	\$16,($FRAMESIZE-11)*SZREG(sp)
	jr	ra
	PTR_ADD	sp,$FRAMESIZE*SZREG
.end	sha1_block_data_order
___
# Write the generated assembly to STDOUT.  Output is buffered, so a
# write failure (full disk, closed pipe) may only be reported when the
# handle is closed; abort in that case so that a build does not carry on
# with a truncated assembly file.
print $code;
close STDOUT or die "error closing STDOUT: $!";
+259 −0

File added.

Preview size limit exceeded, changes collapsed.