Commit d2fd65f6 authored by Andy Polyakov
Browse files

sha512-x86_64.pl: +15% better performance on Westmere and incidentally Atom.

Other Intel processors +5%, Opteron -2%.
parent 819cf4b8
Loading
Loading
Loading
Loading
+40 −46
Original line number Original line Diff line number Diff line
@@ -95,50 +95,44 @@ sub ROUND_00_15()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;


$code.=<<___;
$code.=<<___;
	mov	$e,$a0
	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
	mov	$e,$a1
	mov	$f,$a2
	mov	$f,$a2
	mov	$T1,`$SZ*($i&0xf)`(%rsp)


	ror	\$$Sigma1[0],$a0
	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
	ror	\$$Sigma1[1],$a1
	xor	$e,$a0
	xor	$g,$a2			# f^g
	xor	$g,$a2			# f^g


	xor	$a1,$a0
	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a1
	add	$h,$T1			# T1+=h
	xor	$a,$a1

	add	($Tbl,$round,$SZ),$T1	# T1+=K[round]
	and	$e,$a2			# (f^g)&e
	and	$e,$a2			# (f^g)&e
	mov	$T1,`$SZ*($i&0xf)`(%rsp)
	mov	$b,$h


	xor	$a1,$a0			# Sigma1(e)
	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
	xor	$e,$a0
	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
	add	$h,$T1			# T1+=h

	mov	$a,$h
	add	$a0,$T1			# T1+=Sigma1(e)


	xor	$c,$h			# b^c
	xor	$a,$a1
	add	$a2,$T1			# T1+=Ch(e,f,g)
	add	$a2,$T1			# T1+=Ch(e,f,g)
	mov	$a,$a0
	mov	$b,$a2
	mov	$a,$a1


	ror	\$$Sigma0[0],$h
	ror	\$$Sigma1[0],$a0	# Sigma1(e)
	ror	\$$Sigma0[1],$a0
	and	$a,$h			# h=(b^c)&a
	mov	$a,$a2
	and	$c,$a2			# b&c
	add	($Tbl,$round,$SZ),$T1	# T1+=K[round]


	xor	$a0,$h
	ror	\$$Sigma0[0],$a1	# Sigma0(a)
	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a0
	add	$a0,$T1			# T1+=Sigma1(e)
	or	$c,$a1			# a|c
	add	$a2,$h			# h+=b&c (completes +=Maj(a,b,c)


	xor	$a0,$h			# h=Sigma0(a)
	and	$c,$a2			# a&c
	add	$T1,$d			# d+=T1
	add	$T1,$d			# d+=T1

	and	$b,$a1			# (a|c)&b
	add	$T1,$h			# h+=T1
	add	$T1,$h			# h+=T1

	or	$a2,$a1			# Maj(a,b,c)=((a|c)&b)|(a&c)
	lea	1($round),$round	# round++
	lea	1($round),$round	# round++
	add	$a1,$h			# h+=Sigma0(a)


	add	$a1,$h			# h+=Maj(a,b,c)
___
___
}
}


@@ -147,32 +141,30 @@ sub ROUND_16_XX()


$code.=<<___;
$code.=<<___;
	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
	mov	`$SZ*(($i+14)&0xf)`(%rsp),$T1
	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a1

	mov	$a0,$T1
	mov	$a0,$a2
	mov	$a1,$a2


	ror	\$`$sigma0[1]-$sigma0[0]`,$T1
	xor	$a0,$T1
	shr	\$$sigma0[2],$a0
	shr	\$$sigma0[2],$a0
	ror	\$$sigma0[0],$a2

	xor	$a2,$a0
	ror	\$`$sigma0[1]-$sigma0[0]`,$a2


	xor	$a2,$a0			# sigma0(X[(i+1)&0xf])
	ror	\$$sigma0[0],$T1
	mov	$T1,$a1
	xor	$T1,$a0			# sigma0(X[(i+1)&0xf])
	mov	`$SZ*(($i+9)&0xf)`(%rsp),$T1


	shr	\$$sigma1[2],$T1
	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
	ror	\$$sigma1[0],$a1
	xor	$a1,$a2

	shr	\$$sigma1[2],$a1
	xor	$a1,$T1
	ror	\$`$sigma1[1]-$sigma1[0]`,$a1

	xor	$a1,$T1			# sigma1(X[(i+14)&0xf])


	ror	\$$sigma1[0],$a2
	add	$a0,$T1
	add	$a0,$T1

	xor	$a2,$a1			# sigma1(X[(i+14)&0xf])
	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1


	add	`$SZ*($i&0xf)`(%rsp),$T1
	add	`$SZ*($i&0xf)`(%rsp),$T1
	mov	$e,$a0
	add	$a1,$T1
	mov	$a,$a1
___
___
	&ROUND_00_15(@_);
	&ROUND_00_15(@_);
}
}
@@ -219,6 +211,8 @@ $func:
___
___
	for($i=0;$i<16;$i++) {
	for($i=0;$i<16;$i++) {
		$code.="	mov	$SZ*$i($inp),$T1\n";
		$code.="	mov	$SZ*$i($inp),$T1\n";
		$code.="	mov	@ROT[4],$a0\n";
		$code.="	mov	@ROT[0],$a1\n";
		$code.="	bswap	$T1\n";
		$code.="	bswap	$T1\n";
		&ROUND_00_15($i,@ROT);
		&ROUND_00_15($i,@ROT);
		unshift(@ROT,pop(@ROT));
		unshift(@ROT,pop(@ROT));