Commit 69f45c52 authored by Andy Polyakov's avatar Andy Polyakov
Browse files

sha1-[586|x86_64].pl: shave off one instruction from body_40_59

That is 2% fewer instructions in the SIMD code paths, hence a 2% improvement on average :-)
parent 7bb98eee
Loading
Loading
Loading
Loading
+15 −14
Original line number Diff line number Diff line
@@ -89,12 +89,12 @@
# P4		10.6		-
# AMD K8	7.1		-
# Core2		7.3		6.1/+20%	-
# Atom		12.5		9.5(*)/+32%	-
# Westmere	7.3		5.6/+30%	-
# Sandy Bridge	8.8		6.2/+40%	5.1(**)/+70%
# Ivy Bridge	7.2		4.9/+47%	4.8(**)/+50%
# Bulldozer	11.6		6.2/+88%
# VIA Nano	10.6		7.5/+41%
# Atom		12.5		9.3(*)/+35%	-
# Westmere	7.3		5.5/+33%	-
# Sandy Bridge	8.8		6.2/+40%	5.2(**)/+70%
# Ivy Bridge	7.2		4.8/+51%	4.7(**)/+53%
# Bulldozer	11.6		6.0/+92%
# VIA Nano	10.6		7.6/+40%
#
# (*)	Loop is 1056 instructions long and expected result is ~8.25.
#	It remains a mystery [to me] why ILP is limited to 1.7.
@@ -616,7 +616,7 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	&movdqa	(@X[2],@X[-1&7])	if ($Xi==8);
@@ -783,17 +783,16 @@ sub body_20_39 () {
# body_40_59 takes no arguments and returns a list of strings.  Each string
# holds the Perl source of one instruction-emitting call (&mov, &xor, &and,
# &add, &$_ror, &$_rol).  The first string additionally loads
# ($a,$b,$c,$d,$e) from @V, and the last one moves the final element of @V
# and of @T to the front, so the next invocation sees the registers rotated
# by one position.
# NOTE(review): presumably the caller evals these strings one at a time so
# they can be interleaved with SIMD instructions -- confirm against the
# Xupdate_*_32_79 subs, whose bodies are not visible here.
# NOTE(review): this listing comes from a diff rendered without +/- markers
# (the hunk header says 17 old / 16 new lines, 22 are shown), so removed and
# added lines appear side by side.  Do not read it as one straight
# instruction sequence.
sub body_40_59 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&mov	(@T[1],$c);',
	'&xor	($c,$d);',
	'&xor	(@T[0],$c);',
	'&xor	(@T[1],$d);',
	'&add	($e,&DWP(4*($j++&15),"esp"));',	# X[]+K xfer
	'&and	(@T[1],$d);',
	'&and	(@T[0],$c);',	# ($b&($c^$d))
	'&and	(@T[0],@T[1]);',
	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,@T[1]);',
	'&xor	(@T[0],$c);',
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&xor	($c,$d);',	# restore $c
	'&mov	(@T[0],$b);',	# copy of $c in next round
	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
@@ -809,6 +808,7 @@ sub body_40_59 () {
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&mov	(@T[1],@V[2]);	# copy of $c in next round
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
@@ -1032,7 +1032,7 @@ sub Xupdate_avx_16_31() # recall that $Xi starts with 4
sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@X[2],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
@@ -1173,6 +1173,7 @@ sub Xtail_avx()
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&mov	(@T[1],@V[2]);	# copy of $c in next round
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
+15 −14
Original line number Diff line number Diff line
@@ -56,12 +56,12 @@
#		x86_64		SSSE3		AVX
# P4		9.8		-
# Opteron	6.6		-
# Core2		6.7		6.1/+10%	-
# Atom		11.0		9.7/+13%	-
# Westmere	7.1		5.6/+27%	-
# Sandy Bridge	7.9		6.3/+25%	5.2/+51%
# Ivy Bridge	6.4		4.8/+33%	4.7/+36%
# Bulldozer	10.9		6.1/+79%
# Core2		6.7		6.2/+8%		-
# Atom		11.0		9.5/+15%	-
# Westmere	7.1		5.5/+29%	-
# Sandy Bridge	7.9		6.2/+28%	5.1/+54%
# Ivy Bridge	6.4		4.7/+35%	4.6/+37%
# Bulldozer	10.9		6.0/+82%
# VIA Nano	10.2		7.4/+38%

$flavour = shift;
@@ -453,7 +453,7 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	&movdqa	(@Tx[0],@X[-1&7])	if ($Xi==8);
@@ -618,17 +618,16 @@ sub body_20_39 () {
# body_40_59 (x86_64 flavour) takes no arguments and returns a list of
# strings.  Each string holds the Perl source of one instruction-emitting
# call (&mov, &xor, &and, &add, &$_ror, &$_rol).  The first string
# additionally loads ($a,$b,$c,$d,$e) from @V, and the last one moves the
# final element of @V and of @T to the front, so the next invocation sees
# the registers rotated by one position.
# NOTE(review): presumably the caller evals these strings one at a time so
# they can be interleaved with SIMD instructions -- confirm against the
# Xupdate_*_32_79 subs, whose bodies are not visible here.
# NOTE(review): this listing comes from a diff rendered without +/- markers
# (the hunk header says 17 old / 16 new lines, 22 are shown), so removed and
# added lines appear side by side.  Do not read it as one straight
# instruction sequence.
sub body_40_59 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&mov	(@T[1],$c);',
	'&xor	($c,$d);',
	'&xor	(@T[0],$c);',
	'&xor	(@T[1],$d);',
	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
	'&and	(@T[1],$d);',
	'&and	(@T[0],$c);',	# ($b&($c^$d))
	# NOTE(review): the next string spells the operand $T[1] while every
	# other line uses the one-element slice @T[1].  Both yield the same
	# element in an argument list, so this looks like a spelling
	# inconsistency rather than a bug -- confirm.
	'&and	(@T[0],$T[1]);',
	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,@T[1]);',
	'&xor	(@T[0],$c);',
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&xor	($c,$d);',	# restore $c
	'&mov	(@T[0],$b);',	# copy of $c in next round
	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
@@ -646,6 +645,7 @@ ___
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&mov	(@T[1],@V[2]);	# copy of $c in next round
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
@@ -859,7 +859,7 @@ sub Xupdate_avx_16_31() # recall that $Xi starts with 4
sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
@@ -1002,6 +1002,7 @@ ___
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&mov	(@T[1],@V[2]);	# copy of $c in next round
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);