Commit b217ca63 authored by Andy Polyakov's avatar Andy Polyakov
Browse files

crypto/sha/asm/sha1-x86_64.pl update:

+5% on Atom Silvermont, up to +8% improvement of legacy code.
Harmonize sha1-586.pl and aesni-sha1-x86_64.pl with sha1-x86_64.pl.
parent 30ea570f
Loading
Loading
Loading
Loading
+50 −38
Original line number Original line Diff line number Diff line
@@ -21,24 +21,24 @@
# subroutine:
# subroutine:
#
#
#		AES-128-CBC	+SHA1		stitch      gain
#		AES-128-CBC	+SHA1		stitch      gain
# Westmere	3.77[+5.5]	9.26		6.66	    +39%
# Westmere	3.77[+5.3]	9.07		6.55	    +38%
# Sandy Bridge	5.05[+5.0(6.2)]	10.06(11.21)	5.98(7.01)  +68%(+60%)
# Sandy Bridge	5.05[+5.0(6.1)]	10.06(11.15)	5.98(7.05)  +68%(+58%)
# Ivy Bridge	5.05[+4.6]	9.65		5.54        +74%
# Ivy Bridge	5.05[+4.6]	9.65		5.54        +74%
# Haswell	4.43[+3.6(4.1)]	8.00(8.55)	4.55(5.21)  +75%(+64%)
# Haswell	4.43[+3.6(4.2)]	8.00(8.58)	4.55(5.21)  +75%(+65%)
# Bulldozer	5.77[+6.0]	11.72		6.37        +84%
# Bulldozer	5.77[+6.0]	11.72		6.37        +84%
#
#
#		AES-192-CBC
#		AES-192-CBC
# Westmere	4.51		10.00		6.91	    +45%
# Westmere	4.51		9.81		6.80	    +44%
# Sandy Bridge	6.05		11.06(12.21)	6.11(7.18)  +81%(+70%)
# Sandy Bridge	6.05		11.06(12.15)	6.11(7.19)  +81%(+69%)
# Ivy Bridge	6.05		10.65		6.07        +75%
# Ivy Bridge	6.05		10.65		6.07        +75%
# Haswell	5.29		8.86(9.42)	5.32(5.32)  +67%(+77%)
# Haswell	5.29		8.86(9.44)	5.32(5.32)  +67%(+77%)
# Bulldozer	6.89		12.84		6.96        +84%
# Bulldozer	6.89		12.84		6.96        +84%
#
#
#		AES-256-CBC
#		AES-256-CBC
# Westmere	5.25		10.74		7.24	    +48%
# Westmere	5.25		10.55		7.21	    +46%
# Sandy Bridge	7.05		12.06(13.21)	7.12(7.63)  +69%(+73%)
# Sandy Bridge	7.05		12.06(13.15)	7.12(7.72)  +69%(+70%)
# Ivy Bridge	7.05		11.65		7.12        +64%
# Ivy Bridge	7.05		11.65		7.12        +64%
# Haswell	6.19		9.76(10.3)	6.21(6.25)  +57%(+65%)
# Haswell	6.19		9.76(10.34)	6.21(6.25)  +57%(+65%)
# Bulldozer	8.00		13.95		8.25        +69%
# Bulldozer	8.00		13.95		8.25        +69%
#
#
# (*)	There are two code paths: SSSE3 and AVX. See sha1-586.pl for
# (*)	There are two code paths: SSSE3 and AVX. See sha1-586.pl for
@@ -230,11 +230,11 @@ $code.=<<___;
	movdqu	32($inp),@X[-2&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@Tx[2],@X[-4&7]		# byte swap
	pshufb	@Tx[2],@X[-4&7]		# byte swap
	add	\$64,$inp
	pshufb	@Tx[2],@X[-3&7]
	pshufb	@Tx[2],@X[-3&7]
	pshufb	@Tx[2],@X[-2&7]
	pshufb	@Tx[2],@X[-2&7]
	pshufb	@Tx[2],@X[-1&7]
	add	\$64,$inp
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	pshufb	@Tx[2],@X[-1&7]
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
@@ -297,74 +297,75 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);
  my ($a,$b,$c,$d,$e);


	 eval(shift(@insns));		# ror
	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	&movdqa	(@Tx[0],@X[-1&7]);
	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));


	  &paddd	(@Tx[1],@X[-1&7]);
	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror

	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));


	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));


	&movdqa	(@Tx[2],@X[0]);
	&movdqa	(@Tx[2],@X[0]);
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));


	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));


	&psrld	(@Tx[0],31);
	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));


	&psrld	(@Tx[2],30);
	&psrld	(@Tx[2],30);
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));


	&pslld	(@Tx[1],2);
	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));


	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79


	 foreach (@insns) { eval; }	# remaining instructions [if any]
	 foreach (@insns) { eval; }	# remaining instructions [if any]


@@ -375,27 +376,30 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
sub Xupdate_ssse3_32_79()
sub Xupdate_ssse3_32_79()
{ use integer;
{ use integer;
  my $body = shift;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);
  my ($a,$b,$c,$d,$e);


	&pshufd	(@Tx[0],@X[-2&7],0xee)	if ($Xi==8);	# was &movdqa	(@Tx[0],@X[-1&7])
	 eval(shift(@insns))		if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
	 eval(shift(@insns))		if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));		# rol


	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	 eval(shift(@insns));
	if ($Xi%5) {
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	}
	}
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));		# ror
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));


	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
@@ -403,28 +407,30 @@ sub Xupdate_ssse3_32_79()
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));		# rol
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);


	&movdqa	(@Tx[0],@X[0]);
	&movdqa	(@Tx[0],@X[0]);
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39


	&pslld	(@X[0],2);
	&pslld	(@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	&psrld	(@Tx[0],30);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));


	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));		# rol
@@ -446,9 +452,10 @@ sub Xuplast_ssse3_80()
  my ($a,$b,$c,$d,$e);
  my ($a,$b,$c,$d,$e);


	 eval(shift(@insns));
	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));


@@ -481,9 +488,12 @@ sub Xloop_ssse3()


	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@Tx[2]);
	&pshufb	(@X[($Xi-3)&7],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
@@ -492,6 +502,8 @@ sub Xloop_ssse3()
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);
	&psubd	(@X[($Xi-4)&7],@Tx[1]);


	foreach (@insns) { eval; }
	foreach (@insns) { eval; }
+105 −25
Original line number Original line Diff line number Diff line
@@ -93,8 +93,9 @@
# Westmere	7.3		5.5/+33%	-
# Westmere	7.3		5.5/+33%	-
# Sandy Bridge	8.8		6.2/+40%	5.1(**)/+73%
# Sandy Bridge	8.8		6.2/+40%	5.1(**)/+73%
# Ivy Bridge	7.2		4.8/+51%	4.7(**)/+53%
# Ivy Bridge	7.2		4.8/+51%	4.7(**)/+53%
# Haswell	6.5		4.3/+51%	4.1(**)/+58%
# Bulldozer	11.6		6.0/+92%
# Bulldozer	11.6		6.0/+92%
# VIA Nano	10.6		7.4/+43%
# VIA Nano	10.6		7.5/+41%
#
#
# (*)	Loop is 1056 instructions long and expected result is ~8.25.
# (*)	Loop is 1056 instructions long and expected result is ~8.25.
#	It remains a mystery [to me] why ILP is limited to 1.7.
#	It remains a mystery [to me] why ILP is limited to 1.7.
@@ -512,7 +513,7 @@ my $_ror=sub { &ror(@_) };
	&mov	(@T[1],$C);
	&mov	(@T[1],$C);
	&psubd	(@X[-2&7],@X[3]);
	&psubd	(@X[-2&7],@X[3]);
	&xor	(@T[1],$D);
	&xor	(@T[1],$D);
	&movdqa	(@X[0],@X[-3&7]);
	&pshufd	(@X[0],@X[-4&7],0xee);		# was &movdqa	(@X[0],@X[-3&7]);
	&and	(@T[0],@T[1]);
	&and	(@T[0],@T[1]);
	&jmp	(&label("loop"));
	&jmp	(&label("loop"));


@@ -539,76 +540,77 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);
  my ($a,$b,$c,$d,$e);


	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	&movdqa	(@X[2],@X[-1&7]);
	&movdqa	(@X[2],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));


	  &paddd	(@X[3],@X[-1&7]);
	  &paddd	(@X[3],@X[-1&7]);
	  &movdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
	  &movdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrldq	(@X[2],4);		# "X[-3]", 3 dwords
	&psrldq	(@X[2],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror


	&pxor	(@X[2],@X[-2&7]);	# "X[-3]"^"X[-8]"
	&pxor	(@X[2],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));


	&pxor	(@X[0],@X[2]);		# "X[0]"^="X[-3]"^"X[-8]"
	&pxor	(@X[0],@X[2]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	  &movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
	  &movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));


	&movdqa	(@X[4],@X[0]);
	&movdqa	(@X[4],@X[0]);
	&movdqa	(@X[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa (@X[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));


	&pslldq	(@X[4],12);		# "X[0]"<<96, extract one dword
	&pslldq	(@X[4],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));


	&psrld	(@X[2],31);
	&psrld	(@X[2],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	&movdqa	(@X[3],@X[4]);
	&movdqa	(@X[3],@X[4]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));


	&psrld	(@X[4],30);
	&psrld	(@X[4],30);
	&por	(@X[0],@X[2]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@X[2]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5);	# restore X[] from backtrace buffer
	  &movdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5);	# restore X[] from backtrace buffer
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));


	&pslld	(@X[3],2);
	&pslld	(@X[3],2);
	&pxor	(@X[0],@X[4]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	&pxor   (@X[0],@X[4]);
	  &movdqa	(@X[4],&QWP(112-16+16*(($Xi)/5),"esp"));	# K_XX_XX
	  &movdqa	(@X[4],&QWP(112-16+16*(($Xi)/5),"esp"));	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));


	&pxor	(@X[0],@X[3]);		# "X[0]"^=("X[0]"<<96)<<<2
	&pxor	(@X[0],@X[3]);		# "X[0]"^=("X[0]"<<96)<<<2
	  &movdqa	(@X[1],@X[-2&7])	if ($Xi<7);
	  &pshufd	(@X[1],@X[-3&7],0xee)	if ($Xi<7);	# was &movdqa	(@X[1],@X[-2&7])
	  &pshufd	(@X[3],@X[-1&7],0xee)	if ($Xi==7);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));


@@ -623,10 +625,9 @@ sub Xupdate_ssse3_32_79()
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);
  my ($a,$b,$c,$d,$e);


	&movdqa	(@X[2],@X[-1&7])	if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));		# body_20_39
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	&palignr(@X[2],@X[-2&7],8);	# compose "X[-6]"
	&punpcklqdq(@X[2],@X[-1&7]);	# compose "X[-6]", was &palignr(@X[2],@X[-2&7],8)
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));		# rol
@@ -635,13 +636,14 @@ sub Xupdate_ssse3_32_79()
	  &movdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);	# save X[] to backtrace buffer
	  &movdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);	# save X[] to backtrace buffer
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	 if ($Xi%5) {
	 if ($Xi%5) {
	  &movdqa	(@X[4],@X[3]);	# "perpetuate" K_XX_XX...
	  &movdqa	(@X[4],@X[3]);	# "perpetuate" K_XX_XX...
	 } else {			# ... or load next one
	 } else {			# ... or load next one
	  &movdqa	(@X[4],&QWP(112-16+16*($Xi/5),"esp"));
	  &movdqa	(@X[4],&QWP(112-16+16*($Xi/5),"esp"));
	 }
	 }
	  &paddd	(@X[3],@X[-1&7]);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));		# ror
	  &paddd	(@X[3],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));


	&pxor	(@X[0],@X[2]);		# "X[0]"^="X[-6]"
	&pxor	(@X[0],@X[2]);		# "X[0]"^="X[-6]"
@@ -656,6 +658,7 @@ sub Xupdate_ssse3_32_79()
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);


	&pslld	(@X[0],2);
	&pslld	(@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));		# body_20_39
@@ -667,6 +670,8 @@ sub Xupdate_ssse3_32_79()
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);


	&por	(@X[0],@X[2]);		# "X[0]"<<<=2
	&por	(@X[0],@X[2]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));		# body_20_39
@@ -677,7 +682,7 @@ sub Xupdate_ssse3_32_79()
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));		# ror
	  &movdqa	(@X[3],@X[0])	if ($Xi<19);
	  &pshufd	(@X[3],@X[-1],0xee)	if ($Xi<19);	# was &movdqa	(@X[3],@X[0])
	 eval(shift(@insns));
	 eval(shift(@insns));


	 foreach (@insns) { eval; }	# remaining instructions
	 foreach (@insns) { eval; }	# remaining instructions
@@ -691,6 +696,12 @@ sub Xuplast_ssse3_80()
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);
  my ($a,$b,$c,$d,$e);


	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &paddd	(@X[3],@X[-1&7]);
	  &paddd	(@X[3],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
@@ -728,9 +739,16 @@ sub Xloop_ssse3()


	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@X[3]);
	&paddd	(@X[($Xi-4)&7],@X[3]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
@@ -739,6 +757,8 @@ sub Xloop_ssse3()
	&movdqa	(&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	&movdqa	(&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@X[3]);
	&psubd	(@X[($Xi-4)&7],@X[3]);


	foreach (@insns) { eval; }
	foreach (@insns) { eval; }
@@ -816,6 +836,64 @@ sub body_40_59 () { # ((b^c)&(c^d))^c
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
	);
}
}
######
sub bodyx_00_19 () {	# ((c^d)&b)^d
	# on start @T[0]=(b&c)^(~b&d), $e+=X[]+K
	return &bodyx_20_39()	if ($rx==19);	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.

	'&rorx	($b,$b,2)			if ($j==0);'.	# $b>>>2
	'&rorx	($b,@T[1],7)			if ($j!=0);',	# $b>>>2
	'&lea	($e,&DWP(0,$e,@T[0]));',
	'&rorx	(@T[0],$a,5);',

	'&andn	(@T[1],$a,$c);',
	'&and	($a,$b)',
	'&add	($d,&DWP(4*(($j+1)&15),"esp"));',	# X[]+K xfer

	'&xor	(@T[1],$a)',
	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub bodyx_20_39 () {	# b^d^c
	# on start $b=b^c^d
	return &bodyx_40_59()	if ($rx==39);	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.

	'&add	($e,($j==19?@T[0]:$b))',
	'&rorx	($b,@T[1],7);',	# $b>>>2
	'&rorx	(@T[0],$a,5);',

	'&xor	($a,$b)				if ($j<79);',
	'&add	($d,&DWP(4*(($j+1)&15),"esp"))	if ($j<79);',	# X[]+K xfer
	'&xor	($a,$c)				if ($j<79);',
	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub bodyx_40_59 () {	# ((b^c)&(c^d))^c
	# on start $b=((b^c)&(c^d))^c
	return &bodyx_20_39()	if ($rx==59);	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.

	'&rorx	(@T[0],$a,5)',
	'&lea	($e,&DWP(0,$e,$b))',
	'&rorx	($b,@T[1],7)',	# $b>>>2
	'&add	($d,&DWP(4*(($j+1)&15),"esp"))',	# X[]+K xfer

	'&mov	(@T[1],$c)',
	'&xor	($a,$b)',	# b^c for next round
	'&xor	(@T[1],$b)',	# c^d for next round

	'&and	($a,@T[1])',
	'&add	($e,@T[0])',
	'&xor	($a,$b)'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}


&set_label("loop",16);
&set_label("loop",16);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
@@ -855,9 +933,10 @@ sub body_40_59 () { # ((b^c)&(c^d))^c
	&mov	(&DWP(12,@T[1]),$D);
	&mov	(&DWP(12,@T[1]),$D);
	&xor	($B,$D);
	&xor	($B,$D);
	&mov	(&DWP(16,@T[1]),$E);
	&mov	(&DWP(16,@T[1]),$E);
	&and	($B,@T[0]);
	&mov	(@T[1],@T[0]);
	&movdqa	(@X[0],@X[-3&7]);
	&pshufd	(@X[0],@X[-4&7],0xee);		# was &movdqa	(@X[0],@X[-3&7]);
	&xchg	($B,@T[0]);
	&and	(@T[0],$B);
	&mov	($B,$T[1]);


	&jmp	(&label("loop"));
	&jmp	(&label("loop"));


@@ -1226,9 +1305,10 @@ sub Xtail_avx()
	&mov	(&DWP(8,@T[1]),$C);
	&mov	(&DWP(8,@T[1]),$C);
	&xor	($B,$D);
	&xor	($B,$D);
	&mov	(&DWP(12,@T[1]),$D);
	&mov	(&DWP(12,@T[1]),$D);
	&and	($B,@T[0]);
	&mov	(&DWP(16,@T[1]),$E);
	&mov	(&DWP(16,@T[1]),$E);
	&xchg	($B,@T[0]);
	&mov	(@T[1],@T[0]);
	&and	(@T[0],$B);
	&mov	($B,@T[1]);


	&jmp	(&label("loop"));
	&jmp	(&label("loop"));


+97 −85

File changed.

Preview size limit exceeded, changes collapsed.