Commit 729d3341 authored by Andy Polyakov's avatar Andy Polyakov
Browse files

crypto/sha/asm/sha1-x86_64.pl: jumbo update from master.

parent cacdfcb2
Loading
Loading
Loading
Loading
+606 −82

File changed.

Preview size limit exceeded, changes collapsed.

+105 −25
Original line number Diff line number Diff line
@@ -93,8 +93,9 @@
# Westmere	7.3		5.5/+33%	-
# Sandy Bridge	8.8		6.2/+40%	5.1(**)/+73%
# Ivy Bridge	7.2		4.8/+51%	4.7(**)/+53%
# Haswell	6.5		4.3/+51%	4.1(**)/+58%
# Bulldozer	11.6		6.0/+92%
# VIA Nano	10.6		7.4/+43%
# VIA Nano	10.6		7.5/+41%
#
# (*)	Loop is 1056 instructions long and expected result is ~8.25.
#	It remains a mystery [to me] why ILP is limited to 1.7.
@@ -512,7 +513,7 @@ my $_ror=sub { &ror(@_) };
	&mov	(@T[1],$C);
	&psubd	(@X[-2&7],@X[3]);
	&xor	(@T[1],$D);
	&movdqa	(@X[0],@X[-3&7]);
	&pshufd	(@X[0],@X[-4&7],0xee);		# was &movdqa	(@X[0],@X[-3&7]);
	&and	(@T[0],@T[1]);
	&jmp	(&label("loop"));

@@ -539,76 +540,77 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));
	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	&movdqa	(@X[2],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &paddd	(@X[3],@X[-1&7]);
	  &movdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&psrldq	(@X[2],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror

	&pxor	(@X[2],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[2]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	  &movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@X[4],@X[0]);
	&movdqa	(@X[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa (@X[2],@X[0]);
	 eval(shift(@insns));

	&pslldq	(@X[4],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@X[2],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	&movdqa	(@X[3],@X[4]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@X[4],30);
	&por	(@X[0],@X[2]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@X[2]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	  &movdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5);	# restore X[] from backtrace buffer
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@X[3],2);
	&pxor	(@X[0],@X[4]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	&pxor   (@X[0],@X[4]);
	  &movdqa	(@X[4],&QWP(112-16+16*(($Xi)/5),"esp"));	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[3]);		# "X[0]"^=("X[0]"<<96)<<<2
	  &movdqa	(@X[1],@X[-2&7])	if ($Xi<7);
	  &pshufd	(@X[1],@X[-3&7],0xee)	if ($Xi<7);	# was &movdqa	(@X[1],@X[-2&7])
	  &pshufd	(@X[3],@X[-1&7],0xee)	if ($Xi==7);
	 eval(shift(@insns));
	 eval(shift(@insns));

@@ -623,10 +625,9 @@ sub Xupdate_ssse3_32_79()
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	&movdqa	(@X[2],@X[-1&7])	if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	&palignr(@X[2],@X[-2&7],8);	# compose "X[-6]"
	&punpcklqdq(@X[2],@X[-1&7]);	# compose "X[-6]", was &palignr(@X[2],@X[-2&7],8)
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
@@ -635,13 +636,14 @@ sub Xupdate_ssse3_32_79()
	  &movdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);	# save X[] to backtrace buffer
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	 if ($Xi%5) {
	  &movdqa	(@X[4],@X[3]);	# "perpetuate" K_XX_XX...
	 } else {			# ... or load next one
	  &movdqa	(@X[4],&QWP(112-16+16*($Xi/5),"esp"));
	 }
	  &paddd	(@X[3],@X[-1&7]);
	 eval(shift(@insns));		# ror
	  &paddd	(@X[3],@X[-1&7]);
	 eval(shift(@insns));

	&pxor	(@X[0],@X[2]);		# "X[0]"^="X[-6]"
@@ -656,6 +658,7 @@ sub Xupdate_ssse3_32_79()
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);

	&pslld	(@X[0],2);
	 eval(shift(@insns));		# body_20_39
@@ -667,6 +670,8 @@ sub Xupdate_ssse3_32_79()
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);

	&por	(@X[0],@X[2]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
@@ -677,7 +682,7 @@ sub Xupdate_ssse3_32_79()
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	  &movdqa	(@X[3],@X[0])	if ($Xi<19);
	  &pshufd	(@X[3],@X[-1],0xee)	if ($Xi<19);	# was &movdqa	(@X[3],@X[0])
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions
@@ -691,6 +696,12 @@ sub Xuplast_ssse3_80()
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &paddd	(@X[3],@X[-1&7]);
	 eval(shift(@insns));
@@ -728,9 +739,16 @@ sub Xloop_ssse3()

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@X[3]);
	 eval(shift(@insns));
	 eval(shift(@insns));
@@ -739,6 +757,8 @@ sub Xloop_ssse3()
	&movdqa	(&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@X[3]);

	foreach (@insns) { eval; }
@@ -816,6 +836,64 @@ sub body_40_59 () { # ((b^c)&(c^d))^c
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
######
sub bodyx_00_19 () {	# ((c^d)&b)^d
	# on start @T[0]=(b&c)^(~b&d), $e+=X[]+K
	return &bodyx_20_39()	if ($rx==19);	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.

	'&rorx	($b,$b,2)			if ($j==0);'.	# $b>>>2
	'&rorx	($b,@T[1],7)			if ($j!=0);',	# $b>>>2
	'&lea	($e,&DWP(0,$e,@T[0]));',
	'&rorx	(@T[0],$a,5);',

	'&andn	(@T[1],$a,$c);',
	'&and	($a,$b)',
	'&add	($d,&DWP(4*(($j+1)&15),"esp"));',	# X[]+K xfer

	'&xor	(@T[1],$a)',
	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub bodyx_20_39 () {	# b^d^c
	# on start $b=b^c^d
	return &bodyx_40_59()	if ($rx==39);	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.

	'&add	($e,($j==19?@T[0]:$b))',
	'&rorx	($b,@T[1],7);',	# $b>>>2
	'&rorx	(@T[0],$a,5);',

	'&xor	($a,$b)				if ($j<79);',
	'&add	($d,&DWP(4*(($j+1)&15),"esp"))	if ($j<79);',	# X[]+K xfer
	'&xor	($a,$c)				if ($j<79);',
	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub bodyx_40_59 () {	# ((b^c)&(c^d))^c
	# on start $b=((b^c)&(c^d))^c
	return &bodyx_20_39()	if ($rx==59);	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.

	'&rorx	(@T[0],$a,5)',
	'&lea	($e,&DWP(0,$e,$b))',
	'&rorx	($b,@T[1],7)',	# $b>>>2
	'&add	($d,&DWP(4*(($j+1)&15),"esp"))',	# X[]+K xfer

	'&mov	(@T[1],$c)',
	'&xor	($a,$b)',	# b^c for next round
	'&xor	(@T[1],$b)',	# c^d for next round

	'&and	($a,@T[1])',
	'&add	($e,@T[0])',
	'&xor	($a,$b)'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

&set_label("loop",16);
	&Xupdate_ssse3_16_31(\&body_00_19);
@@ -855,9 +933,10 @@ sub body_40_59 () { # ((b^c)&(c^d))^c
	&mov	(&DWP(12,@T[1]),$D);
	&xor	($B,$D);
	&mov	(&DWP(16,@T[1]),$E);
	&and	($B,@T[0]);
	&movdqa	(@X[0],@X[-3&7]);
	&xchg	($B,@T[0]);
	&mov	(@T[1],@T[0]);
	&pshufd	(@X[0],@X[-4&7],0xee);		# was &movdqa	(@X[0],@X[-3&7]);
	&and	(@T[0],$B);
	&mov	($B,$T[1]);

	&jmp	(&label("loop"));

@@ -1226,9 +1305,10 @@ sub Xtail_avx()
	&mov	(&DWP(8,@T[1]),$C);
	&xor	($B,$D);
	&mov	(&DWP(12,@T[1]),$D);
	&and	($B,@T[0]);
	&mov	(&DWP(16,@T[1]),$E);
	&xchg	($B,@T[0]);
	&mov	(@T[1],@T[0]);
	&and	(@T[0],$B);
	&mov	($B,@T[1]);

	&jmp	(&label("loop"));

+97 −85
Original line number Diff line number Diff line
@@ -62,16 +62,20 @@
# CPU clock cycles spent to process single byte (less is better).
#
#		x86_64		SSSE3		AVX[2]
# P4		9.8		-
# Opteron	6.65		-
# Core2		6.70		6.05/+11%	-
# Westmere	7.08		5.44/+30%	-
# Sandy Bridge	7.93		6.16/+28%	4.99/+59%
# Ivy Bridge	6.30		4.63/+36%	4.60/+37%
# Haswell	5.98		4.12/+45%	3.57/+67%
# Bulldozer	10.9		5.95/+82%
# VIA Nano	10.2		7.46/+37%
# Atom		11.0		9.61/+14%
# P4		9.05		-
# Opteron	6.26		-
# Core2		6.55		6.05/+8%	-
# Westmere	6.73		5.30/+27%	-
# Sandy Bridge	7.70		6.10/+26%	4.99/+54%
# Ivy Bridge	6.06		4.67/+30%	4.60/+32%
# Haswell	5.45		4.15/+31%	3.57/+53%
# Bulldozer	9.11		5.95/+53%
# VIA Nano	9.32		7.15/+30%
# Atom		[10.5?]		[9.23?]/+14%
# Silvermont	13.1(*)		9.37/+40%
#
# (*)	obviously suboptimal result, nothing was done about it,
#	because SSSE3 code is compiled unconditionally;

$flavour = shift;
$output  = shift;
@@ -114,7 +118,7 @@ $num="%r10";
$t0="%eax";
$t1="%ebx";
$t2="%ecx";
@xi=("%edx","%ebp");
@xi=("%edx","%ebp","%r14d");
$A="%esi";
$B="%edi";
$C="%r11d";
@@ -129,42 +133,40 @@ my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
	mov	$xi[0],`4*$i`(%rsp)
___
$code.=<<___ if ($i<15);
	mov	$c,$t0
	mov	`4*$j`($inp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*$i`(%rsp)
	mov	$a,$t2
	xor	$d,$t0
	bswap	$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	lea	0x5a827999($xi[0],$e),$e
	and	$b,$t0
	mov	$xi[1],`4*$j`(%rsp)
	lea	0x5a827999($xi[0],$e),$e
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
$code.=<<___ if ($i>=15);
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	xor	$c,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	rol	\$30,$b
	xor	$d,$t0
	rol	\$1,$xi[1]
	add	$t2,$e
	rol	\$30,$b
	mov	$xi[1],`4*($j%16)`(%rsp)
	rol	\$1,$xi[1]
	add	$t0,$e
___
unshift(@xi,pop(@xi));
push(@xi,shift(@xi));
}

sub BODY_20_39 {
@@ -172,62 +174,58 @@ my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$b,$t0
	`"mov	$xi[0],".4*($i%16)."(%rsp)"	if ($i<72)`
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$b,$t0
	xor	$d,$t0
	rol	\$5,$t2
	lea	$K($xi[0],$e),$e
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	lea	$K($xi[0],$e),$e
	xor	$c,$t0
	add	$t2,$e
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
$code.=<<___ if ($i<76);
	mov	$xi[1],`4*($j%16)`(%rsp)
___
$code.=<<___ if ($i==79);
	mov	$c,$t0
	mov	$b,$t0
	mov	$a,$t2
	xor	$b,$t0
	xor	$d,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$d,$t0
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
unshift(@xi,pop(@xi));
push(@xi,shift(@xi));
}

sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$c,$t1
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$d,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$d,$t0
	and	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	xor	$d,$t1
	lea	0x8f1bbcdc($xi[0],$e),$e
	xor	$c,$t1
	rol	\$5,$t2
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	add	$t0,$e
	and	$b,$t1
	rol	\$1,$xi[1]
	add	$t1,$e
	rol	\$30,$b
	mov	$xi[1],`4*($j%16)`(%rsp)
	and	$b,$t1
	add	$t2,$e
	rol	\$30,$b
	add	$t1,$e
___
unshift(@xi,pop(@xi));
push(@xi,shift(@xi));
}

$code.=<<___;
@@ -261,17 +259,18 @@ $code.=<<___;

.align	16
.Lialu:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	mov	%rsp,%r11
	push	%r14
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%r11,`16*4`(%rsp)
	mov	%rax,`16*4`(%rsp)
.Lprologue:

	mov	0($ctx),$A
@@ -305,11 +304,12 @@ $code.=<<___;
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
	mov	(%rsi),%r13
	mov	8(%rsi),%r12
	mov	16(%rsi),%rbp
	mov	24(%rsi),%rbx
	lea	32(%rsi),%rsp
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lepilogue:
	ret
.size	sha1_block_data_order,.-sha1_block_data_order
@@ -389,11 +389,11 @@ $code.=<<___;
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	add	\$64,$inp
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	pshufb	@X[2],@X[-1&7]
	add	\$64,$inp
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
@@ -418,74 +418,75 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	&movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &paddd	(@Tx[1],@X[-1&7]);
	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	 eval(shift(@insns));		# ror
	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79

	 foreach (@insns) { eval; }	# remaining instructions [if any]

@@ -499,24 +500,27 @@ sub Xupdate_ssse3_32_79()
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	&movdqa	(@Tx[0],@X[-1&7])	if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns))		if ($Xi==8);
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	&palignr(@Tx[0],@X[-2&7],8);	# compose "X[-6]"
	 eval(shift(@insns))		if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	 eval(shift(@insns));
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
	}
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));		# ror
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
@@ -524,29 +528,31 @@ sub Xupdate_ssse3_32_79()
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);

	&movdqa	(@Tx[0],@X[0]);
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39

	&pslld	(@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	&psrld	(@Tx[0],30);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	  &movdqa	(@Tx[1],@X[0])	if ($Xi<19);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
@@ -567,9 +573,10 @@ sub Xuplast_ssse3_80()
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

@@ -602,10 +609,12 @@ sub Xloop_ssse3()

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
@@ -614,6 +623,8 @@ sub Xloop_ssse3()
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
@@ -1680,16 +1691,17 @@ se_handler:
	jae	.Lcommon_seh_tail

	mov	`16*4`(%rax),%rax	# pull saved stack pointer
	lea	32(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler