Commit c372482c authored by Andy Polyakov's avatar Andy Polyakov
Browse files

sha1-x86* assembler update: F_40_59 and Atom-specific optimizations.

parent ba4526e0
Loading
Loading
Loading
Loading
+56 −46
Original line number Diff line number Diff line
@@ -12,6 +12,8 @@
# commentary below], and in 2006 the rest was rewritten in order to
# gain freedom to liberate licensing terms.

# January, September 2004.
#
# It was noted that Intel IA-32 C compiler generates code which
# performs ~30% *faster* on P4 CPU than original *hand-coded*
# SHA1 assembler implementation. To address this problem (and
@@ -31,6 +33,17 @@
# ----------------------------------------------------------------
#					<appro@fy.chalmers.se>

# August 2009.
#
# George Spelvin pointed out that F_40_59(b,c,d) can be rewritten as
# '(c&d) + (b&(c^d))', which allows partial results to be accumulated
# and lightens "pressure" on scratch registers. This resulted in a
# >12% performance improvement on contemporary AMD cores (with no
# degradation on other CPUs:-). Also, the code was revised to maximize
# the "distance" between instructions producing input to the 'lea'
# instruction and the 'lea' instruction itself, which is essential for
# the Intel Atom core.

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
@@ -59,15 +72,16 @@ sub BODY_00_15
	&rotl($tmp1,5);			# tmp1=ROTATE(a,5)
	 &xor($f,$d);
	&add($tmp1,$e);			# tmp1+=e;
	 &and($f,$b);
	 &mov($e,&swtmp($n%16));	# e becomes volatile and is loaded
	 				# with xi, also note that e becomes
					# f in next round...
	 &xor($f,$d);			# f holds F_00_19(b,c,d)
	&and($f,$b);
	&rotr($b,2);			# b=ROTATE(b,30)
	 &xor($f,$d);			# f holds F_00_19(b,c,d)
	&lea($tmp1,&DWP(0x5a827999,$tmp1,$e));	# tmp1+=K_00_19+xi

	if ($n==15) { &add($f,$tmp1); }	# f+=tmp1
	if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round
		      &add($f,$tmp1); }	# f+=tmp1
	else        { &add($tmp1,$f); }	# f becomes a in next round
	}

@@ -77,22 +91,22 @@ sub BODY_16_19

	&comment("16_19 $n");

	&mov($f,&swtmp($n%16));		# f to hold Xupdate(xi,xa,xb,xc,xd)
	&mov($tmp1,$c);			# tmp1 to hold F_00_19(b,c,d)
	&xor($f,&swtmp(($n+2)%16));
	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
	&xor($tmp1,$d);
	 &xor($f,&swtmp(($n+8)%16));
	 &and($tmp1,$b);		# tmp1 holds F_00_19(b,c,d)
	&rotr($b,2);			# b=ROTATE(b,30)
	&and($tmp1,$b);
	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
	&rotl($f,1);			# f=ROTATE(f,1)
	 &xor($tmp1,$d);		# tmp1=F_00_19(b,c,d)
	&add($e,$tmp1);			# e+=F_00_19(b,c,d)
	 &mov($tmp1,$a);
	&rotr($b,2);			# b=ROTATE(b,30)
	 &mov(&swtmp($n%16),$f);	# xi=f
	&lea($f,&DWP(0x5a827999,$f,$e));# f+=K_00_19+e
	 &mov($e,$a);			# e becomes volatile
	&rotl($e,5);			# e=ROTATE(a,5)
	 &add($f,$tmp1);		# f+=F_00_19(b,c,d)
	&add($f,$e);			# f+=ROTATE(a,5)
	&rotl($tmp1,5);			# ROTATE(a,5)
	 &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
	 &add($f,$tmp1);		# f+=ROTATE(a,5)
	}

sub BODY_20_39
@@ -103,20 +117,20 @@ sub BODY_20_39
	&comment("20_39 $n");

	&mov($tmp1,$b);			# tmp1 to hold F_20_39(b,c,d)
	 &mov($f,&swtmp($n%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
	&rotr($b,2);			# b=ROTATE(b,30)
	 &xor($f,&swtmp(($n+2)%16));
	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
	&xor($tmp1,$c);
	 &xor($f,&swtmp(($n+8)%16));
	&xor($tmp1,$d);			# tmp1 holds F_20_39(b,c,d)
	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
	&rotl($f,1);			# f=ROTATE(f,1)
	 &add($tmp1,$e);
	&mov(&swtmp($n%16),$f);		# xi=f
	 &mov($e,$a);			# e becomes volatile
	&rotl($e,5);			# e=ROTATE(a,5)
	 &lea($f,&DWP($K,$f,$tmp1));	# f+=K_20_39+e
	&add($f,$e);			# f+=ROTATE(a,5)
	 &add($e,$tmp1);		# e+=F_20_39(b,c,d)
	&rotr($b,2);			# b=ROTATE(b,30)
	 &mov($tmp1,$a);
	&rotl($tmp1,5);			# ROTATE(a,5)
	 &mov(&swtmp($n%16),$f) if($n<77);# xi=f
	&lea($f,&DWP($K,$f,$e));	# f+=e+K_XX_YY
	 &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round
	&add($f,$tmp1);			# f+=ROTATE(a,5)
	}

sub BODY_40_59
@@ -125,28 +139,24 @@ sub BODY_40_59

	&comment("40_59 $n");

	&mov($f,&swtmp($n%16));		# f to hold Xupdate(xi,xa,xb,xc,xd)
	 &mov($tmp1,&swtmp(($n+2)%16));
	&xor($f,$tmp1);
	 &mov($tmp1,&swtmp(($n+8)%16));
	&xor($f,$tmp1);
	 &mov($tmp1,&swtmp(($n+13)%16));
	&xor($f,$tmp1);			# f holds xa^xb^xc^xd
	 &mov($tmp1,$b);		# tmp1 to hold F_40_59(b,c,d)
	&mov($tmp1,$c);			# tmp1 to hold F_40_59(b,c,d)
	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
	&xor($tmp1,$d);
	 &xor($f,&swtmp(($n+8)%16));
	&and($tmp1,$b);
	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
	&rotl($f,1);			# f=ROTATE(f,1)
	 &or($tmp1,$c);
	&mov(&swtmp($n%16),$f);		# xi=f
	 &and($tmp1,$d);
	&lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e
	 &mov($e,$b);			# e becomes volatile and is used
					# to calculate F_40_59(b,c,d)
	 &add($tmp1,$e);		# b&(c^d)+=e
	&rotr($b,2);			# b=ROTATE(b,30)
	 &and($e,$c);
	&or($tmp1,$e);			# tmp1 holds F_40_59(b,c,d)		
	 &mov($e,$a);
	&rotl($e,5);			# e=ROTATE(a,5)
	 &add($f,$tmp1);		# f+=tmp1;
	 &mov($e,$a);			# e becomes volatile
	&rotl($e,5);			# ROTATE(a,5)
	 &mov(&swtmp($n%16),$f);	# xi=f
	&lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d))
	 &mov($tmp1,$c);
	&add($f,$e);			# f+=ROTATE(a,5)
	 &and($tmp1,$d);
	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
	 &add($f,$tmp1);		# f+=c&d
	}

&function_begin("sha1_block_data_order");
+133 −134
Original line number Diff line number Diff line
@@ -16,7 +16,7 @@
# There was suggestion to mechanically translate 32-bit code, but I
# dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer larger *addressable* bank, but out-of-order core
# reaches for even more registers through dynamic aliasing, and EM64T
@@ -29,6 +29,13 @@
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# August 2009.
#
# The code was revised to minimize code size and to maximize
# "distance" between instructions producing input to 'lea'
# instruction and the 'lea' instruction itself, which is essential
# for Intel Atom core.

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -51,194 +58,184 @@ $ctx="%r8";
# Registers that hold the input pointer and the block counter once the
# incoming arguments have been moved out of the way.
$inp="%r9";
$num="%r10";

# NOTE(review): two register maps follow back to back.  $t0, $t1 and
# $A..$E are each assigned twice; Perl keeps the later value, so the
# second map (from the second $t0 assignment on) is the one in effect.
# $xi (scalar) and $T are only set by the first map.
$xi="%eax";
$t0="%ebx";
$t1="%ecx";
$A="%edx";
$B="%esi";
$C="%edi";
$D="%ebp";
$E="%r11d";
$T="%r12d";
# Second map: three scratch registers, a two-entry @xi pair that the
# BODY_* subs index as $xi[0]/$xi[1], and the five working variables.
$t0="%eax";
$t1="%ebx";
$t2="%ecx";
@xi=("%edx","%ebp");
$A="%esi";
$B="%edi";
$C="%r11d";
$D="%r12d";
$E="%r13d";

# Six-entry list of working registers (includes $T).
# NOTE(review): @V is assigned again further down in this file with five
# entries before the round loops run - confirm which one is intended.
@V=($A,$B,$C,$D,$E,$T);

# PROLOGUE($func): append the entry sequence for function $func to $code.
#
# The emitted text declares $func (.globl/.type/.align and the label),
# pushes %rbx, %rbp and %r12, keeps the post-push stack pointer in %r11,
# copies the three incoming arguments from %rdi/%rsi/%rdx to
# $ctx/$inp/$num, reserves 8+16*4 bytes, aligns %rsp down to a multiple
# of 64 and stores the saved stack pointer at 16*4(%rsp).  After the
# .Lprologue label it loads the five 32-bit words at offsets 0..16 of
# $ctx into $A..$E.
#
# The back-quoted expressions are left in the text as-is here; presumably
# a later pass over $code evaluates them (not visible in this excerpt).
#
# NOTE(review): only %rbx, %rbp and %r12 are saved, while the register
# map in this file ends up assigning %r13d to $E - confirm whether this
# helper is still meant to be used with that map.
sub PROLOGUE {
my $func=shift;
$code.=<<___;
.globl	$func
.type	$func,\@function,3
.align	16
$func:
	push	%rbx
	push	%rbp
	push	%r12
	mov	%rsp,%r11
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%r11,`16*4`(%rsp)
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
___
}

# EPILOGUE($func): append the exit sequence for function $func to $code.
#
# The emitted text reloads the stack pointer that was stored at
# 16*4(%rsp), restores %r12, %rbp and %rbx from offsets 0, 8 and 16 of
# it, sets %rsp to 24 bytes above it (i.e. past three saved registers),
# then emits the .Lepilogue label, `ret` and the .size directive.
sub EPILOGUE {
my $func=shift;
$code.=<<___;
	mov	`16*4`(%rsp),%rsi
	mov	(%rsi),%r12
	mov	8(%rsi),%rbp
	mov	16(%rsi),%rbx
	lea	24(%rsi),%rsp
.Lepilogue:
	ret
.size	$func,.-$func
___
}
# Five-entry list of working registers; the round loops pass it to the
# BODY_* subs and rotate it by one position after every round.
@V=($A,$B,$C,$D,$E);

# BODY_00_19($i,$a,$b,$c,$d,$e): append the code for round $i (0..19)
# to $code.
#
# $a..$e name the registers holding the five working variables for this
# round.  The round result is accumulated directly into $e:
#	$e += 0x5a827999 + X[$i] + ROL($a,5) + ((($c^$d)&$b)^$d)
# and $b is rotated left by 30.  $t0 and $t2 are scratch.
#
# $xi[0] holds X[$i] on entry and $xi[1] receives X[$i+1]; the pair is
# swapped on the way out so the next round finds its word in $xi[0].
#  - $i==0 also loads the first input word, byte-swaps it and stores it
#    in slot 0 of the 16-word buffer at (%rsp);
#  - $i<15 fetches X[$i+1] from the input block at $inp, byte-swaps it
#    and stores it in its buffer slot;
#  - $i>=15 derives X[$i+1] from buffer slots j, j+2, j+8 and j+13
#    (mod 16, j=$i+1), rotates it left by 1 and writes it back to slot j.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
	mov	$xi[0],`4*$i`(%rsp)
___
$code.=<<___ if ($i<15);
	mov	$c,$t0
	mov	`4*$j`($inp),$xi[1]
	mov	$a,$t2
	xor	$d,$t0
	bswap	$xi[1]
	rol	\$5,$t2
	lea	0x5a827999($xi[0],$e),$e
	and	$b,$t0
	mov	$xi[1],`4*$j`(%rsp)
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
$code.=<<___ if ($i>=15);
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$1,$xi[1]
	add	$t2,$e
	rol	\$30,$b
	mov	$xi[1],`4*($j%16)`(%rsp)
	add	$t0,$e
___
# Swap the two @xi entries: the word just prepared becomes $xi[0].
unshift(@xi,pop(@xi));
}

# BODY_20_39($i,$a,$b,$c,$d,$e): append the code for one parity round to
# $code.  The driver calls it for rounds 20..39 and again for 60..79;
# the additive constant $K is 0x6ed9eba1 when $i<40 and 0xca62c1d6
# otherwise.
#
# $a..$e name the registers holding the five working variables.  The
# round result is accumulated directly into $e:
#	$e += $K + X[$i] + ROL($a,5) + ($b^$c^$d)
# and $b is rotated left by 30.  $t0 and $t2 are scratch.
#
# $xi[0] holds X[$i] on entry.  For $i<79 the next word X[$i+1] is built
# in $xi[1] from buffer slots j, j+2, j+8 and j+13 (mod 16, j=$i+1) and
# rotated left by 1; it is written back to the buffer only while $i<76.
# Round 79 emits the round arithmetic alone.  The @xi pair is swapped on
# the way out.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$b,$t0
	rol	\$5,$t2
	lea	$K($xi[0],$e),$e
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	add	$t2,$e
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
$code.=<<___ if ($i<76);
	mov	$xi[1],`4*($j%16)`(%rsp)
___
$code.=<<___ if ($i==79);
	mov	$c,$t0
	mov	$a,$t2
	xor	$b,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$d,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
# Swap the two @xi entries: the word just prepared becomes $xi[0].
unshift(@xi,pop(@xi));
}

# BODY_40_59($i,$a,$b,$c,$d,$e): append the code for round $i (40..59)
# to $code.
#
# $a..$e name the registers holding the five working variables.  The
# round function is evaluated as two terms that are added into $e
# separately, ($c&$d) in $t0 and ($b&($c^$d)) in $t1:
#	$e += 0x8f1bbcdc + X[$i] + ($c&$d) + ($b&($c^$d)) + ROL($a,5)
# and $b is rotated left by 30.  $t2 is scratch for the rotated $a.
#
# $xi[0] holds X[$i] on entry; X[$i+1] is built in $xi[1] from buffer
# slots j, j+2, j+8 and j+13 (mod 16, j=$i+1), rotated left by 1 and
# written back to slot j.  The @xi pair is swapped on the way out.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$c,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$d,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	xor	$d,$t1
	lea	0x8f1bbcdc($xi[0],$e),$e
	rol	\$5,$t2
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	add	$t0,$e
	and	$b,$t1
	rol	\$1,$xi[1]
	add	$t1,$e
	rol	\$30,$b
	mov	$xi[1],`4*($j%16)`(%rsp)
	add	$t2,$e
___
# Swap the two @xi entries: the word just prepared becomes $xi[0].
unshift(@xi,pop(@xi));
}

# Emit sha1_block_data_order (declared with three arguments, which end
# up in $ctx, $inp and $num).
#
# Entry: push %rbx, %rbp, %r12 and %r13 - the callee-saved registers
# that the register map in this file hands out - keep the post-push
# stack pointer in %r11, move the arguments out of %rdi/%rsi/%rdx,
# reserve 8+16*4 bytes, align %rsp down to a multiple of 64 and store
# the saved stack pointer at 16*4(%rsp), just above the 16-word buffer.
# Then load the five state words from $ctx into $A..$E.
$code.=<<___;
.text

.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function,3
.align	16
sha1_block_data_order:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	mov	%rsp,%r11
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%r11,`16*4`(%rsp)
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E

.align	4
.Lloop:
___
# Unroll the 80 rounds.  @V is rotated by one position after every
# round, so each BODY_* call sees the working variables under the names
# ($a,$b,$c,$d,$e) appropriate for that round.
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Loop tail and exit.  With five entries in @V, 80 rotations bring it
# back to its starting order, so $A..$E line up with state words 0..4
# and can be added into $ctx and stored directly.  `lea` advances $inp
# by one 64-byte block between `sub` and `jnz` without disturbing the
# flags `sub` produced.  On exit the saved registers are reloaded
# relative to the stack pointer stored at 16*4(%rsp), in the reverse
# order of the four pushes at entry.
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
	mov	(%rsi),%r13
	mov	8(%rsi),%r12
	mov	16(%rsi),%rbp
	mov	24(%rsi),%rbx
	lea	32(%rsi),%rsp
.Lepilogue:
	ret
.size	sha1_block_data_order,.-sha1_block_data_order

.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___
@@ -281,14 +278,16 @@ se_handler:
	jae	.Lin_prologue

	mov	`16*4`(%rax),%rax	# pull saved stack pointer
	lea	24(%rax),%rax
	lea	32(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13

.Lin_prologue:
	mov	8(%rax),%rdi