Commit 4f39edbf authored by Andy Polyakov
Browse files

gcm128.c and assembler modules: change argument order for gcm_ghash_4bit.

ghash-x86*.pl: fix performance numbers for Core2; as it turned out, the
previous ones were "tainted" by variable clock frequency.
parent 8decc967
Loading
Loading
Loading
Loading
+3 −9
Original line number Diff line number Diff line
@@ -31,10 +31,10 @@ $Thi1="t5";
$Tlo1="t6";
$rem="t7";	# $8
#################
$Xi="a0";	# $16
$Xi="a0";	# $16, input argument block
$Htbl="a1";


$inp="a2";
$len="a3";
$nlo="a4";	# $20
$nhi="a5";
$Zhi="t8";
@@ -314,12 +314,6 @@ $code.=<<___;
.end	gcm_gmult_4bit
___

# argument block for gcm_ghash_4bit
$inp="a0";	# $16
$len="a1";
$Xi ="a2";
$Htbl="a3";

$inhi="s0";
$inlo="s1";

+4 −4
Original line number Diff line number Diff line
@@ -142,13 +142,13 @@ gcm_ghash_4bit:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,4,4,0,8
	$ADDP	inp=15,in0			// &inp[15]
	$ADDP	inp=15,in2			// &inp[15]
	mov	rem_4bitp=ip		}
{ .mmi;	$ADDP	end=in1,in0			// &inp[len]
	$ADDP	Xi=15,in2			// &Xi[15]
{ .mmi;	$ADDP	end=in3,in2			// &inp[len]
	$ADDP	Xi=15,in0			// &Xi[15]
	.save	ar.lc,prevlc
	mov	prevlc=ar.lc		};;
{ .mmi;	$ADDP	Htbl=8,in3			// &Htbl[0].lo
{ .mmi;	$ADDP	Htbl=8,in1			// &Htbl[0].lo
	mov	mask0xf0=0xf0
	.save	pr,prevpr
	mov	prevpr=pr		}
+4 −6
Original line number Diff line number Diff line
@@ -54,10 +54,10 @@ $remi="%l5";
$Htblo="%l6";
$cnt="%l7";

$inp="%i0";	# input arguments for gcm_ghash_4bit
$len="%i1";
$Xi="%i2";
$Htbl="%i3";
$Xi="%i0";	# input argument block
$Htbl="%i1";
$inp="%i2";
$len="%i3";

$code.=<<___;
.section	".text",#alloc,#execinstr
@@ -208,8 +208,6 @@ gcm_ghash_4bit:
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
___

$Xi="%i0";	# input arguments for gcm_gmult_4bit
$Htbl="%i1";
undef $inp;
undef $len;

+17 −17
Original line number Diff line number Diff line
@@ -23,7 +23,7 @@
# PIII		63 /77		16		24
# P4		96 /122		30		84(***)
# Opteron	50 /71		21		30
# Core2		63 /102		19		28
# Core2		54 /68		13		18
#
# (*)	gcc 3.4.x was observed to generate a few percent slower code,
#	which is one of the reasons why 2.95.3 results were chosen,
@@ -317,12 +317,12 @@ if ($unroll) {

	&lea	("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));

	&mov	($inp,&wparam(0));	# load in
	&mov	($Zlh,&wparam(1));	# load len
	&mov	($Zhh,&wparam(2));	# load Xi
	&mov	($Htbl,&wparam(3));	# load Htable
	&mov	($Zhh,&wparam(0));	# load Xi
	&mov	($Htbl,&wparam(1));	# load Htable
	&mov	($inp,&wparam(2));	# load in
	&mov	($Zlh,&wparam(3));	# load len
	&add	($Zlh,$inp);
	&mov	(&wparam(1),$Zlh);	# len to point at the end of input
	&mov	(&wparam(3),$Zlh);	# len to point at the end of input
	&stack_push(4+1);		# +1 for stack alignment
	&mov	($Zll,&DWP(12,$Zhh));	# load Xi[16]
	&mov	($Zhl,&DWP(4,$Zhh));
@@ -344,10 +344,10 @@ if ($unroll) {
	&mmx_loop("esp","eax");

	&lea	($inp,&DWP(16,$inp));
	&cmp	($inp,&wparam(1));
	&cmp	($inp,&wparam(3));
	&jb	(&label("mmx_outer_loop"));

	&mov	($inp,&wparam(2));	# load Xi
	&mov	($inp,&wparam(0));	# load Xi
	&emms	();
	&mov	(&DWP(12,$inp),$Zll);
	&mov	(&DWP(4,$inp),$Zhl);
@@ -359,12 +359,12 @@ if ($unroll) {
    &set_label("x86",16);
    }
	&stack_push(16+4+1);			# +1 for 64-bit alignment
	&mov	($inp,&wparam(0));		# load in
	&mov	("ecx",&wparam(1));		# load len
	&mov	($Zll,&wparam(2));		# load Xi
	&mov	($Htbl,&wparam(3));		# load Htable
	&mov	($Zll,&wparam(0));		# load Xi
	&mov	($Htbl,&wparam(1));		# load Htable
	&mov	($inp,&wparam(2));		# load in
	&mov	("ecx",&wparam(3));		# load len
	&add	("ecx",$inp);
	&mov	(&wparam(1),"ecx");
	&mov	(&wparam(3),"ecx");

	&mov	($Zhh,&DWP(0,$Zll));		# load Xi[16]
	&mov	($Zhl,&DWP(4,$Zll));
@@ -390,14 +390,14 @@ if ($unroll) {
		&call	("_x86_gmult_4bit_inner");
	} else {
		&x86_loop(0);
		&mov	($inp,&wparam(0));
		&mov	($inp,&wparam(2));
	}
	&lea	($inp,&DWP(16,$inp));
	&cmp	($inp,&wparam(1));
	&mov	(&wparam(0),$inp)	if (!$unroll);
	&cmp	($inp,&wparam(3));
	&mov	(&wparam(2),$inp)	if (!$unroll);
	&jb	(&label("x86_outer_loop"));

	&mov	($inp,&wparam(2));	# load Xi
	&mov	($inp,&wparam(0));	# load Xi
	&mov	(&DWP(12,$inp),$Zll);
	&mov	(&DWP(8,$inp),$Zlh);
	&mov	(&DWP(4,$inp),$Zhl);
+4 −6
Original line number Diff line number Diff line
@@ -18,7 +18,7 @@
#		gcc 3.4.x	assembler
#
# Opteron	18.5		10.2		+80%
# Core2		26.0		16.4		+58%
# Core2		17.5		11.0		+59%

$flavour = shift;
$output  = shift;
@@ -41,10 +41,10 @@ $Zhi="%r9";
$tmp="%r10";
$rem_4bit = "%r11";

# per-function register layout
$Xi="%rdi";
$Htbl="%rsi";

# per-function register layout
$cnt="%rcx";
$rem="%rdx";

@@ -159,10 +159,8 @@ ___


# per-function register layout
$inp="%rdi";
$len="%rsi";
$Xi="%rdx";
$Htbl="%rcx";
$inp="%rdx";
$len="%rcx";

$cnt="%rbp";
$rem="%r12";
Loading