Commit 480cd6ab authored by Andy Polyakov's avatar Andy Polyakov
Browse files

ghash-ia64.pl: new file, GHASH for Itanium.

ghash-x86_64.pl: minimize stack frame usage.
ghash-x86.pl: modulo-scheduling MMX loop in respect to input vector
results in up to 10% performance improvement.
parent 6c6bdd54
Loading
Loading
Loading
Loading
+228 −0
Original line number Diff line number Diff line
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" Galois field multiplication and
# streamed GHASH function. "4-bit" means that it uses 256 bytes
# per-key table [+128 bytes shared table]. Streamed GHASH performance
# was measured to be 6.35 cycles per processed byte on Itanium 2,
# which is >90% better than Microsoft compiler generated code. Well,
# the number should have been ~6.5. The deviation has everything to do
# with the way performance is measured, namely as the difference between
# GCM and straightforward 128-bit counter mode. To anchor to something
# else, the sha1-ia64.pl module processes one byte in 6.0 cycles. On
# Itanium GHASH should run at ~8.5 cycles per byte.

# The first command-line argument, if present, names the output file and
# STDOUT is redirected to it; the remaining arguments are scanned below as
# compiler flags. Three-argument open keeps the file name from being
# interpreted as part of the open mode.
$output=shift and (open STDOUT,">",$output or die "can't open $output: $!");

# $ADDP is the instruction used by the generated code to form pointers from
# the incoming arguments. On HP-UX it defaults to "addp4" and becomes plain
# "add" only when +DD64 or -mlp64 is among the flags (presumably the 64-bit
# data model switches of the HP and GNU compilers - confirm). The pattern is
# an alternation; the former character class also matched unrelated
# arguments such as anything containing "m64" or "l64".
if ($^O eq "hpux") {
    $ADDP="addp4";
    for (@ARGV) { $ADDP="add" if (/(?:\+DD|\-mlp)64/); }
} else { $ADDP="add"; }

# Target endianness: an explicit -DB_ENDIAN/-DL_ENDIAN flag wins (the last
# one seen takes effect); otherwise fall back to the endianness of the host
# running this script, detected by reading a big-endian packed 1 back as a
# native 32-bit integer.
for (@ARGV)  {  $big_endian=1 if (/\-DB_ENDIAN/);
                $big_endian=0 if (/\-DL_ENDIAN/);  }
if (!defined($big_endian))
             {  $big_endian=(unpack('L',pack('N',1))==1);  }

# loop($label, $mask_inp)
#
# Appends the software-pipelined inner loop to the global $code.
#   $label    - assembler label placed at the top of the loop and used as
#               the target of the closing br.ctop;
#   $mask_inp - when true, the two instructions that touch the input
#               stream (the ld1 from [inp] and the xor with in[1]) are
#               predicated on p63 instead of p16/p17, which masks
#               references to inp; when false or omitted they use p16/p17.
# Returns nothing of interest; the only effect is the text added to $code.
#
# Declared without a prototype: the sub does take arguments, and the former
# empty prototype only worked because callers use the &loop(...) form.
sub loop {
my ($label,$mask_inp)=@_;
my ($p16,$p17)=$mask_inp?("p63","p63"):("p16","p17"); # mask references to inp

# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
# in scalable manner;-) Naturally assuming data in L1 cache...
# Special note about 'dep' instruction, which is used to construct
# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
# bytes boundary and lower 7 bits of its address are guaranteed to
# be zero.
$code.=<<___;
$label:
{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p19)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p19)	xor	Zhi=Zhi,Hhi
	($p17)	xor	xi[1]=xi[1],in[1]	};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p19)	shrp	Zlo=Zhi,Zlo,4		}
{ .mfi;	(p19)	ld8	rem=[rem]
	(p18)	and	Hi[1]=mask0xf0,xi[2]	};;
{ .mmi;	($p16)	ld1	in[0]=[inp],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p19)	shr.u	Zhi=Zhi,4		}
{ .mib;	(p19)	xor	Hhi=Hhi,rem
	(p18)	add	Hi[1]=Htbl,Hi[1]	};;

{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p18)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p17)	shladd	Hi[0]=xi[1],4,r0
	(p18)	xor	Zhi=Zhi,Hhi		};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p18)	shrp	Zlo=Zhi,Zlo,4		}
{ .mfi;	(p18)	ld8	rem=[rem]
	(p17)	and	Hi[0]=mask0xf0,Hi[0]	};;
{ .mmi;	(p16)	ld1	xi[0]=[Xi],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p18)	shr.u	Zhi=Zhi,4		}
{ .mib;	(p18)	xor	Hhi=Hhi,rem
	(p17)	add	Hi[0]=Htbl,Hi[0]
	br.ctop.sptk	$label			};;
___
}

# Start of the generated assembly: assembler directives, the symbolic
# register aliases shared by both routines, and the entry of
# gcm_gmult_4bit up to (but not including) its inner loop. The routine
# derives Xi from its first argument (in0) and Htbl from its second (in1),
# saves ar.pfs/ar.lc/pr, loads the last two bytes of Xi, clears Zhi/Zlo and
# computes the address of rem_4bit relative to the routine's own entry
# point. $ADDP is interpolated by Perl; everything else is literal text.
$code=<<___;
.explicit
.text

prevfs=r2;	prevlc=r3;	prevpr=r8;
mask0xf0=r21;
rem=r22;	rem_4bitp=r23;
Xi=r24;		Htbl=r25;
inp=r26;	end=r27;
Hhi=r28;	Hlo=r29;
Zhi=r30;	Zlo=r31;

.global	gcm_gmult_4bit#
.proc	gcm_gmult_4bit#
.align	128
.skip	16;;					// aligns loop body
gcm_gmult_4bit:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,2,6,0,8
	$ADDP	Xi=15,in0			// &Xi[15]
	mov	rem_4bitp=ip		}
{ .mii;	$ADDP	Htbl=8,in1			// &Htbl[0].lo
	.save	ar.lc,prevlc
	mov	prevlc=ar.lc
	.save	pr,prevpr
	mov	prevpr=pr		};;

	.body
	.rotr	in[3],xi[3],Hi[2]

{ .mib;	ld1	xi[2]=[Xi],-1			// Xi[15]
	mov	mask0xf0=0xf0
	brp.loop.imp	.Loop1,.Lend1-16};;
{ .mmi;	ld1	xi[1]=[Xi],-1			// Xi[14]
					};;
{ .mii;	shladd	Hi[1]=xi[2],4,r0
	mov	pr.rot=0x7<<16
	mov	ar.lc=13		};;
{ .mii;	and	Hi[1]=mask0xf0,Hi[1]
	mov	ar.ec=3
	xor	Zlo=Zlo,Zlo		};;
{ .mii;	add	Hi[1]=Htbl,Hi[1]		// &Htbl[nlo].lo
	add	rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
	xor	Zhi=Zhi,Zhi		};;
___
	# Inner loop of gcm_gmult_4bit; the true second argument makes loop()
	# mask the references to inp, as this routine has no input stream.
	&loop	(".Loop1",1);
# Two pieces are appended here:
#  - the tail of gcm_gmult_4bit (.Lend1): a final xor into Zhi, the mux1
#    \@rev instructions (rewritten to no-ops for big-endian targets at the
#    end of this script), the stores of Zlo/Zhi back through addresses
#    derived from Xi, restoration of pr and ar.lc, and the return;
#  - the entry of gcm_ghash_4bit up to its inner loop. As coded, the
#    arguments are in0 = inp, in1 = len (end is formed as in1+in0),
#    in2 = Xi and in3 = Htbl. The first input byte pair is loaded and
#    xor-ed into Xi[15] before the loop starts.
$code.=<<___;
.Lend1:
{ .mib;	xor	Zhi=Zhi,Hhi		};;	// modulo-scheduling artefact
{ .mib;	mux1	Zlo=Zlo,\@rev		};;
{ .mib;	mux1	Zhi=Zhi,\@rev		};;
{ .mmi;	add	Hlo=9,Xi;;			// ;; is here to prevent
	add	Hhi=1,Xi		};;	// pipeline flush on Itanium
{ .mib;	st8	[Hlo]=Zlo
	mov	pr=prevpr,-2		};;
{ .mib;	st8	[Hhi]=Zhi
	mov	ar.lc=prevlc
	br.ret.sptk.many	b0	};;
.endp	gcm_gmult_4bit#

.global	gcm_ghash_4bit#
.proc	gcm_ghash_4bit#
.align	32;;
gcm_ghash_4bit:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,4,4,0,8
	$ADDP	inp=15,in0			// &inp[15]
	mov	rem_4bitp=ip		}
{ .mmi;	$ADDP	end=in1,in0			// &inp[len]
	$ADDP	Xi=15,in2			// &Xi[15]
	.save	ar.lc,prevlc
	mov	prevlc=ar.lc		};;
{ .mmi;	$ADDP	Htbl=8,in3			// &Htbl[0].lo
	mov	mask0xf0=0xf0
	.save	pr,prevpr
	mov	prevpr=pr		}

	.body
	.rotr	in[3],xi[3],Hi[2]

{ .mmi;	ld1	in[2]=[inp],-1			// inp[15]
	ld1	xi[2]=[Xi],-1			// Xi[15]
	add	end=-17,end		};;
{ .mmi;	ld1	in[1]=[inp],-1			// inp[14]
	ld1	xi[1]=[Xi],-1			// Xi[14]
	xor	xi[2]=xi[2],in[2]	};;
{ .mii;	shladd	Hi[1]=xi[2],4,r0
	mov	pr.rot=0x7<<16
	mov	ar.lc=13		};;
{ .mii;	and	Hi[1]=mask0xf0,Hi[1]
	mov	ar.ec=3
	xor	Zlo=Zlo,Zlo		};;
{ .mii;	add	Hi[1]=Htbl,Hi[1]		// &Htbl[nlo].lo
	add	rem_4bitp=rem_4bit#-gcm_ghash_4bit#,rem_4bitp
	xor	Zhi=Zhi,Zhi		};;
___
	# Inner loop of gcm_ghash_4bit; no second argument, so the loop keeps
	# its p16/p17-predicated loads from inp and the xor with in[1].
	&loop	(".LoopN");
# Remainder of gcm_ghash_4bit followed by the shared constant table:
#  - outer-loop control: p6 is set while inp is still below end; under p6
#    the next block's first bytes are loaded, ar.lc/ar.ec/pr.rot are
#    re-armed, Zlo/Zhi are cleared and control returns to .LoopN. The
#    current Zlo/Zhi are stored back through addresses derived from Xi on
#    every pass, with or without p6;
#  - epilogue: restore pr and ar.lc, return;
#  - rem_4bit: sixteen 8-byte entries (128 bytes), aligned to 128 bytes,
#    which the 'dep'-based indexing in the inner loop relies on;
#  - an identification string.
# The mux1 \@rev instructions are rewritten to no-ops for big-endian
# targets at the end of this script.
$code.=<<___;
{ .mib;	xor	Zhi=Zhi,Hhi			// modulo-scheduling artefact
	extr.u	xi[2]=Zlo,0,8		}	// Xi[15]
{ .mib;	cmp.ltu	p6,p0=inp,end			// are we done?
	add	inp=32,inp			// advance inp
	clrrrb.pr			};;
{ .mii;
(p6)	ld1	in[2]=[inp],-1			// inp[15]
(p6)	extr.u	xi[1]=Zlo,8,8			// Xi[14]
(p6)	mov	ar.lc=13		};;
{ .mii;
(p6)	ld1	in[1]=[inp],-1			// inp[14]
(p6)	mov	ar.ec=3
	mux1	Zlo=Zlo,\@rev		};;
{ .mii;
(p6)	xor	xi[2]=xi[2],in[2]
	mux1	Zhi=Zhi,\@rev		};;
{ .mii;
(p6)	shladd	Hi[1]=xi[2],4,r0
	add	Hlo=9,Xi			// Xi is &Xi[-1]
	add	Hhi=1,Xi		};;
{ .mii;
(p6)	and	Hi[1]=mask0xf0,Hi[1]
(p6)	add	Xi=14,Xi			// &Xi[13]
(p6)	mov	pr.rot=0x7<<16		};;

{ .mii; st8	[Hlo]=Zlo
(p6)	xor	Zlo=Zlo,Zlo
(p6)	add	Hi[1]=Htbl,Hi[1]	};;
{ .mib;	st8	[Hhi]=Zhi
(p6)	xor	Zhi=Zhi,Zhi
(p6)	br.cond.dptk.many	.LoopN	};;

{ .mib;	mov	pr=prevpr,-2		}
{ .mib;	mov	ar.lc=prevlc
	br.ret.sptk.many	b0	};;
.endp	gcm_ghash_4bit#

.align	128;;
.type	rem_4bit#,\@object
rem_4bit:
        data8	0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
        data8	0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
        data8	0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
        data8	0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
.size	rem_4bit#,128
stringz	"GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___

# For big-endian targets every "mux1 <dst>=<src>,@rev" in the generated text
# is replaced in place by "nop.i 0x0", keeping the original whitespace after
# the mnemonic so the instruction stays in the same bundle slot.
$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm      if ($big_endian);

print $code;
# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed; fail loudly rather than leave a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";
+47 −20
Original line number Diff line number Diff line
@@ -7,9 +7,11 @@
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" Galois field multiplication and
# streamed GHASH function. "4-bit" means that it uses 256 bytes
# per-key table [+128/256 bytes fixed table]. It has two code paths:
# per-key table [+64/128 bytes fixed table]. It has two code paths:
# vanilla x86 and vanilla MMX. Former will be executed on 486 and
# Pentium, latter on all others. Performance results are for streamed
# GHASH subroutine and are expressed in cycles per processed byte,
@@ -18,13 +20,13 @@
#		gcc 2.95.3(*)	MMX assembler	x86 assembler
#
# Pentium	100/112(**)	-		50
# PIII		63 /77		17		24
# P4		96 /122		33		84(***)
# Opteron	50 /71		22		30
# Core2		63 /102		21		28
# PIII		63 /77		16		24
# P4		96 /122		30		84(***)
# Opteron	50 /71		21		30
# Core2		63 /102		19		28
#
# (*)	gcc 3.4.x was observed to generate few percent slower code,
#	which is one of reasons why 2.95.3 result were chosen;
#	which is one of reasons why 2.95.3 results were chosen,
#	another reason is lack of 3.4.x results for older CPUs;
# (**)	second number is result for code compiled with -fPIC flag,
#	which is actually more relevant, because assembler code is
@@ -32,8 +34,8 @@
# (***)	see comment in non-MMX routine for further details;
#
# To summarize, it's 2-3 times faster than gcc-generated code. To
# anchor it to something else SHA1 assembler processes single byte
# in 11-13 cycles.
# anchor it to something else SHA1 assembler processes one byte in
# 11-13 cycles on contemporary x86 cores.

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
@@ -52,13 +54,13 @@ $Htbl = "esi";

$unroll = 0;	# Affects x86 loop. Folded loop performs ~7% worse
		# than unrolled, which has to be weighted against
		# almost 2x code size reduction. Well, *overall*
		# code size. x86-specific code shrinks by 7.5x...
		# 1.7x code size reduction. Well, *overall* 1.7x,
		# x86-specific code itself shrinks by 2.5x...

sub mmx_loop() {
# MMX version performs 2.5 times better on P4 (see comment in non-MMX
# routine for further details), 35% better on Opteron and Core2, 40%
# better on PIII... In other words effort is considered to be well
# MMX version performs 2.8 times better on P4 (see comment in non-MMX
# routine for further details), 40% better on Opteron, 50% better
# on PIII and Core2... In other words effort is considered to be well
# spent...
    my $inp = shift;
    my $rem_4bit = shift;
@@ -74,7 +76,7 @@ sub mmx_loop() {
	&xor	($nlo,$nlo);	# avoid partial register stalls on PIII
	&mov	($nhi,$Zll);
	&mov	(&LB($nlo),&LB($nhi));
	&mov	($cnt,15);
	&mov	($cnt,14);
	&shl	(&LB($nlo),4);
	&and	($nhi,0xf0);
	&movq	($Zlo,&QWP(8,$Htbl,$nlo));
@@ -85,34 +87,59 @@ sub mmx_loop() {
    &set_label("mmx_loop",16);
	&psrlq	($Zlo,4);
	&and	($rem,0xf);
	&pxor	($Zlo,&QWP(8,$Htbl,$nhi));
	&movq	($tmp,$Zhi);
	&psrlq	($Zhi,4);
	&mov	(&LB($nlo),&BP(0,$inp,$cnt));
	&dec	($cnt);
	&pxor	($Zlo,&QWP(8,$Htbl,$nhi));
	&psllq	($tmp,60);
	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
	&movd	($rem,$Zlo);
	&pxor	($Zhi,&QWP(0,$Htbl,$nhi));
	&mov	($nhi,$nlo);
	&pxor	($Zlo,$tmp);
	&js	(&label("mmx_break"));

	&movz	($nhi,&BP(0,$inp,$cnt));
	&shl	(&LB($nlo),4);
	&and	($rem,0xf);
	&psrlq	($Zlo,4);
	&mov	(&LB($nlo),&LB($nhi));
	&and	($nhi,0xf0);
	&movq	($tmp,$Zhi);
	&shl	(&LB($nlo),4);
	&psrlq	($Zhi,4);
	&and	($rem,0xf);
	&pxor	($Zlo,&QWP(8,$Htbl,$nlo));
	&psllq	($tmp,60);
	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
	&movd	($rem,$Zlo);
	&pxor	($Zhi,&QWP(0,$Htbl,$nlo));
	&pxor	($Zlo,$tmp);
	&and	($nhi,0xf0);
	&jmp	(&label("mmx_loop"));

    &set_label("mmx_break",16);
	&shl	(&LB($nlo),4);
	&and	($rem,0xf);
	&psrlq	($Zlo,4);
	&and	($nhi,0xf0);
	&movq	($tmp,$Zhi);
	&psrlq	($Zhi,4);
	&pxor	($Zlo,&QWP(8,$Htbl,$nlo));
	&psllq	($tmp,60);
	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
	&movd	($rem,$Zlo);
	&pxor	($Zhi,&QWP(0,$Htbl,$nlo));
	&pxor	($Zlo,$tmp);

	&psrlq	($Zlo,4);
	&and	($rem,0xf);
	&pxor	($Zlo,&QWP(8,$Htbl,$nhi));
	&movq	($tmp,$Zhi);
	&psrlq	($Zhi,4);
	&psllq	($tmp,60);
	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
	&movd	($rem,$Zlo);
	&pxor	($Zhi,&QWP(0,$Htbl,$nhi));
	&mov	($nhi,$nlo);
	&pxor	($Zlo,$tmp);

	&psrlq	($Zlo,32);	# lower part of Zlo is already there
	&movd	($Zhl,$Zhi);
	&psrlq	($Zhi,32);
+15 −15
Original line number Diff line number Diff line
@@ -7,9 +7,11 @@
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" Galois field multiplication and
# streamed GHASH function. "4-bit" means that it uses 256 bytes
# per-key table [+128 bytes fixed table]. Performance results are for
# per-key table [+128 bytes shared table]. Performance results are for
# streamed GHASH subroutine and are expressed in cycles per processed
# byte, less is better:
#
@@ -136,9 +138,8 @@ $code=<<___;
.align	16
gcm_gmult_4bit:
	push	%rbx
	push	%rbp
	push	%r12
	sub	\$16,%rsp
	push	%rbp		# %rbp and %r12 are pushed exclusively in
	push	%r12		# order to reuse Win64 exception handler...
.Lgmult_prologue:

	movzb	15($Xi),$Zlo
@@ -149,8 +150,8 @@ $code.=<<___;
	mov	$Zlo,8($Xi)
	mov	$Zhi,($Xi)

	mov	32(%rsp),%rbx
	lea	40(%rsp),%rsp
	mov	16(%rsp),%rbx
	lea	24(%rsp),%rsp
.Lgmult_epilogue:
	ret
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
@@ -174,7 +175,6 @@ gcm_ghash_4bit:
	push	%rbx
	push	%rbp
	push	%r12
	sub	\$16,%rsp
.Lghash_prologue:

	mov	8($Xi),$Zlo
@@ -186,11 +186,11 @@ gcm_ghash_4bit:
	xor	8($inp),$Zlo
	xor	($inp),$Zhi
	lea	16($inp),$inp
	mov	$Zlo,8(%rsp)
	mov	$Zhi,(%rsp)
	mov	$Zlo,8($Xi)
	mov	$Zhi,($Xi)
	shr	\$56,$Zlo
___
	&loop	("%rsp");
	&loop	($Xi);
$code.=<<___;
	cmp	$len,$inp
	jb	.Louter_loop
@@ -198,10 +198,10 @@ $code.=<<___;
	mov	$Zlo,8($Xi)
	mov	$Zhi,($Xi)

	mov	16(%rsp),%r12
	mov	24(%rsp),%rbp
	mov	32(%rsp),%rbx
	lea	40(%rsp),%rsp
	mov	0(%rsp),%r12
	mov	8(%rsp),%rbp
	mov	16(%rsp),%rbx
	lea	24(%rsp),%rsp
.Lghash_epilogue:
	ret
.size	gcm_ghash_4bit,.-gcm_ghash_4bit
@@ -259,7 +259,7 @@ se_handler:
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	lea	40(%rax),%rax		# adjust "rsp"
	lea	24(%rax),%rax		# adjust "rsp"

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp