ghash-ia64.pl: new file, GHASH for Itanium. (480cd6ab) · Commits · CYBER - Cyber Security / TS 103 523 MSP / TLMSP / TLMSP OpenSSL

crypto/modes/asm/ghash-ia64.pl

0 → 100755

+228 −0

Original line number	Diff line number	Diff line
		#!/usr/bin/env perl

		# ====================================================================
		# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
		# project. The module is, however, dual licensed under OpenSSL and
		# CRYPTOGAMS licenses depending on where you obtain it. For further
		# details see http://www.openssl.org/~appro/cryptogams/.
		# ====================================================================
		#
		# March 2010
		#
		# The module implements "4-bit" Galois field multiplication and
		# streamed GHASH function. "4-bit" means that it uses 256 bytes
		# per-key table [+128 bytes shared table]. Streamed GHASH performance
		# was measured to be 6.35 cycles per processed byte on Itanium 2,
		# which is >90% better than Microsoft compiler generated code. Well,
		# the number should have been ~6.5. The deviation has everything to do
		# with the way performance is measured, as difference between GCM and
		# straightforward 128-bit counter mode. To anchor it to something else,
		# the sha1-ia64.pl module processes one byte in 6.0 cycles. On Itanium
		# GHASH should run at ~8.5 cycles per byte.

		# Command line: the first argument, if present, names the output file
		# and STDOUT is redirected to it; the remaining arguments are scanned
		# below for the flags that influence code generation.
		$output=shift;
		if (defined($output) && length($output)) {
			# Three-argument open: the name is never parsed for a mode prefix.
			open(STDOUT,'>',$output) or die "can't open $output: $!";
		}

		# $ADDP is the mnemonic interpolated wherever a pointer argument is
		# turned into an address. On HP-UX it defaults to "addp4" and switches
		# to "add" when either +DD64 or -mlp64 is passed. The pattern is an
		# alternation; the former bracket expression [\+DD\|\-mlp]64 was a
		# character class that accepted any one of + D | - m l p before "64".
		if ($^O eq "hpux") {
			$ADDP = (grep { /(?:\+DD|-mlp)64/ } @ARGV) ? "add" : "addp4";
		} else {
			$ADDP = "add";
		}

		# An explicit -DB_ENDIAN/-DL_ENDIAN wins (the last one seen); otherwise
		# probe the byte order of the perl that is running this script.
		for my $flag (@ARGV) {
			$big_endian=1 if ($flag =~ /-DB_ENDIAN/);
			$big_endian=0 if ($flag =~ /-DL_ENDIAN/);
		}
		if (!defined($big_endian)) {
			$big_endian=(unpack('L',pack('N',1))==1);
		}

		# loop(LABEL[, MASK_INP]) -- append the shared inner loop to the global
		# $code string.
		#
		# LABEL is emitted both as the loop's entry label and as the target of
		# the closing br.ctop. When MASK_INP is true, the two instructions that
		# reference the input stream (`ld1 in[0]=[inp],-1` and
		# `xor xi[1]=xi[1],in[1]`) are predicated on p63 instead of p16/p17.
		# NOTE(review): presumably p63 is never set while the loop runs, which
		# would turn those two instructions into no-ops for a caller that has
		# no input buffer -- confirm.
		#
		# NOTE: the empty prototype `()` does not match the arguments taken
		# here; it is harmless only because the callers in this file use the
		# `&loop(...)` form, which bypasses prototype checking.
		sub loop() {
		my $label=shift;
		my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp

		# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
		# in scalable manner;-) Naturally assuming data in L1 cache...
		# Special note about 'dep' instruction, which is used to construct
		# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
		# bytes boundary and lower 7 bits of its address are guaranteed to
		# be zero.
		$code.=<<___;
		$label:
		{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
		(p19) dep rem=Zlo,rem_4bitp,3,4 }
		{ .mfi; (p19) xor Zhi=Zhi,Hhi
		($p17) xor xi[1]=xi[1],in[1] };;
		{ .mfi; (p18) ld8 Hhi=[Hi[1]]
		(p19) shrp Zlo=Zhi,Zlo,4 }
		{ .mfi; (p19) ld8 rem=[rem]
		(p18) and Hi[1]=mask0xf0,xi[2] };;
		{ .mmi; ($p16) ld1 in[0]=[inp],-1
		(p18) xor Zlo=Zlo,Hlo
		(p19) shr.u Zhi=Zhi,4 }
		{ .mib; (p19) xor Hhi=Hhi,rem
		(p18) add Hi[1]=Htbl,Hi[1] };;

		{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
		(p18) dep rem=Zlo,rem_4bitp,3,4 }
		{ .mfi; (p17) shladd Hi[0]=xi[1],4,r0
		(p18) xor Zhi=Zhi,Hhi };;
		{ .mfi; (p18) ld8 Hhi=[Hi[1]]
		(p18) shrp Zlo=Zhi,Zlo,4 }
		{ .mfi; (p18) ld8 rem=[rem]
		(p17) and Hi[0]=mask0xf0,Hi[0] };;
		{ .mmi; (p16) ld1 xi[0]=[Xi],-1
		(p18) xor Zlo=Zlo,Hlo
		(p18) shr.u Zhi=Zhi,4 }
		{ .mib; (p18) xor Hhi=Hhi,rem
		(p17) add Hi[0]=Htbl,Hi[0]
		br.ctop.sptk $label };;
		___
		}

		# Start of the generated assembly: symbolic register names used by both
		# routines, followed by the prologue of gcm_gmult_4bit. Per the address
		# computations below, in0 is the Xi pointer and in1 the Htbl pointer;
		# $ADDP (chosen from the command-line flags) forms the addresses.
		$code=<<___;
		.explicit
		.text

		prevfs=r2; prevlc=r3; prevpr=r8;
		mask0xf0=r21;
		rem=r22; rem_4bitp=r23;
		Xi=r24; Htbl=r25;
		inp=r26; end=r27;
		Hhi=r28; Hlo=r29;
		Zhi=r30; Zlo=r31;

		.global gcm_gmult_4bit#
		.proc gcm_gmult_4bit#
		.align 128
		.skip 16;; // aligns loop body
		gcm_gmult_4bit:
		.prologue
		{ .mmi; .save ar.pfs,prevfs
		alloc prevfs=ar.pfs,2,6,0,8
		$ADDP Xi=15,in0 // &Xi[15]
		mov rem_4bitp=ip }
		{ .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo
		.save ar.lc,prevlc
		mov prevlc=ar.lc
		.save pr,prevpr
		mov prevpr=pr };;

		.body
		.rotr in[3],xi[3],Hi[2]

		{ .mib; ld1 xi[2]=[Xi],-1 // Xi[15]
		mov mask0xf0=0xf0
		brp.loop.imp .Loop1,.Lend1-16};;
		{ .mmi; ld1 xi[1]=[Xi],-1 // Xi[14]
		};;
		{ .mii; shladd Hi[1]=xi[2],4,r0
		mov pr.rot=0x7<<16
		mov ar.lc=13 };;
		{ .mii; and Hi[1]=mask0xf0,Hi[1]
		mov ar.ec=3
		xor Zlo=Zlo,Zlo };;
		{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
		add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
		xor Zhi=Zhi,Zhi };;
		___
		# Inner loop for gcm_gmult_4bit; the second argument (1) selects the
		# variant whose references to inp are masked, since this routine never
		# sets up inp.
		&loop (".Loop1",1);
		# Epilogue of gcm_gmult_4bit (stores Zlo/Zhi through Hlo/Hhi, which are
		# derived from Xi, restores pr and ar.lc, returns), followed by the
		# prologue of gcm_ghash_4bit. Per the address computations below, that
		# routine takes in0=inp, in1=len, in2=Xi and in3=Htbl.
		$code.=<<___;
		.Lend1:
		{ .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact
		{ .mib; mux1 Zlo=Zlo,\@rev };;
		{ .mib; mux1 Zhi=Zhi,\@rev };;
		{ .mmi; add Hlo=9,Xi;; // ;; is here to prevent
		add Hhi=1,Xi };; // pipeline flush on Itanium
		{ .mib; st8 [Hlo]=Zlo
		mov pr=prevpr,-2 };;
		{ .mib; st8 [Hhi]=Zhi
		mov ar.lc=prevlc
		br.ret.sptk.many b0 };;
		.endp gcm_gmult_4bit#

		.global gcm_ghash_4bit#
		.proc gcm_ghash_4bit#
		.align 32;;
		gcm_ghash_4bit:
		.prologue
		{ .mmi; .save ar.pfs,prevfs
		alloc prevfs=ar.pfs,4,4,0,8
		$ADDP inp=15,in0 // &inp[15]
		mov rem_4bitp=ip }
		{ .mmi; $ADDP end=in1,in0 // &inp[len]
		$ADDP Xi=15,in2 // &Xi[15]
		.save ar.lc,prevlc
		mov prevlc=ar.lc };;
		{ .mmi; $ADDP Htbl=8,in3 // &Htbl[0].lo
		mov mask0xf0=0xf0
		.save pr,prevpr
		mov prevpr=pr }

		.body
		.rotr in[3],xi[3],Hi[2]

		{ .mmi; ld1 in[2]=[inp],-1 // inp[15]
		ld1 xi[2]=[Xi],-1 // Xi[15]
		add end=-17,end };;
		{ .mmi; ld1 in[1]=[inp],-1 // inp[14]
		ld1 xi[1]=[Xi],-1 // Xi[14]
		xor xi[2]=xi[2],in[2] };;
		{ .mii; shladd Hi[1]=xi[2],4,r0
		mov pr.rot=0x7<<16
		mov ar.lc=13 };;
		{ .mii; and Hi[1]=mask0xf0,Hi[1]
		mov ar.ec=3
		xor Zlo=Zlo,Zlo };;
		{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
		add rem_4bitp=rem_4bit#-gcm_ghash_4bit#,rem_4bitp
		xor Zhi=Zhi,Zhi };;
		___
		# Inner loop for gcm_ghash_4bit; no second argument, so the loads from
		# inp and the xor with in[1] keep their p16/p17 predicates.
		&loop (".LoopN");
		# Tail of gcm_ghash_4bit: after the inner loop it compares inp with end
		# (p6 = more input), stores Zlo/Zhi through Hlo/Hhi, and under p6
		# re-primes the loop state and branches back to .LoopN; otherwise it
		# restores pr and ar.lc and returns. The 128-byte aligned rem_4bit
		# table and an identification string follow.
		$code.=<<___;
		{ .mib; xor Zhi=Zhi,Hhi // modulo-scheduling artefact
		extr.u xi[2]=Zlo,0,8 } // Xi[15]
		{ .mib; cmp.ltu p6,p0=inp,end // are we done?
		add inp=32,inp // advance inp
		clrrrb.pr };;
		{ .mii;
		(p6) ld1 in[2]=[inp],-1 // inp[15]
		(p6) extr.u xi[1]=Zlo,8,8 // Xi[14]
		(p6) mov ar.lc=13 };;
		{ .mii;
		(p6) ld1 in[1]=[inp],-1 // inp[14]
		(p6) mov ar.ec=3
		mux1 Zlo=Zlo,\@rev };;
		{ .mii;
		(p6) xor xi[2]=xi[2],in[2]
		mux1 Zhi=Zhi,\@rev };;
		{ .mii;
		(p6) shladd Hi[1]=xi[2],4,r0
		add Hlo=9,Xi // Xi is &Xi[-1]
		add Hhi=1,Xi };;
		{ .mii;
		(p6) and Hi[1]=mask0xf0,Hi[1]
		(p6) add Xi=14,Xi // &Xi[13]
		(p6) mov pr.rot=0x7<<16 };;

		{ .mii; st8 [Hlo]=Zlo
		(p6) xor Zlo=Zlo,Zlo
		(p6) add Hi[1]=Htbl,Hi[1] };;
		{ .mib; st8 [Hhi]=Zhi
		(p6) xor Zhi=Zhi,Zhi
		(p6) br.cond.dptk.many .LoopN };;

		{ .mib; mov pr=prevpr,-2 }
		{ .mib; mov ar.lc=prevlc
		br.ret.sptk.many b0 };;
		.endp gcm_ghash_4bit#

		.align 128;;
		.type rem_4bit#,\@object
		rem_4bit:
		data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
		data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
		data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
		data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
		.size rem_4bit#,128
		stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
		___

		# On big-endian targets every `mux1 <operands>@rev` instruction in the
		# generated text is rewritten to `nop.i 0x0`, keeping the original
		# whitespace after the mnemonic so that bundle layout is unchanged.
		$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian);

		print $code;
		# Buffered write errors (full disk, closed pipe) only surface at close;
		# fail loudly rather than leave a truncated assembly file behind.
		close STDOUT or die "error closing STDOUT: $!";

crypto/modes/asm/ghash-x86.pl

+47 −20

Original line number	Diff line number	Diff line
		@@ -7,9 +7,11 @@
		# details see http://www.openssl.org/~appro/cryptogams/.
		# ====================================================================
		#
		# March 2010
		#
		# The module implements "4-bit" Galois field multiplication and
		# streamed GHASH function. "4-bit" means that it uses 256 bytes
		# per-key table [+128/256 bytes fixed table]. It has two code paths:
		# per-key table [+64/128 bytes fixed table]. It has two code paths:
		# vanilla x86 and vanilla MMX. Former will be executed on 486 and
		# Pentium, latter on all others. Performance results are for streamed
		# GHASH subroutine and are expressed in cycles per processed byte,
		@@ -18,13 +20,13 @@
		# gcc 2.95.3(*) MMX assembler x86 assembler
		#
		# Pentium 100/112(**) - 50
		# PIII 63 /77 17 24
		# P4 96 /122 33 84(***)
		# Opteron 50 /71 22 30
		# Core2 63 /102 21 28
		# PIII 63 /77 16 24
		# P4 96 /122 30 84(***)
		# Opteron 50 /71 21 30
		# Core2 63 /102 19 28
		#
		# (*) gcc 3.4.x was observed to generate few percent slower code,
		# which is one of reasons why 2.95.3 result were chosen;
		# which is one of reasons why 2.95.3 results were chosen,
		# another reason is lack of 3.4.x results for older CPUs;
		# (**) second number is result for code compiled with -fPIC flag,
		# which is actually more relevant, because assembler code is
		@@ -32,8 +34,8 @@
		# (***) see comment in non-MMX routine for further details;
		#
		# To summarize, it's 2-3 times faster than gcc-generated code. To
		# anchor it to something else SHA1 assembler processes single byte
		# in 11-13 cycles.
		# anchor it to something else SHA1 assembler processes one byte in
		# 11-13 cycles on contemporary x86 cores.

		$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
		push(@INC,"${dir}","${dir}../../perlasm");
		@@ -52,13 +54,13 @@ $Htbl = "esi";

		$unroll = 0; # Affects x86 loop. Folded loop performs ~7% worse
		# than unrolled, which has to be weighted against
		# almost 2x code size reduction. Well, overall
		# code size. x86-specific code shrinks by 7.5x...
		# 1.7x code size reduction. Well, overall 1.7x,
		# x86-specific code itself shrinks by 2.5x...

		sub mmx_loop() {
		# MMX version performs 2.5 times better on P4 (see comment in non-MMX
		# routine for further details), 35% better on Opteron and Core2, 40%
		# better on PIII... In other words effort is considered to be well
		# MMX version performs 2.8 times better on P4 (see comment in non-MMX
		# routine for further details), 40% better on Opteron, 50% better
		# on PIII and Core2... In other words effort is considered to be well
		# spent...
		my $inp = shift;
		my $rem_4bit = shift;
		@@ -74,7 +76,7 @@ sub mmx_loop() {
		&xor ($nlo,$nlo); # avoid partial register stalls on PIII
		&mov ($nhi,$Zll);
		&mov (&LB($nlo),&LB($nhi));
		&mov ($cnt,15);
		&mov ($cnt,14);
		&shl (&LB($nlo),4);
		&and ($nhi,0xf0);
		&movq ($Zlo,&QWP(8,$Htbl,$nlo));
		@@ -85,34 +87,59 @@ sub mmx_loop() {
		&set_label("mmx_loop",16);
		&psrlq ($Zlo,4);
		&and ($rem,0xf);
		&pxor ($Zlo,&QWP(8,$Htbl,$nhi));
		&movq ($tmp,$Zhi);
		&psrlq ($Zhi,4);
		&mov (&LB($nlo),&BP(0,$inp,$cnt));
		&dec ($cnt);
		&pxor ($Zlo,&QWP(8,$Htbl,$nhi));
		&psllq ($tmp,60);
		&pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
		&movd ($rem,$Zlo);
		&pxor ($Zhi,&QWP(0,$Htbl,$nhi));
		&mov ($nhi,$nlo);
		&pxor ($Zlo,$tmp);
		&js (&label("mmx_break"));

		&movz ($nhi,&BP(0,$inp,$cnt));
		&shl (&LB($nlo),4);
		&and ($rem,0xf);
		&psrlq ($Zlo,4);
		&mov (&LB($nlo),&LB($nhi));
		&and ($nhi,0xf0);
		&movq ($tmp,$Zhi);
		&shl (&LB($nlo),4);
		&psrlq ($Zhi,4);
		&and ($rem,0xf);
		&pxor ($Zlo,&QWP(8,$Htbl,$nlo));
		&psllq ($tmp,60);
		&pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
		&movd ($rem,$Zlo);
		&pxor ($Zhi,&QWP(0,$Htbl,$nlo));
		&pxor ($Zlo,$tmp);
		&and ($nhi,0xf0);
		&jmp (&label("mmx_loop"));

		&set_label("mmx_break",16);
		&shl (&LB($nlo),4);
		&and ($rem,0xf);
		&psrlq ($Zlo,4);
		&and ($nhi,0xf0);
		&movq ($tmp,$Zhi);
		&psrlq ($Zhi,4);
		&pxor ($Zlo,&QWP(8,$Htbl,$nlo));
		&psllq ($tmp,60);
		&pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
		&movd ($rem,$Zlo);
		&pxor ($Zhi,&QWP(0,$Htbl,$nlo));
		&pxor ($Zlo,$tmp);

		&psrlq ($Zlo,4);
		&and ($rem,0xf);
		&pxor ($Zlo,&QWP(8,$Htbl,$nhi));
		&movq ($tmp,$Zhi);
		&psrlq ($Zhi,4);
		&psllq ($tmp,60);
		&pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
		&movd ($rem,$Zlo);
		&pxor ($Zhi,&QWP(0,$Htbl,$nhi));
		&mov ($nhi,$nlo);
		&pxor ($Zlo,$tmp);

		&psrlq ($Zlo,32); # lower part of Zlo is already there
		&movd ($Zhl,$Zhi);
		&psrlq ($Zhi,32);

crypto/modes/asm/ghash-x86_64.pl

+15 −15

Original line number	Diff line number	Diff line
		@@ -7,9 +7,11 @@
		# details see http://www.openssl.org/~appro/cryptogams/.
		# ====================================================================
		#
		# March 2010
		#
		# The module implements "4-bit" Galois field multiplication and
		# streamed GHASH function. "4-bit" means that it uses 256 bytes
		# per-key table [+128 bytes fixed table]. Performance results are for
		# per-key table [+128 bytes shared table]. Performance results are for
		# streamed GHASH subroutine and are expressed in cycles per processed
		# byte, less is better:
		#
		@@ -136,9 +138,8 @@ $code=<<___;
		.align 16
		gcm_gmult_4bit:
		push %rbx
		push %rbp
		push %r12
		sub \$16,%rsp
		push %rbp # %rbp and %r12 are pushed exclusively in
		push %r12 # order to reuse Win64 exception handler...
		.Lgmult_prologue:

		movzb 15($Xi),$Zlo
		@@ -149,8 +150,8 @@ $code.=<<___;
		mov $Zlo,8($Xi)
		mov $Zhi,($Xi)

		mov 32(%rsp),%rbx
		lea 40(%rsp),%rsp
		mov 16(%rsp),%rbx
		lea 24(%rsp),%rsp
		.Lgmult_epilogue:
		ret
		.size gcm_gmult_4bit,.-gcm_gmult_4bit
		@@ -174,7 +175,6 @@ gcm_ghash_4bit:
		push %rbx
		push %rbp
		push %r12
		sub \$16,%rsp
		.Lghash_prologue:

		mov 8($Xi),$Zlo
		@@ -186,11 +186,11 @@ gcm_ghash_4bit:
		xor 8($inp),$Zlo
		xor ($inp),$Zhi
		lea 16($inp),$inp
		mov $Zlo,8(%rsp)
		mov $Zhi,(%rsp)
		mov $Zlo,8($Xi)
		mov $Zhi,($Xi)
		shr \$56,$Zlo
		___
		&loop ("%rsp");
		&loop ($Xi);
		$code.=<<___;
		cmp $len,$inp
		jb .Louter_loop
		@@ -198,10 +198,10 @@ $code.=<<___;
		mov $Zlo,8($Xi)
		mov $Zhi,($Xi)

		mov 16(%rsp),%r12
		mov 24(%rsp),%rbp
		mov 32(%rsp),%rbx
		lea 40(%rsp),%rsp
		mov 0(%rsp),%r12
		mov 8(%rsp),%rbp
		mov 16(%rsp),%rbx
		lea 24(%rsp),%rsp
		.Lghash_epilogue:
		ret
		.size gcm_ghash_4bit,.-gcm_ghash_4bit
		@@ -259,7 +259,7 @@ se_handler:
		cmp %r10,%rbx # context->Rip>=epilogue label
		jae .Lin_prologue

		lea 40(%rax),%rax # adjust "rsp"
		lea 24(%rax),%rax # adjust "rsp"

		mov -8(%rax),%rbx
		mov -16(%rax),%rbp