GCM "jumbo" update: (c1f092d1) · Commits · CYBER - Cyber Security / TS 103 523 MSP / TLMSP / TLMSP OpenSSL

crypto/modes/asm/ghash-x86.pl

+724 −203

File changed.

Preview size limit exceeded, changes collapsed.

crypto/modes/asm/ghash-x86_64.pl

+337 −5

Original line number	Diff line number	Diff line
		@@ -20,6 +20,12 @@
		# Opteron 18.5 10.2 +80%
		# Core2 17.5 11.0 +59%

		# May 2010
		#
		# Add PCLMULQDQ version performing at 2.07 cycles per processed byte.
		# See ghash-x86.pl for background information and details about coding
		# techniques.

		$flavour = shift;
		$output = shift;
		if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
		@@ -51,7 +57,7 @@ $rem="%rdx";
		# Rewrite a general-purpose register name into its byte-sized alias:
		# %rax/%eax -> %al, %rsi/%esi -> %sil, %r10/%r10d -> %r10b.
		# The three substitutions are tried in order on the same string.
		sub lo() {
			my $reg = shift;
			for ($reg) {	# alias $_ to $reg so the s/// below edit it in place
				s/%[er]([a-d])x/%\1l/;
				s/%[er]([sd]i)/%\1l/;
				s/%(r[0-9]+)[d]?/%\1b/;
			}
			return $reg;
		}


		{ my $N;
		sub loop() {
		my $inp = shift;
		@@ -156,8 +162,7 @@ $code.=<<___;
		ret
		.size gcm_gmult_4bit,.-gcm_gmult_4bit
		___



		# per-function register layout
		$inp="%rdx";
		$len="%rcx";
		@@ -203,9 +208,295 @@ $code.=<<___;
		.Lghash_epilogue:
		ret
		.size gcm_ghash_4bit,.-gcm_ghash_4bit
		___

		######################################################################
		# PCLMULQDQ version.

		# Registers carrying the first four function arguments, selected by
		# $win64; the per-function blocks below unpack this list into named
		# argument variables.
		@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
		("%rdi","%rsi","%rdx","%rcx"); # Unix order

		# Fixed XMM register assignment shared by the PCLMULQDQ code below:
		# $Xi/$Xhi hold the value being multiplied and the upper half of the
		# product, $Hkey the key operand, $T1..$T3 are temporaries.
		($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
		($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");

		# clmul64x64_T2($Xhi,$Xi,$Hkey[,$modulo])
		#
		# Appends to $code a carry-less multiplication of $Xi by $Hkey built
		# from three pclmulqdq instructions: low halves (imm 0x00), high halves
		# (imm 0x11) and the XOR-folded halves kept in the global temporaries
		# $T1/$T2. The middle term is then split with byte shifts and XORed
		# into $Xhi (upper part) and $Xi (lower part).
		#
		# $Xhi, $Xi, $Hkey are register names interpolated into the template.
		# $modulo, when defined, suppresses the operand set-up sequence; the
		# caller is then expected to have emitted the equivalent instructions
		# itself before the call.
		# Clobbers $T1 and $T2.
		sub clmul64x64_T2 { # minimal register pressure
		my ($Xhi,$Xi,$Hkey,$modulo)=@_;

		# Set-up: copy $Xi to $Xhi and form (hi^lo) of both operands in
		# $T1/$T2. Skipped when $modulo is passed.
		$code.=<<___ if (!defined($modulo));
		movdqa $Xi,$Xhi #
		pshufd \$0b01001110,$Xi,$T1
		pshufd \$0b01001110,$Hkey,$T2
		pxor $Xi,$T1 #
		pxor $Hkey,$T2
		___
		# The three multiplications and recombination; always emitted.
		$code.=<<___;
		pclmulqdq \$0x00,$Hkey,$Xi #######
		pclmulqdq \$0x11,$Hkey,$Xhi #######
		pclmulqdq \$0x00,$T2,$T1 #######
		pxor $Xi,$T1 #
		pxor $Xhi,$T1 #

		movdqa $T1,$T2 #
		psrldq \$8,$T1
		pslldq \$8,$T2 #
		pxor $T1,$Xhi
		pxor $T2,$Xi #
		___
		}

		# reduction_alg9($Xhi,$Xi)
		#
		# Appends to $code a two-phase shift-and-XOR sequence that folds the
		# pair $Xhi:$Xi down into $Xi, using the global temporaries $T1 and $T2
		# as scratch. Presumably this is the reduction of the double-width
		# product modulo the GHASH polynomial -- the formulas elsewhere in this
		# file write it as "mod P".
		#
		# $Xhi and $Xi are register names; both are modified by the emitted
		# code, and the result is left in $Xi.
		sub reduction_alg9 { # 17/13 times faster than Intel version
		my ($Xhi,$Xi) = @_;

		$code.=<<___;
		# 1st phase
		movdqa $Xi,$T1 #
		psllq \$1,$Xi
		pxor $T1,$Xi #
		psllq \$5,$Xi #
		pxor $T1,$Xi #
		psllq \$57,$Xi #
		movdqa $Xi,$T2 #
		pslldq \$8,$Xi
		psrldq \$8,$T2 #
		pxor $T1,$Xi
		pxor $T2,$Xhi #

		# 2nd phase
		movdqa $Xi,$T2
		psrlq \$5,$Xi
		pxor $T2,$Xi #
		psrlq \$1,$Xi #
		pxor $T2,$Xi #
		pxor $Xhi,$T2
		psrlq \$1,$Xi #
		pxor $T2,$Xi #
		___
		}

		# Emit gcm_init_clmul. The first argument register is used as the
		# output table pointer ($Htbl) and the second as the input pointer
		# ($Xip). The generated code loads 16 bytes from ($Xip), dword-swaps
		# them, shifts the value left by one bit with a conditional XOR of
		# .L0x1c2_polynomial, squares it, and stores the shifted value at
		# ($Htbl) and the square at 16($Htbl).
		{ my ($Htbl,$Xip)=@_4args;

		$code.=<<___;
		.globl gcm_init_clmul
		.type gcm_init_clmul,\@abi-omnipotent
		.align 16
		gcm_init_clmul:
		movdqu ($Xip),$Hkey
		pshufd \$0b01001110,$Hkey,$Hkey # dword swap

		# <<1 twist
		pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
		movdqa $Hkey,$T1
		psllq \$1,$Hkey
		pxor $T3,$T3 #
		psrlq \$63,$T1
		pcmpgtd $T2,$T3 # broadcast carry bit
		pslldq \$8,$T1
		por $T1,$Hkey # H<<=1

		# magic reduction
		pand .L0x1c2_polynomial(%rip),$T3
		pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial

		# calculate H^2
		movdqa $Hkey,$Xi
		___
		# H^2 = H*H: multiply the copy in $Xi by $Hkey, then reduce.
		&clmul64x64_T2 ($Xhi,$Xi,$Hkey);
		&reduction_alg9 ($Xhi,$Xi);
		$code.=<<___;
		movdqu $Hkey,($Htbl) # save H
		movdqu $Xi,16($Htbl) # save H^2
		ret
		.size gcm_init_clmul,.-gcm_init_clmul
		___
		}

		# Emit gcm_gmult_clmul. The first argument register points at the
		# 16-byte value Xi (updated in place), the second at the table written
		# by gcm_init_clmul. The generated code byte-reverses Xi with
		# .Lbswap_mask, multiplies it by the entry at ($Htbl), reduces,
		# byte-reverses again and stores the result back to ($Xip).
		{ my ($Xip,$Htbl)=@_4args;

		$code.=<<___;
		.globl gcm_gmult_clmul
		.type gcm_gmult_clmul,\@abi-omnipotent
		.align 16
		gcm_gmult_clmul:
		movdqu ($Xip),$Xi
		movdqa .Lbswap_mask(%rip),$T3
		movdqu ($Htbl),$Hkey
		pshufb $T3,$Xi
		___
		# Xi = Xi*H, then reduce; $T3 (the swap mask) is left untouched by
		# both helpers, so it can be reused for the final pshufb.
		&clmul64x64_T2 ($Xhi,$Xi,$Hkey);
		&reduction_alg9 ($Xhi,$Xi);
		$code.=<<___;
		pshufb $T3,$Xi
		movdqu $Xi,($Xip)
		ret
		.size gcm_gmult_clmul,.-gcm_gmult_clmul
		___
		}

		# Emit gcm_ghash_clmul. Argument registers, in order: pointer to Xi
		# (updated in place), pointer to the table written by gcm_init_clmul,
		# input pointer, and input length in bytes.
		#
		# Shape of the generated code:
		#   - a length of exactly 0x10 jumps straight to .Lodd_tail;
		#   - otherwise input is consumed 32 bytes at a time (.Lmod_loop),
		#     using the table entry at ($Htbl) for the second block of each
		#     pair and the entry at 16($Htbl) for the first, with the
		#     reduction of the previous pair interleaved by hand between the
		#     multiplications of the next one;
		#   - .Leven_tail finishes the pending pair, .Lodd_tail handles one
		#     remaining 16-byte block.
		# NOTE(review): the length is presumably required to be a non-zero
		# multiple of 16 -- nothing here checks it; confirm against callers.
		{ my ($Xip,$Htbl,$inp,$len)=@_4args;
		# Second set of working registers so that two products can be in
		# flight at once ("n" = next block).
		my $Xn="%xmm6";
		my $Xhn="%xmm7";
		my $Hkey2="%xmm8";
		my $T1n="%xmm9";
		my $T2n="%xmm10";

		$code.=<<___;
		.globl gcm_ghash_clmul
		.type gcm_ghash_clmul,\@abi-omnipotent
		.align 16
		gcm_ghash_clmul:
		___
		# Win64 only: reserve 0x58 bytes of stack and spill %xmm6-%xmm10 (the
		# extra registers named above). The prologue is emitted as raw bytes
		# so that its encoding is fixed.
		# NOTE(review): the instruction lengths presumably have to agree with
		# the unwind codes in .LSEH_info_gcm_ghash_clmul -- confirm before
		# touching either. The second spill's trailing comment says movdqa,
		# but its opcode bytes 0x0f,0x29 are the same as on the movaps lines.
		$code.=<<___ if ($win64);
		.LSEH_begin_gcm_ghash_clmul:
		# I can't trust assembler to use specific encoding:-(
		.byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp
		.byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
		.byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp)
		.byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp)
		.byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp)
		.byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp)
		___
		$code.=<<___;
		movdqa .Lbswap_mask(%rip),$T3

		movdqu ($Xip),$Xi
		movdqu ($Htbl),$Hkey
		pshufb $T3,$Xi

		sub \$0x10,$len
		jz .Lodd_tail

		movdqu 16($Htbl),$Hkey2
		#######
		# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
		# [(HIi+1) + (HXi+1)] mod P =
		# [(HIi+1) + H^2(Ii+Xi)] mod P
		#
		movdqu ($inp),$T1 # Ii
		movdqu 16($inp),$Xn # Ii+1
		pshufb $T3,$T1
		pshufb $T3,$Xn
		pxor $T1,$Xi # Ii+Xi
		___
		&clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
		# The heredoc below opens with the same operand set-up that
		# clmul64x64_T2 would emit, written out here for ($Xi,$Hkey2) so the
		# later calls can pass $modulo and skip it.
		$code.=<<___;
		movdqa $Xi,$Xhi #
		pshufd \$0b01001110,$Xi,$T1
		pshufd \$0b01001110,$Hkey2,$T2
		pxor $Xi,$T1 #
		pxor $Hkey2,$T2

		lea 32($inp),$inp # i+=2
		sub \$0x20,$len
		jbe .Leven_tail

		.Lmod_loop:
		___
		&clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
		# Loop body: the two reduction phases (same instructions as
		# reduction_alg9) are interleaved with the three pclmulqdq of the next
		# H*Ii+1 product, followed by the operand set-up for the next
		# H^2 multiplication.
		$code.=<<___;
		movdqu ($inp),$T1 # Ii
		pxor $Xn,$Xi # (HIi+1) + H^2(Ii+Xi)
		pxor $Xhn,$Xhi

		movdqu 16($inp),$Xn # Ii+1
		pshufb $T3,$T1
		pshufb $T3,$Xn

		movdqa $Xn,$Xhn #
		pshufd \$0b01001110,$Xn,$T1n
		pshufd \$0b01001110,$Hkey,$T2n
		pxor $Xn,$T1n #
		pxor $Hkey,$T2n
		pxor $T1,$Xhi # "Ii+Xi", consume early

		movdqa $Xi,$T1 # 1st phase
		psllq \$1,$Xi
		pxor $T1,$Xi #
		psllq \$5,$Xi #
		pxor $T1,$Xi #
		pclmulqdq \$0x00,$Hkey,$Xn #######
		psllq \$57,$Xi #
		movdqa $Xi,$T2 #
		pslldq \$8,$Xi
		psrldq \$8,$T2 #
		pxor $T1,$Xi
		pxor $T2,$Xhi #

		pclmulqdq \$0x11,$Hkey,$Xhn #######
		movdqa $Xi,$T2 # 2nd phase
		psrlq \$5,$Xi
		pxor $T2,$Xi #
		psrlq \$1,$Xi #
		pxor $T2,$Xi #
		pxor $Xhi,$T2
		psrlq \$1,$Xi #
		pxor $T2,$Xi #

		pclmulqdq \$0x00,$T2n,$T1n #######
		movdqa $Xi,$Xhi #
		pshufd \$0b01001110,$Xi,$T1
		pshufd \$0b01001110,$Hkey2,$T2
		pxor $Xi,$T1 #
		pxor $Hkey2,$T2

		pxor $Xn,$T1n #
		pxor $Xhn,$T1n #
		movdqa $T1n,$T2n #
		psrldq \$8,$T1n
		pslldq \$8,$T2n #
		pxor $T1n,$Xhn
		pxor $T2n,$Xn #

		lea 32($inp),$inp
		sub \$0x20,$len
		ja .Lmod_loop

		.Leven_tail:
		___
		&clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
		$code.=<<___;
		pxor $Xn,$Xi # (HIi+1) + H^2(Ii+Xi)
		pxor $Xhn,$Xhi
		___
		&reduction_alg9 ($Xhi,$Xi);
		# $len is zero here only when one more 16-byte block remains; any
		# other value (the subtraction above wrapped) means all input has
		# been consumed.
		$code.=<<___;
		test $len,$len
		jnz .Ldone

		.Lodd_tail:
		movdqu ($inp),$T1 # Ii
		pshufb $T3,$T1
		pxor $T1,$Xi # Ii+Xi
		___
		&clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
		&reduction_alg9 ($Xhi,$Xi);
		$code.=<<___;
		.Ldone:
		pshufb $T3,$Xi
		movdqu $Xi,($Xip)
		___
		# Win64 only: restore the spilled registers and release the frame.
		$code.=<<___ if ($win64);
		movaps (%rsp),%xmm6
		movaps 0x10(%rsp),%xmm7
		movaps 0x20(%rsp),%xmm8
		movaps 0x30(%rsp),%xmm9
		movaps 0x40(%rsp),%xmm10
		add \$0x58,%rsp
		___
		$code.=<<___;
		ret
		.LSEH_end_gcm_ghash_clmul:
		.size gcm_ghash_clmul,.-gcm_ghash_clmul
		___
		}

		$code.=<<___;
		.align 64
		.type rem_4bit,\@object
		.Lbswap_mask:
		.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
		.L0x1c2_polynomial:
		.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
		.align 64
		.type .Lrem_4bit,\@object
		.Lrem_4bit:
		.long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
		.long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
		@@ -214,7 +505,7 @@ $code.=<<___;
		.asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
		.align 64
		___


		# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
		# CONTEXT context,DISPATCHER_CONTEXT disp)
		if ($win64) {
		@@ -316,6 +607,10 @@ se_handler:
		.rva .LSEH_end_gcm_ghash_4bit
		.rva .LSEH_info_gcm_ghash_4bit

		.rva .LSEH_begin_gcm_ghash_clmul
		.rva .LSEH_end_gcm_ghash_clmul
		.rva .LSEH_info_gcm_ghash_clmul

		.section .xdata
		.align 8
		.LSEH_info_gcm_gmult_4bit:
		@@ -326,9 +621,46 @@ se_handler:
		.byte 9,0,0,0
		.rva se_handler
		.rva .Lghash_prologue,.Lghash_epilogue # HandlerData
		.LSEH_info_gcm_ghash_clmul:
		.byte 0x01,0x1f,0x0b,0x00
		.byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
		.byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
		.byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
		.byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
		.byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
		.byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58
		___
		}

		# rex($opcode,$dst,$src)
		#
		# Appends a REX prefix byte to the opcode list referenced by $opcode
		# when either register number is 8 or above; appends nothing otherwise.
		#   $opcode - reference to the array of opcode bytes being assembled
		#   $dst    - number of the register encoded in ModR/M.reg
		#   $src    - number of the register encoded in ModR/M.rm
		sub rex {
			my ($opcode,$dst,$src)=@_;
			my $rex=0;

			$rex|=0x04 if($dst>=8);	# REX.R extends ModR/M.reg
			$rex|=0x01 if($src>=8);	# REX.B extends ModR/M.rm
			push @$opcode,$rex|0x40 if($rex);
		}

		# pclmulqdq($arg)
		#
		# Hand-encodes "pclmulqdq $imm,%xmmS,%xmmD" so that the output can be
		# built by assemblers that do not know the instruction.
		#   $arg - operand string as written after the mnemonic
		# Returns a ".byte" directive holding the encoded instruction when
		# both operands are XMM registers, otherwise the instruction text
		# unchanged (mnemonic, a tab, and $arg).
		sub pclmulqdq {
			my $arg=shift;
			my @opcode=(0x66);

			# Whitespace after the commas is optional: the templates in this
			# file write the operands without any.
			if ($arg=~/\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
				my ($imm,$src,$dst)=($1,$2,$3);

				rex(\@opcode,$dst,$src);
				push @opcode,0x0f,0x3a,0x44;
				push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
				# oct() understands both 0x.. and leading-zero octal immediates.
				push @opcode,$imm=~/^0/?oct($imm):$imm;
				return ".byte\t".join(',',@opcode);
			}
			return "pclmulqdq\t".$arg;
		}

		# Post-process the accumulated assembly text and write it out.

		# Replace every `...` span with the value of the Perl expression inside
		# it (used for constants such as `0x1C20<<16` in the tables).
		$code =~ s/\`([^\`]*)\`/eval($1)/gem;

		# Hand every "pclmulqdq $imm,...,%xmmN" line to pclmulqdq() for
		# encoding. The capture runs greedily up to the last %xmm register on
		# the line; the trailing ".*" drops whatever follows it (the "#######"
		# markers). Both quantifiers are required: without them the pattern
		# cannot match any instruction this file emits.
		$code =~ s/\bpclmulqdq\s+(\$.*%xmm[0-9]+).*$/pclmulqdq($1)/gem;

		print $code;

crypto/modes/gcm128.c

+88 −57

Original line number	Diff line number	Diff line
		@@ -68,6 +68,19 @@ typedef struct { u64 hi,lo; } u128;
		#endif

		#define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
		/*
		 * REDUCE1BIT(V): shift the 128-bit value V (fields .hi and .lo) right
		 * by one bit and, when the bit shifted out of V.lo was set, XOR the
		 * constant 0xe1 into the top byte of V.hi. The two branches compute the
		 * same result; the second builds the mask in 32 bits when size_t is not
		 * 64 bits wide. V is evaluated several times, so it must be an lvalue
		 * without side effects.
		 */
		#define REDUCE1BIT(V) do { \
		if (sizeof(size_t)==8) { \
		u64 T = U64(0xe100000000000000) & (0-((V).lo&1)); \
		(V).lo = ((V).hi<<63)|((V).lo>>1); \
		(V).hi = ((V).hi>>1 )^T; \
		} \
		else { \
		u32 T = 0xe1000000U & (0-(u32)((V).lo&1)); \
		(V).lo = ((V).hi<<63)|((V).lo>>1); \
		(V).hi = ((V).hi>>1 )^((u64)T<<32); \
		} \
		} while(0)

		#ifdef TABLE_BITS
		#undef TABLE_BITS
		#endif
		@@ -75,14 +88,13 @@ typedef struct { u64 hi,lo; } u128;
		* Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
		* never be set to 8. 8 is effectively reserved for testing purposes.
		* Under ideal conditions "8-bit" version should be twice as fast as
		* "4-bit" one. But world is far from ideal. For gcc-generated x86 code,
		* "8-bit" was observed to run only ~50% faster. On x86_64 observed
		* improvement was ~75%, much closer to optimal, but the fact of
		* deviation means that references to pre-computed tables end up on
		* critical path and as tables are pretty big, 4KB per key+1KB shared,
		* execution time is sensitive to cache timing. It's not actually
		* proven, but 4-bit procedure is believed to provide adequate
		* all-round performance...
		* "4-bit" one. For gcc-generated x86[_64] code, "8-bit" was observed to
		* run ~75% faster, closer to 100% for commercial compilers... But the
		* catch is that "8-bit" procedure consumes 16 times more memory, 4KB
		* per individual key + 1KB shared, and as access to these tables ends up
		* on critical path, real-life execution time would be sensitive to
		* cache timing. It's not actually proven, but "4-bit" procedure is
		* believed to provide adequate all-round performance...
		*/
		#define TABLE_BITS 4

		@@ -99,16 +111,7 @@ static void gcm_init_8bit(u128 Htable[256], u64 H[2])
		V.lo = H[1];

		for (Htable[128]=V, i=64; i>0; i>>=1) {
		if (sizeof(size_t)==8) {
		u64 T = U64(0xe100000000000000) & (0-(V.lo&1));
		V.lo = (V.hi<<63)\|(V.lo>>1);
		V.hi = (V.hi>>1 )^T;
		}
		else {
		u32 T = 0xe1000000U & (0-(u32)(V.lo&1));
		V.lo = (V.hi<<63)\|(V.lo>>1);
		V.hi = (V.hi>>1 )^((u64)T<<32);
		}
		REDUCE1BIT(V);
		Htable[i] = V;
		}

		@@ -238,18 +241,6 @@ static void gcm_init_4bit(u128 Htable[16], u64 H[2])
		#if defined(OPENSSL_SMALL_FOOTPRINT)
		int i;
		#endif
		#define REDUCE(V) do { \
		if (sizeof(size_t)==8) { \
		u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
		V.lo = (V.hi<<63)\|(V.lo>>1); \
		V.hi = (V.hi>>1 )^T; \
		} \
		else { \
		u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
		V.lo = (V.hi<<63)\|(V.lo>>1); \
		V.hi = (V.hi>>1 )^((u64)T<<32); \
		} \
		} while(0)

		Htable[0].hi = 0;
		Htable[0].lo = 0;
		@@ -258,7 +249,7 @@ static void gcm_init_4bit(u128 Htable[16], u64 H[2])

		#if defined(OPENSSL_SMALL_FOOTPRINT)
		for (Htable[8]=V, i=4; i>0; i>>=1) {
		REDUCE(V);
		REDUCE1BIT(V);
		Htable[i] = V;
		}

		@@ -272,11 +263,11 @@ static void gcm_init_4bit(u128 Htable[16], u64 H[2])
		}
		#else
		Htable[8] = V;
		REDUCE(V);
		REDUCE1BIT(V);
		Htable[4] = V;
		REDUCE(V);
		REDUCE1BIT(V);
		Htable[2] = V;
		REDUCE(V);
		REDUCE1BIT(V);
		Htable[1] = V;
		Htable[3].hi = V.hi^Htable[2].hi, Htable[3].lo = V.lo^Htable[2].lo;
		V=Htable[4];
		@@ -314,7 +305,6 @@ static void gcm_init_4bit(u128 Htable[16], u64 H[2])
		}
		}
		#endif
		#undef REDUCE
		}

		#ifndef GHASH_ASM
		@@ -471,7 +461,7 @@ void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

		#define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
		#if defined(GHASH_ASM) \|\| !defined(OPENSSL_SMALL_FOOTPRINT)
		#define GHASH(in,len,ctx) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
		#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
		/* GHASH_CHUNK is "stride parameter" meant to mitigate cache
		* thrashing effect. In other words idea is to hash data while it's
		* still in L1 cache after encryption pass... */
		@@ -514,17 +504,7 @@ static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
		Z.hi ^= V.hi&M;
		Z.lo ^= V.lo&M;

		if (sizeof(size_t)==8) {
		u64 T = U64(0xe100000000000000) & (0-(V.lo&1));
		V.lo = (V.hi<<63)\|(V.lo>>1);
		V.hi = (V.hi>>1 )^T;
		}
		else {
		u32 T = 0xe1000000U & (0-(u32)(V.lo&1));
		V.lo = (V.hi<<63)\|(V.lo>>1);
		V.hi = (V.hi>>1 )^((u64)T<<32);
		}

		REDUCE1BIT(V);
		}
		}

		@@ -559,12 +539,40 @@ struct gcm128_context {
		u128 Htable[256];
		#else
		u128 Htable[16];
		void (*gmult)(u64 Xi[2],const u128 Htable[16]);
		void (ghash)(u64 Xi[2],const u128 Htable[16],const u8 inp,size_t len);
		#endif
		unsigned int res, pad;
		block128_f block;
		void *key;
		};

		#if TABLE_BITS==4 && defined(GHASH_ASM) && !defined(I386_ONLY) && \
		(defined(__i386) \|\| defined(__i386__) \|\| \
		defined(__x86_64) \|\| defined(__x86_64__) \|\| \
		defined(_M_IX86) \|\| defined(_M_AMD64) \|\| defined(_M_X64))
		# define GHASH_ASM_IAX
		extern unsigned int OPENSSL_ia32cap_P[2];

		void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
		void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
		void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

		# if defined(__i386) \|\| defined(__i386__) \|\| defined(_M_IX86)
		# define GHASH_ASM_X86
		void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
		void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

		void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
		void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
		# endif

		# undef GCM_MUL
		# define GCM_MUL(ctx,Xi) (*((ctx)->gmult))(ctx->Xi.u,ctx->Htable)
		# undef GHASH
		# define GHASH(ctx,in,len) (*((ctx)->ghash))((ctx)->Xi.u,(ctx)->Htable,in,len)
		#endif

		void CRYPTO_gcm128_init(GCM128_CONTEXT ctx,void key,block128_f block)
		{
		const union { long one; char little; } is_endian = {1};
		@@ -593,7 +601,29 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT ctx,void key,block128_f block)
		#if TABLE_BITS==8
		gcm_init_8bit(ctx->Htable,ctx->H.u);
		#elif TABLE_BITS==4
		# if defined(GHASH_ASM_IAX)
		if (OPENSSL_ia32cap_P[1]&(1<<1)) {
		gcm_init_clmul(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_clmul;
		ctx->ghash = gcm_ghash_clmul;
		return;
		}
		gcm_init_4bit(ctx->Htable,ctx->H.u);
		# if defined(GHASH_ASM_X86)
		if (OPENSSL_ia32cap_P[0]&(1<<23)) {
		ctx->gmult = gcm_gmult_4bit_mmx;
		ctx->ghash = gcm_ghash_4bit_mmx;
		} else {
		ctx->gmult = gcm_gmult_4bit_x86;
		ctx->ghash = gcm_ghash_4bit_x86;
		}
		# else
		ctx->gmult = gcm_gmult_4bit;
		ctx->ghash = gcm_ghash_4bit;
		# endif
		# else
		gcm_init_4bit(ctx->Htable,ctx->H.u);
		# endif
		#endif
		}

		@@ -671,7 +701,7 @@ void CRYPTO_gcm128_aad(GCM128_CONTEXT ctx,const unsigned char aad,size_t len)

		#ifdef GHASH
		if ((i = (len&(size_t)-16))) {
		GHASH(aad,i,ctx);
		GHASH(ctx,aad,i);
		aad += i;
		len -= i;
		}
		@@ -740,7 +770,7 @@ void CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
		in += 16;
		j -= 16;
		}
		GHASH(out-GHASH_CHUNK,GHASH_CHUNK,ctx);
		GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
		len -= GHASH_CHUNK;
		}
		if ((i = (len&(size_t)-16))) {
		@@ -760,7 +790,7 @@ void CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
		in += 16;
		len -= 16;
		}
		GHASH(out-j,j,ctx);
		GHASH(ctx,out-j,j);
		}
		#else
		while (len>=16) {
		@@ -854,7 +884,7 @@ void CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
		while (len>=GHASH_CHUNK) {
		size_t j=GHASH_CHUNK;

		GHASH(in,GHASH_CHUNK,ctx);
		GHASH(ctx,in,GHASH_CHUNK);
		while (j) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
		++ctr;
		@@ -872,7 +902,7 @@ void CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
		len -= GHASH_CHUNK;
		}
		if ((i = (len&(size_t)-16))) {
		GHASH(in,i,ctx);
		GHASH(ctx,in,i);
		while (len>=16) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
		++ctr;
		@@ -1243,6 +1273,7 @@ int main()
		{
		size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
		union { u64 u; u8 c[1024]; } buf;
		int i;

		AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
		CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
		@@ -1267,11 +1298,11 @@ int main()
		ctr_t/(double)sizeof(buf),
		(gcm_t-ctr_t)/(double)sizeof(buf));
		#ifdef GHASH
		GHASH(buf.c,sizeof(buf),&ctx);
		GHASH(&ctx,buf.c,sizeof(buf));
		start = OPENSSL_rdtsc();
		GHASH(buf.c,sizeof(buf),&ctx);
		for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
		gcm_t = OPENSSL_rdtsc() - start;
		printf("%.2f\n",gcm_t/(double)sizeof(buf));
		printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
		#endif
		}
		#endif