Commit b2dba9bf authored by Andy Polyakov
Browse files

Profiling revealed that OPENSSL_cleanse consumes *more* CPU time than
sha1_block_data_order when hashing short messages. Move OPENSSL_cleanse
to the "cpuid" assembler module and gain 2x.
parent 932cc129
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -1209,6 +1209,7 @@ $cflags.=" -DOPENSSL_IA32_SSE2" if (!$no_sse2 && $bn_obj =~ /bn86/);

$cflags.=" -DOPENSSL_BN_ASM_MONT" if ($bn_obj =~ /\-mont|mo86\-/);

$cpuid_obj="mem_clr.o"	unless ($cpuid_obj =~ /\.o$/);
$des_obj=$des_enc	unless ($des_obj =~ /\.o$/);
$bf_obj=$bf_enc		unless ($bf_obj =~ /\.o$/);
$cast_obj=$cast_enc	unless ($cast_obj =~ /\.o$/);
@@ -1481,7 +1482,7 @@ print OUT "#ifdef OPENSSL_ALGORITHM_DEFINES\n";
print OUT $openssl_algorithm_defines_trans;
print OUT "#endif\n\n";

print OUT "#define OPENSSL_CPUID_OBJ\n\n" if ($cpuid_obj);
print OUT "#define OPENSSL_CPUID_OBJ\n\n" if ($cpuid_obj ne "mem_clr.o");

while (<IN>)
	{
+1 −1
Original line number Diff line number Diff line
@@ -34,7 +34,7 @@ GENERAL=Makefile README crypto-lib.com install.com
LIB= $(TOP)/libcrypto.a
SHARED_LIB= libcrypto$(SHLIB_EXT)
LIBSRC=	cryptlib.c mem.c mem_clr.c mem_dbg.c cversion.c ex_data.c cpt_err.c ebcdic.c uid.c o_time.c o_str.c o_dir.c
LIBOBJ= cryptlib.o mem.o mem_clr.o mem_dbg.o cversion.o ex_data.o cpt_err.o ebcdic.o uid.o o_time.o o_str.o o_dir.o $(CPUID_OBJ)
LIBOBJ= cryptlib.o mem.o mem_dbg.o cversion.o ex_data.o cpt_err.o ebcdic.o uid.o o_time.o o_str.o o_dir.o $(CPUID_OBJ)

SRC= $(LIBSRC)

+36 −0
Original line number Diff line number Diff line
// Works on all IA-64 platforms: Linux, HP-UX, Win64i...
// On Win64i compile with ias.exe.
.text

// OPENSSL_cpuid_setup: empty stub in this module. Its body is a single
// bundle that returns to the caller through b0 without touching any
// other state.
.global	OPENSSL_cpuid_setup#
.proc	OPENSSL_cpuid_setup#
OPENSSL_cpuid_setup:
{ .mib;	br.ret.sptk.many	b0		};;
.endp	OPENSSL_cpuid_setup#

.global	OPENSSL_rdtsc#
.proc	OPENSSL_rdtsc#
OPENSSL_rdtsc:
@@ -124,3 +126,37 @@ OPENSSL_wipe_cpu:
	mov		ar.lc=r3
	br.ret.sptk	b0		};;
.endp	OPENSSL_wipe_cpu#

// void OPENSSL_cleanse(void *ptr, size_t len)
//
// Zero-fills len bytes starting at ptr.
//   r32 = ptr  (post-incremented by every store)
//   r33 = len  (bytes still to be cleared)
//   r2, p6, p7 = scratch
//
// len == 0 returns at once without storing anything.  Buffers shorter
// than 15 bytes are cleared one byte at a time (.Little).  Longer ones
// are byte-stored until ptr is 8-byte aligned (.Lot), cleared in 8-byte
// words (.Laligned), and any remaining tail of fewer than 8 bytes is
// finished in .Little.
.global	OPENSSL_cleanse#
.proc	OPENSSL_cleanse#
OPENSSL_cleanse:
{ .mib;	cmp.eq		p6,p0=0,r33	    // len==0
(p6)	br.ret.spnt	b0		};; // .Little stores before it tests,
					    // so zero length must exit here
{ .mib;	and		r2=7,r32	    // r2 = ptr misalignment
	cmp.leu		p6,p0=15,r33	    // len>=15
(p6)	br.cond.dptk	.Lot		};;

// Bytewise loop; invariant on entry: len >= 1.
.Little:
{ .mib;	st1		[r32]=r0,1
	cmp.ltu		p6,p7=1,r33	}  // len>1
{ .mbb;	add		r33=-1,r33	   // len--
(p6)	br.cond.dptk	.Little
(p7)	br.ret.sptk.many	b0	};;

// Byte-store until ptr is 8-byte aligned.  At most 7 bytes are consumed
// and len was >= 15, so at least 8 bytes remain for .Laligned.
.Lot:
{ .mib;	cmp.eq		p6,p0=0,r2
(p6)	br.cond.dptk	.Laligned	};;
{ .mmi;	st1		[r32]=r0,1;;
	and		r2=7,r32	}  // recompute misalignment
{ .mib;	add		r33=-1,r33
	br		.Lot		};;

// Word loop; r2 is computed from len *before* the decrement, hence the
// comparison against 8 rather than 0.
.Laligned:
{ .mmi;	st8		[r32]=r0,8
	and		r2=-8,r33	    // len&~7
	add		r33=-8,r33	};; // len-=8
{ .mib;	cmp.ltu		p6,p0=8,r2	    // ((len+8)&~7)>8
(p6)	br.cond.dptk	.Laligned	};;

{ .mbb;	cmp.eq		p6,p7=r0,r33	    // any tail bytes left?
(p7)	br.cond.dpnt	.Little
(p6)	br.ret.sptk.many	b0	};;
.endp	OPENSSL_cleanse#
+8 −2
Original line number Diff line number Diff line
@@ -250,7 +250,6 @@ void CRYPTO_get_mem_debug_functions(void (**m)(void *,int,const char *,int,int),
void *CRYPTO_malloc_locked(int num, const char *file, int line)
	{
	void *ret = NULL;
	extern unsigned char cleanse_ctr;

	if (num <= 0) return NULL;

@@ -267,11 +266,15 @@ void *CRYPTO_malloc_locked(int num, const char *file, int line)
	if (malloc_debug_func != NULL)
		malloc_debug_func(ret, num, file, line, 1);

#ifndef OPENSSL_CPUID_OBJ
        /* Create a dependency on the value of 'cleanse_ctr' so our memory
         * sanitisation function can't be optimised out. NB: We only do
         * this for >2Kb so the overhead doesn't bother us. */
        if(ret && (num > 2048))
	{	extern unsigned char cleanse_ctr;
		((unsigned char *)ret)[0] = cleanse_ctr;
	}
#endif

	return ret;
	}
@@ -291,7 +294,6 @@ void CRYPTO_free_locked(void *str)
void *CRYPTO_malloc(int num, const char *file, int line)
	{
	void *ret = NULL;
	extern unsigned char cleanse_ctr;

	if (num <= 0) return NULL;

@@ -308,11 +310,15 @@ void *CRYPTO_malloc(int num, const char *file, int line)
	if (malloc_debug_func != NULL)
		malloc_debug_func(ret, num, file, line, 1);

#ifndef OPENSSL_CPUID_OBJ
        /* Create a dependency on the value of 'cleanse_ctr' so our memory
         * sanitisation function can't be optimised out. NB: We only do
         * this for >2Kb so the overhead doesn't bother us. */
        if(ret && (num > 2048))
	{	extern unsigned char cleanse_ctr;
                ((unsigned char *)ret)[0] = cleanse_ctr;
	}
#endif

	return ret;
	}
+48 −0
Original line number Diff line number Diff line
@@ -232,6 +232,54 @@ _sparcv9_rdtick:
.type	_sparcv9_rdtick,#function
.size	_sparcv9_rdtick,.-_sparcv9_rdtick

! void OPENSSL_cleanse(void *ptr, size_t len)
!
! Zero-fills len bytes starting at ptr.  Leaf routine (returns with retl).
!   %o0 = ptr  (advanced past every byte stored)
!   %o1 = len  (bytes still to be cleared)
!
! len == 0 returns at once without storing anything.  Up to 6 bytes are
! cleared one byte at a time (.Little).  Longer buffers are byte-stored
! until ptr is 4-byte aligned (.Lot), cleared in 32-bit words (.Laligned),
! and any tail of fewer than 4 bytes is finished in .Little.
.global	OPENSSL_cleanse
.align	32
OPENSSL_cleanse:
	cmp	%o1,6
	nop
#ifdef ABI64
	bgu	%xcc,.Lot		! len>6, compare all 64 bits
#else
	bgu	.Lot			! len>6
#endif
	nop
	cmp	%o1,0			! len==0: .Little stores before it
	bne	.Little			! tests and would wrap the counter,
	nop				! so zero length must exit here
	retl
	nop

! Bytewise loop; invariant on entry: 1 <= len <= 6.
.Little:
	stb	%g0,[%o0]
	subcc	%o1,1,%o1		! len--, sets condition codes
	bnz	.Little
	add	%o0,1,%o0		! delay slot: ptr++ on both paths
	retl
	nop
.align	32
! Byte-store until ptr is 4-byte aligned.  At most 3 bytes are consumed
! and len was >= 7, so at least 4 bytes remain for .Laligned.
.Lot:
	andcc	%o0,3,%g0
	bz	.Laligned
	nop
	stb	%g0,[%o0]
	sub	%o1,1,%o1
	ba	.Lot
	add	%o0,1,%o0		! delay slot: ptr++
	nop
! Word loop: repeat while at least 4 bytes remain.
.Laligned:
	st	%g0,[%o0]
	sub	%o1,4,%o1
	andcc	%o1,-4,%g0		! len>=4 ?
#ifdef ABI64
	bnz	%xcc,.Laligned
#else
	bnz	.Laligned
#endif
	add	%o0,4,%o0		! delay slot: ptr+=4 on both paths

	cmp	%o1,0			! any tail bytes left?
	bne	.Little
	nop
	retl
	nop
.type	OPENSSL_cleanse,#function
.size	OPENSSL_cleanse,.-OPENSSL_cleanse

.section	".init",#alloc,#execinstr
	call	OPENSSL_cpuid_setup
	nop
Loading