Commit 376729e1 authored by Andy Polyakov's avatar Andy Polyakov
Browse files

RC4 tune-up for Intel P4 core, both 32- and 64-bit ones. As it's

apparently impossible to compose blended code with would perform
satisfactory on all x86 and x86_64 cores, an extra RC4_CHAR
code-path is introduced and P4 core is detected at run-time. This
way we keep original performance on non-P4 implementations and
turbo-charge P4 performance by factor of 2.8x (on 32-bit core).
parent 00dd8f6d
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -318,7 +318,7 @@ my %table=(
"linux-s390x",	"gcc:-DB_ENDIAN -DTERMIO -DNO_ASM -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-ia64",	"gcc:-DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR::asm/ia64.o::asm/aes-ia64.o:::asm/sha256-ia64.o asm/sha512-ia64.o:::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-ia64-ecc","ecc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR::asm/ia64.o::asm/aes-ia64.o:::asm/sha256-ia64.o asm/sha512-ia64.o:::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-x86_64",	"gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall -DMD32_REG_T=int::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL::asm/x86_64-gcc.o:::::::asm/rc4-amd64.o:::dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-x86_64",	"gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall -DMD32_REG_T=int::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL:amd64cpuid.o:asm/x86_64-gcc.o:::::::asm/rc4-amd64.o:::dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-elf-arm","gcc:-DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-parisc",	"gcc:-DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall -DBN_DIV2W::-D_REENTRANT:::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::",
#### SPARC Linux setups
+2 −2
Original line number Diff line number Diff line
@@ -2086,7 +2086,7 @@ $unistd =
$thread_cflag = -D_REENTRANT
$sys_id       = 
$lflags       = -rdynamic -ldl
$bn_ops       = DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj    = x86cpuid-elf.o
$bn_obj       = asm/bn86-elf.o asm/co86-elf.o
$des_obj      = asm/dx86-elf.o asm/yx86-elf.o
@@ -3572,7 +3572,7 @@ $thread_cflag = -D_REENTRANT
$sys_id       = 
$lflags       = -ldl
$bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL
$cpuid_obj    = 
$cpuid_obj    = amd64cpuid.o
$bn_obj       = asm/x86_64-gcc.o
$des_obj      = 
$aes_obj      = 
+112 −3
Original line number Diff line number Diff line
@@ -5,16 +5,63 @@ $win64a=1 if ($output =~ /win64a\.[s|asm]/);
open STDOUT,">$output" || die "can't open $output: $!";

print<<___ if(defined($win64a));
TEXT	SEGMENT
_TEXT	SEGMENT
PUBLIC	OPENSSL_rdtsc
ALIGN	16
OPENSSL_rdtsc	PROC NEAR
OPENSSL_rdtsc	PROC
	rdtsc
	shl	rdx,32
	or	rax,rdx
	ret
OPENSSL_rdtsc	ENDP
TEXT	ENDS

PUBLIC	OPENSSL_atomic_add
ALIGN	16
OPENSSL_atomic_add	PROC
	mov	eax,DWORD PTR[rcx]
\$Lspin:	lea	r8,DWORD PTR[rdx+rax]
lock	cmpxchg	DWORD PTR[rcx],r8d
	jne	\$Lspin
	mov	eax,r8d
	cdqe    
	ret
OPENSSL_atomic_add	ENDP

PUBLIC	OPENSSL_wipe_cpu
ALIGN	16
OPENSSL_wipe_cpu	PROC
	pxor	xmm0,xmm0
	pxor	xmm1,xmm1
	pxor	xmm2,xmm2
	pxor	xmm3,xmm3
	pxor	xmm4,xmm4
	pxor	xmm5,xmm5
	xor	rcx,rcx
	xor	rdx,rdx
	xor	r8,r8
	xor	r9,r9
	xor	r10,r10
	xor	r11,r11
	lea	rax,QWORD PTR[rsp+8]
	ret
OPENSSL_wipe_cpu	ENDP

OPENSSL_ia32_cpuid	PROC
	mov	r8,rbx
	mov	eax,1
	cpuid
	shl	rcx,32
	mov	eax,edx
	mov	rbx,r8
	or	rax,rcx
	ret
OPENSSL_ia32_cpuid	ENDP
_TEXT	ENDS

CRT\$XIU	SEGMENT
EXTRN	OPENSSL_cpuid_setup:PROC
DQ	OPENSSL_cpuid_setup
CRT\$XIU	ENDS
END
___
print<<___ if(!defined($win64a));
@@ -27,4 +74,66 @@ OPENSSL_rdtsc:
	or	%rdx,%rax
	ret
.size	OPENSSL_rdtsc,.-OPENSSL_rdtsc

.globl	OPENSSL_atomic_add
.type	OPENSSL_atomic_add,\@function
.align	16
OPENSSL_atomic_add:
	movl	(%rdi),%eax
.Lspin:	lea	(%rsi,%rax),%r8
lock;	cmpxchg	%r8d,(%rdi)
	jne	.Lspin
	mov	%r8d,%eax
	cdqe
	ret
.size	OPENSSL_atomic_add,.-OPENSSL_atomic_add

.globl	OPENSSL_wipe_cpu
.type	OPENSSL_wipe_cpu,\@function
.align	16
OPENSSL_wipe_cpu:
	pxor	%xmm0,%xmm0
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	pxor	%xmm8,%xmm8
	pxor	%xmm9,%xmm9
	pxor	%xmm10,%xmm10
	pxor	%xmm11,%xmm11
	pxor	%xmm12,%xmm12
	pxor	%xmm13,%xmm13
	pxor	%xmm14,%xmm14
	pxor	%xmm15,%xmm15
	xor	%rcx,%rcx
	xor	%rdx,%rdx
	xor	%rsi,%rsi
	xor	%rdi,%rdi
	xor	%r8,%r8
	xor	%r9,%r9
	xor	%r10,%r10
	xor	%r11,%r11
	lea	8(%rsp),%rax
	ret
.size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu

.globl	OPENSSL_ia32_cpuid
.align	16
OPENSSL_ia32_cpuid:
	mov	%rbx,%r8
	mov	\$1,%eax
	cpuid
	shl	\$32,%rcx
	mov	%edx,%eax
	mov	%r8,%rbx
	or	%rcx,%rax
	ret
.size	OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid

.section	.init
	call	OPENSSL_cpuid_setup
	.align	16
___
+6 −5
Original line number Diff line number Diff line
@@ -161,7 +161,7 @@ sub main'shl { &out2("sall",@_); }
sub main'shr	{ &out2("shrl",@_); }
sub main'xor	{ &out2("xorl",@_); }
sub main'xorb	{ &out2("xorb",@_); }
sub main'add	{ &out2("addl",@_); }
sub main'add	{ &out2($_[0]=~/%[a-d][lh]/?"addb":"addl",@_); }
sub main'adc	{ &out2("adcl",@_); }
sub main'sub	{ &out2("subl",@_); }
sub main'sbb	{ &out2("sbbl",@_); }
@@ -189,7 +189,7 @@ sub main'jc { &out1("jc",@_); }
sub main'jnc	{ &out1("jnc",@_); }
sub main'jno	{ &out1("jno",@_); }
sub main'dec	{ &out1("decl",@_); }
sub main'inc	{ &out1("incl",@_); }
sub main'inc	{ &out1($_[0]=~/%[a-d][hl]/?"incb":"incl",@_); }
sub main'push	{ &out1("pushl",@_); $stack+=4; }
sub main'pop	{ &out1("popl",@_); $stack-=4; }
sub main'pushf	{ &out0("pushfl"); $stack+=4; }
@@ -205,9 +205,10 @@ sub main'nop { &out0("nop"); }
sub main'test	{ &out2("testl",@_); }
sub main'bt	{ &out2("btl",@_); }
sub main'leave	{ &out0("leave"); }
sub main'cpuid	{ &out0(".byte 0x0f; .byte 0xa2"); }
sub main'rdtsc	{ &out0(".byte 0x0f; .byte 0x31"); }
sub main'cpuid	{ &out0(".byte\t0x0f,0xa2"); }
sub main'rdtsc	{ &out0(".byte\t0x0f,0x31"); }
sub main'halt	{ &out0("hlt"); }
sub main'movz	{ &out2("movzb",@_); }

# SSE2
sub main'emms	{ &out0("emms"); }
@@ -558,7 +559,7 @@ sub main'file_end
		pushl	%ebx
		movl	%edx,%edi
		movl	\$1,%eax
		.byte 0x0f; .byte 0xa2
		.byte	0x0f,0xa2
		orl	\$1<<10,%edx
		movl	%edx,0(%edi)
		popl	%ebx
+47 −12
Original line number Diff line number Diff line
@@ -7,10 +7,10 @@ require "x86asm.pl";

&asm_init($ARGV[0],"rc4-586.pl");

$tx="eax";
$ty="ebx";
$x="ecx";
$y="edx";
$x="eax";
$y="ebx";
$tx="ecx";
$ty="edx";
$in="esi";
$out="edi";
$d="ebp";
@@ -31,7 +31,7 @@ sub RC4_loop
			{
			 &mov($ty,	&swtmp(2));
			&cmp($ty,	$in);
			 &jle(&label("finished"));
			 &jbe(&label("finished"));
			&inc($in);
			}
		else
@@ -39,7 +39,7 @@ sub RC4_loop
			&add($ty,	8);
			 &inc($in);
			&cmp($ty,	$in);
			 &jl(&label("finished"));
			 &jb(&label("finished"));
			&mov(&swtmp(2),	$ty);
			}
		}
@@ -88,35 +88,44 @@ sub RC4

	&function_begin_B($name,"");

	&mov($ty,&wparam(1));		# len
	&cmp($ty,0);
	&jne(&label("proceed"));
	&ret();
	&set_label("proceed");

	&comment("");

	&push("ebp");
	 &push("ebx");
	&mov(	$d,	&wparam(0));	# key
	 &mov(	$ty,	&wparam(1));	# num
	&push("esi");
	 &push("edi");
	&mov(	$d,	&wparam(0));	# key
	 &mov(	$in,	&wparam(2));

	&mov(	$x,	&DWP(0,$d,"",1));
	 &mov(	$y,	&DWP(4,$d,"",1));

	&mov(	$in,	&wparam(2));
	&mov(	$out,	&wparam(3));
	 &inc(	$x);

	&stack_push(3);	# 3 temp variables
	 &add(	$d,	8);
	&and(	$x,		0xff);

	# detect compressed schedule, see commentary section in rc4_skey.c...
	&cmp(&DWP(256,$d),-1);
	&je(&label("RC4_CHAR"));

	 &lea(	$ty,	&DWP(-8,$ty,$in));

	# check for 0 length input

	&mov(	$out,	&wparam(3));
	 &mov(	&swtmp(2),	$ty);	# this is now address to exit at
	&mov(	$tx,	&DWP(0,$d,$x,4));

	 &cmp(	$ty,	$in);
	&jl(	&label("end")); # less than 8 bytes
	&jb(	&label("end")); # less than 8 bytes

	&set_label("start");

@@ -148,7 +157,7 @@ sub RC4
	&mov(	&DWP(-4,$out,"",0),	$tx);
	 &mov(	$tx,		&DWP(0,$d,$x,4));
	&cmp($in,	$ty);
	 &jle(&label("start"));
	 &jbe(&label("start"));

	&set_label("end");

@@ -162,6 +171,32 @@ sub RC4
	&RC4_loop(5,0,1);
	&RC4_loop(6,1,1);

	&jmp(&label("finished"));

	&align(16);
	# this is essentially Intel P4 specific codepath, see rc4_skey.c...
	&set_label("RC4_CHAR");

	&lea	($ty,&DWP(0,$in,$ty));
	&mov	(&swtmp(2),$ty);

	# strangely enough unrolled loop performs over 20% slower...
	&set_label("RC4_CHAR_loop");
		&movz	($tx,&BP(0,$d,$x));
		&add	(&LB($y),&LB($tx));
		&movz	($ty,&BP(0,$d,$y));
		&movb	(&BP(0,$d,$y),&LB($tx));
		&movb	(&BP(0,$d,$x),&LB($ty));
		&add	(&LB($ty),&LB($tx));
		&movz	($ty,&BP(0,$d,$ty));
		&xorb	(&LB($ty),&BP(0,$in));
		&movb	(&BP(0,$out),&LB($ty));
		&inc	(&LB($x));
		&inc	($in);
		&inc	($out);
		&cmp	($in,&swtmp(2));
	&jb	(&label("RC4_CHAR_loop"));

	&set_label("finished");
	&dec(	$x);
	 &stack_pop(3);
Loading