Commit 9babf392 authored by Andy Polyakov's avatar Andy Polyakov
Browse files

RC4_set_key for x86_64 and Core2 optimization.

PR: 1447
parent 2ec0be9e
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -120,7 +120,7 @@ my $x86_elf_asm="x86cpuid-elf.o:bn86-elf.o co86-elf.o mo86-elf.o:dx86-elf.o yx86
my $x86_coff_asm="x86cpuid-cof.o:bn86-cof.o co86-cof.o mo86-cof.o:dx86-cof.o yx86-cof.o:ax86-cof.o:bx86-cof.o:mx86-cof.o:sx86-cof.o s512sse2-cof.o:cx86-cof.o:rx86-cof.o:rm86-cof.o:r586-cof.o:wp_block.o w86mmx-cof.o";
my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o mo86-out.o:dx86-out.o yx86-out.o:ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o:rm86-out.o:r586-out.o:wp_block.o w86mmx-out.o";

my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4_skey.o:::wp-x86_64.o";
my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o";
my $ia64_asm="ia64cpuid.o:bn-ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::";
my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::::::::";
my $alpha_asm=":bn_asm.o alpha-mont.o::::::::::";
+4 −4
Original line number Diff line number Diff line
@@ -268,7 +268,7 @@ $bf_obj =
$md5_obj      = md5-x86_64.o
$sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
$cast_obj     = 
$rc4_obj      = rc4-x86_64.o rc4_skey.o
$rc4_obj      = rc4-x86_64.o
$rmd160_obj   = 
$rc5_obj      = 
$wp_obj       = wp-x86_64.o
@@ -3152,7 +3152,7 @@ $bf_obj =
$md5_obj      = md5-x86_64.o
$sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
$cast_obj     = 
$rc4_obj      = rc4-x86_64.o rc4_skey.o
$rc4_obj      = rc4-x86_64.o
$rmd160_obj   = 
$rc5_obj      = 
$wp_obj       = wp-x86_64.o
@@ -3964,7 +3964,7 @@ $bf_obj =
$md5_obj      = md5-x86_64.o
$sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
$cast_obj     = 
$rc4_obj      = rc4-x86_64.o rc4_skey.o
$rc4_obj      = rc4-x86_64.o
$rmd160_obj   = 
$rc5_obj      = 
$wp_obj       = wp-x86_64.o
@@ -3992,7 +3992,7 @@ $bf_obj =
$md5_obj      = md5-x86_64.o
$sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
$cast_obj     = 
$rc4_obj      = rc4-x86_64.o rc4_skey.o
$rc4_obj      = rc4-x86_64.o
$rmd160_obj   = 
$rc5_obj      = 
$wp_obj       = wp-x86_64.o
+115 −0
Original line number Diff line number Diff line
@@ -49,6 +49,14 @@
# is not implemented, then this final RC4_CHAR code-path should be
# preferred, as it provides better *all-round* performance].

# Intel Core2 was observed to perform poorly on both code paths:-( It
# apparently suffers from some kind of partial register stall, which
# occurs in 64-bit mode only [as virtually identical 32-bit loop was
# observed to outperform 64-bit one by almost 50%]. Adding two movzb to
# cloop1 boosts its performance by 80%! This loop appears to be optimal
# fit for Core2 and therefore the code was modified to skip cloop8 on
# this CPU.

$output=shift;
open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";

@@ -152,6 +160,8 @@ $code.=<<___;
	movzb	($dat,$XX[0]),$TX[0]#d
	test	\$-8,$len
	jz	.Lcloop1
	cmp	\$0,260($dat)
	jnz	.Lcloop1
	push	%rbx
	jmp	.Lcloop8
.align	16
@@ -235,6 +245,111 @@ $code.=<<___;
.size	RC4,.-RC4
___

$idx="%r8";
$ido="%r9";

$code.=<<___;
.extern	OPENSSL_ia32cap_P
.globl	RC4_set_key
.type	RC4_set_key,\@function,3
.align	16
RC4_set_key:
	lea	8($dat),$dat
	lea	($inp,$len),$inp
	neg	$len
	mov	$len,%rcx
	xor	%eax,%eax
	xor	$ido,$ido
	xor	%r10,%r10
	xor	%r11,%r11

	mov	OPENSSL_ia32cap_P(%rip),$idx#d
	bt	\$20,$idx#d
	jnc	.Lw1stloop
	bt	\$30,$idx#d
	setc	$ido#b
	mov	$ido#d,260($dat)
	jmp	.Lc1stloop

.align	16
.Lw1stloop:
	mov	%eax,($dat,%rax,4)
	add	\$1,%al
	jnc	.Lw1stloop

	xor	$ido,$ido
	xor	$idx,$idx
.align	16
.Lw2ndloop:
	mov	($dat,$ido,4),%r10d
	add	($inp,$len,1),$idx#b
	add	%r10b,$idx#b
	add	\$1,$len
	mov	($dat,$idx,4),%r11d
	cmovz	%rcx,$len
	mov	%r10d,($dat,$idx,4)
	mov	%r11d,($dat,$ido,4)
	add	\$1,$ido#b
	jnc	.Lw2ndloop
	jmp	.Lexit_key

.align	16
.Lc1stloop:
	mov	%al,($dat,%rax)
	add	\$1,%al
	jnc	.Lc1stloop

	xor	$ido,$ido
	xor	$idx,$idx
.align	16
.Lc2ndloop:
	mov	($dat,$ido),%r10b
	add	($inp,$len),$idx#b
	add	%r10b,$idx#b
	add	\$1,$len
	mov	($dat,$idx),%r11b
	jnz	.Lcnowrap
	mov	%rcx,$len
.Lcnowrap:
	mov	%r10b,($dat,$idx)
	mov	%r11b,($dat,$ido)
	add	\$1,$ido#b
	jnc	.Lc2ndloop
	movl	\$-1,256($dat)

.align	16
.Lexit_key:
	xor	%eax,%eax
	mov	%eax,-8($dat)
	mov	%eax,-4($dat)
	ret
.size	RC4_set_key,.-RC4_set_key

.globl	RC4_options
.type	RC4_options,\@function,0
.align	16
RC4_options:
	.picmeup %rax
	lea	.Lopts-.(%rax),%rax
	mov	OPENSSL_ia32cap_P(%rip),%edx
	bt	\$20,%edx
	jnc	.Ldone
	add	\$12,%rax
	bt	\$30,%edx
	jnc	.Ldone
	add	\$13,%rax
.Ldone:
	ret
.align	64
.Lopts:
.asciz	"rc4(8x,int)"
.asciz	"rc4(8x,char)"
.asciz	"rc4(1x,char)"
.asciz	"RC4 for x86_64, OpenSSL project"
.align	64
.size	RC4_options,.-RC4_options
___

$code =~ s/#([bwd])/$1/gm;

print $code;
+58 −0
Original line number Diff line number Diff line
@@ -48,8 +48,37 @@ OPENSSL_wipe_cpu ENDP

OPENSSL_ia32_cpuid	PROC
	mov	r8,rbx

	xor	eax,eax
	cpuid
	xor	eax,eax
	cmp	ebx,0756e6547h
	setne	al
	mov	r9d,eax
	cmp	edx,049656e69h
	setne	al
	or	r9d,eax
	cmp	ecx,06c65746eh
	setne	al
	or	r9d,eax

	mov	eax,1
	cpuid
	bt	edx,28
	jnc	\$Ldone
	cmp	r9,0
	jne	\$Lnotintel
	or	edx,000100000h
	and	ah,15
	cmp	ah,15
	je	\$Lnotintel
	or	edx,040000000h
\$Lnotintel:
	shr	ebx,16
	cmp	bl,1
	ja	\$Ldone
	and	edx,0efffffffh
\$Ldone:
	shl	rcx,32
	mov	eax,edx
	mov	rbx,r8
@@ -124,8 +153,37 @@ OPENSSL_wipe_cpu:
.align	16
OPENSSL_ia32_cpuid:
	movq	%rbx,%r8

	xor	%eax,%eax
	cpuid
	xor	%eax,%eax
	cmp	\$0x756e6547,%ebx	# "Genu"
	setne	%al
	mov	%eax,%r9d
	cmp	\$0x49656e69,%edx	# "ineI"
	setne	%al
	or	%eax,%r9d
	cmp	\$0x6c65746e,%ecx	# "ntel"
	setne	%al
	or	%eax,%r9d

	movl	\$1,%eax
	cpuid
	bt	\$28,%edx		# test hyper-threading bit
	jnc	.Ldone
	cmp	\$0,%r9
	jne	.Lnotintel
	or	\$1<<20,%edx		# use reserved bit to engage RC4_CHAR
	and	\$15,%ah
	cmp	\$15,%ah		# examine Family ID
	je	.Lnotintel
	or	\$1<<30,%edx		# use reserved bit to skip unrolled loop
.Lnotintel:
	shr	\$16,%ebx
	cmp	\$1,%bl			# see if cache is shared
	ja	.Ldone
	and	\$~(1<<28),%edx
.Ldone:
	shlq	\$32,%rcx
	movl	%edx,%eax
	movq	%r8,%rbx