Commit 760d2551 authored by Andy Polyakov's avatar Andy Polyakov
Browse files

rc4-586.pl: 50% improvement on Core2 and 80% on Westmere.

parent 73ab3411
Loading
Loading
Loading
Loading
+129 −8
Original line number Diff line number Diff line
@@ -28,6 +28,31 @@
#
#					<appro@fy.chalmers.se>

# May 2011
#
# Optimize for Core2 and Westmere [and Opteron]. Current performance
# in cycles per processed byte (less is better) is:
#
# Pentium	10.2			# original numbers
# Pentium III	7.8(*)
# Intel P4	7.5
#
# Opteron	6.4/+14%		# new MMX numbers
# Core2		5.8/+50%(**)
# Westmere	5.5/+80%(**)
# Sandy Bridge	5.4/0%
#
# (*)	PIII can actually deliver 6.6 cycles per byte with MMX code,
#	but that specific code performs poorly on Core2. Conversely, the
#	MMX code below, which delivers 5.8 on Core2, performs at 8.0 on
#	PIII:-( As PIII is not a "hot" CPU [anymore], I chose not to
#	introduce PIII-specific code path, which is why MMX code path is
#	guarded by SSE2 bit (see below), not MMX.
# (**)	Performance vs. block size on Core2 and Westmere had a maximum
#	at ... 64 bytes block size. And it was quite a maximum, 40-60%
#	in comparison to largest 8KB block size. Above improvement
#	coefficients are for the largest block size.

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
@@ -62,6 +87,60 @@ sub RC4_loop {
	&$func	($out,&DWP(0,$dat,$ty,4));
}

# $RC4_loop_mmx holds a code reference that emits one step of the MMX
# code path. It takes a single argument, the step index $i, which only
# selects which of the MMX instructions are emitted for that step:
#
#	psllq	mm1,8*(($i-1)&7)	unless abs($i)==1
#	pxor	mm2,mm0 (if $i==1) or mm1	if $i>=0
#	movq	mm0,[$inp]		if $i<=0
#	movq	[$out+$inp-8],mm2	if $i==0
#	movd	mm1 (if $i>0) or mm2,[$dat+$ty*4]
#
# Two variants of the generator exist and $alt chooses between them.
# $alt is intentionally a file-level variable, because the code in RC4
# that sets up and finishes the MMX loop tests it as well. It is pinned
# to 0 (the second variant) here; change the assignment to 1 in order
# to experiment with the alternative one.
$alt=0;
if ($alt) {
  # works ~5% faster on Atom and ~20% slower on Core2
  #
  # @XX is private to the closure and persists between calls. It
  # starts out as ($xx,$out) and the two entries trade places after
  # every step with $i>=0.
  my @XX=($xx,$out);
  $RC4_loop_mmx = sub {
    my $i=shift;

	&add	($yy,$tx);
	&movz	($yy,&LB($yy));
	&lea	($XX[1],&DWP(1,$XX[0]));
	&psllq	("mm1",8*(($i-1)&7))			if (abs($i)!=1);
	&and	($XX[1],0xff);
	&mov	($ty,&DWP(0,$dat,$yy,4));
	&mov	(&DWP(0,$dat,$yy,4),$tx);
	&mov	(&DWP(0,$dat,$XX[0],4),$ty);
	&add	($ty,$tx);
	&movd	($XX[0],"mm7")				if ($i==0);
	&movz	($ty,&LB($ty));
	&mov	($tx,&DWP(0,$dat,$XX[1],4));
	&pxor	("mm2",$i==1?"mm0":"mm1")		if ($i>=0);
	&movq	("mm0",&QWP(0,$inp))			if ($i<=0);
	&movq	(&QWP(-8,$XX[0],$inp),"mm2")		if ($i==0);
	&movd	($i>0?"mm1":"mm2",&DWP(0,$dat,$ty,4));

	# swap the roles of the two registers for the next step
	push	(@XX,shift(@XX))			if ($i>=0);
  }
} else {
  $RC4_loop_mmx = sub {
    my $i=shift;

	&add	($yy,$tx);
	&movz	($yy,&LB($yy));				# (*)
	&psllq	("mm1",8*(($i-1)&7))			if (abs($i)!=1);
	&mov	($ty,&DWP(0,$dat,$yy,4));
	&mov	(&DWP(0,$dat,$yy,4),$tx);
	&mov	(&DWP(0,$dat,$xx,4),$ty);
	&inc	($xx);
	&add	($ty,$tx);
	&movz	($xx,&LB($xx));				# (*)
	&movz	($ty,&LB($ty));				# (*)
	&pxor	("mm2",$i==1?"mm0":"mm1")		if ($i>=0);
	&movq	("mm0",&QWP(0,$inp))			if ($i<=0);
	&movq	(&QWP(-8,$out,$inp),"mm2")		if ($i==0);
	&mov	($tx,&DWP(0,$dat,$xx,4));
	&movd	($i>0?"mm1":"mm2",&DWP(0,$dat,$ty,4));

	# (*)	This is the key to Core2 and Westmere performance.
	#	Without movz out-of-order execution logic confuses
	#	itself and fails to reorder loads and stores. Problem
	#	appears to be fixed in Sandy Bridge...
  }
}

&external_label("OPENSSL_ia32cap_P");

# void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out);
&function_begin("RC4");
	&mov	($dat,&wparam(0));	# load key schedule pointer
@@ -94,11 +173,48 @@ sub RC4_loop {
	&and	($ty,-4);		# how many 4-byte chunks?
	&jz	(&label("loop1"));

	&test	($ty,-8);
	&mov	(&wparam(3),$out);	# $out as accumulator in these loops
	&jz	(&label("go4loop4"));

	&picmeup($out,"OPENSSL_ia32cap_P");
	&bt	(&DWP(0,$out),26);	# check SSE2 bit [could have been MMX]
	&jnc	(&label("go4loop4"));

	&mov	($out,&wparam(3))	if (!$alt);
	&movd	("mm7",&wparam(3))	if ($alt);
	&and	($ty,-8);
	&lea	($ty,&DWP(-8,$inp,$ty));
	&mov	(&wparam(2),$ty);

	&mov	(&DWP(-4,$dat),$ty);	# save input+(len/8)*8-8

	&$RC4_loop_mmx(-1);
	&jmp(&label("loop_mmx_enter"));

	&set_label("loop_mmx",16);
		&$RC4_loop_mmx(0);
	&set_label("loop_mmx_enter");
		for 	($i=1;$i<8;$i++) { &$RC4_loop_mmx($i); }
		&cmp	($inp,&DWP(-4,$dat));
		&lea	($inp,&DWP(8,$inp));
	&jb	(&label("loop_mmx"));

	&movd	($out,"mm7")		if ($alt);
	&psllq	("mm1",56);
	&pxor	("mm2","mm1");
	&movq	(&QWP(-8,$out,$inp),"mm2");
	&emms	();

	&cmp	($inp,&wparam(1));	# compare to input+len
	&je	(&label("done"));
	&jmp	(&label("loop1"));

&set_label("go4loop4",16);
	&lea	($ty,&DWP(-4,$inp,$ty));
	&mov	(&wparam(2),$ty);	# save input+(len/4)*4-4
	&mov	(&wparam(3),$out);	# $out as accumulator in this loop

	&set_label("loop4",16);
	&set_label("loop4");
		for ($i=0;$i<4;$i++) { RC4_loop($i); }
		&ror	($out,8);
		&xor	($out,&DWP(0,$inp));
@@ -164,8 +280,6 @@ $idi="ebp";
$ido="ecx";
$idx="edx";

&external_label("OPENSSL_ia32cap_P");

# void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data);
&function_begin("RC4_set_key");
	&mov	($out,&wparam(0));		# load key
@@ -254,14 +368,21 @@ $idx="edx";
	&blindpop("eax");
	&lea	("eax",&DWP(&label("opts")."-".&label("pic_point"),"eax"));
	&picmeup("edx","OPENSSL_ia32cap_P");
	&bt	(&DWP(0,"edx"),20);
	&jnc	(&label("skip"));
	&mov	("edx",&DWP(0,"edx"));
	&bt	("edx",20);
	&jc	(&label("1xchar"));
	&bt	("edx",26);
	&jnc	(&label("ret"));
	&add	("eax",25);
	&ret	();
&set_label("1xchar");
	&add	("eax",12);
	&set_label("skip");
&set_label("ret");
	&ret	();
&set_label("opts",64);
&asciz	("rc4(4x,int)");
&asciz	("rc4(1x,char)");
&asciz	("rc4(8x,mmx)");
&asciz	("RC4 for x86, CRYPTOGAMS by <appro\@openssl.org>");
&align	(64);
&function_end_B("RC4_options");