Loading crypto/rc4/asm/rc4-586.pl +129 −8 Original line number Diff line number Diff line Loading @@ -28,6 +28,31 @@ # # <appro@fy.chalmers.se> # May 2011 # # Optimize for Core2 and Westmere [and Opteron]. Current performance # in cycles per processed byte (less is better) is: # # Pentium 10.2 # original numbers # Pentium III 7.8(*) # Intel P4 7.5 # # Opteron 6.4/+14% # new MMX numbers # Core2 5.8/+50%(**) # Westmere 5.5/+80%(**) # Sandy Bridge 5.4/0% # # (*) PIII can actually deliver 6.6 cycles per byte with MMX code, # but this specific code performs poorly on Core2. While below # MMX code delivering 5.8 on Core2 performs at 8.0 on PIII:-( # As PIII is not a "hot" CPU [anymore], I chose not to introduce # PIII-specific code path, which is why MMX code path is guarded # by SSE2 bit (see below), not MMX. # (**) Performance vs. block size on Core2 and Westmere had a maximum # at ... 64 bytes block size. And it was quite a maximum, 40-60% # in comparison to largest 8KB block size. Above improvement # coefficients are for the largest block size. 
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; Loading Loading @@ -62,6 +87,60 @@ sub RC4_loop { &$func ($out,&DWP(0,$dat,$ty,4)); } if ($alt=0) { # works ~5% faster on Atom and ~20% slower on Core2; NOTE(review): '=' here is an assignment, so $alt becomes 0 and the else branch is the one used - looks deliberate since $alt is tested again later, confirm before changing my @XX=($xx,$out); $RC4_loop_mmx = sub { my $i=shift; &add ($yy,$tx); &movz ($yy,&LB($yy)); &lea (@XX[1],&DWP(1,@XX[0])); &psllq ("mm1",8*(($i-1)&7)) if (abs($i)!=1); &and (@XX[1],0xff); &mov ($ty,&DWP(0,$dat,$yy,4)); &mov (&DWP(0,$dat,$yy,4),$tx); &mov (&DWP(0,$dat,$XX[0],4),$ty); &add ($ty,$tx); &movd (@XX[0],"mm7") if ($i==0); &movz ($ty,&LB($ty)); &mov ($tx,&DWP(0,$dat,@XX[1],4)); &pxor ("mm2",$i==1?"mm0":"mm1") if ($i>=0); &movq ("mm0",&QWP(0,$inp)) if ($i<=0); &movq (&QWP(-8,(@XX[0],$inp)),"mm2") if ($i==0); &movd ($i>0?"mm1":"mm2",&DWP(0,$dat,$ty,4)); push (@XX,shift(@XX)) if ($i>=0); } } else { $RC4_loop_mmx = sub { my $i=shift; &add ($yy,$tx); &movz ($yy,&LB($yy)); # (*) &psllq ("mm1",8*(($i-1)&7)) if (abs($i)!=1); &mov ($ty,&DWP(0,$dat,$yy,4)); &mov (&DWP(0,$dat,$yy,4),$tx); &mov (&DWP(0,$dat,$xx,4),$ty); &inc ($xx); &add ($ty,$tx); &movz ($xx,&LB($xx)); # (*) &movz ($ty,&LB($ty)); # (*) &pxor ("mm2",$i==1?"mm0":"mm1") if ($i>=0); &movq ("mm0",&QWP(0,$inp)) if ($i<=0); &movq (&QWP(-8,($out,$inp)),"mm2") if ($i==0); &mov ($tx,&DWP(0,$dat,$xx,4)); &movd ($i>0?"mm1":"mm2",&DWP(0,$dat,$ty,4)); # (*) This is the key to Core2 and Westmere performance. # Without movz out-of-order execution logic confuses # itself and fails to reorder loads and stores. Problem # appears to be fixed in Sandy Bridge... } } &external_label("OPENSSL_ia32cap_P"); # void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out); &function_begin("RC4"); &mov ($dat,&wparam(0)); # load key schedule pointer Loading Loading @@ -94,11 +173,48 @@ sub RC4_loop { &and ($ty,-4); # how many 4-byte chunks? 
&jz (&label("loop1")); &test ($ty,-8); &mov (&wparam(3),$out); # $out as accumulator in these loops &jz (&label("go4loop4")); &picmeup($out,"OPENSSL_ia32cap_P"); &bt (&DWP(0,$out),26); # check SSE2 bit [could have been MMX] &jnc (&label("go4loop4")); &mov ($out,&wparam(3)) if (!$alt); &movd ("mm7",&wparam(3)) if ($alt); &and ($ty,-8); &lea ($ty,&DWP(-8,$inp,$ty)); &mov (&wparam(2),$ty); &mov (&DWP(-4,$dat),$ty); # save input+(len/8)*8-8 &$RC4_loop_mmx(-1); &jmp(&label("loop_mmx_enter")); &set_label("loop_mmx",16); &$RC4_loop_mmx(0); &set_label("loop_mmx_enter"); for ($i=1;$i<8;$i++) { &$RC4_loop_mmx($i); } &cmp ($inp,&DWP(-4,$dat)); &lea ($inp,&DWP(8,$inp)); &jb (&label("loop_mmx")); &movd ($out,"mm7") if ($alt); &psllq ("mm1",56); &pxor ("mm2","mm1"); &movq (&QWP(-8,$out,$inp),"mm2"); &emms (); &cmp ($inp,&wparam(1)); # compare to input+len &je (&label("done")); &jmp (&label("loop1")); &set_label("go4loop4",16); &lea ($ty,&DWP(-4,$inp,$ty)); &mov (&wparam(2),$ty); # save input+(len/4)*4-4 &mov (&wparam(3),$out); # $out as accumulator in this loop &set_label("loop4",16); &set_label("loop4"); for ($i=0;$i<4;$i++) { RC4_loop($i); } &ror ($out,8); &xor ($out,&DWP(0,$inp)); Loading Loading @@ -164,8 +280,6 @@ $idi="ebp"; $ido="ecx"; $idx="edx"; &external_label("OPENSSL_ia32cap_P"); # void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data); &function_begin("RC4_set_key"); &mov ($out,&wparam(0)); # load key Loading Loading @@ -254,14 +368,21 @@ $idx="edx"; &blindpop("eax"); &lea ("eax",&DWP(&label("opts")."-".&label("pic_point"),"eax")); &picmeup("edx","OPENSSL_ia32cap_P"); &bt (&DWP(0,"edx"),20); &jnc (&label("skip")); &mov ("edx",&DWP(0,"edx")); &bt ("edx",20); &jc (&label("1xchar")); &bt ("edx",26); &jnc (&label("ret")); &add ("eax",25); &ret (); &set_label("1xchar"); &add ("eax",12); &set_label("skip"); &set_label("ret"); &ret (); &set_label("opts",64); &asciz ("rc4(4x,int)"); &asciz ("rc4(1x,char)"); &asciz ("rc4(8x,mmx)"); &asciz ("RC4 for x86, CRYPTOGAMS 
by <appro\@openssl.org>"); &align (64); &function_end_B("RC4_options"); Loading Loading
crypto/rc4/asm/rc4-586.pl +129 −8 Original line number Diff line number Diff line Loading @@ -28,6 +28,31 @@ # # <appro@fy.chalmers.se> # May 2011 # # Optimize for Core2 and Westmere [and Opteron]. Current performance # in cycles per processed byte (less is better) is: # # Pentium 10.2 # original numbers # Pentium III 7.8(*) # Intel P4 7.5 # # Opteron 6.4/+14% # new MMX numbers # Core2 5.8/+50%(**) # Westmere 5.5/+80%(**) # Sandy Bridge 5.4/0% # # (*) PIII can actually deliver 6.6 cycles per byte with MMX code, # but this specific code performs poorly on Core2. While below # MMX code delivering 5.8 on Core2 performs at 8.0 on PIII:-( # As PIII is not a "hot" CPU [anymore], I chose not to introduce # PIII-specific code path, which is why MMX code path is guarded # by SSE2 bit (see below), not MMX. # (**) Performance vs. block size on Core2 and Westmere had a maximum # at ... 64 bytes block size. And it was quite a maximum, 40-60% # in comparison to largest 8KB block size. Above improvement # coefficients are for the largest block size. 
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; Loading Loading @@ -62,6 +87,60 @@ sub RC4_loop { &$func ($out,&DWP(0,$dat,$ty,4)); } if ($alt=0) { # works ~5% faster on Atom and ~20% slower on Core2; NOTE(review): '=' here is an assignment, so $alt becomes 0 and the else branch is the one used - looks deliberate since $alt is tested again later, confirm before changing my @XX=($xx,$out); $RC4_loop_mmx = sub { my $i=shift; &add ($yy,$tx); &movz ($yy,&LB($yy)); &lea (@XX[1],&DWP(1,@XX[0])); &psllq ("mm1",8*(($i-1)&7)) if (abs($i)!=1); &and (@XX[1],0xff); &mov ($ty,&DWP(0,$dat,$yy,4)); &mov (&DWP(0,$dat,$yy,4),$tx); &mov (&DWP(0,$dat,$XX[0],4),$ty); &add ($ty,$tx); &movd (@XX[0],"mm7") if ($i==0); &movz ($ty,&LB($ty)); &mov ($tx,&DWP(0,$dat,@XX[1],4)); &pxor ("mm2",$i==1?"mm0":"mm1") if ($i>=0); &movq ("mm0",&QWP(0,$inp)) if ($i<=0); &movq (&QWP(-8,(@XX[0],$inp)),"mm2") if ($i==0); &movd ($i>0?"mm1":"mm2",&DWP(0,$dat,$ty,4)); push (@XX,shift(@XX)) if ($i>=0); } } else { $RC4_loop_mmx = sub { my $i=shift; &add ($yy,$tx); &movz ($yy,&LB($yy)); # (*) &psllq ("mm1",8*(($i-1)&7)) if (abs($i)!=1); &mov ($ty,&DWP(0,$dat,$yy,4)); &mov (&DWP(0,$dat,$yy,4),$tx); &mov (&DWP(0,$dat,$xx,4),$ty); &inc ($xx); &add ($ty,$tx); &movz ($xx,&LB($xx)); # (*) &movz ($ty,&LB($ty)); # (*) &pxor ("mm2",$i==1?"mm0":"mm1") if ($i>=0); &movq ("mm0",&QWP(0,$inp)) if ($i<=0); &movq (&QWP(-8,($out,$inp)),"mm2") if ($i==0); &mov ($tx,&DWP(0,$dat,$xx,4)); &movd ($i>0?"mm1":"mm2",&DWP(0,$dat,$ty,4)); # (*) This is the key to Core2 and Westmere performance. # Without movz out-of-order execution logic confuses # itself and fails to reorder loads and stores. Problem # appears to be fixed in Sandy Bridge... } } &external_label("OPENSSL_ia32cap_P"); # void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out); &function_begin("RC4"); &mov ($dat,&wparam(0)); # load key schedule pointer Loading Loading @@ -94,11 +173,48 @@ sub RC4_loop { &and ($ty,-4); # how many 4-byte chunks? 
&jz (&label("loop1")); &test ($ty,-8); &mov (&wparam(3),$out); # $out as accumulator in these loops &jz (&label("go4loop4")); &picmeup($out,"OPENSSL_ia32cap_P"); &bt (&DWP(0,$out),26); # check SSE2 bit [could have been MMX] &jnc (&label("go4loop4")); &mov ($out,&wparam(3)) if (!$alt); &movd ("mm7",&wparam(3)) if ($alt); &and ($ty,-8); &lea ($ty,&DWP(-8,$inp,$ty)); &mov (&wparam(2),$ty); &mov (&DWP(-4,$dat),$ty); # save input+(len/8)*8-8 &$RC4_loop_mmx(-1); &jmp(&label("loop_mmx_enter")); &set_label("loop_mmx",16); &$RC4_loop_mmx(0); &set_label("loop_mmx_enter"); for ($i=1;$i<8;$i++) { &$RC4_loop_mmx($i); } &cmp ($inp,&DWP(-4,$dat)); &lea ($inp,&DWP(8,$inp)); &jb (&label("loop_mmx")); &movd ($out,"mm7") if ($alt); &psllq ("mm1",56); &pxor ("mm2","mm1"); &movq (&QWP(-8,$out,$inp),"mm2"); &emms (); &cmp ($inp,&wparam(1)); # compare to input+len &je (&label("done")); &jmp (&label("loop1")); &set_label("go4loop4",16); &lea ($ty,&DWP(-4,$inp,$ty)); &mov (&wparam(2),$ty); # save input+(len/4)*4-4 &mov (&wparam(3),$out); # $out as accumulator in this loop &set_label("loop4",16); &set_label("loop4"); for ($i=0;$i<4;$i++) { RC4_loop($i); } &ror ($out,8); &xor ($out,&DWP(0,$inp)); Loading Loading @@ -164,8 +280,6 @@ $idi="ebp"; $ido="ecx"; $idx="edx"; &external_label("OPENSSL_ia32cap_P"); # void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data); &function_begin("RC4_set_key"); &mov ($out,&wparam(0)); # load key Loading Loading @@ -254,14 +368,21 @@ $idx="edx"; &blindpop("eax"); &lea ("eax",&DWP(&label("opts")."-".&label("pic_point"),"eax")); &picmeup("edx","OPENSSL_ia32cap_P"); &bt (&DWP(0,"edx"),20); &jnc (&label("skip")); &mov ("edx",&DWP(0,"edx")); &bt ("edx",20); &jc (&label("1xchar")); &bt ("edx",26); &jnc (&label("ret")); &add ("eax",25); &ret (); &set_label("1xchar"); &add ("eax",12); &set_label("skip"); &set_label("ret"); &ret (); &set_label("opts",64); &asciz ("rc4(4x,int)"); &asciz ("rc4(1x,char)"); &asciz ("rc4(8x,mmx)"); &asciz ("RC4 for x86, CRYPTOGAMS 
by <appro\@openssl.org>"); &align (64); &function_end_B("RC4_options"); Loading