Commit 629fd3aa authored by Andy Polyakov's avatar Andy Polyakov
Browse files

rc4-x86_64.pl: "Westmere" optimization.

parent 1aa8a629
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line
@@ -58,6 +58,10 @@
# fit for Core2 and therefore the code was modified to skip cloop8 on
# this CPU.

# Intel Westmere was observed to perform suboptimally. Adding yet
# another movzb to cloop1 improved performance by almost 50%! Core2
# performance is improved too, but nominally...

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -229,6 +233,7 @@ $code.=<<___;
.align	16
.Lcloop1:
	add	$TX[0]#b,$YY#b
	movzb	$YY#b,$YY#d
	movzb	($dat,$YY),$TY#d
	movb	$TX[0]#b,($dat,$YY)
	movb	$TY#b,($dat,$XX[0])