Commit b7b46c9a authored by Andy Polyakov
Browse files

Add 0.9.7 specific comments to RC4 assembler modules.

parent e6e1f4cb
Loading
Loading
Loading
Loading
+10 −5
Original line number Diff line number Diff line
#!/usr/local/bin/perl

# At some point it became apparent that the original SSLeay RC4
# assembler implementation performs suboptimal on latest IA-32
# assembler implementation performs suboptimally on latest IA-32
# microarchitectures. After re-tuning performance has changed as
# following:
#
@@ -15,10 +15,12 @@
#	In other words code performing further 13% faster on AMD
#	would perform almost 2 times slower on Intel PIII...
#	For reference! This code delivers ~80% of rc4-amd64.pl
#	performance on same Opteron machine.
#	performance on the same Opteron machine.
# (**)	This number requires compressed key schedule set up by
#	RC4_set_key, see commentary section in rc4_skey.c for
#	further details.
#	RC4_set_key and therefore doesn't apply to 0.9.7 [option for
#	compressed key schedule is implemented in 0.9.8 and later,
#	see commentary section in rc4_skey.c for further details].
#
#					<appro@fy.chalmers.se>

push(@INC,"perlasm","../../perlasm");
@@ -130,6 +132,8 @@ sub RC4
	 &add(	$d,	8);

	# detect compressed schedule, see commentary section in rc4_skey.c...
	# in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant,
	# as compressed key schedule is set up in 0.9.8 and later.
	&cmp(&DWP(256,$d),-1);
	&je(&label("RC4_CHAR"));

@@ -190,7 +194,8 @@ sub RC4
	&jmp(&label("finished"));

	&align(16);
	# this is essentially Intel P4 specific codepath, see rc4_skey.c...
	# this is essentially Intel P4 specific codepath, see rc4_skey.c,
	# and is engaged in 0.9.8 and later context...
	&set_label("RC4_CHAR");

	&lea	($ty,&DWP(0,$in,$ty));
+3 −1
Original line number Diff line number Diff line
@@ -30,7 +30,9 @@
# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
# compose blended code, which would perform even within 30% margin
# on both AMD and Intel platforms, I implement both cases. See
# rc4_skey.c for further details...
# rc4_skey.c for further details... This applies to 0.9.8 and later.
# In 0.9.7 context RC4_CHAR codepath is never engaged and ~70 bytes
# of code remain redundant.

$output=shift;

+1 −1
Original line number Diff line number Diff line
@@ -18,7 +18,7 @@
// to input and output streams. Secondly, less obvious, it's possible
// to pull up some references to elements of the key schedule itself.
// Fact is that such prior loads are not safe only for "degenerated"
// key schedule, when all elements equal to the same value, which is
// key schedule, when some elements are equal to the same value, which is
// never the case [key schedule setup routine makes sure it's not].
// Furthermore. In order to compress loop body to the minimum, I chose
// to deploy deposit instruction, which substitutes for the whole