Add 0.9.7 specific comments to RC4 assembler modules. (b7b46c9a) · Commits · CYBER - Cyber Security / TS 103 523 MSP / TLMSP / TLMSP OpenSSL

crypto/rc4/asm/rc4-586.pl

+10 −5

Original line number	Diff line number	Diff line
		#!/usr/local/bin/perl

		# At some point it became apparent that the original SSLeay RC4
		# assembler implementation performs suboptimal on latest IA-32
		# assembler implementation performs suboptimally on latest IA-32
		# microarchitectures. After re-tuning, performance has changed as
		# follows:
		#
		@@ -15,10 +15,12 @@
		# In other words code performing further 13% faster on AMD
		# would perform almost 2 times slower on Intel PIII...
		# For reference! This code delivers ~80% of rc4-amd64.pl
		# performance on same Opteron machine.
		# performance on the same Opteron machine.
		# (**) This number requires compressed key schedule set up by
		# RC4_set_key, see commentary section in rc4_skey.c for
		# further details.
		# RC4_set_key and therefore doesn't apply to 0.9.7 [option for
		# compressed key schedule is implemented in 0.9.8 and later,
		# see commentary section in rc4_skey.c for further details].
		#
		# <appro@fy.chalmers.se>

		push(@INC,"perlasm","../../perlasm");
		@@ -130,6 +132,8 @@ sub RC4
		&add( $d, 8);

		# detect compressed schedule, see commentary section in rc4_skey.c...
		# in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant,
		# as compressed key schedule is set up in 0.9.8 and later.
		&cmp(&DWP(256,$d),-1);
		&je(&label("RC4_CHAR"));

		@@ -190,7 +194,8 @@ sub RC4
		&jmp(&label("finished"));

		&align(16);
		# this is essentially Intel P4 specific codepath, see rc4_skey.c...
		# this is essentially Intel P4 specific codepath, see rc4_skey.c,
		# and is engaged in 0.9.8 and later context...
		&set_label("RC4_CHAR");

		&lea ($ty,&DWP(0,$in,$ty));

crypto/rc4/asm/rc4-amd64.pl

+3 −1

Original line number	Diff line number	Diff line
		@@ -30,7 +30,9 @@
		# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
		# compose blended code, which would perform even within a 30% margin
		# on both AMD and Intel platforms, I implement both cases. See
		# rc4_skey.c for further details...
		# rc4_skey.c for further details... This applies to 0.9.8 and later.
		# In 0.9.7 context RC4_CHAR codepath is never engaged and ~70 bytes
		# of code remain redundant.

		$output=shift;

crypto/rc4/asm/rc4-ia64.S

+1 −1

Original line number	Diff line number	Diff line
		@@ -18,7 +18,7 @@
		// to input and output streams. Secondly, less obvious, it's possible
		// to pull up some references to elements of the key schedule itself.
		// Fact is that such prior loads are not safe only for "degenerated"
		// key schedule, when all elements equal to the same value, which is
		// key schedule, when some elements equal to the same value, which is
		// never the case [key schedule setup routine makes sure it's not].
		// Furthermore. In order to compress loop body to the minimum, I chose
		// to deploy deposit instruction, which substitutes for the whole