Loading crypto/rc4/asm/rc4-586.pl +10 −5 Original line number Diff line number Diff line #!/usr/local/bin/perl # At some point it became apparent that the original SSLeay RC4 # assembler implementation performs suboptimal on latest IA-32 # assembler implementation performs suboptimally on latest IA-32 # microarchitectures. After re-tuning performance has changed as # following: # Loading @@ -15,10 +15,12 @@ # In other words code performing further 13% faster on AMD # would perform almost 2 times slower on Intel PIII... # For reference! This code delivers ~80% of rc4-amd64.pl # performance on same Opteron machine. # performance on the same Opteron machine. # (**) This number requires compressed key schedule set up by # RC4_set_key, see commentary section in rc4_skey.c for # further details. # RC4_set_key and therefore doesn't apply to 0.9.7 [option for # compressed key schedule is implemented in 0.9.8 and later, # see commentary section in rc4_skey.c for further details]. # # <appro@fy.chalmers.se> push(@INC,"perlasm","../../perlasm"); Loading Loading @@ -130,6 +132,8 @@ sub RC4 &add( $d, 8); # detect compressed schedule, see commentary section in rc4_skey.c... # in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant, # as compressed key schedule is set up in 0.9.8 and later. &cmp(&DWP(256,$d),-1); &je(&label("RC4_CHAR")); Loading Loading @@ -190,7 +194,8 @@ sub RC4 &jmp(&label("finished")); &align(16); # this is essentially Intel P4 specific codepath, see rc4_skey.c... # this is essentially Intel P4 specific codepath, see rc4_skey.c, # and is engaged in 0.9.8 and later context... &set_label("RC4_CHAR"); &lea ($ty,&DWP(0,$in,$ty)); Loading crypto/rc4/asm/rc4-amd64.pl +3 −1 Original line number Diff line number Diff line Loading @@ -30,7 +30,9 @@ # RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to # compose blended code, which would perform even within a 30% margin # on either AMD or Intel platforms, I implement both cases.
See # rc4_skey.c for further details... # rc4_skey.c for further details... This applies to 0.9.8 and later. # In 0.9.7 context RC4_CHAR codepath is never engaged and ~70 bytes # of code remain redundant. $output=shift; Loading crypto/rc4/asm/rc4-ia64.S +1 −1 Original line number Diff line number Diff line Loading @@ -18,7 +18,7 @@ // to input and output streams. Secondly, less obvious, it's possible // to pull up some references to elements of the key schedule itself. // Fact is that such prior loads are not safe only for "degenerated" // key schedule, when all elements equal to the same value, which is // key schedule, when some elements equal to the same value, which is // never the case [key schedule setup routine makes sure it's not]. // Furthermore. In order to compress loop body to the minimum, I chose // to deploy deposit instruction, which substitutes for the whole Loading Loading
crypto/rc4/asm/rc4-586.pl +10 −5 Original line number Diff line number Diff line #!/usr/local/bin/perl # At some point it became apparent that the original SSLeay RC4 # assembler implementation performs suboptimal on latest IA-32 # assembler implementation performs suboptimally on latest IA-32 # microarchitectures. After re-tuning performance has changed as # following: # Loading @@ -15,10 +15,12 @@ # In other words code performing further 13% faster on AMD # would perform almost 2 times slower on Intel PIII... # For reference! This code delivers ~80% of rc4-amd64.pl # performance on same Opteron machine. # performance on the same Opteron machine. # (**) This number requires compressed key schedule set up by # RC4_set_key, see commentary section in rc4_skey.c for # further details. # RC4_set_key and therefore doesn't apply to 0.9.7 [option for # compressed key schedule is implemented in 0.9.8 and later, # see commentary section in rc4_skey.c for further details]. # # <appro@fy.chalmers.se> push(@INC,"perlasm","../../perlasm"); Loading Loading @@ -130,6 +132,8 @@ sub RC4 &add( $d, 8); # detect compressed schedule, see commentary section in rc4_skey.c... # in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant, # as compressed key schedule is set up in 0.9.8 and later. &cmp(&DWP(256,$d),-1); &je(&label("RC4_CHAR")); Loading Loading @@ -190,7 +194,8 @@ sub RC4 &jmp(&label("finished")); &align(16); # this is essentially Intel P4 specific codepath, see rc4_skey.c... # this is essentially Intel P4 specific codepath, see rc4_skey.c, # and is engaged in 0.9.8 and later context... &set_label("RC4_CHAR"); &lea ($ty,&DWP(0,$in,$ty)); Loading
crypto/rc4/asm/rc4-amd64.pl +3 −1 Original line number Diff line number Diff line Loading @@ -30,7 +30,9 @@ # RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to # compose blended code, which would perform even within a 30% margin # on either AMD or Intel platforms, I implement both cases. See # rc4_skey.c for further details... # rc4_skey.c for further details... This applies to 0.9.8 and later. # In 0.9.7 context RC4_CHAR codepath is never engaged and ~70 bytes # of code remain redundant. $output=shift; Loading
crypto/rc4/asm/rc4-ia64.S +1 −1 Original line number Diff line number Diff line Loading @@ -18,7 +18,7 @@ // to input and output streams. Secondly, less obvious, it's possible // to pull up some references to elements of the key schedule itself. // Fact is that such prior loads are not safe only for "degenerated" // key schedule, when all elements equal to the same value, which is // key schedule, when some elements equal to the same value, which is // never the case [key schedule setup routine makes sure it's not]. // Furthermore. In order to compress loop body to the minimum, I chose // to deploy deposit instruction, which substitutes for the whole Loading