Commit 80451542 authored by Andy Polyakov
Browse files

+20% performance improvement of P4-specific RC4_CHAR loop.

parent 81a86fcf
Loading
Loading
Loading
Loading
+6 −5
Original line number Diff line number Diff line
@@ -200,22 +200,23 @@ sub RC4

	&lea	($ty,&DWP(0,$in,$ty));
	&mov	(&swtmp(2),$ty);
	&movz	($tx,&BP(0,$d,$x));

	# strangely enough unrolled loop performs over 20% slower...
	&set_label("RC4_CHAR_loop");
		&movz	($tx,&BP(0,$d,$x));
		&add	(&LB($y),&LB($tx));
		&movz	($ty,&BP(0,$d,$y));
		&movb	(&BP(0,$d,$y),&LB($tx));
		&movb	(&BP(0,$d,$x),&LB($ty));
		&add	(&LB($ty),&LB($tx));
		&movz	($ty,&BP(0,$d,$ty));
		&add	(&LB($x),1);
		&xorb	(&LB($ty),&BP(0,$in));
		&movb	(&BP(0,$out),&LB($ty));
		&inc	(&LB($x));
		&inc	($in);
		&inc	($out);
		&lea	($in,&BP(1,$in));
		&movz	($tx,&BP(0,$d,$x));
		&cmp	($in,&swtmp(2));
		&movb	(&BP(0,$out),&LB($ty));
		&lea	($out,&BP(1,$out));
	&jb	(&label("RC4_CHAR_loop"));

	&set_label("finished");