Bug-fix in CBC encrypt tail processing and commentary section update. (bac252a5) · Commits · CYBER - Cyber Security / TS 103 523 MSP / TLMSP / TLMSP OpenSSL

crypto/aes/asm/aes-586.pl

+29 −16

Original line number	Diff line number	Diff line
		@@ -6,7 +6,7 @@
		# forms are granted according to the OpenSSL license.
		# ====================================================================
		#
		# Version 3.0.
		# Version 3.1.
		#
		# You might fail to appreciate this module's performance from the first
		# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
		@@ -46,23 +46,27 @@
		# Instruction Level Parallelism, and it indeed resulted in up to 15%
		# better performance on most recent µ-archs...
		#
		# Current ECB performance numbers for 128-bit key in cycles per byte
		# [measure commonly used by AES benchmarkers] are:
		# Current ECB performance numbers for 128-bit key in CPU cycles per
		# processed byte [measure commonly used by AES benchmarkers] are:
		#
		# small footprint fully unrolled
		# P4[-3] 23[24] 22[23]
		# AMD K8 19 18
		# PIII 26(*) 23
		# PIII 26 23
		# Pentium 63(*) 52
		#
		# (*) Performance difference between small footprint code and fully
		# unrolled in more commonly used CBC mode is not as big, 7% for
		# PIII and 15% for Pentium, which I consider tolerable.
		# unrolled in more commonly used CBC mode is not as big, 4% for
		# Pentium. PIII's ~13% difference [in both cases in 3rd
		# version] is considered tolerable...
		#
		# Third version adds AES_cbc_encrypt implementation, which resulted in
		# up to 40% performance improvement of CBC benchmark results [on most
		# recent µ-archs]. CBC performance is virtually as good as ECB now and
		# sometimes even better, because function prologues and epilogues are
		# up to 40% performance improvement of CBC benchmark results. 40% was
		# observed on P4 core, where "overall" improvement coefficient, i.e. if
		# compared to PIC generated by GCC and in CBC mode, was observed to be
		# as large as 4x:-) CBC performance is virtually identical to ECB now
		# and on some platforms even better, e.g. 56 "small" cycles/byte on
		# senior Pentium, because certain function prologues and epilogues are
		# effectively taken out of the loop...

		push(@INC,"perlasm","../../perlasm");
		@@ -79,8 +83,9 @@ $acc="esi";

		$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
		# recent µ-archs], but ~5 times smaller!
		# I favor compact code, because it minimizes
		# cache contention...
		# I favor compact code to minimize cache
		# contention and in hope to "collect" 5% back
		# in real-life applications...
		$vertical_spin=0; # shift "vertically" defaults to 0, because of
		# its proof-of-concept status...

		@@ -1296,12 +1301,18 @@ sub declast()
		&push ($key eq "edi" ? $key : ""); # push ivp
		&pushf ();
		&mov ($key,&wparam(1)); # load out
		&xor ($s0,$s0);
		&mov (&DWP(0,$key),$s0); # zero output
		&mov (&DWP(4,$key),$s0);
		&mov (&DWP(8,$key),$s0);
		&mov (&DWP(12,$key),$s0);
		&mov ($s1,16);
		&sub ($s1,$s2);
		&cmp ($key,$acc); # compare with inp
		&je (&label("enc_in_place"));
		&data_word(0x90A4F3FC); # cld; rep movsb; nop # copy input
		&jmp (&label("enc_skip_in_place"));
		&set_label("enc_in_place");
		&lea ($key,&DWP(0,$key,$s2));
		&set_label("enc_skip_in_place");
		&mov ($s2,$s1);
		&xor ($s0,$s0);
		&data_word(0x90AAF3FC); # cld; rep stosb; nop # zero tail
		&popf ();
		&pop ($key); # pop ivp

		@@ -1456,6 +1467,8 @@ sub declast()
		&pushf ();
		&data_word(0x90A4F3FC); # cld; rep movsb; nop # restore tail
		&popf ();

		&align (4);
		&set_label("dec_out");
		&stack_pop(5);
		&function_end("AES_cbc_encrypt");