Loading crypto/aes/asm/aes-586.pl +29 −16 Original line number Diff line number Diff line Loading @@ -6,7 +6,7 @@ # forms are granted according to the OpenSSL license. # ==================================================================== # # Version 3.0. # Version 3.1. # # You might fail to appreciate this module performance from the first # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered Loading Loading @@ -46,23 +46,27 @@ # Instruction Level Parallelism, and it indeed resulted in up to 15% # better performance on most recent µ-archs... # # Current ECB performance numbers for 128-bit key in cycles per byte # [measure commonly used by AES benchmarkers] are: # Current ECB performance numbers for 128-bit key in CPU cycles per # processed byte [measure commonly used by AES benchmarkers] are: # # small footprint fully unrolled # P4[-3] 23[24] 22[23] # AMD K8 19 18 # PIII 26(*) 23 # PIII 26 23 # Pentium 63(*) 52 # # (*) Performance difference between small footprint code and fully # unrolled in more commonly used CBC mode is not as big, 7% for # PIII and 15% for Pentium, which I consider tolerable. # unrolled in more commonly used CBC mode is not as big, 4% for # for Pentium. PIII's ~13% difference [in both cases in 3rd # version] is considered tolerable... # # Third version adds AES_cbc_encrypt implementation, which resulted in # up to 40% performance imrovement of CBC benchmark results [on most # recent -archs]. CBC performance is virtually as good as ECB now and # sometimes even better, because function prologues and epilogues are # up to 40% performance imrovement of CBC benchmark results. 40% was # observed on P4 core, where "overall" imrovement coefficient, i.e. if # compared to PIC generated by GCC and in CBC mode, was observed to be # as large as 4x:-) CBC performance is virtually identical to ECB now # and on some platforms even better, e.g. 56 "small" cycles/byte on # senior Pentium, because certain function prologues and epilogues are # effectively taken out of the loop... push(@INC,"perlasm","../../perlasm"); Loading @@ -79,8 +83,9 @@ $acc="esi"; $small_footprint=1; # $small_footprint=1 code is ~5% slower [on # recent µ-archs], but ~5 times smaller! # I favor compact code, because it minimizes # cache contention... # I favor compact code to minimize cache # contention and in hope to "collect" 5% back # in real-life applications... $vertical_spin=0; # shift "verticaly" defaults to 0, because of # its proof-of-concept status... Loading Loading @@ -1296,12 +1301,18 @@ sub declast() &push ($key eq "edi" ? $key : ""); # push ivp &pushf (); &mov ($key,&wparam(1)); # load out &xor ($s0,$s0); &mov (&DWP(0,$key),$s0); # zero output &mov (&DWP(4,$key),$s0); &mov (&DWP(8,$key),$s0); &mov (&DWP(12,$key),$s0); &mov ($s1,16); &sub ($s1,$s2); &cmp ($key,$acc); # compare with inp &je (&label("enc_in_place")); &data_word(0x90A4F3FC); # cld; rep movsb; nop # copy input &jmp (&label("enc_skip_in_place")); &set_label("enc_in_place"); &lea ($key,&DWP(0,$key,$s2)); &set_label("enc_skip_in_place"); &mov ($s2,$s1); &xor ($s0,$s0); &data_word(0x90AAF3FC); # cld; rep stosb; nop # zero tail &popf (); &pop ($key); # pop ivp Loading Loading @@ -1456,6 +1467,8 @@ sub declast() &pushf (); &data_word(0x90A4F3FC); # cld; rep movsb; nop # restore tail &popf (); &align (4); &set_label("dec_out"); &stack_pop(5); &function_end("AES_cbc_encrypt"); Loading Loading
crypto/aes/asm/aes-586.pl +29 −16 Original line number Diff line number Diff line Loading @@ -6,7 +6,7 @@ # forms are granted according to the OpenSSL license. # ==================================================================== # # Version 3.0. # Version 3.1. # # You might fail to appreciate this module performance from the first # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered Loading Loading @@ -46,23 +46,27 @@ # Instruction Level Parallelism, and it indeed resulted in up to 15% # better performance on most recent µ-archs... # # Current ECB performance numbers for 128-bit key in cycles per byte # [measure commonly used by AES benchmarkers] are: # Current ECB performance numbers for 128-bit key in CPU cycles per # processed byte [measure commonly used by AES benchmarkers] are: # # small footprint fully unrolled # P4[-3] 23[24] 22[23] # AMD K8 19 18 # PIII 26(*) 23 # PIII 26 23 # Pentium 63(*) 52 # # (*) Performance difference between small footprint code and fully # unrolled in more commonly used CBC mode is not as big, 7% for # PIII and 15% for Pentium, which I consider tolerable. # unrolled in more commonly used CBC mode is not as big, 4% for # for Pentium. PIII's ~13% difference [in both cases in 3rd # version] is considered tolerable... # # Third version adds AES_cbc_encrypt implementation, which resulted in # up to 40% performance imrovement of CBC benchmark results [on most # recent -archs]. CBC performance is virtually as good as ECB now and # sometimes even better, because function prologues and epilogues are # up to 40% performance imrovement of CBC benchmark results. 40% was # observed on P4 core, where "overall" imrovement coefficient, i.e. if # compared to PIC generated by GCC and in CBC mode, was observed to be # as large as 4x:-) CBC performance is virtually identical to ECB now # and on some platforms even better, e.g. 56 "small" cycles/byte on # senior Pentium, because certain function prologues and epilogues are # effectively taken out of the loop... push(@INC,"perlasm","../../perlasm"); Loading @@ -79,8 +83,9 @@ $acc="esi"; $small_footprint=1; # $small_footprint=1 code is ~5% slower [on # recent µ-archs], but ~5 times smaller! # I favor compact code, because it minimizes # cache contention... # I favor compact code to minimize cache # contention and in hope to "collect" 5% back # in real-life applications... $vertical_spin=0; # shift "verticaly" defaults to 0, because of # its proof-of-concept status... Loading Loading @@ -1296,12 +1301,18 @@ sub declast() &push ($key eq "edi" ? $key : ""); # push ivp &pushf (); &mov ($key,&wparam(1)); # load out &xor ($s0,$s0); &mov (&DWP(0,$key),$s0); # zero output &mov (&DWP(4,$key),$s0); &mov (&DWP(8,$key),$s0); &mov (&DWP(12,$key),$s0); &mov ($s1,16); &sub ($s1,$s2); &cmp ($key,$acc); # compare with inp &je (&label("enc_in_place")); &data_word(0x90A4F3FC); # cld; rep movsb; nop # copy input &jmp (&label("enc_skip_in_place")); &set_label("enc_in_place"); &lea ($key,&DWP(0,$key,$s2)); &set_label("enc_skip_in_place"); &mov ($s2,$s1); &xor ($s0,$s0); &data_word(0x90AAF3FC); # cld; rep stosb; nop # zero tail &popf (); &pop ($key); # pop ivp Loading Loading @@ -1456,6 +1467,8 @@ sub declast() &pushf (); &data_word(0x90A4F3FC); # cld; rep movsb; nop # restore tail &popf (); &align (4); &set_label("dec_out"); &stack_pop(5); &function_end("AES_cbc_encrypt"); Loading