Loading crypto/aes/asm/bsaes-x86_64.pl +32 −40 Original line number Diff line number Diff line Loading @@ -38,8 +38,8 @@ # Emilia's this(*) difference # # Core 2 9.30 8.69 +7% # Nehalem(**) 7.63 6.98 +9% # Atom 17.1 17.4 -2%(***) # Nehalem(**) 7.63 6.88 +11% # Atom 17.1 16.4 +4% # # (*) Comparison is not completely fair, because "this" is ECB, # i.e. no extra processing such as counter values calculation Loading @@ -50,14 +50,6 @@ # (**) Results were collected on Westmere, which is considered to # be equivalent to Nehalem for this code. # # (***) Slowdown on Atom is rather strange per se, because original # implementation has a number of 9+-bytes instructions, which # are bad for Atom front-end, and which I eliminated completely. # In attempt to address deterioration sbox() was tested in FP # SIMD "domain" (movaps instead of movdqa, xorps instead of # pxor, etc.). While it resulted in nominal 4% improvement on # Atom, it hurted Westmere by more than 2x factor. # # As for key schedule conversion subroutine. Interface to OpenSSL # relies on per-invocation on-the-fly conversion. This naturally # has impact on performance, especially for short inputs. Conversion Loading @@ -67,7 +59,7 @@ # conversion conversion/8x block # Core 2 240 0.22 # Nehalem 180 0.20 # Atom 430 0.19 # Atom 430 0.20 # # The ratio values mean that 128-byte blocks will be processed # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, Loading @@ -83,9 +75,9 @@ # Add decryption procedure. Performance in CPU cycles spent to decrypt # one byte out of 4096-byte buffer with 128-bit key is: # # Core 2 9.83 # Nehalem 7.74 # Atom 19.0 # Core 2 9.98 # Nehalem 7.80 # Atom 17.9 # # November 2011. # Loading Loading @@ -434,21 +426,21 @@ my $mask=pop; $code.=<<___; pxor 0x00($key),@x[0] pxor 0x10($key),@x[1] pshufb $mask,@x[0] pxor 0x20($key),@x[2] pshufb $mask,@x[1] pxor 0x30($key),@x[3] pshufb $mask,@x[2] pshufb $mask,@x[0] pshufb $mask,@x[1] pxor 0x40($key),@x[4] pshufb $mask,@x[3] pxor 0x50($key),@x[5] pshufb $mask,@x[4] pshufb $mask,@x[2] pshufb $mask,@x[3] pxor 0x60($key),@x[6] pshufb $mask,@x[5] pxor 0x70($key),@x[7] pshufb $mask,@x[4] pshufb $mask,@x[5] pshufb $mask,@x[6] lea 0x80($key),$key pshufb $mask,@x[7] lea 0x80($key),$key ___ } Loading Loading @@ -820,18 +812,18 @@ _bsaes_encrypt8: movdqa 0x50($const), @XMM[8] # .LM0SR pxor @XMM[9], @XMM[0] # xor with round0 key pxor @XMM[9], @XMM[1] pshufb @XMM[8], @XMM[0] pxor @XMM[9], @XMM[2] pshufb @XMM[8], @XMM[1] pxor @XMM[9], @XMM[3] pshufb @XMM[8], @XMM[2] pshufb @XMM[8], @XMM[0] pshufb @XMM[8], @XMM[1] pxor @XMM[9], @XMM[4] pshufb @XMM[8], @XMM[3] pxor @XMM[9], @XMM[5] pshufb @XMM[8], @XMM[4] pshufb @XMM[8], @XMM[2] pshufb @XMM[8], @XMM[3] pxor @XMM[9], @XMM[6] pshufb @XMM[8], @XMM[5] pxor @XMM[9], @XMM[7] pshufb @XMM[8], @XMM[4] pshufb @XMM[8], @XMM[5] pshufb @XMM[8], @XMM[6] pshufb @XMM[8], @XMM[7] _bsaes_encrypt8_bitslice: Loading Loading @@ -884,18 +876,18 @@ _bsaes_decrypt8: movdqa -0x30($const), @XMM[8] # .LM0ISR pxor @XMM[9], @XMM[0] # xor with round0 key pxor @XMM[9], @XMM[1] pshufb @XMM[8], @XMM[0] pxor @XMM[9], @XMM[2] pshufb @XMM[8], @XMM[1] pxor @XMM[9], @XMM[3] pshufb @XMM[8], @XMM[2] pshufb @XMM[8], @XMM[0] pshufb @XMM[8], @XMM[1] pxor @XMM[9], @XMM[4] pshufb @XMM[8], @XMM[3] pxor @XMM[9], @XMM[5] pshufb @XMM[8], @XMM[4] pshufb @XMM[8], @XMM[2] pshufb @XMM[8], @XMM[3] pxor @XMM[9], @XMM[6] pshufb @XMM[8], @XMM[5] pxor @XMM[9], @XMM[7] pshufb @XMM[8], @XMM[4] pshufb @XMM[8], @XMM[5] pshufb @XMM[8], @XMM[6] pshufb @XMM[8], @XMM[7] ___ Loading Loading @@ -1937,21 +1929,21 @@ $code.=<<___; movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR pxor @XMM[9], @XMM[0] # xor with round0 key pxor @XMM[9], @XMM[1] pshufb @XMM[8], @XMM[0] pxor @XMM[9], @XMM[2] pshufb @XMM[8], @XMM[1] pxor @XMM[9], @XMM[3] pshufb @XMM[8], @XMM[2] pshufb @XMM[8], @XMM[0] pshufb @XMM[8], @XMM[1] pxor @XMM[9], @XMM[4] pshufb @XMM[8], @XMM[3] pxor @XMM[9], @XMM[5] pshufb @XMM[8], @XMM[4] pshufb @XMM[8], @XMM[2] pshufb @XMM[8], @XMM[3] pxor @XMM[9], @XMM[6] pshufb @XMM[8], @XMM[5] pxor @XMM[9], @XMM[7] pshufb @XMM[8], @XMM[4] pshufb @XMM[8], @XMM[5] pshufb @XMM[8], @XMM[6] lea .LBS0(%rip), %r11 # constants table pshufb @XMM[8], @XMM[7] lea .LBS0(%rip), %r11 # constants table mov %ebx,%r10d # pass rounds call _bsaes_encrypt8_bitslice Loading Loading
crypto/aes/asm/bsaes-x86_64.pl +32 −40 Original line number Diff line number Diff line Loading @@ -38,8 +38,8 @@ # Emilia's this(*) difference # # Core 2 9.30 8.69 +7% # Nehalem(**) 7.63 6.98 +9% # Atom 17.1 17.4 -2%(***) # Nehalem(**) 7.63 6.88 +11% # Atom 17.1 16.4 +4% # # (*) Comparison is not completely fair, because "this" is ECB, # i.e. no extra processing such as counter values calculation Loading @@ -50,14 +50,6 @@ # (**) Results were collected on Westmere, which is considered to # be equivalent to Nehalem for this code. # # (***) Slowdown on Atom is rather strange per se, because original # implementation has a number of 9+-bytes instructions, which # are bad for Atom front-end, and which I eliminated completely. # In attempt to address deterioration sbox() was tested in FP # SIMD "domain" (movaps instead of movdqa, xorps instead of # pxor, etc.). While it resulted in nominal 4% improvement on # Atom, it hurted Westmere by more than 2x factor. # # As for key schedule conversion subroutine. Interface to OpenSSL # relies on per-invocation on-the-fly conversion. This naturally # has impact on performance, especially for short inputs. Conversion Loading @@ -67,7 +59,7 @@ # conversion conversion/8x block # Core 2 240 0.22 # Nehalem 180 0.20 # Atom 430 0.19 # Atom 430 0.20 # # The ratio values mean that 128-byte blocks will be processed # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, Loading @@ -83,9 +75,9 @@ # Add decryption procedure. Performance in CPU cycles spent to decrypt # one byte out of 4096-byte buffer with 128-bit key is: # # Core 2 9.83 # Nehalem 7.74 # Atom 19.0 # Core 2 9.98 # Nehalem 7.80 # Atom 17.9 # # November 2011. # Loading Loading @@ -434,21 +426,21 @@ my $mask=pop; $code.=<<___; pxor 0x00($key),@x[0] pxor 0x10($key),@x[1] pshufb $mask,@x[0] pxor 0x20($key),@x[2] pshufb $mask,@x[1] pxor 0x30($key),@x[3] pshufb $mask,@x[2] pshufb $mask,@x[0] pshufb $mask,@x[1] pxor 0x40($key),@x[4] pshufb $mask,@x[3] pxor 0x50($key),@x[5] pshufb $mask,@x[4] pshufb $mask,@x[2] pshufb $mask,@x[3] pxor 0x60($key),@x[6] pshufb $mask,@x[5] pxor 0x70($key),@x[7] pshufb $mask,@x[4] pshufb $mask,@x[5] pshufb $mask,@x[6] lea 0x80($key),$key pshufb $mask,@x[7] lea 0x80($key),$key ___ } Loading Loading @@ -820,18 +812,18 @@ _bsaes_encrypt8: movdqa 0x50($const), @XMM[8] # .LM0SR pxor @XMM[9], @XMM[0] # xor with round0 key pxor @XMM[9], @XMM[1] pshufb @XMM[8], @XMM[0] pxor @XMM[9], @XMM[2] pshufb @XMM[8], @XMM[1] pxor @XMM[9], @XMM[3] pshufb @XMM[8], @XMM[2] pshufb @XMM[8], @XMM[0] pshufb @XMM[8], @XMM[1] pxor @XMM[9], @XMM[4] pshufb @XMM[8], @XMM[3] pxor @XMM[9], @XMM[5] pshufb @XMM[8], @XMM[4] pshufb @XMM[8], @XMM[2] pshufb @XMM[8], @XMM[3] pxor @XMM[9], @XMM[6] pshufb @XMM[8], @XMM[5] pxor @XMM[9], @XMM[7] pshufb @XMM[8], @XMM[4] pshufb @XMM[8], @XMM[5] pshufb @XMM[8], @XMM[6] pshufb @XMM[8], @XMM[7] _bsaes_encrypt8_bitslice: Loading Loading @@ -884,18 +876,18 @@ _bsaes_decrypt8: movdqa -0x30($const), @XMM[8] # .LM0ISR pxor @XMM[9], @XMM[0] # xor with round0 key pxor @XMM[9], @XMM[1] pshufb @XMM[8], @XMM[0] pxor @XMM[9], @XMM[2] pshufb @XMM[8], @XMM[1] pxor @XMM[9], @XMM[3] pshufb @XMM[8], @XMM[2] pshufb @XMM[8], @XMM[0] pshufb @XMM[8], @XMM[1] pxor @XMM[9], @XMM[4] pshufb @XMM[8], @XMM[3] pxor @XMM[9], @XMM[5] pshufb @XMM[8], @XMM[4] pshufb @XMM[8], @XMM[2] pshufb @XMM[8], @XMM[3] pxor @XMM[9], @XMM[6] pshufb @XMM[8], @XMM[5] pxor @XMM[9], @XMM[7] pshufb @XMM[8], @XMM[4] pshufb @XMM[8], @XMM[5] pshufb @XMM[8], @XMM[6] pshufb @XMM[8], @XMM[7] ___ Loading Loading @@ -1937,21 +1929,21 @@ $code.=<<___; movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR pxor @XMM[9], @XMM[0] # xor with round0 key pxor @XMM[9], @XMM[1] pshufb @XMM[8], @XMM[0] pxor @XMM[9], @XMM[2] pshufb @XMM[8], @XMM[1] pxor @XMM[9], @XMM[3] pshufb @XMM[8], @XMM[2] pshufb @XMM[8], @XMM[0] pshufb @XMM[8], @XMM[1] pxor @XMM[9], @XMM[4] pshufb @XMM[8], @XMM[3] pxor @XMM[9], @XMM[5] pshufb @XMM[8], @XMM[4] pshufb @XMM[8], @XMM[2] pshufb @XMM[8], @XMM[3] pxor @XMM[9], @XMM[6] pshufb @XMM[8], @XMM[5] pxor @XMM[9], @XMM[7] pshufb @XMM[8], @XMM[4] pshufb @XMM[8], @XMM[5] pshufb @XMM[8], @XMM[6] lea .LBS0(%rip), %r11 # constants table pshufb @XMM[8], @XMM[7] lea .LBS0(%rip), %r11 # constants table mov %ebx,%r10d # pass rounds call _bsaes_encrypt8_bitslice Loading