Loading engines/asm/e_padlock-x86.pl +32 −13 Original line number Diff line number Diff line Loading @@ -15,14 +15,21 @@ # mode and ~75% in CBC mode. For aligned data improvement can be # observed for short inputs only, e.g. 45% for 64-byte messages in # ECB mode, 20% in CBC. Difference in performance for aligned vs. # misaligned data depends on misalignment and is either ~1.8x or # ~2.9x. These are approximately same factors as for hardware support, # so there is little reason to rely on the latter. It might actually # hurt performance in mixture of aligned and misaligned buffers, # because a) if you choose to flip 'align' flag on per-buffer basis, # then you'd have to reload key context; b) if you choose to set # 'align' flag permanently, it limits performance for aligned data # to ~1/2. All results were collected on 1.5GHz C7. # misaligned data depends on misalignment and is either ~1.8x or ~2.9x. # These are approximately same factors as for hardware support, so # there is little reason to rely on the latter. On the contrary, it # might actually hurt performance in mixture of aligned and misaligned # buffers, because a) if you choose to flip 'align' flag in control # word on per-buffer basis, then you'd have to reload key context, # which incurs penalty; b) if you choose to set 'align' flag # permanently, it limits performance even for aligned data to ~1/2. # All above mentioned results were collected on 1.5GHz C7. Nano on the # other hand handles unaligned data more gracefully. Depending on # algorithm and how unaligned data is, hardware can be up to 70% more # efficient than below software alignment procedures, nor does 'align' # flag have any effect on aligned performance [if it has any meaning at all]. # Therefore suggestion is to unconditionally set 'align' flag on Nano # for optimal performance. 
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../crypto/perlasm"); Loading Loading @@ -362,7 +369,7 @@ my ($mode,$opcode) = @_; &ret (); &function_end_B("padlock_sha1_oneshot"); &function_begin_B("padlock_sha1"); &function_begin_B("padlock_sha1_blocks"); &push ("edi"); &push ("esi"); &mov ("eax",-1); Loading @@ -373,7 +380,7 @@ my ($mode,$opcode) = @_; &pop ("esi"); &pop ("edi"); &ret (); &function_end_B("padlock_sha1"); &function_end_B("padlock_sha1_blocks"); &function_begin_B("padlock_sha256_oneshot"); &push ("edi"); Loading @@ -397,7 +404,7 @@ my ($mode,$opcode) = @_; &ret (); &function_end_B("padlock_sha256_oneshot"); &function_begin_B("padlock_sha256"); &function_begin_B("padlock_sha256_blocks"); &push ("edi"); &push ("esi"); &mov ("eax",-1); Loading @@ -408,7 +415,19 @@ my ($mode,$opcode) = @_; &pop ("esi"); &pop ("edi"); &ret (); &function_end_B("padlock_sha256"); &function_end_B("padlock_sha256_blocks"); &function_begin_B("padlock_sha512_blocks"); &push ("edi"); &push ("esi"); &mov ("edi",&wparam(0)); &mov ("esi",&wparam(1)); &mov ("ecx",&wparam(2)); &data_byte(0xf3,0x0f,0xa6,0xe0); # rep xsha512 &pop ("esi"); &pop ("edi"); &ret (); &function_end_B("padlock_sha512_blocks"); &asciz ("VIA Padlock x86 module, CRYPTOGAMS by <appro\@openssl.org>"); &align (16); Loading @@ -417,7 +436,7 @@ my ($mode,$opcode) = @_; # Essentially this variable belongs in thread local storage. # Having this variable global on the other hand can only cause # few bogus key reloads [if any at all on single-CPU system], # so we accept the panalty... # so we accept the penalty... 
&set_label("padlock_saved_context",4); &data_word(0); Loading engines/asm/e_padlock-x86_64.pl +17 −8 Original line number Diff line number Diff line Loading @@ -151,15 +151,15 @@ padlock_sha1_oneshot: ret .size padlock_sha1_oneshot,.-padlock_sha1_oneshot .globl padlock_sha1 .type padlock_sha1,\@function,3 .globl padlock_sha1_blocks .type padlock_sha1_blocks,\@function,3 .align 16 padlock_sha1: padlock_sha1_blocks: mov \$-1,%rax mov %rdx,%rcx .byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1 ret .size padlock_sha1,.-padlock_sha1 .size padlock_sha1_blocks,.-padlock_sha1_blocks .globl padlock_sha256_oneshot .type padlock_sha256_oneshot,\@function,3 Loading @@ -171,15 +171,23 @@ padlock_sha256_oneshot: ret .size padlock_sha256_oneshot,.-padlock_sha256_oneshot .globl padlock_sha256 .type padlock_sha256,\@function,3 .globl padlock_sha256_blocks .type padlock_sha256_blocks,\@function,3 .align 16 padlock_sha256: padlock_sha256_blocks: mov \$-1,%rax mov %rdx,%rcx .byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256 ret .size padlock_sha256,.-padlock_sha256 .size padlock_sha256_blocks,.-padlock_sha256_blocks .globl padlock_sha512_blocks,\@function,3 .align 16 padlock_sha512_blocks: mov %rdx,%rcx .byte 0xf3,0x0f,0xa6,0xe0 # rep xsha512 ret .size padlock_sha512_blocks,.-padlock_sha512_blocks ___ sub generate_mode { Loading Loading @@ -207,6 +215,7 @@ padlock_${mode}_encrypt: xor %eax,%eax xor %ebx,%ebx testl \$`1<<5`,($ctx) # align bit in control word jnz .L${mode}_aligned test \$0x0f,$out setz %al # !out_misaligned test \$0x0f,$inp Loading Loading
engines/asm/e_padlock-x86.pl +32 −13 Original line number Diff line number Diff line Loading @@ -15,14 +15,21 @@ # mode and ~75% in CBC mode. For aligned data improvement can be # observed for short inputs only, e.g. 45% for 64-byte messages in # ECB mode, 20% in CBC. Difference in performance for aligned vs. # misaligned data depends on misalignment and is either ~1.8x or # ~2.9x. These are approximately same factors as for hardware support, # so there is little reason to rely on the latter. It might actually # hurt performance in mixture of aligned and misaligned buffers, # because a) if you choose to flip 'align' flag on per-buffer basis, # then you'd have to reload key context; b) if you choose to set # 'align' flag permanently, it limits performance for aligned data # to ~1/2. All results were collected on 1.5GHz C7. # misaligned data depends on misalignment and is either ~1.8x or ~2.9x. # These are approximately same factors as for hardware support, so # there is little reason to rely on the latter. On the contrary, it # might actually hurt performance in mixture of aligned and misaligned # buffers, because a) if you choose to flip 'align' flag in control # word on per-buffer basis, then you'd have to reload key context, # which incurs penalty; b) if you choose to set 'align' flag # permanently, it limits performance even for aligned data to ~1/2. # All above mentioned results were collected on 1.5GHz C7. Nano on the # other hand handles unaligned data more gracefully. Depending on # algorithm and how unaligned data is, hardware can be up to 70% more # efficient than below software alignment procedures, nor does 'align' # flag have any effect on aligned performance [if it has any meaning at all]. # Therefore suggestion is to unconditionally set 'align' flag on Nano # for optimal performance. 
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../crypto/perlasm"); Loading Loading @@ -362,7 +369,7 @@ my ($mode,$opcode) = @_; &ret (); &function_end_B("padlock_sha1_oneshot"); &function_begin_B("padlock_sha1"); &function_begin_B("padlock_sha1_blocks"); &push ("edi"); &push ("esi"); &mov ("eax",-1); Loading @@ -373,7 +380,7 @@ my ($mode,$opcode) = @_; &pop ("esi"); &pop ("edi"); &ret (); &function_end_B("padlock_sha1"); &function_end_B("padlock_sha1_blocks"); &function_begin_B("padlock_sha256_oneshot"); &push ("edi"); Loading @@ -397,7 +404,7 @@ my ($mode,$opcode) = @_; &ret (); &function_end_B("padlock_sha256_oneshot"); &function_begin_B("padlock_sha256"); &function_begin_B("padlock_sha256_blocks"); &push ("edi"); &push ("esi"); &mov ("eax",-1); Loading @@ -408,7 +415,19 @@ my ($mode,$opcode) = @_; &pop ("esi"); &pop ("edi"); &ret (); &function_end_B("padlock_sha256"); &function_end_B("padlock_sha256_blocks"); &function_begin_B("padlock_sha512_blocks"); &push ("edi"); &push ("esi"); &mov ("edi",&wparam(0)); &mov ("esi",&wparam(1)); &mov ("ecx",&wparam(2)); &data_byte(0xf3,0x0f,0xa6,0xe0); # rep xsha512 &pop ("esi"); &pop ("edi"); &ret (); &function_end_B("padlock_sha512_blocks"); &asciz ("VIA Padlock x86 module, CRYPTOGAMS by <appro\@openssl.org>"); &align (16); Loading @@ -417,7 +436,7 @@ my ($mode,$opcode) = @_; # Essentially this variable belongs in thread local storage. # Having this variable global on the other hand can only cause # few bogus key reloads [if any at all on single-CPU system], # so we accept the panalty... # so we accept the penalty... &set_label("padlock_saved_context",4); &data_word(0); Loading
engines/asm/e_padlock-x86_64.pl +17 −8 Original line number Diff line number Diff line Loading @@ -151,15 +151,15 @@ padlock_sha1_oneshot: ret .size padlock_sha1_oneshot,.-padlock_sha1_oneshot .globl padlock_sha1 .type padlock_sha1,\@function,3 .globl padlock_sha1_blocks .type padlock_sha1_blocks,\@function,3 .align 16 padlock_sha1: padlock_sha1_blocks: mov \$-1,%rax mov %rdx,%rcx .byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1 ret .size padlock_sha1,.-padlock_sha1 .size padlock_sha1_blocks,.-padlock_sha1_blocks .globl padlock_sha256_oneshot .type padlock_sha256_oneshot,\@function,3 Loading @@ -171,15 +171,23 @@ padlock_sha256_oneshot: ret .size padlock_sha256_oneshot,.-padlock_sha256_oneshot .globl padlock_sha256 .type padlock_sha256,\@function,3 .globl padlock_sha256_blocks .type padlock_sha256_blocks,\@function,3 .align 16 padlock_sha256: padlock_sha256_blocks: mov \$-1,%rax mov %rdx,%rcx .byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256 ret .size padlock_sha256,.-padlock_sha256 .size padlock_sha256_blocks,.-padlock_sha256_blocks .globl padlock_sha512_blocks,\@function,3 .align 16 padlock_sha512_blocks: mov %rdx,%rcx .byte 0xf3,0x0f,0xa6,0xe0 # rep xsha512 ret .size padlock_sha512_blocks,.-padlock_sha512_blocks ___ sub generate_mode { Loading Loading @@ -207,6 +215,7 @@ padlock_${mode}_encrypt: xor %eax,%eax xor %ebx,%ebx testl \$`1<<5`,($ctx) # align bit in control word jnz .L${mode}_aligned test \$0x0f,$out setz %al # !out_misaligned test \$0x0f,$inp Loading