Commit 149ca712 authored by Andy Polyakov's avatar Andy Polyakov
Browse files

e_padlock-x86*.pl: Nano-related update.

parent 4cc2bbab
Loading
Loading
Loading
Loading
+32 −13
Original line number Diff line number Diff line
@@ -15,14 +15,21 @@
# mode and ~75% in CBC mode. For aligned data improvement can be
# observed for short inputs only, e.g. 45% for 64-byte messages in
# ECB mode, 20% in CBC. Difference in performance for aligned vs.
# misaligned data depends on misalignment and is either ~1.8x or
# ~2.9x. These are approximately same factors as for hardware support,
# so there is little reason to rely on the latter. It might actually
# hurt performance in mixture of aligned and misaligned buffers,
# because a) if you choose to flip 'align' flag on per-buffer basis,
# then you'd have to reload key context; b) if you choose to set
# 'align' flag permanently, it limits performance for aligned data
# to ~1/2. All results were collected on 1.5GHz C7.
# misaligned data depends on misalignment and is either ~1.8x or ~2.9x.
# These are approximately same factors as for hardware support, so
# there is little reason to rely on the latter. On the contrary, it
# might actually hurt performance in mixture of aligned and misaligned
# buffers, because a) if you choose to flip 'align' flag in control
# word on per-buffer basis, then you'd have to reload key context,
# which incurs penalty; b) if you choose to set 'align' flag
# permanently, it limits performance even for aligned data to ~1/2.
# All above mentioned results were collected on 1.5GHz C7. Nano on the
# other hand handles unaligned data more gracefully. Depending on
# algorithm and how unaligned data is, hardware can be up to 70% more
# efficient than the software alignment procedures below, nor does 'align'
# flag have any effect on aligned performance [if it has any meaning at all].
# Therefore suggestion is to unconditionally set 'align' flag on Nano
# for optimal performance.

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../crypto/perlasm");
@@ -362,7 +369,7 @@ my ($mode,$opcode) = @_;
	&ret	();
&function_end_B("padlock_sha1_oneshot");

&function_begin_B("padlock_sha1");
&function_begin_B("padlock_sha1_blocks");
	&push	("edi");
	&push	("esi");
	&mov	("eax",-1);
@@ -373,7 +380,7 @@ my ($mode,$opcode) = @_;
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha1");
&function_end_B("padlock_sha1_blocks");

&function_begin_B("padlock_sha256_oneshot");
	&push	("edi");
@@ -397,7 +404,7 @@ my ($mode,$opcode) = @_;
	&ret	();
&function_end_B("padlock_sha256_oneshot");

&function_begin_B("padlock_sha256");
&function_begin_B("padlock_sha256_blocks");
	&push	("edi");
	&push	("esi");
	&mov	("eax",-1);
@@ -408,7 +415,19 @@ my ($mode,$opcode) = @_;
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha256");
&function_end_B("padlock_sha256_blocks");

# padlock_sha512_blocks: thin wrapper that issues "rep xsha512".
# The three caller arguments are fetched through wparam() into
# edi, esi and ecx before the instruction is executed; edi and esi
# are saved on entry and restored before returning.
# NOTE(review): presumably arguments are (context, input, block
# count), by analogy with the function's name - confirm with caller.
&function_begin_B("padlock_sha512_blocks");
	&push	("edi");			# preserve caller's edi/esi
	&push	("esi");
	&mov	("ecx",&wparam(2));		# 3rd argument: rep counter
	&mov	("esi",&wparam(1));		# 2nd argument
	&mov	("edi",&wparam(0));		# 1st argument
	&data_byte(0xf3,0x0f,0xa6,0xe0);	# rep xsha512
	&pop	("esi");			# restore in reverse order
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha512_blocks");

&asciz	("VIA Padlock x86 module, CRYPTOGAMS by <appro\@openssl.org>");
&align	(16);
@@ -417,7 +436,7 @@ my ($mode,$opcode) = @_;
# Essentially this variable belongs in thread local storage.
# Having this variable global on the other hand can only cause
# few bogus key reloads [if any at all on single-CPU system],
# so we accept the panalty...
# so we accept the penalty...
&set_label("padlock_saved_context",4);
&data_word(0);

+17 −8
Original line number Diff line number Diff line
@@ -151,15 +151,15 @@ padlock_sha1_oneshot:
	ret
.size	padlock_sha1_oneshot,.-padlock_sha1_oneshot

.globl	padlock_sha1
.type	padlock_sha1,\@function,3
.globl	padlock_sha1_blocks
.type	padlock_sha1_blocks,\@function,3
.align	16
padlock_sha1:
padlock_sha1_blocks:
	mov	\$-1,%rax
	mov	%rdx,%rcx
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	ret
.size	padlock_sha1,.-padlock_sha1
.size	padlock_sha1_blocks,.-padlock_sha1_blocks

.globl	padlock_sha256_oneshot
.type	padlock_sha256_oneshot,\@function,3
@@ -171,15 +171,23 @@ padlock_sha256_oneshot:
	ret
.size	padlock_sha256_oneshot,.-padlock_sha256_oneshot

.globl	padlock_sha256
.type	padlock_sha256,\@function,3
.globl	padlock_sha256_blocks
.type	padlock_sha256_blocks,\@function,3
.align	16
padlock_sha256:
padlock_sha256_blocks:
	mov	\$-1,%rax
	mov	%rdx,%rcx
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	ret
.size	padlock_sha256,.-padlock_sha256
.size	padlock_sha256_blocks,.-padlock_sha256_blocks

# padlock_sha512_blocks: 3-argument wrapper around "rep xsha512".
# The third argument is copied into rcx, the counter register of
# the rep-prefixed instruction; the first two arguments are left
# in the registers they arrived in.
# NOTE(review): presumably arguments are (context, input, block
# count) - confirm against the C caller.
# The symbol is exported and typed with two separate directives,
# same as the sha1/sha256 block routines, so that the 3-argument
# function annotation is actually attached to this symbol.
.globl	padlock_sha512_blocks
.type	padlock_sha512_blocks,\@function,3
.align	16
padlock_sha512_blocks:
	mov	%rdx,%rcx		# 3rd argument -> rep counter
	.byte	0xf3,0x0f,0xa6,0xe0	# rep xsha512
	ret
.size	padlock_sha512_blocks,.-padlock_sha512_blocks
___

sub generate_mode {
@@ -207,6 +215,7 @@ padlock_${mode}_encrypt:
	xor	%eax,%eax
	xor	%ebx,%ebx
	testl	\$`1<<5`,($ctx)		# align bit in control word
	jnz	.L${mode}_aligned
	test	\$0x0f,$out
	setz	%al			# !out_misaligned
	test	\$0x0f,$inp