e_padlock-x86*.pl: Nano-related update. (149ca712) · Commits · CYBER - Cyber Security / TS 103 523 MSP / TLMSP / TLMSP OpenSSL

engines/asm/e_padlock-x86.pl

+32 −13

Original line number	Diff line number	Diff line
		@@ -15,14 +15,21 @@
		# mode and ~75% in CBC mode. For aligned data improvement can be
		# observed for short inputs only, e.g. 45% for 64-byte messages in
		# ECB mode, 20% in CBC. Difference in performance for aligned vs.
		# misaligned data depends on misalignment and is either ~1.8x or
		# ~2.9x. These are approximately same factors as for hardware support,
		# so there is little reason to rely on the latter. It might actually
		# hurt performance in mixture of aligned and misaligned buffers,
		# because a) if you choose to flip 'align' flag on per-buffer basis,
		# then you'd have to reload key context; b) if you choose to set
		# 'align' flag permanently, it limits performance for aligned data
		# to ~1/2. All results were collected on 1.5GHz C7.
		# misaligned data depends on misalignment and is either ~1.8x or 2.9x.
		# These are approximately same factors as for hardware support, so
		# there is little reason to rely on the latter. On the contrary, it
		# might actually hurt performance in mixture of aligned and misaligned
		# buffers, because a) if you choose to flip 'align' flag in control
		# word on per-buffer basis, then you'd have to reload key context,
		# which incurs penalty; b) if you choose to set 'align' flag
		# permanently, it limits performance even for aligned data to ~1/2.
		# All above mentioned results were collected on 1.5GHz C7. Nano on the
		# other hand handles unaligned data more gracefully. Depending on
		# algorithm and how unaligned data is, hardware can be up to 70% more
		# efficient than below software alignment procedures, nor does 'align'
		# flag have affect on aligned performance [if has any meaning at all].
		# Therefore suggestion is to unconditionally set 'align' flag on Nano
		# for optimal performance.

		$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
		push(@INC,"${dir}","${dir}../../crypto/perlasm");
		@@ -362,7 +369,7 @@ my ($mode,$opcode) = @_;
		&ret ();
		&function_end_B("padlock_sha1_oneshot");

		&function_begin_B("padlock_sha1");
		&function_begin_B("padlock_sha1_blocks");
		&push ("edi");
		&push ("esi");
		&mov ("eax",-1);
		@@ -373,7 +380,7 @@ my ($mode,$opcode) = @_;
		&pop ("esi");
		&pop ("edi");
		&ret ();
		&function_end_B("padlock_sha1");
		&function_end_B("padlock_sha1_blocks");

		&function_begin_B("padlock_sha256_oneshot");
		&push ("edi");
		@@ -397,7 +404,7 @@ my ($mode,$opcode) = @_;
		&ret ();
		&function_end_B("padlock_sha256_oneshot");

		&function_begin_B("padlock_sha256");
		&function_begin_B("padlock_sha256_blocks");
		&push ("edi");
		&push ("esi");
		&mov ("eax",-1);
		@@ -408,7 +415,19 @@ my ($mode,$opcode) = @_;
		&pop ("esi");
		&pop ("edi");
		&ret ();
		&function_end_B("padlock_sha256");
		&function_end_B("padlock_sha256_blocks");

		&function_begin_B("padlock_sha512_blocks");
		&push ("edi");
		&push ("esi");
		&mov ("edi",&wparam(0));
		&mov ("esi",&wparam(1));
		&mov ("ecx",&wparam(2));
		&data_byte(0xf3,0x0f,0xa6,0xe0); # rep xsha512
		&pop ("esi");
		&pop ("edi");
		&ret ();
		&function_end_B("padlock_sha512_blocks");

		&asciz ("VIA Padlock x86 module, CRYPTOGAMS by <appro\@openssl.org>");
		&align (16);
		@@ -417,7 +436,7 @@ my ($mode,$opcode) = @_;
		# Essentially this variable belongs in thread local storage.
		# Having this variable global on the other hand can only cause
		# few bogus key reloads [if any at all on signle-CPU system],
		# so we accept the panalty...
		# so we accept the penalty...
		&set_label("padlock_saved_context",4);
		&data_word(0);

engines/asm/e_padlock-x86_64.pl

+17 −8

Original line number	Diff line number	Diff line
		@@ -151,15 +151,15 @@ padlock_sha1_oneshot:
		ret
		.size padlock_sha1_oneshot,.-padlock_sha1_oneshot

		.globl padlock_sha1
		.type padlock_sha1,\@function,3
		.globl padlock_sha1_blocks
		.type padlock_sha1_blocks,\@function,3
		.align 16
		padlock_sha1:
		padlock_sha1_blocks:
		mov \$-1,%rax
		mov %rdx,%rcx
		.byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1
		ret
		.size padlock_sha1,.-padlock_sha1
		.size padlock_sha1_blocks,.-padlock_sha1_blocks

		.globl padlock_sha256_oneshot
		.type padlock_sha256_oneshot,\@function,3
		@@ -171,15 +171,23 @@ padlock_sha256_oneshot:
		ret
		.size padlock_sha256_oneshot,.-padlock_sha256_oneshot

		.globl padlock_sha256
		.type padlock_sha256,\@function,3
		.globl padlock_sha256_blocks
		.type padlock_sha256_blocks,\@function,3
		.align 16
		padlock_sha256:
		padlock_sha256_blocks:
		mov \$-1,%rax
		mov %rdx,%rcx
		.byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256
		ret
		.size padlock_sha256,.-padlock_sha256
		.size padlock_sha256_blocks,.-padlock_sha256_blocks

		.globl padlock_sha512_blocks,\@function,3
		.align 16
		padlock_sha512_blocks:
		mov %rdx,%rcx
		.byte 0xf3,0x0f,0xa6,0xe0 # rep xha512
		ret
		.size padlock_sha512_blocks,.-padlock_sha512_blocks
		___

		sub generate_mode {
		@@ -207,6 +215,7 @@ padlock_${mode}_encrypt:
		xor %eax,%eax
		xor %ebx,%ebx
		testl \$`1<<5`,($ctx) # align bit in control word
		jnz .L${mode}_aligned
		test \$0x0f,$out
		setz %al # !out_misaligned
		test \$0x0f,$inp