Commit 977f32e8 authored by Andy Polyakov
Browse files

Facilitate back-porting of AESNI and SHA modules.

Fix SEH and stack handling in Win64 build.
parent d84ba7ea
Loading
Loading
Loading
Loading
+31 −2
Original line number Diff line number Diff line
@@ -95,6 +95,8 @@ $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
	   $1>=10);

$shaext=1;	### set to zero if compiling for 1.0.1

$stitched_decrypt=0;

open OUT,"| \"$^X\" $xlate $flavour $output";
@@ -119,6 +121,8 @@ aesni_cbc_sha1_enc:
	# caller should check for SSSE3 and AES-NI bits
	mov	OPENSSL_ia32cap_P+0(%rip),%r10d
	mov	OPENSSL_ia32cap_P+4(%rip),%r11
___
$code.=<<___ if ($shaext);
	bt	\$61,%r11		# check SHA bit
	jc	aesni_cbc_sha1_enc_shaext
___
@@ -1657,7 +1661,7 @@ K_XX_XX:
.asciz	"AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
						{{{
						if ($shaext) {{{
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$rounds="%r11d";
@@ -1676,7 +1680,7 @@ aesni_cbc_sha1_enc_shaext:
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
___
$code.=<<___ if ($win64);
	lea	`-8-4*16`(%rsp),%rsp
	lea	`-8-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
@@ -1867,7 +1871,21 @@ ssse3_handler:
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail
___
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha1_enc_shaext(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lseh_no_shaext

	lea	(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
	jmp	.Lcommon_seh_tail
.Lseh_no_shaext:
___
$code.=<<___;
	lea	96(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
@@ -1939,6 +1957,11 @@ $code.=<<___ if ($avx);
	.rva	.LSEH_end_aesni_cbc_sha1_enc_avx
	.rva	.LSEH_info_aesni_cbc_sha1_enc_avx
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_shaext
	.rva	.LSEH_end_aesni_cbc_sha1_enc_shaext
	.rva	.LSEH_info_aesni_cbc_sha1_enc_shaext
___
$code.=<<___;
.section	.xdata
.align	8
@@ -1953,6 +1976,12 @@ $code.=<<___ if ($avx);
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($shaext);
.LSEH_info_aesni_cbc_sha1_enc_shaext:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
___
}

####################################################################
+47 −17
Original line number Diff line number Diff line
@@ -59,6 +59,9 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	$avx = ($1>=10) + ($1>=11);
}

$shaext=1;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

@@ -113,10 +116,12 @@ $code.=<<___ if ($avx);
	je	.Lprobe
	mov	0(%r11),%eax
	mov	4(%r11),%r10

___
$code.=<<___ if ($shaext);
	bt	\$61,%r10			# check for SHA
	jc	${func}_shaext

___
$code.=<<___;
	mov	%r10,%r11
	shr	\$32,%r11

@@ -1259,16 +1264,17 @@ ___
    $r++;	unshift(@rndkey,pop(@rndkey));
};

if ($shaext) {
my $Tbl="%rax";

$code.=<<___;
.type	${func}_shaext,\@function,6
.align	32
${func}_shaext:
	mov	%rsp,%rax
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	push	%rbx
___
$code.=<<___ if ($win64);
	lea	`-4*16`(%rsp),%rsp
	lea	`-8-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
@@ -1465,24 +1471,24 @@ $code.=<<___;
	movdqu		$CDGH,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-10*16(%rax),%xmm6
	movaps	-8-9*16(%rax),%xmm7
	movaps	-8-8*16(%rax),%xmm8
	movaps	-8-7*16(%rax),%xmm9
	movaps	-8-6*16(%rax),%xmm10
	movaps	-8-5*16(%rax),%xmm11
	movaps	-8-4*16(%rax),%xmm12
	movaps	-8-3*16(%rax),%xmm13
	movaps	-8-2*16(%rax),%xmm14
	movaps	-8-1*16(%rax),%xmm15
	movaps	0*16(%rsp),%xmm6
	movaps	1*16(%rsp),%xmm7
	movaps	2*16(%rsp),%xmm8
	movaps	3*16(%rsp),%xmm9
	movaps	4*16(%rsp),%xmm10
	movaps	5*16(%rsp),%xmm11
	movaps	6*16(%rsp),%xmm12
	movaps	7*16(%rsp),%xmm13
	movaps	8*16(%rsp),%xmm14
	movaps	9*16(%rsp),%xmm15
	lea	8+10*16(%rsp),%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	mov	-8(%rax),%rbx
	mov	%rax,%rsp
	ret
.size	${func}_shaext,.-${func}_shaext
___
}
}}}}}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
@@ -1527,6 +1533,19 @@ se_handler:
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
___
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha256_enc_shaext(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lnot_in_shaext

	lea	(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
	jmp	.Lin_prologue
.Lnot_in_shaext:
___
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
@@ -1613,6 +1632,11 @@ $code.=<<___ if ($avx>1);
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
___
$code.=<<___ if ($avx);
.section	.xdata
.align	8
@@ -1632,6 +1656,12 @@ $code.=<<___ if ($avx>1);
	.rva	se_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
$code.=<<___ if ($shaext);
.LSEH_info_${func}_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
___
}

####################################################################
+8 −4
Original line number Diff line number Diff line
@@ -128,6 +128,8 @@ $ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" &&
		`ml 2>&1` =~ /Version ([0-9]+)\./ &&
		$1>=10);	# first version supporting AVX

$shaext=$xmm;	### set to zero if compiling for 1.0.1

&external_label("OPENSSL_ia32cap_P") if ($xmm);


@@ -307,7 +309,7 @@ if ($alt) {

&function_begin("sha1_block_data_order");
if ($xmm) {
  &static_label("shaext_shortcut");
  &static_label("shaext_shortcut")	if ($shaext);
  &static_label("ssse3_shortcut");
  &static_label("avx_shortcut")		if ($ymm);
  &static_label("K_XX_XX");
@@ -325,8 +327,10 @@ if ($xmm) {
	&mov	($C,&DWP(8,$T));
	&test	($A,1<<24);		# check FXSR bit
	&jz	(&label("x86"));
	if ($shaext) {
		&test	($C,1<<29);		# check SHA bit
		&jnz	(&label("shaext_shortcut"));
	}
	if ($ymm) {
		&and	($D,1<<28);		# mask AVX bit
		&and	($A,1<<30);		# mask "Intel CPU" bit
@@ -405,7 +409,7 @@ if ($xmm) {
&function_end("sha1_block_data_order");

if ($xmm) {
{
if ($shaext) {
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
+10 −2
Original line number Diff line number Diff line
@@ -107,6 +107,9 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	$avx = ($1>=10) + ($1>=11);
}

$shaext=1;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

@@ -245,7 +248,8 @@ sha1_block_data_order:
	mov	OPENSSL_ia32cap_P+8(%rip),%r10d
	test	\$`1<<9`,%r8d		# check SSSE3 bit
	jz	.Lialu

___
$code.=<<___ if ($shaext);
	test	\$`1<<29`,%r10d		# check SHA bit	
	jnz	_shaext_shortcut
___
@@ -321,7 +325,7 @@ $code.=<<___;
	ret
.size	sha1_block_data_order,.-sha1_block_data_order
___
{{{
if ($shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
@@ -1956,9 +1960,13 @@ ssse3_handler:
	.rva	.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_sha1_block_data_order_shaext
	.rva	.LSEH_end_sha1_block_data_order_shaext
	.rva	.LSEH_info_sha1_block_data_order_shaext
___
$code.=<<___;
	.rva	.LSEH_begin_sha1_block_data_order_ssse3
	.rva	.LSEH_end_sha1_block_data_order_ssse3
	.rva	.LSEH_info_sha1_block_data_order_ssse3
+5 −3
Original line number Diff line number Diff line
@@ -82,6 +82,8 @@ if ($xmm && !$avx && $ARGV[0] eq "win32" &&
	$avx = ($1>=10) + ($1>=11);
}

$shaext=$xmm;	### set to zero if compiling for 1.0.1

$unroll_after = 64*4;	# If pre-evicted from L1P cache first spin of
			# fully unrolled loop was measured to run about
			# 3-4x slower. If slowdown coefficient is N and
@@ -205,8 +207,8 @@ sub BODY_00_15() {
	&jz	($unroll_after?&label("no_xmm"):&label("loop"));
	&and	("ecx",1<<30);		# mask "Intel CPU" bit
	&and	("ebx",1<<28|1<<9);	# mask AVX and SSSE3 bits
	&test	("edx",1<<29)		if ($xmm);	# check for SHA
	&jnz	(&label("shaext"))	if ($xmm);
	&test	("edx",1<<29)		if ($shaext);	# check for SHA
	&jnz	(&label("shaext"))	if ($shaext);
	&or	("ecx","ebx");
	&and	("ecx",1<<28|1<<30);
	&cmp	("ecx",1<<28|1<<30);
@@ -505,7 +507,7 @@ my @AH=($A,$K256);
&function_end_A();
}
						if (!$i386 && $xmm) {{{
{
if ($shaext) {
######################################################################
# Intel SHA Extensions implementation of SHA256 update function.
#
Loading