Commit 1aed5e1a authored by Andy Polyakov
Browse files

crypto/x86*cpuid.pl: move extended feature detection.



Extended feature flags were not pulled on AMD processors; as a result,
a number of extensions were effectively masked on Ryzen. The original fix
for x86_64cpuid.pl addressed this problem, but messed up processor
vendor detection. This fix moves extended feature detection past
basic feature detection, where it belongs. The 32-bit counterpart is
harmonized too.

Reviewed-by: Rich Salz <rsalz@openssl.org>
Reviewed-by: Richard Levitte <levitte@openssl.org>
parent b1fa4031
Loading
Loading
Loading
Loading
+10 −11
Original line number Diff line number Diff line
@@ -68,20 +68,10 @@ OPENSSL_ia32_cpuid:
.cfi_register	%rbx,%r8

	xor	%eax,%eax
	mov	%eax,8(%rdi)		# clear 3rd word
	mov	%eax,8(%rdi)		# clear extended feature flags
	cpuid
	mov	%eax,%r11d		# max value for standard query level

	cmp	\$7,%eax
	jb	.Lno_extended_info

	mov	\$7,%eax
	xor	%ecx,%ecx
	cpuid
	mov	%ebx,8(%rdi)

.Lno_extended_info:

	xor	%eax,%eax
	cmp	\$0x756e6547,%ebx	# "Genu"
	setne	%al
@@ -175,6 +165,15 @@ OPENSSL_ia32_cpuid:
	or	%ecx,%r9d		# merge AMD XOP flag

	mov	%edx,%r10d		# %r9d:%r10d is copy of %ecx:%edx

	cmp	\$7,%r11d
	jb	.Lno_extended_info
	mov	\$7,%eax
	xor	%ecx,%ecx
	cpuid
	mov	%ebx,8(%rdi)		# save extended feature flags
.Lno_extended_info:

	bt	\$27,%r9d		# check OSXSAVE bit
	jnc	.Lclear_avx
	xor	%ecx,%ecx		# XCR0
+18 −20
Original line number Diff line number Diff line
@@ -30,10 +30,10 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
	&pop	("eax");
	&xor	("ecx","eax");
	&xor	("eax","eax");
	&mov	("esi",&wparam(0));
	&mov	(&DWP(8,"esi"),"eax");	# clear extended feature flags
	&bt	("ecx",21);
	&jnc	(&label("nocpuid"));
	&mov	("esi",&wparam(0));
	&mov	(&DWP(8,"esi"),"eax");	# clear 3rd word
	&cpuid	();
	&mov	("edi","eax");		# max value for standard query level

@@ -91,26 +91,16 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
	&jmp	(&label("generic"));

&set_label("intel");
	&cmp	("edi",7);
	&jb	(&label("cacheinfo"));

	&mov	("esi",&wparam(0));
	&mov	("eax",7);
	&xor	("ecx","ecx");
	&cpuid	();
	&mov	(&DWP(8,"esi"),"ebx");

&set_label("cacheinfo");
	&cmp	("edi",4);
	&mov	("edi",-1);
	&mov	("esi",-1);
	&jb	(&label("nocacheinfo"));

	&mov	("eax",4);
	&mov	("ecx",0);		# query L1D
	&cpuid	();
	&mov	("edi","eax");
	&shr	("edi",14);
	&and	("edi",0xfff);		# number of cores -1 per L1D
	&mov	("esi","eax");
	&shr	("esi",14);
	&and	("esi",0xfff);		# number of cores -1 per L1D

&set_label("nocacheinfo");
	&mov	("eax",1);
@@ -128,7 +118,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
	&bt	("edx",28);		# test hyper-threading bit
	&jnc	(&label("generic"));
	&and	("edx",0xefffffff);
	&cmp	("edi",0);
	&cmp	("esi",0);
	&je	(&label("generic"));

	&or	("edx",0x10000000);
@@ -140,10 +130,19 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&set_label("generic");
	&and	("ebp",1<<11);		# isolate AMD XOP flag
	&and	("ecx",0xfffff7ff);	# force 11th bit to 0
	&mov	("esi","edx");
	&mov	("esi","edx");		# %ebp:%esi is copy of %ecx:%edx
	&or	("ebp","ecx");		# merge AMD XOP flag

	&bt	("ecx",27);		# check OSXSAVE bit
	&cmp	("edi",7);
	&mov	("edi",&wparam(0));
	&jb	(&label("no_extended_info"));
	&mov	("eax",7);
	&xor	("ecx","ecx");
	&cpuid	();
	&mov	(&DWP(8,"edi"),"ebx");	# save extended feature flag
&set_label("no_extended_info");

	&bt	("ebp",27);		# check OSXSAVE bit
	&jnc	(&label("clear_avx"));
	&xor	("ecx","ecx");
	&data_byte(0x0f,0x01,0xd0);	# xgetbv
@@ -157,7 +156,6 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
	&and	("esi",0xfeffffff);	# clear FXSR
&set_label("clear_avx");
	&and	("ebp",0xefffe7ff);	# clear AVX, FMA and AMD XOP bits
	&mov	("edi",&wparam(0));
	&and	(&DWP(8,"edi"),0xffffffdf);	# clear AVX2
&set_label("done");
	&mov	("eax","esi");