x86[_64]cpuid.pl: handle new extensions. (b9064221) · Commits · CYBER - Cyber Security / TS 103 523 MSP / TLMSP / TLMSP OpenSSL

crypto/x86_64cpuid.pl

+41 −16

Original line number	Diff line number	Diff line
		@@ -47,7 +47,7 @@ OPENSSL_rdtsc:
		.type OPENSSL_ia32_cpuid,\@abi-omnipotent
		.align 16
		OPENSSL_ia32_cpuid:
		mov %rbx,%r8
		mov %rbx,%r8 # save %rbx

		xor %eax,%eax
		cpuid
		@@ -79,7 +79,15 @@ OPENSSL_ia32_cpuid:
		# AMD specific
		mov \$0x80000000,%eax
		cpuid
		cmp \$0x80000008,%eax
		cmp \$0x80000001,%eax
		jb .Lintel
		mov %eax,%r10d
		mov \$0x80000001,%eax
		cpuid
		or %ecx,%r9d
		and \$0x00000801,%r9d # isolate AMD XOP bit, 1<<11

		cmp \$0x80000008,%r10d
		jb .Lintel

		mov \$0x80000008,%eax
		@@ -90,12 +98,12 @@ OPENSSL_ia32_cpuid:
		mov \$1,%eax
		cpuid
		bt \$28,%edx # test hyper-threading bit
		jnc .Ldone
		jnc .Lgeneric
		shr \$16,%ebx # number of logical processors
		cmp %r10b,%bl
		ja .Ldone
		ja .Lgeneric
		and \$0xefffffff,%edx # ~(1<<28)
		jmp .Ldone
		jmp .Lgeneric

		.Lintel:
		cmp \$4,%r11d
		@@ -121,21 +129,38 @@ OPENSSL_ia32_cpuid:
		or \$0x40000000,%edx # use reserved bit to skip unrolled loop
		.Lnotintel:
		bt \$28,%edx # test hyper-threading bit
		jnc .Ldone
		jnc .Lgeneric
		and \$0xefffffff,%edx # ~(1<<28)
		cmp \$0,%r10d
		je .Ldone
		je .Lgeneric

		or \$0x10000000,%edx # 1<<28
		shr \$16,%ebx
		cmp \$1,%bl # see if cache is shared
		ja .Ldone
		ja .Lgeneric
		and \$0xefffffff,%edx # ~(1<<28)
		.Ldone:
		.Lgeneric:
		and \$0x00000800,%r9d # isolate AMD XOP flag
		and \$0xfffff7ff,%ecx
		or %r9d,%ecx # merge AMD XOP flag

		shl \$32,%rcx
		mov %edx,%eax
		mov %r8,%rbx
		or %rcx,%rax
		mov %edx,%ebx
		or %rcx,%rbx # compose capability vector in %rbx
		bt \$27+32,%rcx # check OSXSAVE bit
		jnc .Lclear_avx
		xor %ecx,%ecx # XCR0
		.byte 0x0f,0x01,0xd0 # xgetbv
		and \$6,%eax # isolate XMM and YMM state support
		cmp \$6,%eax
		je .Ldone
		.Lclear_avx:
		mov \$0xefffe7ff,%eax # ~(1<<28\|1<<12\|1<<11)
		shl \$32,%rax
		and %rax,%rbx # clear AVX, FMA and AMD XOP bits
		.Ldone:
		mov %rbx,%rax
		mov %r8,%rbx # restore %rbx
		ret
		.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid

		@@ -250,7 +275,7 @@ OPENSSL_instrument_bus:
		mov %eax,$lasttick # lasttick = tick
		mov \$0,$lastdiff # lastdiff = 0
		clflush ($out)
		lock
		.byte 0xf0 # lock
		add $lastdiff,($out)
		jmp .Loop
		.align 16
		@@ -260,7 +285,7 @@ OPENSSL_instrument_bus:
		mov %edx,$lasttick
		mov %eax,$lastdiff
		clflush ($out)
		lock
		.byte 0xf0 # lock
		add %eax,($out)
		lea 4($out),$out
		sub \$1,$cnt
		@@ -284,7 +309,7 @@ OPENSSL_instrument_bus2:
		mov \$0,$lastdiff # lastdiff = 0

		clflush ($out)
		lock
		.byte 0xf0 # lock
		add $lastdiff,($out)

		rdtsc # collect 1st diff
		@@ -294,7 +319,7 @@ OPENSSL_instrument_bus2:
		mov %eax,$lastdiff # lastdiff = diff
		.Loop2:
		clflush ($out)
		lock
		.byte 0xf0 # lock
		add %eax,($out) # accumulate diff

		sub \$1,$max

crypto/x86cpuid.pl

+47 −16

Original line number	Diff line number	Diff line
		@@ -20,7 +20,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
		&pop ("eax");
		&xor ("ecx","eax");
		&bt ("ecx",21);
		&jnc (&label("done"));
		&jnc (&label("generic"));
		&xor ("eax","eax");
		&cpuid ();
		&mov ("edi","eax"); # max value for standard query level
		@@ -51,7 +51,14 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
		# AMD specific
		&mov ("eax",0x80000000);
		&cpuid ();
		&cmp ("eax",0x80000008);
		&cmp ("eax",0x80000001);
		&jb (&label("intel"));
		&mov ("esi","eax");
		&mov ("eax",0x80000001);
		&cpuid ();
		&or ("ebp","ecx");
		&and ("ebp",1<<11\|1); # isolate XOP bit
		&cmp ("esi",0x80000008);
		&jb (&label("intel"));

		&mov ("eax",0x80000008);
		@@ -62,13 +69,13 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
		&mov ("eax",1);
		&cpuid ();
		&bt ("edx",28);
		&jnc (&label("done"));
		&jnc (&label("generic"));
		&shr ("ebx",16);
		&and ("ebx",0xff);
		&cmp ("ebx","esi");
		&ja (&label("done"));
		&ja (&label("generic"));
		&and ("edx",0xefffffff); # clear hyper-threading bit
		&jmp (&label("done"));
		&jmp (&label("generic"));

		&set_label("intel");
		&cmp ("edi",4);
		@@ -93,19 +100,42 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
		&or ("edx",1<<20); # use reserved bit to engage RC4_CHAR
		&set_label("notP4");
		&bt ("edx",28); # test hyper-threading bit
		&jnc (&label("done"));
		&jnc (&label("generic"));
		&and ("edx",0xefffffff);
		&cmp ("edi",0);
		&je (&label("done"));
		&je (&label("generic"));

		&or ("edx",0x10000000);
		&shr ("ebx",16);
		&cmp (&LB("ebx"),1);
		&ja (&label("done"));
		&ja (&label("generic"));
		&and ("edx",0xefffffff); # clear hyper-threading bit if not

		&set_label("generic");
		&and ("ebp",1<<11); # isolate AMD XOP flag
		&and ("ecx",~(1<<11));
		&mov ("esi","edx");
		&or ("ebp","ecx"); # merge AMD XOP flag

		&bt ("ecx",26); # check XSAVE bit
		&jnc (&label("done"));
		&bt ("ecx",27); # check OSXSAVE bit
		&jnc (&label("clear_xmm"));
		&xor ("ecx","ecx");
		&data_byte(0x0f,0x01,0xd0); # xgetbv
		&and ("eax",6);
		&cmp ("eax",6);
		&je (&label("done"));
		&cmp ("eax",2);
		&je (&label("clear_avx"));
		&set_label("clear_xmm");
		&and ("ebp",~(1<<25\|1<<1)); # clear AESNI and PCLMULQDQ bits
		&and ("esi",~(1<<24)); # clear FXSR
		&set_label("clear_avx");
		&and ("ebp",~(1<<28\|1<<12\|1<<11));# clear AVX, FMA and AMD XOP bits
		&set_label("done");
		&mov ("eax","edx");
		&mov ("edx","ecx");
		&mov ("eax","esi");
		&mov ("edx","ebp");
		&function_end("OPENSSL_ia32_cpuid");

		&external_label("OPENSSL_ia32cap_P");
		@@ -199,8 +229,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
		&bt (&DWP(0,"ecx"),1);
		&jnc (&label("no_x87"));
		if ($sse2) {
		&bt (&DWP(0,"ecx"),26);
		&jnc (&label("no_sse2"));
		&and ("ecx",1<<26\|1<<24); # check SSE2 and FXSR bits
		&cmp ("ecx",1<<26\|1<<24);
		&jne (&label("no_sse2"));
		&pxor ("xmm0","xmm0");
		&pxor ("xmm1","xmm1");
		&pxor ("xmm2","xmm2");
		@@ -331,7 +362,7 @@ my $max = "ebp";
		&mov ($lasttick,"eax"); # lasttick = tick
		&mov ($lastdiff,0); # lastdiff = 0
		&clflush(&DWP(0,$out));
		&lock ();
		&data_byte(0xf0); # lock
		&add (&DWP(0,$out),$lastdiff);
		&jmp (&label("loop"));

		@@ -342,7 +373,7 @@ my $max = "ebp";
		&mov ($lasttick,"edx"); # lasttick = tick
		&mov ($lastdiff,"eax"); # lastdiff = diff
		&clflush(&DWP(0,$out));
		&lock ();
		&data_byte(0xf0); # lock
		&add (&DWP(0,$out),"eax"); # accumulate diff
		&lea ($out,&DWP(4,$out)); # ++$out
		&sub ($cnt,1); # --$cnt
		@@ -371,7 +402,7 @@ my $max = "ebp";
		&mov ($lastdiff,0); # lastdiff = 0

		&clflush(&DWP(0,$out));
		&lock ();
		&data_byte(0xf0); # lock
		&add (&DWP(0,$out),$lastdiff);

		&rdtsc (); # collect 1st diff
		@@ -383,7 +414,7 @@ my $max = "ebp";

		&set_label("loop2",16);
		&clflush(&DWP(0,$out));
		&lock ();
		&data_byte(0xf0); # lock
		&add (&DWP(0,$out),"eax"); # accumulate diff

		&sub ($max,1);

doc/crypto/OPENSSL_ia32cap.pod

+45 −24

Original line number	Diff line number	Diff line
		@@ -2,7 +2,7 @@

		=head1 NAME

		OPENSSL_ia32cap - finding the IA-32 processor capabilities
		OPENSSL_ia32cap - the IA-32 processor capabilities vector

		=head1 SYNOPSIS

		@@ -18,30 +18,52 @@ input value (see Intel Application Note #241618). Naturally it's
		meaningful on x86 and x86_64 platforms only. The variable is normally
		set up automatically upon toolkit initialization, but can be
		manipulated afterwards to modify crypto library behaviour. For the
		moment of this writing seven bits are significant, namely:
		moment of this writing the following bits are significant:

		1. bit #4 denoting presence of Time-Stamp Counter.
		2. bit #20, reserved by Intel, is used to choose among RC4 code
		paths;
		3. bit #23 denoting MMX support;
		4. bit #25 denoting SSE support;
		5. bit #26 denoting SSE2 support;
		6. bit #28 denoting Hyperthreading, which is used to distinguish
		=item bit #4 denoting presence of Time-Stamp Counter.

		=item bit #19 denoting availability of CLFLUSH instruction;

		=item bit #20, reserved by Intel, is used to choose among RC4 code paths;

		=item bit #23 denoting MMX support;

		=item bit #24, FXSR bit, denoting availability of XMM registers;

		=item bit #25 denoting SSE support;

		=item bit #26 denoting SSE2 support;

		=item bit #28 denoting Hyperthreading, which is used to distinguish
		cores with shared cache;
		7. bit #30, reserved by Intel, is used to choose among RC4 code

		=item bit #30, reserved by Intel, is used to choose among RC4 code
		paths;
		8. bit #57 denoting Intel AES instruction set extension;

		=item bit #33 denoting availability of PCLMULQDQ instruction;

		=item bit #41 denoting SSSE3, Supplemental SSE3, support;

		=item bit #43 denoting AMD XOP support (forced to zero on Intel);

		=item bit #57 denoting AES-NI instruction set extension;

		=item bit #59, OSXSAVE bit, denoting availability of YMM registers;

		=item bit #60 denoting AVX extension;

		For example, clearing bit #26 at run-time disables high-performance
		SSE2 code present in the crypto library. You might have to do this if
		target OpenSSL application is executed on SSE2 capable CPU, but under
		control of OS which does not support SSE2 extensions. Even though you
		can manipulate the value programmatically, you most likely will find it
		more appropriate to set up an environment variable with the same name
		prior to starting target application, e.g. on Intel P4 processor 'env
		OPENSSL_ia32cap=0x12900010 apps/openssl', to achieve same effect
		without modifying the application source code. Alternatively you can
		reconfigure the toolkit with no-sse2 option and recompile.
		SSE2 code present in the crypto library, while clearing bit #24
		disables SSE2 code operating on 128-bit XMM register bank. You might
		have to do the latter if target OpenSSL application is executed on SSE2
		capable CPU, but under control of OS that does not enable XMM
		registers. Even though you can manipulate the value programmatically,
		you most likely will find it more appropriate to set up an environment
		variable with the same name prior to starting target application, e.g. on
		Intel P4 processor 'env OPENSSL_ia32cap=0x16980010 apps/openssl', to
		achieve same effect without modifying the application source code.
		Alternatively you can reconfigure the toolkit with no-sse2 option and
		recompile.

		Less intuitive is clearing bit #28. The truth is that it's not copied
		from CPUID output verbatim, but is adjusted to reflect whether or not
		@@ -49,4 +71,3 @@ the data cache is actually shared between logical cores. This in turn
		affects the decision on whether or not expensive countermeasures
		against cache-timing attacks are applied, most notably in AES assembler
		module.
		=cut