SPARCv9 assembly pack: harmonize ABI handling (so that it's handled in one (1efd5830) · Commits · CYBER - Cyber Security / TS 103 523 MSP / TLMSP / TLMSP OpenSSL

crypto/bn/asm/sparcv9-gf2m.pl

+9 −17

Original line number	Diff line number	Diff line
		@@ -18,23 +18,8 @@
		# ~100-230% faster than gcc-generated code and ~35-90% faster than
		# the pure SPARCv9 code path.

		$bits=32;
		for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
		if ($bits==64) { $bias=2047; $frame=192; }
		else { $bias=0; $frame=112; }

		$locals=16*8;

		$code.=<<___;
		#include <sparc_arch.h>

		.section ".text",#alloc,#execinstr
		___
		$code.=<<___ if ($bits==64);
		.register %g2,#scratch
		.register %g3,#scratch
		___

		$tab="%l0";

		@T=("%g2","%g3");
		@@ -44,6 +29,13 @@ $tab="%l0";
		($lo,$hi,$b)=("%g1",$a8,"%o7"); $a=$lo;

		$code.=<<___;
		#include <sparc_arch.h>

		#ifdef __arch64__
		.register %g2,#scratch
		.register %g3,#scratch
		#endif

		#ifdef __PIC__
		SPARC_PIC_THUNK(%g1)
		#endif
		@@ -74,7 +66,7 @@ bn_GF2m_mul_2x2:

		.align 16
		.Lsoftware:
		save %sp,-$frame-$locals,%sp
		save %sp,-STACK_FRAME-$locals,%sp

		sllx %i1,32,$a
		mov -1,$a12
		@@ -83,7 +75,7 @@ bn_GF2m_mul_2x2:
		srlx $a12,1,$a48 ! 0x7fff...
		or %i4,$b,$b
		srlx $a12,2,$a12 ! 0x3fff...
		add %sp,$bias+$frame,$tab
		add %sp,STACK_BIAS+STACK_FRAME,$tab

		sllx $a,2,$a4
		mov $a,$a1

crypto/md5/asm/md5-sparcv9.pl

+9 −13

Original line number	Diff line number	Diff line
		@@ -17,11 +17,6 @@
		# single-process result on 8-core processor, or ~11GBps per 2.85GHz
		# socket.

		$bits=32;
		for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
		if ($bits==64) { $bias=2047; $frame=192; }
		else { $bias=0; $frame=112; }

		$output=shift;
		open STDOUT,">$output";

		@@ -198,13 +193,14 @@ $code.=<<___;
		___
		}

		$code.=<<___ if ($bits==64);
		.register %g2,#scratch
		.register %g3,#scratch
		___
		$code.=<<___;
		#include "sparc_arch.h"

		#ifdef __arch64__
		.register %g2,#scratch
		.register %g3,#scratch
		#endif

		.section ".text",#alloc,#execinstr

		#ifdef __PIC__
		@@ -246,7 +242,7 @@ md5_block_asm_data_order:

		.word 0x81b02800 ! MD5

		bne,pt `$bits==64?"%xcc":"%icc"`, .Lhw_loop
		bne,pt SIZE_T_CC, .Lhw_loop
		nop

		.Lhwfinish:
		@@ -287,7 +283,7 @@ md5_block_asm_data_order:

		.word 0x81b02800 ! MD5

		bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
		bne,pt SIZE_T_CC, .Lhwunaligned_loop
		for %f26, %f26, %f10 ! %f10=%f26

		ba .Lhwfinish
		@@ -295,7 +291,7 @@ md5_block_asm_data_order:

		.align 16
		.Lsoftware:
		save %sp,-$frame,%sp
		save %sp,-STACK_FRAME,%sp

		rd %asi,$saved_asi
		wr %g0,0x88,%asi ! ASI_PRIMARY_LITTLE
		@@ -355,7 +351,7 @@ $code.=<<___;
		add $t2,$C,$C
		add $CD,$D,$D
		srl $B,0,$B ! clruw $B
		bne `$bits==64?"%xcc":"%icc"`,.Loop
		bne SIZE_T_CC,.Loop
		srl $D,0,$D ! clruw $D

		st $A,[$ctx+0] ! write out ctx

crypto/sha/asm/sha1-sparcv9.pl

+9 −13

Original line number	Diff line number	Diff line
		@@ -25,11 +25,6 @@
		# single-process result on 8-core processor, or ~9GBps per 2.85GHz
		# socket.

		$bits=32;
		for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
		if ($bits==64) { $bias=2047; $frame=192; }
		else { $bias=0; $frame=112; }

		$output=shift;
		open STDOUT,">$output";

		@@ -185,13 +180,14 @@ $code.=<<___;
		___
		}

		$code.=<<___ if ($bits==64);
		.register %g2,#scratch
		.register %g3,#scratch
		___
		$code.=<<___;
		#include "sparc_arch.h"

		#ifdef __arch64__
		.register %g2,#scratch
		.register %g3,#scratch
		#endif

		.section ".text",#alloc,#execinstr

		#ifdef __PIC__
		@@ -231,7 +227,7 @@ sha1_block_data_order:

		.word 0x81b02820 ! SHA1

		bne,pt `$bits==64?"%xcc":"%icc"`, .Lhw_loop
		bne,pt SIZE_T_CC, .Lhw_loop
		nop

		.Lhwfinish:
		@@ -271,7 +267,7 @@ sha1_block_data_order:

		.word 0x81b02820 ! SHA1

		bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
		bne,pt SIZE_T_CC, .Lhwunaligned_loop
		for %f26, %f26, %f10 ! %f10=%f26

		ba .Lhwfinish
		@@ -279,7 +275,7 @@ sha1_block_data_order:

		.align 16
		.Lsoftware:
		save %sp,-$frame,%sp
		save %sp,-STACK_FRAME,%sp
		sllx $len,6,$len
		add $inp,$len,$len

		@@ -359,7 +355,7 @@ $code.=<<___;
		add $E,@X[4],$E
		st $E,[$ctx+16]

		bne `$bits==64?"%xcc":"%icc"`,.Lloop
		bne SIZE_T_CC,.Lloop
		andn $inp,7,$tmp0

		ret

crypto/sha/asm/sha512-sparcv9.pl

+30 −35

Original line number	Diff line number	Diff line
		@@ -49,12 +49,6 @@
		# saturates at 11.5x single-process result on 8-core processor, or
		# ~11/16GBps per 2.85GHz socket.


		$bits=32;
		for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
		if ($bits==64) { $bias=2047; $frame=192; }
		else { $bias=0; $frame=112; }

		$output=shift;
		open STDOUT,">$output";

		@@ -191,29 +185,29 @@ $code.=<<___ if ($i<15);
		or @pair[1],$tmp2,$tmp2
		`"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
		add $h,$tmp2,$T1
		$ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
		$ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
		___
		$code.=<<___ if ($i==12);
		bnz,a,pn %icc,.+8
		ld [$inp+128],%l0
		___
		$code.=<<___ if ($i==15);
		ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
		ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
		sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
		add $tmp31,32,$tmp0
		ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
		ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
		sllx @pair[0],$tmp0,$tmp1
		ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
		ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
		srlx @pair[2],$tmp32,@pair[1]
		or $tmp1,$tmp2,$tmp2
		ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
		ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
		or @pair[1],$tmp2,$tmp2
		ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
		ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
		add $h,$tmp2,$T1
		$ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
		ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
		ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
		ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
		$ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
		ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
		ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
		ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
		___
		} if ($SZ==8);

		@@ -349,9 +343,9 @@ $code.=<<___;
		or %l3,$tmp0,$tmp0

		srlx $tmp0,@sigma0[0],$T1
		ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
		ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
		sllx $tmp0,`64-@sigma0[2]`,$tmp1
		ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
		ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
		srlx $tmp0,@sigma0[1],$tmp0
		xor $tmp1,$T1,$T1
		sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
		@@ -363,9 +357,9 @@ $code.=<<___;
		or %l7,$tmp2,$tmp2

		srlx $tmp2,@sigma1[0],$tmp1
		ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
		ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
		sllx $tmp2,`64-@sigma1[2]`,$tmp0
		ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
		ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
		srlx $tmp2,@sigma1[1],$tmp2
		xor $tmp0,$tmp1,$tmp1
		sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
		@@ -374,29 +368,30 @@ $code.=<<___;
		xor $tmp0,$tmp1,$tmp1
		sllx %l4,32,$tmp0
		xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14])
		ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
		ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
		or %l5,$tmp0,$tmp0
		ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
		ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5

		sllx %l0,32,$tmp2
		add $tmp1,$T1,$T1
		ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
		ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
		or %l1,$tmp2,$tmp2
		add $tmp0,$T1,$T1 ! +=X[$i+9]
		ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
		ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
		add $tmp2,$T1,$T1 ! +=X[$i]
		$ST $T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
		$ST $T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
		___
		&BODY_00_15(@_);
		} if ($SZ==8);

		$code.=<<___ if ($bits==64);
		.register %g2,#scratch
		.register %g3,#scratch
		___
		$code.=<<___;
		#include "sparc_arch.h"

		#ifdef __arch64__
		.register %g2,#scratch
		.register %g3,#scratch
		#endif

		.section ".text",#alloc,#execinstr

		.align 64
		@@ -519,7 +514,7 @@ $code.=<<___ if ($SZ==8); # SHA512

		.word 0x81b02860 ! SHA512

		bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwaligned_loop
		bne,pt SIZE_T_CC, .Lhwaligned_loop
		nop

		.Lhwfinish:
		@@ -579,7 +574,7 @@ $code.=<<___ if ($SZ==8); # SHA512

		.word 0x81b02860 ! SHA512

		bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
		bne,pt SIZE_T_CC, .Lhwunaligned_loop
		for %f50, %f50, %f18 ! %f18=%f50

		ba .Lhwfinish
		@@ -612,7 +607,7 @@ $code.=<<___ if ($SZ==4); # SHA256

		.word 0x81b02840 ! SHA256

		bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwloop
		bne,pt SIZE_T_CC, .Lhwloop
		nop

		.Lhwfinish:
		@@ -655,7 +650,7 @@ $code.=<<___ if ($SZ==4); # SHA256

		.word 0x81b02840 ! SHA256

		bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
		bne,pt SIZE_T_CC, .Lhwunaligned_loop
		for %f26, %f26, %f10 ! %f10=%f26

		ba .Lhwfinish
		@@ -664,7 +659,7 @@ ___
		$code.=<<___;
		.align 16
		.Lsoftware:
		save %sp,`-$frame-$locals`,%sp
		save %sp,-STACK_FRAME-$locals,%sp
		and $inp,`$align-1`,$tmp31
		sllx $len,`log(16*$SZ)/log(2)`,$len
		andn $inp,`$align-1`,$inp
		@@ -783,7 +778,7 @@ ___
		$code.=<<___;
		add $inp,`16*$SZ`,$inp ! advance inp
		cmp $inp,$len
		bne `$bits==64?"%xcc":"%icc"`,.Lloop
		bne SIZE_T_CC,.Lloop
		sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl

		ret

crypto/sparc_arch.h

+13 −4

Original line number	Diff line number	Diff line
		@@ -32,6 +32,10 @@
		# define __PIC__
		#endif

		#if defined(__SUNPRO_C) && defined(__sparcv9) && !defined(__arch64__)
		# define __arch64__
		#endif

		#define SPARC_PIC_THUNK(reg) \
		.align 32; \
		.Lpic_thunk: \
		@@ -53,18 +57,23 @@
		add %o7, reg, reg
		#endif

		#if (defined(__GNUC__) && defined(__arch64__)) || \
		(defined(__SUNPRO_C) && defined(__sparcv9))
		#if defined(__arch64__)

		# define SPARC_LOAD_ADDRESS(SYM, reg) \
		setx SYM, %o7, reg;
		# define LDPTR ldx
		# define SIZE_T_CC %xcc
		# define STACK_FRAME 192
		# define STACK_BIAS 2047

		#else

		# define SPARC_LOAD_ADDRESS(SYM, reg) \
		set SYM, reg;
		# define LDPTR ld
		# define SIZE_T_CC %icc
		# define STACK_FRAME 112
		# define STACK_BIAS 0
		# define SPARC_LOAD_ADDRESS_LEAF(SYM,reg,tmp) SPARC_LOAD_ADDRESS(SYM,reg)

		#endif