Commit 1efd5830 authored by Andy Polyakov
Browse files

SPARCv9 assembly pack: harmonize ABI handling (so that it's handled in one
place at a time: by the pre-processor in the .S case and by perl in the .s case).
parent 8ed11a81
Loading
Loading
Loading
Loading
+9 −17
Original line number Diff line number Diff line
@@ -18,23 +18,8 @@
# ~100-230% faster than gcc-generated code and ~35-90% faster than
# the pure SPARCv9 code path.

$bits=32;
for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)  { $bias=2047; $frame=192; }
else            { $bias=0;    $frame=112; }

$locals=16*8;

$code.=<<___;
#include <sparc_arch.h>

.section        ".text",#alloc,#execinstr
___
$code.=<<___ if ($bits==64);
.register       %g2,#scratch
.register       %g3,#scratch
___

$tab="%l0";

@T=("%g2","%g3");
@@ -44,6 +29,13 @@ $tab="%l0";
($lo,$hi,$b)=("%g1",$a8,"%o7"); $a=$lo;

$code.=<<___;
#include <sparc_arch.h>

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif
@@ -74,7 +66,7 @@ bn_GF2m_mul_2x2:

.align	16
.Lsoftware:
	save	%sp,-$frame-$locals,%sp
	save	%sp,-STACK_FRAME-$locals,%sp

	sllx	%i1,32,$a
	mov	-1,$a12
@@ -83,7 +75,7 @@ bn_GF2m_mul_2x2:
	srlx	$a12,1,$a48			! 0x7fff...
	or	%i4,$b,$b
	srlx	$a12,2,$a12			! 0x3fff...
	add	%sp,$bias+$frame,$tab
	add	%sp,STACK_BIAS+STACK_FRAME,$tab

	sllx	$a,2,$a4
	mov	$a,$a1
+9 −13
Original line number Diff line number Diff line
@@ -17,11 +17,6 @@
# single-process result on 8-core processor, or ~11GBps per 2.85GHz
# socket.

$bits=32;
for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=112; }

$output=shift;
open STDOUT,">$output";

@@ -198,13 +193,14 @@ $code.=<<___;
___
}

$code.=<<___ if ($bits==64);
.register	%g2,#scratch
.register	%g3,#scratch
___
$code.=<<___;
#include "sparc_arch.h"

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

#ifdef __PIC__
@@ -246,7 +242,7 @@ md5_block_asm_data_order:

	.word	0x81b02800		! MD5

	bne,pt	`$bits==64?"%xcc":"%icc"`, .Lhw_loop
	bne,pt	SIZE_T_CC, .Lhw_loop
	nop

.Lhwfinish:
@@ -287,7 +283,7 @@ md5_block_asm_data_order:

	.word	0x81b02800		! MD5

	bne,pt	`$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
	for	%f26, %f26, %f10	! %f10=%f26

	ba	.Lhwfinish
@@ -295,7 +291,7 @@ md5_block_asm_data_order:

.align	16
.Lsoftware:
	save	%sp,-$frame,%sp
	save	%sp,-STACK_FRAME,%sp

	rd	%asi,$saved_asi
	wr	%g0,0x88,%asi		! ASI_PRIMARY_LITTLE
@@ -355,7 +351,7 @@ $code.=<<___;
	add	$t2,$C,$C
	add	$CD,$D,$D
	srl	$B,0,$B			! clruw	$B
	bne	`$bits==64?"%xcc":"%icc"`,.Loop
	bne	SIZE_T_CC,.Loop
	srl	$D,0,$D			! clruw	$D

	st	$A,[$ctx+0]		! write out ctx
+9 −13
Original line number Diff line number Diff line
@@ -25,11 +25,6 @@
# single-process result on 8-core processor, or ~9GBps per 2.85GHz
# socket.

$bits=32;
for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=112; }

$output=shift;
open STDOUT,">$output";

@@ -185,13 +180,14 @@ $code.=<<___;
___
}

$code.=<<___ if ($bits==64);
.register	%g2,#scratch
.register	%g3,#scratch
___
$code.=<<___;
#include "sparc_arch.h"

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

#ifdef __PIC__
@@ -231,7 +227,7 @@ sha1_block_data_order:

	.word	0x81b02820		! SHA1

	bne,pt	`$bits==64?"%xcc":"%icc"`, .Lhw_loop
	bne,pt	SIZE_T_CC, .Lhw_loop
	nop

.Lhwfinish:
@@ -271,7 +267,7 @@ sha1_block_data_order:

	.word	0x81b02820		! SHA1

	bne,pt	`$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
	for	%f26, %f26, %f10	! %f10=%f26

	ba	.Lhwfinish
@@ -279,7 +275,7 @@ sha1_block_data_order:

.align	16
.Lsoftware:
	save	%sp,-$frame,%sp
	save	%sp,-STACK_FRAME,%sp
	sllx	$len,6,$len
	add	$inp,$len,$len

@@ -359,7 +355,7 @@ $code.=<<___;
	add	$E,@X[4],$E
	st	$E,[$ctx+16]

	bne	`$bits==64?"%xcc":"%icc"`,.Lloop
	bne	SIZE_T_CC,.Lloop
	andn	$inp,7,$tmp0

	ret
+30 −35
Original line number Diff line number Diff line
@@ -49,12 +49,6 @@
# saturates at 11.5x single-process result on 8-core processor, or
# ~11/16GBps per 2.85GHz socket.


$bits=32;
for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=112; }

$output=shift;
open STDOUT,">$output";

@@ -191,29 +185,29 @@ $code.=<<___ if ($i<15);
	or	@pair[1],$tmp2,$tmp2
	`"ld	[$inp+".eval(32+4+$i*8)."],@pair[1]"	if ($i<12)`
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+`$bias+$frame+$i*$SZ`]
	$ST	$tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
___
$code.=<<___ if ($i==12);
	bnz,a,pn	%icc,.+8
	ld	[$inp+128],%l0
___
$code.=<<___ if ($i==15);
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
	sllx	@pair[0],$tmp0,$tmp1
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
	or	@pair[1],$tmp2,$tmp2
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+`$bias+$frame+$i*$SZ`]
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
	$ST	$tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
___
} if ($SZ==8);

@@ -349,9 +343,9 @@ $code.=<<___;
	or	%l3,$tmp0,$tmp0

	srlx	$tmp0,@sigma0[0],$T1
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
	sllx	$tmp0,`64-@sigma0[2]`,$tmp1
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
	srlx	$tmp0,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sllx	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
@@ -363,9 +357,9 @@ $code.=<<___;
	or	%l7,$tmp2,$tmp2

	srlx	$tmp2,@sigma1[0],$tmp1
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
	sllx	$tmp2,`64-@sigma1[2]`,$tmp0
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
	srlx	$tmp2,@sigma1[1],$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	$tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
@@ -374,29 +368,30 @@ $code.=<<___;
	xor	$tmp0,$tmp1,$tmp1
	sllx	%l4,32,$tmp0
	xor	$tmp2,$tmp1,$tmp1	! sigma1(X[$i+14])
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
	or	%l5,$tmp0,$tmp0
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5

	sllx	%l0,32,$tmp2
	add	$tmp1,$T1,$T1
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
	or	%l1,$tmp2,$tmp2
	add	$tmp0,$T1,$T1		! +=X[$i+9]
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
	add	$tmp2,$T1,$T1		! +=X[$i]
	$ST	$T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
	$ST	$T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
___
    &BODY_00_15(@_);
} if ($SZ==8);

$code.=<<___ if ($bits==64);
.register	%g2,#scratch
.register	%g3,#scratch
___
$code.=<<___;
#include "sparc_arch.h"

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

.align	64
@@ -519,7 +514,7 @@ $code.=<<___ if ($SZ==8); # SHA512

	.word	0x81b02860		! SHA512

	bne,pt	`$bits==64?"%xcc":"%icc"`, .Lhwaligned_loop
	bne,pt	SIZE_T_CC, .Lhwaligned_loop
	nop

.Lhwfinish:
@@ -579,7 +574,7 @@ $code.=<<___ if ($SZ==8); # SHA512

	.word	0x81b02860		! SHA512

	bne,pt	`$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
	for	%f50, %f50, %f18	! %f18=%f50

	ba	.Lhwfinish
@@ -612,7 +607,7 @@ $code.=<<___ if ($SZ==4); # SHA256

	.word	0x81b02840		! SHA256

	bne,pt	`$bits==64?"%xcc":"%icc"`, .Lhwloop
	bne,pt	SIZE_T_CC, .Lhwloop
	nop

.Lhwfinish:
@@ -655,7 +650,7 @@ $code.=<<___ if ($SZ==4); # SHA256

	.word	0x81b02840		! SHA256

	bne,pt	`$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
	for	%f26, %f26, %f10	! %f10=%f26

	ba	.Lhwfinish
@@ -664,7 +659,7 @@ ___
$code.=<<___;
.align	16
.Lsoftware:
	save	%sp,`-$frame-$locals`,%sp
	save	%sp,-STACK_FRAME-$locals,%sp
	and	$inp,`$align-1`,$tmp31
	sllx	$len,`log(16*$SZ)/log(2)`,$len
	andn	$inp,`$align-1`,$inp
@@ -783,7 +778,7 @@ ___
$code.=<<___;
	add	$inp,`16*$SZ`,$inp		! advance inp
	cmp	$inp,$len
	bne	`$bits==64?"%xcc":"%icc"`,.Lloop
	bne	SIZE_T_CC,.Lloop
	sub	$Ktbl,`($rounds-16)*$SZ`,$Ktbl	! rewind Ktbl

	ret
+13 −4
Original line number Diff line number Diff line
@@ -32,6 +32,10 @@
# define __PIC__
#endif

/*
 * Define __arch64__ when both __SUNPRO_C and __sparcv9 are defined and
 * __arch64__ is not already set, so that later conditionals in this file
 * can select the 64-bit definitions by testing __arch64__ alone.
 */
#if defined(__SUNPRO_C) && defined(__sparcv9) && !defined(__arch64__)
# define __arch64__
#endif

#define SPARC_PIC_THUNK(reg)	\
	.align	32;		\
.Lpic_thunk:			\
@@ -53,18 +57,23 @@
	add	%o7, reg, reg
#endif

#if	(defined(__GNUC__) && defined(__arch64__)) || \
	(defined(__SUNPRO_C) && defined(__sparcv9))
/*
 * ABI-dependent definitions, selected by __arch64__:
 *
 *   SPARC_LOAD_ADDRESS(SYM, reg)  load the address of SYM into reg,
 *                                 using setx or set;
 *   LDPTR                         load mnemonic, ldx or ld (presumably
 *                                 for pointer-sized loads - confirm
 *                                 against the users of this macro);
 *   SIZE_T_CC                     condition-code operand used by the
 *                                 assembly modules' loop branches,
 *                                 %xcc or %icc;
 *   STACK_FRAME                   amount the assembly modules subtract
 *                                 from %sp in their save instruction;
 *   STACK_BIAS                    offset the assembly modules add to
 *                                 %sp when addressing data on the stack.
 */
#if defined(__arch64__)

/* __arch64__ is defined: setx is given %o7 as its second operand. */
# define SPARC_LOAD_ADDRESS(SYM, reg)	\
	setx	SYM, %o7, reg;
# define LDPTR		ldx
# define SIZE_T_CC	%xcc
# define STACK_FRAME	192
# define STACK_BIAS	2047

#else

/* __arch64__ is not defined: plain set, no stack bias. */
# define SPARC_LOAD_ADDRESS(SYM, reg)	\
	set	SYM, reg;
# define LDPTR		ld
# define SIZE_T_CC	%icc
# define STACK_FRAME	112
# define STACK_BIAS	0
/* In this branch the LEAF variant ignores tmp and expands to SPARC_LOAD_ADDRESS. */
# define SPARC_LOAD_ADDRESS_LEAF(SYM,reg,tmp) SPARC_LOAD_ADDRESS(SYM,reg)

#endif