Commit 6df8c74d authored by Andy Polyakov's avatar Andy Polyakov
Browse files

Switch 64-bit sparcv9 platforms from bn(64,64) to bn(64,32). This doesn't

have impact on performance, because amount of multiplications does not
increase with this switch, not on sparcv9 that is. On the contrary, it
actually improves performance, because it spares a load of instructions
used to chase carries. Not to mention that BN assembler modules can be
shared more freely between 32- and 64-bit builts.
parent 877e8e97
Loading
Loading
Loading
Loading
+4 −4
Original line number Diff line number Diff line
@@ -202,7 +202,7 @@ my %table=(
"solaris-sparcv8-gcc","gcc:-mv8 -O3 -fomit-frame-pointer -Wall -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT::-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::dlfcn:solaris-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
# -m32 should be safe to add as long as driver recognizes -mcpu=ultrasparc
"solaris-sparcv9-gcc","gcc:-m32 -mcpu=ultrasparc -O3 -fomit-frame-pointer -Wall -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::sparcv8plus.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv8plus.o::::::dlfcn:solaris-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"solaris64-sparcv9-gcc","gcc:-m64 -mcpu=ultrasparc -O3 -Wall -DB_ENDIAN::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR::bn_asm.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o::::::dlfcn:solaris-shared:-fPIC:-m64 -shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"solaris64-sparcv9-gcc","gcc:-m64 -mcpu=ultrasparc -O3 -Wall -DB_ENDIAN::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR::bn_asm.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o::::::dlfcn:solaris-shared:-fPIC:-m64 -shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
####
"debug-solaris-sparcv8-gcc","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG_ALL -O -g -mv8 -Wall -DB_ENDIAN::-D_REENTRANT::-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::sparcv8.o::::::::::dlfcn:solaris-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"debug-solaris-sparcv9-gcc","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG_ALL -DPEDANTIC -O -g -mcpu=ultrasparc -pedantic -ansi -Wall -Wshadow -Wno-long-long -D__EXTENSIONS__ -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::sparcv8plus.o:des_enc-sparc.o fcrypt_b.o:::md5-sparcv8plus.o::::::dlfcn:solaris-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
@@ -214,7 +214,7 @@ my %table=(
"solaris-sparcv7-cc","cc:-xO5 -xstrconst -xdepend -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT::-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_PTR DES_RISC1 DES_UNROLL BF_PTR:${no_asm}:dlfcn:solaris-shared:-KPIC:-G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"solaris-sparcv8-cc","cc:-xarch=v8 -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT::-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_PTR DES_RISC1 DES_UNROLL BF_PTR::sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::dlfcn:solaris-shared:-KPIC:-G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"solaris-sparcv9-cc","cc:-xtarget=ultra -xarch=v8plusa -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK_LL DES_PTR DES_RISC1 DES_UNROLL BF_PTR::sparcv8plus.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv8plus.o::::::dlfcn:solaris-shared:-KPIC:-G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"solaris64-sparcv9-cc","cc:-xtarget=ultra -xarch=v9a -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR::bn_asm.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o::::::dlfcn:solaris-shared:-KPIC:-xarch=v9 -G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):/usr/ccs/bin/ar rs",
"solaris64-sparcv9-cc","cc:-xtarget=ultra -xarch=v9a -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR::bn_asm.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o::::::dlfcn:solaris-shared:-KPIC:-xarch=v9 -G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):/usr/ccs/bin/ar rs",
####
"debug-solaris-sparcv8-cc","cc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG_ALL -xarch=v8 -g -O -xstrconst -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT::-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_PTR DES_RISC1 DES_UNROLL BF_PTR::sparcv8.o::::::::::dlfcn:solaris-shared:-KPIC:-G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"debug-solaris-sparcv9-cc","cc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG_ALL -xtarget=ultra -xarch=v8plus -g -O -xstrconst -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK_LL DES_PTR DES_RISC1 DES_UNROLL BF_PTR::sparcv8plus.o::::md5-sparcv8plus.o::::::dlfcn:solaris-shared:-KPIC:-G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", 
@@ -335,7 +335,7 @@ my %table=(
# -Wa,-Av8plus should do the trick no matter what.
"linux-sparcv9","gcc:-m32 -mcpu=ultrasparc -DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall -Wa,-Av8plusa -DBN_DIV2W::-D_REENTRANT:ULTRASPARC:-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::sparcv8plus.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv8plus.o::::::dlfcn:linux-shared:-fPIC:-m32:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
# GCC 3.1 is a requirement
"linux64-sparcv9","gcc:-m64 -mcpu=ultrasparc -DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT:ULTRASPARC:-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::bn_asm.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o::::::dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux64-sparcv9","gcc:-m64 -mcpu=ultrasparc -DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT:ULTRASPARC:-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::bn_asm.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o::::::dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
#### Alpha Linux with GNU C and Compaq C setups
# Special notes:
# - linux-alpha+bwx-gcc is ment to be used from ./config only. If you
@@ -365,7 +365,7 @@ my %table=(
# -DMD32_REG_T=int doesn't actually belong in sparc64 target, it
# simply *happens* to work around a compiler bug in gcc 3.3.3,
# triggered by RIPEMD160 code.
"BSD-sparc64",	"gcc:-DB_ENDIAN -DTERMIOS -O3 -DMD32_REG_T=int -Wall::${BSDthreads}:::SIXTY_FOUR_BIT_LONG RC2_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC2 BF_PTR::bn_asm.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o::::::dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"BSD-sparc64",	"gcc:-DB_ENDIAN -DTERMIOS -O3 -DMD32_REG_T=int -Wall::${BSDthreads}:::BN_LLONG RC2_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC2 BF_PTR::bn_asm.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o::::::dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"BSD-ia64",	"gcc:-DL_ENDIAN -DTERMIOS -O3 -Wall::${BSDthreads}:::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_UNROLL DES_INT:${ia64_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"BSD-x86_64",	"gcc:-DL_ENDIAN -DTERMIOS -O3 -DMD32_REG_T=int -Wall::${BSDthreads}:::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",

+4 −4
Original line number Diff line number Diff line
@@ -142,7 +142,7 @@ $unistd =
$thread_cflag = -pthread -D_THREAD_SAFE -D_REENTRANT
$sys_id       = 
$lflags       = 
$bn_ops       = SIXTY_FOUR_BIT_LONG RC2_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC2 BF_PTR
$bn_ops       = BN_LLONG RC2_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC2 BF_PTR
$cpuid_obj    = 
$bn_obj       = bn_asm.o sparcv9a-mont.o
$des_obj      = des_enc-sparc.o fcrypt_b.o
@@ -2923,7 +2923,7 @@ $unistd =
$thread_cflag = -D_REENTRANT
$sys_id       = ULTRASPARC
$lflags       = -ldl
$bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR
$bn_ops       = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR
$cpuid_obj    = 
$bn_obj       = bn_asm.o sparcv9a-mont.o
$des_obj      = des_enc-sparc.o fcrypt_b.o
@@ -3625,7 +3625,7 @@ $unistd =
$thread_cflag = -D_REENTRANT
$sys_id       = ULTRASPARC
$lflags       = -lsocket -lnsl -ldl
$bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR
$bn_ops       = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR
$cpuid_obj    = 
$bn_obj       = bn_asm.o sparcv9a-mont.o
$des_obj      = des_enc-sparc.o fcrypt_b.o
@@ -3652,7 +3652,7 @@ $unistd =
$thread_cflag = -D_REENTRANT
$sys_id       = ULTRASPARC
$lflags       = -lsocket -lnsl -ldl
$bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR
$bn_ops       = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR
$cpuid_obj    = 
$bn_obj       = bn_asm.o sparcv9a-mont.o
$des_obj      = des_enc-sparc.o fcrypt_b.o
+64 −93
Original line number Diff line number Diff line
@@ -138,11 +138,7 @@ $fname:
	save	%sp,-$frame-$locals,%sp
	sethi	%hi(0xffff),$mask
	or	$mask,%lo(0xffff),$mask
___
$code.=<<___ if ($bits==64);
	ldx	[%i4],$n0		! $n0 reassigned, remember?
___
$code.=<<___ if ($bits==32);

	cmp	$num,4
	bl,a,pn %icc,.Lret
	clr	%i0
@@ -160,8 +156,7 @@ $code.=<<___ if ($bits==32);
	ld	[%i4+4],%o0
	sllx	%o0,32,%o0
	or	%o0,$n0,$n0		! $n0=n0[1].n0[0]
___
$code.=<<___;

	sll	$num,3,$num		! num*=8

	add	%sp,$bias,%o0		! real top of stack
@@ -188,48 +183,44 @@ $code.=<<___;

	stx	%o7,[%sp+$bias+$frame+48]	! save %asi

	sub	%g0,$num,$i
	sub	%g0,$num,$j
	sub	%g0,$num,$i		! i=-num
	sub	%g0,$num,$j		! j=-num

	add	$ap,$j,%o3
	add	$bp,$i,%o4
___
$code.=<<___ if ($bits==64);

	ldx	[$bp+$i],%o0		! bp[0]
	ldx	[$ap+$j],%o1		! ap[0]
___
$code.=<<___ if ($bits==32);
	ldd	[$bp+$i],%o0		! bp[0]
	ldd	[$ap+$j],%g2		! ap[0]
	sllx	%o1,32,%o1
	sllx	%g3,32,%g3
	or	%o0,%o1,%o0
	or	%g2,%g3,%o1
___
$code.=<<___;
	sllx	%o0,32,%g1
	sllx	%o1,32,%g5
	srlx	%o0,32,%o0
	srlx	%o1,32,%o1
	or	%g1,%o0,%o0
	or	%g5,%o1,%o1

	add	$np,$j,%o5

	mulx	%o1,%o0,%o0		! ap[0]*bp[0]
	mulx	$n0,%o0,%o0		! ap[0]*bp[0]*n0
	stx	%o0,[%sp+$bias+$frame+0]

	ld	[%o3+`$bits==32 ? 0 : 4`],$alo_	! load a[j] as pair of 32-bit words
	ld	[%o3+0],$alo_	! load a[j] as pair of 32-bit words
	fzeros	$alo
	ld	[%o3+`$bits==32 ? 4 : 0`],$ahi_
	ld	[%o3+4],$ahi_
	fzeros	$ahi
	ld	[%o5+`$bits==32 ? 0 : 4`],$nlo_	! load n[j] as pair of 32-bit words
	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
	fzeros	$nlo
	ld	[%o5+`$bits==32 ? 4 : 0`],$nhi_
	ld	[%o5+4],$nhi_
	fzeros	$nhi

	! transfer b[i] to FPU as 4x16-bit values
	ldda	[%o4+`$bits==32 ? 2 : 6`]%asi,$ba
	ldda	[%o4+2]%asi,$ba
	fxtod	$alo,$alo
	ldda	[%o4+`$bits==32 ? 0 : 4`]%asi,$bb
	ldda	[%o4+0]%asi,$bb
	fxtod	$ahi,$ahi
	ldda	[%o4+`$bits==32 ? 6 : 2`]%asi,$bc
	ldda	[%o4+6]%asi,$bc
	fxtod	$nlo,$nlo
	ldda	[%o4+`$bits==32 ? 4 : 0`]%asi,$bd
	ldda	[%o4+4]%asi,$bd
	fxtod	$nhi,$nhi

	! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
@@ -256,24 +247,24 @@ $code.=<<___;
		fmuld	$alo,$bb,$alob
		fmuld	$nlo,$nb,$nlob
		fmuld	$alo,$bc,$aloc
		fmuld	$nlo,$nc,$nloc
	faddd	$aloa,$nloa,$nloa
		fmuld	$nlo,$nc,$nloc
		fmuld	$alo,$bd,$alod
		fmuld	$nlo,$nd,$nlod
	faddd	$alob,$nlob,$nlob
		fmuld	$nlo,$nd,$nlod
		fmuld	$ahi,$ba,$ahia
		fmuld	$nhi,$na,$nhia
	faddd	$aloc,$nloc,$nloc
		fmuld	$nhi,$na,$nhia
		fmuld	$ahi,$bb,$ahib
		fmuld	$nhi,$nb,$nhib
	faddd	$alod,$nlod,$nlod
		fmuld	$nhi,$nb,$nhib
		fmuld	$ahi,$bc,$ahic
		fmuld	$nhi,$nc,$nhic
	faddd	$ahia,$nhia,$nhia
		fmuld	$nhi,$nc,$nhic
		fmuld	$ahi,$bd,$ahid
	faddd	$ahib,$nhib,$nhib
		fmuld	$nhi,$nd,$nhid

	faddd	$ahib,$nhib,$nhib
	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

@@ -317,13 +308,13 @@ $code.=<<___;
.L1st:
	add	$ap,$j,%o3
	add	$np,$j,%o4
	ld	[%o3+`$bits==32 ? 0 : 4`],$alo_	! load a[j] as pair of 32-bit words
	ld	[%o3+0],$alo_	! load a[j] as pair of 32-bit words
	fzeros	$alo
	ld	[%o3+`$bits==32 ? 4 : 0`],$ahi_
	ld	[%o3+4],$ahi_
	fzeros	$ahi
	ld	[%o4+`$bits==32 ? 0 : 4`],$nlo_	! load n[j] as pair of 32-bit words
	ld	[%o4+0],$nlo_	! load n[j] as pair of 32-bit words
	fzeros	$nlo
	ld	[%o4+`$bits==32 ? 4 : 0`],$nhi_
	ld	[%o4+4],$nhi_
	fzeros	$nhi

	fxtod	$alo,$alo
@@ -340,23 +331,23 @@ $code.=<<___;
	std	$nhi,[$np_h+$j]
		fmuld	$nlo,$nb,$nlob
		fmuld	$alo,$bc,$aloc
		fmuld	$nlo,$nc,$nloc
	faddd	$aloa,$nloa,$nloa
		fmuld	$nlo,$nc,$nloc
		fmuld	$alo,$bd,$alod
		fmuld	$nlo,$nd,$nlod
	faddd	$alob,$nlob,$nlob
		fmuld	$nlo,$nd,$nlod
		fmuld	$ahi,$ba,$ahia
		fmuld	$nhi,$na,$nhia
	faddd	$aloc,$nloc,$nloc
		fmuld	$nhi,$na,$nhia
		fmuld	$ahi,$bb,$ahib
		fmuld	$nhi,$nb,$nhib
	faddd	$alod,$nlod,$nlod
		fmuld	$nhi,$nb,$nhib
		fmuld	$ahi,$bc,$ahic
		fmuld	$nhi,$nc,$nhic
	faddd	$ahia,$nhia,$nhia
		fmuld	$nhi,$nc,$nhic
		fmuld	$ahi,$bd,$ahid
		fmuld	$nhi,$nd,$nhid
	faddd	$ahib,$nhib,$nhib
		fmuld	$nhi,$nd,$nhid

	faddd	$dota,$nloa,$nloa
	faddd	$dotb,$nlob,$nlob
@@ -429,36 +420,31 @@ $code.=<<___;
	add	$i,8,$i
.align	32
.Louter:
	sub	%g0,$num,$j
	sub	%g0,$num,$j		! j=-num
	add	%sp,$bias+$frame+$locals,$tp

	add	$bp,$i,%o4
___
$code.=<<___ if ($bits==64);

	ldx	[$bp+$i],%o0		! bp[i]
	ldx	[$ap+$j],%o1		! ap[0]
___
$code.=<<___ if ($bits==32);
	ldd	[$bp+$i],%o0		! bp[i]
	ldd	[$ap+$j],%g2		! ap[0]
	sllx	%o1,32,%o1
	sllx	%g3,32,%g3
	or	%o0,%o1,%o0
	or	%g2,%g3,%o1
___
$code.=<<___;
	sllx	%o0,32,%g1
	sllx	%o1,32,%g5
	srlx	%o0,32,%o0
	srlx	%o1,32,%o1
	or	%g1,%o0,%o0
	or	%g5,%o1,%o1

	ldx	[$tp],%o2		! tp[0]
	mulx	%o1,%o0,%o0
	addcc	%o2,%o0,%o0
	mulx	$n0,%o0,%o0		! (ap[0]*bp[i]+t[0])*n0
	stx	%o0,[%sp+$bias+$frame+0]


	! transfer b[i] to FPU as 4x16-bit values
	ldda	[%o4+`$bits==32 ? 2 : 6`]%asi,$ba
	ldda	[%o4+`$bits==32 ? 0 : 4`]%asi,$bb
	ldda	[%o4+`$bits==32 ? 6 : 2`]%asi,$bc
	ldda	[%o4+`$bits==32 ? 4 : 0`]%asi,$bd
	ldda	[%o4+2]%asi,$ba
	ldda	[%o4+0]%asi,$bb
	ldda	[%o4+6]%asi,$bc
	ldda	[%o4+4]%asi,$bd

	! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
	ldda	[%sp+$bias+$frame+6]%asi,$na
@@ -483,24 +469,24 @@ $code.=<<___;
		fmuld	$alo,$bb,$alob
		fmuld	$nlo,$nb,$nlob
		fmuld	$alo,$bc,$aloc
		fmuld	$nlo,$nc,$nloc
	faddd	$aloa,$nloa,$nloa
		fmuld	$nlo,$nc,$nloc
		fmuld	$alo,$bd,$alod
		fmuld	$nlo,$nd,$nlod
	faddd	$alob,$nlob,$nlob
		fmuld	$nlo,$nd,$nlod
		fmuld	$ahi,$ba,$ahia
		fmuld	$nhi,$na,$nhia
	faddd	$aloc,$nloc,$nloc
		fmuld	$nhi,$na,$nhia
		fmuld	$ahi,$bb,$ahib
		fmuld	$nhi,$nb,$nhib
	faddd	$alod,$nlod,$nlod
		fmuld	$nhi,$nb,$nhib
		fmuld	$ahi,$bc,$ahic
		fmuld	$nhi,$nc,$nhic
	faddd	$ahia,$nhia,$nhia
		fmuld	$nhi,$nc,$nhic
		fmuld	$ahi,$bd,$ahid
	faddd	$ahib,$nhib,$nhib
		fmuld	$nhi,$nd,$nhid

	faddd	$ahib,$nhib,$nhib
	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

@@ -558,24 +544,24 @@ $code.=<<___;
		fmuld	$alo,$bb,$alob
		fmuld	$nlo,$nb,$nlob
		fmuld	$alo,$bc,$aloc
		fmuld	$nlo,$nc,$nloc
	faddd	$aloa,$nloa,$nloa
		fmuld	$nlo,$nc,$nloc
		fmuld	$alo,$bd,$alod
		fmuld	$nlo,$nd,$nlod
	faddd	$alob,$nlob,$nlob
		fmuld	$nlo,$nd,$nlod
		fmuld	$ahi,$ba,$ahia
		fmuld	$nhi,$na,$nhia
	faddd	$aloc,$nloc,$nloc
		fmuld	$nhi,$na,$nhia
		fmuld	$ahi,$bb,$ahib
		fmuld	$nhi,$nb,$nhib
	faddd	$alod,$nlod,$nlod
		fmuld	$nhi,$nb,$nhib
		fmuld	$ahi,$bc,$ahic
		fmuld	$nhi,$nc,$nhic
	faddd	$ahia,$nhia,$nhia
		fmuld	$nhi,$nc,$nhic
		fmuld	$ahi,$bd,$ahid
	faddd	$ahib,$nhib,$nhib
		fmuld	$nhi,$nd,$nhid

	faddd	$ahib,$nhib,$nhib
	faddd	$dota,$nloa,$nloa
	faddd	$dotb,$nlob,$nlob
	faddd	$ahic,$nhic,$dota	! $nhic
@@ -661,7 +647,7 @@ $code.=<<___;
	add	$tp,8,$tp		! adjust tp to point at the end

	ld	[$tp-8],%o0
	ld	[$np-`$bits==32 ? 4 : 8`],%o1
	ld	[$np-4],%o1
	cmp	%o0,%o1			! compare topmost words
	bcs,pt	%icc,.Lcopy		! %icc.c is clean if not taken
	nop
@@ -670,41 +656,26 @@ $code.=<<___;
.Lsub:
	ldd	[$tp+%o7],%o0
	ldd	[$np+%o7],%o2
___
$code.=<<___ if ($bits==64);
	subccc	%o1,%o3,%o3
	subccc	%o0,%o2,%o2
___
$code.=<<___ if ($bits==32);
	subccc	%o1,%o2,%o2
	subccc	%o0,%o3,%o3
___
$code.=<<___;
	std	%o2,[$rp+%o7]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lsub
	nop
	subccc	$carry,0,$carry
	bcc,pt	%icc,.Lzap
	sub	%g0,$num,%o7
	sub	%g0,$num,%o7		! n=-num

.align	16,0x1000000
.Lcopy:
	ldx	[$tp+%o7],%o0
___
$code.=<<___ if ($bits==64);
	stx	%o0,[$rp+%o7]
___
$code.=<<___ if ($bits==32);
	srlx	%o0,32,%o1
	std	%o0,[$rp+%o7]
___
$code.=<<___;
	add	%o7,8,%o7
	brnz,pt	%o7,.Lcopy
	nop
	ba	.Lzap
	sub	%g0,$num,%o7
	sub	%g0,$num,%o7		! n=-num

.align	32
.Lzap: