Commit 4f5fac80 authored by Ulf Möller
Browse files

Sparc v8plus assembler.

Submitted by: Andy Polyakov <appro@fy.chalmers.se>
parent d872c55c
Loading
Loading
Loading
Loading
+3 −0
Original line number Original line Diff line number Diff line
@@ -5,6 +5,9 @@


 Changes between 0.9.2b and 0.9.3
 Changes between 0.9.2b and 0.9.3


  *) Sparc v8plus assembler for the bignum library.
    [Andy Polyakov <appro@fy.chalmers.se>]

  *) Accept any -xxx and +xxx compiler options in Configure.
  *) Accept any -xxx and +xxx compiler options in Configure.
     [Ulf Möller]
     [Ulf Möller]


+2 −2
Original line number Original line Diff line number Diff line
@@ -115,8 +115,8 @@ my %table=(
# Don't use -xtarget=ultra with SC4.2. It is broken, and will break exptest.
# Don't use -xtarget=ultra with SC4.2. It is broken, and will break exptest.
# SC5.0 with the compiler common patch works.
# SC5.0 with the compiler common patch works.
"solaris-sparc-sc4","cc:-xarch=v8 -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8.o::",
"solaris-sparc-sc4","cc:-xarch=v8 -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8.o::",
"solaris-usparc-sc4","cc:-xarch=v8plus -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8.o::",
"solaris-usparc-sc4","cc:-xarch=v8plus -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8plus.o::",
"solaris-usparc-sc5","cc:-xtarget=ultra -xarch=v8plus -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8.o::",
"solaris-usparc-sc5","cc:-xtarget=ultra -xarch=v8plus -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8plus.o::",


# Sunos configs, assuming sparc for the gcc one.
# Sunos configs, assuming sparc for the gcc one.
##"sunos-cc", "cc:-O4 -DNOPROTO -DNOCONST:(unknown)::DES_UNROLL:::",
##"sunos-cc", "cc:-O4 -DNOPROTO -DNOCONST:(unknown)::DES_UNROLL:::",
+3 −0
Original line number Original line Diff line number Diff line
@@ -101,6 +101,9 @@ asm/co86unix.cpp: asm/co-586.pl
asm/sparcv8.o: asm/sparcv8.S
asm/sparcv8.o: asm/sparcv8.S
	$(CC) -c -o asm/sparcv8.o asm/sparcv8.S
	$(CC) -c -o asm/sparcv8.o asm/sparcv8.S


# SPARC v8plus bignum assembler. The target is the object file itself so the
# rule matches the asm/sparcv8plus.o name requested by the Configure entries.
asm/sparcv8plus.o: asm/sparcv8plus.S
	$(CC) -c -xarch=v8plus -o asm/sparcv8plus.o asm/sparcv8plus.S

# MIPS 64 bit assembler
# MIPS 64 bit assembler
asm/mips3.o: asm/mips3.s
asm/mips3.o: asm/mips3.s
	/usr/bin/as -mips3 -O2 -o asm/mips3.o asm/mips3.s            
	/usr/bin/as -mips3 -O2 -o asm/mips3.o asm/mips3.s            
+110 −120
Original line number Original line Diff line number Diff line
.ident	"sparcv8.s, Version 1.1"
.ident	"sparcv8.s, Version 1.2"
.ident	"SPARC v8 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
.ident	"SPARC v8 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"


/*
/*
@@ -24,14 +24,14 @@
/*
/*
 * Revision history.
 * Revision history.
 *
 *
 * 1.1	- new loop unrolling model(*)
 * 1.1	- new loop unrolling model(*);
 *	- 10% performance boost(*)
 * 1.2	- made gas friendly;
 *
 *
 * (*)	see bn_asm.sparc.v8plus.S for details
 * (*)	see bn_asm.sparc.v8plus.S for details
 */
 */


.section	".text",#alloc,#execinstr
.section	".text",#alloc,#execinstr
.file		"sparcv8.s"
.file		"bn_asm.sparc.v8.S"


.align	32
.align	32


@@ -551,43 +551,33 @@ bn_sub_words:
/*
/*
 * Here is register usage map for *all* routines below.
 * Here is register usage map for *all* routines below.
 */
 */
#define t_1	%o0
#define	t_2	%o1
#define c_1	%o2
#define c_2	%o3
#define c_3	%o4

#define a(I)	[%i1+4*I]
#define b(I)	[%i2+4*I]
#define r(I)	[%i0+4*I]

#define	a_0	%l0
#define	a_0	%l0
#define a_0_	[%i1]
#define	a_1	%l1
#define	a_1	%l1
#define a_1_	[%i1+4]
#define	a_2	%l2
#define	a_2	%l2
#define a_2_	[%i1+8]
#define	a_3	%l3
#define	a_3	%l3
#define a_3_	[%i1+12]
#define	a_4	%l4
#define	a_4	%l4
#define a_4_	[%i1+16]
#define	a_5	%l5
#define	a_5	%l5
#define a_5_	[%i1+20]
#define	a_6	%l6
#define	a_6	%l6
#define a_6_	[%i1+24]
#define	a_7	%l7
#define	a_7	%l7
#define a_7_	[%i1+28]

#define	b_0	%g1
#define	b_0	%i3
#define b_0_	[%i2]
#define	b_1	%i4
#define	b_1	%g2
#define	b_2	%i5
#define b_1_	[%i2+4]
#define	b_3	%o5
#define	b_2	%g3
#define	b_4	%g1
#define b_2_	[%i2+8]
#define	b_5	%g2
#define	b_3	%g4
#define	b_6	%g3
#define b_3_	[%i2+12]
#define	b_7	%g4
#define	b_4	%i3
#define b_4_	[%i2+16]
#define	b_5	%i4
#define b_5_	[%i2+20]
#define	b_6	%i5
#define b_6_	[%i2+24]
#define	b_7	%o5
#define b_7_	[%i2+28]
#define c_1	%o2
#define c_2	%o3
#define c_3	%o4
#define t_1	%o0
#define	t_2	%o1


.align	32
.align	32
.global bn_mul_comba8
.global bn_mul_comba8
@@ -597,25 +587,25 @@ bn_sub_words:
 */
 */
bn_mul_comba8:
bn_mul_comba8:
	save	%sp,FRAME_SIZE,%sp
	save	%sp,FRAME_SIZE,%sp
	ld	a_0_,a_0
	ld	a(0),a_0
	ld	b_0_,b_0
	ld	b(0),b_0
	umul	a_0,b_0,c_1	!=!mul_add_c(a[0],b[0],c1,c2,c3);
	umul	a_0,b_0,c_1	!=!mul_add_c(a[0],b[0],c1,c2,c3);
	ld	b_1_,b_1
	ld	b(1),b_1
	rd	%y,c_2
	rd	%y,c_2
	st	c_1,[%i0]	!r[0]=c1;
	st	c_1,r(0)	!r[0]=c1;


	umul	a_0,b_1,t_1	!=!mul_add_c(a[0],b[1],c2,c3,c1);
	umul	a_0,b_1,t_1	!=!mul_add_c(a[0],b[1],c2,c3,c1);
	ld	a_1_,a_1
	ld	a(1),a_1
	addcc	c_2,t_1,c_2
	addcc	c_2,t_1,c_2
	rd	%y,t_2
	rd	%y,t_2
	addxcc	%g0,t_2,c_3	!=
	addxcc	%g0,t_2,c_3	!=
	addx	%g0,%g0,c_1
	addx	%g0,%g0,c_1
	ld	a_2_,a_2
	ld	a(2),a_2
	umul	a_1,b_0,t_1	!mul_add_c(a[1],b[0],c2,c3,c1);
	umul	a_1,b_0,t_1	!mul_add_c(a[1],b[0],c2,c3,c1);
	addcc	c_2,t_1,c_2	!=
	addcc	c_2,t_1,c_2	!=
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_3,t_2,c_3
	addxcc	c_3,t_2,c_3
	st	c_2,[%i0+4]	!r[1]=c2;
	st	c_2,r(1)	!r[1]=c2;
	addx	c_1,%g0,c_1	!=
	addx	c_1,%g0,c_1	!=


	umul	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
	umul	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
@@ -623,19 +613,19 @@ bn_mul_comba8:
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_1,t_2,c_1	!=
	addxcc	c_1,t_2,c_1	!=
	addx	%g0,%g0,c_2
	addx	%g0,%g0,c_2
	ld	b_2_,b_2
	ld	b(2),b_2
	umul	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
	umul	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
	addcc	c_3,t_1,c_3	!=
	addcc	c_3,t_1,c_3	!=
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	addxcc	c_1,t_2,c_1
	ld	b_3_,b_3
	ld	b(3),b_3
	addx	c_2,%g0,c_2	!=
	addx	c_2,%g0,c_2	!=
	umul	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
	umul	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
	addcc	c_3,t_1,c_3
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_1,t_2,c_1	!=
	addxcc	c_1,t_2,c_1	!=
	addx	c_2,%g0,c_2
	addx	c_2,%g0,c_2
	st	c_3,[%i0+8]	!r[2]=c3;
	st	c_3,r(2)	!r[2]=c3;


	umul	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
	umul	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
	addcc	c_1,t_1,c_1	!=
	addcc	c_1,t_1,c_1	!=
@@ -647,19 +637,19 @@ bn_mul_comba8:
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3	!=
	addx	c_3,%g0,c_3	!=
	ld	a_3_,a_3
	ld	a(3),a_3
	umul	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
	umul	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
	addcc	c_1,t_1,c_1
	addcc	c_1,t_1,c_1
	rd	%y,t_2		!=
	rd	%y,t_2		!=
	addxcc	c_2,t_2,c_2
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3
	addx	c_3,%g0,c_3
	ld	a_4_,a_4
	ld	a(4),a_4
	umul	a_3,b_0,t_1	!mul_add_c(a[3],b[0],c1,c2,c3);!=
	umul	a_3,b_0,t_1	!mul_add_c(a[3],b[0],c1,c2,c3);!=
	addcc	c_1,t_1,c_1
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3	!=
	addx	c_3,%g0,c_3	!=
	st	c_1,[%i0+12]	!r[3]=c1;
	st	c_1,r(3)	!r[3]=c1;


	umul	a_4,b_0,t_1	!mul_add_c(a[4],b[0],c2,c3,c1);
	umul	a_4,b_0,t_1	!mul_add_c(a[4],b[0],c2,c3,c1);
	addcc	c_2,t_1,c_2
	addcc	c_2,t_1,c_2
@@ -676,19 +666,19 @@ bn_mul_comba8:
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_3,t_2,c_3
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1	!=
	addx	c_1,%g0,c_1	!=
	ld	b_4_,b_4
	ld	b(4),b_4
	umul	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
	umul	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
	addcc	c_2,t_1,c_2
	addcc	c_2,t_1,c_2
	rd	%y,t_2		!=
	rd	%y,t_2		!=
	addxcc	c_3,t_2,c_3
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1
	addx	c_1,%g0,c_1
	ld	b_5_,b_5
	ld	b(5),b_5
	umul	a_0,b_4,t_1	!=!mul_add_c(a[0],b[4],c2,c3,c1);
	umul	a_0,b_4,t_1	!=!mul_add_c(a[0],b[4],c2,c3,c1);
	addcc	c_2,t_1,c_2
	addcc	c_2,t_1,c_2
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_3,t_2,c_3
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1	!=
	addx	c_1,%g0,c_1	!=
	st	c_2,[%i0+16]	!r[4]=c2;
	st	c_2,r(4)	!r[4]=c2;


	umul	a_0,b_5,t_1	!mul_add_c(a[0],b[5],c3,c1,c2);
	umul	a_0,b_5,t_1	!mul_add_c(a[0],b[5],c3,c1,c2);
	addcc	c_3,t_1,c_3
	addcc	c_3,t_1,c_3
@@ -710,19 +700,19 @@ bn_mul_comba8:
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_1,t_2,c_1	!=
	addxcc	c_1,t_2,c_1	!=
	addx	c_2,%g0,c_2
	addx	c_2,%g0,c_2
	ld	a_5_,a_5
	ld	a(5),a_5
	umul	a_4,b_1,t_1	!mul_add_c(a[4],b[1],c3,c1,c2);
	umul	a_4,b_1,t_1	!mul_add_c(a[4],b[1],c3,c1,c2);
	addcc	c_3,t_1,c_3	!=
	addcc	c_3,t_1,c_3	!=
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	addxcc	c_1,t_2,c_1
	ld	a_6_,a_6
	ld	a(6),a_6
	addx	c_2,%g0,c_2	!=
	addx	c_2,%g0,c_2	!=
	umul	a_5,b_0,t_1	!mul_add_c(a[5],b[0],c3,c1,c2);
	umul	a_5,b_0,t_1	!mul_add_c(a[5],b[0],c3,c1,c2);
	addcc	c_3,t_1,c_3
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_1,t_2,c_1	!=
	addxcc	c_1,t_2,c_1	!=
	addx	c_2,%g0,c_2
	addx	c_2,%g0,c_2
	st	c_3,[%i0+20]	!r[5]=c3;
	st	c_3,r(5)	!r[5]=c3;


	umul	a_6,b_0,t_1	!mul_add_c(a[6],b[0],c1,c2,c3);
	umul	a_6,b_0,t_1	!mul_add_c(a[6],b[0],c1,c2,c3);
	addcc	c_1,t_1,c_1	!=
	addcc	c_1,t_1,c_1	!=
@@ -748,19 +738,19 @@ bn_mul_comba8:
	addcc	c_1,t_1,c_1	!=
	addcc	c_1,t_1,c_1	!=
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	addxcc	c_2,t_2,c_2
	ld	b_6_,b_6
	ld	b(6),b_6
	addx	c_3,%g0,c_3	!=
	addx	c_3,%g0,c_3	!=
	umul	a_1,b_5,t_1	!mul_add_c(a[1],b[5],c1,c2,c3);
	umul	a_1,b_5,t_1	!mul_add_c(a[1],b[5],c1,c2,c3);
	addcc	c_1,t_1,c_1
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_2,t_2,c_2	!=
	addxcc	c_2,t_2,c_2	!=
	addx	c_3,%g0,c_3
	addx	c_3,%g0,c_3
	ld	b_7_,b_7
	ld	b(7),b_7
	umul	a_0,b_6,t_1	!mul_add_c(a[0],b[6],c1,c2,c3);
	umul	a_0,b_6,t_1	!mul_add_c(a[0],b[6],c1,c2,c3);
	addcc	c_1,t_1,c_1	!=
	addcc	c_1,t_1,c_1	!=
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	addxcc	c_2,t_2,c_2
	st	c_1,[%i0+24]	!r[6]=c1;
	st	c_1,r(6)	!r[6]=c1;
	addx	c_3,%g0,c_3	!=
	addx	c_3,%g0,c_3	!=


	umul	a_0,b_7,t_1	!mul_add_c(a[0],b[7],c2,c3,c1);
	umul	a_0,b_7,t_1	!mul_add_c(a[0],b[7],c2,c3,c1);
@@ -793,7 +783,7 @@ bn_mul_comba8:
	rd	%y,t_2		!=
	rd	%y,t_2		!=
	addxcc	c_3,t_2,c_3
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1
	addx	c_1,%g0,c_1
	ld	a_7_,a_7
	ld	a(7),a_7
	umul	a_6,b_1,t_1	!=!mul_add_c(a[6],b[1],c2,c3,c1);
	umul	a_6,b_1,t_1	!=!mul_add_c(a[6],b[1],c2,c3,c1);
	addcc	c_2,t_1,c_2
	addcc	c_2,t_1,c_2
	rd	%y,t_2
	rd	%y,t_2
@@ -804,7 +794,7 @@ bn_mul_comba8:
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_3,t_2,c_3	!=
	addxcc	c_3,t_2,c_3	!=
	addx	c_1,%g0,c_1
	addx	c_1,%g0,c_1
	st	c_2,[%i0+28]	!r[7]=c2;
	st	c_2,r(7)	!r[7]=c2;


	umul	a_7,b_1,t_1	!mul_add_c(a[7],b[1],c3,c1,c2);
	umul	a_7,b_1,t_1	!mul_add_c(a[7],b[1],c3,c1,c2);
	addcc	c_3,t_1,c_3	!=
	addcc	c_3,t_1,c_3	!=
@@ -841,7 +831,7 @@ bn_mul_comba8:
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_1,t_2,c_1	!
	addxcc	c_1,t_2,c_1	!
	addx	c_2,%g0,c_2
	addx	c_2,%g0,c_2
	st	c_3,[%i0+32]	!r[8]=c3;
	st	c_3,r(8)	!r[8]=c3;


	umul	a_2,b_7,t_1	!mul_add_c(a[2],b[7],c1,c2,c3);
	umul	a_2,b_7,t_1	!mul_add_c(a[2],b[7],c1,c2,c3);
	addcc	c_1,t_1,c_1	!=
	addcc	c_1,t_1,c_1	!=
@@ -873,7 +863,7 @@ bn_mul_comba8:
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3	!=
	addx	c_3,%g0,c_3	!=
	st	c_1,[%i0+36]	!r[9]=c1;
	st	c_1,r(9)	!r[9]=c1;


	umul	a_7,b_3,t_1	!mul_add_c(a[7],b[3],c2,c3,c1);
	umul	a_7,b_3,t_1	!mul_add_c(a[7],b[3],c2,c3,c1);
	addcc	c_2,t_1,c_2
	addcc	c_2,t_1,c_2
@@ -900,7 +890,7 @@ bn_mul_comba8:
	rd	%y,t_2		!=
	rd	%y,t_2		!=
	addxcc	c_3,t_2,c_3
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1
	addx	c_1,%g0,c_1
	st	c_2,[%i0+40]	!r[10]=c2;
	st	c_2,r(10)	!r[10]=c2;


	umul	a_4,b_7,t_1	!=!mul_add_c(a[4],b[7],c3,c1,c2);
	umul	a_4,b_7,t_1	!=!mul_add_c(a[4],b[7],c3,c1,c2);
	addcc	c_3,t_1,c_3
	addcc	c_3,t_1,c_3
@@ -921,7 +911,7 @@ bn_mul_comba8:
	addcc	c_3,t_1,c_3	!=
	addcc	c_3,t_1,c_3	!=
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	addxcc	c_1,t_2,c_1
	st	c_3,[%i0+44]	!r[11]=c3;
	st	c_3,r(11)	!r[11]=c3;
	addx	c_2,%g0,c_2	!=
	addx	c_2,%g0,c_2	!=


	umul	a_7,b_5,t_1	!mul_add_c(a[7],b[5],c1,c2,c3);
	umul	a_7,b_5,t_1	!mul_add_c(a[7],b[5],c1,c2,c3);
@@ -938,7 +928,7 @@ bn_mul_comba8:
	addcc	c_1,t_1,c_1	!=
	addcc	c_1,t_1,c_1	!=
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	addxcc	c_2,t_2,c_2
	st	c_1,[%i0+48]	!r[12]=c1;
	st	c_1,r(12)	!r[12]=c1;
	addx	c_3,%g0,c_3	!=
	addx	c_3,%g0,c_3	!=


	umul	a_6,b_7,t_1	!mul_add_c(a[6],b[7],c2,c3,c1);
	umul	a_6,b_7,t_1	!mul_add_c(a[6],b[7],c2,c3,c1);
@@ -951,15 +941,15 @@ bn_mul_comba8:
	rd	%y,t_2		!=
	rd	%y,t_2		!=
	addxcc	c_3,t_2,c_3
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1
	addx	c_1,%g0,c_1
	st	c_2,[%i0+52]	!r[13]=c2;
	st	c_2,r(13)	!r[13]=c2;


	umul	a_7,b_7,t_1	!=!mul_add_c(a[7],b[7],c3,c1,c2);
	umul	a_7,b_7,t_1	!=!mul_add_c(a[7],b[7],c3,c1,c2);
	addcc	c_3,t_1,c_3
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	addxcc	c_1,t_2,c_1
	nop			!=
	nop			!=
	st	c_3,[%i0+56]	!r[14]=c3;
	st	c_3,r(14)	!r[14]=c3;
	st	c_1,[%i0+60]	!r[15]=c1;
	st	c_1,r(15)	!r[15]=c1;


	ret
	ret
	restore	%g0,%g0,%o0
	restore	%g0,%g0,%o0
@@ -976,45 +966,45 @@ bn_mul_comba8:
 */
 */
bn_mul_comba4:
bn_mul_comba4:
	save	%sp,FRAME_SIZE,%sp
	save	%sp,FRAME_SIZE,%sp
	ld	a_0_,a_0
	ld	a(0),a_0
	ld	b_0_,b_0
	ld	b(0),b_0
	umul	a_0,b_0,c_1	!=!mul_add_c(a[0],b[0],c1,c2,c3);
	umul	a_0,b_0,c_1	!=!mul_add_c(a[0],b[0],c1,c2,c3);
	ld	b_1_,b_1
	ld	b(1),b_1
	rd	%y,c_2
	rd	%y,c_2
	st	c_1,[%i0]	!r[0]=c1;
	st	c_1,r(0)	!r[0]=c1;


	umul	a_0,b_1,t_1	!=!mul_add_c(a[0],b[1],c2,c3,c1);
	umul	a_0,b_1,t_1	!=!mul_add_c(a[0],b[1],c2,c3,c1);
	ld	a_1_,a_1
	ld	a(1),a_1
	addcc	c_2,t_1,c_2
	addcc	c_2,t_1,c_2
	rd	%y,t_2		!=
	rd	%y,t_2		!=
	addxcc	%g0,t_2,c_3
	addxcc	%g0,t_2,c_3
	addx	%g0,%g0,c_1
	addx	%g0,%g0,c_1
	ld	a_2_,a_2
	ld	a(2),a_2
	umul	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
	umul	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
	addcc	c_2,t_1,c_2
	addcc	c_2,t_1,c_2
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_3,t_2,c_3
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1	!=
	addx	c_1,%g0,c_1	!=
	st	c_2,[%i0+4]	!r[1]=c2;
	st	c_2,r(1)	!r[1]=c2;


	umul	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
	umul	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
	addcc	c_3,t_1,c_3
	addcc	c_3,t_1,c_3
	rd	%y,t_2		!=
	rd	%y,t_2		!=
	addxcc	c_1,t_2,c_1
	addxcc	c_1,t_2,c_1
	addx	%g0,%g0,c_2
	addx	%g0,%g0,c_2
	ld	b_2_,b_2
	ld	b(2),b_2
	umul	a_1,b_1,t_1	!=!mul_add_c(a[1],b[1],c3,c1,c2);
	umul	a_1,b_1,t_1	!=!mul_add_c(a[1],b[1],c3,c1,c2);
	addcc	c_3,t_1,c_3
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2	!=
	addx	c_2,%g0,c_2	!=
	ld	b_3_,b_3
	ld	b(3),b_3
	umul	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
	umul	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
	addcc	c_3,t_1,c_3
	addcc	c_3,t_1,c_3
	rd	%y,t_2		!=
	rd	%y,t_2		!=
	addxcc	c_1,t_2,c_1
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2
	addx	c_2,%g0,c_2
	st	c_3,[%i0+8]	!r[2]=c3;
	st	c_3,r(2)	!r[2]=c3;


	umul	a_0,b_3,t_1	!=!mul_add_c(a[0],b[3],c1,c2,c3);
	umul	a_0,b_3,t_1	!=!mul_add_c(a[0],b[3],c1,c2,c3);
	addcc	c_1,t_1,c_1
	addcc	c_1,t_1,c_1
@@ -1026,7 +1016,7 @@ bn_mul_comba4:
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_2,t_2,c_2	!=
	addxcc	c_2,t_2,c_2	!=
	addx	c_3,%g0,c_3
	addx	c_3,%g0,c_3
	ld	a_3_,a_3
	ld	a(3),a_3
	umul	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
	umul	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
	addcc	c_1,t_1,c_1	!=
	addcc	c_1,t_1,c_1	!=
	rd	%y,t_2
	rd	%y,t_2
@@ -1037,7 +1027,7 @@ bn_mul_comba4:
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3	!=
	addx	c_3,%g0,c_3	!=
	st	c_1,[%i0+12]	!r[3]=c1;
	st	c_1,r(3)	!r[3]=c1;


	umul	a_3,b_1,t_1	!mul_add_c(a[3],b[1],c2,c3,c1);
	umul	a_3,b_1,t_1	!mul_add_c(a[3],b[1],c2,c3,c1);
	addcc	c_2,t_1,c_2
	addcc	c_2,t_1,c_2
@@ -1054,7 +1044,7 @@ bn_mul_comba4:
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_3,t_2,c_3
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1	!=
	addx	c_1,%g0,c_1	!=
	st	c_2,[%i0+16]	!r[4]=c2;
	st	c_2,r(4)	!r[4]=c2;


	umul	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
	umul	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
	addcc	c_3,t_1,c_3
	addcc	c_3,t_1,c_3
@@ -1065,15 +1055,15 @@ bn_mul_comba4:
	addcc	c_3,t_1,c_3	!=
	addcc	c_3,t_1,c_3	!=
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	addxcc	c_1,t_2,c_1
	st	c_3,[%i0+20]	!r[5]=c3;
	st	c_3,r(5)	!r[5]=c3;
	addx	c_2,%g0,c_2	!=
	addx	c_2,%g0,c_2	!=


	umul	a_3,b_3,t_1	!mul_add_c(a[3],b[3],c1,c2,c3);
	umul	a_3,b_3,t_1	!mul_add_c(a[3],b[3],c1,c2,c3);
	addcc	c_1,t_1,c_1
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_2,t_2,c_2	!=
	addxcc	c_2,t_2,c_2	!=
	st	c_1,[%i0+24]	!r[6]=c1;
	st	c_1,r(6)	!r[6]=c1;
	st	c_2,[%i0+28]	!r[7]=c2;
	st	c_2,r(7)	!r[7]=c2;
	
	
	ret
	ret
	restore	%g0,%g0,%o0
	restore	%g0,%g0,%o0
@@ -1086,13 +1076,13 @@ bn_mul_comba4:
.global bn_sqr_comba8
.global bn_sqr_comba8
bn_sqr_comba8:
bn_sqr_comba8:
	save	%sp,FRAME_SIZE,%sp
	save	%sp,FRAME_SIZE,%sp
	ld	a_0_,a_0
	ld	a(0),a_0
	ld	a_1_,a_1
	ld	a(1),a_1
	umul	a_0,a_0,c_1	!=!sqr_add_c(a,0,c1,c2,c3);
	umul	a_0,a_0,c_1	!=!sqr_add_c(a,0,c1,c2,c3);
	rd	%y,c_2
	rd	%y,c_2
	st	c_1,[%i0]	!r[0]=c1;
	st	c_1,r(0)	!r[0]=c1;


	ld	a_2_,a_2
	ld	a(2),a_2
	umul	a_0,a_1,t_1	!=!sqr_add_c2(a,1,0,c2,c3,c1);
	umul	a_0,a_1,t_1	!=!sqr_add_c2(a,1,0,c2,c3,c1);
	addcc	c_2,t_1,c_2
	addcc	c_2,t_1,c_2
	rd	%y,t_2
	rd	%y,t_2
@@ -1100,7 +1090,7 @@ bn_sqr_comba8:
	addx	%g0,%g0,c_1	!=
	addx	%g0,%g0,c_1	!=
	addcc	c_2,t_1,c_2
	addcc	c_2,t_1,c_2
	addxcc	c_3,t_2,c_3
	addxcc	c_3,t_2,c_3
	st	c_2,[%i0+4]	!r[1]=c2;
	st	c_2,r(1)	!r[1]=c2;
	addx	c_1,%g0,c_1	!=
	addx	c_1,%g0,c_1	!=


	umul	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
	umul	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
@@ -1111,13 +1101,13 @@ bn_sqr_comba8:
	addcc	c_3,t_1,c_3
	addcc	c_3,t_1,c_3
	addxcc	c_1,t_2,c_1
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2	!=
	addx	c_2,%g0,c_2	!=
	ld	a_3_,a_3
	ld	a(3),a_3
	umul	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
	umul	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
	addcc	c_3,t_1,c_3
	addcc	c_3,t_1,c_3
	rd	%y,t_2		!=
	rd	%y,t_2		!=
	addxcc	c_1,t_2,c_1
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2
	addx	c_2,%g0,c_2
	st	c_3,[%i0+8]	!r[2]=c3;
	st	c_3,r(2)	!r[2]=c3;


	umul	a_0,a_3,t_1	!=!sqr_add_c2(a,3,0,c1,c2,c3);
	umul	a_0,a_3,t_1	!=!sqr_add_c2(a,3,0,c1,c2,c3);
	addcc	c_1,t_1,c_1
	addcc	c_1,t_1,c_1
@@ -1126,7 +1116,7 @@ bn_sqr_comba8:
	addx	%g0,%g0,c_3	!=
	addx	%g0,%g0,c_3	!=
	addcc	c_1,t_1,c_1
	addcc	c_1,t_1,c_1
	addxcc	c_2,t_2,c_2
	addxcc	c_2,t_2,c_2
	ld	a_4_,a_4
	ld	a(4),a_4
	addx	c_3,%g0,c_3	!=
	addx	c_3,%g0,c_3	!=
	umul	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
	umul	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
	addcc	c_1,t_1,c_1
	addcc	c_1,t_1,c_1
@@ -1136,7 +1126,7 @@ bn_sqr_comba8:
	addcc	c_1,t_1,c_1
	addcc	c_1,t_1,c_1
	addxcc	c_2,t_2,c_2
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3	!=
	addx	c_3,%g0,c_3	!=
	st	c_1,[%i0+12]	!r[3]=c1;
	st	c_1,r(3)	!r[3]=c1;


	umul	a_4,a_0,t_1	!sqr_add_c2(a,4,0,c2,c3,c1);
	umul	a_4,a_0,t_1	!sqr_add_c2(a,4,0,c2,c3,c1);
	addcc	c_2,t_1,c_2
	addcc	c_2,t_1,c_2
@@ -1154,12 +1144,12 @@ bn_sqr_comba8:
	addcc	c_2,t_1,c_2
	addcc	c_2,t_1,c_2
	addxcc	c_3,t_2,c_3	!=
	addxcc	c_3,t_2,c_3	!=
	addx	c_1,%g0,c_1
	addx	c_1,%g0,c_1
	ld	a_5_,a_5
	ld	a(5),a_5
	umul	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
	umul	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
	addcc	c_2,t_1,c_2	!=
	addcc	c_2,t_1,c_2	!=
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_3,t_2,c_3
	addxcc	c_3,t_2,c_3
	st	c_2,[%i0+16]	!r[4]=c2;
	st	c_2,r(4)	!r[4]=c2;
	addx	c_1,%g0,c_1	!=
	addx	c_1,%g0,c_1	!=


	umul	a_0,a_5,t_1	!sqr_add_c2(a,5,0,c3,c1,c2);
	umul	a_0,a_5,t_1	!sqr_add_c2(a,5,0,c3,c1,c2);
@@ -1178,7 +1168,7 @@ bn_sqr_comba8:
	addcc	c_3,t_1,c_3
	addcc	c_3,t_1,c_3
	addxcc	c_1,t_2,c_1
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2	!=
	addx	c_2,%g0,c_2	!=
	ld	a_6_,a_6
	ld	a(6),a_6
	umul	a_2,a_3,t_1	!sqr_add_c2(a,3,2,c3,c1,c2);
	umul	a_2,a_3,t_1	!sqr_add_c2(a,3,2,c3,c1,c2);
	addcc	c_3,t_1,c_3
	addcc	c_3,t_1,c_3
	rd	%y,t_2		!=
	rd	%y,t_2		!=
@@ -1187,7 +1177,7 @@ bn_sqr_comba8:
	addcc	c_3,t_1,c_3
	addcc	c_3,t_1,c_3
	addxcc	c_1,t_2,c_1	!=
	addxcc	c_1,t_2,c_1	!=
	addx	c_2,%g0,c_2
	addx	c_2,%g0,c_2
	st	c_3,[%i0+20]	!r[5]=c3;
	st	c_3,r(5)	!r[5]=c3;


	umul	a_6,a_0,t_1	!sqr_add_c2(a,6,0,c1,c2,c3);
	umul	a_6,a_0,t_1	!sqr_add_c2(a,6,0,c1,c2,c3);
	addcc	c_1,t_1,c_1	!=
	addcc	c_1,t_1,c_1	!=
@@ -1213,13 +1203,13 @@ bn_sqr_comba8:
	addcc	c_1,t_1,c_1	!=
	addcc	c_1,t_1,c_1	!=
	addxcc	c_2,t_2,c_2
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3
	addx	c_3,%g0,c_3
	ld	a_7_,a_7
	ld	a(7),a_7
	umul	a_3,a_3,t_1	!=!sqr_add_c(a,3,c1,c2,c3);
	umul	a_3,a_3,t_1	!=!sqr_add_c(a,3,c1,c2,c3);
	addcc	c_1,t_1,c_1
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3	!=
	addx	c_3,%g0,c_3	!=
	st	c_1,[%i0+24]	!r[6]=c1;
	st	c_1,r(6)	!r[6]=c1;


	umul	a_0,a_7,t_1	!sqr_add_c2(a,7,0,c2,c3,c1);
	umul	a_0,a_7,t_1	!sqr_add_c2(a,7,0,c2,c3,c1);
	addcc	c_2,t_1,c_2
	addcc	c_2,t_1,c_2
@@ -1253,7 +1243,7 @@ bn_sqr_comba8:
	addcc	c_2,t_1,c_2
	addcc	c_2,t_1,c_2
	addxcc	c_3,t_2,c_3	!=
	addxcc	c_3,t_2,c_3	!=
	addx	c_1,%g0,c_1
	addx	c_1,%g0,c_1
	st	c_2,[%i0+28]	!r[7]=c2;
	st	c_2,r(7)	!r[7]=c2;


	umul	a_7,a_1,t_1	!sqr_add_c2(a,7,1,c3,c1,c2);
	umul	a_7,a_1,t_1	!sqr_add_c2(a,7,1,c3,c1,c2);
	addcc	c_3,t_1,c_3	!=
	addcc	c_3,t_1,c_3	!=
@@ -1283,7 +1273,7 @@ bn_sqr_comba8:
	addcc	c_3,t_1,c_3	!=
	addcc	c_3,t_1,c_3	!=
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	addxcc	c_1,t_2,c_1
	st	c_3,[%i0+32]	!r[8]=c3;
	st	c_3,r(8)	!r[8]=c3;
	addx	c_2,%g0,c_2	!=
	addx	c_2,%g0,c_2	!=


	umul	a_2,a_7,t_1	!sqr_add_c2(a,7,2,c1,c2,c3);
	umul	a_2,a_7,t_1	!sqr_add_c2(a,7,2,c1,c2,c3);
@@ -1310,7 +1300,7 @@ bn_sqr_comba8:
	addcc	c_1,t_1,c_1
	addcc	c_1,t_1,c_1
	addxcc	c_2,t_2,c_2
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3	!=
	addx	c_3,%g0,c_3	!=
	st	c_1,[%i0+36]	!r[9]=c1;
	st	c_1,r(9)	!r[9]=c1;


	umul	a_7,a_3,t_1	!sqr_add_c2(a,7,3,c2,c3,c1);
	umul	a_7,a_3,t_1	!sqr_add_c2(a,7,3,c2,c3,c1);
	addcc	c_2,t_1,c_2
	addcc	c_2,t_1,c_2
@@ -1333,7 +1323,7 @@ bn_sqr_comba8:
	rd	%y,t_2		!=
	rd	%y,t_2		!=
	addxcc	c_3,t_2,c_3
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1
	addx	c_1,%g0,c_1
	st	c_2,[%i0+40]	!r[10]=c2;
	st	c_2,r(10)	!r[10]=c2;


	umul	a_4,a_7,t_1	!=!sqr_add_c2(a,7,4,c3,c1,c2);
	umul	a_4,a_7,t_1	!=!sqr_add_c2(a,7,4,c3,c1,c2);
	addcc	c_3,t_1,c_3
	addcc	c_3,t_1,c_3
@@ -1350,7 +1340,7 @@ bn_sqr_comba8:
	addx	c_2,%g0,c_2	!=
	addx	c_2,%g0,c_2	!=
	addcc	c_3,t_1,c_3
	addcc	c_3,t_1,c_3
	addxcc	c_1,t_2,c_1
	addxcc	c_1,t_2,c_1
	st	c_3,[%i0+44]	!r[11]=c3;
	st	c_3,r(11)	!r[11]=c3;
	addx	c_2,%g0,c_2	!=
	addx	c_2,%g0,c_2	!=


	umul	a_7,a_5,t_1	!sqr_add_c2(a,7,5,c1,c2,c3);
	umul	a_7,a_5,t_1	!sqr_add_c2(a,7,5,c1,c2,c3);
@@ -1366,7 +1356,7 @@ bn_sqr_comba8:
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_2,t_2,c_2	!=
	addxcc	c_2,t_2,c_2	!=
	addx	c_3,%g0,c_3
	addx	c_3,%g0,c_3
	st	c_1,[%i0+48]	!r[12]=c1;
	st	c_1,r(12)	!r[12]=c1;


	umul	a_6,a_7,t_1	!sqr_add_c2(a,7,6,c2,c3,c1);
	umul	a_6,a_7,t_1	!sqr_add_c2(a,7,6,c2,c3,c1);
	addcc	c_2,t_1,c_2	!=
	addcc	c_2,t_1,c_2	!=
@@ -1376,15 +1366,15 @@ bn_sqr_comba8:
	addcc	c_2,t_1,c_2	!=
	addcc	c_2,t_1,c_2	!=
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_3,t_2,c_3
	addxcc	c_3,t_2,c_3
	st	c_2,[%i0+52]	!r[13]=c2;
	st	c_2,r(13)	!r[13]=c2;
	addx	c_1,%g0,c_1	!=
	addx	c_1,%g0,c_1	!=


	umul	a_7,a_7,t_1	!sqr_add_c(a,7,c3,c1,c2);
	umul	a_7,a_7,t_1	!sqr_add_c(a,7,c3,c1,c2);
	addcc	c_3,t_1,c_3
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_1,t_2,c_1	!=
	addxcc	c_1,t_2,c_1	!=
	st	c_3,[%i0+56]	!r[14]=c3;
	st	c_3,r(14)	!r[14]=c3;
	st	c_1,[%i0+60]	!r[15]=c1;
	st	c_1,r(15)	!r[15]=c1;


	ret
	ret
	restore	%g0,%g0,%o0
	restore	%g0,%g0,%o0
@@ -1401,23 +1391,23 @@ bn_sqr_comba8:
 */
 */
bn_sqr_comba4:
bn_sqr_comba4:
	save	%sp,FRAME_SIZE,%sp
	save	%sp,FRAME_SIZE,%sp
	ld	a_0_,a_0
	ld	a(0),a_0
	umul	a_0,a_0,c_1	!sqr_add_c(a,0,c1,c2,c3);
	umul	a_0,a_0,c_1	!sqr_add_c(a,0,c1,c2,c3);
	ld	a_1_,a_1	!=
	ld	a(1),a_1	!=
	rd	%y,c_2
	rd	%y,c_2
	st	c_1,[%i0]	!r[0]=c1;
	st	c_1,r(0)	!r[0]=c1;


	ld	a_1_,a_1
	ld	a(1),a_1
	umul	a_0,a_1,t_1	!=!sqr_add_c2(a,1,0,c2,c3,c1);
	umul	a_0,a_1,t_1	!=!sqr_add_c2(a,1,0,c2,c3,c1);
	addcc	c_2,t_1,c_2
	addcc	c_2,t_1,c_2
	rd	%y,t_2
	rd	%y,t_2
	addxcc	%g0,t_2,c_3
	addxcc	%g0,t_2,c_3
	addx	%g0,%g0,c_1	!=
	addx	%g0,%g0,c_1	!=
	ld	a_2_,a_2
	ld	a(2),a_2
	addcc	c_2,t_1,c_2
	addcc	c_2,t_1,c_2
	addxcc	c_3,t_2,c_3
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1	!=
	addx	c_1,%g0,c_1	!=
	st	c_2,[%i0+4]	!r[1]=c2;
	st	c_2,r(1)	!r[1]=c2;


	umul	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
	umul	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
	addcc	c_3,t_1,c_3
	addcc	c_3,t_1,c_3
@@ -1427,12 +1417,12 @@ bn_sqr_comba4:
	addcc	c_3,t_1,c_3
	addcc	c_3,t_1,c_3
	addxcc	c_1,t_2,c_1	!=
	addxcc	c_1,t_2,c_1	!=
	addx	c_2,%g0,c_2
	addx	c_2,%g0,c_2
	ld	a_3_,a_3
	ld	a(3),a_3
	umul	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
	umul	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
	addcc	c_3,t_1,c_3	!=
	addcc	c_3,t_1,c_3	!=
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	addxcc	c_1,t_2,c_1
	st	c_3,[%i0+8]	!r[2]=c3;
	st	c_3,r(2)	!r[2]=c3;
	addx	c_2,%g0,c_2	!=
	addx	c_2,%g0,c_2	!=


	umul	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
	umul	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
@@ -1451,7 +1441,7 @@ bn_sqr_comba4:
	addcc	c_1,t_1,c_1
	addcc	c_1,t_1,c_1
	addxcc	c_2,t_2,c_2
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3	!=
	addx	c_3,%g0,c_3	!=
	st	c_1,[%i0+12]	!r[3]=c1;
	st	c_1,r(3)	!r[3]=c1;


	umul	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
	umul	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
	addcc	c_2,t_1,c_2
	addcc	c_2,t_1,c_2
@@ -1466,7 +1456,7 @@ bn_sqr_comba4:
	rd	%y,t_2		!=
	rd	%y,t_2		!=
	addxcc	c_3,t_2,c_3
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1
	addx	c_1,%g0,c_1
	st	c_2,[%i0+16]	!r[4]=c2;
	st	c_2,r(4)	!r[4]=c2;


	umul	a_2,a_3,t_1	!=!sqr_add_c2(a,3,2,c3,c1,c2);
	umul	a_2,a_3,t_1	!=!sqr_add_c2(a,3,2,c3,c1,c2);
	addcc	c_3,t_1,c_3
	addcc	c_3,t_1,c_3
@@ -1475,20 +1465,20 @@ bn_sqr_comba4:
	addx	%g0,%g0,c_2	!=
	addx	%g0,%g0,c_2	!=
	addcc	c_3,t_1,c_3
	addcc	c_3,t_1,c_3
	addxcc	c_1,t_2,c_1
	addxcc	c_1,t_2,c_1
	st	c_3,[%i0+20]	!r[5]=c3;
	st	c_3,r(5)	!r[5]=c3;
	addx	c_2,%g0,c_2	!=
	addx	c_2,%g0,c_2	!=


	umul	a_3,a_3,t_1	!sqr_add_c(a,3,c1,c2,c3);
	umul	a_3,a_3,t_1	!sqr_add_c(a,3,c1,c2,c3);
	addcc	c_1,t_1,c_1
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	rd	%y,t_2
	addxcc	c_2,t_2,c_2	!=
	addxcc	c_2,t_2,c_2	!=
	st	c_1,[%i0+24]	!r[6]=c1;
	st	c_1,r(6)	!r[6]=c1;
	st	c_2,[%i0+28]	!r[7]=c2;
	st	c_2,r(7)	!r[7]=c2;
	
	
	ret
	ret
	restore	%g0,%g0,%o0
	restore	%g0,%g0,%o0


.type	bn_sqr_comba4,#function
.type	bn_sqr_comba4,#function
.size	bn_sqr_comba4,(.-bn_sqr_comba4)
.size	bn_sqr_comba4,(.-bn_sqr_comba4)
.align	32


.align	32
+1569 −0

File added.

Preview size limit exceeded, changes collapsed.