Commit f22e1e4d authored by Andy Polyakov
Browse files

UltraSPARC assembler DES implementation tune-up. The code can be
compiled for any SPARC CPU (UltraSPARC performance is *not* affected),
can be compiled for the 64-bit ABI and is position-independent.
parent e0d769ca
Loading
Loading
Loading
Loading
+356 −214
Original line number Diff line number Diff line
@@ -7,6 +7,11 @@
!
!  June 8, 2000.
!
!  Version 2.0. 32/64-bit, PIC-ification, blended CPU adaptation
!		by Andy Polyakov.
!
!  January 1, 2003.
!
!  Assembler version: Copyright Svend Olaf Mikkelsen.
!
!  Original C code: Copyright Eric A. Young.
@@ -27,9 +32,45 @@
!  Assemble through gcc: gcc -c -mcpu=ultrasparc -o des_enc.o des_enc.S
!
!  Assemble through cc:  cc -c -xarch=v8plusa -o des_enc.o des_enc.S
!
!  Performance improvement according to './apps/openssl speed des'
!
!	32-bit build:
!		23%  faster than cc-5.2 -xarch=v8plus -xO5
!		115% faster than gcc-3.2.1 -m32 -mcpu=ultrasparc -O5
!	64-bit build:
!		50%  faster than cc-5.2 -xarch=v9 -xO5
!		100% faster than gcc-3.2.1 -m64 -mcpu=ultrasparc -O5
!


.ident "des_enc.m4 1.0"
.ident "des_enc.m4 2.0"

! Build-time ABI selection.
!
! ABI64 is switched on when the compiler reports a 64-bit SPARC target.
! The settings that follow give the rest of this file one spelling for the
! stack frame size, the stack bias, pointer-sized loads and stores, and the
! layout of the argument save area, so the same code serves both ABIs.
#if defined(__SUNPRO_C) && defined(__sparcv9)
# define ABI64  /* They've said -xarch=v9 at command line */
#elif defined(__GNUC__) && defined(__arch64__)
# define ABI64  /* They've said -m64 at command line */
#endif

#ifdef ABI64
! 64-bit build.
! Declare g2 and g3 as scratch registers to the assembler.
! NOTE(review): presumably needed because 64-bit assemblers refuse
! undeclared use of these globals -- confirm against the assembler manual.
  .register	%g2,#scratch
  .register	%g3,#scratch
! FRAME : operand of the save instruction at every entry point.
! BIAS  : constant added to the stack or frame pointer whenever a stack
!         slot is addressed (2047 here, 0 in the 32-bit build).
!         NOTE(review): 2047 matches the stack bias of the 64-bit SPARC
!         ABI -- confirm.
! LDPTR / STPTR : load and store mnemonics used for pointer values
!         (ldx/stx here, ld/st in the 32-bit build).
! ARG0  : offset, on top of BIAS, of the stack slot for argument 0.
! ARGSZ : size in bytes of one argument slot.
# define	FRAME	-192
# define	BIAS	2047
# define	LDPTR	ldx
# define	STPTR	stx
# define	ARG0	128
# define	ARGSZ	8
! A 64-bit build always turns on the code paths that the rest of the file
! guards with OPENSSL_SYSNAME_ULTRASPARC (predicted %icc branches and the
! lda/sta accesses with ASI 0x88).
# ifndef OPENSSL_SYSNAME_ULTRASPARC
# define OPENSSL_SYSNAME_ULTRASPARC
# endif
#else
! 32-bit build: same names, 32-bit values.  OPENSSL_SYSNAME_ULTRASPARC is
! left as the builder set it.
# define	FRAME	-96
# define	BIAS	0
# define	LDPTR	ld
# define	STPTR	st
# define	ARG0	68
# define	ARGSZ	4
#endif

#define LOOPS 7

@@ -125,13 +166,13 @@ define(ip_macro, {
	srl	$1, 16, local4
	xor	$2, local1, $2

	ifelse($9,1,{ld	KS3, in4},{})
	ifelse($9,1,{LDPTR	KS3, in4},{})
	xor	local4, $2, local4
	sethi	%hi(des_SPtrans), global1 ! sbox addr
	nop	!sethi	%hi(DES_SPtrans), global1 ! sbox addr

	ifelse($9,1,{ld	KS2, in3},{})
	ifelse($9,1,{LDPTR	KS2, in3},{})
	and	local4, local2, local4
	or	global1, %lo(des_SPtrans), global1   ! sbox addr
	nop	!or	global1, %lo(DES_SPtrans), global1   ! sbox addr

	sll	local4, 16, local1
	xor	$2, local4, $2
@@ -260,7 +301,7 @@ define(rounds_macro, {
	xor	$2, out0, local1

	ld	[out2+284], local5        ! 0x0000FC00
	ba,pt	%icc, $4
	ba	$4
	and	local1, 252, local1

	.align 32
@@ -273,16 +314,16 @@ $4:
	ld	[global1+local1], local1
	xor	$2, out1, out1            ! 8642
	xor	$2, out0, out0            ! 7531
	fxor	%f0, %f0, %f0             ! fxor used for alignment
	fmovs	%f0, %f0                  ! fxor used for alignment

	srl	out1, 4, local0           ! rotate 4 right
	and	out0, local5, local3      ! 3
	fxor	%f0, %f0, %f0
	fmovs	%f0, %f0

	ld	[$5+$3*8], local7         ! key 7531 next round
	srl	local3, 8, local3         ! 3
	and	local0, 252, local2       ! 2
	fxor	%f0, %f0, %f0
	fmovs	%f0, %f0

	ld	[global3+local3],local3   ! 3
	sll	out1, 28, out1            ! rotate
@@ -380,7 +421,11 @@ $4:
	xor	$2, local1, $2            ! 1 finished

	xor	$2, local2, $2            ! 3 finished
#ifdef OPENSSL_SYSNAME_ULTRASPARC
	bne,pt	%icc, $4
#else
	bne	$4
#endif
	and	local4, 252, local1       ! sbox 1 next round

! two rounds more:
@@ -551,10 +596,10 @@ define(fp_macro, {
	sethi	%hi(0x0f0f0f0f), local4
	sll	local3, 2, local2

	ifelse($4,1, {ld INPUT, local5})
	ifelse($4,1, {LDPTR INPUT, local5})
	xor	$1, local3, $1

	ifelse($4,1, {ld OUTPUT, local7})
	ifelse($4,1, {LDPTR OUTPUT, local7})
	srl	$1, 16, local3
	xor	$2, local2, $2
	xor	local3, $2, local3
@@ -702,7 +747,7 @@ define(fp_ip_macro, {
	sll	temp1, 4, temp2
	xor	$1, temp1, $1

	ifelse($5,1,{ld	KS2, in4})
	ifelse($5,1,{LDPTR	KS2, in4})
	sll	$4, 3, local2
	xor	local4, temp2, $2

@@ -713,7 +758,7 @@ define(fp_ip_macro, {
	srl	$3, 29, local0
	ifelse($5,1,{add in4, 120, in4})

	ifelse($5,1,{ld	KS1, in3})
	ifelse($5,1,{LDPTR	KS1, in3})
	srl	$4, 29, local7

	or	local0, local5, $4
@@ -738,6 +783,7 @@ define(load_little_endian, {

	! first in memory to rightmost in register

#ifdef OPENSSL_SYSNAME_ULTRASPARC
	andcc	$1, 3, global0
	bne,pn	%icc, $5
	nop
@@ -747,6 +793,7 @@ define(load_little_endian, {

	ba,pt	%icc, $5a
	lda	[$4] 0x88, $3
#endif

$5:
	ldub	[$1+3], $2
@@ -799,6 +846,7 @@ define(load_little_endian_inc, {

	! first in memory to rightmost in register

#ifdef OPENSSL_SYSNAME_ULTRASPARC
	andcc	$1, 3, global0
	bne,pn	%icc, $5
	nop
@@ -809,6 +857,7 @@ define(load_little_endian_inc, {
	lda	[$1] 0x88, $3
	ba,pt	%icc, $5a
	add	$1, 4, $1
#endif

$5:
	ldub	[$1+3], $2
@@ -863,17 +912,17 @@ define(load_n_bytes, {
! {load_n_bytes}
! $1 $2 $5 $6 $7 $8 $7 $8 $9

$7.0:	call	.+8
	sll	$2, 2, $6

	sethi	%hi($7.jmp.table), $5
	or	$5, %lo($7.jmp.table), $5
	add	%o7,$7.jmp.table-$7.0,$5

	add	$5, $6, $5
	mov	0, $4

	ld	[$5], $5

	jmp	$5
	jmp	%o7+$5
	mov	0, $3

$7.7:
@@ -901,20 +950,20 @@ $7.2:
	or	$4, $5, $4
$7.1:
	ldub	[$1+0], $5
	ba,pt	%icc, $8
	ba	$8
	or	$4, $5, $4

	.align 4

$7.jmp.table:
	.word	0
	.word	$7.1
	.word	$7.2
	.word	$7.3
	.word	$7.4
	.word	$7.5
	.word	$7.6
	.word	$7.7
	.word	$7.1-$7.0
	.word	$7.2-$7.0
	.word	$7.3-$7.0
	.word	$7.4-$7.0
	.word	$7.5-$7.0
	.word	$7.6-$7.0
	.word	$7.7-$7.0
})


@@ -932,6 +981,7 @@ define(store_little_endian, {

	! rightmost in register to first in memory

#ifdef OPENSSL_SYSNAME_ULTRASPARC
	andcc	$1, 3, global0
	bne,pn	%icc, $5
	nop
@@ -941,6 +991,7 @@ define(store_little_endian, {

	ba,pt	%icc, $5a
	sta	$3, [$4] 0x88
#endif

$5:
	and	$2, 255, $4
@@ -995,15 +1046,16 @@ define(store_n_bytes, {
! {store_n_bytes}
! $1 $2 $5 $6 $7 $8 $7 $8 $9

$7.0:	call	.+8
	sll	$2, 2, $6

	sethi	%hi($7.jmp.table), $5
	or	$5, %lo($7.jmp.table), $5
	add	%o7,$7.jmp.table-$7.0,$5

	add	$5, $6, $5

	ld	[$5], $5
	jmp	$5

	jmp	%o7+$5
	nop

$7.7:
@@ -1032,7 +1084,7 @@ $7.1:
	and	$4, 0xff, $5


	ba,pt	%icc, $8
	ba	$8
	stub	$5, [$1]

	.align 4
@@ -1040,13 +1092,13 @@ $7.1:
$7.jmp.table:

	.word	0
	.word	$7.1
	.word	$7.2
	.word	$7.3
	.word	$7.4
	.word	$7.5
	.word	$7.6
	.word	$7.7
	.word	$7.1-$7.0
	.word	$7.2-$7.0
	.word	$7.3-$7.0
	.word	$7.4-$7.0
	.word	$7.5-$7.0
	.word	$7.6-$7.0
	.word	$7.7-$7.0
})


@@ -1089,64 +1141,6 @@ define(register_init, {

})

	.global .des_and
.section	".rodata"
	.align 8
	.type	 .des_and,#object
	.size	 .des_and,284

.des_and:

! This table is used for AND 0xFC when it is known that register
! bits 8-31 are zero. Makes it possible to do three arithmetic
! operations in one cycle.

	.byte  0, 0, 0, 0, 4, 4, 4, 4
	.byte  8, 8, 8, 8, 12, 12, 12, 12
	.byte  16, 16, 16, 16, 20, 20, 20, 20
	.byte  24, 24, 24, 24, 28, 28, 28, 28
	.byte  32, 32, 32, 32, 36, 36, 36, 36
	.byte  40, 40, 40, 40, 44, 44, 44, 44
	.byte  48, 48, 48, 48, 52, 52, 52, 52
	.byte  56, 56, 56, 56, 60, 60, 60, 60
	.byte  64, 64, 64, 64, 68, 68, 68, 68
	.byte  72, 72, 72, 72, 76, 76, 76, 76
	.byte  80, 80, 80, 80, 84, 84, 84, 84
	.byte  88, 88, 88, 88, 92, 92, 92, 92
	.byte  96, 96, 96, 96, 100, 100, 100, 100
	.byte  104, 104, 104, 104, 108, 108, 108, 108
	.byte  112, 112, 112, 112, 116, 116, 116, 116
	.byte  120, 120, 120, 120, 124, 124, 124, 124
	.byte  128, 128, 128, 128, 132, 132, 132, 132
	.byte  136, 136, 136, 136, 140, 140, 140, 140
	.byte  144, 144, 144, 144, 148, 148, 148, 148
	.byte  152, 152, 152, 152, 156, 156, 156, 156
	.byte  160, 160, 160, 160, 164, 164, 164, 164
	.byte  168, 168, 168, 168, 172, 172, 172, 172
	.byte  176, 176, 176, 176, 180, 180, 180, 180
	.byte  184, 184, 184, 184, 188, 188, 188, 188
	.byte  192, 192, 192, 192, 196, 196, 196, 196
	.byte  200, 200, 200, 200, 204, 204, 204, 204
	.byte  208, 208, 208, 208, 212, 212, 212, 212
	.byte  216, 216, 216, 216, 220, 220, 220, 220
	.byte  224, 224, 224, 224, 228, 228, 228, 228
	.byte  232, 232, 232, 232, 236, 236, 236, 236
	.byte  240, 240, 240, 240, 244, 244, 244, 244
	.byte  248, 248, 248, 248, 252, 252, 252, 252

	! 5 numbers for initial/final permutation

	.word   0x0f0f0f0f                ! offset 256
	.word	0x0000ffff                ! 260
	.word	0x33333333                ! 264
	.word	0x00ff00ff                ! 268
	.word	0x55555555                ! 272

	.word	0                         ! 276
	.word	LOOPS                     ! 280
	.word	0x0000FC00                ! 284


.section	".text"

	.align 32
@@ -1173,24 +1167,29 @@ define(register_init, {



! void des_encrypt(data, ks, enc)
! void DES_encrypt1(data, ks, enc)
! *******************************

	.align 32
	.global des_encrypt
	.type	 des_encrypt,#function
	.global DES_encrypt1
	.type	 DES_encrypt1,#function

des_encrypt:
DES_encrypt1:

	save	%sp, -96, %sp
	save	%sp, FRAME, %sp

	call	.PIC.me.up
	mov	.PIC.me.up-(.-4),out0

	ld	[in0], in5                ! left
	sethi	%hi(.des_and), out2       ! address constants
	cmp	in2, 0                    ! enc

	ld	[in0+4], out5             ! right
#ifdef OPENSSL_SYSNAME_ULTRASPARC
	be,pn	%icc, .encrypt.dec        ! enc/dec
	or	out2, %lo(.des_and), out2 ! address constants
#else
	be	.encrypt.dec
#endif
	ld	[in0+4], out5             ! right

	! parameter 6  1/2 for include encryption/decryption
	! parameter 7  1 for move in1 to in3
@@ -1198,12 +1197,12 @@ des_encrypt:

	ip_macro(in5, out5, in5, out5, in3, 0, 1, 1)

	rounds_macro(in5, out5, 1, .des_encrypt.1, in3, in4) ! in4 not used
	rounds_macro(in5, out5, 1, .des_encrypt1.1, in3, in4) ! in4 not used

	fp_macro(in5, out5, 1)            ! 1 for store to [in0]

	return	in7+8
	nop
	ret
	restore

.encrypt.dec:

@@ -1217,34 +1216,35 @@ des_encrypt:

	fp_macro(out5, in5, 1)            ! 1 for store to [in0]

	return	in7+8
	nop
	ret
	restore

.des_encrypt.end:
	.size	 des_encrypt,.des_encrypt.end-des_encrypt
.DES_encrypt1.end:
	.size	 DES_encrypt1,.DES_encrypt1.end-DES_encrypt1


! void des_encrypt2(data, ks, enc)
! void DES_encrypt2(data, ks, enc)
!*********************************

	! encrypts/decrypts without initial/final permutation

	.align 32
	.global des_encrypt2
	.type	 des_encrypt2,#function
	.global DES_encrypt2
	.type	 DES_encrypt2,#function

DES_encrypt2:

des_encrypt2:
	save	%sp, FRAME, %sp

	save	%sp, -112, %sp
	call	.PIC.me.up
	mov	.PIC.me.up-(.-4),out0

	! Set sbox address 1 to 6 and rotate halves 3 left
	! Errors caught by destest? Yes. Still? *NO*

	sethi	%hi(des_SPtrans), global1 ! address sbox 1
	sethi	%hi(.des_and), out2       ! address constants
	!sethi	%hi(DES_SPtrans), global1 ! address sbox 1

	or	global1, %lo(des_SPtrans), global1  ! sbox 1
	or	out2, %lo(.des_and), out2 ! adress constants
	!or	global1, %lo(DES_SPtrans), global1  ! sbox 1

	add	global1, 256, global2     ! sbox 2
	add	global1, 512, global3     ! sbox 3
@@ -1273,8 +1273,12 @@ des_encrypt2:

	! we use our own stackframe

#ifdef OPENSSL_SYSNAME_ULTRASPARC
	be,pn	%icc, .encrypt2.dec       ! decryption
	st	in0, [%sp+68]
#else
	be	.encrypt2.dec
#endif
	STPTR	in0, [%sp+BIAS+ARG0+0*ARGSZ]

	ld	[in3], out0               ! key 7531 first round
	mov	LOOPS, out4               ! loop counter
@@ -1291,13 +1295,13 @@ des_encrypt2:
	sll	out5, 29, in1
	add	in5, in0, in5
	srl	out5, 3, out5
	ld	[%sp+68], in0
	LDPTR	[%sp+BIAS+ARG0+0*ARGSZ], in0
	add	out5, in1, out5
	st	in5, [in0]
	st	out5, [in0+4]

	return	in7+8
	nop
	ret
	restore


.encrypt2.dec:
@@ -1324,36 +1328,37 @@ des_encrypt2:
	sll	out5, 29, in1
	add	in5, in0, in5
	srl	out5, 3, out5
	ld	[%sp+68], in0
	LDPTR	[%sp+BIAS+ARG0+0*ARGSZ], in0
	add	out5, in1, out5
	st	out5, [in0]
	st	in5, [in0+4]

	return	in7+8
	nop
	ret
	restore

.des_encrypt2.end:
	.size	 des_encrypt2, .des_encrypt2.end-des_encrypt2
.DES_encrypt2.end:
	.size	 DES_encrypt2, .DES_encrypt2.end-DES_encrypt2


! void des_encrypt3(data, ks1, ks2, ks3)
! void DES_encrypt3(data, ks1, ks2, ks3)
! **************************************

	.align 32
	.global des_encrypt3
	.type	 des_encrypt3,#function
	.global DES_encrypt3
	.type	 DES_encrypt3,#function

DES_encrypt3:

des_encrypt3:
	save	%sp, FRAME, %sp
	
	save	%sp, -96, %sp
	call	.PIC.me.up
	mov	.PIC.me.up-(.-4),out0

	ld	[in0], in5                ! left
	add	in2, 120, in4             ! ks2
	sethi	%hi(.des_and), out2       ! address constants

	ld	[in0+4], out5             ! right
	mov	in3, in2                  ! save ks3
	or	out2, %lo(.des_and), out2 ! address constants

	! parameter 6  1/2 for include encryption/decryption
	! parameter 7  1 for mov in1 to in3
@@ -1370,31 +1375,32 @@ des_encrypt3:

	fp_macro(in5, out5, 1)

	return	in7+8
	nop
	ret
	restore

.des_encrypt3.end:
	.size	 des_encrypt3,.des_encrypt3.end-des_encrypt3
.DES_encrypt3.end:
	.size	 DES_encrypt3,.DES_encrypt3.end-DES_encrypt3


! void des_decrypt3(data, ks1, ks2, ks3)
! void DES_decrypt3(data, ks1, ks2, ks3)
! **************************************

	.align 32
	.global des_decrypt3
	.type	 des_decrypt3,#function
	.global DES_decrypt3
	.type	 DES_decrypt3,#function

DES_decrypt3:

des_decrypt3:
	save	%sp, FRAME, %sp
	
	save	%sp, -96, %sp
	call	.PIC.me.up
	mov	.PIC.me.up-(.-4),out0

	ld	[in0], in5                ! left
	add	in3, 120, in4             ! ks3
	sethi	%hi(.des_and), out2

	ld	[in0+4], out5             ! right
	mov	in2, in3                  ! ks2
	or	out2, %lo(.des_and), out2

	! parameter 6  1/2 for include encryption/decryption
	! parameter 7  1 for mov in1 to in3
@@ -1411,44 +1417,128 @@ des_decrypt3:

	fp_macro(out5, in5, 1)

	return	in7+8
	nop
	ret
	restore

.DES_decrypt3.end:
	.size	 DES_decrypt3,.DES_decrypt3.end-DES_decrypt3

! .PIC.me.up -- address setup helper called from every entry point.
!
! It works out, without absolute relocations in the PIC build, where the
! constant table .des_and and the S-box table DES_SPtrans live.
!
! Calling sequence used by the entry points:
!	call	.PIC.me.up
!	mov	.PIC.me.up-(.-4),out0	(delay slot)
! The mov sits 4 bytes after the call, so out0 receives the distance from
! the call instruction to this label, and the call leaves the address of
! the call instruction in %o7.
!
! input:	out0	offset between .PIC.me.up and caller
! output:	out0	pointer to .PIC.me.up
!		out2	pointer to .des_and
!		global1	pointer to DES_SPtrans
	.align	32
.PIC.me.up:
	add	out0,%o7,out0			! pointer to .PIC.me.up

#ifdef __PIC__
	! PIC build: read the address of DES_SPtrans from memory at
	! out2+global1.  After the adds, global1 is the %hi/%lo value of
	! DES_SPtrans plus the run-time address of .PIC.me.up, and out2 is
	! the %hi/%lo value of the _GLOBAL_OFFSET_TABLE_ expression.
	! NOTE(review): presumably the toolchain resolves the first pair to
	! the GOT slot offset of DES_SPtrans and the second to the distance
	! from .PIC.me.up to the GOT, so the sum is the address of the GOT
	! entry -- confirm against the assembler and linker documentation.
	sethi	%hi(DES_SPtrans),global1
	or	global1,%lo(DES_SPtrans),global1
	sethi	%hi(_GLOBAL_OFFSET_TABLE_-(.PIC.me.up-.)),out2
	add	global1,out0,global1
	add	out2,%lo(_GLOBAL_OFFSET_TABLE_-(.PIC.me.up-.)),out2
	LDPTR	[out2+global1],global1
#else
	! Non-PIC build: put the address of DES_SPtrans straight into global1.
	! NOTE(review): out2 looks like the temporary register operand of
	! setn -- confirm against the assembler manual.
	setn	DES_SPtrans,out2,global1	! synthetic instruction !
#endif

	! Return to the caller.  The delay slot overwrites whatever out2 held
	! above with the address of .des_and, computed as the run-time address
	! of .PIC.me.up plus the assemble-time distance between the two labels.
	retl
	add	out0,.des_and-.PIC.me.up,out2

	! .des_and -- constant table used by the DES rounds and permutations.
	!
	! Layout: 256 bytes of AND-0xFC lookup data followed by eight 32-bit
	! words at offsets 256 through 284.  The code reaches the words by
	! fixed offset from the table start, so the order must not change.
	! The table is placed after .PIC.me.up and is located at run time as
	! an offset from that label.
	!
	! The object is 256 + 8*4 = 288 bytes long; .size states that total
	! so that the symbol covers the last word as well.
	.align	256
	.type	 .des_and,#object
	.size	 .des_and,288

.des_and:

! This table is used for AND 0xFC when it is known that register
! bits 8-31 are zero. Makes it possible to do three arithmetic
! operations in one cycle.

	.byte  0, 0, 0, 0, 4, 4, 4, 4
	.byte  8, 8, 8, 8, 12, 12, 12, 12
	.byte  16, 16, 16, 16, 20, 20, 20, 20
	.byte  24, 24, 24, 24, 28, 28, 28, 28
	.byte  32, 32, 32, 32, 36, 36, 36, 36
	.byte  40, 40, 40, 40, 44, 44, 44, 44
	.byte  48, 48, 48, 48, 52, 52, 52, 52
	.byte  56, 56, 56, 56, 60, 60, 60, 60
	.byte  64, 64, 64, 64, 68, 68, 68, 68
	.byte  72, 72, 72, 72, 76, 76, 76, 76
	.byte  80, 80, 80, 80, 84, 84, 84, 84
	.byte  88, 88, 88, 88, 92, 92, 92, 92
	.byte  96, 96, 96, 96, 100, 100, 100, 100
	.byte  104, 104, 104, 104, 108, 108, 108, 108
	.byte  112, 112, 112, 112, 116, 116, 116, 116
	.byte  120, 120, 120, 120, 124, 124, 124, 124
	.byte  128, 128, 128, 128, 132, 132, 132, 132
	.byte  136, 136, 136, 136, 140, 140, 140, 140
	.byte  144, 144, 144, 144, 148, 148, 148, 148
	.byte  152, 152, 152, 152, 156, 156, 156, 156
	.byte  160, 160, 160, 160, 164, 164, 164, 164
	.byte  168, 168, 168, 168, 172, 172, 172, 172
	.byte  176, 176, 176, 176, 180, 180, 180, 180
	.byte  184, 184, 184, 184, 188, 188, 188, 188
	.byte  192, 192, 192, 192, 196, 196, 196, 196
	.byte  200, 200, 200, 200, 204, 204, 204, 204
	.byte  208, 208, 208, 208, 212, 212, 212, 212
	.byte  216, 216, 216, 216, 220, 220, 220, 220
	.byte  224, 224, 224, 224, 228, 228, 228, 228
	.byte  232, 232, 232, 232, 236, 236, 236, 236
	.byte  240, 240, 240, 240, 244, 244, 244, 244
	.byte  248, 248, 248, 248, 252, 252, 252, 252

! NOTE(review): the label and .size directive below name des_decrypt3, the
! pre-rename spelling of DES_decrypt3.  They look like leftover pre-change
! lines rather than part of this table -- confirm and drop.
.des_decrypt3.end:
	.size	 des_decrypt3,.des_decrypt3.end-des_decrypt3
	! 5 numbers for initial/final permutation

	.word   0x0f0f0f0f                ! offset 256
	.word	0x0000ffff                ! 260
	.word	0x33333333                ! 264
	.word	0x00ff00ff                ! 268
	.word	0x55555555                ! 272

	.word	0                         ! 276
	.word	LOOPS                     ! 280
	.word	0x0000FC00                ! 284

! void des_ncbc_encrypt(input, output, length, schedule, ivec, enc)
! void DES_ncbc_encrypt(input, output, length, schedule, ivec, enc)
! *****************************************************************


	.align 32
	.global des_ncbc_encrypt
	.type	 des_ncbc_encrypt,#function
	.global DES_ncbc_encrypt
	.type	 DES_ncbc_encrypt,#function

DES_ncbc_encrypt:

des_ncbc_encrypt:
	save	%sp, FRAME, %sp
	
	save	%sp, -96, %sp
	define({INPUT},  { [%sp+BIAS+ARG0+0*ARGSZ] })
	define({OUTPUT}, { [%sp+BIAS+ARG0+1*ARGSZ] })
	define({IVEC},   { [%sp+BIAS+ARG0+4*ARGSZ] })

	define({INPUT}, { [%sp+68] })
	define({OUTPUT}, { [%sp+72] })
	define({IVEC}, { [%sp+84] })
	call	.PIC.me.up
	mov	.PIC.me.up-(.-4),out0

	cmp	in5, 0                    ! enc   

	sethi	%hi(.des_and), out2       ! address constants
#ifdef OPENSSL_SYSNAME_ULTRASPARC
	be,pn	%icc, .ncbc.dec
	st	in4, IVEC
#else
	be	.ncbc.dec
#endif
	STPTR	in4, IVEC

	! addr  left  right  temp  label
	load_little_endian(in4, in5, out5, local3, .LLE1)  ! iv

	addcc	in2, -8, in2              ! bytes missing when first block done

	mov	in3, in4                  ! schedule
#ifdef OPENSSL_SYSNAME_ULTRASPARC
	bl,pn	%icc, .ncbc.enc.seven.or.less
	or	out2, %lo(.des_and), out2
#else
	bl	.ncbc.enc.seven.or.less
#endif
	mov	in3, in4                  ! schedule

.ncbc.enc.next.block:

@@ -1471,7 +1561,11 @@ des_ncbc_encrypt:

	rounds_macro(in5, out5, 1, .ncbc.enc.1, in3, in4) ! include encryption  ks in3

#ifdef OPENSSL_SYSNAME_ULTRASPARC
	bl,pn	%icc, .ncbc.enc.next.block_fp
#else
	bl	.ncbc.enc.next.block_fp
#endif
	add	in0, 8, in0               ! input address

	! If 8 or more bytes are to be encrypted after this block,
@@ -1501,7 +1595,7 @@ des_ncbc_encrypt:
	add	global1, 512, global3     ! address sbox 3 since register used
	xor	global4, local1, out5     ! iv xor next block

	ba,pt	%icc, .ncbc.enc.next.block_2
	ba	.ncbc.enc.next.block_2
	add	in1, 8, in1               ! output adress

.ncbc.enc.next.block_fp:
@@ -1512,14 +1606,22 @@ des_ncbc_encrypt:

	addcc   in2, -8, in2              ! bytes missing when next block done

#ifdef OPENSSL_SYSNAME_ULTRASPARC
	bpos,pt	%icc, .ncbc.enc.next.block  ! also jumps if 0
#else
	bpos	.ncbc.enc.next.block
#endif
	add	in1, 8, in1

.ncbc.enc.seven.or.less:

	cmp	in2, -8

#ifdef OPENSSL_SYSNAME_ULTRASPARC
	ble,pt	%icc, .ncbc.enc.finish
#else
	ble	.ncbc.enc.finish
#endif
	nop

	add	in2, 8, local1            ! bytes to load
@@ -1532,25 +1634,28 @@ des_ncbc_encrypt:

.ncbc.enc.finish:

	ld	IVEC, local4
	LDPTR	IVEC, local4
	store_little_endian(local4, in5, out5, local5, .SLE2)  ! ivec

	return	in7+8
	nop
	ret
	restore


.ncbc.dec:

	st	in0, INPUT
	STPTR	in0, INPUT
	cmp	in2, 0                    ! length
	add	in3, 120, in3

	ld	IVEC, local7              ! ivec
	LDPTR	IVEC, local7              ! ivec
#ifdef OPENSSL_SYSNAME_ULTRASPARC
	ble,pn	%icc, .ncbc.dec.finish
#else
	ble	.ncbc.dec.finish
#endif
	mov	in3, in4                  ! schedule

	st	in1, OUTPUT
	or	out2, %lo(.des_and), out2 ! address constants low part
	STPTR	in1, OUTPUT
	mov	in0, local5               ! input

	load_little_endian(local7, in0, in1, local3, .LLE3)   ! ivec
@@ -1571,7 +1676,11 @@ des_ncbc_encrypt:
	! in2 is compared to 8 in the rounds

	xor	out5, in0, out4           ! iv xor
#ifdef OPENSSL_SYSNAME_ULTRASPARC
	bl,pn	%icc, .ncbc.dec.seven.or.less
#else
	bl	.ncbc.dec.seven.or.less
#endif
	xor	in5, in1, global4         ! iv xor

	! Load ivec next block now, since input and output address might be the same.
@@ -1580,23 +1689,27 @@ des_ncbc_encrypt:

	store_little_endian(local7, out4, global4, local3, .SLE3)

	st	local5, INPUT
	STPTR	local5, INPUT
	add	local7, 8, local7
	addcc   in2, -8, in2

#ifdef OPENSSL_SYSNAME_ULTRASPARC
	bg,pt	%icc, .ncbc.dec.next.block
	st	local7, OUTPUT
#else
	bg	.ncbc.dec.next.block
#endif
	STPTR	local7, OUTPUT


.ncbc.dec.store.iv:

	ld	IVEC, local4              ! ivec
	LDPTR	IVEC, local4              ! ivec
	store_little_endian(local4, in0, in1, local5, .SLE4)

.ncbc.dec.finish:

	return	in7+8
	nop
	ret
	restore

.ncbc.dec.seven.or.less:

@@ -1605,45 +1718,52 @@ des_ncbc_encrypt:
	store_n_bytes(local7, in2, global4, out4, local3, local4, .SNB1, .ncbc.dec.store.iv)


.des_ncbc_encrypt.end:
	.size	 des_ncbc_encrypt, .des_ncbc_encrypt.end-des_ncbc_encrypt
.DES_ncbc_encrypt.end:
	.size	 DES_ncbc_encrypt, .DES_ncbc_encrypt.end-DES_ncbc_encrypt


! void des_ede3_cbc_encrypt(input, output, length, ks1, ks2, ks3, ivec, enc)
! void DES_ede3_cbc_encrypt(input, output, length, ks1, ks2, ks3, ivec, enc)
! **************************************************************************


	.align 32
	.global des_ede3_cbc_encrypt
	.type	 des_ede3_cbc_encrypt,#function
	.global DES_ede3_cbc_encrypt
	.type	 DES_ede3_cbc_encrypt,#function

des_ede3_cbc_encrypt:
DES_ede3_cbc_encrypt:

	save	%sp, -96, %sp
	save	%sp, FRAME, %sp

	define({LENGTH},{ [%sp+76] })
	define({KS1}, { [%sp+80] })
	define({KS2}, { [%sp+84] })
	define({KS3}, { [%sp+88] })
	define({KS1}, { [%sp+BIAS+ARG0+3*ARGSZ] })
	define({KS2}, { [%sp+BIAS+ARG0+4*ARGSZ] })
	define({KS3}, { [%sp+BIAS+ARG0+5*ARGSZ] })

	ld	[%fp+96], local3          ! enc
	sethi	%hi(.des_and), out2
	call	.PIC.me.up
	mov	.PIC.me.up-(.-4),out0

	ld	[%fp+92], local4          ! ivec
	or	out2, %lo(.des_and), out2
	LDPTR	[%fp+BIAS+ARG0+7*ARGSZ], local3          ! enc
	LDPTR	[%fp+BIAS+ARG0+6*ARGSZ], local4          ! ivec
	cmp	local3, 0                 ! enc

#ifdef OPENSSL_SYSNAME_ULTRASPARC
	be,pn	%icc, .ede3.dec
	st	in4, KS2
#else
	be	.ede3.dec
#endif
	STPTR	in4, KS2

	st	in5, KS3
	STPTR	in5, KS3

	load_little_endian(local4, in5, out5, local3, .LLE6)  ! ivec

	addcc	in2, -8, in2              ! bytes missing after next block

#ifdef OPENSSL_SYSNAME_ULTRASPARC
	bl,pn	%icc,  .ede3.enc.seven.or.less
	st	in3, KS1
#else
	bl	.ede3.enc.seven.or.less
#endif
	STPTR	in3, KS1

.ede3.enc.next.block:

@@ -1651,11 +1771,11 @@ des_ede3_cbc_encrypt:

.ede3.enc.next.block_1:

	ld	KS2, in4
	LDPTR	KS2, in4
	xor	in5, out4, in5            ! iv xor
	xor	out5, global4, out5       ! iv xor

	ld	KS1, in3
	LDPTR	KS1, in3
	add	in4, 120, in4             ! for decryption we use last subkey first
	nop

@@ -1667,12 +1787,16 @@ des_ede3_cbc_encrypt:
	nop

	call .des_dec                     ! ks2 in4
	ld	KS3, in3
	LDPTR	KS3, in3

	call .des_enc                     ! ks3 in3  compares in2 to 8
	nop

#ifdef OPENSSL_SYSNAME_ULTRASPARC
	bl,pn	%icc, .ede3.enc.next.block_fp
#else
	bl	.ede3.enc.next.block_fp
#endif
	add	in0, 8, in0

	! If 8 or more bytes are to be encrypted after this block,
@@ -1703,7 +1827,7 @@ des_ede3_cbc_encrypt:

	ld	[in3+4], out1             ! key 8642
	add	global1, 768, global4     ! address sbox 4
	ba,pt	%icc, .ede3.enc.next.block_2
	ba	.ede3.enc.next.block_2
	add	in1, 8, in1

.ede3.enc.next.block_fp:
@@ -1714,14 +1838,22 @@ des_ede3_cbc_encrypt:

	addcc   in2, -8, in2              ! bytes missing when next block done

#ifdef OPENSSL_SYSNAME_ULTRASPARC
	bpos,pt	%icc, .ede3.enc.next.block
#else
	bpos	.ede3.enc.next.block
#endif
	add	in1, 8, in1

.ede3.enc.seven.or.less:

	cmp	in2, -8

#ifdef OPENSSL_SYSNAME_ULTRASPARC
	ble,pt	%icc, .ede3.enc.finish
#else
	ble	.ede3.enc.finish
#endif
	nop

	add	in2, 8, local1            ! bytes to load
@@ -1731,29 +1863,32 @@ des_ede3_cbc_encrypt:

.ede3.enc.finish:

	ld	[%fp+92], local4          ! ivec
	LDPTR	[%fp+BIAS+ARG0+6*ARGSZ], local4          ! ivec
	store_little_endian(local4, in5, out5, local5, .SLE6)  ! ivec

	return	in7+8
	nop

	ret
	restore

.ede3.dec:

	st	in0, INPUT
	STPTR	in0, INPUT
	add	in5, 120, in5

	st	in1, OUTPUT
	STPTR	in1, OUTPUT
	mov	in0, local5
	add	in3, 120, in3

	st	in3, KS1
	STPTR	in3, KS1
	cmp	in2, 0

#ifdef OPENSSL_SYSNAME_ULTRASPARC
	ble	%icc, .ede3.dec.finish
	st	in5, KS3
#else
	ble	.ede3.dec.finish
#endif
	STPTR	in5, KS3

	ld	[%fp+92], local7          ! iv
	LDPTR	[%fp+BIAS+ARG0+6*ARGSZ], local7          ! iv
	load_little_endian(local7, in0, in1, local3, .LLE8)

.ede3.dec.next.block:
@@ -1768,7 +1903,7 @@ des_ede3_cbc_encrypt:
	ip_macro(in5, out5, out5, in5, in4, 2, 0, 0, 1) ! inc .des_dec ks3 in4

	call .des_enc                     ! ks2 in3
	ld	KS1, in4
	LDPTR	KS1, in4

	call .des_dec                     ! ks1 in4
	nop
@@ -1779,30 +1914,37 @@ des_ede3_cbc_encrypt:
	! in2 is compared to 8 in the rounds

	xor	out5, in0, out4
#ifdef OPENSSL_SYSNAME_ULTRASPARC
	bl,pn	%icc, .ede3.dec.seven.or.less
#else
	bl	.ede3.dec.seven.or.less
#endif
	xor	in5, in1, global4

	load_little_endian_inc(local5, in0, in1, local3, .LLE10)   ! iv next block

	store_little_endian(local7, out4, global4, local3, .SLE7)  ! block

	st	local5, INPUT
	STPTR	local5, INPUT
	addcc   in2, -8, in2
	add	local7, 8, local7

#ifdef OPENSSL_SYSNAME_ULTRASPARC
	bg,pt	%icc, .ede3.dec.next.block
	st	local7, OUTPUT
#else
	bg	.ede3.dec.next.block
#endif
	STPTR	local7, OUTPUT

.ede3.dec.store.iv:

	ld	[%fp+92], local4          ! ivec
	LDPTR	[%fp+BIAS+ARG0+6*ARGSZ], local4          ! ivec
	store_little_endian(local4, in0, in1, local5, .SLE8)  ! ivec

.ede3.dec.finish:

	return	in7+8
	nop

	ret
	restore

.ede3.dec.seven.or.less:

@@ -1811,5 +1953,5 @@ des_ede3_cbc_encrypt:
	store_n_bytes(local7, in2, global4, out4, local3, local4, .SNB2, .ede3.dec.store.iv)


.des_ede3_cbc_encrypt.end:
	.size	 des_ede3_cbc_encrypt,.des_ede3_cbc_encrypt.end-des_ede3_cbc_encrypt
.DES_ede3_cbc_encrypt.end:
	.size	 DES_ede3_cbc_encrypt,.DES_ede3_cbc_encrypt.end-DES_ede3_cbc_encrypt