Commit 5526e579 authored by Andy Polyakov's avatar Andy Polyakov Committed by Dr. Stephen Henson
Browse files

Add some C64x assembly modules [by minor adjustments of C64x+ modules].



AES, SHA256 and SHA512 modules can actually replace corresponding
C64x+ modules. This is because C64x+ instructions don't actually
provide "killer-argument" advantage in these modules. As for SHA1,
even though its performance is exactly the same, the C64x+ module is more
responsive to interrupts, i.e. it doesn't inhibit them for as long
periods as the C64x module does.

Reviewed-by: default avatarRich Salz <rsalz@openssl.org>
Reviewed-by: default avatarTim Hudson <tjh@openssl.org>
Reviewed-by: default avatarStephen Henson <steve@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/4265)
parent fe36a698
Loading
Loading
Loading
Loading
+1375 −0

File added.

Preview size limit exceeded, changes collapsed.

crypto/c64xcpuid.pl

0 → 100644
+326 −0
Original line number Diff line number Diff line
#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# Consume command-line arguments until one looks like a file name
# ("name.ext"); that argument becomes the output file.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
# Redirect STDOUT to the output file.  The three-argument form keeps the
# name from being parsed for mode characters, and a failed open now aborts
# instead of silently discarding the generated code.  When no output name
# was given, STDOUT is left as it is.
if (defined($output) && $output ne "") {
    open(STDOUT,">",$output) || die "can't open $output: $!";
}

$code.=<<___;
	.text

	; For assembler versions below 7000000 define __TI_EABI__ as 0
	; (presumably those versions do not predefine it), so that the .if
	; tests on it still assemble.
	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	; For EABI builds make each underscore-prefixed name used below
	; expand to the same name without the leading underscore.
	.if	__TI_EABI__
	.asg	OPENSSL_rdtsc,_OPENSSL_rdtsc
	.asg	OPENSSL_cleanse,_OPENSSL_cleanse
	.asg	CRYPTO_memcmp,_CRYPTO_memcmp
	.asg	OPENSSL_atomic_add,_OPENSSL_atomic_add
	.asg	OPENSSL_wipe_cpu,_OPENSSL_wipe_cpu
	.asg	OPENSSL_instrument_bus,_OPENSSL_instrument_bus
	.asg	OPENSSL_instrument_bus2,_OPENSSL_instrument_bus2
	.endif

	; RA names B3, the register every routine below branches to in
	; order to return.
	.asg	B3,RA
	.asg	0x01AC0000,TIMER_BASE	; Timer 2

	; OPENSSL_rdtsc: return the current Timer 2 count (CTN) in A4.
	; If the timer control word (CTL) reads back as zero, the timer is
	; also started before returning: 0x2c0 and an all-ones period are
	; written with a single 64-bit store at the timer base.
	; Registers written: A2, A4, A5, A6, A7.
	.global	_OPENSSL_rdtsc
_OPENSSL_rdtsc:
	.asmfunc
	MVKL	TIMER_BASE,A5
	MVKH	TIMER_BASE,A5
	LDW	*A5[0],A2	; load CTL
	LDW	*A5[2],A4	; load CTN
	NOP	2
	; The two halves of the A7:A6 pair trade places between the
	; endiannesses - presumably so that 0x2c0 always ends up in the
	; first (CTL) word; confirm against the STDW byte ordering.
	.if	.BIG_ENDIAN
	MVK	0x2c0,A7	; internal clock source, don't hold, go
||	MVK	-1,A6		; maximum period
	.else
	MVK	0x2c0,A6	; internal clock source, don't hold, go
||	MVK	-1,A7		; maximum period
	.endif
  [!A2]	STDW	A7:A6,*A5[0]	; fire it up
||	BNOP	RA,5
	.endasmfunc

	; OPENSSL_cleanse: overwrite a buffer with zero bytes.
	;   A4 = start of the buffer, B4 = length in bytes.
	; B0 = length/8 selects the path:
	;  - B0 == 0 (fewer than 8 bytes): the routine returns immediately
	;    and the predicated STB pairs sitting in the five branch delay
	;    slots clear the 0-7 bytes, A4 walking the even offsets and
	;    B6 = A4+1 the odd ones.
	;  - otherwise the BDEC chain (counter B2 = B0-1) executes the
	;    single-packet cleanse_loop? once per 8 bytes using unaligned
	;    double-word stores; during this lead-in A1/B1 are zero, so the
	;    STBs are skipped.  The code after the loop clears the
	;    remaining 0-7 bytes the same way as the short path.
	; Registers written: A1-A4, B0-B2, B4-B6.
	.global	_OPENSSL_cleanse
_OPENSSL_cleanse:
	.asmfunc
	ZERO	A3:A2
||	ZERO	B2
||	SHRU	B4,3,B0		; is length >= 8
||	ADD	1,A4,B6
  [!B0]	BNOP	RA
|| [B0]	SUB	B0,1,B2
||	ZERO	A1
||	ZERO	B1
   [B2]	BDEC	cleanse_loop?,B2
||[!B0]	CMPLT	0,B4,A1
||[!B0]	CMPLT	1,B4,B1
||	ZERO	B5
   [A1]	STB	A2,*A4++[2]
|| [B1] STB	B5,*B6++[2]
|| [B2]	BDEC	cleanse_loop?,B2
||[!B0]	CMPLT	2,B4,A1
||[!B0]	CMPLT	3,B4,B1
   [A1]	STB	A2,*A4++[2]
|| [B1] STB	B5,*B6++[2]
|| [B2]	BDEC	cleanse_loop?,B2
||[!B0]	CMPLT	4,B4,A1
||[!B0]	CMPLT	5,B4,B1
   [A1]	STB	A2,*A4++[2]
|| [B1] STB	B5,*B6++[2]
|| [B2]	BDEC	cleanse_loop?,B2
||[!B0]	CMPLT	6,B4,A1
   [A1]	STB	A2,*A4++[2]
|| [B2]	BDEC	cleanse_loop?,B2

	; 8 bytes per pass; B4 keeps track of the bytes still to clear
cleanse_loop?:
	STNDW	A3:A2,*A4++
||	SUB	B4,8,B4
|| [B2]	BDEC	cleanse_loop?,B2

	; tail: the return branch is issued first, the byte stores below
	; execute in its delay slots
	MV	B4,B0		; remaining bytes
||	ADD	1,A4,B6
||	BNOP	RA
   [B0]	CMPLT	0,B0,A1
|| [B0]	CMPLT	1,B0,B1
   [A1]	STB	A2,*A4++[2]
|| [B1] STB	B5,*B6++[2]
|| [B0]	CMPLT	2,B0,A1
|| [B0]	CMPLT	3,B0,B1
   [A1]	STB	A2,*A4++[2]
|| [B1] STB	B5,*B6++[2]
|| [B0]	CMPLT	4,B0,A1
|| [B0]	CMPLT	5,B0,B1
   [A1]	STB	A2,*A4++[2]
|| [B1] STB	B5,*B6++[2]
|| [B0]	CMPLT	6,B0,A1
   [A1]	STB	A2,*A4++[2]
	.endasmfunc

	; CRYPTO_memcmp - currently assembled out by the surrounding .if 0.
	;   A4, B4 = the two buffers, A6 = number of bytes to compare.
	; Each pair of bytes is XORed and the results are ORed into A0; the
	; loop has no early exit, so every byte is read regardless of where
	; the buffers differ.  A4 is returned as 0 when A0 stayed zero (or
	; when the length is zero) and as 1 otherwise.
	.if	0
	.global	_CRYPTO_memcmp
_CRYPTO_memcmp:
	.asmfunc
	MV	A6,B0
  [!B0]	BNOP	RA
||[!B0]	ZERO	A4
|| [B0]	ZERO	A1:A0
   [B0]	LDBU	*A4++,A5
|| [B0]	LDBU	*B4++,B5
|| [B0]	BDEC	memcmp_loop?,B0
   [B0]	LDBU	*A4++,A5
|| [B0]	LDBU	*B4++,B5
|| [B0]	BDEC	memcmp_loop?,B0
   [B0]	LDBU	*A4++,A5
|| [B0]	LDBU	*B4++,B5
|| [B0]	BDEC	memcmp_loop?,B0
   [B0]	LDBU	*A4++,A5
|| [B0]	LDBU	*B4++,B5
|| [B0]	BDEC	memcmp_loop?,B0
   [B0]	LDBU	*A4++,A5
|| [B0]	LDBU	*B4++,B5
|| [B0]	BDEC	memcmp_loop?,B0
	XOR	A5,B5,A1
|| [B0]	LDBU	*A4++,A5
|| [B0]	LDBU	*B4++,B5
|| [B0]	BDEC	memcmp_loop?,B0

	; steady state: one byte pair per cycle
memcmp_loop?:
	OR	A1,A0,A0
||	XOR	A5,B5,A1
|| [B0]	LDBU	*A4++,A5
|| [B0]	LDBU	*B4++,B5
|| [B0]	BDEC	memcmp_loop?,B0

	; collapse the accumulated difference into 0 or 1
	BNOP	RA,3
	ZERO	A4
  [A0]	MVK	1,A4
	.endasmfunc
	.endif

	; OPENSSL_atomic_add: add B4 to the word at *A4, store the sum back
	; and return it in A4.  B5 is used as scratch.
	;
	; Neither branch changes the flow of control: each one targets the
	; packet that is reached anyway once its five delay slots are over.
	; They are there only because interrupts are held off while a branch
	; is in flight (see the original comments below).
	;
	; Cycle count from the first packet (cycle 0):
	;   1    NOP
	;   2-5  BNOP RA,3 (the branch itself plus three NOP cycles)
	;   6    ADD        <- target of the first branch
	;   7    STW || MV
	;   8    return
	; The first branch used to target the STW packet.  As its five delay
	; slots are used up by cycles 1-5, that made it skip the ADD, so the
	; unmodified value was stored and returned.  Its label now sits on
	; the ADD.
	.global	_OPENSSL_atomic_add
_OPENSSL_atomic_add:
	.asmfunc
	BNOP	atomic_update?	; pre-C64x+ systems are uni-processor, it's
||	LDW	*A4,B5		; enough to hold interrupts off through
				; the load-update-store cycle to achieve
				; atomicity
	NOP
	BNOP	RA,3		; and this branch stretches even over store
atomic_update?:
	ADD	B4,B5,B5
	STW	B5,*A4
||	MV	B5,A4
	.endasmfunc

	; OPENSSL_wipe_cpu: zero most of the register file.
	; Cleared: A0-A9, B0-B2, B4-B9, A16-A31 and B16-B31.
	; Left untouched: B3 (RA), A10-A15 and B10-B15.
	; A4 is one of the cleared registers, so it holds zero on return.
	; The return branch is issued in the second packet; the five packets
	; that follow it execute in its delay slots.
	.global	_OPENSSL_wipe_cpu
_OPENSSL_wipe_cpu:
	.asmfunc
	ZERO	A0
||	ZERO	B0
||	ZERO	A1
||	ZERO	B1
	ZERO	A3:A2
||	MVD	B0,B2
||	ZERO	A4
||	ZERO	B4
||	ZERO	A5
||	ZERO	B5
||	BNOP	RA
	ZERO	A7:A6
||	ZERO	B7:B6
||	ZERO	A8
||	ZERO	B8
||	ZERO	A9
||	ZERO	B9
	ZERO	A17:A16
||	ZERO	B17:B16
||	ZERO	A18
||	ZERO	B18
||	ZERO	A19
||	ZERO	B19
	ZERO	A21:A20
||	ZERO	B21:B20
||	ZERO	A22
||	ZERO	B22
||	ZERO	A23
||	ZERO	B23
	ZERO	A25:A24
||	ZERO	B25:B24
||	ZERO	A26
||	ZERO	B26
||	ZERO	A27
||	ZERO	B27
	ZERO	A29:A28
||	ZERO	B29:B28
||	ZERO	A30
||	ZERO	B30
||	ZERO	A31
||	ZERO	B31
	.endasmfunc

; CLFLUSH CONTROL,ADDR,LEN
; Store ADDR to the word at CONTROL[0] and LEN to the word at CONTROL[1],
; then keep re-reading CONTROL[1] until it comes back as zero.
; A0 is overwritten with the value read back.
; The leading branch targets passthrough?, which is also the instruction
; reached by falling through its five delay slots.
; NOTE(review): presumably the branch is there only to keep interrupts
; off across the two stores - confirm.
CLFLUSH	.macro	CONTROL,ADDR,LEN
	B	passthrough?
||	STW	ADDR,*CONTROL[0]
	STW	LEN,*CONTROL[1]
spinlock?:
	LDW	*CONTROL[1],A0
	NOP	3
passthrough?:
	NOP
  [A0]	BNOP	spinlock?,5
	.endm

	; OPENSSL_instrument_bus: A4 = output array of words, B4 = number
	; of entries.
	; For every entry the Timer 2 count is sampled, the cache line that
	; holds the entry is written back and invalidated through L1DWIBAR
	; and L2WIBAR, and the tick difference from the previous sample is
	; added to the entry.  Before the loop the first entry is touched
	; once with a difference of zero.
	; The entry count is handed back in A4.
	; Registers written: A0 (by CLFLUSH), A1, A3-A5, B0, B4, B5,
	; B7-B9, B16.
	.global	_OPENSSL_instrument_bus
_OPENSSL_instrument_bus:
	.asmfunc
	MV	B4,B0			; reassign sizeof(output)
||	MV	A4,B4			; reassign output
||	MVK	0x00004030,A3
||	MVKL	TIMER_BASE,B16
	MV	B0,A4			; return value
||	MVK	1,A1
||	MVKH	0x01840000,A3		; L1DWIBAR
||	MVKH	TIMER_BASE,B16
	LDW	*B16[2],B8		; collect 1st tick
||	MVK	0x00004010,A5
	NOP	4
	MV	B8,B9			; lasttick = tick
||	MVK	0,B7			; lastdiff = 0
||	MVKH	0x01840000,A5		; L2WIBAR
	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
	LDW	*B4,B5
	NOP	4
	ADD	B7,B5,B5
	STW	B5,*B4
	; one pass per entry; B0 counts the entries still to be filled
bus_loop1?:
	LDW	*B16[2],B8
|| [B0]	SUB	B0,1,B0
	NOP	4
	SUB	B8,B9,B7		; lastdiff = tick - lasttick
||	MV	B8,B9			; lasttick = tick
	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
	LDW	*B4,B5
	NOP	4
	ADD	B7,B5,B5
	STW	B5,*B4			; [!B1] is removed to flatten samples
||	ADDK	4,B4
|| [B0]	BNOP	bus_loop1?,5

	BNOP	RA,5
	.endasmfunc

	; OPENSSL_instrument_bus2: A4 = output array of words, B4 = number
	; of entries, A6 = upper bound on the number of probes.
	; Same probing as OPENSSL_instrument_bus (timer sample, cache line
	; write-back/invalidate, add the tick difference to the current
	; entry), except that the output pointer and the counter returned in
	; A4 advance only when a tick difference is not equal to the
	; previous one.  Probing stops when the budget in B0 is used up or
	; when A4 reaches the entry count kept in A6.
	; Registers written: A0 (by CLFLUSH), A1-A6, B0, B2, B4, B5,
	; B7-B9, B16.
	.global	_OPENSSL_instrument_bus2
_OPENSSL_instrument_bus2:
	.asmfunc
	MV	A6,B0			; reassign max
||	MV	B4,A6			; reassign sizeof(output)
||	MVK	0x00004030,A3
||	MVKL	TIMER_BASE,B16
	MV	A4,B4			; reassign output
||	MVK	0,A4			; return value
||	MVK	1,A1
||	MVKH	0x01840000,A3		; L1DWIBAR
||	MVKH	TIMER_BASE,B16

	LDW	*B16[2],B8		; collect 1st tick
||	MVK	0x00004010,A5
	NOP	4
	MV	B8,B9			; lasttick = tick
||	MVK	0,B7			; lastdiff = 0
||	MVKH	0x01840000,A5		; L2WIBAR
	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
	LDW	*B4,B5
	NOP	4
	ADD	B7,B5,B5
	STW	B5,*B4

	LDW	*B16[2],B8		; collect 1st diff
	NOP	4
	SUB	B8,B9,B7		; lastdiff = tick - lasttick
||	MV	B8,B9			; lasttick = tick
||	SUB	B0,1,B0
bus_loop2?:
	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
	LDW	*B4,B5
	NOP	4
	ADD	B7,B5,B5
	STW	B5,*B4			; [!B1] is removed to flatten samples
||[!B0]	BNOP	bus_loop2_done?,2
||	SUB	B0,1,B0
	LDW	*B16[2],B8
	NOP	4
	SUB	B8,B9,B8
||	MV	B8,B9
	; B2 = 1 when the new difference equals the previous one
	CMPEQ	B8,B7,B2
||	MV	B8,B7
  [!B2]	ADDAW	B4,1,B4
||[!B2]	ADDK	1,A4
	CMPEQ	A4,A6,A2
  [!A2]	BNOP	bus_loop2?,5

bus_loop2_done?:
	BNOP	RA,5
	.endasmfunc

	; Emit the address of OPENSSL_rdtsc into the initialisation table:
	; section .init_array for EABI builds, .pinit otherwise.
	; NOTE(review): presumably the start-up code calls every entry of
	; that table once, which is what starts the timer - confirm.
	.if	__TI_EABI__
	.sect	".init_array"
	.else
	.sect	".pinit"
	.endif
	.align	4
	.long	_OPENSSL_rdtsc		; auto-start timer
___

# Write the generated assembly out.  The close is checked because that is
# where buffered output gets flushed: a full disk or a broken pipe must
# fail the build rather than leave a truncated file behind.
print $code;
close STDOUT or die "error closing STDOUT: $!";
+230 −0
Original line number Diff line number Diff line
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA1 for C64x.
#
# November 2016
#
# This is a fully-unrolled SHA1 implementation. It's 25% faster than
# the one with compact loops, doesn't use an in-memory ring buffer, as
# everything is accommodated in registers, and has "perfect" interrupt
# agility. The drawback is obviously the code size...

# Consume command-line arguments until one looks like a file name
# ("name.ext"); that argument becomes the output file.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
# Redirect STDOUT to the output file.  The three-argument form keeps the
# name from being parsed for mode characters, and a failed open now aborts
# instead of silently discarding the generated code.  When no output name
# was given, STDOUT is left as it is.
if (defined($output) && $output ne "") {
    open(STDOUT,">",$output) || die "can't open $output: $!";
}

# Register allocation.  Every variable below holds a register name that is
# interpolated into the assembly templates.
($CTX,$INP,$NUM) = ("A4","B4","A6");		# arguments

# A16-A20 carry the working variables A..E; A21-A24 are per-round
# temporaries: rotated A, the two partial results of F, and the round
# constant K.
($A,$B,$C,$D,$E, $Arot,$F,$F0,$K) = map("A$_",(16..20, 21..24));
# @V is rotated by the round drivers after every round, so each round
# template receives the registers in their current roles and no data has
# to be moved between them.
@V = ($A,$B,$C,$D,$E);
# B16-B31 hold the sixteen message-schedule words.
@X = map("B$_",(16..31));
# Copies of the state as it was at the start of a block, added back in
# round 79.  $Bctx is A6, i.e. the register $NUM arrives in.
($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9));	# zaps $NUM

# BODY_00_19(round, a, b, c, d, e)
#
# Appends the code for one of rounds 0..19 to $code.  a..e are the names of
# the registers currently playing the roles of the five working variables.
# The round computes e += K + X[round] + ROL(a,5) + F(b,c,d) and rotates b
# by 30; F is built as (c AND b) OR (d ANDN b).  Interleaved with it is the
# preparation of the next schedule word:
#   rounds 0..13  load one more input word and byte-swap the next one,
#   round  14     only byte-swaps the next word (all 16 have been loaded),
#   rounds 15+    update ring slot (round+1)&15 by XOR-ing in the slots at
#                 +2, +8 and +13 and rotating left by 1; the first XOR is
#                 emitted as a continuation of the previous round's packet.
sub BODY_00_19 {
my ($rnd,$va,$vb,$vc,$vd,$ve) = @_;
my $nxt = ($rnd+1)&15;

    if ($rnd<14) {
	$code.=<<___;
	ROTL	$va,5,$Arot		;; $rnd
||	AND	$vc,$vb,$F
||	ANDN	$vd,$vb,$F0
||	ADD	$K,$ve,$ve		; E+=K
||	 LDNW	*${INP}++,@X[$rnd+2]
	OR	$F0,$F,$F		; F_00_19(B,C,D)
||	ROTL	$vb,30,$vb
||	 SWAP2	@X[$rnd+1],@X[$rnd+1]
||	ADD	@X[$rnd],$ve,$ve		; E+=X[i]
	ADD	$Arot,$ve,$ve		; E+=rot(A,5)
||	 SWAP4	@X[$rnd+1],@X[$rnd+1]
	ADD	$F,$ve,$ve		; E+=F_00_19(B,C,D)
___
    }
    elsif ($rnd==14) {
	$code.=<<___;
	ROTL	$va,5,$Arot		;; $rnd
||	AND	$vc,$vb,$F
||	ANDN	$vd,$vb,$F0
||	ADD	$K,$ve,$ve		; E+=K
	OR	$F0,$F,$F		; F_00_19(B,C,D)
||	ROTL	$vb,30,$vb
||	ADD	@X[$rnd],$ve,$ve		; E+=X[i]
||	 SWAP2	@X[$rnd+1],@X[$rnd+1]
	ADD	$Arot,$ve,$ve		; E+=rot(A,5)
||	 SWAP4	@X[$rnd+1],@X[$rnd+1]
	ADD	$F,$ve,$ve		; E+=F_00_19(B,C,D)
___
    }
    elsif ($rnd>=15) {
	# Round 15 indexes the ring with the same value whether or not it
	# is masked, so one template serves rounds 15 and up.
	$code.=<<___;
||	 XOR	@X[($nxt+2)&15],@X[$nxt],@X[$nxt]
	ROTL	$va,5,$Arot		;; $rnd
||	AND	$vc,$vb,$F
||	ANDN	$vd,$vb,$F0
||	ADD	$K,$ve,$ve		; E+=K
||	 XOR	@X[($nxt+8)&15],@X[$nxt],@X[$nxt]
	OR	$F0,$F,$F		; F_00_19(B,C,D)
||	ROTL	$vb,30,$vb
||	ADD	@X[$rnd&15],$ve,$ve		; E+=X[i]
||	 XOR	@X[($nxt+13)&15],@X[$nxt],@X[$nxt]
	ADD	$Arot,$ve,$ve		; E+=rot(A,5)
||	 ROTL	@X[$nxt],1,@X[$nxt]
	ADD	$F,$ve,$ve		; E+=F_00_19(B,C,D)
___
    }
}

# BODY_20_39(round, a, b, c, d, e)
#
# Appends the code for one "parity" round (F = b XOR c XOR d) to $code; the
# drivers use it for rounds 20..39 and 60..79.  a..e are the names of the
# registers currently playing the roles of the five working variables.
# Rounds below 79 also update ring slot (round+1)&15 of the message
# schedule.  Round 79 instead issues the conditional branch back to loop?,
# pre-fetches the first two words of the next block when A0 is non-zero,
# and adds the saved context to the working variables in the delay slots of
# that branch.
sub BODY_20_39 {
my ($rnd,$va,$vb,$vc,$vd,$ve) = @_;
my $nxt = ($rnd+1)&15;

    if ($rnd<79) {
	$code.=<<___;
||	 XOR	@X[($nxt+2)&15],@X[$nxt],@X[$nxt]
	ROTL	$va,5,$Arot		;; $rnd
||	XOR	$vc,$vb,$F
||	ADD	$K,$ve,$ve		; E+=K
||	 XOR	@X[($nxt+8)&15],@X[$nxt],@X[$nxt]
	XOR	$vd,$F,$F		; F_20_39(B,C,D)
||	ROTL	$vb,30,$vb
||	ADD	@X[$rnd&15],$ve,$ve		; E+=X[i]
||	 XOR	@X[($nxt+13)&15],@X[$nxt],@X[$nxt]
	ADD	$Arot,$ve,$ve		; E+=rot(A,5)
||	 ROTL	@X[$nxt],1,@X[$nxt]
	ADD	$F,$ve,$ve		; E+=F_20_39(B,C,D)
___
    }
    elsif ($rnd==79) {
	$code.=<<___;
|| [A0]	B	loop?
|| [A0]	LDNW	*${INP}++,@X[0]		; pre-fetch input
	ROTL	$va,5,$Arot		;; $rnd
||	XOR	$vc,$vb,$F
||	ADD	$K,$ve,$ve		; E+=K
|| [A0]	LDNW	*${INP}++,@X[1]
	XOR	$vd,$F,$F		; F_20_39(B,C,D)
||	ROTL	$vb,30,$vb
||	ADD	@X[$rnd&15],$ve,$ve		; E+=X[i]
	ADD	$Arot,$ve,$ve		; E+=rot(A,5)
	ADD	$F,$ve,$ve		; E+=F_20_39(B,C,D)
||	ADD	$Bctx,$va,$va		; accumulate context
||	ADD	$Cctx,$vb,$vb
	ADD	$Dctx,$vc,$vc
||	ADD	$Ectx,$vd,$vd
||	ADD	$Actx,$ve,$ve
;;===== branch to loop? is taken here
___
    }
}

# BODY_40_59(round, a, b, c, d, e)
#
# Appends the code for one of rounds 40..59 to $code.  a..e are the names
# of the registers currently playing the roles of the five working
# variables.  F is the majority function, built as
# (c AND b) XOR (d AND b) XOR (c AND d).  As in the other round templates,
# ring slot (round+1)&15 of the message schedule is updated alongside, and
# the first XOR of that update is emitted as a continuation of the previous
# round's last packet.
#
# The comment emitted on the final ADD used to read F_20_39; it now names
# the function this template actually computes.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e) = @_;
my $j = ($i+1)&15;

$code.=<<___;
||	 XOR	@X[($j+2)&15],@X[$j],@X[$j]
	ROTL	$a,5,$Arot		;; $i
||	AND	$c,$b,$F
||	AND	$d,$b,$F0
||	ADD	$K,$e,$e		; E+=K
||	 XOR	@X[($j+8)&15],@X[$j],@X[$j]
	XOR	$F0,$F,$F
||	AND	$c,$d,$F0
||	ROTL	$b,30,$b
||	 XOR	@X[($j+13)&15],@X[$j],@X[$j]
||	ADD	@X[$i&15],$e,$e		; E+=X[i]
	XOR	$F0,$F,$F		; F_40_59(B,C,D)
||	ADD	$Arot,$e,$e		; E+=rot(A,5)
||	 ROTL	@X[$j],1,@X[$j]
	ADD	$F,$e,$e		; E+=F_40_59(B,C,D)
___
}

$code=<<___;
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.asg	sha1_block_data_order,_sha1_block_data_order
	.endif

	.asg	B3,RA
	.asg	A15,FP
	.asg	B15,SP

	; Input words are byte-swapped with a SWAP2 + SWAP4 pair.  On
	; big-endian targets both mnemonics are redefined as plain MV, so
	; the same templates leave the words as loaded.
	.if	.BIG_ENDIAN
	.asg	MV,SWAP2
	.asg	MV,SWAP4
	.endif

	; sha1_block_data_order: first argument = five-word hash state,
	; second = input data, third = number of 64-byte blocks.
	; The block count is moved to A0 straight away because the
	; register it arrives in is reused for a context copy.
	; For a zero count the function returns at once; the loads that
	; fill the delay slots of that branch are all predicated on A0.
	.global	_sha1_block_data_order
_sha1_block_data_order:
	.asmfunc
	MV	$NUM,A0			; reassign $NUM
  [!A0]	BNOP	RA			; if ($NUM==0) return;
|| [A0]	LDW	*${CTX}[0],$A		; load A-E...
   [A0]	LDW	*${CTX}[1],$B
   [A0]	LDW	*${CTX}[2],$C
   [A0]	LDW	*${CTX}[3],$D
   [A0]	LDW	*${CTX}[4],$E
   [A0]	LDNW	*${INP}++,@X[0]		; pre-fetch input
   [A0]	LDNW	*${INP}++,@X[1]
	NOP	3

	; Start of one block: count it off in A0, snapshot the state for
	; the final accumulation, load the first round constant and
	; byte-swap the first input word.  The last packet here is
	; continued by nothing; round 0 starts a new packet.
loop?:
	SUB	A0,1,A0
||	MV	$A,$Actx
||	MVD	$B,$Bctx
||	SWAP2	@X[0],@X[0]
||	MVKL	0x5a827999,$K
	MVKH	0x5a827999,$K		; K_00_19
||	MV	$C,$Cctx
||	MV	$D,$Dctx
||	MVD	$E,$Ectx
||	SWAP4	@X[0],@X[0]
___
# Emit the 80 unrolled rounds.  $i runs on from one loop to the next, and
# @V is rotated right by one after every round so that the register that
# just received the new value takes over the role of A.
# Rounds 0..19.
for ($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
# Load the next round constant.  The first line of each of these snippets
# starts with "||", i.e. it joins the last packet of the preceding round.
$code.=<<___;
||	MVKL	0x6ed9eba1,$K
	MVKH	0x6ed9eba1,$K		; K_20_39
___
# Rounds 20..39.
for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
||	MVKL	0x8f1bbcdc,$K
	MVKH	0x8f1bbcdc,$K		; K_40_59
___
# Rounds 40..59.
for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
||	MVKL	0xca62c1d6,$K
	MVKH	0xca62c1d6,$K		; K_60_79
___
# Rounds 60..79 reuse the 20..39 template; round 79 closes the loop.
for (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	; Reached by falling out of the block loop once A0 is zero.
	; The five stores of the updated state fill the five delay slots of
	; the return branch.
	BNOP	RA			; return
	STW	$A,*${CTX}[0]		; emit A-E...
	STW	$B,*${CTX}[1]
	STW	$C,*${CTX}[2]
	STW	$D,*${CTX}[3]
	STW	$E,*${CTX}[4]
	.endasmfunc

	.sect	.const
	.cstring "SHA1 block transform for C64x, CRYPTOGAMS by <appro\@openssl.org>"
	.align	4
___

# Write the generated assembly out.  The close is checked because that is
# where buffered output gets flushed: a full disk or a broken pipe must
# fail the build rather than leave a truncated file behind.
print $code;
close STDOUT or die "error closing STDOUT: $!";
+330 −0
Original line number Diff line number Diff line
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA1 for C64x.
#
# November 2016
#
# If compared to compiler-generated code with similar characteristics,
# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
# this implementation is 25% smaller and >2x faster. In absolute terms
# performance is (quite impressive) ~6.5 cycles per processed byte.
# Compared to its predecessor, the sha1-c64xplus module, this module has
# worse interrupt agility. While the original added up to 5 cycles of delay
# to the interrupt response, this module adds up to 100. The fully unrolled
# implementation doesn't add any delay and is even 25% faster, but is
# almost 5x larger...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and for own well-being
# zero it upon entry.

# Consume command-line arguments until one looks like a file name
# ("name.ext"); that argument becomes the output file.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
# Redirect STDOUT to the output file.  The three-argument form keeps the
# name from being parsed for mode characters, and a failed open now aborts
# instead of silently discarding the generated code.  When no output name
# was given, STDOUT is left as it is.
if (defined($output) && $output ne "") {
    open(STDOUT,">",$output) || die "can't open $output: $!";
}

# Register allocation.  Every variable below holds a register name that is
# interpolated into the assembly templates.
($CTX,$INP,$NUM) = ("A4","B4","A6");		# arguments

# A16-A20: working variables A..E.  A21-A25: rotated A, the two partial
# results of F, the running sum T and the round constant K.
($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25));
# Operands of the message-schedule update, named after the ring offsets
# they are fetched from.
($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27");
# B28-B31: temporaries for the byte swap and the schedule update.
($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31));
($XPA,$XPB) = ("A5","B5");			# X circular buffer
# Copies of the state as it was at the start of a block.  $Bctx is A6, i.e.
# the register $NUM arrives in.
($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9));	# zaps $NUM

$code=<<___;
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.asg	sha1_block_data_order,_sha1_block_data_order
	.endif

	.asg	B3,RA
	.asg	A15,FP
	.asg	B15,SP

	; Input words are byte-swapped with a SWAP2 + SWAP4 pair.  On
	; big-endian targets both mnemonics are redefined as plain MV.
	.if	.BIG_ENDIAN
	.asg	MV,SWAP2
	.asg	MV,SWAP4
	.endif

	; sha1_block_data_order: first argument = five-word hash state,
	; second = input data, third = number of 64-byte blocks.
	; Entry sequence: copy the block count to A0 and return at once if
	; it is zero (everything in the delay slots of that branch is
	; predicated on A0).  Otherwise save FP on the stack, keep the old
	; SP in FP, carve out a 64-byte-aligned area below it for the X
	; ring buffer, and program AMR for circular addressing through the
	; two ring pointers.
	.global	_sha1_block_data_order
_sha1_block_data_order:
	.asmfunc stack_usage(64)
	MV	$NUM,A0			; reassign $NUM
||	MVK	-64,B0
  [!A0]	BNOP	RA			; if ($NUM==0) return;
|| [A0]	STW	FP,*SP--[16]		; save frame pointer and alloca(64)
|| [A0]	MV	SP,FP
   [A0]	LDW	*${CTX}[0],$A		; load A-E...
|| [A0]	AND	B0,SP,SP		; align stack at 64 bytes
   [A0]	LDW	*${CTX}[1],$B
|| [A0]	SUBAW	SP,2,SP			; reserve two words above buffer
   [A0]	LDW	*${CTX}[2],$C
|| [A0]	MVK	0x00404,B0
   [A0]	LDW	*${CTX}[3],$D
|| [A0]	MVKH	0x50000,B0		; 0x050404, 64 bytes for $XP[AB]
   [A0]	LDW	*${CTX}[4],$E
|| [A0]	MVC	B0,AMR			; setup circular addressing
	LDNW	*${INP}++,$TX1		; pre-fetch input
	NOP	1
;; Start of one block: load K for rounds 0-19, point the B-side ring
;; pointer two words above SP (the start of the X buffer), count the
;; block off in A0 and snapshot the state for the final accumulation.
loop?:
	MVKL	0x5a827999,$K
||	ADDAW	SP,2,$XPB
||	SUB	A0,1,A0
	MVKH	0x5a827999,$K		; K_00_19
||	MV	$A,$Actx
||	MV	$B,$Bctx
;;==================================================
	B	body_00_13?		; BODY_00_13
||	MVK	11,B0
||	MV	$XPB,$XPA
||	MV	$C,$Cctx
||	MV	$D,$Dctx
||	MVD	$E,$Ectx

;; Rounds 0-13.  One pass is five packets, i.e. exactly the delay slots
;; of the branch issued one pass earlier: the first pass runs in the
;; delay slots of the B above and every later pass in those of the
;; preceding BDEC, so B0 = 11 results in 14 passes.
;; The word fetched by LDNW in one pass is byte-swapped and consumed in
;; the next one; every Xi is also stored into the ring buffer.
body_00_13?:
	ROTL	$A,5,$Arot
||	AND	$C,$B,$F
||	ANDN	$D,$B,$F0
||	ADD	$K,$E,$T		; T=E+K

	XOR	$F0,$F,$F		; F_00_19(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C
||	SWAP2	$TX1,$TX2
||	LDNW	*${INP}++,$TX1

	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	SWAP4	$TX2,$TX3		; byte swap

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A

	ADD	$TX3,$T,$A		; A=T+Xi
||	STW	$TX3,*${XPB}++
||	BDEC	body_00_13?,B0
;;==================================================
	;; Round 14, straight-line.  It issues the last input load of the
	;; block and starts fetching schedule operands from the ring buffer
	;; for the rounds that follow.  The offset-13 operand is not loaded
	;; from memory: it is taken from the register that still holds the
	;; previously stored word (the load it replaces is kept as a
	;; comment on that line).
	ROTL	$A,5,$Arot		; BODY_14
||	AND	$C,$B,$F
||	ANDN	$D,$B,$F0
||	ADD	$K,$E,$T		; T=E+K

	XOR	$F0,$F,$F		; F_00_19(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C
||	SWAP2	$TX1,$TX2
||	LDNW	*${INP}++,$TX1

	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	SWAP4	$TX2,$TX2		; byte swap
||	LDW	*${XPA}++,$X0		; fetches from X ring buffer are
||	LDW	*${XPB}[4],$X2		; 2 iterations ahead

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A		; A=T+Xi
||	STW	$TX2,*${XPB}++
	;; Round 15, straight-line.  It consumes the last input word (no
	;; further LDNW) and performs the first schedule XORs, whose result
	;; is rotated and used by the next round.  Its final packet also
	;; issues the branch that drives the loop for rounds 16-19.
	ROTL	$A,5,$Arot		; BODY_15
||	AND	$C,$B,$F
||	ANDN	$D,$B,$F0
||	ADD	$K,$E,$T		; T=E+K

	XOR	$F0,$F,$F		; F_00_19(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C
||	SWAP2	$TX1,$TX2

	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	SWAP4	$TX2,$TX2		; byte swap
||	XOR	$X0,$X2,$TX0		; Xupdate XORs are 1 iteration ahead
||	LDW	*${XPA}++,$X0
||	LDW	*${XPB}[4],$X2

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A
||	XOR	$X8,$X13,$TX1
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A		; A=T+Xi
||	STW	$TX2,*${XPB}++
||	XOR	$TX0,$TX1,$TX1
;;==================================================
||	B	body_16_19?		; BODY_16_19
||	MVK	1,B0

;; Rounds 16-19: same round function as rounds 0-15, but Xi now comes from
;; the schedule update (rotate of the XOR prepared in the previous pass).
;; Branch-ahead loop as before; B0 = 1 results in 4 passes.
;; The two packets after the loop load K for rounds 20-39 and the pass
;; counter of the next loop; the generator attaches that loop's opening
;; branch to the second of them.
body_16_19?:
	ROTL	$A,5,$Arot
||	AND	$C,$B,$F
||	ANDN	$D,$B,$F0
||	ADD	$K,$E,$T		; T=E+K
||	ROTL	$TX1,1,$TX2		; Xupdate output

	XOR	$F0,$F,$F		; F_00_19(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C

	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	XOR	$X0,$X2,$TX0
||	LDW	*${XPA}++,$X0
||	LDW	*${XPB}[4],$X2

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A
||	XOR	$X8,$X13,$TX1
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A		; A=T+Xi
||	STW	$TX2,*${XPB}++
||	XOR	$TX0,$TX1,$TX1
||	BDEC	body_16_19?,B0

	MVKL	0x6ed9eba1,$K
||	MVK	17,B0
	MVKH	0x6ed9eba1,$K		; K_20_39
___
# BODY_20_39(label)
#
# Appends a branch-ahead loop of "parity" rounds (F = B XOR C XOR D) to
# $code, using the given local label as the loop entry.  The opening
# "|| B" line joins the packet emitted just before the call, so the caller
# must have ended its snippet with the packet that sets up K; the number of
# passes is governed by the value the caller has put in B0.
sub BODY_20_39 {
my ($entry) = @_;

    $code .= <<___;
;;==================================================
||	B	${entry}			; BODY_20_39

${entry}:
	ROTL	$A,5,$Arot
||	XOR	$B,$C,$F
||	ADD	$K,$E,$T		; T=E+K
||	ROTL	$TX1,1,$TX2		; Xupdate output

	XOR	$D,$F,$F		; F_20_39(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C

	ADD	$F,$T,$T		; T+=F_20_39(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	XOR	$X0,$X2,$TX0
||	LDW	*${XPA}++,$X0
||	LDW	*${XPB}[4],$X2

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A
||	XOR	$X8,$X13,$TX1
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A		; A=T+Xi
||	STW	$TX2,*${XPB}++		; last one is redundant
||	XOR	$TX0,$TX1,$TX1
||	BDEC	${entry},B0
___
}

# Rounds 20..39.
&BODY_20_39("body_20_39?");
$code.=<<___;
;;==================================================
;; Rounds 40-59 use the majority function, built as
;; (B AND C) XOR (B AND D) XOR (C AND D).  The first two ANDs are computed
;; one packet ahead: once here before the loop is entered, and afterwards
;; in the last packet of every pass for the pass that follows.
;; Branch-ahead loop as before; B0 = 17 results in 20 passes.
	MVKL	0x8f1bbcdc,$K
||	MVK	17,B0
	MVKH	0x8f1bbcdc,$K		; K_40_59
||	B	body_40_59?		; BODY_40_59
||	AND	$B,$C,$F
||	AND	$B,$D,$F0

body_40_59?:
	ROTL	$A,5,$Arot
||	XOR	$F0,$F,$F
||	AND	$C,$D,$F0
||	ADD	$K,$E,$T		; T=E+K
||	ROTL	$TX1,1,$TX2		; Xupdate output

	XOR	$F0,$F,$F		; F_40_59(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C

	ADD	$F,$T,$T		; T+=F_40_59(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	XOR	$X0,$X2,$TX0
||	LDW	*${XPA}++,$X0
||	LDW	*${XPB}[4],$X2

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A
||	XOR	$X8,$X13,$TX1
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A		; A=T+Xi
||	STW	$TX2,*${XPB}++
||	XOR	$TX0,$TX1,$TX1
||	AND	$B,$C,$F
||	AND	$B,$D,$F0
||	BDEC	body_40_59?,B0

	MVKL	0xca62c1d6,$K
||	MVK	16,B0
	MVKH	0xca62c1d6,$K		; K_60_79
___
	&BODY_20_39("body_60_78?");	# BODY_60_78
$code.=<<___;
;;==================================================
;; Round 79, merged with the end-of-block work.  The conditional branch
;; back to the block loop is issued in the first packet; the remaining five
;; packets are its delay slots.  They finish the round while adding the
;; state saved at the start of the block, and pre-fetch the first word of
;; the next block when there is one.
   [A0]	B	loop?
||	ROTL	$A,5,$Arot		; BODY_79
||	XOR	$B,$C,$F
||	ROTL	$TX1,1,$TX2		; Xupdate output

   [A0]	LDNW	*${INP}++,$TX1		; pre-fetch input
||	ADD	$K,$E,$T		; T=E+K
||	XOR	$D,$F,$F		; F_20_39(B,C,D)

	ADD	$F,$T,$T		; T+=F_20_39(B,C,D)
||	ADD	$Ectx,$D,$E		; E=D,E+=Ectx
||	ADD	$Dctx,$C,$D		; D=C,D+=Dctx
||	ROTL	$B,30,$C		; C=ROL(B,30)

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	ADD	$Bctx,$A,$B		; B=A,B+=Bctx

	ADD	$TX2,$T,$A		; A=T+Xi

	ADD	$Actx,$A,$A		; A+=Actx
||	ADD	$Cctx,$C,$C		; C+=Cctx
;; end of loop?

;; Function exit, reached once A0 is zero: put the stack pointer back,
;; reload the frame pointer from where the entry code saved it, write the
;; state out in the delay slots of the return branch and clear AMR again.
	BNOP	RA			; return
||	MV	FP,SP			; restore stack pointer
||	LDW	*FP[0],FP		; restore frame pointer
	STW	$A,*${CTX}[0]		; emit A-E...
||	MVK	0,B0
	STW	$B,*${CTX}[1]
||	MVC	B0,AMR			; clear AMR
	STW	$C,*${CTX}[2]
	STW	$D,*${CTX}[3]
	STW	$E,*${CTX}[4]
	.endasmfunc

	.sect	.const
	.cstring "SHA1 block transform for C64x, CRYPTOGAMS by <appro\@openssl.org>"
	.align	4
___

# Write the generated assembly out.  The close is checked because that is
# where buffered output gets flushed: a full disk or a broken pipe must
# fail the build rather than leave a truncated file behind.
print $code;
close STDOUT or die "error closing STDOUT: $!";
+313 −0

File added.

Preview size limit exceeded, changes collapsed.

Loading