Commit 3e181369 authored by Andy Polyakov's avatar Andy Polyakov
Browse files

C64x+ assembler pack. The linux-c64xplus build is *not* tested, nor can it be
tested, because the kernel is not in shape to handle it *yet*. The code is
committed mostly to stimulate the kernel development.
parent d3ddf022
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -399,6 +399,9 @@ my %table=(
"linux-alpha+bwx-gcc","gcc:-O3 -DL_ENDIAN -DTERMIO::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-alpha-ccc","ccc:-fast -readonly_strings -DL_ENDIAN -DTERMIO::-D_REENTRANT:::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL:${alpha_asm}",
"linux-alpha+bwx-ccc","ccc:-fast -readonly_strings -DL_ENDIAN -DTERMIO::-D_REENTRANT:::SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL:${alpha_asm}",
#
# TI_CGT_C6000_7.3.x is a requirement
"linux-c64xplus","cl6x:--linux --strip_coff_underscore -ea=.s -eo=.o -mv6400+ -o2 -ox -ms -pden -DOPENSSL_SMALL_FOOTPRINT::-D_REENTRANT:::BN_LLONG:c64xpluscpuid.o:bn-c64xplus.o c64xplus-gf2m.o::aes-c64xplus.o aes_cbc.o aes_ctr.o:::sha1-c64xplus.o sha256-c64xplus.o sha512-c64xplus.o:::::::ghash-c64xplus.o::void:dlfcn:linux-shared:--pic:-z --sysv --shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):true",

# Android: linux-* but without -DTERMIO and pointers to headers and libs.
"android","gcc:-mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+33 −0
Original line number Diff line number Diff line
@@ -3927,6 +3927,39 @@ $ranlib =
$arflags      = 
$multilib     = 

*** linux-c64xplus
$cc           = cl6x
$cflags       = --linux --strip_coff_underscore -ea=.s -eo=.o -mv6400+ -o2 -ox -ms -pden -DOPENSSL_SMALL_FOOTPRINT
$unistd       = 
$thread_cflag = -D_REENTRANT
$sys_id       = 
$lflags       = 
$bn_ops       = BN_LLONG
$cpuid_obj    = c64xpluscpuid.o
$bn_obj       = bn-c64xplus.o c64xplus-gf2m.o
$des_obj      = 
$aes_obj      = aes-c64xplus.o aes_cbc.o aes_ctr.o
$bf_obj       = 
$md5_obj      = 
$sha1_obj     = sha1-c64xplus.o sha256-c64xplus.o sha512-c64xplus.o
$cast_obj     = 
$rc4_obj      = 
$rmd160_obj   = 
$rc5_obj      = 
$wp_obj       = 
$cmll_obj     = 
$modes_obj    = ghash-c64xplus.o
$engines_obj  = 
$perlasm_scheme = void
$dso_scheme   = dlfcn
$shared_target= linux-shared
$shared_cflag = --pic
$shared_ldflag = -z --sysv --shared
$shared_extension = .so.$(SHLIB_MAJOR).$(SHLIB_MINOR)
$ranlib       = true
$arflags      = 
$multilib     = 

*** linux-elf
$cc           = gcc
$cflags       = -DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall
+1361 −0

File added.

Preview size limit exceeded, changes collapsed.

+333 −0
Original line number Diff line number Diff line
;;====================================================================
;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
;; project.
;;
;; Rights for redistribution and usage in source and binary forms are
;; granted according to the OpenSSL license. Warranty of any kind is
;; disclaimed.
;;====================================================================
;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
;;====================================================================
	.text

;; Symbolic register names used by the routines in this file.
;; RA is the register the routines branch to in order to return,
;; ARG0..ARG5 carry the arguments in order, and RET carries the return
;; value (note that RET and ARG0 name the same register, A4).
;; FP, DP and SP are defined here but not referenced by the code below.
	.asg	B3,RA
	.asg	A4,ARG0
	.asg	B4,ARG1
	.asg	A6,ARG2
	.asg	B6,ARG3
	.asg	A8,ARG4
	.asg	B8,ARG5
	.asg	A4,RET
	.asg	A15,FP
	.asg	B14,DP
	.asg	B15,SP

	.global	_bn_mul_add_words
;;--------------------------------------------------------------------
;; rp[i] += ap[i]*w for every i in [0,num), with the carry propagated
;; from word to word.
;;   ARG0 = rp, ARG1 = ap, ARG2 = num (count of 32-bit words), ARG3 = w
;;   RET  = carry word left after the last iteration, 0 when num is 0
;;--------------------------------------------------------------------
_bn_mul_add_words:
	.asmfunc
	MV	ARG2,B0		; B0 = num, doubles as predicate below
  [!B0]	BNOP	RA		; num==0: return right away...
||[!B0]	MVK	0,RET		; ...with zero result
   [B0]	MVC	B0,ILC		; loop count for the SPLOOP
   [B0]	ZERO	A19		; high part of accumulator
|| [B0]	MV	ARG0,A2		; A2 = rp store pointer (ARG0 is the load pointer)
|| [B0]	MV	ARG3,A3		; A3 = w
	NOP	3		; with the two packets above pads the delay
				; slots of the early-return branch

	SPLOOP	2		; 2*n+10
;;====================================================================
	LDW	*ARG1++,B7	; ap[i]
	NOP	3
	LDW	*ARG0++,A7	; rp[i]
	MPY32U	B7,A3,A17:A16	; 64-bit product ap[i]*w
	NOP	3		; [2,0] in epilogue
	ADDU	A16,A7,A21:A20	; low word of product + rp[i]
	ADDU	A19,A21:A20,A19:A18	; + carry from previous word
||	MV.S	A17,A23		; keep high word of product
	SPKERNEL 2,1		; leave slot for "return value"
||	STW	A18,*A2++	; rp[i]
||	ADD	A19,A23,A19	; carry into next word
;;====================================================================
	BNOP	RA,4
	MV	A19,RET		; return value
	.endasmfunc

	.global	_bn_mul_words
;;--------------------------------------------------------------------
;; rp[i] = low word of (ap[i]*w + carry) for every i in [0,num), the
;; high word becoming the carry into the next position.
;;   ARG0 = rp, ARG1 = ap, ARG2 = num (count of 32-bit words), ARG3 = w
;;   RET  = carry word left after the last iteration, 0 when num is 0
;;--------------------------------------------------------------------
_bn_mul_words:
	.asmfunc
	MV	ARG2,B0		; B0 = num, doubles as predicate below
  [!B0]	BNOP	RA		; num==0: return right away...
||[!B0]	MVK	0,RET		; ...with zero result
   [B0]	MVC	B0,ILC		; loop count for the SPLOOP
   [B0]	ZERO	A19		; high part of accumulator
	NOP	3

	SPLOOP	2		; 2*n+10
;;====================================================================
	LDW	*ARG1++,A7	; ap[i]
	NOP	4
	MPY32U	A7,ARG3,A17:A16	; 64-bit product ap[i]*w
	NOP	4		; [2,0] in epilogue
	ADDU	A19,A16,A19:A18	; low word of product + carry
||	MV.S	A17,A21		; keep high word of product
	SPKERNEL 2,1		; leave slot for "return value"
||	STW	A18,*ARG0++	; rp[i]
||	ADD.L	A19,A21,A19	; carry into next word
;;====================================================================
	BNOP	RA,4
	MV	A19,RET		; return value
	.endasmfunc

	.global	_bn_sqr_words
;;--------------------------------------------------------------------
;; Squares every input word: rp[2*i] and rp[2*i+1] receive the low and
;; high 32 bits of ap[i]*ap[i] for every i in [0,num).
;;   ARG0 = rp, ARG1 = ap, ARG2 = num (count of 32-bit words)
;; RET is cleared only on the num==0 path; otherwise A4 (ARG0/RET) is
;; consumed as the store pointer for the odd result words.
;;--------------------------------------------------------------------
_bn_sqr_words:
	.asmfunc
	MV	ARG2,B0		; B0 = num, doubles as predicate below
  [!B0]	BNOP	RA		; num==0: return right away
||[!B0]	MVK	0,RET
   [B0]	MVC	B0,ILC		; loop count for the SPLOOP
   [B0]	MV	ARG0,B2		; B2   -> rp[0], walks the even words
|| [B0]	ADD	4,ARG0,ARG0	; ARG0 -> rp[1], walks the odd words
	NOP	3

	SPLOOP	2		; 2*n+10
;;====================================================================
	LDW	*ARG1++,B7	; ap[i]
	NOP	4
	MPY32U	B7,B7,B1:B0	; 64-bit square
	NOP	3		; [2,0] in epilogue
	STW	B0,*B2++(8)	; rp[2*i]
	MV	B1,A1		; high word goes out through the A-side pointer
	SPKERNEL 2,0		; fully overlap BNOP RA,5
||	STW	A1,*ARG0++(8)	; rp[2*i+1]
;;====================================================================
	BNOP	RA,5
	.endasmfunc

	.global	_bn_add_words
;;--------------------------------------------------------------------
;; rp[i] = ap[i] + bp[i] + carry for every i in [0,num).
;;   ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = num (count of 32-bit words)
;;   RET  = carry out of the last word, 0 when num is 0
;;--------------------------------------------------------------------
_bn_add_words:
	.asmfunc
	MV	ARG3,B0		; B0 = num, doubles as predicate below
  [!B0]	BNOP	RA		; num==0: return right away...
||[!B0]	MVK	0,RET		; ...with zero carry
   [B0]	MVC	B0,ILC		; loop count for the SPLOOP
   [B0]	ZERO	A1		; carry flag
|| [B0]	MV	ARG0,A3		; A3 = rp store pointer; RET (same register
				; as ARG0) is overwritten inside the loop
	NOP	3

	SPLOOP	2		; 2*n+6
;;====================================================================
	LDW	*ARG2++,A7	; bp[i]
||	LDW	*ARG1++,B7	; ap[i]
	NOP	4
	ADDU	A7,B7,A9:A8	; ap[i]+bp[i], carry lands in A9
	ADDU	A1,A9:A8,A1:A0	; + incoming carry, new carry in A1
	SPKERNEL 0,0		; fully overlap BNOP RA,5
||	STW	A0,*A3++	; write result
||	MV	A1,RET		; keep carry flag in RET
;;====================================================================
	BNOP	RA,5
	.endasmfunc

	.global	_bn_sub_words
;;--------------------------------------------------------------------
;; rp[i] = ap[i] - bp[i] - borrow for every i in [0,num).
;;   ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = num (count of 32-bit words)
;;   RET  = borrow out of the last word (0 or 1), 0 when num is 0
;;--------------------------------------------------------------------
_bn_sub_words:
	.asmfunc
	MV	ARG3,B0		; B0 = num, doubles as predicate below
  [!B0]	BNOP	RA		; num==0: return right away...
||[!B0]	MVK	0,RET		; ...with zero borrow
   [B0]	MVC	B0,ILC		; loop count for the SPLOOP
   [B0]	ZERO	A2		; borrow flag
|| [B0]	MV	ARG0,A3		; A3 = rp store pointer
	NOP	3

	SPLOOP	2		; 2*n+6
;;====================================================================
	LDW	*ARG2++,A7	; bp[i]
||	LDW	*ARG1++,B7	; ap[i]
	NOP	4
	SUBU	B7,A7,A1:A0	; ap[i]-bp[i] as a long result in A1:A0
  [A2]	SUB	A1:A0,1,A1:A0	; subtract the incoming borrow, if any
	SPKERNEL 0,1		; leave slot for "return borrow flag"
||	STW	A0,*A3++	; write result
||	AND	1,A1,A2		; pass on borrow flag
;;====================================================================
	BNOP	RA,4
	AND	1,A1,RET	; return borrow flag
	.endasmfunc

	.global	_bn_div_words
	.global	__divull
;;--------------------------------------------------------------------
;; Thin wrapper that re-packs its three 32-bit arguments into two
;; 64-bit register-pair operands and hands over to the run-time
;; library's __divull.
;;   in:  ARG0 (A4) = high word, ARG1 (B4) = low word, ARG2 (A6) = divisor
;;   out: A5:A4 = ARG0:ARG1, B5:B4 = 0:ARG2
;; All four moves sit in one execute packet with the CALLP, so each
;; reads the register values from before the packet.
;; NOTE(review): CALLP deposits its return address in A3 rather than in
;; RA (B3), presumably so that __divull returns straight to our caller
;; and nothing needs to follow the call -- confirm against the ISA guide.
;;--------------------------------------------------------------------
_bn_div_words:
	.asmfunc
	CALLP	__divull,A3	; jump to rts64plus.lib
||	MV	ARG0,A5
||	MV	ARG1,ARG0
||	MV	ARG2,ARG1
||	ZERO	B5
	.endasmfunc

;;====================================================================
;; Not really Comba algorithm, just straightforward NxM... Dedicated
;; fully unrolled real Comba implementations are asymptotically 2x
;; faster, but naturally larger undertaking. Purpose of this exercise
;; was rather to learn to master nested SPLOOPs...
;;====================================================================
	.global	_bn_sqr_comba8
	.global	_bn_mul_comba8
;;--------------------------------------------------------------------
;; 8x8-word multiplication done as M=8 passes of an N=8 word
;; multiply-accumulate SPLOOP, one pass per word of ap.
;;   ARG0 = rp (result), ARG1 = ap, ARG2 = bp
;; _bn_sqr_comba8 takes only rp and ap: it copies ap into the bp
;; argument register and falls through into _bn_mul_comba8.
;; The sploopNxM? label marks the point where N (B0), M (A0, A3), the
;; pointer copies (A5, B4) and the first ap word (B6) are already set
;; up.
;;--------------------------------------------------------------------
_bn_sqr_comba8:
	MV	ARG1,ARG2
_bn_mul_comba8:
	.asmfunc
	MVK	8,B0		; N, RILC
||	MVK	8,A0		; M, outer loop counter
||	MV	ARG1,A5		; copy ap
||	MV	ARG0,B4		; copy rp
||	ZERO	B19		; high part of accumulator
	MVC	B0,RILC
||	SUB	B0,2,B1		; N-2, initial ILC
||	SUB	B0,1,B2		; const B2=N-1
||	LDW	*A5++,B6	; ap[0]
||	MV	A0,A3		; const A3=M
sploopNxM?:			; for best performance arrange M<=N
   [A0]	SPLOOPD	2		; 2*n+10
||	MVC	B1,ILC
||	ADDAW	B4,B0,B5
||	ZERO	B7
||	LDW	*A5++,A9	; pre-fetch ap[1]
||	ZERO	A1
||	SUB	A0,1,A0
;;====================================================================
;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
;; This is because of Advisory 15 from TI publication SPRZ247I.
;; A1 is zero during the first pass, so the predicated rp[i] load is
;; skipped and B7 keeps the zero written above; the outer code sets
;; A1 to 1 for all subsequent passes.
	LDW	*ARG2++,A7	; bp[i]
	NOP	3
   [A1]	LDW	*B5++,B7	; rp[i]
	MPY32U	A7,B6,B17:B16
	NOP	3
	ADDU	B16,B7,B21:B20
	ADDU	B19,B21:B20,B19:B18
||	MV.S	B17,B23
	SPKERNEL
||	STW	B18,*B4++	; rp[i]
||	ADD.S	B19,B23,B19
;;====================================================================
outer?:				; m*2*(n+1)+10
	SUBAW	ARG2,A3,ARG2	; rewind bp to bp[0]
	SPMASKR
||	CMPGT	A0,1,A2		; done pre-fetching ap[i+1]?
	MVD	A9,B6		; move through .M unit(*)
   [A2]	LDW	*A5++,A9	; pre-fetch ap[i+1]
	SUBAW	B5,B2,B5	; rewind rp to rp[1]
	MVK	1,A1
   [A0]	BNOP.S1	outer?,4
|| [A0]	SUB.L	A0,1,A0
	STW	B19,*B4--[B2]	; rewind rp to rp[1]
||	ZERO.S	B19		; high part of accumulator
;; end of outer?
	BNOP	RA,5		; return
	.endasmfunc
;; (*)	It should be noted that B6 is used as input to MPY32U in
;;	chronologically next cycle in *preceding* SPLOOP iteration.
;;	Normally such arrangement would require DINT, but at this
;;	point SPLOOP is draining and interrupts are disabled
;;	implicitly.

	.global	_bn_sqr_comba4
	.global	_bn_mul_comba4
;;--------------------------------------------------------------------
;; 4x4-word multiplication, eight result words stored to rp[0..7].
;;   ARG0 = rp (result), ARG1 = ap, ARG2 = bp
;; _bn_sqr_comba4 takes only rp and ap: it copies ap into the bp
;; argument register and falls through into _bn_mul_comba4.
;; Two implementations are kept under .if/.else; the ".if 0" arm (a
;; jump into the generic NxM loop with N=M=4) is disabled, so the
;; fully unrolled ".else" arm is what gets assembled.
;;--------------------------------------------------------------------
_bn_sqr_comba4:
	MV	ARG1,ARG2
_bn_mul_comba4:
	.asmfunc
	.if	0
	BNOP	sploopNxM?,3
	;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
	;; because of read-after-write penalties, it's rather
	;; n*2*(n+3)+10, or 66 cycles [plus various overheads]...
	MVK	4,B0		; N, RILC
||	MVK	4,A0		; M, outer loop counter
||	MV	ARG1,A5		; copy ap
||	MV	ARG0,B4		; copy rp
||	ZERO	B19		; high part of accumulator
	MVC	B0,RILC
||	SUB	B0,2,B1		; first ILC
||	SUB	B0,1,B2		; const B2=N-1
||	LDW	*A5++,B6	; ap[0]
||	MV	A0,A3		; const A3=M
	.else
	;; This alternative is exercise in fully unrolled Comba
	;; algorithm implementation that operates at n*(n+1)+12, or
	;; as little as 32 cycles...
	;;
	;; All eight input words are loaded up front (a[] into
	;; B16..B19, b[] into A16..A19).  The sixteen MPY32U products
	;; are then issued one per cycle while ADDU chains on both
	;; register files sum the low and high product halves column
	;; by column; each finished column is stored to rp[k].
	LDW	*ARG1[0],B16	; a[0]
||	LDW	*ARG2[0],A16	; b[0]
	LDW	*ARG1[1],B17	; a[1]
||	LDW	*ARG2[1],A17	; b[1]
	LDW	*ARG1[2],B18	; a[2]
||	LDW	*ARG2[2],A18	; b[2]
	LDW	*ARG1[3],B19	; a[3]
||	LDW	*ARG2[3],A19	; b[3]
	NOP
	MPY32U	A16,B16,A1:A0	; a[0]*b[0]
	MPY32U	A17,B16,A23:A22	; a[0]*b[1]
	MPY32U	A16,B17,A25:A24	; a[1]*b[0]
	MPY32U	A16,B18,A27:A26	; a[2]*b[0]
	STW	A0,*ARG0[0]
||	MPY32U	A17,B17,A29:A28	; a[1]*b[1]
	MPY32U	A18,B16,A31:A30	; a[0]*b[2]
||	ADDU	A22,A1,A1:A0
	MV	A23,B0
||	MPY32U	A19,B16,A21:A20	; a[3]*b[0]
||	ADDU	A24,A1:A0,A1:A0
	ADDU	A25,B0,B1:B0
||	STW	A0,*ARG0[1]
||	MPY32U	A18,B17,A23:A22	; a[2]*b[1]
||	ADDU	A26,A1,A9:A8
	ADDU	A27,B1,B9:B8
||	MPY32U	A17,B18,A25:A24	; a[1]*b[2]
||	ADDU	A28,A9:A8,A9:A8
	ADDU	A29,B9:B8,B9:B8
||	MPY32U	A16,B19,A27:A26	; a[0]*b[3]
||	ADDU	A30,A9:A8,A9:A8
	ADDU	A31,B9:B8,B9:B8
||	ADDU	B0,A9:A8,A9:A8
	STW	A8,*ARG0[2]
||	ADDU	A20,A9,A1:A0
	ADDU	A21,B9,B1:B0
||	MPY32U	A19,B17,A21:A20	; a[3]*b[1]
||	ADDU	A22,A1:A0,A1:A0
	ADDU	A23,B1:B0,B1:B0
||	MPY32U	A18,B18,A23:A22	; a[2]*b[2]
||	ADDU	A24,A1:A0,A1:A0
	ADDU	A25,B1:B0,B1:B0
||	MPY32U	A17,B19,A25:A24	; a[1]*b[3]
||	ADDU	A26,A1:A0,A1:A0
	ADDU	A27,B1:B0,B1:B0
||	ADDU	B8,A1:A0,A1:A0
	STW	A0,*ARG0[3]
||	MPY32U	A19,B18,A27:A26	; a[3]*b[2]
||	ADDU	A20,A1,A9:A8
	ADDU	A21,B1,B9:B8
||	MPY32U	A18,B19,A29:A28	; a[2]*b[3]
||	ADDU	A22,A9:A8,A9:A8
	ADDU	A23,B9:B8,B9:B8
||	MPY32U	A19,B19,A31:A30	; a[3]*b[3]
||	ADDU	A24,A9:A8,A9:A8
	ADDU	A25,B9:B8,B9:B8
||	ADDU	B0,A9:A8,A9:A8
	STW	A8,*ARG0[4]
||	ADDU	A26,A9,A1:A0
	ADDU	A27,B9,B1:B0
||	ADDU	A28,A1:A0,A1:A0
	;; The return branch is issued here; the five execute packets
	;; that follow it complete the computation in its delay slots.
	ADDU	A29,B1:B0,B1:B0
||	BNOP	RA
||	ADDU	B8,A1:A0,A1:A0
	STW	A0,*ARG0[5]
||	ADDU	A30,A1,A9:A8
	ADD	A31,B1,B8
	ADDU	B0,A9:A8,A9:A8	; removed || to avoid cross-path stall below
	ADD	B8,A9,A9
||	STW	A8,*ARG0[6]
	STW	A9,*ARG0[7]
	.endif
	.endasmfunc
+146 −0
Original line number Diff line number Diff line
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# February 2012
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
# C for the time being... The subroutine runs in 37 cycles, which is
# 4.5x faster than compiler-generated code. Though comparison is
# totally unfair, because this module utilizes Galois Field Multiply
# instruction.

# Pick the output file: the first command-line argument that looks like
# "name.ext".  Arguments in front of it are consumed and ignored.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when a file name was actually found, so that a
# bare invocation keeps writing to the inherited STDOUT.  The 3-argument
# form keeps odd characters in the name from being parsed as an open
# mode, and a failure to create the file is fatal instead of silently
# leaving the assembly on the terminal.
if (defined($output) && $output ne "") {
    open STDOUT,'>',$output or die "can't open $output: $!";
}

# Register allocation.  These are deliberately package globals: the
# mul_1x1_* subs and the main sequence interpolate them into the
# generated assembly.
($rp,$a1,$a0,$b1,$b0)=("A4","B4","A6","B6","A8");   # argument vector

# $Alo/$Ahi receive the two halfwords of the multiplicand; $Alox0..3 and
# $Ahix0..3 receive their products with bytes 0..3 of the multiplier.
($Alo,$Alox0,$Alox1,$Alox2,$Alox3)=map("A$_",(16..20));
($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3)=map("B$_",(16..20));
# The four bytes of the multiplier.
($B_0,$B_1,$B_2,$B_3)=("B5","A5","A7","B7");
# Operand registers for the second and third multiplications; they reuse
# $Alo and $B_1.
($A,$B)=($Alo,$B_1);
# Holds the constant 0xFF used to mask out the low multiplier byte.
$xFF="B1";

# mul_1x1_upper(A, B)
#
# Appends to $code the first ("upper") half of a 32x32-bit
# multiplication of registers A and B built from XORMPY: B is split
# into four bytes ($B_0..$B_3), A into two halfwords ($Alo, $Ahi), and
# the eight 16x8 partial products are issued into $Alox0..3 and
# $Ahix0..3.  Combining the partial products is left to
# mul_1x1_merged or mul_1x1_lower.
#
# The emitted text ends on an execute packet that the caller may extend
# by appending lines that start with "||".
# Returns nothing useful; the only effect is on the global $code.
sub mul_1x1_upper {
my ($A,$B)=@_;
$code.=<<___;
	EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
||	AND	$B,$xFF,$B_0
||	SHRU	$B,24,$B_3
	SHRU	$A,16,   $Ahi		; smash $A to two halfwords
||	EXTU	$A,16,16,$Alo

	XORMPY	$Alo,$B_2,$Alox2	; 16x8 bits muliplication
||	XORMPY	$Ahi,$B_2,$Ahix2
||	EXTU	$B,16,24,$B_1
	XORMPY	$Alo,$B_0,$Alox0
||	XORMPY	$Ahi,$B_0,$Ahix0
	XORMPY	$Alo,$B_3,$Alox3
||	XORMPY	$Ahi,$B_3,$Ahix3
	XORMPY	$Alo,$B_1,$Alox1
||	XORMPY	$Ahi,$B_1,$Ahix1
___
}
# mul_1x1_merged(OUTlo, OUThi, A, B)
#
# Appends to $code a software-pipelined stage that does two things at
# once:
#   * finishes the PREVIOUS multiplication, folding its partial
#     products ($Alox0..3, $Ahix0..3) into the result registers
#     OUTlo (low word) and OUThi (high word) -- the unindented
#     instructions, same sequence as mul_1x1_lower;
#   * starts the NEXT multiplication of registers A and B -- the
#     instructions indented by one extra space, same sequence as
#     mul_1x1_upper.
#
# The new $Alox0 product is written to A1 and only moved into $Alox0 in
# the final packet; presumably this keeps the previous $Alox0 intact
# until the XOR that consumes it has executed.
#
# The emitted text ends on an execute packet that the caller may extend
# by appending lines that start with "||".
# Returns nothing useful; the only effect is on the global $code.
sub mul_1x1_merged {
my ($OUTlo,$OUThi,$A,$B)=@_;
$code.=<<___;
	 EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
||	 AND	$B,$xFF,$B_0
||	 SHRU	$B,24,$B_3
	 SHRU	$A,16,   $Ahi		; smash $A to two halfwords
||	 EXTU	$A,16,16,$Alo

	XOR	$Ahix0,$Alox2,$Ahix0
||	MV	$Ahix2,$OUThi
||	 XORMPY	$Alo,$B_2,$Alox2
	 XORMPY	$Ahi,$B_2,$Ahix2
||	 EXTU	$B,16,24,$B_1
||	 XORMPY	$Alo,$B_0,A1		; $Alox0
	XOR	$Ahix1,$Alox3,$Ahix1
||	SHL	$Ahix0,16,$OUTlo
||	SHRU	$Ahix0,16,$Ahix0
	XOR	$Alox0,$OUTlo,$OUTlo
||	XOR	$Ahix0,$OUThi,$OUThi
||	 XORMPY	$Ahi,$B_0,$Ahix0
||	 XORMPY	$Alo,$B_3,$Alox3
||	SHL	$Alox1,8,$Alox1
||	SHL	$Ahix3,8,$Ahix3
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix3,$OUThi,$OUThi
||	 XORMPY	$Ahi,$B_3,$Ahix3
||	SHL	$Ahix1,24,$Alox1
||	SHRU	$Ahix1,8, $Ahix1
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix1,$OUThi,$OUThi
||	 XORMPY	$Alo,$B_1,$Alox1
||	 XORMPY	$Ahi,$B_1,$Ahix1
||	 MV	A1,$Alox0
___
}
# mul_1x1_lower(OUTlo, OUThi)
#
# Appends to $code the second ("lower") half of a multiplication that
# was started by mul_1x1_upper or mul_1x1_merged: the partial products
# in $Alox0..3 and $Ahix0..3 are shifted into position and XORed
# together into the result registers OUTlo (low word) and OUThi (high
# word).  The partial-product registers are clobbered along the way.
#
# The emitted text starts with a commented-out NOP, so "||" lines the
# caller emitted just before still attach to the preceding packet, and
# it ends on a packet the caller may extend with further "||" lines.
# Returns nothing useful; the only effect is on the global $code.
sub mul_1x1_lower {
my ($OUTlo,$OUThi)=@_;
$code.=<<___;
	;NOP
	XOR	$Ahix0,$Alox2,$Ahix0
||	MV	$Ahix2,$OUThi
	NOP
	XOR	$Ahix1,$Alox3,$Ahix1
||	SHL	$Ahix0,16,$OUTlo
||	SHRU	$Ahix0,16,$Ahix0
	XOR	$Alox0,$OUTlo,$OUTlo
||	XOR	$Ahix0,$OUThi,$OUThi
||	SHL	$Alox1,8,$Alox1
||	SHL	$Ahix3,8,$Ahix3
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix3,$OUThi,$OUThi
||	SHL	$Ahix1,24,$Alox1
||	SHRU	$Ahix1,8, $Ahix1
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix1,$OUThi,$OUThi
___
}
# Generate _bn_GF2m_mul_2x2: a 2x2-word polynomial multiplication built
# from three 1x1 multiplications -- a0*b0, a1*b1 and (a0+a1)*(b0+b1) --
# whose results are then combined and stored as four words at rp[0..3].
#
# The heredoc fragments between the sub calls begin with "||" lines on
# purpose: they extend the last execute packet emitted by the preceding
# sub, so the order of the fragments and calls must not be changed.
$code.=<<___;
	.text

	.global	_bn_GF2m_mul_2x2
_bn_GF2m_mul_2x2:
	.asmfunc
	MVK	0xFF,$xFF
___
	&mul_1x1_upper($a0,$b0);		# a0*b0
# Load the operands of the second multiplication (a1, b1).
$code.=<<___;
||	MV	$b1,$B
	MV	$a1,$A
___
	&mul_1x1_merged("A28","B28",$A,$B);	# a0*b0/a1*b1
# Form the operands of the third multiplication (a0+a1, b0+b1).
$code.=<<___;
||	XOR	$b0,$b1,$B
	XOR	$a0,$a1,$A
___
	&mul_1x1_merged("A31","B31",$A,$B);	# a1*b1/(a0+a1)*(b0+b1)
# B28:A28 now holds a0*b0 and B31:A31 holds a1*b1; pre-add them.
$code.=<<___;
	XOR	A28,A31,A29
||	XOR	B28,B31,B29			; a0b0+a1b1
___
	&mul_1x1_lower("A30","B30");		# (a0+a1)*(b0+b1)
# Issue the return branch, then finish combining the three products and
# store the four result words while the branch is in flight.
$code.=<<___;
||	BNOP	B3
	XOR	A29,A30,A30
||	XOR	B29,B30,B30			; (a0+a1)(b0+b1)-a0b0-a1b1
	XOR	B28,A30,A30
||	STW	A28,*${rp}[0]
	XOR	B30,A31,A31
||	STW	A30,*${rp}[1]
	STW	A31,*${rp}[2]
	STW	B31,*${rp}[3]
	.endasmfunc
___

# Write out the generated assembly.
print $code;
# Buffered write errors (disk full, broken pipe, ...) only surface when
# the handle is closed, so check the close: otherwise a truncated
# assembly file would be reported to the build as success.
close STDOUT or die "error closing STDOUT: $!";
Loading