C64x+ assembler pack. linux-c64xplus build is *not* tested nor can it be (3e181369) · Commits · CYBER - Cyber Security / TS 103 523 MSP / TLMSP / TLMSP OpenSSL

Configure

+3 −0

Original line number	Diff line number	Diff line
		@@ -399,6 +399,9 @@ my %table=(
		"linux-alpha+bwx-gcc","gcc:-O3 -DL_ENDIAN -DTERMIO::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
		"linux-alpha-ccc","ccc:-fast -readonly_strings -DL_ENDIAN -DTERMIO::-D_REENTRANT:::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL:${alpha_asm}",
		"linux-alpha+bwx-ccc","ccc:-fast -readonly_strings -DL_ENDIAN -DTERMIO::-D_REENTRANT:::SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL:${alpha_asm}",
		#
		# TI_CGT_C6000_7.3.x is a requirement
		"linux-c64xplus","cl6x:--linux --strip_coff_underscore -ea=.s -eo=.o -mv6400+ -o2 -ox -ms -pden -DOPENSSL_SMALL_FOOTPRINT::-D_REENTRANT:::BN_LLONG:c64xpluscpuid.o:bn-c64xplus.o c64xplus-gf2m.o::aes-c64xplus.o aes_cbc.o aes_ctr.o:::sha1-c64xplus.o sha256-c64xplus.o sha512-c64xplus.o:::::::ghash-c64xplus.o::void:dlfcn:linux-shared:--pic:-z --sysv --shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):true",

		# Android: linux-* but without -DTERMIO and pointers to headers and libs.
		"android","gcc:-mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",

TABLE

+33 −0

Original line number	Diff line number	Diff line
		@@ -3927,6 +3927,39 @@ $ranlib =
		$arflags =
		$multilib =

		*** linux-c64xplus
		$cc = cl6x
		$cflags = --linux --strip_coff_underscore -ea=.s -eo=.o -mv6400+ -o2 -ox -ms -pden -DOPENSSL_SMALL_FOOTPRINT
		$unistd =
		$thread_cflag = -D_REENTRANT
		$sys_id =
		$lflags =
		$bn_ops = BN_LLONG
		$cpuid_obj = c64xpluscpuid.o
		$bn_obj = bn-c64xplus.o c64xplus-gf2m.o
		$des_obj =
		$aes_obj = aes-c64xplus.o aes_cbc.o aes_ctr.o
		$bf_obj =
		$md5_obj =
		$sha1_obj = sha1-c64xplus.o sha256-c64xplus.o sha512-c64xplus.o
		$cast_obj =
		$rc4_obj =
		$rmd160_obj =
		$rc5_obj =
		$wp_obj =
		$cmll_obj =
		$modes_obj = ghash-c64xplus.o
		$engines_obj =
		$perlasm_scheme = void
		$dso_scheme = dlfcn
		$shared_target= linux-shared
		$shared_cflag = --pic
		$shared_ldflag = -z --sysv --shared
		$shared_extension = .so.$(SHLIB_MAJOR).$(SHLIB_MINOR)
		$ranlib = true
		$arflags =
		$multilib =

		*** linux-elf
		$cc = gcc
		$cflags = -DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall

crypto/aes/asm/aes-c64xplus.pl

0 → 100644

+1361 −0

File added.

Preview size limit exceeded, changes collapsed.

crypto/bn/asm/bn-c64xplus.asm

0 → 100644

+333 −0

Original line number	Diff line number	Diff line
		;;====================================================================
		;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
		;; project.
		;;
		;; Rights for redistribution and usage in source and binary forms are
		;; granted according to the OpenSSL license. Warranty of any kind is
		;; disclaimed.
		;;====================================================================
		;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
		;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
		;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
		;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
		;;====================================================================
		.text

		;; Symbolic register names used by every routine in this file:
		;; RA is the register branched to on return, ARG0..ARG5 are the
		;; incoming argument registers (alternating between the A and B
		;; register files), RET shares A4 with ARG0.
		.asg B3,RA
		.asg A4,ARG0
		.asg B4,ARG1
		.asg A6,ARG2
		.asg B6,ARG3
		.asg A8,ARG4
		.asg B8,ARG5
		.asg A4,RET
		;; FP, DP and SP are defined for completeness; none of the
		;; routines visible in this file reference them.
		.asg A15,FP
		.asg B14,DP
		.asg B15,SP

		;;====================================================================
		;; _bn_mul_add_words: rp[i] += ap[i]*w, propagating carry.
		;;   ARG0 = rp, ARG1 = ap, ARG2 = number of words (becomes ILC),
		;;   ARG3 = 32-bit multiplier w (copied to A3).
		;;   RET  = final high word of the accumulator (A19), or 0 when
		;;          the word count is zero (early return below).
		;; rp is read through ARG0 and written back through the copy in A2.
		;;====================================================================
		.global _bn_mul_add_words
		_bn_mul_add_words:
		.asmfunc
		MV ARG2,B0
		[!B0] BNOP RA
		\|\|[!B0] MVK 0,RET
		[B0] MVC B0,ILC
		[B0] ZERO A19 ; high part of accumulator
		\|\| [B0] MV ARG0,A2
		\|\| [B0] MV ARG3,A3
		NOP 3

		SPLOOP 2 ; 2*n+10
		;;====================================================================
		LDW *ARG1++,B7 ; ap[i]
		NOP 3
		LDW *ARG0++,A7 ; rp[i]
		MPY32U B7,A3,A17:A16 ; A17:A16 = ap[i]*w
		NOP 3 ; [2,0] in epilogue
		ADDU A16,A7,A21:A20 ; low product word + rp[i]
		ADDU A19,A21:A20,A19:A18 ; + carry from previous word
		\|\| MV.S A17,A23 ; keep high product word
		SPKERNEL 2,1 ; leave slot for "return value"
		\|\| STW A18,*A2++ ; rp[i]
		\|\| ADD A19,A23,A19 ; next carry = carry-out + high product word
		;;====================================================================
		BNOP RA,4
		MV A19,RET ; return value
		.endasmfunc

		;;====================================================================
		;; _bn_mul_words: rp[i] = low word of (ap[i]*w + carry).
		;;   ARG0 = rp, ARG1 = ap, ARG2 = number of words (becomes ILC),
		;;   ARG3 = 32-bit multiplier w.
		;;   RET  = final high word of the accumulator (A19), or 0 when
		;;          the word count is zero (early return below).
		;;====================================================================
		.global _bn_mul_words
		_bn_mul_words:
		.asmfunc
		MV ARG2,B0
		[!B0] BNOP RA
		\|\|[!B0] MVK 0,RET
		[B0] MVC B0,ILC
		[B0] ZERO A19 ; high part of accumulator
		NOP 3

		SPLOOP 2 ; 2*n+10
		;;====================================================================
		LDW *ARG1++,A7 ; ap[i]
		NOP 4
		MPY32U A7,ARG3,A17:A16 ; A17:A16 = ap[i]*w
		NOP 4 ; [2,0] in epilogue
		ADDU A19,A16,A19:A18 ; low product word + carry
		\|\| MV.S A17,A21 ; keep high product word
		SPKERNEL 2,1 ; leave slot for "return value"
		\|\| STW A18,*ARG0++ ; rp[i]
		\|\| ADD.L A19,A21,A19 ; next carry = carry-out + high product word
		;;====================================================================
		BNOP RA,4
		MV A19,RET ; return value
		.endasmfunc

		;;====================================================================
		;; _bn_sqr_words: rp[2*i] and rp[2*i+1] receive the low and high
		;; words of ap[i]*ap[i].
		;;   ARG0 = rp, ARG1 = ap, ARG2 = number of words (becomes ILC).
		;; Two store pointers are used, 8 bytes apart in stride: B2 starts
		;; at rp (even words) and ARG0 is advanced by 4 (odd words).
		;; Both STW destinations must be memory operands (*reg++), like
		;; every other load/store in this file.
		;;====================================================================
		.global _bn_sqr_words
		_bn_sqr_words:
		.asmfunc
		MV ARG2,B0
		[!B0] BNOP RA
		\|\|[!B0] MVK 0,RET
		[B0] MVC B0,ILC
		[B0] MV ARG0,B2
		\|\| [B0] ADD 4,ARG0,ARG0
		NOP 3

		SPLOOP 2 ; 2*n+10
		;;====================================================================
		LDW *ARG1++,B7 ; ap[i]
		NOP 4
		MPY32U B7,B7,B1:B0 ; B1:B0 = ap[i]^2
		NOP 3 ; [2,0] in epilogue
		STW B0,*B2++(8) ; rp[2*i]
		MV B1,A1 ; high word crosses to the A side for the second store
		SPKERNEL 2,0 ; fully overlap BNOP RA,5
		\|\| STW A1,*ARG0++(8) ; rp[2*i+1]
		;;====================================================================
		BNOP RA,5
		.endasmfunc

		;;====================================================================
		;; _bn_add_words: rp[i] = ap[i] + bp[i] + carry.
		;;   ARG0 = rp (copied to A3 for the stores), ARG1 = ap, ARG2 = bp,
		;;   ARG3 = number of words (becomes ILC).
		;;   RET  = carry out of the last word; 0 when the word count is
		;;          zero (early return below).
		;;====================================================================
		.global _bn_add_words
		_bn_add_words:
		.asmfunc
		MV ARG3,B0
		[!B0] BNOP RA
		\|\|[!B0] MVK 0,RET
		[B0] MVC B0,ILC
		[B0] ZERO A1 ; carry flag
		\|\| [B0] MV ARG0,A3
		NOP 3

		SPLOOP 2 ; 2*n+6
		;;====================================================================
		LDW *ARG2++,A7 ; bp[i]
		\|\| LDW *ARG1++,B7 ; ap[i]
		NOP 4
		ADDU A7,B7,A9:A8 ; ap[i]+bp[i], carry-out lands in A9
		ADDU A1,A9:A8,A1:A0 ; add incoming carry, A1 = new carry
		SPKERNEL 0,0 ; fully overlap BNOP RA,5
		\|\| STW A0,*A3++ ; write result
		\|\| MV A1,RET ; keep carry flag in RET
		;;====================================================================
		BNOP RA,5
		.endasmfunc

		;;====================================================================
		;; _bn_sub_words: rp[i] = ap[i] - bp[i] - borrow.
		;;   ARG0 = rp (copied to A3 for the stores), ARG1 = ap, ARG2 = bp,
		;;   ARG3 = number of words (becomes ILC).
		;;   RET  = borrow out of the last word (bit 0 of A1); 0 when the
		;;          word count is zero (early return below).
		;;====================================================================
		.global _bn_sub_words
		_bn_sub_words:
		.asmfunc
		MV ARG3,B0
		[!B0] BNOP RA
		\|\|[!B0] MVK 0,RET
		[B0] MVC B0,ILC
		[B0] ZERO A2 ; borrow flag
		\|\| [B0] MV ARG0,A3
		NOP 3

		SPLOOP 2 ; 2*n+6
		;;====================================================================
		LDW *ARG2++,A7 ; bp[i]
		\|\| LDW *ARG1++,B7 ; ap[i]
		NOP 4
		SUBU B7,A7,A1:A0 ; ap[i]-bp[i]
		[A2] SUB A1:A0,1,A1:A0 ; apply borrow from the previous word
		SPKERNEL 0,1 ; leave slot for "return borrow flag"
		\|\| STW A0,*A3++ ; write result
		\|\| AND 1,A1,A2 ; pass on borrow flag
		;;====================================================================
		BNOP RA,4
		AND 1,A1,RET ; return borrow flag
		.endasmfunc

		;;====================================================================
		;; _bn_div_words: thin wrapper around the run-time library's
		;; __divull. The three incoming arguments are shuffled in a single
		;; execute packet: ARG0 -> A5, ARG1 -> ARG0 (A4), ARG2 -> ARG1 (B4),
		;; and B5 is cleared, i.e. the callee sees A5:A4 and B5:B4.
		;; NOTE(review): presumably A5:A4 is the 64-bit dividend (high:low)
		;; and B5:B4 the zero-extended 32-bit divisor - confirm against the
		;; __divull calling convention.
		;;====================================================================
		.global _bn_div_words
		.global __divull
		_bn_div_words:
		.asmfunc
		CALLP __divull,A3 ; jump to rts64plus.lib
		\|\| MV ARG0,A5
		\|\| MV ARG1,ARG0
		\|\| MV ARG2,ARG1
		\|\| ZERO B5
		.endasmfunc

		;;====================================================================
		;; Not really Comba algorithm, just straightforward NxM... Dedicated
		;; fully unrolled real Comba implementations are asymptotically 2x
		;; faster, but naturally larger undertaking. Purpose of this exercise
		;; was rather to learn to master nested SPLOOPs...
		;;====================================================================
		;; _bn_mul_comba8: rp = ap x bp with N = M = 8 words, computed as M
		;; passes of the bn_mul_add_words inner loop (one pass per ap[i]).
		;;   ARG0 = rp, ARG1 = ap, ARG2 = bp.
		;; _bn_sqr_comba8 copies ARG1 into ARG2 and falls through, so the
		;; square is computed as ap x ap by the same code.
		;; The label sploopNxM? is also the target of the disabled (.if 0)
		;; variant of _bn_mul_comba4.
		.global _bn_sqr_comba8
		.global _bn_mul_comba8
		_bn_sqr_comba8:
		MV ARG1,ARG2
		_bn_mul_comba8:
		.asmfunc
		MVK 8,B0 ; N, RILC
		\|\| MVK 8,A0 ; M, outer loop counter
		\|\| MV ARG1,A5 ; copy ap
		\|\| MV ARG0,B4 ; copy rp
		\|\| ZERO B19 ; high part of accumulator
		MVC B0,RILC
		\|\| SUB B0,2,B1 ; N-2, initial ILC
		\|\| SUB B0,1,B2 ; const B2=N-1
		\|\| LDW *A5++,B6 ; ap[0]
		\|\| MV A0,A3 ; const A3=M
		sploopNxM?: ; for best performance arrange M<=N
		[A0] SPLOOPD 2 ; 2*n+10
		\|\| MVC B1,ILC
		\|\| ADDAW B4,B0,B5
		\|\| ZERO B7
		\|\| LDW *A5++,A9 ; pre-fetch ap[1]
		\|\| ZERO A1
		\|\| SUB A0,1,A0
		;;====================================================================
		;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
		;; This is because of Advisory 15 from TI publication SPRZ247I.
		;; A1 is zero during the first pass, so rp[i] is not loaded and B7
		;; stays zero; the outer loop sets A1 to 1 for all later passes.
		LDW *ARG2++,A7 ; bp[i]
		NOP 3
		[A1] LDW *B5++,B7 ; rp[i]
		MPY32U A7,B6,B17:B16
		NOP 3
		ADDU B16,B7,B21:B20
		ADDU B19,B21:B20,B19:B18
		\|\| MV.S B17,B23
		SPKERNEL
		\|\| STW B18,*B4++ ; rp[i]
		\|\| ADD.S B19,B23,B19
		;;====================================================================
		outer?: ; m*2*(n+1)+10
		SUBAW ARG2,A3,ARG2 ; rewind bp to bp[0]
		SPMASKR
		\|\| CMPGT A0,1,A2 ; done pre-fetching ap[i+1]?
		MVD A9,B6 ; move through .M unit(*)
		[A2] LDW *A5++,A9 ; pre-fetch ap[i+1]
		SUBAW B5,B2,B5 ; rewind rp to rp[1]
		MVK 1,A1
		[A0] BNOP.S1 outer?,4
		\|\| [A0] SUB.L A0,1,A0
		STW B19,*B4--[B2] ; rewind rp to rp[1]
		\|\| ZERO.S B19 ; high part of accumulator
		;; end of outer?
		BNOP RA,5 ; return
		.endasmfunc
		;; (*) It should be noted that B6 is used as input to MPY32U in
		;; chronologically next cycle in preceding SPLOOP iteration.
		;; Normally such arrangement would require DINT, but at this
		;; point SPLOOP is draining and interrupts are disabled
		;; implicitly.

		;; _bn_mul_comba4: rp[0..7] = a[0..3] x b[0..3].
		;;   ARG0 = rp, ARG1 = a, ARG2 = b.
		;; _bn_sqr_comba4 copies ARG1 into ARG2 and falls through, so the
		;; square is computed as a x a by the same code.
		;; Two implementations are present; the ".if 0" one (reuse of the
		;; generic NxM SPLOOP with N = M = 4) is assembled out, and the
		;; fully unrolled ".else" branch is the one that is built.
		.global _bn_sqr_comba4
		.global _bn_mul_comba4
		_bn_sqr_comba4:
		MV ARG1,ARG2
		_bn_mul_comba4:
		.asmfunc
		.if 0
		BNOP sploopNxM?,3
		;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
		;; because of read-after-write penalties, it's rather
		;; n*2*(n+3)+10, or 66 cycles [plus various overheads]...
		MVK 4,B0 ; N, RILC
		\|\| MVK 4,A0 ; M, outer loop counter
		\|\| MV ARG1,A5 ; copy ap
		\|\| MV ARG0,B4 ; copy rp
		\|\| ZERO B19 ; high part of accumulator
		MVC B0,RILC
		\|\| SUB B0,2,B1 ; first ILC
		\|\| SUB B0,1,B2 ; const B2=N-1
		\|\| LDW *A5++,B6 ; ap[0]
		\|\| MV A0,A3 ; const A3=M
		.else
		;; This alternative is exercise in fully unrolled Comba
		;; algorithm implementation that operates at n*(n+1)+12, or
		;; as little as 32 cycles...
		;; Operands are loaded once: a[0..3] into B16..B19 and b[0..3]
		;; into A16..A19. Low product words are summed on the A side
		;; and high product words on the B side; each STW writes one
		;; finished word of rp.
		LDW *ARG1[0],B16 ; a[0]
		\|\| LDW *ARG2[0],A16 ; b[0]
		LDW *ARG1[1],B17 ; a[1]
		\|\| LDW *ARG2[1],A17 ; b[1]
		LDW *ARG1[2],B18 ; a[2]
		\|\| LDW *ARG2[2],A18 ; b[2]
		LDW *ARG1[3],B19 ; a[3]
		\|\| LDW *ARG2[3],A19 ; b[3]
		NOP
		MPY32U A16,B16,A1:A0 ; a[0]*b[0]
		MPY32U A17,B16,A23:A22 ; a[0]*b[1]
		MPY32U A16,B17,A25:A24 ; a[1]*b[0]
		MPY32U A16,B18,A27:A26 ; a[2]*b[0]
		STW A0,*ARG0[0]
		\|\| MPY32U A17,B17,A29:A28 ; a[1]*b[1]
		MPY32U A18,B16,A31:A30 ; a[0]*b[2]
		\|\| ADDU A22,A1,A1:A0
		MV A23,B0
		\|\| MPY32U A19,B16,A21:A20 ; a[3]*b[0]
		\|\| ADDU A24,A1:A0,A1:A0
		ADDU A25,B0,B1:B0
		\|\| STW A0,*ARG0[1]
		\|\| MPY32U A18,B17,A23:A22 ; a[2]*b[1]
		\|\| ADDU A26,A1,A9:A8
		ADDU A27,B1,B9:B8
		\|\| MPY32U A17,B18,A25:A24 ; a[1]*b[2]
		\|\| ADDU A28,A9:A8,A9:A8
		ADDU A29,B9:B8,B9:B8
		\|\| MPY32U A16,B19,A27:A26 ; a[0]*b[3]
		\|\| ADDU A30,A9:A8,A9:A8
		ADDU A31,B9:B8,B9:B8
		\|\| ADDU B0,A9:A8,A9:A8
		STW A8,*ARG0[2]
		\|\| ADDU A20,A9,A1:A0
		ADDU A21,B9,B1:B0
		\|\| MPY32U A19,B17,A21:A20 ; a[3]*b[1]
		\|\| ADDU A22,A1:A0,A1:A0
		ADDU A23,B1:B0,B1:B0
		\|\| MPY32U A18,B18,A23:A22 ; a[2]*b[2]
		\|\| ADDU A24,A1:A0,A1:A0
		ADDU A25,B1:B0,B1:B0
		\|\| MPY32U A17,B19,A25:A24 ; a[1]*b[3]
		\|\| ADDU A26,A1:A0,A1:A0
		ADDU A27,B1:B0,B1:B0
		\|\| ADDU B8,A1:A0,A1:A0
		STW A0,*ARG0[3]
		\|\| MPY32U A19,B18,A27:A26 ; a[3]*b[2]
		\|\| ADDU A20,A1,A9:A8
		ADDU A21,B1,B9:B8
		\|\| MPY32U A18,B19,A29:A28 ; a[2]*b[3]
		\|\| ADDU A22,A9:A8,A9:A8
		ADDU A23,B9:B8,B9:B8
		\|\| MPY32U A19,B19,A31:A30 ; a[3]*b[3]
		\|\| ADDU A24,A9:A8,A9:A8
		ADDU A25,B9:B8,B9:B8
		\|\| ADDU B0,A9:A8,A9:A8
		STW A8,*ARG0[4]
		\|\| ADDU A26,A9,A1:A0
		ADDU A27,B9,B1:B0
		\|\| ADDU A28,A1:A0,A1:A0
		ADDU A29,B1:B0,B1:B0
		\|\| BNOP RA
		\|\| ADDU B8,A1:A0,A1:A0
		;; The return branch is already in flight; the remaining
		;; packets run while it completes.
		STW A0,*ARG0[5]
		\|\| ADDU A30,A1,A9:A8
		ADD A31,B1,B8
		ADDU B0,A9:A8,A9:A8 ; removed \|\| to avoid cross-path stall below
		ADD B8,A9,A9
		\|\| STW A8,*ARG0[6]
		STW A9,*ARG0[7]
		.endif
		.endasmfunc

crypto/bn/asm/c64xplus-gf2m.pl

0 → 100644

+146 −0

Original line number	Diff line number	Diff line
		#!/usr/bin/env perl
		#
		# ====================================================================
		# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
		# project. The module is, however, dual licensed under OpenSSL and
		# CRYPTOGAMS licenses depending on where you obtain it. For further
		# details see http://www.openssl.org/~appro/cryptogams/.
		# ====================================================================
		#
		# February 2012
		#
		# The module implements bn_GF2m_mul_2x2 polynomial multiplication
		# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
		# C for the time being... The subroutine runs in 37 cycles, which is
		# 4.5x faster than compiler-generated code. Though comparison is
		# totally unfair, because this module utilizes Galois Field Multiply
		# instruction.

		# Consume command-line arguments until one looks like a file name
		# (word characters, a dot and an extension); that one is the output
		# file. $output ends up false when no such argument was given.
		while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

		# Redirect STDOUT to the output file. Three-argument open keeps the
		# file name from being interpreted as a mode, and a failure is fatal
		# instead of silently discarding the generated code. Without an
		# output argument the code is simply written to the original STDOUT.
		if ($output) {
			open STDOUT,'>',$output or die "can't open $output: $!";
		}

		($rp,$a1,$a0,$b1,$b0)=("A4","B4","A6","B6","A8"); # argument vector

		# Partial products of the 16x8-bit multiplications: $Alox0..3 are
		# (low halfword of A) x (byte 0..3 of B), $Ahix0..3 the same for the
		# high halfword of A.
		($Alo,$Alox0,$Alox1,$Alox2,$Alox3)=map("A$_",(16..20));
		($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3)=map("B$_",(16..20));
		# The four bytes of the B operand.
		($B_0,$B_1,$B_2,$B_3)=("B5","A5","A7","B7");
		# Registers that receive the operands of the 2nd and 3rd 1x1
		# multiplications; they deliberately share $Alo and $B_1.
		($A,$B)=($Alo,$B_1);
		# Holds the 0xFF byte mask.
		$xFF="B1";

		# mul_1x1_upper($A,$B): append to $code the first half of a 32x32-bit
		# carry-less multiplication of registers $A and $B: split $B into
		# four bytes ($B_0..$B_3) and $A into two halfwords ($Alo,$Ahi), then
		# issue the eight XORMPY 16x8-bit products into $Alox0..3/$Ahix0..3.
		# The products are not combined here; mul_1x1_merged or
		# mul_1x1_lower must follow to do that.
		# The emitted text is deliberately left without a trailing packet so
		# that the caller can append a parallel ("||") instruction.
		sub mul_1x1_upper {
		my ($A,$B)=@_;
		$code.=<<___;
		EXTU $B,8,24,$B_2 ; smash $B to 4 bytes
		\|\| AND $B,$xFF,$B_0
		\|\| SHRU $B,24,$B_3
		SHRU $A,16, $Ahi ; smash $A to two halfwords
		\|\| EXTU $A,16,16,$Alo

		XORMPY $Alo,$B_2,$Alox2 ; 16x8 bits muliplication
		\|\| XORMPY $Ahi,$B_2,$Ahix2
		\|\| EXTU $B,16,24,$B_1
		XORMPY $Alo,$B_0,$Alox0
		\|\| XORMPY $Ahi,$B_0,$Ahix0
		XORMPY $Alo,$B_3,$Alox3
		\|\| XORMPY $Ahi,$B_3,$Ahix3
		XORMPY $Alo,$B_1,$Alox1
		\|\| XORMPY $Ahi,$B_1,$Ahix1
		___
		}
		# mul_1x1_merged($OUTlo,$OUThi,$A,$B): append to $code a block that
		# interleaves two things:
		#   - combining the partial products left in $Alox0..3/$Ahix0..3 by
		#     the previous multiplication into the 64-bit result
		#     $OUThi:$OUTlo (the XOR/SHL/SHRU instructions), and
		#   - starting the next multiplication of $A by $B (the EXTU/AND/SHRU
		#     operand split and the XORMPY instructions).
		# Register A1 temporarily holds the new $Alox0 product, because the
		# old $Alox0 is still needed by the combine step; it is moved into
		# place by the final MV.
		sub mul_1x1_merged {
		my ($OUTlo,$OUThi,$A,$B)=@_;
		$code.=<<___;
		EXTU $B,8,24,$B_2 ; smash $B to 4 bytes
		\|\| AND $B,$xFF,$B_0
		\|\| SHRU $B,24,$B_3
		SHRU $A,16, $Ahi ; smash $A to two halfwords
		\|\| EXTU $A,16,16,$Alo

		XOR $Ahix0,$Alox2,$Ahix0
		\|\| MV $Ahix2,$OUThi
		\|\| XORMPY $Alo,$B_2,$Alox2
		XORMPY $Ahi,$B_2,$Ahix2
		\|\| EXTU $B,16,24,$B_1
		\|\| XORMPY $Alo,$B_0,A1 ; $Alox0
		XOR $Ahix1,$Alox3,$Ahix1
		\|\| SHL $Ahix0,16,$OUTlo
		\|\| SHRU $Ahix0,16,$Ahix0
		XOR $Alox0,$OUTlo,$OUTlo
		\|\| XOR $Ahix0,$OUThi,$OUThi
		\|\| XORMPY $Ahi,$B_0,$Ahix0
		\|\| XORMPY $Alo,$B_3,$Alox3
		\|\| SHL $Alox1,8,$Alox1
		\|\| SHL $Ahix3,8,$Ahix3
		XOR $Alox1,$OUTlo,$OUTlo
		\|\| XOR $Ahix3,$OUThi,$OUThi
		\|\| XORMPY $Ahi,$B_3,$Ahix3
		\|\| SHL $Ahix1,24,$Alox1
		\|\| SHRU $Ahix1,8, $Ahix1
		XOR $Alox1,$OUTlo,$OUTlo
		\|\| XOR $Ahix1,$OUThi,$OUThi
		\|\| XORMPY $Alo,$B_1,$Alox1
		\|\| XORMPY $Ahi,$B_1,$Ahix1
		\|\| MV A1,$Alox0
		___
		}
		# mul_1x1_lower($OUTlo,$OUThi): append to $code the combine step only:
		# fold the partial products left in $Alox0..3/$Ahix0..3 by the
		# preceding multiplication into the 64-bit result $OUThi:$OUTlo.
		# This is the same XOR/SHL/SHRU sequence as in mul_1x1_merged, with
		# no new multiplication started.
		sub mul_1x1_lower {
		my ($OUTlo,$OUThi)=@_;
		$code.=<<___;
		;NOP
		XOR $Ahix0,$Alox2,$Ahix0
		\|\| MV $Ahix2,$OUThi
		NOP
		XOR $Ahix1,$Alox3,$Ahix1
		\|\| SHL $Ahix0,16,$OUTlo
		\|\| SHRU $Ahix0,16,$Ahix0
		XOR $Alox0,$OUTlo,$OUTlo
		\|\| XOR $Ahix0,$OUThi,$OUThi
		\|\| SHL $Alox1,8,$Alox1
		\|\| SHL $Ahix3,8,$Ahix3
		XOR $Alox1,$OUTlo,$OUTlo
		\|\| XOR $Ahix3,$OUThi,$OUThi
		\|\| SHL $Ahix1,24,$Alox1
		\|\| SHRU $Ahix1,8, $Ahix1
		XOR $Alox1,$OUTlo,$OUTlo
		\|\| XOR $Ahix1,$OUThi,$OUThi
		___
		}
		# Assemble _bn_GF2m_mul_2x2: a 64x64-bit carry-less multiplication of
		# a1:a0 by b1:b0 built from three 32x32-bit multiplications
		# (a0*b0, a1*b1 and (a0+a1)*(b0+b1)); the four result words are
		# stored to rp[0..3]. Each heredoc fragment below continues the
		# execute packet left open by the preceding helper call, so the order
		# of the fragments is significant.
		$code.=<<___;
		.text

		.global _bn_GF2m_mul_2x2
		_bn_GF2m_mul_2x2:
		.asmfunc
		MVK 0xFF,$xFF
		___
		# Start a0*b0, and meanwhile load a1/b1 into the operand registers.
		&mul_1x1_upper($a0,$b0); # a0b0
		$code.=<<___;
		\|\| MV $b1,$B
		MV $a1,$A
		___
		# Collect a0*b0 into B28:A28 while a1*b1 is being multiplied.
		&mul_1x1_merged("A28","B28",$A,$B); # a0b0/a1b1
		$code.=<<___;
		\|\| XOR $b0,$b1,$B
		XOR $a0,$a1,$A
		___
		# Collect a1*b1 into B31:A31 while (a0+a1)*(b0+b1) is multiplied.
		&mul_1x1_merged("A31","B31",$A,$B); # a1b1/(a0+a1)(b0+b1)
		$code.=<<___;
		XOR A28,A31,A29
		\|\| XOR B28,B31,B29 ; a0b0+a1b1
		___
		# Collect the middle product into B30:A30, then fold it with the
		# other two and store the four result words during the return branch.
		&mul_1x1_lower("A30","B30"); # (a0+a1)(b0+b1)
		$code.=<<___;
		\|\| BNOP B3
		XOR A29,A30,A30
		\|\| XOR B29,B30,B30 ; (a0+a1)(b0+b1)-a0b0-a1b1
		XOR B28,A30,A30
		\|\| STW A28,*${rp}[0]
		XOR B30,A31,A31
		\|\| STW A30,*${rp}[1]
		STW A31,*${rp}[2]
		STW B31,*${rp}[3]
		.endasmfunc
		___

		# Emit the generated assembly. Buffered write errors (full disk,
		# closed pipe) only surface when the handle is closed, so a failing
		# close must abort the build rather than leave a truncated file.
		print $code;
		close STDOUT or die "error closing STDOUT: $!";