Commit cb2ed545 authored by Andy Polyakov

Add ARMv8 Montgomery multiplication module.



Reviewed-by: Rich Salz <rsalz@openssl.org>
parent 35141544
+244 −0
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2015
#
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
# work. While it does improve RSA sign performance by 20-30% (less for
# longer keys) on most processors, for some reason RSA2048 is not
# faster and RSA4096 goes 15-20% slower on Cortex-A57. The multiplication
# instruction issue rate is limited on the processor in question, which
# means that a dedicated squaring procedure is a must. Well, actually all
# contemporary AArch64 processors seem to have a limited multiplication
# issue rate, i.e. they can't issue a multiplication every cycle, which
# explains the moderate improvement coefficients relative to
# compiler-generated code. Recall that the compiler is instructed to use
# umulh and therefore issues the same number of multiplication
# instructions to do the job. Assembly's edge lies in minimizing the
# number of "collateral" instructions and, of course, in instruction
# scheduling.
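#
# As a minimal illustration (not part of the source below; the mul64
# helper name is hypothetical): with 64-bit limbs a compiler typically
# lowers a 64x64->128-bit product to exactly one mul (low half) plus one
# umulh (high half), which is why C and assembly end up executing the
# same number of multiplication instructions:
#
#	#include <stdint.h>
#
#	static inline void mul64(uint64_t a, uint64_t b,
#	                         uint64_t *lo, uint64_t *hi)
#	{
#	    unsigned __int128 p = (unsigned __int128)a * b;
#	    *lo = (uint64_t)p;		/* mul   */
#	    *hi = (uint64_t)(p >> 64);	/* umulh */
#	}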

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";	# pipe output through the perlasm translator
*STDOUT=*OUT;

($lo0,$hi0,$aj,$m0,$alo,$ahi,
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);

# int bn_mul_mont(
$rp="x0";	# BN_ULONG *rp,
$ap="x1";	# const BN_ULONG *ap,
$bp="x2";	# const BN_ULONG *bp,
$np="x3";	# const BN_ULONG *np,
$n0="x4";	# const BN_ULONG *n0,
$num="x5";	# int num);
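
# For reference, a hedged C sketch of the word-serial algorithm the code
# below implements: an outer loop over bp[], fused ap[j]*bp[i] and
# np[j]*m1 carry chains, and the partial result shifted down one word per
# iteration. It assumes 64-bit BN_ULONG and a compiler __int128 type;
# ref_mul_mont_loop and its scratch array tp[] are hypothetical names,
# not OpenSSL API, and n0 is passed as the single word that the real
# function dereferences from its n0 pointer. The final subtraction of the
# modulus is sketched separately after the assembly code.
#
#	#include <stdint.h>
#	#include <string.h>
#
#	typedef uint64_t bn_ulong;
#	typedef unsigned __int128 bn_udbl;
#
#	/* accumulate the Montgomery product into tp[0..num-1], return the
#	 * upmost overflow word */
#	static bn_ulong ref_mul_mont_loop(bn_ulong *tp, const bn_ulong *ap,
#	                                  const bn_ulong *bp, const bn_ulong *np,
#	                                  bn_ulong n0, int num)
#	{
#	    bn_ulong ovf = 0;
#
#	    memset(tp, 0, num * sizeof(tp[0]));
#	    for (int i = 0; i < num; i++) {
#	        bn_udbl  t  = (bn_udbl)ap[0] * bp[i] + tp[0];	/* ap[0]*bp[i] */
#	        bn_ulong m1 = (bn_ulong)t * n0;			/* "tp[0]"*n0  */
#	        bn_udbl  c  = (bn_udbl)np[0] * m1 + (bn_ulong)t;/* low word is 0, discarded */
#	        bn_ulong carry_a = (bn_ulong)(t >> 64);
#	        bn_ulong carry_n = (bn_ulong)(c >> 64);
#
#	        for (int j = 1; j < num; j++) {
#	            t = (bn_udbl)ap[j] * bp[i] + tp[j] + carry_a;
#	            c = (bn_udbl)np[j] * m1 + (bn_ulong)t + carry_n;
#	            tp[j - 1] = (bn_ulong)c;			/* shifted down one word */
#	            carry_a = (bn_ulong)(t >> 64);
#	            carry_n = (bn_ulong)(c >> 64);
#	        }
#	        c = (bn_udbl)carry_a + carry_n + ovf;
#	        tp[num - 1] = (bn_ulong)c;
#	        ovf = (bn_ulong)(c >> 64);			/* upmost overflow bit */
#	    }
#	    return ovf;
#	}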

$code.=<<___;
.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	$m0,[$bp],#8		// bp[0]
	sub	$tp,sp,$num,lsl#3
	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	and	$tp,$tp,#-16		// ABI says so
	ldp	$hi1,$nj,[$np],#16	// np[0..1]

	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	mul	$alo,$aj,$m0		// ap[1]*bp[0]
	umulh	$ahi,$aj,$m0

	mul	$m1,$lo0,$n0		// "tp[0]"*n0
	mov	sp,$tp			// alloca

	mul	$lo1,$hi1,$m1		// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	adds	$lo1,$lo1,$lo0		// discarded
	umulh	$nhi,$nj,$m1
	adc	$hi1,$hi1,xzr
	cbz	$j,.L1st_skip

.L1st:
	ldr	$aj,[$ap],#8
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	ldr	$nj,[$np],#8
	adds	$lo1,$nlo,$hi1
	mul	$alo,$aj,$m0		// ap[j]*bp[0]
	adc	$hi1,$nhi,xzr
	umulh	$ahi,$aj,$m0

	adds	$lo1,$lo1,$lo0
	mul	$nlo,$nj,$m1		// np[j]*m1
	adc	$hi1,$hi1,xzr
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp],#8		// tp[j-1]
	cbnz	$j,.L1st

.L1st_skip:
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adc	$hi1,$nhi,xzr

	adds	$lo1,$lo1,$lo0
	sub	$i,$num,#8		// i=num-1
	adcs	$hi1,$hi1,$hi0

	adc	$ovf,xzr,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp]

.Louter:
	ldr	$m0,[$bp],#8		// bp[i]
	ldp	$hi0,$aj,[$ap],#16
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8

	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	ldp	$hi1,$nj,[$np],#16
	mul	$alo,$aj,$m0		// ap[1]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$m1,$lo0,$n0
	sub	$i,$i,#8		// i--

	mul	$lo1,$hi1,$m1		// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	adds	$lo1,$lo1,$lo0
	umulh	$nhi,$nj,$m1
	cbz	$j,.Linner_skip

.Linner:
	ldr	$aj,[$ap],#8
	adc	$hi1,$hi1,xzr
	ldr	$tj,[$tp],#8		// tp[j]
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	ldr	$nj,[$np],#8
	adc	$hi1,$nhi,xzr

	mul	$alo,$aj,$m0		// ap[j]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$nlo,$nj,$m1		// np[j]*m1
	adds	$lo1,$lo1,$lo0
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp,#-16]		// tp[j-1]
	cbnz	$j,.Linner

.Linner_skip:
	ldr	$tj,[$tp],#8		// tp[j]
	adc	$hi1,$hi1,xzr
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adc	$hi1,$nhi,$ovf

	adds	$lo0,$lo0,$tj
	adc	$hi0,$hi0,xzr

	adds	$lo1,$lo1,$lo0
	adcs	$hi1,$hi1,$hi0
	adc	$ovf,xzr,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp,#-16]

	cbnz	$i,.Louter

	// Final step. We check whether the result is larger than the
	// modulus and, if it is, subtract the modulus. But comparison
	// implies subtraction, so we subtract the modulus, see if the
	// subtraction borrowed, and conditionally copy back the original
	// value (see the commented C sketch after the code below).
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$nj,[$np],#8		// np[0]
	subs	$j,$num,#8		// j=num-1 and clear borrow
	mov	$ap,$rp
.Lsub:
	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
	ldr	$tj,[$tp],#8
	sub	$j,$j,#8		// j--
	ldr	$nj,[$np],#8
	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
	cbnz	$j,.Lsub

	sbcs	$aj,$tj,$nj
	sbcs	$ovf,$ovf,xzr		// did it borrow?
	str	$aj,[$ap],#8		// rp[num-1]

	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$aj,[$rp],#8		// rp[0]
	sub	$num,$num,#8		// num--
	nop
.Lcond_copy:
	sub	$num,$num,#8		// num--
	csel	$nj,$aj,$tj,cs		// did it borrow?
	ldr	$tj,[$tp],#8
	ldr	$aj,[$rp],#8
	str	xzr,[$tp,#-16]		// wipe tp
	str	$nj,[$rp,#-16]
	cbnz	$num,.Lcond_copy

	csel	$nj,$aj,$tj,cs
	str	xzr,[$tp,#-8]		// wipe tp
	str	$nj,[$rp,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont

.asciz	"Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___
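
# A hedged C sketch of the final step in the code above (.Lsub and
# .Lcond_copy): subtract the modulus unconditionally, then use the overall
# borrow, together with the overflow word, to decide whether to keep the
# difference or restore the original value, wiping the scratch buffer
# either way. The assembly performs the selection with csel rather than a
# branch; ref_mont_final and tp[] are hypothetical names, not OpenSSL API.
#
#	#include <stdint.h>
#
#	typedef uint64_t bn_ulong;
#	typedef unsigned __int128 bn_udbl;
#
#	static void ref_mont_final(bn_ulong *rp, bn_ulong *tp,
#	                           const bn_ulong *np, bn_ulong ovf, int num)
#	{
#	    bn_ulong borrow = 0;
#
#	    /* rp[] = tp[] - np[], tracking the running borrow (.Lsub) */
#	    for (int j = 0; j < num; j++) {
#	        bn_udbl d = (bn_udbl)tp[j] - np[j] - borrow;
#	        rp[j] = (bn_ulong)d;
#	        borrow = (bn_ulong)(d >> 64) & 1;
#	    }
#
#	    /* it borrowed overall only if the overflow word could not absorb it */
#	    int keep_diff = ovf >= borrow;
#
#	    /* conditionally copy tp[] back and wipe it (.Lcond_copy) */
#	    for (int j = 0; j < num; j++) {
#	        rp[j] = keep_diff ? rp[j] : tp[j];
#	        tp[j] = 0;
#	    }
#	}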

print $code;

close STDOUT;