Commit 7d9cf7c0 authored by Andy Polyakov
Browse files

Eliminate conditional final subtraction in Montgomery assembler modules.

parent 55525742
Loading
Loading
Loading
Loading
+23 −31
Original line number Diff line number Diff line
@@ -258,56 +258,48 @@ bn_mul_mont:
	stq	$hi1,16($tp)
	bne	$tj,.Louter

	s8addq	$num,sp,$ap
	mov	$rp,$bp
	s8addq	$num,sp,$tj	# &tp[num]
	mov	$rp,$bp		# put rp aside
	mov	sp,$tp
	mov	0,$hi0

	bne	$hi1,.Lsub
	cmpult	$nj,$lo1,AT
	bne	AT,.Lsub

.align	4
.Lcopy:	ldq	AT,($tp)
	lda	$tp,8($tp)
	stq	AT,($rp)
	cmpult	$tp,$ap,AT
	stq	zero,-8($tp)
	nop
	lda	$rp,8($rp)
	bne	AT,.Lcopy
	mov	1,v0
	br	.Lexit
	mov	sp,$ap
	srl	$nj,62,AT	# boundary condition...
	beq	AT,.Lcopy	# ... is met
	mov	0,$hi0		# clear borrow bit

.align	4
.Lsub:	ldq	$lo0,($tp)
	ldq	$lo1,($np)
	subq	$lo0,$lo1,$lo1
	lda	$tp,8($tp)
	lda	$np,8($np)
	subq	$lo0,$lo1,$lo1	# tp[i]-np[i]
	cmpult	$lo0,$lo1,AT
	subq	$lo1,$hi0,$lo0
	cmpult	$lo1,$lo0,$hi0
	lda	$tp,8($tp)
	or	$hi0,AT,$hi0
	lda	$np,8($np)
	stq	$lo0,($rp)
	cmpult	$tp,$ap,v0
	cmpult	$tp,$tj,v0
	lda	$rp,8($rp)
	bne	v0,.Lsub

	subq	$hi1,$hi0,$hi0
	subq	$hi1,$hi0,$hi0	# handle upmost overflow bit
	mov	sp,$tp
	cmpule	$hi1,$hi0,AT
	mov	$bp,$rp
	bne	AT,.Lcopy
	mov	$bp,$rp		# restore rp

	and	sp,$hi0,$ap
	bic	$bp,$hi0,$bp
	bis	$bp,$ap,$ap	# ap=borrow?tp:rp

.align	4
.Lzap:	stq	zero,($tp)
	cmpult	$tp,$ap,AT
.Lcopy:	ldq	$aj,($ap)	# copy or in-place refresh
	lda	$tp,8($tp)
	bne	AT,.Lzap
	lda	$rp,8($rp)
	lda	$ap,8($ap)
	stq	zero,-8($tp)	# zap tp
	cmpult	$tp,$tj,AT
	stq	$aj,-8($rp)
	bne	AT,.Lcopy
	mov	1,v0

.align	4
.Lexit:
	.set	noreorder
	mov	fp,sp
+21 −23
Original line number Diff line number Diff line
@@ -61,7 +61,7 @@ bn_mul_mont:
	cmp	$num,#2
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labort
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

@@ -160,27 +160,13 @@ bn_mul_mont:
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,sp		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

	cmp	$nhi,#0			@ upmost carry
	bne	.Lsub
	cmp	$nlo,$nj		@ tp[num-1]-np[num-1]
	bhs	.Lsub

.Lcopy:	ldr	$tj,[$tp]
	str	sp,[$tp],#4		@ zap tp
	str	$tj,[$rp],#4
	cmp	$tp,$num
	bne	.Lcopy

.Lexit:	add	sp,$num,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labort:tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
	movs	$tj,$nj,lsr#30		@ boundary condition...
	beq	.Lcopy			@ ... is met

	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
@@ -190,12 +176,24 @@ bn_mul_mont:
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp
	blo	.Lcopy			@ tp was less after all

.Lzap:	str	sp,[$tp],#4
	and	$ap,$tp,$nhi
	bic	$np,$rp,$nhi
	orr	$ap,$ap,$np		@ ap=borrow?tp:rp

.Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
	str	sp,[$tp],#4		@ zap tp
	str	$tj,[$rp],#4
	cmp	$tp,$num
	bne	.Lzap
	bal	.Lexit
	bne	.Lcopy

	add	sp,$num,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
.size	bn_mul_mont,.-bn_mul_mont
.asciz	"Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
___
+34 −39
Original line number Diff line number Diff line
@@ -265,27 +265,50 @@ bn_mul_mont:
	addu	$i,8
	sltu	s7,$i,$num
	bnez	s7,.Louter


	.set	noreorder
	PTR_ADD	$ap,sp,$num
	PTR_ADD	$tj,sp,$num	# &tp[num]
	move	$tp,sp
	move	$ap,sp

	bnez	$hi1,.Lsub
	li	$hi0,0
	sgeu	AT,$lo1,$nj
	beqz	AT,.Lsub
	nop
	dsrl	AT,$nj,62	# boundary condition...
	beqz	AT,.Lcopy	# ... is met
	li	$hi0,0		# clear borrow bit

.align	4
.Lcopy:	ld	AT,($tp)
.Lsub:	ld	$lo0,($tp)
	ld	$lo1,($np)
	PTR_ADD	$tp,8
	PTR_ADD	$np,8
	dsubu	$lo1,$lo0,$lo1	# tp[i]-np[i]
	sgtu	AT,$lo1,$lo0
	dsubu	$lo0,$lo1,$hi0
	sgtu	$hi0,$lo0,$lo1
	sd	$lo0,($rp)
	or	$hi0,AT
	sltu	AT,$tp,$tj
	bnez	AT,.Lsub
	PTR_ADD	$rp,8

	dsubu	$hi0,$hi1,$hi0	# handle upmost overflow bit
	move	$tp,sp
	PTR_SUB	$rp,$num	# restore rp
	not	$hi1,$hi0

	and	$ap,$hi0,sp
	and	$bp,$hi1,$rp
	or	$ap,$ap,$bp	# ap=borrow?tp:rp

.align	4
.Lcopy:	ld	$aj,($ap)
	PTR_ADD	$ap,8
	PTR_ADD	$tp,8
	sd	AT,($rp)
	sltu	AT,$tp,$ap
	sd	zero,-8($tp)
	sltu	AT,$tp,$tj
	sd	$aj,($rp)
	bnez	AT,.Lcopy
	PTR_ADD	$rp,8

.Lexit:
	ld	s0,0($fp)
	ld	s1,8($fp)
	ld	s2,16($fp)
@@ -297,34 +320,6 @@ bn_mul_mont:
	li	v0,1
	jr	ra
	PTR_ADD	sp,$fp,64

.align	4
.Lsub:	ld	$lo0,($tp)
	ld	$lo1,($np)
	dsubu	$lo1,$lo0,$lo1
	sgtu	AT,$lo1,$lo0
	dsubu	$lo0,$lo1,$hi0
	sgtu	$hi0,$lo0,$lo1
	PTR_ADD	$tp,8
	or	$hi0,AT
	PTR_ADD	$np,8
	sd	$lo0,($rp)
	sltu	AT,$tp,$ap
	bnez	AT,.Lsub
	PTR_ADD	$rp,8

	dsubu	$hi0,$hi1,$hi0
	move	$tp,sp
	sgtu	AT,$hi0,$hi1
	bnez	AT,.Lcopy
	PTR_SUB	$rp,$num
.align	4
.Lzap:	sd	zero,($tp)
	sltu	AT,$tp,$ap
	bnez	AT,.Lzap
	PTR_ADD	$tp,8
	b	.Lexit
	nop
	.set	reorder
END(bn_mul_mont)
.rdata
+26 −25
Original line number Diff line number Diff line
@@ -2,8 +2,9 @@

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# April 2006
@@ -42,6 +43,7 @@ if ($output =~ /32\-mont\.s/) {
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UCMP=	"cmplw";	# unsigned compare
	$SHRI=	"srwi";		# unsigned shift right by immediate	
	$PUSH=	$ST;
	$POP=	$LD;
} elsif ($output =~ /64\-mont\.s/) {
@@ -62,6 +64,7 @@ if ($output =~ /32\-mont\.s/) {
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UCMP=	"cmpld";	# unsigned compare
	$SHRI=	"srdi";		# unsigned shift right by immediate	
	$PUSH=	$ST;
	$POP=	$LD;
} else { die "nonsense $output"; }
@@ -264,24 +267,37 @@ Linner:
	addi	$i,$i,$BNSZ
	ble-	Louter

	$SHRI.	$nj,$nj,$BITS-2	; check boundary condition
	addi	$num,$num,2	; restore $num
	subfc	$j,$j,$j	; j=0 and "clear" XER[CA]
	addi	$tp,$sp,$FRAME
	addi	$ap,$sp,$FRAME
	mtctr	$num
	beq	Lcopy		; boundary condition is met

.align	4
Lsub:	$LDX	$tj,$tp,$j
	$LDX	$nj,$np,$j
	subfe	$aj,$nj,$tj	; tp[j]-np[j]
	$STX	$aj,$rp,$j
	addi	$j,$j,$BNSZ
	bdnz-	Lsub

	li	$j,0
	mtctr	$num
	subfe	$ovf,$j,$ovf	; handle upmost overflow bit
	and	$ap,$tp,$ovf
	andc	$np,$rp,$ovf
	or	$ap,$ap,$np	; ap=borrow?tp:rp

	subfc.	$ovf,$j,$ovf	; sets XER[CA]
	bne	Lsub
	$UCMP	$hi1,$nj
	bge	Lsub
.align	4
Lcopy:
	$LDX	$tj,$tp,$j
Lcopy:				; copy or in-place refresh
	$LDX	$tj,$ap,$j
	$STX	$tj,$rp,$j
	$STX	$j,$tp,$j	; zap at once
	addi	$j,$j,$BNSZ
	bdnz-	Lcopy

Lexit:
	$POP	r14,`4*$SIZE_T`($sp)
	$POP	r15,`5*$SIZE_T`($sp)
	$POP	r16,`6*$SIZE_T`($sp)
@@ -298,22 +314,7 @@ Lexit:
	li	r3,1
	blr
	.long	0
.align	4
Lsub:	$LDX	$tj,$tp,$j
	$LDX	$nj,$np,$j
	subfe	$tj,$nj,$tj	; tp[j]-np[j]
	$STX	$tj,$rp,$j
	addi	$j,$j,$BNSZ
	bdnz-	Lsub
	li	$j,0
	subfe.	$ovf,$j,$ovf
	mtctr	$num
	bne	Lcopy
.align	4
Lzap:	$STX	$j,$tp,$j
	addi	$j,$j,$BNSZ
	bdnz-	Lzap
	b	Lexit
.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
+25 −25
Original line number Diff line number Diff line
@@ -176,45 +176,45 @@ bn_mul_mont:
___

undef $bi;
$count=$ap; undef $ap;
$count=$bp; undef $bp;

$code.=<<___;
	lg	$rp,16+16($fp)	# reincarnate rp
	la	$ap,8($fp)
	lgr	$j,$num
	ltgr	$AHI,$AHI
	jnz	.Lsub		# upmost overflow bit is not zero
	#slg	$NHI,-8($np)	# tp[num-1]-np[num-1]
	lghi	$count,-8		# buggy assembler
	slg	$NHI,0($count,$np)	# buggy assembler
	jnle	.Lsub		# branch if not borrow 

.Lcopy:	lg	$alo,8($j,$fp)
	stg	$j,8($j,$fp)
	stg	$alo,0($j,$rp)
	aghi	$j,8
	jnz	.Lcopy
.Lexit:
	lmg	%r6,%r15,16+48($fp)
	lghi	%r2,1		# signal "processed"
	br	%r14
	#lg	$nhi,-8($np)		# buggy assembler
	lghi	$count,-8		# buggy assembler
	lg	$nhi,0($count,$np)	# buggy assembler
	srag	$nhi,$nhi,62	# boundary condition...
	jz	.Lcopy		# ... is met

.Lsub:	lcgr	$count,$num
	lcgr	$count,$num
	sra	$count,3	# incidentally clears "borrow"
.Lsubloop:
	lg	$alo,8($j,$fp)
.Lsub:	lg	$alo,0($j,$ap)
	slbg	$alo,0($j,$np)
	stg	$alo,0($j,$rp)
	la	$j,8($j)
	brct	$count,.Lsubloop
	brct	$count,.Lsub
	lghi	$ahi,0
	slbgr	$AHI,$ahi
	slbgr	$AHI,$ahi	# handle upmost carry

	ngr	$ap,$AHI
	lghi	$np,-1
	xgr	$np,$AHI
	ngr	$np,$rp
	ogr	$ap,$np		# ap=borrow?tp:rp
	lgr	$j,$num
	jle	.Lcopy		# branch if borrow

.Lzap:	stg	$j,8($j,$fp)
.Lcopy:	lg	$alo,0($j,$ap)	# copy or in-place refresh
	stg	$j,8($j,$fp)	# zap tp
	stg	$alo,0($j,$rp)
	aghi	$j,8
	jnz	.Lzap
	j	.Lexit
	jnz	.Lcopy

	lmg	%r6,%r15,16+48($fp)
	lghi	%r2,1		# signal "processed"
	br	%r14
.size	bn_mul_mont,.-bn_mul_mont
.string	"Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
Loading