Commit 11208dcf authored by Andy Polyakov
Browse files

ARMv4 assembly pack: implement support for Thumb2.



As some ARM processors, more specifically the Cortex-Mx series, are
Thumb2-only, we need to support Thumb2-only builds even in assembly.

Reviewed-by: Tim Hudson <tjh@openssl.org>
parent e7a68985
Loading
Loading
Loading
Loading
+10 −14
Original line number Diff line number Diff line
@@ -70,16 +70,12 @@ $code=<<___;
#endif

.text
#if __ARM_ARCH__<7
.code	32
#else
.syntax	unified
#if defined(__thumb2__) && !defined(__APPLE__)
.syntax	unified
.thumb
#else
.code	32
#endif
#endif

.type	AES_Te,%object
.align	5
@@ -193,7 +189,7 @@ AES_Te:
.type   AES_encrypt,%function
.align	5
AES_encrypt:
#if __ARM_ARCH__<7
#ifndef	__thumb2__
	sub	r3,pc,#8		@ AES_encrypt
#else
	adr	r3,AES_encrypt
@@ -443,19 +439,19 @@ _armv4_AES_encrypt:
.align	5
AES_set_encrypt_key:
_armv4_AES_set_encrypt_key:
#if __ARM_ARCH__<7
#ifndef	__thumb2__
	sub	r3,pc,#8		@ AES_set_encrypt_key
#else
	adr	r3,AES_set_encrypt_key
#endif
	teq	r0,#0
#if __ARM_ARCH__>=7
#ifdef	__thumb2__
	itt	eq			@ Thumb2 thing, sanity check in ARM
#endif
	moveq	r0,#-1
	beq	.Labrt
	teq	r2,#0
#if __ARM_ARCH__>=7
#ifdef	__thumb2__
	itt	eq			@ Thumb2 thing, sanity check in ARM
#endif
	moveq	r0,#-1
@@ -466,7 +462,7 @@ _armv4_AES_set_encrypt_key:
	teq	r1,#192
	beq	.Lok
	teq	r1,#256
#if __ARM_ARCH__>=7
#ifdef	__thumb2__
	itt	ne			@ Thumb2 thing, sanity check in ARM
#endif
	movne	r0,#-1
@@ -627,7 +623,7 @@ _armv4_AES_set_encrypt_key:
	str	$s2,[$key,#-16]
	subs	$rounds,$rounds,#1
	str	$s3,[$key,#-12]
#if __ARM_ARCH__>=7
#ifdef	__thumb2__
	itt	eq				@ Thumb2 thing, sanity check in ARM
#endif
	subeq	r2,$key,#216
@@ -699,7 +695,7 @@ _armv4_AES_set_encrypt_key:
	str	$s2,[$key,#-24]
	subs	$rounds,$rounds,#1
	str	$s3,[$key,#-20]
#if __ARM_ARCH__>=7
#ifdef	__thumb2__
	itt	eq				@ Thumb2 thing, sanity check in ARM
#endif
	subeq	r2,$key,#256
@@ -969,7 +965,7 @@ AES_Td:
.type   AES_decrypt,%function
.align	5
AES_decrypt:
#if __ARM_ARCH__<7
#ifndef	__thumb2__
	sub	r3,pc,#8		@ AES_decrypt
#else
	adr	r3,AES_decrypt
+24 −0
Original line number Diff line number Diff line
@@ -15,7 +15,12 @@ $code.=<<___;
#include "arm_arch.h"

.text
#if defined(__thumb2__) && !defined(__APPLE__)
.syntax	unified
.thumb
#else
.code	32
#endif

.align	5
.global	OPENSSL_atomic_add
@@ -59,6 +64,9 @@ OPENSSL_atomic_add:
OPENSSL_cleanse:
	eor	ip,ip,ip
	cmp	r1,#7
#ifdef	__thumb2__
	itt	hs
#endif
	subhs	r1,r1,#4
	bhs	.Lot
	cmp	r1,#0
@@ -116,27 +124,43 @@ _armv7_tick:
@ _armv8_aes_probe - issue a single aese.8 q0,q0 and return via bx lr.
@ Takes no arguments and touches no memory; only q0 is written.
@ The instruction is emitted as raw bytes instead of a mnemonic. The
@ Thumb2 branch carries the same two byte pairs as the ARM branch in
@ swapped order, and the byte 0xf3 becomes 0xff.
@ NOTE(review): presumably the caller catches the undefined-instruction
@ trap on cores without the AES extension - confirm at the call site.
.global	_armv8_aes_probe
.type	_armv8_aes_probe,%function
_armv8_aes_probe:
#if defined(__thumb2__) && !defined(__APPLE__)
	.byte	0xb0,0xff,0x00,0x03	@ aese.8	q0,q0
#else
	.byte	0x00,0x03,0xb0,0xf3	@ aese.8	q0,q0
#endif
	bx	lr
.size	_armv8_aes_probe,.-_armv8_aes_probe

@ _armv8_sha1_probe - issue a single sha1c.32 q0,q0,q0 and return via
@ bx lr. Takes no arguments and touches no memory; only q0 is written.
@ The instruction is emitted as raw bytes instead of a mnemonic. The
@ Thumb2 branch carries the same two byte pairs as the ARM branch in
@ swapped order, and the byte 0xf2 becomes 0xef.
@ NOTE(review): presumably the caller catches the undefined-instruction
@ trap on cores without the SHA1 extension - confirm at the call site.
.global	_armv8_sha1_probe
.type	_armv8_sha1_probe,%function
_armv8_sha1_probe:
#if defined(__thumb2__) && !defined(__APPLE__)
	.byte	0x00,0xef,0x40,0x0c	@ sha1c.32	q0,q0,q0
#else
	.byte	0x40,0x0c,0x00,0xf2	@ sha1c.32	q0,q0,q0
#endif
	bx	lr
.size	_armv8_sha1_probe,.-_armv8_sha1_probe

@ _armv8_sha256_probe - issue a single sha256h.32 q0,q0,q0 and return
@ via bx lr. Takes no arguments and touches no memory; only q0 is written.
@ The instruction is emitted as raw bytes instead of a mnemonic. The
@ Thumb2 branch carries the same two byte pairs as the ARM branch in
@ swapped order, and the byte 0xf3 becomes 0xff.
@ NOTE(review): presumably the caller catches the undefined-instruction
@ trap on cores without the SHA256 extension - confirm at the call site.
.global	_armv8_sha256_probe
.type	_armv8_sha256_probe,%function
_armv8_sha256_probe:
#if defined(__thumb2__) && !defined(__APPLE__)
	.byte	0x00,0xff,0x40,0x0c	@ sha256h.32	q0,q0,q0
#else
	.byte	0x40,0x0c,0x00,0xf3	@ sha256h.32	q0,q0,q0
#endif
	bx	lr
.size	_armv8_sha256_probe,.-_armv8_sha256_probe
@ _armv8_pmull_probe - issue a single vmull.p64 q0,d0,d0 and return via
@ bx lr. Takes no arguments and touches no memory; only q0 is written.
@ The instruction is emitted as raw bytes instead of a mnemonic. The
@ Thumb2 branch carries the same two byte pairs as the ARM branch in
@ swapped order, and the byte 0xf2 becomes 0xef.
@ NOTE(review): presumably the caller catches the undefined-instruction
@ trap on cores without the PMULL extension - confirm at the call site.
.global	_armv8_pmull_probe
.type	_armv8_pmull_probe,%function
_armv8_pmull_probe:
#if defined(__thumb2__) && !defined(__APPLE__)
	.byte	0xa0,0xef,0x00,0x0e	@ vmull.p64	q0,d0,d0
#else
	.byte	0x00,0x0e,0xa0,0xf2	@ vmull.p64	q0,d0,d0
#endif
	bx	lr
.size	_armv8_pmull_probe,.-_armv8_pmull_probe
#endif
+30 −6
Original line number Diff line number Diff line
@@ -51,7 +51,12 @@ $code=<<___;
#include "arm_arch.h"

.text
#if defined(__thumb2__) && !defined(__APPLE__)
.syntax	unified
.thumb
#else
.code	32
#endif
___
################
# private interface to mul_1x1_ialu
@@ -132,11 +137,17 @@ mul_1x1_ialu:
	eor	$hi,$hi,$t0,lsr#8
	ldr	$t0,[sp,$i0]		@ tab[b >> 30      ]

#ifdef	__thumb2__
	itt	ne
#endif
	eorne	$lo,$lo,$b,lsl#30
	eorne	$hi,$hi,$b,lsr#2
	tst	$a,#1<<31
	eor	$lo,$lo,$t1,lsl#27
	eor	$hi,$hi,$t1,lsr#5
#ifdef	__thumb2__
	itt	ne
#endif
	eorne	$lo,$lo,$b,lsl#31
	eorne	$hi,$hi,$b,lsr#1
	eor	$lo,$lo,$t0,lsl#30
@@ -156,20 +167,33 @@ $code.=<<___;
.align	5
bn_GF2m_mul_2x2:
#if __ARM_MAX_ARCH__>=7
	stmdb	sp!,{r10,lr}
	ldr	r12,.LOPENSSL_armcap
.Lpic:	ldr	r12,[pc,r12]
	tst	r12,#1
	adr	r10,.LOPENSSL_armcap
	ldr	r12,[r12,r10]
#ifdef	__APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#ARMV7_NEON
	itt	ne
	ldrne	r10,[sp],#8
	bne	.LNEON
	stmdb	sp!,{r4-r9}
#else
	stmdb	sp!,{r4-r10,lr}
#endif
___
$ret="r10";	# reassigned 1st argument
$code.=<<___;
	stmdb	sp!,{r4-r10,lr}
	mov	$ret,r0			@ reassign 1st argument
	mov	$b,r3			@ $b=b1
	sub	r7,sp,#36
	mov	r8,sp
	and	r7,r7,#-32
	ldr	r3,[sp,#32]		@ load b0
	mov	$mask,#7<<2
	sub	sp,sp,#32		@ allocate tab[8]
	mov	sp,r7			@ allocate tab[8]
	str	r8,[r7,#32]

	bl	mul_1x1_ialu		@ a1·b1
	str	$lo,[$ret,#8]
@@ -193,6 +217,7 @@ ___
$code.=<<___;
	ldmia	$ret,{@r[0]-@r[3]}
	eor	$lo,$lo,$hi
	ldr	sp,[sp,#32]		@ destroy tab[8]
	eor	$hi,$hi,@r[1]
	eor	$lo,$lo,@r[0]
	eor	$hi,$hi,@r[2]
@@ -200,7 +225,6 @@ $code.=<<___;
	eor	$hi,$hi,@r[3]
	str	$hi,[$ret,#8]
	eor	$lo,$lo,$hi
	add	sp,sp,#32		@ destroy tab[8]
	str	$lo,[$ret,#4]

#if __ARM_ARCH__>=5
@@ -279,7 +303,7 @@ $code.=<<___;
#if __ARM_MAX_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-(.Lpic+8)
.word	OPENSSL_armcap_P-.
#endif
.asciz	"GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	5
+29 −6
Original line number Diff line number Diff line
@@ -82,7 +82,12 @@ $code=<<___;
#include "arm_arch.h"

.text
#if defined(__thumb2__) && !defined(__APPLE__)
.syntax	unified
.thumb
#else
.code	32
#endif

#if __ARM_MAX_ARCH__>=7
.align	5
@@ -101,7 +106,7 @@ bn_mul_mont:
#if __ARM_MAX_ARCH__>=7
	tst	ip,#7
	bne	.Lialu
	adr	r0,bn_mul_mont
	adr	r0,.Lbn_mul_mont
	ldr	r2,.LOPENSSL_armcap
	ldr	r0,[r0,r2]
#ifdef	__APPLE__
@@ -117,6 +122,9 @@ bn_mul_mont:
#endif
	cmp	ip,#2
	mov	$num,ip			@ load num
#ifdef	__thumb2__
	ittt	lt
#endif
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt
@@ -164,10 +172,11 @@ bn_mul_mont:
	ldr	$n0,[$_n0]		@ restore n0
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	mov	$tj,sp
	str	$nhi,[$num,#4]		@ tp[num]=

.Louter:
	sub	$tj,$num,sp		@ "original" $num-1 value
	sub	$tj,$num,$tj		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
@@ -212,11 +221,16 @@ bn_mul_mont:
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
#ifdef	__thumb2__
	itt	ne
#endif
	movne	$tj,sp
	bne	.Louter

	ldr	$rp,[$_rp]		@ pull rp
	mov	$aj,sp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,sp		@ "original" num value
	sub	$aj,$num,$aj		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]
@@ -242,7 +256,8 @@ bn_mul_mont:
	cmp	$tp,$num
	bne	.Lcopy

	add	sp,$num,#4		@ skip over tp[num+1]
	mov	sp,$num
	add	sp,sp,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
@@ -283,6 +298,7 @@ bn_mul8x_mont_neon:
	stmdb	sp!,{r4-r11}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load rest of parameter block
	mov	ip,sp

	sub		$toutptr,sp,#16
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
@@ -638,8 +654,9 @@ bn_mul8x_mont_neon:
	bne	.LNEON_sub

	ldr	r10, [$aptr]				@ load top-most bit
	mov	r11,sp
	veor	q0,q0,q0
	sub	r11,$bptr,sp				@ this is num*4
	sub	r11,$bptr,r11				@ this is num*4
	veor	q1,q1,q1
	mov	$aptr,sp
	sub	$rptr,$rptr,r11				@ rewind $rptr
@@ -649,27 +666,33 @@ bn_mul8x_mont_neon:
.LNEON_copy_n_zap:
	ldmia	$aptr!, {r4-r7}
	ldmia	$rptr,  {r8-r11}
	it	cc
	movcc	r8, r4
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	itt	cc
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	it	cc
	movcc	r11,r7
	ldmia	$aptr, {r4-r7}
	stmia	$rptr!, {r8-r11}
	sub	$aptr,$aptr,#16
	ldmia	$rptr, {r8-r11}
	it	cc
	movcc	r8, r4
	vst1.64	{q0-q1}, [$aptr,:256]!			@ wipe
	itt	cc
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	it	cc
	movcc	r11,r7
	teq	$aptr,$bptr				@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_copy_n_zap

	sub	sp,ip,#96
	mov	sp,ip
        vldmia  sp!,{d8-d15}
        ldmia   sp!,{r4-r11}
	ret						@ bx lr
+38 −0
Original line number Diff line number Diff line
@@ -45,7 +45,12 @@ $code.=<<___;
#include "arm_arch.h"

.text
#if defined(__thumb2__) && !defined(__APPLE__)
.syntax	unified
.thumb
#else
.code	32
#endif
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
@@ -162,6 +167,9 @@ __ecp_nistz256_mul_by_2:
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
#ifdef	__thumb2__
	it	cs
#endif
	movcs	$ff,#-1			@ $ff = carry ? -1 : 0

	b	.Lreduce_by_sub
@@ -213,6 +221,9 @@ __ecp_nistz256_add:
	adcs	$a6,$a6,$t2
	mov	$ff,#0
	adcs	$a7,$a7,$t3
#ifdef	__thumb2__
	it	cs
#endif
	movcs	$ff,#-1			@ $ff = carry ? -1 : 0, "broadcast" carry
	ldr	lr,[sp],#4		@ pop lr

@@ -286,6 +297,9 @@ __ecp_nistz256_mul_by_3:
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
#ifdef	__thumb2__
	it	cs
#endif
	movcs	$ff,#-1			@ $ff = carry ? -1 : 0, "broadcast" carry

	subs	$a0,$a0,$ff		@ subtract synthesized modulus, see
@@ -318,6 +332,9 @@ __ecp_nistz256_mul_by_3:
	adcs	$a6,$a6,$t2
	mov	$ff,#0
	adcs	$a7,$a7,$t3
#ifdef	__thumb2__
	it	cs
#endif
	movcs	$ff,#-1			@ $ff = carry ? -1 : 0, "broadcast" carry
	ldr	lr,[sp],#4		@ pop lr

@@ -781,6 +798,9 @@ ecp_nistz256_gather_w5:

	cmp	$index,#0
	mov	$mask,#0
#ifdef	__thumb2__
	itt	ne
#endif
	subne	$index,$index,#1
	movne	$mask,#-1
	add	$inp,$inp,$index,lsl#2
@@ -887,6 +907,9 @@ ecp_nistz256_gather_w7:

	cmp	$index,#0
	mov	$mask,#0
#ifdef	__thumb2__
	itt	ne
#endif
	subne	$index,$index,#1
	movne	$mask,#-1
	add	$inp,$inp,$index
@@ -1180,6 +1203,9 @@ __ecp_nistz256_add_self:
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
#ifdef	__thumb2__
	it	cs
#endif
	movcs	$ff,#-1			@ $ff = carry ? -1 : 0

	subs	$a0,$a0,$ff		@ subtract synthesized modulus
@@ -1369,6 +1395,9 @@ ecp_nistz256_point_add:
	stmia	r3!,{r4-r11}
	ldmia	$b_ptr,{r4-r11}
	cmp	r12,#0
#ifdef	__thumb2__
	it	ne
#endif
	movne	r12,#-1
	stmia	r3,{r4-r11}
	str	r12,[sp,#32*18+8]	@ !in2infty
@@ -1395,6 +1424,9 @@ ecp_nistz256_point_add:
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr,{r4-r11}
	cmp	r12,#0
#ifdef	__thumb2__
	it	ne
#endif
	movne	r12,#-1
	stmia	r3,{r4-r11}
	str	r12,[sp,#32*18+4]	@ !in1infty
@@ -1636,6 +1668,9 @@ ecp_nistz256_point_add_affine:
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr,{r4-r11}
	cmp	r12,#0
#ifdef	__thumb2__
	it	ne
#endif
	movne	r12,#-1
	stmia	r3,{r4-r11}
	str	r12,[sp,#32*15+4]	@ !in1infty
@@ -1661,6 +1696,9 @@ ecp_nistz256_point_add_affine:
	orr	r12,r12,r11
	stmia	r3!,{r4-r11}
	cmp	r12,#0
#ifdef	__thumb2__
	it	ne
#endif
	movne	r12,#-1
	str	r12,[sp,#32*15+8]	@ !in2infty

Loading