Commit 88cb5972 authored by Andy Polyakov's avatar Andy Polyakov
Browse files

ARM assembler pack update from HEAD.

parent 781bfdc3
Loading
Loading
Loading
Loading
+145 −42
Original line number Diff line number Diff line
@@ -20,13 +20,18 @@

# May 2007.
#
# private_AES_set_[en|de]crypt_key is added.
# AES_set_[en|de]crypt_key is added.

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 12% improvement on
# Cortex A8 core and ~25 cycles per byte processed with 128-bit key.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~21.5 cycles per byte.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

@@ -46,6 +51,7 @@ $key="r11";
$rounds="r12";

$code=<<___;
#include "arm_arch.h"
.text
.code	32

@@ -166,7 +172,7 @@ AES_encrypt:
	mov	$rounds,r0		@ inp
	mov	$key,r2
	sub	$tbl,r3,#AES_encrypt-AES_Te	@ Te

#if __ARM_ARCH__<7
	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
	ldrb	$t1,[$rounds,#2]	@ manner...
	ldrb	$t2,[$rounds,#1]
@@ -195,10 +201,33 @@ AES_encrypt:
	orr	$s3,$s3,$t1,lsl#8
	orr	$s3,$s3,$t2,lsl#16
	orr	$s3,$s3,$t3,lsl#24

#else
	ldr	$s0,[$rounds,#0]
	ldr	$s1,[$rounds,#4]
	ldr	$s2,[$rounds,#8]
	ldr	$s3,[$rounds,#12]
#ifdef __ARMEL__
	rev	$s0,$s0
	rev	$s1,$s1
	rev	$s2,$s2
	rev	$s3,$s3
#endif
#endif
	bl	_armv4_AES_encrypt

	ldr	$rounds,[sp],#4		@ pop out
#if __ARM_ARCH__>=7
#ifdef __ARMEL__
	rev	$s0,$s0
	rev	$s1,$s1
	rev	$s2,$s2
	rev	$s3,$s3
#endif
	str	$s0,[$rounds,#0]
	str	$s1,[$rounds,#4]
	str	$s2,[$rounds,#8]
	str	$s3,[$rounds,#12]
#else
	mov	$t1,$s0,lsr#24		@ write output in endian-neutral
	mov	$t2,$s0,lsr#16		@ manner...
	mov	$t3,$s0,lsr#8
@@ -227,11 +256,15 @@ AES_encrypt:
	strb	$t2,[$rounds,#13]
	strb	$t3,[$rounds,#14]
	strb	$s3,[$rounds,#15]

#endif
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia   sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	AES_encrypt,.-AES_encrypt

.type   _armv4_AES_encrypt,%function
@@ -271,11 +304,11 @@ _armv4_AES_encrypt:
	and	$i2,lr,$s2,lsr#16	@ i1
	eor	$t3,$t3,$i3,ror#8
	and	$i3,lr,$s2
	eor	$s1,$s1,$t1,ror#24
	ldr	$i1,[$tbl,$i1,lsl#2]	@ Te2[s2>>8]
	eor	$s1,$s1,$t1,ror#24
	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te1[s2>>16]
	mov	$s2,$s2,lsr#24

	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te1[s2>>16]
	ldr	$i3,[$tbl,$i3,lsl#2]	@ Te3[s2>>0]
	eor	$s0,$s0,$i1,ror#16
	ldr	$s2,[$tbl,$s2,lsl#2]	@ Te0[s2>>24]
@@ -284,16 +317,16 @@ _armv4_AES_encrypt:
	and	$i2,lr,$s3,lsr#8	@ i1
	eor	$t3,$t3,$i3,ror#16
	and	$i3,lr,$s3,lsr#16	@ i2
	eor	$s2,$s2,$t2,ror#16
	ldr	$i1,[$tbl,$i1,lsl#2]	@ Te3[s3>>0]
	eor	$s2,$s2,$t2,ror#16
	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te2[s3>>8]
	mov	$s3,$s3,lsr#24

	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te2[s3>>8]
	ldr	$i3,[$tbl,$i3,lsl#2]	@ Te1[s3>>16]
	eor	$s0,$s0,$i1,ror#24
	ldr	$s3,[$tbl,$s3,lsl#2]	@ Te0[s3>>24]
	eor	$s1,$s1,$i2,ror#16
	ldr	$i1,[$key],#16
	eor	$s1,$s1,$i2,ror#16
	ldr	$s3,[$tbl,$s3,lsl#2]	@ Te0[s3>>24]
	eor	$s2,$s2,$i3,ror#8
	ldr	$t1,[$key,#-12]
	eor	$s3,$s3,$t3,ror#8
@@ -333,11 +366,11 @@ _armv4_AES_encrypt:
	and	$i2,lr,$s2,lsr#16	@ i1
	eor	$t3,$i3,$t3,lsl#8
	and	$i3,lr,$s2
	eor	$s1,$t1,$s1,lsl#24
	ldrb	$i1,[$tbl,$i1,lsl#2]	@ Te4[s2>>8]
	eor	$s1,$t1,$s1,lsl#24
	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s2>>16]
	mov	$s2,$s2,lsr#24

	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s2>>16]
	ldrb	$i3,[$tbl,$i3,lsl#2]	@ Te4[s2>>0]
	eor	$s0,$i1,$s0,lsl#8
	ldrb	$s2,[$tbl,$s2,lsl#2]	@ Te4[s2>>24]
@@ -346,15 +379,15 @@ _armv4_AES_encrypt:
	and	$i2,lr,$s3,lsr#8	@ i1
	eor	$t3,$i3,$t3,lsl#8
	and	$i3,lr,$s3,lsr#16	@ i2
	eor	$s2,$t2,$s2,lsl#24
	ldrb	$i1,[$tbl,$i1,lsl#2]	@ Te4[s3>>0]
	eor	$s2,$t2,$s2,lsl#24
	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s3>>8]
	mov	$s3,$s3,lsr#24

	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s3>>8]
	ldrb	$i3,[$tbl,$i3,lsl#2]	@ Te4[s3>>16]
	eor	$s0,$i1,$s0,lsl#8
	ldrb	$s3,[$tbl,$s3,lsl#2]	@ Te4[s3>>24]
	ldr	$i1,[$key,#0]
	ldrb	$s3,[$tbl,$s3,lsl#2]	@ Te4[s3>>24]
	eor	$s1,$s1,$i2,lsl#8
	ldr	$t1,[$key,#4]
	eor	$s2,$s2,$i3,lsl#16
@@ -371,11 +404,11 @@ _armv4_AES_encrypt:
	ldr	pc,[sp],#4		@ pop and return
.size	_armv4_AES_encrypt,.-_armv4_AES_encrypt

.global private_AES_set_encrypt_key
.type   private_AES_set_encrypt_key,%function
.global AES_set_encrypt_key
.type   AES_set_encrypt_key,%function
.align	5
private_AES_set_encrypt_key:
	sub	r3,pc,#8		@ private_AES_set_encrypt_key
AES_set_encrypt_key:
	sub	r3,pc,#8		@ AES_set_encrypt_key
	teq	r0,#0
	moveq	r0,#-1
	beq	.Labrt
@@ -392,12 +425,13 @@ private_AES_set_encrypt_key:
	bne	.Labrt

.Lok:	stmdb   sp!,{r4-r12,lr}
	sub	$tbl,r3,#private_AES_set_encrypt_key-AES_Te-1024	@ Te4
	sub	$tbl,r3,#AES_set_encrypt_key-AES_Te-1024	@ Te4

	mov	$rounds,r0		@ inp
	mov	lr,r1			@ bits
	mov	$key,r2			@ key

#if __ARM_ARCH__<7
	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
	ldrb	$t1,[$rounds,#2]	@ manner...
	ldrb	$t2,[$rounds,#1]
@@ -430,6 +464,22 @@ private_AES_set_encrypt_key:
	orr	$s3,$s3,$t3,lsl#24
	str	$s2,[$key,#-8]
	str	$s3,[$key,#-4]
#else
	ldr	$s0,[$rounds,#0]
	ldr	$s1,[$rounds,#4]
	ldr	$s2,[$rounds,#8]
	ldr	$s3,[$rounds,#12]
#ifdef __ARMEL__
	rev	$s0,$s0
	rev	$s1,$s1
	rev	$s2,$s2
	rev	$s3,$s3
#endif
	str	$s0,[$key],#16
	str	$s1,[$key,#-12]
	str	$s2,[$key,#-8]
	str	$s3,[$key,#-4]
#endif

	teq	lr,#128
	bne	.Lnot128
@@ -466,6 +516,7 @@ private_AES_set_encrypt_key:
	b	.Ldone

.Lnot128:
#if __ARM_ARCH__<7
	ldrb	$i2,[$rounds,#19]
	ldrb	$t1,[$rounds,#18]
	ldrb	$t2,[$rounds,#17]
@@ -482,6 +533,16 @@ private_AES_set_encrypt_key:
	str	$i2,[$key],#8
	orr	$i3,$i3,$t3,lsl#24
	str	$i3,[$key,#-4]
#else
	ldr	$i2,[$rounds,#16]
	ldr	$i3,[$rounds,#20]
#ifdef __ARMEL__
	rev	$i2,$i2
	rev	$i3,$i3
#endif
	str	$i2,[$key],#8
	str	$i3,[$key,#-4]
#endif

	teq	lr,#192
	bne	.Lnot192
@@ -526,6 +587,7 @@ private_AES_set_encrypt_key:
	b	.L192_loop

.Lnot192:
#if __ARM_ARCH__<7
	ldrb	$i2,[$rounds,#27]
	ldrb	$t1,[$rounds,#26]
	ldrb	$t2,[$rounds,#25]
@@ -542,6 +604,16 @@ private_AES_set_encrypt_key:
	str	$i2,[$key],#8
	orr	$i3,$i3,$t3,lsl#24
	str	$i3,[$key,#-4]
#else
	ldr	$i2,[$rounds,#24]
	ldr	$i3,[$rounds,#28]
#ifdef __ARMEL__
	rev	$i2,$i2
	rev	$i3,$i3
#endif
	str	$i2,[$key],#8
	str	$i3,[$key,#-4]
#endif

	mov	$rounds,#14
	str	$rounds,[$key,#240-32]
@@ -606,21 +678,21 @@ private_AES_set_encrypt_key:
.Labrt:	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
.size	private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
.size	AES_set_encrypt_key,.-AES_set_encrypt_key

.global private_AES_set_decrypt_key
.type   private_AES_set_decrypt_key,%function
.global AES_set_decrypt_key
.type   AES_set_decrypt_key,%function
.align	5
private_AES_set_decrypt_key:
AES_set_decrypt_key:
	str	lr,[sp,#-4]!            @ push lr
	bl	private_AES_set_encrypt_key
	bl	AES_set_encrypt_key
	teq	r0,#0
	ldrne	lr,[sp],#4              @ pop lr
	bne	.Labrt

	stmdb   sp!,{r4-r12}

	ldr	$rounds,[r2,#240]	@ private_AES_set_encrypt_key preserves r2,
	ldr	$rounds,[r2,#240]	@ AES_set_encrypt_key preserves r2,
	mov	$key,r2			@ which is AES_KEY *key
	mov	$i1,r2
	add	$i2,r2,$rounds,lsl#4
@@ -692,11 +764,15 @@ $code.=<<___;
	bne	.Lmix

	mov	r0,#0
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia   sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
.size	private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
#endif
.size	AES_set_decrypt_key,.-AES_set_decrypt_key

.type	AES_Td,%object
.align	5
@@ -811,7 +887,7 @@ AES_decrypt:
	mov	$rounds,r0		@ inp
	mov	$key,r2
	sub	$tbl,r3,#AES_decrypt-AES_Td		@ Td

#if __ARM_ARCH__<7
	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
	ldrb	$t1,[$rounds,#2]	@ manner...
	ldrb	$t2,[$rounds,#1]
@@ -840,10 +916,33 @@ AES_decrypt:
	orr	$s3,$s3,$t1,lsl#8
	orr	$s3,$s3,$t2,lsl#16
	orr	$s3,$s3,$t3,lsl#24

#else
	ldr	$s0,[$rounds,#0]
	ldr	$s1,[$rounds,#4]
	ldr	$s2,[$rounds,#8]
	ldr	$s3,[$rounds,#12]
#ifdef __ARMEL__
	rev	$s0,$s0
	rev	$s1,$s1
	rev	$s2,$s2
	rev	$s3,$s3
#endif
#endif
	bl	_armv4_AES_decrypt

	ldr	$rounds,[sp],#4		@ pop out
#if __ARM_ARCH__>=7
#ifdef __ARMEL__
	rev	$s0,$s0
	rev	$s1,$s1
	rev	$s2,$s2
	rev	$s3,$s3
#endif
	str	$s0,[$rounds,#0]
	str	$s1,[$rounds,#4]
	str	$s2,[$rounds,#8]
	str	$s3,[$rounds,#12]
#else
	mov	$t1,$s0,lsr#24		@ write output in endian-neutral
	mov	$t2,$s0,lsr#16		@ manner...
	mov	$t3,$s0,lsr#8
@@ -872,11 +971,15 @@ AES_decrypt:
	strb	$t2,[$rounds,#13]
	strb	$t3,[$rounds,#14]
	strb	$s3,[$rounds,#15]

#endif
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia   sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	AES_decrypt,.-AES_decrypt

.type   _armv4_AES_decrypt,%function
@@ -916,11 +1019,11 @@ _armv4_AES_decrypt:
	and	$i2,lr,$s2		@ i1
	eor	$t3,$i3,$t3,ror#8
	and	$i3,lr,$s2,lsr#16
	eor	$s1,$s1,$t1,ror#8
	ldr	$i1,[$tbl,$i1,lsl#2]	@ Td2[s2>>8]
	eor	$s1,$s1,$t1,ror#8
	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td3[s2>>0]
	mov	$s2,$s2,lsr#24

	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td3[s2>>0]
	ldr	$i3,[$tbl,$i3,lsl#2]	@ Td1[s2>>16]
	eor	$s0,$s0,$i1,ror#16
	ldr	$s2,[$tbl,$s2,lsl#2]	@ Td0[s2>>24]
@@ -929,22 +1032,22 @@ _armv4_AES_decrypt:
	and	$i2,lr,$s3,lsr#8	@ i1
	eor	$t3,$i3,$t3,ror#8
	and	$i3,lr,$s3		@ i2
	eor	$s2,$s2,$t2,ror#8
	ldr	$i1,[$tbl,$i1,lsl#2]	@ Td1[s3>>16]
	eor	$s2,$s2,$t2,ror#8
	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td2[s3>>8]
	mov	$s3,$s3,lsr#24

	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td2[s3>>8]
	ldr	$i3,[$tbl,$i3,lsl#2]	@ Td3[s3>>0]
	eor	$s0,$s0,$i1,ror#8
	ldr	$s3,[$tbl,$s3,lsl#2]	@ Td0[s3>>24]
	ldr	$i1,[$key],#16
	eor	$s1,$s1,$i2,ror#16
	ldr	$s3,[$tbl,$s3,lsl#2]	@ Td0[s3>>24]
	eor	$s2,$s2,$i3,ror#24
	ldr	$i1,[$key],#16
	eor	$s3,$s3,$t3,ror#8

	ldr	$t1,[$key,#-12]
	ldr	$t2,[$key,#-8]
	eor	$s0,$s0,$i1
	ldr	$t2,[$key,#-8]
	eor	$s3,$s3,$t3,ror#8
	ldr	$t3,[$key,#-4]
	and	$i1,lr,$s0,lsr#16
	eor	$s1,$s1,$t1
@@ -985,11 +1088,11 @@ _armv4_AES_decrypt:
	and	$i1,lr,$s2,lsr#8	@ i0
	eor	$t2,$t2,$i2,lsl#8
	and	$i2,lr,$s2		@ i1
	eor	$t3,$t3,$i3,lsl#8
	ldrb	$i1,[$tbl,$i1]		@ Td4[s2>>8]
	eor	$t3,$t3,$i3,lsl#8
	ldrb	$i2,[$tbl,$i2]		@ Td4[s2>>0]
	and	$i3,lr,$s2,lsr#16

	ldrb	$i2,[$tbl,$i2]		@ Td4[s2>>0]
	ldrb	$s2,[$tbl,$s2,lsr#24]	@ Td4[s2>>24]
	eor	$s0,$s0,$i1,lsl#8
	ldrb	$i3,[$tbl,$i3]		@ Td4[s2>>16]
@@ -997,11 +1100,11 @@ _armv4_AES_decrypt:
	and	$i1,lr,$s3,lsr#16	@ i0
	eor	$s2,$t2,$s2,lsl#16
	and	$i2,lr,$s3,lsr#8	@ i1
	eor	$t3,$t3,$i3,lsl#16
	ldrb	$i1,[$tbl,$i1]		@ Td4[s3>>16]
	eor	$t3,$t3,$i3,lsl#16
	ldrb	$i2,[$tbl,$i2]		@ Td4[s3>>8]
	and	$i3,lr,$s3		@ i2

	ldrb	$i2,[$tbl,$i2]		@ Td4[s3>>8]
	ldrb	$i3,[$tbl,$i3]		@ Td4[s3>>0]
	ldrb	$s3,[$tbl,$s3,lsr#24]	@ Td4[s3>>24]
	eor	$s0,$s0,$i1,lsl#16

crypto/arm_arch.h

0 → 100644
+51 −0
Original line number Diff line number Diff line
#ifndef __ARM_ARCH_H__
#define __ARM_ARCH_H__

#if !defined(__ARM_ARCH__)
# if defined(__CC_ARM)
#  define __ARM_ARCH__ __TARGET_ARCH_ARM
#  if defined(__BIG_ENDIAN)
#   define __ARMEB__
#  else
#   define __ARMEL__
#  endif
# elif defined(__GNUC__)
  /*
   * Why doesn't gcc define __ARM_ARCH__? Instead it defines
   * bunch of below macros. See all_architectires[] table in
   * gcc/config/arm/arm.c. On a side note it defines
   * __ARMEL__/__ARMEB__ for little-/big-endian.
   */
#  if	defined(__ARM_ARCH_7__)	|| defined(__ARM_ARCH_7A__)	|| \
	defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__)	|| \
	defined(__ARM_ARCH_7EM__)
#   define __ARM_ARCH__ 7
#  elif	defined(__ARM_ARCH_6__)	|| defined(__ARM_ARCH_6J__)	|| \
	defined(__ARM_ARCH_6K__)|| defined(__ARM_ARCH_6M__)	|| \
	defined(__ARM_ARCH_6Z__)|| defined(__ARM_ARCH_6ZK__)	|| \
	defined(__ARM_ARCH_6T2__)
#   define __ARM_ARCH__ 6
#  elif	defined(__ARM_ARCH_5__)	|| defined(__ARM_ARCH_5T__)	|| \
	defined(__ARM_ARCH_5E__)|| defined(__ARM_ARCH_5TE__)	|| \
	defined(__ARM_ARCH_5TEJ__)
#   define __ARM_ARCH__ 5
#  elif	defined(__ARM_ARCH_4__)	|| defined(__ARM_ARCH_4T__)
#   define __ARM_ARCH__ 4
#  else
#   error "unsupported ARM architecture"
#  endif
# endif
#endif

#ifdef OPENSSL_FIPSCANISTER
#include <openssl/fipssyms.h>
#endif

#if !__ASSEMBLER__
extern unsigned int OPENSSL_armcap_P;
                                     
#define ARMV7_NEON      (1<<0)
#define ARMV7_TICK      (1<<1)
#endif

#endif

crypto/armcap.c

0 → 100644
+80 −0
Original line number Diff line number Diff line
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <setjmp.h>
#include <signal.h>
#include <crypto.h>

#include "arm_arch.h"

unsigned int OPENSSL_armcap_P;

static sigset_t all_masked;

static sigjmp_buf ill_jmp;
static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); }

/*
 * Following subroutines could have been inlined, but it's not all
 * ARM compilers support inline assembler...
 */
void _armv7_neon_probe(void);
unsigned int _armv7_tick(void);

unsigned int OPENSSL_rdtsc(void)
	{
	if (OPENSSL_armcap_P|ARMV7_TICK)
		return _armv7_tick();
	else
		return 0;
	}

#if defined(__GNUC__) && __GNUC__>=2
void OPENSSL_cpuid_setup(void) __attribute__((constructor));
#endif
void OPENSSL_cpuid_setup(void)
	{
	char *e;
	struct sigaction	ill_oact,ill_act;
	sigset_t		oset;
	static int trigger=0;

	if (trigger) return;
	trigger=1;
 
	if ((e=getenv("OPENSSL_armcap")))
		{
		OPENSSL_armcap_P=strtoul(e,NULL,0);
		return;
		}

	sigfillset(&all_masked);
	sigdelset(&all_masked,SIGILL);
	sigdelset(&all_masked,SIGTRAP);
	sigdelset(&all_masked,SIGFPE);
	sigdelset(&all_masked,SIGBUS);
	sigdelset(&all_masked,SIGSEGV);

	OPENSSL_armcap_P = 0;

	memset(&ill_act,0,sizeof(ill_act));
	ill_act.sa_handler = ill_handler;
	ill_act.sa_mask    = all_masked;

	sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset);
	sigaction(SIGILL,&ill_act,&ill_oact);

	if (sigsetjmp(ill_jmp,1) == 0)
		{
		_armv7_neon_probe();
		OPENSSL_armcap_P |= ARMV7_NEON;
		}
	if (sigsetjmp(ill_jmp,1) == 0)
		{
		_armv7_tick();
		OPENSSL_armcap_P |= ARMV7_TICK;
		}

	sigaction (SIGILL,&ill_oact,NULL);
	sigprocmask(SIG_SETMASK,&oset,NULL);
	}

crypto/armv4cpuid.S

0 → 100644
+154 −0
Original line number Diff line number Diff line
#include "arm_arch.h"

.text
.code	32

.align	5
.global	_armv7_neon_probe
.type	_armv7_neon_probe,%function
_armv7_neon_probe:
	.word	0xf26ee1fe	@ vorr	q15,q15,q15
	.word	0xe12fff1e	@ bx	lr
.size	_armv7_neon_probe,.-_armv7_neon_probe

.global	_armv7_tick
.type	_armv7_tick,%function
_armv7_tick:
	mrc	p15,0,r0,c9,c13,0
	.word	0xe12fff1e	@ bx	lr
.size	_armv7_tick,.-_armv7_tick

.global	OPENSSL_atomic_add
.type	OPENSSL_atomic_add,%function
OPENSSL_atomic_add:
#if __ARM_ARCH__>=6
.Ladd:	ldrex	r2,[r0]
	add	r3,r2,r1
	strex	r2,r3,[r0]
	cmp	r2,#0
	bne	.Ladd
	mov	r0,r3
	.word	0xe12fff1e	@ bx	lr
#else
	stmdb	sp!,{r4-r6,lr}
	ldr	r2,.Lspinlock
	adr	r3,.Lspinlock
	mov	r4,r0
	mov	r5,r1
	add	r6,r3,r2	@ &spinlock
	b	.+8
.Lspin:	bl	sched_yield
	mov	r0,#-1
	swp	r0,r0,[r6]
	cmp	r0,#0
	bne	.Lspin

	ldr	r2,[r4]
	add	r2,r2,r5
	str	r2,[r4]
	str	r0,[r6]		@ release spinlock
	ldmia	sp!,{r4-r6,lr}
	tst	lr,#1
	moveq	pc,lr
	.word	0xe12fff1e	@ bx	lr
#endif
.size	OPENSSL_atomic_add,.-OPENSSL_atomic_add

.global	OPENSSL_cleanse
.type	OPENSSL_cleanse,%function
OPENSSL_cleanse:
	eor	ip,ip,ip
	cmp	r1,#7
	subhs	r1,r1,#4
	bhs	.Lot
	cmp	r1,#0
	beq	.Lcleanse_done
.Little:
	strb	ip,[r0],#1
	subs	r1,r1,#1
	bhi	.Little
	b	.Lcleanse_done

.Lot:	tst	r0,#3
	beq	.Laligned
	strb	ip,[r0],#1
	sub	r1,r1,#1
	b	.Lot
.Laligned:
	str	ip,[r0],#4
	subs	r1,r1,#4
	bhs	.Laligned
	adds	r1,r1,#4
	bne	.Little
.Lcleanse_done:
	tst	lr,#1
	moveq	pc,lr
	.word	0xe12fff1e	@ bx	lr
.size	OPENSSL_cleanse,.-OPENSSL_cleanse

.global	OPENSSL_wipe_cpu
.type	OPENSSL_wipe_cpu,%function
OPENSSL_wipe_cpu:
	ldr	r0,.LOPENSSL_armcap
	adr	r1,.LOPENSSL_armcap
	ldr	r0,[r1,r0]
	eor	r2,r2,r2
	eor	r3,r3,r3
	eor	ip,ip,ip
	tst	r0,#1
	beq	.Lwipe_done
	.word	0xf3000150	@ veor    q0, q0, q0
	.word	0xf3022152	@ veor    q1, q1, q1
	.word	0xf3044154	@ veor    q2, q2, q2
	.word	0xf3066156	@ veor    q3, q3, q3
	.word	0xf34001f0	@ veor    q8, q8, q8
	.word	0xf34221f2	@ veor    q9, q9, q9
	.word	0xf34441f4	@ veor    q10, q10, q10
	.word	0xf34661f6	@ veor    q11, q11, q11
	.word	0xf34881f8	@ veor    q12, q12, q12
	.word	0xf34aa1fa	@ veor    q13, q13, q13
	.word	0xf34cc1fc	@ veor    q14, q14, q14
	.word	0xf34ee1fe	@ veor    q15, q15, q15
.Lwipe_done:
	mov	r0,sp
	tst	lr,#1
	moveq	pc,lr
	.word	0xe12fff1e	@ bx	lr
.size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu

.global	OPENSSL_instrument_bus
.type	OPENSSL_instrument_bus,%function
OPENSSL_instrument_bus:
	eor	r0,r0,r0
	tst	lr,#1
	moveq	pc,lr
	.word	0xe12fff1e	@ bx	lr
.size	OPENSSL_instrument_bus,.-OPENSSL_instrument_bus

.global	OPENSSL_instrument_bus2
.type	OPENSSL_instrument_bus2,%function
OPENSSL_instrument_bus2:
	eor	r0,r0,r0
	tst	lr,#1
	moveq	pc,lr
	.word	0xe12fff1e	@ bx	lr
.size	OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2

.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.LOPENSSL_armcap
#if __ARM_ARCH__>=6
.align	5
#else
.Lspinlock:
.word	atomic_add_spinlock-.Lspinlock
.align	5

.data
.align	2
atomic_add_spinlock:
.word	0
#endif

.comm	OPENSSL_armcap_P,4,4
.hidden	OPENSSL_armcap_P
+278 −0
Original line number Diff line number Diff line
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
# C for the time being... Except that it has two code paths: pure
# integer code suitable for any ARMv4 and later CPU and NEON code
# suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs
# in ~45 cycles on dual-issue core such as Cortex A8, which is ~50%
# faster than compiler-generated code. For ECDH and ECDSA verify (but
# not for ECDSA sign) it means 25%-45% improvement depending on key
# length, more for longer keys. Even though NEON 1x1 multiplication
# runs in even less cycles, ~30, improvement is measurable only on
# longer keys. One has to optimize code elsewhere to get NEON glow...

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
sub Q()     { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }

$code=<<___;
#include "arm_arch.h"

.text
.code	32

#if __ARM_ARCH__>=7
.fpu	neon

.type	mul_1x1_neon,%function
.align	5
mul_1x1_neon:
	vshl.u64	`&Dlo("q1")`,d16,#8	@ q1-q3 are slided $a
	vmull.p8	`&Q("d0")`,d16,d17	@ abb
	vshl.u64	`&Dlo("q2")`,d16,#16
	vmull.p8	q1,`&Dlo("q1")`,d17	@ a<<8bb
	vshl.u64	`&Dlo("q3")`,d16,#24
	vmull.p8	q2,`&Dlo("q2")`,d17	@ a<<16bb
	vshr.u64	`&Dlo("q1")`,#8
	vmull.p8	q3,`&Dlo("q3")`,d17	@ a<<24bb
	vshl.u64	`&Dhi("q1")`,#24
	veor		d0,`&Dlo("q1")`
	vshr.u64	`&Dlo("q2")`,#16
	veor		d0,`&Dhi("q1")`
	vshl.u64	`&Dhi("q2")`,#16
	veor		d0,`&Dlo("q2")`
	vshr.u64	`&Dlo("q3")`,#24
	veor		d0,`&Dhi("q2")`
	vshl.u64	`&Dhi("q3")`,#8
	veor		d0,`&Dlo("q3")`
	veor		d0,`&Dhi("q3")`
	bx	lr
.size	mul_1x1_neon,.-mul_1x1_neon
#endif
___
################
# private interface to mul_1x1_ialu
#
$a="r1";
$b="r0";

($a0,$a1,$a2,$a12,$a4,$a14)=
($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);

$mask="r12";

$code.=<<___;
.type	mul_1x1_ialu,%function
.align	5
mul_1x1_ialu:
	mov	$a0,#0
	bic	$a1,$a,#3<<30		@ a1=a&0x3fffffff
	str	$a0,[sp,#0]		@ tab[0]=0
	add	$a2,$a1,$a1		@ a2=a1<<1
	str	$a1,[sp,#4]		@ tab[1]=a1
	eor	$a12,$a1,$a2		@ a1^a2
	str	$a2,[sp,#8]		@ tab[2]=a2
	mov	$a4,$a1,lsl#2		@ a4=a1<<2
	str	$a12,[sp,#12]		@ tab[3]=a1^a2
	eor	$a14,$a1,$a4		@ a1^a4
	str	$a4,[sp,#16]		@ tab[4]=a4
	eor	$a0,$a2,$a4		@ a2^a4
	str	$a14,[sp,#20]		@ tab[5]=a1^a4
	eor	$a12,$a12,$a4		@ a1^a2^a4
	str	$a0,[sp,#24]		@ tab[6]=a2^a4
	and	$i0,$mask,$b,lsl#2
	str	$a12,[sp,#28]		@ tab[7]=a1^a2^a4

	and	$i1,$mask,$b,lsr#1
	ldr	$lo,[sp,$i0]		@ tab[b       & 0x7]
	and	$i0,$mask,$b,lsr#4
	ldr	$t1,[sp,$i1]		@ tab[b >>  3 & 0x7]
	and	$i1,$mask,$b,lsr#7
	ldr	$t0,[sp,$i0]		@ tab[b >>  6 & 0x7]
	eor	$lo,$lo,$t1,lsl#3	@ stall
	mov	$hi,$t1,lsr#29
	ldr	$t1,[sp,$i1]		@ tab[b >>  9 & 0x7]

	and	$i0,$mask,$b,lsr#10
	eor	$lo,$lo,$t0,lsl#6
	eor	$hi,$hi,$t0,lsr#26
	ldr	$t0,[sp,$i0]		@ tab[b >> 12 & 0x7]

	and	$i1,$mask,$b,lsr#13
	eor	$lo,$lo,$t1,lsl#9
	eor	$hi,$hi,$t1,lsr#23
	ldr	$t1,[sp,$i1]		@ tab[b >> 15 & 0x7]

	and	$i0,$mask,$b,lsr#16
	eor	$lo,$lo,$t0,lsl#12
	eor	$hi,$hi,$t0,lsr#20
	ldr	$t0,[sp,$i0]		@ tab[b >> 18 & 0x7]

	and	$i1,$mask,$b,lsr#19
	eor	$lo,$lo,$t1,lsl#15
	eor	$hi,$hi,$t1,lsr#17
	ldr	$t1,[sp,$i1]		@ tab[b >> 21 & 0x7]

	and	$i0,$mask,$b,lsr#22
	eor	$lo,$lo,$t0,lsl#18
	eor	$hi,$hi,$t0,lsr#14
	ldr	$t0,[sp,$i0]		@ tab[b >> 24 & 0x7]

	and	$i1,$mask,$b,lsr#25
	eor	$lo,$lo,$t1,lsl#21
	eor	$hi,$hi,$t1,lsr#11
	ldr	$t1,[sp,$i1]		@ tab[b >> 27 & 0x7]

	tst	$a,#1<<30
	and	$i0,$mask,$b,lsr#28
	eor	$lo,$lo,$t0,lsl#24
	eor	$hi,$hi,$t0,lsr#8
	ldr	$t0,[sp,$i0]		@ tab[b >> 30      ]

	eorne	$lo,$lo,$b,lsl#30
	eorne	$hi,$hi,$b,lsr#2
	tst	$a,#1<<31
	eor	$lo,$lo,$t1,lsl#27
	eor	$hi,$hi,$t1,lsr#5
	eorne	$lo,$lo,$b,lsl#31
	eorne	$hi,$hi,$b,lsr#1
	eor	$lo,$lo,$t0,lsl#30
	eor	$hi,$hi,$t0,lsr#2

	mov	pc,lr
.size	mul_1x1_ialu,.-mul_1x1_ialu
___
################
# void	bn_GF2m_mul_2x2(BN_ULONG *r,
#	BN_ULONG a1,BN_ULONG a0,
#	BN_ULONG b1,BN_ULONG b0);	# r[3..0]=a1a0b1b0

($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23));

$code.=<<___;
.global	bn_GF2m_mul_2x2
.type	bn_GF2m_mul_2x2,%function
.align	5
bn_GF2m_mul_2x2:
#if __ARM_ARCH__>=7
	ldr	r12,.LOPENSSL_armcap
.Lpic:	ldr	r12,[pc,r12]
	tst	r12,#1
	beq	.Lialu

	veor	$A1,$A1
	vmov.32	$B1,r3,r3		@ two copies of b1
	vmov.32	${A1}[0],r1		@ a1

	veor	$A0,$A0
	vld1.32	${B0}[],[sp,:32]	@ two copies of b0
	vmov.32	${A0}[0],r2		@ a0
	mov	r12,lr

	vmov	d16,$A1
	vmov	d17,$B1
	bl	mul_1x1_neon		@ a1b1
	vmov	$A1B1,d0

	vmov	d16,$A0
	vmov	d17,$B0
	bl	mul_1x1_neon		@ a0b0
	vmov	$A0B0,d0

	veor	d16,$A0,$A1
	veor	d17,$B0,$B1
	veor	$A0,$A0B0,$A1B1
	bl	mul_1x1_neon		@ (a0+a1)(b0+b1)

	veor	d0,$A0			@ (a0+a1)(b0+b1)-a0b0-a1b1
	vshl.u64 d1,d0,#32
	vshr.u64 d0,d0,#32
	veor	$A0B0,d1
	veor	$A1B1,d0
	vst1.32	{${A0B0}[0]},[r0,:32]!
	vst1.32	{${A0B0}[1]},[r0,:32]!
	vst1.32	{${A1B1}[0]},[r0,:32]!
	vst1.32	{${A1B1}[1]},[r0,:32]
	bx	r12
.align	4
.Lialu:
#endif
___
$ret="r10";	# reassigned 1st argument
$code.=<<___;
	stmdb	sp!,{r4-r10,lr}
	mov	$ret,r0			@ reassign 1st argument
	mov	$b,r3			@ $b=b1
	ldr	r3,[sp,#32]		@ load b0
	mov	$mask,#7<<2
	sub	sp,sp,#32		@ allocate tab[8]

	bl	mul_1x1_ialu		@ a1b1
	str	$lo,[$ret,#8]
	str	$hi,[$ret,#12]

	eor	$b,$b,r3		@ flip b0 and b1
	 eor	$a,$a,r2		@ flip a0 and a1
	eor	r3,r3,$b
	 eor	r2,r2,$a
	eor	$b,$b,r3
	 eor	$a,$a,r2
	bl	mul_1x1_ialu		@ a0b0
	str	$lo,[$ret]
	str	$hi,[$ret,#4]

	eor	$a,$a,r2
	eor	$b,$b,r3
	bl	mul_1x1_ialu		@ (a1+a0)(b1+b0)
___
@r=map("r$_",(6..9));
$code.=<<___;
	ldmia	$ret,{@r[0]-@r[3]}
	eor	$lo,$lo,$hi
	eor	$hi,$hi,@r[1]
	eor	$lo,$lo,@r[0]
	eor	$hi,$hi,@r[2]
	eor	$lo,$lo,@r[3]
	eor	$hi,$hi,@r[3]
	str	$hi,[$ret,#8]
	eor	$lo,$lo,$hi
	add	sp,sp,#32		@ destroy tab[8]
	str	$lo,[$ret,#4]

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r10,pc}
#else
	ldmia	sp!,{r4-r10,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
#if __ARM_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-(.Lpic+8)
#endif
.asciz	"GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	5

.comm	OPENSSL_armcap_P,4,4
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;    # make it possible to compile with -march=armv4
print $code;
close STDOUT;   # enforce flush
Loading