Commit 1e863180 authored by Andy Polyakov's avatar Andy Polyakov
Browse files

ARM assembler pack: profiler-assisted optimizations and NEON support.

parent d8d95832
Loading
Loading
Loading
Loading
+3 −2
Original line number Diff line number Diff line
@@ -398,8 +398,9 @@ my %table=(
"linux-alpha-ccc","ccc:-fast -readonly_strings -DL_ENDIAN -DTERMIO::-D_REENTRANT:::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL:${alpha_asm}",
"linux-alpha+bwx-ccc","ccc:-fast -readonly_strings -DL_ENDIAN -DTERMIO::-D_REENTRANT:::SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL:${alpha_asm}",

# Android: linux-armv4 but without -DTERMIO and pointers to headers and libs.
"android","gcc:-mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${armv4_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
# Android: linux-* but without -DTERMIO and pointers to headers and libs.
"android","gcc:-mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"android-armv7","gcc:-march=armv7-a -mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${armv4_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",

#### *BSD [do see comment about ${BSDthreads} above!]
"BSD-generic32","gcc:-DTERMIOS -O3 -fomit-frame-pointer -Wall::${BSDthreads}:::BN_LLONG RC2_CHAR RC4_INDEX DES_INT DES_UNROLL:${no_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+32 −0
Original line number Diff line number Diff line
@@ -1001,6 +1001,38 @@ $sys_id =
$lflags       = -ldl
$bn_ops       = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR
$cpuid_obj    = 
$bn_obj       = 
$des_obj      = 
$aes_obj      = 
$bf_obj       = 
$md5_obj      = 
$sha1_obj     = 
$cast_obj     = 
$rc4_obj      = 
$rmd160_obj   = 
$rc5_obj      = 
$wp_obj       = 
$cmll_obj     = 
$modes_obj    = 
$perlasm_scheme = void
$dso_scheme   = dlfcn
$shared_target= linux-shared
$shared_cflag = -fPIC
$shared_ldflag = 
$shared_extension = .so.$(SHLIB_MAJOR).$(SHLIB_MINOR)
$ranlib       = 
$arflags      = 
$multilib     = 

*** android-armv7
$cc           = gcc
$cflags       = -march=armv7-a -mandroid -I$(ANDROID_DEV)/include -B$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall
$unistd       = 
$thread_cflag = -D_REENTRANT
$sys_id       = 
$lflags       = -ldl
$bn_ops       = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR
$cpuid_obj    = 
$bn_obj       = bn_asm.o armv4-mont.o
$des_obj      = 
$aes_obj      = aes_cbc.o aes-armv4.o
+1 −0
Original line number Diff line number Diff line
@@ -821,6 +821,7 @@ case "$GUESSOS" in
  beos-*) OUT="$GUESSOS" ;;
  x86pc-*-qnx6) OUT="QNX6-i386" ;;
  *-*-qnx6) OUT="QNX6" ;;
  armv[7-9]*-*-android) OUT="android-armv7" ;;
  *) OUT=`echo $GUESSOS | awk -F- '{print $3}'`;;
esac

+2 −1
Original line number Diff line number Diff line
@@ -68,7 +68,8 @@ aes-parisc.s: asm/aes-parisc.pl
	$(PERL) asm/aes-parisc.pl $(PERLASM_SCHEME) $@

# GNU make "catch all"
aes-%.s:	asm/aes-%.pl;	$(PERL) $< $(PERLASM_SCHEME) $@
aes-%.S:	asm/aes-%.pl;	$(PERL) $< $(PERLASM_SCHEME) $@
aes-armv4.o:	aes-armv4.S

files:
	$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
+132 −29
Original line number Diff line number Diff line
@@ -27,6 +27,11 @@
# Rescheduling for dual-issue pipeline resulted in 12% improvement on
# Cortex A8 core and ~25 cycles per byte processed with 128-bit key.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~21.5 cycles per byte.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

@@ -46,6 +51,7 @@ $key="r11";
$rounds="r12";

$code=<<___;
#include "arm_arch.h"
.text
.code	32

@@ -166,7 +172,7 @@ AES_encrypt:
	mov	$rounds,r0		@ inp
	mov	$key,r2
	sub	$tbl,r3,#AES_encrypt-AES_Te	@ Te

#if __ARM_ARCH__<7
	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
	ldrb	$t1,[$rounds,#2]	@ manner...
	ldrb	$t2,[$rounds,#1]
@@ -195,10 +201,33 @@ AES_encrypt:
	orr	$s3,$s3,$t1,lsl#8
	orr	$s3,$s3,$t2,lsl#16
	orr	$s3,$s3,$t3,lsl#24

#else
	ldr	$s0,[$rounds,#0]
	ldr	$s1,[$rounds,#4]
	ldr	$s2,[$rounds,#8]
	ldr	$s3,[$rounds,#12]
#ifdef __ARMEL__
	rev	$s0,$s0
	rev	$s1,$s1
	rev	$s2,$s2
	rev	$s3,$s3
#endif
#endif
	bl	_armv4_AES_encrypt

	ldr	$rounds,[sp],#4		@ pop out
#if __ARM_ARCH__>=7
#ifdef __ARMEL__
	rev	$s0,$s0
	rev	$s1,$s1
	rev	$s2,$s2
	rev	$s3,$s3
#endif
	str	$s0,[$rounds,#0]
	str	$s1,[$rounds,#4]
	str	$s2,[$rounds,#8]
	str	$s3,[$rounds,#12]
#else
	mov	$t1,$s0,lsr#24		@ write output in endian-neutral
	mov	$t2,$s0,lsr#16		@ manner...
	mov	$t3,$s0,lsr#8
@@ -227,11 +256,15 @@ AES_encrypt:
	strb	$t2,[$rounds,#13]
	strb	$t3,[$rounds,#14]
	strb	$s3,[$rounds,#15]

#endif
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia   sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	AES_encrypt,.-AES_encrypt

.type   _armv4_AES_encrypt,%function
@@ -271,11 +304,11 @@ _armv4_AES_encrypt:
	and	$i2,lr,$s2,lsr#16	@ i1
	eor	$t3,$t3,$i3,ror#8
	and	$i3,lr,$s2
	eor	$s1,$s1,$t1,ror#24
	ldr	$i1,[$tbl,$i1,lsl#2]	@ Te2[s2>>8]
	eor	$s1,$s1,$t1,ror#24
	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te1[s2>>16]
	mov	$s2,$s2,lsr#24

	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te1[s2>>16]
	ldr	$i3,[$tbl,$i3,lsl#2]	@ Te3[s2>>0]
	eor	$s0,$s0,$i1,ror#16
	ldr	$s2,[$tbl,$s2,lsl#2]	@ Te0[s2>>24]
@@ -284,16 +317,16 @@ _armv4_AES_encrypt:
	and	$i2,lr,$s3,lsr#8	@ i1
	eor	$t3,$t3,$i3,ror#16
	and	$i3,lr,$s3,lsr#16	@ i2
	eor	$s2,$s2,$t2,ror#16
	ldr	$i1,[$tbl,$i1,lsl#2]	@ Te3[s3>>0]
	eor	$s2,$s2,$t2,ror#16
	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te2[s3>>8]
	mov	$s3,$s3,lsr#24

	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te2[s3>>8]
	ldr	$i3,[$tbl,$i3,lsl#2]	@ Te1[s3>>16]
	eor	$s0,$s0,$i1,ror#24
	ldr	$s3,[$tbl,$s3,lsl#2]	@ Te0[s3>>24]
	eor	$s1,$s1,$i2,ror#16
	ldr	$i1,[$key],#16
	eor	$s1,$s1,$i2,ror#16
	ldr	$s3,[$tbl,$s3,lsl#2]	@ Te0[s3>>24]
	eor	$s2,$s2,$i3,ror#8
	ldr	$t1,[$key,#-12]
	eor	$s3,$s3,$t3,ror#8
@@ -333,11 +366,11 @@ _armv4_AES_encrypt:
	and	$i2,lr,$s2,lsr#16	@ i1
	eor	$t3,$i3,$t3,lsl#8
	and	$i3,lr,$s2
	eor	$s1,$t1,$s1,lsl#24
	ldrb	$i1,[$tbl,$i1,lsl#2]	@ Te4[s2>>8]
	eor	$s1,$t1,$s1,lsl#24
	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s2>>16]
	mov	$s2,$s2,lsr#24

	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s2>>16]
	ldrb	$i3,[$tbl,$i3,lsl#2]	@ Te4[s2>>0]
	eor	$s0,$i1,$s0,lsl#8
	ldrb	$s2,[$tbl,$s2,lsl#2]	@ Te4[s2>>24]
@@ -346,15 +379,15 @@ _armv4_AES_encrypt:
	and	$i2,lr,$s3,lsr#8	@ i1
	eor	$t3,$i3,$t3,lsl#8
	and	$i3,lr,$s3,lsr#16	@ i2
	eor	$s2,$t2,$s2,lsl#24
	ldrb	$i1,[$tbl,$i1,lsl#2]	@ Te4[s3>>0]
	eor	$s2,$t2,$s2,lsl#24
	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s3>>8]
	mov	$s3,$s3,lsr#24

	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s3>>8]
	ldrb	$i3,[$tbl,$i3,lsl#2]	@ Te4[s3>>16]
	eor	$s0,$i1,$s0,lsl#8
	ldrb	$s3,[$tbl,$s3,lsl#2]	@ Te4[s3>>24]
	ldr	$i1,[$key,#0]
	ldrb	$s3,[$tbl,$s3,lsl#2]	@ Te4[s3>>24]
	eor	$s1,$s1,$i2,lsl#8
	ldr	$t1,[$key,#4]
	eor	$s2,$s2,$i3,lsl#16
@@ -398,6 +431,7 @@ AES_set_encrypt_key:
	mov	lr,r1			@ bits
	mov	$key,r2			@ key

#if __ARM_ARCH__<7
	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
	ldrb	$t1,[$rounds,#2]	@ manner...
	ldrb	$t2,[$rounds,#1]
@@ -430,6 +464,22 @@ AES_set_encrypt_key:
	orr	$s3,$s3,$t3,lsl#24
	str	$s2,[$key,#-8]
	str	$s3,[$key,#-4]
#else
	ldr	$s0,[$rounds,#0]
	ldr	$s1,[$rounds,#4]
	ldr	$s2,[$rounds,#8]
	ldr	$s3,[$rounds,#12]
#ifdef __ARMEL__
	rev	$s0,$s0
	rev	$s1,$s1
	rev	$s2,$s2
	rev	$s3,$s3
#endif
	str	$s0,[$key],#16
	str	$s1,[$key,#-12]
	str	$s2,[$key,#-8]
	str	$s3,[$key,#-4]
#endif

	teq	lr,#128
	bne	.Lnot128
@@ -466,6 +516,7 @@ AES_set_encrypt_key:
	b	.Ldone

.Lnot128:
#if __ARM_ARCH__<7
	ldrb	$i2,[$rounds,#19]
	ldrb	$t1,[$rounds,#18]
	ldrb	$t2,[$rounds,#17]
@@ -482,6 +533,16 @@ AES_set_encrypt_key:
	str	$i2,[$key],#8
	orr	$i3,$i3,$t3,lsl#24
	str	$i3,[$key,#-4]
#else
	ldr	$i2,[$rounds,#16]
	ldr	$i3,[$rounds,#20]
#ifdef __ARMEL__
	rev	$i2,$i2
	rev	$i3,$i3
#endif
	str	$i2,[$key],#8
	str	$i3,[$key,#-4]
#endif

	teq	lr,#192
	bne	.Lnot192
@@ -526,6 +587,7 @@ AES_set_encrypt_key:
	b	.L192_loop

.Lnot192:
#if __ARM_ARCH__<7
	ldrb	$i2,[$rounds,#27]
	ldrb	$t1,[$rounds,#26]
	ldrb	$t2,[$rounds,#25]
@@ -542,6 +604,16 @@ AES_set_encrypt_key:
	str	$i2,[$key],#8
	orr	$i3,$i3,$t3,lsl#24
	str	$i3,[$key,#-4]
#else
	ldr	$i2,[$rounds,#24]
	ldr	$i3,[$rounds,#28]
#ifdef __ARMEL__
	rev	$i2,$i2
	rev	$i3,$i3
#endif
	str	$i2,[$key],#8
	str	$i3,[$key,#-4]
#endif

	mov	$rounds,#14
	str	$rounds,[$key,#240-32]
@@ -692,10 +764,14 @@ $code.=<<___;
	bne	.Lmix

	mov	r0,#0
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia   sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	AES_set_decrypt_key,.-AES_set_decrypt_key

.type	AES_Td,%object
@@ -811,7 +887,7 @@ AES_decrypt:
	mov	$rounds,r0		@ inp
	mov	$key,r2
	sub	$tbl,r3,#AES_decrypt-AES_Td		@ Td

#if __ARM_ARCH__<7
	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
	ldrb	$t1,[$rounds,#2]	@ manner...
	ldrb	$t2,[$rounds,#1]
@@ -840,10 +916,33 @@ AES_decrypt:
	orr	$s3,$s3,$t1,lsl#8
	orr	$s3,$s3,$t2,lsl#16
	orr	$s3,$s3,$t3,lsl#24

#else
	ldr	$s0,[$rounds,#0]
	ldr	$s1,[$rounds,#4]
	ldr	$s2,[$rounds,#8]
	ldr	$s3,[$rounds,#12]
#ifdef __ARMEL__
	rev	$s0,$s0
	rev	$s1,$s1
	rev	$s2,$s2
	rev	$s3,$s3
#endif
#endif
	bl	_armv4_AES_decrypt

	ldr	$rounds,[sp],#4		@ pop out
#if __ARM_ARCH__>=7
#ifdef __ARMEL__
	rev	$s0,$s0
	rev	$s1,$s1
	rev	$s2,$s2
	rev	$s3,$s3
#endif
	str	$s0,[$rounds,#0]
	str	$s1,[$rounds,#4]
	str	$s2,[$rounds,#8]
	str	$s3,[$rounds,#12]
#else
	mov	$t1,$s0,lsr#24		@ write output in endian-neutral
	mov	$t2,$s0,lsr#16		@ manner...
	mov	$t3,$s0,lsr#8
@@ -872,11 +971,15 @@ AES_decrypt:
	strb	$t2,[$rounds,#13]
	strb	$t3,[$rounds,#14]
	strb	$s3,[$rounds,#15]

#endif
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia   sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	AES_decrypt,.-AES_decrypt

.type   _armv4_AES_decrypt,%function
@@ -916,11 +1019,11 @@ _armv4_AES_decrypt:
	and	$i2,lr,$s2		@ i1
	eor	$t3,$i3,$t3,ror#8
	and	$i3,lr,$s2,lsr#16
	eor	$s1,$s1,$t1,ror#8
	ldr	$i1,[$tbl,$i1,lsl#2]	@ Td2[s2>>8]
	eor	$s1,$s1,$t1,ror#8
	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td3[s2>>0]
	mov	$s2,$s2,lsr#24

	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td3[s2>>0]
	ldr	$i3,[$tbl,$i3,lsl#2]	@ Td1[s2>>16]
	eor	$s0,$s0,$i1,ror#16
	ldr	$s2,[$tbl,$s2,lsl#2]	@ Td0[s2>>24]
@@ -929,22 +1032,22 @@ _armv4_AES_decrypt:
	and	$i2,lr,$s3,lsr#8	@ i1
	eor	$t3,$i3,$t3,ror#8
	and	$i3,lr,$s3		@ i2
	eor	$s2,$s2,$t2,ror#8
	ldr	$i1,[$tbl,$i1,lsl#2]	@ Td1[s3>>16]
	eor	$s2,$s2,$t2,ror#8
	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td2[s3>>8]
	mov	$s3,$s3,lsr#24

	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td2[s3>>8]
	ldr	$i3,[$tbl,$i3,lsl#2]	@ Td3[s3>>0]
	eor	$s0,$s0,$i1,ror#8
	ldr	$s3,[$tbl,$s3,lsl#2]	@ Td0[s3>>24]
	ldr	$i1,[$key],#16
	eor	$s1,$s1,$i2,ror#16
	ldr	$s3,[$tbl,$s3,lsl#2]	@ Td0[s3>>24]
	eor	$s2,$s2,$i3,ror#24
	ldr	$i1,[$key],#16
	eor	$s3,$s3,$t3,ror#8

	ldr	$t1,[$key,#-12]
	ldr	$t2,[$key,#-8]
	eor	$s0,$s0,$i1
	ldr	$t2,[$key,#-8]
	eor	$s3,$s3,$t3,ror#8
	ldr	$t3,[$key,#-4]
	and	$i1,lr,$s0,lsr#16
	eor	$s1,$s1,$t1
@@ -985,11 +1088,11 @@ _armv4_AES_decrypt:
	and	$i1,lr,$s2,lsr#8	@ i0
	eor	$t2,$t2,$i2,lsl#8
	and	$i2,lr,$s2		@ i1
	eor	$t3,$t3,$i3,lsl#8
	ldrb	$i1,[$tbl,$i1]		@ Td4[s2>>8]
	eor	$t3,$t3,$i3,lsl#8
	ldrb	$i2,[$tbl,$i2]		@ Td4[s2>>0]
	and	$i3,lr,$s2,lsr#16

	ldrb	$i2,[$tbl,$i2]		@ Td4[s2>>0]
	ldrb	$s2,[$tbl,$s2,lsr#24]	@ Td4[s2>>24]
	eor	$s0,$s0,$i1,lsl#8
	ldrb	$i3,[$tbl,$i3]		@ Td4[s2>>16]
@@ -997,11 +1100,11 @@ _armv4_AES_decrypt:
	and	$i1,lr,$s3,lsr#16	@ i0
	eor	$s2,$t2,$s2,lsl#16
	and	$i2,lr,$s3,lsr#8	@ i1
	eor	$t3,$t3,$i3,lsl#16
	ldrb	$i1,[$tbl,$i1]		@ Td4[s3>>16]
	eor	$t3,$t3,$i3,lsl#16
	ldrb	$i2,[$tbl,$i2]		@ Td4[s3>>8]
	and	$i3,lr,$s3		@ i2

	ldrb	$i2,[$tbl,$i2]		@ Td4[s3>>8]
	ldrb	$i3,[$tbl,$i3]		@ Td4[s3>>0]
	ldrb	$s3,[$tbl,$s3,lsr#24]	@ Td4[s3>>24]
	eor	$s0,$s0,$i1,lsl#16
Loading