Commit 874faf2f authored by Andy Polyakov
Browse files

Adapt ARM assembly pack for iOS.



This is achieved by filtering perlasm output through arm-xlate.pl. But note
that it's done only if "flavour" argument is not 'void'. As 'void' is
default value for other ARM targets, perlasm output is not actually
filtered on previously validated platforms.

Reviewed-by: Dr. Stephen Henson <steve@openssl.org>
parent 0b45df73
Loading
Loading
Loading
Loading
+29 −2
Original line number Diff line number Diff line
@@ -32,8 +32,20 @@
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~21.5 cycles per byte.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$s0="r0";
$s1="r1";
@@ -171,7 +183,12 @@ AES_encrypt:
	stmdb   sp!,{r1,r4-r12,lr}
	mov	$rounds,r0		@ inp
	mov	$key,r2
#ifdef	__APPLE__
	mov	$tbl,#AES_encrypt-AES_Te
	sub	$tbl,r3,$tbl			@ Te
#else
	sub	$tbl,r3,#AES_encrypt-AES_Te	@ Te
#endif
#if __ARM_ARCH__<7
	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
	ldrb	$t1,[$rounds,#2]	@ manner...
@@ -425,7 +442,12 @@ AES_set_encrypt_key:
	bne	.Labrt

.Lok:	stmdb   sp!,{r4-r12,lr}
#ifdef	__APPLE__
	mov	$tbl,#AES_set_encrypt_key-AES_Te-1024
	sub	$tbl,r3,$tbl					@ Te4
#else
	sub	$tbl,r3,#AES_set_encrypt_key-AES_Te-1024	@ Te4
#endif

	mov	$rounds,r0		@ inp
	mov	lr,r1			@ bits
@@ -886,7 +908,12 @@ AES_decrypt:
	stmdb   sp!,{r1,r4-r12,lr}
	mov	$rounds,r0		@ inp
	mov	$key,r2
#ifdef	__APPLE__
	mov	$tbl,#AES_decrypt-AES_Td
	sub	$tbl,r3,$tbl				@ Td
#else
	sub	$tbl,r3,#AES_decrypt-AES_Td		@ Td
#endif
#if __ARM_ARCH__<7
	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
	ldrb	$t1,[$rounds,#2]	@ manner...
+21 −2
Original line number Diff line number Diff line
@@ -21,8 +21,20 @@
# runs in even less cycles, ~30, improvement is measurable only on
# longer keys. One has to optimize code elsewhere to get NEON glow...

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
@@ -170,11 +182,18 @@ bn_GF2m_mul_2x2:
#if __ARM_ARCH__>=7
	ldr	r12,.LOPENSSL_armcap
.Lpic:	ldr	r12,[pc,r12]
#ifdef	__APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#1
	beq	.Lialu

	veor	$A1,$A1
#ifdef	__APPLE__
	vmov	$B1,r3,r3		@ two copies of b1
#else
	vmov.32	$B1,r3,r3		@ two copies of b1
#endif
	vmov.32	${A1}[0],r1		@ a1

	veor	$A0,$A0
+14 −2
Original line number Diff line number Diff line
@@ -23,8 +23,20 @@
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$num="r0";	# starts as num argument, but holds &tp[num-1]
$ap="r1";
+25 −8
Original line number Diff line number Diff line
@@ -57,8 +57,20 @@
# *native* byte order on current platform. See gcm128.c for working
# example...

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$Xi="r0";	# argument block
$Htbl="r1";
@@ -112,6 +124,11 @@ $code=<<___;
.text
.code	32

#ifdef  __APPLE__
#define ldrplb	ldrbpl
#define ldrneb	ldrbne
#endif

.type	rem_4bit,%object
.align	5
rem_4bit:
@@ -326,9 +343,9 @@ $code.=<<___;
.align	4
gcm_gmult_neon:
	sub		$Htbl,#16		@ point at H in GCM128_CTX
	vld1.64		`&Dhi("$IN")`,[$Xi,:64]!@ load Xi
	vld1.64		`&Dhi("$IN")`,[$Xi]!	@ load Xi
	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
	vld1.64		`&Dlo("$IN")`,[$Xi,:64]!
	vld1.64		`&Dlo("$IN")`,[$Xi]!
	vshr.u64	$mod,#32
	vldmia		$Htbl,{$Hhi-$Hlo}	@ load H
	veor		$zero,$zero
@@ -349,9 +366,9 @@ gcm_gmult_neon:
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	vld1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ load Xi
	vld1.64		`&Dhi("$Z")`,[$Xi]!	@ load Xi
	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
	vld1.64		`&Dlo("$Z")`,[$Xi,:64]!
	vld1.64		`&Dlo("$Z")`,[$Xi]!
	vshr.u64	$mod,#32
	vldmia		$Xi,{$Hhi-$Hlo}		@ load H
	veor		$zero,$zero
@@ -410,8 +427,8 @@ gcm_ghash_neon:
	vrev64.8	$Z,$Z
#endif
	sub		$Xi,#16	
	vst1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ write out Xi
	vst1.64		`&Dlo("$Z")`,[$Xi,:64]
	vst1.64		`&Dhi("$Z")`,[$Xi]!	@ write out Xi
	vst1.64		`&Dlo("$Z")`,[$Xi]

	bx	lr
.size	gcm_ghash_neon,.-gcm_ghash_neon
+14 −2
Original line number Diff line number Diff line
@@ -52,8 +52,20 @@
# Profiler-assisted and platform-specific optimization resulted in 10%
# improvement on Cortex A8 core and 12.2 cycles per byte.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$ctx="r0";
$inp="r1";
Loading