Commit db42bb44 authored by Andy Polyakov, committed by Richard Levitte
Browse files

ARM64 assembly pack: make it Windows-friendly.



"Windows friendliness" means a) unified PIC-ification, unified across
all platforms; b) unified commentary delimiter; c) explicit ldur/stur,
as Visual Studio assembler can't automatically encode ldr/str as
ldur/stur when needed.

Reviewed-by: Paul Dale <paul.dale@oracle.com>
Reviewed-by: Richard Levitte <levitte@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/8256)
parent 3405db97
Loading
Loading
Loading
Loading
+138 −138
Original line number Diff line number Diff line
@@ -150,12 +150,12 @@ my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27));
my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31));

$code.=<<___;
##
##  _aes_preheat
##
##  Fills register %r10 -> .aes_consts (so you can -fPIC)
##  and %xmm9-%xmm15 as specified below.
##
//
//  _aes_preheat
//
//  Fills register %r10 -> .aes_consts (so you can -fPIC)
//  and %xmm9-%xmm15 as specified below.
//
.type	_vpaes_encrypt_preheat,%function
.align	4
_vpaes_encrypt_preheat:
@@ -167,21 +167,21 @@ _vpaes_encrypt_preheat:
	ret
.size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat

##
##  _aes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input
##     %xmm9-%xmm15 as in _vpaes_preheat
##    (%rdx) = scheduled keys
##
##  Output in %xmm0
##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
##  Preserves %xmm6 - %xmm8 so you get some local vectors
##
##
//
//  _aes_encrypt_core
//
//  AES-encrypt %xmm0.
//
//  Inputs:
//     %xmm0 = input
//     %xmm9-%xmm15 as in _vpaes_preheat
//    (%rdx) = scheduled keys
//
//  Output in %xmm0
//  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
//  Preserves %xmm6 - %xmm8 so you get some local vectors
//
//
.type	_vpaes_encrypt_core,%function
.align 4
_vpaes_encrypt_core:
@@ -387,11 +387,11 @@ _vpaes_decrypt_preheat:
	ret
.size	_vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat

##
##  Decryption core
##
##  Same API as encryption core.
##
//
//  Decryption core
//
//  Same API as encryption core.
//
.type	_vpaes_decrypt_core,%function
.align	4
_vpaes_decrypt_core:
@@ -643,11 +643,11 @@ my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3");
my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8));

$code.=<<___;
########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
////////////////////////////////////////////////////////
//                                                    //
//                  AES key schedule                  //
//                                                    //
////////////////////////////////////////////////////////
.type	_vpaes_key_preheat,%function
.align	4
_vpaes_key_preheat:
@@ -703,14 +703,14 @@ _vpaes_schedule_core:
	b.eq	.Lschedule_192
	// 128: fall though

##
##  .schedule_128
##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
//
//  .schedule_128
//
//  128-bit specific part of key schedule.
//
//  This schedule is really simple, because all its parts
//  are accomplished by the subroutines.
//
.Lschedule_128:
	mov	$inp, #10			// mov	\$10, %esi

@@ -721,21 +721,21 @@ _vpaes_schedule_core:
	bl	_vpaes_schedule_mangle		// write output
	b 	.Loop_schedule_128

##
##  .aes_schedule_192
##
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing.  The long, high side is
##  stored in %xmm7 as before, and the short, low side is in
##  the high bits of %xmm6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round
##  keys.
##
//
//  .aes_schedule_192
//
//  192-bit specific part of key schedule.
//
//  The main body of this schedule is the same as the 128-bit
//  schedule, but with more smearing.  The long, high side is
//  stored in %xmm7 as before, and the short, low side is in
//  the high bits of %xmm6.
//
//  This schedule is somewhat nastier, however, because each
//  round produces 192 bits of key material, or 1.5 round keys.
//  Therefore, on each cycle we do 2 rounds and produce 3 round
//  keys.
//
.align	4
.Lschedule_192:
	sub	$inp, $inp, #8
@@ -759,16 +759,16 @@ _vpaes_schedule_core:
	bl	_vpaes_schedule_192_smear
	b	.Loop_schedule_192

##
##  .aes_schedule_256
##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6.  The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
##
//
//  .aes_schedule_256
//
//  256-bit specific part of key schedule.
//
//  The structure here is very similar to the 128-bit
//  schedule, but with an additional "low side" in
//  %xmm6.  The low side's rounds are the same as the
//  high side's, except no rcon and no rotation.
//
.align	4
.Lschedule_256:
	ld1	{v0.16b}, [$inp]		// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
@@ -795,16 +795,16 @@ _vpaes_schedule_core:

	b	.Loop_schedule_256

##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##  Mangles %xmm0
##    when encrypting, outputs out(%xmm0) ^ 63
##    when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
//
//  .aes_schedule_mangle_last
//
//  Mangler for last round of key schedule
//  Mangles %xmm0
//    when encrypting, outputs out(%xmm0) ^ 63
//    when decrypting, outputs unskew(%xmm0)
//
//  Always called right before return... jumps to cleanup and exits
//
.align	4
.Lschedule_mangle_last:
	// schedule last round key from xmm0
@@ -838,20 +838,20 @@ _vpaes_schedule_core:
	ret
.size	_vpaes_schedule_core,.-_vpaes_schedule_core

##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##    %xmm7: high side, b  a  x  y
##    %xmm6:  low side, d  c  0  0
##    %xmm13: 0
##
##  Outputs:
##    %xmm6: b+c+d  b+c  0  0
##    %xmm0: b+c+d  b+c  b  a
##
//
//  .aes_schedule_192_smear
//
//  Smear the short, low side in the 192-bit key schedule.
//
//  Inputs:
//    %xmm7: high side, b  a  x  y
//    %xmm6:  low side, d  c  0  0
//    %xmm13: 0
//
//  Outputs:
//    %xmm6: b+c+d  b+c  0  0
//    %xmm0: b+c+d  b+c  b  a
//
.type	_vpaes_schedule_192_smear,%function
.align	4
_vpaes_schedule_192_smear:
@@ -867,24 +867,24 @@ _vpaes_schedule_192_smear:
	ret
.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear

##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
##  next rcon.
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm4, %r11.
##
//
//  .aes_schedule_round
//
//  Runs one main round of the key schedule on %xmm0, %xmm7
//
//  Specifically, runs subbytes on the high dword of %xmm0
//  then rotates it by one byte and xors into the low dword of
//  %xmm7.
//
//  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
//  next rcon.
//
//  Smears the dwords of %xmm7 by xoring the low into the
//  second low, result into third, result into highest.
//
//  Returns results in %xmm7 = %xmm0.
//  Clobbers %xmm1-%xmm4, %r11.
//
.type	_vpaes_schedule_round,%function
.align	4
_vpaes_schedule_round:
@@ -932,15 +932,15 @@ _vpaes_schedule_low_round:
	ret
.size	_vpaes_schedule_round,.-_vpaes_schedule_round

##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%r11)
##
##  Requires that %xmm9 = 0x0F0F... as in preheat
##  Output in %xmm0
##  Clobbers %xmm1, %xmm2
##
//
//  .aes_schedule_transform
//
//  Linear-transform %xmm0 according to tables at (%r11)
//
//  Requires that %xmm9 = 0x0F0F... as in preheat
//  Output in %xmm0
//  Clobbers %xmm1, %xmm2
//
.type	_vpaes_schedule_transform,%function
.align	4
_vpaes_schedule_transform:
@@ -954,29 +954,29 @@ _vpaes_schedule_transform:
	ret
.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform

##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##    xor with 0x63
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##
##  On decrypt,
##    xor with 0x63
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    deskew
##    apply shiftrows transform
##
##
##  Writes out to (%rdx), and increments or decrements it
##  Keeps track of round number mod 4 in %r8
##  Preserves xmm0
##  Clobbers xmm1-xmm5
##
//
//  .aes_schedule_mangle
//
//  Mangle xmm0 from (basis-transformed) standard version
//  to our version.
//
//  On encrypt,
//    xor with 0x63
//    multiply by circulant 0,1,1,1
//    apply shiftrows transform
//
//  On decrypt,
//    xor with 0x63
//    multiply by "inverse mixcolumns" circulant E,B,D,9
//    deskew
//    apply shiftrows transform
//
//
//  Writes out to (%rdx), and increments or decrements it
//  Keeps track of round number mod 4 in %r8
//  Preserves xmm0
//  Clobbers xmm1-xmm5
//
.type	_vpaes_schedule_mangle,%function
.align	4
_vpaes_schedule_mangle:
+8 −8
Original line number Diff line number Diff line
@@ -197,7 +197,7 @@ bn_mul_mont:
	mul	$nlo,$nj,$m1		// np[j]*m1
	adds	$lo1,$lo1,$lo0
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp,#-16]		// tp[j-1]
	stur	$lo1,[$tp,#-16]		// tp[j-1]
	cbnz	$j,.Linner

.Linner_skip:
@@ -253,13 +253,13 @@ bn_mul_mont:
	csel	$nj,$tj,$aj,lo		// did it borrow?
	ldr	$tj,[$tp],#8
	ldr	$aj,[$rp],#8
	str	xzr,[$tp,#-16]		// wipe tp
	str	$nj,[$rp,#-16]
	stur	xzr,[$tp,#-16]		// wipe tp
	stur	$nj,[$rp,#-16]
	cbnz	$num,.Lcond_copy

	csel	$nj,$tj,$aj,lo
	str	xzr,[$tp,#-8]		// wipe tp
	str	$nj,[$rp,#-8]
	stur	xzr,[$tp,#-8]		// wipe tp
	stur	$nj,[$rp,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
@@ -596,7 +596,7 @@ __bn_sqr8x_mont:
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	ldr	$n0,[$rp,#-8*8]
	ldur	$n0,[$rp,#-8*8]
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
@@ -794,7 +794,7 @@ $code.=<<___;
	//adc	$carry,xzr,xzr		// moved below
	cbz	$cnt,.Lsqr8x8_post_condition

	ldr	$n0,[$tp,#-8*8]
	ldur	$n0,[$tp,#-8*8]
	ldp	$a0,$a1,[$np,#8*0]
	ldp	$a2,$a3,[$np,#8*2]
	ldp	$a4,$a5,[$np,#8*4]
@@ -852,7 +852,7 @@ $code.=<<___;
	ldp	$a6,$a7,[$tp,#8*6]
	cbz	$cnt,.Lsqr8x_tail_break

	ldr	$n0,[$rp,#-8*8]
	ldur	$n0,[$rp,#-8*8]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$np,#8*0]
+5 −14
Original line number Diff line number Diff line
@@ -131,12 +131,6 @@ $code.=<<___;
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
.Lone:
.long	1,0,0,0
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
.asciz	"ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"

.globl	ChaCha20_ctr32
@@ -144,17 +138,13 @@ $code.=<<___;
.align	5
ChaCha20_ctr32:
	cbz	$len,.Labort
	adr	@x[0],.LOPENSSL_armcap_P
	cmp	$len,#192
	b.lo	.Lshort
#ifdef	__ILP32__
	ldrsw	@x[1],[@x[0]]
#else
	ldr	@x[1],[@x[0]]
#endif
	ldr	w17,[@x[1],@x[0]]

	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
	tst	w17,#ARMV7_NEON
	b.ne	ChaCha20_neon
	b.ne	.LChaCha20_neon

.Lshort:
	.inst	0xd503233f			// paciasp
@@ -380,6 +370,7 @@ $code.=<<___;
.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
.LChaCha20_neon:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0
+3 −3
Original line number Diff line number Diff line
@@ -1654,7 +1654,7 @@ ecp_nistz256_scatter_w5:

	ldp	x4,x5,[$inp]		// X
	ldp	x6,x7,[$inp,#16]
	str	w4,[$out,#64*0-4]
	stur	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
@@ -1670,7 +1670,7 @@ ecp_nistz256_scatter_w5:

	ldp	x4,x5,[$inp,#32]	// Y
	ldp	x6,x7,[$inp,#48]
	str	w4,[$out,#64*0-4]
	stur	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
@@ -1686,7 +1686,7 @@ ecp_nistz256_scatter_w5:

	ldp	x4,x5,[$inp,#64]	// Z
	ldp	x6,x7,[$inp,#80]
	str	w4,[$out,#64*0-4]
	stur	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
+10 −0
Original line number Diff line number Diff line
@@ -103,6 +103,12 @@ my $asciz = sub {
    {	"";	}
};

# Rewriter for the "adrp" mnemonic, installed only when $flavour matches
# /ios64/; for every other flavour $adrp is left undef.
#
# Input:   the operand text of an adrp line, optionally followed by a
#          "//" comment.
# Returns: "\tadrp\t<operands>@PAGE" -- the operands with "@PAGE" appended
#          and the trailing comment dropped.
#
# The condition is expressed as a ternary rather than "my ... if COND":
# perlsyn documents a "my" governed by a statement modifier as having
# undefined behaviour, so the declaration itself must be unconditional.
# NOTE(review): presumably looked up by mnemonic name elsewhere in this
# file -- confirm against the dispatch code.
my $adrp = ($flavour =~ /ios64/)
    ? sub {
	# Keep only the text that precedes an optional "//" comment.
	my ($args) = split(m|\s*//|,shift);
	"\tadrp\t$args\@PAGE";
    }
    : undef;


sub range {
  my ($r,$sfx,$start,$end) = @_;

@@ -132,6 +138,10 @@ sub expand_line {

    $line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge;

    if ($flavour =~ /ios64/) {
	$line =~ s/#:lo12:(\w+)/$1\@PAGEOFF/;
    }

    return $line;
}

Loading