ARM64 assembly pack: make it Windows-friendly. (db42bb44) · Commits · CYBER - Cyber Security / TS 103 523 MSP / TLMSP / TLMSP OpenSSL

crypto/aes/asm/vpaes-armv8.pl

+138 −138

Original line number	Diff line number	Diff line
		@@ -150,12 +150,12 @@ my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27));
		my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31));

		$code.=<<___;
		##
		## _aes_preheat
		##
		## Fills register %r10 -> .aes_consts (so you can -fPIC)
		## and %xmm9-%xmm15 as specified below.
		##
		//
		// _aes_preheat
		//
		// Fills register %r10 -> .aes_consts (so you can -fPIC)
		// and %xmm9-%xmm15 as specified below.
		//
		.type _vpaes_encrypt_preheat,%function
		.align 4
		_vpaes_encrypt_preheat:
		@@ -167,21 +167,21 @@ _vpaes_encrypt_preheat:
		ret
		.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat

		##
		## _aes_encrypt_core
		##
		## AES-encrypt %xmm0.
		##
		## Inputs:
		## %xmm0 = input
		## %xmm9-%xmm15 as in _vpaes_preheat
		## (%rdx) = scheduled keys
		##
		## Output in %xmm0
		## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
		## Preserves %xmm6 - %xmm8 so you get some local vectors
		##
		##
		//
		// _aes_encrypt_core
		//
		// AES-encrypt %xmm0.
		//
		// Inputs:
		// %xmm0 = input
		// %xmm9-%xmm15 as in _vpaes_preheat
		// (%rdx) = scheduled keys
		//
		// Output in %xmm0
		// Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
		// Preserves %xmm6 - %xmm8 so you get some local vectors
		//
		//
		.type _vpaes_encrypt_core,%function
		.align 4
		_vpaes_encrypt_core:
		@@ -387,11 +387,11 @@ _vpaes_decrypt_preheat:
		ret
		.size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat

		##
		## Decryption core
		##
		## Same API as encryption core.
		##
		//
		// Decryption core
		//
		// Same API as encryption core.
		//
		.type _vpaes_decrypt_core,%function
		.align 4
		_vpaes_decrypt_core:
		@@ -643,11 +643,11 @@ my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3");
		my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8));

		$code.=<<___;
		########################################################
		## ##
		## AES key schedule ##
		## ##
		########################################################
		////////////////////////////////////////////////////////
		// //
		// AES key schedule //
		// //
		////////////////////////////////////////////////////////
		.type _vpaes_key_preheat,%function
		.align 4
		_vpaes_key_preheat:
		@@ -703,14 +703,14 @@ _vpaes_schedule_core:
		b.eq .Lschedule_192
		// 128: fall through

		##
		## .schedule_128
		##
		## 128-bit specific part of key schedule.
		##
		## This schedule is really simple, because all its parts
		## are accomplished by the subroutines.
		##
		//
		// .schedule_128
		//
		// 128-bit specific part of key schedule.
		//
		// This schedule is really simple, because all its parts
		// are accomplished by the subroutines.
		//
		.Lschedule_128:
		mov $inp, #10 // mov \$10, %esi

		@@ -721,21 +721,21 @@ _vpaes_schedule_core:
		bl _vpaes_schedule_mangle // write output
		b .Loop_schedule_128

		##
		## .aes_schedule_192
		##
		## 192-bit specific part of key schedule.
		##
		## The main body of this schedule is the same as the 128-bit
		## schedule, but with more smearing. The long, high side is
		## stored in %xmm7 as before, and the short, low side is in
		## the high bits of %xmm6.
		##
		## This schedule is somewhat nastier, however, because each
		## round produces 192 bits of key material, or 1.5 round keys.
		## Therefore, on each cycle we do 2 rounds and produce 3 round
		## keys.
		##
		//
		// .aes_schedule_192
		//
		// 192-bit specific part of key schedule.
		//
		// The main body of this schedule is the same as the 128-bit
		// schedule, but with more smearing. The long, high side is
		// stored in %xmm7 as before, and the short, low side is in
		// the high bits of %xmm6.
		//
		// This schedule is somewhat nastier, however, because each
		// round produces 192 bits of key material, or 1.5 round keys.
		// Therefore, on each cycle we do 2 rounds and produce 3 round
		// keys.
		//
		.align 4
		.Lschedule_192:
		sub $inp, $inp, #8
		@@ -759,16 +759,16 @@ _vpaes_schedule_core:
		bl _vpaes_schedule_192_smear
		b .Loop_schedule_192

		##
		## .aes_schedule_256
		##
		## 256-bit specific part of key schedule.
		##
		## The structure here is very similar to the 128-bit
		## schedule, but with an additional "low side" in
		## %xmm6. The low side's rounds are the same as the
		## high side's, except no rcon and no rotation.
		##
		//
		// .aes_schedule_256
		//
		// 256-bit specific part of key schedule.
		//
		// The structure here is very similar to the 128-bit
		// schedule, but with an additional "low side" in
		// %xmm6. The low side's rounds are the same as the
		// high side's, except no rcon and no rotation.
		//
		.align 4
		.Lschedule_256:
		ld1 {v0.16b}, [$inp] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
		@@ -795,16 +795,16 @@ _vpaes_schedule_core:

		b .Loop_schedule_256

		##
		## .aes_schedule_mangle_last
		##
		## Mangler for last round of key schedule
		## Mangles %xmm0
		## when encrypting, outputs out(%xmm0) ^ 63
		## when decrypting, outputs unskew(%xmm0)
		##
		## Always called right before return... jumps to cleanup and exits
		##
		//
		// .aes_schedule_mangle_last
		//
		// Mangler for last round of key schedule
		// Mangles %xmm0
		// when encrypting, outputs out(%xmm0) ^ 63
		// when decrypting, outputs unskew(%xmm0)
		//
		// Always called right before return... jumps to cleanup and exits
		//
		.align 4
		.Lschedule_mangle_last:
		// schedule last round key from xmm0
		@@ -838,20 +838,20 @@ _vpaes_schedule_core:
		ret
		.size _vpaes_schedule_core,.-_vpaes_schedule_core

		##
		## .aes_schedule_192_smear
		##
		## Smear the short, low side in the 192-bit key schedule.
		##
		## Inputs:
		## %xmm7: high side, b a x y
		## %xmm6: low side, d c 0 0
		## %xmm13: 0
		##
		## Outputs:
		## %xmm6: b+c+d b+c 0 0
		## %xmm0: b+c+d b+c b a
		##
		//
		// .aes_schedule_192_smear
		//
		// Smear the short, low side in the 192-bit key schedule.
		//
		// Inputs:
		// %xmm7: high side, b a x y
		// %xmm6: low side, d c 0 0
		// %xmm13: 0
		//
		// Outputs:
		// %xmm6: b+c+d b+c 0 0
		// %xmm0: b+c+d b+c b a
		//
		.type _vpaes_schedule_192_smear,%function
		.align 4
		_vpaes_schedule_192_smear:
		@@ -867,24 +867,24 @@ _vpaes_schedule_192_smear:
		ret
		.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear

		##
		## .aes_schedule_round
		##
		## Runs one main round of the key schedule on %xmm0, %xmm7
		##
		## Specifically, runs subbytes on the high dword of %xmm0
		## then rotates it by one byte and xors into the low dword of
		## %xmm7.
		##
		## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
		## next rcon.
		##
		## Smears the dwords of %xmm7 by xoring the low into the
		## second low, result into third, result into highest.
		##
		## Returns results in %xmm7 = %xmm0.
		## Clobbers %xmm1-%xmm4, %r11.
		##
		//
		// .aes_schedule_round
		//
		// Runs one main round of the key schedule on %xmm0, %xmm7
		//
		// Specifically, runs subbytes on the high dword of %xmm0
		// then rotates it by one byte and xors into the low dword of
		// %xmm7.
		//
		// Adds rcon from low byte of %xmm8, then rotates %xmm8 for
		// next rcon.
		//
		// Smears the dwords of %xmm7 by xoring the low into the
		// second low, result into third, result into highest.
		//
		// Returns results in %xmm7 = %xmm0.
		// Clobbers %xmm1-%xmm4, %r11.
		//
		.type _vpaes_schedule_round,%function
		.align 4
		_vpaes_schedule_round:
		@@ -932,15 +932,15 @@ _vpaes_schedule_low_round:
		ret
		.size _vpaes_schedule_round,.-_vpaes_schedule_round

		##
		## .aes_schedule_transform
		##
		## Linear-transform %xmm0 according to tables at (%r11)
		##
		## Requires that %xmm9 = 0x0F0F... as in preheat
		## Output in %xmm0
		## Clobbers %xmm1, %xmm2
		##
		//
		// .aes_schedule_transform
		//
		// Linear-transform %xmm0 according to tables at (%r11)
		//
		// Requires that %xmm9 = 0x0F0F... as in preheat
		// Output in %xmm0
		// Clobbers %xmm1, %xmm2
		//
		.type _vpaes_schedule_transform,%function
		.align 4
		_vpaes_schedule_transform:
		@@ -954,29 +954,29 @@ _vpaes_schedule_transform:
		ret
		.size _vpaes_schedule_transform,.-_vpaes_schedule_transform

		##
		## .aes_schedule_mangle
		##
		## Mangle xmm0 from (basis-transformed) standard version
		## to our version.
		##
		## On encrypt,
		## xor with 0x63
		## multiply by circulant 0,1,1,1
		## apply shiftrows transform
		##
		## On decrypt,
		## xor with 0x63
		## multiply by "inverse mixcolumns" circulant E,B,D,9
		## deskew
		## apply shiftrows transform
		##
		##
		## Writes out to (%rdx), and increments or decrements it
		## Keeps track of round number mod 4 in %r8
		## Preserves xmm0
		## Clobbers xmm1-xmm5
		##
		//
		// .aes_schedule_mangle
		//
		// Mangle xmm0 from (basis-transformed) standard version
		// to our version.
		//
		// On encrypt,
		// xor with 0x63
		// multiply by circulant 0,1,1,1
		// apply shiftrows transform
		//
		// On decrypt,
		// xor with 0x63
		// multiply by "inverse mixcolumns" circulant E,B,D,9
		// deskew
		// apply shiftrows transform
		//
		//
		// Writes out to (%rdx), and increments or decrements it
		// Keeps track of round number mod 4 in %r8
		// Preserves xmm0
		// Clobbers xmm1-xmm5
		//
		.type _vpaes_schedule_mangle,%function
		.align 4
		_vpaes_schedule_mangle:

crypto/bn/asm/armv8-mont.pl

+8 −8

Original line number	Diff line number	Diff line
		@@ -197,7 +197,7 @@ bn_mul_mont:
		mul $nlo,$nj,$m1 // np[j]*m1
		adds $lo1,$lo1,$lo0
		umulh $nhi,$nj,$m1
		str $lo1,[$tp,#-16] // tp[j-1]
		stur $lo1,[$tp,#-16] // tp[j-1]
		cbnz $j,.Linner

		.Linner_skip:
		@@ -253,13 +253,13 @@ bn_mul_mont:
		csel $nj,$tj,$aj,lo // did it borrow?
		ldr $tj,[$tp],#8
		ldr $aj,[$rp],#8
		str xzr,[$tp,#-16] // wipe tp
		str $nj,[$rp,#-16]
		stur xzr,[$tp,#-16] // wipe tp
		stur $nj,[$rp,#-16]
		cbnz $num,.Lcond_copy

		csel $nj,$tj,$aj,lo
		str xzr,[$tp,#-8] // wipe tp
		str $nj,[$rp,#-8]
		stur xzr,[$tp,#-8] // wipe tp
		stur $nj,[$rp,#-8]

		ldp x19,x20,[x29,#16]
		mov sp,x29
		@@ -596,7 +596,7 @@ __bn_sqr8x_mont:
		ldp $a4,$a5,[$tp,#8*4]
		ldp $a6,$a7,[$tp,#8*6]
		adds $acc0,$acc0,$a0
		ldr $n0,[$rp,#-8*8]
		ldur $n0,[$rp,#-8*8]
		adcs $acc1,$acc1,$a1
		ldp $a0,$a1,[$ap,#8*0]
		adcs $acc2,$acc2,$a2
		@@ -794,7 +794,7 @@ $code.=<<___;
		//adc $carry,xzr,xzr // moved below
		cbz $cnt,.Lsqr8x8_post_condition

		ldr $n0,[$tp,#-8*8]
		ldur $n0,[$tp,#-8*8]
		ldp $a0,$a1,[$np,#8*0]
		ldp $a2,$a3,[$np,#8*2]
		ldp $a4,$a5,[$np,#8*4]
		@@ -852,7 +852,7 @@ $code.=<<___;
		ldp $a6,$a7,[$tp,#8*6]
		cbz $cnt,.Lsqr8x_tail_break

		ldr $n0,[$rp,#-8*8]
		ldur $n0,[$rp,#-8*8]
		adds $acc0,$acc0,$a0
		adcs $acc1,$acc1,$a1
		ldp $a0,$a1,[$np,#8*0]

crypto/chacha/asm/chacha-armv8.pl

+5 −14

Original line number	Diff line number	Diff line
		@@ -131,12 +131,6 @@ $code.=<<___;
		.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
		.Lone:
		.long 1,0,0,0
		.LOPENSSL_armcap_P:
		#ifdef __ILP32__
		.long OPENSSL_armcap_P-.
		#else
		.quad OPENSSL_armcap_P-.
		#endif
		.asciz "ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"

		.globl ChaCha20_ctr32
		@@ -144,17 +138,13 @@ $code.=<<___;
		.align 5
		ChaCha20_ctr32:
		cbz $len,.Labort
		adr @x[0],.LOPENSSL_armcap_P
		cmp $len,#192
		b.lo .Lshort
		#ifdef __ILP32__
		ldrsw @x[1],[@x[0]]
		#else
		ldr @x[1],[@x[0]]
		#endif
		ldr w17,[@x[1],@x[0]]

		adrp x17,OPENSSL_armcap_P
		ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
		tst w17,#ARMV7_NEON
		b.ne ChaCha20_neon
		b.ne .LChaCha20_neon

		.Lshort:
		.inst 0xd503233f // paciasp
		@@ -380,6 +370,7 @@ $code.=<<___;
		.type ChaCha20_neon,%function
		.align 5
		ChaCha20_neon:
		.LChaCha20_neon:
		.inst 0xd503233f // paciasp
		stp x29,x30,[sp,#-96]!
		add x29,sp,#0

crypto/ec/asm/ecp_nistz256-armv8.pl

+3 −3

Original line number	Diff line number	Diff line
		@@ -1654,7 +1654,7 @@ ecp_nistz256_scatter_w5:

		ldp x4,x5,[$inp] // X
		ldp x6,x7,[$inp,#16]
		str w4,[$out,#64*0-4]
		stur w4,[$out,#64*0-4]
		lsr x4,x4,#32
		str w5,[$out,#64*1-4]
		lsr x5,x5,#32
		@@ -1670,7 +1670,7 @@ ecp_nistz256_scatter_w5:

		ldp x4,x5,[$inp,#32] // Y
		ldp x6,x7,[$inp,#48]
		str w4,[$out,#64*0-4]
		stur w4,[$out,#64*0-4]
		lsr x4,x4,#32
		str w5,[$out,#64*1-4]
		lsr x5,x5,#32
		@@ -1686,7 +1686,7 @@ ecp_nistz256_scatter_w5:

		ldp x4,x5,[$inp,#64] // Z
		ldp x6,x7,[$inp,#80]
		str w4,[$out,#64*0-4]
		stur w4,[$out,#64*0-4]
		lsr x4,x4,#32
		str w5,[$out,#64*1-4]
		lsr x5,x5,#32

crypto/perlasm/arm-xlate.pl

+10 −0

Original line number	Diff line number	Diff line
		@@ -103,6 +103,12 @@ my $asciz = sub {
		{ ""; }
		};

		# Handler for the "adrp" mnemonic. It is only assigned when $flavour
		# matches /ios64/; for every other flavour $adrp is left unset.
		#
		# Argument: the operand text of an adrp line, optionally followed by a
		#           "// ..." comment.
		# Returns:  the instruction text "\tadrp\t<operands>@PAGE", with the
		#           comment (and any whitespace just before it) dropped.
		my $adrp = sub {
		    # Split on optional whitespace followed by "//": the first field is
		    # the operand text; $comment receives the remainder and is not used.
		    my ($args,$comment) = split(m|\s*//|,shift);
		    "\tadrp\t$args\@PAGE";
		} if ($flavour =~ /ios64/);


		sub range {
		my ($r,$sfx,$start,$end) = @_;

		@@ -132,6 +138,10 @@ sub expand_line {

		$line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge;

		if ($flavour =~ /ios64/) {
		$line =~ s/#:lo12:(\w+)/$1\@PAGEOFF/;
		}

		return $line;
		}