aes-armv4.pl, bsaes-armv7.pl: add Linux kernel and Thumb2 support. (e0202d94) · Commits · CYBER - Cyber Security / TS 103 523 MSP / TLMSP / TLMSP OpenSSL

crypto/aes/asm/aes-armv4.pl

+105 −28

Original line number	Diff line number	Diff line
		#!/usr/bin/env perl

		# ====================================================================
		# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
		# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
		# project. The module is, however, dual licensed under OpenSSL and
		# CRYPTOGAMS licenses depending on where you obtain it. For further
		# details see http://www.openssl.org/~appro/cryptogams/.
		@@ -51,9 +51,18 @@ $key="r11";
		$rounds="r12";

		$code=<<___;
		#ifndef __KERNEL__
		# include "arm_arch.h"
		#else
		# define __ARM_ARCH__ __LINUX_ARM_ARCH__
		#endif

		.text
		#if __ARM_ARCH__<7
		.code 32
		#else
		.syntax unified
		#endif

		.type AES_Te,%object
		.align 5
		@@ -167,7 +176,11 @@ AES_Te:
		.type AES_encrypt,%function
		.align 5
		AES_encrypt:
		#if __ARM_ARCH__<7
		sub r3,pc,#8 @ AES_encrypt
		#else
		adr r3,AES_encrypt
		#endif
		stmdb sp!,{r1,r4-r12,lr}
		mov $rounds,r0 @ inp
		mov $key,r2
		@@ -409,11 +422,21 @@ _armv4_AES_encrypt:
		.align 5
		AES_set_encrypt_key:
		_armv4_AES_set_encrypt_key:
		#if __ARM_ARCH__<7
		sub r3,pc,#8 @ AES_set_encrypt_key
		#else
		adr r3,AES_set_encrypt_key
		#endif
		teq r0,#0
		#if __ARM_ARCH__>=7
		itt eq @ Thumb2 thing, sanity check in ARM
		#endif
		moveq r0,#-1
		beq .Labrt
		teq r2,#0
		#if __ARM_ARCH__>=7
		itt eq @ Thumb2 thing, sanity check in ARM
		#endif
		moveq r0,#-1
		beq .Labrt

		@@ -422,6 +445,9 @@ _armv4_AES_set_encrypt_key:
		teq r1,#192
		beq .Lok
		teq r1,#256
		#if __ARM_ARCH__>=7
		itt ne @ Thumb2 thing, sanity check in ARM
		#endif
		movne r0,#-1
		bne .Labrt

		@@ -576,6 +602,9 @@ _armv4_AES_set_encrypt_key:
		str $s2,[$key,#-16]
		subs $rounds,$rounds,#1
		str $s3,[$key,#-12]
		#if __ARM_ARCH__>=7
		itt eq @ Thumb2 thing, sanity check in ARM
		#endif
		subeq r2,$key,#216
		beq .Ldone

		@@ -645,6 +674,9 @@ _armv4_AES_set_encrypt_key:
		str $s2,[$key,#-24]
		subs $rounds,$rounds,#1
		str $s3,[$key,#-20]
		#if __ARM_ARCH__>=7
		itt eq @ Thumb2 thing, sanity check in ARM
		#endif
		subeq r2,$key,#256
		beq .Ldone

		@@ -674,11 +706,17 @@ _armv4_AES_set_encrypt_key:
		str $i3,[$key,#-4]
		b .L256_loop

		.align 2
		.Ldone: mov r0,#0
		ldmia sp!,{r4-r12,lr}
		.Labrt: tst lr,#1
		.Labrt:
		#if defined(__thumb2__) && __ARM_ARCH__>=7
		.short 0x4770 @ bx lr in Thumb2 encoding
		#else
		tst lr,#1
		moveq pc,lr @ be binary compatible with V4, yet
		bx lr @ interoperable with Thumb ISA:-)
		#endif
		.size AES_set_encrypt_key,.-AES_set_encrypt_key

		.global AES_set_decrypt_key
		@@ -688,34 +726,57 @@ AES_set_decrypt_key:
		str lr,[sp,#-4]! @ push lr
		bl _armv4_AES_set_encrypt_key
		teq r0,#0
		ldrne lr,[sp],#4 @ pop lr
		ldr lr,[sp],#4 @ pop lr
		bne .Labrt

		stmdb sp!,{r4-r12}
		mov r0,r2 @ AES_set_encrypt_key preserves r2,
		mov r1,r2 @ which is AES_KEY *key
		b _armv4_AES_set_enc2dec_key
		.size AES_set_decrypt_key,.-AES_set_decrypt_key

		ldr $rounds,[r2,#240] @ AES_set_encrypt_key preserves r2,
		mov $key,r2 @ which is AES_KEY *key
		mov $i1,r2
		add $i2,r2,$rounds,lsl#4
		@ void AES_set_enc2dec_key(const AES_KEY inp,AES_KEY out)
		.global AES_set_enc2dec_key
		.type AES_set_enc2dec_key,%function
		.align 5
		AES_set_enc2dec_key:
		_armv4_AES_set_enc2dec_key:
		stmdb sp!,{r4-r12,lr}

		ldr $rounds,[r0,#240]
		mov $i1,r0 @ input
		add $i2,r0,$rounds,lsl#4
		mov $key,r1 @ output
		add $tbl,r1,$rounds,lsl#4
		str $rounds,[r1,#240]

		.Linv: ldr $s0,[$i1],#16
		ldr $s1,[$i1,#-12]
		ldr $s2,[$i1,#-8]
		ldr $s3,[$i1,#-4]
		ldr $t1,[$i2],#-16
		ldr $t2,[$i2,#16+4]
		ldr $t3,[$i2,#16+8]
		ldr $i3,[$i2,#16+12]
		str $s0,[$tbl],#-16
		str $s1,[$tbl,#16+4]
		str $s2,[$tbl,#16+8]
		str $s3,[$tbl,#16+12]
		str $t1,[$key],#16
		str $t2,[$key,#-12]
		str $t3,[$key,#-8]
		str $i3,[$key,#-4]
		teq $i1,$i2
		bne .Linv

		.Linv: ldr $s0,[$i1]
		ldr $s0,[$i1]
		ldr $s1,[$i1,#4]
		ldr $s2,[$i1,#8]
		ldr $s3,[$i1,#12]
		ldr $t1,[$i2]
		ldr $t2,[$i2,#4]
		ldr $t3,[$i2,#8]
		ldr $i3,[$i2,#12]
		str $s0,[$i2],#-16
		str $s1,[$i2,#16+4]
		str $s2,[$i2,#16+8]
		str $s3,[$i2,#16+12]
		str $t1,[$i1],#16
		str $t2,[$i1,#-12]
		str $t3,[$i1,#-8]
		str $i3,[$i1,#-4]
		teq $i1,$i2
		bne .Linv
		str $s0,[$key]
		str $s1,[$key,#4]
		str $s2,[$key,#8]
		str $s3,[$key,#12]
		sub $key,$key,$rounds,lsl#3
		___
		$mask80=$i1;
		$mask1b=$i2;
		@@ -773,7 +834,7 @@ $code.=<<___;
		moveq pc,lr @ be binary compatible with V4, yet
		bx lr @ interoperable with Thumb ISA:-)
		#endif
		.size AES_set_decrypt_key,.-AES_set_decrypt_key
		.size AES_set_enc2dec_key,.-AES_set_enc2dec_key

		.type AES_Td,%object
		.align 5
		@@ -883,7 +944,11 @@ AES_Td:
		.type AES_decrypt,%function
		.align 5
		AES_decrypt:
		#if __ARM_ARCH__<7
		sub r3,pc,#8 @ AES_decrypt
		#else
		adr r3,AES_decrypt
		#endif
		stmdb sp!,{r1,r4-r12,lr}
		mov $rounds,r0 @ inp
		mov $key,r2
		@@ -1080,8 +1145,9 @@ _armv4_AES_decrypt:
		ldrb $t3,[$tbl,$i3] @ Td4[s0>>0]
		and $i3,lr,$s1,lsr#8

		add $s1,$tbl,$s1,lsr#24
		ldrb $i1,[$tbl,$i1] @ Td4[s1>>0]
		ldrb $s1,[$tbl,$s1,lsr#24] @ Td4[s1>>24]
		ldrb $s1,[$s1] @ Td4[s1>>24]
		ldrb $i2,[$tbl,$i2] @ Td4[s1>>16]
		eor $s0,$i1,$s0,lsl#24
		ldrb $i3,[$tbl,$i3] @ Td4[s1>>8]
		@@ -1094,7 +1160,8 @@ _armv4_AES_decrypt:
		ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
		and $i3,lr,$s2,lsr#16

		ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24]
		add $s2,$tbl,$s2,lsr#24
		ldrb $s2,[$s2] @ Td4[s2>>24]
		eor $s0,$s0,$i1,lsl#8
		ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
		eor $s1,$i2,$s1,lsl#16
		@@ -1106,8 +1173,9 @@ _armv4_AES_decrypt:
		ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
		and $i3,lr,$s3 @ i2

		add $s3,$tbl,$s3,lsr#24
		ldrb $i3,[$tbl,$i3] @ Td4[s3>>0]
		ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24]
		ldrb $s3,[$s3] @ Td4[s3>>24]
		eor $s0,$s0,$i1,lsl#16
		ldr $i1,[$key,#0]
		eor $s1,$s1,$i2,lsl#8
		@@ -1130,5 +1198,14 @@ _armv4_AES_decrypt:
		___

		$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4

		# Copy this script's own leading comment header into the generated
		# output, rewriting the leading '#' of each line to '@' (the comment
		# marker used throughout the assembly emitted in $code).
		# NOTE(review): the result of open is not checked here.
		open SELF,$0;
		while(<SELF>) {
		# Skip the "#!" interpreter line; it is not part of the header text.
		next if (/^#!/);
		# The substitution doubles as the test: stop at the first line that
		# neither starts with '#' (substitution fails) nor is empty.
		last if (!s/^#/@/ and !/^$/);
		# Emit the rewritten comment line, or the empty line, unchanged.
		print;
		}
		close SELF;

		print $code;
		close STDOUT; # enforce flush

crypto/aes/asm/bsaes-armv7.pl

+297 −55

Original line number	Diff line number	Diff line
		@@ -5,6 +5,10 @@
		# project. The module is, however, dual licensed under OpenSSL and
		# CRYPTOGAMS licenses depending on where you obtain it. For further
		# details see http://www.openssl.org/~appro/cryptogams/.
		#
		# Specific modes and adaptation for Linux kernel by Ard Biesheuvel
		# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
		# granted.
		# ====================================================================

		# Bit-sliced AES for ARM NEON
		@@ -37,6 +41,12 @@
		#
		# <appro@openssl.org>

		# April-August 2013
		#
		# Add CBC, CTR and XTS subroutines, adapt for kernel use.
		#
		# <ard.biesheuvel@linaro.org>

		while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
		open STDOUT,">$output";

		@@ -620,17 +630,34 @@ ___
		}

		$code.=<<___;
		#ifndef __KERNEL__
		# include "arm_arch.h"

		# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
		# define VFP_ABI_POP vldmia sp!,{d8-d15}
		# define VFP_ABI_FRAME 0x40
		#else
		# define VFP_ABI_PUSH
		# define VFP_ABI_POP
		# define VFP_ABI_FRAME 0
		# define BSAES_ASM_EXTENDED_KEY
		# define XTS_CHAIN_TWEAK
		# define __ARM_ARCH__ __LINUX_ARM_ARCH__
		#endif

		#ifdef __thumb__
		# define adrl adr
		#endif

		#if __ARM_ARCH__>=7
		.text
		.code 32
		.syntax unified @ ARMv7-capable assembler is expected to handle this
		.fpu neon

		.type _bsaes_decrypt8,%function
		.align 4
		_bsaes_decrypt8:
		sub $const,pc,#8 @ _bsaes_decrypt8
		adr $const,_bsaes_decrypt8
		vldmia $key!, {@XMM[9]} @ round 0 key
		add $const,$const,#.LM0ISR-_bsaes_decrypt8

		@@ -677,6 +704,7 @@ ___
		&InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
		$code.=<<___;
		vldmia $const, {@XMM[12]} @ .LISR
		ite eq @ Thumb2 thing, sanity check in ARM
		addeq $const,$const,#0x10
		bne .Ldec_loop
		vldmia $const, {@XMM[12]} @ .LISRM0
		@@ -717,8 +745,6 @@ _bsaes_const:
		.quad 0x02060a0e03070b0f, 0x0004080c0105090d
		.LREVM0SR:
		.quad 0x090d01050c000408, 0x03070b0f060a0e02
		.Lxts_magic:
		.quad 1, 0x87
		.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
		.align 6
		.size _bsaes_const,.-_bsaes_const
		@@ -726,7 +752,7 @@ _bsaes_const:
		.type _bsaes_encrypt8,%function
		.align 4
		_bsaes_encrypt8:
		sub $const,pc,#8 @ _bsaes_encrypt8
		adr $const,_bsaes_encrypt8
		vldmia $key!, {@XMM[9]} @ round 0 key
		sub $const,$const,#_bsaes_encrypt8-.LM0SR

		@@ -775,6 +801,7 @@ ___
		&MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
		$code.=<<___;
		vldmia $const, {@XMM[12]} @ .LSR
		ite eq @ Thumb2 thing, sanity check in ARM
		addeq $const,$const,#0x10
		bne .Lenc_loop
		vldmia $const, {@XMM[12]} @ .LSRM0
		@@ -829,7 +856,7 @@ $code.=<<___;
		.type _bsaes_key_convert,%function
		.align 4
		_bsaes_key_convert:
		sub $const,pc,#8 @ _bsaes_key_convert
		adr $const,_bsaes_key_convert
		vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
		sub $const,$const,#_bsaes_key_convert-.LM0
		vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
		@@ -998,32 +1025,62 @@ $code.=<<___;
		.type bsaes_cbc_encrypt,%function
		.align 5
		bsaes_cbc_encrypt:
		#ifndef __KERNEL__
		cmp $len, #128
		#ifndef __thumb__
		blo AES_cbc_encrypt
		#else
		bhs 1f
		b AES_cbc_encrypt
		1:
		#endif
		#endif

		@ it is up to the caller to make sure we are called with enc == 0

		mov ip, sp
		stmdb sp!, {r4-r10, lr}
		vstmdb sp!, {d8-d15} @ ABI specification says so
		ldr $ivp, [sp, #0x60] @ IV is 1st arg on the stack
		VFP_ABI_PUSH
		ldr $ivp, [ip] @ IV is 1st arg on the stack
		mov $len, $len, lsr#4 @ len in 16 byte blocks
		sub sp, #0x10 @ scratch space to carry over the IV
		mov $fp, sp @ save sp

		@ allocate the key schedule on the stack
		ldr $rounds, [$key, #240] @ get # of rounds
		sub sp, sp, $rounds, lsl#7 @ 128 bytes per inner round key
		add sp, sp, #`128-32` @ size of bit-sliced key schedule
		#ifndef BSAES_ASM_EXTENDED_KEY
		@ allocate the key schedule on the stack
		sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
		add r12, #`128-32` @ size of bit-sliced key schedule

		@ populate the key schedule
		mov r4, $key @ pass key
		mov r5, $rounds @ pass # of rounds
		mov r12, $keysched @ pass key schedule
		mov sp, r12 @ sp is $keysched
		bl _bsaes_key_convert
		vldmia $keysched, {@XMM[6]}
		vstmia r12, {@XMM[15]} @ save last round key
		veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
		vstmia $keysched, {@XMM[7]}
		#else
		ldr r12, [$key, #244]
		eors r12, #1
		beq 0f

		@ populate the key schedule
		str r12, [$key, #244]
		mov r4, $key @ pass key
		mov r5, $rounds @ pass # of rounds
		add r12, $key, #248 @ pass key schedule
		bl _bsaes_key_convert
		add r4, $key, #248
		vldmia r4, {@XMM[6]}
		vstmia r12, {@XMM[15]} @ save last round key
		veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
		vstmia r4, {@XMM[7]}

		.align 2
		0:
		#endif

		vld1.8 {@XMM[15]}, [$ivp] @ load IV
		b .Lcbc_dec_loop
		@@ -1035,7 +1092,11 @@ bsaes_cbc_encrypt:

		vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
		vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
		#ifndef BSAES_ASM_EXTENDED_KEY
		mov r4, $keysched @ pass the key
		#else
		add r4, $key, #248
		#endif
		vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
		mov r5, $rounds
		vld1.8 {@XMM[6]-@XMM[7]}, [$inp]
		@@ -1075,7 +1136,11 @@ bsaes_cbc_encrypt:
		cmp $len, #2
		blo .Lcbc_dec_one
		vld1.8 {@XMM[1]}, [$inp]!
		#ifndef BSAES_ASM_EXTENDED_KEY
		mov r4, $keysched @ pass the key
		#else
		add r4, $key, #248
		#endif
		mov r5, $rounds
		vstmia $fp, {@XMM[15]} @ put aside IV
		beq .Lcbc_dec_two
		@@ -1207,16 +1272,19 @@ bsaes_cbc_encrypt:
		vst1.8 {@XMM[0]}, [$rounds] @ write output

		.Lcbc_dec_done:
		#ifndef BSAES_ASM_EXTENDED_KEY
		vmov.i32 q0, #0
		vmov.i32 q1, #0
		.Lcbc_dec_bzero: @ wipe key schedule [if any]
		vstmia $keysched!, {q0-q1}
		teq $keysched, $fp
		cmp $keysched, $fp
		bne .Lcbc_dec_bzero
		#endif

		add sp, $fp, #0x10
		mov sp, $fp
		add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
		vst1.8 {@XMM[15]}, [$ivp] @ return IV
		vldmia sp!, {d8-d15}
		VFP_ABI_POP
		ldmia sp!, {r4-r10, pc}
		.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
		___
		@@ -1235,21 +1303,23 @@ bsaes_ctr32_encrypt_blocks:
		cmp $len, #8 @ use plain AES for
		blo .Lctr_enc_short @ small sizes

		mov ip, sp
		stmdb sp!, {r4-r10, lr}
		vstmdb sp!, {d8-d15} @ ABI specification says so
		ldr $ctr, [sp, #0x60] @ ctr is 1st arg on the stack
		VFP_ABI_PUSH
		ldr $ctr, [ip] @ ctr is 1st arg on the stack
		sub sp, sp, #0x10 @ scratch space to carry over the ctr
		mov $fp, sp @ save sp

		@ allocate the key schedule on the stack
		ldr $rounds, [$key, #240] @ get # of rounds
		sub sp, sp, $rounds, lsl#7 @ 128 bytes per inner round key
		add sp, sp, #`128-32` @ size of bit-sliced key schedule
		#ifndef BSAES_ASM_EXTENDED_KEY
		@ allocate the key schedule on the stack
		sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
		add r12, #`128-32` @ size of bit-sliced key schedule

		@ populate the key schedule
		mov r4, $key @ pass key
		mov r5, $rounds @ pass # of rounds
		mov r12, $keysched @ pass key schedule
		mov sp, r12 @ sp is $keysched
		bl _bsaes_key_convert
		veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
		vstmia r12, {@XMM[7]} @ save last round key
		@@ -1257,6 +1327,27 @@ bsaes_ctr32_encrypt_blocks:
		vld1.8 {@XMM[0]}, [$ctr] @ load counter
		add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr
		vldmia $keysched, {@XMM[4]} @ load round0 key
		#else
		ldr r12, [$key, #244]
		eors r12, #1
		beq 0f

		@ populate the key schedule
		str r12, [$key, #244]
		mov r4, $key @ pass key
		mov r5, $rounds @ pass # of rounds
		add r12, $key, #248 @ pass key schedule
		bl _bsaes_key_convert
		veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
		vstmia r12, {@XMM[7]} @ save last round key

		.align 2
		0: add r12, $key, #248
		vld1.8 {@XMM[0]}, [$ctr] @ load counter
		adrl $ctr, .LREVM0SR @ borrow $ctr
		vldmia r12, {@XMM[4]} @ load round0 key
		sub sp, #0x10 @ place for adjusted round0 key
		#endif

		vmov.i32 @XMM[8],#1 @ compose 1<<96
		veor @XMM[9],@XMM[9],@XMM[9]
		@@ -1283,7 +1374,11 @@ bsaes_ctr32_encrypt_blocks:
		@ to flip byte order in 32-bit counter

		vldmia $keysched, {@XMM[9]} @ load round0 key
		#ifndef BSAES_ASM_EXTENDED_KEY
		add r4, $keysched, #0x10 @ pass next round key
		#else
		add r4, $key, #`248+16`
		#endif
		vldmia $ctr, {@XMM[8]} @ .LREVM0SR
		mov r5, $rounds @ pass rounds
		vstmia $fp, {@XMM[10]} @ save next counter
		@@ -1359,13 +1454,18 @@ bsaes_ctr32_encrypt_blocks:
		.Lctr_enc_done:
		vmov.i32 q0, #0
		vmov.i32 q1, #0
		#ifndef BSAES_ASM_EXTENDED_KEY
		.Lctr_enc_bzero: @ wipe key schedule [if any]
		vstmia $keysched!, {q0-q1}
		teq $keysched, $fp
		cmp $keysched, $fp
		bne .Lctr_enc_bzero
		#else
		vstmia $keysched, {q0-q1}
		#endif

		add sp, $fp, #0x10
		vldmia sp!, {d8-d15}
		mov sp, $fp
		add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
		VFP_ABI_POP
		ldmia sp!, {r4-r10, pc} @ return

		.align 4
		@@ -1407,7 +1507,10 @@ bsaes_ctr32_encrypt_blocks:
		subs r6, r6, #1
		bne .Lctr_enc_short_loop

		add sp, sp, #0x20
		vmov.i32 q0, #0
		vmov.i32 q1, #0
		vstmia sp!, {q0-q1}

		ldmia sp!, {r4-r8, pc}
		.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
		___
		@@ -1428,41 +1531,66 @@ $code.=<<___;
		.type bsaes_xts_encrypt,%function
		.align 4
		bsaes_xts_encrypt:
		mov ip, sp
		stmdb sp!, {r4-r10, lr} @ 0x20
		vstmdb sp!, {d8-d15} @ 0x40
		VFP_ABI_PUSH
		mov r6, sp @ future $fp
		sub sp, #0x10 @ 0x10

		mov $inp, r0
		mov $out, r1
		mov $len, r2
		mov $key, r3
		bic sp, #0xf @ align at 16 bytes

		sub r0, sp, #0x10 @ 0x10
		bic r0, #0xf @ align at 16 bytes
		mov sp, r0

		#ifdef XTS_CHAIN_TWEAK
		ldr r0, [ip] @ pointer to input tweak
		#else
		@ generate initial tweak
		ldr r0, [r6, #0x64] @ iv[]
		ldr r0, [ip, #4] @ iv[]
		mov r1, sp
		ldr r2, [r6, #0x60] @ key2
		ldr r2, [ip, #0] @ key2
		bl AES_encrypt
		mov r0,sp @ pointer to initial tweak
		#endif

		@ allocate the key schedule on the stack
		ldr $rounds, [$key, #240] @ get # of rounds
		mov $fp, r6
		mov r0, sp @ pointer to initial tweak
		sub sp, sp, $rounds, lsl#7 @ 128 bytes per inner round key
		@ add sp, sp, #`128-32` @ size of bit-sliced key schedule
		sub sp, sp, #`32+16` @ place for tweak[9]
		#ifndef BSAES_ASM_EXTENDED_KEY
		@ allocate the key schedule on the stack
		sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
		@ add r12, #`128-32` @ size of bit-sliced key schedule
		sub r12, #`32+16` @ place for tweak[9]

		@ populate the key schedule
		mov r4, $key @ pass key
		mov r5, $rounds @ pass # of rounds
		add r12, sp, #0x90 @ pass key schedule
		mov sp, r12
		add r12, #0x90 @ pass key schedule
		bl _bsaes_key_convert
		veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
		vstmia r12, {@XMM[7]} @ save last round key
		#else
		ldr r12, [$key, #244]
		eors r12, #1
		beq 0f

		str r12, [$key, #244]
		mov r4, $key @ pass key
		mov r5, $rounds @ pass # of rounds
		add r12, $key, #248 @ pass key schedule
		bl _bsaes_key_convert
		veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
		vstmia r12, {@XMM[7]}

		.align 2
		0: sub sp, #0x90 @ place for tweak[9]
		#endif

		vld1.8 {@XMM[8]}, [r0] @ initial tweak
		add $magic, $const, #.Lxts_magic-.LM0
		adr $magic, .Lxts_magic

		subs $len, #0x80
		blo .Lxts_enc_short
		@@ -1502,7 +1630,11 @@ $code.=<<___;

		vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
		veor @XMM[5], @XMM[5], @XMM[13]
		#ifndef BSAES_ASM_EXTENDED_KEY
		add r4, sp, #0x90 @ pass key schedule
		#else
		add r4, $key, #248 @ pass key schedule
		#endif
		veor @XMM[6], @XMM[6], @XMM[14]
		mov r5, $rounds @ pass rounds
		veor @XMM[7], @XMM[7], @XMM[15]
		@@ -1567,7 +1699,11 @@ $code.=<<___;

		vld1.8 {@XMM[6]}, [$inp]!
		veor @XMM[5], @XMM[5], @XMM[13]
		#ifndef BSAES_ASM_EXTENDED_KEY
		add r4, sp, #0x90 @ pass key schedule
		#else
		add r4, $key, #248 @ pass key schedule
		#endif
		veor @XMM[6], @XMM[6], @XMM[14]
		mov r5, $rounds @ pass rounds
		mov r0, sp
		@@ -1597,7 +1733,11 @@ $code.=<<___;
		vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak

		veor @XMM[4], @XMM[4], @XMM[12]
		#ifndef BSAES_ASM_EXTENDED_KEY
		add r4, sp, #0x90 @ pass key schedule
		#else
		add r4, $key, #248 @ pass key schedule
		#endif
		veor @XMM[5], @XMM[5], @XMM[13]
		mov r5, $rounds @ pass rounds
		mov r0, sp
		@@ -1619,12 +1759,22 @@ $code.=<<___;

		vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
		b .Lxts_enc_done
		.align 4

		@ put this in range for both ARM and Thumb mode adr instructions
		.align 5
		.Lxts_magic:
		.quad 1, 0x87

		.align 5
		.Lxts_enc_5:
		vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak

		veor @XMM[3], @XMM[3], @XMM[11]
		#ifndef BSAES_ASM_EXTENDED_KEY
		add r4, sp, #0x90 @ pass key schedule
		#else
		add r4, $key, #248 @ pass key schedule
		#endif
		veor @XMM[4], @XMM[4], @XMM[12]
		mov r5, $rounds @ pass rounds
		mov r0, sp
		@@ -1650,7 +1800,11 @@ $code.=<<___;
		vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak

		veor @XMM[2], @XMM[2], @XMM[10]
		#ifndef BSAES_ASM_EXTENDED_KEY
		add r4, sp, #0x90 @ pass key schedule
		#else
		add r4, $key, #248 @ pass key schedule
		#endif
		veor @XMM[3], @XMM[3], @XMM[11]
		mov r5, $rounds @ pass rounds
		mov r0, sp
		@@ -1673,7 +1827,11 @@ $code.=<<___;
		vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak

		veor @XMM[1], @XMM[1], @XMM[9]
		#ifndef BSAES_ASM_EXTENDED_KEY
		add r4, sp, #0x90 @ pass key schedule
		#else
		add r4, $key, #248 @ pass key schedule
		#endif
		veor @XMM[2], @XMM[2], @XMM[10]
		mov r5, $rounds @ pass rounds
		mov r0, sp
		@@ -1695,7 +1853,11 @@ $code.=<<___;
		vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak

		veor @XMM[0], @XMM[0], @XMM[8]
		#ifndef BSAES_ASM_EXTENDED_KEY
		add r4, sp, #0x90 @ pass key schedule
		#else
		add r4, $key, #248 @ pass key schedule
		#endif
		veor @XMM[1], @XMM[1], @XMM[9]
		mov r5, $rounds @ pass rounds
		mov r0, sp
		@@ -1728,6 +1890,7 @@ $code.=<<___;
		vmov @XMM[8], @XMM[9] @ next round tweak

		.Lxts_enc_done:
		#ifndef XTS_CHAIN_TWEAK
		adds $len, #0x10
		beq .Lxts_enc_ret
		sub r6, $out, #0x10
		@@ -1755,18 +1918,25 @@ $code.=<<___;
		veor @XMM[0], @XMM[0], @XMM[8]
		vst1.8 {@XMM[0]}, [r6]
		mov $fp, r4
		#endif

		.Lxts_enc_ret:
		bic r0, $fp, #0xf
		vmov.i32 q0, #0
		vmov.i32 q1, #0
		#ifdef XTS_CHAIN_TWEAK
		ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
		#endif
		.Lxts_enc_bzero: @ wipe key schedule [if any]
		vstmia sp!, {q0-q1}
		teq sp, r0
		cmp sp, r0
		bne .Lxts_enc_bzero

		mov sp, $fp
		vldmia sp!, {d8-d15}
		#ifdef XTS_CHAIN_TWEAK
		vst1.8 {@XMM[8]}, [r1]
		#endif
		VFP_ABI_POP
		ldmia sp!, {r4-r10, pc} @ return

		.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
		@@ -1775,46 +1945,74 @@ $code.=<<___;
		.type bsaes_xts_decrypt,%function
		.align 4
		bsaes_xts_decrypt:
		mov ip, sp
		stmdb sp!, {r4-r10, lr} @ 0x20
		vstmdb sp!, {d8-d15} @ 0x40
		VFP_ABI_PUSH
		mov r6, sp @ future $fp
		sub sp, #0x10 @ 0x10

		mov $inp, r0
		mov $out, r1
		mov $len, r2
		mov $key, r3
		bic sp, #0xf @ align at 16 bytes

		sub r0, sp, #0x10 @ 0x10
		bic r0, #0xf @ align at 16 bytes
		mov sp, r0

		#ifdef XTS_CHAIN_TWEAK
		ldr r0, [ip] @ pointer to input tweak
		#else
		@ generate initial tweak
		ldr r0, [r6, #0x64] @ iv[]
		ldr r0, [ip, #4] @ iv[]
		mov r1, sp
		ldr r2, [r6, #0x60] @ key2
		ldr r2, [ip, #0] @ key2
		bl AES_encrypt
		mov r0, sp @ pointer to initial tweak
		#endif

		@ allocate the key schedule on the stack
		ldr $rounds, [$key, #240] @ get # of rounds
		mov $fp, r6
		mov r0, sp @ pointer to initial tweak
		sub sp, sp, $rounds, lsl#7 @ 128 bytes per inner round key
		@ add sp, sp, #`128-32` @ size of bit-sliced key schedule
		sub sp, sp, #`32+16` @ place for tweak[9]
		#ifndef BSAES_ASM_EXTENDED_KEY
		@ allocate the key schedule on the stack
		sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
		@ add r12, #`128-32` @ size of bit-sliced key schedule
		sub r12, #`32+16` @ place for tweak[9]

		@ populate the key schedule
		mov r4, $key @ pass key
		mov r5, $rounds @ pass # of rounds
		add r12, sp, #0x90 @ pass key schedule
		mov sp, r12
		add r12, #0x90 @ pass key schedule
		bl _bsaes_key_convert
		add r4, sp, #0x90
		vldmia r4, {@XMM[6]}
		vstmia r12, {@XMM[15]} @ save last round key
		veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
		vstmia r4, {@XMM[7]}
		#else
		ldr r12, [$key, #244]
		eors r12, #1
		beq 0f

		str r12, [$key, #244]
		mov r4, $key @ pass key
		mov r5, $rounds @ pass # of rounds
		add r12, $key, #248 @ pass key schedule
		bl _bsaes_key_convert
		add r4, $key, #248
		vldmia r4, {@XMM[6]}
		vstmia r12, {@XMM[15]} @ save last round key
		veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
		vstmia r4, {@XMM[7]}

		.align 2
		0: sub sp, #0x90 @ place for tweak[9]
		#endif
		vld1.8 {@XMM[8]}, [r0] @ initial tweak
		add $magic, $const, #.Lxts_magic-.LM0
		adr $magic, .Lxts_magic

		tst $len, #0xf @ if not multiple of 16
		it ne @ Thumb2 thing, sanity check in ARM
		subne $len, #0x10 @ subtract another 16 bytes
		subs $len, #0x80

		@@ -1855,7 +2053,11 @@ $code.=<<___;

		vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
		veor @XMM[5], @XMM[5], @XMM[13]
		#ifndef BSAES_ASM_EXTENDED_KEY
		add r4, sp, #0x90 @ pass key schedule
		#else
		add r4, $key, #248 @ pass key schedule
		#endif
		veor @XMM[6], @XMM[6], @XMM[14]
		mov r5, $rounds @ pass rounds
		veor @XMM[7], @XMM[7], @XMM[15]
		@@ -1920,7 +2122,11 @@ $code.=<<___;

		vld1.8 {@XMM[6]}, [$inp]!
		veor @XMM[5], @XMM[5], @XMM[13]
		#ifndef BSAES_ASM_EXTENDED_KEY
		add r4, sp, #0x90 @ pass key schedule
		#else
		add r4, $key, #248 @ pass key schedule
		#endif
		veor @XMM[6], @XMM[6], @XMM[14]
		mov r5, $rounds @ pass rounds
		mov r0, sp
		@@ -1950,7 +2156,11 @@ $code.=<<___;
		vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak

		veor @XMM[4], @XMM[4], @XMM[12]
		#ifndef BSAES_ASM_EXTENDED_KEY
		add r4, sp, #0x90 @ pass key schedule
		#else
		add r4, $key, #248 @ pass key schedule
		#endif
		veor @XMM[5], @XMM[5], @XMM[13]
		mov r5, $rounds @ pass rounds
		mov r0, sp
		@@ -1977,7 +2187,11 @@ $code.=<<___;
		vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak

		veor @XMM[3], @XMM[3], @XMM[11]
		#ifndef BSAES_ASM_EXTENDED_KEY
		add r4, sp, #0x90 @ pass key schedule
		#else
		add r4, $key, #248 @ pass key schedule
		#endif
		veor @XMM[4], @XMM[4], @XMM[12]
		mov r5, $rounds @ pass rounds
		mov r0, sp
		@@ -2003,7 +2217,11 @@ $code.=<<___;
		vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak

		veor @XMM[2], @XMM[2], @XMM[10]
		#ifndef BSAES_ASM_EXTENDED_KEY
		add r4, sp, #0x90 @ pass key schedule
		#else
		add r4, $key, #248 @ pass key schedule
		#endif
		veor @XMM[3], @XMM[3], @XMM[11]
		mov r5, $rounds @ pass rounds
		mov r0, sp
		@@ -2026,7 +2244,11 @@ $code.=<<___;
		vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak

		veor @XMM[1], @XMM[1], @XMM[9]
		#ifndef BSAES_ASM_EXTENDED_KEY
		add r4, sp, #0x90 @ pass key schedule
		#else
		add r4, $key, #248 @ pass key schedule
		#endif
		veor @XMM[2], @XMM[2], @XMM[10]
		mov r5, $rounds @ pass rounds
		mov r0, sp
		@@ -2048,7 +2270,11 @@ $code.=<<___;
		vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak

		veor @XMM[0], @XMM[0], @XMM[8]
		#ifndef BSAES_ASM_EXTENDED_KEY
		add r4, sp, #0x90 @ pass key schedule
		#else
		add r4, $key, #248 @ pass key schedule
		#endif
		veor @XMM[1], @XMM[1], @XMM[9]
		mov r5, $rounds @ pass rounds
		mov r0, sp
		@@ -2083,6 +2309,7 @@ $code.=<<___;
		vmov @XMM[8], @XMM[9] @ next round tweak

		.Lxts_dec_done:
		#ifndef XTS_CHAIN_TWEAK
		adds $len, #0x10
		beq .Lxts_dec_ret

		@@ -2132,18 +2359,25 @@ $code.=<<___;
		veor @XMM[0], @XMM[0], @XMM[8]
		vst1.8 {@XMM[0]}, [r6]
		mov $fp, r4
		#endif

		.Lxts_dec_ret:
		bic r0, $fp, #0xf
		vmov.i32 q0, #0
		vmov.i32 q1, #0
		#ifdef XTS_CHAIN_TWEAK
		ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
		#endif
		.Lxts_dec_bzero: @ wipe key schedule [if any]
		vstmia sp!, {q0-q1}
		teq sp, r0
		cmp sp, r0
		bne .Lxts_dec_bzero

		mov sp, $fp
		vldmia sp!, {d8-d15}
		#ifdef XTS_CHAIN_TWEAK
		vst1.8 {@XMM[8]}, [r1]
		#endif
		VFP_ABI_POP
		ldmia sp!, {r4-r10, pc} @ return

		.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
		@@ -2155,6 +2389,14 @@ ___

		# Replace every `...` span in the generated text with the result of
		# evaluating its contents as a Perl expression (e.g. `128-32` or
		# `248+16` used as immediates in the assembly above).
		$code =~ s/\`([^\`]*)\`/eval($1)/gem;

		# Copy this script's own leading comment header into the generated
		# output, rewriting the leading '#' of each line to '@' (the comment
		# marker used throughout the assembly emitted in $code).
		# NOTE(review): the result of open is not checked here.
		open SELF,$0;
		while(<SELF>) {
		# Skip the "#!" interpreter line; it is not part of the header text.
		next if (/^#!/);
		# The substitution doubles as the test: stop at the first line that
		# neither starts with '#' (substitution fails) nor is empty.
		last if (!s/^#/@/ and !/^$/);
		# Emit the rewritten comment line, or the empty line, unchanged.
		print;
		}
		close SELF;

		print $code;

		close STDOUT;