crypto/aes/asm/bsaes-armv7.pl  +430 −1

@@ -715,6 +715,8 @@ _bsaes_const:
.quad 0x0304090e00050a0f, 0x01060b0c0207080d
.LM0:
.quad 0x02060a0e03070b0f, 0x0004080c0105090d
.LREVM0SR:
.quad 0x090d02060c030708, 0x00040b0f050a0e01
.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 6
.size _bsaes_const,.-_bsaes_const

@@ -727,6 +729,7 @@ _bsaes_encrypt8:
    sub $const,$const,#_bsaes_encrypt8-.LM0SR
    vldmia $const!, {@XMM[8]} @ .LM0SR
_bsaes_encrypt8_alt:
    veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
    veor @XMM[11], @XMM[1], @XMM[9]
    vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`

@@ -879,7 +882,7 @@ _bsaes_key_convert:
___
}

if (1) { # following four functions are unsupported interface
if (0) { # following four functions are unsupported interface
         # used for benchmarking...
$code.=<<___;
.globl bsaes_enc_key_convert

@@ -981,6 +984,432 @@ bsaes_decrypt_128:
.size bsaes_decrypt_128,.-bsaes_decrypt_128
___
}
{
my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
my ($keysched)=("sp");

$code.=<<___;
.extern AES_cbc_encrypt
.extern AES_decrypt

.global bsaes_cbc_encrypt
.type bsaes_cbc_encrypt,%function
.align 5
bsaes_cbc_encrypt:
    cmp $len, #128
    blo AES_cbc_encrypt

    @ it is up to the caller to make sure we are called with enc == 0

    stmdb sp!, {r4-r10, lr}
    vstmdb sp!, {d8-d15} @ ABI specification says so
    ldr $ivp, [sp, #0x60] @ IV is 1st arg on the stack
    mov $len, $len, lsr#4 @ len in 16 byte blocks
    sub sp, #0x10 @ scratch space to carry over the IV
    mov $fp, sp @ save sp

    @ allocate the key schedule on the stack
    ldr $rounds, [$key, #240] @ get # of rounds
    sub sp, sp, $rounds, lsl#7 @ 128 bytes per inner round key
    add sp, sp, #`128-32` @ size of bit-sliced key schedule

    @ populate the key schedule
    mov r4, $key @ pass key
    mov r5, $rounds @ pass # of rounds
    mov r12, $keysched @ pass key schedule
    bl _bsaes_key_convert
    vldmia $keysched, {@XMM[6]}
    vstmia r12, {@XMM[15]} @ save last round key
    veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
    vstmia $keysched, {@XMM[7]}

    vld1.8 {@XMM[15]}, [$ivp] @ load IV
    b .Lcbc_dec_loop

.align 4
.Lcbc_dec_loop:
    subs $len, $len, #0x8
    bmi .Lcbc_dec_loop_finish

    vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
    vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
    mov r4, $keysched @ pass the key
    vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
    mov r5, $rounds
    vld1.8 {@XMM[6]-@XMM[7]}, [$inp]
    sub $inp, $inp, #0x60
    vstmia $fp, {@XMM[15]} @ put aside IV

    bl _bsaes_decrypt8

    vldmia $fp, {@XMM[14]} @ reload IV
    vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
    veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
    vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
    veor @XMM[1], @XMM[1], @XMM[8]
    veor @XMM[6], @XMM[6], @XMM[9]
    vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
    veor @XMM[4], @XMM[4], @XMM[10]
    veor @XMM[2], @XMM[2], @XMM[11]
    vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
    veor @XMM[7], @XMM[7], @XMM[12]
    vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
    veor @XMM[3], @XMM[3], @XMM[13]
    vst1.8 {@XMM[6]}, [$out]!
    veor @XMM[5], @XMM[5], @XMM[14]
    vst1.8 {@XMM[4]}, [$out]!
    vst1.8 {@XMM[2]}, [$out]!
    vst1.8 {@XMM[7]}, [$out]!
    vst1.8 {@XMM[3]}, [$out]!
    vst1.8 {@XMM[5]}, [$out]!

    b .Lcbc_dec_loop

.Lcbc_dec_loop_finish:
    adds $len, $len, #8
    beq .Lcbc_dec_done

    vld1.8 {@XMM[0]}, [$inp]! @ load input
    cmp $len, #2
    blo .Lcbc_dec_one
    vld1.8 {@XMM[1]}, [$inp]!
    mov r4, $keysched @ pass the key
    mov r5, $rounds
    vstmia $fp, {@XMM[15]} @ put aside IV
    beq .Lcbc_dec_two
    vld1.8 {@XMM[2]}, [$inp]!
    cmp $len, #4
    blo .Lcbc_dec_three
    vld1.8 {@XMM[3]}, [$inp]!
    beq .Lcbc_dec_four
    vld1.8 {@XMM[4]}, [$inp]!
    cmp $len, #6
    blo .Lcbc_dec_five
    vld1.8 {@XMM[5]}, [$inp]!
    beq .Lcbc_dec_six
    vld1.8 {@XMM[6]}, [$inp]!
    sub $inp, $inp, #0x70

    bl _bsaes_decrypt8

    vldmia $fp, {@XMM[14]} @ reload IV
    vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
    veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
    vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
    veor @XMM[1], @XMM[1], @XMM[8]
    veor @XMM[6], @XMM[6], @XMM[9]
    vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
    veor @XMM[4], @XMM[4], @XMM[10]
    veor @XMM[2], @XMM[2], @XMM[11]
    vld1.8 {@XMM[15]}, [$inp]!
    veor @XMM[7], @XMM[7], @XMM[12]
    vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
    veor @XMM[3], @XMM[3], @XMM[13]
    vst1.8 {@XMM[6]}, [$out]!
    vst1.8 {@XMM[4]}, [$out]!
    vst1.8 {@XMM[2]}, [$out]!
    vst1.8 {@XMM[7]}, [$out]!
    vst1.8 {@XMM[3]}, [$out]!
    b .Lcbc_dec_done

.align 4
.Lcbc_dec_six:
    sub $inp, $inp, #0x60

    bl _bsaes_decrypt8

    vldmia $fp, {@XMM[14]} @ reload IV
    vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
    veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
    vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
    veor @XMM[1], @XMM[1], @XMM[8]
    veor @XMM[6], @XMM[6], @XMM[9]
    vld1.8 {@XMM[12]}, [$inp]!
    veor @XMM[4], @XMM[4], @XMM[10]
    veor @XMM[2], @XMM[2], @XMM[11]
    vld1.8 {@XMM[15]}, [$inp]!
    veor @XMM[7], @XMM[7], @XMM[12]
    vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
    vst1.8 {@XMM[6]}, [$out]!
    vst1.8 {@XMM[4]}, [$out]!
    vst1.8 {@XMM[2]}, [$out]!
    vst1.8 {@XMM[7]}, [$out]!
    b .Lcbc_dec_done

.align 4
.Lcbc_dec_five:
    sub $inp, $inp, #0x50

    bl _bsaes_decrypt8

    vldmia $fp, {@XMM[14]} @ reload IV
    vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
    veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
    vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
    veor @XMM[1], @XMM[1], @XMM[8]
    veor @XMM[6], @XMM[6], @XMM[9]
    vld1.8 {@XMM[15]}, [$inp]!
    veor @XMM[4], @XMM[4], @XMM[10]
    vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
    veor @XMM[2], @XMM[2], @XMM[11]
    vst1.8 {@XMM[6]}, [$out]!
    vst1.8 {@XMM[4]}, [$out]!
    vst1.8 {@XMM[2]}, [$out]!
    b .Lcbc_dec_done

.align 4
.Lcbc_dec_four:
    sub $inp, $inp, #0x40

    bl _bsaes_decrypt8

    vldmia $fp, {@XMM[14]} @ reload IV
    vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
    veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
    vld1.8 {@XMM[10]}, [$inp]!
    veor @XMM[1], @XMM[1], @XMM[8]
    veor @XMM[6], @XMM[6], @XMM[9]
    vld1.8 {@XMM[15]}, [$inp]!
    veor @XMM[4], @XMM[4], @XMM[10]
    vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
    vst1.8 {@XMM[6]}, [$out]!
    vst1.8 {@XMM[4]}, [$out]!
    b .Lcbc_dec_done

.align 4
.Lcbc_dec_three:
    sub $inp, $inp, #0x30

    bl _bsaes_decrypt8

    vldmia $fp, {@XMM[14]} @ reload IV
    vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
    veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
    vld1.8 {@XMM[15]}, [$inp]!
    veor @XMM[1], @XMM[1], @XMM[8]
    veor @XMM[6], @XMM[6], @XMM[9]
    vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
    vst1.8 {@XMM[6]}, [$out]!
    b .Lcbc_dec_done

.align 4
.Lcbc_dec_two:
    sub $inp, $inp, #0x20

    bl _bsaes_decrypt8

    vldmia $fp, {@XMM[14]} @ reload IV
    vld1.8 {@XMM[8]}, [$inp]! @ reload input
    veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
    vld1.8 {@XMM[15]}, [$inp]! @ reload input
    veor @XMM[1], @XMM[1], @XMM[8]
    vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
    b .Lcbc_dec_done

.align 4
.Lcbc_dec_one:
    sub $inp, $inp, #0x10
    mov $rounds, $out @ save original out pointer
    mov $out, $fp @ use the iv scratch space as out buffer
    mov r2, $key
    vmov @XMM[4],@XMM[15] @ just in case ensure that IV
    vmov @XMM[5],@XMM[0] @ and input are preserved
    bl AES_decrypt
    vld1.8 {@XMM[0]}, [$fp,:64] @ load result
    veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV
    vmov @XMM[15], @XMM[5] @ @XMM[5] holds input
    vst1.8 {@XMM[0]}, [$rounds] @ write output

.Lcbc_dec_done:
    vmov.i32 q0, #0
    vmov.i32 q1, #0
.Lcbc_dec_bzero: @ wipe key schedule [if any]
    vstmia $keysched!, {q0-q1}
    teq $keysched, $fp
    bne .Lcbc_dec_bzero

    add sp, $fp, #0x10
    vst1.8 {@XMM[15]}, [$ivp] @ return IV
    vldmia sp!, {d8-d15}
    ldmia sp!, {r4-r10, pc}
.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
___
}
{
my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
my $const = "r6"; # shared with _bsaes_encrypt8_alt
my $keysched = "sp";

$code.=<<___;
.extern AES_encrypt

.global bsaes_ctr32_encrypt_blocks
.type bsaes_ctr32_encrypt_blocks,%function
.align 5
bsaes_ctr32_encrypt_blocks:
    cmp $len, #8 @ use plain AES for
    blo .Lctr_enc_short @ small sizes

    stmdb sp!, {r4-r10, lr}
    vstmdb sp!, {d8-d15} @ ABI specification says so
    ldr $ctr, [sp, #0x60] @ ctr is 1st arg on the stack
    sub sp, sp, #0x10 @ scratch space to carry over the ctr
    mov $fp, sp @ save sp

    @ allocate the key schedule on the stack
    ldr $rounds, [$key, #240] @ get # of rounds
    sub sp, sp, $rounds, lsl#7 @ 128 bytes per inner round key
    add sp, sp, #`128-32` @ size of bit-sliced key schedule

    @ populate the key schedule
    mov r4, $key @ pass key
    mov r5, $rounds @ pass # of rounds
    mov r12, $keysched @ pass key schedule
    bl _bsaes_key_convert
    veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
    vstmia r12, {@XMM[7]} @ save last round key

    vld1.8 {@XMM[0]}, [$ctr] @ load counter
    add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr
    vldmia $keysched, {@XMM[4]} @ load round0 key

    vmov.i32 `&Dhi("@XMM[8]")`,#1 @ compose 1<<96
    vmov.i32 `&Dlo("@XMM[8]")`,#0
    vrev32.8 `&Dhi("@XMM[0]")`,`&Dhi("@XMM[0]")`
    vshl.u64 `&Dhi("@XMM[8]")`,#32
    vrev32.8 `&Dhi("@XMM[4]")`,`&Dhi("@XMM[4]")`
    vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
    vstmia $keysched, {@XMM[4]} @ save adjusted round0 key
    b .Lctr_enc_loop

.align 4
.Lctr_enc_loop:
    vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96
    vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1
    vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2
    vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3
    vadd.u32 @XMM[4], @XMM[1], @XMM[10]
    vadd.u32 @XMM[5], @XMM[2], @XMM[10]
    vadd.u32 @XMM[6], @XMM[3], @XMM[10]
    vadd.u32 @XMM[7], @XMM[4], @XMM[10]
    vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter

    @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
    @ to flip byte order in 32-bit counter

    vldmia $keysched, {@XMM[9]} @ load round0 key
    add r4, $keysched, #0x10 @ pass next round key
    vldmia $ctr, {@XMM[8]} @ .LREVM0SR
    mov r5, $rounds @ pass rounds
    vstmia $fp, {@XMM[10]} @ save next counter
    sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants

    bl _bsaes_encrypt8_alt

    subs $len, $len, #8
    blo .Lctr_enc_loop_done

    vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input
    vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
    veor @XMM[0], @XMM[8]
    veor @XMM[1], @XMM[9]
    vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
    veor @XMM[4], @XMM[10]
    veor @XMM[6], @XMM[11]
    vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
    veor @XMM[3], @XMM[12]
    vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
    veor @XMM[7], @XMM[13]
    veor @XMM[2], @XMM[14]
    vst1.8 {@XMM[4]}, [$out]!
    veor @XMM[5], @XMM[15]
    vst1.8 {@XMM[6]}, [$out]!
    vmov.i32 `&Dhi("@XMM[8]")`,#1 @ compose 1<<96
    vst1.8 {@XMM[3]}, [$out]!
    vmov.i32 `&Dlo("@XMM[8]")`,#0
    vst1.8 {@XMM[7]}, [$out]!
    vshl.u64 `&Dhi("@XMM[8]")`,#32
    vst1.8 {@XMM[2]}, [$out]!
    vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
    vst1.8 {@XMM[5]}, [$out]!
    vldmia $fp, {@XMM[0]} @ load counter

    bne .Lctr_enc_loop
    b .Lctr_enc_done

.align 4
.Lctr_enc_loop_done:
    add $len, $len, #8
    vld1.8 {@XMM[8]}, [$inp]! @ load input
    veor @XMM[0], @XMM[8]
    vst1.8 {@XMM[0]}, [$out]! @ write output
    cmp $len, #2
    blo .Lctr_enc_done
    vld1.8 {@XMM[9]}, [$inp]!
    veor @XMM[1], @XMM[9]
    vst1.8 {@XMM[1]}, [$out]!
    beq .Lctr_enc_done
    vld1.8 {@XMM[10]}, [$inp]!
    veor @XMM[4], @XMM[10]
    vst1.8 {@XMM[4]}, [$out]!
    cmp $len, #4
    blo .Lctr_enc_done
    vld1.8 {@XMM[11]}, [$inp]!
    veor @XMM[6], @XMM[11]
    vst1.8 {@XMM[6]}, [$out]!
    beq .Lctr_enc_done
    vld1.8 {@XMM[12]}, [$inp]!
    veor @XMM[3], @XMM[12]
    vst1.8 {@XMM[3]}, [$out]!
    cmp $len, #6
    blo .Lctr_enc_done
    vld1.8 {@XMM[13]}, [$inp]!
    veor @XMM[7], @XMM[13]
    vst1.8 {@XMM[7]}, [$out]!
    beq .Lctr_enc_done
    vld1.8 {@XMM[14]}, [$inp]
    veor @XMM[2], @XMM[14]
    vst1.8 {@XMM[2]}, [$out]!

.Lctr_enc_done:
    vmov.i32 q0, #0
    vmov.i32 q1, #0
.Lctr_enc_bzero: @ wipe key schedule [if any]
    vstmia $keysched!, {q0-q1}
    teq $keysched, $fp
    bne .Lctr_enc_bzero

    add sp, $fp, #0x10
    vldmia sp!, {d8-d15}
    ldmia sp!, {r4-r10, pc} @ return

.align 4
.Lctr_enc_short:
    ldr ip, [sp] @ ctr pointer is passed on stack
    stmdb sp!, {r4-r8, lr}

    mov r4, $inp @ copy arguments
    mov r5, $out
    mov r6, $len
    mov r7, $key
    ldr r8, [ip, #12] @ load counter LSW
    vld1.8 {@XMM[1]}, [ip] @ load whole counter value
#ifdef __ARMEL__
    rev r8, r8
#endif
    sub sp, sp, #0x10
    vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value
    sub sp, sp, #0x10

.Lctr_enc_short_loop:
    add r0, sp, #0x10 @ input counter value
    mov r1, sp @ output on the stack
    mov r2, r7 @ key

    bl AES_encrypt

    vld1.8 {@XMM[0]}, [r4]! @ load input
    vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter
    add r8, r8, #1
#ifdef __ARMEL__
    rev r0, r8
    str r0, [sp, #0x1c] @ next counter value
#else
    str r8, [sp, #0x1c] @ next counter value
#endif
    veor @XMM[0],@XMM[0],@XMM[1]
    vst1.8 {@XMM[0]}, [r5]! @ store output
    subs r6, r6, #1
    bne .Lctr_enc_short_loop

    add sp, sp, #0x20
    ldmia sp!, {r4-r8, pc}
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
}
$code.=<<___;
#endif
___

crypto/evp/e_aes.c  +10 −0

@@ -892,6 +892,14 @@ static const EVP_CIPHER aes_##keylen##_##mode = { \
    NULL,NULL,aes_##mode##_ctrl,NULL }; \
const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
{ return &aes_##keylen##_##mode; }
#endif

#if defined(AES_ASM) && defined(BSAES_ASM) && (defined(__arm__) || defined(__arm))
#include "arm_arch.h"
#if __ARM_ARCH__>=7
#define BSAES_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
#endif
#endif

#define BLOCK_CIPHER_generic_pack(nid,keylen,flags) \

@@ -1624,11 +1632,13 @@ static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
    xctx->stream = NULL;
#endif
    /* key_len is two AES keys */
#if !(defined(__arm__) || defined(__arm)) /* not yet? */
#ifdef BSAES_CAPABLE
    if (BSAES_CAPABLE)
        xctx->stream = enc ? bsaes_xts_encrypt : bsaes_xts_decrypt;
    else
#endif
#endif
#ifdef VPAES_CAPABLE
    if (VPAES_CAPABLE)
        {
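
For reference, here is a rough C-side view of the two entry points this patch adds. These prototypes are not part of the diff itself; they are a hedged sketch of what a caller such as crypto/evp/e_aes.c would be expected to declare for the bsaes_* interface, so the exact parameter names and types are assumptions:

#include <stddef.h>
#include <openssl/aes.h>

/* CBC path: decrypt only (the assembly assumes enc == 0) and length is in
 * bytes; inputs shorter than 128 bytes fall through to AES_cbc_encrypt,
 * matching the "cmp $len, #128 / blo AES_cbc_encrypt" check above. */
void bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
                       size_t length, const AES_KEY *key,
                       unsigned char ivec[16], int enc);

/* CTR path: len is counted in 16-byte blocks and ivec is the 16-byte counter
 * block whose last word is a 32-bit big-endian counter; fewer than 8 blocks
 * are handled by the scalar AES_encrypt loop at .Lctr_enc_short. */
void bsaes_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
                                size_t len, const AES_KEY *key,
                                const unsigned char ivec[16]);

With BSAES_CAPABLE defined as in the e_aes.c hunk above, the EVP layer can route work to these routines at run time on NEON-capable cores, while the XTS hook is deliberately kept disabled on ARM by the added #if !(defined(__arm__) || defined(__arm)) guard.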
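
A short note on the counter handling in .Lctr_enc_loop: the 32-bit big-endian counter word is byte-reversed once up front (vrev32.8), bumped with vector adds of constants built as 1<<96, 2<<96 and 3<<96 (the "compose" comments), and only flipped back to big-endian inside the borrowed _bsaes_encrypt8 prologue via .LREVM0SR. The scalar sketch below illustrates the same bookkeeping; it is an illustration only, not part of the patch (ctr32_add is a hypothetical helper, and a little-endian host is assumed, as in the __ARMEL__ case):

#include <stdint.h>
#include <string.h>

/* Byte-reverse a 32-bit word, the scalar analogue of rev / vrev32.8. */
static uint32_t rev32(uint32_t v)
{
    return (v >> 24) | ((v >> 8) & 0x0000ff00u) |
           ((v << 8) & 0x00ff0000u) | (v << 24);
}

/* Advance the big-endian counter stored in the last 4 bytes of the 16-byte
 * counter block by n, which is what adding n<<96 to the byte-reversed block
 * achieves in the NEON code. */
static void ctr32_add(unsigned char ivec[16], uint32_t n)
{
    uint32_t c;
    memcpy(&c, ivec + 12, 4);   /* big-endian bytes as stored in memory   */
    c = rev32(rev32(c) + n);    /* to host order, add, back to big-endian */
    memcpy(ivec + 12, &c, 4);
}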