Loading crypto/aes/asm/aesni-x86_64.pl +313 −343 Original line number Diff line number Diff line Loading @@ -130,7 +130,7 @@ # Further data for other parallelizable modes: # # CBC decrypt 1.16 0.93 0.93 # CTR 1.14 0.91 0.86 # CTR 1.14 0.91 0.77 # # Well, given 3x column it's probably inappropriate to call the limit # asymptotic, if it can be surpassed, isn't it? What happens there? Loading Loading @@ -160,7 +160,7 @@ ###################################################################### # For reference, AMD Bulldozer spends 5.77 cycles per byte processed # with 128-bit key in CBC encrypt and 0.76 cycles in CBC decrypt, 0.70 # in ECB, 0.73 in CTR, 0.95 in XTS... This means that aes[enc|dec] # in ECB, 0.71 in CTR, 0.95 in XTS... This means that aes[enc|dec] # instruction latency is 9 cycles and that they can be issued every # cycle. Loading Loading @@ -1011,385 +1011,389 @@ ___ # const char *ivec); # # Handles only complete blocks, operates on 32-bit counter and # does not update *ivec! (see engine/eng_aesni.c for details) # does not update *ivec! (see crypto/modes/ctr128.c for details) # # Overhaul based on suggestions from Shay Gueron and Vlad Krasnov, # http://rt.openssl.org/Ticket/Display.html?id=3031&user=guest&pass=guest. # Keywords are full unroll and modulo-schedule counter calculations # with zero-round key xor. { my ($in0,$in1,$in2,$in3,$one,$ivec)=map("%xmm$_",(10..15)); my $len_="%r9"; my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15)); my ($key0,$ctr)=("${key_}d","${ivp}d"); my $frame_size = 0x80 + ($win64?160:0); $code.=<<___; .globl aesni_ctr32_encrypt_blocks .type aesni_ctr32_encrypt_blocks,\@function,5 .align 16 aesni_ctr32_encrypt_blocks: lea (%rsp),%rax push %rbp sub \$$frame_size,%rsp and \$-16,%rsp # Linux kernel stack can be incorrectly seeded ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,0x00(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) movaps %xmm10,0x40(%rsp) movaps %xmm11,0x50(%rsp) movaps %xmm12,0x60(%rsp) movaps %xmm13,0x70(%rsp) movaps %xmm14,0x80(%rsp) movaps %xmm15,0x90(%rsp) movaps %xmm6,-0xa8(%rax) movaps %xmm7,-0x98(%rax) movaps %xmm8,-0x88(%rax) movaps %xmm9,-0x78(%rax) movaps %xmm10,-0x68(%rax) movaps %xmm11,-0x58(%rax) movaps %xmm12,-0x48(%rax) movaps %xmm13,-0x38(%rax) movaps %xmm14,-0x28(%rax) movaps %xmm15,-0x18(%rax) .Lctr32_body: ___ $code.=<<___; lea -8(%rax),%rbp cmp \$1,$len je .Lctr32_one_shortcut movzb 15($ivp),%rax # counter LSB mov $len,$len_ # backup $len mov 240($key),$rnds_ # key->rounds mov $key,$key_ # backup $key movdqu ($ivp),$ivec neg %rax movdqa .Lincrement1(%rip),$one add \$256,%rax # steps to closest overflow .Lctr32_grandloop: cmp %rax,$len cmova %rax,$len mov $rnds_,$rounds # restore $rounds sub $len,$len_ movdqu ($ivp),$inout0 movdqu ($key),$rndkey0 mov 12($ivp),$ctr # counter LSB pxor $rndkey0,$inout0 mov 12($key),$key0 # 0-round key LSB movdqa $inout0,0x00(%rsp) # populate counter block bswap $ctr movdqa $inout0,0x10(%rsp) movdqa $inout0,0x20(%rsp) movdqa $inout0,0x30(%rsp) movdqa $inout0,0x40(%rsp) movdqa $inout0,0x50(%rsp) movdqa $inout0,0x60(%rsp) movdqa $inout0,0x70(%rsp) mov 240($key),$rounds # key->rounds lea 1($ctr),%r9 lea 2($ctr),%r10 bswap %r9d bswap %r10d xor $key0,%r9d xor $key0,%r10d mov %r9d,0x10+12(%rsp) lea 3($ctr),%r9 mov %r10d,0x20+12(%rsp) bswap %r9d lea 4($ctr),%r10 xor $key0,%r9d bswap %r10d mov %r9d,0x30+12(%rsp) xor $key0,%r10d lea 5($ctr),%r9 mov %r10d,0x40+12(%rsp) bswap %r9d lea 6($ctr),%r10 xor $key0,%r9d bswap %r10d mov %r9d,0x50+12(%rsp) xor $key0,%r10d lea 7($ctr),%r9 mov %r10d,0x60+12(%rsp) bswap %r9d xor $key0,%r9d mov %r9d,0x70+12(%rsp) $movkey 0x10($key),$rndkey1 movdqa 0x10(%rsp),$inout1 movdqa 0x20(%rsp),$inout2 movdqa 0x30(%rsp),$inout3 movdqa 0x40(%rsp),$inout4 movdqa 0x50(%rsp),$inout5 cmp \$8,$len jb .Lctr32_tail $movkey ($key_),$rndkey0 shr \$1,$rounds shr \$1,$rnds_ movdqa $rndkey0,$inout0 movdqa $rndkey0,$inout1 movdqa $rndkey0,$inout2 movdqa $rndkey0,$inout3 movdqa $rndkey0,$inout4 movdqa $rndkey0,$inout5 movdqa $rndkey0,$inout6 movdqa $rndkey0,$inout7 $movkey 16($key_),$rndkey1 lea 0x80($key),$key # size optimization sub \$8,$len jmp .Lctr32_loop8 .align 16 .align 32 .Lctr32_loop8: pxor $ivec,$inout0 paddb $one,$ivec add \$8,$ctr movdqa 0x60(%rsp),$inout6 aesenc $rndkey1,$inout0 pxor $ivec,$inout1 paddb $one,$ivec lea 32($key_),$key mov $ctr,%r9d movdqa 0x70(%rsp),$inout7 aesenc $rndkey1,$inout1 pxor $ivec,$inout2 paddb $one,$ivec bswap %r9d $movkey 0x20-0x80($key),$rndkey0 aesenc $rndkey1,$inout2 pxor $ivec,$inout3 paddb $one,$ivec xor $key0,%r9d aesenc $rndkey1,$inout3 pxor $ivec,$inout4 paddb $one,$ivec mov %r9d,0x00+12(%rsp) lea 1($ctr),%r9 aesenc $rndkey1,$inout4 pxor $ivec,$inout5 paddb $one,$ivec aesenc $rndkey1,$inout5 pxor $ivec,$inout6 paddb $one,$ivec $movkey ($key),$rndkey0 aesenc $rndkey1,$inout6 pxor $ivec,$inout7 paddb $one,$ivec dec $rounds aesenc $rndkey1,$inout7 $movkey 16($key),$rndkey1 $movkey 0x30-0x80($key),$rndkey1 ___ for($i=2;$i<8;$i++) { my $rndkeyx = ($i&1)?$rndkey1:$rndkey0; $code.=<<___; aesenc $rndkeyx,$inout0 aesenc $rndkeyx,$inout1 bswap %r9d aesenc $rndkeyx,$inout2 xor $key0,%r9d aesenc $rndkeyx,$inout3 mov %r9d,`0x10*($i-1)`+12(%rsp) lea $i($ctr),%r9 aesenc $rndkeyx,$inout4 aesenc $rndkeyx,$inout5 aesenc $rndkeyx,$inout6 aesenc $rndkeyx,$inout7 $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx ___ } $code.=<<___; aesenc $rndkey0,$inout0 aesenc $rndkey0,$inout1 bswap %r9d aesenc $rndkey0,$inout2 xor $key0,%r9d aesenc $rndkey0,$inout3 mov %r9d,0x70+12(%rsp) aesenc $rndkey0,$inout4 aesenc $rndkey0,$inout5 aesenc $rndkey0,$inout6 movdqu 0x00($inp),$in0 aesenc $rndkey0,$inout7 $movkey 0xa0-0x80($key),$rndkey0 cmp \$11,$rounds jb .Lctr32_enc_done aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 aesenc $rndkey1,$inout2 aesenc $rndkey1,$inout3 aesenc $rndkey1,$inout4 aesenc $rndkey1,$inout5 aesenc $rndkey1,$inout6 aesenc $rndkey1,$inout7 $movkey 0xb0-0x80($key),$rndkey1 aesenc $rndkey0,$inout0 aesenc $rndkey0,$inout1 lea 32($key),$key aesenc $rndkey0,$inout2 movups ($inp),$in0 # load input aesenc $rndkey0,$inout3 movups 0x10($inp),$in1 aesenc $rndkey0,$inout4 movups 0x20($inp),$in2 aesenc $rndkey0,$inout5 movups 0x30($inp),$in3 aesenc $rndkey0,$inout6 movups 0x40($inp),$one aesenc $rndkey0,$inout7 $movkey ($key),$rndkey0 $movkey 0xc0-0x80($key),$rndkey0 je .Lctr32_enc_done .Lctr32_enc_loop8: aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 dec $rounds aesenc $rndkey1,$inout2 aesenc $rndkey1,$inout3 aesenc $rndkey1,$inout4 aesenc $rndkey1,$inout5 aesenc $rndkey1,$inout6 aesenc $rndkey1,$inout7 $movkey 16($key),$rndkey1 $movkey 0xd0-0x80($key),$rndkey1 aesenc $rndkey0,$inout0 aesenc $rndkey0,$inout1 lea 32($key),$key aesenc $rndkey0,$inout2 aesenc $rndkey0,$inout3 aesenc $rndkey0,$inout4 aesenc $rndkey0,$inout5 aesenc $rndkey0,$inout6 aesenc $rndkey0,$inout7 $movkey ($key),$rndkey0 jnz .Lctr32_enc_loop8 $movkey 0xe0-0x80($key),$rndkey0 .Lctr32_enc_done: aesenc $rndkey1,$inout0 movdqu 0x10($inp),$in1 pxor $rndkey0,$in0 aesenc $rndkey1,$inout1 movdqu 0x20($inp),$in2 pxor $rndkey0,$in1 aesenc $rndkey1,$inout2 movdqu 0x30($inp),$in3 pxor $rndkey0,$in2 aesenc $rndkey1,$inout3 movdqu 0x40($inp),$in4 pxor $rndkey0,$in3 aesenc $rndkey1,$inout4 pxor $rndkey0,$one movdqu 0x50($inp),$in5 pxor $rndkey0,$in4 aesenc $rndkey1,$inout5 pxor $rndkey0,$in5 aesenc $rndkey1,$inout6 aesenc $rndkey1,$inout7 movdqu 0x50($inp),$rndkey1 movdqu 0x60($inp),$rndkey1 aesenclast $in0,$inout0 movdqu 0x60($inp),$in0 pxor $rndkey0,$rndkey1 movdqu 0x70($inp),$in0 lea 0x80($inp),$inp aesenclast $in1,$inout1 movdqu 0x70($inp),$in1 pxor $rndkey0,$in0 movdqa 0x00(%rsp),$in1 # load next counter block aesenclast $in2,$inout2 pxor $rndkey0,$in1 $movkey ($key_),$rndkey0 movdqa 0x10(%rsp),$in2 aesenclast $in3,$inout3 lea 0x80($inp),$inp aesenclast $one,$inout4 movdqa .Lincrement1(%rip),$one aesenclast $rndkey1,$inout5 $movkey 16($key_),$rndkey1 aesenclast $in0,$inout6 aesenclast $in1,$inout7 movdqa 0x20(%rsp),$in3 aesenclast $in4,$inout4 movdqa 0x30(%rsp),$in4 aesenclast $in5,$inout5 movdqa 0x40(%rsp),$in5 aesenclast $rndkey1,$inout6 movdqa 0x50(%rsp),$rndkey0 aesenclast $in0,$inout7 $movkey 0x10-0x80($key),$rndkey1 movups $inout0,($out) # store output movdqa $rndkey0,$inout0 movdqa $in1,$inout0 movups $inout1,0x10($out) movdqa $rndkey0,$inout1 movdqa $in2,$inout1 movups $inout2,0x20($out) movdqa $rndkey0,$inout2 movdqa $in3,$inout2 movups $inout3,0x30($out) movdqa $rndkey0,$inout3 movdqa $in4,$inout3 movups $inout4,0x40($out) movdqa $rndkey0,$inout4 movdqa $in5,$inout4 movups $inout5,0x50($out) movdqa $rndkey0,$inout5 movups $inout6,0x60($out) movdqa $rndkey0,$inout6 movups $inout7,0x70($out) movdqa $rndkey0,$inout7 lea 0x80($out),$out mov $rnds_,$rounds sub \$8,$len jnc .Lctr32_loop8 lea 1($rounds,$rounds),$rounds # restore original value lea 1($rnds_,$rnds_),$rnds_ # restore original value add \$8,$len jz .Lctr32_done lea -0x80($key),$key .Lctr32_tail: mov $key_,$key # restore $key movdqa $ivec,$inout0 paddb $one,$ivec movups ($inp),$in0 cmp \$2,$len jb .Lctr32_one movdqa $ivec,$inout1 paddb $one,$ivec movups 0x10($inp),$in1 je .Lctr32_two movdqa $ivec,$inout2 paddb $one,$ivec movups 0x20($inp),$in2 lea 16($key),$key cmp \$4,$len jb .Lctr32_three movdqa $ivec,$inout3 paddb $one,$ivec movups 0x30($inp),$in3 je .Lctr32_four jbe .Lctr32_loop4 movdqa $ivec,$inout4 paddb $one,$ivec cmp \$6,$len jb .Lctr32_five movdqa 0x60(%rsp),$inout6 movdqa $ivec,$inout5 paddb $one,$ivec je .Lctr32_six movdqa $ivec,$inout6 paddb $one,$ivec xorps $inout7,$inout7 $movkey 16($key),$rndkey0 aesenc $rndkey1,$inout0 lea 16($key),$key aesenc $rndkey1,$inout1 shr \$1,$rounds aesenc $rndkey1,$inout2 dec $rounds aesenc $rndkey1,$inout3 aesenc $rndkey1,$inout4 aesenc $rndkey1,$inout5 aesenc $rndkey1,$inout6 pxor $inout7,$inout7 $movkey 16($key),$rndkey1 call _aesni_encrypt8 call .Lenc_loop8_enter xorps $in0,$inout0 # xor movups 0x40($inp),$in0 movups ($inp),$in0 movups 0x10($inp),$in1 movups 0x20($inp),$in2 xorps $in0,$inout0 movups 0x30($inp),$in3 xorps $in1,$inout1 movups 0x50($inp),$in1 movups 0x40($inp),$in0 xorps $in2,$inout2 movups 0x60($inp),$in2 lea 0x70($inp),$inp movups $inout0,($out) xorps $in3,$inout3 movups $inout0,($out) # store output xorps $in0,$inout4 movups $inout1,0x10($out) xorps $in1,$inout5 xorps $in0,$inout4 movups $inout2,0x20($out) xorps $in2,$inout6 movups $inout3,0x30($out) movups $inout4,0x40($out) cmp \$6,$len jb .Lctr32_done movups 0x50($inp),$in1 xorps $in1,$inout5 movups $inout5,0x50($out) je .Lctr32_done movups 0x60($inp),$in2 xorps $in2,$inout6 movups $inout6,0x60($out) lea 0x70($out),$out jmp .Lctr32_done .align 16 .Lctr32_one_shortcut: movups ($ivp),$inout0 xor $len_,$len_ .align 32 .Lctr32_loop4: aesenc $rndkey1,$inout0 lea 16($key),$key aesenc $rndkey1,$inout1 aesenc $rndkey1,$inout2 aesenc $rndkey1,$inout3 $movkey ($key),$rndkey1 dec $rounds jnz .Lctr32_loop4 aesenclast $rndkey1,$inout0 aesenclast $rndkey1,$inout1 aesenclast $rndkey1,$inout2 aesenclast $rndkey1,$inout3 movups ($inp),$in0 mov 240($key),$rounds # key->rounds .Lctr32_one: ___ &aesni_generate1("enc",$key,$rounds); $code.=<<___; xorps $in0,$inout0 lea 0x10($inp),$inp movups $inout0,($out) lea 0x10($out),$out jmp .Lctr32_done cmp \$2,$len jb .Lctr32_done .align 16 .Lctr32_two: xorps $inout2,$inout2 call _aesni_encrypt3 xorps $in0,$inout0 # xor lea 0x20($inp),$inp movups 0x10($inp),$in1 xorps $in1,$inout1 movups $inout0,($out) # store output movups $inout1,0x10($out) lea 0x20($out),$out jmp .Lctr32_done je .Lctr32_done .align 16 .Lctr32_three: call _aesni_encrypt3 xorps $in0,$inout0 # xor lea 0x30($inp),$inp xorps $in1,$inout1 movups $inout0,($out) # store output movups 0x20($inp),$in2 xorps $in2,$inout2 movups $inout1,0x10($out) movups $inout2,0x20($out) lea 0x30($out),$out jmp .Lctr32_done cmp \$4,$len jb .Lctr32_done .align 16 .Lctr32_four: call _aesni_encrypt4 xorps $in0,$inout0 # xor lea 0x40($inp),$inp xorps $in1,$inout1 movups $inout0,($out) # store output xorps $in2,$inout2 movups $inout1,0x10($out) movups 0x30($inp),$in3 xorps $in3,$inout3 movups $inout2,0x20($out) movups $inout3,0x30($out) lea 0x40($out),$out jmp .Lctr32_done .align 16 .Lctr32_five: xorps $inout5,$inout5 call _aesni_encrypt6 xorps $in0,$inout0 # xor movups 0x40($inp),$in0 lea 0x50($inp),$inp xorps $in1,$inout1 movups $inout0,($out) # store output xorps $in2,$inout2 movups $inout1,0x10($out) xorps $in3,$inout3 movups $inout2,0x20($out) xorps $in0,$inout4 movups $inout3,0x30($out) movups $inout4,0x40($out) lea 0x50($out),$out .Lctr32_one_shortcut: movups ($ivp),$inout0 movups ($inp),$in0 mov 240($key),$rounds # key->rounds ___ &aesni_generate1("enc",$key,$rounds); $code.=<<___; xorps $in0,$inout0 movups $inout0,($out) jmp .Lctr32_done .align 16 .Lctr32_six: call _aesni_encrypt6 xorps $in0,$inout0 # xor movups 0x40($inp),$in0 xorps $in1,$inout1 movups 0x50($inp),$in1 lea 0x60($inp),$inp xorps $in2,$inout2 movups $inout0,($out) # store output xorps $in3,$inout3 movups $inout1,0x10($out) xorps $in0,$inout4 movups $inout2,0x20($out) xorps $in1,$inout5 movups $inout3,0x30($out) movups $inout4,0x40($out) movups $inout5,0x50($out) lea 0x60($out),$out .Lctr32_done: test $len_,$len_ jz .Lctr32_really_done movdqa .Lbswap_mask(%rip),$rndkey1 pshufb $rndkey1,$ivec psrldq \$14,$one # 256 paddd $one,$ivec pslldq \$14,$one pshufb $rndkey1,$ivec mov $len_,$len mov \$256,%rax jmp .Lctr32_grandloop .Lctr32_really_done: ___ $code.=<<___ if ($win64); movaps 0x00(%rsp),%xmm6 movaps 0x10(%rsp),%xmm7 movaps 0x20(%rsp),%xmm8 movaps 0x30(%rsp),%xmm9 movaps 0x40(%rsp),%xmm10 movaps 0x50(%rsp),%xmm11 movaps 0x60(%rsp),%xmm12 movaps 0x70(%rsp),%xmm13 movaps 0x80(%rsp),%xmm14 movaps 0x90(%rsp),%xmm15 lea 0xa8(%rsp),%rsp movaps -0xa0(%rbp),%xmm6 movaps -0x90(%rbp),%xmm7 movaps -0x80(%rbp),%xmm8 movaps -0x70(%rbp),%xmm9 movaps -0x60(%rbp),%xmm10 movaps -0x50(%rbp),%xmm11 movaps -0x40(%rbp),%xmm12 movaps -0x30(%rbp),%xmm13 movaps -0x20(%rbp),%xmm14 movaps -0x10(%rbp),%xmm15 ___ $code.=<<___; .Lctr32_ret: lea (%rbp),%rsp pop %rbp .Lctr32_epilogue: ret .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks ___ Loading Loading @@ -1417,16 +1421,16 @@ aesni_xts_encrypt: and \$-16,%rsp # Linux kernel stack can be incorrectly seeded ___ $code.=<<___ if ($win64); movaps %xmm6,0x60(%rsp) movaps %xmm7,0x70(%rsp) movaps %xmm8,0x80(%rsp) movaps %xmm9,0x90(%rsp) movaps %xmm10,0xa0(%rsp) movaps %xmm11,0xb0(%rsp) movaps %xmm12,0xc0(%rsp) movaps %xmm13,0xd0(%rsp) movaps %xmm14,0xe0(%rsp) movaps %xmm15,0xf0(%rsp) movaps %xmm6,-0xa8(%rax) movaps %xmm7,-0x98(%rax) movaps %xmm8,-0x88(%rax) movaps %xmm9,-0x78(%rax) movaps %xmm10,-0x68(%rax) movaps %xmm11,-0x58(%rax) movaps %xmm12,-0x48(%rax) movaps %xmm13,-0x38(%rax) movaps %xmm14,-0x28(%rax) movaps %xmm15,-0x18(%rax) .Lxts_enc_body: ___ $code.=<<___; Loading Loading @@ -1782,16 +1786,16 @@ $code.=<<___; .Lxts_enc_ret: ___ $code.=<<___ if ($win64); movaps 0x60(%rsp),%xmm6 movaps 0x70(%rsp),%xmm7 movaps 0x80(%rsp),%xmm8 movaps 0x90(%rsp),%xmm9 movaps 0xa0(%rsp),%xmm10 movaps 0xb0(%rsp),%xmm11 movaps 0xc0(%rsp),%xmm12 movaps 0xd0(%rsp),%xmm13 movaps 0xe0(%rsp),%xmm14 movaps 0xf0(%rsp),%xmm15 movaps -0xa0(%rbp),%xmm6 movaps -0x90(%rbp),%xmm7 movaps -0x80(%rbp),%xmm8 movaps -0x70(%rbp),%xmm9 movaps -0x60(%rbp),%xmm10 movaps -0x50(%rbp),%xmm11 movaps -0x40(%rbp),%xmm12 movaps -0x30(%rbp),%xmm13 movaps -0x20(%rbp),%xmm14 movaps -0x10(%rbp),%xmm15 ___ $code.=<<___; lea (%rbp),%rsp Loading @@ -1812,16 +1816,16 @@ aesni_xts_decrypt: and \$-16,%rsp # Linux kernel stack can be incorrectly seeded ___ $code.=<<___ if ($win64); movaps %xmm6,0x60(%rsp) movaps %xmm7,0x70(%rsp) movaps %xmm8,0x80(%rsp) movaps %xmm9,0x90(%rsp) movaps %xmm10,0xa0(%rsp) movaps %xmm11,0xb0(%rsp) movaps %xmm12,0xc0(%rsp) movaps %xmm13,0xd0(%rsp) movaps %xmm14,0xe0(%rsp) movaps %xmm15,0xf0(%rsp) movaps %xmm6,-0xa8(%rax) movaps %xmm7,-0x98(%rax) movaps %xmm8,-0x88(%rax) movaps %xmm9,-0x78(%rax) movaps %xmm10,-0x68(%rax) movaps %xmm11,-0x58(%rax) movaps %xmm12,-0x48(%rax) movaps %xmm13,-0x38(%rax) movaps %xmm14,-0x28(%rax) movaps %xmm15,-0x18(%rax) .Lxts_dec_body: ___ $code.=<<___; Loading Loading @@ -2213,16 +2217,16 @@ $code.=<<___; .Lxts_dec_ret: ___ $code.=<<___ if ($win64); movaps 0x60(%rsp),%xmm6 movaps 0x70(%rsp),%xmm7 movaps 0x80(%rsp),%xmm8 movaps 0x90(%rsp),%xmm9 movaps 0xa0(%rsp),%xmm10 movaps 0xb0(%rsp),%xmm11 movaps 0xc0(%rsp),%xmm12 movaps 0xd0(%rsp),%xmm13 movaps 0xe0(%rsp),%xmm14 movaps 0xf0(%rsp),%xmm15 movaps -0xa0(%rbp),%xmm6 movaps -0x90(%rbp),%xmm7 movaps -0x80(%rbp),%xmm8 movaps -0x70(%rbp),%xmm9 movaps -0x60(%rbp),%xmm10 movaps -0x50(%rbp),%xmm11 movaps -0x40(%rbp),%xmm12 movaps -0x30(%rbp),%xmm13 movaps -0x20(%rbp),%xmm14 movaps -0x10(%rbp),%xmm15 ___ $code.=<<___; lea (%rbp),%rsp Loading Loading @@ -2914,45 +2918,9 @@ ccm64_se_handler: jmp .Lcommon_seh_tail .size ccm64_se_handler,.-ccm64_se_handler .type ctr32_se_handler,\@abi-omnipotent .type ctr_xts_se_handler,\@abi-omnipotent .align 16 ctr32_se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip lea .Lctr32_body(%rip),%r10 cmp %r10,%rbx # context->Rip<"prologue" label jb .Lcommon_seh_tail mov 152($context),%rax # pull context->Rsp lea .Lctr32_ret(%rip),%r10 cmp %r10,%rbx jae .Lcommon_seh_tail lea (%rax),%rsi # %xmm save area lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq lea 0xa8(%rax),%rax # adjust stack pointer jmp .Lcommon_seh_tail .size ctr32_se_handler,.-ctr32_se_handler .type xts_se_handler,\@abi-omnipotent .align 16 xts_se_handler: ctr_xts_se_handler: push %rsi push %rdi push %rbx Loading Loading @@ -2982,13 +2950,14 @@ xts_se_handler: cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail lea 0x60(%rax),%rsi # %xmm save area mov 160($context),%rax # pull context->Rbp lea -0xa0(%rax),%rsi # %xmm save area lea 512($context),%rdi # & context.Xmm6 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq jmp .Lcommon_rbp_tail .size xts_se_handler,.-xts_se_handler .size ctr_xts_se_handler,.-ctr_xts_se_handler ___ $code.=<<___; .type cbc_se_handler,\@abi-omnipotent Loading Loading @@ -3132,14 +3101,15 @@ $code.=<<___ if ($PREFIX eq "aesni"); .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[] .LSEH_info_ctr32: .byte 9,0,0,0 .rva ctr32_se_handler .rva ctr_xts_se_handler .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[] .LSEH_info_xts_enc: .byte 9,0,0,0 .rva xts_se_handler .rva ctr_xts_se_handler .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] .LSEH_info_xts_dec: .byte 9,0,0,0 .rva xts_se_handler .rva ctr_xts_se_handler .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] ___ $code.=<<___; Loading Loading
crypto/aes/asm/aesni-x86_64.pl +313 −343 Original line number Diff line number Diff line Loading @@ -130,7 +130,7 @@ # Further data for other parallelizable modes: # # CBC decrypt 1.16 0.93 0.93 # CTR 1.14 0.91 0.86 # CTR 1.14 0.91 0.77 # # Well, given 3x column it's probably inappropriate to call the limit # asymptotic, if it can be surpassed, isn't it? What happens there? Loading Loading @@ -160,7 +160,7 @@ ###################################################################### # For reference, AMD Bulldozer spends 5.77 cycles per byte processed # with 128-bit key in CBC encrypt and 0.76 cycles in CBC decrypt, 0.70 # in ECB, 0.73 in CTR, 0.95 in XTS... This means that aes[enc|dec] # in ECB, 0.71 in CTR, 0.95 in XTS... This means that aes[enc|dec] # instruction latency is 9 cycles and that they can be issued every # cycle. Loading Loading @@ -1011,385 +1011,389 @@ ___ # const char *ivec); # # Handles only complete blocks, operates on 32-bit counter and # does not update *ivec! (see engine/eng_aesni.c for details) # does not update *ivec! (see crypto/modes/ctr128.c for details) # # Overhaul based on suggestions from Shay Gueron and Vlad Krasnov, # http://rt.openssl.org/Ticket/Display.html?id=3031&user=guest&pass=guest. # Keywords are full unroll and modulo-schedule counter calculations # with zero-round key xor. { my ($in0,$in1,$in2,$in3,$one,$ivec)=map("%xmm$_",(10..15)); my $len_="%r9"; my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15)); my ($key0,$ctr)=("${key_}d","${ivp}d"); my $frame_size = 0x80 + ($win64?160:0); $code.=<<___; .globl aesni_ctr32_encrypt_blocks .type aesni_ctr32_encrypt_blocks,\@function,5 .align 16 aesni_ctr32_encrypt_blocks: lea (%rsp),%rax push %rbp sub \$$frame_size,%rsp and \$-16,%rsp # Linux kernel stack can be incorrectly seeded ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,0x00(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) movaps %xmm10,0x40(%rsp) movaps %xmm11,0x50(%rsp) movaps %xmm12,0x60(%rsp) movaps %xmm13,0x70(%rsp) movaps %xmm14,0x80(%rsp) movaps %xmm15,0x90(%rsp) movaps %xmm6,-0xa8(%rax) movaps %xmm7,-0x98(%rax) movaps %xmm8,-0x88(%rax) movaps %xmm9,-0x78(%rax) movaps %xmm10,-0x68(%rax) movaps %xmm11,-0x58(%rax) movaps %xmm12,-0x48(%rax) movaps %xmm13,-0x38(%rax) movaps %xmm14,-0x28(%rax) movaps %xmm15,-0x18(%rax) .Lctr32_body: ___ $code.=<<___; lea -8(%rax),%rbp cmp \$1,$len je .Lctr32_one_shortcut movzb 15($ivp),%rax # counter LSB mov $len,$len_ # backup $len mov 240($key),$rnds_ # key->rounds mov $key,$key_ # backup $key movdqu ($ivp),$ivec neg %rax movdqa .Lincrement1(%rip),$one add \$256,%rax # steps to closest overflow .Lctr32_grandloop: cmp %rax,$len cmova %rax,$len mov $rnds_,$rounds # restore $rounds sub $len,$len_ movdqu ($ivp),$inout0 movdqu ($key),$rndkey0 mov 12($ivp),$ctr # counter LSB pxor $rndkey0,$inout0 mov 12($key),$key0 # 0-round key LSB movdqa $inout0,0x00(%rsp) # populate counter block bswap $ctr movdqa $inout0,0x10(%rsp) movdqa $inout0,0x20(%rsp) movdqa $inout0,0x30(%rsp) movdqa $inout0,0x40(%rsp) movdqa $inout0,0x50(%rsp) movdqa $inout0,0x60(%rsp) movdqa $inout0,0x70(%rsp) mov 240($key),$rounds # key->rounds lea 1($ctr),%r9 lea 2($ctr),%r10 bswap %r9d bswap %r10d xor $key0,%r9d xor $key0,%r10d mov %r9d,0x10+12(%rsp) lea 3($ctr),%r9 mov %r10d,0x20+12(%rsp) bswap %r9d lea 4($ctr),%r10 xor $key0,%r9d bswap %r10d mov %r9d,0x30+12(%rsp) xor $key0,%r10d lea 5($ctr),%r9 mov %r10d,0x40+12(%rsp) bswap %r9d lea 6($ctr),%r10 xor $key0,%r9d bswap %r10d mov %r9d,0x50+12(%rsp) xor $key0,%r10d lea 7($ctr),%r9 mov %r10d,0x60+12(%rsp) bswap %r9d xor $key0,%r9d mov %r9d,0x70+12(%rsp) $movkey 0x10($key),$rndkey1 movdqa 0x10(%rsp),$inout1 movdqa 0x20(%rsp),$inout2 movdqa 0x30(%rsp),$inout3 movdqa 0x40(%rsp),$inout4 movdqa 0x50(%rsp),$inout5 cmp \$8,$len jb .Lctr32_tail $movkey ($key_),$rndkey0 shr \$1,$rounds shr \$1,$rnds_ movdqa $rndkey0,$inout0 movdqa $rndkey0,$inout1 movdqa $rndkey0,$inout2 movdqa $rndkey0,$inout3 movdqa $rndkey0,$inout4 movdqa $rndkey0,$inout5 movdqa $rndkey0,$inout6 movdqa $rndkey0,$inout7 $movkey 16($key_),$rndkey1 lea 0x80($key),$key # size optimization sub \$8,$len jmp .Lctr32_loop8 .align 16 .align 32 .Lctr32_loop8: pxor $ivec,$inout0 paddb $one,$ivec add \$8,$ctr movdqa 0x60(%rsp),$inout6 aesenc $rndkey1,$inout0 pxor $ivec,$inout1 paddb $one,$ivec lea 32($key_),$key mov $ctr,%r9d movdqa 0x70(%rsp),$inout7 aesenc $rndkey1,$inout1 pxor $ivec,$inout2 paddb $one,$ivec bswap %r9d $movkey 0x20-0x80($key),$rndkey0 aesenc $rndkey1,$inout2 pxor $ivec,$inout3 paddb $one,$ivec xor $key0,%r9d aesenc $rndkey1,$inout3 pxor $ivec,$inout4 paddb $one,$ivec mov %r9d,0x00+12(%rsp) lea 1($ctr),%r9 aesenc $rndkey1,$inout4 pxor $ivec,$inout5 paddb $one,$ivec aesenc $rndkey1,$inout5 pxor $ivec,$inout6 paddb $one,$ivec $movkey ($key),$rndkey0 aesenc $rndkey1,$inout6 pxor $ivec,$inout7 paddb $one,$ivec dec $rounds aesenc $rndkey1,$inout7 $movkey 16($key),$rndkey1 $movkey 0x30-0x80($key),$rndkey1 ___ for($i=2;$i<8;$i++) { my $rndkeyx = ($i&1)?$rndkey1:$rndkey0; $code.=<<___; aesenc $rndkeyx,$inout0 aesenc $rndkeyx,$inout1 bswap %r9d aesenc $rndkeyx,$inout2 xor $key0,%r9d aesenc $rndkeyx,$inout3 mov %r9d,`0x10*($i-1)`+12(%rsp) lea $i($ctr),%r9 aesenc $rndkeyx,$inout4 aesenc $rndkeyx,$inout5 aesenc $rndkeyx,$inout6 aesenc $rndkeyx,$inout7 $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx ___ } $code.=<<___; aesenc $rndkey0,$inout0 aesenc $rndkey0,$inout1 bswap %r9d aesenc $rndkey0,$inout2 xor $key0,%r9d aesenc $rndkey0,$inout3 mov %r9d,0x70+12(%rsp) aesenc $rndkey0,$inout4 aesenc $rndkey0,$inout5 aesenc $rndkey0,$inout6 movdqu 0x00($inp),$in0 aesenc $rndkey0,$inout7 $movkey 0xa0-0x80($key),$rndkey0 cmp \$11,$rounds jb .Lctr32_enc_done aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 aesenc $rndkey1,$inout2 aesenc $rndkey1,$inout3 aesenc $rndkey1,$inout4 aesenc $rndkey1,$inout5 aesenc $rndkey1,$inout6 aesenc $rndkey1,$inout7 $movkey 0xb0-0x80($key),$rndkey1 aesenc $rndkey0,$inout0 aesenc $rndkey0,$inout1 lea 32($key),$key aesenc $rndkey0,$inout2 movups ($inp),$in0 # load input aesenc $rndkey0,$inout3 movups 0x10($inp),$in1 aesenc $rndkey0,$inout4 movups 0x20($inp),$in2 aesenc $rndkey0,$inout5 movups 0x30($inp),$in3 aesenc $rndkey0,$inout6 movups 0x40($inp),$one aesenc $rndkey0,$inout7 $movkey ($key),$rndkey0 $movkey 0xc0-0x80($key),$rndkey0 je .Lctr32_enc_done .Lctr32_enc_loop8: aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 dec $rounds aesenc $rndkey1,$inout2 aesenc $rndkey1,$inout3 aesenc $rndkey1,$inout4 aesenc $rndkey1,$inout5 aesenc $rndkey1,$inout6 aesenc $rndkey1,$inout7 $movkey 16($key),$rndkey1 $movkey 0xd0-0x80($key),$rndkey1 aesenc $rndkey0,$inout0 aesenc $rndkey0,$inout1 lea 32($key),$key aesenc $rndkey0,$inout2 aesenc $rndkey0,$inout3 aesenc $rndkey0,$inout4 aesenc $rndkey0,$inout5 aesenc $rndkey0,$inout6 aesenc $rndkey0,$inout7 $movkey ($key),$rndkey0 jnz .Lctr32_enc_loop8 $movkey 0xe0-0x80($key),$rndkey0 .Lctr32_enc_done: aesenc $rndkey1,$inout0 movdqu 0x10($inp),$in1 pxor $rndkey0,$in0 aesenc $rndkey1,$inout1 movdqu 0x20($inp),$in2 pxor $rndkey0,$in1 aesenc $rndkey1,$inout2 movdqu 0x30($inp),$in3 pxor $rndkey0,$in2 aesenc $rndkey1,$inout3 movdqu 0x40($inp),$in4 pxor $rndkey0,$in3 aesenc $rndkey1,$inout4 pxor $rndkey0,$one movdqu 0x50($inp),$in5 pxor $rndkey0,$in4 aesenc $rndkey1,$inout5 pxor $rndkey0,$in5 aesenc $rndkey1,$inout6 aesenc $rndkey1,$inout7 movdqu 0x50($inp),$rndkey1 movdqu 0x60($inp),$rndkey1 aesenclast $in0,$inout0 movdqu 0x60($inp),$in0 pxor $rndkey0,$rndkey1 movdqu 0x70($inp),$in0 lea 0x80($inp),$inp aesenclast $in1,$inout1 movdqu 0x70($inp),$in1 pxor $rndkey0,$in0 movdqa 0x00(%rsp),$in1 # load next counter block aesenclast $in2,$inout2 pxor $rndkey0,$in1 $movkey ($key_),$rndkey0 movdqa 0x10(%rsp),$in2 aesenclast $in3,$inout3 lea 0x80($inp),$inp aesenclast $one,$inout4 movdqa .Lincrement1(%rip),$one aesenclast $rndkey1,$inout5 $movkey 16($key_),$rndkey1 aesenclast $in0,$inout6 aesenclast $in1,$inout7 movdqa 0x20(%rsp),$in3 aesenclast $in4,$inout4 movdqa 0x30(%rsp),$in4 aesenclast $in5,$inout5 movdqa 0x40(%rsp),$in5 aesenclast $rndkey1,$inout6 movdqa 0x50(%rsp),$rndkey0 aesenclast $in0,$inout7 $movkey 0x10-0x80($key),$rndkey1 movups $inout0,($out) # store output movdqa $rndkey0,$inout0 movdqa $in1,$inout0 movups $inout1,0x10($out) movdqa $rndkey0,$inout1 movdqa $in2,$inout1 movups $inout2,0x20($out) movdqa $rndkey0,$inout2 movdqa $in3,$inout2 movups $inout3,0x30($out) movdqa $rndkey0,$inout3 movdqa $in4,$inout3 movups $inout4,0x40($out) movdqa $rndkey0,$inout4 movdqa $in5,$inout4 movups $inout5,0x50($out) movdqa $rndkey0,$inout5 movups $inout6,0x60($out) movdqa $rndkey0,$inout6 movups $inout7,0x70($out) movdqa $rndkey0,$inout7 lea 0x80($out),$out mov $rnds_,$rounds sub \$8,$len jnc .Lctr32_loop8 lea 1($rounds,$rounds),$rounds # restore original value lea 1($rnds_,$rnds_),$rnds_ # restore original value add \$8,$len jz .Lctr32_done lea -0x80($key),$key .Lctr32_tail: mov $key_,$key # restore $key movdqa $ivec,$inout0 paddb $one,$ivec movups ($inp),$in0 cmp \$2,$len jb .Lctr32_one movdqa $ivec,$inout1 paddb $one,$ivec movups 0x10($inp),$in1 je .Lctr32_two movdqa $ivec,$inout2 paddb $one,$ivec movups 0x20($inp),$in2 lea 16($key),$key cmp \$4,$len jb .Lctr32_three movdqa $ivec,$inout3 paddb $one,$ivec movups 0x30($inp),$in3 je .Lctr32_four jbe .Lctr32_loop4 movdqa $ivec,$inout4 paddb $one,$ivec cmp \$6,$len jb .Lctr32_five movdqa 0x60(%rsp),$inout6 movdqa $ivec,$inout5 paddb $one,$ivec je .Lctr32_six movdqa $ivec,$inout6 paddb $one,$ivec xorps $inout7,$inout7 $movkey 16($key),$rndkey0 aesenc $rndkey1,$inout0 lea 16($key),$key aesenc $rndkey1,$inout1 shr \$1,$rounds aesenc $rndkey1,$inout2 dec $rounds aesenc $rndkey1,$inout3 aesenc $rndkey1,$inout4 aesenc $rndkey1,$inout5 aesenc $rndkey1,$inout6 pxor $inout7,$inout7 $movkey 16($key),$rndkey1 call _aesni_encrypt8 call .Lenc_loop8_enter xorps $in0,$inout0 # xor movups 0x40($inp),$in0 movups ($inp),$in0 movups 0x10($inp),$in1 movups 0x20($inp),$in2 xorps $in0,$inout0 movups 0x30($inp),$in3 xorps $in1,$inout1 movups 0x50($inp),$in1 movups 0x40($inp),$in0 xorps $in2,$inout2 movups 0x60($inp),$in2 lea 0x70($inp),$inp movups $inout0,($out) xorps $in3,$inout3 movups $inout0,($out) # store output xorps $in0,$inout4 movups $inout1,0x10($out) xorps $in1,$inout5 xorps $in0,$inout4 movups $inout2,0x20($out) xorps $in2,$inout6 movups $inout3,0x30($out) movups $inout4,0x40($out) cmp \$6,$len jb .Lctr32_done movups 0x50($inp),$in1 xorps $in1,$inout5 movups $inout5,0x50($out) je .Lctr32_done movups 0x60($inp),$in2 xorps $in2,$inout6 movups $inout6,0x60($out) lea 0x70($out),$out jmp .Lctr32_done .align 16 .Lctr32_one_shortcut: movups ($ivp),$inout0 xor $len_,$len_ .align 32 .Lctr32_loop4: aesenc $rndkey1,$inout0 lea 16($key),$key aesenc $rndkey1,$inout1 aesenc $rndkey1,$inout2 aesenc $rndkey1,$inout3 $movkey ($key),$rndkey1 dec $rounds jnz .Lctr32_loop4 aesenclast $rndkey1,$inout0 aesenclast $rndkey1,$inout1 aesenclast $rndkey1,$inout2 aesenclast $rndkey1,$inout3 movups ($inp),$in0 mov 240($key),$rounds # key->rounds .Lctr32_one: ___ &aesni_generate1("enc",$key,$rounds); $code.=<<___; xorps $in0,$inout0 lea 0x10($inp),$inp movups $inout0,($out) lea 0x10($out),$out jmp .Lctr32_done cmp \$2,$len jb .Lctr32_done .align 16 .Lctr32_two: xorps $inout2,$inout2 call _aesni_encrypt3 xorps $in0,$inout0 # xor lea 0x20($inp),$inp movups 0x10($inp),$in1 xorps $in1,$inout1 movups $inout0,($out) # store output movups $inout1,0x10($out) lea 0x20($out),$out jmp .Lctr32_done je .Lctr32_done .align 16 .Lctr32_three: call _aesni_encrypt3 xorps $in0,$inout0 # xor lea 0x30($inp),$inp xorps $in1,$inout1 movups $inout0,($out) # store output movups 0x20($inp),$in2 xorps $in2,$inout2 movups $inout1,0x10($out) movups $inout2,0x20($out) lea 0x30($out),$out jmp .Lctr32_done cmp \$4,$len jb .Lctr32_done .align 16 .Lctr32_four: call _aesni_encrypt4 xorps $in0,$inout0 # xor lea 0x40($inp),$inp xorps $in1,$inout1 movups $inout0,($out) # store output xorps $in2,$inout2 movups $inout1,0x10($out) movups 0x30($inp),$in3 xorps $in3,$inout3 movups $inout2,0x20($out) movups $inout3,0x30($out) lea 0x40($out),$out jmp .Lctr32_done .align 16 .Lctr32_five: xorps $inout5,$inout5 call _aesni_encrypt6 xorps $in0,$inout0 # xor movups 0x40($inp),$in0 lea 0x50($inp),$inp xorps $in1,$inout1 movups $inout0,($out) # store output xorps $in2,$inout2 movups $inout1,0x10($out) xorps $in3,$inout3 movups $inout2,0x20($out) xorps $in0,$inout4 movups $inout3,0x30($out) movups $inout4,0x40($out) lea 0x50($out),$out .Lctr32_one_shortcut: movups ($ivp),$inout0 movups ($inp),$in0 mov 240($key),$rounds # key->rounds ___ &aesni_generate1("enc",$key,$rounds); $code.=<<___; xorps $in0,$inout0 movups $inout0,($out) jmp .Lctr32_done .align 16 .Lctr32_six: call _aesni_encrypt6 xorps $in0,$inout0 # xor movups 0x40($inp),$in0 xorps $in1,$inout1 movups 0x50($inp),$in1 lea 0x60($inp),$inp xorps $in2,$inout2 movups $inout0,($out) # store output xorps $in3,$inout3 movups $inout1,0x10($out) xorps $in0,$inout4 movups $inout2,0x20($out) xorps $in1,$inout5 movups $inout3,0x30($out) movups $inout4,0x40($out) movups $inout5,0x50($out) lea 0x60($out),$out .Lctr32_done: test $len_,$len_ jz .Lctr32_really_done movdqa .Lbswap_mask(%rip),$rndkey1 pshufb $rndkey1,$ivec psrldq \$14,$one # 256 paddd $one,$ivec pslldq \$14,$one pshufb $rndkey1,$ivec mov $len_,$len mov \$256,%rax jmp .Lctr32_grandloop .Lctr32_really_done: ___ $code.=<<___ if ($win64); movaps 0x00(%rsp),%xmm6 movaps 0x10(%rsp),%xmm7 movaps 0x20(%rsp),%xmm8 movaps 0x30(%rsp),%xmm9 movaps 0x40(%rsp),%xmm10 movaps 0x50(%rsp),%xmm11 movaps 0x60(%rsp),%xmm12 movaps 0x70(%rsp),%xmm13 movaps 0x80(%rsp),%xmm14 movaps 0x90(%rsp),%xmm15 lea 0xa8(%rsp),%rsp movaps -0xa0(%rbp),%xmm6 movaps -0x90(%rbp),%xmm7 movaps -0x80(%rbp),%xmm8 movaps -0x70(%rbp),%xmm9 movaps -0x60(%rbp),%xmm10 movaps -0x50(%rbp),%xmm11 movaps -0x40(%rbp),%xmm12 movaps -0x30(%rbp),%xmm13 movaps -0x20(%rbp),%xmm14 movaps -0x10(%rbp),%xmm15 ___ $code.=<<___; .Lctr32_ret: lea (%rbp),%rsp pop %rbp .Lctr32_epilogue: ret .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks ___ Loading Loading @@ -1417,16 +1421,16 @@ aesni_xts_encrypt: and \$-16,%rsp # Linux kernel stack can be incorrectly seeded ___ $code.=<<___ if ($win64); movaps %xmm6,0x60(%rsp) movaps %xmm7,0x70(%rsp) movaps %xmm8,0x80(%rsp) movaps %xmm9,0x90(%rsp) movaps %xmm10,0xa0(%rsp) movaps %xmm11,0xb0(%rsp) movaps %xmm12,0xc0(%rsp) movaps %xmm13,0xd0(%rsp) movaps %xmm14,0xe0(%rsp) movaps %xmm15,0xf0(%rsp) movaps %xmm6,-0xa8(%rax) movaps %xmm7,-0x98(%rax) movaps %xmm8,-0x88(%rax) movaps %xmm9,-0x78(%rax) movaps %xmm10,-0x68(%rax) movaps %xmm11,-0x58(%rax) movaps %xmm12,-0x48(%rax) movaps %xmm13,-0x38(%rax) movaps %xmm14,-0x28(%rax) movaps %xmm15,-0x18(%rax) .Lxts_enc_body: ___ $code.=<<___; Loading Loading @@ -1782,16 +1786,16 @@ $code.=<<___; .Lxts_enc_ret: ___ $code.=<<___ if ($win64); movaps 0x60(%rsp),%xmm6 movaps 0x70(%rsp),%xmm7 movaps 0x80(%rsp),%xmm8 movaps 0x90(%rsp),%xmm9 movaps 0xa0(%rsp),%xmm10 movaps 0xb0(%rsp),%xmm11 movaps 0xc0(%rsp),%xmm12 movaps 0xd0(%rsp),%xmm13 movaps 0xe0(%rsp),%xmm14 movaps 0xf0(%rsp),%xmm15 movaps -0xa0(%rbp),%xmm6 movaps -0x90(%rbp),%xmm7 movaps -0x80(%rbp),%xmm8 movaps -0x70(%rbp),%xmm9 movaps -0x60(%rbp),%xmm10 movaps -0x50(%rbp),%xmm11 movaps -0x40(%rbp),%xmm12 movaps -0x30(%rbp),%xmm13 movaps -0x20(%rbp),%xmm14 movaps -0x10(%rbp),%xmm15 ___ $code.=<<___; lea (%rbp),%rsp Loading @@ -1812,16 +1816,16 @@ aesni_xts_decrypt: and \$-16,%rsp # Linux kernel stack can be incorrectly seeded ___ $code.=<<___ if ($win64); movaps %xmm6,0x60(%rsp) movaps %xmm7,0x70(%rsp) movaps %xmm8,0x80(%rsp) movaps %xmm9,0x90(%rsp) movaps %xmm10,0xa0(%rsp) movaps %xmm11,0xb0(%rsp) movaps %xmm12,0xc0(%rsp) movaps %xmm13,0xd0(%rsp) movaps %xmm14,0xe0(%rsp) movaps %xmm15,0xf0(%rsp) movaps %xmm6,-0xa8(%rax) movaps %xmm7,-0x98(%rax) movaps %xmm8,-0x88(%rax) movaps %xmm9,-0x78(%rax) movaps %xmm10,-0x68(%rax) movaps %xmm11,-0x58(%rax) movaps %xmm12,-0x48(%rax) movaps %xmm13,-0x38(%rax) movaps %xmm14,-0x28(%rax) movaps %xmm15,-0x18(%rax) .Lxts_dec_body: ___ $code.=<<___; Loading Loading @@ -2213,16 +2217,16 @@ $code.=<<___; .Lxts_dec_ret: ___ $code.=<<___ if ($win64); movaps 0x60(%rsp),%xmm6 movaps 0x70(%rsp),%xmm7 movaps 0x80(%rsp),%xmm8 movaps 0x90(%rsp),%xmm9 movaps 0xa0(%rsp),%xmm10 movaps 0xb0(%rsp),%xmm11 movaps 0xc0(%rsp),%xmm12 movaps 0xd0(%rsp),%xmm13 movaps 0xe0(%rsp),%xmm14 movaps 0xf0(%rsp),%xmm15 movaps -0xa0(%rbp),%xmm6 movaps -0x90(%rbp),%xmm7 movaps -0x80(%rbp),%xmm8 movaps -0x70(%rbp),%xmm9 movaps -0x60(%rbp),%xmm10 movaps -0x50(%rbp),%xmm11 movaps -0x40(%rbp),%xmm12 movaps -0x30(%rbp),%xmm13 movaps -0x20(%rbp),%xmm14 movaps -0x10(%rbp),%xmm15 ___ $code.=<<___; lea (%rbp),%rsp Loading Loading @@ -2914,45 +2918,9 @@ ccm64_se_handler: jmp .Lcommon_seh_tail .size ccm64_se_handler,.-ccm64_se_handler .type ctr32_se_handler,\@abi-omnipotent .type ctr_xts_se_handler,\@abi-omnipotent .align 16 ctr32_se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip lea .Lctr32_body(%rip),%r10 cmp %r10,%rbx # context->Rip<"prologue" label jb .Lcommon_seh_tail mov 152($context),%rax # pull context->Rsp lea .Lctr32_ret(%rip),%r10 cmp %r10,%rbx jae .Lcommon_seh_tail lea (%rax),%rsi # %xmm save area lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq lea 0xa8(%rax),%rax # adjust stack pointer jmp .Lcommon_seh_tail .size ctr32_se_handler,.-ctr32_se_handler .type xts_se_handler,\@abi-omnipotent .align 16 xts_se_handler: ctr_xts_se_handler: push %rsi push %rdi push %rbx Loading Loading @@ -2982,13 +2950,14 @@ xts_se_handler: cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail lea 0x60(%rax),%rsi # %xmm save area mov 160($context),%rax # pull context->Rbp lea -0xa0(%rax),%rsi # %xmm save area lea 512($context),%rdi # & context.Xmm6 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq jmp .Lcommon_rbp_tail .size xts_se_handler,.-xts_se_handler .size ctr_xts_se_handler,.-ctr_xts_se_handler ___ $code.=<<___; .type cbc_se_handler,\@abi-omnipotent Loading Loading @@ -3132,14 +3101,15 @@ $code.=<<___ if ($PREFIX eq "aesni"); .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[] .LSEH_info_ctr32: .byte 9,0,0,0 .rva ctr32_se_handler .rva ctr_xts_se_handler .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[] .LSEH_info_xts_enc: .byte 9,0,0,0 .rva xts_se_handler .rva ctr_xts_se_handler .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] .LSEH_info_xts_dec: .byte 9,0,0,0 .rva xts_se_handler .rva ctr_xts_se_handler .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] ___ $code.=<<___; Loading