Loading crypto/aes/asm/aesni-x86.pl +11 −0 Original line number Diff line number Diff line Loading @@ -43,6 +43,17 @@ # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09. ###################################################################### # Current large-block performance in cycles per byte processed with # 128-bit key (less is better). # # CBC en-/decrypt CTR XTS ECB # Westmere 3.77/1.37 1.37 1.52 1.27 # * Bridge 5.07/0.98 0.99 1.09 0.91 # Haswell 4.44/0.80 0.97 1.03 0.72 # Atom 5.77/3.56 3.67 4.03 3.46 # Bulldozer 5.80/0.98 1.05 1.24 0.93 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script # generates drop-in replacement for # crypto/aes/asm/aes-586.pl:-) Loading crypto/aes/asm/aesni-x86_64.pl +168 −21 Original line number Diff line number Diff line Loading @@ -158,25 +158,19 @@ # in CTR mode AES instruction interleave factor was chosen to be 6x. ###################################################################### # For reference, AMD Bulldozer spends 5.77 cycles per byte processed # with 128-bit key in CBC encrypt and 0.70 cycles in CBC decrypt, 0.70 # in ECB, 0.71 in CTR, 0.90 in XTS... This means that aes[enc|dec] # instruction latency is 9 cycles and that they can be issued every # cycle. ###################################################################### # Haswell spends 4.44 cycles per byte in CBC encrypt, 0.63 in CBC # decrypt, CTR and ECB, 0.73 in XTS. ###################################################################### # Atom Silvermont spends 5.77/4.0 cycles per byte in CBC en-/decrypt, # 3.87 in ECB, 4.15 in CTR, 4.12 in XTS. Results for parallelizeable # modes [other than XTS] are actually suboptimal, because of penalties # incurred by operations on %xmm8-15, which are inevitable with such # high instruction interleave factors. This means that performance can # be improved by decreasing the interleave factor, but then it would # negatively affect other platforms in relatively larger degree. # Run-time detection would solve the dilemma... # Current large-block performance in cycles per byte processed with # 128-bit key (less is better). # # CBC en-/decrypt CTR XTS ECB # Westmere 3.77/1.25 1.25 1.25 1.26 # * Bridge 5.07/0.74 0.75 0.90 0.85 # Haswell 4.44/0.63 0.63 0.73 0.63 # Atom 5.75/3.54 3.56 4.12 3.87(*) # Bulldozer 5.77/0.70 0.72 0.90 0.70 # # (*) Atom ECB result is suboptimal because of penalties incurred # by operations on %xmm8-15. As ECB is not considered # critical, nothing was done to mitigate the problem. $PREFIX="aesni"; # if $PREFIX is set to "AES", the script # generates drop-in replacement for Loading @@ -201,6 +195,7 @@ $movkey = $PREFIX eq "aesni" ? "movups" : "movups"; ("%rdi","%rsi","%rdx","%rcx"); # Unix order $code=".text\n"; $code.=".extern OPENSSL_ia32cap_P\n"; $rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!! # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ... Loading Loading @@ -1119,7 +1114,9 @@ $code.=<<___; lea 7($ctr),%r9 mov %r10d,0x60+12(%rsp) bswap %r9d mov OPENSSL_ia32cap_P+4(%rip),%r10d xor $key0,%r9d and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE mov %r9d,0x70+12(%rsp) $movkey 0x10($key),$rndkey1 Loading @@ -1130,10 +1127,104 @@ $code.=<<___; cmp \$8,$len jb .Lctr32_tail sub \$6,$len cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE je .Lctr32_6x lea 0x80($key),$key # size optimization sub \$8,$len sub \$2,$len jmp .Lctr32_loop8 .align 16 .Lctr32_6x: shl \$4,$rounds mov \$48,$rnds_ bswap $key0 lea 32($key,$rounds),$key # end of key schedule sub %rax,%r10 # twisted $rounds jmp .Lctr32_loop6 .align 16 .Lctr32_loop6: add \$6,$ctr $movkey -48($key,$rnds_),$rndkey0 aesenc $rndkey1,$inout0 mov $ctr,%eax xor $key0,%eax aesenc $rndkey1,$inout1 movbe %eax,`0x00+12`(%rsp) lea 1($ctr),%eax aesenc $rndkey1,$inout2 xor $key0,%eax movbe %eax,`0x10+12`(%rsp) aesenc $rndkey1,$inout3 lea 2($ctr),%eax xor $key0,%eax aesenc $rndkey1,$inout4 movbe %eax,`0x20+12`(%rsp) lea 3($ctr),%eax aesenc $rndkey1,$inout5 $movkey -32($key,$rnds_),$rndkey1 xor $key0,%eax aesenc $rndkey0,$inout0 movbe %eax,`0x30+12`(%rsp) lea 4($ctr),%eax aesenc $rndkey0,$inout1 xor $key0,%eax movbe %eax,`0x40+12`(%rsp) aesenc $rndkey0,$inout2 lea 5($ctr),%eax xor $key0,%eax aesenc $rndkey0,$inout3 movbe %eax,`0x50+12`(%rsp) mov %r10,%rax # mov $rnds_,$rounds aesenc $rndkey0,$inout4 aesenc $rndkey0,$inout5 $movkey -16($key,$rnds_),$rndkey0 call .Lenc_loop6 movdqu ($inp),$inout6 movdqu 0x10($inp),$inout7 movdqu 0x20($inp),$in0 movdqu 0x30($inp),$in1 movdqu 0x40($inp),$in2 movdqu 0x50($inp),$in3 lea 0x60($inp),$inp $movkey -64($key,$rnds_),$rndkey1 pxor $inout0,$inout6 movaps 0x00(%rsp),$inout0 pxor $inout1,$inout7 movaps 0x10(%rsp),$inout1 pxor $inout2,$in0 movaps 0x20(%rsp),$inout2 pxor $inout3,$in1 movaps 0x30(%rsp),$inout3 pxor $inout4,$in2 movaps 0x40(%rsp),$inout4 pxor $inout5,$in3 movaps 0x50(%rsp),$inout5 movdqu $inout6,($out) movdqu $inout7,0x10($out) movdqu $in0,0x20($out) movdqu $in1,0x30($out) movdqu $in2,0x40($out) movdqu $in3,0x50($out) lea 0x60($out),$out sub \$6,$len jnc .Lctr32_loop6 add \$6,$len jz .Lctr32_done lea -48($rnds_),$rounds lea -80($key,$rnds_),$key # restore $key neg $rounds shr \$4,$rounds # restore $rounds jmp .Lctr32_tail .align 32 .Lctr32_loop8: add \$8,$ctr Loading Loading @@ -2455,10 +2546,15 @@ $code.=<<___; movdqa $inout3,$in3 movdqu 0x50($inp),$inout5 movdqa $inout4,$in4 mov OPENSSL_ia32cap_P+4(%rip),%r9d cmp \$0x70,$len jbe .Lcbc_dec_six_or_seven sub \$0x70,$len and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE sub \$0x50,$len cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE je .Lcbc_dec_loop6_enter sub \$0x20,$len lea 0x70($key),$key # size optimization jmp .Lcbc_dec_loop8_enter .align 16 Loading Loading @@ -2638,6 +2734,51 @@ $code.=<<___; movdqa $inout6,$inout0 jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_loop6: movups $inout5,($out) lea 0x10($out),$out movdqu 0x00($inp),$inout0 # load input movdqu 0x10($inp),$inout1 movdqa $inout0,$in0 movdqu 0x20($inp),$inout2 movdqa $inout1,$in1 movdqu 0x30($inp),$inout3 movdqa $inout2,$in2 movdqu 0x40($inp),$inout4 movdqa $inout3,$in3 movdqu 0x50($inp),$inout5 movdqa $inout4,$in4 .Lcbc_dec_loop6_enter: lea 0x60($inp),$inp movdqa $inout5,$inout6 call _aesni_decrypt6 pxor $iv,$inout0 # ^= IV movdqa $inout6,$iv pxor $in0,$inout1 movdqu $inout0,($out) pxor $in1,$inout2 movdqu $inout1,0x10($out) pxor $in2,$inout3 movdqu $inout2,0x20($out) pxor $in3,$inout4 mov $key_,$key movdqu $inout3,0x30($out) pxor $in4,$inout5 mov $rnds_,$rounds movdqu $inout4,0x40($out) lea 0x50($out),$out sub \$0x60,$len ja .Lcbc_dec_loop6 movdqa $inout5,$inout0 add \$0x50,$len jle .Lcbc_dec_tail_collected movups $inout5,($out) lea 0x10($out),$out .Lcbc_dec_tail: movups ($inp),$inout0 sub \$0x10,$len Loading Loading @@ -3360,8 +3501,14 @@ sub aesni { return $line; } sub movbe { ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift; } $code =~ s/\`([^\`]*)\`/eval($1)/gem; $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; #$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact $code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem; print $code; Loading Loading
crypto/aes/asm/aesni-x86.pl +11 −0 Original line number Diff line number Diff line Loading @@ -43,6 +43,17 @@ # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09. ###################################################################### # Current large-block performance in cycles per byte processed with # 128-bit key (less is better). # # CBC en-/decrypt CTR XTS ECB # Westmere 3.77/1.37 1.37 1.52 1.27 # * Bridge 5.07/0.98 0.99 1.09 0.91 # Haswell 4.44/0.80 0.97 1.03 0.72 # Atom 5.77/3.56 3.67 4.03 3.46 # Bulldozer 5.80/0.98 1.05 1.24 0.93 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script # generates drop-in replacement for # crypto/aes/asm/aes-586.pl:-) Loading
crypto/aes/asm/aesni-x86_64.pl +168 −21 Original line number Diff line number Diff line Loading @@ -158,25 +158,19 @@ # in CTR mode AES instruction interleave factor was chosen to be 6x. ###################################################################### # For reference, AMD Bulldozer spends 5.77 cycles per byte processed # with 128-bit key in CBC encrypt and 0.70 cycles in CBC decrypt, 0.70 # in ECB, 0.71 in CTR, 0.90 in XTS... This means that aes[enc|dec] # instruction latency is 9 cycles and that they can be issued every # cycle. ###################################################################### # Haswell spends 4.44 cycles per byte in CBC encrypt, 0.63 in CBC # decrypt, CTR and ECB, 0.73 in XTS. ###################################################################### # Atom Silvermont spends 5.77/4.0 cycles per byte in CBC en-/decrypt, # 3.87 in ECB, 4.15 in CTR, 4.12 in XTS. Results for parallelizeable # modes [other than XTS] are actually suboptimal, because of penalties # incurred by operations on %xmm8-15, which are inevitable with such # high instruction interleave factors. This means that performance can # be improved by decreasing the interleave factor, but then it would # negatively affect other platforms in relatively larger degree. # Run-time detection would solve the dilemma... # Current large-block performance in cycles per byte processed with # 128-bit key (less is better). # # CBC en-/decrypt CTR XTS ECB # Westmere 3.77/1.25 1.25 1.25 1.26 # * Bridge 5.07/0.74 0.75 0.90 0.85 # Haswell 4.44/0.63 0.63 0.73 0.63 # Atom 5.75/3.54 3.56 4.12 3.87(*) # Bulldozer 5.77/0.70 0.72 0.90 0.70 # # (*) Atom ECB result is suboptimal because of penalties incurred # by operations on %xmm8-15. As ECB is not considered # critical, nothing was done to mitigate the problem. $PREFIX="aesni"; # if $PREFIX is set to "AES", the script # generates drop-in replacement for Loading @@ -201,6 +195,7 @@ $movkey = $PREFIX eq "aesni" ? "movups" : "movups"; ("%rdi","%rsi","%rdx","%rcx"); # Unix order $code=".text\n"; $code.=".extern OPENSSL_ia32cap_P\n"; $rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!! # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ... Loading Loading @@ -1119,7 +1114,9 @@ $code.=<<___; lea 7($ctr),%r9 mov %r10d,0x60+12(%rsp) bswap %r9d mov OPENSSL_ia32cap_P+4(%rip),%r10d xor $key0,%r9d and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE mov %r9d,0x70+12(%rsp) $movkey 0x10($key),$rndkey1 Loading @@ -1130,10 +1127,104 @@ $code.=<<___; cmp \$8,$len jb .Lctr32_tail sub \$6,$len cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE je .Lctr32_6x lea 0x80($key),$key # size optimization sub \$8,$len sub \$2,$len jmp .Lctr32_loop8 .align 16 .Lctr32_6x: shl \$4,$rounds mov \$48,$rnds_ bswap $key0 lea 32($key,$rounds),$key # end of key schedule sub %rax,%r10 # twisted $rounds jmp .Lctr32_loop6 .align 16 .Lctr32_loop6: add \$6,$ctr $movkey -48($key,$rnds_),$rndkey0 aesenc $rndkey1,$inout0 mov $ctr,%eax xor $key0,%eax aesenc $rndkey1,$inout1 movbe %eax,`0x00+12`(%rsp) lea 1($ctr),%eax aesenc $rndkey1,$inout2 xor $key0,%eax movbe %eax,`0x10+12`(%rsp) aesenc $rndkey1,$inout3 lea 2($ctr),%eax xor $key0,%eax aesenc $rndkey1,$inout4 movbe %eax,`0x20+12`(%rsp) lea 3($ctr),%eax aesenc $rndkey1,$inout5 $movkey -32($key,$rnds_),$rndkey1 xor $key0,%eax aesenc $rndkey0,$inout0 movbe %eax,`0x30+12`(%rsp) lea 4($ctr),%eax aesenc $rndkey0,$inout1 xor $key0,%eax movbe %eax,`0x40+12`(%rsp) aesenc $rndkey0,$inout2 lea 5($ctr),%eax xor $key0,%eax aesenc $rndkey0,$inout3 movbe %eax,`0x50+12`(%rsp) mov %r10,%rax # mov $rnds_,$rounds aesenc $rndkey0,$inout4 aesenc $rndkey0,$inout5 $movkey -16($key,$rnds_),$rndkey0 call .Lenc_loop6 movdqu ($inp),$inout6 movdqu 0x10($inp),$inout7 movdqu 0x20($inp),$in0 movdqu 0x30($inp),$in1 movdqu 0x40($inp),$in2 movdqu 0x50($inp),$in3 lea 0x60($inp),$inp $movkey -64($key,$rnds_),$rndkey1 pxor $inout0,$inout6 movaps 0x00(%rsp),$inout0 pxor $inout1,$inout7 movaps 0x10(%rsp),$inout1 pxor $inout2,$in0 movaps 0x20(%rsp),$inout2 pxor $inout3,$in1 movaps 0x30(%rsp),$inout3 pxor $inout4,$in2 movaps 0x40(%rsp),$inout4 pxor $inout5,$in3 movaps 0x50(%rsp),$inout5 movdqu $inout6,($out) movdqu $inout7,0x10($out) movdqu $in0,0x20($out) movdqu $in1,0x30($out) movdqu $in2,0x40($out) movdqu $in3,0x50($out) lea 0x60($out),$out sub \$6,$len jnc .Lctr32_loop6 add \$6,$len jz .Lctr32_done lea -48($rnds_),$rounds lea -80($key,$rnds_),$key # restore $key neg $rounds shr \$4,$rounds # restore $rounds jmp .Lctr32_tail .align 32 .Lctr32_loop8: add \$8,$ctr Loading Loading @@ -2455,10 +2546,15 @@ $code.=<<___; movdqa $inout3,$in3 movdqu 0x50($inp),$inout5 movdqa $inout4,$in4 mov OPENSSL_ia32cap_P+4(%rip),%r9d cmp \$0x70,$len jbe .Lcbc_dec_six_or_seven sub \$0x70,$len and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE sub \$0x50,$len cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE je .Lcbc_dec_loop6_enter sub \$0x20,$len lea 0x70($key),$key # size optimization jmp .Lcbc_dec_loop8_enter .align 16 Loading Loading @@ -2638,6 +2734,51 @@ $code.=<<___; movdqa $inout6,$inout0 jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_loop6: movups $inout5,($out) lea 0x10($out),$out movdqu 0x00($inp),$inout0 # load input movdqu 0x10($inp),$inout1 movdqa $inout0,$in0 movdqu 0x20($inp),$inout2 movdqa $inout1,$in1 movdqu 0x30($inp),$inout3 movdqa $inout2,$in2 movdqu 0x40($inp),$inout4 movdqa $inout3,$in3 movdqu 0x50($inp),$inout5 movdqa $inout4,$in4 .Lcbc_dec_loop6_enter: lea 0x60($inp),$inp movdqa $inout5,$inout6 call _aesni_decrypt6 pxor $iv,$inout0 # ^= IV movdqa $inout6,$iv pxor $in0,$inout1 movdqu $inout0,($out) pxor $in1,$inout2 movdqu $inout1,0x10($out) pxor $in2,$inout3 movdqu $inout2,0x20($out) pxor $in3,$inout4 mov $key_,$key movdqu $inout3,0x30($out) pxor $in4,$inout5 mov $rnds_,$rounds movdqu $inout4,0x40($out) lea 0x50($out),$out sub \$0x60,$len ja .Lcbc_dec_loop6 movdqa $inout5,$inout0 add \$0x50,$len jle .Lcbc_dec_tail_collected movups $inout5,($out) lea 0x10($out),$out .Lcbc_dec_tail: movups ($inp),$inout0 sub \$0x10,$len Loading Loading @@ -3360,8 +3501,14 @@ sub aesni { return $line; } sub movbe { ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift; } $code =~ s/\`([^\`]*)\`/eval($1)/gem; $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; #$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact $code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem; print $code; Loading