crypto/aes/asm/aes-x86_64.pl  +122 −124

@@ -19,9 +19,10 @@
 # Performance in number of cycles per processed byte for 128-bit key:
 #
 #		ECB encrypt	ECB decrypt	CBC large chunk
-# AMD64		33		41		13.0
-# EM64T		38		59		18.6(*)
-# Core 2	30		43		14.5(*)
+# AMD64		33		43		13.0
+# EM64T		38		56		18.6(*)
+# Core 2	30		42		14.5(*)
+# Atom		65		86		32.1(*)
 #
 # (*) with hyper-threading off

@@ -365,68 +366,66 @@
	# (last encryption round, $code.=<<___ block: the movzb S-box byte
	#  gathering and the shl/shr/xor combining steps are re-interleaved;
	#  same lookups, two lines fewer overall)

@@ -465,12 +464,12 @@ sub enctransform()
 { my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");

 $code.=<<___;
-	mov	$s0,$acc0
-	mov	$s1,$acc1
-	and	\$0x80808080,$acc0
-	and	\$0x80808080,$acc1
-	mov	$acc0,$t0
-	mov	$acc1,$t1
+	mov	\$0x80808080,$t0
+	mov	\$0x80808080,$t1
+	and	$s0,$t0
+	and	$s1,$t1
+	mov	$t0,$acc0
+	mov	$t1,$acc1
 	shr	\$7,$t0
 	lea	($s0,$s0),$r20
 	shr	\$7,$t1

@@ -488,25 +487,25 @@
@@ -522,23 +521,23 @@
	# (rest of enctransform: the same \$0x80808080 mov-immediate/and rewrite
	#  is applied to the $s2/$s3 pair, and the rol/ror steps plus the Te4
	#  prefetch loads at 0/64/128/192($sbox) are re-interleaved)
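For reference, the \$0x80808080 / \$0xfefefefe / \$0x1b1b1b1b constants in enctransform perform four GF(2^8) doublings at once (the "xtime" step of MixColumns) on a packed 32-bit word. A minimal C sketch of that trick, for illustration only (the function name and layout are mine, not part of the patch):

    #include <stdint.h>

    /* Double four AES field elements packed in one 32-bit word (SWAR xtime). */
    static uint32_t xtime_packed(uint32_t x)
    {
        uint32_t hi  = x & 0x80808080u;       /* bytes whose top bit is set */
        uint32_t dbl = (x + x) & 0xfefefefeu; /* shift each byte left by one;
                                                 the mask drops the carry that
                                                 leaks into the next byte */
        uint32_t red = (hi - (hi >> 7)) & 0x1b1b1b1bu; /* 0x1b in each byte
                                                          that overflowed */
        return dbl ^ red;                     /* reduce mod the AES polynomial */
    }

The patched assembly computes the same masks, but loads the constant with a mov-immediate and then ands the state into it, presumably so the constant setup no longer depends on the state register.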
@@ -935,70 +934,69 @@
	# (last decryption round, $code.=<<___ block: the same re-interleaving
	#  of the movzb S-box gathering as on the encrypt side)

@@ -1013,12 +1011,12 @@ sub dectransform()
  my $prefetch = shift;

 $code.=<<___;
-	mov	$tp10,$acc0
-	mov	$tp18,$acc8
-	and	$mask80,$acc0
-	and	$mask80,$acc8
-	mov	$acc0,$tp40
-	mov	$acc8,$tp48
+	mov	$mask80,$tp40
+	mov	$mask80,$tp48
+	and	$tp10,$tp40
+	and	$tp18,$tp48
+	mov	$tp40,$acc0
+	mov	$tp48,$acc8
 	shr	\$7,$tp40
 	lea	($tp10,$tp10),$tp20
 	shr	\$7,$tp48

@@ -1029,15 +1027,15 @@
@@ -1048,15 +1046,15 @@
	# (the two further doubling steps, tp2->tp4 and tp4->tp8, get the same
	#  $mask80 mov-immediate/and rewrite)

@@ -1081,51 +1079,51 @@
	# (combining tp1/tp2/tp4/tp8 into the result: the xor, shr \$32 and
	#  ROTATE(...,8/16/24) steps, plus the conditional prefetch loads
	#  (`if ($prefetch)`), are re-interleaved)
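dectransform builds the inverse MixColumns factors from repeated doublings, per the tp1/tp2/tp4/tp8 comments above (0e = 8+4+2, 0b = 8+2+1, 0d = 8+4+1, 09 = 8+1). A byte-wise C sketch of that decomposition, for orientation only (names and the per-column layout are mine; the assembly works on packed words and rotations instead):

    #include <stdint.h>
    #include <string.h>

    static uint8_t xtime(uint8_t x) { return (uint8_t)((x << 1) ^ ((x >> 7) * 0x1b)); }

    /* InvMixColumns on one column, via the 1/2/4/8 doubling chain. */
    static void inv_mix_column(uint8_t a[4])
    {
        uint8_t t1[4], t2[4], t4[4], t8[4], out[4];
        for (int i = 0; i < 4; i++) {
            t1[i] = a[i];
            t2[i] = xtime(t1[i]);   /* 2*a */
            t4[i] = xtime(t2[i]);   /* 4*a */
            t8[i] = xtime(t4[i]);   /* 8*a */
        }
        for (int i = 0; i < 4; i++)
            out[i] = (uint8_t)((t8[i]       ^ t4[i]       ^ t2[i])        /* 0e * a[i]   */
                             ^ (t8[(i+1)&3] ^ t2[(i+1)&3] ^ t1[(i+1)&3])  /* 0b * a[i+1] */
                             ^ (t8[(i+2)&3] ^ t4[(i+2)&3] ^ t1[(i+2)&3])  /* 0d * a[i+2] */
                             ^ (t8[(i+3)&3] ^ t1[(i+3)&3]));              /* 09 * a[i+3] */
        memcpy(a, out, 4);
    }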
crypto/aes/asm/vpaes-x86_64.pl  +42 −43

@@ -27,9 +27,9 @@
 #
 #		aes-x86_64.pl		vpaes-x86_64.pl
 #
-# Core 2(**)	30.5/43.7/14.3	21.8/25.7(***)
-# Nehalem	30.5/42.2/14.6	9.8/11.8
-# Atom		63.9/79.0/32.1	64.0/84.8(***)
+# Core 2(**)	29.6/41.1/14.3	21.9/25.2(***)
+# Nehalem	29.6/40.3/14.6	10.0/11.8
+# Atom		57.3/74.2/32.1	60.9/82.3(***)
 #
 # (*)	"Hyper-threading" in the context refers rather to cache shared
 #	among multiple cores, than to specifically Intel HTT. As vast

@@ -40,7 +40,7 @@
 # (**)	"Core 2" refers to initial 65nm design, a.k.a. Conroe.
 #
 # (***)	Less impressive improvement on Core 2 and Atom is due to slow
-#	pshufb,	yet it's respectable +40%/78% improvement on Core 2
+#	pshufb,	yet it's respectable +36%/62% improvement on Core 2
 #	(as implied, over "hyper-threading-safe" code path).
 #
 # <appro@openssl.org>
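The pshufb that the footnote refers to is the core primitive of the vector-permute approach: one pshufb performs sixteen 4-bit table lookups at once, and vpaes composes its S-box and mix-columns steps from such lookups. A small intrinsics sketch of the primitive, purely illustrative (the helper name is mine and this is not vpaes itself):

    #include <tmmintrin.h>   /* SSSE3, for _mm_shuffle_epi8 (pshufb) */

    /* Map every byte of `idx` through a 16-entry byte table: 16 lookups per pshufb. */
    static __m128i nibble_lookup(__m128i table16, __m128i idx)
    {
        /* keep each index in 0..15; a set top bit makes pshufb emit 0 instead */
        idx = _mm_and_si128(idx, _mm_set1_epi8(0x0f));
        return _mm_shuffle_epi8(table16, idx);
    }

Slow pshufb on Conroe-class Core 2 and on Atom is what caps the improvement quoted in the table above.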
@@ -94,8 +94,8 @@ _vpaes_encrypt_core:
 	movdqa	.Lk_ipt+16(%rip), %xmm0	# ipthi
 	pshufb	%xmm1,	%xmm0
 	pxor	%xmm5,	%xmm2
	# (the "pxor %xmm2,%xmm0" and "add \$16,%r9" lines swap places)
 	lea	.Lk_mc_backward(%rip),%r10
 	jmp	.Lenc_entry

@@ -103,19 +103,19 @@ _vpaes_encrypt_core:
 .Lenc_loop:				# middle of middle round
	# (the sb1u/sb1t/sb2u/sb2t pshufb+pxor steps and the .Lk_mc_forward /
	#  .Lk_mc_backward loads are re-interleaved; the "0 = A", "2 = 2A",
	#  "0 = 2A+B" combining stays the same)

@@ -124,30 +124,30 @@ _vpaes_encrypt_core:
 .Lenc_entry:				# top of round
	# (same pandn/psrld/pand nibble split and chain of pshufb lookups:
	#  a/k, 1/i, 1/j, iak, jak, io, jo, re-interleaved; the "sub \$1,%rax"
	#  round-count decrement and the final "pxor ... # 0 = 2A+3B+C+D"
	#  swap places)

@@ -200,62 +200,61 @@ _vpaes_decrypt_core:
	## Inverse mix columns ##
	# (the sb9u/sb9t, sbdu/sbdt, sbbu/sbbt, sbeu/sbet lookup pairs are
	#  re-interleaved with some renaming between %xmm0 and %xmm1; the
	#  "MC ch" pshufb and the palignr \$12,%xmm5,%xmm5 rotation are kept
	#  per step, and .Ldec_entry mirrors .Lenc_entry)
@@ -463,12 +462,12 @@ _vpaes_schedule_core:
 .type	_vpaes_schedule_192_smear,\@abi-omnipotent
 .align	16
 _vpaes_schedule_192_smear:
-	pshufd	\$0x80,	%xmm6,	%xmm0	# d c 0 0 -> c 0 0 0
-	pxor	%xmm0,	%xmm6		# -> c+d c 0 0
+	pshufd	\$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
 	pshufd	\$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
+	pxor	%xmm1,	%xmm6		# -> c+d c 0 0
+	pxor	%xmm1,	%xmm1
 	pxor	%xmm0,	%xmm6		# -> b+c+d b+c b a
 	movdqa	%xmm6,	%xmm0
-	pxor	%xmm1,	%xmm1
 	movhlps	%xmm1,	%xmm6		# clobber low side with zeros
 	ret
 .size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
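Per the lane comments, the smear turns %xmm6 = (d c 0 0) and %xmm7 = (b a _ _) into (b+c+d b+c b a), keeps the full value in %xmm0, and zeroes the low half of %xmm6. A plain-C rendering of that data movement, for illustration only (lane order and names are mine):

    #include <stdint.h>

    /* lanes[3..0] listed high to low, "+" meaning XOR, as in the comments above */
    static void schedule_192_smear(uint32_t x6[4], const uint32_t x7[4], uint32_t x0[4])
    {
        uint32_t d = x6[3], c = x6[2];
        uint32_t b = x7[3], a = x7[2];

        x0[3] = b ^ c ^ d;  x0[2] = b ^ c;  x0[1] = b;  x0[0] = a;  /* full smear (%xmm0) */

        x6[3] = x0[3];      x6[2] = x0[2];   /* %xmm6 keeps the high half        */
        x6[1] = 0;          x6[0] = 0;       /* low side clobbered with zeros     */
    }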