Loading crypto/aes/asm/vpaes-x86.pl +23 −23 Original line number Diff line number Diff line Loading @@ -29,7 +29,7 @@ # # Core 2(**) 28.1/41.4/18.3 21.9/25.2(***) # Nehalem 27.9/40.4/18.1 10.2/11.9 # Atom 70.7/92.1/60.1 61.1/81.0(***) # Atom 70.7/92.1/60.1 61.1/75.4(***) # # (*) "Hyper-threading" in the context refers rather to cache shared # among multiple cores, than to specifically Intel HTT. As vast Loading Loading @@ -295,43 +295,43 @@ $k_dsbo=0x2c0; # decryption sbox final output &movdqa ("xmm1",&QWP(-0x10,$base)); # 0 : sb9t &pshufb ("xmm4","xmm2"); # 4 = sb9u &pshufb ("xmm1","xmm3"); # 0 = sb9t &pxor ("xmm4","xmm0"); &add ($key,16); # next round key &pxor ("xmm1","xmm4"); # 0 = ch &pxor ("xmm0","xmm4"); &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu &pshufb ("xmm1","xmm5"); # MC ch &pshufb ("xmm4","xmm2"); # 4 = sbdu &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt &pxor ("xmm4","xmm1"); # 4 = ch &pshufb ("xmm0","xmm3"); # 0 = sbdt &sub ($round,1); # nr-- &pxor ("xmm0","xmm4"); # 0 = ch &pxor ("xmm0","xmm1"); # 0 = ch &movdqa ("xmm1",&QWP(0x10,$base)); # 0 : sbdt &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu &pshufb ("xmm4","xmm2"); # 4 = sbdu &pshufb ("xmm0","xmm5"); # MC ch &pshufb ("xmm1","xmm3"); # 0 = sbdt &pxor ("xmm0","xmm4"); # 4 = ch &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu &pxor ("xmm0","xmm1"); # 0 = ch &movdqa ("xmm1",&QWP(0x30,$base)); # 0 : sbbt &pshufb ("xmm4","xmm2"); # 4 = sbbu &pshufb ("xmm0","xmm5"); # MC ch &pshufb ("xmm1","xmm3"); # 0 = sbbt &pxor ("xmm4","xmm0"); # 4 = ch &pxor ("xmm1","xmm4"); # 0 = ch &pxor ("xmm0","xmm4"); # 4 = ch &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu &pshufb ("xmm1","xmm5"); # MC ch &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet &pxor ("xmm0","xmm1"); # 0 = ch &movdqa ("xmm1",&QWP(0x50,$base)); # 0 : sbet &pshufb ("xmm4","xmm2"); # 4 = sbeu &pshufb ("xmm0","xmm3"); # 0 = sbet &pshufb ("xmm0","xmm5"); # MC ch &pshufb ("xmm1","xmm3"); # 0 = sbet &pxor ("xmm0","xmm4"); # 4 = ch &add ($key,16); # next round key &palignr("xmm5","xmm5",12); &pxor ("xmm4","xmm1"); # 4 = ch &pxor ("xmm0","xmm4"); # 0 = ch &pxor ("xmm0","xmm1"); # 0 = ch &sub ($round,1); # nr-- &set_label("dec_entry"); # top of round &movdqa ("xmm1","xmm6"); # 1 : i &pandn ("xmm1","xmm0"); # 1 = i<<4 &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k &psrld ("xmm1",4); # 1 = i &pandn ("xmm1","xmm0"); # 1 = i<<4 &pand ("xmm0","xmm6"); # 0 = k &psrld ("xmm1",4); # 1 = i &pshufb ("xmm2","xmm0"); # 2 = a/k &movdqa ("xmm3","xmm7"); # 3 : 1/i &pxor ("xmm0","xmm1"); # 0 = j Loading crypto/aes/asm/vpaes-x86_64.pl +21 −21 Original line number Diff line number Diff line Loading @@ -29,7 +29,7 @@ # # Core 2(**) 29.6/41.1/14.3 21.9/25.2(***) # Nehalem 29.6/40.3/14.6 10.0/11.8 # Atom 57.3/74.2/32.1 60.9/82.3(***) # Atom 57.3/74.2/32.1 60.9/77.2(***) # # (*) "Hyper-threading" in the context refers rather to cache shared # among multiple cores, than to specifically Intel HTT. As vast Loading Loading @@ -204,35 +204,35 @@ _vpaes_decrypt_core: movdqa -0x10(%r10),%xmm1 # 0 : sb9t pshufb %xmm2, %xmm4 # 4 = sb9u pshufb %xmm3, %xmm1 # 0 = sb9t pxor %xmm0, %xmm4 add \$16, %r9 # next round key pxor %xmm4, %xmm1 # 0 = ch pxor %xmm4, %xmm0 movdqa 0x00(%r10),%xmm4 # 4 : sbdu pshufb %xmm5, %xmm1 # MC ch pshufb %xmm2, %xmm4 # 4 = sbdu movdqa 0x10(%r10),%xmm0 # 0 : sbdt pxor %xmm1, %xmm4 # 4 = ch pshufb %xmm3, %xmm0 # 0 = sbdt sub \$1,%rax # nr-- pxor %xmm4, %xmm0 # 0 = ch pxor %xmm1, %xmm0 # 0 = ch movdqa 0x10(%r10),%xmm1 # 0 : sbdt movdqa 0x20(%r10),%xmm4 # 4 : sbbu pshufb %xmm2, %xmm4 # 4 = sbdu pshufb %xmm5, %xmm0 # MC ch pshufb %xmm3, %xmm1 # 0 = sbdt pxor %xmm4, %xmm0 # 4 = ch movdqa 0x20(%r10),%xmm4 # 4 : sbbu pxor %xmm1, %xmm0 # 0 = ch movdqa 0x30(%r10),%xmm1 # 0 : sbbt pshufb %xmm2, %xmm4 # 4 = sbbu pshufb %xmm5, %xmm0 # MC ch pshufb %xmm3, %xmm1 # 0 = sbbt pxor %xmm0, %xmm4 # 4 = ch pxor %xmm4, %xmm1 # 0 = ch pxor %xmm4, %xmm0 # 4 = ch movdqa 0x40(%r10),%xmm4 # 4 : sbeu pshufb %xmm5, %xmm1 # MC ch movdqa 0x50(%r10),%xmm0 # 0 : sbet pxor %xmm1, %xmm0 # 0 = ch movdqa 0x50(%r10),%xmm1 # 0 : sbet pshufb %xmm2, %xmm4 # 4 = sbeu pshufb %xmm3, %xmm0 # 0 = sbet pshufb %xmm5, %xmm0 # MC ch pshufb %xmm3, %xmm1 # 0 = sbet pxor %xmm4, %xmm0 # 4 = ch add \$16, %r9 # next round key palignr \$12, %xmm5, %xmm5 pxor %xmm1, %xmm4 # 4 = ch pxor %xmm4, %xmm0 # 0 = ch pxor %xmm1, %xmm0 # 0 = ch sub \$1,%rax # nr-- .Ldec_entry: # top of round Loading Loading
crypto/aes/asm/vpaes-x86.pl +23 −23 Original line number Diff line number Diff line Loading @@ -29,7 +29,7 @@ # # Core 2(**) 28.1/41.4/18.3 21.9/25.2(***) # Nehalem 27.9/40.4/18.1 10.2/11.9 # Atom 70.7/92.1/60.1 61.1/81.0(***) # Atom 70.7/92.1/60.1 61.1/75.4(***) # # (*) "Hyper-threading" in the context refers rather to cache shared # among multiple cores, than to specifically Intel HTT. As vast Loading Loading @@ -295,43 +295,43 @@ $k_dsbo=0x2c0; # decryption sbox final output &movdqa ("xmm1",&QWP(-0x10,$base)); # 0 : sb9t &pshufb ("xmm4","xmm2"); # 4 = sb9u &pshufb ("xmm1","xmm3"); # 0 = sb9t &pxor ("xmm4","xmm0"); &add ($key,16); # next round key &pxor ("xmm1","xmm4"); # 0 = ch &pxor ("xmm0","xmm4"); &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu &pshufb ("xmm1","xmm5"); # MC ch &pshufb ("xmm4","xmm2"); # 4 = sbdu &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt &pxor ("xmm4","xmm1"); # 4 = ch &pshufb ("xmm0","xmm3"); # 0 = sbdt &sub ($round,1); # nr-- &pxor ("xmm0","xmm4"); # 0 = ch &pxor ("xmm0","xmm1"); # 0 = ch &movdqa ("xmm1",&QWP(0x10,$base)); # 0 : sbdt &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu &pshufb ("xmm4","xmm2"); # 4 = sbdu &pshufb ("xmm0","xmm5"); # MC ch &pshufb ("xmm1","xmm3"); # 0 = sbdt &pxor ("xmm0","xmm4"); # 4 = ch &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu &pxor ("xmm0","xmm1"); # 0 = ch &movdqa ("xmm1",&QWP(0x30,$base)); # 0 : sbbt &pshufb ("xmm4","xmm2"); # 4 = sbbu &pshufb ("xmm0","xmm5"); # MC ch &pshufb ("xmm1","xmm3"); # 0 = sbbt &pxor ("xmm4","xmm0"); # 4 = ch &pxor ("xmm1","xmm4"); # 0 = ch &pxor ("xmm0","xmm4"); # 4 = ch &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu &pshufb ("xmm1","xmm5"); # MC ch &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet &pxor ("xmm0","xmm1"); # 0 = ch &movdqa ("xmm1",&QWP(0x50,$base)); # 0 : sbet &pshufb ("xmm4","xmm2"); # 4 = sbeu &pshufb ("xmm0","xmm3"); # 0 = sbet &pshufb ("xmm0","xmm5"); # MC ch &pshufb ("xmm1","xmm3"); # 0 = sbet &pxor ("xmm0","xmm4"); # 4 = ch &add ($key,16); # next round key &palignr("xmm5","xmm5",12); &pxor ("xmm4","xmm1"); # 4 = ch &pxor ("xmm0","xmm4"); # 0 = ch &pxor ("xmm0","xmm1"); # 0 = ch &sub ($round,1); # nr-- &set_label("dec_entry"); # top of round &movdqa ("xmm1","xmm6"); # 1 : i &pandn ("xmm1","xmm0"); # 1 = i<<4 &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k &psrld ("xmm1",4); # 1 = i &pandn ("xmm1","xmm0"); # 1 = i<<4 &pand ("xmm0","xmm6"); # 0 = k &psrld ("xmm1",4); # 1 = i &pshufb ("xmm2","xmm0"); # 2 = a/k &movdqa ("xmm3","xmm7"); # 3 : 1/i &pxor ("xmm0","xmm1"); # 0 = j Loading
crypto/aes/asm/vpaes-x86_64.pl +21 −21 Original line number Diff line number Diff line Loading @@ -29,7 +29,7 @@ # # Core 2(**) 29.6/41.1/14.3 21.9/25.2(***) # Nehalem 29.6/40.3/14.6 10.0/11.8 # Atom 57.3/74.2/32.1 60.9/82.3(***) # Atom 57.3/74.2/32.1 60.9/77.2(***) # # (*) "Hyper-threading" in the context refers rather to cache shared # among multiple cores, than to specifically Intel HTT. As vast Loading Loading @@ -204,35 +204,35 @@ _vpaes_decrypt_core: movdqa -0x10(%r10),%xmm1 # 0 : sb9t pshufb %xmm2, %xmm4 # 4 = sb9u pshufb %xmm3, %xmm1 # 0 = sb9t pxor %xmm0, %xmm4 add \$16, %r9 # next round key pxor %xmm4, %xmm1 # 0 = ch pxor %xmm4, %xmm0 movdqa 0x00(%r10),%xmm4 # 4 : sbdu pshufb %xmm5, %xmm1 # MC ch pshufb %xmm2, %xmm4 # 4 = sbdu movdqa 0x10(%r10),%xmm0 # 0 : sbdt pxor %xmm1, %xmm4 # 4 = ch pshufb %xmm3, %xmm0 # 0 = sbdt sub \$1,%rax # nr-- pxor %xmm4, %xmm0 # 0 = ch pxor %xmm1, %xmm0 # 0 = ch movdqa 0x10(%r10),%xmm1 # 0 : sbdt movdqa 0x20(%r10),%xmm4 # 4 : sbbu pshufb %xmm2, %xmm4 # 4 = sbdu pshufb %xmm5, %xmm0 # MC ch pshufb %xmm3, %xmm1 # 0 = sbdt pxor %xmm4, %xmm0 # 4 = ch movdqa 0x20(%r10),%xmm4 # 4 : sbbu pxor %xmm1, %xmm0 # 0 = ch movdqa 0x30(%r10),%xmm1 # 0 : sbbt pshufb %xmm2, %xmm4 # 4 = sbbu pshufb %xmm5, %xmm0 # MC ch pshufb %xmm3, %xmm1 # 0 = sbbt pxor %xmm0, %xmm4 # 4 = ch pxor %xmm4, %xmm1 # 0 = ch pxor %xmm4, %xmm0 # 4 = ch movdqa 0x40(%r10),%xmm4 # 4 : sbeu pshufb %xmm5, %xmm1 # MC ch movdqa 0x50(%r10),%xmm0 # 0 : sbet pxor %xmm1, %xmm0 # 0 = ch movdqa 0x50(%r10),%xmm1 # 0 : sbet pshufb %xmm2, %xmm4 # 4 = sbeu pshufb %xmm3, %xmm0 # 0 = sbet pshufb %xmm5, %xmm0 # MC ch pshufb %xmm3, %xmm1 # 0 = sbet pxor %xmm4, %xmm0 # 4 = ch add \$16, %r9 # next round key palignr \$12, %xmm5, %xmm5 pxor %xmm1, %xmm4 # 4 = ch pxor %xmm4, %xmm0 # 0 = ch pxor %xmm1, %xmm0 # 0 = ch sub \$1,%rax # nr-- .Ldec_entry: # top of round Loading