Loading crypto/aes/asm/aes-586.pl +135 −128 Original line number Diff line number Diff line Loading @@ -103,11 +103,12 @@ # byte for 128-bit key. # # ECB encrypt ECB decrypt CBC large chunk # P4 56[60] 84[100] 23 # AMD K8 48[44] 70[79] 18 # PIII 41[50] 61[91] 24 # Core 2 32[38] 45[70] 18.5 # Pentium 120 160 77 # P4 52[54] 83[95] 23 # AMD K8 46[41] 66[70] 18 # PIII 41[50] 60[77] 24 # Core 2 31[36] 45[64] 18.5 # Atom 76[100] 96[138] 60 # Pentium 115 150 77 # # Version 4.1 switches to compact S-box even in key schedule setup. # Loading Loading @@ -476,24 +477,25 @@ sub enctransform() my $tmp = $tbl; my $r2 = $key ; &mov ($acc,$s[$i]); &and ($acc,0x80808080); &mov ($tmp,$acc); &shr ($tmp,7); &and ($tmp,$s[$i]); &lea ($r2,&DWP(0,$s[$i],$s[$i])); &sub ($acc,$tmp); &mov ($acc,$tmp); &shr ($tmp,7); &and ($r2,0xfefefefe); &and ($acc,0x1b1b1b1b); &sub ($acc,$tmp); &mov ($tmp,$s[$i]); &and ($acc,0x1b1b1b1b); &rotr ($tmp,16); &xor ($acc,$r2); # r2 &mov ($r2,$s[$i]); &xor ($s[$i],$acc); # r0 ^ r2 &rotr ($r2,16+8); &xor ($acc,$tmp); &rotl ($s[$i],24); &xor ($acc,$r2); &mov ($tmp,0x80808080) if ($i!=1); &xor ($s[$i],$acc); # ROTATE(r2^r0,24) ^ r2 &rotr ($tmp,16); &xor ($s[$i],$tmp); &rotr ($tmp,8); &xor ($s[$i],$tmp); } &function_begin_B("_x86_AES_encrypt_compact"); Loading Loading @@ -526,6 +528,7 @@ sub enctransform() &enccompact(1,$tbl,$s1,$s2,$s3,$s0,1); &enccompact(2,$tbl,$s2,$s3,$s0,$s1,1); &enccompact(3,$tbl,$s3,$s0,$s1,$s2,1); &mov ($tbl,0x80808080); &enctransform(2); &enctransform(3); &enctransform(0); Loading Loading @@ -607,82 +610,84 @@ sub sse_enccompact() &pshufw ("mm5","mm4",0x0d); # 15,14,11,10 &movd ("eax","mm1"); # 5, 4, 1, 0 &movd ("ebx","mm5"); # 15,14,11,10 &mov ($__key,$key); &movz ($acc,&LB("eax")); # 0 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2 &movz ("edx",&HB("eax")); # 1 &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 &movz ($key,&LB("ebx")); # 10 &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1 &shl ("edx",8); # 1 &shr ("eax",16); # 5, 4 &shl ("edx",8); # 1 &movz ($acc,&LB("ebx")); # 10 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10 &movz ($acc,&BP(-128,$tbl,$key,1)); # 10 &movz ($key,&HB("ebx")); # 11 &shl ($acc,16); # 10 &or ("ecx",$acc); # 10 &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8 &movz ($acc,&HB("ebx")); # 11 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11 &or ("ecx",$acc); # 10 &movz ($acc,&BP(-128,$tbl,$key,1)); # 11 &movz ($key,&HB("eax")); # 5 &shl ($acc,24); # 11 &or ("edx",$acc); # 11 &shr ("ebx",16); # 15,14 &or ("edx",$acc); # 11 &movz ($acc,&HB("eax")); # 5 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 5 &movz ($acc,&BP(-128,$tbl,$key,1)); # 5 &movz ($key,&HB("ebx")); # 15 &shl ($acc,8); # 5 &or ("ecx",$acc); # 5 &movz ($acc,&HB("ebx")); # 15 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15 &movz ($acc,&BP(-128,$tbl,$key,1)); # 15 &movz ($key,&LB("eax")); # 4 &shl ($acc,24); # 15 &or ("ecx",$acc); # 15 &movd ("mm0","ecx"); # t[0] collected &movz ($acc,&LB("eax")); # 4 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 4 &movz ($acc,&BP(-128,$tbl,$key,1)); # 4 &movz ($key,&LB("ebx")); # 14 &movd ("eax","mm2"); # 7, 6, 3, 2 &movz ($acc,&LB("ebx")); # 14 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14 &shl ($acc,16); # 14 &movd ("mm0","ecx"); # t[0] collected &movz ("ecx",&BP(-128,$tbl,$key,1)); # 14 &movz ($key,&HB("eax")); # 3 &shl ("ecx",16); # 14 &movd ("ebx","mm6"); # 13,12, 9, 8 &or ("ecx",$acc); # 14 &movd ("ebx","mm6"); # 13,12, 9, 8 &movz ($acc,&HB("eax")); # 3 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 3 &movz ($acc,&BP(-128,$tbl,$key,1)); # 3 &movz ($key,&HB("ebx")); # 9 &shl ($acc,24); # 3 &or ("ecx",$acc); # 3 &movz ($acc,&HB("ebx")); # 9 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9 &movz ($acc,&BP(-128,$tbl,$key,1)); # 9 &movz ($key,&LB("ebx")); # 8 &shl ($acc,8); # 9 &shr ("ebx",16); # 13,12 &or ("ecx",$acc); # 9 &movd ("mm1","ecx"); # t[1] collected &movz ($acc,&LB("ebx")); # 8 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 8 &shr ("ebx",16); # 13,12 &movz ($acc,&LB("eax")); # 2 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2 &shl ($acc,16); # 2 &or ("ecx",$acc); # 2 &movz ($acc,&BP(-128,$tbl,$key,1)); # 8 &movz ($key,&LB("eax")); # 2 &shr ("eax",16); # 7, 6 &movd ("mm1","ecx"); # t[1] collected &movz ("ecx",&BP(-128,$tbl,$key,1)); # 2 &movz ($key,&HB("eax")); # 7 &shl ("ecx",16); # 2 &and ("eax",0xff); # 6 &or ("ecx",$acc); # 2 &punpckldq ("mm0","mm1"); # t[0,1] collected &movz ($acc,&HB("eax")); # 7 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7 &movz ($acc,&BP(-128,$tbl,$key,1)); # 7 &movz ($key,&HB("ebx")); # 13 &shl ($acc,24); # 7 &or ("ecx",$acc); # 7 &and ("eax",0xff); # 6 &and ("ebx",0xff); # 12 &movz ("eax",&BP(-128,$tbl,"eax",1)); # 6 &or ("ecx",$acc); # 7 &shl ("eax",16); # 6 &movz ($acc,&BP(-128,$tbl,$key,1)); # 13 &or ("edx","eax"); # 6 &movz ($acc,&HB("ebx")); # 13 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13 &shl ($acc,8); # 13 &or ("ecx",$acc); # 13 &movd ("mm4","ecx"); # t[2] collected &and ("ebx",0xff); # 12 &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12 &or ("ecx",$acc); # 13 &or ("edx","ebx"); # 12 &mov ($key,$__key); &movd ("mm4","ecx"); # t[2] collected &movd ("mm5","edx"); # t[3] collected &punpckldq ("mm4","mm5"); # t[2,3] collected Loading Loading @@ -1270,30 +1275,30 @@ sub dectransform() my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1); my $tp8 = $tbl; &mov ($acc,$s[$i]); &and ($acc,0x80808080); &mov ($tmp,$acc); &mov ($tmp,0x80808080); &and ($tmp,$s[$i]); &mov ($acc,$tmp); &shr ($tmp,7); &lea ($tp2,&DWP(0,$s[$i],$s[$i])); &sub ($acc,$tmp); &and ($tp2,0xfefefefe); &and ($acc,0x1b1b1b1b); &xor ($acc,$tp2); &mov ($tp2,$acc); &xor ($tp2,$acc); &mov ($tmp,0x80808080); &and ($acc,0x80808080); &mov ($tmp,$acc); &and ($tmp,$tp2); &mov ($acc,$tmp); &shr ($tmp,7); &lea ($tp4,&DWP(0,$tp2,$tp2)); &sub ($acc,$tmp); &and ($tp4,0xfefefefe); &and ($acc,0x1b1b1b1b); &xor ($tp2,$s[$i]); # tp2^tp1 &xor ($acc,$tp4); &mov ($tp4,$acc); &xor ($tp4,$acc); &mov ($tmp,0x80808080); &and ($acc,0x80808080); &mov ($tmp,$acc); &and ($tmp,$tp4); &mov ($acc,$tmp); &shr ($tmp,7); &lea ($tp8,&DWP(0,$tp4,$tp4)); &sub ($acc,$tmp); Loading @@ -1305,13 +1310,13 @@ sub dectransform() &xor ($s[$i],$tp2); &xor ($tp2,$tp8); &rotl ($tp2,24); &xor ($s[$i],$tp4); &xor ($tp4,$tp8); &rotl ($tp4,16); &rotl ($tp2,24); &xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1) &rotl ($tp8,8); &rotl ($tp4,16); &xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24) &rotl ($tp8,8); &xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16) &mov ($s[0],$__s0) if($i==2); #prefetch $s0 &mov ($s[1],$__s1) if($i==3); #prefetch $s1 Loading Loading @@ -1389,85 +1394,87 @@ sub dectransform() sub sse_deccompact() { &pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0 &pshufw ("mm5","mm4",0x09); # 13,12,11,10 &movd ("eax","mm1"); # 7, 6, 1, 0 &movd ("ebx","mm5"); # 13,12,11,10 &mov ($__key,$key); &pshufw ("mm5","mm4",0x09); # 13,12,11,10 &movz ($acc,&LB("eax")); # 0 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 &movd ("ebx","mm5"); # 13,12,11,10 &movz ("edx",&HB("eax")); # 1 &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 &movz ($key,&LB("ebx")); # 10 &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1 &shr ("eax",16); # 7, 6 &shl ("edx",8); # 1 &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4 &movz ($acc,&LB("ebx")); # 10 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10 &movz ($acc,&BP(-128,$tbl,$key,1)); # 10 &movz ($key,&HB("ebx")); # 11 &shl ($acc,16); # 10 &pshufw ("mm6","mm4",0x03); # 9, 8,15,14 &or ("ecx",$acc); # 10 &shr ("eax",16); # 7, 6 &movz ($acc,&HB("ebx")); # 11 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11 &movz ($acc,&BP(-128,$tbl,$key,1)); # 11 &movz ($key,&HB("eax")); # 7 &shl ($acc,24); # 11 &or ("edx",$acc); # 11 &shr ("ebx",16); # 13,12 &or ("edx",$acc); # 11 &pshufw ("mm6","mm4",0x03); # 9, 8,15,14 &movz ($acc,&HB("eax")); # 7 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7 &movz ($acc,&BP(-128,$tbl,$key,1)); # 7 &movz ($key,&HB("ebx")); # 13 &shl ($acc,24); # 7 &or ("ecx",$acc); # 7 &movz ($acc,&HB("ebx")); # 13 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13 &movz ($acc,&BP(-128,$tbl,$key,1)); # 13 &movz ($key,&LB("eax")); # 6 &shl ($acc,8); # 13 &movd ("eax","mm2"); # 3, 2, 5, 4 &or ("ecx",$acc); # 13 &movd ("mm0","ecx"); # t[0] collected &movz ($acc,&LB("eax")); # 6 &movd ("eax","mm2"); # 3, 2, 5, 4 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 6 &shl ("ecx",16); # 6 &movz ($acc,&LB("ebx")); # 12 &movz ($acc,&BP(-128,$tbl,$key,1)); # 6 &movz ($key,&LB("ebx")); # 12 &shl ($acc,16); # 6 &movd ("ebx","mm6"); # 9, 8,15,14 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 12 &movd ("mm0","ecx"); # t[0] collected &movz ("ecx",&BP(-128,$tbl,$key,1)); # 12 &movz ($key,&LB("eax")); # 4 &or ("ecx",$acc); # 12 &movz ($acc,&LB("eax")); # 4 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 4 &movz ($acc,&BP(-128,$tbl,$key,1)); # 4 &movz ($key,&LB("ebx")); # 14 &or ("edx",$acc); # 4 &movz ($acc,&LB("ebx")); # 14 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14 &movz ($acc,&BP(-128,$tbl,$key,1)); # 14 &movz ($key,&HB("eax")); # 5 &shl ($acc,16); # 14 &shr ("eax",16); # 3, 2 &or ("edx",$acc); # 14 &movd ("mm1","edx"); # t[1] collected &movz ($acc,&HB("eax")); # 5 &movz ("edx",&BP(-128,$tbl,$acc,1)); # 5 &shl ("edx",8); # 5 &movz ($acc,&HB("ebx")); # 15 &shr ("eax",16); # 3, 2 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15 &shl ($acc,24); # 15 &or ("edx",$acc); # 15 &movz ($acc,&BP(-128,$tbl,$key,1)); # 5 &movz ($key,&HB("ebx")); # 15 &shr ("ebx",16); # 9, 8 &shl ($acc,8); # 5 &movd ("mm1","edx"); # t[1] collected &movz ("edx",&BP(-128,$tbl,$key,1)); # 15 &movz ($key,&HB("ebx")); # 9 &shl ("edx",24); # 15 &and ("ebx",0xff); # 8 &or ("edx",$acc); # 15 &punpckldq ("mm0","mm1"); # t[0,1] collected &movz ($acc,&HB("ebx")); # 9 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9 &movz ($acc,&BP(-128,$tbl,$key,1)); # 9 &movz ($key,&LB("eax")); # 2 &shl ($acc,8); # 9 &or ("ecx",$acc); # 9 &and ("ebx",0xff); # 8 &movz ("eax",&HB("eax")); # 3 &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8 &or ("ecx",$acc); # 9 &movz ($acc,&BP(-128,$tbl,$key,1)); # 2 &or ("edx","ebx"); # 8 &movz ($acc,&LB("eax")); # 2 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2 &shl ($acc,16); # 2 &or ("edx",$acc); # 2 &movd ("mm4","edx"); # t[2] collected &movz ("eax",&HB("eax")); # 3 &movz ("eax",&BP(-128,$tbl,"eax",1)); # 3 &or ("edx",$acc); # 2 &shl ("eax",24); # 3 &or ("ecx","eax"); # 3 &mov ($key,$__key); &movd ("mm4","edx"); # t[2] collected &movd ("mm5","ecx"); # t[3] collected &punpckldq ("mm4","mm5"); # t[2,3] collected Loading Loading @@ -2865,32 +2872,32 @@ sub deckey() { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_; my $tmp = $tbl; &mov ($acc,$tp1); &and ($acc,0x80808080); &mov ($tmp,$acc); &shr ($tmp,7); &mov ($tmp,0x80808080); &and ($tmp,$tp1); &lea ($tp2,&DWP(0,$tp1,$tp1)); &mov ($acc,$tmp); &shr ($tmp,7); &sub ($acc,$tmp); &and ($tp2,0xfefefefe); &and ($acc,0x1b1b1b1b); &xor ($acc,$tp2); &mov ($tp2,$acc); &xor ($tp2,$acc); &mov ($tmp,0x80808080); &and ($acc,0x80808080); &mov ($tmp,$acc); &shr ($tmp,7); &and ($tmp,$tp2); &lea ($tp4,&DWP(0,$tp2,$tp2)); &mov ($acc,$tmp); &shr ($tmp,7); &sub ($acc,$tmp); &and ($tp4,0xfefefefe); &and ($acc,0x1b1b1b1b); &xor ($tp2,$tp1); # tp2^tp1 &xor ($acc,$tp4); &mov ($tp4,$acc); &xor ($tp4,$acc); &mov ($tmp,0x80808080); &and ($acc,0x80808080); &mov ($tmp,$acc); &shr ($tmp,7); &and ($tmp,$tp4); &lea ($tp8,&DWP(0,$tp4,$tp4)); &mov ($acc,$tmp); &shr ($tmp,7); &xor ($tp4,$tp1); # tp4^tp1 &sub ($acc,$tmp); &and ($tp8,0xfefefefe); Loading crypto/aes/asm/vpaes-x86.pl +45 −46 Original line number Diff line number Diff line Loading @@ -27,9 +27,9 @@ # # aes-586.pl vpaes-x86.pl # # Core 2(**) 29.1/42.3/18.3 22.0/25.6(***) # Nehalem 27.9/40.4/18.1 10.3/12.0 # Atom 102./119./60.1 64.5/85.3(***) # Core 2(**) 28.1/41.4/18.3 21.9/25.2(***) # Nehalem 27.9/40.4/18.1 10.2/11.9 # Atom 70.7/92.1/60.1 61.1/81.0(***) # # (*) "Hyper-threading" in the context refers rather to cache shared # among multiple cores, than to specifically Intel HTT. As vast Loading @@ -40,8 +40,8 @@ # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. # # (***) Less impressive improvement on Core 2 and Atom is due to slow # pshufb, yet it's respectable +32%/65% improvement on Core 2 # and +58%/40% on Atom (as implied, over "hyper-threading-safe" # pshufb, yet it's respectable +28%/64% improvement on Core 2 # and +15% on Atom (as implied, over "hyper-threading-safe" # code path). # # <appro@openssl.org> Loading Loading @@ -183,35 +183,35 @@ $k_dsbo=0x2c0; # decryption sbox final output &movdqa ("xmm1","xmm6") &movdqa ("xmm2",&QWP($k_ipt,$const)); &pandn ("xmm1","xmm0"); &movdqu ("xmm5",&QWP(0,$key)); &psrld ("xmm1",4); &pand ("xmm0","xmm6"); &movdqu ("xmm5",&QWP(0,$key)); &pshufb ("xmm2","xmm0"); &movdqa ("xmm0",&QWP($k_ipt+16,$const)); &pshufb ("xmm0","xmm1"); &pxor ("xmm2","xmm5"); &pxor ("xmm0","xmm2"); &psrld ("xmm1",4); &add ($key,16); &pshufb ("xmm0","xmm1"); &lea ($base,&DWP($k_mc_backward,$const)); &pxor ("xmm0","xmm2"); &jmp (&label("enc_entry")); &set_label("enc_loop",16); # middle of middle round &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u &pshufb ("xmm4","xmm2"); # 4 = sb1u &pxor ("xmm4","xmm5"); # 4 = sb1u + k &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t &pshufb ("xmm4","xmm2"); # 4 = sb1u &pshufb ("xmm0","xmm3"); # 0 = sb1t &pxor ("xmm0","xmm4"); # 0 = A &pxor ("xmm4","xmm5"); # 4 = sb1u + k &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u &pshufb ("xmm5","xmm2"); # 4 = sb2u &pxor ("xmm0","xmm4"); # 0 = A &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[] &pshufb ("xmm5","xmm2"); # 4 = sb2u &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t &pshufb ("xmm2","xmm3"); # 2 = sb2t &pxor ("xmm2","xmm5"); # 2 = 2A &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[] &pshufb ("xmm2","xmm3"); # 2 = sb2t &movdqa ("xmm3","xmm0"); # 3 = A &pxor ("xmm2","xmm5"); # 2 = 2A &pshufb ("xmm0","xmm1"); # 0 = B &add ($key,16); # next key &pxor ("xmm0","xmm2"); # 0 = 2A+B Loading @@ -220,30 +220,30 @@ $k_dsbo=0x2c0; # decryption sbox final output &pxor ("xmm3","xmm0"); # 3 = 2A+B+D &pshufb ("xmm0","xmm1"); # 0 = 2B+C &and ($magic,0x30); # ... mod 4 &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D &sub ($round,1); # nr-- &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D &set_label("enc_entry"); # top of round &movdqa ("xmm1","xmm6"); # 1 : i &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k &pandn ("xmm1","xmm0"); # 1 = i<<4 &psrld ("xmm1",4); # 1 = i &pand ("xmm0","xmm6"); # 0 = k &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k &pshufb ("xmm5","xmm0"); # 2 = a/k &pxor ("xmm0","xmm1"); # 0 = j &movdqa ("xmm3","xmm7"); # 3 : 1/i &pxor ("xmm0","xmm1"); # 0 = j &pshufb ("xmm3","xmm1"); # 3 = 1/i &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k &movdqa ("xmm4","xmm7"); # 4 : 1/j &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k &pshufb ("xmm4","xmm0"); # 4 = 1/j &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k &movdqa ("xmm2","xmm7"); # 2 : 1/iak &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k &pshufb ("xmm2","xmm3"); # 2 = 1/iak &pxor ("xmm2","xmm0"); # 2 = io &movdqa ("xmm3","xmm7"); # 3 : 1/jak &movdqu ("xmm5",&QWP(0,$key)); &pxor ("xmm2","xmm0"); # 2 = io &pshufb ("xmm3","xmm4"); # 3 = 1/jak &movdqu ("xmm5",&QWP(0,$key)); &pxor ("xmm3","xmm1"); # 3 = jo &jnz (&label("enc_loop")); Loading @@ -265,8 +265,8 @@ $k_dsbo=0x2c0; # decryption sbox final output ## Same API as encryption core. ## &function_begin_B("_vpaes_decrypt_core"); &mov ($round,&DWP(240,$key)); &lea ($base,&DWP($k_dsbd,$const)); &mov ($round,&DWP(240,$key)); &movdqa ("xmm1","xmm6"); &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base)); &pandn ("xmm1","xmm0"); Loading @@ -292,62 +292,61 @@ $k_dsbo=0x2c0; # decryption sbox final output ## Inverse mix columns ## &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u &movdqa ("xmm1",&QWP(-0x10,$base)); # 0 : sb9t &pshufb ("xmm4","xmm2"); # 4 = sb9u &pshufb ("xmm1","xmm3"); # 0 = sb9t &pxor ("xmm4","xmm0"); &movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t &pshufb ("xmm0","xmm3"); # 0 = sb9t &pxor ("xmm0","xmm4"); # 0 = ch &add ($key,16); # next round key &pxor ("xmm1","xmm4"); # 0 = ch &pshufb ("xmm0","xmm5"); # MC ch &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu &pshufb ("xmm1","xmm5"); # MC ch &pshufb ("xmm4","xmm2"); # 4 = sbdu &pxor ("xmm4","xmm0"); # 4 = ch &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt &pxor ("xmm4","xmm1"); # 4 = ch &pshufb ("xmm0","xmm3"); # 0 = sbdt &pxor ("xmm0","xmm4"); # 0 = ch &sub ($round,1); # nr-- &pxor ("xmm0","xmm4"); # 0 = ch &pshufb ("xmm0","xmm5"); # MC ch &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu &pshufb ("xmm0","xmm5"); # MC ch &movdqa ("xmm1",&QWP(0x30,$base)); # 0 : sbbt &pshufb ("xmm4","xmm2"); # 4 = sbbu &pshufb ("xmm1","xmm3"); # 0 = sbbt &pxor ("xmm4","xmm0"); # 4 = ch &movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt &pshufb ("xmm0","xmm3"); # 0 = sbbt &pxor ("xmm0","xmm4"); # 0 = ch &pxor ("xmm1","xmm4"); # 0 = ch &pshufb ("xmm0","xmm5"); # MC ch &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu &pshufb ("xmm4","xmm2"); # 4 = sbeu &pxor ("xmm4","xmm0"); # 4 = ch &pshufb ("xmm1","xmm5"); # MC ch &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet &pshufb ("xmm4","xmm2"); # 4 = sbeu &pshufb ("xmm0","xmm3"); # 0 = sbet &pxor ("xmm0","xmm4"); # 0 = ch &palignr("xmm5","xmm5",12); &pxor ("xmm4","xmm1"); # 4 = ch &pxor ("xmm0","xmm4"); # 0 = ch &set_label("dec_entry"); # top of round &movdqa ("xmm1","xmm6"); # 1 : i &pandn ("xmm1","xmm0"); # 1 = i<<4 &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k &psrld ("xmm1",4); # 1 = i &pand ("xmm0","xmm6"); # 0 = k &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k &pshufb ("xmm2","xmm0"); # 2 = a/k &pxor ("xmm0","xmm1"); # 0 = j &movdqa ("xmm3","xmm7"); # 3 : 1/i &pxor ("xmm0","xmm1"); # 0 = j &pshufb ("xmm3","xmm1"); # 3 = 1/i &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k &movdqa ("xmm4","xmm7"); # 4 : 1/j &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k &pshufb ("xmm4","xmm0"); # 4 = 1/j &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k &movdqa ("xmm2","xmm7"); # 2 : 1/iak &pshufb ("xmm2","xmm3"); # 2 = 1/iak &pxor ("xmm2","xmm0"); # 2 = io &movdqa ("xmm3","xmm7"); # 3 : 1/jak &pxor ("xmm2","xmm0"); # 2 = io &pshufb ("xmm3","xmm4"); # 3 = 1/jak &pxor ("xmm3","xmm1"); # 3 = jo &movdqu ("xmm0",&QWP(0,$key)); &pxor ("xmm3","xmm1"); # 3 = jo &jnz (&label("dec_loop")); # middle of last round Loading Loading @@ -542,12 +541,12 @@ $k_dsbo=0x2c0; # decryption sbox final output ## %xmm0: b+c+d b+c b a ## &function_begin_B("_vpaes_schedule_192_smear"); &pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0 &pxor ("xmm6","xmm0"); # -> c+d c 0 0 &pshufd ("xmm1","xmm6",0x80); # d c 0 0 -> c 0 0 0 &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a &pxor ("xmm6","xmm1"); # -> c+d c 0 0 &pxor ("xmm1","xmm1"); &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a &movdqa ("xmm0","xmm6"); &pxor ("xmm1","xmm1"); &movhlps("xmm6","xmm1"); # clobber low side with zeros &ret (); &function_end_B("_vpaes_schedule_192_smear"); Loading Loading
crypto/aes/asm/aes-586.pl +135 −128 Original line number Diff line number Diff line Loading @@ -103,11 +103,12 @@ # byte for 128-bit key. # # ECB encrypt ECB decrypt CBC large chunk # P4 56[60] 84[100] 23 # AMD K8 48[44] 70[79] 18 # PIII 41[50] 61[91] 24 # Core 2 32[38] 45[70] 18.5 # Pentium 120 160 77 # P4 52[54] 83[95] 23 # AMD K8 46[41] 66[70] 18 # PIII 41[50] 60[77] 24 # Core 2 31[36] 45[64] 18.5 # Atom 76[100] 96[138] 60 # Pentium 115 150 77 # # Version 4.1 switches to compact S-box even in key schedule setup. # Loading Loading @@ -476,24 +477,25 @@ sub enctransform() my $tmp = $tbl; my $r2 = $key ; &mov ($acc,$s[$i]); &and ($acc,0x80808080); &mov ($tmp,$acc); &shr ($tmp,7); &and ($tmp,$s[$i]); &lea ($r2,&DWP(0,$s[$i],$s[$i])); &sub ($acc,$tmp); &mov ($acc,$tmp); &shr ($tmp,7); &and ($r2,0xfefefefe); &and ($acc,0x1b1b1b1b); &sub ($acc,$tmp); &mov ($tmp,$s[$i]); &and ($acc,0x1b1b1b1b); &rotr ($tmp,16); &xor ($acc,$r2); # r2 &mov ($r2,$s[$i]); &xor ($s[$i],$acc); # r0 ^ r2 &rotr ($r2,16+8); &xor ($acc,$tmp); &rotl ($s[$i],24); &xor ($acc,$r2); &mov ($tmp,0x80808080) if ($i!=1); &xor ($s[$i],$acc); # ROTATE(r2^r0,24) ^ r2 &rotr ($tmp,16); &xor ($s[$i],$tmp); &rotr ($tmp,8); &xor ($s[$i],$tmp); } &function_begin_B("_x86_AES_encrypt_compact"); Loading Loading @@ -526,6 +528,7 @@ sub enctransform() &enccompact(1,$tbl,$s1,$s2,$s3,$s0,1); &enccompact(2,$tbl,$s2,$s3,$s0,$s1,1); &enccompact(3,$tbl,$s3,$s0,$s1,$s2,1); &mov ($tbl,0x80808080); &enctransform(2); &enctransform(3); &enctransform(0); Loading Loading @@ -607,82 +610,84 @@ sub sse_enccompact() &pshufw ("mm5","mm4",0x0d); # 15,14,11,10 &movd ("eax","mm1"); # 5, 4, 1, 0 &movd ("ebx","mm5"); # 15,14,11,10 &mov ($__key,$key); &movz ($acc,&LB("eax")); # 0 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2 &movz ("edx",&HB("eax")); # 1 &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 &movz ($key,&LB("ebx")); # 10 &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1 &shl ("edx",8); # 1 &shr ("eax",16); # 5, 4 &shl ("edx",8); # 1 &movz ($acc,&LB("ebx")); # 10 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10 &movz ($acc,&BP(-128,$tbl,$key,1)); # 10 &movz ($key,&HB("ebx")); # 11 &shl ($acc,16); # 10 &or ("ecx",$acc); # 10 &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8 &movz ($acc,&HB("ebx")); # 11 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11 &or ("ecx",$acc); # 10 &movz ($acc,&BP(-128,$tbl,$key,1)); # 11 &movz ($key,&HB("eax")); # 5 &shl ($acc,24); # 11 &or ("edx",$acc); # 11 &shr ("ebx",16); # 15,14 &or ("edx",$acc); # 11 &movz ($acc,&HB("eax")); # 5 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 5 &movz ($acc,&BP(-128,$tbl,$key,1)); # 5 &movz ($key,&HB("ebx")); # 15 &shl ($acc,8); # 5 &or ("ecx",$acc); # 5 &movz ($acc,&HB("ebx")); # 15 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15 &movz ($acc,&BP(-128,$tbl,$key,1)); # 15 &movz ($key,&LB("eax")); # 4 &shl ($acc,24); # 15 &or ("ecx",$acc); # 15 &movd ("mm0","ecx"); # t[0] collected &movz ($acc,&LB("eax")); # 4 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 4 &movz ($acc,&BP(-128,$tbl,$key,1)); # 4 &movz ($key,&LB("ebx")); # 14 &movd ("eax","mm2"); # 7, 6, 3, 2 &movz ($acc,&LB("ebx")); # 14 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14 &shl ($acc,16); # 14 &movd ("mm0","ecx"); # t[0] collected &movz ("ecx",&BP(-128,$tbl,$key,1)); # 14 &movz ($key,&HB("eax")); # 3 &shl ("ecx",16); # 14 &movd ("ebx","mm6"); # 13,12, 9, 8 &or ("ecx",$acc); # 14 &movd ("ebx","mm6"); # 13,12, 9, 8 &movz ($acc,&HB("eax")); # 3 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 3 &movz ($acc,&BP(-128,$tbl,$key,1)); # 3 &movz ($key,&HB("ebx")); # 9 &shl ($acc,24); # 3 &or ("ecx",$acc); # 3 &movz ($acc,&HB("ebx")); # 9 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9 &movz ($acc,&BP(-128,$tbl,$key,1)); # 9 &movz ($key,&LB("ebx")); # 8 &shl ($acc,8); # 9 &shr ("ebx",16); # 13,12 &or ("ecx",$acc); # 9 &movd ("mm1","ecx"); # t[1] collected &movz ($acc,&LB("ebx")); # 8 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 8 &shr ("ebx",16); # 13,12 &movz ($acc,&LB("eax")); # 2 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2 &shl ($acc,16); # 2 &or ("ecx",$acc); # 2 &movz ($acc,&BP(-128,$tbl,$key,1)); # 8 &movz ($key,&LB("eax")); # 2 &shr ("eax",16); # 7, 6 &movd ("mm1","ecx"); # t[1] collected &movz ("ecx",&BP(-128,$tbl,$key,1)); # 2 &movz ($key,&HB("eax")); # 7 &shl ("ecx",16); # 2 &and ("eax",0xff); # 6 &or ("ecx",$acc); # 2 &punpckldq ("mm0","mm1"); # t[0,1] collected &movz ($acc,&HB("eax")); # 7 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7 &movz ($acc,&BP(-128,$tbl,$key,1)); # 7 &movz ($key,&HB("ebx")); # 13 &shl ($acc,24); # 7 &or ("ecx",$acc); # 7 &and ("eax",0xff); # 6 &and ("ebx",0xff); # 12 &movz ("eax",&BP(-128,$tbl,"eax",1)); # 6 &or ("ecx",$acc); # 7 &shl ("eax",16); # 6 &movz ($acc,&BP(-128,$tbl,$key,1)); # 13 &or ("edx","eax"); # 6 &movz ($acc,&HB("ebx")); # 13 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13 &shl ($acc,8); # 13 &or ("ecx",$acc); # 13 &movd ("mm4","ecx"); # t[2] collected &and ("ebx",0xff); # 12 &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12 &or ("ecx",$acc); # 13 &or ("edx","ebx"); # 12 &mov ($key,$__key); &movd ("mm4","ecx"); # t[2] collected &movd ("mm5","edx"); # t[3] collected &punpckldq ("mm4","mm5"); # t[2,3] collected Loading Loading @@ -1270,30 +1275,30 @@ sub dectransform() my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1); my $tp8 = $tbl; &mov ($acc,$s[$i]); &and ($acc,0x80808080); &mov ($tmp,$acc); &mov ($tmp,0x80808080); &and ($tmp,$s[$i]); &mov ($acc,$tmp); &shr ($tmp,7); &lea ($tp2,&DWP(0,$s[$i],$s[$i])); &sub ($acc,$tmp); &and ($tp2,0xfefefefe); &and ($acc,0x1b1b1b1b); &xor ($acc,$tp2); &mov ($tp2,$acc); &xor ($tp2,$acc); &mov ($tmp,0x80808080); &and ($acc,0x80808080); &mov ($tmp,$acc); &and ($tmp,$tp2); &mov ($acc,$tmp); &shr ($tmp,7); &lea ($tp4,&DWP(0,$tp2,$tp2)); &sub ($acc,$tmp); &and ($tp4,0xfefefefe); &and ($acc,0x1b1b1b1b); &xor ($tp2,$s[$i]); # tp2^tp1 &xor ($acc,$tp4); &mov ($tp4,$acc); &xor ($tp4,$acc); &mov ($tmp,0x80808080); &and ($acc,0x80808080); &mov ($tmp,$acc); &and ($tmp,$tp4); &mov ($acc,$tmp); &shr ($tmp,7); &lea ($tp8,&DWP(0,$tp4,$tp4)); &sub ($acc,$tmp); Loading @@ -1305,13 +1310,13 @@ sub dectransform() &xor ($s[$i],$tp2); &xor ($tp2,$tp8); &rotl ($tp2,24); &xor ($s[$i],$tp4); &xor ($tp4,$tp8); &rotl ($tp4,16); &rotl ($tp2,24); &xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1) &rotl ($tp8,8); &rotl ($tp4,16); &xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24) &rotl ($tp8,8); &xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16) &mov ($s[0],$__s0) if($i==2); #prefetch $s0 &mov ($s[1],$__s1) if($i==3); #prefetch $s1 Loading Loading @@ -1389,85 +1394,87 @@ sub dectransform() sub sse_deccompact() { &pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0 &pshufw ("mm5","mm4",0x09); # 13,12,11,10 &movd ("eax","mm1"); # 7, 6, 1, 0 &movd ("ebx","mm5"); # 13,12,11,10 &mov ($__key,$key); &pshufw ("mm5","mm4",0x09); # 13,12,11,10 &movz ($acc,&LB("eax")); # 0 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 &movd ("ebx","mm5"); # 13,12,11,10 &movz ("edx",&HB("eax")); # 1 &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 &movz ($key,&LB("ebx")); # 10 &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1 &shr ("eax",16); # 7, 6 &shl ("edx",8); # 1 &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4 &movz ($acc,&LB("ebx")); # 10 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10 &movz ($acc,&BP(-128,$tbl,$key,1)); # 10 &movz ($key,&HB("ebx")); # 11 &shl ($acc,16); # 10 &pshufw ("mm6","mm4",0x03); # 9, 8,15,14 &or ("ecx",$acc); # 10 &shr ("eax",16); # 7, 6 &movz ($acc,&HB("ebx")); # 11 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11 &movz ($acc,&BP(-128,$tbl,$key,1)); # 11 &movz ($key,&HB("eax")); # 7 &shl ($acc,24); # 11 &or ("edx",$acc); # 11 &shr ("ebx",16); # 13,12 &or ("edx",$acc); # 11 &pshufw ("mm6","mm4",0x03); # 9, 8,15,14 &movz ($acc,&HB("eax")); # 7 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7 &movz ($acc,&BP(-128,$tbl,$key,1)); # 7 &movz ($key,&HB("ebx")); # 13 &shl ($acc,24); # 7 &or ("ecx",$acc); # 7 &movz ($acc,&HB("ebx")); # 13 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13 &movz ($acc,&BP(-128,$tbl,$key,1)); # 13 &movz ($key,&LB("eax")); # 6 &shl ($acc,8); # 13 &movd ("eax","mm2"); # 3, 2, 5, 4 &or ("ecx",$acc); # 13 &movd ("mm0","ecx"); # t[0] collected &movz ($acc,&LB("eax")); # 6 &movd ("eax","mm2"); # 3, 2, 5, 4 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 6 &shl ("ecx",16); # 6 &movz ($acc,&LB("ebx")); # 12 &movz ($acc,&BP(-128,$tbl,$key,1)); # 6 &movz ($key,&LB("ebx")); # 12 &shl ($acc,16); # 6 &movd ("ebx","mm6"); # 9, 8,15,14 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 12 &movd ("mm0","ecx"); # t[0] collected &movz ("ecx",&BP(-128,$tbl,$key,1)); # 12 &movz ($key,&LB("eax")); # 4 &or ("ecx",$acc); # 12 &movz ($acc,&LB("eax")); # 4 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 4 &movz ($acc,&BP(-128,$tbl,$key,1)); # 4 &movz ($key,&LB("ebx")); # 14 &or ("edx",$acc); # 4 &movz ($acc,&LB("ebx")); # 14 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14 &movz ($acc,&BP(-128,$tbl,$key,1)); # 14 &movz ($key,&HB("eax")); # 5 &shl ($acc,16); # 14 &shr ("eax",16); # 3, 2 &or ("edx",$acc); # 14 &movd ("mm1","edx"); # t[1] collected &movz ($acc,&HB("eax")); # 5 &movz ("edx",&BP(-128,$tbl,$acc,1)); # 5 &shl ("edx",8); # 5 &movz ($acc,&HB("ebx")); # 15 &shr ("eax",16); # 3, 2 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15 &shl ($acc,24); # 15 &or ("edx",$acc); # 15 &movz ($acc,&BP(-128,$tbl,$key,1)); # 5 &movz ($key,&HB("ebx")); # 15 &shr ("ebx",16); # 9, 8 &shl ($acc,8); # 5 &movd ("mm1","edx"); # t[1] collected &movz ("edx",&BP(-128,$tbl,$key,1)); # 15 &movz ($key,&HB("ebx")); # 9 &shl ("edx",24); # 15 &and ("ebx",0xff); # 8 &or ("edx",$acc); # 15 &punpckldq ("mm0","mm1"); # t[0,1] collected &movz ($acc,&HB("ebx")); # 9 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9 &movz ($acc,&BP(-128,$tbl,$key,1)); # 9 &movz ($key,&LB("eax")); # 2 &shl ($acc,8); # 9 &or ("ecx",$acc); # 9 &and ("ebx",0xff); # 8 &movz ("eax",&HB("eax")); # 3 &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8 &or ("ecx",$acc); # 9 &movz ($acc,&BP(-128,$tbl,$key,1)); # 2 &or ("edx","ebx"); # 8 &movz ($acc,&LB("eax")); # 2 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2 &shl ($acc,16); # 2 &or ("edx",$acc); # 2 &movd ("mm4","edx"); # t[2] collected &movz ("eax",&HB("eax")); # 3 &movz ("eax",&BP(-128,$tbl,"eax",1)); # 3 &or ("edx",$acc); # 2 &shl ("eax",24); # 3 &or ("ecx","eax"); # 3 &mov ($key,$__key); &movd ("mm4","edx"); # t[2] collected &movd ("mm5","ecx"); # t[3] collected &punpckldq ("mm4","mm5"); # t[2,3] collected Loading Loading @@ -2865,32 +2872,32 @@ sub deckey() { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_; my $tmp = $tbl; &mov ($acc,$tp1); &and ($acc,0x80808080); &mov ($tmp,$acc); &shr ($tmp,7); &mov ($tmp,0x80808080); &and ($tmp,$tp1); &lea ($tp2,&DWP(0,$tp1,$tp1)); &mov ($acc,$tmp); &shr ($tmp,7); &sub ($acc,$tmp); &and ($tp2,0xfefefefe); &and ($acc,0x1b1b1b1b); &xor ($acc,$tp2); &mov ($tp2,$acc); &xor ($tp2,$acc); &mov ($tmp,0x80808080); &and ($acc,0x80808080); &mov ($tmp,$acc); &shr ($tmp,7); &and ($tmp,$tp2); &lea ($tp4,&DWP(0,$tp2,$tp2)); &mov ($acc,$tmp); &shr ($tmp,7); &sub ($acc,$tmp); &and ($tp4,0xfefefefe); &and ($acc,0x1b1b1b1b); &xor ($tp2,$tp1); # tp2^tp1 &xor ($acc,$tp4); &mov ($tp4,$acc); &xor ($tp4,$acc); &mov ($tmp,0x80808080); &and ($acc,0x80808080); &mov ($tmp,$acc); &shr ($tmp,7); &and ($tmp,$tp4); &lea ($tp8,&DWP(0,$tp4,$tp4)); &mov ($acc,$tmp); &shr ($tmp,7); &xor ($tp4,$tp1); # tp4^tp1 &sub ($acc,$tmp); &and ($tp8,0xfefefefe); Loading
crypto/aes/asm/vpaes-x86.pl +45 −46 Original line number Diff line number Diff line Loading @@ -27,9 +27,9 @@ # # aes-586.pl vpaes-x86.pl # # Core 2(**) 29.1/42.3/18.3 22.0/25.6(***) # Nehalem 27.9/40.4/18.1 10.3/12.0 # Atom 102./119./60.1 64.5/85.3(***) # Core 2(**) 28.1/41.4/18.3 21.9/25.2(***) # Nehalem 27.9/40.4/18.1 10.2/11.9 # Atom 70.7/92.1/60.1 61.1/81.0(***) # # (*) "Hyper-threading" in the context refers rather to cache shared # among multiple cores, than to specifically Intel HTT. As vast Loading @@ -40,8 +40,8 @@ # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. # # (***) Less impressive improvement on Core 2 and Atom is due to slow # pshufb, yet it's respectable +32%/65% improvement on Core 2 # and +58%/40% on Atom (as implied, over "hyper-threading-safe" # pshufb, yet it's respectable +28%/64% improvement on Core 2 # and +15% on Atom (as implied, over "hyper-threading-safe" # code path). # # <appro@openssl.org> Loading Loading @@ -183,35 +183,35 @@ $k_dsbo=0x2c0; # decryption sbox final output &movdqa ("xmm1","xmm6") &movdqa ("xmm2",&QWP($k_ipt,$const)); &pandn ("xmm1","xmm0"); &movdqu ("xmm5",&QWP(0,$key)); &psrld ("xmm1",4); &pand ("xmm0","xmm6"); &movdqu ("xmm5",&QWP(0,$key)); &pshufb ("xmm2","xmm0"); &movdqa ("xmm0",&QWP($k_ipt+16,$const)); &pshufb ("xmm0","xmm1"); &pxor ("xmm2","xmm5"); &pxor ("xmm0","xmm2"); &psrld ("xmm1",4); &add ($key,16); &pshufb ("xmm0","xmm1"); &lea ($base,&DWP($k_mc_backward,$const)); &pxor ("xmm0","xmm2"); &jmp (&label("enc_entry")); &set_label("enc_loop",16); # middle of middle round &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u &pshufb ("xmm4","xmm2"); # 4 = sb1u &pxor ("xmm4","xmm5"); # 4 = sb1u + k &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t &pshufb ("xmm4","xmm2"); # 4 = sb1u &pshufb ("xmm0","xmm3"); # 0 = sb1t &pxor ("xmm0","xmm4"); # 0 = A &pxor ("xmm4","xmm5"); # 4 = sb1u + k &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u &pshufb ("xmm5","xmm2"); # 4 = sb2u &pxor ("xmm0","xmm4"); # 0 = A &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[] &pshufb ("xmm5","xmm2"); # 4 = sb2u &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t &pshufb ("xmm2","xmm3"); # 2 = sb2t &pxor ("xmm2","xmm5"); # 2 = 2A &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[] &pshufb ("xmm2","xmm3"); # 2 = sb2t &movdqa ("xmm3","xmm0"); # 3 = A &pxor ("xmm2","xmm5"); # 2 = 2A &pshufb ("xmm0","xmm1"); # 0 = B &add ($key,16); # next key &pxor ("xmm0","xmm2"); # 0 = 2A+B Loading @@ -220,30 +220,30 @@ $k_dsbo=0x2c0; # decryption sbox final output &pxor ("xmm3","xmm0"); # 3 = 2A+B+D &pshufb ("xmm0","xmm1"); # 0 = 2B+C &and ($magic,0x30); # ... mod 4 &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D &sub ($round,1); # nr-- &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D &set_label("enc_entry"); # top of round &movdqa ("xmm1","xmm6"); # 1 : i &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k &pandn ("xmm1","xmm0"); # 1 = i<<4 &psrld ("xmm1",4); # 1 = i &pand ("xmm0","xmm6"); # 0 = k &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k &pshufb ("xmm5","xmm0"); # 2 = a/k &pxor ("xmm0","xmm1"); # 0 = j &movdqa ("xmm3","xmm7"); # 3 : 1/i &pxor ("xmm0","xmm1"); # 0 = j &pshufb ("xmm3","xmm1"); # 3 = 1/i &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k &movdqa ("xmm4","xmm7"); # 4 : 1/j &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k &pshufb ("xmm4","xmm0"); # 4 = 1/j &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k &movdqa ("xmm2","xmm7"); # 2 : 1/iak &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k &pshufb ("xmm2","xmm3"); # 2 = 1/iak &pxor ("xmm2","xmm0"); # 2 = io &movdqa ("xmm3","xmm7"); # 3 : 1/jak &movdqu ("xmm5",&QWP(0,$key)); &pxor ("xmm2","xmm0"); # 2 = io &pshufb ("xmm3","xmm4"); # 3 = 1/jak &movdqu ("xmm5",&QWP(0,$key)); &pxor ("xmm3","xmm1"); # 3 = jo &jnz (&label("enc_loop")); Loading @@ -265,8 +265,8 @@ $k_dsbo=0x2c0; # decryption sbox final output ## Same API as encryption core. ## &function_begin_B("_vpaes_decrypt_core"); &mov ($round,&DWP(240,$key)); &lea ($base,&DWP($k_dsbd,$const)); &mov ($round,&DWP(240,$key)); &movdqa ("xmm1","xmm6"); &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base)); &pandn ("xmm1","xmm0"); Loading @@ -292,62 +292,61 @@ $k_dsbo=0x2c0; # decryption sbox final output ## Inverse mix columns ## &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u &movdqa ("xmm1",&QWP(-0x10,$base)); # 0 : sb9t &pshufb ("xmm4","xmm2"); # 4 = sb9u &pshufb ("xmm1","xmm3"); # 0 = sb9t &pxor ("xmm4","xmm0"); &movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t &pshufb ("xmm0","xmm3"); # 0 = sb9t &pxor ("xmm0","xmm4"); # 0 = ch &add ($key,16); # next round key &pxor ("xmm1","xmm4"); # 0 = ch &pshufb ("xmm0","xmm5"); # MC ch &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu &pshufb ("xmm1","xmm5"); # MC ch &pshufb ("xmm4","xmm2"); # 4 = sbdu &pxor ("xmm4","xmm0"); # 4 = ch &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt &pxor ("xmm4","xmm1"); # 4 = ch &pshufb ("xmm0","xmm3"); # 0 = sbdt &pxor ("xmm0","xmm4"); # 0 = ch &sub ($round,1); # nr-- &pxor ("xmm0","xmm4"); # 0 = ch &pshufb ("xmm0","xmm5"); # MC ch &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu &pshufb ("xmm0","xmm5"); # MC ch &movdqa ("xmm1",&QWP(0x30,$base)); # 0 : sbbt &pshufb ("xmm4","xmm2"); # 4 = sbbu &pshufb ("xmm1","xmm3"); # 0 = sbbt &pxor ("xmm4","xmm0"); # 4 = ch &movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt &pshufb ("xmm0","xmm3"); # 0 = sbbt &pxor ("xmm0","xmm4"); # 0 = ch &pxor ("xmm1","xmm4"); # 0 = ch &pshufb ("xmm0","xmm5"); # MC ch &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu &pshufb ("xmm4","xmm2"); # 4 = sbeu &pxor ("xmm4","xmm0"); # 4 = ch &pshufb ("xmm1","xmm5"); # MC ch &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet &pshufb ("xmm4","xmm2"); # 4 = sbeu &pshufb ("xmm0","xmm3"); # 0 = sbet &pxor ("xmm0","xmm4"); # 0 = ch &palignr("xmm5","xmm5",12); &pxor ("xmm4","xmm1"); # 4 = ch &pxor ("xmm0","xmm4"); # 0 = ch &set_label("dec_entry"); # top of round &movdqa ("xmm1","xmm6"); # 1 : i &pandn ("xmm1","xmm0"); # 1 = i<<4 &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k &psrld ("xmm1",4); # 1 = i &pand ("xmm0","xmm6"); # 0 = k &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k &pshufb ("xmm2","xmm0"); # 2 = a/k &pxor ("xmm0","xmm1"); # 0 = j &movdqa ("xmm3","xmm7"); # 3 : 1/i &pxor ("xmm0","xmm1"); # 0 = j &pshufb ("xmm3","xmm1"); # 3 = 1/i &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k &movdqa ("xmm4","xmm7"); # 4 : 1/j &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k &pshufb ("xmm4","xmm0"); # 4 = 1/j &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k &movdqa ("xmm2","xmm7"); # 2 : 1/iak &pshufb ("xmm2","xmm3"); # 2 = 1/iak &pxor ("xmm2","xmm0"); # 2 = io &movdqa ("xmm3","xmm7"); # 3 : 1/jak &pxor ("xmm2","xmm0"); # 2 = io &pshufb ("xmm3","xmm4"); # 3 = 1/jak &pxor ("xmm3","xmm1"); # 3 = jo &movdqu ("xmm0",&QWP(0,$key)); &pxor ("xmm3","xmm1"); # 3 = jo &jnz (&label("dec_loop")); # middle of last round Loading Loading @@ -542,12 +541,12 @@ $k_dsbo=0x2c0; # decryption sbox final output ## %xmm0: b+c+d b+c b a ## &function_begin_B("_vpaes_schedule_192_smear"); &pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0 &pxor ("xmm6","xmm0"); # -> c+d c 0 0 &pshufd ("xmm1","xmm6",0x80); # d c 0 0 -> c 0 0 0 &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a &pxor ("xmm6","xmm1"); # -> c+d c 0 0 &pxor ("xmm1","xmm1"); &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a &movdqa ("xmm0","xmm6"); &pxor ("xmm1","xmm1"); &movhlps("xmm6","xmm1"); # clobber low side with zeros &ret (); &function_end_B("_vpaes_schedule_192_smear"); Loading