Loading crypto/sha/asm/sha256-586.pl +12 −7 Original line number Diff line number Diff line Loading @@ -45,13 +45,17 @@ $Xoff=&DWP(32,"esp"); $K256="ebp"; sub BODY_00_15() { my $in_16_64=shift; &mov ("ecx",$E); &add ($T,&DWP(4*(8+15+16-9),"esp")) if ($in_16_64); # T += X[-7] &ror ("ecx",6); &mov ("edi",$E); &ror ("edi",11); &mov ("esi",$Foff); &xor ("ecx","edi"); &ror ("edi",25-11); &mov (&DWP(4*(8+15),"esp"),$T) if ($in_16_64); # save X[0] &xor ("ecx","edi"); # Sigma1(e) &mov ("edi",$Goff); &add ($T,"ecx"); # T += Sigma1(e) Loading Loading @@ -88,6 +92,7 @@ sub BODY_00_15() { &add ($K256,4); &add ($A,$T); # h += T &mov ($T,&DWP(4*(8+15+16-1),"esp")) if ($in_16_64); # preload T &add ($E,"esi"); # d += K256[i] &add ($A,"esi"); # h += K256[i] } Loading Loading @@ -159,10 +164,10 @@ sub BODY_00_15() { &cmp ("esi",0xc19bf174); &jne (&label("00_15")); &mov ($T,&DWP(4*(8+15+16-1),"esp")); # preloaded in BODY_00_15(1) &set_label("16_63",16); &mov ($T,&DWP(4*(8+15+16-1),"esp")); &mov ("ecx",&DWP(4*(8+15+16-14),"esp")); &mov ("esi",$T); &mov ("ecx",&DWP(4*(8+15+16-14),"esp")); &shr ($T,3); &ror ("esi",7); &xor ($T,"esi"); Loading @@ -176,13 +181,13 @@ sub BODY_00_15() { &xor ("ecx","edi"); &ror ("edi",19-17); &add ($T,"esi"); # T += X[-16] &xor ("ecx","edi") # sigma1(X[-2]) &xor ("edi","ecx") # sigma1(X[-2]) &add ($T,"ecx"); # T += sigma1(X[-2]) &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7] &mov (&DWP(4*(8+15),"esp"),$T); # save X[0] &add ($T,"edi"); # T += sigma1(X[-2]) # &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7], moved to BODY_00_15(1) # &mov (&DWP(4*(8+15),"esp"),$T); # save X[0] &BODY_00_15(); &BODY_00_15(1); &cmp ("esi",0xc67178f2); &jne (&label("16_63")); Loading crypto/sha/asm/sha512-586.pl +24 −21 Original line number Diff line number Diff line Loading @@ -68,6 +68,8 @@ $E="mm4"; # F-H are commonly loaded to respectively mm1-mm3 and # mm5-mm7, but it's done on on-demand basis... sub BODY_00_15_sse2 { my $prefetch=shift; &movq ("mm5",$Fsse2); # load f &movq ("mm6",$Gsse2); # load g &movq ("mm7",$Hsse2); # load h Loading Loading @@ -96,7 +98,7 @@ sub BODY_00_15_sse2 { &pxor ("mm5","mm6"); # f^=g &movq ($E,$Dsse2); # e = load d &paddq ("mm3","mm5"); # T1+=Ch(e,f,g) &movq (&QWP(0,"esp"),$A); # modulo-scheduled save a &paddq ("mm3","mm7"); # T1+=h &movq ("mm5",$A); # %mm5 is sliding right Loading @@ -114,15 +116,16 @@ sub BODY_00_15_sse2 { &pxor ("mm7","mm6"); &psllq ("mm6",6); &pxor ("mm7","mm5"); &movq (&QWP(0,"esp"),$A); # modulo-scheduled save a &sub ("esp",8); &pxor ("mm7","mm6"); # T2=Sigma0_512(a) &movq ("mm5",$A); # %mm5=a &por ($A,"mm2"); # a=a|c &movq ("mm6",&QWP(8*(9+16-14),"esp")) if ($prefetch); &pand ("mm5","mm2"); # %mm5=a&c &pand ($A,"mm1"); # a=(a|c)&b &movq ("mm2",&QWP(8*(9+16-1),"esp")) if ($prefetch); &por ("mm5",$A); # %mm5=(a&c)|((a|c)&b) &sub ("esp",8); &paddq ("mm7","mm5"); # T2+=Maj(a,b,c) &movq ($A,"mm3"); # a=T1 Loading Loading @@ -327,48 +330,48 @@ if ($sse2) { &cmp (&LB("edx"),0x35); &jne (&label("00_14_sse2")); &BODY_00_15_sse2(); &BODY_00_15_sse2(1); &set_label("16_79_sse2",16); &movq ("mm3",&QWP(8*(9+16-1),"esp")); &movq ("mm6",&QWP(8*(9+16-14),"esp")); &movq ("mm1","mm3"); #&movq ("mm2",&QWP(8*(9+16-1),"esp")); #prefetched in BODY_00_15 #&movq ("mm6",&QWP(8*(9+16-14),"esp")); &movq ("mm1","mm2"); &psrlq ("mm3",1); &psrlq ("mm2",1); &movq ("mm7","mm6"); &psrlq ("mm6",6); &movq ("mm2","mm3"); &movq ("mm3","mm2"); &psrlq ("mm3",7-1); &psrlq ("mm2",7-1); &movq ("mm5","mm6"); &psrlq ("mm6",19-6); &pxor ("mm2","mm3"); &pxor ("mm3","mm2"); &psrlq ("mm3",8-7); &psrlq ("mm2",8-7); &pxor ("mm5","mm6"); &psrlq ("mm6",61-19); &pxor ("mm2","mm3"); &pxor ("mm3","mm2"); &movq ("mm3",&QWP(8*(9+16),"esp")); &movq ("mm2",&QWP(8*(9+16),"esp")); &psllq ("mm1",56); &pxor ("mm5","mm6"); &psllq ("mm7",3); &pxor ("mm2","mm1"); &pxor ("mm3","mm1"); &paddq ("mm3",&QWP(8*(9+16-9),"esp")); &paddq ("mm2",&QWP(8*(9+16-9),"esp")); &psllq ("mm1",63-56); &pxor ("mm5","mm7"); &psllq ("mm7",45-3); &pxor ("mm2","mm1"); &pxor ("mm3","mm1"); &pxor ("mm5","mm7"); &paddq ("mm2","mm5"); &paddq ("mm2","mm3"); &movq (&QWP(8*9,"esp"),"mm2"); &paddq ("mm3","mm5"); &paddq ("mm3","mm2"); &movq (&QWP(8*9,"esp"),"mm3"); &BODY_00_15_sse2(); &BODY_00_15_sse2(1); &cmp (&LB("edx"),0x17); &jne (&label("16_79_sse2")); Loading Loading
crypto/sha/asm/sha256-586.pl +12 −7 Original line number Diff line number Diff line Loading @@ -45,13 +45,17 @@ $Xoff=&DWP(32,"esp"); $K256="ebp"; sub BODY_00_15() { my $in_16_64=shift; &mov ("ecx",$E); &add ($T,&DWP(4*(8+15+16-9),"esp")) if ($in_16_64); # T += X[-7] &ror ("ecx",6); &mov ("edi",$E); &ror ("edi",11); &mov ("esi",$Foff); &xor ("ecx","edi"); &ror ("edi",25-11); &mov (&DWP(4*(8+15),"esp"),$T) if ($in_16_64); # save X[0] &xor ("ecx","edi"); # Sigma1(e) &mov ("edi",$Goff); &add ($T,"ecx"); # T += Sigma1(e) Loading Loading @@ -88,6 +92,7 @@ sub BODY_00_15() { &add ($K256,4); &add ($A,$T); # h += T &mov ($T,&DWP(4*(8+15+16-1),"esp")) if ($in_16_64); # preload T &add ($E,"esi"); # d += K256[i] &add ($A,"esi"); # h += K256[i] } Loading Loading @@ -159,10 +164,10 @@ sub BODY_00_15() { &cmp ("esi",0xc19bf174); &jne (&label("00_15")); &mov ($T,&DWP(4*(8+15+16-1),"esp")); # preloaded in BODY_00_15(1) &set_label("16_63",16); &mov ($T,&DWP(4*(8+15+16-1),"esp")); &mov ("ecx",&DWP(4*(8+15+16-14),"esp")); &mov ("esi",$T); &mov ("ecx",&DWP(4*(8+15+16-14),"esp")); &shr ($T,3); &ror ("esi",7); &xor ($T,"esi"); Loading @@ -176,13 +181,13 @@ sub BODY_00_15() { &xor ("ecx","edi"); &ror ("edi",19-17); &add ($T,"esi"); # T += X[-16] &xor ("ecx","edi") # sigma1(X[-2]) &xor ("edi","ecx") # sigma1(X[-2]) &add ($T,"ecx"); # T += sigma1(X[-2]) &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7] &mov (&DWP(4*(8+15),"esp"),$T); # save X[0] &add ($T,"edi"); # T += sigma1(X[-2]) # &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7], moved to BODY_00_15(1) # &mov (&DWP(4*(8+15),"esp"),$T); # save X[0] &BODY_00_15(); &BODY_00_15(1); &cmp ("esi",0xc67178f2); &jne (&label("16_63")); Loading
crypto/sha/asm/sha512-586.pl +24 −21 Original line number Diff line number Diff line Loading @@ -68,6 +68,8 @@ $E="mm4"; # F-H are commonly loaded to respectively mm1-mm3 and # mm5-mm7, but it's done on on-demand basis... sub BODY_00_15_sse2 { my $prefetch=shift; &movq ("mm5",$Fsse2); # load f &movq ("mm6",$Gsse2); # load g &movq ("mm7",$Hsse2); # load h Loading Loading @@ -96,7 +98,7 @@ sub BODY_00_15_sse2 { &pxor ("mm5","mm6"); # f^=g &movq ($E,$Dsse2); # e = load d &paddq ("mm3","mm5"); # T1+=Ch(e,f,g) &movq (&QWP(0,"esp"),$A); # modulo-scheduled save a &paddq ("mm3","mm7"); # T1+=h &movq ("mm5",$A); # %mm5 is sliding right Loading @@ -114,15 +116,16 @@ sub BODY_00_15_sse2 { &pxor ("mm7","mm6"); &psllq ("mm6",6); &pxor ("mm7","mm5"); &movq (&QWP(0,"esp"),$A); # modulo-scheduled save a &sub ("esp",8); &pxor ("mm7","mm6"); # T2=Sigma0_512(a) &movq ("mm5",$A); # %mm5=a &por ($A,"mm2"); # a=a|c &movq ("mm6",&QWP(8*(9+16-14),"esp")) if ($prefetch); &pand ("mm5","mm2"); # %mm5=a&c &pand ($A,"mm1"); # a=(a|c)&b &movq ("mm2",&QWP(8*(9+16-1),"esp")) if ($prefetch); &por ("mm5",$A); # %mm5=(a&c)|((a|c)&b) &sub ("esp",8); &paddq ("mm7","mm5"); # T2+=Maj(a,b,c) &movq ($A,"mm3"); # a=T1 Loading Loading @@ -327,48 +330,48 @@ if ($sse2) { &cmp (&LB("edx"),0x35); &jne (&label("00_14_sse2")); &BODY_00_15_sse2(); &BODY_00_15_sse2(1); &set_label("16_79_sse2",16); &movq ("mm3",&QWP(8*(9+16-1),"esp")); &movq ("mm6",&QWP(8*(9+16-14),"esp")); &movq ("mm1","mm3"); #&movq ("mm2",&QWP(8*(9+16-1),"esp")); #prefetched in BODY_00_15 #&movq ("mm6",&QWP(8*(9+16-14),"esp")); &movq ("mm1","mm2"); &psrlq ("mm3",1); &psrlq ("mm2",1); &movq ("mm7","mm6"); &psrlq ("mm6",6); &movq ("mm2","mm3"); &movq ("mm3","mm2"); &psrlq ("mm3",7-1); &psrlq ("mm2",7-1); &movq ("mm5","mm6"); &psrlq ("mm6",19-6); &pxor ("mm2","mm3"); &pxor ("mm3","mm2"); &psrlq ("mm3",8-7); &psrlq ("mm2",8-7); &pxor ("mm5","mm6"); &psrlq ("mm6",61-19); &pxor ("mm2","mm3"); &pxor ("mm3","mm2"); &movq ("mm3",&QWP(8*(9+16),"esp")); &movq ("mm2",&QWP(8*(9+16),"esp")); &psllq ("mm1",56); &pxor ("mm5","mm6"); &psllq ("mm7",3); &pxor ("mm2","mm1"); &pxor ("mm3","mm1"); &paddq ("mm3",&QWP(8*(9+16-9),"esp")); &paddq ("mm2",&QWP(8*(9+16-9),"esp")); &psllq ("mm1",63-56); &pxor ("mm5","mm7"); &psllq ("mm7",45-3); &pxor ("mm2","mm1"); &pxor ("mm3","mm1"); &pxor ("mm5","mm7"); &paddq ("mm2","mm5"); &paddq ("mm2","mm3"); &movq (&QWP(8*9,"esp"),"mm2"); &paddq ("mm3","mm5"); &paddq ("mm3","mm2"); &movq (&QWP(8*9,"esp"),"mm3"); &BODY_00_15_sse2(); &BODY_00_15_sse2(1); &cmp (&LB("edx"),0x17); &jne (&label("16_79_sse2")); Loading