Loading crypto/aes/asm/aes-586.pl +108 −32 Original line number Diff line number Diff line Loading @@ -6,7 +6,7 @@ # forms are granted according to the OpenSSL license. # ==================================================================== # # Version 3.3. # Version 3.4. # # You might fail to appreciate this module performance from the first # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered Loading Loading @@ -60,6 +60,12 @@ # misaligned, which unfortunately has negative impact on elder IA-32 # implementations, Pentium suffered 30% penalty, PIII - 10%. # # Version 3.3 avoids L1 cache aliasing between stack frame and # S-boxes, and 3.4 - L1 cache aliasing even between key schedule. The # latter is achieved by copying the key schedule to controlled place in # stack. This unfortunately has rather strong impact on small block CBC # performance, ~2x deterioration on 16-byte block if compared to 3.3. # # Current ECB performance numbers for 128-bit key in CPU cycles per # processed byte [measure commonly used by AES benchmarkers] are: # Loading @@ -81,6 +87,12 @@ $s3="edx"; $key="edi"; $acc="esi"; $compromise=0; # $compromise=128 abstains from copying key # schedule to stack when encrypting inputs # shorter than 128 bytes at the cost of # risking aliasing with S-boxes. In return # you get way better, up to +70%, small block # performance. $small_footprint=1; # $small_footprint=1 code is ~5% slower [on # recent µ-archs], but ~5 times smaller! 
# I favor compact code to minimize cache Loading Loading @@ -792,6 +804,7 @@ my $_key=&DWP(32,"esp"); #copy of wparam(3) my $_ivp=&DWP(36,"esp"); #copy of wparam(4) my $_tmp=&DWP(40,"esp"); #volatile variable my $ivec=&DWP(44,"esp"); #ivec[16] my $aes_key=&DWP(60,"esp"); #copy of aes_key &public_label("AES_Te"); &public_label("AES_Td"); Loading @@ -804,28 +817,37 @@ my $ivec=&DWP(44,"esp"); #ivec[16] &set_label("pic_point"); &blindpop("ebp"); &pushf (); &cld (); &cmp (&wparam(5),0); &je (&label("DECRYPT")); &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); # allocate aligned stack frame... &lea ($key,&DWP(-44,"esp")); &lea ($key,&DWP(-64-260,"esp")); &and ($key,-64); # ... and make sure it doesn't alias with AES_Te modulo 4096 &mov ($s1,"ebp"); &mov ($s0,"ebp"); &lea ($s1,&DWP(2048,"ebp")); &mov ($s3,$key); &and ($s1,0xfff); # t = %ebp&0xfff &and ($s0,0xfff); # s = %ebp&0xfff &and ($s1,0xfff); # e = (%ebp+2048)&0xfff &and ($s3,0xfff); # p = %esp&0xfff &cmp ($s3,$s1); # if (p<t) goto ok &jb (&label("te_ok")); &lea ($acc,&DWP(2048,$s1)); &cmp ($s3,$acc); # if (p>=(t+2048)) goto ok &jae (&label("te_ok")); &sub ($s1,$s3); # t -= p &lea ($key,&DWP(-64,$key,$s1));# %esp -= (p-t)+64 &cmp ($s3,$s1); # if (p>=e) %esp -= (p-e); &jb (&label("te_break_out")); &sub ($s3,$s1); &sub ($key,$s3); &jmp (&label("te_ok")); &set_label("te_break_out"); # else %esp -= (p-s)&0xfff + framesz; &sub ($s3,$s0); &and ($s3,0xfff); &add ($s3,64+320); &sub ($key,$s3); &align (4); &set_label("te_ok"); &mov ($s0,&wparam(0)); # load inp Loading @@ -843,6 +865,19 @@ my $ivec=&DWP(44,"esp"); #ivec[16] &mov ($_key,$s3); # save copy of key &mov ($_ivp,$acc); # save copy of ivp if ($compromise) { &cmp ($s2,$compromise); &jb (&label("skip_ecopy")); } # copy key schedule to stack &mov ("ecx",260/4); &mov ("esi",$s3); &lea ("edi",$aes_key); &mov ($_key,"edi"); &align (4); &data_word(0xF689A5F3); # rep movsd &set_label("skip_ecopy") if ($compromise); &mov ($acc,$s0); &mov ($key,16); &align 
(4); Loading Loading @@ -906,28 +941,42 @@ my $ivec=&DWP(44,"esp"); #ivec[16] &mov (&DWP(4,$acc),$s1); &mov (&DWP(8,$acc),$s2); &mov (&DWP(12,$acc),$s3); &mov ("edi",$_key); &mov ("esp",$_esp); if ($compromise) { &cmp (&wparam(2),$compromise); &jb (&label("skip_ezero")); } # zero copy of key schedule &mov ("ecx",256/4); &xor ("eax","eax"); &align (4); &data_word(0xF689ABF3); # rep stosd &set_label("skip_ezero") if ($compromise); &popf (); &set_label("enc_out"); &function_end_A(); &pushf (); # kludge, never executed &align (4); &set_label("enc_tail"); &push ($key eq "edi" ? $key : ""); # push ivp &pushf (); &mov ($key,$_out); # load out &mov ($s1,16); &sub ($s1,$s2); &cmp ($key,$acc); # compare with inp &je (&label("enc_in_place")); &data_word(0x90A4F3FC); # cld; rep movsb; nop # copy input &align (4); &data_word(0xF689A4F3); # rep movsb # copy input &jmp (&label("enc_skip_in_place")); &set_label("enc_in_place"); &lea ($key,&DWP(0,$key,$s2)); &set_label("enc_skip_in_place"); &mov ($s2,$s1); &xor ($s0,$s0); &data_word(0x90AAF3FC); # cld; rep stosb; nop # zero tail &popf (); &align (4); &data_word(0xF689AAF3); # rep stosb # zero tail &pop ($key); # pop ivp &mov ($acc,$_out); # output as input Loading @@ -942,22 +991,28 @@ my $ivec=&DWP(44,"esp"); #ivec[16] &lea ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp")); # allocate aligned stack frame... &lea ($key,&DWP(-64,"esp")); &lea ($key,&DWP(-64-260,"esp")); &and ($key,-64); # ... 
and make sure it doesn't alias with AES_Td modulo 4096 &mov ($s1,"ebp"); &mov ($s0,"ebp"); &lea ($s1,&DWP(3072,"ebp")); &mov ($s3,$key); &and ($s1,0xfff); # t = %ebp&0xfff &and ($s0,0xfff); # s = %ebp&0xfff &and ($s1,0xfff); # e = (%ebp+3072)&0xfff &and ($s3,0xfff); # p = %esp&0xfff &cmp ($s3,$s1); # if (p<t) goto ok &jb (&label("td_ok")); &lea ($acc,&DWP(3072,$s1)); &cmp ($s3,$acc); # if (p>=(t+3072)) goto ok &jae (&label("td_ok")); &sub ($s1,$s3); # t -= p &lea ($key,&DWP(-64,$key,$s1));# %esp -= (p-t)+64 &cmp ($s3,$s1); # if (p>=e) %esp -= (p-e); &jb (&label("td_break_out")); &sub ($s3,$s1); &sub ($key,$s3); &jmp (&label("td_ok")); &set_label("td_break_out"); # else %esp -= (p-s)&0xfff + framesz; &sub ($s3,$s0); &and ($s3,0xfff); &add ($s3,64+320); &sub ($key,$s3); &align (4); &set_label("td_ok"); &mov ($s0,&wparam(0)); # load inp Loading @@ -975,6 +1030,19 @@ my $ivec=&DWP(44,"esp"); #ivec[16] &mov ($_key,$s3); # save copy of key &mov ($_ivp,$acc); # save copy of ivp if ($compromise) { &cmp ($s2,$compromise); &jb (&label("skip_dcopy")); } # copy key schedule to stack &mov ("ecx",260/4); &mov ("esi",$s3); &lea ("edi",$aes_key); &mov ($_key,"edi"); &align (4); &data_word(0xF689A5F3); # rep movsd &set_label("skip_dcopy") if ($compromise); &mov ($acc,$s0); &mov ($key,24); &align (4); Loading Loading @@ -1053,9 +1121,7 @@ my $ivec=&DWP(44,"esp"); #ivec[16] &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$acc)); &mov ($acc eq "esi" ? $acc : "",$key); &mov ($key eq "edi" ? $key : "",$_out); # load out &pushf (); &data_word(0x90A4F3FC); # cld; rep movsb; nop # copy output &popf (); &data_word(0xF689A4F3); # rep movsb # copy output &mov ($key,$_inp); # use inp as temp ivp &jmp (&label("dec_end")); Loading Loading @@ -1122,13 +1188,23 @@ my $ivec=&DWP(44,"esp"); #ivec[16] &lea ($key,&DWP(0,$key,$s2)); &lea ($acc,&DWP(16,$acc,$s2)); &neg ($s2 eq "ecx" ? 
$s2 : ""); &pushf (); &data_word(0x90A4F3FC); # cld; rep movsb; nop # restore tail &popf (); &data_word(0xF689A4F3); # rep movsb # restore tail &align (4); &set_label("dec_out"); &mov ("edi",$_key); &mov ("esp",$_esp); if ($compromise) { &cmp (&wparam(2),$compromise); &jb (&label("skip_dzero")); } # zero copy of key schedule &mov ("ecx",256/4); &xor ("eax","eax"); &align (4); &data_word(0xF689ABF3); # rep stosd &set_label("skip_dzero") if ($compromise); &popf (); &function_end("AES_cbc_encrypt"); } Loading crypto/perlasm/x86ms.pl +1 −0 Original line number Diff line number Diff line Loading @@ -176,6 +176,7 @@ sub main'rdtsc { &out0("DW\t0310Fh"); } sub main'halt { &out0("hlt"); } sub main'movz { &out2("movzx",@_); } sub main'neg { &out1("neg",@_); } sub main'cld { &out0("cld"); } # SSE2 sub main'emms { &out0("emms"); } Loading crypto/perlasm/x86nasm.pl +1 −0 Original line number Diff line number Diff line Loading @@ -194,6 +194,7 @@ sub main'rdtsc { &out0("rdtsc"); } sub main'halt { &out0("hlt"); } sub main'movz { &out2("movzx",@_); } sub main'neg { &out1("neg",@_); } sub main'cld { &out0("cld"); } # SSE2 sub main'emms { &out0("emms"); } Loading crypto/perlasm/x86unix.pl +1 −0 Original line number Diff line number Diff line Loading @@ -210,6 +210,7 @@ sub main'rdtsc { &out0(".byte\t0x0f,0x31"); } sub main'halt { &out0("hlt"); } sub main'movz { &out2("movzbl",@_); } sub main'neg { &out1("negl",@_); } sub main'cld { &out0("cld"); } # SSE2 sub main'emms { &out0("emms"); } Loading Loading
crypto/aes/asm/aes-586.pl +108 −32 Original line number Diff line number Diff line Loading @@ -6,7 +6,7 @@ # forms are granted according to the OpenSSL license. # ==================================================================== # # Version 3.3. # Version 3.4. # # You might fail to appreciate this module performance from the first # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered Loading Loading @@ -60,6 +60,12 @@ # misaligned, which unfortunately has negative impact on elder IA-32 # implementations, Pentium suffered 30% penalty, PIII - 10%. # # Version 3.3 avoids L1 cache aliasing between stack frame and # S-boxes, and 3.4 - L1 cache aliasing even between key schedule. The # latter is achieved by copying the key schedule to controlled place in # stack. This unfortunately has rather strong impact on small block CBC # performance, ~2x deterioration on 16-byte block if compared to 3.3. # # Current ECB performance numbers for 128-bit key in CPU cycles per # processed byte [measure commonly used by AES benchmarkers] are: # Loading @@ -81,6 +87,12 @@ $s3="edx"; $key="edi"; $acc="esi"; $compromise=0; # $compromise=128 abstains from copying key # schedule to stack when encrypting inputs # shorter than 128 bytes at the cost of # risking aliasing with S-boxes. In return # you get way better, up to +70%, small block # performance. $small_footprint=1; # $small_footprint=1 code is ~5% slower [on # recent µ-archs], but ~5 times smaller! 
# I favor compact code to minimize cache Loading Loading @@ -792,6 +804,7 @@ my $_key=&DWP(32,"esp"); #copy of wparam(3) my $_ivp=&DWP(36,"esp"); #copy of wparam(4) my $_tmp=&DWP(40,"esp"); #volatile variable my $ivec=&DWP(44,"esp"); #ivec[16] my $aes_key=&DWP(60,"esp"); #copy of aes_key &public_label("AES_Te"); &public_label("AES_Td"); Loading @@ -804,28 +817,37 @@ my $ivec=&DWP(44,"esp"); #ivec[16] &set_label("pic_point"); &blindpop("ebp"); &pushf (); &cld (); &cmp (&wparam(5),0); &je (&label("DECRYPT")); &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); # allocate aligned stack frame... &lea ($key,&DWP(-44,"esp")); &lea ($key,&DWP(-64-260,"esp")); &and ($key,-64); # ... and make sure it doesn't alias with AES_Te modulo 4096 &mov ($s1,"ebp"); &mov ($s0,"ebp"); &lea ($s1,&DWP(2048,"ebp")); &mov ($s3,$key); &and ($s1,0xfff); # t = %ebp&0xfff &and ($s0,0xfff); # s = %ebp&0xfff &and ($s1,0xfff); # e = (%ebp+2048)&0xfff &and ($s3,0xfff); # p = %esp&0xfff &cmp ($s3,$s1); # if (p<t) goto ok &jb (&label("te_ok")); &lea ($acc,&DWP(2048,$s1)); &cmp ($s3,$acc); # if (p>=(t+2048)) goto ok &jae (&label("te_ok")); &sub ($s1,$s3); # t -= p &lea ($key,&DWP(-64,$key,$s1));# %esp -= (p-t)+64 &cmp ($s3,$s1); # if (p>=e) %esp -= (p-e); &jb (&label("te_break_out")); &sub ($s3,$s1); &sub ($key,$s3); &jmp (&label("te_ok")); &set_label("te_break_out"); # else %esp -= (p-s)&0xfff + framesz; &sub ($s3,$s0); &and ($s3,0xfff); &add ($s3,64+320); &sub ($key,$s3); &align (4); &set_label("te_ok"); &mov ($s0,&wparam(0)); # load inp Loading @@ -843,6 +865,19 @@ my $ivec=&DWP(44,"esp"); #ivec[16] &mov ($_key,$s3); # save copy of key &mov ($_ivp,$acc); # save copy of ivp if ($compromise) { &cmp ($s2,$compromise); &jb (&label("skip_ecopy")); } # copy key schedule to stack &mov ("ecx",260/4); &mov ("esi",$s3); &lea ("edi",$aes_key); &mov ($_key,"edi"); &align (4); &data_word(0xF689A5F3); # rep movsd &set_label("skip_ecopy") if ($compromise); &mov ($acc,$s0); &mov ($key,16); &align 
(4); Loading Loading @@ -906,28 +941,42 @@ my $ivec=&DWP(44,"esp"); #ivec[16] &mov (&DWP(4,$acc),$s1); &mov (&DWP(8,$acc),$s2); &mov (&DWP(12,$acc),$s3); &mov ("edi",$_key); &mov ("esp",$_esp); if ($compromise) { &cmp (&wparam(2),$compromise); &jb (&label("skip_ezero")); } # zero copy of key schedule &mov ("ecx",256/4); &xor ("eax","eax"); &align (4); &data_word(0xF689ABF3); # rep stosd &set_label("skip_ezero") if ($compromise); &popf (); &set_label("enc_out"); &function_end_A(); &pushf (); # kludge, never executed &align (4); &set_label("enc_tail"); &push ($key eq "edi" ? $key : ""); # push ivp &pushf (); &mov ($key,$_out); # load out &mov ($s1,16); &sub ($s1,$s2); &cmp ($key,$acc); # compare with inp &je (&label("enc_in_place")); &data_word(0x90A4F3FC); # cld; rep movsb; nop # copy input &align (4); &data_word(0xF689A4F3); # rep movsb # copy input &jmp (&label("enc_skip_in_place")); &set_label("enc_in_place"); &lea ($key,&DWP(0,$key,$s2)); &set_label("enc_skip_in_place"); &mov ($s2,$s1); &xor ($s0,$s0); &data_word(0x90AAF3FC); # cld; rep stosb; nop # zero tail &popf (); &align (4); &data_word(0xF689AAF3); # rep stosb # zero tail &pop ($key); # pop ivp &mov ($acc,$_out); # output as input Loading @@ -942,22 +991,28 @@ my $ivec=&DWP(44,"esp"); #ivec[16] &lea ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp")); # allocate aligned stack frame... &lea ($key,&DWP(-64,"esp")); &lea ($key,&DWP(-64-260,"esp")); &and ($key,-64); # ... 
and make sure it doesn't alias with AES_Td modulo 4096 &mov ($s1,"ebp"); &mov ($s0,"ebp"); &lea ($s1,&DWP(3072,"ebp")); &mov ($s3,$key); &and ($s1,0xfff); # t = %ebp&0xfff &and ($s0,0xfff); # s = %ebp&0xfff &and ($s1,0xfff); # e = (%ebp+3072)&0xfff &and ($s3,0xfff); # p = %esp&0xfff &cmp ($s3,$s1); # if (p<t) goto ok &jb (&label("td_ok")); &lea ($acc,&DWP(3072,$s1)); &cmp ($s3,$acc); # if (p>=(t+3072)) goto ok &jae (&label("td_ok")); &sub ($s1,$s3); # t -= p &lea ($key,&DWP(-64,$key,$s1));# %esp -= (p-t)+64 &cmp ($s3,$s1); # if (p>=e) %esp -= (p-e); &jb (&label("td_break_out")); &sub ($s3,$s1); &sub ($key,$s3); &jmp (&label("td_ok")); &set_label("td_break_out"); # else %esp -= (p-s)&0xfff + framesz; &sub ($s3,$s0); &and ($s3,0xfff); &add ($s3,64+320); &sub ($key,$s3); &align (4); &set_label("td_ok"); &mov ($s0,&wparam(0)); # load inp Loading @@ -975,6 +1030,19 @@ my $ivec=&DWP(44,"esp"); #ivec[16] &mov ($_key,$s3); # save copy of key &mov ($_ivp,$acc); # save copy of ivp if ($compromise) { &cmp ($s2,$compromise); &jb (&label("skip_dcopy")); } # copy key schedule to stack &mov ("ecx",260/4); &mov ("esi",$s3); &lea ("edi",$aes_key); &mov ($_key,"edi"); &align (4); &data_word(0xF689A5F3); # rep movsd &set_label("skip_dcopy") if ($compromise); &mov ($acc,$s0); &mov ($key,24); &align (4); Loading Loading @@ -1053,9 +1121,7 @@ my $ivec=&DWP(44,"esp"); #ivec[16] &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$acc)); &mov ($acc eq "esi" ? $acc : "",$key); &mov ($key eq "edi" ? $key : "",$_out); # load out &pushf (); &data_word(0x90A4F3FC); # cld; rep movsb; nop # copy output &popf (); &data_word(0xF689A4F3); # rep movsb # copy output &mov ($key,$_inp); # use inp as temp ivp &jmp (&label("dec_end")); Loading Loading @@ -1122,13 +1188,23 @@ my $ivec=&DWP(44,"esp"); #ivec[16] &lea ($key,&DWP(0,$key,$s2)); &lea ($acc,&DWP(16,$acc,$s2)); &neg ($s2 eq "ecx" ? 
$s2 : ""); &pushf (); &data_word(0x90A4F3FC); # cld; rep movsb; nop # restore tail &popf (); &data_word(0xF689A4F3); # rep movsb # restore tail &align (4); &set_label("dec_out"); &mov ("edi",$_key); &mov ("esp",$_esp); if ($compromise) { &cmp (&wparam(2),$compromise); &jb (&label("skip_dzero")); } # zero copy of key schedule &mov ("ecx",256/4); &xor ("eax","eax"); &align (4); &data_word(0xF689ABF3); # rep stosd &set_label("skip_dzero") if ($compromise); &popf (); &function_end("AES_cbc_encrypt"); } Loading
crypto/perlasm/x86ms.pl +1 −0 Original line number Diff line number Diff line Loading @@ -176,6 +176,7 @@ sub main'rdtsc { &out0("DW\t0310Fh"); } sub main'halt { &out0("hlt"); } sub main'movz { &out2("movzx",@_); } sub main'neg { &out1("neg",@_); } sub main'cld { &out0("cld"); } # SSE2 sub main'emms { &out0("emms"); } Loading
crypto/perlasm/x86nasm.pl +1 −0 Original line number Diff line number Diff line Loading @@ -194,6 +194,7 @@ sub main'rdtsc { &out0("rdtsc"); } sub main'halt { &out0("hlt"); } sub main'movz { &out2("movzx",@_); } sub main'neg { &out1("neg",@_); } sub main'cld { &out0("cld"); } # SSE2 sub main'emms { &out0("emms"); } Loading
crypto/perlasm/x86unix.pl +1 −0 Original line number Diff line number Diff line Loading @@ -210,6 +210,7 @@ sub main'rdtsc { &out0(".byte\t0x0f,0x31"); } sub main'halt { &out0("hlt"); } sub main'movz { &out2("movzbl",@_); } sub main'neg { &out1("negl",@_); } sub main'cld { &out0("cld"); } # SSE2 sub main'emms { &out0("emms"); } Loading