Loading engines/asm/e_padlock-x86.pl +32 −8 Original line number Diff line number Diff line Loading @@ -37,6 +37,7 @@ require "x86asm.pl"; &asm_init($ARGV[0],$0); %PADLOCK_MARGIN=(ecb=>128, cbc=>64); # prefetch errata $PADLOCK_CHUNK=512; # Must be a power of 2 larger than 16 $ctx="edx"; Loading Loading @@ -187,6 +188,10 @@ my ($mode,$opcode) = @_; &movq ("mm0",&QWP(-16,$ctx)); # load [upper part of] counter } else { &xor ("ebx","ebx"); if ($PADLOCK_MARGIN{$mode}) { &cmp ($len,$PADLOCK_MARGIN{$mode}); &jbe (&label("${mode}_short")); } &test (&DWP(0,$ctx),1<<5); # align bit in control word &jnz (&label("${mode}_aligned")); &test ($out,0x0f); Loading Loading @@ -285,20 +290,39 @@ my ($mode,$opcode) = @_; &mov ($chunk,$PADLOCK_CHUNK); &jnz (&label("${mode}_loop")); if ($mode ne "ctr32") { &test ($out,0x0f); # out_misaligned &jz (&label("${mode}_done")); &cmp ("esp","ebp"); &je (&label("${mode}_done")); } &mov ($len,"ebp"); &mov ($out,"esp"); &sub ($len,"esp"); &xor ("eax","eax"); &shr ($len,2); &data_byte(0xf3,0xab); # rep stosl &pxor ("xmm0","xmm0"); &lea ("eax",&DWP(0,"esp")); &set_label("${mode}_bzero"); &movaps (&QWP(0,"eax"),"xmm0"); &lea ("eax",&DWP(16,"eax")); &cmp ("ebp","eax"); &ja (&label("${mode}_bzero")); &set_label("${mode}_done"); &lea ("esp",&DWP(24,"ebp")); if ($mode ne "ctr32") { &jmp (&label("${mode}_exit")); &set_label("${mode}_short",16); &xor ("eax","eax"); &lea ("ebp",&DWP(-24,"esp")); &sub ("eax",$len); &lea ("esp",&DWP(0,"eax","ebp")); &and ("esp",-16); &xor ($chunk,$chunk); &set_label("${mode}_short_copy"); &movups ("xmm0",&QWP(0,$inp,$chunk)); &lea ($chunk,&DWP(16,$chunk)); &cmp ($len,$chunk); &movaps (&QWP(-16,"esp",$chunk),"xmm0"); &ja (&label("${mode}_short_copy")); &mov ($inp,"esp"); &mov ($chunk,$len); &jmp (&label("${mode}_loop")); &set_label("${mode}_aligned",16); &lea ("eax",&DWP(-16,$ctx)); # ivp &lea ("ebx",&DWP(16,$ctx)); # key Loading engines/asm/e_padlock-x86_64.pl +41 −9 Original line number Diff line number Diff line Loading @@ 
-27,6 +27,7 @@ open STDOUT,"| $^X $xlate $flavour $output"; $code=".text\n"; %PADLOCK_MARGIN=(ecb=>128, cbc=>64, ctr32=>64); # prefetch errata $PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20 $ctx="%rdx"; Loading Loading @@ -284,6 +285,17 @@ padlock_${mode}_encrypt: lea 16($ctx),$ctx # control word xor %eax,%eax xor %ebx,%ebx ___ # Formally speaking correct condition is $len<=$margin and $inp+$margin # crosses page boundary [and next page is unreadable]. But $inp can # be unaligned in which case data can be copied to $out if latter is # aligned, in which case $out+$margin has to be checked. Covering all # cases appears more complicated than just copying short input... $code.=<<___ if ($PADLOCK_MARGIN{$mode}); cmp \$$PADLOCK_MARGIN{$mode},$len jbe .L${mode}_short ___ $code.=<<___; testl \$`1<<5`,($ctx) # align bit in control word jnz .L${mode}_aligned test \$0x0f,$out Loading @@ -305,6 +317,7 @@ padlock_${mode}_encrypt: lea (%rax,%rbp),%rsp ___ $code.=<<___ if ($mode eq "ctr32"); .L${mode}_reenter: mov -4($ctx),%eax # pull 32-bit counter bswap %eax neg %eax Loading Loading @@ -373,19 +386,38 @@ $code.=<<___; mov \$$PADLOCK_CHUNK,$chunk jnz .L${mode}_loop test \$0x0f,$out jz .L${mode}_done cmp %rsp,%rbp je .L${mode}_done pxor %xmm0,%xmm0 lea (%rsp),%rax .L${mode}_bzero: movaps %xmm0,(%rax) lea 16(%rax),%rax cmp %rax,%rbp ja .L${mode}_bzero mov %rbp,$len mov %rsp,$out sub %rsp,$len xor %rax,%rax shr \$3,$len .byte 0xf3,0x48,0xab # rep stosq .L${mode}_done: lea (%rbp),%rsp jmp .L${mode}_exit ___ $code.=<<___ if ($PADLOCK_MARGIN{$mode}); .align 16 .L${mode}_short: mov %rsp,%rbp sub $len,%rsp xor $chunk,$chunk .L${mode}_short_copy: movups ($inp,$chunk),%xmm0 lea 16($chunk),$chunk cmp $chunk,$len movaps %xmm0,-16(%rsp,$chunk) ja .L${mode}_short_copy mov %rsp,$inp mov $len,$chunk jmp .L${mode}_`${mode} eq "ctr32"?"reenter":"loop"` ___ $code.=<<___; .align 16 .L${mode}_aligned: ___ Loading Loading
engines/asm/e_padlock-x86.pl +32 −8 Original line number Diff line number Diff line Loading @@ -37,6 +37,7 @@ require "x86asm.pl"; &asm_init($ARGV[0],$0); %PADLOCK_MARGIN=(ecb=>128, cbc=>64); # prefetch errata $PADLOCK_CHUNK=512; # Must be a power of 2 larger than 16 $ctx="edx"; Loading Loading @@ -187,6 +188,10 @@ my ($mode,$opcode) = @_; &movq ("mm0",&QWP(-16,$ctx)); # load [upper part of] counter } else { &xor ("ebx","ebx"); if ($PADLOCK_MARGIN{$mode}) { &cmp ($len,$PADLOCK_MARGIN{$mode}); &jbe (&label("${mode}_short")); } &test (&DWP(0,$ctx),1<<5); # align bit in control word &jnz (&label("${mode}_aligned")); &test ($out,0x0f); Loading Loading @@ -285,20 +290,39 @@ my ($mode,$opcode) = @_; &mov ($chunk,$PADLOCK_CHUNK); &jnz (&label("${mode}_loop")); if ($mode ne "ctr32") { &test ($out,0x0f); # out_misaligned &jz (&label("${mode}_done")); &cmp ("esp","ebp"); &je (&label("${mode}_done")); } &mov ($len,"ebp"); &mov ($out,"esp"); &sub ($len,"esp"); &xor ("eax","eax"); &shr ($len,2); &data_byte(0xf3,0xab); # rep stosl &pxor ("xmm0","xmm0"); &lea ("eax",&DWP(0,"esp")); &set_label("${mode}_bzero"); &movaps (&QWP(0,"eax"),"xmm0"); &lea ("eax",&DWP(16,"eax")); &cmp ("ebp","eax"); &ja (&label("${mode}_bzero")); &set_label("${mode}_done"); &lea ("esp",&DWP(24,"ebp")); if ($mode ne "ctr32") { &jmp (&label("${mode}_exit")); &set_label("${mode}_short",16); &xor ("eax","eax"); &lea ("ebp",&DWP(-24,"esp")); &sub ("eax",$len); &lea ("esp",&DWP(0,"eax","ebp")); &and ("esp",-16); &xor ($chunk,$chunk); &set_label("${mode}_short_copy"); &movups ("xmm0",&QWP(0,$inp,$chunk)); &lea ($chunk,&DWP(16,$chunk)); &cmp ($len,$chunk); &movaps (&QWP(-16,"esp",$chunk),"xmm0"); &ja (&label("${mode}_short_copy")); &mov ($inp,"esp"); &mov ($chunk,$len); &jmp (&label("${mode}_loop")); &set_label("${mode}_aligned",16); &lea ("eax",&DWP(-16,$ctx)); # ivp &lea ("ebx",&DWP(16,$ctx)); # key Loading
engines/asm/e_padlock-x86_64.pl +41 −9 Original line number Diff line number Diff line Loading @@ -27,6 +27,7 @@ open STDOUT,"| $^X $xlate $flavour $output"; $code=".text\n"; %PADLOCK_MARGIN=(ecb=>128, cbc=>64, ctr32=>64); # prefetch errata $PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20 $ctx="%rdx"; Loading Loading @@ -284,6 +285,17 @@ padlock_${mode}_encrypt: lea 16($ctx),$ctx # control word xor %eax,%eax xor %ebx,%ebx ___ # Formally speaking correct condition is $len<=$margin and $inp+$margin # crosses page boundary [and next page is unreadable]. But $inp can # be unaligned in which case data can be copied to $out if latter is # aligned, in which case $out+$margin has to be checked. Covering all # cases appears more complicated than just copying short input... $code.=<<___ if ($PADLOCK_MARGIN{$mode}); cmp \$$PADLOCK_MARGIN{$mode},$len jbe .L${mode}_short ___ $code.=<<___; testl \$`1<<5`,($ctx) # align bit in control word jnz .L${mode}_aligned test \$0x0f,$out Loading @@ -305,6 +317,7 @@ padlock_${mode}_encrypt: lea (%rax,%rbp),%rsp ___ $code.=<<___ if ($mode eq "ctr32"); .L${mode}_reenter: mov -4($ctx),%eax # pull 32-bit counter bswap %eax neg %eax Loading Loading @@ -373,19 +386,38 @@ $code.=<<___; mov \$$PADLOCK_CHUNK,$chunk jnz .L${mode}_loop test \$0x0f,$out jz .L${mode}_done cmp %rsp,%rbp je .L${mode}_done pxor %xmm0,%xmm0 lea (%rsp),%rax .L${mode}_bzero: movaps %xmm0,(%rax) lea 16(%rax),%rax cmp %rax,%rbp ja .L${mode}_bzero mov %rbp,$len mov %rsp,$out sub %rsp,$len xor %rax,%rax shr \$3,$len .byte 0xf3,0x48,0xab # rep stosq .L${mode}_done: lea (%rbp),%rsp jmp .L${mode}_exit ___ $code.=<<___ if ($PADLOCK_MARGIN{$mode}); .align 16 .L${mode}_short: mov %rsp,%rbp sub $len,%rsp xor $chunk,$chunk .L${mode}_short_copy: movups ($inp,$chunk),%xmm0 lea 16($chunk),$chunk cmp $chunk,$len movaps %xmm0,-16(%rsp,$chunk) ja .L${mode}_short_copy mov %rsp,$inp mov $len,$chunk jmp .L${mode}_`${mode} eq "ctr32"?"reenter":"loop"` ___ $code.=<<___; .align 16
.L${mode}_aligned: ___ Loading