Loading engines/asm/e_padlock-x86.pl +81 −23 Original line number Diff line number Diff line Loading @@ -37,7 +37,7 @@ require "x86asm.pl"; &asm_init($ARGV[0],$0); %PADLOCK_MARGIN=(ecb=>128, cbc=>64); # prefetch errata %PADLOCK_PREFETCH=(ecb=>128, cbc=>64); # prefetch errata $PADLOCK_CHUNK=512; # Must be a power of 2 larger than 16 $ctx="edx"; Loading Loading @@ -188,10 +188,6 @@ my ($mode,$opcode) = @_; &movq ("mm0",&QWP(-16,$ctx)); # load [upper part of] counter } else { &xor ("ebx","ebx"); if ($PADLOCK_MARGIN{$mode}) { &cmp ($len,$PADLOCK_MARGIN{$mode}); &jbe (&label("${mode}_short")); } &test (&DWP(0,$ctx),1<<5); # align bit in control word &jnz (&label("${mode}_aligned")); &test ($out,0x0f); Loading @@ -212,7 +208,27 @@ my ($mode,$opcode) = @_; &neg ("eax"); &and ($chunk,$PADLOCK_CHUNK-1); # chunk=len%PADLOCK_CHUNK &lea ("esp",&DWP(0,"eax","ebp")); # alloca &mov ("eax",$PADLOCK_CHUNK); &cmovz ($chunk,"eax"); # chunk=chunk?:PADLOCK_CHUNK &mov ("eax","ebp"); &and ("ebp",-16); &and ("esp",-16); &mov (&DWP(16,"ebp"),"eax"); if ($PADLOCK_PREFETCH{$mode}) { &cmp ($len,$chunk); &ja (&label("${mode}_loop")); &mov ("eax",$inp); # check if prefetch crosses page &cmp ("ebp","esp"); &cmove ("eax",$out); &add ("eax",$len); &neg ("eax"); &and ("eax",0xfff); # distance to page boundary &cmp ("eax",$PADLOCK_PREFETCH{$mode}); &mov ("eax",-$PADLOCK_PREFETCH{$mode}); &cmovae ("eax",$chunk); # mask=distance<prefetch?-prefetch:-1 &and ($chunk,"eax"); &jz (&label("${mode}_unaligned_tail")); } &jmp (&label("${mode}_loop")); &set_label("${mode}_loop",16); Loading Loading @@ -276,8 +292,8 @@ my ($mode,$opcode) = @_; &test ($out,0x0f); &jz (&label("${mode}_out_aligned")); &mov ($len,$chunk); &shr ($len,2); &lea ($inp,&DWP(0,"esp")); &shr ($len,2); &data_byte(0xf3,0xa5); # rep movsl &sub ($out,$chunk); &set_label("${mode}_out_aligned"); Loading @@ -288,7 +304,30 @@ my ($mode,$opcode) = @_; &add ($inp,$chunk); &sub ($len,$chunk); &mov ($chunk,$PADLOCK_CHUNK); if (!$PADLOCK_PREFETCH{$mode}) { &jnz (&label("${mode}_loop")); } else { &jz (&label("${mode}_break")); &cmp ($len,$chunk); &jae (&label("${mode}_loop")); &set_label("${mode}_unaligned_tail"); &xor ("eax","eax"); &cmp ("esp","ebp"); &cmove ("eax",$len); &sub ("esp","eax"); # alloca &mov ("eax", $out); # save parameters &mov ($chunk,$len); &shr ($len,2); &lea ($out,&DWP(0,"esp")); &data_byte(0xf3,0xa5); # rep movsl &mov ($inp,"esp"); &mov ($out,"eax"); # restore parameters &mov ($len,$chunk); &jmp (&label("${mode}_loop")); &set_label("${mode}_break",16); } if ($mode ne "ctr32") { &cmp ("esp","ebp"); &je (&label("${mode}_done")); Loading @@ -302,28 +341,24 @@ my ($mode,$opcode) = @_; &ja (&label("${mode}_bzero")); &set_label("${mode}_done"); &mov ("ebp",&DWP(16,"ebp")); &lea ("esp",&DWP(24,"ebp")); if ($mode ne "ctr32") { &jmp (&label("${mode}_exit")); &set_label("${mode}_short",16); &xor ("eax","eax"); &lea ("ebp",&DWP(-24,"esp")); &sub ("eax",$len); &lea ("esp",&DWP(0,"eax","ebp")); &and ("esp",-16); &xor ($chunk,$chunk); &set_label("${mode}_short_copy"); &movups ("xmm0",&QWP(0,$inp,$chunk)); &lea ($chunk,&DWP(16,$chunk)); &cmp ($len,$chunk); &movaps (&QWP(-16,"esp",$chunk),"xmm0"); &ja (&label("${mode}_short_copy")); &mov ($inp,"esp"); &mov ($chunk,$len); &jmp (&label("${mode}_loop")); &set_label("${mode}_aligned",16); if ($PADLOCK_PREFETCH{$mode}) { &lea ("ebp",&DWP(0,$inp,$len)); &neg ("ebp"); &and ("ebp",0xfff); # distance to page boundary &xor ("eax","eax"); &cmp ("ebp",$PADLOCK_PREFETCH{$mode}); &mov ("ebp",$PADLOCK_PREFETCH{$mode}-1); &cmovae ("ebp","eax"); &and ("ebp",$len); # remainder &sub ($len,"ebp"); &jz (&label("${mode}_aligned_tail")); } &lea ("eax",&DWP(-16,$ctx)); # ivp &lea ("ebx",&DWP(16,$ctx)); # key &shr ($len,4); # len/=AES_BLOCK_SIZE Loading @@ -332,6 +367,29 @@ my ($mode,$opcode) = @_; &movaps ("xmm0",&QWP(0,"eax")); &movaps (&QWP(-16,$ctx),"xmm0"); # copy [or refresh] iv } if ($PADLOCK_PREFETCH{$mode}) { &test ("ebp","ebp"); &jz (&label("${mode}_exit")); &set_label("${mode}_aligned_tail"); &mov ($len,"ebp"); &lea ("ebp",&DWP(-24,"esp")); &mov ("esp","ebp"); &mov ("eax","ebp"); &sub ("esp",$len); &and ("ebp",-16); &and ("esp",-16); &mov (&DWP(16,"ebp"),"eax"); &mov ("eax", $out); # save parameters &mov ($chunk,$len); &shr ($len,2); &lea ($out,&DWP(0,"esp")); &data_byte(0xf3,0xa5); # rep movsl &mov ($inp,"esp"); &mov ($out,"eax"); # restore parameters &mov ($len,$chunk); &jmp (&label("${mode}_loop")); } &set_label("${mode}_exit"); } &mov ("eax",1); &lea ("esp",&DWP(4,"esp")); # popf Loading engines/asm/e_padlock-x86_64.pl +123 −55 Original line number Diff line number Diff line Loading @@ -27,7 +27,7 @@ open STDOUT,"| $^X $xlate $flavour $output"; $code=".text\n"; %PADLOCK_MARGIN=(ecb=>128, cbc=>64, ctr32=>64); # prefetch errata %PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32); # prefetch errata $PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20 $ctx="%rdx"; Loading Loading @@ -285,17 +285,6 @@ padlock_${mode}_encrypt: lea 16($ctx),$ctx # control word xor %eax,%eax xor %ebx,%ebx ___ # Formally speaking correct condtion is $len<=$margin and $inp+$margin # crosses page boundary [and next page is unreadable]. But $inp can # be unaligned in which case data can be copied to $out if latter is # aligned, in which case $out+$margin has to be checked. Covering all # cases appears more complicated than just copying short input... $code.=<<___ if ($PADLOCK_MARGIN{$mode}); cmp \$$PADLOCK_MARGIN{$mode},$len jbe .L${mode}_short ___ $code.=<<___; testl \$`1<<5`,($ctx) # align bit in control word jnz .L${mode}_aligned test \$0x0f,$out Loading @@ -315,6 +304,8 @@ $code.=<<___; neg %rax and \$$PADLOCK_CHUNK-1,$chunk # chunk%=PADLOCK_CHUNK lea (%rax,%rbp),%rsp mov \$$PADLOCK_CHUNK,%rax cmovz %rax,$chunk # chunk=chunk?:PADLOCK_CHUNK ___ $code.=<<___ if ($mode eq "ctr32"); .L${mode}_reenter: Loading @@ -322,10 +313,27 @@ $code.=<<___ if ($mode eq "ctr32"); bswap %eax neg %eax and \$`$PADLOCK_CHUNK/16-1`,%eax jz .L${mode}_loop mov \$$PADLOCK_CHUNK,$chunk shl \$4,%eax cmovz $chunk,%rax cmp %rax,$len cmova %rax,$chunk # don't let counter cross PADLOCK_CHUNK cmovbe $len,$chunk ___ $code.=<<___ if ($PADLOCK_PREFETCH{$mode}); cmp $chunk,$len ja .L${mode}_loop mov $inp,%rax # check if prefetch crosses page cmp %rsp,%rbp cmove $out,%rax add $len,%rax neg %rax and \$0xfff,%rax # distance to page boundary cmp \$$PADLOCK_PREFETCH{$mode},%rax mov \$-$PADLOCK_PREFETCH{$mode},%rax cmovae $chunk,%rax # mask=distance<prefetch?-prefetch:-1 and %rax,$chunk jz .L${mode}_unaligned_tail ___ $code.=<<___; jmp .L${mode}_loop Loading Loading @@ -360,12 +368,12 @@ ___ $code.=<<___ if ($mode eq "ctr32"); mov -4($ctx),%eax # pull 32-bit counter test \$0xffff0000,%eax jnz .L${mode}_no_corr jnz .L${mode}_no_carry bswap %eax add \$0x10000,%eax bswap %eax mov %eax,-4($ctx) .L${mode}_no_corr: .L${mode}_no_carry: ___ $code.=<<___; mov %r8,$out # restore paramters Loading @@ -373,8 +381,8 @@ $code.=<<___; test \$0x0f,$out jz .L${mode}_out_aligned mov $chunk,$len shr \$3,$len lea (%rsp),$inp shr \$3,$len .byte 0xf3,0x48,0xa5 # rep movsq sub $chunk,$out .L${mode}_out_aligned: Loading @@ -384,9 +392,52 @@ $code.=<<___; add $chunk,$inp sub $chunk,$len mov \$$PADLOCK_CHUNK,$chunk ___ if (!$PADLOCK_PREFETCH{$mode}) { $code.=<<___; jnz .L${mode}_loop ___ } else { $code.=<<___; jz .L${mode}_break cmp $chunk,$len jae .L${mode}_loop ___ $code.=<<___ if ($mode eq "ctr32"); mov $len,$chunk mov $inp,%rax # check if prefetch crosses page cmp %rsp,%rbp cmove $out,%rax add $len,%rax neg %rax and \$0xfff,%rax # distance to page boundary cmp \$$PADLOCK_PREFETCH{$mode},%rax mov \$-$PADLOCK_PREFETCH{$mode},%rax cmovae $chunk,%rax and %rax,$chunk jnz .L${mode}_loop ___ $code.=<<___; .L${mode}_unaligned_tail: xor %eax,%eax cmp %rsp,%rbp cmove $len,%rax mov $out,%r8 # save parameters mov $len,$chunk sub %rax,%rsp # alloca shr \$3,$len lea (%rsp),$out .byte 0xf3,0x48,0xa5 # rep movsq mov %rsp,$inp mov %r8, $out # restore parameters mov $chunk,$len jmp .L${mode}_loop .align 16 .L${mode}_break: ___ } $code.=<<___; cmp %rbp,%rsp je .L${mode}_done pxor %xmm0,%xmm0 Loading @@ -400,70 +451,87 @@ $code.=<<___; .L${mode}_done: lea (%rbp),%rsp jmp .L${mode}_exit ___ $code.=<<___ if ($PADLOCK_MARGIN{$mode}); .align 16 .L${mode}_short: mov %rsp,%rbp sub $len,%rsp xor $chunk,$chunk .L${mode}_short_copy: movups ($inp,$chunk),%xmm0 lea 16($chunk),$chunk cmp $chunk,$len movaps %xmm0,-16(%rsp,$chunk) ja .L${mode}_short_copy mov %rsp,$inp mov $len,$chunk jmp .L${mode}_`${mode} eq "ctr32"?"reenter":"loop"` ___ $code.=<<___; .align 16 .L${mode}_aligned: ___ $code.=<<___ if ($mode eq "ctr32"); mov -4($ctx),%eax # pull 32-bit counter mov \$`16*0x10000`,$chunk bswap %eax cmp $len,$chunk cmova $len,$chunk neg %eax and \$0xffff,%eax jz .L${mode}_aligned_loop mov \$`16*0x10000`,$chunk shl \$4,%eax cmovz $chunk,%rax cmp %rax,$len cmova %rax,$chunk # don't let counter cross 2^16 jmp .L${mode}_aligned_loop .align 16 cmovbe $len,$chunk jbe .L${mode}_aligned_skip .L${mode}_aligned_loop: cmp $len,$chunk cmova $len,$chunk mov $len,%r10 # save parameters mov $chunk,$len mov $chunk,%r11 ___ $code.=<<___; lea -16($ctx),%rax # ivp lea 16($ctx),%rbx # key shr \$4,$len # len/=AES_BLOCK_SIZE .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt* ___ $code.=<<___ if ($mode !~ /ecb|ctr/); movdqa (%rax),%xmm0 movdqa %xmm0,-16($ctx) # copy [or refresh] iv ___ $code.=<<___ if ($mode eq "ctr32"); mov -4($ctx),%eax # pull 32-bit counter bswap %eax add \$0x10000,%eax bswap %eax mov %eax,-4($ctx) mov %r11,$chunk # restore paramters mov %r10,$len sub $chunk,$len mov %r10,$len # restore paramters sub %r11,$len mov \$`16*0x10000`,$chunk jnz .L${mode}_aligned_loop jz .L${mode}_exit cmp $chunk,$len jae .L${mode}_aligned_loop .L${mode}_aligned_skip: ___ $code.=<<___ if ($PADLOCK_PREFETCH{$mode}); lea ($inp,$len),%rbp neg %rbp and \$0xfff,%rbp # distance to page boundary xor %eax,%eax cmp \$$PADLOCK_PREFETCH{$mode},%rbp mov \$$PADLOCK_PREFETCH{$mode}-1,%rbp cmovae %rax,%rbp and $len,%rbp # remainder sub %rbp,$len jz .L${mode}_aligned_tail ___ $code.=<<___; lea -16($ctx),%rax # ivp lea 16($ctx),%rbx # key shr \$4,$len # len/=AES_BLOCK_SIZE .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt* ___ $code.=<<___ if ($mode !~ /ecb|ctr/); movdqa (%rax),%xmm0 movdqa %xmm0,-16($ctx) # copy [or refresh] iv ___ $code.=<<___ if ($PADLOCK_PREFETCH{$mode}); test %rbp,%rbp # check remainder jz .L${mode}_exit .L${mode}_aligned_tail: mov $out,%r8 mov %rbp,$chunk mov %rbp,$len lea (%rsp),%rbp sub $len,%rsp shr \$3,$len lea (%rsp),$out .byte 0xf3,0x48,0xa5 # rep movsq lea (%r8),$out lea (%rsp),$inp mov $chunk,$len jmp .L${mode}_loop ___ $code.=<<___; .L${mode}_exit: Loading Loading
engines/asm/e_padlock-x86.pl +81 −23 Original line number Diff line number Diff line Loading @@ -37,7 +37,7 @@ require "x86asm.pl"; &asm_init($ARGV[0],$0); %PADLOCK_MARGIN=(ecb=>128, cbc=>64); # prefetch errata %PADLOCK_PREFETCH=(ecb=>128, cbc=>64); # prefetch errata $PADLOCK_CHUNK=512; # Must be a power of 2 larger than 16 $ctx="edx"; Loading Loading @@ -188,10 +188,6 @@ my ($mode,$opcode) = @_; &movq ("mm0",&QWP(-16,$ctx)); # load [upper part of] counter } else { &xor ("ebx","ebx"); if ($PADLOCK_MARGIN{$mode}) { &cmp ($len,$PADLOCK_MARGIN{$mode}); &jbe (&label("${mode}_short")); } &test (&DWP(0,$ctx),1<<5); # align bit in control word &jnz (&label("${mode}_aligned")); &test ($out,0x0f); Loading @@ -212,7 +208,27 @@ my ($mode,$opcode) = @_; &neg ("eax"); &and ($chunk,$PADLOCK_CHUNK-1); # chunk=len%PADLOCK_CHUNK &lea ("esp",&DWP(0,"eax","ebp")); # alloca &mov ("eax",$PADLOCK_CHUNK); &cmovz ($chunk,"eax"); # chunk=chunk?:PADLOCK_CHUNK &mov ("eax","ebp"); &and ("ebp",-16); &and ("esp",-16); &mov (&DWP(16,"ebp"),"eax"); if ($PADLOCK_PREFETCH{$mode}) { &cmp ($len,$chunk); &ja (&label("${mode}_loop")); &mov ("eax",$inp); # check if prefetch crosses page &cmp ("ebp","esp"); &cmove ("eax",$out); &add ("eax",$len); &neg ("eax"); &and ("eax",0xfff); # distance to page boundary &cmp ("eax",$PADLOCK_PREFETCH{$mode}); &mov ("eax",-$PADLOCK_PREFETCH{$mode}); &cmovae ("eax",$chunk); # mask=distance<prefetch?-prefetch:-1 &and ($chunk,"eax"); &jz (&label("${mode}_unaligned_tail")); } &jmp (&label("${mode}_loop")); &set_label("${mode}_loop",16); Loading Loading @@ -276,8 +292,8 @@ my ($mode,$opcode) = @_; &test ($out,0x0f); &jz (&label("${mode}_out_aligned")); &mov ($len,$chunk); &shr ($len,2); &lea ($inp,&DWP(0,"esp")); &shr ($len,2); &data_byte(0xf3,0xa5); # rep movsl &sub ($out,$chunk); &set_label("${mode}_out_aligned"); Loading @@ -288,7 +304,30 @@ my ($mode,$opcode) = @_; &add ($inp,$chunk); &sub ($len,$chunk); &mov ($chunk,$PADLOCK_CHUNK); if (!$PADLOCK_PREFETCH{$mode}) { &jnz (&label("${mode}_loop")); } else { &jz (&label("${mode}_break")); &cmp ($len,$chunk); &jae (&label("${mode}_loop")); &set_label("${mode}_unaligned_tail"); &xor ("eax","eax"); &cmp ("esp","ebp"); &cmove ("eax",$len); &sub ("esp","eax"); # alloca &mov ("eax", $out); # save parameters &mov ($chunk,$len); &shr ($len,2); &lea ($out,&DWP(0,"esp")); &data_byte(0xf3,0xa5); # rep movsl &mov ($inp,"esp"); &mov ($out,"eax"); # restore parameters &mov ($len,$chunk); &jmp (&label("${mode}_loop")); &set_label("${mode}_break",16); } if ($mode ne "ctr32") { &cmp ("esp","ebp"); &je (&label("${mode}_done")); Loading @@ -302,28 +341,24 @@ my ($mode,$opcode) = @_; &ja (&label("${mode}_bzero")); &set_label("${mode}_done"); &mov ("ebp",&DWP(16,"ebp")); &lea ("esp",&DWP(24,"ebp")); if ($mode ne "ctr32") { &jmp (&label("${mode}_exit")); &set_label("${mode}_short",16); &xor ("eax","eax"); &lea ("ebp",&DWP(-24,"esp")); &sub ("eax",$len); &lea ("esp",&DWP(0,"eax","ebp")); &and ("esp",-16); &xor ($chunk,$chunk); &set_label("${mode}_short_copy"); &movups ("xmm0",&QWP(0,$inp,$chunk)); &lea ($chunk,&DWP(16,$chunk)); &cmp ($len,$chunk); &movaps (&QWP(-16,"esp",$chunk),"xmm0"); &ja (&label("${mode}_short_copy")); &mov ($inp,"esp"); &mov ($chunk,$len); &jmp (&label("${mode}_loop")); &set_label("${mode}_aligned",16); if ($PADLOCK_PREFETCH{$mode}) { &lea ("ebp",&DWP(0,$inp,$len)); &neg ("ebp"); &and ("ebp",0xfff); # distance to page boundary &xor ("eax","eax"); &cmp ("ebp",$PADLOCK_PREFETCH{$mode}); &mov ("ebp",$PADLOCK_PREFETCH{$mode}-1); &cmovae ("ebp","eax"); &and ("ebp",$len); # remainder &sub ($len,"ebp"); &jz (&label("${mode}_aligned_tail")); } &lea ("eax",&DWP(-16,$ctx)); # ivp &lea ("ebx",&DWP(16,$ctx)); # key &shr ($len,4); # len/=AES_BLOCK_SIZE Loading @@ -332,6 +367,29 @@ my ($mode,$opcode) = @_; &movaps ("xmm0",&QWP(0,"eax")); &movaps (&QWP(-16,$ctx),"xmm0"); # copy [or refresh] iv } if ($PADLOCK_PREFETCH{$mode}) { &test ("ebp","ebp"); &jz (&label("${mode}_exit")); &set_label("${mode}_aligned_tail"); &mov ($len,"ebp"); &lea ("ebp",&DWP(-24,"esp")); &mov ("esp","ebp"); &mov ("eax","ebp"); &sub ("esp",$len); &and ("ebp",-16); &and ("esp",-16); &mov (&DWP(16,"ebp"),"eax"); &mov ("eax", $out); # save parameters &mov ($chunk,$len); &shr ($len,2); &lea ($out,&DWP(0,"esp")); &data_byte(0xf3,0xa5); # rep movsl &mov ($inp,"esp"); &mov ($out,"eax"); # restore parameters &mov ($len,$chunk); &jmp (&label("${mode}_loop")); } &set_label("${mode}_exit"); } &mov ("eax",1); &lea ("esp",&DWP(4,"esp")); # popf Loading
engines/asm/e_padlock-x86_64.pl +123 −55 Original line number Diff line number Diff line Loading @@ -27,7 +27,7 @@ open STDOUT,"| $^X $xlate $flavour $output"; $code=".text\n"; %PADLOCK_MARGIN=(ecb=>128, cbc=>64, ctr32=>64); # prefetch errata %PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32); # prefetch errata $PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20 $ctx="%rdx"; Loading Loading @@ -285,17 +285,6 @@ padlock_${mode}_encrypt: lea 16($ctx),$ctx # control word xor %eax,%eax xor %ebx,%ebx ___ # Formally speaking correct condtion is $len<=$margin and $inp+$margin # crosses page boundary [and next page is unreadable]. But $inp can # be unaligned in which case data can be copied to $out if latter is # aligned, in which case $out+$margin has to be checked. Covering all # cases appears more complicated than just copying short input... $code.=<<___ if ($PADLOCK_MARGIN{$mode}); cmp \$$PADLOCK_MARGIN{$mode},$len jbe .L${mode}_short ___ $code.=<<___; testl \$`1<<5`,($ctx) # align bit in control word jnz .L${mode}_aligned test \$0x0f,$out Loading @@ -315,6 +304,8 @@ $code.=<<___; neg %rax and \$$PADLOCK_CHUNK-1,$chunk # chunk%=PADLOCK_CHUNK lea (%rax,%rbp),%rsp mov \$$PADLOCK_CHUNK,%rax cmovz %rax,$chunk # chunk=chunk?:PADLOCK_CHUNK ___ $code.=<<___ if ($mode eq "ctr32"); .L${mode}_reenter: Loading @@ -322,10 +313,27 @@ $code.=<<___ if ($mode eq "ctr32"); bswap %eax neg %eax and \$`$PADLOCK_CHUNK/16-1`,%eax jz .L${mode}_loop mov \$$PADLOCK_CHUNK,$chunk shl \$4,%eax cmovz $chunk,%rax cmp %rax,$len cmova %rax,$chunk # don't let counter cross PADLOCK_CHUNK cmovbe $len,$chunk ___ $code.=<<___ if ($PADLOCK_PREFETCH{$mode}); cmp $chunk,$len ja .L${mode}_loop mov $inp,%rax # check if prefetch crosses page cmp %rsp,%rbp cmove $out,%rax add $len,%rax neg %rax and \$0xfff,%rax # distance to page boundary cmp \$$PADLOCK_PREFETCH{$mode},%rax mov \$-$PADLOCK_PREFETCH{$mode},%rax cmovae $chunk,%rax # mask=distance<prefetch?-prefetch:-1 and %rax,$chunk jz .L${mode}_unaligned_tail ___ $code.=<<___; jmp .L${mode}_loop Loading Loading @@ -360,12 +368,12 @@ ___ $code.=<<___ if ($mode eq "ctr32"); mov -4($ctx),%eax # pull 32-bit counter test \$0xffff0000,%eax jnz .L${mode}_no_corr jnz .L${mode}_no_carry bswap %eax add \$0x10000,%eax bswap %eax mov %eax,-4($ctx) .L${mode}_no_corr: .L${mode}_no_carry: ___ $code.=<<___; mov %r8,$out # restore paramters Loading @@ -373,8 +381,8 @@ $code.=<<___; test \$0x0f,$out jz .L${mode}_out_aligned mov $chunk,$len shr \$3,$len lea (%rsp),$inp shr \$3,$len .byte 0xf3,0x48,0xa5 # rep movsq sub $chunk,$out .L${mode}_out_aligned: Loading @@ -384,9 +392,52 @@ $code.=<<___; add $chunk,$inp sub $chunk,$len mov \$$PADLOCK_CHUNK,$chunk ___ if (!$PADLOCK_PREFETCH{$mode}) { $code.=<<___; jnz .L${mode}_loop ___ } else { $code.=<<___; jz .L${mode}_break cmp $chunk,$len jae .L${mode}_loop ___ $code.=<<___ if ($mode eq "ctr32"); mov $len,$chunk mov $inp,%rax # check if prefetch crosses page cmp %rsp,%rbp cmove $out,%rax add $len,%rax neg %rax and \$0xfff,%rax # distance to page boundary cmp \$$PADLOCK_PREFETCH{$mode},%rax mov \$-$PADLOCK_PREFETCH{$mode},%rax cmovae $chunk,%rax and %rax,$chunk jnz .L${mode}_loop ___ $code.=<<___; .L${mode}_unaligned_tail: xor %eax,%eax cmp %rsp,%rbp cmove $len,%rax mov $out,%r8 # save parameters mov $len,$chunk sub %rax,%rsp # alloca shr \$3,$len lea (%rsp),$out .byte 0xf3,0x48,0xa5 # rep movsq mov %rsp,$inp mov %r8, $out # restore parameters mov $chunk,$len jmp .L${mode}_loop .align 16 .L${mode}_break: ___ } $code.=<<___; cmp %rbp,%rsp je .L${mode}_done pxor %xmm0,%xmm0 Loading @@ -400,70 +451,87 @@ $code.=<<___; .L${mode}_done: lea (%rbp),%rsp jmp .L${mode}_exit ___ $code.=<<___ if ($PADLOCK_MARGIN{$mode}); .align 16 .L${mode}_short: mov %rsp,%rbp sub $len,%rsp xor $chunk,$chunk .L${mode}_short_copy: movups ($inp,$chunk),%xmm0 lea 16($chunk),$chunk cmp $chunk,$len movaps %xmm0,-16(%rsp,$chunk) ja .L${mode}_short_copy mov %rsp,$inp mov $len,$chunk jmp .L${mode}_`${mode} eq "ctr32"?"reenter":"loop"` ___ $code.=<<___; .align 16 .L${mode}_aligned: ___ $code.=<<___ if ($mode eq "ctr32"); mov -4($ctx),%eax # pull 32-bit counter mov \$`16*0x10000`,$chunk bswap %eax cmp $len,$chunk cmova $len,$chunk neg %eax and \$0xffff,%eax jz .L${mode}_aligned_loop mov \$`16*0x10000`,$chunk shl \$4,%eax cmovz $chunk,%rax cmp %rax,$len cmova %rax,$chunk # don't let counter cross 2^16 jmp .L${mode}_aligned_loop .align 16 cmovbe $len,$chunk jbe .L${mode}_aligned_skip .L${mode}_aligned_loop: cmp $len,$chunk cmova $len,$chunk mov $len,%r10 # save parameters mov $chunk,$len mov $chunk,%r11 ___ $code.=<<___; lea -16($ctx),%rax # ivp lea 16($ctx),%rbx # key shr \$4,$len # len/=AES_BLOCK_SIZE .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt* ___ $code.=<<___ if ($mode !~ /ecb|ctr/); movdqa (%rax),%xmm0 movdqa %xmm0,-16($ctx) # copy [or refresh] iv ___ $code.=<<___ if ($mode eq "ctr32"); mov -4($ctx),%eax # pull 32-bit counter bswap %eax add \$0x10000,%eax bswap %eax mov %eax,-4($ctx) mov %r11,$chunk # restore paramters mov %r10,$len sub $chunk,$len mov %r10,$len # restore paramters sub %r11,$len mov \$`16*0x10000`,$chunk jnz .L${mode}_aligned_loop jz .L${mode}_exit cmp $chunk,$len jae .L${mode}_aligned_loop .L${mode}_aligned_skip: ___ $code.=<<___ if ($PADLOCK_PREFETCH{$mode}); lea ($inp,$len),%rbp neg %rbp and \$0xfff,%rbp # distance to page boundary xor %eax,%eax cmp \$$PADLOCK_PREFETCH{$mode},%rbp mov \$$PADLOCK_PREFETCH{$mode}-1,%rbp cmovae %rax,%rbp and $len,%rbp # remainder sub %rbp,$len jz .L${mode}_aligned_tail ___ $code.=<<___; lea -16($ctx),%rax # ivp lea 16($ctx),%rbx # key shr \$4,$len # len/=AES_BLOCK_SIZE .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt* ___ $code.=<<___ if ($mode !~ /ecb|ctr/); movdqa (%rax),%xmm0 movdqa %xmm0,-16($ctx) # copy [or refresh] iv ___ $code.=<<___ if ($PADLOCK_PREFETCH{$mode}); test %rbp,%rbp # check remainder jz .L${mode}_exit .L${mode}_aligned_tail: mov $out,%r8 mov %rbp,$chunk mov %rbp,$len lea (%rsp),%rbp sub $len,%rsp shr \$3,$len lea (%rsp),$out .byte 0xf3,0x48,0xa5 # rep movsq lea (%r8),$out lea (%rsp),$inp mov $chunk,$len jmp .L${mode}_loop ___ $code.=<<___; .L${mode}_exit: Loading