Loading crypto/bn/asm/x86_64-mont.pl +151 −138 Original line number Diff line number Diff line Loading @@ -1685,6 +1685,7 @@ bn_mulx4x_mont: push %r15 shl \$3,${num}d # convert $num to bytes .byte 0x67 xor %r10,%r10 mov %rsp,%r11 # put aside %rsp sub $num,%r10 # -$num Loading Loading @@ -1725,15 +1726,14 @@ $code.=<<___; mov ($bp),%rdx # b[0], $bp==%rdx actually lea 64+32(%rsp),$tptr mov %rdx,$bi xor $zero,$zero # of=0,cf=0 mulx 0*8($aptr),$mi,%rax # a[0]*b[0] mulx 1*8($aptr),%r11,%r14 # a[1]*b[0] adcx %rax,%r11 add %rax,%r11 mov $bptr,8(%rsp) # off-load &b[i] mulx 2*8($aptr),%r12,%r13 # ... adcx %r14,%r12 adcx $zero,%r13 adc %r14,%r12 adc \$0,%r13 mov $mi,$bptr # borrow $bptr imulq 24(%rsp),$mi # "t[0]"*n0 Loading @@ -1751,13 +1751,12 @@ $code.=<<___; mulx 1*8($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 mulx 2*8($nptr),%rax,%r12 .byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 # mulx 2*8($nptr),%rax,%r12 mov 48(%rsp),$bptr # counter value mov %r10,-4*8($tptr) adcx %rax,%r11 adox %r13,%r12 mulx 3*8($nptr),%rax,%r15 .byte 0x66,0x66 mov $bi,%rdx mov %r11,-3*8($tptr) adcx %rax,%r12 Loading @@ -1765,7 +1764,7 @@ $code.=<<___; lea 4*8($nptr),$nptr mov %r12,-2*8($tptr) #jmp .Lmulx4x_1st jmp .Lmulx4x_1st .align 32 .Lmulx4x_1st: Loading Loading @@ -1863,7 +1862,6 @@ $code.=<<___; adox %r12,%r11 mulx 2*8($nptr),%rax,%r12 mov %r10,-4*8($tptr) mov 0*8($tptr),%r10 adcx %rax,%r11 adox %r13,%r12 mulx 3*8($nptr),%rax,%r15 Loading @@ -1872,23 +1870,22 @@ $code.=<<___; adcx %rax,%r12 adox $zero,%r15 # of=0 mov 48(%rsp),$bptr # counter value .byte 0x66,0x3e mov %r12,-2*8($tptr) .byte 0x66 lea 4*8($nptr),$nptr jmp .Lmulx4x_inner #jmp .Lmulx4x_inner .align 32 .Lmulx4x_inner: adcx $zero,%r15 # cf=0, modulo-scheduled adox %r10,%r14 adox 0*8($tptr),%r14 mulx 0*8($aptr),%r10,%rax # a[4]*b[i] mov 1*8($tptr),%r13 adcx %r14,%r10 mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] adox %rax,%r11 mulx 2*8($aptr),%r12,%rax # ... adcx %r13,%r11 adcx 1*8($tptr),%r11 adox %r14,%r12 mulx 3*8($aptr),%r13,%r14 mov $mi,%rdx Loading @@ -1896,8 +1893,8 @@ $code.=<<___; adox %rax,%r13 adcx 3*8($tptr),%r13 adox $zero,%r14 # of=0 .byte 0x48,0x8d,0xb6,0x20,0x00,0x00,0x00 # lea 4*8($aptr),$aptr .byte 0x48,0x8d,0x9b,0x20,0x00,0x00,0x00 # lea 4*8($tptr),$tptr lea 4*8($aptr),$aptr lea 4*8($tptr),$tptr adcx $zero,%r14 # cf=0 adox %r15,%r10 Loading @@ -1909,7 +1906,6 @@ $code.=<<___; adox %r15,%r12 mulx 2*8($nptr),%rax,%r15 mov %r10,-5*8($tptr) mov 0*8($tptr),%r10 adcx %rax,%r12 adox %r15,%r13 mulx 3*8($nptr),%rax,%r15 Loading @@ -1927,7 +1923,7 @@ $code.=<<___; mov 0(%rsp),$num # load num mov 8(%rsp),$bptr # re-load &b[i] adc $zero,%r15 # modulo-scheduled sub %r10,$zero # pull top-most carry sub 0*8($tptr),$zero # pull top-most carry adc %r15,%r14 sbb %r15,%r15 # top-most carry mov %r14,-1*8($tptr) Loading @@ -1936,10 +1932,10 @@ $code.=<<___; jne .Lmulx4x_outer neg $num xor %rdx,%rdx mov 32(%rsp),$rptr # restore rp lea 64(%rsp),$tptr xor %rdx,%rdx pxor %xmm0,%xmm0 mov 0*8($nptr,$num),%r8 mov 1*8($nptr,$num),%r9 Loading Loading @@ -2022,6 +2018,7 @@ bn_sqrx8x_mont: push %r15 shl \$3,${num}d # convert $num to bytes .byte 0x67 xor %r10,%r10 mov %rsp,%r11 # put aside %rsp sub $num,%r10 # -$num Loading @@ -2043,6 +2040,12 @@ bn_sqrx8x_mont: movq %r10, %xmm3 # -$num movq %r11, %xmm4 # save original %rsp mov $n0, 32(%rsp) ___ $code.=<<___ if ($win64); jmp .Lsqrx8x_body .align 32 ___ $code.=<<___; .Lsqrx8x_body: ################################################################## # Squaring part: Loading Loading @@ -2096,12 +2099,15 @@ $code.=<<___; mov $aaptr,8(%rsp) # save end of $aptr jmp .Lsqr8x_zero_start .align 32 .byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 .Lsqrx8x_zero: .byte 0x3e movdqa %xmm0,0*8($tptr) movdqa %xmm0,2*8($tptr) movdqa %xmm0,4*8($tptr) movdqa %xmm0,6*8($tptr) .Lsqr8x_zero_start: .Lsqr8x_zero_start: # aligned at 32 movdqa %xmm0,8*8($tptr) movdqa %xmm0,10*8($tptr) movdqa %xmm0,12*8($tptr) Loading @@ -2111,47 +2117,47 @@ $code.=<<___; jnz .Lsqrx8x_zero mov 0*8($aptr),%rdx # a[0], modulo-scheduled xor %r8,%r8 xor %r9,%r9 #xor %r9,%r9 # t[1], ex-$num, zero already xor %r10,%r10 xor %r11,%r11 xor %r12,%r12 xor %r13,%r13 xor %r14,%r14 xor %r15,%r15 lea 48(%rsp),$tptr xor $zero,$zero # cf=0, cf=0 jmp .Lsqrx8x_outer_loop .align 32 .Lsqrx8x_outer_loop: mulx 1*8($aptr),%rax,%rbx # a[1]*a[0] adcx %rax,%r8 # a[1]*a[0]+=t[1] adox %rbx,%r9 mulx 2*8($aptr),%rax,%rbx # a[2]*a[0] adcx %rax,%r9 adox %rbx,%r10 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%rax,%rbx # ... adcx %rax,%r10 adox %rbx,%r11 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%rax,%rbx adcx %rax,%r11 adox %rbx,%r12 mulx 5*8($aptr),%rax,%rbx adcx %rax,%r12 adox %rbx,%r13 mulx 6*8($aptr),%rax,%rbx adcx %rax,%r13 adox %rbx,%r14 mulx 7*8($aptr),%rax,%r15 mulx 1*8($aptr),%r8,%rax # a[1]*a[0] adcx %r9,%r8 # a[1]*a[0]+=t[1] adox %rax,%r10 mulx 2*8($aptr),%r9,%rax # a[2]*a[0] adcx %r10,%r9 adox %rax,%r11 .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ... adcx %r11,%r10 adox %rax,%r12 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax adcx %r12,%r11 adox %rax,%r13 mulx 5*8($aptr),%r12,%rax adcx %r13,%r12 adox %rax,%r14 mulx 6*8($aptr),%r13,%rax adcx %r14,%r13 adox %r15,%rax mulx 7*8($aptr),%r14,%r15 mov 1*8($aptr),%rdx # a[1] adcx %rax,%r14 adox $zero,%r15 adc 8*8($tptr),%r15 mov %r8,1*8($tptr) # t[1] mov %r9,2*8($tptr) # t[2] sbb $carry,$carry # mov %cf,$carry xor $zero,$zero # cf=0, of=0 mov %r8,1*8($tptr) # t[1] mov %r9,2*8($tptr) # t[2] mulx 2*8($aptr),%r8,%rbx # a[2]*a[1] mulx 3*8($aptr),%r9,%rax # a[3]*a[1] Loading Loading @@ -2193,13 +2199,12 @@ $code.=<<___; adcx %rbx,%r11 adox %rax,%r12 adcx %r14,%r12 adox $zero,%r13 # of=0 adcx $zero,%r13 # cf=0 mov %r8,5*8($tptr) # t[5] mov %r9,6*8($tptr) # t[6] mulx 4*8($aptr),%r8,%rax # a[4]*a[3] adox $zero,%r13 # of=0 adcx $zero,%r13 # cf=0 mulx 5*8($aptr),%r9,%rbx # a[5]*a[3] adcx %r10,%r8 adox %rax,%r9 Loading Loading @@ -2239,9 +2244,9 @@ $code.=<<___; adcx %r14,%r11 adox %rbx,%r12 adcx %rax,%r12 .byte 0x66,0x66 adox $zero,%r13 .byte 0x67,0x67 mulx %r8,%r8,%r14 # a[7]*a[6] adcx %r8,%r13 adcx $zero,%r14 Loading @@ -2250,26 +2255,26 @@ $code.=<<___; je .Lsqrx8x_outer_break neg $carry # mov $carry,%cf mov \$-8,%rcx mov $zero,%r15 mov 8*8($tptr),%r8 adc 9*8($tptr),%r9 # +=t[9] adc 10*8($tptr),%r10 # ... adc 11*8($tptr),%r11 adcx 9*8($tptr),%r9 # +=t[9] adcx 10*8($tptr),%r10 # ... adcx 11*8($tptr),%r11 adc 12*8($tptr),%r12 adc 13*8($tptr),%r13 adc 14*8($tptr),%r14 adc 15*8($tptr),%r15 lea 8*8($tptr),$tptr sbb $carry,$carry # mov %cf,$carry lea ($aptr),$aaptr lea 2*8*8($tptr),$tptr sbb %rax,%rax # mov %cf,$carry mov -64($aptr),%rdx # a[0] lea ($aptr),$aaptr mov $carry,16(%rsp) # offload $carry mov %rax,16(%rsp) # offload $carry mov $tptr,24(%rsp) lea 8*8($tptr),$tptr #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above xor %eax,%eax # cf=0, of=0 mov \$-8,%rcx jmp .Lsqrx8x_loop .align 32 Loading Loading @@ -2311,17 +2316,20 @@ $code.=<<___; adox %rbx,%r15 # %rbx is 0, of=0 adcx %rbx,%r15 # cf=0 .byte 0x67 inc %rcx # of=0 jnz .Lsqrx8x_loop lea 8*8($aaptr),$aaptr mov \$-8,%rcx cmp 8(%rsp),$aaptr # done? je .Lsqrx8x_break sub 16(%rsp),%rbx # mov 16(%rsp),%cf .byte 0x66 mov -64($aptr),%rdx adc 0*8($tptr),%r8 adc 1*8($tptr),%r9 adcx 0*8($tptr),%r8 adcx 1*8($tptr),%r9 adc 2*8($tptr),%r10 adc 3*8($tptr),%r11 adc 4*8($tptr),%r12 Loading @@ -2329,35 +2337,37 @@ $code.=<<___; adc 6*8($tptr),%r14 adc 7*8($tptr),%r15 lea 8*8($tptr),$tptr sbb %rbx,%rbx # mov %cf,%rbx xor %eax,%eax # cf=0, of=0 mov %rbx,16(%rsp) # offload carry mov \$-8,%rcx .byte 0x67 sbb %rax,%rax # mov %cf,%rax xor %ebx,%ebx # cf=0, of=0 mov %rax,16(%rsp) # offload carry jmp .Lsqrx8x_loop .align 32 .Lsqrx8x_break: sub 16(%rsp),%r8 # consume last carry mov 24(%rsp),$aaptr # initial $tptr mov 24(%rsp),$carry # initial $tptr, borrow $carry mov 0*8($aptr),%rdx # a[8], modulo-scheduled xor %ebp,%ebp # xor $zero,$zero mov %r8,0*8($tptr) lea 8*8($aaptr),$aaptr cmp $carry,$tptr # cf=0, of=0 je .Lsqrx8x_outer_loop mov %r9,1*8($tptr) mov 1*8($aaptr),%r8 # potentially forwarded store mov 1*8($carry),%r9 mov %r10,2*8($tptr) mov 2*8($aaptr),%r9 # ... mov 2*8($carry),%r10 mov %r11,3*8($tptr) mov 3*8($aaptr),%r10 mov 3*8($carry),%r11 mov %r12,4*8($tptr) mov 4*8($aaptr),%r11 mov 4*8($carry),%r12 mov %r13,5*8($tptr) mov 5*8($aaptr),%r12 mov 5*8($carry),%r13 mov %r14,6*8($tptr) mov 6*8($aaptr),%r13 mov 6*8($carry),%r14 mov %r15,7*8($tptr) mov 7*8($aaptr),%r14 mov $aaptr,$tptr xor $zero,$zero # cf=0, cf=0 mov 7*8($carry),%r15 mov $carry,$tptr jmp .Lsqrx8x_outer_loop .align 32 Loading @@ -2373,13 +2383,12 @@ ___ }{ my $i="%rcx"; $code.=<<___; mov (%rsp),$num # restore $num lea 48(%rsp),$tptr mov ($aptr,$i),%rdx # a[0] mov 8($tptr),$A0[1] # t[1] xor $A0[0],$A0[0] # t[0], of=0, cf=0 mov (%rsp),$num # restore $num adox $A0[1],$A0[1] mov 16($tptr),$A1[0] # t[2] # prefetch mov 24($tptr),$A1[1] # t[3] # prefetch Loading Loading @@ -2440,9 +2449,9 @@ $code.=<<___; .align 32 .Lsqrx4x_shift_n_add_break: adcx $A1[1],%rbx .byte 0x48,0x89,0x87,0x30,0x00,0x00,0x00 # mov %rax,48($tptr) .byte 0x48,0x89,0x9f,0x38,0x00,0x00,0x00 # mov %rbx,56($tptr) .byte 0x48,0x8d,0xbf,0x40,0x00,0x00,0x00 # lea 64($tptr),$tptr mov %rax,48($tptr) mov %rbx,56($tptr) lea 64($tptr),$tptr # end of t[] buffer ___ } ###################################################################### Loading @@ -2456,17 +2465,16 @@ my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx"); $code.=<<___; movq %xmm2,$nptr xor %eax,%eax # initial top-most carry bit mov 32(%rsp),%rbx # n0 mov 48(%rsp),%rdx # "%r8", 8*0($tptr) lea ($nptr,$num),%rax # end of n[] lea -64($nptr,$num),%rcx # end of n[] #lea 48(%rsp,$num,2),$tptr # end of t[] buffer mov %rax, 0(%rsp) # save end of n[] mov %rcx, 0(%rsp) # save end of n[] mov $tptr,8(%rsp) # save end of t[] lea 48(%rsp),$tptr # initial t[] window xor %rax,%rax nop #jmp .Lsqrx8x_reduction_loop jmp .Lsqrx8x_reduction_loop .align 32 .Lsqrx8x_reduction_loop: Loading Loading @@ -2529,29 +2537,31 @@ $code.=<<___; adox $carry,%r15 # $carry is 0 adcx $carry,%r15 # cf=0 .byte 0x67 inc %rcx # of=0 jnz .Lsqrx8x_reduce lea 8*8($nptr),$nptr xor %rax,%rax .byte 0x66,0x67 mov $carry,%rax # xor %rax,%rax cmp 0(%rsp),$nptr # end of n[]? jae .Lsqrx8x_no_tail mov 48(%rsp),%rdx # pull n0*a[0] add 8*0($tptr),%r8 adcx 8*1($tptr),%r9 adcx 8*2($tptr),%r10 adcx 8*3($tptr),%r11 adcx 8*4($tptr),%r12 adcx 8*5($tptr),%r13 adcx 8*6($tptr),%r14 adcx 8*7($tptr),%r15 lea 8*8($nptr),$nptr mov \$-8,%rcx adc 8*1($tptr),%r9 adc 8*2($tptr),%r10 adc 8*3($tptr),%r11 adc 8*4($tptr),%r12 adc 8*5($tptr),%r13 adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 lea 8*8($tptr),$tptr sbb $carry,$carry # top carry sbb %rax,%rax # top carry mov \$-8,%rcx mov $carry,16(%rsp) xor $carry,$carry # of=0, cf=0 mov %rax,16(%rsp) jmp .Lsqrx8x_tail .align 32 Loading Loading @@ -2588,7 +2598,7 @@ $code.=<<___; mulx 8*7($nptr),%rax,%r15 mov 48+72(%rsp,%rcx,8),%rdx # pull n0*a[i] adcx %rax,%r14 .byte 0x66 .byte 0x67 adox $carry,%r15 mov %rbx,($tptr,%rcx,8) # save result mov %r8,%rbx Loading @@ -2597,35 +2607,35 @@ $code.=<<___; inc %rcx # of=0 jnz .Lsqrx8x_tail lea 8*8($nptr),$nptr cmp 0(%rsp),$nptr # end of n[]? jae .Lsqrx8x_tail_done # break out of loop sub 16(%rsp),$carry # neg $carry sub 16(%rsp),$carry # mov 16(%rsp),%cf mov 48(%rsp),%rdx # pull n0*a[0] adcx 8*0($tptr),%r8 adcx 8*1($tptr),%r9 adcx 8*2($tptr),%r10 adcx 8*3($tptr),%r11 adcx 8*4($tptr),%r12 adcx 8*5($tptr),%r13 adcx 8*6($tptr),%r14 adcx 8*7($tptr),%r15 lea 8*8($nptr),$nptr adc 8*0($tptr),%r8 adc 8*1($tptr),%r9 adc 8*2($tptr),%r10 adc 8*3($tptr),%r11 adc 8*4($tptr),%r12 adc 8*5($tptr),%r13 adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 lea 8*8($tptr),$tptr sbb $carry,$carry mov \$-8,%rcx mov $carry,16(%rsp) sbb %rax,%rax xor $carry,$carry # of=0, cf=0 mov %rax,16(%rsp) jmp .Lsqrx8x_tail .align 32 .Lsqrx8x_tail_done: add 24(%rsp),%r8 # can this overflow? xor %rax,%rax mov $carry,%rax # xor %rax,%rax sub 16(%rsp),$carry # neg $carry .Lsqrx8x_no_tail: # carry flag is 0 sub 16(%rsp),$carry # mov 16(%rsp),%cf .Lsqrx8x_no_tail: # %cf is 0 if jumped here adc 8*0($tptr),%r8 movq %xmm3,%rcx adc 8*1($tptr),%r9 Loading @@ -2639,24 +2649,24 @@ $code.=<<___; adc 8*7($tptr),%r15 adc %rax,%rax # top-most carry cmp 8(%rsp),$carry # end of t[]? mov 32(%rsp),%rbx # n0 mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8" lea 8*8($tptr,%rcx),$tptr # start of current t[] window mov %r8,-8*8($carry) # store top 512 bits mov %r9,-8*7($carry) mov %r10,-8*6($carry) mov %r11,-8*5($carry) mov %r12,-8*4($carry) mov %r13,-8*3($carry) mov %r14,-8*2($carry) mov %r15,-8*1($carry) mov %r8,8*0($tptr) # store top 512 bits mov %r9,8*1($tptr) mov %r10,8*2($tptr) mov %r11,8*3($tptr) mov %r12,8*4($tptr) mov %r13,8*5($tptr) mov %r14,8*6($tptr) mov %r15,8*7($tptr) lea 8*8($tptr,%rcx),$tptr # start of current t[] window cmp 8(%rsp),$carry # end of t[]? jb .Lsqrx8x_reduction_loop mov %rcx,$num neg $num # restore $num mov %rcx,%rdx # -$num jmp .Lsqrx8x_post ___ } ############################################################## Loading @@ -2667,24 +2677,28 @@ my ($rptr,$nptr,$lptr,$i)=($aptr,"%rbp","%rbx","%rcx"); my @ri=map("%r$_",(10..13)); my @ni=map("%r$_",(14..15)); $code.=<<___; lea ($nptr,$num),$nptr # end of $nptr lea 48(%rsp,$num),$lptr # end of lower half of t[2*num] lea 48(%rsp,$num),$tptr .align 32 .Lsqrx8x_post: neg %rdx # restore $num neg %rax # top-most carry as mask mov 0*8($nptr),%r8 mov 1*8($nptr),%r9 lea ($nptr,%rdx),$nptr # end of $nptr lea 48(%rsp,%rdx),$lptr # end of lower half of t[2*num] lea 48(%rsp,%rdx),$tptr .byte 0x67 xor %rdx,%rdx movq %xmm1,$rptr # restore $rptr mov 0*8($nptr,$i),%r8 mov 1*8($nptr,$i),%r9 neg %r8 jmp .Lsqrx8x_sub_entry .align 32 .byte 0x66,0x66,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 .Lsqrx8x_sub: mov 0*8($nptr,$i),%r8 mov 1*8($nptr,$i),%r9 not %r8 .Lsqrx8x_sub_entry: .Lsqrx8x_sub_entry: # aligned at 32 mov 2*8($nptr,$i),%r10 not %r9 and %rax,%r8 Loading @@ -2709,28 +2723,27 @@ $code.=<<___; movdqa %xmm0,2*8($lptr,$i) and %rax,%r15 neg %rdx # mov %rdx,%cf neg %edx # mov %edx,%cf movdqa %xmm0,4*8($lptr,$i) adc 0*8($tptr),%r8 mov %r8,0*8($rptr) # result adc 1*8($tptr),%r9 movdqa %xmm0,6*8($lptr,$i) adc 2*8($tptr),%r10 mov %r9,1*8($rptr) adc 3*8($tptr),%r11 movdqa %xmm0,0*8($tptr) # zap upper half adc 4*8($tptr),%r12 mov %r10,2*8($rptr) adc 5*8($tptr),%r13 movdqa %xmm0,2*8($tptr) adc 6*8($tptr),%r14 mov %r11,3*8($rptr) adc 7*8($tptr),%r15 sbb %edx,%edx # mov %cf,%edx movdqa %xmm0,4*8($tptr) sbb %rdx,%rdx # mov %cf,%rdx movdqa %xmm0,6*8($tptr) lea 8*8($tptr),$tptr mov %r8,0*8($rptr) mov %r9,1*8($rptr) mov %r10,2*8($rptr) mov %r11,3*8($rptr) mov %r12,4*8($rptr) mov %r13,5*8($rptr) mov %r14,6*8($rptr) Loading Loading
crypto/bn/asm/x86_64-mont.pl +151 −138 Original line number Diff line number Diff line Loading @@ -1685,6 +1685,7 @@ bn_mulx4x_mont: push %r15 shl \$3,${num}d # convert $num to bytes .byte 0x67 xor %r10,%r10 mov %rsp,%r11 # put aside %rsp sub $num,%r10 # -$num Loading Loading @@ -1725,15 +1726,14 @@ $code.=<<___; mov ($bp),%rdx # b[0], $bp==%rdx actually lea 64+32(%rsp),$tptr mov %rdx,$bi xor $zero,$zero # of=0,cf=0 mulx 0*8($aptr),$mi,%rax # a[0]*b[0] mulx 1*8($aptr),%r11,%r14 # a[1]*b[0] adcx %rax,%r11 add %rax,%r11 mov $bptr,8(%rsp) # off-load &b[i] mulx 2*8($aptr),%r12,%r13 # ... adcx %r14,%r12 adcx $zero,%r13 adc %r14,%r12 adc \$0,%r13 mov $mi,$bptr # borrow $bptr imulq 24(%rsp),$mi # "t[0]"*n0 Loading @@ -1751,13 +1751,12 @@ $code.=<<___; mulx 1*8($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 mulx 2*8($nptr),%rax,%r12 .byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 # mulx 2*8($nptr),%rax,%r12 mov 48(%rsp),$bptr # counter value mov %r10,-4*8($tptr) adcx %rax,%r11 adox %r13,%r12 mulx 3*8($nptr),%rax,%r15 .byte 0x66,0x66 mov $bi,%rdx mov %r11,-3*8($tptr) adcx %rax,%r12 Loading @@ -1765,7 +1764,7 @@ $code.=<<___; lea 4*8($nptr),$nptr mov %r12,-2*8($tptr) #jmp .Lmulx4x_1st jmp .Lmulx4x_1st .align 32 .Lmulx4x_1st: Loading Loading @@ -1863,7 +1862,6 @@ $code.=<<___; adox %r12,%r11 mulx 2*8($nptr),%rax,%r12 mov %r10,-4*8($tptr) mov 0*8($tptr),%r10 adcx %rax,%r11 adox %r13,%r12 mulx 3*8($nptr),%rax,%r15 Loading @@ -1872,23 +1870,22 @@ $code.=<<___; adcx %rax,%r12 adox $zero,%r15 # of=0 mov 48(%rsp),$bptr # counter value .byte 0x66,0x3e mov %r12,-2*8($tptr) .byte 0x66 lea 4*8($nptr),$nptr jmp .Lmulx4x_inner #jmp .Lmulx4x_inner .align 32 .Lmulx4x_inner: adcx $zero,%r15 # cf=0, modulo-scheduled adox %r10,%r14 adox 0*8($tptr),%r14 mulx 0*8($aptr),%r10,%rax # a[4]*b[i] mov 1*8($tptr),%r13 adcx %r14,%r10 mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] adox %rax,%r11 mulx 2*8($aptr),%r12,%rax # ... adcx %r13,%r11 adcx 1*8($tptr),%r11 adox %r14,%r12 mulx 3*8($aptr),%r13,%r14 mov $mi,%rdx Loading @@ -1896,8 +1893,8 @@ $code.=<<___; adox %rax,%r13 adcx 3*8($tptr),%r13 adox $zero,%r14 # of=0 .byte 0x48,0x8d,0xb6,0x20,0x00,0x00,0x00 # lea 4*8($aptr),$aptr .byte 0x48,0x8d,0x9b,0x20,0x00,0x00,0x00 # lea 4*8($tptr),$tptr lea 4*8($aptr),$aptr lea 4*8($tptr),$tptr adcx $zero,%r14 # cf=0 adox %r15,%r10 Loading @@ -1909,7 +1906,6 @@ $code.=<<___; adox %r15,%r12 mulx 2*8($nptr),%rax,%r15 mov %r10,-5*8($tptr) mov 0*8($tptr),%r10 adcx %rax,%r12 adox %r15,%r13 mulx 3*8($nptr),%rax,%r15 Loading @@ -1927,7 +1923,7 @@ $code.=<<___; mov 0(%rsp),$num # load num mov 8(%rsp),$bptr # re-load &b[i] adc $zero,%r15 # modulo-scheduled sub %r10,$zero # pull top-most carry sub 0*8($tptr),$zero # pull top-most carry adc %r15,%r14 sbb %r15,%r15 # top-most carry mov %r14,-1*8($tptr) Loading @@ -1936,10 +1932,10 @@ $code.=<<___; jne .Lmulx4x_outer neg $num xor %rdx,%rdx mov 32(%rsp),$rptr # restore rp lea 64(%rsp),$tptr xor %rdx,%rdx pxor %xmm0,%xmm0 mov 0*8($nptr,$num),%r8 mov 1*8($nptr,$num),%r9 Loading Loading @@ -2022,6 +2018,7 @@ bn_sqrx8x_mont: push %r15 shl \$3,${num}d # convert $num to bytes .byte 0x67 xor %r10,%r10 mov %rsp,%r11 # put aside %rsp sub $num,%r10 # -$num Loading @@ -2043,6 +2040,12 @@ bn_sqrx8x_mont: movq %r10, %xmm3 # -$num movq %r11, %xmm4 # save original %rsp mov $n0, 32(%rsp) ___ $code.=<<___ if ($win64); jmp .Lsqrx8x_body .align 32 ___ $code.=<<___; .Lsqrx8x_body: ################################################################## # Squaring part: Loading Loading @@ -2096,12 +2099,15 @@ $code.=<<___; mov $aaptr,8(%rsp) # save end of $aptr jmp .Lsqr8x_zero_start .align 32 .byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 .Lsqrx8x_zero: .byte 0x3e movdqa %xmm0,0*8($tptr) movdqa %xmm0,2*8($tptr) movdqa %xmm0,4*8($tptr) movdqa %xmm0,6*8($tptr) .Lsqr8x_zero_start: .Lsqr8x_zero_start: # aligned at 32 movdqa %xmm0,8*8($tptr) movdqa %xmm0,10*8($tptr) movdqa %xmm0,12*8($tptr) Loading @@ -2111,47 +2117,47 @@ $code.=<<___; jnz .Lsqrx8x_zero mov 0*8($aptr),%rdx # a[0], modulo-scheduled xor %r8,%r8 xor %r9,%r9 #xor %r9,%r9 # t[1], ex-$num, zero already xor %r10,%r10 xor %r11,%r11 xor %r12,%r12 xor %r13,%r13 xor %r14,%r14 xor %r15,%r15 lea 48(%rsp),$tptr xor $zero,$zero # cf=0, cf=0 jmp .Lsqrx8x_outer_loop .align 32 .Lsqrx8x_outer_loop: mulx 1*8($aptr),%rax,%rbx # a[1]*a[0] adcx %rax,%r8 # a[1]*a[0]+=t[1] adox %rbx,%r9 mulx 2*8($aptr),%rax,%rbx # a[2]*a[0] adcx %rax,%r9 adox %rbx,%r10 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%rax,%rbx # ... adcx %rax,%r10 adox %rbx,%r11 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%rax,%rbx adcx %rax,%r11 adox %rbx,%r12 mulx 5*8($aptr),%rax,%rbx adcx %rax,%r12 adox %rbx,%r13 mulx 6*8($aptr),%rax,%rbx adcx %rax,%r13 adox %rbx,%r14 mulx 7*8($aptr),%rax,%r15 mulx 1*8($aptr),%r8,%rax # a[1]*a[0] adcx %r9,%r8 # a[1]*a[0]+=t[1] adox %rax,%r10 mulx 2*8($aptr),%r9,%rax # a[2]*a[0] adcx %r10,%r9 adox %rax,%r11 .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ... adcx %r11,%r10 adox %rax,%r12 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax adcx %r12,%r11 adox %rax,%r13 mulx 5*8($aptr),%r12,%rax adcx %r13,%r12 adox %rax,%r14 mulx 6*8($aptr),%r13,%rax adcx %r14,%r13 adox %r15,%rax mulx 7*8($aptr),%r14,%r15 mov 1*8($aptr),%rdx # a[1] adcx %rax,%r14 adox $zero,%r15 adc 8*8($tptr),%r15 mov %r8,1*8($tptr) # t[1] mov %r9,2*8($tptr) # t[2] sbb $carry,$carry # mov %cf,$carry xor $zero,$zero # cf=0, of=0 mov %r8,1*8($tptr) # t[1] mov %r9,2*8($tptr) # t[2] mulx 2*8($aptr),%r8,%rbx # a[2]*a[1] mulx 3*8($aptr),%r9,%rax # a[3]*a[1] Loading Loading @@ -2193,13 +2199,12 @@ $code.=<<___; adcx %rbx,%r11 adox %rax,%r12 adcx %r14,%r12 adox $zero,%r13 # of=0 adcx $zero,%r13 # cf=0 mov %r8,5*8($tptr) # t[5] mov %r9,6*8($tptr) # t[6] mulx 4*8($aptr),%r8,%rax # a[4]*a[3] adox $zero,%r13 # of=0 adcx $zero,%r13 # cf=0 mulx 5*8($aptr),%r9,%rbx # a[5]*a[3] adcx %r10,%r8 adox %rax,%r9 Loading Loading @@ -2239,9 +2244,9 @@ $code.=<<___; adcx %r14,%r11 adox %rbx,%r12 adcx %rax,%r12 .byte 0x66,0x66 adox $zero,%r13 .byte 0x67,0x67 mulx %r8,%r8,%r14 # a[7]*a[6] adcx %r8,%r13 adcx $zero,%r14 Loading @@ -2250,26 +2255,26 @@ $code.=<<___; je .Lsqrx8x_outer_break neg $carry # mov $carry,%cf mov \$-8,%rcx mov $zero,%r15 mov 8*8($tptr),%r8 adc 9*8($tptr),%r9 # +=t[9] adc 10*8($tptr),%r10 # ... adc 11*8($tptr),%r11 adcx 9*8($tptr),%r9 # +=t[9] adcx 10*8($tptr),%r10 # ... adcx 11*8($tptr),%r11 adc 12*8($tptr),%r12 adc 13*8($tptr),%r13 adc 14*8($tptr),%r14 adc 15*8($tptr),%r15 lea 8*8($tptr),$tptr sbb $carry,$carry # mov %cf,$carry lea ($aptr),$aaptr lea 2*8*8($tptr),$tptr sbb %rax,%rax # mov %cf,$carry mov -64($aptr),%rdx # a[0] lea ($aptr),$aaptr mov $carry,16(%rsp) # offload $carry mov %rax,16(%rsp) # offload $carry mov $tptr,24(%rsp) lea 8*8($tptr),$tptr #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above xor %eax,%eax # cf=0, of=0 mov \$-8,%rcx jmp .Lsqrx8x_loop .align 32 Loading Loading @@ -2311,17 +2316,20 @@ $code.=<<___; adox %rbx,%r15 # %rbx is 0, of=0 adcx %rbx,%r15 # cf=0 .byte 0x67 inc %rcx # of=0 jnz .Lsqrx8x_loop lea 8*8($aaptr),$aaptr mov \$-8,%rcx cmp 8(%rsp),$aaptr # done? je .Lsqrx8x_break sub 16(%rsp),%rbx # mov 16(%rsp),%cf .byte 0x66 mov -64($aptr),%rdx adc 0*8($tptr),%r8 adc 1*8($tptr),%r9 adcx 0*8($tptr),%r8 adcx 1*8($tptr),%r9 adc 2*8($tptr),%r10 adc 3*8($tptr),%r11 adc 4*8($tptr),%r12 Loading @@ -2329,35 +2337,37 @@ $code.=<<___; adc 6*8($tptr),%r14 adc 7*8($tptr),%r15 lea 8*8($tptr),$tptr sbb %rbx,%rbx # mov %cf,%rbx xor %eax,%eax # cf=0, of=0 mov %rbx,16(%rsp) # offload carry mov \$-8,%rcx .byte 0x67 sbb %rax,%rax # mov %cf,%rax xor %ebx,%ebx # cf=0, of=0 mov %rax,16(%rsp) # offload carry jmp .Lsqrx8x_loop .align 32 .Lsqrx8x_break: sub 16(%rsp),%r8 # consume last carry mov 24(%rsp),$aaptr # initial $tptr mov 24(%rsp),$carry # initial $tptr, borrow $carry mov 0*8($aptr),%rdx # a[8], modulo-scheduled xor %ebp,%ebp # xor $zero,$zero mov %r8,0*8($tptr) lea 8*8($aaptr),$aaptr cmp $carry,$tptr # cf=0, of=0 je .Lsqrx8x_outer_loop mov %r9,1*8($tptr) mov 1*8($aaptr),%r8 # potentially forwarded store mov 1*8($carry),%r9 mov %r10,2*8($tptr) mov 2*8($aaptr),%r9 # ... mov 2*8($carry),%r10 mov %r11,3*8($tptr) mov 3*8($aaptr),%r10 mov 3*8($carry),%r11 mov %r12,4*8($tptr) mov 4*8($aaptr),%r11 mov 4*8($carry),%r12 mov %r13,5*8($tptr) mov 5*8($aaptr),%r12 mov 5*8($carry),%r13 mov %r14,6*8($tptr) mov 6*8($aaptr),%r13 mov 6*8($carry),%r14 mov %r15,7*8($tptr) mov 7*8($aaptr),%r14 mov $aaptr,$tptr xor $zero,$zero # cf=0, cf=0 mov 7*8($carry),%r15 mov $carry,$tptr jmp .Lsqrx8x_outer_loop .align 32 Loading @@ -2373,13 +2383,12 @@ ___ }{ my $i="%rcx"; $code.=<<___; mov (%rsp),$num # restore $num lea 48(%rsp),$tptr mov ($aptr,$i),%rdx # a[0] mov 8($tptr),$A0[1] # t[1] xor $A0[0],$A0[0] # t[0], of=0, cf=0 mov (%rsp),$num # restore $num adox $A0[1],$A0[1] mov 16($tptr),$A1[0] # t[2] # prefetch mov 24($tptr),$A1[1] # t[3] # prefetch Loading Loading @@ -2440,9 +2449,9 @@ $code.=<<___; .align 32 .Lsqrx4x_shift_n_add_break: adcx $A1[1],%rbx .byte 0x48,0x89,0x87,0x30,0x00,0x00,0x00 # mov %rax,48($tptr) .byte 0x48,0x89,0x9f,0x38,0x00,0x00,0x00 # mov %rbx,56($tptr) .byte 0x48,0x8d,0xbf,0x40,0x00,0x00,0x00 # lea 64($tptr),$tptr mov %rax,48($tptr) mov %rbx,56($tptr) lea 64($tptr),$tptr # end of t[] buffer ___ } ###################################################################### Loading @@ -2456,17 +2465,16 @@ my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx"); $code.=<<___; movq %xmm2,$nptr xor %eax,%eax # initial top-most carry bit mov 32(%rsp),%rbx # n0 mov 48(%rsp),%rdx # "%r8", 8*0($tptr) lea ($nptr,$num),%rax # end of n[] lea -64($nptr,$num),%rcx # end of n[] #lea 48(%rsp,$num,2),$tptr # end of t[] buffer mov %rax, 0(%rsp) # save end of n[] mov %rcx, 0(%rsp) # save end of n[] mov $tptr,8(%rsp) # save end of t[] lea 48(%rsp),$tptr # initial t[] window xor %rax,%rax nop #jmp .Lsqrx8x_reduction_loop jmp .Lsqrx8x_reduction_loop .align 32 .Lsqrx8x_reduction_loop: Loading Loading @@ -2529,29 +2537,31 @@ $code.=<<___; adox $carry,%r15 # $carry is 0 adcx $carry,%r15 # cf=0 .byte 0x67 inc %rcx # of=0 jnz .Lsqrx8x_reduce lea 8*8($nptr),$nptr xor %rax,%rax .byte 0x66,0x67 mov $carry,%rax # xor %rax,%rax cmp 0(%rsp),$nptr # end of n[]? jae .Lsqrx8x_no_tail mov 48(%rsp),%rdx # pull n0*a[0] add 8*0($tptr),%r8 adcx 8*1($tptr),%r9 adcx 8*2($tptr),%r10 adcx 8*3($tptr),%r11 adcx 8*4($tptr),%r12 adcx 8*5($tptr),%r13 adcx 8*6($tptr),%r14 adcx 8*7($tptr),%r15 lea 8*8($nptr),$nptr mov \$-8,%rcx adc 8*1($tptr),%r9 adc 8*2($tptr),%r10 adc 8*3($tptr),%r11 adc 8*4($tptr),%r12 adc 8*5($tptr),%r13 adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 lea 8*8($tptr),$tptr sbb $carry,$carry # top carry sbb %rax,%rax # top carry mov \$-8,%rcx mov $carry,16(%rsp) xor $carry,$carry # of=0, cf=0 mov %rax,16(%rsp) jmp .Lsqrx8x_tail .align 32 Loading Loading @@ -2588,7 +2598,7 @@ $code.=<<___; mulx 8*7($nptr),%rax,%r15 mov 48+72(%rsp,%rcx,8),%rdx # pull n0*a[i] adcx %rax,%r14 .byte 0x66 .byte 0x67 adox $carry,%r15 mov %rbx,($tptr,%rcx,8) # save result mov %r8,%rbx Loading @@ -2597,35 +2607,35 @@ $code.=<<___; inc %rcx # of=0 jnz .Lsqrx8x_tail lea 8*8($nptr),$nptr cmp 0(%rsp),$nptr # end of n[]? jae .Lsqrx8x_tail_done # break out of loop sub 16(%rsp),$carry # neg $carry sub 16(%rsp),$carry # mov 16(%rsp),%cf mov 48(%rsp),%rdx # pull n0*a[0] adcx 8*0($tptr),%r8 adcx 8*1($tptr),%r9 adcx 8*2($tptr),%r10 adcx 8*3($tptr),%r11 adcx 8*4($tptr),%r12 adcx 8*5($tptr),%r13 adcx 8*6($tptr),%r14 adcx 8*7($tptr),%r15 lea 8*8($nptr),$nptr adc 8*0($tptr),%r8 adc 8*1($tptr),%r9 adc 8*2($tptr),%r10 adc 8*3($tptr),%r11 adc 8*4($tptr),%r12 adc 8*5($tptr),%r13 adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 lea 8*8($tptr),$tptr sbb $carry,$carry mov \$-8,%rcx mov $carry,16(%rsp) sbb %rax,%rax xor $carry,$carry # of=0, cf=0 mov %rax,16(%rsp) jmp .Lsqrx8x_tail .align 32 .Lsqrx8x_tail_done: add 24(%rsp),%r8 # can this overflow? xor %rax,%rax mov $carry,%rax # xor %rax,%rax sub 16(%rsp),$carry # neg $carry .Lsqrx8x_no_tail: # carry flag is 0 sub 16(%rsp),$carry # mov 16(%rsp),%cf .Lsqrx8x_no_tail: # %cf is 0 if jumped here adc 8*0($tptr),%r8 movq %xmm3,%rcx adc 8*1($tptr),%r9 Loading @@ -2639,24 +2649,24 @@ $code.=<<___; adc 8*7($tptr),%r15 adc %rax,%rax # top-most carry cmp 8(%rsp),$carry # end of t[]? mov 32(%rsp),%rbx # n0 mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8" lea 8*8($tptr,%rcx),$tptr # start of current t[] window mov %r8,-8*8($carry) # store top 512 bits mov %r9,-8*7($carry) mov %r10,-8*6($carry) mov %r11,-8*5($carry) mov %r12,-8*4($carry) mov %r13,-8*3($carry) mov %r14,-8*2($carry) mov %r15,-8*1($carry) mov %r8,8*0($tptr) # store top 512 bits mov %r9,8*1($tptr) mov %r10,8*2($tptr) mov %r11,8*3($tptr) mov %r12,8*4($tptr) mov %r13,8*5($tptr) mov %r14,8*6($tptr) mov %r15,8*7($tptr) lea 8*8($tptr,%rcx),$tptr # start of current t[] window cmp 8(%rsp),$carry # end of t[]? jb .Lsqrx8x_reduction_loop mov %rcx,$num neg $num # restore $num mov %rcx,%rdx # -$num jmp .Lsqrx8x_post ___ } ############################################################## Loading @@ -2667,24 +2677,28 @@ my ($rptr,$nptr,$lptr,$i)=($aptr,"%rbp","%rbx","%rcx"); my @ri=map("%r$_",(10..13)); my @ni=map("%r$_",(14..15)); $code.=<<___; lea ($nptr,$num),$nptr # end of $nptr lea 48(%rsp,$num),$lptr # end of lower half of t[2*num] lea 48(%rsp,$num),$tptr .align 32 .Lsqrx8x_post: neg %rdx # restore $num neg %rax # top-most carry as mask mov 0*8($nptr),%r8 mov 1*8($nptr),%r9 lea ($nptr,%rdx),$nptr # end of $nptr lea 48(%rsp,%rdx),$lptr # end of lower half of t[2*num] lea 48(%rsp,%rdx),$tptr .byte 0x67 xor %rdx,%rdx movq %xmm1,$rptr # restore $rptr mov 0*8($nptr,$i),%r8 mov 1*8($nptr,$i),%r9 neg %r8 jmp .Lsqrx8x_sub_entry .align 32 .byte 0x66,0x66,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 .Lsqrx8x_sub: mov 0*8($nptr,$i),%r8 mov 1*8($nptr,$i),%r9 not %r8 .Lsqrx8x_sub_entry: .Lsqrx8x_sub_entry: # aligned at 32 mov 2*8($nptr,$i),%r10 not %r9 and %rax,%r8 Loading @@ -2709,28 +2723,27 @@ $code.=<<___; movdqa %xmm0,2*8($lptr,$i) and %rax,%r15 neg %rdx # mov %rdx,%cf neg %edx # mov %edx,%cf movdqa %xmm0,4*8($lptr,$i) adc 0*8($tptr),%r8 mov %r8,0*8($rptr) # result adc 1*8($tptr),%r9 movdqa %xmm0,6*8($lptr,$i) adc 2*8($tptr),%r10 mov %r9,1*8($rptr) adc 3*8($tptr),%r11 movdqa %xmm0,0*8($tptr) # zap upper half adc 4*8($tptr),%r12 mov %r10,2*8($rptr) adc 5*8($tptr),%r13 movdqa %xmm0,2*8($tptr) adc 6*8($tptr),%r14 mov %r11,3*8($rptr) adc 7*8($tptr),%r15 sbb %edx,%edx # mov %cf,%edx movdqa %xmm0,4*8($tptr) sbb %rdx,%rdx # mov %cf,%rdx movdqa %xmm0,6*8($tptr) lea 8*8($tptr),$tptr mov %r8,0*8($rptr) mov %r9,1*8($rptr) mov %r10,2*8($rptr) mov %r11,3*8($rptr) mov %r12,4*8($rptr) mov %r13,5*8($rptr) mov %r14,6*8($rptr) Loading