crypto/bn/asm/x86_64-mont.pl  +126 −62

@@ -814,34 +814,87 @@
$code.=<<___ if ($addx);
	jne	.Lsqr8x_nox

	call	bn_sqrx8x_internal	# see x86_64-mont5 module
	pxor	%xmm0,%xmm0
	lea	48(%rsp),%rax
	shr	\$3+2,$num
	mov	40(%rsp),%rsi		# restore %rsp
	jmp	.Lsqr8x_zero
					# %rax	top-most carry
					# %rbp	nptr
					# %rcx	-8*num
					# %r8	end of tp[2*num]
	lea	(%r8,%rcx),%rbx
	mov	%rcx,$num
	mov	%rcx,%rdx
	movq	%xmm1,$rptr
	sar	\$3+2,%rcx		# %cf=0
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_nox:
___
$code.=<<___;
	call	bn_sqr8x_internal	# see x86_64-mont5 module
					# %rax	top-most carry
					# %rbp	nptr
					# %r8	-8*num
					# %rdi	end of tp[2*num]
	lea	(%rdi,$num),%rbx
	mov	$num,%rcx
	mov	$num,%rdx
	movq	%xmm1,$rptr
	sar	\$3+2,%rcx		# %cf=0
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_sub:
	mov	8*0(%rbx),%r12
	mov	8*1(%rbx),%r13
	mov	8*2(%rbx),%r14
	mov	8*3(%rbx),%r15
	lea	8*4(%rbx),%rbx
	sbb	8*0(%rbp),%r12
	sbb	8*1(%rbp),%r13
	sbb	8*2(%rbp),%r14
	sbb	8*3(%rbp),%r15
	lea	8*4(%rbp),%rbp
	mov	%r12,8*0($rptr)
	mov	%r13,8*1($rptr)
	mov	%r14,8*2($rptr)
	mov	%r15,8*3($rptr)
	lea	8*4($rptr),$rptr
	inc	%rcx			# preserves %cf
	jnz	.Lsqr8x_sub

	sbb	\$0,%rax		# top-most carry
	lea	(%rbx,$num),%rbx	# rewind
	lea	($rptr,$num),$rptr	# rewind

	movq	%rax,%xmm1
	pxor	%xmm0,%xmm0
	lea	48(%rsp),%rax
	shr	\$3+2,$num
	pshufd	\$0,%xmm1,%xmm1
	mov	40(%rsp),%rsi		# restore %rsp
	jmp	.Lsqr8x_zero
	jmp	.Lsqr8x_cond_copy

.align	32
.Lsqr8x_zero:
	movdqa	%xmm0,16*0(%rax)	# wipe t
	movdqa	%xmm0,16*1(%rax)
	movdqa	%xmm0,16*2(%rax)
	movdqa	%xmm0,16*3(%rax)
	lea	16*4(%rax),%rax
	dec	$num
	jnz	.Lsqr8x_zero
.Lsqr8x_cond_copy:
	movdqa	16*0(%rbx),%xmm2
	movdqa	16*1(%rbx),%xmm3
	lea	16*2(%rbx),%rbx
	movdqu	16*0($rptr),%xmm4
	movdqu	16*1($rptr),%xmm5
	lea	16*2($rptr),$rptr
	movdqa	%xmm0,-16*2(%rbx)	# zero tp
	movdqa	%xmm0,-16*1(%rbx)
	movdqa	%xmm0,-16*2(%rbx,%rdx)
	movdqa	%xmm0,-16*1(%rbx,%rdx)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-16*2($rptr)
	movdqu	%xmm5,-16*1($rptr)
	add	\$32,$num
	jnz	.Lsqr8x_cond_copy

	mov	\$1,%rax
	mov	-48(%rsi),%r15

@@ -1108,64 +1161,75 @@ $code.=<<___;
	adc	$zero,%r15		# modulo-scheduled
	sub	0*8($tptr),$zero	# pull top-most carry
	adc	%r15,%r14
	mov	-8($nptr),$mi
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)

	cmp	16(%rsp),$bptr
	jne	.Lmulx4x_outer

	sub	%r14,$mi		# compare top-most words
	sbb	$mi,$mi
	or	$mi,%r15

	neg	$num
	xor	%rdx,%rdx
	lea	64(%rsp),$tptr
	sub	$num,$nptr		# rewind $nptr
	neg	%r15
	mov	$num,%rdx
	shr	\$3+2,$num		# %cf=0
	mov	32(%rsp),$rptr		# restore rp
	jmp	.Lmulx4x_sub

.align	32
.Lmulx4x_sub:
	mov	8*0($tptr),%r11
	mov	8*1($tptr),%r12
	mov	8*2($tptr),%r13
	mov	8*3($tptr),%r14
	lea	8*4($tptr),$tptr
	sbb	8*0($nptr),%r11
	sbb	8*1($nptr),%r12
	sbb	8*2($nptr),%r13
	sbb	8*3($nptr),%r14
	lea	8*4($nptr),$nptr
	mov	%r11,8*0($rptr)
	mov	%r12,8*1($rptr)
	mov	%r13,8*2($rptr)
	mov	%r14,8*3($rptr)
	lea	8*4($rptr),$rptr
	dec	$num			# preserves %cf
	jnz	.Lmulx4x_sub

	sbb	\$0,%r15		# top-most carry
	lea	64(%rsp),$tptr
	sub	%rdx,$rptr		# rewind

	movq	%r15,%xmm1
	pxor	%xmm0,%xmm0
	mov	0*8($nptr,$num),%r8
	mov	1*8($nptr,$num),%r9
	neg	%r8
	jmp	.Lmulx4x_sub_entry
	pshufd	\$0,%xmm1,%xmm1
	mov	40(%rsp),%rsi		# restore %rsp
	jmp	.Lmulx4x_cond_copy

.align	32
.Lmulx4x_sub:
	mov	0*8($nptr,$num),%r8
	mov	1*8($nptr,$num),%r9
	not	%r8
.Lmulx4x_sub_entry:
	mov	2*8($nptr,$num),%r10
	not	%r9
	and	%r15,%r8
	mov	3*8($nptr,$num),%r11
	not	%r10
	and	%r15,%r9
	not	%r11
	and	%r15,%r10
	and	%r15,%r11

	neg	%rdx			# mov %rdx,%cf
	adc	0*8($tptr),%r8
	adc	1*8($tptr),%r9
	movdqa	%xmm0,($tptr)
	adc	2*8($tptr),%r10
	adc	3*8($tptr),%r11
	movdqa	%xmm0,16($tptr)
	lea	4*8($tptr),$tptr
	sbb	%rdx,%rdx		# mov %cf,%rdx

	mov	%r8,0*8($rptr)
	mov	%r9,1*8($rptr)
	mov	%r10,2*8($rptr)
	mov	%r11,3*8($rptr)
	lea	4*8($rptr),$rptr

.Lmulx4x_cond_copy:
	movdqa	16*0($tptr),%xmm2
	movdqa	16*1($tptr),%xmm3
	lea	16*2($tptr),$tptr
	movdqu	16*0($rptr),%xmm4
	movdqu	16*1($rptr),%xmm5
	lea	16*2($rptr),$rptr
	movdqa	%xmm0,-16*2($tptr)	# zero tp
	movdqa	%xmm0,-16*1($tptr)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-16*2($rptr)
	movdqu	%xmm5,-16*1($rptr)
	sub	\$32,%rdx
	jnz	.Lmulx4x_cond_copy
	add	\$32,$num
	jnz	.Lmulx4x_sub

	mov	%rdx,($tptr)

	mov	40(%rsp),%rsi		# restore %rsp
	mov	\$1,%rax
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14

crypto/bn/asm/x86_64-mont5.pl  +152 −137

@@ -99,25 +99,18 @@ $code.=<<___;
.Lmul_enter:
	mov	${num}d,${num}d
	mov	%rsp,%rax
	movd	`($win64?56:8)`(%rsp),%xmm0	# load 7th argument
	lea	.Lmagic_masks(%rip),%r10
	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
	lea	.Linc(%rip),%r10
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0x38(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
___
$code.=<<___;
	lea	2($num),%r11
	neg	%r11
	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2))
	lea	-264(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2)+256+8)
	and	\$-1024,%rsp		# minimize TLB usage
	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp

@@ -128,64 +121,89 @@ ___
$STRIDE=2**5*8;		# 5 is "window size"
$N=$STRIDE/4;		# should match cache line size
$code.=<<___;
	################################################################
	# calculate mask: one of %xmm4..7 will contain 0xff..00 or
	# 0x00..ff denoting which half of a quarter of corresponding
	# cache line is significant.
	#
	movq	56(%r10),%xmm1		# 0b11001
	movq	%xmm0,%rdx
	pand	%xmm1,%xmm0
	movdqa	0(%r10),%xmm4
	pshufd	\$0,%xmm0,%xmm0		# broadcast masked index
	movdqa	16(%r10),%xmm5
	movdqa	32(%r10),%xmm6
	pcmpeqd	%xmm0,%xmm4
	movdqa	48(%r10),%xmm7
	pcmpeqd	%xmm0,%xmm5
	pcmpeqd	%xmm0,%xmm6
	pcmpeqd	%xmm0,%xmm7

	################################################################
	# calculate index in 1st cache line, but in such manner that
	# if target data is in another cache line, then relevant
	# "rotating" reference would land on it...
	#
	shr	\$1,%rdx		# idx/=2
	mov	%rdx,$j
	shr	\$2,%rdx
	sub	%rdx,$j
	and	\$3,$j			# (idx-idx/4)%4
	shl	\$4,$j			# scale for xmm references

	################################################################
	# "rotating" references are touching different cache banks in
	# different cache lines, so that not only all cache lines are
	# referred in each iteration, but even all cache banks.
	movdqa	0(%r10),%xmm0		# 00000001000000010000000000000000
	movdqa	16(%r10),%xmm1		# 00000002000000020000000200000002
	lea	24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization)
	and	\$-16,%r10
	pshufd	\$0,%xmm5,%xmm5		# broadcast index
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to index and save result to stack
#
	lea	16($j),$m0
	lea	32($j),$m1
	and	\$63,$m0
	lea	48($j),%rdx
	and	\$63,$m1
	and	\$63,%rdx
	movdqa	`0*$STRIDE/4-128`($bp,$j),%xmm0
	movdqa	`1*$STRIDE/4-128`($bp,$m0),%xmm1
	movdqa	`2*$STRIDE/4-128`($bp,$m1),%xmm2
	movdqa	`3*$STRIDE/4-128`($bp,%rdx),%xmm3
	pand	%xmm4,%xmm0
	pand	%xmm5,%xmm1
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0
	movq	$j,%xmm8
$code.=<<___;
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
	.byte	0x67
	movdqa	%xmm4,%xmm3
___
for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
	movdqa	%xmm0,`16*($k+0)+112`(%r10)
	movdqa	%xmm4,%xmm0
	pshufd	\$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0		# merge upper and lower halves

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
	movdqa	%xmm1,`16*($k+1)+112`(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
	movdqa	%xmm4,%xmm3
___
}
$code.=<<___;				# last iteration can be optimized
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,`16*($k+0)+112`(%r10)

	paddd	%xmm2,%xmm3
	.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,`16*($k+1)+112`(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	pand	`16*($k+0)-128`($bp),%xmm0	# while it's still in register
	pand	`16*($k+1)-128`($bp),%xmm1
	pand	`16*($k+2)-128`($bp),%xmm2
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
	pand	`16*($k+3)-128`($bp),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
	movdqa	`16*($k+0)-128`($bp),%xmm4
	movdqa	`16*($k+1)-128`($bp),%xmm5
	movdqa	`16*($k+2)-128`($bp),%xmm2
	pand	`16*($k+0)+112`(%r10),%xmm4
	movdqa	`16*($k+3)-128`($bp),%xmm3
	pand	`16*($k+1)+112`(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	`16*($k+2)+112`(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	`16*($k+3)+112`(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
}
$code.=<<___;
	por	%xmm1,%xmm0
	pshufd	\$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	lea	$STRIDE($bp),$bp
	movq	%xmm0,$m0		# m0=bp[0]

	mov	($n0),$n0		# pull n0[0] value

@@ -232,15 +250,14 @@ $code.=<<___;
	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st
	movq	%xmm8,$j
	jne	.L1st
					# note that upon exit $j==$num, so
					# they can be used interchangeably
	add	%rax,$hi1
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$num,8)	# tp[j-1]
	mov	$hi1,-16(%rsp,$num,8)	# tp[num-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

@@ -254,27 +271,32 @@ $code.=<<___;
	jmp	.Louter
.align	16
.Louter:
	lea	16($j),$m0
	lea	32($j),$m1
	and	\$63,$m0
	lea	48($j),%rdx
	and	\$63,$m1
	and	\$63,%rdx
	movdqa	`0*$STRIDE/4-128`($bp,$j),%xmm0
	movdqa	`1*$STRIDE/4-128`($bp,$m0),%xmm1
	movdqa	`2*$STRIDE/4-128`($bp,$m1),%xmm2
	movdqa	`3*$STRIDE/4-128`($bp,%rdx),%xmm3
	pand	%xmm4,%xmm0
	pand	%xmm5,%xmm1
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	lea	24+128(%rsp,$num,8),%rdx	# where 256-byte mask is (+size optimization)
	and	\$-16,%rdx
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
for($k=0;$k<$STRIDE/16;$k+=4) {
$code.=<<___;
	movdqa	`16*($k+0)-128`($bp),%xmm0
	movdqa	`16*($k+1)-128`($bp),%xmm1
	movdqa	`16*($k+2)-128`($bp),%xmm2
	movdqa	`16*($k+3)-128`($bp),%xmm3
	pand	`16*($k+0)-128`(%rdx),%xmm0
	pand	`16*($k+1)-128`(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	`16*($k+2)-128`(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	`16*($k+3)-128`(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
___
}
$code.=<<___;
	por	%xmm5,%xmm4
	pshufd	\$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0
	pshufd	\$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0		# merge upper and lower halves

	mov	($ap),%rax		# ap[0]
	movq	%xmm0,$m0		# m0=bp[i]

@@ -324,16 +346,14 @@ $code.=<<___;
	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner
	movq	%xmm8,$j
	jne	.Linner
					# note that upon exit $j==$num, so
					# they can be used interchangeably
	add	%rax,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$num,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$num,8)	# tp[j-1]
	mov	$hi1,-16(%rsp,$num,8)	# tp[num-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx

@@ -380,13 +400,7 @@ $code.=<<___;
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
___
$code.=<<___ if ($win64);
	movaps	-104(%rsi),%xmm6
	movaps	-88(%rsi),%xmm7
	movaps	-72(%rsi),%xmm8
___
$code.=<<___;
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13

@@ -1065,10 +1079,15 @@ $code.=<<___;
	movq	$bptr,%xmm4

	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal

	movq	%xmm2,$nptr
	movq	%xmm4,$bptr

@@ -1629,7 +1648,7 @@ my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");

$code.=<<___;
	movq	%xmm2,$nptr
sqr8x_reduction:
__bn_sqr8x_reduction:
	xor	%rax,%rax
	lea	($nptr,$num),%rcx	# end of n[]
	lea	48+8(%rsp,$num,2),%rdx	# end of t[] buffer

@@ -1888,6 +1907,8 @@ sqr8x_reduction:

	cmp	%rdx,$tptr		# end of t[]?
	jb	.L8x_reduction_loop
	ret
.size	bn_sqr8x_internal,.-bn_sqr8x_internal
___
}
##############################################################

@@ -1896,13 +1917,12 @@ ___
{
my ($tptr,$nptr)=("%rbx","%rbp");
$code.=<<___;
	#xor	%rsi,%rsi		# %rsi was $carry above
.type	__bn_post4x_internal,\@abi-omnipotent
.align	32
__bn_post4x_internal:
	mov	8*0($nptr),%r12
	sub	%r15,%rcx		# compare top-most words
	lea	(%rdi,$num),$tptr	# %rdi was $tptr above
	adc	%rsi,%rsi
	mov	$num,%rcx
	or	%rsi,%rax
	movq	%xmm1,$rptr		# restore $rptr
	neg	%rax
	movq	%xmm1,$aptr		# prepare for back-to-back call

@@ -1946,14 +1966,13 @@ $code.=<<___;
	inc	%rcx			# pass %cf
	jnz	.Lsqr4x_sub
___
}
$code.=<<___;
	mov	$num,%r10		# prepare for back-to-back call
	neg	$num			# restore $num
	ret
.size	bn_sqr8x_internal,.-bn_sqr8x_internal
.size	__bn_post4x_internal,.-__bn_post4x_internal
___
}
{
$code.=<<___;
.globl	bn_from_montgomery

@@ -2061,7 +2080,8 @@ $code.=<<___ if ($addx);
	jne	.Lfrom_mont_nox

	lea	(%rax,$num),$rptr
	call	sqrx8x_reduction
	call	__bn_sqrx8x_reduction
	call	__bn_postx4x_internal

	pxor	%xmm0,%xmm0
	lea	48(%rsp),%rax

@@ -2072,7 +2092,8 @@ $code.=<<___ if ($addx);
.Lfrom_mont_nox:
___
$code.=<<___;
	call	sqr8x_reduction
	call	__bn_sqr8x_reduction
	call	__bn_post4x_internal

	pxor	%xmm0,%xmm0
	lea	48(%rsp),%rax

@@ -2622,10 +2643,15 @@ bn_powerx5:
.Lpowerx5_body:

	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal

	mov	%r10,$num		# -num
	mov	$aptr,$rptr

@@ -3071,7 +3097,7 @@ my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");

$code.=<<___;
	movq	%xmm2,$nptr
sqrx8x_reduction:
__bn_sqrx8x_reduction:
	xor	%eax,%eax		# initial top-most carry bit
	mov	32+8(%rsp),%rbx		# n0
	mov	48+8(%rsp),%rdx		# "%r8", 8*0($tptr)

@@ -3279,6 +3305,8 @@ sqrx8x_reduction:

	lea	8*8($tptr,%rcx),$tptr	# start of current t[] window
	cmp	8+8(%rsp),%r8		# end of t[]?
	jb	.Lsqrx8x_reduction_loop
	ret
.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
___
}
##############################################################
#
{
my ($rptr,$nptr)=("%rdx","%rbp");
my @ri=map("%r$_",(10..13));
my @ni=map("%r$_",(14..15));
$code.=<<___;
.align	32
__bn_postx4x_internal:
	mov	8*0($nptr),%r12
	xor	%ebx,%ebx
	sub	%r15,%rsi		# compare top-most words
	adc	%rbx,%rbx
	mov	%rcx,%r10		# -$num
	or	%rbx,%rax
	mov	%rcx,%r9		# -$num
	neg	%rax
	sar	\$3+2,%rcx

@@ -3308,6 +3332,7 @@ $code.=<<___;
	mov	8*3($nptr),%r15
	jmp	.Lsqrx4x_sub_entry

.align	16
.Lsqrx4x_sub:
	mov	8*0($nptr),%r12
	mov	8*1($nptr),%r13

@@ -3335,14 +3360,13 @@ $code.=<<___;
	inc	%rcx
	jnz	.Lsqrx4x_sub
___
}
$code.=<<___;
	neg	%r9			# restore $num
	ret
.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
.size	__bn_postx4x_internal,.-__bn_postx4x_internal
___
}
}}}
{
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") :	# Win64 order

@@ -3483,9 +3507,6 @@ ___
}
$code.=<<___;
.align	64
.Lmagic_masks:
	.long	0x00,0x00,0x01,0x01, 0x08,0x08,0x09,0x09
	.long	0x10,0x10,0x11,0x11, 0x18,0x18,0x19,0x19
.Linc:
	.long	0,0, 1,1
	.long	2,2, 2,2

@@ -3541,13 +3562,6 @@ mul_handler:
	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
	movaps	-104(%rax),%xmm0
	movaps	-88(%rax),%xmm1
	movaps	-72(%rax),%xmm2
	movups	%xmm0,512($context)	# restore context->Xmm6
	movups	%xmm1,528($context)	# restore context->Xmm7
	movups	%xmm2,544($context)	# restore context->Xmm8

	jmp	.Lbody_proceed
.Lbody_40:

@@ -3675,8 +3689,9 @@ ___
$code.=<<___;
.align	8
.LSEH_info_bn_gather5:
	.byte	0x01,0x0b,0x02,0x00
	.byte	0x01,0x0b,0x03,0x0a
	.byte	0x0b,0x01,0x21,0x00	# sub	rsp,0x108
	.byte	0x04,0xa3,0x00,0x00	# lea	r10,(rsp)
.align	8
___
}
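
Reviewer note (not part of the patch): the new gather path drops the cache-line-granular .Lmagic_masks scheme and instead compares every possible window index 0..31 against the secret index, stores the resulting all-ones/all-zeros masks on the stack, and folds all 32 table rows together under those masks; similarly, the new .Lsqr8x_cond_copy/.Lmulx4x_cond_copy and __bn_post(x)4x paths replace the data-dependent final copy with a masked select driven by the top-most borrow. Below is a minimal scalar C model of both ideas under stated assumptions: WINDOW, gather_ct, and cond_copy_ct are illustrative names only, the real code works on 256-byte table rows with SSE2 pcmpeqd/pand/por, and plain C like this carries no constant-time guarantee from a compiler.

    #include <stdint.h>
    #include <stddef.h>

    #define WINDOW 32   /* 2^5 entries, matching $STRIDE = 2**5*8 */

    /* Fetch table[idx] while touching every entry: each entry is ANDed
     * with an all-ones mask iff its position equals idx, then ORed into
     * the accumulator, so the access pattern is independent of idx. */
    static uint64_t gather_ct(const uint64_t table[WINDOW], size_t idx)
    {
        uint64_t acc = 0;
        for (size_t i = 0; i < WINDOW; i++) {
            uint64_t mask = (uint64_t)0 - (uint64_t)(i == idx); /* pcmpeqd */
            acc |= table[i] & mask;                             /* pand + por */
        }
        return acc;
    }

    /* Branch-free final reduction: rp[] holds t - n, tp[] still holds t,
     * and borrow is all-ones iff t < n (the top-most carry after sbb).
     * Keep t when the subtraction underflowed, t - n otherwise, and zero
     * the t[] scratch either way (the movdqa %xmm0 stores above). */
    static void cond_copy_ct(uint64_t *rp, uint64_t *tp, size_t num,
                             uint64_t borrow)
    {
        for (size_t i = 0; i < num; i++) {
            rp[i] = (tp[i] & borrow) | (rp[i] & ~borrow);
            tp[i] = 0;
        }
    }

The assembly gets the same effect four lanes at a time: pshufd \$0,%xmm1,%xmm1 broadcasts the index or borrow into every dword of an xmm register so the pcmpeqd/pand/por select operates on 16 bytes per step.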