    #!/usr/bin/env perl
    
    # ====================================================================
    # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
    # project. The module is, however, dual licensed under OpenSSL and
    # CRYPTOGAMS licenses depending on where you obtain it. For further
    # details see http://www.openssl.org/~appro/cryptogams/.
    # ====================================================================
    
    # August 2011.
    #
    # Companion to x86_64-mont.pl that optimizes cache-timing attack
    # countermeasures. The subroutines are produced by replacing bp[i]
    # references in their x86_64-mont.pl counterparts with cache-neutral
    # references to the powers table computed in BN_mod_exp_mont_consttime.
    # In addition, a subroutine that scatters elements of the powers
    # table is implemented, so that the scatter/gather layout can be
    # tuned without modifying bn_exp.c.
    
    $flavour = shift;
    $output  = shift;
    if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
    
    $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
    
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
    die "can't locate x86_64-xlate.pl";
    
    open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
    
    # int bn_mul_mont_gather5(
    $rp="%rdi";	# BN_ULONG *rp,
    $ap="%rsi";	# const BN_ULONG *ap,
    $bp="%rdx";	# const BN_ULONG *bp,
    $np="%rcx";	# const BN_ULONG *np,
    $n0="%r8";	# const BN_ULONG *n0,
    $num="%r9";	# int num,
    		# int idx);	# 0 to 2^5-1, "index" in $bp holding
    				# pre-computed powers of a', interlaced
    				# in such a manner that b[0] is $bp[idx],
    				# b[1] is $bp[2^5+idx], etc.
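    				# E.g., word j of power idx lives at
    				# $bp[j*2^5+idx], so a gather pass for any
    				# idx walks the table with one fixed stride
    				# and touches the same cache lines
    				# regardless of the secret index.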
    $lo0="%r10";	# scratch for the ap[j]*bp[i] chain
    $hi0="%r11";	# carry of the ap[j]*bp[i] chain
    $hi1="%r13";	# carry of the np[j]*m1 chain
    $i="%r14";	# outer loop counter
    $j="%r15";	# inner loop counter
    $m0="%rbx";	# bp[i], the gathered multiplier word
    $m1="%rbp";	# Montgomery factor, "tp[0]"*n0 mod 2^64
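
    # For orientation, each outer iteration below is the classic
    # word-serial Montgomery step (a sketch; the real code splits the
    # carry over the two chains above and interleaves the next gather):
    #
    #	m0 = bp[i];			/* constant-time gather */
    #	m1 = (tp[0] + ap[0]*m0)*n0;	/* mod 2^64 */
    #	for (j = 0; j < num; j++)	/* tp[-1]'s word is discarded */
    #		(carry,tp[j-1]) = tp[j] + ap[j]*m0 + np[j]*m1 + carry;
    #	(tp[num],tp[num-1]) = carry;	/* carry spans two words */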
    
    $code=<<___;
    .text
    
    .globl	bn_mul_mont_gather5
    .type	bn_mul_mont_gather5,\@function,6
    .align	64
    bn_mul_mont_gather5:
    	test	\$3,${num}d
    	jnz	.Lmul_enter
    	cmp	\$8,${num}d
    	jb	.Lmul_enter
    	jmp	.Lmul4x_enter
    
    .align	16
    .Lmul_enter:
    
    	mov	${num}d,${num}d
    
    	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
    	push	%rbx
    	push	%rbp
    	push	%r12
    	push	%r13
    	push	%r14
    	push	%r15
    
    ___
    $code.=<<___ if ($win64);
    	lea	-0x28(%rsp),%rsp
    	movaps	%xmm6,(%rsp)
    	movaps	%xmm7,0x10(%rsp)
    .Lmul_alloca:
    ___
    $code.=<<___;
    
    	mov	%rsp,%rax		# capture %rsp for the saved-frame slot
    	lea	2($num),%r11
    
    	neg	%r11
    	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2))
    	and	\$-1024,%rsp		# minimize TLB usage
    
    	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
    .Lmul_body:
    	mov	$bp,%r12		# reassign $bp
    ___
    		$bp="%r12";
    		$STRIDE=2**5*8;		# 5 is "window size"
    		$N=$STRIDE/4;		# should match cache line size
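		# With a 64-byte cache line ($N==64) the 5-bit index in
		# %r10d splits into a qword-within-line part (low 3 bits,
		# kept in %r11) and a line-within-group part (high 2 bits).
		# .Lmagic_masks is indexed so that exactly one of
		# %xmm4-%xmm7 comes out all-ones; every gather then reads
		# all four cache lines and masks off all but the wanted
		# qword, roughly:
		#
		#	r = 0;
		#	for (line = 0; line < 4; line++)   /* touch each line */
		#		r |= bp[line*8 + word] & mask[line];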
    $code.=<<___;
    	mov	%r10,%r11
    	shr	\$`log($N/8)/log(2)`,%r10
    	and	\$`$N/8-1`,%r11
    	not	%r10
    	lea	.Lmagic_masks(%rip),%rax
    	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
    	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
    	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
    	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
    	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
    	movq	24(%rax,%r10,8),%xmm7
    
    	movq	`0*$STRIDE/4-96`($bp),%xmm0
    	movq	`1*$STRIDE/4-96`($bp),%xmm1
    	pand	%xmm4,%xmm0
    	movq	`2*$STRIDE/4-96`($bp),%xmm2
    	pand	%xmm5,%xmm1
    	movq	`3*$STRIDE/4-96`($bp),%xmm3
    	pand	%xmm6,%xmm2
    	por	%xmm1,%xmm0
    	pand	%xmm7,%xmm3
    	por	%xmm2,%xmm0
    	lea	$STRIDE($bp),$bp
    	por	%xmm3,%xmm0
    
    	movq	%xmm0,$m0		# m0=bp[0]
    
    	mov	($n0),$n0		# pull n0[0] value
    	mov	($ap),%rax
    
    	xor	$i,$i			# i=0
    	xor	$j,$j			# j=0
    
    	movq	`0*$STRIDE/4-96`($bp),%xmm0
    	movq	`1*$STRIDE/4-96`($bp),%xmm1
    	pand	%xmm4,%xmm0
    	movq	`2*$STRIDE/4-96`($bp),%xmm2
    	pand	%xmm5,%xmm1
    
    	mov	$n0,$m1
    	mulq	$m0			# ap[0]*bp[0]
    	mov	%rax,$lo0
    	mov	($np),%rax
    
    	movq	`3*$STRIDE/4-96`($bp),%xmm3
    	pand	%xmm6,%xmm2
    	por	%xmm1,%xmm0
    	pand	%xmm7,%xmm3
    
    	imulq	$lo0,$m1		# "tp[0]"*n0
    	mov	%rdx,$hi0
    
    	por	%xmm2,%xmm0
    	lea	$STRIDE($bp),$bp
    	por	%xmm3,%xmm0
    
    	mulq	$m1			# np[0]*m1
    	add	%rax,$lo0		# discarded
    	mov	8($ap),%rax
    	adc	\$0,%rdx
    	mov	%rdx,$hi1
    
    	lea	1($j),$j		# j++
    	jmp	.L1st_enter
    
    .align	16
    .L1st:
    	add	%rax,$hi1
    	mov	($ap,$j,8),%rax
    	adc	\$0,%rdx
    	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
    	mov	$lo0,$hi0
    	adc	\$0,%rdx
    	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
    	mov	%rdx,$hi1
    
    .L1st_enter:
    	mulq	$m0			# ap[j]*bp[0]
    	add	%rax,$hi0
    	mov	($np,$j,8),%rax
    	adc	\$0,%rdx
    	lea	1($j),$j		# j++
    	mov	%rdx,$lo0
    
    	mulq	$m1			# np[j]*m1
    	cmp	$num,$j
    	jne	.L1st
    
    	movq	%xmm0,$m0		# bp[1]
    
    	add	%rax,$hi1
    	mov	($ap),%rax		# ap[0]
    	adc	\$0,%rdx
    	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
    	adc	\$0,%rdx
    	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
    	mov	%rdx,$hi1
    	mov	$lo0,$hi0
    
    	xor	%rdx,%rdx
    	add	$hi0,$hi1
    	adc	\$0,%rdx
    	mov	$hi1,-8(%rsp,$num,8)
    	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
    
    	lea	1($i),$i		# i++
    	jmp	.Louter
    .align	16
    .Louter:
    	xor	$j,$j			# j=0
    	mov	$n0,$m1
    	mov	(%rsp),$lo0
    
    	movq	`0*$STRIDE/4-96`($bp),%xmm0
    	movq	`1*$STRIDE/4-96`($bp),%xmm1
    	pand	%xmm4,%xmm0
    	movq	`2*$STRIDE/4-96`($bp),%xmm2
    	pand	%xmm5,%xmm1
    
    	mulq	$m0			# ap[0]*bp[i]
    	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
    	mov	($np),%rax
    	adc	\$0,%rdx
    
    	movq	`3*$STRIDE/4-96`($bp),%xmm3
    	pand	%xmm6,%xmm2
    	por	%xmm1,%xmm0
    	pand	%xmm7,%xmm3
    
    	imulq	$lo0,$m1		# tp[0]*n0
    	mov	%rdx,$hi0
    
    	por	%xmm2,%xmm0
    	lea	$STRIDE($bp),$bp
    	por	%xmm3,%xmm0
    
    	mulq	$m1			# np[0]*m1
    	add	%rax,$lo0		# discarded
    	mov	8($ap),%rax
    	adc	\$0,%rdx
    	mov	8(%rsp),$lo0		# tp[1]
    	mov	%rdx,$hi1
    
    	lea	1($j),$j		# j++
    	jmp	.Linner_enter
    
    .align	16
    .Linner:
    	add	%rax,$hi1
    	mov	($ap,$j,8),%rax
    	adc	\$0,%rdx
    	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
    	mov	(%rsp,$j,8),$lo0
    	adc	\$0,%rdx
    	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
    	mov	%rdx,$hi1
    
    .Linner_enter:
    	mulq	$m0			# ap[j]*bp[i]
    	add	%rax,$hi0
    	mov	($np,$j,8),%rax
    	adc	\$0,%rdx
    	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
    	mov	%rdx,$hi0
    	adc	\$0,$hi0
    	lea	1($j),$j		# j++
    
    	mulq	$m1			# np[j]*m1
    	cmp	$num,$j
    	jne	.Linner
    
    	movq	%xmm0,$m0		# bp[i+1]
    
    	add	%rax,$hi1
    	mov	($ap),%rax		# ap[0]
    	adc	\$0,%rdx
    	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
    	mov	(%rsp,$j,8),$lo0
    	adc	\$0,%rdx
    	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
    	mov	%rdx,$hi1
    
    	xor	%rdx,%rdx
    	add	$hi0,$hi1
    	adc	\$0,%rdx
    	add	$lo0,$hi1		# pull upmost overflow bit
    	adc	\$0,%rdx
    	mov	$hi1,-8(%rsp,$num,8)
    	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
    
    	lea	1($i),$i		# i++
    	cmp	$num,$i
    	jl	.Louter
    
    	xor	$i,$i			# i=0 and clear CF!
    	mov	(%rsp),%rax		# tp[0]
    	lea	(%rsp),$ap		# borrow ap for tp
    	mov	$num,$j			# j=num
    	jmp	.Lsub
    .align	16
    .Lsub:	sbb	($np,$i,8),%rax
    	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
    	mov	8($ap,$i,8),%rax	# tp[i+1]
    	lea	1($i),$i		# i++
    	dec	$j			# doesn't affect CF!
    	jnz	.Lsub
    
    	sbb	\$0,%rax		# handle upmost overflow bit
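    	# the borrow left %rax all-zeros or all-ones; the and/not/and/or
    	# sequence below uses it as a mask to pick the copy source
    	# branchlessly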
    	xor	$i,$i
    	and	%rax,$ap
    	not	%rax
    	mov	$rp,$np
    	and	%rax,$np
    	mov	$num,$j			# j=num
    	or	$np,$ap			# ap=borrow?tp:rp
    .align	16
    .Lcopy:					# copy or in-place refresh
    	mov	($ap,$i,8),%rax
    	mov	$i,(%rsp,$i,8)		# zap temporary vector
    	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
    	lea	1($i),$i
    	sub	\$1,$j
    	jnz	.Lcopy
    
    	mov	8(%rsp,$num,8),%rsi	# restore %rsp
    	mov	\$1,%rax
    
    ___
    $code.=<<___ if ($win64);
    	movaps	(%rsi),%xmm6
    	movaps	0x10(%rsi),%xmm7
    	lea	0x28(%rsi),%rsi
    ___
    $code.=<<___;
    
    	mov	(%rsi),%r15
    	mov	8(%rsi),%r14
    	mov	16(%rsi),%r13
    	mov	24(%rsi),%r12
    	mov	32(%rsi),%rbp
    	mov	40(%rsi),%rbx
    	lea	48(%rsi),%rsp
    .Lmul_epilogue:
    	ret
    .size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
    ___
    {{{
    my @A=("%r10","%r11");
    my @N=("%r13","%rdi");
    $code.=<<___;
    .type	bn_mul4x_mont_gather5,\@function,6
    .align	16
    bn_mul4x_mont_gather5:
    .Lmul4x_enter:
    
    	mov	${num}d,${num}d
    
    	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
    	push	%rbx
    	push	%rbp
    	push	%r12
    	push	%r13
    	push	%r14
    	push	%r15
    
    ___
    $code.=<<___ if ($win64);
    	lea	-0x28(%rsp),%rsp
    	movaps	%xmm6,(%rsp)
    	movaps	%xmm7,0x10(%rsp)
    .Lmul4x_alloca:
    ___
    $code.=<<___;
    	mov	%rsp,%rax
    
    	lea	4($num),%r11
    	neg	%r11
    	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+4))
    	and	\$-1024,%rsp		# minimize TLB usage
    
    	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
    .Lmul4x_body:
    	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
    	mov	%rdx,%r12		# reassign $bp
    ___
    		$bp="%r12";
    		$STRIDE=2**5*8;		# 5 is "window size"
    		$N=$STRIDE/4;		# should match cache line size
    $code.=<<___;
    	mov	%r10,%r11
    	shr	\$`log($N/8)/log(2)`,%r10
    	and	\$`$N/8-1`,%r11
    	not	%r10
    	lea	.Lmagic_masks(%rip),%rax
    	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
    	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
    	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
    	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
    	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
    	movq	24(%rax,%r10,8),%xmm7
    
    	movq	`0*$STRIDE/4-96`($bp),%xmm0
    	movq	`1*$STRIDE/4-96`($bp),%xmm1
    	pand	%xmm4,%xmm0
    	movq	`2*$STRIDE/4-96`($bp),%xmm2
    	pand	%xmm5,%xmm1
    	movq	`3*$STRIDE/4-96`($bp),%xmm3
    	pand	%xmm6,%xmm2
    	por	%xmm1,%xmm0
    	pand	%xmm7,%xmm3
    	por	%xmm2,%xmm0
    	lea	$STRIDE($bp),$bp
    	por	%xmm3,%xmm0
    
    	movq	%xmm0,$m0		# m0=bp[0]
    	mov	($n0),$n0		# pull n0[0] value
    	mov	($ap),%rax
    
    	xor	$i,$i			# i=0
    	xor	$j,$j			# j=0
    
    	movq	`0*$STRIDE/4-96`($bp),%xmm0
    	movq	`1*$STRIDE/4-96`($bp),%xmm1
    	pand	%xmm4,%xmm0
    	movq	`2*$STRIDE/4-96`($bp),%xmm2
    	pand	%xmm5,%xmm1
    
    	mov	$n0,$m1
    	mulq	$m0			# ap[0]*bp[0]
    	mov	%rax,$A[0]
    	mov	($np),%rax
    
    	movq	`3*$STRIDE/4-96`($bp),%xmm3
    	pand	%xmm6,%xmm2
    	por	%xmm1,%xmm0
    	pand	%xmm7,%xmm3
    
    	imulq	$A[0],$m1		# "tp[0]"*n0
    	mov	%rdx,$A[1]
    
    	por	%xmm2,%xmm0
    	lea	$STRIDE($bp),$bp
    	por	%xmm3,%xmm0
    
    	mulq	$m1			# np[0]*m1
    	add	%rax,$A[0]		# discarded
    	mov	8($ap),%rax
    	adc	\$0,%rdx
    	mov	%rdx,$N[1]
    
    	mulq	$m0
    	add	%rax,$A[1]
    	mov	8($np),%rax
    	adc	\$0,%rdx
    	mov	%rdx,$A[0]
    
    	mulq	$m1
    	add	%rax,$N[1]
    	mov	16($ap),%rax
    	adc	\$0,%rdx
    	add	$A[1],$N[1]
    	lea	4($j),$j		# j+=4
    	adc	\$0,%rdx
    	mov	$N[1],(%rsp)
    	mov	%rdx,$N[0]
    	jmp	.L1st4x
    .align	16
    .L1st4x:
    	mulq	$m0			# ap[j]*bp[0]
    	add	%rax,$A[0]
    	mov	-16($np,$j,8),%rax
    	adc	\$0,%rdx
    	mov	%rdx,$A[1]
    
    	mulq	$m1			# np[j]*m1
    	add	%rax,$N[0]
    	mov	-8($ap,$j,8),%rax
    	adc	\$0,%rdx
    	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
    	adc	\$0,%rdx
    	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
    	mov	%rdx,$N[1]
    
    	mulq	$m0			# ap[j]*bp[0]
    	add	%rax,$A[1]
    	mov	-8($np,$j,8),%rax
    	adc	\$0,%rdx
    	mov	%rdx,$A[0]
    
    	mulq	$m1			# np[j]*m1
    	add	%rax,$N[1]
    	mov	($ap,$j,8),%rax
    	adc	\$0,%rdx
    	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
    	adc	\$0,%rdx
    	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
    	mov	%rdx,$N[0]
    
    	mulq	$m0			# ap[j]*bp[0]
    	add	%rax,$A[0]
    	mov	($np,$j,8),%rax
    	adc	\$0,%rdx
    	mov	%rdx,$A[1]
    
    	mulq	$m1			# np[j]*m1
    	add	%rax,$N[0]
    	mov	8($ap,$j,8),%rax
    	adc	\$0,%rdx
    	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
    	adc	\$0,%rdx
    	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
    	mov	%rdx,$N[1]
    
    	mulq	$m0			# ap[j]*bp[0]
    	add	%rax,$A[1]
    	mov	8($np,$j,8),%rax
    	adc	\$0,%rdx
    	lea	4($j),$j		# j+=4
    	mov	%rdx,$A[0]
    
    	mulq	$m1			# np[j]*m1
    	add	%rax,$N[1]
    	mov	-16($ap,$j,8),%rax
    	adc	\$0,%rdx
    	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
    	adc	\$0,%rdx
    	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
    	mov	%rdx,$N[0]
    	cmp	$num,$j
    	jl	.L1st4x
    
    	mulq	$m0			# ap[j]*bp[0]
    	add	%rax,$A[0]
    	mov	-16($np,$j,8),%rax
    	adc	\$0,%rdx
    	mov	%rdx,$A[1]
    
    	mulq	$m1			# np[j]*m1
    	add	%rax,$N[0]
    	mov	-8($ap,$j,8),%rax
    	adc	\$0,%rdx
    	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
    	adc	\$0,%rdx
    	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
    	mov	%rdx,$N[1]
    
    	mulq	$m0			# ap[j]*bp[0]
    	add	%rax,$A[1]
    	mov	-8($np,$j,8),%rax
    	adc	\$0,%rdx
    	mov	%rdx,$A[0]
    
    	mulq	$m1			# np[j]*m1
    	add	%rax,$N[1]
    	mov	($ap),%rax		# ap[0]
    	adc	\$0,%rdx
    	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
    	adc	\$0,%rdx
    	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
    	mov	%rdx,$N[0]
    
    	movq	%xmm0,$m0		# bp[1]
    
    	xor	$N[1],$N[1]
    	add	$A[0],$N[0]
    	adc	\$0,$N[1]
    	mov	$N[0],-8(%rsp,$j,8)
    	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
    
    	lea	1($i),$i		# i++
    .align	4
    .Louter4x:
    	xor	$j,$j			# j=0
    	movq	`0*$STRIDE/4-96`($bp),%xmm0
    	movq	`1*$STRIDE/4-96`($bp),%xmm1
    	pand	%xmm4,%xmm0
    	movq	`2*$STRIDE/4-96`($bp),%xmm2
    	pand	%xmm5,%xmm1
    
    	mov	(%rsp),$A[0]
    	mov	$n0,$m1
    	mulq	$m0			# ap[0]*bp[i]
    	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
    	mov	($np),%rax
    	adc	\$0,%rdx
    
    	movq	`3*$STRIDE/4-96`($bp),%xmm3
    	pand	%xmm6,%xmm2
    	por	%xmm1,%xmm0
    	pand	%xmm7,%xmm3
    
    	imulq	$A[0],$m1		# tp[0]*n0
    	mov	%rdx,$A[1]
    
    	por	%xmm2,%xmm0
    	lea	$STRIDE($bp),$bp
    	por	%xmm3,%xmm0
    
    	mulq	$m1			# np[0]*m1
    	add	%rax,$A[0]		# "$N[0]", discarded
    	mov	8($ap),%rax
    	adc	\$0,%rdx
    	mov	%rdx,$N[1]
    
    	mulq	$m0			# ap[j]*bp[i]
    	add	%rax,$A[1]
    	mov	8($np),%rax
    	adc	\$0,%rdx
    	add	8(%rsp),$A[1]		# +tp[1]
    	adc	\$0,%rdx
    	mov	%rdx,$A[0]
    
    	mulq	$m1			# np[j]*m1
    	add	%rax,$N[1]
    	mov	16($ap),%rax
    	adc	\$0,%rdx
    	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
    	lea	4($j),$j		# j+=4
    	adc	\$0,%rdx
    	mov	$N[1],(%rsp)		# tp[j-1]
    	mov	%rdx,$N[0]
    	jmp	.Linner4x
    .align	16
    .Linner4x:
    	mulq	$m0			# ap[j]*bp[i]
    	add	%rax,$A[0]
    	mov	-16($np,$j,8),%rax
    	adc	\$0,%rdx
    	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
    	adc	\$0,%rdx
    	mov	%rdx,$A[1]
    
    	mulq	$m1			# np[j]*m1
    	add	%rax,$N[0]
    	mov	-8($ap,$j,8),%rax
    	adc	\$0,%rdx
    	add	$A[0],$N[0]
    	adc	\$0,%rdx
    	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
    	mov	%rdx,$N[1]
    
    	mulq	$m0			# ap[j]*bp[i]
    	add	%rax,$A[1]
    	mov	-8($np,$j,8),%rax
    	adc	\$0,%rdx
    	add	-8(%rsp,$j,8),$A[1]
    	adc	\$0,%rdx
    	mov	%rdx,$A[0]
    
    	mulq	$m1			# np[j]*m1
    	add	%rax,$N[1]
    	mov	($ap,$j,8),%rax
    	adc	\$0,%rdx
    	add	$A[1],$N[1]
    	adc	\$0,%rdx
    	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
    	mov	%rdx,$N[0]
    
    	mulq	$m0			# ap[j]*bp[i]
    	add	%rax,$A[0]
    	mov	($np,$j,8),%rax
    	adc	\$0,%rdx
    	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
    	adc	\$0,%rdx
    	mov	%rdx,$A[1]
    
    	mulq	$m1			# np[j]*m1
    	add	%rax,$N[0]
    	mov	8($ap,$j,8),%rax
    	adc	\$0,%rdx
    	add	$A[0],$N[0]
    	adc	\$0,%rdx
    	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
    	mov	%rdx,$N[1]
    
    	mulq	$m0			# ap[j]*bp[i]
    	add	%rax,$A[1]
    	mov	8($np,$j,8),%rax
    	adc	\$0,%rdx
    	add	8(%rsp,$j,8),$A[1]
    	adc	\$0,%rdx
    	lea	4($j),$j		# j+=4
    	mov	%rdx,$A[0]
    
    	mulq	$m1			# np[j]*m1
    	add	%rax,$N[1]
    	mov	-16($ap,$j,8),%rax
    	adc	\$0,%rdx
    	add	$A[1],$N[1]
    	adc	\$0,%rdx
    	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
    	mov	%rdx,$N[0]
    	cmp	$num,$j
    	jl	.Linner4x
    
    	mulq	$m0			# ap[j]*bp[i]
    	add	%rax,$A[0]
    	mov	-16($np,$j,8),%rax
    	adc	\$0,%rdx
    	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
    	adc	\$0,%rdx
    	mov	%rdx,$A[1]
    
    	mulq	$m1			# np[j]*m1
    	add	%rax,$N[0]
    	mov	-8($ap,$j,8),%rax
    	adc	\$0,%rdx
    	add	$A[0],$N[0]
    	adc	\$0,%rdx
    	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
    	mov	%rdx,$N[1]
    
    	mulq	$m0			# ap[j]*bp[i]
    	add	%rax,$A[1]
    	mov	-8($np,$j,8),%rax
    	adc	\$0,%rdx
    	add	-8(%rsp,$j,8),$A[1]
    	adc	\$0,%rdx
    	lea	1($i),$i		# i++
    	mov	%rdx,$A[0]
    
    	mulq	$m1			# np[j]*m1
    	add	%rax,$N[1]
    	mov	($ap),%rax		# ap[0]
    	adc	\$0,%rdx
    	add	$A[1],$N[1]
    	adc	\$0,%rdx
    	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
    	mov	%rdx,$N[0]
    
    	movq	%xmm0,$m0		# bp[i+1]
    
    	xor	$N[1],$N[1]
    	add	$A[0],$N[0]
    	adc	\$0,$N[1]
    	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
    	adc	\$0,$N[1]
    	mov	$N[0],-8(%rsp,$j,8)
    	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
    
    	cmp	$num,$i
    	jl	.Louter4x
    ___
    {
    my @ri=("%rax","%rdx",$m0,$m1);
    $code.=<<___;
    	mov	16(%rsp,$num,8),$rp	# restore $rp
    	mov	0(%rsp),@ri[0]		# tp[0]
    	pxor	%xmm0,%xmm0
    	mov	8(%rsp),@ri[1]		# tp[1]
    	shr	\$2,$num		# num/=4
    	lea	(%rsp),$ap		# borrow ap for tp
    	xor	$i,$i			# i=0 and clear CF!
    
    	sub	0($np),@ri[0]
    	mov	16($ap),@ri[2]		# tp[2]
    	mov	24($ap),@ri[3]		# tp[3]
    	sbb	8($np),@ri[1]
    	lea	-1($num),$j		# j=num/4-1
    	jmp	.Lsub4x
    .align	16
    .Lsub4x:
    	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
    	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
    	sbb	16($np,$i,8),@ri[2]
    	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
    	mov	40($ap,$i,8),@ri[1]
    	sbb	24($np,$i,8),@ri[3]
    	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
    	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
    	sbb	32($np,$i,8),@ri[0]
    	mov	48($ap,$i,8),@ri[2]
    	mov	56($ap,$i,8),@ri[3]
    	sbb	40($np,$i,8),@ri[1]
    	lea	4($i),$i		# i+=4
    	dec	$j			# doesn't affect CF!
    	jnz	.Lsub4x
    
    	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
    	mov	32($ap,$i,8),@ri[0]	# load overflow bit
    	sbb	16($np,$i,8),@ri[2]
    	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
    	sbb	24($np,$i,8),@ri[3]
    	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
    
    	sbb	\$0,@ri[0]		# handle upmost overflow bit
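    	# as in .Lsub above, the borrow in %rax becomes an all-ones/
    	# all-zeros mask that selects tp or rp branchlessly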
    	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
    	xor	$i,$i			# i=0
    	and	@ri[0],$ap
    	not	@ri[0]
    	mov	$rp,$np
    	and	@ri[0],$np
    	lea	-1($num),$j
    	or	$np,$ap			# ap=borrow?tp:rp
    
    	movdqu	($ap),%xmm1
    	movdqa	%xmm0,(%rsp)
    	movdqu	%xmm1,($rp)
    	jmp	.Lcopy4x
    .align	16
    .Lcopy4x:					# copy or in-place refresh
    	movdqu	16($ap,$i),%xmm2
    	movdqu	32($ap,$i),%xmm1
    	movdqa	%xmm0,16(%rsp,$i)
    	movdqu	%xmm2,16($rp,$i)
    	movdqa	%xmm0,32(%rsp,$i)
    	movdqu	%xmm1,32($rp,$i)
    	lea	32($i),$i
    	dec	$j
    	jnz	.Lcopy4x
    
    	shl	\$2,$num		# restore $num
    	movdqu	16($ap,$i),%xmm2
    	movdqa	%xmm0,16(%rsp,$i)
    	movdqu	%xmm2,16($rp,$i)
    ___
    }
    $code.=<<___;
    	mov	8(%rsp,$num,8),%rsi	# restore %rsp
    	mov	\$1,%rax
    
    ___
    $code.=<<___ if ($win64);
    	movaps	(%rsi),%xmm6
    	movaps	0x10(%rsi),%xmm7
    	lea	0x28(%rsi),%rsi
    ___
    $code.=<<___;
    
    	mov	(%rsi),%r15
    	mov	8(%rsi),%r14
    	mov	16(%rsi),%r13
    	mov	24(%rsi),%r12
    	mov	32(%rsi),%rbp
    	mov	40(%rsi),%rbx
    	lea	48(%rsi),%rsp
    .Lmul4x_epilogue:
    	ret
    .size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
    ___
    }}}
    
    {
    my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
    				("%rdi","%rsi","%rdx","%rcx"); # Unix order
    $code.=<<___;
    .globl	bn_scatter5
    .type	bn_scatter5,\@abi-omnipotent
    .align	16
    bn_scatter5:
    
    	cmp	\$0, $num
    	jz	.Lscatter_epilogue
    
    	lea	($tbl,$idx,8),$tbl
    .Lscatter:
    	mov	($inp),%rax
    	lea	8($inp),$inp
    	mov	%rax,($tbl)
    	lea	32*8($tbl),$tbl
    	sub	\$1,$num
    	jnz	.Lscatter
    
    	ret
    .size	bn_scatter5,.-bn_scatter5
    ___
    }
    $code.=<<___;
    .align	64
    .Lmagic_masks:				# eight qwords, exactly one all-ones;
    	.long	0,0, 0,0, 0,0, -1,-1	# any 4-qword window starting at qword
    	.long	0,0, 0,0, 0,0,  0,0	# 0..3 contains a single all-ones mask
    .asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
    ___
    
    
    # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
    #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
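    # The three .rva entries in HandlerData[] (.L*_alloca, .L*_body,
    # .L*_epilogue) tell the handler how far execution got: the full
    # register/xmm restore (using the original %rsp kept at tp[num+1])
    # is performed only when the faulting Rip lies between the body
    # and epilogue labels.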
    if ($win64) {
    $rec="%rcx";
    $frame="%rdx";
    $context="%r8";
    $disp="%r9";
    
    $code.=<<___;
    .extern	__imp_RtlVirtualUnwind
    .type	mul_handler,\@abi-omnipotent
    .align	16
    mul_handler:
    	push	%rsi
    	push	%rdi
    	push	%rbx
    	push	%rbp
    	push	%r12
    	push	%r13
    	push	%r14
    	push	%r15
    	pushfq
    	sub	\$64,%rsp
    
    	mov	120($context),%rax	# pull context->Rax
    	mov	248($context),%rbx	# pull context->Rip
    
    	mov	8($disp),%rsi		# disp->ImageBase
    	mov	56($disp),%r11		# disp->HandlerData
    
    	mov	0(%r11),%r10d		# HandlerData[0]
    	lea	(%rsi,%r10),%r10	# end of prologue label
    	cmp	%r10,%rbx		# context->Rip<end of prologue label
    	jb	.Lcommon_seh_tail
    
    	lea	`40+48`(%rax),%rax
    
    	mov	4(%r11),%r10d		# HandlerData[1]
    	lea	(%rsi,%r10),%r10	# end of alloca label
    	cmp	%r10,%rbx		# context->Rip<end of alloca label
    	jb	.Lcommon_seh_tail
    
    	mov	152($context),%rax	# pull context->Rsp
    
    	mov	8(%r11),%r10d		# HandlerData[2]
    	lea	(%rsi,%r10),%r10	# epilogue label
    	cmp	%r10,%rbx		# context->Rip>=epilogue label
    	jae	.Lcommon_seh_tail
    
    	mov	192($context),%r10	# pull $num
    	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
    
    	movaps	(%rax),%xmm0
    	movaps	16(%rax),%xmm1
    	lea	`40+48`(%rax),%rax
    
    	mov	-8(%rax),%rbx
    	mov	-16(%rax),%rbp
    	mov	-24(%rax),%r12
    	mov	-32(%rax),%r13
    	mov	-40(%rax),%r14
    	mov	-48(%rax),%r15
    	mov	%rbx,144($context)	# restore context->Rbx
    	mov	%rbp,160($context)	# restore context->Rbp
    	mov	%r12,216($context)	# restore context->R12
    	mov	%r13,224($context)	# restore context->R13
    	mov	%r14,232($context)	# restore context->R14
    	mov	%r15,240($context)	# restore context->R15
    	movups	%xmm0,512($context)	# restore context->Xmm6
    	movups	%xmm1,528($context)	# restore context->Xmm7
    
    .Lcommon_seh_tail:
    	mov	8(%rax),%rdi
    	mov	16(%rax),%rsi
    	mov	%rax,152($context)	# restore context->Rsp
    	mov	%rsi,168($context)	# restore context->Rsi
    	mov	%rdi,176($context)	# restore context->Rdi
    
    	mov	40($disp),%rdi		# disp->ContextRecord
    	mov	$context,%rsi		# context
    	mov	\$154,%ecx		# sizeof(CONTEXT)/8, qwords for rep movsq
    	.long	0xa548f3fc		# cld; rep movsq
    
    	mov	$disp,%rsi
    	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
    	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
    	mov	0(%rsi),%r8		# arg3, disp->ControlPc
    	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
    	mov	40(%rsi),%r10		# disp->ContextRecord
    	lea	56(%rsi),%r11		# &disp->HandlerData
    	lea	24(%rsi),%r12		# &disp->EstablisherFrame
    	mov	%r10,32(%rsp)		# arg5
    	mov	%r11,40(%rsp)		# arg6
    	mov	%r12,48(%rsp)		# arg7
    	mov	%rcx,56(%rsp)		# arg8, (NULL)
    	call	*__imp_RtlVirtualUnwind(%rip)
    
    	mov	\$1,%eax		# ExceptionContinueSearch
    	add	\$64,%rsp
    	popfq
    	pop	%r15
    	pop	%r14
    	pop	%r13
    	pop	%r12
    	pop	%rbp
    	pop	%rbx
    	pop	%rdi
    	pop	%rsi
    	ret
    .size	mul_handler,.-mul_handler
    
    .section	.pdata
    .align	4
    	.rva	.LSEH_begin_bn_mul_mont_gather5
    	.rva	.LSEH_end_bn_mul_mont_gather5
    	.rva	.LSEH_info_bn_mul_mont_gather5
    
    	.rva	.LSEH_begin_bn_mul4x_mont_gather5
    	.rva	.LSEH_end_bn_mul4x_mont_gather5
    	.rva	.LSEH_info_bn_mul4x_mont_gather5
    
    .section	.xdata
    .align	8
    .LSEH_info_bn_mul_mont_gather5:
    	.byte	9,0,0,0
    	.rva	mul_handler
    	.rva	.Lmul_alloca,.Lmul_body,.Lmul_epilogue		# HandlerData[]
    .align	8
    .LSEH_info_bn_mul4x_mont_gather5:
    	.byte	9,0,0,0
    	.rva	mul_handler
    	.rva	.Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
    .align	8
    ___
    }
    
    
    $code =~ s/\`([^\`]*)\`/eval($1)/gem;
    
    print $code;
    close STDOUT;