ia64.S 44.3 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000
.explicit
.text
.ident	"ia64.S, Version 2.1"
.ident	"IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"

//
// ====================================================================
// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
// project.
//
// Rights for redistribution and usage in source and binary forms are
// granted according to the OpenSSL license. Warranty of any kind is
// disclaimed.
// ====================================================================
//
// Version 2.x is Itanium2 re-tune. Few words about how Itanum2 is
// different from Itanium to this module viewpoint. Most notably, is it
// "wider" than Itanium? Can you experience loop scalability as
// discussed in commentary sections? Not really:-( Itanium2 has 6
// integer ALU ports, i.e. it's 2 ports wider, but it's not enough to
// spin twice as fast, as I need 8 IALU ports. Amount of floating point
// ports is the same, i.e. 2, while I need 4. In other words, to this
// module Itanium2 remains effectively as "wide" as Itanium. Yet it's
// essentially different in respect to this module, and a re-tune was
// required. Well, because some intruction latencies has changed. Most
// noticeably those intensively used:
//
//			Itanium	Itanium2
//	ldf8		9	6		L2 hit
//	ld8		2	1		L1 hit
//	getf		2	5
//	xma[->getf]	7[+1]	4[+0]
//	add[->st8]	1[+1]	1[+0]
//
// What does it mean? You might ratiocinate that the original code
// should run just faster... Because sum of latencies is smaller...
// Wrong! Note that getf latency increased. This means that if a loop is
// scheduled for lower latency (as they were), then it will suffer from
// stall condition and the code will therefore turn anti-scalable, e.g.
// original bn_mul_words spun at 5*n or 2.5 times slower than expected
// on Itanium2! What to do? Reschedule loops for Itanium2? But then
// Itanium would exhibit anti-scalability. So I've chosen to reschedule
// for worst latency for every instruction aiming for best *all-round*
// performance.  

// Q.	How much faster does it get?
// A.	Here is the output from 'openssl speed rsa dsa' for vanilla
//	0.9.6a compiled with gcc version 2.96 20000731 (Red Hat
//	Linux 7.1 2.96-81):
//
//	                  sign    verify    sign/s verify/s
//	rsa  512 bits   0.0036s   0.0003s    275.3   2999.2
//	rsa 1024 bits   0.0203s   0.0011s     49.3    894.1
//	rsa 2048 bits   0.1331s   0.0040s      7.5    250.9
//	rsa 4096 bits   0.9270s   0.0147s      1.1     68.1
//	                  sign    verify    sign/s verify/s
//	dsa  512 bits   0.0035s   0.0043s    288.3    234.8
//	dsa 1024 bits   0.0111s   0.0135s     90.0     74.2
//
//	And here is similar output but for this assembler
//	implementation:-)
//
//	                  sign    verify    sign/s verify/s
//	rsa  512 bits   0.0021s   0.0001s    549.4   9638.5
//	rsa 1024 bits   0.0055s   0.0002s    183.8   4481.1
//	rsa 2048 bits   0.0244s   0.0006s     41.4   1726.3
//	rsa 4096 bits   0.1295s   0.0018s      7.7    561.5
//	                  sign    verify    sign/s verify/s
//	dsa  512 bits   0.0012s   0.0013s    891.9    756.6
//	dsa 1024 bits   0.0023s   0.0028s    440.4    376.2
//	
//	Yes, you may argue that it's not fair comparison as it's
//	possible to craft the C implementation with BN_UMULT_HIGH
//	inline assembler macro. But of course! Here is the output
//	with the macro:
//
//	                  sign    verify    sign/s verify/s
//	rsa  512 bits   0.0020s   0.0002s    495.0   6561.0
//	rsa 1024 bits   0.0086s   0.0004s    116.2   2235.7
//	rsa 2048 bits   0.0519s   0.0015s     19.3    667.3
//	rsa 4096 bits   0.3464s   0.0053s      2.9    187.7
//	                  sign    verify    sign/s verify/s
//	dsa  512 bits   0.0016s   0.0020s    613.1    510.5
//	dsa 1024 bits   0.0045s   0.0054s    221.0    183.9
//
//	My code is still way faster, huh:-) And I believe that even
//	higher performance can be achieved. Note that as keys get
//	longer, performance gain is larger. Why? According to the
//	profiler there is another player in the field, namely
//	BN_from_montgomery consuming larger and larger portion of CPU
//	time as keysize decreases. I therefore consider putting effort
//	to assembler implementation of the following routine:
//
//	void bn_mul_add_mont (BN_ULONG *rp,BN_ULONG *np,int nl,BN_ULONG n0)
//	{
//	int      i,j;
//	BN_ULONG v;
//
//	for (i=0; i<nl; i++)
//		{
//		v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
//		nrp++;
//		rp++;
//		if (((nrp[-1]+=v)&BN_MASK2) < v)
//			for (j=0; ((++nrp[j])&BN_MASK2) == 0; j++) ;
//		}
//	}
//
//	It might as well be beneficial to implement even combaX
//	variants, as it appears as it can literally unleash the
//	performance (see comment section to bn_mul_comba8 below).
//
//	And finally for your reference the output for 0.9.6a compiled
//	with SGIcc version 0.01.0-12 (keep in mind that for the moment
//	of this writing it's not possible to convince SGIcc to use
//	BN_UMULT_HIGH inline assembler macro, yet the code is fast,
//	i.e. for a compiler generated one:-):
//
//	                  sign    verify    sign/s verify/s
//	rsa  512 bits   0.0022s   0.0002s    452.7   5894.3
//	rsa 1024 bits   0.0097s   0.0005s    102.7   2002.9
//	rsa 2048 bits   0.0578s   0.0017s     17.3    600.2
//	rsa 4096 bits   0.3838s   0.0061s      2.6    164.5
//	                  sign    verify    sign/s verify/s
//	dsa  512 bits   0.0018s   0.0022s    547.3    459.6
//	dsa 1024 bits   0.0051s   0.0062s    196.6    161.3
//
//	Oh! Benchmarks were performed on 733MHz Lion-class Itanium
//	system running Redhat Linux 7.1 (very special thanks to Ray
//	McCaffity of Williams Communications for providing an account).
//
// Q.	What's the heck with 'rum 1<<5' at the end of every function?
// A.	Well, by clearing the "upper FP registers written" bit of the
//	User Mask I want to excuse the kernel from preserving upper
//	(f32-f128) FP register bank over process context switch, thus
//	minimizing bus bandwidth consumption during the switch (i.e.
//	after PKI opration completes and the program is off doing
//	something else like bulk symmetric encryption). Having said
//	this, I also want to point out that it might be good idea
//	to compile the whole toolkit (as well as majority of the
//	programs for that matter) with -mfixed-range=f32-f127 command
//	line option. No, it doesn't prevent the compiler from writing
//	to upper bank, but at least discourages to do so. If you don't
//	like the idea you have the option to compile the module with
//	-Drum=nop.m in command line.
//

#if defined(_HPUX_SOURCE) && !defined(_LP64)
#define	ADDP	addp4
#else
#define	ADDP	add
#endif

#if 1
//
// bn_[add|sub]_words routines.
//
// Loops are spinning in 2*(n+5) ticks on Itanuim (provided that the
// data reside in L1 cache, i.e. 2 ticks away). It's possible to
// compress the epilogue and get down to 2*n+6, but at the cost of
// scalability (the neat feature of this implementation is that it
// shall automagically spin in n+5 on "wider" IA-64 implementations:-)
// I consider that the epilogue is short enough as it is to trade tiny
// performance loss on Itanium for scalability.
//
// BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
//
.global	bn_add_words#
.proc	bn_add_words#
.align	64
.skip	32	// makes the loop body aligned at 64-byte boundary
bn_add_words:
	.prologue
	.save	ar.pfs,r2
{ .mii;	alloc		r2=ar.pfs,4,12,0,16
	cmp4.le		p6,p0=r35,r0	};;
{ .mfb;	mov		r8=r0			// return value
(p6)	br.ret.spnt.many	b0	};;

{ .mib;	sub		r10=r35,r0,1
	.save	ar.lc,r3
	mov		r3=ar.lc
	brp.loop.imp	.L_bn_add_words_ctop,.L_bn_add_words_cend-16
					}
{ .mib;	ADDP		r14=0,r32		// rp
	.save	pr,r9
	mov		r9=pr		};;
	.body
{ .mii;	ADDP		r15=0,r33		// ap
	mov		ar.lc=r10
	mov		ar.ec=6		}
{ .mib;	ADDP		r16=0,r34		// bp
	mov		pr.rot=1<<16	};;

.L_bn_add_words_ctop:
{ .mii;	(p16)	ld8		r32=[r16],8	  // b=*(bp++)
	(p18)	add		r39=r37,r34
	(p19)	cmp.ltu.unc	p56,p0=r40,r38	}
{ .mfb;	(p0)	nop.m		0x0
	(p0)	nop.f		0x0
	(p0)	nop.b		0x0		}
{ .mii;	(p16)	ld8		r35=[r15],8	  // a=*(ap++)
	(p58)	cmp.eq.or	p57,p0=-1,r41	  // (p20)
	(p58)	add		r41=1,r41	} // (p20)
{ .mfb;	(p21)	st8		[r14]=r42,8	  // *(rp++)=r
	(p0)	nop.f		0x0
	br.ctop.sptk	.L_bn_add_words_ctop	};;
.L_bn_add_words_cend:

{ .mii;
(p59)	add		r8=1,r8		// return value
	mov		pr=r9,0x1ffff
	mov		ar.lc=r3	}
{ .mbb;	nop.b		0x0
	br.ret.sptk.many	b0	};;
.endp	bn_add_words#

//
// BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
//
.global	bn_sub_words#
.proc	bn_sub_words#
.align	64
.skip	32	// makes the loop body aligned at 64-byte boundary
bn_sub_words:
	.prologue
	.save	ar.pfs,r2
{ .mii;	alloc		r2=ar.pfs,4,12,0,16
	cmp4.le		p6,p0=r35,r0	};;
{ .mfb;	mov		r8=r0			// return value
(p6)	br.ret.spnt.many	b0	};;

{ .mib;	sub		r10=r35,r0,1
	.save	ar.lc,r3
	mov		r3=ar.lc
	brp.loop.imp	.L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
					}
{ .mib;	ADDP		r14=0,r32		// rp
	.save	pr,r9
	mov		r9=pr		};;
	.body
{ .mii;	ADDP		r15=0,r33		// ap
	mov		ar.lc=r10
	mov		ar.ec=6		}
{ .mib;	ADDP		r16=0,r34		// bp
	mov		pr.rot=1<<16	};;

.L_bn_sub_words_ctop:
{ .mii;	(p16)	ld8		r32=[r16],8	  // b=*(bp++)
	(p18)	sub		r39=r37,r34
	(p19)	cmp.gtu.unc	p56,p0=r40,r38	}
{ .mfb;	(p0)	nop.m		0x0
	(p0)	nop.f		0x0
	(p0)	nop.b		0x0		}
{ .mii;	(p16)	ld8		r35=[r15],8	  // a=*(ap++)
	(p58)	cmp.eq.or	p57,p0=0,r41	  // (p20)
	(p58)	add		r41=-1,r41	} // (p20)
{ .mbb;	(p21)	st8		[r14]=r42,8	  // *(rp++)=r
	(p0)	nop.b		0x0
	br.ctop.sptk	.L_bn_sub_words_ctop	};;
.L_bn_sub_words_cend:

{ .mii;
(p59)	add		r8=1,r8		// return value
	mov		pr=r9,0x1ffff
	mov		ar.lc=r3	}
{ .mbb;	nop.b		0x0
	br.ret.sptk.many	b0	};;
.endp	bn_sub_words#
#endif

#if 0
#define XMA_TEMPTATION
#endif

#if 1
//
// BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
//
.global	bn_mul_words#
.proc	bn_mul_words#
.align	64
.skip	32	// makes the loop body aligned at 64-byte boundary
bn_mul_words:
	.prologue
	.save	ar.pfs,r2
#ifdef XMA_TEMPTATION
{ .mfi;	alloc		r2=ar.pfs,4,0,0,0	};;
#else
{ .mfi;	alloc		r2=ar.pfs,4,12,0,16	};;
#endif
{ .mib;	mov		r8=r0			// return value
	cmp4.le		p6,p0=r34,r0
(p6)	br.ret.spnt.many	b0		};;

{ .mii;	sub	r10=r34,r0,1
	.save	ar.lc,r3
	mov	r3=ar.lc
	.save	pr,r9
	mov	r9=pr			};;

	.body
{ .mib;	setf.sig	f8=r35	// w
	mov		pr.rot=0x800001<<16
			// ------^----- serves as (p50) at first (p27)
	brp.loop.imp	.L_bn_mul_words_ctop,.L_bn_mul_words_cend-16
					}

#ifndef XMA_TEMPTATION

{ .mmi;	ADDP		r14=0,r32	// rp
	ADDP		r15=0,r33	// ap
	mov		ar.lc=r10	}
{ .mmi;	mov		r40=0		// serves as r35 at first (p27)
	mov		ar.ec=13	};;

// This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium
// L2 cache (i.e. 9 ticks away) as floating point load/store instructions
// bypass L1 cache and L2 latency is actually best-case scenario for
// ldf8. The loop is not scalable and shall run in 2*(n+12) even on
// "wider" IA-64 implementations. It's a trade-off here. n+24 loop
// would give us ~5% in *overall* performance improvement on "wider"
// IA-64, but would hurt Itanium for about same because of longer
// epilogue. As it's a matter of few percents in either case I've
// chosen to trade the scalability for development time (you can see
// this very instruction sequence in bn_mul_add_words loop which in
// turn is scalable).
.L_bn_mul_words_ctop:
{ .mfi;	(p25)	getf.sig	r36=f52			// low
	(p21)	xmpy.lu		f48=f37,f8
	(p28)	cmp.ltu		p54,p50=r41,r39	}
{ .mfi;	(p16)	ldf8		f32=[r15],8
	(p21)	xmpy.hu		f40=f37,f8
	(p0)	nop.i		0x0		};;
{ .mii;	(p25)	getf.sig	r32=f44			// high
	.pred.rel	"mutex",p50,p54
	(p50)	add		r40=r38,r35		// (p27)
	(p54)	add		r40=r38,r35,1	}	// (p27)
{ .mfb;	(p28)	st8		[r14]=r41,8
	(p0)	nop.f		0x0
	br.ctop.sptk	.L_bn_mul_words_ctop	};;
.L_bn_mul_words_cend:

{ .mii;	nop.m		0x0
.pred.rel	"mutex",p51,p55
(p51)	add		r8=r36,r0
(p55)	add		r8=r36,r0,1	}
{ .mfb;	nop.m	0x0
	nop.f	0x0
	nop.b	0x0			}

#else	// XMA_TEMPTATION

	setf.sig	f37=r0	// serves as carry at (p18) tick
	mov		ar.lc=r10
	mov		ar.ec=5;;

// Most of you examining this code very likely wonder why in the name
// of Intel the following loop is commented out? Indeed, it looks so
// neat that you find it hard to believe that it's something wrong
// with it, right? The catch is that every iteration depends on the
// result from previous one and the latter isn't available instantly.
// The loop therefore spins at the latency of xma minus 1, or in other
// words at 6*(n+4) ticks:-( Compare to the "production" loop above
// that runs in 2*(n+11) where the low latency problem is worked around
// by moving the dependency to one-tick latent interger ALU. Note that
// "distance" between ldf8 and xma is not latency of ldf8, but the
// *difference* between xma and ldf8 latencies.
.L_bn_mul_words_ctop:
{ .mfi;	(p16)	ldf8		f32=[r33],8
	(p18)	xma.hu		f38=f34,f8,f39	}
{ .mfb;	(p20)	stf8		[r32]=f37,8
	(p18)	xma.lu		f35=f34,f8,f39
	br.ctop.sptk	.L_bn_mul_words_ctop	};;
.L_bn_mul_words_cend:

	getf.sig	r8=f41		// the return value

#endif	// XMA_TEMPTATION

{ .mii;	nop.m		0x0
	mov		pr=r9,0x1ffff
	mov		ar.lc=r3	}
{ .mfb;	rum		1<<5		// clear um.mfh
	nop.f		0x0
	br.ret.sptk.many	b0	};;
.endp	bn_mul_words#
#endif

#if 1
//
// BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
//
.global	bn_mul_add_words#
.proc	bn_mul_add_words#
.align	64
.skip	48	// makes the loop body aligned at 64-byte boundary
bn_mul_add_words:
	.prologue
	.save	ar.pfs,r2
{ .mmi;	alloc		r2=ar.pfs,4,4,0,8
	cmp4.le		p6,p0=r34,r0
	.save	ar.lc,r3
	mov		r3=ar.lc	};;
{ .mib;	mov		r8=r0		// return value
	sub		r10=r34,r0,1
(p6)	br.ret.spnt.many	b0	};;

{ .mib;	setf.sig	f8=r35		// w
	.save	pr,r9
	mov		r9=pr
	brp.loop.imp	.L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
					}
	.body
{ .mmi;	ADDP		r14=0,r32	// rp
	ADDP		r15=0,r33	// ap
	mov		ar.lc=r10	}
{ .mii;	ADDP		r16=0,r32	// rp copy
	mov		pr.rot=0x2001<<16
			// ------^----- serves as (p40) at first (p27)
	mov		ar.ec=11	};;

// This loop spins in 3*(n+10) ticks on Itanium and in 2*(n+10) on
// Itanium 2. Yes, unlike previous versions it scales:-) Previous
// version was peforming *all* additions in IALU and was starving
// for those even on Itanium 2. In this version one addition is
// moved to FPU and is folded with multiplication. This is at cost
// of propogating the result from previous call to this subroutine
// to L2 cache... In other words negligible even for shorter keys.
// *Overall* performance improvement [over previous version] varies
// from 11 to 22 percent depending on key length.
.L_bn_mul_add_words_ctop:
.pred.rel	"mutex",p40,p42
{ .mfi;	(p23)	getf.sig	r36=f45			// low
	(p20)	xma.lu		f42=f36,f8,f50		// low
	(p40)	add		r39=r39,r35	}	// (p27)
{ .mfi;	(p16)	ldf8		f32=[r15],8		// *(ap++)
	(p20)	xma.hu		f36=f36,f8,f50		// high
	(p42)	add		r39=r39,r35,1	};;	// (p27)
{ .mmi;	(p24)	getf.sig	r32=f40			// high
	(p16)	ldf8		f46=[r16],8		// *(rp1++)
	(p40)	cmp.ltu		p41,p39=r39,r35	}	// (p27)
{ .mib;	(p26)	st8		[r14]=r39,8		// *(rp2++)
	(p42)	cmp.leu		p41,p39=r39,r35		// (p27)
	br.ctop.sptk	.L_bn_mul_add_words_ctop};;
.L_bn_mul_add_words_cend:

{ .mmi;	.pred.rel	"mutex",p40,p42
(p40)	add		r8=r35,r0
(p42)	add		r8=r35,r0,1
	mov		pr=r9,0x1ffff	}
{ .mib;	rum		1<<5		// clear um.mfh
	mov		ar.lc=r3
	br.ret.sptk.many	b0	};;
.endp	bn_mul_add_words#
#endif

#if 1
//
// void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
//
.global	bn_sqr_words#
.proc	bn_sqr_words#
.align	64
.skip	32	// makes the loop body aligned at 64-byte boundary 
bn_sqr_words:
	.prologue
	.save	ar.pfs,r2
{ .mii;	alloc		r2=ar.pfs,3,0,0,0
	sxt4		r34=r34		};;
{ .mii;	cmp.le		p6,p0=r34,r0
	mov		r8=r0		}	// return value
{ .mfb;	ADDP		r32=0,r32
	nop.f		0x0
(p6)	br.ret.spnt.many	b0	};;

{ .mii;	sub	r10=r34,r0,1
	.save	ar.lc,r3
	mov	r3=ar.lc
	.save	pr,r9
	mov	r9=pr			};;

	.body
{ .mib;	ADDP		r33=0,r33
	mov		pr.rot=1<<16
	brp.loop.imp	.L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
					}
{ .mii;	add		r34=8,r32
	mov		ar.lc=r10
	mov		ar.ec=18	};;

// 2*(n+17) on Itanium, (n+17) on "wider" IA-64 implementations. It's
// possible to compress the epilogue (I'm getting tired to write this
// comment over and over) and get down to 2*n+16 at the cost of
// scalability. The decision will very likely be reconsidered after the
// benchmark program is profiled. I.e. if perfomance gain on Itanium
// will appear larger than loss on "wider" IA-64, then the loop should
// be explicitely split and the epilogue compressed.
.L_bn_sqr_words_ctop:
{ .mfi;	(p16)	ldf8		f32=[r33],8
	(p25)	xmpy.lu		f42=f41,f41
	(p0)	nop.i		0x0		}
{ .mib;	(p33)	stf8		[r32]=f50,16
	(p0)	nop.i		0x0
	(p0)	nop.b		0x0		}
{ .mfi;	(p0)	nop.m		0x0
	(p25)	xmpy.hu		f52=f41,f41
	(p0)	nop.i		0x0		}
{ .mib;	(p33)	stf8		[r34]=f60,16
	(p0)	nop.i		0x0
	br.ctop.sptk	.L_bn_sqr_words_ctop	};;
.L_bn_sqr_words_cend:

{ .mii;	nop.m		0x0
	mov		pr=r9,0x1ffff
	mov		ar.lc=r3	}
{ .mfb;	rum		1<<5		// clear um.mfh
	nop.f		0x0
	br.ret.sptk.many	b0	};;
.endp	bn_sqr_words#
#endif

#if 1
// Apparently we win nothing by implementing special bn_sqr_comba8.
// Yes, it is possible to reduce the number of multiplications by
// almost factor of two, but then the amount of additions would
// increase by factor of two (as we would have to perform those
// otherwise performed by xma ourselves). Normally we would trade
// anyway as multiplications are way more expensive, but not this
// time... Multiplication kernel is fully pipelined and as we drain
// one 128-bit multiplication result per clock cycle multiplications
// are effectively as inexpensive as additions. Special implementation
// might become of interest for "wider" IA-64 implementation as you'll
// be able to get through the multiplication phase faster (there won't
// be any stall issues as discussed in the commentary section below and
// you therefore will be able to employ all 4 FP units)... But these
// Itanium days it's simply too hard to justify the effort so I just
// drop down to bn_mul_comba8 code:-)
//
// void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
//
.global	bn_sqr_comba8#
.proc	bn_sqr_comba8#
.align	64
bn_sqr_comba8:
	.prologue
	.save	ar.pfs,r2
#if defined(_HPUX_SOURCE) && !defined(_LP64)
{ .mii;	alloc	r2=ar.pfs,2,1,0,0
	addp4	r33=0,r33
	addp4	r32=0,r32		};;
{ .mii;
#else
{ .mii;	alloc	r2=ar.pfs,2,1,0,0
#endif
	mov	r34=r33
	add	r14=8,r33		};;
	.body
{ .mii;	add	r17=8,r34
	add	r15=16,r33
	add	r18=16,r34		}
{ .mfb;	add	r16=24,r33
	br	.L_cheat_entry_point8	};;
.endp	bn_sqr_comba8#
#endif

#if 1
// I've estimated this routine to run in ~120 ticks, but in reality
// (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
// cycles consumed for instructions fetch? Or did I misinterpret some
// clause in Itanium µ-architecture manual? Comments are welcomed and
// highly appreciated.
//
// On Itanium 2 it takes ~190 ticks. This is because of stalls on
// result from getf.sig. I do nothing about it at this point for
// reasons depicted below.
//
// However! It should be noted that even 160 ticks is darn good result
// as it's over 10 (yes, ten, spelled as t-e-n) times faster than the
// C version (compiled with gcc with inline assembler). I really
// kicked compiler's butt here, didn't I? Yeah! This brings us to the
// following statement. It's damn shame that this routine isn't called
// very often nowadays! According to the profiler most CPU time is
// consumed by bn_mul_add_words called from BN_from_montgomery. In
// order to estimate what we're missing, I've compared the performance
// of this routine against "traditional" implementation, i.e. against
// following routine:
//
// void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
// {	r[ 8]=bn_mul_words(    &(r[0]),a,8,b[0]);
//	r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]);
//	r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]);
//	r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]);
//	r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]);
//	r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]);
//	r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]);
//	r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
// }
//
// The one below is over 8 times faster than the one above:-( Even
// more reasons to "combafy" bn_mul_add_mont...
//
// And yes, this routine really made me wish there were an optimizing
// assembler! It also feels like it deserves a dedication.
//
//	To my wife for being there and to my kids...
//
// void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
//
#define	carry1	r14
#define	carry2	r15
#define	carry3	r34
.global	bn_mul_comba8#
.proc	bn_mul_comba8#
.align	64
bn_mul_comba8:
	.prologue
	.save	ar.pfs,r2
#if defined(_HPUX_SOURCE) && !defined(_LP64)
{ .mii;	alloc	r2=ar.pfs,3,0,0,0
	addp4	r33=0,r33
	addp4	r34=0,r34		};;
{ .mii;	addp4	r32=0,r32
#else
{ .mii;	alloc   r2=ar.pfs,3,0,0,0
#endif
	add	r14=8,r33
	add	r17=8,r34		}
	.body
{ .mii;	add	r15=16,r33
	add	r18=16,r34
	add	r16=24,r33		}
.L_cheat_entry_point8:
{ .mmi;	add	r19=24,r34

	ldf8	f32=[r33],32		};;

{ .mmi;	ldf8	f120=[r34],32
	ldf8	f121=[r17],32		}
{ .mmi;	ldf8	f122=[r18],32
	ldf8	f123=[r19],32		};;
{ .mmi;	ldf8	f124=[r34]
	ldf8	f125=[r17]		}
{ .mmi;	ldf8	f126=[r18]
	ldf8	f127=[r19]		}

{ .mmi;	ldf8	f33=[r14],32
	ldf8	f34=[r15],32		}
{ .mmi;	ldf8	f35=[r16],32;;
	ldf8	f36=[r33]		}
{ .mmi;	ldf8	f37=[r14]
	ldf8	f38=[r15]		}
{ .mfi;	ldf8	f39=[r16]
// -------\ Entering multiplier's heaven /-------
// ------------\                    /------------
// -----------------\          /-----------------
// ----------------------\/----------------------
		xma.hu	f41=f32,f120,f0		}
{ .mfi;		xma.lu	f40=f32,f120,f0		};; // (*)
{ .mfi;		xma.hu	f51=f32,f121,f0		}
{ .mfi;		xma.lu	f50=f32,f121,f0		};;
{ .mfi;		xma.hu	f61=f32,f122,f0		}
{ .mfi;		xma.lu	f60=f32,f122,f0		};;
{ .mfi;		xma.hu	f71=f32,f123,f0		}
{ .mfi;		xma.lu	f70=f32,f123,f0		};;
{ .mfi;		xma.hu	f81=f32,f124,f0		}
{ .mfi;		xma.lu	f80=f32,f124,f0		};;
{ .mfi;		xma.hu	f91=f32,f125,f0		}
{ .mfi;		xma.lu	f90=f32,f125,f0		};;
{ .mfi;		xma.hu	f101=f32,f126,f0	}
{ .mfi;		xma.lu	f100=f32,f126,f0	};;
{ .mfi;		xma.hu	f111=f32,f127,f0	}
{ .mfi;		xma.lu	f110=f32,f127,f0	};;//
// (*)	You can argue that splitting at every second bundle would
//	prevent "wider" IA-64 implementations from achieving the peak
//	performance. Well, not really... The catch is that if you
//	intend to keep 4 FP units busy by splitting at every fourth
//	bundle and thus perform these 16 multiplications in 4 ticks,
//	the first bundle *below* would stall because the result from
//	the first xma bundle *above* won't be available for another 3
//	ticks (if not more, being an optimist, I assume that "wider"
//	implementation will have same latency:-). This stall will hold
//	you back and the performance would be as if every second bundle
//	were split *anyway*...
{ .mfi;	getf.sig	r16=f40
		xma.hu	f42=f33,f120,f41
	add		r33=8,r32		}
{ .mfi;		xma.lu	f41=f33,f120,f41	};;
{ .mfi;	getf.sig	r24=f50
		xma.hu	f52=f33,f121,f51	}
{ .mfi;		xma.lu	f51=f33,f121,f51	};;
{ .mfi;	st8		[r32]=r16,16
		xma.hu	f62=f33,f122,f61	}
{ .mfi;		xma.lu	f61=f33,f122,f61	};;
{ .mfi;		xma.hu	f72=f33,f123,f71	}
{ .mfi;		xma.lu	f71=f33,f123,f71	};;
{ .mfi;		xma.hu	f82=f33,f124,f81	}
{ .mfi;		xma.lu	f81=f33,f124,f81	};;
{ .mfi;		xma.hu	f92=f33,f125,f91	}
{ .mfi;		xma.lu	f91=f33,f125,f91	};;
{ .mfi;		xma.hu	f102=f33,f126,f101	}
{ .mfi;		xma.lu	f101=f33,f126,f101	};;
{ .mfi;		xma.hu	f112=f33,f127,f111	}
{ .mfi;		xma.lu	f111=f33,f127,f111	};;//
//-------------------------------------------------//
{ .mfi;	getf.sig	r25=f41
		xma.hu	f43=f34,f120,f42	}
{ .mfi;		xma.lu	f42=f34,f120,f42	};;
{ .mfi;	getf.sig	r16=f60
		xma.hu	f53=f34,f121,f52	}
{ .mfi;		xma.lu	f52=f34,f121,f52	};;
{ .mfi;	getf.sig	r17=f51
		xma.hu	f63=f34,f122,f62
	add		r25=r25,r24		}
{ .mfi;		xma.lu	f62=f34,f122,f62
	mov		carry1=0		};;
{ .mfi;	cmp.ltu		p6,p0=r25,r24
		xma.hu	f73=f34,f123,f72	}
{ .mfi;		xma.lu	f72=f34,f123,f72	};;
{ .mfi;	st8		[r33]=r25,16
		xma.hu	f83=f34,f124,f82
(p6)	add		carry1=1,carry1		}
{ .mfi;		xma.lu	f82=f34,f124,f82	};;
{ .mfi;		xma.hu	f93=f34,f125,f92	}
{ .mfi;		xma.lu	f92=f34,f125,f92	};;
{ .mfi;		xma.hu	f103=f34,f126,f102	}
{ .mfi;		xma.lu	f102=f34,f126,f102	};;
{ .mfi;		xma.hu	f113=f34,f127,f112	}
{ .mfi;		xma.lu	f112=f34,f127,f112	};;//
//-------------------------------------------------//
{ .mfi;	getf.sig	r18=f42
		xma.hu	f44=f35,f120,f43
	add		r17=r17,r16		}
{ .mfi;		xma.lu	f43=f35,f120,f43	};;
{ .mfi;	getf.sig	r24=f70
		xma.hu	f54=f35,f121,f53	}
{ .mfi;	mov		carry2=0
		xma.lu	f53=f35,f121,f53	};;
{ .mfi;	getf.sig	r25=f61
		xma.hu	f64=f35,f122,f63
	cmp.ltu		p7,p0=r17,r16		}
{ .mfi;	add		r18=r18,r17
		xma.lu	f63=f35,f122,f63	};;
{ .mfi;	getf.sig	r26=f52
		xma.hu	f74=f35,f123,f73
(p7)	add		carry2=1,carry2		}
{ .mfi;	cmp.ltu		p7,p0=r18,r17
		xma.lu	f73=f35,f123,f73
	add		r18=r18,carry1		};;
{ .mfi;
		xma.hu	f84=f35,f124,f83
(p7)	add		carry2=1,carry2		}
{ .mfi;	cmp.ltu		p7,p0=r18,carry1
		xma.lu	f83=f35,f124,f83	};;
{ .mfi;	st8		[r32]=r18,16
		xma.hu	f94=f35,f125,f93
(p7)	add		carry2=1,carry2		}
{ .mfi;		xma.lu	f93=f35,f125,f93	};;
{ .mfi;		xma.hu	f104=f35,f126,f103	}
{ .mfi;		xma.lu	f103=f35,f126,f103	};;
{ .mfi;		xma.hu	f114=f35,f127,f113	}
{ .mfi;	mov		carry1=0
		xma.lu	f113=f35,f127,f113
	add		r25=r25,r24		};;//
//-------------------------------------------------//
{ .mfi;	getf.sig	r27=f43
		xma.hu	f45=f36,f120,f44
	cmp.ltu		p6,p0=r25,r24		}
{ .mfi;		xma.lu	f44=f36,f120,f44	
	add		r26=r26,r25		};;
{ .mfi;	getf.sig	r16=f80
		xma.hu	f55=f36,f121,f54
(p6)	add		carry1=1,carry1		}
{ .mfi;		xma.lu	f54=f36,f121,f54	};;
{ .mfi;	getf.sig	r17=f71
		xma.hu	f65=f36,f122,f64
	cmp.ltu		p6,p0=r26,r25		}
{ .mfi;		xma.lu	f64=f36,f122,f64
	add		r27=r27,r26		};;
{ .mfi;	getf.sig	r18=f62
		xma.hu	f75=f36,f123,f74
(p6)	add		carry1=1,carry1		}
{ .mfi;	cmp.ltu		p6,p0=r27,r26
		xma.lu	f74=f36,f123,f74
	add		r27=r27,carry2		};;
{ .mfi;	getf.sig	r19=f53
		xma.hu	f85=f36,f124,f84
(p6)	add		carry1=1,carry1		}
{ .mfi;		xma.lu	f84=f36,f124,f84
	cmp.ltu		p6,p0=r27,carry2	};;
{ .mfi;	st8		[r33]=r27,16
		xma.hu	f95=f36,f125,f94
(p6)	add		carry1=1,carry1		}
{ .mfi;		xma.lu	f94=f36,f125,f94	};;
{ .mfi;		xma.hu	f105=f36,f126,f104	}
{ .mfi;	mov		carry2=0
		xma.lu	f104=f36,f126,f104
	add		r17=r17,r16		};;
{ .mfi;		xma.hu	f115=f36,f127,f114
	cmp.ltu		p7,p0=r17,r16		}
{ .mfi;		xma.lu	f114=f36,f127,f114
	add		r18=r18,r17		};;//
//-------------------------------------------------//
{ .mfi;	getf.sig	r20=f44
		xma.hu	f46=f37,f120,f45
(p7)	add		carry2=1,carry2		}
{ .mfi;	cmp.ltu		p7,p0=r18,r17
		xma.lu	f45=f37,f120,f45
	add		r19=r19,r18		};;
{ .mfi;	getf.sig	r24=f90
		xma.hu	f56=f37,f121,f55	}
{ .mfi;		xma.lu	f55=f37,f121,f55	};;
{ .mfi;	getf.sig	r25=f81
		xma.hu	f66=f37,f122,f65
(p7)	add		carry2=1,carry2		}
{ .mfi;	cmp.ltu		p7,p0=r19,r18
		xma.lu	f65=f37,f122,f65
	add		r20=r20,r19		};;
{ .mfi;	getf.sig	r26=f72
		xma.hu	f76=f37,f123,f75
(p7)	add		carry2=1,carry2		}
{ .mfi;	cmp.ltu		p7,p0=r20,r19
		xma.lu	f75=f37,f123,f75
	add		r20=r20,carry1		};;
{ .mfi;	getf.sig	r27=f63
		xma.hu	f86=f37,f124,f85
(p7)	add		carry2=1,carry2		}
{ .mfi;		xma.lu	f85=f37,f124,f85
	cmp.ltu		p7,p0=r20,carry1	};;
{ .mfi;	getf.sig	r28=f54
		xma.hu	f96=f37,f125,f95
(p7)	add		carry2=1,carry2		}
{ .mfi;	st8		[r32]=r20,16
		xma.lu	f95=f37,f125,f95	};;
{ .mfi;		xma.hu	f106=f37,f126,f105	}
{ .mfi;	mov		carry1=0
		xma.lu	f105=f37,f126,f105
	add		r25=r25,r24		};;
{ .mfi;		xma.hu	f116=f37,f127,f115
	cmp.ltu		p6,p0=r25,r24		}
{ .mfi;		xma.lu	f115=f37,f127,f115
	add		r26=r26,r25		};;//
//-------------------------------------------------//
{ .mfi;	getf.sig	r29=f45
		xma.hu	f47=f38,f120,f46
(p6)	add		carry1=1,carry1		}
{ .mfi;	cmp.ltu		p6,p0=r26,r25
		xma.lu	f46=f38,f120,f46
	add		r27=r27,r26		};;
{ .mfi;	getf.sig	r16=f100
		xma.hu	f57=f38,f121,f56
(p6)	add		carry1=1,carry1		}
{ .mfi;	cmp.ltu		p6,p0=r27,r26
		xma.lu	f56=f38,f121,f56
	add		r28=r28,r27		};;
{ .mfi;	getf.sig	r17=f91
		xma.hu	f67=f38,f122,f66
(p6)	add		carry1=1,carry1		}
{ .mfi;	cmp.ltu		p6,p0=r28,r27
		xma.lu	f66=f38,f122,f66
	add		r29=r29,r28		};;
{ .mfi;	getf.sig	r18=f82
		xma.hu	f77=f38,f123,f76
(p6)	add		carry1=1,carry1		}
{ .mfi;	cmp.ltu		p6,p0=r29,r28
		xma.lu	f76=f38,f123,f76
	add		r29=r29,carry2		};;
{ .mfi;	getf.sig	r19=f73
		xma.hu	f87=f38,f124,f86
(p6)	add		carry1=1,carry1		}
{ .mfi;		xma.lu	f86=f38,f124,f86
	cmp.ltu		p6,p0=r29,carry2	};;
{ .mfi;	getf.sig	r20=f64
		xma.hu	f97=f38,f125,f96
(p6)	add		carry1=1,carry1		}
{ .mfi;	st8		[r33]=r29,16
		xma.lu	f96=f38,f125,f96	};;
{ .mfi;	getf.sig	r21=f55
		xma.hu	f107=f38,f126,f106	}
{ .mfi;	mov		carry2=0
		xma.lu	f106=f38,f126,f106
	add		r17=r17,r16		};;
{ .mfi;		xma.hu	f117=f38,f127,f116
	cmp.ltu		p7,p0=r17,r16		}
{ .mfi;		xma.lu	f116=f38,f127,f116
	add		r18=r18,r17		};;//
//-------------------------------------------------//
{ .mfi;	getf.sig	r22=f46
		xma.hu	f48=f39,f120,f47
(p7)	add		carry2=1,carry2		}
{ .mfi;	cmp.ltu		p7,p0=r18,r17
		xma.lu	f47=f39,f120,f47
	add		r19=r19,r18		};;
{ .mfi;	getf.sig	r24=f110
		xma.hu	f58=f39,f121,f57
(p7)	add		carry2=1,carry2		}
{ .mfi;	cmp.ltu		p7,p0=r19,r18
		xma.lu	f57=f39,f121,f57
	add		r20=r20,r19		};;
{ .mfi;	getf.sig	r25=f101
		xma.hu	f68=f39,f122,f67
(p7)	add		carry2=1,carry2		}
{ .mfi;	cmp.ltu		p7,p0=r20,r19
		xma.lu	f67=f39,f122,f67
	add		r21=r21,r20		};;
{ .mfi;	getf.sig	r26=f92
		xma.hu	f78=f39,f123,f77
(p7)	add		carry2=1,carry2		}
{ .mfi;	cmp.ltu		p7,p0=r21,r20
		xma.lu	f77=f39,f123,f77
	add		r22=r22,r21		};;
{ .mfi;	getf.sig	r27=f83
		xma.hu	f88=f39,f124,f87
(p7)	add		carry2=1,carry2		}
{ .mfi;	cmp.ltu		p7,p0=r22,r21
		xma.lu	f87=f39,f124,f87
	add		r22=r22,carry1		};;
{ .mfi;	getf.sig	r28=f74
		xma.hu	f98=f39,f125,f97
(p7)	add		carry2=1,carry2		}
{ .mfi;		xma.lu	f97=f39,f125,f97
	cmp.ltu		p7,p0=r22,carry1	};;
{ .mfi;	getf.sig	r29=f65
		xma.hu	f108=f39,f126,f107
(p7)	add		carry2=1,carry2		}
{ .mfi;	st8		[r32]=r22,16
		xma.lu	f107=f39,f126,f107	};;
{ .mfi;	getf.sig	r30=f56
		xma.hu	f118=f39,f127,f117	}
{ .mfi;		xma.lu	f117=f39,f127,f117	};;//
//-------------------------------------------------//
// Leaving muliplier's heaven... Quite a ride, huh?

{ .mii;	getf.sig	r31=f47
	add		r25=r25,r24
	mov		carry1=0		};;
{ .mii;		getf.sig	r16=f111
	cmp.ltu		p6,p0=r25,r24
	add		r26=r26,r25		};;
{ .mfb;		getf.sig	r17=f102	}
{ .mii;
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r26,r25
	add		r27=r27,r26		};;
{ .mfb;	nop.m	0x0				}
{ .mii;
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r27,r26
	add		r28=r28,r27		};;
{ .mii;		getf.sig	r18=f93
		add		r17=r17,r16
		mov		carry3=0	}
{ .mii;
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r28,r27
	add		r29=r29,r28		};;
{ .mii;		getf.sig	r19=f84
		cmp.ltu		p7,p0=r17,r16	}
{ .mii;
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r29,r28
	add		r30=r30,r29		};;
{ .mii;		getf.sig	r20=f75
		add		r18=r18,r17	}
{ .mii;
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r30,r29
	add		r31=r31,r30		};;
{ .mfb;		getf.sig	r21=f66		}
{ .mii;	(p7)	add		carry3=1,carry3
		cmp.ltu		p7,p0=r18,r17
		add		r19=r19,r18	}
{ .mfb;	nop.m	0x0				}
{ .mii;
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r31,r30
	add		r31=r31,carry2		};;
{ .mfb;		getf.sig	r22=f57		}
{ .mii;	(p7)	add		carry3=1,carry3
		cmp.ltu		p7,p0=r19,r18
		add		r20=r20,r19	}
{ .mfb;	nop.m	0x0				}
{ .mii;
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r31,carry2	};;
{ .mfb;		getf.sig	r23=f48		}
{ .mii;	(p7)	add		carry3=1,carry3
		cmp.ltu		p7,p0=r20,r19
		add		r21=r21,r20	}
{ .mii;
(p6)	add		carry1=1,carry1		}
{ .mfb;	st8		[r33]=r31,16		};;

{ .mfb;	getf.sig	r24=f112		}
{ .mii;	(p7)	add		carry3=1,carry3
		cmp.ltu		p7,p0=r21,r20
		add		r22=r22,r21	};;
{ .mfb;	getf.sig	r25=f103		}
{ .mii;	(p7)	add		carry3=1,carry3
		cmp.ltu		p7,p0=r22,r21