Loading crypto/bn/asm/ia64-mont.pl +150 −63 Original line number Diff line number Diff line Loading @@ -38,15 +38,15 @@ # So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with* # this module is: # sign verify sign/s verify/s # rsa 512 bits 0.000302s 0.000024s 3312.3 41332.2 # rsa 1024 bits 0.000816s 0.000058s 1225.2 17172.0 # rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4 # rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0 # rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0 # rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6 # dsa 512 bits 0.000254s 0.000206s 3944.6 4865.1 # dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0 # dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4 # dsa 2048 bits 0.001453s 0.001703s 688.1 587.4 # # ... and *without*: # ... and *without* (but still with ia64.S): # # rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5 # rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3 Loading @@ -56,9 +56,9 @@ # dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2 # dsa 2048 bits 0.001894s 0.002179s 528.1 458.9 # # As it can be seen, RSA sign performance improves by 120-30%, # hereafter less for longer keys, while verify - by 72-13%. # DSA performance improves by 100-30%. # As it can be seen, RSA sign performance improves by 130-30%, # hereafter less for longer keys, while verify - by 74-13%. # DSA performance improves by 115-30%. if ($^O eq "hpux") { $ADDP="addp4"; Loading Loading @@ -473,7 +473,7 @@ bn_mul_mont_8: mov ar.lc=r28 } { .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7] (p15)fcvt.fxu bj[0]=f0 mov ar.ec=2 } mov ar.ec=1 } { .mfi; (p12)ldf8 ni6=[nptr],16 // np[6] (p13)fcvt.fxu ni6=f0 mov pr.rot=1<<16 } Loading @@ -482,12 +482,12 @@ bn_mul_mont_8: brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16 };; // The loop is scheduled for 32*(n+1) ticks on Itanium 2. Actual // measurement with help of Interval Time Counter indicate that the // The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt // to measure with help of Interval Time Counter indicated that the // factor is a tad higher: 33 or 34, if not 35. Exact measurement and // addressing the issue is problematic, because I don't have access // to platform-specific instruction-level profiler. On Itanium it // should run in 56*(n+1) ticks, because of higher xma latency... // should run in 56*n ticks, because of higher xma latency... .Louter_8_ctop: .pred.rel "mutex",p40,p42 .pred.rel "mutex",p48,p50 Loading Loading @@ -652,62 +652,149 @@ bn_mul_mont_8: br.ctop.sptk.many .Louter_8_ctop };; .Louter_8_cend: // move np[8] to GPR bank and subtract it from carrybit|tmp[8] // above loop has to execute one more time, without (p16), which is // replaced with merged move of np[8] to GPR bank .pred.rel "mutex",p40,p42 .pred.rel "mutex",p48,p50 { .mmi; (p0) getf.sig n1=ni0 // 0: (p40) add a3=a3,n3 // (p17) a3+=n3 (p42) add a3=a3,n3,1 };; { .mii; (p17) getf.sig a7=alo[8] // 1: (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3 (p50) add t[6]=t[6],a3,1 };; { .mfi; (p17) getf.sig a8=ahi[8] // 2: (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0 (p40) cmp.ltu p43,p41=a3,n3 } { .mfi; (p42) cmp.leu p43,p41=a3,n3 (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6] (p0) nop.i 0 };; { .mii; (p17) getf.sig n5=nlo[6] // 3: (p48) cmp.ltu p51,p49=t[6],a3 (p50) cmp.leu p51,p49=t[6],a3 };; .pred.rel "mutex",p41,p43 .pred.rel "mutex",p49,p51 { .mmi; (p0) getf.sig n2=ni1 // 4: (p41) add a4=a4,n4 // (p17) a4+=n4 (p43) add a4=a4,n4,1 };; { .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4 (p0) nop.f 0 (p51) add t[5]=t[5],a4,1 };; { .mfi; (p0) getf.sig n3=ni2 // 6: (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0 (p41) cmp.ltu p42,p40=a4,n4 } { .mfi; (p43) cmp.leu p42,p40=a4,n4 (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7] (p0) nop.i 0 };; { .mii; (p17) getf.sig n6=nlo[7] // 7: (p49) cmp.ltu p50,p48=t[5],a4 (p51) cmp.leu p50,p48=t[5],a4 };; .pred.rel "mutex",p40,p42 .pred.rel "mutex",p48,p50 { .mii; (p0) getf.sig n4=ni3 // 8: (p40) add a5=a5,n5 // (p17) a5+=n5 (p42) add a5=a5,n5,1 };; { .mii; (p0) nop.m 0 // 9: (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5 (p50) add t[4]=t[4],a5,1 };; { .mii; (p0) nop.m 0 // 10: (p40) cmp.ltu p43,p41=a5,n5 (p42) cmp.leu p43,p41=a5,n5 };; { .mii; (p17) getf.sig n7=nlo[8] // 11: (p48) cmp.ltu p51,p49=t[4],a5 (p50) cmp.leu p51,p49=t[4],a5 };; .pred.rel "mutex",p41,p43 .pred.rel "mutex",p49,p51 { .mii; (p17) getf.sig n8=nhi[8] // 12: (p41) add a6=a6,n6 // (p17) a6+=n6 (p43) add a6=a6,n6,1 };; { .mii; (p0) getf.sig n5=ni4 // 13: (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6 (p51) add t[3]=t[3],a6,1 };; { .mii; (p0) nop.m 0 // 14: (p41) cmp.ltu p42,p40=a6,n6 (p43) cmp.leu p42,p40=a6,n6 };; { .mii; (p0) getf.sig n6=ni5 // 15: (p49) cmp.ltu p50,p48=t[3],a6 (p51) cmp.leu p50,p48=t[3],a6 };; .pred.rel "mutex",p40,p42 .pred.rel "mutex",p48,p50 { .mii; (p0) nop.m 0 // 16: (p40) add a7=a7,n7 // (p17) a7+=n7 (p42) add a7=a7,n7,1 };; { .mii; (p0) nop.m 0 // 17: (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7 (p50) add t[2]=t[2],a7,1 };; { .mii; (p0) nop.m 0 // 18: (p40) cmp.ltu p43,p41=a7,n7 (p42) cmp.leu p43,p41=a7,n7 };; { .mii; (p0) getf.sig n7=ni6 // 19: (p48) cmp.ltu p51,p49=t[2],a7 (p50) cmp.leu p51,p49=t[2],a7 };; .pred.rel "mutex",p41,p43 .pred.rel "mutex",p49,p51 { .mii; (p0) nop.m 0 // 20: (p41) add a8=a8,n8 // (p17) a8+=n8 (p43) add a8=a8,n8,1 };; { .mmi; (p0) nop.m 0 // 21: (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8 (p51) add t[1]=t[1],a8,1 } { .mmi; (p17) mov t[0]=r0 (p41) cmp.ltu p42,p40=a8,n8 (p43) cmp.leu p42,p40=a8,n8 };; { .mmi; (p0) getf.sig n8=ni7 // 22: (p49) cmp.ltu p50,p48=t[1],a8 (p51) cmp.leu p50,p48=t[1],a8 } { .mmi; (p42) add t[0]=t[0],r0,1 (p0) add r16=-7*16,prevsp (p0) add r17=-6*16,prevsp };; // subtract np[8] from carrybit|tmp[8] // carrybit|tmp[8] layout upon exit from above loop is: // t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t[0]|t0 (least significant) { .mmi; getf.sig n1=ni0 getf.sig n2=ni1 add r16=-7*16,prevsp} { .mmi; getf.sig n3=ni2 getf.sig n4=ni3 add r17=-6*16,prevsp};; { .mmi; getf.sig n5=ni4 getf.sig n6=ni5 add r18=-5*16,prevsp} { .mmi; getf.sig n7=ni6 getf.sig n8=ni7 // t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant) { .mmi; (p50)add t[0]=t[0],r0,1 add r18=-5*16,prevsp sub n1=t0,n1 };; { .mmi; cmp.gtu p34,p32=n1,t0;; .pred.rel "mutex",p32,p34 (p32)sub n2=t[0],n2 (p34)sub n2=t[0],n2,1 };; { .mii; (p32)cmp.gtu p35,p33=n2,t[0] (p34)cmp.geu p35,p33=n2,t[0];; (p32)sub n2=t[7],n2 (p34)sub n2=t[7],n2,1 };; { .mii; (p32)cmp.gtu p35,p33=n2,t[7] (p34)cmp.geu p35,p33=n2,t[7];; .pred.rel "mutex",p33,p35 (p33)sub n3=t[7],n3 } { .mmi; (p35)sub n3=t[7],n3,1;; (p33)cmp.gtu p34,p32=n3,t[7] (p35)cmp.geu p34,p32=n3,t[7] };; (p33)sub n3=t[6],n3 } { .mmi; (p35)sub n3=t[6],n3,1;; (p33)cmp.gtu p34,p32=n3,t[6] (p35)cmp.geu p34,p32=n3,t[6] };; .pred.rel "mutex",p32,p34 { .mii; (p32)sub n4=t[6],n4 (p34)sub n4=t[6],n4,1;; (p32)cmp.gtu p35,p33=n4,t[6] } { .mmi; (p34)cmp.geu p35,p33=n4,t[6];; { .mii; (p32)sub n4=t[5],n4 (p34)sub n4=t[5],n4,1;; (p32)cmp.gtu p35,p33=n4,t[5] } { .mmi; (p34)cmp.geu p35,p33=n4,t[5];; .pred.rel "mutex",p33,p35 (p33)sub n5=t[5],n5 (p35)sub n5=t[5],n5,1 };; { .mii; (p33)cmp.gtu p34,p32=n5,t[5] (p35)cmp.geu p34,p32=n5,t[5];; (p33)sub n5=t[4],n5 (p35)sub n5=t[4],n5,1 };; { .mii; (p33)cmp.gtu p34,p32=n5,t[4] (p35)cmp.geu p34,p32=n5,t[4];; .pred.rel "mutex",p32,p34 (p32)sub n6=t[4],n6 } { .mmi; (p34)sub n6=t[4],n6,1;; (p32)cmp.gtu p35,p33=n6,t[4] (p34)cmp.geu p35,p33=n6,t[4] };; (p32)sub n6=t[3],n6 } { .mmi; (p34)sub n6=t[3],n6,1;; (p32)cmp.gtu p35,p33=n6,t[3] (p34)cmp.geu p35,p33=n6,t[3] };; .pred.rel "mutex",p33,p35 { .mii; (p33)sub n7=t[3],n7 (p35)sub n7=t[3],n7,1;; (p33)cmp.gtu p34,p32=n7,t[3] } { .mmi; (p35)cmp.geu p34,p32=n7,t[3];; { .mii; (p33)sub n7=t[2],n7 (p35)sub n7=t[2],n7,1;; (p33)cmp.gtu p34,p32=n7,t[2] } { .mmi; (p35)cmp.geu p34,p32=n7,t[2];; .pred.rel "mutex",p32,p34 (p32)sub n8=t[2],n8 (p34)sub n8=t[2],n8,1 };; { .mii; (p32)cmp.gtu p35,p33=n8,t[2] (p34)cmp.geu p35,p33=n8,t[2];; (p32)sub n8=t[1],n8 (p34)sub n8=t[1],n8,1 };; { .mii; (p32)cmp.gtu p35,p33=n8,t[1] (p34)cmp.geu p35,p33=n8,t[1];; .pred.rel "mutex",p33,p35 (p33)sub a8=t[1],r0 } { .mmi; (p35)sub a8=t[1],r0,1;; (p33)cmp.gtu p34,p32=a8,t[1] (p35)cmp.geu p34,p32=a8,t[1] };; (p33)sub a8=t[0],r0 } { .mmi; (p35)sub a8=t[0],r0,1;; (p33)cmp.gtu p34,p32=a8,t[0] (p35)cmp.geu p34,p32=a8,t[0] };; // save the result, either tmp[num] or tmp[num]-np[num] .pred.rel "mutex",p32,p34 Loading @@ -715,25 +802,25 @@ bn_mul_mont_8: (p34)st8 [rptr]=t0,8 add r19=-4*16,prevsp};; { .mmb; (p32)st8 [rptr]=n2,8 (p34)st8 [rptr]=t[0],8 (p34)st8 [rptr]=t[7],8 (p5)br.cond.dpnt.few .Ldone };; { .mmb; (p32)st8 [rptr]=n3,8 (p34)st8 [rptr]=t[7],8 (p34)st8 [rptr]=t[6],8 (p7)br.cond.dpnt.few .Ldone };; { .mmb; (p32)st8 [rptr]=n4,8 (p34)st8 [rptr]=t[6],8 (p34)st8 [rptr]=t[5],8 (p9)br.cond.dpnt.few .Ldone };; { .mmb; (p32)st8 [rptr]=n5,8 (p34)st8 [rptr]=t[5],8 (p34)st8 [rptr]=t[4],8 (p11)br.cond.dpnt.few .Ldone };; { .mmb; (p32)st8 [rptr]=n6,8 (p34)st8 [rptr]=t[4],8 (p34)st8 [rptr]=t[3],8 (p13)br.cond.dpnt.few .Ldone };; { .mmb; (p32)st8 [rptr]=n7,8 (p34)st8 [rptr]=t[3],8 (p34)st8 [rptr]=t[2],8 (p15)br.cond.dpnt.few .Ldone };; { .mmb; (p32)st8 [rptr]=n8,8 (p34)st8 [rptr]=t[2],8 (p34)st8 [rptr]=t[1],8 nop.b 0 };; .Ldone: // epilogue { .mmi; ldf.fill f16=[r16],64 Loading Loading
crypto/bn/asm/ia64-mont.pl +150 −63 Original line number Diff line number Diff line Loading @@ -38,15 +38,15 @@ # So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with* # this module is: # sign verify sign/s verify/s # rsa 512 bits 0.000302s 0.000024s 3312.3 41332.2 # rsa 1024 bits 0.000816s 0.000058s 1225.2 17172.0 # rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4 # rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0 # rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0 # rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6 # dsa 512 bits 0.000254s 0.000206s 3944.6 4865.1 # dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0 # dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4 # dsa 2048 bits 0.001453s 0.001703s 688.1 587.4 # # ... and *without*: # ... and *without* (but still with ia64.S): # # rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5 # rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3 Loading @@ -56,9 +56,9 @@ # dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2 # dsa 2048 bits 0.001894s 0.002179s 528.1 458.9 # # As it can be seen, RSA sign performance improves by 120-30%, # hereafter less for longer keys, while verify - by 72-13%. # DSA performance improves by 100-30%. # As it can be seen, RSA sign performance improves by 130-30%, # hereafter less for longer keys, while verify - by 74-13%. # DSA performance improves by 115-30%. if ($^O eq "hpux") { $ADDP="addp4"; Loading Loading @@ -473,7 +473,7 @@ bn_mul_mont_8: mov ar.lc=r28 } { .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7] (p15)fcvt.fxu bj[0]=f0 mov ar.ec=2 } mov ar.ec=1 } { .mfi; (p12)ldf8 ni6=[nptr],16 // np[6] (p13)fcvt.fxu ni6=f0 mov pr.rot=1<<16 } Loading @@ -482,12 +482,12 @@ bn_mul_mont_8: brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16 };; // The loop is scheduled for 32*(n+1) ticks on Itanium 2. Actual // measurement with help of Interval Time Counter indicate that the // The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt // to measure with help of Interval Time Counter indicated that the // factor is a tad higher: 33 or 34, if not 35. Exact measurement and // addressing the issue is problematic, because I don't have access // to platform-specific instruction-level profiler. On Itanium it // should run in 56*(n+1) ticks, because of higher xma latency... // should run in 56*n ticks, because of higher xma latency... .Louter_8_ctop: .pred.rel "mutex",p40,p42 .pred.rel "mutex",p48,p50 Loading Loading @@ -652,62 +652,149 @@ bn_mul_mont_8: br.ctop.sptk.many .Louter_8_ctop };; .Louter_8_cend: // move np[8] to GPR bank and subtract it from carrybit|tmp[8] // above loop has to execute one more time, without (p16), which is // replaced with merged move of np[8] to GPR bank .pred.rel "mutex",p40,p42 .pred.rel "mutex",p48,p50 { .mmi; (p0) getf.sig n1=ni0 // 0: (p40) add a3=a3,n3 // (p17) a3+=n3 (p42) add a3=a3,n3,1 };; { .mii; (p17) getf.sig a7=alo[8] // 1: (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3 (p50) add t[6]=t[6],a3,1 };; { .mfi; (p17) getf.sig a8=ahi[8] // 2: (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0 (p40) cmp.ltu p43,p41=a3,n3 } { .mfi; (p42) cmp.leu p43,p41=a3,n3 (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6] (p0) nop.i 0 };; { .mii; (p17) getf.sig n5=nlo[6] // 3: (p48) cmp.ltu p51,p49=t[6],a3 (p50) cmp.leu p51,p49=t[6],a3 };; .pred.rel "mutex",p41,p43 .pred.rel "mutex",p49,p51 { .mmi; (p0) getf.sig n2=ni1 // 4: (p41) add a4=a4,n4 // (p17) a4+=n4 (p43) add a4=a4,n4,1 };; { .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4 (p0) nop.f 0 (p51) add t[5]=t[5],a4,1 };; { .mfi; (p0) getf.sig n3=ni2 // 6: (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0 (p41) cmp.ltu p42,p40=a4,n4 } { .mfi; (p43) cmp.leu p42,p40=a4,n4 (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7] (p0) nop.i 0 };; { .mii; (p17) getf.sig n6=nlo[7] // 7: (p49) cmp.ltu p50,p48=t[5],a4 (p51) cmp.leu p50,p48=t[5],a4 };; .pred.rel "mutex",p40,p42 .pred.rel "mutex",p48,p50 { .mii; (p0) getf.sig n4=ni3 // 8: (p40) add a5=a5,n5 // (p17) a5+=n5 (p42) add a5=a5,n5,1 };; { .mii; (p0) nop.m 0 // 9: (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5 (p50) add t[4]=t[4],a5,1 };; { .mii; (p0) nop.m 0 // 10: (p40) cmp.ltu p43,p41=a5,n5 (p42) cmp.leu p43,p41=a5,n5 };; { .mii; (p17) getf.sig n7=nlo[8] // 11: (p48) cmp.ltu p51,p49=t[4],a5 (p50) cmp.leu p51,p49=t[4],a5 };; .pred.rel "mutex",p41,p43 .pred.rel "mutex",p49,p51 { .mii; (p17) getf.sig n8=nhi[8] // 12: (p41) add a6=a6,n6 // (p17) a6+=n6 (p43) add a6=a6,n6,1 };; { .mii; (p0) getf.sig n5=ni4 // 13: (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6 (p51) add t[3]=t[3],a6,1 };; { .mii; (p0) nop.m 0 // 14: (p41) cmp.ltu p42,p40=a6,n6 (p43) cmp.leu p42,p40=a6,n6 };; { .mii; (p0) getf.sig n6=ni5 // 15: (p49) cmp.ltu p50,p48=t[3],a6 (p51) cmp.leu p50,p48=t[3],a6 };; .pred.rel "mutex",p40,p42 .pred.rel "mutex",p48,p50 { .mii; (p0) nop.m 0 // 16: (p40) add a7=a7,n7 // (p17) a7+=n7 (p42) add a7=a7,n7,1 };; { .mii; (p0) nop.m 0 // 17: (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7 (p50) add t[2]=t[2],a7,1 };; { .mii; (p0) nop.m 0 // 18: (p40) cmp.ltu p43,p41=a7,n7 (p42) cmp.leu p43,p41=a7,n7 };; { .mii; (p0) getf.sig n7=ni6 // 19: (p48) cmp.ltu p51,p49=t[2],a7 (p50) cmp.leu p51,p49=t[2],a7 };; .pred.rel "mutex",p41,p43 .pred.rel "mutex",p49,p51 { .mii; (p0) nop.m 0 // 20: (p41) add a8=a8,n8 // (p17) a8+=n8 (p43) add a8=a8,n8,1 };; { .mmi; (p0) nop.m 0 // 21: (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8 (p51) add t[1]=t[1],a8,1 } { .mmi; (p17) mov t[0]=r0 (p41) cmp.ltu p42,p40=a8,n8 (p43) cmp.leu p42,p40=a8,n8 };; { .mmi; (p0) getf.sig n8=ni7 // 22: (p49) cmp.ltu p50,p48=t[1],a8 (p51) cmp.leu p50,p48=t[1],a8 } { .mmi; (p42) add t[0]=t[0],r0,1 (p0) add r16=-7*16,prevsp (p0) add r17=-6*16,prevsp };; // subtract np[8] from carrybit|tmp[8] // carrybit|tmp[8] layout upon exit from above loop is: // t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t[0]|t0 (least significant) { .mmi; getf.sig n1=ni0 getf.sig n2=ni1 add r16=-7*16,prevsp} { .mmi; getf.sig n3=ni2 getf.sig n4=ni3 add r17=-6*16,prevsp};; { .mmi; getf.sig n5=ni4 getf.sig n6=ni5 add r18=-5*16,prevsp} { .mmi; getf.sig n7=ni6 getf.sig n8=ni7 // t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant) { .mmi; (p50)add t[0]=t[0],r0,1 add r18=-5*16,prevsp sub n1=t0,n1 };; { .mmi; cmp.gtu p34,p32=n1,t0;; .pred.rel "mutex",p32,p34 (p32)sub n2=t[0],n2 (p34)sub n2=t[0],n2,1 };; { .mii; (p32)cmp.gtu p35,p33=n2,t[0] (p34)cmp.geu p35,p33=n2,t[0];; (p32)sub n2=t[7],n2 (p34)sub n2=t[7],n2,1 };; { .mii; (p32)cmp.gtu p35,p33=n2,t[7] (p34)cmp.geu p35,p33=n2,t[7];; .pred.rel "mutex",p33,p35 (p33)sub n3=t[7],n3 } { .mmi; (p35)sub n3=t[7],n3,1;; (p33)cmp.gtu p34,p32=n3,t[7] (p35)cmp.geu p34,p32=n3,t[7] };; (p33)sub n3=t[6],n3 } { .mmi; (p35)sub n3=t[6],n3,1;; (p33)cmp.gtu p34,p32=n3,t[6] (p35)cmp.geu p34,p32=n3,t[6] };; .pred.rel "mutex",p32,p34 { .mii; (p32)sub n4=t[6],n4 (p34)sub n4=t[6],n4,1;; (p32)cmp.gtu p35,p33=n4,t[6] } { .mmi; (p34)cmp.geu p35,p33=n4,t[6];; { .mii; (p32)sub n4=t[5],n4 (p34)sub n4=t[5],n4,1;; (p32)cmp.gtu p35,p33=n4,t[5] } { .mmi; (p34)cmp.geu p35,p33=n4,t[5];; .pred.rel "mutex",p33,p35 (p33)sub n5=t[5],n5 (p35)sub n5=t[5],n5,1 };; { .mii; (p33)cmp.gtu p34,p32=n5,t[5] (p35)cmp.geu p34,p32=n5,t[5];; (p33)sub n5=t[4],n5 (p35)sub n5=t[4],n5,1 };; { .mii; (p33)cmp.gtu p34,p32=n5,t[4] (p35)cmp.geu p34,p32=n5,t[4];; .pred.rel "mutex",p32,p34 (p32)sub n6=t[4],n6 } { .mmi; (p34)sub n6=t[4],n6,1;; (p32)cmp.gtu p35,p33=n6,t[4] (p34)cmp.geu p35,p33=n6,t[4] };; (p32)sub n6=t[3],n6 } { .mmi; (p34)sub n6=t[3],n6,1;; (p32)cmp.gtu p35,p33=n6,t[3] (p34)cmp.geu p35,p33=n6,t[3] };; .pred.rel "mutex",p33,p35 { .mii; (p33)sub n7=t[3],n7 (p35)sub n7=t[3],n7,1;; (p33)cmp.gtu p34,p32=n7,t[3] } { .mmi; (p35)cmp.geu p34,p32=n7,t[3];; { .mii; (p33)sub n7=t[2],n7 (p35)sub n7=t[2],n7,1;; (p33)cmp.gtu p34,p32=n7,t[2] } { .mmi; (p35)cmp.geu p34,p32=n7,t[2];; .pred.rel "mutex",p32,p34 (p32)sub n8=t[2],n8 (p34)sub n8=t[2],n8,1 };; { .mii; (p32)cmp.gtu p35,p33=n8,t[2] (p34)cmp.geu p35,p33=n8,t[2];; (p32)sub n8=t[1],n8 (p34)sub n8=t[1],n8,1 };; { .mii; (p32)cmp.gtu p35,p33=n8,t[1] (p34)cmp.geu p35,p33=n8,t[1];; .pred.rel "mutex",p33,p35 (p33)sub a8=t[1],r0 } { .mmi; (p35)sub a8=t[1],r0,1;; (p33)cmp.gtu p34,p32=a8,t[1] (p35)cmp.geu p34,p32=a8,t[1] };; (p33)sub a8=t[0],r0 } { .mmi; (p35)sub a8=t[0],r0,1;; (p33)cmp.gtu p34,p32=a8,t[0] (p35)cmp.geu p34,p32=a8,t[0] };; // save the result, either tmp[num] or tmp[num]-np[num] .pred.rel "mutex",p32,p34 Loading @@ -715,25 +802,25 @@ bn_mul_mont_8: (p34)st8 [rptr]=t0,8 add r19=-4*16,prevsp};; { .mmb; (p32)st8 [rptr]=n2,8 (p34)st8 [rptr]=t[0],8 (p34)st8 [rptr]=t[7],8 (p5)br.cond.dpnt.few .Ldone };; { .mmb; (p32)st8 [rptr]=n3,8 (p34)st8 [rptr]=t[7],8 (p34)st8 [rptr]=t[6],8 (p7)br.cond.dpnt.few .Ldone };; { .mmb; (p32)st8 [rptr]=n4,8 (p34)st8 [rptr]=t[6],8 (p34)st8 [rptr]=t[5],8 (p9)br.cond.dpnt.few .Ldone };; { .mmb; (p32)st8 [rptr]=n5,8 (p34)st8 [rptr]=t[5],8 (p34)st8 [rptr]=t[4],8 (p11)br.cond.dpnt.few .Ldone };; { .mmb; (p32)st8 [rptr]=n6,8 (p34)st8 [rptr]=t[4],8 (p34)st8 [rptr]=t[3],8 (p13)br.cond.dpnt.few .Ldone };; { .mmb; (p32)st8 [rptr]=n7,8 (p34)st8 [rptr]=t[3],8 (p34)st8 [rptr]=t[2],8 (p15)br.cond.dpnt.few .Ldone };; { .mmb; (p32)st8 [rptr]=n8,8 (p34)st8 [rptr]=t[2],8 (p34)st8 [rptr]=t[1],8 nop.b 0 };; .Ldone: // epilogue { .mmi; ldf.fill f16=[r16],64 Loading