Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
.explicit
.text
.ident "ia64.S, Version 2.1"
.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
//
// ====================================================================
// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
// project.
//
// Rights for redistribution and usage in source and binary forms are
// granted according to the OpenSSL license. Warranty of any kind is
// disclaimed.
// ====================================================================
//
// Version 2.x is Itanium2 re-tune. Few words about how Itanum2 is
// different from Itanium to this module viewpoint. Most notably, is it
// "wider" than Itanium? Can you experience loop scalability as
// discussed in commentary sections? Not really:-( Itanium2 has 6
// integer ALU ports, i.e. it's 2 ports wider, but it's not enough to
// spin twice as fast, as I need 8 IALU ports. Amount of floating point
// ports is the same, i.e. 2, while I need 4. In other words, to this
// module Itanium2 remains effectively as "wide" as Itanium. Yet it's
// essentially different in respect to this module, and a re-tune was
// required. Well, because some intruction latencies has changed. Most
// noticeably those intensively used:
//
// Itanium Itanium2
// ldf8 9 6 L2 hit
// ld8 2 1 L1 hit
// getf 2 5
// xma[->getf] 7[+1] 4[+0]
// add[->st8] 1[+1] 1[+0]
//
// What does it mean? You might ratiocinate that the original code
// should run just faster... Because sum of latencies is smaller...
// Wrong! Note that getf latency increased. This means that if a loop is
// scheduled for lower latency (as they were), then it will suffer from
// stall condition and the code will therefore turn anti-scalable, e.g.
// original bn_mul_words spun at 5*n or 2.5 times slower than expected
// on Itanium2! What to do? Reschedule loops for Itanium2? But then
// Itanium would exhibit anti-scalability. So I've chosen to reschedule
// for worst latency for every instruction aiming for best *all-round*
// performance.
// Q. How much faster does it get?
// A. Here is the output from 'openssl speed rsa dsa' for vanilla
// 0.9.6a compiled with gcc version 2.96 20000731 (Red Hat
// Linux 7.1 2.96-81):
//
// sign verify sign/s verify/s
// rsa 512 bits 0.0036s 0.0003s 275.3 2999.2
// rsa 1024 bits 0.0203s 0.0011s 49.3 894.1
// rsa 2048 bits 0.1331s 0.0040s 7.5 250.9
// rsa 4096 bits 0.9270s 0.0147s 1.1 68.1
// sign verify sign/s verify/s
// dsa 512 bits 0.0035s 0.0043s 288.3 234.8
// dsa 1024 bits 0.0111s 0.0135s 90.0 74.2
//
// And here is similar output but for this assembler
// implementation:-)
//
// sign verify sign/s verify/s
// rsa 512 bits 0.0021s 0.0001s 549.4 9638.5
// rsa 1024 bits 0.0055s 0.0002s 183.8 4481.1
// rsa 2048 bits 0.0244s 0.0006s 41.4 1726.3
// rsa 4096 bits 0.1295s 0.0018s 7.7 561.5
// sign verify sign/s verify/s
// dsa 512 bits 0.0012s 0.0013s 891.9 756.6
// dsa 1024 bits 0.0023s 0.0028s 440.4 376.2
//
// Yes, you may argue that it's not fair comparison as it's
// possible to craft the C implementation with BN_UMULT_HIGH
// inline assembler macro. But of course! Here is the output
// with the macro:
//
// sign verify sign/s verify/s
// rsa 512 bits 0.0020s 0.0002s 495.0 6561.0
// rsa 1024 bits 0.0086s 0.0004s 116.2 2235.7
// rsa 2048 bits 0.0519s 0.0015s 19.3 667.3
// rsa 4096 bits 0.3464s 0.0053s 2.9 187.7
// sign verify sign/s verify/s
// dsa 512 bits 0.0016s 0.0020s 613.1 510.5
// dsa 1024 bits 0.0045s 0.0054s 221.0 183.9
//
// My code is still way faster, huh:-) And I believe that even
// higher performance can be achieved. Note that as keys get
// longer, performance gain is larger. Why? According to the
// profiler there is another player in the field, namely
// BN_from_montgomery consuming larger and larger portion of CPU
// time as keysize decreases. I therefore consider putting effort
// to assembler implementation of the following routine:
//
// void bn_mul_add_mont (BN_ULONG *rp,BN_ULONG *np,int nl,BN_ULONG n0)
// {
// int i,j;
// BN_ULONG v;
//
// for (i=0; i<nl; i++)
// {
// v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
// nrp++;
// rp++;
// if (((nrp[-1]+=v)&BN_MASK2) < v)
// for (j=0; ((++nrp[j])&BN_MASK2) == 0; j++) ;
// }
// }
//
// It might as well be beneficial to implement even combaX
// variants, as it appears as it can literally unleash the
// performance (see comment section to bn_mul_comba8 below).
//
// And finally for your reference the output for 0.9.6a compiled
// with SGIcc version 0.01.0-12 (keep in mind that for the moment
// of this writing it's not possible to convince SGIcc to use
// BN_UMULT_HIGH inline assembler macro, yet the code is fast,
// i.e. for a compiler generated one:-):
//
// sign verify sign/s verify/s
// rsa 512 bits 0.0022s 0.0002s 452.7 5894.3
// rsa 1024 bits 0.0097s 0.0005s 102.7 2002.9
// rsa 2048 bits 0.0578s 0.0017s 17.3 600.2
// rsa 4096 bits 0.3838s 0.0061s 2.6 164.5
// sign verify sign/s verify/s
// dsa 512 bits 0.0018s 0.0022s 547.3 459.6
// dsa 1024 bits 0.0051s 0.0062s 196.6 161.3
//
// Oh! Benchmarks were performed on 733MHz Lion-class Itanium
// system running Redhat Linux 7.1 (very special thanks to Ray
// McCaffity of Williams Communications for providing an account).
//
// Q. What's the heck with 'rum 1<<5' at the end of every function?
// A. Well, by clearing the "upper FP registers written" bit of the
// User Mask I want to excuse the kernel from preserving upper
// (f32-f128) FP register bank over process context switch, thus
// minimizing bus bandwidth consumption during the switch (i.e.
// after PKI opration completes and the program is off doing
// something else like bulk symmetric encryption). Having said
// this, I also want to point out that it might be good idea
// to compile the whole toolkit (as well as majority of the
// programs for that matter) with -mfixed-range=f32-f127 command
// line option. No, it doesn't prevent the compiler from writing
// to upper bank, but at least discourages to do so. If you don't
// like the idea you have the option to compile the module with
// -Drum=nop.m in command line.
//
#if defined(_HPUX_SOURCE) && !defined(_LP64)
#define ADDP addp4
#else
#define ADDP add
#endif
#if 1
//
// bn_[add|sub]_words routines.
//
// Loops are spinning in 2*(n+5) ticks on Itanuim (provided that the
// data reside in L1 cache, i.e. 2 ticks away). It's possible to
// compress the epilogue and get down to 2*n+6, but at the cost of
// scalability (the neat feature of this implementation is that it
// shall automagically spin in n+5 on "wider" IA-64 implementations:-)
// I consider that the epilogue is short enough as it is to trade tiny
// performance loss on Itanium for scalability.
//
// BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
//
.global bn_add_words#
.proc bn_add_words#
.align 64
.skip 32 // makes the loop body aligned at 64-byte boundary
bn_add_words:
.prologue
.save ar.pfs,r2
{ .mii; alloc r2=ar.pfs,4,12,0,16
cmp4.le p6,p0=r35,r0 };;
{ .mfb; mov r8=r0 // return value
(p6) br.ret.spnt.many b0 };;
{ .mib; sub r10=r35,r0,1
.save ar.lc,r3
mov r3=ar.lc
brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16
}
{ .mib; ADDP r14=0,r32 // rp
.save pr,r9
mov r9=pr };;
.body
{ .mii; ADDP r15=0,r33 // ap
mov ar.lc=r10
mov ar.ec=6 }
{ .mib; ADDP r16=0,r34 // bp
mov pr.rot=1<<16 };;
.L_bn_add_words_ctop:
{ .mii; (p16) ld8 r32=[r16],8 // b=*(bp++)
(p18) add r39=r37,r34
(p19) cmp.ltu.unc p56,p0=r40,r38 }
{ .mfb; (p0) nop.m 0x0
(p0) nop.f 0x0
Loading
Loading full blame…