1 #include "../bn_lcl.h"
2 #if !(defined(__GNUC__) && __GNUC__>=2)
3 # include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
4 #else
5 /*
6 * x86_64 BIGNUM accelerator version 0.1, December 2002.
7 *
8 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
9 * project.
10 *
11 * Rights for redistribution and usage in source and binary forms are
12 * granted according to the OpenSSL license. Warranty of any kind is
13 * disclaimed.
14 *
15 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
16 * versions, like 1.0...
17 * A. Well, that's because this code is basically a quick-n-dirty
18 * proof-of-concept hack. As you can see it's implemented with
19 * inline assembler, which means that you're bound to GCC and that
20 * there might be enough room for further improvement.
21 *
22 * Q. Why inline assembler?
23 * A. x86_64 features own ABI which I'm not familiar with. This is
55 * machine.
56 */
57
58 #ifdef _WIN64
59 #define BN_ULONG unsigned long long
60 #else
61 #define BN_ULONG unsigned long
62 #endif
63
64 #undef mul
65 #undef mul_add
66 #undef sqr
67
68 /*
69 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
70 * "g"(0) let the compiler to decide where does it
71 * want to keep the value of zero;
72 */
/*
 * mul_add(r,a,word,carry): one word of a multiply-accumulate row:
 *   {high,low} = a * word;  r += low + carry (carries ripple into high);
 *   carry = high.
 * Spelled __asm__ (not plain `asm`) so the file also compiles under
 * strict ISO modes (-ansi/-std=c99 -pedantic), matching GNU C's
 * alternate-keyword convention.
 */
#define mul_add(r,a,word,carry) do {	\
	register BN_ULONG high,low;	\
	__asm__ ("mulq %3"		\
		: "=a"(low),"=d"(high)	\
		: "a"(word),"m"(a)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(carry),"+d"(high)\
		: "a"(low),"g"(0)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+m"(r),"+d"(high)	\
		: "r"(carry),"g"(0)	\
		: "cc");		\
	carry=high;			\
} while (0)
89
/*
 * mul(r,a,word,carry): {high,low} = a * word; r = low + carry;
 * carry = high + carry-out of the low addition.
 * __asm__ keyword keeps the file strict-ISO clean (plain `asm` is a
 * GNU extension that disappears under -std=c99 -pedantic).
 */
#define mul(r,a,word,carry) do {	\
	register BN_ULONG high,low;	\
	__asm__ ("mulq %3"		\
		: "=a"(low),"=d"(high)	\
		: "a"(word),"g"(a)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(carry),"+d"(high)\
		: "a"(low),"g"(0)	\
		: "cc");		\
	(r)=carry, carry=high;		\
} while (0)
102
/*
 * sqr(r0,r1,a): {r1,r0} = a*a -- full 128-bit square of one word
 * (r0 = low half, r1 = high half).  __asm__ spelling for strict ISO
 * compatibility; the macro carries its own trailing semicolon, as in
 * the original.
 */
#define sqr(r0,r1,a)			\
	__asm__ ("mulq %2"		\
		: "=a"(r0),"=d"(r1)	\
		: "a"(a)		\
		: "cc");
108
109 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
110 {
111 BN_ULONG c1=0;
112
113 if (num <= 0) return(c1);
114
115 while (num&~3)
116 {
117 mul_add(rp[0],ap[0],w,c1);
118 mul_add(rp[1],ap[1],w,c1);
119 mul_add(rp[2],ap[2],w,c1);
120 mul_add(rp[3],ap[3],w,c1);
121 ap+=4; rp+=4; num-=4;
122 }
123 if (num)
124 {
159
160 while (n&~3)
161 {
162 sqr(r[0],r[1],a[0]);
163 sqr(r[2],r[3],a[1]);
164 sqr(r[4],r[5],a[2]);
165 sqr(r[6],r[7],a[3]);
166 a+=4; r+=8; n-=4;
167 }
168 if (n)
169 {
170 sqr(r[0],r[1],a[0]); if (--n == 0) return;
171 sqr(r[2],r[3],a[1]); if (--n == 0) return;
172 sqr(r[4],r[5],a[2]);
173 }
174 }
175
176 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
177 { BN_ULONG ret,waste;
178
179 asm ("divq %4"
180 : "=a"(ret),"=d"(waste)
181 : "a"(l),"d"(h),"g"(d)
182 : "cc");
183
184 return ret;
185 }
186
187 BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
188 { BN_ULONG ret=0,i=0;
189
190 if (n <= 0) return 0;
191
192 asm (
193 " subq %2,%2 \n"
194 ".p2align 4 \n"
195 "1: movq (%4,%2,8),%0 \n"
196 " adcq (%5,%2,8),%0 \n"
197 " movq %0,(%3,%2,8) \n"
198 " leaq 1(%2),%2 \n"
199 " loop 1b \n"
200 " sbbq %0,%0 \n"
201 : "=&a"(ret),"+c"(n),"=&r"(i)
202 : "r"(rp),"r"(ap),"r"(bp)
203 : "cc"
204 );
205
206 return ret&1;
207 }
208
209 #ifndef SIMICS
210 BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
211 { BN_ULONG ret=0,i=0;
212
213 if (n <= 0) return 0;
214
215 asm (
216 " subq %2,%2 \n"
217 ".p2align 4 \n"
218 "1: movq (%4,%2,8),%0 \n"
219 " sbbq (%5,%2,8),%0 \n"
220 " movq %0,(%3,%2,8) \n"
221 " leaq 1(%2),%2 \n"
222 " loop 1b \n"
223 " sbbq %0,%0 \n"
224 : "=&a"(ret),"+c"(n),"=&r"(i)
225 : "r"(rp),"r"(ap),"r"(bp)
226 : "cc"
227 );
228
229 return ret&1;
230 }
231 #else
232 /* Simics 1.4<7 has buggy sbbq:-( */
233 #define BN_MASK2 0xffffffffffffffffL
234 BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
235 {
277 /* original macros are kept for reference purposes */
/*
 * Portable reference mul_add_c: (c2,c1,c0) += a*b, where t1/t2 are
 * scratch BN_ULONGs declared by the caller and BN_UMULT_HIGH(a,b)
 * yields the high 64 bits of the product.  Carry-out of each unsigned
 * addition is detected with the (sum < addend) idiom.
 */
#define mul_add_c(a,b,c0,c1,c2) {	\
	BN_ULONG ta=(a),tb=(b);		\
	t1 = ta * tb;			\
	t2 = BN_UMULT_HIGH(ta,tb);	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}
285
/*
 * Portable reference mul_add_c2: (c2,c1,c0) += 2*a*b (the doubled
 * cross term used when squaring).  The 128-bit product is doubled
 * first -- each doubling's carry-out goes one word up -- then
 * accumulated; t1/t2 are caller-declared scratch words.
 */
#define mul_add_c2(a,b,c0,c1,c2) {	\
	BN_ULONG ta=(a),tb=(b),t0;	\
	t1 = BN_UMULT_HIGH(ta,tb);	\
	t0 = ta * tb;			\
	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}
295 #else
/*
 * mul_add_c(a,b,c0,c1,c2): comba inner step -- accumulate the 128-bit
 * product a*b into the three-word accumulator (c0,c1,c2); t1/t2 are
 * scratch BN_ULONGs declared by the caller.  __asm__ spelling for
 * strict ISO compatibility.
 */
#define mul_add_c(a,b,c0,c1,c2) do {	\
	__asm__ ("mulq %3"		\
		: "=a"(t1),"=d"(t2)	\
		: "a"(a),"m"(b)		\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c0),"+d"(t2)	\
		: "a"(t1),"g"(0)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c1),"+r"(c2)	\
		: "d"(t2),"g"(0)	\
		: "cc");		\
	} while (0)
310
/*
 * sqr_add_c(a,i,c0,c1,c2): accumulate the 128-bit square a[i]*a[i]
 * into (c0,c1,c2); t1/t2 are caller-declared scratch words.
 * __asm__ spelling for strict ISO compatibility.
 */
#define sqr_add_c(a,i,c0,c1,c2) do {	\
	__asm__ ("mulq %2"		\
		: "=a"(t1),"=d"(t2)	\
		: "a"(a[i])		\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c0),"+d"(t2)	\
		: "a"(t1),"g"(0)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c1),"+r"(c2)	\
		: "d"(t2),"g"(0)	\
		: "cc");		\
	} while (0)
325
/*
 * mul_add_c2(a,b,c0,c1,c2): accumulate 2*a*b into (c0,c1,c2).
 * The product {t2,t1} is doubled in place -- first t2 (carry-out into
 * c2), then t1 (carry-out into t2) -- before the usual three-word
 * accumulation.  t1/t2 are caller-declared scratch words.
 * __asm__ spelling for strict ISO compatibility.
 */
#define mul_add_c2(a,b,c0,c1,c2) do {	\
	__asm__ ("mulq %3"		\
		: "=a"(t1),"=d"(t2)	\
		: "a"(a),"m"(b)		\
		: "cc");		\
	__asm__ ("addq %0,%0; adcq %2,%1"	\
		: "+d"(t2),"+r"(c2)	\
		: "g"(0)		\
		: "cc");		\
	__asm__ ("addq %0,%0; adcq %2,%1"	\
		: "+a"(t1),"+d"(t2)	\
		: "g"(0)		\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c0),"+d"(t2)	\
		: "a"(t1),"g"(0)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c1),"+r"(c2)	\
		: "d"(t2),"g"(0)	\
		: "cc");		\
	} while (0)
348 #endif
349
/* sqr_add_c2(a,i,j,c0,c1,c2): accumulate 2*a[i]*a[j] -- the doubled
 * cross term of a squaring -- into the accumulator (c0,c1,c2). */
#define sqr_add_c2(a,i,j,c0,c1,c2)	\
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
352
353 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
354 {
355 BN_ULONG t1,t2;
356 BN_ULONG c1,c2,c3;
357
358 c1=0;
359 c2=0;
360 c3=0;
361 mul_add_c(a[0],b[0],c1,c2,c3);
362 r[0]=c1;
363 c1=0;
|
1 #include <bn_lcl.h>
2 #if !(defined(__GNUC__) && __GNUC__>=2)
3 # include "bn_asm.c" /* kind of dirty hack for Sun Studio */
4 #else
5 /*
6 * x86_64 BIGNUM accelerator version 0.1, December 2002.
7 *
8 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
9 * project.
10 *
11 * Rights for redistribution and usage in source and binary forms are
12 * granted according to the OpenSSL license. Warranty of any kind is
13 * disclaimed.
14 *
15 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
16 * versions, like 1.0...
17 * A. Well, that's because this code is basically a quick-n-dirty
18 * proof-of-concept hack. As you can see it's implemented with
19 * inline assembler, which means that you're bound to GCC and that
20 * there might be enough room for further improvement.
21 *
22 * Q. Why inline assembler?
23 * A. x86_64 features own ABI which I'm not familiar with. This is
55 * machine.
56 */
57
58 #ifdef _WIN64
59 #define BN_ULONG unsigned long long
60 #else
61 #define BN_ULONG unsigned long
62 #endif
63
64 #undef mul
65 #undef mul_add
66 #undef sqr
67
68 /*
69 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
70 * "g"(0) let the compiler to decide where does it
71 * want to keep the value of zero;
72 */
/*
 * mul_add(r,a,word,carry): one word of a multiply-accumulate row:
 *   {high,low} = a * word;  r += low + carry (carries ripple into high);
 *   carry = high.
 * The "m"(a)/"+m"(r) and "g"(0) constraint choices follow the note
 * above (DirectPath decoding; let the compiler place the zero).
 */
#define mul_add(r,a,word,carry) do {	\
	register BN_ULONG high,low;	\
	__asm__ ("mulq %3"		\
		: "=a"(low),"=d"(high)	\
		: "a"(word),"m"(a)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(carry),"+d"(high)\
		: "a"(low),"g"(0)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+m"(r),"+d"(high)	\
		: "r"(carry),"g"(0)	\
		: "cc");		\
	carry=high;			\
} while (0)
89
/*
 * mul(r,a,word,carry): {high,low} = a * word; r = low + carry;
 * carry = high + carry-out of the low addition.  Like mul_add but
 * r is overwritten instead of accumulated.
 */
#define mul(r,a,word,carry) do {	\
	register BN_ULONG high,low;	\
	__asm__ ("mulq %3"		\
		: "=a"(low),"=d"(high)	\
		: "a"(word),"g"(a)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(carry),"+d"(high)\
		: "a"(low),"g"(0)	\
		: "cc");		\
	(r)=carry, carry=high;		\
} while (0)
102
/*
 * sqr(r0,r1,a): {r1,r0} = a*a -- full 128-bit square of one word
 * (r0 = low half, r1 = high half).  Note the macro carries its own
 * trailing semicolon.
 */
#define sqr(r0,r1,a)			\
	__asm__ ("mulq %2"		\
		: "=a"(r0),"=d"(r1)	\
		: "a"(a)		\
		: "cc");
108
109 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
110 {
111 BN_ULONG c1=0;
112
113 if (num <= 0) return(c1);
114
115 while (num&~3)
116 {
117 mul_add(rp[0],ap[0],w,c1);
118 mul_add(rp[1],ap[1],w,c1);
119 mul_add(rp[2],ap[2],w,c1);
120 mul_add(rp[3],ap[3],w,c1);
121 ap+=4; rp+=4; num-=4;
122 }
123 if (num)
124 {
159
160 while (n&~3)
161 {
162 sqr(r[0],r[1],a[0]);
163 sqr(r[2],r[3],a[1]);
164 sqr(r[4],r[5],a[2]);
165 sqr(r[6],r[7],a[3]);
166 a+=4; r+=8; n-=4;
167 }
168 if (n)
169 {
170 sqr(r[0],r[1],a[0]); if (--n == 0) return;
171 sqr(r[2],r[3],a[1]); if (--n == 0) return;
172 sqr(r[4],r[5],a[2]);
173 }
174 }
175
/*
 * bn_div_words: 128-by-64-bit division.  Returns the quotient of the
 * double word h:l (h = high half) divided by d; the remainder is
 * discarded ("waste").  NOTE(review): divq faults (#DE) when the
 * quotient does not fit in 64 bits, so callers must ensure h < d.
 */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{	BN_ULONG ret,waste;

	__asm__ ("divq %4"
		: "=a"(ret),"=d"(waste)
		: "a"(l),"d"(h),"g"(d)
		: "cc");

	return ret;
}
186
187 BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
188 { BN_ULONG ret=0,i=0;
189
190 if (n <= 0) return 0;
191
192 __asm__ (
193 " subq %2,%2 \n"
194 ".p2align 4 \n"
195 "1: movq (%4,%2,8),%0 \n"
196 " adcq (%5,%2,8),%0 \n"
197 " movq %0,(%3,%2,8) \n"
198 " leaq 1(%2),%2 \n"
199 " loop 1b \n"
200 " sbbq %0,%0 \n"
201 : "=&a"(ret),"+c"(n),"=&r"(i)
202 : "r"(rp),"r"(ap),"r"(bp)
203 : "cc"
204 );
205
206 return ret&1;
207 }
208
209 #ifndef SIMICS
210 BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
211 { BN_ULONG ret=0,i=0;
212
213 if (n <= 0) return 0;
214
215 __asm__ (
216 " subq %2,%2 \n"
217 ".p2align 4 \n"
218 "1: movq (%4,%2,8),%0 \n"
219 " sbbq (%5,%2,8),%0 \n"
220 " movq %0,(%3,%2,8) \n"
221 " leaq 1(%2),%2 \n"
222 " loop 1b \n"
223 " sbbq %0,%0 \n"
224 : "=&a"(ret),"+c"(n),"=&r"(i)
225 : "r"(rp),"r"(ap),"r"(bp)
226 : "cc"
227 );
228
229 return ret&1;
230 }
231 #else
232 /* Simics 1.4<7 has buggy sbbq:-( */
233 #define BN_MASK2 0xffffffffffffffffL
234 BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
235 {
277 /* original macros are kept for reference purposes */
/*
 * Portable reference mul_add_c: (c2,c1,c0) += a*b, where t1/t2 are
 * scratch BN_ULONGs declared by the caller and BN_UMULT_HIGH(a,b)
 * yields the high 64 bits of the product.  Carry-out of each unsigned
 * addition is detected with the (sum < addend) idiom.
 */
#define mul_add_c(a,b,c0,c1,c2) {	\
	BN_ULONG ta=(a),tb=(b);		\
	t1 = ta * tb;			\
	t2 = BN_UMULT_HIGH(ta,tb);	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}
285
/*
 * Portable reference mul_add_c2: (c2,c1,c0) += 2*a*b (the doubled
 * cross term used when squaring).  The 128-bit product is doubled
 * first -- each doubling's carry-out goes one word up -- then
 * accumulated; t1/t2 are caller-declared scratch words.
 */
#define mul_add_c2(a,b,c0,c1,c2) {	\
	BN_ULONG ta=(a),tb=(b),t0;	\
	t1 = BN_UMULT_HIGH(ta,tb);	\
	t0 = ta * tb;			\
	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}
295 #else
/*
 * mul_add_c(a,b,c0,c1,c2): comba inner step -- accumulate the 128-bit
 * product a*b into the three-word accumulator (c0,c1,c2); t1/t2 are
 * scratch BN_ULONGs declared by the caller.
 */
#define mul_add_c(a,b,c0,c1,c2) do {	\
	__asm__ ("mulq %3"		\
		: "=a"(t1),"=d"(t2)	\
		: "a"(a),"m"(b)		\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c0),"+d"(t2)	\
		: "a"(t1),"g"(0)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c1),"+r"(c2)	\
		: "d"(t2),"g"(0)	\
		: "cc");		\
	} while (0)
310
/*
 * sqr_add_c(a,i,c0,c1,c2): accumulate the 128-bit square a[i]*a[i]
 * into the three-word accumulator (c0,c1,c2); t1/t2 are
 * caller-declared scratch words.
 */
#define sqr_add_c(a,i,c0,c1,c2) do {	\
	__asm__ ("mulq %2"		\
		: "=a"(t1),"=d"(t2)	\
		: "a"(a[i])		\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c0),"+d"(t2)	\
		: "a"(t1),"g"(0)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c1),"+r"(c2)	\
		: "d"(t2),"g"(0)	\
		: "cc");		\
	} while (0)
325
/*
 * mul_add_c2(a,b,c0,c1,c2): accumulate 2*a*b into (c0,c1,c2).
 * The product {t2,t1} is doubled in place -- first t2 (carry-out into
 * c2), then t1 (carry-out into t2) -- before the usual three-word
 * accumulation.  t1/t2 are caller-declared scratch words.
 */
#define mul_add_c2(a,b,c0,c1,c2) do {	\
	__asm__ ("mulq %3"		\
		: "=a"(t1),"=d"(t2)	\
		: "a"(a),"m"(b)		\
		: "cc");		\
	__asm__ ("addq %0,%0; adcq %2,%1"	\
		: "+d"(t2),"+r"(c2)	\
		: "g"(0)		\
		: "cc");		\
	__asm__ ("addq %0,%0; adcq %2,%1"	\
		: "+a"(t1),"+d"(t2)	\
		: "g"(0)		\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c0),"+d"(t2)	\
		: "a"(t1),"g"(0)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c1),"+r"(c2)	\
		: "d"(t2),"g"(0)	\
		: "cc");		\
	} while (0)
348 #endif
349
/* sqr_add_c2(a,i,j,c0,c1,c2): accumulate 2*a[i]*a[j] -- the doubled
 * cross term of a squaring -- into the accumulator (c0,c1,c2). */
#define sqr_add_c2(a,i,j,c0,c1,c2)	\
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
352
353 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
354 {
355 BN_ULONG t1,t2;
356 BN_ULONG c1,c2,c3;
357
358 c1=0;
359 c2=0;
360 c3=0;
361 mul_add_c(a[0],b[0],c1,c2,c3);
362 r[0]=c1;
363 c1=0;
|