1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
23 */
24 /*
25 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
26 * Use is subject to license terms.
27 */
28
	.file	"__vhypot.S"

#include "libm.h"

	RO_DATA
	.align	64

! Double-precision constants; each .word pair is { hi-word, lo-word }
! of an IEEE-754 double.
.CONST_TBL:
	.word	0x7ff00000, 0	! DC0: exponent-field mask; also the hi word of +Inf
	.word	0x7fe00000, 0	! DC1: 2^1023 -- "huge argument" threshold
	.word	0x00100000, 0	! DC2: 2^-1022 -- "tiny argument" threshold
	.word	0x41b00000, 0	! D2ON28 = 268435456.0 = 2^28 (hi/lo split constant)
	.word	0x7fd00000, 0	! DC3: 2^1022 -- rescale factor for special paths

/* Integer register roles (kept live across the whole function) */
#define counter		%i0	/* elements remaining in this pass */
#define tmp_counter	%l3	/* elements deferred to the next .begin pass */
#define tmp_px		%l5	/* restart pointer for x */
#define tmp_py		%o7	/* restart pointer for y */
#define stridex		%i2	/* byte strides (element stride << 3) */
#define stridey		%i4
#define stridez		%l0

/* FP register aliases for the constants loaded from .CONST_TBL */
#define DC0		%f8
#define DC0_HI		%f8
#define DC0_LO		%f9
#define DC1		%f46
#define DC2		%f48
#define DC3		%f0
#define D2ON28		%f62
58
59 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
60 ! !!!!! algorithm !!!!!
61 ! ((float*)&x)[0] = ((float*)px)[0];
62 ! ((float*)&x)[1] = ((float*)px)[1];
63 !
64 ! ((float*)&y)[0] = ((float*)py)[0];
65 ! ((float*)&y)[1] = ((float*)py)[1];
66 !
67 ! x = fabs(x);
68 ! y = fabs(y);
69 !
70 ! c0 = vis_fcmple32(DC1,x);
71 ! c2 = vis_fcmple32(DC1,y);
72 ! c1 = vis_fcmpgt32(DC2,x);
73 ! c3 = vis_fcmpgt32(DC2,y);
74 !
75 ! c0 |= c2;
76 ! c1 &= c3;
77 ! if ( (c0 & 2) != 0 )
78 ! {
79 ! lx = ((int*)px)[1];
80 ! ly = ((int*)py)[1];
81 ! hx = *(int*)px;
82 ! hy = *(int*)py;
83 !
84 ! hx &= 0x7fffffff;
85 ! hy &= 0x7fffffff;
86 !
87 ! j0 = hx;
88 ! if ( j0 < hy ) j0 = hy;
89 ! j0 &= 0x7ff00000;
90 ! if ( j0 >= 0x7ff00000 )
91 ! {
92 ! if ( hx == 0x7ff00000 && lx == 0 ) res = x == y ? y : x;
93 ! else if ( hy == 0x7ff00000 && ly == 0 ) res = x == y ? x : y;
94 ! else res = x * y;
95 !
96 ! ((float*)pz)[0] = ((float*)&res)[0];
97 ! ((float*)pz)[1] = ((float*)&res)[1];
98 ! }
99 ! else
100 ! {
101 ! diff = hy - hx;
102 ! j0 = diff >> 31;
103 ! if ( ((diff ^ j0) - j0) < 0x03600000 )
! {
105 ! x *= D2ONM1022;
106 ! y *= D2ONM1022;
107 !
108 ! x_hi = ( x + two28 ) - two28;
109 ! x_lo = x - x_hi;
110 ! y_hi = ( y + two28 ) - two28;
111 ! y_lo = y - y_hi;
112 ! res = (x_hi * x_hi + y_hi * y_hi);
113 ! res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo);
114 !
115 ! res = sqrt(res);
116 !
117 ! res = D2ONP1022 * res;
118 ! ((float*)pz)[0] = ((float*)&res)[0];
119 ! ((float*)pz)[1] = ((float*)&res)[1];
120 ! }
121 ! else
122 ! {
123 ! res = x + y;
124 ! ((float*)pz)[0] = ((float*)&res)[0];
125 ! ((float*)pz)[1] = ((float*)&res)[1];
126 ! }
127 ! }
128 ! px += stridex;
129 ! py += stridey;
130 ! pz += stridez;
131 ! continue;
132 ! }
133 ! if ( (c1 & 2) != 0 )
134 ! {
135 ! x *= D2ONP1022;
136 ! y *= D2ONP1022;
137 !
138 ! x_hi = ( x + two28 ) - two28;
139 ! x_lo = x - x_hi;
140 ! y_hi = ( y + two28 ) - two28;
141 ! y_lo = y - y_hi;
142 ! res = (x_hi * x_hi + y_hi * y_hi);
143 ! res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo);
144 !
145 ! res = sqrt(res);
146 !
147 ! res = D2ONM1022 * res;
148 ! ((float*)pz)[0] = ((float*)&res)[0];
149 ! ((float*)pz)[1] = ((float*)&res)[1];
150 ! px += stridex;
151 ! py += stridey;
152 ! pz += stridez;
153 ! continue;
154 ! }
155 !
156 ! dmax = x;
157 ! if ( dmax < y ) dmax = y;
158 !
159 ! dmax = vis_fand(dmax,DC0);
160 ! dnorm = vis_fpsub32(DC1,dmax);
161 !
162 ! x *= dnorm;
163 ! y *= dnorm;
164 !
165 ! x_hi = x + D2ON28;
166 ! x_hi -= D2ON28;
167 ! x_lo = x - x_hi;
168 !
169 ! y_hi = y + D2ON28;
170 ! y_hi -= D2ON28;
171 ! y_lo = y - y_hi;
172 !
173 ! res = x_hi * x_hi;
174 ! dtmp1 = x + x_hi;
175 ! dtmp0 = y_hi * y_hi;
176 ! dtmp2 = y + y_hi;
177 !
178 ! res += dtmp0;
179 ! dtmp1 *= x_lo;
180 ! dtmp2 *= y_lo;
181 ! dtmp1 += dtmp2;
182 ! res += dtmp1;
183 !
184 ! res = sqrt(res);
185 !
186 ! res = dmax * res;
187 ! ((float*)pz)[0] = ((float*)&res)[0];
188 ! ((float*)pz)[1] = ((float*)&res)[1];
189 !
190 ! px += stridex;
191 ! py += stridey;
192 ! pz += stridez;
193 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
194
/*
 * void __vhypot(int n, double *x, int stridex, double *y, int stridey,
 *		 double *z, int stridez);
 *
 * z[i*stridez] = hypot(x[i*stridex], y[i*stridey]), 0 <= i < n.
 * Inputs arrive in %i0..%i4; z (%i5) and stridez come per the SPARC ABI,
 * stridez being the 7th argument read from the caller's stack below.
 */
	ENTRY(__vhypot)
	save	%sp,-SA(MINFRAME),%sp
	PIC_SETUP(l7)
	PIC_SET(l7,.CONST_TBL,o3)
	wr	%g0,0x82,%asi		! ASI_PRIMARY_NOFAULT: speculative
					! pipeline preloads past the array end
					! return 0 instead of faulting

#ifdef __sparcv9
	ldx	[%fp+STACK_BIAS+176],%l0	! stridez: 7th arg, from caller's frame
#else
	ld	[%fp+STACK_BIAS+92],%l0
#endif
	ldd	[%o3],DC0
	sll	%i2,3,stridex		! element strides -> byte strides
	mov	%i0,tmp_counter

	ldd	[%o3+8],DC1
	sll	%i4,3,stridey
	mov	%i1,tmp_px

	ldd	[%o3+16],DC2
	sll	%l0,3,stridez
	mov	%i3,tmp_py

	ldd	[%o3+24],D2ON28

	ldd	[%o3+32],DC3

! .begin: (re)start a pass at the deferred element recorded in tmp_px/tmp_py,
! processing tmp_counter elements.  The .updateN handlers and the special
! paths below come back here after truncating a pass.
.begin:
	mov	tmp_counter,counter
	mov	tmp_px,%i1
	mov	tmp_py,%i3
	clr	tmp_counter
.begin1:
	cmp	counter,0
	ble,pn	%icc,.exit
	nop
231
232 lda [%i1]%asi,%o0
233 sethi %hi(0x7ffffc00),%o5
234
235 lda [%i3]%asi,%o2
236 add %o5,1023,%o5
237
238 lda [%i1]%asi,%f26 ! (1_0) ((float*)&x)[0] = ((float*)px)[0];
239
240 lda [%i1+4]%asi,%f27 ! (1_0) ((float*)&x)[1] = ((float*)px)[1];
241 add %i1,stridex,%o1 ! px += stridex
242
243 lda [%i3]%asi,%f24 ! (1_0) ((float*)&y)[0] = ((float*)py)[0];
244 sethi %hi(0x00100000),%l7
245 and %o0,%o5,%o0
246
247 lda [%i3+4]%asi,%f25 ! (1_0) ((float*)&y)[1] = ((float*)py)[1];
248 and %o2,%o5,%o2
249 sethi %hi(0x7fe00000),%l6
250
251 fabsd %f26,%f36 ! (1_0) x = fabs(x);
252 cmp %o0,%o2
253 mov %o2,%l4
254
255 fabsd %f24,%f54 ! (1_0) y = fabs(y);
256 add %i3,stridey,%o5 ! py += stridey
257 movg %icc,%o0,%o2
258 lda [%o5]%asi,%f28 ! (2_0) ((float*)&y)[0] = ((float*)py)[0];
259
260 cmp %o2,%l6
261 sethi %hi(0x7ff00000),%o4
262 bge,pn %icc,.spec0
263 lda [%o5+4]%asi,%f29 ! (2_0) ((float*)&y)[1] = ((float*)py)[1];
264
265 cmp %o2,%l7
266 bl,pn %icc,.spec1
267 nop
268 lda [%o1]%asi,%f26 ! (2_0) ((float*)&x)[0] = ((float*)px)[0];
269
270 lda [%o1+4]%asi,%f27 ! (2_0) ((float*)&x)[1] = ((float*)px)[1];
271 add %i3,stridey,%i3 ! py += stridey
272
273 fabsd %f28,%f34 ! (2_0) y = fabs(y);
274
275 fabsd %f26,%f50 ! (2_0) x = fabs(x);
276
277 fcmple32 DC1,%f50,%o3 ! (2_0) c0 = vis_fcmple32(DC1,x);
278
279 fcmple32 DC1,%f34,%o0 ! (2_0) c2 = vis_fcmple32(DC1,y);
280
281 fcmpgt32 DC2,%f50,%o4 ! (2_0) c1 = vis_fcmpgt32(DC2,x);
282
283 fcmpgt32 DC2,%f34,%o5 ! (2_0) c3 = vis_fcmpgt32(DC2,y);
284
285 or %o3,%o0,%o3 ! (2_0) c0 |= c2;
286
287 andcc %o3,2,%g0 ! (2_0) c0 & 2
288 bnz,pn %icc,.update0 ! (2_0) if ( (c0 & 2) != 0 )
289 and %o4,%o5,%o4 ! (2_0) c1 &= c3;
290 .cont0:
291 add %i3,stridey,%l4 ! py += stridey
292 andcc %o4,2,%g0 ! (2_0) c1 & 2
293 bnz,pn %icc,.update1 ! (2_0) if ( (c1 & 2) != 0 )
294 fmovd %f36,%f56 ! (1_0) dmax = x;
295 .cont1:
296 lda [%l4]%asi,%f30 ! (3_0) ((float*)&y)[0] = ((float*)py)[0];
297 add %o1,stridex,%l2 ! px += stridex
298
299 lda [%l4+4]%asi,%f31 ! (3_0) ((float*)&y)[1] = ((float*)py)[1];
300
301 lda [%l2]%asi,%f18 ! (3_1) ((float*)&x)[0] = ((float*)px)[0];
302
303 lda [%l2+4]%asi,%f19 ! (3_1) ((float*)&x)[1] = ((float*)px)[1];
304
305 fabsd %f30,%f30 ! (3_1) y = fabs(y);
306
307 fabsd %f18,%f18 ! (3_1) x = fabs(x);
308
309 fcmped %fcc2,%f54,%f56 ! (1_1) dmax ? y
310
311 fmovdg %fcc2,%f54,%f56 ! (1_1) if ( dmax < y ) dmax = y;
312
313 fcmple32 DC1,%f18,%o3 ! (3_1) c0 = vis_fcmple32(DC1,x);
314
315 fcmple32 DC1,%f30,%o0 ! (3_1) c2 = vis_fcmple32(DC1,y);
316
317 fcmpgt32 DC2,%f18,%o4 ! (3_1) c1 = vis_fcmpgt32(DC2,x);
318
319 fcmpgt32 DC2,%f30,%o1 ! (3_1) c3 = vis_fcmpgt32(DC2,y);
320
321 fand %f56,DC0,%f38 ! (1_1) dmax = vis_fand(dmax,DC0);
322
323 or %o3,%o0,%o3 ! (3_1) c0 |= c2;
324
325 andcc %o3,2,%g0 ! (3_1) c0 & 2
326 bnz,pn %icc,.update2 ! (3_1) if ( (c0 & 2) != 0 )
327 and %o4,%o1,%o4 ! (3_1) c1 &= c3;
328 .cont2:
329 add %l4,stridey,%i3 ! py += stridey
330 andcc %o4,2,%g0 ! (3_1) c1 & 2
331 bnz,pn %icc,.update3 ! (3_1) if ( (c1 & 2) != 0 )
332 fmovd %f50,%f32 ! (2_1) dmax = x;
333 .cont3:
334 fpsub32 DC1,%f38,%f10 ! (1_1) dnorm = vis_fpsub32(DC1,dmax);
335 lda [%i3]%asi,%f20 ! (0_0) ((float*)&y)[0] = ((float*)py)[0];
336
337 lda [%i3+4]%asi,%f21 ! (0_0) ((float*)&y)[1] = ((float*)py)[1];
338
339 add %l2,stridex,%l1 ! px += stridex
340
341 fmuld %f36,%f10,%f36 ! (1_1) x *= dnorm;
342 lda [%l1]%asi,%f22 ! (0_0) ((float*)&x)[0] = ((float*)px)[0]
343
344 lda [%l1+4]%asi,%f23 ! (0_0) ((float*)&x)[1] = ((float*)px)[1];
345
346 fmuld %f54,%f10,%f56 ! (1_1) y *= dnorm;
347 fabsd %f20,%f40 ! (0_0) y = fabs(y);
348
349 fabsd %f22,%f20 ! (0_0) x = fabs(x);
350
351 fcmped %fcc3,%f34,%f32 ! (2_1) dmax ? y
352
353
354 fmovdg %fcc3,%f34,%f32 ! (2_1) if ( dmax < y ) dmax = y;
355
356 faddd %f36,D2ON28,%f58 ! (1_1) x_hi = x + D2ON28;
357 fcmple32 DC1,%f20,%g5 ! (0_0) c0 = vis_fcmple32(DC1,x);
358
359 faddd %f56,D2ON28,%f22 ! (1_1) y_hi = y + D2ON28;
360 fcmple32 DC1,%f40,%o2 ! (0_0) c2 = vis_fcmple32(DC1,y);
361
362 fcmpgt32 DC2,%f20,%g1 ! (0_0) c1 = vis_fcmpgt32(DC2,x);
363
364 fcmpgt32 DC2,%f40,%o4 ! (0_0) c3 = vis_fcmpgt32(DC2,y);
365
366 fand %f32,DC0,%f52 ! (2_1) dmax = vis_fand(dmax,DC0);
367
368 or %g5,%o2,%g5 ! (0_0) c0 |= c2;
369 fsubd %f58,D2ON28,%f58 ! (1_1) x_hi -= D2ON28;
370
371 andcc %g5,2,%g0 ! (0_0) c0 & 2
372 bnz,pn %icc,.update4 ! (0_0) if ( (c0 & 2) != 0 )
373 fsubd %f22,D2ON28,%f22 ! (1_1) y_hi -= D2ON28;
374 .cont4:
375 and %g1,%o4,%g1 ! (0_0) c1 &= c3;
376
377 add %i3,stridey,%l2 ! py += stridey
378 andcc %g1,2,%g0 ! (0_0) c1 & 2
379 bnz,pn %icc,.update5 ! (0_0) if ( (c1 & 2) != 0 )
380 fmovd %f18,%f44 ! (3_1) dmax = x;
381 .cont5:
382 fpsub32 DC1,%f52,%f10 ! (2_1) dnorm = vis_fpsub32(DC1,dmax);
383 lda [%l2]%asi,%f24 ! (1_0) ((float*)&y)[0] = ((float*)py)[0];
384
385 fmuld %f58,%f58,%f60 ! (1_1) res = x_hi * x_hi;
386 lda [%l2+4]%asi,%f25 ! (1_0) ((float*)&y)[1] = ((float*)py)[1];
387 add %l1,stridex,%l7 ! px += stridex
388 faddd %f56,%f22,%f28 ! (1_1) dtmp2 = y + y_hi;
389
390 faddd %f36,%f58,%f6 ! (1_1) dtmp1 = x + x_hi;
391 lda [%l7]%asi,%f26 ! (1_0) ((float*)&x)[0] = ((float*)px)[0];
392
393 fmuld %f50,%f10,%f50 ! (2_1) x *= dnorm;
394 fsubd %f36,%f58,%f58 ! (1_1) x_lo = x - x_hi;
395 lda [%l7+4]%asi,%f27 ! (1_0) ((float*)&x)[1] = ((float*)px)[1];
396
397 fmuld %f22,%f22,%f2 ! (1_1) dtmp0 = y_hi * y_hi;
398 fsubd %f56,%f22,%f56 ! (1_1) y_lo = y - y_hi;
399
400 fmuld %f34,%f10,%f34 ! (2_1) y *= dnorm;
401 fabsd %f24,%f54 ! (1_0) y = fabs(y);
402
403 fabsd %f26,%f36 ! (1_0) x = fabs(x);
404
405 fmuld %f6,%f58,%f10 ! (1_1) dtmp1 *= x_lo;
406 fcmped %fcc0,%f30,%f44 ! (3_1) dmax ? y
407
408 fmuld %f28,%f56,%f26 ! (1_1) dtmp2 *= y_lo;
409
410 fmovdg %fcc0,%f30,%f44 ! (3_1) if ( dmax < y ) dmax = y;
411
412 faddd %f50,D2ON28,%f58 ! (2_1) x_hi = x + D2ON28;
413 fcmple32 DC1,%f36,%g1 ! (1_0) c0 = vis_fcmple32(DC1,x);
414
415 faddd %f34,D2ON28,%f22 ! (2_1) y_hi = y + D2ON28;
416 fcmple32 DC1,%f54,%g5 ! (1_0) c2 = vis_fcmple32(DC1,y);
417
418 faddd %f60,%f2,%f24 ! (1_1) res += dtmp0;
419 fcmpgt32 DC2,%f36,%o5 ! (1_0) c1 = vis_fcmpgt32(DC2,x);
420
421 faddd %f10,%f26,%f28 ! (1_1) dtmp1 += dtmp2;
422 fcmpgt32 DC2,%f54,%o1 ! (1_0) c3 = vis_fcmpgt32(DC2,y);
423
424 fand %f44,DC0,%f14 ! (3_1) dmax = vis_fand(dmax,DC0);
425
426 or %g1,%g5,%g1 ! (1_0) c0 |= c2;
427 fsubd %f58,D2ON28,%f44 ! (2_1) x_hi -= D2ON28;
428
429 andcc %g1,2,%g0 ! (1_0) c0 & 2
430 bnz,pn %icc,.update6 ! (1_0) if ( (c0 & 2) != 0 )
431 fsubd %f22,D2ON28,%f58 ! (2_1) y_hi -= D2ON28;
432 .cont6:
433 and %o5,%o1,%o5 ! (1_0) c1 &= c3;
434 faddd %f24,%f28,%f26 ! (1_1) res += dtmp1;
435
436 add %l2,stridey,%i3 ! py += stridey
437 andcc %o5,2,%g0 ! (1_0) c1 & 2
438 bnz,pn %icc,.update7 ! (1_0) if ( (c1 & 2) != 0 )
439 fmovd %f20,%f4 ! (0_0) dmax = x;
440 .cont7:
441 fpsub32 DC1,%f14,%f10 ! (3_1) dnorm = vis_fpsub32(DC1,dmax);
442 lda [%i3]%asi,%f28 ! (2_0) ((float*)&y)[0] = ((float*)py)[0];
443
444 fmuld %f44,%f44,%f2 ! (2_1) res = x_hi * x_hi;
445 lda [%i3+4]%asi,%f29 ! (2_0) ((float*)&y)[1] = ((float*)py)[1];
446 add %l7,stridex,%o1 ! px += stridex
447 faddd %f34,%f58,%f60 ! (2_1) dtmp2 = y + y_hi;
448
449 fsqrtd %f26,%f24 ! (1_1) res = sqrt(res);
450 lda [%o1]%asi,%f26 ! (2_0) ((float*)&x)[0] = ((float*)px)[0];
451 faddd %f50,%f44,%f56 ! (2_1) dtmp1 = x + x_hi;
452
453 fmuld %f18,%f10,%f6 ! (3_1) x *= dnorm;
454 fsubd %f50,%f44,%f18 ! (2_1) x_lo = x - x_hi;
455 lda [%o1+4]%asi,%f27 ! (2_0) ((float*)&x)[1] = ((float*)px)[1];
456
457 fmuld %f58,%f58,%f44 ! (2_1) dtmp0 = y_hi * y_hi;
458 fsubd %f34,%f58,%f22 ! (2_1) y_lo = y - y_hi;
459
460 fmuld %f30,%f10,%f58 ! (3_1) y *= dnorm;
461 fabsd %f28,%f34 ! (2_0) y = fabs(y);
462
463 fabsd %f26,%f50 ! (2_0) x = fabs(x);
464
465 fmuld %f56,%f18,%f10 ! (2_1) dtmp1 *= x_lo;
466 fcmped %fcc1,%f40,%f4 ! (0_0) dmax ? y
467
468 fmuld %f60,%f22,%f12 ! (2_1) dtmp2 *= y_lo;
469
470 fmovdg %fcc1,%f40,%f4 ! (0_0) if ( dmax < y ) dmax = y;
471
472 faddd %f6,D2ON28,%f56 ! (3_1) x_hi = x + D2ON28;
473 fcmple32 DC1,%f50,%o3 ! (2_0) c0 = vis_fcmple32(DC1,x);
474
475 faddd %f58,D2ON28,%f28 ! (3_1) y_hi = y + D2ON28;
476 fcmple32 DC1,%f34,%o0 ! (2_0) c2 = vis_fcmple32(DC1,y);
477
478 faddd %f2,%f44,%f30 ! (2_1) res += dtmp0;
479 fcmpgt32 DC2,%f50,%o4 ! (2_0) c1 = vis_fcmpgt32(DC2,x);
480
481 faddd %f10,%f12,%f26 ! (2_1) dtmp1 += dtmp2;
482 fcmpgt32 DC2,%f34,%o5 ! (2_0) c3 = vis_fcmpgt32(DC2,y);
483
484 fand %f4,DC0,%f16 ! (0_0) dmax = vis_fand(dmax,DC0);
485
486 or %o3,%o0,%o3 ! (2_0) c0 |= c2;
487 fsubd %f56,D2ON28,%f18 ! (3_1) x_hi -= D2ON28;
488
489 andcc %o3,2,%g0 ! (2_0) c0 & 2
490 bnz,pn %icc,.update8 ! (2_0) if ( (c0 & 2) != 0 )
491 fsubd %f28,D2ON28,%f4 ! (3_1) y_hi -= D2ON28;
492 .cont8:
493 and %o4,%o5,%o4 ! (2_0) c1 &= c3;
494 faddd %f30,%f26,%f12 ! (2_1) res += dtmp1;
495
496 add %i3,stridey,%l4 ! py += stridey
497 andcc %o4,2,%g0 ! (2_0) c1 & 2
498 bnz,pn %icc,.update9 ! (2_0) if ( (c1 & 2) != 0 )
499 fmovd %f36,%f56 ! (1_0) dmax = x;
500 .cont9:
501 lda [%l4]%asi,%f30 ! (3_0) ((float*)&y)[0] = ((float*)py)[0];
502 add %o1,stridex,%l2 ! px += stridex
503 fpsub32 DC1,%f16,%f44 ! (0_0) dnorm = vis_fpsub32(DC1,dmax);
504
505 fmuld %f18,%f18,%f60 ! (3_1) res = x_hi * x_hi;
506 lda [%l4+4]%asi,%f31 ! (3_0) ((float*)&y)[1] = ((float*)py)[1];
507 faddd %f58,%f4,%f32 ! (3_1) dtmp2 = y + y_hi;
508
509 fsqrtd %f12,%f12 ! (2_1) res = sqrt(res);
510 faddd %f6,%f18,%f28 ! (3_1) dtmp1 = x + x_hi;
511
512 cmp counter,4
513 bl,pn %icc,.tail
514 nop
515
516 ba .main_loop
517 sub counter,4,counter
518
! ---------------------------------------------------------------------
! Steady-state loop: four elements in flight per trip.  Each trip loads
! one new x/y pair per stage, runs the hi/lo split-and-square for the
! stage behind it, issues one fsqrtd, and stores one finished result.
! Special cases detected by the VIS compares divert to .update10-17.
! ---------------------------------------------------------------------
	.align	16
.main_loop:
	fmuld	%f20,%f44,%f2		! (0_1) x *= dnorm;
	fsubd	%f6,%f18,%f20		! (3_2) x_lo = x - x_hi;
	lda	[%l2]%asi,%f18		! (3_1) ((float*)&x)[0] = ((float*)px)[0];

	fmuld	%f4,%f4,%f22		! (3_2) dtmp0 = y_hi * y_hi;
	lda	[%l2+4]%asi,%f19	! (3_1) ((float*)&x)[1] = ((float*)px)[1];
	fsubd	%f58,%f4,%f58		! (3_2) y_lo = y - y_hi;

	fmuld	%f40,%f44,%f44		! (0_1) y *= dnorm;
	fabsd	%f30,%f30		! (3_1) y = fabs(y);

	fmuld	%f38,%f24,%f10		! (1_2) res = dmax * res;
	fabsd	%f18,%f18		! (3_1) x = fabs(x);
	st	%f10,[%i5]		! (1_2) ((float*)pz)[0] = ((float*)&res)[0];

	fmuld	%f28,%f20,%f28		! (3_2) dtmp1 *= x_lo;
	st	%f11,[%i5+4]		! (1_2) ((float*)pz)[1] = ((float*)&res)[1];
	fcmped	%fcc2,%f54,%f56		! (1_1) dmax ? y

	fmuld	%f32,%f58,%f24		! (3_2) dtmp2 *= y_lo;

	fmovdg	%fcc2,%f54,%f56		! (1_1) if ( dmax < y ) dmax = y;

	faddd	%f2,D2ON28,%f10		! (0_1) x_hi = x + D2ON28;
	fcmple32	DC1,%f18,%o3	! (3_1) c0 = vis_fcmple32(DC1,x);

	faddd	%f44,D2ON28,%f20	! (0_1) y_hi = y + D2ON28;
	fcmple32	DC1,%f30,%o0	! (3_1) c2 = vis_fcmple32(DC1,y);

	faddd	%f60,%f22,%f22		! (3_2) res += dtmp0;
	fcmpgt32	DC2,%f18,%o4	! (3_1) c1 = vis_fcmpgt32(DC2,x);

	faddd	%f28,%f24,%f26		! (3_2) dtmp1 += dtmp2;
	fcmpgt32	DC2,%f30,%o1	! (3_1) c3 = vis_fcmpgt32(DC2,y);

	fand	%f56,DC0,%f38		! (1_1) dmax = vis_fand(dmax,DC0);

	or	%o3,%o0,%o3		! (3_1) c0 |= c2;
	fsubd	%f10,D2ON28,%f58	! (0_1) x_hi -= D2ON28;

	andcc	%o3,2,%g0		! (3_1) c0 & 2
	bnz,pn	%icc,.update10		! (3_1) if ( (c0 & 2) != 0 )
	fsubd	%f20,D2ON28,%f56	! (0_1) y_hi -= D2ON28;
.cont10:
	faddd	%f22,%f26,%f28		! (3_2) res += dtmp1;
	and	%o4,%o1,%o4		! (3_1) c1 &= c3;

	add	%l4,stridey,%i3		! py += stridey
	andcc	%o4,2,%g0		! (3_1) c1 & 2
	bnz,pn	%icc,.update11		! (3_1) if ( (c1 & 2) != 0 )
	fmovd	%f50,%f32		! (2_1) dmax = x;
.cont11:
	fpsub32	DC1,%f38,%f10		! (1_1) dnorm = vis_fpsub32(DC1,dmax);
	add	%l2,stridex,%l1		! px += stridex
	lda	[%i3]%asi,%f20		! (0_0) ((float*)&y)[0] = ((float*)py)[0];

	fmuld	%f58,%f58,%f6		! (0_1) res = x_hi * x_hi;
	lda	[%i3+4]%asi,%f21	! (0_0) ((float*)&y)[1] = ((float*)py)[1];
	add	%i5,stridez,%l6		! pz += stridez
	faddd	%f44,%f56,%f60		! (0_1) dtmp2 = y + y_hi;

	fsqrtd	%f28,%f4		! (3_2) res = sqrt(res);
	lda	[%l1]%asi,%f22		! (0_0) ((float*)&x)[0] = ((float*)px)[0];
	faddd	%f2,%f58,%f24		! (0_1) dtmp1 = x + x_hi;

	fmuld	%f36,%f10,%f36		! (1_1) x *= dnorm;
	fsubd	%f2,%f58,%f26		! (0_1) x_lo = x - x_hi;
	lda	[%l1+4]%asi,%f23	! (0_0) ((float*)&x)[1] = ((float*)px)[1];

	fmuld	%f56,%f56,%f28		! (0_1) dtmp0 = y_hi * y_hi;
	fsubd	%f44,%f56,%f44		! (0_1) y_lo = y - y_hi;

	fmuld	%f54,%f10,%f56		! (1_1) y *= dnorm;
	fabsd	%f20,%f40		! (0_0) y = fabs(y);

	fmuld	%f52,%f12,%f12		! (2_2) res = dmax * res;
	fabsd	%f22,%f20		! (0_0) x = fabs(x);
	st	%f12,[%l6]		! (2_2) ((float*)pz)[0] = ((float*)&res)[0];

	fmuld	%f24,%f26,%f10		! (0_1) dtmp1 *= x_lo;
	st	%f13,[%l6+4]		! (2_2) ((float*)pz)[1] = ((float*)&res)[1];
	fcmped	%fcc3,%f34,%f32		! (2_1) dmax ? y

	fmuld	%f60,%f44,%f12		! (0_1) dtmp2 *= y_lo;

	fmovdg	%fcc3,%f34,%f32		! (2_1) if ( dmax < y ) dmax = y;

	faddd	%f36,D2ON28,%f58	! (1_1) x_hi = x + D2ON28;
	fcmple32	DC1,%f20,%g5	! (0_0) c0 = vis_fcmple32(DC1,x);

	faddd	%f56,D2ON28,%f22	! (1_1) y_hi = y + D2ON28;
	fcmple32	DC1,%f40,%o2	! (0_0) c2 = vis_fcmple32(DC1,y);

	faddd	%f6,%f28,%f24		! (0_1) res += dtmp0;
	fcmpgt32	DC2,%f20,%g1	! (0_0) c1 = vis_fcmpgt32(DC2,x);

	faddd	%f10,%f12,%f26		! (0_1) dtmp1 += dtmp2;
	fcmpgt32	DC2,%f40,%o4	! (0_0) c3 = vis_fcmpgt32(DC2,y);

	fand	%f32,DC0,%f52		! (2_1) dmax = vis_fand(dmax,DC0);

	or	%g5,%o2,%g5		! (0_0) c0 |= c2;
	fsubd	%f58,D2ON28,%f58	! (1_1) x_hi -= D2ON28;

	andcc	%g5,2,%g0		! (0_0) c0 & 2
	bnz,pn	%icc,.update12		! (0_0) if ( (c0 & 2) != 0 )
	fsubd	%f22,D2ON28,%f22	! (1_1) y_hi -= D2ON28;
.cont12:
	and	%g1,%o4,%g1		! (0_0) c1 &= c3;
	faddd	%f24,%f26,%f12		! (0_1) res += dtmp1;

	add	%i3,stridey,%l2		! py += stridey
	andcc	%g1,2,%g0		! (0_0) c1 & 2
	bnz,pn	%icc,.update13		! (0_0) if ( (c1 & 2) != 0 )
	fmovd	%f18,%f44		! (3_1) dmax = x;
.cont13:
	fpsub32	DC1,%f52,%f10		! (2_1) dnorm = vis_fpsub32(DC1,dmax);
	add	%l1,stridex,%l7		! px += stridex
	lda	[%l2]%asi,%f24		! (1_0) ((float*)&y)[0] = ((float*)py)[0];

	fmuld	%f58,%f58,%f60		! (1_1) res = x_hi * x_hi;
	add	%l6,stridez,%i5		! pz += stridez
	lda	[%l2+4]%asi,%f25	! (1_0) ((float*)&y)[1] = ((float*)py)[1];
	faddd	%f56,%f22,%f28		! (1_1) dtmp2 = y + y_hi;

	fsqrtd	%f12,%f12		! (0_1) res = sqrt(res);
	lda	[%l7]%asi,%f26		! (1_0) ((float*)&x)[0] = ((float*)px)[0];
	faddd	%f36,%f58,%f6		! (1_1) dtmp1 = x + x_hi;

	fmuld	%f50,%f10,%f50		! (2_1) x *= dnorm;
	fsubd	%f36,%f58,%f58		! (1_1) x_lo = x - x_hi;
	lda	[%l7+4]%asi,%f27	! (1_0) ((float*)&x)[1] = ((float*)px)[1];

	fmuld	%f22,%f22,%f2		! (1_1) dtmp0 = y_hi * y_hi;
	fsubd	%f56,%f22,%f56		! (1_1) y_lo = y - y_hi;

	fmuld	%f34,%f10,%f34		! (2_1) y *= dnorm;
	fabsd	%f24,%f54		! (1_0) y = fabs(y);

	fmuld	%f14,%f4,%f14		! (3_2) res = dmax * res;
	fabsd	%f26,%f36		! (1_0) x = fabs(x);
	st	%f14,[%i5]		! (3_2) ((float*)pz)[0] = ((float*)&res)[0];

	fmuld	%f6,%f58,%f10		! (1_1) dtmp1 *= x_lo;
	st	%f15,[%i5+4]		! (3_2) ((float*)pz)[1] = ((float*)&res)[1];
	fcmped	%fcc0,%f30,%f44		! (3_1) dmax ? y

	fmuld	%f28,%f56,%f26		! (1_1) dtmp2 *= y_lo;

	fmovdg	%fcc0,%f30,%f44		! (3_1) if ( dmax < y ) dmax = y;

	faddd	%f50,D2ON28,%f58	! (2_1) x_hi = x + D2ON28;
	fcmple32	DC1,%f36,%g1	! (1_0) c0 = vis_fcmple32(DC1,x);

	faddd	%f34,D2ON28,%f22	! (2_1) y_hi = y + D2ON28;
	fcmple32	DC1,%f54,%g5	! (1_0) c2 = vis_fcmple32(DC1,y);

	faddd	%f60,%f2,%f24		! (1_1) res += dtmp0;
	fcmpgt32	DC2,%f36,%o5	! (1_0) c1 = vis_fcmpgt32(DC2,x);

	faddd	%f10,%f26,%f28		! (1_1) dtmp1 += dtmp2;
	fcmpgt32	DC2,%f54,%o1	! (1_0) c3 = vis_fcmpgt32(DC2,y);

	fand	%f44,DC0,%f14		! (3_1) dmax = vis_fand(dmax,DC0);

	or	%g1,%g5,%g1		! (1_0) c0 |= c2;
	fsubd	%f58,D2ON28,%f44	! (2_1) x_hi -= D2ON28;

	andcc	%g1,2,%g0		! (1_0) c0 & 2
	bnz,pn	%icc,.update14		! (1_0) if ( (c0 & 2) != 0 )
	fsubd	%f22,D2ON28,%f58	! (2_1) y_hi -= D2ON28;
.cont14:
	and	%o5,%o1,%o5		! (1_0) c1 &= c3;
	faddd	%f24,%f28,%f26		! (1_1) res += dtmp1;

	add	%l2,stridey,%i3		! py += stridey
	andcc	%o5,2,%g0		! (1_0) c1 & 2
	bnz,pn	%icc,.update15		! (1_0) if ( (c1 & 2) != 0 )
	fmovd	%f20,%f4		! (0_0) dmax = x;
.cont15:
	fpsub32	DC1,%f14,%f10		! (3_1) dnorm = vis_fpsub32(DC1,dmax);
	add	%l7,stridex,%o1		! px += stridex
	lda	[%i3]%asi,%f28		! (2_0) ((float*)&y)[0] = ((float*)py)[0];

	fmuld	%f44,%f44,%f2		! (2_1) res = x_hi * x_hi;
	add	%i5,stridez,%g5		! pz += stridez
	lda	[%i3+4]%asi,%f29	! (2_0) ((float*)&y)[1] = ((float*)py)[1];
	faddd	%f34,%f58,%f60		! (2_1) dtmp2 = y + y_hi;

	fsqrtd	%f26,%f24		! (1_1) res = sqrt(res);
	lda	[%o1]%asi,%f26		! (2_0) ((float*)&x)[0] = ((float*)px)[0];
	faddd	%f50,%f44,%f56		! (2_1) dtmp1 = x + x_hi;

	fmuld	%f18,%f10,%f6		! (3_1) x *= dnorm;
	fsubd	%f50,%f44,%f18		! (2_1) x_lo = x - x_hi;
	lda	[%o1+4]%asi,%f27	! (2_0) ((float*)&x)[1] = ((float*)px)[1];

	fmuld	%f58,%f58,%f44		! (2_1) dtmp0 = y_hi * y_hi;
	fsubd	%f34,%f58,%f22		! (2_1) y_lo = y - y_hi;

	fmuld	%f30,%f10,%f58		! (3_1) y *= dnorm;
	fabsd	%f28,%f34		! (2_0) y = fabs(y);

	fmuld	%f16,%f12,%f16		! (0_1) res = dmax * res;
	fabsd	%f26,%f50		! (2_0) x = fabs(x);
	st	%f16,[%g5]		! (0_1) ((float*)pz)[0] = ((float*)&res)[0];

	fmuld	%f56,%f18,%f10		! (2_1) dtmp1 *= x_lo;
	st	%f17,[%g5+4]		! (0_1) ((float*)pz)[1] = ((float*)&res)[1];
	fcmped	%fcc1,%f40,%f4		! (0_0) dmax ? y

	fmuld	%f60,%f22,%f12		! (2_1) dtmp2 *= y_lo;

	fmovdg	%fcc1,%f40,%f4		! (0_0) if ( dmax < y ) dmax = y;

	faddd	%f6,D2ON28,%f56		! (3_1) x_hi = x + D2ON28;
	fcmple32	DC1,%f50,%o3	! (2_0) c0 = vis_fcmple32(DC1,x);

	faddd	%f58,D2ON28,%f28	! (3_1) y_hi = y + D2ON28;
	fcmple32	DC1,%f34,%o0	! (2_0) c2 = vis_fcmple32(DC1,y);

	faddd	%f2,%f44,%f30		! (2_1) res += dtmp0;
	fcmpgt32	DC2,%f50,%o4	! (2_0) c1 = vis_fcmpgt32(DC2,x);

	faddd	%f10,%f12,%f26		! (2_1) dtmp1 += dtmp2;
	fcmpgt32	DC2,%f34,%o5	! (2_0) c3 = vis_fcmpgt32(DC2,y);

	fand	%f4,DC0,%f16		! (0_0) dmax = vis_fand(dmax,DC0);

	or	%o3,%o0,%o3		! (2_0) c0 |= c2;
	fsubd	%f56,D2ON28,%f18	! (3_1) x_hi -= D2ON28;

	andcc	%o3,2,%g0		! (2_0) c0 & 2
	bnz,pn	%icc,.update16		! (2_0) if ( (c0 & 2) != 0 )
	fsubd	%f28,D2ON28,%f4		! (3_1) y_hi -= D2ON28;
.cont16:
	and	%o4,%o5,%o4		! (2_0) c1 &= c3;
	faddd	%f30,%f26,%f12		! (2_1) res += dtmp1;

	add	%i3,stridey,%l4		! py += stridey
	andcc	%o4,2,%g0		! (2_0) c1 & 2
	bnz,pn	%icc,.update17		! (2_0) if ( (c1 & 2) != 0 )
	fmovd	%f36,%f56		! (1_0) dmax = x;
.cont17:
	lda	[%l4]%asi,%f30		! (3_0) ((float*)&y)[0] = ((float*)py)[0];
	add	%o1,stridex,%l2		! px += stridex
	fpsub32	DC1,%f16,%f44		! (0_0) dnorm = vis_fpsub32(DC1,dmax);

	fmuld	%f18,%f18,%f60		! (3_1) res = x_hi * x_hi;
	add	%g5,stridez,%i5		! pz += stridez
	lda	[%l4+4]%asi,%f31	! (3_0) ((float*)&y)[1] = ((float*)py)[1];
	faddd	%f58,%f4,%f32		! (3_1) dtmp2 = y + y_hi;

	fsqrtd	%f12,%f12		! (2_1) res = sqrt(res);
	subcc	counter,4,counter	! counter -= 4;
	bpos,pt	%icc,.main_loop
	faddd	%f6,%f18,%f28		! (3_1) dtmp1 = x + x_hi;

	add	counter,4,counter	! undo the over-subtraction before .tail

! ---------------------------------------------------------------------
! Pipeline drain: 0..3 elements remain in flight; finish and store each
! in turn, decrementing counter between stores and bailing to .begin
! (annulled delay slots skip the pz advance) as soon as it goes negative.
! ---------------------------------------------------------------------
.tail:
	subcc	counter,1,counter
	bneg,a	.begin
	nop

	fsubd	%f6,%f18,%f20		! (3_2) x_lo = x - x_hi;

	fmuld	%f4,%f4,%f22		! (3_2) dtmp0 = y_hi * y_hi;
	fsubd	%f58,%f4,%f58		! (3_2) y_lo = y - y_hi;

	fmuld	%f38,%f24,%f10		! (1_2) res = dmax * res;
	st	%f10,[%i5]		! (1_2) ((float*)pz)[0] = ((float*)&res)[0];

	st	%f11,[%i5+4]		! (1_2) ((float*)pz)[1] = ((float*)&res)[1];

	subcc	counter,1,counter
	bneg,a	.begin
	add	%i5,stridez,%i5

	fmuld	%f28,%f20,%f28		! (3_2) dtmp1 *= x_lo;

	fmuld	%f32,%f58,%f24		! (3_2) dtmp2 *= y_lo;

	faddd	%f60,%f22,%f22		! (3_2) res += dtmp0;

	faddd	%f28,%f24,%f26		! (3_2) dtmp1 += dtmp2;

	faddd	%f22,%f26,%f28		! (3_2) res += dtmp1;

	add	%i5,stridez,%l6		! pz += stridez

	fsqrtd	%f28,%f4		! (3_2) res = sqrt(res);
	add	%l2,stridex,%l1		! px += stridex

	fmuld	%f52,%f12,%f12		! (2_2) res = dmax * res;
	st	%f12,[%l6]		! (2_2) ((float*)pz)[0] = ((float*)&res)[0];

	st	%f13,[%l6+4]		! (2_2) ((float*)pz)[1] = ((float*)&res)[1];

	subcc	counter,1,counter
	bneg	.begin
	add	%l6,stridez,%i5

	fmuld	%f14,%f4,%f14		! (3_2) res = dmax * res;
	st	%f14,[%i5]		! (3_2) ((float*)pz)[0] = ((float*)&res)[0];

	st	%f15,[%i5+4]		! (3_2) ((float*)pz)[1] = ((float*)&res)[1];

	ba	.begin
	add	%i5,stridez,%i5

! ---------------------------------------------------------------------
! .spec0: max(|hx|,|hy|) >= 2^1023 -- huge argument, Inf, or NaN.
! Finite-huge case: if the exponents differ by >= 0x036 (54), the
! smaller term is negligible and res = x + y; otherwise scale both by
! 2^-1022 (DC2), run the hi/lo split algorithm, and scale the root back
! up by 2^1022 (DC3).  Inf with zero low word forces res = +Inf
! (hypot(Inf, anything) = Inf); otherwise res = x * y propagates NaN.
! ---------------------------------------------------------------------
	.align	16
.spec0:
	ld	[%i1+4],%l1		! lx = ((int*)px)[1];
	cmp	%o2,%o4			! j0 ? 0x7ff00000
	bge,pn	%icc,1f			! if ( j0 >= 0x7ff00000 )
	fabsd	%f26,%f26		! x = fabs(x);

	sub	%o0,%l4,%o0		! diff = hy - hx;
	fabsd	%f24,%f24		! y = fabs(y);

	sra	%o0,31,%l4		! j0 = diff >> 31;

	xor	%o0,%l4,%o0		! diff ^ j0

	sethi	%hi(0x03600000),%l1
	sub	%o0,%l4,%o0		! (diff ^ j0) - j0 = |diff|

	cmp	%o0,%l1			! ((diff ^ j0) - j0) ? 0x03600000
	bge,a,pn %icc,2f		! if ( ((diff ^ j0) - j0) >= 0x03600000 )
	faddd	%f26,%f24,%f24		! *pz = x + y

	fmuld	%f26,DC2,%f36		! (1_1) x *= dnorm;
					! (dnorm = DC2 = 2^-1022 here)

	fmuld	%f24,DC2,%f56		! (1_1) y *= dnorm;

	faddd	%f36,D2ON28,%f58	! (1_1) x_hi = x + D2ON28;

	faddd	%f56,D2ON28,%f22	! (1_1) y_hi = y + D2ON28;

	fsubd	%f58,D2ON28,%f58	! (1_1) x_hi -= D2ON28;

	fsubd	%f22,D2ON28,%f22	! (1_1) y_hi -= D2ON28;

	fmuld	%f58,%f58,%f60		! (1_1) res = x_hi * x_hi;
	faddd	%f56,%f22,%f28		! (1_1) dtmp2 = y + y_hi;

	faddd	%f36,%f58,%f6		! (1_1) dtmp1 = x + x_hi;

	fsubd	%f36,%f58,%f58		! (1_1) x_lo = x - x_hi;

	fmuld	%f22,%f22,%f2		! (1_1) dtmp0 = y_hi * y_hi;
	fsubd	%f56,%f22,%f56		! (1_1) y_lo = y - y_hi;

	fmuld	%f6,%f58,%f10		! (1_1) dtmp1 *= x_lo;

	fmuld	%f28,%f56,%f26		! (1_1) dtmp2 *= y_lo;

	faddd	%f60,%f2,%f24		! (1_1) res += dtmp0;

	faddd	%f10,%f26,%f28		! (1_1) dtmp1 += dtmp2;

	faddd	%f24,%f28,%f26		! (1_1) res += dtmp1;

	fsqrtd	%f26,%f24		! (1_1) res = sqrt(res);

	fmuld	DC3,%f24,%f24		! (1_2) res = dmax * res;
					! undo the 2^-1022 pre-scale
2:
	add	%i3,stridey,%i3
	add	%i1,stridex,%i1
	st	%f24,[%i5]		! ((float*)pz)[0] = ((float*)&res)[0];
	st	%f25,[%i5+4]		! ((float*)pz)[1] = ((float*)&res)[1];

	add	%i5,stridez,%i5
	ba	.begin1
	sub	counter,1,counter

! Inf/NaN path: j0 >= 0x7ff00000.
1:
	ld	[%i3+4],%l2		! ly = ((int*)py)[1];
	cmp	%o0,%o4			! hx ? 0x7ff00000
	bne,pn	%icc,1f			! if ( hx != 0x7ff00000 )
	fabsd	%f24,%f24		! y = fabs(y);

	cmp	%l1,0			! lx ? 0
	be,pn	%icc,2f			! if ( lx == 0 )  -> x is +-Inf
	nop
1:
	cmp	%l4,%o4			! hy ? 0x7ff00000
	bne,pn	%icc,1f			! if ( hy != 0x7ff00000 )
	nop

	cmp	%l2,0			! ly ? 0
	be,pn	%icc,2f			! if ( ly == 0 )  -> y is +-Inf
	nop
! Neither operand is an exact Inf: multiply to propagate the NaN.
1:
	add	%i3,stridey,%i3
	add	%i1,stridex,%i1
	fmuld	%f26,%f24,%f24		! res = x * y;
	st	%f24,[%i5]		! ((float*)pz)[0] = ((float*)&res)[0];

	st	%f25,[%i5+4]		! ((float*)pz)[1] = ((float*)&res)[1];

	add	%i5,stridez,%i5
	ba	.begin1
	sub	counter,1,counter

! One operand is +-Inf: hypot is +Inf regardless of the other operand.
2:
	add	%i1,stridex,%i1
	add	%i3,stridey,%i3
	st	DC0_HI,[%i5]		! ((int*)pz)[0] = 0x7ff00000;
	st	DC0_LO,[%i5+4]		! ((int*)pz)[1] = 0;
	fcmpd	%f26,%f24		! x ? y
					! NOTE(review): result unused; presumably
					! kept for exception semantics -- confirm

	add	%i5,stridez,%i5
	ba	.begin1
	sub	counter,1,counter

! ---------------------------------------------------------------------
! .spec1: max(|hx|,|hy|) < 2^-1022 -- both arguments tiny/subnormal.
! Pre-scale by 2^1022 (DC3) so the hi/lo split works in the normal
! range, then scale the root back down by 2^-1022 (DC2).
! ---------------------------------------------------------------------
	.align	16
.spec1:
	fmuld	%f26,DC3,%f36		! (1_1) x *= dnorm;
					! (dnorm = DC3 = 2^1022 here)

	fmuld	%f24,DC3,%f56		! (1_1) y *= dnorm;

	faddd	%f36,D2ON28,%f58	! (1_1) x_hi = x + D2ON28;

	faddd	%f56,D2ON28,%f22	! (1_1) y_hi = y + D2ON28;

	fsubd	%f58,D2ON28,%f58	! (1_1) x_hi -= D2ON28;

	fsubd	%f22,D2ON28,%f22	! (1_1) y_hi -= D2ON28;

	fmuld	%f58,%f58,%f60		! (1_1) res = x_hi * x_hi;
	faddd	%f56,%f22,%f28		! (1_1) dtmp2 = y + y_hi;

	faddd	%f36,%f58,%f6		! (1_1) dtmp1 = x + x_hi;

	fsubd	%f36,%f58,%f58		! (1_1) x_lo = x - x_hi;

	fmuld	%f22,%f22,%f2		! (1_1) dtmp0 = y_hi * y_hi;
	fsubd	%f56,%f22,%f56		! (1_1) y_lo = y - y_hi;

	fmuld	%f6,%f58,%f10		! (1_1) dtmp1 *= x_lo;

	fmuld	%f28,%f56,%f26		! (1_1) dtmp2 *= y_lo;

	faddd	%f60,%f2,%f24		! (1_1) res += dtmp0;

	faddd	%f10,%f26,%f28		! (1_1) dtmp1 += dtmp2;

	faddd	%f24,%f28,%f26		! (1_1) res += dtmp1;

	fsqrtd	%f26,%f24		! (1_1) res = sqrt(res);

	fmuld	DC2,%f24,%f24		! (1_2) res = dmax * res;
					! undo the 2^1022 pre-scale

	add	%i3,stridey,%i3
	add	%i1,stridex,%i1
	st	%f24,[%i5]		! ((float*)pz)[0] = ((float*)&res)[0];

	st	%f25,[%i5+4]		! ((float*)pz)[1] = ((float*)&res)[1];
	add	%i5,stridez,%i5
	ba	.begin1
	sub	counter,1,counter

! ---------------------------------------------------------------------
! .updateN handlers: a pipelined element turned out to need a special
! path (.spec0/.spec1), which cannot be taken mid-pipeline.  Each
! handler substitutes x = y = 0 for that element so the remaining
! in-flight arithmetic is harmless, truncates this pass's counter to
! the elements before it, records the restart point in tmp_px/tmp_py/
! tmp_counter, and rejoins at its .contN; the deferred element is then
! reprocessed from .begin and dispatched to .spec0/.spec1.
! ---------------------------------------------------------------------
	.align	16
.update0:
	fzero	%f50			! fake x = 0 for the deferred element
	cmp	counter,1
	ble	.cont0			! nothing completed yet to defer behind
	fzero	%f34			! fake y = 0

	mov	%o1,tmp_px		! restart here on the next .begin pass
	mov	%i3,tmp_py

	sub	counter,1,tmp_counter
	ba	.cont0
	mov	1,counter		! finish only the elements already in flight

	.align	16
.update1:
	fzero	%f50
	cmp	counter,1
	ble	.cont1
	fzero	%f34

	mov	%o1,tmp_px
	mov	%i3,tmp_py

	sub	counter,1,tmp_counter
	ba	.cont1
	mov	1,counter
1012
1013 .align 16
1014 .update2:
1015 fzero %f18
1016 cmp counter,2
1017 ble .cont2
1018 fzero %f30
1019
1020 mov %l2,tmp_px
1021 mov %l4,tmp_py
1022
1023 sub counter,2,tmp_counter
1024 ba .cont1
1025 mov 2,counter
1026
! .update3 - .update17: same deferral mechanism as .update0/.update1
! above.  The numeric operand of cmp/sub/mov is the pipeline depth at
! the detection point (how many earlier elements must still complete);
! tmp_px/tmp_py are the pointers of the deferred element; each handler
! rejoins at its own .contN.
	.align	16
.update3:
	fzero	%f18
	cmp	counter,2
	ble	.cont3
	fzero	%f30

	mov	%l2,tmp_px
	mov	%l4,tmp_py

	sub	counter,2,tmp_counter
	ba	.cont3
	mov	2,counter

	.align	16
.update4:
	fzero	%f20
	cmp	counter,3
	ble	.cont4
	fzero	%f40

	mov	%l1,tmp_px
	mov	%i3,tmp_py

	sub	counter,3,tmp_counter
	ba	.cont4
	mov	3,counter

	.align	16
.update5:
	fzero	%f20
	cmp	counter,3
	ble	.cont5
	fzero	%f40

	mov	%l1,tmp_px
	mov	%i3,tmp_py

	sub	counter,3,tmp_counter
	ba	.cont5
	mov	3,counter

	.align	16
.update6:
	fzero	%f36
	cmp	counter,4
	ble	.cont6
	fzero	%f54

	mov	%l7,tmp_px
	mov	%l2,tmp_py

	sub	counter,4,tmp_counter
	ba	.cont6
	mov	4,counter

	.align	16
.update7:
	fzero	%f36
	cmp	counter,4
	ble	.cont7
	fzero	%f54

	mov	%l7,tmp_px
	mov	%l2,tmp_py

	sub	counter,4,tmp_counter
	ba	.cont7
	mov	4,counter

	.align	16
.update8:
	fzero	%f50
	cmp	counter,5
	ble	.cont8
	fzero	%f34

	mov	%o1,tmp_px
	mov	%i3,tmp_py

	sub	counter,5,tmp_counter
	ba	.cont8
	mov	5,counter

	.align	16
.update9:
	fzero	%f50
	cmp	counter,5
	ble	.cont9
	fzero	%f34

	mov	%o1,tmp_px
	mov	%i3,tmp_py

	sub	counter,5,tmp_counter
	ba	.cont9
	mov	5,counter


	.align	16
.update10:
	fzero	%f18
	cmp	counter,2
	ble	.cont10
	fzero	%f30

	mov	%l2,tmp_px
	mov	%l4,tmp_py

	sub	counter,2,tmp_counter
	ba	.cont10
	mov	2,counter

	.align	16
.update11:
	fzero	%f18
	cmp	counter,2
	ble	.cont11
	fzero	%f30

	mov	%l2,tmp_px
	mov	%l4,tmp_py

	sub	counter,2,tmp_counter
	ba	.cont11
	mov	2,counter

	.align	16
.update12:
	fzero	%f20
	cmp	counter,3
	ble	.cont12
	fzero	%f40

	mov	%l1,tmp_px
	mov	%i3,tmp_py

	sub	counter,3,tmp_counter
	ba	.cont12
	mov	3,counter

	.align	16
.update13:
	fzero	%f20
	cmp	counter,3
	ble	.cont13
	fzero	%f40

	mov	%l1,tmp_px
	mov	%i3,tmp_py

	sub	counter,3,tmp_counter
	ba	.cont13
	mov	3,counter

	.align	16
.update14:
	fzero	%f54
	cmp	counter,4
	ble	.cont14
	fzero	%f36

	mov	%l7,tmp_px
	mov	%l2,tmp_py

	sub	counter,4,tmp_counter
	ba	.cont14
	mov	4,counter

	.align	16
.update15:
	fzero	%f54
	cmp	counter,4
	ble	.cont15
	fzero	%f36

	mov	%l7,tmp_px
	mov	%l2,tmp_py

	sub	counter,4,tmp_counter
	ba	.cont15
	mov	4,counter

	.align	16
.update16:
	fzero	%f50
	cmp	counter,5
	ble	.cont16
	fzero	%f34

	mov	%o1,tmp_px
	mov	%i3,tmp_py

	sub	counter,5,tmp_counter
	ba	.cont16
	mov	5,counter

	.align	16
.update17:
	fzero	%f50
	cmp	counter,5
	ble	.cont17
	fzero	%f34

	mov	%o1,tmp_px
	mov	%i3,tmp_py

	sub	counter,5,tmp_counter
	ba	.cont17
	mov	5,counter
1237
	.align	16
.exit:
	ret				! restore caller's register window and return
	restore
	SET_SIZE(__vhypot)
1243