1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25
26 #include <sys/param.h>
27 #include <sys/errno.h>
28 #include <sys/asm_linkage.h>
29 #include <sys/vtrace.h>
30 #include <sys/machthread.h>
31 #include <sys/clock.h>
32 #include <sys/asi.h>
33 #include <sys/fsr.h>
34 #include <sys/privregs.h>
35 #include <sys/machasi.h>
36 #include <sys/niagaraasi.h>
37
38 #include "assym.h"
39
40
41 /*
42 * Pseudo-code to aid in understanding the control flow of the
43 * bcopy/kcopy routine.
44 *
45 * ! WARNING : <Register usage convention>
46 * ! In kcopy() the %o5, holds previous error handler and a flag
47 * ! LOFAULT_SET (low bits). The %o5 is null in bcopy().
48 * ! The %o5 is not available for any other use.
49 *
50 * On entry:
51 * ! Determine whether to use the FP register version or the
52 * ! the leaf routine version depending on the size of the copy.
53 * ! Set up error handling accordingly.
54 * ! The transition point depends on FP_COPY
55 * ! For both versions %o5 is reserved
56 *
57 * kcopy():
58 * if(length > FP_COPY)
59 * go to regular_kcopy
60 *
61 * ! Setup_leaf_rtn_error_handler
62 * %o5 = curthread->t_lofault; ! save existing handler in %o5
63 * %o5 |= LOFAULT_SET; ! ORed with LOFAULT_SET flag
64 * curthread->t_lofault = .sm_copyerr;
65 * goto small_bcopy();
66 *
67 * regular_kcopy:
68 * save_registers()
69 * %o5 = curthread->t_lofault; ! save existing handler in %o5
70 * %o5 |= LOFAULT_SET; ! ORed with LOFAULT_SET flag
71 * curthread->t_lofault = .copyerr;
72 * goto do_copy();
73 *
74 * bcopy():
75 * if(length > FP_COPY)
76 * go to regular_bcopy
77 *
78 * ! Setup_leaf_rtn_error_handler
79 * %o5 = curthread->t_lofault; ! save existing handler in %o5
80 * curthread->t_lofault = .sm_copyerr;
81 * goto small_bcopy();
82 *
83 * regular_bcopy:
84 * %o5 = curthread->t_lofault; ! save existing handler in %o5
85 * curthread->t_lofault = .copyerr;
86 * goto do_copy();
87 *
88 * small_bcopy:
89 * ! handle copies smaller than FP_COPY
90 * restore t_lofault handler
91 * exit
92 *
93 * do_copy:
94 * ! handle copies larger than FP_COPY
95 * save fp_regs
96 * blockcopy;
97 * restore fp_regs
98 * restore t_lofault handler if came from kcopy();
99 *
100 *
101 * In leaf lofault handler:
102 * curthread->t_lofault = (%o5 & ~LOFAULT_SET); ! restore old t_lofault
103 * return (errno)
104 *
105 * In lofault handler:
106 * curthread->t_lofault = (%o5 & ~LOFAULT_SET); ! restore old t_lofault
107 * restore fp_regs
108 * return (errno)
109 *
110 *
111 *
112 * For all of bcopy/copyin/copyout the copy logic is specialized according
113 * to how the src and dst is aligned and how much data needs to be moved.
114 * The following comments apply to the N2/RF code (#if !defined(NIAGARA_IMPL))
115 *
116 * N2/RF Flow :
117 *
118 * if (count < FP_COPY) { (584 bytes)
119 * set small fault handler (no register window save/restore)
120 * if count < SHORTCOPY (7 bytes)
121 * copy bytes; go to short_exit
122 * else
123 * determine dst alignment, move minimum bytes/halfwords to
124 * get dst aligned on long word boundary
125 * if( src is on long word boundary ) {
126 * medlong: src/dst aligned on 8 bytes
127 * copy with ldx/stx in 4-way unrolled loop;
128 * copy final 0-31 bytes; go to short_exit
129 * } else { src/dst not aligned on 8 bytes
130 * if src is word aligned, ld/st words in 32-byte chunks
131 * if src is half word aligned, ld half, ld word, ld half; pack
132 * into long word, store long words in 32-byte chunks
133 * if src is byte aligned, ld byte,half,word parts; pack into long
134 * word, store long words in 32-byte chunks
135 * move final 0-31 bytes according to src alignment; go to short_exit
136 * short_exit:
137 * restore trap handler if needed, retl
138 * else { More than FP_COPY bytes
139 * set fault handler
140 * disable kernel preemption
141 * save registers, save FP registers if in use
142 * move bytes to align destination register on long word boundary
143 * if(src is on long word boundary) { src/dst aligned on 8 bytes
144 * align dst on 64 byte boundary; use 8-way test for each of 8 possible
145 * src alignments relative to a 64 byte boundary to select the
146 * 16-way unrolled loop (128 bytes) to use for
147 * block load, fmovd, block-init-store, block-store, fmovd operations
148 * then go to remain_stuff.
149 * remain_stuff: move remaining bytes. go to long_exit
150 * } else {
151 * setup alignaddr for faligndata instructions
152 * align dst on 64 byte boundary; use 8-way test for each of 8 possible
153 * src alignments to nearest long word relative to 64 byte boundary to
154 * select the 8-way unrolled loop (64 bytes) to use for
155 * block load, falign, fmovd, block-store loop
156 * (only use block-init-store when src/dst on 8 byte boundaries.)
157 * goto unalign_done.
158 * unalign_done:
159 * move remaining bytes for unaligned cases. go to long_exit
160 * long_exit:
161 * restore %gsr, FP regs (either from stack or set to zero),
162 * restore trap handler, check for kernel preemption request,
163 * handle if needed, ret.
164 * }
165 *
166 * Other platforms include hw_bcopy_limit_[1248] to control the exact
167 * point where the FP register code is used. On those platforms, the
168 * FP register code did not leave data in L2 cache, potentially affecting
169 * performance more than the gain/loss from the algorithm difference.
170 * For N2/RF, block store places data in the L2 cache, so use or non-use
171 * of the FP registers has no effect on L2 cache behavior.
172 * The cost for testing hw_bcopy_limit_* according to different
173 * alignments exceeds 50 cycles for all cases, even when hw_bcopy_limits
174 * were not used. That cost was judged too high relative to the benefits,
175 * so the hw_bcopy_limit option is omitted from this code.
176 */
177
178 /*
179 * Less then or equal this number of bytes we will always copy byte-for-byte
180 */
181 #define SMALL_LIMIT 7
182
183 /*
184 * LOFAULT_SET : Flag set by kzero and kcopy to indicate that t_lofault
185 * handler was set
186 */
187 #define LOFAULT_SET 2
188
189 /*
190 * This define is to align data for the unaligned source cases.
191 * The data1, data2 and data3 is merged into data1 and data2.
192 * The data3 is preserved for next merge.
193 */
194 #define ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp) \
195 sllx data1, lshift, data1 ;\
196 srlx data2, rshift, tmp ;\
197 or data1, tmp, data1 ;\
198 sllx data2, lshift, data2 ;\
199 srlx data3, rshift, tmp ;\
200 or data2, tmp, data2
201 /*
202 * This macro is to align the data. Basically it merges
203 * data1 and data2 to form double word.
204 */
205 #define ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp) \
206 sllx data1, lshift, data1 ;\
207 srlx data2, rshift, tmp ;\
208 or data1, tmp, data1
209
210 #if !defined(NIAGARA_IMPL)
211 /*
212 * Flags set in the lower bits of the t_lofault address:
213 * FPUSED_FLAG: The FP registers were in use and must be restored
214 * LOFAULT_SET: Set for bcopy calls, cleared for kcopy calls
215 * COPY_FLAGS: Both of the above
216 *
217 * Other flags:
218 * KPREEMPT_FLAG: kpreempt needs to be called
219 */
220 #define FPUSED_FLAG 1
221 #define LOFAULT_SET 2
222 #define COPY_FLAGS (FPUSED_FLAG | LOFAULT_SET)
223 #define KPREEMPT_FLAG 4
224
225 #define ALIGN_OFF_1_7 \
226 faligndata %d0, %d2, %d48 ;\
227 faligndata %d2, %d4, %d50 ;\
228 faligndata %d4, %d6, %d52 ;\
229 faligndata %d6, %d8, %d54 ;\
230 faligndata %d8, %d10, %d56 ;\
231 faligndata %d10, %d12, %d58 ;\
232 faligndata %d12, %d14, %d60 ;\
233 faligndata %d14, %d16, %d62
234
235 #define ALIGN_OFF_8_15 \
236 faligndata %d2, %d4, %d48 ;\
237 faligndata %d4, %d6, %d50 ;\
238 faligndata %d6, %d8, %d52 ;\
239 faligndata %d8, %d10, %d54 ;\
240 faligndata %d10, %d12, %d56 ;\
241 faligndata %d12, %d14, %d58 ;\
242 faligndata %d14, %d16, %d60 ;\
243 faligndata %d16, %d18, %d62
244
245 #define ALIGN_OFF_16_23 \
246 faligndata %d4, %d6, %d48 ;\
247 faligndata %d6, %d8, %d50 ;\
248 faligndata %d8, %d10, %d52 ;\
249 faligndata %d10, %d12, %d54 ;\
250 faligndata %d12, %d14, %d56 ;\
251 faligndata %d14, %d16, %d58 ;\
252 faligndata %d16, %d18, %d60 ;\
253 faligndata %d18, %d20, %d62
254
255 #define ALIGN_OFF_24_31 \
256 faligndata %d6, %d8, %d48 ;\
257 faligndata %d8, %d10, %d50 ;\
258 faligndata %d10, %d12, %d52 ;\
259 faligndata %d12, %d14, %d54 ;\
260 faligndata %d14, %d16, %d56 ;\
261 faligndata %d16, %d18, %d58 ;\
262 faligndata %d18, %d20, %d60 ;\
263 faligndata %d20, %d22, %d62
264
265 #define ALIGN_OFF_32_39 \
266 faligndata %d8, %d10, %d48 ;\
267 faligndata %d10, %d12, %d50 ;\
268 faligndata %d12, %d14, %d52 ;\
269 faligndata %d14, %d16, %d54 ;\
270 faligndata %d16, %d18, %d56 ;\
271 faligndata %d18, %d20, %d58 ;\
272 faligndata %d20, %d22, %d60 ;\
273 faligndata %d22, %d24, %d62
274
275 #define ALIGN_OFF_40_47 \
276 faligndata %d10, %d12, %d48 ;\
277 faligndata %d12, %d14, %d50 ;\
278 faligndata %d14, %d16, %d52 ;\
279 faligndata %d16, %d18, %d54 ;\
280 faligndata %d18, %d20, %d56 ;\
281 faligndata %d20, %d22, %d58 ;\
282 faligndata %d22, %d24, %d60 ;\
283 faligndata %d24, %d26, %d62
284
285 #define ALIGN_OFF_48_55 \
286 faligndata %d12, %d14, %d48 ;\
287 faligndata %d14, %d16, %d50 ;\
288 faligndata %d16, %d18, %d52 ;\
289 faligndata %d18, %d20, %d54 ;\
290 faligndata %d20, %d22, %d56 ;\
291 faligndata %d22, %d24, %d58 ;\
292 faligndata %d24, %d26, %d60 ;\
293 faligndata %d26, %d28, %d62
294
295 #define ALIGN_OFF_56_63 \
296 faligndata %d14, %d16, %d48 ;\
297 faligndata %d16, %d18, %d50 ;\
298 faligndata %d18, %d20, %d52 ;\
299 faligndata %d20, %d22, %d54 ;\
300 faligndata %d22, %d24, %d56 ;\
301 faligndata %d24, %d26, %d58 ;\
302 faligndata %d26, %d28, %d60 ;\
303 faligndata %d28, %d30, %d62
304
305 /*
306 * FP_COPY indicates the minimum number of bytes needed
307 * to justify using FP/VIS-accelerated memory operations.
308 * The FPBLK code assumes a minimum number of bytes are available
309 * to be moved on entry. Check that code carefully before
310 * reducing FP_COPY below 256.
311 */
312 #define FP_COPY 584
313 #define SHORTCOPY 7
314 #define ASI_STBI_P ASI_BLK_INIT_ST_QUAD_LDD_P
315 #define ASI_STBI_AIUS ASI_BLK_INIT_QUAD_LDD_AIUS
316 #define CACHE_LINE 64
317 #define VIS_BLOCKSIZE 64
318
319 /*
320 * Size of stack frame in order to accomodate a 64-byte aligned
321 * floating-point register save area and 2 64-bit temp locations.
322 * All copy functions use three quadrants of fp registers; to assure a
323 * block-aligned three block buffer in which to save we must reserve
324 * four blocks on stack.
325 *
326 * _______________________________________ <-- %fp + STACK_BIAS
327 * | We may need to preserve 3 quadrants |
328 * | of fp regs, but since we do so with |
329 * | BST/BLD we need room in which to |
330 * | align to VIS_BLOCKSIZE bytes. So |
331 * | this area is 4 * VIS_BLOCKSIZE. | <-- - SAVED_FPREGS_OFFSET
332 * |-------------------------------------|
333 * | 8 bytes to save %fprs | <-- - SAVED_FPRS_OFFSET
334 * |-------------------------------------|
335 * | 8 bytes to save %gsr | <-- - SAVED_GSR_OFFSET
336 * ---------------------------------------
337 */
338 #define HWCOPYFRAMESIZE ((VIS_BLOCKSIZE * (3 + 1)) + (2 * 8))
339 #define SAVED_FPREGS_OFFSET (VIS_BLOCKSIZE * 4)
340 #define SAVED_FPREGS_ADJUST ((VIS_BLOCKSIZE * 3) + 1)
341 #define SAVED_FPRS_OFFSET (SAVED_FPREGS_OFFSET + 8)
342 #define SAVED_GSR_OFFSET (SAVED_FPRS_OFFSET + 8)
343
344 /*
345 * In FP copies if we do not have preserved data to restore over
346 * the fp regs we used then we must zero those regs to avoid
347 * exposing portions of the data to later threads (data security).
348 */
349 #define FZERO \
350 fzero %f0 ;\
351 fzero %f2 ;\
352 faddd %f0, %f2, %f4 ;\
353 fmuld %f0, %f2, %f6 ;\
354 faddd %f0, %f2, %f8 ;\
355 fmuld %f0, %f2, %f10 ;\
356 faddd %f0, %f2, %f12 ;\
357 fmuld %f0, %f2, %f14 ;\
358 faddd %f0, %f2, %f16 ;\
359 fmuld %f0, %f2, %f18 ;\
360 faddd %f0, %f2, %f20 ;\
361 fmuld %f0, %f2, %f22 ;\
362 faddd %f0, %f2, %f24 ;\
363 fmuld %f0, %f2, %f26 ;\
364 faddd %f0, %f2, %f28 ;\
365 fmuld %f0, %f2, %f30 ;\
366 faddd %f0, %f2, %f48 ;\
367 fmuld %f0, %f2, %f50 ;\
368 faddd %f0, %f2, %f52 ;\
369 fmuld %f0, %f2, %f54 ;\
370 faddd %f0, %f2, %f56 ;\
371 fmuld %f0, %f2, %f58 ;\
372 faddd %f0, %f2, %f60 ;\
373 fmuld %f0, %f2, %f62
374
375 /*
376 * Macros to save and restore fp registers to/from the stack.
377 * Used to save and restore in-use fp registers when we want to use FP.
378 */
379 #define BST_FP_TOSTACK(tmp1) \
380 /* membar #Sync */ ;\
381 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
382 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
383 stda %f0, [tmp1]ASI_BLK_P ;\
384 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
385 stda %f16, [tmp1]ASI_BLK_P ;\
386 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
387 stda %f48, [tmp1]ASI_BLK_P ;\
388 membar #Sync
389
390 #define BLD_FP_FROMSTACK(tmp1) \
391 /* membar #Sync - provided at copy completion */ ;\
392 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
393 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
394 ldda [tmp1]ASI_BLK_P, %f0 ;\
395 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
396 ldda [tmp1]ASI_BLK_P, %f16 ;\
397 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
398 ldda [tmp1]ASI_BLK_P, %f48 ;\
399 membar #Sync
400
401 #endif
402 /*
403 * Copy a block of storage, returning an error code if `from' or
404 * `to' takes a kernel pagefault which cannot be resolved.
405 * Returns errno value on pagefault error, 0 if all ok
406 */
407
408 .seg ".text"
409 .align 4
410
411 ENTRY(kcopy)
412 #if !defined(NIAGARA_IMPL)
413 cmp %o2, FP_COPY ! check for small copy/leaf case
414 bgt,pt %ncc, .kcopy_more !
415 nop
416 .kcopy_small: ! setup error handler
417 sethi %hi(.sm_copyerr), %o4
418 or %o4, %lo(.sm_copyerr), %o4 ! .sm_copyerr is lofault value
419 ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler
420 ! Note that we carefully do *not* flag the setting of
421 ! t_lofault.
422 membar #Sync ! sync error barrier
423 b .sm_do_copy ! common code
424 stn %o4, [THREAD_REG + T_LOFAULT] ! set t_lofault
425
426
427 .kcopy_more:
428 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
429 sethi %hi(.copyerr), %l7 ! copyerr is lofault value
430 or %l7, %lo(.copyerr), %l7
431 ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler
432 ! Note that we carefully do *not* flag the setting of
433 ! t_lofault.
434 membar #Sync ! sync error barrier
435 b .do_copy ! common code
436 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
437
438 /*
439 * We got here because of a fault during a small kcopy or bcopy.
440 * if a fault handler existed when bcopy was called.
441 * No floating point registers are used by the small copies.
442 * Small copies are from a leaf routine
443 * Errno value is in %g1.
444 */
445 .sm_copyerr:
446 ! The kcopy will always set a t_lofault handler. If it fires,
447 ! we're expected to just return the error code and not to
448 ! invoke any existing error handler. As far as bcopy is concerned,
449 ! we only set t_lofault if there was an existing lofault handler.
450 ! In that case we're expected to invoke the previously existing
451 ! handler after resetting the t_lofault value.
452 btst LOFAULT_SET, %o5
453 membar #Sync ! sync error barrier
454 andn %o5, LOFAULT_SET, %o5 ! clear fault flag
455 bnz,pn %ncc, 3f
456 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
457 retl
458 mov %g1, %o0
459 3:
460 ! We're here via bcopy. There must have been an error handler
461 ! in place otherwise we would have died a nasty death already.
462 jmp %o5 ! goto real handler
463 mov %g0, %o0
464 /*
465 * end of .sm_copyerr
466 */
467
468 /*
469 * We got here because of a fault during kcopy or bcopy if a fault
470 * handler existed when bcopy was called.
471 * stack and fp registers need to be restored
472 * Errno value is in %g1.
473 */
474 .copyerr:
475 sethi %hi(.copyerr2), %l1
476 or %l1, %lo(.copyerr2), %l1
477 membar #Sync ! sync error barrier
478 stn %l1, [THREAD_REG + T_LOFAULT] ! set t_lofault
479 btst FPUSED_FLAG, %o5
480 bz,pt %xcc, 1f
481 and %o5, LOFAULT_SET, %l1 ! copy flag to %l1
482
483 membar #Sync ! sync error barrier
484 wr %l5, 0, %gsr
485 btst FPRS_FEF, %g5
486 bz,pt %icc, 4f
487 nop
488 ! restore fpregs from stack
489 BLD_FP_FROMSTACK(%o2)
490 ba,pt %ncc, 2f
491 wr %g5, 0, %fprs ! restore fprs
492 4:
493 FZERO
494 wr %g5, 0, %fprs ! restore fprs
495 2:
496 ldn [THREAD_REG + T_LWP], %o2
497 brnz,pt %o2, 1f
498 nop
499
500 ldsb [THREAD_REG + T_PREEMPT], %l0
501 deccc %l0
502 bnz,pn %ncc, 1f
503 stb %l0, [THREAD_REG + T_PREEMPT]
504
505 ! Check for a kernel preemption request
506 ldn [THREAD_REG + T_CPU], %l0
507 ldub [%l0 + CPU_KPRUNRUN], %l0
508 brnz,a,pt %l0, 1f ! Need to call kpreempt?
509 or %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag
510
511 ! The kcopy will always set a t_lofault handler. If it fires,
512 ! we're expected to just return the error code and not to
513 ! invoke any existing error handler. As far as bcopy is concerned,
514 ! we only set t_lofault if there was an existing lofault handler.
515 ! In that case we're expected to invoke the previously existing
516 ! handler after resetting the t_lofault value.
517 1:
518 andn %o5, COPY_FLAGS, %o5 ! remove flags from lofault address
519 membar #Sync ! sync error barrier
520 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
521
522 ! call kpreempt if necessary
523 btst KPREEMPT_FLAG, %l1
524 bz,pt %icc, 2f
525 nop
526 call kpreempt
527 rdpr %pil, %o0 ! pass %pil
528 2:
529 btst LOFAULT_SET, %l1
530 bnz,pn %ncc, 3f
531 nop
532 ret
533 restore %g1, 0, %o0
534 3:
535 ! We're here via bcopy. There must have been an error handler
536 ! in place otherwise we would have died a nasty death already.
537 jmp %o5 ! goto real handler
538 restore %g0, 0, %o0 ! dispose of copy window
539
540 /*
541 * We got here because of a fault in .copyerr. We can't safely restore fp
542 * state, so we panic.
543 */
544 fp_panic_msg:
545 .asciz "Unable to restore fp state after copy operation"
546
547 .align 4
548 .copyerr2:
549 set fp_panic_msg, %o0
550 call panic
551 nop
552 /*
553 * end of .copyerr
554 */
555
556 #else /* NIAGARA_IMPL */
557 save %sp, -SA(MINFRAME), %sp
558 set .copyerr, %l7 ! copyerr is lofault value
559 ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler
560 or %o5, LOFAULT_SET, %o5
561 membar #Sync ! sync error barrier
562 b .do_copy ! common code
563 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
564
565 /*
566 * We got here because of a fault during kcopy.
567 * Errno value is in %g1.
568 */
569 .copyerr:
570 ! The kcopy() *always* sets a t_lofault handler and it ORs LOFAULT_SET
571 ! into %o5 to indicate it has set t_lofault handler. Need to clear
572 ! LOFAULT_SET flag before restoring the error handler.
573 andn %o5, LOFAULT_SET, %o5
574 membar #Sync ! sync error barrier
575 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
576 ret
577 restore %g1, 0, %o0
578 #endif /* NIAGARA_IMPL */
579
580 SET_SIZE(kcopy)
581
582
583 /*
584 * Copy a block of storage - must not overlap (from + len <= to).
585 */
586
587 ENTRY(bcopy)
588 #if !defined(NIAGARA_IMPL)
589 cmp %o2, FP_COPY ! check for small copy/leaf case
590 bgt,pt %ncc, .bcopy_more !
591 nop
592 .bcopy_small: ! setup error handler
593 ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler
594 tst %o5
595 bz,pt %icc, .sm_do_copy
596 sethi %hi(.sm_copyerr), %o4
597 or %o4, %lo(.sm_copyerr), %o4 ! .sm_copyerr is lofault value
598 membar #Sync ! sync error barrier
599 stn %o4, [THREAD_REG + T_LOFAULT] ! set t_lofault
600 or %o5, LOFAULT_SET, %o5 ! Error should trampoline
601 .sm_do_copy:
602 mov %o0, %g1 ! save %o0
603 cmp %o2, SHORTCOPY ! make sure there is enough to align
604 ble,pt %ncc, .bc_smallest
605 andcc %o1, 0x7, %o3 ! is dest long aligned
606 bnz,pn %ncc, .bc_align
607 andcc %o1, 1, %o3 ! is dest byte aligned
608
609 ! Destination is long word aligned
610 .bc_al_src:
611 andcc %o0, 7, %o3
612 brnz,pt %o3, .bc_src_dst_unal8
613 nop
614 /*
615 * Special case for handling when src and dest are both long word aligned
616 * and total data to move is less than FP_COPY bytes
617 * Also handles finish up for large block moves, so may be less than 32 bytes
618 */
619 .bc_medlong:
620 subcc %o2, 31, %o2 ! adjust length to allow cc test
621 ble,pt %ncc, .bc_medl31
622 nop
623 .bc_medl32:
624 ldx [%o0], %o4 ! move 32 bytes
625 subcc %o2, 32, %o2 ! decrement length count by 32
626 stx %o4, [%o1]
627 ldx [%o0+8], %o4
628 stx %o4, [%o1+8]
629 ldx [%o0+16], %o4
630 add %o0, 32, %o0 ! increase src ptr by 32
631 stx %o4, [%o1+16]
632 ldx [%o0-8], %o4
633 add %o1, 32, %o1 ! increase dst ptr by 32
634 bgu,pt %ncc, .bc_medl32 ! repeat if at least 32 bytes left
635 stx %o4, [%o1-8]
636 .bc_medl31:
637 addcc %o2, 24, %o2 ! adjust count to be off by 7
638 ble,pt %ncc, .bc_medl7 ! skip if 7 or fewer bytes left
639 nop
640 .bc_medl8:
641 ldx [%o0], %o4 ! move 8 bytes
642 add %o0, 8, %o0 ! increase src ptr by 8
643 subcc %o2, 8, %o2 ! decrease count by 8
644 add %o1, 8, %o1 ! increase dst ptr by 8
645 bgu,pt %ncc, .bc_medl8
646 stx %o4, [%o1-8]
647 .bc_medl7:
648 addcc %o2, 7, %o2 ! finish adjustment of remaining count
649 bnz,pt %ncc, .bc_small4 ! do final bytes if not finished
650
651 .bc_smallx: ! finish up and exit
652 tst %o5
653 bz,pt %ncc, .bc_sm_done
654 andn %o5, COPY_FLAGS, %o5 ! remove flags from lofault address
655 membar #Sync ! sync error barrier
656 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
657 .bc_sm_done:
658 retl
659 mov %g0, %o0
660
661 .bc_small4:
662 cmp %o2, 4
663 blt,pt %ncc, .bc_small3x ! skip if less than 4 bytes left
664 nop !
665 ld [%o0], %o4 ! move 4 bytes
666 add %o0, 4, %o0 ! increase src ptr by 4
667 add %o1, 4, %o1 ! increase dst ptr by 4
668 subcc %o2, 4, %o2 ! decrease count by 4
669 bz,pt %ncc, .bc_smallx
670 stw %o4, [%o1-4]
671
672 .bc_small3x: ! Exactly 1, 2, or 3 bytes remain
673 subcc %o2, 1, %o2 ! reduce count for cc test
674 ldub [%o0], %o4 ! load one byte
675 bz,pt %ncc, .bc_smallx
676 stb %o4, [%o1] ! store one byte
677 ldub [%o0+1], %o4 ! load second byte
678 subcc %o2, 1, %o2
679 bz,pt %ncc, .bc_smallx
680 stb %o4, [%o1+1] ! store second byte
681 ldub [%o0+2], %o4 ! load third byte
682 ba .bc_smallx
683 stb %o4, [%o1+2] ! store third byte
684
685 .bc_smallest: ! 7 or fewer bytes remain
686 tst %o2
687 bz,pt %ncc, .bc_smallx
688 cmp %o2, 4
689 blt,pt %ncc, .bc_small3x
690 nop
691 ldub [%o0], %o4 ! read byte
692 subcc %o2, 4, %o2 ! reduce count by 4
693 stb %o4, [%o1] ! write byte
694 ldub [%o0+1], %o4 ! repeat for total of 4 bytes
695 add %o0, 4, %o0 ! advance src by 4
696 stb %o4, [%o1+1]
697 ldub [%o0-2], %o4
698 add %o1, 4, %o1 ! advance dst by 4
699 stb %o4, [%o1-2]
700 ldub [%o0-1], %o4
701 bnz,pt %ncc, .bc_small3x
702 stb %o4, [%o1-1]
703 ba .bc_smallx
704 nop
705
706 /*
707 * Align destination to long word boundary
708 */
709 .bc_align: ! byte align test in prior branch delay
710 bnz,pt %ncc, .bc_al_d1
711 .bc_al_d1f: ! dest is now half word aligned
712 andcc %o1, 2, %o3
713 bnz,pt %ncc, .bc_al_d2
714 .bc_al_d2f: ! dest is now word aligned
715 andcc %o1, 4, %o3 ! is dest longword aligned?
716 bz,pt %ncc, .bc_al_src
717 nop
718 .bc_al_d4: ! dest is word aligned; src is unknown
719 ldub [%o0], %o4 ! move a word (src align unknown)
720 ldub [%o0+1], %o3
721 sll %o4, 24, %o4 ! position
722 sll %o3, 16, %o3 ! position
723 or %o4, %o3, %o3 ! merge
724 ldub [%o0+2], %o4
725 sll %o4, 8, %o4 ! position
726 or %o4, %o3, %o3 ! merge
727 ldub [%o0+3], %o4
728 or %o4, %o3, %o4 ! merge
729 stw %o4,[%o1] ! store four bytes
730 add %o0, 4, %o0 ! adjust src by 4
731 add %o1, 4, %o1 ! adjust dest by 4
732 sub %o2, 4, %o2 ! adjust count by 4
733 andcc %o0, 7, %o3 ! check for src long word alignment
734 brz,pt %o3, .bc_medlong
735 .bc_src_dst_unal8:
736 ! dst is 8-byte aligned, src is not
737 ! Size is less than FP_COPY
738 ! Following code is to select for alignment
739 andcc %o0, 0x3, %o3 ! test word alignment
740 bz,pt %ncc, .bc_medword
741 nop
742 andcc %o0, 0x1, %o3 ! test halfword alignment
743 bnz,pt %ncc, .bc_med_byte ! go to byte move if not halfword
744 andcc %o0, 0x2, %o3 ! test which byte alignment
745 ba .bc_medhalf
746 nop
747 .bc_al_d1: ! align dest to half word
748 ldub [%o0], %o4 ! move a byte
749 add %o0, 1, %o0
750 stb %o4, [%o1]
751 add %o1, 1, %o1
752 andcc %o1, 2, %o3
753 bz,pt %ncc, .bc_al_d2f
754 sub %o2, 1, %o2
755 .bc_al_d2: ! align dest to word
756 ldub [%o0], %o4 ! move a half-word (src align unknown)
757 ldub [%o0+1], %o3
758 sll %o4, 8, %o4 ! position
759 or %o4, %o3, %o4 ! merge
760 sth %o4, [%o1]
761 add %o0, 2, %o0
762 add %o1, 2, %o1
763 andcc %o1, 4, %o3 ! is dest longword aligned?
764 bz,pt %ncc, .bc_al_src
765 sub %o2, 2, %o2
766 ba .bc_al_d4
767 nop
768 /*
769 * Handle all cases where src and dest are aligned on word
770 * boundaries. Use unrolled loops for better performance.
771 * This option wins over standard large data move when
772 * source and destination is in cache for medium
773 * to short data moves.
774 */
775 .bc_medword:
776 subcc %o2, 31, %o2 ! adjust length to allow cc test
777 ble,pt %ncc, .bc_medw31
778 nop
779 .bc_medw32:
780 ld [%o0], %o4 ! move a block of 32 bytes
781 stw %o4, [%o1]
782 ld [%o0+4], %o4
783 stw %o4, [%o1+4]
784 ld [%o0+8], %o4
785 stw %o4, [%o1+8]
786 ld [%o0+12], %o4
787 stw %o4, [%o1+12]
788 ld [%o0+16], %o4
789 stw %o4, [%o1+16]
790 ld [%o0+20], %o4
791 subcc %o2, 32, %o2 ! decrement length count
792 stw %o4, [%o1+20]
793 ld [%o0+24], %o4
794 add %o0, 32, %o0 ! increase src ptr by 32
795 stw %o4, [%o1+24]
796 ld [%o0-4], %o4
797 add %o1, 32, %o1 ! increase dst ptr by 32
798 bgu,pt %ncc, .bc_medw32 ! repeat if at least 32 bytes left
799 stw %o4, [%o1-4]
800 .bc_medw31:
801 addcc %o2, 24, %o2 ! adjust count to be off by 7
802 ble,pt %ncc, .bc_medw7 ! skip if 7 or fewer bytes left
803 nop !
804 .bc_medw15:
805 ld [%o0], %o4 ! move a block of 8 bytes
806 subcc %o2, 8, %o2 ! decrement length count
807 stw %o4, [%o1]
808 add %o0, 8, %o0 ! increase src ptr by 8
809 ld [%o0-4], %o4
810 add %o1, 8, %o1 ! increase dst ptr by 8
811 bgu,pt %ncc, .bc_medw15
812 stw %o4, [%o1-4]
813 .bc_medw7:
814 addcc %o2, 7, %o2 ! finish adjustment of remaining count
815 bz,pt %ncc, .bc_smallx ! exit if finished
816 cmp %o2, 4
817 blt,pt %ncc, .bc_small3x ! skip if less than 4 bytes left
818 nop !
819 ld [%o0], %o4 ! move 4 bytes
820 add %o0, 4, %o0 ! increase src ptr by 4
821 add %o1, 4, %o1 ! increase dst ptr by 4
822 subcc %o2, 4, %o2 ! decrease count by 4
823 bnz .bc_small3x
824 stw %o4, [%o1-4]
825 ba .bc_smallx
826 nop
827
828 .bc_medhalf:
829 subcc %o2, 31, %o2 ! adjust length to allow cc test
830 ble,pt %ncc, .bc_medh31
831 nop
832 .bc_medh32: ! load and store block of 32 bytes
833 subcc %o2, 32, %o2 ! decrement length count
834
835 lduh [%o0], %o4 ! move 32 bytes
836 lduw [%o0+2], %o3
837 sllx %o4, 48, %o4
838 sllx %o3, 16, %o3
839 or %o4, %o3, %o3
840 lduh [%o0+6], %o4
841 or %o4, %o3, %o4
842 stx %o4, [%o1]
843
844 lduh [%o0+8], %o4
845 lduw [%o0+10], %o3
846 sllx %o4, 48, %o4
847 sllx %o3, 16, %o3
848 or %o4, %o3, %o3
849 lduh [%o0+14], %o4
850 or %o4, %o3, %o4
851 stx %o4, [%o1+8]
852
853 lduh [%o0+16], %o4
854 lduw [%o0+18], %o3
855 sllx %o4, 48, %o4
856 sllx %o3, 16, %o3
857 or %o4, %o3, %o3
858 lduh [%o0+22], %o4
859 or %o4, %o3, %o4
860 stx %o4, [%o1+16]
861
862 add %o0, 32, %o0 ! increase src ptr by 32
863 add %o1, 32, %o1 ! increase dst ptr by 32
864
865 lduh [%o0-8], %o4
866 lduw [%o0-6], %o3
867 sllx %o4, 48, %o4
868 sllx %o3, 16, %o3
869 or %o4, %o3, %o3
870 lduh [%o0-2], %o4
871 or %o3, %o4, %o4
872 bgu,pt %ncc, .bc_medh32 ! repeat if at least 32 bytes left
873 stx %o4, [%o1-8]
874
875 .bc_medh31:
876 addcc %o2, 24, %o2 ! adjust count to be off by 7
877 ble,pt %ncc, .bc_medh7 ! skip if 7 or fewer bytes left
878 nop !
879 .bc_medh15:
880 lduh [%o0], %o4 ! move 16 bytes
881 subcc %o2, 8, %o2 ! decrement length count
882 lduw [%o0+2], %o3
883 sllx %o4, 48, %o4
884 sllx %o3, 16, %o3
885 or %o4, %o3, %o3
886 add %o1, 8, %o1 ! increase dst ptr by 8
887 lduh [%o0+6], %o4
888 add %o0, 8, %o0 ! increase src ptr by 8
889 or %o4, %o3, %o4
890 bgu,pt %ncc, .bc_medh15
891 stx %o4, [%o1-8]
892 .bc_medh7:
893 addcc %o2, 7, %o2 ! finish adjustment of remaining count
894 bz,pt %ncc, .bc_smallx ! exit if finished
895 cmp %o2, 4
896 blt,pt %ncc, .bc_small3x ! skip if less than 4 bytes left
897 nop !
898 lduh [%o0], %o4
899 sll %o4, 16, %o4
900 lduh [%o0+2], %o3
901 or %o3, %o4, %o4
902 subcc %o2, 4, %o2
903 add %o0, 4, %o0
904 add %o1, 4, %o1
905 bnz .bc_small3x
906 stw %o4, [%o1-4]
907 ba .bc_smallx
908 nop
909
910 .align 16
911 .bc_med_byte:
912 bnz,pt %ncc, .bc_medbh32a ! go to correct byte move
913 subcc %o2, 31, %o2 ! adjust length to allow cc test
914 ble,pt %ncc, .bc_medb31
915 nop
916 .bc_medb32: ! Alignment 1 or 5
917 subcc %o2, 32, %o2 ! decrement length count
918
919 ldub [%o0], %o4 ! load and store a block of 32 bytes
920 sllx %o4, 56, %o3
921 lduh [%o0+1], %o4
922 sllx %o4, 40, %o4
923 or %o4, %o3, %o3
924 lduw [%o0+3], %o4
925 sllx %o4, 8, %o4
926 or %o4, %o3, %o3
927 ldub [%o0+7], %o4
928 or %o4, %o3, %o4
929 stx %o4, [%o1]
930
931 ldub [%o0+8], %o4
932 sllx %o4, 56, %o3
933 lduh [%o0+9], %o4
934 sllx %o4, 40, %o4
935 or %o4, %o3, %o3
936 lduw [%o0+11], %o4
937 sllx %o4, 8, %o4
938 or %o4, %o3, %o3
939 ldub [%o0+15], %o4
940 or %o4, %o3, %o4
941 stx %o4, [%o1+8]
942
943 ldub [%o0+16], %o4
944 sllx %o4, 56, %o3
945 lduh [%o0+17], %o4
946 sllx %o4, 40, %o4
947 or %o4, %o3, %o3
948 lduw [%o0+19], %o4
949 sllx %o4, 8, %o4
950 or %o4, %o3, %o3
951 ldub [%o0+23], %o4
952 or %o4, %o3, %o4
953 stx %o4, [%o1+16]
954
955 add %o0, 32, %o0 ! increase src ptr by 32
956 add %o1, 32, %o1 ! increase dst ptr by 32
957
958 ldub [%o0-8], %o4
959 sllx %o4, 56, %o3
960 lduh [%o0-7], %o4
961 sllx %o4, 40, %o4
962 or %o4, %o3, %o3
963 lduw [%o0-5], %o4
964 sllx %o4, 8, %o4
965 or %o4, %o3, %o3
966 ldub [%o0-1], %o4
967 or %o4, %o3, %o4
968 bgu,pt %ncc, .bc_medb32 ! repeat if at least 32 bytes left
969 stx %o4, [%o1-8]
970
971 .bc_medb31: ! 31 or fewer bytes remaining
972 addcc %o2, 24, %o2 ! adjust count to be off by 7
973 ble,pt %ncc, .bc_medb7 ! skip if 7 or fewer bytes left
974 nop !
975 .bc_medb15:
976
977 ldub [%o0], %o4 ! load and store a block of 8 bytes
978 subcc %o2, 8, %o2 ! decrement length count
979 sllx %o4, 56, %o3
980 lduh [%o0+1], %o4
981 sllx %o4, 40, %o4
982 or %o4, %o3, %o3
983 lduw [%o0+3], %o4
984 add %o1, 8, %o1 ! increase dst ptr by 16
985 sllx %o4, 8, %o4
986 or %o4, %o3, %o3
987 ldub [%o0+7], %o4
988 add %o0, 8, %o0 ! increase src ptr by 16
989 or %o4, %o3, %o4
990 bgu,pt %ncc, .bc_medb15
991 stx %o4, [%o1-8]
992 .bc_medb7:
993 addcc %o2, 7, %o2 ! finish adjustment of remaining count
994 bz,pt %ncc, .bc_smallx ! exit if finished
995 cmp %o2, 4
996 blt,pt %ncc, .bc_small3x ! skip if less than 4 bytes left
997 nop !
998 ldub [%o0], %o4 ! move 4 bytes
999 sll %o4, 24, %o3
1000 lduh [%o0+1], %o4
1001 sll %o4, 8, %o4
1002 or %o4, %o3, %o3
1003 ldub [%o0+3], %o4
1004 or %o4, %o3, %o4
1005 subcc %o2, 4, %o2
1006 add %o0, 4, %o0
1007 add %o1, 4, %o1
1008 bnz .bc_small3x
1009 stw %o4, [%o1-4]
1010 ba .bc_smallx
1011 nop
1012
1013 .align 16
1014 .bc_medbh32a: ! Alignment 3 or 7
1015 ble,pt %ncc, .bc_medbh31
1016 nop
1017 .bc_medbh32: ! Alignment 3 or 7
1018 subcc %o2, 32, %o2 ! decrement length count
1019
1020 ldub [%o0], %o4 ! load and store a block of 32 bytes
1021 sllx %o4, 56, %o3
1022 lduw [%o0+1], %o4
1023 sllx %o4, 24, %o4
1024 or %o4, %o3, %o3
1025 lduh [%o0+5], %o4
1026 sllx %o4, 8, %o4
1027 or %o4, %o3, %o3
1028 ldub [%o0+7], %o4
1029 or %o4, %o3, %o4
1030 stx %o4, [%o1]
1031
1032 ldub [%o0+8], %o4
1033 sllx %o4, 56, %o3
1034 lduw [%o0+9], %o4
1035 sllx %o4, 24, %o4
1036 or %o4, %o3, %o3
1037 lduh [%o0+13], %o4
1038 sllx %o4, 8, %o4
1039 or %o4, %o3, %o3
1040 ldub [%o0+15], %o4
1041 or %o4, %o3, %o4
1042 stx %o4, [%o1+8]
1043
1044 ldub [%o0+16], %o4
1045 sllx %o4, 56, %o3
1046 lduw [%o0+17], %o4
1047 sllx %o4, 24, %o4
1048 or %o4, %o3, %o3
1049 lduh [%o0+21], %o4
1050 sllx %o4, 8, %o4
1051 or %o4, %o3, %o3
1052 ldub [%o0+23], %o4
1053 or %o4, %o3, %o4
1054 stx %o4, [%o1+16]
1055
1056 add %o0, 32, %o0 ! increase src ptr by 32
1057 add %o1, 32, %o1 ! increase dst ptr by 32
1058
1059 ldub [%o0-8], %o4
1060 sllx %o4, 56, %o3
1061 lduw [%o0-7], %o4
1062 sllx %o4, 24, %o4
1063 or %o4, %o3, %o3
1064 lduh [%o0-3], %o4
1065 sllx %o4, 8, %o4
1066 or %o4, %o3, %o3
1067 ldub [%o0-1], %o4
1068 or %o4, %o3, %o4
1069 bgu,pt %ncc, .bc_medbh32 ! repeat if at least 32 bytes left
1070 stx %o4, [%o1-8]
1071
1072 .bc_medbh31:
1073 addcc %o2, 24, %o2 ! adjust count to be off by 7
1074 ble,pt %ncc, .bc_medb7 ! skip if 7 or fewer bytes left
1075 nop !
1076 .bc_medbh15:
1077 ldub [%o0], %o4 ! load and store a block of 8 bytes
1078 sllx %o4, 56, %o3
1079 lduw [%o0+1], %o4
1080 sllx %o4, 24, %o4
1081 or %o4, %o3, %o3
1082 lduh [%o0+5], %o4
1083 sllx %o4, 8, %o4
1084 or %o4, %o3, %o3
1085 ldub [%o0+7], %o4
1086 or %o4, %o3, %o4
1087 stx %o4, [%o1]
1088 subcc %o2, 8, %o2 ! decrement length count
1089 add %o1, 8, %o1 ! increase dst ptr by 8
1090 add %o0, 8, %o0 ! increase src ptr by 8
1091 bgu,pt %ncc, .bc_medbh15
1092 stx %o4, [%o1-8]
1093 ba .bc_medb7
1094 nop
1095
1096 SET_SIZE(bcopy)
1097 /*
1098 * The _more entry points are not intended to be used directly by
1099 * any caller from outside this file. They are provided to allow
1100 * profiling and dtrace of the portions of the copy code that uses
1101 * the floating point registers.
1102 */
1103 ENTRY(bcopy_more)
1104 .bcopy_more:
1105 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1106 ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler
1107 brz,pt %o5, .do_copy
1108 nop
1109 sethi %hi(.copyerr), %l7 ! copyerr is lofault value
1110 or %l7, %lo(.copyerr), %l7
1111 membar #Sync ! sync error barrier
1112 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
1113 ! We've already captured whether t_lofault was zero on entry.
1114 ! We need to mark ourselves as being from bcopy since both
1115 ! kcopy and bcopy use the same code path. If LOFAULT_SET is
1116 ! set and the saved lofault was zero, we won't reset lofault on
1117 ! returning.
1118 or %o5, LOFAULT_SET, %o5
1119 .do_copy:
1120 ldn [THREAD_REG + T_LWP], %o3
1121 brnz,pt %o3, 1f
1122 nop
1123 /*
1124 * kpreempt_disable();
1125 */
1126 ldsb [THREAD_REG +T_PREEMPT], %o3
1127 inc %o3
1128 stb %o3, [THREAD_REG + T_PREEMPT]
1129 1:
1130 /*
1131 * Following code is for large copies. We know there is at
1132 * least FP_COPY bytes available. FP regs are used, so
1133 * we save registers and fp regs before starting
1134 */
1135 rd %fprs, %g5 ! check for unused fp
1136 or %o5,FPUSED_FLAG,%o5
1137 ! if fprs.fef == 0, set it.
1138 ! Setting it when already set costs more than checking
1139 andcc %g5, FPRS_FEF, %g5 ! test FEF, fprs.du = fprs.dl = 0
1140 bz,pt %ncc, .bc_fp_unused
1141 prefetch [%i0 + (1 * CACHE_LINE)], #one_read
1142 BST_FP_TOSTACK(%o3)
1143 ba .bc_fp_ready
1144 .bc_fp_unused:
1145 andcc %i1, 1, %o3 ! is dest byte aligned
1146 wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
1147 .bc_fp_ready:
1148 rd %gsr, %l5 ! save %gsr value
1149 bnz,pt %ncc, .bc_big_d1
1150 .bc_big_d1f: ! dest is now half word aligned
1151 andcc %i1, 2, %o3
1152 bnz,pt %ncc, .bc_big_d2
1153 .bc_big_d2f: ! dest is now word aligned
1154 andcc %i1, 4, %o3
1155 bnz,pt %ncc, .bc_big_d4
1156 .bc_big_d4f: ! dest is now long word aligned
1157 andcc %i0, 7, %o3 ! is src long word aligned
1158 brnz,pt %o3, .bc_big_unal8
1159 prefetch [%i0 + (2 * CACHE_LINE)], #one_read
1160
1161 ! Src and dst are long word aligned
1162 ! align dst to 64 byte boundary
1163 andcc %i1, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned
1164 brz,pn %o3, .bc_al_to_64
1165 nop
1166 sub %o3, 64, %o3 ! %o3 has negative bytes to move
1167 add %i2, %o3, %i2 ! adjust remaining count
1168 andcc %o3, 8, %o4 ! odd long words to move?
1169 brz,pt %o4, .bc_al_to_16
1170 nop
1171 add %o3, 8, %o3
1172 ldx [%i0], %o4
1173 add %i0, 8, %i0 ! increment src ptr
1174 add %i1, 8, %i1 ! increment dst ptr
1175 stx %o4, [%i1-8]
1176 ! Dest is aligned on 16 bytes, src 8 byte aligned
1177 .bc_al_to_16:
1178 andcc %o3, 0x30, %o4 ! pair of long words to move?
1179 brz,pt %o4, .bc_al_to_64
1180 nop
1181 .bc_al_mv_16:
1182 add %o3, 16, %o3
1183 ldx [%i0], %o4
1184 stx %o4, [%i1]
1185 ldx [%i0+8], %o4
1186 add %i0, 16, %i0 ! increment src ptr
1187 stx %o4, [%i1+8]
1188 andcc %o3, 48, %o4
1189 brnz,pt %o4, .bc_al_mv_16
1190 add %i1, 16, %i1 ! increment dst ptr
1191 ! Dest is aligned on 64 bytes, src 8 byte aligned
1192 .bc_al_to_64:
1193 ! Determine source alignment
1194 ! to correct 8 byte offset
1195 andcc %i0, 32, %o3
1196 brnz,pn %o3, .bc_aln_1
1197 andcc %i0, 16, %o3
1198 brnz,pn %o3, .bc_aln_01
1199 andcc %i0, 8, %o3
1200 brz,pn %o3, .bc_aln_000
1201 prefetch [%i0 + (3 * CACHE_LINE)], #one_read
1202 ba .bc_aln_001
1203 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1204
1205 .bc_aln_01:
1206 brnz,pn %o3, .bc_aln_011
1207 prefetch [%i0 + (3 * CACHE_LINE)], #one_read
1208 ba .bc_aln_010
1209 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1210 .bc_aln_1:
1211 andcc %i0, 16, %o3
1212 brnz,pn %o3, .bc_aln_11
1213 andcc %i0, 8, %o3
1214 brnz,pn %o3, .bc_aln_101
1215 prefetch [%i0 + (3 * CACHE_LINE)], #one_read
1216 ba .bc_aln_100
1217 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1218 .bc_aln_11:
1219 brz,pn %o3, .bc_aln_110
1220 prefetch [%i0 + (3 * CACHE_LINE)], #one_read
1221
1222 .bc_aln_111:
1223 ! Alignment off by 8 bytes
1224 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1225 ldd [%i0], %d0
1226 add %i0, 8, %i0
1227 sub %i2, 8, %i2
1228 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
1229 and %i2, 0x7f, %i2 ! residue bytes in %i2
1230 sub %i1, %i0, %i1
1231 .bc_aln_111_loop:
1232 ldda [%i0]ASI_BLK_P,%d16 ! block load
1233 subcc %o3, 64, %o3
1234 fmovd %d16, %d2
1235 fmovd %d18, %d4
1236 fmovd %d20, %d6
1237 fmovd %d22, %d8
1238 fmovd %d24, %d10
1239 fmovd %d26, %d12
1240 fmovd %d28, %d14
1241 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1242 stda %d0,[%i0+%i1]ASI_BLK_P
1243 add %i0, 64, %i0
1244 fmovd %d30, %d0
1245 bgt,pt %ncc, .bc_aln_111_loop
1246 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1247 add %i1, %i0, %i1
1248
1249 std %d0, [%i1]
1250 ba .bc_remain_stuff
1251 add %i1, 8, %i1
1252 ! END OF aln_111
1253
1254 .bc_aln_110:
1255 ! Alignment off by 16 bytes
1256 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1257 ldd [%i0], %d0
1258 ldd [%i0+8], %d2
1259 add %i0, 16, %i0
1260 sub %i2, 16, %i2
1261 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
1262 and %i2, 0x7f, %i2 ! residue bytes in %i2
1263 sub %i1, %i0, %i1
1264 .bc_aln_110_loop:
1265 ldda [%i0]ASI_BLK_P,%d16 ! block load
1266 subcc %o3, 64, %o3
1267 fmovd %d16, %d4
1268 fmovd %d18, %d6
1269 fmovd %d20, %d8
1270 fmovd %d22, %d10
1271 fmovd %d24, %d12
1272 fmovd %d26, %d14
1273 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1274 stda %d0,[%i0+%i1]ASI_BLK_P
1275 add %i0, 64, %i0
1276 fmovd %d28, %d0
1277 fmovd %d30, %d2
1278 bgt,pt %ncc, .bc_aln_110_loop
1279 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1280 add %i1, %i0, %i1
1281
1282 std %d0, [%i1]
1283 std %d2, [%i1+8]
1284 ba .bc_remain_stuff
1285 add %i1, 16, %i1
1286 ! END OF aln_110
1287
1288 .bc_aln_101:
1289 ! Alignment off by 24 bytes
1290 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1291 ldd [%i0], %d0
1292 ldd [%i0+8], %d2
1293 ldd [%i0+16], %d4
1294 add %i0, 24, %i0
1295 sub %i2, 24, %i2
1296 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
1297 and %i2, 0x7f, %i2 ! residue bytes in %i2
1298 sub %i1, %i0, %i1
1299 .bc_aln_101_loop:
1300 ldda [%i0]ASI_BLK_P,%d16 ! block load
1301 subcc %o3, 64, %o3
1302 fmovd %d16, %d6
1303 fmovd %d18, %d8
1304 fmovd %d20, %d10
1305 fmovd %d22, %d12
1306 fmovd %d24, %d14
1307 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1308 stda %d0,[%i0+%i1]ASI_BLK_P
1309 add %i0, 64, %i0
1310 fmovd %d26, %d0
1311 fmovd %d28, %d2
1312 fmovd %d30, %d4
1313 bgt,pt %ncc, .bc_aln_101_loop
1314 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1315 add %i1, %i0, %i1
1316
1317 std %d0, [%i1]
1318 std %d2, [%i1+8]
1319 std %d4, [%i1+16]
1320 ba .bc_remain_stuff
1321 add %i1, 24, %i1
1322 ! END OF aln_101
1323
1324 .bc_aln_100:
1325 ! Alignment off by 32 bytes
1326 ldd [%i0], %d0
1327 ldd [%i0+8], %d2
1328 ldd [%i0+16],%d4
1329 ldd [%i0+24],%d6
1330 add %i0, 32, %i0
1331 sub %i2, 32, %i2
1332 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
1333 and %i2, 0x7f, %i2 ! residue bytes in %i2
1334 sub %i1, %i0, %i1
1335 .bc_aln_100_loop:
1336 ldda [%i0]ASI_BLK_P,%d16 ! block load
1337 subcc %o3, 64, %o3
1338 fmovd %d16, %d8
1339 fmovd %d18, %d10
1340 fmovd %d20, %d12
1341 fmovd %d22, %d14
1342 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1343 stda %d0,[%i0+%i1]ASI_BLK_P
1344 add %i0, 64, %i0
1345 fmovd %d24, %d0
1346 fmovd %d26, %d2
1347 fmovd %d28, %d4
1348 fmovd %d30, %d6
1349 bgt,pt %ncc, .bc_aln_100_loop
1350 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1351 add %i1, %i0, %i1
1352
1353 std %d0, [%i1]
1354 std %d2, [%i1+8]
1355 std %d4, [%i1+16]
1356 std %d6, [%i1+24]
1357 ba .bc_remain_stuff
1358 add %i1, 32, %i1
1359 ! END OF aln_100
1360
1361 .bc_aln_011:
1362 ! Alignment off by 40 bytes
1363 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1364 ldd [%i0], %d0
1365 ldd [%i0+8], %d2
1366 ldd [%i0+16], %d4
1367 ldd [%i0+24], %d6
1368 ldd [%i0+32], %d8
1369 add %i0, 40, %i0
1370 sub %i2, 40, %i2
1371 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
1372 and %i2, 0x7f, %i2 ! residue bytes in %i2
1373 sub %i1, %i0, %i1
1374 .bc_aln_011_loop:
1375 ldda [%i0]ASI_BLK_P,%d16 ! block load
1376 subcc %o3, 64, %o3
1377 fmovd %d16, %d10
1378 fmovd %d18, %d12
1379 fmovd %d20, %d14
1380 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1381 stda %d0,[%i0+%i1]ASI_BLK_P
1382 add %i0, 64, %i0
1383 fmovd %d22, %d0
1384 fmovd %d24, %d2
1385 fmovd %d26, %d4
1386 fmovd %d28, %d6
1387 fmovd %d30, %d8
1388 bgt,pt %ncc, .bc_aln_011_loop
1389 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1390 add %i1, %i0, %i1
1391
1392 std %d0, [%i1]
1393 std %d2, [%i1+8]
1394 std %d4, [%i1+16]
1395 std %d6, [%i1+24]
1396 std %d8, [%i1+32]
1397 ba .bc_remain_stuff
1398 add %i1, 40, %i1
1399 ! END OF aln_011
1400
1401 .bc_aln_010:
1402 ! Alignment off by 48 bytes
1403 ldd [%i0], %d0
1404 ldd [%i0+8], %d2
1405 ldd [%i0+16], %d4
1406 ldd [%i0+24], %d6
1407 ldd [%i0+32], %d8
1408 ldd [%i0+40], %d10
1409 add %i0, 48, %i0
1410 sub %i2, 48, %i2
1411 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
1412 and %i2, 0x7f, %i2 ! residue bytes in %i2
1413 sub %i1, %i0, %i1
1414 .bc_aln_010_loop:
1415 ldda [%i0]ASI_BLK_P,%d16 ! block load
1416 subcc %o3, 64, %o3
1417 fmovd %d16, %d12
1418 fmovd %d18, %d14
1419 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1420 stda %d0,[%i0+%i1]ASI_BLK_P
1421 add %i0, 64, %i0
1422 fmovd %d20, %d0
1423 fmovd %d22, %d2
1424 fmovd %d24, %d4
1425 fmovd %d26, %d6
1426 fmovd %d28, %d8
1427 fmovd %d30, %d10
1428 bgt,pt %ncc, .bc_aln_010_loop
1429 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1430 add %i1, %i0, %i1
1431
1432 std %d0, [%i1]
1433 std %d2, [%i1+8]
1434 std %d4, [%i1+16]
1435 std %d6, [%i1+24]
1436 std %d8, [%i1+32]
1437 std %d10, [%i1+40]
1438 ba .bc_remain_stuff
1439 add %i1, 48, %i1
1440 ! END OF aln_010
1441
1442 .bc_aln_001:
1443 ! Alignment off by 56 bytes
1444 ldd [%i0], %d0
1445 ldd [%i0+8], %d2
1446 ldd [%i0+16], %d4
1447 ldd [%i0+24], %d6
1448 ldd [%i0+32], %d8
1449 ldd [%i0+40], %d10
1450 ldd [%i0+48], %d12
1451 add %i0, 56, %i0
1452 sub %i2, 56, %i2
1453 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
1454 and %i2, 0x7f, %i2 ! residue bytes in %i2
1455 sub %i1, %i0, %i1
1456 .bc_aln_001_loop:
1457 ldda [%i0]ASI_BLK_P,%d16 ! block load
1458 subcc %o3, 64, %o3
1459 fmovd %d16, %d14
1460 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1461 stda %d0,[%i0+%i1]ASI_BLK_P
1462 add %i0, 64, %i0
1463 fmovd %d18, %d0
1464 fmovd %d20, %d2
1465 fmovd %d22, %d4
1466 fmovd %d24, %d6
1467 fmovd %d26, %d8
1468 fmovd %d28, %d10
1469 fmovd %d30, %d12
1470 bgt,pt %ncc, .bc_aln_001_loop
1471 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1472 add %i1, %i0, %i1
1473
1474 std %d0, [%i1]
1475 std %d2, [%i1+8]
1476 std %d4, [%i1+16]
1477 std %d6, [%i1+24]
1478 std %d8, [%i1+32]
1479 std %d10, [%i1+40]
1480 std %d12, [%i1+48]
1481 ba .bc_remain_stuff
1482 add %i1, 56, %i1
1483 ! END OF aln_001
1484
1485 .bc_aln_000:
1486 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1487 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
1488 and %i2, 0x7f, %i2 ! residue bytes in %i2
1489 sub %i1, %i0, %i1
1490 .bc_aln_000_loop:
1491 ldda [%i0]ASI_BLK_P,%d0
1492 subcc %o3, 64, %o3
1493 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1494 stda %d0,[%i0+%i1]ASI_BLK_P
1495 add %i0, 64, %i0
1496 bgt,pt %ncc, .bc_aln_000_loop
1497 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1498 add %i1, %i0, %i1
1499
1500 ! END OF aln_000
1501
1502 .bc_remain_stuff:
1503 subcc %i2, 31, %i2 ! adjust length to allow cc test
1504 ble,pt %ncc, .bc_aln_31
1505 nop
1506 .bc_aln_32:
1507 ldx [%i0], %o4 ! move 32 bytes
1508 subcc %i2, 32, %i2 ! decrement length count by 32
1509 stx %o4, [%i1]
1510 ldx [%i0+8], %o4
1511 stx %o4, [%i1+8]
1512 ldx [%i0+16], %o4
1513 add %i0, 32, %i0 ! increase src ptr by 32
1514 stx %o4, [%i1+16]
1515 ldx [%i0-8], %o4
1516 add %i1, 32, %i1 ! increase dst ptr by 32
1517 bgu,pt %ncc, .bc_aln_32 ! repeat if at least 32 bytes left
1518 stx %o4, [%i1-8]
1519 .bc_aln_31:
1520 addcc %i2, 24, %i2 ! adjust count to be off by 7
1521 ble,pt %ncc, .bc_aln_7 ! skip if 7 or fewer bytes left
1522 nop !
1523 .bc_aln_15:
1524 ldx [%i0], %o4 ! move 8 bytes
1525 add %i0, 8, %i0 ! increase src ptr by 8
1526 subcc %i2, 8, %i2 ! decrease count by 8
1527 add %i1, 8, %i1 ! increase dst ptr by 8
1528 bgu,pt %ncc, .bc_aln_15
1529 stx %o4, [%i1-8] !
1530 .bc_aln_7:
1531 addcc %i2, 7, %i2 ! finish adjustment of remaining count
1532 bz,pt %ncc, .bc_exit ! exit if finished
1533 cmp %i2, 4
1534 blt,pt %ncc, .bc_unaln3x ! skip if less than 4 bytes left
1535 nop !
1536 ld [%i0], %o4 ! move 4 bytes
1537 add %i0, 4, %i0 ! increase src ptr by 4
1538 add %i1, 4, %i1 ! increase dst ptr by 4
1539 subcc %i2, 4, %i2 ! decrease count by 4
1540 bnz .bc_unaln3x
1541 stw %o4, [%i1-4]
1542 ba .bc_exit
1543 nop
1544
1545 ! destination alignment code
1546 .bc_big_d1:
1547 ldub [%i0], %o4 ! move a byte
1548 add %i0, 1, %i0
1549 stb %o4, [%i1]
1550 add %i1, 1, %i1
1551 andcc %i1, 2, %o3
1552 bz,pt %ncc, .bc_big_d2f
1553 sub %i2, 1, %i2
1554 .bc_big_d2:
1555 ldub [%i0], %o4 ! move a half-word (src align unknown)
1556 ldub [%i0+1], %o3
1557 add %i0, 2, %i0
1558 sll %o4, 8, %o4 ! position
1559 or %o4, %o3, %o4 ! merge
1560 sth %o4, [%i1]
1561 add %i1, 2, %i1
1562 andcc %i1, 4, %o3
1563 bz,pt %ncc, .bc_big_d4f
1564 sub %i2, 2, %i2
1565 .bc_big_d4:
1566 ldub [%i0], %o4 ! move a word (src align unknown)
1567 ldub [%i0+1], %o3
1568 sll %o4, 24, %o4 ! position
1569 sll %o3, 16, %o3 ! position
1570 or %o4, %o3, %o3 ! merge
1571 ldub [%i0+2], %o4
1572 sll %o4, 8, %o4 ! position
1573 or %o4, %o3, %o3 ! merge
1574 ldub [%i0+3], %o4
1575 or %o4, %o3, %o4 ! merge
1576 stw %o4,[%i1] ! store four bytes
1577 add %i0, 4, %i0 ! adjust src by 4
1578 add %i1, 4, %i1 ! adjust dest by 4
1579 ba .bc_big_d4f
1580 sub %i2, 4, %i2 ! adjust count by 4
1581
1582
1583 ! Dst is on 8 byte boundary; src is not;
1584 .bc_big_unal8:
1585 andcc %i1, 0x3f, %o3 ! is dst 64-byte block aligned?
1586 bz %ncc, .bc_unalnsrc
1587 sub %o3, 64, %o3 ! %o3 will be multiple of 8
1588 neg %o3 ! bytes until dest is 64 byte aligned
1589 sub %i2, %o3, %i2 ! update cnt with bytes to be moved
1590 ! Move bytes according to source alignment
1591 andcc %i0, 0x1, %o4
1592 bnz %ncc, .bc_unalnbyte ! check for byte alignment
1593 nop
1594 andcc %i0, 2, %o4 ! check for half word alignment
1595 bnz %ncc, .bc_unalnhalf
1596 nop
1597 ! Src is word aligned, move bytes until dest 64 byte aligned
1598 .bc_unalnword:
1599 ld [%i0], %o4 ! load 4 bytes
1600 stw %o4, [%i1] ! and store 4 bytes
1601 ld [%i0+4], %o4 ! load 4 bytes
1602 add %i0, 8, %i0 ! increase src ptr by 8
1603 stw %o4, [%i1+4] ! and store 4 bytes
1604 subcc %o3, 8, %o3 ! decrease count by 8
1605 bnz %ncc, .bc_unalnword
1606 add %i1, 8, %i1 ! increase dst ptr by 8
1607 ba .bc_unalnsrc
1608 nop
1609
1610 ! Src is half-word aligned, move bytes until dest 64 byte aligned
1611 .bc_unalnhalf:
1612 lduh [%i0], %o4 ! load 2 bytes
1613 sllx %o4, 32, %i3 ! shift left
1614 lduw [%i0+2], %o4
1615 or %o4, %i3, %i3
1616 sllx %i3, 16, %i3
1617 lduh [%i0+6], %o4
1618 or %o4, %i3, %i3
1619 stx %i3, [%i1]
1620 add %i0, 8, %i0
1621 subcc %o3, 8, %o3
1622 bnz %ncc, .bc_unalnhalf
1623 add %i1, 8, %i1
1624 ba .bc_unalnsrc
1625 nop
1626
1627 ! Src is Byte aligned, move bytes until dest 64 byte aligned
1628 .bc_unalnbyte:
1629 sub %i1, %i0, %i1 ! share pointer advance
1630 .bc_unalnbyte_loop:
1631 ldub [%i0], %o4
1632 sllx %o4, 56, %i3
1633 lduh [%i0+1], %o4
1634 sllx %o4, 40, %o4
1635 or %o4, %i3, %i3
1636 lduh [%i0+3], %o4
1637 sllx %o4, 24, %o4
1638 or %o4, %i3, %i3
1639 lduh [%i0+5], %o4
1640 sllx %o4, 8, %o4
1641 or %o4, %i3, %i3
1642 ldub [%i0+7], %o4
1643 or %o4, %i3, %i3
1644 stx %i3, [%i1+%i0]
1645 subcc %o3, 8, %o3
1646 bnz %ncc, .bc_unalnbyte_loop
1647 add %i0, 8, %i0
1648 add %i1,%i0, %i1 ! restore pointer
1649
1650 ! Destination is now block (64 byte aligned), src is not 8 byte aligned
1651 .bc_unalnsrc:
1652 andn %i2, 0x3f, %i3 ! %i3 is multiple of block size
1653 and %i2, 0x3f, %i2 ! residue bytes in %i2
1654 add %i2, 64, %i2 ! Insure we don't load beyond
1655 sub %i3, 64, %i3 ! end of source buffer
1656
1657 andn %i0, 0x3f, %o4 ! %o4 has block aligned src address
1658 prefetch [%o4 + (3 * CACHE_LINE)], #one_read
1659 alignaddr %i0, %g0, %g0 ! generate %gsr
1660 add %i0, %i3, %i0 ! advance %i0 to after blocks
1661 !
1662 ! Determine source alignment to correct 8 byte offset
1663 andcc %i0, 0x20, %o3
1664 brnz,pn %o3, .bc_unaln_1
1665 andcc %i0, 0x10, %o3
1666 brnz,pn %o3, .bc_unaln_01
1667 andcc %i0, 0x08, %o3
1668 brz,a %o3, .bc_unaln_000
1669 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1670 ba .bc_unaln_001
1671 nop
1672 .bc_unaln_01:
1673 brnz,a %o3, .bc_unaln_011
1674 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1675 ba .bc_unaln_010
1676 nop
1677 .bc_unaln_1:
1678 brnz,pn %o3, .bc_unaln_11
1679 andcc %i0, 0x08, %o3
1680 brnz,a %o3, .bc_unaln_101
1681 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1682 ba .bc_unaln_100
1683 nop
1684 .bc_unaln_11:
1685 brz,pn %o3, .bc_unaln_110
1686 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1687
1688 .bc_unaln_111:
1689 ldd [%o4+56], %d14
1690 .bc_unaln_111_loop:
1691 add %o4, 64, %o4
1692 ldda [%o4]ASI_BLK_P, %d16
1693 faligndata %d14, %d16, %d48
1694 faligndata %d16, %d18, %d50
1695 faligndata %d18, %d20, %d52
1696 faligndata %d20, %d22, %d54
1697 faligndata %d22, %d24, %d56
1698 faligndata %d24, %d26, %d58
1699 faligndata %d26, %d28, %d60
1700 faligndata %d28, %d30, %d62
1701 fmovd %d30, %d14
1702 stda %d48, [%i1]ASI_BLK_P
1703 subcc %i3, 64, %i3
1704 add %i1, 64, %i1
1705 bgu,pt %ncc, .bc_unaln_111_loop
1706 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1707 ba .bc_unaln_done
1708 nop
1709
1710 .bc_unaln_110:
1711 ldd [%o4+48], %d12
1712 ldd [%o4+56], %d14
1713 .bc_unaln_110_loop:
1714 add %o4, 64, %o4
1715 ldda [%o4]ASI_BLK_P, %d16
1716 faligndata %d12, %d14, %d48
1717 faligndata %d14, %d16, %d50
1718 faligndata %d16, %d18, %d52
1719 faligndata %d18, %d20, %d54
1720 faligndata %d20, %d22, %d56
1721 faligndata %d22, %d24, %d58
1722 faligndata %d24, %d26, %d60
1723 faligndata %d26, %d28, %d62
1724 fmovd %d28, %d12
1725 fmovd %d30, %d14
1726 stda %d48, [%i1]ASI_BLK_P
1727 subcc %i3, 64, %i3
1728 add %i1, 64, %i1
1729 bgu,pt %ncc, .bc_unaln_110_loop
1730 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1731 ba .bc_unaln_done
1732 nop
1733
1734 .bc_unaln_101:
1735 ldd [%o4+40], %d10
1736 ldd [%o4+48], %d12
1737 ldd [%o4+56], %d14
1738 .bc_unaln_101_loop:
1739 add %o4, 64, %o4
1740 ldda [%o4]ASI_BLK_P, %d16
1741 faligndata %d10, %d12, %d48
1742 faligndata %d12, %d14, %d50
1743 faligndata %d14, %d16, %d52
1744 faligndata %d16, %d18, %d54
1745 faligndata %d18, %d20, %d56
1746 faligndata %d20, %d22, %d58
1747 faligndata %d22, %d24, %d60
1748 faligndata %d24, %d26, %d62
1749 fmovd %d26, %d10
1750 fmovd %d28, %d12
1751 fmovd %d30, %d14
1752 stda %d48, [%i1]ASI_BLK_P
1753 subcc %i3, 64, %i3
1754 add %i1, 64, %i1
1755 bgu,pt %ncc, .bc_unaln_101_loop
1756 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1757 ba .bc_unaln_done
1758 nop
1759
1760 .bc_unaln_100:
1761 ldd [%o4+32], %d8
1762 ldd [%o4+40], %d10
1763 ldd [%o4+48], %d12
1764 ldd [%o4+56], %d14
1765 .bc_unaln_100_loop:
1766 add %o4, 64, %o4
1767 ldda [%o4]ASI_BLK_P, %d16
1768 faligndata %d8, %d10, %d48
1769 faligndata %d10, %d12, %d50
1770 faligndata %d12, %d14, %d52
1771 faligndata %d14, %d16, %d54
1772 faligndata %d16, %d18, %d56
1773 faligndata %d18, %d20, %d58
1774 faligndata %d20, %d22, %d60
1775 faligndata %d22, %d24, %d62
1776 fmovd %d24, %d8
1777 fmovd %d26, %d10
1778 fmovd %d28, %d12
1779 fmovd %d30, %d14
1780 stda %d48, [%i1]ASI_BLK_P
1781 subcc %i3, 64, %i3
1782 add %i1, 64, %i1
1783 bgu,pt %ncc, .bc_unaln_100_loop
1784 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1785 ba .bc_unaln_done
1786 nop
1787
1788 .bc_unaln_011:
1789 ldd [%o4+24], %d6
1790 ldd [%o4+32], %d8
1791 ldd [%o4+40], %d10
1792 ldd [%o4+48], %d12
1793 ldd [%o4+56], %d14
1794 .bc_unaln_011_loop:
1795 add %o4, 64, %o4
1796 ldda [%o4]ASI_BLK_P, %d16
1797 faligndata %d6, %d8, %d48
1798 faligndata %d8, %d10, %d50
1799 faligndata %d10, %d12, %d52
1800 faligndata %d12, %d14, %d54
1801 faligndata %d14, %d16, %d56
1802 faligndata %d16, %d18, %d58
1803 faligndata %d18, %d20, %d60
1804 faligndata %d20, %d22, %d62
1805 fmovd %d22, %d6
1806 fmovd %d24, %d8
1807 fmovd %d26, %d10
1808 fmovd %d28, %d12
1809 fmovd %d30, %d14
1810 stda %d48, [%i1]ASI_BLK_P
1811 subcc %i3, 64, %i3
1812 add %i1, 64, %i1
1813 bgu,pt %ncc, .bc_unaln_011_loop
1814 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1815 ba .bc_unaln_done
1816 nop
1817
1818 .bc_unaln_010:
1819 ldd [%o4+16], %d4
1820 ldd [%o4+24], %d6
1821 ldd [%o4+32], %d8
1822 ldd [%o4+40], %d10
1823 ldd [%o4+48], %d12
1824 ldd [%o4+56], %d14
1825 .bc_unaln_010_loop:
1826 add %o4, 64, %o4
1827 ldda [%o4]ASI_BLK_P, %d16
1828 faligndata %d4, %d6, %d48
1829 faligndata %d6, %d8, %d50
1830 faligndata %d8, %d10, %d52
1831 faligndata %d10, %d12, %d54
1832 faligndata %d12, %d14, %d56
1833 faligndata %d14, %d16, %d58
1834 faligndata %d16, %d18, %d60
1835 faligndata %d18, %d20, %d62
1836 fmovd %d20, %d4
1837 fmovd %d22, %d6
1838 fmovd %d24, %d8
1839 fmovd %d26, %d10
1840 fmovd %d28, %d12
1841 fmovd %d30, %d14
1842 stda %d48, [%i1]ASI_BLK_P
1843 subcc %i3, 64, %i3
1844 add %i1, 64, %i1
1845 bgu,pt %ncc, .bc_unaln_010_loop
1846 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1847 ba .bc_unaln_done
1848 nop
1849
1850 .bc_unaln_001:
1851 ldd [%o4+8], %d2
1852 ldd [%o4+16], %d4
1853 ldd [%o4+24], %d6
1854 ldd [%o4+32], %d8
1855 ldd [%o4+40], %d10
1856 ldd [%o4+48], %d12
1857 ldd [%o4+56], %d14
1858 .bc_unaln_001_loop:
1859 add %o4, 64, %o4
1860 ldda [%o4]ASI_BLK_P, %d16
1861 faligndata %d2, %d4, %d48
1862 faligndata %d4, %d6, %d50
1863 faligndata %d6, %d8, %d52
1864 faligndata %d8, %d10, %d54
1865 faligndata %d10, %d12, %d56
1866 faligndata %d12, %d14, %d58
1867 faligndata %d14, %d16, %d60
1868 faligndata %d16, %d18, %d62
1869 fmovd %d18, %d2
1870 fmovd %d20, %d4
1871 fmovd %d22, %d6
1872 fmovd %d24, %d8
1873 fmovd %d26, %d10
1874 fmovd %d28, %d12
1875 fmovd %d30, %d14
1876 stda %d48, [%i1]ASI_BLK_P
1877 subcc %i3, 64, %i3
1878 add %i1, 64, %i1
1879 bgu,pt %ncc, .bc_unaln_001_loop
1880 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1881 ba .bc_unaln_done
1882 nop
1883
1884 .bc_unaln_000:
1885 ldda [%o4]ASI_BLK_P, %d0
1886 .bc_unaln_000_loop:
1887 add %o4, 64, %o4
1888 ldda [%o4]ASI_BLK_P, %d16
1889 faligndata %d0, %d2, %d48
1890 faligndata %d2, %d4, %d50
1891 faligndata %d4, %d6, %d52
1892 faligndata %d6, %d8, %d54
1893 faligndata %d8, %d10, %d56
1894 faligndata %d10, %d12, %d58
1895 faligndata %d12, %d14, %d60
1896 faligndata %d14, %d16, %d62
1897 fmovd %d16, %d0
1898 fmovd %d18, %d2
1899 fmovd %d20, %d4
1900 fmovd %d22, %d6
1901 fmovd %d24, %d8
1902 fmovd %d26, %d10
1903 fmovd %d28, %d12
1904 fmovd %d30, %d14
1905 stda %d48, [%i1]ASI_BLK_P
1906 subcc %i3, 64, %i3
1907 add %i1, 64, %i1
1908 bgu,pt %ncc, .bc_unaln_000_loop
1909 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1910
1911 .bc_unaln_done:
1912 ! Handle trailing bytes, 64 to 127
1913 ! Dest long word aligned, Src not long word aligned
1914 cmp %i2, 15
1915 bleu %ncc, .bc_unaln_short
1916
1917 andn %i2, 0x7, %i3 ! %i3 is multiple of 8
1918 and %i2, 0x7, %i2 ! residue bytes in %i2
1919 add %i2, 8, %i2
1920 sub %i3, 8, %i3 ! insure we don't load past end of src
1921 andn %i0, 0x7, %o4 ! %o4 has long word aligned src address
1922 add %i0, %i3, %i0 ! advance %i0 to after multiple of 8
1923 ldd [%o4], %d0 ! fetch partial word
1924 .bc_unaln_by8:
1925 ldd [%o4+8], %d2
1926 add %o4, 8, %o4
1927 faligndata %d0, %d2, %d16
1928 subcc %i3, 8, %i3
1929 std %d16, [%i1]
1930 fmovd %d2, %d0
1931 bgu,pt %ncc, .bc_unaln_by8
1932 add %i1, 8, %i1
1933
1934 .bc_unaln_short:
1935 cmp %i2, 8
1936 blt,pt %ncc, .bc_unalnfin
1937 nop
1938 ldub [%i0], %o4
1939 sll %o4, 24, %o3
1940 ldub [%i0+1], %o4
1941 sll %o4, 16, %o4
1942 or %o4, %o3, %o3
1943 ldub [%i0+2], %o4
1944 sll %o4, 8, %o4
1945 or %o4, %o3, %o3
1946 ldub [%i0+3], %o4
1947 or %o4, %o3, %o3
1948 stw %o3, [%i1]
1949 ldub [%i0+4], %o4
1950 sll %o4, 24, %o3
1951 ldub [%i0+5], %o4
1952 sll %o4, 16, %o4
1953 or %o4, %o3, %o3
1954 ldub [%i0+6], %o4
1955 sll %o4, 8, %o4
1956 or %o4, %o3, %o3
1957 ldub [%i0+7], %o4
1958 or %o4, %o3, %o3
1959 stw %o3, [%i1+4]
1960 add %i0, 8, %i0
1961 add %i1, 8, %i1
1962 sub %i2, 8, %i2
1963 .bc_unalnfin:
1964 cmp %i2, 4
1965 blt,pt %ncc, .bc_unalnz
1966 tst %i2
1967 ldub [%i0], %o3 ! read byte
1968 subcc %i2, 4, %i2 ! reduce count by 4
1969 sll %o3, 24, %o3 ! position
1970 ldub [%i0+1], %o4
1971 sll %o4, 16, %o4 ! position
1972 or %o4, %o3, %o3 ! merge
1973 ldub [%i0+2], %o4
1974 sll %o4, 8, %o4 ! position
1975 or %o4, %o3, %o3 ! merge
1976 add %i1, 4, %i1 ! advance dst by 4
1977 ldub [%i0+3], %o4
1978 add %i0, 4, %i0 ! advance src by 4
1979 or %o4, %o3, %o4 ! merge
1980 bnz,pt %ncc, .bc_unaln3x
1981 stw %o4, [%i1-4]
1982 ba .bc_exit
1983 nop
1984 .bc_unalnz:
1985 bz,pt %ncc, .bc_exit
1986 .bc_unaln3x: ! Exactly 1, 2, or 3 bytes remain
1987 subcc %i2, 1, %i2 ! reduce count for cc test
1988 ldub [%i0], %o4 ! load one byte
1989 bz,pt %ncc, .bc_exit
1990 stb %o4, [%i1] ! store one byte
1991 ldub [%i0+1], %o4 ! load second byte
1992 subcc %i2, 1, %i2
1993 bz,pt %ncc, .bc_exit
1994 stb %o4, [%i1+1] ! store second byte
1995 ldub [%i0+2], %o4 ! load third byte
1996 stb %o4, [%i1+2] ! store third byte
1997 .bc_exit:
1998 wr %l5, %g0, %gsr ! restore %gsr
1999 brnz %g5, .bc_fp_restore
2000 and %o5, COPY_FLAGS, %l1 ! save flags in %l1
2001 FZERO
2002 wr %g5, %g0, %fprs
2003 ba,pt %ncc, .bc_ex2
2004 nop
2005 .bc_fp_restore:
2006 BLD_FP_FROMSTACK(%o4)
2007 .bc_ex2:
2008 ldn [THREAD_REG + T_LWP], %o2
2009 brnz,pt %o2, 1f
2010 nop
2011
2012 ldsb [THREAD_REG + T_PREEMPT], %l0
2013 deccc %l0
2014 bnz,pn %ncc, 1f
2015 stb %l0, [THREAD_REG + T_PREEMPT]
2016
2017 ! Check for a kernel preemption request
2018 ldn [THREAD_REG + T_CPU], %l0
2019 ldub [%l0 + CPU_KPRUNRUN], %l0
2020 brnz,a,pt %l0, 1f ! Need to call kpreempt?
2021 or %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag
2022 1:
2023 btst LOFAULT_SET, %l1
2024 bz,pn %icc, 3f
2025 andncc %o5, COPY_FLAGS, %o5
2026 ! Here via bcopy. Check to see if the handler was NULL.
2027 ! If so, just return quietly. Otherwise, reset the
2028 ! handler and return.
2029 bz,pn %ncc, 2f
2030 nop
2031 membar #Sync
2032 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2033 2:
2034 btst KPREEMPT_FLAG, %l1
2035 bz,pt %icc, 3f
2036 nop
2037 call kpreempt
2038 rdpr %pil, %o0 ! pass %pil
2039 3:
2040 ret
2041 restore %g0, 0, %o0
2042
2043 SET_SIZE(bcopy_more)
2044
2045
2046 #else /* NIAGARA_IMPL */
2047 save %sp, -SA(MINFRAME), %sp
2048 clr %o5 ! flag LOFAULT_SET is not set for bcopy
2049 .do_copy:
2050 cmp %i2, 12 ! for small counts
2051 blu %ncc, .bytecp ! just copy bytes
2052 .empty
2053
2054 cmp %i2, 128 ! for less than 128 bytes
2055 blu,pn %ncc, .bcb_punt ! no block st/quad ld
2056 nop
2057
2058 set use_hw_bcopy, %o2
2059 ld [%o2], %o2
2060 brz,pn %o2, .bcb_punt
2061 nop
2062
2063 subcc %i1, %i0, %i3
2064 bneg,a,pn %ncc, 1f
2065 neg %i3
2066 1:
2067 /*
2068 * Compare against 256 since we should be checking block addresses
2069 * and (dest & ~63) - (src & ~63) can be 3 blocks even if
2070 * src = dest + (64 * 3) + 63.
2071 */
2072 cmp %i3, 256
2073 blu,pn %ncc, .bcb_punt
2074 nop
2075
2076 /*
2077 * Copy that reach here have at least 2 blocks of data to copy.
2078 */
2079 .do_blockcopy:
2080 ! Swap src/dst since the code below is memcpy code
2081 ! and memcpy/bcopy have different calling sequences
2082 mov %i1, %i5
2083 mov %i0, %i1
2084 mov %i5, %i0
2085
2086 ! Block (64 bytes) align the destination.
2087 andcc %i0, 0x3f, %i3 ! is dst aligned on a 64 bytes
2088 bz %xcc, .chksrc ! dst is already double aligned
2089 sub %i3, 0x40, %i3
2090 neg %i3 ! bytes till dst 64 bytes aligned
2091 sub %i2, %i3, %i2 ! update i2 with new count
2092
2093 ! Based on source and destination alignment do
2094 ! either 8 bytes, 4 bytes, 2 bytes or byte copy.
2095
2096 ! Is dst & src 8B aligned
2097 or %i0, %i1, %o2
2098 andcc %o2, 0x7, %g0
2099 bz %ncc, .alewdcp
2100 nop
2101
2102 ! Is dst & src 4B aligned
2103 andcc %o2, 0x3, %g0
2104 bz %ncc, .alwdcp
2105 nop
2106
2107 ! Is dst & src 2B aligned
2108 andcc %o2, 0x1, %g0
2109 bz %ncc, .alhlfwdcp
2110 nop
2111
2112 ! 1B aligned
2113 1: ldub [%i1], %o2
2114 stb %o2, [%i0]
2115 inc %i1
2116 deccc %i3
2117 bgu,pt %ncc, 1b
2118 inc %i0
2119
2120 ba .chksrc
2121 nop
2122
2123 ! dst & src 4B aligned
2124 .alwdcp:
2125 ld [%i1], %o2
2126 st %o2, [%i0]
2127 add %i1, 0x4, %i1
2128 subcc %i3, 0x4, %i3
2129 bgu,pt %ncc, .alwdcp
2130 add %i0, 0x4, %i0
2131
2132 ba .chksrc
2133 nop
2134
2135 ! dst & src 2B aligned
2136 .alhlfwdcp:
2137 lduh [%i1], %o2
2138 stuh %o2, [%i0]
2139 add %i1, 0x2, %i1
2140 subcc %i3, 0x2, %i3
2141 bgu,pt %ncc, .alhlfwdcp
2142 add %i0, 0x2, %i0
2143
2144 ba .chksrc
2145 nop
2146
2147 ! dst & src 8B aligned
2148 .alewdcp:
2149 ldx [%i1], %o2
2150 stx %o2, [%i0]
2151 add %i1, 0x8, %i1
2152 subcc %i3, 0x8, %i3
2153 bgu,pt %ncc, .alewdcp
2154 add %i0, 0x8, %i0
2155
2156 ! Now Destination is block (64 bytes) aligned
2157 .chksrc:
2158 andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size
2159 sub %i2, %i3, %i2 ! Residue bytes in %i2
2160
2161 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
2162
2163 andcc %i1, 0xf, %o2 ! is src quadword aligned
2164 bz,pn %xcc, .blkcpy ! src offset in %o2
2165 nop
2166 cmp %o2, 0x8
2167 bg .cpy_upper_double
2168 nop
2169 bl .cpy_lower_double
2170 nop
2171
2172 ! Falls through when source offset is equal to 8 i.e.
2173 ! source is double word aligned.
2174 ! In this case no shift/merge of data is required
2175 sub %i1, %o2, %i1 ! align the src at 16 bytes.
2176 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
2177 prefetch [%l0+0x0], #one_read
2178 ldda [%i1+0x0]%asi, %l2
2179 loop0:
2180 ldda [%i1+0x10]%asi, %l4
2181 prefetch [%l0+0x40], #one_read
2182
2183 stxa %l3, [%i0+0x0]%asi
2184 stxa %l4, [%i0+0x8]%asi
2185
2186 ldda [%i1+0x20]%asi, %l2
2187 stxa %l5, [%i0+0x10]%asi
2188 stxa %l2, [%i0+0x18]%asi
2189
2190 ldda [%i1+0x30]%asi, %l4
2191 stxa %l3, [%i0+0x20]%asi
2192 stxa %l4, [%i0+0x28]%asi
2193
2194 ldda [%i1+0x40]%asi, %l2
2195 stxa %l5, [%i0+0x30]%asi
2196 stxa %l2, [%i0+0x38]%asi
2197
2198 add %l0, 0x40, %l0
2199 add %i1, 0x40, %i1
2200 subcc %i3, 0x40, %i3
2201 bgu,pt %xcc, loop0
2202 add %i0, 0x40, %i0
2203 ba .blkdone
2204 add %i1, %o2, %i1 ! increment the source by src offset
2205 ! the src offset was stored in %o2
2206
2207 .cpy_lower_double:
2208 sub %i1, %o2, %i1 ! align the src at 16 bytes.
2209 sll %o2, 3, %o0 ! %o0 left shift
2210 mov 0x40, %o1
2211 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
2212 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
2213 prefetch [%l0+0x0], #one_read
2214 ldda [%i1+0x0]%asi, %l2 ! partial data in %l2 and %l3 has
2215 ! complete data
2216 loop1:
2217 ldda [%i1+0x10]%asi, %l4 ! %l4 has partial data for this read.
2218 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4
2219 ! into %l2 and %l3
2220 prefetch [%l0+0x40], #one_read
2221 stxa %l2, [%i0+0x0]%asi
2222 stxa %l3, [%i0+0x8]%asi
2223
2224 ldda [%i1+0x20]%asi, %l2
2225 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and
2226 stxa %l4, [%i0+0x10]%asi ! %l4 from previous read
2227 stxa %l5, [%i0+0x18]%asi ! into %l4 and %l5
2228
2229 ! Repeat the same for next 32 bytes.
2230
2231 ldda [%i1+0x30]%asi, %l4
2232 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
2233 stxa %l2, [%i0+0x20]%asi
2234 stxa %l3, [%i0+0x28]%asi
2235
2236 ldda [%i1+0x40]%asi, %l2
2237 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
2238 stxa %l4, [%i0+0x30]%asi
2239 stxa %l5, [%i0+0x38]%asi
2240
2241 add %l0, 0x40, %l0
2242 add %i1, 0x40, %i1
2243 subcc %i3, 0x40, %i3
2244 bgu,pt %xcc, loop1
2245 add %i0, 0x40, %i0
2246 ba .blkdone
2247 add %i1, %o2, %i1 ! increment the source by src offset
2248 ! the src offset was stored in %o2
2249
2250 .cpy_upper_double:
2251 sub %i1, %o2, %i1 ! align the src at 16 bytes.
2252 mov 0x8, %o0
2253 sub %o2, %o0, %o0
2254 sll %o0, 3, %o0 ! %o0 left shift
2255 mov 0x40, %o1
2256 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
2257 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
2258 prefetch [%l0+0x0], #one_read
2259 ldda [%i1+0x0]%asi, %l2 ! partial data in %l3 for this read and
2260 ! no data in %l2
2261 loop2:
2262 ldda [%i1+0x10]%asi, %l4 ! %l4 has complete data and %l5 has
2263 ! partial
2264 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5
2265 ! into %l3 and %l4
2266 prefetch [%l0+0x40], #one_read
2267 stxa %l3, [%i0+0x0]%asi
2268 stxa %l4, [%i0+0x8]%asi
2269
2270 ldda [%i1+0x20]%asi, %l2
2271 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with
2272 stxa %l5, [%i0+0x10]%asi ! %l5 from previous read
2273 stxa %l2, [%i0+0x18]%asi ! into %l5 and %l2
2274
2275 ! Repeat the same for next 32 bytes.
2276
2277 ldda [%i1+0x30]%asi, %l4
2278 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
2279 stxa %l3, [%i0+0x20]%asi
2280 stxa %l4, [%i0+0x28]%asi
2281
2282 ldda [%i1+0x40]%asi, %l2
2283 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
2284 stxa %l5, [%i0+0x30]%asi
2285 stxa %l2, [%i0+0x38]%asi
2286
2287 add %l0, 0x40, %l0
2288 add %i1, 0x40, %i1
2289 subcc %i3, 0x40, %i3
2290 bgu,pt %xcc, loop2
2291 add %i0, 0x40, %i0
2292 ba .blkdone
2293 add %i1, %o2, %i1 ! increment the source by src offset
2294 ! the src offset was stored in %o2
2295
2296
2297 ! Both Source and Destination are block aligned.
2298 ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
2299 .blkcpy:
2300 prefetch [%i1+0x0], #one_read
2301 1:
2302 ldda [%i1+0x0]%asi, %l0
2303 ldda [%i1+0x10]%asi, %l2
2304 prefetch [%i1+0x40], #one_read
2305
2306 stxa %l0, [%i0+0x0]%asi
2307 ldda [%i1+0x20]%asi, %l4
2308 ldda [%i1+0x30]%asi, %l6
2309
2310 stxa %l1, [%i0+0x8]%asi
2311 stxa %l2, [%i0+0x10]%asi
2312 stxa %l3, [%i0+0x18]%asi
2313 stxa %l4, [%i0+0x20]%asi
2314 stxa %l5, [%i0+0x28]%asi
2315 stxa %l6, [%i0+0x30]%asi
2316 stxa %l7, [%i0+0x38]%asi
2317
2318 add %i1, 0x40, %i1
2319 subcc %i3, 0x40, %i3
2320 bgu,pt %xcc, 1b
2321 add %i0, 0x40, %i0
2322
2323 .blkdone:
2324 membar #Sync
2325
2326 brz,pt %i2, .blkexit
2327 nop
2328
2329 ! Handle trailing bytes
2330 cmp %i2, 0x8
2331 blu,pt %ncc, .residue
2332 nop
2333
2334 ! Can we do some 8B ops
2335 or %i1, %i0, %o2
2336 andcc %o2, 0x7, %g0
2337 bnz %ncc, .last4
2338 nop
2339
2340 ! Do 8byte ops as long as possible
2341 .last8:
2342 ldx [%i1], %o2
2343 stx %o2, [%i0]
2344 add %i1, 0x8, %i1
2345 sub %i2, 0x8, %i2
2346 cmp %i2, 0x8
2347 bgu,pt %ncc, .last8
2348 add %i0, 0x8, %i0
2349
2350 brz,pt %i2, .blkexit
2351 nop
2352
2353 ba .residue
2354 nop
2355
2356 .last4:
2357 ! Can we do 4B ops
2358 andcc %o2, 0x3, %g0
2359 bnz %ncc, .last2
2360 nop
2361 1:
2362 ld [%i1], %o2
2363 st %o2, [%i0]
2364 add %i1, 0x4, %i1
2365 sub %i2, 0x4, %i2
2366 cmp %i2, 0x4
2367 bgu,pt %ncc, 1b
2368 add %i0, 0x4, %i0
2369
2370 brz,pt %i2, .blkexit
2371 nop
2372
2373 ba .residue
2374 nop
2375
2376 .last2:
2377 ! Can we do 2B ops
2378 andcc %o2, 0x1, %g0
2379 bnz %ncc, .residue
2380 nop
2381
2382 1:
2383 lduh [%i1], %o2
2384 stuh %o2, [%i0]
2385 add %i1, 0x2, %i1
2386 sub %i2, 0x2, %i2
2387 cmp %i2, 0x2
2388 bgu,pt %ncc, 1b
2389 add %i0, 0x2, %i0
2390
2391 brz,pt %i2, .blkexit
2392 nop
2393
2394 .residue:
2395 ldub [%i1], %o2
2396 stb %o2, [%i0]
2397 inc %i1
2398 deccc %i2
2399 bgu,pt %ncc, .residue
2400 inc %i0
2401
2402 .blkexit:
2403
2404 membar #Sync ! sync error barrier
2405 ! Restore t_lofault handler, if came here from kcopy().
2406 tst %o5
2407 bz %ncc, 1f
2408 andn %o5, LOFAULT_SET, %o5
2409 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2410 1:
2411 ret
2412 restore %g0, 0, %o0
2413
2414
2415 .bcb_punt:
2416 !
2417 ! use aligned transfers where possible
2418 !
2419 xor %i0, %i1, %o4 ! xor from and to address
2420 btst 7, %o4 ! if lower three bits zero
2421 bz .aldoubcp ! can align on double boundary
2422 .empty ! assembler complaints about label
2423
2424 xor %i0, %i1, %o4 ! xor from and to address
2425 btst 3, %o4 ! if lower two bits zero
2426 bz .alwordcp ! can align on word boundary
2427 btst 3, %i0 ! delay slot, from address unaligned?
2428 !
2429 ! use aligned reads and writes where possible
2430 ! this differs from wordcp in that it copes
2431 ! with odd alignment between source and destnation
2432 ! using word reads and writes with the proper shifts
2433 ! in between to align transfers to and from memory
2434 ! i0 - src address, i1 - dest address, i2 - count
2435 ! i3, i4 - tmps for used generating complete word
2436 ! i5 (word to write)
2437 ! l0 size in bits of upper part of source word (US)
2438 ! l1 size in bits of lower part of source word (LS = 32 - US)
2439 ! l2 size in bits of upper part of destination word (UD)
2440 ! l3 size in bits of lower part of destination word (LD = 32 - UD)
2441 ! l4 number of bytes leftover after aligned transfers complete
2442 ! l5 the number 32
2443 !
2444 mov 32, %l5 ! load an oft-needed constant
2445 bz .align_dst_only
2446 btst 3, %i1 ! is destnation address aligned?
2447 clr %i4 ! clear registers used in either case
2448 bz .align_src_only
2449 clr %l0
2450 !
2451 ! both source and destination addresses are unaligned
2452 !
2453 1: ! align source
2454 ldub [%i0], %i3 ! read a byte from source address
2455 add %i0, 1, %i0 ! increment source address
2456 or %i4, %i3, %i4 ! or in with previous bytes (if any)
2457 btst 3, %i0 ! is source aligned?
2458 add %l0, 8, %l0 ! increment size of upper source (US)
2459 bnz,a 1b
2460 sll %i4, 8, %i4 ! make room for next byte
2461
2462 sub %l5, %l0, %l1 ! generate shift left count (LS)
2463 sll %i4, %l1, %i4 ! prepare to get rest
2464 ld [%i0], %i3 ! read a word
2465 add %i0, 4, %i0 ! increment source address
2466 srl %i3, %l0, %i5 ! upper src bits into lower dst bits
2467 or %i4, %i5, %i5 ! merge
2468 mov 24, %l3 ! align destination
2469 1:
2470 srl %i5, %l3, %i4 ! prepare to write a single byte
2471 stb %i4, [%i1] ! write a byte
2472 add %i1, 1, %i1 ! increment destination address
2473 sub %i2, 1, %i2 ! decrement count
2474 btst 3, %i1 ! is destination aligned?
2475 bnz,a 1b
2476 sub %l3, 8, %l3 ! delay slot, decrement shift count (LD)
2477 sub %l5, %l3, %l2 ! generate shift left count (UD)
2478 sll %i5, %l2, %i5 ! move leftover into upper bytes
2479 cmp %l2, %l0 ! cmp # reqd to fill dst w old src left
2480 bgu %ncc, .more_needed ! need more to fill than we have
2481 nop
2482
2483 sll %i3, %l1, %i3 ! clear upper used byte(s)
2484 srl %i3, %l1, %i3
2485 ! get the odd bytes between alignments
2486 sub %l0, %l2, %l0 ! regenerate shift count
2487 sub %l5, %l0, %l1 ! generate new shift left count (LS)
2488 and %i2, 3, %l4 ! must do remaining bytes if count%4 > 0
2489 andn %i2, 3, %i2 ! # of aligned bytes that can be moved
2490 srl %i3, %l0, %i4
2491 or %i5, %i4, %i5
2492 st %i5, [%i1] ! write a word
2493 subcc %i2, 4, %i2 ! decrement count
2494 bz %ncc, .unalign_out
2495 add %i1, 4, %i1 ! increment destination address
2496
2497 b 2f
2498 sll %i3, %l1, %i5 ! get leftover into upper bits
2499 .more_needed:
2500 sll %i3, %l0, %i3 ! save remaining byte(s)
2501 srl %i3, %l0, %i3
2502 sub %l2, %l0, %l1 ! regenerate shift count
2503 sub %l5, %l1, %l0 ! generate new shift left count
2504 sll %i3, %l1, %i4 ! move to fill empty space
2505 b 3f
2506 or %i5, %i4, %i5 ! merge to complete word
2507 !
2508 ! the source address is aligned and destination is not
2509 !
2510 .align_dst_only:
2511 ld [%i0], %i4 ! read a word
2512 add %i0, 4, %i0 ! increment source address
2513 mov 24, %l0 ! initial shift alignment count
2514 1:
2515 srl %i4, %l0, %i3 ! prepare to write a single byte
2516 stb %i3, [%i1] ! write a byte
2517 add %i1, 1, %i1 ! increment destination address
2518 sub %i2, 1, %i2 ! decrement count
2519 btst 3, %i1 ! is destination aligned?
2520 bnz,a 1b
2521 sub %l0, 8, %l0 ! delay slot, decrement shift count
2522 .xfer:
2523 sub %l5, %l0, %l1 ! generate shift left count
2524 sll %i4, %l1, %i5 ! get leftover
2525 3:
2526 and %i2, 3, %l4 ! must do remaining bytes if count%4 > 0
2527 andn %i2, 3, %i2 ! # of aligned bytes that can be moved
2528 2:
2529 ld [%i0], %i3 ! read a source word
2530 add %i0, 4, %i0 ! increment source address
2531 srl %i3, %l0, %i4 ! upper src bits into lower dst bits
2532 or %i5, %i4, %i5 ! merge with upper dest bits (leftover)
2533 st %i5, [%i1] ! write a destination word
2534 subcc %i2, 4, %i2 ! decrement count
2535 bz %ncc, .unalign_out ! check if done
2536 add %i1, 4, %i1 ! increment destination address
2537 b 2b ! loop
2538 sll %i3, %l1, %i5 ! get leftover
2539 .unalign_out:
2540 tst %l4 ! any bytes leftover?
2541 bz %ncc, .cpdone
2542 .empty ! allow next instruction in delay slot
2543 1:
2544 sub %l0, 8, %l0 ! decrement shift
2545 srl %i3, %l0, %i4 ! upper src byte into lower dst byte
2546 stb %i4, [%i1] ! write a byte
2547 subcc %l4, 1, %l4 ! decrement count
2548 bz %ncc, .cpdone ! done?
2549 add %i1, 1, %i1 ! increment destination
2550 tst %l0 ! any more previously read bytes
2551 bnz %ncc, 1b ! we have leftover bytes
2552 mov %l4, %i2 ! delay slot, mv cnt where dbytecp wants
2553 b .dbytecp ! let dbytecp do the rest
2554 sub %i0, %i1, %i0 ! i0 gets the difference of src and dst
2555 !
2556 ! the destination address is aligned and the source is not
2557 !
2558 .align_src_only:
2559 ldub [%i0], %i3 ! read a byte from source address
2560 add %i0, 1, %i0 ! increment source address
2561 or %i4, %i3, %i4 ! or in with previous bytes (if any)
2562 btst 3, %i0 ! is source aligned?
2563 add %l0, 8, %l0 ! increment shift count (US)
2564 bnz,a .align_src_only
2565 sll %i4, 8, %i4 ! make room for next byte
2566 b,a .xfer
2567 !
2568 ! if from address unaligned for double-word moves,
2569 ! move bytes till it is, if count is < 56 it could take
2570 ! longer to align the thing than to do the transfer
2571 ! in word size chunks right away
2572 !
2573 .aldoubcp:
2574 cmp %i2, 56 ! if count < 56, use wordcp, it takes
2575 blu,a %ncc, .alwordcp ! longer to align doubles than words
2576 mov 3, %o0 ! mask for word alignment
2577 call .alignit ! copy bytes until aligned
2578 mov 7, %o0 ! mask for double alignment
2579 !
2580 ! source and destination are now double-word aligned
2581 ! i3 has aligned count returned by alignit
2582 !
2583 and %i2, 7, %i2 ! unaligned leftover count
2584 sub %i0, %i1, %i0 ! i0 gets the difference of src and dst
2585 5:
2586 ldx [%i0+%i1], %o4 ! read from address
2587 stx %o4, [%i1] ! write at destination address
2588 subcc %i3, 8, %i3 ! dec count
2589 bgu %ncc, 5b
2590 add %i1, 8, %i1 ! delay slot, inc to address
2591 cmp %i2, 4 ! see if we can copy a word
2592 blu %ncc, .dbytecp ! if 3 or less bytes use bytecp
2593 .empty
2594 !
2595 ! for leftover bytes we fall into wordcp, if needed
2596 !
2597 .wordcp:
2598 and %i2, 3, %i2 ! unaligned leftover count
2599 5:
2600 ld [%i0+%i1], %o4 ! read from address
2601 st %o4, [%i1] ! write at destination address
2602 subcc %i3, 4, %i3 ! dec count
2603 bgu %ncc, 5b
2604 add %i1, 4, %i1 ! delay slot, inc to address
2605 b,a .dbytecp
2606
2607 ! we come here to align copies on word boundaries
2608 .alwordcp:
2609 call .alignit ! go word-align it
2610 mov 3, %o0 ! bits that must be zero to be aligned
2611 b .wordcp
2612 sub %i0, %i1, %i0 ! i0 gets the difference of src and dst
2613
2614 !
2615 ! byte copy, works with any alignment
2616 !
2617 .bytecp:
2618 b .dbytecp
2619 sub %i0, %i1, %i0 ! i0 gets difference of src and dst
2620
2621 !
2622 ! differenced byte copy, works with any alignment
2623 ! assumes dest in %i1 and (source - dest) in %i0
2624 !
2625 1:
2626 stb %o4, [%i1] ! write to address
2627 inc %i1 ! inc to address
2628 .dbytecp:
2629 deccc %i2 ! dec count
2630 bgeu,a %ncc, 1b ! loop till done
2631 ldub [%i0+%i1], %o4 ! read from address
2632 .cpdone:
2633
2634 membar #Sync ! sync error barrier
2635 ! Restore t_lofault handler, if came here from kcopy().
2636 tst %o5
2637 bz %ncc, 1f
2638 andn %o5, LOFAULT_SET, %o5
2639 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2640 1:
2641 ret
2642 restore %g0, 0, %o0 ! return (0)
2643
2644 /*
2645 * Common code used to align transfers on word and doubleword
2646 * boundaries. Aligns source and destination and returns a count
2647 * of aligned bytes to transfer in %i3
2648 */
2649 1:
2650 inc %i0 ! inc from
2651 stb %o4, [%i1] ! write a byte
2652 inc %i1 ! inc to
2653 dec %i2 ! dec count
2654 .alignit:
2655 btst %o0, %i0 ! %o0 is bit mask to check for alignment
2656 bnz,a 1b
2657 ldub [%i0], %o4 ! read next byte
2658
2659 retl
2660 andn %i2, %o0, %i3 ! return size of aligned bytes
2661
2662 SET_SIZE(bcopy)
2663
2664 #endif /* NIAGARA_IMPL */
2665
2666 /*
2667 * Block copy with possibly overlapped operands.
2668 */
2669
2670 ENTRY(ovbcopy)
2671 tst %o2 ! check count
2672 bgu,a %ncc, 1f ! nothing to do or bad arguments
2673 subcc %o0, %o1, %o3 ! difference of from and to address
2674
2675 retl ! return
2676 nop
2677 1:
2678 bneg,a %ncc, 2f
2679 neg %o3 ! if < 0, make it positive
2680 2: cmp %o2, %o3 ! cmp size and abs(from - to)
2681 bleu %ncc, bcopy ! if size <= abs(diff): use bcopy,
2682 .empty ! no overlap
2683 cmp %o0, %o1 ! compare from and to addresses
2684 blu %ncc, .ov_bkwd ! if from < to, copy backwards
2685 nop
2686 !
2687 ! Copy forwards.
2688 !
2689 .ov_fwd:
2690 ldub [%o0], %o3 ! read from address
2691 inc %o0 ! inc from address
2692 stb %o3, [%o1] ! write to address
2693 deccc %o2 ! dec count
2694 bgu %ncc, .ov_fwd ! loop till done
2695 inc %o1 ! inc to address
2696
2697 retl ! return
2698 nop
2699 !
2700 ! Copy backwards.
2701 !
2702 .ov_bkwd:
2703 deccc %o2 ! dec count
2704 ldub [%o0 + %o2], %o3 ! get byte at end of src
2705 bgu %ncc, .ov_bkwd ! loop till done
2706 stb %o3, [%o1 + %o2] ! delay slot, store at end of dst
2707
2708 retl ! return
2709 nop
2710 SET_SIZE(ovbcopy)
2711
2712 /*
2713 * hwblkpagecopy()
2714 *
2715 * Copies exactly one page. This routine assumes the caller (ppcopy)
2716 * has already disabled kernel preemption and has checked
2717 * use_hw_bcopy.
2718 */
2719 ENTRY(hwblkpagecopy)
2720 save %sp, -SA(MINFRAME), %sp
2721
2722 ! %i0 - source address (arg)
2723 ! %i1 - destination address (arg)
2724 ! %i2 - length of region (not arg)
2725
2726 set PAGESIZE, %i2
2727
2728 /*
2729 * Copying exactly one page and PAGESIZE is in mutliple of 0x80.
2730 */
2731 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
2732 prefetch [%i0+0x0], #one_read
2733 prefetch [%i0+0x40], #one_read
2734 1:
2735 prefetch [%i0+0x80], #one_read
2736 prefetch [%i0+0xc0], #one_read
2737 ldda [%i0+0x0]%asi, %l0
2738 ldda [%i0+0x10]%asi, %l2
2739 ldda [%i0+0x20]%asi, %l4
2740 ldda [%i0+0x30]%asi, %l6
2741 stxa %l0, [%i1+0x0]%asi
2742 stxa %l1, [%i1+0x8]%asi
2743 stxa %l2, [%i1+0x10]%asi
2744 stxa %l3, [%i1+0x18]%asi
2745 stxa %l4, [%i1+0x20]%asi
2746 stxa %l5, [%i1+0x28]%asi
2747 stxa %l6, [%i1+0x30]%asi
2748 stxa %l7, [%i1+0x38]%asi
2749 ldda [%i0+0x40]%asi, %l0
2750 ldda [%i0+0x50]%asi, %l2
2751 ldda [%i0+0x60]%asi, %l4
2752 ldda [%i0+0x70]%asi, %l6
2753 stxa %l0, [%i1+0x40]%asi
2754 stxa %l1, [%i1+0x48]%asi
2755 stxa %l2, [%i1+0x50]%asi
2756 stxa %l3, [%i1+0x58]%asi
2757 stxa %l4, [%i1+0x60]%asi
2758 stxa %l5, [%i1+0x68]%asi
2759 stxa %l6, [%i1+0x70]%asi
2760 stxa %l7, [%i1+0x78]%asi
2761
2762 add %i0, 0x80, %i0
2763 subcc %i2, 0x80, %i2
2764 bgu,pt %xcc, 1b
2765 add %i1, 0x80, %i1
2766
2767 membar #Sync
2768 ret
2769 restore %g0, 0, %o0
2770 SET_SIZE(hwblkpagecopy)
2771
2772
2773 /*
2774 * Transfer data to and from user space -
2775 * Note that these routines can cause faults
2776 * It is assumed that the kernel has nothing at
2777 * less than KERNELBASE in the virtual address space.
2778 *
2779 * Note that copyin(9F) and copyout(9F) are part of the
2780 * DDI/DKI which specifies that they return '-1' on "errors."
2781 *
2782 * Sigh.
2783 *
2784 * So there's two extremely similar routines - xcopyin() and xcopyout()
2785 * which return the errno that we've faithfully computed. This
2786 * allows other callers (e.g. uiomove(9F)) to work correctly.
2787 * Given that these are used pretty heavily, we expand the calling
2788 * sequences inline for all flavours (rather than making wrappers).
2789 *
2790 * There are also stub routines for xcopyout_little and xcopyin_little,
2791 * which currently are intended to handle requests of <= 16 bytes from
2792 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
2793 * is left as an exercise...
2794 */
2795
2796 /*
2797 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
2798 *
2799 * General theory of operation:
2800 *
2801 * None of the copyops routines grab a window until it's decided that
2802 * we need to do a HW block copy operation. This saves a window
2803 * spill/fill when we're called during socket ops. The typical IO
2804 * path won't cause spill/fill traps.
2805 *
2806 * This code uses a set of 4 limits for the maximum size that will
2807 * be copied given a particular input/output address alignment.
2808 * the default limits are:
2809 *
2810 * single byte aligned - 256 (hw_copy_limit_1)
2811 * two byte aligned - 512 (hw_copy_limit_2)
2812 * four byte aligned - 1024 (hw_copy_limit_4)
2813 * eight byte aligned - 1024 (hw_copy_limit_8)
2814 *
2815 * If the value for a particular limit is zero, the copy will be done
2816 * via the copy loops rather than block store/quad load instructions.
2817 *
2818 * Flow:
2819 *
2820 * If count == zero return zero.
2821 *
2822 * Store the previous lo_fault handler into %g6.
2823 * Place our secondary lofault handler into %g5.
2824 * Place the address of our nowindow fault handler into %o3.
2825 * Place the address of the windowed fault handler into %o4.
2826 * --> We'll use this handler if we end up grabbing a window
2827 * --> before we use block initializing store and quad load ASIs
2828 *
2829 * If count is less than or equal to SMALL_LIMIT (7) we
2830 * always do a byte for byte copy.
2831 *
2832 * If count is > SMALL_LIMIT, we check the alignment of the input
2833 * and output pointers. Based on the alignment we check count
2834 * against a limit based on detected alignment. If we exceed the
2835 * alignment value we copy via block initializing store and quad
2836 * load instructions.
2837 *
2838 * If we don't exceed one of the limits, we store -count in %o3,
2839 * we store the number of chunks (8, 4, 2 or 1 byte) operated
2840 * on in our basic copy loop in %o2. Following this we branch
2841 * to the appropriate copy loop and copy that many chunks.
2842 * Since we've been adding the chunk size to %o3 each time through
2843 * as well as decrementing %o2, we can tell if any data is
2844 * is left to be copied by examining %o3. If that is zero, we're
2845 * done and can go home. If not, we figure out what the largest
2846 * chunk size left to be copied is and branch to that copy loop
2847 * unless there's only one byte left. We load that as we're
2848 * branching to code that stores it just before we return.
2849 *
2850 * Fault handlers are invoked if we reference memory that has no
2851 * current mapping. All forms share the same copyio_fault handler.
2852 * This routine handles fixing up the stack and general housecleaning.
2853 * Each copy operation has a simple fault handler that is then called
2854 * to do the work specific to the invidual operation. The handler
2855 * for copyOP and xcopyOP are found at the end of individual function.
2856 * The handlers for xcopyOP_little are found at the end of xcopyin_little.
2857 * The handlers for copyOP_noerr are found at the end of copyin_noerr.
2858 */
2859
2860 /*
2861 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
2862 */
2863
2864 /*
2865 * We save the arguments in the following registers in case of a fault:
2866 * kaddr - %g2
2867 * uaddr - %g3
2868 * count - %g4
2869 */
2870 #define SAVE_SRC %g2
2871 #define SAVE_DST %g3
2872 #define SAVE_COUNT %g4
2873
2874 #define REAL_LOFAULT %g5
2875 #define SAVED_LOFAULT %g6
2876
2877 /*
2878 * Generic copyio fault handler. This is the first line of defense when a
2879 * fault occurs in (x)copyin/(x)copyout. In order for this to function
2880 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
2881 * This allows us to share common code for all the flavors of the copy
2882 * operations, including the _noerr versions.
2883 *
2884 * Note that this function will restore the original input parameters before
2885 * calling REAL_LOFAULT. So the real handler can vector to the appropriate
2886 * member of the t_copyop structure, if needed.
2887 */
2888 ENTRY(copyio_fault)
2889 #if !defined(NIAGARA_IMPL)
2890 btst FPUSED_FLAG, SAVED_LOFAULT
2891 bz 1f
2892 andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
2893
2894 wr %l5, 0, %gsr ! restore gsr
2895
2896 btst FPRS_FEF, %g1
2897 bz %icc, 4f
2898 nop
2899
2900 ! restore fpregs from stack
2901 BLD_FP_FROMSTACK(%o2)
2902
2903 ba,pt %ncc, 1f
2904 nop
2905 4:
2906 FZERO ! zero all of the fpregs
2907 wr %g1, %g0, %fprs ! restore fprs
2908 1:
2909 restore
2910 mov SAVE_SRC, %o0
2911 mov SAVE_DST, %o1
2912 jmp REAL_LOFAULT
2913 mov SAVE_COUNT, %o2
2914
2915 #else /* NIAGARA_IMPL */
2916 membar #Sync
2917 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2918 restore
2919 mov SAVE_SRC, %o0
2920 mov SAVE_DST, %o1
2921 jmp REAL_LOFAULT
2922 mov SAVE_COUNT, %o2
2923
2924 #endif /* NIAGARA_IMPL */
2925
2926 SET_SIZE(copyio_fault)
2927
2928 ENTRY(copyio_fault_nowindow)
2929 membar #Sync
2930 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2931
2932 mov SAVE_SRC, %o0
2933 mov SAVE_DST, %o1
2934 jmp REAL_LOFAULT
2935 mov SAVE_COUNT, %o2
2936 SET_SIZE(copyio_fault_nowindow)
2937
2938 ENTRY(copyout)
2939 sethi %hi(.copyout_err), REAL_LOFAULT
2940 or REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT
2941
2942 #if !defined(NIAGARA_IMPL)
2943 .do_copyout:
2944 tst %o2 ! check for zero count; quick exit
2945 bz,pt %ncc, .co_smallqx
2946 mov %o0, SAVE_SRC
2947 mov %o1, SAVE_DST
2948 mov %o2, SAVE_COUNT
2949 cmp %o2, FP_COPY ! check for small copy/leaf case
2950 bgt,pt %ncc, .co_copy_more
2951 ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
2952 /*
2953 * Small copy out code
2954 *
2955 */
2956 sethi %hi(copyio_fault_nowindow), %o3
2957 or %o3, %lo(copyio_fault_nowindow), %o3
2958 membar #Sync
2959 stn %o3, [THREAD_REG + T_LOFAULT]
2960
2961 mov ASI_USER, %asi
2962 cmp %o2, SHORTCOPY ! make sure there is enough to align
2963 ble,pt %ncc, .co_smallest
2964 andcc %o1, 0x7, %o3 ! is dest long word aligned
2965 bnz,pn %ncc, .co_align
2966 andcc %o1, 1, %o3 ! is dest byte aligned
2967
2968 ! Destination is long word aligned
2969 ! 8 cases for src alignment; load parts, store long words
2970 .co_al_src:
2971 andcc %o0, 7, %o3
2972 brnz,pt %o3, .co_src_dst_unal8
2973 nop
2974 /*
2975 * Special case for handling when src and dest are both long word aligned
2976 * and total data to move is less than FP_COPY bytes
2977 * Also handles finish up for large block moves, so may be less than 32 bytes
2978 */
2979 .co_medlong:
2980 subcc %o2, 31, %o2 ! adjust length to allow cc test
2981 ble,pt %ncc, .co_medl31
2982 nop
2983 .co_medl32:
2984 ldx [%o0], %o4 ! move 32 bytes
2985 subcc %o2, 32, %o2 ! decrement length count by 32
2986 stxa %o4, [%o1]%asi
2987 ldx [%o0+8], %o4
2988 stxa %o4, [%o1+8]%asi
2989 ldx [%o0+16], %o4
2990 add %o0, 32, %o0 ! increase src ptr by 32
2991 stxa %o4, [%o1+16]%asi
2992 ldx [%o0-8], %o4
2993 add %o1, 32, %o1 ! increase dst ptr by 32
2994 bgu,pt %ncc, .co_medl32 ! repeat if at least 32 bytes left
2995 stxa %o4, [%o1-8]%asi
2996 .co_medl31:
2997 addcc %o2, 24, %o2 ! adjust count to be off by 7
2998 ble,pt %ncc, .co_medl7 ! skip if 7 or fewer bytes left
2999 nop
3000 .co_medl8:
3001 ldx [%o0], %o4 ! move 8 bytes
3002 add %o0, 8, %o0 ! increase src ptr by 8
3003 subcc %o2, 8, %o2 ! decrease count by 8
3004 add %o1, 8, %o1 ! increase dst ptr by 8
3005 bgu,pt %ncc, .co_medl8
3006 stxa %o4, [%o1-8]%asi
3007 .co_medl7:
3008 addcc %o2, 7, %o2 ! finish adjustment of remaining count
3009 bnz,pt %ncc, .co_small4 ! do final bytes if not finished
3010
3011 .co_smallx: ! finish up and exit
3012 membar #Sync
3013 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3014 .co_smallqx:
3015 retl
3016 mov %g0, %o0
3017
3018 .co_small4:
3019 cmp %o2, 4
3020 blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left
3021 nop !
3022 ld [%o0], %o4 ! move 4 bytes
3023 add %o0, 4, %o0 ! increase src ptr by 4
3024 add %o1, 4, %o1 ! increase dst ptr by 4
3025 subcc %o2, 4, %o2 ! decrease count by 4
3026 bz,pt %ncc, .co_smallx
3027 stwa %o4, [%o1-4]%asi
3028
3029 .co_small3x: ! Exactly 1, 2, or 3 bytes remain
3030 subcc %o2, 1, %o2 ! reduce count for cc test
3031 ldub [%o0], %o4 ! load one byte
3032 bz,pt %ncc, .co_smallx
3033 stba %o4, [%o1]%asi ! store one byte
3034 ldub [%o0+1], %o4 ! load second byte
3035 subcc %o2, 1, %o2
3036 bz,pt %ncc, .co_smallx
3037 stba %o4, [%o1+1]%asi ! store second byte
3038 ldub [%o0+2], %o4 ! load third byte
3039 ba .co_smallx
3040 stba %o4, [%o1+2]%asi ! store third byte
3041
3042 .co_smallest: ! 7 or fewer bytes remain
3043 cmp %o2, 4
3044 blt,pt %ncc, .co_small3x
3045 nop
3046 ldub [%o0], %o4 ! read byte
3047 subcc %o2, 4, %o2 ! reduce count by 4
3048 stba %o4, [%o1]%asi ! write byte
3049 ldub [%o0+1], %o4 ! repeat for total of 4 bytes
3050 add %o0, 4, %o0 ! advance src by 4
3051 stba %o4, [%o1+1]%asi
3052 ldub [%o0-2], %o4
3053 add %o1, 4, %o1 ! advance dst by 4
3054 stba %o4, [%o1-2]%asi
3055 ldub [%o0-1], %o4
3056 bnz,pt %ncc, .co_small3x
3057 stba %o4, [%o1-1]%asi
3058 membar #Sync
3059 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3060 retl
3061 mov %g0, %o0
3062
3063 .co_align: ! byte align test in prior branch delay
3064 bnz,pt %ncc, .co_al_d1
3065 .co_al_d1f: ! dest is now half word aligned
3066 andcc %o1, 2, %o3
3067 bnz,pt %ncc, .co_al_d2
3068 .co_al_d2f: ! dest is now word aligned
3069 andcc %o1, 4, %o3 ! is dest longword aligned?
3070 bz,pt %ncc, .co_al_src
3071 nop
3072 .co_al_d4: ! dest is word aligned; src is unknown
3073 ldub [%o0], %o4 ! move a word (src align unknown)
3074 ldub [%o0+1], %o3
3075 sll %o4, 24, %o4 ! position
3076 sll %o3, 16, %o3 ! position
3077 or %o4, %o3, %o3 ! merge
3078 ldub [%o0+2], %o4
3079 sll %o4, 8, %o4 ! position
3080 or %o4, %o3, %o3 ! merge
3081 ldub [%o0+3], %o4
3082 or %o4, %o3, %o4 ! merge
3083 stwa %o4,[%o1]%asi ! store four bytes
3084 add %o0, 4, %o0 ! adjust src by 4
3085 add %o1, 4, %o1 ! adjust dest by 4
3086 sub %o2, 4, %o2 ! adjust count by 4
3087 andcc %o0, 7, %o3 ! check for src long word alignment
3088 brz,pt %o3, .co_medlong
3089 .co_src_dst_unal8:
3090 ! dst is 8-byte aligned, src is not
3091 ! Size is less than FP_COPY
3092 ! Following code is to select for alignment
3093 andcc %o0, 0x3, %o3 ! test word alignment
3094 bz,pt %ncc, .co_medword
3095 nop
3096 andcc %o0, 0x1, %o3 ! test halfword alignment
3097 bnz,pt %ncc, .co_med_byte ! go to byte move if not halfword
3098 andcc %o0, 0x2, %o3 ! test which byte alignment
3099 ba .co_medhalf
3100 nop
3101 .co_al_d1: ! align dest to half word
3102 ldub [%o0], %o4 ! move a byte
3103 add %o0, 1, %o0
3104 stba %o4, [%o1]%asi
3105 add %o1, 1, %o1
3106 andcc %o1, 2, %o3
3107 bz,pt %ncc, .co_al_d2f
3108 sub %o2, 1, %o2
3109 .co_al_d2: ! align dest to word
3110 ldub [%o0], %o4 ! move a half-word (src align unknown)
3111 ldub [%o0+1], %o3
3112 sll %o4, 8, %o4 ! position
3113 or %o4, %o3, %o4 ! merge
3114 stha %o4, [%o1]%asi
3115 add %o0, 2, %o0
3116 add %o1, 2, %o1
3117 andcc %o1, 4, %o3 ! is dest longword aligned?
3118 bz,pt %ncc, .co_al_src
3119 sub %o2, 2, %o2
3120 ba .co_al_d4
3121 nop
3122 /*
3123 * Handle all cases where src and dest are aligned on word
3124 * boundaries. Use unrolled loops for better performance.
3125 * This option wins over standard large data move when
3126 * source and destination is in cache for medium
3127 * to short data moves.
3128 */
3129 .co_medword:
3130 subcc %o2, 31, %o2 ! adjust length to allow cc test
3131 ble,pt %ncc, .co_medw31
3132 nop
3133 .co_medw32:
3134 ld [%o0], %o4 ! move a block of 32 bytes
3135 stwa %o4, [%o1]%asi
3136 ld [%o0+4], %o4
3137 stwa %o4, [%o1+4]%asi
3138 ld [%o0+8], %o4
3139 stwa %o4, [%o1+8]%asi
3140 ld [%o0+12], %o4
3141 stwa %o4, [%o1+12]%asi
3142 ld [%o0+16], %o4
3143 stwa %o4, [%o1+16]%asi
3144 ld [%o0+20], %o4
3145 subcc %o2, 32, %o2 ! decrement length count
3146 stwa %o4, [%o1+20]%asi
3147 ld [%o0+24], %o4
3148 add %o0, 32, %o0 ! increase src ptr by 32
3149 stwa %o4, [%o1+24]%asi
3150 ld [%o0-4], %o4
3151 add %o1, 32, %o1 ! increase dst ptr by 32
3152 bgu,pt %ncc, .co_medw32 ! repeat if at least 32 bytes left
3153 stwa %o4, [%o1-4]%asi
3154 .co_medw31:
3155 addcc %o2, 24, %o2 ! adjust count to be off by 7
3156 ble,pt %ncc, .co_medw7 ! skip if 7 or fewer bytes left
3157 nop !
3158 .co_medw15:
3159 ld [%o0], %o4 ! move a block of 8 bytes
3160 subcc %o2, 8, %o2 ! decrement length count
3161 stwa %o4, [%o1]%asi
3162 add %o0, 8, %o0 ! increase src ptr by 8
3163 ld [%o0-4], %o4
3164 add %o1, 8, %o1 ! increase dst ptr by 8
3165 bgu,pt %ncc, .co_medw15
3166 stwa %o4, [%o1-4]%asi
3167 .co_medw7:
3168 addcc %o2, 7, %o2 ! finish adjustment of remaining count
3169 bz,pt %ncc, .co_smallx ! exit if finished
3170 cmp %o2, 4
3171 blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left
3172 nop !
3173 ld [%o0], %o4 ! move 4 bytes
3174 add %o0, 4, %o0 ! increase src ptr by 4
3175 add %o1, 4, %o1 ! increase dst ptr by 4
3176 subcc %o2, 4, %o2 ! decrease count by 4
3177 bnz .co_small3x
3178 stwa %o4, [%o1-4]%asi
3179 membar #Sync
3180 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3181 retl
3182 mov %g0, %o0
3183
3184 .co_medhalf:
3185 subcc %o2, 31, %o2 ! adjust length to allow cc test
3186 ble,pt %ncc, .co_medh31
3187 nop
3188 .co_medh32: ! load and store block of 32 bytes
3189
3190 lduh [%o0], %o4 ! move 32 bytes
3191 subcc %o2, 32, %o2 ! decrement length count
3192 lduw [%o0+2], %o3
3193 sllx %o4, 48, %o4
3194 sllx %o3, 16, %o3
3195 or %o4, %o3, %o3
3196 lduh [%o0+6], %o4
3197 or %o4, %o3, %o4
3198 stxa %o4, [%o1]%asi
3199
3200 lduh [%o0+8], %o4
3201 lduw [%o0+10], %o3
3202 sllx %o4, 48, %o4
3203 sllx %o3, 16, %o3
3204 or %o4, %o3, %o3
3205 lduh [%o0+14], %o4
3206 or %o4, %o3, %o4
3207 stxa %o4, [%o1+8]%asi
3208
3209 lduh [%o0+16], %o4
3210 lduw [%o0+18], %o3
3211 sllx %o4, 48, %o4
3212 sllx %o3, 16, %o3
3213 or %o4, %o3, %o3
3214 lduh [%o0+22], %o4
3215 or %o4, %o3, %o4
3216 stxa %o4, [%o1+16]%asi
3217
3218 add %o0, 32, %o0 ! increase src ptr by 32
3219 add %o1, 32, %o1 ! increase dst ptr by 32
3220
3221 lduh [%o0-8], %o4
3222 lduw [%o0-6], %o3
3223 sllx %o4, 48, %o4
3224 sllx %o3, 16, %o3
3225 or %o4, %o3, %o3
3226 lduh [%o0-2], %o4
3227 or %o3, %o4, %o4
3228 bgu,pt %ncc, .co_medh32 ! repeat if at least 32 bytes left
3229 stxa %o4, [%o1-8]%asi
3230
3231 .co_medh31:
3232 addcc %o2, 24, %o2 ! adjust count to be off by 7
3233 ble,pt %ncc, .co_medh7 ! skip if 7 or fewer bytes left
3234 nop !
3235 .co_medh15:
3236 lduh [%o0], %o4 ! move 16 bytes
3237 subcc %o2, 8, %o2 ! decrement length count
3238 lduw [%o0+2], %o3
3239 sllx %o4, 48, %o4
3240 sllx %o3, 16, %o3
3241 or %o4, %o3, %o3
3242 add %o1, 8, %o1 ! increase dst ptr by 8
3243 lduh [%o0+6], %o4
3244 add %o0, 8, %o0 ! increase src ptr by 8
3245 or %o4, %o3, %o4
3246 bgu,pt %ncc, .co_medh15
3247 stxa %o4, [%o1-8]%asi
3248 .co_medh7:
3249 addcc %o2, 7, %o2 ! finish adjustment of remaining count
3250 bz,pt %ncc, .co_smallx ! exit if finished
3251 cmp %o2, 4
3252 blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left
3253 nop !
3254 lduh [%o0], %o4
3255 sll %o4, 16, %o4
3256 lduh [%o0+2], %o3
3257 or %o3, %o4, %o4
3258 subcc %o2, 4, %o2
3259 add %o0, 4, %o0
3260 add %o1, 4, %o1
3261 bnz .co_small3x
3262 stwa %o4, [%o1-4]%asi
3263 membar #Sync
3264 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3265 retl
3266 mov %g0, %o0
3267
3268 .align 16
3269 .co_med_byte:
3270 bnz,pt %ncc, .co_medbh32a ! go to correct byte move
3271 subcc %o2, 31, %o2 ! adjust length to allow cc test
3272 ble,pt %ncc, .co_medb31
3273 nop
3274 .co_medb32: ! Alignment 1 or 5
3275 subcc %o2, 32, %o2 ! decrement length count
3276
3277 ldub [%o0], %o4 ! load and store a block of 32 bytes
3278 sllx %o4, 56, %o3
3279 lduh [%o0+1], %o4
3280 sllx %o4, 40, %o4
3281 or %o4, %o3, %o3
3282 lduw [%o0+3], %o4
3283 sllx %o4, 8, %o4
3284 or %o4, %o3, %o3
3285 ldub [%o0+7], %o4
3286 or %o4, %o3, %o4
3287 stxa %o4, [%o1]%asi
3288
3289 ldub [%o0+8], %o4
3290 sllx %o4, 56, %o3
3291 lduh [%o0+9], %o4
3292 sllx %o4, 40, %o4
3293 or %o4, %o3, %o3
3294 lduw [%o0+11], %o4
3295 sllx %o4, 8, %o4
3296 or %o4, %o3, %o3
3297 ldub [%o0+15], %o4
3298 or %o4, %o3, %o4
3299 stxa %o4, [%o1+8]%asi
3300
3301 ldub [%o0+16], %o4
3302 sllx %o4, 56, %o3
3303 lduh [%o0+17], %o4
3304 sllx %o4, 40, %o4
3305 or %o4, %o3, %o3
3306 lduw [%o0+19], %o4
3307 sllx %o4, 8, %o4
3308 or %o4, %o3, %o3
3309 ldub [%o0+23], %o4
3310 or %o4, %o3, %o4
3311 stxa %o4, [%o1+16]%asi
3312
3313 add %o0, 32, %o0 ! increase src ptr by 32
3314 add %o1, 32, %o1 ! increase dst ptr by 32
3315
3316 ldub [%o0-8], %o4
3317 sllx %o4, 56, %o3
3318 lduh [%o0-7], %o4
3319 sllx %o4, 40, %o4
3320 or %o4, %o3, %o3
3321 lduw [%o0-5], %o4
3322 sllx %o4, 8, %o4
3323 or %o4, %o3, %o3
3324 ldub [%o0-1], %o4
3325 or %o4, %o3, %o4
3326 bgu,pt %ncc, .co_medb32 ! repeat if at least 32 bytes left
3327 stxa %o4, [%o1-8]%asi
3328
3329 .co_medb31: ! 31 or fewer bytes remaining
3330 addcc %o2, 24, %o2 ! adjust count to be off by 7
3331 ble,pt %ncc, .co_medb7 ! skip if 7 or fewer bytes left
3332 nop !
3333 .co_medb15:
3334
3335 ldub [%o0], %o4 ! load and store a block of 8 bytes
3336 subcc %o2, 8, %o2 ! decrement length count
3337 sllx %o4, 56, %o3
3338 lduh [%o0+1], %o4
3339 sllx %o4, 40, %o4
3340 or %o4, %o3, %o3
3341 lduw [%o0+3], %o4
3342 add %o1, 8, %o1 ! increase dst ptr by 16
3343 sllx %o4, 8, %o4
3344 or %o4, %o3, %o3
3345 ldub [%o0+7], %o4
3346 add %o0, 8, %o0 ! increase src ptr by 16
3347 or %o4, %o3, %o4
3348 bgu,pt %ncc, .co_medb15
3349 stxa %o4, [%o1-8]%asi
3350 .co_medb7:
3351 addcc %o2, 7, %o2 ! finish adjustment of remaining count
3352 bz,pt %ncc, .co_smallx ! exit if finished
3353 cmp %o2, 4
3354 blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left
3355 nop !
3356 ldub [%o0], %o4 ! move 4 bytes
3357 sll %o4, 24, %o3
3358 lduh [%o0+1], %o4
3359 sll %o4, 8, %o4
3360 or %o4, %o3, %o3
3361 ldub [%o0+3], %o4
3362 or %o4, %o3, %o4
3363 subcc %o2, 4, %o2
3364 add %o0, 4, %o0
3365 add %o1, 4, %o1
3366 bnz .co_small3x
3367 stwa %o4, [%o1-4]%asi
3368 membar #Sync
3369 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3370 retl
3371 mov %g0, %o0
3372
3373 .align 16
3374 .co_medbh32a:
3375 ble,pt %ncc, .co_medbh31
3376 nop
3377 .co_medbh32: ! Alignment 3 or 7
3378 subcc %o2, 32, %o2 ! decrement length count
3379
3380 ldub [%o0], %o4 ! load and store a block of 32 bytes
3381 sllx %o4, 56, %o3
3382 lduw [%o0+1], %o4
3383 sllx %o4, 24, %o4
3384 or %o4, %o3, %o3
3385 lduh [%o0+5], %o4
3386 sllx %o4, 8, %o4
3387 or %o4, %o3, %o3
3388 ldub [%o0+7], %o4
3389 or %o4, %o3, %o4
3390 stxa %o4, [%o1]%asi
3391
3392 ldub [%o0+8], %o4
3393 sllx %o4, 56, %o3
3394 lduw [%o0+9], %o4
3395 sllx %o4, 24, %o4
3396 or %o4, %o3, %o3
3397 lduh [%o0+13], %o4
3398 sllx %o4, 8, %o4
3399 or %o4, %o3, %o3
3400 ldub [%o0+15], %o4
3401 or %o4, %o3, %o4
3402 stxa %o4, [%o1+8]%asi
3403
3404 ldub [%o0+16], %o4
3405 sllx %o4, 56, %o3
3406 lduw [%o0+17], %o4
3407 sllx %o4, 24, %o4
3408 or %o4, %o3, %o3
3409 lduh [%o0+21], %o4
3410 sllx %o4, 8, %o4
3411 or %o4, %o3, %o3
3412 ldub [%o0+23], %o4
3413 or %o4, %o3, %o4
3414 stxa %o4, [%o1+16]%asi
3415
3416 add %o0, 32, %o0 ! increase src ptr by 32
3417 add %o1, 32, %o1 ! increase dst ptr by 32
3418
3419 ldub [%o0-8], %o4
3420 sllx %o4, 56, %o3
3421 lduw [%o0-7], %o4
3422 sllx %o4, 24, %o4
3423 or %o4, %o3, %o3
3424 lduh [%o0-3], %o4
3425 sllx %o4, 8, %o4
3426 or %o4, %o3, %o3
3427 ldub [%o0-1], %o4
3428 or %o4, %o3, %o4
3429 bgu,pt %ncc, .co_medbh32 ! repeat if at least 32 bytes left
3430 stxa %o4, [%o1-8]%asi
3431
3432 .co_medbh31:
3433 addcc %o2, 24, %o2 ! adjust count to be off by 7
3434 ble,pt %ncc, .co_medb7 ! skip if 7 or fewer bytes left
3435 nop !
3436 .co_medbh15:
3437 ldub [%o0], %o4 ! load and store a block of 8 bytes
3438 sllx %o4, 56, %o3
3439 lduw [%o0+1], %o4
3440 sllx %o4, 24, %o4
3441 or %o4, %o3, %o3
3442 lduh [%o0+5], %o4
3443 sllx %o4, 8, %o4
3444 or %o4, %o3, %o3
3445 ldub [%o0+7], %o4
3446 or %o4, %o3, %o4
3447 stxa %o4, [%o1]%asi
3448 subcc %o2, 8, %o2 ! decrement length count
3449 add %o1, 8, %o1 ! increase dst ptr by 8
3450 add %o0, 8, %o0 ! increase src ptr by 8
3451 bgu,pt %ncc, .co_medbh15
3452 stxa %o4, [%o1-8]%asi
3453 ba .co_medb7
3454 nop
3455 /*
3456 * End of small copy (no window) code
3457 */
3458
3459 /*
3460 * Long copy code
3461 */
3462 .co_copy_more:
3463 sethi %hi(copyio_fault), %o3
3464 or %o3, %lo(copyio_fault), %o3
3465 membar #Sync
3466 stn %o3, [THREAD_REG + T_LOFAULT]
3467
3468 /*
3469 * Following code is for large copies. We know there is at
3470 * least FP_COPY bytes available. FP regs are used, so
3471 * we save registers and fp regs before starting
3472 */
3473 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3474 or SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
3475 rd %fprs, %g1 ! check for unused fp
3476 ! if fprs.fef == 0, set it.
3477 ! Setting it when already set costs more than checking
3478 andcc %g1, FPRS_FEF, %g1 ! test FEF, fprs.du = fprs.dl = 0
3479 bz,pt %ncc, .co_fp_unused
3480 mov ASI_USER, %asi
3481 BST_FP_TOSTACK(%o3)
3482 ba .co_fp_ready
3483 .co_fp_unused:
3484 prefetch [%i0 + (1 * CACHE_LINE)], #one_read
3485 wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
3486 .co_fp_ready:
3487 rd %gsr, %l5 ! save %gsr value
3488 andcc %i1, 1, %o3 ! is dest byte aligned
3489 bnz,pt %ncc, .co_big_d1
3490 .co_big_d1f: ! dest is now half word aligned
3491 andcc %i1, 2, %o3
3492 bnz,pt %ncc, .co_big_d2
3493 .co_big_d2f: ! dest is now word aligned
3494 andcc %i1, 4, %o3 ! is dest longword aligned
3495 bnz,pt %ncc, .co_big_d4
3496 .co_big_d4f: ! dest is now long word aligned
3497 andcc %i0, 7, %o3 ! is src long word aligned
3498 brnz,pt %o3, .co_big_unal8
3499 prefetch [%i0 + (2 * CACHE_LINE)], #one_read
3500 ! Src and dst are long word aligned
3501 ! align dst to 64 byte boundary
3502 andcc %i1, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned
3503 brz,pn %o3, .co_al_to_64
3504 nop
3505 sub %o3, 64, %o3 ! %o3 has negative bytes to move
3506 add %i2, %o3, %i2 ! adjust remaining count
3507 andcc %o3, 8, %o4 ! odd long words to move?
3508 brz,pt %o4, .co_al_to_16
3509 nop
3510 add %o3, 8, %o3
3511 ldx [%i0], %o4
3512 add %i0, 8, %i0 ! increment src ptr
3513 stxa %o4, [%i1]ASI_USER
3514 add %i1, 8, %i1 ! increment dst ptr
3515 ! Dest is aligned on 16 bytes, src 8 byte aligned
3516 .co_al_to_16:
3517 andcc %o3, 0x30, %o4 ! move to move?
3518 brz,pt %o4, .co_al_to_64
3519 nop
3520 .co_al_mv_16:
3521 add %o3, 16, %o3
3522 ldx [%i0], %o4
3523 stxa %o4, [%i1]ASI_USER
3524 add %i0, 16, %i0 ! increment src ptr
3525 ldx [%i0-8], %o4
3526 add %i1, 8, %i1 ! increment dst ptr
3527 stxa %o4, [%i1]ASI_USER
3528 andcc %o3, 0x30, %o4
3529 brnz,pt %o4, .co_al_mv_16
3530 add %i1, 8, %i1 ! increment dst ptr
3531 ! Dest is aligned on 64 bytes, src 8 byte aligned
3532 .co_al_to_64:
3533 ! Determine source alignment
3534 ! to correct 8 byte offset
3535 andcc %i0, 32, %o3
3536 brnz,pn %o3, .co_aln_1
3537 andcc %i0, 16, %o3
3538 brnz,pn %o3, .co_aln_01
3539 andcc %i0, 8, %o3
3540 brz,pn %o3, .co_aln_000
3541 prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3542 ba .co_aln_001
3543 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3544 .co_aln_01:
3545 brnz,pn %o3, .co_aln_011
3546 prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3547 ba .co_aln_010
3548 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3549 .co_aln_1:
3550 andcc %i0, 16, %o3
3551 brnz,pn %o3, .co_aln_11
3552 andcc %i0, 8, %o3
3553 brnz,pn %o3, .co_aln_101
3554 prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3555 ba .co_aln_100
3556 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3557 .co_aln_11:
3558 brz,pn %o3, .co_aln_110
3559 prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3560
3561 .co_aln_111:
3562 ! Alignment off by 8 bytes
3563 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3564 ldd [%i0], %d0
3565 add %i0, 8, %i0
3566 sub %i2, 8, %i2
3567 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
3568 and %i2, 0x7f, %i2 ! residue bytes in %i2
3569 sub %i1, %i0, %i1
3570 .co_aln_111_loop:
3571 ldda [%i0]ASI_BLK_P,%d16 ! block load
3572 subcc %o3, 64, %o3
3573 fmovd %d16, %d2
3574 fmovd %d18, %d4
3575 fmovd %d20, %d6
3576 fmovd %d22, %d8
3577 fmovd %d24, %d10
3578 fmovd %d26, %d12
3579 fmovd %d28, %d14
3580 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
3581 stda %d0,[%i0+%i1]ASI_BLK_AIUS
3582 add %i0, 64, %i0
3583 fmovd %d30, %d0
3584 bgt,pt %ncc, .co_aln_111_loop
3585 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3586 add %i1, %i0, %i1
3587
3588 stda %d0, [%i1]ASI_USER
3589 ba .co_remain_stuff
3590 add %i1, 8, %i1
3591 ! END OF aln_111
3592
3593 .co_aln_110:
3594 ! Alignment off by 16 bytes
3595 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3596 ldd [%i0], %d0
3597 ldd [%i0+8], %d2
3598 add %i0, 16, %i0
3599 sub %i2, 16, %i2
3600 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
3601 and %i2, 0x7f, %i2 ! residue bytes in %i2
3602 sub %i1, %i0, %i1
3603 .co_aln_110_loop:
3604 ldda [%i0]ASI_BLK_P,%d16 ! block load
3605 subcc %o3, 64, %o3
3606 fmovd %d16, %d4
3607 fmovd %d18, %d6
3608 fmovd %d20, %d8
3609 fmovd %d22, %d10
3610 fmovd %d24, %d12
3611 fmovd %d26, %d14
3612 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
3613 stda %d0,[%i0+%i1]ASI_BLK_AIUS
3614 add %i0, 64, %i0
3615 fmovd %d28, %d0
3616 fmovd %d30, %d2
3617 bgt,pt %ncc, .co_aln_110_loop
3618 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3619 add %i1, %i0, %i1
3620
3621 stda %d0, [%i1]%asi
3622 stda %d2, [%i1+8]%asi
3623 ba .co_remain_stuff
3624 add %i1, 16, %i1
3625 ! END OF aln_110
3626
3627 .co_aln_101:
3628 ! Alignment off by 24 bytes
3629 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3630 ldd [%i0], %d0
3631 ldd [%i0+8], %d2
3632 ldd [%i0+16], %d4
3633 add %i0, 24, %i0
3634 sub %i2, 24, %i2
3635 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
3636 and %i2, 0x7f, %i2 ! residue bytes in %i2
3637 sub %i1, %i0, %i1
3638 .co_aln_101_loop:
3639 ldda [%i0]ASI_BLK_P,%d16 ! block load
3640 subcc %o3, 64, %o3
3641 fmovd %d16, %d6
3642 fmovd %d18, %d8
3643 fmovd %d20, %d10
3644 fmovd %d22, %d12
3645 fmovd %d24, %d14
3646 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
3647 stda %d0,[%i0+%i1]ASI_BLK_AIUS
3648 add %i0, 64, %i0
3649 fmovd %d26, %d0
3650 fmovd %d28, %d2
3651 fmovd %d30, %d4
3652 bgt,pt %ncc, .co_aln_101_loop
3653 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3654 add %i1, %i0, %i1
3655
3656 stda %d0, [%i1]%asi
3657 stda %d2, [%i1+8]%asi
3658 stda %d4, [%i1+16]%asi
3659 ba .co_remain_stuff
3660 add %i1, 24, %i1
3661 ! END OF aln_101
3662
3663 .co_aln_100:
3664 ! Alignment off by 32 bytes
3665 ldd [%i0], %d0
3666 ldd [%i0+8], %d2
3667 ldd [%i0+16],%d4
3668 ldd [%i0+24],%d6
3669 add %i0, 32, %i0
3670 sub %i2, 32, %i2
3671 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
3672 and %i2, 0x7f, %i2 ! residue bytes in %i2
3673 sub %i1, %i0, %i1
3674 .co_aln_100_loop:
3675 ldda [%i0]ASI_BLK_P,%d16 ! block load
3676 subcc %o3, 64, %o3
3677 fmovd %d16, %d8
3678 fmovd %d18, %d10
3679 fmovd %d20, %d12
3680 fmovd %d22, %d14
3681 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
3682 stda %d0,[%i0+%i1]ASI_BLK_AIUS
3683 add %i0, 64, %i0
3684 fmovd %d24, %d0
3685 fmovd %d26, %d2
3686 fmovd %d28, %d4
3687 fmovd %d30, %d6
3688 bgt,pt %ncc, .co_aln_100_loop
3689 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3690 add %i1, %i0, %i1
3691
3692 stda %d0, [%i1]%asi
3693 stda %d2, [%i1+8]%asi
3694 stda %d4, [%i1+16]%asi
3695 stda %d6, [%i1+24]%asi
3696 ba .co_remain_stuff
3697 add %i1, 32, %i1
3698 ! END OF aln_100
3699
3700 .co_aln_011:
3701 ! Alignment off by 40 bytes
3702 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3703 ldd [%i0], %d0
3704 ldd [%i0+8], %d2
3705 ldd [%i0+16], %d4
3706 ldd [%i0+24], %d6
3707 ldd [%i0+32], %d8
3708 add %i0, 40, %i0
3709 sub %i2, 40, %i2
3710 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
3711 and %i2, 0x7f, %i2 ! residue bytes in %i2
3712 sub %i1, %i0, %i1
3713 .co_aln_011_loop:
3714 ldda [%i0]ASI_BLK_P,%d16 ! block load
3715 subcc %o3, 64, %o3
3716 fmovd %d16, %d10
3717 fmovd %d18, %d12
3718 fmovd %d20, %d14
3719 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
3720 stda %d0,[%i0+%i1]ASI_BLK_AIUS
3721 add %i0, 64, %i0
3722 fmovd %d22, %d0
3723 fmovd %d24, %d2
3724 fmovd %d26, %d4
3725 fmovd %d28, %d6
3726 fmovd %d30, %d8
3727 bgt,pt %ncc, .co_aln_011_loop
3728 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3729 add %i1, %i0, %i1
3730
3731 stda %d0, [%i1]%asi
3732 stda %d2, [%i1+8]%asi
3733 stda %d4, [%i1+16]%asi
3734 stda %d6, [%i1+24]%asi
3735 stda %d8, [%i1+32]%asi
3736 ba .co_remain_stuff
3737 add %i1, 40, %i1
3738 ! END OF aln_011
3739
3740 .co_aln_010:
3741 ! Alignment off by 48 bytes
3742 ldd [%i0], %d0
3743 ldd [%i0+8], %d2
3744 ldd [%i0+16], %d4
3745 ldd [%i0+24], %d6
3746 ldd [%i0+32], %d8
3747 ldd [%i0+40], %d10
3748 add %i0, 48, %i0
3749 sub %i2, 48, %i2
3750 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
3751 and %i2, 0x7f, %i2 ! residue bytes in %i2
3752 sub %i1, %i0, %i1
3753 .co_aln_010_loop:
3754 ldda [%i0]ASI_BLK_P,%d16 ! block load
3755 subcc %o3, 64, %o3
3756 fmovd %d16, %d12
3757 fmovd %d18, %d14
3758 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
3759 stda %d0,[%i0+%i1]ASI_BLK_AIUS
3760 add %i0, 64, %i0
3761 fmovd %d20, %d0
3762 fmovd %d22, %d2
3763 fmovd %d24, %d4
3764 fmovd %d26, %d6
3765 fmovd %d28, %d8
3766 fmovd %d30, %d10
3767 bgt,pt %ncc, .co_aln_010_loop
3768 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3769 add %i1, %i0, %i1
3770
3771 stda %d0, [%i1]%asi
3772 stda %d2, [%i1+8]%asi
3773 stda %d4, [%i1+16]%asi
3774 stda %d6, [%i1+24]%asi
3775 stda %d8, [%i1+32]%asi
3776 stda %d10, [%i1+40]%asi
3777 ba .co_remain_stuff
3778 add %i1, 48, %i1
3779 ! END OF aln_010
3780
3781 .co_aln_001:
3782 ! Alignment off by 56 bytes
3783 ldd [%i0], %d0
3784 ldd [%i0+8], %d2
3785 ldd [%i0+16], %d4
3786 ldd [%i0+24], %d6
3787 ldd [%i0+32], %d8
3788 ldd [%i0+40], %d10
3789 ldd [%i0+48], %d12
3790 add %i0, 56, %i0
3791 sub %i2, 56, %i2
3792 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
3793 and %i2, 0x7f, %i2 ! residue bytes in %i2
3794 sub %i1, %i0, %i1
3795 .co_aln_001_loop:
3796 ldda [%i0]ASI_BLK_P,%d16 ! block load
3797 subcc %o3, 64, %o3
3798 fmovd %d16, %d14
3799 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
3800 stda %d0,[%i0+%i1]ASI_BLK_AIUS
3801 add %i0, 64, %i0
3802 fmovd %d18, %d0
3803 fmovd %d20, %d2
3804 fmovd %d22, %d4
3805 fmovd %d24, %d6
3806 fmovd %d26, %d8
3807 fmovd %d28, %d10
3808 fmovd %d30, %d12
3809 bgt,pt %ncc, .co_aln_001_loop
3810 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3811 add %i1, %i0, %i1
3812
3813 stda %d0, [%i1]%asi
3814 stda %d2, [%i1+8]%asi
3815 stda %d4, [%i1+16]%asi
3816 stda %d6, [%i1+24]%asi
3817 stda %d8, [%i1+32]%asi
3818 stda %d10, [%i1+40]%asi
3819 stda %d12, [%i1+48]%asi
3820 ba .co_remain_stuff
3821 add %i1, 56, %i1
3822 ! END OF aln_001
3823
3824 .co_aln_000:
3825 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3826 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
3827 and %i2, 0x7f, %i2 ! residue bytes in %i2
3828 sub %i1, %i0, %i1
3829 .co_aln_000_loop:
3830 ldda [%i0]ASI_BLK_P,%d0
3831 subcc %o3, 64, %o3
3832 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
3833 stda %d0,[%i0+%i1]ASI_BLK_AIUS
3834 add %i0, 64, %i0
3835 bgt,pt %ncc, .co_aln_000_loop
3836 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3837 add %i1, %i0, %i1
3838
3839 ! END OF aln_000
3840
3841 .co_remain_stuff:
3842 subcc %i2, 31, %i2 ! adjust length to allow cc test
3843 ble,pt %ncc, .co_aln_31
3844 nop
3845 .co_aln_32:
3846 ldx [%i0], %o4 ! move 32 bytes
3847 subcc %i2, 32, %i2 ! decrement length count by 32
3848 stxa %o4, [%i1]%asi
3849 ldx [%i0+8], %o4
3850 stxa %o4, [%i1+8]%asi
3851 ldx [%i0+16], %o4
3852 add %i0, 32, %i0 ! increase src ptr by 32
3853 stxa %o4, [%i1+16]%asi
3854 ldx [%i0-8], %o4
3855 add %i1, 32, %i1 ! increase dst ptr by 32
3856 bgu,pt %ncc, .co_aln_32 ! repeat if at least 32 bytes left
3857 stxa %o4, [%i1-8]%asi
3858 .co_aln_31:
3859 addcc %i2, 24, %i2 ! adjust count to be off by 7
3860 ble,pt %ncc, .co_aln_7 ! skip if 7 or fewer bytes left
3861 nop !
3862 .co_aln_15:
3863 ldx [%i0], %o4 ! move 8 bytes
3864 add %i0, 8, %i0 ! increase src ptr by 8
3865 subcc %i2, 8, %i2 ! decrease count by 8
3866 add %i1, 8, %i1 ! increase dst ptr by 8
3867 bgu,pt %ncc, .co_aln_15
3868 stxa %o4, [%i1-8]%asi
3869 .co_aln_7:
3870 addcc %i2, 7, %i2 ! finish adjustment of remaining count
3871 bz,pt %ncc, .co_exit ! exit if finished
3872 cmp %i2, 4
3873 blt,pt %ncc, .co_unaln3x ! skip if less than 4 bytes left
3874 nop !
3875 ld [%i0], %o4 ! move 4 bytes
3876 add %i0, 4, %i0 ! increase src ptr by 4
3877 add %i1, 4, %i1 ! increase dst ptr by 4
3878 subcc %i2, 4, %i2 ! decrease count by 4
3879 bnz .co_unaln3x
3880 stwa %o4, [%i1-4]%asi
3881 ba .co_exit
3882 nop
3883
3884 ! destination alignment code
3885 .co_big_d1:
3886 ldub [%i0], %o4 ! move a byte
3887 add %i0, 1, %i0
3888 stba %o4, [%i1]ASI_USER
3889 add %i1, 1, %i1
3890 andcc %i1, 2, %o3
3891 bz,pt %ncc, .co_big_d2f
3892 sub %i2, 1, %i2
3893 .co_big_d2:
3894 ldub [%i0], %o4 ! move a half-word (src align unknown)
3895 ldub [%i0+1], %o3
3896 add %i0, 2, %i0
3897 sll %o4, 8, %o4 ! position
3898 or %o4, %o3, %o4 ! merge
3899 stha %o4, [%i1]ASI_USER
3900 add %i1, 2, %i1
3901 andcc %i1, 4, %o3 ! is dest longword aligned
3902 bz,pt %ncc, .co_big_d4f
3903 sub %i2, 2, %i2
3904 .co_big_d4: ! dest is at least word aligned
3905 nop
3906 ldub [%i0], %o4 ! move a word (src align unknown)
3907 ldub [%i0+1], %o3
3908 sll %o4, 24, %o4 ! position
3909 sll %o3, 16, %o3 ! position
3910 or %o4, %o3, %o3 ! merge
3911 ldub [%i0+2], %o4
3912 sll %o4, 8, %o4 ! position
3913 or %o4, %o3, %o3 ! merge
3914 ldub [%i0+3], %o4
3915 or %o4, %o3, %o4 ! merge
3916 stwa %o4,[%i1]ASI_USER ! store four bytes
3917 add %i0, 4, %i0 ! adjust src by 4
3918 add %i1, 4, %i1 ! adjust dest by 4
3919 ba .co_big_d4f
3920 sub %i2, 4, %i2 ! adjust count by 4
3921
3922
3923 ! Dst is on 8 byte boundary; src is not;
3924 .co_big_unal8:
3925 andcc %i1, 0x3f, %o3 ! is dst 64-byte block aligned?
3926 bz %ncc, .co_unalnsrc
3927 sub %o3, 64, %o3 ! %o3 will be multiple of 8
3928 neg %o3 ! bytes until dest is 64 byte aligned
3929 sub %i2, %o3, %i2 ! update cnt with bytes to be moved
3930 ! Move bytes according to source alignment
3931 andcc %i0, 0x1, %o4
3932 bnz %ncc, .co_unalnbyte ! check for byte alignment
3933 nop
3934 andcc %i0, 2, %o4 ! check for half word alignment
3935 bnz %ncc, .co_unalnhalf
3936 nop
3937 ! Src is word aligned, move bytes until dest 64 byte aligned
3938 .co_unalnword:
3939 ld [%i0], %o4 ! load 4 bytes
3940 stwa %o4, [%i1]%asi ! and store 4 bytes
3941 ld [%i0+4], %o4 ! load 4 bytes
3942 add %i0, 8, %i0 ! increase src ptr by 8
3943 stwa %o4, [%i1+4]%asi ! and store 4 bytes
3944 subcc %o3, 8, %o3 ! decrease count by 8
3945 bnz %ncc, .co_unalnword
3946 add %i1, 8, %i1 ! increase dst ptr by 8
3947 ba .co_unalnsrc
3948 nop
3949
3950 ! Src is half-word aligned, move bytes until dest 64 byte aligned
3951 .co_unalnhalf:
3952 lduh [%i0], %o4 ! load 2 bytes
3953 sllx %o4, 32, %i3 ! shift left
3954 lduw [%i0+2], %o4
3955 or %o4, %i3, %i3
3956 sllx %i3, 16, %i3
3957 lduh [%i0+6], %o4
3958 or %o4, %i3, %i3
3959 stxa %i3, [%i1]ASI_USER
3960 add %i0, 8, %i0
3961 subcc %o3, 8, %o3
3962 bnz %ncc, .co_unalnhalf
3963 add %i1, 8, %i1
3964 ba .co_unalnsrc
3965 nop
3966
3967 ! Src is Byte aligned, move bytes until dest 64 byte aligned
3968 .co_unalnbyte:
3969 sub %i1, %i0, %i1 ! share pointer advance
3970 .co_unalnbyte_loop:
3971 ldub [%i0], %o4
3972 sllx %o4, 56, %i3
3973 lduh [%i0+1], %o4
3974 sllx %o4, 40, %o4
3975 or %o4, %i3, %i3
3976 lduh [%i0+3], %o4
3977 sllx %o4, 24, %o4
3978 or %o4, %i3, %i3
3979 lduh [%i0+5], %o4
3980 sllx %o4, 8, %o4
3981 or %o4, %i3, %i3
3982 ldub [%i0+7], %o4
3983 or %o4, %i3, %i3
3984 stxa %i3, [%i1+%i0]ASI_USER
3985 subcc %o3, 8, %o3
3986 bnz %ncc, .co_unalnbyte_loop
3987 add %i0, 8, %i0
3988 add %i1,%i0, %i1 ! restore pointer
3989
3990 ! Destination is now block (64 byte aligned), src is not 8 byte aligned
3991 .co_unalnsrc:
3992 andn %i2, 0x3f, %i3 ! %i3 is multiple of block size
3993 and %i2, 0x3f, %i2 ! residue bytes in %i2
3994 add %i2, 64, %i2 ! Insure we don't load beyond
3995 sub %i3, 64, %i3 ! end of source buffer
3996
3997 andn %i0, 0x3f, %o4 ! %o4 has block aligned src address
3998 prefetch [%o4 + (3 * CACHE_LINE)], #one_read
3999 alignaddr %i0, %g0, %g0 ! generate %gsr
4000 add %i0, %i3, %i0 ! advance %i0 to after blocks
4001 !
4002 ! Determine source alignment to correct 8 byte offset
4003 andcc %i0, 0x20, %o3
4004 brnz,pn %o3, .co_unaln_1
4005 andcc %i0, 0x10, %o3
4006 brnz,pn %o3, .co_unaln_01
4007 andcc %i0, 0x08, %o3
4008 brz,a %o3, .co_unaln_000
4009 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4010 ba .co_unaln_001
4011 nop
4012 .co_unaln_01:
4013 brnz,a %o3, .co_unaln_011
4014 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4015 ba .co_unaln_010
4016 nop
4017 .co_unaln_1:
4018 brnz,pn %o3, .co_unaln_11
4019 andcc %i0, 0x08, %o3
4020 brnz,a %o3, .co_unaln_101
4021 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4022 ba .co_unaln_100
4023 nop
4024 .co_unaln_11:
4025 brz,pn %o3, .co_unaln_110
4026 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
4027
4028 .co_unaln_111:
4029 ldd [%o4+56], %d14
4030 .co_unaln_111_loop:
4031 add %o4, 64, %o4
4032 ldda [%o4]ASI_BLK_P, %d16
4033 faligndata %d14, %d16, %d48
4034 faligndata %d16, %d18, %d50
4035 faligndata %d18, %d20, %d52
4036 faligndata %d20, %d22, %d54
4037 faligndata %d22, %d24, %d56
4038 faligndata %d24, %d26, %d58
4039 faligndata %d26, %d28, %d60
4040 faligndata %d28, %d30, %d62
4041 fmovd %d30, %d14
4042 stda %d48, [%i1]ASI_BLK_AIUS
4043 subcc %i3, 64, %i3
4044 add %i1, 64, %i1
4045 bgu,pt %ncc, .co_unaln_111_loop
4046 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4047 ba .co_unaln_done
4048 nop
4049
4050 .co_unaln_110:
4051 ldd [%o4+48], %d12
4052 ldd [%o4+56], %d14
4053 .co_unaln_110_loop:
4054 add %o4, 64, %o4
4055 ldda [%o4]ASI_BLK_P, %d16
4056 faligndata %d12, %d14, %d48
4057 faligndata %d14, %d16, %d50
4058 faligndata %d16, %d18, %d52
4059 faligndata %d18, %d20, %d54
4060 faligndata %d20, %d22, %d56
4061 faligndata %d22, %d24, %d58
4062 faligndata %d24, %d26, %d60
4063 faligndata %d26, %d28, %d62
4064 fmovd %d28, %d12
4065 fmovd %d30, %d14
4066 stda %d48, [%i1]ASI_BLK_AIUS
4067 subcc %i3, 64, %i3
4068 add %i1, 64, %i1
4069 bgu,pt %ncc, .co_unaln_110_loop
4070 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4071 ba .co_unaln_done
4072 nop
4073
4074 .co_unaln_101:
4075 ldd [%o4+40], %d10
4076 ldd [%o4+48], %d12
4077 ldd [%o4+56], %d14
4078 .co_unaln_101_loop:
4079 add %o4, 64, %o4
4080 ldda [%o4]ASI_BLK_P, %d16
4081 faligndata %d10, %d12, %d48
4082 faligndata %d12, %d14, %d50
4083 faligndata %d14, %d16, %d52
4084 faligndata %d16, %d18, %d54
4085 faligndata %d18, %d20, %d56
4086 faligndata %d20, %d22, %d58
4087 faligndata %d22, %d24, %d60
4088 faligndata %d24, %d26, %d62
4089 fmovd %d26, %d10
4090 fmovd %d28, %d12
4091 fmovd %d30, %d14
4092 stda %d48, [%i1]ASI_BLK_AIUS
4093 subcc %i3, 64, %i3
4094 add %i1, 64, %i1
4095 bgu,pt %ncc, .co_unaln_101_loop
4096 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4097 ba .co_unaln_done
4098 nop
4099
4100 .co_unaln_100:
4101 ldd [%o4+32], %d8
4102 ldd [%o4+40], %d10
4103 ldd [%o4+48], %d12
4104 ldd [%o4+56], %d14
4105 .co_unaln_100_loop:
4106 add %o4, 64, %o4
4107 ldda [%o4]ASI_BLK_P, %d16
4108 faligndata %d8, %d10, %d48
4109 faligndata %d10, %d12, %d50
4110 faligndata %d12, %d14, %d52
4111 faligndata %d14, %d16, %d54
4112 faligndata %d16, %d18, %d56
4113 faligndata %d18, %d20, %d58
4114 faligndata %d20, %d22, %d60
4115 faligndata %d22, %d24, %d62
4116 fmovd %d24, %d8
4117 fmovd %d26, %d10
4118 fmovd %d28, %d12
4119 fmovd %d30, %d14
4120 stda %d48, [%i1]ASI_BLK_AIUS
4121 subcc %i3, 64, %i3
4122 add %i1, 64, %i1
4123 bgu,pt %ncc, .co_unaln_100_loop
4124 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4125 ba .co_unaln_done
4126 nop
4127
4128 .co_unaln_011:
4129 ldd [%o4+24], %d6
4130 ldd [%o4+32], %d8
4131 ldd [%o4+40], %d10
4132 ldd [%o4+48], %d12
4133 ldd [%o4+56], %d14
4134 .co_unaln_011_loop:
4135 add %o4, 64, %o4
4136 ldda [%o4]ASI_BLK_P, %d16
4137 faligndata %d6, %d8, %d48
4138 faligndata %d8, %d10, %d50
4139 faligndata %d10, %d12, %d52
4140 faligndata %d12, %d14, %d54
4141 faligndata %d14, %d16, %d56
4142 faligndata %d16, %d18, %d58
4143 faligndata %d18, %d20, %d60
4144 faligndata %d20, %d22, %d62
4145 fmovd %d22, %d6
4146 fmovd %d24, %d8
4147 fmovd %d26, %d10
4148 fmovd %d28, %d12
4149 fmovd %d30, %d14
4150 stda %d48, [%i1]ASI_BLK_AIUS
4151 subcc %i3, 64, %i3
4152 add %i1, 64, %i1
4153 bgu,pt %ncc, .co_unaln_011_loop
4154 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4155 ba .co_unaln_done
4156 nop
4157
4158 .co_unaln_010:
4159 ldd [%o4+16], %d4
4160 ldd [%o4+24], %d6
4161 ldd [%o4+32], %d8
4162 ldd [%o4+40], %d10
4163 ldd [%o4+48], %d12
4164 ldd [%o4+56], %d14
4165 .co_unaln_010_loop:
4166 add %o4, 64, %o4
4167 ldda [%o4]ASI_BLK_P, %d16
4168 faligndata %d4, %d6, %d48
4169 faligndata %d6, %d8, %d50
4170 faligndata %d8, %d10, %d52
4171 faligndata %d10, %d12, %d54
4172 faligndata %d12, %d14, %d56
4173 faligndata %d14, %d16, %d58
4174 faligndata %d16, %d18, %d60
4175 faligndata %d18, %d20, %d62
4176 fmovd %d20, %d4
4177 fmovd %d22, %d6
4178 fmovd %d24, %d8
4179 fmovd %d26, %d10
4180 fmovd %d28, %d12
4181 fmovd %d30, %d14
4182 stda %d48, [%i1]ASI_BLK_AIUS
4183 subcc %i3, 64, %i3
4184 add %i1, 64, %i1
4185 bgu,pt %ncc, .co_unaln_010_loop
4186 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4187 ba .co_unaln_done
4188 nop
4189
4190 .co_unaln_001:
4191 ldd [%o4+8], %d2
4192 ldd [%o4+16], %d4
4193 ldd [%o4+24], %d6
4194 ldd [%o4+32], %d8
4195 ldd [%o4+40], %d10
4196 ldd [%o4+48], %d12
4197 ldd [%o4+56], %d14
4198 .co_unaln_001_loop:
4199 add %o4, 64, %o4
4200 ldda [%o4]ASI_BLK_P, %d16
4201 faligndata %d2, %d4, %d48
4202 faligndata %d4, %d6, %d50
4203 faligndata %d6, %d8, %d52
4204 faligndata %d8, %d10, %d54
4205 faligndata %d10, %d12, %d56
4206 faligndata %d12, %d14, %d58
4207 faligndata %d14, %d16, %d60
4208 faligndata %d16, %d18, %d62
4209 fmovd %d18, %d2
4210 fmovd %d20, %d4
4211 fmovd %d22, %d6
4212 fmovd %d24, %d8
4213 fmovd %d26, %d10
4214 fmovd %d28, %d12
4215 fmovd %d30, %d14
4216 stda %d48, [%i1]ASI_BLK_AIUS
4217 subcc %i3, 64, %i3
4218 add %i1, 64, %i1
4219 bgu,pt %ncc, .co_unaln_001_loop
4220 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4221 ba .co_unaln_done
4222 nop
4223
4224 .co_unaln_000:
4225 ldda [%o4]ASI_BLK_P, %d0
4226 .co_unaln_000_loop:
4227 add %o4, 64, %o4
4228 ldda [%o4]ASI_BLK_P, %d16
4229 faligndata %d0, %d2, %d48
4230 faligndata %d2, %d4, %d50
4231 faligndata %d4, %d6, %d52
4232 faligndata %d6, %d8, %d54
4233 faligndata %d8, %d10, %d56
4234 faligndata %d10, %d12, %d58
4235 faligndata %d12, %d14, %d60
4236 faligndata %d14, %d16, %d62
4237 fmovd %d16, %d0
4238 fmovd %d18, %d2
4239 fmovd %d20, %d4
4240 fmovd %d22, %d6
4241 fmovd %d24, %d8
4242 fmovd %d26, %d10
4243 fmovd %d28, %d12
4244 fmovd %d30, %d14
4245 stda %d48, [%i1]ASI_BLK_AIUS
4246 subcc %i3, 64, %i3
4247 add %i1, 64, %i1
4248 bgu,pt %ncc, .co_unaln_000_loop
4249 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4250
4251 .co_unaln_done:
4252 ! Handle trailing bytes, 64 to 127
4253 ! Dest long word aligned, Src not long word aligned
4254 cmp %i2, 15
4255 bleu %ncc, .co_unaln_short
4256
4257 andn %i2, 0x7, %i3 ! %i3 is multiple of 8
4258 and %i2, 0x7, %i2 ! residue bytes in %i2
4259 add %i2, 8, %i2
4260 sub %i3, 8, %i3 ! insure we don't load past end of src
4261 andn %i0, 0x7, %o4 ! %o4 has long word aligned src address
4262 add %i0, %i3, %i0 ! advance %i0 to after multiple of 8
4263 ldd [%o4], %d0 ! fetch partial word
4264 .co_unaln_by8:
4265 ldd [%o4+8], %d2
4266 add %o4, 8, %o4
4267 faligndata %d0, %d2, %d16
4268 subcc %i3, 8, %i3
4269 stda %d16, [%i1]%asi
4270 fmovd %d2, %d0
4271 bgu,pt %ncc, .co_unaln_by8
4272 add %i1, 8, %i1
4273
4274 .co_unaln_short:
4275 cmp %i2, 8
4276 blt,pt %ncc, .co_unalnfin
4277 nop
4278 ldub [%i0], %o4
4279 sll %o4, 24, %o3
4280 ldub [%i0+1], %o4
4281 sll %o4, 16, %o4
4282 or %o4, %o3, %o3
4283 ldub [%i0+2], %o4
4284 sll %o4, 8, %o4
4285 or %o4, %o3, %o3
4286 ldub [%i0+3], %o4
4287 or %o4, %o3, %o3
4288 stwa %o3, [%i1]%asi
4289 ldub [%i0+4], %o4
4290 sll %o4, 24, %o3
4291 ldub [%i0+5], %o4
4292 sll %o4, 16, %o4
4293 or %o4, %o3, %o3
4294 ldub [%i0+6], %o4
4295 sll %o4, 8, %o4
4296 or %o4, %o3, %o3
4297 ldub [%i0+7], %o4
4298 or %o4, %o3, %o3
4299 stwa %o3, [%i1+4]%asi
4300 add %i0, 8, %i0
4301 add %i1, 8, %i1
4302 sub %i2, 8, %i2
4303 .co_unalnfin:
4304 cmp %i2, 4
4305 blt,pt %ncc, .co_unalnz
4306 tst %i2
4307 ldub [%i0], %o3 ! read byte
4308 subcc %i2, 4, %i2 ! reduce count by 4
4309 sll %o3, 24, %o3 ! position
4310 ldub [%i0+1], %o4
4311 sll %o4, 16, %o4 ! position
4312 or %o4, %o3, %o3 ! merge
4313 ldub [%i0+2], %o4
4314 sll %o4, 8, %o4 ! position
4315 or %o4, %o3, %o3 ! merge
4316 add %i1, 4, %i1 ! advance dst by 4
4317 ldub [%i0+3], %o4
4318 add %i0, 4, %i0 ! advance src by 4
4319 or %o4, %o3, %o4 ! merge
4320 bnz,pt %ncc, .co_unaln3x
4321 stwa %o4, [%i1-4]%asi
4322 ba .co_exit
4323 nop
4324 .co_unalnz:
4325 bz,pt %ncc, .co_exit
4326 wr %l5, %g0, %gsr ! restore %gsr
4327 .co_unaln3x: ! Exactly 1, 2, or 3 bytes remain
4328 subcc %i2, 1, %i2 ! reduce count for cc test
4329 ldub [%i0], %o4 ! load one byte
4330 bz,pt %ncc, .co_exit
4331 stba %o4, [%i1]%asi ! store one byte
4332 ldub [%i0+1], %o4 ! load second byte
4333 subcc %i2, 1, %i2
4334 bz,pt %ncc, .co_exit
4335 stba %o4, [%i1+1]%asi ! store second byte
4336 ldub [%i0+2], %o4 ! load third byte
4337 stba %o4, [%i1+2]%asi ! store third byte
4338 .co_exit:
4339 brnz %g1, .co_fp_restore
4340 nop
4341 FZERO
4342 wr %g1, %g0, %fprs
4343 ba,pt %ncc, .co_ex2
4344 membar #Sync
4345 .co_fp_restore:
4346 BLD_FP_FROMSTACK(%o4)
4347 .co_ex2:
4348 andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
4349 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
4350 ret
4351 restore %g0, 0, %o0
4352
4353 .copyout_err:
4354 ldn [THREAD_REG + T_COPYOPS], %o4
4355 brz %o4, 2f
4356 nop
4357 ldn [%o4 + CP_COPYOUT], %g2
4358 jmp %g2
4359 nop
4360 2:
4361 retl
4362 mov -1, %o0
4363
4364 #else /* NIAGARA_IMPL */
4365 .do_copyout:
4366 !
4367 ! Check the length and bail if zero.
4368 !
4369 tst %o2
4370 bnz,pt %ncc, 1f
4371 nop
4372 retl
4373 clr %o0
4374 1:
4375 sethi %hi(copyio_fault), %o4
4376 or %o4, %lo(copyio_fault), %o4
4377 sethi %hi(copyio_fault_nowindow), %o3
4378 ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
4379 or %o3, %lo(copyio_fault_nowindow), %o3
4380 membar #Sync
4381 stn %o3, [THREAD_REG + T_LOFAULT]
4382
4383 mov %o0, SAVE_SRC
4384 mov %o1, SAVE_DST
4385 mov %o2, SAVE_COUNT
4386
4387 !
4388 ! Check to see if we're more than SMALL_LIMIT (7 bytes).
4389 ! Run in leaf mode, using the %o regs as our input regs.
4390 !
4391 subcc %o2, SMALL_LIMIT, %o3
4392 bgu,a,pt %ncc, .dco_ns
4393 or %o0, %o1, %o3
4394 !
4395 ! What was previously ".small_copyout"
4396 ! Do full differenced copy.
4397 !
4398 .dcobcp:
4399 sub %g0, %o2, %o3 ! negate count
4400 add %o0, %o2, %o0 ! make %o0 point at the end
4401 add %o1, %o2, %o1 ! make %o1 point at the end
4402 ba,pt %ncc, .dcocl
4403 ldub [%o0 + %o3], %o4 ! load first byte
4404 !
4405 ! %o0 and %o2 point at the end and remain pointing at the end
4406 ! of their buffers. We pull things out by adding %o3 (which is
4407 ! the negation of the length) to the buffer end which gives us
4408 ! the curent location in the buffers. By incrementing %o3 we walk
4409 ! through both buffers without having to bump each buffer's
4410 ! pointer. A very fast 4 instruction loop.
4411 !
4412 .align 16
4413 .dcocl:
4414 stba %o4, [%o1 + %o3]ASI_USER
4415 inccc %o3
4416 bl,a,pt %ncc, .dcocl
4417 ldub [%o0 + %o3], %o4
4418 !
4419 ! We're done. Go home.
4420 !
4421 membar #Sync
4422 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
4423 retl
4424 clr %o0
4425 !
4426 ! Try aligned copies from here.
4427 !
4428 .dco_ns:
4429 ! %o0 = kernel addr (to be copied from)
4430 ! %o1 = user addr (to be copied to)
4431 ! %o2 = length
4432 ! %o3 = %o1 | %o2 (used for alignment checking)
4433 ! %o4 is alternate lo_fault
4434 ! %o5 is original lo_fault
4435 !
4436 ! See if we're single byte aligned. If we are, check the
4437 ! limit for single byte copies. If we're smaller or equal,
4438 ! bounce to the byte for byte copy loop. Otherwise do it in
4439 ! HW (if enabled).
4440 !
4441 btst 1, %o3
4442 bz,pt %icc, .dcoh8
4443 btst 7, %o3
4444 !
4445 ! Single byte aligned. Do we do it via HW or via
4446 ! byte for byte? Do a quick no memory reference
4447 ! check to pick up small copies.
4448 !
4449 sethi %hi(hw_copy_limit_1), %o3
4450 !
4451 ! Big enough that we need to check the HW limit for
4452 ! this size copy.
4453 !
4454 ld [%o3 + %lo(hw_copy_limit_1)], %o3
4455 !
4456 ! Is HW copy on? If not, do everything byte for byte.
4457 !
4458 tst %o3
4459 bz,pn %icc, .dcobcp
4460 subcc %o3, %o2, %o3
4461 !
4462 ! If we're less than or equal to the single byte copy limit,
4463 ! bop to the copy loop.
4464 !
4465 bge,pt %ncc, .dcobcp
4466 nop
4467 !
4468 ! We're big enough and copy is on. Do it with HW.
4469 !
4470 ba,pt %ncc, .big_copyout
4471 nop
4472 .dcoh8:
4473 !
4474 ! 8 byte aligned?
4475 !
4476 bnz,a %ncc, .dcoh4
4477 btst 3, %o3
4478 !
4479 ! See if we're in the "small range".
4480 ! If so, go off and do the copy.
4481 ! If not, load the hard limit. %o3 is
4482 ! available for reuse.
4483 !
4484 sethi %hi(hw_copy_limit_8), %o3
4485 ld [%o3 + %lo(hw_copy_limit_8)], %o3
4486 !
4487 ! If it's zero, there's no HW bcopy.
4488 ! Bop off to the aligned copy.
4489 !
4490 tst %o3
4491 bz,pn %icc, .dcos8
4492 subcc %o3, %o2, %o3
4493 !
4494 ! We're negative if our size is larger than hw_copy_limit_8.
4495 !
4496 bge,pt %ncc, .dcos8
4497 nop
4498 !
4499 ! HW assist is on and we're large enough. Do it.
4500 !
4501 ba,pt %ncc, .big_copyout
4502 nop
4503 .dcos8:
4504 !
4505 ! Housekeeping for copy loops. Uses same idea as in the byte for
4506 ! byte copy loop above.
4507 !
4508 add %o0, %o2, %o0
4509 add %o1, %o2, %o1
4510 sub %g0, %o2, %o3
4511 ba,pt %ncc, .dodebc
4512 srl %o2, 3, %o2 ! Number of 8 byte chunks to copy
4513 !
4514 ! 4 byte aligned?
4515 !
4516 .dcoh4:
4517 bnz,pn %ncc, .dcoh2
4518 !
4519 ! See if we're in the "small range".
4520 ! If so, go off an do the copy.
4521 ! If not, load the hard limit. %o3 is
4522 ! available for reuse.
4523 !
4524 sethi %hi(hw_copy_limit_4), %o3
4525 ld [%o3 + %lo(hw_copy_limit_4)], %o3
4526 !
4527 ! If it's zero, there's no HW bcopy.
4528 ! Bop off to the aligned copy.
4529 !
4530 tst %o3
4531 bz,pn %icc, .dcos4
4532 subcc %o3, %o2, %o3
4533 !
4534 ! We're negative if our size is larger than hw_copy_limit_4.
4535 !
4536 bge,pt %ncc, .dcos4
4537 nop
4538 !
4539 ! HW assist is on and we're large enough. Do it.
4540 !
4541 ba,pt %ncc, .big_copyout
4542 nop
4543 .dcos4:
4544 add %o0, %o2, %o0
4545 add %o1, %o2, %o1
4546 sub %g0, %o2, %o3
4547 ba,pt %ncc, .dodfbc
4548 srl %o2, 2, %o2 ! Number of 4 byte chunks to copy
4549 !
4550 ! We must be 2 byte aligned. Off we go.
4551 ! The check for small copies was done in the
4552 ! delay at .dcoh4
4553 !
4554 .dcoh2:
4555 ble %ncc, .dcos2
4556 sethi %hi(hw_copy_limit_2), %o3
4557 ld [%o3 + %lo(hw_copy_limit_2)], %o3
4558 tst %o3
4559 bz,pn %icc, .dcos2
4560 subcc %o3, %o2, %o3
4561 bge,pt %ncc, .dcos2
4562 nop
4563 !
4564 ! HW is on and we're big enough. Do it.
4565 !
4566 ba,pt %ncc, .big_copyout
4567 nop
4568 .dcos2:
4569 add %o0, %o2, %o0
4570 add %o1, %o2, %o1
4571 sub %g0, %o2, %o3
4572 ba,pt %ncc, .dodtbc
4573 srl %o2, 1, %o2 ! Number of 2 byte chunks to copy
4574 .small_copyout:
4575 !
4576 ! Why are we doing this AGAIN? There are certain conditions in
4577 ! big_copyout that will cause us to forego the HW assisted copies
4578 ! and bounce back to a non-HW assisted copy. This dispatches those
4579 ! copies. Note that we branch around this in the main line code.
4580 !
4581 ! We make no check for limits or HW enablement here. We've
4582 ! already been told that we're a poster child so just go off
4583 ! and do it.
4584 !
4585 or %o0, %o1, %o3
4586 btst 1, %o3
4587 bnz %icc, .dcobcp ! Most likely
4588 btst 7, %o3
4589 bz %icc, .dcos8
4590 btst 3, %o3
4591 bz %icc, .dcos4
4592 nop
4593 ba,pt %ncc, .dcos2
4594 nop
4595 .align 32
4596 .dodebc:
4597 ldx [%o0 + %o3], %o4
4598 deccc %o2
4599 stxa %o4, [%o1 + %o3]ASI_USER
4600 bg,pt %ncc, .dodebc
4601 addcc %o3, 8, %o3
4602 !
4603 ! End of copy loop. Check to see if we're done. Most
4604 ! eight byte aligned copies end here.
4605 !
4606 bz,pt %ncc, .dcofh
4607 nop
4608 !
4609 ! Something is left - do it byte for byte.
4610 !
4611 ba,pt %ncc, .dcocl
4612 ldub [%o0 + %o3], %o4 ! load next byte
4613 !
4614 ! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
4615 !
4616 .align 32
4617 .dodfbc:
4618 lduw [%o0 + %o3], %o4
4619 deccc %o2
4620 sta %o4, [%o1 + %o3]ASI_USER
4621 bg,pt %ncc, .dodfbc
4622 addcc %o3, 4, %o3
4623 !
4624 ! End of copy loop. Check to see if we're done. Most
4625 ! four byte aligned copies end here.
4626 !
4627 bz,pt %ncc, .dcofh
4628 nop
4629 !
4630 ! Something is left. Do it byte for byte.
4631 !
4632 ba,pt %ncc, .dcocl
4633 ldub [%o0 + %o3], %o4 ! load next byte
4634 !
4635 ! two byte aligned copy loop. %o2 is the number of 2 byte chunks to
4636 ! copy.
4637 !
4638 .align 32
4639 .dodtbc:
4640 lduh [%o0 + %o3], %o4
4641 deccc %o2
4642 stha %o4, [%o1 + %o3]ASI_USER
4643 bg,pt %ncc, .dodtbc
4644 addcc %o3, 2, %o3
4645 !
4646 ! End of copy loop. Anything left?
4647 !
4648 bz,pt %ncc, .dcofh
4649 nop
4650 !
4651 ! Deal with the last byte
4652 !
4653 ldub [%o0 + %o3], %o4
4654 stba %o4, [%o1 + %o3]ASI_USER
4655 .dcofh:
4656 membar #Sync
4657 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
4658 retl
4659 clr %o0
4660
4661 .big_copyout:
4662 ! We're going to go off and do a block copy.
4663 ! Switch fault handlers and grab a window. We
4664 ! don't do a membar #Sync since we've done only
4665 ! kernel data to this point.
4666 stn %o4, [THREAD_REG + T_LOFAULT]
4667
4668 ! Copy out that reach here are larger than 256 bytes. The
4669 ! hw_copy_limit_1 is set to 256. Never set this limit less
4670 ! 128 bytes.
4671 save %sp, -SA(MINFRAME), %sp
4672 .do_block_copyout:
4673
4674 ! Swap src/dst since the code below is memcpy code
4675 ! and memcpy/bcopy have different calling sequences
4676 mov %i1, %i5
4677 mov %i0, %i1
4678 mov %i5, %i0
4679
4680 ! Block (64 bytes) align the destination.
4681 andcc %i0, 0x3f, %i3 ! is dst block aligned
4682 bz %ncc, copyout_blalign ! dst already block aligned
4683 sub %i3, 0x40, %i3
4684 neg %i3 ! bytes till dst 64 bytes aligned
4685 sub %i2, %i3, %i2 ! update i2 with new count
4686
4687 ! Based on source and destination alignment do
4688 ! either 8 bytes, 4 bytes, 2 bytes or byte copy.
4689
4690 ! Is dst & src 8B aligned
4691 or %i0, %i1, %o2
4692 andcc %o2, 0x7, %g0
4693 bz %ncc, .co_alewdcp
4694 nop
4695
4696 ! Is dst & src 4B aligned
4697 andcc %o2, 0x3, %g0
4698 bz %ncc, .co_alwdcp
4699 nop
4700
4701 ! Is dst & src 2B aligned
4702 andcc %o2, 0x1, %g0
4703 bz %ncc, .co_alhlfwdcp
4704 nop
4705
4706 ! 1B aligned
4707 1: ldub [%i1], %o2
4708 stba %o2, [%i0]ASI_USER
4709 inc %i1
4710 deccc %i3
4711 bgu,pt %ncc, 1b
4712 inc %i0
4713
4714 ba copyout_blalign
4715 nop
4716
4717 ! dst & src 4B aligned
4718 .co_alwdcp:
4719 ld [%i1], %o2
4720 sta %o2, [%i0]ASI_USER
4721 add %i1, 0x4, %i1
4722 subcc %i3, 0x4, %i3
4723 bgu,pt %ncc, .co_alwdcp
4724 add %i0, 0x4, %i0
4725
4726 ba copyout_blalign
4727 nop
4728
4729 ! dst & src 2B aligned
4730 .co_alhlfwdcp:
4731 lduh [%i1], %o2
4732 stuha %o2, [%i0]ASI_USER
4733 add %i1, 0x2, %i1
4734 subcc %i3, 0x2, %i3
4735 bgu,pt %ncc, .co_alhlfwdcp
4736 add %i0, 0x2, %i0
4737
4738 ba copyout_blalign
4739 nop
4740
4741 ! dst & src 8B aligned
4742 .co_alewdcp:
4743 ldx [%i1], %o2
4744 stxa %o2, [%i0]ASI_USER
4745 add %i1, 0x8, %i1
4746 subcc %i3, 0x8, %i3
4747 bgu,pt %ncc, .co_alewdcp
4748 add %i0, 0x8, %i0
4749
4750 ! Now Destination is block (64 bytes) aligned
4751 copyout_blalign:
4752 andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size
4753 sub %i2, %i3, %i2 ! Residue bytes in %i2
4754
4755 mov ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
4756
4757 andcc %i1, 0xf, %o2 ! is src quadword aligned
4758 bz,pn %xcc, .co_blkcpy ! src offset in %o2 (last 4-bits)
4759 nop
4760 cmp %o2, 0x8
4761 bg .co_upper_double
4762 nop
4763 bl .co_lower_double
4764 nop
4765
4766 ! Falls through when source offset is equal to 8 i.e.
4767 ! source is double word aligned.
4768 ! In this case no shift/merge of data is required
4769
4770 sub %i1, %o2, %i1 ! align the src at 16 bytes.
4771 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
4772 prefetch [%l0+0x0], #one_read
4773 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4774 .co_loop0:
4775 add %i1, 0x10, %i1
4776 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4777 prefetch [%l0+0x40], #one_read
4778
4779 stxa %l3, [%i0+0x0]%asi
4780 stxa %l4, [%i0+0x8]%asi
4781
4782 add %i1, 0x10, %i1
4783 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4784
4785 stxa %l5, [%i0+0x10]%asi
4786 stxa %l2, [%i0+0x18]%asi
4787
4788 add %i1, 0x10, %i1
4789 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4790
4791 stxa %l3, [%i0+0x20]%asi
4792 stxa %l4, [%i0+0x28]%asi
4793
4794 add %i1, 0x10, %i1
4795 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4796
4797 stxa %l5, [%i0+0x30]%asi
4798 stxa %l2, [%i0+0x38]%asi
4799
4800 add %l0, 0x40, %l0
4801 subcc %i3, 0x40, %i3
4802 bgu,pt %xcc, .co_loop0
4803 add %i0, 0x40, %i0
4804 ba .co_blkdone
4805 add %i1, %o2, %i1 ! increment the source by src offset
4806 ! the src offset was stored in %o2
4807
4808 .co_lower_double:
4809
4810 sub %i1, %o2, %i1 ! align the src at 16 bytes.
4811 sll %o2, 3, %o0 ! %o0 left shift
4812 mov 0x40, %o1
4813 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
4814 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
4815 prefetch [%l0+0x0], #one_read
4816 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 ! partial data in %l2 and %l3 has
4817 ! complete data
4818 .co_loop1:
4819 add %i1, 0x10, %i1
4820 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 ! %l4 has partial data
4821 ! for this read.
4822 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4
4823 ! into %l2 and %l3
4824 prefetch [%l0+0x40], #one_read
4825
4826 stxa %l2, [%i0+0x0]%asi
4827 stxa %l3, [%i0+0x8]%asi
4828
4829 add %i1, 0x10, %i1
4830 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4831 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and
4832 ! %l4 from previous read
4833 ! into %l4 and %l5
4834 stxa %l4, [%i0+0x10]%asi
4835 stxa %l5, [%i0+0x18]%asi
4836
4837 ! Repeat the same for next 32 bytes.
4838
4839 add %i1, 0x10, %i1
4840 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4841 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
4842
4843 stxa %l2, [%i0+0x20]%asi
4844 stxa %l3, [%i0+0x28]%asi
4845
4846 add %i1, 0x10, %i1
4847 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4848 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
4849
4850 stxa %l4, [%i0+0x30]%asi
4851 stxa %l5, [%i0+0x38]%asi
4852
4853 add %l0, 0x40, %l0
4854 subcc %i3, 0x40, %i3
4855 bgu,pt %xcc, .co_loop1
4856 add %i0, 0x40, %i0
4857 ba .co_blkdone
4858 add %i1, %o2, %i1 ! increment the source by src offset
4859 ! the src offset was stored in %o2
4860
4861 .co_upper_double:
4862
4863 sub %i1, %o2, %i1 ! align the src at 16 bytes.
4864 sub %o2, 0x8, %o0
4865 sll %o0, 3, %o0 ! %o0 left shift
4866 mov 0x40, %o1
4867 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
4868 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
4869 prefetch [%l0+0x0], #one_read
4870 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 ! partial data in %l3
4871 ! for this read and
4872 ! no data in %l2
4873 .co_loop2:
4874 add %i1, 0x10, %i1
4875 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 ! %l4 has complete data
4876 ! and %l5 has partial
4877 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5
4878 ! into %l3 and %l4
4879 prefetch [%l0+0x40], #one_read
4880
4881 stxa %l3, [%i0+0x0]%asi
4882 stxa %l4, [%i0+0x8]%asi
4883
4884 add %i1, 0x10, %i1
4885 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4886 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with
4887 ! %l5 from previous read
4888 ! into %l5 and %l2
4889
4890 stxa %l5, [%i0+0x10]%asi
4891 stxa %l2, [%i0+0x18]%asi
4892
4893 ! Repeat the same for next 32 bytes.
4894
4895 add %i1, 0x10, %i1
4896 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4897 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
4898
4899 stxa %l3, [%i0+0x20]%asi
4900 stxa %l4, [%i0+0x28]%asi
4901
4902 add %i1, 0x10, %i1
4903 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4904 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
4905
4906 stxa %l5, [%i0+0x30]%asi
4907 stxa %l2, [%i0+0x38]%asi
4908
4909 add %l0, 0x40, %l0
4910 subcc %i3, 0x40, %i3
4911 bgu,pt %xcc, .co_loop2
4912 add %i0, 0x40, %i0
4913 ba .co_blkdone
4914 add %i1, %o2, %i1 ! increment the source by src offset
4915 ! the src offset was stored in %o2
4916
4917
4918 ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
4919 .co_blkcpy:
4920
4921 andn %i1, 0x3f, %o0 ! %o0 has block aligned source
4922 prefetch [%o0+0x0], #one_read
4923 1:
4924 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l0
4925 add %i1, 0x10, %i1
4926 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4927 add %i1, 0x10, %i1
4928
4929 prefetch [%o0+0x40], #one_read
4930
4931 stxa %l0, [%i0+0x0]%asi
4932
4933 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4934 add %i1, 0x10, %i1
4935 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l6
4936 add %i1, 0x10, %i1
4937
4938 stxa %l1, [%i0+0x8]%asi
4939 stxa %l2, [%i0+0x10]%asi
4940 stxa %l3, [%i0+0x18]%asi
4941 stxa %l4, [%i0+0x20]%asi
4942 stxa %l5, [%i0+0x28]%asi
4943 stxa %l6, [%i0+0x30]%asi
4944 stxa %l7, [%i0+0x38]%asi
4945
4946 add %o0, 0x40, %o0
4947 subcc %i3, 0x40, %i3
4948 bgu,pt %xcc, 1b
4949 add %i0, 0x40, %i0
4950
4951 .co_blkdone:
4952 membar #Sync
4953
4954 brz,pt %i2, .copyout_exit
4955 nop
4956
4957 ! Handle trailing bytes
4958 cmp %i2, 0x8
4959 blu,pt %ncc, .co_residue
4960 nop
4961
4962 ! Can we do some 8B ops
4963 or %i1, %i0, %o2
4964 andcc %o2, 0x7, %g0
4965 bnz %ncc, .co_last4
4966 nop
4967
4968 ! Do 8byte ops as long as possible
4969 .co_last8:
4970 ldx [%i1], %o2
4971 stxa %o2, [%i0]ASI_USER
4972 add %i1, 0x8, %i1
4973 sub %i2, 0x8, %i2
4974 cmp %i2, 0x8
4975 bgu,pt %ncc, .co_last8
4976 add %i0, 0x8, %i0
4977
4978 brz,pt %i2, .copyout_exit
4979 nop
4980
4981 ba .co_residue
4982 nop
4983
4984 .co_last4:
4985 ! Can we do 4B ops
4986 andcc %o2, 0x3, %g0
4987 bnz %ncc, .co_last2
4988 nop
4989 1:
4990 ld [%i1], %o2
4991 sta %o2, [%i0]ASI_USER
4992 add %i1, 0x4, %i1
4993 sub %i2, 0x4, %i2
4994 cmp %i2, 0x4
4995 bgu,pt %ncc, 1b
4996 add %i0, 0x4, %i0
4997
4998 brz,pt %i2, .copyout_exit
4999 nop
5000
5001 ba .co_residue
5002 nop
5003
5004 .co_last2:
5005 ! Can we do 2B ops
5006 andcc %o2, 0x1, %g0
5007 bnz %ncc, .co_residue
5008 nop
5009
5010 1:
5011 lduh [%i1], %o2
5012 stuha %o2, [%i0]ASI_USER
5013 add %i1, 0x2, %i1
5014 sub %i2, 0x2, %i2
5015 cmp %i2, 0x2
5016 bgu,pt %ncc, 1b
5017 add %i0, 0x2, %i0
5018
5019 brz,pt %i2, .copyout_exit
5020 nop
5021
5022 ! Copy the residue as byte copy
5023 .co_residue:
5024 ldub [%i1], %i4
5025 stba %i4, [%i0]ASI_USER
5026 inc %i1
5027 deccc %i2
5028 bgu,pt %xcc, .co_residue
5029 inc %i0
5030
5031 .copyout_exit:
5032 membar #Sync
5033 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
5034 ret
5035 restore %g0, 0, %o0
5036
5037 .copyout_err:
5038 ldn [THREAD_REG + T_COPYOPS], %o4
5039 brz %o4, 2f
5040 nop
5041 ldn [%o4 + CP_COPYOUT], %g2
5042 jmp %g2
5043 nop
5044 2:
5045 retl
5046 mov -1, %o0
5047 #endif /* NIAGARA_IMPL */
5048 SET_SIZE(copyout)
5049
5050
5051 ENTRY(xcopyout)
5052 sethi %hi(.xcopyout_err), REAL_LOFAULT
5053 b .do_copyout
5054 or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
5055 .xcopyout_err:
5056 ldn [THREAD_REG + T_COPYOPS], %o4
5057 brz %o4, 2f
5058 nop
5059 ldn [%o4 + CP_XCOPYOUT], %g2
5060 jmp %g2
5061 nop
5062 2:
5063 retl
5064 mov %g1, %o0
5065 SET_SIZE(xcopyout)
5066
5067 ENTRY(xcopyout_little)
5068 sethi %hi(.little_err), %o4
5069 ldn [THREAD_REG + T_LOFAULT], %o5
5070 or %o4, %lo(.little_err), %o4
5071 membar #Sync ! sync error barrier
5072 stn %o4, [THREAD_REG + T_LOFAULT]
5073
5074 subcc %g0, %o2, %o3
5075 add %o0, %o2, %o0
5076 bz,pn %ncc, 2f ! check for zero bytes
5077 sub %o2, 1, %o4
5078 add %o0, %o4, %o0 ! start w/last byte
5079 add %o1, %o2, %o1
5080 ldub [%o0+%o3], %o4
5081
5082 1: stba %o4, [%o1+%o3]ASI_AIUSL
5083 inccc %o3
5084 sub %o0, 2, %o0 ! get next byte
5085 bcc,a,pt %ncc, 1b
5086 ldub [%o0+%o3], %o4
5087
5088 2: membar #Sync ! sync error barrier
5089 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
5090 retl
5091 mov %g0, %o0 ! return (0)
5092 SET_SIZE(xcopyout_little)
5093
5094 /*
5095 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
5096 */
5097
5098 ENTRY(copyin)
5099 sethi %hi(.copyin_err), REAL_LOFAULT
5100 or REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT
5101
5102 #if !defined(NIAGARA_IMPL)
5103 .do_copyin:
5104 tst %o2 ! check for zero count; quick exit
5105 bz,pt %ncc, .ci_smallqx
5106 mov %o0, SAVE_SRC
5107 mov %o1, SAVE_DST
5108 mov %o2, SAVE_COUNT
5109 cmp %o2, FP_COPY ! check for small copy/leaf case
5110 bgt,pt %ncc, .ci_copy_more
5111 ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
5112 /*
5113 * Small copy in code
5114 *
5115 */
5116 sethi %hi(copyio_fault_nowindow), %o3
5117 or %o3, %lo(copyio_fault_nowindow), %o3
5118 membar #Sync
5119 stn %o3, [THREAD_REG + T_LOFAULT]
5120
5121 mov ASI_USER, %asi
5122 cmp %o2, SHORTCOPY ! make sure there is enough to align
5123 ble,pt %ncc, .ci_smallest
5124 andcc %o1, 0x7, %o3 ! is dest long word aligned
5125 bnz,pn %ncc, .ci_align
5126 andcc %o1, 1, %o3 ! is dest byte aligned
5127
5128 ! Destination is long word aligned
5129 .ci_al_src:
5130 andcc %o0, 7, %o3
5131 brnz,pt %o3, .ci_src_dst_unal8
5132 nop
5133 /*
5134 * Special case for handling when src and dest are both long word aligned
5135 * and total data to move is less than FP_COPY bytes
5136 * Also handles finish up for large block moves, so may be less than 32 bytes
5137 */
5138 .ci_medlong:
5139 subcc %o2, 31, %o2 ! adjust length to allow cc test
5140 ble,pt %ncc, .ci_medl31
5141 nop
5142 .ci_medl32:
5143 ldxa [%o0]%asi, %o4 ! move 32 bytes
5144 subcc %o2, 32, %o2 ! decrement length count by 32
5145 stx %o4, [%o1]
5146 ldxa [%o0+8]%asi, %o4
5147 stx %o4, [%o1+8]
5148 ldxa [%o0+16]%asi, %o4
5149 add %o0, 32, %o0 ! increase src ptr by 32
5150 stx %o4, [%o1+16]
5151 ldxa [%o0-8]%asi, %o4
5152 add %o1, 32, %o1 ! increase dst ptr by 32
5153 bgu,pt %ncc, .ci_medl32 ! repeat if at least 32 bytes left
5154 stx %o4, [%o1-8]
5155 .ci_medl31:
5156 addcc %o2, 24, %o2 ! adjust count to be off by 7
5157 ble,pt %ncc, .ci_medl7 ! skip if 7 or fewer bytes left
5158 nop
5159 .ci_medl8:
5160 ldxa [%o0]%asi, %o4 ! move 8 bytes
5161 add %o0, 8, %o0 ! increase src ptr by 8
5162 subcc %o2, 8, %o2 ! decrease count by 8
5163 add %o1, 8, %o1 ! increase dst ptr by 8
5164 bgu,pt %ncc, .ci_medl8
5165 stx %o4, [%o1-8]
5166 .ci_medl7:
5167 addcc %o2, 7, %o2 ! finish adjustment of remaining count
5168 bnz,pt %ncc, .ci_small4 ! do final bytes if not finished
5169 nop
5170 .ci_smallx: ! finish up and exit
5171 membar #Sync
5172 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5173 .ci_smallqx:
5174 retl
5175 mov %g0, %o0
5176
5177 .ci_small4:
5178 cmp %o2, 4
5179 blt,pt %ncc, .ci_small3x ! skip if less than 4 bytes left
5180 nop !
5181 lda [%o0]%asi, %o4 ! move 4 bytes
5182 add %o0, 4, %o0 ! increase src ptr by 4
5183 add %o1, 4, %o1 ! increase dst ptr by 4
5184 subcc %o2, 4, %o2 ! decrease count by 4
5185 bz %ncc, .ci_smallx
5186 stw %o4, [%o1-4]
5187
5188 .ci_small3x: ! Exactly 1, 2, or 3 bytes remain
5189 subcc %o2, 1, %o2 ! reduce count for cc test
5190 lduba [%o0]%asi, %o4 ! load one byte
5191 bz,pt %ncc, .ci_smallx
5192 stb %o4, [%o1] ! store one byte
5193 lduba [%o0+1]%asi, %o4 ! load second byte
5194 subcc %o2, 1, %o2
5195 bz,pt %ncc, .ci_smallx
5196 stb %o4, [%o1+1] ! store second byte
5197 lduba [%o0+2]%asi, %o4 ! load third byte
5198 ba .ci_smallx
5199 stb %o4, [%o1+2] ! store third byte
5200
5201 .ci_smallest: ! 7 or fewer bytes remain
5202 cmp %o2, 4
5203 blt,pt %ncc, .ci_small3x
5204 nop
5205 lduba [%o0]%asi, %o4 ! read byte
5206 subcc %o2, 4, %o2 ! reduce count by 4
5207 stb %o4, [%o1] ! write byte
5208 lduba [%o0+1]%asi, %o4 ! repeat for total of 4 bytes
5209 add %o0, 4, %o0 ! advance src by 4
5210 stb %o4, [%o1+1]
5211 lduba [%o0-2]%asi, %o4
5212 add %o1, 4, %o1 ! advance dst by 4
5213 stb %o4, [%o1-2]
5214 lduba [%o0-1]%asi, %o4
5215 bnz,pt %ncc, .ci_small3x
5216 stb %o4, [%o1-1]
5217 membar #Sync
5218 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5219 retl
5220 mov %g0, %o0
5221
5222 .ci_align:
5223 bnz,pt %ncc, .ci_al_d1
5224 .ci_al_d1f: ! dest is now half word aligned
5225 andcc %o1, 2, %o3 ! is dest word aligned
5226 bnz,pt %ncc, .ci_al_d2
5227 .ci_al_d2f: ! dest is now word aligned
5228 andcc %o1, 4, %o3 ! is dest longword aligned?
5229 bz,pt %ncc, .ci_al_src
5230 nop
5231 .ci_al_d4: ! dest is word aligned; src is unknown
5232 lduba [%o0]%asi, %o4 ! move a word (src align unknown)
5233 lduba [%o0+1]%asi, %o3
5234 sll %o4, 24, %o4 ! position
5235 sll %o3, 16, %o3 ! position
5236 or %o4, %o3, %o3 ! merge
5237 lduba [%o0+2]%asi, %o4
5238 sll %o4, 8, %o4 ! position
5239 or %o4, %o3, %o3 ! merge
5240 lduba [%o0+3]%asi, %o4
5241 or %o4, %o3, %o4 ! merge
5242 stw %o4,[%o1] ! store four bytes
5243 add %o0, 4, %o0 ! adjust src by 4
5244 add %o1, 4, %o1 ! adjust dest by 4
5245 sub %o2, 4, %o2 ! adjust count by 4
5246 andcc %o0, 7, %o3 ! check for src long word alignment
5247 brz,pt %o3, .ci_medlong
5248 .ci_src_dst_unal8:
5249 ! dst is 8-byte aligned, src is not
5250 ! Size is less than FP_COPY
5251 ! Following code is to select for alignment
5252 andcc %o0, 0x3, %o3 ! test word alignment
5253 bz,pt %ncc, .ci_medword
5254 nop
5255 andcc %o0, 0x1, %o3 ! test halfword alignment
5256 bnz,pt %ncc, .ci_med_byte ! go to byte move if not halfword
5257 andcc %o0, 0x2, %o3 ! test which byte alignment
5258 ba .ci_medhalf
5259 nop
5260 .ci_al_d1: ! align dest to half word
5261 lduba [%o0]%asi, %o4 ! move a byte
5262 add %o0, 1, %o0
5263 stb %o4, [%o1]
5264 add %o1, 1, %o1
5265 andcc %o1, 2, %o3 ! is dest word aligned
5266 bz,pt %ncc, .ci_al_d2f
5267 sub %o2, 1, %o2
5268 .ci_al_d2: ! align dest to word
5269 lduba [%o0]%asi, %o4 ! move a half-word (src align unknown)
5270 lduba [%o0+1]%asi, %o3
5271 sll %o4, 8, %o4 ! position
5272 or %o4, %o3, %o4 ! merge
5273 sth %o4, [%o1]
5274 add %o0, 2, %o0
5275 add %o1, 2, %o1
5276 andcc %o1, 4, %o3 ! is dest longword aligned?
5277 bz,pt %ncc, .ci_al_src
5278 sub %o2, 2, %o2
5279 ba .ci_al_d4
5280 nop
5281 /*
5282 * Handle all cases where src and dest are aligned on word
5283 * boundaries. Use unrolled loops for better performance.
5284 * This option wins over standard large data move when
5285 * source and destination is in cache for medium
5286 * to short data moves.
5287 */
5288 .ci_medword:
5289 subcc %o2, 31, %o2 ! adjust length to allow cc test
5290 ble,pt %ncc, .ci_medw31
5291 nop
5292 .ci_medw32:
5293 lda [%o0]%asi, %o4 ! move a block of 32 bytes
5294 stw %o4, [%o1]
5295 lda [%o0+4]%asi, %o4
5296 stw %o4, [%o1+4]
5297 lda [%o0+8]%asi, %o4
5298 stw %o4, [%o1+8]
5299 lda [%o0+12]%asi, %o4
5300 stw %o4, [%o1+12]
5301 lda [%o0+16]%asi, %o4
5302 stw %o4, [%o1+16]
5303 lda [%o0+20]%asi, %o4
5304 subcc %o2, 32, %o2 ! decrement length count
5305 stw %o4, [%o1+20]
5306 lda [%o0+24]%asi, %o4
5307 add %o0, 32, %o0 ! increase src ptr by 32
5308 stw %o4, [%o1+24]
5309 lda [%o0-4]%asi, %o4
5310 add %o1, 32, %o1 ! increase dst ptr by 32
5311 bgu,pt %ncc, .ci_medw32 ! repeat if at least 32 bytes left
5312 stw %o4, [%o1-4]
5313 .ci_medw31:
5314 addcc %o2, 24, %o2 ! adjust count to be off by 7
5315 ble,pt %ncc, .ci_medw7 ! skip if 7 or fewer bytes left
5316 nop !
5317 .ci_medw15:
5318 lda [%o0]%asi, %o4 ! move a block of 8 bytes
5319 subcc %o2, 8, %o2 ! decrement length count
5320 stw %o4, [%o1]
5321 add %o0, 8, %o0 ! increase src ptr by 8
5322 lda [%o0-4]%asi, %o4
5323 add %o1, 8, %o1 ! increase dst ptr by 8
5324 bgu,pt %ncc, .ci_medw15
5325 stw %o4, [%o1-4]
5326 .ci_medw7:
5327 addcc %o2, 7, %o2 ! finish adjustment of remaining count
5328 bz,pt %ncc, .ci_smallx ! exit if finished
5329 cmp %o2, 4
5330 blt,pt %ncc, .ci_small3x ! skip if less than 4 bytes left
5331 nop !
5332 lda [%o0]%asi, %o4 ! move 4 bytes
5333 add %o0, 4, %o0 ! increase src ptr by 4
5334 add %o1, 4, %o1 ! increase dst ptr by 4
5335 subcc %o2, 4, %o2 ! decrease count by 4
5336 bnz .ci_small3x
5337 stw %o4, [%o1-4]
5338 membar #Sync
5339 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5340 retl
5341 mov %g0, %o0
5342
5343 .ci_medhalf:
5344 subcc %o2, 31, %o2 ! adjust length to allow cc test
5345 ble,pt %ncc, .ci_medh31
5346 nop
5347 .ci_medh32: ! load and store block of 32 bytes
5348 subcc %o2, 32, %o2 ! decrement length count
5349
5350 lduha [%o0]%asi, %o4 ! move 32 bytes
5351 lduwa [%o0+2]%asi, %o3
5352 sllx %o4, 48, %o4
5353 sllx %o3, 16, %o3
5354 or %o4, %o3, %o3
5355 lduha [%o0+6]%asi, %o4
5356 or %o4, %o3, %o4
5357 stx %o4, [%o1]
5358
5359 lduha [%o0+8]%asi, %o4
5360 lduwa [%o0+10]%asi, %o3
5361 sllx %o4, 48, %o4
5362 sllx %o3, 16, %o3
5363 or %o4, %o3, %o3
5364 lduha [%o0+14]%asi, %o4
5365 or %o4, %o3, %o4
5366 stx %o4, [%o1+8]
5367
5368 lduha [%o0+16]%asi, %o4
5369 lduwa [%o0+18]%asi, %o3
5370 sllx %o4, 48, %o4
5371 sllx %o3, 16, %o3
5372 or %o4, %o3, %o3
5373 lduha [%o0+22]%asi, %o4
5374 or %o4, %o3, %o4
5375 stx %o4, [%o1+16]
5376
5377 add %o0, 32, %o0 ! increase src ptr by 32
5378 add %o1, 32, %o1 ! increase dst ptr by 32
5379
5380 lduha [%o0-8]%asi, %o4
5381 lduwa [%o0-6]%asi, %o3
5382 sllx %o4, 48, %o4
5383 sllx %o3, 16, %o3
5384 or %o4, %o3, %o3
5385 lduha [%o0-2]%asi, %o4
5386 or %o3, %o4, %o4
5387 bgu,pt %ncc, .ci_medh32 ! repeat if at least 32 bytes left
5388 stx %o4, [%o1-8]
5389
5390 .ci_medh31:
5391 addcc %o2, 24, %o2 ! adjust count to be off by 7
5392 ble,pt %ncc, .ci_medh7 ! skip if 7 or fewer bytes left
5393 nop !
5394 .ci_medh15:
5395 lduha [%o0]%asi, %o4 ! move 16 bytes
5396 subcc %o2, 8, %o2 ! decrement length count
5397 lduwa [%o0+2]%asi, %o3
5398 sllx %o4, 48, %o4
5399 sllx %o3, 16, %o3
5400 or %o4, %o3, %o3
5401 add %o1, 8, %o1 ! increase dst ptr by 8
5402 lduha [%o0+6]%asi, %o4
5403 add %o0, 8, %o0 ! increase src ptr by 8
5404 or %o4, %o3, %o4
5405 bgu,pt %ncc, .ci_medh15
5406 stx %o4, [%o1-8]
5407 .ci_medh7:
5408 addcc %o2, 7, %o2 ! finish adjustment of remaining count
5409 bz,pt %ncc, .ci_smallx ! exit if finished
5410 cmp %o2, 4
5411 blt,pt %ncc, .ci_small3x ! skip if less than 4 bytes left
5412 nop !
5413 lduha [%o0]%asi, %o4
5414 sll %o4, 16, %o4
5415 lduha [%o0+2]%asi, %o3
5416 or %o3, %o4, %o4
5417 subcc %o2, 4, %o2
5418 add %o0, 4, %o0
5419 add %o1, 4, %o1
5420 bnz .ci_small3x
5421 stw %o4, [%o1-4]
5422 membar #Sync
5423 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5424 retl
5425 mov %g0, %o0
5426
5427 .align 16
5428 .ci_med_byte:
5429 bnz,pt %ncc, .ci_medbh32a ! go to correct byte move
5430 subcc %o2, 31, %o2 ! adjust length to allow cc test
5431 ble,pt %ncc, .ci_medb31
5432 nop
5433 .ci_medb32: ! Alignment 1 or 5
5434 subcc %o2, 32, %o2 ! decrement length count
5435
5436 lduba [%o0]%asi, %o4 ! load and store a block of 32 bytes
5437 sllx %o4, 56, %o3
5438 lduha [%o0+1]%asi, %o4
5439 sllx %o4, 40, %o4
5440 or %o4, %o3, %o3
5441 lduwa [%o0+3]%asi, %o4
5442 sllx %o4, 8, %o4
5443 or %o4, %o3, %o3
5444 lduba [%o0+7]%asi, %o4
5445 or %o4, %o3, %o4
5446 stx %o4, [%o1]
5447
5448 lduba [%o0+8]%asi, %o4
5449 sllx %o4, 56, %o3
5450 lduha [%o0+9]%asi, %o4
5451 sllx %o4, 40, %o4
5452 or %o4, %o3, %o3
5453 lduwa [%o0+11]%asi, %o4
5454 sllx %o4, 8, %o4
5455 or %o4, %o3, %o3
5456 lduba [%o0+15]%asi, %o4
5457 or %o4, %o3, %o4
5458 stx %o4, [%o1+8]
5459
5460 lduba [%o0+16]%asi, %o4
5461 sllx %o4, 56, %o3
5462 lduha [%o0+17]%asi, %o4
5463 sllx %o4, 40, %o4
5464 or %o4, %o3, %o3
5465 lduwa [%o0+19]%asi, %o4
5466 sllx %o4, 8, %o4
5467 or %o4, %o3, %o3
5468 lduba [%o0+23]%asi, %o4
5469 or %o4, %o3, %o4
5470 stx %o4, [%o1+16]
5471
5472 add %o0, 32, %o0 ! increase src ptr by 32
5473 add %o1, 32, %o1 ! increase dst ptr by 32
5474
5475 lduba [%o0-8]%asi, %o4
5476 sllx %o4, 56, %o3
5477 lduha [%o0-7]%asi, %o4
5478 sllx %o4, 40, %o4
5479 or %o4, %o3, %o3
5480 lduwa [%o0-5]%asi, %o4
5481 sllx %o4, 8, %o4
5482 or %o4, %o3, %o3
5483 lduba [%o0-1]%asi, %o4
5484 or %o4, %o3, %o4
5485 bgu,pt %ncc, .ci_medb32 ! repeat if at least 32 bytes left
5486 stx %o4, [%o1-8]
5487
5488 .ci_medb31: ! 31 or fewer bytes remaining
5489 addcc %o2, 24, %o2 ! adjust count to be off by 7
5490 ble,pt %ncc, .ci_medb7 ! skip if 7 or fewer bytes left
5491 nop !
5492 .ci_medb15:
5493
5494 lduba [%o0]%asi, %o4 ! load and store a block of 8 bytes
5495 subcc %o2, 8, %o2 ! decrement length count
5496 sllx %o4, 56, %o3
5497 lduha [%o0+1]%asi, %o4
5498 sllx %o4, 40, %o4
5499 or %o4, %o3, %o3
5500 lduwa [%o0+3]%asi, %o4
5501 add %o1, 8, %o1 ! increase dst ptr by 16
5502 sllx %o4, 8, %o4
5503 or %o4, %o3, %o3
5504 lduba [%o0+7]%asi, %o4
5505 add %o0, 8, %o0 ! increase src ptr by 16
5506 or %o4, %o3, %o4
5507 bgu,pt %ncc, .ci_medb15
5508 stx %o4, [%o1-8]
5509 .ci_medb7:
5510 addcc %o2, 7, %o2 ! finish adjustment of remaining count
5511 bz,pt %ncc, .ci_smallx ! exit if finished
5512 cmp %o2, 4
5513 blt,pt %ncc, .ci_small3x ! skip if less than 4 bytes left
5514 nop !
5515 lduba [%o0]%asi, %o4 ! move 4 bytes
5516 sll %o4, 24, %o3
5517 lduha [%o0+1]%asi, %o4
5518 sll %o4, 8, %o4
5519 or %o4, %o3, %o3
5520 lduba [%o0+3]%asi, %o4
5521 or %o4, %o3, %o4
5522 subcc %o2, 4, %o2
5523 add %o0, 4, %o0
5524 add %o1, 4, %o1
5525 bnz .ci_small3x
5526 stw %o4, [%o1-4]
5527 membar #Sync
5528 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5529 retl
5530 mov %g0, %o0
5531
5532 .align 16
5533 .ci_medbh32a: ! Alignment 3 or 7
5534 ble,pt %ncc, .ci_medbh31
5535 nop
5536 .ci_medbh32: ! Alignment 3 or 7
5537 subcc %o2, 32, %o2 ! decrement length count
5538
5539 lduba [%o0]%asi, %o4 ! load and store a block of 32 bytes
5540 sllx %o4, 56, %o3
5541 lduwa [%o0+1]%asi, %o4
5542 sllx %o4, 24, %o4
5543 or %o4, %o3, %o3
5544 lduha [%o0+5]%asi, %o4
5545 sllx %o4, 8, %o4
5546 or %o4, %o3, %o3
5547 lduba [%o0+7]%asi, %o4
5548 or %o4, %o3, %o4
5549 stx %o4, [%o1]
5550
5551 lduba [%o0+8]%asi, %o4
5552 sllx %o4, 56, %o3
5553 lduwa [%o0+9]%asi, %o4
5554 sllx %o4, 24, %o4
5555 or %o4, %o3, %o3
5556 lduha [%o0+13]%asi, %o4
5557 sllx %o4, 8, %o4
5558 or %o4, %o3, %o3
5559 lduba [%o0+15]%asi, %o4
5560 or %o4, %o3, %o4
5561 stx %o4, [%o1+8]
5562
5563 lduba [%o0+16]%asi, %o4
5564 sllx %o4, 56, %o3
5565 lduwa [%o0+17]%asi, %o4
5566 sllx %o4, 24, %o4
5567 or %o4, %o3, %o3
5568 lduha [%o0+21]%asi, %o4
5569 sllx %o4, 8, %o4
5570 or %o4, %o3, %o3
5571 lduba [%o0+23]%asi, %o4
5572 or %o4, %o3, %o4
5573 stx %o4, [%o1+16]
5574
5575 add %o0, 32, %o0 ! increase src ptr by 32
5576 add %o1, 32, %o1 ! increase dst ptr by 32
5577
5578 lduba [%o0-8]%asi, %o4
5579 sllx %o4, 56, %o3
5580 lduwa [%o0-7]%asi, %o4
5581 sllx %o4, 24, %o4
5582 or %o4, %o3, %o3
5583 lduha [%o0-3]%asi, %o4
5584 sllx %o4, 8, %o4
5585 or %o4, %o3, %o3
5586 lduba [%o0-1]%asi, %o4
5587 or %o4, %o3, %o4
5588 bgu,pt %ncc, .ci_medbh32 ! repeat if at least 32 bytes left
5589 stx %o4, [%o1-8]
5590
5591 .ci_medbh31:
5592 addcc %o2, 24, %o2 ! adjust count to be off by 7
5593 ble,pt %ncc, .ci_medb7 ! skip if 7 or fewer bytes left
5594 nop !
5595 .ci_medbh15:
5596 lduba [%o0]%asi, %o4 ! load and store a block of 8 bytes
5597 sllx %o4, 56, %o3
5598 lduwa [%o0+1]%asi, %o4
5599 sllx %o4, 24, %o4
5600 or %o4, %o3, %o3
5601 lduha [%o0+5]%asi, %o4
5602 sllx %o4, 8, %o4
5603 or %o4, %o3, %o3
5604 lduba [%o0+7]%asi, %o4
5605 or %o4, %o3, %o4
5606 stx %o4, [%o1]
5607 subcc %o2, 8, %o2 ! decrement length count
5608 add %o1, 8, %o1 ! increase dst ptr by 8
5609 add %o0, 8, %o0 ! increase src ptr by 8
5610 bgu,pt %ncc, .ci_medbh15
5611 stx %o4, [%o1-8]
5612 ba .ci_medb7
5613 nop
5614
5615 /*
5616 * End of small copy in code (no window)
5617 *
5618 */
5619
5620 /*
5621 * Long copy in code (using register window and fp regs)
5622 *
5623 */
5624
5625 .ci_copy_more:
5626 sethi %hi(copyio_fault), %o3
5627 or %o3, %lo(copyio_fault), %o3
5628 membar #Sync
5629 stn %o3, [THREAD_REG + T_LOFAULT]
5630 /*
5631 * Following code is for large copies. We know there is at
5632 * least FP_COPY bytes available. FP regs are used, so
5633 * we save registers and fp regs before starting
5634 */
5635 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
5636 or SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
5637 rd %fprs, %g1 ! check for unused fp
5638 ! if fprs.fef == 0, set it.
5639 ! Setting it when already set costs more than checking
5640 andcc %g1, FPRS_FEF, %g1 ! test FEF, fprs.du = fprs.dl = 0
5641 bz,pt %ncc, .ci_fp_unused
5642 mov ASI_USER, %asi
5643 BST_FP_TOSTACK(%o3)
5644 ba .ci_fp_ready
5645 .ci_fp_unused:
5646 prefetcha [%i0 + (1 * CACHE_LINE)]%asi, #one_read
5647 wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
5648 .ci_fp_ready:
5649 rd %gsr, %l5 ! save %gsr value
5650 andcc %i1, 1, %o3 ! is dest byte aligned
5651 bnz,pt %ncc, .ci_big_d1
5652 .ci_big_d1f: ! dest is now half word aligned
5653 andcc %i1, 2, %o3
5654 bnz,pt %ncc, .ci_big_d2
5655 .ci_big_d2f: ! dest is now word aligned
5656 andcc %i1, 4, %o3
5657 bnz,pt %ncc, .ci_big_d4
5658 .ci_big_d4f: ! dest is long word aligned
5659 andcc %i0, 7, %o3 ! is src long word aligned
5660 brnz,pt %o3, .ci_big_unal8
5661 prefetcha [%i0 + (2 * CACHE_LINE)]%asi, #one_read
5662 ! Src and dst are long word aligned
5663 ! align dst to 64 byte boundary
5664 andcc %i1, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned
5665 brz,pn %o3, .ci_al_to_64
5666 nop
5667 sub %o3, 64, %o3 ! %o3 has negative bytes to move
5668 add %i2, %o3, %i2 ! adjust remaining count
5669 andcc %o3, 8, %o4 ! odd long words to move?
5670 brz,pt %o4, .ci_al_to_16
5671 nop
5672 add %o3, 8, %o3
5673 ldxa [%i0]%asi, %o4
5674 add %i0, 8, %i0 ! increment src ptr
5675 add %i1, 8, %i1 ! increment dst ptr
5676 stx %o4, [%i1-8]
5677 ! Dest is aligned on 16 bytes, src 8 byte aligned
5678 .ci_al_to_16:
5679 andcc %o3, 0x30, %o4 ! pair of long words to move?
5680 brz,pt %o4, .ci_al_to_64
5681 nop
5682 .ci_al_mv_16:
5683 add %o3, 16, %o3
5684 ldxa [%i0]%asi, %o4
5685 stx %o4, [%i1]
5686 add %i0, 16, %i0 ! increment src ptr
5687 ldxa [%i0-8]%asi, %o4
5688 stx %o4, [%i1+8]
5689 andcc %o3, 0x30, %o4
5690 brnz,pt %o4, .ci_al_mv_16
5691 add %i1, 16, %i1 ! increment dst ptr
5692 ! Dest is aligned on 64 bytes, src 8 byte aligned
5693 .ci_al_to_64:
5694 ! Determine source alignment
5695 ! to correct 8 byte offset
5696 andcc %i0, 32, %o3
5697 brnz,pn %o3, .ci_aln_1
5698 andcc %i0, 16, %o3
5699 brnz,pn %o3, .ci_aln_01
5700 andcc %i0, 8, %o3
5701 brz,pn %o3, .ci_aln_000
5702 prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5703 ba .ci_aln_001
5704 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5705 .ci_aln_01:
5706 brnz,pn %o3, .ci_aln_011
5707 prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5708 ba .ci_aln_010
5709 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5710 .ci_aln_1:
5711 andcc %i0, 16, %o3
5712 brnz,pn %o3, .ci_aln_11
5713 andcc %i0, 8, %o3
5714 brnz,pn %o3, .ci_aln_101
5715 prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5716 ba .ci_aln_100
5717 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5718 .ci_aln_11:
5719 brz,pn %o3, .ci_aln_110
5720 prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5721
5722 .ci_aln_111:
5723 ! Alignment off by 8 bytes
5724 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5725 ldda [%i0]%asi, %d0
5726 add %i0, 8, %i0
5727 sub %i2, 8, %i2
5728 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
5729 and %i2, 0x7f, %i2 ! residue bytes in %i2
5730 sub %i1, %i0, %i1
5731 .ci_aln_111_loop:
5732 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
5733 subcc %o3, 64, %o3
5734 fmovd %d16, %d2
5735 fmovd %d18, %d4
5736 fmovd %d20, %d6
5737 fmovd %d22, %d8
5738 fmovd %d24, %d10
5739 fmovd %d26, %d12
5740 fmovd %d28, %d14
5741 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5742 stda %d0,[%i0+%i1]ASI_BLK_P
5743 add %i0, 64, %i0
5744 fmovd %d30, %d0
5745 bgt,pt %ncc, .ci_aln_111_loop
5746 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5747 add %i1, %i0, %i1
5748
5749 std %d0, [%i1]
5750 ba .ci_remain_stuff
5751 add %i1, 8, %i1
5752 ! END OF aln_111
5753
5754 .ci_aln_110:
5755 ! Alignment off by 16 bytes
5756 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5757 ldda [%i0]%asi, %d0
5758 ldda [%i0+8]%asi, %d2
5759 add %i0, 16, %i0
5760 sub %i2, 16, %i2
5761 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
5762 and %i2, 0x7f, %i2 ! residue bytes in %i2
5763 sub %i1, %i0, %i1
5764 .ci_aln_110_loop:
5765 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
5766 subcc %o3, 64, %o3
5767 fmovd %d16, %d4
5768 fmovd %d18, %d6
5769 fmovd %d20, %d8
5770 fmovd %d22, %d10
5771 fmovd %d24, %d12
5772 fmovd %d26, %d14
5773 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5774 stda %d0,[%i0+%i1]ASI_BLK_P
5775 add %i0, 64, %i0
5776 fmovd %d28, %d0
5777 fmovd %d30, %d2
5778 bgt,pt %ncc, .ci_aln_110_loop
5779 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5780 add %i1, %i0, %i1
5781
5782 std %d0, [%i1]
5783 std %d2, [%i1+8]
5784 ba .ci_remain_stuff
5785 add %i1, 16, %i1
5786 ! END OF aln_110
5787
5788 .ci_aln_101:
5789 ! Alignment off by 24 bytes
5790 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5791 ldda [%i0]%asi, %d0
5792 ldda [%i0+8]%asi, %d2
5793 ldda [%i0+16]%asi, %d4
5794 add %i0, 24, %i0
5795 sub %i2, 24, %i2
5796 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
5797 and %i2, 0x7f, %i2 ! residue bytes in %i2
5798 sub %i1, %i0, %i1
5799 .ci_aln_101_loop:
5800 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
5801 subcc %o3, 64, %o3
5802 fmovd %d16, %d6
5803 fmovd %d18, %d8
5804 fmovd %d20, %d10
5805 fmovd %d22, %d12
5806 fmovd %d24, %d14
5807 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5808 stda %d0,[%i0+%i1]ASI_BLK_P
5809 add %i0, 64, %i0
5810 fmovd %d26, %d0
5811 fmovd %d28, %d2
5812 fmovd %d30, %d4
5813 bgt,pt %ncc, .ci_aln_101_loop
5814 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5815 add %i1, %i0, %i1
5816
5817 std %d0, [%i1]
5818 std %d2, [%i1+8]
5819 std %d4, [%i1+16]
5820 ba .ci_remain_stuff
5821 add %i1, 24, %i1
5822 ! END OF aln_101
5823
5824 .ci_aln_100:
5825 ! Alignment off by 32 bytes
5826 ldda [%i0]%asi, %d0
5827 ldda [%i0+8]%asi, %d2
5828 ldda [%i0+16]%asi,%d4
5829 ldda [%i0+24]%asi,%d6
5830 add %i0, 32, %i0
5831 sub %i2, 32, %i2
5832 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
5833 and %i2, 0x7f, %i2 ! residue bytes in %i2
5834 sub %i1, %i0, %i1
5835 .ci_aln_100_loop:
5836 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
5837 subcc %o3, 64, %o3
5838 fmovd %d16, %d8
5839 fmovd %d18, %d10
5840 fmovd %d20, %d12
5841 fmovd %d22, %d14
5842 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5843 stda %d0,[%i0+%i1]ASI_BLK_P
5844 add %i0, 64, %i0
5845 fmovd %d24, %d0
5846 fmovd %d26, %d2
5847 fmovd %d28, %d4
5848 fmovd %d30, %d6
5849 bgt,pt %ncc, .ci_aln_100_loop
5850 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5851 add %i1, %i0, %i1
5852
5853 std %d0, [%i1]
5854 std %d2, [%i1+8]
5855 std %d4, [%i1+16]
5856 std %d6, [%i1+24]
5857 ba .ci_remain_stuff
5858 add %i1, 32, %i1
5859 ! END OF aln_100
5860
5861 .ci_aln_011:
5862 ! Alignment off by 40 bytes
5863 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5864 ldda [%i0]%asi, %d0
5865 ldda [%i0+8]%asi, %d2
5866 ldda [%i0+16]%asi, %d4
5867 ldda [%i0+24]%asi, %d6
5868 ldda [%i0+32]%asi, %d8
5869 add %i0, 40, %i0
5870 sub %i2, 40, %i2
5871 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
5872 and %i2, 0x7f, %i2 ! residue bytes in %i2
5873 sub %i1, %i0, %i1
5874 .ci_aln_011_loop:
5875 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
5876 subcc %o3, 64, %o3
5877 fmovd %d16, %d10
5878 fmovd %d18, %d12
5879 fmovd %d20, %d14
5880 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5881 stda %d0,[%i0+%i1]ASI_BLK_P
5882 add %i0, 64, %i0
5883 fmovd %d22, %d0
5884 fmovd %d24, %d2
5885 fmovd %d26, %d4
5886 fmovd %d28, %d6
5887 fmovd %d30, %d8
5888 bgt,pt %ncc, .ci_aln_011_loop
5889 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5890 add %i1, %i0, %i1
5891
5892 std %d0, [%i1]
5893 std %d2, [%i1+8]
5894 std %d4, [%i1+16]
5895 std %d6, [%i1+24]
5896 std %d8, [%i1+32]
5897 ba .ci_remain_stuff
5898 add %i1, 40, %i1
5899 ! END OF aln_011
5900
5901 .ci_aln_010:
5902 ! Alignment off by 48 bytes
5903 ldda [%i0]%asi, %d0
5904 ldda [%i0+8]%asi, %d2
5905 ldda [%i0+16]%asi, %d4
5906 ldda [%i0+24]%asi, %d6
5907 ldda [%i0+32]%asi, %d8
5908 ldda [%i0+40]%asi, %d10
5909 add %i0, 48, %i0
5910 sub %i2, 48, %i2
5911 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
5912 and %i2, 0x7f, %i2 ! residue bytes in %i2
5913 sub %i1, %i0, %i1
5914 .ci_aln_010_loop:
5915 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
5916 subcc %o3, 64, %o3
5917 fmovd %d16, %d12
5918 fmovd %d18, %d14
5919 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5920 stda %d0,[%i0+%i1]ASI_BLK_P
5921 add %i0, 64, %i0
5922 fmovd %d20, %d0
5923 fmovd %d22, %d2
5924 fmovd %d24, %d4
5925 fmovd %d26, %d6
5926 fmovd %d28, %d8
5927 fmovd %d30, %d10
5928 bgt,pt %ncc, .ci_aln_010_loop
5929 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5930 add %i1, %i0, %i1
5931
5932 std %d0, [%i1]
5933 std %d2, [%i1+8]
5934 std %d4, [%i1+16]
5935 std %d6, [%i1+24]
5936 std %d8, [%i1+32]
5937 std %d10, [%i1+40]
5938 ba .ci_remain_stuff
5939 add %i1, 48, %i1
5940 ! END OF aln_010
5941
5942 .ci_aln_001:
5943 ! Alignment off by 56 bytes
5944 ldda [%i0]%asi, %d0
5945 ldda [%i0+8]%asi, %d2
5946 ldda [%i0+16]%asi, %d4
5947 ldda [%i0+24]%asi, %d6
5948 ldda [%i0+32]%asi, %d8
5949 ldda [%i0+40]%asi, %d10
5950 ldda [%i0+48]%asi, %d12
5951 add %i0, 56, %i0
5952 sub %i2, 56, %i2
5953 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
5954 and %i2, 0x7f, %i2 ! residue bytes in %i2
5955 sub %i1, %i0, %i1
5956 .ci_aln_001_loop:
5957 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
5958 subcc %o3, 64, %o3
5959 fmovd %d16, %d14
5960 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5961 stda %d0,[%i0+%i1]ASI_BLK_P
5962 add %i0, 64, %i0
5963 fmovd %d18, %d0
5964 fmovd %d20, %d2
5965 fmovd %d22, %d4
5966 fmovd %d24, %d6
5967 fmovd %d26, %d8
5968 fmovd %d28, %d10
5969 fmovd %d30, %d12
5970 bgt,pt %ncc, .ci_aln_001_loop
5971 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5972 add %i1, %i0, %i1
5973
5974 std %d0, [%i1]
5975 std %d2, [%i1+8]
5976 std %d4, [%i1+16]
5977 std %d6, [%i1+24]
5978 std %d8, [%i1+32]
5979 std %d10, [%i1+40]
5980 std %d12, [%i1+48]
5981 ba .ci_remain_stuff
5982 add %i1, 56, %i1
5983 ! END OF aln_001
5984
5985 .ci_aln_000:
5986 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5987 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
5988 and %i2, 0x7f, %i2 ! residue bytes in %i2
5989 sub %i1, %i0, %i1
5990 .ci_aln_000_loop:
5991 ldda [%i0]ASI_BLK_AIUS,%d0
5992 subcc %o3, 64, %o3
5993 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5994 stda %d0,[%i0+%i1]ASI_BLK_P
5995 add %i0, 64, %i0
5996 bgt,pt %ncc, .ci_aln_000_loop
5997 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5998 add %i1, %i0, %i1
5999
6000 ! END OF aln_000
6001
6002 .ci_remain_stuff:
6003 subcc %i2, 31, %i2 ! adjust length to allow cc test
6004 ble,pt %ncc, .ci_aln_31
6005 nop
6006 .ci_aln_32:
6007 ldxa [%i0]%asi, %o4 ! move 32 bytes
6008 subcc %i2, 32, %i2 ! decrement length count by 32
6009 stx %o4, [%i1]
6010 ldxa [%i0+8]%asi, %o4
6011 stx %o4, [%i1+8]
6012 ldxa [%i0+16]%asi, %o4
6013 add %i0, 32, %i0 ! increase src ptr by 32
6014 stx %o4, [%i1+16]
6015 ldxa [%i0-8]%asi, %o4
6016 add %i1, 32, %i1 ! increase dst ptr by 32
6017 bgu,pt %ncc, .ci_aln_32 ! repeat if at least 32 bytes left
6018 stx %o4, [%i1-8]
6019 .ci_aln_31:
6020 addcc %i2, 24, %i2 ! adjust count to be off by 7
6021 ble,pt %ncc, .ci_aln_7 ! skip if 7 or fewer bytes left
6022 nop !
6023 .ci_aln_15:
6024 ldxa [%i0]%asi, %o4 ! move 8 bytes
6025 add %i0, 8, %i0 ! increase src ptr by 8
6026 subcc %i2, 8, %i2 ! decrease count by 8
6027 add %i1, 8, %i1 ! increase dst ptr by 8
6028 bgu,pt %ncc, .ci_aln_15
6029 stx %o4, [%i1-8] !
6030 .ci_aln_7:
6031 addcc %i2, 7, %i2 ! finish adjustment of remaining count
6032 bz,pt %ncc, .ci_exit ! exit if finished
6033 cmp %i2, 4
6034 blt,pt %ncc, .ci_unaln3x ! skip if less than 4 bytes left
6035 nop !
6036 lda [%i0]%asi, %o4 ! move 4 bytes
6037 add %i0, 4, %i0 ! increase src ptr by 4
6038 add %i1, 4, %i1 ! increase dst ptr by 4
6039 subcc %i2, 4, %i2 ! decrease count by 4
6040 bnz .ci_unaln3x
6041 stw %o4, [%i1-4]
6042 ba .ci_exit
6043 nop
6044
6045 ! destination alignment code
6046 .ci_big_d1:
6047 lduba [%i0]%asi, %o4 ! move a byte
6048 add %i0, 1, %i0
6049 stb %o4, [%i1]
6050 add %i1, 1, %i1
6051 andcc %i1, 2, %o3
6052 bz,pt %ncc, .ci_big_d2f
6053 sub %i2, 1, %i2
6054 .ci_big_d2: ! dest is now at least half word aligned
6055 lduba [%i0]%asi, %o4 ! move a half-word (src align unknown)
6056 lduba [%i0+1]%asi, %o3
6057 add %i0, 2, %i0
6058 sll %o4, 8, %o4 ! position
6059 or %o4, %o3, %o4 ! merge
6060 sth %o4, [%i1]
6061 add %i1, 2, %i1
6062 andcc %i1, 4, %o3
6063 bz,pt %ncc, .ci_big_d4f
6064 sub %i2, 2, %i2
6065 .ci_big_d4: ! dest is at least word aligned
6066 nop
6067 lduba [%i0]%asi, %o4 ! move a word (src align unknown)
6068 lduba [%i0+1]%asi, %o3
6069 sll %o4, 24, %o4 ! position
6070 sll %o3, 16, %o3 ! position
6071 or %o4, %o3, %o3 ! merge
6072 lduba [%i0+2]%asi, %o4
6073 sll %o4, 8, %o4 ! position
6074 or %o4, %o3, %o3 ! merge
6075 lduba [%i0+3]%asi, %o4
6076 or %o4, %o3, %o4 ! merge
6077 stw %o4,[%i1] ! store four bytes
6078 add %i0, 4, %i0 ! adjust src by 4
6079 add %i1, 4, %i1 ! adjust dest by 4
6080 ba .ci_big_d4f
6081 sub %i2, 4, %i2 ! adjust count by 4
6082
6083
6084 ! Dst is on 8 byte boundary; src is not;
6085 .ci_big_unal8:
6086 andcc %i1, 0x3f, %o3 ! is dst 64-byte block aligned?
6087 bz %ncc, .ci_unalnsrc
6088 sub %o3, 64, %o3 ! %o3 will be multiple of 8
6089 neg %o3 ! bytes until dest is 64 byte aligned
6090 sub %i2, %o3, %i2 ! update cnt with bytes to be moved
6091 ! Move bytes according to source alignment
6092 andcc %i0, 0x1, %o4
6093 bnz %ncc, .ci_unalnbyte ! check for byte alignment
6094 nop
6095 andcc %i0, 2, %o4 ! check for half word alignment
6096 bnz %ncc, .ci_unalnhalf
6097 nop
6098 ! Src is word aligned, move bytes until dest 64 byte aligned
6099 .ci_unalnword:
6100 lda [%i0]%asi, %o4 ! load 4 bytes
6101 stw %o4, [%i1] ! and store 4 bytes
6102 lda [%i0+4]%asi, %o4 ! load 4 bytes
6103 add %i0, 8, %i0 ! increase src ptr by 8
6104 stw %o4, [%i1+4] ! and store 4 bytes
6105 subcc %o3, 8, %o3 ! decrease count by 8
6106 bnz %ncc, .ci_unalnword
6107 add %i1, 8, %i1 ! increase dst ptr by 8
6108 ba .ci_unalnsrc
6109 nop
6110
6111 ! Src is half-word aligned, move bytes until dest 64 byte aligned
6112 .ci_unalnhalf:
6113 lduha [%i0]%asi, %o4 ! load 2 bytes
6114 sllx %o4, 32, %i3 ! shift left
6115 lduwa [%i0+2]%asi, %o4
6116 or %o4, %i3, %i3
6117 sllx %i3, 16, %i3
6118 lduha [%i0+6]%asi, %o4
6119 or %o4, %i3, %i3
6120 stx %i3, [%i1]
6121 add %i0, 8, %i0
6122 subcc %o3, 8, %o3
6123 bnz %ncc, .ci_unalnhalf
6124 add %i1, 8, %i1
6125 ba .ci_unalnsrc
6126 nop
6127
6128 ! Src is Byte aligned, move bytes until dest 64 byte aligned
6129 .ci_unalnbyte:
6130 sub %i1, %i0, %i1 ! share pointer advance
6131 .ci_unalnbyte_loop:
6132 lduba [%i0]%asi, %o4
6133 sllx %o4, 56, %i3
6134 lduha [%i0+1]%asi, %o4
6135 sllx %o4, 40, %o4
6136 or %o4, %i3, %i3
6137 lduha [%i0+3]%asi, %o4
6138 sllx %o4, 24, %o4
6139 or %o4, %i3, %i3
6140 lduha [%i0+5]%asi, %o4
6141 sllx %o4, 8, %o4
6142 or %o4, %i3, %i3
6143 lduba [%i0+7]%asi, %o4
6144 or %o4, %i3, %i3
6145 stx %i3, [%i1+%i0]
6146 subcc %o3, 8, %o3
6147 bnz %ncc, .ci_unalnbyte_loop
6148 add %i0, 8, %i0
6149 add %i1,%i0, %i1 ! restore pointer
6150
6151 ! Destination is now block (64 byte aligned), src is not 8 byte aligned
6152 .ci_unalnsrc:
6153 andn %i2, 0x3f, %i3 ! %i3 is multiple of block size
6154 and %i2, 0x3f, %i2 ! residue bytes in %i2
6155 add %i2, 64, %i2 ! Insure we don't load beyond
6156 sub %i3, 64, %i3 ! end of source buffer
6157
6158 andn %i0, 0x3f, %o4 ! %o4 has block aligned src address
6159 prefetcha [%o4 + (3 * CACHE_LINE)]%asi, #one_read
6160 alignaddr %i0, %g0, %g0 ! generate %gsr
6161 add %i0, %i3, %i0 ! advance %i0 to after blocks
6162 !
6163 ! Determine source alignment to correct 8 byte offset
6164 andcc %i0, 0x20, %o3
6165 brnz,pn %o3, .ci_unaln_1
6166 andcc %i0, 0x10, %o3
6167 brnz,pn %o3, .ci_unaln_01
6168 andcc %i0, 0x08, %o3
6169 brz,a %o3, .ci_unaln_000
6170 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6171 ba .ci_unaln_001
6172 nop
6173 .ci_unaln_01:
6174 brnz,a %o3, .ci_unaln_011
6175 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6176 ba .ci_unaln_010
6177 nop
6178 .ci_unaln_1:
6179 brnz,pn %o3, .ci_unaln_11
6180 andcc %i0, 0x08, %o3
6181 brnz,a %o3, .ci_unaln_101
6182 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6183 ba .ci_unaln_100
6184 nop
6185 .ci_unaln_11:
6186 brz,pn %o3, .ci_unaln_110
6187 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
6188
6189 .ci_unaln_111:
6190 ldda [%o4+56]%asi, %d14
6191 .ci_unaln_111_loop:
6192 add %o4, 64, %o4
6193 ldda [%o4]ASI_BLK_AIUS, %d16
6194 faligndata %d14, %d16, %d48
6195 faligndata %d16, %d18, %d50
6196 faligndata %d18, %d20, %d52
6197 faligndata %d20, %d22, %d54
6198 faligndata %d22, %d24, %d56
6199 faligndata %d24, %d26, %d58
6200 faligndata %d26, %d28, %d60
6201 faligndata %d28, %d30, %d62
6202 fmovd %d30, %d14
6203 stda %d48, [%i1]ASI_BLK_P
6204 subcc %i3, 64, %i3
6205 add %i1, 64, %i1
6206 bgu,pt %ncc, .ci_unaln_111_loop
6207 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6208 ba .ci_unaln_done
6209 nop
6210
6211 .ci_unaln_110:
6212 ldda [%o4+48]%asi, %d12
6213 ldda [%o4+56]%asi, %d14
6214 .ci_unaln_110_loop:
6215 add %o4, 64, %o4
6216 ldda [%o4]ASI_BLK_AIUS, %d16
6217 faligndata %d12, %d14, %d48
6218 faligndata %d14, %d16, %d50
6219 faligndata %d16, %d18, %d52
6220 faligndata %d18, %d20, %d54
6221 faligndata %d20, %d22, %d56
6222 faligndata %d22, %d24, %d58
6223 faligndata %d24, %d26, %d60
6224 faligndata %d26, %d28, %d62
6225 fmovd %d28, %d12
6226 fmovd %d30, %d14
6227 stda %d48, [%i1]ASI_BLK_P
6228 subcc %i3, 64, %i3
6229 add %i1, 64, %i1
6230 bgu,pt %ncc, .ci_unaln_110_loop
6231 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6232 ba .ci_unaln_done
6233 nop
6234
6235 .ci_unaln_101:
6236 ldda [%o4+40]%asi, %d10
6237 ldda [%o4+48]%asi, %d12
6238 ldda [%o4+56]%asi, %d14
6239 .ci_unaln_101_loop:
6240 add %o4, 64, %o4
6241 ldda [%o4]ASI_BLK_AIUS, %d16
6242 faligndata %d10, %d12, %d48
6243 faligndata %d12, %d14, %d50
6244 faligndata %d14, %d16, %d52
6245 faligndata %d16, %d18, %d54
6246 faligndata %d18, %d20, %d56
6247 faligndata %d20, %d22, %d58
6248 faligndata %d22, %d24, %d60
6249 faligndata %d24, %d26, %d62
6250 fmovd %d26, %d10
6251 fmovd %d28, %d12
6252 fmovd %d30, %d14
6253 stda %d48, [%i1]ASI_BLK_P
6254 subcc %i3, 64, %i3
6255 add %i1, 64, %i1
6256 bgu,pt %ncc, .ci_unaln_101_loop
6257 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6258 ba .ci_unaln_done
6259 nop
6260
6261 .ci_unaln_100:
6262 ldda [%o4+32]%asi, %d8
6263 ldda [%o4+40]%asi, %d10
6264 ldda [%o4+48]%asi, %d12
6265 ldda [%o4+56]%asi, %d14
6266 .ci_unaln_100_loop:
6267 add %o4, 64, %o4
6268 ldda [%o4]ASI_BLK_AIUS, %d16
6269 faligndata %d8, %d10, %d48
6270 faligndata %d10, %d12, %d50
6271 faligndata %d12, %d14, %d52
6272 faligndata %d14, %d16, %d54
6273 faligndata %d16, %d18, %d56
6274 faligndata %d18, %d20, %d58
6275 faligndata %d20, %d22, %d60
6276 faligndata %d22, %d24, %d62
6277 fmovd %d24, %d8
6278 fmovd %d26, %d10
6279 fmovd %d28, %d12
6280 fmovd %d30, %d14
6281 stda %d48, [%i1]ASI_BLK_P
6282 subcc %i3, 64, %i3
6283 add %i1, 64, %i1
6284 bgu,pt %ncc, .ci_unaln_100_loop
6285 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6286 ba .ci_unaln_done
6287 nop
6288
6289 .ci_unaln_011:
6290 ldda [%o4+24]%asi, %d6
6291 ldda [%o4+32]%asi, %d8
6292 ldda [%o4+40]%asi, %d10
6293 ldda [%o4+48]%asi, %d12
6294 ldda [%o4+56]%asi, %d14
6295 .ci_unaln_011_loop:
6296 add %o4, 64, %o4
6297 ldda [%o4]ASI_BLK_AIUS, %d16
6298 faligndata %d6, %d8, %d48
6299 faligndata %d8, %d10, %d50
6300 faligndata %d10, %d12, %d52
6301 faligndata %d12, %d14, %d54
6302 faligndata %d14, %d16, %d56
6303 faligndata %d16, %d18, %d58
6304 faligndata %d18, %d20, %d60
6305 faligndata %d20, %d22, %d62
6306 fmovd %d22, %d6
6307 fmovd %d24, %d8
6308 fmovd %d26, %d10
6309 fmovd %d28, %d12
6310 fmovd %d30, %d14
6311 stda %d48, [%i1]ASI_BLK_P
6312 subcc %i3, 64, %i3
6313 add %i1, 64, %i1
6314 bgu,pt %ncc, .ci_unaln_011_loop
6315 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6316 ba .ci_unaln_done
6317 nop
6318
6319 .ci_unaln_010:
6320 ldda [%o4+16]%asi, %d4
6321 ldda [%o4+24]%asi, %d6
6322 ldda [%o4+32]%asi, %d8
6323 ldda [%o4+40]%asi, %d10
6324 ldda [%o4+48]%asi, %d12
6325 ldda [%o4+56]%asi, %d14
6326 .ci_unaln_010_loop:
6327 add %o4, 64, %o4
6328 ldda [%o4]ASI_BLK_AIUS, %d16
6329 faligndata %d4, %d6, %d48
6330 faligndata %d6, %d8, %d50
6331 faligndata %d8, %d10, %d52
6332 faligndata %d10, %d12, %d54
6333 faligndata %d12, %d14, %d56
6334 faligndata %d14, %d16, %d58
6335 faligndata %d16, %d18, %d60
6336 faligndata %d18, %d20, %d62
6337 fmovd %d20, %d4
6338 fmovd %d22, %d6
6339 fmovd %d24, %d8
6340 fmovd %d26, %d10
6341 fmovd %d28, %d12
6342 fmovd %d30, %d14
6343 stda %d48, [%i1]ASI_BLK_P
6344 subcc %i3, 64, %i3
6345 add %i1, 64, %i1
6346 bgu,pt %ncc, .ci_unaln_010_loop
6347 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6348 ba .ci_unaln_done
6349 nop
6350
6351 .ci_unaln_001:
6352 ldda [%o4+8]%asi, %d2
6353 ldda [%o4+16]%asi, %d4
6354 ldda [%o4+24]%asi, %d6
6355 ldda [%o4+32]%asi, %d8
6356 ldda [%o4+40]%asi, %d10
6357 ldda [%o4+48]%asi, %d12
6358 ldda [%o4+56]%asi, %d14
6359 .ci_unaln_001_loop:
6360 add %o4, 64, %o4
6361 ldda [%o4]ASI_BLK_AIUS, %d16
6362 faligndata %d2, %d4, %d48
6363 faligndata %d4, %d6, %d50
6364 faligndata %d6, %d8, %d52
6365 faligndata %d8, %d10, %d54
6366 faligndata %d10, %d12, %d56
6367 faligndata %d12, %d14, %d58
6368 faligndata %d14, %d16, %d60
6369 faligndata %d16, %d18, %d62
6370 fmovd %d18, %d2
6371 fmovd %d20, %d4
6372 fmovd %d22, %d6
6373 fmovd %d24, %d8
6374 fmovd %d26, %d10
6375 fmovd %d28, %d12
6376 fmovd %d30, %d14
6377 stda %d48, [%i1]ASI_BLK_P
6378 subcc %i3, 64, %i3
6379 add %i1, 64, %i1
6380 bgu,pt %ncc, .ci_unaln_001_loop
6381 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6382 ba .ci_unaln_done
6383 nop
6384
6385 .ci_unaln_000:
6386 ldda [%o4]ASI_BLK_AIUS, %d0
6387 .ci_unaln_000_loop:
6388 add %o4, 64, %o4
6389 ldda [%o4]ASI_BLK_AIUS, %d16
6390 faligndata %d0, %d2, %d48
6391 faligndata %d2, %d4, %d50
6392 faligndata %d4, %d6, %d52
6393 faligndata %d6, %d8, %d54
6394 faligndata %d8, %d10, %d56
6395 faligndata %d10, %d12, %d58
6396 faligndata %d12, %d14, %d60
6397 faligndata %d14, %d16, %d62
6398 fmovd %d16, %d0
6399 fmovd %d18, %d2
6400 fmovd %d20, %d4
6401 fmovd %d22, %d6
6402 fmovd %d24, %d8
6403 fmovd %d26, %d10
6404 fmovd %d28, %d12
6405 fmovd %d30, %d14
6406 stda %d48, [%i1]ASI_BLK_P
6407 subcc %i3, 64, %i3
6408 add %i1, 64, %i1
6409 bgu,pt %ncc, .ci_unaln_000_loop
6410 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6411
6412 .ci_unaln_done:
6413 ! Handle trailing bytes, 64 to 127
6414 ! Dest long word aligned, Src not long word aligned
6415 cmp %i2, 15
6416 bleu %ncc, .ci_unaln_short
6417
6418 andn %i2, 0x7, %i3 ! %i3 is multiple of 8
6419 and %i2, 0x7, %i2 ! residue bytes in %i2
6420 add %i2, 8, %i2
6421 sub %i3, 8, %i3 ! insure we don't load past end of src
6422 andn %i0, 0x7, %o4 ! %o4 has long word aligned src address
6423 add %i0, %i3, %i0 ! advance %i0 to after multiple of 8
6424 ldda [%o4]%asi, %d0 ! fetch partial word
6425 .ci_unaln_by8:
6426 ldda [%o4+8]%asi, %d2
6427 add %o4, 8, %o4
6428 faligndata %d0, %d2, %d16
6429 subcc %i3, 8, %i3
6430 std %d16, [%i1]
6431 fmovd %d2, %d0
6432 bgu,pt %ncc, .ci_unaln_by8
6433 add %i1, 8, %i1
6434
6435 .ci_unaln_short:
6436 cmp %i2, 8
6437 blt,pt %ncc, .ci_unalnfin
6438 nop
6439 lduba [%i0]%asi, %o4
6440 sll %o4, 24, %o3
6441 lduba [%i0+1]%asi, %o4
6442 sll %o4, 16, %o4
6443 or %o4, %o3, %o3
6444 lduba [%i0+2]%asi, %o4
6445 sll %o4, 8, %o4
6446 or %o4, %o3, %o3
6447 lduba [%i0+3]%asi, %o4
6448 or %o4, %o3, %o3
6449 stw %o3, [%i1]
6450 lduba [%i0+4]%asi, %o4
6451 sll %o4, 24, %o3
6452 lduba [%i0+5]%asi, %o4
6453 sll %o4, 16, %o4
6454 or %o4, %o3, %o3
6455 lduba [%i0+6]%asi, %o4
6456 sll %o4, 8, %o4
6457 or %o4, %o3, %o3
6458 lduba [%i0+7]%asi, %o4
6459 or %o4, %o3, %o3
6460 stw %o3, [%i1+4]
6461 add %i0, 8, %i0
6462 add %i1, 8, %i1
6463 sub %i2, 8, %i2
6464 .ci_unalnfin:
6465 cmp %i2, 4
6466 blt,pt %ncc, .ci_unalnz
6467 tst %i2
6468 lduba [%i0]%asi, %o3 ! read byte
6469 subcc %i2, 4, %i2 ! reduce count by 4
6470 sll %o3, 24, %o3 ! position
6471 lduba [%i0+1]%asi, %o4
6472 sll %o4, 16, %o4 ! position
6473 or %o4, %o3, %o3 ! merge
6474 lduba [%i0+2]%asi, %o4
6475 sll %o4, 8, %o4 ! position
6476 or %o4, %o3, %o3 ! merge
6477 add %i1, 4, %i1 ! advance dst by 4
6478 lduba [%i0+3]%asi, %o4
6479 add %i0, 4, %i0 ! advance src by 4
6480 or %o4, %o3, %o4 ! merge
6481 bnz,pt %ncc, .ci_unaln3x
6482 stw %o4, [%i1-4]
6483 ba .ci_exit
6484 nop
6485 .ci_unalnz:
6486 bz,pt %ncc, .ci_exit
6487 wr %l5, %g0, %gsr ! restore %gsr
6488 .ci_unaln3x: ! Exactly 1, 2, or 3 bytes remain
6489 subcc %i2, 1, %i2 ! reduce count for cc test
6490 lduba [%i0]%asi, %o4 ! load one byte
6491 bz,pt %ncc, .ci_exit
6492 stb %o4, [%i1] ! store one byte
6493 lduba [%i0+1]%asi, %o4 ! load second byte
6494 subcc %i2, 1, %i2
6495 bz,pt %ncc, .ci_exit
6496 stb %o4, [%i1+1] ! store second byte
6497 lduba [%i0+2]%asi, %o4 ! load third byte
6498 stb %o4, [%i1+2] ! store third byte
6499 .ci_exit:
6500 brnz %g1, .ci_fp_restore
6501 nop
6502 FZERO
6503 wr %g1, %g0, %fprs
6504 ba,pt %ncc, .ci_ex2
6505 membar #Sync
6506 .ci_fp_restore:
6507 BLD_FP_FROMSTACK(%o4)
6508 .ci_ex2:
6509 andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
6510 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
6511 ret
6512 restore %g0, 0, %o0
6513
6514 .copyin_err:
6515 ldn [THREAD_REG + T_COPYOPS], %o4
6516 brz %o4, 2f
6517 nop
6518 ldn [%o4 + CP_COPYIN], %g2
6519 jmp %g2
6520 nop
6521 2:
6522 retl
6523 mov -1, %o0
6524
6525 #else /* NIAGARA_IMPL */
6526 .do_copyin:
6527 !
6528 ! Check the length and bail if zero.
6529 !
6530 tst %o2
6531 bnz,pt %ncc, 1f
6532 nop
6533 retl
6534 clr %o0
6535 1:
6536 sethi %hi(copyio_fault), %o4
6537 or %o4, %lo(copyio_fault), %o4
6538 sethi %hi(copyio_fault_nowindow), %o3
6539 ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
6540 or %o3, %lo(copyio_fault_nowindow), %o3
6541 membar #Sync
6542 stn %o3, [THREAD_REG + T_LOFAULT]
6543
6544 mov %o0, SAVE_SRC
6545 mov %o1, SAVE_DST
6546 mov %o2, SAVE_COUNT
6547
6548 !
6549 ! Check to see if we're more than SMALL_LIMIT.
6550 !
6551 subcc %o2, SMALL_LIMIT, %o3
6552 bgu,a,pt %ncc, .dci_ns
6553 or %o0, %o1, %o3
6554 !
6555 ! What was previously ".small_copyin"
6556 !
6557 .dcibcp:
6558 sub %g0, %o2, %o3 ! setup for copy loop
6559 add %o0, %o2, %o0
6560 add %o1, %o2, %o1
6561 ba,pt %ncc, .dcicl
6562 lduba [%o0 + %o3]ASI_USER, %o4
6563 !
6564 ! %o0 and %o1 point at the end and remain pointing at the end
6565 ! of their buffers. We pull things out by adding %o3 (which is
6566 ! the negation of the length) to the buffer end which gives us
6567 ! the curent location in the buffers. By incrementing %o3 we walk
6568 ! through both buffers without having to bump each buffer's
6569 ! pointer. A very fast 4 instruction loop.
6570 !
6571 .align 16
6572 .dcicl:
6573 stb %o4, [%o1 + %o3]
6574 inccc %o3
6575 bl,a,pt %ncc, .dcicl
6576 lduba [%o0 + %o3]ASI_USER, %o4
6577 !
6578 ! We're done. Go home.
6579 !
6580 membar #Sync
6581 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
6582 retl
6583 clr %o0
6584 !
6585 ! Try aligned copies from here.
6586 !
6587 .dci_ns:
6588 !
6589 ! See if we're single byte aligned. If we are, check the
6590 ! limit for single byte copies. If we're smaller, or equal,
6591 ! bounce to the byte for byte copy loop. Otherwise do it in
6592 ! HW (if enabled).
6593 !
6594 btst 1, %o3
6595 bz,a,pt %icc, .dcih8
6596 btst 7, %o3
6597 !
6598 ! We're single byte aligned.
6599 !
6600 sethi %hi(hw_copy_limit_1), %o3
6601 ld [%o3 + %lo(hw_copy_limit_1)], %o3
6602 !
6603 ! Is HW copy on? If not do everything byte for byte.
6604 !
6605 tst %o3
6606 bz,pn %icc, .dcibcp
6607 subcc %o3, %o2, %o3
6608 !
6609 ! Are we bigger than the HW limit? If not
6610 ! go to byte for byte.
6611 !
6612 bge,pt %ncc, .dcibcp
6613 nop
6614 !
6615 ! We're big enough and copy is on. Do it with HW.
6616 !
6617 ba,pt %ncc, .big_copyin
6618 nop
6619 .dcih8:
6620 !
6621 ! 8 byte aligned?
6622 !
6623 bnz,a %ncc, .dcih4
6624 btst 3, %o3
6625 !
6626 ! We're eight byte aligned.
6627 !
6628 sethi %hi(hw_copy_limit_8), %o3
6629 ld [%o3 + %lo(hw_copy_limit_8)], %o3
6630 !
6631 ! Is HW assist on? If not, do it with the aligned copy.
6632 !
6633 tst %o3
6634 bz,pn %icc, .dcis8
6635 subcc %o3, %o2, %o3
6636 bge %ncc, .dcis8
6637 nop
6638 ba,pt %ncc, .big_copyin
6639 nop
6640 .dcis8:
6641 !
6642 ! Housekeeping for copy loops. Uses same idea as in the byte for
6643 ! byte copy loop above.
6644 !
6645 add %o0, %o2, %o0
6646 add %o1, %o2, %o1
6647 sub %g0, %o2, %o3
6648 ba,pt %ncc, .didebc
6649 srl %o2, 3, %o2 ! Number of 8 byte chunks to copy
6650 !
6651 ! 4 byte aligned?
6652 !
6653 .dcih4:
6654 bnz %ncc, .dcih2
6655 sethi %hi(hw_copy_limit_4), %o3
6656 ld [%o3 + %lo(hw_copy_limit_4)], %o3
6657 !
6658 ! Is HW assist on? If not, do it with the aligned copy.
6659 !
6660 tst %o3
6661 bz,pn %icc, .dcis4
6662 subcc %o3, %o2, %o3
6663 !
6664 ! We're negative if our size is less than or equal to hw_copy_limit_4.
6665 !
6666 bge %ncc, .dcis4
6667 nop
6668 ba,pt %ncc, .big_copyin
6669 nop
6670 .dcis4:
6671 !
6672 ! Housekeeping for copy loops. Uses same idea as in the byte
6673 ! for byte copy loop above.
6674 !
6675 add %o0, %o2, %o0
6676 add %o1, %o2, %o1
6677 sub %g0, %o2, %o3
6678 ba,pt %ncc, .didfbc
6679 srl %o2, 2, %o2 ! Number of 4 byte chunks to copy
6680 .dcih2:
6681 !
6682 ! We're two byte aligned. Check for "smallness"
6683 ! done in delay at .dcih4
6684 !
6685 bleu,pt %ncc, .dcis2
6686 sethi %hi(hw_copy_limit_2), %o3
6687 ld [%o3 + %lo(hw_copy_limit_2)], %o3
6688 !
6689 ! Is HW assist on? If not, do it with the aligned copy.
6690 !
6691 tst %o3
6692 bz,pn %icc, .dcis2
6693 subcc %o3, %o2, %o3
6694 !
6695 ! Are we larger than the HW limit?
6696 !
6697 bge %ncc, .dcis2
6698 nop
6699 !
6700 ! HW assist is on and we're large enough to use it.
6701 !
6702 ba,pt %ncc, .big_copyin
6703 nop
6704 !
6705 ! Housekeeping for copy loops. Uses same idea as in the byte
6706 ! for byte copy loop above.
6707 !
6708 .dcis2:
6709 add %o0, %o2, %o0
6710 add %o1, %o2, %o1
6711 sub %g0, %o2, %o3
6712 ba,pt %ncc, .didtbc
6713 srl %o2, 1, %o2 ! Number of 2 byte chunks to copy
6714 !
6715 .small_copyin:
6716 !
6717 ! Why are we doing this AGAIN? There are certain conditions in
6718 ! big copyin that will cause us to forgo the HW assisted copys
6719 ! and bounce back to a non-hw assisted copy. This dispatches
6720 ! those copies. Note that we branch around this in the main line
6721 ! code.
6722 !
6723 ! We make no check for limits or HW enablement here. We've
6724 ! already been told that we're a poster child so just go off
6725 ! and do it.
6726 !
6727 or %o0, %o1, %o3
6728 btst 1, %o3
6729 bnz %icc, .dcibcp ! Most likely
6730 btst 7, %o3
6731 bz %icc, .dcis8
6732 btst 3, %o3
6733 bz %icc, .dcis4
6734 nop
6735 ba,pt %ncc, .dcis2
6736 nop
6737 !
6738 ! Eight byte aligned copies. A steal from the original .small_copyin
6739 ! with modifications. %o2 is number of 8 byte chunks to copy. When
6740 ! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
6741 ! to copy.
6742 !
6743 .align 32
6744 .didebc:
6745 ldxa [%o0 + %o3]ASI_USER, %o4
6746 deccc %o2
6747 stx %o4, [%o1 + %o3]
6748 bg,pt %ncc, .didebc
6749 addcc %o3, 8, %o3
6750 !
6751 ! End of copy loop. Most 8 byte aligned copies end here.
6752 !
6753 bz,pt %ncc, .dcifh
6754 nop
6755 !
6756 ! Something is left. Do it byte for byte.
6757 !
6758 ba,pt %ncc, .dcicl
6759 lduba [%o0 + %o3]ASI_USER, %o4
6760 !
6761 ! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
6762 !
6763 .align 32
6764 .didfbc:
6765 lduwa [%o0 + %o3]ASI_USER, %o4
6766 deccc %o2
6767 st %o4, [%o1 + %o3]
6768 bg,pt %ncc, .didfbc
6769 addcc %o3, 4, %o3
6770 !
6771 ! End of copy loop. Most 4 byte aligned copies end here.
6772 !
6773 bz,pt %ncc, .dcifh
6774 nop
6775 !
6776 ! Something is left. Do it byte for byte.
6777 !
6778 ba,pt %ncc, .dcicl
6779 lduba [%o0 + %o3]ASI_USER, %o4
6780 !
6781 ! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
6782 ! copy.
6783 !
6784 .align 32
6785 .didtbc:
6786 lduha [%o0 + %o3]ASI_USER, %o4
6787 deccc %o2
6788 sth %o4, [%o1 + %o3]
6789 bg,pt %ncc, .didtbc
6790 addcc %o3, 2, %o3
6791 !
6792 ! End of copy loop. Most 2 byte aligned copies end here.
6793 !
6794 bz,pt %ncc, .dcifh
6795 nop
6796 !
6797 ! Deal with the last byte
6798 !
6799 lduba [%o0 + %o3]ASI_USER, %o4
6800 stb %o4, [%o1 + %o3]
6801 .dcifh:
6802 membar #Sync
6803 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
6804 retl
6805 clr %o0
6806
6807 .big_copyin:
6808 ! We're going off to do a block copy.
6809 ! Switch fault hendlers and grab a window. We
6810 ! don't do a membar #Sync since we've done only
6811 ! kernel data to this point.
6812 stn %o4, [THREAD_REG + T_LOFAULT]
6813
6814 ! Copy in that reach here are larger than 256 bytes. The
6815 ! hw_copy_limit_1 is set to 256. Never set this limit less
6816 ! 128 bytes.
6817 save %sp, -SA(MINFRAME), %sp
6818 .do_blockcopyin:
6819
6820 ! Swap src/dst since the code below is memcpy code
6821 ! and memcpy/bcopy have different calling sequences
6822 mov %i1, %i5
6823 mov %i0, %i1
6824 mov %i5, %i0
6825
6826 ! Block (64 bytes) align the destination.
6827 andcc %i0, 0x3f, %i3 ! is dst block aligned
6828 bz %ncc, copyin_blalign ! dst already block aligned
6829 sub %i3, 0x40, %i3
6830 neg %i3 ! bytes till dst 64 bytes aligned
6831 sub %i2, %i3, %i2 ! update i2 with new count
6832
6833 ! Based on source and destination alignment do
6834 ! either 8 bytes, 4 bytes, 2 bytes or byte copy.
6835
6836 ! Is dst & src 8B aligned
6837 or %i0, %i1, %o2
6838 andcc %o2, 0x7, %g0
6839 bz %ncc, .ci_alewdcp
6840 nop
6841
6842 ! Is dst & src 4B aligned
6843 andcc %o2, 0x3, %g0
6844 bz %ncc, .ci_alwdcp
6845 nop
6846
6847 ! Is dst & src 2B aligned
6848 andcc %o2, 0x1, %g0
6849 bz %ncc, .ci_alhlfwdcp
6850 nop
6851
6852 ! 1B aligned
6853 1: lduba [%i1]ASI_USER, %o2
6854 stb %o2, [%i0]
6855 inc %i1
6856 deccc %i3
6857 bgu,pt %ncc, 1b
6858 inc %i0
6859
6860 ba copyin_blalign
6861 nop
6862
6863 ! dst & src 4B aligned
6864 .ci_alwdcp:
6865 lda [%i1]ASI_USER, %o2
6866 st %o2, [%i0]
6867 add %i1, 0x4, %i1
6868 subcc %i3, 0x4, %i3
6869 bgu,pt %ncc, .ci_alwdcp
6870 add %i0, 0x4, %i0
6871
6872 ba copyin_blalign
6873 nop
6874
6875 ! dst & src 2B aligned
6876 .ci_alhlfwdcp:
6877 lduha [%i1]ASI_USER, %o2
6878 stuh %o2, [%i0]
6879 add %i1, 0x2, %i1
6880 subcc %i3, 0x2, %i3
6881 bgu,pt %ncc, .ci_alhlfwdcp
6882 add %i0, 0x2, %i0
6883
6884 ba copyin_blalign
6885 nop
6886
6887 ! dst & src 8B aligned
6888 .ci_alewdcp:
6889 ldxa [%i1]ASI_USER, %o2
6890 stx %o2, [%i0]
6891 add %i1, 0x8, %i1
6892 subcc %i3, 0x8, %i3
6893 bgu,pt %ncc, .ci_alewdcp
6894 add %i0, 0x8, %i0
6895
6896 copyin_blalign:
6897 andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size
6898 sub %i2, %i3, %i2 ! Residue bytes in %i2
6899
6900 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
6901
6902 andcc %i1, 0xf, %o2 ! is src quadword aligned
6903 bz,pn %xcc, .ci_blkcpy ! src offset in %o2 (last 4-bits)
6904 nop
6905 cmp %o2, 0x8
6906 bg .ci_upper_double
6907 nop
6908 bl .ci_lower_double
6909 nop
6910
6911 ! Falls through when source offset is equal to 8 i.e.
6912 ! source is double word aligned.
6913 ! In this case no shift/merge of data is required
6914
6915 sub %i1, %o2, %i1 ! align the src at 16 bytes.
6916 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
6917 prefetcha [%l0]ASI_USER, #one_read
6918 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
6919 add %l0, 0x40, %l0
6920 .ci_loop0:
6921 add %i1, 0x10, %i1
6922 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
6923
6924 prefetcha [%l0]ASI_USER, #one_read
6925
6926 stxa %l3, [%i0+0x0]%asi
6927 stxa %l4, [%i0+0x8]%asi
6928
6929 add %i1, 0x10, %i1
6930 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
6931
6932 stxa %l5, [%i0+0x10]%asi
6933 stxa %l2, [%i0+0x18]%asi
6934
6935 add %i1, 0x10, %i1
6936 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
6937
6938 stxa %l3, [%i0+0x20]%asi
6939 stxa %l4, [%i0+0x28]%asi
6940
6941 add %i1, 0x10, %i1
6942 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
6943
6944 stxa %l5, [%i0+0x30]%asi
6945 stxa %l2, [%i0+0x38]%asi
6946
6947 add %l0, 0x40, %l0
6948 subcc %i3, 0x40, %i3
6949 bgu,pt %xcc, .ci_loop0
6950 add %i0, 0x40, %i0
6951 ba .ci_blkdone
6952 add %i1, %o2, %i1 ! increment the source by src offset
6953 ! the src offset was stored in %o2
6954
6955 .ci_lower_double:
6956
6957 sub %i1, %o2, %i1 ! align the src at 16 bytes.
6958 sll %o2, 3, %o0 ! %o0 left shift
6959 mov 0x40, %o1
6960 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
6961 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
6962 prefetcha [%l0]ASI_USER, #one_read
6963 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 ! partial data in %l2
6964 ! and %l3 has complete
6965 ! data
6966 add %l0, 0x40, %l0
6967 .ci_loop1:
6968 add %i1, 0x10, %i1
6969 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 ! %l4 has partial data
6970 ! for this read.
6971 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4
6972 ! into %l2 and %l3
6973
6974 prefetcha [%l0]ASI_USER, #one_read
6975
6976 stxa %l2, [%i0+0x0]%asi
6977 stxa %l3, [%i0+0x8]%asi
6978
6979 add %i1, 0x10, %i1
6980 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
6981 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and
6982 ! %l4 from previous read
6983 ! into %l4 and %l5
6984 stxa %l4, [%i0+0x10]%asi
6985 stxa %l5, [%i0+0x18]%asi
6986
6987 ! Repeat the same for next 32 bytes.
6988
6989 add %i1, 0x10, %i1
6990 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
6991 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
6992
6993 stxa %l2, [%i0+0x20]%asi
6994 stxa %l3, [%i0+0x28]%asi
6995
6996 add %i1, 0x10, %i1
6997 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
6998 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
6999
7000 stxa %l4, [%i0+0x30]%asi
7001 stxa %l5, [%i0+0x38]%asi
7002
7003 add %l0, 0x40, %l0
7004 subcc %i3, 0x40, %i3
7005 bgu,pt %xcc, .ci_loop1
7006 add %i0, 0x40, %i0
7007 ba .ci_blkdone
7008 add %i1, %o2, %i1 ! increment the source by src offset
7009 ! the src offset was stored in %o2
7010
7011 .ci_upper_double:
7012
7013 sub %i1, %o2, %i1 ! align the src at 16 bytes.
7014 sub %o2, 0x8, %o0
7015 sll %o0, 3, %o0 ! %o0 left shift
7016 mov 0x40, %o1
7017 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
7018 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
7019 prefetcha [%l0]ASI_USER, #one_read
7020 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 ! partial data in %l3
7021 ! for this read and
7022 ! no data in %l2
7023 add %l0, 0x40, %l0
7024 .ci_loop2:
7025 add %i1, 0x10, %i1
7026 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 ! %l4 has complete data
7027 ! and %l5 has partial
7028 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5
7029 ! into %l3 and %l4
7030 prefetcha [%l0]ASI_USER, #one_read
7031
7032 stxa %l3, [%i0+0x0]%asi
7033 stxa %l4, [%i0+0x8]%asi
7034
7035 add %i1, 0x10, %i1
7036 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7037 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with
7038 ! %l5 from previous read
7039 ! into %l5 and %l2
7040
7041 stxa %l5, [%i0+0x10]%asi
7042 stxa %l2, [%i0+0x18]%asi
7043
7044 ! Repeat the same for next 32 bytes.
7045
7046 add %i1, 0x10, %i1
7047 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7048 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
7049
7050 stxa %l3, [%i0+0x20]%asi
7051 stxa %l4, [%i0+0x28]%asi
7052
7053 add %i1, 0x10, %i1
7054 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7055 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
7056
7057 stxa %l5, [%i0+0x30]%asi
7058 stxa %l2, [%i0+0x38]%asi
7059
7060 add %l0, 0x40, %l0
7061 subcc %i3, 0x40, %i3
7062 bgu,pt %xcc, .ci_loop2
7063 add %i0, 0x40, %i0
7064 ba .ci_blkdone
7065 add %i1, %o2, %i1 ! increment the source by src offset
7066 ! the src offset was stored in %o2
7067
7068
7069 ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
7070 .ci_blkcpy:
7071
7072 andn %i1, 0x3f, %o0 ! %o0 has block aligned source
7073 prefetcha [%o0]ASI_USER, #one_read
7074 add %o0, 0x40, %o0
7075 1:
7076 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l0
7077 add %i1, 0x10, %i1
7078 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7079 add %i1, 0x10, %i1
7080
7081 prefetcha [%o0]ASI_USER, #one_read
7082
7083 stxa %l0, [%i0+0x0]%asi
7084
7085 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7086 add %i1, 0x10, %i1
7087 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l6
7088 add %i1, 0x10, %i1
7089
7090 stxa %l1, [%i0+0x8]%asi
7091 stxa %l2, [%i0+0x10]%asi
7092 stxa %l3, [%i0+0x18]%asi
7093 stxa %l4, [%i0+0x20]%asi
7094 stxa %l5, [%i0+0x28]%asi
7095 stxa %l6, [%i0+0x30]%asi
7096 stxa %l7, [%i0+0x38]%asi
7097
7098 add %o0, 0x40, %o0
7099 subcc %i3, 0x40, %i3
7100 bgu,pt %xcc, 1b
7101 add %i0, 0x40, %i0
7102
7103 .ci_blkdone:
7104 membar #Sync
7105
7106 brz,pt %i2, .copyin_exit
7107 nop
7108
7109 ! Handle trailing bytes
7110 cmp %i2, 0x8
7111 blu,pt %ncc, .ci_residue
7112 nop
7113
7114 ! Can we do some 8B ops
7115 or %i1, %i0, %o2
7116 andcc %o2, 0x7, %g0
7117 bnz %ncc, .ci_last4
7118 nop
7119
7120 ! Do 8byte ops as long as possible
7121 .ci_last8:
7122 ldxa [%i1]ASI_USER, %o2
7123 stx %o2, [%i0]
7124 add %i1, 0x8, %i1
7125 sub %i2, 0x8, %i2
7126 cmp %i2, 0x8
7127 bgu,pt %ncc, .ci_last8
7128 add %i0, 0x8, %i0
7129
7130 brz,pt %i2, .copyin_exit
7131 nop
7132
7133 ba .ci_residue
7134 nop
7135
7136 .ci_last4:
7137 ! Can we do 4B ops
7138 andcc %o2, 0x3, %g0
7139 bnz %ncc, .ci_last2
7140 nop
7141 1:
7142 lda [%i1]ASI_USER, %o2
7143 st %o2, [%i0]
7144 add %i1, 0x4, %i1
7145 sub %i2, 0x4, %i2
7146 cmp %i2, 0x4
7147 bgu,pt %ncc, 1b
7148 add %i0, 0x4, %i0
7149
7150 brz,pt %i2, .copyin_exit
7151 nop
7152
7153 ba .ci_residue
7154 nop
7155
7156 .ci_last2:
7157 ! Can we do 2B ops
7158 andcc %o2, 0x1, %g0
7159 bnz %ncc, .ci_residue
7160 nop
7161
7162 1:
7163 lduha [%i1]ASI_USER, %o2
7164 stuh %o2, [%i0]
7165 add %i1, 0x2, %i1
7166 sub %i2, 0x2, %i2
7167 cmp %i2, 0x2
7168 bgu,pt %ncc, 1b
7169 add %i0, 0x2, %i0
7170
7171 brz,pt %i2, .copyin_exit
7172 nop
7173
7174 ! Copy the residue as byte copy
7175 .ci_residue:
7176 lduba [%i1]ASI_USER, %i4
7177 stb %i4, [%i0]
7178 inc %i1
7179 deccc %i2
7180 bgu,pt %xcc, .ci_residue
7181 inc %i0
7182
7183 .copyin_exit:
7184 membar #Sync
7185 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
7186 ret
7187 restore %g0, 0, %o0
7188 .copyin_err:
7189 ldn [THREAD_REG + T_COPYOPS], %o4
7190 brz %o4, 2f
7191 nop
7192 ldn [%o4 + CP_COPYIN], %g2
7193 jmp %g2
7194 nop
7195 2:
7196 retl
7197 mov -1, %o0
7198 #endif /* NIAGARA_IMPL */
7199 SET_SIZE(copyin)
7200
7201 ENTRY(xcopyin)
7202 sethi %hi(.xcopyin_err), REAL_LOFAULT
7203 b .do_copyin
7204 or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
7205 .xcopyin_err:
7206 ldn [THREAD_REG + T_COPYOPS], %o4
7207 brz %o4, 2f
7208 nop
7209 ldn [%o4 + CP_XCOPYIN], %g2
7210 jmp %g2
7211 nop
7212 2:
7213 retl
7214 mov %g1, %o0
7215 SET_SIZE(xcopyin)
7216
7217 ENTRY(xcopyin_little)
7218 sethi %hi(.little_err), %o4
7219 ldn [THREAD_REG + T_LOFAULT], %o5
7220 or %o4, %lo(.little_err), %o4
7221 membar #Sync ! sync error barrier
7222 stn %o4, [THREAD_REG + T_LOFAULT]
7223
7224 subcc %g0, %o2, %o3
7225 add %o0, %o2, %o0
7226 bz,pn %ncc, 2f ! check for zero bytes
7227 sub %o2, 1, %o4
7228 add %o0, %o4, %o0 ! start w/last byte
7229 add %o1, %o2, %o1
7230 lduba [%o0+%o3]ASI_AIUSL, %o4
7231
7232 1: stb %o4, [%o1+%o3]
7233 inccc %o3
7234 sub %o0, 2, %o0 ! get next byte
7235 bcc,a,pt %ncc, 1b
7236 lduba [%o0+%o3]ASI_AIUSL, %o4
7237
7238 2: membar #Sync ! sync error barrier
7239 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
7240 retl
7241 mov %g0, %o0 ! return (0)
7242
7243 .little_err:
7244 membar #Sync ! sync error barrier
7245 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
7246 retl
7247 mov %g1, %o0
7248 SET_SIZE(xcopyin_little)
7249
7250
7251 /*
7252 * Copy a block of storage - must not overlap (from + len <= to).
7253 * No fault handler installed (to be called under on_fault())
7254 */
7255
7256 ENTRY(copyin_noerr)
7257 sethi %hi(.copyio_noerr), REAL_LOFAULT
7258 b .do_copyin
7259 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
7260 .copyio_noerr:
7261 jmp SAVED_LOFAULT
7262 nop
7263 SET_SIZE(copyin_noerr)
7264
7265 /*
7266 * Copy a block of storage - must not overlap (from + len <= to).
7267 * No fault handler installed (to be called under on_fault())
7268 */
7269
7270 ENTRY(copyout_noerr)
7271 sethi %hi(.copyio_noerr), REAL_LOFAULT
7272 b .do_copyout
7273 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
7274 SET_SIZE(copyout_noerr)
7275
7276 .align 4
7277 DGDEF(use_hw_bcopy)
7278 .word 1
7279 DGDEF(use_hw_bzero)
7280 .word 1
7281 DGDEF(hw_copy_limit_1)
7282 .word 0x100
7283 DGDEF(hw_copy_limit_2)
7284 .word 0x200
7285 DGDEF(hw_copy_limit_4)
7286 .word 0x400
7287 DGDEF(hw_copy_limit_8)
7288 .word 0x400
7289
7290 .align 64
7291 .section ".text"
7292
7293 /*
7294 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
7295 * longer than 256 bytes in length using Niagara's block stores/quad store.
7296 * If the criteria for using this routine are not met then it calls bzero
7297 * and returns 1. Otherwise 0 is returned indicating success.
7298 * Caller is responsible for ensuring use_hw_bzero is true and that
7299 * kpreempt_disable() has been called.
7300 */
7301 ! %i0 - start address
7302 ! %i1 - length of region (multiple of 64)
7303
7304 ENTRY(hwblkclr)
7305 save %sp, -SA(MINFRAME), %sp
7306
7307 ! Must be block-aligned
7308 andcc %i0, 0x3f, %g0
7309 bnz,pn %ncc, 1f
7310 nop
7311
7312 ! ... and must be 256 bytes or more
7313 cmp %i1, 0x100
7314 blu,pn %ncc, 1f
7315 nop
7316
7317 ! ... and length must be a multiple of 64
7318 andcc %i1, 0x3f, %g0
7319 bz,pn %ncc, .pz_doblock
7320 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
7321
7322 1: ! punt, call bzero but notify the caller that bzero was used
7323 mov %i0, %o0
7324 call bzero
7325 mov %i1, %o1
7326 ret
7327 restore %g0, 1, %o0 ! return (1) - did not use block operations
7328
7329 ! Already verified that there are at least 256 bytes to set
7330 .pz_doblock:
7331 stxa %g0, [%i0+0x0]%asi
7332 stxa %g0, [%i0+0x40]%asi
7333 stxa %g0, [%i0+0x80]%asi
7334 stxa %g0, [%i0+0xc0]%asi
7335
7336 stxa %g0, [%i0+0x8]%asi
7337 stxa %g0, [%i0+0x10]%asi
7338 stxa %g0, [%i0+0x18]%asi
7339 stxa %g0, [%i0+0x20]%asi
7340 stxa %g0, [%i0+0x28]%asi
7341 stxa %g0, [%i0+0x30]%asi
7342 stxa %g0, [%i0+0x38]%asi
7343
7344 stxa %g0, [%i0+0x48]%asi
7345 stxa %g0, [%i0+0x50]%asi
7346 stxa %g0, [%i0+0x58]%asi
7347 stxa %g0, [%i0+0x60]%asi
7348 stxa %g0, [%i0+0x68]%asi
7349 stxa %g0, [%i0+0x70]%asi
7350 stxa %g0, [%i0+0x78]%asi
7351
7352 stxa %g0, [%i0+0x88]%asi
7353 stxa %g0, [%i0+0x90]%asi
7354 stxa %g0, [%i0+0x98]%asi
7355 stxa %g0, [%i0+0xa0]%asi
7356 stxa %g0, [%i0+0xa8]%asi
7357 stxa %g0, [%i0+0xb0]%asi
7358 stxa %g0, [%i0+0xb8]%asi
7359
7360 stxa %g0, [%i0+0xc8]%asi
7361 stxa %g0, [%i0+0xd0]%asi
7362 stxa %g0, [%i0+0xd8]%asi
7363 stxa %g0, [%i0+0xe0]%asi
7364 stxa %g0, [%i0+0xe8]%asi
7365 stxa %g0, [%i0+0xf0]%asi
7366 stxa %g0, [%i0+0xf8]%asi
7367
7368 sub %i1, 0x100, %i1
7369 cmp %i1, 0x100
7370 bgu,pt %ncc, .pz_doblock
7371 add %i0, 0x100, %i0
7372
7373 2:
7374 ! Check if more than 64 bytes to set
7375 cmp %i1,0x40
7376 blu %ncc, .pz_finish
7377 nop
7378
7379 3:
7380 stxa %g0, [%i0+0x0]%asi
7381 stxa %g0, [%i0+0x8]%asi
7382 stxa %g0, [%i0+0x10]%asi
7383 stxa %g0, [%i0+0x18]%asi
7384 stxa %g0, [%i0+0x20]%asi
7385 stxa %g0, [%i0+0x28]%asi
7386 stxa %g0, [%i0+0x30]%asi
7387 stxa %g0, [%i0+0x38]%asi
7388
7389 subcc %i1, 0x40, %i1
7390 bgu,pt %ncc, 3b
7391 add %i0, 0x40, %i0
7392
7393 .pz_finish:
7394 membar #Sync
7395 ret
7396 restore %g0, 0, %o0 ! return (bzero or not)
7397 SET_SIZE(hwblkclr)
7398
7399 /*
7400 * Copy 32 bytes of data from src (%o0) to dst (%o1)
7401 * using physical addresses.
7402 */
7403 ENTRY_NP(hw_pa_bcopy32)
7404 rdpr %pstate, %g1
7405 andn %g1, PSTATE_IE, %g2
7406 wrpr %g0, %g2, %pstate
7407
7408 ldxa [%o0]ASI_MEM, %o2
7409 add %o0, 8, %o0
7410 ldxa [%o0]ASI_MEM, %o3
7411 add %o0, 8, %o0
7412 ldxa [%o0]ASI_MEM, %o4
7413 add %o0, 8, %o0
7414 ldxa [%o0]ASI_MEM, %o5
7415 stxa %o2, [%o1]ASI_MEM
7416 add %o1, 8, %o1
7417 stxa %o3, [%o1]ASI_MEM
7418 add %o1, 8, %o1
7419 stxa %o4, [%o1]ASI_MEM
7420 add %o1, 8, %o1
7421 stxa %o5, [%o1]ASI_MEM
7422
7423 membar #Sync
7424 retl
7425 wrpr %g0, %g1, %pstate
7426 SET_SIZE(hw_pa_bcopy32)
7427
7428 /*
7429 * Zero a block of storage.
7430 *
7431 * uzero is used by the kernel to zero a block in user address space.
7432 */
7433
7434 /*
7435 * Control flow of the bzero/kzero/uzero routine.
7436 *
7437 * For fewer than 7 bytes stores, bytes will be zeroed.
7438 *
7439 * For less than 15 bytes stores, align the address on 4 byte boundary.
7440 * Then store as many 4-byte chunks, followed by trailing bytes.
7441 *
7442 * For sizes greater than 15 bytes, align the address on 8 byte boundary.
7443 * if (count > 128) {
7444 * store as many 8-bytes chunks to block align the address
7445 * store using ASI_BLK_INIT_ST_QUAD_LDD_P (bzero/kzero) OR
7446 * store using ASI_BLK_INIT_QUAD_LDD_AIUS (uzero)
7447 * }
7448 * Store as many 8-byte chunks, followed by trailing bytes.
7449 */
7450
7451 ENTRY(uzero)
7452 !
7453 ! Set a new lo_fault handler only if we came in with one
7454 ! already specified.
7455 !
7456 wr %g0, ASI_USER, %asi
7457 ldn [THREAD_REG + T_LOFAULT], %o5
7458 tst %o5
7459 bz,pt %ncc, .do_zero
7460 sethi %hi(.zeroerr), %o2
7461 or %o2, %lo(.zeroerr), %o2
7462 membar #Sync
7463 ba,pt %ncc, .do_zero
7464 stn %o2, [THREAD_REG + T_LOFAULT]
7465
7466 ENTRY(kzero)
7467 !
7468 ! Always set a lo_fault handler
7469 !
7470 wr %g0, ASI_P, %asi
7471 ldn [THREAD_REG + T_LOFAULT], %o5
7472 sethi %hi(.zeroerr), %o2
7473 or %o5, LOFAULT_SET, %o5
7474 or %o2, %lo(.zeroerr), %o2
7475 membar #Sync
7476 ba,pt %ncc, .do_zero
7477 stn %o2, [THREAD_REG + T_LOFAULT]
7478
7479 /*
7480 * We got here because of a fault during kzero or if
7481 * uzero or bzero was called with t_lofault non-zero.
7482 * Otherwise we've already run screaming from the room.
7483 * Errno value is in %g1. Note that we're here iff
7484 * we did set t_lofault.
7485 */
7486 .zeroerr:
7487 !
7488 ! Undo asi register setting. Just set it to be the
7489 ! kernel default without checking.
7490 !
7491 wr %g0, ASI_P, %asi
7492
7493 !
7494 ! We did set t_lofault. It may well have been zero coming in.
7495 !
7496 1:
7497 tst %o5
7498 membar #Sync
7499 bne,pn %ncc, 3f
7500 andncc %o5, LOFAULT_SET, %o5
7501 2:
7502 !
7503 ! Old handler was zero. Just return the error.
7504 !
7505 retl ! return
7506 mov %g1, %o0 ! error code from %g1
7507 3:
7508 !
7509 ! We're here because %o5 was non-zero. It was non-zero
7510 ! because either LOFAULT_SET was present, a previous fault
7511 ! handler was present or both. In all cases we need to reset
7512 ! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET
7513 ! before we either simply return the error or we invoke the
7514 ! previously specified handler.
7515 !
7516 be %ncc, 2b
7517 stn %o5, [THREAD_REG + T_LOFAULT]
7518 jmp %o5 ! goto real handler
7519 nop
7520 SET_SIZE(kzero)
7521 SET_SIZE(uzero)
7522
7523 /*
7524 * Zero a block of storage.
7525 */
7526
7527 ENTRY(bzero)
7528 wr %g0, ASI_P, %asi
7529
7530 ldn [THREAD_REG + T_LOFAULT], %o5 ! save old vector
7531 tst %o5
7532 bz,pt %ncc, .do_zero
7533 sethi %hi(.zeroerr), %o2
7534 or %o2, %lo(.zeroerr), %o2
7535 membar #Sync ! sync error barrier
7536 stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector
7537
7538 .do_zero:
7539 cmp %o1, 7
7540 blu,pn %ncc, .byteclr
7541 nop
7542
7543 cmp %o1, 15
7544 blu,pn %ncc, .wdalign
7545 nop
7546
7547 andcc %o0, 7, %o3 ! is add aligned on a 8 byte bound
7548 bz,pt %ncc, .blkalign ! already double aligned
7549 sub %o3, 8, %o3 ! -(bytes till double aligned)
7550 add %o1, %o3, %o1 ! update o1 with new count
7551
7552 1:
7553 stba %g0, [%o0]%asi
7554 inccc %o3
7555 bl,pt %ncc, 1b
7556 inc %o0
7557
7558 ! Now address is double aligned
7559 .blkalign:
7560 cmp %o1, 0x80 ! check if there are 128 bytes to set
7561 blu,pn %ncc, .bzero_small
7562 mov %o1, %o3
7563
7564 sethi %hi(use_hw_bzero), %o2
7565 ld [%o2 + %lo(use_hw_bzero)], %o2
7566 tst %o2
7567 bz %ncc, .bzero_small
7568 mov %o1, %o3
7569
7570 rd %asi, %o3
7571 wr %g0, ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
7572 cmp %o3, ASI_P
7573 bne,a %ncc, .algnblk
7574 wr %g0, ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
7575
7576 .algnblk:
7577 andcc %o0, 0x3f, %o3 ! is block aligned?
7578 bz,pt %ncc, .bzero_blk
7579 sub %o3, 0x40, %o3 ! -(bytes till block aligned)
7580 add %o1, %o3, %o1 ! o1 is the remainder
7581
7582 ! Clear -(%o3) bytes till block aligned
7583 1:
7584 stxa %g0, [%o0]%asi
7585 addcc %o3, 8, %o3
7586 bl,pt %ncc, 1b
7587 add %o0, 8, %o0
7588
7589 .bzero_blk:
7590 and %o1, 0x3f, %o3 ! calc bytes left after blk clear
7591 andn %o1, 0x3f, %o4 ! calc size of blocks in bytes
7592
7593 cmp %o4, 0x100 ! 256 bytes or more
7594 blu,pn %ncc, 3f
7595 nop
7596
7597 2:
7598 stxa %g0, [%o0+0x0]%asi
7599 stxa %g0, [%o0+0x40]%asi
7600 stxa %g0, [%o0+0x80]%asi
7601 stxa %g0, [%o0+0xc0]%asi
7602
7603 stxa %g0, [%o0+0x8]%asi
7604 stxa %g0, [%o0+0x10]%asi
7605 stxa %g0, [%o0+0x18]%asi
7606 stxa %g0, [%o0+0x20]%asi
7607 stxa %g0, [%o0+0x28]%asi
7608 stxa %g0, [%o0+0x30]%asi
7609 stxa %g0, [%o0+0x38]%asi
7610
7611 stxa %g0, [%o0+0x48]%asi
7612 stxa %g0, [%o0+0x50]%asi
7613 stxa %g0, [%o0+0x58]%asi
7614 stxa %g0, [%o0+0x60]%asi
7615 stxa %g0, [%o0+0x68]%asi
7616 stxa %g0, [%o0+0x70]%asi
7617 stxa %g0, [%o0+0x78]%asi
7618
7619 stxa %g0, [%o0+0x88]%asi
7620 stxa %g0, [%o0+0x90]%asi
7621 stxa %g0, [%o0+0x98]%asi
7622 stxa %g0, [%o0+0xa0]%asi
7623 stxa %g0, [%o0+0xa8]%asi
7624 stxa %g0, [%o0+0xb0]%asi
7625 stxa %g0, [%o0+0xb8]%asi
7626
7627 stxa %g0, [%o0+0xc8]%asi
7628 stxa %g0, [%o0+0xd0]%asi
7629 stxa %g0, [%o0+0xd8]%asi
7630 stxa %g0, [%o0+0xe0]%asi
7631 stxa %g0, [%o0+0xe8]%asi
7632 stxa %g0, [%o0+0xf0]%asi
7633 stxa %g0, [%o0+0xf8]%asi
7634
7635 sub %o4, 0x100, %o4
7636 cmp %o4, 0x100
7637 bgu,pt %ncc, 2b
7638 add %o0, 0x100, %o0
7639
7640 3:
7641 ! ... check if 64 bytes to set
7642 cmp %o4, 0x40
7643 blu %ncc, .bzero_blk_done
7644 nop
7645
7646 4:
7647 stxa %g0, [%o0+0x0]%asi
7648 stxa %g0, [%o0+0x8]%asi
7649 stxa %g0, [%o0+0x10]%asi
7650 stxa %g0, [%o0+0x18]%asi
7651 stxa %g0, [%o0+0x20]%asi
7652 stxa %g0, [%o0+0x28]%asi
7653 stxa %g0, [%o0+0x30]%asi
7654 stxa %g0, [%o0+0x38]%asi
7655
7656 subcc %o4, 0x40, %o4
7657 bgu,pt %ncc, 3b
7658 add %o0, 0x40, %o0
7659
7660 .bzero_blk_done:
7661 membar #Sync
7662 !
7663 ! Undo asi register setting.
7664 !
7665 rd %asi, %o4
7666 wr %g0, ASI_P, %asi
7667 cmp %o4, ASI_BLK_INIT_ST_QUAD_LDD_P
7668 bne,a %ncc, .bzero_small
7669 wr %g0, ASI_USER, %asi
7670
7671 .bzero_small:
7672 ! Set the remaining doubles
7673 subcc %o3, 8, %o3 ! Can we store any doubles?
7674 blu,pn %ncc, .byteclr
7675 and %o1, 7, %o1 ! calc bytes left after doubles
7676
7677 .dbclr:
7678 stxa %g0, [%o0]%asi ! Clear the doubles
7679 subcc %o3, 8, %o3
7680 bgeu,pt %ncc, .dbclr
7681 add %o0, 8, %o0
7682
7683 ba .byteclr
7684 nop
7685
7686 .wdalign:
7687 andcc %o0, 3, %o3 ! is add aligned on a word boundary
7688 bz,pn %ncc, .wdclr
7689 andn %o1, 3, %o3 ! create word sized count in %o3
7690
7691 dec %o1 ! decrement count
7692 stba %g0, [%o0]%asi ! clear a byte
7693 ba .wdalign
7694 inc %o0 ! next byte
7695
7696 .wdclr:
7697 sta %g0, [%o0]%asi ! 4-byte clearing loop
7698 subcc %o3, 4, %o3
7699 bnz,pt %ncc, .wdclr
7700 inc 4, %o0
7701
7702 and %o1, 3, %o1 ! leftover count, if any
7703
7704 .byteclr:
7705 ! Set the leftover bytes
7706 brz %o1, .bzero_exit
7707 nop
7708
7709 7:
7710 deccc %o1 ! byte clearing loop
7711 stba %g0, [%o0]%asi
7712 bgu,pt %ncc, 7b
7713 inc %o0
7714
7715 .bzero_exit:
7716 !
7717 ! We're just concerned with whether t_lofault was set
7718 ! when we came in. We end up here from either kzero()
7719 ! or bzero(). kzero() *always* sets a lofault handler.
7720 ! It ors LOFAULT_SET into %o5 to indicate it has done
7721 ! this even if the value of %o5 is otherwise zero.
7722 ! bzero() sets a lofault handler *only* if one was
7723 ! previously set. Accordingly we need to examine
7724 ! %o5 and if it is non-zero be sure to clear LOFAULT_SET
7725 ! before resetting the error handler.
7726 !
7727 tst %o5
7728 bz %ncc, 1f
7729 andn %o5, LOFAULT_SET, %o5
7730 membar #Sync ! sync error barrier
7731 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
7732 1:
7733 retl
7734 clr %o0 ! return (0)
7735
7736 SET_SIZE(bzero)