1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25
26 #include <sys/param.h>
27 #include <sys/errno.h>
28 #include <sys/asm_linkage.h>
29 #include <sys/vtrace.h>
30 #include <sys/machthread.h>
31 #include <sys/clock.h>
32 #include <sys/asi.h>
33 #include <sys/fsr.h>
34 #include <sys/privregs.h>
35 #include <sys/machasi.h>
36 #include <sys/niagaraasi.h>
37
38 #if !defined(lint)
39 #include "assym.h"
40 #endif /* lint */
41
42
43 /*
44 * Pseudo-code to aid in understanding the control flow of the
45 * bcopy/kcopy routine.
46 *
47 * ! WARNING : <Register usage convention>
48 * ! In kcopy() the %o5, holds previous error handler and a flag
49 * ! LOFAULT_SET (low bits). The %o5 is null in bcopy().
50 * ! The %o5 is not available for any other use.
51 *
52 * On entry:
53 * ! Determine whether to use the FP register version or the
54 * ! the leaf routine version depending on the size of the copy.
55 * ! Set up error handling accordingly.
56 * ! The transition point depends on FP_COPY
57 * ! For both versions %o5 is reserved
58 *
59 * kcopy():
60 * if(length > FP_COPY)
61 * go to regular_kcopy
62 *
63 * ! Setup_leaf_rtn_error_handler
64 * %o5 = curthread->t_lofault; ! save existing handler in %o5
65 * %o5 |= LOFAULT_SET; ! ORed with LOFAULT_SET flag
66 * curthread->t_lofault = .sm_copyerr;
67 * goto small_bcopy();
68 *
69 * regular_kcopy:
70 * save_registers()
71 * %o5 = curthread->t_lofault; ! save existing handler in %o5
72 * %o5 |= LOFAULT_SET; ! ORed with LOFAULT_SET flag
73 * curthread->t_lofault = .copyerr;
74 * goto do_copy();
75 *
76 * bcopy():
77 * if(length > FP_COPY)
78 * go to regular_bcopy
79 *
80 * ! Setup_leaf_rtn_error_handler
81 * %o5 = curthread->t_lofault; ! save existing handler in %o5
82 * curthread->t_lofault = .sm_copyerr;
83 * goto small_bcopy();
84 *
85 * regular_bcopy:
86 * %o5 = curthread->t_lofault; ! save existing handler in %o5
87 * curthread->t_lofault = .copyerr;
88 * goto do_copy();
89 *
90 * small_bcopy:
91 * ! handle copies smaller than FP_COPY
92 * restore t_lofault handler
93 * exit
94 *
95 * do_copy:
96 * ! handle copies larger than FP_COPY
97 * save fp_regs
98 * blockcopy;
99 * restore fp_regs
100 * restore t_lofault handler if came from kcopy();
101 *
102 *
103 * In leaf lofault handler:
104 * curthread->t_lofault = (%o5 & ~LOFAULT_SET); ! restore old t_lofault
105 * return (errno)
106 *
107 * In lofault handler:
108 * curthread->t_lofault = (%o5 & ~LOFAULT_SET); ! restore old t_lofault
109 * restore fp_regs
110 * return (errno)
111 *
112 *
113 *
114 * For all of bcopy/copyin/copyout the copy logic is specialized according
115 * to how the src and dst is aligned and how much data needs to be moved.
116 * The following comments apply to the N2/RF code (#if !defined(NIAGARA_IMPL))
117 *
118 * N2/RF Flow :
119 *
120 * if (count < FP_COPY) { (584 bytes)
121 * set small fault handler (no register window save/restore)
122 * if count < SHORTCOPY (7 bytes)
123 * copy bytes; go to short_exit
124 * else
125 * determine dst alignment, move minimum bytes/halfwords to
126 * get dst aligned on long word boundary
127 * if( src is on long word boundary ) {
128 * medlong: src/dst aligned on 8 bytes
129 * copy with ldx/stx in 4-way unrolled loop;
130 * copy final 0-31 bytes; go to short_exit
131 * } else { src/dst not aligned on 8 bytes
132 * if src is word aligned, ld/st words in 32-byte chunks
133 * if src is half word aligned, ld half, ld word, ld half; pack
134 * into long word, store long words in 32-byte chunks
135 * if src is byte aligned, ld byte,half,word parts; pack into long
136 * word, store long words in 32-byte chunks
137 * move final 0-31 bytes according to src alignment; go to short_exit
138 * short_exit:
139 * restore trap handler if needed, retl
140 * else { More than FP_COPY bytes
141 * set fault handler
142 * disable kernel preemption
143 * save registers, save FP registers if in use
144 * move bytes to align destination register on long word boundary
145 * if(src is on long word boundary) { src/dst aligned on 8 bytes
146 * align dst on 64 byte boundary; use 8-way test for each of 8 possible
147 * src alignments relative to a 64 byte boundary to select the
148 * 16-way unrolled loop (128 bytes) to use for
149 * block load, fmovd, block-init-store, block-store, fmovd operations
150 * then go to remain_stuff.
151 * remain_stuff: move remaining bytes. go to long_exit
152 * } else {
153 * setup alignaddr for faligndata instructions
154 * align dst on 64 byte boundary; use 8-way test for each of 8 possible
155 * src alignments to nearest long word relative to 64 byte boundary to
156 * select the 8-way unrolled loop (64 bytes) to use for
157 * block load, falign, fmovd, block-store loop
158 * (only use block-init-store when src/dst on 8 byte boundaries.)
159 * goto unalign_done.
160 * unalign_done:
161 * move remaining bytes for unaligned cases. go to long_exit
162 * long_exit:
163 * restore %gsr, FP regs (either from stack or set to zero),
164 * restore trap handler, check for kernel preemption request,
165 * handle if needed, ret.
166 * }
167 *
168 * Other platforms include hw_bcopy_limit_[1248] to control the exact
169 * point where the FP register code is used. On those platforms, the
170 * FP register code did not leave data in L2 cache, potentially affecting
171 * performance more than the gain/loss from the algorithm difference.
172 * For N2/RF, block store places data in the L2 cache, so use or non-use
173 * of the FP registers has no effect on L2 cache behavior.
174 * The cost for testing hw_bcopy_limit_* according to different
175 * alignments exceeds 50 cycles for all cases, even when hw_bcopy_limits
176 * were not used. That cost was judged too high relative to the benefits,
177 * so the hw_bcopy_limit option is omitted from this code.
178 */
179
180 /*
181 * Less then or equal this number of bytes we will always copy byte-for-byte
182 */
183 #define SMALL_LIMIT 7
184
185 /*
186 * LOFAULT_SET : Flag set by kzero and kcopy to indicate that t_lofault
187 * handler was set
188 */
189 #define LOFAULT_SET 2
190
191 /*
192 * This define is to align data for the unaligned source cases.
193 * The data1, data2 and data3 is merged into data1 and data2.
194 * The data3 is preserved for next merge.
195 */
196 #define ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp) \
197 sllx data1, lshift, data1 ;\
198 srlx data2, rshift, tmp ;\
199 or data1, tmp, data1 ;\
200 sllx data2, lshift, data2 ;\
201 srlx data3, rshift, tmp ;\
202 or data2, tmp, data2
203 /*
204 * This macro is to align the data. Basically it merges
205 * data1 and data2 to form double word.
206 */
207 #define ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp) \
208 sllx data1, lshift, data1 ;\
209 srlx data2, rshift, tmp ;\
210 or data1, tmp, data1
211
212 #if !defined(NIAGARA_IMPL)
213 /*
214 * Flags set in the lower bits of the t_lofault address:
215 * FPUSED_FLAG: The FP registers were in use and must be restored
216 * LOFAULT_SET: Set for bcopy calls, cleared for kcopy calls
217 * COPY_FLAGS: Both of the above
218 *
219 * Other flags:
220 * KPREEMPT_FLAG: kpreempt needs to be called
221 */
222 #define FPUSED_FLAG 1
223 #define LOFAULT_SET 2
224 #define COPY_FLAGS (FPUSED_FLAG | LOFAULT_SET)
225 #define KPREEMPT_FLAG 4
226
227 #define ALIGN_OFF_1_7 \
228 faligndata %d0, %d2, %d48 ;\
229 faligndata %d2, %d4, %d50 ;\
230 faligndata %d4, %d6, %d52 ;\
231 faligndata %d6, %d8, %d54 ;\
232 faligndata %d8, %d10, %d56 ;\
233 faligndata %d10, %d12, %d58 ;\
234 faligndata %d12, %d14, %d60 ;\
235 faligndata %d14, %d16, %d62
236
237 #define ALIGN_OFF_8_15 \
238 faligndata %d2, %d4, %d48 ;\
239 faligndata %d4, %d6, %d50 ;\
240 faligndata %d6, %d8, %d52 ;\
241 faligndata %d8, %d10, %d54 ;\
242 faligndata %d10, %d12, %d56 ;\
243 faligndata %d12, %d14, %d58 ;\
244 faligndata %d14, %d16, %d60 ;\
245 faligndata %d16, %d18, %d62
246
247 #define ALIGN_OFF_16_23 \
248 faligndata %d4, %d6, %d48 ;\
249 faligndata %d6, %d8, %d50 ;\
250 faligndata %d8, %d10, %d52 ;\
251 faligndata %d10, %d12, %d54 ;\
252 faligndata %d12, %d14, %d56 ;\
253 faligndata %d14, %d16, %d58 ;\
254 faligndata %d16, %d18, %d60 ;\
255 faligndata %d18, %d20, %d62
256
257 #define ALIGN_OFF_24_31 \
258 faligndata %d6, %d8, %d48 ;\
259 faligndata %d8, %d10, %d50 ;\
260 faligndata %d10, %d12, %d52 ;\
261 faligndata %d12, %d14, %d54 ;\
262 faligndata %d14, %d16, %d56 ;\
263 faligndata %d16, %d18, %d58 ;\
264 faligndata %d18, %d20, %d60 ;\
265 faligndata %d20, %d22, %d62
266
267 #define ALIGN_OFF_32_39 \
268 faligndata %d8, %d10, %d48 ;\
269 faligndata %d10, %d12, %d50 ;\
270 faligndata %d12, %d14, %d52 ;\
271 faligndata %d14, %d16, %d54 ;\
272 faligndata %d16, %d18, %d56 ;\
273 faligndata %d18, %d20, %d58 ;\
274 faligndata %d20, %d22, %d60 ;\
275 faligndata %d22, %d24, %d62
276
277 #define ALIGN_OFF_40_47 \
278 faligndata %d10, %d12, %d48 ;\
279 faligndata %d12, %d14, %d50 ;\
280 faligndata %d14, %d16, %d52 ;\
281 faligndata %d16, %d18, %d54 ;\
282 faligndata %d18, %d20, %d56 ;\
283 faligndata %d20, %d22, %d58 ;\
284 faligndata %d22, %d24, %d60 ;\
285 faligndata %d24, %d26, %d62
286
287 #define ALIGN_OFF_48_55 \
288 faligndata %d12, %d14, %d48 ;\
289 faligndata %d14, %d16, %d50 ;\
290 faligndata %d16, %d18, %d52 ;\
291 faligndata %d18, %d20, %d54 ;\
292 faligndata %d20, %d22, %d56 ;\
293 faligndata %d22, %d24, %d58 ;\
294 faligndata %d24, %d26, %d60 ;\
295 faligndata %d26, %d28, %d62
296
297 #define ALIGN_OFF_56_63 \
298 faligndata %d14, %d16, %d48 ;\
299 faligndata %d16, %d18, %d50 ;\
300 faligndata %d18, %d20, %d52 ;\
301 faligndata %d20, %d22, %d54 ;\
302 faligndata %d22, %d24, %d56 ;\
303 faligndata %d24, %d26, %d58 ;\
304 faligndata %d26, %d28, %d60 ;\
305 faligndata %d28, %d30, %d62
306
307 /*
308 * FP_COPY indicates the minimum number of bytes needed
309 * to justify using FP/VIS-accelerated memory operations.
310 * The FPBLK code assumes a minimum number of bytes are available
311 * to be moved on entry. Check that code carefully before
312 * reducing FP_COPY below 256.
313 */
314 #define FP_COPY 584
315 #define SHORTCOPY 7
316 #define ASI_STBI_P ASI_BLK_INIT_ST_QUAD_LDD_P
317 #define ASI_STBI_AIUS ASI_BLK_INIT_QUAD_LDD_AIUS
318 #define CACHE_LINE 64
319 #define VIS_BLOCKSIZE 64
320
321 /*
322 * Size of stack frame in order to accomodate a 64-byte aligned
323 * floating-point register save area and 2 64-bit temp locations.
324 * All copy functions use three quadrants of fp registers; to assure a
325 * block-aligned three block buffer in which to save we must reserve
326 * four blocks on stack.
327 *
328 * _______________________________________ <-- %fp + STACK_BIAS
329 * | We may need to preserve 3 quadrants |
330 * | of fp regs, but since we do so with |
331 * | BST/BLD we need room in which to |
332 * | align to VIS_BLOCKSIZE bytes. So |
333 * | this area is 4 * VIS_BLOCKSIZE. | <-- - SAVED_FPREGS_OFFSET
334 * |-------------------------------------|
335 * | 8 bytes to save %fprs | <-- - SAVED_FPRS_OFFSET
336 * |-------------------------------------|
337 * | 8 bytes to save %gsr | <-- - SAVED_GSR_OFFSET
338 * ---------------------------------------
339 */
340 #define HWCOPYFRAMESIZE ((VIS_BLOCKSIZE * (3 + 1)) + (2 * 8))
341 #define SAVED_FPREGS_OFFSET (VIS_BLOCKSIZE * 4)
342 #define SAVED_FPREGS_ADJUST ((VIS_BLOCKSIZE * 3) + 1)
343 #define SAVED_FPRS_OFFSET (SAVED_FPREGS_OFFSET + 8)
344 #define SAVED_GSR_OFFSET (SAVED_FPRS_OFFSET + 8)
345
346 /*
347 * In FP copies if we do not have preserved data to restore over
348 * the fp regs we used then we must zero those regs to avoid
349 * exposing portions of the data to later threads (data security).
350 */
351 #define FZERO \
352 fzero %f0 ;\
353 fzero %f2 ;\
354 faddd %f0, %f2, %f4 ;\
355 fmuld %f0, %f2, %f6 ;\
356 faddd %f0, %f2, %f8 ;\
357 fmuld %f0, %f2, %f10 ;\
358 faddd %f0, %f2, %f12 ;\
359 fmuld %f0, %f2, %f14 ;\
360 faddd %f0, %f2, %f16 ;\
361 fmuld %f0, %f2, %f18 ;\
362 faddd %f0, %f2, %f20 ;\
363 fmuld %f0, %f2, %f22 ;\
364 faddd %f0, %f2, %f24 ;\
365 fmuld %f0, %f2, %f26 ;\
366 faddd %f0, %f2, %f28 ;\
367 fmuld %f0, %f2, %f30 ;\
368 faddd %f0, %f2, %f48 ;\
369 fmuld %f0, %f2, %f50 ;\
370 faddd %f0, %f2, %f52 ;\
371 fmuld %f0, %f2, %f54 ;\
372 faddd %f0, %f2, %f56 ;\
373 fmuld %f0, %f2, %f58 ;\
374 faddd %f0, %f2, %f60 ;\
375 fmuld %f0, %f2, %f62
376
377 #if !defined(lint)
378
379 /*
380 * Macros to save and restore fp registers to/from the stack.
381 * Used to save and restore in-use fp registers when we want to use FP.
382 */
383 #define BST_FP_TOSTACK(tmp1) \
384 /* membar #Sync */ ;\
385 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
386 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
387 stda %f0, [tmp1]ASI_BLK_P ;\
388 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
389 stda %f16, [tmp1]ASI_BLK_P ;\
390 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
391 stda %f48, [tmp1]ASI_BLK_P ;\
392 membar #Sync
393
394 #define BLD_FP_FROMSTACK(tmp1) \
395 /* membar #Sync - provided at copy completion */ ;\
396 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
397 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
398 ldda [tmp1]ASI_BLK_P, %f0 ;\
399 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
400 ldda [tmp1]ASI_BLK_P, %f16 ;\
401 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
402 ldda [tmp1]ASI_BLK_P, %f48 ;\
403 membar #Sync
404 #endif /* NIAGARA_IMPL */
405
406 #endif /* lint */
407 /*
408 * Copy a block of storage, returning an error code if `from' or
409 * `to' takes a kernel pagefault which cannot be resolved.
410 * Returns errno value on pagefault error, 0 if all ok
411 */
412
413 #if defined(lint)
414
415 /* ARGSUSED */
416 int
417 kcopy(const void *from, void *to, size_t count)
418 { return(0); }
419
420 #else /* lint */
421
422 .seg ".text"
423 .align 4
424
425 ENTRY(kcopy)
426 #if !defined(NIAGARA_IMPL)
427 cmp %o2, FP_COPY ! check for small copy/leaf case
428 bgt,pt %ncc, .kcopy_more !
429 nop
430 .kcopy_small: ! setup error handler
431 sethi %hi(.sm_copyerr), %o4
432 or %o4, %lo(.sm_copyerr), %o4 ! .sm_copyerr is lofault value
433 ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler
434 ! Note that we carefully do *not* flag the setting of
435 ! t_lofault.
436 membar #Sync ! sync error barrier
437 b .sm_do_copy ! common code
438 stn %o4, [THREAD_REG + T_LOFAULT] ! set t_lofault
439
440
441 .kcopy_more:
442 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
443 sethi %hi(.copyerr), %l7 ! copyerr is lofault value
444 or %l7, %lo(.copyerr), %l7
445 ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler
446 ! Note that we carefully do *not* flag the setting of
447 ! t_lofault.
448 membar #Sync ! sync error barrier
449 b .do_copy ! common code
450 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
451
452 /*
453 * We got here because of a fault during a small kcopy or bcopy.
454 * if a fault handler existed when bcopy was called.
455 * No floating point registers are used by the small copies.
456 * Small copies are from a leaf routine
457 * Errno value is in %g1.
458 */
459 .sm_copyerr:
460 ! The kcopy will always set a t_lofault handler. If it fires,
461 ! we're expected to just return the error code and not to
462 ! invoke any existing error handler. As far as bcopy is concerned,
463 ! we only set t_lofault if there was an existing lofault handler.
464 ! In that case we're expected to invoke the previously existing
465 ! handler after resetting the t_lofault value.
466 btst LOFAULT_SET, %o5
467 membar #Sync ! sync error barrier
468 andn %o5, LOFAULT_SET, %o5 ! clear fault flag
469 bnz,pn %ncc, 3f
470 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
471 retl
472 mov %g1, %o0
473 3:
474 ! We're here via bcopy. There must have been an error handler
475 ! in place otherwise we would have died a nasty death already.
476 jmp %o5 ! goto real handler
477 mov %g0, %o0
478 /*
479 * end of .sm_copyerr
480 */
481
482 /*
483 * We got here because of a fault during kcopy or bcopy if a fault
484 * handler existed when bcopy was called.
485 * stack and fp registers need to be restored
486 * Errno value is in %g1.
487 */
488 .copyerr:
489 sethi %hi(.copyerr2), %l1
490 or %l1, %lo(.copyerr2), %l1
491 membar #Sync ! sync error barrier
492 stn %l1, [THREAD_REG + T_LOFAULT] ! set t_lofault
493 btst FPUSED_FLAG, %o5
494 bz,pt %xcc, 1f
495 and %o5, LOFAULT_SET, %l1 ! copy flag to %l1
496
497 membar #Sync ! sync error barrier
498 wr %l5, 0, %gsr
499 btst FPRS_FEF, %g5
500 bz,pt %icc, 4f
501 nop
502 ! restore fpregs from stack
503 BLD_FP_FROMSTACK(%o2)
504 ba,pt %ncc, 2f
505 wr %g5, 0, %fprs ! restore fprs
506 4:
507 FZERO
508 wr %g5, 0, %fprs ! restore fprs
509 2:
510 ldn [THREAD_REG + T_LWP], %o2
511 brnz,pt %o2, 1f
512 nop
513
514 ldsb [THREAD_REG + T_PREEMPT], %l0
515 deccc %l0
516 bnz,pn %ncc, 1f
517 stb %l0, [THREAD_REG + T_PREEMPT]
518
519 ! Check for a kernel preemption request
520 ldn [THREAD_REG + T_CPU], %l0
521 ldub [%l0 + CPU_KPRUNRUN], %l0
522 brnz,a,pt %l0, 1f ! Need to call kpreempt?
523 or %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag
524
525 ! The kcopy will always set a t_lofault handler. If it fires,
526 ! we're expected to just return the error code and not to
527 ! invoke any existing error handler. As far as bcopy is concerned,
528 ! we only set t_lofault if there was an existing lofault handler.
529 ! In that case we're expected to invoke the previously existing
530 ! handler after resetting the t_lofault value.
531 1:
532 andn %o5, COPY_FLAGS, %o5 ! remove flags from lofault address
533 membar #Sync ! sync error barrier
534 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
535
536 ! call kpreempt if necessary
537 btst KPREEMPT_FLAG, %l1
538 bz,pt %icc, 2f
539 nop
540 call kpreempt
541 rdpr %pil, %o0 ! pass %pil
542 2:
543 btst LOFAULT_SET, %l1
544 bnz,pn %ncc, 3f
545 nop
546 ret
547 restore %g1, 0, %o0
548 3:
549 ! We're here via bcopy. There must have been an error handler
550 ! in place otherwise we would have died a nasty death already.
551 jmp %o5 ! goto real handler
552 restore %g0, 0, %o0 ! dispose of copy window
553
554 /*
555 * We got here because of a fault in .copyerr. We can't safely restore fp
556 * state, so we panic.
557 */
558 fp_panic_msg:
559 .asciz "Unable to restore fp state after copy operation"
560
561 .align 4
562 .copyerr2:
563 set fp_panic_msg, %o0
564 call panic
565 nop
566 /*
567 * end of .copyerr
568 */
569
570 #else /* NIAGARA_IMPL */
571 save %sp, -SA(MINFRAME), %sp
572 set .copyerr, %l7 ! copyerr is lofault value
573 ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler
574 or %o5, LOFAULT_SET, %o5
575 membar #Sync ! sync error barrier
576 b .do_copy ! common code
577 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
578
579 /*
580 * We got here because of a fault during kcopy.
581 * Errno value is in %g1.
582 */
583 .copyerr:
584 ! The kcopy() *always* sets a t_lofault handler and it ORs LOFAULT_SET
585 ! into %o5 to indicate it has set t_lofault handler. Need to clear
586 ! LOFAULT_SET flag before restoring the error handler.
587 andn %o5, LOFAULT_SET, %o5
588 membar #Sync ! sync error barrier
589 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
590 ret
591 restore %g1, 0, %o0
592 #endif /* NIAGARA_IMPL */
593
594 SET_SIZE(kcopy)
595 #endif /* lint */
596
597
598 /*
599 * Copy a block of storage - must not overlap (from + len <= to).
600 */
601 #if defined(lint)
602
603 /* ARGSUSED */
604 void
605 bcopy(const void *from, void *to, size_t count)
606 {}
607
608 #else /* lint */
609
610 ENTRY(bcopy)
611 #if !defined(NIAGARA_IMPL)
612 cmp %o2, FP_COPY ! check for small copy/leaf case
613 bgt,pt %ncc, .bcopy_more !
614 nop
615 .bcopy_small: ! setup error handler
616 ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler
617 tst %o5
618 bz,pt %icc, .sm_do_copy
619 sethi %hi(.sm_copyerr), %o4
620 or %o4, %lo(.sm_copyerr), %o4 ! .sm_copyerr is lofault value
621 membar #Sync ! sync error barrier
622 stn %o4, [THREAD_REG + T_LOFAULT] ! set t_lofault
623 or %o5, LOFAULT_SET, %o5 ! Error should trampoline
624 .sm_do_copy:
625 mov %o0, %g1 ! save %o0
626 cmp %o2, SHORTCOPY ! make sure there is enough to align
627 ble,pt %ncc, .bc_smallest
628 andcc %o1, 0x7, %o3 ! is dest long aligned
629 bnz,pn %ncc, .bc_align
630 andcc %o1, 1, %o3 ! is dest byte aligned
631
632 ! Destination is long word aligned
633 .bc_al_src:
634 andcc %o0, 7, %o3
635 brnz,pt %o3, .bc_src_dst_unal8
636 nop
637 /*
638 * Special case for handling when src and dest are both long word aligned
639 * and total data to move is less than FP_COPY bytes
640 * Also handles finish up for large block moves, so may be less than 32 bytes
641 */
642 .bc_medlong:
643 subcc %o2, 31, %o2 ! adjust length to allow cc test
644 ble,pt %ncc, .bc_medl31
645 nop
646 .bc_medl32:
647 ldx [%o0], %o4 ! move 32 bytes
648 subcc %o2, 32, %o2 ! decrement length count by 32
649 stx %o4, [%o1]
650 ldx [%o0+8], %o4
651 stx %o4, [%o1+8]
652 ldx [%o0+16], %o4
653 add %o0, 32, %o0 ! increase src ptr by 32
654 stx %o4, [%o1+16]
655 ldx [%o0-8], %o4
656 add %o1, 32, %o1 ! increase dst ptr by 32
657 bgu,pt %ncc, .bc_medl32 ! repeat if at least 32 bytes left
658 stx %o4, [%o1-8]
659 .bc_medl31:
660 addcc %o2, 24, %o2 ! adjust count to be off by 7
661 ble,pt %ncc, .bc_medl7 ! skip if 7 or fewer bytes left
662 nop
663 .bc_medl8:
664 ldx [%o0], %o4 ! move 8 bytes
665 add %o0, 8, %o0 ! increase src ptr by 8
666 subcc %o2, 8, %o2 ! decrease count by 8
667 add %o1, 8, %o1 ! increase dst ptr by 8
668 bgu,pt %ncc, .bc_medl8
669 stx %o4, [%o1-8]
670 .bc_medl7:
671 addcc %o2, 7, %o2 ! finish adjustment of remaining count
672 bnz,pt %ncc, .bc_small4 ! do final bytes if not finished
673
674 .bc_smallx: ! finish up and exit
675 tst %o5
676 bz,pt %ncc, .bc_sm_done
677 andn %o5, COPY_FLAGS, %o5 ! remove flags from lofault address
678 membar #Sync ! sync error barrier
679 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
680 .bc_sm_done:
681 retl
682 mov %g0, %o0
683
684 .bc_small4:
685 cmp %o2, 4
686 blt,pt %ncc, .bc_small3x ! skip if less than 4 bytes left
687 nop !
688 ld [%o0], %o4 ! move 4 bytes
689 add %o0, 4, %o0 ! increase src ptr by 4
690 add %o1, 4, %o1 ! increase dst ptr by 4
691 subcc %o2, 4, %o2 ! decrease count by 4
692 bz,pt %ncc, .bc_smallx
693 stw %o4, [%o1-4]
694
695 .bc_small3x: ! Exactly 1, 2, or 3 bytes remain
696 subcc %o2, 1, %o2 ! reduce count for cc test
697 ldub [%o0], %o4 ! load one byte
698 bz,pt %ncc, .bc_smallx
699 stb %o4, [%o1] ! store one byte
700 ldub [%o0+1], %o4 ! load second byte
701 subcc %o2, 1, %o2
702 bz,pt %ncc, .bc_smallx
703 stb %o4, [%o1+1] ! store second byte
704 ldub [%o0+2], %o4 ! load third byte
705 ba .bc_smallx
706 stb %o4, [%o1+2] ! store third byte
707
708 .bc_smallest: ! 7 or fewer bytes remain
709 tst %o2
710 bz,pt %ncc, .bc_smallx
711 cmp %o2, 4
712 blt,pt %ncc, .bc_small3x
713 nop
714 ldub [%o0], %o4 ! read byte
715 subcc %o2, 4, %o2 ! reduce count by 4
716 stb %o4, [%o1] ! write byte
717 ldub [%o0+1], %o4 ! repeat for total of 4 bytes
718 add %o0, 4, %o0 ! advance src by 4
719 stb %o4, [%o1+1]
720 ldub [%o0-2], %o4
721 add %o1, 4, %o1 ! advance dst by 4
722 stb %o4, [%o1-2]
723 ldub [%o0-1], %o4
724 bnz,pt %ncc, .bc_small3x
725 stb %o4, [%o1-1]
726 ba .bc_smallx
727 nop
728
729 /*
730 * Align destination to long word boundary
731 */
732 .bc_align: ! byte align test in prior branch delay
733 bnz,pt %ncc, .bc_al_d1
734 .bc_al_d1f: ! dest is now half word aligned
735 andcc %o1, 2, %o3
736 bnz,pt %ncc, .bc_al_d2
737 .bc_al_d2f: ! dest is now word aligned
738 andcc %o1, 4, %o3 ! is dest longword aligned?
739 bz,pt %ncc, .bc_al_src
740 nop
741 .bc_al_d4: ! dest is word aligned; src is unknown
742 ldub [%o0], %o4 ! move a word (src align unknown)
743 ldub [%o0+1], %o3
744 sll %o4, 24, %o4 ! position
745 sll %o3, 16, %o3 ! position
746 or %o4, %o3, %o3 ! merge
747 ldub [%o0+2], %o4
748 sll %o4, 8, %o4 ! position
749 or %o4, %o3, %o3 ! merge
750 ldub [%o0+3], %o4
751 or %o4, %o3, %o4 ! merge
752 stw %o4,[%o1] ! store four bytes
753 add %o0, 4, %o0 ! adjust src by 4
754 add %o1, 4, %o1 ! adjust dest by 4
755 sub %o2, 4, %o2 ! adjust count by 4
756 andcc %o0, 7, %o3 ! check for src long word alignment
757 brz,pt %o3, .bc_medlong
758 .bc_src_dst_unal8:
759 ! dst is 8-byte aligned, src is not
760 ! Size is less than FP_COPY
761 ! Following code is to select for alignment
762 andcc %o0, 0x3, %o3 ! test word alignment
763 bz,pt %ncc, .bc_medword
764 nop
765 andcc %o0, 0x1, %o3 ! test halfword alignment
766 bnz,pt %ncc, .bc_med_byte ! go to byte move if not halfword
767 andcc %o0, 0x2, %o3 ! test which byte alignment
768 ba .bc_medhalf
769 nop
770 .bc_al_d1: ! align dest to half word
771 ldub [%o0], %o4 ! move a byte
772 add %o0, 1, %o0
773 stb %o4, [%o1]
774 add %o1, 1, %o1
775 andcc %o1, 2, %o3
776 bz,pt %ncc, .bc_al_d2f
777 sub %o2, 1, %o2
778 .bc_al_d2: ! align dest to word
779 ldub [%o0], %o4 ! move a half-word (src align unknown)
780 ldub [%o0+1], %o3
781 sll %o4, 8, %o4 ! position
782 or %o4, %o3, %o4 ! merge
783 sth %o4, [%o1]
784 add %o0, 2, %o0
785 add %o1, 2, %o1
786 andcc %o1, 4, %o3 ! is dest longword aligned?
787 bz,pt %ncc, .bc_al_src
788 sub %o2, 2, %o2
789 ba .bc_al_d4
790 nop
791 /*
792 * Handle all cases where src and dest are aligned on word
793 * boundaries. Use unrolled loops for better performance.
794 * This option wins over standard large data move when
795 * source and destination is in cache for medium
796 * to short data moves.
797 */
798 .bc_medword:
799 subcc %o2, 31, %o2 ! adjust length to allow cc test
800 ble,pt %ncc, .bc_medw31
801 nop
802 .bc_medw32:
803 ld [%o0], %o4 ! move a block of 32 bytes
804 stw %o4, [%o1]
805 ld [%o0+4], %o4
806 stw %o4, [%o1+4]
807 ld [%o0+8], %o4
808 stw %o4, [%o1+8]
809 ld [%o0+12], %o4
810 stw %o4, [%o1+12]
811 ld [%o0+16], %o4
812 stw %o4, [%o1+16]
813 ld [%o0+20], %o4
814 subcc %o2, 32, %o2 ! decrement length count
815 stw %o4, [%o1+20]
816 ld [%o0+24], %o4
817 add %o0, 32, %o0 ! increase src ptr by 32
818 stw %o4, [%o1+24]
819 ld [%o0-4], %o4
820 add %o1, 32, %o1 ! increase dst ptr by 32
821 bgu,pt %ncc, .bc_medw32 ! repeat if at least 32 bytes left
822 stw %o4, [%o1-4]
823 .bc_medw31:
824 addcc %o2, 24, %o2 ! adjust count to be off by 7
825 ble,pt %ncc, .bc_medw7 ! skip if 7 or fewer bytes left
826 nop !
827 .bc_medw15:
828 ld [%o0], %o4 ! move a block of 8 bytes
829 subcc %o2, 8, %o2 ! decrement length count
830 stw %o4, [%o1]
831 add %o0, 8, %o0 ! increase src ptr by 8
832 ld [%o0-4], %o4
833 add %o1, 8, %o1 ! increase dst ptr by 8
834 bgu,pt %ncc, .bc_medw15
835 stw %o4, [%o1-4]
836 .bc_medw7:
837 addcc %o2, 7, %o2 ! finish adjustment of remaining count
838 bz,pt %ncc, .bc_smallx ! exit if finished
839 cmp %o2, 4
840 blt,pt %ncc, .bc_small3x ! skip if less than 4 bytes left
841 nop !
842 ld [%o0], %o4 ! move 4 bytes
843 add %o0, 4, %o0 ! increase src ptr by 4
844 add %o1, 4, %o1 ! increase dst ptr by 4
845 subcc %o2, 4, %o2 ! decrease count by 4
846 bnz .bc_small3x
847 stw %o4, [%o1-4]
848 ba .bc_smallx
849 nop
850
851 .bc_medhalf:
852 subcc %o2, 31, %o2 ! adjust length to allow cc test
853 ble,pt %ncc, .bc_medh31
854 nop
855 .bc_medh32: ! load and store block of 32 bytes
856 subcc %o2, 32, %o2 ! decrement length count
857
858 lduh [%o0], %o4 ! move 32 bytes
859 lduw [%o0+2], %o3
860 sllx %o4, 48, %o4
861 sllx %o3, 16, %o3
862 or %o4, %o3, %o3
863 lduh [%o0+6], %o4
864 or %o4, %o3, %o4
865 stx %o4, [%o1]
866
867 lduh [%o0+8], %o4
868 lduw [%o0+10], %o3
869 sllx %o4, 48, %o4
870 sllx %o3, 16, %o3
871 or %o4, %o3, %o3
872 lduh [%o0+14], %o4
873 or %o4, %o3, %o4
874 stx %o4, [%o1+8]
875
876 lduh [%o0+16], %o4
877 lduw [%o0+18], %o3
878 sllx %o4, 48, %o4
879 sllx %o3, 16, %o3
880 or %o4, %o3, %o3
881 lduh [%o0+22], %o4
882 or %o4, %o3, %o4
883 stx %o4, [%o1+16]
884
885 add %o0, 32, %o0 ! increase src ptr by 32
886 add %o1, 32, %o1 ! increase dst ptr by 32
887
888 lduh [%o0-8], %o4
889 lduw [%o0-6], %o3
890 sllx %o4, 48, %o4
891 sllx %o3, 16, %o3
892 or %o4, %o3, %o3
893 lduh [%o0-2], %o4
894 or %o3, %o4, %o4
895 bgu,pt %ncc, .bc_medh32 ! repeat if at least 32 bytes left
896 stx %o4, [%o1-8]
897
898 .bc_medh31:
899 addcc %o2, 24, %o2 ! adjust count to be off by 7
900 ble,pt %ncc, .bc_medh7 ! skip if 7 or fewer bytes left
901 nop !
902 .bc_medh15:
903 lduh [%o0], %o4 ! move 16 bytes
904 subcc %o2, 8, %o2 ! decrement length count
905 lduw [%o0+2], %o3
906 sllx %o4, 48, %o4
907 sllx %o3, 16, %o3
908 or %o4, %o3, %o3
909 add %o1, 8, %o1 ! increase dst ptr by 8
910 lduh [%o0+6], %o4
911 add %o0, 8, %o0 ! increase src ptr by 8
912 or %o4, %o3, %o4
913 bgu,pt %ncc, .bc_medh15
914 stx %o4, [%o1-8]
915 .bc_medh7:
916 addcc %o2, 7, %o2 ! finish adjustment of remaining count
917 bz,pt %ncc, .bc_smallx ! exit if finished
918 cmp %o2, 4
919 blt,pt %ncc, .bc_small3x ! skip if less than 4 bytes left
920 nop !
921 lduh [%o0], %o4
922 sll %o4, 16, %o4
923 lduh [%o0+2], %o3
924 or %o3, %o4, %o4
925 subcc %o2, 4, %o2
926 add %o0, 4, %o0
927 add %o1, 4, %o1
928 bnz .bc_small3x
929 stw %o4, [%o1-4]
930 ba .bc_smallx
931 nop
932
933 .align 16
934 .bc_med_byte:
935 bnz,pt %ncc, .bc_medbh32a ! go to correct byte move
936 subcc %o2, 31, %o2 ! adjust length to allow cc test
937 ble,pt %ncc, .bc_medb31
938 nop
939 .bc_medb32: ! Alignment 1 or 5
940 subcc %o2, 32, %o2 ! decrement length count
941
942 ldub [%o0], %o4 ! load and store a block of 32 bytes
943 sllx %o4, 56, %o3
944 lduh [%o0+1], %o4
945 sllx %o4, 40, %o4
946 or %o4, %o3, %o3
947 lduw [%o0+3], %o4
948 sllx %o4, 8, %o4
949 or %o4, %o3, %o3
950 ldub [%o0+7], %o4
951 or %o4, %o3, %o4
952 stx %o4, [%o1]
953
954 ldub [%o0+8], %o4
955 sllx %o4, 56, %o3
956 lduh [%o0+9], %o4
957 sllx %o4, 40, %o4
958 or %o4, %o3, %o3
959 lduw [%o0+11], %o4
960 sllx %o4, 8, %o4
961 or %o4, %o3, %o3
962 ldub [%o0+15], %o4
963 or %o4, %o3, %o4
964 stx %o4, [%o1+8]
965
966 ldub [%o0+16], %o4
967 sllx %o4, 56, %o3
968 lduh [%o0+17], %o4
969 sllx %o4, 40, %o4
970 or %o4, %o3, %o3
971 lduw [%o0+19], %o4
972 sllx %o4, 8, %o4
973 or %o4, %o3, %o3
974 ldub [%o0+23], %o4
975 or %o4, %o3, %o4
976 stx %o4, [%o1+16]
977
978 add %o0, 32, %o0 ! increase src ptr by 32
979 add %o1, 32, %o1 ! increase dst ptr by 32
980
981 ldub [%o0-8], %o4
982 sllx %o4, 56, %o3
983 lduh [%o0-7], %o4
984 sllx %o4, 40, %o4
985 or %o4, %o3, %o3
986 lduw [%o0-5], %o4
987 sllx %o4, 8, %o4
988 or %o4, %o3, %o3
989 ldub [%o0-1], %o4
990 or %o4, %o3, %o4
991 bgu,pt %ncc, .bc_medb32 ! repeat if at least 32 bytes left
992 stx %o4, [%o1-8]
993
994 .bc_medb31: ! 31 or fewer bytes remaining
995 addcc %o2, 24, %o2 ! adjust count to be off by 7
996 ble,pt %ncc, .bc_medb7 ! skip if 7 or fewer bytes left
997 nop !
998 .bc_medb15:
999
1000 ldub [%o0], %o4 ! load and store a block of 8 bytes
1001 subcc %o2, 8, %o2 ! decrement length count
1002 sllx %o4, 56, %o3
1003 lduh [%o0+1], %o4
1004 sllx %o4, 40, %o4
1005 or %o4, %o3, %o3
1006 lduw [%o0+3], %o4
1007 add %o1, 8, %o1 ! increase dst ptr by 16
1008 sllx %o4, 8, %o4
1009 or %o4, %o3, %o3
1010 ldub [%o0+7], %o4
1011 add %o0, 8, %o0 ! increase src ptr by 16
1012 or %o4, %o3, %o4
1013 bgu,pt %ncc, .bc_medb15
1014 stx %o4, [%o1-8]
1015 .bc_medb7:
1016 addcc %o2, 7, %o2 ! finish adjustment of remaining count
1017 bz,pt %ncc, .bc_smallx ! exit if finished
1018 cmp %o2, 4
1019 blt,pt %ncc, .bc_small3x ! skip if less than 4 bytes left
1020 nop !
1021 ldub [%o0], %o4 ! move 4 bytes
1022 sll %o4, 24, %o3
1023 lduh [%o0+1], %o4
1024 sll %o4, 8, %o4
1025 or %o4, %o3, %o3
1026 ldub [%o0+3], %o4
1027 or %o4, %o3, %o4
1028 subcc %o2, 4, %o2
1029 add %o0, 4, %o0
1030 add %o1, 4, %o1
1031 bnz .bc_small3x
1032 stw %o4, [%o1-4]
1033 ba .bc_smallx
1034 nop
1035
1036 .align 16
1037 .bc_medbh32a: ! Alignment 3 or 7
1038 ble,pt %ncc, .bc_medbh31
1039 nop
1040 .bc_medbh32: ! Alignment 3 or 7
1041 subcc %o2, 32, %o2 ! decrement length count
1042
1043 ldub [%o0], %o4 ! load and store a block of 32 bytes
1044 sllx %o4, 56, %o3
1045 lduw [%o0+1], %o4
1046 sllx %o4, 24, %o4
1047 or %o4, %o3, %o3
1048 lduh [%o0+5], %o4
1049 sllx %o4, 8, %o4
1050 or %o4, %o3, %o3
1051 ldub [%o0+7], %o4
1052 or %o4, %o3, %o4
1053 stx %o4, [%o1]
1054
1055 ldub [%o0+8], %o4
1056 sllx %o4, 56, %o3
1057 lduw [%o0+9], %o4
1058 sllx %o4, 24, %o4
1059 or %o4, %o3, %o3
1060 lduh [%o0+13], %o4
1061 sllx %o4, 8, %o4
1062 or %o4, %o3, %o3
1063 ldub [%o0+15], %o4
1064 or %o4, %o3, %o4
1065 stx %o4, [%o1+8]
1066
1067 ldub [%o0+16], %o4
1068 sllx %o4, 56, %o3
1069 lduw [%o0+17], %o4
1070 sllx %o4, 24, %o4
1071 or %o4, %o3, %o3
1072 lduh [%o0+21], %o4
1073 sllx %o4, 8, %o4
1074 or %o4, %o3, %o3
1075 ldub [%o0+23], %o4
1076 or %o4, %o3, %o4
1077 stx %o4, [%o1+16]
1078
1079 add %o0, 32, %o0 ! increase src ptr by 32
1080 add %o1, 32, %o1 ! increase dst ptr by 32
1081
1082 ldub [%o0-8], %o4
1083 sllx %o4, 56, %o3
1084 lduw [%o0-7], %o4
1085 sllx %o4, 24, %o4
1086 or %o4, %o3, %o3
1087 lduh [%o0-3], %o4
1088 sllx %o4, 8, %o4
1089 or %o4, %o3, %o3
1090 ldub [%o0-1], %o4
1091 or %o4, %o3, %o4
1092 bgu,pt %ncc, .bc_medbh32 ! repeat if at least 32 bytes left
1093 stx %o4, [%o1-8]
1094
1095 .bc_medbh31:
1096 addcc %o2, 24, %o2 ! adjust count to be off by 7
1097 ble,pt %ncc, .bc_medb7 ! skip if 7 or fewer bytes left
1098 nop !
1099 .bc_medbh15:
1100 ldub [%o0], %o4 ! load and store a block of 8 bytes
1101 sllx %o4, 56, %o3
1102 lduw [%o0+1], %o4
1103 sllx %o4, 24, %o4
1104 or %o4, %o3, %o3
1105 lduh [%o0+5], %o4
1106 sllx %o4, 8, %o4
1107 or %o4, %o3, %o3
1108 ldub [%o0+7], %o4
1109 or %o4, %o3, %o4
1110 stx %o4, [%o1]
1111 subcc %o2, 8, %o2 ! decrement length count
1112 add %o1, 8, %o1 ! increase dst ptr by 8
1113 add %o0, 8, %o0 ! increase src ptr by 8
1114 bgu,pt %ncc, .bc_medbh15
1115 stx %o4, [%o1-8]
1116 ba .bc_medb7
1117 nop
1118
1119 SET_SIZE(bcopy)
1120 /*
1121 * The _more entry points are not intended to be used directly by
1122 * any caller from outside this file. They are provided to allow
1123 * profiling and dtrace of the portions of the copy code that uses
1124 * the floating point registers.
1125 */
1126 ENTRY(bcopy_more)
1127 .bcopy_more:
1128 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1129 ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler
1130 brz,pt %o5, .do_copy
1131 nop
1132 sethi %hi(.copyerr), %l7 ! copyerr is lofault value
1133 or %l7, %lo(.copyerr), %l7
1134 membar #Sync ! sync error barrier
1135 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
1136 ! We've already captured whether t_lofault was zero on entry.
1137 ! We need to mark ourselves as being from bcopy since both
1138 ! kcopy and bcopy use the same code path. If LOFAULT_SET is
1139 ! set and the saved lofault was zero, we won't reset lofault on
1140 ! returning.
1141 or %o5, LOFAULT_SET, %o5
1142 .do_copy:
1143 ldn [THREAD_REG + T_LWP], %o3
1144 brnz,pt %o3, 1f
1145 nop
1146 /*
1147 * kpreempt_disable();
1148 */
1149 ldsb [THREAD_REG +T_PREEMPT], %o3
1150 inc %o3
1151 stb %o3, [THREAD_REG + T_PREEMPT]
1152 1:
1153 /*
1154 * Following code is for large copies. We know there is at
1155 * least FP_COPY bytes available. FP regs are used, so
1156 * we save registers and fp regs before starting
1157 */
1158 rd %fprs, %g5 ! check for unused fp
1159 or %o5,FPUSED_FLAG,%o5
1160 ! if fprs.fef == 0, set it.
1161 ! Setting it when already set costs more than checking
1162 andcc %g5, FPRS_FEF, %g5 ! test FEF, fprs.du = fprs.dl = 0
1163 bz,pt %ncc, .bc_fp_unused
1164 prefetch [%i0 + (1 * CACHE_LINE)], #one_read
1165 BST_FP_TOSTACK(%o3)
1166 ba .bc_fp_ready
1167 .bc_fp_unused:
1168 andcc %i1, 1, %o3 ! is dest byte aligned
1169 wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
1170 .bc_fp_ready:
1171 rd %gsr, %l5 ! save %gsr value
1172 bnz,pt %ncc, .bc_big_d1
1173 .bc_big_d1f: ! dest is now half word aligned
1174 andcc %i1, 2, %o3
1175 bnz,pt %ncc, .bc_big_d2
1176 .bc_big_d2f: ! dest is now word aligned
1177 andcc %i1, 4, %o3
1178 bnz,pt %ncc, .bc_big_d4
1179 .bc_big_d4f: ! dest is now long word aligned
1180 andcc %i0, 7, %o3 ! is src long word aligned
1181 brnz,pt %o3, .bc_big_unal8
1182 prefetch [%i0 + (2 * CACHE_LINE)], #one_read
1183
1184 ! Src and dst are long word aligned
1185 ! align dst to 64 byte boundary
1186 andcc %i1, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned
1187 brz,pn %o3, .bc_al_to_64
1188 nop
1189 sub %o3, 64, %o3 ! %o3 has negative bytes to move
1190 add %i2, %o3, %i2 ! adjust remaining count
1191 andcc %o3, 8, %o4 ! odd long words to move?
1192 brz,pt %o4, .bc_al_to_16
1193 nop
1194 add %o3, 8, %o3
1195 ldx [%i0], %o4
1196 add %i0, 8, %i0 ! increment src ptr
1197 add %i1, 8, %i1 ! increment dst ptr
1198 stx %o4, [%i1-8]
1199 ! Dest is aligned on 16 bytes, src 8 byte aligned
1200 .bc_al_to_16:
1201 andcc %o3, 0x30, %o4 ! pair of long words to move?
1202 brz,pt %o4, .bc_al_to_64
1203 nop
1204 .bc_al_mv_16:
1205 add %o3, 16, %o3
1206 ldx [%i0], %o4
1207 stx %o4, [%i1]
1208 ldx [%i0+8], %o4
1209 add %i0, 16, %i0 ! increment src ptr
1210 stx %o4, [%i1+8]
1211 andcc %o3, 48, %o4
1212 brnz,pt %o4, .bc_al_mv_16
1213 add %i1, 16, %i1 ! increment dst ptr
1214 ! Dest is aligned on 64 bytes, src 8 byte aligned
1215 .bc_al_to_64:
1216 ! Determine source alignment
1217 ! to correct 8 byte offset
1218 andcc %i0, 32, %o3
1219 brnz,pn %o3, .bc_aln_1
1220 andcc %i0, 16, %o3
1221 brnz,pn %o3, .bc_aln_01
1222 andcc %i0, 8, %o3
1223 brz,pn %o3, .bc_aln_000
1224 prefetch [%i0 + (3 * CACHE_LINE)], #one_read
1225 ba .bc_aln_001
1226 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1227
1228 .bc_aln_01:
1229 brnz,pn %o3, .bc_aln_011
1230 prefetch [%i0 + (3 * CACHE_LINE)], #one_read
1231 ba .bc_aln_010
1232 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1233 .bc_aln_1:
1234 andcc %i0, 16, %o3
1235 brnz,pn %o3, .bc_aln_11
1236 andcc %i0, 8, %o3
1237 brnz,pn %o3, .bc_aln_101
1238 prefetch [%i0 + (3 * CACHE_LINE)], #one_read
1239 ba .bc_aln_100
1240 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1241 .bc_aln_11:
1242 brz,pn %o3, .bc_aln_110
1243 prefetch [%i0 + (3 * CACHE_LINE)], #one_read
1244
1245 .bc_aln_111:
1246 ! Alignment off by 8 bytes
1247 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1248 ldd [%i0], %d0
1249 add %i0, 8, %i0
1250 sub %i2, 8, %i2
1251 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
1252 and %i2, 0x7f, %i2 ! residue bytes in %i2
1253 sub %i1, %i0, %i1
1254 .bc_aln_111_loop:
1255 ldda [%i0]ASI_BLK_P,%d16 ! block load
1256 subcc %o3, 64, %o3
1257 fmovd %d16, %d2
1258 fmovd %d18, %d4
1259 fmovd %d20, %d6
1260 fmovd %d22, %d8
1261 fmovd %d24, %d10
1262 fmovd %d26, %d12
1263 fmovd %d28, %d14
1264 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1265 stda %d0,[%i0+%i1]ASI_BLK_P
1266 add %i0, 64, %i0
1267 fmovd %d30, %d0
1268 bgt,pt %ncc, .bc_aln_111_loop
1269 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1270 add %i1, %i0, %i1
1271
1272 std %d0, [%i1]
1273 ba .bc_remain_stuff
1274 add %i1, 8, %i1
1275 ! END OF aln_111
1276
1277 .bc_aln_110:
1278 ! Alignment off by 16 bytes
1279 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1280 ldd [%i0], %d0
1281 ldd [%i0+8], %d2
1282 add %i0, 16, %i0
1283 sub %i2, 16, %i2
1284 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
1285 and %i2, 0x7f, %i2 ! residue bytes in %i2
1286 sub %i1, %i0, %i1
1287 .bc_aln_110_loop:
1288 ldda [%i0]ASI_BLK_P,%d16 ! block load
1289 subcc %o3, 64, %o3
1290 fmovd %d16, %d4
1291 fmovd %d18, %d6
1292 fmovd %d20, %d8
1293 fmovd %d22, %d10
1294 fmovd %d24, %d12
1295 fmovd %d26, %d14
1296 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1297 stda %d0,[%i0+%i1]ASI_BLK_P
1298 add %i0, 64, %i0
1299 fmovd %d28, %d0
1300 fmovd %d30, %d2
1301 bgt,pt %ncc, .bc_aln_110_loop
1302 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1303 add %i1, %i0, %i1
1304
1305 std %d0, [%i1]
1306 std %d2, [%i1+8]
1307 ba .bc_remain_stuff
1308 add %i1, 16, %i1
1309 ! END OF aln_110
1310
1311 .bc_aln_101:
1312 ! Alignment off by 24 bytes
1313 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1314 ldd [%i0], %d0
1315 ldd [%i0+8], %d2
1316 ldd [%i0+16], %d4
1317 add %i0, 24, %i0
1318 sub %i2, 24, %i2
1319 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
1320 and %i2, 0x7f, %i2 ! residue bytes in %i2
1321 sub %i1, %i0, %i1
1322 .bc_aln_101_loop:
1323 ldda [%i0]ASI_BLK_P,%d16 ! block load
1324 subcc %o3, 64, %o3
1325 fmovd %d16, %d6
1326 fmovd %d18, %d8
1327 fmovd %d20, %d10
1328 fmovd %d22, %d12
1329 fmovd %d24, %d14
1330 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1331 stda %d0,[%i0+%i1]ASI_BLK_P
1332 add %i0, 64, %i0
1333 fmovd %d26, %d0
1334 fmovd %d28, %d2
1335 fmovd %d30, %d4
1336 bgt,pt %ncc, .bc_aln_101_loop
1337 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1338 add %i1, %i0, %i1
1339
1340 std %d0, [%i1]
1341 std %d2, [%i1+8]
1342 std %d4, [%i1+16]
1343 ba .bc_remain_stuff
1344 add %i1, 24, %i1
1345 ! END OF aln_101
1346
1347 .bc_aln_100:
1348 ! Alignment off by 32 bytes
1349 ldd [%i0], %d0
1350 ldd [%i0+8], %d2
1351 ldd [%i0+16],%d4
1352 ldd [%i0+24],%d6
1353 add %i0, 32, %i0
1354 sub %i2, 32, %i2
1355 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
1356 and %i2, 0x7f, %i2 ! residue bytes in %i2
1357 sub %i1, %i0, %i1
1358 .bc_aln_100_loop:
1359 ldda [%i0]ASI_BLK_P,%d16 ! block load
1360 subcc %o3, 64, %o3
1361 fmovd %d16, %d8
1362 fmovd %d18, %d10
1363 fmovd %d20, %d12
1364 fmovd %d22, %d14
1365 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1366 stda %d0,[%i0+%i1]ASI_BLK_P
1367 add %i0, 64, %i0
1368 fmovd %d24, %d0
1369 fmovd %d26, %d2
1370 fmovd %d28, %d4
1371 fmovd %d30, %d6
1372 bgt,pt %ncc, .bc_aln_100_loop
1373 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1374 add %i1, %i0, %i1
1375
1376 std %d0, [%i1]
1377 std %d2, [%i1+8]
1378 std %d4, [%i1+16]
1379 std %d6, [%i1+24]
1380 ba .bc_remain_stuff
1381 add %i1, 32, %i1
1382 ! END OF aln_100
1383
1384 .bc_aln_011:
1385 ! Alignment off by 40 bytes
1386 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1387 ldd [%i0], %d0
1388 ldd [%i0+8], %d2
1389 ldd [%i0+16], %d4
1390 ldd [%i0+24], %d6
1391 ldd [%i0+32], %d8
1392 add %i0, 40, %i0
1393 sub %i2, 40, %i2
1394 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
1395 and %i2, 0x7f, %i2 ! residue bytes in %i2
1396 sub %i1, %i0, %i1
1397 .bc_aln_011_loop:
1398 ldda [%i0]ASI_BLK_P,%d16 ! block load
1399 subcc %o3, 64, %o3
1400 fmovd %d16, %d10
1401 fmovd %d18, %d12
1402 fmovd %d20, %d14
1403 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1404 stda %d0,[%i0+%i1]ASI_BLK_P
1405 add %i0, 64, %i0
1406 fmovd %d22, %d0
1407 fmovd %d24, %d2
1408 fmovd %d26, %d4
1409 fmovd %d28, %d6
1410 fmovd %d30, %d8
1411 bgt,pt %ncc, .bc_aln_011_loop
1412 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1413 add %i1, %i0, %i1
1414
1415 std %d0, [%i1]
1416 std %d2, [%i1+8]
1417 std %d4, [%i1+16]
1418 std %d6, [%i1+24]
1419 std %d8, [%i1+32]
1420 ba .bc_remain_stuff
1421 add %i1, 40, %i1
1422 ! END OF aln_011
1423
1424 .bc_aln_010:
1425 ! Alignment off by 48 bytes
1426 ldd [%i0], %d0
1427 ldd [%i0+8], %d2
1428 ldd [%i0+16], %d4
1429 ldd [%i0+24], %d6
1430 ldd [%i0+32], %d8
1431 ldd [%i0+40], %d10
1432 add %i0, 48, %i0
1433 sub %i2, 48, %i2
1434 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
1435 and %i2, 0x7f, %i2 ! residue bytes in %i2
1436 sub %i1, %i0, %i1
1437 .bc_aln_010_loop:
1438 ldda [%i0]ASI_BLK_P,%d16 ! block load
1439 subcc %o3, 64, %o3
1440 fmovd %d16, %d12
1441 fmovd %d18, %d14
1442 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1443 stda %d0,[%i0+%i1]ASI_BLK_P
1444 add %i0, 64, %i0
1445 fmovd %d20, %d0
1446 fmovd %d22, %d2
1447 fmovd %d24, %d4
1448 fmovd %d26, %d6
1449 fmovd %d28, %d8
1450 fmovd %d30, %d10
1451 bgt,pt %ncc, .bc_aln_010_loop
1452 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1453 add %i1, %i0, %i1
1454
1455 std %d0, [%i1]
1456 std %d2, [%i1+8]
1457 std %d4, [%i1+16]
1458 std %d6, [%i1+24]
1459 std %d8, [%i1+32]
1460 std %d10, [%i1+40]
1461 ba .bc_remain_stuff
1462 add %i1, 48, %i1
1463 ! END OF aln_010
1464
1465 .bc_aln_001:
1466 ! Alignment off by 56 bytes
1467 ldd [%i0], %d0
1468 ldd [%i0+8], %d2
1469 ldd [%i0+16], %d4
1470 ldd [%i0+24], %d6
1471 ldd [%i0+32], %d8
1472 ldd [%i0+40], %d10
1473 ldd [%i0+48], %d12
1474 add %i0, 56, %i0
1475 sub %i2, 56, %i2
1476 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
1477 and %i2, 0x7f, %i2 ! residue bytes in %i2
1478 sub %i1, %i0, %i1
1479 .bc_aln_001_loop:
1480 ldda [%i0]ASI_BLK_P,%d16 ! block load
1481 subcc %o3, 64, %o3
1482 fmovd %d16, %d14
1483 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1484 stda %d0,[%i0+%i1]ASI_BLK_P
1485 add %i0, 64, %i0
1486 fmovd %d18, %d0
1487 fmovd %d20, %d2
1488 fmovd %d22, %d4
1489 fmovd %d24, %d6
1490 fmovd %d26, %d8
1491 fmovd %d28, %d10
1492 fmovd %d30, %d12
1493 bgt,pt %ncc, .bc_aln_001_loop
1494 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1495 add %i1, %i0, %i1
1496
1497 std %d0, [%i1]
1498 std %d2, [%i1+8]
1499 std %d4, [%i1+16]
1500 std %d6, [%i1+24]
1501 std %d8, [%i1+32]
1502 std %d10, [%i1+40]
1503 std %d12, [%i1+48]
1504 ba .bc_remain_stuff
1505 add %i1, 56, %i1
1506 ! END OF aln_001
1507
1508 .bc_aln_000:
1509 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1510 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
1511 and %i2, 0x7f, %i2 ! residue bytes in %i2
1512 sub %i1, %i0, %i1
1513 .bc_aln_000_loop:
1514 ldda [%i0]ASI_BLK_P,%d0
1515 subcc %o3, 64, %o3
1516 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1517 stda %d0,[%i0+%i1]ASI_BLK_P
1518 add %i0, 64, %i0
1519 bgt,pt %ncc, .bc_aln_000_loop
1520 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1521 add %i1, %i0, %i1
1522
1523 ! END OF aln_000
1524
1525 .bc_remain_stuff:
1526 subcc %i2, 31, %i2 ! adjust length to allow cc test
1527 ble,pt %ncc, .bc_aln_31
1528 nop
1529 .bc_aln_32:
1530 ldx [%i0], %o4 ! move 32 bytes
1531 subcc %i2, 32, %i2 ! decrement length count by 32
1532 stx %o4, [%i1]
1533 ldx [%i0+8], %o4
1534 stx %o4, [%i1+8]
1535 ldx [%i0+16], %o4
1536 add %i0, 32, %i0 ! increase src ptr by 32
1537 stx %o4, [%i1+16]
1538 ldx [%i0-8], %o4
1539 add %i1, 32, %i1 ! increase dst ptr by 32
1540 bgu,pt %ncc, .bc_aln_32 ! repeat if at least 32 bytes left
1541 stx %o4, [%i1-8]
1542 .bc_aln_31:
1543 addcc %i2, 24, %i2 ! adjust count to be off by 7
1544 ble,pt %ncc, .bc_aln_7 ! skip if 7 or fewer bytes left
1545 nop !
1546 .bc_aln_15:
1547 ldx [%i0], %o4 ! move 8 bytes
1548 add %i0, 8, %i0 ! increase src ptr by 8
1549 subcc %i2, 8, %i2 ! decrease count by 8
1550 add %i1, 8, %i1 ! increase dst ptr by 8
1551 bgu,pt %ncc, .bc_aln_15
1552 stx %o4, [%i1-8] !
1553 .bc_aln_7:
1554 addcc %i2, 7, %i2 ! finish adjustment of remaining count
1555 bz,pt %ncc, .bc_exit ! exit if finished
1556 cmp %i2, 4
1557 blt,pt %ncc, .bc_unaln3x ! skip if less than 4 bytes left
1558 nop !
1559 ld [%i0], %o4 ! move 4 bytes
1560 add %i0, 4, %i0 ! increase src ptr by 4
1561 add %i1, 4, %i1 ! increase dst ptr by 4
1562 subcc %i2, 4, %i2 ! decrease count by 4
1563 bnz .bc_unaln3x
1564 stw %o4, [%i1-4]
1565 ba .bc_exit
1566 nop
1567
1568 ! destination alignment code
1569 .bc_big_d1:
1570 ldub [%i0], %o4 ! move a byte
1571 add %i0, 1, %i0
1572 stb %o4, [%i1]
1573 add %i1, 1, %i1
1574 andcc %i1, 2, %o3
1575 bz,pt %ncc, .bc_big_d2f
1576 sub %i2, 1, %i2
1577 .bc_big_d2:
1578 ldub [%i0], %o4 ! move a half-word (src align unknown)
1579 ldub [%i0+1], %o3
1580 add %i0, 2, %i0
1581 sll %o4, 8, %o4 ! position
1582 or %o4, %o3, %o4 ! merge
1583 sth %o4, [%i1]
1584 add %i1, 2, %i1
1585 andcc %i1, 4, %o3
1586 bz,pt %ncc, .bc_big_d4f
1587 sub %i2, 2, %i2
1588 .bc_big_d4:
1589 ldub [%i0], %o4 ! move a word (src align unknown)
1590 ldub [%i0+1], %o3
1591 sll %o4, 24, %o4 ! position
1592 sll %o3, 16, %o3 ! position
1593 or %o4, %o3, %o3 ! merge
1594 ldub [%i0+2], %o4
1595 sll %o4, 8, %o4 ! position
1596 or %o4, %o3, %o3 ! merge
1597 ldub [%i0+3], %o4
1598 or %o4, %o3, %o4 ! merge
1599 stw %o4,[%i1] ! store four bytes
1600 add %i0, 4, %i0 ! adjust src by 4
1601 add %i1, 4, %i1 ! adjust dest by 4
1602 ba .bc_big_d4f
1603 sub %i2, 4, %i2 ! adjust count by 4
1604
1605
1606 ! Dst is on 8 byte boundary; src is not;
1607 .bc_big_unal8:
1608 andcc %i1, 0x3f, %o3 ! is dst 64-byte block aligned?
1609 bz %ncc, .bc_unalnsrc
1610 sub %o3, 64, %o3 ! %o3 will be multiple of 8
1611 neg %o3 ! bytes until dest is 64 byte aligned
1612 sub %i2, %o3, %i2 ! update cnt with bytes to be moved
1613 ! Move bytes according to source alignment
1614 andcc %i0, 0x1, %o4
1615 bnz %ncc, .bc_unalnbyte ! check for byte alignment
1616 nop
1617 andcc %i0, 2, %o4 ! check for half word alignment
1618 bnz %ncc, .bc_unalnhalf
1619 nop
1620 ! Src is word aligned, move bytes until dest 64 byte aligned
1621 .bc_unalnword:
1622 ld [%i0], %o4 ! load 4 bytes
1623 stw %o4, [%i1] ! and store 4 bytes
1624 ld [%i0+4], %o4 ! load 4 bytes
1625 add %i0, 8, %i0 ! increase src ptr by 8
1626 stw %o4, [%i1+4] ! and store 4 bytes
1627 subcc %o3, 8, %o3 ! decrease count by 8
1628 bnz %ncc, .bc_unalnword
1629 add %i1, 8, %i1 ! increase dst ptr by 8
1630 ba .bc_unalnsrc
1631 nop
1632
1633 ! Src is half-word aligned, move bytes until dest 64 byte aligned
1634 .bc_unalnhalf:
1635 lduh [%i0], %o4 ! load 2 bytes
1636 sllx %o4, 32, %i3 ! shift left
1637 lduw [%i0+2], %o4
1638 or %o4, %i3, %i3
1639 sllx %i3, 16, %i3
1640 lduh [%i0+6], %o4
1641 or %o4, %i3, %i3
1642 stx %i3, [%i1]
1643 add %i0, 8, %i0
1644 subcc %o3, 8, %o3
1645 bnz %ncc, .bc_unalnhalf
1646 add %i1, 8, %i1
1647 ba .bc_unalnsrc
1648 nop
1649
1650 ! Src is Byte aligned, move bytes until dest 64 byte aligned
1651 .bc_unalnbyte:
1652 sub %i1, %i0, %i1 ! share pointer advance
1653 .bc_unalnbyte_loop:
1654 ldub [%i0], %o4
1655 sllx %o4, 56, %i3
1656 lduh [%i0+1], %o4
1657 sllx %o4, 40, %o4
1658 or %o4, %i3, %i3
1659 lduh [%i0+3], %o4
1660 sllx %o4, 24, %o4
1661 or %o4, %i3, %i3
1662 lduh [%i0+5], %o4
1663 sllx %o4, 8, %o4
1664 or %o4, %i3, %i3
1665 ldub [%i0+7], %o4
1666 or %o4, %i3, %i3
1667 stx %i3, [%i1+%i0]
1668 subcc %o3, 8, %o3
1669 bnz %ncc, .bc_unalnbyte_loop
1670 add %i0, 8, %i0
1671 add %i1,%i0, %i1 ! restore pointer
1672
1673 ! Destination is now block (64 byte aligned), src is not 8 byte aligned
1674 .bc_unalnsrc:
1675 andn %i2, 0x3f, %i3 ! %i3 is multiple of block size
1676 and %i2, 0x3f, %i2 ! residue bytes in %i2
1677 add %i2, 64, %i2 ! Insure we don't load beyond
1678 sub %i3, 64, %i3 ! end of source buffer
1679
1680 andn %i0, 0x3f, %o4 ! %o4 has block aligned src address
1681 prefetch [%o4 + (3 * CACHE_LINE)], #one_read
1682 alignaddr %i0, %g0, %g0 ! generate %gsr
1683 add %i0, %i3, %i0 ! advance %i0 to after blocks
1684 !
1685 ! Determine source alignment to correct 8 byte offset
1686 andcc %i0, 0x20, %o3
1687 brnz,pn %o3, .bc_unaln_1
1688 andcc %i0, 0x10, %o3
1689 brnz,pn %o3, .bc_unaln_01
1690 andcc %i0, 0x08, %o3
1691 brz,a %o3, .bc_unaln_000
1692 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1693 ba .bc_unaln_001
1694 nop
1695 .bc_unaln_01:
1696 brnz,a %o3, .bc_unaln_011
1697 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1698 ba .bc_unaln_010
1699 nop
1700 .bc_unaln_1:
1701 brnz,pn %o3, .bc_unaln_11
1702 andcc %i0, 0x08, %o3
1703 brnz,a %o3, .bc_unaln_101
1704 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1705 ba .bc_unaln_100
1706 nop
1707 .bc_unaln_11:
1708 brz,pn %o3, .bc_unaln_110
1709 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1710
1711 .bc_unaln_111:
1712 ldd [%o4+56], %d14
1713 .bc_unaln_111_loop:
1714 add %o4, 64, %o4
1715 ldda [%o4]ASI_BLK_P, %d16
1716 faligndata %d14, %d16, %d48
1717 faligndata %d16, %d18, %d50
1718 faligndata %d18, %d20, %d52
1719 faligndata %d20, %d22, %d54
1720 faligndata %d22, %d24, %d56
1721 faligndata %d24, %d26, %d58
1722 faligndata %d26, %d28, %d60
1723 faligndata %d28, %d30, %d62
1724 fmovd %d30, %d14
1725 stda %d48, [%i1]ASI_BLK_P
1726 subcc %i3, 64, %i3
1727 add %i1, 64, %i1
1728 bgu,pt %ncc, .bc_unaln_111_loop
1729 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1730 ba .bc_unaln_done
1731 nop
1732
1733 .bc_unaln_110:
1734 ldd [%o4+48], %d12
1735 ldd [%o4+56], %d14
1736 .bc_unaln_110_loop:
1737 add %o4, 64, %o4
1738 ldda [%o4]ASI_BLK_P, %d16
1739 faligndata %d12, %d14, %d48
1740 faligndata %d14, %d16, %d50
1741 faligndata %d16, %d18, %d52
1742 faligndata %d18, %d20, %d54
1743 faligndata %d20, %d22, %d56
1744 faligndata %d22, %d24, %d58
1745 faligndata %d24, %d26, %d60
1746 faligndata %d26, %d28, %d62
1747 fmovd %d28, %d12
1748 fmovd %d30, %d14
1749 stda %d48, [%i1]ASI_BLK_P
1750 subcc %i3, 64, %i3
1751 add %i1, 64, %i1
1752 bgu,pt %ncc, .bc_unaln_110_loop
1753 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1754 ba .bc_unaln_done
1755 nop
1756
1757 .bc_unaln_101:
1758 ldd [%o4+40], %d10
1759 ldd [%o4+48], %d12
1760 ldd [%o4+56], %d14
1761 .bc_unaln_101_loop:
1762 add %o4, 64, %o4
1763 ldda [%o4]ASI_BLK_P, %d16
1764 faligndata %d10, %d12, %d48
1765 faligndata %d12, %d14, %d50
1766 faligndata %d14, %d16, %d52
1767 faligndata %d16, %d18, %d54
1768 faligndata %d18, %d20, %d56
1769 faligndata %d20, %d22, %d58
1770 faligndata %d22, %d24, %d60
1771 faligndata %d24, %d26, %d62
1772 fmovd %d26, %d10
1773 fmovd %d28, %d12
1774 fmovd %d30, %d14
1775 stda %d48, [%i1]ASI_BLK_P
1776 subcc %i3, 64, %i3
1777 add %i1, 64, %i1
1778 bgu,pt %ncc, .bc_unaln_101_loop
1779 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1780 ba .bc_unaln_done
1781 nop
1782
1783 .bc_unaln_100:
1784 ldd [%o4+32], %d8
1785 ldd [%o4+40], %d10
1786 ldd [%o4+48], %d12
1787 ldd [%o4+56], %d14
1788 .bc_unaln_100_loop:
1789 add %o4, 64, %o4
1790 ldda [%o4]ASI_BLK_P, %d16
1791 faligndata %d8, %d10, %d48
1792 faligndata %d10, %d12, %d50
1793 faligndata %d12, %d14, %d52
1794 faligndata %d14, %d16, %d54
1795 faligndata %d16, %d18, %d56
1796 faligndata %d18, %d20, %d58
1797 faligndata %d20, %d22, %d60
1798 faligndata %d22, %d24, %d62
1799 fmovd %d24, %d8
1800 fmovd %d26, %d10
1801 fmovd %d28, %d12
1802 fmovd %d30, %d14
1803 stda %d48, [%i1]ASI_BLK_P
1804 subcc %i3, 64, %i3
1805 add %i1, 64, %i1
1806 bgu,pt %ncc, .bc_unaln_100_loop
1807 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1808 ba .bc_unaln_done
1809 nop
1810
1811 .bc_unaln_011:
1812 ldd [%o4+24], %d6
1813 ldd [%o4+32], %d8
1814 ldd [%o4+40], %d10
1815 ldd [%o4+48], %d12
1816 ldd [%o4+56], %d14
1817 .bc_unaln_011_loop:
1818 add %o4, 64, %o4
1819 ldda [%o4]ASI_BLK_P, %d16
1820 faligndata %d6, %d8, %d48
1821 faligndata %d8, %d10, %d50
1822 faligndata %d10, %d12, %d52
1823 faligndata %d12, %d14, %d54
1824 faligndata %d14, %d16, %d56
1825 faligndata %d16, %d18, %d58
1826 faligndata %d18, %d20, %d60
1827 faligndata %d20, %d22, %d62
1828 fmovd %d22, %d6
1829 fmovd %d24, %d8
1830 fmovd %d26, %d10
1831 fmovd %d28, %d12
1832 fmovd %d30, %d14
1833 stda %d48, [%i1]ASI_BLK_P
1834 subcc %i3, 64, %i3
1835 add %i1, 64, %i1
1836 bgu,pt %ncc, .bc_unaln_011_loop
1837 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1838 ba .bc_unaln_done
1839 nop
1840
1841 .bc_unaln_010:
1842 ldd [%o4+16], %d4
1843 ldd [%o4+24], %d6
1844 ldd [%o4+32], %d8
1845 ldd [%o4+40], %d10
1846 ldd [%o4+48], %d12
1847 ldd [%o4+56], %d14
1848 .bc_unaln_010_loop:
1849 add %o4, 64, %o4
1850 ldda [%o4]ASI_BLK_P, %d16
1851 faligndata %d4, %d6, %d48
1852 faligndata %d6, %d8, %d50
1853 faligndata %d8, %d10, %d52
1854 faligndata %d10, %d12, %d54
1855 faligndata %d12, %d14, %d56
1856 faligndata %d14, %d16, %d58
1857 faligndata %d16, %d18, %d60
1858 faligndata %d18, %d20, %d62
1859 fmovd %d20, %d4
1860 fmovd %d22, %d6
1861 fmovd %d24, %d8
1862 fmovd %d26, %d10
1863 fmovd %d28, %d12
1864 fmovd %d30, %d14
1865 stda %d48, [%i1]ASI_BLK_P
1866 subcc %i3, 64, %i3
1867 add %i1, 64, %i1
1868 bgu,pt %ncc, .bc_unaln_010_loop
1869 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1870 ba .bc_unaln_done
1871 nop
1872
1873 .bc_unaln_001:
1874 ldd [%o4+8], %d2
1875 ldd [%o4+16], %d4
1876 ldd [%o4+24], %d6
1877 ldd [%o4+32], %d8
1878 ldd [%o4+40], %d10
1879 ldd [%o4+48], %d12
1880 ldd [%o4+56], %d14
1881 .bc_unaln_001_loop:
1882 add %o4, 64, %o4
1883 ldda [%o4]ASI_BLK_P, %d16
1884 faligndata %d2, %d4, %d48
1885 faligndata %d4, %d6, %d50
1886 faligndata %d6, %d8, %d52
1887 faligndata %d8, %d10, %d54
1888 faligndata %d10, %d12, %d56
1889 faligndata %d12, %d14, %d58
1890 faligndata %d14, %d16, %d60
1891 faligndata %d16, %d18, %d62
1892 fmovd %d18, %d2
1893 fmovd %d20, %d4
1894 fmovd %d22, %d6
1895 fmovd %d24, %d8
1896 fmovd %d26, %d10
1897 fmovd %d28, %d12
1898 fmovd %d30, %d14
1899 stda %d48, [%i1]ASI_BLK_P
1900 subcc %i3, 64, %i3
1901 add %i1, 64, %i1
1902 bgu,pt %ncc, .bc_unaln_001_loop
1903 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1904 ba .bc_unaln_done
1905 nop
1906
1907 .bc_unaln_000:
1908 ldda [%o4]ASI_BLK_P, %d0
1909 .bc_unaln_000_loop:
1910 add %o4, 64, %o4
1911 ldda [%o4]ASI_BLK_P, %d16
1912 faligndata %d0, %d2, %d48
1913 faligndata %d2, %d4, %d50
1914 faligndata %d4, %d6, %d52
1915 faligndata %d6, %d8, %d54
1916 faligndata %d8, %d10, %d56
1917 faligndata %d10, %d12, %d58
1918 faligndata %d12, %d14, %d60
1919 faligndata %d14, %d16, %d62
1920 fmovd %d16, %d0
1921 fmovd %d18, %d2
1922 fmovd %d20, %d4
1923 fmovd %d22, %d6
1924 fmovd %d24, %d8
1925 fmovd %d26, %d10
1926 fmovd %d28, %d12
1927 fmovd %d30, %d14
1928 stda %d48, [%i1]ASI_BLK_P
1929 subcc %i3, 64, %i3
1930 add %i1, 64, %i1
1931 bgu,pt %ncc, .bc_unaln_000_loop
1932 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1933
1934 .bc_unaln_done:
1935 ! Handle trailing bytes, 64 to 127
1936 ! Dest long word aligned, Src not long word aligned
1937 cmp %i2, 15
1938 bleu %ncc, .bc_unaln_short
1939
1940 andn %i2, 0x7, %i3 ! %i3 is multiple of 8
1941 and %i2, 0x7, %i2 ! residue bytes in %i2
1942 add %i2, 8, %i2
1943 sub %i3, 8, %i3 ! insure we don't load past end of src
1944 andn %i0, 0x7, %o4 ! %o4 has long word aligned src address
1945 add %i0, %i3, %i0 ! advance %i0 to after multiple of 8
1946 ldd [%o4], %d0 ! fetch partial word
1947 .bc_unaln_by8:
1948 ldd [%o4+8], %d2
1949 add %o4, 8, %o4
1950 faligndata %d0, %d2, %d16
1951 subcc %i3, 8, %i3
1952 std %d16, [%i1]
1953 fmovd %d2, %d0
1954 bgu,pt %ncc, .bc_unaln_by8
1955 add %i1, 8, %i1
1956
1957 .bc_unaln_short:
1958 cmp %i2, 8
1959 blt,pt %ncc, .bc_unalnfin
1960 nop
1961 ldub [%i0], %o4
1962 sll %o4, 24, %o3
1963 ldub [%i0+1], %o4
1964 sll %o4, 16, %o4
1965 or %o4, %o3, %o3
1966 ldub [%i0+2], %o4
1967 sll %o4, 8, %o4
1968 or %o4, %o3, %o3
1969 ldub [%i0+3], %o4
1970 or %o4, %o3, %o3
1971 stw %o3, [%i1]
1972 ldub [%i0+4], %o4
1973 sll %o4, 24, %o3
1974 ldub [%i0+5], %o4
1975 sll %o4, 16, %o4
1976 or %o4, %o3, %o3
1977 ldub [%i0+6], %o4
1978 sll %o4, 8, %o4
1979 or %o4, %o3, %o3
1980 ldub [%i0+7], %o4
1981 or %o4, %o3, %o3
1982 stw %o3, [%i1+4]
1983 add %i0, 8, %i0
1984 add %i1, 8, %i1
1985 sub %i2, 8, %i2
1986 .bc_unalnfin:
1987 cmp %i2, 4
1988 blt,pt %ncc, .bc_unalnz
1989 tst %i2
1990 ldub [%i0], %o3 ! read byte
1991 subcc %i2, 4, %i2 ! reduce count by 4
1992 sll %o3, 24, %o3 ! position
1993 ldub [%i0+1], %o4
1994 sll %o4, 16, %o4 ! position
1995 or %o4, %o3, %o3 ! merge
1996 ldub [%i0+2], %o4
1997 sll %o4, 8, %o4 ! position
1998 or %o4, %o3, %o3 ! merge
1999 add %i1, 4, %i1 ! advance dst by 4
2000 ldub [%i0+3], %o4
2001 add %i0, 4, %i0 ! advance src by 4
2002 or %o4, %o3, %o4 ! merge
2003 bnz,pt %ncc, .bc_unaln3x
2004 stw %o4, [%i1-4]
2005 ba .bc_exit
2006 nop
2007 .bc_unalnz:
2008 bz,pt %ncc, .bc_exit
2009 .bc_unaln3x: ! Exactly 1, 2, or 3 bytes remain
2010 subcc %i2, 1, %i2 ! reduce count for cc test
2011 ldub [%i0], %o4 ! load one byte
2012 bz,pt %ncc, .bc_exit
2013 stb %o4, [%i1] ! store one byte
2014 ldub [%i0+1], %o4 ! load second byte
2015 subcc %i2, 1, %i2
2016 bz,pt %ncc, .bc_exit
2017 stb %o4, [%i1+1] ! store second byte
2018 ldub [%i0+2], %o4 ! load third byte
2019 stb %o4, [%i1+2] ! store third byte
2020 .bc_exit:
2021 wr %l5, %g0, %gsr ! restore %gsr
2022 brnz %g5, .bc_fp_restore
2023 and %o5, COPY_FLAGS, %l1 ! save flags in %l1
2024 FZERO
2025 wr %g5, %g0, %fprs
2026 ba,pt %ncc, .bc_ex2
2027 nop
2028 .bc_fp_restore:
2029 BLD_FP_FROMSTACK(%o4)
2030 .bc_ex2:
2031 ldn [THREAD_REG + T_LWP], %o2
2032 brnz,pt %o2, 1f
2033 nop
2034
2035 ldsb [THREAD_REG + T_PREEMPT], %l0
2036 deccc %l0
2037 bnz,pn %ncc, 1f
2038 stb %l0, [THREAD_REG + T_PREEMPT]
2039
2040 ! Check for a kernel preemption request
2041 ldn [THREAD_REG + T_CPU], %l0
2042 ldub [%l0 + CPU_KPRUNRUN], %l0
2043 brnz,a,pt %l0, 1f ! Need to call kpreempt?
2044 or %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag
2045 1:
2046 btst LOFAULT_SET, %l1
2047 bz,pn %icc, 3f
2048 andncc %o5, COPY_FLAGS, %o5
2049 ! Here via bcopy. Check to see if the handler was NULL.
2050 ! If so, just return quietly. Otherwise, reset the
2051 ! handler and return.
2052 bz,pn %ncc, 2f
2053 nop
2054 membar #Sync
2055 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2056 2:
2057 btst KPREEMPT_FLAG, %l1
2058 bz,pt %icc, 3f
2059 nop
2060 call kpreempt
2061 rdpr %pil, %o0 ! pass %pil
2062 3:
2063 ret
2064 restore %g0, 0, %o0
2065
2066 SET_SIZE(bcopy_more)
2067
2068
2069 #else /* NIAGARA_IMPL */
2070 save %sp, -SA(MINFRAME), %sp
2071 clr %o5 ! flag LOFAULT_SET is not set for bcopy
2072 .do_copy:
2073 cmp %i2, 12 ! for small counts
2074 blu %ncc, .bytecp ! just copy bytes
2075 .empty
2076
2077 cmp %i2, 128 ! for less than 128 bytes
2078 blu,pn %ncc, .bcb_punt ! no block st/quad ld
2079 nop
2080
2081 set use_hw_bcopy, %o2
2082 ld [%o2], %o2
2083 brz,pn %o2, .bcb_punt
2084 nop
2085
2086 subcc %i1, %i0, %i3
2087 bneg,a,pn %ncc, 1f
2088 neg %i3
2089 1:
2090 /*
2091 * Compare against 256 since we should be checking block addresses
2092 * and (dest & ~63) - (src & ~63) can be 3 blocks even if
2093 * src = dest + (64 * 3) + 63.
2094 */
2095 cmp %i3, 256
2096 blu,pn %ncc, .bcb_punt
2097 nop
2098
2099 /*
2100 * Copy that reach here have at least 2 blocks of data to copy.
2101 */
2102 .do_blockcopy:
2103 ! Swap src/dst since the code below is memcpy code
2104 ! and memcpy/bcopy have different calling sequences
2105 mov %i1, %i5
2106 mov %i0, %i1
2107 mov %i5, %i0
2108
2109 ! Block (64 bytes) align the destination.
2110 andcc %i0, 0x3f, %i3 ! is dst aligned on a 64 bytes
2111 bz %xcc, .chksrc ! dst is already double aligned
2112 sub %i3, 0x40, %i3
2113 neg %i3 ! bytes till dst 64 bytes aligned
2114 sub %i2, %i3, %i2 ! update i2 with new count
2115
2116 ! Based on source and destination alignment do
2117 ! either 8 bytes, 4 bytes, 2 bytes or byte copy.
2118
2119 ! Is dst & src 8B aligned
2120 or %i0, %i1, %o2
2121 andcc %o2, 0x7, %g0
2122 bz %ncc, .alewdcp
2123 nop
2124
2125 ! Is dst & src 4B aligned
2126 andcc %o2, 0x3, %g0
2127 bz %ncc, .alwdcp
2128 nop
2129
2130 ! Is dst & src 2B aligned
2131 andcc %o2, 0x1, %g0
2132 bz %ncc, .alhlfwdcp
2133 nop
2134
2135 ! 1B aligned
2136 1: ldub [%i1], %o2
2137 stb %o2, [%i0]
2138 inc %i1
2139 deccc %i3
2140 bgu,pt %ncc, 1b
2141 inc %i0
2142
2143 ba .chksrc
2144 nop
2145
2146 ! dst & src 4B aligned
2147 .alwdcp:
2148 ld [%i1], %o2
2149 st %o2, [%i0]
2150 add %i1, 0x4, %i1
2151 subcc %i3, 0x4, %i3
2152 bgu,pt %ncc, .alwdcp
2153 add %i0, 0x4, %i0
2154
2155 ba .chksrc
2156 nop
2157
2158 ! dst & src 2B aligned
2159 .alhlfwdcp:
2160 lduh [%i1], %o2
2161 stuh %o2, [%i0]
2162 add %i1, 0x2, %i1
2163 subcc %i3, 0x2, %i3
2164 bgu,pt %ncc, .alhlfwdcp
2165 add %i0, 0x2, %i0
2166
2167 ba .chksrc
2168 nop
2169
2170 ! dst & src 8B aligned
2171 .alewdcp:
2172 ldx [%i1], %o2
2173 stx %o2, [%i0]
2174 add %i1, 0x8, %i1
2175 subcc %i3, 0x8, %i3
2176 bgu,pt %ncc, .alewdcp
2177 add %i0, 0x8, %i0
2178
2179 ! Now Destination is block (64 bytes) aligned
2180 .chksrc:
2181 andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size
2182 sub %i2, %i3, %i2 ! Residue bytes in %i2
2183
2184 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
2185
2186 andcc %i1, 0xf, %o2 ! is src quadword aligned
2187 bz,pn %xcc, .blkcpy ! src offset in %o2
2188 nop
2189 cmp %o2, 0x8
2190 bg .cpy_upper_double
2191 nop
2192 bl .cpy_lower_double
2193 nop
2194
2195 ! Falls through when source offset is equal to 8 i.e.
2196 ! source is double word aligned.
2197 ! In this case no shift/merge of data is required
2198 sub %i1, %o2, %i1 ! align the src at 16 bytes.
2199 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
2200 prefetch [%l0+0x0], #one_read
2201 ldda [%i1+0x0]%asi, %l2
2202 loop0:
2203 ldda [%i1+0x10]%asi, %l4
2204 prefetch [%l0+0x40], #one_read
2205
2206 stxa %l3, [%i0+0x0]%asi
2207 stxa %l4, [%i0+0x8]%asi
2208
2209 ldda [%i1+0x20]%asi, %l2
2210 stxa %l5, [%i0+0x10]%asi
2211 stxa %l2, [%i0+0x18]%asi
2212
2213 ldda [%i1+0x30]%asi, %l4
2214 stxa %l3, [%i0+0x20]%asi
2215 stxa %l4, [%i0+0x28]%asi
2216
2217 ldda [%i1+0x40]%asi, %l2
2218 stxa %l5, [%i0+0x30]%asi
2219 stxa %l2, [%i0+0x38]%asi
2220
2221 add %l0, 0x40, %l0
2222 add %i1, 0x40, %i1
2223 subcc %i3, 0x40, %i3
2224 bgu,pt %xcc, loop0
2225 add %i0, 0x40, %i0
2226 ba .blkdone
2227 add %i1, %o2, %i1 ! increment the source by src offset
2228 ! the src offset was stored in %o2
2229
2230 .cpy_lower_double:
2231 sub %i1, %o2, %i1 ! align the src at 16 bytes.
2232 sll %o2, 3, %o0 ! %o0 left shift
2233 mov 0x40, %o1
2234 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
2235 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
2236 prefetch [%l0+0x0], #one_read
2237 ldda [%i1+0x0]%asi, %l2 ! partial data in %l2 and %l3 has
2238 ! complete data
2239 loop1:
2240 ldda [%i1+0x10]%asi, %l4 ! %l4 has partial data for this read.
2241 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4
2242 ! into %l2 and %l3
2243 prefetch [%l0+0x40], #one_read
2244 stxa %l2, [%i0+0x0]%asi
2245 stxa %l3, [%i0+0x8]%asi
2246
2247 ldda [%i1+0x20]%asi, %l2
2248 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and
2249 stxa %l4, [%i0+0x10]%asi ! %l4 from previous read
2250 stxa %l5, [%i0+0x18]%asi ! into %l4 and %l5
2251
2252 ! Repeat the same for next 32 bytes.
2253
2254 ldda [%i1+0x30]%asi, %l4
2255 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
2256 stxa %l2, [%i0+0x20]%asi
2257 stxa %l3, [%i0+0x28]%asi
2258
2259 ldda [%i1+0x40]%asi, %l2
2260 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
2261 stxa %l4, [%i0+0x30]%asi
2262 stxa %l5, [%i0+0x38]%asi
2263
2264 add %l0, 0x40, %l0
2265 add %i1, 0x40, %i1
2266 subcc %i3, 0x40, %i3
2267 bgu,pt %xcc, loop1
2268 add %i0, 0x40, %i0
2269 ba .blkdone
2270 add %i1, %o2, %i1 ! increment the source by src offset
2271 ! the src offset was stored in %o2
2272
2273 .cpy_upper_double:
2274 sub %i1, %o2, %i1 ! align the src at 16 bytes.
2275 mov 0x8, %o0
2276 sub %o2, %o0, %o0
2277 sll %o0, 3, %o0 ! %o0 left shift
2278 mov 0x40, %o1
2279 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
2280 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
2281 prefetch [%l0+0x0], #one_read
2282 ldda [%i1+0x0]%asi, %l2 ! partial data in %l3 for this read and
2283 ! no data in %l2
2284 loop2:
2285 ldda [%i1+0x10]%asi, %l4 ! %l4 has complete data and %l5 has
2286 ! partial
2287 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5
2288 ! into %l3 and %l4
2289 prefetch [%l0+0x40], #one_read
2290 stxa %l3, [%i0+0x0]%asi
2291 stxa %l4, [%i0+0x8]%asi
2292
2293 ldda [%i1+0x20]%asi, %l2
2294 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with
2295 stxa %l5, [%i0+0x10]%asi ! %l5 from previous read
2296 stxa %l2, [%i0+0x18]%asi ! into %l5 and %l2
2297
2298 ! Repeat the same for next 32 bytes.
2299
2300 ldda [%i1+0x30]%asi, %l4
2301 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
2302 stxa %l3, [%i0+0x20]%asi
2303 stxa %l4, [%i0+0x28]%asi
2304
2305 ldda [%i1+0x40]%asi, %l2
2306 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
2307 stxa %l5, [%i0+0x30]%asi
2308 stxa %l2, [%i0+0x38]%asi
2309
2310 add %l0, 0x40, %l0
2311 add %i1, 0x40, %i1
2312 subcc %i3, 0x40, %i3
2313 bgu,pt %xcc, loop2
2314 add %i0, 0x40, %i0
2315 ba .blkdone
2316 add %i1, %o2, %i1 ! increment the source by src offset
2317 ! the src offset was stored in %o2
2318
2319
2320 ! Both Source and Destination are block aligned.
2321 ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
2322 .blkcpy:
2323 prefetch [%i1+0x0], #one_read
2324 1:
2325 ldda [%i1+0x0]%asi, %l0
2326 ldda [%i1+0x10]%asi, %l2
2327 prefetch [%i1+0x40], #one_read
2328
2329 stxa %l0, [%i0+0x0]%asi
2330 ldda [%i1+0x20]%asi, %l4
2331 ldda [%i1+0x30]%asi, %l6
2332
2333 stxa %l1, [%i0+0x8]%asi
2334 stxa %l2, [%i0+0x10]%asi
2335 stxa %l3, [%i0+0x18]%asi
2336 stxa %l4, [%i0+0x20]%asi
2337 stxa %l5, [%i0+0x28]%asi
2338 stxa %l6, [%i0+0x30]%asi
2339 stxa %l7, [%i0+0x38]%asi
2340
2341 add %i1, 0x40, %i1
2342 subcc %i3, 0x40, %i3
2343 bgu,pt %xcc, 1b
2344 add %i0, 0x40, %i0
2345
2346 .blkdone:
2347 membar #Sync
2348
2349 brz,pt %i2, .blkexit
2350 nop
2351
2352 ! Handle trailing bytes
2353 cmp %i2, 0x8
2354 blu,pt %ncc, .residue
2355 nop
2356
2357 ! Can we do some 8B ops
2358 or %i1, %i0, %o2
2359 andcc %o2, 0x7, %g0
2360 bnz %ncc, .last4
2361 nop
2362
2363 ! Do 8byte ops as long as possible
2364 .last8:
2365 ldx [%i1], %o2
2366 stx %o2, [%i0]
2367 add %i1, 0x8, %i1
2368 sub %i2, 0x8, %i2
2369 cmp %i2, 0x8
2370 bgu,pt %ncc, .last8
2371 add %i0, 0x8, %i0
2372
2373 brz,pt %i2, .blkexit
2374 nop
2375
2376 ba .residue
2377 nop
2378
2379 .last4:
2380 ! Can we do 4B ops
2381 andcc %o2, 0x3, %g0
2382 bnz %ncc, .last2
2383 nop
2384 1:
2385 ld [%i1], %o2
2386 st %o2, [%i0]
2387 add %i1, 0x4, %i1
2388 sub %i2, 0x4, %i2
2389 cmp %i2, 0x4
2390 bgu,pt %ncc, 1b
2391 add %i0, 0x4, %i0
2392
2393 brz,pt %i2, .blkexit
2394 nop
2395
2396 ba .residue
2397 nop
2398
2399 .last2:
2400 ! Can we do 2B ops
2401 andcc %o2, 0x1, %g0
2402 bnz %ncc, .residue
2403 nop
2404
2405 1:
2406 lduh [%i1], %o2
2407 stuh %o2, [%i0]
2408 add %i1, 0x2, %i1
2409 sub %i2, 0x2, %i2
2410 cmp %i2, 0x2
2411 bgu,pt %ncc, 1b
2412 add %i0, 0x2, %i0
2413
2414 brz,pt %i2, .blkexit
2415 nop
2416
2417 .residue:
2418 ldub [%i1], %o2
2419 stb %o2, [%i0]
2420 inc %i1
2421 deccc %i2
2422 bgu,pt %ncc, .residue
2423 inc %i0
2424
2425 .blkexit:
2426
2427 membar #Sync ! sync error barrier
2428 ! Restore t_lofault handler, if came here from kcopy().
2429 tst %o5
2430 bz %ncc, 1f
2431 andn %o5, LOFAULT_SET, %o5
2432 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2433 1:
2434 ret
2435 restore %g0, 0, %o0
2436
2437
2438 .bcb_punt:
2439 !
2440 ! use aligned transfers where possible
2441 !
2442 xor %i0, %i1, %o4 ! xor from and to address
2443 btst 7, %o4 ! if lower three bits zero
2444 bz .aldoubcp ! can align on double boundary
2445 .empty ! assembler complaints about label
2446
2447 xor %i0, %i1, %o4 ! xor from and to address
2448 btst 3, %o4 ! if lower two bits zero
2449 bz .alwordcp ! can align on word boundary
2450 btst 3, %i0 ! delay slot, from address unaligned?
2451 !
2452 ! use aligned reads and writes where possible
2453 ! this differs from wordcp in that it copes
2454 ! with odd alignment between source and destnation
2455 ! using word reads and writes with the proper shifts
2456 ! in between to align transfers to and from memory
2457 ! i0 - src address, i1 - dest address, i2 - count
2458 ! i3, i4 - tmps for used generating complete word
2459 ! i5 (word to write)
2460 ! l0 size in bits of upper part of source word (US)
2461 ! l1 size in bits of lower part of source word (LS = 32 - US)
2462 ! l2 size in bits of upper part of destination word (UD)
2463 ! l3 size in bits of lower part of destination word (LD = 32 - UD)
2464 ! l4 number of bytes leftover after aligned transfers complete
2465 ! l5 the number 32
2466 !
2467 mov 32, %l5 ! load an oft-needed constant
2468 bz .align_dst_only
2469 btst 3, %i1 ! is destnation address aligned?
2470 clr %i4 ! clear registers used in either case
2471 bz .align_src_only
2472 clr %l0
2473 !
2474 ! both source and destination addresses are unaligned
2475 !
2476 1: ! align source
2477 ldub [%i0], %i3 ! read a byte from source address
2478 add %i0, 1, %i0 ! increment source address
2479 or %i4, %i3, %i4 ! or in with previous bytes (if any)
2480 btst 3, %i0 ! is source aligned?
2481 add %l0, 8, %l0 ! increment size of upper source (US)
2482 bnz,a 1b
2483 sll %i4, 8, %i4 ! make room for next byte
2484
2485 sub %l5, %l0, %l1 ! generate shift left count (LS)
2486 sll %i4, %l1, %i4 ! prepare to get rest
2487 ld [%i0], %i3 ! read a word
2488 add %i0, 4, %i0 ! increment source address
2489 srl %i3, %l0, %i5 ! upper src bits into lower dst bits
2490 or %i4, %i5, %i5 ! merge
2491 mov 24, %l3 ! align destination
2492 1:
2493 srl %i5, %l3, %i4 ! prepare to write a single byte
2494 stb %i4, [%i1] ! write a byte
2495 add %i1, 1, %i1 ! increment destination address
2496 sub %i2, 1, %i2 ! decrement count
2497 btst 3, %i1 ! is destination aligned?
2498 bnz,a 1b
2499 sub %l3, 8, %l3 ! delay slot, decrement shift count (LD)
2500 sub %l5, %l3, %l2 ! generate shift left count (UD)
2501 sll %i5, %l2, %i5 ! move leftover into upper bytes
2502 cmp %l2, %l0 ! cmp # reqd to fill dst w old src left
2503 bgu %ncc, .more_needed ! need more to fill than we have
2504 nop
2505
2506 sll %i3, %l1, %i3 ! clear upper used byte(s)
2507 srl %i3, %l1, %i3
2508 ! get the odd bytes between alignments
2509 sub %l0, %l2, %l0 ! regenerate shift count
2510 sub %l5, %l0, %l1 ! generate new shift left count (LS)
2511 and %i2, 3, %l4 ! must do remaining bytes if count%4 > 0
2512 andn %i2, 3, %i2 ! # of aligned bytes that can be moved
2513 srl %i3, %l0, %i4
2514 or %i5, %i4, %i5
2515 st %i5, [%i1] ! write a word
2516 subcc %i2, 4, %i2 ! decrement count
2517 bz %ncc, .unalign_out
2518 add %i1, 4, %i1 ! increment destination address
2519
2520 b 2f
2521 sll %i3, %l1, %i5 ! get leftover into upper bits
2522 .more_needed:
2523 sll %i3, %l0, %i3 ! save remaining byte(s)
2524 srl %i3, %l0, %i3
2525 sub %l2, %l0, %l1 ! regenerate shift count
2526 sub %l5, %l1, %l0 ! generate new shift left count
2527 sll %i3, %l1, %i4 ! move to fill empty space
2528 b 3f
2529 or %i5, %i4, %i5 ! merge to complete word
2530 !
2531 ! the source address is aligned and destination is not
2532 !
2533 .align_dst_only:
2534 ld [%i0], %i4 ! read a word
2535 add %i0, 4, %i0 ! increment source address
2536 mov 24, %l0 ! initial shift alignment count
2537 1:
2538 srl %i4, %l0, %i3 ! prepare to write a single byte
2539 stb %i3, [%i1] ! write a byte
2540 add %i1, 1, %i1 ! increment destination address
2541 sub %i2, 1, %i2 ! decrement count
2542 btst 3, %i1 ! is destination aligned?
2543 bnz,a 1b
2544 sub %l0, 8, %l0 ! delay slot, decrement shift count
2545 .xfer:
2546 sub %l5, %l0, %l1 ! generate shift left count
2547 sll %i4, %l1, %i5 ! get leftover
2548 3:
2549 and %i2, 3, %l4 ! must do remaining bytes if count%4 > 0
2550 andn %i2, 3, %i2 ! # of aligned bytes that can be moved
2551 2:
2552 ld [%i0], %i3 ! read a source word
2553 add %i0, 4, %i0 ! increment source address
2554 srl %i3, %l0, %i4 ! upper src bits into lower dst bits
2555 or %i5, %i4, %i5 ! merge with upper dest bits (leftover)
2556 st %i5, [%i1] ! write a destination word
2557 subcc %i2, 4, %i2 ! decrement count
2558 bz %ncc, .unalign_out ! check if done
2559 add %i1, 4, %i1 ! increment destination address
2560 b 2b ! loop
2561 sll %i3, %l1, %i5 ! get leftover
2562 .unalign_out:
2563 tst %l4 ! any bytes leftover?
2564 bz %ncc, .cpdone
2565 .empty ! allow next instruction in delay slot
2566 1:
2567 sub %l0, 8, %l0 ! decrement shift
2568 srl %i3, %l0, %i4 ! upper src byte into lower dst byte
2569 stb %i4, [%i1] ! write a byte
2570 subcc %l4, 1, %l4 ! decrement count
2571 bz %ncc, .cpdone ! done?
2572 add %i1, 1, %i1 ! increment destination
2573 tst %l0 ! any more previously read bytes
2574 bnz %ncc, 1b ! we have leftover bytes
2575 mov %l4, %i2 ! delay slot, mv cnt where dbytecp wants
2576 b .dbytecp ! let dbytecp do the rest
2577 sub %i0, %i1, %i0 ! i0 gets the difference of src and dst
2578 !
2579 ! the destination address is aligned and the source is not
2580 !
2581 .align_src_only:
2582 ldub [%i0], %i3 ! read a byte from source address
2583 add %i0, 1, %i0 ! increment source address
2584 or %i4, %i3, %i4 ! or in with previous bytes (if any)
2585 btst 3, %i0 ! is source aligned?
2586 add %l0, 8, %l0 ! increment shift count (US)
2587 bnz,a .align_src_only
2588 sll %i4, 8, %i4 ! make room for next byte
2589 b,a .xfer
2590 !
2591 ! if from address unaligned for double-word moves,
2592 ! move bytes till it is, if count is < 56 it could take
2593 ! longer to align the thing than to do the transfer
2594 ! in word size chunks right away
2595 !
2596 .aldoubcp:
2597 cmp %i2, 56 ! if count < 56, use wordcp, it takes
2598 blu,a %ncc, .alwordcp ! longer to align doubles than words
2599 mov 3, %o0 ! mask for word alignment
2600 call .alignit ! copy bytes until aligned
2601 mov 7, %o0 ! mask for double alignment
2602 !
2603 ! source and destination are now double-word aligned
2604 ! i3 has aligned count returned by alignit
2605 !
2606 and %i2, 7, %i2 ! unaligned leftover count
2607 sub %i0, %i1, %i0 ! i0 gets the difference of src and dst
2608 5:
2609 ldx [%i0+%i1], %o4 ! read from address
2610 stx %o4, [%i1] ! write at destination address
2611 subcc %i3, 8, %i3 ! dec count
2612 bgu %ncc, 5b
2613 add %i1, 8, %i1 ! delay slot, inc to address
2614 cmp %i2, 4 ! see if we can copy a word
2615 blu %ncc, .dbytecp ! if 3 or less bytes use bytecp
2616 .empty
2617 !
2618 ! for leftover bytes we fall into wordcp, if needed
2619 !
2620 .wordcp:
2621 and %i2, 3, %i2 ! unaligned leftover count
2622 5:
2623 ld [%i0+%i1], %o4 ! read from address
2624 st %o4, [%i1] ! write at destination address
2625 subcc %i3, 4, %i3 ! dec count
2626 bgu %ncc, 5b
2627 add %i1, 4, %i1 ! delay slot, inc to address
2628 b,a .dbytecp
2629
2630 ! we come here to align copies on word boundaries
2631 .alwordcp:
2632 call .alignit ! go word-align it
2633 mov 3, %o0 ! bits that must be zero to be aligned
2634 b .wordcp
2635 sub %i0, %i1, %i0 ! i0 gets the difference of src and dst
2636
2637 !
2638 ! byte copy, works with any alignment
2639 !
2640 .bytecp:
2641 b .dbytecp
2642 sub %i0, %i1, %i0 ! i0 gets difference of src and dst
2643
2644 !
2645 ! differenced byte copy, works with any alignment
2646 ! assumes dest in %i1 and (source - dest) in %i0
2647 !
2648 1:
2649 stb %o4, [%i1] ! write to address
2650 inc %i1 ! inc to address
2651 .dbytecp:
2652 deccc %i2 ! dec count
2653 bgeu,a %ncc, 1b ! loop till done
2654 ldub [%i0+%i1], %o4 ! read from address
2655 .cpdone:
2656
2657 membar #Sync ! sync error barrier
2658 ! Restore t_lofault handler, if came here from kcopy().
2659 tst %o5
2660 bz %ncc, 1f
2661 andn %o5, LOFAULT_SET, %o5
2662 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2663 1:
2664 ret
2665 restore %g0, 0, %o0 ! return (0)
2666
2667 /*
2668 * Common code used to align transfers on word and doubleword
2669 * boundaries. Aligns source and destination and returns a count
2670 * of aligned bytes to transfer in %i3
2671 */
2672 1:
2673 inc %i0 ! inc from
2674 stb %o4, [%i1] ! write a byte
2675 inc %i1 ! inc to
2676 dec %i2 ! dec count
2677 .alignit:
2678 btst %o0, %i0 ! %o0 is bit mask to check for alignment
2679 bnz,a 1b
2680 ldub [%i0], %o4 ! read next byte
2681
2682 retl
2683 andn %i2, %o0, %i3 ! return size of aligned bytes
2684
2685 SET_SIZE(bcopy)
2686
2687 #endif /* NIAGARA_IMPL */
2688
2689 #endif /* lint */
2690
2691 /*
2692 * Block copy with possibly overlapped operands.
2693 */
2694
2695 #if defined(lint)
2696
2697 /*ARGSUSED*/
2698 void
2699 ovbcopy(const void *from, void *to, size_t count)
2700 {}
2701
2702 #else /* lint */
2703
2704 ENTRY(ovbcopy)
2705 tst %o2 ! check count
2706 bgu,a %ncc, 1f ! nothing to do or bad arguments
2707 subcc %o0, %o1, %o3 ! difference of from and to address
2708
2709 retl ! return
2710 nop
2711 1:
2712 bneg,a %ncc, 2f
2713 neg %o3 ! if < 0, make it positive
2714 2: cmp %o2, %o3 ! cmp size and abs(from - to)
2715 bleu %ncc, bcopy ! if size <= abs(diff): use bcopy,
2716 .empty ! no overlap
2717 cmp %o0, %o1 ! compare from and to addresses
2718 blu %ncc, .ov_bkwd ! if from < to, copy backwards
2719 nop
2720 !
2721 ! Copy forwards.
2722 !
2723 .ov_fwd:
2724 ldub [%o0], %o3 ! read from address
2725 inc %o0 ! inc from address
2726 stb %o3, [%o1] ! write to address
2727 deccc %o2 ! dec count
2728 bgu %ncc, .ov_fwd ! loop till done
2729 inc %o1 ! inc to address
2730
2731 retl ! return
2732 nop
2733 !
2734 ! Copy backwards.
2735 !
2736 .ov_bkwd:
2737 deccc %o2 ! dec count
2738 ldub [%o0 + %o2], %o3 ! get byte at end of src
2739 bgu %ncc, .ov_bkwd ! loop till done
2740 stb %o3, [%o1 + %o2] ! delay slot, store at end of dst
2741
2742 retl ! return
2743 nop
2744 SET_SIZE(ovbcopy)
2745
2746 #endif /* lint */
2747
2748 /*
2749 * hwblkpagecopy()
2750 *
2751 * Copies exactly one page. This routine assumes the caller (ppcopy)
2752 * has already disabled kernel preemption and has checked
2753 * use_hw_bcopy.
2754 */
2755 #ifdef lint
2756 /*ARGSUSED*/
2757 void
2758 hwblkpagecopy(const void *src, void *dst)
2759 { }
2760 #else /* lint */
2761 ENTRY(hwblkpagecopy)
2762 save %sp, -SA(MINFRAME), %sp
2763
2764 ! %i0 - source address (arg)
2765 ! %i1 - destination address (arg)
2766 ! %i2 - length of region (not arg)
2767
2768 set PAGESIZE, %i2
2769
2770 /*
2771 * Copying exactly one page and PAGESIZE is in mutliple of 0x80.
2772 */
2773 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
2774 prefetch [%i0+0x0], #one_read
2775 prefetch [%i0+0x40], #one_read
2776 1:
2777 prefetch [%i0+0x80], #one_read
2778 prefetch [%i0+0xc0], #one_read
2779 ldda [%i0+0x0]%asi, %l0
2780 ldda [%i0+0x10]%asi, %l2
2781 ldda [%i0+0x20]%asi, %l4
2782 ldda [%i0+0x30]%asi, %l6
2783 stxa %l0, [%i1+0x0]%asi
2784 stxa %l1, [%i1+0x8]%asi
2785 stxa %l2, [%i1+0x10]%asi
2786 stxa %l3, [%i1+0x18]%asi
2787 stxa %l4, [%i1+0x20]%asi
2788 stxa %l5, [%i1+0x28]%asi
2789 stxa %l6, [%i1+0x30]%asi
2790 stxa %l7, [%i1+0x38]%asi
2791 ldda [%i0+0x40]%asi, %l0
2792 ldda [%i0+0x50]%asi, %l2
2793 ldda [%i0+0x60]%asi, %l4
2794 ldda [%i0+0x70]%asi, %l6
2795 stxa %l0, [%i1+0x40]%asi
2796 stxa %l1, [%i1+0x48]%asi
2797 stxa %l2, [%i1+0x50]%asi
2798 stxa %l3, [%i1+0x58]%asi
2799 stxa %l4, [%i1+0x60]%asi
2800 stxa %l5, [%i1+0x68]%asi
2801 stxa %l6, [%i1+0x70]%asi
2802 stxa %l7, [%i1+0x78]%asi
2803
2804 add %i0, 0x80, %i0
2805 subcc %i2, 0x80, %i2
2806 bgu,pt %xcc, 1b
2807 add %i1, 0x80, %i1
2808
2809 membar #Sync
2810 ret
2811 restore %g0, 0, %o0
2812 SET_SIZE(hwblkpagecopy)
2813 #endif /* lint */
2814
2815
2816 /*
2817 * Transfer data to and from user space -
2818 * Note that these routines can cause faults
2819 * It is assumed that the kernel has nothing at
2820 * less than KERNELBASE in the virtual address space.
2821 *
2822 * Note that copyin(9F) and copyout(9F) are part of the
2823 * DDI/DKI which specifies that they return '-1' on "errors."
2824 *
2825 * Sigh.
2826 *
2827 * So there's two extremely similar routines - xcopyin() and xcopyout()
2828 * which return the errno that we've faithfully computed. This
2829 * allows other callers (e.g. uiomove(9F)) to work correctly.
2830 * Given that these are used pretty heavily, we expand the calling
2831 * sequences inline for all flavours (rather than making wrappers).
2832 *
2833 * There are also stub routines for xcopyout_little and xcopyin_little,
2834 * which currently are intended to handle requests of <= 16 bytes from
2835 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
2836 * is left as an exercise...
2837 */
2838
2839 /*
2840 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
2841 *
2842 * General theory of operation:
2843 *
2844 * None of the copyops routines grab a window until it's decided that
2845 * we need to do a HW block copy operation. This saves a window
2846 * spill/fill when we're called during socket ops. The typical IO
2847 * path won't cause spill/fill traps.
2848 *
2849 * This code uses a set of 4 limits for the maximum size that will
2850 * be copied given a particular input/output address alignment.
2851 * the default limits are:
2852 *
2853 * single byte aligned - 256 (hw_copy_limit_1)
2854 * two byte aligned - 512 (hw_copy_limit_2)
2855 * four byte aligned - 1024 (hw_copy_limit_4)
2856 * eight byte aligned - 1024 (hw_copy_limit_8)
2857 *
2858 * If the value for a particular limit is zero, the copy will be done
2859 * via the copy loops rather than block store/quad load instructions.
2860 *
2861 * Flow:
2862 *
2863 * If count == zero return zero.
2864 *
2865 * Store the previous lo_fault handler into %g6.
2866 * Place our secondary lofault handler into %g5.
2867 * Place the address of our nowindow fault handler into %o3.
2868 * Place the address of the windowed fault handler into %o4.
2869 * --> We'll use this handler if we end up grabbing a window
2870 * --> before we use block initializing store and quad load ASIs
2871 *
2872 * If count is less than or equal to SMALL_LIMIT (7) we
2873 * always do a byte for byte copy.
2874 *
2875 * If count is > SMALL_LIMIT, we check the alignment of the input
2876 * and output pointers. Based on the alignment we check count
2877 * against a limit based on detected alignment. If we exceed the
2878 * alignment value we copy via block initializing store and quad
2879 * load instructions.
2880 *
2881 * If we don't exceed one of the limits, we store -count in %o3,
2882 * we store the number of chunks (8, 4, 2 or 1 byte) operated
2883 * on in our basic copy loop in %o2. Following this we branch
2884 * to the appropriate copy loop and copy that many chunks.
2885 * Since we've been adding the chunk size to %o3 each time through
2886 * as well as decrementing %o2, we can tell if any data is
2887 * is left to be copied by examining %o3. If that is zero, we're
2888 * done and can go home. If not, we figure out what the largest
2889 * chunk size left to be copied is and branch to that copy loop
2890 * unless there's only one byte left. We load that as we're
2891 * branching to code that stores it just before we return.
2892 *
2893 * Fault handlers are invoked if we reference memory that has no
2894 * current mapping. All forms share the same copyio_fault handler.
2895 * This routine handles fixing up the stack and general housecleaning.
2896 * Each copy operation has a simple fault handler that is then called
2897 * to do the work specific to the invidual operation. The handler
2898 * for copyOP and xcopyOP are found at the end of individual function.
2899 * The handlers for xcopyOP_little are found at the end of xcopyin_little.
2900 * The handlers for copyOP_noerr are found at the end of copyin_noerr.
2901 */
2902
2903 /*
2904 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
2905 */
2906
2907 #if defined(lint)
2908
2909 /*ARGSUSED*/
2910 int
2911 copyout(const void *kaddr, void *uaddr, size_t count)
2912 { return (0); }
2913
2914 #else /* lint */
2915
2916 /*
2917 * We save the arguments in the following registers in case of a fault:
2918 * kaddr - %g2
2919 * uaddr - %g3
2920 * count - %g4
2921 */
2922 #define SAVE_SRC %g2
2923 #define SAVE_DST %g3
2924 #define SAVE_COUNT %g4
2925
2926 #define REAL_LOFAULT %g5
2927 #define SAVED_LOFAULT %g6
2928
2929 /*
2930 * Generic copyio fault handler. This is the first line of defense when a
2931 * fault occurs in (x)copyin/(x)copyout. In order for this to function
2932 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
2933 * This allows us to share common code for all the flavors of the copy
2934 * operations, including the _noerr versions.
2935 *
2936 * Note that this function will restore the original input parameters before
2937 * calling REAL_LOFAULT. So the real handler can vector to the appropriate
2938 * member of the t_copyop structure, if needed.
2939 */
2940 ENTRY(copyio_fault)
2941 #if !defined(NIAGARA_IMPL)
2942 btst FPUSED_FLAG, SAVED_LOFAULT
2943 bz 1f
2944 andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
2945
2946 wr %l5, 0, %gsr ! restore gsr
2947
2948 btst FPRS_FEF, %g1
2949 bz %icc, 4f
2950 nop
2951
2952 ! restore fpregs from stack
2953 BLD_FP_FROMSTACK(%o2)
2954
2955 ba,pt %ncc, 1f
2956 nop
2957 4:
2958 FZERO ! zero all of the fpregs
2959 wr %g1, %g0, %fprs ! restore fprs
2960 1:
2961 restore
2962 mov SAVE_SRC, %o0
2963 mov SAVE_DST, %o1
2964 jmp REAL_LOFAULT
2965 mov SAVE_COUNT, %o2
2966
2967 #else /* NIAGARA_IMPL */
2968 membar #Sync
2969 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2970 restore
2971 mov SAVE_SRC, %o0
2972 mov SAVE_DST, %o1
2973 jmp REAL_LOFAULT
2974 mov SAVE_COUNT, %o2
2975
2976 #endif /* NIAGARA_IMPL */
2977
2978 SET_SIZE(copyio_fault)
2979
2980 ENTRY(copyio_fault_nowindow)
2981 membar #Sync
2982 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2983
2984 mov SAVE_SRC, %o0
2985 mov SAVE_DST, %o1
2986 jmp REAL_LOFAULT
2987 mov SAVE_COUNT, %o2
2988 SET_SIZE(copyio_fault_nowindow)
2989
2990 ENTRY(copyout)
2991 sethi %hi(.copyout_err), REAL_LOFAULT
2992 or REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT
2993
2994 #if !defined(NIAGARA_IMPL)
2995 .do_copyout:
2996 tst %o2 ! check for zero count; quick exit
2997 bz,pt %ncc, .co_smallqx
2998 mov %o0, SAVE_SRC
2999 mov %o1, SAVE_DST
3000 mov %o2, SAVE_COUNT
3001 cmp %o2, FP_COPY ! check for small copy/leaf case
3002 bgt,pt %ncc, .co_copy_more
3003 ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
3004 /*
3005 * Small copy out code
3006 *
3007 */
3008 sethi %hi(copyio_fault_nowindow), %o3
3009 or %o3, %lo(copyio_fault_nowindow), %o3
3010 membar #Sync
3011 stn %o3, [THREAD_REG + T_LOFAULT]
3012
3013 mov ASI_USER, %asi
3014 cmp %o2, SHORTCOPY ! make sure there is enough to align
3015 ble,pt %ncc, .co_smallest
3016 andcc %o1, 0x7, %o3 ! is dest long word aligned
3017 bnz,pn %ncc, .co_align
3018 andcc %o1, 1, %o3 ! is dest byte aligned
3019
3020 ! Destination is long word aligned
3021 ! 8 cases for src alignment; load parts, store long words
3022 .co_al_src:
3023 andcc %o0, 7, %o3
3024 brnz,pt %o3, .co_src_dst_unal8
3025 nop
3026 /*
3027 * Special case for handling when src and dest are both long word aligned
3028 * and total data to move is less than FP_COPY bytes
3029 * Also handles finish up for large block moves, so may be less than 32 bytes
3030 */
3031 .co_medlong:
3032 subcc %o2, 31, %o2 ! adjust length to allow cc test
3033 ble,pt %ncc, .co_medl31
3034 nop
3035 .co_medl32:
3036 ldx [%o0], %o4 ! move 32 bytes
3037 subcc %o2, 32, %o2 ! decrement length count by 32
3038 stxa %o4, [%o1]%asi
3039 ldx [%o0+8], %o4
3040 stxa %o4, [%o1+8]%asi
3041 ldx [%o0+16], %o4
3042 add %o0, 32, %o0 ! increase src ptr by 32
3043 stxa %o4, [%o1+16]%asi
3044 ldx [%o0-8], %o4
3045 add %o1, 32, %o1 ! increase dst ptr by 32
3046 bgu,pt %ncc, .co_medl32 ! repeat if at least 32 bytes left
3047 stxa %o4, [%o1-8]%asi
3048 .co_medl31:
3049 addcc %o2, 24, %o2 ! adjust count to be off by 7
3050 ble,pt %ncc, .co_medl7 ! skip if 7 or fewer bytes left
3051 nop
3052 .co_medl8:
3053 ldx [%o0], %o4 ! move 8 bytes
3054 add %o0, 8, %o0 ! increase src ptr by 8
3055 subcc %o2, 8, %o2 ! decrease count by 8
3056 add %o1, 8, %o1 ! increase dst ptr by 8
3057 bgu,pt %ncc, .co_medl8
3058 stxa %o4, [%o1-8]%asi
3059 .co_medl7:
3060 addcc %o2, 7, %o2 ! finish adjustment of remaining count
3061 bnz,pt %ncc, .co_small4 ! do final bytes if not finished
3062
3063 .co_smallx: ! finish up and exit
3064 membar #Sync
3065 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3066 .co_smallqx:
3067 retl
3068 mov %g0, %o0
3069
3070 .co_small4:
3071 cmp %o2, 4
3072 blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left
3073 nop !
3074 ld [%o0], %o4 ! move 4 bytes
3075 add %o0, 4, %o0 ! increase src ptr by 4
3076 add %o1, 4, %o1 ! increase dst ptr by 4
3077 subcc %o2, 4, %o2 ! decrease count by 4
3078 bz,pt %ncc, .co_smallx
3079 stwa %o4, [%o1-4]%asi
3080
3081 .co_small3x: ! Exactly 1, 2, or 3 bytes remain
3082 subcc %o2, 1, %o2 ! reduce count for cc test
3083 ldub [%o0], %o4 ! load one byte
3084 bz,pt %ncc, .co_smallx
3085 stba %o4, [%o1]%asi ! store one byte
3086 ldub [%o0+1], %o4 ! load second byte
3087 subcc %o2, 1, %o2
3088 bz,pt %ncc, .co_smallx
3089 stba %o4, [%o1+1]%asi ! store second byte
3090 ldub [%o0+2], %o4 ! load third byte
3091 ba .co_smallx
3092 stba %o4, [%o1+2]%asi ! store third byte
3093
3094 .co_smallest: ! 7 or fewer bytes remain
3095 cmp %o2, 4
3096 blt,pt %ncc, .co_small3x
3097 nop
3098 ldub [%o0], %o4 ! read byte
3099 subcc %o2, 4, %o2 ! reduce count by 4
3100 stba %o4, [%o1]%asi ! write byte
3101 ldub [%o0+1], %o4 ! repeat for total of 4 bytes
3102 add %o0, 4, %o0 ! advance src by 4
3103 stba %o4, [%o1+1]%asi
3104 ldub [%o0-2], %o4
3105 add %o1, 4, %o1 ! advance dst by 4
3106 stba %o4, [%o1-2]%asi
3107 ldub [%o0-1], %o4
3108 bnz,pt %ncc, .co_small3x
3109 stba %o4, [%o1-1]%asi
3110 membar #Sync
3111 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3112 retl
3113 mov %g0, %o0
3114
3115 .co_align: ! byte align test in prior branch delay
3116 bnz,pt %ncc, .co_al_d1
3117 .co_al_d1f: ! dest is now half word aligned
3118 andcc %o1, 2, %o3
3119 bnz,pt %ncc, .co_al_d2
3120 .co_al_d2f: ! dest is now word aligned
3121 andcc %o1, 4, %o3 ! is dest longword aligned?
3122 bz,pt %ncc, .co_al_src
3123 nop
3124 .co_al_d4: ! dest is word aligned; src is unknown
3125 ldub [%o0], %o4 ! move a word (src align unknown)
3126 ldub [%o0+1], %o3
3127 sll %o4, 24, %o4 ! position
3128 sll %o3, 16, %o3 ! position
3129 or %o4, %o3, %o3 ! merge
3130 ldub [%o0+2], %o4
3131 sll %o4, 8, %o4 ! position
3132 or %o4, %o3, %o3 ! merge
3133 ldub [%o0+3], %o4
3134 or %o4, %o3, %o4 ! merge
3135 stwa %o4,[%o1]%asi ! store four bytes
3136 add %o0, 4, %o0 ! adjust src by 4
3137 add %o1, 4, %o1 ! adjust dest by 4
3138 sub %o2, 4, %o2 ! adjust count by 4
3139 andcc %o0, 7, %o3 ! check for src long word alignment
3140 brz,pt %o3, .co_medlong
3141 .co_src_dst_unal8:
3142 ! dst is 8-byte aligned, src is not
3143 ! Size is less than FP_COPY
3144 ! Following code is to select for alignment
3145 andcc %o0, 0x3, %o3 ! test word alignment
3146 bz,pt %ncc, .co_medword
3147 nop
3148 andcc %o0, 0x1, %o3 ! test halfword alignment
3149 bnz,pt %ncc, .co_med_byte ! go to byte move if not halfword
3150 andcc %o0, 0x2, %o3 ! test which byte alignment
3151 ba .co_medhalf
3152 nop
3153 .co_al_d1: ! align dest to half word
3154 ldub [%o0], %o4 ! move a byte
3155 add %o0, 1, %o0
3156 stba %o4, [%o1]%asi
3157 add %o1, 1, %o1
3158 andcc %o1, 2, %o3
3159 bz,pt %ncc, .co_al_d2f
3160 sub %o2, 1, %o2
3161 .co_al_d2: ! align dest to word
3162 ldub [%o0], %o4 ! move a half-word (src align unknown)
3163 ldub [%o0+1], %o3
3164 sll %o4, 8, %o4 ! position
3165 or %o4, %o3, %o4 ! merge
3166 stha %o4, [%o1]%asi
3167 add %o0, 2, %o0
3168 add %o1, 2, %o1
3169 andcc %o1, 4, %o3 ! is dest longword aligned?
3170 bz,pt %ncc, .co_al_src
3171 sub %o2, 2, %o2
3172 ba .co_al_d4
3173 nop
3174 /*
3175 * Handle all cases where src and dest are aligned on word
3176 * boundaries. Use unrolled loops for better performance.
3177 * This option wins over standard large data move when
3178 * source and destination is in cache for medium
3179 * to short data moves.
3180 */
3181 .co_medword:
3182 subcc %o2, 31, %o2 ! adjust length to allow cc test
3183 ble,pt %ncc, .co_medw31
3184 nop
3185 .co_medw32:
3186 ld [%o0], %o4 ! move a block of 32 bytes
3187 stwa %o4, [%o1]%asi
3188 ld [%o0+4], %o4
3189 stwa %o4, [%o1+4]%asi
3190 ld [%o0+8], %o4
3191 stwa %o4, [%o1+8]%asi
3192 ld [%o0+12], %o4
3193 stwa %o4, [%o1+12]%asi
3194 ld [%o0+16], %o4
3195 stwa %o4, [%o1+16]%asi
3196 ld [%o0+20], %o4
3197 subcc %o2, 32, %o2 ! decrement length count
3198 stwa %o4, [%o1+20]%asi
3199 ld [%o0+24], %o4
3200 add %o0, 32, %o0 ! increase src ptr by 32
3201 stwa %o4, [%o1+24]%asi
3202 ld [%o0-4], %o4
3203 add %o1, 32, %o1 ! increase dst ptr by 32
3204 bgu,pt %ncc, .co_medw32 ! repeat if at least 32 bytes left
3205 stwa %o4, [%o1-4]%asi
3206 .co_medw31:
3207 addcc %o2, 24, %o2 ! adjust count to be off by 7
3208 ble,pt %ncc, .co_medw7 ! skip if 7 or fewer bytes left
3209 nop !
3210 .co_medw15:
3211 ld [%o0], %o4 ! move a block of 8 bytes
3212 subcc %o2, 8, %o2 ! decrement length count
3213 stwa %o4, [%o1]%asi
3214 add %o0, 8, %o0 ! increase src ptr by 8
3215 ld [%o0-4], %o4
3216 add %o1, 8, %o1 ! increase dst ptr by 8
3217 bgu,pt %ncc, .co_medw15
3218 stwa %o4, [%o1-4]%asi
3219 .co_medw7:
3220 addcc %o2, 7, %o2 ! finish adjustment of remaining count
3221 bz,pt %ncc, .co_smallx ! exit if finished
3222 cmp %o2, 4
3223 blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left
3224 nop !
3225 ld [%o0], %o4 ! move 4 bytes
3226 add %o0, 4, %o0 ! increase src ptr by 4
3227 add %o1, 4, %o1 ! increase dst ptr by 4
3228 subcc %o2, 4, %o2 ! decrease count by 4
3229 bnz .co_small3x
3230 stwa %o4, [%o1-4]%asi
3231 membar #Sync
3232 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3233 retl
3234 mov %g0, %o0
3235
3236 .co_medhalf:
3237 subcc %o2, 31, %o2 ! adjust length to allow cc test
3238 ble,pt %ncc, .co_medh31
3239 nop
3240 .co_medh32: ! load and store block of 32 bytes
3241
3242 lduh [%o0], %o4 ! move 32 bytes
3243 subcc %o2, 32, %o2 ! decrement length count
3244 lduw [%o0+2], %o3
3245 sllx %o4, 48, %o4
3246 sllx %o3, 16, %o3
3247 or %o4, %o3, %o3
3248 lduh [%o0+6], %o4
3249 or %o4, %o3, %o4
3250 stxa %o4, [%o1]%asi
3251
3252 lduh [%o0+8], %o4
3253 lduw [%o0+10], %o3
3254 sllx %o4, 48, %o4
3255 sllx %o3, 16, %o3
3256 or %o4, %o3, %o3
3257 lduh [%o0+14], %o4
3258 or %o4, %o3, %o4
3259 stxa %o4, [%o1+8]%asi
3260
3261 lduh [%o0+16], %o4
3262 lduw [%o0+18], %o3
3263 sllx %o4, 48, %o4
3264 sllx %o3, 16, %o3
3265 or %o4, %o3, %o3
3266 lduh [%o0+22], %o4
3267 or %o4, %o3, %o4
3268 stxa %o4, [%o1+16]%asi
3269
3270 add %o0, 32, %o0 ! increase src ptr by 32
3271 add %o1, 32, %o1 ! increase dst ptr by 32
3272
3273 lduh [%o0-8], %o4
3274 lduw [%o0-6], %o3
3275 sllx %o4, 48, %o4
3276 sllx %o3, 16, %o3
3277 or %o4, %o3, %o3
3278 lduh [%o0-2], %o4
3279 or %o3, %o4, %o4
3280 bgu,pt %ncc, .co_medh32 ! repeat if at least 32 bytes left
3281 stxa %o4, [%o1-8]%asi
3282
3283 .co_medh31:
3284 addcc %o2, 24, %o2 ! adjust count to be off by 7
3285 ble,pt %ncc, .co_medh7 ! skip if 7 or fewer bytes left
3286 nop !
3287 .co_medh15:
3288 lduh [%o0], %o4 ! move 16 bytes
3289 subcc %o2, 8, %o2 ! decrement length count
3290 lduw [%o0+2], %o3
3291 sllx %o4, 48, %o4
3292 sllx %o3, 16, %o3
3293 or %o4, %o3, %o3
3294 add %o1, 8, %o1 ! increase dst ptr by 8
3295 lduh [%o0+6], %o4
3296 add %o0, 8, %o0 ! increase src ptr by 8
3297 or %o4, %o3, %o4
3298 bgu,pt %ncc, .co_medh15
3299 stxa %o4, [%o1-8]%asi
3300 .co_medh7:
3301 addcc %o2, 7, %o2 ! finish adjustment of remaining count
3302 bz,pt %ncc, .co_smallx ! exit if finished
3303 cmp %o2, 4
3304 blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left
3305 nop !
3306 lduh [%o0], %o4
3307 sll %o4, 16, %o4
3308 lduh [%o0+2], %o3
3309 or %o3, %o4, %o4
3310 subcc %o2, 4, %o2
3311 add %o0, 4, %o0
3312 add %o1, 4, %o1
3313 bnz .co_small3x
3314 stwa %o4, [%o1-4]%asi
3315 membar #Sync
3316 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3317 retl
3318 mov %g0, %o0
3319
3320 .align 16
3321 .co_med_byte:
3322 bnz,pt %ncc, .co_medbh32a ! go to correct byte move
3323 subcc %o2, 31, %o2 ! adjust length to allow cc test
3324 ble,pt %ncc, .co_medb31
3325 nop
3326 .co_medb32: ! Alignment 1 or 5
3327 subcc %o2, 32, %o2 ! decrement length count
3328
3329 ldub [%o0], %o4 ! load and store a block of 32 bytes
3330 sllx %o4, 56, %o3
3331 lduh [%o0+1], %o4
3332 sllx %o4, 40, %o4
3333 or %o4, %o3, %o3
3334 lduw [%o0+3], %o4
3335 sllx %o4, 8, %o4
3336 or %o4, %o3, %o3
3337 ldub [%o0+7], %o4
3338 or %o4, %o3, %o4
3339 stxa %o4, [%o1]%asi
3340
3341 ldub [%o0+8], %o4
3342 sllx %o4, 56, %o3
3343 lduh [%o0+9], %o4
3344 sllx %o4, 40, %o4
3345 or %o4, %o3, %o3
3346 lduw [%o0+11], %o4
3347 sllx %o4, 8, %o4
3348 or %o4, %o3, %o3
3349 ldub [%o0+15], %o4
3350 or %o4, %o3, %o4
3351 stxa %o4, [%o1+8]%asi
3352
3353 ldub [%o0+16], %o4
3354 sllx %o4, 56, %o3
3355 lduh [%o0+17], %o4
3356 sllx %o4, 40, %o4
3357 or %o4, %o3, %o3
3358 lduw [%o0+19], %o4
3359 sllx %o4, 8, %o4
3360 or %o4, %o3, %o3
3361 ldub [%o0+23], %o4
3362 or %o4, %o3, %o4
3363 stxa %o4, [%o1+16]%asi
3364
3365 add %o0, 32, %o0 ! increase src ptr by 32
3366 add %o1, 32, %o1 ! increase dst ptr by 32
3367
3368 ldub [%o0-8], %o4
3369 sllx %o4, 56, %o3
3370 lduh [%o0-7], %o4
3371 sllx %o4, 40, %o4
3372 or %o4, %o3, %o3
3373 lduw [%o0-5], %o4
3374 sllx %o4, 8, %o4
3375 or %o4, %o3, %o3
3376 ldub [%o0-1], %o4
3377 or %o4, %o3, %o4
3378 bgu,pt %ncc, .co_medb32 ! repeat if at least 32 bytes left
3379 stxa %o4, [%o1-8]%asi
3380
3381 .co_medb31: ! 31 or fewer bytes remaining
3382 addcc %o2, 24, %o2 ! adjust count to be off by 7
3383 ble,pt %ncc, .co_medb7 ! skip if 7 or fewer bytes left
3384 nop !
3385 .co_medb15:
3386
3387 ldub [%o0], %o4 ! load and store a block of 8 bytes
3388 subcc %o2, 8, %o2 ! decrement length count
3389 sllx %o4, 56, %o3
3390 lduh [%o0+1], %o4
3391 sllx %o4, 40, %o4
3392 or %o4, %o3, %o3
3393 lduw [%o0+3], %o4
3394 add %o1, 8, %o1 ! increase dst ptr by 16
3395 sllx %o4, 8, %o4
3396 or %o4, %o3, %o3
3397 ldub [%o0+7], %o4
3398 add %o0, 8, %o0 ! increase src ptr by 16
3399 or %o4, %o3, %o4
3400 bgu,pt %ncc, .co_medb15
3401 stxa %o4, [%o1-8]%asi
3402 .co_medb7:
3403 addcc %o2, 7, %o2 ! finish adjustment of remaining count
3404 bz,pt %ncc, .co_smallx ! exit if finished
3405 cmp %o2, 4
3406 blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left
3407 nop !
3408 ldub [%o0], %o4 ! move 4 bytes
3409 sll %o4, 24, %o3
3410 lduh [%o0+1], %o4
3411 sll %o4, 8, %o4
3412 or %o4, %o3, %o3
3413 ldub [%o0+3], %o4
3414 or %o4, %o3, %o4
3415 subcc %o2, 4, %o2
3416 add %o0, 4, %o0
3417 add %o1, 4, %o1
3418 bnz .co_small3x
3419 stwa %o4, [%o1-4]%asi
3420 membar #Sync
3421 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3422 retl
3423 mov %g0, %o0
3424
3425 .align 16
3426 .co_medbh32a:
3427 ble,pt %ncc, .co_medbh31
3428 nop
3429 .co_medbh32: ! Alignment 3 or 7
3430 subcc %o2, 32, %o2 ! decrement length count
3431
3432 ldub [%o0], %o4 ! load and store a block of 32 bytes
3433 sllx %o4, 56, %o3
3434 lduw [%o0+1], %o4
3435 sllx %o4, 24, %o4
3436 or %o4, %o3, %o3
3437 lduh [%o0+5], %o4
3438 sllx %o4, 8, %o4
3439 or %o4, %o3, %o3
3440 ldub [%o0+7], %o4
3441 or %o4, %o3, %o4
3442 stxa %o4, [%o1]%asi
3443
3444 ldub [%o0+8], %o4
3445 sllx %o4, 56, %o3
3446 lduw [%o0+9], %o4
3447 sllx %o4, 24, %o4
3448 or %o4, %o3, %o3
3449 lduh [%o0+13], %o4
3450 sllx %o4, 8, %o4
3451 or %o4, %o3, %o3
3452 ldub [%o0+15], %o4
3453 or %o4, %o3, %o4
3454 stxa %o4, [%o1+8]%asi
3455
3456 ldub [%o0+16], %o4
3457 sllx %o4, 56, %o3
3458 lduw [%o0+17], %o4
3459 sllx %o4, 24, %o4
3460 or %o4, %o3, %o3
3461 lduh [%o0+21], %o4
3462 sllx %o4, 8, %o4
3463 or %o4, %o3, %o3
3464 ldub [%o0+23], %o4
3465 or %o4, %o3, %o4
3466 stxa %o4, [%o1+16]%asi
3467
3468 add %o0, 32, %o0 ! increase src ptr by 32
3469 add %o1, 32, %o1 ! increase dst ptr by 32
3470
3471 ldub [%o0-8], %o4
3472 sllx %o4, 56, %o3
3473 lduw [%o0-7], %o4
3474 sllx %o4, 24, %o4
3475 or %o4, %o3, %o3
3476 lduh [%o0-3], %o4
3477 sllx %o4, 8, %o4
3478 or %o4, %o3, %o3
3479 ldub [%o0-1], %o4
3480 or %o4, %o3, %o4
3481 bgu,pt %ncc, .co_medbh32 ! repeat if at least 32 bytes left
3482 stxa %o4, [%o1-8]%asi
3483
3484 .co_medbh31:
3485 addcc %o2, 24, %o2 ! adjust count to be off by 7
3486 ble,pt %ncc, .co_medb7 ! skip if 7 or fewer bytes left
3487 nop !
3488 .co_medbh15:
3489 ldub [%o0], %o4 ! load and store a block of 8 bytes
3490 sllx %o4, 56, %o3
3491 lduw [%o0+1], %o4
3492 sllx %o4, 24, %o4
3493 or %o4, %o3, %o3
3494 lduh [%o0+5], %o4
3495 sllx %o4, 8, %o4
3496 or %o4, %o3, %o3
3497 ldub [%o0+7], %o4
3498 or %o4, %o3, %o4
3499 stxa %o4, [%o1]%asi
3500 subcc %o2, 8, %o2 ! decrement length count
3501 add %o1, 8, %o1 ! increase dst ptr by 8
3502 add %o0, 8, %o0 ! increase src ptr by 8
3503 bgu,pt %ncc, .co_medbh15
3504 stxa %o4, [%o1-8]%asi
3505 ba .co_medb7
3506 nop
3507 /*
3508 * End of small copy (no window) code
3509 */
3510
3511 /*
3512 * Long copy code
3513 */
3514 .co_copy_more:
3515 sethi %hi(copyio_fault), %o3
3516 or %o3, %lo(copyio_fault), %o3
3517 membar #Sync
3518 stn %o3, [THREAD_REG + T_LOFAULT]
3519
3520 /*
3521 * Following code is for large copies. We know there is at
3522 * least FP_COPY bytes available. FP regs are used, so
3523 * we save registers and fp regs before starting
3524 */
3525 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3526 or SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
3527 rd %fprs, %g1 ! check for unused fp
3528 ! if fprs.fef == 0, set it.
3529 ! Setting it when already set costs more than checking
3530 andcc %g1, FPRS_FEF, %g1 ! test FEF, fprs.du = fprs.dl = 0
3531 bz,pt %ncc, .co_fp_unused
3532 mov ASI_USER, %asi
3533 BST_FP_TOSTACK(%o3)
3534 ba .co_fp_ready
3535 .co_fp_unused:
3536 prefetch [%i0 + (1 * CACHE_LINE)], #one_read
3537 wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
3538 .co_fp_ready:
3539 rd %gsr, %l5 ! save %gsr value
3540 andcc %i1, 1, %o3 ! is dest byte aligned
3541 bnz,pt %ncc, .co_big_d1
3542 .co_big_d1f: ! dest is now half word aligned
3543 andcc %i1, 2, %o3
3544 bnz,pt %ncc, .co_big_d2
3545 .co_big_d2f: ! dest is now word aligned
3546 andcc %i1, 4, %o3 ! is dest longword aligned
3547 bnz,pt %ncc, .co_big_d4
3548 .co_big_d4f: ! dest is now long word aligned
3549 andcc %i0, 7, %o3 ! is src long word aligned
3550 brnz,pt %o3, .co_big_unal8
3551 prefetch [%i0 + (2 * CACHE_LINE)], #one_read
3552 ! Src and dst are long word aligned
3553 ! align dst to 64 byte boundary
3554 andcc %i1, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned
3555 brz,pn %o3, .co_al_to_64
3556 nop
3557 sub %o3, 64, %o3 ! %o3 has negative bytes to move
3558 add %i2, %o3, %i2 ! adjust remaining count
3559 andcc %o3, 8, %o4 ! odd long words to move?
3560 brz,pt %o4, .co_al_to_16
3561 nop
3562 add %o3, 8, %o3
3563 ldx [%i0], %o4
3564 add %i0, 8, %i0 ! increment src ptr
3565 stxa %o4, [%i1]ASI_USER
3566 add %i1, 8, %i1 ! increment dst ptr
3567 ! Dest is aligned on 16 bytes, src 8 byte aligned
3568 .co_al_to_16:
3569 andcc %o3, 0x30, %o4 ! move to move?
3570 brz,pt %o4, .co_al_to_64
3571 nop
3572 .co_al_mv_16:
3573 add %o3, 16, %o3
3574 ldx [%i0], %o4
3575 stxa %o4, [%i1]ASI_USER
3576 add %i0, 16, %i0 ! increment src ptr
3577 ldx [%i0-8], %o4
3578 add %i1, 8, %i1 ! increment dst ptr
3579 stxa %o4, [%i1]ASI_USER
3580 andcc %o3, 0x30, %o4
3581 brnz,pt %o4, .co_al_mv_16
3582 add %i1, 8, %i1 ! increment dst ptr
3583 ! Dest is aligned on 64 bytes, src 8 byte aligned
3584 .co_al_to_64:
3585 ! Determine source alignment
3586 ! to correct 8 byte offset
3587 andcc %i0, 32, %o3
3588 brnz,pn %o3, .co_aln_1
3589 andcc %i0, 16, %o3
3590 brnz,pn %o3, .co_aln_01
3591 andcc %i0, 8, %o3
3592 brz,pn %o3, .co_aln_000
3593 prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3594 ba .co_aln_001
3595 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3596 .co_aln_01:
3597 brnz,pn %o3, .co_aln_011
3598 prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3599 ba .co_aln_010
3600 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3601 .co_aln_1:
3602 andcc %i0, 16, %o3
3603 brnz,pn %o3, .co_aln_11
3604 andcc %i0, 8, %o3
3605 brnz,pn %o3, .co_aln_101
3606 prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3607 ba .co_aln_100
3608 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3609 .co_aln_11:
3610 brz,pn %o3, .co_aln_110
3611 prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3612
3613 .co_aln_111:
3614 ! Alignment off by 8 bytes
3615 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3616 ldd [%i0], %d0
3617 add %i0, 8, %i0
3618 sub %i2, 8, %i2
3619 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
3620 and %i2, 0x7f, %i2 ! residue bytes in %i2
3621 sub %i1, %i0, %i1
3622 .co_aln_111_loop:
3623 ldda [%i0]ASI_BLK_P,%d16 ! block load
3624 subcc %o3, 64, %o3
3625 fmovd %d16, %d2
3626 fmovd %d18, %d4
3627 fmovd %d20, %d6
3628 fmovd %d22, %d8
3629 fmovd %d24, %d10
3630 fmovd %d26, %d12
3631 fmovd %d28, %d14
3632 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
3633 stda %d0,[%i0+%i1]ASI_BLK_AIUS
3634 add %i0, 64, %i0
3635 fmovd %d30, %d0
3636 bgt,pt %ncc, .co_aln_111_loop
3637 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3638 add %i1, %i0, %i1
3639
3640 stda %d0, [%i1]ASI_USER
3641 ba .co_remain_stuff
3642 add %i1, 8, %i1
3643 ! END OF aln_111
3644
3645 .co_aln_110:
3646 ! Alignment off by 16 bytes
3647 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3648 ldd [%i0], %d0
3649 ldd [%i0+8], %d2
3650 add %i0, 16, %i0
3651 sub %i2, 16, %i2
3652 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
3653 and %i2, 0x7f, %i2 ! residue bytes in %i2
3654 sub %i1, %i0, %i1
3655 .co_aln_110_loop:
3656 ldda [%i0]ASI_BLK_P,%d16 ! block load
3657 subcc %o3, 64, %o3
3658 fmovd %d16, %d4
3659 fmovd %d18, %d6
3660 fmovd %d20, %d8
3661 fmovd %d22, %d10
3662 fmovd %d24, %d12
3663 fmovd %d26, %d14
3664 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
3665 stda %d0,[%i0+%i1]ASI_BLK_AIUS
3666 add %i0, 64, %i0
3667 fmovd %d28, %d0
3668 fmovd %d30, %d2
3669 bgt,pt %ncc, .co_aln_110_loop
3670 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3671 add %i1, %i0, %i1
3672
3673 stda %d0, [%i1]%asi
3674 stda %d2, [%i1+8]%asi
3675 ba .co_remain_stuff
3676 add %i1, 16, %i1
3677 ! END OF aln_110
3678
3679 .co_aln_101:
3680 ! Alignment off by 24 bytes
3681 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3682 ldd [%i0], %d0
3683 ldd [%i0+8], %d2
3684 ldd [%i0+16], %d4
3685 add %i0, 24, %i0
3686 sub %i2, 24, %i2
3687 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
3688 and %i2, 0x7f, %i2 ! residue bytes in %i2
3689 sub %i1, %i0, %i1
3690 .co_aln_101_loop:
3691 ldda [%i0]ASI_BLK_P,%d16 ! block load
3692 subcc %o3, 64, %o3
3693 fmovd %d16, %d6
3694 fmovd %d18, %d8
3695 fmovd %d20, %d10
3696 fmovd %d22, %d12
3697 fmovd %d24, %d14
3698 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
3699 stda %d0,[%i0+%i1]ASI_BLK_AIUS
3700 add %i0, 64, %i0
3701 fmovd %d26, %d0
3702 fmovd %d28, %d2
3703 fmovd %d30, %d4
3704 bgt,pt %ncc, .co_aln_101_loop
3705 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3706 add %i1, %i0, %i1
3707
3708 stda %d0, [%i1]%asi
3709 stda %d2, [%i1+8]%asi
3710 stda %d4, [%i1+16]%asi
3711 ba .co_remain_stuff
3712 add %i1, 24, %i1
3713 ! END OF aln_101
3714
3715 .co_aln_100:
3716 ! Alignment off by 32 bytes
3717 ldd [%i0], %d0
3718 ldd [%i0+8], %d2
3719 ldd [%i0+16],%d4
3720 ldd [%i0+24],%d6
3721 add %i0, 32, %i0
3722 sub %i2, 32, %i2
3723 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
3724 and %i2, 0x7f, %i2 ! residue bytes in %i2
3725 sub %i1, %i0, %i1
3726 .co_aln_100_loop:
3727 ldda [%i0]ASI_BLK_P,%d16 ! block load
3728 subcc %o3, 64, %o3
3729 fmovd %d16, %d8
3730 fmovd %d18, %d10
3731 fmovd %d20, %d12
3732 fmovd %d22, %d14
3733 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
3734 stda %d0,[%i0+%i1]ASI_BLK_AIUS
3735 add %i0, 64, %i0
3736 fmovd %d24, %d0
3737 fmovd %d26, %d2
3738 fmovd %d28, %d4
3739 fmovd %d30, %d6
3740 bgt,pt %ncc, .co_aln_100_loop
3741 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3742 add %i1, %i0, %i1
3743
3744 stda %d0, [%i1]%asi
3745 stda %d2, [%i1+8]%asi
3746 stda %d4, [%i1+16]%asi
3747 stda %d6, [%i1+24]%asi
3748 ba .co_remain_stuff
3749 add %i1, 32, %i1
3750 ! END OF aln_100
3751
3752 .co_aln_011:
3753 ! Alignment off by 40 bytes
3754 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3755 ldd [%i0], %d0
3756 ldd [%i0+8], %d2
3757 ldd [%i0+16], %d4
3758 ldd [%i0+24], %d6
3759 ldd [%i0+32], %d8
3760 add %i0, 40, %i0
3761 sub %i2, 40, %i2
3762 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
3763 and %i2, 0x7f, %i2 ! residue bytes in %i2
3764 sub %i1, %i0, %i1
3765 .co_aln_011_loop:
3766 ldda [%i0]ASI_BLK_P,%d16 ! block load
3767 subcc %o3, 64, %o3
3768 fmovd %d16, %d10
3769 fmovd %d18, %d12
3770 fmovd %d20, %d14
3771 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
3772 stda %d0,[%i0+%i1]ASI_BLK_AIUS
3773 add %i0, 64, %i0
3774 fmovd %d22, %d0
3775 fmovd %d24, %d2
3776 fmovd %d26, %d4
3777 fmovd %d28, %d6
3778 fmovd %d30, %d8
3779 bgt,pt %ncc, .co_aln_011_loop
3780 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3781 add %i1, %i0, %i1
3782
3783 stda %d0, [%i1]%asi
3784 stda %d2, [%i1+8]%asi
3785 stda %d4, [%i1+16]%asi
3786 stda %d6, [%i1+24]%asi
3787 stda %d8, [%i1+32]%asi
3788 ba .co_remain_stuff
3789 add %i1, 40, %i1
3790 ! END OF aln_011
3791
3792 .co_aln_010:
3793 ! Alignment off by 48 bytes
3794 ldd [%i0], %d0
3795 ldd [%i0+8], %d2
3796 ldd [%i0+16], %d4
3797 ldd [%i0+24], %d6
3798 ldd [%i0+32], %d8
3799 ldd [%i0+40], %d10
3800 add %i0, 48, %i0
3801 sub %i2, 48, %i2
3802 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
3803 and %i2, 0x7f, %i2 ! residue bytes in %i2
3804 sub %i1, %i0, %i1
3805 .co_aln_010_loop:
3806 ldda [%i0]ASI_BLK_P,%d16 ! block load
3807 subcc %o3, 64, %o3
3808 fmovd %d16, %d12
3809 fmovd %d18, %d14
3810 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
3811 stda %d0,[%i0+%i1]ASI_BLK_AIUS
3812 add %i0, 64, %i0
3813 fmovd %d20, %d0
3814 fmovd %d22, %d2
3815 fmovd %d24, %d4
3816 fmovd %d26, %d6
3817 fmovd %d28, %d8
3818 fmovd %d30, %d10
3819 bgt,pt %ncc, .co_aln_010_loop
3820 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3821 add %i1, %i0, %i1
3822
3823 stda %d0, [%i1]%asi
3824 stda %d2, [%i1+8]%asi
3825 stda %d4, [%i1+16]%asi
3826 stda %d6, [%i1+24]%asi
3827 stda %d8, [%i1+32]%asi
3828 stda %d10, [%i1+40]%asi
3829 ba .co_remain_stuff
3830 add %i1, 48, %i1
3831 ! END OF aln_010
3832
3833 .co_aln_001:
3834 ! Alignment off by 56 bytes
3835 ldd [%i0], %d0
3836 ldd [%i0+8], %d2
3837 ldd [%i0+16], %d4
3838 ldd [%i0+24], %d6
3839 ldd [%i0+32], %d8
3840 ldd [%i0+40], %d10
3841 ldd [%i0+48], %d12
3842 add %i0, 56, %i0
3843 sub %i2, 56, %i2
3844 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
3845 and %i2, 0x7f, %i2 ! residue bytes in %i2
3846 sub %i1, %i0, %i1
3847 .co_aln_001_loop:
3848 ldda [%i0]ASI_BLK_P,%d16 ! block load
3849 subcc %o3, 64, %o3
3850 fmovd %d16, %d14
3851 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
3852 stda %d0,[%i0+%i1]ASI_BLK_AIUS
3853 add %i0, 64, %i0
3854 fmovd %d18, %d0
3855 fmovd %d20, %d2
3856 fmovd %d22, %d4
3857 fmovd %d24, %d6
3858 fmovd %d26, %d8
3859 fmovd %d28, %d10
3860 fmovd %d30, %d12
3861 bgt,pt %ncc, .co_aln_001_loop
3862 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3863 add %i1, %i0, %i1
3864
3865 stda %d0, [%i1]%asi
3866 stda %d2, [%i1+8]%asi
3867 stda %d4, [%i1+16]%asi
3868 stda %d6, [%i1+24]%asi
3869 stda %d8, [%i1+32]%asi
3870 stda %d10, [%i1+40]%asi
3871 stda %d12, [%i1+48]%asi
3872 ba .co_remain_stuff
3873 add %i1, 56, %i1
3874 ! END OF aln_001
3875
3876 .co_aln_000:
3877 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3878 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
3879 and %i2, 0x7f, %i2 ! residue bytes in %i2
3880 sub %i1, %i0, %i1
3881 .co_aln_000_loop:
3882 ldda [%i0]ASI_BLK_P,%d0
3883 subcc %o3, 64, %o3
3884 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
3885 stda %d0,[%i0+%i1]ASI_BLK_AIUS
3886 add %i0, 64, %i0
3887 bgt,pt %ncc, .co_aln_000_loop
3888 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3889 add %i1, %i0, %i1
3890
3891 ! END OF aln_000
3892
3893 .co_remain_stuff:
3894 subcc %i2, 31, %i2 ! adjust length to allow cc test
3895 ble,pt %ncc, .co_aln_31
3896 nop
3897 .co_aln_32:
3898 ldx [%i0], %o4 ! move 32 bytes
3899 subcc %i2, 32, %i2 ! decrement length count by 32
3900 stxa %o4, [%i1]%asi
3901 ldx [%i0+8], %o4
3902 stxa %o4, [%i1+8]%asi
3903 ldx [%i0+16], %o4
3904 add %i0, 32, %i0 ! increase src ptr by 32
3905 stxa %o4, [%i1+16]%asi
3906 ldx [%i0-8], %o4
3907 add %i1, 32, %i1 ! increase dst ptr by 32
3908 bgu,pt %ncc, .co_aln_32 ! repeat if at least 32 bytes left
3909 stxa %o4, [%i1-8]%asi
3910 .co_aln_31:
3911 addcc %i2, 24, %i2 ! adjust count to be off by 7
3912 ble,pt %ncc, .co_aln_7 ! skip if 7 or fewer bytes left
3913 nop !
3914 .co_aln_15:
3915 ldx [%i0], %o4 ! move 8 bytes
3916 add %i0, 8, %i0 ! increase src ptr by 8
3917 subcc %i2, 8, %i2 ! decrease count by 8
3918 add %i1, 8, %i1 ! increase dst ptr by 8
3919 bgu,pt %ncc, .co_aln_15
3920 stxa %o4, [%i1-8]%asi
3921 .co_aln_7:
3922 addcc %i2, 7, %i2 ! finish adjustment of remaining count
3923 bz,pt %ncc, .co_exit ! exit if finished
3924 cmp %i2, 4
3925 blt,pt %ncc, .co_unaln3x ! skip if less than 4 bytes left
3926 nop !
3927 ld [%i0], %o4 ! move 4 bytes
3928 add %i0, 4, %i0 ! increase src ptr by 4
3929 add %i1, 4, %i1 ! increase dst ptr by 4
3930 subcc %i2, 4, %i2 ! decrease count by 4
3931 bnz .co_unaln3x
3932 stwa %o4, [%i1-4]%asi
3933 ba .co_exit
3934 nop
3935
3936 ! destination alignment code
3937 .co_big_d1:
3938 ldub [%i0], %o4 ! move a byte
3939 add %i0, 1, %i0
3940 stba %o4, [%i1]ASI_USER
3941 add %i1, 1, %i1
3942 andcc %i1, 2, %o3
3943 bz,pt %ncc, .co_big_d2f
3944 sub %i2, 1, %i2
3945 .co_big_d2:
3946 ldub [%i0], %o4 ! move a half-word (src align unknown)
3947 ldub [%i0+1], %o3
3948 add %i0, 2, %i0
3949 sll %o4, 8, %o4 ! position
3950 or %o4, %o3, %o4 ! merge
3951 stha %o4, [%i1]ASI_USER
3952 add %i1, 2, %i1
3953 andcc %i1, 4, %o3 ! is dest longword aligned
3954 bz,pt %ncc, .co_big_d4f
3955 sub %i2, 2, %i2
3956 .co_big_d4: ! dest is at least word aligned
3957 nop
3958 ldub [%i0], %o4 ! move a word (src align unknown)
3959 ldub [%i0+1], %o3
3960 sll %o4, 24, %o4 ! position
3961 sll %o3, 16, %o3 ! position
3962 or %o4, %o3, %o3 ! merge
3963 ldub [%i0+2], %o4
3964 sll %o4, 8, %o4 ! position
3965 or %o4, %o3, %o3 ! merge
3966 ldub [%i0+3], %o4
3967 or %o4, %o3, %o4 ! merge
3968 stwa %o4,[%i1]ASI_USER ! store four bytes
3969 add %i0, 4, %i0 ! adjust src by 4
3970 add %i1, 4, %i1 ! adjust dest by 4
3971 ba .co_big_d4f
3972 sub %i2, 4, %i2 ! adjust count by 4
3973
3974
3975 ! Dst is on 8 byte boundary; src is not;
3976 .co_big_unal8:
3977 andcc %i1, 0x3f, %o3 ! is dst 64-byte block aligned?
3978 bz %ncc, .co_unalnsrc
3979 sub %o3, 64, %o3 ! %o3 will be multiple of 8
3980 neg %o3 ! bytes until dest is 64 byte aligned
3981 sub %i2, %o3, %i2 ! update cnt with bytes to be moved
3982 ! Move bytes according to source alignment
3983 andcc %i0, 0x1, %o4
3984 bnz %ncc, .co_unalnbyte ! check for byte alignment
3985 nop
3986 andcc %i0, 2, %o4 ! check for half word alignment
3987 bnz %ncc, .co_unalnhalf
3988 nop
3989 ! Src is word aligned, move bytes until dest 64 byte aligned
3990 .co_unalnword:
3991 ld [%i0], %o4 ! load 4 bytes
3992 stwa %o4, [%i1]%asi ! and store 4 bytes
3993 ld [%i0+4], %o4 ! load 4 bytes
3994 add %i0, 8, %i0 ! increase src ptr by 8
3995 stwa %o4, [%i1+4]%asi ! and store 4 bytes
3996 subcc %o3, 8, %o3 ! decrease count by 8
3997 bnz %ncc, .co_unalnword
3998 add %i1, 8, %i1 ! increase dst ptr by 8
3999 ba .co_unalnsrc
4000 nop
4001
4002 ! Src is half-word aligned, move bytes until dest 64 byte aligned
4003 .co_unalnhalf:
4004 lduh [%i0], %o4 ! load 2 bytes
4005 sllx %o4, 32, %i3 ! shift left
4006 lduw [%i0+2], %o4
4007 or %o4, %i3, %i3
4008 sllx %i3, 16, %i3
4009 lduh [%i0+6], %o4
4010 or %o4, %i3, %i3
4011 stxa %i3, [%i1]ASI_USER
4012 add %i0, 8, %i0
4013 subcc %o3, 8, %o3
4014 bnz %ncc, .co_unalnhalf
4015 add %i1, 8, %i1
4016 ba .co_unalnsrc
4017 nop
4018
4019 ! Src is Byte aligned, move bytes until dest 64 byte aligned
4020 .co_unalnbyte:
4021 sub %i1, %i0, %i1 ! share pointer advance
4022 .co_unalnbyte_loop:
4023 ldub [%i0], %o4
4024 sllx %o4, 56, %i3
4025 lduh [%i0+1], %o4
4026 sllx %o4, 40, %o4
4027 or %o4, %i3, %i3
4028 lduh [%i0+3], %o4
4029 sllx %o4, 24, %o4
4030 or %o4, %i3, %i3
4031 lduh [%i0+5], %o4
4032 sllx %o4, 8, %o4
4033 or %o4, %i3, %i3
4034 ldub [%i0+7], %o4
4035 or %o4, %i3, %i3
4036 stxa %i3, [%i1+%i0]ASI_USER
4037 subcc %o3, 8, %o3
4038 bnz %ncc, .co_unalnbyte_loop
4039 add %i0, 8, %i0
4040 add %i1,%i0, %i1 ! restore pointer
4041
4042 ! Destination is now block (64 byte aligned), src is not 8 byte aligned
4043 .co_unalnsrc:
4044 andn %i2, 0x3f, %i3 ! %i3 is multiple of block size
4045 and %i2, 0x3f, %i2 ! residue bytes in %i2
4046 add %i2, 64, %i2 ! Insure we don't load beyond
4047 sub %i3, 64, %i3 ! end of source buffer
4048
4049 andn %i0, 0x3f, %o4 ! %o4 has block aligned src address
4050 prefetch [%o4 + (3 * CACHE_LINE)], #one_read
4051 alignaddr %i0, %g0, %g0 ! generate %gsr
4052 add %i0, %i3, %i0 ! advance %i0 to after blocks
4053 !
4054 ! Determine source alignment to correct 8 byte offset
4055 andcc %i0, 0x20, %o3
4056 brnz,pn %o3, .co_unaln_1
4057 andcc %i0, 0x10, %o3
4058 brnz,pn %o3, .co_unaln_01
4059 andcc %i0, 0x08, %o3
4060 brz,a %o3, .co_unaln_000
4061 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4062 ba .co_unaln_001
4063 nop
4064 .co_unaln_01:
4065 brnz,a %o3, .co_unaln_011
4066 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4067 ba .co_unaln_010
4068 nop
4069 .co_unaln_1:
4070 brnz,pn %o3, .co_unaln_11
4071 andcc %i0, 0x08, %o3
4072 brnz,a %o3, .co_unaln_101
4073 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4074 ba .co_unaln_100
4075 nop
4076 .co_unaln_11:
4077 brz,pn %o3, .co_unaln_110
4078 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
4079
4080 .co_unaln_111:
4081 ldd [%o4+56], %d14
4082 .co_unaln_111_loop:
4083 add %o4, 64, %o4
4084 ldda [%o4]ASI_BLK_P, %d16
4085 faligndata %d14, %d16, %d48
4086 faligndata %d16, %d18, %d50
4087 faligndata %d18, %d20, %d52
4088 faligndata %d20, %d22, %d54
4089 faligndata %d22, %d24, %d56
4090 faligndata %d24, %d26, %d58
4091 faligndata %d26, %d28, %d60
4092 faligndata %d28, %d30, %d62
4093 fmovd %d30, %d14
4094 stda %d48, [%i1]ASI_BLK_AIUS
4095 subcc %i3, 64, %i3
4096 add %i1, 64, %i1
4097 bgu,pt %ncc, .co_unaln_111_loop
4098 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4099 ba .co_unaln_done
4100 nop
4101
4102 .co_unaln_110:
4103 ldd [%o4+48], %d12
4104 ldd [%o4+56], %d14
4105 .co_unaln_110_loop:
4106 add %o4, 64, %o4
4107 ldda [%o4]ASI_BLK_P, %d16
4108 faligndata %d12, %d14, %d48
4109 faligndata %d14, %d16, %d50
4110 faligndata %d16, %d18, %d52
4111 faligndata %d18, %d20, %d54
4112 faligndata %d20, %d22, %d56
4113 faligndata %d22, %d24, %d58
4114 faligndata %d24, %d26, %d60
4115 faligndata %d26, %d28, %d62
4116 fmovd %d28, %d12
4117 fmovd %d30, %d14
4118 stda %d48, [%i1]ASI_BLK_AIUS
4119 subcc %i3, 64, %i3
4120 add %i1, 64, %i1
4121 bgu,pt %ncc, .co_unaln_110_loop
4122 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4123 ba .co_unaln_done
4124 nop
4125
4126 .co_unaln_101:
4127 ldd [%o4+40], %d10
4128 ldd [%o4+48], %d12
4129 ldd [%o4+56], %d14
4130 .co_unaln_101_loop:
4131 add %o4, 64, %o4
4132 ldda [%o4]ASI_BLK_P, %d16
4133 faligndata %d10, %d12, %d48
4134 faligndata %d12, %d14, %d50
4135 faligndata %d14, %d16, %d52
4136 faligndata %d16, %d18, %d54
4137 faligndata %d18, %d20, %d56
4138 faligndata %d20, %d22, %d58
4139 faligndata %d22, %d24, %d60
4140 faligndata %d24, %d26, %d62
4141 fmovd %d26, %d10
4142 fmovd %d28, %d12
4143 fmovd %d30, %d14
4144 stda %d48, [%i1]ASI_BLK_AIUS
4145 subcc %i3, 64, %i3
4146 add %i1, 64, %i1
4147 bgu,pt %ncc, .co_unaln_101_loop
4148 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4149 ba .co_unaln_done
4150 nop
4151
4152 .co_unaln_100:
4153 ldd [%o4+32], %d8
4154 ldd [%o4+40], %d10
4155 ldd [%o4+48], %d12
4156 ldd [%o4+56], %d14
4157 .co_unaln_100_loop:
4158 add %o4, 64, %o4
4159 ldda [%o4]ASI_BLK_P, %d16
4160 faligndata %d8, %d10, %d48
4161 faligndata %d10, %d12, %d50
4162 faligndata %d12, %d14, %d52
4163 faligndata %d14, %d16, %d54
4164 faligndata %d16, %d18, %d56
4165 faligndata %d18, %d20, %d58
4166 faligndata %d20, %d22, %d60
4167 faligndata %d22, %d24, %d62
4168 fmovd %d24, %d8
4169 fmovd %d26, %d10
4170 fmovd %d28, %d12
4171 fmovd %d30, %d14
4172 stda %d48, [%i1]ASI_BLK_AIUS
4173 subcc %i3, 64, %i3
4174 add %i1, 64, %i1
4175 bgu,pt %ncc, .co_unaln_100_loop
4176 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4177 ba .co_unaln_done
4178 nop
4179
4180 .co_unaln_011:
4181 ldd [%o4+24], %d6
4182 ldd [%o4+32], %d8
4183 ldd [%o4+40], %d10
4184 ldd [%o4+48], %d12
4185 ldd [%o4+56], %d14
4186 .co_unaln_011_loop:
4187 add %o4, 64, %o4
4188 ldda [%o4]ASI_BLK_P, %d16
4189 faligndata %d6, %d8, %d48
4190 faligndata %d8, %d10, %d50
4191 faligndata %d10, %d12, %d52
4192 faligndata %d12, %d14, %d54
4193 faligndata %d14, %d16, %d56
4194 faligndata %d16, %d18, %d58
4195 faligndata %d18, %d20, %d60
4196 faligndata %d20, %d22, %d62
4197 fmovd %d22, %d6
4198 fmovd %d24, %d8
4199 fmovd %d26, %d10
4200 fmovd %d28, %d12
4201 fmovd %d30, %d14
4202 stda %d48, [%i1]ASI_BLK_AIUS
4203 subcc %i3, 64, %i3
4204 add %i1, 64, %i1
4205 bgu,pt %ncc, .co_unaln_011_loop
4206 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4207 ba .co_unaln_done
4208 nop
4209
4210 .co_unaln_010:
4211 ldd [%o4+16], %d4
4212 ldd [%o4+24], %d6
4213 ldd [%o4+32], %d8
4214 ldd [%o4+40], %d10
4215 ldd [%o4+48], %d12
4216 ldd [%o4+56], %d14
4217 .co_unaln_010_loop:
4218 add %o4, 64, %o4
4219 ldda [%o4]ASI_BLK_P, %d16
4220 faligndata %d4, %d6, %d48
4221 faligndata %d6, %d8, %d50
4222 faligndata %d8, %d10, %d52
4223 faligndata %d10, %d12, %d54
4224 faligndata %d12, %d14, %d56
4225 faligndata %d14, %d16, %d58
4226 faligndata %d16, %d18, %d60
4227 faligndata %d18, %d20, %d62
4228 fmovd %d20, %d4
4229 fmovd %d22, %d6
4230 fmovd %d24, %d8
4231 fmovd %d26, %d10
4232 fmovd %d28, %d12
4233 fmovd %d30, %d14
4234 stda %d48, [%i1]ASI_BLK_AIUS
4235 subcc %i3, 64, %i3
4236 add %i1, 64, %i1
4237 bgu,pt %ncc, .co_unaln_010_loop
4238 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4239 ba .co_unaln_done
4240 nop
4241
4242 .co_unaln_001:
4243 ldd [%o4+8], %d2
4244 ldd [%o4+16], %d4
4245 ldd [%o4+24], %d6
4246 ldd [%o4+32], %d8
4247 ldd [%o4+40], %d10
4248 ldd [%o4+48], %d12
4249 ldd [%o4+56], %d14
4250 .co_unaln_001_loop:
4251 add %o4, 64, %o4
4252 ldda [%o4]ASI_BLK_P, %d16
4253 faligndata %d2, %d4, %d48
4254 faligndata %d4, %d6, %d50
4255 faligndata %d6, %d8, %d52
4256 faligndata %d8, %d10, %d54
4257 faligndata %d10, %d12, %d56
4258 faligndata %d12, %d14, %d58
4259 faligndata %d14, %d16, %d60
4260 faligndata %d16, %d18, %d62
4261 fmovd %d18, %d2
4262 fmovd %d20, %d4
4263 fmovd %d22, %d6
4264 fmovd %d24, %d8
4265 fmovd %d26, %d10
4266 fmovd %d28, %d12
4267 fmovd %d30, %d14
4268 stda %d48, [%i1]ASI_BLK_AIUS
4269 subcc %i3, 64, %i3
4270 add %i1, 64, %i1
4271 bgu,pt %ncc, .co_unaln_001_loop
4272 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4273 ba .co_unaln_done
4274 nop
4275
4276 .co_unaln_000:
4277 ldda [%o4]ASI_BLK_P, %d0
4278 .co_unaln_000_loop:
4279 add %o4, 64, %o4
4280 ldda [%o4]ASI_BLK_P, %d16
4281 faligndata %d0, %d2, %d48
4282 faligndata %d2, %d4, %d50
4283 faligndata %d4, %d6, %d52
4284 faligndata %d6, %d8, %d54
4285 faligndata %d8, %d10, %d56
4286 faligndata %d10, %d12, %d58
4287 faligndata %d12, %d14, %d60
4288 faligndata %d14, %d16, %d62
4289 fmovd %d16, %d0
4290 fmovd %d18, %d2
4291 fmovd %d20, %d4
4292 fmovd %d22, %d6
4293 fmovd %d24, %d8
4294 fmovd %d26, %d10
4295 fmovd %d28, %d12
4296 fmovd %d30, %d14
4297 stda %d48, [%i1]ASI_BLK_AIUS
4298 subcc %i3, 64, %i3
4299 add %i1, 64, %i1
4300 bgu,pt %ncc, .co_unaln_000_loop
4301 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4302
4303 .co_unaln_done:
4304 ! Handle trailing bytes, 64 to 127
4305 ! Dest long word aligned, Src not long word aligned
4306 cmp %i2, 15
4307 bleu %ncc, .co_unaln_short
4308
4309 andn %i2, 0x7, %i3 ! %i3 is multiple of 8
4310 and %i2, 0x7, %i2 ! residue bytes in %i2
4311 add %i2, 8, %i2
4312 sub %i3, 8, %i3 ! insure we don't load past end of src
4313 andn %i0, 0x7, %o4 ! %o4 has long word aligned src address
4314 add %i0, %i3, %i0 ! advance %i0 to after multiple of 8
4315 ldd [%o4], %d0 ! fetch partial word
4316 .co_unaln_by8:
4317 ldd [%o4+8], %d2
4318 add %o4, 8, %o4
4319 faligndata %d0, %d2, %d16
4320 subcc %i3, 8, %i3
4321 stda %d16, [%i1]%asi
4322 fmovd %d2, %d0
4323 bgu,pt %ncc, .co_unaln_by8
4324 add %i1, 8, %i1
4325
4326 .co_unaln_short:
4327 cmp %i2, 8
4328 blt,pt %ncc, .co_unalnfin
4329 nop
4330 ldub [%i0], %o4
4331 sll %o4, 24, %o3
4332 ldub [%i0+1], %o4
4333 sll %o4, 16, %o4
4334 or %o4, %o3, %o3
4335 ldub [%i0+2], %o4
4336 sll %o4, 8, %o4
4337 or %o4, %o3, %o3
4338 ldub [%i0+3], %o4
4339 or %o4, %o3, %o3
4340 stwa %o3, [%i1]%asi
4341 ldub [%i0+4], %o4
4342 sll %o4, 24, %o3
4343 ldub [%i0+5], %o4
4344 sll %o4, 16, %o4
4345 or %o4, %o3, %o3
4346 ldub [%i0+6], %o4
4347 sll %o4, 8, %o4
4348 or %o4, %o3, %o3
4349 ldub [%i0+7], %o4
4350 or %o4, %o3, %o3
4351 stwa %o3, [%i1+4]%asi
4352 add %i0, 8, %i0
4353 add %i1, 8, %i1
4354 sub %i2, 8, %i2
4355 .co_unalnfin:
4356 cmp %i2, 4
4357 blt,pt %ncc, .co_unalnz
4358 tst %i2
4359 ldub [%i0], %o3 ! read byte
4360 subcc %i2, 4, %i2 ! reduce count by 4
4361 sll %o3, 24, %o3 ! position
4362 ldub [%i0+1], %o4
4363 sll %o4, 16, %o4 ! position
4364 or %o4, %o3, %o3 ! merge
4365 ldub [%i0+2], %o4
4366 sll %o4, 8, %o4 ! position
4367 or %o4, %o3, %o3 ! merge
4368 add %i1, 4, %i1 ! advance dst by 4
4369 ldub [%i0+3], %o4
4370 add %i0, 4, %i0 ! advance src by 4
4371 or %o4, %o3, %o4 ! merge
4372 bnz,pt %ncc, .co_unaln3x
4373 stwa %o4, [%i1-4]%asi
4374 ba .co_exit
4375 nop
4376 .co_unalnz:
4377 bz,pt %ncc, .co_exit
4378 wr %l5, %g0, %gsr ! restore %gsr
4379 .co_unaln3x: ! Exactly 1, 2, or 3 bytes remain
4380 subcc %i2, 1, %i2 ! reduce count for cc test
4381 ldub [%i0], %o4 ! load one byte
4382 bz,pt %ncc, .co_exit
4383 stba %o4, [%i1]%asi ! store one byte
4384 ldub [%i0+1], %o4 ! load second byte
4385 subcc %i2, 1, %i2
4386 bz,pt %ncc, .co_exit
4387 stba %o4, [%i1+1]%asi ! store second byte
4388 ldub [%i0+2], %o4 ! load third byte
4389 stba %o4, [%i1+2]%asi ! store third byte
4390 .co_exit:
4391 brnz %g1, .co_fp_restore
4392 nop
4393 FZERO
4394 wr %g1, %g0, %fprs
4395 ba,pt %ncc, .co_ex2
4396 membar #Sync
4397 .co_fp_restore:
4398 BLD_FP_FROMSTACK(%o4)
4399 .co_ex2:
4400 andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
4401 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
4402 ret
4403 restore %g0, 0, %o0
4404
4405 .copyout_err:
4406 ldn [THREAD_REG + T_COPYOPS], %o4
4407 brz %o4, 2f
4408 nop
4409 ldn [%o4 + CP_COPYOUT], %g2
4410 jmp %g2
4411 nop
4412 2:
4413 retl
4414 mov -1, %o0
4415
4416 #else /* NIAGARA_IMPL */
4417 .do_copyout:
4418 !
4419 ! Check the length and bail if zero.
4420 !
4421 tst %o2
4422 bnz,pt %ncc, 1f
4423 nop
4424 retl
4425 clr %o0
4426 1:
4427 sethi %hi(copyio_fault), %o4
4428 or %o4, %lo(copyio_fault), %o4
4429 sethi %hi(copyio_fault_nowindow), %o3
4430 ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
4431 or %o3, %lo(copyio_fault_nowindow), %o3
4432 membar #Sync
4433 stn %o3, [THREAD_REG + T_LOFAULT]
4434
4435 mov %o0, SAVE_SRC
4436 mov %o1, SAVE_DST
4437 mov %o2, SAVE_COUNT
4438
4439 !
4440 ! Check to see if we're more than SMALL_LIMIT (7 bytes).
4441 ! Run in leaf mode, using the %o regs as our input regs.
4442 !
4443 subcc %o2, SMALL_LIMIT, %o3
4444 bgu,a,pt %ncc, .dco_ns
4445 or %o0, %o1, %o3
4446 !
4447 ! What was previously ".small_copyout"
4448 ! Do full differenced copy.
4449 !
4450 .dcobcp:
4451 sub %g0, %o2, %o3 ! negate count
4452 add %o0, %o2, %o0 ! make %o0 point at the end
4453 add %o1, %o2, %o1 ! make %o1 point at the end
4454 ba,pt %ncc, .dcocl
4455 ldub [%o0 + %o3], %o4 ! load first byte
4456 !
4457 ! %o0 and %o2 point at the end and remain pointing at the end
4458 ! of their buffers. We pull things out by adding %o3 (which is
4459 ! the negation of the length) to the buffer end which gives us
4460 ! the curent location in the buffers. By incrementing %o3 we walk
4461 ! through both buffers without having to bump each buffer's
4462 ! pointer. A very fast 4 instruction loop.
4463 !
4464 .align 16
4465 .dcocl:
4466 stba %o4, [%o1 + %o3]ASI_USER
4467 inccc %o3
4468 bl,a,pt %ncc, .dcocl
4469 ldub [%o0 + %o3], %o4
4470 !
4471 ! We're done. Go home.
4472 !
4473 membar #Sync
4474 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
4475 retl
4476 clr %o0
4477 !
4478 ! Try aligned copies from here.
4479 !
4480 .dco_ns:
4481 ! %o0 = kernel addr (to be copied from)
4482 ! %o1 = user addr (to be copied to)
4483 ! %o2 = length
4484 ! %o3 = %o1 | %o2 (used for alignment checking)
4485 ! %o4 is alternate lo_fault
4486 ! %o5 is original lo_fault
4487 !
4488 ! See if we're single byte aligned. If we are, check the
4489 ! limit for single byte copies. If we're smaller or equal,
4490 ! bounce to the byte for byte copy loop. Otherwise do it in
4491 ! HW (if enabled).
4492 !
4493 btst 1, %o3
4494 bz,pt %icc, .dcoh8
4495 btst 7, %o3
4496 !
4497 ! Single byte aligned. Do we do it via HW or via
4498 ! byte for byte? Do a quick no memory reference
4499 ! check to pick up small copies.
4500 !
4501 sethi %hi(hw_copy_limit_1), %o3
4502 !
4503 ! Big enough that we need to check the HW limit for
4504 ! this size copy.
4505 !
4506 ld [%o3 + %lo(hw_copy_limit_1)], %o3
4507 !
4508 ! Is HW copy on? If not, do everything byte for byte.
4509 !
4510 tst %o3
4511 bz,pn %icc, .dcobcp
4512 subcc %o3, %o2, %o3
4513 !
4514 ! If we're less than or equal to the single byte copy limit,
4515 ! bop to the copy loop.
4516 !
4517 bge,pt %ncc, .dcobcp
4518 nop
4519 !
4520 ! We're big enough and copy is on. Do it with HW.
4521 !
4522 ba,pt %ncc, .big_copyout
4523 nop
4524 .dcoh8:
4525 !
4526 ! 8 byte aligned?
4527 !
4528 bnz,a %ncc, .dcoh4
4529 btst 3, %o3
4530 !
4531 ! See if we're in the "small range".
4532 ! If so, go off and do the copy.
4533 ! If not, load the hard limit. %o3 is
4534 ! available for reuse.
4535 !
4536 sethi %hi(hw_copy_limit_8), %o3
4537 ld [%o3 + %lo(hw_copy_limit_8)], %o3
4538 !
4539 ! If it's zero, there's no HW bcopy.
4540 ! Bop off to the aligned copy.
4541 !
4542 tst %o3
4543 bz,pn %icc, .dcos8
4544 subcc %o3, %o2, %o3
4545 !
4546 ! We're negative if our size is larger than hw_copy_limit_8.
4547 !
4548 bge,pt %ncc, .dcos8
4549 nop
4550 !
4551 ! HW assist is on and we're large enough. Do it.
4552 !
4553 ba,pt %ncc, .big_copyout
4554 nop
4555 .dcos8:
4556 !
4557 ! Housekeeping for copy loops. Uses same idea as in the byte for
4558 ! byte copy loop above.
4559 !
4560 add %o0, %o2, %o0
4561 add %o1, %o2, %o1
4562 sub %g0, %o2, %o3
4563 ba,pt %ncc, .dodebc
4564 srl %o2, 3, %o2 ! Number of 8 byte chunks to copy
4565 !
4566 ! 4 byte aligned?
4567 !
4568 .dcoh4:
4569 bnz,pn %ncc, .dcoh2
4570 !
4571 ! See if we're in the "small range".
4572 ! If so, go off an do the copy.
4573 ! If not, load the hard limit. %o3 is
4574 ! available for reuse.
4575 !
4576 sethi %hi(hw_copy_limit_4), %o3
4577 ld [%o3 + %lo(hw_copy_limit_4)], %o3
4578 !
4579 ! If it's zero, there's no HW bcopy.
4580 ! Bop off to the aligned copy.
4581 !
4582 tst %o3
4583 bz,pn %icc, .dcos4
4584 subcc %o3, %o2, %o3
4585 !
4586 ! We're negative if our size is larger than hw_copy_limit_4.
4587 !
4588 bge,pt %ncc, .dcos4
4589 nop
4590 !
4591 ! HW assist is on and we're large enough. Do it.
4592 !
4593 ba,pt %ncc, .big_copyout
4594 nop
4595 .dcos4:
4596 add %o0, %o2, %o0
4597 add %o1, %o2, %o1
4598 sub %g0, %o2, %o3
4599 ba,pt %ncc, .dodfbc
4600 srl %o2, 2, %o2 ! Number of 4 byte chunks to copy
4601 !
4602 ! We must be 2 byte aligned. Off we go.
4603 ! The check for small copies was done in the
4604 ! delay at .dcoh4
4605 !
4606 .dcoh2:
4607 ble %ncc, .dcos2
4608 sethi %hi(hw_copy_limit_2), %o3
4609 ld [%o3 + %lo(hw_copy_limit_2)], %o3
4610 tst %o3
4611 bz,pn %icc, .dcos2
4612 subcc %o3, %o2, %o3
4613 bge,pt %ncc, .dcos2
4614 nop
4615 !
4616 ! HW is on and we're big enough. Do it.
4617 !
4618 ba,pt %ncc, .big_copyout
4619 nop
4620 .dcos2:
4621 add %o0, %o2, %o0
4622 add %o1, %o2, %o1
4623 sub %g0, %o2, %o3
4624 ba,pt %ncc, .dodtbc
4625 srl %o2, 1, %o2 ! Number of 2 byte chunks to copy
4626 .small_copyout:
4627 !
4628 ! Why are we doing this AGAIN? There are certain conditions in
4629 ! big_copyout that will cause us to forego the HW assisted copies
4630 ! and bounce back to a non-HW assisted copy. This dispatches those
4631 ! copies. Note that we branch around this in the main line code.
4632 !
4633 ! We make no check for limits or HW enablement here. We've
4634 ! already been told that we're a poster child so just go off
4635 ! and do it.
4636 !
4637 or %o0, %o1, %o3
4638 btst 1, %o3
4639 bnz %icc, .dcobcp ! Most likely
4640 btst 7, %o3
4641 bz %icc, .dcos8
4642 btst 3, %o3
4643 bz %icc, .dcos4
4644 nop
4645 ba,pt %ncc, .dcos2
4646 nop
4647 .align 32
4648 .dodebc:
4649 ldx [%o0 + %o3], %o4
4650 deccc %o2
4651 stxa %o4, [%o1 + %o3]ASI_USER
4652 bg,pt %ncc, .dodebc
4653 addcc %o3, 8, %o3
4654 !
4655 ! End of copy loop. Check to see if we're done. Most
4656 ! eight byte aligned copies end here.
4657 !
4658 bz,pt %ncc, .dcofh
4659 nop
4660 !
4661 ! Something is left - do it byte for byte.
4662 !
4663 ba,pt %ncc, .dcocl
4664 ldub [%o0 + %o3], %o4 ! load next byte
4665 !
4666 ! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
4667 !
4668 .align 32
4669 .dodfbc:
4670 lduw [%o0 + %o3], %o4
4671 deccc %o2
4672 sta %o4, [%o1 + %o3]ASI_USER
4673 bg,pt %ncc, .dodfbc
4674 addcc %o3, 4, %o3
4675 !
4676 ! End of copy loop. Check to see if we're done. Most
4677 ! four byte aligned copies end here.
4678 !
4679 bz,pt %ncc, .dcofh
4680 nop
4681 !
4682 ! Something is left. Do it byte for byte.
4683 !
4684 ba,pt %ncc, .dcocl
4685 ldub [%o0 + %o3], %o4 ! load next byte
4686 !
4687 ! two byte aligned copy loop. %o2 is the number of 2 byte chunks to
4688 ! copy.
4689 !
4690 .align 32
4691 .dodtbc:
4692 lduh [%o0 + %o3], %o4
4693 deccc %o2
4694 stha %o4, [%o1 + %o3]ASI_USER
4695 bg,pt %ncc, .dodtbc
4696 addcc %o3, 2, %o3
4697 !
4698 ! End of copy loop. Anything left?
4699 !
4700 bz,pt %ncc, .dcofh
4701 nop
4702 !
4703 ! Deal with the last byte
4704 !
4705 ldub [%o0 + %o3], %o4
4706 stba %o4, [%o1 + %o3]ASI_USER
4707 .dcofh:
4708 membar #Sync
4709 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
4710 retl
4711 clr %o0
4712
4713 .big_copyout:
4714 ! We're going to go off and do a block copy.
4715 ! Switch fault handlers and grab a window. We
4716 ! don't do a membar #Sync since we've done only
4717 ! kernel data to this point.
4718 stn %o4, [THREAD_REG + T_LOFAULT]
4719
4720 ! Copy out that reach here are larger than 256 bytes. The
4721 ! hw_copy_limit_1 is set to 256. Never set this limit less
4722 ! 128 bytes.
4723 save %sp, -SA(MINFRAME), %sp
4724 .do_block_copyout:
4725
4726 ! Swap src/dst since the code below is memcpy code
4727 ! and memcpy/bcopy have different calling sequences
4728 mov %i1, %i5
4729 mov %i0, %i1
4730 mov %i5, %i0
4731
4732 ! Block (64 bytes) align the destination.
4733 andcc %i0, 0x3f, %i3 ! is dst block aligned
4734 bz %ncc, copyout_blalign ! dst already block aligned
4735 sub %i3, 0x40, %i3
4736 neg %i3 ! bytes till dst 64 bytes aligned
4737 sub %i2, %i3, %i2 ! update i2 with new count
4738
4739 ! Based on source and destination alignment do
4740 ! either 8 bytes, 4 bytes, 2 bytes or byte copy.
4741
4742 ! Is dst & src 8B aligned
4743 or %i0, %i1, %o2
4744 andcc %o2, 0x7, %g0
4745 bz %ncc, .co_alewdcp
4746 nop
4747
4748 ! Is dst & src 4B aligned
4749 andcc %o2, 0x3, %g0
4750 bz %ncc, .co_alwdcp
4751 nop
4752
4753 ! Is dst & src 2B aligned
4754 andcc %o2, 0x1, %g0
4755 bz %ncc, .co_alhlfwdcp
4756 nop
4757
4758 ! 1B aligned
4759 1: ldub [%i1], %o2
4760 stba %o2, [%i0]ASI_USER
4761 inc %i1
4762 deccc %i3
4763 bgu,pt %ncc, 1b
4764 inc %i0
4765
4766 ba copyout_blalign
4767 nop
4768
4769 ! dst & src 4B aligned
4770 .co_alwdcp:
4771 ld [%i1], %o2
4772 sta %o2, [%i0]ASI_USER
4773 add %i1, 0x4, %i1
4774 subcc %i3, 0x4, %i3
4775 bgu,pt %ncc, .co_alwdcp
4776 add %i0, 0x4, %i0
4777
4778 ba copyout_blalign
4779 nop
4780
4781 ! dst & src 2B aligned
4782 .co_alhlfwdcp:
4783 lduh [%i1], %o2
4784 stuha %o2, [%i0]ASI_USER
4785 add %i1, 0x2, %i1
4786 subcc %i3, 0x2, %i3
4787 bgu,pt %ncc, .co_alhlfwdcp
4788 add %i0, 0x2, %i0
4789
4790 ba copyout_blalign
4791 nop
4792
4793 ! dst & src 8B aligned
4794 .co_alewdcp:
4795 ldx [%i1], %o2
4796 stxa %o2, [%i0]ASI_USER
4797 add %i1, 0x8, %i1
4798 subcc %i3, 0x8, %i3
4799 bgu,pt %ncc, .co_alewdcp
4800 add %i0, 0x8, %i0
4801
4802 ! Now Destination is block (64 bytes) aligned
4803 copyout_blalign:
4804 andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size
4805 sub %i2, %i3, %i2 ! Residue bytes in %i2
4806
4807 mov ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
4808
4809 andcc %i1, 0xf, %o2 ! is src quadword aligned
4810 bz,pn %xcc, .co_blkcpy ! src offset in %o2 (last 4-bits)
4811 nop
4812 cmp %o2, 0x8
4813 bg .co_upper_double
4814 nop
4815 bl .co_lower_double
4816 nop
4817
4818 ! Falls through when source offset is equal to 8 i.e.
4819 ! source is double word aligned.
4820 ! In this case no shift/merge of data is required
4821
4822 sub %i1, %o2, %i1 ! align the src at 16 bytes.
4823 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
4824 prefetch [%l0+0x0], #one_read
4825 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4826 .co_loop0:
4827 add %i1, 0x10, %i1
4828 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4829 prefetch [%l0+0x40], #one_read
4830
4831 stxa %l3, [%i0+0x0]%asi
4832 stxa %l4, [%i0+0x8]%asi
4833
4834 add %i1, 0x10, %i1
4835 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4836
4837 stxa %l5, [%i0+0x10]%asi
4838 stxa %l2, [%i0+0x18]%asi
4839
4840 add %i1, 0x10, %i1
4841 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4842
4843 stxa %l3, [%i0+0x20]%asi
4844 stxa %l4, [%i0+0x28]%asi
4845
4846 add %i1, 0x10, %i1
4847 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4848
4849 stxa %l5, [%i0+0x30]%asi
4850 stxa %l2, [%i0+0x38]%asi
4851
4852 add %l0, 0x40, %l0
4853 subcc %i3, 0x40, %i3
4854 bgu,pt %xcc, .co_loop0
4855 add %i0, 0x40, %i0
4856 ba .co_blkdone
4857 add %i1, %o2, %i1 ! increment the source by src offset
4858 ! the src offset was stored in %o2
4859
4860 .co_lower_double:
4861
4862 sub %i1, %o2, %i1 ! align the src at 16 bytes.
4863 sll %o2, 3, %o0 ! %o0 left shift
4864 mov 0x40, %o1
4865 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
4866 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
4867 prefetch [%l0+0x0], #one_read
4868 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 ! partial data in %l2 and %l3 has
4869 ! complete data
4870 .co_loop1:
4871 add %i1, 0x10, %i1
4872 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 ! %l4 has partial data
4873 ! for this read.
4874 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4
4875 ! into %l2 and %l3
4876 prefetch [%l0+0x40], #one_read
4877
4878 stxa %l2, [%i0+0x0]%asi
4879 stxa %l3, [%i0+0x8]%asi
4880
4881 add %i1, 0x10, %i1
4882 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4883 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and
4884 ! %l4 from previous read
4885 ! into %l4 and %l5
4886 stxa %l4, [%i0+0x10]%asi
4887 stxa %l5, [%i0+0x18]%asi
4888
4889 ! Repeat the same for next 32 bytes.
4890
4891 add %i1, 0x10, %i1
4892 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4893 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
4894
4895 stxa %l2, [%i0+0x20]%asi
4896 stxa %l3, [%i0+0x28]%asi
4897
4898 add %i1, 0x10, %i1
4899 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4900 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
4901
4902 stxa %l4, [%i0+0x30]%asi
4903 stxa %l5, [%i0+0x38]%asi
4904
4905 add %l0, 0x40, %l0
4906 subcc %i3, 0x40, %i3
4907 bgu,pt %xcc, .co_loop1
4908 add %i0, 0x40, %i0
4909 ba .co_blkdone
4910 add %i1, %o2, %i1 ! increment the source by src offset
4911 ! the src offset was stored in %o2
4912
4913 .co_upper_double:
4914
4915 sub %i1, %o2, %i1 ! align the src at 16 bytes.
4916 sub %o2, 0x8, %o0
4917 sll %o0, 3, %o0 ! %o0 left shift
4918 mov 0x40, %o1
4919 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
4920 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
4921 prefetch [%l0+0x0], #one_read
4922 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 ! partial data in %l3
4923 ! for this read and
4924 ! no data in %l2
4925 .co_loop2:
4926 add %i1, 0x10, %i1
4927 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 ! %l4 has complete data
4928 ! and %l5 has partial
4929 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5
4930 ! into %l3 and %l4
4931 prefetch [%l0+0x40], #one_read
4932
4933 stxa %l3, [%i0+0x0]%asi
4934 stxa %l4, [%i0+0x8]%asi
4935
4936 add %i1, 0x10, %i1
4937 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4938 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with
4939 ! %l5 from previous read
4940 ! into %l5 and %l2
4941
4942 stxa %l5, [%i0+0x10]%asi
4943 stxa %l2, [%i0+0x18]%asi
4944
4945 ! Repeat the same for next 32 bytes.
4946
4947 add %i1, 0x10, %i1
4948 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4949 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
4950
4951 stxa %l3, [%i0+0x20]%asi
4952 stxa %l4, [%i0+0x28]%asi
4953
4954 add %i1, 0x10, %i1
4955 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4956 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
4957
4958 stxa %l5, [%i0+0x30]%asi
4959 stxa %l2, [%i0+0x38]%asi
4960
4961 add %l0, 0x40, %l0
4962 subcc %i3, 0x40, %i3
4963 bgu,pt %xcc, .co_loop2
4964 add %i0, 0x40, %i0
4965 ba .co_blkdone
4966 add %i1, %o2, %i1 ! increment the source by src offset
4967 ! the src offset was stored in %o2
4968
4969
4970 ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
4971 .co_blkcpy:
4972
4973 andn %i1, 0x3f, %o0 ! %o0 has block aligned source
4974 prefetch [%o0+0x0], #one_read
4975 1:
4976 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l0
4977 add %i1, 0x10, %i1
4978 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4979 add %i1, 0x10, %i1
4980
4981 prefetch [%o0+0x40], #one_read
4982
4983 stxa %l0, [%i0+0x0]%asi
4984
4985 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4986 add %i1, 0x10, %i1
4987 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l6
4988 add %i1, 0x10, %i1
4989
4990 stxa %l1, [%i0+0x8]%asi
4991 stxa %l2, [%i0+0x10]%asi
4992 stxa %l3, [%i0+0x18]%asi
4993 stxa %l4, [%i0+0x20]%asi
4994 stxa %l5, [%i0+0x28]%asi
4995 stxa %l6, [%i0+0x30]%asi
4996 stxa %l7, [%i0+0x38]%asi
4997
4998 add %o0, 0x40, %o0
4999 subcc %i3, 0x40, %i3
5000 bgu,pt %xcc, 1b
5001 add %i0, 0x40, %i0
5002
5003 .co_blkdone:
5004 membar #Sync
5005
5006 brz,pt %i2, .copyout_exit
5007 nop
5008
5009 ! Handle trailing bytes
5010 cmp %i2, 0x8
5011 blu,pt %ncc, .co_residue
5012 nop
5013
5014 ! Can we do some 8B ops
5015 or %i1, %i0, %o2
5016 andcc %o2, 0x7, %g0
5017 bnz %ncc, .co_last4
5018 nop
5019
5020 ! Do 8byte ops as long as possible
5021 .co_last8:
5022 ldx [%i1], %o2
5023 stxa %o2, [%i0]ASI_USER
5024 add %i1, 0x8, %i1
5025 sub %i2, 0x8, %i2
5026 cmp %i2, 0x8
5027 bgu,pt %ncc, .co_last8
5028 add %i0, 0x8, %i0
5029
5030 brz,pt %i2, .copyout_exit
5031 nop
5032
5033 ba .co_residue
5034 nop
5035
5036 .co_last4:
5037 ! Can we do 4B ops
5038 andcc %o2, 0x3, %g0
5039 bnz %ncc, .co_last2
5040 nop
5041 1:
5042 ld [%i1], %o2
5043 sta %o2, [%i0]ASI_USER
5044 add %i1, 0x4, %i1
5045 sub %i2, 0x4, %i2
5046 cmp %i2, 0x4
5047 bgu,pt %ncc, 1b
5048 add %i0, 0x4, %i0
5049
5050 brz,pt %i2, .copyout_exit
5051 nop
5052
5053 ba .co_residue
5054 nop
5055
5056 .co_last2:
5057 ! Can we do 2B ops
5058 andcc %o2, 0x1, %g0
5059 bnz %ncc, .co_residue
5060 nop
5061
5062 1:
5063 lduh [%i1], %o2
5064 stuha %o2, [%i0]ASI_USER
5065 add %i1, 0x2, %i1
5066 sub %i2, 0x2, %i2
5067 cmp %i2, 0x2
5068 bgu,pt %ncc, 1b
5069 add %i0, 0x2, %i0
5070
5071 brz,pt %i2, .copyout_exit
5072 nop
5073
5074 ! Copy the residue as byte copy
5075 .co_residue:
5076 ldub [%i1], %i4
5077 stba %i4, [%i0]ASI_USER
5078 inc %i1
5079 deccc %i2
5080 bgu,pt %xcc, .co_residue
5081 inc %i0
5082
5083 .copyout_exit:
5084 membar #Sync
5085 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
5086 ret
5087 restore %g0, 0, %o0
5088
5089 .copyout_err:
5090 ldn [THREAD_REG + T_COPYOPS], %o4
5091 brz %o4, 2f
5092 nop
5093 ldn [%o4 + CP_COPYOUT], %g2
5094 jmp %g2
5095 nop
5096 2:
5097 retl
5098 mov -1, %o0
5099 #endif /* NIAGARA_IMPL */
5100 SET_SIZE(copyout)
5101
5102 #endif /* lint */
5103
5104
5105 #ifdef lint
5106
5107 /*ARGSUSED*/
5108 int
5109 xcopyout(const void *kaddr, void *uaddr, size_t count)
5110 { return (0); }
5111
5112 #else /* lint */
5113
5114 ENTRY(xcopyout)
5115 sethi %hi(.xcopyout_err), REAL_LOFAULT
5116 b .do_copyout
5117 or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
5118 .xcopyout_err:
5119 ldn [THREAD_REG + T_COPYOPS], %o4
5120 brz %o4, 2f
5121 nop
5122 ldn [%o4 + CP_XCOPYOUT], %g2
5123 jmp %g2
5124 nop
5125 2:
5126 retl
5127 mov %g1, %o0
5128 SET_SIZE(xcopyout)
5129
5130 #endif /* lint */
5131
5132 #ifdef lint
5133
5134 /*ARGSUSED*/
5135 int
5136 xcopyout_little(const void *kaddr, void *uaddr, size_t count)
5137 { return (0); }
5138
5139 #else /* lint */
5140
5141 ENTRY(xcopyout_little)
5142 sethi %hi(.little_err), %o4
5143 ldn [THREAD_REG + T_LOFAULT], %o5
5144 or %o4, %lo(.little_err), %o4
5145 membar #Sync ! sync error barrier
5146 stn %o4, [THREAD_REG + T_LOFAULT]
5147
5148 subcc %g0, %o2, %o3
5149 add %o0, %o2, %o0
5150 bz,pn %ncc, 2f ! check for zero bytes
5151 sub %o2, 1, %o4
5152 add %o0, %o4, %o0 ! start w/last byte
5153 add %o1, %o2, %o1
5154 ldub [%o0+%o3], %o4
5155
5156 1: stba %o4, [%o1+%o3]ASI_AIUSL
5157 inccc %o3
5158 sub %o0, 2, %o0 ! get next byte
5159 bcc,a,pt %ncc, 1b
5160 ldub [%o0+%o3], %o4
5161
5162 2: membar #Sync ! sync error barrier
5163 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
5164 retl
5165 mov %g0, %o0 ! return (0)
5166 SET_SIZE(xcopyout_little)
5167
5168 #endif /* lint */
5169
5170 /*
5171 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
5172 */
5173
5174 #if defined(lint)
5175
5176 /*ARGSUSED*/
5177 int
5178 copyin(const void *uaddr, void *kaddr, size_t count)
5179 { return (0); }
5180
5181 #else /* lint */
5182
5183 ENTRY(copyin)
5184 sethi %hi(.copyin_err), REAL_LOFAULT
5185 or REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT
5186
5187 #if !defined(NIAGARA_IMPL)
5188 .do_copyin:
5189 tst %o2 ! check for zero count; quick exit
5190 bz,pt %ncc, .ci_smallqx
5191 mov %o0, SAVE_SRC
5192 mov %o1, SAVE_DST
5193 mov %o2, SAVE_COUNT
5194 cmp %o2, FP_COPY ! check for small copy/leaf case
5195 bgt,pt %ncc, .ci_copy_more
5196 ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
5197 /*
5198 * Small copy in code
5199 *
5200 */
5201 sethi %hi(copyio_fault_nowindow), %o3
5202 or %o3, %lo(copyio_fault_nowindow), %o3
5203 membar #Sync
5204 stn %o3, [THREAD_REG + T_LOFAULT]
5205
5206 mov ASI_USER, %asi
5207 cmp %o2, SHORTCOPY ! make sure there is enough to align
5208 ble,pt %ncc, .ci_smallest
5209 andcc %o1, 0x7, %o3 ! is dest long word aligned
5210 bnz,pn %ncc, .ci_align
5211 andcc %o1, 1, %o3 ! is dest byte aligned
5212
5213 ! Destination is long word aligned
5214 .ci_al_src:
5215 andcc %o0, 7, %o3
5216 brnz,pt %o3, .ci_src_dst_unal8
5217 nop
5218 /*
5219 * Special case for handling when src and dest are both long word aligned
5220 * and total data to move is less than FP_COPY bytes
5221 * Also handles finish up for large block moves, so may be less than 32 bytes
5222 */
5223 .ci_medlong:
5224 subcc %o2, 31, %o2 ! adjust length to allow cc test
5225 ble,pt %ncc, .ci_medl31
5226 nop
5227 .ci_medl32:
5228 ldxa [%o0]%asi, %o4 ! move 32 bytes
5229 subcc %o2, 32, %o2 ! decrement length count by 32
5230 stx %o4, [%o1]
5231 ldxa [%o0+8]%asi, %o4
5232 stx %o4, [%o1+8]
5233 ldxa [%o0+16]%asi, %o4
5234 add %o0, 32, %o0 ! increase src ptr by 32
5235 stx %o4, [%o1+16]
5236 ldxa [%o0-8]%asi, %o4
5237 add %o1, 32, %o1 ! increase dst ptr by 32
5238 bgu,pt %ncc, .ci_medl32 ! repeat if at least 32 bytes left
5239 stx %o4, [%o1-8]
5240 .ci_medl31:
5241 addcc %o2, 24, %o2 ! adjust count to be off by 7
5242 ble,pt %ncc, .ci_medl7 ! skip if 7 or fewer bytes left
5243 nop
5244 .ci_medl8:
5245 ldxa [%o0]%asi, %o4 ! move 8 bytes
5246 add %o0, 8, %o0 ! increase src ptr by 8
5247 subcc %o2, 8, %o2 ! decrease count by 8
5248 add %o1, 8, %o1 ! increase dst ptr by 8
5249 bgu,pt %ncc, .ci_medl8
5250 stx %o4, [%o1-8]
5251 .ci_medl7:
5252 addcc %o2, 7, %o2 ! finish adjustment of remaining count
5253 bnz,pt %ncc, .ci_small4 ! do final bytes if not finished
5254 nop
5255 .ci_smallx: ! finish up and exit
5256 membar #Sync
5257 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5258 .ci_smallqx:
5259 retl
5260 mov %g0, %o0
5261
5262 .ci_small4:
5263 cmp %o2, 4
5264 blt,pt %ncc, .ci_small3x ! skip if less than 4 bytes left
5265 nop !
5266 lda [%o0]%asi, %o4 ! move 4 bytes
5267 add %o0, 4, %o0 ! increase src ptr by 4
5268 add %o1, 4, %o1 ! increase dst ptr by 4
5269 subcc %o2, 4, %o2 ! decrease count by 4
5270 bz %ncc, .ci_smallx
5271 stw %o4, [%o1-4]
5272
5273 .ci_small3x: ! Exactly 1, 2, or 3 bytes remain
5274 subcc %o2, 1, %o2 ! reduce count for cc test
5275 lduba [%o0]%asi, %o4 ! load one byte
5276 bz,pt %ncc, .ci_smallx
5277 stb %o4, [%o1] ! store one byte
5278 lduba [%o0+1]%asi, %o4 ! load second byte
5279 subcc %o2, 1, %o2
5280 bz,pt %ncc, .ci_smallx
5281 stb %o4, [%o1+1] ! store second byte
5282 lduba [%o0+2]%asi, %o4 ! load third byte
5283 ba .ci_smallx
5284 stb %o4, [%o1+2] ! store third byte
5285
5286 .ci_smallest: ! 7 or fewer bytes remain
5287 cmp %o2, 4
5288 blt,pt %ncc, .ci_small3x
5289 nop
5290 lduba [%o0]%asi, %o4 ! read byte
5291 subcc %o2, 4, %o2 ! reduce count by 4
5292 stb %o4, [%o1] ! write byte
5293 lduba [%o0+1]%asi, %o4 ! repeat for total of 4 bytes
5294 add %o0, 4, %o0 ! advance src by 4
5295 stb %o4, [%o1+1]
5296 lduba [%o0-2]%asi, %o4
5297 add %o1, 4, %o1 ! advance dst by 4
5298 stb %o4, [%o1-2]
5299 lduba [%o0-1]%asi, %o4
5300 bnz,pt %ncc, .ci_small3x
5301 stb %o4, [%o1-1]
5302 membar #Sync
5303 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5304 retl
5305 mov %g0, %o0
5306
5307 .ci_align:
5308 bnz,pt %ncc, .ci_al_d1
5309 .ci_al_d1f: ! dest is now half word aligned
5310 andcc %o1, 2, %o3 ! is dest word aligned
5311 bnz,pt %ncc, .ci_al_d2
5312 .ci_al_d2f: ! dest is now word aligned
5313 andcc %o1, 4, %o3 ! is dest longword aligned?
5314 bz,pt %ncc, .ci_al_src
5315 nop
5316 .ci_al_d4: ! dest is word aligned; src is unknown
5317 lduba [%o0]%asi, %o4 ! move a word (src align unknown)
5318 lduba [%o0+1]%asi, %o3
5319 sll %o4, 24, %o4 ! position
5320 sll %o3, 16, %o3 ! position
5321 or %o4, %o3, %o3 ! merge
5322 lduba [%o0+2]%asi, %o4
5323 sll %o4, 8, %o4 ! position
5324 or %o4, %o3, %o3 ! merge
5325 lduba [%o0+3]%asi, %o4
5326 or %o4, %o3, %o4 ! merge
5327 stw %o4,[%o1] ! store four bytes
5328 add %o0, 4, %o0 ! adjust src by 4
5329 add %o1, 4, %o1 ! adjust dest by 4
5330 sub %o2, 4, %o2 ! adjust count by 4
5331 andcc %o0, 7, %o3 ! check for src long word alignment
5332 brz,pt %o3, .ci_medlong
5333 .ci_src_dst_unal8:
5334 ! dst is 8-byte aligned, src is not
5335 ! Size is less than FP_COPY
5336 ! Following code is to select for alignment
5337 andcc %o0, 0x3, %o3 ! test word alignment
5338 bz,pt %ncc, .ci_medword
5339 nop
5340 andcc %o0, 0x1, %o3 ! test halfword alignment
5341 bnz,pt %ncc, .ci_med_byte ! go to byte move if not halfword
5342 andcc %o0, 0x2, %o3 ! test which byte alignment
5343 ba .ci_medhalf
5344 nop
5345 .ci_al_d1: ! align dest to half word
5346 lduba [%o0]%asi, %o4 ! move a byte
5347 add %o0, 1, %o0
5348 stb %o4, [%o1]
5349 add %o1, 1, %o1
5350 andcc %o1, 2, %o3 ! is dest word aligned
5351 bz,pt %ncc, .ci_al_d2f
5352 sub %o2, 1, %o2
5353 .ci_al_d2: ! align dest to word
5354 lduba [%o0]%asi, %o4 ! move a half-word (src align unknown)
5355 lduba [%o0+1]%asi, %o3
5356 sll %o4, 8, %o4 ! position
5357 or %o4, %o3, %o4 ! merge
5358 sth %o4, [%o1]
5359 add %o0, 2, %o0
5360 add %o1, 2, %o1
5361 andcc %o1, 4, %o3 ! is dest longword aligned?
5362 bz,pt %ncc, .ci_al_src
5363 sub %o2, 2, %o2
5364 ba .ci_al_d4
5365 nop
5366 /*
5367 * Handle all cases where src and dest are aligned on word
5368 * boundaries. Use unrolled loops for better performance.
5369 * This option wins over standard large data move when
5370 * source and destination is in cache for medium
5371 * to short data moves.
5372 */
5373 .ci_medword:
5374 subcc %o2, 31, %o2 ! adjust length to allow cc test
5375 ble,pt %ncc, .ci_medw31
5376 nop
5377 .ci_medw32:
5378 lda [%o0]%asi, %o4 ! move a block of 32 bytes
5379 stw %o4, [%o1]
5380 lda [%o0+4]%asi, %o4
5381 stw %o4, [%o1+4]
5382 lda [%o0+8]%asi, %o4
5383 stw %o4, [%o1+8]
5384 lda [%o0+12]%asi, %o4
5385 stw %o4, [%o1+12]
5386 lda [%o0+16]%asi, %o4
5387 stw %o4, [%o1+16]
5388 lda [%o0+20]%asi, %o4
5389 subcc %o2, 32, %o2 ! decrement length count
5390 stw %o4, [%o1+20]
5391 lda [%o0+24]%asi, %o4
5392 add %o0, 32, %o0 ! increase src ptr by 32
5393 stw %o4, [%o1+24]
5394 lda [%o0-4]%asi, %o4
5395 add %o1, 32, %o1 ! increase dst ptr by 32
5396 bgu,pt %ncc, .ci_medw32 ! repeat if at least 32 bytes left
5397 stw %o4, [%o1-4]
5398 .ci_medw31:
5399 addcc %o2, 24, %o2 ! adjust count to be off by 7
5400 ble,pt %ncc, .ci_medw7 ! skip if 7 or fewer bytes left
5401 nop !
5402 .ci_medw15:
5403 lda [%o0]%asi, %o4 ! move a block of 8 bytes
5404 subcc %o2, 8, %o2 ! decrement length count
5405 stw %o4, [%o1]
5406 add %o0, 8, %o0 ! increase src ptr by 8
5407 lda [%o0-4]%asi, %o4
5408 add %o1, 8, %o1 ! increase dst ptr by 8
5409 bgu,pt %ncc, .ci_medw15
5410 stw %o4, [%o1-4]
5411 .ci_medw7:
5412 addcc %o2, 7, %o2 ! finish adjustment of remaining count
5413 bz,pt %ncc, .ci_smallx ! exit if finished
5414 cmp %o2, 4
5415 blt,pt %ncc, .ci_small3x ! skip if less than 4 bytes left
5416 nop !
5417 lda [%o0]%asi, %o4 ! move 4 bytes
5418 add %o0, 4, %o0 ! increase src ptr by 4
5419 add %o1, 4, %o1 ! increase dst ptr by 4
5420 subcc %o2, 4, %o2 ! decrease count by 4
5421 bnz .ci_small3x
5422 stw %o4, [%o1-4]
5423 membar #Sync
5424 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5425 retl
5426 mov %g0, %o0
5427
5428 .ci_medhalf:
5429 subcc %o2, 31, %o2 ! adjust length to allow cc test
5430 ble,pt %ncc, .ci_medh31
5431 nop
5432 .ci_medh32: ! load and store block of 32 bytes
5433 subcc %o2, 32, %o2 ! decrement length count
5434
5435 lduha [%o0]%asi, %o4 ! move 32 bytes
5436 lduwa [%o0+2]%asi, %o3
5437 sllx %o4, 48, %o4
5438 sllx %o3, 16, %o3
5439 or %o4, %o3, %o3
5440 lduha [%o0+6]%asi, %o4
5441 or %o4, %o3, %o4
5442 stx %o4, [%o1]
5443
5444 lduha [%o0+8]%asi, %o4
5445 lduwa [%o0+10]%asi, %o3
5446 sllx %o4, 48, %o4
5447 sllx %o3, 16, %o3
5448 or %o4, %o3, %o3
5449 lduha [%o0+14]%asi, %o4
5450 or %o4, %o3, %o4
5451 stx %o4, [%o1+8]
5452
5453 lduha [%o0+16]%asi, %o4
5454 lduwa [%o0+18]%asi, %o3
5455 sllx %o4, 48, %o4
5456 sllx %o3, 16, %o3
5457 or %o4, %o3, %o3
5458 lduha [%o0+22]%asi, %o4
5459 or %o4, %o3, %o4
5460 stx %o4, [%o1+16]
5461
5462 add %o0, 32, %o0 ! increase src ptr by 32
5463 add %o1, 32, %o1 ! increase dst ptr by 32
5464
5465 lduha [%o0-8]%asi, %o4
5466 lduwa [%o0-6]%asi, %o3
5467 sllx %o4, 48, %o4
5468 sllx %o3, 16, %o3
5469 or %o4, %o3, %o3
5470 lduha [%o0-2]%asi, %o4
5471 or %o3, %o4, %o4
5472 bgu,pt %ncc, .ci_medh32 ! repeat if at least 32 bytes left
5473 stx %o4, [%o1-8]
5474
5475 .ci_medh31:
5476 addcc %o2, 24, %o2 ! adjust count to be off by 7
5477 ble,pt %ncc, .ci_medh7 ! skip if 7 or fewer bytes left
5478 nop !
5479 .ci_medh15:
5480 lduha [%o0]%asi, %o4 ! move 16 bytes
5481 subcc %o2, 8, %o2 ! decrement length count
5482 lduwa [%o0+2]%asi, %o3
5483 sllx %o4, 48, %o4
5484 sllx %o3, 16, %o3
5485 or %o4, %o3, %o3
5486 add %o1, 8, %o1 ! increase dst ptr by 8
5487 lduha [%o0+6]%asi, %o4
5488 add %o0, 8, %o0 ! increase src ptr by 8
5489 or %o4, %o3, %o4
5490 bgu,pt %ncc, .ci_medh15
5491 stx %o4, [%o1-8]
5492 .ci_medh7:
5493 addcc %o2, 7, %o2 ! finish adjustment of remaining count
5494 bz,pt %ncc, .ci_smallx ! exit if finished
5495 cmp %o2, 4
5496 blt,pt %ncc, .ci_small3x ! skip if less than 4 bytes left
5497 nop !
5498 lduha [%o0]%asi, %o4
5499 sll %o4, 16, %o4
5500 lduha [%o0+2]%asi, %o3
5501 or %o3, %o4, %o4
5502 subcc %o2, 4, %o2
5503 add %o0, 4, %o0
5504 add %o1, 4, %o1
5505 bnz .ci_small3x
5506 stw %o4, [%o1-4]
5507 membar #Sync
5508 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5509 retl
5510 mov %g0, %o0
5511
5512 .align 16
5513 .ci_med_byte:
5514 bnz,pt %ncc, .ci_medbh32a ! go to correct byte move
5515 subcc %o2, 31, %o2 ! adjust length to allow cc test
5516 ble,pt %ncc, .ci_medb31
5517 nop
5518 .ci_medb32: ! Alignment 1 or 5
5519 subcc %o2, 32, %o2 ! decrement length count
5520
5521 lduba [%o0]%asi, %o4 ! load and store a block of 32 bytes
5522 sllx %o4, 56, %o3
5523 lduha [%o0+1]%asi, %o4
5524 sllx %o4, 40, %o4
5525 or %o4, %o3, %o3
5526 lduwa [%o0+3]%asi, %o4
5527 sllx %o4, 8, %o4
5528 or %o4, %o3, %o3
5529 lduba [%o0+7]%asi, %o4
5530 or %o4, %o3, %o4
5531 stx %o4, [%o1]
5532
5533 lduba [%o0+8]%asi, %o4
5534 sllx %o4, 56, %o3
5535 lduha [%o0+9]%asi, %o4
5536 sllx %o4, 40, %o4
5537 or %o4, %o3, %o3
5538 lduwa [%o0+11]%asi, %o4
5539 sllx %o4, 8, %o4
5540 or %o4, %o3, %o3
5541 lduba [%o0+15]%asi, %o4
5542 or %o4, %o3, %o4
5543 stx %o4, [%o1+8]
5544
5545 lduba [%o0+16]%asi, %o4
5546 sllx %o4, 56, %o3
5547 lduha [%o0+17]%asi, %o4
5548 sllx %o4, 40, %o4
5549 or %o4, %o3, %o3
5550 lduwa [%o0+19]%asi, %o4
5551 sllx %o4, 8, %o4
5552 or %o4, %o3, %o3
5553 lduba [%o0+23]%asi, %o4
5554 or %o4, %o3, %o4
5555 stx %o4, [%o1+16]
5556
5557 add %o0, 32, %o0 ! increase src ptr by 32
5558 add %o1, 32, %o1 ! increase dst ptr by 32
5559
5560 lduba [%o0-8]%asi, %o4
5561 sllx %o4, 56, %o3
5562 lduha [%o0-7]%asi, %o4
5563 sllx %o4, 40, %o4
5564 or %o4, %o3, %o3
5565 lduwa [%o0-5]%asi, %o4
5566 sllx %o4, 8, %o4
5567 or %o4, %o3, %o3
5568 lduba [%o0-1]%asi, %o4
5569 or %o4, %o3, %o4
5570 bgu,pt %ncc, .ci_medb32 ! repeat if at least 32 bytes left
5571 stx %o4, [%o1-8]
5572
5573 .ci_medb31: ! 31 or fewer bytes remaining
5574 addcc %o2, 24, %o2 ! adjust count to be off by 7
5575 ble,pt %ncc, .ci_medb7 ! skip if 7 or fewer bytes left
5576 nop !
5577 .ci_medb15:
5578
5579 lduba [%o0]%asi, %o4 ! load and store a block of 8 bytes
5580 subcc %o2, 8, %o2 ! decrement length count
5581 sllx %o4, 56, %o3
5582 lduha [%o0+1]%asi, %o4
5583 sllx %o4, 40, %o4
5584 or %o4, %o3, %o3
5585 lduwa [%o0+3]%asi, %o4
5586 add %o1, 8, %o1 ! increase dst ptr by 16
5587 sllx %o4, 8, %o4
5588 or %o4, %o3, %o3
5589 lduba [%o0+7]%asi, %o4
5590 add %o0, 8, %o0 ! increase src ptr by 16
5591 or %o4, %o3, %o4
5592 bgu,pt %ncc, .ci_medb15
5593 stx %o4, [%o1-8]
5594 .ci_medb7:
5595 addcc %o2, 7, %o2 ! finish adjustment of remaining count
5596 bz,pt %ncc, .ci_smallx ! exit if finished
5597 cmp %o2, 4
5598 blt,pt %ncc, .ci_small3x ! skip if less than 4 bytes left
5599 nop !
5600 lduba [%o0]%asi, %o4 ! move 4 bytes
5601 sll %o4, 24, %o3
5602 lduha [%o0+1]%asi, %o4
5603 sll %o4, 8, %o4
5604 or %o4, %o3, %o3
5605 lduba [%o0+3]%asi, %o4
5606 or %o4, %o3, %o4
5607 subcc %o2, 4, %o2
5608 add %o0, 4, %o0
5609 add %o1, 4, %o1
5610 bnz .ci_small3x
5611 stw %o4, [%o1-4]
5612 membar #Sync
5613 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5614 retl
5615 mov %g0, %o0
5616
5617 .align 16
5618 .ci_medbh32a: ! Alignment 3 or 7
5619 ble,pt %ncc, .ci_medbh31
5620 nop
5621 .ci_medbh32: ! Alignment 3 or 7
5622 subcc %o2, 32, %o2 ! decrement length count
5623
5624 lduba [%o0]%asi, %o4 ! load and store a block of 32 bytes
5625 sllx %o4, 56, %o3
5626 lduwa [%o0+1]%asi, %o4
5627 sllx %o4, 24, %o4
5628 or %o4, %o3, %o3
5629 lduha [%o0+5]%asi, %o4
5630 sllx %o4, 8, %o4
5631 or %o4, %o3, %o3
5632 lduba [%o0+7]%asi, %o4
5633 or %o4, %o3, %o4
5634 stx %o4, [%o1]
5635
5636 lduba [%o0+8]%asi, %o4
5637 sllx %o4, 56, %o3
5638 lduwa [%o0+9]%asi, %o4
5639 sllx %o4, 24, %o4
5640 or %o4, %o3, %o3
5641 lduha [%o0+13]%asi, %o4
5642 sllx %o4, 8, %o4
5643 or %o4, %o3, %o3
5644 lduba [%o0+15]%asi, %o4
5645 or %o4, %o3, %o4
5646 stx %o4, [%o1+8]
5647
5648 lduba [%o0+16]%asi, %o4
5649 sllx %o4, 56, %o3
5650 lduwa [%o0+17]%asi, %o4
5651 sllx %o4, 24, %o4
5652 or %o4, %o3, %o3
5653 lduha [%o0+21]%asi, %o4
5654 sllx %o4, 8, %o4
5655 or %o4, %o3, %o3
5656 lduba [%o0+23]%asi, %o4
5657 or %o4, %o3, %o4
5658 stx %o4, [%o1+16]
5659
5660 add %o0, 32, %o0 ! increase src ptr by 32
5661 add %o1, 32, %o1 ! increase dst ptr by 32
5662
5663 lduba [%o0-8]%asi, %o4
5664 sllx %o4, 56, %o3
5665 lduwa [%o0-7]%asi, %o4
5666 sllx %o4, 24, %o4
5667 or %o4, %o3, %o3
5668 lduha [%o0-3]%asi, %o4
5669 sllx %o4, 8, %o4
5670 or %o4, %o3, %o3
5671 lduba [%o0-1]%asi, %o4
5672 or %o4, %o3, %o4
5673 bgu,pt %ncc, .ci_medbh32 ! repeat if at least 32 bytes left
5674 stx %o4, [%o1-8]
5675
5676 .ci_medbh31:
5677 addcc %o2, 24, %o2 ! adjust count to be off by 7
5678 ble,pt %ncc, .ci_medb7 ! skip if 7 or fewer bytes left
5679 nop !
5680 .ci_medbh15:
5681 lduba [%o0]%asi, %o4 ! load and store a block of 8 bytes
5682 sllx %o4, 56, %o3
5683 lduwa [%o0+1]%asi, %o4
5684 sllx %o4, 24, %o4
5685 or %o4, %o3, %o3
5686 lduha [%o0+5]%asi, %o4
5687 sllx %o4, 8, %o4
5688 or %o4, %o3, %o3
5689 lduba [%o0+7]%asi, %o4
5690 or %o4, %o3, %o4
5691 stx %o4, [%o1]
5692 subcc %o2, 8, %o2 ! decrement length count
5693 add %o1, 8, %o1 ! increase dst ptr by 8
5694 add %o0, 8, %o0 ! increase src ptr by 8
5695 bgu,pt %ncc, .ci_medbh15
5696 stx %o4, [%o1-8]
5697 ba .ci_medb7
5698 nop
5699
5700 /*
5701 * End of small copy in code (no window)
5702 *
5703 */
5704
5705 /*
5706 * Long copy in code (using register window and fp regs)
5707 *
5708 */
5709
5710 .ci_copy_more:
5711 sethi %hi(copyio_fault), %o3
5712 or %o3, %lo(copyio_fault), %o3
5713 membar #Sync
5714 stn %o3, [THREAD_REG + T_LOFAULT]
5715 /*
5716 * Following code is for large copies. We know there is at
5717 * least FP_COPY bytes available. FP regs are used, so
5718 * we save registers and fp regs before starting
5719 */
5720 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
5721 or SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
5722 rd %fprs, %g1 ! check for unused fp
5723 ! if fprs.fef == 0, set it.
5724 ! Setting it when already set costs more than checking
5725 andcc %g1, FPRS_FEF, %g1 ! test FEF, fprs.du = fprs.dl = 0
5726 bz,pt %ncc, .ci_fp_unused
5727 mov ASI_USER, %asi
5728 BST_FP_TOSTACK(%o3)
5729 ba .ci_fp_ready
5730 .ci_fp_unused:
5731 prefetcha [%i0 + (1 * CACHE_LINE)]%asi, #one_read
5732 wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
5733 .ci_fp_ready:
5734 rd %gsr, %l5 ! save %gsr value
5735 andcc %i1, 1, %o3 ! is dest byte aligned
5736 bnz,pt %ncc, .ci_big_d1
5737 .ci_big_d1f: ! dest is now half word aligned
5738 andcc %i1, 2, %o3
5739 bnz,pt %ncc, .ci_big_d2
5740 .ci_big_d2f: ! dest is now word aligned
5741 andcc %i1, 4, %o3
5742 bnz,pt %ncc, .ci_big_d4
5743 .ci_big_d4f: ! dest is long word aligned
5744 andcc %i0, 7, %o3 ! is src long word aligned
5745 brnz,pt %o3, .ci_big_unal8
5746 prefetcha [%i0 + (2 * CACHE_LINE)]%asi, #one_read
5747 ! Src and dst are long word aligned
5748 ! align dst to 64 byte boundary
5749 andcc %i1, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned
5750 brz,pn %o3, .ci_al_to_64
5751 nop
5752 sub %o3, 64, %o3 ! %o3 has negative bytes to move
5753 add %i2, %o3, %i2 ! adjust remaining count
5754 andcc %o3, 8, %o4 ! odd long words to move?
5755 brz,pt %o4, .ci_al_to_16
5756 nop
5757 add %o3, 8, %o3
5758 ldxa [%i0]%asi, %o4
5759 add %i0, 8, %i0 ! increment src ptr
5760 add %i1, 8, %i1 ! increment dst ptr
5761 stx %o4, [%i1-8]
5762 ! Dest is aligned on 16 bytes, src 8 byte aligned
5763 .ci_al_to_16:
5764 andcc %o3, 0x30, %o4 ! pair of long words to move?
5765 brz,pt %o4, .ci_al_to_64
5766 nop
5767 .ci_al_mv_16:
5768 add %o3, 16, %o3
5769 ldxa [%i0]%asi, %o4
5770 stx %o4, [%i1]
5771 add %i0, 16, %i0 ! increment src ptr
5772 ldxa [%i0-8]%asi, %o4
5773 stx %o4, [%i1+8]
5774 andcc %o3, 0x30, %o4
5775 brnz,pt %o4, .ci_al_mv_16
5776 add %i1, 16, %i1 ! increment dst ptr
5777 ! Dest is aligned on 64 bytes, src 8 byte aligned
5778 .ci_al_to_64:
5779 ! Determine source alignment
5780 ! to correct 8 byte offset
5781 andcc %i0, 32, %o3
5782 brnz,pn %o3, .ci_aln_1
5783 andcc %i0, 16, %o3
5784 brnz,pn %o3, .ci_aln_01
5785 andcc %i0, 8, %o3
5786 brz,pn %o3, .ci_aln_000
5787 prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5788 ba .ci_aln_001
5789 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5790 .ci_aln_01:
5791 brnz,pn %o3, .ci_aln_011
5792 prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5793 ba .ci_aln_010
5794 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5795 .ci_aln_1:
5796 andcc %i0, 16, %o3
5797 brnz,pn %o3, .ci_aln_11
5798 andcc %i0, 8, %o3
5799 brnz,pn %o3, .ci_aln_101
5800 prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5801 ba .ci_aln_100
5802 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5803 .ci_aln_11:
5804 brz,pn %o3, .ci_aln_110
5805 prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5806
5807 .ci_aln_111:
5808 ! Alignment off by 8 bytes
5809 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5810 ldda [%i0]%asi, %d0
5811 add %i0, 8, %i0
5812 sub %i2, 8, %i2
5813 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
5814 and %i2, 0x7f, %i2 ! residue bytes in %i2
5815 sub %i1, %i0, %i1
5816 .ci_aln_111_loop:
5817 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
5818 subcc %o3, 64, %o3
5819 fmovd %d16, %d2
5820 fmovd %d18, %d4
5821 fmovd %d20, %d6
5822 fmovd %d22, %d8
5823 fmovd %d24, %d10
5824 fmovd %d26, %d12
5825 fmovd %d28, %d14
5826 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5827 stda %d0,[%i0+%i1]ASI_BLK_P
5828 add %i0, 64, %i0
5829 fmovd %d30, %d0
5830 bgt,pt %ncc, .ci_aln_111_loop
5831 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5832 add %i1, %i0, %i1
5833
5834 std %d0, [%i1]
5835 ba .ci_remain_stuff
5836 add %i1, 8, %i1
5837 ! END OF aln_111
5838
5839 .ci_aln_110:
5840 ! Alignment off by 16 bytes
5841 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5842 ldda [%i0]%asi, %d0
5843 ldda [%i0+8]%asi, %d2
5844 add %i0, 16, %i0
5845 sub %i2, 16, %i2
5846 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
5847 and %i2, 0x7f, %i2 ! residue bytes in %i2
5848 sub %i1, %i0, %i1
5849 .ci_aln_110_loop:
5850 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
5851 subcc %o3, 64, %o3
5852 fmovd %d16, %d4
5853 fmovd %d18, %d6
5854 fmovd %d20, %d8
5855 fmovd %d22, %d10
5856 fmovd %d24, %d12
5857 fmovd %d26, %d14
5858 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5859 stda %d0,[%i0+%i1]ASI_BLK_P
5860 add %i0, 64, %i0
5861 fmovd %d28, %d0
5862 fmovd %d30, %d2
5863 bgt,pt %ncc, .ci_aln_110_loop
5864 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5865 add %i1, %i0, %i1
5866
5867 std %d0, [%i1]
5868 std %d2, [%i1+8]
5869 ba .ci_remain_stuff
5870 add %i1, 16, %i1
5871 ! END OF aln_110
5872
5873 .ci_aln_101:
5874 ! Alignment off by 24 bytes
5875 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5876 ldda [%i0]%asi, %d0
5877 ldda [%i0+8]%asi, %d2
5878 ldda [%i0+16]%asi, %d4
5879 add %i0, 24, %i0
5880 sub %i2, 24, %i2
5881 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
5882 and %i2, 0x7f, %i2 ! residue bytes in %i2
5883 sub %i1, %i0, %i1
5884 .ci_aln_101_loop:
5885 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
5886 subcc %o3, 64, %o3
5887 fmovd %d16, %d6
5888 fmovd %d18, %d8
5889 fmovd %d20, %d10
5890 fmovd %d22, %d12
5891 fmovd %d24, %d14
5892 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5893 stda %d0,[%i0+%i1]ASI_BLK_P
5894 add %i0, 64, %i0
5895 fmovd %d26, %d0
5896 fmovd %d28, %d2
5897 fmovd %d30, %d4
5898 bgt,pt %ncc, .ci_aln_101_loop
5899 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5900 add %i1, %i0, %i1
5901
5902 std %d0, [%i1]
5903 std %d2, [%i1+8]
5904 std %d4, [%i1+16]
5905 ba .ci_remain_stuff
5906 add %i1, 24, %i1
5907 ! END OF aln_101
5908
5909 .ci_aln_100:
5910 ! Alignment off by 32 bytes
5911 ldda [%i0]%asi, %d0
5912 ldda [%i0+8]%asi, %d2
5913 ldda [%i0+16]%asi,%d4
5914 ldda [%i0+24]%asi,%d6
5915 add %i0, 32, %i0
5916 sub %i2, 32, %i2
5917 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
5918 and %i2, 0x7f, %i2 ! residue bytes in %i2
5919 sub %i1, %i0, %i1
5920 .ci_aln_100_loop:
5921 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
5922 subcc %o3, 64, %o3
5923 fmovd %d16, %d8
5924 fmovd %d18, %d10
5925 fmovd %d20, %d12
5926 fmovd %d22, %d14
5927 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5928 stda %d0,[%i0+%i1]ASI_BLK_P
5929 add %i0, 64, %i0
5930 fmovd %d24, %d0
5931 fmovd %d26, %d2
5932 fmovd %d28, %d4
5933 fmovd %d30, %d6
5934 bgt,pt %ncc, .ci_aln_100_loop
5935 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5936 add %i1, %i0, %i1
5937
5938 std %d0, [%i1]
5939 std %d2, [%i1+8]
5940 std %d4, [%i1+16]
5941 std %d6, [%i1+24]
5942 ba .ci_remain_stuff
5943 add %i1, 32, %i1
5944 ! END OF aln_100
5945
5946 .ci_aln_011:
5947 ! Alignment off by 40 bytes
5948 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5949 ldda [%i0]%asi, %d0
5950 ldda [%i0+8]%asi, %d2
5951 ldda [%i0+16]%asi, %d4
5952 ldda [%i0+24]%asi, %d6
5953 ldda [%i0+32]%asi, %d8
5954 add %i0, 40, %i0
5955 sub %i2, 40, %i2
5956 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
5957 and %i2, 0x7f, %i2 ! residue bytes in %i2
5958 sub %i1, %i0, %i1
5959 .ci_aln_011_loop:
5960 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
5961 subcc %o3, 64, %o3
5962 fmovd %d16, %d10
5963 fmovd %d18, %d12
5964 fmovd %d20, %d14
5965 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5966 stda %d0,[%i0+%i1]ASI_BLK_P
5967 add %i0, 64, %i0
5968 fmovd %d22, %d0
5969 fmovd %d24, %d2
5970 fmovd %d26, %d4
5971 fmovd %d28, %d6
5972 fmovd %d30, %d8
5973 bgt,pt %ncc, .ci_aln_011_loop
5974 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5975 add %i1, %i0, %i1
5976
5977 std %d0, [%i1]
5978 std %d2, [%i1+8]
5979 std %d4, [%i1+16]
5980 std %d6, [%i1+24]
5981 std %d8, [%i1+32]
5982 ba .ci_remain_stuff
5983 add %i1, 40, %i1
5984 ! END OF aln_011
5985
5986 .ci_aln_010:
5987 ! Alignment off by 48 bytes
5988 ldda [%i0]%asi, %d0
5989 ldda [%i0+8]%asi, %d2
5990 ldda [%i0+16]%asi, %d4
5991 ldda [%i0+24]%asi, %d6
5992 ldda [%i0+32]%asi, %d8
5993 ldda [%i0+40]%asi, %d10
5994 add %i0, 48, %i0
5995 sub %i2, 48, %i2
5996 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
5997 and %i2, 0x7f, %i2 ! residue bytes in %i2
5998 sub %i1, %i0, %i1
5999 .ci_aln_010_loop:
6000 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
6001 subcc %o3, 64, %o3
6002 fmovd %d16, %d12
6003 fmovd %d18, %d14
6004 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
6005 stda %d0,[%i0+%i1]ASI_BLK_P
6006 add %i0, 64, %i0
6007 fmovd %d20, %d0
6008 fmovd %d22, %d2
6009 fmovd %d24, %d4
6010 fmovd %d26, %d6
6011 fmovd %d28, %d8
6012 fmovd %d30, %d10
6013 bgt,pt %ncc, .ci_aln_010_loop
6014 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
6015 add %i1, %i0, %i1
6016
6017 std %d0, [%i1]
6018 std %d2, [%i1+8]
6019 std %d4, [%i1+16]
6020 std %d6, [%i1+24]
6021 std %d8, [%i1+32]
6022 std %d10, [%i1+40]
6023 ba .ci_remain_stuff
6024 add %i1, 48, %i1
6025 ! END OF aln_010
6026
6027 .ci_aln_001:
6028 ! Alignment off by 56 bytes
6029 ldda [%i0]%asi, %d0
6030 ldda [%i0+8]%asi, %d2
6031 ldda [%i0+16]%asi, %d4
6032 ldda [%i0+24]%asi, %d6
6033 ldda [%i0+32]%asi, %d8
6034 ldda [%i0+40]%asi, %d10
6035 ldda [%i0+48]%asi, %d12
6036 add %i0, 56, %i0
6037 sub %i2, 56, %i2
6038 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
6039 and %i2, 0x7f, %i2 ! residue bytes in %i2
6040 sub %i1, %i0, %i1
6041 .ci_aln_001_loop:
6042 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
6043 subcc %o3, 64, %o3
6044 fmovd %d16, %d14
6045 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
6046 stda %d0,[%i0+%i1]ASI_BLK_P
6047 add %i0, 64, %i0
6048 fmovd %d18, %d0
6049 fmovd %d20, %d2
6050 fmovd %d22, %d4
6051 fmovd %d24, %d6
6052 fmovd %d26, %d8
6053 fmovd %d28, %d10
6054 fmovd %d30, %d12
6055 bgt,pt %ncc, .ci_aln_001_loop
6056 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
6057 add %i1, %i0, %i1
6058
6059 std %d0, [%i1]
6060 std %d2, [%i1+8]
6061 std %d4, [%i1+16]
6062 std %d6, [%i1+24]
6063 std %d8, [%i1+32]
6064 std %d10, [%i1+40]
6065 std %d12, [%i1+48]
6066 ba .ci_remain_stuff
6067 add %i1, 56, %i1
6068 ! END OF aln_001
6069
6070 .ci_aln_000:
6071 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
6072 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
6073 and %i2, 0x7f, %i2 ! residue bytes in %i2
6074 sub %i1, %i0, %i1
6075 .ci_aln_000_loop:
6076 ldda [%i0]ASI_BLK_AIUS,%d0
6077 subcc %o3, 64, %o3
6078 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
6079 stda %d0,[%i0+%i1]ASI_BLK_P
6080 add %i0, 64, %i0
6081 bgt,pt %ncc, .ci_aln_000_loop
6082 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
6083 add %i1, %i0, %i1
6084
6085 ! END OF aln_000
6086
6087 .ci_remain_stuff:
6088 subcc %i2, 31, %i2 ! adjust length to allow cc test
6089 ble,pt %ncc, .ci_aln_31
6090 nop
6091 .ci_aln_32:
6092 ldxa [%i0]%asi, %o4 ! move 32 bytes
6093 subcc %i2, 32, %i2 ! decrement length count by 32
6094 stx %o4, [%i1]
6095 ldxa [%i0+8]%asi, %o4
6096 stx %o4, [%i1+8]
6097 ldxa [%i0+16]%asi, %o4
6098 add %i0, 32, %i0 ! increase src ptr by 32
6099 stx %o4, [%i1+16]
6100 ldxa [%i0-8]%asi, %o4
6101 add %i1, 32, %i1 ! increase dst ptr by 32
6102 bgu,pt %ncc, .ci_aln_32 ! repeat if at least 32 bytes left
6103 stx %o4, [%i1-8]
6104 .ci_aln_31:
6105 addcc %i2, 24, %i2 ! adjust count to be off by 7
6106 ble,pt %ncc, .ci_aln_7 ! skip if 7 or fewer bytes left
6107 nop !
6108 .ci_aln_15:
6109 ldxa [%i0]%asi, %o4 ! move 8 bytes
6110 add %i0, 8, %i0 ! increase src ptr by 8
6111 subcc %i2, 8, %i2 ! decrease count by 8
6112 add %i1, 8, %i1 ! increase dst ptr by 8
6113 bgu,pt %ncc, .ci_aln_15
6114 stx %o4, [%i1-8] !
6115 .ci_aln_7:
6116 addcc %i2, 7, %i2 ! finish adjustment of remaining count
6117 bz,pt %ncc, .ci_exit ! exit if finished
6118 cmp %i2, 4
6119 blt,pt %ncc, .ci_unaln3x ! skip if less than 4 bytes left
6120 nop !
6121 lda [%i0]%asi, %o4 ! move 4 bytes
6122 add %i0, 4, %i0 ! increase src ptr by 4
6123 add %i1, 4, %i1 ! increase dst ptr by 4
6124 subcc %i2, 4, %i2 ! decrease count by 4
6125 bnz .ci_unaln3x
6126 stw %o4, [%i1-4]
6127 ba .ci_exit
6128 nop
6129
6130 ! destination alignment code
6131 .ci_big_d1:
6132 lduba [%i0]%asi, %o4 ! move a byte
6133 add %i0, 1, %i0
6134 stb %o4, [%i1]
6135 add %i1, 1, %i1
6136 andcc %i1, 2, %o3
6137 bz,pt %ncc, .ci_big_d2f
6138 sub %i2, 1, %i2
6139 .ci_big_d2: ! dest is now at least half word aligned
6140 lduba [%i0]%asi, %o4 ! move a half-word (src align unknown)
6141 lduba [%i0+1]%asi, %o3
6142 add %i0, 2, %i0
6143 sll %o4, 8, %o4 ! position
6144 or %o4, %o3, %o4 ! merge
6145 sth %o4, [%i1]
6146 add %i1, 2, %i1
6147 andcc %i1, 4, %o3
6148 bz,pt %ncc, .ci_big_d4f
6149 sub %i2, 2, %i2
6150 .ci_big_d4: ! dest is at least word aligned
6151 nop
6152 lduba [%i0]%asi, %o4 ! move a word (src align unknown)
6153 lduba [%i0+1]%asi, %o3
6154 sll %o4, 24, %o4 ! position
6155 sll %o3, 16, %o3 ! position
6156 or %o4, %o3, %o3 ! merge
6157 lduba [%i0+2]%asi, %o4
6158 sll %o4, 8, %o4 ! position
6159 or %o4, %o3, %o3 ! merge
6160 lduba [%i0+3]%asi, %o4
6161 or %o4, %o3, %o4 ! merge
6162 stw %o4,[%i1] ! store four bytes
6163 add %i0, 4, %i0 ! adjust src by 4
6164 add %i1, 4, %i1 ! adjust dest by 4
6165 ba .ci_big_d4f
6166 sub %i2, 4, %i2 ! adjust count by 4
6167
6168
6169 ! Dst is on 8 byte boundary; src is not;
6170 .ci_big_unal8:
6171 andcc %i1, 0x3f, %o3 ! is dst 64-byte block aligned?
6172 bz %ncc, .ci_unalnsrc
6173 sub %o3, 64, %o3 ! %o3 will be multiple of 8
6174 neg %o3 ! bytes until dest is 64 byte aligned
6175 sub %i2, %o3, %i2 ! update cnt with bytes to be moved
6176 ! Move bytes according to source alignment
6177 andcc %i0, 0x1, %o4
6178 bnz %ncc, .ci_unalnbyte ! check for byte alignment
6179 nop
6180 andcc %i0, 2, %o4 ! check for half word alignment
6181 bnz %ncc, .ci_unalnhalf
6182 nop
6183 ! Src is word aligned, move bytes until dest 64 byte aligned
6184 .ci_unalnword:
6185 lda [%i0]%asi, %o4 ! load 4 bytes
6186 stw %o4, [%i1] ! and store 4 bytes
6187 lda [%i0+4]%asi, %o4 ! load 4 bytes
6188 add %i0, 8, %i0 ! increase src ptr by 8
6189 stw %o4, [%i1+4] ! and store 4 bytes
6190 subcc %o3, 8, %o3 ! decrease count by 8
6191 bnz %ncc, .ci_unalnword
6192 add %i1, 8, %i1 ! increase dst ptr by 8
6193 ba .ci_unalnsrc
6194 nop
6195
6196 ! Src is half-word aligned, move bytes until dest 64 byte aligned
6197 .ci_unalnhalf:
6198 lduha [%i0]%asi, %o4 ! load 2 bytes
6199 sllx %o4, 32, %i3 ! shift left
6200 lduwa [%i0+2]%asi, %o4
6201 or %o4, %i3, %i3
6202 sllx %i3, 16, %i3
6203 lduha [%i0+6]%asi, %o4
6204 or %o4, %i3, %i3
6205 stx %i3, [%i1]
6206 add %i0, 8, %i0
6207 subcc %o3, 8, %o3
6208 bnz %ncc, .ci_unalnhalf
6209 add %i1, 8, %i1
6210 ba .ci_unalnsrc
6211 nop
6212
6213 ! Src is Byte aligned, move bytes until dest 64 byte aligned
6214 .ci_unalnbyte:
6215 sub %i1, %i0, %i1 ! share pointer advance
6216 .ci_unalnbyte_loop:
6217 lduba [%i0]%asi, %o4
6218 sllx %o4, 56, %i3
6219 lduha [%i0+1]%asi, %o4
6220 sllx %o4, 40, %o4
6221 or %o4, %i3, %i3
6222 lduha [%i0+3]%asi, %o4
6223 sllx %o4, 24, %o4
6224 or %o4, %i3, %i3
6225 lduha [%i0+5]%asi, %o4
6226 sllx %o4, 8, %o4
6227 or %o4, %i3, %i3
6228 lduba [%i0+7]%asi, %o4
6229 or %o4, %i3, %i3
6230 stx %i3, [%i1+%i0]
6231 subcc %o3, 8, %o3
6232 bnz %ncc, .ci_unalnbyte_loop
6233 add %i0, 8, %i0
6234 add %i1,%i0, %i1 ! restore pointer
6235
6236 ! Destination is now block (64 byte aligned), src is not 8 byte aligned
6237 .ci_unalnsrc:
6238 andn %i2, 0x3f, %i3 ! %i3 is multiple of block size
6239 and %i2, 0x3f, %i2 ! residue bytes in %i2
6240 add %i2, 64, %i2 ! Insure we don't load beyond
6241 sub %i3, 64, %i3 ! end of source buffer
6242
6243 andn %i0, 0x3f, %o4 ! %o4 has block aligned src address
6244 prefetcha [%o4 + (3 * CACHE_LINE)]%asi, #one_read
6245 alignaddr %i0, %g0, %g0 ! generate %gsr
6246 add %i0, %i3, %i0 ! advance %i0 to after blocks
6247 !
6248 ! Determine source alignment to correct 8 byte offset
6249 andcc %i0, 0x20, %o3
6250 brnz,pn %o3, .ci_unaln_1
6251 andcc %i0, 0x10, %o3
6252 brnz,pn %o3, .ci_unaln_01
6253 andcc %i0, 0x08, %o3
6254 brz,a %o3, .ci_unaln_000
6255 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6256 ba .ci_unaln_001
6257 nop
6258 .ci_unaln_01:
6259 brnz,a %o3, .ci_unaln_011
6260 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6261 ba .ci_unaln_010
6262 nop
6263 .ci_unaln_1:
6264 brnz,pn %o3, .ci_unaln_11
6265 andcc %i0, 0x08, %o3
6266 brnz,a %o3, .ci_unaln_101
6267 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6268 ba .ci_unaln_100
6269 nop
6270 .ci_unaln_11:
6271 brz,pn %o3, .ci_unaln_110
6272 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
6273
6274 .ci_unaln_111:
6275 ldda [%o4+56]%asi, %d14
6276 .ci_unaln_111_loop:
6277 add %o4, 64, %o4
6278 ldda [%o4]ASI_BLK_AIUS, %d16
6279 faligndata %d14, %d16, %d48
6280 faligndata %d16, %d18, %d50
6281 faligndata %d18, %d20, %d52
6282 faligndata %d20, %d22, %d54
6283 faligndata %d22, %d24, %d56
6284 faligndata %d24, %d26, %d58
6285 faligndata %d26, %d28, %d60
6286 faligndata %d28, %d30, %d62
6287 fmovd %d30, %d14
6288 stda %d48, [%i1]ASI_BLK_P
6289 subcc %i3, 64, %i3
6290 add %i1, 64, %i1
6291 bgu,pt %ncc, .ci_unaln_111_loop
6292 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6293 ba .ci_unaln_done
6294 nop
6295
6296 .ci_unaln_110:
6297 ldda [%o4+48]%asi, %d12
6298 ldda [%o4+56]%asi, %d14
6299 .ci_unaln_110_loop:
6300 add %o4, 64, %o4
6301 ldda [%o4]ASI_BLK_AIUS, %d16
6302 faligndata %d12, %d14, %d48
6303 faligndata %d14, %d16, %d50
6304 faligndata %d16, %d18, %d52
6305 faligndata %d18, %d20, %d54
6306 faligndata %d20, %d22, %d56
6307 faligndata %d22, %d24, %d58
6308 faligndata %d24, %d26, %d60
6309 faligndata %d26, %d28, %d62
6310 fmovd %d28, %d12
6311 fmovd %d30, %d14
6312 stda %d48, [%i1]ASI_BLK_P
6313 subcc %i3, 64, %i3
6314 add %i1, 64, %i1
6315 bgu,pt %ncc, .ci_unaln_110_loop
6316 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6317 ba .ci_unaln_done
6318 nop
6319
6320 .ci_unaln_101:
6321 ldda [%o4+40]%asi, %d10
6322 ldda [%o4+48]%asi, %d12
6323 ldda [%o4+56]%asi, %d14
6324 .ci_unaln_101_loop:
6325 add %o4, 64, %o4
6326 ldda [%o4]ASI_BLK_AIUS, %d16
6327 faligndata %d10, %d12, %d48
6328 faligndata %d12, %d14, %d50
6329 faligndata %d14, %d16, %d52
6330 faligndata %d16, %d18, %d54
6331 faligndata %d18, %d20, %d56
6332 faligndata %d20, %d22, %d58
6333 faligndata %d22, %d24, %d60
6334 faligndata %d24, %d26, %d62
6335 fmovd %d26, %d10
6336 fmovd %d28, %d12
6337 fmovd %d30, %d14
6338 stda %d48, [%i1]ASI_BLK_P
6339 subcc %i3, 64, %i3
6340 add %i1, 64, %i1
6341 bgu,pt %ncc, .ci_unaln_101_loop
6342 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6343 ba .ci_unaln_done
6344 nop
6345
6346 .ci_unaln_100:
6347 ldda [%o4+32]%asi, %d8
6348 ldda [%o4+40]%asi, %d10
6349 ldda [%o4+48]%asi, %d12
6350 ldda [%o4+56]%asi, %d14
6351 .ci_unaln_100_loop:
6352 add %o4, 64, %o4
6353 ldda [%o4]ASI_BLK_AIUS, %d16
6354 faligndata %d8, %d10, %d48
6355 faligndata %d10, %d12, %d50
6356 faligndata %d12, %d14, %d52
6357 faligndata %d14, %d16, %d54
6358 faligndata %d16, %d18, %d56
6359 faligndata %d18, %d20, %d58
6360 faligndata %d20, %d22, %d60
6361 faligndata %d22, %d24, %d62
6362 fmovd %d24, %d8
6363 fmovd %d26, %d10
6364 fmovd %d28, %d12
6365 fmovd %d30, %d14
6366 stda %d48, [%i1]ASI_BLK_P
6367 subcc %i3, 64, %i3
6368 add %i1, 64, %i1
6369 bgu,pt %ncc, .ci_unaln_100_loop
6370 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6371 ba .ci_unaln_done
6372 nop
6373
6374 .ci_unaln_011:
6375 ldda [%o4+24]%asi, %d6
6376 ldda [%o4+32]%asi, %d8
6377 ldda [%o4+40]%asi, %d10
6378 ldda [%o4+48]%asi, %d12
6379 ldda [%o4+56]%asi, %d14
6380 .ci_unaln_011_loop:
6381 add %o4, 64, %o4
6382 ldda [%o4]ASI_BLK_AIUS, %d16
6383 faligndata %d6, %d8, %d48
6384 faligndata %d8, %d10, %d50
6385 faligndata %d10, %d12, %d52
6386 faligndata %d12, %d14, %d54
6387 faligndata %d14, %d16, %d56
6388 faligndata %d16, %d18, %d58
6389 faligndata %d18, %d20, %d60
6390 faligndata %d20, %d22, %d62
6391 fmovd %d22, %d6
6392 fmovd %d24, %d8
6393 fmovd %d26, %d10
6394 fmovd %d28, %d12
6395 fmovd %d30, %d14
6396 stda %d48, [%i1]ASI_BLK_P
6397 subcc %i3, 64, %i3
6398 add %i1, 64, %i1
6399 bgu,pt %ncc, .ci_unaln_011_loop
6400 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6401 ba .ci_unaln_done
6402 nop
6403
6404 .ci_unaln_010:
6405 ldda [%o4+16]%asi, %d4
6406 ldda [%o4+24]%asi, %d6
6407 ldda [%o4+32]%asi, %d8
6408 ldda [%o4+40]%asi, %d10
6409 ldda [%o4+48]%asi, %d12
6410 ldda [%o4+56]%asi, %d14
6411 .ci_unaln_010_loop:
6412 add %o4, 64, %o4
6413 ldda [%o4]ASI_BLK_AIUS, %d16
6414 faligndata %d4, %d6, %d48
6415 faligndata %d6, %d8, %d50
6416 faligndata %d8, %d10, %d52
6417 faligndata %d10, %d12, %d54
6418 faligndata %d12, %d14, %d56
6419 faligndata %d14, %d16, %d58
6420 faligndata %d16, %d18, %d60
6421 faligndata %d18, %d20, %d62
6422 fmovd %d20, %d4
6423 fmovd %d22, %d6
6424 fmovd %d24, %d8
6425 fmovd %d26, %d10
6426 fmovd %d28, %d12
6427 fmovd %d30, %d14
6428 stda %d48, [%i1]ASI_BLK_P
6429 subcc %i3, 64, %i3
6430 add %i1, 64, %i1
6431 bgu,pt %ncc, .ci_unaln_010_loop
6432 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6433 ba .ci_unaln_done
6434 nop
6435
6436 .ci_unaln_001:
6437 ldda [%o4+8]%asi, %d2
6438 ldda [%o4+16]%asi, %d4
6439 ldda [%o4+24]%asi, %d6
6440 ldda [%o4+32]%asi, %d8
6441 ldda [%o4+40]%asi, %d10
6442 ldda [%o4+48]%asi, %d12
6443 ldda [%o4+56]%asi, %d14
6444 .ci_unaln_001_loop:
6445 add %o4, 64, %o4
6446 ldda [%o4]ASI_BLK_AIUS, %d16
6447 faligndata %d2, %d4, %d48
6448 faligndata %d4, %d6, %d50
6449 faligndata %d6, %d8, %d52
6450 faligndata %d8, %d10, %d54
6451 faligndata %d10, %d12, %d56
6452 faligndata %d12, %d14, %d58
6453 faligndata %d14, %d16, %d60
6454 faligndata %d16, %d18, %d62
6455 fmovd %d18, %d2
6456 fmovd %d20, %d4
6457 fmovd %d22, %d6
6458 fmovd %d24, %d8
6459 fmovd %d26, %d10
6460 fmovd %d28, %d12
6461 fmovd %d30, %d14
6462 stda %d48, [%i1]ASI_BLK_P
6463 subcc %i3, 64, %i3
6464 add %i1, 64, %i1
6465 bgu,pt %ncc, .ci_unaln_001_loop
6466 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6467 ba .ci_unaln_done
6468 nop
6469
6470 .ci_unaln_000:
6471 ldda [%o4]ASI_BLK_AIUS, %d0
6472 .ci_unaln_000_loop:
6473 add %o4, 64, %o4
6474 ldda [%o4]ASI_BLK_AIUS, %d16
6475 faligndata %d0, %d2, %d48
6476 faligndata %d2, %d4, %d50
6477 faligndata %d4, %d6, %d52
6478 faligndata %d6, %d8, %d54
6479 faligndata %d8, %d10, %d56
6480 faligndata %d10, %d12, %d58
6481 faligndata %d12, %d14, %d60
6482 faligndata %d14, %d16, %d62
6483 fmovd %d16, %d0
6484 fmovd %d18, %d2
6485 fmovd %d20, %d4
6486 fmovd %d22, %d6
6487 fmovd %d24, %d8
6488 fmovd %d26, %d10
6489 fmovd %d28, %d12
6490 fmovd %d30, %d14
6491 stda %d48, [%i1]ASI_BLK_P
6492 subcc %i3, 64, %i3
6493 add %i1, 64, %i1
6494 bgu,pt %ncc, .ci_unaln_000_loop
6495 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6496
6497 .ci_unaln_done:
6498 ! Handle trailing bytes, 64 to 127
6499 ! Dest long word aligned, Src not long word aligned
6500 cmp %i2, 15
6501 bleu %ncc, .ci_unaln_short
6502
6503 andn %i2, 0x7, %i3 ! %i3 is multiple of 8
6504 and %i2, 0x7, %i2 ! residue bytes in %i2
6505 add %i2, 8, %i2
6506 sub %i3, 8, %i3 ! insure we don't load past end of src
6507 andn %i0, 0x7, %o4 ! %o4 has long word aligned src address
6508 add %i0, %i3, %i0 ! advance %i0 to after multiple of 8
6509 ldda [%o4]%asi, %d0 ! fetch partial word
6510 .ci_unaln_by8:
6511 ldda [%o4+8]%asi, %d2
6512 add %o4, 8, %o4
6513 faligndata %d0, %d2, %d16
6514 subcc %i3, 8, %i3
6515 std %d16, [%i1]
6516 fmovd %d2, %d0
6517 bgu,pt %ncc, .ci_unaln_by8
6518 add %i1, 8, %i1
6519
6520 .ci_unaln_short:
6521 cmp %i2, 8
6522 blt,pt %ncc, .ci_unalnfin
6523 nop
6524 lduba [%i0]%asi, %o4
6525 sll %o4, 24, %o3
6526 lduba [%i0+1]%asi, %o4
6527 sll %o4, 16, %o4
6528 or %o4, %o3, %o3
6529 lduba [%i0+2]%asi, %o4
6530 sll %o4, 8, %o4
6531 or %o4, %o3, %o3
6532 lduba [%i0+3]%asi, %o4
6533 or %o4, %o3, %o3
6534 stw %o3, [%i1]
6535 lduba [%i0+4]%asi, %o4
6536 sll %o4, 24, %o3
6537 lduba [%i0+5]%asi, %o4
6538 sll %o4, 16, %o4
6539 or %o4, %o3, %o3
6540 lduba [%i0+6]%asi, %o4
6541 sll %o4, 8, %o4
6542 or %o4, %o3, %o3
6543 lduba [%i0+7]%asi, %o4
6544 or %o4, %o3, %o3
6545 stw %o3, [%i1+4]
6546 add %i0, 8, %i0
6547 add %i1, 8, %i1
6548 sub %i2, 8, %i2
6549 .ci_unalnfin:
6550 cmp %i2, 4
6551 blt,pt %ncc, .ci_unalnz
6552 tst %i2
6553 lduba [%i0]%asi, %o3 ! read byte
6554 subcc %i2, 4, %i2 ! reduce count by 4
6555 sll %o3, 24, %o3 ! position
6556 lduba [%i0+1]%asi, %o4
6557 sll %o4, 16, %o4 ! position
6558 or %o4, %o3, %o3 ! merge
6559 lduba [%i0+2]%asi, %o4
6560 sll %o4, 8, %o4 ! position
6561 or %o4, %o3, %o3 ! merge
6562 add %i1, 4, %i1 ! advance dst by 4
6563 lduba [%i0+3]%asi, %o4
6564 add %i0, 4, %i0 ! advance src by 4
6565 or %o4, %o3, %o4 ! merge
6566 bnz,pt %ncc, .ci_unaln3x
6567 stw %o4, [%i1-4]
6568 ba .ci_exit
6569 nop
6570 .ci_unalnz:
6571 bz,pt %ncc, .ci_exit
6572 wr %l5, %g0, %gsr ! restore %gsr
6573 .ci_unaln3x: ! Exactly 1, 2, or 3 bytes remain
6574 subcc %i2, 1, %i2 ! reduce count for cc test
6575 lduba [%i0]%asi, %o4 ! load one byte
6576 bz,pt %ncc, .ci_exit
6577 stb %o4, [%i1] ! store one byte
6578 lduba [%i0+1]%asi, %o4 ! load second byte
6579 subcc %i2, 1, %i2
6580 bz,pt %ncc, .ci_exit
6581 stb %o4, [%i1+1] ! store second byte
6582 lduba [%i0+2]%asi, %o4 ! load third byte
6583 stb %o4, [%i1+2] ! store third byte
6584 .ci_exit:
6585 brnz %g1, .ci_fp_restore
6586 nop
6587 FZERO
6588 wr %g1, %g0, %fprs
6589 ba,pt %ncc, .ci_ex2
6590 membar #Sync
6591 .ci_fp_restore:
6592 BLD_FP_FROMSTACK(%o4)
6593 .ci_ex2:
6594 andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
6595 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
6596 ret
6597 restore %g0, 0, %o0
6598
6599 .copyin_err:
6600 ldn [THREAD_REG + T_COPYOPS], %o4
6601 brz %o4, 2f
6602 nop
6603 ldn [%o4 + CP_COPYIN], %g2
6604 jmp %g2
6605 nop
6606 2:
6607 retl
6608 mov -1, %o0
6609
6610 #else /* NIAGARA_IMPL */
6611 .do_copyin:
6612 !
6613 ! Check the length and bail if zero.
6614 !
6615 tst %o2
6616 bnz,pt %ncc, 1f
6617 nop
6618 retl
6619 clr %o0
6620 1:
6621 sethi %hi(copyio_fault), %o4
6622 or %o4, %lo(copyio_fault), %o4
6623 sethi %hi(copyio_fault_nowindow), %o3
6624 ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
6625 or %o3, %lo(copyio_fault_nowindow), %o3
6626 membar #Sync
6627 stn %o3, [THREAD_REG + T_LOFAULT]
6628
6629 mov %o0, SAVE_SRC
6630 mov %o1, SAVE_DST
6631 mov %o2, SAVE_COUNT
6632
6633 !
6634 ! Check to see if we're more than SMALL_LIMIT.
6635 !
6636 subcc %o2, SMALL_LIMIT, %o3
6637 bgu,a,pt %ncc, .dci_ns
6638 or %o0, %o1, %o3
6639 !
6640 ! What was previously ".small_copyin"
6641 !
6642 .dcibcp:
6643 sub %g0, %o2, %o3 ! setup for copy loop
6644 add %o0, %o2, %o0
6645 add %o1, %o2, %o1
6646 ba,pt %ncc, .dcicl
6647 lduba [%o0 + %o3]ASI_USER, %o4
6648 !
6649 ! %o0 and %o1 point at the end and remain pointing at the end
6650 ! of their buffers. We pull things out by adding %o3 (which is
6651 ! the negation of the length) to the buffer end which gives us
6652 ! the curent location in the buffers. By incrementing %o3 we walk
6653 ! through both buffers without having to bump each buffer's
6654 ! pointer. A very fast 4 instruction loop.
6655 !
6656 .align 16
6657 .dcicl:
6658 stb %o4, [%o1 + %o3]
6659 inccc %o3
6660 bl,a,pt %ncc, .dcicl
6661 lduba [%o0 + %o3]ASI_USER, %o4
6662 !
6663 ! We're done. Go home.
6664 !
6665 membar #Sync
6666 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
6667 retl
6668 clr %o0
6669 !
6670 ! Try aligned copies from here.
6671 !
6672 .dci_ns:
6673 !
6674 ! See if we're single byte aligned. If we are, check the
6675 ! limit for single byte copies. If we're smaller, or equal,
6676 ! bounce to the byte for byte copy loop. Otherwise do it in
6677 ! HW (if enabled).
6678 !
6679 btst 1, %o3
6680 bz,a,pt %icc, .dcih8
6681 btst 7, %o3
6682 !
6683 ! We're single byte aligned.
6684 !
6685 sethi %hi(hw_copy_limit_1), %o3
6686 ld [%o3 + %lo(hw_copy_limit_1)], %o3
6687 !
6688 ! Is HW copy on? If not do everything byte for byte.
6689 !
6690 tst %o3
6691 bz,pn %icc, .dcibcp
6692 subcc %o3, %o2, %o3
6693 !
6694 ! Are we bigger than the HW limit? If not
6695 ! go to byte for byte.
6696 !
6697 bge,pt %ncc, .dcibcp
6698 nop
6699 !
6700 ! We're big enough and copy is on. Do it with HW.
6701 !
6702 ba,pt %ncc, .big_copyin
6703 nop
6704 .dcih8:
6705 !
6706 ! 8 byte aligned?
6707 !
6708 bnz,a %ncc, .dcih4
6709 btst 3, %o3
6710 !
6711 ! We're eight byte aligned.
6712 !
6713 sethi %hi(hw_copy_limit_8), %o3
6714 ld [%o3 + %lo(hw_copy_limit_8)], %o3
6715 !
6716 ! Is HW assist on? If not, do it with the aligned copy.
6717 !
6718 tst %o3
6719 bz,pn %icc, .dcis8
6720 subcc %o3, %o2, %o3
6721 bge %ncc, .dcis8
6722 nop
6723 ba,pt %ncc, .big_copyin
6724 nop
6725 .dcis8:
6726 !
6727 ! Housekeeping for copy loops. Uses same idea as in the byte for
6728 ! byte copy loop above.
6729 !
6730 add %o0, %o2, %o0
6731 add %o1, %o2, %o1
6732 sub %g0, %o2, %o3
6733 ba,pt %ncc, .didebc
6734 srl %o2, 3, %o2 ! Number of 8 byte chunks to copy
6735 !
6736 ! 4 byte aligned?
6737 !
6738 .dcih4:
6739 bnz %ncc, .dcih2
6740 sethi %hi(hw_copy_limit_4), %o3
6741 ld [%o3 + %lo(hw_copy_limit_4)], %o3
6742 !
6743 ! Is HW assist on? If not, do it with the aligned copy.
6744 !
6745 tst %o3
6746 bz,pn %icc, .dcis4
6747 subcc %o3, %o2, %o3
6748 !
6749 ! We're negative if our size is less than or equal to hw_copy_limit_4.
6750 !
6751 bge %ncc, .dcis4
6752 nop
6753 ba,pt %ncc, .big_copyin
6754 nop
6755 .dcis4:
6756 !
6757 ! Housekeeping for copy loops. Uses same idea as in the byte
6758 ! for byte copy loop above.
6759 !
6760 add %o0, %o2, %o0
6761 add %o1, %o2, %o1
6762 sub %g0, %o2, %o3
6763 ba,pt %ncc, .didfbc
6764 srl %o2, 2, %o2 ! Number of 4 byte chunks to copy
6765 .dcih2:
6766 !
6767 ! We're two byte aligned. Check for "smallness"
6768 ! done in delay at .dcih4
6769 !
6770 bleu,pt %ncc, .dcis2
6771 sethi %hi(hw_copy_limit_2), %o3
6772 ld [%o3 + %lo(hw_copy_limit_2)], %o3
6773 !
6774 ! Is HW assist on? If not, do it with the aligned copy.
6775 !
6776 tst %o3
6777 bz,pn %icc, .dcis2
6778 subcc %o3, %o2, %o3
6779 !
6780 ! Are we larger than the HW limit?
6781 !
6782 bge %ncc, .dcis2
6783 nop
6784 !
6785 ! HW assist is on and we're large enough to use it.
6786 !
6787 ba,pt %ncc, .big_copyin
6788 nop
6789 !
6790 ! Housekeeping for copy loops. Uses same idea as in the byte
6791 ! for byte copy loop above.
6792 !
6793 .dcis2:
6794 add %o0, %o2, %o0
6795 add %o1, %o2, %o1
6796 sub %g0, %o2, %o3
6797 ba,pt %ncc, .didtbc
6798 srl %o2, 1, %o2 ! Number of 2 byte chunks to copy
6799 !
6800 .small_copyin:
6801 !
6802 ! Why are we doing this AGAIN? There are certain conditions in
6803 ! big copyin that will cause us to forgo the HW assisted copys
6804 ! and bounce back to a non-hw assisted copy. This dispatches
6805 ! those copies. Note that we branch around this in the main line
6806 ! code.
6807 !
6808 ! We make no check for limits or HW enablement here. We've
6809 ! already been told that we're a poster child so just go off
6810 ! and do it.
6811 !
6812 or %o0, %o1, %o3
6813 btst 1, %o3
6814 bnz %icc, .dcibcp ! Most likely
6815 btst 7, %o3
6816 bz %icc, .dcis8
6817 btst 3, %o3
6818 bz %icc, .dcis4
6819 nop
6820 ba,pt %ncc, .dcis2
6821 nop
6822 !
6823 ! Eight byte aligned copies. A steal from the original .small_copyin
6824 ! with modifications. %o2 is number of 8 byte chunks to copy. When
6825 ! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
6826 ! to copy.
6827 !
6828 .align 32
6829 .didebc:
6830 ldxa [%o0 + %o3]ASI_USER, %o4
6831 deccc %o2
6832 stx %o4, [%o1 + %o3]
6833 bg,pt %ncc, .didebc
6834 addcc %o3, 8, %o3
6835 !
6836 ! End of copy loop. Most 8 byte aligned copies end here.
6837 !
6838 bz,pt %ncc, .dcifh
6839 nop
6840 !
6841 ! Something is left. Do it byte for byte.
6842 !
6843 ba,pt %ncc, .dcicl
6844 lduba [%o0 + %o3]ASI_USER, %o4
6845 !
6846 ! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
6847 !
6848 .align 32
6849 .didfbc:
6850 lduwa [%o0 + %o3]ASI_USER, %o4
6851 deccc %o2
6852 st %o4, [%o1 + %o3]
6853 bg,pt %ncc, .didfbc
6854 addcc %o3, 4, %o3
6855 !
6856 ! End of copy loop. Most 4 byte aligned copies end here.
6857 !
6858 bz,pt %ncc, .dcifh
6859 nop
6860 !
6861 ! Something is left. Do it byte for byte.
6862 !
6863 ba,pt %ncc, .dcicl
6864 lduba [%o0 + %o3]ASI_USER, %o4
6865 !
6866 ! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
6867 ! copy.
6868 !
6869 .align 32
6870 .didtbc:
6871 lduha [%o0 + %o3]ASI_USER, %o4
6872 deccc %o2
6873 sth %o4, [%o1 + %o3]
6874 bg,pt %ncc, .didtbc
6875 addcc %o3, 2, %o3
6876 !
6877 ! End of copy loop. Most 2 byte aligned copies end here.
6878 !
6879 bz,pt %ncc, .dcifh
6880 nop
6881 !
6882 ! Deal with the last byte
6883 !
6884 lduba [%o0 + %o3]ASI_USER, %o4
6885 stb %o4, [%o1 + %o3]
6886 .dcifh:
6887 membar #Sync
6888 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
6889 retl
6890 clr %o0
6891
6892 .big_copyin:
6893 ! We're going off to do a block copy.
6894 ! Switch fault hendlers and grab a window. We
6895 ! don't do a membar #Sync since we've done only
6896 ! kernel data to this point.
6897 stn %o4, [THREAD_REG + T_LOFAULT]
6898
6899 ! Copy in that reach here are larger than 256 bytes. The
6900 ! hw_copy_limit_1 is set to 256. Never set this limit less
6901 ! 128 bytes.
6902 save %sp, -SA(MINFRAME), %sp
6903 .do_blockcopyin:
6904
6905 ! Swap src/dst since the code below is memcpy code
6906 ! and memcpy/bcopy have different calling sequences
6907 mov %i1, %i5
6908 mov %i0, %i1
6909 mov %i5, %i0
6910
6911 ! Block (64 bytes) align the destination.
6912 andcc %i0, 0x3f, %i3 ! is dst block aligned
6913 bz %ncc, copyin_blalign ! dst already block aligned
6914 sub %i3, 0x40, %i3
6915 neg %i3 ! bytes till dst 64 bytes aligned
6916 sub %i2, %i3, %i2 ! update i2 with new count
6917
6918 ! Based on source and destination alignment do
6919 ! either 8 bytes, 4 bytes, 2 bytes or byte copy.
6920
6921 ! Is dst & src 8B aligned
6922 or %i0, %i1, %o2
6923 andcc %o2, 0x7, %g0
6924 bz %ncc, .ci_alewdcp
6925 nop
6926
6927 ! Is dst & src 4B aligned
6928 andcc %o2, 0x3, %g0
6929 bz %ncc, .ci_alwdcp
6930 nop
6931
6932 ! Is dst & src 2B aligned
6933 andcc %o2, 0x1, %g0
6934 bz %ncc, .ci_alhlfwdcp
6935 nop
6936
6937 ! 1B aligned
6938 1: lduba [%i1]ASI_USER, %o2
6939 stb %o2, [%i0]
6940 inc %i1
6941 deccc %i3
6942 bgu,pt %ncc, 1b
6943 inc %i0
6944
6945 ba copyin_blalign
6946 nop
6947
6948 ! dst & src 4B aligned
6949 .ci_alwdcp:
6950 lda [%i1]ASI_USER, %o2
6951 st %o2, [%i0]
6952 add %i1, 0x4, %i1
6953 subcc %i3, 0x4, %i3
6954 bgu,pt %ncc, .ci_alwdcp
6955 add %i0, 0x4, %i0
6956
6957 ba copyin_blalign
6958 nop
6959
6960 ! dst & src 2B aligned
6961 .ci_alhlfwdcp:
6962 lduha [%i1]ASI_USER, %o2
6963 stuh %o2, [%i0]
6964 add %i1, 0x2, %i1
6965 subcc %i3, 0x2, %i3
6966 bgu,pt %ncc, .ci_alhlfwdcp
6967 add %i0, 0x2, %i0
6968
6969 ba copyin_blalign
6970 nop
6971
6972 ! dst & src 8B aligned
6973 .ci_alewdcp:
6974 ldxa [%i1]ASI_USER, %o2
6975 stx %o2, [%i0]
6976 add %i1, 0x8, %i1
6977 subcc %i3, 0x8, %i3
6978 bgu,pt %ncc, .ci_alewdcp
6979 add %i0, 0x8, %i0
6980
6981 copyin_blalign:
6982 andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size
6983 sub %i2, %i3, %i2 ! Residue bytes in %i2
6984
6985 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
6986
6987 andcc %i1, 0xf, %o2 ! is src quadword aligned
6988 bz,pn %xcc, .ci_blkcpy ! src offset in %o2 (last 4-bits)
6989 nop
6990 cmp %o2, 0x8
6991 bg .ci_upper_double
6992 nop
6993 bl .ci_lower_double
6994 nop
6995
6996 ! Falls through when source offset is equal to 8 i.e.
6997 ! source is double word aligned.
6998 ! In this case no shift/merge of data is required
6999
7000 sub %i1, %o2, %i1 ! align the src at 16 bytes.
7001 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
7002 prefetcha [%l0]ASI_USER, #one_read
7003 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7004 add %l0, 0x40, %l0
7005 .ci_loop0:
7006 add %i1, 0x10, %i1
7007 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7008
7009 prefetcha [%l0]ASI_USER, #one_read
7010
7011 stxa %l3, [%i0+0x0]%asi
7012 stxa %l4, [%i0+0x8]%asi
7013
7014 add %i1, 0x10, %i1
7015 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7016
7017 stxa %l5, [%i0+0x10]%asi
7018 stxa %l2, [%i0+0x18]%asi
7019
7020 add %i1, 0x10, %i1
7021 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7022
7023 stxa %l3, [%i0+0x20]%asi
7024 stxa %l4, [%i0+0x28]%asi
7025
7026 add %i1, 0x10, %i1
7027 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7028
7029 stxa %l5, [%i0+0x30]%asi
7030 stxa %l2, [%i0+0x38]%asi
7031
7032 add %l0, 0x40, %l0
7033 subcc %i3, 0x40, %i3
7034 bgu,pt %xcc, .ci_loop0
7035 add %i0, 0x40, %i0
7036 ba .ci_blkdone
7037 add %i1, %o2, %i1 ! increment the source by src offset
7038 ! the src offset was stored in %o2
7039
7040 .ci_lower_double:
7041
7042 sub %i1, %o2, %i1 ! align the src at 16 bytes.
7043 sll %o2, 3, %o0 ! %o0 left shift
7044 mov 0x40, %o1
7045 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
7046 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
7047 prefetcha [%l0]ASI_USER, #one_read
7048 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 ! partial data in %l2
7049 ! and %l3 has complete
7050 ! data
7051 add %l0, 0x40, %l0
7052 .ci_loop1:
7053 add %i1, 0x10, %i1
7054 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 ! %l4 has partial data
7055 ! for this read.
7056 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4
7057 ! into %l2 and %l3
7058
7059 prefetcha [%l0]ASI_USER, #one_read
7060
7061 stxa %l2, [%i0+0x0]%asi
7062 stxa %l3, [%i0+0x8]%asi
7063
7064 add %i1, 0x10, %i1
7065 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7066 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and
7067 ! %l4 from previous read
7068 ! into %l4 and %l5
7069 stxa %l4, [%i0+0x10]%asi
7070 stxa %l5, [%i0+0x18]%asi
7071
7072 ! Repeat the same for next 32 bytes.
7073
7074 add %i1, 0x10, %i1
7075 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7076 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
7077
7078 stxa %l2, [%i0+0x20]%asi
7079 stxa %l3, [%i0+0x28]%asi
7080
7081 add %i1, 0x10, %i1
7082 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7083 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
7084
7085 stxa %l4, [%i0+0x30]%asi
7086 stxa %l5, [%i0+0x38]%asi
7087
7088 add %l0, 0x40, %l0
7089 subcc %i3, 0x40, %i3
7090 bgu,pt %xcc, .ci_loop1
7091 add %i0, 0x40, %i0
7092 ba .ci_blkdone
7093 add %i1, %o2, %i1 ! increment the source by src offset
7094 ! the src offset was stored in %o2
7095
7096 .ci_upper_double:
7097
7098 sub %i1, %o2, %i1 ! align the src at 16 bytes.
7099 sub %o2, 0x8, %o0
7100 sll %o0, 3, %o0 ! %o0 left shift
7101 mov 0x40, %o1
7102 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
7103 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
7104 prefetcha [%l0]ASI_USER, #one_read
7105 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 ! partial data in %l3
7106 ! for this read and
7107 ! no data in %l2
7108 add %l0, 0x40, %l0
7109 .ci_loop2:
7110 add %i1, 0x10, %i1
7111 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 ! %l4 has complete data
7112 ! and %l5 has partial
7113 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5
7114 ! into %l3 and %l4
7115 prefetcha [%l0]ASI_USER, #one_read
7116
7117 stxa %l3, [%i0+0x0]%asi
7118 stxa %l4, [%i0+0x8]%asi
7119
7120 add %i1, 0x10, %i1
7121 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7122 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with
7123 ! %l5 from previous read
7124 ! into %l5 and %l2
7125
7126 stxa %l5, [%i0+0x10]%asi
7127 stxa %l2, [%i0+0x18]%asi
7128
7129 ! Repeat the same for next 32 bytes.
7130
7131 add %i1, 0x10, %i1
7132 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7133 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
7134
7135 stxa %l3, [%i0+0x20]%asi
7136 stxa %l4, [%i0+0x28]%asi
7137
7138 add %i1, 0x10, %i1
7139 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7140 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
7141
7142 stxa %l5, [%i0+0x30]%asi
7143 stxa %l2, [%i0+0x38]%asi
7144
7145 add %l0, 0x40, %l0
7146 subcc %i3, 0x40, %i3
7147 bgu,pt %xcc, .ci_loop2
7148 add %i0, 0x40, %i0
7149 ba .ci_blkdone
7150 add %i1, %o2, %i1 ! increment the source by src offset
7151 ! the src offset was stored in %o2
7152
7153
7154 ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
7155 .ci_blkcpy:
7156
7157 andn %i1, 0x3f, %o0 ! %o0 has block aligned source
7158 prefetcha [%o0]ASI_USER, #one_read
7159 add %o0, 0x40, %o0
7160 1:
7161 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l0
7162 add %i1, 0x10, %i1
7163 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7164 add %i1, 0x10, %i1
7165
7166 prefetcha [%o0]ASI_USER, #one_read
7167
7168 stxa %l0, [%i0+0x0]%asi
7169
7170 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7171 add %i1, 0x10, %i1
7172 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l6
7173 add %i1, 0x10, %i1
7174
7175 stxa %l1, [%i0+0x8]%asi
7176 stxa %l2, [%i0+0x10]%asi
7177 stxa %l3, [%i0+0x18]%asi
7178 stxa %l4, [%i0+0x20]%asi
7179 stxa %l5, [%i0+0x28]%asi
7180 stxa %l6, [%i0+0x30]%asi
7181 stxa %l7, [%i0+0x38]%asi
7182
7183 add %o0, 0x40, %o0
7184 subcc %i3, 0x40, %i3
7185 bgu,pt %xcc, 1b
7186 add %i0, 0x40, %i0
7187
7188 .ci_blkdone:
7189 membar #Sync
7190
7191 brz,pt %i2, .copyin_exit
7192 nop
7193
7194 ! Handle trailing bytes
7195 cmp %i2, 0x8
7196 blu,pt %ncc, .ci_residue
7197 nop
7198
7199 ! Can we do some 8B ops
7200 or %i1, %i0, %o2
7201 andcc %o2, 0x7, %g0
7202 bnz %ncc, .ci_last4
7203 nop
7204
7205 ! Do 8byte ops as long as possible
7206 .ci_last8:
7207 ldxa [%i1]ASI_USER, %o2
7208 stx %o2, [%i0]
7209 add %i1, 0x8, %i1
7210 sub %i2, 0x8, %i2
7211 cmp %i2, 0x8
7212 bgu,pt %ncc, .ci_last8
7213 add %i0, 0x8, %i0
7214
7215 brz,pt %i2, .copyin_exit
7216 nop
7217
7218 ba .ci_residue
7219 nop
7220
7221 .ci_last4:
7222 ! Can we do 4B ops
7223 andcc %o2, 0x3, %g0
7224 bnz %ncc, .ci_last2
7225 nop
7226 1:
7227 lda [%i1]ASI_USER, %o2
7228 st %o2, [%i0]
7229 add %i1, 0x4, %i1
7230 sub %i2, 0x4, %i2
7231 cmp %i2, 0x4
7232 bgu,pt %ncc, 1b
7233 add %i0, 0x4, %i0
7234
7235 brz,pt %i2, .copyin_exit
7236 nop
7237
7238 ba .ci_residue
7239 nop
7240
7241 .ci_last2:
7242 ! Can we do 2B ops
7243 andcc %o2, 0x1, %g0
7244 bnz %ncc, .ci_residue
7245 nop
7246
7247 1:
7248 lduha [%i1]ASI_USER, %o2
7249 stuh %o2, [%i0]
7250 add %i1, 0x2, %i1
7251 sub %i2, 0x2, %i2
7252 cmp %i2, 0x2
7253 bgu,pt %ncc, 1b
7254 add %i0, 0x2, %i0
7255
7256 brz,pt %i2, .copyin_exit
7257 nop
7258
7259 ! Copy the residue as byte copy
7260 .ci_residue:
7261 lduba [%i1]ASI_USER, %i4
7262 stb %i4, [%i0]
7263 inc %i1
7264 deccc %i2
7265 bgu,pt %xcc, .ci_residue
7266 inc %i0
7267
7268 .copyin_exit:
7269 membar #Sync
7270 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
7271 ret
7272 restore %g0, 0, %o0
7273 .copyin_err:
7274 ldn [THREAD_REG + T_COPYOPS], %o4
7275 brz %o4, 2f
7276 nop
7277 ldn [%o4 + CP_COPYIN], %g2
7278 jmp %g2
7279 nop
7280 2:
7281 retl
7282 mov -1, %o0
7283 #endif /* NIAGARA_IMPL */
7284 SET_SIZE(copyin)
7285
7286 #endif /* lint */
7287
7288 #ifdef lint
7289
7290 /*ARGSUSED*/
7291 int
7292 xcopyin(const void *uaddr, void *kaddr, size_t count)
7293 { return (0); }
7294
7295 #else /* lint */
7296
7297 ENTRY(xcopyin)
7298 sethi %hi(.xcopyin_err), REAL_LOFAULT
7299 b .do_copyin
7300 or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
7301 .xcopyin_err:
7302 ldn [THREAD_REG + T_COPYOPS], %o4
7303 brz %o4, 2f
7304 nop
7305 ldn [%o4 + CP_XCOPYIN], %g2
7306 jmp %g2
7307 nop
7308 2:
7309 retl
7310 mov %g1, %o0
7311 SET_SIZE(xcopyin)
7312
7313 #endif /* lint */
7314
7315 #ifdef lint
7316
7317 /*ARGSUSED*/
7318 int
7319 xcopyin_little(const void *uaddr, void *kaddr, size_t count)
7320 { return (0); }
7321
7322 #else /* lint */
7323
7324 ENTRY(xcopyin_little)
7325 sethi %hi(.little_err), %o4
7326 ldn [THREAD_REG + T_LOFAULT], %o5
7327 or %o4, %lo(.little_err), %o4
7328 membar #Sync ! sync error barrier
7329 stn %o4, [THREAD_REG + T_LOFAULT]
7330
7331 subcc %g0, %o2, %o3
7332 add %o0, %o2, %o0
7333 bz,pn %ncc, 2f ! check for zero bytes
7334 sub %o2, 1, %o4
7335 add %o0, %o4, %o0 ! start w/last byte
7336 add %o1, %o2, %o1
7337 lduba [%o0+%o3]ASI_AIUSL, %o4
7338
7339 1: stb %o4, [%o1+%o3]
7340 inccc %o3
7341 sub %o0, 2, %o0 ! get next byte
7342 bcc,a,pt %ncc, 1b
7343 lduba [%o0+%o3]ASI_AIUSL, %o4
7344
7345 2: membar #Sync ! sync error barrier
7346 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
7347 retl
7348 mov %g0, %o0 ! return (0)
7349
7350 .little_err:
7351 membar #Sync ! sync error barrier
7352 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
7353 retl
7354 mov %g1, %o0
7355 SET_SIZE(xcopyin_little)
7356
7357 #endif /* lint */
7358
7359
7360 /*
7361 * Copy a block of storage - must not overlap (from + len <= to).
7362 * No fault handler installed (to be called under on_fault())
7363 */
7364 #if defined(lint)
7365
7366 /* ARGSUSED */
7367 void
7368 copyin_noerr(const void *ufrom, void *kto, size_t count)
7369 {}
7370
7371 #else /* lint */
7372
7373 ENTRY(copyin_noerr)
7374 sethi %hi(.copyio_noerr), REAL_LOFAULT
7375 b .do_copyin
7376 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
7377 .copyio_noerr:
7378 jmp SAVED_LOFAULT
7379 nop
7380 SET_SIZE(copyin_noerr)
7381
7382 #endif /* lint */
7383
7384 /*
7385 * Copy a block of storage - must not overlap (from + len <= to).
7386 * No fault handler installed (to be called under on_fault())
7387 */
7388
7389 #if defined(lint)
7390
7391 /* ARGSUSED */
7392 void
7393 copyout_noerr(const void *kfrom, void *uto, size_t count)
7394 {}
7395
7396 #else /* lint */
7397
7398 ENTRY(copyout_noerr)
7399 sethi %hi(.copyio_noerr), REAL_LOFAULT
7400 b .do_copyout
7401 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
7402 SET_SIZE(copyout_noerr)
7403
7404 #endif /* lint */
7405
7406 #if defined(lint)
7407
7408 int use_hw_bcopy = 1;
7409 int use_hw_bzero = 1;
7410 uint_t hw_copy_limit_1 = 0x100;
7411 uint_t hw_copy_limit_2 = 0x200;
7412 uint_t hw_copy_limit_4 = 0x400;
7413 uint_t hw_copy_limit_8 = 0x400;
7414
7415 #else /* !lint */
7416
7417 .align 4
7418 DGDEF(use_hw_bcopy)
7419 .word 1
7420 DGDEF(use_hw_bzero)
7421 .word 1
7422 DGDEF(hw_copy_limit_1)
7423 .word 0x100
7424 DGDEF(hw_copy_limit_2)
7425 .word 0x200
7426 DGDEF(hw_copy_limit_4)
7427 .word 0x400
7428 DGDEF(hw_copy_limit_8)
7429 .word 0x400
7430
7431 .align 64
7432 .section ".text"
7433 #endif /* !lint */
7434
7435 /*
7436 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
7437 * longer than 256 bytes in length using Niagara's block stores/quad store.
7438 * If the criteria for using this routine are not met then it calls bzero
7439 * and returns 1. Otherwise 0 is returned indicating success.
7440 * Caller is responsible for ensuring use_hw_bzero is true and that
7441 * kpreempt_disable() has been called.
7442 */
7443 #ifdef lint
7444 /*ARGSUSED*/
7445 int
7446 hwblkclr(void *addr, size_t len)
7447 {
7448 return(0);
7449 }
7450 #else /* lint */
7451 ! %i0 - start address
7452 ! %i1 - length of region (multiple of 64)
7453
7454 ENTRY(hwblkclr)
7455 save %sp, -SA(MINFRAME), %sp
7456
7457 ! Must be block-aligned
7458 andcc %i0, 0x3f, %g0
7459 bnz,pn %ncc, 1f
7460 nop
7461
7462 ! ... and must be 256 bytes or more
7463 cmp %i1, 0x100
7464 blu,pn %ncc, 1f
7465 nop
7466
7467 ! ... and length must be a multiple of 64
7468 andcc %i1, 0x3f, %g0
7469 bz,pn %ncc, .pz_doblock
7470 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
7471
7472 1: ! punt, call bzero but notify the caller that bzero was used
7473 mov %i0, %o0
7474 call bzero
7475 mov %i1, %o1
7476 ret
7477 restore %g0, 1, %o0 ! return (1) - did not use block operations
7478
7479 ! Already verified that there are at least 256 bytes to set
7480 .pz_doblock:
7481 stxa %g0, [%i0+0x0]%asi
7482 stxa %g0, [%i0+0x40]%asi
7483 stxa %g0, [%i0+0x80]%asi
7484 stxa %g0, [%i0+0xc0]%asi
7485
7486 stxa %g0, [%i0+0x8]%asi
7487 stxa %g0, [%i0+0x10]%asi
7488 stxa %g0, [%i0+0x18]%asi
7489 stxa %g0, [%i0+0x20]%asi
7490 stxa %g0, [%i0+0x28]%asi
7491 stxa %g0, [%i0+0x30]%asi
7492 stxa %g0, [%i0+0x38]%asi
7493
7494 stxa %g0, [%i0+0x48]%asi
7495 stxa %g0, [%i0+0x50]%asi
7496 stxa %g0, [%i0+0x58]%asi
7497 stxa %g0, [%i0+0x60]%asi
7498 stxa %g0, [%i0+0x68]%asi
7499 stxa %g0, [%i0+0x70]%asi
7500 stxa %g0, [%i0+0x78]%asi
7501
7502 stxa %g0, [%i0+0x88]%asi
7503 stxa %g0, [%i0+0x90]%asi
7504 stxa %g0, [%i0+0x98]%asi
7505 stxa %g0, [%i0+0xa0]%asi
7506 stxa %g0, [%i0+0xa8]%asi
7507 stxa %g0, [%i0+0xb0]%asi
7508 stxa %g0, [%i0+0xb8]%asi
7509
7510 stxa %g0, [%i0+0xc8]%asi
7511 stxa %g0, [%i0+0xd0]%asi
7512 stxa %g0, [%i0+0xd8]%asi
7513 stxa %g0, [%i0+0xe0]%asi
7514 stxa %g0, [%i0+0xe8]%asi
7515 stxa %g0, [%i0+0xf0]%asi
7516 stxa %g0, [%i0+0xf8]%asi
7517
7518 sub %i1, 0x100, %i1
7519 cmp %i1, 0x100
7520 bgu,pt %ncc, .pz_doblock
7521 add %i0, 0x100, %i0
7522
7523 2:
7524 ! Check if more than 64 bytes to set
7525 cmp %i1,0x40
7526 blu %ncc, .pz_finish
7527 nop
7528
7529 3:
7530 stxa %g0, [%i0+0x0]%asi
7531 stxa %g0, [%i0+0x8]%asi
7532 stxa %g0, [%i0+0x10]%asi
7533 stxa %g0, [%i0+0x18]%asi
7534 stxa %g0, [%i0+0x20]%asi
7535 stxa %g0, [%i0+0x28]%asi
7536 stxa %g0, [%i0+0x30]%asi
7537 stxa %g0, [%i0+0x38]%asi
7538
7539 subcc %i1, 0x40, %i1
7540 bgu,pt %ncc, 3b
7541 add %i0, 0x40, %i0
7542
7543 .pz_finish:
7544 membar #Sync
7545 ret
7546 restore %g0, 0, %o0 ! return (bzero or not)
7547 SET_SIZE(hwblkclr)
7548 #endif /* lint */
7549
7550 #ifdef lint
7551 /* Copy 32 bytes of data from src to dst using physical addresses */
7552 /*ARGSUSED*/
7553 void
7554 hw_pa_bcopy32(uint64_t src, uint64_t dst)
7555 {}
7556 #else /*!lint */
7557
7558 /*
7559 * Copy 32 bytes of data from src (%o0) to dst (%o1)
7560 * using physical addresses.
7561 */
7562 ENTRY_NP(hw_pa_bcopy32)
7563 rdpr %pstate, %g1
7564 andn %g1, PSTATE_IE, %g2
7565 wrpr %g0, %g2, %pstate
7566
7567 ldxa [%o0]ASI_MEM, %o2
7568 add %o0, 8, %o0
7569 ldxa [%o0]ASI_MEM, %o3
7570 add %o0, 8, %o0
7571 ldxa [%o0]ASI_MEM, %o4
7572 add %o0, 8, %o0
7573 ldxa [%o0]ASI_MEM, %o5
7574 stxa %o2, [%o1]ASI_MEM
7575 add %o1, 8, %o1
7576 stxa %o3, [%o1]ASI_MEM
7577 add %o1, 8, %o1
7578 stxa %o4, [%o1]ASI_MEM
7579 add %o1, 8, %o1
7580 stxa %o5, [%o1]ASI_MEM
7581
7582 membar #Sync
7583 retl
7584 wrpr %g0, %g1, %pstate
7585 SET_SIZE(hw_pa_bcopy32)
7586 #endif /* lint */
7587
7588 /*
7589 * Zero a block of storage.
7590 *
7591 * uzero is used by the kernel to zero a block in user address space.
7592 */
7593
7594 /*
7595 * Control flow of the bzero/kzero/uzero routine.
7596 *
7597 * For fewer than 7 bytes stores, bytes will be zeroed.
7598 *
7599 * For less than 15 bytes stores, align the address on 4 byte boundary.
7600 * Then store as many 4-byte chunks, followed by trailing bytes.
7601 *
7602 * For sizes greater than 15 bytes, align the address on 8 byte boundary.
7603 * if (count > 128) {
7604 * store as many 8-bytes chunks to block align the address
7605 * store using ASI_BLK_INIT_ST_QUAD_LDD_P (bzero/kzero) OR
7606 * store using ASI_BLK_INIT_QUAD_LDD_AIUS (uzero)
7607 * }
7608 * Store as many 8-byte chunks, followed by trailing bytes.
7609 */
7610
7611 #if defined(lint)
7612
7613 /* ARGSUSED */
7614 int
7615 kzero(void *addr, size_t count)
7616 { return(0); }
7617
7618 /* ARGSUSED */
7619 void
7620 uzero(void *addr, size_t count)
7621 {}
7622
7623 #else /* lint */
7624
7625 ENTRY(uzero)
7626 !
7627 ! Set a new lo_fault handler only if we came in with one
7628 ! already specified.
7629 !
7630 wr %g0, ASI_USER, %asi
7631 ldn [THREAD_REG + T_LOFAULT], %o5
7632 tst %o5
7633 bz,pt %ncc, .do_zero
7634 sethi %hi(.zeroerr), %o2
7635 or %o2, %lo(.zeroerr), %o2
7636 membar #Sync
7637 ba,pt %ncc, .do_zero
7638 stn %o2, [THREAD_REG + T_LOFAULT]
7639
7640 ENTRY(kzero)
7641 !
7642 ! Always set a lo_fault handler
7643 !
7644 wr %g0, ASI_P, %asi
7645 ldn [THREAD_REG + T_LOFAULT], %o5
7646 sethi %hi(.zeroerr), %o2
7647 or %o5, LOFAULT_SET, %o5
7648 or %o2, %lo(.zeroerr), %o2
7649 membar #Sync
7650 ba,pt %ncc, .do_zero
7651 stn %o2, [THREAD_REG + T_LOFAULT]
7652
7653 /*
7654 * We got here because of a fault during kzero or if
7655 * uzero or bzero was called with t_lofault non-zero.
7656 * Otherwise we've already run screaming from the room.
7657 * Errno value is in %g1. Note that we're here iff
7658 * we did set t_lofault.
7659 */
7660 .zeroerr:
7661 !
7662 ! Undo asi register setting. Just set it to be the
7663 ! kernel default without checking.
7664 !
7665 wr %g0, ASI_P, %asi
7666
7667 !
7668 ! We did set t_lofault. It may well have been zero coming in.
7669 !
7670 1:
7671 tst %o5
7672 membar #Sync
7673 bne,pn %ncc, 3f
7674 andncc %o5, LOFAULT_SET, %o5
7675 2:
7676 !
7677 ! Old handler was zero. Just return the error.
7678 !
7679 retl ! return
7680 mov %g1, %o0 ! error code from %g1
7681 3:
7682 !
7683 ! We're here because %o5 was non-zero. It was non-zero
7684 ! because either LOFAULT_SET was present, a previous fault
7685 ! handler was present or both. In all cases we need to reset
7686 ! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET
7687 ! before we either simply return the error or we invoke the
7688 ! previously specified handler.
7689 !
7690 be %ncc, 2b
7691 stn %o5, [THREAD_REG + T_LOFAULT]
7692 jmp %o5 ! goto real handler
7693 nop
7694 SET_SIZE(kzero)
7695 SET_SIZE(uzero)
7696
7697 #endif /* lint */
7698
7699 /*
7700 * Zero a block of storage.
7701 */
7702
7703 #if defined(lint)
7704
7705 /* ARGSUSED */
7706 void
7707 bzero(void *addr, size_t count)
7708 {}
7709
7710 #else /* lint */
7711
7712 ENTRY(bzero)
7713 wr %g0, ASI_P, %asi
7714
7715 ldn [THREAD_REG + T_LOFAULT], %o5 ! save old vector
7716 tst %o5
7717 bz,pt %ncc, .do_zero
7718 sethi %hi(.zeroerr), %o2
7719 or %o2, %lo(.zeroerr), %o2
7720 membar #Sync ! sync error barrier
7721 stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector
7722
7723 .do_zero:
7724 cmp %o1, 7
7725 blu,pn %ncc, .byteclr
7726 nop
7727
7728 cmp %o1, 15
7729 blu,pn %ncc, .wdalign
7730 nop
7731
7732 andcc %o0, 7, %o3 ! is add aligned on a 8 byte bound
7733 bz,pt %ncc, .blkalign ! already double aligned
7734 sub %o3, 8, %o3 ! -(bytes till double aligned)
7735 add %o1, %o3, %o1 ! update o1 with new count
7736
7737 1:
7738 stba %g0, [%o0]%asi
7739 inccc %o3
7740 bl,pt %ncc, 1b
7741 inc %o0
7742
7743 ! Now address is double aligned
7744 .blkalign:
7745 cmp %o1, 0x80 ! check if there are 128 bytes to set
7746 blu,pn %ncc, .bzero_small
7747 mov %o1, %o3
7748
7749 sethi %hi(use_hw_bzero), %o2
7750 ld [%o2 + %lo(use_hw_bzero)], %o2
7751 tst %o2
7752 bz %ncc, .bzero_small
7753 mov %o1, %o3
7754
7755 rd %asi, %o3
7756 wr %g0, ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
7757 cmp %o3, ASI_P
7758 bne,a %ncc, .algnblk
7759 wr %g0, ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
7760
7761 .algnblk:
7762 andcc %o0, 0x3f, %o3 ! is block aligned?
7763 bz,pt %ncc, .bzero_blk
7764 sub %o3, 0x40, %o3 ! -(bytes till block aligned)
7765 add %o1, %o3, %o1 ! o1 is the remainder
7766
7767 ! Clear -(%o3) bytes till block aligned
7768 1:
7769 stxa %g0, [%o0]%asi
7770 addcc %o3, 8, %o3
7771 bl,pt %ncc, 1b
7772 add %o0, 8, %o0
7773
7774 .bzero_blk:
7775 and %o1, 0x3f, %o3 ! calc bytes left after blk clear
7776 andn %o1, 0x3f, %o4 ! calc size of blocks in bytes
7777
7778 cmp %o4, 0x100 ! 256 bytes or more
7779 blu,pn %ncc, 3f
7780 nop
7781
7782 2:
7783 stxa %g0, [%o0+0x0]%asi
7784 stxa %g0, [%o0+0x40]%asi
7785 stxa %g0, [%o0+0x80]%asi
7786 stxa %g0, [%o0+0xc0]%asi
7787
7788 stxa %g0, [%o0+0x8]%asi
7789 stxa %g0, [%o0+0x10]%asi
7790 stxa %g0, [%o0+0x18]%asi
7791 stxa %g0, [%o0+0x20]%asi
7792 stxa %g0, [%o0+0x28]%asi
7793 stxa %g0, [%o0+0x30]%asi
7794 stxa %g0, [%o0+0x38]%asi
7795
7796 stxa %g0, [%o0+0x48]%asi
7797 stxa %g0, [%o0+0x50]%asi
7798 stxa %g0, [%o0+0x58]%asi
7799 stxa %g0, [%o0+0x60]%asi
7800 stxa %g0, [%o0+0x68]%asi
7801 stxa %g0, [%o0+0x70]%asi
7802 stxa %g0, [%o0+0x78]%asi
7803
7804 stxa %g0, [%o0+0x88]%asi
7805 stxa %g0, [%o0+0x90]%asi
7806 stxa %g0, [%o0+0x98]%asi
7807 stxa %g0, [%o0+0xa0]%asi
7808 stxa %g0, [%o0+0xa8]%asi
7809 stxa %g0, [%o0+0xb0]%asi
7810 stxa %g0, [%o0+0xb8]%asi
7811
7812 stxa %g0, [%o0+0xc8]%asi
7813 stxa %g0, [%o0+0xd0]%asi
7814 stxa %g0, [%o0+0xd8]%asi
7815 stxa %g0, [%o0+0xe0]%asi
7816 stxa %g0, [%o0+0xe8]%asi
7817 stxa %g0, [%o0+0xf0]%asi
7818 stxa %g0, [%o0+0xf8]%asi
7819
7820 sub %o4, 0x100, %o4
7821 cmp %o4, 0x100
7822 bgu,pt %ncc, 2b
7823 add %o0, 0x100, %o0
7824
7825 3:
7826 ! ... check if 64 bytes to set
7827 cmp %o4, 0x40
7828 blu %ncc, .bzero_blk_done
7829 nop
7830
7831 4:
7832 stxa %g0, [%o0+0x0]%asi
7833 stxa %g0, [%o0+0x8]%asi
7834 stxa %g0, [%o0+0x10]%asi
7835 stxa %g0, [%o0+0x18]%asi
7836 stxa %g0, [%o0+0x20]%asi
7837 stxa %g0, [%o0+0x28]%asi
7838 stxa %g0, [%o0+0x30]%asi
7839 stxa %g0, [%o0+0x38]%asi
7840
7841 subcc %o4, 0x40, %o4
7842 bgu,pt %ncc, 3b
7843 add %o0, 0x40, %o0
7844
7845 .bzero_blk_done:
7846 membar #Sync
7847 !
7848 ! Undo asi register setting.
7849 !
7850 rd %asi, %o4
7851 wr %g0, ASI_P, %asi
7852 cmp %o4, ASI_BLK_INIT_ST_QUAD_LDD_P
7853 bne,a %ncc, .bzero_small
7854 wr %g0, ASI_USER, %asi
7855
7856 .bzero_small:
7857 ! Set the remaining doubles
7858 subcc %o3, 8, %o3 ! Can we store any doubles?
7859 blu,pn %ncc, .byteclr
7860 and %o1, 7, %o1 ! calc bytes left after doubles
7861
7862 .dbclr:
7863 stxa %g0, [%o0]%asi ! Clear the doubles
7864 subcc %o3, 8, %o3
7865 bgeu,pt %ncc, .dbclr
7866 add %o0, 8, %o0
7867
7868 ba .byteclr
7869 nop
7870
7871 .wdalign:
7872 andcc %o0, 3, %o3 ! is add aligned on a word boundary
7873 bz,pn %ncc, .wdclr
7874 andn %o1, 3, %o3 ! create word sized count in %o3
7875
7876 dec %o1 ! decrement count
7877 stba %g0, [%o0]%asi ! clear a byte
7878 ba .wdalign
7879 inc %o0 ! next byte
7880
7881 .wdclr:
7882 sta %g0, [%o0]%asi ! 4-byte clearing loop
7883 subcc %o3, 4, %o3
7884 bnz,pt %ncc, .wdclr
7885 inc 4, %o0
7886
7887 and %o1, 3, %o1 ! leftover count, if any
7888
7889 .byteclr:
7890 ! Set the leftover bytes
7891 brz %o1, .bzero_exit
7892 nop
7893
7894 7:
7895 deccc %o1 ! byte clearing loop
7896 stba %g0, [%o0]%asi
7897 bgu,pt %ncc, 7b
7898 inc %o0
7899
7900 .bzero_exit:
7901 !
7902 ! We're just concerned with whether t_lofault was set
7903 ! when we came in. We end up here from either kzero()
7904 ! or bzero(). kzero() *always* sets a lofault handler.
7905 ! It ors LOFAULT_SET into %o5 to indicate it has done
7906 ! this even if the value of %o5 is otherwise zero.
7907 ! bzero() sets a lofault handler *only* if one was
7908 ! previously set. Accordingly we need to examine
7909 ! %o5 and if it is non-zero be sure to clear LOFAULT_SET
7910 ! before resetting the error handler.
7911 !
7912 tst %o5
7913 bz %ncc, 1f
7914 andn %o5, LOFAULT_SET, %o5
7915 membar #Sync ! sync error barrier
7916 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
7917 1:
7918 retl
7919 clr %o0 ! return (0)
7920
7921 SET_SIZE(bzero)
7922 #endif /* lint */