1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #include <sys/param.h>
27 #include <sys/errno.h>
28 #include <sys/asm_linkage.h>
29 #include <sys/vtrace.h>
30 #include <sys/machthread.h>
31 #include <sys/clock.h>
32 #include <sys/asi.h>
33 #include <sys/fsr.h>
34 #include <sys/privregs.h>
35
36 #include "assym.h"
37
38 /*
39 * Pseudo-code to aid in understanding the control flow of the
40 * bcopy/copyin/copyout routines.
41 *
42 * On entry:
43 *
44 * ! Determine whether to use the FP register version
45 * ! or the leaf routine version depending on size
46 * ! of copy and flags. Set up error handling accordingly.
47 * ! The transition point depends on whether the src and
48 * ! dst addresses can be aligned to long word, word,
49 * ! half word, or byte boundaries.
50 * !
51 * ! WARNING: <Register usage convention>
52 * ! For FP version, %l6 holds previous error handling and
53 * ! a flag: TRAMP_FLAG (low bits)
54 * ! for leaf routine version, %o4 holds those values.
55 * ! So either %l6 or %o4 is reserved and not available for
56 * ! any other use.
57 *
58 * if (length <= VIS_COPY_THRESHOLD) ! start with a quick test
59 * go to small_copy; ! to speed short copies
60 *
61 * ! src, dst long word alignable
62 * if (hw_copy_limit_8 == 0) ! hw_copy disabled
63 * go to small_copy;
64 * if (length <= hw_copy_limit_8)
65 * go to small_copy;
66 * go to FPBLK_copy;
67 * }
68 * if (src,dst not alignable) {
69 * if (hw_copy_limit_1 == 0) ! hw_copy disabled
70 * go to small_copy;
71 * if (length <= hw_copy_limit_1)
72 * go to small_copy;
73 * go to FPBLK_copy;
74 * }
75 * if (src,dst halfword alignable) {
76 * if (hw_copy_limit_2 == 0) ! hw_copy disabled
77 * go to small_copy;
78 * if (length <= hw_copy_limit_2)
79 * go to small_copy;
80 * go to FPBLK_copy;
81 * }
82 * if (src,dst word alignable) {
83 * if (hw_copy_limit_4 == 0) ! hw_copy disabled
84 * go to small_copy;
85 * if (length <= hw_copy_limit_4)
86 * go to small_copy;
87 * go to FPBLK_copy;
88 * }
89 *
90 * small_copy:
91 * Setup_leaf_rtn_error_handler; ! diffs for each entry point
92 *
93 * if (count <= 3) ! fast path for tiny copies
94 * go to sm_left; ! special finish up code
95 * else
96 * if (count > CHKSIZE) ! medium sized copies
97 * go to sm_med ! tuned by alignment
98 * if(src&dst not both word aligned) {
99 * sm_movebytes:
100 * move byte by byte in 4-way unrolled loop
101 * fall into sm_left;
102 * sm_left:
103 * move 0-3 bytes byte at a time as needed.
104 * restore error handler and exit.
105 *
106 * } else { ! src&dst are word aligned
107 * check for at least 8 bytes left,
108 * move word at a time, unrolled by 2
109 * when fewer than 8 bytes left,
110 * sm_half: move half word at a time while 2 or more bytes left
111 * sm_byte: move final byte if necessary
112 * sm_exit:
113 * restore error handler and exit.
114 * }
115 *
116 * ! Medium length cases with at least CHKSIZE bytes available
117 * ! method: line up src and dst as best possible, then
118 * ! move data in 4-way unrolled loops.
119 *
120 * sm_med:
121 * if(src&dst unalignable)
122 * go to sm_movebytes
123 * if(src&dst halfword alignable)
124 * go to sm_movehalf
125 * if(src&dst word alignable)
126 * go to sm_moveword
127 * ! fall into long word movement
128 * move bytes until src is word aligned
129 * if not long word aligned, move a word
130 * move long words in 4-way unrolled loop until < 32 bytes left
131 * move long words in 1-way unrolled loop until < 8 bytes left
132 * if zero bytes left, goto sm_exit
133 * if one byte left, go to sm_byte
134 * else go to sm_half
135 *
136 * sm_moveword:
137 * move bytes until src is word aligned
138 * move words in 4-way unrolled loop until < 16 bytes left
139 * move words in 1-way unrolled loop until < 4 bytes left
140 * if zero bytes left, goto sm_exit
141 * if one byte left, go to sm_byte
142 * else go to sm_half
143 *
144 * sm_movehalf:
145 * move a byte if needed to align src on halfword
146 * move halfwords in 4-way unrolled loop until < 8 bytes left
147 * if zero bytes left, goto sm_exit
148 * if one byte left, go to sm_byte
149 * else go to sm_half
150 *
151 *
152 * FPBLK_copy:
153 * %l6 = curthread->t_lofault;
154 * if (%l6 != NULL) {
155 * membar #Sync
156 * curthread->t_lofault = .copyerr;
157 * caller_error_handler = TRUE ! %l6 |= 2
158 * }
159 *
160 * ! for FPU testing we must not migrate cpus
161 * if (curthread->t_lwp == NULL) {
162 * ! Kernel threads do not have pcb's in which to store
163 * ! the floating point state, so disallow preemption during
164 * ! the copy. This also prevents cpu migration.
165 * kpreempt_disable(curthread);
166 * } else {
167 * thread_nomigrate();
168 * }
169 *
170 * old_fprs = %fprs;
171 * old_gsr = %gsr;
172 * if (%fprs.fef) {
173 * %fprs.fef = 1;
174 * save current fpregs on stack using blockstore
175 * } else {
176 * %fprs.fef = 1;
177 * }
178 *
179 *
180 * do_blockcopy_here;
181 *
182 * In lofault handler:
183 * curthread->t_lofault = .copyerr2;
184 * Continue on with the normal exit handler
185 *
186 * On normal exit:
187 * %gsr = old_gsr;
188 * if (old_fprs & FPRS_FEF)
189 * restore fpregs from stack using blockload
190 * else
191 * zero fpregs
192 * %fprs = old_fprs;
193 * membar #Sync
194 * curthread->t_lofault = (%l6 & ~3);
195 * ! following test omitted from copyin/copyout as they
196 * ! will always have a current thread
197 * if (curthread->t_lwp == NULL)
198 * kpreempt_enable(curthread);
199 * else
200 * thread_allowmigrate();
201 * return (0)
202 *
203 * In second lofault handler (.copyerr2):
204 * We've tried to restore fp state from the stack and failed. To
205 * prevent from returning with a corrupted fp state, we will panic.
206 */
207
208 /*
209 * Comments about optimization choices
210 *
211 * The initial optimization decision in this code is to determine
212 * whether to use the FP registers for a copy or not. If we don't
213 * use the FP registers, we can execute the copy as a leaf routine,
214 * saving a register save and restore. Also, less elaborate setup
215 * is required, allowing short copies to be completed more quickly.
216 * For longer copies, especially unaligned ones (where the src and
217 * dst do not align to allow simple ldx,stx operation), the FP
218 * registers allow much faster copy operations.
219 *
220 * The estimated extra cost of the FP path will vary depending on
221 * src/dst alignment, dst offset from the next 64 byte FPblock store
222 * boundary, remaining src data after the last full dst cache line is
223 * moved whether the FP registers need to be saved, and some other
224 * minor issues. The average additional overhead is estimated to be
225 * 400 clocks. Since each non-repeated/predicted tst and branch costs
226 * around 10 clocks, elaborate calculation would slow down to all
227 * longer copies and only benefit a small portion of medium sized
228 * copies. Rather than incur such cost, we chose fixed transition
229 * points for each of the alignment choices.
230 *
231 * For the inner loop, here is a comparison of the per cache line
232 * costs for each alignment when src&dst are in cache:
233 *
234 * byte aligned: 108 clocks slower for non-FPBLK
235 * half aligned: 44 clocks slower for non-FPBLK
236 * word aligned: 12 clocks slower for non-FPBLK
237 * long aligned: 4 clocks >>faster<< for non-FPBLK
238 *
239 * The long aligned loop runs faster because it does no prefetching.
240 * That wins if the data is not in cache or there is too little
241 * data to gain much benefit from prefetching. But when there
242 * is more data and that data is not in cache, failing to prefetch
243 * can run much slower. In addition, there is a 2 Kbyte store queue
244 * which will cause the non-FPBLK inner loop to slow for larger copies.
245 * The exact tradeoff is strongly load and application dependent, with
246 * increasing risk of a customer visible performance regression if the
247 * non-FPBLK code is used for larger copies. Studies of synthetic in-cache
248 * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
249 * upper limit for the non-FPBLK code. To minimize performance regression
250 * risk while still gaining the primary benefits of the improvements to
251 * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
252 * hw_copy_limit_*. Later experimental studies using different values
253 * of hw_copy_limit_* can be used to make further adjustments if
254 * appropriate.
255 *
256 * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
257 * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
258 * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
259 * hw_copy_limit_8 = src and dst are longword aligned
260 *
261 * To say that src and dst are word aligned means that after
262 * some initial alignment activity of moving 0 to 3 bytes,
263 * both the src and dst will be on word boundaries so that
264 * word loads and stores may be used.
265 *
266 * Default values at May,2005 are:
267 * hw_copy_limit_1 = 256
268 * hw_copy_limit_2 = 512
269 * hw_copy_limit_4 = 1024
270 * hw_copy_limit_8 = 1024 (or 1536 on some systems)
271 *
272 *
273 * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
274 * disabled for that alignment choice.
275 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256)
276 * the value of VIS_COPY_THRESHOLD is used.
277 * It is not envisioned that hw_copy_limit_? will be changed in the field
278 * It is provided to allow for disabling FPBLK copies and to allow
279 * easy testing of alternate values on future HW implementations
280 * that might have different cache sizes, clock rates or instruction
281 * timing rules.
282 *
283 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
284 * threshold to speedup all shorter copies (less than 256). That
285 * saves an alignment test, memory reference, and enabling test
286 * for all short copies, or an estimated 24 clocks.
287 *
288 * The order in which these limits are checked does matter since each
289 * non-predicted tst and branch costs around 10 clocks.
290 * If src and dst are randomly selected addresses,
291 * 4 of 8 will not be alignable.
292 * 2 of 8 will be half word alignable.
293 * 1 of 8 will be word alignable.
294 * 1 of 8 will be long word alignable.
295 * But, tests on running kernels show that src and dst to copy code
296 * are typically not on random alignments. Structure copies and
297 * copies of larger data sizes are often on long word boundaries.
298 * So we test the long word alignment case first, then
299 * the byte alignment, then halfword, then word alignment.
300 *
301 * Several times, tests for length are made to split the code
302 * into subcases. These tests often allow later tests to be
303 * avoided. For example, within the non-FPBLK copy, we first
304 * check for tiny copies of 3 bytes or less. That allows us
305 * to use a 4-way unrolled loop for the general byte copy case
306 * without a test on loop entry.
307 * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
308 * vs longer cases. For the really short case, we don't attempt
309 * align src and dst. We try to minimize special case tests in
310 * the shortest loops as each test adds a significant percentage
311 * to the total time.
312 *
313 * For the medium sized cases, we allow ourselves to adjust the
314 * src and dst alignment and provide special cases for each of
315 * the four adjusted alignment cases. The CHKSIZE that was used
316 * to decide between short and medium size was chosen to be 39
317 * as that allows for the worst case of 7 bytes of alignment
318 * shift and 4 times 8 bytes for the first long word unrolling.
319 * That knowledge saves an initial test for length on entry into
320 * the medium cases. If the general loop unrolling factor were
321 * to be increases, this number would also need to be adjusted.
322 *
323 * For all cases in the non-FPBLK code where it is known that at
324 * least 4 chunks of data are available for movement, the
325 * loop is unrolled by four. This 4-way loop runs in 8 clocks
326 * or 2 clocks per data element.
327 *
328 * Instruction alignment is forced by used of .align 16 directives
329 * and nops which are not executed in the code. This
330 * combination of operations shifts the alignment of following
331 * loops to insure that loops are aligned so that their instructions
332 * fall within the minimum number of 4 instruction fetch groups.
333 * If instructions are inserted or removed between the .align
334 * instruction and the unrolled loops, then the alignment needs
335 * to be readjusted. Misaligned loops can add a clock per loop
336 * iteration to the loop timing.
337 *
338 * In a few cases, code is duplicated to avoid a branch. Since
339 * a non-predicted tst and branch takes 10 clocks, this savings
340 * is judged an appropriate time-space tradeoff.
341 *
342 * Within the FPBLK-code, the prefetch method in the inner
343 * loop needs to be explained as it is not standard. Two
344 * prefetches are issued for each cache line instead of one.
345 * The primary one is at the maximum reach of 8 cache lines.
346 * Most of the time, that maximum prefetch reach gives the
347 * cache line more time to reach the processor for systems with
348 * higher processor clocks. But, sometimes memory interference
349 * can cause that prefetch to be dropped. Putting a second
350 * prefetch at a reach of 5 cache lines catches the drops
351 * three iterations later and shows a measured improvement
352 * in performance over any similar loop with a single prefetch.
353 * The prefetches are placed in the loop so they overlap with
354 * non-memory instructions, so that there is no extra cost
355 * when the data is already in-cache.
356 *
357 */
358
359 /*
360 * Notes on preserving existing fp state and on membars.
361 *
362 * When a copyOP decides to use fp we may have to preserve existing
363 * floating point state. It is not the caller's state that we need to
364 * preserve - the rest of the kernel does not use fp and, anyway, fp
365 * registers are volatile across a call. Some examples:
366 *
367 * - userland has fp state and is interrupted (device interrupt
368 * or trap) and within the interrupt/trap handling we use
369 * bcopy()
370 * - another (higher level) interrupt or trap handler uses bcopy
371 * while a bcopy from an earlier interrupt is still active
372 * - an asynchronous error trap occurs while fp state exists (in
373 * userland or in kernel copy) and the tl0 component of the handling
374 * uses bcopy
375 * - a user process with fp state incurs a copy-on-write fault and
376 * hwblkpagecopy always uses fp
377 *
378 * We therefore need a per-call place in which to preserve fp state -
379 * using our stack is ideal (and since fp copy cannot be leaf optimized
380 * because of calls it makes, this is no hardship).
381 *
382 * When we have finished fp copy (with it's repeated block stores)
383 * we must membar #Sync so that our block stores may complete before
384 * we either restore the original fp state into the fp registers or
385 * return to a caller which may initiate other fp operations that could
386 * modify the fp regs we used before the block stores complete.
387 *
388 * Synchronous faults (eg, unresolvable DMMU miss) that occur while
389 * t_lofault is not NULL will not panic but will instead trampoline
390 * to the registered lofault handler. There is no need for any
391 * membars for these - eg, our store to t_lofault will always be visible to
392 * ourselves and it is our cpu which will take any trap.
393 *
394 * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
395 * while t_lofault is not NULL will also not panic. Since we're copying
396 * to or from userland the extent of the damage is known - the destination
397 * buffer is incomplete. So trap handlers will trampoline to the lofault
398 * handler in this case which should take some form of error action to
399 * avoid using the incomplete buffer. The trap handler also flags the
400 * fault so that later return-from-trap handling (for the trap that brought
401 * this thread into the kernel in the first place) can notify the process
402 * and reboot the system (or restart the service with Greenline/Contracts).
403 *
404 * Asynchronous faults (eg, uncorrectable ECC error from memory) can
405 * result in deferred error traps - the trap is taken sometime after
406 * the event and the trap PC may not be the PC of the faulting access.
407 * Delivery of such pending traps can be forced by a membar #Sync, acting
408 * as an "error barrier" in this role. To accurately apply the user/kernel
409 * separation described in the preceding paragraph we must force delivery
410 * of deferred traps affecting kernel state before we install a lofault
411 * handler (if we interpose a new lofault handler on an existing one there
412 * is no need to repeat this), and we must force delivery of deferred
413 * errors affecting the lofault-protected region before we clear t_lofault.
414 * Failure to do so results in lost kernel state being interpreted as
415 * affecting a copyin/copyout only, or of an error that really only
416 * affects copy data being interpreted as losing kernel state.
417 *
418 * Since the copy operations may preserve and later restore floating
419 * point state that does not belong to the caller (see examples above),
420 * we must be careful in how we do this in order to prevent corruption
421 * of another program.
422 *
423 * To make sure that floating point state is always saved and restored
424 * correctly, the following "big rules" must be followed when the floating
425 * point registers will be used:
426 *
427 * 1. %l6 always holds the caller's lofault handler. Also in this register,
428 * Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
429 * use. Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a
430 * lofault handler was set coming in.
431 *
432 * 2. The FPUSED flag indicates that all FP state has been successfully stored
433 * on the stack. It should not be set until this save has been completed.
434 *
435 * 3. The FPUSED flag should not be cleared on exit until all FP state has
436 * been restored from the stack. If an error occurs while restoring
437 * data from the stack, the error handler can check this flag to see if
438 * a restore is necessary.
439 *
440 * 4. Code run under the new lofault handler must be kept to a minimum. In
441 * particular, any calls to FP_ALLOWMIGRATE, which could result in a call
442 * to kpreempt(), should not be made until after the lofault handler has
443 * been restored.
444 */
445
446 /*
447 * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
448 * to "break even" using FP/VIS-accelerated memory operations.
449 * The FPBLK code assumes a minimum number of bytes are available
450 * to be moved on entry. Check that code carefully before
451 * reducing VIS_COPY_THRESHOLD below 256.
452 */
453 /*
454 * This shadows sys/machsystm.h which can't be included due to the lack of
455 * _ASM guards in include files it references. Change it here, change it there.
456 */
457 #define VIS_COPY_THRESHOLD 256
458
459 /*
460 * TEST for very short copies
461 * Be aware that the maximum unroll for the short unaligned case
462 * is SHORTCOPY+1
463 */
464 #define SHORTCOPY 3
465 #define CHKSIZE 39
466
467 /*
468 * Indicates that we're to trampoline to the error handler.
469 * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
470 * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
471 */
472 #define FPUSED_FLAG 1
473 #define TRAMP_FLAG 2
474 #define MASK_FLAGS 3
475
476 /*
477 * Number of outstanding prefetches.
478 * first prefetch moves data from L2 to L1 (n_reads)
479 * second prefetch moves data from memory to L2 (one_read)
480 */
481 #define OLYMPUS_C_PREFETCH 24
482 #define OLYMPUS_C_2ND_PREFETCH 12
483
484 #define VIS_BLOCKSIZE 64
485
486 /*
487 * Size of stack frame in order to accomodate a 64-byte aligned
488 * floating-point register save area and 2 64-bit temp locations.
489 * All copy functions use two quadrants of fp registers; to assure a
490 * block-aligned two block buffer in which to save we must reserve
491 * three blocks on stack. Not all functions preserve %pfrs on stack
492 * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
493 *
494 * _______________________________________ <-- %fp + STACK_BIAS
495 * | We may need to preserve 2 quadrants |
496 * | of fp regs, but since we do so with |
497 * | BST/BLD we need room in which to |
498 * | align to VIS_BLOCKSIZE bytes. So |
499 * | this area is 3 * VIS_BLOCKSIZE. | <-- - SAVED_FPREGS_OFFSET
500 * |-------------------------------------|
501 * | 8 bytes to save %fprs | <-- - SAVED_FPRS_OFFSET
502 * |-------------------------------------|
503 * | 8 bytes to save %gsr | <-- - SAVED_GSR_OFFSET
504 * ---------------------------------------
505 */
506 #define HWCOPYFRAMESIZE ((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
507 #define SAVED_FPREGS_OFFSET (VIS_BLOCKSIZE * 3)
508 #define SAVED_FPREGS_ADJUST ((VIS_BLOCKSIZE * 2) - 1)
509 #define SAVED_FPRS_OFFSET (SAVED_FPREGS_OFFSET + 8)
510 #define SAVED_GSR_OFFSET (SAVED_FPRS_OFFSET + 8)
511
512 /*
513 * Common macros used by the various versions of the block copy
514 * routines in this file.
515 */
516
517 /*
518 * In FP copies if we do not have preserved data to restore over
519 * the fp regs we used then we must zero those regs to avoid
520 * exposing portions of the data to later threads (data security).
521 *
522 * Copy functions use either quadrants 1 and 3 or 2 and 4.
523 *
524 * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
525 * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
526 *
527 * The instructions below are quicker than repeated fzero instructions
528 * since they can dispatch down two fp pipelines.
529 */
530 #define FZEROQ1Q3 \
531 fzero %f0 ;\
532 fmovd %f0, %f2 ;\
533 fmovd %f0, %f4 ;\
534 fmovd %f0, %f6 ;\
535 fmovd %f0, %f8 ;\
536 fmovd %f0, %f10 ;\
537 fmovd %f0, %f12 ;\
538 fmovd %f0, %f14 ;\
539 fmovd %f0, %f32 ;\
540 fmovd %f0, %f34 ;\
541 fmovd %f0, %f36 ;\
542 fmovd %f0, %f38 ;\
543 fmovd %f0, %f40 ;\
544 fmovd %f0, %f42 ;\
545 fmovd %f0, %f44 ;\
546 fmovd %f0, %f46
547
548 #define FZEROQ2Q4 \
549 fzero %f16 ;\
550 fmovd %f0, %f18 ;\
551 fmovd %f0, %f20 ;\
552 fmovd %f0, %f22 ;\
553 fmovd %f0, %f24 ;\
554 fmovd %f0, %f26 ;\
555 fmovd %f0, %f28 ;\
556 fmovd %f0, %f30 ;\
557 fmovd %f0, %f48 ;\
558 fmovd %f0, %f50 ;\
559 fmovd %f0, %f52 ;\
560 fmovd %f0, %f54 ;\
561 fmovd %f0, %f56 ;\
562 fmovd %f0, %f58 ;\
563 fmovd %f0, %f60 ;\
564 fmovd %f0, %f62
565
566 /*
567 * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
568 * Used to save and restore in-use fp registers when we want to use FP
569 * and find fp already in use and copy size still large enough to justify
570 * the additional overhead of this save and restore.
571 *
572 * A membar #Sync is needed before save to sync fp ops initiated before
573 * the call to the copy function (by whoever has fp in use); for example
574 * an earlier block load to the quadrant we are about to save may still be
575 * "in flight". A membar #Sync is required at the end of the save to
576 * sync our block store (the copy code is about to begin ldd's to the
577 * first quadrant).
578 *
579 * Similarly: a membar #Sync before restore allows the block stores of
580 * the copy operation to complete before we fill the quadrants with their
581 * original data, and a membar #Sync after restore lets the block loads
582 * of the restore complete before we return to whoever has the fp regs
583 * in use. To avoid repeated membar #Sync we make it the responsibility
584 * of the copy code to membar #Sync immediately after copy is complete
585 * and before using the BLD_*_FROMSTACK macro.
586 */
587 #define BST_FPQ1Q3_TOSTACK(tmp1) \
588 /* membar #Sync */ ;\
589 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
590 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
591 stda %f0, [tmp1]ASI_BLK_P ;\
592 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
593 stda %f32, [tmp1]ASI_BLK_P ;\
594 membar #Sync
595
596 #define BLD_FPQ1Q3_FROMSTACK(tmp1) \
597 /* membar #Sync - provided at copy completion */ ;\
598 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
599 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
600 ldda [tmp1]ASI_BLK_P, %f0 ;\
601 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
602 ldda [tmp1]ASI_BLK_P, %f32 ;\
603 membar #Sync
604
605 #define BST_FPQ2Q4_TOSTACK(tmp1) \
606 /* membar #Sync */ ;\
607 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
608 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
609 stda %f16, [tmp1]ASI_BLK_P ;\
610 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
611 stda %f48, [tmp1]ASI_BLK_P ;\
612 membar #Sync
613
614 #define BLD_FPQ2Q4_FROMSTACK(tmp1) \
615 /* membar #Sync - provided at copy completion */ ;\
616 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
617 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
618 ldda [tmp1]ASI_BLK_P, %f16 ;\
619 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
620 ldda [tmp1]ASI_BLK_P, %f48 ;\
621 membar #Sync
622
623 /*
624 * FP_NOMIGRATE and FP_ALLOWMIGRATE. Prevent migration (or, stronger,
625 * prevent preemption if there is no t_lwp to save FP state to on context
626 * switch) before commencing a FP copy, and reallow it on completion or
627 * in error trampoline paths when we were using FP copy.
628 *
629 * Both macros may call other functions, so be aware that all outputs are
630 * forfeit after using these macros. For this reason we do not pass registers
631 * to use - we just use any outputs we want.
632 *
633 * Pseudo code:
634 *
635 * FP_NOMIGRATE:
636 *
637 * if (curthread->t_lwp) {
638 * thread_nomigrate();
639 * } else {
640 * kpreempt_disable();
641 * }
642 *
643 * FP_ALLOWMIGRATE:
644 *
645 * if (curthread->t_lwp) {
646 * thread_allowmigrate();
647 * } else {
648 * kpreempt_enable();
649 * }
650 */
651
652 #define FP_NOMIGRATE(label1, label2) \
653 ldn [THREAD_REG + T_LWP], %o0 ;\
654 brz,a,pn %o0, label1/**/f ;\
655 ldsb [THREAD_REG + T_PREEMPT], %o1 ;\
656 call thread_nomigrate ;\
657 nop ;\
658 ba label2/**/f ;\
659 nop ;\
660 label1: ;\
661 inc %o1 ;\
662 stb %o1, [THREAD_REG + T_PREEMPT] ;\
663 label2:
664
665 #define FP_ALLOWMIGRATE(label1, label2) \
666 ldn [THREAD_REG + T_LWP], %o0 ;\
667 brz,a,pn %o0, label1/**/f ;\
668 ldsb [THREAD_REG + T_PREEMPT], %o1 ;\
669 call thread_allowmigrate ;\
670 nop ;\
671 ba label2/**/f ;\
672 nop ;\
673 label1: ;\
674 dec %o1 ;\
675 brnz,pn %o1, label2/**/f ;\
676 stb %o1, [THREAD_REG + T_PREEMPT] ;\
677 ldn [THREAD_REG + T_CPU], %o0 ;\
678 ldub [%o0 + CPU_KPRUNRUN], %o0 ;\
679 brz,pt %o0, label2/**/f ;\
680 nop ;\
681 call kpreempt ;\
682 rdpr %pil, %o0 ;\
683 label2:
684
685 /*
686 * Copy a block of storage, returning an error code if `from' or
687 * `to' takes a kernel pagefault which cannot be resolved.
688 * Returns errno value on pagefault error, 0 if all ok
689 */
690
691 .seg ".text"
692 .align 4
693
694 ENTRY(kcopy)
695
696 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
697 bleu,pt %ncc, .kcopy_small ! go to larger cases
698 xor %o0, %o1, %o3 ! are src, dst alignable?
699 btst 7, %o3 !
700 bz,pt %ncc, .kcopy_8 ! check for longword alignment
701 nop
702 btst 1, %o3 !
703 bz,pt %ncc, .kcopy_2 ! check for half-word
704 nop
705 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
706 ld [%o3 + %lo(hw_copy_limit_1)], %o3
707 tst %o3
708 bz,pn %icc, .kcopy_small ! if zero, disable HW copy
709 cmp %o2, %o3 ! if length <= limit
710 bleu,pt %ncc, .kcopy_small ! go to small copy
711 nop
712 ba,pt %ncc, .kcopy_more ! otherwise go to large copy
713 nop
714 .kcopy_2:
715 btst 3, %o3 !
716 bz,pt %ncc, .kcopy_4 ! check for word alignment
717 nop
718 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
719 ld [%o3 + %lo(hw_copy_limit_2)], %o3
720 tst %o3
721 bz,pn %icc, .kcopy_small ! if zero, disable HW copy
722 cmp %o2, %o3 ! if length <= limit
723 bleu,pt %ncc, .kcopy_small ! go to small copy
724 nop
725 ba,pt %ncc, .kcopy_more ! otherwise go to large copy
726 nop
727 .kcopy_4:
728 ! already checked longword, must be word aligned
729 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
730 ld [%o3 + %lo(hw_copy_limit_4)], %o3
731 tst %o3
732 bz,pn %icc, .kcopy_small ! if zero, disable HW copy
733 cmp %o2, %o3 ! if length <= limit
734 bleu,pt %ncc, .kcopy_small ! go to small copy
735 nop
736 ba,pt %ncc, .kcopy_more ! otherwise go to large copy
737 nop
738 .kcopy_8:
739 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
740 ld [%o3 + %lo(hw_copy_limit_8)], %o3
741 tst %o3
742 bz,pn %icc, .kcopy_small ! if zero, disable HW copy
743 cmp %o2, %o3 ! if length <= limit
744 bleu,pt %ncc, .kcopy_small ! go to small copy
745 nop
746 ba,pt %ncc, .kcopy_more ! otherwise go to large copy
747 nop
748
749 .kcopy_small:
750 sethi %hi(.sm_copyerr), %o5 ! sm_copyerr is lofault value
751 or %o5, %lo(.sm_copyerr), %o5
752 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler
753 membar #Sync ! sync error barrier
754 ba,pt %ncc, .sm_do_copy ! common code
755 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault
756
757 .kcopy_more:
758 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
759 sethi %hi(.copyerr), %l7 ! copyerr is lofault value
760 or %l7, %lo(.copyerr), %l7
761 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler
762 membar #Sync ! sync error barrier
763 ba,pt %ncc, .do_copy ! common code
764 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
765
766
767 /*
768 * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
769 * Errno value is in %g1. bcopy_more uses fp quadrants 1 and 3.
770 */
771 .copyerr:
772 set .copyerr2, %l0
773 membar #Sync ! sync error barrier
774 stn %l0, [THREAD_REG + T_LOFAULT] ! set t_lofault
775 btst FPUSED_FLAG, %l6
776 bz %ncc, 1f
777 and %l6, TRAMP_FLAG, %l0 ! copy trampoline flag to %l0
778
779 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr
780 wr %o2, 0, %gsr
781
782 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
783 btst FPRS_FEF, %o3
784 bz,pt %icc, 4f
785 nop
786
787 BLD_FPQ1Q3_FROMSTACK(%o2)
788
789 ba,pt %ncc, 1f
790 wr %o3, 0, %fprs ! restore fprs
791
792 4:
793 FZEROQ1Q3
794 wr %o3, 0, %fprs ! restore fprs
795
796 !
797 ! Need to cater for the different expectations of kcopy
798 ! and bcopy. kcopy will *always* set a t_lofault handler
799 ! If it fires, we're expected to just return the error code
800 ! and *not* to invoke any existing error handler. As far as
801 ! bcopy is concerned, we only set t_lofault if there was an
802 ! existing lofault handler. In that case we're expected to
803 ! invoke the previously existing handler after resetting the
804 ! t_lofault value.
805 !
806 1:
807 andn %l6, MASK_FLAGS, %l6 ! turn trampoline flag off
808 membar #Sync ! sync error barrier
809 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
810 FP_ALLOWMIGRATE(5, 6)
811
812 btst TRAMP_FLAG, %l0
813 bnz,pn %ncc, 3f
814 nop
815 ret
816 restore %g1, 0, %o0
817
818 3:
819 !
820 ! We're here via bcopy. There *must* have been an error handler
821 ! in place otherwise we would have died a nasty death already.
822 !
823 jmp %l6 ! goto real handler
824 restore %g0, 0, %o0 ! dispose of copy window
825
826 /*
827 * We got here because of a fault in .copyerr. We can't safely restore fp
828 * state, so we panic.
829 */
830 fp_panic_msg:
831 .asciz "Unable to restore fp state after copy operation"
832
833 .align 4
834 .copyerr2:
835 set fp_panic_msg, %o0
836 call panic
837 nop
838
839 /*
840 * We got here because of a fault during a small kcopy or bcopy.
841 * No floating point registers are used by the small copies.
842 * Errno value is in %g1.
843 */
844 .sm_copyerr:
845 1:
846 btst TRAMP_FLAG, %o4
847 membar #Sync
848 andn %o4, TRAMP_FLAG, %o4
849 bnz,pn %ncc, 3f
850 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
851 retl
852 mov %g1, %o0
853 3:
854 jmp %o4 ! goto real handler
855 mov %g0, %o0 !
856
857 SET_SIZE(kcopy)
858
859
860 /*
861 * Copy a block of storage - must not overlap (from + len <= to).
862 * Registers: l6 - saved t_lofault
863 * (for short copies, o4 - saved t_lofault)
864 *
865 * Copy a page of memory.
866 * Assumes double word alignment and a count >= 256.
867 */
868
869 ENTRY(bcopy)
870
871 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
872 bleu,pt %ncc, .bcopy_small ! go to larger cases
873 xor %o0, %o1, %o3 ! are src, dst alignable?
874 btst 7, %o3 !
875 bz,pt %ncc, .bcopy_8 ! check for longword alignment
876 nop
877 btst 1, %o3 !
878 bz,pt %ncc, .bcopy_2 ! check for half-word
879 nop
880 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
881 ld [%o3 + %lo(hw_copy_limit_1)], %o3
882 tst %o3
883 bz,pn %icc, .bcopy_small ! if zero, disable HW copy
884 cmp %o2, %o3 ! if length <= limit
885 bleu,pt %ncc, .bcopy_small ! go to small copy
886 nop
887 ba,pt %ncc, .bcopy_more ! otherwise go to large copy
888 nop
889 .bcopy_2:
890 btst 3, %o3 !
891 bz,pt %ncc, .bcopy_4 ! check for word alignment
892 nop
893 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
894 ld [%o3 + %lo(hw_copy_limit_2)], %o3
895 tst %o3
896 bz,pn %icc, .bcopy_small ! if zero, disable HW copy
897 cmp %o2, %o3 ! if length <= limit
898 bleu,pt %ncc, .bcopy_small ! go to small copy
899 nop
900 ba,pt %ncc, .bcopy_more ! otherwise go to large copy
901 nop
902 .bcopy_4:
903 ! already checked longword, must be word aligned
904 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
905 ld [%o3 + %lo(hw_copy_limit_4)], %o3
906 tst %o3
907 bz,pn %icc, .bcopy_small ! if zero, disable HW copy
908 cmp %o2, %o3 ! if length <= limit
909 bleu,pt %ncc, .bcopy_small ! go to small copy
910 nop
911 ba,pt %ncc, .bcopy_more ! otherwise go to large copy
912 nop
913 .bcopy_8:
914 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
915 ld [%o3 + %lo(hw_copy_limit_8)], %o3
916 tst %o3
917 bz,pn %icc, .bcopy_small ! if zero, disable HW copy
918 cmp %o2, %o3 ! if length <= limit
919 bleu,pt %ncc, .bcopy_small ! go to small copy
920 nop
921 ba,pt %ncc, .bcopy_more ! otherwise go to large copy
922 nop
923
924 .align 16
925 .bcopy_small:
926 ldn [THREAD_REG + T_LOFAULT], %o4 ! save t_lofault
927 tst %o4
928 bz,pt %icc, .sm_do_copy
929 nop
930 sethi %hi(.sm_copyerr), %o5
931 or %o5, %lo(.sm_copyerr), %o5
932 membar #Sync ! sync error barrier
933 stn %o5, [THREAD_REG + T_LOFAULT] ! install new vector
934 or %o4, TRAMP_FLAG, %o4 ! error should trampoline
935 .sm_do_copy:
936 cmp %o2, SHORTCOPY ! check for really short case
937 bleu,pt %ncc, .bc_sm_left !
938 cmp %o2, CHKSIZE ! check for medium length cases
939 bgu,pn %ncc, .bc_med !
940 or %o0, %o1, %o3 ! prepare alignment check
941 andcc %o3, 0x3, %g0 ! test for alignment
942 bz,pt %ncc, .bc_sm_word ! branch to word aligned case
943 .bc_sm_movebytes:
944 sub %o2, 3, %o2 ! adjust count to allow cc zero test
945 .bc_sm_notalign4:
946 ldub [%o0], %o3 ! read byte
947 stb %o3, [%o1] ! write byte
948 subcc %o2, 4, %o2 ! reduce count by 4
949 ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes
950 add %o0, 4, %o0 ! advance SRC by 4
951 stb %o3, [%o1 + 1]
952 ldub [%o0 - 2], %o3
953 add %o1, 4, %o1 ! advance DST by 4
954 stb %o3, [%o1 - 2]
955 ldub [%o0 - 1], %o3
956 bgt,pt %ncc, .bc_sm_notalign4 ! loop til 3 or fewer bytes remain
957 stb %o3, [%o1 - 1]
958 add %o2, 3, %o2 ! restore count
959 .bc_sm_left:
960 tst %o2
961 bz,pt %ncc, .bc_sm_exit ! check for zero length
962 deccc %o2 ! reduce count for cc test
963 ldub [%o0], %o3 ! move one byte
964 bz,pt %ncc, .bc_sm_exit
965 stb %o3, [%o1]
966 ldub [%o0 + 1], %o3 ! move another byte
967 deccc %o2 ! check for more
968 bz,pt %ncc, .bc_sm_exit
969 stb %o3, [%o1 + 1]
970 ldub [%o0 + 2], %o3 ! move final byte
971 ba,pt %ncc, .bc_sm_exit
972 stb %o3, [%o1 + 2]
973 .align 16
974 nop ! instruction alignment
975 ! see discussion at start of file
976 .bc_sm_words:
977 lduw [%o0], %o3 ! read word
978 .bc_sm_wordx:
979 subcc %o2, 8, %o2 ! update count
980 stw %o3, [%o1] ! write word
981 add %o0, 8, %o0 ! update SRC
982 lduw [%o0 - 4], %o3 ! read word
983 add %o1, 8, %o1 ! update DST
984 bgt,pt %ncc, .bc_sm_words ! loop til done
985 stw %o3, [%o1 - 4] ! write word
986 addcc %o2, 7, %o2 ! restore count
987 bz,pt %ncc, .bc_sm_exit
988 deccc %o2
989 bz,pt %ncc, .bc_sm_byte
990 .bc_sm_half:
991 subcc %o2, 2, %o2 ! reduce count by 2
992 add %o0, 2, %o0 ! advance SRC by 2
993 lduh [%o0 - 2], %o3 ! read half word
994 add %o1, 2, %o1 ! advance DST by 2
995 bgt,pt %ncc, .bc_sm_half ! loop til done
996 sth %o3, [%o1 - 2] ! write half word
997 addcc %o2, 1, %o2 ! restore count
998 bz,pt %ncc, .bc_sm_exit
999 nop
1000 .bc_sm_byte:
1001 ldub [%o0], %o3
1002 ba,pt %ncc, .bc_sm_exit
1003 stb %o3, [%o1]
1004
1005 .bc_sm_word:
1006 subcc %o2, 4, %o2 ! update count
1007 bgt,pt %ncc, .bc_sm_wordx
1008 lduw [%o0], %o3 ! read word
1009 addcc %o2, 3, %o2 ! restore count
1010 bz,pt %ncc, .bc_sm_exit
1011 stw %o3, [%o1] ! write word
1012 deccc %o2 ! reduce count for cc test
1013 ldub [%o0 + 4], %o3 ! load one byte
1014 bz,pt %ncc, .bc_sm_exit
1015 stb %o3, [%o1 + 4] ! store one byte
1016 ldub [%o0 + 5], %o3 ! load second byte
1017 deccc %o2
1018 bz,pt %ncc, .bc_sm_exit
1019 stb %o3, [%o1 + 5] ! store second byte
1020 ldub [%o0 + 6], %o3 ! load third byte
1021 stb %o3, [%o1 + 6] ! store third byte
1022 .bc_sm_exit:
1023 ldn [THREAD_REG + T_LOFAULT], %o3
1024 brz,pt %o3, .bc_sm_done
1025 nop
1026 membar #Sync ! sync error barrier
1027 andn %o4, TRAMP_FLAG, %o4
1028 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1029 .bc_sm_done:
1030 retl
1031 mov %g0, %o0 ! return 0
1032
1033 .align 16
1034 .bc_med:
1035 xor %o0, %o1, %o3 ! setup alignment check
1036 btst 1, %o3
1037 bnz,pt %ncc, .bc_sm_movebytes ! unaligned
1038 nop
1039 btst 3, %o3
1040 bnz,pt %ncc, .bc_med_half ! halfword aligned
1041 nop
1042 btst 7, %o3
1043 bnz,pt %ncc, .bc_med_word ! word aligned
1044 nop
1045 .bc_med_long:
1046 btst 3, %o0 ! check for
1047 bz,pt %ncc, .bc_med_long1 ! word alignment
1048 nop
1049 .bc_med_long0:
1050 ldub [%o0], %o3 ! load one byte
1051 inc %o0
1052 stb %o3,[%o1] ! store byte
1053 inc %o1
1054 btst 3, %o0
1055 bnz,pt %ncc, .bc_med_long0
1056 dec %o2
1057 .bc_med_long1: ! word aligned
1058 btst 7, %o0 ! check for long word
1059 bz,pt %ncc, .bc_med_long2
1060 nop
1061 lduw [%o0], %o3 ! load word
1062 add %o0, 4, %o0 ! advance SRC by 4
1063 stw %o3, [%o1] ! store word
1064 add %o1, 4, %o1 ! advance DST by 4
1065 sub %o2, 4, %o2 ! reduce count by 4
1066 !
1067 ! Now long word aligned and have at least 32 bytes to move
1068 !
1069 .bc_med_long2:
1070 sub %o2, 31, %o2 ! adjust count to allow cc zero test
1071 .bc_med_lmove:
1072 ldx [%o0], %o3 ! read long word
1073 stx %o3, [%o1] ! write long word
1074 subcc %o2, 32, %o2 ! reduce count by 32
1075 ldx [%o0 + 8], %o3 ! repeat for a total for 4 long words
1076 add %o0, 32, %o0 ! advance SRC by 32
1077 stx %o3, [%o1 + 8]
1078 ldx [%o0 - 16], %o3
1079 add %o1, 32, %o1 ! advance DST by 32
1080 stx %o3, [%o1 - 16]
1081 ldx [%o0 - 8], %o3
1082 bgt,pt %ncc, .bc_med_lmove ! loop til 31 or fewer bytes left
1083 stx %o3, [%o1 - 8]
1084 addcc %o2, 24, %o2 ! restore count to long word offset
1085 ble,pt %ncc, .bc_med_lextra ! check for more long words to move
1086 nop
1087 .bc_med_lword:
1088 ldx [%o0], %o3 ! read long word
1089 subcc %o2, 8, %o2 ! reduce count by 8
1090 stx %o3, [%o1] ! write long word
1091 add %o0, 8, %o0 ! advance SRC by 8
1092 bgt,pt %ncc, .bc_med_lword ! loop til 7 or fewer bytes left
1093 add %o1, 8, %o1 ! advance DST by 8
1094 .bc_med_lextra:
1095 addcc %o2, 7, %o2 ! restore rest of count
1096 bz,pt %ncc, .bc_sm_exit ! if zero, then done
1097 deccc %o2
1098 bz,pt %ncc, .bc_sm_byte
1099 nop
1100 ba,pt %ncc, .bc_sm_half
1101 nop
1102
1103 .align 16
1104 .bc_med_word:
1105 btst 3, %o0 ! check for
1106 bz,pt %ncc, .bc_med_word1 ! word alignment
1107 nop
1108 .bc_med_word0:
1109 ldub [%o0], %o3 ! load one byte
1110 inc %o0
1111 stb %o3,[%o1] ! store byte
1112 inc %o1
1113 btst 3, %o0
1114 bnz,pt %ncc, .bc_med_word0
1115 dec %o2
1116 !
1117 ! Now word aligned and have at least 36 bytes to move
1118 !
1119 .bc_med_word1:
1120 sub %o2, 15, %o2 ! adjust count to allow cc zero test
1121 .bc_med_wmove:
1122 lduw [%o0], %o3 ! read word
1123 stw %o3, [%o1] ! write word
1124 subcc %o2, 16, %o2 ! reduce count by 16
1125 lduw [%o0 + 4], %o3 ! repeat for a total for 4 words
1126 add %o0, 16, %o0 ! advance SRC by 16
1127 stw %o3, [%o1 + 4]
1128 lduw [%o0 - 8], %o3
1129 add %o1, 16, %o1 ! advance DST by 16
1130 stw %o3, [%o1 - 8]
1131 lduw [%o0 - 4], %o3
1132 bgt,pt %ncc, .bc_med_wmove ! loop til 15 or fewer bytes left
1133 stw %o3, [%o1 - 4]
1134 addcc %o2, 12, %o2 ! restore count to word offset
1135 ble,pt %ncc, .bc_med_wextra ! check for more words to move
1136 nop
1137 .bc_med_word2:
1138 lduw [%o0], %o3 ! read word
1139 subcc %o2, 4, %o2 ! reduce count by 4
1140 stw %o3, [%o1] ! write word
1141 add %o0, 4, %o0 ! advance SRC by 4
1142 bgt,pt %ncc, .bc_med_word2 ! loop til 3 or fewer bytes left
1143 add %o1, 4, %o1 ! advance DST by 4
1144 .bc_med_wextra:
1145 addcc %o2, 3, %o2 ! restore rest of count
1146 bz,pt %ncc, .bc_sm_exit ! if zero, then done
1147 deccc %o2
1148 bz,pt %ncc, .bc_sm_byte
1149 nop
1150 ba,pt %ncc, .bc_sm_half
1151 nop
1152
1153 .align 16
1154 .bc_med_half:
1155 btst 1, %o0 ! check for
1156 bz,pt %ncc, .bc_med_half1 ! half word alignment
1157 nop
1158 ldub [%o0], %o3 ! load one byte
1159 inc %o0
1160 stb %o3,[%o1] ! store byte
1161 inc %o1
1162 dec %o2
1163 !
1164 ! Now half word aligned and have at least 38 bytes to move
1165 !
1166 .bc_med_half1:
1167 sub %o2, 7, %o2 ! adjust count to allow cc zero test
1168 .bc_med_hmove:
1169 lduh [%o0], %o3 ! read half word
1170 sth %o3, [%o1] ! write half word
1171 subcc %o2, 8, %o2 ! reduce count by 8
1172 lduh [%o0 + 2], %o3 ! repeat for a total for 4 halfwords
1173 add %o0, 8, %o0 ! advance SRC by 8
1174 sth %o3, [%o1 + 2]
1175 lduh [%o0 - 4], %o3
1176 add %o1, 8, %o1 ! advance DST by 8
1177 sth %o3, [%o1 - 4]
1178 lduh [%o0 - 2], %o3
1179 bgt,pt %ncc, .bc_med_hmove ! loop til 7 or fewer bytes left
1180 sth %o3, [%o1 - 2]
1181 addcc %o2, 7, %o2 ! restore count
1182 bz,pt %ncc, .bc_sm_exit
1183 deccc %o2
1184 bz,pt %ncc, .bc_sm_byte
1185 nop
1186 ba,pt %ncc, .bc_sm_half
1187 nop
1188
1189 SET_SIZE(bcopy)
1190
1191 /*
1192 * The _more entry points are not intended to be used directly by
1193 * any caller from outside this file. They are provided to allow
1194 * profiling and dtrace of the portions of the copy code that uses
1195 * the floating point registers.
1196 * This entry is particularly important as DTRACE (at least as of
1197 * 4/2004) does not support leaf functions.
1198 */
1199
1200 ENTRY(bcopy_more)
1201 .bcopy_more:
1202 prefetch [%o0], #n_reads
1203 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1204 ldn [THREAD_REG + T_LOFAULT], %l6 ! save t_lofault
1205 tst %l6
1206 bz,pt %ncc, .do_copy
1207 nop
1208 sethi %hi(.copyerr), %o2
1209 or %o2, %lo(.copyerr), %o2
1210 membar #Sync ! sync error barrier
1211 stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector
1212 !
1213 ! We've already captured whether t_lofault was zero on entry.
1214 ! We need to mark ourselves as being from bcopy since both
1215 ! kcopy and bcopy use the same code path. If TRAMP_FLAG is set
1216 ! and the saved lofault was zero, we won't reset lofault on
1217 ! returning.
1218 !
1219 or %l6, TRAMP_FLAG, %l6
1220
1221 /*
1222 * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes
1223 * Also, use of FP registers has been tested to be enabled
1224 */
1225 .do_copy:
1226 FP_NOMIGRATE(6, 7)
1227
1228 rd %fprs, %o2 ! check for unused fp
1229 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
1230 btst FPRS_FEF, %o2
1231 bz,a,pt %icc, .do_blockcopy
1232 wr %g0, FPRS_FEF, %fprs
1233
1234 BST_FPQ1Q3_TOSTACK(%o2)
1235
1236 .do_blockcopy:
1237 rd %gsr, %o2
1238 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr
1239 or %l6, FPUSED_FLAG, %l6
1240
1241 #define REALSRC %i0
1242 #define DST %i1
1243 #define CNT %i2
1244 #define SRC %i3
1245 #define TMP %i5
1246
1247 andcc DST, VIS_BLOCKSIZE - 1, TMP
1248 bz,pt %ncc, 2f
1249 neg TMP
1250 add TMP, VIS_BLOCKSIZE, TMP
1251
1252 ! TMP = bytes required to align DST on FP_BLOCK boundary
1253 ! Using SRC as a tmp here
1254 cmp TMP, 3
1255 bleu,pt %ncc, 1f
1256 sub CNT,TMP,CNT ! adjust main count
1257 sub TMP, 3, TMP ! adjust for end of loop test
1258 .bc_blkalign:
1259 ldub [REALSRC], SRC ! move 4 bytes per loop iteration
1260 stb SRC, [DST]
1261 subcc TMP, 4, TMP
1262 ldub [REALSRC + 1], SRC
1263 add REALSRC, 4, REALSRC
1264 stb SRC, [DST + 1]
1265 ldub [REALSRC - 2], SRC
1266 add DST, 4, DST
1267 stb SRC, [DST - 2]
1268 ldub [REALSRC - 1], SRC
1269 bgu,pt %ncc, .bc_blkalign
1270 stb SRC, [DST - 1]
1271
1272 addcc TMP, 3, TMP ! restore count adjustment
1273 bz,pt %ncc, 2f ! no bytes left?
1274 nop
1275 1: ldub [REALSRC], SRC
1276 inc REALSRC
1277 inc DST
1278 deccc TMP
1279 bgu %ncc, 1b
1280 stb SRC, [DST - 1]
1281
1282 2:
1283 membar #StoreLoad
1284 andn REALSRC, 0x7, SRC
1285
1286 ! SRC - 8-byte aligned
1287 ! DST - 64-byte aligned
1288 ldd [SRC], %f0
1289 prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
1290 alignaddr REALSRC, %g0, %g0
1291 ldd [SRC + 0x08], %f2
1292 prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
1293 faligndata %f0, %f2, %f32
1294 ldd [SRC + 0x10], %f4
1295 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1296 faligndata %f2, %f4, %f34
1297 ldd [SRC + 0x18], %f6
1298 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1299 faligndata %f4, %f6, %f36
1300 ldd [SRC + 0x20], %f8
1301 prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
1302 faligndata %f6, %f8, %f38
1303 ldd [SRC + 0x28], %f10
1304 prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
1305 faligndata %f8, %f10, %f40
1306 ldd [SRC + 0x30], %f12
1307 prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
1308 faligndata %f10, %f12, %f42
1309 ldd [SRC + 0x38], %f14
1310 ldd [SRC + VIS_BLOCKSIZE], %f0
1311 sub CNT, VIS_BLOCKSIZE, CNT
1312 add SRC, VIS_BLOCKSIZE, SRC
1313 prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
1314 add REALSRC, VIS_BLOCKSIZE, REALSRC
1315 ba,pt %ncc, 1f
1316 prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
1317 .align 32
1318 1:
1319 ldd [SRC + 0x08], %f2
1320 faligndata %f12, %f14, %f44
1321 ldd [SRC + 0x10], %f4
1322 faligndata %f14, %f0, %f46
1323 stda %f32, [DST]ASI_BLK_P
1324 ldd [SRC + 0x18], %f6
1325 faligndata %f0, %f2, %f32
1326 ldd [SRC + 0x20], %f8
1327 faligndata %f2, %f4, %f34
1328 ldd [SRC + 0x28], %f10
1329 faligndata %f4, %f6, %f36
1330 ldd [SRC + 0x30], %f12
1331 faligndata %f6, %f8, %f38
1332 sub CNT, VIS_BLOCKSIZE, CNT
1333 ldd [SRC + 0x38], %f14
1334 faligndata %f8, %f10, %f40
1335 add DST, VIS_BLOCKSIZE, DST
1336 ldd [SRC + VIS_BLOCKSIZE], %f0
1337 faligndata %f10, %f12, %f42
1338 add REALSRC, VIS_BLOCKSIZE, REALSRC
1339 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1340 add SRC, VIS_BLOCKSIZE, SRC
1341 prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1342 cmp CNT, VIS_BLOCKSIZE + 8
1343 bgu,pt %ncc, 1b
1344 prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1345
1346 ! only if REALSRC & 0x7 is 0
1347 cmp CNT, VIS_BLOCKSIZE
1348 bne %ncc, 3f
1349 andcc REALSRC, 0x7, %g0
1350 bz,pt %ncc, 2f
1351 nop
1352 3:
1353 faligndata %f12, %f14, %f44
1354 faligndata %f14, %f0, %f46
1355 stda %f32, [DST]ASI_BLK_P
1356 add DST, VIS_BLOCKSIZE, DST
1357 ba,pt %ncc, 3f
1358 nop
1359 2:
1360 ldd [SRC + 0x08], %f2
1361 fsrc1 %f12, %f44
1362 ldd [SRC + 0x10], %f4
1363 fsrc1 %f14, %f46
1364 stda %f32, [DST]ASI_BLK_P
1365 ldd [SRC + 0x18], %f6
1366 fsrc1 %f0, %f32
1367 ldd [SRC + 0x20], %f8
1368 fsrc1 %f2, %f34
1369 ldd [SRC + 0x28], %f10
1370 fsrc1 %f4, %f36
1371 ldd [SRC + 0x30], %f12
1372 fsrc1 %f6, %f38
1373 ldd [SRC + 0x38], %f14
1374 fsrc1 %f8, %f40
1375 sub CNT, VIS_BLOCKSIZE, CNT
1376 add DST, VIS_BLOCKSIZE, DST
1377 add SRC, VIS_BLOCKSIZE, SRC
1378 add REALSRC, VIS_BLOCKSIZE, REALSRC
1379 fsrc1 %f10, %f42
1380 fsrc1 %f12, %f44
1381 fsrc1 %f14, %f46
1382 stda %f32, [DST]ASI_BLK_P
1383 add DST, VIS_BLOCKSIZE, DST
1384 ba,a,pt %ncc, .bcb_exit
1385 nop
1386
1387 3: tst CNT
1388 bz,a,pt %ncc, .bcb_exit
1389 nop
1390
1391 5: ldub [REALSRC], TMP
1392 inc REALSRC
1393 inc DST
1394 deccc CNT
1395 bgu %ncc, 5b
1396 stb TMP, [DST - 1]
1397 .bcb_exit:
1398 membar #Sync
1399
1400 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr
1401 wr %o2, 0, %gsr
1402
1403 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1404 btst FPRS_FEF, %o3
1405 bz,pt %icc, 4f
1406 nop
1407
1408 BLD_FPQ1Q3_FROMSTACK(%o2)
1409
1410 ba,pt %ncc, 2f
1411 wr %o3, 0, %fprs ! restore fprs
1412 4:
1413 FZEROQ1Q3
1414 wr %o3, 0, %fprs ! restore fprs
1415 2:
1416 membar #Sync ! sync error barrier
1417 andn %l6, MASK_FLAGS, %l6
1418 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1419 FP_ALLOWMIGRATE(5, 6)
1420 ret
1421 restore %g0, 0, %o0
1422
1423 SET_SIZE(bcopy_more)
1424
1425 /*
1426 * Block copy with possibly overlapped operands.
1427 */
1428
1429 ENTRY(ovbcopy)
1430 tst %o2 ! check count
1431 bgu,a %ncc, 1f ! nothing to do or bad arguments
1432 subcc %o0, %o1, %o3 ! difference of from and to address
1433
1434 retl ! return
1435 nop
1436 1:
1437 bneg,a %ncc, 2f
1438 neg %o3 ! if < 0, make it positive
1439 2: cmp %o2, %o3 ! cmp size and abs(from - to)
1440 bleu %ncc, bcopy ! if size <= abs(diff): use bcopy,
1441 .empty ! no overlap
1442 cmp %o0, %o1 ! compare from and to addresses
1443 blu %ncc, .ov_bkwd ! if from < to, copy backwards
1444 nop
1445 !
1446 ! Copy forwards.
1447 !
1448 .ov_fwd:
1449 ldub [%o0], %o3 ! read from address
1450 inc %o0 ! inc from address
1451 stb %o3, [%o1] ! write to address
1452 deccc %o2 ! dec count
1453 bgu %ncc, .ov_fwd ! loop till done
1454 inc %o1 ! inc to address
1455
1456 retl ! return
1457 nop
1458 !
1459 ! Copy backwards.
1460 !
1461 .ov_bkwd:
1462 deccc %o2 ! dec count
1463 ldub [%o0 + %o2], %o3 ! get byte at end of src
1464 bgu %ncc, .ov_bkwd ! loop till done
1465 stb %o3, [%o1 + %o2] ! delay slot, store at end of dst
1466
1467 retl ! return
1468 nop
1469
1470 SET_SIZE(ovbcopy)
1471
1472
1473 /*
1474 * hwblkpagecopy()
1475 *
1476 * Copies exactly one page. This routine assumes the caller (ppcopy)
1477 * has already disabled kernel preemption and has checked
1478 * use_hw_bcopy. Preventing preemption also prevents cpu migration.
1479 */
1480 ENTRY(hwblkpagecopy)
1481 ! get another window w/space for three aligned blocks of saved fpregs
1482 prefetch [%o0], #n_reads
1483 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1484
1485 ! %i0 - source address (arg)
1486 ! %i1 - destination address (arg)
1487 ! %i2 - length of region (not arg)
1488 ! %l0 - saved fprs
1489 ! %l1 - pointer to saved fpregs
1490
1491 rd %fprs, %l0 ! check for unused fp
1492 btst FPRS_FEF, %l0
1493 bz,a,pt %icc, 1f
1494 wr %g0, FPRS_FEF, %fprs
1495
1496 BST_FPQ1Q3_TOSTACK(%l1)
1497
1498 1: set PAGESIZE, CNT
1499 mov REALSRC, SRC
1500
1501 ldd [SRC], %f0
1502 prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
1503 ldd [SRC + 0x08], %f2
1504 prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
1505 fmovd %f0, %f32
1506 ldd [SRC + 0x10], %f4
1507 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1508 fmovd %f2, %f34
1509 ldd [SRC + 0x18], %f6
1510 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1511 fmovd %f4, %f36
1512 ldd [SRC + 0x20], %f8
1513 prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
1514 fmovd %f6, %f38
1515 ldd [SRC + 0x28], %f10
1516 prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
1517 fmovd %f8, %f40
1518 ldd [SRC + 0x30], %f12
1519 prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
1520 fmovd %f10, %f42
1521 ldd [SRC + 0x38], %f14
1522 ldd [SRC + VIS_BLOCKSIZE], %f0
1523 sub CNT, VIS_BLOCKSIZE, CNT
1524 add SRC, VIS_BLOCKSIZE, SRC
1525 prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
1526 ba,pt %ncc, 2f
1527 prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
1528 .align 32
1529 2:
1530 ldd [SRC + 0x08], %f2
1531 fmovd %f12, %f44
1532 ldd [SRC + 0x10], %f4
1533 fmovd %f14, %f46
1534 stda %f32, [DST]ASI_BLK_P
1535 ldd [SRC + 0x18], %f6
1536 fmovd %f0, %f32
1537 ldd [SRC + 0x20], %f8
1538 fmovd %f2, %f34
1539 ldd [SRC + 0x28], %f10
1540 fmovd %f4, %f36
1541 ldd [SRC + 0x30], %f12
1542 fmovd %f6, %f38
1543 ldd [SRC + 0x38], %f14
1544 fmovd %f8, %f40
1545 ldd [SRC + VIS_BLOCKSIZE], %f0
1546 fmovd %f10, %f42
1547 sub CNT, VIS_BLOCKSIZE, CNT
1548 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1549 add DST, VIS_BLOCKSIZE, DST
1550 prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1551 add SRC, VIS_BLOCKSIZE, SRC
1552 cmp CNT, VIS_BLOCKSIZE + 8
1553 bgu,pt %ncc, 2b
1554 prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1555
1556 ! trailing block
1557 ldd [SRC + 0x08], %f2
1558 fsrc1 %f12, %f44
1559 ldd [SRC + 0x10], %f4
1560 fsrc1 %f14, %f46
1561 stda %f32, [DST]ASI_BLK_P
1562 ldd [SRC + 0x18], %f6
1563 fsrc1 %f0, %f32
1564 ldd [SRC + 0x20], %f8
1565 fsrc1 %f2, %f34
1566 ldd [SRC + 0x28], %f10
1567 fsrc1 %f4, %f36
1568 ldd [SRC + 0x30], %f12
1569 fsrc1 %f6, %f38
1570 ldd [SRC + 0x38], %f14
1571 fsrc1 %f8, %f40
1572 sub CNT, VIS_BLOCKSIZE, CNT
1573 add DST, VIS_BLOCKSIZE, DST
1574 add SRC, VIS_BLOCKSIZE, SRC
1575 fsrc1 %f10, %f42
1576 fsrc1 %f12, %f44
1577 fsrc1 %f14, %f46
1578 stda %f32, [DST]ASI_BLK_P
1579
1580 membar #Sync
1581
1582 btst FPRS_FEF, %l0
1583 bz,pt %icc, 2f
1584 nop
1585
1586 BLD_FPQ1Q3_FROMSTACK(%l3)
1587 ba 3f
1588 nop
1589
1590 2: FZEROQ1Q3
1591
1592 3: wr %l0, 0, %fprs ! restore fprs
1593 ret
1594 restore %g0, 0, %o0
1595
1596 SET_SIZE(hwblkpagecopy)
1597
1598
1599 /*
1600 * Transfer data to and from user space -
1601 * Note that these routines can cause faults
1602 * It is assumed that the kernel has nothing at
1603 * less than KERNELBASE in the virtual address space.
1604 *
1605 * Note that copyin(9F) and copyout(9F) are part of the
1606 * DDI/DKI which specifies that they return '-1' on "errors."
1607 *
1608 * Sigh.
1609 *
1610 * So there's two extremely similar routines - xcopyin() and xcopyout()
1611 * which return the errno that we've faithfully computed. This
1612 * allows other callers (e.g. uiomove(9F)) to work correctly.
1613 * Given that these are used pretty heavily, we expand the calling
1614 * sequences inline for all flavours (rather than making wrappers).
1615 *
1616 * There are also stub routines for xcopyout_little and xcopyin_little,
1617 * which currently are intended to handle requests of <= 16 bytes from
1618 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
1619 * is left as an exercise...
1620 */
1621
1622 /*
1623 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
1624 *
1625 * General theory of operation:
1626 *
1627 * The only difference between copy{in,out} and
1628 * xcopy{in,out} is in the error handling routine they invoke
1629 * when a memory access error occurs. xcopyOP returns the errno
1630 * while copyOP returns -1 (see above). copy{in,out}_noerr set
1631 * a special flag (by oring the TRAMP_FLAG into the fault handler address)
1632 * if they are called with a fault handler already in place. That flag
1633 * causes the default handlers to trampoline to the previous handler
1634 * upon an error.
1635 *
1636 * None of the copyops routines grab a window until it's decided that
1637 * we need to do a HW block copy operation. This saves a window
1638 * spill/fill when we're called during socket ops. The typical IO
1639 * path won't cause spill/fill traps.
1640 *
1641 * This code uses a set of 4 limits for the maximum size that will
1642 * be copied given a particular input/output address alignment.
1643 * If the value for a particular limit is zero, the copy will be performed
1644 * by the plain copy loops rather than FPBLK.
1645 *
1646 * See the description of bcopy above for more details of the
1647 * data copying algorithm and the default limits.
1648 *
1649 */
1650
1651 /*
1652 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
1653 */
1654
1655 /*
1656 * We save the arguments in the following registers in case of a fault:
1657 * kaddr - %l1
1658 * uaddr - %l2
1659 * count - %l3
1660 */
1661 #define SAVE_SRC %l1
1662 #define SAVE_DST %l2
1663 #define SAVE_COUNT %l3
1664
1665 #define SM_SAVE_SRC %g4
1666 #define SM_SAVE_DST %g5
1667 #define SM_SAVE_COUNT %o5
1668 #define ERRNO %l5
1669
1670
1671 #define REAL_LOFAULT %l4
1672 /*
1673 * Generic copyio fault handler. This is the first line of defense when a
1674 * fault occurs in (x)copyin/(x)copyout. In order for this to function
1675 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
1676 * This allows us to share common code for all the flavors of the copy
1677 * operations, including the _noerr versions.
1678 *
1679 * Note that this function will restore the original input parameters before
1680 * calling REAL_LOFAULT. So the real handler can vector to the appropriate
1681 * member of the t_copyop structure, if needed.
1682 */
1683 ENTRY(copyio_fault)
1684 membar #Sync
1685 mov %g1,ERRNO ! save errno in ERRNO
1686 btst FPUSED_FLAG, %l6
1687 bz %ncc, 1f
1688 nop
1689
1690 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
1691 wr %o2, 0, %gsr ! restore gsr
1692
1693 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1694 btst FPRS_FEF, %o3
1695 bz,pt %icc, 4f
1696 nop
1697
1698 BLD_FPQ2Q4_FROMSTACK(%o2)
1699
1700 ba,pt %ncc, 1f
1701 wr %o3, 0, %fprs ! restore fprs
1702
1703 4:
1704 FZEROQ2Q4
1705 wr %o3, 0, %fprs ! restore fprs
1706
1707 1:
1708 andn %l6, FPUSED_FLAG, %l6
1709 membar #Sync
1710 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1711 FP_ALLOWMIGRATE(5, 6)
1712
1713 mov SAVE_SRC, %i0
1714 mov SAVE_DST, %i1
1715 jmp REAL_LOFAULT
1716 mov SAVE_COUNT, %i2
1717
1718 SET_SIZE(copyio_fault)
1719
1720
1721 ENTRY(copyout)
1722
1723 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
1724 bleu,pt %ncc, .copyout_small ! go to larger cases
1725 xor %o0, %o1, %o3 ! are src, dst alignable?
1726 btst 7, %o3 !
1727 bz,pt %ncc, .copyout_8 ! check for longword alignment
1728 nop
1729 btst 1, %o3 !
1730 bz,pt %ncc, .copyout_2 ! check for half-word
1731 nop
1732 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
1733 ld [%o3 + %lo(hw_copy_limit_1)], %o3
1734 tst %o3
1735 bz,pn %icc, .copyout_small ! if zero, disable HW copy
1736 cmp %o2, %o3 ! if length <= limit
1737 bleu,pt %ncc, .copyout_small ! go to small copy
1738 nop
1739 ba,pt %ncc, .copyout_more ! otherwise go to large copy
1740 nop
1741 .copyout_2:
1742 btst 3, %o3 !
1743 bz,pt %ncc, .copyout_4 ! check for word alignment
1744 nop
1745 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
1746 ld [%o3 + %lo(hw_copy_limit_2)], %o3
1747 tst %o3
1748 bz,pn %icc, .copyout_small ! if zero, disable HW copy
1749 cmp %o2, %o3 ! if length <= limit
1750 bleu,pt %ncc, .copyout_small ! go to small copy
1751 nop
1752 ba,pt %ncc, .copyout_more ! otherwise go to large copy
1753 nop
1754 .copyout_4:
1755 ! already checked longword, must be word aligned
1756 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
1757 ld [%o3 + %lo(hw_copy_limit_4)], %o3
1758 tst %o3
1759 bz,pn %icc, .copyout_small ! if zero, disable HW copy
1760 cmp %o2, %o3 ! if length <= limit
1761 bleu,pt %ncc, .copyout_small ! go to small copy
1762 nop
1763 ba,pt %ncc, .copyout_more ! otherwise go to large copy
1764 nop
1765 .copyout_8:
1766 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
1767 ld [%o3 + %lo(hw_copy_limit_8)], %o3
1768 tst %o3
1769 bz,pn %icc, .copyout_small ! if zero, disable HW copy
1770 cmp %o2, %o3 ! if length <= limit
1771 bleu,pt %ncc, .copyout_small ! go to small copy
1772 nop
1773 ba,pt %ncc, .copyout_more ! otherwise go to large copy
1774 nop
1775
1776 .align 16
1777 nop ! instruction alignment
1778 ! see discussion at start of file
1779 .copyout_small:
1780 sethi %hi(.sm_copyout_err), %o5 ! .sm_copyout_err is lofault
1781 or %o5, %lo(.sm_copyout_err), %o5
1782 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler
1783 membar #Sync ! sync error barrier
1784 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault
1785 .sm_do_copyout:
1786 mov %o0, SM_SAVE_SRC
1787 mov %o1, SM_SAVE_DST
1788 cmp %o2, SHORTCOPY ! check for really short case
1789 bleu,pt %ncc, .co_sm_left !
1790 mov %o2, SM_SAVE_COUNT
1791 cmp %o2, CHKSIZE ! check for medium length cases
1792 bgu,pn %ncc, .co_med !
1793 or %o0, %o1, %o3 ! prepare alignment check
1794 andcc %o3, 0x3, %g0 ! test for alignment
1795 bz,pt %ncc, .co_sm_word ! branch to word aligned case
1796 .co_sm_movebytes:
1797 sub %o2, 3, %o2 ! adjust count to allow cc zero test
1798 .co_sm_notalign4:
1799 ldub [%o0], %o3 ! read byte
1800 subcc %o2, 4, %o2 ! reduce count by 4
1801 stba %o3, [%o1]ASI_USER ! write byte
1802 inc %o1 ! advance DST by 1
1803 ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes
1804 add %o0, 4, %o0 ! advance SRC by 4
1805 stba %o3, [%o1]ASI_USER
1806 inc %o1 ! advance DST by 1
1807 ldub [%o0 - 2], %o3
1808 stba %o3, [%o1]ASI_USER
1809 inc %o1 ! advance DST by 1
1810 ldub [%o0 - 1], %o3
1811 stba %o3, [%o1]ASI_USER
1812 bgt,pt %ncc, .co_sm_notalign4 ! loop til 3 or fewer bytes remain
1813 inc %o1 ! advance DST by 1
1814 add %o2, 3, %o2 ! restore count
1815 .co_sm_left:
1816 tst %o2
1817 bz,pt %ncc, .co_sm_exit ! check for zero length
1818 nop
1819 ldub [%o0], %o3 ! load one byte
1820 deccc %o2 ! reduce count for cc test
1821 bz,pt %ncc, .co_sm_exit
1822 stba %o3,[%o1]ASI_USER ! store one byte
1823 ldub [%o0 + 1], %o3 ! load second byte
1824 deccc %o2
1825 inc %o1
1826 bz,pt %ncc, .co_sm_exit
1827 stba %o3,[%o1]ASI_USER ! store second byte
1828 ldub [%o0 + 2], %o3 ! load third byte
1829 inc %o1
1830 stba %o3,[%o1]ASI_USER ! store third byte
1831 membar #Sync ! sync error barrier
1832 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1833 retl
1834 mov %g0, %o0 ! return 0
1835 .align 16
1836 .co_sm_words:
1837 lduw [%o0], %o3 ! read word
1838 .co_sm_wordx:
1839 subcc %o2, 8, %o2 ! update count
1840 stwa %o3, [%o1]ASI_USER ! write word
1841 add %o0, 8, %o0 ! update SRC
1842 lduw [%o0 - 4], %o3 ! read word
1843 add %o1, 4, %o1 ! update DST
1844 stwa %o3, [%o1]ASI_USER ! write word
1845 bgt,pt %ncc, .co_sm_words ! loop til done
1846 add %o1, 4, %o1 ! update DST
1847 addcc %o2, 7, %o2 ! restore count
1848 bz,pt %ncc, .co_sm_exit
1849 nop
1850 deccc %o2
1851 bz,pt %ncc, .co_sm_byte
1852 .co_sm_half:
1853 subcc %o2, 2, %o2 ! reduce count by 2
1854 lduh [%o0], %o3 ! read half word
1855 add %o0, 2, %o0 ! advance SRC by 2
1856 stha %o3, [%o1]ASI_USER ! write half word
1857 bgt,pt %ncc, .co_sm_half ! loop til done
1858 add %o1, 2, %o1 ! advance DST by 2
1859 addcc %o2, 1, %o2 ! restore count
1860 bz,pt %ncc, .co_sm_exit
1861 nop
1862 .co_sm_byte:
1863 ldub [%o0], %o3
1864 stba %o3, [%o1]ASI_USER
1865 membar #Sync ! sync error barrier
1866 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1867 retl
1868 mov %g0, %o0 ! return 0
1869 .align 16
1870 .co_sm_word:
1871 subcc %o2, 4, %o2 ! update count
1872 bgt,pt %ncc, .co_sm_wordx
1873 lduw [%o0], %o3 ! read word
1874 addcc %o2, 3, %o2 ! restore count
1875 bz,pt %ncc, .co_sm_exit
1876 stwa %o3, [%o1]ASI_USER ! write word
1877 deccc %o2 ! reduce count for cc test
1878 ldub [%o0 + 4], %o3 ! load one byte
1879 add %o1, 4, %o1
1880 bz,pt %ncc, .co_sm_exit
1881 stba %o3, [%o1]ASI_USER ! store one byte
1882 ldub [%o0 + 5], %o3 ! load second byte
1883 deccc %o2
1884 inc %o1
1885 bz,pt %ncc, .co_sm_exit
1886 stba %o3, [%o1]ASI_USER ! store second byte
1887 ldub [%o0 + 6], %o3 ! load third byte
1888 inc %o1
1889 stba %o3, [%o1]ASI_USER ! store third byte
1890 .co_sm_exit:
1891 membar #Sync ! sync error barrier
1892 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1893 retl
1894 mov %g0, %o0 ! return 0
1895
1896 .align 16
1897 .co_med:
1898 xor %o0, %o1, %o3 ! setup alignment check
1899 btst 1, %o3
1900 bnz,pt %ncc, .co_sm_movebytes ! unaligned
1901 nop
1902 btst 3, %o3
1903 bnz,pt %ncc, .co_med_half ! halfword aligned
1904 nop
1905 btst 7, %o3
1906 bnz,pt %ncc, .co_med_word ! word aligned
1907 nop
1908 .co_med_long:
1909 btst 3, %o0 ! check for
1910 bz,pt %ncc, .co_med_long1 ! word alignment
1911 nop
1912 .co_med_long0:
1913 ldub [%o0], %o3 ! load one byte
1914 inc %o0
1915 stba %o3,[%o1]ASI_USER ! store byte
1916 inc %o1
1917 btst 3, %o0
1918 bnz,pt %ncc, .co_med_long0
1919 dec %o2
1920 .co_med_long1: ! word aligned
1921 btst 7, %o0 ! check for long word
1922 bz,pt %ncc, .co_med_long2
1923 nop
1924 lduw [%o0], %o3 ! load word
1925 add %o0, 4, %o0 ! advance SRC by 4
1926 stwa %o3, [%o1]ASI_USER ! store word
1927 add %o1, 4, %o1 ! advance DST by 4
1928 sub %o2, 4, %o2 ! reduce count by 4
1929 !
1930 ! Now long word aligned and have at least 32 bytes to move
1931 !
1932 .co_med_long2:
1933 sub %o2, 31, %o2 ! adjust count to allow cc zero test
1934 sub %o1, 8, %o1 ! adjust pointer to allow store in
1935 ! branch delay slot instead of add
1936 .co_med_lmove:
1937 add %o1, 8, %o1 ! advance DST by 8
1938 ldx [%o0], %o3 ! read long word
1939 subcc %o2, 32, %o2 ! reduce count by 32
1940 stxa %o3, [%o1]ASI_USER ! write long word
1941 add %o1, 8, %o1 ! advance DST by 8
1942 ldx [%o0 + 8], %o3 ! repeat for a total for 4 long words
1943 add %o0, 32, %o0 ! advance SRC by 32
1944 stxa %o3, [%o1]ASI_USER
1945 ldx [%o0 - 16], %o3
1946 add %o1, 8, %o1 ! advance DST by 8
1947 stxa %o3, [%o1]ASI_USER
1948 ldx [%o0 - 8], %o3
1949 add %o1, 8, %o1 ! advance DST by 8
1950 bgt,pt %ncc, .co_med_lmove ! loop til 31 or fewer bytes left
1951 stxa %o3, [%o1]ASI_USER
1952 add %o1, 8, %o1 ! advance DST by 8
1953 addcc %o2, 24, %o2 ! restore count to long word offset
1954 ble,pt %ncc, .co_med_lextra ! check for more long words to move
1955 nop
1956 .co_med_lword:
1957 ldx [%o0], %o3 ! read long word
1958 subcc %o2, 8, %o2 ! reduce count by 8
1959 stxa %o3, [%o1]ASI_USER ! write long word
1960 add %o0, 8, %o0 ! advance SRC by 8
1961 bgt,pt %ncc, .co_med_lword ! loop til 7 or fewer bytes left
1962 add %o1, 8, %o1 ! advance DST by 8
1963 .co_med_lextra:
1964 addcc %o2, 7, %o2 ! restore rest of count
1965 bz,pt %ncc, .co_sm_exit ! if zero, then done
1966 deccc %o2
1967 bz,pt %ncc, .co_sm_byte
1968 nop
1969 ba,pt %ncc, .co_sm_half
1970 nop
1971
1972 .align 16
1973 nop ! instruction alignment
1974 ! see discussion at start of file
1975 .co_med_word:
1976 btst 3, %o0 ! check for
1977 bz,pt %ncc, .co_med_word1 ! word alignment
1978 nop
1979 .co_med_word0:
1980 ldub [%o0], %o3 ! load one byte
1981 inc %o0
1982 stba %o3,[%o1]ASI_USER ! store byte
1983 inc %o1
1984 btst 3, %o0
1985 bnz,pt %ncc, .co_med_word0
1986 dec %o2
1987 !
1988 ! Now word aligned and have at least 36 bytes to move
1989 !
1990 .co_med_word1:
1991 sub %o2, 15, %o2 ! adjust count to allow cc zero test
1992 .co_med_wmove:
1993 lduw [%o0], %o3 ! read word
1994 subcc %o2, 16, %o2 ! reduce count by 16
1995 stwa %o3, [%o1]ASI_USER ! write word
1996 add %o1, 4, %o1 ! advance DST by 4
1997 lduw [%o0 + 4], %o3 ! repeat for a total for 4 words
1998 add %o0, 16, %o0 ! advance SRC by 16
1999 stwa %o3, [%o1]ASI_USER
2000 add %o1, 4, %o1 ! advance DST by 4
2001 lduw [%o0 - 8], %o3
2002 stwa %o3, [%o1]ASI_USER
2003 add %o1, 4, %o1 ! advance DST by 4
2004 lduw [%o0 - 4], %o3
2005 stwa %o3, [%o1]ASI_USER
2006 bgt,pt %ncc, .co_med_wmove ! loop til 15 or fewer bytes left
2007 add %o1, 4, %o1 ! advance DST by 4
2008 addcc %o2, 12, %o2 ! restore count to word offset
2009 ble,pt %ncc, .co_med_wextra ! check for more words to move
2010 nop
2011 .co_med_word2:
2012 lduw [%o0], %o3 ! read word
2013 subcc %o2, 4, %o2 ! reduce count by 4
2014 stwa %o3, [%o1]ASI_USER ! write word
2015 add %o0, 4, %o0 ! advance SRC by 4
2016 bgt,pt %ncc, .co_med_word2 ! loop til 3 or fewer bytes left
2017 add %o1, 4, %o1 ! advance DST by 4
2018 .co_med_wextra:
2019 addcc %o2, 3, %o2 ! restore rest of count
2020 bz,pt %ncc, .co_sm_exit ! if zero, then done
2021 deccc %o2
2022 bz,pt %ncc, .co_sm_byte
2023 nop
2024 ba,pt %ncc, .co_sm_half
2025 nop
2026
2027 .align 16
2028 nop ! instruction alignment
2029 nop ! see discussion at start of file
2030 nop
2031 .co_med_half:
2032 btst 1, %o0 ! check for
2033 bz,pt %ncc, .co_med_half1 ! half word alignment
2034 nop
2035 ldub [%o0], %o3 ! load one byte
2036 inc %o0
2037 stba %o3,[%o1]ASI_USER ! store byte
2038 inc %o1
2039 dec %o2
2040 !
2041 ! Now half word aligned and have at least 38 bytes to move
2042 !
2043 .co_med_half1:
2044 sub %o2, 7, %o2 ! adjust count to allow cc zero test
2045 .co_med_hmove:
2046 lduh [%o0], %o3 ! read half word
2047 subcc %o2, 8, %o2 ! reduce count by 8
2048 stha %o3, [%o1]ASI_USER ! write half word
2049 add %o1, 2, %o1 ! advance DST by 2
2050 lduh [%o0 + 2], %o3 ! repeat for a total for 4 halfwords
2051 add %o0, 8, %o0 ! advance SRC by 8
2052 stha %o3, [%o1]ASI_USER
2053 add %o1, 2, %o1 ! advance DST by 2
2054 lduh [%o0 - 4], %o3
2055 stha %o3, [%o1]ASI_USER
2056 add %o1, 2, %o1 ! advance DST by 2
2057 lduh [%o0 - 2], %o3
2058 stha %o3, [%o1]ASI_USER
2059 bgt,pt %ncc, .co_med_hmove ! loop til 7 or fewer bytes left
2060 add %o1, 2, %o1 ! advance DST by 2
2061 addcc %o2, 7, %o2 ! restore count
2062 bz,pt %ncc, .co_sm_exit
2063 deccc %o2
2064 bz,pt %ncc, .co_sm_byte
2065 nop
2066 ba,pt %ncc, .co_sm_half
2067 nop
2068
2069 /*
2070 * We got here because of a fault during short copyout.
2071 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
2072 */
2073 .sm_copyout_err:
2074 membar #Sync
2075 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2076 mov SM_SAVE_SRC, %o0
2077 mov SM_SAVE_DST, %o1
2078 mov SM_SAVE_COUNT, %o2
2079 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler
2080 tst %o3
2081 bz,pt %ncc, 3f ! if not, return error
2082 nop
2083 ldn [%o3 + CP_COPYOUT], %o5 ! if handler, invoke it with
2084 jmp %o5 ! original arguments
2085 nop
2086 3:
2087 retl
2088 or %g0, -1, %o0 ! return error value
2089
2090 SET_SIZE(copyout)
2091
2092 /*
2093 * The _more entry points are not intended to be used directly by
2094 * any caller from outside this file. They are provided to allow
2095 * profiling and dtrace of the portions of the copy code that uses
2096 * the floating point registers.
2097 * This entry is particularly important as DTRACE (at least as of
2098 * 4/2004) does not support leaf functions.
2099 */
2100
2101 ENTRY(copyout_more)
2102 .copyout_more:
2103 prefetch [%o0], #n_reads
2104 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2105 set .copyout_err, REAL_LOFAULT
2106
2107 /*
2108 * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
2109 */
2110 .do_copyout:
2111 set copyio_fault, %l7 ! .copyio_fault is lofault val
2112
2113 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler
2114 membar #Sync ! sync error barrier
2115 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
2116
2117 mov %i0, SAVE_SRC
2118 mov %i1, SAVE_DST
2119 mov %i2, SAVE_COUNT
2120
2121 FP_NOMIGRATE(6, 7)
2122
2123 rd %fprs, %o2 ! check for unused fp
2124 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
2125 btst FPRS_FEF, %o2
2126 bz,a,pt %icc, .do_blockcopyout
2127 wr %g0, FPRS_FEF, %fprs
2128
2129 BST_FPQ2Q4_TOSTACK(%o2)
2130
2131 .do_blockcopyout:
2132 rd %gsr, %o2
2133 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr
2134 or %l6, FPUSED_FLAG, %l6
2135
2136 andcc DST, VIS_BLOCKSIZE - 1, TMP
2137 mov ASI_USER, %asi
2138 bz,pt %ncc, 2f
2139 neg TMP
2140 add TMP, VIS_BLOCKSIZE, TMP
2141
2142 ! TMP = bytes required to align DST on FP_BLOCK boundary
2143 ! Using SRC as a tmp here
2144 cmp TMP, 3
2145 bleu,pt %ncc, 1f
2146 sub CNT,TMP,CNT ! adjust main count
2147 sub TMP, 3, TMP ! adjust for end of loop test
2148 .co_blkalign:
2149 ldub [REALSRC], SRC ! move 4 bytes per loop iteration
2150 stba SRC, [DST]%asi
2151 subcc TMP, 4, TMP
2152 ldub [REALSRC + 1], SRC
2153 add REALSRC, 4, REALSRC
2154 stba SRC, [DST + 1]%asi
2155 ldub [REALSRC - 2], SRC
2156 add DST, 4, DST
2157 stba SRC, [DST - 2]%asi
2158 ldub [REALSRC - 1], SRC
2159 bgu,pt %ncc, .co_blkalign
2160 stba SRC, [DST - 1]%asi
2161
2162 addcc TMP, 3, TMP ! restore count adjustment
2163 bz,pt %ncc, 2f ! no bytes left?
2164 nop
2165 1: ldub [REALSRC], SRC
2166 inc REALSRC
2167 inc DST
2168 deccc TMP
2169 bgu %ncc, 1b
2170 stba SRC, [DST - 1]%asi
2171
2172 2:
2173 membar #StoreLoad
2174 andn REALSRC, 0x7, SRC
2175
2176 ! SRC - 8-byte aligned
2177 ! DST - 64-byte aligned
2178 ldd [SRC], %f16
2179 prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
2180 alignaddr REALSRC, %g0, %g0
2181 ldd [SRC + 0x08], %f18
2182 prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
2183 faligndata %f16, %f18, %f48
2184 ldd [SRC + 0x10], %f20
2185 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
2186 faligndata %f18, %f20, %f50
2187 ldd [SRC + 0x18], %f22
2188 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
2189 faligndata %f20, %f22, %f52
2190 ldd [SRC + 0x20], %f24
2191 prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
2192 faligndata %f22, %f24, %f54
2193 ldd [SRC + 0x28], %f26
2194 prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
2195 faligndata %f24, %f26, %f56
2196 ldd [SRC + 0x30], %f28
2197 prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
2198 faligndata %f26, %f28, %f58
2199 ldd [SRC + 0x38], %f30
2200 ldd [SRC + VIS_BLOCKSIZE], %f16
2201 sub CNT, VIS_BLOCKSIZE, CNT
2202 add SRC, VIS_BLOCKSIZE, SRC
2203 prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
2204 add REALSRC, VIS_BLOCKSIZE, REALSRC
2205 ba,pt %ncc, 1f
2206 prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
2207 .align 32
2208 1:
2209 ldd [SRC + 0x08], %f18
2210 faligndata %f28, %f30, %f60
2211 ldd [SRC + 0x10], %f20
2212 faligndata %f30, %f16, %f62
2213 stda %f48, [DST]ASI_BLK_AIUS
2214 ldd [SRC + 0x18], %f22
2215 faligndata %f16, %f18, %f48
2216 ldd [SRC + 0x20], %f24
2217 faligndata %f18, %f20, %f50
2218 ldd [SRC + 0x28], %f26
2219 faligndata %f20, %f22, %f52
2220 ldd [SRC + 0x30], %f28
2221 faligndata %f22, %f24, %f54
2222 sub CNT, VIS_BLOCKSIZE, CNT
2223 ldd [SRC + 0x38], %f30
2224 faligndata %f24, %f26, %f56
2225 add DST, VIS_BLOCKSIZE, DST
2226 ldd [SRC + VIS_BLOCKSIZE], %f16
2227 faligndata %f26, %f28, %f58
2228 add REALSRC, VIS_BLOCKSIZE, REALSRC
2229 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
2230 add SRC, VIS_BLOCKSIZE, SRC
2231 prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
2232 cmp CNT, VIS_BLOCKSIZE + 8
2233 bgu,pt %ncc, 1b
2234 prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
2235
2236 ! only if REALSRC & 0x7 is 0
2237 cmp CNT, VIS_BLOCKSIZE
2238 bne %ncc, 3f
2239 andcc REALSRC, 0x7, %g0
2240 bz,pt %ncc, 2f
2241 nop
2242 3:
2243 faligndata %f28, %f30, %f60
2244 faligndata %f30, %f16, %f62
2245 stda %f48, [DST]ASI_BLK_AIUS
2246 add DST, VIS_BLOCKSIZE, DST
2247 ba,pt %ncc, 3f
2248 nop
2249 2:
2250 ldd [SRC + 0x08], %f18
2251 fsrc1 %f28, %f60
2252 ldd [SRC + 0x10], %f20
2253 fsrc1 %f30, %f62
2254 stda %f48, [DST]ASI_BLK_AIUS
2255 ldd [SRC + 0x18], %f22
2256 fsrc1 %f16, %f48
2257 ldd [SRC + 0x20], %f24
2258 fsrc1 %f18, %f50
2259 ldd [SRC + 0x28], %f26
2260 fsrc1 %f20, %f52
2261 ldd [SRC + 0x30], %f28
2262 fsrc1 %f22, %f54
2263 ldd [SRC + 0x38], %f30
2264 fsrc1 %f24, %f56
2265 sub CNT, VIS_BLOCKSIZE, CNT
2266 add DST, VIS_BLOCKSIZE, DST
2267 add SRC, VIS_BLOCKSIZE, SRC
2268 add REALSRC, VIS_BLOCKSIZE, REALSRC
2269 fsrc1 %f26, %f58
2270 fsrc1 %f28, %f60
2271 fsrc1 %f30, %f62
2272 stda %f48, [DST]ASI_BLK_AIUS
2273 add DST, VIS_BLOCKSIZE, DST
2274 ba,a,pt %ncc, 4f
2275 nop
2276
2277 3: tst CNT
2278 bz,a %ncc, 4f
2279 nop
2280
2281 5: ldub [REALSRC], TMP
2282 inc REALSRC
2283 inc DST
2284 deccc CNT
2285 bgu %ncc, 5b
2286 stba TMP, [DST - 1]%asi
2287 4:
2288
2289 .copyout_exit:
2290 membar #Sync
2291
2292 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
2293 wr %o2, 0, %gsr ! restore gsr
2294
2295 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
2296 btst FPRS_FEF, %o3
2297 bz,pt %icc, 4f
2298 nop
2299
2300 BLD_FPQ2Q4_FROMSTACK(%o2)
2301
2302 ba,pt %ncc, 1f
2303 wr %o3, 0, %fprs ! restore fprs
2304
2305 4:
2306 FZEROQ2Q4
2307 wr %o3, 0, %fprs ! restore fprs
2308
2309 1:
2310 membar #Sync
2311 andn %l6, FPUSED_FLAG, %l6
2312 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2313 FP_ALLOWMIGRATE(5, 6)
2314 ret
2315 restore %g0, 0, %o0
2316
2317 /*
2318 * We got here because of a fault during copyout.
2319 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
2320 */
2321 .copyout_err:
2322 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
2323 tst %o4
2324 bz,pt %ncc, 2f ! if not, return error
2325 nop
2326 ldn [%o4 + CP_COPYOUT], %g2 ! if handler, invoke it with
2327 jmp %g2 ! original arguments
2328 restore %g0, 0, %g0 ! dispose of copy window
2329 2:
2330 ret
2331 restore %g0, -1, %o0 ! return error value
2332
2333
2334 SET_SIZE(copyout_more)
2335
2336
2337 ENTRY(xcopyout)
2338 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
2339 bleu,pt %ncc, .xcopyout_small ! go to larger cases
2340 xor %o0, %o1, %o3 ! are src, dst alignable?
2341 btst 7, %o3 !
2342 bz,pt %ncc, .xcopyout_8 !
2343 nop
2344 btst 1, %o3 !
2345 bz,pt %ncc, .xcopyout_2 ! check for half-word
2346 nop
2347 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
2348 ld [%o3 + %lo(hw_copy_limit_1)], %o3
2349 tst %o3
2350 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy
2351 cmp %o2, %o3 ! if length <= limit
2352 bleu,pt %ncc, .xcopyout_small ! go to small copy
2353 nop
2354 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy
2355 nop
2356 .xcopyout_2:
2357 btst 3, %o3 !
2358 bz,pt %ncc, .xcopyout_4 ! check for word alignment
2359 nop
2360 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
2361 ld [%o3 + %lo(hw_copy_limit_2)], %o3
2362 tst %o3
2363 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy
2364 cmp %o2, %o3 ! if length <= limit
2365 bleu,pt %ncc, .xcopyout_small ! go to small copy
2366 nop
2367 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy
2368 nop
2369 .xcopyout_4:
2370 ! already checked longword, must be word aligned
2371 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
2372 ld [%o3 + %lo(hw_copy_limit_4)], %o3
2373 tst %o3
2374 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy
2375 cmp %o2, %o3 ! if length <= limit
2376 bleu,pt %ncc, .xcopyout_small ! go to small copy
2377 nop
2378 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy
2379 nop
2380 .xcopyout_8:
2381 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
2382 ld [%o3 + %lo(hw_copy_limit_8)], %o3
2383 tst %o3
2384 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy
2385 cmp %o2, %o3 ! if length <= limit
2386 bleu,pt %ncc, .xcopyout_small ! go to small copy
2387 nop
2388 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy
2389 nop
2390
2391 .xcopyout_small:
2392 sethi %hi(.sm_xcopyout_err), %o5 ! .sm_xcopyout_err is lofault
2393 or %o5, %lo(.sm_xcopyout_err), %o5
2394 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler
2395 membar #Sync ! sync error barrier
2396 ba,pt %ncc, .sm_do_copyout ! common code
2397 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault
2398
2399 .xcopyout_more:
2400 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2401 sethi %hi(.xcopyout_err), REAL_LOFAULT
2402 ba,pt %ncc, .do_copyout ! common code
2403 or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
2404
2405 /*
2406 * We got here because of fault during xcopyout
2407 * Errno value is in ERRNO
2408 */
2409 .xcopyout_err:
2410 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
2411 tst %o4
2412 bz,pt %ncc, 2f ! if not, return error
2413 nop
2414 ldn [%o4 + CP_XCOPYOUT], %g2 ! if handler, invoke it with
2415 jmp %g2 ! original arguments
2416 restore %g0, 0, %g0 ! dispose of copy window
2417 2:
2418 ret
2419 restore ERRNO, 0, %o0 ! return errno value
2420
2421 .sm_xcopyout_err:
2422
2423 membar #Sync
2424 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2425 mov SM_SAVE_SRC, %o0
2426 mov SM_SAVE_DST, %o1
2427 mov SM_SAVE_COUNT, %o2
2428 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler
2429 tst %o3
2430 bz,pt %ncc, 3f ! if not, return error
2431 nop
2432 ldn [%o3 + CP_XCOPYOUT], %o5 ! if handler, invoke it with
2433 jmp %o5 ! original arguments
2434 nop
2435 3:
2436 retl
2437 or %g1, 0, %o0 ! return errno value
2438
2439 SET_SIZE(xcopyout)
2440
2441 ENTRY(xcopyout_little)
2442 sethi %hi(.xcopyio_err), %o5
2443 or %o5, %lo(.xcopyio_err), %o5
2444 ldn [THREAD_REG + T_LOFAULT], %o4
2445 membar #Sync ! sync error barrier
2446 stn %o5, [THREAD_REG + T_LOFAULT]
2447 mov %o4, %o5
2448
2449 subcc %g0, %o2, %o3
2450 add %o0, %o2, %o0
2451 bz,pn %ncc, 2f ! check for zero bytes
2452 sub %o2, 1, %o4
2453 add %o0, %o4, %o0 ! start w/last byte
2454 add %o1, %o2, %o1
2455 ldub [%o0 + %o3], %o4
2456
2457 1: stba %o4, [%o1 + %o3]ASI_AIUSL
2458 inccc %o3
2459 sub %o0, 2, %o0 ! get next byte
2460 bcc,a,pt %ncc, 1b
2461 ldub [%o0 + %o3], %o4
2462
2463 2:
2464 membar #Sync ! sync error barrier
2465 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2466 retl
2467 mov %g0, %o0 ! return (0)
2468
2469 SET_SIZE(xcopyout_little)
2470
2471 /*
2472 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
2473 */
2474
2475 ENTRY(copyin)
2476 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
2477 bleu,pt %ncc, .copyin_small ! go to larger cases
2478 xor %o0, %o1, %o3 ! are src, dst alignable?
2479 btst 7, %o3 !
2480 bz,pt %ncc, .copyin_8 ! check for longword alignment
2481 nop
2482 btst 1, %o3 !
2483 bz,pt %ncc, .copyin_2 ! check for half-word
2484 nop
2485 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
2486 ld [%o3 + %lo(hw_copy_limit_1)], %o3
2487 tst %o3
2488 bz,pn %icc, .copyin_small ! if zero, disable HW copy
2489 cmp %o2, %o3 ! if length <= limit
2490 bleu,pt %ncc, .copyin_small ! go to small copy
2491 nop
2492 ba,pt %ncc, .copyin_more ! otherwise go to large copy
2493 nop
2494 .copyin_2:
2495 btst 3, %o3 !
2496 bz,pt %ncc, .copyin_4 ! check for word alignment
2497 nop
2498 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
2499 ld [%o3 + %lo(hw_copy_limit_2)], %o3
2500 tst %o3
2501 bz,pn %icc, .copyin_small ! if zero, disable HW copy
2502 cmp %o2, %o3 ! if length <= limit
2503 bleu,pt %ncc, .copyin_small ! go to small copy
2504 nop
2505 ba,pt %ncc, .copyin_more ! otherwise go to large copy
2506 nop
2507 .copyin_4:
2508 ! already checked longword, must be word aligned
2509 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
2510 ld [%o3 + %lo(hw_copy_limit_4)], %o3
2511 tst %o3
2512 bz,pn %icc, .copyin_small ! if zero, disable HW copy
2513 cmp %o2, %o3 ! if length <= limit
2514 bleu,pt %ncc, .copyin_small ! go to small copy
2515 nop
2516 ba,pt %ncc, .copyin_more ! otherwise go to large copy
2517 nop
2518 .copyin_8:
2519 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
2520 ld [%o3 + %lo(hw_copy_limit_8)], %o3
2521 tst %o3
2522 bz,pn %icc, .copyin_small ! if zero, disable HW copy
2523 cmp %o2, %o3 ! if length <= limit
2524 bleu,pt %ncc, .copyin_small ! go to small copy
2525 nop
2526 ba,pt %ncc, .copyin_more ! otherwise go to large copy
2527 nop
2528
2529 .align 16
2530 nop ! instruction alignment
2531 ! see discussion at start of file
2532 .copyin_small:
2533 sethi %hi(.sm_copyin_err), %o5 ! .sm_copyin_err is lofault
2534 or %o5, %lo(.sm_copyin_err), %o5
2535 ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofault, no tramp
2536 membar #Sync ! sync error barrier
2537 stn %o5, [THREAD_REG + T_LOFAULT]
2538 .sm_do_copyin:
2539 mov %o0, SM_SAVE_SRC
2540 mov %o1, SM_SAVE_DST
2541 cmp %o2, SHORTCOPY ! check for really short case
2542 bleu,pt %ncc, .ci_sm_left !
2543 mov %o2, SM_SAVE_COUNT
2544 cmp %o2, CHKSIZE ! check for medium length cases
2545 bgu,pn %ncc, .ci_med !
2546 or %o0, %o1, %o3 ! prepare alignment check
2547 andcc %o3, 0x3, %g0 ! test for alignment
2548 bz,pt %ncc, .ci_sm_word ! branch to word aligned case
2549 .ci_sm_movebytes:
2550 sub %o2, 3, %o2 ! adjust count to allow cc zero test
2551 .ci_sm_notalign4:
2552 lduba [%o0]ASI_USER, %o3 ! read byte
2553 subcc %o2, 4, %o2 ! reduce count by 4
2554 stb %o3, [%o1] ! write byte
2555 add %o0, 1, %o0 ! advance SRC by 1
2556 lduba [%o0]ASI_USER, %o3 ! repeat for a total of 4 bytes
2557 add %o0, 1, %o0 ! advance SRC by 1
2558 stb %o3, [%o1 + 1]
2559 add %o1, 4, %o1 ! advance DST by 4
2560 lduba [%o0]ASI_USER, %o3
2561 add %o0, 1, %o0 ! advance SRC by 1
2562 stb %o3, [%o1 - 2]
2563 lduba [%o0]ASI_USER, %o3
2564 add %o0, 1, %o0 ! advance SRC by 1
2565 bgt,pt %ncc, .ci_sm_notalign4 ! loop til 3 or fewer bytes remain
2566 stb %o3, [%o1 - 1]
2567 add %o2, 3, %o2 ! restore count
2568 .ci_sm_left:
2569 tst %o2
2570 bz,pt %ncc, .ci_sm_exit
2571 nop
2572 lduba [%o0]ASI_USER, %o3 ! load one byte
2573 deccc %o2 ! reduce count for cc test
2574 bz,pt %ncc, .ci_sm_exit
2575 stb %o3,[%o1] ! store one byte
2576 inc %o0
2577 lduba [%o0]ASI_USER, %o3 ! load second byte
2578 deccc %o2
2579 bz,pt %ncc, .ci_sm_exit
2580 stb %o3,[%o1 + 1] ! store second byte
2581 inc %o0
2582 lduba [%o0]ASI_USER, %o3 ! load third byte
2583 stb %o3,[%o1 + 2] ! store third byte
2584 membar #Sync ! sync error barrier
2585 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2586 retl
2587 mov %g0, %o0 ! return 0
2588 .align 16
2589 .ci_sm_words:
2590 lduwa [%o0]ASI_USER, %o3 ! read word
2591 .ci_sm_wordx:
2592 subcc %o2, 8, %o2 ! update count
2593 stw %o3, [%o1] ! write word
2594 add %o0, 4, %o0 ! update SRC
2595 add %o1, 8, %o1 ! update DST
2596 lduwa [%o0]ASI_USER, %o3 ! read word
2597 add %o0, 4, %o0 ! update SRC
2598 bgt,pt %ncc, .ci_sm_words ! loop til done
2599 stw %o3, [%o1 - 4] ! write word
2600 addcc %o2, 7, %o2 ! restore count
2601 bz,pt %ncc, .ci_sm_exit
2602 nop
2603 deccc %o2
2604 bz,pt %ncc, .ci_sm_byte
2605 .ci_sm_half:
2606 subcc %o2, 2, %o2 ! reduce count by 2
2607 lduha [%o0]ASI_USER, %o3 ! read half word
2608 add %o0, 2, %o0 ! advance SRC by 2
2609 add %o1, 2, %o1 ! advance DST by 2
2610 bgt,pt %ncc, .ci_sm_half ! loop til done
2611 sth %o3, [%o1 - 2] ! write half word
2612 addcc %o2, 1, %o2 ! restore count
2613 bz,pt %ncc, .ci_sm_exit
2614 nop
2615 .ci_sm_byte:
2616 lduba [%o0]ASI_USER, %o3
2617 stb %o3, [%o1]
2618 membar #Sync ! sync error barrier
2619 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2620 retl
2621 mov %g0, %o0 ! return 0
2622 .align 16
2623 .ci_sm_word:
2624 subcc %o2, 4, %o2 ! update count
2625 bgt,pt %ncc, .ci_sm_wordx
2626 lduwa [%o0]ASI_USER, %o3 ! read word
2627 addcc %o2, 3, %o2 ! restore count
2628 bz,pt %ncc, .ci_sm_exit
2629 stw %o3, [%o1] ! write word
2630 deccc %o2 ! reduce count for cc test
2631 add %o0, 4, %o0
2632 lduba [%o0]ASI_USER, %o3 ! load one byte
2633 bz,pt %ncc, .ci_sm_exit
2634 stb %o3, [%o1 + 4] ! store one byte
2635 inc %o0
2636 lduba [%o0]ASI_USER, %o3 ! load second byte
2637 deccc %o2
2638 bz,pt %ncc, .ci_sm_exit
2639 stb %o3, [%o1 + 5] ! store second byte
2640 inc %o0
2641 lduba [%o0]ASI_USER, %o3 ! load third byte
2642 stb %o3, [%o1 + 6] ! store third byte
2643 .ci_sm_exit:
2644 membar #Sync ! sync error barrier
2645 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2646 retl
2647 mov %g0, %o0 ! return 0
2648
2649 .align 16
2650 .ci_med:
2651 xor %o0, %o1, %o3 ! setup alignment check
2652 btst 1, %o3
2653 bnz,pt %ncc, .ci_sm_movebytes ! unaligned
2654 nop
2655 btst 3, %o3
2656 bnz,pt %ncc, .ci_med_half ! halfword aligned
2657 nop
2658 btst 7, %o3
2659 bnz,pt %ncc, .ci_med_word ! word aligned
2660 nop
2661 .ci_med_long:
2662 btst 3, %o0 ! check for
2663 bz,pt %ncc, .ci_med_long1 ! word alignment
2664 nop
2665 .ci_med_long0:
2666 lduba [%o0]ASI_USER, %o3 ! load one byte
2667 inc %o0
2668 stb %o3,[%o1] ! store byte
2669 inc %o1
2670 btst 3, %o0
2671 bnz,pt %ncc, .ci_med_long0
2672 dec %o2
2673 .ci_med_long1: ! word aligned
2674 btst 7, %o0 ! check for long word
2675 bz,pt %ncc, .ci_med_long2
2676 nop
2677 lduwa [%o0]ASI_USER, %o3 ! load word
2678 add %o0, 4, %o0 ! advance SRC by 4
2679 stw %o3, [%o1] ! store word
2680 add %o1, 4, %o1 ! advance DST by 4
2681 sub %o2, 4, %o2 ! reduce count by 4
2682 !
2683 ! Now long word aligned and have at least 32 bytes to move
2684 !
2685 .ci_med_long2:
2686 sub %o2, 31, %o2 ! adjust count to allow cc zero test
2687 .ci_med_lmove:
2688 ldxa [%o0]ASI_USER, %o3 ! read long word
2689 subcc %o2, 32, %o2 ! reduce count by 32
2690 stx %o3, [%o1] ! write long word
2691 add %o0, 8, %o0 ! advance SRC by 8
2692 ldxa [%o0]ASI_USER, %o3 ! repeat for a total for 4 long words
2693 add %o0, 8, %o0 ! advance SRC by 8
2694 stx %o3, [%o1 + 8]
2695 add %o1, 32, %o1 ! advance DST by 32
2696 ldxa [%o0]ASI_USER, %o3
2697 add %o0, 8, %o0 ! advance SRC by 8
2698 stx %o3, [%o1 - 16]
2699 ldxa [%o0]ASI_USER, %o3
2700 add %o0, 8, %o0 ! advance SRC by 8
2701 bgt,pt %ncc, .ci_med_lmove ! loop til 31 or fewer bytes left
2702 stx %o3, [%o1 - 8]
2703 addcc %o2, 24, %o2 ! restore count to long word offset
2704 ble,pt %ncc, .ci_med_lextra ! check for more long words to move
2705 nop
2706 .ci_med_lword:
2707 ldxa [%o0]ASI_USER, %o3 ! read long word
2708 subcc %o2, 8, %o2 ! reduce count by 8
2709 stx %o3, [%o1] ! write long word
2710 add %o0, 8, %o0 ! advance SRC by 8
2711 bgt,pt %ncc, .ci_med_lword ! loop til 7 or fewer bytes left
2712 add %o1, 8, %o1 ! advance DST by 8
2713 .ci_med_lextra:
2714 addcc %o2, 7, %o2 ! restore rest of count
2715 bz,pt %ncc, .ci_sm_exit ! if zero, then done
2716 deccc %o2
2717 bz,pt %ncc, .ci_sm_byte
2718 nop
2719 ba,pt %ncc, .ci_sm_half
2720 nop
2721
2722 .align 16
2723 nop ! instruction alignment
2724 ! see discussion at start of file
2725 .ci_med_word:
2726 btst 3, %o0 ! check for
2727 bz,pt %ncc, .ci_med_word1 ! word alignment
2728 nop
2729 .ci_med_word0:
2730 lduba [%o0]ASI_USER, %o3 ! load one byte
2731 inc %o0
2732 stb %o3,[%o1] ! store byte
2733 inc %o1
2734 btst 3, %o0
2735 bnz,pt %ncc, .ci_med_word0
2736 dec %o2
2737 !
2738 ! Now word aligned and have at least 36 bytes to move
2739 !
2740 .ci_med_word1:
2741 sub %o2, 15, %o2 ! adjust count to allow cc zero test
2742 .ci_med_wmove:
2743 lduwa [%o0]ASI_USER, %o3 ! read word
2744 subcc %o2, 16, %o2 ! reduce count by 16
2745 stw %o3, [%o1] ! write word
2746 add %o0, 4, %o0 ! advance SRC by 4
2747 lduwa [%o0]ASI_USER, %o3 ! repeat for a total for 4 words
2748 add %o0, 4, %o0 ! advance SRC by 4
2749 stw %o3, [%o1 + 4]
2750 add %o1, 16, %o1 ! advance DST by 16
2751 lduwa [%o0]ASI_USER, %o3
2752 add %o0, 4, %o0 ! advance SRC by 4
2753 stw %o3, [%o1 - 8]
2754 lduwa [%o0]ASI_USER, %o3
2755 add %o0, 4, %o0 ! advance SRC by 4
2756 bgt,pt %ncc, .ci_med_wmove ! loop til 15 or fewer bytes left
2757 stw %o3, [%o1 - 4]
2758 addcc %o2, 12, %o2 ! restore count to word offset
2759 ble,pt %ncc, .ci_med_wextra ! check for more words to move
2760 nop
2761 .ci_med_word2:
2762 lduwa [%o0]ASI_USER, %o3 ! read word
2763 subcc %o2, 4, %o2 ! reduce count by 4
2764 stw %o3, [%o1] ! write word
2765 add %o0, 4, %o0 ! advance SRC by 4
2766 bgt,pt %ncc, .ci_med_word2 ! loop til 3 or fewer bytes left
2767 add %o1, 4, %o1 ! advance DST by 4
2768 .ci_med_wextra:
2769 addcc %o2, 3, %o2 ! restore rest of count
2770 bz,pt %ncc, .ci_sm_exit ! if zero, then done
2771 deccc %o2
2772 bz,pt %ncc, .ci_sm_byte
2773 nop
2774 ba,pt %ncc, .ci_sm_half
2775 nop
2776
2777 .align 16
2778 nop ! instruction alignment
2779 ! see discussion at start of file
2780 .ci_med_half:
2781 btst 1, %o0 ! check for
2782 bz,pt %ncc, .ci_med_half1 ! half word alignment
2783 nop
2784 lduba [%o0]ASI_USER, %o3 ! load one byte
2785 inc %o0
2786 stb %o3,[%o1] ! store byte
2787 inc %o1
2788 dec %o2
2789 !
2790 ! Now half word aligned and have at least 38 bytes to move
2791 !
2792 .ci_med_half1:
2793 sub %o2, 7, %o2 ! adjust count to allow cc zero test
2794 .ci_med_hmove:
2795 lduha [%o0]ASI_USER, %o3 ! read half word
2796 subcc %o2, 8, %o2 ! reduce count by 8
2797 sth %o3, [%o1] ! write half word
2798 add %o0, 2, %o0 ! advance SRC by 2
2799 lduha [%o0]ASI_USER, %o3 ! repeat for a total for 4 halfwords
2800 add %o0, 2, %o0 ! advance SRC by 2
2801 sth %o3, [%o1 + 2]
2802 add %o1, 8, %o1 ! advance DST by 8
2803 lduha [%o0]ASI_USER, %o3
2804 add %o0, 2, %o0 ! advance SRC by 2
2805 sth %o3, [%o1 - 4]
2806 lduha [%o0]ASI_USER, %o3
2807 add %o0, 2, %o0 ! advance SRC by 2
2808 bgt,pt %ncc, .ci_med_hmove ! loop til 7 or fewer bytes left
2809 sth %o3, [%o1 - 2]
2810 addcc %o2, 7, %o2 ! restore count
2811 bz,pt %ncc, .ci_sm_exit
2812 deccc %o2
2813 bz,pt %ncc, .ci_sm_byte
2814 nop
2815 ba,pt %ncc, .ci_sm_half
2816 nop
2817
2818 .sm_copyin_err:
2819 membar #Sync
2820 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2821 mov SM_SAVE_SRC, %o0
2822 mov SM_SAVE_DST, %o1
2823 mov SM_SAVE_COUNT, %o2
2824 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler
2825 tst %o3
2826 bz,pt %ncc, 3f ! if not, return error
2827 nop
2828 ldn [%o3 + CP_COPYIN], %o5 ! if handler, invoke it with
2829 jmp %o5 ! original arguments
2830 nop
2831 3:
2832 retl
2833 or %g0, -1, %o0 ! return errno value
2834
2835 SET_SIZE(copyin)
2836
2837
2838 /*
2839 * The _more entry points are not intended to be used directly by
2840 * any caller from outside this file. They are provided to allow
2841 * profiling and dtrace of the portions of the copy code that uses
2842 * the floating point registers.
2843 * This entry is particularly important as DTRACE (at least as of
2844 * 4/2004) does not support leaf functions.
2845 */
2846
2847 ENTRY(copyin_more)
2848 .copyin_more:
2849 prefetch [%o0], #n_reads
2850 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2851 set .copyin_err, REAL_LOFAULT
2852
2853 /*
2854 * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes
2855 */
2856 .do_copyin:
2857 set copyio_fault, %l7 ! .copyio_fault is lofault val
2858
2859 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler
2860 membar #Sync ! sync error barrier
2861 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
2862
2863 mov %i0, SAVE_SRC
2864 mov %i1, SAVE_DST
2865 mov %i2, SAVE_COUNT
2866
2867 FP_NOMIGRATE(6, 7)
2868
2869 rd %fprs, %o2 ! check for unused fp
2870 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
2871 btst FPRS_FEF, %o2
2872 bz,a,pt %icc, .do_blockcopyin
2873 wr %g0, FPRS_FEF, %fprs
2874
2875 BST_FPQ2Q4_TOSTACK(%o2)
2876
2877 .do_blockcopyin:
2878 rd %gsr, %o2
2879 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr
2880 or %l6, FPUSED_FLAG, %l6
2881
2882 andcc DST, VIS_BLOCKSIZE - 1, TMP
2883 mov ASI_USER, %asi
2884 bz,pt %ncc, 2f
2885 neg TMP
2886 add TMP, VIS_BLOCKSIZE, TMP
2887
2888 ! TMP = bytes required to align DST on FP_BLOCK boundary
2889 ! Using SRC as a tmp here
2890 cmp TMP, 3
2891 bleu,pt %ncc, 1f
2892 sub CNT,TMP,CNT ! adjust main count
2893 sub TMP, 3, TMP ! adjust for end of loop test
2894 .ci_blkalign:
2895 lduba [REALSRC]%asi, SRC ! move 4 bytes per loop iteration
2896 stb SRC, [DST]
2897 subcc TMP, 4, TMP
2898 lduba [REALSRC + 1]%asi, SRC
2899 add REALSRC, 4, REALSRC
2900 stb SRC, [DST + 1]
2901 lduba [REALSRC - 2]%asi, SRC
2902 add DST, 4, DST
2903 stb SRC, [DST - 2]
2904 lduba [REALSRC - 1]%asi, SRC
2905 bgu,pt %ncc, .ci_blkalign
2906 stb SRC, [DST - 1]
2907
2908 addcc TMP, 3, TMP ! restore count adjustment
2909 bz,pt %ncc, 2f ! no bytes left?
2910 nop
2911 1: lduba [REALSRC]%asi, SRC
2912 inc REALSRC
2913 inc DST
2914 deccc TMP
2915 bgu %ncc, 1b
2916 stb SRC, [DST - 1]
2917
2918 2:
2919 membar #StoreLoad
2920 andn REALSRC, 0x7, SRC
2921
2922 ! SRC - 8-byte aligned
2923 ! DST - 64-byte aligned
2924 ldda [SRC]%asi, %f16
2925 prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #n_reads
2926 alignaddr REALSRC, %g0, %g0
2927 ldda [SRC + 0x08]%asi, %f18
2928 prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #n_reads
2929 faligndata %f16, %f18, %f48
2930 ldda [SRC + 0x10]%asi, %f20
2931 prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
2932 faligndata %f18, %f20, %f50
2933 ldda [SRC + 0x18]%asi, %f22
2934 prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read
2935 faligndata %f20, %f22, %f52
2936 ldda [SRC + 0x20]%asi, %f24
2937 prefetcha [SRC + (8 * VIS_BLOCKSIZE)]%asi, #one_read
2938 faligndata %f22, %f24, %f54
2939 ldda [SRC + 0x28]%asi, %f26
2940 prefetcha [SRC + (12 * VIS_BLOCKSIZE)]%asi, #one_read
2941 faligndata %f24, %f26, %f56
2942 ldda [SRC + 0x30]%asi, %f28
2943 prefetcha [SRC + (16 * VIS_BLOCKSIZE)]%asi, #one_read
2944 faligndata %f26, %f28, %f58
2945 ldda [SRC + 0x38]%asi, %f30
2946 ldda [SRC + VIS_BLOCKSIZE]%asi, %f16
2947 sub CNT, VIS_BLOCKSIZE, CNT
2948 add SRC, VIS_BLOCKSIZE, SRC
2949 prefetcha [SRC + (19 * VIS_BLOCKSIZE)]%asi, #one_read
2950 add REALSRC, VIS_BLOCKSIZE, REALSRC
2951 ba,pt %ncc, 1f
2952 prefetcha [SRC + (23 * VIS_BLOCKSIZE)]%asi, #one_read
2953 .align 32
2954 1:
2955 ldda [SRC + 0x08]%asi, %f18
2956 faligndata %f28, %f30, %f60
2957 ldda [SRC + 0x10]%asi, %f20
2958 faligndata %f30, %f16, %f62
2959 stda %f48, [DST]ASI_BLK_P
2960 ldda [SRC + 0x18]%asi, %f22
2961 faligndata %f16, %f18, %f48
2962 ldda [SRC + 0x20]%asi, %f24
2963 faligndata %f18, %f20, %f50
2964 ldda [SRC + 0x28]%asi, %f26
2965 faligndata %f20, %f22, %f52
2966 ldda [SRC + 0x30]%asi, %f28
2967 faligndata %f22, %f24, %f54
2968 sub CNT, VIS_BLOCKSIZE, CNT
2969 ldda [SRC + 0x38]%asi, %f30
2970 faligndata %f24, %f26, %f56
2971 add DST, VIS_BLOCKSIZE, DST
2972 ldda [SRC + VIS_BLOCKSIZE]%asi, %f16
2973 faligndata %f26, %f28, %f58
2974 add REALSRC, VIS_BLOCKSIZE, REALSRC
2975 prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
2976 add SRC, VIS_BLOCKSIZE, SRC
2977 prefetcha [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
2978 cmp CNT, VIS_BLOCKSIZE + 8
2979 bgu,pt %ncc, 1b
2980 prefetcha [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
2981
2982 ! only if REALSRC & 0x7 is 0
2983 cmp CNT, VIS_BLOCKSIZE
2984 bne %ncc, 3f
2985 andcc REALSRC, 0x7, %g0
2986 bz,pt %ncc, 2f
2987 nop
2988 3:
2989 faligndata %f28, %f30, %f60
2990 faligndata %f30, %f16, %f62
2991 stda %f48, [DST]ASI_BLK_P
2992 add DST, VIS_BLOCKSIZE, DST
2993 ba,pt %ncc, 3f
2994 nop
2995 2:
2996 ldda [SRC + 0x08]%asi, %f18
2997 fsrc1 %f28, %f60
2998 ldda [SRC + 0x10]%asi, %f20
2999 fsrc1 %f30, %f62
3000 stda %f48, [DST]ASI_BLK_P
3001 ldda [SRC + 0x18]%asi, %f22
3002 fsrc1 %f16, %f48
3003 ldda [SRC + 0x20]%asi, %f24
3004 fsrc1 %f18, %f50
3005 ldda [SRC + 0x28]%asi, %f26
3006 fsrc1 %f20, %f52
3007 ldda [SRC + 0x30]%asi, %f28
3008 fsrc1 %f22, %f54
3009 ldda [SRC + 0x38]%asi, %f30
3010 fsrc1 %f24, %f56
3011 sub CNT, VIS_BLOCKSIZE, CNT
3012 add DST, VIS_BLOCKSIZE, DST
3013 add SRC, VIS_BLOCKSIZE, SRC
3014 add REALSRC, VIS_BLOCKSIZE, REALSRC
3015 fsrc1 %f26, %f58
3016 fsrc1 %f28, %f60
3017 fsrc1 %f30, %f62
3018 stda %f48, [DST]ASI_BLK_P
3019 add DST, VIS_BLOCKSIZE, DST
3020 ba,a,pt %ncc, 4f
3021 nop
3022
3023 3: tst CNT
3024 bz,a %ncc, 4f
3025 nop
3026
3027 5: lduba [REALSRC]ASI_USER, TMP
3028 inc REALSRC
3029 inc DST
3030 deccc CNT
3031 bgu %ncc, 5b
3032 stb TMP, [DST - 1]
3033 4:
3034
3035 .copyin_exit:
3036 membar #Sync
3037
3038 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr
3039 wr %o2, 0, %gsr
3040
3041 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
3042 btst FPRS_FEF, %o3
3043 bz,pt %icc, 4f
3044 nop
3045
3046 BLD_FPQ2Q4_FROMSTACK(%o2)
3047
3048 ba,pt %ncc, 1f
3049 wr %o3, 0, %fprs ! restore fprs
3050
3051 4:
3052 FZEROQ2Q4
3053 wr %o3, 0, %fprs ! restore fprs
3054
3055 1:
3056 membar #Sync ! sync error barrier
3057 andn %l6, FPUSED_FLAG, %l6
3058 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3059 FP_ALLOWMIGRATE(5, 6)
3060 ret
3061 restore %g0, 0, %o0
3062 /*
3063 * We got here because of a fault during copyin
3064 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
3065 */
3066 .copyin_err:
3067 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
3068 tst %o4
3069 bz,pt %ncc, 2f ! if not, return error
3070 nop
3071 ldn [%o4 + CP_COPYIN], %g2 ! if handler, invoke it with
3072 jmp %g2 ! original arguments
3073 restore %g0, 0, %g0 ! dispose of copy window
3074 2:
3075 ret
3076 restore %g0, -1, %o0 ! return error value
3077
3078
3079 SET_SIZE(copyin_more)
3080
3081 ENTRY(xcopyin)
3082
3083 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
3084 bleu,pt %ncc, .xcopyin_small ! go to larger cases
3085 xor %o0, %o1, %o3 ! are src, dst alignable?
3086 btst 7, %o3 !
3087 bz,pt %ncc, .xcopyin_8 ! check for longword alignment
3088 nop
3089 btst 1, %o3 !
3090 bz,pt %ncc, .xcopyin_2 ! check for half-word
3091 nop
3092 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
3093 ld [%o3 + %lo(hw_copy_limit_1)], %o3
3094 tst %o3
3095 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
3096 cmp %o2, %o3 ! if length <= limit
3097 bleu,pt %ncc, .xcopyin_small ! go to small copy
3098 nop
3099 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
3100 nop
3101 .xcopyin_2:
3102 btst 3, %o3 !
3103 bz,pt %ncc, .xcopyin_4 ! check for word alignment
3104 nop
3105 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
3106 ld [%o3 + %lo(hw_copy_limit_2)], %o3
3107 tst %o3
3108 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
3109 cmp %o2, %o3 ! if length <= limit
3110 bleu,pt %ncc, .xcopyin_small ! go to small copy
3111 nop
3112 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
3113 nop
3114 .xcopyin_4:
3115 ! already checked longword, must be word aligned
3116 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
3117 ld [%o3 + %lo(hw_copy_limit_4)], %o3
3118 tst %o3
3119 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
3120 cmp %o2, %o3 ! if length <= limit
3121 bleu,pt %ncc, .xcopyin_small ! go to small copy
3122 nop
3123 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
3124 nop
3125 .xcopyin_8:
3126 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
3127 ld [%o3 + %lo(hw_copy_limit_8)], %o3
3128 tst %o3
3129 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
3130 cmp %o2, %o3 ! if length <= limit
3131 bleu,pt %ncc, .xcopyin_small ! go to small copy
3132 nop
3133 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
3134 nop
3135
3136 .xcopyin_small:
3137 sethi %hi(.sm_xcopyin_err), %o5 ! .sm_xcopyin_err is lofault value
3138 or %o5, %lo(.sm_xcopyin_err), %o5
3139 ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofaul
3140 membar #Sync ! sync error barrier
3141 ba,pt %ncc, .sm_do_copyin ! common code
3142 stn %o5, [THREAD_REG + T_LOFAULT]
3143
3144 .xcopyin_more:
3145 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3146 sethi %hi(.xcopyin_err), REAL_LOFAULT ! .xcopyin_err is lofault value
3147 ba,pt %ncc, .do_copyin
3148 or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
3149
3150 /*
3151 * We got here because of fault during xcopyin
3152 * Errno value is in ERRNO
3153 */
3154 .xcopyin_err:
3155 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
3156 tst %o4
3157 bz,pt %ncc, 2f ! if not, return error
3158 nop
3159 ldn [%o4 + CP_XCOPYIN], %g2 ! if handler, invoke it with
3160 jmp %g2 ! original arguments
3161 restore %g0, 0, %g0 ! dispose of copy window
3162 2:
3163 ret
3164 restore ERRNO, 0, %o0 ! return errno value
3165
3166 .sm_xcopyin_err:
3167
3168 membar #Sync
3169 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3170 mov SM_SAVE_SRC, %o0
3171 mov SM_SAVE_DST, %o1
3172 mov SM_SAVE_COUNT, %o2
3173 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler
3174 tst %o3
3175 bz,pt %ncc, 3f ! if not, return error
3176 nop
3177 ldn [%o3 + CP_XCOPYIN], %o5 ! if handler, invoke it with
3178 jmp %o5 ! original arguments
3179 nop
3180 3:
3181 retl
3182 or %g1, 0, %o0 ! return errno value
3183
3184 SET_SIZE(xcopyin)
3185
3186 ENTRY(xcopyin_little)
3187 sethi %hi(.xcopyio_err), %o5
3188 or %o5, %lo(.xcopyio_err), %o5
3189 ldn [THREAD_REG + T_LOFAULT], %o4
3190 membar #Sync ! sync error barrier
3191 stn %o5, [THREAD_REG + T_LOFAULT]
3192 mov %o4, %o5
3193
3194 subcc %g0, %o2, %o3
3195 add %o0, %o2, %o0
3196 bz,pn %ncc, 2f ! check for zero bytes
3197 sub %o2, 1, %o4
3198 add %o0, %o4, %o0 ! start w/last byte
3199 add %o1, %o2, %o1
3200 lduba [%o0 + %o3]ASI_AIUSL, %o4
3201
3202 1: stb %o4, [%o1 + %o3]
3203 inccc %o3
3204 sub %o0, 2, %o0 ! get next byte
3205 bcc,a,pt %ncc, 1b
3206 lduba [%o0 + %o3]ASI_AIUSL, %o4
3207
3208 2:
3209 membar #Sync ! sync error barrier
3210 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3211 retl
3212 mov %g0, %o0 ! return (0)
3213
3214 .xcopyio_err:
3215 membar #Sync ! sync error barrier
3216 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3217 retl
3218 mov %g1, %o0
3219
3220 SET_SIZE(xcopyin_little)
3221
3222
3223 /*
3224 * Copy a block of storage - must not overlap (from + len <= to).
3225 * No fault handler installed (to be called under on_fault())
3226 */
3227 ENTRY(copyin_noerr)
3228
3229 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
3230 bleu,pt %ncc, .copyin_ne_small ! go to larger cases
3231 xor %o0, %o1, %o3 ! are src, dst alignable?
3232 btst 7, %o3 !
3233 bz,pt %ncc, .copyin_ne_8 ! check for longword alignment
3234 nop
3235 btst 1, %o3 !
3236 bz,pt %ncc, .copyin_ne_2 ! check for half-word
3237 nop
3238 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
3239 ld [%o3 + %lo(hw_copy_limit_1)], %o3
3240 tst %o3
3241 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
3242 cmp %o2, %o3 ! if length <= limit
3243 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3244 nop
3245 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
3246 nop
3247 .copyin_ne_2:
3248 btst 3, %o3 !
3249 bz,pt %ncc, .copyin_ne_4 ! check for word alignment
3250 nop
3251 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
3252 ld [%o3 + %lo(hw_copy_limit_2)], %o3
3253 tst %o3
3254 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
3255 cmp %o2, %o3 ! if length <= limit
3256 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3257 nop
3258 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
3259 nop
3260 .copyin_ne_4:
3261 ! already checked longword, must be word aligned
3262 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
3263 ld [%o3 + %lo(hw_copy_limit_4)], %o3
3264 tst %o3
3265 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
3266 cmp %o2, %o3 ! if length <= limit
3267 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3268 nop
3269 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
3270 nop
3271 .copyin_ne_8:
3272 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
3273 ld [%o3 + %lo(hw_copy_limit_8)], %o3
3274 tst %o3
3275 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
3276 cmp %o2, %o3 ! if length <= limit
3277 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3278 nop
3279 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
3280 nop
3281
3282 .copyin_ne_small:
3283 ldn [THREAD_REG + T_LOFAULT], %o4
3284 tst %o4
3285 bz,pn %ncc, .sm_do_copyin
3286 nop
3287 sethi %hi(.sm_copyio_noerr), %o5
3288 or %o5, %lo(.sm_copyio_noerr), %o5
3289 membar #Sync ! sync error barrier
3290 ba,pt %ncc, .sm_do_copyin
3291 stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault
3292
3293 .copyin_noerr_more:
3294 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3295 sethi %hi(.copyio_noerr), REAL_LOFAULT
3296 ba,pt %ncc, .do_copyin
3297 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3298
3299 .copyio_noerr:
3300 jmp %l6
3301 restore %g0,0,%g0
3302
3303 .sm_copyio_noerr:
3304 membar #Sync
3305 stn %o4, [THREAD_REG + T_LOFAULT] ! restore t_lofault
3306 jmp %o4
3307 nop
3308
3309 SET_SIZE(copyin_noerr)
3310
3311 /*
3312 * Copy a block of storage - must not overlap (from + len <= to).
3313 * No fault handler installed (to be called under on_fault())
3314 */
3315
3316 ENTRY(copyout_noerr)
3317
3318 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
3319 bleu,pt %ncc, .copyout_ne_small ! go to larger cases
3320 xor %o0, %o1, %o3 ! are src, dst alignable?
3321 btst 7, %o3 !
3322 bz,pt %ncc, .copyout_ne_8 ! check for longword alignment
3323 nop
3324 btst 1, %o3 !
3325 bz,pt %ncc, .copyout_ne_2 ! check for half-word
3326 nop
3327 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
3328 ld [%o3 + %lo(hw_copy_limit_1)], %o3
3329 tst %o3
3330 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
3331 cmp %o2, %o3 ! if length <= limit
3332 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3333 nop
3334 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
3335 nop
3336 .copyout_ne_2:
3337 btst 3, %o3 !
3338 bz,pt %ncc, .copyout_ne_4 ! check for word alignment
3339 nop
3340 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
3341 ld [%o3 + %lo(hw_copy_limit_2)], %o3
3342 tst %o3
3343 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
3344 cmp %o2, %o3 ! if length <= limit
3345 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3346 nop
3347 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
3348 nop
3349 .copyout_ne_4:
3350 ! already checked longword, must be word aligned
3351 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
3352 ld [%o3 + %lo(hw_copy_limit_4)], %o3
3353 tst %o3
3354 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
3355 cmp %o2, %o3 ! if length <= limit
3356 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3357 nop
3358 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
3359 nop
3360 .copyout_ne_8:
3361 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
3362 ld [%o3 + %lo(hw_copy_limit_8)], %o3
3363 tst %o3
3364 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
3365 cmp %o2, %o3 ! if length <= limit
3366 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3367 nop
3368 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
3369 nop
3370
3371 .copyout_ne_small:
3372 ldn [THREAD_REG + T_LOFAULT], %o4
3373 tst %o4
3374 bz,pn %ncc, .sm_do_copyout
3375 nop
3376 sethi %hi(.sm_copyio_noerr), %o5
3377 or %o5, %lo(.sm_copyio_noerr), %o5
3378 membar #Sync ! sync error barrier
3379 ba,pt %ncc, .sm_do_copyout
3380 stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault
3381
3382 .copyout_noerr_more:
3383 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3384 sethi %hi(.copyio_noerr), REAL_LOFAULT
3385 ba,pt %ncc, .do_copyout
3386 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3387
3388 SET_SIZE(copyout_noerr)
3389
3390
3391 /*
3392 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
3393 * longer than 256 bytes in length using spitfire's block stores. If
3394 * the criteria for using this routine are not met then it calls bzero
3395 * and returns 1. Otherwise 0 is returned indicating success.
3396 * Caller is responsible for ensuring use_hw_bzero is true and that
3397 * kpreempt_disable() has been called.
3398 */
3399 ! %i0 - start address
3400 ! %i1 - length of region (multiple of 64)
3401 ! %l0 - saved fprs
3402 ! %l1 - pointer to saved %d0 block
3403 ! %l2 - saved curthread->t_lwp
3404
3405 ENTRY(hwblkclr)
3406 ! get another window w/space for one aligned block of saved fpregs
3407 save %sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp
3408
3409 ! Must be block-aligned
3410 andcc %i0, (VIS_BLOCKSIZE-1), %g0
3411 bnz,pn %ncc, 1f
3412 nop
3413
3414 ! ... and must be 256 bytes or more
3415 cmp %i1, 256
3416 blu,pn %ncc, 1f
3417 nop
3418
3419 ! ... and length must be a multiple of VIS_BLOCKSIZE
3420 andcc %i1, (VIS_BLOCKSIZE-1), %g0
3421 bz,pn %ncc, 2f
3422 nop
3423
3424 1: ! punt, call bzero but notify the caller that bzero was used
3425 mov %i0, %o0
3426 call bzero
3427 mov %i1, %o1
3428 ret
3429 restore %g0, 1, %o0 ! return (1) - did not use block operations
3430
3431 2: rd %fprs, %l0 ! check for unused fp
3432 btst FPRS_FEF, %l0
3433 bz,pt %icc, 1f
3434 nop
3435
3436 ! save in-use fpregs on stack
3437 membar #Sync
3438 add %fp, STACK_BIAS - 65, %l1
3439 and %l1, -VIS_BLOCKSIZE, %l1
3440 stda %d0, [%l1]ASI_BLK_P
3441
3442 1: membar #StoreStore|#StoreLoad|#LoadStore
3443 wr %g0, FPRS_FEF, %fprs
3444 wr %g0, ASI_BLK_P, %asi
3445
3446 ! Clear block
3447 fzero %d0
3448 fzero %d2
3449 fzero %d4
3450 fzero %d6
3451 fzero %d8
3452 fzero %d10
3453 fzero %d12
3454 fzero %d14
3455
3456 mov 256, %i3
3457 ba,pt %ncc, .pz_doblock
3458 nop
3459
3460 .pz_blkstart:
3461 ! stda %d0, [%i0 + 192]%asi ! in dly slot of branch that got us here
3462 stda %d0, [%i0 + 128]%asi
3463 stda %d0, [%i0 + 64]%asi
3464 stda %d0, [%i0]%asi
3465 .pz_zinst:
3466 add %i0, %i3, %i0
3467 sub %i1, %i3, %i1
3468 .pz_doblock:
3469 cmp %i1, 256
3470 bgeu,a %ncc, .pz_blkstart
3471 stda %d0, [%i0 + 192]%asi
3472
3473 cmp %i1, 64
3474 blu %ncc, .pz_finish
3475
3476 andn %i1, (64-1), %i3
3477 srl %i3, 4, %i2 ! using blocks, 1 instr / 16 words
3478 set .pz_zinst, %i4
3479 sub %i4, %i2, %i4
3480 jmp %i4
3481 nop
3482
3483 .pz_finish:
3484 membar #Sync
3485 btst FPRS_FEF, %l0
3486 bz,a .pz_finished
3487 wr %l0, 0, %fprs ! restore fprs
3488
3489 ! restore fpregs from stack
3490 ldda [%l1]ASI_BLK_P, %d0
3491 membar #Sync
3492 wr %l0, 0, %fprs ! restore fprs
3493
3494 .pz_finished:
3495 ret
3496 restore %g0, 0, %o0 ! return (bzero or not)
3497
3498 SET_SIZE(hwblkclr)
3499
3500 /*
3501 * Copy 32 bytes of data from src (%o0) to dst (%o1)
3502 * using physical addresses.
3503 */
3504 ENTRY_NP(hw_pa_bcopy32)
3505 rdpr %pstate, %g1
3506 andn %g1, PSTATE_IE, %g2
3507 wrpr %g0, %g2, %pstate
3508
3509 rdpr %pstate, %g0
3510 ldxa [%o0]ASI_MEM, %o2
3511 add %o0, 8, %o0
3512 ldxa [%o0]ASI_MEM, %o3
3513 add %o0, 8, %o0
3514 ldxa [%o0]ASI_MEM, %o4
3515 add %o0, 8, %o0
3516 ldxa [%o0]ASI_MEM, %o5
3517 membar #Sync
3518
3519 stxa %o2, [%o1]ASI_MEM
3520 add %o1, 8, %o1
3521 stxa %o3, [%o1]ASI_MEM
3522 add %o1, 8, %o1
3523 stxa %o4, [%o1]ASI_MEM
3524 add %o1, 8, %o1
3525 stxa %o5, [%o1]ASI_MEM
3526
3527 retl
3528 wrpr %g0, %g1, %pstate
3529
3530 SET_SIZE(hw_pa_bcopy32)
3531
3532 DGDEF(use_hw_bcopy)
3533 .word 1
3534 DGDEF(use_hw_bzero)
3535 .word 1
3536 DGDEF(hw_copy_limit_1)
3537 .word 0
3538 DGDEF(hw_copy_limit_2)
3539 .word 0
3540 DGDEF(hw_copy_limit_4)
3541 .word 0
3542 DGDEF(hw_copy_limit_8)
3543 .word 0
3544
3545 .align 64
3546 .section ".text"