1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 #include <sys/param.h>
28 #include <sys/errno.h>
29 #include <sys/asm_linkage.h>
30 #include <sys/vtrace.h>
31 #include <sys/machthread.h>
32 #include <sys/clock.h>
33 #include <sys/asi.h>
34 #include <sys/fsr.h>
35 #include <sys/privregs.h>
36 #include <sys/fpras_impl.h>
37
38 #include "assym.h"
39
40 /*
41 * Pseudo-code to aid in understanding the control flow of the
42 * bcopy/copyin/copyout routines.
43 *
44 * On entry:
45 *
46 * ! Determine whether to use the FP register version
47 * ! or the leaf routine version depending on size
48 * ! of copy and flags. Set up error handling accordingly.
49 * ! The transition point depends on whether the src and
50 * ! dst addresses can be aligned to long word, word,
51 * ! half word, or byte boundaries.
52 * !
53 * ! WARNING: <Register usage convention>
54 * ! For FP version, %l6 holds previous error handling and
55 * ! a flag: TRAMP_FLAG (low bits)
56 * ! for leaf routine version, %o4 holds those values.
57 * ! So either %l6 or %o4 is reserved and not available for
58 * ! any other use.
59 *
60 * if (length <= VIS_COPY_THRESHOLD) ! start with a quick test
61 * go to small_copy; ! to speed short copies
62 *
 * if (src,dst long word alignable) {	! src, dst long word alignable
64 * if (hw_copy_limit_8 == 0) ! hw_copy disabled
65 * go to small_copy;
66 * if (length <= hw_copy_limit_8)
67 * go to small_copy;
68 * go to FPBLK_copy;
69 * }
70 * if (src,dst not alignable) {
71 * if (hw_copy_limit_1 == 0) ! hw_copy disabled
72 * go to small_copy;
73 * if (length <= hw_copy_limit_1)
74 * go to small_copy;
75 * go to FPBLK_copy;
76 * }
77 * if (src,dst halfword alignable) {
78 * if (hw_copy_limit_2 == 0) ! hw_copy disabled
79 * go to small_copy;
80 * if (length <= hw_copy_limit_2)
81 * go to small_copy;
82 * go to FPBLK_copy;
83 * }
84 * if (src,dst word alignable) {
85 * if (hw_copy_limit_4 == 0) ! hw_copy disabled
86 * go to small_copy;
87 * if (length <= hw_copy_limit_4)
88 * go to small_copy;
89 * go to FPBLK_copy;
90 * }
91 *
92 * small_copy:
93 * Setup_leaf_rtn_error_handler; ! diffs for each entry point
94 *
95 * if (count <= 3) ! fast path for tiny copies
96 * go to sm_left; ! special finish up code
97 * else
98 * if (count > CHKSIZE) ! medium sized copies
99 * go to sm_med ! tuned by alignment
100 * if(src&dst not both word aligned) {
101 * sm_movebytes:
102 * move byte by byte in 4-way unrolled loop
103 * fall into sm_left;
104 * sm_left:
105 * move 0-3 bytes byte at a time as needed.
106 * restore error handler and exit.
107 *
108 * } else { ! src&dst are word aligned
109 * check for at least 8 bytes left,
110 * move word at a time, unrolled by 2
111 * when fewer than 8 bytes left,
112 * sm_half: move half word at a time while 2 or more bytes left
113 * sm_byte: move final byte if necessary
114 * sm_exit:
115 * restore error handler and exit.
116 * }
117 *
118 * ! Medium length cases with at least CHKSIZE bytes available
119 * ! method: line up src and dst as best possible, then
120 * ! move data in 4-way unrolled loops.
121 *
122 * sm_med:
123 * if(src&dst unalignable)
124 * go to sm_movebytes
125 * if(src&dst halfword alignable)
126 * go to sm_movehalf
127 * if(src&dst word alignable)
128 * go to sm_moveword
129 * ! fall into long word movement
130 * move bytes until src is word aligned
131 * if not long word aligned, move a word
132 * move long words in 4-way unrolled loop until < 32 bytes left
133 * move long words in 1-way unrolled loop until < 8 bytes left
134 * if zero bytes left, goto sm_exit
135 * if one byte left, go to sm_byte
136 * else go to sm_half
137 *
138 * sm_moveword:
139 * move bytes until src is word aligned
140 * move words in 4-way unrolled loop until < 16 bytes left
141 * move words in 1-way unrolled loop until < 4 bytes left
142 * if zero bytes left, goto sm_exit
143 * if one byte left, go to sm_byte
144 * else go to sm_half
145 *
146 * sm_movehalf:
147 * move a byte if needed to align src on halfword
148 * move halfwords in 4-way unrolled loop until < 8 bytes left
149 * if zero bytes left, goto sm_exit
150 * if one byte left, go to sm_byte
151 * else go to sm_half
152 *
153 *
154 * FPBLK_copy:
155 * %l6 = curthread->t_lofault;
156 * if (%l6 != NULL) {
157 * membar #Sync
158 * curthread->t_lofault = .copyerr;
159 * caller_error_handler = TRUE ! %l6 |= 2
160 * }
161 *
162 * ! for FPU testing we must not migrate cpus
163 * if (curthread->t_lwp == NULL) {
164 * ! Kernel threads do not have pcb's in which to store
165 * ! the floating point state, so disallow preemption during
166 * ! the copy. This also prevents cpu migration.
167 * kpreempt_disable(curthread);
168 * } else {
169 * thread_nomigrate();
170 * }
171 *
172 * old_fprs = %fprs;
173 * old_gsr = %gsr;
174 * if (%fprs.fef) {
175 * %fprs.fef = 1;
176 * save current fpregs on stack using blockstore
177 * } else {
178 * %fprs.fef = 1;
179 * }
180 *
181 *
182 * do_blockcopy_here;
183 *
184 * In lofault handler:
185 * curthread->t_lofault = .copyerr2;
186 * Continue on with the normal exit handler
187 *
188 * On normal exit:
189 * %gsr = old_gsr;
190 * if (old_fprs & FPRS_FEF)
191 * restore fpregs from stack using blockload
192 * else
193 * zero fpregs
194 * %fprs = old_fprs;
195 * membar #Sync
196 * curthread->t_lofault = (%l6 & ~3);
197 * ! following test omitted from copyin/copyout as they
198 * ! will always have a current thread
199 * if (curthread->t_lwp == NULL)
200 * kpreempt_enable(curthread);
201 * else
202 * thread_allowmigrate();
203 * return (0)
204 *
205 * In second lofault handler (.copyerr2):
206 * We've tried to restore fp state from the stack and failed. To
207 * prevent from returning with a corrupted fp state, we will panic.
208 */
209
210 /*
211 * Comments about optimization choices
212 *
213 * The initial optimization decision in this code is to determine
214 * whether to use the FP registers for a copy or not. If we don't
215 * use the FP registers, we can execute the copy as a leaf routine,
216 * saving a register save and restore. Also, less elaborate setup
217 * is required, allowing short copies to be completed more quickly.
218 * For longer copies, especially unaligned ones (where the src and
219 * dst do not align to allow simple ldx,stx operation), the FP
220 * registers allow much faster copy operations.
221 *
222 * The estimated extra cost of the FP path will vary depending on
223 * src/dst alignment, dst offset from the next 64 byte FPblock store
224 * boundary, remaining src data after the last full dst cache line is
 * moved, whether the FP registers need to be saved, and some other
226 * minor issues. The average additional overhead is estimated to be
227 * 400 clocks. Since each non-repeated/predicted tst and branch costs
 * around 10 clocks, elaborate calculation would slow down all
229 * longer copies and only benefit a small portion of medium sized
230 * copies. Rather than incur such cost, we chose fixed transition
231 * points for each of the alignment choices.
232 *
233 * For the inner loop, here is a comparison of the per cache line
234 * costs for each alignment when src&dst are in cache:
235 *
236 * byte aligned: 108 clocks slower for non-FPBLK
237 * half aligned: 44 clocks slower for non-FPBLK
238 * word aligned: 12 clocks slower for non-FPBLK
239 * long aligned: 4 clocks >>faster<< for non-FPBLK
240 *
241 * The long aligned loop runs faster because it does no prefetching.
242 * That wins if the data is not in cache or there is too little
243 * data to gain much benefit from prefetching. But when there
244 * is more data and that data is not in cache, failing to prefetch
245 * can run much slower. In addition, there is a 2 Kbyte store queue
246 * which will cause the non-FPBLK inner loop to slow for larger copies.
247 * The exact tradeoff is strongly load and application dependent, with
248 * increasing risk of a customer visible performance regression if the
249 * non-FPBLK code is used for larger copies. Studies of synthetic in-cache
250 * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
251 * upper limit for the non-FPBLK code. To minimize performance regression
252 * risk while still gaining the primary benefits of the improvements to
253 * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
254 * hw_copy_limit_*. Later experimental studies using different values
255 * of hw_copy_limit_* can be used to make further adjustments if
256 * appropriate.
257 *
258 * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
259 * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
260 * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
261 * hw_copy_limit_8 = src and dst are longword aligned
262 *
263 * To say that src and dst are word aligned means that after
264 * some initial alignment activity of moving 0 to 3 bytes,
265 * both the src and dst will be on word boundaries so that
266 * word loads and stores may be used.
267 *
268 * Recommended initial values as of Mar 2004, includes testing
269 * on Cheetah+ (900MHz), Cheetah++ (1200MHz), and Jaguar(1050MHz):
270 * hw_copy_limit_1 = 256
271 * hw_copy_limit_2 = 512
272 * hw_copy_limit_4 = 1024
273 * hw_copy_limit_8 = 1024 (or 1536 on some systems)
274 *
275 *
276 * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
277 * disabled for that alignment choice.
278 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256)
279 * the value of VIS_COPY_THRESHOLD is used.
280 * It is not envisioned that hw_copy_limit_? will be changed in the field
281 * It is provided to allow for disabling FPBLK copies and to allow
282 * easy testing of alternate values on future HW implementations
283 * that might have different cache sizes, clock rates or instruction
284 * timing rules.
285 *
286 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
287 * threshold to speedup all shorter copies (less than 256). That
288 * saves an alignment test, memory reference, and enabling test
289 * for all short copies, or an estimated 24 clocks.
290 *
291 * The order in which these limits are checked does matter since each
292 * non-predicted tst and branch costs around 10 clocks.
293 * If src and dst are randomly selected addresses,
294 * 4 of 8 will not be alignable.
295 * 2 of 8 will be half word alignable.
296 * 1 of 8 will be word alignable.
297 * 1 of 8 will be long word alignable.
298 * But, tests on running kernels show that src and dst to copy code
299 * are typically not on random alignments. Structure copies and
300 * copies of larger data sizes are often on long word boundaries.
301 * So we test the long word alignment case first, then
302 * the byte alignment, then halfword, then word alignment.
303 *
304 * Several times, tests for length are made to split the code
305 * into subcases. These tests often allow later tests to be
306 * avoided. For example, within the non-FPBLK copy, we first
307 * check for tiny copies of 3 bytes or less. That allows us
308 * to use a 4-way unrolled loop for the general byte copy case
309 * without a test on loop entry.
310 * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
311 * vs longer cases. For the really short case, we don't attempt
312 * align src and dst. We try to minimize special case tests in
313 * the shortest loops as each test adds a significant percentage
314 * to the total time.
315 *
316 * For the medium sized cases, we allow ourselves to adjust the
317 * src and dst alignment and provide special cases for each of
318 * the four adjusted alignment cases. The CHKSIZE that was used
319 * to decide between short and medium size was chosen to be 39
320 * as that allows for the worst case of 7 bytes of alignment
321 * shift and 4 times 8 bytes for the first long word unrolling.
322 * That knowledge saves an initial test for length on entry into
323 * the medium cases. If the general loop unrolling factor were
 * to be increased, this number would also need to be adjusted.
325 *
326 * For all cases in the non-FPBLK code where it is known that at
327 * least 4 chunks of data are available for movement, the
328 * loop is unrolled by four. This 4-way loop runs in 8 clocks
329 * or 2 clocks per data element. Due to limitations of the
330 * branch instruction on Cheetah, Jaguar, and Panther, the
331 * minimum time for a small, tight loop is 3 clocks. So
332 * the 4-way loop runs 50% faster than the fastest non-unrolled
333 * loop.
334 *
 * Instruction alignment is forced by use of .align 16 directives
336 * and nops which are not executed in the code. This
337 * combination of operations shifts the alignment of following
338 * loops to insure that loops are aligned so that their instructions
339 * fall within the minimum number of 4 instruction fetch groups.
340 * If instructions are inserted or removed between the .align
341 * instruction and the unrolled loops, then the alignment needs
342 * to be readjusted. Misaligned loops can add a clock per loop
343 * iteration to the loop timing.
344 *
345 * In a few cases, code is duplicated to avoid a branch. Since
346 * a non-predicted tst and branch takes 10 clocks, this savings
347 * is judged an appropriate time-space tradeoff.
348 *
349 * Within the FPBLK-code, the prefetch method in the inner
350 * loop needs to be explained as it is not standard. Two
351 * prefetches are issued for each cache line instead of one.
352 * The primary one is at the maximum reach of 8 cache lines.
353 * Most of the time, that maximum prefetch reach gives the
354 * cache line more time to reach the processor for systems with
355 * higher processor clocks. But, sometimes memory interference
356 * can cause that prefetch to be dropped. Putting a second
357 * prefetch at a reach of 5 cache lines catches the drops
358 * three iterations later and shows a measured improvement
359 * in performance over any similar loop with a single prefetch.
360 * The prefetches are placed in the loop so they overlap with
361 * non-memory instructions, so that there is no extra cost
362 * when the data is already in-cache.
363 *
364 */
365
366 /*
367 * Notes on preserving existing fp state and on membars.
368 *
369 * When a copyOP decides to use fp we may have to preserve existing
370 * floating point state. It is not the caller's state that we need to
371 * preserve - the rest of the kernel does not use fp and, anyway, fp
372 * registers are volatile across a call. Some examples:
373 *
374 * - userland has fp state and is interrupted (device interrupt
375 * or trap) and within the interrupt/trap handling we use
376 * bcopy()
377 * - another (higher level) interrupt or trap handler uses bcopy
378 * while a bcopy from an earlier interrupt is still active
379 * - an asynchronous error trap occurs while fp state exists (in
380 * userland or in kernel copy) and the tl0 component of the handling
381 * uses bcopy
382 * - a user process with fp state incurs a copy-on-write fault and
383 * hwblkpagecopy always uses fp
384 *
385 * We therefore need a per-call place in which to preserve fp state -
386 * using our stack is ideal (and since fp copy cannot be leaf optimized
387 * because of calls it makes, this is no hardship).
388 *
389 * The following membar BLD/BST discussion is Cheetah pipeline specific.
390 * In Cheetah BLD is blocking, #LoadLoad/#LoadStore/#StoreStore are
391 * nops (those semantics always apply) and #StoreLoad is implemented
392 * as a membar #Sync.
393 *
394 * It is possible that the owner of the fp state has a block load or
395 * block store still "in flight" at the time we come to preserve that
396 * state. Block loads are blocking in Cheetah pipelines so we do not
397 * need to sync with them. In preserving fp regs we will use block stores
398 * (which are not blocking in Cheetah pipelines) so we require a membar #Sync
399 * after storing state (so that our subsequent use of those registers
400 * does not modify them before the block stores complete); this membar
401 * also serves to sync with block stores the owner of the fp state has
402 * initiated.
403 *
 * When we have finished fp copy (with its repeated block stores)
405 * we must membar #Sync so that our block stores may complete before
406 * we either restore the original fp state into the fp registers or
407 * return to a caller which may initiate other fp operations that could
408 * modify the fp regs we used before the block stores complete.
409 *
410 * Synchronous faults (eg, unresolvable DMMU miss) that occur while
411 * t_lofault is not NULL will not panic but will instead trampoline
412 * to the registered lofault handler. There is no need for any
413 * membars for these - eg, our store to t_lofault will always be visible to
414 * ourselves and it is our cpu which will take any trap.
415 *
416 * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
417 * while t_lofault is not NULL will also not panic. Since we're copying
418 * to or from userland the extent of the damage is known - the destination
419 * buffer is incomplete. So trap handlers will trampoline to the lofault
420 * handler in this case which should take some form of error action to
421 * avoid using the incomplete buffer. The trap handler also flags the
422 * fault so that later return-from-trap handling (for the trap that brought
423 * this thread into the kernel in the first place) can notify the process
424 * and reboot the system (or restart the service with Greenline/Contracts).
425 *
426 * Asynchronous faults (eg, uncorrectable ECC error from memory) can
427 * result in deferred error traps - the trap is taken sometime after
428 * the event and the trap PC may not be the PC of the faulting access.
429 * Delivery of such pending traps can be forced by a membar #Sync, acting
430 * as an "error barrier" in this role. To accurately apply the user/kernel
431 * separation described in the preceding paragraph we must force delivery
432 * of deferred traps affecting kernel state before we install a lofault
433 * handler (if we interpose a new lofault handler on an existing one there
434 * is no need to repeat this), and we must force delivery of deferred
435 * errors affecting the lofault-protected region before we clear t_lofault.
436 * Failure to do so results in lost kernel state being interpreted as
437 * affecting a copyin/copyout only, or of an error that really only
438 * affects copy data being interpreted as losing kernel state.
439 *
440 * Since the copy operations may preserve and later restore floating
441 * point state that does not belong to the caller (see examples above),
442 * we must be careful in how we do this in order to prevent corruption
443 * of another program.
444 *
445 * To make sure that floating point state is always saved and restored
446 * correctly, the following "big rules" must be followed when the floating
447 * point registers will be used:
448 *
449 * 1. %l6 always holds the caller's lofault handler. Also in this register,
450 * Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
451 * use. Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a
452 * lofault handler was set coming in.
453 *
454 * 2. The FPUSED flag indicates that all FP state has been successfully stored
455 * on the stack. It should not be set until this save has been completed.
456 *
457 * 3. The FPUSED flag should not be cleared on exit until all FP state has
458 * been restored from the stack. If an error occurs while restoring
459 * data from the stack, the error handler can check this flag to see if
460 * a restore is necessary.
461 *
462 * 4. Code run under the new lofault handler must be kept to a minimum. In
463 * particular, any calls to FP_ALLOWMIGRATE, which could result in a call
464 * to kpreempt(), should not be made until after the lofault handler has
465 * been restored.
466 */
467
468 /*
469 * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
470 * to "break even" using FP/VIS-accelerated memory operations.
471 * The FPBLK code assumes a minimum number of bytes are available
472 * to be moved on entry. Check that code carefully before
473 * reducing VIS_COPY_THRESHOLD below 256.
474 */
475 /*
476 * This shadows sys/machsystm.h which can't be included due to the lack of
477 * _ASM guards in include files it references. Change it here, change it there.
478 */
479 #define VIS_COPY_THRESHOLD 256
480
481 /*
482 * TEST for very short copies
483 * Be aware that the maximum unroll for the short unaligned case
484 * is SHORTCOPY+1
485 */
486 #define SHORTCOPY 3
487 #define CHKSIZE 39
488
489 /*
490 * Indicates that we're to trampoline to the error handler.
491 * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
492 * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
493 */
494 #define FPUSED_FLAG 1
495 #define TRAMP_FLAG 2
496 #define MASK_FLAGS 3
497
498 /*
499 * Number of outstanding prefetches.
500 * Testing with 1200 MHz Cheetah+ and Jaguar gives best results with
501 * two prefetches, one with a reach of 8*BLOCK_SIZE+8 and one with a
 * reach of 5*BLOCK_SIZE. The double prefetch gives a typical improvement
503 * of 5% for large copies as compared to a single prefetch. The reason
504 * for the improvement is that with Cheetah and Jaguar, some prefetches
505 * are dropped due to the prefetch queue being full. The second prefetch
506 * reduces the number of cache lines that are dropped.
507 * Do not remove the double prefetch or change either CHEETAH_PREFETCH
508 * or CHEETAH_2ND_PREFETCH without extensive performance tests to prove
509 * there is no loss of performance.
510 */
511 #define CHEETAH_PREFETCH 8
512 #define CHEETAH_2ND_PREFETCH 5
513
514 #define VIS_BLOCKSIZE 64
515
516 /*
 * Size of stack frame in order to accommodate a 64-byte aligned
518 * floating-point register save area and 2 64-bit temp locations.
519 * All copy functions use two quadrants of fp registers; to assure a
520 * block-aligned two block buffer in which to save we must reserve
 * three blocks on stack. Not all functions preserve %fprs on stack
522 * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
523 *
524 * _______________________________________ <-- %fp + STACK_BIAS
525 * | We may need to preserve 2 quadrants |
526 * | of fp regs, but since we do so with |
527 * | BST/BLD we need room in which to |
528 * | align to VIS_BLOCKSIZE bytes. So |
529 * | this area is 3 * VIS_BLOCKSIZE. | <-- - SAVED_FPREGS_OFFSET
530 * |-------------------------------------|
531 * | 8 bytes to save %fprs | <-- - SAVED_FPRS_OFFSET
532 * |-------------------------------------|
533 * | 8 bytes to save %gsr | <-- - SAVED_GSR_OFFSET
534 * ---------------------------------------
535 */
536 #define HWCOPYFRAMESIZE ((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
537 #define SAVED_FPREGS_OFFSET (VIS_BLOCKSIZE * 3)
538 #define SAVED_FPREGS_ADJUST ((VIS_BLOCKSIZE * 2) - 1)
539 #define SAVED_FPRS_OFFSET (SAVED_FPREGS_OFFSET + 8)
540 #define SAVED_GSR_OFFSET (SAVED_FPRS_OFFSET + 8)
541
542 /*
543 * Common macros used by the various versions of the block copy
544 * routines in this file.
545 */
546
547 /*
548 * In FP copies if we do not have preserved data to restore over
549 * the fp regs we used then we must zero those regs to avoid
550 * exposing portions of the data to later threads (data security).
551 *
552 * Copy functions use either quadrants 1 and 3 or 2 and 4.
553 *
554 * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
555 * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
556 *
557 * The instructions below are quicker than repeated fzero instructions
558 * since they can dispatch down two fp pipelines.
559 */
560 #define FZEROQ1Q3 \
561 fzero %f0 ;\
562 fzero %f2 ;\
563 faddd %f0, %f2, %f4 ;\
564 fmuld %f0, %f2, %f6 ;\
565 faddd %f0, %f2, %f8 ;\
566 fmuld %f0, %f2, %f10 ;\
567 faddd %f0, %f2, %f12 ;\
568 fmuld %f0, %f2, %f14 ;\
569 faddd %f0, %f2, %f32 ;\
570 fmuld %f0, %f2, %f34 ;\
571 faddd %f0, %f2, %f36 ;\
572 fmuld %f0, %f2, %f38 ;\
573 faddd %f0, %f2, %f40 ;\
574 fmuld %f0, %f2, %f42 ;\
575 faddd %f0, %f2, %f44 ;\
576 fmuld %f0, %f2, %f46
577
578 #define FZEROQ2Q4 \
579 fzero %f16 ;\
580 fzero %f18 ;\
581 faddd %f16, %f18, %f20 ;\
582 fmuld %f16, %f18, %f22 ;\
583 faddd %f16, %f18, %f24 ;\
584 fmuld %f16, %f18, %f26 ;\
585 faddd %f16, %f18, %f28 ;\
586 fmuld %f16, %f18, %f30 ;\
587 faddd %f16, %f18, %f48 ;\
588 fmuld %f16, %f18, %f50 ;\
589 faddd %f16, %f18, %f52 ;\
590 fmuld %f16, %f18, %f54 ;\
591 faddd %f16, %f18, %f56 ;\
592 fmuld %f16, %f18, %f58 ;\
593 faddd %f16, %f18, %f60 ;\
594 fmuld %f16, %f18, %f62
595
596 /*
597 * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
598 * Used to save and restore in-use fp registers when we want to use FP
599 * and find fp already in use and copy size still large enough to justify
600 * the additional overhead of this save and restore.
601 *
602 * A membar #Sync is needed before save to sync fp ops initiated before
603 * the call to the copy function (by whoever has fp in use); for example
604 * an earlier block load to the quadrant we are about to save may still be
605 * "in flight". A membar #Sync is required at the end of the save to
606 * sync our block store (the copy code is about to begin ldd's to the
607 * first quadrant). Note, however, that since Cheetah pipeline block load
608 * is blocking we can omit the initial membar before saving fp state (they're
609 * commented below in case of future porting to a chip that does not block
610 * on block load).
611 *
612 * Similarly: a membar #Sync before restore allows the block stores of
613 * the copy operation to complete before we fill the quadrants with their
614 * original data, and a membar #Sync after restore lets the block loads
615 * of the restore complete before we return to whoever has the fp regs
616 * in use. To avoid repeated membar #Sync we make it the responsibility
617 * of the copy code to membar #Sync immediately after copy is complete
618 * and before using the BLD_*_FROMSTACK macro.
619 */
620 #define BST_FPQ1Q3_TOSTACK(tmp1) \
621 /* membar #Sync */ ;\
622 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
623 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
624 stda %f0, [tmp1]ASI_BLK_P ;\
625 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
626 stda %f32, [tmp1]ASI_BLK_P ;\
627 membar #Sync
628
629 #define BLD_FPQ1Q3_FROMSTACK(tmp1) \
630 /* membar #Sync - provided at copy completion */ ;\
631 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
632 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
633 ldda [tmp1]ASI_BLK_P, %f0 ;\
634 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
635 ldda [tmp1]ASI_BLK_P, %f32 ;\
636 membar #Sync
637
638 #define BST_FPQ2Q4_TOSTACK(tmp1) \
639 /* membar #Sync */ ;\
640 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
641 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
642 stda %f16, [tmp1]ASI_BLK_P ;\
643 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
644 stda %f48, [tmp1]ASI_BLK_P ;\
645 membar #Sync
646
647 #define BLD_FPQ2Q4_FROMSTACK(tmp1) \
648 /* membar #Sync - provided at copy completion */ ;\
649 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
650 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
651 ldda [tmp1]ASI_BLK_P, %f16 ;\
652 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
653 ldda [tmp1]ASI_BLK_P, %f48 ;\
654 membar #Sync
655
656 /*
657 * FP_NOMIGRATE and FP_ALLOWMIGRATE. Prevent migration (or, stronger,
658 * prevent preemption if there is no t_lwp to save FP state to on context
659 * switch) before commencing a FP copy, and reallow it on completion or
660 * in error trampoline paths when we were using FP copy.
661 *
662 * Both macros may call other functions, so be aware that all outputs are
663 * forfeit after using these macros. For this reason we do not pass registers
664 * to use - we just use any outputs we want.
665 *
666 * For fpRAS we need to perform the fpRAS mechanism test on the same
667 * CPU as we use for the copy operation, both so that we validate the
668 * CPU we perform the copy on and so that we know which CPU failed
669 * if a failure is detected. Hence we need to be bound to "our" CPU.
 * This could be achieved through disabling preemption (and we have to do it that
671 * way for threads with no t_lwp) but for larger copies this may hold
672 * higher priority threads off of cpu for too long (eg, realtime). So we
673 * make use of the lightweight t_nomigrate mechanism where we can (ie, when
674 * we have a t_lwp).
675 *
676 * Pseudo code:
677 *
678 * FP_NOMIGRATE:
679 *
680 * if (curthread->t_lwp) {
681 * thread_nomigrate();
682 * } else {
683 * kpreempt_disable();
684 * }
685 *
686 * FP_ALLOWMIGRATE:
687 *
688 * if (curthread->t_lwp) {
689 * thread_allowmigrate();
690 * } else {
691 * kpreempt_enable();
692 * }
693 */
694
695 #define FP_NOMIGRATE(label1, label2) \
696 ldn [THREAD_REG + T_LWP], %o0 ;\
697 brz,a,pn %o0, label1/**/f ;\
698 ldsb [THREAD_REG + T_PREEMPT], %o1 ;\
699 call thread_nomigrate ;\
700 nop ;\
701 ba label2/**/f ;\
702 nop ;\
703 label1: ;\
704 inc %o1 ;\
705 stb %o1, [THREAD_REG + T_PREEMPT] ;\
706 label2:
707
708 #define FP_ALLOWMIGRATE(label1, label2) \
709 ldn [THREAD_REG + T_LWP], %o0 ;\
710 brz,a,pn %o0, label1/**/f ;\
711 ldsb [THREAD_REG + T_PREEMPT], %o1 ;\
712 call thread_allowmigrate ;\
713 nop ;\
714 ba label2/**/f ;\
715 nop ;\
716 label1: ;\
717 dec %o1 ;\
718 brnz,pn %o1, label2/**/f ;\
719 stb %o1, [THREAD_REG + T_PREEMPT] ;\
720 ldn [THREAD_REG + T_CPU], %o0 ;\
721 ldub [%o0 + CPU_KPRUNRUN], %o0 ;\
722 brz,pt %o0, label2/**/f ;\
723 nop ;\
724 call kpreempt ;\
725 rdpr %pil, %o0 ;\
726 label2:
727
728 /*
729 * Copy a block of storage, returning an error code if `from' or
730 * `to' takes a kernel pagefault which cannot be resolved.
731 * Returns errno value on pagefault error, 0 if all ok
732 */
733
734 .seg ".text"
735 .align 4
736
737 ENTRY(kcopy)
738
739 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
740 bleu,pt %ncc, .kcopy_small ! go to larger cases
741 xor %o0, %o1, %o3 ! are src, dst alignable?
742 btst 7, %o3 !
743 bz,pt %ncc, .kcopy_8 ! check for longword alignment
744 nop
745 btst 1, %o3 !
746 bz,pt %ncc, .kcopy_2 ! check for half-word
747 nop
748 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
749 ld [%o3 + %lo(hw_copy_limit_1)], %o3
750 tst %o3
751 bz,pn %icc, .kcopy_small ! if zero, disable HW copy
752 cmp %o2, %o3 ! if length <= limit
753 bleu,pt %ncc, .kcopy_small ! go to small copy
754 nop
755 ba,pt %ncc, .kcopy_more ! otherwise go to large copy
756 nop
757 .kcopy_2:
758 btst 3, %o3 !
759 bz,pt %ncc, .kcopy_4 ! check for word alignment
760 nop
761 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
762 ld [%o3 + %lo(hw_copy_limit_2)], %o3
763 tst %o3
764 bz,pn %icc, .kcopy_small ! if zero, disable HW copy
765 cmp %o2, %o3 ! if length <= limit
766 bleu,pt %ncc, .kcopy_small ! go to small copy
767 nop
768 ba,pt %ncc, .kcopy_more ! otherwise go to large copy
769 nop
770 .kcopy_4:
771 ! already checked longword, must be word aligned
772 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
773 ld [%o3 + %lo(hw_copy_limit_4)], %o3
774 tst %o3
775 bz,pn %icc, .kcopy_small ! if zero, disable HW copy
776 cmp %o2, %o3 ! if length <= limit
777 bleu,pt %ncc, .kcopy_small ! go to small copy
778 nop
779 ba,pt %ncc, .kcopy_more ! otherwise go to large copy
780 nop
781 .kcopy_8:
782 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
783 ld [%o3 + %lo(hw_copy_limit_8)], %o3
784 tst %o3
785 bz,pn %icc, .kcopy_small ! if zero, disable HW copy
786 cmp %o2, %o3 ! if length <= limit
787 bleu,pt %ncc, .kcopy_small ! go to small copy
788 nop
789 ba,pt %ncc, .kcopy_more ! otherwise go to large copy
790 nop
791
792 .kcopy_small:
793 sethi %hi(.sm_copyerr), %o5 ! sm_copyerr is lofault value
794 or %o5, %lo(.sm_copyerr), %o5
795 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler
796 membar #Sync ! sync error barrier
797 ba,pt %ncc, .sm_do_copy ! common code
798 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault
799
800 .kcopy_more:
801 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
802 sethi %hi(.copyerr), %l7 ! copyerr is lofault value
803 or %l7, %lo(.copyerr), %l7
804 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler
805 membar #Sync ! sync error barrier
806 ba,pt %ncc, .do_copy ! common code
807 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
808
809
810 /*
811 * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
812 * Errno value is in %g1. bcopy_more uses fp quadrants 1 and 3.
813 */
814 .copyerr:
815 set .copyerr2, %l0
816 membar #Sync ! sync error barrier
817 stn %l0, [THREAD_REG + T_LOFAULT] ! set t_lofault
818 btst FPUSED_FLAG, %l6
819 bz %ncc, 1f
820 and %l6, TRAMP_FLAG, %l0 ! copy trampoline flag to %l0
821
822 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr
823 wr %o2, 0, %gsr
824
825 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
826 btst FPRS_FEF, %o3
827 bz,pt %icc, 4f
828 nop
829
830 BLD_FPQ1Q3_FROMSTACK(%o2)
831
832 ba,pt %ncc, 1f
833 wr %o3, 0, %fprs ! restore fprs
834
835 4:
836 FZEROQ1Q3
837 wr %o3, 0, %fprs ! restore fprs
838
839 !
840 ! Need to cater for the different expectations of kcopy
841 ! and bcopy. kcopy will *always* set a t_lofault handler
842 ! If it fires, we're expected to just return the error code
843 ! and *not* to invoke any existing error handler. As far as
844 ! bcopy is concerned, we only set t_lofault if there was an
845 ! existing lofault handler. In that case we're expected to
846 ! invoke the previously existing handler after resetting the
847 ! t_lofault value.
848 !
849 1:
850 andn %l6, MASK_FLAGS, %l6 ! turn trampoline flag off
851 membar #Sync ! sync error barrier
852 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
853 FP_ALLOWMIGRATE(5, 6)
854
855 btst TRAMP_FLAG, %l0
856 bnz,pn %ncc, 3f
857 nop
858 ret
859 restore %g1, 0, %o0
860
861 3:
862 !
863 ! We're here via bcopy. There *must* have been an error handler
864 ! in place otherwise we would have died a nasty death already.
865 !
866 jmp %l6 ! goto real handler
867 restore %g0, 0, %o0 ! dispose of copy window
868
869 /*
870 * We got here because of a fault in .copyerr. We can't safely restore fp
871 * state, so we panic.
872 */
873 fp_panic_msg:
874 .asciz "Unable to restore fp state after copy operation"
875
876 .align 4
877 .copyerr2:
878 set fp_panic_msg, %o0
879 call panic
880 nop
881
882 /*
883 * We got here because of a fault during a small kcopy or bcopy.
884 * No floating point registers are used by the small copies.
885 * Errno value is in %g1.
886 */
887 .sm_copyerr:
888 1:
889 btst TRAMP_FLAG, %o4
890 membar #Sync
891 andn %o4, TRAMP_FLAG, %o4
892 bnz,pn %ncc, 3f
893 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
894 retl
895 mov %g1, %o0
896 3:
897 jmp %o4 ! goto real handler
898 mov %g0, %o0 !
899
900 SET_SIZE(kcopy)
901
902
/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * Registers: l6 - saved t_lofault
 * (for short copies, o4 - saved t_lofault)
 *
 * The FP-register block-copy path (bcopy_more) assumes double word
 * alignment and a count >= 256.
 */
911
912 ENTRY(bcopy)
913
914 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
915 bleu,pt %ncc, .bcopy_small ! go to larger cases
916 xor %o0, %o1, %o3 ! are src, dst alignable?
917 btst 7, %o3 !
918 bz,pt %ncc, .bcopy_8 ! check for longword alignment
919 nop
920 btst 1, %o3 !
921 bz,pt %ncc, .bcopy_2 ! check for half-word
922 nop
923 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
924 ld [%o3 + %lo(hw_copy_limit_1)], %o3
925 tst %o3
926 bz,pn %icc, .bcopy_small ! if zero, disable HW copy
927 cmp %o2, %o3 ! if length <= limit
928 bleu,pt %ncc, .bcopy_small ! go to small copy
929 nop
930 ba,pt %ncc, .bcopy_more ! otherwise go to large copy
931 nop
932 .bcopy_2:
933 btst 3, %o3 !
934 bz,pt %ncc, .bcopy_4 ! check for word alignment
935 nop
936 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
937 ld [%o3 + %lo(hw_copy_limit_2)], %o3
938 tst %o3
939 bz,pn %icc, .bcopy_small ! if zero, disable HW copy
940 cmp %o2, %o3 ! if length <= limit
941 bleu,pt %ncc, .bcopy_small ! go to small copy
942 nop
943 ba,pt %ncc, .bcopy_more ! otherwise go to large copy
944 nop
945 .bcopy_4:
946 ! already checked longword, must be word aligned
947 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
948 ld [%o3 + %lo(hw_copy_limit_4)], %o3
949 tst %o3
950 bz,pn %icc, .bcopy_small ! if zero, disable HW copy
951 cmp %o2, %o3 ! if length <= limit
952 bleu,pt %ncc, .bcopy_small ! go to small copy
953 nop
954 ba,pt %ncc, .bcopy_more ! otherwise go to large copy
955 nop
956 .bcopy_8:
957 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
958 ld [%o3 + %lo(hw_copy_limit_8)], %o3
959 tst %o3
960 bz,pn %icc, .bcopy_small ! if zero, disable HW copy
961 cmp %o2, %o3 ! if length <= limit
962 bleu,pt %ncc, .bcopy_small ! go to small copy
963 nop
964 ba,pt %ncc, .bcopy_more ! otherwise go to large copy
965 nop
966
967 .align 16
968 .bcopy_small:
969 ldn [THREAD_REG + T_LOFAULT], %o4 ! save t_lofault
970 tst %o4
971 bz,pt %icc, .sm_do_copy
972 nop
973 sethi %hi(.sm_copyerr), %o5
974 or %o5, %lo(.sm_copyerr), %o5
975 membar #Sync ! sync error barrier
976 stn %o5, [THREAD_REG + T_LOFAULT] ! install new vector
977 or %o4, TRAMP_FLAG, %o4 ! error should trampoline
978 .sm_do_copy:
979 cmp %o2, SHORTCOPY ! check for really short case
980 bleu,pt %ncc, .bc_sm_left !
981 cmp %o2, CHKSIZE ! check for medium length cases
982 bgu,pn %ncc, .bc_med !
983 or %o0, %o1, %o3 ! prepare alignment check
984 andcc %o3, 0x3, %g0 ! test for alignment
985 bz,pt %ncc, .bc_sm_word ! branch to word aligned case
986 .bc_sm_movebytes:
987 sub %o2, 3, %o2 ! adjust count to allow cc zero test
988 .bc_sm_notalign4:
989 ldub [%o0], %o3 ! read byte
990 stb %o3, [%o1] ! write byte
991 subcc %o2, 4, %o2 ! reduce count by 4
992 ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes
993 add %o0, 4, %o0 ! advance SRC by 4
994 stb %o3, [%o1 + 1]
995 ldub [%o0 - 2], %o3
996 add %o1, 4, %o1 ! advance DST by 4
997 stb %o3, [%o1 - 2]
998 ldub [%o0 - 1], %o3
999 bgt,pt %ncc, .bc_sm_notalign4 ! loop til 3 or fewer bytes remain
1000 stb %o3, [%o1 - 1]
1001 add %o2, 3, %o2 ! restore count
1002 .bc_sm_left:
1003 tst %o2
1004 bz,pt %ncc, .bc_sm_exit ! check for zero length
1005 deccc %o2 ! reduce count for cc test
1006 ldub [%o0], %o3 ! move one byte
1007 bz,pt %ncc, .bc_sm_exit
1008 stb %o3, [%o1]
1009 ldub [%o0 + 1], %o3 ! move another byte
1010 deccc %o2 ! check for more
1011 bz,pt %ncc, .bc_sm_exit
1012 stb %o3, [%o1 + 1]
1013 ldub [%o0 + 2], %o3 ! move final byte
1014 stb %o3, [%o1 + 2]
1015 membar #Sync ! sync error barrier
1016 andn %o4, TRAMP_FLAG, %o4
1017 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1018 retl
1019 mov %g0, %o0 ! return 0
1020 .align 16
1021 nop ! instruction alignment
1022 ! see discussion at start of file
1023 .bc_sm_words:
1024 lduw [%o0], %o3 ! read word
1025 .bc_sm_wordx:
1026 subcc %o2, 8, %o2 ! update count
1027 stw %o3, [%o1] ! write word
1028 add %o0, 8, %o0 ! update SRC
1029 lduw [%o0 - 4], %o3 ! read word
1030 add %o1, 8, %o1 ! update DST
1031 bgt,pt %ncc, .bc_sm_words ! loop til done
1032 stw %o3, [%o1 - 4] ! write word
1033 addcc %o2, 7, %o2 ! restore count
1034 bz,pt %ncc, .bc_sm_exit
1035 deccc %o2
1036 bz,pt %ncc, .bc_sm_byte
1037 .bc_sm_half:
1038 subcc %o2, 2, %o2 ! reduce count by 2
1039 add %o0, 2, %o0 ! advance SRC by 2
1040 lduh [%o0 - 2], %o3 ! read half word
1041 add %o1, 2, %o1 ! advance DST by 2
1042 bgt,pt %ncc, .bc_sm_half ! loop til done
1043 sth %o3, [%o1 - 2] ! write half word
1044 addcc %o2, 1, %o2 ! restore count
1045 bz,pt %ncc, .bc_sm_exit
1046 nop
1047 .bc_sm_byte:
1048 ldub [%o0], %o3
1049 stb %o3, [%o1]
1050 membar #Sync ! sync error barrier
1051 andn %o4, TRAMP_FLAG, %o4
1052 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1053 retl
1054 mov %g0, %o0 ! return 0
1055
1056 .bc_sm_word:
1057 subcc %o2, 4, %o2 ! update count
1058 bgt,pt %ncc, .bc_sm_wordx
1059 lduw [%o0], %o3 ! read word
1060 addcc %o2, 3, %o2 ! restore count
1061 bz,pt %ncc, .bc_sm_exit
1062 stw %o3, [%o1] ! write word
1063 deccc %o2 ! reduce count for cc test
1064 ldub [%o0 + 4], %o3 ! load one byte
1065 bz,pt %ncc, .bc_sm_exit
1066 stb %o3, [%o1 + 4] ! store one byte
1067 ldub [%o0 + 5], %o3 ! load second byte
1068 deccc %o2
1069 bz,pt %ncc, .bc_sm_exit
1070 stb %o3, [%o1 + 5] ! store second byte
1071 ldub [%o0 + 6], %o3 ! load third byte
1072 stb %o3, [%o1 + 6] ! store third byte
1073 .bc_sm_exit:
1074 membar #Sync ! sync error barrier
1075 andn %o4, TRAMP_FLAG, %o4
1076 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1077 retl
1078 mov %g0, %o0 ! return 0
1079
1080 .align 16
1081 .bc_med:
1082 xor %o0, %o1, %o3 ! setup alignment check
1083 btst 1, %o3
1084 bnz,pt %ncc, .bc_sm_movebytes ! unaligned
1085 nop
1086 btst 3, %o3
1087 bnz,pt %ncc, .bc_med_half ! halfword aligned
1088 nop
1089 btst 7, %o3
1090 bnz,pt %ncc, .bc_med_word ! word aligned
1091 nop
1092 .bc_med_long:
1093 btst 3, %o0 ! check for
1094 bz,pt %ncc, .bc_med_long1 ! word alignment
1095 nop
1096 .bc_med_long0:
1097 ldub [%o0], %o3 ! load one byte
1098 inc %o0
1099 stb %o3,[%o1] ! store byte
1100 inc %o1
1101 btst 3, %o0
1102 bnz,pt %ncc, .bc_med_long0
1103 dec %o2
1104 .bc_med_long1: ! word aligned
1105 btst 7, %o0 ! check for long word
1106 bz,pt %ncc, .bc_med_long2
1107 nop
1108 lduw [%o0], %o3 ! load word
1109 add %o0, 4, %o0 ! advance SRC by 4
1110 stw %o3, [%o1] ! store word
1111 add %o1, 4, %o1 ! advance DST by 4
1112 sub %o2, 4, %o2 ! reduce count by 4
1113 !
1114 ! Now long word aligned and have at least 32 bytes to move
1115 !
1116 .bc_med_long2:
1117 sub %o2, 31, %o2 ! adjust count to allow cc zero test
1118 .bc_med_lmove:
1119 ldx [%o0], %o3 ! read long word
1120 stx %o3, [%o1] ! write long word
1121 subcc %o2, 32, %o2 ! reduce count by 32
1122 ldx [%o0 + 8], %o3 ! repeat for a total for 4 long words
1123 add %o0, 32, %o0 ! advance SRC by 32
1124 stx %o3, [%o1 + 8]
1125 ldx [%o0 - 16], %o3
1126 add %o1, 32, %o1 ! advance DST by 32
1127 stx %o3, [%o1 - 16]
1128 ldx [%o0 - 8], %o3
1129 bgt,pt %ncc, .bc_med_lmove ! loop til 31 or fewer bytes left
1130 stx %o3, [%o1 - 8]
1131 addcc %o2, 24, %o2 ! restore count to long word offset
1132 ble,pt %ncc, .bc_med_lextra ! check for more long words to move
1133 nop
1134 .bc_med_lword:
1135 ldx [%o0], %o3 ! read long word
1136 subcc %o2, 8, %o2 ! reduce count by 8
1137 stx %o3, [%o1] ! write long word
1138 add %o0, 8, %o0 ! advance SRC by 8
1139 bgt,pt %ncc, .bc_med_lword ! loop til 7 or fewer bytes left
1140 add %o1, 8, %o1 ! advance DST by 8
1141 .bc_med_lextra:
1142 addcc %o2, 7, %o2 ! restore rest of count
1143 bz,pt %ncc, .bc_sm_exit ! if zero, then done
1144 deccc %o2
1145 bz,pt %ncc, .bc_sm_byte
1146 nop
1147 ba,pt %ncc, .bc_sm_half
1148 nop
1149
1150 .align 16
1151 .bc_med_word:
1152 btst 3, %o0 ! check for
1153 bz,pt %ncc, .bc_med_word1 ! word alignment
1154 nop
1155 .bc_med_word0:
1156 ldub [%o0], %o3 ! load one byte
1157 inc %o0
1158 stb %o3,[%o1] ! store byte
1159 inc %o1
1160 btst 3, %o0
1161 bnz,pt %ncc, .bc_med_word0
1162 dec %o2
1163 !
1164 ! Now word aligned and have at least 36 bytes to move
1165 !
1166 .bc_med_word1:
1167 sub %o2, 15, %o2 ! adjust count to allow cc zero test
1168 .bc_med_wmove:
1169 lduw [%o0], %o3 ! read word
1170 stw %o3, [%o1] ! write word
1171 subcc %o2, 16, %o2 ! reduce count by 16
1172 lduw [%o0 + 4], %o3 ! repeat for a total for 4 words
1173 add %o0, 16, %o0 ! advance SRC by 16
1174 stw %o3, [%o1 + 4]
1175 lduw [%o0 - 8], %o3
1176 add %o1, 16, %o1 ! advance DST by 16
1177 stw %o3, [%o1 - 8]
1178 lduw [%o0 - 4], %o3
1179 bgt,pt %ncc, .bc_med_wmove ! loop til 15 or fewer bytes left
1180 stw %o3, [%o1 - 4]
1181 addcc %o2, 12, %o2 ! restore count to word offset
1182 ble,pt %ncc, .bc_med_wextra ! check for more words to move
1183 nop
1184 .bc_med_word2:
1185 lduw [%o0], %o3 ! read word
1186 subcc %o2, 4, %o2 ! reduce count by 4
1187 stw %o3, [%o1] ! write word
1188 add %o0, 4, %o0 ! advance SRC by 4
1189 bgt,pt %ncc, .bc_med_word2 ! loop til 3 or fewer bytes left
1190 add %o1, 4, %o1 ! advance DST by 4
1191 .bc_med_wextra:
1192 addcc %o2, 3, %o2 ! restore rest of count
1193 bz,pt %ncc, .bc_sm_exit ! if zero, then done
1194 deccc %o2
1195 bz,pt %ncc, .bc_sm_byte
1196 nop
1197 ba,pt %ncc, .bc_sm_half
1198 nop
1199
1200 .align 16
1201 .bc_med_half:
1202 btst 1, %o0 ! check for
1203 bz,pt %ncc, .bc_med_half1 ! half word alignment
1204 nop
1205 ldub [%o0], %o3 ! load one byte
1206 inc %o0
1207 stb %o3,[%o1] ! store byte
1208 inc %o1
1209 dec %o2
1210 !
1211 ! Now half word aligned and have at least 38 bytes to move
1212 !
1213 .bc_med_half1:
1214 sub %o2, 7, %o2 ! adjust count to allow cc zero test
1215 .bc_med_hmove:
1216 lduh [%o0], %o3 ! read half word
1217 sth %o3, [%o1] ! write half word
1218 subcc %o2, 8, %o2 ! reduce count by 8
1219 lduh [%o0 + 2], %o3 ! repeat for a total for 4 halfwords
1220 add %o0, 8, %o0 ! advance SRC by 8
1221 sth %o3, [%o1 + 2]
1222 lduh [%o0 - 4], %o3
1223 add %o1, 8, %o1 ! advance DST by 8
1224 sth %o3, [%o1 - 4]
1225 lduh [%o0 - 2], %o3
1226 bgt,pt %ncc, .bc_med_hmove ! loop til 7 or fewer bytes left
1227 sth %o3, [%o1 - 2]
1228 addcc %o2, 7, %o2 ! restore count
1229 bz,pt %ncc, .bc_sm_exit
1230 deccc %o2
1231 bz,pt %ncc, .bc_sm_byte
1232 nop
1233 ba,pt %ncc, .bc_sm_half
1234 nop
1235
1236 SET_SIZE(bcopy)
1237
1238 /*
1239 * The _more entry points are not intended to be used directly by
1240 * any caller from outside this file. They are provided to allow
1241 * profiling and dtrace of the portions of the copy code that uses
1242 * the floating point registers.
1243 * This entry is particularly important as DTRACE (at least as of
1244 * 4/2004) does not support leaf functions.
1245 */
1246
1247 ENTRY(bcopy_more)
1248 .bcopy_more:
1249 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1250 ldn [THREAD_REG + T_LOFAULT], %l6 ! save t_lofault
1251 tst %l6
1252 bz,pt %ncc, .do_copy
1253 nop
1254 sethi %hi(.copyerr), %o2
1255 or %o2, %lo(.copyerr), %o2
1256 membar #Sync ! sync error barrier
1257 stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector
1258 !
1259 ! We've already captured whether t_lofault was zero on entry.
1260 ! We need to mark ourselves as being from bcopy since both
1261 ! kcopy and bcopy use the same code path. If TRAMP_FLAG is set
1262 ! and the saved lofault was zero, we won't reset lofault on
1263 ! returning.
1264 !
1265 or %l6, TRAMP_FLAG, %l6
1266
1267 /*
1268 * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes
1269 * Also, use of FP registers has been tested to be enabled
1270 */
1271 .do_copy:
1272 FP_NOMIGRATE(6, 7)
1273
1274 rd %fprs, %o2 ! check for unused fp
1275 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
1276 btst FPRS_FEF, %o2
1277 bz,a,pt %icc, .do_blockcopy
1278 wr %g0, FPRS_FEF, %fprs
1279
1280 BST_FPQ1Q3_TOSTACK(%o2)
1281
1282 .do_blockcopy:
1283 rd %gsr, %o2
1284 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr
1285 or %l6, FPUSED_FLAG, %l6
1286
1287 #define REALSRC %i0
1288 #define DST %i1
1289 #define CNT %i2
1290 #define SRC %i3
1291 #define TMP %i5
1292
1293 andcc DST, VIS_BLOCKSIZE - 1, TMP
1294 bz,pt %ncc, 2f
1295 neg TMP
1296 add TMP, VIS_BLOCKSIZE, TMP
1297
1298 ! TMP = bytes required to align DST on FP_BLOCK boundary
1299 ! Using SRC as a tmp here
1300 cmp TMP, 3
1301 bleu,pt %ncc, 1f
1302 sub CNT,TMP,CNT ! adjust main count
1303 sub TMP, 3, TMP ! adjust for end of loop test
1304 .bc_blkalign:
1305 ldub [REALSRC], SRC ! move 4 bytes per loop iteration
1306 stb SRC, [DST]
1307 subcc TMP, 4, TMP
1308 ldub [REALSRC + 1], SRC
1309 add REALSRC, 4, REALSRC
1310 stb SRC, [DST + 1]
1311 ldub [REALSRC - 2], SRC
1312 add DST, 4, DST
1313 stb SRC, [DST - 2]
1314 ldub [REALSRC - 1], SRC
1315 bgu,pt %ncc, .bc_blkalign
1316 stb SRC, [DST - 1]
1317
1318 addcc TMP, 3, TMP ! restore count adjustment
1319 bz,pt %ncc, 2f ! no bytes left?
1320 nop
1321 1: ldub [REALSRC], SRC
1322 inc REALSRC
1323 inc DST
1324 deccc TMP
1325 bgu %ncc, 1b
1326 stb SRC, [DST - 1]
1327
1328 2:
1329 andn REALSRC, 0x7, SRC
1330 alignaddr REALSRC, %g0, %g0
1331
1332 ! SRC - 8-byte aligned
1333 ! DST - 64-byte aligned
1334 prefetch [SRC], #one_read
1335 prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
1336 prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
1337 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
1338 ldd [SRC], %f0
1339 #if CHEETAH_PREFETCH > 4
1340 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1341 #endif
1342 ldd [SRC + 0x08], %f2
1343 #if CHEETAH_PREFETCH > 5
1344 prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
1345 #endif
1346 ldd [SRC + 0x10], %f4
1347 #if CHEETAH_PREFETCH > 6
1348 prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
1349 #endif
1350 faligndata %f0, %f2, %f32
1351 ldd [SRC + 0x18], %f6
1352 #if CHEETAH_PREFETCH > 7
1353 prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
1354 #endif
1355 faligndata %f2, %f4, %f34
1356 ldd [SRC + 0x20], %f8
1357 faligndata %f4, %f6, %f36
1358 ldd [SRC + 0x28], %f10
1359 faligndata %f6, %f8, %f38
1360 ldd [SRC + 0x30], %f12
1361 faligndata %f8, %f10, %f40
1362 ldd [SRC + 0x38], %f14
1363 faligndata %f10, %f12, %f42
1364 ldd [SRC + VIS_BLOCKSIZE], %f0
1365 sub CNT, VIS_BLOCKSIZE, CNT
1366 add SRC, VIS_BLOCKSIZE, SRC
1367 add REALSRC, VIS_BLOCKSIZE, REALSRC
1368 ba,a,pt %ncc, 1f
1369 nop
1370 .align 16
1371 1:
1372 ldd [SRC + 0x08], %f2
1373 faligndata %f12, %f14, %f44
1374 ldd [SRC + 0x10], %f4
1375 faligndata %f14, %f0, %f46
1376 stda %f32, [DST]ASI_BLK_P
1377 ldd [SRC + 0x18], %f6
1378 faligndata %f0, %f2, %f32
1379 ldd [SRC + 0x20], %f8
1380 faligndata %f2, %f4, %f34
1381 ldd [SRC + 0x28], %f10
1382 faligndata %f4, %f6, %f36
1383 ldd [SRC + 0x30], %f12
1384 faligndata %f6, %f8, %f38
1385 ldd [SRC + 0x38], %f14
1386 faligndata %f8, %f10, %f40
1387 sub CNT, VIS_BLOCKSIZE, CNT
1388 ldd [SRC + VIS_BLOCKSIZE], %f0
1389 faligndata %f10, %f12, %f42
1390 prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
1391 add DST, VIS_BLOCKSIZE, DST
1392 prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1393 add REALSRC, VIS_BLOCKSIZE, REALSRC
1394 cmp CNT, VIS_BLOCKSIZE + 8
1395 bgu,pt %ncc, 1b
1396 add SRC, VIS_BLOCKSIZE, SRC
1397
1398 ! only if REALSRC & 0x7 is 0
1399 cmp CNT, VIS_BLOCKSIZE
1400 bne %ncc, 3f
1401 andcc REALSRC, 0x7, %g0
1402 bz,pt %ncc, 2f
1403 nop
1404 3:
1405 faligndata %f12, %f14, %f44
1406 faligndata %f14, %f0, %f46
1407 stda %f32, [DST]ASI_BLK_P
1408 add DST, VIS_BLOCKSIZE, DST
1409 ba,pt %ncc, 3f
1410 nop
1411 2:
1412 ldd [SRC + 0x08], %f2
1413 fsrc1 %f12, %f44
1414 ldd [SRC + 0x10], %f4
1415 fsrc1 %f14, %f46
1416 stda %f32, [DST]ASI_BLK_P
1417 ldd [SRC + 0x18], %f6
1418 fsrc1 %f0, %f32
1419 ldd [SRC + 0x20], %f8
1420 fsrc1 %f2, %f34
1421 ldd [SRC + 0x28], %f10
1422 fsrc1 %f4, %f36
1423 ldd [SRC + 0x30], %f12
1424 fsrc1 %f6, %f38
1425 ldd [SRC + 0x38], %f14
1426 fsrc1 %f8, %f40
1427 sub CNT, VIS_BLOCKSIZE, CNT
1428 add DST, VIS_BLOCKSIZE, DST
1429 add SRC, VIS_BLOCKSIZE, SRC
1430 add REALSRC, VIS_BLOCKSIZE, REALSRC
1431 fsrc1 %f10, %f42
1432 fsrc1 %f12, %f44
1433 fsrc1 %f14, %f46
1434 stda %f32, [DST]ASI_BLK_P
1435 add DST, VIS_BLOCKSIZE, DST
1436 ba,a,pt %ncc, .bcb_exit
1437 nop
1438
1439 3: tst CNT
1440 bz,a,pt %ncc, .bcb_exit
1441 nop
1442
1443 5: ldub [REALSRC], TMP
1444 inc REALSRC
1445 inc DST
1446 deccc CNT
1447 bgu %ncc, 5b
1448 stb TMP, [DST - 1]
1449 .bcb_exit:
1450 membar #Sync
1451
1452 FPRAS_INTERVAL(FPRAS_BCOPY, 0, %l5, %o2, %o3, %o4, %o5, 8)
1453 FPRAS_REWRITE_TYPE2Q1(0, %l5, %o2, %o3, 8, 9)
1454 FPRAS_CHECK(FPRAS_BCOPY, %l5, 9) ! outputs lost
1455
1456 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr
1457 wr %o2, 0, %gsr
1458
1459 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1460 btst FPRS_FEF, %o3
1461 bz,pt %icc, 4f
1462 nop
1463
1464 BLD_FPQ1Q3_FROMSTACK(%o2)
1465
1466 ba,pt %ncc, 2f
1467 wr %o3, 0, %fprs ! restore fprs
1468 4:
1469 FZEROQ1Q3
1470 wr %o3, 0, %fprs ! restore fprs
1471 2:
1472 membar #Sync ! sync error barrier
1473 andn %l6, MASK_FLAGS, %l6
1474 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1475 FP_ALLOWMIGRATE(5, 6)
1476 ret
1477 restore %g0, 0, %o0
1478
1479 SET_SIZE(bcopy_more)
1480
1481 /*
1482 * Block copy with possibly overlapped operands.
1483 */
1484
1485 ENTRY(ovbcopy)
1486 tst %o2 ! check count
1487 bgu,a %ncc, 1f ! nothing to do or bad arguments
1488 subcc %o0, %o1, %o3 ! difference of from and to address
1489
1490 retl ! return
1491 nop
1492 1:
1493 bneg,a %ncc, 2f
1494 neg %o3 ! if < 0, make it positive
1495 2: cmp %o2, %o3 ! cmp size and abs(from - to)
1496 bleu %ncc, bcopy ! if size <= abs(diff): use bcopy,
1497 .empty ! no overlap
1498 cmp %o0, %o1 ! compare from and to addresses
1499 blu %ncc, .ov_bkwd ! if from < to, copy backwards
1500 nop
1501 !
1502 ! Copy forwards.
1503 !
1504 .ov_fwd:
1505 ldub [%o0], %o3 ! read from address
1506 inc %o0 ! inc from address
1507 stb %o3, [%o1] ! write to address
1508 deccc %o2 ! dec count
1509 bgu %ncc, .ov_fwd ! loop till done
1510 inc %o1 ! inc to address
1511
1512 retl ! return
1513 nop
1514 !
1515 ! Copy backwards.
1516 !
1517 .ov_bkwd:
1518 deccc %o2 ! dec count
1519 ldub [%o0 + %o2], %o3 ! get byte at end of src
1520 bgu %ncc, .ov_bkwd ! loop till done
1521 stb %o3, [%o1 + %o2] ! delay slot, store at end of dst
1522
1523 retl ! return
1524 nop
1525
1526 SET_SIZE(ovbcopy)
1527
1528
1529 /*
1530 * hwblkpagecopy()
1531 *
1532 * Copies exactly one page. This routine assumes the caller (ppcopy)
1533 * has already disabled kernel preemption and has checked
1534 * use_hw_bcopy. Preventing preemption also prevents cpu migration.
1535 */
1536 ENTRY(hwblkpagecopy)
1537 ! get another window w/space for three aligned blocks of saved fpregs
1538 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1539
1540 ! %i0 - source address (arg)
1541 ! %i1 - destination address (arg)
1542 ! %i2 - length of region (not arg)
1543 ! %l0 - saved fprs
1544 ! %l1 - pointer to saved fpregs
1545
1546 rd %fprs, %l0 ! check for unused fp
1547 btst FPRS_FEF, %l0
1548 bz,a,pt %icc, 1f
1549 wr %g0, FPRS_FEF, %fprs
1550
1551 BST_FPQ1Q3_TOSTACK(%l1)
1552
1553 1: set PAGESIZE, CNT
1554 mov REALSRC, SRC
1555
1556 prefetch [SRC], #one_read
1557 prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
1558 prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
1559 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
1560 ldd [SRC], %f0
1561 #if CHEETAH_PREFETCH > 4
1562 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1563 #endif
1564 ldd [SRC + 0x08], %f2
1565 #if CHEETAH_PREFETCH > 5
1566 prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
1567 #endif
1568 ldd [SRC + 0x10], %f4
1569 #if CHEETAH_PREFETCH > 6
1570 prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
1571 #endif
1572 fsrc1 %f0, %f32
1573 ldd [SRC + 0x18], %f6
1574 #if CHEETAH_PREFETCH > 7
1575 prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
1576 #endif
1577 fsrc1 %f2, %f34
1578 ldd [SRC + 0x20], %f8
1579 fsrc1 %f4, %f36
1580 ldd [SRC + 0x28], %f10
1581 fsrc1 %f6, %f38
1582 ldd [SRC + 0x30], %f12
1583 fsrc1 %f8, %f40
1584 ldd [SRC + 0x38], %f14
1585 fsrc1 %f10, %f42
1586 ldd [SRC + VIS_BLOCKSIZE], %f0
1587 sub CNT, VIS_BLOCKSIZE, CNT
1588 add SRC, VIS_BLOCKSIZE, SRC
1589 ba,a,pt %ncc, 2f
1590 nop
1591 .align 16
1592 2:
1593 ldd [SRC + 0x08], %f2
1594 fsrc1 %f12, %f44
1595 ldd [SRC + 0x10], %f4
1596 fsrc1 %f14, %f46
1597 stda %f32, [DST]ASI_BLK_P
1598 ldd [SRC + 0x18], %f6
1599 fsrc1 %f0, %f32
1600 ldd [SRC + 0x20], %f8
1601 fsrc1 %f2, %f34
1602 ldd [SRC + 0x28], %f10
1603 fsrc1 %f4, %f36
1604 ldd [SRC + 0x30], %f12
1605 fsrc1 %f6, %f38
1606 ldd [SRC + 0x38], %f14
1607 fsrc1 %f8, %f40
1608 ldd [SRC + VIS_BLOCKSIZE], %f0
1609 fsrc1 %f10, %f42
1610 prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
1611 sub CNT, VIS_BLOCKSIZE, CNT
1612 add DST, VIS_BLOCKSIZE, DST
1613 cmp CNT, VIS_BLOCKSIZE + 8
1614 prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1615 bgu,pt %ncc, 2b
1616 add SRC, VIS_BLOCKSIZE, SRC
1617
1618 ! trailing block
1619 ldd [SRC + 0x08], %f2
1620 fsrc1 %f12, %f44
1621 ldd [SRC + 0x10], %f4
1622 fsrc1 %f14, %f46
1623 stda %f32, [DST]ASI_BLK_P
1624 ldd [SRC + 0x18], %f6
1625 fsrc1 %f0, %f32
1626 ldd [SRC + 0x20], %f8
1627 fsrc1 %f2, %f34
1628 ldd [SRC + 0x28], %f10
1629 fsrc1 %f4, %f36
1630 ldd [SRC + 0x30], %f12
1631 fsrc1 %f6, %f38
1632 ldd [SRC + 0x38], %f14
1633 fsrc1 %f8, %f40
1634 sub CNT, VIS_BLOCKSIZE, CNT
1635 add DST, VIS_BLOCKSIZE, DST
1636 add SRC, VIS_BLOCKSIZE, SRC
1637 fsrc1 %f10, %f42
1638 fsrc1 %f12, %f44
1639 fsrc1 %f14, %f46
1640 stda %f32, [DST]ASI_BLK_P
1641
1642 membar #Sync
1643
1644 FPRAS_INTERVAL(FPRAS_PGCOPY, 1, %l5, %o2, %o3, %o4, %o5, 8)
1645 FPRAS_REWRITE_TYPE1(1, %l5, %f32, %o2, 9)
1646 FPRAS_CHECK(FPRAS_PGCOPY, %l5, 9) ! lose outputs
1647
1648 btst FPRS_FEF, %l0
1649 bz,pt %icc, 2f
1650 nop
1651
1652 BLD_FPQ1Q3_FROMSTACK(%l3)
1653 ba 3f
1654 nop
1655
1656 2: FZEROQ1Q3
1657
1658 3: wr %l0, 0, %fprs ! restore fprs
1659 ret
1660 restore %g0, 0, %o0
1661
1662 SET_SIZE(hwblkpagecopy)
1663
1664
1665 /*
1666 * Transfer data to and from user space -
1667 * Note that these routines can cause faults
1668 * It is assumed that the kernel has nothing at
1669 * less than KERNELBASE in the virtual address space.
1670 *
1671 * Note that copyin(9F) and copyout(9F) are part of the
1672 * DDI/DKI which specifies that they return '-1' on "errors."
1673 *
1674 * Sigh.
1675 *
1676 * So there's two extremely similar routines - xcopyin() and xcopyout()
1677 * which return the errno that we've faithfully computed. This
1678 * allows other callers (e.g. uiomove(9F)) to work correctly.
1679 * Given that these are used pretty heavily, we expand the calling
1680 * sequences inline for all flavours (rather than making wrappers).
1681 *
1682 * There are also stub routines for xcopyout_little and xcopyin_little,
1683 * which currently are intended to handle requests of <= 16 bytes from
1684 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
1685 * is left as an exercise...
1686 */
1687
1688 /*
1689 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
1690 *
1691 * General theory of operation:
1692 *
1693 * The only difference between copy{in,out} and
1694 * xcopy{in,out} is in the error handling routine they invoke
1695 * when a memory access error occurs. xcopyOP returns the errno
1696 * while copyOP returns -1 (see above). copy{in,out}_noerr set
1697 * a special flag (by oring the TRAMP_FLAG into the fault handler address)
1698 * if they are called with a fault handler already in place. That flag
1699 * causes the default handlers to trampoline to the previous handler
1700 * upon an error.
1701 *
1702 * None of the copyops routines grab a window until it's decided that
1703 * we need to do a HW block copy operation. This saves a window
1704 * spill/fill when we're called during socket ops. The typical IO
1705 * path won't cause spill/fill traps.
1706 *
1707 * This code uses a set of 4 limits for the maximum size that will
1708 * be copied given a particular input/output address alignment.
1709 * If the value for a particular limit is zero, the copy will be performed
1710 * by the plain copy loops rather than FPBLK.
1711 *
1712 * See the description of bcopy above for more details of the
1713 * data copying algorithm and the default limits.
1714 *
1715 */
1716
1717 /*
1718 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
1719 */
1720
1721 /*
1722 * We save the arguments in the following registers in case of a fault:
1723 * kaddr - %l1
1724 * uaddr - %l2
1725 * count - %l3
1726 */
1727 #define SAVE_SRC %l1
1728 #define SAVE_DST %l2
1729 #define SAVE_COUNT %l3
1730
1731 #define SM_SAVE_SRC %g4
1732 #define SM_SAVE_DST %g5
1733 #define SM_SAVE_COUNT %o5
1734 #define ERRNO %l5
1735
1736
1737 #define REAL_LOFAULT %l4
1738 /*
1739 * Generic copyio fault handler. This is the first line of defense when a
1740 * fault occurs in (x)copyin/(x)copyout. In order for this to function
1741 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
1742 * This allows us to share common code for all the flavors of the copy
1743 * operations, including the _noerr versions.
1744 *
1745 * Note that this function will restore the original input parameters before
1746 * calling REAL_LOFAULT. So the real handler can vector to the appropriate
1747 * member of the t_copyop structure, if needed.
1748 */
1749 ENTRY(copyio_fault)
1750 membar #Sync
1751 mov %g1,ERRNO ! save errno in ERRNO
1752 btst FPUSED_FLAG, %l6
1753 bz %ncc, 1f
1754 nop
1755
1756 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
1757 wr %o2, 0, %gsr ! restore gsr
1758
1759 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1760 btst FPRS_FEF, %o3
1761 bz,pt %icc, 4f
1762 nop
1763
1764 BLD_FPQ2Q4_FROMSTACK(%o2)
1765
1766 ba,pt %ncc, 1f
1767 wr %o3, 0, %fprs ! restore fprs
1768
1769 4:
1770 FZEROQ2Q4
1771 wr %o3, 0, %fprs ! restore fprs
1772
1773 1:
1774 andn %l6, FPUSED_FLAG, %l6
1775 membar #Sync
1776 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1777 FP_ALLOWMIGRATE(5, 6)
1778
1779 mov SAVE_SRC, %i0
1780 mov SAVE_DST, %i1
1781 jmp REAL_LOFAULT
1782 mov SAVE_COUNT, %i2
1783
1784 SET_SIZE(copyio_fault)
1785
1786
/*
 * Copy kernel data out to user space (copyout).
 *
 *	int copyout(const void *kaddr, void *uaddr, size_t count)
 *	%o0 = kernel source, %o1 = user destination, %o2 = byte count
 *
 * Lengths <= VIS_COPY_THRESHOLD, and larger lengths whose src/dst
 * alignment falls below the matching hw_copy_limit_N tunable, use the
 * in-line leaf path (.copyout_small); %o4 holds the caller's t_lofault
 * while .sm_copyout_err is installed.  Larger, better-aligned copies
 * branch to .copyout_more (FP/VIS block copy).  Returns 0 on success,
 * -1 on fault (or whatever a registered copyops handler returns).
 */
	ENTRY(copyout)

	cmp	%o2, VIS_COPY_THRESHOLD	! check for leaf rtn case
	bleu,pt	%ncc, .copyout_small	! go to smaller cases
	xor	%o0, %o1, %o3		! are src, dst alignable?
	btst	7, %o3			!
	bz,pt	%ncc, .copyout_8	! check for longword alignment
	nop
	btst	1, %o3			!
	bz,pt	%ncc, .copyout_2	! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyout_small	! go to small copy
	nop
	ba,pt	%ncc, .copyout_more	! otherwise go to large copy
	nop
.copyout_2:
	btst	3, %o3			!
	bz,pt	%ncc, .copyout_4	! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyout_small	! go to small copy
	nop
	ba,pt	%ncc, .copyout_more	! otherwise go to large copy
	nop
.copyout_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyout_small	! go to small copy
	nop
	ba,pt	%ncc, .copyout_more	! otherwise go to large copy
	nop
.copyout_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyout_small	! go to small copy
	nop
	ba,pt	%ncc, .copyout_more	! otherwise go to large copy
	nop

	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.copyout_small:
	sethi	%hi(.sm_copyout_err), %o5	! .sm_copyout_err is lofault
	or	%o5, %lo(.sm_copyout_err), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
	membar	#Sync			! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
.sm_do_copyout:
	mov	%o0, SM_SAVE_SRC
	mov	%o1, SM_SAVE_DST
	cmp	%o2, SHORTCOPY		! check for really short case
	bleu,pt	%ncc, .co_sm_left	!
	mov	%o2, SM_SAVE_COUNT
	cmp	%o2, CHKSIZE		! check for medium length cases
	bgu,pn	%ncc, .co_med		!
	or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .co_sm_word	! branch to word aligned case
.co_sm_movebytes:
	sub	%o2, 3, %o2		! adjust count to allow cc zero test
.co_sm_notalign4:
	ldub	[%o0], %o3		! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	stba	%o3, [%o1]ASI_USER	! write byte
	inc	%o1			! advance DST by 1
	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
	add	%o0, 4, %o0		! advance SRC by 4
	stba	%o3, [%o1]ASI_USER
	inc	%o1			! advance DST by 1
	ldub	[%o0 - 2], %o3
	stba	%o3, [%o1]ASI_USER
	inc	%o1			! advance DST by 1
	ldub	[%o0 - 1], %o3
	stba	%o3, [%o1]ASI_USER
	bgt,pt	%ncc, .co_sm_notalign4	! loop til 3 or fewer bytes remain
	inc	%o1			! advance DST by 1
	add	%o2, 3, %o2		! restore count
.co_sm_left:
	tst	%o2
	bz,pt	%ncc, .co_sm_exit	! check for zero length
	nop
	ldub	[%o0], %o3		! load one byte
	deccc	%o2			! reduce count for cc test
	bz,pt	%ncc, .co_sm_exit
	stba	%o3,[%o1]ASI_USER	! store one byte
	ldub	[%o0 + 1], %o3		! load second byte
	deccc	%o2
	inc	%o1
	bz,pt	%ncc, .co_sm_exit
	stba	%o3,[%o1]ASI_USER	! store second byte
	ldub	[%o0 + 2], %o3		! load third byte
	inc	%o1
	stba	%o3,[%o1]ASI_USER	! store third byte
	membar	#Sync			! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0
	.align	16
.co_sm_words:
	lduw	[%o0], %o3		! read word
.co_sm_wordx:
	subcc	%o2, 8, %o2		! update count
	stwa	%o3, [%o1]ASI_USER	! write word
	add	%o0, 8, %o0		! update SRC
	lduw	[%o0 - 4], %o3		! read word
	add	%o1, 4, %o1		! update DST
	stwa	%o3, [%o1]ASI_USER	! write word
	bgt,pt	%ncc, .co_sm_words	! loop til done
	add	%o1, 4, %o1		! update DST
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .co_sm_exit
	nop
	deccc	%o2
	bz,pt	%ncc, .co_sm_byte
.co_sm_half:
	subcc	%o2, 2, %o2		! reduce count by 2
	lduh	[%o0], %o3		! read half word
	add	%o0, 2, %o0		! advance SRC by 2
	stha	%o3, [%o1]ASI_USER	! write half word
	bgt,pt	%ncc, .co_sm_half	! loop til done
	add	%o1, 2, %o1		! advance DST by 2
	addcc	%o2, 1, %o2		! restore count
	bz,pt	%ncc, .co_sm_exit
	nop
.co_sm_byte:
	ldub	[%o0], %o3
	stba	%o3, [%o1]ASI_USER
	membar	#Sync			! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0
	.align	16
.co_sm_word:
	subcc	%o2, 4, %o2		! update count
	bgt,pt	%ncc, .co_sm_wordx
	lduw	[%o0], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .co_sm_exit
	stwa	%o3, [%o1]ASI_USER	! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o0 + 4], %o3		! load one byte
	add	%o1, 4, %o1
	bz,pt	%ncc, .co_sm_exit
	stba	%o3, [%o1]ASI_USER	! store one byte
	ldub	[%o0 + 5], %o3		! load second byte
	deccc	%o2
	inc	%o1
	bz,pt	%ncc, .co_sm_exit
	stba	%o3, [%o1]ASI_USER	! store second byte
	ldub	[%o0 + 6], %o3		! load third byte
	inc	%o1
	stba	%o3, [%o1]ASI_USER	! store third byte
.co_sm_exit:
	membar	#Sync			! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0

	.align 16
.co_med:
	xor	%o0, %o1, %o3		! setup alignment check
	btst	1, %o3
	bnz,pt	%ncc, .co_sm_movebytes	! unaligned
	nop
	btst	3, %o3
	bnz,pt	%ncc, .co_med_half	! halfword aligned
	nop
	btst	7, %o3
	bnz,pt	%ncc, .co_med_word	! word aligned
	nop
.co_med_long:
	btst	3, %o0			! check for
	bz,pt	%ncc, .co_med_long1	! word alignment
	nop
.co_med_long0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stba	%o3,[%o1]ASI_USER	! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .co_med_long0
	dec	%o2
.co_med_long1:			! word aligned
	btst	7, %o0			! check for long word
	bz,pt	%ncc, .co_med_long2
	nop
	lduw	[%o0], %o3		! load word
	add	%o0, 4, %o0		! advance SRC by 4
	stwa	%o3, [%o1]ASI_USER	! store word
	add	%o1, 4, %o1		! advance DST by 4
	sub	%o2, 4, %o2		! reduce count by 4
!
!  Now long word aligned and have at least 32 bytes to move
!
.co_med_long2:
	sub	%o2, 31, %o2		! adjust count to allow cc zero test
	sub	%o1, 8, %o1		! adjust pointer to allow store in
					! branch delay slot instead of add
.co_med_lmove:
	add	%o1, 8, %o1		! advance DST by 8
	ldx	[%o0], %o3		! read long word
	subcc	%o2, 32, %o2		! reduce count by 32
	stxa	%o3, [%o1]ASI_USER	! write long word
	add	%o1, 8, %o1		! advance DST by 8
	ldx	[%o0 + 8], %o3		! repeat for a total of 4 long words
	add	%o0, 32, %o0		! advance SRC by 32
	stxa	%o3, [%o1]ASI_USER
	ldx	[%o0 - 16], %o3
	add	%o1, 8, %o1		! advance DST by 8
	stxa	%o3, [%o1]ASI_USER
	ldx	[%o0 - 8], %o3
	add	%o1, 8, %o1		! advance DST by 8
	bgt,pt	%ncc, .co_med_lmove	! loop til 31 or fewer bytes left
	stxa	%o3, [%o1]ASI_USER
	add	%o1, 8, %o1		! advance DST by 8
	addcc	%o2, 24, %o2		! restore count to long word offset
	ble,pt	%ncc, .co_med_lextra	! check for more long words to move
	nop
.co_med_lword:
	ldx	[%o0], %o3		! read long word
	subcc	%o2, 8, %o2		! reduce count by 8
	stxa	%o3, [%o1]ASI_USER	! write long word
	add	%o0, 8, %o0		! advance SRC by 8
	bgt,pt	%ncc, .co_med_lword	! loop til 7 or fewer bytes left
	add	%o1, 8, %o1		! advance DST by 8
.co_med_lextra:
	addcc	%o2, 7, %o2		! restore rest of count
	bz,pt	%ncc, .co_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .co_sm_byte
	nop
	ba,pt	%ncc, .co_sm_half
	nop

	.align 16
	nop				! instruction alignment
					! see discussion at start of file
.co_med_word:
	btst	3, %o0			! check for
	bz,pt	%ncc, .co_med_word1	! word alignment
	nop
.co_med_word0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stba	%o3,[%o1]ASI_USER	! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .co_med_word0
	dec	%o2
!
!  Now word aligned and have at least 36 bytes to move
!
.co_med_word1:
	sub	%o2, 15, %o2		! adjust count to allow cc zero test
.co_med_wmove:
	lduw	[%o0], %o3		! read word
	subcc	%o2, 16, %o2		! reduce count by 16
	stwa	%o3, [%o1]ASI_USER	! write word
	add	%o1, 4, %o1		! advance DST by 4
	lduw	[%o0 + 4], %o3		! repeat for a total of 4 words
	add	%o0, 16, %o0		! advance SRC by 16
	stwa	%o3, [%o1]ASI_USER
	add	%o1, 4, %o1		! advance DST by 4
	lduw	[%o0 - 8], %o3
	stwa	%o3, [%o1]ASI_USER
	add	%o1, 4, %o1		! advance DST by 4
	lduw	[%o0 - 4], %o3
	stwa	%o3, [%o1]ASI_USER
	bgt,pt	%ncc, .co_med_wmove	! loop til 15 or fewer bytes left
	add	%o1, 4, %o1		! advance DST by 4
	addcc	%o2, 12, %o2		! restore count to word offset
	ble,pt	%ncc, .co_med_wextra	! check for more words to move
	nop
.co_med_word2:
	lduw	[%o0], %o3		! read word
	subcc	%o2, 4, %o2		! reduce count by 4
	stwa	%o3, [%o1]ASI_USER	! write word
	add	%o0, 4, %o0		! advance SRC by 4
	bgt,pt	%ncc, .co_med_word2	! loop til 3 or fewer bytes left
	add	%o1, 4, %o1		! advance DST by 4
.co_med_wextra:
	addcc	%o2, 3, %o2		! restore rest of count
	bz,pt	%ncc, .co_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .co_sm_byte
	nop
	ba,pt	%ncc, .co_sm_half
	nop

	.align 16
	nop				! instruction alignment
	nop				! see discussion at start of file
	nop
.co_med_half:
	btst	1, %o0			! check for
	bz,pt	%ncc, .co_med_half1	! half word alignment
	nop
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stba	%o3,[%o1]ASI_USER	! store byte
	inc	%o1
	dec	%o2
!
!  Now half word aligned and have at least 38 bytes to move
!
.co_med_half1:
	sub	%o2, 7, %o2		! adjust count to allow cc zero test
.co_med_hmove:
	lduh	[%o0], %o3		! read half word
	subcc	%o2, 8, %o2		! reduce count by 8
	stha	%o3, [%o1]ASI_USER	! write half word
	add	%o1, 2, %o1		! advance DST by 2
	lduh	[%o0 + 2], %o3		! repeat for a total of 4 halfwords
	add	%o0, 8, %o0		! advance SRC by 8
	stha	%o3, [%o1]ASI_USER
	add	%o1, 2, %o1		! advance DST by 2
	lduh	[%o0 - 4], %o3
	stha	%o3, [%o1]ASI_USER
	add	%o1, 2, %o1		! advance DST by 2
	lduh	[%o0 - 2], %o3
	stha	%o3, [%o1]ASI_USER
	bgt,pt	%ncc, .co_med_hmove	! loop til 7 or fewer bytes left
	add	%o1, 2, %o1		! advance DST by 2
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .co_sm_exit
	deccc	%o2
	bz,pt	%ncc, .co_sm_byte
	nop
	ba,pt	%ncc, .co_sm_half
	nop

/*
 * We got here because of a fault during short copyout.
 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
 */
.sm_copyout_err:
	membar	#Sync
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	mov	SM_SAVE_SRC, %o0
	mov	SM_SAVE_DST, %o1
	mov	SM_SAVE_COUNT, %o2
	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
	tst	%o3
	bz,pt	%ncc, 3f		! if not, return error
	nop
	ldn	[%o3 + CP_COPYOUT], %o5	! if handler, invoke it with
	jmp	%o5			! original arguments
	nop
3:
	retl
	or	%g0, -1, %o0		! return error value

	SET_SIZE(copyout)
2157
2158 /*
2159 * The _more entry points are not intended to be used directly by
2160 * any caller from outside this file. They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
2162 * the floating point registers.
2163 * This entry is particularly important as DTRACE (at least as of
2164 * 4/2004) does not support leaf functions.
2165 */
2166
/*
 * Large-copy path for copyout (length > VIS_COPY_THRESHOLD and above the
 * applicable hw_copy_limit_N).  Saves/installs copyio_fault as t_lofault
 * (with FPUSED_FLAG recorded in %l6), pins the thread with FP_NOMIGRATE,
 * saves %fprs/%gsr (and the FP queue registers if the FPU was live),
 * byte-aligns DST to a VIS_BLOCKSIZE boundary, then streams 64-byte
 * blocks via faligndata/stda to ASI_BLK_AIUS with software prefetch.
 * On success restores FP state and returns 0 via .copyout_exit.
 */
	ENTRY(copyout_more)
.copyout_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	set	.copyout_err, REAL_LOFAULT

	/*
	 * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
	 */
.do_copyout:
	set	copyio_fault, %l7	! copyio_fault is lofault val

	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
	membar	#Sync			! sync error barrier
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault

	mov	%i0, SAVE_SRC
	mov	%i1, SAVE_DST
	mov	%i2, SAVE_COUNT

	FP_NOMIGRATE(6, 7)

	rd	%fprs, %o2		! check for unused fp
	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
	btst	FPRS_FEF, %o2
	bz,a,pt	%icc, .do_blockcopyout
	wr	%g0, FPRS_FEF, %fprs

	BST_FPQ2Q4_TOSTACK(%o2)

.do_blockcopyout:
	rd	%gsr, %o2
	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
	or	%l6, FPUSED_FLAG, %l6

	andcc	DST, VIS_BLOCKSIZE - 1, TMP
	mov	ASI_USER, %asi
	bz,pt	%ncc, 2f
	neg	TMP
	add	TMP, VIS_BLOCKSIZE, TMP

	! TMP = bytes required to align DST on FP_BLOCK boundary
	! Using SRC as a tmp here
	cmp	TMP, 3
	bleu,pt	%ncc, 1f
	sub	CNT,TMP,CNT		! adjust main count
	sub	TMP, 3, TMP		! adjust for end of loop test
.co_blkalign:
	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
	stba	SRC, [DST]%asi
	subcc	TMP, 4, TMP
	ldub	[REALSRC + 1], SRC
	add	REALSRC, 4, REALSRC
	stba	SRC, [DST + 1]%asi
	ldub	[REALSRC - 2], SRC
	add	DST, 4, DST
	stba	SRC, [DST - 2]%asi
	ldub	[REALSRC - 1], SRC
	bgu,pt	%ncc, .co_blkalign
	stba	SRC, [DST - 1]%asi

	addcc	TMP, 3, TMP		! restore count adjustment
	bz,pt	%ncc, 2f		! no bytes left?
	nop
1:	ldub	[REALSRC], SRC
	inc	REALSRC
	inc	DST
	deccc	TMP
	bgu	%ncc, 1b
	stba	SRC, [DST - 1]%asi

2:
	andn	REALSRC, 0x7, SRC
	alignaddr	REALSRC, %g0, %g0

	! SRC - 8-byte aligned
	! DST - 64-byte aligned
	prefetch [SRC], #one_read
	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
	ldd	[SRC], %f16
#if CHEETAH_PREFETCH > 4
	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
#endif
	ldd	[SRC + 0x08], %f18
#if CHEETAH_PREFETCH > 5
	prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
#endif
	ldd	[SRC + 0x10], %f20
#if CHEETAH_PREFETCH > 6
	prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
#endif
	faligndata %f16, %f18, %f48
	ldd	[SRC + 0x18], %f22
#if CHEETAH_PREFETCH > 7
	prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
#endif
	faligndata %f18, %f20, %f50
	ldd	[SRC + 0x20], %f24
	faligndata %f20, %f22, %f52
	ldd	[SRC + 0x28], %f26
	faligndata %f22, %f24, %f54
	ldd	[SRC + 0x30], %f28
	faligndata %f24, %f26, %f56
	ldd	[SRC + 0x38], %f30
	faligndata %f26, %f28, %f58
	ldd	[SRC + VIS_BLOCKSIZE], %f16
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	ba,a,pt	%ncc, 1f
	nop
	.align	16
1:
	ldd	[SRC + 0x08], %f18
	faligndata %f28, %f30, %f60
	ldd	[SRC + 0x10], %f20
	faligndata %f30, %f16, %f62
	stda	%f48, [DST]ASI_BLK_AIUS
	ldd	[SRC + 0x18], %f22
	faligndata %f16, %f18, %f48
	ldd	[SRC + 0x20], %f24
	faligndata %f18, %f20, %f50
	ldd	[SRC + 0x28], %f26
	faligndata %f20, %f22, %f52
	ldd	[SRC + 0x30], %f28
	faligndata %f22, %f24, %f54
	ldd	[SRC + 0x38], %f30
	faligndata %f24, %f26, %f56
	sub	CNT, VIS_BLOCKSIZE, CNT
	ldd	[SRC + VIS_BLOCKSIZE], %f16
	faligndata %f26, %f28, %f58
	prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
	add	DST, VIS_BLOCKSIZE, DST
	prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	cmp	CNT, VIS_BLOCKSIZE + 8
	bgu,pt	%ncc, 1b
	add	SRC, VIS_BLOCKSIZE, SRC

	! only if REALSRC & 0x7 is 0
	cmp	CNT, VIS_BLOCKSIZE
	bne	%ncc, 3f
	andcc	REALSRC, 0x7, %g0
	bz,pt	%ncc, 2f
	nop
3:
	faligndata %f28, %f30, %f60
	faligndata %f30, %f16, %f62
	stda	%f48, [DST]ASI_BLK_AIUS
	add	DST, VIS_BLOCKSIZE, DST
	ba,pt	%ncc, 3f
	nop
2:
	ldd	[SRC + 0x08], %f18
	fsrc1	%f28, %f60
	ldd	[SRC + 0x10], %f20
	fsrc1	%f30, %f62
	stda	%f48, [DST]ASI_BLK_AIUS
	ldd	[SRC + 0x18], %f22
	fsrc1	%f16, %f48
	ldd	[SRC + 0x20], %f24
	fsrc1	%f18, %f50
	ldd	[SRC + 0x28], %f26
	fsrc1	%f20, %f52
	ldd	[SRC + 0x30], %f28
	fsrc1	%f22, %f54
	ldd	[SRC + 0x38], %f30
	fsrc1	%f24, %f56
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	fsrc1	%f26, %f58
	fsrc1	%f28, %f60
	fsrc1	%f30, %f62
	stda	%f48, [DST]ASI_BLK_AIUS
	add	DST, VIS_BLOCKSIZE, DST
	ba,a,pt	%ncc, 4f
	nop

3:	tst	CNT
	bz,a	%ncc, 4f
	nop

5:	ldub	[REALSRC], TMP
	inc	REALSRC
	inc	DST
	deccc	CNT
	bgu	%ncc, 5b
	stba	TMP, [DST - 1]%asi
4:

.copyout_exit:
	membar	#Sync

	FPRAS_INTERVAL(FPRAS_COPYOUT, 0, %l5, %o2, %o3, %o4, %o5, 8)
	FPRAS_REWRITE_TYPE2Q2(0, %l5, %o2, %o3, 8, 9)
	FPRAS_CHECK(FPRAS_COPYOUT, %l5, 9)	! lose outputs

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
	wr	%o2, 0, %gsr		! restore gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	nop

	BLD_FPQ2Q4_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	wr	%o3, 0, %fprs		! restore fprs

4:
	FZEROQ2Q4
	wr	%o3, 0, %fprs		! restore fprs

1:
	membar	#Sync
	andn	%l6, FPUSED_FLAG, %l6
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)
	ret
	restore	%g0, 0, %o0

/*
 * We got here because of a fault during copyout.
 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
 */
.copyout_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
	tst	%o4
	bz,pt	%ncc, 2f		! if not, return error
	nop
	ldn	[%o4 + CP_COPYOUT], %g2	! if handler, invoke it with
	jmp	%g2			! original arguments
	restore	%g0, 0, %g0		! dispose of copy window
2:
	ret
	restore	%g0, -1, %o0		! return error value


	SET_SIZE(copyout_more)
2410
2411
/*
 * xcopyout - like copyout, but on fault the error paths (.xcopyout_err /
 * .sm_xcopyout_err) return the errno value (in ERRNO / %g1) instead of
 * -1, unless a registered copyops handler takes over.  The copy itself
 * is shared: small cases branch into .sm_do_copyout, large cases into
 * .do_copyout, each with an xcopyout-specific lofault handler installed.
 */
	ENTRY(xcopyout)
	cmp	%o2, VIS_COPY_THRESHOLD	! check for leaf rtn case
	bleu,pt	%ncc, .xcopyout_small	! go to smaller cases
	xor	%o0, %o1, %o3		! are src, dst alignable?
	btst	7, %o3			!
	bz,pt	%ncc, .xcopyout_8	! check for longword alignment
	nop
	btst	1, %o3			!
	bz,pt	%ncc, .xcopyout_2	! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyout_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .xcopyout_small	! go to small copy
	nop
	ba,pt	%ncc, .xcopyout_more	! otherwise go to large copy
	nop
.xcopyout_2:
	btst	3, %o3			!
	bz,pt	%ncc, .xcopyout_4	! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyout_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .xcopyout_small	! go to small copy
	nop
	ba,pt	%ncc, .xcopyout_more	! otherwise go to large copy
	nop
.xcopyout_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyout_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .xcopyout_small	! go to small copy
	nop
	ba,pt	%ncc, .xcopyout_more	! otherwise go to large copy
	nop
.xcopyout_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyout_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .xcopyout_small	! go to small copy
	nop
	ba,pt	%ncc, .xcopyout_more	! otherwise go to large copy
	nop

.xcopyout_small:
	sethi	%hi(.sm_xcopyout_err), %o5	! .sm_xcopyout_err is lofault
	or	%o5, %lo(.sm_xcopyout_err), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
	membar	#Sync			! sync error barrier
	ba,pt	%ncc, .sm_do_copyout	! common code
	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault

.xcopyout_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.xcopyout_err), REAL_LOFAULT
	ba,pt	%ncc, .do_copyout	! common code
	or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT

/*
 * We got here because of fault during xcopyout
 * Errno value is in ERRNO
 */
.xcopyout_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
	tst	%o4
	bz,pt	%ncc, 2f		! if not, return error
	nop
	ldn	[%o4 + CP_XCOPYOUT], %g2	! if handler, invoke it with
	jmp	%g2			! original arguments
	restore	%g0, 0, %g0		! dispose of copy window
2:
	ret
	restore	ERRNO, 0, %o0		! return errno value

.sm_xcopyout_err:

	membar	#Sync
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	mov	SM_SAVE_SRC, %o0
	mov	SM_SAVE_DST, %o1
	mov	SM_SAVE_COUNT, %o2
	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
	tst	%o3
	bz,pt	%ncc, 3f		! if not, return error
	nop
	ldn	[%o3 + CP_XCOPYOUT], %o5	! if handler, invoke it with
	jmp	%o5			! original arguments
	nop
3:
	retl
	or	%g1, 0, %o0		! return errno value

	SET_SIZE(xcopyout)
2515
/*
 * xcopyout_little - copy %o2 bytes from kernel %o0 to user %o1 through
 * the little-endian user ASI (ASI_AIUSL), reversing byte order: the
 * source is read from its last byte down to its first while the
 * destination is written in ascending order.  %o3 starts at -count and
 * counts up toward zero; both pointers are pre-biased past their
 * buffers so [reg + %o3] indexes the current byte.  Faults vector to
 * .xcopyio_err (installed as t_lofault here, defined elsewhere in this
 * file); returns 0 on success.
 */
	ENTRY(xcopyout_little)
	sethi	%hi(.xcopyio_err), %o5
	or	%o5, %lo(.xcopyio_err), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
	membar	#Sync			! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]
	mov	%o4, %o5		! keep saved handler in %o5

	subcc	%g0, %o2, %o3		! %o3 = -count (loop index)
	add	%o0, %o2, %o0
	bz,pn	%ncc, 2f		! check for zero bytes
	sub	%o2, 1, %o4
	add	%o0, %o4, %o0		! start w/last byte
	add	%o1, %o2, %o1
	ldub	[%o0 + %o3], %o4

1:	stba	%o4, [%o1 + %o3]ASI_AIUSL
	inccc	%o3
	sub	%o0, 2, %o0		! get next byte
	bcc,a,pt %ncc, 1b
	ldub	[%o0 + %o3], %o4

2:
	membar	#Sync			! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return (0)

	SET_SIZE(xcopyout_little)
2545
2546 /*
2547 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
2548 */
2549
/*
 * Copy user data in to kernel space (copyin).
 *
 *	int copyin(const void *uaddr, void *kaddr, size_t count)
 *	%o0 = user source, %o1 = kernel destination, %o2 = byte count
 *
 * Mirror image of copyout: user-side accesses are the loads (lduba/
 * lduha/lduwa/ldxa from ASI_USER) and the kernel side is written with
 * normal stores.  Small/poorly-aligned copies use the in-line leaf path
 * (.copyin_small, %o4 holding the saved t_lofault); larger, aligned
 * copies above hw_copy_limit_N branch to .copyin_more.  Returns 0 on
 * success, -1 on fault (or the result of a registered copyops handler).
 */
	ENTRY(copyin)
	cmp	%o2, VIS_COPY_THRESHOLD	! check for leaf rtn case
	bleu,pt	%ncc, .copyin_small	! go to smaller cases
	xor	%o0, %o1, %o3		! are src, dst alignable?
	btst	7, %o3			!
	bz,pt	%ncc, .copyin_8		! check for longword alignment
	nop
	btst	1, %o3			!
	bz,pt	%ncc, .copyin_2		! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .copyin_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyin_small	! go to small copy
	nop
	ba,pt	%ncc, .copyin_more	! otherwise go to large copy
	nop
.copyin_2:
	btst	3, %o3			!
	bz,pt	%ncc, .copyin_4		! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .copyin_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyin_small	! go to small copy
	nop
	ba,pt	%ncc, .copyin_more	! otherwise go to large copy
	nop
.copyin_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .copyin_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyin_small	! go to small copy
	nop
	ba,pt	%ncc, .copyin_more	! otherwise go to large copy
	nop
.copyin_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .copyin_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyin_small	! go to small copy
	nop
	ba,pt	%ncc, .copyin_more	! otherwise go to large copy
	nop

	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.copyin_small:
	sethi	%hi(.sm_copyin_err), %o5	! .sm_copyin_err is lofault
	or	%o5, %lo(.sm_copyin_err), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofault, no tramp
	membar	#Sync			! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]
.sm_do_copyin:
	mov	%o0, SM_SAVE_SRC
	mov	%o1, SM_SAVE_DST
	cmp	%o2, SHORTCOPY		! check for really short case
	bleu,pt	%ncc, .ci_sm_left	!
	mov	%o2, SM_SAVE_COUNT
	cmp	%o2, CHKSIZE		! check for medium length cases
	bgu,pn	%ncc, .ci_med		!
	or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .ci_sm_word	! branch to word aligned case
.ci_sm_movebytes:
	sub	%o2, 3, %o2		! adjust count to allow cc zero test
.ci_sm_notalign4:
	lduba	[%o0]ASI_USER, %o3	! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	stb	%o3, [%o1]		! write byte
	add	%o0, 1, %o0		! advance SRC by 1
	lduba	[%o0]ASI_USER, %o3	! repeat for a total of 4 bytes
	add	%o0, 1, %o0		! advance SRC by 1
	stb	%o3, [%o1 + 1]
	add	%o1, 4, %o1		! advance DST by 4
	lduba	[%o0]ASI_USER, %o3
	add	%o0, 1, %o0		! advance SRC by 1
	stb	%o3, [%o1 - 2]
	lduba	[%o0]ASI_USER, %o3
	add	%o0, 1, %o0		! advance SRC by 1
	bgt,pt	%ncc, .ci_sm_notalign4	! loop til 3 or fewer bytes remain
	stb	%o3, [%o1 - 1]
	add	%o2, 3, %o2		! restore count
.ci_sm_left:
	tst	%o2
	bz,pt	%ncc, .ci_sm_exit	! check for zero length
	nop
	lduba	[%o0]ASI_USER, %o3	! load one byte
	deccc	%o2			! reduce count for cc test
	bz,pt	%ncc, .ci_sm_exit
	stb	%o3,[%o1]		! store one byte
	inc	%o0
	lduba	[%o0]ASI_USER, %o3	! load second byte
	deccc	%o2
	bz,pt	%ncc, .ci_sm_exit
	stb	%o3,[%o1 + 1]		! store second byte
	inc	%o0
	lduba	[%o0]ASI_USER, %o3	! load third byte
	stb	%o3,[%o1 + 2]		! store third byte
	membar	#Sync			! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0
	.align	16
.ci_sm_words:
	lduwa	[%o0]ASI_USER, %o3	! read word
.ci_sm_wordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o1]		! write word
	add	%o0, 4, %o0		! update SRC
	add	%o1, 8, %o1		! update DST
	lduwa	[%o0]ASI_USER, %o3	! read word
	add	%o0, 4, %o0		! update SRC
	bgt,pt	%ncc, .ci_sm_words	! loop til done
	stw	%o3, [%o1 - 4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .ci_sm_exit
	nop
	deccc	%o2
	bz,pt	%ncc, .ci_sm_byte
.ci_sm_half:
	subcc	%o2, 2, %o2		! reduce count by 2
	lduha	[%o0]ASI_USER, %o3	! read half word
	add	%o0, 2, %o0		! advance SRC by 2
	add	%o1, 2, %o1		! advance DST by 2
	bgt,pt	%ncc, .ci_sm_half	! loop til done
	sth	%o3, [%o1 - 2]		! write half word
	addcc	%o2, 1, %o2		! restore count
	bz,pt	%ncc, .ci_sm_exit
	nop
.ci_sm_byte:
	lduba	[%o0]ASI_USER, %o3
	stb	%o3, [%o1]
	membar	#Sync			! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0
	.align	16
.ci_sm_word:
	subcc	%o2, 4, %o2		! update count
	bgt,pt	%ncc, .ci_sm_wordx
	lduwa	[%o0]ASI_USER, %o3	! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .ci_sm_exit
	stw	%o3, [%o1]		! write word
	deccc	%o2			! reduce count for cc test
	add	%o0, 4, %o0
	lduba	[%o0]ASI_USER, %o3	! load one byte
	bz,pt	%ncc, .ci_sm_exit
	stb	%o3, [%o1 + 4]		! store one byte
	inc	%o0
	lduba	[%o0]ASI_USER, %o3	! load second byte
	deccc	%o2
	bz,pt	%ncc, .ci_sm_exit
	stb	%o3, [%o1 + 5]		! store second byte
	inc	%o0
	lduba	[%o0]ASI_USER, %o3	! load third byte
	stb	%o3, [%o1 + 6]		! store third byte
.ci_sm_exit:
	membar	#Sync			! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0

	.align	16
.ci_med:
	xor	%o0, %o1, %o3		! setup alignment check
	btst	1, %o3
	bnz,pt	%ncc, .ci_sm_movebytes	! unaligned
	nop
	btst	3, %o3
	bnz,pt	%ncc, .ci_med_half	! halfword aligned
	nop
	btst	7, %o3
	bnz,pt	%ncc, .ci_med_word	! word aligned
	nop
.ci_med_long:
	btst	3, %o0			! check for
	bz,pt	%ncc, .ci_med_long1	! word alignment
	nop
.ci_med_long0:
	lduba	[%o0]ASI_USER, %o3	! load one byte
	inc	%o0
	stb	%o3,[%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .ci_med_long0
	dec	%o2
.ci_med_long1:			! word aligned
	btst	7, %o0			! check for long word
	bz,pt	%ncc, .ci_med_long2
	nop
	lduwa	[%o0]ASI_USER, %o3	! load word
	add	%o0, 4, %o0		! advance SRC by 4
	stw	%o3, [%o1]		! store word
	add	%o1, 4, %o1		! advance DST by 4
	sub	%o2, 4, %o2		! reduce count by 4
!
!  Now long word aligned and have at least 32 bytes to move
!
.ci_med_long2:
	sub	%o2, 31, %o2		! adjust count to allow cc zero test
.ci_med_lmove:
	ldxa	[%o0]ASI_USER, %o3	! read long word
	subcc	%o2, 32, %o2		! reduce count by 32
	stx	%o3, [%o1]		! write long word
	add	%o0, 8, %o0		! advance SRC by 8
	ldxa	[%o0]ASI_USER, %o3	! repeat for a total of 4 long words
	add	%o0, 8, %o0		! advance SRC by 8
	stx	%o3, [%o1 + 8]
	add	%o1, 32, %o1		! advance DST by 32
	ldxa	[%o0]ASI_USER, %o3
	add	%o0, 8, %o0		! advance SRC by 8
	stx	%o3, [%o1 - 16]
	ldxa	[%o0]ASI_USER, %o3
	add	%o0, 8, %o0		! advance SRC by 8
	bgt,pt	%ncc, .ci_med_lmove	! loop til 31 or fewer bytes left
	stx	%o3, [%o1 - 8]
	addcc	%o2, 24, %o2		! restore count to long word offset
	ble,pt	%ncc, .ci_med_lextra	! check for more long words to move
	nop
.ci_med_lword:
	ldxa	[%o0]ASI_USER, %o3	! read long word
	subcc	%o2, 8, %o2		! reduce count by 8
	stx	%o3, [%o1]		! write long word
	add	%o0, 8, %o0		! advance SRC by 8
	bgt,pt	%ncc, .ci_med_lword	! loop til 7 or fewer bytes left
	add	%o1, 8, %o1		! advance DST by 8
.ci_med_lextra:
	addcc	%o2, 7, %o2		! restore rest of count
	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .ci_sm_byte
	nop
	ba,pt	%ncc, .ci_sm_half
	nop

	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.ci_med_word:
	btst	3, %o0			! check for
	bz,pt	%ncc, .ci_med_word1	! word alignment
	nop
.ci_med_word0:
	lduba	[%o0]ASI_USER, %o3	! load one byte
	inc	%o0
	stb	%o3,[%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .ci_med_word0
	dec	%o2
!
!  Now word aligned and have at least 36 bytes to move
!
.ci_med_word1:
	sub	%o2, 15, %o2		! adjust count to allow cc zero test
.ci_med_wmove:
	lduwa	[%o0]ASI_USER, %o3	! read word
	subcc	%o2, 16, %o2		! reduce count by 16
	stw	%o3, [%o1]		! write word
	add	%o0, 4, %o0		! advance SRC by 4
	lduwa	[%o0]ASI_USER, %o3	! repeat for a total of 4 words
	add	%o0, 4, %o0		! advance SRC by 4
	stw	%o3, [%o1 + 4]
	add	%o1, 16, %o1		! advance DST by 16
	lduwa	[%o0]ASI_USER, %o3
	add	%o0, 4, %o0		! advance SRC by 4
	stw	%o3, [%o1 - 8]
	lduwa	[%o0]ASI_USER, %o3
	add	%o0, 4, %o0		! advance SRC by 4
	bgt,pt	%ncc, .ci_med_wmove	! loop til 15 or fewer bytes left
	stw	%o3, [%o1 - 4]
	addcc	%o2, 12, %o2		! restore count to word offset
	ble,pt	%ncc, .ci_med_wextra	! check for more words to move
	nop
.ci_med_word2:
	lduwa	[%o0]ASI_USER, %o3	! read word
	subcc	%o2, 4, %o2		! reduce count by 4
	stw	%o3, [%o1]		! write word
	add	%o0, 4, %o0		! advance SRC by 4
	bgt,pt	%ncc, .ci_med_word2	! loop til 3 or fewer bytes left
	add	%o1, 4, %o1		! advance DST by 4
.ci_med_wextra:
	addcc	%o2, 3, %o2		! restore rest of count
	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .ci_sm_byte
	nop
	ba,pt	%ncc, .ci_sm_half
	nop

	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.ci_med_half:
	btst	1, %o0			! check for
	bz,pt	%ncc, .ci_med_half1	! half word alignment
	nop
	lduba	[%o0]ASI_USER, %o3	! load one byte
	inc	%o0
	stb	%o3,[%o1]		! store byte
	inc	%o1
	dec	%o2
!
!  Now half word aligned and have at least 38 bytes to move
!
.ci_med_half1:
	sub	%o2, 7, %o2		! adjust count to allow cc zero test
.ci_med_hmove:
	lduha	[%o0]ASI_USER, %o3	! read half word
	subcc	%o2, 8, %o2		! reduce count by 8
	sth	%o3, [%o1]		! write half word
	add	%o0, 2, %o0		! advance SRC by 2
	lduha	[%o0]ASI_USER, %o3	! repeat for a total of 4 halfwords
	add	%o0, 2, %o0		! advance SRC by 2
	sth	%o3, [%o1 + 2]
	add	%o1, 8, %o1		! advance DST by 8
	lduha	[%o0]ASI_USER, %o3
	add	%o0, 2, %o0		! advance SRC by 2
	sth	%o3, [%o1 - 4]
	lduha	[%o0]ASI_USER, %o3
	add	%o0, 2, %o0		! advance SRC by 2
	bgt,pt	%ncc, .ci_med_hmove	! loop til 7 or fewer bytes left
	sth	%o3, [%o1 - 2]
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .ci_sm_exit
	deccc	%o2
	bz,pt	%ncc, .ci_sm_byte
	nop
	ba,pt	%ncc, .ci_sm_half
	nop

/*
 * We got here because of a fault during short copyin.
 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
 */
.sm_copyin_err:
	membar	#Sync
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	mov	SM_SAVE_SRC, %o0
	mov	SM_SAVE_DST, %o1
	mov	SM_SAVE_COUNT, %o2
	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
	tst	%o3
	bz,pt	%ncc, 3f		! if not, return error
	nop
	ldn	[%o3 + CP_COPYIN], %o5	! if handler, invoke it with
	jmp	%o5			! original arguments
	nop
3:
	retl
	or	%g0, -1, %o0		! return errno value

	SET_SIZE(copyin)
2911
2912
2913 /*
2914 * The _more entry points are not intended to be used directly by
2915 * any caller from outside this file. They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
2917 * the floating point registers.
2918 * This entry is particularly important as DTRACE (at least as of
2919 * 4/2004) does not support leaf functions.
2920 */
2921
2922 ENTRY(copyin_more)
2923 .copyin_more:
2924 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2925 set .copyin_err, REAL_LOFAULT
2926
2927 /*
2928 * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes
2929 */
2930 .do_copyin:
2931 set copyio_fault, %l7 ! .copyio_fault is lofault val
2932
2933 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler
2934 membar #Sync ! sync error barrier
2935 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
2936
2937 mov %i0, SAVE_SRC
2938 mov %i1, SAVE_DST
2939 mov %i2, SAVE_COUNT
2940
2941 FP_NOMIGRATE(6, 7)
2942
2943 rd %fprs, %o2 ! check for unused fp
2944 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
2945 btst FPRS_FEF, %o2
2946 bz,a,pt %icc, .do_blockcopyin
2947 wr %g0, FPRS_FEF, %fprs
2948
2949 BST_FPQ2Q4_TOSTACK(%o2)
2950
2951 .do_blockcopyin:
2952 rd %gsr, %o2
2953 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr
2954 or %l6, FPUSED_FLAG, %l6
2955
2956 andcc DST, VIS_BLOCKSIZE - 1, TMP
2957 mov ASI_USER, %asi
2958 bz,pt %ncc, 2f
2959 neg TMP
2960 add TMP, VIS_BLOCKSIZE, TMP
2961
2962 ! TMP = bytes required to align DST on FP_BLOCK boundary
2963 ! Using SRC as a tmp here
2964 cmp TMP, 3
2965 bleu,pt %ncc, 1f
2966 sub CNT,TMP,CNT ! adjust main count
2967 sub TMP, 3, TMP ! adjust for end of loop test
2968 .ci_blkalign:
2969 lduba [REALSRC]%asi, SRC ! move 4 bytes per loop iteration
2970 stb SRC, [DST]
2971 subcc TMP, 4, TMP
2972 lduba [REALSRC + 1]%asi, SRC
2973 add REALSRC, 4, REALSRC
2974 stb SRC, [DST + 1]
2975 lduba [REALSRC - 2]%asi, SRC
2976 add DST, 4, DST
2977 stb SRC, [DST - 2]
2978 lduba [REALSRC - 1]%asi, SRC
2979 bgu,pt %ncc, .ci_blkalign
2980 stb SRC, [DST - 1]
2981
2982 addcc TMP, 3, TMP ! restore count adjustment
2983 bz,pt %ncc, 2f ! no bytes left?
2984 nop
2985 1: lduba [REALSRC]%asi, SRC
2986 inc REALSRC
2987 inc DST
2988 deccc TMP
2989 bgu %ncc, 1b
2990 stb SRC, [DST - 1]
2991
2992 2:
2993 andn REALSRC, 0x7, SRC
2994 alignaddr REALSRC, %g0, %g0
2995
2996 ! SRC - 8-byte aligned
2997 ! DST - 64-byte aligned
2998 prefetcha [SRC]%asi, #one_read
2999 prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #one_read
3000 prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #one_read
3001 prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #one_read
3002 ldda [SRC]%asi, %f16
3003 #if CHEETAH_PREFETCH > 4
3004 prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read
3005 #endif
3006 ldda [SRC + 0x08]%asi, %f18
3007 #if CHEETAH_PREFETCH > 5
3008 prefetcha [SRC + (5 * VIS_BLOCKSIZE)]%asi, #one_read
3009 #endif
3010 ldda [SRC + 0x10]%asi, %f20
3011 #if CHEETAH_PREFETCH > 6
3012 prefetcha [SRC + (6 * VIS_BLOCKSIZE)]%asi, #one_read
3013 #endif
3014 faligndata %f16, %f18, %f48
3015 ldda [SRC + 0x18]%asi, %f22
3016 #if CHEETAH_PREFETCH > 7
3017 prefetcha [SRC + (7 * VIS_BLOCKSIZE)]%asi, #one_read
3018 #endif
3019 faligndata %f18, %f20, %f50
3020 ldda [SRC + 0x20]%asi, %f24
3021 faligndata %f20, %f22, %f52
3022 ldda [SRC + 0x28]%asi, %f26
3023 faligndata %f22, %f24, %f54
3024 ldda [SRC + 0x30]%asi, %f28
3025 faligndata %f24, %f26, %f56
3026 ldda [SRC + 0x38]%asi, %f30
3027 faligndata %f26, %f28, %f58
3028 ldda [SRC + VIS_BLOCKSIZE]%asi, %f16
3029 sub CNT, VIS_BLOCKSIZE, CNT
3030 add SRC, VIS_BLOCKSIZE, SRC
3031 add REALSRC, VIS_BLOCKSIZE, REALSRC
3032 ba,a,pt %ncc, 1f
3033 nop
3034 .align 16
3035 1:
3036 ldda [SRC + 0x08]%asi, %f18
3037 faligndata %f28, %f30, %f60
3038 ldda [SRC + 0x10]%asi, %f20
3039 faligndata %f30, %f16, %f62
3040 stda %f48, [DST]ASI_BLK_P
3041 ldda [SRC + 0x18]%asi, %f22
3042 faligndata %f16, %f18, %f48
3043 ldda [SRC + 0x20]%asi, %f24
3044 faligndata %f18, %f20, %f50
3045 ldda [SRC + 0x28]%asi, %f26
3046 faligndata %f20, %f22, %f52
3047 ldda [SRC + 0x30]%asi, %f28
3048 faligndata %f22, %f24, %f54
3049 ldda [SRC + 0x38]%asi, %f30
3050 faligndata %f24, %f26, %f56
3051 sub CNT, VIS_BLOCKSIZE, CNT
3052 ldda [SRC + VIS_BLOCKSIZE]%asi, %f16
3053 faligndata %f26, %f28, %f58
3054 prefetcha [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8]%asi, #one_read
3055 add DST, VIS_BLOCKSIZE, DST
3056 prefetcha [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
3057 add REALSRC, VIS_BLOCKSIZE, REALSRC
3058 cmp CNT, VIS_BLOCKSIZE + 8
3059 bgu,pt %ncc, 1b
3060 add SRC, VIS_BLOCKSIZE, SRC
3061
3062 ! only if REALSRC & 0x7 is 0
3063 cmp CNT, VIS_BLOCKSIZE
3064 bne %ncc, 3f
3065 andcc REALSRC, 0x7, %g0
3066 bz,pt %ncc, 2f
3067 nop
3068 3:
3069 faligndata %f28, %f30, %f60
3070 faligndata %f30, %f16, %f62
3071 stda %f48, [DST]ASI_BLK_P
3072 add DST, VIS_BLOCKSIZE, DST
3073 ba,pt %ncc, 3f
3074 nop
3075 2:
3076 ldda [SRC + 0x08]%asi, %f18
3077 fsrc1 %f28, %f60
3078 ldda [SRC + 0x10]%asi, %f20
3079 fsrc1 %f30, %f62
3080 stda %f48, [DST]ASI_BLK_P
3081 ldda [SRC + 0x18]%asi, %f22
3082 fsrc1 %f16, %f48
3083 ldda [SRC + 0x20]%asi, %f24
3084 fsrc1 %f18, %f50
3085 ldda [SRC + 0x28]%asi, %f26
3086 fsrc1 %f20, %f52
3087 ldda [SRC + 0x30]%asi, %f28
3088 fsrc1 %f22, %f54
3089 ldda [SRC + 0x38]%asi, %f30
3090 fsrc1 %f24, %f56
3091 sub CNT, VIS_BLOCKSIZE, CNT
3092 add DST, VIS_BLOCKSIZE, DST
3093 add SRC, VIS_BLOCKSIZE, SRC
3094 add REALSRC, VIS_BLOCKSIZE, REALSRC
3095 fsrc1 %f26, %f58
3096 fsrc1 %f28, %f60
3097 fsrc1 %f30, %f62
3098 stda %f48, [DST]ASI_BLK_P
3099 add DST, VIS_BLOCKSIZE, DST
3100 ba,a,pt %ncc, 4f
3101 nop
3102
3103 3: tst CNT
3104 bz,a %ncc, 4f
3105 nop
3106
3107 5: lduba [REALSRC]ASI_USER, TMP
3108 inc REALSRC
3109 inc DST
3110 deccc CNT
3111 bgu %ncc, 5b
3112 stb TMP, [DST - 1]
3113 4:
3114
3115 .copyin_exit:
3116 membar #Sync
3117
3118 FPRAS_INTERVAL(FPRAS_COPYIN, 1, %l5, %o2, %o3, %o4, %o5, 8)
3119 FPRAS_REWRITE_TYPE1(1, %l5, %f48, %o2, 9)
3120 FPRAS_CHECK(FPRAS_COPYIN, %l5, 9) ! lose outputs
3121
3122 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr
3123 wr %o2, 0, %gsr
3124
3125 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
3126 btst FPRS_FEF, %o3
3127 bz,pt %icc, 4f
3128 nop
3129
3130 BLD_FPQ2Q4_FROMSTACK(%o2)
3131
3132 ba,pt %ncc, 1f
3133 wr %o3, 0, %fprs ! restore fprs
3134
3135 4:
3136 FZEROQ2Q4
3137 wr %o3, 0, %fprs ! restore fprs
3138
3139 1:
3140 membar #Sync ! sync error barrier
3141 andn %l6, FPUSED_FLAG, %l6
3142 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3143 FP_ALLOWMIGRATE(5, 6)
3144 ret
3145 restore %g0, 0, %o0
3146 /*
3147 * We got here because of a fault during copyin
3148 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
3149 */
3150 .copyin_err:
3151 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
3152 tst %o4
3153 bz,pt %ncc, 2f ! if not, return error
3154 nop
3155 ldn [%o4 + CP_COPYIN], %g2 ! if handler, invoke it with
3156 jmp %g2 ! original arguments
3157 restore %g0, 0, %g0 ! dispose of copy window
3158 2:
3159 ret
3160 restore %g0, -1, %o0 ! return error value
3161
3162
3163 SET_SIZE(copyin_more)
3164
3165 ENTRY(xcopyin)
3166
3167 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
3168 bleu,pt %ncc, .xcopyin_small ! go to larger cases
3169 xor %o0, %o1, %o3 ! are src, dst alignable?
3170 btst 7, %o3 !
3171 bz,pt %ncc, .xcopyin_8 ! check for longword alignment
3172 nop
3173 btst 1, %o3 !
3174 bz,pt %ncc, .xcopyin_2 ! check for half-word
3175 nop
3176 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
3177 ld [%o3 + %lo(hw_copy_limit_1)], %o3
3178 tst %o3
3179 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
3180 cmp %o2, %o3 ! if length <= limit
3181 bleu,pt %ncc, .xcopyin_small ! go to small copy
3182 nop
3183 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
3184 nop
3185 .xcopyin_2:
3186 btst 3, %o3 !
3187 bz,pt %ncc, .xcopyin_4 ! check for word alignment
3188 nop
3189 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
3190 ld [%o3 + %lo(hw_copy_limit_2)], %o3
3191 tst %o3
3192 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
3193 cmp %o2, %o3 ! if length <= limit
3194 bleu,pt %ncc, .xcopyin_small ! go to small copy
3195 nop
3196 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
3197 nop
3198 .xcopyin_4:
3199 ! already checked longword, must be word aligned
3200 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
3201 ld [%o3 + %lo(hw_copy_limit_4)], %o3
3202 tst %o3
3203 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
3204 cmp %o2, %o3 ! if length <= limit
3205 bleu,pt %ncc, .xcopyin_small ! go to small copy
3206 nop
3207 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
3208 nop
3209 .xcopyin_8:
3210 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
3211 ld [%o3 + %lo(hw_copy_limit_8)], %o3
3212 tst %o3
3213 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
3214 cmp %o2, %o3 ! if length <= limit
3215 bleu,pt %ncc, .xcopyin_small ! go to small copy
3216 nop
3217 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
3218 nop
3219
3220 .xcopyin_small:
3221 sethi %hi(.sm_xcopyin_err), %o5 ! .sm_xcopyin_err is lofault value
3222 or %o5, %lo(.sm_xcopyin_err), %o5
3223 ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofaul
3224 membar #Sync ! sync error barrier
3225 ba,pt %ncc, .sm_do_copyin ! common code
3226 stn %o5, [THREAD_REG + T_LOFAULT]
3227
3228 .xcopyin_more:
3229 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3230 sethi %hi(.xcopyin_err), REAL_LOFAULT ! .xcopyin_err is lofault value
3231 ba,pt %ncc, .do_copyin
3232 or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
3233
3234 /*
3235 * We got here because of fault during xcopyin
3236 * Errno value is in ERRNO
3237 */
3238 .xcopyin_err:
3239 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
3240 tst %o4
3241 bz,pt %ncc, 2f ! if not, return error
3242 nop
3243 ldn [%o4 + CP_XCOPYIN], %g2 ! if handler, invoke it with
3244 jmp %g2 ! original arguments
3245 restore %g0, 0, %g0 ! dispose of copy window
3246 2:
3247 ret
3248 restore ERRNO, 0, %o0 ! return errno value
3249
3250 .sm_xcopyin_err:
3251
3252 membar #Sync
3253 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3254 mov SM_SAVE_SRC, %o0
3255 mov SM_SAVE_DST, %o1
3256 mov SM_SAVE_COUNT, %o2
3257 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler
3258 tst %o3
3259 bz,pt %ncc, 3f ! if not, return error
3260 nop
3261 ldn [%o3 + CP_XCOPYIN], %o5 ! if handler, invoke it with
3262 jmp %o5 ! original arguments
3263 nop
3264 3:
3265 retl
3266 or %g1, 0, %o0 ! return errno value
3267
3268 SET_SIZE(xcopyin)
3269
3270 ENTRY(xcopyin_little)
3271 sethi %hi(.xcopyio_err), %o5
3272 or %o5, %lo(.xcopyio_err), %o5
3273 ldn [THREAD_REG + T_LOFAULT], %o4
3274 membar #Sync ! sync error barrier
3275 stn %o5, [THREAD_REG + T_LOFAULT]
3276 mov %o4, %o5
3277
3278 subcc %g0, %o2, %o3
3279 add %o0, %o2, %o0
3280 bz,pn %ncc, 2f ! check for zero bytes
3281 sub %o2, 1, %o4
3282 add %o0, %o4, %o0 ! start w/last byte
3283 add %o1, %o2, %o1
3284 lduba [%o0 + %o3]ASI_AIUSL, %o4
3285
3286 1: stb %o4, [%o1 + %o3]
3287 inccc %o3
3288 sub %o0, 2, %o0 ! get next byte
3289 bcc,a,pt %ncc, 1b
3290 lduba [%o0 + %o3]ASI_AIUSL, %o4
3291
3292 2:
3293 membar #Sync ! sync error barrier
3294 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3295 retl
3296 mov %g0, %o0 ! return (0)
3297
3298 .xcopyio_err:
3299 membar #Sync ! sync error barrier
3300 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3301 retl
3302 mov %g1, %o0
3303
3304 SET_SIZE(xcopyin_little)
3305
3306
3307 /*
3308 * Copy a block of storage - must not overlap (from + len <= to).
3309 * No fault handler installed (to be called under on_fault())
3310 */
3311 ENTRY(copyin_noerr)
3312
3313 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
3314 bleu,pt %ncc, .copyin_ne_small ! go to larger cases
3315 xor %o0, %o1, %o3 ! are src, dst alignable?
3316 btst 7, %o3 !
3317 bz,pt %ncc, .copyin_ne_8 ! check for longword alignment
3318 nop
3319 btst 1, %o3 !
3320 bz,pt %ncc, .copyin_ne_2 ! check for half-word
3321 nop
3322 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
3323 ld [%o3 + %lo(hw_copy_limit_1)], %o3
3324 tst %o3
3325 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
3326 cmp %o2, %o3 ! if length <= limit
3327 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3328 nop
3329 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
3330 nop
3331 .copyin_ne_2:
3332 btst 3, %o3 !
3333 bz,pt %ncc, .copyin_ne_4 ! check for word alignment
3334 nop
3335 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
3336 ld [%o3 + %lo(hw_copy_limit_2)], %o3
3337 tst %o3
3338 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
3339 cmp %o2, %o3 ! if length <= limit
3340 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3341 nop
3342 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
3343 nop
3344 .copyin_ne_4:
3345 ! already checked longword, must be word aligned
3346 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
3347 ld [%o3 + %lo(hw_copy_limit_4)], %o3
3348 tst %o3
3349 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
3350 cmp %o2, %o3 ! if length <= limit
3351 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3352 nop
3353 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
3354 nop
3355 .copyin_ne_8:
3356 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
3357 ld [%o3 + %lo(hw_copy_limit_8)], %o3
3358 tst %o3
3359 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
3360 cmp %o2, %o3 ! if length <= limit
3361 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3362 nop
3363 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
3364 nop
3365
3366 .copyin_ne_small:
3367 ldn [THREAD_REG + T_LOFAULT], %o4
3368 tst %o4
3369 bz,pn %ncc, .sm_do_copyin
3370 nop
3371 sethi %hi(.sm_copyio_noerr), %o5
3372 or %o5, %lo(.sm_copyio_noerr), %o5
3373 membar #Sync ! sync error barrier
3374 ba,pt %ncc, .sm_do_copyin
3375 stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault
3376
3377 .copyin_noerr_more:
3378 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3379 sethi %hi(.copyio_noerr), REAL_LOFAULT
3380 ba,pt %ncc, .do_copyin
3381 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3382
3383 .copyio_noerr:
3384 jmp %l6
3385 restore %g0,0,%g0
3386
3387 .sm_copyio_noerr:
3388 membar #Sync
3389 stn %o4, [THREAD_REG + T_LOFAULT] ! restore t_lofault
3390 jmp %o4
3391 nop
3392
3393 SET_SIZE(copyin_noerr)
3394
3395 /*
3396 * Copy a block of storage - must not overlap (from + len <= to).
3397 * No fault handler installed (to be called under on_fault())
3398 */
3399
3400 ENTRY(copyout_noerr)
3401
3402 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
3403 bleu,pt %ncc, .copyout_ne_small ! go to larger cases
3404 xor %o0, %o1, %o3 ! are src, dst alignable?
3405 btst 7, %o3 !
3406 bz,pt %ncc, .copyout_ne_8 ! check for longword alignment
3407 nop
3408 btst 1, %o3 !
3409 bz,pt %ncc, .copyout_ne_2 ! check for half-word
3410 nop
3411 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
3412 ld [%o3 + %lo(hw_copy_limit_1)], %o3
3413 tst %o3
3414 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
3415 cmp %o2, %o3 ! if length <= limit
3416 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3417 nop
3418 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
3419 nop
3420 .copyout_ne_2:
3421 btst 3, %o3 !
3422 bz,pt %ncc, .copyout_ne_4 ! check for word alignment
3423 nop
3424 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
3425 ld [%o3 + %lo(hw_copy_limit_2)], %o3
3426 tst %o3
3427 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
3428 cmp %o2, %o3 ! if length <= limit
3429 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3430 nop
3431 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
3432 nop
3433 .copyout_ne_4:
3434 ! already checked longword, must be word aligned
3435 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
3436 ld [%o3 + %lo(hw_copy_limit_4)], %o3
3437 tst %o3
3438 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
3439 cmp %o2, %o3 ! if length <= limit
3440 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3441 nop
3442 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
3443 nop
3444 .copyout_ne_8:
3445 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
3446 ld [%o3 + %lo(hw_copy_limit_8)], %o3
3447 tst %o3
3448 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
3449 cmp %o2, %o3 ! if length <= limit
3450 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3451 nop
3452 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
3453 nop
3454
3455 .copyout_ne_small:
3456 ldn [THREAD_REG + T_LOFAULT], %o4
3457 tst %o4
3458 bz,pn %ncc, .sm_do_copyout
3459 nop
3460 sethi %hi(.sm_copyio_noerr), %o5
3461 or %o5, %lo(.sm_copyio_noerr), %o5
3462 membar #Sync ! sync error barrier
3463 ba,pt %ncc, .sm_do_copyout
3464 stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault
3465
3466 .copyout_noerr_more:
3467 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3468 sethi %hi(.copyio_noerr), REAL_LOFAULT
3469 ba,pt %ncc, .do_copyout
3470 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3471
3472 SET_SIZE(copyout_noerr)
3473
3474
3475 /*
3476 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
3477 * longer than 256 bytes in length using spitfire's block stores. If
3478 * the criteria for using this routine are not met then it calls bzero
3479 * and returns 1. Otherwise 0 is returned indicating success.
3480 * Caller is responsible for ensuring use_hw_bzero is true and that
3481 * kpreempt_disable() has been called.
3482 */
3483 ! %i0 - start address
3484 ! %i1 - length of region (multiple of 64)
3485 ! %l0 - saved fprs
3486 ! %l1 - pointer to saved %d0 block
3487 ! %l2 - saved curthread->t_lwp
3488
3489 ENTRY(hwblkclr)
3490 ! get another window w/space for one aligned block of saved fpregs
3491 save %sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp
3492
3493 ! Must be block-aligned
3494 andcc %i0, (VIS_BLOCKSIZE-1), %g0
3495 bnz,pn %ncc, 1f
3496 nop
3497
3498 ! ... and must be 256 bytes or more
3499 cmp %i1, 256
3500 blu,pn %ncc, 1f
3501 nop
3502
3503 ! ... and length must be a multiple of VIS_BLOCKSIZE
3504 andcc %i1, (VIS_BLOCKSIZE-1), %g0
3505 bz,pn %ncc, 2f
3506 nop
3507
3508 1: ! punt, call bzero but notify the caller that bzero was used
3509 mov %i0, %o0
3510 call bzero
3511 mov %i1, %o1
3512 ret
3513 restore %g0, 1, %o0 ! return (1) - did not use block operations
3514
3515 2: rd %fprs, %l0 ! check for unused fp
3516 btst FPRS_FEF, %l0
3517 bz,pt %icc, 1f
3518 nop
3519
3520 ! save in-use fpregs on stack
3521 membar #Sync
3522 add %fp, STACK_BIAS - 65, %l1
3523 and %l1, -VIS_BLOCKSIZE, %l1
3524 stda %d0, [%l1]ASI_BLK_P
3525
3526 1: membar #StoreStore|#StoreLoad|#LoadStore
3527 wr %g0, FPRS_FEF, %fprs
3528 wr %g0, ASI_BLK_P, %asi
3529
3530 ! Clear block
3531 fzero %d0
3532 fzero %d2
3533 fzero %d4
3534 fzero %d6
3535 fzero %d8
3536 fzero %d10
3537 fzero %d12
3538 fzero %d14
3539
3540 mov 256, %i3
3541 ba,pt %ncc, .pz_doblock
3542 nop
3543
3544 .pz_blkstart:
3545 ! stda %d0, [%i0 + 192]%asi ! in dly slot of branch that got us here
3546 stda %d0, [%i0 + 128]%asi
3547 stda %d0, [%i0 + 64]%asi
3548 stda %d0, [%i0]%asi
3549 .pz_zinst:
3550 add %i0, %i3, %i0
3551 sub %i1, %i3, %i1
3552 .pz_doblock:
3553 cmp %i1, 256
3554 bgeu,a %ncc, .pz_blkstart
3555 stda %d0, [%i0 + 192]%asi
3556
3557 cmp %i1, 64
3558 blu %ncc, .pz_finish
3559
3560 andn %i1, (64-1), %i3
3561 srl %i3, 4, %i2 ! using blocks, 1 instr / 16 words
3562 set .pz_zinst, %i4
3563 sub %i4, %i2, %i4
3564 jmp %i4
3565 nop
3566
3567 .pz_finish:
3568 membar #Sync
3569 btst FPRS_FEF, %l0
3570 bz,a .pz_finished
3571 wr %l0, 0, %fprs ! restore fprs
3572
3573 ! restore fpregs from stack
3574 ldda [%l1]ASI_BLK_P, %d0
3575 membar #Sync
3576 wr %l0, 0, %fprs ! restore fprs
3577
3578 .pz_finished:
3579 ret
3580 restore %g0, 0, %o0 ! return (bzero or not)
3581
3582 SET_SIZE(hwblkclr)
3583
3584 /*
3585 * Copy 32 bytes of data from src (%o0) to dst (%o1)
3586 * using physical addresses.
3587 */
3588 ENTRY_NP(hw_pa_bcopy32)
3589 rdpr %pstate, %g1
3590 andn %g1, PSTATE_IE, %g2
3591 wrpr %g0, %g2, %pstate
3592
3593 rdpr %pstate, %g0
3594 ldxa [%o0]ASI_MEM, %o2
3595 add %o0, 8, %o0
3596 ldxa [%o0]ASI_MEM, %o3
3597 add %o0, 8, %o0
3598 ldxa [%o0]ASI_MEM, %o4
3599 add %o0, 8, %o0
3600 ldxa [%o0]ASI_MEM, %o5
3601
3602 stxa %g0, [%o1]ASI_DC_INVAL
3603 membar #Sync
3604
3605 stxa %o2, [%o1]ASI_MEM
3606 add %o1, 8, %o1
3607 stxa %o3, [%o1]ASI_MEM
3608 add %o1, 8, %o1
3609 stxa %o4, [%o1]ASI_MEM
3610 add %o1, 8, %o1
3611 stxa %o5, [%o1]ASI_MEM
3612
3613 retl
3614 wrpr %g0, %g1, %pstate
3615
3616 SET_SIZE(hw_pa_bcopy32)
3617
3618 DGDEF(use_hw_bcopy)
3619 .word 1
3620 DGDEF(use_hw_bzero)
3621 .word 1
3622 DGDEF(hw_copy_limit_1)
3623 .word 0
3624 DGDEF(hw_copy_limit_2)
3625 .word 0
3626 DGDEF(hw_copy_limit_4)
3627 .word 0
3628 DGDEF(hw_copy_limit_8)
3629 .word 0
3630
3631 .align 64
3632 .section ".text"