de-linting of .s files
--- old/usr/src/uts/sun4u/cpu/opl_olympus_copy.s
+++ new/usr/src/uts/sun4u/cpu/opl_olympus_copy.s
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26 #include <sys/param.h>
27 27 #include <sys/errno.h>
28 28 #include <sys/asm_linkage.h>
29 29 #include <sys/vtrace.h>
30 30 #include <sys/machthread.h>
31 31 #include <sys/clock.h>
32 32 #include <sys/asi.h>
33 33 #include <sys/fsr.h>
34 34 #include <sys/privregs.h>
35 35
36 -#if !defined(lint)
37 36 #include "assym.h"
38 -#endif /* lint */
39 37
40 38 /*
41 39 * Pseudo-code to aid in understanding the control flow of the
42 40 * bcopy/copyin/copyout routines.
43 41 *
44 42 * On entry:
45 43 *
46 44 * ! Determine whether to use the FP register version
47 45 * ! or the leaf routine version depending on size
48 46 * ! of copy and flags. Set up error handling accordingly.
49 47 * ! The transition point depends on whether the src and
50 48 * ! dst addresses can be aligned to long word, word,
51 49 * ! half word, or byte boundaries.
52 50 * !
53 51 * ! WARNING: <Register usage convention>
54 52 * ! For FP version, %l6 holds previous error handling and
55 53 * ! a flag: TRAMP_FLAG (low bits)
56 54 * ! for leaf routine version, %o4 holds those values.
57 55 * ! So either %l6 or %o4 is reserved and not available for
58 56 * ! any other use.
59 57 *
60 58 * if (length <= VIS_COPY_THRESHOLD) ! start with a quick test
61 59 * go to small_copy; ! to speed short copies
62 60 *
63 61 * if (src,dst long word alignable) {
64 62 * if (hw_copy_limit_8 == 0) ! hw_copy disabled
65 63 * go to small_copy;
66 64 * if (length <= hw_copy_limit_8)
67 65 * go to small_copy;
68 66 * go to FPBLK_copy;
69 67 * }
70 68 * if (src,dst not alignable) {
71 69 * if (hw_copy_limit_1 == 0) ! hw_copy disabled
72 70 * go to small_copy;
73 71 * if (length <= hw_copy_limit_1)
74 72 * go to small_copy;
75 73 * go to FPBLK_copy;
76 74 * }
77 75 * if (src,dst halfword alignable) {
78 76 * if (hw_copy_limit_2 == 0) ! hw_copy disabled
79 77 * go to small_copy;
80 78 * if (length <= hw_copy_limit_2)
81 79 * go to small_copy;
82 80 * go to FPBLK_copy;
83 81 * }
84 82 * if (src,dst word alignable) {
85 83 * if (hw_copy_limit_4 == 0) ! hw_copy disabled
86 84 * go to small_copy;
87 85 * if (length <= hw_copy_limit_4)
88 86 * go to small_copy;
89 87 * go to FPBLK_copy;
90 88 * }
91 89 *
92 90 * small_copy:
93 91 * Setup_leaf_rtn_error_handler; ! diffs for each entry point
94 92 *
95 93 * if (count <= 3) ! fast path for tiny copies
96 94 * go to sm_left; ! special finish up code
97 95 * else
98 96 * if (count > CHKSIZE) ! medium sized copies
99 97 * go to sm_med ! tuned by alignment
100 98 * if(src&dst not both word aligned) {
101 99 * sm_movebytes:
102 100 * move byte by byte in 4-way unrolled loop
103 101 * fall into sm_left;
104 102 * sm_left:
105 103 * move 0-3 bytes byte at a time as needed.
106 104 * restore error handler and exit.
107 105 *
108 106 * } else { ! src&dst are word aligned
109 107 * check for at least 8 bytes left,
110 108 * move word at a time, unrolled by 2
111 109 * when fewer than 8 bytes left,
112 110 * sm_half: move half word at a time while 2 or more bytes left
113 111 * sm_byte: move final byte if necessary
114 112 * sm_exit:
115 113 * restore error handler and exit.
116 114 * }
117 115 *
118 116 * ! Medium length cases with at least CHKSIZE bytes available
119 117 * ! method: line up src and dst as well as possible, then
120 118 * ! move data in 4-way unrolled loops.
121 119 *
122 120 * sm_med:
123 121 * if(src&dst unalignable)
124 122 * go to sm_movebytes
125 123 * if(src&dst halfword alignable)
126 124 * go to sm_movehalf
127 125 * if(src&dst word alignable)
128 126 * go to sm_moveword
129 127 * ! fall into long word movement
130 128 * move bytes until src is word aligned
131 129 * if not long word aligned, move a word
132 130 * move long words in 4-way unrolled loop until < 32 bytes left
133 131 * move long words in 1-way unrolled loop until < 8 bytes left
134 132 * if zero bytes left, goto sm_exit
135 133 * if one byte left, go to sm_byte
136 134 * else go to sm_half
137 135 *
138 136 * sm_moveword:
139 137 * move bytes until src is word aligned
140 138 * move words in 4-way unrolled loop until < 16 bytes left
141 139 * move words in 1-way unrolled loop until < 4 bytes left
142 140 * if zero bytes left, goto sm_exit
143 141 * if one byte left, go to sm_byte
144 142 * else go to sm_half
145 143 *
146 144 * sm_movehalf:
147 145 * move a byte if needed to align src on halfword
148 146 * move halfwords in 4-way unrolled loop until < 8 bytes left
149 147 * if zero bytes left, goto sm_exit
150 148 * if one byte left, go to sm_byte
151 149 * else go to sm_half
152 150 *
153 151 *
154 152 * FPBLK_copy:
155 153 * %l6 = curthread->t_lofault;
156 154 * if (%l6 != NULL) {
157 155 * membar #Sync
158 156 * curthread->t_lofault = .copyerr;
159 157 * caller_error_handler = TRUE ! %l6 |= 2
160 158 * }
161 159 *
162 160 * ! for FPU testing we must not migrate cpus
163 161 * if (curthread->t_lwp == NULL) {
164 162 * ! Kernel threads do not have pcb's in which to store
165 163 * ! the floating point state, so disallow preemption during
166 164 * ! the copy. This also prevents cpu migration.
167 165 * kpreempt_disable(curthread);
168 166 * } else {
169 167 * thread_nomigrate();
170 168 * }
171 169 *
172 170 * old_fprs = %fprs;
173 171 * old_gsr = %gsr;
174 172 * if (%fprs.fef) {
175 173 * %fprs.fef = 1;
176 174 * save current fpregs on stack using blockstore
177 175 * } else {
178 176 * %fprs.fef = 1;
179 177 * }
180 178 *
181 179 *
182 180 * do_blockcopy_here;
183 181 *
184 182 * In lofault handler:
185 183 * curthread->t_lofault = .copyerr2;
186 184 * Continue on with the normal exit handler
187 185 *
188 186 * On normal exit:
189 187 * %gsr = old_gsr;
190 188 * if (old_fprs & FPRS_FEF)
191 189 * restore fpregs from stack using blockload
192 190 * else
193 191 * zero fpregs
194 192 * %fprs = old_fprs;
195 193 * membar #Sync
196 194 * curthread->t_lofault = (%l6 & ~3);
197 195 * ! following test omitted from copyin/copyout as they
198 196 * ! will always have a current thread
199 197 * if (curthread->t_lwp == NULL)
200 198 * kpreempt_enable(curthread);
201 199 * else
202 200 * thread_allowmigrate();
203 201 * return (0)
204 202 *
205 203 * In second lofault handler (.copyerr2):
206 204 * We've tried to restore fp state from the stack and failed. To
207 205 * prevent returning with a corrupted fp state, we panic.
208 206 */
209 207
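The dispatch above reduces to a small amount of C. A minimal sketch, assuming the hw_copy_limit_* globals and VIS_COPY_THRESHOLD as defined later in this file (the helper name use_fpblk is hypothetical):

	#include <stddef.h>
	#include <stdint.h>

	extern int hw_copy_limit_1, hw_copy_limit_2, hw_copy_limit_4,
	    hw_copy_limit_8;
	#define	VIS_COPY_THRESHOLD	256

	/* Decide between the leaf (small_copy) path and FPBLK_copy. */
	static int
	use_fpblk(const void *src, void *dst, size_t len)
	{
		uintptr_t d = (uintptr_t)src ^ (uintptr_t)dst;
		int limit;

		if (len <= VIS_COPY_THRESHOLD)	/* quick test for short copies */
			return (0);
		if ((d & 7) == 0)		/* long word alignable */
			limit = hw_copy_limit_8;
		else if (d & 1)			/* not even halfword alignable */
			limit = hw_copy_limit_1;
		else if (d & 3)			/* halfword but not word alignable */
			limit = hw_copy_limit_2;
		else				/* word alignable */
			limit = hw_copy_limit_4;
		if (limit == 0)			/* zero disables HW copy */
			return (0);
		return (len > (size_t)limit);	/* above limit: FPBLK_copy */
	}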
210 208 /*
211 209 * Comments about optimization choices
212 210 *
213 211 * The initial optimization decision in this code is to determine
214 212 * whether to use the FP registers for a copy or not. If we don't
215 213 * use the FP registers, we can execute the copy as a leaf routine,
216 214 * saving a register save and restore. Also, less elaborate setup
217 215 * is required, allowing short copies to be completed more quickly.
218 216 * For longer copies, especially unaligned ones (where the src and
219 217 * dst do not align to allow simple ldx,stx operation), the FP
220 218 * registers allow much faster copy operations.
221 219 *
222 220 * The estimated extra cost of the FP path will vary depending on
223 221 * src/dst alignment, dst offset from the next 64 byte FPblock store
224 222 * boundary, remaining src data after the last full dst cache line is
225 223 * moved, whether the FP registers need to be saved, and some other
226 224 * minor issues. The average additional overhead is estimated to be
227 225 * 400 clocks. Since each non-repeated/predicted tst and branch costs
228 226 * around 10 clocks, elaborate calculation would slow down all
229 227 * longer copies and only benefit a small portion of medium sized
230 228 * copies. Rather than incur such cost, we chose fixed transition
231 229 * points for each of the alignment choices.
232 230 *
233 231 * For the inner loop, here is a comparison of the per cache line
234 232 * costs for each alignment when src&dst are in cache:
235 233 *
236 234 * byte aligned: 108 clocks slower for non-FPBLK
237 235 * half aligned: 44 clocks slower for non-FPBLK
238 236 * word aligned: 12 clocks slower for non-FPBLK
239 237 * long aligned: 4 clocks >>faster<< for non-FPBLK
240 238 *
241 239 * The long aligned loop runs faster because it does no prefetching.
242 240 * That wins if the data is not in cache or there is too little
243 241 * data to gain much benefit from prefetching. But when there
244 242 * is more data and that data is not in cache, failing to prefetch
245 243 * can run much slower. In addition, there is a 2 Kbyte store queue
246 244 * which will cause the non-FPBLK inner loop to slow for larger copies.
247 245 * The exact tradeoff is strongly load and application dependent, with
248 246 * increasing risk of a customer visible performance regression if the
249 247 * non-FPBLK code is used for larger copies. Studies of synthetic in-cache
250 248 * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
251 249 * upper limit for the non-FPBLK code. To minimize performance regression
252 250 * risk while still gaining the primary benefits of the improvements to
253 251 * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
254 252 * hw_copy_limit_*. Later experimental studies using different values
255 253 * of hw_copy_limit_* can be used to make further adjustments if
256 254 * appropriate.
257 255 *
258 256 * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
259 257 * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
260 258 * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
261 259 * hw_copy_limit_8 = src and dst are longword aligned
262 260 *
263 261 * To say that src and dst are word aligned means that after
264 262 * some initial alignment activity of moving 0 to 3 bytes,
265 263 * both the src and dst will be on word boundaries so that
266 264 * word loads and stores may be used.
267 265 *
268 266 * Default values as of May 2005 are:
269 267 * hw_copy_limit_1 = 256
270 268 * hw_copy_limit_2 = 512
271 269 * hw_copy_limit_4 = 1024
272 270 * hw_copy_limit_8 = 1024 (or 1536 on some systems)
273 271 *
274 272 *
275 273 * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
276 274 * disabled for that alignment choice.
277 275 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256),
278 276 * the value of VIS_COPY_THRESHOLD is used.
279 277 * It is not envisioned that hw_copy_limit_? will be changed in the field.
280 278 * It is provided to allow for disabling FPBLK copies and to allow
281 279 * easy testing of alternate values on future HW implementations
282 280 * that might have different cache sizes, clock rates or instruction
283 281 * timing rules.
284 282 *
285 283 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
286 284 * threshold to speed up all shorter copies (less than 256). That
287 285 * saves an alignment test, memory reference, and enabling test
288 286 * for all short copies, or an estimated 24 clocks.
289 287 *
290 288 * The order in which these limits are checked does matter since each
291 289 * non-predicted tst and branch costs around 10 clocks.
292 290 * If src and dst are randomly selected addresses,
293 291 * 4 of 8 will not be alignable.
294 292 * 2 of 8 will be half word alignable.
295 293 * 1 of 8 will be word alignable.
296 294 * 1 of 8 will be long word alignable.
297 295 * But, tests on running kernels show that the src and dst given to copy code
298 296 * are typically not on random alignments. Structure copies and
299 297 * copies of larger data sizes are often on long word boundaries.
300 298 * So we test the long word alignment case first, then
301 299 * the byte alignment, then halfword, then word alignment.
302 300 *
303 301 * Several times, tests for length are made to split the code
304 302 * into subcases. These tests often allow later tests to be
305 303 * avoided. For example, within the non-FPBLK copy, we first
306 304 * check for tiny copies of 3 bytes or less. That allows us
307 305 * to use a 4-way unrolled loop for the general byte copy case
308 306 * without a test on loop entry.
309 307 * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
310 308 * vs longer cases. For the really short case, we don't attempt to
311 309 * align src and dst. We try to minimize special case tests in
312 310 * the shortest loops as each test adds a significant percentage
313 311 * to the total time.
314 312 *
315 313 * For the medium sized cases, we allow ourselves to adjust the
316 314 * src and dst alignment and provide special cases for each of
317 315 * the four adjusted alignment cases. The CHKSIZE that was used
318 316 * to decide between short and medium size was chosen to be 39
319 317 * as that allows for the worst case of 7 bytes of alignment
320 318 * shift and 4 times 8 bytes for the first long word unrolling.
321 319 * That knowledge saves an initial test for length on entry into
322 320 * the medium cases. If the general loop unrolling factor were
323 321 * to be increased, this number would also need to be adjusted.
324 322 *
325 323 * For all cases in the non-FPBLK code where it is known that at
326 324 * least 4 chunks of data are available for movement, the
327 325 * loop is unrolled by four. This 4-way loop runs in 8 clocks
328 326 * or 2 clocks per data element.
329 327 *
330 328 * Instruction alignment is forced by use of .align 16 directives
331 329 * and nops which are not executed in the code. This
332 330 * combination of operations shifts the alignment of following
333 331 * loops to ensure that loops are aligned so that their instructions
334 332 * fall within the minimum number of 4-instruction fetch groups.
335 333 * If instructions are inserted or removed between the .align
336 334 * instruction and the unrolled loops, then the alignment needs
337 335 * to be readjusted. Misaligned loops can add a clock per loop
338 336 * iteration to the loop timing.
339 337 *
340 338 * In a few cases, code is duplicated to avoid a branch. Since
341 339 * a non-predicted tst and branch takes 10 clocks, this savings
342 340 * is judged an appropriate time-space tradeoff.
343 341 *
344 342 * Within the FPBLK-code, the prefetch method in the inner
345 343 * loop needs to be explained as it is not standard. Two
346 344 * prefetches are issued for each cache line instead of one.
347 345 * The primary one is at the maximum reach of 8 cache lines.
348 346 * Most of the time, that maximum prefetch reach gives the
349 347 * cache line more time to reach the processor for systems with
350 348 * higher processor clocks. But, sometimes memory interference
351 349 * can cause that prefetch to be dropped. Putting a second
352 350 * prefetch at a reach of 5 cache lines catches the drops
353 351 * three iterations later and shows a measured improvement
354 352 * in performance over any similar loop with a single prefetch.
355 353 * The prefetches are placed in the loop so they overlap with
356 354 * non-memory instructions, so that there is no extra cost
357 355 * when the data is already in-cache.
358 356 *
359 357 */
360 358
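The dual-prefetch pattern described above can be sketched in C, assuming GCC's __builtin_prefetch and a hypothetical copy_line() that moves one 64-byte line; the 8- and 5-line reaches are the ones discussed in the comment:

	#include <stddef.h>

	#define	LINE	64	/* VIS_BLOCKSIZE */

	extern void copy_line(char *dst, const char *src);	/* hypothetical */

	static void
	prefetching_copy(char *dst, const char *src, size_t cnt)
	{
		for (; cnt >= LINE; src += LINE, dst += LINE, cnt -= LINE) {
			__builtin_prefetch(src + 8 * LINE, 0, 0); /* primary */
			__builtin_prefetch(src + 5 * LINE, 0, 0); /* recovers drops */
			copy_line(dst, src);	/* overlaps with the prefetches */
		}
	}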
361 359 /*
362 360 * Notes on preserving existing fp state and on membars.
363 361 *
364 362 * When a copyOP decides to use fp we may have to preserve existing
365 363 * floating point state. It is not the caller's state that we need to
366 364 * preserve - the rest of the kernel does not use fp and, anyway, fp
367 365 * registers are volatile across a call. Some examples:
368 366 *
369 367 * - userland has fp state and is interrupted (device interrupt
370 368 * or trap) and within the interrupt/trap handling we use
371 369 * bcopy()
372 370 * - another (higher level) interrupt or trap handler uses bcopy
373 371 * while a bcopy from an earlier interrupt is still active
374 372 * - an asynchronous error trap occurs while fp state exists (in
375 373 * userland or in kernel copy) and the tl0 component of the handling
376 374 * uses bcopy
377 375 * - a user process with fp state incurs a copy-on-write fault and
378 376 * hwblkpagecopy always uses fp
379 377 *
380 378 * We therefore need a per-call place in which to preserve fp state -
381 379 * using our stack is ideal (and since fp copy cannot be leaf optimized
382 380 * because of calls it makes, this is no hardship).
383 381 *
384 382 * When we have finished fp copy (with its repeated block stores)
385 383 * we must membar #Sync so that our block stores may complete before
386 384 * we either restore the original fp state into the fp registers or
387 385 * return to a caller which may initiate other fp operations that could
388 386 * modify the fp regs we used before the block stores complete.
389 387 *
390 388 * Synchronous faults (eg, unresolvable DMMU miss) that occur while
391 389 * t_lofault is not NULL will not panic but will instead trampoline
392 390 * to the registered lofault handler. There is no need for any
393 391 * membars for these - eg, our store to t_lofault will always be visible to
394 392 * ourselves and it is our cpu which will take any trap.
395 393 *
396 394 * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
397 395 * while t_lofault is not NULL will also not panic. Since we're copying
398 396 * to or from userland the extent of the damage is known - the destination
399 397 * buffer is incomplete. So trap handlers will trampoline to the lofault
400 398 * handler in this case which should take some form of error action to
401 399 * avoid using the incomplete buffer. The trap handler also flags the
402 400 * fault so that later return-from-trap handling (for the trap that brought
403 401 * this thread into the kernel in the first place) can notify the process
404 402 * and reboot the system (or restart the service with Greenline/Contracts).
405 403 *
406 404 * Asynchronous faults (eg, uncorrectable ECC error from memory) can
407 405 * result in deferred error traps - the trap is taken sometime after
408 406 * the event and the trap PC may not be the PC of the faulting access.
409 407 * Delivery of such pending traps can be forced by a membar #Sync, acting
410 408 * as an "error barrier" in this role. To accurately apply the user/kernel
411 409 * separation described in the preceding paragraph we must force delivery
412 410 * of deferred traps affecting kernel state before we install a lofault
413 411 * handler (if we interpose a new lofault handler on an existing one there
414 412 * is no need to repeat this), and we must force delivery of deferred
415 413 * errors affecting the lofault-protected region before we clear t_lofault.
416 414 * Failure to do so results in lost kernel state being interpreted as
417 415 * affecting a copyin/copyout only, or in an error that really only
418 416 * affects copy data being interpreted as losing kernel state.
419 417 *
420 418 * Since the copy operations may preserve and later restore floating
421 419 * point state that does not belong to the caller (see examples above),
422 420 * we must be careful in how we do this in order to prevent corruption
423 421 * of another program.
424 422 *
425 423 * To make sure that floating point state is always saved and restored
426 424 * correctly, the following "big rules" must be followed when the floating
427 425 * point registers will be used:
428 426 *
429 427 * 1. %l6 always holds the caller's lofault handler. Also in this register,
430 428 * Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
431 429 * use. Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a
432 430 * lofault handler was set coming in.
433 431 *
434 432 * 2. The FPUSED flag indicates that all FP state has been successfully stored
435 433 * on the stack. It should not be set until this save has been completed.
436 434 *
437 435 * 3. The FPUSED flag should not be cleared on exit until all FP state has
438 436 * been restored from the stack. If an error occurs while restoring
439 437 * data from the stack, the error handler can check this flag to see if
440 438 * a restore is necessary.
441 439 *
442 440 * 4. Code run under the new lofault handler must be kept to a minimum. In
443 441 * particular, any calls to FP_ALLOWMIGRATE, which could result in a call
444 442 * to kpreempt(), should not be made until after the lofault handler has
445 443 * been restored.
446 444 */
447 445
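The barrier discipline above, restated as a hedged C-style sketch (membar_sync(), install_lofault(), clear_lofault(), and fp_block_copy() are stand-ins for the assembly sequences that follow):

	extern void membar_sync(void);
	extern void install_lofault(void (*handler)(void));	/* hypothetical */
	extern void clear_lofault(void);			/* hypothetical */
	extern void fp_block_copy(void);			/* hypothetical */

	static void
	lofault_protected_copy(void (*handler)(void))
	{
		/*
		 * Error barrier: force delivery of deferred traps affecting
		 * kernel state before interposing our handler (unnecessary
		 * when interposing on an existing lofault handler).
		 */
		membar_sync();
		install_lofault(handler);

		fp_block_copy();	/* repeated block stores */

		/*
		 * Error barrier again: let the block stores complete before
		 * the fp regs are restored or reused, and force delivery of
		 * deferred errors hitting the protected region before
		 * t_lofault is cleared.
		 */
		membar_sync();
		clear_lofault();
	}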
448 446 /*
449 447 * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
450 448 * to "break even" using FP/VIS-accelerated memory operations.
451 449 * The FPBLK code assumes a minimum number of bytes are available
452 450 * to be moved on entry. Check that code carefully before
453 451 * reducing VIS_COPY_THRESHOLD below 256.
454 452 */
455 453 /*
456 454 * This shadows sys/machsystm.h, which can't be included due to the lack of
457 455 * _ASM guards in include files it references. Change it here, change it there.
458 456 */
459 457 #define VIS_COPY_THRESHOLD 256
460 458
461 459 /*
462 460 * TEST for very short copies
463 461 * Be aware that the maximum unroll for the short unaligned case
464 462 * is SHORTCOPY+1
465 463 */
466 464 #define SHORTCOPY 3
467 465 #define CHKSIZE 39
468 466
469 467 /*
470 468 * Indicates that we're to trampoline to the error handler.
471 469 * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
472 470 * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
473 471 */
474 472 #define FPUSED_FLAG 1
475 473 #define TRAMP_FLAG 2
476 474 #define MASK_FLAGS 3
477 475
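Because lofault handlers are at least word aligned, these flags ride in the two low bits of the saved handler value (%l6 or %o4). A minimal C sketch of the encoding, with hypothetical helper names:

	#include <stdint.h>

	#define	FPUSED_FLAG	1
	#define	TRAMP_FLAG	2
	#define	MASK_FLAGS	3

	typedef void (*lofault_t)(void);

	static uintptr_t
	tag_lofault(lofault_t h, unsigned flags)	/* save side */
	{
		return ((uintptr_t)h | (flags & MASK_FLAGS));
	}

	static lofault_t
	untag_lofault(uintptr_t saved)	/* mirrors "andn %l6, MASK_FLAGS" */
	{
		return ((lofault_t)(saved & ~(uintptr_t)MASK_FLAGS));
	}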
478 476 /*
479 477 * Number of outstanding prefetches.
480 478 * first prefetch moves data from L2 to L1 (n_reads)
481 479 * second prefetch moves data from memory to L2 (one_read)
482 480 */
483 481 #define OLYMPUS_C_PREFETCH 24
484 482 #define OLYMPUS_C_2ND_PREFETCH 12
485 483
486 484 #define VIS_BLOCKSIZE 64
487 485
488 486 /*
489 487 * Size of stack frame in order to accommodate a 64-byte aligned
490 488 * floating-point register save area and 2 64-bit temp locations.
491 489 * All copy functions use two quadrants of fp registers; to assure a
492 490 * block-aligned two-block buffer in which to save, we must reserve
493 491 * three blocks on stack. Not all functions preserve %fprs on stack
494 492 * or need to preserve %gsr, but we use HWCOPYFRAMESIZE for all.
495 493 *
496 494 * _______________________________________ <-- %fp + STACK_BIAS
497 495 * | We may need to preserve 2 quadrants |
498 496 * | of fp regs, but since we do so with |
499 497 * | BST/BLD we need room in which to |
500 498 * | align to VIS_BLOCKSIZE bytes. So |
501 499 * | this area is 3 * VIS_BLOCKSIZE. | <-- - SAVED_FPREGS_OFFSET
502 500 * |-------------------------------------|
503 501 * | 8 bytes to save %fprs | <-- - SAVED_FPRS_OFFSET
504 502 * |-------------------------------------|
505 503 * | 8 bytes to save %gsr | <-- - SAVED_GSR_OFFSET
506 504 * ---------------------------------------
507 505 */
508 506 #define HWCOPYFRAMESIZE ((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
509 507 #define SAVED_FPREGS_OFFSET (VIS_BLOCKSIZE * 3)
510 508 #define SAVED_FPREGS_ADJUST ((VIS_BLOCKSIZE * 2) - 1)
511 509 #define SAVED_FPRS_OFFSET (SAVED_FPREGS_OFFSET + 8)
512 510 #define SAVED_GSR_OFFSET (SAVED_FPRS_OFFSET + 8)
513 511
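The layout arithmetic can be sanity-checked at compile time; a sketch using C11 static_assert with the values mirrored from the #defines above:

	#include <assert.h>

	#define	VIS_BLOCKSIZE		64
	#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
	#define	SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 3)
	#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
	#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)

	static_assert(HWCOPYFRAMESIZE == 208,
	    "three 64-byte blocks plus two 8-byte temps");
	static_assert(SAVED_GSR_OFFSET == HWCOPYFRAMESIZE,
	    "%gsr slot is the deepest item below %fp + STACK_BIAS");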
514 512 /*
515 513 * Common macros used by the various versions of the block copy
516 514 * routines in this file.
517 515 */
518 516
519 517 /*
520 518 * In FP copies if we do not have preserved data to restore over
521 519 * the fp regs we used then we must zero those regs to avoid
522 520 * exposing portions of the data to later threads (data security).
523 521 *
524 522 * Copy functions use either quadrants 1 and 3 or 2 and 4.
525 523 *
526 524 * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
527 525 * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
528 526 *
529 527 * The instructions below are quicker than repeated fzero instructions
530 528 * since they can dispatch down two fp pipelines.
531 529 */
532 530 #define FZEROQ1Q3 \
533 531 fzero %f0 ;\
534 532 fmovd %f0, %f2 ;\
535 533 fmovd %f0, %f4 ;\
536 534 fmovd %f0, %f6 ;\
537 535 fmovd %f0, %f8 ;\
538 536 fmovd %f0, %f10 ;\
539 537 fmovd %f0, %f12 ;\
540 538 fmovd %f0, %f14 ;\
541 539 fmovd %f0, %f32 ;\
542 540 fmovd %f0, %f34 ;\
543 541 fmovd %f0, %f36 ;\
544 542 fmovd %f0, %f38 ;\
545 543 fmovd %f0, %f40 ;\
546 544 fmovd %f0, %f42 ;\
547 545 fmovd %f0, %f44 ;\
548 546 fmovd %f0, %f46
549 547
550 548 #define FZEROQ2Q4 \
551 549 fzero %f16 ;\
552 550 fmovd %f0, %f18 ;\
553 551 fmovd %f0, %f20 ;\
554 552 fmovd %f0, %f22 ;\
555 553 fmovd %f0, %f24 ;\
556 554 fmovd %f0, %f26 ;\
557 555 fmovd %f0, %f28 ;\
558 556 fmovd %f0, %f30 ;\
559 557 fmovd %f0, %f48 ;\
560 558 fmovd %f0, %f50 ;\
561 559 fmovd %f0, %f52 ;\
562 560 fmovd %f0, %f54 ;\
563 561 fmovd %f0, %f56 ;\
564 562 fmovd %f0, %f58 ;\
565 563 fmovd %f0, %f60 ;\
566 564 fmovd %f0, %f62
567 565
568 566 /*
569 567 * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
570 568 * Used to save and restore in-use fp registers when we want to use FP
571 569 * and find fp already in use and copy size still large enough to justify
572 570 * the additional overhead of this save and restore.
573 571 *
574 572 * A membar #Sync is needed before save to sync fp ops initiated before
575 573 * the call to the copy function (by whoever has fp in use); for example,
576 574 * an earlier block load to the quadrant we are about to save may still be
577 575 * "in flight". A membar #Sync is required at the end of the save to
578 576 * sync our block store (the copy code is about to begin ldd's to the
579 577 * first quadrant).
580 578 *
581 579 * Similarly: a membar #Sync before restore allows the block stores of
582 580 * the copy operation to complete before we fill the quadrants with their
583 581 * original data, and a membar #Sync after restore lets the block loads
584 582 * of the restore complete before we return to whoever has the fp regs
585 583 * in use. To avoid repeated membar #Sync we make it the responsibility
586 584 * of the copy code to membar #Sync immediately after copy is complete
587 585 * and before using the BLD_*_FROMSTACK macro.
588 586 */
589 -#if !defined(lint)
590 587 #define BST_FPQ1Q3_TOSTACK(tmp1) \
591 588 /* membar #Sync */ ;\
592 589 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
593 590 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
594 591 stda %f0, [tmp1]ASI_BLK_P ;\
595 592 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
596 593 stda %f32, [tmp1]ASI_BLK_P ;\
597 594 membar #Sync
598 595
599 596 #define BLD_FPQ1Q3_FROMSTACK(tmp1) \
600 597 /* membar #Sync - provided at copy completion */ ;\
601 598 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
602 599 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
603 600 ldda [tmp1]ASI_BLK_P, %f0 ;\
604 601 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
605 602 ldda [tmp1]ASI_BLK_P, %f32 ;\
606 603 membar #Sync
607 604
608 605 #define BST_FPQ2Q4_TOSTACK(tmp1) \
609 606 /* membar #Sync */ ;\
610 607 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
611 608 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
612 609 stda %f16, [tmp1]ASI_BLK_P ;\
613 610 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
614 611 stda %f48, [tmp1]ASI_BLK_P ;\
615 612 membar #Sync
616 613
617 614 #define BLD_FPQ2Q4_FROMSTACK(tmp1) \
618 615 /* membar #Sync - provided at copy completion */ ;\
619 616 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
620 617 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
621 618 ldda [tmp1]ASI_BLK_P, %f16 ;\
622 619 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
623 620 ldda [tmp1]ASI_BLK_P, %f48 ;\
624 621 membar #Sync
625 -#endif
626 622
627 623 /*
628 624 * FP_NOMIGRATE and FP_ALLOWMIGRATE. Prevent migration (or, stronger,
629 625 * prevent preemption if there is no t_lwp to save FP state to on context
630 626 * switch) before commencing an FP copy, and reallow it on completion or
631 627 * in error trampoline paths when we were using FP copy.
632 628 *
633 629 * Both macros may call other functions, so be aware that all outputs are
634 630 * forfeit after using these macros. For this reason we do not pass registers
635 631 * to use - we just use any outputs we want.
636 632 *
637 633 * Pseudo code:
638 634 *
639 635 * FP_NOMIGRATE:
640 636 *
641 637 * if (curthread->t_lwp) {
642 638 * thread_nomigrate();
643 639 * } else {
644 640 * kpreempt_disable();
645 641 * }
646 642 *
647 643 * FP_ALLOWMIGRATE:
648 644 *
649 645 * if (curthread->t_lwp) {
650 646 * thread_allowmigrate();
651 647 * } else {
652 648 * kpreempt_enable();
653 649 * }
654 650 */
655 651
656 652 #define FP_NOMIGRATE(label1, label2) \
657 653 ldn [THREAD_REG + T_LWP], %o0 ;\
658 654 brz,a,pn %o0, label1/**/f ;\
659 655 ldsb [THREAD_REG + T_PREEMPT], %o1 ;\
660 656 call thread_nomigrate ;\
661 657 nop ;\
662 658 ba label2/**/f ;\
663 659 nop ;\
664 660 label1: ;\
665 661 inc %o1 ;\
666 662 stb %o1, [THREAD_REG + T_PREEMPT] ;\
667 663 label2:
668 664
669 665 #define FP_ALLOWMIGRATE(label1, label2) \
670 666 ldn [THREAD_REG + T_LWP], %o0 ;\
671 667 brz,a,pn %o0, label1/**/f ;\
672 668 ldsb [THREAD_REG + T_PREEMPT], %o1 ;\
673 669 call thread_allowmigrate ;\
674 670 nop ;\
675 671 ba label2/**/f ;\
676 672 nop ;\
677 673 label1: ;\
678 674 dec %o1 ;\
679 675 brnz,pn %o1, label2/**/f ;\
680 676 stb %o1, [THREAD_REG + T_PREEMPT] ;\
681 677 ldn [THREAD_REG + T_CPU], %o0 ;\
682 678 ldub [%o0 + CPU_KPRUNRUN], %o0 ;\
683 679 brz,pt %o0, label2/**/f ;\
684 680 nop ;\
685 681 call kpreempt ;\
686 682 rdpr %pil, %o0 ;\
687 683 label2:
688 684
689 685 /*
690 686 * Copy a block of storage, returning an error code if `from' or
691 687 * `to' takes a kernel pagefault which cannot be resolved.
692 688 * Returns errno value on pagefault error, 0 if all ok
693 689 */
694 690
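Seen from C, the contract is simply a checked copy; a hedged usage sketch (the prototype matches the lint stub being removed just below):

	#include <stddef.h>

	extern int kcopy(const void *from, void *to, size_t count);

	static int
	copy_in_kernel(const void *src, void *dst, size_t len)
	{
		int err;

		if ((err = kcopy(src, dst, len)) != 0)
			return (err);	/* unresolved kernel pagefault */
		/* ... continue using dst ... */
		return (0);
	}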
695 -#if defined(lint)
696 -
697 -/* ARGSUSED */
698 -int
699 -kcopy(const void *from, void *to, size_t count)
700 -{ return(0); }
701 -
702 -#else /* lint */
703 -
704 691 .seg ".text"
705 692 .align 4
706 693
707 694 ENTRY(kcopy)
708 695
709 696 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
710 697 bleu,pt %ncc, .kcopy_small ! go to small copy cases
711 698 xor %o0, %o1, %o3 ! are src, dst alignable?
712 699 btst 7, %o3 !
713 700 bz,pt %ncc, .kcopy_8 ! check for longword alignment
714 701 nop
715 702 btst 1, %o3 !
716 703 bz,pt %ncc, .kcopy_2 ! check for half-word
717 704 nop
718 705 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
719 706 ld [%o3 + %lo(hw_copy_limit_1)], %o3
720 707 tst %o3
721 708 bz,pn %icc, .kcopy_small ! if zero, disable HW copy
722 709 cmp %o2, %o3 ! if length <= limit
723 710 bleu,pt %ncc, .kcopy_small ! go to small copy
724 711 nop
725 712 ba,pt %ncc, .kcopy_more ! otherwise go to large copy
726 713 nop
727 714 .kcopy_2:
728 715 btst 3, %o3 !
729 716 bz,pt %ncc, .kcopy_4 ! check for word alignment
730 717 nop
731 718 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
732 719 ld [%o3 + %lo(hw_copy_limit_2)], %o3
733 720 tst %o3
734 721 bz,pn %icc, .kcopy_small ! if zero, disable HW copy
735 722 cmp %o2, %o3 ! if length <= limit
736 723 bleu,pt %ncc, .kcopy_small ! go to small copy
737 724 nop
738 725 ba,pt %ncc, .kcopy_more ! otherwise go to large copy
739 726 nop
740 727 .kcopy_4:
741 728 ! already checked longword, must be word aligned
742 729 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
743 730 ld [%o3 + %lo(hw_copy_limit_4)], %o3
744 731 tst %o3
745 732 bz,pn %icc, .kcopy_small ! if zero, disable HW copy
746 733 cmp %o2, %o3 ! if length <= limit
747 734 bleu,pt %ncc, .kcopy_small ! go to small copy
748 735 nop
749 736 ba,pt %ncc, .kcopy_more ! otherwise go to large copy
750 737 nop
751 738 .kcopy_8:
752 739 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
753 740 ld [%o3 + %lo(hw_copy_limit_8)], %o3
754 741 tst %o3
755 742 bz,pn %icc, .kcopy_small ! if zero, disable HW copy
756 743 cmp %o2, %o3 ! if length <= limit
757 744 bleu,pt %ncc, .kcopy_small ! go to small copy
758 745 nop
759 746 ba,pt %ncc, .kcopy_more ! otherwise go to large copy
760 747 nop
761 748
762 749 .kcopy_small:
763 750 sethi %hi(.sm_copyerr), %o5 ! sm_copyerr is lofault value
764 751 or %o5, %lo(.sm_copyerr), %o5
765 752 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler
766 753 membar #Sync ! sync error barrier
767 754 ba,pt %ncc, .sm_do_copy ! common code
768 755 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault
769 756
770 757 .kcopy_more:
771 758 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
772 759 sethi %hi(.copyerr), %l7 ! copyerr is lofault value
773 760 or %l7, %lo(.copyerr), %l7
774 761 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler
775 762 membar #Sync ! sync error barrier
776 763 ba,pt %ncc, .do_copy ! common code
777 764 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
778 765
779 766
780 767 /*
781 768 * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
782 769 * Errno value is in %g1. bcopy_more uses fp quadrants 1 and 3.
783 770 */
784 771 .copyerr:
785 772 set .copyerr2, %l0
786 773 membar #Sync ! sync error barrier
787 774 stn %l0, [THREAD_REG + T_LOFAULT] ! set t_lofault
788 775 btst FPUSED_FLAG, %l6
789 776 bz %ncc, 1f
790 777 and %l6, TRAMP_FLAG, %l0 ! copy trampoline flag to %l0
791 778
792 779 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr
793 780 wr %o2, 0, %gsr
794 781
795 782 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
796 783 btst FPRS_FEF, %o3
797 784 bz,pt %icc, 4f
798 785 nop
799 786
800 787 BLD_FPQ1Q3_FROMSTACK(%o2)
801 788
802 789 ba,pt %ncc, 1f
803 790 wr %o3, 0, %fprs ! restore fprs
804 791
805 792 4:
806 793 FZEROQ1Q3
807 794 wr %o3, 0, %fprs ! restore fprs
808 795
809 796 !
810 797 ! Need to cater for the different expectations of kcopy
811 798 ! and bcopy. kcopy will *always* set a t_lofault handler
812 799 ! If it fires, we're expected to just return the error code
813 800 ! and *not* to invoke any existing error handler. As far as
814 801 ! bcopy is concerned, we only set t_lofault if there was an
815 802 ! existing lofault handler. In that case we're expected to
816 803 ! invoke the previously existing handler after resetting the
817 804 ! t_lofault value.
818 805 !
819 806 1:
820 807 andn %l6, MASK_FLAGS, %l6 ! turn trampoline flag off
821 808 membar #Sync ! sync error barrier
822 809 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
823 810 FP_ALLOWMIGRATE(5, 6)
824 811
825 812 btst TRAMP_FLAG, %l0
826 813 bnz,pn %ncc, 3f
827 814 nop
828 815 ret
829 816 restore %g1, 0, %o0
830 817
831 818 3:
832 819 !
833 820 ! We're here via bcopy. There *must* have been an error handler
834 821 ! in place; otherwise we would have died a nasty death already.
835 822 !
836 823 jmp %l6 ! goto real handler
837 824 restore %g0, 0, %o0 ! dispose of copy window
838 825
839 826 /*
840 827 * We got here because of a fault in .copyerr. We can't safely restore fp
841 828 * state, so we panic.
842 829 */
843 830 fp_panic_msg:
844 831 .asciz "Unable to restore fp state after copy operation"
845 832
846 833 .align 4
847 834 .copyerr2:
848 835 set fp_panic_msg, %o0
849 836 call panic
850 837 nop
851 838
852 839 /*
853 840 * We got here because of a fault during a small kcopy or bcopy.
854 841 * No floating point registers are used by the small copies.
855 842 * Errno value is in %g1.
856 843 */
857 844 .sm_copyerr:
858 845 1:
859 846 btst TRAMP_FLAG, %o4
860 847 membar #Sync
861 848 andn %o4, TRAMP_FLAG, %o4
862 849 bnz,pn %ncc, 3f
863 850 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
864 851 retl
865 852 mov %g1, %o0
866 853 3:
867 854 jmp %o4 ! goto real handler
868 855 mov %g0, %o0 !
869 856
870 857 SET_SIZE(kcopy)
871 -#endif /* lint */
872 858
873 859
874 860 /*
875 861 * Copy a block of storage - must not overlap (from + len <= to).
876 862 * Registers: l6 - saved t_lofault
877 863 * (for short copies, o4 - saved t_lofault)
878 864 *
879 865 * Copy a page of memory.
880 866 * Assumes double word alignment and a count >= 256.
881 867 */
882 -#if defined(lint)
883 868
884 -/* ARGSUSED */
885 -void
886 -bcopy(const void *from, void *to, size_t count)
887 -{}
888 -
889 -#else /* lint */
890 -
891 869 ENTRY(bcopy)
892 870
893 871 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
894 872 bleu,pt %ncc, .bcopy_small ! go to small copy cases
895 873 xor %o0, %o1, %o3 ! are src, dst alignable?
896 874 btst 7, %o3 !
897 875 bz,pt %ncc, .bcopy_8 ! check for longword alignment
898 876 nop
899 877 btst 1, %o3 !
900 878 bz,pt %ncc, .bcopy_2 ! check for half-word
901 879 nop
902 880 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
903 881 ld [%o3 + %lo(hw_copy_limit_1)], %o3
904 882 tst %o3
905 883 bz,pn %icc, .bcopy_small ! if zero, disable HW copy
906 884 cmp %o2, %o3 ! if length <= limit
907 885 bleu,pt %ncc, .bcopy_small ! go to small copy
908 886 nop
909 887 ba,pt %ncc, .bcopy_more ! otherwise go to large copy
910 888 nop
911 889 .bcopy_2:
912 890 btst 3, %o3 !
913 891 bz,pt %ncc, .bcopy_4 ! check for word alignment
914 892 nop
915 893 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
916 894 ld [%o3 + %lo(hw_copy_limit_2)], %o3
917 895 tst %o3
918 896 bz,pn %icc, .bcopy_small ! if zero, disable HW copy
919 897 cmp %o2, %o3 ! if length <= limit
920 898 bleu,pt %ncc, .bcopy_small ! go to small copy
921 899 nop
922 900 ba,pt %ncc, .bcopy_more ! otherwise go to large copy
923 901 nop
924 902 .bcopy_4:
925 903 ! already checked longword, must be word aligned
926 904 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
927 905 ld [%o3 + %lo(hw_copy_limit_4)], %o3
928 906 tst %o3
929 907 bz,pn %icc, .bcopy_small ! if zero, disable HW copy
930 908 cmp %o2, %o3 ! if length <= limit
931 909 bleu,pt %ncc, .bcopy_small ! go to small copy
932 910 nop
933 911 ba,pt %ncc, .bcopy_more ! otherwise go to large copy
934 912 nop
935 913 .bcopy_8:
936 914 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
937 915 ld [%o3 + %lo(hw_copy_limit_8)], %o3
938 916 tst %o3
939 917 bz,pn %icc, .bcopy_small ! if zero, disable HW copy
940 918 cmp %o2, %o3 ! if length <= limit
941 919 bleu,pt %ncc, .bcopy_small ! go to small copy
942 920 nop
943 921 ba,pt %ncc, .bcopy_more ! otherwise go to large copy
944 922 nop
945 923
946 924 .align 16
947 925 .bcopy_small:
948 926 ldn [THREAD_REG + T_LOFAULT], %o4 ! save t_lofault
949 927 tst %o4
950 928 bz,pt %icc, .sm_do_copy
951 929 nop
952 930 sethi %hi(.sm_copyerr), %o5
953 931 or %o5, %lo(.sm_copyerr), %o5
954 932 membar #Sync ! sync error barrier
955 933 stn %o5, [THREAD_REG + T_LOFAULT] ! install new vector
956 934 or %o4, TRAMP_FLAG, %o4 ! error should trampoline
957 935 .sm_do_copy:
958 936 cmp %o2, SHORTCOPY ! check for really short case
959 937 bleu,pt %ncc, .bc_sm_left !
960 938 cmp %o2, CHKSIZE ! check for medium length cases
961 939 bgu,pn %ncc, .bc_med !
962 940 or %o0, %o1, %o3 ! prepare alignment check
963 941 andcc %o3, 0x3, %g0 ! test for alignment
964 942 bz,pt %ncc, .bc_sm_word ! branch to word aligned case
965 943 .bc_sm_movebytes:
966 944 sub %o2, 3, %o2 ! adjust count to allow cc zero test
967 945 .bc_sm_notalign4:
968 946 ldub [%o0], %o3 ! read byte
969 947 stb %o3, [%o1] ! write byte
970 948 subcc %o2, 4, %o2 ! reduce count by 4
971 949 ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes
972 950 add %o0, 4, %o0 ! advance SRC by 4
973 951 stb %o3, [%o1 + 1]
974 952 ldub [%o0 - 2], %o3
975 953 add %o1, 4, %o1 ! advance DST by 4
976 954 stb %o3, [%o1 - 2]
977 955 ldub [%o0 - 1], %o3
978 956 bgt,pt %ncc, .bc_sm_notalign4 ! loop til 3 or fewer bytes remain
979 957 stb %o3, [%o1 - 1]
980 958 add %o2, 3, %o2 ! restore count
981 959 .bc_sm_left:
982 960 tst %o2
983 961 bz,pt %ncc, .bc_sm_exit ! check for zero length
984 962 deccc %o2 ! reduce count for cc test
985 963 ldub [%o0], %o3 ! move one byte
986 964 bz,pt %ncc, .bc_sm_exit
987 965 stb %o3, [%o1]
988 966 ldub [%o0 + 1], %o3 ! move another byte
989 967 deccc %o2 ! check for more
990 968 bz,pt %ncc, .bc_sm_exit
991 969 stb %o3, [%o1 + 1]
992 970 ldub [%o0 + 2], %o3 ! move final byte
993 971 ba,pt %ncc, .bc_sm_exit
994 972 stb %o3, [%o1 + 2]
995 973 .align 16
996 974 nop ! instruction alignment
997 975 ! see discussion at start of file
998 976 .bc_sm_words:
999 977 lduw [%o0], %o3 ! read word
1000 978 .bc_sm_wordx:
1001 979 subcc %o2, 8, %o2 ! update count
1002 980 stw %o3, [%o1] ! write word
1003 981 add %o0, 8, %o0 ! update SRC
1004 982 lduw [%o0 - 4], %o3 ! read word
1005 983 add %o1, 8, %o1 ! update DST
1006 984 bgt,pt %ncc, .bc_sm_words ! loop til done
1007 985 stw %o3, [%o1 - 4] ! write word
1008 986 addcc %o2, 7, %o2 ! restore count
1009 987 bz,pt %ncc, .bc_sm_exit
1010 988 deccc %o2
1011 989 bz,pt %ncc, .bc_sm_byte
1012 990 .bc_sm_half:
1013 991 subcc %o2, 2, %o2 ! reduce count by 2
1014 992 add %o0, 2, %o0 ! advance SRC by 2
1015 993 lduh [%o0 - 2], %o3 ! read half word
1016 994 add %o1, 2, %o1 ! advance DST by 2
1017 995 bgt,pt %ncc, .bc_sm_half ! loop til done
1018 996 sth %o3, [%o1 - 2] ! write half word
1019 997 addcc %o2, 1, %o2 ! restore count
1020 998 bz,pt %ncc, .bc_sm_exit
1021 999 nop
1022 1000 .bc_sm_byte:
1023 1001 ldub [%o0], %o3
1024 1002 ba,pt %ncc, .bc_sm_exit
1025 1003 stb %o3, [%o1]
1026 1004
1027 1005 .bc_sm_word:
1028 1006 subcc %o2, 4, %o2 ! update count
1029 1007 bgt,pt %ncc, .bc_sm_wordx
1030 1008 lduw [%o0], %o3 ! read word
1031 1009 addcc %o2, 3, %o2 ! restore count
1032 1010 bz,pt %ncc, .bc_sm_exit
1033 1011 stw %o3, [%o1] ! write word
1034 1012 deccc %o2 ! reduce count for cc test
1035 1013 ldub [%o0 + 4], %o3 ! load one byte
1036 1014 bz,pt %ncc, .bc_sm_exit
1037 1015 stb %o3, [%o1 + 4] ! store one byte
1038 1016 ldub [%o0 + 5], %o3 ! load second byte
1039 1017 deccc %o2
1040 1018 bz,pt %ncc, .bc_sm_exit
1041 1019 stb %o3, [%o1 + 5] ! store second byte
1042 1020 ldub [%o0 + 6], %o3 ! load third byte
1043 1021 stb %o3, [%o1 + 6] ! store third byte
1044 1022 .bc_sm_exit:
1045 1023 ldn [THREAD_REG + T_LOFAULT], %o3
1046 1024 brz,pt %o3, .bc_sm_done
1047 1025 nop
1048 1026 membar #Sync ! sync error barrier
1049 1027 andn %o4, TRAMP_FLAG, %o4
1050 1028 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1051 1029 .bc_sm_done:
1052 1030 retl
1053 1031 mov %g0, %o0 ! return 0
1054 1032
1055 1033 .align 16
1056 1034 .bc_med:
1057 1035 xor %o0, %o1, %o3 ! setup alignment check
1058 1036 btst 1, %o3
1059 1037 bnz,pt %ncc, .bc_sm_movebytes ! unaligned
1060 1038 nop
1061 1039 btst 3, %o3
1062 1040 bnz,pt %ncc, .bc_med_half ! halfword aligned
1063 1041 nop
1064 1042 btst 7, %o3
1065 1043 bnz,pt %ncc, .bc_med_word ! word aligned
1066 1044 nop
1067 1045 .bc_med_long:
1068 1046 btst 3, %o0 ! check for
1069 1047 bz,pt %ncc, .bc_med_long1 ! word alignment
1070 1048 nop
1071 1049 .bc_med_long0:
1072 1050 ldub [%o0], %o3 ! load one byte
1073 1051 inc %o0
1074 1052 stb %o3,[%o1] ! store byte
1075 1053 inc %o1
1076 1054 btst 3, %o0
1077 1055 bnz,pt %ncc, .bc_med_long0
1078 1056 dec %o2
1079 1057 .bc_med_long1: ! word aligned
1080 1058 btst 7, %o0 ! check for long word
1081 1059 bz,pt %ncc, .bc_med_long2
1082 1060 nop
1083 1061 lduw [%o0], %o3 ! load word
1084 1062 add %o0, 4, %o0 ! advance SRC by 4
1085 1063 stw %o3, [%o1] ! store word
1086 1064 add %o1, 4, %o1 ! advance DST by 4
1087 1065 sub %o2, 4, %o2 ! reduce count by 4
1088 1066 !
1089 1067 ! Now long word aligned and have at least 32 bytes to move
1090 1068 !
1091 1069 .bc_med_long2:
1092 1070 sub %o2, 31, %o2 ! adjust count to allow cc zero test
1093 1071 .bc_med_lmove:
1094 1072 ldx [%o0], %o3 ! read long word
1095 1073 stx %o3, [%o1] ! write long word
1096 1074 subcc %o2, 32, %o2 ! reduce count by 32
1097 1075 ldx [%o0 + 8], %o3 ! repeat for a total of 4 long words
1098 1076 add %o0, 32, %o0 ! advance SRC by 32
1099 1077 stx %o3, [%o1 + 8]
1100 1078 ldx [%o0 - 16], %o3
1101 1079 add %o1, 32, %o1 ! advance DST by 32
1102 1080 stx %o3, [%o1 - 16]
1103 1081 ldx [%o0 - 8], %o3
1104 1082 bgt,pt %ncc, .bc_med_lmove ! loop til 31 or fewer bytes left
1105 1083 stx %o3, [%o1 - 8]
1106 1084 addcc %o2, 24, %o2 ! restore count to long word offset
1107 1085 ble,pt %ncc, .bc_med_lextra ! check for more long words to move
1108 1086 nop
1109 1087 .bc_med_lword:
1110 1088 ldx [%o0], %o3 ! read long word
1111 1089 subcc %o2, 8, %o2 ! reduce count by 8
1112 1090 stx %o3, [%o1] ! write long word
1113 1091 add %o0, 8, %o0 ! advance SRC by 8
1114 1092 bgt,pt %ncc, .bc_med_lword ! loop til 7 or fewer bytes left
1115 1093 add %o1, 8, %o1 ! advance DST by 8
1116 1094 .bc_med_lextra:
1117 1095 addcc %o2, 7, %o2 ! restore rest of count
1118 1096 bz,pt %ncc, .bc_sm_exit ! if zero, then done
1119 1097 deccc %o2
1120 1098 bz,pt %ncc, .bc_sm_byte
1121 1099 nop
1122 1100 ba,pt %ncc, .bc_sm_half
1123 1101 nop
1124 1102
1125 1103 .align 16
1126 1104 .bc_med_word:
1127 1105 btst 3, %o0 ! check for
1128 1106 bz,pt %ncc, .bc_med_word1 ! word alignment
1129 1107 nop
1130 1108 .bc_med_word0:
1131 1109 ldub [%o0], %o3 ! load one byte
1132 1110 inc %o0
1133 1111 stb %o3,[%o1] ! store byte
1134 1112 inc %o1
1135 1113 btst 3, %o0
1136 1114 bnz,pt %ncc, .bc_med_word0
1137 1115 dec %o2
1138 1116 !
1139 1117 ! Now word aligned and have at least 36 bytes to move
1140 1118 !
1141 1119 .bc_med_word1:
1142 1120 sub %o2, 15, %o2 ! adjust count to allow cc zero test
1143 1121 .bc_med_wmove:
1144 1122 lduw [%o0], %o3 ! read word
1145 1123 stw %o3, [%o1] ! write word
1146 1124 subcc %o2, 16, %o2 ! reduce count by 16
1147 1125 lduw [%o0 + 4], %o3 ! repeat for a total of 4 words
1148 1126 add %o0, 16, %o0 ! advance SRC by 16
1149 1127 stw %o3, [%o1 + 4]
1150 1128 lduw [%o0 - 8], %o3
1151 1129 add %o1, 16, %o1 ! advance DST by 16
1152 1130 stw %o3, [%o1 - 8]
1153 1131 lduw [%o0 - 4], %o3
1154 1132 bgt,pt %ncc, .bc_med_wmove ! loop til 15 or fewer bytes left
1155 1133 stw %o3, [%o1 - 4]
1156 1134 addcc %o2, 12, %o2 ! restore count to word offset
1157 1135 ble,pt %ncc, .bc_med_wextra ! check for more words to move
1158 1136 nop
1159 1137 .bc_med_word2:
1160 1138 lduw [%o0], %o3 ! read word
1161 1139 subcc %o2, 4, %o2 ! reduce count by 4
1162 1140 stw %o3, [%o1] ! write word
1163 1141 add %o0, 4, %o0 ! advance SRC by 4
1164 1142 bgt,pt %ncc, .bc_med_word2 ! loop til 3 or fewer bytes left
1165 1143 add %o1, 4, %o1 ! advance DST by 4
1166 1144 .bc_med_wextra:
1167 1145 addcc %o2, 3, %o2 ! restore rest of count
1168 1146 bz,pt %ncc, .bc_sm_exit ! if zero, then done
1169 1147 deccc %o2
1170 1148 bz,pt %ncc, .bc_sm_byte
1171 1149 nop
1172 1150 ba,pt %ncc, .bc_sm_half
1173 1151 nop
1174 1152
1175 1153 .align 16
1176 1154 .bc_med_half:
1177 1155 btst 1, %o0 ! check for
1178 1156 bz,pt %ncc, .bc_med_half1 ! half word alignment
1179 1157 nop
1180 1158 ldub [%o0], %o3 ! load one byte
1181 1159 inc %o0
1182 1160 stb %o3,[%o1] ! store byte
1183 1161 inc %o1
1184 1162 dec %o2
1185 1163 !
1186 1164 ! Now half word aligned and have at least 38 bytes to move
1187 1165 !
1188 1166 .bc_med_half1:
1189 1167 sub %o2, 7, %o2 ! adjust count to allow cc zero test
1190 1168 .bc_med_hmove:
1191 1169 lduh [%o0], %o3 ! read half word
1192 1170 sth %o3, [%o1] ! write half word
1193 1171 subcc %o2, 8, %o2 ! reduce count by 8
1194 1172 lduh [%o0 + 2], %o3 ! repeat for a total of 4 halfwords
1195 1173 add %o0, 8, %o0 ! advance SRC by 8
1196 1174 sth %o3, [%o1 + 2]
1197 1175 lduh [%o0 - 4], %o3
1198 1176 add %o1, 8, %o1 ! advance DST by 8
1199 1177 sth %o3, [%o1 - 4]
1200 1178 lduh [%o0 - 2], %o3
1201 1179 bgt,pt %ncc, .bc_med_hmove ! loop til 7 or fewer bytes left
1202 1180 sth %o3, [%o1 - 2]
1203 1181 addcc %o2, 7, %o2 ! restore count
1204 1182 bz,pt %ncc, .bc_sm_exit
1205 1183 deccc %o2
1206 1184 bz,pt %ncc, .bc_sm_byte
1207 1185 nop
1208 1186 ba,pt %ncc, .bc_sm_half
1209 1187 nop
1210 1188
1211 1189 SET_SIZE(bcopy)
1212 1190
1213 1191 /*
1214 1192 * The _more entry points are not intended to be used directly by
1215 1193 * any caller from outside this file. They are provided to allow
1216 1194 * profiling and dtrace of the portions of the copy code that use
1217 1195 * the floating point registers.
1218 1196 * This entry is particularly important as DTRACE (at least as of
1219 1197 * 4/2004) does not support leaf functions.
1220 1198 */
1221 1199
1222 1200 ENTRY(bcopy_more)
1223 1201 .bcopy_more:
1224 1202 prefetch [%o0], #n_reads
1225 1203 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1226 1204 ldn [THREAD_REG + T_LOFAULT], %l6 ! save t_lofault
1227 1205 tst %l6
1228 1206 bz,pt %ncc, .do_copy
1229 1207 nop
1230 1208 sethi %hi(.copyerr), %o2
1231 1209 or %o2, %lo(.copyerr), %o2
1232 1210 membar #Sync ! sync error barrier
1233 1211 stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector
1234 1212 !
1235 1213 ! We've already captured whether t_lofault was zero on entry.
1236 1214 ! We need to mark ourselves as being from bcopy since both
1237 1215 ! kcopy and bcopy use the same code path. If TRAMP_FLAG is set
1238 1216 ! and the saved lofault was zero, we won't reset lofault on
1239 1217 ! returning.
1240 1218 !
1241 1219 or %l6, TRAMP_FLAG, %l6
1242 1220
1243 1221 /*
1244 1222 * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes.
1245 1223 * Also, use of FP registers has been tested to be enabled
1246 1224 */
1247 1225 .do_copy:
1248 1226 FP_NOMIGRATE(6, 7)
1249 1227
1250 1228 rd %fprs, %o2 ! check for unused fp
1251 1229 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
1252 1230 btst FPRS_FEF, %o2
1253 1231 bz,a,pt %icc, .do_blockcopy
1254 1232 wr %g0, FPRS_FEF, %fprs
1255 1233
1256 1234 BST_FPQ1Q3_TOSTACK(%o2)
1257 1235
1258 1236 .do_blockcopy:
1259 1237 rd %gsr, %o2
1260 1238 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr
1261 1239 or %l6, FPUSED_FLAG, %l6
1262 1240
1263 1241 #define REALSRC %i0
1264 1242 #define DST %i1
1265 1243 #define CNT %i2
1266 1244 #define SRC %i3
1267 1245 #define TMP %i5
1268 1246
1269 1247 andcc DST, VIS_BLOCKSIZE - 1, TMP
1270 1248 bz,pt %ncc, 2f
1271 1249 neg TMP
1272 1250 add TMP, VIS_BLOCKSIZE, TMP
1273 1251
1274 1252 ! TMP = bytes required to align DST on FP_BLOCK boundary
1275 1253 ! Using SRC as a tmp here
1276 1254 cmp TMP, 3
1277 1255 bleu,pt %ncc, 1f
1278 1256 sub CNT,TMP,CNT ! adjust main count
1279 1257 sub TMP, 3, TMP ! adjust for end of loop test
1280 1258 .bc_blkalign:
1281 1259 ldub [REALSRC], SRC ! move 4 bytes per loop iteration
1282 1260 stb SRC, [DST]
1283 1261 subcc TMP, 4, TMP
1284 1262 ldub [REALSRC + 1], SRC
1285 1263 add REALSRC, 4, REALSRC
1286 1264 stb SRC, [DST + 1]
1287 1265 ldub [REALSRC - 2], SRC
1288 1266 add DST, 4, DST
1289 1267 stb SRC, [DST - 2]
1290 1268 ldub [REALSRC - 1], SRC
1291 1269 bgu,pt %ncc, .bc_blkalign
1292 1270 stb SRC, [DST - 1]
1293 1271
1294 1272 addcc TMP, 3, TMP ! restore count adjustment
1295 1273 bz,pt %ncc, 2f ! no bytes left?
1296 1274 nop
1297 1275 1: ldub [REALSRC], SRC
1298 1276 inc REALSRC
1299 1277 inc DST
1300 1278 deccc TMP
1301 1279 bgu %ncc, 1b
1302 1280 stb SRC, [DST - 1]
1303 1281
1304 1282 2:
1305 1283 membar #StoreLoad
1306 1284 andn REALSRC, 0x7, SRC
1307 1285
1308 1286 ! SRC - 8-byte aligned
1309 1287 ! DST - 64-byte aligned
1310 1288 ldd [SRC], %f0
1311 1289 prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
1312 1290 alignaddr REALSRC, %g0, %g0
1313 1291 ldd [SRC + 0x08], %f2
1314 1292 prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
1315 1293 faligndata %f0, %f2, %f32
1316 1294 ldd [SRC + 0x10], %f4
1317 1295 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1318 1296 faligndata %f2, %f4, %f34
1319 1297 ldd [SRC + 0x18], %f6
1320 1298 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1321 1299 faligndata %f4, %f6, %f36
1322 1300 ldd [SRC + 0x20], %f8
1323 1301 prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
1324 1302 faligndata %f6, %f8, %f38
1325 1303 ldd [SRC + 0x28], %f10
1326 1304 prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
1327 1305 faligndata %f8, %f10, %f40
1328 1306 ldd [SRC + 0x30], %f12
1329 1307 prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
1330 1308 faligndata %f10, %f12, %f42
1331 1309 ldd [SRC + 0x38], %f14
1332 1310 ldd [SRC + VIS_BLOCKSIZE], %f0
1333 1311 sub CNT, VIS_BLOCKSIZE, CNT
1334 1312 add SRC, VIS_BLOCKSIZE, SRC
1335 1313 prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
1336 1314 add REALSRC, VIS_BLOCKSIZE, REALSRC
1337 1315 ba,pt %ncc, 1f
1338 1316 prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
1339 1317 .align 32
1340 1318 1:
1341 1319 ldd [SRC + 0x08], %f2
1342 1320 faligndata %f12, %f14, %f44
1343 1321 ldd [SRC + 0x10], %f4
1344 1322 faligndata %f14, %f0, %f46
1345 1323 stda %f32, [DST]ASI_BLK_P
1346 1324 ldd [SRC + 0x18], %f6
1347 1325 faligndata %f0, %f2, %f32
1348 1326 ldd [SRC + 0x20], %f8
1349 1327 faligndata %f2, %f4, %f34
1350 1328 ldd [SRC + 0x28], %f10
1351 1329 faligndata %f4, %f6, %f36
1352 1330 ldd [SRC + 0x30], %f12
1353 1331 faligndata %f6, %f8, %f38
1354 1332 sub CNT, VIS_BLOCKSIZE, CNT
1355 1333 ldd [SRC + 0x38], %f14
1356 1334 faligndata %f8, %f10, %f40
1357 1335 add DST, VIS_BLOCKSIZE, DST
1358 1336 ldd [SRC + VIS_BLOCKSIZE], %f0
1359 1337 faligndata %f10, %f12, %f42
1360 1338 add REALSRC, VIS_BLOCKSIZE, REALSRC
1361 1339 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1362 1340 add SRC, VIS_BLOCKSIZE, SRC
1363 1341 prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1364 1342 cmp CNT, VIS_BLOCKSIZE + 8
1365 1343 bgu,pt %ncc, 1b
1366 1344 prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1367 1345
1368 1346 ! only if REALSRC & 0x7 is 0
1369 1347 cmp CNT, VIS_BLOCKSIZE
1370 1348 bne %ncc, 3f
1371 1349 andcc REALSRC, 0x7, %g0
1372 1350 bz,pt %ncc, 2f
1373 1351 nop
1374 1352 3:
1375 1353 faligndata %f12, %f14, %f44
1376 1354 faligndata %f14, %f0, %f46
1377 1355 stda %f32, [DST]ASI_BLK_P
1378 1356 add DST, VIS_BLOCKSIZE, DST
1379 1357 ba,pt %ncc, 3f
1380 1358 nop
1381 1359 2:
1382 1360 ldd [SRC + 0x08], %f2
1383 1361 fsrc1 %f12, %f44
1384 1362 ldd [SRC + 0x10], %f4
1385 1363 fsrc1 %f14, %f46
1386 1364 stda %f32, [DST]ASI_BLK_P
1387 1365 ldd [SRC + 0x18], %f6
1388 1366 fsrc1 %f0, %f32
1389 1367 ldd [SRC + 0x20], %f8
1390 1368 fsrc1 %f2, %f34
1391 1369 ldd [SRC + 0x28], %f10
1392 1370 fsrc1 %f4, %f36
1393 1371 ldd [SRC + 0x30], %f12
1394 1372 fsrc1 %f6, %f38
1395 1373 ldd [SRC + 0x38], %f14
1396 1374 fsrc1 %f8, %f40
1397 1375 sub CNT, VIS_BLOCKSIZE, CNT
1398 1376 add DST, VIS_BLOCKSIZE, DST
1399 1377 add SRC, VIS_BLOCKSIZE, SRC
1400 1378 add REALSRC, VIS_BLOCKSIZE, REALSRC
1401 1379 fsrc1 %f10, %f42
1402 1380 fsrc1 %f12, %f44
1403 1381 fsrc1 %f14, %f46
1404 1382 stda %f32, [DST]ASI_BLK_P
1405 1383 add DST, VIS_BLOCKSIZE, DST
1406 1384 ba,a,pt %ncc, .bcb_exit
1407 1385 nop
1408 1386
1409 1387 3: tst CNT
1410 1388 bz,a,pt %ncc, .bcb_exit
1411 1389 nop
1412 1390
1413 1391 5: ldub [REALSRC], TMP
1414 1392 inc REALSRC
1415 1393 inc DST
1416 1394 deccc CNT
1417 1395 bgu %ncc, 5b
1418 1396 stb TMP, [DST - 1]
1419 1397 .bcb_exit:
1420 1398 membar #Sync
1421 1399
1422 1400 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr
1423 1401 wr %o2, 0, %gsr
1424 1402
1425 1403 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1426 1404 btst FPRS_FEF, %o3
1427 1405 bz,pt %icc, 4f
1428 1406 nop
1429 1407
1430 1408 BLD_FPQ1Q3_FROMSTACK(%o2)
1431 1409
1432 1410 ba,pt %ncc, 2f
1433 1411 wr %o3, 0, %fprs ! restore fprs
1434 1412 4:
1435 1413 FZEROQ1Q3
1436 1414 wr %o3, 0, %fprs ! restore fprs
1437 1415 2:
1438 1416 membar #Sync ! sync error barrier
1439 1417 andn %l6, MASK_FLAGS, %l6
1440 1418 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1441 1419 FP_ALLOWMIGRATE(5, 6)
1442 1420 ret
1443 1421 restore %g0, 0, %o0
1444 1422
1445 1423 SET_SIZE(bcopy_more)
1446 1424
1447 -#endif /* lint */
1448 -
1449 1425 /*
1450 1426 * Block copy with possibly overlapped operands.
1451 1427 */
1452 1428
1453 -#if defined(lint)
1454 -
1455 -/*ARGSUSED*/
1456 -void
1457 -ovbcopy(const void *from, void *to, size_t count)
1458 -{}
1459 -
1460 -#else /* lint */
1461 -
1462 1429 ENTRY(ovbcopy)
1463 1430 tst %o2 ! check count
1464 1431 bgu,a %ncc, 1f ! nothing to do or bad arguments
1465 1432 subcc %o0, %o1, %o3 ! difference of from and to address
1466 1433
1467 1434 retl ! return
1468 1435 nop
1469 1436 1:
1470 1437 bneg,a %ncc, 2f
1471 1438 neg %o3 ! if < 0, make it positive
1472 1439 2: cmp %o2, %o3 ! cmp size and abs(from - to)
1473 1440 bleu %ncc, bcopy ! if size <= abs(diff): use bcopy,
1474 1441 .empty ! no overlap
1475 1442 cmp %o0, %o1 ! compare from and to addresses
1476 1443 blu %ncc, .ov_bkwd ! if from < to, copy backwards
1477 1444 nop
1478 1445 !
1479 1446 ! Copy forwards.
1480 1447 !
1481 1448 .ov_fwd:
1482 1449 ldub [%o0], %o3 ! read from address
1483 1450 inc %o0 ! inc from address
1484 1451 stb %o3, [%o1] ! write to address
1485 1452 deccc %o2 ! dec count
1486 1453 bgu %ncc, .ov_fwd ! loop till done
1487 1454 inc %o1 ! inc to address
1488 1455
1489 1456 retl ! return
1490 1457 nop
1491 1458 !
1492 1459 ! Copy backwards.
1493 1460 !
1494 1461 .ov_bkwd:
1495 1462 deccc %o2 ! dec count
1496 1463 ldub [%o0 + %o2], %o3 ! get byte at end of src
1497 1464 bgu %ncc, .ov_bkwd ! loop till done
1498 1465 stb %o3, [%o1 + %o2] ! delay slot, store at end of dst
1499 1466
1500 1467 retl ! return
1501 1468 nop
1502 1469
1503 1470 SET_SIZE(ovbcopy)
1504 1471
1505 -#endif /* lint */
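
    Reviewer note: the overlap dispatch in ovbcopy above condenses to a few
    lines of C. The following is a minimal sketch of the same decision tree
    (zero count, non-overlapping fast path, backward vs. forward byte loops),
    not the routine itself; the function name is hypothetical and memcpy
    stands in for the tail call to bcopy.

        #include <stddef.h>
        #include <string.h>

        /* hypothetical name; mirrors ovbcopy's dispatch, not its tuning */
        static void
        ovbcopy_sketch(const void *from, void *to, size_t count)
        {
                const char *s = from;
                char *d = to;
                size_t diff = (s > d) ? (size_t)(s - d) : (size_t)(d - s);

                if (count == 0)
                        return;
                if (count <= diff) {        /* size <= abs(from - to): no overlap */
                        memcpy(d, s, count);    /* stands in for the bcopy tail call */
                        return;
                }
                if (s < d) {                /* from < to: copy backwards, end first */
                        while (count-- > 0)
                                d[count] = s[count];
                } else {                    /* otherwise copy forwards */
                        size_t i;
                        for (i = 0; i < count; i++)
                                d[i] = s[i];
                }
        }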
1506 1472
1507 -
1508 1473 /*
1509 1474 * hwblkpagecopy()
1510 1475 *
1511 1476 * Copies exactly one page. This routine assumes the caller (ppcopy)
1512 1477 * has already disabled kernel preemption and has checked
1513 1478 * use_hw_bcopy. Preventing preemption also prevents cpu migration.
1514 1479 */
1515 -#ifdef lint
1516 -/*ARGSUSED*/
1517 -void
1518 -hwblkpagecopy(const void *src, void *dst)
1519 -{ }
1520 -#else /* lint */
1521 1480 ENTRY(hwblkpagecopy)
1522 1481 ! get another window w/space for three aligned blocks of saved fpregs
1523 1482 prefetch [%o0], #n_reads
1524 1483 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1525 1484
1526 1485 ! %i0 - source address (arg)
1527 1486 ! %i1 - destination address (arg)
1528 1487 ! %i2 - length of region (not arg)
1529 1488 ! %l0 - saved fprs
1530 1489 ! %l1 - pointer to saved fpregs
1531 1490
1532 1491 rd %fprs, %l0 ! check for unused fp
1533 1492 btst FPRS_FEF, %l0
1534 1493 bz,a,pt %icc, 1f
1535 1494 wr %g0, FPRS_FEF, %fprs
1536 1495
1537 1496 BST_FPQ1Q3_TOSTACK(%l1)
1538 1497
1539 1498 1: set PAGESIZE, CNT
1540 1499 mov REALSRC, SRC
1541 1500
1542 1501 ldd [SRC], %f0
1543 1502 prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
1544 1503 ldd [SRC + 0x08], %f2
1545 1504 prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
1546 1505 fmovd %f0, %f32
1547 1506 ldd [SRC + 0x10], %f4
1548 1507 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1549 1508 fmovd %f2, %f34
1550 1509 ldd [SRC + 0x18], %f6
1551 1510 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1552 1511 fmovd %f4, %f36
1553 1512 ldd [SRC + 0x20], %f8
1554 1513 prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
1555 1514 fmovd %f6, %f38
1556 1515 ldd [SRC + 0x28], %f10
1557 1516 prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
1558 1517 fmovd %f8, %f40
1559 1518 ldd [SRC + 0x30], %f12
1560 1519 prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
1561 1520 fmovd %f10, %f42
1562 1521 ldd [SRC + 0x38], %f14
1563 1522 ldd [SRC + VIS_BLOCKSIZE], %f0
1564 1523 sub CNT, VIS_BLOCKSIZE, CNT
1565 1524 add SRC, VIS_BLOCKSIZE, SRC
1566 1525 prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
1567 1526 ba,pt %ncc, 2f
1568 1527 prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
1569 1528 .align 32
1570 1529 2:
1571 1530 ldd [SRC + 0x08], %f2
1572 1531 fmovd %f12, %f44
1573 1532 ldd [SRC + 0x10], %f4
1574 1533 fmovd %f14, %f46
1575 1534 stda %f32, [DST]ASI_BLK_P
1576 1535 ldd [SRC + 0x18], %f6
1577 1536 fmovd %f0, %f32
1578 1537 ldd [SRC + 0x20], %f8
1579 1538 fmovd %f2, %f34
1580 1539 ldd [SRC + 0x28], %f10
1581 1540 fmovd %f4, %f36
1582 1541 ldd [SRC + 0x30], %f12
1583 1542 fmovd %f6, %f38
1584 1543 ldd [SRC + 0x38], %f14
1585 1544 fmovd %f8, %f40
1586 1545 ldd [SRC + VIS_BLOCKSIZE], %f0
1587 1546 fmovd %f10, %f42
1588 1547 sub CNT, VIS_BLOCKSIZE, CNT
1589 1548 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1590 1549 add DST, VIS_BLOCKSIZE, DST
1591 1550 prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1592 1551 add SRC, VIS_BLOCKSIZE, SRC
1593 1552 cmp CNT, VIS_BLOCKSIZE + 8
1594 1553 bgu,pt %ncc, 2b
1595 1554 prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1596 1555
1597 1556 ! trailing block
1598 1557 ldd [SRC + 0x08], %f2
1599 1558 fsrc1 %f12, %f44
1600 1559 ldd [SRC + 0x10], %f4
1601 1560 fsrc1 %f14, %f46
1602 1561 stda %f32, [DST]ASI_BLK_P
1603 1562 ldd [SRC + 0x18], %f6
1604 1563 fsrc1 %f0, %f32
1605 1564 ldd [SRC + 0x20], %f8
1606 1565 fsrc1 %f2, %f34
1607 1566 ldd [SRC + 0x28], %f10
1608 1567 fsrc1 %f4, %f36
1609 1568 ldd [SRC + 0x30], %f12
1610 1569 fsrc1 %f6, %f38
1611 1570 ldd [SRC + 0x38], %f14
1612 1571 fsrc1 %f8, %f40
1613 1572 sub CNT, VIS_BLOCKSIZE, CNT
1614 1573 add DST, VIS_BLOCKSIZE, DST
1615 1574 add SRC, VIS_BLOCKSIZE, SRC
1616 1575 fsrc1 %f10, %f42
1617 1576 fsrc1 %f12, %f44
1618 1577 fsrc1 %f14, %f46
1619 1578 stda %f32, [DST]ASI_BLK_P
1620 1579
1621 1580 membar #Sync
1622 1581
1623 1582 btst FPRS_FEF, %l0
1624 1583 bz,pt %icc, 2f
1625 1584 nop
1626 1585
1627 1586 BLD_FPQ1Q3_FROMSTACK(%l3)
1628 1587 ba 3f
1629 1588 nop
1630 1589
1631 1590 2: FZEROQ1Q3
1632 1591
1633 1592 3: wr %l0, 0, %fprs ! restore fprs
1634 1593 ret
1635 1594 restore %g0, 0, %o0
1636 1595
1637 1596 SET_SIZE(hwblkpagecopy)
1638 -#endif /* lint */
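
    Reviewer note: the comment above pins down hwblkpagecopy's caller
    contract. Here is a minimal sketch of a conforming caller, assuming the
    illumos kpreempt_disable()/kpreempt_enable() interfaces; the function
    name ppcopy_sketch and the software fallback are hypothetical stand-ins
    (the real ppcopy has more to it).

        extern void hwblkpagecopy(const void *, void *);
        extern void kpreempt_disable(void);
        extern void kpreempt_enable(void);
        extern int use_hw_bcopy;
        extern void software_pagecopy(const void *, void *);  /* hypothetical */

        static void
        ppcopy_sketch(const void *src_pp, void *dst_pp)
        {
                kpreempt_disable();     /* no preemption => no cpu migration */
                if (use_hw_bcopy)
                        hwblkpagecopy(src_pp, dst_pp);  /* exactly one page */
                else
                        software_pagecopy(src_pp, dst_pp);
                kpreempt_enable();
        }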
1639 1597
1640 1598
1641 1599 /*
1642 1600 * Transfer data to and from user space -
1643 1601 * Note that these routines can cause faults.
1644 1602 * It is assumed that the kernel has nothing at
1645 1603 * less than KERNELBASE in the virtual address space.
1646 1604 *
1647 1605 * Note that copyin(9F) and copyout(9F) are part of the
1648 1606 * DDI/DKI which specifies that they return '-1' on "errors."
1649 1607 *
1650 1608 * Sigh.
1651 1609 *
1652 1610 * So there are two extremely similar routines - xcopyin() and xcopyout()
1653 1611 * which return the errno that we've faithfully computed. This
1654 1612 * allows other callers (e.g. uiomove(9F)) to work correctly.
1655 1613 * Given that these are used pretty heavily, we expand the calling
1656 1614 * sequences inline for all flavours (rather than making wrappers).
1657 1615 *
1658 1616 * There are also stub routines for xcopyout_little and xcopyin_little,
1659 1617 * which currently are intended to handle requests of <= 16 bytes from
1660 1618 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
1661 1619 * is left as an exercise...
1662 1620 */
1663 1621
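    Reviewer note: as a concrete illustration of the two return conventions
    just described, a hedged C sketch follows. The wrapper names are
    hypothetical, and EFAULT's value is hard-coded only to keep the sketch
    self-contained.

        #include <stddef.h>

        #define EFAULT  14      /* assumed value, for self-containment */

        extern int copyout(const void *, void *, size_t);   /* DDI/DKI: 0 or -1 */
        extern int xcopyout(const void *, void *, size_t);  /* 0 or an errno */

        /* a copyout caller must collapse the -1 into a single errno itself */
        static int
        ddi_style_put(const void *kaddr, void *uaddr, size_t count)
        {
                return (copyout(kaddr, uaddr, count) == 0 ? 0 : EFAULT);
        }

        /* a uiomove-style caller keeps whatever errno xcopyout computed */
        static int
        errno_style_put(const void *kaddr, void *uaddr, size_t count)
        {
                return (xcopyout(kaddr, uaddr, count));
        }
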
1664 1622 /*
1665 1623 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
1666 1624 *
1667 1625 * General theory of operation:
1668 1626 *
1669 1627 * The only difference between copy{in,out} and
1670 1628 * xcopy{in,out} is in the error handling routine they invoke
1671 1629 * when a memory access error occurs. xcopyOP returns the errno
1672 1630 * while copyOP returns -1 (see above). copy{in,out}_noerr set
1673 1631 * a special flag (by oring the TRAMP_FLAG into the fault handler address)
1674 1632 * if they are called with a fault handler already in place. That flag
1675 1633 * causes the default handlers to trampoline to the previous handler
1676 1634 * upon an error.
1677 1635 *
1678 1636 * None of the copyops routines grab a window until it's decided that
1679 1637 * we need to do a HW block copy operation. This saves a window
1680 1638 * spill/fill when we're called during socket ops. The typical IO
1681 1639 * path won't cause spill/fill traps.
1682 1640 *
1683 1641 * This code uses a set of 4 limits for the maximum size that will
1684 1642 * be copied given a particular input/output address alignment.
1685 1643 * If the value for a particular limit is zero, the copy will be performed
1686 1644 * by the plain copy loops rather than FPBLK.
1687 1645 *
1688 1646 * See the description of bcopy above for more details of the
1689 1647 * data copying algorithm and the default limits.
1690 1648 *
1691 1649 */
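
    Reviewer note: the TRAMP_FLAG mechanism described above fits in one small
    sketch. The flag rides in a low-order bit of the handler address, which
    is always instruction-aligned, so the bit is otherwise unused. Everything
    below is a stand-in (the struct, the accessor, the flag value); it shows
    only the tagging scheme, not the real trap plumbing.

        #include <stdint.h>

        #define TRAMP_FLAG      0x01    /* assumed low-bit tag, per the text */

        struct cthread { uintptr_t t_lofault; };   /* stand-in thread state */

        /* copyOP_noerr-style setup: remember the old handler and, if one was
         * already installed, tag our default handler so the fault path
         * trampolines to the previous handler on error */
        static uintptr_t
        install_noerr_lofault(struct cthread *t, uintptr_t default_handler)
        {
                uintptr_t prev = t->t_lofault;

                t->t_lofault = (prev != 0) ?
                    (default_handler | TRAMP_FLAG) : default_handler;
                return (prev);  /* kept so the handler can restore/trampoline */
        }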
1692 1650
1693 1651 /*
1694 1652 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
1695 1653 */
1696 1654
1697 -#if defined(lint)
1698 -
1699 -
1700 -#else /* lint */
1701 1655 /*
1702 1656 * We save the arguments in the following registers in case of a fault:
1703 1657 * kaddr - %l1
1704 1658 * uaddr - %l2
1705 1659 * count - %l3
1706 1660 */
1707 1661 #define SAVE_SRC %l1
1708 1662 #define SAVE_DST %l2
1709 1663 #define SAVE_COUNT %l3
1710 1664
1711 1665 #define SM_SAVE_SRC %g4
1712 1666 #define SM_SAVE_DST %g5
1713 1667 #define SM_SAVE_COUNT %o5
1714 1668 #define ERRNO %l5
1715 1669
1716 1670
1717 1671 #define REAL_LOFAULT %l4
1718 1672 /*
1719 1673 * Generic copyio fault handler. This is the first line of defense when a
1720 1674 * fault occurs in (x)copyin/(x)copyout. In order for this to function
1721 1675 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
1722 1676 * This allows us to share common code for all the flavors of the copy
1723 1677 * operations, including the _noerr versions.
1724 1678 *
1725 1679 * Note that this function will restore the original input parameters before
1726 1680 * calling REAL_LOFAULT. So the real handler can vector to the appropriate
1727 1681 * member of the t_copyop structure, if needed.
1728 1682 */
1729 1683 ENTRY(copyio_fault)
1730 1684 membar #Sync
1731 1685 mov %g1,ERRNO ! save errno in ERRNO
1732 1686 btst FPUSED_FLAG, %l6
1733 1687 bz %ncc, 1f
1734 1688 nop
1735 1689
1736 1690 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
1737 1691 wr %o2, 0, %gsr ! restore gsr
1738 1692
1739 1693 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1740 1694 btst FPRS_FEF, %o3
1741 1695 bz,pt %icc, 4f
1742 1696 nop
1743 1697
1744 1698 BLD_FPQ2Q4_FROMSTACK(%o2)
1745 1699
1746 1700 ba,pt %ncc, 1f
1747 1701 wr %o3, 0, %fprs ! restore fprs
1748 1702
1749 1703 4:
1750 1704 FZEROQ2Q4
1751 1705 wr %o3, 0, %fprs ! restore fprs
1752 1706
1753 1707 1:
1754 1708 andn %l6, FPUSED_FLAG, %l6
1755 1709 membar #Sync
1756 1710 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1757 1711 FP_ALLOWMIGRATE(5, 6)
1758 1712
1759 1713 mov SAVE_SRC, %i0
1760 1714 mov SAVE_DST, %i1
1761 1715 jmp REAL_LOFAULT
1762 1716 mov SAVE_COUNT, %i2
1763 1717
1764 1718 SET_SIZE(copyio_fault)
1765 1719
1766 1720
1767 -#endif
1768 -
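    Reviewer note: restated as C-like pseudocode, copyio_fault does the
    following. All names are stand-ins for the registers called out in the
    comments (%l6, %l4, and the SAVE_* registers), and the FP restore detail
    is compressed into a single hypothetical call.

        #include <stdint.h>
        #include <stddef.h>

        #define FPUSED_FLAG     0x01            /* assumed tag bit, as above */

        struct copy_frame {                     /* stand-ins for saved regs */
                uintptr_t lofault_flags;        /* %l6: old t_lofault | flag */
                void (*real_lofault)(const void *, void *, size_t);  /* %l4 */
                const void *save_src;           /* SAVE_SRC   (%l1) */
                void *save_dst;                 /* SAVE_DST   (%l2) */
                size_t save_count;              /* SAVE_COUNT (%l3) */
        };

        extern void restore_fp_state(void);     /* %gsr, %fprs, FP quadrants */
        extern void set_t_lofault(uintptr_t);

        static void
        copyio_fault_sketch(struct copy_frame *cf)
        {
                if (cf->lofault_flags & FPUSED_FLAG)
                        restore_fp_state();
                set_t_lofault(cf->lofault_flags & ~(uintptr_t)FPUSED_FLAG);

                /* hand the ORIGINAL arguments back, so the real handler can
                 * vector to the matching t_copyop member */
                cf->real_lofault(cf->save_src, cf->save_dst, cf->save_count);
        }
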
1769 -#if defined(lint)
1770 -
1771 -/*ARGSUSED*/
1772 -int
1773 -copyout(const void *kaddr, void *uaddr, size_t count)
1774 -{ return (0); }
1775 -
1776 -#else /* lint */
1777 -
1778 1721 ENTRY(copyout)
1779 1722
1780 1723 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
1781 1724 bleu,pt %ncc, .copyout_small ! go to larger cases
1782 1725 xor %o0, %o1, %o3 ! are src, dst alignable?
1783 1726 btst 7, %o3 !
1784 1727 bz,pt %ncc, .copyout_8 ! check for longword alignment
1785 1728 nop
1786 1729 btst 1, %o3 !
1787 1730 bz,pt %ncc, .copyout_2 ! check for half-word
1788 1731 nop
1789 1732 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
1790 1733 ld [%o3 + %lo(hw_copy_limit_1)], %o3
1791 1734 tst %o3
1792 1735 bz,pn %icc, .copyout_small ! if zero, disable HW copy
1793 1736 cmp %o2, %o3 ! if length <= limit
1794 1737 bleu,pt %ncc, .copyout_small ! go to small copy
1795 1738 nop
1796 1739 ba,pt %ncc, .copyout_more ! otherwise go to large copy
1797 1740 nop
1798 1741 .copyout_2:
1799 1742 btst 3, %o3 !
1800 1743 bz,pt %ncc, .copyout_4 ! check for word alignment
1801 1744 nop
1802 1745 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
1803 1746 ld [%o3 + %lo(hw_copy_limit_2)], %o3
1804 1747 tst %o3
1805 1748 bz,pn %icc, .copyout_small ! if zero, disable HW copy
1806 1749 cmp %o2, %o3 ! if length <= limit
1807 1750 bleu,pt %ncc, .copyout_small ! go to small copy
1808 1751 nop
1809 1752 ba,pt %ncc, .copyout_more ! otherwise go to large copy
1810 1753 nop
1811 1754 .copyout_4:
1812 1755 ! already checked longword, must be word aligned
1813 1756 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
1814 1757 ld [%o3 + %lo(hw_copy_limit_4)], %o3
1815 1758 tst %o3
1816 1759 bz,pn %icc, .copyout_small ! if zero, disable HW copy
1817 1760 cmp %o2, %o3 ! if length <= limit
1818 1761 bleu,pt %ncc, .copyout_small ! go to small copy
1819 1762 nop
1820 1763 ba,pt %ncc, .copyout_more ! otherwise go to large copy
1821 1764 nop
1822 1765 .copyout_8:
1823 1766 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
1824 1767 ld [%o3 + %lo(hw_copy_limit_8)], %o3
1825 1768 tst %o3
1826 1769 bz,pn %icc, .copyout_small ! if zero, disable HW copy
1827 1770 cmp %o2, %o3 ! if length <= limit
1828 1771 bleu,pt %ncc, .copyout_small ! go to small copy
1829 1772 nop
1830 1773 ba,pt %ncc, .copyout_more ! otherwise go to large copy
1831 1774 nop
1832 1775
1833 1776 .align 16
1834 1777 nop ! instruction alignment
1835 1778 ! see discussion at start of file
1836 1779 .copyout_small:
1837 1780 sethi %hi(.sm_copyout_err), %o5 ! .sm_copyout_err is lofault
1838 1781 or %o5, %lo(.sm_copyout_err), %o5
1839 1782 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler
1840 1783 membar #Sync ! sync error barrier
1841 1784 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault
1842 1785 .sm_do_copyout:
1843 1786 mov %o0, SM_SAVE_SRC
1844 1787 mov %o1, SM_SAVE_DST
1845 1788 cmp %o2, SHORTCOPY ! check for really short case
1846 1789 bleu,pt %ncc, .co_sm_left !
1847 1790 mov %o2, SM_SAVE_COUNT
1848 1791 cmp %o2, CHKSIZE ! check for medium length cases
1849 1792 bgu,pn %ncc, .co_med !
1850 1793 or %o0, %o1, %o3 ! prepare alignment check
1851 1794 andcc %o3, 0x3, %g0 ! test for alignment
1852 1795 bz,pt %ncc, .co_sm_word ! branch to word aligned case
1853 1796 .co_sm_movebytes:
1854 1797 sub %o2, 3, %o2 ! adjust count to allow cc zero test
1855 1798 .co_sm_notalign4:
1856 1799 ldub [%o0], %o3 ! read byte
1857 1800 subcc %o2, 4, %o2 ! reduce count by 4
1858 1801 stba %o3, [%o1]ASI_USER ! write byte
1859 1802 inc %o1 ! advance DST by 1
1860 1803 ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes
1861 1804 add %o0, 4, %o0 ! advance SRC by 4
1862 1805 stba %o3, [%o1]ASI_USER
1863 1806 inc %o1 ! advance DST by 1
1864 1807 ldub [%o0 - 2], %o3
1865 1808 stba %o3, [%o1]ASI_USER
1866 1809 inc %o1 ! advance DST by 1
1867 1810 ldub [%o0 - 1], %o3
1868 1811 stba %o3, [%o1]ASI_USER
1869 1812 bgt,pt %ncc, .co_sm_notalign4 ! loop til 3 or fewer bytes remain
1870 1813 inc %o1 ! advance DST by 1
1871 1814 add %o2, 3, %o2 ! restore count
1872 1815 .co_sm_left:
1873 1816 tst %o2
1874 1817 bz,pt %ncc, .co_sm_exit ! check for zero length
1875 1818 nop
1876 1819 ldub [%o0], %o3 ! load one byte
1877 1820 deccc %o2 ! reduce count for cc test
1878 1821 bz,pt %ncc, .co_sm_exit
1879 1822 stba %o3,[%o1]ASI_USER ! store one byte
1880 1823 ldub [%o0 + 1], %o3 ! load second byte
1881 1824 deccc %o2
1882 1825 inc %o1
1883 1826 bz,pt %ncc, .co_sm_exit
1884 1827 stba %o3,[%o1]ASI_USER ! store second byte
1885 1828 ldub [%o0 + 2], %o3 ! load third byte
1886 1829 inc %o1
1887 1830 stba %o3,[%o1]ASI_USER ! store third byte
1888 1831 membar #Sync ! sync error barrier
1889 1832 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1890 1833 retl
1891 1834 mov %g0, %o0 ! return 0
1892 1835 .align 16
1893 1836 .co_sm_words:
1894 1837 lduw [%o0], %o3 ! read word
1895 1838 .co_sm_wordx:
1896 1839 subcc %o2, 8, %o2 ! update count
1897 1840 stwa %o3, [%o1]ASI_USER ! write word
1898 1841 add %o0, 8, %o0 ! update SRC
1899 1842 lduw [%o0 - 4], %o3 ! read word
1900 1843 add %o1, 4, %o1 ! update DST
1901 1844 stwa %o3, [%o1]ASI_USER ! write word
1902 1845 bgt,pt %ncc, .co_sm_words ! loop til done
1903 1846 add %o1, 4, %o1 ! update DST
1904 1847 addcc %o2, 7, %o2 ! restore count
1905 1848 bz,pt %ncc, .co_sm_exit
1906 1849 nop
1907 1850 deccc %o2
1908 1851 bz,pt %ncc, .co_sm_byte
1909 1852 .co_sm_half:
1910 1853 subcc %o2, 2, %o2 ! reduce count by 2
1911 1854 lduh [%o0], %o3 ! read half word
1912 1855 add %o0, 2, %o0 ! advance SRC by 2
1913 1856 stha %o3, [%o1]ASI_USER ! write half word
1914 1857 bgt,pt %ncc, .co_sm_half ! loop til done
1915 1858 add %o1, 2, %o1 ! advance DST by 2
1916 1859 addcc %o2, 1, %o2 ! restore count
1917 1860 bz,pt %ncc, .co_sm_exit
1918 1861 nop
1919 1862 .co_sm_byte:
1920 1863 ldub [%o0], %o3
1921 1864 stba %o3, [%o1]ASI_USER
1922 1865 membar #Sync ! sync error barrier
1923 1866 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1924 1867 retl
1925 1868 mov %g0, %o0 ! return 0
1926 1869 .align 16
1927 1870 .co_sm_word:
1928 1871 subcc %o2, 4, %o2 ! update count
1929 1872 bgt,pt %ncc, .co_sm_wordx
1930 1873 lduw [%o0], %o3 ! read word
1931 1874 addcc %o2, 3, %o2 ! restore count
1932 1875 bz,pt %ncc, .co_sm_exit
1933 1876 stwa %o3, [%o1]ASI_USER ! write word
1934 1877 deccc %o2 ! reduce count for cc test
1935 1878 ldub [%o0 + 4], %o3 ! load one byte
1936 1879 add %o1, 4, %o1
1937 1880 bz,pt %ncc, .co_sm_exit
1938 1881 stba %o3, [%o1]ASI_USER ! store one byte
1939 1882 ldub [%o0 + 5], %o3 ! load second byte
1940 1883 deccc %o2
1941 1884 inc %o1
1942 1885 bz,pt %ncc, .co_sm_exit
1943 1886 stba %o3, [%o1]ASI_USER ! store second byte
1944 1887 ldub [%o0 + 6], %o3 ! load third byte
1945 1888 inc %o1
1946 1889 stba %o3, [%o1]ASI_USER ! store third byte
1947 1890 .co_sm_exit:
1948 1891 membar #Sync ! sync error barrier
1949 1892 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1950 1893 retl
1951 1894 mov %g0, %o0 ! return 0
1952 1895
1953 1896 .align 16
1954 1897 .co_med:
1955 1898 xor %o0, %o1, %o3 ! setup alignment check
1956 1899 btst 1, %o3
1957 1900 bnz,pt %ncc, .co_sm_movebytes ! unaligned
1958 1901 nop
1959 1902 btst 3, %o3
1960 1903 bnz,pt %ncc, .co_med_half ! halfword aligned
1961 1904 nop
1962 1905 btst 7, %o3
1963 1906 bnz,pt %ncc, .co_med_word ! word aligned
1964 1907 nop
1965 1908 .co_med_long:
1966 1909 btst 3, %o0 ! check for
1967 1910 bz,pt %ncc, .co_med_long1 ! word alignment
1968 1911 nop
1969 1912 .co_med_long0:
1970 1913 ldub [%o0], %o3 ! load one byte
1971 1914 inc %o0
1972 1915 stba %o3,[%o1]ASI_USER ! store byte
1973 1916 inc %o1
1974 1917 btst 3, %o0
1975 1918 bnz,pt %ncc, .co_med_long0
1976 1919 dec %o2
1977 1920 .co_med_long1: ! word aligned
1978 1921 btst 7, %o0 ! check for long word
1979 1922 bz,pt %ncc, .co_med_long2
1980 1923 nop
1981 1924 lduw [%o0], %o3 ! load word
1982 1925 add %o0, 4, %o0 ! advance SRC by 4
1983 1926 stwa %o3, [%o1]ASI_USER ! store word
1984 1927 add %o1, 4, %o1 ! advance DST by 4
1985 1928 sub %o2, 4, %o2 ! reduce count by 4
1986 1929 !
1987 1930 ! Now long word aligned and have at least 32 bytes to move
1988 1931 !
1989 1932 .co_med_long2:
1990 1933 sub %o2, 31, %o2 ! adjust count to allow cc zero test
1991 1934 sub %o1, 8, %o1 ! adjust pointer to allow store in
1992 1935 ! branch delay slot instead of add
1993 1936 .co_med_lmove:
1994 1937 add %o1, 8, %o1 ! advance DST by 8
1995 1938 ldx [%o0], %o3 ! read long word
1996 1939 subcc %o2, 32, %o2 ! reduce count by 32
1997 1940 stxa %o3, [%o1]ASI_USER ! write long word
1998 1941 add %o1, 8, %o1 ! advance DST by 8
1999 1942 ldx [%o0 + 8], %o3 ! repeat for a total of 4 long words
2000 1943 add %o0, 32, %o0 ! advance SRC by 32
2001 1944 stxa %o3, [%o1]ASI_USER
2002 1945 ldx [%o0 - 16], %o3
2003 1946 add %o1, 8, %o1 ! advance DST by 8
2004 1947 stxa %o3, [%o1]ASI_USER
2005 1948 ldx [%o0 - 8], %o3
2006 1949 add %o1, 8, %o1 ! advance DST by 8
2007 1950 bgt,pt %ncc, .co_med_lmove ! loop til 31 or fewer bytes left
2008 1951 stxa %o3, [%o1]ASI_USER
2009 1952 add %o1, 8, %o1 ! advance DST by 8
2010 1953 addcc %o2, 24, %o2 ! restore count to long word offset
2011 1954 ble,pt %ncc, .co_med_lextra ! check for more long words to move
2012 1955 nop
2013 1956 .co_med_lword:
2014 1957 ldx [%o0], %o3 ! read long word
2015 1958 subcc %o2, 8, %o2 ! reduce count by 8
2016 1959 stxa %o3, [%o1]ASI_USER ! write long word
2017 1960 add %o0, 8, %o0 ! advance SRC by 8
2018 1961 bgt,pt %ncc, .co_med_lword ! loop til 7 or fewer bytes left
2019 1962 add %o1, 8, %o1 ! advance DST by 8
2020 1963 .co_med_lextra:
2021 1964 addcc %o2, 7, %o2 ! restore rest of count
2022 1965 bz,pt %ncc, .co_sm_exit ! if zero, then done
2023 1966 deccc %o2
2024 1967 bz,pt %ncc, .co_sm_byte
2025 1968 nop
2026 1969 ba,pt %ncc, .co_sm_half
2027 1970 nop
2028 1971
2029 1972 .align 16
2030 1973 nop ! instruction alignment
2031 1974 ! see discussion at start of file
2032 1975 .co_med_word:
2033 1976 btst 3, %o0 ! check for
2034 1977 bz,pt %ncc, .co_med_word1 ! word alignment
2035 1978 nop
2036 1979 .co_med_word0:
2037 1980 ldub [%o0], %o3 ! load one byte
2038 1981 inc %o0
2039 1982 stba %o3,[%o1]ASI_USER ! store byte
2040 1983 inc %o1
2041 1984 btst 3, %o0
2042 1985 bnz,pt %ncc, .co_med_word0
2043 1986 dec %o2
2044 1987 !
2045 1988 ! Now word aligned and have at least 36 bytes to move
2046 1989 !
2047 1990 .co_med_word1:
2048 1991 sub %o2, 15, %o2 ! adjust count to allow cc zero test
2049 1992 .co_med_wmove:
2050 1993 lduw [%o0], %o3 ! read word
2051 1994 subcc %o2, 16, %o2 ! reduce count by 16
2052 1995 stwa %o3, [%o1]ASI_USER ! write word
2053 1996 add %o1, 4, %o1 ! advance DST by 4
2054 1997 lduw [%o0 + 4], %o3 ! repeat for a total of 4 words
2055 1998 add %o0, 16, %o0 ! advance SRC by 16
2056 1999 stwa %o3, [%o1]ASI_USER
2057 2000 add %o1, 4, %o1 ! advance DST by 4
2058 2001 lduw [%o0 - 8], %o3
2059 2002 stwa %o3, [%o1]ASI_USER
2060 2003 add %o1, 4, %o1 ! advance DST by 4
2061 2004 lduw [%o0 - 4], %o3
2062 2005 stwa %o3, [%o1]ASI_USER
2063 2006 bgt,pt %ncc, .co_med_wmove ! loop til 15 or fewer bytes left
2064 2007 add %o1, 4, %o1 ! advance DST by 4
2065 2008 addcc %o2, 12, %o2 ! restore count to word offset
2066 2009 ble,pt %ncc, .co_med_wextra ! check for more words to move
2067 2010 nop
2068 2011 .co_med_word2:
2069 2012 lduw [%o0], %o3 ! read word
2070 2013 subcc %o2, 4, %o2 ! reduce count by 4
2071 2014 stwa %o3, [%o1]ASI_USER ! write word
2072 2015 add %o0, 4, %o0 ! advance SRC by 4
2073 2016 bgt,pt %ncc, .co_med_word2 ! loop til 3 or fewer bytes left
2074 2017 add %o1, 4, %o1 ! advance DST by 4
2075 2018 .co_med_wextra:
2076 2019 addcc %o2, 3, %o2 ! restore rest of count
2077 2020 bz,pt %ncc, .co_sm_exit ! if zero, then done
2078 2021 deccc %o2
2079 2022 bz,pt %ncc, .co_sm_byte
2080 2023 nop
2081 2024 ba,pt %ncc, .co_sm_half
2082 2025 nop
2083 2026
2084 2027 .align 16
2085 2028 nop ! instruction alignment
2086 2029 nop ! see discussion at start of file
2087 2030 nop
2088 2031 .co_med_half:
2089 2032 btst 1, %o0 ! check for
2090 2033 bz,pt %ncc, .co_med_half1 ! half word alignment
2091 2034 nop
2092 2035 ldub [%o0], %o3 ! load one byte
2093 2036 inc %o0
2094 2037 stba %o3,[%o1]ASI_USER ! store byte
2095 2038 inc %o1
2096 2039 dec %o2
2097 2040 !
2098 2041 ! Now half word aligned and have at least 38 bytes to move
2099 2042 !
2100 2043 .co_med_half1:
2101 2044 sub %o2, 7, %o2 ! adjust count to allow cc zero test
2102 2045 .co_med_hmove:
2103 2046 lduh [%o0], %o3 ! read half word
2104 2047 subcc %o2, 8, %o2 ! reduce count by 8
2105 2048 stha %o3, [%o1]ASI_USER ! write half word
2106 2049 add %o1, 2, %o1 ! advance DST by 2
2107 2050 lduh [%o0 + 2], %o3 ! repeat for a total of 4 halfwords
2108 2051 add %o0, 8, %o0 ! advance SRC by 8
2109 2052 stha %o3, [%o1]ASI_USER
2110 2053 add %o1, 2, %o1 ! advance DST by 2
2111 2054 lduh [%o0 - 4], %o3
2112 2055 stha %o3, [%o1]ASI_USER
2113 2056 add %o1, 2, %o1 ! advance DST by 2
2114 2057 lduh [%o0 - 2], %o3
2115 2058 stha %o3, [%o1]ASI_USER
2116 2059 bgt,pt %ncc, .co_med_hmove ! loop til 7 or fewer bytes left
2117 2060 add %o1, 2, %o1 ! advance DST by 2
2118 2061 addcc %o2, 7, %o2 ! restore count
2119 2062 bz,pt %ncc, .co_sm_exit
2120 2063 deccc %o2
2121 2064 bz,pt %ncc, .co_sm_byte
2122 2065 nop
2123 2066 ba,pt %ncc, .co_sm_half
2124 2067 nop
2125 2068
2126 2069 /*
2127 2070 * We got here because of a fault during short copyout.
2128 2071 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
2129 2072 */
2130 2073 .sm_copyout_err:
2131 2074 membar #Sync
2132 2075 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2133 2076 mov SM_SAVE_SRC, %o0
2134 2077 mov SM_SAVE_DST, %o1
2135 2078 mov SM_SAVE_COUNT, %o2
2136 2079 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler
2137 2080 tst %o3
2138 2081 bz,pt %ncc, 3f ! if not, return error
2139 2082 nop
2140 2083 ldn [%o3 + CP_COPYOUT], %o5 ! if handler, invoke it with
2141 2084 jmp %o5 ! original arguments
2142 2085 nop
2143 2086 3:
2144 2087 retl
2145 2088 or %g0, -1, %o0 ! return error value
2146 2089
2147 2090 SET_SIZE(copyout)
2148 2091
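    Reviewer note: the .sm_copyout_err path above reduces to a small
    dispatch, sketched here with stand-in types for the thread and copyops
    structures; only cp_copyout/t_copyops correspond to the real offsets
    (CP_COPYOUT, T_COPYOPS) used by the assembly.

        #include <stddef.h>

        struct copyops { int (*cp_copyout)(const void *, void *, size_t); };
        struct cthread { struct copyops *t_copyops; };

        /* sketch: t_lofault has already been restored by this point */
        static int
        sm_copyout_err_sketch(struct cthread *t,
            const void *kaddr, void *uaddr, size_t count)
        {
                if (t->t_copyops != NULL)       /* installed copyop handler? */
                        return (t->t_copyops->cp_copyout(kaddr, uaddr, count));
                return (-1);                    /* the DDI/DKI error value */
        }
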
2149 2092 /*
2150 2093 * The _more entry points are not intended to be used directly by
2151 2094 * any caller from outside this file. They are provided to allow
2152 2095 * profiling and DTrace of the portions of the copy code that use
2153 2096 * the floating point registers.
2154 2097 * This entry is particularly important as DTrace (at least as of
2155 2098 * 4/2004) does not support leaf functions.
2156 2099 */
2157 2100
2158 2101 ENTRY(copyout_more)
2159 2102 .copyout_more:
2160 2103 prefetch [%o0], #n_reads
2161 2104 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2162 2105 set .copyout_err, REAL_LOFAULT
2163 2106
2164 2107 /*
2165 2108 * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
2166 2109 */
2167 2110 .do_copyout:
2168 2111 set copyio_fault, %l7 ! copyio_fault is lofault val
2169 2112
2170 2113 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler
2171 2114 membar #Sync ! sync error barrier
2172 2115 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
2173 2116
2174 2117 mov %i0, SAVE_SRC
2175 2118 mov %i1, SAVE_DST
2176 2119 mov %i2, SAVE_COUNT
2177 2120
2178 2121 FP_NOMIGRATE(6, 7)
2179 2122
2180 2123 rd %fprs, %o2 ! check for unused fp
2181 2124 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
2182 2125 btst FPRS_FEF, %o2
2183 2126 bz,a,pt %icc, .do_blockcopyout
2184 2127 wr %g0, FPRS_FEF, %fprs
2185 2128
2186 2129 BST_FPQ2Q4_TOSTACK(%o2)
2187 2130
2188 2131 .do_blockcopyout:
2189 2132 rd %gsr, %o2
2190 2133 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr
2191 2134 or %l6, FPUSED_FLAG, %l6
2192 2135
2193 2136 andcc DST, VIS_BLOCKSIZE - 1, TMP
2194 2137 mov ASI_USER, %asi
2195 2138 bz,pt %ncc, 2f
2196 2139 neg TMP
2197 2140 add TMP, VIS_BLOCKSIZE, TMP
2198 2141
2199 2142 ! TMP = bytes required to align DST on FP_BLOCK boundary
2200 2143 ! Using SRC as a tmp here
2201 2144 cmp TMP, 3
2202 2145 bleu,pt %ncc, 1f
2203 2146 sub CNT,TMP,CNT ! adjust main count
2204 2147 sub TMP, 3, TMP ! adjust for end of loop test
2205 2148 .co_blkalign:
2206 2149 ldub [REALSRC], SRC ! move 4 bytes per loop iteration
2207 2150 stba SRC, [DST]%asi
2208 2151 subcc TMP, 4, TMP
2209 2152 ldub [REALSRC + 1], SRC
2210 2153 add REALSRC, 4, REALSRC
2211 2154 stba SRC, [DST + 1]%asi
2212 2155 ldub [REALSRC - 2], SRC
2213 2156 add DST, 4, DST
2214 2157 stba SRC, [DST - 2]%asi
2215 2158 ldub [REALSRC - 1], SRC
2216 2159 bgu,pt %ncc, .co_blkalign
2217 2160 stba SRC, [DST - 1]%asi
2218 2161
2219 2162 addcc TMP, 3, TMP ! restore count adjustment
2220 2163 bz,pt %ncc, 2f ! no bytes left?
2221 2164 nop
2222 2165 1: ldub [REALSRC], SRC
2223 2166 inc REALSRC
2224 2167 inc DST
2225 2168 deccc TMP
2226 2169 bgu %ncc, 1b
2227 2170 stba SRC, [DST - 1]%asi
2228 2171
2229 2172 2:
2230 2173 membar #StoreLoad
2231 2174 andn REALSRC, 0x7, SRC
2232 2175
2233 2176 ! SRC - 8-byte aligned
2234 2177 ! DST - 64-byte aligned
2235 2178 ldd [SRC], %f16
2236 2179 prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
2237 2180 alignaddr REALSRC, %g0, %g0
2238 2181 ldd [SRC + 0x08], %f18
2239 2182 prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
2240 2183 faligndata %f16, %f18, %f48
2241 2184 ldd [SRC + 0x10], %f20
2242 2185 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
2243 2186 faligndata %f18, %f20, %f50
2244 2187 ldd [SRC + 0x18], %f22
2245 2188 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
2246 2189 faligndata %f20, %f22, %f52
2247 2190 ldd [SRC + 0x20], %f24
2248 2191 prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
2249 2192 faligndata %f22, %f24, %f54
2250 2193 ldd [SRC + 0x28], %f26
2251 2194 prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
2252 2195 faligndata %f24, %f26, %f56
2253 2196 ldd [SRC + 0x30], %f28
2254 2197 prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
2255 2198 faligndata %f26, %f28, %f58
2256 2199 ldd [SRC + 0x38], %f30
2257 2200 ldd [SRC + VIS_BLOCKSIZE], %f16
2258 2201 sub CNT, VIS_BLOCKSIZE, CNT
2259 2202 add SRC, VIS_BLOCKSIZE, SRC
2260 2203 prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
2261 2204 add REALSRC, VIS_BLOCKSIZE, REALSRC
2262 2205 ba,pt %ncc, 1f
2263 2206 prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
2264 2207 .align 32
2265 2208 1:
2266 2209 ldd [SRC + 0x08], %f18
2267 2210 faligndata %f28, %f30, %f60
2268 2211 ldd [SRC + 0x10], %f20
2269 2212 faligndata %f30, %f16, %f62
2270 2213 stda %f48, [DST]ASI_BLK_AIUS
2271 2214 ldd [SRC + 0x18], %f22
2272 2215 faligndata %f16, %f18, %f48
2273 2216 ldd [SRC + 0x20], %f24
2274 2217 faligndata %f18, %f20, %f50
2275 2218 ldd [SRC + 0x28], %f26
2276 2219 faligndata %f20, %f22, %f52
2277 2220 ldd [SRC + 0x30], %f28
2278 2221 faligndata %f22, %f24, %f54
2279 2222 sub CNT, VIS_BLOCKSIZE, CNT
2280 2223 ldd [SRC + 0x38], %f30
2281 2224 faligndata %f24, %f26, %f56
2282 2225 add DST, VIS_BLOCKSIZE, DST
2283 2226 ldd [SRC + VIS_BLOCKSIZE], %f16
2284 2227 faligndata %f26, %f28, %f58
2285 2228 add REALSRC, VIS_BLOCKSIZE, REALSRC
2286 2229 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
2287 2230 add SRC, VIS_BLOCKSIZE, SRC
2288 2231 prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
2289 2232 cmp CNT, VIS_BLOCKSIZE + 8
2290 2233 bgu,pt %ncc, 1b
2291 2234 prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
2292 2235
2293 2236 ! only if REALSRC & 0x7 is 0
2294 2237 cmp CNT, VIS_BLOCKSIZE
2295 2238 bne %ncc, 3f
2296 2239 andcc REALSRC, 0x7, %g0
2297 2240 bz,pt %ncc, 2f
2298 2241 nop
2299 2242 3:
2300 2243 faligndata %f28, %f30, %f60
2301 2244 faligndata %f30, %f16, %f62
2302 2245 stda %f48, [DST]ASI_BLK_AIUS
2303 2246 add DST, VIS_BLOCKSIZE, DST
2304 2247 ba,pt %ncc, 3f
2305 2248 nop
2306 2249 2:
2307 2250 ldd [SRC + 0x08], %f18
2308 2251 fsrc1 %f28, %f60
2309 2252 ldd [SRC + 0x10], %f20
2310 2253 fsrc1 %f30, %f62
2311 2254 stda %f48, [DST]ASI_BLK_AIUS
2312 2255 ldd [SRC + 0x18], %f22
2313 2256 fsrc1 %f16, %f48
2314 2257 ldd [SRC + 0x20], %f24
2315 2258 fsrc1 %f18, %f50
2316 2259 ldd [SRC + 0x28], %f26
2317 2260 fsrc1 %f20, %f52
2318 2261 ldd [SRC + 0x30], %f28
2319 2262 fsrc1 %f22, %f54
2320 2263 ldd [SRC + 0x38], %f30
2321 2264 fsrc1 %f24, %f56
2322 2265 sub CNT, VIS_BLOCKSIZE, CNT
2323 2266 add DST, VIS_BLOCKSIZE, DST
2324 2267 add SRC, VIS_BLOCKSIZE, SRC
2325 2268 add REALSRC, VIS_BLOCKSIZE, REALSRC
2326 2269 fsrc1 %f26, %f58
2327 2270 fsrc1 %f28, %f60
2328 2271 fsrc1 %f30, %f62
2329 2272 stda %f48, [DST]ASI_BLK_AIUS
2330 2273 add DST, VIS_BLOCKSIZE, DST
2331 2274 ba,a,pt %ncc, 4f
2332 2275 nop
2333 2276
2334 2277 3: tst CNT
2335 2278 bz,a %ncc, 4f
2336 2279 nop
2337 2280
2338 2281 5: ldub [REALSRC], TMP
2339 2282 inc REALSRC
2340 2283 inc DST
2341 2284 deccc CNT
2342 2285 bgu %ncc, 5b
2343 2286 stba TMP, [DST - 1]%asi
2344 2287 4:
2345 2288
2346 2289 .copyout_exit:
2347 2290 membar #Sync
2348 2291
2349 2292 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
2350 2293 wr %o2, 0, %gsr ! restore gsr
2351 2294
2352 2295 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
2353 2296 btst FPRS_FEF, %o3
2354 2297 bz,pt %icc, 4f
2355 2298 nop
2356 2299
2357 2300 BLD_FPQ2Q4_FROMSTACK(%o2)
2358 2301
2359 2302 ba,pt %ncc, 1f
2360 2303 wr %o3, 0, %fprs ! restore fprs
2361 2304
2362 2305 4:
2363 2306 FZEROQ2Q4
2364 2307 wr %o3, 0, %fprs ! restore fprs
2365 2308
2366 2309 1:
2367 2310 membar #Sync
2368 2311 andn %l6, FPUSED_FLAG, %l6
2369 2312 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2370 2313 FP_ALLOWMIGRATE(5, 6)
2371 2314 ret
2372 2315 restore %g0, 0, %o0
2373 2316
2374 2317 /*
2375 2318 * We got here because of a fault during copyout.
2376 2319 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
2377 2320 */
2378 2321 .copyout_err:
2379 2322 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
2380 2323 tst %o4
2381 2324 bz,pt %ncc, 2f ! if not, return error
2382 2325 nop
2383 2326 ldn [%o4 + CP_COPYOUT], %g2 ! if handler, invoke it with
2384 2327 jmp %g2 ! original arguments
2385 2328 restore %g0, 0, %g0 ! dispose of copy window
2386 2329 2:
2387 2330 ret
2388 2331 restore %g0, -1, %o0 ! return error value
2389 2332
2390 2333
2391 2334 SET_SIZE(copyout_more)
2392 2335
2393 -#endif /* lint */
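
    Reviewer note: copyout_more (like bcopy_more and copyin_more) brackets
    the block copy with the same %fprs discipline: save the caller's FP
    quadrants if the FPU was live, otherwise zero them on exit, presumably
    so copy data does not linger in user-visible registers. A hedged sketch,
    with every accessor a hypothetical stand-in for the macros named in the
    comments:

        #include <stdint.h>

        #define FPRS_FEF        0x4     /* assumed bit value for the sketch */

        extern uint32_t rd_fprs(void);          /* hypothetical accessors */
        extern void wr_fprs(uint32_t);
        extern void save_fp_quadrants(void *);      /* BST_FPQ2Q4_TOSTACK */
        extern void restore_fp_quadrants(void *);   /* BLD_FPQ2Q4_FROMSTACK */
        extern void zero_fp_quadrants(void);        /* FZEROQ2Q4 */
        extern void fp_block_copy_body(void);

        static void
        copy_more_fp_discipline(void *savearea)
        {
                uint32_t fprs = rd_fprs();  /* remember caller's FP state */

                if (fprs & FPRS_FEF)
                        save_fp_quadrants(savearea);  /* caller had live FP */
                else
                        wr_fprs(FPRS_FEF);  /* enable the FPU for our use */

                fp_block_copy_body();       /* the %f16-%f62 copy loops */

                if (fprs & FPRS_FEF)
                        restore_fp_quadrants(savearea);
                else
                        zero_fp_quadrants();    /* leave no copy data behind */
                wr_fprs(fprs);              /* put %fprs back as found */
        }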
2394 2336
2395 -
2396 -#ifdef lint
2397 -
2398 -/*ARGSUSED*/
2399 -int
2400 -xcopyout(const void *kaddr, void *uaddr, size_t count)
2401 -{ return (0); }
2402 -
2403 -#else /* lint */
2404 -
2405 2337 ENTRY(xcopyout)
2406 2338 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
2407 2339 bleu,pt %ncc, .xcopyout_small ! go to larger cases
2408 2340 xor %o0, %o1, %o3 ! are src, dst alignable?
2409 2341 btst 7, %o3 !
2410 2342 bz,pt %ncc, .xcopyout_8 !
2411 2343 nop
2412 2344 btst 1, %o3 !
2413 2345 bz,pt %ncc, .xcopyout_2 ! check for half-word
2414 2346 nop
2415 2347 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
2416 2348 ld [%o3 + %lo(hw_copy_limit_1)], %o3
2417 2349 tst %o3
2418 2350 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy
2419 2351 cmp %o2, %o3 ! if length <= limit
2420 2352 bleu,pt %ncc, .xcopyout_small ! go to small copy
2421 2353 nop
2422 2354 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy
2423 2355 nop
2424 2356 .xcopyout_2:
2425 2357 btst 3, %o3 !
2426 2358 bz,pt %ncc, .xcopyout_4 ! check for word alignment
2427 2359 nop
2428 2360 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
2429 2361 ld [%o3 + %lo(hw_copy_limit_2)], %o3
2430 2362 tst %o3
2431 2363 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy
2432 2364 cmp %o2, %o3 ! if length <= limit
2433 2365 bleu,pt %ncc, .xcopyout_small ! go to small copy
2434 2366 nop
2435 2367 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy
2436 2368 nop
2437 2369 .xcopyout_4:
2438 2370 ! already checked longword, must be word aligned
2439 2371 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
2440 2372 ld [%o3 + %lo(hw_copy_limit_4)], %o3
2441 2373 tst %o3
2442 2374 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy
2443 2375 cmp %o2, %o3 ! if length <= limit
2444 2376 bleu,pt %ncc, .xcopyout_small ! go to small copy
2445 2377 nop
2446 2378 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy
2447 2379 nop
2448 2380 .xcopyout_8:
2449 2381 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
2450 2382 ld [%o3 + %lo(hw_copy_limit_8)], %o3
2451 2383 tst %o3
2452 2384 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy
2453 2385 cmp %o2, %o3 ! if length <= limit
2454 2386 bleu,pt %ncc, .xcopyout_small ! go to small copy
2455 2387 nop
2456 2388 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy
2457 2389 nop
2458 2390
2459 2391 .xcopyout_small:
2460 2392 sethi %hi(.sm_xcopyout_err), %o5 ! .sm_xcopyout_err is lofault
2461 2393 or %o5, %lo(.sm_xcopyout_err), %o5
2462 2394 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler
2463 2395 membar #Sync ! sync error barrier
2464 2396 ba,pt %ncc, .sm_do_copyout ! common code
2465 2397 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault
2466 2398
2467 2399 .xcopyout_more:
2468 2400 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2469 2401 sethi %hi(.xcopyout_err), REAL_LOFAULT
2470 2402 ba,pt %ncc, .do_copyout ! common code
2471 2403 or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
2472 2404
2473 2405 /*
2474 2406 * We got here because of a fault during xcopyout.
2475 2407 * Errno value is in ERRNO.
2476 2408 */
2477 2409 .xcopyout_err:
2478 2410 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
2479 2411 tst %o4
2480 2412 bz,pt %ncc, 2f ! if not, return error
2481 2413 nop
2482 2414 ldn [%o4 + CP_XCOPYOUT], %g2 ! if handler, invoke it with
2483 2415 jmp %g2 ! original arguments
2484 2416 restore %g0, 0, %g0 ! dispose of copy window
2485 2417 2:
2486 2418 ret
2487 2419 restore ERRNO, 0, %o0 ! return errno value
2488 2420
2489 2421 .sm_xcopyout_err:
2490 2422
2491 2423 membar #Sync
2492 2424 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2493 2425 mov SM_SAVE_SRC, %o0
2494 2426 mov SM_SAVE_DST, %o1
2495 2427 mov SM_SAVE_COUNT, %o2
2496 2428 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler
2497 2429 tst %o3
2498 2430 bz,pt %ncc, 3f ! if not, return error
2499 2431 nop
2500 2432 ldn [%o3 + CP_XCOPYOUT], %o5 ! if handler, invoke it with
2501 2433 jmp %o5 ! original arguments
2502 2434 nop
2503 2435 3:
2504 2436 retl
2505 2437 or %g1, 0, %o0 ! return errno value
2506 2438
2507 2439 SET_SIZE(xcopyout)
2508 2440
2509 -#endif /* lint */
2510 -
2511 -#ifdef lint
2512 -
2513 -/*ARGSUSED*/
2514 -int
2515 -xcopyout_little(const void *kaddr, void *uaddr, size_t count)
2516 -{ return (0); }
2517 -
2518 -#else /* lint */
2519 -
2520 2441 ENTRY(xcopyout_little)
2521 2442 sethi %hi(.xcopyio_err), %o5
2522 2443 or %o5, %lo(.xcopyio_err), %o5
2523 2444 ldn [THREAD_REG + T_LOFAULT], %o4
2524 2445 membar #Sync ! sync error barrier
2525 2446 stn %o5, [THREAD_REG + T_LOFAULT]
2526 2447 mov %o4, %o5
2527 2448
2528 2449 subcc %g0, %o2, %o3
2529 2450 add %o0, %o2, %o0
2530 2451 bz,pn %ncc, 2f ! check for zero bytes
2531 2452 sub %o2, 1, %o4
2532 2453 add %o0, %o4, %o0 ! start w/last byte
2533 2454 add %o1, %o2, %o1
2534 2455 ldub [%o0 + %o3], %o4
2535 2456
2536 2457 1: stba %o4, [%o1 + %o3]ASI_AIUSL
2537 2458 inccc %o3
2538 2459 sub %o0, 2, %o0 ! get next byte
2539 2460 bcc,a,pt %ncc, 1b
2540 2461 ldub [%o0 + %o3], %o4
2541 2462
2542 2463 2:
2543 2464 membar #Sync ! sync error barrier
2544 2465 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2545 2466 retl
2546 2467 mov %g0, %o0 ! return (0)
2547 2468
2548 2469 SET_SIZE(xcopyout_little)
2549 2470
2550 -#endif /* lint */
2551 -
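
    Reviewer note on the dataflow through xcopyout_little: the source pointer
    walks backwards while the destination walks forwards through the
    little-endian user ASI (single-byte accesses themselves are order
    neutral), so for the <= 16 byte requests do_unaligned sends here the net
    effect is a byte-order reversal of one datum. A sketch of the
    computation, under that reading; the name is hypothetical and the ASI
    and fault handling are elided.

        #include <stdint.h>
        #include <stddef.h>

        /* models the loop's addressing only, not the ASI or fault checks */
        static void
        xcopy_little_sketch(const uint8_t *kaddr, uint8_t *uaddr, size_t count)
        {
                size_t i;

                /* source walks backwards, destination forwards: a
                 * byte-order reversal of the small datum transferred */
                for (i = 0; i < count; i++)
                        uaddr[i] = kaddr[count - 1 - i];
        }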
2552 2471 /*
2553 2472 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
2554 2473 */
2555 2474
2556 -#if defined(lint)
2557 -
2558 -/*ARGSUSED*/
2559 -int
2560 -copyin(const void *uaddr, void *kaddr, size_t count)
2561 -{ return (0); }
2562 -
2563 -#else /* lint */
2564 -
2565 2475 ENTRY(copyin)
2566 2476 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
2567 2477 bleu,pt %ncc, .copyin_small ! go to larger cases
2568 2478 xor %o0, %o1, %o3 ! are src, dst alignable?
2569 2479 btst 7, %o3 !
2570 2480 bz,pt %ncc, .copyin_8 ! check for longword alignment
2571 2481 nop
2572 2482 btst 1, %o3 !
2573 2483 bz,pt %ncc, .copyin_2 ! check for half-word
2574 2484 nop
2575 2485 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
2576 2486 ld [%o3 + %lo(hw_copy_limit_1)], %o3
2577 2487 tst %o3
2578 2488 bz,pn %icc, .copyin_small ! if zero, disable HW copy
2579 2489 cmp %o2, %o3 ! if length <= limit
2580 2490 bleu,pt %ncc, .copyin_small ! go to small copy
2581 2491 nop
2582 2492 ba,pt %ncc, .copyin_more ! otherwise go to large copy
2583 2493 nop
2584 2494 .copyin_2:
2585 2495 btst 3, %o3 !
2586 2496 bz,pt %ncc, .copyin_4 ! check for word alignment
2587 2497 nop
2588 2498 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
2589 2499 ld [%o3 + %lo(hw_copy_limit_2)], %o3
2590 2500 tst %o3
2591 2501 bz,pn %icc, .copyin_small ! if zero, disable HW copy
2592 2502 cmp %o2, %o3 ! if length <= limit
2593 2503 bleu,pt %ncc, .copyin_small ! go to small copy
2594 2504 nop
2595 2505 ba,pt %ncc, .copyin_more ! otherwise go to large copy
2596 2506 nop
2597 2507 .copyin_4:
2598 2508 ! already checked longword, must be word aligned
2599 2509 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
2600 2510 ld [%o3 + %lo(hw_copy_limit_4)], %o3
2601 2511 tst %o3
2602 2512 bz,pn %icc, .copyin_small ! if zero, disable HW copy
2603 2513 cmp %o2, %o3 ! if length <= limit
2604 2514 bleu,pt %ncc, .copyin_small ! go to small copy
2605 2515 nop
2606 2516 ba,pt %ncc, .copyin_more ! otherwise go to large copy
2607 2517 nop
2608 2518 .copyin_8:
2609 2519 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
2610 2520 ld [%o3 + %lo(hw_copy_limit_8)], %o3
2611 2521 tst %o3
2612 2522 bz,pn %icc, .copyin_small ! if zero, disable HW copy
2613 2523 cmp %o2, %o3 ! if length <= limit
2614 2524 bleu,pt %ncc, .copyin_small ! go to small copy
2615 2525 nop
2616 2526 ba,pt %ncc, .copyin_more ! otherwise go to large copy
2617 2527 nop
2618 2528
2619 2529 .align 16
2620 2530 nop ! instruction alignment
2621 2531 ! see discussion at start of file
2622 2532 .copyin_small:
2623 2533 sethi %hi(.sm_copyin_err), %o5 ! .sm_copyin_err is lofault
2624 2534 or %o5, %lo(.sm_copyin_err), %o5
2625 2535 ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofault, no tramp
2626 2536 membar #Sync ! sync error barrier
2627 2537 stn %o5, [THREAD_REG + T_LOFAULT]
2628 2538 .sm_do_copyin:
2629 2539 mov %o0, SM_SAVE_SRC
2630 2540 mov %o1, SM_SAVE_DST
2631 2541 cmp %o2, SHORTCOPY ! check for really short case
2632 2542 bleu,pt %ncc, .ci_sm_left !
2633 2543 mov %o2, SM_SAVE_COUNT
2634 2544 cmp %o2, CHKSIZE ! check for medium length cases
2635 2545 bgu,pn %ncc, .ci_med !
2636 2546 or %o0, %o1, %o3 ! prepare alignment check
2637 2547 andcc %o3, 0x3, %g0 ! test for alignment
2638 2548 bz,pt %ncc, .ci_sm_word ! branch to word aligned case
2639 2549 .ci_sm_movebytes:
2640 2550 sub %o2, 3, %o2 ! adjust count to allow cc zero test
2641 2551 .ci_sm_notalign4:
2642 2552 lduba [%o0]ASI_USER, %o3 ! read byte
2643 2553 subcc %o2, 4, %o2 ! reduce count by 4
2644 2554 stb %o3, [%o1] ! write byte
2645 2555 add %o0, 1, %o0 ! advance SRC by 1
2646 2556 lduba [%o0]ASI_USER, %o3 ! repeat for a total of 4 bytes
2647 2557 add %o0, 1, %o0 ! advance SRC by 1
2648 2558 stb %o3, [%o1 + 1]
2649 2559 add %o1, 4, %o1 ! advance DST by 4
2650 2560 lduba [%o0]ASI_USER, %o3
2651 2561 add %o0, 1, %o0 ! advance SRC by 1
2652 2562 stb %o3, [%o1 - 2]
2653 2563 lduba [%o0]ASI_USER, %o3
2654 2564 add %o0, 1, %o0 ! advance SRC by 1
2655 2565 bgt,pt %ncc, .ci_sm_notalign4 ! loop til 3 or fewer bytes remain
2656 2566 stb %o3, [%o1 - 1]
2657 2567 add %o2, 3, %o2 ! restore count
2658 2568 .ci_sm_left:
2659 2569 tst %o2
2660 2570 bz,pt %ncc, .ci_sm_exit
2661 2571 nop
2662 2572 lduba [%o0]ASI_USER, %o3 ! load one byte
2663 2573 deccc %o2 ! reduce count for cc test
2664 2574 bz,pt %ncc, .ci_sm_exit
2665 2575 stb %o3,[%o1] ! store one byte
2666 2576 inc %o0
2667 2577 lduba [%o0]ASI_USER, %o3 ! load second byte
2668 2578 deccc %o2
2669 2579 bz,pt %ncc, .ci_sm_exit
2670 2580 stb %o3,[%o1 + 1] ! store second byte
2671 2581 inc %o0
2672 2582 lduba [%o0]ASI_USER, %o3 ! load third byte
2673 2583 stb %o3,[%o1 + 2] ! store third byte
2674 2584 membar #Sync ! sync error barrier
2675 2585 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2676 2586 retl
2677 2587 mov %g0, %o0 ! return 0
2678 2588 .align 16
2679 2589 .ci_sm_words:
2680 2590 lduwa [%o0]ASI_USER, %o3 ! read word
2681 2591 .ci_sm_wordx:
2682 2592 subcc %o2, 8, %o2 ! update count
2683 2593 stw %o3, [%o1] ! write word
2684 2594 add %o0, 4, %o0 ! update SRC
2685 2595 add %o1, 8, %o1 ! update DST
2686 2596 lduwa [%o0]ASI_USER, %o3 ! read word
2687 2597 add %o0, 4, %o0 ! update SRC
2688 2598 bgt,pt %ncc, .ci_sm_words ! loop til done
2689 2599 stw %o3, [%o1 - 4] ! write word
2690 2600 addcc %o2, 7, %o2 ! restore count
2691 2601 bz,pt %ncc, .ci_sm_exit
2692 2602 nop
2693 2603 deccc %o2
2694 2604 bz,pt %ncc, .ci_sm_byte
2695 2605 .ci_sm_half:
2696 2606 subcc %o2, 2, %o2 ! reduce count by 2
2697 2607 lduha [%o0]ASI_USER, %o3 ! read half word
2698 2608 add %o0, 2, %o0 ! advance SRC by 2
2699 2609 add %o1, 2, %o1 ! advance DST by 2
2700 2610 bgt,pt %ncc, .ci_sm_half ! loop til done
2701 2611 sth %o3, [%o1 - 2] ! write half word
2702 2612 addcc %o2, 1, %o2 ! restore count
2703 2613 bz,pt %ncc, .ci_sm_exit
2704 2614 nop
2705 2615 .ci_sm_byte:
2706 2616 lduba [%o0]ASI_USER, %o3
2707 2617 stb %o3, [%o1]
2708 2618 membar #Sync ! sync error barrier
2709 2619 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2710 2620 retl
2711 2621 mov %g0, %o0 ! return 0
2712 2622 .align 16
2713 2623 .ci_sm_word:
2714 2624 subcc %o2, 4, %o2 ! update count
2715 2625 bgt,pt %ncc, .ci_sm_wordx
2716 2626 lduwa [%o0]ASI_USER, %o3 ! read word
2717 2627 addcc %o2, 3, %o2 ! restore count
2718 2628 bz,pt %ncc, .ci_sm_exit
2719 2629 stw %o3, [%o1] ! write word
2720 2630 deccc %o2 ! reduce count for cc test
2721 2631 add %o0, 4, %o0
2722 2632 lduba [%o0]ASI_USER, %o3 ! load one byte
2723 2633 bz,pt %ncc, .ci_sm_exit
2724 2634 stb %o3, [%o1 + 4] ! store one byte
2725 2635 inc %o0
2726 2636 lduba [%o0]ASI_USER, %o3 ! load second byte
2727 2637 deccc %o2
2728 2638 bz,pt %ncc, .ci_sm_exit
2729 2639 stb %o3, [%o1 + 5] ! store second byte
2730 2640 inc %o0
2731 2641 lduba [%o0]ASI_USER, %o3 ! load third byte
2732 2642 stb %o3, [%o1 + 6] ! store third byte
2733 2643 .ci_sm_exit:
2734 2644 membar #Sync ! sync error barrier
2735 2645 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2736 2646 retl
2737 2647 mov %g0, %o0 ! return 0
2738 2648
2739 2649 .align 16
2740 2650 .ci_med:
2741 2651 xor %o0, %o1, %o3 ! setup alignment check
2742 2652 btst 1, %o3
2743 2653 bnz,pt %ncc, .ci_sm_movebytes ! unaligned
2744 2654 nop
2745 2655 btst 3, %o3
2746 2656 bnz,pt %ncc, .ci_med_half ! halfword aligned
2747 2657 nop
2748 2658 btst 7, %o3
2749 2659 bnz,pt %ncc, .ci_med_word ! word aligned
2750 2660 nop
2751 2661 .ci_med_long:
2752 2662 btst 3, %o0 ! check for
2753 2663 bz,pt %ncc, .ci_med_long1 ! word alignment
2754 2664 nop
2755 2665 .ci_med_long0:
2756 2666 lduba [%o0]ASI_USER, %o3 ! load one byte
2757 2667 inc %o0
2758 2668 stb %o3,[%o1] ! store byte
2759 2669 inc %o1
2760 2670 btst 3, %o0
2761 2671 bnz,pt %ncc, .ci_med_long0
2762 2672 dec %o2
2763 2673 .ci_med_long1: ! word aligned
2764 2674 btst 7, %o0 ! check for long word
2765 2675 bz,pt %ncc, .ci_med_long2
2766 2676 nop
2767 2677 lduwa [%o0]ASI_USER, %o3 ! load word
2768 2678 add %o0, 4, %o0 ! advance SRC by 4
2769 2679 stw %o3, [%o1] ! store word
2770 2680 add %o1, 4, %o1 ! advance DST by 4
2771 2681 sub %o2, 4, %o2 ! reduce count by 4
2772 2682 !
2773 2683 ! Now long word aligned and have at least 32 bytes to move
2774 2684 !
2775 2685 .ci_med_long2:
2776 2686 sub %o2, 31, %o2 ! adjust count to allow cc zero test
2777 2687 .ci_med_lmove:
2778 2688 ldxa [%o0]ASI_USER, %o3 ! read long word
2779 2689 subcc %o2, 32, %o2 ! reduce count by 32
2780 2690 stx %o3, [%o1] ! write long word
2781 2691 add %o0, 8, %o0 ! advance SRC by 8
2782 2692 ldxa [%o0]ASI_USER, %o3 ! repeat for a total of 4 long words
2783 2693 add %o0, 8, %o0 ! advance SRC by 8
2784 2694 stx %o3, [%o1 + 8]
2785 2695 add %o1, 32, %o1 ! advance DST by 32
2786 2696 ldxa [%o0]ASI_USER, %o3
2787 2697 add %o0, 8, %o0 ! advance SRC by 8
2788 2698 stx %o3, [%o1 - 16]
2789 2699 ldxa [%o0]ASI_USER, %o3
2790 2700 add %o0, 8, %o0 ! advance SRC by 8
2791 2701 bgt,pt %ncc, .ci_med_lmove ! loop til 31 or fewer bytes left
2792 2702 stx %o3, [%o1 - 8]
2793 2703 addcc %o2, 24, %o2 ! restore count to long word offset
2794 2704 ble,pt %ncc, .ci_med_lextra ! check for more long words to move
2795 2705 nop
2796 2706 .ci_med_lword:
2797 2707 ldxa [%o0]ASI_USER, %o3 ! read long word
2798 2708 subcc %o2, 8, %o2 ! reduce count by 8
2799 2709 stx %o3, [%o1] ! write long word
2800 2710 add %o0, 8, %o0 ! advance SRC by 8
2801 2711 bgt,pt %ncc, .ci_med_lword ! loop til 7 or fewer bytes left
2802 2712 add %o1, 8, %o1 ! advance DST by 8
2803 2713 .ci_med_lextra:
2804 2714 addcc %o2, 7, %o2 ! restore rest of count
2805 2715 bz,pt %ncc, .ci_sm_exit ! if zero, then done
2806 2716 deccc %o2
2807 2717 bz,pt %ncc, .ci_sm_byte
2808 2718 nop
2809 2719 ba,pt %ncc, .ci_sm_half
2810 2720 nop
2811 2721
2812 2722 .align 16
2813 2723 nop ! instruction alignment
2814 2724 ! see discussion at start of file
2815 2725 .ci_med_word:
2816 2726 btst 3, %o0 ! check for
2817 2727 bz,pt %ncc, .ci_med_word1 ! word alignment
2818 2728 nop
2819 2729 .ci_med_word0:
2820 2730 lduba [%o0]ASI_USER, %o3 ! load one byte
2821 2731 inc %o0
2822 2732 stb %o3,[%o1] ! store byte
2823 2733 inc %o1
2824 2734 btst 3, %o0
2825 2735 bnz,pt %ncc, .ci_med_word0
2826 2736 dec %o2
2827 2737 !
2828 2738 ! Now word aligned and have at least 36 bytes to move
2829 2739 !
2830 2740 .ci_med_word1:
2831 2741 sub %o2, 15, %o2 ! adjust count to allow cc zero test
2832 2742 .ci_med_wmove:
2833 2743 lduwa [%o0]ASI_USER, %o3 ! read word
2834 2744 subcc %o2, 16, %o2 ! reduce count by 16
2835 2745 stw %o3, [%o1] ! write word
2836 2746 add %o0, 4, %o0 ! advance SRC by 4
2837 2747 lduwa [%o0]ASI_USER, %o3 ! repeat for a total of 4 words
2838 2748 add %o0, 4, %o0 ! advance SRC by 4
2839 2749 stw %o3, [%o1 + 4]
2840 2750 add %o1, 16, %o1 ! advance DST by 16
2841 2751 lduwa [%o0]ASI_USER, %o3
2842 2752 add %o0, 4, %o0 ! advance SRC by 4
2843 2753 stw %o3, [%o1 - 8]
2844 2754 lduwa [%o0]ASI_USER, %o3
2845 2755 add %o0, 4, %o0 ! advance SRC by 4
2846 2756 bgt,pt %ncc, .ci_med_wmove ! loop til 15 or fewer bytes left
2847 2757 stw %o3, [%o1 - 4]
2848 2758 addcc %o2, 12, %o2 ! restore count to word offset
2849 2759 ble,pt %ncc, .ci_med_wextra ! check for more words to move
2850 2760 nop
2851 2761 .ci_med_word2:
2852 2762 lduwa [%o0]ASI_USER, %o3 ! read word
2853 2763 subcc %o2, 4, %o2 ! reduce count by 4
2854 2764 stw %o3, [%o1] ! write word
2855 2765 add %o0, 4, %o0 ! advance SRC by 4
2856 2766 bgt,pt %ncc, .ci_med_word2 ! loop til 3 or fewer bytes left
2857 2767 add %o1, 4, %o1 ! advance DST by 4
2858 2768 .ci_med_wextra:
2859 2769 addcc %o2, 3, %o2 ! restore rest of count
2860 2770 bz,pt %ncc, .ci_sm_exit ! if zero, then done
2861 2771 deccc %o2
2862 2772 bz,pt %ncc, .ci_sm_byte
2863 2773 nop
2864 2774 ba,pt %ncc, .ci_sm_half
2865 2775 nop
2866 2776
2867 2777 .align 16
2868 2778 nop ! instruction alignment
2869 2779 ! see discussion at start of file
2870 2780 .ci_med_half:
2871 2781 btst 1, %o0 ! check for
2872 2782 bz,pt %ncc, .ci_med_half1 ! half word alignment
2873 2783 nop
2874 2784 lduba [%o0]ASI_USER, %o3 ! load one byte
2875 2785 inc %o0
2876 2786 stb %o3,[%o1] ! store byte
2877 2787 inc %o1
2878 2788 dec %o2
2879 2789 !
2880 2790 ! Now half word aligned and have at least 38 bytes to move
2881 2791 !
2882 2792 .ci_med_half1:
2883 2793 sub %o2, 7, %o2 ! adjust count to allow cc zero test
2884 2794 .ci_med_hmove:
2885 2795 lduha [%o0]ASI_USER, %o3 ! read half word
2886 2796 subcc %o2, 8, %o2 ! reduce count by 8
2887 2797 sth %o3, [%o1] ! write half word
2888 2798 add %o0, 2, %o0 ! advance SRC by 2
2889 2799 lduha [%o0]ASI_USER, %o3 ! repeat for a total of 4 halfwords
2890 2800 add %o0, 2, %o0 ! advance SRC by 2
2891 2801 sth %o3, [%o1 + 2]
2892 2802 add %o1, 8, %o1 ! advance DST by 8
2893 2803 lduha [%o0]ASI_USER, %o3
2894 2804 add %o0, 2, %o0 ! advance SRC by 2
2895 2805 sth %o3, [%o1 - 4]
2896 2806 lduha [%o0]ASI_USER, %o3
2897 2807 add %o0, 2, %o0 ! advance SRC by 2
2898 2808 bgt,pt %ncc, .ci_med_hmove ! loop til 7 or fewer bytes left
2899 2809 sth %o3, [%o1 - 2]
2900 2810 addcc %o2, 7, %o2 ! restore count
2901 2811 bz,pt %ncc, .ci_sm_exit
2902 2812 deccc %o2
2903 2813 bz,pt %ncc, .ci_sm_byte
2904 2814 nop
2905 2815 ba,pt %ncc, .ci_sm_half
2906 2816 nop
2907 2817
2908 2818 .sm_copyin_err:
2909 2819 membar #Sync
2910 2820 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2911 2821 mov SM_SAVE_SRC, %o0
2912 2822 mov SM_SAVE_DST, %o1
2913 2823 mov SM_SAVE_COUNT, %o2
2914 2824 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler
2915 2825 tst %o3
2916 2826 bz,pt %ncc, 3f ! if not, return error
2917 2827 nop
2918 2828 ldn [%o3 + CP_COPYIN], %o5 ! if handler, invoke it with
2919 2829 jmp %o5 ! original arguments
2920 2830 nop
2921 2831 3:
2922 2832 retl
2923 2833 or %g0, -1, %o0 ! return errno value
2924 2834
2925 2835 SET_SIZE(copyin)
2926 2836
2927 2837
2928 2838 /*
2929 2839 * The _more entry points are not intended to be used directly by
2930 2840 * any caller from outside this file. They are provided to allow
2931 2841 * profiling and DTrace of the portions of the copy code that use
2932 2842 * the floating point registers.
2933 2843 * This entry is particularly important as DTrace (at least as of
2934 2844 * 4/2004) does not support leaf functions.
2935 2845 */
2936 2846
2937 2847 ENTRY(copyin_more)
2938 2848 .copyin_more:
2939 2849 prefetch [%o0], #n_reads
2940 2850 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2941 2851 set .copyin_err, REAL_LOFAULT
2942 2852
2943 2853 /*
2944 2854 * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes
2945 2855 */
2946 2856 .do_copyin:
2947 2857 set copyio_fault, %l7 ! copyio_fault is lofault val
2948 2858
2949 2859 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler
2950 2860 membar #Sync ! sync error barrier
2951 2861 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
2952 2862
2953 2863 mov %i0, SAVE_SRC
2954 2864 mov %i1, SAVE_DST
2955 2865 mov %i2, SAVE_COUNT
2956 2866
2957 2867 FP_NOMIGRATE(6, 7)
2958 2868
2959 2869 rd %fprs, %o2 ! check for unused fp
2960 2870 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
2961 2871 btst FPRS_FEF, %o2
2962 2872 bz,a,pt %icc, .do_blockcopyin
2963 2873 wr %g0, FPRS_FEF, %fprs
2964 2874
2965 2875 BST_FPQ2Q4_TOSTACK(%o2)
2966 2876
2967 2877 .do_blockcopyin:
2968 2878 rd %gsr, %o2
2969 2879 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr
2970 2880 or %l6, FPUSED_FLAG, %l6
2971 2881
2972 2882 andcc DST, VIS_BLOCKSIZE - 1, TMP
2973 2883 mov ASI_USER, %asi
2974 2884 bz,pt %ncc, 2f
2975 2885 neg TMP
2976 2886 add TMP, VIS_BLOCKSIZE, TMP
2977 2887
2978 2888 ! TMP = bytes required to align DST on FP_BLOCK boundary
2979 2889 ! Using SRC as a tmp here
2980 2890 cmp TMP, 3
2981 2891 bleu,pt %ncc, 1f
2982 2892 sub CNT,TMP,CNT ! adjust main count
2983 2893 sub TMP, 3, TMP ! adjust for end of loop test
2984 2894 .ci_blkalign:
2985 2895 lduba [REALSRC]%asi, SRC ! move 4 bytes per loop iteration
2986 2896 stb SRC, [DST]
2987 2897 subcc TMP, 4, TMP
2988 2898 lduba [REALSRC + 1]%asi, SRC
2989 2899 add REALSRC, 4, REALSRC
2990 2900 stb SRC, [DST + 1]
2991 2901 lduba [REALSRC - 2]%asi, SRC
2992 2902 add DST, 4, DST
2993 2903 stb SRC, [DST - 2]
2994 2904 lduba [REALSRC - 1]%asi, SRC
2995 2905 bgu,pt %ncc, .ci_blkalign
2996 2906 stb SRC, [DST - 1]
2997 2907
2998 2908 addcc TMP, 3, TMP ! restore count adjustment
2999 2909 bz,pt %ncc, 2f ! no bytes left?
3000 2910 nop
3001 2911 1: lduba [REALSRC]%asi, SRC
3002 2912 inc REALSRC
3003 2913 inc DST
3004 2914 deccc TMP
3005 2915 bgu %ncc, 1b
3006 2916 stb SRC, [DST - 1]
3007 2917
3008 2918 2:
3009 2919 membar #StoreLoad
3010 2920 andn REALSRC, 0x7, SRC
3011 2921
3012 2922 ! SRC - 8-byte aligned
3013 2923 ! DST - 64-byte aligned
3014 2924 ldda [SRC]%asi, %f16
3015 2925 prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #n_reads
3016 2926 alignaddr REALSRC, %g0, %g0
3017 2927 ldda [SRC + 0x08]%asi, %f18
3018 2928 prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #n_reads
3019 2929 faligndata %f16, %f18, %f48
3020 2930 ldda [SRC + 0x10]%asi, %f20
3021 2931 prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
3022 2932 faligndata %f18, %f20, %f50
3023 2933 ldda [SRC + 0x18]%asi, %f22
3024 2934 prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read
3025 2935 faligndata %f20, %f22, %f52
3026 2936 ldda [SRC + 0x20]%asi, %f24
3027 2937 prefetcha [SRC + (8 * VIS_BLOCKSIZE)]%asi, #one_read
3028 2938 faligndata %f22, %f24, %f54
3029 2939 ldda [SRC + 0x28]%asi, %f26
3030 2940 prefetcha [SRC + (12 * VIS_BLOCKSIZE)]%asi, #one_read
3031 2941 faligndata %f24, %f26, %f56
3032 2942 ldda [SRC + 0x30]%asi, %f28
3033 2943 prefetcha [SRC + (16 * VIS_BLOCKSIZE)]%asi, #one_read
3034 2944 faligndata %f26, %f28, %f58
3035 2945 ldda [SRC + 0x38]%asi, %f30
3036 2946 ldda [SRC + VIS_BLOCKSIZE]%asi, %f16
3037 2947 sub CNT, VIS_BLOCKSIZE, CNT
3038 2948 add SRC, VIS_BLOCKSIZE, SRC
3039 2949 prefetcha [SRC + (19 * VIS_BLOCKSIZE)]%asi, #one_read
3040 2950 add REALSRC, VIS_BLOCKSIZE, REALSRC
3041 2951 ba,pt %ncc, 1f
3042 2952 prefetcha [SRC + (23 * VIS_BLOCKSIZE)]%asi, #one_read
3043 2953 .align 32
3044 2954 1:
3045 2955 ldda [SRC + 0x08]%asi, %f18
3046 2956 faligndata %f28, %f30, %f60
3047 2957 ldda [SRC + 0x10]%asi, %f20
3048 2958 faligndata %f30, %f16, %f62
3049 2959 stda %f48, [DST]ASI_BLK_P
3050 2960 ldda [SRC + 0x18]%asi, %f22
3051 2961 faligndata %f16, %f18, %f48
3052 2962 ldda [SRC + 0x20]%asi, %f24
3053 2963 faligndata %f18, %f20, %f50
3054 2964 ldda [SRC + 0x28]%asi, %f26
3055 2965 faligndata %f20, %f22, %f52
3056 2966 ldda [SRC + 0x30]%asi, %f28
3057 2967 faligndata %f22, %f24, %f54
3058 2968 sub CNT, VIS_BLOCKSIZE, CNT
3059 2969 ldda [SRC + 0x38]%asi, %f30
3060 2970 faligndata %f24, %f26, %f56
3061 2971 add DST, VIS_BLOCKSIZE, DST
3062 2972 ldda [SRC + VIS_BLOCKSIZE]%asi, %f16
3063 2973 faligndata %f26, %f28, %f58
3064 2974 add REALSRC, VIS_BLOCKSIZE, REALSRC
3065 2975 prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
3066 2976 add SRC, VIS_BLOCKSIZE, SRC
3067 2977 prefetcha [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
3068 2978 cmp CNT, VIS_BLOCKSIZE + 8
3069 2979 bgu,pt %ncc, 1b
3070 2980 prefetcha [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
3071 2981
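The loop above is software pipelined: each pass loads the next eight doublewords, faligndata merges adjacent pairs into the aligned output registers %f48-%f62, and stda flushes them with a 64-byte block store. What faligndata computes can be modeled in C for big-endian doublewords, where k is the byte offset latched earlier by alignaddr; this is a sketch of the instruction's semantics, not code from this file:

	#include <stdint.h>

	/*
	 * Model of faligndata: concatenate two big-endian doublewords
	 * and extract the eight bytes starting at byte offset k (0..7),
	 * k having been latched into %gsr by alignaddr.
	 */
	static uint64_t
	faligndata_model(uint64_t hi, uint64_t lo, unsigned int k)
	{
		if (k == 0)
			return (hi);	/* avoid an undefined 64-bit shift */
		return ((hi << (8 * k)) | (lo >> (8 * (8 - k))));
	}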
3072 2982 	! take the fsrc1 (no realignment) path only if REALSRC & 0x7 is 0
3073 2983 cmp CNT, VIS_BLOCKSIZE
3074 2984 bne %ncc, 3f
3075 2985 andcc REALSRC, 0x7, %g0
3076 2986 bz,pt %ncc, 2f
3077 2987 nop
3078 2988 3:
3079 2989 faligndata %f28, %f30, %f60
3080 2990 faligndata %f30, %f16, %f62
3081 2991 stda %f48, [DST]ASI_BLK_P
3082 2992 add DST, VIS_BLOCKSIZE, DST
3083 2993 ba,pt %ncc, 3f
3084 2994 nop
3085 2995 2:
3086 2996 ldda [SRC + 0x08]%asi, %f18
3087 2997 fsrc1 %f28, %f60
3088 2998 ldda [SRC + 0x10]%asi, %f20
3089 2999 fsrc1 %f30, %f62
3090 3000 stda %f48, [DST]ASI_BLK_P
3091 3001 ldda [SRC + 0x18]%asi, %f22
3092 3002 fsrc1 %f16, %f48
3093 3003 ldda [SRC + 0x20]%asi, %f24
3094 3004 fsrc1 %f18, %f50
3095 3005 ldda [SRC + 0x28]%asi, %f26
3096 3006 fsrc1 %f20, %f52
3097 3007 ldda [SRC + 0x30]%asi, %f28
3098 3008 fsrc1 %f22, %f54
3099 3009 ldda [SRC + 0x38]%asi, %f30
3100 3010 fsrc1 %f24, %f56
3101 3011 sub CNT, VIS_BLOCKSIZE, CNT
3102 3012 add DST, VIS_BLOCKSIZE, DST
3103 3013 add SRC, VIS_BLOCKSIZE, SRC
3104 3014 add REALSRC, VIS_BLOCKSIZE, REALSRC
3105 3015 fsrc1 %f26, %f58
3106 3016 fsrc1 %f28, %f60
3107 3017 fsrc1 %f30, %f62
3108 3018 stda %f48, [DST]ASI_BLK_P
3109 3019 add DST, VIS_BLOCKSIZE, DST
3110 3020 ba,a,pt %ncc, 4f
3111 3021 nop
3112 3022
3113 3023 3: tst CNT
3114 3024 bz,a %ncc, 4f
3115 3025 nop
3116 3026
3117 3027 5: lduba [REALSRC]ASI_USER, TMP
3118 3028 inc REALSRC
3119 3029 inc DST
3120 3030 deccc CNT
3121 3031 bgu %ncc, 5b
3122 3032 stb TMP, [DST - 1]
3123 3033 4:
3124 3034
3125 3035 .copyin_exit:
3126 3036 membar #Sync
3127 3037
3128 3038 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr
3129 3039 wr %o2, 0, %gsr
3130 3040
3131 3041 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
3132 3042 btst FPRS_FEF, %o3
3133 3043 bz,pt %icc, 4f
3134 3044 nop
3135 3045
3136 3046 BLD_FPQ2Q4_FROMSTACK(%o2)
3137 3047
3138 3048 ba,pt %ncc, 1f
3139 3049 wr %o3, 0, %fprs ! restore fprs
3140 3050
3141 3051 4:
3142 3052 FZEROQ2Q4
3143 3053 wr %o3, 0, %fprs ! restore fprs
3144 3054
3145 3055 1:
3146 3056 membar #Sync ! sync error barrier
3147 3057 andn %l6, FPUSED_FLAG, %l6
3148 3058 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3149 3059 FP_ALLOWMIGRATE(5, 6)
3150 3060 ret
3151 3061 restore %g0, 0, %o0
3152 3062 /*
3153 3063 * We got here because of a fault during copyin
3154 3064 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
3155 3065 */
3156 3066 .copyin_err:
3157 3067 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
3158 3068 tst %o4
3159 3069 bz,pt %ncc, 2f ! if not, return error
3160 3070 nop
3161 3071 ldn [%o4 + CP_COPYIN], %g2 ! if handler, invoke it with
3162 3072 jmp %g2 ! original arguments
3163 3073 restore %g0, 0, %g0 ! dispose of copy window
3164 3074 2:
3165 3075 ret
3166 3076 restore %g0, -1, %o0 ! return error value
3167 3077
3168 3078
3169 3079 SET_SIZE(copyin_more)
3170 3080
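The .copyin_err sequence above encodes the copyops convention these routines share: on a fault, if the current thread has a T_COPYOPS vector installed, the failing primitive is retried through the vector's CP_COPYIN slot with the original arguments (preserved earlier in SAVE_SRC/SAVE_DST/SAVE_COUNT); otherwise -1 is returned as DDI/DKI requires. Roughly, in C (the struct shapes below are illustrative stand-ins, not the kernel's headers):

	#include <stddef.h>

	/* Illustrative stand-ins for the kernel's copyops hook. */
	struct copyops {
		int	(*cp_copyin)(const void *, void *, size_t);
	};
	struct kthread {
		struct copyops	*t_copyops;
	};
	extern struct kthread *curthread;

	static int
	copyin_err_model(const void *uaddr, void *kaddr, size_t count)
	{
		struct copyops *cp = curthread->t_copyops;

		if (cp != NULL)			/* handler installed? */
			return (cp->cp_copyin(uaddr, kaddr, count));
		return (-1);			/* DDI/DKI error value */
	}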
3171 -#endif /* lint */
3172 -
3173 -#ifdef lint
3174 -
3175 -/*ARGSUSED*/
3176 -int
3177 -xcopyin(const void *uaddr, void *kaddr, size_t count)
3178 -{ return (0); }
3179 -
3180 -#else /* lint */
3181 -
3182 3081 ENTRY(xcopyin)
3183 3082
3184 3083 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
3185 3084 	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3186 3085 xor %o0, %o1, %o3 ! are src, dst alignable?
3187 3086 btst 7, %o3 !
3188 3087 bz,pt %ncc, .xcopyin_8 ! check for longword alignment
3189 3088 nop
3190 3089 btst 1, %o3 !
3191 3090 bz,pt %ncc, .xcopyin_2 ! check for half-word
3192 3091 nop
3193 3092 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
3194 3093 ld [%o3 + %lo(hw_copy_limit_1)], %o3
3195 3094 tst %o3
3196 3095 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
3197 3096 cmp %o2, %o3 ! if length <= limit
3198 3097 bleu,pt %ncc, .xcopyin_small ! go to small copy
3199 3098 nop
3200 3099 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
3201 3100 nop
3202 3101 .xcopyin_2:
3203 3102 btst 3, %o3 !
3204 3103 bz,pt %ncc, .xcopyin_4 ! check for word alignment
3205 3104 nop
3206 3105 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
3207 3106 ld [%o3 + %lo(hw_copy_limit_2)], %o3
3208 3107 tst %o3
3209 3108 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
3210 3109 cmp %o2, %o3 ! if length <= limit
3211 3110 bleu,pt %ncc, .xcopyin_small ! go to small copy
3212 3111 nop
3213 3112 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
3214 3113 nop
3215 3114 .xcopyin_4:
3216 3115 ! already checked longword, must be word aligned
3217 3116 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
3218 3117 ld [%o3 + %lo(hw_copy_limit_4)], %o3
3219 3118 tst %o3
3220 3119 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
3221 3120 cmp %o2, %o3 ! if length <= limit
3222 3121 bleu,pt %ncc, .xcopyin_small ! go to small copy
3223 3122 nop
3224 3123 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
3225 3124 nop
3226 3125 .xcopyin_8:
3227 3126 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
3228 3127 ld [%o3 + %lo(hw_copy_limit_8)], %o3
3229 3128 tst %o3
3230 3129 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
3231 3130 cmp %o2, %o3 ! if length <= limit
3232 3131 bleu,pt %ncc, .xcopyin_small ! go to small copy
3233 3132 nop
3234 3133 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
3235 3134 nop
3236 3135
3237 3136 .xcopyin_small:
3238 3137 sethi %hi(.sm_xcopyin_err), %o5 ! .sm_xcopyin_err is lofault value
3239 3138 or %o5, %lo(.sm_xcopyin_err), %o5
3240 3139 	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofault
3241 3140 membar #Sync ! sync error barrier
3242 3141 ba,pt %ncc, .sm_do_copyin ! common code
3243 3142 stn %o5, [THREAD_REG + T_LOFAULT]
3244 3143
3245 3144 .xcopyin_more:
3246 3145 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3247 3146 sethi %hi(.xcopyin_err), REAL_LOFAULT ! .xcopyin_err is lofault value
3248 3147 ba,pt %ncc, .do_copyin
3249 3148 or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
3250 3149
3251 3150 /*
3252 3151 	 * We got here because of a fault during xcopyin
3253 3152 * Errno value is in ERRNO
3254 3153 */
3255 3154 .xcopyin_err:
3256 3155 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
3257 3156 tst %o4
3258 3157 bz,pt %ncc, 2f ! if not, return error
3259 3158 nop
3260 3159 ldn [%o4 + CP_XCOPYIN], %g2 ! if handler, invoke it with
3261 3160 jmp %g2 ! original arguments
3262 3161 restore %g0, 0, %g0 ! dispose of copy window
3263 3162 2:
3264 3163 ret
3265 3164 restore ERRNO, 0, %o0 ! return errno value
3266 3165
3267 3166 .sm_xcopyin_err:
3268 3167
3269 3168 membar #Sync
3270 3169 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3271 3170 mov SM_SAVE_SRC, %o0
3272 3171 mov SM_SAVE_DST, %o1
3273 3172 mov SM_SAVE_COUNT, %o2
3274 3173 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler
3275 3174 tst %o3
3276 3175 bz,pt %ncc, 3f ! if not, return error
3277 3176 nop
3278 3177 ldn [%o3 + CP_XCOPYIN], %o5 ! if handler, invoke it with
3279 3178 jmp %o5 ! original arguments
3280 3179 nop
3281 3180 3:
3282 3181 retl
3283 3182 or %g1, 0, %o0 ! return errno value
3284 3183
3285 3184 SET_SIZE(xcopyin)
3286 3185
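The alignment dispatch above is the pattern every entry point uses: xor the source and destination addresses so their shared low bits reveal the widest mutual alignment, then compare the length against the matching hw_copy_limit_N tunable, where zero disables the FP/VIS path. As C, assuming the limits are the DGDEF words defined at the bottom of this file and an illustrative threshold value (the real VIS_COPY_THRESHOLD is defined earlier in the file):

	#include <stddef.h>
	#include <stdint.h>

	#define	VIS_COPY_THRESHOLD	256	/* illustrative value */

	extern unsigned int hw_copy_limit_1, hw_copy_limit_2,
	    hw_copy_limit_4, hw_copy_limit_8;

	/* Model of the small-copy vs. FP-block dispatch in xcopyin. */
	static int
	use_fpblk_copy(uintptr_t src, uintptr_t dst, size_t len)
	{
		uintptr_t x = src ^ dst;  /* shared low bits => alignable */
		unsigned int limit;

		if (len <= VIS_COPY_THRESHOLD)
			return (0);			/* small copy */
		if ((x & 7) == 0)
			limit = hw_copy_limit_8;	/* longword alignable */
		else if ((x & 1) != 0)
			limit = hw_copy_limit_1;	/* byte alignment only */
		else if ((x & 3) != 0)
			limit = hw_copy_limit_2;	/* halfword alignable */
		else
			limit = hw_copy_limit_4;	/* word alignable */
		return (limit != 0 && len > limit);	/* 0 disables HW copy */
	}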
3287 -#endif /* lint */
3288 -
3289 -#ifdef lint
3290 -
3291 -/*ARGSUSED*/
3292 -int
3293 -xcopyin_little(const void *uaddr, void *kaddr, size_t count)
3294 -{ return (0); }
3295 -
3296 -#else /* lint */
3297 -
3298 3186 ENTRY(xcopyin_little)
3299 3187 sethi %hi(.xcopyio_err), %o5
3300 3188 or %o5, %lo(.xcopyio_err), %o5
3301 3189 ldn [THREAD_REG + T_LOFAULT], %o4
3302 3190 membar #Sync ! sync error barrier
3303 3191 stn %o5, [THREAD_REG + T_LOFAULT]
3304 3192 mov %o4, %o5
3305 3193
3306 3194 subcc %g0, %o2, %o3
3307 3195 add %o0, %o2, %o0
3308 3196 bz,pn %ncc, 2f ! check for zero bytes
3309 3197 sub %o2, 1, %o4
3310 3198 add %o0, %o4, %o0 ! start w/last byte
3311 3199 add %o1, %o2, %o1
3312 3200 lduba [%o0 + %o3]ASI_AIUSL, %o4
3313 3201
3314 3202 1: stb %o4, [%o1 + %o3]
3315 3203 inccc %o3
3316 3204 sub %o0, 2, %o0 ! get next byte
3317 3205 bcc,a,pt %ncc, 1b
3318 3206 lduba [%o0 + %o3]ASI_AIUSL, %o4
3319 3207
3320 3208 2:
3321 3209 membar #Sync ! sync error barrier
3322 3210 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3323 3211 retl
3324 3212 mov %g0, %o0 ! return (0)
3325 3213
3326 3214 .xcopyio_err:
3327 3215 membar #Sync ! sync error barrier
3328 3216 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3329 3217 retl
3330 3218 mov %g1, %o0
3331 3219
3332 3220 SET_SIZE(xcopyin_little)
3333 3221
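xcopyin_little above walks the user buffer from its last byte to its first through ASI_AIUSL, the little-endian user ASI, so the destination receives the source bytes in reverse order; it returns 0 on success, or the errno left in %g1 by the trap handler on a fault. Stripped of the fault handling (the t_lofault swap and error return), the data movement reduces to this sketch:

	#include <stddef.h>
	#include <stdint.h>

	/*
	 * Byte-level model of xcopyin_little: copy count bytes with the
	 * byte order reversed, so kaddr[0] receives uaddr[count - 1].
	 * The real routine performs the loads through ASI_AIUSL; that
	 * machinery is elided here.
	 */
	static void
	xcopyin_little_model(const uint8_t *uaddr, uint8_t *kaddr, size_t count)
	{
		size_t i;

		for (i = 0; i < count; i++)
			kaddr[i] = uaddr[count - 1 - i];
	}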
3334 -#endif /* lint */
3335 3222
3336 -
3337 3223 /*
3338 3224 * Copy a block of storage - must not overlap (from + len <= to).
3339 3225 * No fault handler installed (to be called under on_fault())
3340 3226 */
3341 -#if defined(lint)
3342 -
3343 -/* ARGSUSED */
3344 -void
3345 -copyin_noerr(const void *ufrom, void *kto, size_t count)
3346 -{}
3347 -
3348 -#else /* lint */
3349 3227 ENTRY(copyin_noerr)
3350 3228
3351 3229 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
3352 3230 	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3353 3231 xor %o0, %o1, %o3 ! are src, dst alignable?
3354 3232 btst 7, %o3 !
3355 3233 bz,pt %ncc, .copyin_ne_8 ! check for longword alignment
3356 3234 nop
3357 3235 btst 1, %o3 !
3358 3236 bz,pt %ncc, .copyin_ne_2 ! check for half-word
3359 3237 nop
3360 3238 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
3361 3239 ld [%o3 + %lo(hw_copy_limit_1)], %o3
3362 3240 tst %o3
3363 3241 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
3364 3242 cmp %o2, %o3 ! if length <= limit
3365 3243 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3366 3244 nop
3367 3245 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
3368 3246 nop
3369 3247 .copyin_ne_2:
3370 3248 btst 3, %o3 !
3371 3249 bz,pt %ncc, .copyin_ne_4 ! check for word alignment
3372 3250 nop
3373 3251 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
3374 3252 ld [%o3 + %lo(hw_copy_limit_2)], %o3
3375 3253 tst %o3
3376 3254 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
3377 3255 cmp %o2, %o3 ! if length <= limit
3378 3256 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3379 3257 nop
3380 3258 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
3381 3259 nop
3382 3260 .copyin_ne_4:
3383 3261 ! already checked longword, must be word aligned
3384 3262 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
3385 3263 ld [%o3 + %lo(hw_copy_limit_4)], %o3
3386 3264 tst %o3
3387 3265 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
3388 3266 cmp %o2, %o3 ! if length <= limit
3389 3267 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3390 3268 nop
3391 3269 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
3392 3270 nop
3393 3271 .copyin_ne_8:
3394 3272 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
3395 3273 ld [%o3 + %lo(hw_copy_limit_8)], %o3
3396 3274 tst %o3
3397 3275 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
3398 3276 cmp %o2, %o3 ! if length <= limit
3399 3277 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3400 3278 nop
3401 3279 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
3402 3280 nop
3403 3281
3404 3282 .copyin_ne_small:
3405 3283 ldn [THREAD_REG + T_LOFAULT], %o4
3406 3284 tst %o4
3407 3285 bz,pn %ncc, .sm_do_copyin
3408 3286 nop
3409 3287 sethi %hi(.sm_copyio_noerr), %o5
3410 3288 or %o5, %lo(.sm_copyio_noerr), %o5
3411 3289 membar #Sync ! sync error barrier
3412 3290 ba,pt %ncc, .sm_do_copyin
3413 3291 stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault
3414 3292
3415 3293 .copyin_noerr_more:
3416 3294 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3417 3295 sethi %hi(.copyio_noerr), REAL_LOFAULT
3418 3296 ba,pt %ncc, .do_copyin
3419 3297 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3420 3298
3421 3299 .copyio_noerr:
3422 3300 jmp %l6
3423 3301 restore %g0,0,%g0
3424 3302
3425 3303 .sm_copyio_noerr:
3426 3304 membar #Sync
3427 3305 stn %o4, [THREAD_REG + T_LOFAULT] ! restore t_lofault
3428 3306 jmp %o4
3429 3307 nop
3430 3308
3431 3309 SET_SIZE(copyin_noerr)
3432 -#endif /* lint */
3433 3310
3434 3311 /*
3435 3312 * Copy a block of storage - must not overlap (from + len <= to).
3436 3313 * No fault handler installed (to be called under on_fault())
3437 3314 */
3438 3315
3439 -#if defined(lint)
3440 -
3441 -/* ARGSUSED */
3442 -void
3443 -copyout_noerr(const void *kfrom, void *uto, size_t count)
3444 -{}
3445 -
3446 -#else /* lint */
3447 3316 ENTRY(copyout_noerr)
3448 3317
3449 3318 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
3450 3319 	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3451 3320 xor %o0, %o1, %o3 ! are src, dst alignable?
3452 3321 btst 7, %o3 !
3453 3322 bz,pt %ncc, .copyout_ne_8 ! check for longword alignment
3454 3323 nop
3455 3324 btst 1, %o3 !
3456 3325 bz,pt %ncc, .copyout_ne_2 ! check for half-word
3457 3326 nop
3458 3327 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
3459 3328 ld [%o3 + %lo(hw_copy_limit_1)], %o3
3460 3329 tst %o3
3461 3330 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
3462 3331 cmp %o2, %o3 ! if length <= limit
3463 3332 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3464 3333 nop
3465 3334 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
3466 3335 nop
3467 3336 .copyout_ne_2:
3468 3337 btst 3, %o3 !
3469 3338 bz,pt %ncc, .copyout_ne_4 ! check for word alignment
3470 3339 nop
3471 3340 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
3472 3341 ld [%o3 + %lo(hw_copy_limit_2)], %o3
3473 3342 tst %o3
3474 3343 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
3475 3344 cmp %o2, %o3 ! if length <= limit
3476 3345 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3477 3346 nop
3478 3347 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
3479 3348 nop
3480 3349 .copyout_ne_4:
3481 3350 ! already checked longword, must be word aligned
3482 3351 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
3483 3352 ld [%o3 + %lo(hw_copy_limit_4)], %o3
3484 3353 tst %o3
3485 3354 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
3486 3355 cmp %o2, %o3 ! if length <= limit
3487 3356 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3488 3357 nop
3489 3358 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
3490 3359 nop
3491 3360 .copyout_ne_8:
3492 3361 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
3493 3362 ld [%o3 + %lo(hw_copy_limit_8)], %o3
3494 3363 tst %o3
3495 3364 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
3496 3365 cmp %o2, %o3 ! if length <= limit
3497 3366 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3498 3367 nop
3499 3368 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
3500 3369 nop
3501 3370
3502 3371 .copyout_ne_small:
3503 3372 ldn [THREAD_REG + T_LOFAULT], %o4
3504 3373 tst %o4
3505 3374 bz,pn %ncc, .sm_do_copyout
3506 3375 nop
3507 3376 sethi %hi(.sm_copyio_noerr), %o5
3508 3377 or %o5, %lo(.sm_copyio_noerr), %o5
3509 3378 membar #Sync ! sync error barrier
3510 3379 ba,pt %ncc, .sm_do_copyout
3511 3380 stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault
3512 3381
3513 3382 .copyout_noerr_more:
3514 3383 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3515 3384 sethi %hi(.copyio_noerr), REAL_LOFAULT
3516 3385 ba,pt %ncc, .do_copyout
3517 3386 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3518 3387
3519 3388 SET_SIZE(copyout_noerr)
3520 -#endif /* lint */
3521 3389
3522 3390
3523 3391 /*
3524 3392  * hwblkclr - clears block-aligned regions that are a multiple of the
3525 3393  * block size and at least 256 bytes long, using spitfire's block stores. If
3526 3394 * the criteria for using this routine are not met then it calls bzero
3527 3395 * and returns 1. Otherwise 0 is returned indicating success.
3528 3396 * Caller is responsible for ensuring use_hw_bzero is true and that
3529 3397 * kpreempt_disable() has been called.
3530 3398 */
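The entry checks described above translate directly to C: punt to bzero unless the region is block aligned, a multiple of the block size, and at least 256 bytes long. A rendering of that gatekeeping, with hwblkclr_blocks standing in for the FP block-store body (a hypothetical name used only for this sketch):

	#include <stddef.h>
	#include <stdint.h>

	extern void bzero(void *, size_t);
	extern int hwblkclr_blocks(void *, size_t);	/* hypothetical */

	#define	VIS_BLOCKSIZE	64

	/*
	 * Model of hwblkclr's entry checks: fall back to bzero and
	 * return 1 when the block-store criteria are not met, so the
	 * caller knows block operations were not used.
	 */
	static int
	hwblkclr_model(void *addr, size_t len)
	{
		if (((uintptr_t)addr & (VIS_BLOCKSIZE - 1)) != 0 ||
		    len < 256 || (len & (VIS_BLOCKSIZE - 1)) != 0) {
			bzero(addr, len);
			return (1);	/* did not use block operations */
		}
		return (hwblkclr_blocks(addr, len));	/* 0 on success */
	}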
3531 -#ifdef lint
3532 -/*ARGSUSED*/
3533 -int
3534 -hwblkclr(void *addr, size_t len)
3535 -{
3536 - return(0);
3537 -}
3538 -#else /* lint */
3539 3399 ! %i0 - start address
3540 3400 ! %i1 - length of region (multiple of 64)
3541 3401 ! %l0 - saved fprs
3542 3402 ! %l1 - pointer to saved %d0 block
3543 3403 ! %l2 - saved curthread->t_lwp
3544 3404
3545 3405 ENTRY(hwblkclr)
3546 3406 ! get another window w/space for one aligned block of saved fpregs
3547 3407 save %sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp
3548 3408
3549 3409 ! Must be block-aligned
3550 3410 andcc %i0, (VIS_BLOCKSIZE-1), %g0
3551 3411 bnz,pn %ncc, 1f
3552 3412 nop
3553 3413
3554 3414 ! ... and must be 256 bytes or more
3555 3415 cmp %i1, 256
3556 3416 blu,pn %ncc, 1f
3557 3417 nop
3558 3418
3559 3419 ! ... and length must be a multiple of VIS_BLOCKSIZE
3560 3420 andcc %i1, (VIS_BLOCKSIZE-1), %g0
3561 3421 bz,pn %ncc, 2f
3562 3422 nop
3563 3423
3564 3424 1: ! punt, call bzero but notify the caller that bzero was used
3565 3425 mov %i0, %o0
3566 3426 call bzero
3567 3427 mov %i1, %o1
3568 3428 ret
3569 3429 restore %g0, 1, %o0 ! return (1) - did not use block operations
3570 3430
3571 3431 2: rd %fprs, %l0 ! check for unused fp
3572 3432 btst FPRS_FEF, %l0
3573 3433 bz,pt %icc, 1f
3574 3434 nop
3575 3435
3576 3436 ! save in-use fpregs on stack
3577 3437 membar #Sync
3578 3438 add %fp, STACK_BIAS - 65, %l1
3579 3439 and %l1, -VIS_BLOCKSIZE, %l1
3580 3440 stda %d0, [%l1]ASI_BLK_P
3581 3441
3582 3442 1: membar #StoreStore|#StoreLoad|#LoadStore
3583 3443 wr %g0, FPRS_FEF, %fprs
3584 3444 wr %g0, ASI_BLK_P, %asi
3585 3445
3586 3446 ! Clear block
3587 3447 fzero %d0
3588 3448 fzero %d2
3589 3449 fzero %d4
3590 3450 fzero %d6
3591 3451 fzero %d8
3592 3452 fzero %d10
3593 3453 fzero %d12
3594 3454 fzero %d14
3595 3455
3596 3456 mov 256, %i3
3597 3457 ba,pt %ncc, .pz_doblock
3598 3458 nop
3599 3459
3600 3460 .pz_blkstart:
3601 3461 ! stda %d0, [%i0 + 192]%asi ! in dly slot of branch that got us here
3602 3462 stda %d0, [%i0 + 128]%asi
3603 3463 stda %d0, [%i0 + 64]%asi
3604 3464 stda %d0, [%i0]%asi
3605 3465 .pz_zinst:
3606 3466 add %i0, %i3, %i0
3607 3467 sub %i1, %i3, %i1
3608 3468 .pz_doblock:
3609 3469 cmp %i1, 256
3610 3470 bgeu,a %ncc, .pz_blkstart
3611 3471 stda %d0, [%i0 + 192]%asi
3612 3472
3613 3473 cmp %i1, 64
3614 3474 blu %ncc, .pz_finish
3615 3475
3616 3476 andn %i1, (64-1), %i3
3617 3477 srl %i3, 4, %i2 ! using blocks, 1 instr / 16 words
3618 3478 set .pz_zinst, %i4
3619 3479 sub %i4, %i2, %i4
3620 3480 jmp %i4
3621 3481 nop
3622 3482
3623 3483 .pz_finish:
3624 3484 membar #Sync
3625 3485 btst FPRS_FEF, %l0
3626 3486 bz,a .pz_finished
3627 3487 wr %l0, 0, %fprs ! restore fprs
3628 3488
3629 3489 ! restore fpregs from stack
3630 3490 ldda [%l1]ASI_BLK_P, %d0
3631 3491 membar #Sync
3632 3492 wr %l0, 0, %fprs ! restore fprs
3633 3493
3634 3494 .pz_finished:
3635 3495 ret
3636 3496 restore %g0, 0, %o0 ! return (bzero or not)
3637 3497
3638 3498 SET_SIZE(hwblkclr)
3639 -#endif /* lint */
3640 3499
3641 -#ifdef lint
3642 -/*ARGSUSED*/
3643 -void
3644 -hw_pa_bcopy32(uint64_t src, uint64_t dst)
3645 -{}
3646 -#else /*!lint */
3647 3500 /*
3648 3501 * Copy 32 bytes of data from src (%o0) to dst (%o1)
3649 3502 * using physical addresses.
3650 3503 */
3651 3504 ENTRY_NP(hw_pa_bcopy32)
3652 3505 rdpr %pstate, %g1
3653 3506 andn %g1, PSTATE_IE, %g2
3654 3507 wrpr %g0, %g2, %pstate
3655 3508
3656 3509 rdpr %pstate, %g0
3657 3510 ldxa [%o0]ASI_MEM, %o2
3658 3511 add %o0, 8, %o0
3659 3512 ldxa [%o0]ASI_MEM, %o3
3660 3513 add %o0, 8, %o0
3661 3514 ldxa [%o0]ASI_MEM, %o4
3662 3515 add %o0, 8, %o0
3663 3516 ldxa [%o0]ASI_MEM, %o5
3664 3517 membar #Sync
3665 3518
3666 3519 stxa %o2, [%o1]ASI_MEM
3667 3520 add %o1, 8, %o1
3668 3521 stxa %o3, [%o1]ASI_MEM
3669 3522 add %o1, 8, %o1
3670 3523 stxa %o4, [%o1]ASI_MEM
3671 3524 add %o1, 8, %o1
3672 3525 stxa %o5, [%o1]ASI_MEM
3673 3526
3674 3527 retl
3675 3528 wrpr %g0, %g1, %pstate
3676 3529
3677 3530 SET_SIZE(hw_pa_bcopy32)
3678 3531
3679 -#endif /* lint */
3680 -
3681 -#if defined(lint)
3682 -
3683 -int use_hw_bcopy = 1;
3684 -int use_hw_bzero = 1;
3685 -uint_t hw_copy_limit_1 = 0;
3686 -uint_t hw_copy_limit_2 = 0;
3687 -uint_t hw_copy_limit_4 = 0;
3688 -uint_t hw_copy_limit_8 = 0;
3689 -
3690 -#else /* !lint */
3691 -
3692 3532 DGDEF(use_hw_bcopy)
3693 3533 .word 1
3694 3534 DGDEF(use_hw_bzero)
3695 3535 .word 1
3696 3536 DGDEF(hw_copy_limit_1)
3697 3537 .word 0
3698 3538 DGDEF(hw_copy_limit_2)
3699 3539 .word 0
3700 3540 DGDEF(hw_copy_limit_4)
3701 3541 .word 0
3702 3542 DGDEF(hw_copy_limit_8)
3703 3543 .word 0
3704 3544
3705 3545 .align 64
3706 3546 .section ".text"
3707 -#endif /* !lint */