de-linting of .s files
--- old/usr/src/uts/sun4u/cpu/cheetah_copy.s
+++ new/usr/src/uts/sun4u/cpu/cheetah_copy.s
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License, Version 1.0 only
6 6 * (the "License"). You may not use this file except in compliance
7 7 * with the License.
8 8 *
9 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 10 * or http://www.opensolaris.org/os/licensing.
11 11 * See the License for the specific language governing permissions
12 12 * and limitations under the License.
13 13 *
14 14 * When distributing Covered Code, include this CDDL HEADER in each
15 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 16 * If applicable, add the following below this CDDL HEADER, with the
17 17 * fields enclosed by brackets "[]" replaced with your own identifying
18 18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 19 *
20 20 * CDDL HEADER END
21 21 */
22 22 /*
23 23 * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 */
26 26
27 -#pragma ident "%Z%%M% %I% %E% SMI"
28 -
29 27 #include <sys/param.h>
30 28 #include <sys/errno.h>
31 29 #include <sys/asm_linkage.h>
32 30 #include <sys/vtrace.h>
33 31 #include <sys/machthread.h>
34 32 #include <sys/clock.h>
35 33 #include <sys/asi.h>
36 34 #include <sys/fsr.h>
37 35 #include <sys/privregs.h>
38 36 #include <sys/fpras_impl.h>
39 37
40 -#if !defined(lint)
41 38 #include "assym.h"
42 -#endif /* lint */
43 39
44 40 /*
45 41 * Pseudo-code to aid in understanding the control flow of the
46 42 * bcopy/copyin/copyout routines.
47 43 *
48 44 * On entry:
49 45 *
50 46 * ! Determine whether to use the FP register version
51 47 * ! or the leaf routine version depending on size
52 48 * ! of copy and flags. Set up error handling accordingly.
53 49 * ! The transition point depends on whether the src and
54 50 * ! dst addresses can be aligned to long word, word,
55 51 * ! half word, or byte boundaries.
56 52 * !
57 53 * ! WARNING: <Register usage convention>
58 54 * ! For FP version, %l6 holds previous error handling and
59 55 * ! a flag: TRAMP_FLAG (low bits)
60 56 * ! for leaf routine version, %o4 holds those values.
61 57 * ! So either %l6 or %o4 is reserved and not available for
62 58 * ! any other use.
63 59 *
64 60 * if (length <= VIS_COPY_THRESHOLD) ! start with a quick test
65 61 * go to small_copy; ! to speed short copies
66 62 *
67 63 * if (src,dst long word alignable) {
68 64 * if (hw_copy_limit_8 == 0) ! hw_copy disabled
69 65 * go to small_copy;
70 66 * if (length <= hw_copy_limit_8)
71 67 * go to small_copy;
72 68 * go to FPBLK_copy;
73 69 * }
74 70 * if (src,dst not alignable) {
75 71 * if (hw_copy_limit_1 == 0) ! hw_copy disabled
76 72 * go to small_copy;
77 73 * if (length <= hw_copy_limit_1)
78 74 * go to small_copy;
79 75 * go to FPBLK_copy;
80 76 * }
81 77 * if (src,dst halfword alignable) {
82 78 * if (hw_copy_limit_2 == 0) ! hw_copy disabled
83 79 * go to small_copy;
84 80 * if (length <= hw_copy_limit_2)
85 81 * go to small_copy;
86 82 * go to FPBLK_copy;
87 83 * }
88 84 * if (src,dst word alignable) {
89 85 * if (hw_copy_limit_4 == 0) ! hw_copy disabled
90 86 * go to small_copy;
91 87 * if (length <= hw_copy_limit_4)
92 88 * go to small_copy;
93 89 * go to FPBLK_copy;
94 90 * }
95 91 *
96 92 * small_copy:
97 93 * Setup_leaf_rtn_error_handler; ! diffs for each entry point
98 94 *
99 95 * if (count <= 3) ! fast path for tiny copies
100 96 * go to sm_left; ! special finish up code
101 97 * else
102 98 * if (count > CHKSIZE) ! medium sized copies
103 99 * go to sm_med ! tuned by alignment
104 100 * if(src&dst not both word aligned) {
105 101 * sm_movebytes:
106 102 * move byte by byte in 4-way unrolled loop
107 103 * fall into sm_left;
108 104 * sm_left:
109 105 * move 0-3 bytes byte at a time as needed.
110 106 * restore error handler and exit.
111 107 *
112 108 * } else { ! src&dst are word aligned
113 109 * check for at least 8 bytes left,
114 110 * move word at a time, unrolled by 2
115 111 * when fewer than 8 bytes left,
116 112 * sm_half: move half word at a time while 2 or more bytes left
117 113 * sm_byte: move final byte if necessary
118 114 * sm_exit:
119 115 * restore error handler and exit.
120 116 * }
121 117 *
122 118 * ! Medium length cases with at least CHKSIZE bytes available
123 119 * ! method: line up src and dst as best possible, then
124 120 * ! move data in 4-way unrolled loops.
125 121 *
126 122 * sm_med:
127 123 * if(src&dst unalignable)
128 124 * go to sm_movebytes
129 125 * if(src&dst halfword alignable)
130 126 * go to sm_movehalf
131 127 * if(src&dst word alignable)
132 128 * go to sm_moveword
133 129 * ! fall into long word movement
134 130 * move bytes until src is word aligned
135 131 * if not long word aligned, move a word
136 132 * move long words in 4-way unrolled loop until < 32 bytes left
137 133 * move long words in 1-way unrolled loop until < 8 bytes left
138 134 * if zero bytes left, goto sm_exit
139 135 * if one byte left, go to sm_byte
140 136 * else go to sm_half
141 137 *
142 138 * sm_moveword:
143 139 * move bytes until src is word aligned
144 140 * move words in 4-way unrolled loop until < 16 bytes left
145 141 * move words in 1-way unrolled loop until < 4 bytes left
146 142 * if zero bytes left, goto sm_exit
147 143 * if one byte left, go to sm_byte
148 144 * else go to sm_half
149 145 *
150 146 * sm_movehalf:
151 147 * move a byte if needed to align src on halfword
152 148 * move halfwords in 4-way unrolled loop until < 8 bytes left
153 149 * if zero bytes left, goto sm_exit
154 150 * if one byte left, go to sm_byte
155 151 * else go to sm_half
156 152 *
157 153 *
158 154 * FPBLK_copy:
159 155 * %l6 = curthread->t_lofault;
160 156 * if (%l6 != NULL) {
161 157 * membar #Sync
162 158 * curthread->t_lofault = .copyerr;
163 159 * caller_error_handler = TRUE ! %l6 |= 2
164 160 * }
165 161 *
166 162 * ! for FPU testing we must not migrate cpus
167 163 * if (curthread->t_lwp == NULL) {
168 164 * ! Kernel threads do not have pcb's in which to store
169 165 * ! the floating point state, so disallow preemption during
170 166 * ! the copy. This also prevents cpu migration.
171 167 * kpreempt_disable(curthread);
172 168 * } else {
173 169 * thread_nomigrate();
174 170 * }
175 171 *
176 172 * old_fprs = %fprs;
177 173 * old_gsr = %gsr;
178 174 * if (%fprs.fef) {
179 175 * %fprs.fef = 1;
180 176 * save current fpregs on stack using blockstore
181 177 * } else {
182 178 * %fprs.fef = 1;
183 179 * }
184 180 *
185 181 *
186 182 * do_blockcopy_here;
187 183 *
188 184 * In lofault handler:
189 185 * curthread->t_lofault = .copyerr2;
190 186 * Continue on with the normal exit handler
191 187 *
192 188 * On normal exit:
193 189 * %gsr = old_gsr;
194 190 * if (old_fprs & FPRS_FEF)
195 191 * restore fpregs from stack using blockload
196 192 * else
197 193 * zero fpregs
198 194 * %fprs = old_fprs;
199 195 * membar #Sync
200 196 * curthread->t_lofault = (%l6 & ~3);
201 197 * ! following test omitted from copyin/copyout as they
202 198 * ! will always have a current thread
203 199 * if (curthread->t_lwp == NULL)
204 200 * kpreempt_enable(curthread);
205 201 * else
206 202 * thread_allowmigrate();
207 203 * return (0)
208 204 *
209 205 * In second lofault handler (.copyerr2):
210 206 * We've tried to restore fp state from the stack and failed. To
211 207 * prevent returning with a corrupted fp state, we will panic.
212 208 */
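
A minimal C model of the entry-point dispatch sketched above. It is not
part of this file; use_fpblk() is an invented name, and only the
hw_copy_limit_* variables and VIS_COPY_THRESHOLD come from the source.

    #include <stddef.h>
    #include <stdint.h>

    extern unsigned int hw_copy_limit_1, hw_copy_limit_2,
        hw_copy_limit_4, hw_copy_limit_8;
    #define VIS_COPY_THRESHOLD 256

    /* Nonzero when the FP/VIS block-copy path should be taken. */
    static int
    use_fpblk(const void *src, void *dst, size_t len)
    {
            uintptr_t d = (uintptr_t)src ^ (uintptr_t)dst;
            unsigned int limit;

            if (len <= VIS_COPY_THRESHOLD)  /* quick test speeds short copies */
                    return (0);
            if ((d & 7) == 0)               /* long word alignable: test first */
                    limit = hw_copy_limit_8;
            else if ((d & 1) != 0)          /* not even halfword alignable */
                    limit = hw_copy_limit_1;
            else if ((d & 3) != 0)          /* halfword alignable */
                    limit = hw_copy_limit_2;
            else                            /* word alignable */
                    limit = hw_copy_limit_4;
            return (limit != 0 && len > limit);     /* zero disables HW copy */
    }
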
213 209
214 210 /*
215 211 * Comments about optimization choices
216 212 *
217 213 * The initial optimization decision in this code is to determine
218 214 * whether to use the FP registers for a copy or not. If we don't
219 215 * use the FP registers, we can execute the copy as a leaf routine,
220 216 * saving a register save and restore. Also, less elaborate setup
221 217 * is required, allowing short copies to be completed more quickly.
222 218 * For longer copies, especially unaligned ones (where the src and
223 219 * dst do not align to allow simple ldx,stx operation), the FP
224 220 * registers allow much faster copy operations.
225 221 *
226 222 * The estimated extra cost of the FP path will vary depending on
227 223 * src/dst alignment, dst offset from the next 64 byte FPblock store
228 224 * boundary, remaining src data after the last full dst cache line is
229 225 * moved, whether the FP registers need to be saved, and some other
230 226 * minor issues. The average additional overhead is estimated to be
231 227 * 400 clocks. Since each non-repeated/predicted tst and branch costs
232 228 * around 10 clocks, elaborate calculation would slow down all
233 229 * longer copies and only benefit a small portion of medium sized
234 230 * copies. Rather than incur such cost, we chose fixed transition
235 231 * points for each of the alignment choices.
236 232 *
237 233 * For the inner loop, here is a comparison of the per cache line
238 234 * costs for each alignment when src&dst are in cache:
239 235 *
240 236 * byte aligned: 108 clocks slower for non-FPBLK
241 237 * half aligned: 44 clocks slower for non-FPBLK
242 238 * word aligned: 12 clocks slower for non-FPBLK
243 239 * long aligned: 4 clocks >>faster<< for non-FPBLK
244 240 *
245 241 * The long aligned loop runs faster because it does no prefetching.
246 242 * That wins if the data is not in cache or there is too little
247 243 * data to gain much benefit from prefetching. But when there
248 244 * is more data and that data is not in cache, failing to prefetch
249 245 * can run much slower. In addition, there is a 2 Kbyte store queue
250 246 * which will cause the non-FPBLK inner loop to slow for larger copies.
251 247 * The exact tradeoff is strongly load and application dependent, with
252 248 * increasing risk of a customer visible performance regression if the
253 249 * non-FPBLK code is used for larger copies. Studies of synthetic in-cache
254 250 * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
255 251 * upper limit for the non-FPBLK code. To minimize performance regression
256 252 * risk while still gaining the primary benefits of the improvements to
257 253 * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
258 254 * hw_copy_limit_*. Later experimental studies using different values
259 255 * of hw_copy_limit_* can be used to make further adjustments if
260 256 * appropriate.
261 257 *
262 258 * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
263 259 * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
264 260 * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
265 261 * hw_copy_limit_8 = src and dst are longword aligned
266 262 *
267 263 * To say that src and dst are word aligned means that after
268 264 * some initial alignment activity of moving 0 to 3 bytes,
269 265 * both the src and dst will be on word boundaries so that
270 266 * word loads and stores may be used.
271 267 *
272 268 * Recommended initial values as of Mar 2004, based on testing
273 269 * on Cheetah+ (900MHz), Cheetah++ (1200MHz), and Jaguar (1050MHz):
274 270 * hw_copy_limit_1 = 256
275 271 * hw_copy_limit_2 = 512
276 272 * hw_copy_limit_4 = 1024
277 273 * hw_copy_limit_8 = 1024 (or 1536 on some systems)
278 274 *
279 275 *
280 276 * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
281 277 * disabled for that alignment choice.
282 278 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256),
283 279 * the value of VIS_COPY_THRESHOLD is used.
284 280 * It is not envisioned that hw_copy_limit_? will be changed in the field.
285 281 * It is provided to allow for disabling FPBLK copies and to allow
286 282 * easy testing of alternate values on future HW implementations
287 283 * that might have different cache sizes, clock rates or instruction
288 284 * timing rules.
289 285 *
290 286 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
291 287 * threshold to speed up all shorter copies (less than 256). That
292 288 * saves an alignment test, memory reference, and enabling test
293 289 * for all short copies, or an estimated 24 clocks.
294 290 *
295 291 * The order in which these limits are checked does matter since each
296 292 * non-predicted tst and branch costs around 10 clocks.
297 293 * If src and dst are randomly selected addresses,
298 294 * 4 of 8 will not be alignable.
299 295 * 2 of 8 will be half word alignable.
300 296 * 1 of 8 will be word alignable.
301 297 * 1 of 8 will be long word alignable.
302 298 * But, tests on running kernels show that src and dst passed to copy code
303 299 * are typically not on random alignments. Structure copies and
304 300 * copies of larger data sizes are often on long word boundaries.
305 301 * So we test the long word alignment case first, then
306 302 * the byte alignment, then halfword, then word alignment.
307 303 *
308 304 * Several times, tests for length are made to split the code
309 305 * into subcases. These tests often allow later tests to be
310 306 * avoided. For example, within the non-FPBLK copy, we first
311 307 * check for tiny copies of 3 bytes or less. That allows us
312 308 * to use a 4-way unrolled loop for the general byte copy case
313 309 * without a test on loop entry.
314 310 * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
315 311 * vs longer cases. For the really short case, we don't attempt to
316 312 * align src and dst. We try to minimize special case tests in
317 313 * the shortest loops as each test adds a significant percentage
318 314 * to the total time.
319 315 *
320 316 * For the medium sized cases, we allow ourselves to adjust the
321 317 * src and dst alignment and provide special cases for each of
322 318 * the four adjusted alignment cases. The CHKSIZE that was used
323 319 * to decide between short and medium size was chosen to be 39
324 320 * as that allows for the worst case of 7 bytes of alignment
325 321 * shift and 4 times 8 bytes for the first long word unrolling.
326 322 * That knowledge saves an initial test for length on entry into
327 323 * the medium cases. If the general loop unrolling factor were
328 324 * to be increased, this number would also need to be adjusted.
329 325 *
330 326 * For all cases in the non-FPBLK code where it is known that at
331 327 * least 4 chunks of data are available for movement, the
332 328 * loop is unrolled by four. This 4-way loop runs in 8 clocks
333 329 * or 2 clocks per data element. Due to limitations of the
334 330 * branch instruction on Cheetah, Jaguar, and Panther, the
335 331 * minimum time for a small, tight loop is 3 clocks. So
336 332 * the 4-way loop runs 50% faster than the fastest non-unrolled
337 333 * loop.
338 334 *
339 335 * Instruction alignment is forced by use of .align 16 directives
340 336 * and nops which are not executed in the code. This
341 337 * combination of operations shifts the alignment of following
342 338 * loops to ensure that loops are aligned so that their instructions
343 339 * fall within the minimum number of 4 instruction fetch groups.
344 340 * If instructions are inserted or removed between the .align
345 341 * instruction and the unrolled loops, then the alignment needs
346 342 * to be readjusted. Misaligned loops can add a clock per loop
347 343 * iteration to the loop timing.
348 344 *
349 345 * In a few cases, code is duplicated to avoid a branch. Since
350 346 * a non-predicted tst and branch takes 10 clocks, this savings
351 347 * is judged an appropriate time-space tradeoff.
352 348 *
353 349 * Within the FPBLK-code, the prefetch method in the inner
354 350 * loop needs to be explained as it is not standard. Two
355 351 * prefetches are issued for each cache line instead of one.
356 352 * The primary one is at the maximum reach of 8 cache lines.
357 353 * Most of the time, that maximum prefetch reach gives the
358 354 * cache line more time to reach the processor for systems with
359 355 * higher processor clocks. But, sometimes memory interference
360 356 * can cause that prefetch to be dropped. Putting a second
361 357 * prefetch at a reach of 5 cache lines catches the drops
362 358 * three iterations later and shows a measured improvement
363 359 * in performance over any similar loop with a single prefetch.
364 360 * The prefetches are placed in the loop so they overlap with
365 361 * non-memory instructions, so that there is no extra cost
366 362 * when the data is already in-cache.
367 363 *
368 364 */
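
As a concrete illustration of the 4-way unrolling discussed above, here is
a C equivalent of a byte-copy loop in the style of .bc_sm_movebytes below.
This is a sketch only; the function name is invented and the kernel code is
the assembly itself.

    #include <stddef.h>

    /* 4-way unrolled byte copy: 4 elements per loop iteration. */
    static void
    copy_bytes_unrolled(const unsigned char *src, unsigned char *dst, size_t n)
    {
            while (n >= 4) {                /* main unrolled loop */
                    dst[0] = src[0];
                    dst[1] = src[1];
                    dst[2] = src[2];
                    dst[3] = src[3];
                    src += 4; dst += 4; n -= 4;
            }
            while (n--)                     /* 0-3 trailing bytes */
                    *dst++ = *src++;
    }
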
369 365
370 366 /*
371 367 * Notes on preserving existing fp state and on membars.
372 368 *
373 369 * When a copyOP decides to use fp we may have to preserve existing
374 370 * floating point state. It is not the caller's state that we need to
375 371 * preserve - the rest of the kernel does not use fp and, anyway, fp
376 372 * registers are volatile across a call. Some examples:
377 373 *
378 374 * - userland has fp state and is interrupted (device interrupt
379 375 * or trap) and within the interrupt/trap handling we use
380 376 * bcopy()
381 377 * - another (higher level) interrupt or trap handler uses bcopy
382 378 * while a bcopy from an earlier interrupt is still active
383 379 * - an asynchronous error trap occurs while fp state exists (in
384 380 * userland or in kernel copy) and the tl0 component of the handling
385 381 * uses bcopy
386 382 * - a user process with fp state incurs a copy-on-write fault and
387 383 * hwblkpagecopy always uses fp
388 384 *
389 385 * We therefore need a per-call place in which to preserve fp state -
390 386 * using our stack is ideal (and since fp copy cannot be leaf optimized
391 387 * because of calls it makes, this is no hardship).
392 388 *
393 389 * The following membar BLD/BST discussion is Cheetah pipeline specific.
394 390 * In Cheetah BLD is blocking, #LoadLoad/#LoadStore/#StoreStore are
395 391 * nops (those semantics always apply) and #StoreLoad is implemented
396 392 * as a membar #Sync.
397 393 *
398 394 * It is possible that the owner of the fp state has a block load or
399 395 * block store still "in flight" at the time we come to preserve that
400 396 * state. Block loads are blocking in Cheetah pipelines so we do not
401 397 * need to sync with them. In preserving fp regs we will use block stores
402 398 * (which are not blocking in Cheetah pipelines) so we require a membar #Sync
403 399 * after storing state (so that our subsequent use of those registers
404 400 * does not modify them before the block stores complete); this membar
405 401 * also serves to sync with block stores the owner of the fp state has
406 402 * initiated.
407 403 *
408 404 * When we have finished fp copy (with its repeated block stores)
409 405 * we must membar #Sync so that our block stores may complete before
410 406 * we either restore the original fp state into the fp registers or
411 407 * return to a caller which may initiate other fp operations that could
412 408 * modify the fp regs we used before the block stores complete.
413 409 *
414 410 * Synchronous faults (eg, unresolvable DMMU miss) that occur while
415 411 * t_lofault is not NULL will not panic but will instead trampoline
416 412 * to the registered lofault handler. There is no need for any
417 413 * membars for these - eg, our store to t_lofault will always be visible to
418 414 * ourselves and it is our cpu which will take any trap.
419 415 *
420 416 * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
421 417 * while t_lofault is not NULL will also not panic. Since we're copying
422 418 * to or from userland the extent of the damage is known - the destination
423 419 * buffer is incomplete. So trap handlers will trampoline to the lofault
424 420 * handler in this case which should take some form of error action to
425 421 * avoid using the incomplete buffer. The trap handler also flags the
426 422 * fault so that later return-from-trap handling (for the trap that brought
427 423 * this thread into the kernel in the first place) can notify the process
428 424 * and reboot the system (or restart the service with Greenline/Contracts).
429 425 *
430 426 * Asynchronous faults (eg, uncorrectable ECC error from memory) can
431 427 * result in deferred error traps - the trap is taken sometime after
432 428 * the event and the trap PC may not be the PC of the faulting access.
433 429 * Delivery of such pending traps can be forced by a membar #Sync, acting
434 430 * as an "error barrier" in this role. To accurately apply the user/kernel
435 431 * separation described in the preceding paragraph we must force delivery
436 432 * of deferred traps affecting kernel state before we install a lofault
437 433 * handler (if we interpose a new lofault handler on an existing one there
438 434 * is no need to repeat this), and we must force delivery of deferred
439 435 * errors affecting the lofault-protected region before we clear t_lofault.
440 436 * Failure to do so results in lost kernel state being interpreted as
441 437 * affecting a copyin/copyout only, or in an error that really only
442 438 * affects copy data being interpreted as losing kernel state.
443 439 *
444 440 * Since the copy operations may preserve and later restore floating
445 441 * point state that does not belong to the caller (see examples above),
446 442 * we must be careful in how we do this in order to prevent corruption
447 443 * of another program.
448 444 *
449 445 * To make sure that floating point state is always saved and restored
450 446 * correctly, the following "big rules" must be followed when the floating
451 447 * point registers will be used:
452 448 *
453 449 * 1. %l6 always holds the caller's lofault handler. Also in this register,
454 450 * Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
455 451 * use. Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a
456 452 * lofault handler was set coming in.
457 453 *
458 454 * 2. The FPUSED flag indicates that all FP state has been successfully stored
459 455 * on the stack. It should not be set until this save has been completed.
460 456 *
461 457 * 3. The FPUSED flag should not be cleared on exit until all FP state has
462 458 * been restored from the stack. If an error occurs while restoring
463 459 * data from the stack, the error handler can check this flag to see if
464 460 * a restore is necessary.
465 461 *
466 462 * 4. Code run under the new lofault handler must be kept to a minimum. In
467 463 * particular, any calls to FP_ALLOWMIGRATE, which could result in a call
468 464 * to kpreempt(), should not be made until after the lofault handler has
469 465 * been restored.
470 466 */
471 467
472 468 /*
473 469 * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
474 470 * to "break even" using FP/VIS-accelerated memory operations.
475 471 * The FPBLK code assumes a minimum number of bytes are available
476 472 * to be moved on entry. Check that code carefully before
477 473 * reducing VIS_COPY_THRESHOLD below 256.
478 474 */
479 475 /*
480 476 * This shadows sys/machsystm.h which can't be included due to the lack of
481 477 * _ASM guards in include files it references. Change it here, change it there.
482 478 */
483 479 #define VIS_COPY_THRESHOLD 256
484 480
485 481 /*
486 482 * TEST for very short copies
487 483 * Be aware that the maximum unroll for the short unaligned case
488 484 * is SHORTCOPY+1
489 485 */
490 486 #define SHORTCOPY 3
491 487 #define CHKSIZE 39
492 488
493 489 /*
494 490 * Indicates that we're to trampoline to the error handler.
495 491 * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
496 492 * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
497 493 */
498 494 #define FPUSED_FLAG 1
499 495 #define TRAMP_FLAG 2
500 496 #define MASK_FLAGS 3
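
Since lofault handler addresses are instruction addresses (at least 4-byte
aligned on SPARC), the two low bits of the saved pointer are free to carry
these flags. A hedged C illustration; everything beyond the three defines
above is invented for the example.

    #include <stdint.h>

    typedef void (*lofault_t)(void);

    /* Pack the saved handler and flags into one register-sized value. */
    static uintptr_t
    save_lofault(lofault_t prev, int trampoline)
    {
            uintptr_t v = (uintptr_t)prev;

            if (trampoline)
                    v |= TRAMP_FLAG;        /* error should trampoline to prev */
            return (v);
    }

    /* Recover the original handler, as "andn %l6, MASK_FLAGS, %l6" does. */
    static lofault_t
    restore_lofault(uintptr_t v)
    {
            return ((lofault_t)(v & ~(uintptr_t)MASK_FLAGS));
    }
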
501 497
502 498 /*
503 499 * Number of outstanding prefetches.
504 500 * Testing with 1200 MHz Cheetah+ and Jaguar gives best results with
505 501 * two prefetches, one with a reach of 8*BLOCK_SIZE+8 and one with a
506 502 * reach of 5*BLOCK_SIZE. The double prefetch gives a typical improvement
507 503 * of 5% for large copies as compared to a single prefetch. The reason
508 504 * for the improvement is that with Cheetah and Jaguar, some prefetches
509 505 * are dropped due to the prefetch queue being full. The second prefetch
510 506 * reduces the number of cache lines that are dropped.
511 507 * Do not remove the double prefetch or change either CHEETAH_PREFETCH
512 508 * or CHEETAH_2ND_PREFETCH without extensive performance tests to prove
513 509 * there is no loss of performance.
514 510 */
515 511 #define CHEETAH_PREFETCH 8
516 512 #define CHEETAH_2ND_PREFETCH 5
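
A hedged C sketch of the double-prefetch pattern described above, using
GCC's __builtin_prefetch as a stand-in for the SPARC "prefetch ... #one_read"
instruction; block_copy_sketch is an invented name and the inner byte loop
merely stands in for the FP block move.

    #include <stddef.h>

    #define SKETCH_BLOCKSIZE        64

    static void
    block_copy_sketch(const char *src, char *dst, size_t nblocks)
    {
            for (size_t i = 0; i < nblocks; i++) {
                    /* Primary prefetch at the maximum reach of 8 lines. */
                    __builtin_prefetch(src + CHEETAH_PREFETCH *
                        SKETCH_BLOCKSIZE + 8, 0);
                    /* Backup prefetch catches lines dropped from the queue. */
                    __builtin_prefetch(src + CHEETAH_2ND_PREFETCH *
                        SKETCH_BLOCKSIZE, 0);
                    for (int j = 0; j < SKETCH_BLOCKSIZE; j++)
                            dst[j] = src[j];        /* one 64-byte block */
                    src += SKETCH_BLOCKSIZE;
                    dst += SKETCH_BLOCKSIZE;
            }
    }
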
517 513
518 514 #define VIS_BLOCKSIZE 64
519 515
520 516 /*
521 517 * Size of stack frame in order to accommodate a 64-byte aligned
522 518 * floating-point register save area and 2 64-bit temp locations.
523 519 * All copy functions use two quadrants of fp registers; to ensure a
524 520 * block-aligned two block buffer in which to save we must reserve
525 521 * three blocks on stack. Not all functions preserve %fprs on stack
526 522 * or need to preserve %gsr, but we use HWCOPYFRAMESIZE for all.
527 523 *
528 524 * _______________________________________ <-- %fp + STACK_BIAS
529 525 * | We may need to preserve 2 quadrants |
530 526 * | of fp regs, but since we do so with |
531 527 * | BST/BLD we need room in which to |
532 528 * | align to VIS_BLOCKSIZE bytes. So |
533 529 * | this area is 3 * VIS_BLOCKSIZE. | <-- - SAVED_FPREGS_OFFSET
534 530 * |-------------------------------------|
535 531 * | 8 bytes to save %fprs | <-- - SAVED_FPRS_OFFSET
536 532 * |-------------------------------------|
537 533 * | 8 bytes to save %gsr | <-- - SAVED_GSR_OFFSET
538 534 * ---------------------------------------
539 535 */
540 536 #define HWCOPYFRAMESIZE ((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
541 537 #define SAVED_FPREGS_OFFSET (VIS_BLOCKSIZE * 3)
542 538 #define SAVED_FPREGS_ADJUST ((VIS_BLOCKSIZE * 2) - 1)
543 539 #define SAVED_FPRS_OFFSET (SAVED_FPREGS_OFFSET + 8)
544 540 #define SAVED_GSR_OFFSET (SAVED_FPRS_OFFSET + 8)
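
A hedged C rendering of the adjust-then-mask arithmetic that the BST/BLD
macros below use to locate the block-aligned save area; fpregs_save_area is
an invented name, and the claim in the comment assumes the usual 16-byte
alignment of %fp + STACK_BIAS.

    #include <stdint.h>

    /* Mirrors "add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST; and -VIS_BLOCKSIZE". */
    static uintptr_t
    fpregs_save_area(uintptr_t fp_plus_bias)
    {
            uintptr_t p = (fp_plus_bias - SAVED_FPREGS_ADJUST) &
                ~(uintptr_t)(VIS_BLOCKSIZE - 1);

            /*
             * For 16-byte-aligned fp_plus_bias, the 128-byte buffer
             * [p, p + 2 * VIS_BLOCKSIZE) fits inside the 192-byte
             * (3 * VIS_BLOCKSIZE) reservation below fp_plus_bias.
             */
            return (p);
    }
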
545 541
546 542 /*
547 543 * Common macros used by the various versions of the block copy
548 544 * routines in this file.
549 545 */
550 546
551 547 /*
552 548 * In FP copies, if we do not have preserved data to restore over
553 549 * the fp regs we used, then we must zero those regs to avoid
554 550 * exposing portions of the data to later threads (data security).
555 551 *
556 552 * Copy functions use either quadrants 1 and 3 or 2 and 4.
557 553 *
558 554 * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
559 555 * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
560 556 *
561 557 * The instructions below are quicker than repeated fzero instructions
562 558 * since they can dispatch down two fp pipelines.
563 559 */
564 560 #define FZEROQ1Q3 \
565 561 fzero %f0 ;\
566 562 fzero %f2 ;\
567 563 faddd %f0, %f2, %f4 ;\
568 564 fmuld %f0, %f2, %f6 ;\
569 565 faddd %f0, %f2, %f8 ;\
570 566 fmuld %f0, %f2, %f10 ;\
571 567 faddd %f0, %f2, %f12 ;\
572 568 fmuld %f0, %f2, %f14 ;\
573 569 faddd %f0, %f2, %f32 ;\
574 570 fmuld %f0, %f2, %f34 ;\
575 571 faddd %f0, %f2, %f36 ;\
576 572 fmuld %f0, %f2, %f38 ;\
577 573 faddd %f0, %f2, %f40 ;\
578 574 fmuld %f0, %f2, %f42 ;\
579 575 faddd %f0, %f2, %f44 ;\
580 576 fmuld %f0, %f2, %f46
581 577
582 578 #define FZEROQ2Q4 \
583 579 fzero %f16 ;\
584 580 fzero %f18 ;\
585 581 faddd %f16, %f18, %f20 ;\
586 582 fmuld %f16, %f18, %f22 ;\
587 583 faddd %f16, %f18, %f24 ;\
588 584 fmuld %f16, %f18, %f26 ;\
589 585 faddd %f16, %f18, %f28 ;\
590 586 fmuld %f16, %f18, %f30 ;\
591 587 faddd %f16, %f18, %f48 ;\
592 588 fmuld %f16, %f18, %f50 ;\
593 589 faddd %f16, %f18, %f52 ;\
594 590 fmuld %f16, %f18, %f54 ;\
595 591 faddd %f16, %f18, %f56 ;\
596 592 fmuld %f16, %f18, %f58 ;\
597 593 faddd %f16, %f18, %f60 ;\
598 594 fmuld %f16, %f18, %f62
599 595
600 596 /*
601 597 * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
602 598 * Used to save and restore in-use fp registers when we want to use FP
603 599 * and find fp already in use and copy size still large enough to justify
604 600 * the additional overhead of this save and restore.
605 601 *
606 602 * A membar #Sync is needed before save to sync fp ops initiated before
607 603 * the call to the copy function (by whoever has fp in use); for example
608 604 * an earlier block load to the quadrant we are about to save may still be
609 605 * "in flight". A membar #Sync is required at the end of the save to
610 606 * sync our block store (the copy code is about to begin ldd's to the
611 607 * first quadrant). Note, however, that since Cheetah pipeline block load
612 608 * is blocking we can omit the initial membar before saving fp state (they're
613 609 * commented below in case of future porting to a chip that does not block
614 610 * on block load).
615 611 *
616 612 * Similarly: a membar #Sync before restore allows the block stores of
617 613 * the copy operation to complete before we fill the quadrants with their
618 614 * original data, and a membar #Sync after restore lets the block loads
619 615 * of the restore complete before we return to whoever has the fp regs
620 616 * in use. To avoid repeated membar #Sync we make it the responsibility
621 617 * of the copy code to membar #Sync immediately after copy is complete
622 618 * and before using the BLD_*_FROMSTACK macro.
623 619 */
624 -#if !defined(lint)
625 620 #define BST_FPQ1Q3_TOSTACK(tmp1) \
626 621 /* membar #Sync */ ;\
627 622 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
628 623 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
629 624 stda %f0, [tmp1]ASI_BLK_P ;\
630 625 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
631 626 stda %f32, [tmp1]ASI_BLK_P ;\
632 627 membar #Sync
633 628
634 629 #define BLD_FPQ1Q3_FROMSTACK(tmp1) \
635 630 /* membar #Sync - provided at copy completion */ ;\
636 631 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
637 632 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
638 633 ldda [tmp1]ASI_BLK_P, %f0 ;\
639 634 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
640 635 ldda [tmp1]ASI_BLK_P, %f32 ;\
641 636 membar #Sync
642 637
643 638 #define BST_FPQ2Q4_TOSTACK(tmp1) \
644 639 /* membar #Sync */ ;\
645 640 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
646 641 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
647 642 stda %f16, [tmp1]ASI_BLK_P ;\
648 643 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
649 644 stda %f48, [tmp1]ASI_BLK_P ;\
650 645 membar #Sync
651 646
652 647 #define BLD_FPQ2Q4_FROMSTACK(tmp1) \
653 648 /* membar #Sync - provided at copy completion */ ;\
654 649 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
655 650 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
656 651 ldda [tmp1]ASI_BLK_P, %f16 ;\
657 652 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
658 653 ldda [tmp1]ASI_BLK_P, %f48 ;\
659 654 membar #Sync
660 -#endif
661 655
662 656 /*
663 657 * FP_NOMIGRATE and FP_ALLOWMIGRATE. Prevent migration (or, stronger,
664 658 * prevent preemption if there is no t_lwp to save FP state to on context
665 659 * switch) before commencing a FP copy, and reallow it on completion or
666 660 * in error trampoline paths when we were using FP copy.
667 661 *
668 662 * Both macros may call other functions, so be aware that all outputs are
669 663 * forfeit after using these macros. For this reason we do not pass registers
670 664 * to use - we just use any outputs we want.
671 665 *
672 666 * For fpRAS we need to perform the fpRAS mechanism test on the same
673 667 * CPU as we use for the copy operation, both so that we validate the
674 668 * CPU we perform the copy on and so that we know which CPU failed
675 669 * if a failure is detected. Hence we need to be bound to "our" CPU.
676 670 * This could be achieved through disabling preemption (and we do it that
677 671 * way for threads with no t_lwp) but for larger copies this may hold
678 672 * higher priority threads off of cpu for too long (eg, realtime). So we
679 673 * make use of the lightweight t_nomigrate mechanism where we can (ie, when
680 674 * we have a t_lwp).
681 675 *
682 676 * Pseudo code:
683 677 *
684 678 * FP_NOMIGRATE:
685 679 *
686 680 * if (curthread->t_lwp) {
687 681 * thread_nomigrate();
688 682 * } else {
689 683 * kpreempt_disable();
690 684 * }
691 685 *
692 686 * FP_ALLOWMIGRATE:
693 687 *
694 688 * if (curthread->t_lwp) {
695 689 * thread_allowmigrate();
696 690 * } else {
697 691 * kpreempt_enable();
698 692 * }
699 693 */
700 694
701 695 #define FP_NOMIGRATE(label1, label2) \
702 696 ldn [THREAD_REG + T_LWP], %o0 ;\
703 697 brz,a,pn %o0, label1/**/f ;\
704 698 ldsb [THREAD_REG + T_PREEMPT], %o1 ;\
705 699 call thread_nomigrate ;\
706 700 nop ;\
707 701 ba label2/**/f ;\
708 702 nop ;\
709 703 label1: ;\
710 704 inc %o1 ;\
711 705 stb %o1, [THREAD_REG + T_PREEMPT] ;\
712 706 label2:
713 707
714 708 #define FP_ALLOWMIGRATE(label1, label2) \
715 709 ldn [THREAD_REG + T_LWP], %o0 ;\
716 710 brz,a,pn %o0, label1/**/f ;\
717 711 ldsb [THREAD_REG + T_PREEMPT], %o1 ;\
718 712 call thread_allowmigrate ;\
719 713 nop ;\
720 714 ba label2/**/f ;\
721 715 nop ;\
722 716 label1: ;\
723 717 dec %o1 ;\
724 718 brnz,pn %o1, label2/**/f ;\
725 719 stb %o1, [THREAD_REG + T_PREEMPT] ;\
726 720 ldn [THREAD_REG + T_CPU], %o0 ;\
727 721 ldub [%o0 + CPU_KPRUNRUN], %o0 ;\
728 722 brz,pt %o0, label2/**/f ;\
729 723 nop ;\
730 724 call kpreempt ;\
731 725 rdpr %pil, %o0 ;\
732 726 label2:
733 727
734 728 /*
735 729 * Copy a block of storage, returning an error code if `from' or
736 730 * `to' takes a kernel pagefault which cannot be resolved.
737 731 * Returns errno value on pagefault error, 0 if all ok
738 732 */
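
A hedged caller-side sketch of the interface this comment describes. The
kcopy prototype matches the lint stub removed below; copy_record is an
invented example.

    #include <sys/types.h>
    #include <sys/errno.h>

    extern int kcopy(const void *from, void *to, size_t count);

    static int
    copy_record(const void *src, void *dst, size_t len)
    {
            int err;

            if ((err = kcopy(src, dst, len)) != 0)
                    return (err);   /* errno value, e.g. EFAULT on pagefault */
            return (0);
    }
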
739 733
740 -#if defined(lint)
741 -
742 -/* ARGSUSED */
743 -int
744 -kcopy(const void *from, void *to, size_t count)
745 -{ return(0); }
746 -
747 -#else /* lint */
748 -
749 734 .seg ".text"
750 735 .align 4
751 736
752 737 ENTRY(kcopy)
753 738
754 739 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
755 740 bleu,pt %ncc, .kcopy_small ! go to small copy cases
756 741 xor %o0, %o1, %o3 ! are src, dst alignable?
757 742 btst 7, %o3 !
758 743 bz,pt %ncc, .kcopy_8 ! check for longword alignment
759 744 nop
760 745 btst 1, %o3 !
761 746 bz,pt %ncc, .kcopy_2 ! check for half-word
762 747 nop
763 748 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
764 749 ld [%o3 + %lo(hw_copy_limit_1)], %o3
765 750 tst %o3
766 751 bz,pn %icc, .kcopy_small ! if zero, disable HW copy
767 752 cmp %o2, %o3 ! if length <= limit
768 753 bleu,pt %ncc, .kcopy_small ! go to small copy
769 754 nop
770 755 ba,pt %ncc, .kcopy_more ! otherwise go to large copy
771 756 nop
772 757 .kcopy_2:
773 758 btst 3, %o3 !
774 759 bz,pt %ncc, .kcopy_4 ! check for word alignment
775 760 nop
776 761 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
777 762 ld [%o3 + %lo(hw_copy_limit_2)], %o3
778 763 tst %o3
779 764 bz,pn %icc, .kcopy_small ! if zero, disable HW copy
780 765 cmp %o2, %o3 ! if length <= limit
781 766 bleu,pt %ncc, .kcopy_small ! go to small copy
782 767 nop
783 768 ba,pt %ncc, .kcopy_more ! otherwise go to large copy
784 769 nop
785 770 .kcopy_4:
786 771 ! already checked longword, must be word aligned
787 772 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
788 773 ld [%o3 + %lo(hw_copy_limit_4)], %o3
789 774 tst %o3
790 775 bz,pn %icc, .kcopy_small ! if zero, disable HW copy
791 776 cmp %o2, %o3 ! if length <= limit
792 777 bleu,pt %ncc, .kcopy_small ! go to small copy
793 778 nop
794 779 ba,pt %ncc, .kcopy_more ! otherwise go to large copy
795 780 nop
796 781 .kcopy_8:
797 782 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
798 783 ld [%o3 + %lo(hw_copy_limit_8)], %o3
799 784 tst %o3
800 785 bz,pn %icc, .kcopy_small ! if zero, disable HW copy
801 786 cmp %o2, %o3 ! if length <= limit
802 787 bleu,pt %ncc, .kcopy_small ! go to small copy
803 788 nop
804 789 ba,pt %ncc, .kcopy_more ! otherwise go to large copy
805 790 nop
806 791
807 792 .kcopy_small:
808 793 sethi %hi(.sm_copyerr), %o5 ! sm_copyerr is lofault value
809 794 or %o5, %lo(.sm_copyerr), %o5
810 795 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler
811 796 membar #Sync ! sync error barrier
812 797 ba,pt %ncc, .sm_do_copy ! common code
813 798 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault
814 799
815 800 .kcopy_more:
816 801 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
817 802 sethi %hi(.copyerr), %l7 ! copyerr is lofault value
818 803 or %l7, %lo(.copyerr), %l7
819 804 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler
820 805 membar #Sync ! sync error barrier
821 806 ba,pt %ncc, .do_copy ! common code
822 807 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
823 808
824 809
825 810 /*
826 811 * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
827 812 * Errno value is in %g1. bcopy_more uses fp quadrants 1 and 3.
828 813 */
829 814 .copyerr:
830 815 set .copyerr2, %l0
831 816 membar #Sync ! sync error barrier
832 817 stn %l0, [THREAD_REG + T_LOFAULT] ! set t_lofault
833 818 btst FPUSED_FLAG, %l6
834 819 bz %ncc, 1f
835 820 and %l6, TRAMP_FLAG, %l0 ! copy trampoline flag to %l0
836 821
837 822 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr
838 823 wr %o2, 0, %gsr
839 824
840 825 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
841 826 btst FPRS_FEF, %o3
842 827 bz,pt %icc, 4f
843 828 nop
844 829
845 830 BLD_FPQ1Q3_FROMSTACK(%o2)
846 831
847 832 ba,pt %ncc, 1f
848 833 wr %o3, 0, %fprs ! restore fprs
849 834
850 835 4:
851 836 FZEROQ1Q3
852 837 wr %o3, 0, %fprs ! restore fprs
853 838
854 839 !
855 840 ! Need to cater for the different expectations of kcopy
856 841 ! and bcopy. kcopy will *always* set a t_lofault handler
857 842 ! If it fires, we're expected to just return the error code
858 843 ! and *not* to invoke any existing error handler. As far as
859 844 ! bcopy is concerned, we only set t_lofault if there was an
860 845 ! existing lofault handler. In that case we're expected to
861 846 ! invoke the previously existing handler after resetting the
862 847 ! t_lofault value.
863 848 !
864 849 1:
865 850 andn %l6, MASK_FLAGS, %l6 ! turn trampoline flag off
866 851 membar #Sync ! sync error barrier
867 852 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
868 853 FP_ALLOWMIGRATE(5, 6)
869 854
870 855 btst TRAMP_FLAG, %l0
871 856 bnz,pn %ncc, 3f
872 857 nop
873 858 ret
874 859 restore %g1, 0, %o0
875 860
876 861 3:
877 862 !
878 863 ! We're here via bcopy. There *must* have been an error handler
879 864 ! in place otherwise we would have died a nasty death already.
880 865 !
881 866 jmp %l6 ! goto real handler
882 867 restore %g0, 0, %o0 ! dispose of copy window
883 868
884 869 /*
885 870 * We got here because of a fault in .copyerr. We can't safely restore fp
886 871 * state, so we panic.
887 872 */
888 873 fp_panic_msg:
889 874 .asciz "Unable to restore fp state after copy operation"
890 875
891 876 .align 4
892 877 .copyerr2:
893 878 set fp_panic_msg, %o0
894 879 call panic
895 880 nop
896 881
897 882 /*
898 883 * We got here because of a fault during a small kcopy or bcopy.
899 884 * No floating point registers are used by the small copies.
900 885 * Errno value is in %g1.
901 886 */
902 887 .sm_copyerr:
903 888 1:
904 889 btst TRAMP_FLAG, %o4
905 890 membar #Sync
906 891 andn %o4, TRAMP_FLAG, %o4
907 892 bnz,pn %ncc, 3f
908 893 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
909 894 retl
910 895 mov %g1, %o0
911 896 3:
912 897 jmp %o4 ! goto real handler
913 898 mov %g0, %o0 !
914 899
915 900 SET_SIZE(kcopy)
916 -#endif /* lint */
917 901
918 902
919 903 /*
920 904 * Copy a block of storage - must not overlap (from + len <= to).
921 905 * Registers: l6 - saved t_lofault
922 906 * (for short copies, o4 - saved t_lofault)
923 907 *
924 908 * Copy a page of memory.
925 909 * Assumes double word alignment and a count >= 256.
926 910 */
927 -#if defined(lint)
928 911
929 -/* ARGSUSED */
930 -void
931 -bcopy(const void *from, void *to, size_t count)
932 -{}
933 -
934 -#else /* lint */
935 -
936 912 ENTRY(bcopy)
937 913
938 914 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
939 915 bleu,pt %ncc, .bcopy_small ! go to small copy cases
940 916 xor %o0, %o1, %o3 ! are src, dst alignable?
941 917 btst 7, %o3 !
942 918 bz,pt %ncc, .bcopy_8 ! check for longword alignment
943 919 nop
944 920 btst 1, %o3 !
945 921 bz,pt %ncc, .bcopy_2 ! check for half-word
946 922 nop
947 923 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
948 924 ld [%o3 + %lo(hw_copy_limit_1)], %o3
949 925 tst %o3
950 926 bz,pn %icc, .bcopy_small ! if zero, disable HW copy
951 927 cmp %o2, %o3 ! if length <= limit
952 928 bleu,pt %ncc, .bcopy_small ! go to small copy
953 929 nop
954 930 ba,pt %ncc, .bcopy_more ! otherwise go to large copy
955 931 nop
956 932 .bcopy_2:
957 933 btst 3, %o3 !
958 934 bz,pt %ncc, .bcopy_4 ! check for word alignment
959 935 nop
960 936 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
961 937 ld [%o3 + %lo(hw_copy_limit_2)], %o3
962 938 tst %o3
963 939 bz,pn %icc, .bcopy_small ! if zero, disable HW copy
964 940 cmp %o2, %o3 ! if length <= limit
965 941 bleu,pt %ncc, .bcopy_small ! go to small copy
966 942 nop
967 943 ba,pt %ncc, .bcopy_more ! otherwise go to large copy
968 944 nop
969 945 .bcopy_4:
970 946 ! already checked longword, must be word aligned
971 947 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
972 948 ld [%o3 + %lo(hw_copy_limit_4)], %o3
973 949 tst %o3
974 950 bz,pn %icc, .bcopy_small ! if zero, disable HW copy
975 951 cmp %o2, %o3 ! if length <= limit
976 952 bleu,pt %ncc, .bcopy_small ! go to small copy
977 953 nop
978 954 ba,pt %ncc, .bcopy_more ! otherwise go to large copy
979 955 nop
980 956 .bcopy_8:
981 957 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
982 958 ld [%o3 + %lo(hw_copy_limit_8)], %o3
983 959 tst %o3
984 960 bz,pn %icc, .bcopy_small ! if zero, disable HW copy
985 961 cmp %o2, %o3 ! if length <= limit
986 962 bleu,pt %ncc, .bcopy_small ! go to small copy
987 963 nop
988 964 ba,pt %ncc, .bcopy_more ! otherwise go to large copy
989 965 nop
990 966
991 967 .align 16
992 968 .bcopy_small:
993 969 ldn [THREAD_REG + T_LOFAULT], %o4 ! save t_lofault
994 970 tst %o4
995 971 bz,pt %icc, .sm_do_copy
996 972 nop
997 973 sethi %hi(.sm_copyerr), %o5
998 974 or %o5, %lo(.sm_copyerr), %o5
999 975 membar #Sync ! sync error barrier
1000 976 stn %o5, [THREAD_REG + T_LOFAULT] ! install new vector
1001 977 or %o4, TRAMP_FLAG, %o4 ! error should trampoline
1002 978 .sm_do_copy:
1003 979 cmp %o2, SHORTCOPY ! check for really short case
1004 980 bleu,pt %ncc, .bc_sm_left !
1005 981 cmp %o2, CHKSIZE ! check for medium length cases
1006 982 bgu,pn %ncc, .bc_med !
1007 983 or %o0, %o1, %o3 ! prepare alignment check
1008 984 andcc %o3, 0x3, %g0 ! test for alignment
1009 985 bz,pt %ncc, .bc_sm_word ! branch to word aligned case
1010 986 .bc_sm_movebytes:
1011 987 sub %o2, 3, %o2 ! adjust count to allow cc zero test
1012 988 .bc_sm_notalign4:
1013 989 ldub [%o0], %o3 ! read byte
1014 990 stb %o3, [%o1] ! write byte
1015 991 subcc %o2, 4, %o2 ! reduce count by 4
1016 992 ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes
1017 993 add %o0, 4, %o0 ! advance SRC by 4
1018 994 stb %o3, [%o1 + 1]
1019 995 ldub [%o0 - 2], %o3
1020 996 add %o1, 4, %o1 ! advance DST by 4
1021 997 stb %o3, [%o1 - 2]
1022 998 ldub [%o0 - 1], %o3
1023 999 bgt,pt %ncc, .bc_sm_notalign4 ! loop til 3 or fewer bytes remain
1024 1000 stb %o3, [%o1 - 1]
1025 1001 add %o2, 3, %o2 ! restore count
1026 1002 .bc_sm_left:
1027 1003 tst %o2
1028 1004 bz,pt %ncc, .bc_sm_exit ! check for zero length
1029 1005 deccc %o2 ! reduce count for cc test
1030 1006 ldub [%o0], %o3 ! move one byte
1031 1007 bz,pt %ncc, .bc_sm_exit
1032 1008 stb %o3, [%o1]
1033 1009 ldub [%o0 + 1], %o3 ! move another byte
1034 1010 deccc %o2 ! check for more
1035 1011 bz,pt %ncc, .bc_sm_exit
1036 1012 stb %o3, [%o1 + 1]
1037 1013 ldub [%o0 + 2], %o3 ! move final byte
1038 1014 stb %o3, [%o1 + 2]
1039 1015 membar #Sync ! sync error barrier
1040 1016 andn %o4, TRAMP_FLAG, %o4
1041 1017 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1042 1018 retl
1043 1019 mov %g0, %o0 ! return 0
1044 1020 .align 16
1045 1021 nop ! instruction alignment
1046 1022 ! see discussion at start of file
1047 1023 .bc_sm_words:
1048 1024 lduw [%o0], %o3 ! read word
1049 1025 .bc_sm_wordx:
1050 1026 subcc %o2, 8, %o2 ! update count
1051 1027 stw %o3, [%o1] ! write word
1052 1028 add %o0, 8, %o0 ! update SRC
1053 1029 lduw [%o0 - 4], %o3 ! read word
1054 1030 add %o1, 8, %o1 ! update DST
1055 1031 bgt,pt %ncc, .bc_sm_words ! loop til done
1056 1032 stw %o3, [%o1 - 4] ! write word
1057 1033 addcc %o2, 7, %o2 ! restore count
1058 1034 bz,pt %ncc, .bc_sm_exit
1059 1035 deccc %o2
1060 1036 bz,pt %ncc, .bc_sm_byte
1061 1037 .bc_sm_half:
1062 1038 subcc %o2, 2, %o2 ! reduce count by 2
1063 1039 add %o0, 2, %o0 ! advance SRC by 2
1064 1040 lduh [%o0 - 2], %o3 ! read half word
1065 1041 add %o1, 2, %o1 ! advance DST by 2
1066 1042 bgt,pt %ncc, .bc_sm_half ! loop til done
1067 1043 sth %o3, [%o1 - 2] ! write half word
1068 1044 addcc %o2, 1, %o2 ! restore count
1069 1045 bz,pt %ncc, .bc_sm_exit
1070 1046 nop
1071 1047 .bc_sm_byte:
1072 1048 ldub [%o0], %o3
1073 1049 stb %o3, [%o1]
1074 1050 membar #Sync ! sync error barrier
1075 1051 andn %o4, TRAMP_FLAG, %o4
1076 1052 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1077 1053 retl
1078 1054 mov %g0, %o0 ! return 0
1079 1055
1080 1056 .bc_sm_word:
1081 1057 subcc %o2, 4, %o2 ! update count
1082 1058 bgt,pt %ncc, .bc_sm_wordx
1083 1059 lduw [%o0], %o3 ! read word
1084 1060 addcc %o2, 3, %o2 ! restore count
1085 1061 bz,pt %ncc, .bc_sm_exit
1086 1062 stw %o3, [%o1] ! write word
1087 1063 deccc %o2 ! reduce count for cc test
1088 1064 ldub [%o0 + 4], %o3 ! load one byte
1089 1065 bz,pt %ncc, .bc_sm_exit
1090 1066 stb %o3, [%o1 + 4] ! store one byte
1091 1067 ldub [%o0 + 5], %o3 ! load second byte
1092 1068 deccc %o2
1093 1069 bz,pt %ncc, .bc_sm_exit
1094 1070 stb %o3, [%o1 + 5] ! store second byte
1095 1071 ldub [%o0 + 6], %o3 ! load third byte
1096 1072 stb %o3, [%o1 + 6] ! store third byte
1097 1073 .bc_sm_exit:
1098 1074 membar #Sync ! sync error barrier
1099 1075 andn %o4, TRAMP_FLAG, %o4
1100 1076 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1101 1077 retl
1102 1078 mov %g0, %o0 ! return 0
1103 1079
1104 1080 .align 16
1105 1081 .bc_med:
1106 1082 xor %o0, %o1, %o3 ! setup alignment check
1107 1083 btst 1, %o3
1108 1084 bnz,pt %ncc, .bc_sm_movebytes ! unaligned
1109 1085 nop
1110 1086 btst 3, %o3
1111 1087 bnz,pt %ncc, .bc_med_half ! halfword aligned
1112 1088 nop
1113 1089 btst 7, %o3
1114 1090 bnz,pt %ncc, .bc_med_word ! word aligned
1115 1091 nop
1116 1092 .bc_med_long:
1117 1093 btst 3, %o0 ! check for
1118 1094 bz,pt %ncc, .bc_med_long1 ! word alignment
1119 1095 nop
1120 1096 .bc_med_long0:
1121 1097 ldub [%o0], %o3 ! load one byte
1122 1098 inc %o0
1123 1099 stb %o3,[%o1] ! store byte
1124 1100 inc %o1
1125 1101 btst 3, %o0
1126 1102 bnz,pt %ncc, .bc_med_long0
1127 1103 dec %o2
1128 1104 .bc_med_long1: ! word aligned
1129 1105 btst 7, %o0 ! check for long word
1130 1106 bz,pt %ncc, .bc_med_long2
1131 1107 nop
1132 1108 lduw [%o0], %o3 ! load word
1133 1109 add %o0, 4, %o0 ! advance SRC by 4
1134 1110 stw %o3, [%o1] ! store word
1135 1111 add %o1, 4, %o1 ! advance DST by 4
1136 1112 sub %o2, 4, %o2 ! reduce count by 4
1137 1113 !
1138 1114 ! Now long word aligned and have at least 32 bytes to move
1139 1115 !
1140 1116 .bc_med_long2:
1141 1117 sub %o2, 31, %o2 ! adjust count to allow cc zero test
1142 1118 .bc_med_lmove:
1143 1119 ldx [%o0], %o3 ! read long word
1144 1120 stx %o3, [%o1] ! write long word
1145 1121 subcc %o2, 32, %o2 ! reduce count by 32
1146 1122 ldx [%o0 + 8], %o3 ! repeat for a total of 4 long words
1147 1123 add %o0, 32, %o0 ! advance SRC by 32
1148 1124 stx %o3, [%o1 + 8]
1149 1125 ldx [%o0 - 16], %o3
1150 1126 add %o1, 32, %o1 ! advance DST by 32
1151 1127 stx %o3, [%o1 - 16]
1152 1128 ldx [%o0 - 8], %o3
1153 1129 bgt,pt %ncc, .bc_med_lmove ! loop til 31 or fewer bytes left
1154 1130 stx %o3, [%o1 - 8]
1155 1131 addcc %o2, 24, %o2 ! restore count to long word offset
1156 1132 ble,pt %ncc, .bc_med_lextra ! check for more long words to move
1157 1133 nop
1158 1134 .bc_med_lword:
1159 1135 ldx [%o0], %o3 ! read long word
1160 1136 subcc %o2, 8, %o2 ! reduce count by 8
1161 1137 stx %o3, [%o1] ! write long word
1162 1138 add %o0, 8, %o0 ! advance SRC by 8
1163 1139 bgt,pt %ncc, .bc_med_lword ! loop til 7 or fewer bytes left
1164 1140 add %o1, 8, %o1 ! advance DST by 8
1165 1141 .bc_med_lextra:
1166 1142 addcc %o2, 7, %o2 ! restore rest of count
1167 1143 bz,pt %ncc, .bc_sm_exit ! if zero, then done
1168 1144 deccc %o2
1169 1145 bz,pt %ncc, .bc_sm_byte
1170 1146 nop
1171 1147 ba,pt %ncc, .bc_sm_half
1172 1148 nop
1173 1149
1174 1150 .align 16
1175 1151 .bc_med_word:
1176 1152 btst 3, %o0 ! check for
1177 1153 bz,pt %ncc, .bc_med_word1 ! word alignment
1178 1154 nop
1179 1155 .bc_med_word0:
1180 1156 ldub [%o0], %o3 ! load one byte
1181 1157 inc %o0
1182 1158 stb %o3,[%o1] ! store byte
1183 1159 inc %o1
1184 1160 btst 3, %o0
1185 1161 bnz,pt %ncc, .bc_med_word0
1186 1162 dec %o2
1187 1163 !
1188 1164 ! Now word aligned and have at least 36 bytes to move
1189 1165 !
1190 1166 .bc_med_word1:
1191 1167 sub %o2, 15, %o2 ! adjust count to allow cc zero test
1192 1168 .bc_med_wmove:
1193 1169 lduw [%o0], %o3 ! read word
1194 1170 stw %o3, [%o1] ! write word
1195 1171 subcc %o2, 16, %o2 ! reduce count by 16
1196 1172 lduw [%o0 + 4], %o3 ! repeat for a total of 4 words
1197 1173 add %o0, 16, %o0 ! advance SRC by 16
1198 1174 stw %o3, [%o1 + 4]
1199 1175 lduw [%o0 - 8], %o3
1200 1176 add %o1, 16, %o1 ! advance DST by 16
1201 1177 stw %o3, [%o1 - 8]
1202 1178 lduw [%o0 - 4], %o3
1203 1179 bgt,pt %ncc, .bc_med_wmove ! loop til 15 or fewer bytes left
1204 1180 stw %o3, [%o1 - 4]
1205 1181 addcc %o2, 12, %o2 ! restore count to word offset
1206 1182 ble,pt %ncc, .bc_med_wextra ! check for more words to move
1207 1183 nop
1208 1184 .bc_med_word2:
1209 1185 lduw [%o0], %o3 ! read word
1210 1186 subcc %o2, 4, %o2 ! reduce count by 4
1211 1187 stw %o3, [%o1] ! write word
1212 1188 add %o0, 4, %o0 ! advance SRC by 4
1213 1189 bgt,pt %ncc, .bc_med_word2 ! loop til 3 or fewer bytes left
1214 1190 add %o1, 4, %o1 ! advance DST by 4
1215 1191 .bc_med_wextra:
1216 1192 addcc %o2, 3, %o2 ! restore rest of count
1217 1193 bz,pt %ncc, .bc_sm_exit ! if zero, then done
1218 1194 deccc %o2
1219 1195 bz,pt %ncc, .bc_sm_byte
1220 1196 nop
1221 1197 ba,pt %ncc, .bc_sm_half
1222 1198 nop
1223 1199
1224 1200 .align 16
1225 1201 .bc_med_half:
1226 1202 btst 1, %o0 ! check for
1227 1203 bz,pt %ncc, .bc_med_half1 ! half word alignment
1228 1204 nop
1229 1205 ldub [%o0], %o3 ! load one byte
1230 1206 inc %o0
1231 1207 stb %o3,[%o1] ! store byte
1232 1208 inc %o1
1233 1209 dec %o2
1234 1210 !
1235 1211 ! Now half word aligned and have at least 38 bytes to move
1236 1212 !
1237 1213 .bc_med_half1:
1238 1214 sub %o2, 7, %o2 ! adjust count to allow cc zero test
1239 1215 .bc_med_hmove:
1240 1216 lduh [%o0], %o3 ! read half word
1241 1217 sth %o3, [%o1] ! write half word
1242 1218 subcc %o2, 8, %o2 ! reduce count by 8
1243 1219 lduh [%o0 + 2], %o3 ! repeat for a total of 4 halfwords
1244 1220 add %o0, 8, %o0 ! advance SRC by 8
1245 1221 sth %o3, [%o1 + 2]
1246 1222 lduh [%o0 - 4], %o3
1247 1223 add %o1, 8, %o1 ! advance DST by 8
1248 1224 sth %o3, [%o1 - 4]
1249 1225 lduh [%o0 - 2], %o3
1250 1226 bgt,pt %ncc, .bc_med_hmove ! loop til 7 or fewer bytes left
1251 1227 sth %o3, [%o1 - 2]
1252 1228 addcc %o2, 7, %o2 ! restore count
1253 1229 bz,pt %ncc, .bc_sm_exit
1254 1230 deccc %o2
1255 1231 bz,pt %ncc, .bc_sm_byte
1256 1232 nop
1257 1233 ba,pt %ncc, .bc_sm_half
1258 1234 nop
1259 1235
1260 1236 SET_SIZE(bcopy)
1261 1237
1262 1238 /*
1263 1239 * The _more entry points are not intended to be used directly by
1264 1240 * any caller from outside this file. They are provided to allow
1265 1241 * profiling and dtrace of the portions of the copy code that use
1266 1242 * the floating point registers.
1267 1243 * This entry is particularly important as DTRACE (at least as of
1268 1244 * 4/2004) does not support leaf functions.
1269 1245 */
1270 1246
1271 1247 ENTRY(bcopy_more)
1272 1248 .bcopy_more:
1273 1249 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1274 1250 ldn [THREAD_REG + T_LOFAULT], %l6 ! save t_lofault
1275 1251 tst %l6
1276 1252 bz,pt %ncc, .do_copy
1277 1253 nop
1278 1254 sethi %hi(.copyerr), %o2
1279 1255 or %o2, %lo(.copyerr), %o2
1280 1256 membar #Sync ! sync error barrier
1281 1257 stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector
1282 1258 !
1283 1259 ! We've already captured whether t_lofault was zero on entry.
1284 1260 ! We need to mark ourselves as being from bcopy since both
1285 1261 ! kcopy and bcopy use the same code path. If TRAMP_FLAG is set
1286 1262 ! and the saved lofault was zero, we won't reset lofault on
1287 1263 ! returning.
1288 1264 !
1289 1265 or %l6, TRAMP_FLAG, %l6
1290 1266
1291 1267 /*
1292 1268 * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes
1293 1269 * Also, use of FP registers has been tested to be enabled
1294 1270 */
1295 1271 .do_copy:
1296 1272 FP_NOMIGRATE(6, 7)
1297 1273
1298 1274 rd %fprs, %o2 ! check for unused fp
1299 1275 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
1300 1276 btst FPRS_FEF, %o2
1301 1277 bz,a,pt %icc, .do_blockcopy
1302 1278 wr %g0, FPRS_FEF, %fprs
1303 1279
1304 1280 BST_FPQ1Q3_TOSTACK(%o2)
1305 1281
1306 1282 .do_blockcopy:
1307 1283 rd %gsr, %o2
1308 1284 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr
1309 1285 or %l6, FPUSED_FLAG, %l6
1310 1286
1311 1287 #define REALSRC %i0
1312 1288 #define DST %i1
1313 1289 #define CNT %i2
1314 1290 #define SRC %i3
1315 1291 #define TMP %i5
1316 1292
1317 1293 andcc DST, VIS_BLOCKSIZE - 1, TMP
1318 1294 bz,pt %ncc, 2f
1319 1295 neg TMP
1320 1296 add TMP, VIS_BLOCKSIZE, TMP
1321 1297
1322 1298 ! TMP = bytes required to align DST on FP_BLOCK boundary
1323 1299 ! Using SRC as a tmp here
1324 1300 cmp TMP, 3
1325 1301 bleu,pt %ncc, 1f
1326 1302 sub CNT,TMP,CNT ! adjust main count
1327 1303 sub TMP, 3, TMP ! adjust for end of loop test
1328 1304 .bc_blkalign:
1329 1305 ldub [REALSRC], SRC ! move 4 bytes per loop iteration
1330 1306 stb SRC, [DST]
1331 1307 subcc TMP, 4, TMP
1332 1308 ldub [REALSRC + 1], SRC
1333 1309 add REALSRC, 4, REALSRC
1334 1310 stb SRC, [DST + 1]
1335 1311 ldub [REALSRC - 2], SRC
1336 1312 add DST, 4, DST
1337 1313 stb SRC, [DST - 2]
1338 1314 ldub [REALSRC - 1], SRC
1339 1315 bgu,pt %ncc, .bc_blkalign
1340 1316 stb SRC, [DST - 1]
1341 1317
1342 1318 addcc TMP, 3, TMP ! restore count adjustment
1343 1319 bz,pt %ncc, 2f ! no bytes left?
1344 1320 nop
1345 1321 1: ldub [REALSRC], SRC
1346 1322 inc REALSRC
1347 1323 inc DST
1348 1324 deccc TMP
1349 1325 bgu %ncc, 1b
1350 1326 stb SRC, [DST - 1]
1351 1327
1352 1328 2:
1353 1329 andn REALSRC, 0x7, SRC
1354 1330 alignaddr REALSRC, %g0, %g0
1355 1331
1356 1332 ! SRC - 8-byte aligned
1357 1333 ! DST - 64-byte aligned
1358 1334 prefetch [SRC], #one_read
1359 1335 prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
1360 1336 prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
1361 1337 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
1362 1338 ldd [SRC], %f0
1363 1339 #if CHEETAH_PREFETCH > 4
1364 1340 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1365 1341 #endif
1366 1342 ldd [SRC + 0x08], %f2
1367 1343 #if CHEETAH_PREFETCH > 5
1368 1344 prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
1369 1345 #endif
1370 1346 ldd [SRC + 0x10], %f4
1371 1347 #if CHEETAH_PREFETCH > 6
1372 1348 prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
1373 1349 #endif
1374 1350 faligndata %f0, %f2, %f32
1375 1351 ldd [SRC + 0x18], %f6
1376 1352 #if CHEETAH_PREFETCH > 7
1377 1353 prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
1378 1354 #endif
1379 1355 faligndata %f2, %f4, %f34
1380 1356 ldd [SRC + 0x20], %f8
1381 1357 faligndata %f4, %f6, %f36
1382 1358 ldd [SRC + 0x28], %f10
1383 1359 faligndata %f6, %f8, %f38
1384 1360 ldd [SRC + 0x30], %f12
1385 1361 faligndata %f8, %f10, %f40
1386 1362 ldd [SRC + 0x38], %f14
1387 1363 faligndata %f10, %f12, %f42
1388 1364 ldd [SRC + VIS_BLOCKSIZE], %f0
1389 1365 sub CNT, VIS_BLOCKSIZE, CNT
1390 1366 add SRC, VIS_BLOCKSIZE, SRC
1391 1367 add REALSRC, VIS_BLOCKSIZE, REALSRC
1392 1368 ba,a,pt %ncc, 1f
1393 1369 nop
1394 1370 .align 16
1395 1371 1:
1396 1372 ldd [SRC + 0x08], %f2
1397 1373 faligndata %f12, %f14, %f44
1398 1374 ldd [SRC + 0x10], %f4
1399 1375 faligndata %f14, %f0, %f46
1400 1376 stda %f32, [DST]ASI_BLK_P
1401 1377 ldd [SRC + 0x18], %f6
1402 1378 faligndata %f0, %f2, %f32
1403 1379 ldd [SRC + 0x20], %f8
1404 1380 faligndata %f2, %f4, %f34
1405 1381 ldd [SRC + 0x28], %f10
1406 1382 faligndata %f4, %f6, %f36
1407 1383 ldd [SRC + 0x30], %f12
1408 1384 faligndata %f6, %f8, %f38
1409 1385 ldd [SRC + 0x38], %f14
1410 1386 faligndata %f8, %f10, %f40
1411 1387 sub CNT, VIS_BLOCKSIZE, CNT
1412 1388 ldd [SRC + VIS_BLOCKSIZE], %f0
1413 1389 faligndata %f10, %f12, %f42
1414 1390 prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
1415 1391 add DST, VIS_BLOCKSIZE, DST
1416 1392 prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1417 1393 add REALSRC, VIS_BLOCKSIZE, REALSRC
1418 1394 cmp CNT, VIS_BLOCKSIZE + 8
1419 1395 bgu,pt %ncc, 1b
1420 1396 add SRC, VIS_BLOCKSIZE, SRC
1421 1397
1422 1398 ! only if REALSRC & 0x7 is 0
1423 1399 cmp CNT, VIS_BLOCKSIZE
1424 1400 bne %ncc, 3f
1425 1401 andcc REALSRC, 0x7, %g0
1426 1402 bz,pt %ncc, 2f
1427 1403 nop
1428 1404 3:
1429 1405 faligndata %f12, %f14, %f44
1430 1406 faligndata %f14, %f0, %f46
1431 1407 stda %f32, [DST]ASI_BLK_P
1432 1408 add DST, VIS_BLOCKSIZE, DST
1433 1409 ba,pt %ncc, 3f
1434 1410 nop
1435 1411 2:
1436 1412 ldd [SRC + 0x08], %f2
1437 1413 fsrc1 %f12, %f44
1438 1414 ldd [SRC + 0x10], %f4
1439 1415 fsrc1 %f14, %f46
1440 1416 stda %f32, [DST]ASI_BLK_P
1441 1417 ldd [SRC + 0x18], %f6
1442 1418 fsrc1 %f0, %f32
1443 1419 ldd [SRC + 0x20], %f8
1444 1420 fsrc1 %f2, %f34
1445 1421 ldd [SRC + 0x28], %f10
1446 1422 fsrc1 %f4, %f36
1447 1423 ldd [SRC + 0x30], %f12
1448 1424 fsrc1 %f6, %f38
1449 1425 ldd [SRC + 0x38], %f14
1450 1426 fsrc1 %f8, %f40
1451 1427 sub CNT, VIS_BLOCKSIZE, CNT
1452 1428 add DST, VIS_BLOCKSIZE, DST
1453 1429 add SRC, VIS_BLOCKSIZE, SRC
1454 1430 add REALSRC, VIS_BLOCKSIZE, REALSRC
1455 1431 fsrc1 %f10, %f42
1456 1432 fsrc1 %f12, %f44
1457 1433 fsrc1 %f14, %f46
1458 1434 stda %f32, [DST]ASI_BLK_P
1459 1435 add DST, VIS_BLOCKSIZE, DST
1460 1436 ba,a,pt %ncc, .bcb_exit
1461 1437 nop
1462 1438
1463 1439 3: tst CNT
1464 1440 bz,a,pt %ncc, .bcb_exit
1465 1441 nop
1466 1442
1467 1443 5: ldub [REALSRC], TMP
1468 1444 inc REALSRC
1469 1445 inc DST
1470 1446 deccc CNT
1471 1447 bgu %ncc, 5b
1472 1448 stb TMP, [DST - 1]
1473 1449 .bcb_exit:
1474 1450 membar #Sync
1475 1451
1476 1452 FPRAS_INTERVAL(FPRAS_BCOPY, 0, %l5, %o2, %o3, %o4, %o5, 8)
1477 1453 FPRAS_REWRITE_TYPE2Q1(0, %l5, %o2, %o3, 8, 9)
1478 1454 FPRAS_CHECK(FPRAS_BCOPY, %l5, 9) ! outputs lost
1479 1455
1480 1456 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr
1481 1457 wr %o2, 0, %gsr
1482 1458
1483 1459 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1484 1460 btst FPRS_FEF, %o3
1485 1461 bz,pt %icc, 4f
1486 1462 nop
1487 1463
1488 1464 BLD_FPQ1Q3_FROMSTACK(%o2)
1489 1465
1490 1466 ba,pt %ncc, 2f
1491 1467 wr %o3, 0, %fprs ! restore fprs
1492 1468 4:
1493 1469 FZEROQ1Q3
1494 1470 wr %o3, 0, %fprs ! restore fprs
1495 1471 2:
1496 1472 membar #Sync ! sync error barrier
1497 1473 andn %l6, MASK_FLAGS, %l6
1498 1474 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1499 1475 FP_ALLOWMIGRATE(5, 6)
1500 1476 ret
1501 1477 restore %g0, 0, %o0
1502 1478
1503 1479 SET_SIZE(bcopy_more)
1504 1480
1505 -#endif /* lint */
1506 -
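
The large-copy path above falls into three phases: byte copies until DST
reaches a 64-byte boundary, faligndata/stda block moves, and a bytewise
tail. A C model of that shape (illustrative sketch only; the name is
hypothetical, cnt is assumed larger than VIS_COPY_THRESHOLD as on the
real path, and memcpy stands in for the VIS alignment and ASI_BLK_P
block-store mechanics, which have no C equivalent):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    #define VIS_BLOCKSIZE   64

    static void
    fp_blockcopy_model(const char *src, char *dst, size_t cnt)
    {
            /* Phase 1: byte copy until dst is 64-byte aligned. */
            size_t head = (uintptr_t)dst & (VIS_BLOCKSIZE - 1);
            if (head != 0) {
                    head = VIS_BLOCKSIZE - head;
                    memcpy(dst, src, head);
                    src += head;
                    dst += head;
                    cnt -= head;
            }
            /* Phase 2: 64-byte blocks (faligndata + stda above). */
            while (cnt >= VIS_BLOCKSIZE) {
                    memcpy(dst, src, VIS_BLOCKSIZE);
                    src += VIS_BLOCKSIZE;
                    dst += VIS_BLOCKSIZE;
                    cnt -= VIS_BLOCKSIZE;
            }
            /* Phase 3: trailing bytes (the "5:" loop above). */
            while (cnt-- > 0)
                    *dst++ = *src++;
    }
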
1507 1481 /*
1508 1482 * Block copy with possibly overlapped operands.
1509 1483 */
1510 1484
1511 -#if defined(lint)
1512 -
1513 -/*ARGSUSED*/
1514 -void
1515 -ovbcopy(const void *from, void *to, size_t count)
1516 -{}
1517 -
1518 -#else /* lint */
1519 -
1520 1485 ENTRY(ovbcopy)
1521 1486 tst %o2 ! check count
1522 1487 bgu,a %ncc, 1f ! nothing to do or bad arguments
1523 1488 subcc %o0, %o1, %o3 ! difference of from and to address
1524 1489
1525 1490 retl ! return
1526 1491 nop
1527 1492 1:
1528 1493 bneg,a %ncc, 2f
1529 1494 neg %o3 ! if < 0, make it positive
1530 1495 2: cmp %o2, %o3 ! cmp size and abs(from - to)
1531 1496 bleu %ncc, bcopy ! if size <= abs(diff): use bcopy,
1532 1497 .empty ! no overlap
1533 1498 cmp %o0, %o1 ! compare from and to addresses
1534 1499 blu %ncc, .ov_bkwd ! if from < to, copy backwards
1535 1500 nop
1536 1501 !
1537 1502 ! Copy forwards.
1538 1503 !
1539 1504 .ov_fwd:
1540 1505 ldub [%o0], %o3 ! read from address
1541 1506 inc %o0 ! inc from address
1542 1507 stb %o3, [%o1] ! write to address
1543 1508 deccc %o2 ! dec count
1544 1509 bgu %ncc, .ov_fwd ! loop till done
1545 1510 inc %o1 ! inc to address
1546 1511
1547 1512 retl ! return
1548 1513 nop
1549 1514 !
1550 1515 ! Copy backwards.
1551 1516 !
1552 1517 .ov_bkwd:
1553 1518 deccc %o2 ! dec count
1554 1519 ldub [%o0 + %o2], %o3 ! get byte at end of src
1555 1520 bgu %ncc, .ov_bkwd ! loop till done
1556 1521 stb %o3, [%o1 + %o2] ! delay slot, store at end of dst
1557 1522
1558 1523 retl ! return
1559 1524 nop
1560 1525
1561 1526 SET_SIZE(ovbcopy)
1562 1527
1563 -#endif /* lint */
1564 1528
1565 -
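
The overlap rule ovbcopy applies above is simple: if the regions are
disjoint, hand off to bcopy; otherwise copy a byte at a time, backwards
when the source lies below the destination. A C restatement (sketch;
the name is hypothetical, and cross-object pointer comparison is taken
as given, as it is in the kernel):

    #include <stddef.h>

    static void
    ovbcopy_model(const char *from, char *to, size_t count)
    {
            ptrdiff_t d = from - to;
            size_t dist = (size_t)(d < 0 ? -d : d);

            if (count == 0)
                    return;
            if (dist >= count || from > to) {
                    /* disjoint, or src above dst: forward is safe */
                    while (count-- > 0)
                            *to++ = *from++;
            } else {
                    /* overlapping with src below dst: go backwards */
                    while (count-- > 0)
                            to[count] = from[count];
            }
    }
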
1566 1529 /*
1567 1530 * hwblkpagecopy()
1568 1531 *
1569 1532 * Copies exactly one page. This routine assumes the caller (ppcopy)
1570 1533 * has already disabled kernel preemption and has checked
1571 1534 * use_hw_bcopy. Preventing preemption also prevents cpu migration.
1572 1535 */
1573 -#ifdef lint
1574 -/*ARGSUSED*/
1575 -void
1576 -hwblkpagecopy(const void *src, void *dst)
1577 -{ }
1578 -#else /* lint */
1579 1536 ENTRY(hwblkpagecopy)
1580 1537 ! get another window w/space for three aligned blocks of saved fpregs
1581 1538 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1582 1539
1583 1540 ! %i0 - source address (arg)
1584 1541 ! %i1 - destination address (arg)
1585 1542 ! %i2 - length of region (not arg)
1586 1543 ! %l0 - saved fprs
1587 1544 ! %l1 - pointer to saved fpregs
1588 1545
1589 1546 rd %fprs, %l0 ! check for unused fp
1590 1547 btst FPRS_FEF, %l0
1591 1548 bz,a,pt %icc, 1f
1592 1549 wr %g0, FPRS_FEF, %fprs
1593 1550
1594 1551 BST_FPQ1Q3_TOSTACK(%l1)
1595 1552
1596 1553 1: set PAGESIZE, CNT
1597 1554 mov REALSRC, SRC
1598 1555
1599 1556 prefetch [SRC], #one_read
1600 1557 prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
1601 1558 prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
1602 1559 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
1603 1560 ldd [SRC], %f0
1604 1561 #if CHEETAH_PREFETCH > 4
1605 1562 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1606 1563 #endif
1607 1564 ldd [SRC + 0x08], %f2
1608 1565 #if CHEETAH_PREFETCH > 5
1609 1566 prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
1610 1567 #endif
1611 1568 ldd [SRC + 0x10], %f4
1612 1569 #if CHEETAH_PREFETCH > 6
1613 1570 prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
1614 1571 #endif
1615 1572 fsrc1 %f0, %f32
1616 1573 ldd [SRC + 0x18], %f6
1617 1574 #if CHEETAH_PREFETCH > 7
1618 1575 prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
1619 1576 #endif
1620 1577 fsrc1 %f2, %f34
1621 1578 ldd [SRC + 0x20], %f8
1622 1579 fsrc1 %f4, %f36
1623 1580 ldd [SRC + 0x28], %f10
1624 1581 fsrc1 %f6, %f38
1625 1582 ldd [SRC + 0x30], %f12
1626 1583 fsrc1 %f8, %f40
1627 1584 ldd [SRC + 0x38], %f14
1628 1585 fsrc1 %f10, %f42
1629 1586 ldd [SRC + VIS_BLOCKSIZE], %f0
1630 1587 sub CNT, VIS_BLOCKSIZE, CNT
1631 1588 add SRC, VIS_BLOCKSIZE, SRC
1632 1589 ba,a,pt %ncc, 2f
1633 1590 nop
1634 1591 .align 16
1635 1592 2:
1636 1593 ldd [SRC + 0x08], %f2
1637 1594 fsrc1 %f12, %f44
1638 1595 ldd [SRC + 0x10], %f4
1639 1596 fsrc1 %f14, %f46
1640 1597 stda %f32, [DST]ASI_BLK_P
1641 1598 ldd [SRC + 0x18], %f6
1642 1599 fsrc1 %f0, %f32
1643 1600 ldd [SRC + 0x20], %f8
1644 1601 fsrc1 %f2, %f34
1645 1602 ldd [SRC + 0x28], %f10
1646 1603 fsrc1 %f4, %f36
1647 1604 ldd [SRC + 0x30], %f12
1648 1605 fsrc1 %f6, %f38
1649 1606 ldd [SRC + 0x38], %f14
1650 1607 fsrc1 %f8, %f40
1651 1608 ldd [SRC + VIS_BLOCKSIZE], %f0
1652 1609 fsrc1 %f10, %f42
1653 1610 prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
1654 1611 sub CNT, VIS_BLOCKSIZE, CNT
1655 1612 add DST, VIS_BLOCKSIZE, DST
1656 1613 cmp CNT, VIS_BLOCKSIZE + 8
1657 1614 prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1658 1615 bgu,pt %ncc, 2b
1659 1616 add SRC, VIS_BLOCKSIZE, SRC
1660 1617
1661 1618 ! trailing block
1662 1619 ldd [SRC + 0x08], %f2
1663 1620 fsrc1 %f12, %f44
1664 1621 ldd [SRC + 0x10], %f4
1665 1622 fsrc1 %f14, %f46
1666 1623 stda %f32, [DST]ASI_BLK_P
1667 1624 ldd [SRC + 0x18], %f6
1668 1625 fsrc1 %f0, %f32
1669 1626 ldd [SRC + 0x20], %f8
1670 1627 fsrc1 %f2, %f34
1671 1628 ldd [SRC + 0x28], %f10
1672 1629 fsrc1 %f4, %f36
1673 1630 ldd [SRC + 0x30], %f12
1674 1631 fsrc1 %f6, %f38
1675 1632 ldd [SRC + 0x38], %f14
1676 1633 fsrc1 %f8, %f40
1677 1634 sub CNT, VIS_BLOCKSIZE, CNT
1678 1635 add DST, VIS_BLOCKSIZE, DST
1679 1636 add SRC, VIS_BLOCKSIZE, SRC
1680 1637 fsrc1 %f10, %f42
1681 1638 fsrc1 %f12, %f44
1682 1639 fsrc1 %f14, %f46
1683 1640 stda %f32, [DST]ASI_BLK_P
1684 1641
1685 1642 membar #Sync
1686 1643
1687 1644 FPRAS_INTERVAL(FPRAS_PGCOPY, 1, %l5, %o2, %o3, %o4, %o5, 8)
1688 1645 FPRAS_REWRITE_TYPE1(1, %l5, %f32, %o2, 9)
1689 1646 FPRAS_CHECK(FPRAS_PGCOPY, %l5, 9) ! lose outputs
1690 1647
1691 1648 btst FPRS_FEF, %l0
1692 1649 bz,pt %icc, 2f
1693 1650 nop
1694 1651
1695 1652 BLD_FPQ1Q3_FROMSTACK(%l3)
1696 1653 ba 3f
1697 1654 nop
1698 1655
1699 1656 2: FZEROQ1Q3
1700 1657
1701 1658 3: wr %l0, 0, %fprs ! restore fprs
1702 1659 ret
1703 1660 restore %g0, 0, %o0
1704 1661
1705 1662 SET_SIZE(hwblkpagecopy)
1706 -#endif /* lint */
1707 1663
1708 1664
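hwblkpagecopy above has no error handling and no alignment prologue:
both pages are page-aligned, so every pass moves a full 64-byte block
and fsrc1 replaces faligndata. A sketch of the caller protocol it
relies on, assuming a ppcopy-like wrapper (hypothetical name and
simplified arguments; the real ppcopy deals in page_t pointers, and
kpreempt_disable, kpreempt_enable and the use_hw_bcopy tunable are the
pieces the comment above names):

    #include <stddef.h>

    extern int      use_hw_bcopy;
    extern void     hwblkpagecopy(const void *, void *);
    extern void     kpreempt_disable(void);
    extern void     kpreempt_enable(void);
    extern void     bcopy(const void *, void *, size_t);

    static void
    ppcopy_model(const void *src_pg, void *dst_pg, size_t pagesize)
    {
            if (use_hw_bcopy) {
                    kpreempt_disable();     /* no preemption, no migration */
                    hwblkpagecopy(src_pg, dst_pg);
                    kpreempt_enable();
            } else {
                    bcopy(src_pg, dst_pg, pagesize);
            }
    }
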
1709 1665 /*
1710 1666 * Transfer data to and from user space -
1711 1667	 * Note that these routines can cause faults.
1712 1668	 * It is assumed that the kernel maps nothing below
1713 1669	 * KERNELBASE in the virtual address space.
1714 1670 *
1715 1671 * Note that copyin(9F) and copyout(9F) are part of the
1716 1672 * DDI/DKI which specifies that they return '-1' on "errors."
1717 1673 *
1718 1674 * Sigh.
1719 1675 *
1720 1676	 * So there are two extremely similar routines - xcopyin() and xcopyout()
1721 1677 * which return the errno that we've faithfully computed. This
1722 1678 * allows other callers (e.g. uiomove(9F)) to work correctly.
1723 1679 * Given that these are used pretty heavily, we expand the calling
1724 1680 * sequences inline for all flavours (rather than making wrappers).
1725 1681 *
1726 1682 * There are also stub routines for xcopyout_little and xcopyin_little,
1727 1683 * which currently are intended to handle requests of <= 16 bytes from
1728 1684 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
1729 1685 * is left as an exercise...
1730 1686 */
1731 1687
1732 1688 /*
1733 1689	 * Copy data to and from user space (copyOP/xcopyOP/copyOP_noerr)
1734 1690 *
1735 1691 * General theory of operation:
1736 1692 *
1737 1693 * The only difference between copy{in,out} and
1738 1694 * xcopy{in,out} is in the error handling routine they invoke
1739 1695 * when a memory access error occurs. xcopyOP returns the errno
1740 1696 * while copyOP returns -1 (see above). copy{in,out}_noerr set
1741 1697 * a special flag (by oring the TRAMP_FLAG into the fault handler address)
1742 1698 * if they are called with a fault handler already in place. That flag
1743 1699 * causes the default handlers to trampoline to the previous handler
1744 1700 * upon an error.
1745 1701 *
1746 1702 * None of the copyops routines grab a window until it's decided that
1747 1703 * we need to do a HW block copy operation. This saves a window
1748 1704 * spill/fill when we're called during socket ops. The typical IO
1749 1705 * path won't cause spill/fill traps.
1750 1706 *
1751 1707 * This code uses a set of 4 limits for the maximum size that will
1752 1708 * be copied given a particular input/output address alignment.
1753 1709 * If the value for a particular limit is zero, the copy will be performed
1754 1710 * by the plain copy loops rather than FPBLK.
1755 1711 *
1756 1712 * See the description of bcopy above for more details of the
1757 1713 * data copying algorithm and the default limits.
1758 1714 *
1759 1715 */
1760 1716
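The errno-versus-(-1) split described above comes down to a one-line
difference per flavour. A sketch (do_protected_copy is a hypothetical
stand-in for the shared copy path running under t_lofault; it returns
0 or an errno):

    #include <stddef.h>

    extern int do_protected_copy(const void *, void *, size_t);

    int
    copyout_model(const void *kaddr, void *uaddr, size_t count)
    {
            int err = do_protected_copy(kaddr, uaddr, count);

            return (err == 0 ? 0 : -1);     /* DDI/DKI contract */
    }

    int
    xcopyout_model(const void *kaddr, void *uaddr, size_t count)
    {
            return (do_protected_copy(kaddr, uaddr, count));
    }
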
1761 1717 /*
1762 1718 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
1763 1719 */
1764 1720
1765 -#if defined(lint)
1766 -
1767 -
1768 -#else /* lint */
1769 1721 /*
1770 1722 * We save the arguments in the following registers in case of a fault:
1771 1723 * kaddr - %l1
1772 1724 * uaddr - %l2
1773 1725 * count - %l3
1774 1726 */
1775 1727 #define SAVE_SRC %l1
1776 1728 #define SAVE_DST %l2
1777 1729 #define SAVE_COUNT %l3
1778 1730
1779 1731 #define SM_SAVE_SRC %g4
1780 1732 #define SM_SAVE_DST %g5
1781 1733 #define SM_SAVE_COUNT %o5
1782 1734 #define ERRNO %l5
1783 1735
1784 1736
1785 1737 #define REAL_LOFAULT %l4
1786 1738 /*
1787 1739 * Generic copyio fault handler. This is the first line of defense when a
1788 1740 * fault occurs in (x)copyin/(x)copyout. In order for this to function
1789 1741 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
1790 1742 * This allows us to share common code for all the flavors of the copy
1791 1743 * operations, including the _noerr versions.
1792 1744 *
1793 1745 * Note that this function will restore the original input parameters before
1794 1746 * calling REAL_LOFAULT. So the real handler can vector to the appropriate
1795 1747 * member of the t_copyop structure, if needed.
1796 1748 */
1797 1749 ENTRY(copyio_fault)
1798 1750 membar #Sync
1799 1751 mov %g1,ERRNO ! save errno in ERRNO
1800 1752 btst FPUSED_FLAG, %l6
1801 1753 bz %ncc, 1f
1802 1754 nop
1803 1755
1804 1756 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
1805 1757 wr %o2, 0, %gsr ! restore gsr
1806 1758
1807 1759 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1808 1760 btst FPRS_FEF, %o3
1809 1761 bz,pt %icc, 4f
1810 1762 nop
1811 1763
1812 1764 BLD_FPQ2Q4_FROMSTACK(%o2)
1813 1765
1814 1766 ba,pt %ncc, 1f
1815 1767 wr %o3, 0, %fprs ! restore fprs
1816 1768
1817 1769 4:
1818 1770 FZEROQ2Q4
1819 1771 wr %o3, 0, %fprs ! restore fprs
1820 1772
1821 1773 1:
1822 1774 andn %l6, FPUSED_FLAG, %l6
1823 1775 membar #Sync
1824 1776 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1825 1777 FP_ALLOWMIGRATE(5, 6)
1826 1778
1827 1779 mov SAVE_SRC, %i0
1828 1780 mov SAVE_DST, %i1
1829 1781 jmp REAL_LOFAULT
1830 1782 mov SAVE_COUNT, %i2
1831 1783
1832 1784 SET_SIZE(copyio_fault)
1833 1785
1834 1786
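Restated in C, copyio_fault's contract is small (sketch; the FP state
unwinding is elided, the struct is a stand-in for the SAVE_SRC,
SAVE_DST and SAVE_COUNT registers, and the errno arrives in %g1 in the
real code):

    #include <stddef.h>

    struct saved_args {
            const void      *src;
            void            *dst;
            size_t          count;
    };

    static void
    copyio_fault_model(struct saved_args *sa, int err,
        void (*real_lofault)(const void *, void *, size_t, int))
    {
            /* restore %gsr, %fprs and the FP queues if FPUSED_FLAG set */
            real_lofault(sa->src, sa->dst, sa->count, err);
    }
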
1835 -#endif
1836 -
1837 -#if defined(lint)
1838 -
1839 -/*ARGSUSED*/
1840 -int
1841 -copyout(const void *kaddr, void *uaddr, size_t count)
1842 -{ return (0); }
1843 -
1844 -#else /* lint */
1845 -
1846 1787 ENTRY(copyout)
1847 1788
1848 1789 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
1849 1790 bleu,pt %ncc, .copyout_small ! go to larger cases
1850 1791 xor %o0, %o1, %o3 ! are src, dst alignable?
1851 1792 btst 7, %o3 !
1852 1793 bz,pt %ncc, .copyout_8 ! check for longword alignment
1853 1794 nop
1854 1795 btst 1, %o3 !
1855 1796 bz,pt %ncc, .copyout_2 ! check for half-word
1856 1797 nop
1857 1798 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
1858 1799 ld [%o3 + %lo(hw_copy_limit_1)], %o3
1859 1800 tst %o3
1860 1801 bz,pn %icc, .copyout_small ! if zero, disable HW copy
1861 1802 cmp %o2, %o3 ! if length <= limit
1862 1803 bleu,pt %ncc, .copyout_small ! go to small copy
1863 1804 nop
1864 1805 ba,pt %ncc, .copyout_more ! otherwise go to large copy
1865 1806 nop
1866 1807 .copyout_2:
1867 1808 btst 3, %o3 !
1868 1809 bz,pt %ncc, .copyout_4 ! check for word alignment
1869 1810 nop
1870 1811 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
1871 1812 ld [%o3 + %lo(hw_copy_limit_2)], %o3
1872 1813 tst %o3
1873 1814 bz,pn %icc, .copyout_small ! if zero, disable HW copy
1874 1815 cmp %o2, %o3 ! if length <= limit
1875 1816 bleu,pt %ncc, .copyout_small ! go to small copy
1876 1817 nop
1877 1818 ba,pt %ncc, .copyout_more ! otherwise go to large copy
1878 1819 nop
1879 1820 .copyout_4:
1880 1821 ! already checked longword, must be word aligned
1881 1822 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
1882 1823 ld [%o3 + %lo(hw_copy_limit_4)], %o3
1883 1824 tst %o3
1884 1825 bz,pn %icc, .copyout_small ! if zero, disable HW copy
1885 1826 cmp %o2, %o3 ! if length <= limit
1886 1827 bleu,pt %ncc, .copyout_small ! go to small copy
1887 1828 nop
1888 1829 ba,pt %ncc, .copyout_more ! otherwise go to large copy
1889 1830 nop
1890 1831 .copyout_8:
1891 1832 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
1892 1833 ld [%o3 + %lo(hw_copy_limit_8)], %o3
1893 1834 tst %o3
1894 1835 bz,pn %icc, .copyout_small ! if zero, disable HW copy
1895 1836 cmp %o2, %o3 ! if length <= limit
1896 1837 bleu,pt %ncc, .copyout_small ! go to small copy
1897 1838 nop
1898 1839 ba,pt %ncc, .copyout_more ! otherwise go to large copy
1899 1840 nop
1900 1841
1901 1842 .align 16
1902 1843 nop ! instruction alignment
1903 1844 ! see discussion at start of file
1904 1845 .copyout_small:
1905 1846 sethi %hi(.sm_copyout_err), %o5 ! .sm_copyout_err is lofault
1906 1847 or %o5, %lo(.sm_copyout_err), %o5
1907 1848 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler
1908 1849 membar #Sync ! sync error barrier
1909 1850 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault
1910 1851 .sm_do_copyout:
1911 1852 mov %o0, SM_SAVE_SRC
1912 1853 mov %o1, SM_SAVE_DST
1913 1854 cmp %o2, SHORTCOPY ! check for really short case
1914 1855 bleu,pt %ncc, .co_sm_left !
1915 1856 mov %o2, SM_SAVE_COUNT
1916 1857 cmp %o2, CHKSIZE ! check for medium length cases
1917 1858 bgu,pn %ncc, .co_med !
1918 1859 or %o0, %o1, %o3 ! prepare alignment check
1919 1860 andcc %o3, 0x3, %g0 ! test for alignment
1920 1861 bz,pt %ncc, .co_sm_word ! branch to word aligned case
1921 1862 .co_sm_movebytes:
1922 1863 sub %o2, 3, %o2 ! adjust count to allow cc zero test
1923 1864 .co_sm_notalign4:
1924 1865 ldub [%o0], %o3 ! read byte
1925 1866 subcc %o2, 4, %o2 ! reduce count by 4
1926 1867 stba %o3, [%o1]ASI_USER ! write byte
1927 1868 inc %o1 ! advance DST by 1
1928 1869 ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes
1929 1870 add %o0, 4, %o0 ! advance SRC by 4
1930 1871 stba %o3, [%o1]ASI_USER
1931 1872 inc %o1 ! advance DST by 1
1932 1873 ldub [%o0 - 2], %o3
1933 1874 stba %o3, [%o1]ASI_USER
1934 1875 inc %o1 ! advance DST by 1
1935 1876 ldub [%o0 - 1], %o3
1936 1877 stba %o3, [%o1]ASI_USER
1937 1878 bgt,pt %ncc, .co_sm_notalign4 ! loop til 3 or fewer bytes remain
1938 1879 inc %o1 ! advance DST by 1
1939 1880 add %o2, 3, %o2 ! restore count
1940 1881 .co_sm_left:
1941 1882 tst %o2
1942 1883 bz,pt %ncc, .co_sm_exit ! check for zero length
1943 1884 nop
1944 1885 ldub [%o0], %o3 ! load one byte
1945 1886 deccc %o2 ! reduce count for cc test
1946 1887 bz,pt %ncc, .co_sm_exit
1947 1888 stba %o3,[%o1]ASI_USER ! store one byte
1948 1889 ldub [%o0 + 1], %o3 ! load second byte
1949 1890 deccc %o2
1950 1891 inc %o1
1951 1892 bz,pt %ncc, .co_sm_exit
1952 1893 stba %o3,[%o1]ASI_USER ! store second byte
1953 1894 ldub [%o0 + 2], %o3 ! load third byte
1954 1895 inc %o1
1955 1896 stba %o3,[%o1]ASI_USER ! store third byte
1956 1897 membar #Sync ! sync error barrier
1957 1898 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1958 1899 retl
1959 1900 mov %g0, %o0 ! return 0
1960 1901 .align 16
1961 1902 .co_sm_words:
1962 1903 lduw [%o0], %o3 ! read word
1963 1904 .co_sm_wordx:
1964 1905 subcc %o2, 8, %o2 ! update count
1965 1906 stwa %o3, [%o1]ASI_USER ! write word
1966 1907 add %o0, 8, %o0 ! update SRC
1967 1908 lduw [%o0 - 4], %o3 ! read word
1968 1909 add %o1, 4, %o1 ! update DST
1969 1910 stwa %o3, [%o1]ASI_USER ! write word
1970 1911 bgt,pt %ncc, .co_sm_words ! loop til done
1971 1912 add %o1, 4, %o1 ! update DST
1972 1913 addcc %o2, 7, %o2 ! restore count
1973 1914 bz,pt %ncc, .co_sm_exit
1974 1915 nop
1975 1916 deccc %o2
1976 1917 bz,pt %ncc, .co_sm_byte
1977 1918 .co_sm_half:
1978 1919 subcc %o2, 2, %o2 ! reduce count by 2
1979 1920 lduh [%o0], %o3 ! read half word
1980 1921 add %o0, 2, %o0 ! advance SRC by 2
1981 1922 stha %o3, [%o1]ASI_USER ! write half word
1982 1923 bgt,pt %ncc, .co_sm_half ! loop til done
1983 1924 add %o1, 2, %o1 ! advance DST by 2
1984 1925 addcc %o2, 1, %o2 ! restore count
1985 1926 bz,pt %ncc, .co_sm_exit
1986 1927 nop
1987 1928 .co_sm_byte:
1988 1929 ldub [%o0], %o3
1989 1930 stba %o3, [%o1]ASI_USER
1990 1931 membar #Sync ! sync error barrier
1991 1932 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1992 1933 retl
1993 1934 mov %g0, %o0 ! return 0
1994 1935 .align 16
1995 1936 .co_sm_word:
1996 1937 subcc %o2, 4, %o2 ! update count
1997 1938 bgt,pt %ncc, .co_sm_wordx
1998 1939 lduw [%o0], %o3 ! read word
1999 1940 addcc %o2, 3, %o2 ! restore count
2000 1941 bz,pt %ncc, .co_sm_exit
2001 1942 stwa %o3, [%o1]ASI_USER ! write word
2002 1943 deccc %o2 ! reduce count for cc test
2003 1944 ldub [%o0 + 4], %o3 ! load one byte
2004 1945 add %o1, 4, %o1
2005 1946 bz,pt %ncc, .co_sm_exit
2006 1947 stba %o3, [%o1]ASI_USER ! store one byte
2007 1948 ldub [%o0 + 5], %o3 ! load second byte
2008 1949 deccc %o2
2009 1950 inc %o1
2010 1951 bz,pt %ncc, .co_sm_exit
2011 1952 stba %o3, [%o1]ASI_USER ! store second byte
2012 1953 ldub [%o0 + 6], %o3 ! load third byte
2013 1954 inc %o1
2014 1955 stba %o3, [%o1]ASI_USER ! store third byte
2015 1956 .co_sm_exit:
2016 1957 membar #Sync ! sync error barrier
2017 1958 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2018 1959 retl
2019 1960 mov %g0, %o0 ! return 0
2020 1961
2021 1962 .align 16
2022 1963 .co_med:
2023 1964 xor %o0, %o1, %o3 ! setup alignment check
2024 1965 btst 1, %o3
2025 1966 bnz,pt %ncc, .co_sm_movebytes ! unaligned
2026 1967 nop
2027 1968 btst 3, %o3
2028 1969 bnz,pt %ncc, .co_med_half ! halfword aligned
2029 1970 nop
2030 1971 btst 7, %o3
2031 1972 bnz,pt %ncc, .co_med_word ! word aligned
2032 1973 nop
2033 1974 .co_med_long:
2034 1975 btst 3, %o0 ! check for
2035 1976 bz,pt %ncc, .co_med_long1 ! word alignment
2036 1977 nop
2037 1978 .co_med_long0:
2038 1979 ldub [%o0], %o3 ! load one byte
2039 1980 inc %o0
2040 1981 stba %o3,[%o1]ASI_USER ! store byte
2041 1982 inc %o1
2042 1983 btst 3, %o0
2043 1984 bnz,pt %ncc, .co_med_long0
2044 1985 dec %o2
2045 1986 .co_med_long1: ! word aligned
2046 1987 btst 7, %o0 ! check for long word
2047 1988 bz,pt %ncc, .co_med_long2
2048 1989 nop
2049 1990 lduw [%o0], %o3 ! load word
2050 1991 add %o0, 4, %o0 ! advance SRC by 4
2051 1992 stwa %o3, [%o1]ASI_USER ! store word
2052 1993 add %o1, 4, %o1 ! advance DST by 4
2053 1994 sub %o2, 4, %o2 ! reduce count by 4
2054 1995 !
2055 1996 ! Now long word aligned and have at least 32 bytes to move
2056 1997 !
2057 1998 .co_med_long2:
2058 1999 sub %o2, 31, %o2 ! adjust count to allow cc zero test
2059 2000 sub %o1, 8, %o1 ! adjust pointer to allow store in
2060 2001 ! branch delay slot instead of add
2061 2002 .co_med_lmove:
2062 2003 add %o1, 8, %o1 ! advance DST by 8
2063 2004 ldx [%o0], %o3 ! read long word
2064 2005 subcc %o2, 32, %o2 ! reduce count by 32
2065 2006 stxa %o3, [%o1]ASI_USER ! write long word
2066 2007 add %o1, 8, %o1 ! advance DST by 8
2067 2008	ldx	[%o0 + 8], %o3	! repeat for a total of 4 long words
2068 2009 add %o0, 32, %o0 ! advance SRC by 32
2069 2010 stxa %o3, [%o1]ASI_USER
2070 2011 ldx [%o0 - 16], %o3
2071 2012 add %o1, 8, %o1 ! advance DST by 8
2072 2013 stxa %o3, [%o1]ASI_USER
2073 2014 ldx [%o0 - 8], %o3
2074 2015 add %o1, 8, %o1 ! advance DST by 8
2075 2016 bgt,pt %ncc, .co_med_lmove ! loop til 31 or fewer bytes left
2076 2017 stxa %o3, [%o1]ASI_USER
2077 2018 add %o1, 8, %o1 ! advance DST by 8
2078 2019 addcc %o2, 24, %o2 ! restore count to long word offset
2079 2020 ble,pt %ncc, .co_med_lextra ! check for more long words to move
2080 2021 nop
2081 2022 .co_med_lword:
2082 2023 ldx [%o0], %o3 ! read long word
2083 2024 subcc %o2, 8, %o2 ! reduce count by 8
2084 2025 stxa %o3, [%o1]ASI_USER ! write long word
2085 2026 add %o0, 8, %o0 ! advance SRC by 8
2086 2027 bgt,pt %ncc, .co_med_lword ! loop til 7 or fewer bytes left
2087 2028 add %o1, 8, %o1 ! advance DST by 8
2088 2029 .co_med_lextra:
2089 2030 addcc %o2, 7, %o2 ! restore rest of count
2090 2031 bz,pt %ncc, .co_sm_exit ! if zero, then done
2091 2032 deccc %o2
2092 2033 bz,pt %ncc, .co_sm_byte
2093 2034 nop
2094 2035 ba,pt %ncc, .co_sm_half
2095 2036 nop
2096 2037
2097 2038 .align 16
2098 2039 nop ! instruction alignment
2099 2040 ! see discussion at start of file
2100 2041 .co_med_word:
2101 2042 btst 3, %o0 ! check for
2102 2043 bz,pt %ncc, .co_med_word1 ! word alignment
2103 2044 nop
2104 2045 .co_med_word0:
2105 2046 ldub [%o0], %o3 ! load one byte
2106 2047 inc %o0
2107 2048 stba %o3,[%o1]ASI_USER ! store byte
2108 2049 inc %o1
2109 2050 btst 3, %o0
2110 2051 bnz,pt %ncc, .co_med_word0
2111 2052 dec %o2
2112 2053 !
2113 2054 ! Now word aligned and have at least 36 bytes to move
2114 2055 !
2115 2056 .co_med_word1:
2116 2057 sub %o2, 15, %o2 ! adjust count to allow cc zero test
2117 2058 .co_med_wmove:
2118 2059 lduw [%o0], %o3 ! read word
2119 2060 subcc %o2, 16, %o2 ! reduce count by 16
2120 2061 stwa %o3, [%o1]ASI_USER ! write word
2121 2062 add %o1, 4, %o1 ! advance DST by 4
2122 2063	lduw	[%o0 + 4], %o3	! repeat for a total of 4 words
2123 2064 add %o0, 16, %o0 ! advance SRC by 16
2124 2065 stwa %o3, [%o1]ASI_USER
2125 2066 add %o1, 4, %o1 ! advance DST by 4
2126 2067 lduw [%o0 - 8], %o3
2127 2068 stwa %o3, [%o1]ASI_USER
2128 2069 add %o1, 4, %o1 ! advance DST by 4
2129 2070 lduw [%o0 - 4], %o3
2130 2071 stwa %o3, [%o1]ASI_USER
2131 2072 bgt,pt %ncc, .co_med_wmove ! loop til 15 or fewer bytes left
2132 2073 add %o1, 4, %o1 ! advance DST by 4
2133 2074 addcc %o2, 12, %o2 ! restore count to word offset
2134 2075 ble,pt %ncc, .co_med_wextra ! check for more words to move
2135 2076 nop
2136 2077 .co_med_word2:
2137 2078 lduw [%o0], %o3 ! read word
2138 2079 subcc %o2, 4, %o2 ! reduce count by 4
2139 2080 stwa %o3, [%o1]ASI_USER ! write word
2140 2081 add %o0, 4, %o0 ! advance SRC by 4
2141 2082 bgt,pt %ncc, .co_med_word2 ! loop til 3 or fewer bytes left
2142 2083 add %o1, 4, %o1 ! advance DST by 4
2143 2084 .co_med_wextra:
2144 2085 addcc %o2, 3, %o2 ! restore rest of count
2145 2086 bz,pt %ncc, .co_sm_exit ! if zero, then done
2146 2087 deccc %o2
2147 2088 bz,pt %ncc, .co_sm_byte
2148 2089 nop
2149 2090 ba,pt %ncc, .co_sm_half
2150 2091 nop
2151 2092
2152 2093 .align 16
2153 2094 nop ! instruction alignment
2154 2095 nop ! see discussion at start of file
2155 2096 nop
2156 2097 .co_med_half:
2157 2098 btst 1, %o0 ! check for
2158 2099 bz,pt %ncc, .co_med_half1 ! half word alignment
2159 2100 nop
2160 2101 ldub [%o0], %o3 ! load one byte
2161 2102 inc %o0
2162 2103 stba %o3,[%o1]ASI_USER ! store byte
2163 2104 inc %o1
2164 2105 dec %o2
2165 2106 !
2166 2107 ! Now half word aligned and have at least 38 bytes to move
2167 2108 !
2168 2109 .co_med_half1:
2169 2110 sub %o2, 7, %o2 ! adjust count to allow cc zero test
2170 2111 .co_med_hmove:
2171 2112 lduh [%o0], %o3 ! read half word
2172 2113 subcc %o2, 8, %o2 ! reduce count by 8
2173 2114 stha %o3, [%o1]ASI_USER ! write half word
2174 2115 add %o1, 2, %o1 ! advance DST by 2
2175 2116	lduh	[%o0 + 2], %o3	! repeat for a total of 4 halfwords
2176 2117 add %o0, 8, %o0 ! advance SRC by 8
2177 2118 stha %o3, [%o1]ASI_USER
2178 2119 add %o1, 2, %o1 ! advance DST by 2
2179 2120 lduh [%o0 - 4], %o3
2180 2121 stha %o3, [%o1]ASI_USER
2181 2122 add %o1, 2, %o1 ! advance DST by 2
2182 2123 lduh [%o0 - 2], %o3
2183 2124 stha %o3, [%o1]ASI_USER
2184 2125 bgt,pt %ncc, .co_med_hmove ! loop til 7 or fewer bytes left
2185 2126 add %o1, 2, %o1 ! advance DST by 2
2186 2127 addcc %o2, 7, %o2 ! restore count
2187 2128 bz,pt %ncc, .co_sm_exit
2188 2129 deccc %o2
2189 2130 bz,pt %ncc, .co_sm_byte
2190 2131 nop
2191 2132 ba,pt %ncc, .co_sm_half
2192 2133 nop
2193 2134
2194 2135 /*
2195 2136 * We got here because of a fault during short copyout.
2196 2137 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
2197 2138 */
2198 2139 .sm_copyout_err:
2199 2140 membar #Sync
2200 2141 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2201 2142 mov SM_SAVE_SRC, %o0
2202 2143 mov SM_SAVE_DST, %o1
2203 2144 mov SM_SAVE_COUNT, %o2
2204 2145 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler
2205 2146 tst %o3
2206 2147 bz,pt %ncc, 3f ! if not, return error
2207 2148 nop
2208 2149 ldn [%o3 + CP_COPYOUT], %o5 ! if handler, invoke it with
2209 2150 jmp %o5 ! original arguments
2210 2151 nop
2211 2152 3:
2212 2153 retl
2213 2154 or %g0, -1, %o0 ! return error value
2214 2155
2215 2156 SET_SIZE(copyout)
2216 2157
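All three medium loops above (.co_med_lmove, .co_med_wmove,
.co_med_hmove) use the same counting trick: bias the count down by the
unroll size minus one so the single subcc per pass doubles as the
"enough bytes left" test, then add the bias back to size the tail. The
long-word case in C (sketch; hypothetical name, ASI_USER stores
modeled as plain stores, and the caller guarantees at least 32 bytes,
as .co_med_long2 does):

    #include <stddef.h>
    #include <stdint.h>

    static size_t
    med_lmove_model(const uint64_t **srcp, uint64_t **dstp, size_t count)
    {
            const uint64_t *src = *srcp;
            uint64_t *dst = *dstp;
            long n = (long)count - 31;      /* bias for the loop test */

            do {
                    dst[0] = src[0];        /* 4 long words per pass */
                    dst[1] = src[1];
                    dst[2] = src[2];
                    dst[3] = src[3];
                    src += 4;
                    dst += 4;
                    n -= 32;
            } while (n > 0);

            *srcp = src;
            *dstp = dst;
            return ((size_t)(n + 31));      /* 0..31 bytes for the tails */
    }
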
2217 2158 /*
2218 2159 * The _more entry points are not intended to be used directly by
2219 2160 * any caller from outside this file. They are provided to allow
2220 2161	 * profiling and dtrace of the portions of the copy code that use
2221 2162 * the floating point registers.
2222 2163 * This entry is particularly important as DTRACE (at least as of
2223 2164 * 4/2004) does not support leaf functions.
2224 2165 */
2225 2166
2226 2167 ENTRY(copyout_more)
2227 2168 .copyout_more:
2228 2169 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2229 2170 set .copyout_err, REAL_LOFAULT
2230 2171
2231 2172 /*
2232 2173 * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
2233 2174 */
2234 2175 .do_copyout:
2235 2176	set	copyio_fault, %l7		! copyio_fault is lofault val
2236 2177
2237 2178 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler
2238 2179 membar #Sync ! sync error barrier
2239 2180 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
2240 2181
2241 2182 mov %i0, SAVE_SRC
2242 2183 mov %i1, SAVE_DST
2243 2184 mov %i2, SAVE_COUNT
2244 2185
2245 2186 FP_NOMIGRATE(6, 7)
2246 2187
2247 2188 rd %fprs, %o2 ! check for unused fp
2248 2189 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
2249 2190 btst FPRS_FEF, %o2
2250 2191 bz,a,pt %icc, .do_blockcopyout
2251 2192 wr %g0, FPRS_FEF, %fprs
2252 2193
2253 2194 BST_FPQ2Q4_TOSTACK(%o2)
2254 2195
2255 2196 .do_blockcopyout:
2256 2197 rd %gsr, %o2
2257 2198 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr
2258 2199 or %l6, FPUSED_FLAG, %l6
2259 2200
2260 2201 andcc DST, VIS_BLOCKSIZE - 1, TMP
2261 2202 mov ASI_USER, %asi
2262 2203 bz,pt %ncc, 2f
2263 2204 neg TMP
2264 2205 add TMP, VIS_BLOCKSIZE, TMP
2265 2206
2266 2207 ! TMP = bytes required to align DST on FP_BLOCK boundary
2267 2208 ! Using SRC as a tmp here
2268 2209 cmp TMP, 3
2269 2210 bleu,pt %ncc, 1f
2270 2211 sub CNT,TMP,CNT ! adjust main count
2271 2212 sub TMP, 3, TMP ! adjust for end of loop test
2272 2213 .co_blkalign:
2273 2214 ldub [REALSRC], SRC ! move 4 bytes per loop iteration
2274 2215 stba SRC, [DST]%asi
2275 2216 subcc TMP, 4, TMP
2276 2217 ldub [REALSRC + 1], SRC
2277 2218 add REALSRC, 4, REALSRC
2278 2219 stba SRC, [DST + 1]%asi
2279 2220 ldub [REALSRC - 2], SRC
2280 2221 add DST, 4, DST
2281 2222 stba SRC, [DST - 2]%asi
2282 2223 ldub [REALSRC - 1], SRC
2283 2224 bgu,pt %ncc, .co_blkalign
2284 2225 stba SRC, [DST - 1]%asi
2285 2226
2286 2227 addcc TMP, 3, TMP ! restore count adjustment
2287 2228 bz,pt %ncc, 2f ! no bytes left?
2288 2229 nop
2289 2230 1: ldub [REALSRC], SRC
2290 2231 inc REALSRC
2291 2232 inc DST
2292 2233 deccc TMP
2293 2234 bgu %ncc, 1b
2294 2235 stba SRC, [DST - 1]%asi
2295 2236
2296 2237 2:
2297 2238 andn REALSRC, 0x7, SRC
2298 2239 alignaddr REALSRC, %g0, %g0
2299 2240
2300 2241 ! SRC - 8-byte aligned
2301 2242 ! DST - 64-byte aligned
2302 2243 prefetch [SRC], #one_read
2303 2244 prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
2304 2245 prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
2305 2246 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
2306 2247 ldd [SRC], %f16
2307 2248 #if CHEETAH_PREFETCH > 4
2308 2249 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
2309 2250 #endif
2310 2251 ldd [SRC + 0x08], %f18
2311 2252 #if CHEETAH_PREFETCH > 5
2312 2253 prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
2313 2254 #endif
2314 2255 ldd [SRC + 0x10], %f20
2315 2256 #if CHEETAH_PREFETCH > 6
2316 2257 prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
2317 2258 #endif
2318 2259 faligndata %f16, %f18, %f48
2319 2260 ldd [SRC + 0x18], %f22
2320 2261 #if CHEETAH_PREFETCH > 7
2321 2262 prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
2322 2263 #endif
2323 2264 faligndata %f18, %f20, %f50
2324 2265 ldd [SRC + 0x20], %f24
2325 2266 faligndata %f20, %f22, %f52
2326 2267 ldd [SRC + 0x28], %f26
2327 2268 faligndata %f22, %f24, %f54
2328 2269 ldd [SRC + 0x30], %f28
2329 2270 faligndata %f24, %f26, %f56
2330 2271 ldd [SRC + 0x38], %f30
2331 2272 faligndata %f26, %f28, %f58
2332 2273 ldd [SRC + VIS_BLOCKSIZE], %f16
2333 2274 sub CNT, VIS_BLOCKSIZE, CNT
2334 2275 add SRC, VIS_BLOCKSIZE, SRC
2335 2276 add REALSRC, VIS_BLOCKSIZE, REALSRC
2336 2277 ba,a,pt %ncc, 1f
2337 2278 nop
2338 2279 .align 16
2339 2280 1:
2340 2281 ldd [SRC + 0x08], %f18
2341 2282 faligndata %f28, %f30, %f60
2342 2283 ldd [SRC + 0x10], %f20
2343 2284 faligndata %f30, %f16, %f62
2344 2285 stda %f48, [DST]ASI_BLK_AIUS
2345 2286 ldd [SRC + 0x18], %f22
2346 2287 faligndata %f16, %f18, %f48
2347 2288 ldd [SRC + 0x20], %f24
2348 2289 faligndata %f18, %f20, %f50
2349 2290 ldd [SRC + 0x28], %f26
2350 2291 faligndata %f20, %f22, %f52
2351 2292 ldd [SRC + 0x30], %f28
2352 2293 faligndata %f22, %f24, %f54
2353 2294 ldd [SRC + 0x38], %f30
2354 2295 faligndata %f24, %f26, %f56
2355 2296 sub CNT, VIS_BLOCKSIZE, CNT
2356 2297 ldd [SRC + VIS_BLOCKSIZE], %f16
2357 2298 faligndata %f26, %f28, %f58
2358 2299 prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
2359 2300 add DST, VIS_BLOCKSIZE, DST
2360 2301 prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
2361 2302 add REALSRC, VIS_BLOCKSIZE, REALSRC
2362 2303 cmp CNT, VIS_BLOCKSIZE + 8
2363 2304 bgu,pt %ncc, 1b
2364 2305 add SRC, VIS_BLOCKSIZE, SRC
2365 2306
2366 2307 ! only if REALSRC & 0x7 is 0
2367 2308 cmp CNT, VIS_BLOCKSIZE
2368 2309 bne %ncc, 3f
2369 2310 andcc REALSRC, 0x7, %g0
2370 2311 bz,pt %ncc, 2f
2371 2312 nop
2372 2313 3:
2373 2314 faligndata %f28, %f30, %f60
2374 2315 faligndata %f30, %f16, %f62
2375 2316 stda %f48, [DST]ASI_BLK_AIUS
2376 2317 add DST, VIS_BLOCKSIZE, DST
2377 2318 ba,pt %ncc, 3f
2378 2319 nop
2379 2320 2:
2380 2321 ldd [SRC + 0x08], %f18
2381 2322 fsrc1 %f28, %f60
2382 2323 ldd [SRC + 0x10], %f20
2383 2324 fsrc1 %f30, %f62
2384 2325 stda %f48, [DST]ASI_BLK_AIUS
2385 2326 ldd [SRC + 0x18], %f22
2386 2327 fsrc1 %f16, %f48
2387 2328 ldd [SRC + 0x20], %f24
2388 2329 fsrc1 %f18, %f50
2389 2330 ldd [SRC + 0x28], %f26
2390 2331 fsrc1 %f20, %f52
2391 2332 ldd [SRC + 0x30], %f28
2392 2333 fsrc1 %f22, %f54
2393 2334 ldd [SRC + 0x38], %f30
2394 2335 fsrc1 %f24, %f56
2395 2336 sub CNT, VIS_BLOCKSIZE, CNT
2396 2337 add DST, VIS_BLOCKSIZE, DST
2397 2338 add SRC, VIS_BLOCKSIZE, SRC
2398 2339 add REALSRC, VIS_BLOCKSIZE, REALSRC
2399 2340 fsrc1 %f26, %f58
2400 2341 fsrc1 %f28, %f60
2401 2342 fsrc1 %f30, %f62
2402 2343 stda %f48, [DST]ASI_BLK_AIUS
2403 2344 add DST, VIS_BLOCKSIZE, DST
2404 2345 ba,a,pt %ncc, 4f
2405 2346 nop
2406 2347
2407 2348 3: tst CNT
2408 2349 bz,a %ncc, 4f
2409 2350 nop
2410 2351
2411 2352 5: ldub [REALSRC], TMP
2412 2353 inc REALSRC
2413 2354 inc DST
2414 2355 deccc CNT
2415 2356 bgu %ncc, 5b
2416 2357 stba TMP, [DST - 1]%asi
2417 2358 4:
2418 2359
2419 2360 .copyout_exit:
2420 2361 membar #Sync
2421 2362
2422 2363 FPRAS_INTERVAL(FPRAS_COPYOUT, 0, %l5, %o2, %o3, %o4, %o5, 8)
2423 2364 FPRAS_REWRITE_TYPE2Q2(0, %l5, %o2, %o3, 8, 9)
2424 2365 FPRAS_CHECK(FPRAS_COPYOUT, %l5, 9) ! lose outputs
2425 2366
2426 2367 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
2427 2368 wr %o2, 0, %gsr ! restore gsr
2428 2369
2429 2370 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
2430 2371 btst FPRS_FEF, %o3
2431 2372 bz,pt %icc, 4f
2432 2373 nop
2433 2374
2434 2375 BLD_FPQ2Q4_FROMSTACK(%o2)
2435 2376
2436 2377 ba,pt %ncc, 1f
2437 2378 wr %o3, 0, %fprs ! restore fprs
2438 2379
2439 2380 4:
2440 2381 FZEROQ2Q4
2441 2382 wr %o3, 0, %fprs ! restore fprs
2442 2383
2443 2384 1:
2444 2385 membar #Sync
2445 2386 andn %l6, FPUSED_FLAG, %l6
2446 2387 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2447 2388 FP_ALLOWMIGRATE(5, 6)
2448 2389 ret
2449 2390 restore %g0, 0, %o0
2450 2391
2451 2392 /*
2452 2393 * We got here because of a fault during copyout.
2453 2394 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
2454 2395 */
2455 2396 .copyout_err:
2456 2397 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
2457 2398 tst %o4
2458 2399 bz,pt %ncc, 2f ! if not, return error
2459 2400 nop
2460 2401 ldn [%o4 + CP_COPYOUT], %g2 ! if handler, invoke it with
2461 2402 jmp %g2 ! original arguments
2462 2403 restore %g0, 0, %g0 ! dispose of copy window
2463 2404 2:
2464 2405 ret
2465 2406 restore %g0, -1, %o0 ! return error value
2466 2407
2467 2408
2468 2409 SET_SIZE(copyout_more)
2469 2410
2470 -#endif /* lint */
2471 2411
2472 -
2473 -#ifdef lint
2474 -
2475 -/*ARGSUSED*/
2476 -int
2477 -xcopyout(const void *kaddr, void *uaddr, size_t count)
2478 -{ return (0); }
2479 -
2480 -#else /* lint */
2481 -
2482 2412 ENTRY(xcopyout)
2483 2413 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
2484 2414 bleu,pt %ncc, .xcopyout_small ! go to larger cases
2485 2415 xor %o0, %o1, %o3 ! are src, dst alignable?
2486 2416 btst 7, %o3 !
2487 2417 bz,pt %ncc, .xcopyout_8 !
2488 2418 nop
2489 2419 btst 1, %o3 !
2490 2420 bz,pt %ncc, .xcopyout_2 ! check for half-word
2491 2421 nop
2492 2422 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
2493 2423 ld [%o3 + %lo(hw_copy_limit_1)], %o3
2494 2424 tst %o3
2495 2425 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy
2496 2426 cmp %o2, %o3 ! if length <= limit
2497 2427 bleu,pt %ncc, .xcopyout_small ! go to small copy
2498 2428 nop
2499 2429 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy
2500 2430 nop
2501 2431 .xcopyout_2:
2502 2432 btst 3, %o3 !
2503 2433 bz,pt %ncc, .xcopyout_4 ! check for word alignment
2504 2434 nop
2505 2435 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
2506 2436 ld [%o3 + %lo(hw_copy_limit_2)], %o3
2507 2437 tst %o3
2508 2438 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy
2509 2439 cmp %o2, %o3 ! if length <= limit
2510 2440 bleu,pt %ncc, .xcopyout_small ! go to small copy
2511 2441 nop
2512 2442 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy
2513 2443 nop
2514 2444 .xcopyout_4:
2515 2445 ! already checked longword, must be word aligned
2516 2446 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
2517 2447 ld [%o3 + %lo(hw_copy_limit_4)], %o3
2518 2448 tst %o3
2519 2449 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy
2520 2450 cmp %o2, %o3 ! if length <= limit
2521 2451 bleu,pt %ncc, .xcopyout_small ! go to small copy
2522 2452 nop
2523 2453 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy
2524 2454 nop
2525 2455 .xcopyout_8:
2526 2456 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
2527 2457 ld [%o3 + %lo(hw_copy_limit_8)], %o3
2528 2458 tst %o3
2529 2459 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy
2530 2460 cmp %o2, %o3 ! if length <= limit
2531 2461 bleu,pt %ncc, .xcopyout_small ! go to small copy
2532 2462 nop
2533 2463 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy
2534 2464 nop
2535 2465
2536 2466 .xcopyout_small:
2537 2467 sethi %hi(.sm_xcopyout_err), %o5 ! .sm_xcopyout_err is lofault
2538 2468 or %o5, %lo(.sm_xcopyout_err), %o5
2539 2469 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler
2540 2470 membar #Sync ! sync error barrier
2541 2471 ba,pt %ncc, .sm_do_copyout ! common code
2542 2472 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault
2543 2473
2544 2474 .xcopyout_more:
2545 2475 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2546 2476 sethi %hi(.xcopyout_err), REAL_LOFAULT
2547 2477 ba,pt %ncc, .do_copyout ! common code
2548 2478 or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
2549 2479
2550 2480 /*
2551 2481	 * We got here because of a fault during xcopyout.
2552 2482	 * Errno value is in ERRNO.
2553 2483 */
2554 2484 .xcopyout_err:
2555 2485 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
2556 2486 tst %o4
2557 2487 bz,pt %ncc, 2f ! if not, return error
2558 2488 nop
2559 2489 ldn [%o4 + CP_XCOPYOUT], %g2 ! if handler, invoke it with
2560 2490 jmp %g2 ! original arguments
2561 2491 restore %g0, 0, %g0 ! dispose of copy window
2562 2492 2:
2563 2493 ret
2564 2494 restore ERRNO, 0, %o0 ! return errno value
2565 2495
2566 2496 .sm_xcopyout_err:
2567 2497
2568 2498 membar #Sync
2569 2499 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2570 2500 mov SM_SAVE_SRC, %o0
2571 2501 mov SM_SAVE_DST, %o1
2572 2502 mov SM_SAVE_COUNT, %o2
2573 2503 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler
2574 2504 tst %o3
2575 2505 bz,pt %ncc, 3f ! if not, return error
2576 2506 nop
2577 2507 ldn [%o3 + CP_XCOPYOUT], %o5 ! if handler, invoke it with
2578 2508 jmp %o5 ! original arguments
2579 2509 nop
2580 2510 3:
2581 2511 retl
2582 2512 or %g1, 0, %o0 ! return errno value
2583 2513
2584 2514 SET_SIZE(xcopyout)
2585 2515
2586 -#endif /* lint */
2587 -
2588 -#ifdef lint
2589 -
2590 -/*ARGSUSED*/
2591 -int
2592 -xcopyout_little(const void *kaddr, void *uaddr, size_t count)
2593 -{ return (0); }
2594 -
2595 -#else /* lint */
2596 -
2597 2516 ENTRY(xcopyout_little)
2598 2517 sethi %hi(.xcopyio_err), %o5
2599 2518 or %o5, %lo(.xcopyio_err), %o5
2600 2519 ldn [THREAD_REG + T_LOFAULT], %o4
2601 2520 membar #Sync ! sync error barrier
2602 2521 stn %o5, [THREAD_REG + T_LOFAULT]
2603 2522 mov %o4, %o5
2604 2523
2605 2524 subcc %g0, %o2, %o3
2606 2525 add %o0, %o2, %o0
2607 2526 bz,pn %ncc, 2f ! check for zero bytes
2608 2527 sub %o2, 1, %o4
2609 2528 add %o0, %o4, %o0 ! start w/last byte
2610 2529 add %o1, %o2, %o1
2611 2530 ldub [%o0 + %o3], %o4
2612 2531
2613 2532 1: stba %o4, [%o1 + %o3]ASI_AIUSL
2614 2533 inccc %o3
2615 2534 sub %o0, 2, %o0 ! get next byte
2616 2535 bcc,a,pt %ncc, 1b
2617 2536 ldub [%o0 + %o3], %o4
2618 2537
2619 2538 2:
2620 2539 membar #Sync ! sync error barrier
2621 2540 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2622 2541 retl
2623 2542 mov %g0, %o0 ! return (0)
2624 2543
2625 2544 SET_SIZE(xcopyout_little)
2626 2545
2627 -#endif /* lint */
2628 -
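
The indexing in xcopyout_little above is compact but opaque: %o3 holds
-count and counts up toward zero, the source pointer is pre-biased past
the end of the buffer so [%o0 + %o3] walks backward, and the store
index walks forward, so the net effect is a byte-reversed copy of the
region (for single-byte stores the little-endian ASI_AIUSL itself
changes nothing). In C (hypothetical name):

    #include <stddef.h>

    static void
    xcopy_little_model(const unsigned char *kaddr, unsigned char *uaddr,
        size_t count)
    {
            size_t i;

            for (i = 0; i < count; i++)
                    uaddr[i] = kaddr[count - 1 - i];
    }
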
2629 2546 /*
2630 2547 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
2631 2548 */
2632 2549
2633 -#if defined(lint)
2634 -
2635 -/*ARGSUSED*/
2636 -int
2637 -copyin(const void *uaddr, void *kaddr, size_t count)
2638 -{ return (0); }
2639 -
2640 -#else /* lint */
2641 -
2642 2550 ENTRY(copyin)
2643 2551 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
2644 2552 bleu,pt %ncc, .copyin_small ! go to larger cases
2645 2553 xor %o0, %o1, %o3 ! are src, dst alignable?
2646 2554 btst 7, %o3 !
2647 2555 bz,pt %ncc, .copyin_8 ! check for longword alignment
2648 2556 nop
2649 2557 btst 1, %o3 !
2650 2558 bz,pt %ncc, .copyin_2 ! check for half-word
2651 2559 nop
2652 2560 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
2653 2561 ld [%o3 + %lo(hw_copy_limit_1)], %o3
2654 2562 tst %o3
2655 2563 bz,pn %icc, .copyin_small ! if zero, disable HW copy
2656 2564 cmp %o2, %o3 ! if length <= limit
2657 2565 bleu,pt %ncc, .copyin_small ! go to small copy
2658 2566 nop
2659 2567 ba,pt %ncc, .copyin_more ! otherwise go to large copy
2660 2568 nop
2661 2569 .copyin_2:
2662 2570 btst 3, %o3 !
2663 2571 bz,pt %ncc, .copyin_4 ! check for word alignment
2664 2572 nop
2665 2573 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
2666 2574 ld [%o3 + %lo(hw_copy_limit_2)], %o3
2667 2575 tst %o3
2668 2576 bz,pn %icc, .copyin_small ! if zero, disable HW copy
2669 2577 cmp %o2, %o3 ! if length <= limit
2670 2578 bleu,pt %ncc, .copyin_small ! go to small copy
2671 2579 nop
2672 2580 ba,pt %ncc, .copyin_more ! otherwise go to large copy
2673 2581 nop
2674 2582 .copyin_4:
2675 2583 ! already checked longword, must be word aligned
2676 2584 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
2677 2585 ld [%o3 + %lo(hw_copy_limit_4)], %o3
2678 2586 tst %o3
2679 2587 bz,pn %icc, .copyin_small ! if zero, disable HW copy
2680 2588 cmp %o2, %o3 ! if length <= limit
2681 2589 bleu,pt %ncc, .copyin_small ! go to small copy
2682 2590 nop
2683 2591 ba,pt %ncc, .copyin_more ! otherwise go to large copy
2684 2592 nop
2685 2593 .copyin_8:
2686 2594 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
2687 2595 ld [%o3 + %lo(hw_copy_limit_8)], %o3
2688 2596 tst %o3
2689 2597 bz,pn %icc, .copyin_small ! if zero, disable HW copy
2690 2598 cmp %o2, %o3 ! if length <= limit
2691 2599 bleu,pt %ncc, .copyin_small ! go to small copy
2692 2600 nop
2693 2601 ba,pt %ncc, .copyin_more ! otherwise go to large copy
2694 2602 nop
2695 2603
2696 2604 .align 16
2697 2605 nop ! instruction alignment
2698 2606 ! see discussion at start of file
2699 2607 .copyin_small:
2700 2608 sethi %hi(.sm_copyin_err), %o5 ! .sm_copyin_err is lofault
2701 2609 or %o5, %lo(.sm_copyin_err), %o5
2702 2610 ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofault, no tramp
2703 2611 membar #Sync ! sync error barrier
2704 2612 stn %o5, [THREAD_REG + T_LOFAULT]
2705 2613 .sm_do_copyin:
2706 2614 mov %o0, SM_SAVE_SRC
2707 2615 mov %o1, SM_SAVE_DST
2708 2616 cmp %o2, SHORTCOPY ! check for really short case
2709 2617 bleu,pt %ncc, .ci_sm_left !
2710 2618 mov %o2, SM_SAVE_COUNT
2711 2619 cmp %o2, CHKSIZE ! check for medium length cases
2712 2620 bgu,pn %ncc, .ci_med !
2713 2621 or %o0, %o1, %o3 ! prepare alignment check
2714 2622 andcc %o3, 0x3, %g0 ! test for alignment
2715 2623 bz,pt %ncc, .ci_sm_word ! branch to word aligned case
2716 2624 .ci_sm_movebytes:
2717 2625 sub %o2, 3, %o2 ! adjust count to allow cc zero test
2718 2626 .ci_sm_notalign4:
2719 2627 lduba [%o0]ASI_USER, %o3 ! read byte
2720 2628 subcc %o2, 4, %o2 ! reduce count by 4
2721 2629 stb %o3, [%o1] ! write byte
2722 2630 add %o0, 1, %o0 ! advance SRC by 1
2723 2631 lduba [%o0]ASI_USER, %o3 ! repeat for a total of 4 bytes
2724 2632 add %o0, 1, %o0 ! advance SRC by 1
2725 2633 stb %o3, [%o1 + 1]
2726 2634 add %o1, 4, %o1 ! advance DST by 4
2727 2635 lduba [%o0]ASI_USER, %o3
2728 2636 add %o0, 1, %o0 ! advance SRC by 1
2729 2637 stb %o3, [%o1 - 2]
2730 2638 lduba [%o0]ASI_USER, %o3
2731 2639 add %o0, 1, %o0 ! advance SRC by 1
2732 2640 bgt,pt %ncc, .ci_sm_notalign4 ! loop til 3 or fewer bytes remain
2733 2641 stb %o3, [%o1 - 1]
2734 2642 add %o2, 3, %o2 ! restore count
2735 2643 .ci_sm_left:
2736 2644 tst %o2
2737 2645 bz,pt %ncc, .ci_sm_exit
2738 2646 nop
2739 2647 lduba [%o0]ASI_USER, %o3 ! load one byte
2740 2648 deccc %o2 ! reduce count for cc test
2741 2649 bz,pt %ncc, .ci_sm_exit
2742 2650 stb %o3,[%o1] ! store one byte
2743 2651 inc %o0
2744 2652 lduba [%o0]ASI_USER, %o3 ! load second byte
2745 2653 deccc %o2
2746 2654 bz,pt %ncc, .ci_sm_exit
2747 2655 stb %o3,[%o1 + 1] ! store second byte
2748 2656 inc %o0
2749 2657 lduba [%o0]ASI_USER, %o3 ! load third byte
2750 2658 stb %o3,[%o1 + 2] ! store third byte
2751 2659 membar #Sync ! sync error barrier
2752 2660 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2753 2661 retl
2754 2662 mov %g0, %o0 ! return 0
2755 2663 .align 16
2756 2664 .ci_sm_words:
2757 2665 lduwa [%o0]ASI_USER, %o3 ! read word
2758 2666 .ci_sm_wordx:
2759 2667 subcc %o2, 8, %o2 ! update count
2760 2668 stw %o3, [%o1] ! write word
2761 2669 add %o0, 4, %o0 ! update SRC
2762 2670 add %o1, 8, %o1 ! update DST
2763 2671 lduwa [%o0]ASI_USER, %o3 ! read word
2764 2672 add %o0, 4, %o0 ! update SRC
2765 2673 bgt,pt %ncc, .ci_sm_words ! loop til done
2766 2674 stw %o3, [%o1 - 4] ! write word
2767 2675 addcc %o2, 7, %o2 ! restore count
2768 2676 bz,pt %ncc, .ci_sm_exit
2769 2677 nop
2770 2678 deccc %o2
2771 2679 bz,pt %ncc, .ci_sm_byte
2772 2680 .ci_sm_half:
2773 2681 subcc %o2, 2, %o2 ! reduce count by 2
2774 2682 lduha [%o0]ASI_USER, %o3 ! read half word
2775 2683 add %o0, 2, %o0 ! advance SRC by 2
2776 2684 add %o1, 2, %o1 ! advance DST by 2
2777 2685 bgt,pt %ncc, .ci_sm_half ! loop til done
2778 2686 sth %o3, [%o1 - 2] ! write half word
2779 2687 addcc %o2, 1, %o2 ! restore count
2780 2688 bz,pt %ncc, .ci_sm_exit
2781 2689 nop
2782 2690 .ci_sm_byte:
2783 2691 lduba [%o0]ASI_USER, %o3
2784 2692 stb %o3, [%o1]
2785 2693 membar #Sync ! sync error barrier
2786 2694 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2787 2695 retl
2788 2696 mov %g0, %o0 ! return 0
2789 2697 .align 16
2790 2698 .ci_sm_word:
2791 2699 subcc %o2, 4, %o2 ! update count
2792 2700 bgt,pt %ncc, .ci_sm_wordx
2793 2701 lduwa [%o0]ASI_USER, %o3 ! read word
2794 2702 addcc %o2, 3, %o2 ! restore count
2795 2703 bz,pt %ncc, .ci_sm_exit
2796 2704 stw %o3, [%o1] ! write word
2797 2705 deccc %o2 ! reduce count for cc test
2798 2706 add %o0, 4, %o0
2799 2707 lduba [%o0]ASI_USER, %o3 ! load one byte
2800 2708 bz,pt %ncc, .ci_sm_exit
2801 2709 stb %o3, [%o1 + 4] ! store one byte
2802 2710 inc %o0
2803 2711 lduba [%o0]ASI_USER, %o3 ! load second byte
2804 2712 deccc %o2
2805 2713 bz,pt %ncc, .ci_sm_exit
2806 2714 stb %o3, [%o1 + 5] ! store second byte
2807 2715 inc %o0
2808 2716 lduba [%o0]ASI_USER, %o3 ! load third byte
2809 2717 stb %o3, [%o1 + 6] ! store third byte
2810 2718 .ci_sm_exit:
2811 2719 membar #Sync ! sync error barrier
2812 2720 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2813 2721 retl
2814 2722 mov %g0, %o0 ! return 0
2815 2723
2816 2724 .align 16
2817 2725 .ci_med:
2818 2726 xor %o0, %o1, %o3 ! setup alignment check
2819 2727 btst 1, %o3
2820 2728 bnz,pt %ncc, .ci_sm_movebytes ! unaligned
2821 2729 nop
2822 2730 btst 3, %o3
2823 2731 bnz,pt %ncc, .ci_med_half ! halfword aligned
2824 2732 nop
2825 2733 btst 7, %o3
2826 2734 bnz,pt %ncc, .ci_med_word ! word aligned
2827 2735 nop
2828 2736 .ci_med_long:
2829 2737 btst 3, %o0 ! check for
2830 2738 bz,pt %ncc, .ci_med_long1 ! word alignment
2831 2739 nop
2832 2740 .ci_med_long0:
2833 2741 lduba [%o0]ASI_USER, %o3 ! load one byte
2834 2742 inc %o0
2835 2743 stb %o3,[%o1] ! store byte
2836 2744 inc %o1
2837 2745 btst 3, %o0
2838 2746 bnz,pt %ncc, .ci_med_long0
2839 2747 dec %o2
2840 2748 .ci_med_long1: ! word aligned
2841 2749 btst 7, %o0 ! check for long word
2842 2750 bz,pt %ncc, .ci_med_long2
2843 2751 nop
2844 2752 lduwa [%o0]ASI_USER, %o3 ! load word
2845 2753 add %o0, 4, %o0 ! advance SRC by 4
2846 2754 stw %o3, [%o1] ! store word
2847 2755 add %o1, 4, %o1 ! advance DST by 4
2848 2756 sub %o2, 4, %o2 ! reduce count by 4
2849 2757 !
2850 2758 ! Now long word aligned and have at least 32 bytes to move
2851 2759 !
2852 2760 .ci_med_long2:
2853 2761 sub %o2, 31, %o2 ! adjust count to allow cc zero test
2854 2762 .ci_med_lmove:
2855 2763 ldxa [%o0]ASI_USER, %o3 ! read long word
2856 2764 subcc %o2, 32, %o2 ! reduce count by 32
2857 2765 stx %o3, [%o1] ! write long word
2858 2766 add %o0, 8, %o0 ! advance SRC by 8
2859 2767	ldxa	[%o0]ASI_USER, %o3	! repeat for a total of 4 long words
2860 2768 add %o0, 8, %o0 ! advance SRC by 8
2861 2769 stx %o3, [%o1 + 8]
2862 2770 add %o1, 32, %o1 ! advance DST by 32
2863 2771 ldxa [%o0]ASI_USER, %o3
2864 2772 add %o0, 8, %o0 ! advance SRC by 8
2865 2773 stx %o3, [%o1 - 16]
2866 2774 ldxa [%o0]ASI_USER, %o3
2867 2775 add %o0, 8, %o0 ! advance SRC by 8
2868 2776 bgt,pt %ncc, .ci_med_lmove ! loop til 31 or fewer bytes left
2869 2777 stx %o3, [%o1 - 8]
2870 2778 addcc %o2, 24, %o2 ! restore count to long word offset
2871 2779 ble,pt %ncc, .ci_med_lextra ! check for more long words to move
2872 2780 nop
2873 2781 .ci_med_lword:
2874 2782 ldxa [%o0]ASI_USER, %o3 ! read long word
2875 2783 subcc %o2, 8, %o2 ! reduce count by 8
2876 2784 stx %o3, [%o1] ! write long word
2877 2785 add %o0, 8, %o0 ! advance SRC by 8
2878 2786 bgt,pt %ncc, .ci_med_lword ! loop til 7 or fewer bytes left
2879 2787 add %o1, 8, %o1 ! advance DST by 8
2880 2788 .ci_med_lextra:
2881 2789 addcc %o2, 7, %o2 ! restore rest of count
2882 2790 bz,pt %ncc, .ci_sm_exit ! if zero, then done
2883 2791 deccc %o2
2884 2792 bz,pt %ncc, .ci_sm_byte
2885 2793 nop
2886 2794 ba,pt %ncc, .ci_sm_half
2887 2795 nop
2888 2796
2889 2797 .align 16
2890 2798 nop ! instruction alignment
2891 2799 ! see discussion at start of file
2892 2800 .ci_med_word:
2893 2801 btst 3, %o0 ! check for
2894 2802 bz,pt %ncc, .ci_med_word1 ! word alignment
2895 2803 nop
2896 2804 .ci_med_word0:
2897 2805 lduba [%o0]ASI_USER, %o3 ! load one byte
2898 2806 inc %o0
2899 2807 stb %o3,[%o1] ! store byte
2900 2808 inc %o1
2901 2809 btst 3, %o0
2902 2810 bnz,pt %ncc, .ci_med_word0
2903 2811 dec %o2
2904 2812 !
2905 2813 ! Now word aligned and have at least 36 bytes to move
2906 2814 !
2907 2815 .ci_med_word1:
2908 2816 sub %o2, 15, %o2 ! adjust count to allow cc zero test
2909 2817 .ci_med_wmove:
2910 2818 lduwa [%o0]ASI_USER, %o3 ! read word
2911 2819 subcc %o2, 16, %o2 ! reduce count by 16
2912 2820 stw %o3, [%o1] ! write word
2913 2821 add %o0, 4, %o0 ! advance SRC by 4
2914 2822	lduwa	[%o0]ASI_USER, %o3	! repeat for a total of 4 words
2915 2823 add %o0, 4, %o0 ! advance SRC by 4
2916 2824 stw %o3, [%o1 + 4]
2917 2825 add %o1, 16, %o1 ! advance DST by 16
2918 2826 lduwa [%o0]ASI_USER, %o3
2919 2827 add %o0, 4, %o0 ! advance SRC by 4
2920 2828 stw %o3, [%o1 - 8]
2921 2829 lduwa [%o0]ASI_USER, %o3
2922 2830 add %o0, 4, %o0 ! advance SRC by 4
2923 2831 bgt,pt %ncc, .ci_med_wmove ! loop til 15 or fewer bytes left
2924 2832 stw %o3, [%o1 - 4]
2925 2833 addcc %o2, 12, %o2 ! restore count to word offset
2926 2834 ble,pt %ncc, .ci_med_wextra ! check for more words to move
2927 2835 nop
2928 2836 .ci_med_word2:
2929 2837 lduwa [%o0]ASI_USER, %o3 ! read word
2930 2838 subcc %o2, 4, %o2 ! reduce count by 4
2931 2839 stw %o3, [%o1] ! write word
2932 2840 add %o0, 4, %o0 ! advance SRC by 4
2933 2841 bgt,pt %ncc, .ci_med_word2 ! loop til 3 or fewer bytes left
2934 2842 add %o1, 4, %o1 ! advance DST by 4
2935 2843 .ci_med_wextra:
2936 2844 addcc %o2, 3, %o2 ! restore rest of count
2937 2845 bz,pt %ncc, .ci_sm_exit ! if zero, then done
2938 2846 deccc %o2
2939 2847 bz,pt %ncc, .ci_sm_byte
2940 2848 nop
2941 2849 ba,pt %ncc, .ci_sm_half
2942 2850 nop
2943 2851
2944 2852 .align 16
2945 2853 nop ! instruction alignment
2946 2854 ! see discussion at start of file
2947 2855 .ci_med_half:
2948 2856 btst 1, %o0 ! check for
2949 2857 bz,pt %ncc, .ci_med_half1 ! half word alignment
2950 2858 nop
2951 2859 lduba [%o0]ASI_USER, %o3 ! load one byte
2952 2860 inc %o0
2953 2861 stb %o3,[%o1] ! store byte
2954 2862 inc %o1
2955 2863 dec %o2
2956 2864 !
2957 2865 ! Now half word aligned and have at least 38 bytes to move
2958 2866 !
2959 2867 .ci_med_half1:
2960 2868 sub %o2, 7, %o2 ! adjust count to allow cc zero test
2961 2869 .ci_med_hmove:
2962 2870 lduha [%o0]ASI_USER, %o3 ! read half word
2963 2871 subcc %o2, 8, %o2 ! reduce count by 8
2964 2872 sth %o3, [%o1] ! write half word
2965 2873 add %o0, 2, %o0 ! advance SRC by 2
2966 2874	lduha	[%o0]ASI_USER, %o3	! repeat for a total of 4 halfwords
2967 2875 add %o0, 2, %o0 ! advance SRC by 2
2968 2876 sth %o3, [%o1 + 2]
2969 2877 add %o1, 8, %o1 ! advance DST by 8
2970 2878 lduha [%o0]ASI_USER, %o3
2971 2879 add %o0, 2, %o0 ! advance SRC by 2
2972 2880 sth %o3, [%o1 - 4]
2973 2881 lduha [%o0]ASI_USER, %o3
2974 2882 add %o0, 2, %o0 ! advance SRC by 2
2975 2883 bgt,pt %ncc, .ci_med_hmove ! loop til 7 or fewer bytes left
2976 2884 sth %o3, [%o1 - 2]
2977 2885 addcc %o2, 7, %o2 ! restore count
2978 2886 bz,pt %ncc, .ci_sm_exit
2979 2887 deccc %o2
2980 2888 bz,pt %ncc, .ci_sm_byte
2981 2889 nop
2982 2890 ba,pt %ncc, .ci_sm_half
2983 2891 nop
2984 2892
2985 2893 .sm_copyin_err:
2986 2894 membar #Sync
2987 2895 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2988 2896 mov SM_SAVE_SRC, %o0
2989 2897 mov SM_SAVE_DST, %o1
2990 2898 mov SM_SAVE_COUNT, %o2
2991 2899 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler
2992 2900 tst %o3
2993 2901 bz,pt %ncc, 3f ! if not, return error
2994 2902 nop
2995 2903 ldn [%o3 + CP_COPYIN], %o5 ! if handler, invoke it with
2996 2904 jmp %o5 ! original arguments
2997 2905 nop
2998 2906 3:
2999 2907 retl
3000 2908 or %g0, -1, %o0 ! return errno value
3001 2909
3002 2910 SET_SIZE(copyin)
3003 2911
3004 2912
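copyin's small and medium paths above mirror copyout's with the ASI on
the other side of the move: loads come from user space (lduba, lduha,
lduwa, ldxa with ASI_USER) while the stores are ordinary kernel stores.
A byte-level sketch (user_load8 is a hypothetical stand-in for
lduba [addr]ASI_USER; a fault in it vectors through t_lofault exactly
as above):

    #include <stddef.h>
    #include <stdint.h>

    extern uint8_t user_load8(const uint8_t *uaddr);    /* may fault */

    static void
    copyin_byte_model(const uint8_t *uaddr, uint8_t *kaddr, size_t n)
    {
            while (n-- > 0)
                    *kaddr++ = user_load8(uaddr++);
    }
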
3005 2913 /*
3006 2914 * The _more entry points are not intended to be used directly by
3007 2915 * any caller from outside this file. They are provided to allow
3008 2916	 * profiling and dtrace of the portions of the copy code that use
3009 2917 * the floating point registers.
3010 2918 * This entry is particularly important as DTRACE (at least as of
3011 2919 * 4/2004) does not support leaf functions.
3012 2920 */
3013 2921
3014 2922 ENTRY(copyin_more)
3015 2923 .copyin_more:
3016 2924 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3017 2925 set .copyin_err, REAL_LOFAULT
3018 2926
3019 2927 /*
3020 2928 * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes
3021 2929 */
3022 2930 .do_copyin:
3023 2931 set copyio_fault, %l7 ! copyio_fault is lofault val
3024 2932
3025 2933 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler
3026 2934 membar #Sync ! sync error barrier
3027 2935 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
3028 2936
3029 2937 mov %i0, SAVE_SRC
3030 2938 mov %i1, SAVE_DST
3031 2939 mov %i2, SAVE_COUNT
3032 2940
3033 2941 FP_NOMIGRATE(6, 7)
3034 2942
3035 2943 rd %fprs, %o2 ! check for unused fp
3036 2944 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
3037 2945 btst FPRS_FEF, %o2
3038 2946 bz,a,pt %icc, .do_blockcopyin
3039 2947 wr %g0, FPRS_FEF, %fprs
3040 2948
3041 2949 BST_FPQ2Q4_TOSTACK(%o2)
3042 2950
3043 2951 .do_blockcopyin:
3044 2952 rd %gsr, %o2
3045 2953 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr
3046 2954 or %l6, FPUSED_FLAG, %l6
3047 2955
3048 2956 andcc DST, VIS_BLOCKSIZE - 1, TMP
3049 2957 mov ASI_USER, %asi
3050 2958 bz,pt %ncc, 2f
3051 2959 neg TMP
3052 2960 add TMP, VIS_BLOCKSIZE, TMP
3053 2961
3054 2962 ! TMP = bytes required to align DST on FP_BLOCK boundary
3055 2963 ! Using SRC as a tmp here
3056 2964 cmp TMP, 3
3057 2965 bleu,pt %ncc, 1f
3058 2966 sub CNT,TMP,CNT ! adjust main count
3059 2967 sub TMP, 3, TMP ! adjust for end of loop test
3060 2968 .ci_blkalign:
3061 2969 lduba [REALSRC]%asi, SRC ! move 4 bytes per loop iteration
3062 2970 stb SRC, [DST]
3063 2971 subcc TMP, 4, TMP
3064 2972 lduba [REALSRC + 1]%asi, SRC
3065 2973 add REALSRC, 4, REALSRC
3066 2974 stb SRC, [DST + 1]
3067 2975 lduba [REALSRC - 2]%asi, SRC
3068 2976 add DST, 4, DST
3069 2977 stb SRC, [DST - 2]
3070 2978 lduba [REALSRC - 1]%asi, SRC
3071 2979 bgu,pt %ncc, .ci_blkalign
3072 2980 stb SRC, [DST - 1]
3073 2981
3074 2982 addcc TMP, 3, TMP ! restore count adjustment
3075 2983 bz,pt %ncc, 2f ! no bytes left?
3076 2984 nop
3077 2985 1: lduba [REALSRC]%asi, SRC
3078 2986 inc REALSRC
3079 2987 inc DST
3080 2988 deccc TMP
3081 2989 bgu %ncc, 1b
3082 2990 stb SRC, [DST - 1]
3083 2991
3084 2992 2:
3085 2993 andn REALSRC, 0x7, SRC
3086 2994 alignaddr REALSRC, %g0, %g0
3087 2995
3088 2996 ! SRC - 8-byte aligned
3089 2997 ! DST - 64-byte aligned
3090 2998 prefetcha [SRC]%asi, #one_read
3091 2999 prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #one_read
3092 3000 prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #one_read
3093 3001 prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #one_read
3094 3002 ldda [SRC]%asi, %f16
3095 3003 #if CHEETAH_PREFETCH > 4
3096 3004 prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read
3097 3005 #endif
3098 3006 ldda [SRC + 0x08]%asi, %f18
3099 3007 #if CHEETAH_PREFETCH > 5
3100 3008 prefetcha [SRC + (5 * VIS_BLOCKSIZE)]%asi, #one_read
3101 3009 #endif
3102 3010 ldda [SRC + 0x10]%asi, %f20
3103 3011 #if CHEETAH_PREFETCH > 6
3104 3012 prefetcha [SRC + (6 * VIS_BLOCKSIZE)]%asi, #one_read
3105 3013 #endif
3106 3014 faligndata %f16, %f18, %f48
3107 3015 ldda [SRC + 0x18]%asi, %f22
3108 3016 #if CHEETAH_PREFETCH > 7
3109 3017 prefetcha [SRC + (7 * VIS_BLOCKSIZE)]%asi, #one_read
3110 3018 #endif
3111 3019 faligndata %f18, %f20, %f50
3112 3020 ldda [SRC + 0x20]%asi, %f24
3113 3021 faligndata %f20, %f22, %f52
3114 3022 ldda [SRC + 0x28]%asi, %f26
3115 3023 faligndata %f22, %f24, %f54
3116 3024 ldda [SRC + 0x30]%asi, %f28
3117 3025 faligndata %f24, %f26, %f56
3118 3026 ldda [SRC + 0x38]%asi, %f30
3119 3027 faligndata %f26, %f28, %f58
3120 3028 ldda [SRC + VIS_BLOCKSIZE]%asi, %f16
3121 3029 sub CNT, VIS_BLOCKSIZE, CNT
3122 3030 add SRC, VIS_BLOCKSIZE, SRC
3123 3031 add REALSRC, VIS_BLOCKSIZE, REALSRC
3124 3032 ba,a,pt %ncc, 1f
3125 3033 nop
3126 3034 .align 16
3127 3035 1:
3128 3036 ldda [SRC + 0x08]%asi, %f18
3129 3037 faligndata %f28, %f30, %f60
3130 3038 ldda [SRC + 0x10]%asi, %f20
3131 3039 faligndata %f30, %f16, %f62
3132 3040 stda %f48, [DST]ASI_BLK_P
3133 3041 ldda [SRC + 0x18]%asi, %f22
3134 3042 faligndata %f16, %f18, %f48
3135 3043 ldda [SRC + 0x20]%asi, %f24
3136 3044 faligndata %f18, %f20, %f50
3137 3045 ldda [SRC + 0x28]%asi, %f26
3138 3046 faligndata %f20, %f22, %f52
3139 3047 ldda [SRC + 0x30]%asi, %f28
3140 3048 faligndata %f22, %f24, %f54
3141 3049 ldda [SRC + 0x38]%asi, %f30
3142 3050 faligndata %f24, %f26, %f56
3143 3051 sub CNT, VIS_BLOCKSIZE, CNT
3144 3052 ldda [SRC + VIS_BLOCKSIZE]%asi, %f16
3145 3053 faligndata %f26, %f28, %f58
3146 3054 prefetcha [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8]%asi, #one_read
3147 3055 add DST, VIS_BLOCKSIZE, DST
3148 3056 prefetcha [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
3149 3057 add REALSRC, VIS_BLOCKSIZE, REALSRC
3150 3058 cmp CNT, VIS_BLOCKSIZE + 8
3151 3059 bgu,pt %ncc, 1b
3152 3060 add SRC, VIS_BLOCKSIZE, SRC
3153 3061
3154 3062 ! the fsrc1 path below may be used only if REALSRC & 0x7 is 0
3155 3063 cmp CNT, VIS_BLOCKSIZE
3156 3064 bne %ncc, 3f
3157 3065 andcc REALSRC, 0x7, %g0
3158 3066 bz,pt %ncc, 2f
3159 3067 nop
3160 3068 3:
3161 3069 faligndata %f28, %f30, %f60
3162 3070 faligndata %f30, %f16, %f62
3163 3071 stda %f48, [DST]ASI_BLK_P
3164 3072 add DST, VIS_BLOCKSIZE, DST
3165 3073 ba,pt %ncc, 3f
3166 3074 nop
3167 3075 2:
3168 3076 ldda [SRC + 0x08]%asi, %f18
3169 3077 fsrc1 %f28, %f60
3170 3078 ldda [SRC + 0x10]%asi, %f20
3171 3079 fsrc1 %f30, %f62
3172 3080 stda %f48, [DST]ASI_BLK_P
3173 3081 ldda [SRC + 0x18]%asi, %f22
3174 3082 fsrc1 %f16, %f48
3175 3083 ldda [SRC + 0x20]%asi, %f24
3176 3084 fsrc1 %f18, %f50
3177 3085 ldda [SRC + 0x28]%asi, %f26
3178 3086 fsrc1 %f20, %f52
3179 3087 ldda [SRC + 0x30]%asi, %f28
3180 3088 fsrc1 %f22, %f54
3181 3089 ldda [SRC + 0x38]%asi, %f30
3182 3090 fsrc1 %f24, %f56
3183 3091 sub CNT, VIS_BLOCKSIZE, CNT
3184 3092 add DST, VIS_BLOCKSIZE, DST
3185 3093 add SRC, VIS_BLOCKSIZE, SRC
3186 3094 add REALSRC, VIS_BLOCKSIZE, REALSRC
3187 3095 fsrc1 %f26, %f58
3188 3096 fsrc1 %f28, %f60
3189 3097 fsrc1 %f30, %f62
3190 3098 stda %f48, [DST]ASI_BLK_P
3191 3099 add DST, VIS_BLOCKSIZE, DST
3192 3100 ba,a,pt %ncc, 4f
3193 3101 nop
3194 3102
3195 3103 3: tst CNT
3196 3104 bz,a %ncc, 4f
3197 3105 nop
3198 3106
3199 3107 5: lduba [REALSRC]ASI_USER, TMP
3200 3108 inc REALSRC
3201 3109 inc DST
3202 3110 deccc CNT
3203 3111 bgu %ncc, 5b
3204 3112 stb TMP, [DST - 1]
3205 3113 4:
3206 3114
3207 3115 .copyin_exit:
3208 3116 membar #Sync
3209 3117
3210 3118 FPRAS_INTERVAL(FPRAS_COPYIN, 1, %l5, %o2, %o3, %o4, %o5, 8)
3211 3119 FPRAS_REWRITE_TYPE1(1, %l5, %f48, %o2, 9)
3212 3120 FPRAS_CHECK(FPRAS_COPYIN, %l5, 9) ! lose outputs
3213 3121
3214 3122 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr
3215 3123 wr %o2, 0, %gsr
3216 3124
3217 3125 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
3218 3126 btst FPRS_FEF, %o3
3219 3127 bz,pt %icc, 4f
3220 3128 nop
3221 3129
3222 3130 BLD_FPQ2Q4_FROMSTACK(%o2)
3223 3131
3224 3132 ba,pt %ncc, 1f
3225 3133 wr %o3, 0, %fprs ! restore fprs
3226 3134
3227 3135 4:
3228 3136 FZEROQ2Q4
3229 3137 wr %o3, 0, %fprs ! restore fprs
3230 3138
3231 3139 1:
3232 3140 membar #Sync ! sync error barrier
3233 3141 andn %l6, FPUSED_FLAG, %l6
3234 3142 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3235 3143 FP_ALLOWMIGRATE(5, 6)
3236 3144 ret
3237 3145 restore %g0, 0, %o0
3238 3146 /*
3239 3147 * We got here because of a fault during copyin
3240 3148 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
3241 3149 */
3242 3150 .copyin_err:
3243 3151 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
3244 3152 tst %o4
3245 3153 bz,pt %ncc, 2f ! if not, return error
3246 3154 nop
3247 3155 ldn [%o4 + CP_COPYIN], %g2 ! if handler, invoke it with
3248 3156 jmp %g2 ! original arguments
3249 3157 restore %g0, 0, %g0 ! dispose of copy window
3250 3158 2:
3251 3159 ret
3252 3160 restore %g0, -1, %o0 ! return error value
3253 3161
3254 3162
3255 3163 SET_SIZE(copyin_more)
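The block loop in .do_copyin leans on the VIS faligndata instruction to
realign a misaligned source. A scalar C model of its effect, assuming
big-endian byte numbering and the byte offset previously latched by
alignaddr (an illustration, not the instruction's formal definition):

        uint64_t
        faligndata_model(uint64_t hi, uint64_t lo, unsigned shift)
        {
                if (shift == 0)         /* already aligned */
                        return (hi);
                /* extract 8 bytes starting `shift' bytes into hi:lo */
                return ((hi << (8 * shift)) | (lo >> (8 * (8 - shift))));
        }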
3256 3164
3257 -#endif /* lint */
3258 -
3259 -#ifdef lint
3260 -
3261 -/*ARGSUSED*/
3262 -int
3263 -xcopyin(const void *uaddr, void *kaddr, size_t count)
3264 -{ return (0); }
3265 -
3266 -#else /* lint */
3267 -
3268 3165 ENTRY(xcopyin)
3269 3166
3270 3167 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
3271 3168 bleu,pt %ncc, .xcopyin_small ! go to small copy
3272 3169 xor %o0, %o1, %o3 ! are src, dst alignable?
3273 3170 btst 7, %o3 !
3274 3171 bz,pt %ncc, .xcopyin_8 ! check for longword alignment
3275 3172 nop
3276 3173 btst 1, %o3 !
3277 3174 bz,pt %ncc, .xcopyin_2 ! check for half-word
3278 3175 nop
3279 3176 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
3280 3177 ld [%o3 + %lo(hw_copy_limit_1)], %o3
3281 3178 tst %o3
3282 3179 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
3283 3180 cmp %o2, %o3 ! if length <= limit
3284 3181 bleu,pt %ncc, .xcopyin_small ! go to small copy
3285 3182 nop
3286 3183 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
3287 3184 nop
3288 3185 .xcopyin_2:
3289 3186 btst 3, %o3 !
3290 3187 bz,pt %ncc, .xcopyin_4 ! check for word alignment
3291 3188 nop
3292 3189 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
3293 3190 ld [%o3 + %lo(hw_copy_limit_2)], %o3
3294 3191 tst %o3
3295 3192 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
3296 3193 cmp %o2, %o3 ! if length <= limit
3297 3194 bleu,pt %ncc, .xcopyin_small ! go to small copy
3298 3195 nop
3299 3196 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
3300 3197 nop
3301 3198 .xcopyin_4:
3302 3199 ! already checked longword, must be word aligned
3303 3200 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
3304 3201 ld [%o3 + %lo(hw_copy_limit_4)], %o3
3305 3202 tst %o3
3306 3203 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
3307 3204 cmp %o2, %o3 ! if length <= limit
3308 3205 bleu,pt %ncc, .xcopyin_small ! go to small copy
3309 3206 nop
3310 3207 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
3311 3208 nop
3312 3209 .xcopyin_8:
3313 3210 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
3314 3211 ld [%o3 + %lo(hw_copy_limit_8)], %o3
3315 3212 tst %o3
3316 3213 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
3317 3214 cmp %o2, %o3 ! if length <= limit
3318 3215 bleu,pt %ncc, .xcopyin_small ! go to small copy
3319 3216 nop
3320 3217 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
3321 3218 nop
3322 3219
3323 3220 .xcopyin_small:
3324 3221 sethi %hi(.sm_xcopyin_err), %o5 ! .sm_xcopyin_err is lofault value
3325 3222 or %o5, %lo(.sm_xcopyin_err), %o5
3326 3223 ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofault
3327 3224 membar #Sync ! sync error barrier
3328 3225 ba,pt %ncc, .sm_do_copyin ! common code
3329 3226 stn %o5, [THREAD_REG + T_LOFAULT]
3330 3227
3331 3228 .xcopyin_more:
3332 3229 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3333 3230 sethi %hi(.xcopyin_err), REAL_LOFAULT ! .xcopyin_err is lofault value
3334 3231 ba,pt %ncc, .do_copyin
3335 3232 or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
3336 3233
3337 3234 /*
3338 3235 * We got here because of fault during xcopyin
3339 3236 * Errno value is in ERRNO
3340 3237 */
3341 3238 .xcopyin_err:
3342 3239 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
3343 3240 tst %o4
3344 3241 bz,pt %ncc, 2f ! if not, return error
3345 3242 nop
3346 3243 ldn [%o4 + CP_XCOPYIN], %g2 ! if handler, invoke it with
3347 3244 jmp %g2 ! original arguments
3348 3245 restore %g0, 0, %g0 ! dispose of copy window
3349 3246 2:
3350 3247 ret
3351 3248 restore ERRNO, 0, %o0 ! return errno value
3352 3249
3353 3250 .sm_xcopyin_err:
3354 3251
3355 3252 membar #Sync
3356 3253 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3357 3254 mov SM_SAVE_SRC, %o0
3358 3255 mov SM_SAVE_DST, %o1
3359 3256 mov SM_SAVE_COUNT, %o2
3360 3257 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler
3361 3258 tst %o3
3362 3259 bz,pt %ncc, 3f ! if not, return error
3363 3260 nop
3364 3261 ldn [%o3 + CP_XCOPYIN], %o5 ! if handler, invoke it with
3365 3262 jmp %o5 ! original arguments
3366 3263 nop
3367 3264 3:
3368 3265 retl
3369 3266 or %g1, 0, %o0 ! return errno value
3370 3267
3371 3268 SET_SIZE(xcopyin)
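The xcopyin entry sequence above reduces to the following C logic, where a
zero hw_copy_limit_N (defined at the end of this file) disables the FP/VIS
path for that alignment class; leaf_copy()/fp_block_copy() are hypothetical
stand-ins for the .xcopyin_small and .xcopyin_more paths (sketch only):

        uintptr_t x = (uintptr_t)uaddr ^ (uintptr_t)kaddr;
        uint_t lim;

        if ((x & 7) == 0)
                lim = hw_copy_limit_8;  /* longword alignable */
        else if (x & 1)
                lim = hw_copy_limit_1;  /* byte alignment only */
        else if (x & 3)
                lim = hw_copy_limit_2;  /* halfword alignable */
        else
                lim = hw_copy_limit_4;  /* word alignable */

        if (count <= VIS_COPY_THRESHOLD || lim == 0 || count <= lim)
                return (leaf_copy(uaddr, kaddr, count));        /* small */
        return (fp_block_copy(uaddr, kaddr, count));            /* FP/VIS */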
3372 3269
3373 -#endif /* lint */
3374 -
3375 -#ifdef lint
3376 -
3377 -/*ARGSUSED*/
3378 -int
3379 -xcopyin_little(const void *uaddr, void *kaddr, size_t count)
3380 -{ return (0); }
3381 -
3382 -#else /* lint */
3383 -
3384 3270 ENTRY(xcopyin_little)
3385 3271 sethi %hi(.xcopyio_err), %o5
3386 3272 or %o5, %lo(.xcopyio_err), %o5
3387 3273 ldn [THREAD_REG + T_LOFAULT], %o4
3388 3274 membar #Sync ! sync error barrier
3389 3275 stn %o5, [THREAD_REG + T_LOFAULT]
3390 3276 mov %o4, %o5
3391 3277
3392 3278 subcc %g0, %o2, %o3
3393 3279 add %o0, %o2, %o0
3394 3280 bz,pn %ncc, 2f ! check for zero bytes
3395 3281 sub %o2, 1, %o4
3396 3282 add %o0, %o4, %o0 ! start w/last byte
3397 3283 add %o1, %o2, %o1
3398 3284 lduba [%o0 + %o3]ASI_AIUSL, %o4
3399 3285
3400 3286 1: stb %o4, [%o1 + %o3]
3401 3287 inccc %o3
3402 3288 sub %o0, 2, %o0 ! get next byte
3403 3289 bcc,a,pt %ncc, 1b
3404 3290 lduba [%o0 + %o3]ASI_AIUSL, %o4
3405 3291
3406 3292 2:
3407 3293 membar #Sync ! sync error barrier
3408 3294 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3409 3295 retl
3410 3296 mov %g0, %o0 ! return (0)
3411 3297
3412 3298 .xcopyio_err:
3413 3299 membar #Sync ! sync error barrier
3414 3300 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3415 3301 retl
3416 3302 mov %g1, %o0
3417 3303
3418 3304 SET_SIZE(xcopyin_little)
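As read from the address arithmetic above (the source cursor starts at the
last byte and walks down while the destination walks up), xcopyin_little
performs a byte-reversing copy from user space. A sketch of the observable
effect, with user_byte() standing in for the lduba [...]ASI_AIUSL access:

        size_t i;

        for (i = 0; i < count; i++)     /* kaddr[0] gets uaddr[count - 1] */
                ((uint8_t *)kaddr)[i] =
                    user_byte((const uint8_t *)uaddr + (count - 1 - i));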
3419 3305
3420 -#endif /* lint */
3421 3306
3422 -
3423 3307 /*
3424 3308 * Copy a block of storage - must not overlap (from + len <= to).
3425 3309 * No fault handler installed (to be called under on_fault())
3426 3310 */
3427 -#if defined(lint)
3428 -
3429 -/* ARGSUSED */
3430 -void
3431 -copyin_noerr(const void *ufrom, void *kto, size_t count)
3432 -{}
3433 -
3434 -#else /* lint */
3435 3311 ENTRY(copyin_noerr)
3436 3312
3437 3313 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
3438 3314 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3439 3315 xor %o0, %o1, %o3 ! are src, dst alignable?
3440 3316 btst 7, %o3 !
3441 3317 bz,pt %ncc, .copyin_ne_8 ! check for longword alignment
3442 3318 nop
3443 3319 btst 1, %o3 !
3444 3320 bz,pt %ncc, .copyin_ne_2 ! check for half-word
3445 3321 nop
3446 3322 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
3447 3323 ld [%o3 + %lo(hw_copy_limit_1)], %o3
3448 3324 tst %o3
3449 3325 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
3450 3326 cmp %o2, %o3 ! if length <= limit
3451 3327 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3452 3328 nop
3453 3329 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
3454 3330 nop
3455 3331 .copyin_ne_2:
3456 3332 btst 3, %o3 !
3457 3333 bz,pt %ncc, .copyin_ne_4 ! check for word alignment
3458 3334 nop
3459 3335 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
3460 3336 ld [%o3 + %lo(hw_copy_limit_2)], %o3
3461 3337 tst %o3
3462 3338 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
3463 3339 cmp %o2, %o3 ! if length <= limit
3464 3340 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3465 3341 nop
3466 3342 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
3467 3343 nop
3468 3344 .copyin_ne_4:
3469 3345 ! already checked longword, must be word aligned
3470 3346 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
3471 3347 ld [%o3 + %lo(hw_copy_limit_4)], %o3
3472 3348 tst %o3
3473 3349 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
3474 3350 cmp %o2, %o3 ! if length <= limit
3475 3351 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3476 3352 nop
3477 3353 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
3478 3354 nop
3479 3355 .copyin_ne_8:
3480 3356 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
3481 3357 ld [%o3 + %lo(hw_copy_limit_8)], %o3
3482 3358 tst %o3
3483 3359 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
3484 3360 cmp %o2, %o3 ! if length <= limit
3485 3361 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3486 3362 nop
3487 3363 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
3488 3364 nop
3489 3365
3490 3366 .copyin_ne_small:
3491 3367 ldn [THREAD_REG + T_LOFAULT], %o4
3492 3368 tst %o4
3493 3369 bz,pn %ncc, .sm_do_copyin
3494 3370 nop
3495 3371 sethi %hi(.sm_copyio_noerr), %o5
3496 3372 or %o5, %lo(.sm_copyio_noerr), %o5
3497 3373 membar #Sync ! sync error barrier
3498 3374 ba,pt %ncc, .sm_do_copyin
3499 3375 stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault
3500 3376
3501 3377 .copyin_noerr_more:
3502 3378 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3503 3379 sethi %hi(.copyio_noerr), REAL_LOFAULT
3504 3380 ba,pt %ncc, .do_copyin
3505 3381 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3506 3382
3507 3383 .copyio_noerr:
3508 3384 jmp %l6
3509 3385 restore %g0,0,%g0
3510 3386
3511 3387 .sm_copyio_noerr:
3512 3388 membar #Sync
3513 3389 stn %o4, [THREAD_REG + T_LOFAULT] ! restore t_lofault
3514 3390 jmp %o4
3515 3391 nop
3516 3392
3517 3393 SET_SIZE(copyin_noerr)
3518 -#endif /* lint */
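Because copyin_noerr installs no fault handler of its own (beyond chaining
to an already-installed t_lofault), callers are expected to protect it with
on_fault(). An illustrative pattern, assuming the usual on_fault()/no_fault()
kernel interfaces:

        label_t ljb;

        if (on_fault(&ljb)) {           /* nonzero: a fault occurred */
                no_fault();
                return (EFAULT);
        }
        copyin_noerr(ufrom, kto, count);
        no_fault();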
3519 3394
3520 3395 /*
3521 3396 * Copy a block of storage - must not overlap (from + len <= to).
3522 3397 * No fault handler installed (to be called under on_fault())
3523 3398 */
3524 3399
3525 -#if defined(lint)
3526 -
3527 -/* ARGSUSED */
3528 -void
3529 -copyout_noerr(const void *kfrom, void *uto, size_t count)
3530 -{}
3531 -
3532 -#else /* lint */
3533 3400 ENTRY(copyout_noerr)
3534 3401
3535 3402 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
3536 3403 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3537 3404 xor %o0, %o1, %o3 ! are src, dst alignable?
3538 3405 btst 7, %o3 !
3539 3406 bz,pt %ncc, .copyout_ne_8 ! check for longword alignment
3540 3407 nop
3541 3408 btst 1, %o3 !
3542 3409 bz,pt %ncc, .copyout_ne_2 ! check for half-word
3543 3410 nop
3544 3411 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
3545 3412 ld [%o3 + %lo(hw_copy_limit_1)], %o3
3546 3413 tst %o3
3547 3414 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
3548 3415 cmp %o2, %o3 ! if length <= limit
3549 3416 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3550 3417 nop
3551 3418 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
3552 3419 nop
3553 3420 .copyout_ne_2:
3554 3421 btst 3, %o3 !
3555 3422 bz,pt %ncc, .copyout_ne_4 ! check for word alignment
3556 3423 nop
3557 3424 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
3558 3425 ld [%o3 + %lo(hw_copy_limit_2)], %o3
3559 3426 tst %o3
3560 3427 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
3561 3428 cmp %o2, %o3 ! if length <= limit
3562 3429 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3563 3430 nop
3564 3431 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
3565 3432 nop
3566 3433 .copyout_ne_4:
3567 3434 ! already checked longword, must be word aligned
3568 3435 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
3569 3436 ld [%o3 + %lo(hw_copy_limit_4)], %o3
3570 3437 tst %o3
3571 3438 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
3572 3439 cmp %o2, %o3 ! if length <= limit
3573 3440 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3574 3441 nop
3575 3442 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
3576 3443 nop
3577 3444 .copyout_ne_8:
3578 3445 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
3579 3446 ld [%o3 + %lo(hw_copy_limit_8)], %o3
3580 3447 tst %o3
3581 3448 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
3582 3449 cmp %o2, %o3 ! if length <= limit
3583 3450 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3584 3451 nop
3585 3452 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
3586 3453 nop
3587 3454
3588 3455 .copyout_ne_small:
3589 3456 ldn [THREAD_REG + T_LOFAULT], %o4
3590 3457 tst %o4
3591 3458 bz,pn %ncc, .sm_do_copyout
3592 3459 nop
3593 3460 sethi %hi(.sm_copyio_noerr), %o5
3594 3461 or %o5, %lo(.sm_copyio_noerr), %o5
3595 3462 membar #Sync ! sync error barrier
3596 3463 ba,pt %ncc, .sm_do_copyout
3597 3464 stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault
3598 3465
3599 3466 .copyout_noerr_more:
3600 3467 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3601 3468 sethi %hi(.copyio_noerr), REAL_LOFAULT
3602 3469 ba,pt %ncc, .do_copyout
3603 3470 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3604 3471
3605 3472 SET_SIZE(copyout_noerr)
3606 -#endif /* lint */
3607 3473
3608 3474
3609 3475 /*
3610 3476 * hwblkclr - clears block-aligned, block-multiple-sized regions that
3611 3477 * are at least 256 bytes long, using Spitfire-style block stores. If
3612 3478 * the criteria for using this routine are not met then it calls bzero
3613 3479 * and returns 1. Otherwise 0 is returned indicating success.
3614 3480 * Caller is responsible for ensuring use_hw_bzero is true and that
3615 3481 * kpreempt_disable() has been called.
3616 3482 */
3617 -#ifdef lint
3618 -/*ARGSUSED*/
3619 -int
3620 -hwblkclr(void *addr, size_t len)
3621 -{
3622 - return(0);
3623 -}
3624 -#else /* lint */
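The entry tests described in the comment above amount to the following C
check (a sketch; the real work is the block-store loop that follows):

        if (((uintptr_t)addr & (VIS_BLOCKSIZE - 1)) != 0 ||     /* aligned? */
            len < 256 ||                                        /* long enough? */
            (len & (VIS_BLOCKSIZE - 1)) != 0) {                 /* block multiple? */
                bzero(addr, len);
                return (1);             /* punted to bzero */
        }
        /* ... FP block stores ... */
        return (0);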
3625 3483 ! %i0 - start address
3626 3484 ! %i1 - length of region (multiple of 64)
3627 3485 ! %l0 - saved fprs
3628 3486 ! %l1 - pointer to saved %d0 block
3629 3487 ! %l2 - saved curthread->t_lwp
3630 3488
3631 3489 ENTRY(hwblkclr)
3632 3490 ! get another window w/space for one aligned block of saved fpregs
3633 3491 save %sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp
3634 3492
3635 3493 ! Must be block-aligned
3636 3494 andcc %i0, (VIS_BLOCKSIZE-1), %g0
3637 3495 bnz,pn %ncc, 1f
3638 3496 nop
3639 3497
3640 3498 ! ... and must be 256 bytes or more
3641 3499 cmp %i1, 256
3642 3500 blu,pn %ncc, 1f
3643 3501 nop
3644 3502
3645 3503 ! ... and length must be a multiple of VIS_BLOCKSIZE
3646 3504 andcc %i1, (VIS_BLOCKSIZE-1), %g0
3647 3505 bz,pn %ncc, 2f
3648 3506 nop
3649 3507
3650 3508 1: ! punt, call bzero but notify the caller that bzero was used
3651 3509 mov %i0, %o0
3652 3510 call bzero
3653 3511 mov %i1, %o1
3654 3512 ret
3655 3513 restore %g0, 1, %o0 ! return (1) - did not use block operations
3656 3514
3657 3515 2: rd %fprs, %l0 ! check for unused fp
3658 3516 btst FPRS_FEF, %l0
3659 3517 bz,pt %icc, 1f
3660 3518 nop
3661 3519
3662 3520 ! save in-use fpregs on stack
3663 3521 membar #Sync
3664 3522 add %fp, STACK_BIAS - 65, %l1
3665 3523 and %l1, -VIS_BLOCKSIZE, %l1
3666 3524 stda %d0, [%l1]ASI_BLK_P
3667 3525
3668 3526 1: membar #StoreStore|#StoreLoad|#LoadStore
3669 3527 wr %g0, FPRS_FEF, %fprs
3670 3528 wr %g0, ASI_BLK_P, %asi
3671 3529
3672 3530 ! Clear block
3673 3531 fzero %d0
3674 3532 fzero %d2
3675 3533 fzero %d4
3676 3534 fzero %d6
3677 3535 fzero %d8
3678 3536 fzero %d10
3679 3537 fzero %d12
3680 3538 fzero %d14
3681 3539
3682 3540 mov 256, %i3
3683 3541 ba,pt %ncc, .pz_doblock
3684 3542 nop
3685 3543
3686 3544 .pz_blkstart:
3687 3545 ! stda %d0, [%i0 + 192]%asi ! in dly slot of branch that got us here
3688 3546 stda %d0, [%i0 + 128]%asi
3689 3547 stda %d0, [%i0 + 64]%asi
3690 3548 stda %d0, [%i0]%asi
3691 3549 .pz_zinst:
3692 3550 add %i0, %i3, %i0
3693 3551 sub %i1, %i3, %i1
3694 3552 .pz_doblock:
3695 3553 cmp %i1, 256
3696 3554 bgeu,a %ncc, .pz_blkstart
3697 3555 stda %d0, [%i0 + 192]%asi
3698 3556
3699 3557 cmp %i1, 64
3700 3558 blu %ncc, .pz_finish
3701 3559
3702 3560 andn %i1, (64-1), %i3
3703 3561 srl %i3, 4, %i2 ! using blocks, 1 instr / 16 words
3704 3562 set .pz_zinst, %i4
3705 3563 sub %i4, %i2, %i4
3706 3564 jmp %i4
3707 3565 nop
3708 3566
3709 3567 .pz_finish:
3710 3568 membar #Sync
3711 3569 btst FPRS_FEF, %l0
3712 3570 bz,a .pz_finished
3713 3571 wr %l0, 0, %fprs ! restore fprs
3714 3572
3715 3573 ! restore fpregs from stack
3716 3574 ldda [%l1]ASI_BLK_P, %d0
3717 3575 membar #Sync
3718 3576 wr %l0, 0, %fprs ! restore fprs
3719 3577
3720 3578 .pz_finished:
3721 3579 ret
3722 3580 restore %g0, 0, %o0 ! return (bzero or not)
3723 3581
3724 3582 SET_SIZE(hwblkclr)
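The computed jump through .pz_zinst is a Duff's-device-style entry into the
store sequence: for a remaining length that is a multiple of 64 and less
than 256, it lands so that exactly len/64 block stores execute. In C terms,
with blkstore() as a hypothetical stand-in for stda ...ASI_BLK_P:

        switch (len / 64) {             /* len: multiple of 64, < 256 */
        case 3:
                blkstore(addr + 128);   /* FALLTHROUGH */
        case 2:
                blkstore(addr + 64);    /* FALLTHROUGH */
        case 1:
                blkstore(addr);
        }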
3725 -#endif /* lint */
3726 3583
3727 -#ifdef lint
3728 -/*ARGSUSED*/
3729 -void
3730 -hw_pa_bcopy32(uint64_t src, uint64_t dst)
3731 -{}
3732 -#else /*!lint */
3733 3584 /*
3734 3585 * Copy 32 bytes of data from src (%o0) to dst (%o1)
3735 3586 * using physical addresses.
3736 3587 */
3737 3588 ENTRY_NP(hw_pa_bcopy32)
3738 3589 rdpr %pstate, %g1
3739 3590 andn %g1, PSTATE_IE, %g2
3740 3591 wrpr %g0, %g2, %pstate
3741 3592
3742 3593 rdpr %pstate, %g0
3743 3594 ldxa [%o0]ASI_MEM, %o2
3744 3595 add %o0, 8, %o0
3745 3596 ldxa [%o0]ASI_MEM, %o3
3746 3597 add %o0, 8, %o0
3747 3598 ldxa [%o0]ASI_MEM, %o4
3748 3599 add %o0, 8, %o0
3749 3600 ldxa [%o0]ASI_MEM, %o5
3750 3601
3751 3602 stxa %g0, [%o1]ASI_DC_INVAL
3752 3603 membar #Sync
3753 3604
3754 3605 stxa %o2, [%o1]ASI_MEM
3755 3606 add %o1, 8, %o1
3756 3607 stxa %o3, [%o1]ASI_MEM
3757 3608 add %o1, 8, %o1
3758 3609 stxa %o4, [%o1]ASI_MEM
3759 3610 add %o1, 8, %o1
3760 3611 stxa %o5, [%o1]ASI_MEM
3761 3612
3762 3613 retl
3763 3614 wrpr %g0, %g1, %pstate
3764 3615
3765 3616 SET_SIZE(hw_pa_bcopy32)
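Semantically, hw_pa_bcopy32 does the following with interrupts disabled for
the duration; ldphys()/stphys() stand in here for the ldxa/stxa ASI_MEM
accesses and dcache_inval() models the stxa to ASI_DC_INVAL (sketch only):

        uint64_t buf[4];
        int i;

        for (i = 0; i < 4; i++)         /* four 8-byte physical loads */
                buf[i] = ldphys(src + 8 * i);
        dcache_inval(dst);              /* invalidate dst's D$ line */
        for (i = 0; i < 4; i++)         /* four 8-byte physical stores */
                stphys(dst + 8 * i, buf[i]);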
3766 3617
3767 -#endif /* lint */
3768 -
3769 -#if defined(lint)
3770 -
3771 -int use_hw_bcopy = 1;
3772 -int use_hw_bzero = 1;
3773 -uint_t hw_copy_limit_1 = 0;
3774 -uint_t hw_copy_limit_2 = 0;
3775 -uint_t hw_copy_limit_4 = 0;
3776 -uint_t hw_copy_limit_8 = 0;
3777 -
3778 -#else /* !lint */
3779 -
3780 3618 DGDEF(use_hw_bcopy)
3781 3619 .word 1
3782 3620 DGDEF(use_hw_bzero)
3783 3621 .word 1
3784 3622 DGDEF(hw_copy_limit_1)
3785 3623 .word 0
3786 3624 DGDEF(hw_copy_limit_2)
3787 3625 .word 0
3788 3626 DGDEF(hw_copy_limit_4)
3789 3627 .word 0
3790 3628 DGDEF(hw_copy_limit_8)
3791 3629 .word 0
3792 3630
3793 3631 .align 64
3794 3632 .section ".text"
3795 -#endif /* !lint */
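The words above back the use_hw_bcopy/use_hw_bzero switches and the
hw_copy_limit_N thresholds consulted throughout this file; the switches
default to 1 and the limits to 0 (FP/VIS copy disabled) until startup code
sets them. They can also be overridden from /etc/system; the values below
are illustrative only:

        set use_hw_bcopy=1
        set hw_copy_limit_8=0x400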