/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/asm_linkage.h>
#include <sys/vtrace.h>
#include <sys/machthread.h>
#include <sys/clock.h>
#include <sys/asi.h>
#include <sys/fsr.h>
#include <sys/privregs.h>

#include "assym.h"

/*
 * Pseudo-code to aid in understanding the control flow of the
 * bcopy/copyin/copyout routines.
 *
 * On entry:
 *
 *	! Determine whether to use the FP register version
 *	! or the leaf routine version depending on size
 *	! of copy and flags.  Set up error handling accordingly.
 *	! The transition point depends on whether the src and
 *	! dst addresses can be aligned to long word, word,
 *	! half word, or byte boundaries.
 *	!
 *	! WARNING: <Register usage convention>
 *	! For FP version, %l6 holds previous error handling and
 *	! a flag: TRAMP_FLAG (low bits)
 *	! for leaf routine version, %o4 holds those values.
 *	! So either %l6 or %o4 is reserved and not available for
 *	! any other use.
 *
 *	if (length <= VIS_COPY_THRESHOLD)	! start with a quick test
 *		go to small_copy;		! to speed short copies
 *
 *	if (src,dst long word alignable) {
 *		if (hw_copy_limit_8 == 0)	! hw_copy disabled
 *			go to small_copy;
 *		if (length <= hw_copy_limit_8)
 *			go to small_copy;
 *		go to FPBLK_copy;
 *	}
 *	if (src,dst not alignable) {
 *		if (hw_copy_limit_1 == 0)	! hw_copy disabled
 *			go to small_copy;
 *		if (length <= hw_copy_limit_1)
 *			go to small_copy;
 *		go to FPBLK_copy;
 *	}
 *	if (src,dst halfword alignable) {
 *		if (hw_copy_limit_2 == 0)	! hw_copy disabled
 *			go to small_copy;
 *		if (length <= hw_copy_limit_2)
 *			go to small_copy;
 *		go to FPBLK_copy;
 *	}
 *	if (src,dst word alignable) {
 *		if (hw_copy_limit_4 == 0)	! hw_copy disabled
 *			go to small_copy;
 *		if (length <= hw_copy_limit_4)
 *			go to small_copy;
 *		go to FPBLK_copy;
 *	}
 *
 * small_copy:
 *	Setup_leaf_rtn_error_handler;		! diffs for each entry point
 *
 *	if (count <= 3)				! fast path for tiny copies
 *		go to sm_left;			! special finish up code
 *	else
 *		if (count > CHKSIZE)		! medium sized copies
 *			go to sm_med		! tuned by alignment
 *	if (src&dst not both word aligned) {
 * sm_movebytes:
 *		move byte by byte in 4-way unrolled loop
 *		fall into sm_left;
 * sm_left:
 *		move 0-3 bytes byte at a time as needed.
 *		restore error handler and exit.
 *
 *	} else {	! src&dst are word aligned
 *		check for at least 8 bytes left,
 *		move word at a time, unrolled by 2
 *		when fewer than 8 bytes left,
 * sm_half:	move half word at a time while 2 or more bytes left
 * sm_byte:	move final byte if necessary
 * sm_exit:
 *		restore error handler and exit.
 *	}
 *
 * ! Medium length cases with at least CHKSIZE bytes available
 * ! method: line up src and dst as best possible, then
 * ! move data in 4-way unrolled loops.
 *
 * sm_med:
 *	if (src&dst unalignable)
 *		go to sm_movebytes
 *	if (src&dst halfword alignable)
 *		go to sm_movehalf
 *	if (src&dst word alignable)
 *		go to sm_moveword
 * ! fall into long word movement
 *	move bytes until src is word aligned
 *	if not long word aligned, move a word
 *	move long words in 4-way unrolled loop until < 32 bytes left
 *	move long words in 1-way unrolled loop until < 8 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 * sm_moveword:
 *	move bytes until src is word aligned
 *	move words in 4-way unrolled loop until < 16 bytes left
 *	move words in 1-way unrolled loop until < 4 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 * sm_movehalf:
 *	move a byte if needed to align src on halfword
 *	move halfwords in 4-way unrolled loop until < 8 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 *
 * FPBLK_copy:
 *	%l6 = curthread->t_lofault;
 *	if (%l6 != NULL) {
 *		membar #Sync
 *		curthread->t_lofault = .copyerr;
 *		caller_error_handler = TRUE		! %l6 |= 2
 *	}
 *
 *	! for FPU testing we must not migrate cpus
 *	if (curthread->t_lwp == NULL) {
 *		! Kernel threads do not have pcb's in which to store
 *		! the floating point state, so disallow preemption during
 *		! the copy.  This also prevents cpu migration.
 *		kpreempt_disable(curthread);
 *	} else {
 *		thread_nomigrate();
 *	}
 *
 *	old_fprs = %fprs;
 *	old_gsr = %gsr;
 *	if (%fprs.fef) {
 *		%fprs.fef = 1;
 *		save current fpregs on stack using blockstore
 *	} else {
 *		%fprs.fef = 1;
 *	}
 *
 *
 *	do_blockcopy_here;
 *
 * In lofault handler:
 *	curthread->t_lofault = .copyerr2;
 *	Continue on with the normal exit handler
 *
 * On normal exit:
 *	%gsr = old_gsr;
 *	if (old_fprs & FPRS_FEF)
 *		restore fpregs from stack using blockload
 *	else
 *		zero fpregs
 *	%fprs = old_fprs;
 *	membar #Sync
 *	curthread->t_lofault = (%l6 & ~3);
 *	! following test omitted from copyin/copyout as they
 *	! will always have a current thread
 *	if (curthread->t_lwp == NULL)
 *		kpreempt_enable(curthread);
 *	else
 *		thread_allowmigrate();
 *	return (0)
 *
 * In second lofault handler (.copyerr2):
 *	We've tried to restore fp state from the stack and failed.  To
 *	avoid returning with a corrupted fp state, we will panic.
 */
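/*
 * A worked example of the entry decision (illustrative only, using the
 * May 2005 default limits quoted further below): a 512-byte bcopy with
 * src and dst both long word alignable fails the quick test (512 > 256),
 * finds hw_copy_limit_8 = 1024 nonzero, and since 512 <= 1024 it is
 * handled by the small_copy/sm_med path; the same copy at 2048 bytes
 * would exceed the limit and take the FPBLK_copy path instead.
 */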
/*
 * Comments about optimization choices
 *
 * The initial optimization decision in this code is to determine
 * whether to use the FP registers for a copy or not.  If we don't
 * use the FP registers, we can execute the copy as a leaf routine,
 * saving a register save and restore.  Also, less elaborate setup
 * is required, allowing short copies to be completed more quickly.
 * For longer copies, especially unaligned ones (where the src and
 * dst do not align to allow simple ldx,stx operation), the FP
 * registers allow much faster copy operations.
 *
 * The estimated extra cost of the FP path will vary depending on
 * src/dst alignment, dst offset from the next 64 byte FPblock store
 * boundary, remaining src data after the last full dst cache line is
 * moved, whether the FP registers need to be saved, and some other
 * minor issues.  The average additional overhead is estimated to be
 * 400 clocks.  Since each non-repeated/predicted tst and branch costs
 * around 10 clocks, elaborate calculation would slow down all
 * longer copies and only benefit a small portion of medium sized
 * copies.  Rather than incur such cost, we chose fixed transition
 * points for each of the alignment choices.
 *
 * For the inner loop, here is a comparison of the per cache line
 * costs for each alignment when src&dst are in cache:
 *
 *	byte aligned:	108 clocks slower for non-FPBLK
 *	half aligned:	 44 clocks slower for non-FPBLK
 *	word aligned:	 12 clocks slower for non-FPBLK
 *	long aligned:	  4 clocks >>faster<< for non-FPBLK
 *
 * The long aligned loop runs faster because it does no prefetching.
 * That wins if the data is not in cache or there is too little
 * data to gain much benefit from prefetching.  But when there
 * is more data and that data is not in cache, failing to prefetch
 * can run much slower.  In addition, there is a 2 Kbyte store queue
 * which will cause the non-FPBLK inner loop to slow for larger copies.
 * The exact tradeoff is strongly load and application dependent, with
 * increasing risk of a customer visible performance regression if the
 * non-FPBLK code is used for larger copies.  Studies of synthetic in-cache
 * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
 * upper limit for the non-FPBLK code.  To minimize performance regression
 * risk while still gaining the primary benefits of the improvements to
 * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
 * hw_copy_limit_*.  Later experimental studies using different values
 * of hw_copy_limit_* can be used to make further adjustments if
 * appropriate.
 *
 *	hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
 *	hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
 *	hw_copy_limit_4 = src and dst are word aligned but not longword aligned
 *	hw_copy_limit_8 = src and dst are longword aligned
 *
 * To say that src and dst are word aligned means that after
 * some initial alignment activity of moving 0 to 3 bytes,
 * both the src and dst will be on word boundaries so that
 * word loads and stores may be used.
 *
 * Default values as of May 2005 are:
 *	hw_copy_limit_1 = 256
 *	hw_copy_limit_2 = 512
 *	hw_copy_limit_4 = 1024
 *	hw_copy_limit_8 = 1024 (or 1536 on some systems)
 *
 *
 * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
 * disabled for that alignment choice.
 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256)
 * the value of VIS_COPY_THRESHOLD is used.
 * It is not envisioned that hw_copy_limit_? will be changed in the field.
 * It is provided to allow for disabling FPBLK copies and to allow
 * easy testing of alternate values on future HW implementations
 * that might have different cache sizes, clock rates or instruction
 * timing rules.
 *
 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
 * threshold to speed up all shorter copies (less than 256).  That
 * saves an alignment test, memory reference, and enabling test
 * for all short copies, or an estimated 24 clocks.
 *
 * The order in which these limits are checked does matter since each
 * non-predicted tst and branch costs around 10 clocks.
 * If src and dst are randomly selected addresses,
 *	4 of 8 will not be alignable.
 *	2 of 8 will be half word alignable.
 *	1 of 8 will be word alignable.
 *	1 of 8 will be long word alignable.
 * But, tests on running kernels show that src and dst to copy code
 * are typically not on random alignments.  Structure copies and
 * copies of larger data sizes are often on long word boundaries.
 * So we test the long word alignment case first, then
 * the byte alignment, then halfword, then word alignment.
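 *
 * The alignment classification itself is cheap: xoring src and dst
 * leaves low-order bits set wherever the two addresses can never be
 * brought to a common boundary.  A hedged C sketch of the selection
 * logic these entry tests implement (the limit names are the real
 * kernel variables; the structure here is illustrative):
 *
 *	x = (src ^ dst) & 7;
 *	if (x == 0)		limit = hw_copy_limit_8;
 *	else if (x & 1)		limit = hw_copy_limit_1;	! odd: bytes only
 *	else if (x & 2)		limit = hw_copy_limit_2;
 *	else			limit = hw_copy_limit_4;	! x == 4
 *	if (limit == 0 || length <= limit)
 *		go to small_copy;
 *	go to FPBLK_copy;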
 *
 * Several times, tests for length are made to split the code
 * into subcases.  These tests often allow later tests to be
 * avoided.  For example, within the non-FPBLK copy, we first
 * check for tiny copies of 3 bytes or less.  That allows us
 * to use a 4-way unrolled loop for the general byte copy case
 * without a test on loop entry.
 * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
 * vs longer cases.  For the really short case, we don't attempt
 * to align src and dst.  We try to minimize special case tests in
 * the shortest loops as each test adds a significant percentage
 * to the total time.
 *
 * For the medium sized cases, we allow ourselves to adjust the
 * src and dst alignment and provide special cases for each of
 * the four adjusted alignment cases.  The CHKSIZE that was used
 * to decide between short and medium size was chosen to be 39
 * as that allows for the worst case of 7 bytes of alignment
 * shift and 4 times 8 bytes for the first long word unrolling
 * (7 + 4*8 = 39).  That knowledge saves an initial test for
 * length on entry into the medium cases.  If the general loop
 * unrolling factor were to be increased, this number would also
 * need to be adjusted.
 *
 * For all cases in the non-FPBLK code where it is known that at
 * least 4 chunks of data are available for movement, the
 * loop is unrolled by four.  This 4-way loop runs in 8 clocks
 * or 2 clocks per data element.
 *
 * Instruction alignment is forced by use of .align 16 directives
 * and nops which are not executed in the code.  This
 * combination of operations shifts the alignment of following
 * loops to ensure that loops are aligned so that their instructions
 * fall within the minimum number of 4 instruction fetch groups.
 * If instructions are inserted or removed between the .align
 * instruction and the unrolled loops, then the alignment needs
 * to be readjusted.  Misaligned loops can add a clock per loop
 * iteration to the loop timing.
 *
 * In a few cases, code is duplicated to avoid a branch.  Since
 * a non-predicted tst and branch takes 10 clocks, this savings
 * is judged an appropriate time-space tradeoff.
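 *
 * As a concrete illustration of the 4-way unrolling (a C-style sketch
 * in the spirit of the byte loop at sm_movebytes, not a literal
 * transcription of it):
 *
 *	count -= 3;			! bias so the cc test works
 *	do {
 *		dst[0] = src[0];	! four independent byte moves
 *		dst[1] = src[1];	! per iteration; the loads and
 *		dst[2] = src[2];	! stores interleave in the real
 *		dst[3] = src[3];	! assembly below
 *		src += 4; dst += 4;
 *	} while ((count -= 4) > 0);
 *	count += 3;			! 0-3 residual bytes for sm_left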
 *
 * Within the FPBLK-code, the prefetch method in the inner
 * loop needs to be explained as it is not standard.  Two
 * prefetches are issued for each cache line instead of one.
 * The primary one is at the maximum reach of 8 cache lines.
 * Most of the time, that maximum prefetch reach gives the
 * cache line more time to reach the processor for systems with
 * higher processor clocks.  But, sometimes memory interference
 * can cause that prefetch to be dropped.  Putting a second
 * prefetch at a reach of 5 cache lines catches the drops
 * three iterations later and shows a measured improvement
 * in performance over any similar loop with a single prefetch.
 * The prefetches are placed in the loop so they overlap with
 * non-memory instructions, so that there is no extra cost
 * when the data is already in-cache.
 *
 */
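/*
 * A hedged sketch of the dual-prefetch pattern described above, one
 * loop iteration per cache line (the 8- and 5-line distances follow
 * the comment; the Olympus-C loops below use the larger
 * OLYMPUS_C_PREFETCH and OLYMPUS_C_2ND_PREFETCH reaches):
 *
 *	for (line = 0; line < nlines; line++) {
 *		prefetch(src + (line + 8) * 64);	! primary reach
 *		prefetch(src + (line + 5) * 64);	! catches drops
 *		copy_one_cache_line(dst + line * 64, src + line * 64);
 *	}
 */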
/*
 * Notes on preserving existing fp state and on membars.
 *
 * When a copyOP decides to use fp we may have to preserve existing
 * floating point state.  It is not the caller's state that we need to
 * preserve - the rest of the kernel does not use fp and, anyway, fp
 * registers are volatile across a call.  Some examples:
 *
 *	- userland has fp state and is interrupted (device interrupt
 *	  or trap) and within the interrupt/trap handling we use
 *	  bcopy()
 *	- another (higher level) interrupt or trap handler uses bcopy
 *	  while a bcopy from an earlier interrupt is still active
 *	- an asynchronous error trap occurs while fp state exists (in
 *	  userland or in kernel copy) and the tl0 component of the
 *	  handling uses bcopy
 *	- a user process with fp state incurs a copy-on-write fault and
 *	  hwblkpagecopy always uses fp
 *
 * We therefore need a per-call place in which to preserve fp state -
 * using our stack is ideal (and since fp copy cannot be leaf optimized
 * because of calls it makes, this is no hardship).
 *
 * When we have finished fp copy (with its repeated block stores)
 * we must membar #Sync so that our block stores may complete before
 * we either restore the original fp state into the fp registers or
 * return to a caller which may initiate other fp operations that could
 * modify the fp regs we used before the block stores complete.
 *
 * Synchronous faults (eg, unresolvable DMMU miss) that occur while
 * t_lofault is not NULL will not panic but will instead trampoline
 * to the registered lofault handler.  There is no need for any
 * membars for these - eg, our store to t_lofault will always be visible to
 * ourselves and it is our cpu which will take any trap.
 *
 * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
 * while t_lofault is not NULL will also not panic.  Since we're copying
 * to or from userland the extent of the damage is known - the destination
 * buffer is incomplete.  So trap handlers will trampoline to the lofault
 * handler in this case which should take some form of error action to
 * avoid using the incomplete buffer.  The trap handler also flags the
 * fault so that later return-from-trap handling (for the trap that brought
 * this thread into the kernel in the first place) can notify the process
 * and reboot the system (or restart the service with Greenline/Contracts).
 *
 * Asynchronous faults (eg, uncorrectable ECC error from memory) can
 * result in deferred error traps - the trap is taken sometime after
 * the event and the trap PC may not be the PC of the faulting access.
 * Delivery of such pending traps can be forced by a membar #Sync, acting
 * as an "error barrier" in this role.  To accurately apply the user/kernel
 * separation described in the preceding paragraph we must force delivery
 * of deferred traps affecting kernel state before we install a lofault
 * handler (if we interpose a new lofault handler on an existing one there
 * is no need to repeat this), and we must force delivery of deferred
 * errors affecting the lofault-protected region before we clear t_lofault.
 * Failure to do so results in lost kernel state being interpreted as
 * affecting a copyin/copyout only, or of an error that really only
 * affects copy data being interpreted as losing kernel state.
 *
 * Since the copy operations may preserve and later restore floating
 * point state that does not belong to the caller (see examples above),
 * we must be careful in how we do this in order to prevent corruption
 * of another program.
 *
 * To make sure that floating point state is always saved and restored
 * correctly, the following "big rules" must be followed when the floating
 * point registers will be used:
 *
 * 1. %l6 always holds the caller's lofault handler.  Also in this register,
 *    the FPUSED_FLAG bit (0x1) indicates that the floating point registers
 *    are in use.  The TRAMP_FLAG bit (0x2) indicates that the call was to
 *    bcopy, and a lofault handler was set coming in.
 *
 * 2. The FPUSED flag indicates that all FP state has been successfully stored
 *    on the stack.  It should not be set until this save has been completed.
 *
 * 3. The FPUSED flag should not be cleared on exit until all FP state has
 *    been restored from the stack.  If an error occurs while restoring
 *    data from the stack, the error handler can check this flag to see if
 *    a restore is necessary.
 *
 * 4. Code run under the new lofault handler must be kept to a minimum.  In
 *    particular, any calls to FP_ALLOWMIGRATE, which could result in a call
 *    to kpreempt(), should not be made until after the lofault handler has
 *    been restored.
 */
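/*
 * The flag encoding relies on handler addresses being at least 4-byte
 * aligned, so the two low bits of the saved t_lofault value are free.
 * A hedged C sketch of the convention (FPUSED_FLAG, TRAMP_FLAG and
 * MASK_FLAGS are the real defines below; the variable name is
 * illustrative):
 *
 *	uintptr_t l6 = curthread->t_lofault;	 ! saved handler
 *	if (l6 != 0)
 *		l6 |= TRAMP_FLAG;		 ! trampoline on error
 *	...
 *	l6 |= FPUSED_FLAG;			 ! fp state now on stack
 *	...
 *	curthread->t_lofault = l6 & ~MASK_FLAGS; ! restore real handler
 */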

/*
 * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
 * to "break even" using FP/VIS-accelerated memory operations.
 * The FPBLK code assumes a minimum number of bytes are available
 * to be moved on entry.  Check that code carefully before
 * reducing VIS_COPY_THRESHOLD below 256.
 */
/*
 * This shadows sys/machsystm.h which can't be included due to the lack of
 * _ASM guards in include files it references.  Change it here, change it there.
 */
#define	VIS_COPY_THRESHOLD	256

/*
 * TEST for very short copies
 * Be aware that the maximum unroll for the short unaligned case
 * is SHORTCOPY+1
 */
#define	SHORTCOPY	3
#define	CHKSIZE		39

/*
 * Indicates that we're to trampoline to the error handler.
 * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
 * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
 */
#define	FPUSED_FLAG	1
#define	TRAMP_FLAG	2
#define	MASK_FLAGS	3

/*
 * Number of outstanding prefetches.
 * first prefetch moves data from L2 to L1 (n_reads)
 * second prefetch moves data from memory to L2 (one_read)
 */
#define	OLYMPUS_C_PREFETCH	24
#define	OLYMPUS_C_2ND_PREFETCH	12

#define	VIS_BLOCKSIZE		64

/*
 * Size of stack frame in order to accommodate a 64-byte aligned
 * floating-point register save area and 2 64-bit temp locations.
 * All copy functions use two quadrants of fp registers; to assure a
 * block-aligned two block buffer in which to save we must reserve
 * three blocks on stack.  Not all functions preserve %fprs on stack
 * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
 *
 *    _______________________________________ <-- %fp + STACK_BIAS
 *    | We may need to preserve 2 quadrants |
 *    | of fp regs, but since we do so with |
 *    | BST/BLD we need room in which to    |
 *    | align to VIS_BLOCKSIZE bytes.  So   |
 *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
 *    ---------------------------------------
 */
#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
#define	SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 3)
#define	SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 2) - 1)
#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)
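/*
 * With VIS_BLOCKSIZE = 64 these work out to HWCOPYFRAMESIZE =
 * 64 * 3 + 16 = 208 bytes and SAVED_FPREGS_ADJUST = 127.  The save
 * macros below compute (%fp + STACK_BIAS - SAVED_FPREGS_ADJUST) and
 * round down to a VIS_BLOCKSIZE boundary; given that %fp + STACK_BIAS
 * is at least 8-byte aligned, the aligned pointer always lands within
 * the 3-block area with a full 128 bytes (two blocks) available above
 * it for the two saved quadrants.
 */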

/*
 * Common macros used by the various versions of the block copy
 * routines in this file.
 */

/*
 * In FP copies if we do not have preserved data to restore over
 * the fp regs we used then we must zero those regs to avoid
 * exposing portions of the data to later threads (data security).
 *
 * Copy functions use either quadrants 1 and 3 or 2 and 4.
 *
 * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
 * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
 *
 * The instructions below are quicker than repeated fzero instructions
 * since they can dispatch down two fp pipelines.
 */
#define	FZEROQ1Q3			\
	fzero	%f0			;\
	fmovd	%f0, %f2		;\
	fmovd	%f0, %f4		;\
	fmovd	%f0, %f6		;\
	fmovd	%f0, %f8		;\
	fmovd	%f0, %f10		;\
	fmovd	%f0, %f12		;\
	fmovd	%f0, %f14		;\
	fmovd	%f0, %f32		;\
	fmovd	%f0, %f34		;\
	fmovd	%f0, %f36		;\
	fmovd	%f0, %f38		;\
	fmovd	%f0, %f40		;\
	fmovd	%f0, %f42		;\
	fmovd	%f0, %f44		;\
	fmovd	%f0, %f46

#define	FZEROQ2Q4			\
	fzero	%f16			;\
	fmovd	%f16, %f18		;\
	fmovd	%f16, %f20		;\
	fmovd	%f16, %f22		;\
	fmovd	%f16, %f24		;\
	fmovd	%f16, %f26		;\
	fmovd	%f16, %f28		;\
	fmovd	%f16, %f30		;\
	fmovd	%f16, %f48		;\
	fmovd	%f16, %f50		;\
	fmovd	%f16, %f52		;\
	fmovd	%f16, %f54		;\
	fmovd	%f16, %f56		;\
	fmovd	%f16, %f58		;\
	fmovd	%f16, %f60		;\
	fmovd	%f16, %f62

/*
 * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
 * Used to save and restore in-use fp registers when we want to use FP
 * and find fp already in use and copy size still large enough to justify
 * the additional overhead of this save and restore.
 *
 * A membar #Sync is needed before save to sync fp ops initiated before
 * the call to the copy function (by whoever has fp in use); for example
 * an earlier block load to the quadrant we are about to save may still be
 * "in flight".  A membar #Sync is required at the end of the save to
 * sync our block store (the copy code is about to begin ldd's to the
 * first quadrant).
 *
 * Similarly: a membar #Sync before restore allows the block stores of
 * the copy operation to complete before we fill the quadrants with their
 * original data, and a membar #Sync after restore lets the block loads
 * of the restore complete before we return to whoever has the fp regs
 * in use.  To avoid repeated membar #Sync we make it the responsibility
 * of the copy code to membar #Sync immediately after copy is complete
 * and before using the BLD_*_FROMSTACK macro.
 */
#define	BST_FPQ1Q3_TOSTACK(tmp1)				\
	/* membar #Sync	*/					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f0, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f32, [tmp1]ASI_BLK_P				;\
	membar	#Sync

#define	BLD_FPQ1Q3_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f0				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f32				;\
	membar	#Sync

#define	BST_FPQ2Q4_TOSTACK(tmp1)				\
	/* membar #Sync */					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f16, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f48, [tmp1]ASI_BLK_P				;\
	membar	#Sync

#define	BLD_FPQ2Q4_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f16				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f48				;\
	membar	#Sync
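/*
 * Taken together, a Q1/Q3 save-copy-restore sequence looks like this
 * (a sketch of the protocol described above, not code lifted from any
 * one routine):
 *
 *	membar	#Sync			! finish caller's fp ops
 *	BST_FPQ1Q3_TOSTACK(%o2)		! save; trailing membar included
 *	... block copy using %f0-%f15 and %f32-%f47 ...
 *	membar	#Sync			! copy code's responsibility
 *	BLD_FPQ1Q3_FROMSTACK(%o2)	! restore; trailing membar included
 */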

/*
 * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
 * prevent preemption if there is no t_lwp to save FP state to on context
 * switch) before commencing a FP copy, and reallow it on completion or
 * in error trampoline paths when we were using FP copy.
 *
 * Both macros may call other functions, so be aware that all outputs are
 * forfeit after using these macros.  For this reason we do not pass
 * registers to use - we just use any outputs we want.
 *
 * Pseudo code:
 *
 * FP_NOMIGRATE:
 *
 * if (curthread->t_lwp) {
 *	thread_nomigrate();
 * } else {
 *	kpreempt_disable();
 * }
 *
 * FP_ALLOWMIGRATE:
 *
 * if (curthread->t_lwp) {
 *	thread_allowmigrate();
 * } else {
 *	kpreempt_enable();
 * }
 */

#define	FP_NOMIGRATE(label1, label2)				\
	ldn	[THREAD_REG + T_LWP], %o0			;\
	brz,a,pn %o0, label1/**/f				;\
	ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
	call	thread_nomigrate				;\
	nop							;\
	ba	label2/**/f					;\
	nop							;\
label1:								;\
	inc	%o1						;\
	stb	%o1, [THREAD_REG + T_PREEMPT]			;\
label2:

#define	FP_ALLOWMIGRATE(label1, label2)				\
	ldn	[THREAD_REG + T_LWP], %o0			;\
	brz,a,pn %o0, label1/**/f				;\
	ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
	call	thread_allowmigrate				;\
	nop							;\
	ba	label2/**/f					;\
	nop							;\
label1:								;\
	dec	%o1						;\
	brnz,pn	%o1, label2/**/f				;\
	stb	%o1, [THREAD_REG + T_PREEMPT]			;\
	ldn	[THREAD_REG + T_CPU], %o0			;\
	ldub	[%o0 + CPU_KPRUNRUN], %o0			;\
	brz,pt	%o0, label2/**/f				;\
	nop							;\
	call	kpreempt					;\
	rdpr	%pil, %o0					;\
label2:

/*
 * Copy a block of storage, returning an error code if `from' or
 * `to' takes a kernel pagefault which cannot be resolved.
 * Returns errno value on pagefault error, 0 if all ok
 */
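/*
 * A hedged usage sketch from C (kcopy returns 0 or an errno, unlike
 * bcopy which has no return value to check):
 *
 *	int err;
 *
 *	if ((err = kcopy(from, to, count)) != 0)
 *		return (err);		! errno from the fault path
 */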
	.seg	".text"
	.align	4

	ENTRY(kcopy)

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .kcopy_8			! check for longword alignment
	nop
	btst	1, %o3				!
	bz,pt	%ncc, .kcopy_2			! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	nop
.kcopy_2:
	btst	3, %o3				!
	bz,pt	%ncc, .kcopy_4			! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	nop
.kcopy_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	nop
.kcopy_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	nop

.kcopy_small:
	sethi	%hi(.sm_copyerr), %o5		! sm_copyerr is lofault value
	or	%o5, %lo(.sm_copyerr), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .sm_do_copy		! common code
	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault

.kcopy_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
	or	%l7, %lo(.copyerr), %l7
	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .do_copy			! common code
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault


/*
 * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
 * Errno value is in %g1.  bcopy_more uses fp quadrants 1 and 3.
 */
.copyerr:
	set	.copyerr2, %l0
	membar	#Sync				! sync error barrier
	stn	%l0, [THREAD_REG + T_LOFAULT]	! set t_lofault
	btst	FPUSED_FLAG, %l6
	bz	%ncc, 1f
	and	%l6, TRAMP_FLAG, %l0		! copy trampoline flag to %l0

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	nop

	BLD_FPQ1Q3_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	wr	%o3, 0, %fprs		! restore fprs

4:
	FZEROQ1Q3
	wr	%o3, 0, %fprs		! restore fprs

	!
	! Need to cater for the different expectations of kcopy
	! and bcopy.  kcopy will *always* set a t_lofault handler.
	! If it fires, we're expected to just return the error code
	! and *not* to invoke any existing error handler.  As far as
	! bcopy is concerned, we only set t_lofault if there was an
	! existing lofault handler.  In that case we're expected to
	! invoke the previously existing handler after resetting the
	! t_lofault value.
	!
1:
	andn	%l6, MASK_FLAGS, %l6		! turn trampoline flag off
	membar	#Sync				! sync error barrier
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)

	btst	TRAMP_FLAG, %l0
	bnz,pn	%ncc, 3f
	nop
	ret
	restore	%g1, 0, %o0

3:
	!
	! We're here via bcopy.  There *must* have been an error handler
	! in place otherwise we would have died a nasty death already.
	!
	jmp	%l6				! goto real handler
	restore	%g0, 0, %o0			! dispose of copy window

/*
 * We got here because of a fault in .copyerr.  We can't safely restore fp
 * state, so we panic.
 */
fp_panic_msg:
	.asciz	"Unable to restore fp state after copy operation"

	.align	4
.copyerr2:
	set	fp_panic_msg, %o0
	call	panic
	nop

/*
 * We got here because of a fault during a small kcopy or bcopy.
 * No floating point registers are used by the small copies.
 * Errno value is in %g1.
 */
.sm_copyerr:
1:
	btst	TRAMP_FLAG, %o4
	membar	#Sync
	andn	%o4, TRAMP_FLAG, %o4
	bnz,pn	%ncc, 3f
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g1, %o0
3:
	jmp	%o4				! goto real handler
	mov	%g0, %o0			!

	SET_SIZE(kcopy)


/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * Registers: l6 - saved t_lofault
 * (for short copies, o4 - saved t_lofault)
 */

	ENTRY(bcopy)

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .bcopy_8			! check for longword alignment
	nop
	btst	1, %o3				!
	bz,pt	%ncc, .bcopy_2			! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	nop
.bcopy_2:
	btst	3, %o3				!
	bz,pt	%ncc, .bcopy_4			! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	nop
.bcopy_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	nop
.bcopy_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	nop

	.align	16
.bcopy_small:
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save t_lofault
	tst	%o4
	bz,pt	%icc, .sm_do_copy
	nop
	sethi	%hi(.sm_copyerr), %o5
	or	%o5, %lo(.sm_copyerr), %o5
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! install new vector
	or	%o4, TRAMP_FLAG, %o4		! error should trampoline
.sm_do_copy:
	cmp	%o2, SHORTCOPY		! check for really short case
	bleu,pt	%ncc, .bc_sm_left	!
	cmp	%o2, CHKSIZE		! check for medium length cases
	bgu,pn	%ncc, .bc_med		!
	or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .bc_sm_word	! branch to word aligned case
.bc_sm_movebytes:
	sub	%o2, 3, %o2		! adjust count to allow cc zero test
.bc_sm_notalign4:
	ldub	[%o0], %o3		! read byte
	stb	%o3, [%o1]		! write byte
	subcc	%o2, 4, %o2		! reduce count by 4
	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
	add	%o0, 4, %o0		! advance SRC by 4
	stb	%o3, [%o1 + 1]
	ldub	[%o0 - 2], %o3
	add	%o1, 4, %o1		! advance DST by 4
	stb	%o3, [%o1 - 2]
	ldub	[%o0 - 1], %o3
	bgt,pt	%ncc, .bc_sm_notalign4	! loop til 3 or fewer bytes remain
	stb	%o3, [%o1 - 1]
	add	%o2, 3, %o2		! restore count
.bc_sm_left:
	tst	%o2
	bz,pt	%ncc, .bc_sm_exit	! check for zero length
	deccc	%o2			! reduce count for cc test
	ldub	[%o0], %o3		! move one byte
	bz,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1]
	ldub	[%o0 + 1], %o3		! move another byte
	deccc	%o2			! check for more
	bz,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1 + 1]
	ldub	[%o0 + 2], %o3		! move final byte
	ba,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1 + 2]
	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.bc_sm_words:
	lduw	[%o0], %o3		! read word
.bc_sm_wordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o1]		! write word
	add	%o0, 8, %o0		! update SRC
	lduw	[%o0 - 4], %o3		! read word
	add	%o1, 8, %o1		! update DST
	bgt,pt	%ncc, .bc_sm_words	! loop til done
	stw	%o3, [%o1 - 4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
.bc_sm_half:
	subcc	%o2, 2, %o2		! reduce count by 2
	add	%o0, 2, %o0		! advance SRC by 2
	lduh	[%o0 - 2], %o3		! read half word
	add	%o1, 2, %o1		! advance DST by 2
	bgt,pt	%ncc, .bc_sm_half	! loop til done
	sth	%o3, [%o1 - 2]		! write half word
	addcc	%o2, 1, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	nop
.bc_sm_byte:
	ldub	[%o0], %o3
	ba,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1]

.bc_sm_word:
	subcc	%o2, 4, %o2		! update count
	bgt,pt	%ncc, .bc_sm_wordx
	lduw	[%o0], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	stw	%o3, [%o1]		! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o0 + 4], %o3		! load one byte
	bz,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1 + 4]		! store one byte
	ldub	[%o0 + 5], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1 + 5]		! store second byte
	ldub	[%o0 + 6], %o3		! load third byte
	stb	%o3, [%o1 + 6]		! store third byte
.bc_sm_exit:
	ldn	[THREAD_REG + T_LOFAULT], %o3
	brz,pt	%o3, .bc_sm_done
	nop
	membar	#Sync			! sync error barrier
	andn	%o4, TRAMP_FLAG, %o4
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
.bc_sm_done:
	retl
	mov	%g0, %o0		! return 0

	.align	16
.bc_med:
	xor	%o0, %o1, %o3		! setup alignment check
	btst	1, %o3
	bnz,pt	%ncc, .bc_sm_movebytes	! unaligned
	nop
	btst	3, %o3
	bnz,pt	%ncc, .bc_med_half	! halfword aligned
	nop
	btst	7, %o3
	bnz,pt	%ncc, .bc_med_word	! word aligned
	nop
.bc_med_long:
	btst	3, %o0			! check for
	bz,pt	%ncc, .bc_med_long1	! word alignment
	nop
.bc_med_long0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3, [%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .bc_med_long0
	dec	%o2
.bc_med_long1:				! word aligned
	btst	7, %o0			! check for long word
	bz,pt	%ncc, .bc_med_long2
	nop
	lduw	[%o0], %o3		! load word
	add	%o0, 4, %o0		! advance SRC by 4
	stw	%o3, [%o1]		! store word
	add	%o1, 4, %o1		! advance DST by 4
	sub	%o2, 4, %o2		! reduce count by 4
	!
	! Now long word aligned and have at least 32 bytes to move
	!
.bc_med_long2:
	sub	%o2, 31, %o2		! adjust count to allow cc zero test
.bc_med_lmove:
	ldx	[%o0], %o3		! read long word
	stx	%o3, [%o1]		! write long word
	subcc	%o2, 32, %o2		! reduce count by 32
	ldx	[%o0 + 8], %o3		! repeat for a total of 4 long words
	add	%o0, 32, %o0		! advance SRC by 32
	stx	%o3, [%o1 + 8]
	ldx	[%o0 - 16], %o3
	add	%o1, 32, %o1		! advance DST by 32
	stx	%o3, [%o1 - 16]
	ldx	[%o0 - 8], %o3
	bgt,pt	%ncc, .bc_med_lmove	! loop til 31 or fewer bytes left
	stx	%o3, [%o1 - 8]
	addcc	%o2, 24, %o2		! restore count to long word offset
	ble,pt	%ncc, .bc_med_lextra	! check for more long words to move
	nop
.bc_med_lword:
	ldx	[%o0], %o3		! read long word
	subcc	%o2, 8, %o2		! reduce count by 8
	stx	%o3, [%o1]		! write long word
	add	%o0, 8, %o0		! advance SRC by 8
	bgt,pt	%ncc, .bc_med_lword	! loop til 7 or fewer bytes left
	add	%o1, 8, %o1		! advance DST by 8
.bc_med_lextra:
	addcc	%o2, 7, %o2		! restore rest of count
	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	nop
	ba,pt	%ncc, .bc_sm_half
	nop

	.align	16
.bc_med_word:
	btst	3, %o0			! check for
	bz,pt	%ncc, .bc_med_word1	! word alignment
	nop
.bc_med_word0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3, [%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .bc_med_word0
	dec	%o2
	!
	! Now word aligned and have at least 36 bytes to move
	!
.bc_med_word1:
	sub	%o2, 15, %o2		! adjust count to allow cc zero test
.bc_med_wmove:
	lduw	[%o0], %o3		! read word
	stw	%o3, [%o1]		! write word
	subcc	%o2, 16, %o2		! reduce count by 16
	lduw	[%o0 + 4], %o3		! repeat for a total of 4 words
	add	%o0, 16, %o0		! advance SRC by 16
	stw	%o3, [%o1 + 4]
	lduw	[%o0 - 8], %o3
	add	%o1, 16, %o1		! advance DST by 16
	stw	%o3, [%o1 - 8]
	lduw	[%o0 - 4], %o3
	bgt,pt	%ncc, .bc_med_wmove	! loop til 15 or fewer bytes left
	stw	%o3, [%o1 - 4]
	addcc	%o2, 12, %o2		! restore count to word offset
	ble,pt	%ncc, .bc_med_wextra	! check for more words to move
	nop
.bc_med_word2:
	lduw	[%o0], %o3		! read word
	subcc	%o2, 4, %o2		! reduce count by 4
	stw	%o3, [%o1]		! write word
	add	%o0, 4, %o0		! advance SRC by 4
	bgt,pt	%ncc, .bc_med_word2	! loop til 3 or fewer bytes left
	add	%o1, 4, %o1		! advance DST by 4
.bc_med_wextra:
	addcc	%o2, 3, %o2		! restore rest of count
	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	nop
	ba,pt	%ncc, .bc_sm_half
	nop

	.align	16
.bc_med_half:
	btst	1, %o0			! check for
	bz,pt	%ncc, .bc_med_half1	! half word alignment
	nop
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3, [%o1]		! store byte
	inc	%o1
	dec	%o2
	!
	! Now half word aligned and have at least 38 bytes to move
	!
.bc_med_half1:
	sub	%o2, 7, %o2		! adjust count to allow cc zero test
.bc_med_hmove:
	lduh	[%o0], %o3		! read half word
	sth	%o3, [%o1]		! write half word
	subcc	%o2, 8, %o2		! reduce count by 8
	lduh	[%o0 + 2], %o3		! repeat for a total of 4 halfwords
	add	%o0, 8, %o0		! advance SRC by 8
	sth	%o3, [%o1 + 2]
	lduh	[%o0 - 4], %o3
	add	%o1, 8, %o1		! advance DST by 8
	sth	%o3, [%o1 - 4]
	lduh	[%o0 - 2], %o3
	bgt,pt	%ncc, .bc_med_hmove	! loop til 7 or fewer bytes left
	sth	%o3, [%o1 - 2]
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	nop
	ba,pt	%ncc, .bc_sm_half
	nop

	SET_SIZE(bcopy)

/*
 * The _more entry points are not intended to be used directly by
 * any caller from outside this file.  They are provided to allow
 * profiling and dtrace of the portions of the copy code that uses
 * the floating point registers.
 * This entry is particularly important as DTRACE (at least as of
 * 4/2004) does not support leaf functions.
 */

	ENTRY(bcopy_more)
.bcopy_more:
	prefetch [%o0], #n_reads
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	ldn	[THREAD_REG + T_LOFAULT], %l6	! save t_lofault
	tst	%l6
	bz,pt	%ncc, .do_copy
	nop
	sethi	%hi(.copyerr), %o2
	or	%o2, %lo(.copyerr), %o2
	membar	#Sync				! sync error barrier
	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
	!
	! We've already captured whether t_lofault was zero on entry.
	! We need to mark ourselves as being from bcopy since both
	! kcopy and bcopy use the same code path.  If TRAMP_FLAG is set
	! and the saved lofault was zero, we won't reset lofault on
	! returning.
	!
	or	%l6, TRAMP_FLAG, %l6

/*
 * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes.
 * Also, use of FP registers has been tested to be enabled.
 */
.do_copy:
	FP_NOMIGRATE(6, 7)

	rd	%fprs, %o2		! check for unused fp
	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]	! save orig %fprs
	btst	FPRS_FEF, %o2
	bz,a,pt	%icc, .do_blockcopy
	wr	%g0, FPRS_FEF, %fprs

	BST_FPQ1Q3_TOSTACK(%o2)

.do_blockcopy:
	rd	%gsr, %o2
	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
	or	%l6, FPUSED_FLAG, %l6

#define	REALSRC	%i0
#define	DST	%i1
#define	CNT	%i2
#define	SRC	%i3
#define	TMP	%i5

	andcc	DST, VIS_BLOCKSIZE - 1, TMP
	bz,pt	%ncc, 2f
	neg	TMP
	add	TMP, VIS_BLOCKSIZE, TMP

	! TMP = bytes required to align DST on FP_BLOCK boundary
	! Using SRC as a tmp here
	cmp	TMP, 3
	bleu,pt	%ncc, 1f
	sub	CNT, TMP, CNT		! adjust main count
	sub	TMP, 3, TMP		! adjust for end of loop test
.bc_blkalign:
	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
	stb	SRC, [DST]
	subcc	TMP, 4, TMP
	ldub	[REALSRC + 1], SRC
	add	REALSRC, 4, REALSRC
	stb	SRC, [DST + 1]
	ldub	[REALSRC - 2], SRC
	add	DST, 4, DST
	stb	SRC, [DST - 2]
	ldub	[REALSRC - 1], SRC
	bgu,pt	%ncc, .bc_blkalign
	stb	SRC, [DST - 1]

	addcc	TMP, 3, TMP		! restore count adjustment
	bz,pt	%ncc, 2f		! no bytes left?
	nop
1:	ldub	[REALSRC], SRC
	inc	REALSRC
	inc	DST
	deccc	TMP
	bgu	%ncc, 1b
	stb	SRC, [DST - 1]

2:
	membar	#StoreLoad
	andn	REALSRC, 0x7, SRC

	! SRC - 8-byte aligned
	! DST - 64-byte aligned
	ldd	[SRC], %f0
	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
	alignaddr REALSRC, %g0, %g0
	ldd	[SRC + 0x08], %f2
	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
	faligndata %f0, %f2, %f32
	ldd	[SRC + 0x10], %f4
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	faligndata %f2, %f4, %f34
	ldd	[SRC + 0x18], %f6
	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
	faligndata %f4, %f6, %f36
	ldd	[SRC + 0x20], %f8
	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
	faligndata %f6, %f8, %f38
	ldd	[SRC + 0x28], %f10
	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
	faligndata %f8, %f10, %f40
	ldd	[SRC + 0x30], %f12
	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
	faligndata %f10, %f12, %f42
	ldd	[SRC + 0x38], %f14
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	ba,pt	%ncc, 1f
	prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
	.align	32
1:
	ldd	[SRC + 0x08], %f2
	faligndata %f12, %f14, %f44
	ldd	[SRC + 0x10], %f4
	faligndata %f14, %f0, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	faligndata %f0, %f2, %f32
	ldd	[SRC + 0x20], %f8
	faligndata %f2, %f4, %f34
	ldd	[SRC + 0x28], %f10
	faligndata %f4, %f6, %f36
	ldd	[SRC + 0x30], %f12
	faligndata %f6, %f8, %f38
	sub	CNT, VIS_BLOCKSIZE, CNT
	ldd	[SRC + 0x38], %f14
	faligndata %f8, %f10, %f40
	add	DST, VIS_BLOCKSIZE, DST
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	faligndata %f10, %f12, %f42
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
	cmp	CNT, VIS_BLOCKSIZE + 8
	bgu,pt	%ncc, 1b
	prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read

	! only if REALSRC & 0x7 is 0
	cmp	CNT, VIS_BLOCKSIZE
	bne	%ncc, 3f
	andcc	REALSRC, 0x7, %g0
	bz,pt	%ncc, 2f
	nop
3:
	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,pt	%ncc, 3f
	nop
2:
	ldd	[SRC + 0x08], %f2
	fsrc1	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fsrc1	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fsrc1	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fsrc1	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fsrc1	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fsrc1	%f8, %f40
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,a,pt	%ncc, .bcb_exit
	nop

3:	tst	CNT
	bz,a,pt	%ncc, .bcb_exit
	nop

5:	ldub	[REALSRC], TMP
	inc	REALSRC
	inc	DST
	deccc	CNT
	bgu	%ncc, 5b
	stb	TMP, [DST - 1]
.bcb_exit:
	membar	#Sync

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	nop

	BLD_FPQ1Q3_FROMSTACK(%o2)

	ba,pt	%ncc, 2f
	wr	%o3, 0, %fprs		! restore fprs
4:
	FZEROQ1Q3
	wr	%o3, 0, %fprs		! restore fprs
2:
	membar	#Sync			! sync error barrier
	andn	%l6, MASK_FLAGS, %l6
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)
	ret
	restore	%g0, 0, %o0

	SET_SIZE(bcopy_more)

/*
 * Block copy with possibly overlapped operands.
 */

	ENTRY(ovbcopy)
	tst	%o2			! check count
	bgu,a	%ncc, 1f		! nothing to do or bad arguments
	subcc	%o0, %o1, %o3		! difference of from and to address

	retl				! return
	nop
1:
	bneg,a	%ncc, 2f
	neg	%o3			! if < 0, make it positive
2:	cmp	%o2, %o3		! cmp size and abs(from - to)
	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
	.empty				!   no overlap
	cmp	%o0, %o1		! compare from and to addresses
	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
	nop
	!
	! Copy forwards.
	!
.ov_fwd:
	ldub	[%o0], %o3		! read from address
	inc	%o0			! inc from address
	stb	%o3, [%o1]		! write to address
	deccc	%o2			! dec count
	bgu	%ncc, .ov_fwd		! loop till done
	inc	%o1			! inc to address

	retl				! return
	nop
	!
	! Copy backwards.
	!
.ov_bkwd:
	deccc	%o2			! dec count
	ldub	[%o0 + %o2], %o3	! get byte at end of src
	bgu	%ncc, .ov_bkwd		! loop till done
	stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst

	retl				! return
	nop

	SET_SIZE(ovbcopy)
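/*
 * A worked example of the decision above: for from = 0x1000,
 * to = 0x1004, len = 16 the regions overlap (abs(diff) = 4 < 16) and
 * from < to, so the bytes are copied backwards from the high end to
 * avoid overwriting source bytes before they are read; with
 * to = 0x1010 instead, abs(diff) = 16 >= len and plain bcopy is safe.
 */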

/*
 * hwblkpagecopy()
 *
 * Copies exactly one page.  This routine assumes the caller (ppcopy)
 * has already disabled kernel preemption and has checked
 * use_hw_bcopy.  Preventing preemption also prevents cpu migration.
 */
	ENTRY(hwblkpagecopy)
	! get another window w/space for three aligned blocks of saved fpregs
	prefetch [%o0], #n_reads
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp

	! %i0 - source address (arg)
	! %i1 - destination address (arg)
	! %i2 - length of region (not arg)
	! %l0 - saved fprs
	! %l1 - pointer to saved fpregs

	rd	%fprs, %l0		! check for unused fp
	btst	FPRS_FEF, %l0
	bz,a,pt	%icc, 1f
	wr	%g0, FPRS_FEF, %fprs

	BST_FPQ1Q3_TOSTACK(%l1)

1:	set	PAGESIZE, CNT
	mov	REALSRC, SRC

	ldd	[SRC], %f0
	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
	ldd	[SRC + 0x08], %f2
	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
	fmovd	%f0, %f32
	ldd	[SRC + 0x10], %f4
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	fmovd	%f2, %f34
	ldd	[SRC + 0x18], %f6
	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
	fmovd	%f4, %f36
	ldd	[SRC + 0x20], %f8
	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
	fmovd	%f6, %f38
	ldd	[SRC + 0x28], %f10
	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
	fmovd	%f8, %f40
	ldd	[SRC + 0x30], %f12
	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
	fmovd	%f10, %f42
	ldd	[SRC + 0x38], %f14
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
	ba,pt	%ncc, 2f
	prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
	.align	32
2:
	ldd	[SRC + 0x08], %f2
	fmovd	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fmovd	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fmovd	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fmovd	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fmovd	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fmovd	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fmovd	%f8, %f40
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	fmovd	%f10, %f42
	sub	CNT, VIS_BLOCKSIZE, CNT
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	add	DST, VIS_BLOCKSIZE, DST
	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
	add	SRC, VIS_BLOCKSIZE, SRC
	cmp	CNT, VIS_BLOCKSIZE + 8
	bgu,pt	%ncc, 2b
	prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read

	! trailing block
	ldd	[SRC + 0x08], %f2
	fsrc1	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fsrc1	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fsrc1	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fsrc1	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fsrc1	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fsrc1	%f8, %f40
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P

	membar	#Sync

	btst	FPRS_FEF, %l0
	bz,pt	%icc, 2f
	nop

	BLD_FPQ1Q3_FROMSTACK(%l3)
	ba	3f
	nop

2:	FZEROQ1Q3

3:	wr	%l0, 0, %fprs		! restore fprs
	ret
	restore	%g0, 0, %o0

	SET_SIZE(hwblkpagecopy)


/*
 * Transfer data to and from user space -
 * Note that these routines can cause faults
 * It is assumed that the kernel has nothing at
 * less than KERNELBASE in the virtual address space.
 *
 * Note that copyin(9F) and copyout(9F) are part of the
 * DDI/DKI which specifies that they return '-1' on "errors."
 *
 * Sigh.
 *
 * So there are two extremely similar routines - xcopyin() and xcopyout()
 * - which return the errno that we've faithfully computed.  This
 * allows other callers (e.g. uiomove(9F)) to work correctly.
 * Given that these are used pretty heavily, we expand the calling
 * sequences inline for all flavours (rather than making wrappers).
 *
 * There are also stub routines for xcopyout_little and xcopyin_little,
 * which currently are intended to handle requests of <= 16 bytes from
 * do_unaligned.  Future enhancement to make them handle 8k pages efficiently
 * is left as an exercise...
 */
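/*
 * A hedged sketch of the resulting calling conventions from C (the
 * error values follow the comment above; kbuf, ubuf and len are
 * illustrative names):
 *
 *	if (copyout(kbuf, ubuf, len) != 0)	! -1 on fault, per DDI/DKI
 *		return (EFAULT);
 *
 *	int err = xcopyout(kbuf, ubuf, len);	! errno on fault, 0 if ok
 *	if (err != 0)
 *		return (err);
 */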
1615 * 1616 * There are also stub routines for xcopyout_little and xcopyin_little, 1617 * which currently are intended to handle requests of <= 16 bytes from 1618 * do_unaligned. Future enhancement to make them handle 8k pages efficiently 1619 * is left as an exercise... 1620 */ 1621 1622 /* 1623 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr) 1624 * 1625 * General theory of operation: 1626 * 1627 * The only difference between copy{in,out} and 1628 * xcopy{in,out} is in the error handling routine they invoke 1629 * when a memory access error occurs. xcopyOP returns the errno 1630 * while copyOP returns -1 (see above). copy{in,out}_noerr set 1631 * a special flag (by oring the TRAMP_FLAG into the fault handler address) 1632 * if they are called with a fault handler already in place. That flag 1633 * causes the default handlers to trampoline to the previous handler 1634 * upon an error. 1635 * 1636 * None of the copyops routines grab a window until it's decided that 1637 * we need to do a HW block copy operation. This saves a window 1638 * spill/fill when we're called during socket ops. The typical IO 1639 * path won't cause spill/fill traps. 1640 * 1641 * This code uses a set of 4 limits for the maximum size that will 1642 * be copied given a particular input/output address alignment. 1643 * If the value for a particular limit is zero, the copy will be performed 1644 * by the plain copy loops rather than FPBLK. 1645 * 1646 * See the description of bcopy above for more details of the 1647 * data copying algorithm and the default limits. 1648 * 1649 */ 1650 1651 /* 1652 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little). 1653 */ 1654 1655 /* 1656 * We save the arguments in the following registers in case of a fault: 1657 * kaddr - %l1 1658 * uaddr - %l2 1659 * count - %l3 1660 */ 1661 #define SAVE_SRC %l1 1662 #define SAVE_DST %l2 1663 #define SAVE_COUNT %l3 1664 1665 #define SM_SAVE_SRC %g4 1666 #define SM_SAVE_DST %g5 1667 #define SM_SAVE_COUNT %o5 1668 #define ERRNO %l5 1669 1670 1671 #define REAL_LOFAULT %l4 1672 /* 1673 * Generic copyio fault handler. This is the first line of defense when a 1674 * fault occurs in (x)copyin/(x)copyout. In order for this to function 1675 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT. 1676 * This allows us to share common code for all the flavors of the copy 1677 * operations, including the _noerr versions. 1678 * 1679 * Note that this function will restore the original input parameters before 1680 * calling REAL_LOFAULT. So the real handler can vector to the appropriate 1681 * member of the t_copyop structure, if needed. 1682 */ 1683 ENTRY(copyio_fault) 1684 membar #Sync 1685 mov %g1,ERRNO ! save errno in ERRNO 1686 btst FPUSED_FLAG, %l6 1687 bz %ncc, 1f 1688 nop 1689 1690 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 1691 wr %o2, 0, %gsr ! restore gsr 1692 1693 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 1694 btst FPRS_FEF, %o3 1695 bz,pt %icc, 4f 1696 nop 1697 1698 BLD_FPQ2Q4_FROMSTACK(%o2) 1699 1700 ba,pt %ncc, 1f 1701 wr %o3, 0, %fprs ! restore fprs 1702 1703 4: 1704 FZEROQ2Q4 1705 wr %o3, 0, %fprs ! restore fprs 1706 1707 1: 1708 andn %l6, FPUSED_FLAG, %l6 1709 membar #Sync 1710 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1711 FP_ALLOWMIGRATE(5, 6) 1712 1713 mov SAVE_SRC, %i0 1714 mov SAVE_DST, %i1 1715 jmp REAL_LOFAULT 1716 mov SAVE_COUNT, %i2 1717 1718 SET_SIZE(copyio_fault) 1719 1720 1721 ENTRY(copyout) 1722 1723 cmp %o2, VIS_COPY_THRESHOLD ! 
check for leaf rtn case 1724 bleu,pt %ncc, .copyout_small ! go to larger cases 1725 xor %o0, %o1, %o3 ! are src, dst alignable? 1726 btst 7, %o3 ! 1727 bz,pt %ncc, .copyout_8 ! check for longword alignment 1728 nop 1729 btst 1, %o3 ! 1730 bz,pt %ncc, .copyout_2 ! check for half-word 1731 nop 1732 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 1733 ld [%o3 + %lo(hw_copy_limit_1)], %o3 1734 tst %o3 1735 bz,pn %icc, .copyout_small ! if zero, disable HW copy 1736 cmp %o2, %o3 ! if length <= limit 1737 bleu,pt %ncc, .copyout_small ! go to small copy 1738 nop 1739 ba,pt %ncc, .copyout_more ! otherwise go to large copy 1740 nop 1741 .copyout_2: 1742 btst 3, %o3 ! 1743 bz,pt %ncc, .copyout_4 ! check for word alignment 1744 nop 1745 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 1746 ld [%o3 + %lo(hw_copy_limit_2)], %o3 1747 tst %o3 1748 bz,pn %icc, .copyout_small ! if zero, disable HW copy 1749 cmp %o2, %o3 ! if length <= limit 1750 bleu,pt %ncc, .copyout_small ! go to small copy 1751 nop 1752 ba,pt %ncc, .copyout_more ! otherwise go to large copy 1753 nop 1754 .copyout_4: 1755 ! already checked longword, must be word aligned 1756 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 1757 ld [%o3 + %lo(hw_copy_limit_4)], %o3 1758 tst %o3 1759 bz,pn %icc, .copyout_small ! if zero, disable HW copy 1760 cmp %o2, %o3 ! if length <= limit 1761 bleu,pt %ncc, .copyout_small ! go to small copy 1762 nop 1763 ba,pt %ncc, .copyout_more ! otherwise go to large copy 1764 nop 1765 .copyout_8: 1766 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 1767 ld [%o3 + %lo(hw_copy_limit_8)], %o3 1768 tst %o3 1769 bz,pn %icc, .copyout_small ! if zero, disable HW copy 1770 cmp %o2, %o3 ! if length <= limit 1771 bleu,pt %ncc, .copyout_small ! go to small copy 1772 nop 1773 ba,pt %ncc, .copyout_more ! otherwise go to large copy 1774 nop 1775 1776 .align 16 1777 nop ! instruction alignment 1778 ! see discussion at start of file 1779 .copyout_small: 1780 sethi %hi(.sm_copyout_err), %o5 ! .sm_copyout_err is lofault 1781 or %o5, %lo(.sm_copyout_err), %o5 1782 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler 1783 membar #Sync ! sync error barrier 1784 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault 1785 .sm_do_copyout: 1786 mov %o0, SM_SAVE_SRC 1787 mov %o1, SM_SAVE_DST 1788 cmp %o2, SHORTCOPY ! check for really short case 1789 bleu,pt %ncc, .co_sm_left ! 1790 mov %o2, SM_SAVE_COUNT 1791 cmp %o2, CHKSIZE ! check for medium length cases 1792 bgu,pn %ncc, .co_med ! 1793 or %o0, %o1, %o3 ! prepare alignment check 1794 andcc %o3, 0x3, %g0 ! test for alignment 1795 bz,pt %ncc, .co_sm_word ! branch to word aligned case 1796 .co_sm_movebytes: 1797 sub %o2, 3, %o2 ! adjust count to allow cc zero test 1798 .co_sm_notalign4: 1799 ldub [%o0], %o3 ! read byte 1800 subcc %o2, 4, %o2 ! reduce count by 4 1801 stba %o3, [%o1]ASI_USER ! write byte 1802 inc %o1 ! advance DST by 1 1803 ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes 1804 add %o0, 4, %o0 ! advance SRC by 4 1805 stba %o3, [%o1]ASI_USER 1806 inc %o1 ! advance DST by 1 1807 ldub [%o0 - 2], %o3 1808 stba %o3, [%o1]ASI_USER 1809 inc %o1 ! advance DST by 1 1810 ldub [%o0 - 1], %o3 1811 stba %o3, [%o1]ASI_USER 1812 bgt,pt %ncc, .co_sm_notalign4 ! loop til 3 or fewer bytes remain 1813 inc %o1 ! advance DST by 1 1814 add %o2, 3, %o2 ! restore count 1815 .co_sm_left: 1816 tst %o2 1817 bz,pt %ncc, .co_sm_exit ! check for zero length 1818 nop 1819 ldub [%o0], %o3 ! load one byte 1820 deccc %o2 ! 
reduce count for cc test 1821 bz,pt %ncc, .co_sm_exit 1822 stba %o3,[%o1]ASI_USER ! store one byte 1823 ldub [%o0 + 1], %o3 ! load second byte 1824 deccc %o2 1825 inc %o1 1826 bz,pt %ncc, .co_sm_exit 1827 stba %o3,[%o1]ASI_USER ! store second byte 1828 ldub [%o0 + 2], %o3 ! load third byte 1829 inc %o1 1830 stba %o3,[%o1]ASI_USER ! store third byte 1831 membar #Sync ! sync error barrier 1832 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1833 retl 1834 mov %g0, %o0 ! return 0 1835 .align 16 1836 .co_sm_words: 1837 lduw [%o0], %o3 ! read word 1838 .co_sm_wordx: 1839 subcc %o2, 8, %o2 ! update count 1840 stwa %o3, [%o1]ASI_USER ! write word 1841 add %o0, 8, %o0 ! update SRC 1842 lduw [%o0 - 4], %o3 ! read word 1843 add %o1, 4, %o1 ! update DST 1844 stwa %o3, [%o1]ASI_USER ! write word 1845 bgt,pt %ncc, .co_sm_words ! loop til done 1846 add %o1, 4, %o1 ! update DST 1847 addcc %o2, 7, %o2 ! restore count 1848 bz,pt %ncc, .co_sm_exit 1849 nop 1850 deccc %o2 1851 bz,pt %ncc, .co_sm_byte 1852 .co_sm_half: 1853 subcc %o2, 2, %o2 ! reduce count by 2 1854 lduh [%o0], %o3 ! read half word 1855 add %o0, 2, %o0 ! advance SRC by 2 1856 stha %o3, [%o1]ASI_USER ! write half word 1857 bgt,pt %ncc, .co_sm_half ! loop til done 1858 add %o1, 2, %o1 ! advance DST by 2 1859 addcc %o2, 1, %o2 ! restore count 1860 bz,pt %ncc, .co_sm_exit 1861 nop 1862 .co_sm_byte: 1863 ldub [%o0], %o3 1864 stba %o3, [%o1]ASI_USER 1865 membar #Sync ! sync error barrier 1866 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1867 retl 1868 mov %g0, %o0 ! return 0 1869 .align 16 1870 .co_sm_word: 1871 subcc %o2, 4, %o2 ! update count 1872 bgt,pt %ncc, .co_sm_wordx 1873 lduw [%o0], %o3 ! read word 1874 addcc %o2, 3, %o2 ! restore count 1875 bz,pt %ncc, .co_sm_exit 1876 stwa %o3, [%o1]ASI_USER ! write word 1877 deccc %o2 ! reduce count for cc test 1878 ldub [%o0 + 4], %o3 ! load one byte 1879 add %o1, 4, %o1 1880 bz,pt %ncc, .co_sm_exit 1881 stba %o3, [%o1]ASI_USER ! store one byte 1882 ldub [%o0 + 5], %o3 ! load second byte 1883 deccc %o2 1884 inc %o1 1885 bz,pt %ncc, .co_sm_exit 1886 stba %o3, [%o1]ASI_USER ! store second byte 1887 ldub [%o0 + 6], %o3 ! load third byte 1888 inc %o1 1889 stba %o3, [%o1]ASI_USER ! store third byte 1890 .co_sm_exit: 1891 membar #Sync ! sync error barrier 1892 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1893 retl 1894 mov %g0, %o0 ! return 0 1895 1896 .align 16 1897 .co_med: 1898 xor %o0, %o1, %o3 ! setup alignment check 1899 btst 1, %o3 1900 bnz,pt %ncc, .co_sm_movebytes ! unaligned 1901 nop 1902 btst 3, %o3 1903 bnz,pt %ncc, .co_med_half ! halfword aligned 1904 nop 1905 btst 7, %o3 1906 bnz,pt %ncc, .co_med_word ! word aligned 1907 nop 1908 .co_med_long: 1909 btst 3, %o0 ! check for 1910 bz,pt %ncc, .co_med_long1 ! word alignment 1911 nop 1912 .co_med_long0: 1913 ldub [%o0], %o3 ! load one byte 1914 inc %o0 1915 stba %o3,[%o1]ASI_USER ! store byte 1916 inc %o1 1917 btst 3, %o0 1918 bnz,pt %ncc, .co_med_long0 1919 dec %o2 1920 .co_med_long1: ! word aligned 1921 btst 7, %o0 ! check for long word 1922 bz,pt %ncc, .co_med_long2 1923 nop 1924 lduw [%o0], %o3 ! load word 1925 add %o0, 4, %o0 ! advance SRC by 4 1926 stwa %o3, [%o1]ASI_USER ! store word 1927 add %o1, 4, %o1 ! advance DST by 4 1928 sub %o2, 4, %o2 ! reduce count by 4 1929 ! 1930 ! Now long word aligned and have at least 32 bytes to move 1931 ! 1932 .co_med_long2: 1933 sub %o2, 31, %o2 ! adjust count to allow cc zero test 1934 sub %o1, 8, %o1 ! adjust pointer to allow store in 1935 ! 
branch delay slot instead of add
.co_med_lmove:
	add	%o1, 8, %o1		! advance DST by 8
	ldx	[%o0], %o3		! read long word
	subcc	%o2, 32, %o2		! reduce count by 32
	stxa	%o3, [%o1]ASI_USER	! write long word
	add	%o1, 8, %o1		! advance DST by 8
	ldx	[%o0 + 8], %o3		! repeat for a total of 4 long words
	add	%o0, 32, %o0		! advance SRC by 32
	stxa	%o3, [%o1]ASI_USER
	ldx	[%o0 - 16], %o3
	add	%o1, 8, %o1		! advance DST by 8
	stxa	%o3, [%o1]ASI_USER
	ldx	[%o0 - 8], %o3
	add	%o1, 8, %o1		! advance DST by 8
	bgt,pt	%ncc, .co_med_lmove	! loop til 31 or fewer bytes left
	stxa	%o3, [%o1]ASI_USER
	add	%o1, 8, %o1		! advance DST by 8
	addcc	%o2, 24, %o2		! restore count to long word offset
	ble,pt	%ncc, .co_med_lextra	! check for more long words to move
	nop
.co_med_lword:
	ldx	[%o0], %o3		! read long word
	subcc	%o2, 8, %o2		! reduce count by 8
	stxa	%o3, [%o1]ASI_USER	! write long word
	add	%o0, 8, %o0		! advance SRC by 8
	bgt,pt	%ncc, .co_med_lword	! loop til 7 or fewer bytes left
	add	%o1, 8, %o1		! advance DST by 8
.co_med_lextra:
	addcc	%o2, 7, %o2		! restore rest of count
	bz,pt	%ncc, .co_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .co_sm_byte
	nop
	ba,pt	%ncc, .co_sm_half
	nop

	.align 16
	nop				! instruction alignment
					! see discussion at start of file
.co_med_word:
	btst	3, %o0			! check for
	bz,pt	%ncc, .co_med_word1	! word alignment
	nop
.co_med_word0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stba	%o3,[%o1]ASI_USER	! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .co_med_word0
	dec	%o2
!
!  Now word aligned and have at least 36 bytes to move
!
.co_med_word1:
	sub	%o2, 15, %o2		! adjust count to allow cc zero test
.co_med_wmove:
	lduw	[%o0], %o3		! read word
	subcc	%o2, 16, %o2		! reduce count by 16
	stwa	%o3, [%o1]ASI_USER	! write word
	add	%o1, 4, %o1		! advance DST by 4
	lduw	[%o0 + 4], %o3		! repeat for a total of 4 words
	add	%o0, 16, %o0		! advance SRC by 16
	stwa	%o3, [%o1]ASI_USER
	add	%o1, 4, %o1		! advance DST by 4
	lduw	[%o0 - 8], %o3
	stwa	%o3, [%o1]ASI_USER
	add	%o1, 4, %o1		! advance DST by 4
	lduw	[%o0 - 4], %o3
	stwa	%o3, [%o1]ASI_USER
	bgt,pt	%ncc, .co_med_wmove	! loop til 15 or fewer bytes left
	add	%o1, 4, %o1		! advance DST by 4
	addcc	%o2, 12, %o2		! restore count to word offset
	ble,pt	%ncc, .co_med_wextra	! check for more words to move
	nop
.co_med_word2:
	lduw	[%o0], %o3		! read word
	subcc	%o2, 4, %o2		! reduce count by 4
	stwa	%o3, [%o1]ASI_USER	! write word
	add	%o0, 4, %o0		! advance SRC by 4
	bgt,pt	%ncc, .co_med_word2	! loop til 3 or fewer bytes left
	add	%o1, 4, %o1		! advance DST by 4
.co_med_wextra:
	addcc	%o2, 3, %o2		! restore rest of count
	bz,pt	%ncc, .co_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .co_sm_byte
	nop
	ba,pt	%ncc, .co_sm_half
	nop

	.align 16
	nop				! instruction alignment
	nop				! see discussion at start of file
	nop
.co_med_half:
	btst	1, %o0			! check for
	bz,pt	%ncc, .co_med_half1	! half word alignment
	nop
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stba	%o3,[%o1]ASI_USER	! store byte
	inc	%o1
	dec	%o2
!
!  Now half word aligned and have at least 38 bytes to move
!
.co_med_half1:
	sub	%o2, 7, %o2		! adjust count to allow cc zero test
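	/*
	 * The "adjust count to allow cc zero test" above is the bias
	 * trick shared by all of these unrolled loops.  Roughly, in
	 * C-style pseudo-code (variable names are ours):
	 *
	 *	cnt -= 7;		! bias by (loop chunk - 1)
	 *	do {
	 *		cnt -= 8;	! the subcc sets the cc's
	 *		copy 4 halfwords (8 bytes);
	 *	} while (cnt > 0);	! bgt: 8 or more bytes remained
	 *	cnt += 7;		! addcc recovers the true residue
	 *
	 * which leaves 0-7 bytes for .co_sm_half/.co_sm_byte to finish.
	 */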
.co_med_hmove:
	lduh	[%o0], %o3		! read half word
	subcc	%o2, 8, %o2		! reduce count by 8
	stha	%o3, [%o1]ASI_USER	! write half word
	add	%o1, 2, %o1		! advance DST by 2
	lduh	[%o0 + 2], %o3		! repeat for a total of 4 halfwords
	add	%o0, 8, %o0		! advance SRC by 8
	stha	%o3, [%o1]ASI_USER
	add	%o1, 2, %o1		! advance DST by 2
	lduh	[%o0 - 4], %o3
	stha	%o3, [%o1]ASI_USER
	add	%o1, 2, %o1		! advance DST by 2
	lduh	[%o0 - 2], %o3
	stha	%o3, [%o1]ASI_USER
	bgt,pt	%ncc, .co_med_hmove	! loop til 7 or fewer bytes left
	add	%o1, 2, %o1		! advance DST by 2
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .co_sm_exit
	deccc	%o2
	bz,pt	%ncc, .co_sm_byte
	nop
	ba,pt	%ncc, .co_sm_half
	nop

/*
 * We got here because of a fault during short copyout.
 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
 */
.sm_copyout_err:
	membar	#Sync
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	mov	SM_SAVE_SRC, %o0
	mov	SM_SAVE_DST, %o1
	mov	SM_SAVE_COUNT, %o2
	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
	tst	%o3
	bz,pt	%ncc, 3f			! if not, return error
	nop
	ldn	[%o3 + CP_COPYOUT], %o5		! if handler, invoke it with
	jmp	%o5				! original arguments
	nop
3:
	retl
	or	%g0, -1, %o0			! return error value

	SET_SIZE(copyout)

/*
 * The _more entry points are not intended to be used directly by
 * any caller from outside this file. They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
 * the floating point registers.
 * This entry is particularly important as DTRACE (at least as of
 * 4/2004) does not support leaf functions.
 */

	ENTRY(copyout_more)
.copyout_more:
	prefetch [%o0], #n_reads
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	set	.copyout_err, REAL_LOFAULT

/*
 * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
 */
.do_copyout:
	set	copyio_fault, %l7		! copyio_fault is lofault val

	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
	membar	#Sync				! sync error barrier
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault

	mov	%i0, SAVE_SRC
	mov	%i1, SAVE_DST
	mov	%i2, SAVE_COUNT

	FP_NOMIGRATE(6, 7)

	rd	%fprs, %o2		! check for unused fp
	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
	btst	FPRS_FEF, %o2
	bz,a,pt	%icc, .do_blockcopyout
	wr	%g0, FPRS_FEF, %fprs

	BST_FPQ2Q4_TOSTACK(%o2)

.do_blockcopyout:
	rd	%gsr, %o2
	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
	or	%l6, FPUSED_FLAG, %l6

	andcc	DST, VIS_BLOCKSIZE - 1, TMP
	mov	ASI_USER, %asi
	bz,pt	%ncc, 2f
	neg	TMP
	add	TMP, VIS_BLOCKSIZE, TMP

	! TMP = bytes required to align DST on FP_BLOCK boundary
	! Using SRC as a tmp here
	cmp	TMP, 3
	bleu,pt	%ncc, 1f
	sub	CNT,TMP,CNT		! adjust main count
	sub	TMP, 3, TMP		! adjust for end of loop test
.co_blkalign:
	ldub	[REALSRC], SRC !
move 4 bytes per loop iteration 2150 stba SRC, [DST]%asi 2151 subcc TMP, 4, TMP 2152 ldub [REALSRC + 1], SRC 2153 add REALSRC, 4, REALSRC 2154 stba SRC, [DST + 1]%asi 2155 ldub [REALSRC - 2], SRC 2156 add DST, 4, DST 2157 stba SRC, [DST - 2]%asi 2158 ldub [REALSRC - 1], SRC 2159 bgu,pt %ncc, .co_blkalign 2160 stba SRC, [DST - 1]%asi 2161 2162 addcc TMP, 3, TMP ! restore count adjustment 2163 bz,pt %ncc, 2f ! no bytes left? 2164 nop 2165 1: ldub [REALSRC], SRC 2166 inc REALSRC 2167 inc DST 2168 deccc TMP 2169 bgu %ncc, 1b 2170 stba SRC, [DST - 1]%asi 2171 2172 2: 2173 membar #StoreLoad 2174 andn REALSRC, 0x7, SRC 2175 2176 ! SRC - 8-byte aligned 2177 ! DST - 64-byte aligned 2178 ldd [SRC], %f16 2179 prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads 2180 alignaddr REALSRC, %g0, %g0 2181 ldd [SRC + 0x08], %f18 2182 prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads 2183 faligndata %f16, %f18, %f48 2184 ldd [SRC + 0x10], %f20 2185 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads 2186 faligndata %f18, %f20, %f50 2187 ldd [SRC + 0x18], %f22 2188 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read 2189 faligndata %f20, %f22, %f52 2190 ldd [SRC + 0x20], %f24 2191 prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read 2192 faligndata %f22, %f24, %f54 2193 ldd [SRC + 0x28], %f26 2194 prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read 2195 faligndata %f24, %f26, %f56 2196 ldd [SRC + 0x30], %f28 2197 prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read 2198 faligndata %f26, %f28, %f58 2199 ldd [SRC + 0x38], %f30 2200 ldd [SRC + VIS_BLOCKSIZE], %f16 2201 sub CNT, VIS_BLOCKSIZE, CNT 2202 add SRC, VIS_BLOCKSIZE, SRC 2203 prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read 2204 add REALSRC, VIS_BLOCKSIZE, REALSRC 2205 ba,pt %ncc, 1f 2206 prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read 2207 .align 32 2208 1: 2209 ldd [SRC + 0x08], %f18 2210 faligndata %f28, %f30, %f60 2211 ldd [SRC + 0x10], %f20 2212 faligndata %f30, %f16, %f62 2213 stda %f48, [DST]ASI_BLK_AIUS 2214 ldd [SRC + 0x18], %f22 2215 faligndata %f16, %f18, %f48 2216 ldd [SRC + 0x20], %f24 2217 faligndata %f18, %f20, %f50 2218 ldd [SRC + 0x28], %f26 2219 faligndata %f20, %f22, %f52 2220 ldd [SRC + 0x30], %f28 2221 faligndata %f22, %f24, %f54 2222 sub CNT, VIS_BLOCKSIZE, CNT 2223 ldd [SRC + 0x38], %f30 2224 faligndata %f24, %f26, %f56 2225 add DST, VIS_BLOCKSIZE, DST 2226 ldd [SRC + VIS_BLOCKSIZE], %f16 2227 faligndata %f26, %f28, %f58 2228 add REALSRC, VIS_BLOCKSIZE, REALSRC 2229 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads 2230 add SRC, VIS_BLOCKSIZE, SRC 2231 prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read 2232 cmp CNT, VIS_BLOCKSIZE + 8 2233 bgu,pt %ncc, 1b 2234 prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read 2235 2236 ! 
only if REALSRC & 0x7 is 0 2237 cmp CNT, VIS_BLOCKSIZE 2238 bne %ncc, 3f 2239 andcc REALSRC, 0x7, %g0 2240 bz,pt %ncc, 2f 2241 nop 2242 3: 2243 faligndata %f28, %f30, %f60 2244 faligndata %f30, %f16, %f62 2245 stda %f48, [DST]ASI_BLK_AIUS 2246 add DST, VIS_BLOCKSIZE, DST 2247 ba,pt %ncc, 3f 2248 nop 2249 2: 2250 ldd [SRC + 0x08], %f18 2251 fsrc1 %f28, %f60 2252 ldd [SRC + 0x10], %f20 2253 fsrc1 %f30, %f62 2254 stda %f48, [DST]ASI_BLK_AIUS 2255 ldd [SRC + 0x18], %f22 2256 fsrc1 %f16, %f48 2257 ldd [SRC + 0x20], %f24 2258 fsrc1 %f18, %f50 2259 ldd [SRC + 0x28], %f26 2260 fsrc1 %f20, %f52 2261 ldd [SRC + 0x30], %f28 2262 fsrc1 %f22, %f54 2263 ldd [SRC + 0x38], %f30 2264 fsrc1 %f24, %f56 2265 sub CNT, VIS_BLOCKSIZE, CNT 2266 add DST, VIS_BLOCKSIZE, DST 2267 add SRC, VIS_BLOCKSIZE, SRC 2268 add REALSRC, VIS_BLOCKSIZE, REALSRC 2269 fsrc1 %f26, %f58 2270 fsrc1 %f28, %f60 2271 fsrc1 %f30, %f62 2272 stda %f48, [DST]ASI_BLK_AIUS 2273 add DST, VIS_BLOCKSIZE, DST 2274 ba,a,pt %ncc, 4f 2275 nop 2276 2277 3: tst CNT 2278 bz,a %ncc, 4f 2279 nop 2280 2281 5: ldub [REALSRC], TMP 2282 inc REALSRC 2283 inc DST 2284 deccc CNT 2285 bgu %ncc, 5b 2286 stba TMP, [DST - 1]%asi 2287 4: 2288 2289 .copyout_exit: 2290 membar #Sync 2291 2292 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 2293 wr %o2, 0, %gsr ! restore gsr 2294 2295 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 2296 btst FPRS_FEF, %o3 2297 bz,pt %icc, 4f 2298 nop 2299 2300 BLD_FPQ2Q4_FROMSTACK(%o2) 2301 2302 ba,pt %ncc, 1f 2303 wr %o3, 0, %fprs ! restore fprs 2304 2305 4: 2306 FZEROQ2Q4 2307 wr %o3, 0, %fprs ! restore fprs 2308 2309 1: 2310 membar #Sync 2311 andn %l6, FPUSED_FLAG, %l6 2312 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2313 FP_ALLOWMIGRATE(5, 6) 2314 ret 2315 restore %g0, 0, %o0 2316 2317 /* 2318 * We got here because of a fault during copyout. 2319 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). 2320 */ 2321 .copyout_err: 2322 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 2323 tst %o4 2324 bz,pt %ncc, 2f ! if not, return error 2325 nop 2326 ldn [%o4 + CP_COPYOUT], %g2 ! if handler, invoke it with 2327 jmp %g2 ! original arguments 2328 restore %g0, 0, %g0 ! dispose of copy window 2329 2: 2330 ret 2331 restore %g0, -1, %o0 ! return error value 2332 2333 2334 SET_SIZE(copyout_more) 2335 2336 2337 ENTRY(xcopyout) 2338 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 2339 bleu,pt %ncc, .xcopyout_small ! go to larger cases 2340 xor %o0, %o1, %o3 ! are src, dst alignable? 2341 btst 7, %o3 ! 2342 bz,pt %ncc, .xcopyout_8 ! 2343 nop 2344 btst 1, %o3 ! 2345 bz,pt %ncc, .xcopyout_2 ! check for half-word 2346 nop 2347 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 2348 ld [%o3 + %lo(hw_copy_limit_1)], %o3 2349 tst %o3 2350 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy 2351 cmp %o2, %o3 ! if length <= limit 2352 bleu,pt %ncc, .xcopyout_small ! go to small copy 2353 nop 2354 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy 2355 nop 2356 .xcopyout_2: 2357 btst 3, %o3 ! 2358 bz,pt %ncc, .xcopyout_4 ! check for word alignment 2359 nop 2360 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 2361 ld [%o3 + %lo(hw_copy_limit_2)], %o3 2362 tst %o3 2363 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy 2364 cmp %o2, %o3 ! if length <= limit 2365 bleu,pt %ncc, .xcopyout_small ! go to small copy 2366 nop 2367 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy 2368 nop 2369 .xcopyout_4: 2370 ! already checked longword, must be word aligned 2371 sethi %hi(hw_copy_limit_4), %o3 ! 
Check copy limit 2372 ld [%o3 + %lo(hw_copy_limit_4)], %o3 2373 tst %o3 2374 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy 2375 cmp %o2, %o3 ! if length <= limit 2376 bleu,pt %ncc, .xcopyout_small ! go to small copy 2377 nop 2378 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy 2379 nop 2380 .xcopyout_8: 2381 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 2382 ld [%o3 + %lo(hw_copy_limit_8)], %o3 2383 tst %o3 2384 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy 2385 cmp %o2, %o3 ! if length <= limit 2386 bleu,pt %ncc, .xcopyout_small ! go to small copy 2387 nop 2388 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy 2389 nop 2390 2391 .xcopyout_small: 2392 sethi %hi(.sm_xcopyout_err), %o5 ! .sm_xcopyout_err is lofault 2393 or %o5, %lo(.sm_xcopyout_err), %o5 2394 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler 2395 membar #Sync ! sync error barrier 2396 ba,pt %ncc, .sm_do_copyout ! common code 2397 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault 2398 2399 .xcopyout_more: 2400 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 2401 sethi %hi(.xcopyout_err), REAL_LOFAULT 2402 ba,pt %ncc, .do_copyout ! common code 2403 or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT 2404 2405 /* 2406 * We got here because of fault during xcopyout 2407 * Errno value is in ERRNO 2408 */ 2409 .xcopyout_err: 2410 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 2411 tst %o4 2412 bz,pt %ncc, 2f ! if not, return error 2413 nop 2414 ldn [%o4 + CP_XCOPYOUT], %g2 ! if handler, invoke it with 2415 jmp %g2 ! original arguments 2416 restore %g0, 0, %g0 ! dispose of copy window 2417 2: 2418 ret 2419 restore ERRNO, 0, %o0 ! return errno value 2420 2421 .sm_xcopyout_err: 2422 2423 membar #Sync 2424 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2425 mov SM_SAVE_SRC, %o0 2426 mov SM_SAVE_DST, %o1 2427 mov SM_SAVE_COUNT, %o2 2428 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler 2429 tst %o3 2430 bz,pt %ncc, 3f ! if not, return error 2431 nop 2432 ldn [%o3 + CP_XCOPYOUT], %o5 ! if handler, invoke it with 2433 jmp %o5 ! original arguments 2434 nop 2435 3: 2436 retl 2437 or %g1, 0, %o0 ! return errno value 2438 2439 SET_SIZE(xcopyout) 2440 2441 ENTRY(xcopyout_little) 2442 sethi %hi(.xcopyio_err), %o5 2443 or %o5, %lo(.xcopyio_err), %o5 2444 ldn [THREAD_REG + T_LOFAULT], %o4 2445 membar #Sync ! sync error barrier 2446 stn %o5, [THREAD_REG + T_LOFAULT] 2447 mov %o4, %o5 2448 2449 subcc %g0, %o2, %o3 2450 add %o0, %o2, %o0 2451 bz,pn %ncc, 2f ! check for zero bytes 2452 sub %o2, 1, %o4 2453 add %o0, %o4, %o0 ! start w/last byte 2454 add %o1, %o2, %o1 2455 ldub [%o0 + %o3], %o4 2456 2457 1: stba %o4, [%o1 + %o3]ASI_AIUSL 2458 inccc %o3 2459 sub %o0, 2, %o0 ! get next byte 2460 bcc,a,pt %ncc, 1b 2461 ldub [%o0 + %o3], %o4 2462 2463 2: 2464 membar #Sync ! sync error barrier 2465 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2466 retl 2467 mov %g0, %o0 ! return (0) 2468 2469 SET_SIZE(xcopyout_little) 2470 2471 /* 2472 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little) 2473 */ 2474 2475 ENTRY(copyin) 2476 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 2477 bleu,pt %ncc, .copyin_small ! go to larger cases 2478 xor %o0, %o1, %o3 ! are src, dst alignable? 2479 btst 7, %o3 ! 2480 bz,pt %ncc, .copyin_8 ! check for longword alignment 2481 nop 2482 btst 1, %o3 ! 2483 bz,pt %ncc, .copyin_2 ! check for half-word 2484 nop 2485 sethi %hi(hw_copy_limit_1), %o3 ! 
Check copy limit 2486 ld [%o3 + %lo(hw_copy_limit_1)], %o3 2487 tst %o3 2488 bz,pn %icc, .copyin_small ! if zero, disable HW copy 2489 cmp %o2, %o3 ! if length <= limit 2490 bleu,pt %ncc, .copyin_small ! go to small copy 2491 nop 2492 ba,pt %ncc, .copyin_more ! otherwise go to large copy 2493 nop 2494 .copyin_2: 2495 btst 3, %o3 ! 2496 bz,pt %ncc, .copyin_4 ! check for word alignment 2497 nop 2498 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 2499 ld [%o3 + %lo(hw_copy_limit_2)], %o3 2500 tst %o3 2501 bz,pn %icc, .copyin_small ! if zero, disable HW copy 2502 cmp %o2, %o3 ! if length <= limit 2503 bleu,pt %ncc, .copyin_small ! go to small copy 2504 nop 2505 ba,pt %ncc, .copyin_more ! otherwise go to large copy 2506 nop 2507 .copyin_4: 2508 ! already checked longword, must be word aligned 2509 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 2510 ld [%o3 + %lo(hw_copy_limit_4)], %o3 2511 tst %o3 2512 bz,pn %icc, .copyin_small ! if zero, disable HW copy 2513 cmp %o2, %o3 ! if length <= limit 2514 bleu,pt %ncc, .copyin_small ! go to small copy 2515 nop 2516 ba,pt %ncc, .copyin_more ! otherwise go to large copy 2517 nop 2518 .copyin_8: 2519 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 2520 ld [%o3 + %lo(hw_copy_limit_8)], %o3 2521 tst %o3 2522 bz,pn %icc, .copyin_small ! if zero, disable HW copy 2523 cmp %o2, %o3 ! if length <= limit 2524 bleu,pt %ncc, .copyin_small ! go to small copy 2525 nop 2526 ba,pt %ncc, .copyin_more ! otherwise go to large copy 2527 nop 2528 2529 .align 16 2530 nop ! instruction alignment 2531 ! see discussion at start of file 2532 .copyin_small: 2533 sethi %hi(.sm_copyin_err), %o5 ! .sm_copyin_err is lofault 2534 or %o5, %lo(.sm_copyin_err), %o5 2535 ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofault, no tramp 2536 membar #Sync ! sync error barrier 2537 stn %o5, [THREAD_REG + T_LOFAULT] 2538 .sm_do_copyin: 2539 mov %o0, SM_SAVE_SRC 2540 mov %o1, SM_SAVE_DST 2541 cmp %o2, SHORTCOPY ! check for really short case 2542 bleu,pt %ncc, .ci_sm_left ! 2543 mov %o2, SM_SAVE_COUNT 2544 cmp %o2, CHKSIZE ! check for medium length cases 2545 bgu,pn %ncc, .ci_med ! 2546 or %o0, %o1, %o3 ! prepare alignment check 2547 andcc %o3, 0x3, %g0 ! test for alignment 2548 bz,pt %ncc, .ci_sm_word ! branch to word aligned case 2549 .ci_sm_movebytes: 2550 sub %o2, 3, %o2 ! adjust count to allow cc zero test 2551 .ci_sm_notalign4: 2552 lduba [%o0]ASI_USER, %o3 ! read byte 2553 subcc %o2, 4, %o2 ! reduce count by 4 2554 stb %o3, [%o1] ! write byte 2555 add %o0, 1, %o0 ! advance SRC by 1 2556 lduba [%o0]ASI_USER, %o3 ! repeat for a total of 4 bytes 2557 add %o0, 1, %o0 ! advance SRC by 1 2558 stb %o3, [%o1 + 1] 2559 add %o1, 4, %o1 ! advance DST by 4 2560 lduba [%o0]ASI_USER, %o3 2561 add %o0, 1, %o0 ! advance SRC by 1 2562 stb %o3, [%o1 - 2] 2563 lduba [%o0]ASI_USER, %o3 2564 add %o0, 1, %o0 ! advance SRC by 1 2565 bgt,pt %ncc, .ci_sm_notalign4 ! loop til 3 or fewer bytes remain 2566 stb %o3, [%o1 - 1] 2567 add %o2, 3, %o2 ! restore count 2568 .ci_sm_left: 2569 tst %o2 2570 bz,pt %ncc, .ci_sm_exit 2571 nop 2572 lduba [%o0]ASI_USER, %o3 ! load one byte 2573 deccc %o2 ! reduce count for cc test 2574 bz,pt %ncc, .ci_sm_exit 2575 stb %o3,[%o1] ! store one byte 2576 inc %o0 2577 lduba [%o0]ASI_USER, %o3 ! load second byte 2578 deccc %o2 2579 bz,pt %ncc, .ci_sm_exit 2580 stb %o3,[%o1 + 1] ! store second byte 2581 inc %o0 2582 lduba [%o0]ASI_USER, %o3 ! load third byte 2583 stb %o3,[%o1 + 2] ! store third byte 2584 membar #Sync ! sync error barrier 2585 stn %o4, [THREAD_REG + T_LOFAULT] ! 
restore old t_lofault
	retl
	mov	%g0, %o0		! return 0
	.align	16
.ci_sm_words:
	lduwa	[%o0]ASI_USER, %o3	! read word
.ci_sm_wordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o1]		! write word
	add	%o0, 4, %o0		! update SRC
	add	%o1, 8, %o1		! update DST
	lduwa	[%o0]ASI_USER, %o3	! read word
	add	%o0, 4, %o0		! update SRC
	bgt,pt	%ncc, .ci_sm_words	! loop til done
	stw	%o3, [%o1 - 4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .ci_sm_exit
	nop
	deccc	%o2
	bz,pt	%ncc, .ci_sm_byte
.ci_sm_half:
	subcc	%o2, 2, %o2		! reduce count by 2
	lduha	[%o0]ASI_USER, %o3	! read half word
	add	%o0, 2, %o0		! advance SRC by 2
	add	%o1, 2, %o1		! advance DST by 2
	bgt,pt	%ncc, .ci_sm_half	! loop til done
	sth	%o3, [%o1 - 2]		! write half word
	addcc	%o2, 1, %o2		! restore count
	bz,pt	%ncc, .ci_sm_exit
	nop
.ci_sm_byte:
	lduba	[%o0]ASI_USER, %o3
	stb	%o3, [%o1]
	membar	#Sync				! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0
	.align	16
.ci_sm_word:
	subcc	%o2, 4, %o2		! update count
	bgt,pt	%ncc, .ci_sm_wordx
	lduwa	[%o0]ASI_USER, %o3	! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .ci_sm_exit
	stw	%o3, [%o1]		! write word
	deccc	%o2			! reduce count for cc test
	add	%o0, 4, %o0
	lduba	[%o0]ASI_USER, %o3	! load one byte
	bz,pt	%ncc, .ci_sm_exit
	stb	%o3, [%o1 + 4]		! store one byte
	inc	%o0
	lduba	[%o0]ASI_USER, %o3	! load second byte
	deccc	%o2
	bz,pt	%ncc, .ci_sm_exit
	stb	%o3, [%o1 + 5]		! store second byte
	inc	%o0
	lduba	[%o0]ASI_USER, %o3	! load third byte
	stb	%o3, [%o1 + 6]		! store third byte
.ci_sm_exit:
	membar	#Sync				! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0

	.align 16
.ci_med:
	xor	%o0, %o1, %o3		! setup alignment check
	btst	1, %o3
	bnz,pt	%ncc, .ci_sm_movebytes	! unaligned
	nop
	btst	3, %o3
	bnz,pt	%ncc, .ci_med_half	! halfword aligned
	nop
	btst	7, %o3
	bnz,pt	%ncc, .ci_med_word	! word aligned
	nop
.ci_med_long:
	btst	3, %o0			! check for
	bz,pt	%ncc, .ci_med_long1	! word alignment
	nop
.ci_med_long0:
	lduba	[%o0]ASI_USER, %o3	! load one byte
	inc	%o0
	stb	%o3,[%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .ci_med_long0
	dec	%o2
.ci_med_long1:				! word aligned
	btst	7, %o0			! check for long word
	bz,pt	%ncc, .ci_med_long2
	nop
	lduwa	[%o0]ASI_USER, %o3	! load word
	add	%o0, 4, %o0		! advance SRC by 4
	stw	%o3, [%o1]		! store word
	add	%o1, 4, %o1		! advance DST by 4
	sub	%o2, 4, %o2		! reduce count by 4
!
!  Now long word aligned and have at least 32 bytes to move
!
.ci_med_long2:
	sub	%o2, 31, %o2		! adjust count to allow cc zero test
.ci_med_lmove:
	ldxa	[%o0]ASI_USER, %o3	! read long word
	subcc	%o2, 32, %o2		! reduce count by 32
	stx	%o3, [%o1]		! write long word
	add	%o0, 8, %o0		! advance SRC by 8
	ldxa	[%o0]ASI_USER, %o3	! repeat for a total of 4 long words
	add	%o0, 8, %o0		! advance SRC by 8
	stx	%o3, [%o1 + 8]
	add	%o1, 32, %o1		! advance DST by 32
	ldxa	[%o0]ASI_USER, %o3
	add	%o0, 8, %o0		! advance SRC by 8
	stx	%o3, [%o1 - 16]
	ldxa	[%o0]ASI_USER, %o3
	add	%o0, 8, %o0		! advance SRC by 8
	bgt,pt	%ncc, .ci_med_lmove	! loop til 31 or fewer bytes left
	stx	%o3, [%o1 - 8]
	addcc	%o2, 24, %o2		! restore count to long word offset
	ble,pt	%ncc, .ci_med_lextra	! check for more long words to move
	nop
.ci_med_lword:
	ldxa	[%o0]ASI_USER, %o3	! read long word
	subcc	%o2, 8, %o2		! reduce count by 8
	stx	%o3, [%o1]		! write long word
	add	%o0, 8, %o0		! advance SRC by 8
	bgt,pt	%ncc, .ci_med_lword	! loop til 7 or fewer bytes left
	add	%o1, 8, %o1		! advance DST by 8
.ci_med_lextra:
	addcc	%o2, 7, %o2		! restore rest of count
	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .ci_sm_byte
	nop
	ba,pt	%ncc, .ci_sm_half
	nop

	.align 16
	nop				! instruction alignment
					! see discussion at start of file
.ci_med_word:
	btst	3, %o0			! check for
	bz,pt	%ncc, .ci_med_word1	! word alignment
	nop
.ci_med_word0:
	lduba	[%o0]ASI_USER, %o3	! load one byte
	inc	%o0
	stb	%o3,[%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .ci_med_word0
	dec	%o2
!
!  Now word aligned and have at least 36 bytes to move
!
.ci_med_word1:
	sub	%o2, 15, %o2		! adjust count to allow cc zero test
.ci_med_wmove:
	lduwa	[%o0]ASI_USER, %o3	! read word
	subcc	%o2, 16, %o2		! reduce count by 16
	stw	%o3, [%o1]		! write word
	add	%o0, 4, %o0		! advance SRC by 4
	lduwa	[%o0]ASI_USER, %o3	! repeat for a total of 4 words
	add	%o0, 4, %o0		! advance SRC by 4
	stw	%o3, [%o1 + 4]
	add	%o1, 16, %o1		! advance DST by 16
	lduwa	[%o0]ASI_USER, %o3
	add	%o0, 4, %o0		! advance SRC by 4
	stw	%o3, [%o1 - 8]
	lduwa	[%o0]ASI_USER, %o3
	add	%o0, 4, %o0		! advance SRC by 4
	bgt,pt	%ncc, .ci_med_wmove	! loop til 15 or fewer bytes left
	stw	%o3, [%o1 - 4]
	addcc	%o2, 12, %o2		! restore count to word offset
	ble,pt	%ncc, .ci_med_wextra	! check for more words to move
	nop
.ci_med_word2:
	lduwa	[%o0]ASI_USER, %o3	! read word
	subcc	%o2, 4, %o2		! reduce count by 4
	stw	%o3, [%o1]		! write word
	add	%o0, 4, %o0		! advance SRC by 4
	bgt,pt	%ncc, .ci_med_word2	! loop til 3 or fewer bytes left
	add	%o1, 4, %o1		! advance DST by 4
.ci_med_wextra:
	addcc	%o2, 3, %o2		! restore rest of count
	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .ci_sm_byte
	nop
	ba,pt	%ncc, .ci_sm_half
	nop

	.align 16
	nop				! instruction alignment
					! see discussion at start of file
.ci_med_half:
	btst	1, %o0			! check for
	bz,pt	%ncc, .ci_med_half1	! half word alignment
	nop
	lduba	[%o0]ASI_USER, %o3	! load one byte
	inc	%o0
	stb	%o3,[%o1]		! store byte
	inc	%o1
	dec	%o2
!
!  Now half word aligned and have at least 38 bytes to move
!
.ci_med_half1:
	sub	%o2, 7, %o2		! adjust count to allow cc zero test
.ci_med_hmove:
	lduha	[%o0]ASI_USER, %o3	! read half word
	subcc	%o2, 8, %o2		! reduce count by 8
	sth	%o3, [%o1]		! write half word
	add	%o0, 2, %o0		! advance SRC by 2
	lduha	[%o0]ASI_USER, %o3	! repeat for a total of 4 halfwords
	add	%o0, 2, %o0		! advance SRC by 2
	sth	%o3, [%o1 + 2]
	add	%o1, 8, %o1		! advance DST by 8
	lduha	[%o0]ASI_USER, %o3
	add	%o0, 2, %o0		! advance SRC by 2
	sth	%o3, [%o1 - 4]
	lduha	[%o0]ASI_USER, %o3
	add	%o0, 2, %o0		! advance SRC by 2
	bgt,pt	%ncc, .ci_med_hmove	! loop til 7 or fewer bytes left
	sth	%o3, [%o1 - 2]
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .ci_sm_exit
	deccc	%o2
	bz,pt	%ncc, .ci_sm_byte
	nop
	ba,pt	%ncc, .ci_sm_half
	nop

.sm_copyin_err:
	membar	#Sync
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	mov	SM_SAVE_SRC, %o0
	mov	SM_SAVE_DST, %o1
	mov	SM_SAVE_COUNT, %o2
	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
	tst	%o3
	bz,pt	%ncc, 3f			! if not, return error
	nop
	ldn	[%o3 + CP_COPYIN], %o5		! if handler, invoke it with
	jmp	%o5				! original arguments
	nop
3:
	retl
	or	%g0, -1, %o0			! return error value

	SET_SIZE(copyin)


/*
 * The _more entry points are not intended to be used directly by
 * any caller from outside this file. They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
 * the floating point registers.
 * This entry is particularly important as DTRACE (at least as of
 * 4/2004) does not support leaf functions.
 */

	ENTRY(copyin_more)
.copyin_more:
	prefetch [%o0], #n_reads
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	set	.copyin_err, REAL_LOFAULT

/*
 * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes
 */
.do_copyin:
	set	copyio_fault, %l7		! copyio_fault is lofault val

	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
	membar	#Sync				! sync error barrier
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault

	mov	%i0, SAVE_SRC
	mov	%i1, SAVE_DST
	mov	%i2, SAVE_COUNT

	FP_NOMIGRATE(6, 7)

	rd	%fprs, %o2		! check for unused fp
	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
	btst	FPRS_FEF, %o2
	bz,a,pt	%icc, .do_blockcopyin
	wr	%g0, FPRS_FEF, %fprs

	BST_FPQ2Q4_TOSTACK(%o2)

.do_blockcopyin:
	rd	%gsr, %o2
	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
	or	%l6, FPUSED_FLAG, %l6

	andcc	DST, VIS_BLOCKSIZE - 1, TMP
	mov	ASI_USER, %asi
	bz,pt	%ncc, 2f
	neg	TMP
	add	TMP, VIS_BLOCKSIZE, TMP

	! TMP = bytes required to align DST on FP_BLOCK boundary
	! Using SRC as a tmp here
	cmp	TMP, 3
	bleu,pt	%ncc, 1f
	sub	CNT,TMP,CNT		! adjust main count
	sub	TMP, 3, TMP		! adjust for end of loop test
.ci_blkalign:
	lduba	[REALSRC]%asi, SRC	! move 4 bytes per loop iteration
	stb	SRC, [DST]
	subcc	TMP, 4, TMP
	lduba	[REALSRC + 1]%asi, SRC
	add	REALSRC, 4, REALSRC
	stb	SRC, [DST + 1]
	lduba	[REALSRC - 2]%asi, SRC
	add	DST, 4, DST
	stb	SRC, [DST - 2]
	lduba	[REALSRC - 1]%asi, SRC
	bgu,pt	%ncc, .ci_blkalign
	stb	SRC, [DST - 1]

	addcc	TMP, 3, TMP		! restore count adjustment
	bz,pt	%ncc, 2f		! no bytes left?
	nop
1:	lduba	[REALSRC]%asi, SRC
	inc	REALSRC
	inc	DST
	deccc	TMP
	bgu	%ncc, 1b
	stb	SRC, [DST - 1]

2:
	membar	#StoreLoad
	andn	REALSRC, 0x7, SRC

	! SRC - 8-byte aligned
	!
DST - 64-byte aligned 2924 ldda [SRC]%asi, %f16 2925 prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #n_reads 2926 alignaddr REALSRC, %g0, %g0 2927 ldda [SRC + 0x08]%asi, %f18 2928 prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #n_reads 2929 faligndata %f16, %f18, %f48 2930 ldda [SRC + 0x10]%asi, %f20 2931 prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads 2932 faligndata %f18, %f20, %f50 2933 ldda [SRC + 0x18]%asi, %f22 2934 prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read 2935 faligndata %f20, %f22, %f52 2936 ldda [SRC + 0x20]%asi, %f24 2937 prefetcha [SRC + (8 * VIS_BLOCKSIZE)]%asi, #one_read 2938 faligndata %f22, %f24, %f54 2939 ldda [SRC + 0x28]%asi, %f26 2940 prefetcha [SRC + (12 * VIS_BLOCKSIZE)]%asi, #one_read 2941 faligndata %f24, %f26, %f56 2942 ldda [SRC + 0x30]%asi, %f28 2943 prefetcha [SRC + (16 * VIS_BLOCKSIZE)]%asi, #one_read 2944 faligndata %f26, %f28, %f58 2945 ldda [SRC + 0x38]%asi, %f30 2946 ldda [SRC + VIS_BLOCKSIZE]%asi, %f16 2947 sub CNT, VIS_BLOCKSIZE, CNT 2948 add SRC, VIS_BLOCKSIZE, SRC 2949 prefetcha [SRC + (19 * VIS_BLOCKSIZE)]%asi, #one_read 2950 add REALSRC, VIS_BLOCKSIZE, REALSRC 2951 ba,pt %ncc, 1f 2952 prefetcha [SRC + (23 * VIS_BLOCKSIZE)]%asi, #one_read 2953 .align 32 2954 1: 2955 ldda [SRC + 0x08]%asi, %f18 2956 faligndata %f28, %f30, %f60 2957 ldda [SRC + 0x10]%asi, %f20 2958 faligndata %f30, %f16, %f62 2959 stda %f48, [DST]ASI_BLK_P 2960 ldda [SRC + 0x18]%asi, %f22 2961 faligndata %f16, %f18, %f48 2962 ldda [SRC + 0x20]%asi, %f24 2963 faligndata %f18, %f20, %f50 2964 ldda [SRC + 0x28]%asi, %f26 2965 faligndata %f20, %f22, %f52 2966 ldda [SRC + 0x30]%asi, %f28 2967 faligndata %f22, %f24, %f54 2968 sub CNT, VIS_BLOCKSIZE, CNT 2969 ldda [SRC + 0x38]%asi, %f30 2970 faligndata %f24, %f26, %f56 2971 add DST, VIS_BLOCKSIZE, DST 2972 ldda [SRC + VIS_BLOCKSIZE]%asi, %f16 2973 faligndata %f26, %f28, %f58 2974 add REALSRC, VIS_BLOCKSIZE, REALSRC 2975 prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads 2976 add SRC, VIS_BLOCKSIZE, SRC 2977 prefetcha [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read 2978 cmp CNT, VIS_BLOCKSIZE + 8 2979 bgu,pt %ncc, 1b 2980 prefetcha [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read 2981 2982 ! only if REALSRC & 0x7 is 0 2983 cmp CNT, VIS_BLOCKSIZE 2984 bne %ncc, 3f 2985 andcc REALSRC, 0x7, %g0 2986 bz,pt %ncc, 2f 2987 nop 2988 3: 2989 faligndata %f28, %f30, %f60 2990 faligndata %f30, %f16, %f62 2991 stda %f48, [DST]ASI_BLK_P 2992 add DST, VIS_BLOCKSIZE, DST 2993 ba,pt %ncc, 3f 2994 nop 2995 2: 2996 ldda [SRC + 0x08]%asi, %f18 2997 fsrc1 %f28, %f60 2998 ldda [SRC + 0x10]%asi, %f20 2999 fsrc1 %f30, %f62 3000 stda %f48, [DST]ASI_BLK_P 3001 ldda [SRC + 0x18]%asi, %f22 3002 fsrc1 %f16, %f48 3003 ldda [SRC + 0x20]%asi, %f24 3004 fsrc1 %f18, %f50 3005 ldda [SRC + 0x28]%asi, %f26 3006 fsrc1 %f20, %f52 3007 ldda [SRC + 0x30]%asi, %f28 3008 fsrc1 %f22, %f54 3009 ldda [SRC + 0x38]%asi, %f30 3010 fsrc1 %f24, %f56 3011 sub CNT, VIS_BLOCKSIZE, CNT 3012 add DST, VIS_BLOCKSIZE, DST 3013 add SRC, VIS_BLOCKSIZE, SRC 3014 add REALSRC, VIS_BLOCKSIZE, REALSRC 3015 fsrc1 %f26, %f58 3016 fsrc1 %f28, %f60 3017 fsrc1 %f30, %f62 3018 stda %f48, [DST]ASI_BLK_P 3019 add DST, VIS_BLOCKSIZE, DST 3020 ba,a,pt %ncc, 4f 3021 nop 3022 3023 3: tst CNT 3024 bz,a %ncc, 4f 3025 nop 3026 3027 5: lduba [REALSRC]ASI_USER, TMP 3028 inc REALSRC 3029 inc DST 3030 deccc CNT 3031 bgu %ncc, 5b 3032 stb TMP, [DST - 1] 3033 4: 3034 3035 .copyin_exit: 3036 membar #Sync 3037 3038 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! 
restore gsr 3039 wr %o2, 0, %gsr 3040 3041 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 3042 btst FPRS_FEF, %o3 3043 bz,pt %icc, 4f 3044 nop 3045 3046 BLD_FPQ2Q4_FROMSTACK(%o2) 3047 3048 ba,pt %ncc, 1f 3049 wr %o3, 0, %fprs ! restore fprs 3050 3051 4: 3052 FZEROQ2Q4 3053 wr %o3, 0, %fprs ! restore fprs 3054 3055 1: 3056 membar #Sync ! sync error barrier 3057 andn %l6, FPUSED_FLAG, %l6 3058 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 3059 FP_ALLOWMIGRATE(5, 6) 3060 ret 3061 restore %g0, 0, %o0 3062 /* 3063 * We got here because of a fault during copyin 3064 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). 3065 */ 3066 .copyin_err: 3067 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 3068 tst %o4 3069 bz,pt %ncc, 2f ! if not, return error 3070 nop 3071 ldn [%o4 + CP_COPYIN], %g2 ! if handler, invoke it with 3072 jmp %g2 ! original arguments 3073 restore %g0, 0, %g0 ! dispose of copy window 3074 2: 3075 ret 3076 restore %g0, -1, %o0 ! return error value 3077 3078 3079 SET_SIZE(copyin_more) 3080 3081 ENTRY(xcopyin) 3082 3083 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 3084 bleu,pt %ncc, .xcopyin_small ! go to larger cases 3085 xor %o0, %o1, %o3 ! are src, dst alignable? 3086 btst 7, %o3 ! 3087 bz,pt %ncc, .xcopyin_8 ! check for longword alignment 3088 nop 3089 btst 1, %o3 ! 3090 bz,pt %ncc, .xcopyin_2 ! check for half-word 3091 nop 3092 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 3093 ld [%o3 + %lo(hw_copy_limit_1)], %o3 3094 tst %o3 3095 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 3096 cmp %o2, %o3 ! if length <= limit 3097 bleu,pt %ncc, .xcopyin_small ! go to small copy 3098 nop 3099 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 3100 nop 3101 .xcopyin_2: 3102 btst 3, %o3 ! 3103 bz,pt %ncc, .xcopyin_4 ! check for word alignment 3104 nop 3105 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 3106 ld [%o3 + %lo(hw_copy_limit_2)], %o3 3107 tst %o3 3108 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 3109 cmp %o2, %o3 ! if length <= limit 3110 bleu,pt %ncc, .xcopyin_small ! go to small copy 3111 nop 3112 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 3113 nop 3114 .xcopyin_4: 3115 ! already checked longword, must be word aligned 3116 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 3117 ld [%o3 + %lo(hw_copy_limit_4)], %o3 3118 tst %o3 3119 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 3120 cmp %o2, %o3 ! if length <= limit 3121 bleu,pt %ncc, .xcopyin_small ! go to small copy 3122 nop 3123 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 3124 nop 3125 .xcopyin_8: 3126 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 3127 ld [%o3 + %lo(hw_copy_limit_8)], %o3 3128 tst %o3 3129 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 3130 cmp %o2, %o3 ! if length <= limit 3131 bleu,pt %ncc, .xcopyin_small ! go to small copy 3132 nop 3133 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 3134 nop 3135 3136 .xcopyin_small: 3137 sethi %hi(.sm_xcopyin_err), %o5 ! .sm_xcopyin_err is lofault value 3138 or %o5, %lo(.sm_xcopyin_err), %o5 3139 ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofaul 3140 membar #Sync ! sync error barrier 3141 ba,pt %ncc, .sm_do_copyin ! common code 3142 stn %o5, [THREAD_REG + T_LOFAULT] 3143 3144 .xcopyin_more: 3145 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 3146 sethi %hi(.xcopyin_err), REAL_LOFAULT ! 
.xcopyin_err is lofault value 3147 ba,pt %ncc, .do_copyin 3148 or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT 3149 3150 /* 3151 * We got here because of fault during xcopyin 3152 * Errno value is in ERRNO 3153 */ 3154 .xcopyin_err: 3155 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 3156 tst %o4 3157 bz,pt %ncc, 2f ! if not, return error 3158 nop 3159 ldn [%o4 + CP_XCOPYIN], %g2 ! if handler, invoke it with 3160 jmp %g2 ! original arguments 3161 restore %g0, 0, %g0 ! dispose of copy window 3162 2: 3163 ret 3164 restore ERRNO, 0, %o0 ! return errno value 3165 3166 .sm_xcopyin_err: 3167 3168 membar #Sync 3169 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 3170 mov SM_SAVE_SRC, %o0 3171 mov SM_SAVE_DST, %o1 3172 mov SM_SAVE_COUNT, %o2 3173 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler 3174 tst %o3 3175 bz,pt %ncc, 3f ! if not, return error 3176 nop 3177 ldn [%o3 + CP_XCOPYIN], %o5 ! if handler, invoke it with 3178 jmp %o5 ! original arguments 3179 nop 3180 3: 3181 retl 3182 or %g1, 0, %o0 ! return errno value 3183 3184 SET_SIZE(xcopyin) 3185 3186 ENTRY(xcopyin_little) 3187 sethi %hi(.xcopyio_err), %o5 3188 or %o5, %lo(.xcopyio_err), %o5 3189 ldn [THREAD_REG + T_LOFAULT], %o4 3190 membar #Sync ! sync error barrier 3191 stn %o5, [THREAD_REG + T_LOFAULT] 3192 mov %o4, %o5 3193 3194 subcc %g0, %o2, %o3 3195 add %o0, %o2, %o0 3196 bz,pn %ncc, 2f ! check for zero bytes 3197 sub %o2, 1, %o4 3198 add %o0, %o4, %o0 ! start w/last byte 3199 add %o1, %o2, %o1 3200 lduba [%o0 + %o3]ASI_AIUSL, %o4 3201 3202 1: stb %o4, [%o1 + %o3] 3203 inccc %o3 3204 sub %o0, 2, %o0 ! get next byte 3205 bcc,a,pt %ncc, 1b 3206 lduba [%o0 + %o3]ASI_AIUSL, %o4 3207 3208 2: 3209 membar #Sync ! sync error barrier 3210 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 3211 retl 3212 mov %g0, %o0 ! return (0) 3213 3214 .xcopyio_err: 3215 membar #Sync ! sync error barrier 3216 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 3217 retl 3218 mov %g1, %o0 3219 3220 SET_SIZE(xcopyin_little) 3221 3222 3223 /* 3224 * Copy a block of storage - must not overlap (from + len <= to). 3225 * No fault handler installed (to be called under on_fault()) 3226 */ 3227 ENTRY(copyin_noerr) 3228 3229 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 3230 bleu,pt %ncc, .copyin_ne_small ! go to larger cases 3231 xor %o0, %o1, %o3 ! are src, dst alignable? 3232 btst 7, %o3 ! 3233 bz,pt %ncc, .copyin_ne_8 ! check for longword alignment 3234 nop 3235 btst 1, %o3 ! 3236 bz,pt %ncc, .copyin_ne_2 ! check for half-word 3237 nop 3238 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 3239 ld [%o3 + %lo(hw_copy_limit_1)], %o3 3240 tst %o3 3241 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 3242 cmp %o2, %o3 ! if length <= limit 3243 bleu,pt %ncc, .copyin_ne_small ! go to small copy 3244 nop 3245 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 3246 nop 3247 .copyin_ne_2: 3248 btst 3, %o3 ! 3249 bz,pt %ncc, .copyin_ne_4 ! check for word alignment 3250 nop 3251 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 3252 ld [%o3 + %lo(hw_copy_limit_2)], %o3 3253 tst %o3 3254 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 3255 cmp %o2, %o3 ! if length <= limit 3256 bleu,pt %ncc, .copyin_ne_small ! go to small copy 3257 nop 3258 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 3259 nop 3260 .copyin_ne_4: 3261 ! already checked longword, must be word aligned 3262 sethi %hi(hw_copy_limit_4), %o3 ! 
Check copy limit 3263 ld [%o3 + %lo(hw_copy_limit_4)], %o3 3264 tst %o3 3265 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 3266 cmp %o2, %o3 ! if length <= limit 3267 bleu,pt %ncc, .copyin_ne_small ! go to small copy 3268 nop 3269 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 3270 nop 3271 .copyin_ne_8: 3272 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 3273 ld [%o3 + %lo(hw_copy_limit_8)], %o3 3274 tst %o3 3275 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 3276 cmp %o2, %o3 ! if length <= limit 3277 bleu,pt %ncc, .copyin_ne_small ! go to small copy 3278 nop 3279 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 3280 nop 3281 3282 .copyin_ne_small: 3283 ldn [THREAD_REG + T_LOFAULT], %o4 3284 tst %o4 3285 bz,pn %ncc, .sm_do_copyin 3286 nop 3287 sethi %hi(.sm_copyio_noerr), %o5 3288 or %o5, %lo(.sm_copyio_noerr), %o5 3289 membar #Sync ! sync error barrier 3290 ba,pt %ncc, .sm_do_copyin 3291 stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault 3292 3293 .copyin_noerr_more: 3294 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 3295 sethi %hi(.copyio_noerr), REAL_LOFAULT 3296 ba,pt %ncc, .do_copyin 3297 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT 3298 3299 .copyio_noerr: 3300 jmp %l6 3301 restore %g0,0,%g0 3302 3303 .sm_copyio_noerr: 3304 membar #Sync 3305 stn %o4, [THREAD_REG + T_LOFAULT] ! restore t_lofault 3306 jmp %o4 3307 nop 3308 3309 SET_SIZE(copyin_noerr) 3310 3311 /* 3312 * Copy a block of storage - must not overlap (from + len <= to). 3313 * No fault handler installed (to be called under on_fault()) 3314 */ 3315 3316 ENTRY(copyout_noerr) 3317 3318 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 3319 bleu,pt %ncc, .copyout_ne_small ! go to larger cases 3320 xor %o0, %o1, %o3 ! are src, dst alignable? 3321 btst 7, %o3 ! 3322 bz,pt %ncc, .copyout_ne_8 ! check for longword alignment 3323 nop 3324 btst 1, %o3 ! 3325 bz,pt %ncc, .copyout_ne_2 ! check for half-word 3326 nop 3327 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 3328 ld [%o3 + %lo(hw_copy_limit_1)], %o3 3329 tst %o3 3330 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 3331 cmp %o2, %o3 ! if length <= limit 3332 bleu,pt %ncc, .copyout_ne_small ! go to small copy 3333 nop 3334 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 3335 nop 3336 .copyout_ne_2: 3337 btst 3, %o3 ! 3338 bz,pt %ncc, .copyout_ne_4 ! check for word alignment 3339 nop 3340 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 3341 ld [%o3 + %lo(hw_copy_limit_2)], %o3 3342 tst %o3 3343 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 3344 cmp %o2, %o3 ! if length <= limit 3345 bleu,pt %ncc, .copyout_ne_small ! go to small copy 3346 nop 3347 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 3348 nop 3349 .copyout_ne_4: 3350 ! already checked longword, must be word aligned 3351 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 3352 ld [%o3 + %lo(hw_copy_limit_4)], %o3 3353 tst %o3 3354 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 3355 cmp %o2, %o3 ! if length <= limit 3356 bleu,pt %ncc, .copyout_ne_small ! go to small copy 3357 nop 3358 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 3359 nop 3360 .copyout_ne_8: 3361 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 3362 ld [%o3 + %lo(hw_copy_limit_8)], %o3 3363 tst %o3 3364 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 3365 cmp %o2, %o3 ! if length <= limit 3366 bleu,pt %ncc, .copyout_ne_small ! go to small copy 3367 nop 3368 ba,pt %ncc, .copyout_noerr_more ! 
otherwise go to large copy 3369 nop 3370 3371 .copyout_ne_small: 3372 ldn [THREAD_REG + T_LOFAULT], %o4 3373 tst %o4 3374 bz,pn %ncc, .sm_do_copyout 3375 nop 3376 sethi %hi(.sm_copyio_noerr), %o5 3377 or %o5, %lo(.sm_copyio_noerr), %o5 3378 membar #Sync ! sync error barrier 3379 ba,pt %ncc, .sm_do_copyout 3380 stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault 3381 3382 .copyout_noerr_more: 3383 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 3384 sethi %hi(.copyio_noerr), REAL_LOFAULT 3385 ba,pt %ncc, .do_copyout 3386 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT 3387 3388 SET_SIZE(copyout_noerr) 3389 3390 3391 /* 3392 * hwblkclr - clears block-aligned, block-multiple-sized regions that are 3393 * longer than 256 bytes in length using spitfire's block stores. If 3394 * the criteria for using this routine are not met then it calls bzero 3395 * and returns 1. Otherwise 0 is returned indicating success. 3396 * Caller is responsible for ensuring use_hw_bzero is true and that 3397 * kpreempt_disable() has been called. 3398 */ 3399 ! %i0 - start address 3400 ! %i1 - length of region (multiple of 64) 3401 ! %l0 - saved fprs 3402 ! %l1 - pointer to saved %d0 block 3403 ! %l2 - saved curthread->t_lwp 3404 3405 ENTRY(hwblkclr) 3406 ! get another window w/space for one aligned block of saved fpregs 3407 save %sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp 3408 3409 ! Must be block-aligned 3410 andcc %i0, (VIS_BLOCKSIZE-1), %g0 3411 bnz,pn %ncc, 1f 3412 nop 3413 3414 ! ... and must be 256 bytes or more 3415 cmp %i1, 256 3416 blu,pn %ncc, 1f 3417 nop 3418 3419 ! ... and length must be a multiple of VIS_BLOCKSIZE 3420 andcc %i1, (VIS_BLOCKSIZE-1), %g0 3421 bz,pn %ncc, 2f 3422 nop 3423 3424 1: ! punt, call bzero but notify the caller that bzero was used 3425 mov %i0, %o0 3426 call bzero 3427 mov %i1, %o1 3428 ret 3429 restore %g0, 1, %o0 ! return (1) - did not use block operations 3430 3431 2: rd %fprs, %l0 ! check for unused fp 3432 btst FPRS_FEF, %l0 3433 bz,pt %icc, 1f 3434 nop 3435 3436 ! save in-use fpregs on stack 3437 membar #Sync 3438 add %fp, STACK_BIAS - 65, %l1 3439 and %l1, -VIS_BLOCKSIZE, %l1 3440 stda %d0, [%l1]ASI_BLK_P 3441 3442 1: membar #StoreStore|#StoreLoad|#LoadStore 3443 wr %g0, FPRS_FEF, %fprs 3444 wr %g0, ASI_BLK_P, %asi 3445 3446 ! Clear block 3447 fzero %d0 3448 fzero %d2 3449 fzero %d4 3450 fzero %d6 3451 fzero %d8 3452 fzero %d10 3453 fzero %d12 3454 fzero %d14 3455 3456 mov 256, %i3 3457 ba,pt %ncc, .pz_doblock 3458 nop 3459 3460 .pz_blkstart: 3461 ! stda %d0, [%i0 + 192]%asi ! in dly slot of branch that got us here 3462 stda %d0, [%i0 + 128]%asi 3463 stda %d0, [%i0 + 64]%asi 3464 stda %d0, [%i0]%asi 3465 .pz_zinst: 3466 add %i0, %i3, %i0 3467 sub %i1, %i3, %i1 3468 .pz_doblock: 3469 cmp %i1, 256 3470 bgeu,a %ncc, .pz_blkstart 3471 stda %d0, [%i0 + 192]%asi 3472 3473 cmp %i1, 64 3474 blu %ncc, .pz_finish 3475 3476 andn %i1, (64-1), %i3 3477 srl %i3, 4, %i2 ! using blocks, 1 instr / 16 words 3478 set .pz_zinst, %i4 3479 sub %i4, %i2, %i4 3480 jmp %i4 3481 nop 3482 3483 .pz_finish: 3484 membar #Sync 3485 btst FPRS_FEF, %l0 3486 bz,a .pz_finished 3487 wr %l0, 0, %fprs ! restore fprs 3488 3489 ! restore fpregs from stack 3490 ldda [%l1]ASI_BLK_P, %d0 3491 membar #Sync 3492 wr %l0, 0, %fprs ! restore fprs 3493 3494 .pz_finished: 3495 ret 3496 restore %g0, 0, %o0 ! return (bzero or not) 3497 3498 SET_SIZE(hwblkclr) 3499 3500 /* 3501 * Copy 32 bytes of data from src (%o0) to dst (%o1) 3502 * using physical addresses. 
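 *
 * Interrupts are disabled across the copy so the four 8-byte moves
 * are not broken up by a trap.  A rough C-style sketch of the effect
 * (ldxa_mem/stxa_mem are hypothetical stand-ins for the ASI_MEM
 * accesses, which operate on physical, not virtual, addresses):
 *
 *	disable_interrupts();			! wrpr %pstate
 *	for (i = 0; i < 4; i++)
 *		tmp[i] = ldxa_mem(src + 8 * i);	! ldxa [..]ASI_MEM
 *	membar #Sync;
 *	for (i = 0; i < 4; i++)
 *		stxa_mem(dst + 8 * i, tmp[i]);	! stxa [..]ASI_MEM
 *	restore_interrupts();			! done in the delay slot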
3503 */ 3504 ENTRY_NP(hw_pa_bcopy32) 3505 rdpr %pstate, %g1 3506 andn %g1, PSTATE_IE, %g2 3507 wrpr %g0, %g2, %pstate 3508 3509 rdpr %pstate, %g0 3510 ldxa [%o0]ASI_MEM, %o2 3511 add %o0, 8, %o0 3512 ldxa [%o0]ASI_MEM, %o3 3513 add %o0, 8, %o0 3514 ldxa [%o0]ASI_MEM, %o4 3515 add %o0, 8, %o0 3516 ldxa [%o0]ASI_MEM, %o5 3517 membar #Sync 3518 3519 stxa %o2, [%o1]ASI_MEM 3520 add %o1, 8, %o1 3521 stxa %o3, [%o1]ASI_MEM 3522 add %o1, 8, %o1 3523 stxa %o4, [%o1]ASI_MEM 3524 add %o1, 8, %o1 3525 stxa %o5, [%o1]ASI_MEM 3526 3527 retl 3528 wrpr %g0, %g1, %pstate 3529 3530 SET_SIZE(hw_pa_bcopy32) 3531 3532 DGDEF(use_hw_bcopy) 3533 .word 1 3534 DGDEF(use_hw_bzero) 3535 .word 1 3536 DGDEF(hw_copy_limit_1) 3537 .word 0 3538 DGDEF(hw_copy_limit_2) 3539 .word 0 3540 DGDEF(hw_copy_limit_4) 3541 .word 0 3542 DGDEF(hw_copy_limit_8) 3543 .word 0 3544 3545 .align 64 3546 .section ".text"
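/*
 * The DGDEF words above are the tunables consulted by the dispatch
 * code at each copy entry point.  A hedged sketch of the test they
 * drive, in the style of the pseudo-code at the top of this file
 * ("limit" and "length" are our names):
 *
 *	limit = hw_copy_limit_N;	! N = 1, 2, 4 or 8 by alignment
 *	if (limit == 0)			! zero disables the FP/VIS path
 *		use the leaf (non-FP) copy loops;
 *	else if (length <= limit)
 *		use the leaf (non-FP) copy loops;
 *	else
 *		use the FPBLK (FP/VIS block) copy;
 *
 * use_hw_bcopy and use_hw_bzero default to 1, but the limits default
 * to 0 here, so the FPBLK path stays off unless something sizes the
 * limits for the platform later (presumably startup code; only the
 * zero-means-disabled behavior is stated in this file).
 */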