/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/asm_linkage.h>
#include <sys/vtrace.h>
#include <sys/machthread.h>
#include <sys/clock.h>
#include <sys/asi.h>
#include <sys/fsr.h>
#include <sys/privregs.h>
#include <sys/fpras_impl.h>

#if !defined(lint)
#include "assym.h"
#endif	/* lint */

/*
 * Pseudo-code to aid in understanding the control flow of the
 * bcopy/copyin/copyout routines.
 *
 * On entry:
 *
 *	! Determine whether to use the FP register version
 *	! or the leaf routine version depending on size
 *	! of copy and flags.  Set up error handling accordingly.
 *	! The transition point depends on whether the src and
 *	! dst addresses can be aligned to long word, word,
 *	! half word, or byte boundaries.
 *	!
 *	! WARNING: <Register usage convention>
 *	! For FP version, %l6 holds previous error handling and
 *	! a flag: TRAMP_FLAG (low bits)
 *	! For leaf routine version, %o4 holds those values.
 *	! So either %l6 or %o4 is reserved and not available for
 *	! any other use.
 *
 *	if (length <= VIS_COPY_THRESHOLD)	! start with a quick test
 *		go to small_copy;		! to speed short copies
 *
 *	if (src,dst long word alignable) {
 *		if (hw_copy_limit_8 == 0)	! hw_copy disabled
 *			go to small_copy;
 *		if (length <= hw_copy_limit_8)
 *			go to small_copy;
 *		go to FPBLK_copy;
 *	}
 *	if (src,dst not alignable) {
 *		if (hw_copy_limit_1 == 0)	! hw_copy disabled
 *			go to small_copy;
 *		if (length <= hw_copy_limit_1)
 *			go to small_copy;
 *		go to FPBLK_copy;
 *	}
 *	if (src,dst halfword alignable) {
 *		if (hw_copy_limit_2 == 0)	! hw_copy disabled
 *			go to small_copy;
 *		if (length <= hw_copy_limit_2)
 *			go to small_copy;
 *		go to FPBLK_copy;
 *	}
 *	if (src,dst word alignable) {
 *		if (hw_copy_limit_4 == 0)	! hw_copy disabled
 *			go to small_copy;
 *		if (length <= hw_copy_limit_4)
 *			go to small_copy;
 *		go to FPBLK_copy;
 *	}
 *
 * small_copy:
 *	Setup_leaf_rtn_error_handler;		! diffs for each entry point
 *
 *	if (count <= 3)				! fast path for tiny copies
 *		go to sm_left;			! special finish up code
 *	else
 *		if (count > CHKSIZE)		! medium sized copies
 *			go to sm_med		! tuned by alignment
 *	if (src&dst not both word aligned) {
 *	sm_movebytes:
 *		move byte by byte in 4-way unrolled loop
 *		fall into sm_left;
 *	sm_left:
 *		move 0-3 bytes byte at a time as needed.
 *		restore error handler and exit.
 *
 *	} else {	! src&dst are word aligned
 *		check for at least 8 bytes left,
 *		move word at a time, unrolled by 2
 *		when fewer than 8 bytes left,
 *	sm_half:	move half word at a time while 2 or more bytes left
 *	sm_byte:	move final byte if necessary
 *	sm_exit:
 *		restore error handler and exit.
 *	}
 *
 * ! Medium length cases with at least CHKSIZE bytes available
 * ! method: line up src and dst as best possible, then
 * ! move data in 4-way unrolled loops.
 *
 * sm_med:
 *	if (src&dst unalignable)
 *		go to sm_movebytes
 *	if (src&dst halfword alignable)
 *		go to sm_movehalf
 *	if (src&dst word alignable)
 *		go to sm_moveword
 * ! fall into long word movement
 *	move bytes until src is word aligned
 *	if not long word aligned, move a word
 *	move long words in 4-way unrolled loop until < 32 bytes left
 *	move long words in 1-way unrolled loop until < 8 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 * sm_moveword:
 *	move bytes until src is word aligned
 *	move words in 4-way unrolled loop until < 16 bytes left
 *	move words in 1-way unrolled loop until < 4 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 * sm_movehalf:
 *	move a byte if needed to align src on halfword
 *	move halfwords in 4-way unrolled loop until < 8 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 *
 * FPBLK_copy:
 *	%l6 = curthread->t_lofault;
 *	if (%l6 != NULL) {
 *		membar #Sync
 *		curthread->t_lofault = .copyerr;
 *		caller_error_handler = TRUE		! %l6 |= 2
 *	}
 *
 *	! for FPU testing we must not migrate cpus
 *	if (curthread->t_lwp == NULL) {
 *		! Kernel threads do not have pcb's in which to store
 *		! the floating point state, so disallow preemption during
 *		! the copy.  This also prevents cpu migration.
 *		kpreempt_disable(curthread);
 *	} else {
 *		thread_nomigrate();
 *	}
 *
 *	old_fprs = %fprs;
 *	old_gsr = %gsr;
 *	if (%fprs.fef) {
 *		%fprs.fef = 1;
 *		save current fpregs on stack using blockstore
 *	} else {
 *		%fprs.fef = 1;
 *	}
 *
 *
 *	do_blockcopy_here;
 *
 * In lofault handler:
 *	curthread->t_lofault = .copyerr2;
 *	Continue on with the normal exit handler
 *
 * On normal exit:
 *	%gsr = old_gsr;
 *	if (old_fprs & FPRS_FEF)
 *		restore fpregs from stack using blockload
 *	else
 *		zero fpregs
 *	%fprs = old_fprs;
 *	membar #Sync
 *	curthread->t_lofault = (%l6 & ~3);
 *	! following test omitted from copyin/copyout as they
 *	! will always have a current thread
 *	if (curthread->t_lwp == NULL)
 *		kpreempt_enable(curthread);
 *	else
 *		thread_allowmigrate();
 *	return (0)
 *
 * In second lofault handler (.copyerr2):
 *	We've tried to restore fp state from the stack and failed.  To
 *	prevent returning with corrupted fp state, we will panic.
 */
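/*
 * An illustrative C sketch (not part of the build) of the alignment
 * classification that the entry points below perform with
 * "xor %o0, %o1, %o3" and btst: the xor of src and dst has zero
 * low-order bits exactly where the two addresses can be brought to a
 * common boundary, so a single xor serves to classify both operands.
 *
 *	uintptr_t x = src ^ dst;
 *	uint_t limit;
 *
 *	if ((x & 7) == 0)
 *		limit = hw_copy_limit_8;	! long word alignable
 *	else if ((x & 1) != 0)
 *		limit = hw_copy_limit_1;	! byte aligned only
 *	else if ((x & 3) != 0)
 *		limit = hw_copy_limit_2;	! halfword alignable
 *	else
 *		limit = hw_copy_limit_4;	! word alignable
 *
 *	if (limit == 0 || length <= limit)
 *		small_copy();			! leaf routine path
 *	else
 *		FPBLK_copy();			! FP register path
 */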
/*
 * Comments about optimization choices
 *
 * The initial optimization decision in this code is to determine
 * whether to use the FP registers for a copy or not.  If we don't
 * use the FP registers, we can execute the copy as a leaf routine,
 * saving a register save and restore.  Also, less elaborate setup
 * is required, allowing short copies to be completed more quickly.
 * For longer copies, especially unaligned ones (where the src and
 * dst do not align to allow simple ldx,stx operation), the FP
 * registers allow much faster copy operations.
 *
 * The estimated extra cost of the FP path will vary depending on
 * src/dst alignment, dst offset from the next 64 byte FPblock store
 * boundary, remaining src data after the last full dst cache line is
 * moved, whether the FP registers need to be saved, and some other
 * minor issues.  The average additional overhead is estimated to be
 * 400 clocks.  Since each non-repeated/predicted tst and branch costs
 * around 10 clocks, elaborate calculation would slow down all
 * longer copies and only benefit a small portion of medium sized
 * copies.  Rather than incur such cost, we chose fixed transition
 * points for each of the alignment choices.
 *
 * For the inner loop, here is a comparison of the per cache line
 * costs for each alignment when src&dst are in cache:
 *
 *	byte aligned:	108 clocks slower for non-FPBLK
 *	half aligned:	 44 clocks slower for non-FPBLK
 *	word aligned:	 12 clocks slower for non-FPBLK
 *	long aligned:	  4 clocks >>faster<< for non-FPBLK
 *
 * The long aligned loop runs faster because it does no prefetching.
 * That wins if the data is not in cache or there is too little
 * data to gain much benefit from prefetching.  But when there
 * is more data and that data is not in cache, failing to prefetch
 * can run much slower.  In addition, there is a 2 Kbyte store queue
 * which will cause the non-FPBLK inner loop to slow for larger copies.
 * The exact tradeoff is strongly load and application dependent, with
 * increasing risk of a customer-visible performance regression if the
 * non-FPBLK code is used for larger copies.  Studies of synthetic in-cache
 * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
 * upper limit for the non-FPBLK code.  To minimize performance regression
 * risk while still gaining the primary benefits of the improvements to
 * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
 * hw_copy_limit_*.  Later experimental studies using different values
 * of hw_copy_limit_* can be used to make further adjustments if
 * appropriate.
 *
 *	hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
 *	hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
 *	hw_copy_limit_4 = src and dst are word aligned but not longword aligned
 *	hw_copy_limit_8 = src and dst are longword aligned
 *
 * To say that src and dst are word aligned means that after
 * some initial alignment activity of moving 0 to 3 bytes,
 * both the src and dst will be on word boundaries so that
 * word loads and stores may be used.
 *
 * Recommended initial values as of Mar 2004, includes testing
 * on Cheetah+ (900MHz), Cheetah++ (1200MHz), and Jaguar (1050MHz):
 *	hw_copy_limit_1 =  256
 *	hw_copy_limit_2 =  512
 *	hw_copy_limit_4 = 1024
 *	hw_copy_limit_8 = 1024 (or 1536 on some systems)
 *
 *
 * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
 * disabled for that alignment choice.
 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256)
 * the value of VIS_COPY_THRESHOLD is used.
 * It is not envisioned that hw_copy_limit_? will be changed in the field.
 * It is provided to allow for disabling FPBLK copies and to allow
 * easy testing of alternate values on future HW implementations
 * that might have different cache sizes, clock rates or instruction
 * timing rules.
 *
 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
 * threshold to speed up all shorter copies (less than 256).  That
 * saves an alignment test, memory reference, and enabling test
 * for all short copies, or an estimated 24 clocks.
 *
 * The order in which these limits are checked does matter since each
 * non-predicted tst and branch costs around 10 clocks.
 * If src and dst are randomly selected addresses,
 * 4 of 8 will not be alignable.
 * 2 of 8 will be half word alignable.
 * 1 of 8 will be word alignable.
 * 1 of 8 will be long word alignable.
 * But, tests on running kernels show that src and dst to copy code
 * are typically not on random alignments.  Structure copies and
 * copies of larger data sizes are often on long word boundaries.
 * So we test the long word alignment case first, then
 * the byte alignment, then halfword, then word alignment.
 *
 * Several times, tests for length are made to split the code
 * into subcases.  These tests often allow later tests to be
 * avoided.  For example, within the non-FPBLK copy, we first
 * check for tiny copies of 3 bytes or less.  That allows us
 * to use a 4-way unrolled loop for the general byte copy case
 * without a test on loop entry.
 * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
 * vs longer cases.  For the really short case, we don't attempt to
 * align src and dst.  We try to minimize special case tests in
 * the shortest loops as each test adds a significant percentage
 * to the total time.
 *
 * For the medium sized cases, we allow ourselves to adjust the
 * src and dst alignment and provide special cases for each of
 * the four adjusted alignment cases.  The CHKSIZE that was used
 * to decide between short and medium size was chosen to be 39
 * as that allows for the worst case of 7 bytes of alignment
 * shift and 4 times 8 bytes for the first long word unrolling.
 * That knowledge saves an initial test for length on entry into
 * the medium cases.  If the general loop unrolling factor were
 * to be increased, this number would also need to be adjusted.
 *
 * For all cases in the non-FPBLK code where it is known that at
 * least 4 chunks of data are available for movement, the
 * loop is unrolled by four.  This 4-way loop runs in 8 clocks
 * or 2 clocks per data element.  Due to limitations of the
 * branch instruction on Cheetah, Jaguar, and Panther, the
 * minimum time for a small, tight loop is 3 clocks.  So
 * the 4-way loop runs 50% faster than the fastest non-unrolled
 * loop.
 *
 * Instruction alignment is forced by use of .align 16 directives
 * and nops which are not executed in the code.  This
 * combination of operations shifts the alignment of following
 * loops to ensure that loops are aligned so that their instructions
 * fall within the minimum number of 4 instruction fetch groups.
 * If instructions are inserted or removed between the .align
 * instruction and the unrolled loops, then the alignment needs
 * to be readjusted.  Misaligned loops can add a clock per loop
 * iteration to the loop timing.
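 *
 * As a concrete illustration (a C-like sketch only, not compiled), the
 * 4-way unrolled long word loop in .bc_med_lmove below behaves as if:
 *
 *	count -= 31;			! bias so the cc test is a sign test
 *	do {
 *		((long *)dst)[0] = ((long *)src)[0];
 *		((long *)dst)[1] = ((long *)src)[1];
 *		((long *)dst)[2] = ((long *)src)[2];
 *		((long *)dst)[3] = ((long *)src)[3];
 *		src += 32; dst += 32; count -= 32;
 *	} while (count > 0);		! i.e. while > 31 bytes remained
 *
 * (The assembly then re-biases count for the following 1-way loop
 * rather than fully restoring it.)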
 *
 * In a few cases, code is duplicated to avoid a branch.  Since
 * a non-predicted tst and branch takes 10 clocks, this savings
 * is judged an appropriate time-space tradeoff.
 *
 * Within the FPBLK-code, the prefetch method in the inner
 * loop needs to be explained as it is not standard.  Two
 * prefetches are issued for each cache line instead of one.
 * The primary one is at the maximum reach of 8 cache lines.
 * Most of the time, that maximum prefetch reach gives the
 * cache line more time to reach the processor for systems with
 * higher processor clocks.  But, sometimes memory interference
 * can cause that prefetch to be dropped.  Putting a second
 * prefetch at a reach of 5 cache lines catches the drops
 * three iterations later and shows a measured improvement
 * in performance over any similar loop with a single prefetch.
 * The prefetches are placed in the loop so they overlap with
 * non-memory instructions, so that there is no extra cost
 * when the data is already in-cache.
 *
 */

/*
 * Notes on preserving existing fp state and on membars.
 *
 * When a copyOP decides to use fp we may have to preserve existing
 * floating point state.  It is not the caller's state that we need to
 * preserve - the rest of the kernel does not use fp and, anyway, fp
 * registers are volatile across a call.  Some examples:
 *
 *	- userland has fp state and is interrupted (device interrupt
 *	  or trap) and within the interrupt/trap handling we use
 *	  bcopy()
 *	- another (higher level) interrupt or trap handler uses bcopy
 *	  while a bcopy from an earlier interrupt is still active
 *	- an asynchronous error trap occurs while fp state exists (in
 *	  userland or in kernel copy) and the tl0 component of the
 *	  handling uses bcopy
 *	- a user process with fp state incurs a copy-on-write fault and
 *	  hwblkpagecopy always uses fp
 *
 * We therefore need a per-call place in which to preserve fp state -
 * using our stack is ideal (and since fp copy cannot be leaf optimized
 * because of calls it makes, this is no hardship).
 *
 * The following membar BLD/BST discussion is Cheetah pipeline specific.
 * In Cheetah BLD is blocking, #LoadLoad/#LoadStore/#StoreStore are
 * nops (those semantics always apply) and #StoreLoad is implemented
 * as a membar #Sync.
 *
 * It is possible that the owner of the fp state has a block load or
 * block store still "in flight" at the time we come to preserve that
 * state.  Block loads are blocking in Cheetah pipelines so we do not
 * need to sync with them.  In preserving fp regs we will use block stores
 * (which are not blocking in Cheetah pipelines) so we require a membar #Sync
 * after storing state (so that our subsequent use of those registers
 * does not modify them before the block stores complete); this membar
 * also serves to sync with block stores the owner of the fp state has
 * initiated.
 *
 * When we have finished fp copy (with its repeated block stores)
 * we must membar #Sync so that our block stores may complete before
 * we either restore the original fp state into the fp registers or
 * return to a caller which may initiate other fp operations that could
 * modify the fp regs we used before the block stores complete.
 *
 * Synchronous faults (eg, unresolvable DMMU miss) that occur while
 * t_lofault is not NULL will not panic but will instead trampoline
 * to the registered lofault handler.  There is no need for any
 * membars for these - eg, our store to t_lofault will always be visible to
 * ourselves and it is our cpu which will take any trap.
 *
 * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
 * while t_lofault is not NULL will also not panic.  Since we're copying
 * to or from userland the extent of the damage is known - the destination
 * buffer is incomplete.  So trap handlers will trampoline to the lofault
 * handler in this case which should take some form of error action to
 * avoid using the incomplete buffer.  The trap handler also flags the
 * fault so that later return-from-trap handling (for the trap that brought
 * this thread into the kernel in the first place) can notify the process
 * and reboot the system (or restart the service with Greenline/Contracts).
 *
 * Asynchronous faults (eg, uncorrectable ECC error from memory) can
 * result in deferred error traps - the trap is taken sometime after
 * the event and the trap PC may not be the PC of the faulting access.
 * Delivery of such pending traps can be forced by a membar #Sync, acting
 * as an "error barrier" in this role.  To accurately apply the user/kernel
 * separation described in the preceding paragraph we must force delivery
 * of deferred traps affecting kernel state before we install a lofault
 * handler (if we interpose a new lofault handler on an existing one there
 * is no need to repeat this), and we must force delivery of deferred
 * errors affecting the lofault-protected region before we clear t_lofault.
 * Failure to do so results in lost kernel state being interpreted as
 * affecting a copyin/copyout only, or in an error that really only
 * affects copy data being interpreted as losing kernel state.
 *
 * Since the copy operations may preserve and later restore floating
 * point state that does not belong to the caller (see examples above),
 * we must be careful in how we do this in order to prevent corruption
 * of another program.
 *
 * To make sure that floating point state is always saved and restored
 * correctly, the following "big rules" must be followed when the floating
 * point registers will be used:
 *
 * 1. %l6 always holds the caller's lofault handler.  Also in this register,
 *    Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
 *    use.  Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a
 *    lofault handler was set coming in.
 *
 * 2. The FPUSED flag indicates that all FP state has been successfully stored
 *    on the stack.  It should not be set until this save has been completed.
 *
 * 3. The FPUSED flag should not be cleared on exit until all FP state has
 *    been restored from the stack.  If an error occurs while restoring
 *    data from the stack, the error handler can check this flag to see if
 *    a restore is necessary.
 *
 * 4. Code run under the new lofault handler must be kept to a minimum.  In
 *    particular, any calls to FP_ALLOWMIGRATE, which could result in a call
 *    to kpreempt(), should not be made until after the lofault handler has
 *    been restored.
 */
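/*
 * Taken together, the rules above amount to the following sequence (an
 * illustrative C-like model only; the real bookkeeping is in the
 * assembly below):
 *
 *	l6 = curthread->t_lofault;		! rule 1
 *	if (called_from_bcopy && l6 != NULL)
 *		l6 |= TRAMP_FLAG;
 *	curthread->t_lofault = new_handler;
 *	save_fp_state_on_stack();
 *	l6 |= FPUSED_FLAG;			! rule 2: only after the save
 *	... block copy ...
 *	restore_fp_state_from_stack();		! rule 3: flag still set here
 *	membar #Sync;
 *	curthread->t_lofault = l6 & ~MASK_FLAGS;
 *	FP_ALLOWMIGRATE();			! rule 4: only after restore
 */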
/*
 * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
 * to "break even" using FP/VIS-accelerated memory operations.
 * The FPBLK code assumes a minimum number of bytes are available
 * to be moved on entry.  Check that code carefully before
 * reducing VIS_COPY_THRESHOLD below 256.
 */
/*
 * This shadows sys/machsystm.h which can't be included due to the lack of
 * _ASM guards in include files it references.  Change it here, change it there.
 */
#define	VIS_COPY_THRESHOLD	256

/*
 * TEST for very short copies
 * Be aware that the maximum unroll for the short unaligned case
 * is SHORTCOPY+1
 */
#define	SHORTCOPY	3
#define	CHKSIZE		39

/*
 * Indicates that we're to trampoline to the error handler.
 * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
 * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
 */
#define	FPUSED_FLAG	1
#define	TRAMP_FLAG	2
#define	MASK_FLAGS	3

/*
 * Number of outstanding prefetches.
 * Testing with 1200 MHz Cheetah+ and Jaguar gives best results with
 * two prefetches, one with a reach of 8*BLOCK_SIZE+8 and one with a
 * reach of 5*BLOCK_SIZE.  The double prefetch gives a typical improvement
 * of 5% for large copies as compared to a single prefetch.  The reason
 * for the improvement is that with Cheetah and Jaguar, some prefetches
 * are dropped due to the prefetch queue being full.  The second prefetch
 * reduces the number of cache lines that are dropped.
 * Do not remove the double prefetch or change either CHEETAH_PREFETCH
 * or CHEETAH_2ND_PREFETCH without extensive performance tests to prove
 * there is no loss of performance.
 */
#define	CHEETAH_PREFETCH	8
#define	CHEETAH_2ND_PREFETCH	5

#define	VIS_BLOCKSIZE		64

/*
 * Size of stack frame in order to accommodate a 64-byte aligned
 * floating-point register save area and 2 64-bit temp locations.
 * All copy functions use two quadrants of fp registers; to assure a
 * block-aligned two block buffer in which to save we must reserve
 * three blocks on stack.  Not all functions preserve %fprs on stack
 * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
 *
 *    _______________________________________ <-- %fp + STACK_BIAS
 *    | We may need to preserve 2 quadrants |
 *    | of fp regs, but since we do so with |
 *    | BST/BLD we need room in which to    |
 *    | align to VIS_BLOCKSIZE bytes.  So   |
 *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
 *    ---------------------------------------
 */
#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
#define	SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 3)
#define	SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 2) - 1)
#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)
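/*
 * For reference, with VIS_BLOCKSIZE at 64 the definitions above work
 * out to (this arithmetic is implied by the macros, nothing new):
 *
 *	HWCOPYFRAMESIZE     = 64 * 3 + 2 * 8 = 208
 *	SAVED_FPREGS_OFFSET = 64 * 3         = 192
 *	SAVED_FPREGS_ADJUST = 64 * 2 - 1     = 127
 *	SAVED_FPRS_OFFSET   = 192 + 8        = 200
 *	SAVED_GSR_OFFSET    = 200 + 8        = 208
 *
 * The save/restore macros below compute
 *	(%fp + STACK_BIAS - SAVED_FPREGS_ADJUST) & -VIS_BLOCKSIZE
 * which always yields a 64-byte aligned, 128-byte (two block) save
 * area lying wholly within the 192-byte region reserved below
 * %fp + STACK_BIAS.
 */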
/*
 * Common macros used by the various versions of the block copy
 * routines in this file.
 */

/*
 * In FP copies if we do not have preserved data to restore over
 * the fp regs we used then we must zero those regs to avoid
 * exposing portions of the data to later threads (data security).
 *
 * Copy functions use either quadrants 1 and 3 or 2 and 4.
 *
 * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
 * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
 *
 * The instructions below are quicker than repeated fzero instructions
 * since they can dispatch down two fp pipelines.
 */
#define	FZEROQ1Q3			\
	fzero	%f0			;\
	fzero	%f2			;\
	faddd	%f0, %f2, %f4		;\
	fmuld	%f0, %f2, %f6		;\
	faddd	%f0, %f2, %f8		;\
	fmuld	%f0, %f2, %f10		;\
	faddd	%f0, %f2, %f12		;\
	fmuld	%f0, %f2, %f14		;\
	faddd	%f0, %f2, %f32		;\
	fmuld	%f0, %f2, %f34		;\
	faddd	%f0, %f2, %f36		;\
	fmuld	%f0, %f2, %f38		;\
	faddd	%f0, %f2, %f40		;\
	fmuld	%f0, %f2, %f42		;\
	faddd	%f0, %f2, %f44		;\
	fmuld	%f0, %f2, %f46

#define	FZEROQ2Q4			\
	fzero	%f16			;\
	fzero	%f18			;\
	faddd	%f16, %f18, %f20	;\
	fmuld	%f16, %f18, %f22	;\
	faddd	%f16, %f18, %f24	;\
	fmuld	%f16, %f18, %f26	;\
	faddd	%f16, %f18, %f28	;\
	fmuld	%f16, %f18, %f30	;\
	faddd	%f16, %f18, %f48	;\
	fmuld	%f16, %f18, %f50	;\
	faddd	%f16, %f18, %f52	;\
	fmuld	%f16, %f18, %f54	;\
	faddd	%f16, %f18, %f56	;\
	fmuld	%f16, %f18, %f58	;\
	faddd	%f16, %f18, %f60	;\
	fmuld	%f16, %f18, %f62

/*
 * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
 * Used to save and restore in-use fp registers when we want to use FP
 * and find fp already in use and copy size still large enough to justify
 * the additional overhead of this save and restore.
 *
 * A membar #Sync is needed before save to sync fp ops initiated before
 * the call to the copy function (by whoever has fp in use); for example
 * an earlier block load to the quadrant we are about to save may still be
 * "in flight".  A membar #Sync is required at the end of the save to
 * sync our block store (the copy code is about to begin ldd's to the
 * first quadrant).  Note, however, that since Cheetah pipeline block load
 * is blocking we can omit the initial membar before saving fp state (they're
 * commented below in case of future porting to a chip that does not block
 * on block load).
 *
 * Similarly: a membar #Sync before restore allows the block stores of
 * the copy operation to complete before we fill the quadrants with their
 * original data, and a membar #Sync after restore lets the block loads
 * of the restore complete before we return to whoever has the fp regs
 * in use.  To avoid repeated membar #Sync we make it the responsibility
 * of the copy code to membar #Sync immediately after copy is complete
 * and before using the BLD_*_FROMSTACK macro.
 */
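/*
 * In outline (mirroring the macros that follow), the full ordering
 * when live fp state must be preserved around a copy is:
 *
 *	! membar #Sync		not needed on Cheetah since BLD blocks
 *	BST_FPQ?Q?_TOSTACK	! block stores of the old state
 *	membar #Sync		! stores complete before our first ldd
 *	... FP copy, which ends with its own membar #Sync ...
 *	BLD_FPQ?Q?_FROMSTACK	! block loads of the old state
 *	membar #Sync		! loads complete before fp owner resumes
 */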
#if !defined(lint)
#define	BST_FPQ1Q3_TOSTACK(tmp1)				\
	/* membar #Sync */					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f0, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f32, [tmp1]ASI_BLK_P				;\
	membar	#Sync

#define	BLD_FPQ1Q3_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f0				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f32				;\
	membar	#Sync

#define	BST_FPQ2Q4_TOSTACK(tmp1)				\
	/* membar #Sync */					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f16, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f48, [tmp1]ASI_BLK_P				;\
	membar	#Sync

#define	BLD_FPQ2Q4_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f16				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f48				;\
	membar	#Sync
#endif

/*
 * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
 * prevent preemption if there is no t_lwp to save FP state to on context
 * switch) before commencing a FP copy, and reallow it on completion or
 * in error trampoline paths when we were using FP copy.
 *
 * Both macros may call other functions, so be aware that all outputs are
 * forfeit after using these macros.  For this reason we do not pass registers
 * to use - we just use any outputs we want.
 *
 * For fpRAS we need to perform the fpRAS mechanism test on the same
 * CPU as we use for the copy operation, both so that we validate the
 * CPU we perform the copy on and so that we know which CPU failed
 * if a failure is detected.  Hence we need to be bound to "our" CPU.
 * This could be achieved through disabling preemption (and we do it that
 * way for threads with no t_lwp) but for larger copies this may hold
 * higher priority threads off of cpu for too long (eg, realtime).  So we
 * make use of the lightweight t_nomigrate mechanism where we can (ie, when
 * we have a t_lwp).
 *
 * Pseudo code:
 *
 * FP_NOMIGRATE:
 *
 *	if (curthread->t_lwp) {
 *		thread_nomigrate();
 *	} else {
 *		kpreempt_disable();
 *	}
 *
 * FP_ALLOWMIGRATE:
 *
 *	if (curthread->t_lwp) {
 *		thread_allowmigrate();
 *	} else {
 *		kpreempt_enable();
 *	}
 */
#define	FP_NOMIGRATE(label1, label2)				\
	ldn	[THREAD_REG + T_LWP], %o0			;\
	brz,a,pn %o0, label1/**/f				;\
	ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
	call	thread_nomigrate				;\
	nop							;\
	ba	label2/**/f					;\
	nop							;\
label1:								;\
	inc	%o1						;\
	stb	%o1, [THREAD_REG + T_PREEMPT]			;\
label2:

#define	FP_ALLOWMIGRATE(label1, label2)				\
	ldn	[THREAD_REG + T_LWP], %o0			;\
	brz,a,pn %o0, label1/**/f				;\
	ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
	call	thread_allowmigrate				;\
	nop							;\
	ba	label2/**/f					;\
	nop							;\
label1:								;\
	dec	%o1						;\
	brnz,pn	%o1, label2/**/f				;\
	stb	%o1, [THREAD_REG + T_PREEMPT]			;\
	ldn	[THREAD_REG + T_CPU], %o0			;\
	ldub	[%o0 + CPU_KPRUNRUN], %o0			;\
	brz,pt	%o0, label2/**/f				;\
	nop							;\
	call	kpreempt					;\
	rdpr	%pil, %o0					;\
label2:

/*
 * Copy a block of storage, returning an error code if `from' or
 * `to' takes a kernel pagefault which cannot be resolved.
 * Returns errno value on pagefault error, 0 if all ok
 */

#if defined(lint)

/* ARGSUSED */
int
kcopy(const void *from, void *to, size_t count)
{ return (0); }

#else	/* lint */

	.seg	".text"
	.align	4

	ENTRY(kcopy)

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .kcopy_small		! go to small copy cases
	xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .kcopy_8			! check for longword alignment
	nop
	btst	1, %o3				!
	bz,pt	%ncc, .kcopy_2			! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	nop
.kcopy_2:
	btst	3, %o3				!
	bz,pt	%ncc, .kcopy_4			! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	nop
.kcopy_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	nop
.kcopy_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	nop

.kcopy_small:
	sethi	%hi(.sm_copyerr), %o5		! sm_copyerr is lofault value
	or	%o5, %lo(.sm_copyerr), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .sm_do_copy		! common code
	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault

.kcopy_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
	or	%l7, %lo(.copyerr), %l7
	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .do_copy			! common code
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault


/*
 * We got here because of a fault during bcopy_more, called from kcopy
 * or bcopy.  Errno value is in %g1.  bcopy_more uses fp quadrants 1 and 3.
 */
.copyerr:
	set	.copyerr2, %l0
	membar	#Sync				! sync error barrier
	stn	%l0, [THREAD_REG + T_LOFAULT]	! set t_lofault
	btst	FPUSED_FLAG, %l6
	bz	%ncc, 1f
	and	%l6, TRAMP_FLAG, %l0		! copy trampoline flag to %l0

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	nop

	BLD_FPQ1Q3_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	wr	%o3, 0, %fprs			! restore fprs

4:
	FZEROQ1Q3
	wr	%o3, 0, %fprs			! restore fprs

	!
	! Need to cater for the different expectations of kcopy
	! and bcopy.  kcopy will *always* set a t_lofault handler.
	! If it fires, we're expected to just return the error code
	! and *not* to invoke any existing error handler.  As far as
	! bcopy is concerned, we only set t_lofault if there was an
	! existing lofault handler.  In that case we're expected to
	! invoke the previously existing handler after resetting the
	! t_lofault value.
	!
1:
	andn	%l6, MASK_FLAGS, %l6		! turn trampoline flag off
	membar	#Sync				! sync error barrier
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)

	btst	TRAMP_FLAG, %l0
	bnz,pn	%ncc, 3f
	nop
	ret
	restore	%g1, 0, %o0

3:
	!
	! We're here via bcopy.  There *must* have been an error handler
	! in place otherwise we would have died a nasty death already.
	!
	jmp	%l6				! goto real handler
	restore	%g0, 0, %o0			! dispose of copy window

/*
 * We got here because of a fault in .copyerr.  We can't safely restore fp
 * state, so we panic.
 */
fp_panic_msg:
	.asciz	"Unable to restore fp state after copy operation"

	.align	4
.copyerr2:
	set	fp_panic_msg, %o0
	call	panic
	nop

/*
 * We got here because of a fault during a small kcopy or bcopy.
 * No floating point registers are used by the small copies.
 * Errno value is in %g1.
 */
.sm_copyerr:
1:
	btst	TRAMP_FLAG, %o4
	membar	#Sync
	andn	%o4, TRAMP_FLAG, %o4
	bnz,pn	%ncc, 3f
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g1, %o0
3:
	jmp	%o4				! goto real handler
	mov	%g0, %o0			!

	SET_SIZE(kcopy)
#endif	/* lint */


/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * Registers: l6 - saved t_lofault
 * (for short copies, o4 - saved t_lofault)
 */
#if defined(lint)

/* ARGSUSED */
void
bcopy(const void *from, void *to, size_t count)
{}

#else	/* lint */

	ENTRY(bcopy)

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .bcopy_small		! go to small copy cases
	xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .bcopy_8			! check for longword alignment
	nop
	btst	1, %o3				!
	bz,pt	%ncc, .bcopy_2			! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	nop
.bcopy_2:
	btst	3, %o3				!
	bz,pt	%ncc, .bcopy_4			! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	nop
.bcopy_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	nop
.bcopy_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	nop

	.align	16
.bcopy_small:
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save t_lofault
	tst	%o4
	bz,pt	%icc, .sm_do_copy
	nop
	sethi	%hi(.sm_copyerr), %o5
	or	%o5, %lo(.sm_copyerr), %o5
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! install new vector
	or	%o4, TRAMP_FLAG, %o4		! error should trampoline
.sm_do_copy:
	cmp	%o2, SHORTCOPY		! check for really short case
	bleu,pt	%ncc, .bc_sm_left	!
	cmp	%o2, CHKSIZE		! check for medium length cases
	bgu,pn	%ncc, .bc_med		!
	or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .bc_sm_word	! branch to word aligned case
.bc_sm_movebytes:
	sub	%o2, 3, %o2		! adjust count to allow cc zero test
.bc_sm_notalign4:
	ldub	[%o0], %o3		! read byte
	stb	%o3, [%o1]		! write byte
	subcc	%o2, 4, %o2		! reduce count by 4
	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
	add	%o0, 4, %o0		! advance SRC by 4
	stb	%o3, [%o1 + 1]
	ldub	[%o0 - 2], %o3
	add	%o1, 4, %o1		! advance DST by 4
	stb	%o3, [%o1 - 2]
	ldub	[%o0 - 1], %o3
	bgt,pt	%ncc, .bc_sm_notalign4	! loop til 3 or fewer bytes remain
	stb	%o3, [%o1 - 1]
	add	%o2, 3, %o2		! restore count
.bc_sm_left:
	tst	%o2
	bz,pt	%ncc, .bc_sm_exit	! check for zero length
	deccc	%o2			! reduce count for cc test
	ldub	[%o0], %o3		! move one byte
	bz,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1]
	ldub	[%o0 + 1], %o3		! move another byte
	deccc	%o2			! check for more
	bz,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1 + 1]
	ldub	[%o0 + 2], %o3		! move final byte
	stb	%o3, [%o1 + 2]
	membar	#Sync			! sync error barrier
	andn	%o4, TRAMP_FLAG, %o4
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0
	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.bc_sm_words:
	lduw	[%o0], %o3		! read word
.bc_sm_wordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o1]		! write word
	add	%o0, 8, %o0		! update SRC
	lduw	[%o0 - 4], %o3		! read word
	add	%o1, 8, %o1		! update DST
	bgt,pt	%ncc, .bc_sm_words	! loop til done
	stw	%o3, [%o1 - 4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
.bc_sm_half:
	subcc	%o2, 2, %o2		! reduce count by 2
	add	%o0, 2, %o0		! advance SRC by 2
	lduh	[%o0 - 2], %o3		! read half word
	add	%o1, 2, %o1		! advance DST by 2
	bgt,pt	%ncc, .bc_sm_half	! loop til done
	sth	%o3, [%o1 - 2]		! write half word
	addcc	%o2, 1, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	nop
.bc_sm_byte:
	ldub	[%o0], %o3
	stb	%o3, [%o1]
	membar	#Sync			! sync error barrier
	andn	%o4, TRAMP_FLAG, %o4
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0

.bc_sm_word:
	subcc	%o2, 4, %o2		! update count
	bgt,pt	%ncc, .bc_sm_wordx
	lduw	[%o0], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	stw	%o3, [%o1]		! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o0 + 4], %o3		! load one byte
	bz,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1 + 4]		! store one byte
	ldub	[%o0 + 5], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1 + 5]		! store second byte
	ldub	[%o0 + 6], %o3		! load third byte
	stb	%o3, [%o1 + 6]		! store third byte
.bc_sm_exit:
	membar	#Sync			! sync error barrier
	andn	%o4, TRAMP_FLAG, %o4
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0

	.align	16
.bc_med:
	xor	%o0, %o1, %o3		! setup alignment check
	btst	1, %o3
	bnz,pt	%ncc, .bc_sm_movebytes	! unaligned
	nop
	btst	3, %o3
	bnz,pt	%ncc, .bc_med_half	! halfword aligned
	nop
	btst	7, %o3
	bnz,pt	%ncc, .bc_med_word	! word aligned
	nop
.bc_med_long:
	btst	3, %o0			! check for
	bz,pt	%ncc, .bc_med_long1	! word alignment
	nop
.bc_med_long0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3, [%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .bc_med_long0
	dec	%o2
.bc_med_long1:				! word aligned
	btst	7, %o0			! check for long word
	bz,pt	%ncc, .bc_med_long2
	nop
	lduw	[%o0], %o3		! load word
	add	%o0, 4, %o0		! advance SRC by 4
	stw	%o3, [%o1]		! store word
	add	%o1, 4, %o1		! advance DST by 4
	sub	%o2, 4, %o2		! reduce count by 4
!
!  Now long word aligned and have at least 32 bytes to move
!
.bc_med_long2:
	sub	%o2, 31, %o2		! adjust count to allow cc zero test
.bc_med_lmove:
	ldx	[%o0], %o3		! read long word
	stx	%o3, [%o1]		! write long word
	subcc	%o2, 32, %o2		! reduce count by 32
	ldx	[%o0 + 8], %o3		! repeat for a total of 4 long words
	add	%o0, 32, %o0		! advance SRC by 32
	stx	%o3, [%o1 + 8]
	ldx	[%o0 - 16], %o3
	add	%o1, 32, %o1		! advance DST by 32
	stx	%o3, [%o1 - 16]
	ldx	[%o0 - 8], %o3
	bgt,pt	%ncc, .bc_med_lmove	! loop til 31 or fewer bytes left
	stx	%o3, [%o1 - 8]
	addcc	%o2, 24, %o2		! restore count to long word offset
	ble,pt	%ncc, .bc_med_lextra	! check for more long words to move
	nop
.bc_med_lword:
	ldx	[%o0], %o3		! read long word
	subcc	%o2, 8, %o2		! reduce count by 8
	stx	%o3, [%o1]		! write long word
	add	%o0, 8, %o0		! advance SRC by 8
	bgt,pt	%ncc, .bc_med_lword	! loop til 7 or fewer bytes left
	add	%o1, 8, %o1		! advance DST by 8
.bc_med_lextra:
	addcc	%o2, 7, %o2		! restore rest of count
	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	nop
	ba,pt	%ncc, .bc_sm_half
	nop

	.align	16
.bc_med_word:
	btst	3, %o0			! check for
	bz,pt	%ncc, .bc_med_word1	! word alignment
	nop
.bc_med_word0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3, [%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .bc_med_word0
	dec	%o2
!
!  Now word aligned and have at least 36 bytes to move
!
.bc_med_word1:
	sub	%o2, 15, %o2		! adjust count to allow cc zero test
.bc_med_wmove:
	lduw	[%o0], %o3		! read word
	stw	%o3, [%o1]		! write word
	subcc	%o2, 16, %o2		! reduce count by 16
	lduw	[%o0 + 4], %o3		! repeat for a total of 4 words
	add	%o0, 16, %o0		! advance SRC by 16
	stw	%o3, [%o1 + 4]
	lduw	[%o0 - 8], %o3
	add	%o1, 16, %o1		! advance DST by 16
	stw	%o3, [%o1 - 8]
	lduw	[%o0 - 4], %o3
	bgt,pt	%ncc, .bc_med_wmove	! loop til 15 or fewer bytes left
	stw	%o3, [%o1 - 4]
	addcc	%o2, 12, %o2		! restore count to word offset
	ble,pt	%ncc, .bc_med_wextra	! check for more words to move
	nop
.bc_med_word2:
	lduw	[%o0], %o3		! read word
	subcc	%o2, 4, %o2		! reduce count by 4
	stw	%o3, [%o1]		! write word
	add	%o0, 4, %o0		! advance SRC by 4
	bgt,pt	%ncc, .bc_med_word2	! loop til 3 or fewer bytes left
	add	%o1, 4, %o1		! advance DST by 4
.bc_med_wextra:
	addcc	%o2, 3, %o2		! restore rest of count
	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	nop
	ba,pt	%ncc, .bc_sm_half
	nop

	.align	16
.bc_med_half:
	btst	1, %o0			! check for
	bz,pt	%ncc, .bc_med_half1	! half word alignment
	nop
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3, [%o1]		! store byte
	inc	%o1
	dec	%o2
!
!  Now half word aligned and have at least 38 bytes to move
!
.bc_med_half1:
	sub	%o2, 7, %o2		! adjust count to allow cc zero test
.bc_med_hmove:
	lduh	[%o0], %o3		! read half word
	sth	%o3, [%o1]		! write half word
	subcc	%o2, 8, %o2		! reduce count by 8
	lduh	[%o0 + 2], %o3		! repeat for a total of 4 halfwords
	add	%o0, 8, %o0		! advance SRC by 8
	sth	%o3, [%o1 + 2]
	lduh	[%o0 - 4], %o3
	add	%o1, 8, %o1		! advance DST by 8
	sth	%o3, [%o1 - 4]
	lduh	[%o0 - 2], %o3
	bgt,pt	%ncc, .bc_med_hmove	! loop til 7 or fewer bytes left
	sth	%o3, [%o1 - 2]
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	nop
	ba,pt	%ncc, .bc_sm_half
	nop

	SET_SIZE(bcopy)

/*
 * The _more entry points are not intended to be used directly by
 * any caller from outside this file.  They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
 * the floating point registers.
 * This entry is particularly important as DTRACE (at least as of
 * 4/2004) does not support leaf functions.
 */
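/*
 * As an illustrative C-like summary (not compiled), the error return
 * policy that .copyerr above implements for this shared path is:
 *
 *	curthread->t_lofault = l6 & ~MASK_FLAGS;
 *	if (l6 & TRAMP_FLAG)
 *		goto *(l6 & ~MASK_FLAGS);	! bcopy: old handler runs
 *	else
 *		return (errno);			! kcopy: report the error
 */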
	ENTRY(bcopy_more)
.bcopy_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	ldn	[THREAD_REG + T_LOFAULT], %l6	! save t_lofault
	tst	%l6
	bz,pt	%ncc, .do_copy
	nop
	sethi	%hi(.copyerr), %o2
	or	%o2, %lo(.copyerr), %o2
	membar	#Sync				! sync error barrier
	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
	!
	! We've already captured whether t_lofault was zero on entry.
	! We need to mark ourselves as being from bcopy since both
	! kcopy and bcopy use the same code path.  If TRAMP_FLAG is set
	! and the saved lofault was zero, we won't reset lofault on
	! returning.
	!
	or	%l6, TRAMP_FLAG, %l6

/*
 * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes.
 * Also, it has already been determined that use of the FP registers
 * is enabled.
 */
.do_copy:
	FP_NOMIGRATE(6, 7)

	rd	%fprs, %o2		! check for unused fp
	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
	btst	FPRS_FEF, %o2
	bz,a,pt	%icc, .do_blockcopy
	wr	%g0, FPRS_FEF, %fprs

	BST_FPQ1Q3_TOSTACK(%o2)

.do_blockcopy:
	rd	%gsr, %o2
	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
	or	%l6, FPUSED_FLAG, %l6

#define	REALSRC	%i0
#define	DST	%i1
#define	CNT	%i2
#define	SRC	%i3
#define	TMP	%i5

	andcc	DST, VIS_BLOCKSIZE - 1, TMP
	bz,pt	%ncc, 2f
	neg	TMP
	add	TMP, VIS_BLOCKSIZE, TMP

	! TMP = bytes required to align DST on FP_BLOCK boundary
	! Using SRC as a tmp here
	cmp	TMP, 3
	bleu,pt	%ncc, 1f
	sub	CNT, TMP, CNT		! adjust main count
	sub	TMP, 3, TMP		! adjust for end of loop test
.bc_blkalign:
	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
	stb	SRC, [DST]
	subcc	TMP, 4, TMP
	ldub	[REALSRC + 1], SRC
	add	REALSRC, 4, REALSRC
	stb	SRC, [DST + 1]
	ldub	[REALSRC - 2], SRC
	add	DST, 4, DST
	stb	SRC, [DST - 2]
	ldub	[REALSRC - 1], SRC
	bgu,pt	%ncc, .bc_blkalign
	stb	SRC, [DST - 1]

	addcc	TMP, 3, TMP		! restore count adjustment
	bz,pt	%ncc, 2f		! no bytes left?
	nop
1:	ldub	[REALSRC], SRC
	inc	REALSRC
	inc	DST
	deccc	TMP
	bgu	%ncc, 1b
	stb	SRC, [DST - 1]

2:
	andn	REALSRC, 0x7, SRC
	alignaddr REALSRC, %g0, %g0

	! SRC - 8-byte aligned
	! DST - 64-byte aligned
	prefetch [SRC], #one_read
	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
	ldd	[SRC], %f0
#if CHEETAH_PREFETCH > 4
	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
#endif
	ldd	[SRC + 0x08], %f2
#if CHEETAH_PREFETCH > 5
	prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
#endif
	ldd	[SRC + 0x10], %f4
#if CHEETAH_PREFETCH > 6
	prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
#endif
	faligndata %f0, %f2, %f32
	ldd	[SRC + 0x18], %f6
#if CHEETAH_PREFETCH > 7
	prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
#endif
	faligndata %f2, %f4, %f34
	ldd	[SRC + 0x20], %f8
	faligndata %f4, %f6, %f36
	ldd	[SRC + 0x28], %f10
	faligndata %f6, %f8, %f38
	ldd	[SRC + 0x30], %f12
	faligndata %f8, %f10, %f40
	ldd	[SRC + 0x38], %f14
	faligndata %f10, %f12, %f42
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	ba,a,pt	%ncc, 1f
	nop
	.align	16
1:
	ldd	[SRC + 0x08], %f2
	faligndata %f12, %f14, %f44
	ldd	[SRC + 0x10], %f4
	faligndata %f14, %f0, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	faligndata %f0, %f2, %f32
	ldd	[SRC + 0x20], %f8
	faligndata %f2, %f4, %f34
	ldd	[SRC + 0x28], %f10
	faligndata %f4, %f6, %f36
	ldd	[SRC + 0x30], %f12
	faligndata %f6, %f8, %f38
	ldd	[SRC + 0x38], %f14
	faligndata %f8, %f10, %f40
	sub	CNT, VIS_BLOCKSIZE, CNT
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	faligndata %f10, %f12, %f42
	prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
	add	DST, VIS_BLOCKSIZE, DST
	prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	cmp	CNT, VIS_BLOCKSIZE + 8
	bgu,pt	%ncc, 1b
	add	SRC, VIS_BLOCKSIZE, SRC

	! only if REALSRC & 0x7 is 0
	cmp	CNT, VIS_BLOCKSIZE
	bne	%ncc, 3f
	andcc	REALSRC, 0x7, %g0
	bz,pt	%ncc, 2f
	nop
3:
	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,pt	%ncc, 3f
	nop
2:
	ldd	[SRC + 0x08], %f2
	fsrc1	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fsrc1	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fsrc1	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fsrc1	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fsrc1	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fsrc1	%f8, %f40
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,a,pt	%ncc, .bcb_exit
	nop

3:	tst	CNT
	bz,a,pt	%ncc, .bcb_exit
	nop

5:	ldub	[REALSRC], TMP
	inc	REALSRC
	inc	DST
	deccc	CNT
	bgu	%ncc, 5b
	stb	TMP, [DST - 1]
.bcb_exit:
	membar	#Sync

	FPRAS_INTERVAL(FPRAS_BCOPY, 0, %l5, %o2, %o3, %o4, %o5, 8)
	FPRAS_REWRITE_TYPE2Q1(0, %l5, %o2, %o3, 8, 9)
	FPRAS_CHECK(FPRAS_BCOPY, %l5, 9)	! outputs lost

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	nop

	BLD_FPQ1Q3_FROMSTACK(%o2)

	ba,pt	%ncc, 2f
	wr	%o3, 0, %fprs		! restore fprs
4:
	FZEROQ1Q3
	wr	%o3, 0, %fprs		! restore fprs
2:
	membar	#Sync			! sync error barrier
	andn	%l6, MASK_FLAGS, %l6
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)
	ret
	restore	%g0, 0, %o0

	SET_SIZE(bcopy_more)

#endif	/* lint */

/*
 * Block copy with possibly overlapped operands.
 */

#if defined(lint)

/*ARGSUSED*/
void
ovbcopy(const void *from, void *to, size_t count)
{}

#else	/* lint */

	ENTRY(ovbcopy)
	tst	%o2			! check count
	bgu,a	%ncc, 1f		! nothing to do or bad arguments
	subcc	%o0, %o1, %o3		! difference of from and to address

	retl				! return
	nop
1:
	bneg,a	%ncc, 2f
	neg	%o3			! if < 0, make it positive
2:	cmp	%o2, %o3		! cmp size and abs(from - to)
	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
	.empty				!   no overlap
	cmp	%o0, %o1		! compare from and to addresses
	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
	nop
	!
	! Copy forwards.
	!
.ov_fwd:
	ldub	[%o0], %o3		! read from address
	inc	%o0			! inc from address
	stb	%o3, [%o1]		! write to address
	deccc	%o2			! dec count
	bgu	%ncc, .ov_fwd		! loop till done
	inc	%o1			! inc to address

	retl				! return
	nop
	!
	! Copy backwards.
	!
.ov_bkwd:
	deccc	%o2			! dec count
	ldub	[%o0 + %o2], %o3	! get byte at end of src
	bgu	%ncc, .ov_bkwd		! loop till done
	stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst

	retl				! return
	nop

	SET_SIZE(ovbcopy)

#endif	/* lint */
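/*
 * An equivalent C restatement (illustrative only) of the dispatch
 * performed by ovbcopy above:
 *
 *	if (count > 0) {
 *		diff = (from > to) ? from - to : to - from;
 *		if (count <= diff)
 *			bcopy(from, to, count);	! no overlap possible
 *		else if (from >= to)
 *			copy forward, a byte at a time;
 *		else
 *			copy backward from the high ends;
 *	}
 */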
/*
 * hwblkpagecopy()
 *
 * Copies exactly one page.  This routine assumes the caller (ppcopy)
 * has already disabled kernel preemption and has checked
 * use_hw_bcopy.  Preventing preemption also prevents cpu migration.
 */
#ifdef lint
/*ARGSUSED*/
void
hwblkpagecopy(const void *src, void *dst)
{ }
#else	/* lint */
	ENTRY(hwblkpagecopy)
	! get another window w/space for three aligned blocks of saved fpregs
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp

	! %i0 - source address (arg)
	! %i1 - destination address (arg)
	! %i2 - length of region (not arg)
	! %l0 - saved fprs
	! %l1 - pointer to saved fpregs

	rd	%fprs, %l0		! check for unused fp
	btst	FPRS_FEF, %l0
	bz,a,pt	%icc, 1f
	wr	%g0, FPRS_FEF, %fprs

	BST_FPQ1Q3_TOSTACK(%l1)

1:	set	PAGESIZE, CNT
	mov	REALSRC, SRC

	prefetch [SRC], #one_read
	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
	ldd	[SRC], %f0
#if CHEETAH_PREFETCH > 4
	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
#endif
	ldd	[SRC + 0x08], %f2
#if CHEETAH_PREFETCH > 5
	prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
#endif
	ldd	[SRC + 0x10], %f4
#if CHEETAH_PREFETCH > 6
	prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
#endif
	fsrc1	%f0, %f32
	ldd	[SRC + 0x18], %f6
#if CHEETAH_PREFETCH > 7
	prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
#endif
	fsrc1	%f2, %f34
	ldd	[SRC + 0x20], %f8
	fsrc1	%f4, %f36
	ldd	[SRC + 0x28], %f10
	fsrc1	%f6, %f38
	ldd	[SRC + 0x30], %f12
	fsrc1	%f8, %f40
	ldd	[SRC + 0x38], %f14
	fsrc1	%f10, %f42
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	ba,a,pt	%ncc, 2f
	nop
	.align	16
2:
	ldd	[SRC + 0x08], %f2
	fsrc1	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fsrc1	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fsrc1	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fsrc1	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fsrc1	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fsrc1	%f8, %f40
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	fsrc1	%f10, %f42
	prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	cmp	CNT, VIS_BLOCKSIZE + 8
	prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
	bgu,pt	%ncc, 2b
	add	SRC, VIS_BLOCKSIZE, SRC

	! trailing block
	ldd	[SRC + 0x08], %f2
	fsrc1	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fsrc1	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fsrc1	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fsrc1	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fsrc1	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fsrc1	%f8, %f40
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P

	membar	#Sync

	FPRAS_INTERVAL(FPRAS_PGCOPY, 1, %l5, %o2, %o3, %o4, %o5, 8)
	FPRAS_REWRITE_TYPE1(1, %l5, %f32, %o2, 9)
	FPRAS_CHECK(FPRAS_PGCOPY, %l5, 9)	! lose outputs

	btst	FPRS_FEF, %l0
	bz,pt	%icc, 2f
	nop

	BLD_FPQ1Q3_FROMSTACK(%l3)
	ba	3f
	nop

2:	FZEROQ1Q3

3:	wr	%l0, 0, %fprs		! restore fprs
	ret
	restore	%g0, 0, %o0

	SET_SIZE(hwblkpagecopy)
#endif	/* lint */
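/*
 * Conceptually (an illustrative sketch only), hwblkpagecopy above is:
 *
 *	for (off = 0; off < PAGESIZE; off += VIS_BLOCKSIZE)
 *		64-byte block move from src + off to dst + off;
 *
 * with the ldd's software-pipelined one block ahead of the stda's
 * (hence the separate lead-in and trailing blocks) and the same
 * double prefetch policy as the bcopy block loop.
 */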
1719 * 1720 * So there are two extremely similar routines - xcopyin() and xcopyout() - 1721 * which return the errno that we've faithfully computed. This 1722 * allows other callers (e.g. uiomove(9F)) to work correctly. 1723 * Given that these are used pretty heavily, we expand the calling 1724 * sequences inline for all flavours (rather than making wrappers). 1725 * 1726 * There are also stub routines for xcopyout_little and xcopyin_little, 1727 * which currently are intended to handle requests of <= 16 bytes from 1728 * do_unaligned. Future enhancement to make them handle 8k pages efficiently 1729 * is left as an exercise... 1730 */ 1731 1732 /* 1733 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr) 1734 * 1735 * General theory of operation: 1736 * 1737 * The only difference between copy{in,out} and 1738 * xcopy{in,out} is in the error handling routine they invoke 1739 * when a memory access error occurs. xcopyOP returns the errno 1740 * while copyOP returns -1 (see above). copy{in,out}_noerr set 1741 * a special flag (by oring the TRAMP_FLAG into the fault handler address) 1742 * if they are called with a fault handler already in place. That flag 1743 * causes the default handlers to trampoline to the previous handler 1744 * upon an error. 1745 * 1746 * None of the copyops routines grab a window until it's decided that 1747 * we need to do a HW block copy operation. This saves a window 1748 * spill/fill when we're called during socket ops. The typical IO 1749 * path won't cause spill/fill traps. 1750 * 1751 * This code uses a set of 4 limits for the maximum size that will 1752 * be copied given a particular input/output address alignment. 1753 * If the value for a particular limit is zero, the copy will be performed 1754 * by the plain copy loops rather than FPBLK. 1755 * 1756 * See the description of bcopy above for more details of the 1757 * data copying algorithm and the default limits. 1758 * 1759 */ 1760 1761 /* 1762 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little). 1763 */ 1764 1765 #if defined(lint) 1766 1767 1768 #else /* lint */ 1769 /* 1770 * We save the arguments in the following registers in case of a fault: 1771 * kaddr - %l1 1772 * uaddr - %l2 1773 * count - %l3 1774 */ 1775 #define SAVE_SRC %l1 1776 #define SAVE_DST %l2 1777 #define SAVE_COUNT %l3 1778 1779 #define SM_SAVE_SRC %g4 1780 #define SM_SAVE_DST %g5 1781 #define SM_SAVE_COUNT %o5 1782 #define ERRNO %l5 1783 1784 1785 #define REAL_LOFAULT %l4 1786 /* 1787 * Generic copyio fault handler. This is the first line of defense when a 1788 * fault occurs in (x)copyin/(x)copyout. In order for this to function 1789 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT. 1790 * This allows us to share common code for all the flavors of the copy 1791 * operations, including the _noerr versions. 1792 * 1793 * Note that this function will restore the original input parameters before 1794 * calling REAL_LOFAULT. So the real handler can vector to the appropriate 1795 * member of the t_copyop structure, if needed. 1796 */
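/*
 * A sketch of the handler below, for orientation only (pseudo-code,
 * not the exact instruction sequence):
 *
 *	ERRNO = %g1;				! errno computed by the trap code
 *	if (%l6 & FPUSED_FLAG) {
 *		restore %gsr and %fprs;
 *		reload the saved fp regs, or zero them if fp was unused;
 *	}
 *	clear FPUSED_FLAG; restore t_lofault; allow migration again;
 *	(%i0, %i1, %i2) = (SAVE_SRC, SAVE_DST, SAVE_COUNT);
 *	goto REAL_LOFAULT;
 */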
1797 ENTRY(copyio_fault) 1798 membar #Sync 1799 mov %g1,ERRNO ! save errno in ERRNO 1800 btst FPUSED_FLAG, %l6 1801 bz %ncc, 1f 1802 nop 1803 1804 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 1805 wr %o2, 0, %gsr ! restore gsr 1806 1807 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 1808 btst FPRS_FEF, %o3 1809 bz,pt %icc, 4f 1810 nop 1811 1812 BLD_FPQ2Q4_FROMSTACK(%o2) 1813 1814 ba,pt %ncc, 1f 1815 wr %o3, 0, %fprs ! restore fprs 1816 1817 4: 1818 FZEROQ2Q4 1819 wr %o3, 0, %fprs ! restore fprs 1820 1821 1: 1822 andn %l6, FPUSED_FLAG, %l6 1823 membar #Sync 1824 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1825 FP_ALLOWMIGRATE(5, 6) 1826 1827 mov SAVE_SRC, %i0 1828 mov SAVE_DST, %i1 1829 jmp REAL_LOFAULT 1830 mov SAVE_COUNT, %i2 1831 1832 SET_SIZE(copyio_fault) 1833 1834 1835 #endif 1836 1837 #if defined(lint) 1838 1839 /*ARGSUSED*/ 1840 int 1841 copyout(const void *kaddr, void *uaddr, size_t count) 1842 { return (0); } 1843 1844 #else /* lint */ 1845 1846 ENTRY(copyout) 1847 1848 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 1849 bleu,pt %ncc, .copyout_small ! go to larger cases 1850 xor %o0, %o1, %o3 ! are src, dst alignable? 1851 btst 7, %o3 ! 1852 bz,pt %ncc, .copyout_8 ! check for longword alignment 1853 nop 1854 btst 1, %o3 ! 1855 bz,pt %ncc, .copyout_2 ! check for half-word 1856 nop 1857 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 1858 ld [%o3 + %lo(hw_copy_limit_1)], %o3 1859 tst %o3 1860 bz,pn %icc, .copyout_small ! if zero, disable HW copy 1861 cmp %o2, %o3 ! if length <= limit 1862 bleu,pt %ncc, .copyout_small ! go to small copy 1863 nop 1864 ba,pt %ncc, .copyout_more ! otherwise go to large copy 1865 nop 1866 .copyout_2: 1867 btst 3, %o3 ! 1868 bz,pt %ncc, .copyout_4 ! check for word alignment 1869 nop 1870 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 1871 ld [%o3 + %lo(hw_copy_limit_2)], %o3 1872 tst %o3 1873 bz,pn %icc, .copyout_small ! if zero, disable HW copy 1874 cmp %o2, %o3 ! if length <= limit 1875 bleu,pt %ncc, .copyout_small ! go to small copy 1876 nop 1877 ba,pt %ncc, .copyout_more ! otherwise go to large copy 1878 nop 1879 .copyout_4: 1880 ! already checked longword, must be word aligned 1881 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 1882 ld [%o3 + %lo(hw_copy_limit_4)], %o3 1883 tst %o3 1884 bz,pn %icc, .copyout_small ! if zero, disable HW copy 1885 cmp %o2, %o3 ! if length <= limit 1886 bleu,pt %ncc, .copyout_small ! go to small copy 1887 nop 1888 ba,pt %ncc, .copyout_more ! otherwise go to large copy 1889 nop 1890 .copyout_8: 1891 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 1892 ld [%o3 + %lo(hw_copy_limit_8)], %o3 1893 tst %o3 1894 bz,pn %icc, .copyout_small ! if zero, disable HW copy 1895 cmp %o2, %o3 ! if length <= limit 1896 bleu,pt %ncc, .copyout_small ! go to small copy 1897 nop 1898 ba,pt %ncc, .copyout_more ! otherwise go to large copy 1899 nop 1900 1901 .align 16 1902 nop ! instruction alignment 1903 ! see discussion at start of file 1904 .copyout_small: 1905 sethi %hi(.sm_copyout_err), %o5 ! .sm_copyout_err is lofault 1906 or %o5, %lo(.sm_copyout_err), %o5 1907 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler 1908 membar #Sync ! sync error barrier 1909 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault 1910 .sm_do_copyout: 1911 mov %o0, SM_SAVE_SRC 1912 mov %o1, SM_SAVE_DST 1913 cmp %o2, SHORTCOPY ! check for really short case 1914 bleu,pt %ncc, .co_sm_left ! 1915 mov %o2, SM_SAVE_COUNT 1916 cmp %o2, CHKSIZE ! check for medium length cases 1917 bgu,pn %ncc, .co_med ! 1918 or %o0, %o1, %o3 ! prepare alignment check 1919 andcc %o3, 0x3, %g0 ! test for alignment 1920 bz,pt %ncc, .co_sm_word ! branch to word aligned case 1921 .co_sm_movebytes: 1922 sub %o2, 3, %o2 ! adjust count to allow cc zero test 1923 .co_sm_notalign4: 1924 ldub [%o0], %o3 ! read byte 1925 subcc %o2, 4, %o2 ! reduce count by 4 1926 stba %o3, [%o1]ASI_USER ! write byte 1927 inc %o1 ! advance DST by 1 1928 ldub [%o0 + 1], %o3 !
repeat for a total of 4 bytes 1929 add %o0, 4, %o0 ! advance SRC by 4 1930 stba %o3, [%o1]ASI_USER 1931 inc %o1 ! advance DST by 1 1932 ldub [%o0 - 2], %o3 1933 stba %o3, [%o1]ASI_USER 1934 inc %o1 ! advance DST by 1 1935 ldub [%o0 - 1], %o3 1936 stba %o3, [%o1]ASI_USER 1937 bgt,pt %ncc, .co_sm_notalign4 ! loop til 3 or fewer bytes remain 1938 inc %o1 ! advance DST by 1 1939 add %o2, 3, %o2 ! restore count 1940 .co_sm_left: 1941 tst %o2 1942 bz,pt %ncc, .co_sm_exit ! check for zero length 1943 nop 1944 ldub [%o0], %o3 ! load one byte 1945 deccc %o2 ! reduce count for cc test 1946 bz,pt %ncc, .co_sm_exit 1947 stba %o3,[%o1]ASI_USER ! store one byte 1948 ldub [%o0 + 1], %o3 ! load second byte 1949 deccc %o2 1950 inc %o1 1951 bz,pt %ncc, .co_sm_exit 1952 stba %o3,[%o1]ASI_USER ! store second byte 1953 ldub [%o0 + 2], %o3 ! load third byte 1954 inc %o1 1955 stba %o3,[%o1]ASI_USER ! store third byte 1956 membar #Sync ! sync error barrier 1957 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1958 retl 1959 mov %g0, %o0 ! return 0 1960 .align 16 1961 .co_sm_words: 1962 lduw [%o0], %o3 ! read word 1963 .co_sm_wordx: 1964 subcc %o2, 8, %o2 ! update count 1965 stwa %o3, [%o1]ASI_USER ! write word 1966 add %o0, 8, %o0 ! update SRC 1967 lduw [%o0 - 4], %o3 ! read word 1968 add %o1, 4, %o1 ! update DST 1969 stwa %o3, [%o1]ASI_USER ! write word 1970 bgt,pt %ncc, .co_sm_words ! loop til done 1971 add %o1, 4, %o1 ! update DST 1972 addcc %o2, 7, %o2 ! restore count 1973 bz,pt %ncc, .co_sm_exit 1974 nop 1975 deccc %o2 1976 bz,pt %ncc, .co_sm_byte 1977 .co_sm_half: 1978 subcc %o2, 2, %o2 ! reduce count by 2 1979 lduh [%o0], %o3 ! read half word 1980 add %o0, 2, %o0 ! advance SRC by 2 1981 stha %o3, [%o1]ASI_USER ! write half word 1982 bgt,pt %ncc, .co_sm_half ! loop til done 1983 add %o1, 2, %o1 ! advance DST by 2 1984 addcc %o2, 1, %o2 ! restore count 1985 bz,pt %ncc, .co_sm_exit 1986 nop 1987 .co_sm_byte: 1988 ldub [%o0], %o3 1989 stba %o3, [%o1]ASI_USER 1990 membar #Sync ! sync error barrier 1991 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1992 retl 1993 mov %g0, %o0 ! return 0 1994 .align 16 1995 .co_sm_word: 1996 subcc %o2, 4, %o2 ! update count 1997 bgt,pt %ncc, .co_sm_wordx 1998 lduw [%o0], %o3 ! read word 1999 addcc %o2, 3, %o2 ! restore count 2000 bz,pt %ncc, .co_sm_exit 2001 stwa %o3, [%o1]ASI_USER ! write word 2002 deccc %o2 ! reduce count for cc test 2003 ldub [%o0 + 4], %o3 ! load one byte 2004 add %o1, 4, %o1 2005 bz,pt %ncc, .co_sm_exit 2006 stba %o3, [%o1]ASI_USER ! store one byte 2007 ldub [%o0 + 5], %o3 ! load second byte 2008 deccc %o2 2009 inc %o1 2010 bz,pt %ncc, .co_sm_exit 2011 stba %o3, [%o1]ASI_USER ! store second byte 2012 ldub [%o0 + 6], %o3 ! load third byte 2013 inc %o1 2014 stba %o3, [%o1]ASI_USER ! store third byte 2015 .co_sm_exit: 2016 membar #Sync ! sync error barrier 2017 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2018 retl 2019 mov %g0, %o0 ! return 0 2020 2021 .align 16 2022 .co_med: 2023 xor %o0, %o1, %o3 ! setup alignment check 2024 btst 1, %o3 2025 bnz,pt %ncc, .co_sm_movebytes ! unaligned 2026 nop 2027 btst 3, %o3 2028 bnz,pt %ncc, .co_med_half ! halfword aligned 2029 nop 2030 btst 7, %o3 2031 bnz,pt %ncc, .co_med_word ! word aligned 2032 nop 2033 .co_med_long: 2034 btst 3, %o0 ! check for 2035 bz,pt %ncc, .co_med_long1 ! word alignment 2036 nop 2037 .co_med_long0: 2038 ldub [%o0], %o3 ! load one byte 2039 inc %o0 2040 stba %o3,[%o1]ASI_USER ! 
store byte 2041 inc %o1 2042 btst 3, %o0 2043 bnz,pt %ncc, .co_med_long0 2044 dec %o2 2045 .co_med_long1: ! word aligned 2046 btst 7, %o0 ! check for long word 2047 bz,pt %ncc, .co_med_long2 2048 nop 2049 lduw [%o0], %o3 ! load word 2050 add %o0, 4, %o0 ! advance SRC by 4 2051 stwa %o3, [%o1]ASI_USER ! store word 2052 add %o1, 4, %o1 ! advance DST by 4 2053 sub %o2, 4, %o2 ! reduce count by 4 2054 ! 2055 ! Now long word aligned and have at least 32 bytes to move 2056 ! 2057 .co_med_long2: 2058 sub %o2, 31, %o2 ! adjust count to allow cc zero test 2059 sub %o1, 8, %o1 ! adjust pointer to allow store in 2060 ! branch delay slot instead of add 2061 .co_med_lmove: 2062 add %o1, 8, %o1 ! advance DST by 8 2063 ldx [%o0], %o3 ! read long word 2064 subcc %o2, 32, %o2 ! reduce count by 32 2065 stxa %o3, [%o1]ASI_USER ! write long word 2066 add %o1, 8, %o1 ! advance DST by 8 2067 ldx [%o0 + 8], %o3 ! repeat for a total of 4 long words 2068 add %o0, 32, %o0 ! advance SRC by 32 2069 stxa %o3, [%o1]ASI_USER 2070 ldx [%o0 - 16], %o3 2071 add %o1, 8, %o1 ! advance DST by 8 2072 stxa %o3, [%o1]ASI_USER 2073 ldx [%o0 - 8], %o3 2074 add %o1, 8, %o1 ! advance DST by 8 2075 bgt,pt %ncc, .co_med_lmove ! loop til 31 or fewer bytes left 2076 stxa %o3, [%o1]ASI_USER 2077 add %o1, 8, %o1 ! advance DST by 8 2078 addcc %o2, 24, %o2 ! restore count to long word offset 2079 ble,pt %ncc, .co_med_lextra ! check for more long words to move 2080 nop 2081 .co_med_lword: 2082 ldx [%o0], %o3 ! read long word 2083 subcc %o2, 8, %o2 ! reduce count by 8 2084 stxa %o3, [%o1]ASI_USER ! write long word 2085 add %o0, 8, %o0 ! advance SRC by 8 2086 bgt,pt %ncc, .co_med_lword ! loop til 7 or fewer bytes left 2087 add %o1, 8, %o1 ! advance DST by 8 2088 .co_med_lextra: 2089 addcc %o2, 7, %o2 ! restore rest of count 2090 bz,pt %ncc, .co_sm_exit ! if zero, then done 2091 deccc %o2 2092 bz,pt %ncc, .co_sm_byte 2093 nop 2094 ba,pt %ncc, .co_sm_half 2095 nop 2096 2097 .align 16 2098 nop ! instruction alignment 2099 ! see discussion at start of file 2100 .co_med_word: 2101 btst 3, %o0 ! check for 2102 bz,pt %ncc, .co_med_word1 ! word alignment 2103 nop 2104 .co_med_word0: 2105 ldub [%o0], %o3 ! load one byte 2106 inc %o0 2107 stba %o3,[%o1]ASI_USER ! store byte 2108 inc %o1 2109 btst 3, %o0 2110 bnz,pt %ncc, .co_med_word0 2111 dec %o2 2112 ! 2113 ! Now word aligned and have at least 36 bytes to move 2114 ! 2115 .co_med_word1: 2116 sub %o2, 15, %o2 ! adjust count to allow cc zero test 2117 .co_med_wmove: 2118 lduw [%o0], %o3 ! read word 2119 subcc %o2, 16, %o2 ! reduce count by 16 2120 stwa %o3, [%o1]ASI_USER ! write word 2121 add %o1, 4, %o1 ! advance DST by 4 2122 lduw [%o0 + 4], %o3 ! repeat for a total of 4 words 2123 add %o0, 16, %o0 ! advance SRC by 16 2124 stwa %o3, [%o1]ASI_USER 2125 add %o1, 4, %o1 ! advance DST by 4 2126 lduw [%o0 - 8], %o3 2127 stwa %o3, [%o1]ASI_USER 2128 add %o1, 4, %o1 ! advance DST by 4 2129 lduw [%o0 - 4], %o3 2130 stwa %o3, [%o1]ASI_USER 2131 bgt,pt %ncc, .co_med_wmove ! loop til 15 or fewer bytes left 2132 add %o1, 4, %o1 ! advance DST by 4 2133 addcc %o2, 12, %o2 ! restore count to word offset 2134 ble,pt %ncc, .co_med_wextra ! check for more words to move 2135 nop 2136 .co_med_word2: 2137 lduw [%o0], %o3 ! read word 2138 subcc %o2, 4, %o2 ! reduce count by 4 2139 stwa %o3, [%o1]ASI_USER ! write word 2140 add %o0, 4, %o0 ! advance SRC by 4 2141 bgt,pt %ncc, .co_med_word2 ! loop til 3 or fewer bytes left 2142 add %o1, 4, %o1 ! advance DST by 4 2143 .co_med_wextra: 2144 addcc %o2, 3, %o2 ! restore rest of count 2145 bz,pt %ncc, .co_sm_exit ! if zero, then done 2146 deccc %o2 2147 bz,pt %ncc, .co_sm_byte 2148 nop 2149 ba,pt %ncc, .co_sm_half 2150 nop 2151 2152 .align 16 2153 nop ! instruction alignment 2154 nop ! see discussion at start of file 2155 nop 2156 .co_med_half: 2157 btst 1, %o0 ! check for 2158 bz,pt %ncc, .co_med_half1 ! half word alignment 2159 nop 2160 ldub [%o0], %o3 ! load one byte 2161 inc %o0 2162 stba %o3,[%o1]ASI_USER ! store byte 2163 inc %o1 2164 dec %o2 2165 ! 2166 ! Now half word aligned and have at least 38 bytes to move 2167 ! 2168 .co_med_half1: 2169 sub %o2, 7, %o2 ! adjust count to allow cc zero test 2170 .co_med_hmove: 2171 lduh [%o0], %o3 ! read half word 2172 subcc %o2, 8, %o2 ! reduce count by 8 2173 stha %o3, [%o1]ASI_USER ! write half word 2174 add %o1, 2, %o1 ! advance DST by 2 2175 lduh [%o0 + 2], %o3 ! repeat for a total of 4 halfwords 2176 add %o0, 8, %o0 ! advance SRC by 8 2177 stha %o3, [%o1]ASI_USER 2178 add %o1, 2, %o1 ! advance DST by 2 2179 lduh [%o0 - 4], %o3 2180 stha %o3, [%o1]ASI_USER 2181 add %o1, 2, %o1 ! advance DST by 2 2182 lduh [%o0 - 2], %o3 2183 stha %o3, [%o1]ASI_USER 2184 bgt,pt %ncc, .co_med_hmove ! loop til 7 or fewer bytes left 2185 add %o1, 2, %o1 ! advance DST by 2 2186 addcc %o2, 7, %o2 ! restore count 2187 bz,pt %ncc, .co_sm_exit 2188 deccc %o2 2189 bz,pt %ncc, .co_sm_byte 2190 nop 2191 ba,pt %ncc, .co_sm_half 2192 nop 2193 2194 /* 2195 * We got here because of a fault during short copyout. 2196 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). 2197 */
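/*
 * Sketch of the recovery below (illustration only; the copyops member
 * behind the CP_COPYOUT offset is assumed to be the installed copyout
 * routine):
 *
 *	t_lofault = saved_handler;
 *	(%o0, %o1, %o2) = original (kaddr, uaddr, count);
 *	if (curthread->t_copyops != NULL)
 *		tail-call the copyops copyout routine with those arguments;
 *	return (-1);
 */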
2198 .sm_copyout_err: 2199 membar #Sync 2200 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2201 mov SM_SAVE_SRC, %o0 2202 mov SM_SAVE_DST, %o1 2203 mov SM_SAVE_COUNT, %o2 2204 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler 2205 tst %o3 2206 bz,pt %ncc, 3f ! if not, return error 2207 nop 2208 ldn [%o3 + CP_COPYOUT], %o5 ! if handler, invoke it with 2209 jmp %o5 ! original arguments 2210 nop 2211 3: 2212 retl 2213 or %g0, -1, %o0 ! return error value 2214 2215 SET_SIZE(copyout) 2216 2217 /* 2218 * The _more entry points are not intended to be used directly by 2219 * any caller from outside this file. They are provided to allow 2220 * profiling and dtrace of the portions of the copy code that use 2221 * the floating point registers. 2222 * This entry is particularly important as DTRACE (at least as of 2223 * 4/2004) does not support leaf functions. 2224 */ 2225 2226 ENTRY(copyout_more) 2227 .copyout_more: 2228 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 2229 set .copyout_err, REAL_LOFAULT 2230 2231 /* 2232 * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes 2233 */ 2234 .do_copyout: 2235 set copyio_fault, %l7 ! copyio_fault is lofault val 2236 2237 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler 2238 membar #Sync ! sync error barrier 2239 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault 2240 2241 mov %i0, SAVE_SRC 2242 mov %i1, SAVE_DST 2243 mov %i2, SAVE_COUNT 2244 2245 FP_NOMIGRATE(6, 7) 2246 2247 rd %fprs, %o2 ! check for unused fp 2248 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs 2249 btst FPRS_FEF, %o2 2250 bz,a,pt %icc, .do_blockcopyout 2251 wr %g0, FPRS_FEF, %fprs 2252 2253 BST_FPQ2Q4_TOSTACK(%o2) 2254 2255 .do_blockcopyout: 2256 rd %gsr, %o2 2257 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] !
save gsr 2258 or %l6, FPUSED_FLAG, %l6 2259 2260 andcc DST, VIS_BLOCKSIZE - 1, TMP 2261 mov ASI_USER, %asi 2262 bz,pt %ncc, 2f 2263 neg TMP 2264 add TMP, VIS_BLOCKSIZE, TMP 2265 2266 ! TMP = bytes required to align DST on FP_BLOCK boundary 2267 ! Using SRC as a tmp here 2268 cmp TMP, 3 2269 bleu,pt %ncc, 1f 2270 sub CNT,TMP,CNT ! adjust main count 2271 sub TMP, 3, TMP ! adjust for end of loop test 2272 .co_blkalign: 2273 ldub [REALSRC], SRC ! move 4 bytes per loop iteration 2274 stba SRC, [DST]%asi 2275 subcc TMP, 4, TMP 2276 ldub [REALSRC + 1], SRC 2277 add REALSRC, 4, REALSRC 2278 stba SRC, [DST + 1]%asi 2279 ldub [REALSRC - 2], SRC 2280 add DST, 4, DST 2281 stba SRC, [DST - 2]%asi 2282 ldub [REALSRC - 1], SRC 2283 bgu,pt %ncc, .co_blkalign 2284 stba SRC, [DST - 1]%asi 2285 2286 addcc TMP, 3, TMP ! restore count adjustment 2287 bz,pt %ncc, 2f ! no bytes left? 2288 nop 2289 1: ldub [REALSRC], SRC 2290 inc REALSRC 2291 inc DST 2292 deccc TMP 2293 bgu %ncc, 1b 2294 stba SRC, [DST - 1]%asi 2295 2296 2: 2297 andn REALSRC, 0x7, SRC 2298 alignaddr REALSRC, %g0, %g0 2299 2300 ! SRC - 8-byte aligned 2301 ! DST - 64-byte aligned 2302 prefetch [SRC], #one_read 2303 prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read 2304 prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read 2305 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read 2306 ldd [SRC], %f16 2307 #if CHEETAH_PREFETCH > 4 2308 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read 2309 #endif 2310 ldd [SRC + 0x08], %f18 2311 #if CHEETAH_PREFETCH > 5 2312 prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read 2313 #endif 2314 ldd [SRC + 0x10], %f20 2315 #if CHEETAH_PREFETCH > 6 2316 prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read 2317 #endif 2318 faligndata %f16, %f18, %f48 2319 ldd [SRC + 0x18], %f22 2320 #if CHEETAH_PREFETCH > 7 2321 prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read 2322 #endif 2323 faligndata %f18, %f20, %f50 2324 ldd [SRC + 0x20], %f24 2325 faligndata %f20, %f22, %f52 2326 ldd [SRC + 0x28], %f26 2327 faligndata %f22, %f24, %f54 2328 ldd [SRC + 0x30], %f28 2329 faligndata %f24, %f26, %f56 2330 ldd [SRC + 0x38], %f30 2331 faligndata %f26, %f28, %f58 2332 ldd [SRC + VIS_BLOCKSIZE], %f16 2333 sub CNT, VIS_BLOCKSIZE, CNT 2334 add SRC, VIS_BLOCKSIZE, SRC 2335 add REALSRC, VIS_BLOCKSIZE, REALSRC 2336 ba,a,pt %ncc, 1f 2337 nop 2338 .align 16 2339 1: 2340 ldd [SRC + 0x08], %f18 2341 faligndata %f28, %f30, %f60 2342 ldd [SRC + 0x10], %f20 2343 faligndata %f30, %f16, %f62 2344 stda %f48, [DST]ASI_BLK_AIUS 2345 ldd [SRC + 0x18], %f22 2346 faligndata %f16, %f18, %f48 2347 ldd [SRC + 0x20], %f24 2348 faligndata %f18, %f20, %f50 2349 ldd [SRC + 0x28], %f26 2350 faligndata %f20, %f22, %f52 2351 ldd [SRC + 0x30], %f28 2352 faligndata %f22, %f24, %f54 2353 ldd [SRC + 0x38], %f30 2354 faligndata %f24, %f26, %f56 2355 sub CNT, VIS_BLOCKSIZE, CNT 2356 ldd [SRC + VIS_BLOCKSIZE], %f16 2357 faligndata %f26, %f28, %f58 2358 prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read 2359 add DST, VIS_BLOCKSIZE, DST 2360 prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read 2361 add REALSRC, VIS_BLOCKSIZE, REALSRC 2362 cmp CNT, VIS_BLOCKSIZE + 8 2363 bgu,pt %ncc, 1b 2364 add SRC, VIS_BLOCKSIZE, SRC 2365 2366 ! 
only if REALSRC & 0x7 is 0 2367 cmp CNT, VIS_BLOCKSIZE 2368 bne %ncc, 3f 2369 andcc REALSRC, 0x7, %g0 2370 bz,pt %ncc, 2f 2371 nop 2372 3: 2373 faligndata %f28, %f30, %f60 2374 faligndata %f30, %f16, %f62 2375 stda %f48, [DST]ASI_BLK_AIUS 2376 add DST, VIS_BLOCKSIZE, DST 2377 ba,pt %ncc, 3f 2378 nop 2379 2: 2380 ldd [SRC + 0x08], %f18 2381 fsrc1 %f28, %f60 2382 ldd [SRC + 0x10], %f20 2383 fsrc1 %f30, %f62 2384 stda %f48, [DST]ASI_BLK_AIUS 2385 ldd [SRC + 0x18], %f22 2386 fsrc1 %f16, %f48 2387 ldd [SRC + 0x20], %f24 2388 fsrc1 %f18, %f50 2389 ldd [SRC + 0x28], %f26 2390 fsrc1 %f20, %f52 2391 ldd [SRC + 0x30], %f28 2392 fsrc1 %f22, %f54 2393 ldd [SRC + 0x38], %f30 2394 fsrc1 %f24, %f56 2395 sub CNT, VIS_BLOCKSIZE, CNT 2396 add DST, VIS_BLOCKSIZE, DST 2397 add SRC, VIS_BLOCKSIZE, SRC 2398 add REALSRC, VIS_BLOCKSIZE, REALSRC 2399 fsrc1 %f26, %f58 2400 fsrc1 %f28, %f60 2401 fsrc1 %f30, %f62 2402 stda %f48, [DST]ASI_BLK_AIUS 2403 add DST, VIS_BLOCKSIZE, DST 2404 ba,a,pt %ncc, 4f 2405 nop 2406 2407 3: tst CNT 2408 bz,a %ncc, 4f 2409 nop 2410 2411 5: ldub [REALSRC], TMP 2412 inc REALSRC 2413 inc DST 2414 deccc CNT 2415 bgu %ncc, 5b 2416 stba TMP, [DST - 1]%asi 2417 4: 2418 2419 .copyout_exit: 2420 membar #Sync 2421 2422 FPRAS_INTERVAL(FPRAS_COPYOUT, 0, %l5, %o2, %o3, %o4, %o5, 8) 2423 FPRAS_REWRITE_TYPE2Q2(0, %l5, %o2, %o3, 8, 9) 2424 FPRAS_CHECK(FPRAS_COPYOUT, %l5, 9) ! lose outputs 2425 2426 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 2427 wr %o2, 0, %gsr ! restore gsr 2428 2429 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 2430 btst FPRS_FEF, %o3 2431 bz,pt %icc, 4f 2432 nop 2433 2434 BLD_FPQ2Q4_FROMSTACK(%o2) 2435 2436 ba,pt %ncc, 1f 2437 wr %o3, 0, %fprs ! restore fprs 2438 2439 4: 2440 FZEROQ2Q4 2441 wr %o3, 0, %fprs ! restore fprs 2442 2443 1: 2444 membar #Sync 2445 andn %l6, FPUSED_FLAG, %l6 2446 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2447 FP_ALLOWMIGRATE(5, 6) 2448 ret 2449 restore %g0, 0, %o0 2450 2451 /* 2452 * We got here because of a fault during copyout. 2453 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). 2454 */ 2455 .copyout_err: 2456 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 2457 tst %o4 2458 bz,pt %ncc, 2f ! if not, return error 2459 nop 2460 ldn [%o4 + CP_COPYOUT], %g2 ! if handler, invoke it with 2461 jmp %g2 ! original arguments 2462 restore %g0, 0, %g0 ! dispose of copy window 2463 2: 2464 ret 2465 restore %g0, -1, %o0 ! return error value 2466 2467 2468 SET_SIZE(copyout_more) 2469 2470 #endif /* lint */ 2471 2472 2473 #ifdef lint 2474 2475 /*ARGSUSED*/ 2476 int 2477 xcopyout(const void *kaddr, void *uaddr, size_t count) 2478 { return (0); } 2479 2480 #else /* lint */ 2481 2482 ENTRY(xcopyout) 2483 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 2484 bleu,pt %ncc, .xcopyout_small ! go to larger cases 2485 xor %o0, %o1, %o3 ! are src, dst alignable? 2486 btst 7, %o3 ! 2487 bz,pt %ncc, .xcopyout_8 ! 2488 nop 2489 btst 1, %o3 ! 2490 bz,pt %ncc, .xcopyout_2 ! check for half-word 2491 nop 2492 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 2493 ld [%o3 + %lo(hw_copy_limit_1)], %o3 2494 tst %o3 2495 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy 2496 cmp %o2, %o3 ! if length <= limit 2497 bleu,pt %ncc, .xcopyout_small ! go to small copy 2498 nop 2499 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy 2500 nop 2501 .xcopyout_2: 2502 btst 3, %o3 ! 2503 bz,pt %ncc, .xcopyout_4 ! check for word alignment 2504 nop 2505 sethi %hi(hw_copy_limit_2), %o3 ! 
Check copy limit 2506 ld [%o3 + %lo(hw_copy_limit_2)], %o3 2507 tst %o3 2508 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy 2509 cmp %o2, %o3 ! if length <= limit 2510 bleu,pt %ncc, .xcopyout_small ! go to small copy 2511 nop 2512 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy 2513 nop 2514 .xcopyout_4: 2515 ! already checked longword, must be word aligned 2516 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 2517 ld [%o3 + %lo(hw_copy_limit_4)], %o3 2518 tst %o3 2519 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy 2520 cmp %o2, %o3 ! if length <= limit 2521 bleu,pt %ncc, .xcopyout_small ! go to small copy 2522 nop 2523 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy 2524 nop 2525 .xcopyout_8: 2526 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 2527 ld [%o3 + %lo(hw_copy_limit_8)], %o3 2528 tst %o3 2529 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy 2530 cmp %o2, %o3 ! if length <= limit 2531 bleu,pt %ncc, .xcopyout_small ! go to small copy 2532 nop 2533 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy 2534 nop 2535 2536 .xcopyout_small: 2537 sethi %hi(.sm_xcopyout_err), %o5 ! .sm_xcopyout_err is lofault 2538 or %o5, %lo(.sm_xcopyout_err), %o5 2539 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler 2540 membar #Sync ! sync error barrier 2541 ba,pt %ncc, .sm_do_copyout ! common code 2542 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault 2543 2544 .xcopyout_more: 2545 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 2546 sethi %hi(.xcopyout_err), REAL_LOFAULT 2547 ba,pt %ncc, .do_copyout ! common code 2548 or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT 2549 2550 /* 2551 * We got here because of fault during xcopyout 2552 * Errno value is in ERRNO 2553 */ 2554 .xcopyout_err: 2555 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 2556 tst %o4 2557 bz,pt %ncc, 2f ! if not, return error 2558 nop 2559 ldn [%o4 + CP_XCOPYOUT], %g2 ! if handler, invoke it with 2560 jmp %g2 ! original arguments 2561 restore %g0, 0, %g0 ! dispose of copy window 2562 2: 2563 ret 2564 restore ERRNO, 0, %o0 ! return errno value 2565 2566 .sm_xcopyout_err: 2567 2568 membar #Sync 2569 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2570 mov SM_SAVE_SRC, %o0 2571 mov SM_SAVE_DST, %o1 2572 mov SM_SAVE_COUNT, %o2 2573 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler 2574 tst %o3 2575 bz,pt %ncc, 3f ! if not, return error 2576 nop 2577 ldn [%o3 + CP_XCOPYOUT], %o5 ! if handler, invoke it with 2578 jmp %o5 ! original arguments 2579 nop 2580 3: 2581 retl 2582 or %g1, 0, %o0 ! return errno value 2583 2584 SET_SIZE(xcopyout) 2585 2586 #endif /* lint */ 2587 2588 #ifdef lint 2589 2590 /*ARGSUSED*/ 2591 int 2592 xcopyout_little(const void *kaddr, void *uaddr, size_t count) 2593 { return (0); } 2594 2595 #else /* lint */ 2596 2597 ENTRY(xcopyout_little) 2598 sethi %hi(.xcopyio_err), %o5 2599 or %o5, %lo(.xcopyio_err), %o5 2600 ldn [THREAD_REG + T_LOFAULT], %o4 2601 membar #Sync ! sync error barrier 2602 stn %o5, [THREAD_REG + T_LOFAULT] 2603 mov %o4, %o5 2604 2605 subcc %g0, %o2, %o3 2606 add %o0, %o2, %o0 2607 bz,pn %ncc, 2f ! check for zero bytes 2608 sub %o2, 1, %o4 2609 add %o0, %o4, %o0 ! start w/last byte 2610 add %o1, %o2, %o1 2611 ldub [%o0 + %o3], %o4 2612 2613 1: stba %o4, [%o1 + %o3]ASI_AIUSL 2614 inccc %o3 2615 sub %o0, 2, %o0 ! get next byte 2616 bcc,a,pt %ncc, 1b 2617 ldub [%o0 + %o3], %o4 2618 2619 2: 2620 membar #Sync ! sync error barrier 2621 stn %o5, [THREAD_REG + T_LOFAULT] ! 
restore old t_lofault 2622 retl 2623 mov %g0, %o0 ! return (0) 2624 2625 SET_SIZE(xcopyout_little) 2626 2627 #endif /* lint */ 2628 2629 /* 2630 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little) 2631 */ 2632 2633 #if defined(lint) 2634 2635 /*ARGSUSED*/ 2636 int 2637 copyin(const void *uaddr, void *kaddr, size_t count) 2638 { return (0); } 2639 2640 #else /* lint */ 2641 2642 ENTRY(copyin) 2643 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 2644 bleu,pt %ncc, .copyin_small ! go to larger cases 2645 xor %o0, %o1, %o3 ! are src, dst alignable? 2646 btst 7, %o3 ! 2647 bz,pt %ncc, .copyin_8 ! check for longword alignment 2648 nop 2649 btst 1, %o3 ! 2650 bz,pt %ncc, .copyin_2 ! check for half-word 2651 nop 2652 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 2653 ld [%o3 + %lo(hw_copy_limit_1)], %o3 2654 tst %o3 2655 bz,pn %icc, .copyin_small ! if zero, disable HW copy 2656 cmp %o2, %o3 ! if length <= limit 2657 bleu,pt %ncc, .copyin_small ! go to small copy 2658 nop 2659 ba,pt %ncc, .copyin_more ! otherwise go to large copy 2660 nop 2661 .copyin_2: 2662 btst 3, %o3 ! 2663 bz,pt %ncc, .copyin_4 ! check for word alignment 2664 nop 2665 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 2666 ld [%o3 + %lo(hw_copy_limit_2)], %o3 2667 tst %o3 2668 bz,pn %icc, .copyin_small ! if zero, disable HW copy 2669 cmp %o2, %o3 ! if length <= limit 2670 bleu,pt %ncc, .copyin_small ! go to small copy 2671 nop 2672 ba,pt %ncc, .copyin_more ! otherwise go to large copy 2673 nop 2674 .copyin_4: 2675 ! already checked longword, must be word aligned 2676 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 2677 ld [%o3 + %lo(hw_copy_limit_4)], %o3 2678 tst %o3 2679 bz,pn %icc, .copyin_small ! if zero, disable HW copy 2680 cmp %o2, %o3 ! if length <= limit 2681 bleu,pt %ncc, .copyin_small ! go to small copy 2682 nop 2683 ba,pt %ncc, .copyin_more ! otherwise go to large copy 2684 nop 2685 .copyin_8: 2686 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 2687 ld [%o3 + %lo(hw_copy_limit_8)], %o3 2688 tst %o3 2689 bz,pn %icc, .copyin_small ! if zero, disable HW copy 2690 cmp %o2, %o3 ! if length <= limit 2691 bleu,pt %ncc, .copyin_small ! go to small copy 2692 nop 2693 ba,pt %ncc, .copyin_more ! otherwise go to large copy 2694 nop 2695 2696 .align 16 2697 nop ! instruction alignment 2698 ! see discussion at start of file 2699 .copyin_small: 2700 sethi %hi(.sm_copyin_err), %o5 ! .sm_copyin_err is lofault 2701 or %o5, %lo(.sm_copyin_err), %o5 2702 ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofault, no tramp 2703 membar #Sync ! sync error barrier 2704 stn %o5, [THREAD_REG + T_LOFAULT] 2705 .sm_do_copyin: 2706 mov %o0, SM_SAVE_SRC 2707 mov %o1, SM_SAVE_DST 2708 cmp %o2, SHORTCOPY ! check for really short case 2709 bleu,pt %ncc, .ci_sm_left ! 2710 mov %o2, SM_SAVE_COUNT 2711 cmp %o2, CHKSIZE ! check for medium length cases 2712 bgu,pn %ncc, .ci_med ! 2713 or %o0, %o1, %o3 ! prepare alignment check 2714 andcc %o3, 0x3, %g0 ! test for alignment 2715 bz,pt %ncc, .ci_sm_word ! branch to word aligned case 2716 .ci_sm_movebytes: 2717 sub %o2, 3, %o2 ! adjust count to allow cc zero test 2718 .ci_sm_notalign4: 2719 lduba [%o0]ASI_USER, %o3 ! read byte 2720 subcc %o2, 4, %o2 ! reduce count by 4 2721 stb %o3, [%o1] ! write byte 2722 add %o0, 1, %o0 ! advance SRC by 1 2723 lduba [%o0]ASI_USER, %o3 ! repeat for a total of 4 bytes 2724 add %o0, 1, %o0 ! advance SRC by 1 2725 stb %o3, [%o1 + 1] 2726 add %o1, 4, %o1 ! advance DST by 4 2727 lduba [%o0]ASI_USER, %o3 2728 add %o0, 1, %o0 ! 
advance SRC by 1 2729 stb %o3, [%o1 - 2] 2730 lduba [%o0]ASI_USER, %o3 2731 add %o0, 1, %o0 ! advance SRC by 1 2732 bgt,pt %ncc, .ci_sm_notalign4 ! loop til 3 or fewer bytes remain 2733 stb %o3, [%o1 - 1] 2734 add %o2, 3, %o2 ! restore count 2735 .ci_sm_left: 2736 tst %o2 2737 bz,pt %ncc, .ci_sm_exit 2738 nop 2739 lduba [%o0]ASI_USER, %o3 ! load one byte 2740 deccc %o2 ! reduce count for cc test 2741 bz,pt %ncc, .ci_sm_exit 2742 stb %o3,[%o1] ! store one byte 2743 inc %o0 2744 lduba [%o0]ASI_USER, %o3 ! load second byte 2745 deccc %o2 2746 bz,pt %ncc, .ci_sm_exit 2747 stb %o3,[%o1 + 1] ! store second byte 2748 inc %o0 2749 lduba [%o0]ASI_USER, %o3 ! load third byte 2750 stb %o3,[%o1 + 2] ! store third byte 2751 membar #Sync ! sync error barrier 2752 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2753 retl 2754 mov %g0, %o0 ! return 0 2755 .align 16 2756 .ci_sm_words: 2757 lduwa [%o0]ASI_USER, %o3 ! read word 2758 .ci_sm_wordx: 2759 subcc %o2, 8, %o2 ! update count 2760 stw %o3, [%o1] ! write word 2761 add %o0, 4, %o0 ! update SRC 2762 add %o1, 8, %o1 ! update DST 2763 lduwa [%o0]ASI_USER, %o3 ! read word 2764 add %o0, 4, %o0 ! update SRC 2765 bgt,pt %ncc, .ci_sm_words ! loop til done 2766 stw %o3, [%o1 - 4] ! write word 2767 addcc %o2, 7, %o2 ! restore count 2768 bz,pt %ncc, .ci_sm_exit 2769 nop 2770 deccc %o2 2771 bz,pt %ncc, .ci_sm_byte 2772 .ci_sm_half: 2773 subcc %o2, 2, %o2 ! reduce count by 2 2774 lduha [%o0]ASI_USER, %o3 ! read half word 2775 add %o0, 2, %o0 ! advance SRC by 2 2776 add %o1, 2, %o1 ! advance DST by 2 2777 bgt,pt %ncc, .ci_sm_half ! loop til done 2778 sth %o3, [%o1 - 2] ! write half word 2779 addcc %o2, 1, %o2 ! restore count 2780 bz,pt %ncc, .ci_sm_exit 2781 nop 2782 .ci_sm_byte: 2783 lduba [%o0]ASI_USER, %o3 2784 stb %o3, [%o1] 2785 membar #Sync ! sync error barrier 2786 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2787 retl 2788 mov %g0, %o0 ! return 0 2789 .align 16 2790 .ci_sm_word: 2791 subcc %o2, 4, %o2 ! update count 2792 bgt,pt %ncc, .ci_sm_wordx 2793 lduwa [%o0]ASI_USER, %o3 ! read word 2794 addcc %o2, 3, %o2 ! restore count 2795 bz,pt %ncc, .ci_sm_exit 2796 stw %o3, [%o1] ! write word 2797 deccc %o2 ! reduce count for cc test 2798 add %o0, 4, %o0 2799 lduba [%o0]ASI_USER, %o3 ! load one byte 2800 bz,pt %ncc, .ci_sm_exit 2801 stb %o3, [%o1 + 4] ! store one byte 2802 inc %o0 2803 lduba [%o0]ASI_USER, %o3 ! load second byte 2804 deccc %o2 2805 bz,pt %ncc, .ci_sm_exit 2806 stb %o3, [%o1 + 5] ! store second byte 2807 inc %o0 2808 lduba [%o0]ASI_USER, %o3 ! load third byte 2809 stb %o3, [%o1 + 6] ! store third byte 2810 .ci_sm_exit: 2811 membar #Sync ! sync error barrier 2812 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2813 retl 2814 mov %g0, %o0 ! return 0 2815 2816 .align 16 2817 .ci_med: 2818 xor %o0, %o1, %o3 ! setup alignment check 2819 btst 1, %o3 2820 bnz,pt %ncc, .ci_sm_movebytes ! unaligned 2821 nop 2822 btst 3, %o3 2823 bnz,pt %ncc, .ci_med_half ! halfword aligned 2824 nop 2825 btst 7, %o3 2826 bnz,pt %ncc, .ci_med_word ! word aligned 2827 nop 2828 .ci_med_long: 2829 btst 3, %o0 ! check for 2830 bz,pt %ncc, .ci_med_long1 ! word alignment 2831 nop 2832 .ci_med_long0: 2833 lduba [%o0]ASI_USER, %o3 ! load one byte 2834 inc %o0 2835 stb %o3,[%o1] ! store byte 2836 inc %o1 2837 btst 3, %o0 2838 bnz,pt %ncc, .ci_med_long0 2839 dec %o2 2840 .ci_med_long1: ! word aligned 2841 btst 7, %o0 ! check for long word 2842 bz,pt %ncc, .ci_med_long2 2843 nop 2844 lduwa [%o0]ASI_USER, %o3 ! load word 2845 add %o0, 4, %o0 ! 
advance SRC by 4 2846 stw %o3, [%o1] ! store word 2847 add %o1, 4, %o1 ! advance DST by 4 2848 sub %o2, 4, %o2 ! reduce count by 4 2849 ! 2850 ! Now long word aligned and have at least 32 bytes to move 2851 ! 2852 .ci_med_long2: 2853 sub %o2, 31, %o2 ! adjust count to allow cc zero test 2854 .ci_med_lmove: 2855 ldxa [%o0]ASI_USER, %o3 ! read long word 2856 subcc %o2, 32, %o2 ! reduce count by 32 2857 stx %o3, [%o1] ! write long word 2858 add %o0, 8, %o0 ! advance SRC by 8 2859 ldxa [%o0]ASI_USER, %o3 ! repeat for a total of 4 long words 2860 add %o0, 8, %o0 ! advance SRC by 8 2861 stx %o3, [%o1 + 8] 2862 add %o1, 32, %o1 ! advance DST by 32 2863 ldxa [%o0]ASI_USER, %o3 2864 add %o0, 8, %o0 ! advance SRC by 8 2865 stx %o3, [%o1 - 16] 2866 ldxa [%o0]ASI_USER, %o3 2867 add %o0, 8, %o0 ! advance SRC by 8 2868 bgt,pt %ncc, .ci_med_lmove ! loop til 31 or fewer bytes left 2869 stx %o3, [%o1 - 8] 2870 addcc %o2, 24, %o2 ! restore count to long word offset 2871 ble,pt %ncc, .ci_med_lextra ! check for more long words to move 2872 nop 2873 .ci_med_lword: 2874 ldxa [%o0]ASI_USER, %o3 ! read long word 2875 subcc %o2, 8, %o2 ! reduce count by 8 2876 stx %o3, [%o1] ! write long word 2877 add %o0, 8, %o0 ! advance SRC by 8 2878 bgt,pt %ncc, .ci_med_lword ! loop til 7 or fewer bytes left 2879 add %o1, 8, %o1 ! advance DST by 8 2880 .ci_med_lextra: 2881 addcc %o2, 7, %o2 ! restore rest of count 2882 bz,pt %ncc, .ci_sm_exit ! if zero, then done 2883 deccc %o2 2884 bz,pt %ncc, .ci_sm_byte 2885 nop 2886 ba,pt %ncc, .ci_sm_half 2887 nop 2888 2889 .align 16 2890 nop ! instruction alignment 2891 ! see discussion at start of file 2892 .ci_med_word: 2893 btst 3, %o0 ! check for 2894 bz,pt %ncc, .ci_med_word1 ! word alignment 2895 nop 2896 .ci_med_word0: 2897 lduba [%o0]ASI_USER, %o3 ! load one byte 2898 inc %o0 2899 stb %o3,[%o1] ! store byte 2900 inc %o1 2901 btst 3, %o0 2902 bnz,pt %ncc, .ci_med_word0 2903 dec %o2 2904 ! 2905 ! Now word aligned and have at least 36 bytes to move 2906 ! 2907 .ci_med_word1: 2908 sub %o2, 15, %o2 ! adjust count to allow cc zero test 2909 .ci_med_wmove: 2910 lduwa [%o0]ASI_USER, %o3 ! read word 2911 subcc %o2, 16, %o2 ! reduce count by 16 2912 stw %o3, [%o1] ! write word 2913 add %o0, 4, %o0 ! advance SRC by 4 2914 lduwa [%o0]ASI_USER, %o3 ! repeat for a total of 4 words 2915 add %o0, 4, %o0 ! advance SRC by 4 2916 stw %o3, [%o1 + 4] 2917 add %o1, 16, %o1 ! advance DST by 16 2918 lduwa [%o0]ASI_USER, %o3 2919 add %o0, 4, %o0 ! advance SRC by 4 2920 stw %o3, [%o1 - 8] 2921 lduwa [%o0]ASI_USER, %o3 2922 add %o0, 4, %o0 ! advance SRC by 4 2923 bgt,pt %ncc, .ci_med_wmove ! loop til 15 or fewer bytes left 2924 stw %o3, [%o1 - 4] 2925 addcc %o2, 12, %o2 ! restore count to word offset 2926 ble,pt %ncc, .ci_med_wextra ! check for more words to move 2927 nop 2928 .ci_med_word2: 2929 lduwa [%o0]ASI_USER, %o3 ! read word 2930 subcc %o2, 4, %o2 ! reduce count by 4 2931 stw %o3, [%o1] ! write word 2932 add %o0, 4, %o0 ! advance SRC by 4 2933 bgt,pt %ncc, .ci_med_word2 ! loop til 3 or fewer bytes left 2934 add %o1, 4, %o1 ! advance DST by 4 2935 .ci_med_wextra: 2936 addcc %o2, 3, %o2 ! restore rest of count 2937 bz,pt %ncc, .ci_sm_exit ! if zero, then done 2938 deccc %o2 2939 bz,pt %ncc, .ci_sm_byte 2940 nop 2941 ba,pt %ncc, .ci_sm_half 2942 nop 2943 2944 .align 16 2945 nop ! instruction alignment 2946 ! see discussion at start of file 2947 .ci_med_half: 2948 btst 1, %o0 ! check for 2949 bz,pt %ncc, .ci_med_half1 ! half word alignment 2950 nop 2951 lduba [%o0]ASI_USER, %o3 ! load one byte 2952 inc %o0 2953 stb %o3,[%o1] ! store byte 2954 inc %o1 2955 dec %o2 2956 ! 2957 ! Now half word aligned and have at least 38 bytes to move 2958 ! 2959 .ci_med_half1: 2960 sub %o2, 7, %o2 ! adjust count to allow cc zero test 2961 .ci_med_hmove: 2962 lduha [%o0]ASI_USER, %o3 ! read half word 2963 subcc %o2, 8, %o2 ! reduce count by 8 2964 sth %o3, [%o1] ! write half word 2965 add %o0, 2, %o0 ! advance SRC by 2 2966 lduha [%o0]ASI_USER, %o3 ! repeat for a total of 4 halfwords 2967 add %o0, 2, %o0 ! advance SRC by 2 2968 sth %o3, [%o1 + 2] 2969 add %o1, 8, %o1 ! advance DST by 8 2970 lduha [%o0]ASI_USER, %o3 2971 add %o0, 2, %o0 ! advance SRC by 2 2972 sth %o3, [%o1 - 4] 2973 lduha [%o0]ASI_USER, %o3 2974 add %o0, 2, %o0 ! advance SRC by 2 2975 bgt,pt %ncc, .ci_med_hmove ! loop til 7 or fewer bytes left 2976 sth %o3, [%o1 - 2] 2977 addcc %o2, 7, %o2 ! restore count 2978 bz,pt %ncc, .ci_sm_exit 2979 deccc %o2 2980 bz,pt %ncc, .ci_sm_byte 2981 nop 2982 ba,pt %ncc, .ci_sm_half 2983 nop 2984
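/*
 * Note the symmetry with copyout above: copyin reads with the user ASI
 * (lduba/lduha/lduwa/ldxa [addr]ASI_USER) and writes with ordinary
 * kernel stores, where copyout does the reverse.  Illustrative shape
 * of one unrolled word move:
 *
 *	lduwa	[%o0]ASI_USER, %o3	! read word from user space
 *	stw	%o3, [%o1]		! write word to kernel space
 */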
2985 .sm_copyin_err: 2986 membar #Sync 2987 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2988 mov SM_SAVE_SRC, %o0 2989 mov SM_SAVE_DST, %o1 2990 mov SM_SAVE_COUNT, %o2 2991 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler 2992 tst %o3 2993 bz,pt %ncc, 3f ! if not, return error 2994 nop 2995 ldn [%o3 + CP_COPYIN], %o5 ! if handler, invoke it with 2996 jmp %o5 ! original arguments 2997 nop 2998 3: 2999 retl 3000 or %g0, -1, %o0 ! return errno value 3001 3002 SET_SIZE(copyin) 3003 3004 3005 /* 3006 * The _more entry points are not intended to be used directly by 3007 * any caller from outside this file. They are provided to allow 3008 * profiling and dtrace of the portions of the copy code that use 3009 * the floating point registers. 3010 * This entry is particularly important as DTRACE (at least as of 3011 * 4/2004) does not support leaf functions. 3012 */ 3013 3014 ENTRY(copyin_more) 3015 .copyin_more: 3016 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 3017 set .copyin_err, REAL_LOFAULT 3018 3019 /* 3020 * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes 3021 */ 3022 .do_copyin: 3023 set copyio_fault, %l7 ! copyio_fault is lofault val 3024 3025 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler 3026 membar #Sync ! sync error barrier 3027 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault 3028 3029 mov %i0, SAVE_SRC 3030 mov %i1, SAVE_DST 3031 mov %i2, SAVE_COUNT 3032 3033 FP_NOMIGRATE(6, 7) 3034 3035 rd %fprs, %o2 ! check for unused fp 3036 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs 3037 btst FPRS_FEF, %o2 3038 bz,a,pt %icc, .do_blockcopyin 3039 wr %g0, FPRS_FEF, %fprs 3040 3041 BST_FPQ2Q4_TOSTACK(%o2) 3042 3043 .do_blockcopyin: 3044 rd %gsr, %o2 3045 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr 3046 or %l6, FPUSED_FLAG, %l6 3047 3048 andcc DST, VIS_BLOCKSIZE - 1, TMP 3049 mov ASI_USER, %asi 3050 bz,pt %ncc, 2f 3051 neg TMP 3052 add TMP, VIS_BLOCKSIZE, TMP 3053 3054 ! TMP = bytes required to align DST on FP_BLOCK boundary 3055 ! Using SRC as a tmp here 3056 cmp TMP, 3 3057 bleu,pt %ncc, 1f 3058 sub CNT,TMP,CNT ! adjust main count 3059 sub TMP, 3, TMP ! adjust for end of loop test 3060 .ci_blkalign: 3061 lduba [REALSRC]%asi, SRC !
move 4 bytes per loop iteration 3062 stb SRC, [DST] 3063 subcc TMP, 4, TMP 3064 lduba [REALSRC + 1]%asi, SRC 3065 add REALSRC, 4, REALSRC 3066 stb SRC, [DST + 1] 3067 lduba [REALSRC - 2]%asi, SRC 3068 add DST, 4, DST 3069 stb SRC, [DST - 2] 3070 lduba [REALSRC - 1]%asi, SRC 3071 bgu,pt %ncc, .ci_blkalign 3072 stb SRC, [DST - 1] 3073 3074 addcc TMP, 3, TMP ! restore count adjustment 3075 bz,pt %ncc, 2f ! no bytes left? 3076 nop 3077 1: lduba [REALSRC]%asi, SRC 3078 inc REALSRC 3079 inc DST 3080 deccc TMP 3081 bgu %ncc, 1b 3082 stb SRC, [DST - 1] 3083 3084 2: 3085 andn REALSRC, 0x7, SRC 3086 alignaddr REALSRC, %g0, %g0 3087 3088 ! SRC - 8-byte aligned 3089 ! DST - 64-byte aligned 3090 prefetcha [SRC]%asi, #one_read 3091 prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #one_read 3092 prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #one_read 3093 prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #one_read 3094 ldda [SRC]%asi, %f16 3095 #if CHEETAH_PREFETCH > 4 3096 prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read 3097 #endif 3098 ldda [SRC + 0x08]%asi, %f18 3099 #if CHEETAH_PREFETCH > 5 3100 prefetcha [SRC + (5 * VIS_BLOCKSIZE)]%asi, #one_read 3101 #endif 3102 ldda [SRC + 0x10]%asi, %f20 3103 #if CHEETAH_PREFETCH > 6 3104 prefetcha [SRC + (6 * VIS_BLOCKSIZE)]%asi, #one_read 3105 #endif 3106 faligndata %f16, %f18, %f48 3107 ldda [SRC + 0x18]%asi, %f22 3108 #if CHEETAH_PREFETCH > 7 3109 prefetcha [SRC + (7 * VIS_BLOCKSIZE)]%asi, #one_read 3110 #endif 3111 faligndata %f18, %f20, %f50 3112 ldda [SRC + 0x20]%asi, %f24 3113 faligndata %f20, %f22, %f52 3114 ldda [SRC + 0x28]%asi, %f26 3115 faligndata %f22, %f24, %f54 3116 ldda [SRC + 0x30]%asi, %f28 3117 faligndata %f24, %f26, %f56 3118 ldda [SRC + 0x38]%asi, %f30 3119 faligndata %f26, %f28, %f58 3120 ldda [SRC + VIS_BLOCKSIZE]%asi, %f16 3121 sub CNT, VIS_BLOCKSIZE, CNT 3122 add SRC, VIS_BLOCKSIZE, SRC 3123 add REALSRC, VIS_BLOCKSIZE, REALSRC 3124 ba,a,pt %ncc, 1f 3125 nop 3126 .align 16 3127 1: 3128 ldda [SRC + 0x08]%asi, %f18 3129 faligndata %f28, %f30, %f60 3130 ldda [SRC + 0x10]%asi, %f20 3131 faligndata %f30, %f16, %f62 3132 stda %f48, [DST]ASI_BLK_P 3133 ldda [SRC + 0x18]%asi, %f22 3134 faligndata %f16, %f18, %f48 3135 ldda [SRC + 0x20]%asi, %f24 3136 faligndata %f18, %f20, %f50 3137 ldda [SRC + 0x28]%asi, %f26 3138 faligndata %f20, %f22, %f52 3139 ldda [SRC + 0x30]%asi, %f28 3140 faligndata %f22, %f24, %f54 3141 ldda [SRC + 0x38]%asi, %f30 3142 faligndata %f24, %f26, %f56 3143 sub CNT, VIS_BLOCKSIZE, CNT 3144 ldda [SRC + VIS_BLOCKSIZE]%asi, %f16 3145 faligndata %f26, %f28, %f58 3146 prefetcha [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8]%asi, #one_read 3147 add DST, VIS_BLOCKSIZE, DST 3148 prefetcha [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read 3149 add REALSRC, VIS_BLOCKSIZE, REALSRC 3150 cmp CNT, VIS_BLOCKSIZE + 8 3151 bgu,pt %ncc, 1b 3152 add SRC, VIS_BLOCKSIZE, SRC 3153 3154 ! 
only if REALSRC & 0x7 is 0 3155 cmp CNT, VIS_BLOCKSIZE 3156 bne %ncc, 3f 3157 andcc REALSRC, 0x7, %g0 3158 bz,pt %ncc, 2f 3159 nop 3160 3: 3161 faligndata %f28, %f30, %f60 3162 faligndata %f30, %f16, %f62 3163 stda %f48, [DST]ASI_BLK_P 3164 add DST, VIS_BLOCKSIZE, DST 3165 ba,pt %ncc, 3f 3166 nop 3167 2: 3168 ldda [SRC + 0x08]%asi, %f18 3169 fsrc1 %f28, %f60 3170 ldda [SRC + 0x10]%asi, %f20 3171 fsrc1 %f30, %f62 3172 stda %f48, [DST]ASI_BLK_P 3173 ldda [SRC + 0x18]%asi, %f22 3174 fsrc1 %f16, %f48 3175 ldda [SRC + 0x20]%asi, %f24 3176 fsrc1 %f18, %f50 3177 ldda [SRC + 0x28]%asi, %f26 3178 fsrc1 %f20, %f52 3179 ldda [SRC + 0x30]%asi, %f28 3180 fsrc1 %f22, %f54 3181 ldda [SRC + 0x38]%asi, %f30 3182 fsrc1 %f24, %f56 3183 sub CNT, VIS_BLOCKSIZE, CNT 3184 add DST, VIS_BLOCKSIZE, DST 3185 add SRC, VIS_BLOCKSIZE, SRC 3186 add REALSRC, VIS_BLOCKSIZE, REALSRC 3187 fsrc1 %f26, %f58 3188 fsrc1 %f28, %f60 3189 fsrc1 %f30, %f62 3190 stda %f48, [DST]ASI_BLK_P 3191 add DST, VIS_BLOCKSIZE, DST 3192 ba,a,pt %ncc, 4f 3193 nop 3194 3195 3: tst CNT 3196 bz,a %ncc, 4f 3197 nop 3198 3199 5: lduba [REALSRC]ASI_USER, TMP 3200 inc REALSRC 3201 inc DST 3202 deccc CNT 3203 bgu %ncc, 5b 3204 stb TMP, [DST - 1] 3205 4: 3206 3207 .copyin_exit: 3208 membar #Sync 3209 3210 FPRAS_INTERVAL(FPRAS_COPYIN, 1, %l5, %o2, %o3, %o4, %o5, 8) 3211 FPRAS_REWRITE_TYPE1(1, %l5, %f48, %o2, 9) 3212 FPRAS_CHECK(FPRAS_COPYIN, %l5, 9) ! lose outputs 3213 3214 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr 3215 wr %o2, 0, %gsr 3216 3217 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 3218 btst FPRS_FEF, %o3 3219 bz,pt %icc, 4f 3220 nop 3221 3222 BLD_FPQ2Q4_FROMSTACK(%o2) 3223 3224 ba,pt %ncc, 1f 3225 wr %o3, 0, %fprs ! restore fprs 3226 3227 4: 3228 FZEROQ2Q4 3229 wr %o3, 0, %fprs ! restore fprs 3230 3231 1: 3232 membar #Sync ! sync error barrier 3233 andn %l6, FPUSED_FLAG, %l6 3234 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 3235 FP_ALLOWMIGRATE(5, 6) 3236 ret 3237 restore %g0, 0, %o0 3238 /* 3239 * We got here because of a fault during copyin 3240 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). 3241 */ 3242 .copyin_err: 3243 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 3244 tst %o4 3245 bz,pt %ncc, 2f ! if not, return error 3246 nop 3247 ldn [%o4 + CP_COPYIN], %g2 ! if handler, invoke it with 3248 jmp %g2 ! original arguments 3249 restore %g0, 0, %g0 ! dispose of copy window 3250 2: 3251 ret 3252 restore %g0, -1, %o0 ! return error value 3253 3254 3255 SET_SIZE(copyin_more) 3256 3257 #endif /* lint */ 3258 3259 #ifdef lint 3260 3261 /*ARGSUSED*/ 3262 int 3263 xcopyin(const void *uaddr, void *kaddr, size_t count) 3264 { return (0); } 3265 3266 #else /* lint */ 3267 3268 ENTRY(xcopyin) 3269 3270 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 3271 bleu,pt %ncc, .xcopyin_small ! go to larger cases 3272 xor %o0, %o1, %o3 ! are src, dst alignable? 3273 btst 7, %o3 ! 3274 bz,pt %ncc, .xcopyin_8 ! check for longword alignment 3275 nop 3276 btst 1, %o3 ! 3277 bz,pt %ncc, .xcopyin_2 ! check for half-word 3278 nop 3279 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 3280 ld [%o3 + %lo(hw_copy_limit_1)], %o3 3281 tst %o3 3282 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 3283 cmp %o2, %o3 ! if length <= limit 3284 bleu,pt %ncc, .xcopyin_small ! go to small copy 3285 nop 3286 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 3287 nop 3288 .xcopyin_2: 3289 btst 3, %o3 ! 3290 bz,pt %ncc, .xcopyin_4 ! check for word alignment 3291 nop 3292 sethi %hi(hw_copy_limit_2), %o3 ! 
Check copy limit 3293 ld [%o3 + %lo(hw_copy_limit_2)], %o3 3294 tst %o3 3295 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 3296 cmp %o2, %o3 ! if length <= limit 3297 bleu,pt %ncc, .xcopyin_small ! go to small copy 3298 nop 3299 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 3300 nop 3301 .xcopyin_4: 3302 ! already checked longword, must be word aligned 3303 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 3304 ld [%o3 + %lo(hw_copy_limit_4)], %o3 3305 tst %o3 3306 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 3307 cmp %o2, %o3 ! if length <= limit 3308 bleu,pt %ncc, .xcopyin_small ! go to small copy 3309 nop 3310 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 3311 nop 3312 .xcopyin_8: 3313 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 3314 ld [%o3 + %lo(hw_copy_limit_8)], %o3 3315 tst %o3 3316 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 3317 cmp %o2, %o3 ! if length <= limit 3318 bleu,pt %ncc, .xcopyin_small ! go to small copy 3319 nop 3320 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 3321 nop 3322 3323 .xcopyin_small: 3324 sethi %hi(.sm_xcopyin_err), %o5 ! .sm_xcopyin_err is lofault value 3325 or %o5, %lo(.sm_xcopyin_err), %o5 3326 ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofault 3327 membar #Sync ! sync error barrier 3328 ba,pt %ncc, .sm_do_copyin ! common code 3329 stn %o5, [THREAD_REG + T_LOFAULT] 3330 3331 .xcopyin_more: 3332 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 3333 sethi %hi(.xcopyin_err), REAL_LOFAULT ! .xcopyin_err is lofault value 3334 ba,pt %ncc, .do_copyin 3335 or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT 3336 3337 /* 3338 * We got here because of fault during xcopyin 3339 * Errno value is in ERRNO 3340 */ 3341 .xcopyin_err: 3342 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 3343 tst %o4 3344 bz,pt %ncc, 2f ! if not, return error 3345 nop 3346 ldn [%o4 + CP_XCOPYIN], %g2 ! if handler, invoke it with 3347 jmp %g2 ! original arguments 3348 restore %g0, 0, %g0 ! dispose of copy window 3349 2: 3350 ret 3351 restore ERRNO, 0, %o0 ! return errno value 3352 3353 .sm_xcopyin_err: 3354 3355 membar #Sync 3356 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 3357 mov SM_SAVE_SRC, %o0 3358 mov SM_SAVE_DST, %o1 3359 mov SM_SAVE_COUNT, %o2 3360 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler 3361 tst %o3 3362 bz,pt %ncc, 3f ! if not, return error 3363 nop 3364 ldn [%o3 + CP_XCOPYIN], %o5 ! if handler, invoke it with 3365 jmp %o5 ! original arguments 3366 nop 3367 3: 3368 retl 3369 or %g1, 0, %o0 ! return errno value 3370 3371 SET_SIZE(xcopyin) 3372 3373 #endif /* lint */ 3374 3375 #ifdef lint 3376 3377 /*ARGSUSED*/ 3378 int 3379 xcopyin_little(const void *uaddr, void *kaddr, size_t count) 3380 { return (0); } 3381 3382 #else /* lint */ 3383 3384 ENTRY(xcopyin_little) 3385 sethi %hi(.xcopyio_err), %o5 3386 or %o5, %lo(.xcopyio_err), %o5 3387 ldn [THREAD_REG + T_LOFAULT], %o4 3388 membar #Sync ! sync error barrier 3389 stn %o5, [THREAD_REG + T_LOFAULT] 3390 mov %o4, %o5 3391 3392 subcc %g0, %o2, %o3 3393 add %o0, %o2, %o0 3394 bz,pn %ncc, 2f ! check for zero bytes 3395 sub %o2, 1, %o4 3396 add %o0, %o4, %o0 ! start w/last byte 3397 add %o1, %o2, %o1 3398 lduba [%o0 + %o3]ASI_AIUSL, %o4 3399 3400 1: stb %o4, [%o1 + %o3] 3401 inccc %o3 3402 sub %o0, 2, %o0 ! get next byte 3403 bcc,a,pt %ncc, 1b 3404 lduba [%o0 + %o3]ASI_AIUSL, %o4 3405 3406 2: 3407 membar #Sync ! sync error barrier 3408 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 3409 retl 3410 mov %g0, %o0 !
return (0) 3411 3412 .xcopyio_err: 3413 membar #Sync ! sync error barrier 3414 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 3415 retl 3416 mov %g1, %o0 3417 3418 SET_SIZE(xcopyin_little) 3419 3420 #endif /* lint */ 3421 3422 3423 /* 3424 * Copy a block of storage - must not overlap (from + len <= to). 3425 * No fault handler installed (to be called under on_fault()) 3426 */ 3427 #if defined(lint) 3428 3429 /* ARGSUSED */ 3430 void 3431 copyin_noerr(const void *ufrom, void *kto, size_t count) 3432 {} 3433 3434 #else /* lint */ 3435 ENTRY(copyin_noerr) 3436 3437 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 3438 bleu,pt %ncc, .copyin_ne_small ! go to larger cases 3439 xor %o0, %o1, %o3 ! are src, dst alignable? 3440 btst 7, %o3 ! 3441 bz,pt %ncc, .copyin_ne_8 ! check for longword alignment 3442 nop 3443 btst 1, %o3 ! 3444 bz,pt %ncc, .copyin_ne_2 ! check for half-word 3445 nop 3446 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 3447 ld [%o3 + %lo(hw_copy_limit_1)], %o3 3448 tst %o3 3449 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 3450 cmp %o2, %o3 ! if length <= limit 3451 bleu,pt %ncc, .copyin_ne_small ! go to small copy 3452 nop 3453 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 3454 nop 3455 .copyin_ne_2: 3456 btst 3, %o3 ! 3457 bz,pt %ncc, .copyin_ne_4 ! check for word alignment 3458 nop 3459 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 3460 ld [%o3 + %lo(hw_copy_limit_2)], %o3 3461 tst %o3 3462 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 3463 cmp %o2, %o3 ! if length <= limit 3464 bleu,pt %ncc, .copyin_ne_small ! go to small copy 3465 nop 3466 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 3467 nop 3468 .copyin_ne_4: 3469 ! already checked longword, must be word aligned 3470 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 3471 ld [%o3 + %lo(hw_copy_limit_4)], %o3 3472 tst %o3 3473 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 3474 cmp %o2, %o3 ! if length <= limit 3475 bleu,pt %ncc, .copyin_ne_small ! go to small copy 3476 nop 3477 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 3478 nop 3479 .copyin_ne_8: 3480 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 3481 ld [%o3 + %lo(hw_copy_limit_8)], %o3 3482 tst %o3 3483 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 3484 cmp %o2, %o3 ! if length <= limit 3485 bleu,pt %ncc, .copyin_ne_small ! go to small copy 3486 nop 3487 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 3488 nop 3489 3490 .copyin_ne_small: 3491 ldn [THREAD_REG + T_LOFAULT], %o4 3492 tst %o4 3493 bz,pn %ncc, .sm_do_copyin 3494 nop 3495 sethi %hi(.sm_copyio_noerr), %o5 3496 or %o5, %lo(.sm_copyio_noerr), %o5 3497 membar #Sync ! sync error barrier 3498 ba,pt %ncc, .sm_do_copyin 3499 stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault 3500 3501 .copyin_noerr_more: 3502 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 3503 sethi %hi(.copyio_noerr), REAL_LOFAULT 3504 ba,pt %ncc, .do_copyin 3505 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT 3506 3507 .copyio_noerr: 3508 jmp %l6 3509 restore %g0,0,%g0 3510 3511 .sm_copyio_noerr: 3512 membar #Sync 3513 stn %o4, [THREAD_REG + T_LOFAULT] ! restore t_lofault 3514 jmp %o4 3515 nop 3516 3517 SET_SIZE(copyin_noerr) 3518 #endif /* lint */ 3519 3520 /* 3521 * Copy a block of storage - must not overlap (from + len <= to). 
3522 * No fault handler installed (to be called under on_fault()) 3523 */ 3524 3525 #if defined(lint) 3526 3527 /* ARGSUSED */ 3528 void 3529 copyout_noerr(const void *kfrom, void *uto, size_t count) 3530 {} 3531 3532 #else /* lint */ 3533 ENTRY(copyout_noerr) 3534 3535 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 3536 bleu,pt %ncc, .copyout_ne_small ! go to larger cases 3537 xor %o0, %o1, %o3 ! are src, dst alignable? 3538 btst 7, %o3 ! 3539 bz,pt %ncc, .copyout_ne_8 ! check for longword alignment 3540 nop 3541 btst 1, %o3 ! 3542 bz,pt %ncc, .copyout_ne_2 ! check for half-word 3543 nop 3544 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 3545 ld [%o3 + %lo(hw_copy_limit_1)], %o3 3546 tst %o3 3547 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 3548 cmp %o2, %o3 ! if length <= limit 3549 bleu,pt %ncc, .copyout_ne_small ! go to small copy 3550 nop 3551 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 3552 nop 3553 .copyout_ne_2: 3554 btst 3, %o3 ! 3555 bz,pt %ncc, .copyout_ne_4 ! check for word alignment 3556 nop 3557 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 3558 ld [%o3 + %lo(hw_copy_limit_2)], %o3 3559 tst %o3 3560 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 3561 cmp %o2, %o3 ! if length <= limit 3562 bleu,pt %ncc, .copyout_ne_small ! go to small copy 3563 nop 3564 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 3565 nop 3566 .copyout_ne_4: 3567 ! already checked longword, must be word aligned 3568 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 3569 ld [%o3 + %lo(hw_copy_limit_4)], %o3 3570 tst %o3 3571 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 3572 cmp %o2, %o3 ! if length <= limit 3573 bleu,pt %ncc, .copyout_ne_small ! go to small copy 3574 nop 3575 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 3576 nop 3577 .copyout_ne_8: 3578 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 3579 ld [%o3 + %lo(hw_copy_limit_8)], %o3 3580 tst %o3 3581 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 3582 cmp %o2, %o3 ! if length <= limit 3583 bleu,pt %ncc, .copyout_ne_small ! go to small copy 3584 nop 3585 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 3586 nop 3587 3588 .copyout_ne_small: 3589 ldn [THREAD_REG + T_LOFAULT], %o4 3590 tst %o4 3591 bz,pn %ncc, .sm_do_copyout 3592 nop 3593 sethi %hi(.sm_copyio_noerr), %o5 3594 or %o5, %lo(.sm_copyio_noerr), %o5 3595 membar #Sync ! sync error barrier 3596 ba,pt %ncc, .sm_do_copyout 3597 stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault 3598 3599 .copyout_noerr_more: 3600 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 3601 sethi %hi(.copyio_noerr), REAL_LOFAULT 3602 ba,pt %ncc, .do_copyout 3603 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT 3604 3605 SET_SIZE(copyout_noerr) 3606 #endif /* lint */ 3607 3608 3609 /* 3610 * hwblkclr - clears block-aligned, block-multiple-sized regions that are 3611 * 256 bytes or longer using spitfire's block stores. If 3612 * the criteria for using this routine are not met then it calls bzero 3613 * and returns 1. Otherwise 0 is returned indicating success. 3614 * Caller is responsible for ensuring use_hw_bzero is true and that 3615 * kpreempt_disable() has been called. 3616 */ 3617 #ifdef lint 3618 /*ARGSUSED*/ 3619 int 3620 hwblkclr(void *addr, size_t len) 3621 { 3622 return(0); 3623 } 3624 #else /* lint */ 3625 ! %i0 - start address 3626 ! %i1 - length of region (multiple of 64) 3627 ! %l0 - saved fprs 3628 ! %l1 - pointer to saved %d0 block 3629 !
#ifdef lint
/*ARGSUSED*/
int
hwblkclr(void *addr, size_t len)
{
	return (0);
}
#else	/* lint */
	! %i0 - start address
	! %i1 - length of region (multiple of 64)
	! %l0 - saved fprs
	! %l1 - pointer to saved %d0 block
	! %l2 - saved curthread->t_lwp (unused in this version)

	ENTRY(hwblkclr)
	! get another window w/space for one aligned block of saved fpregs
	save	%sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp

	! Must be block-aligned
	andcc	%i0, (VIS_BLOCKSIZE-1), %g0
	bnz,pn	%ncc, 1f
	nop

	! ... and must be 256 bytes or more
	cmp	%i1, 256
	blu,pn	%ncc, 1f
	nop

	! ... and length must be a multiple of VIS_BLOCKSIZE
	andcc	%i1, (VIS_BLOCKSIZE-1), %g0
	bz,pn	%ncc, 2f
	nop

1:	! punt, call bzero but notify the caller that bzero was used
	mov	%i0, %o0
	call	bzero
	mov	%i1, %o1
	ret
	restore	%g0, 1, %o0	! return (1) - did not use block operations

2:	rd	%fprs, %l0	! check for unused fp
	btst	FPRS_FEF, %l0
	bz,pt	%icc, 1f
	nop

	! save in-use fpregs on stack
	membar	#Sync
	add	%fp, STACK_BIAS - 65, %l1
	and	%l1, -VIS_BLOCKSIZE, %l1
	stda	%d0, [%l1]ASI_BLK_P

1:	membar	#StoreStore|#StoreLoad|#LoadStore
	wr	%g0, FPRS_FEF, %fprs
	wr	%g0, ASI_BLK_P, %asi

	! Clear block
	fzero	%d0
	fzero	%d2
	fzero	%d4
	fzero	%d6
	fzero	%d8
	fzero	%d10
	fzero	%d12
	fzero	%d14

	mov	256, %i3
	ba,pt	%ncc, .pz_doblock
	nop

.pz_blkstart:
      ! stda	%d0, [%i0 + 192]%asi	! in dly slot of branch that got us here
	stda	%d0, [%i0 + 128]%asi
	stda	%d0, [%i0 + 64]%asi
	stda	%d0, [%i0]%asi
.pz_zinst:
	add	%i0, %i3, %i0
	sub	%i1, %i3, %i1
.pz_doblock:
	cmp	%i1, 256
	bgeu,a	%ncc, .pz_blkstart
	stda	%d0, [%i0 + 192]%asi

	cmp	%i1, 64
	blu	%ncc, .pz_finish

	andn	%i1, (64-1), %i3
	srl	%i3, 4, %i2		! one stda (4 bytes) per 64-byte block
	set	.pz_zinst, %i4
	sub	%i4, %i2, %i4		! back up into the unrolled stda sequence
	jmp	%i4
	nop

.pz_finish:
	membar	#Sync
	btst	FPRS_FEF, %l0
	bz,a	.pz_finished
	wr	%l0, 0, %fprs		! restore fprs

	! restore fpregs from stack
	ldda	[%l1]ASI_BLK_P, %d0
	membar	#Sync
	wr	%l0, 0, %fprs		! restore fprs

.pz_finished:
	ret
	restore	%g0, 0, %o0		! return (0) - used block operations

	SET_SIZE(hwblkclr)
#endif	/* lint */
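
/*
 * Note on the computed jump above (a Duff's-device-style entry into
 * the unrolled stda sequence): each stda occupies 4 instruction
 * bytes and clears one 64-byte block, so backing up %i3/16 bytes
 * from .pz_zinst executes exactly one stda per remaining block.
 * Worked example: with 128 bytes left, %i3 = 128 and %i2 = 8, so the
 * jump lands two instructions before .pz_zinst and executes the
 * stores to [%i0 + 64] and [%i0] - clearing the remaining 128 bytes.
 */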
3736 */ 3737 ENTRY_NP(hw_pa_bcopy32) 3738 rdpr %pstate, %g1 3739 andn %g1, PSTATE_IE, %g2 3740 wrpr %g0, %g2, %pstate 3741 3742 rdpr %pstate, %g0 3743 ldxa [%o0]ASI_MEM, %o2 3744 add %o0, 8, %o0 3745 ldxa [%o0]ASI_MEM, %o3 3746 add %o0, 8, %o0 3747 ldxa [%o0]ASI_MEM, %o4 3748 add %o0, 8, %o0 3749 ldxa [%o0]ASI_MEM, %o5 3750 3751 stxa %g0, [%o1]ASI_DC_INVAL 3752 membar #Sync 3753 3754 stxa %o2, [%o1]ASI_MEM 3755 add %o1, 8, %o1 3756 stxa %o3, [%o1]ASI_MEM 3757 add %o1, 8, %o1 3758 stxa %o4, [%o1]ASI_MEM 3759 add %o1, 8, %o1 3760 stxa %o5, [%o1]ASI_MEM 3761 3762 retl 3763 wrpr %g0, %g1, %pstate 3764 3765 SET_SIZE(hw_pa_bcopy32) 3766 3767 #endif /* lint */ 3768 3769 #if defined(lint) 3770 3771 int use_hw_bcopy = 1; 3772 int use_hw_bzero = 1; 3773 uint_t hw_copy_limit_1 = 0; 3774 uint_t hw_copy_limit_2 = 0; 3775 uint_t hw_copy_limit_4 = 0; 3776 uint_t hw_copy_limit_8 = 0; 3777 3778 #else /* !lint */ 3779 3780 DGDEF(use_hw_bcopy) 3781 .word 1 3782 DGDEF(use_hw_bzero) 3783 .word 1 3784 DGDEF(hw_copy_limit_1) 3785 .word 0 3786 DGDEF(hw_copy_limit_2) 3787 .word 0 3788 DGDEF(hw_copy_limit_4) 3789 .word 0 3790 DGDEF(hw_copy_limit_8) 3791 .word 0 3792 3793 .align 64 3794 .section ".text" 3795 #endif /* !lint */