/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/asm_linkage.h>
#include <sys/vtrace.h>
#include <sys/machthread.h>
#include <sys/clock.h>
#include <sys/asi.h>
#include <sys/fsr.h>
#include <sys/privregs.h>
#include <sys/fpras_impl.h>

#include "assym.h"

/*
 * Pseudo-code to aid in understanding the control flow of the
 * bcopy/copyin/copyout routines.
 *
 * On entry:
 *
 *	! Determine whether to use the FP register version
 *	! or the leaf routine version depending on size
 *	! of copy and flags.  Set up error handling accordingly.
 *	! The transition point depends on whether the src and
 *	! dst addresses can be aligned to long word, word,
 *	! half word, or byte boundaries.
 *	!
 *	! WARNING: <Register usage convention>
 *	! For FP version, %l6 holds previous error handling and
 *	! a flag: TRAMP_FLAG (low bits)
 *	! for leaf routine version, %o4 holds those values.
 *	! So either %l6 or %o4 is reserved and not available for
 *	! any other use.
 *
 *	if (length <= VIS_COPY_THRESHOLD)	! start with a quick test
 *		go to small_copy;		! to speed short copies
 *
 *	if (src,dst long word alignable) {
 *		if (hw_copy_limit_8 == 0)	! hw_copy disabled
 *			go to small_copy;
 *		if (length <= hw_copy_limit_8)
 *			go to small_copy;
 *		go to FPBLK_copy;
 *	}
 *	if (src,dst not alignable) {
 *		if (hw_copy_limit_1 == 0)	! hw_copy disabled
 *			go to small_copy;
 *		if (length <= hw_copy_limit_1)
 *			go to small_copy;
 *		go to FPBLK_copy;
 *	}
 *	if (src,dst halfword alignable) {
 *		if (hw_copy_limit_2 == 0)	! hw_copy disabled
 *			go to small_copy;
 *		if (length <= hw_copy_limit_2)
 *			go to small_copy;
 *		go to FPBLK_copy;
 *	}
 *	if (src,dst word alignable) {
 *		if (hw_copy_limit_4 == 0)	! hw_copy disabled
 *			go to small_copy;
 *		if (length <= hw_copy_limit_4)
 *			go to small_copy;
 *		go to FPBLK_copy;
 *	}
 *
 * small_copy:
 *	Setup_leaf_rtn_error_handler;		! diffs for each entry point
 *
 *	if (count <= 3)				! fast path for tiny copies
 *		go to sm_left;			! special finish up code
 *	else
 *		if (count > CHKSIZE)		! medium sized copies
 *			go to sm_med		! tuned by alignment
 *	if (src&dst not both word aligned) {
 * sm_movebytes:
 *		move byte by byte in 4-way unrolled loop
 *		fall into sm_left;
 * sm_left:
 *		move 0-3 bytes byte at a time as needed.
 *		restore error handler and exit.
 *
 *	} else {	! src&dst are word aligned
 *		check for at least 8 bytes left,
 *		move word at a time, unrolled by 2
 *		when fewer than 8 bytes left,
 * sm_half:	move half word at a time while 2 or more bytes left
 * sm_byte:	move final byte if necessary
 * sm_exit:
 *		restore error handler and exit.
 *	}
 *
 * ! Medium length cases with at least CHKSIZE bytes available
 * ! method: line up src and dst as best possible, then
 * ! move data in 4-way unrolled loops.
 *
 * sm_med:
 *	if (src&dst unalignable)
 *		go to sm_movebytes
 *	if (src&dst halfword alignable)
 *		go to sm_movehalf
 *	if (src&dst word alignable)
 *		go to sm_moveword
 * ! fall into long word movement
 *	move bytes until src is word aligned
 *	if not long word aligned, move a word
 *	move long words in 4-way unrolled loop until < 32 bytes left
 *	move long words in 1-way unrolled loop until < 8 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 * sm_moveword:
 *	move bytes until src is word aligned
 *	move words in 4-way unrolled loop until < 16 bytes left
 *	move words in 1-way unrolled loop until < 4 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 * sm_movehalf:
 *	move a byte if needed to align src on halfword
 *	move halfwords in 4-way unrolled loop until < 8 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 *
 * FPBLK_copy:
 *	%l6 = curthread->t_lofault;
 *	if (%l6 != NULL) {
 *		membar #Sync
 *		curthread->t_lofault = .copyerr;
 *		caller_error_handler = TRUE	! %l6 |= 2
 *	}
 *
 *	! for FPU testing we must not migrate cpus
 *	if (curthread->t_lwp == NULL) {
 *		! Kernel threads do not have pcb's in which to store
 *		! the floating point state, so disallow preemption during
 *		! the copy.  This also prevents cpu migration.
 *		kpreempt_disable(curthread);
 *	} else {
 *		thread_nomigrate();
 *	}
 *
 *	old_fprs = %fprs;
 *	old_gsr = %gsr;
 *	if (%fprs.fef) {
 *		%fprs.fef = 1;
 *		save current fpregs on stack using blockstore
 *	} else {
 *		%fprs.fef = 1;
 *	}
 *
 *
 *	do_blockcopy_here;
 *
 * In lofault handler:
 *	curthread->t_lofault = .copyerr2;
 *	Continue on with the normal exit handler
 *
 * On normal exit:
 *	%gsr = old_gsr;
 *	if (old_fprs & FPRS_FEF)
 *		restore fpregs from stack using blockload
 *	else
 *		zero fpregs
 *	%fprs = old_fprs;
 *	membar #Sync
 *	curthread->t_lofault = (%l6 & ~3);
 *	! following test omitted from copyin/copyout as they
 *	! will always have a current thread
 *	if (curthread->t_lwp == NULL)
 *		kpreempt_enable(curthread);
 *	else
 *		thread_allowmigrate();
 *	return (0)
 *
 * In second lofault handler (.copyerr2):
 *	We've tried to restore fp state from the stack and failed.  To
 *	prevent returning with a corrupted fp state, we will panic.
 */

/*
 * Comments about optimization choices
 *
 * The initial optimization decision in this code is to determine
 * whether to use the FP registers for a copy or not.  If we don't
 * use the FP registers, we can execute the copy as a leaf routine,
 * saving a register save and restore.  Also, less elaborate setup
 * is required, allowing short copies to be completed more quickly.
 * For longer copies, especially unaligned ones (where the src and
 * dst do not align to allow simple ldx,stx operation), the FP
 * registers allow much faster copy operations.
 *
 * The estimated extra cost of the FP path will vary depending on
 * src/dst alignment, dst offset from the next 64 byte FPblock store
 * boundary, remaining src data after the last full dst cache line is
 * moved, whether the FP registers need to be saved, and some other
 * minor issues.  The average additional overhead is estimated to be
 * 400 clocks.  Since each non-repeated/predicted tst and branch costs
 * around 10 clocks, elaborate calculation would slow down all
 * longer copies and only benefit a small portion of medium sized
 * copies.  Rather than incur such cost, we chose fixed transition
 * points for each of the alignment choices.
 *
 * For the inner loop, here is a comparison of the per cache line
 * costs for each alignment when src&dst are in cache:
 *
 *	byte aligned:	108 clocks slower for non-FPBLK
 *	half aligned:	 44 clocks slower for non-FPBLK
 *	word aligned:	 12 clocks slower for non-FPBLK
 *	long aligned:	  4 clocks >>faster<< for non-FPBLK
 *
 * The long aligned loop runs faster because it does no prefetching.
 * That wins if the data is not in cache or there is too little
 * data to gain much benefit from prefetching.  But when there
 * is more data and that data is not in cache, failing to prefetch
 * can run much slower.  In addition, there is a 2 Kbyte store queue
 * which will cause the non-FPBLK inner loop to slow for larger copies.
 * The exact tradeoff is strongly load and application dependent, with
 * increasing risk of a customer visible performance regression if the
 * non-FPBLK code is used for larger copies.  Studies of synthetic in-cache
 * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
 * upper limit for the non-FPBLK code.  To minimize performance regression
 * risk while still gaining the primary benefits of the improvements to
 * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
 * hw_copy_limit_*.  Later experimental studies using different values
 * of hw_copy_limit_* can be used to make further adjustments if
 * appropriate.
 *
 *	hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
 *	hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
 *	hw_copy_limit_4 = src and dst are word aligned but not longword aligned
 *	hw_copy_limit_8 = src and dst are longword aligned
 *
 * To say that src and dst are word aligned means that after
 * some initial alignment activity of moving 0 to 3 bytes,
 * both the src and dst will be on word boundaries so that
 * word loads and stores may be used.
 *
 * Recommended initial values as of Mar 2004, based on testing
 * on Cheetah+ (900MHz), Cheetah++ (1200MHz), and Jaguar (1050MHz):
 *	hw_copy_limit_1 =  256
 *	hw_copy_limit_2 =  512
 *	hw_copy_limit_4 = 1024
 *	hw_copy_limit_8 = 1024 (or 1536 on some systems)
 *
 *
 * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
 * disabled for that alignment choice.
 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256)
 * the value of VIS_COPY_THRESHOLD is used.
 * It is not envisioned that hw_copy_limit_? will be changed in the field.
 * It is provided to allow for disabling FPBLK copies and to allow
 * easy testing of alternate values on future HW implementations
 * that might have different cache sizes, clock rates or instruction
 * timing rules.
 *
 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
 * threshold to speed up all shorter copies (less than 256).  That
 * saves an alignment test, memory reference, and enabling test
 * for all short copies, or an estimated 24 clocks.
 *
 * The order in which these limits are checked does matter since each
 * non-predicted tst and branch costs around 10 clocks.
 * If src and dst are randomly selected addresses,
 *	4 of 8 will not be alignable.
 *	2 of 8 will be half word alignable.
 *	1 of 8 will be word alignable.
 *	1 of 8 will be long word alignable.
 * But, tests on running kernels show that src and dst to copy code
 * are typically not on random alignments.  Structure copies and
 * copies of larger data sizes are often on long word boundaries.
 * So we test the long word alignment case first, then
 * the byte alignment, then halfword, then word alignment.
 *
 * Several times, tests for length are made to split the code
 * into subcases.  These tests often allow later tests to be
 * avoided.  For example, within the non-FPBLK copy, we first
 * check for tiny copies of 3 bytes or less.  That allows us
 * to use a 4-way unrolled loop for the general byte copy case
 * without a test on loop entry.
 * We subdivide the non-FPBLK case further into cases of CHKSIZE bytes
 * or less vs longer cases.  For the really short case, we don't attempt
 * to align src and dst.  We try to minimize special case tests in
 * the shortest loops as each test adds a significant percentage
 * to the total time.
 *
 * For the medium sized cases, we allow ourselves to adjust the
 * src and dst alignment and provide special cases for each of
 * the four adjusted alignment cases.  The CHKSIZE that was used
 * to decide between short and medium size was chosen to be 39
 * as that allows for the worst case of 7 bytes of alignment
 * shift and 4 times 8 bytes for the first long word unrolling
 * (7 + 4*8 = 39).  That knowledge saves an initial test for length
 * on entry into the medium cases.  If the general loop unrolling
 * factor were to be increased, this number would also need to be
 * adjusted.
 *
 * For all cases in the non-FPBLK code where it is known that at
 * least 4 chunks of data are available for movement, the
 * loop is unrolled by four.  This 4-way loop runs in 8 clocks
 * or 2 clocks per data element.  Due to limitations of the
 * branch instruction on Cheetah, Jaguar, and Panther, the
 * minimum time for a small, tight loop is 3 clocks.  So
 * the 4-way loop runs 50% faster than the fastest non-unrolled
 * loop.
 *
 * Instruction alignment is forced by use of .align 16 directives
 * and nops which are not executed in the code.  This
 * combination of operations shifts the alignment of following
 * loops to ensure that loops are aligned so that their instructions
 * fall within the minimum number of 4 instruction fetch groups.
 * If instructions are inserted or removed between the .align
 * instruction and the unrolled loops, then the alignment needs
 * to be readjusted.  Misaligned loops can add a clock per loop
 * iteration to the loop timing.
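 *
 * For example, the padding pattern used throughout this file looks
 * like the following (illustrative only; the nop is placed before
 * the loop label and is never executed):
 *
 *		.align	16
 *		nop			! instruction alignment padding
 *	loop:
 *		ldub	[%o0], %o3	! loop entry now falls within
 *		...			! aligned 4-instruction fetch groups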
 *
 * In a few cases, code is duplicated to avoid a branch.  Since
 * a non-predicted tst and branch takes 10 clocks, this savings
 * is judged an appropriate time-space tradeoff.
 *
 * Within the FPBLK-code, the prefetch method in the inner
 * loop needs to be explained as it is not standard.  Two
 * prefetches are issued for each cache line instead of one.
 * The primary one is at the maximum reach of 8 cache lines.
 * Most of the time, that maximum prefetch reach gives the
 * cache line more time to reach the processor for systems with
 * higher processor clocks.  But, sometimes memory interference
 * can cause that prefetch to be dropped.  Putting a second
 * prefetch at a reach of 5 cache lines catches the drops
 * three iterations later and shows a measured improvement
 * in performance over any similar loop with a single prefetch.
 * The prefetches are placed in the loop so they overlap with
 * non-memory instructions, so that there is no extra cost
 * when the data is already in-cache.
 *
 */

/*
 * Notes on preserving existing fp state and on membars.
 *
 * When a copyOP decides to use fp we may have to preserve existing
 * floating point state.  It is not the caller's state that we need to
 * preserve - the rest of the kernel does not use fp and, anyway, fp
 * registers are volatile across a call.  Some examples:
 *
 *	- userland has fp state and is interrupted (device interrupt
 *	  or trap) and within the interrupt/trap handling we use
 *	  bcopy()
 *	- another (higher level) interrupt or trap handler uses bcopy
 *	  while a bcopy from an earlier interrupt is still active
 *	- an asynchronous error trap occurs while fp state exists (in
 *	  userland or in kernel copy) and the tl0 component of the
 *	  handling uses bcopy
 *	- a user process with fp state incurs a copy-on-write fault and
 *	  hwblkpagecopy always uses fp
 *
 * We therefore need a per-call place in which to preserve fp state -
 * using our stack is ideal (and since fp copy cannot be leaf optimized
 * because of calls it makes, this is no hardship).
 *
 * The following membar BLD/BST discussion is Cheetah pipeline specific.
 * In Cheetah BLD is blocking, #LoadLoad/#LoadStore/#StoreStore are
 * nops (those semantics always apply) and #StoreLoad is implemented
 * as a membar #Sync.
 *
 * It is possible that the owner of the fp state has a block load or
 * block store still "in flight" at the time we come to preserve that
 * state.  Block loads are blocking in Cheetah pipelines so we do not
 * need to sync with them.  In preserving fp regs we will use block stores
 * (which are not blocking in Cheetah pipelines) so we require a membar #Sync
 * after storing state (so that our subsequent use of those registers
 * does not modify them before the block stores complete); this membar
 * also serves to sync with block stores the owner of the fp state has
 * initiated.
 *
 * When we have finished fp copy (with its repeated block stores)
 * we must membar #Sync so that our block stores may complete before
 * we either restore the original fp state into the fp registers or
 * return to a caller which may initiate other fp operations that could
 * modify the fp regs we used before the block stores complete.
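 *
 * The resulting save/copy/restore ordering can be sketched as follows
 * (illustrative pseudo-code only; the real sequences are the BST_*/BLD_*
 * macros and the .do_copy/.copyerr paths below):
 *
 *	if (%fprs & FPRS_FEF) {
 *		block-store in-use fp quadrants to the stack buffer;
 *		membar #Sync;	! our BSTs, and any BSTs of the fp owner
 *	}
 *	... block copy loop ...
 *	membar #Sync;		! our copy's block stores complete
 *	if (fp state was saved) {
 *		block-load fp quadrants back from the stack;
 *		membar #Sync;	! BLDs complete before fp regs are reused
 *	}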
 *
 * Synchronous faults (eg, unresolvable DMMU miss) that occur while
 * t_lofault is not NULL will not panic but will instead trampoline
 * to the registered lofault handler.  There is no need for any
 * membars for these - eg, our store to t_lofault will always be visible to
 * ourselves and it is our cpu which will take any trap.
 *
 * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
 * while t_lofault is not NULL will also not panic.  Since we're copying
 * to or from userland the extent of the damage is known - the destination
 * buffer is incomplete.  So trap handlers will trampoline to the lofault
 * handler in this case which should take some form of error action to
 * avoid using the incomplete buffer.  The trap handler also flags the
 * fault so that later return-from-trap handling (for the trap that brought
 * this thread into the kernel in the first place) can notify the process
 * and reboot the system (or restart the service with Greenline/Contracts).
 *
 * Asynchronous faults (eg, uncorrectable ECC error from memory) can
 * result in deferred error traps - the trap is taken sometime after
 * the event and the trap PC may not be the PC of the faulting access.
 * Delivery of such pending traps can be forced by a membar #Sync, acting
 * as an "error barrier" in this role.  To accurately apply the user/kernel
 * separation described in the preceding paragraph we must force delivery
 * of deferred traps affecting kernel state before we install a lofault
 * handler (if we interpose a new lofault handler on an existing one there
 * is no need to repeat this), and we must force delivery of deferred
 * errors affecting the lofault-protected region before we clear t_lofault.
 * Failure to do so results in lost kernel state being interpreted as
 * affecting a copyin/copyout only, or in an error that really only
 * affects copy data being interpreted as losing kernel state.
 *
 * Since the copy operations may preserve and later restore floating
 * point state that does not belong to the caller (see examples above),
 * we must be careful in how we do this in order to prevent corruption
 * of another program.
 *
 * To make sure that floating point state is always saved and restored
 * correctly, the following "big rules" must be followed when the floating
 * point registers will be used:
 *
 * 1. %l6 always holds the caller's lofault handler.  Also in this register,
 *    bit 0 (FPUSED_FLAG, value 1) indicates that the floating point
 *    registers are in use.  Bit 1 (TRAMP_FLAG, value 2) indicates that
 *    the call was to bcopy, and a lofault handler was set coming in.
 *
 * 2. The FPUSED flag indicates that all FP state has been successfully stored
 *    on the stack.  It should not be set until this save has been completed.
 *
 * 3. The FPUSED flag should not be cleared on exit until all FP state has
 *    been restored from the stack.  If an error occurs while restoring
 *    data from the stack, the error handler can check this flag to see if
 *    a restore is necessary.
 *
 * 4. Code run under the new lofault handler must be kept to a minimum.  In
 *    particular, any calls to FP_ALLOWMIGRATE, which could result in a call
 *    to kpreempt(), should not be made until after the lofault handler has
 *    been restored.
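 *
 * Expressed as pseudo-code, the %l6 flag protocol described by the
 * rules above is roughly (illustrative sketch only):
 *
 *	%l6 = caller's t_lofault;
 *	if (%l6 != NULL && called via bcopy)
 *		%l6 |= TRAMP_FLAG;	! trampoline to caller's handler
 *	save fp state on stack;		! BST + membar #Sync
 *	%l6 |= FPUSED_FLAG;		! only after the save completes
 *	... copy ...
 *	restore (or zero) fp state;	! BLD + membar #Sync
 *	curthread->t_lofault = %l6 & ~MASK_FLAGS;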
 */

/*
 * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
 * to "break even" using FP/VIS-accelerated memory operations.
 * The FPBLK code assumes a minimum number of bytes are available
 * to be moved on entry.  Check that code carefully before
 * reducing VIS_COPY_THRESHOLD below 256.
 */
/*
 * This shadows sys/machsystm.h which can't be included due to the lack of
 * _ASM guards in include files it references.  Change it here, change it
 * there.
 */
#define	VIS_COPY_THRESHOLD	256

/*
 * TEST for very short copies
 * Be aware that the maximum unroll for the short unaligned case
 * is SHORTCOPY+1
 */
#define	SHORTCOPY	3
#define	CHKSIZE		39

/*
 * Indicates that we're to trampoline to the error handler.
 * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
 * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
 */
#define	FPUSED_FLAG	1
#define	TRAMP_FLAG	2
#define	MASK_FLAGS	3

/*
 * Number of outstanding prefetches.
 * Testing with 1200 MHz Cheetah+ and Jaguar gives best results with
 * two prefetches, one with a reach of 8*BLOCK_SIZE+8 and one with a
 * reach of 5*BLOCK_SIZE.  The double prefetch gives a typical improvement
 * of 5% for large copies as compared to a single prefetch.  The reason
 * for the improvement is that with Cheetah and Jaguar, some prefetches
 * are dropped due to the prefetch queue being full.  The second prefetch
 * reduces the number of cache lines that are dropped.
 * Do not remove the double prefetch or change either CHEETAH_PREFETCH
 * or CHEETAH_2ND_PREFETCH without extensive performance tests to prove
 * there is no loss of performance.
 */
#define	CHEETAH_PREFETCH	8
#define	CHEETAH_2ND_PREFETCH	5

#define	VIS_BLOCKSIZE		64

/*
 * Size of stack frame in order to accommodate a 64-byte aligned
 * floating-point register save area and 2 64-bit temp locations.
 * All copy functions use two quadrants of fp registers; to assure a
 * block-aligned two block buffer in which to save we must reserve
 * three blocks on stack.  Not all functions preserve %fprs on stack
 * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
 *
 *    _______________________________________ <-- %fp + STACK_BIAS
 *    | We may need to preserve 2 quadrants |
 *    | of fp regs, but since we do so with |
 *    | BST/BLD we need room in which to    |
 *    | align to VIS_BLOCKSIZE bytes.  So   |
 *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
 *    ---------------------------------------
 */
#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
#define	SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 3)
#define	SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 2) - 1)
#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)

/*
 * Common macros used by the various versions of the block copy
 * routines in this file.
 */

/*
 * In FP copies if we do not have preserved data to restore over
 * the fp regs we used then we must zero those regs to avoid
 * exposing portions of the data to later threads (data security).
 *
 * Copy functions use either quadrants 1 and 3 or 2 and 4.
 *
 * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
 * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
 *
 * The instructions below are quicker than repeated fzero instructions
 * since they can dispatch down two fp pipelines.
 */
#define	FZEROQ1Q3			\
	fzero	%f0			;\
	fzero	%f2			;\
	faddd	%f0, %f2, %f4		;\
	fmuld	%f0, %f2, %f6		;\
	faddd	%f0, %f2, %f8		;\
	fmuld	%f0, %f2, %f10		;\
	faddd	%f0, %f2, %f12		;\
	fmuld	%f0, %f2, %f14		;\
	faddd	%f0, %f2, %f32		;\
	fmuld	%f0, %f2, %f34		;\
	faddd	%f0, %f2, %f36		;\
	fmuld	%f0, %f2, %f38		;\
	faddd	%f0, %f2, %f40		;\
	fmuld	%f0, %f2, %f42		;\
	faddd	%f0, %f2, %f44		;\
	fmuld	%f0, %f2, %f46

#define	FZEROQ2Q4			\
	fzero	%f16			;\
	fzero	%f18			;\
	faddd	%f16, %f18, %f20	;\
	fmuld	%f16, %f18, %f22	;\
	faddd	%f16, %f18, %f24	;\
	fmuld	%f16, %f18, %f26	;\
	faddd	%f16, %f18, %f28	;\
	fmuld	%f16, %f18, %f30	;\
	faddd	%f16, %f18, %f48	;\
	fmuld	%f16, %f18, %f50	;\
	faddd	%f16, %f18, %f52	;\
	fmuld	%f16, %f18, %f54	;\
	faddd	%f16, %f18, %f56	;\
	fmuld	%f16, %f18, %f58	;\
	faddd	%f16, %f18, %f60	;\
	fmuld	%f16, %f18, %f62

/*
 * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
 * Used to save and restore in-use fp registers when we want to use FP
 * and find fp already in use and copy size still large enough to justify
 * the additional overhead of this save and restore.
 *
 * A membar #Sync is needed before save to sync fp ops initiated before
 * the call to the copy function (by whoever has fp in use); for example
 * an earlier block load to the quadrant we are about to save may still be
 * "in flight".  A membar #Sync is required at the end of the save to
 * sync our block store (the copy code is about to begin ldd's to the
 * first quadrant).  Note, however, that since Cheetah pipeline block load
 * is blocking we can omit the initial membar before saving fp state (they're
 * commented below in case of future porting to a chip that does not block
 * on block load).
 *
 * Similarly: a membar #Sync before restore allows the block stores of
 * the copy operation to complete before we fill the quadrants with their
 * original data, and a membar #Sync after restore lets the block loads
 * of the restore complete before we return to whoever has the fp regs
 * in use.  To avoid repeated membar #Sync we make it the responsibility
 * of the copy code to membar #Sync immediately after copy is complete
 * and before using the BLD_*_FROMSTACK macro.
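 *
 * A typical usage ordering, sketched (see .do_copy and the normal
 * exit path below for the real sequences):
 *
 *	BST_FPQ1Q3_TOSTACK(tmp)		! ends with membar #Sync
 *	... copy using quadrants 1 and 3 ...
 *	membar	#Sync			! copy code's responsibility
 *	BLD_FPQ1Q3_FROMSTACK(tmp)	! ends with membar #Sync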
 */
#define	BST_FPQ1Q3_TOSTACK(tmp1)				\
	/* membar #Sync */					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f0, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f32, [tmp1]ASI_BLK_P				;\
	membar	#Sync

#define	BLD_FPQ1Q3_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f0				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f32				;\
	membar	#Sync

#define	BST_FPQ2Q4_TOSTACK(tmp1)				\
	/* membar #Sync */					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f16, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f48, [tmp1]ASI_BLK_P				;\
	membar	#Sync

#define	BLD_FPQ2Q4_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f16				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f48				;\
	membar	#Sync

/*
 * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
 * prevent preemption if there is no t_lwp to save FP state to on context
 * switch) before commencing a FP copy, and reallow it on completion or
 * in error trampoline paths when we were using FP copy.
 *
 * Both macros may call other functions, so be aware that all outputs are
 * forfeit after using these macros.  For this reason we do not pass registers
 * to use - we just use any outputs we want.
 *
 * For fpRAS we need to perform the fpRAS mechanism test on the same
 * CPU as we use for the copy operation, both so that we validate the
 * CPU we perform the copy on and so that we know which CPU failed
 * if a failure is detected.  Hence we need to be bound to "our" CPU.
 * This could be achieved through disabling preemption (and we do it that
 * way for threads with no t_lwp) but for larger copies this may hold
 * higher priority threads off of cpu for too long (eg, realtime).  So we
 * make use of the lightweight t_nomigrate mechanism where we can (ie, when
 * we have a t_lwp).
 *
 * Pseudo code:
 *
 * FP_NOMIGRATE:
 *
 *	if (curthread->t_lwp) {
 *		thread_nomigrate();
 *	} else {
 *		kpreempt_disable();
 *	}
 *
 * FP_ALLOWMIGRATE:
 *
 *	if (curthread->t_lwp) {
 *		thread_allowmigrate();
 *	} else {
 *		kpreempt_enable();
 *	}
 */

#define	FP_NOMIGRATE(label1, label2)				\
	ldn	[THREAD_REG + T_LWP], %o0			;\
	brz,a,pn %o0, label1/**/f				;\
	ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
	call	thread_nomigrate				;\
	nop							;\
	ba	label2/**/f					;\
	nop							;\
label1:								;\
	inc	%o1						;\
	stb	%o1, [THREAD_REG + T_PREEMPT]			;\
label2:

#define	FP_ALLOWMIGRATE(label1, label2)				\
	ldn	[THREAD_REG + T_LWP], %o0			;\
	brz,a,pn %o0, label1/**/f				;\
	ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
	call	thread_allowmigrate				;\
	nop							;\
	ba	label2/**/f					;\
	nop							;\
label1:								;\
	dec	%o1						;\
	brnz,pn	%o1, label2/**/f				;\
	stb	%o1, [THREAD_REG + T_PREEMPT]			;\
	ldn	[THREAD_REG + T_CPU], %o0			;\
	ldub	[%o0 + CPU_KPRUNRUN], %o0			;\
	brz,pt	%o0, label2/**/f				;\
	nop							;\
	call	kpreempt					;\
	rdpr	%pil, %o0					;\
label2:

/*
 * Copy a block of storage, returning an error code if `from' or
 * `to' takes a kernel pagefault which cannot be resolved.
 * Returns errno value on pagefault error, 0 if all ok
 */

	.seg	".text"
	.align	4

	ENTRY(kcopy)

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .kcopy_small		! go to small copy cases
	xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .kcopy_8			! check for longword alignment
	nop
	btst	1, %o3				!
	bz,pt	%ncc, .kcopy_2			! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	nop
.kcopy_2:
	btst	3, %o3				!
	bz,pt	%ncc, .kcopy_4			! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	nop
.kcopy_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	nop
.kcopy_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	nop

.kcopy_small:
	sethi	%hi(.sm_copyerr), %o5		! sm_copyerr is lofault value
	or	%o5, %lo(.sm_copyerr), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .sm_do_copy		! common code
	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault

.kcopy_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
	or	%l7, %lo(.copyerr), %l7
	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .do_copy			! common code
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault


/*
 * We got here because of a fault during bcopy_more, called from kcopy
 * or bcopy.  Errno value is in %g1.  bcopy_more uses fp quadrants 1 and 3.
 */
.copyerr:
	set	.copyerr2, %l0
	membar	#Sync				! sync error barrier
	stn	%l0, [THREAD_REG + T_LOFAULT]	! set t_lofault
	btst	FPUSED_FLAG, %l6
	bz	%ncc, 1f
	and	%l6, TRAMP_FLAG, %l0		! copy trampoline flag to %l0

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	nop

	BLD_FPQ1Q3_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	wr	%o3, 0, %fprs			! restore fprs

4:
	FZEROQ1Q3
	wr	%o3, 0, %fprs			! restore fprs

	!
	! Need to cater for the different expectations of kcopy
	! and bcopy.  kcopy will *always* set a t_lofault handler.
	! If it fires, we're expected to just return the error code
	! and *not* to invoke any existing error handler.  As far as
	! bcopy is concerned, we only set t_lofault if there was an
	! existing lofault handler.  In that case we're expected to
	! invoke the previously existing handler after resetting the
	! t_lofault value.
	!
1:
	andn	%l6, MASK_FLAGS, %l6		! turn trampoline flag off
	membar	#Sync				! sync error barrier
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)

	btst	TRAMP_FLAG, %l0
	bnz,pn	%ncc, 3f
	nop
	ret
	restore	%g1, 0, %o0

3:
	!
	! We're here via bcopy.  There *must* have been an error handler
	! in place otherwise we would have died a nasty death already.
	!
	jmp	%l6				! goto real handler
	restore	%g0, 0, %o0			! dispose of copy window

/*
 * We got here because of a fault in .copyerr.  We can't safely restore fp
 * state, so we panic.
 */
fp_panic_msg:
	.asciz	"Unable to restore fp state after copy operation"

	.align	4
.copyerr2:
	set	fp_panic_msg, %o0
	call	panic
	nop

/*
 * We got here because of a fault during a small kcopy or bcopy.
 * No floating point registers are used by the small copies.
 * Errno value is in %g1.
 */
.sm_copyerr:
1:
	btst	TRAMP_FLAG, %o4
	membar	#Sync
	andn	%o4, TRAMP_FLAG, %o4
	bnz,pn	%ncc, 3f
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g1, %o0
3:
	jmp	%o4				! goto real handler
	mov	%g0, %o0			!

	SET_SIZE(kcopy)


/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * Registers: l6 - saved t_lofault
 * (for short copies, o4 - saved t_lofault)
 */

	ENTRY(bcopy)

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .bcopy_small		! go to small copy cases
	xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .bcopy_8			! check for longword alignment
	nop
	btst	1, %o3				!
	bz,pt	%ncc, .bcopy_2			! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	nop
.bcopy_2:
	btst	3, %o3				!
	bz,pt	%ncc, .bcopy_4			! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	nop
.bcopy_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	nop
.bcopy_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	nop

	.align	16
.bcopy_small:
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save t_lofault
	tst	%o4
	bz,pt	%icc, .sm_do_copy
	nop
	sethi	%hi(.sm_copyerr), %o5
	or	%o5, %lo(.sm_copyerr), %o5
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! install new vector
	or	%o4, TRAMP_FLAG, %o4		! error should trampoline
.sm_do_copy:
	cmp	%o2, SHORTCOPY			! check for really short case
	bleu,pt	%ncc, .bc_sm_left		!
	cmp	%o2, CHKSIZE			! check for medium length cases
	bgu,pn	%ncc, .bc_med			!
	or	%o0, %o1, %o3			! prepare alignment check
	andcc	%o3, 0x3, %g0			! test for alignment
	bz,pt	%ncc, .bc_sm_word		! branch to word aligned case
.bc_sm_movebytes:
	sub	%o2, 3, %o2			! adjust count to allow cc zero test
.bc_sm_notalign4:
	ldub	[%o0], %o3			! read byte
	stb	%o3, [%o1]			! write byte
	subcc	%o2, 4, %o2			! reduce count by 4
	ldub	[%o0 + 1], %o3			! repeat for a total of 4 bytes
	add	%o0, 4, %o0			! advance SRC by 4
	stb	%o3, [%o1 + 1]
	ldub	[%o0 - 2], %o3
	add	%o1, 4, %o1			! advance DST by 4
	stb	%o3, [%o1 - 2]
	ldub	[%o0 - 1], %o3
	bgt,pt	%ncc, .bc_sm_notalign4		! loop til 3 or fewer bytes remain
	stb	%o3, [%o1 - 1]
	add	%o2, 3, %o2			! restore count
.bc_sm_left:
	tst	%o2
	bz,pt	%ncc, .bc_sm_exit		! check for zero length
	deccc	%o2				! reduce count for cc test
	ldub	[%o0], %o3			! move one byte
	bz,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1]
	ldub	[%o0 + 1], %o3			! move another byte
	deccc	%o2				! check for more
	bz,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1 + 1]
	ldub	[%o0 + 2], %o3			! move final byte
	stb	%o3, [%o1 + 2]
	membar	#Sync				! sync error barrier
	andn	%o4, TRAMP_FLAG, %o4
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0			! return 0
	.align	16
	nop					! instruction alignment
						! see discussion at start of file
.bc_sm_words:
	lduw	[%o0], %o3			! read word
.bc_sm_wordx:
	subcc	%o2, 8, %o2			! update count
	stw	%o3, [%o1]			! write word
	add	%o0, 8, %o0			! update SRC
	lduw	[%o0 - 4], %o3			! read word
	add	%o1, 8, %o1			! update DST
	bgt,pt	%ncc, .bc_sm_words		! loop til done
	stw	%o3, [%o1 - 4]			! write word
	addcc	%o2, 7, %o2			! restore count
	bz,pt	%ncc, .bc_sm_exit
	deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
.bc_sm_half:
	subcc	%o2, 2, %o2			! reduce count by 2
	add	%o0, 2, %o0			! advance SRC by 2
	lduh	[%o0 - 2], %o3			! read half word
	add	%o1, 2, %o1			! advance DST by 2
	bgt,pt	%ncc, .bc_sm_half		! loop til done
	sth	%o3, [%o1 - 2]			! write half word
	addcc	%o2, 1, %o2			! restore count
	bz,pt	%ncc, .bc_sm_exit
	nop
.bc_sm_byte:
	ldub	[%o0], %o3
	stb	%o3, [%o1]
	membar	#Sync				! sync error barrier
	andn	%o4, TRAMP_FLAG, %o4
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0			! return 0

.bc_sm_word:
	subcc	%o2, 4, %o2			! update count
	bgt,pt	%ncc, .bc_sm_wordx
	lduw	[%o0], %o3			! read word
	addcc	%o2, 3, %o2			! restore count
	bz,pt	%ncc, .bc_sm_exit
	stw	%o3, [%o1]			! write word
	deccc	%o2				! reduce count for cc test
	ldub	[%o0 + 4], %o3			! load one byte
	bz,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1 + 4]			! store one byte
	ldub	[%o0 + 5], %o3			! load second byte
	deccc	%o2
	bz,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1 + 5]			! store second byte
	ldub	[%o0 + 6], %o3			! load third byte
	stb	%o3, [%o1 + 6]			! store third byte
.bc_sm_exit:
	membar	#Sync				! sync error barrier
	andn	%o4, TRAMP_FLAG, %o4
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0			! return 0

	.align	16
.bc_med:
	xor	%o0, %o1, %o3			! setup alignment check
	btst	1, %o3
	bnz,pt	%ncc, .bc_sm_movebytes		! unaligned
	nop
	btst	3, %o3
	bnz,pt	%ncc, .bc_med_half		! halfword aligned
	nop
	btst	7, %o3
	bnz,pt	%ncc, .bc_med_word		! word aligned
	nop
.bc_med_long:
	btst	3, %o0				! check for
	bz,pt	%ncc, .bc_med_long1		! word alignment
	nop
.bc_med_long0:
	ldub	[%o0], %o3			! load one byte
	inc	%o0
	stb	%o3, [%o1]			! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .bc_med_long0
	dec	%o2
.bc_med_long1:					! word aligned
	btst	7, %o0				! check for long word
	bz,pt	%ncc, .bc_med_long2
	nop
	lduw	[%o0], %o3			! load word
	add	%o0, 4, %o0			! advance SRC by 4
	stw	%o3, [%o1]			! store word
	add	%o1, 4, %o1			! advance DST by 4
	sub	%o2, 4, %o2			! reduce count by 4
	!
	! Now long word aligned and have at least 32 bytes to move
	!
.bc_med_long2:
	sub	%o2, 31, %o2			! adjust count to allow cc zero test
.bc_med_lmove:
	ldx	[%o0], %o3			! read long word
	stx	%o3, [%o1]			! write long word
	subcc	%o2, 32, %o2			! reduce count by 32
	ldx	[%o0 + 8], %o3			! repeat for a total of 4 long words
	add	%o0, 32, %o0			! advance SRC by 32
	stx	%o3, [%o1 + 8]
	ldx	[%o0 - 16], %o3
	add	%o1, 32, %o1			! advance DST by 32
	stx	%o3, [%o1 - 16]
	ldx	[%o0 - 8], %o3
	bgt,pt	%ncc, .bc_med_lmove		! loop til 31 or fewer bytes left
	stx	%o3, [%o1 - 8]
	addcc	%o2, 24, %o2			! restore count to long word offset
	ble,pt	%ncc, .bc_med_lextra		! check for more long words to move
	nop
.bc_med_lword:
	ldx	[%o0], %o3			! read long word
	subcc	%o2, 8, %o2			! reduce count by 8
	stx	%o3, [%o1]			! write long word
	add	%o0, 8, %o0			! advance SRC by 8
	bgt,pt	%ncc, .bc_med_lword		! loop til 7 or fewer bytes left
	add	%o1, 8, %o1			! advance DST by 8
.bc_med_lextra:
	addcc	%o2, 7, %o2			! restore rest of count
	bz,pt	%ncc, .bc_sm_exit		! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	nop
	ba,pt	%ncc, .bc_sm_half
	nop

	.align	16
.bc_med_word:
	btst	3, %o0				! check for
	bz,pt	%ncc, .bc_med_word1		! word alignment
	nop
.bc_med_word0:
	ldub	[%o0], %o3			! load one byte
	inc	%o0
	stb	%o3, [%o1]			! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .bc_med_word0
	dec	%o2
	!
	! Now word aligned and have at least 36 bytes to move
	!
.bc_med_word1:
	sub	%o2, 15, %o2			! adjust count to allow cc zero test
.bc_med_wmove:
	lduw	[%o0], %o3			! read word
	stw	%o3, [%o1]			! write word
	subcc	%o2, 16, %o2			! reduce count by 16
	lduw	[%o0 + 4], %o3			! repeat for a total of 4 words
	add	%o0, 16, %o0			! advance SRC by 16
	stw	%o3, [%o1 + 4]
	lduw	[%o0 - 8], %o3
	add	%o1, 16, %o1			! advance DST by 16
	stw	%o3, [%o1 - 8]
	lduw	[%o0 - 4], %o3
	bgt,pt	%ncc, .bc_med_wmove		! loop til 15 or fewer bytes left
	stw	%o3, [%o1 - 4]
	addcc	%o2, 12, %o2			! restore count to word offset
	ble,pt	%ncc, .bc_med_wextra		! check for more words to move
	nop
.bc_med_word2:
	lduw	[%o0], %o3			! read word
	subcc	%o2, 4, %o2			! reduce count by 4
	stw	%o3, [%o1]			! write word
	add	%o0, 4, %o0			! advance SRC by 4
	bgt,pt	%ncc, .bc_med_word2		! loop til 3 or fewer bytes left
	add	%o1, 4, %o1			! advance DST by 4
.bc_med_wextra:
	addcc	%o2, 3, %o2			! restore rest of count
	bz,pt	%ncc, .bc_sm_exit		! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	nop
	ba,pt	%ncc, .bc_sm_half
	nop

	.align	16
.bc_med_half:
	btst	1, %o0				! check for
	bz,pt	%ncc, .bc_med_half1		! half word alignment
	nop
	ldub	[%o0], %o3			! load one byte
	inc	%o0
	stb	%o3, [%o1]			! store byte
	inc	%o1
	dec	%o2
	!
	! Now half word aligned and have at least 38 bytes to move
	!
.bc_med_half1:
	sub	%o2, 7, %o2			! adjust count to allow cc zero test
.bc_med_hmove:
	lduh	[%o0], %o3			! read half word
	sth	%o3, [%o1]			! write half word
	subcc	%o2, 8, %o2			! reduce count by 8
	lduh	[%o0 + 2], %o3			! repeat for a total of 4 halfwords
	add	%o0, 8, %o0			! advance SRC by 8
	sth	%o3, [%o1 + 2]
	lduh	[%o0 - 4], %o3
	add	%o1, 8, %o1			! advance DST by 8
	sth	%o3, [%o1 - 4]
	lduh	[%o0 - 2], %o3
	bgt,pt	%ncc, .bc_med_hmove		! loop til 7 or fewer bytes left
	sth	%o3, [%o1 - 2]
	addcc	%o2, 7, %o2			! restore count
	bz,pt	%ncc, .bc_sm_exit
	deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	nop
	ba,pt	%ncc, .bc_sm_half
	nop

	SET_SIZE(bcopy)

/*
 * The _more entry points are not intended to be used directly by
 * any caller from outside this file.  They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
 * the floating point registers.
 * This entry is particularly important as DTRACE (at least as of
 * 4/2004) does not support leaf functions.
 */

	ENTRY(bcopy_more)
.bcopy_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	ldn	[THREAD_REG + T_LOFAULT], %l6	! save t_lofault
	tst	%l6
	bz,pt	%ncc, .do_copy
	nop
	sethi	%hi(.copyerr), %o2
	or	%o2, %lo(.copyerr), %o2
	membar	#Sync				! sync error barrier
	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
	!
	! We've already captured whether t_lofault was zero on entry.
	! We need to mark ourselves as being from bcopy since both
	! kcopy and bcopy use the same code path.  If TRAMP_FLAG is set
	! and the saved lofault was zero, we won't reset lofault on
	! returning.
	!
	or	%l6, TRAMP_FLAG, %l6

/*
 * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes.
 * Also, use of FP registers has been tested to be enabled.
 */
.do_copy:
	FP_NOMIGRATE(6, 7)

	rd	%fprs, %o2			! check for unused fp
	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]	! save orig %fprs
	btst	FPRS_FEF, %o2
	bz,a,pt	%icc, .do_blockcopy
	wr	%g0, FPRS_FEF, %fprs

	BST_FPQ1Q3_TOSTACK(%o2)

.do_blockcopy:
	rd	%gsr, %o2
	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
	or	%l6, FPUSED_FLAG, %l6

#define	REALSRC	%i0
#define	DST	%i1
#define	CNT	%i2
#define	SRC	%i3
#define	TMP	%i5

	andcc	DST, VIS_BLOCKSIZE - 1, TMP
	bz,pt	%ncc, 2f
	neg	TMP
	add	TMP, VIS_BLOCKSIZE, TMP

	! TMP = bytes required to align DST on FP_BLOCK boundary
	! Using SRC as a tmp here
	cmp	TMP, 3
	bleu,pt	%ncc, 1f
	sub	CNT, TMP, CNT			! adjust main count
	sub	TMP, 3, TMP			! adjust for end of loop test
.bc_blkalign:
	ldub	[REALSRC], SRC			! move 4 bytes per loop iteration
	stb	SRC, [DST]
	subcc	TMP, 4, TMP
	ldub	[REALSRC + 1], SRC
	add	REALSRC, 4, REALSRC
	stb	SRC, [DST + 1]
	ldub	[REALSRC - 2], SRC
	add	DST, 4, DST
	stb	SRC, [DST - 2]
	ldub	[REALSRC - 1], SRC
	bgu,pt	%ncc, .bc_blkalign
	stb	SRC, [DST - 1]

	addcc	TMP, 3, TMP			! restore count adjustment
	bz,pt	%ncc, 2f			! no bytes left?
	nop
1:	ldub	[REALSRC], SRC
	inc	REALSRC
	inc	DST
	deccc	TMP
	bgu	%ncc, 1b
	stb	SRC, [DST - 1]

2:
	andn	REALSRC, 0x7, SRC
	alignaddr REALSRC, %g0, %g0

	! SRC - 8-byte aligned
	! DST - 64-byte aligned
	prefetch [SRC], #one_read
	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
	ldd	[SRC], %f0
#if CHEETAH_PREFETCH > 4
	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
#endif
	ldd	[SRC + 0x08], %f2
#if CHEETAH_PREFETCH > 5
	prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
#endif
	ldd	[SRC + 0x10], %f4
#if CHEETAH_PREFETCH > 6
	prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
#endif
	faligndata %f0, %f2, %f32
	ldd	[SRC + 0x18], %f6
#if CHEETAH_PREFETCH > 7
	prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
#endif
	faligndata %f2, %f4, %f34
	ldd	[SRC + 0x20], %f8
	faligndata %f4, %f6, %f36
	ldd	[SRC + 0x28], %f10
	faligndata %f6, %f8, %f38
	ldd	[SRC + 0x30], %f12
	faligndata %f8, %f10, %f40
	ldd	[SRC + 0x38], %f14
	faligndata %f10, %f12, %f42
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	ba,a,pt	%ncc, 1f
	nop
	.align	16
1:
	ldd	[SRC + 0x08], %f2
	faligndata %f12, %f14, %f44
	ldd	[SRC + 0x10], %f4
	faligndata %f14, %f0, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	faligndata %f0, %f2, %f32
	ldd	[SRC + 0x20], %f8
	faligndata %f2, %f4, %f34
	ldd	[SRC + 0x28], %f10
	faligndata %f4, %f6, %f36
	ldd	[SRC + 0x30], %f12
	faligndata %f6, %f8, %f38
	ldd	[SRC + 0x38], %f14
	faligndata %f8, %f10, %f40
	sub	CNT, VIS_BLOCKSIZE, CNT
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	faligndata %f10, %f12, %f42
	prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
	add	DST, VIS_BLOCKSIZE, DST
	prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	cmp	CNT, VIS_BLOCKSIZE + 8
	bgu,pt	%ncc, 1b
	add	SRC, VIS_BLOCKSIZE, SRC

	! only if REALSRC & 0x7 is 0
	cmp	CNT, VIS_BLOCKSIZE
	bne	%ncc, 3f
	andcc	REALSRC, 0x7, %g0
	bz,pt	%ncc, 2f
	nop
3:
	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,pt	%ncc, 3f
	nop
2:
	ldd	[SRC + 0x08], %f2
	fsrc1	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fsrc1	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fsrc1	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fsrc1	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fsrc1	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fsrc1	%f8, %f40
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,a,pt	%ncc, .bcb_exit
	nop

3:	tst	CNT
	bz,a,pt	%ncc, .bcb_exit
	nop

5:	ldub	[REALSRC], TMP
	inc	REALSRC
	inc	DST
	deccc	CNT
	bgu	%ncc, 5b
	stb	TMP, [DST - 1]
.bcb_exit:
	membar	#Sync

	FPRAS_INTERVAL(FPRAS_BCOPY, 0, %l5, %o2, %o3, %o4, %o5, 8)
	FPRAS_REWRITE_TYPE2Q1(0, %l5, %o2, %o3, 8, 9)
	FPRAS_CHECK(FPRAS_BCOPY, %l5, 9)	! outputs lost

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	nop

	BLD_FPQ1Q3_FROMSTACK(%o2)

	ba,pt	%ncc, 2f
	wr	%o3, 0, %fprs			! restore fprs
4:
	FZEROQ1Q3
	wr	%o3, 0, %fprs			! restore fprs
2:
	membar	#Sync				! sync error barrier
	andn	%l6, MASK_FLAGS, %l6
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)
	ret
	restore	%g0, 0, %o0

	SET_SIZE(bcopy_more)

/*
 * Block copy with possibly overlapped operands.
 */

	ENTRY(ovbcopy)
	tst	%o2				! check count
	bgu,a	%ncc, 1f			! nothing to do or bad arguments
	subcc	%o0, %o1, %o3			! difference of from and to address

	retl					! return
	nop
1:
	bneg,a	%ncc, 2f
	neg	%o3				! if < 0, make it positive
2:	cmp	%o2, %o3			! cmp size and abs(from - to)
	bleu	%ncc, bcopy			! if size <= abs(diff): use bcopy,
	.empty					! no overlap
	cmp	%o0, %o1			! compare from and to addresses
	blu	%ncc, .ov_bkwd			! if from < to, copy backwards
	nop
	!
	! Copy forwards.
	!
.ov_fwd:
	ldub	[%o0], %o3			! read from address
	inc	%o0				! inc from address
	stb	%o3, [%o1]			! write to address
	deccc	%o2				! dec count
	bgu	%ncc, .ov_fwd			! loop till done
	inc	%o1				! inc to address

	retl					! return
	nop
	!
	! Copy backwards.
	!
.ov_bkwd:
	deccc	%o2				! dec count
	ldub	[%o0 + %o2], %o3		! get byte at end of src
	bgu	%ncc, .ov_bkwd			! loop till done
	stb	%o3, [%o1 + %o2]		! delay slot, store at end of dst

	retl					! return
	nop

	SET_SIZE(ovbcopy)


/*
 * hwblkpagecopy()
 *
 * Copies exactly one page.  This routine assumes the caller (ppcopy)
 * has already disabled kernel preemption and has checked
 * use_hw_bcopy.  Preventing preemption also prevents cpu migration.
 */
	ENTRY(hwblkpagecopy)
	! get another window w/space for three aligned blocks of saved fpregs
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp

	! %i0 - source address (arg)
	! %i1 - destination address (arg)
	! %i2 - length of region (not arg)
	! %l0 - saved fprs
	! %l1 - pointer to saved fpregs

	rd	%fprs, %l0			! check for unused fp
	btst	FPRS_FEF, %l0
	bz,a,pt	%icc, 1f
	wr	%g0, FPRS_FEF, %fprs

	BST_FPQ1Q3_TOSTACK(%l1)

1:	set	PAGESIZE, CNT
	mov	REALSRC, SRC

	prefetch [SRC], #one_read
	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
	ldd	[SRC], %f0
#if CHEETAH_PREFETCH > 4
	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
#endif
	ldd	[SRC + 0x08], %f2
#if CHEETAH_PREFETCH > 5
	prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
#endif
	ldd	[SRC + 0x10], %f4
#if CHEETAH_PREFETCH > 6
	prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
#endif
	fsrc1	%f0, %f32
	ldd	[SRC + 0x18], %f6
#if CHEETAH_PREFETCH > 7
	prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
#endif
	fsrc1	%f2, %f34
	ldd	[SRC + 0x20], %f8
	fsrc1	%f4, %f36
	ldd	[SRC + 0x28], %f10
	fsrc1	%f6, %f38
	ldd	[SRC + 0x30], %f12
	fsrc1	%f8, %f40
	ldd	[SRC + 0x38], %f14
	fsrc1	%f10, %f42
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	ba,a,pt	%ncc, 2f
	nop
	.align	16
2:
	ldd	[SRC + 0x08], %f2
	fsrc1	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fsrc1	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fsrc1	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fsrc1	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fsrc1	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fsrc1	%f8, %f40
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	fsrc1	%f10, %f42
	prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	cmp	CNT, VIS_BLOCKSIZE + 8
	prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
	bgu,pt	%ncc, 2b
	add	SRC, VIS_BLOCKSIZE, SRC

	! trailing block
	ldd	[SRC + 0x08], %f2
	fsrc1	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fsrc1	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fsrc1	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fsrc1	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fsrc1	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fsrc1	%f8, %f40
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P

	membar	#Sync

	FPRAS_INTERVAL(FPRAS_PGCOPY, 1, %l5, %o2, %o3, %o4, %o5, 8)
	FPRAS_REWRITE_TYPE1(1, %l5, %f32, %o2, 9)
	FPRAS_CHECK(FPRAS_PGCOPY, %l5, 9)	! lose outputs

	btst	FPRS_FEF, %l0
	bz,pt	%icc, 2f
	nop

	BLD_FPQ1Q3_FROMSTACK(%l3)
	ba	3f
	nop

2:	FZEROQ1Q3

3:	wr	%l0, 0, %fprs			! restore fprs
	ret
	restore	%g0, 0, %o0

	SET_SIZE(hwblkpagecopy)


/*
 * Transfer data to and from user space -
 * Note that these routines can cause faults.
 * It is assumed that the kernel has nothing at
 * less than KERNELBASE in the virtual address space.
 *
 * Note that copyin(9F) and copyout(9F) are part of the
 * DDI/DKI which specifies that they return '-1' on "errors."
 *
 * Sigh.
 *
 * So there are two extremely similar routines - xcopyin() and xcopyout()
 * which return the errno that we've faithfully computed.  This
This 1678 * allows other callers (e.g. uiomove(9F)) to work correctly. 1679 * Given that these are used pretty heavily, we expand the calling 1680 * sequences inline for all flavours (rather than making wrappers). 1681 * 1682 * There are also stub routines for xcopyout_little and xcopyin_little, 1683 * which currently are intended to handle requests of <= 16 bytes from 1684 * do_unaligned. Future enhancement to make them handle 8k pages efficiently 1685 * is left as an exercise... 1686 */ 1687 1688 /* 1689 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr) 1690 * 1691 * General theory of operation: 1692 * 1693 * The only difference between copy{in,out} and 1694 * xcopy{in,out} is in the error handling routine they invoke 1695 * when a memory access error occurs. xcopyOP returns the errno 1696 * while copyOP returns -1 (see above). copy{in,out}_noerr set 1697 * a special flag (by oring the TRAMP_FLAG into the fault handler address) 1698 * if they are called with a fault handler already in place. That flag 1699 * causes the default handlers to trampoline to the previous handler 1700 * upon an error. 1701 * 1702 * None of the copyops routines grab a window until it's decided that 1703 * we need to do a HW block copy operation. This saves a window 1704 * spill/fill when we're called during socket ops. The typical IO 1705 * path won't cause spill/fill traps. 1706 * 1707 * This code uses a set of 4 limits for the maximum size that will 1708 * be copied given a particular input/output address alignment. 1709 * If the value for a particular limit is zero, the copy will be performed 1710 * by the plain copy loops rather than FPBLK. 1711 * 1712 * See the description of bcopy above for more details of the 1713 * data copying algorithm and the default limits. 1714 * 1715 */ 1716 1717 /* 1718 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little). 1719 */ 1720 1721 /* 1722 * We save the arguments in the following registers in case of a fault: 1723 * kaddr - %l1 1724 * uaddr - %l2 1725 * count - %l3 1726 */ 1727 #define SAVE_SRC %l1 1728 #define SAVE_DST %l2 1729 #define SAVE_COUNT %l3 1730 1731 #define SM_SAVE_SRC %g4 1732 #define SM_SAVE_DST %g5 1733 #define SM_SAVE_COUNT %o5 1734 #define ERRNO %l5 1735 1736 1737 #define REAL_LOFAULT %l4 1738 /* 1739 * Generic copyio fault handler. This is the first line of defense when a 1740 * fault occurs in (x)copyin/(x)copyout. In order for this to function 1741 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT. 1742 * This allows us to share common code for all the flavors of the copy 1743 * operations, including the _noerr versions. 1744 * 1745 * Note that this function will restore the original input parameters before 1746 * calling REAL_LOFAULT. So the real handler can vector to the appropriate 1747 * member of the t_copyop structure, if needed. 1748 */ 1749 ENTRY(copyio_fault) 1750 membar #Sync 1751 mov %g1,ERRNO ! save errno in ERRNO 1752 btst FPUSED_FLAG, %l6 1753 bz %ncc, 1f 1754 nop 1755 1756 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 1757 wr %o2, 0, %gsr ! restore gsr 1758 1759 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 1760 btst FPRS_FEF, %o3 1761 bz,pt %icc, 4f 1762 nop 1763 1764 BLD_FPQ2Q4_FROMSTACK(%o2) 1765 1766 ba,pt %ncc, 1f 1767 wr %o3, 0, %fprs ! restore fprs 1768 1769 4: 1770 FZEROQ2Q4 1771 wr %o3, 0, %fprs ! restore fprs 1772 1773 1: 1774 andn %l6, FPUSED_FLAG, %l6 1775 membar #Sync 1776 stn %l6, [THREAD_REG + T_LOFAULT] ! 
restore old t_lofault 1777 FP_ALLOWMIGRATE(5, 6) 1778 1779 mov SAVE_SRC, %i0 1780 mov SAVE_DST, %i1 1781 jmp REAL_LOFAULT 1782 mov SAVE_COUNT, %i2 1783 1784 SET_SIZE(copyio_fault) 1785 1786 1787 ENTRY(copyout) 1788 1789 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 1790 bleu,pt %ncc, .copyout_small ! go to larger cases 1791 xor %o0, %o1, %o3 ! are src, dst alignable? 1792 btst 7, %o3 ! 1793 bz,pt %ncc, .copyout_8 ! check for longword alignment 1794 nop 1795 btst 1, %o3 ! 1796 bz,pt %ncc, .copyout_2 ! check for half-word 1797 nop 1798 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 1799 ld [%o3 + %lo(hw_copy_limit_1)], %o3 1800 tst %o3 1801 bz,pn %icc, .copyout_small ! if zero, disable HW copy 1802 cmp %o2, %o3 ! if length <= limit 1803 bleu,pt %ncc, .copyout_small ! go to small copy 1804 nop 1805 ba,pt %ncc, .copyout_more ! otherwise go to large copy 1806 nop 1807 .copyout_2: 1808 btst 3, %o3 ! 1809 bz,pt %ncc, .copyout_4 ! check for word alignment 1810 nop 1811 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 1812 ld [%o3 + %lo(hw_copy_limit_2)], %o3 1813 tst %o3 1814 bz,pn %icc, .copyout_small ! if zero, disable HW copy 1815 cmp %o2, %o3 ! if length <= limit 1816 bleu,pt %ncc, .copyout_small ! go to small copy 1817 nop 1818 ba,pt %ncc, .copyout_more ! otherwise go to large copy 1819 nop 1820 .copyout_4: 1821 ! already checked longword, must be word aligned 1822 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 1823 ld [%o3 + %lo(hw_copy_limit_4)], %o3 1824 tst %o3 1825 bz,pn %icc, .copyout_small ! if zero, disable HW copy 1826 cmp %o2, %o3 ! if length <= limit 1827 bleu,pt %ncc, .copyout_small ! go to small copy 1828 nop 1829 ba,pt %ncc, .copyout_more ! otherwise go to large copy 1830 nop 1831 .copyout_8: 1832 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 1833 ld [%o3 + %lo(hw_copy_limit_8)], %o3 1834 tst %o3 1835 bz,pn %icc, .copyout_small ! if zero, disable HW copy 1836 cmp %o2, %o3 ! if length <= limit 1837 bleu,pt %ncc, .copyout_small ! go to small copy 1838 nop 1839 ba,pt %ncc, .copyout_more ! otherwise go to large copy 1840 nop 1841 1842 .align 16 1843 nop ! instruction alignment 1844 ! see discussion at start of file 1845 .copyout_small: 1846 sethi %hi(.sm_copyout_err), %o5 ! .sm_copyout_err is lofault 1847 or %o5, %lo(.sm_copyout_err), %o5 1848 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler 1849 membar #Sync ! sync error barrier 1850 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault 1851 .sm_do_copyout: 1852 mov %o0, SM_SAVE_SRC 1853 mov %o1, SM_SAVE_DST 1854 cmp %o2, SHORTCOPY ! check for really short case 1855 bleu,pt %ncc, .co_sm_left ! 1856 mov %o2, SM_SAVE_COUNT 1857 cmp %o2, CHKSIZE ! check for medium length cases 1858 bgu,pn %ncc, .co_med ! 1859 or %o0, %o1, %o3 ! prepare alignment check 1860 andcc %o3, 0x3, %g0 ! test for alignment 1861 bz,pt %ncc, .co_sm_word ! branch to word aligned case 1862 .co_sm_movebytes: 1863 sub %o2, 3, %o2 ! adjust count to allow cc zero test 1864 .co_sm_notalign4: 1865 ldub [%o0], %o3 ! read byte 1866 subcc %o2, 4, %o2 ! reduce count by 4 1867 stba %o3, [%o1]ASI_USER ! write byte 1868 inc %o1 ! advance DST by 1 1869 ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes 1870 add %o0, 4, %o0 ! advance SRC by 4 1871 stba %o3, [%o1]ASI_USER 1872 inc %o1 ! advance DST by 1 1873 ldub [%o0 - 2], %o3 1874 stba %o3, [%o1]ASI_USER 1875 inc %o1 ! advance DST by 1 1876 ldub [%o0 - 1], %o3 1877 stba %o3, [%o1]ASI_USER 1878 bgt,pt %ncc, .co_sm_notalign4 ! loop til 3 or fewer bytes remain 1879 inc %o1 ! 
advance DST by 1 1880 add %o2, 3, %o2 ! restore count 1881 .co_sm_left: 1882 tst %o2 1883 bz,pt %ncc, .co_sm_exit ! check for zero length 1884 nop 1885 ldub [%o0], %o3 ! load one byte 1886 deccc %o2 ! reduce count for cc test 1887 bz,pt %ncc, .co_sm_exit 1888 stba %o3,[%o1]ASI_USER ! store one byte 1889 ldub [%o0 + 1], %o3 ! load second byte 1890 deccc %o2 1891 inc %o1 1892 bz,pt %ncc, .co_sm_exit 1893 stba %o3,[%o1]ASI_USER ! store second byte 1894 ldub [%o0 + 2], %o3 ! load third byte 1895 inc %o1 1896 stba %o3,[%o1]ASI_USER ! store third byte 1897 membar #Sync ! sync error barrier 1898 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1899 retl 1900 mov %g0, %o0 ! return 0 1901 .align 16 1902 .co_sm_words: 1903 lduw [%o0], %o3 ! read word 1904 .co_sm_wordx: 1905 subcc %o2, 8, %o2 ! update count 1906 stwa %o3, [%o1]ASI_USER ! write word 1907 add %o0, 8, %o0 ! update SRC 1908 lduw [%o0 - 4], %o3 ! read word 1909 add %o1, 4, %o1 ! update DST 1910 stwa %o3, [%o1]ASI_USER ! write word 1911 bgt,pt %ncc, .co_sm_words ! loop til done 1912 add %o1, 4, %o1 ! update DST 1913 addcc %o2, 7, %o2 ! restore count 1914 bz,pt %ncc, .co_sm_exit 1915 nop 1916 deccc %o2 1917 bz,pt %ncc, .co_sm_byte 1918 .co_sm_half: 1919 subcc %o2, 2, %o2 ! reduce count by 2 1920 lduh [%o0], %o3 ! read half word 1921 add %o0, 2, %o0 ! advance SRC by 2 1922 stha %o3, [%o1]ASI_USER ! write half word 1923 bgt,pt %ncc, .co_sm_half ! loop til done 1924 add %o1, 2, %o1 ! advance DST by 2 1925 addcc %o2, 1, %o2 ! restore count 1926 bz,pt %ncc, .co_sm_exit 1927 nop 1928 .co_sm_byte: 1929 ldub [%o0], %o3 1930 stba %o3, [%o1]ASI_USER 1931 membar #Sync ! sync error barrier 1932 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1933 retl 1934 mov %g0, %o0 ! return 0 1935 .align 16 1936 .co_sm_word: 1937 subcc %o2, 4, %o2 ! update count 1938 bgt,pt %ncc, .co_sm_wordx 1939 lduw [%o0], %o3 ! read word 1940 addcc %o2, 3, %o2 ! restore count 1941 bz,pt %ncc, .co_sm_exit 1942 stwa %o3, [%o1]ASI_USER ! write word 1943 deccc %o2 ! reduce count for cc test 1944 ldub [%o0 + 4], %o3 ! load one byte 1945 add %o1, 4, %o1 1946 bz,pt %ncc, .co_sm_exit 1947 stba %o3, [%o1]ASI_USER ! store one byte 1948 ldub [%o0 + 5], %o3 ! load second byte 1949 deccc %o2 1950 inc %o1 1951 bz,pt %ncc, .co_sm_exit 1952 stba %o3, [%o1]ASI_USER ! store second byte 1953 ldub [%o0 + 6], %o3 ! load third byte 1954 inc %o1 1955 stba %o3, [%o1]ASI_USER ! store third byte 1956 .co_sm_exit: 1957 membar #Sync ! sync error barrier 1958 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1959 retl 1960 mov %g0, %o0 ! return 0 1961 1962 .align 16 1963 .co_med: 1964 xor %o0, %o1, %o3 ! setup alignment check 1965 btst 1, %o3 1966 bnz,pt %ncc, .co_sm_movebytes ! unaligned 1967 nop 1968 btst 3, %o3 1969 bnz,pt %ncc, .co_med_half ! halfword aligned 1970 nop 1971 btst 7, %o3 1972 bnz,pt %ncc, .co_med_word ! word aligned 1973 nop 1974 .co_med_long: 1975 btst 3, %o0 ! check for 1976 bz,pt %ncc, .co_med_long1 ! word alignment 1977 nop 1978 .co_med_long0: 1979 ldub [%o0], %o3 ! load one byte 1980 inc %o0 1981 stba %o3,[%o1]ASI_USER ! store byte 1982 inc %o1 1983 btst 3, %o0 1984 bnz,pt %ncc, .co_med_long0 1985 dec %o2 1986 .co_med_long1: ! word aligned 1987 btst 7, %o0 ! check for long word 1988 bz,pt %ncc, .co_med_long2 1989 nop 1990 lduw [%o0], %o3 ! load word 1991 add %o0, 4, %o0 ! advance SRC by 4 1992 stwa %o3, [%o1]ASI_USER ! store word 1993 add %o1, 4, %o1 ! advance DST by 4 1994 sub %o2, 4, %o2 ! reduce count by 4 1995 ! 1996 ! 
! Now long word aligned and have at least 32 bytes to move
1997 	!
1998 .co_med_long2:
1999 	sub	%o2, 31, %o2		! adjust count to allow cc zero test
2000 	sub	%o1, 8, %o1		! adjust pointer to allow store in
2001 					! branch delay slot instead of add
2002 .co_med_lmove:
2003 	add	%o1, 8, %o1		! advance DST by 8
2004 	ldx	[%o0], %o3		! read long word
2005 	subcc	%o2, 32, %o2		! reduce count by 32
2006 	stxa	%o3, [%o1]ASI_USER	! write long word
2007 	add	%o1, 8, %o1		! advance DST by 8
2008 	ldx	[%o0 + 8], %o3		! repeat for a total of 4 long words
2009 	add	%o0, 32, %o0		! advance SRC by 32
2010 	stxa	%o3, [%o1]ASI_USER
2011 	ldx	[%o0 - 16], %o3
2012 	add	%o1, 8, %o1		! advance DST by 8
2013 	stxa	%o3, [%o1]ASI_USER
2014 	ldx	[%o0 - 8], %o3
2015 	add	%o1, 8, %o1		! advance DST by 8
2016 	bgt,pt	%ncc, .co_med_lmove	! loop til 31 or fewer bytes left
2017 	stxa	%o3, [%o1]ASI_USER
2018 	add	%o1, 8, %o1		! advance DST by 8
2019 	addcc	%o2, 24, %o2		! restore count to long word offset
2020 	ble,pt	%ncc, .co_med_lextra	! check for more long words to move
2021 	nop
2022 .co_med_lword:
2023 	ldx	[%o0], %o3		! read long word
2024 	subcc	%o2, 8, %o2		! reduce count by 8
2025 	stxa	%o3, [%o1]ASI_USER	! write long word
2026 	add	%o0, 8, %o0		! advance SRC by 8
2027 	bgt,pt	%ncc, .co_med_lword	! loop til 7 or fewer bytes left
2028 	add	%o1, 8, %o1		! advance DST by 8
2029 .co_med_lextra:
2030 	addcc	%o2, 7, %o2		! restore rest of count
2031 	bz,pt	%ncc, .co_sm_exit	! if zero, then done
2032 	deccc	%o2
2033 	bz,pt	%ncc, .co_sm_byte
2034 	nop
2035 	ba,pt	%ncc, .co_sm_half
2036 	nop
2037 
2038 	.align 16
2039 	nop				! instruction alignment
2040 					! see discussion at start of file
2041 .co_med_word:
2042 	btst	3, %o0			! check for
2043 	bz,pt	%ncc, .co_med_word1	! word alignment
2044 	nop
2045 .co_med_word0:
2046 	ldub	[%o0], %o3		! load one byte
2047 	inc	%o0
2048 	stba	%o3,[%o1]ASI_USER	! store byte
2049 	inc	%o1
2050 	btst	3, %o0
2051 	bnz,pt	%ncc, .co_med_word0
2052 	dec	%o2
2053 	!
2054 	! Now word aligned and have at least 36 bytes to move
2055 	!
2056 .co_med_word1:
2057 	sub	%o2, 15, %o2		! adjust count to allow cc zero test
2058 .co_med_wmove:
2059 	lduw	[%o0], %o3		! read word
2060 	subcc	%o2, 16, %o2		! reduce count by 16
2061 	stwa	%o3, [%o1]ASI_USER	! write word
2062 	add	%o1, 4, %o1		! advance DST by 4
2063 	lduw	[%o0 + 4], %o3		! repeat for a total of 4 words
2064 	add	%o0, 16, %o0		! advance SRC by 16
2065 	stwa	%o3, [%o1]ASI_USER
2066 	add	%o1, 4, %o1		! advance DST by 4
2067 	lduw	[%o0 - 8], %o3
2068 	stwa	%o3, [%o1]ASI_USER
2069 	add	%o1, 4, %o1		! advance DST by 4
2070 	lduw	[%o0 - 4], %o3
2071 	stwa	%o3, [%o1]ASI_USER
2072 	bgt,pt	%ncc, .co_med_wmove	! loop til 15 or fewer bytes left
2073 	add	%o1, 4, %o1		! advance DST by 4
2074 	addcc	%o2, 12, %o2		! restore count to word offset
2075 	ble,pt	%ncc, .co_med_wextra	! check for more words to move
2076 	nop
2077 .co_med_word2:
2078 	lduw	[%o0], %o3		! read word
2079 	subcc	%o2, 4, %o2		! reduce count by 4
2080 	stwa	%o3, [%o1]ASI_USER	! write word
2081 	add	%o0, 4, %o0		! advance SRC by 4
2082 	bgt,pt	%ncc, .co_med_word2	! loop til 3 or fewer bytes left
2083 	add	%o1, 4, %o1		! advance DST by 4
2084 .co_med_wextra:
2085 	addcc	%o2, 3, %o2		! restore rest of count
2086 	bz,pt	%ncc, .co_sm_exit	! if zero, then done
2087 	deccc	%o2
2088 	bz,pt	%ncc, .co_sm_byte
2089 	nop
2090 	ba,pt	%ncc, .co_sm_half
2091 	nop
2092 
2093 	.align 16
2094 	nop				! instruction alignment
2095 	nop				! see discussion at start of file
2096 	nop
2097 .co_med_half:
2098 	btst	1, %o0			! check for
2099 	bz,pt	%ncc, .co_med_half1	! half word alignment
2100 	nop
2101 	ldub	[%o0], %o3		! load one byte
2102 	inc	%o0
2103 	stba	%o3,[%o1]ASI_USER	! store byte
2104 	inc	%o1
2105 	dec	%o2
2106 	!
2107 	! Now half word aligned and have at least 38 bytes to move
2108 	!
2109 .co_med_half1:
2110 	sub	%o2, 7, %o2		! adjust count to allow cc zero test
2111 .co_med_hmove:
2112 	lduh	[%o0], %o3		! read half word
2113 	subcc	%o2, 8, %o2		! reduce count by 8
2114 	stha	%o3, [%o1]ASI_USER	! write half word
2115 	add	%o1, 2, %o1		! advance DST by 2
2116 	lduh	[%o0 + 2], %o3		! repeat for a total of 4 halfwords
2117 	add	%o0, 8, %o0		! advance SRC by 8
2118 	stha	%o3, [%o1]ASI_USER
2119 	add	%o1, 2, %o1		! advance DST by 2
2120 	lduh	[%o0 - 4], %o3
2121 	stha	%o3, [%o1]ASI_USER
2122 	add	%o1, 2, %o1		! advance DST by 2
2123 	lduh	[%o0 - 2], %o3
2124 	stha	%o3, [%o1]ASI_USER
2125 	bgt,pt	%ncc, .co_med_hmove	! loop til 7 or fewer bytes left
2126 	add	%o1, 2, %o1		! advance DST by 2
2127 	addcc	%o2, 7, %o2		! restore count
2128 	bz,pt	%ncc, .co_sm_exit
2129 	deccc	%o2
2130 	bz,pt	%ncc, .co_sm_byte
2131 	nop
2132 	ba,pt	%ncc, .co_sm_half
2133 	nop
2134 
2135 /*
2136  * We got here because of a fault during short copyout.
2137  * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
2138  */
2139 .sm_copyout_err:
2140 	membar	#Sync
2141 	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2142 	mov	SM_SAVE_SRC, %o0
2143 	mov	SM_SAVE_DST, %o1
2144 	mov	SM_SAVE_COUNT, %o2
2145 	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
2146 	tst	%o3
2147 	bz,pt	%ncc, 3f			! if not, return error
2148 	nop
2149 	ldn	[%o3 + CP_COPYOUT], %o5		! if handler, invoke it with
2150 	jmp	%o5				! original arguments
2151 	nop
2152 3:
2153 	retl
2154 	or	%g0, -1, %o0		! return error value
2155 
2156 	SET_SIZE(copyout)
2157 
2158 /*
2159  * The _more entry points are not intended to be used directly by
2160  * any caller from outside this file. They are provided to allow
2161  * profiling and dtrace of the portions of the copy code that use
2162  * the floating point registers.
2163  * This entry is particularly important as DTRACE (at least as of
2164  * 4/2004) does not support leaf functions.
2165  */
2166 
2167 	ENTRY(copyout_more)
2168 .copyout_more:
2169 	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2170 	set	.copyout_err, REAL_LOFAULT
2171 
2172 /*
2173  * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
2174  */
2175 .do_copyout:
2176 	set	copyio_fault, %l7	! copyio_fault is lofault val
2177 
2178 	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
2179 	membar	#Sync				! sync error barrier
2180 	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
2181 
2182 	mov	%i0, SAVE_SRC
2183 	mov	%i1, SAVE_DST
2184 	mov	%i2, SAVE_COUNT
2185 
2186 	FP_NOMIGRATE(6, 7)
2187 
2188 	rd	%fprs, %o2		! check for unused fp
2189 	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]	! save orig %fprs
2190 	btst	FPRS_FEF, %o2
2191 	bz,a,pt	%icc, .do_blockcopyout
2192 	wr	%g0, FPRS_FEF, %fprs
2193 
2194 	BST_FPQ2Q4_TOSTACK(%o2)
2195 
2196 .do_blockcopyout:
2197 	rd	%gsr, %o2
2198 	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
2199 	or	%l6, FPUSED_FLAG, %l6
2200 
2201 	andcc	DST, VIS_BLOCKSIZE - 1, TMP
2202 	mov	ASI_USER, %asi
2203 	bz,pt	%ncc, 2f
2204 	neg	TMP
2205 	add	TMP, VIS_BLOCKSIZE, TMP
2206 
2207 	! TMP = bytes required to align DST on FP_BLOCK boundary
2208 	! Using SRC as a tmp here
2209 	cmp	TMP, 3
2210 	bleu,pt	%ncc, 1f
2211 	sub	CNT,TMP,CNT		! adjust main count
2212 	sub	TMP, 3, TMP		! adjust for end of loop test
2213 .co_blkalign:
2214 	ldub	[REALSRC], SRC		! 
move 4 bytes per loop iteration 2215 stba SRC, [DST]%asi 2216 subcc TMP, 4, TMP 2217 ldub [REALSRC + 1], SRC 2218 add REALSRC, 4, REALSRC 2219 stba SRC, [DST + 1]%asi 2220 ldub [REALSRC - 2], SRC 2221 add DST, 4, DST 2222 stba SRC, [DST - 2]%asi 2223 ldub [REALSRC - 1], SRC 2224 bgu,pt %ncc, .co_blkalign 2225 stba SRC, [DST - 1]%asi 2226 2227 addcc TMP, 3, TMP ! restore count adjustment 2228 bz,pt %ncc, 2f ! no bytes left? 2229 nop 2230 1: ldub [REALSRC], SRC 2231 inc REALSRC 2232 inc DST 2233 deccc TMP 2234 bgu %ncc, 1b 2235 stba SRC, [DST - 1]%asi 2236 2237 2: 2238 andn REALSRC, 0x7, SRC 2239 alignaddr REALSRC, %g0, %g0 2240 2241 ! SRC - 8-byte aligned 2242 ! DST - 64-byte aligned 2243 prefetch [SRC], #one_read 2244 prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read 2245 prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read 2246 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read 2247 ldd [SRC], %f16 2248 #if CHEETAH_PREFETCH > 4 2249 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read 2250 #endif 2251 ldd [SRC + 0x08], %f18 2252 #if CHEETAH_PREFETCH > 5 2253 prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read 2254 #endif 2255 ldd [SRC + 0x10], %f20 2256 #if CHEETAH_PREFETCH > 6 2257 prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read 2258 #endif 2259 faligndata %f16, %f18, %f48 2260 ldd [SRC + 0x18], %f22 2261 #if CHEETAH_PREFETCH > 7 2262 prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read 2263 #endif 2264 faligndata %f18, %f20, %f50 2265 ldd [SRC + 0x20], %f24 2266 faligndata %f20, %f22, %f52 2267 ldd [SRC + 0x28], %f26 2268 faligndata %f22, %f24, %f54 2269 ldd [SRC + 0x30], %f28 2270 faligndata %f24, %f26, %f56 2271 ldd [SRC + 0x38], %f30 2272 faligndata %f26, %f28, %f58 2273 ldd [SRC + VIS_BLOCKSIZE], %f16 2274 sub CNT, VIS_BLOCKSIZE, CNT 2275 add SRC, VIS_BLOCKSIZE, SRC 2276 add REALSRC, VIS_BLOCKSIZE, REALSRC 2277 ba,a,pt %ncc, 1f 2278 nop 2279 .align 16 2280 1: 2281 ldd [SRC + 0x08], %f18 2282 faligndata %f28, %f30, %f60 2283 ldd [SRC + 0x10], %f20 2284 faligndata %f30, %f16, %f62 2285 stda %f48, [DST]ASI_BLK_AIUS 2286 ldd [SRC + 0x18], %f22 2287 faligndata %f16, %f18, %f48 2288 ldd [SRC + 0x20], %f24 2289 faligndata %f18, %f20, %f50 2290 ldd [SRC + 0x28], %f26 2291 faligndata %f20, %f22, %f52 2292 ldd [SRC + 0x30], %f28 2293 faligndata %f22, %f24, %f54 2294 ldd [SRC + 0x38], %f30 2295 faligndata %f24, %f26, %f56 2296 sub CNT, VIS_BLOCKSIZE, CNT 2297 ldd [SRC + VIS_BLOCKSIZE], %f16 2298 faligndata %f26, %f28, %f58 2299 prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read 2300 add DST, VIS_BLOCKSIZE, DST 2301 prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read 2302 add REALSRC, VIS_BLOCKSIZE, REALSRC 2303 cmp CNT, VIS_BLOCKSIZE + 8 2304 bgu,pt %ncc, 1b 2305 add SRC, VIS_BLOCKSIZE, SRC 2306 2307 ! 
only if REALSRC & 0x7 is 0 2308 cmp CNT, VIS_BLOCKSIZE 2309 bne %ncc, 3f 2310 andcc REALSRC, 0x7, %g0 2311 bz,pt %ncc, 2f 2312 nop 2313 3: 2314 faligndata %f28, %f30, %f60 2315 faligndata %f30, %f16, %f62 2316 stda %f48, [DST]ASI_BLK_AIUS 2317 add DST, VIS_BLOCKSIZE, DST 2318 ba,pt %ncc, 3f 2319 nop 2320 2: 2321 ldd [SRC + 0x08], %f18 2322 fsrc1 %f28, %f60 2323 ldd [SRC + 0x10], %f20 2324 fsrc1 %f30, %f62 2325 stda %f48, [DST]ASI_BLK_AIUS 2326 ldd [SRC + 0x18], %f22 2327 fsrc1 %f16, %f48 2328 ldd [SRC + 0x20], %f24 2329 fsrc1 %f18, %f50 2330 ldd [SRC + 0x28], %f26 2331 fsrc1 %f20, %f52 2332 ldd [SRC + 0x30], %f28 2333 fsrc1 %f22, %f54 2334 ldd [SRC + 0x38], %f30 2335 fsrc1 %f24, %f56 2336 sub CNT, VIS_BLOCKSIZE, CNT 2337 add DST, VIS_BLOCKSIZE, DST 2338 add SRC, VIS_BLOCKSIZE, SRC 2339 add REALSRC, VIS_BLOCKSIZE, REALSRC 2340 fsrc1 %f26, %f58 2341 fsrc1 %f28, %f60 2342 fsrc1 %f30, %f62 2343 stda %f48, [DST]ASI_BLK_AIUS 2344 add DST, VIS_BLOCKSIZE, DST 2345 ba,a,pt %ncc, 4f 2346 nop 2347 2348 3: tst CNT 2349 bz,a %ncc, 4f 2350 nop 2351 2352 5: ldub [REALSRC], TMP 2353 inc REALSRC 2354 inc DST 2355 deccc CNT 2356 bgu %ncc, 5b 2357 stba TMP, [DST - 1]%asi 2358 4: 2359 2360 .copyout_exit: 2361 membar #Sync 2362 2363 FPRAS_INTERVAL(FPRAS_COPYOUT, 0, %l5, %o2, %o3, %o4, %o5, 8) 2364 FPRAS_REWRITE_TYPE2Q2(0, %l5, %o2, %o3, 8, 9) 2365 FPRAS_CHECK(FPRAS_COPYOUT, %l5, 9) ! lose outputs 2366 2367 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 2368 wr %o2, 0, %gsr ! restore gsr 2369 2370 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 2371 btst FPRS_FEF, %o3 2372 bz,pt %icc, 4f 2373 nop 2374 2375 BLD_FPQ2Q4_FROMSTACK(%o2) 2376 2377 ba,pt %ncc, 1f 2378 wr %o3, 0, %fprs ! restore fprs 2379 2380 4: 2381 FZEROQ2Q4 2382 wr %o3, 0, %fprs ! restore fprs 2383 2384 1: 2385 membar #Sync 2386 andn %l6, FPUSED_FLAG, %l6 2387 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2388 FP_ALLOWMIGRATE(5, 6) 2389 ret 2390 restore %g0, 0, %o0 2391 2392 /* 2393 * We got here because of a fault during copyout. 2394 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). 2395 */ 2396 .copyout_err: 2397 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 2398 tst %o4 2399 bz,pt %ncc, 2f ! if not, return error 2400 nop 2401 ldn [%o4 + CP_COPYOUT], %g2 ! if handler, invoke it with 2402 jmp %g2 ! original arguments 2403 restore %g0, 0, %g0 ! dispose of copy window 2404 2: 2405 ret 2406 restore %g0, -1, %o0 ! return error value 2407 2408 2409 SET_SIZE(copyout_more) 2410 2411 2412 ENTRY(xcopyout) 2413 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 2414 bleu,pt %ncc, .xcopyout_small ! go to larger cases 2415 xor %o0, %o1, %o3 ! are src, dst alignable? 2416 btst 7, %o3 ! 2417 bz,pt %ncc, .xcopyout_8 ! 2418 nop 2419 btst 1, %o3 ! 2420 bz,pt %ncc, .xcopyout_2 ! check for half-word 2421 nop 2422 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 2423 ld [%o3 + %lo(hw_copy_limit_1)], %o3 2424 tst %o3 2425 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy 2426 cmp %o2, %o3 ! if length <= limit 2427 bleu,pt %ncc, .xcopyout_small ! go to small copy 2428 nop 2429 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy 2430 nop 2431 .xcopyout_2: 2432 btst 3, %o3 ! 2433 bz,pt %ncc, .xcopyout_4 ! check for word alignment 2434 nop 2435 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 2436 ld [%o3 + %lo(hw_copy_limit_2)], %o3 2437 tst %o3 2438 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy 2439 cmp %o2, %o3 ! if length <= limit 2440 bleu,pt %ncc, .xcopyout_small ! 
go to small copy 2441 nop 2442 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy 2443 nop 2444 .xcopyout_4: 2445 ! already checked longword, must be word aligned 2446 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 2447 ld [%o3 + %lo(hw_copy_limit_4)], %o3 2448 tst %o3 2449 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy 2450 cmp %o2, %o3 ! if length <= limit 2451 bleu,pt %ncc, .xcopyout_small ! go to small copy 2452 nop 2453 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy 2454 nop 2455 .xcopyout_8: 2456 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 2457 ld [%o3 + %lo(hw_copy_limit_8)], %o3 2458 tst %o3 2459 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy 2460 cmp %o2, %o3 ! if length <= limit 2461 bleu,pt %ncc, .xcopyout_small ! go to small copy 2462 nop 2463 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy 2464 nop 2465 2466 .xcopyout_small: 2467 sethi %hi(.sm_xcopyout_err), %o5 ! .sm_xcopyout_err is lofault 2468 or %o5, %lo(.sm_xcopyout_err), %o5 2469 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler 2470 membar #Sync ! sync error barrier 2471 ba,pt %ncc, .sm_do_copyout ! common code 2472 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault 2473 2474 .xcopyout_more: 2475 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 2476 sethi %hi(.xcopyout_err), REAL_LOFAULT 2477 ba,pt %ncc, .do_copyout ! common code 2478 or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT 2479 2480 /* 2481 * We got here because of fault during xcopyout 2482 * Errno value is in ERRNO 2483 */ 2484 .xcopyout_err: 2485 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 2486 tst %o4 2487 bz,pt %ncc, 2f ! if not, return error 2488 nop 2489 ldn [%o4 + CP_XCOPYOUT], %g2 ! if handler, invoke it with 2490 jmp %g2 ! original arguments 2491 restore %g0, 0, %g0 ! dispose of copy window 2492 2: 2493 ret 2494 restore ERRNO, 0, %o0 ! return errno value 2495 2496 .sm_xcopyout_err: 2497 2498 membar #Sync 2499 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2500 mov SM_SAVE_SRC, %o0 2501 mov SM_SAVE_DST, %o1 2502 mov SM_SAVE_COUNT, %o2 2503 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler 2504 tst %o3 2505 bz,pt %ncc, 3f ! if not, return error 2506 nop 2507 ldn [%o3 + CP_XCOPYOUT], %o5 ! if handler, invoke it with 2508 jmp %o5 ! original arguments 2509 nop 2510 3: 2511 retl 2512 or %g1, 0, %o0 ! return errno value 2513 2514 SET_SIZE(xcopyout) 2515 2516 ENTRY(xcopyout_little) 2517 sethi %hi(.xcopyio_err), %o5 2518 or %o5, %lo(.xcopyio_err), %o5 2519 ldn [THREAD_REG + T_LOFAULT], %o4 2520 membar #Sync ! sync error barrier 2521 stn %o5, [THREAD_REG + T_LOFAULT] 2522 mov %o4, %o5 2523 2524 subcc %g0, %o2, %o3 2525 add %o0, %o2, %o0 2526 bz,pn %ncc, 2f ! check for zero bytes 2527 sub %o2, 1, %o4 2528 add %o0, %o4, %o0 ! start w/last byte 2529 add %o1, %o2, %o1 2530 ldub [%o0 + %o3], %o4 2531 2532 1: stba %o4, [%o1 + %o3]ASI_AIUSL 2533 inccc %o3 2534 sub %o0, 2, %o0 ! get next byte 2535 bcc,a,pt %ncc, 1b 2536 ldub [%o0 + %o3], %o4 2537 2538 2: 2539 membar #Sync ! sync error barrier 2540 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2541 retl 2542 mov %g0, %o0 ! return (0) 2543 2544 SET_SIZE(xcopyout_little) 2545 2546 /* 2547 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little) 2548 */ 2549 2550 ENTRY(copyin) 2551 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 2552 bleu,pt %ncc, .copyin_small ! go to larger cases 2553 xor %o0, %o1, %o3 ! are src, dst alignable? 2554 btst 7, %o3 ! 2555 bz,pt %ncc, .copyin_8 ! 
check for longword alignment 2556 nop 2557 btst 1, %o3 ! 2558 bz,pt %ncc, .copyin_2 ! check for half-word 2559 nop 2560 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 2561 ld [%o3 + %lo(hw_copy_limit_1)], %o3 2562 tst %o3 2563 bz,pn %icc, .copyin_small ! if zero, disable HW copy 2564 cmp %o2, %o3 ! if length <= limit 2565 bleu,pt %ncc, .copyin_small ! go to small copy 2566 nop 2567 ba,pt %ncc, .copyin_more ! otherwise go to large copy 2568 nop 2569 .copyin_2: 2570 btst 3, %o3 ! 2571 bz,pt %ncc, .copyin_4 ! check for word alignment 2572 nop 2573 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 2574 ld [%o3 + %lo(hw_copy_limit_2)], %o3 2575 tst %o3 2576 bz,pn %icc, .copyin_small ! if zero, disable HW copy 2577 cmp %o2, %o3 ! if length <= limit 2578 bleu,pt %ncc, .copyin_small ! go to small copy 2579 nop 2580 ba,pt %ncc, .copyin_more ! otherwise go to large copy 2581 nop 2582 .copyin_4: 2583 ! already checked longword, must be word aligned 2584 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 2585 ld [%o3 + %lo(hw_copy_limit_4)], %o3 2586 tst %o3 2587 bz,pn %icc, .copyin_small ! if zero, disable HW copy 2588 cmp %o2, %o3 ! if length <= limit 2589 bleu,pt %ncc, .copyin_small ! go to small copy 2590 nop 2591 ba,pt %ncc, .copyin_more ! otherwise go to large copy 2592 nop 2593 .copyin_8: 2594 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 2595 ld [%o3 + %lo(hw_copy_limit_8)], %o3 2596 tst %o3 2597 bz,pn %icc, .copyin_small ! if zero, disable HW copy 2598 cmp %o2, %o3 ! if length <= limit 2599 bleu,pt %ncc, .copyin_small ! go to small copy 2600 nop 2601 ba,pt %ncc, .copyin_more ! otherwise go to large copy 2602 nop 2603 2604 .align 16 2605 nop ! instruction alignment 2606 ! see discussion at start of file 2607 .copyin_small: 2608 sethi %hi(.sm_copyin_err), %o5 ! .sm_copyin_err is lofault 2609 or %o5, %lo(.sm_copyin_err), %o5 2610 ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofault, no tramp 2611 membar #Sync ! sync error barrier 2612 stn %o5, [THREAD_REG + T_LOFAULT] 2613 .sm_do_copyin: 2614 mov %o0, SM_SAVE_SRC 2615 mov %o1, SM_SAVE_DST 2616 cmp %o2, SHORTCOPY ! check for really short case 2617 bleu,pt %ncc, .ci_sm_left ! 2618 mov %o2, SM_SAVE_COUNT 2619 cmp %o2, CHKSIZE ! check for medium length cases 2620 bgu,pn %ncc, .ci_med ! 2621 or %o0, %o1, %o3 ! prepare alignment check 2622 andcc %o3, 0x3, %g0 ! test for alignment 2623 bz,pt %ncc, .ci_sm_word ! branch to word aligned case 2624 .ci_sm_movebytes: 2625 sub %o2, 3, %o2 ! adjust count to allow cc zero test 2626 .ci_sm_notalign4: 2627 lduba [%o0]ASI_USER, %o3 ! read byte 2628 subcc %o2, 4, %o2 ! reduce count by 4 2629 stb %o3, [%o1] ! write byte 2630 add %o0, 1, %o0 ! advance SRC by 1 2631 lduba [%o0]ASI_USER, %o3 ! repeat for a total of 4 bytes 2632 add %o0, 1, %o0 ! advance SRC by 1 2633 stb %o3, [%o1 + 1] 2634 add %o1, 4, %o1 ! advance DST by 4 2635 lduba [%o0]ASI_USER, %o3 2636 add %o0, 1, %o0 ! advance SRC by 1 2637 stb %o3, [%o1 - 2] 2638 lduba [%o0]ASI_USER, %o3 2639 add %o0, 1, %o0 ! advance SRC by 1 2640 bgt,pt %ncc, .ci_sm_notalign4 ! loop til 3 or fewer bytes remain 2641 stb %o3, [%o1 - 1] 2642 add %o2, 3, %o2 ! restore count 2643 .ci_sm_left: 2644 tst %o2 2645 bz,pt %ncc, .ci_sm_exit 2646 nop 2647 lduba [%o0]ASI_USER, %o3 ! load one byte 2648 deccc %o2 ! reduce count for cc test 2649 bz,pt %ncc, .ci_sm_exit 2650 stb %o3,[%o1] ! store one byte 2651 inc %o0 2652 lduba [%o0]ASI_USER, %o3 ! load second byte 2653 deccc %o2 2654 bz,pt %ncc, .ci_sm_exit 2655 stb %o3,[%o1 + 1] ! 
! store second byte
2656 	inc	%o0
2657 	lduba	[%o0]ASI_USER, %o3	! load third byte
2658 	stb	%o3,[%o1 + 2]		! store third byte
2659 	membar	#Sync				! sync error barrier
2660 	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2661 	retl
2662 	mov	%g0, %o0		! return 0
2663 	.align	16
2664 .ci_sm_words:
2665 	lduwa	[%o0]ASI_USER, %o3	! read word
2666 .ci_sm_wordx:
2667 	subcc	%o2, 8, %o2		! update count
2668 	stw	%o3, [%o1]		! write word
2669 	add	%o0, 4, %o0		! update SRC
2670 	add	%o1, 8, %o1		! update DST
2671 	lduwa	[%o0]ASI_USER, %o3	! read word
2672 	add	%o0, 4, %o0		! update SRC
2673 	bgt,pt	%ncc, .ci_sm_words	! loop til done
2674 	stw	%o3, [%o1 - 4]		! write word
2675 	addcc	%o2, 7, %o2		! restore count
2676 	bz,pt	%ncc, .ci_sm_exit
2677 	nop
2678 	deccc	%o2
2679 	bz,pt	%ncc, .ci_sm_byte
2680 .ci_sm_half:
2681 	subcc	%o2, 2, %o2		! reduce count by 2
2682 	lduha	[%o0]ASI_USER, %o3	! read half word
2683 	add	%o0, 2, %o0		! advance SRC by 2
2684 	add	%o1, 2, %o1		! advance DST by 2
2685 	bgt,pt	%ncc, .ci_sm_half	! loop til done
2686 	sth	%o3, [%o1 - 2]		! write half word
2687 	addcc	%o2, 1, %o2		! restore count
2688 	bz,pt	%ncc, .ci_sm_exit
2689 	nop
2690 .ci_sm_byte:
2691 	lduba	[%o0]ASI_USER, %o3
2692 	stb	%o3, [%o1]
2693 	membar	#Sync				! sync error barrier
2694 	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2695 	retl
2696 	mov	%g0, %o0		! return 0
2697 	.align	16
2698 .ci_sm_word:
2699 	subcc	%o2, 4, %o2		! update count
2700 	bgt,pt	%ncc, .ci_sm_wordx
2701 	lduwa	[%o0]ASI_USER, %o3	! read word
2702 	addcc	%o2, 3, %o2		! restore count
2703 	bz,pt	%ncc, .ci_sm_exit
2704 	stw	%o3, [%o1]		! write word
2705 	deccc	%o2			! reduce count for cc test
2706 	add	%o0, 4, %o0
2707 	lduba	[%o0]ASI_USER, %o3	! load one byte
2708 	bz,pt	%ncc, .ci_sm_exit
2709 	stb	%o3, [%o1 + 4]		! store one byte
2710 	inc	%o0
2711 	lduba	[%o0]ASI_USER, %o3	! load second byte
2712 	deccc	%o2
2713 	bz,pt	%ncc, .ci_sm_exit
2714 	stb	%o3, [%o1 + 5]		! store second byte
2715 	inc	%o0
2716 	lduba	[%o0]ASI_USER, %o3	! load third byte
2717 	stb	%o3, [%o1 + 6]		! store third byte
2718 .ci_sm_exit:
2719 	membar	#Sync				! sync error barrier
2720 	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2721 	retl
2722 	mov	%g0, %o0		! return 0
2723 
2724 	.align 16
2725 .ci_med:
2726 	xor	%o0, %o1, %o3		! setup alignment check
2727 	btst	1, %o3
2728 	bnz,pt	%ncc, .ci_sm_movebytes	! unaligned
2729 	nop
2730 	btst	3, %o3
2731 	bnz,pt	%ncc, .ci_med_half	! halfword aligned
2732 	nop
2733 	btst	7, %o3
2734 	bnz,pt	%ncc, .ci_med_word	! word aligned
2735 	nop
2736 .ci_med_long:
2737 	btst	3, %o0			! check for
2738 	bz,pt	%ncc, .ci_med_long1	! word alignment
2739 	nop
2740 .ci_med_long0:
2741 	lduba	[%o0]ASI_USER, %o3	! load one byte
2742 	inc	%o0
2743 	stb	%o3,[%o1]		! store byte
2744 	inc	%o1
2745 	btst	3, %o0
2746 	bnz,pt	%ncc, .ci_med_long0
2747 	dec	%o2
2748 .ci_med_long1:				! word aligned
2749 	btst	7, %o0			! check for long word
2750 	bz,pt	%ncc, .ci_med_long2
2751 	nop
2752 	lduwa	[%o0]ASI_USER, %o3	! load word
2753 	add	%o0, 4, %o0		! advance SRC by 4
2754 	stw	%o3, [%o1]		! store word
2755 	add	%o1, 4, %o1		! advance DST by 4
2756 	sub	%o2, 4, %o2		! reduce count by 4
2757 	!
2758 	! Now long word aligned and have at least 32 bytes to move
2759 	!
2760 .ci_med_long2:
2761 	sub	%o2, 31, %o2		! adjust count to allow cc zero test
2762 .ci_med_lmove:
2763 	ldxa	[%o0]ASI_USER, %o3	! read long word
2764 	subcc	%o2, 32, %o2		! reduce count by 32
2765 	stx	%o3, [%o1]		! write long word
2766 	add	%o0, 8, %o0		! advance SRC by 8
2767 	ldxa	[%o0]ASI_USER, %o3	! repeat for a total of 4 long words
2768 	add	%o0, 8, %o0		! advance SRC by 8
2769 	stx	%o3, [%o1 + 8]
2770 	add	%o1, 32, %o1		! advance DST by 32
2771 	ldxa	[%o0]ASI_USER, %o3
2772 	add	%o0, 8, %o0		! advance SRC by 8
2773 	stx	%o3, [%o1 - 16]
2774 	ldxa	[%o0]ASI_USER, %o3
2775 	add	%o0, 8, %o0		! advance SRC by 8
2776 	bgt,pt	%ncc, .ci_med_lmove	! loop til 31 or fewer bytes left
2777 	stx	%o3, [%o1 - 8]
2778 	addcc	%o2, 24, %o2		! restore count to long word offset
2779 	ble,pt	%ncc, .ci_med_lextra	! check for more long words to move
2780 	nop
2781 .ci_med_lword:
2782 	ldxa	[%o0]ASI_USER, %o3	! read long word
2783 	subcc	%o2, 8, %o2		! reduce count by 8
2784 	stx	%o3, [%o1]		! write long word
2785 	add	%o0, 8, %o0		! advance SRC by 8
2786 	bgt,pt	%ncc, .ci_med_lword	! loop til 7 or fewer bytes left
2787 	add	%o1, 8, %o1		! advance DST by 8
2788 .ci_med_lextra:
2789 	addcc	%o2, 7, %o2		! restore rest of count
2790 	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
2791 	deccc	%o2
2792 	bz,pt	%ncc, .ci_sm_byte
2793 	nop
2794 	ba,pt	%ncc, .ci_sm_half
2795 	nop
2796 
2797 	.align 16
2798 	nop				! instruction alignment
2799 					! see discussion at start of file
2800 .ci_med_word:
2801 	btst	3, %o0			! check for
2802 	bz,pt	%ncc, .ci_med_word1	! word alignment
2803 	nop
2804 .ci_med_word0:
2805 	lduba	[%o0]ASI_USER, %o3	! load one byte
2806 	inc	%o0
2807 	stb	%o3,[%o1]		! store byte
2808 	inc	%o1
2809 	btst	3, %o0
2810 	bnz,pt	%ncc, .ci_med_word0
2811 	dec	%o2
2812 	!
2813 	! Now word aligned and have at least 36 bytes to move
2814 	!
2815 .ci_med_word1:
2816 	sub	%o2, 15, %o2		! adjust count to allow cc zero test
2817 .ci_med_wmove:
2818 	lduwa	[%o0]ASI_USER, %o3	! read word
2819 	subcc	%o2, 16, %o2		! reduce count by 16
2820 	stw	%o3, [%o1]		! write word
2821 	add	%o0, 4, %o0		! advance SRC by 4
2822 	lduwa	[%o0]ASI_USER, %o3	! repeat for a total of 4 words
2823 	add	%o0, 4, %o0		! advance SRC by 4
2824 	stw	%o3, [%o1 + 4]
2825 	add	%o1, 16, %o1		! advance DST by 16
2826 	lduwa	[%o0]ASI_USER, %o3
2827 	add	%o0, 4, %o0		! advance SRC by 4
2828 	stw	%o3, [%o1 - 8]
2829 	lduwa	[%o0]ASI_USER, %o3
2830 	add	%o0, 4, %o0		! advance SRC by 4
2831 	bgt,pt	%ncc, .ci_med_wmove	! loop til 15 or fewer bytes left
2832 	stw	%o3, [%o1 - 4]
2833 	addcc	%o2, 12, %o2		! restore count to word offset
2834 	ble,pt	%ncc, .ci_med_wextra	! check for more words to move
2835 	nop
2836 .ci_med_word2:
2837 	lduwa	[%o0]ASI_USER, %o3	! read word
2838 	subcc	%o2, 4, %o2		! reduce count by 4
2839 	stw	%o3, [%o1]		! write word
2840 	add	%o0, 4, %o0		! advance SRC by 4
2841 	bgt,pt	%ncc, .ci_med_word2	! loop til 3 or fewer bytes left
2842 	add	%o1, 4, %o1		! advance DST by 4
2843 .ci_med_wextra:
2844 	addcc	%o2, 3, %o2		! restore rest of count
2845 	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
2846 	deccc	%o2
2847 	bz,pt	%ncc, .ci_sm_byte
2848 	nop
2849 	ba,pt	%ncc, .ci_sm_half
2850 	nop
2851 
2852 	.align 16
2853 	nop				! instruction alignment
2854 					! see discussion at start of file
2855 .ci_med_half:
2856 	btst	1, %o0			! check for
2857 	bz,pt	%ncc, .ci_med_half1	! half word alignment
2858 	nop
2859 	lduba	[%o0]ASI_USER, %o3	! load one byte
2860 	inc	%o0
2861 	stb	%o3,[%o1]		! store byte
2862 	inc	%o1
2863 	dec	%o2
2864 	!
2865 	! Now half word aligned and have at least 38 bytes to move
2866 	!
2867 .ci_med_half1:
2868 	sub	%o2, 7, %o2		! adjust count to allow cc zero test
2869 .ci_med_hmove:
2870 	lduha	[%o0]ASI_USER, %o3	! read half word
2871 	subcc	%o2, 8, %o2		! reduce count by 8
2872 	sth	%o3, [%o1]		! write half word
2873 	add	%o0, 2, %o0		! advance SRC by 2
2874 	lduha	[%o0]ASI_USER, %o3	! repeat for a total of 4 halfwords
2875 	add	%o0, 2, %o0		! advance SRC by 2
2876 	sth	%o3, [%o1 + 2]
2877 	add	%o1, 8, %o1		! advance DST by 8
2878 	lduha	[%o0]ASI_USER, %o3
2879 	add	%o0, 2, %o0		! advance SRC by 2
2880 	sth	%o3, [%o1 - 4]
2881 	lduha	[%o0]ASI_USER, %o3
2882 	add	%o0, 2, %o0		! advance SRC by 2
2883 	bgt,pt	%ncc, .ci_med_hmove	! loop til 7 or fewer bytes left
2884 	sth	%o3, [%o1 - 2]
2885 	addcc	%o2, 7, %o2		! restore count
2886 	bz,pt	%ncc, .ci_sm_exit
2887 	deccc	%o2
2888 	bz,pt	%ncc, .ci_sm_byte
2889 	nop
2890 	ba,pt	%ncc, .ci_sm_half
2891 	nop
2892 
2893 .sm_copyin_err:
2894 	membar	#Sync
2895 	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2896 	mov	SM_SAVE_SRC, %o0
2897 	mov	SM_SAVE_DST, %o1
2898 	mov	SM_SAVE_COUNT, %o2
2899 	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
2900 	tst	%o3
2901 	bz,pt	%ncc, 3f			! if not, return error
2902 	nop
2903 	ldn	[%o3 + CP_COPYIN], %o5		! if handler, invoke it with
2904 	jmp	%o5				! original arguments
2905 	nop
2906 3:
2907 	retl
2908 	or	%g0, -1, %o0		! return error value
2909 
2910 	SET_SIZE(copyin)
2911 
2912 
2913 /*
2914  * The _more entry points are not intended to be used directly by
2915  * any caller from outside this file. They are provided to allow
2916  * profiling and dtrace of the portions of the copy code that use
2917  * the floating point registers.
2918  * This entry is particularly important as DTRACE (at least as of
2919  * 4/2004) does not support leaf functions.
2920  */
2921 
2922 	ENTRY(copyin_more)
2923 .copyin_more:
2924 	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2925 	set	.copyin_err, REAL_LOFAULT
2926 
2927 /*
2928  * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes
2929  */
2930 .do_copyin:
2931 	set	copyio_fault, %l7	! copyio_fault is lofault val
2932 
2933 	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
2934 	membar	#Sync				! sync error barrier
2935 	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
2936 
2937 	mov	%i0, SAVE_SRC
2938 	mov	%i1, SAVE_DST
2939 	mov	%i2, SAVE_COUNT
2940 
2941 	FP_NOMIGRATE(6, 7)
2942 
2943 	rd	%fprs, %o2		! check for unused fp
2944 	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]	! save orig %fprs
2945 	btst	FPRS_FEF, %o2
2946 	bz,a,pt	%icc, .do_blockcopyin
2947 	wr	%g0, FPRS_FEF, %fprs
2948 
2949 	BST_FPQ2Q4_TOSTACK(%o2)
2950 
2951 .do_blockcopyin:
2952 	rd	%gsr, %o2
2953 	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
2954 	or	%l6, FPUSED_FLAG, %l6
2955 
2956 	andcc	DST, VIS_BLOCKSIZE - 1, TMP
2957 	mov	ASI_USER, %asi
2958 	bz,pt	%ncc, 2f
2959 	neg	TMP
2960 	add	TMP, VIS_BLOCKSIZE, TMP
2961 
2962 	! TMP = bytes required to align DST on FP_BLOCK boundary
2963 	! Using SRC as a tmp here
2964 	cmp	TMP, 3
2965 	bleu,pt	%ncc, 1f
2966 	sub	CNT,TMP,CNT		! adjust main count
2967 	sub	TMP, 3, TMP		! adjust for end of loop test
2968 .ci_blkalign:
2969 	lduba	[REALSRC]%asi, SRC	! move 4 bytes per loop iteration
2970 	stb	SRC, [DST]
2971 	subcc	TMP, 4, TMP
2972 	lduba	[REALSRC + 1]%asi, SRC
2973 	add	REALSRC, 4, REALSRC
2974 	stb	SRC, [DST + 1]
2975 	lduba	[REALSRC - 2]%asi, SRC
2976 	add	DST, 4, DST
2977 	stb	SRC, [DST - 2]
2978 	lduba	[REALSRC - 1]%asi, SRC
2979 	bgu,pt	%ncc, .ci_blkalign
2980 	stb	SRC, [DST - 1]
2981 
2982 	addcc	TMP, 3, TMP		! restore count adjustment
2983 	bz,pt	%ncc, 2f		! no bytes left?
2984 	nop
2985 1:	lduba	[REALSRC]%asi, SRC
2986 	inc	REALSRC
2987 	inc	DST
2988 	deccc	TMP
2989 	bgu	%ncc, 1b
2990 	stb	SRC, [DST - 1]
2991 
2992 2:
2993 	andn	REALSRC, 0x7, SRC
2994 	alignaddr REALSRC, %g0, %g0
2995 
2996 	! SRC - 8-byte aligned
2997 	! 
DST - 64-byte aligned 2998 prefetcha [SRC]%asi, #one_read 2999 prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #one_read 3000 prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #one_read 3001 prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #one_read 3002 ldda [SRC]%asi, %f16 3003 #if CHEETAH_PREFETCH > 4 3004 prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read 3005 #endif 3006 ldda [SRC + 0x08]%asi, %f18 3007 #if CHEETAH_PREFETCH > 5 3008 prefetcha [SRC + (5 * VIS_BLOCKSIZE)]%asi, #one_read 3009 #endif 3010 ldda [SRC + 0x10]%asi, %f20 3011 #if CHEETAH_PREFETCH > 6 3012 prefetcha [SRC + (6 * VIS_BLOCKSIZE)]%asi, #one_read 3013 #endif 3014 faligndata %f16, %f18, %f48 3015 ldda [SRC + 0x18]%asi, %f22 3016 #if CHEETAH_PREFETCH > 7 3017 prefetcha [SRC + (7 * VIS_BLOCKSIZE)]%asi, #one_read 3018 #endif 3019 faligndata %f18, %f20, %f50 3020 ldda [SRC + 0x20]%asi, %f24 3021 faligndata %f20, %f22, %f52 3022 ldda [SRC + 0x28]%asi, %f26 3023 faligndata %f22, %f24, %f54 3024 ldda [SRC + 0x30]%asi, %f28 3025 faligndata %f24, %f26, %f56 3026 ldda [SRC + 0x38]%asi, %f30 3027 faligndata %f26, %f28, %f58 3028 ldda [SRC + VIS_BLOCKSIZE]%asi, %f16 3029 sub CNT, VIS_BLOCKSIZE, CNT 3030 add SRC, VIS_BLOCKSIZE, SRC 3031 add REALSRC, VIS_BLOCKSIZE, REALSRC 3032 ba,a,pt %ncc, 1f 3033 nop 3034 .align 16 3035 1: 3036 ldda [SRC + 0x08]%asi, %f18 3037 faligndata %f28, %f30, %f60 3038 ldda [SRC + 0x10]%asi, %f20 3039 faligndata %f30, %f16, %f62 3040 stda %f48, [DST]ASI_BLK_P 3041 ldda [SRC + 0x18]%asi, %f22 3042 faligndata %f16, %f18, %f48 3043 ldda [SRC + 0x20]%asi, %f24 3044 faligndata %f18, %f20, %f50 3045 ldda [SRC + 0x28]%asi, %f26 3046 faligndata %f20, %f22, %f52 3047 ldda [SRC + 0x30]%asi, %f28 3048 faligndata %f22, %f24, %f54 3049 ldda [SRC + 0x38]%asi, %f30 3050 faligndata %f24, %f26, %f56 3051 sub CNT, VIS_BLOCKSIZE, CNT 3052 ldda [SRC + VIS_BLOCKSIZE]%asi, %f16 3053 faligndata %f26, %f28, %f58 3054 prefetcha [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8]%asi, #one_read 3055 add DST, VIS_BLOCKSIZE, DST 3056 prefetcha [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read 3057 add REALSRC, VIS_BLOCKSIZE, REALSRC 3058 cmp CNT, VIS_BLOCKSIZE + 8 3059 bgu,pt %ncc, 1b 3060 add SRC, VIS_BLOCKSIZE, SRC 3061 3062 ! 
only if REALSRC & 0x7 is 0 3063 cmp CNT, VIS_BLOCKSIZE 3064 bne %ncc, 3f 3065 andcc REALSRC, 0x7, %g0 3066 bz,pt %ncc, 2f 3067 nop 3068 3: 3069 faligndata %f28, %f30, %f60 3070 faligndata %f30, %f16, %f62 3071 stda %f48, [DST]ASI_BLK_P 3072 add DST, VIS_BLOCKSIZE, DST 3073 ba,pt %ncc, 3f 3074 nop 3075 2: 3076 ldda [SRC + 0x08]%asi, %f18 3077 fsrc1 %f28, %f60 3078 ldda [SRC + 0x10]%asi, %f20 3079 fsrc1 %f30, %f62 3080 stda %f48, [DST]ASI_BLK_P 3081 ldda [SRC + 0x18]%asi, %f22 3082 fsrc1 %f16, %f48 3083 ldda [SRC + 0x20]%asi, %f24 3084 fsrc1 %f18, %f50 3085 ldda [SRC + 0x28]%asi, %f26 3086 fsrc1 %f20, %f52 3087 ldda [SRC + 0x30]%asi, %f28 3088 fsrc1 %f22, %f54 3089 ldda [SRC + 0x38]%asi, %f30 3090 fsrc1 %f24, %f56 3091 sub CNT, VIS_BLOCKSIZE, CNT 3092 add DST, VIS_BLOCKSIZE, DST 3093 add SRC, VIS_BLOCKSIZE, SRC 3094 add REALSRC, VIS_BLOCKSIZE, REALSRC 3095 fsrc1 %f26, %f58 3096 fsrc1 %f28, %f60 3097 fsrc1 %f30, %f62 3098 stda %f48, [DST]ASI_BLK_P 3099 add DST, VIS_BLOCKSIZE, DST 3100 ba,a,pt %ncc, 4f 3101 nop 3102 3103 3: tst CNT 3104 bz,a %ncc, 4f 3105 nop 3106 3107 5: lduba [REALSRC]ASI_USER, TMP 3108 inc REALSRC 3109 inc DST 3110 deccc CNT 3111 bgu %ncc, 5b 3112 stb TMP, [DST - 1] 3113 4: 3114 3115 .copyin_exit: 3116 membar #Sync 3117 3118 FPRAS_INTERVAL(FPRAS_COPYIN, 1, %l5, %o2, %o3, %o4, %o5, 8) 3119 FPRAS_REWRITE_TYPE1(1, %l5, %f48, %o2, 9) 3120 FPRAS_CHECK(FPRAS_COPYIN, %l5, 9) ! lose outputs 3121 3122 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr 3123 wr %o2, 0, %gsr 3124 3125 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 3126 btst FPRS_FEF, %o3 3127 bz,pt %icc, 4f 3128 nop 3129 3130 BLD_FPQ2Q4_FROMSTACK(%o2) 3131 3132 ba,pt %ncc, 1f 3133 wr %o3, 0, %fprs ! restore fprs 3134 3135 4: 3136 FZEROQ2Q4 3137 wr %o3, 0, %fprs ! restore fprs 3138 3139 1: 3140 membar #Sync ! sync error barrier 3141 andn %l6, FPUSED_FLAG, %l6 3142 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 3143 FP_ALLOWMIGRATE(5, 6) 3144 ret 3145 restore %g0, 0, %o0 3146 /* 3147 * We got here because of a fault during copyin 3148 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). 3149 */ 3150 .copyin_err: 3151 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 3152 tst %o4 3153 bz,pt %ncc, 2f ! if not, return error 3154 nop 3155 ldn [%o4 + CP_COPYIN], %g2 ! if handler, invoke it with 3156 jmp %g2 ! original arguments 3157 restore %g0, 0, %g0 ! dispose of copy window 3158 2: 3159 ret 3160 restore %g0, -1, %o0 ! return error value 3161 3162 3163 SET_SIZE(copyin_more) 3164 3165 ENTRY(xcopyin) 3166 3167 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 3168 bleu,pt %ncc, .xcopyin_small ! go to larger cases 3169 xor %o0, %o1, %o3 ! are src, dst alignable? 3170 btst 7, %o3 ! 3171 bz,pt %ncc, .xcopyin_8 ! check for longword alignment 3172 nop 3173 btst 1, %o3 ! 3174 bz,pt %ncc, .xcopyin_2 ! check for half-word 3175 nop 3176 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 3177 ld [%o3 + %lo(hw_copy_limit_1)], %o3 3178 tst %o3 3179 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 3180 cmp %o2, %o3 ! if length <= limit 3181 bleu,pt %ncc, .xcopyin_small ! go to small copy 3182 nop 3183 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 3184 nop 3185 .xcopyin_2: 3186 btst 3, %o3 ! 3187 bz,pt %ncc, .xcopyin_4 ! check for word alignment 3188 nop 3189 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 3190 ld [%o3 + %lo(hw_copy_limit_2)], %o3 3191 tst %o3 3192 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 3193 cmp %o2, %o3 ! 
if length <= limit 3194 bleu,pt %ncc, .xcopyin_small ! go to small copy 3195 nop 3196 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 3197 nop 3198 .xcopyin_4: 3199 ! already checked longword, must be word aligned 3200 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 3201 ld [%o3 + %lo(hw_copy_limit_4)], %o3 3202 tst %o3 3203 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 3204 cmp %o2, %o3 ! if length <= limit 3205 bleu,pt %ncc, .xcopyin_small ! go to small copy 3206 nop 3207 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 3208 nop 3209 .xcopyin_8: 3210 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 3211 ld [%o3 + %lo(hw_copy_limit_8)], %o3 3212 tst %o3 3213 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 3214 cmp %o2, %o3 ! if length <= limit 3215 bleu,pt %ncc, .xcopyin_small ! go to small copy 3216 nop 3217 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 3218 nop 3219 3220 .xcopyin_small: 3221 sethi %hi(.sm_xcopyin_err), %o5 ! .sm_xcopyin_err is lofault value 3222 or %o5, %lo(.sm_xcopyin_err), %o5 3223 ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofaul 3224 membar #Sync ! sync error barrier 3225 ba,pt %ncc, .sm_do_copyin ! common code 3226 stn %o5, [THREAD_REG + T_LOFAULT] 3227 3228 .xcopyin_more: 3229 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 3230 sethi %hi(.xcopyin_err), REAL_LOFAULT ! .xcopyin_err is lofault value 3231 ba,pt %ncc, .do_copyin 3232 or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT 3233 3234 /* 3235 * We got here because of fault during xcopyin 3236 * Errno value is in ERRNO 3237 */ 3238 .xcopyin_err: 3239 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 3240 tst %o4 3241 bz,pt %ncc, 2f ! if not, return error 3242 nop 3243 ldn [%o4 + CP_XCOPYIN], %g2 ! if handler, invoke it with 3244 jmp %g2 ! original arguments 3245 restore %g0, 0, %g0 ! dispose of copy window 3246 2: 3247 ret 3248 restore ERRNO, 0, %o0 ! return errno value 3249 3250 .sm_xcopyin_err: 3251 3252 membar #Sync 3253 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 3254 mov SM_SAVE_SRC, %o0 3255 mov SM_SAVE_DST, %o1 3256 mov SM_SAVE_COUNT, %o2 3257 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler 3258 tst %o3 3259 bz,pt %ncc, 3f ! if not, return error 3260 nop 3261 ldn [%o3 + CP_XCOPYIN], %o5 ! if handler, invoke it with 3262 jmp %o5 ! original arguments 3263 nop 3264 3: 3265 retl 3266 or %g1, 0, %o0 ! return errno value 3267 3268 SET_SIZE(xcopyin) 3269 3270 ENTRY(xcopyin_little) 3271 sethi %hi(.xcopyio_err), %o5 3272 or %o5, %lo(.xcopyio_err), %o5 3273 ldn [THREAD_REG + T_LOFAULT], %o4 3274 membar #Sync ! sync error barrier 3275 stn %o5, [THREAD_REG + T_LOFAULT] 3276 mov %o4, %o5 3277 3278 subcc %g0, %o2, %o3 3279 add %o0, %o2, %o0 3280 bz,pn %ncc, 2f ! check for zero bytes 3281 sub %o2, 1, %o4 3282 add %o0, %o4, %o0 ! start w/last byte 3283 add %o1, %o2, %o1 3284 lduba [%o0 + %o3]ASI_AIUSL, %o4 3285 3286 1: stb %o4, [%o1 + %o3] 3287 inccc %o3 3288 sub %o0, 2, %o0 ! get next byte 3289 bcc,a,pt %ncc, 1b 3290 lduba [%o0 + %o3]ASI_AIUSL, %o4 3291 3292 2: 3293 membar #Sync ! sync error barrier 3294 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 3295 retl 3296 mov %g0, %o0 ! return (0) 3297 3298 .xcopyio_err: 3299 membar #Sync ! sync error barrier 3300 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 3301 retl 3302 mov %g1, %o0 3303 3304 SET_SIZE(xcopyin_little) 3305 3306 3307 /* 3308 * Copy a block of storage - must not overlap (from + len <= to). 
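 *
 * A minimal sketch of the on_fault()-protected calling pattern that the
 * next line assumes (hypothetical caller, for illustration only):
 *
 *	label_t	ljb;
 *
 *	if (on_fault(&ljb)) {
 *		no_fault();
 *		return (EFAULT);	any fault vectors here
 *	}
 *	copyin_noerr(uaddr, kaddr, count);
 *	no_fault();
 *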
3309 * No fault handler installed (to be called under on_fault()) 3310 */ 3311 ENTRY(copyin_noerr) 3312 3313 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 3314 bleu,pt %ncc, .copyin_ne_small ! go to larger cases 3315 xor %o0, %o1, %o3 ! are src, dst alignable? 3316 btst 7, %o3 ! 3317 bz,pt %ncc, .copyin_ne_8 ! check for longword alignment 3318 nop 3319 btst 1, %o3 ! 3320 bz,pt %ncc, .copyin_ne_2 ! check for half-word 3321 nop 3322 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 3323 ld [%o3 + %lo(hw_copy_limit_1)], %o3 3324 tst %o3 3325 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 3326 cmp %o2, %o3 ! if length <= limit 3327 bleu,pt %ncc, .copyin_ne_small ! go to small copy 3328 nop 3329 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 3330 nop 3331 .copyin_ne_2: 3332 btst 3, %o3 ! 3333 bz,pt %ncc, .copyin_ne_4 ! check for word alignment 3334 nop 3335 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 3336 ld [%o3 + %lo(hw_copy_limit_2)], %o3 3337 tst %o3 3338 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 3339 cmp %o2, %o3 ! if length <= limit 3340 bleu,pt %ncc, .copyin_ne_small ! go to small copy 3341 nop 3342 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 3343 nop 3344 .copyin_ne_4: 3345 ! already checked longword, must be word aligned 3346 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 3347 ld [%o3 + %lo(hw_copy_limit_4)], %o3 3348 tst %o3 3349 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 3350 cmp %o2, %o3 ! if length <= limit 3351 bleu,pt %ncc, .copyin_ne_small ! go to small copy 3352 nop 3353 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 3354 nop 3355 .copyin_ne_8: 3356 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 3357 ld [%o3 + %lo(hw_copy_limit_8)], %o3 3358 tst %o3 3359 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 3360 cmp %o2, %o3 ! if length <= limit 3361 bleu,pt %ncc, .copyin_ne_small ! go to small copy 3362 nop 3363 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 3364 nop 3365 3366 .copyin_ne_small: 3367 ldn [THREAD_REG + T_LOFAULT], %o4 3368 tst %o4 3369 bz,pn %ncc, .sm_do_copyin 3370 nop 3371 sethi %hi(.sm_copyio_noerr), %o5 3372 or %o5, %lo(.sm_copyio_noerr), %o5 3373 membar #Sync ! sync error barrier 3374 ba,pt %ncc, .sm_do_copyin 3375 stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault 3376 3377 .copyin_noerr_more: 3378 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 3379 sethi %hi(.copyio_noerr), REAL_LOFAULT 3380 ba,pt %ncc, .do_copyin 3381 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT 3382 3383 .copyio_noerr: 3384 jmp %l6 3385 restore %g0,0,%g0 3386 3387 .sm_copyio_noerr: 3388 membar #Sync 3389 stn %o4, [THREAD_REG + T_LOFAULT] ! restore t_lofault 3390 jmp %o4 3391 nop 3392 3393 SET_SIZE(copyin_noerr) 3394 3395 /* 3396 * Copy a block of storage - must not overlap (from + len <= to). 3397 * No fault handler installed (to be called under on_fault()) 3398 */ 3399 3400 ENTRY(copyout_noerr) 3401 3402 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 3403 bleu,pt %ncc, .copyout_ne_small ! go to larger cases 3404 xor %o0, %o1, %o3 ! are src, dst alignable? 3405 btst 7, %o3 ! 3406 bz,pt %ncc, .copyout_ne_8 ! check for longword alignment 3407 nop 3408 btst 1, %o3 ! 3409 bz,pt %ncc, .copyout_ne_2 ! check for half-word 3410 nop 3411 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 3412 ld [%o3 + %lo(hw_copy_limit_1)], %o3 3413 tst %o3 3414 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 3415 cmp %o2, %o3 ! 
if length <= limit 3416 bleu,pt %ncc, .copyout_ne_small ! go to small copy 3417 nop 3418 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 3419 nop 3420 .copyout_ne_2: 3421 btst 3, %o3 ! 3422 bz,pt %ncc, .copyout_ne_4 ! check for word alignment 3423 nop 3424 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 3425 ld [%o3 + %lo(hw_copy_limit_2)], %o3 3426 tst %o3 3427 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 3428 cmp %o2, %o3 ! if length <= limit 3429 bleu,pt %ncc, .copyout_ne_small ! go to small copy 3430 nop 3431 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 3432 nop 3433 .copyout_ne_4: 3434 ! already checked longword, must be word aligned 3435 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 3436 ld [%o3 + %lo(hw_copy_limit_4)], %o3 3437 tst %o3 3438 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 3439 cmp %o2, %o3 ! if length <= limit 3440 bleu,pt %ncc, .copyout_ne_small ! go to small copy 3441 nop 3442 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 3443 nop 3444 .copyout_ne_8: 3445 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 3446 ld [%o3 + %lo(hw_copy_limit_8)], %o3 3447 tst %o3 3448 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 3449 cmp %o2, %o3 ! if length <= limit 3450 bleu,pt %ncc, .copyout_ne_small ! go to small copy 3451 nop 3452 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 3453 nop 3454 3455 .copyout_ne_small: 3456 ldn [THREAD_REG + T_LOFAULT], %o4 3457 tst %o4 3458 bz,pn %ncc, .sm_do_copyout 3459 nop 3460 sethi %hi(.sm_copyio_noerr), %o5 3461 or %o5, %lo(.sm_copyio_noerr), %o5 3462 membar #Sync ! sync error barrier 3463 ba,pt %ncc, .sm_do_copyout 3464 stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault 3465 3466 .copyout_noerr_more: 3467 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 3468 sethi %hi(.copyio_noerr), REAL_LOFAULT 3469 ba,pt %ncc, .do_copyout 3470 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT 3471 3472 SET_SIZE(copyout_noerr) 3473 3474 3475 /* 3476 * hwblkclr - clears block-aligned, block-multiple-sized regions that are 3477 * longer than 256 bytes in length using spitfire's block stores. If 3478 * the criteria for using this routine are not met then it calls bzero 3479 * and returns 1. Otherwise 0 is returned indicating success. 3480 * Caller is responsible for ensuring use_hw_bzero is true and that 3481 * kpreempt_disable() has been called. 3482 */ 3483 ! %i0 - start address 3484 ! %i1 - length of region (multiple of 64) 3485 ! %l0 - saved fprs 3486 ! %l1 - pointer to saved %d0 block 3487 ! %l2 - saved curthread->t_lwp 3488 3489 ENTRY(hwblkclr) 3490 ! get another window w/space for one aligned block of saved fpregs 3491 save %sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp 3492 3493 ! Must be block-aligned 3494 andcc %i0, (VIS_BLOCKSIZE-1), %g0 3495 bnz,pn %ncc, 1f 3496 nop 3497 3498 ! ... and must be 256 bytes or more 3499 cmp %i1, 256 3500 blu,pn %ncc, 1f 3501 nop 3502 3503 ! ... and length must be a multiple of VIS_BLOCKSIZE 3504 andcc %i1, (VIS_BLOCKSIZE-1), %g0 3505 bz,pn %ncc, 2f 3506 nop 3507 3508 1: ! punt, call bzero but notify the caller that bzero was used 3509 mov %i0, %o0 3510 call bzero 3511 mov %i1, %o1 3512 ret 3513 restore %g0, 1, %o0 ! return (1) - did not use block operations 3514 3515 2: rd %fprs, %l0 ! check for unused fp 3516 btst FPRS_FEF, %l0 3517 bz,pt %icc, 1f 3518 nop 3519 3520 ! 
save in-use fpregs on stack 3521 membar #Sync 3522 add %fp, STACK_BIAS - 65, %l1 3523 and %l1, -VIS_BLOCKSIZE, %l1 3524 stda %d0, [%l1]ASI_BLK_P 3525 3526 1: membar #StoreStore|#StoreLoad|#LoadStore 3527 wr %g0, FPRS_FEF, %fprs 3528 wr %g0, ASI_BLK_P, %asi 3529 3530 ! Clear block 3531 fzero %d0 3532 fzero %d2 3533 fzero %d4 3534 fzero %d6 3535 fzero %d8 3536 fzero %d10 3537 fzero %d12 3538 fzero %d14 3539 3540 mov 256, %i3 3541 ba,pt %ncc, .pz_doblock 3542 nop 3543 3544 .pz_blkstart: 3545 ! stda %d0, [%i0 + 192]%asi ! in dly slot of branch that got us here 3546 stda %d0, [%i0 + 128]%asi 3547 stda %d0, [%i0 + 64]%asi 3548 stda %d0, [%i0]%asi 3549 .pz_zinst: 3550 add %i0, %i3, %i0 3551 sub %i1, %i3, %i1 3552 .pz_doblock: 3553 cmp %i1, 256 3554 bgeu,a %ncc, .pz_blkstart 3555 stda %d0, [%i0 + 192]%asi 3556 3557 cmp %i1, 64 3558 blu %ncc, .pz_finish 3559 3560 andn %i1, (64-1), %i3 3561 srl %i3, 4, %i2 ! using blocks, 1 instr / 16 words 3562 set .pz_zinst, %i4 3563 sub %i4, %i2, %i4 3564 jmp %i4 3565 nop 3566 3567 .pz_finish: 3568 membar #Sync 3569 btst FPRS_FEF, %l0 3570 bz,a .pz_finished 3571 wr %l0, 0, %fprs ! restore fprs 3572 3573 ! restore fpregs from stack 3574 ldda [%l1]ASI_BLK_P, %d0 3575 membar #Sync 3576 wr %l0, 0, %fprs ! restore fprs 3577 3578 .pz_finished: 3579 ret 3580 restore %g0, 0, %o0 ! return (bzero or not) 3581 3582 SET_SIZE(hwblkclr) 3583 3584 /* 3585 * Copy 32 bytes of data from src (%o0) to dst (%o1) 3586 * using physical addresses. 3587 */ 3588 ENTRY_NP(hw_pa_bcopy32) 3589 rdpr %pstate, %g1 3590 andn %g1, PSTATE_IE, %g2 3591 wrpr %g0, %g2, %pstate 3592 3593 rdpr %pstate, %g0 3594 ldxa [%o0]ASI_MEM, %o2 3595 add %o0, 8, %o0 3596 ldxa [%o0]ASI_MEM, %o3 3597 add %o0, 8, %o0 3598 ldxa [%o0]ASI_MEM, %o4 3599 add %o0, 8, %o0 3600 ldxa [%o0]ASI_MEM, %o5 3601 3602 stxa %g0, [%o1]ASI_DC_INVAL 3603 membar #Sync 3604 3605 stxa %o2, [%o1]ASI_MEM 3606 add %o1, 8, %o1 3607 stxa %o3, [%o1]ASI_MEM 3608 add %o1, 8, %o1 3609 stxa %o4, [%o1]ASI_MEM 3610 add %o1, 8, %o1 3611 stxa %o5, [%o1]ASI_MEM 3612 3613 retl 3614 wrpr %g0, %g1, %pstate 3615 3616 SET_SIZE(hw_pa_bcopy32) 3617 3618 DGDEF(use_hw_bcopy) 3619 .word 1 3620 DGDEF(use_hw_bzero) 3621 .word 1 3622 DGDEF(hw_copy_limit_1) 3623 .word 0 3624 DGDEF(hw_copy_limit_2) 3625 .word 0 3626 DGDEF(hw_copy_limit_4) 3627 .word 0 3628 DGDEF(hw_copy_limit_8) 3629 .word 0 3630 3631 .align 64 3632 .section ".text"
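
/*
 * The DGDEF variables above are externally patchable: use_hw_bcopy and
 * use_hw_bzero gate the block-copy/block-zero paths entirely, while a
 * zero in hw_copy_limit_N keeps copies of that alignment class in the
 * plain loops. They may be tuned from /etc/system; the values below
 * are purely illustrative, not recommendations:
 *
 *	set use_hw_bzero = 0
 *	set hw_copy_limit_8 = 1024
 */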