1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License, Version 1.0 only
   6  * (the "License").  You may not use this file except in compliance
   7  * with the License.
   8  *
   9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10  * or http://www.opensolaris.org/os/licensing.
  11  * See the License for the specific language governing permissions
  12  * and limitations under the License.
  13  *
  14  * When distributing Covered Code, include this CDDL HEADER in each
  15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16  * If applicable, add the following below this CDDL HEADER, with the
  17  * fields enclosed by brackets "[]" replaced with your own identifying
  18  * information: Portions Copyright [yyyy] [name of copyright owner]
  19  *
  20  * CDDL HEADER END
  21  */
  22 /*
  23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #include <sys/param.h>
  28 #include <sys/errno.h>
  29 #include <sys/asm_linkage.h>
  30 #include <sys/vtrace.h>
  31 #include <sys/machthread.h>
  32 #include <sys/clock.h>
  33 #include <sys/asi.h>
  34 #include <sys/fsr.h>
  35 #include <sys/privregs.h>
  36 #include <sys/fpras_impl.h>
  37 
  38 #include "assym.h"
  39 
  40 /*
  41  * Pseudo-code to aid in understanding the control flow of the
  42  * bcopy/copyin/copyout routines.
  43  *
  44  * On entry:
  45  *
  46  *      ! Determine whether to use the FP register version
  47  *      ! or the leaf routine version depending on size
  48  *      ! of copy and flags.  Set up error handling accordingly.
  49  *      ! The transition point depends on whether the src and
  50  *      ! dst addresses can be aligned to long word, word,
  51  *      ! half word, or byte boundaries.
  52  *      !
  53  *      ! WARNING: <Register usage convention>
  54  *      ! For FP version, %l6 holds previous error handling and
  55  *      ! a flag: TRAMP_FLAG (low bits)
  56  *      ! for leaf routine version, %o4 holds those values.
  57  *      ! So either %l6 or %o4 is reserved and not available for
  58  *      ! any other use.
  59  *
  60  *      if (length <= VIS_COPY_THRESHOLD)    ! start with a quick test
  61  *              go to small_copy;               ! to speed short copies
  62  * 
   63  *      if (src,dst long word alignable) {
  64  *              if (hw_copy_limit_8 == 0)       ! hw_copy disabled
  65  *                      go to small_copy;
  66  *              if (length <= hw_copy_limit_8)
  67  *                      go to small_copy;
  68  *              go to FPBLK_copy;
  69  *      }
  70  *      if (src,dst not alignable) {
  71  *              if (hw_copy_limit_1 == 0)       ! hw_copy disabled
  72  *                      go to small_copy;
  73  *              if (length <= hw_copy_limit_1)
  74  *                      go to small_copy;
  75  *              go to FPBLK_copy;
  76  *      }
  77  *      if (src,dst halfword alignable) {
  78  *              if (hw_copy_limit_2 == 0)       ! hw_copy disabled
  79  *                      go to small_copy;
  80  *              if (length <= hw_copy_limit_2)
  81  *                      go to small_copy;
  82  *              go to FPBLK_copy;
  83  *      }
  84  *      if (src,dst word alignable) {
  85  *              if (hw_copy_limit_4 == 0)       ! hw_copy disabled
  86  *                      go to small_copy;
  87  *              if (length <= hw_copy_limit_4)
  88  *                      go to small_copy;
  89  *              go to FPBLK_copy;
  90  *      }
  91  *
  92  * small_copy:
  93  *      Setup_leaf_rtn_error_handler;           ! diffs for each entry point
  94  *      
  95  *      if (count <= 3)                              ! fast path for tiny copies
  96  *              go to sm_left;                  ! special finish up code
  97  *      else
  98  *              if (count > CHKSIZE)         ! medium sized copies
  99  *                      go to sm_med            ! tuned by alignment
 100  *              if(src&dst not both word aligned) {
 101  *      sm_movebytes:
 102  *                      move byte by byte in 4-way unrolled loop
 103  *                      fall into sm_left;
 104  *      sm_left:
 105  *                      move 0-3 bytes byte at a time as needed.
 106  *                      restore error handler and exit.
 107  *
 108  *              } else {        ! src&dst are word aligned
 109  *                      check for at least 8 bytes left,
 110  *                      move word at a time, unrolled by 2
 111  *                      when fewer than 8 bytes left,
 112  *      sm_half:        move half word at a time while 2 or more bytes left
 113  *      sm_byte:        move final byte if necessary
 114  *      sm_exit:
 115  *                      restore error handler and exit.
 116  *              }
 117  *
 118  * ! Medium length cases with at least CHKSIZE bytes available
 119  * ! method: line up src and dst as best possible, then
 120  * ! move data in 4-way unrolled loops.
 121  *
 122  * sm_med:
 123  *      if(src&dst unalignable)
 124  *              go to sm_movebytes
 125  *      if(src&dst halfword alignable)
 126  *              go to sm_movehalf
 127  *      if(src&dst word alignable)
 128  *              go to sm_moveword
 129  * ! fall into long word movement
 130  *      move bytes until src is word aligned
 131  *      if not long word aligned, move a word
 132  *      move long words in 4-way unrolled loop until < 32 bytes left
 133  *      move long words in 1-way unrolled loop until < 8 bytes left
 134  *      if zero bytes left, goto sm_exit
 135  *      if one byte left, go to sm_byte
 136  *      else go to sm_half
 137  *
 138  * sm_moveword:
 139  *      move bytes until src is word aligned
 140  *      move words in 4-way unrolled loop until < 16 bytes left
 141  *      move words in 1-way unrolled loop until < 4 bytes left
 142  *      if zero bytes left, goto sm_exit
 143  *      if one byte left, go to sm_byte
 144  *      else go to sm_half
 145  *
 146  * sm_movehalf:
 147  *      move a byte if needed to align src on halfword
 148  *      move halfwords in 4-way unrolled loop until < 8 bytes left
 149  *      if zero bytes left, goto sm_exit
 150  *      if one byte left, go to sm_byte
 151  *      else go to sm_half
 152  *
 153  *
 154  * FPBLK_copy:
 155  *      %l6 = curthread->t_lofault;
 156  *      if (%l6 != NULL) {
 157  *              membar #Sync
 158  *              curthread->t_lofault = .copyerr;
 159  *              caller_error_handler = TRUE             ! %l6 |= 2
 160  *      }
 161  *
 162  *      ! for FPU testing we must not migrate cpus
 163  *      if (curthread->t_lwp == NULL) {
 164  *              ! Kernel threads do not have pcb's in which to store
 165  *              ! the floating point state, so disallow preemption during
 166  *              ! the copy.  This also prevents cpu migration.
 167  *              kpreempt_disable(curthread);
 168  *      } else {
 169  *              thread_nomigrate();
 170  *      }
 171  *
 172  *      old_fprs = %fprs;
 173  *      old_gsr = %gsr;
 174  *      if (%fprs.fef) {
 175  *              %fprs.fef = 1;
 176  *              save current fpregs on stack using blockstore
 177  *      } else {
 178  *              %fprs.fef = 1;
 179  *      }
 180  *
 181  *
 182  *      do_blockcopy_here;
 183  *
 184  * In lofault handler:
 185  *      curthread->t_lofault = .copyerr2;
 186  *      Continue on with the normal exit handler
 187  *
 188  * On normal exit:
 189  *      %gsr = old_gsr;
 190  *      if (old_fprs & FPRS_FEF)
 191  *              restore fpregs from stack using blockload
 192  *      else
 193  *              zero fpregs
 194  *      %fprs = old_fprs;
 195  *      membar #Sync
 196  *      curthread->t_lofault = (%l6 & ~3);
 197  *      ! following test omitted from copyin/copyout as they
 198  *      ! will always have a current thread
 199  *      if (curthread->t_lwp == NULL)
 200  *              kpreempt_enable(curthread);
 201  *      else
 202  *              thread_allowmigrate();
 203  *      return (0)
 204  *
 205  * In second lofault handler (.copyerr2):
 206  *      We've tried to restore fp state from the stack and failed.  To
 207  *      prevent from returning with a corrupted fp state, we will panic.
 208  */
 209 
 210 /*
 211  * Comments about optimization choices
 212  *
 213  * The initial optimization decision in this code is to determine
 214  * whether to use the FP registers for a copy or not.  If we don't
 215  * use the FP registers, we can execute the copy as a leaf routine,
 216  * saving a register save and restore.  Also, less elaborate setup
 217  * is required, allowing short copies to be completed more quickly.
 218  * For longer copies, especially unaligned ones (where the src and
 219  * dst do not align to allow simple ldx,stx operation), the FP
 220  * registers allow much faster copy operations.
 221  *
 222  * The estimated extra cost of the FP path will vary depending on
 223  * src/dst alignment, dst offset from the next 64 byte FPblock store
 224  * boundary, remaining src data after the last full dst cache line is
 225  * moved whether the FP registers need to be saved, and some other
 226  * minor issues.  The average additional overhead is estimated to be
 227  * 400 clocks.  Since each non-repeated/predicted tst and branch costs
  228  * around 10 clocks, elaborate calculation would slow down all
 229  * longer copies and only benefit a small portion of medium sized
 230  * copies.  Rather than incur such cost, we chose fixed transition
 231  * points for each of the alignment choices.
 232  *
 233  * For the inner loop, here is a comparison of the per cache line
 234  * costs for each alignment when src&dst are in cache:
 235  * 
 236  * byte aligned:  108 clocks slower for non-FPBLK
 237  * half aligned:   44 clocks slower for non-FPBLK
 238  * word aligned:   12 clocks slower for non-FPBLK
 239  * long aligned:    4 clocks >>faster<< for non-FPBLK
 240  *
 241  * The long aligned loop runs faster because it does no prefetching.
 242  * That wins if the data is not in cache or there is too little
 243  * data to gain much benefit from prefetching.  But when there
 244  * is more data and that data is not in cache, failing to prefetch
 245  * can run much slower.  In addition, there is a 2 Kbyte store queue
 246  * which will cause the non-FPBLK inner loop to slow for larger copies.
 247  * The exact tradeoff is strongly load and application dependent, with
 248  * increasing risk of a customer visible performance regression if the
 249  * non-FPBLK code is used for larger copies. Studies of synthetic in-cache
 250  * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
 251  * upper limit for the non-FPBLK code.  To minimize performance regression
 252  * risk while still gaining the primary benefits of the improvements to 
 253  * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
 254  * hw_copy_limit_*.  Later experimental studies using different values 
 255  * of hw_copy_limit_* can be used to make further adjustments if 
 256  * appropriate.
 257  *
 258  * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
 259  * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
 260  * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
 261  * hw_copy_limit_8 = src and dst are longword aligned
 262  *
 263  * To say that src and dst are word aligned means that after
 264  * some initial alignment activity of moving 0 to 3 bytes,
 265  * both the src and dst will be on word boundaries so that
 266  * word loads and stores may be used.
 267  *
 268  * Recommended initial values as of Mar 2004, includes testing
 269  * on Cheetah+ (900MHz), Cheetah++ (1200MHz), and Jaguar(1050MHz):
 270  * hw_copy_limit_1 =  256
 271  * hw_copy_limit_2 =  512
 272  * hw_copy_limit_4 = 1024
 273  * hw_copy_limit_8 = 1024 (or 1536 on some systems)
 274  *
 275  *
 276  * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
 277  * disabled for that alignment choice.
 278  * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256)
 279  * the value of VIS_COPY_THRESHOLD is used.
 280  * It is not envisioned that hw_copy_limit_? will be changed in the field
 281  * It is provided to allow for disabling FPBLK copies and to allow
 282  * easy testing of alternate values on future HW implementations
 283  * that might have different cache sizes, clock rates or instruction
 284  * timing rules.
 285  *
 286  * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
 287  * threshold to speedup all shorter copies (less than 256).  That
 288  * saves an alignment test, memory reference, and enabling test
 289  * for all short copies, or an estimated 24 clocks.
 290  *
 291  * The order in which these limits are checked does matter since each
 292  * non-predicted tst and branch costs around 10 clocks.
 293  * If src and dst are randomly selected addresses,
 294  * 4 of 8 will not be alignable.
 295  * 2 of 8 will be half word alignable.
 296  * 1 of 8 will be word alignable.
 297  * 1 of 8 will be long word alignable.
 298  * But, tests on running kernels show that src and dst to copy code
 299  * are typically not on random alignments.  Structure copies and
 300  * copies of larger data sizes are often on long word boundaries.
 301  * So we test the long word alignment case first, then
 302  * the byte alignment, then halfword, then word alignment.
 303  *
 304  * Several times, tests for length are made to split the code
 305  * into subcases.  These tests often allow later tests to be
 306  * avoided.  For example, within the non-FPBLK copy, we first 
 307  * check for tiny copies of 3 bytes or less.  That allows us
 308  * to use a 4-way unrolled loop for the general byte copy case
 309  * without a test on loop entry.
 310  * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
 311  * vs longer cases.  For the really short case, we don't attempt
  312  * to align src and dst.  We try to minimize special case tests in
 313  * the shortest loops as each test adds a significant percentage
 314  * to the total time.
 315  *
 316  * For the medium sized cases, we allow ourselves to adjust the
 317  * src and dst alignment and provide special cases for each of
 318  * the four adjusted alignment cases. The CHKSIZE that was used
 319  * to decide between short and medium size was chosen to be 39
 320  * as that allows for the worst case of 7 bytes of alignment
 321  * shift and 4 times 8 bytes for the first long word unrolling.
 322  * That knowledge saves an initial test for length on entry into
 323  * the medium cases.  If the general loop unrolling factor were
  324  * to be increased, this number would also need to be adjusted.
 325  *
 326  * For all cases in the non-FPBLK code where it is known that at
 327  * least 4 chunks of data are available for movement, the
 328  * loop is unrolled by four.  This 4-way loop runs in 8 clocks
 329  * or 2 clocks per data element.  Due to limitations of the
 330  * branch instruction on Cheetah, Jaguar, and Panther, the
 331  * minimum time for a small, tight loop is 3 clocks.  So
 332  * the 4-way loop runs 50% faster than the fastest non-unrolled
 333  * loop.
 334  *
 335  * Instruction alignment is forced by used of .align 16 directives
 336  * and nops which are not executed in the code.  This
 337  * combination of operations shifts the alignment of following
 338  * loops to insure that loops are aligned so that their instructions
 339  * fall within the minimum number of 4 instruction fetch groups. 
 340  * If instructions are inserted or removed between the .align 
 341  * instruction and the unrolled loops, then the alignment needs
 342  * to be readjusted.  Misaligned loops can add a clock per loop
 343  * iteration to the loop timing.
 344  *
 345  * In a few cases, code is duplicated to avoid a branch.  Since
 346  * a non-predicted tst and branch takes 10 clocks, this savings
 347  * is judged an appropriate time-space tradeoff.
 348  *
 349  * Within the FPBLK-code, the prefetch method in the inner
 350  * loop needs to be explained as it is not standard.  Two 
 351  * prefetches are issued for each cache line instead of one.
 352  * The primary one is at the maximum reach of 8 cache lines.
 353  * Most of the time, that maximum prefetch reach gives the
 354  * cache line more time to reach the processor for systems with
 355  * higher processor clocks.  But, sometimes memory interference
 356  * can cause that prefetch to be dropped.  Putting a second
 357  * prefetch at a reach of 5 cache lines catches the drops
 358  * three iterations later and shows a measured improvement
 359  * in performance over any similar loop with a single prefetch.
 360  * The prefetches are placed in the loop so they overlap with 
 361  * non-memory instructions, so that there is no extra cost 
 362  * when the data is already in-cache.
 363  *
 364  */
 365 
 366 /*
 367  * Notes on preserving existing fp state and on membars.
 368  *
 369  * When a copyOP decides to use fp we may have to preserve existing
 370  * floating point state.  It is not the caller's state that we need to
 371  * preserve - the rest of the kernel does not use fp and, anyway, fp
 372  * registers are volatile across a call.  Some examples:
 373  *
 374  *      - userland has fp state and is interrupted (device interrupt 
 375  *        or trap) and within the interrupt/trap handling we use
 376  *        bcopy()
 377  *      - another (higher level) interrupt or trap handler uses bcopy
 378  *        while a bcopy from an earlier interrupt is still active
 379  *      - an asynchronous error trap occurs while fp state exists (in
 380  *        userland or in kernel copy) and the tl0 component of the handling
 381  *        uses bcopy
 382  *      - a user process with fp state incurs a copy-on-write fault and
 383  *        hwblkpagecopy always uses fp
 384  *
 385  * We therefore need a per-call place in which to preserve fp state -
 386  * using our stack is ideal (and since fp copy cannot be leaf optimized
 387  * because of calls it makes, this is no hardship).
 388  *
 389  * The following membar BLD/BST discussion is Cheetah pipeline specific.
 390  * In Cheetah BLD is blocking, #LoadLoad/#LoadStore/#StoreStore are
 391  * nops (those semantics always apply) and #StoreLoad is implemented
 392  * as a membar #Sync.
 393  *
 394  * It is possible that the owner of the fp state has a block load or
 395  * block store still "in flight" at the time we come to preserve that
 396  * state.  Block loads are blocking in Cheetah pipelines so we do not
 397  * need to sync with them.  In preserving fp regs we will use block stores
 398  * (which are not blocking in Cheetah pipelines) so we require a membar #Sync
 399  * after storing state (so that our subsequent use of those registers
 400  * does not modify them before the block stores complete);  this membar
 401  * also serves to sync with block stores the owner of the fp state has
 402  * initiated.
 403  *
  404  * When we have finished fp copy (with its repeated block stores)
 405  * we must membar #Sync so that our block stores may complete before
 406  * we either restore the original fp state into the fp registers or
 407  * return to a caller which may initiate other fp operations that could
 408  * modify the fp regs we used before the block stores complete.
 409  *
 410  * Synchronous faults (eg, unresolvable DMMU miss) that occur while
 411  * t_lofault is not NULL will not panic but will instead trampoline
 412  * to the registered lofault handler.  There is no need for any
 413  * membars for these - eg, our store to t_lofault will always be visible to
 414  * ourselves and it is our cpu which will take any trap.
 415  *
 416  * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
 417  * while t_lofault is not NULL will also not panic.  Since we're copying
 418  * to or from userland the extent of the damage is known - the destination
 419  * buffer is incomplete.  So trap handlers will trampoline to the lofault
 420  * handler in this case which should take some form of error action to
 421  * avoid using the incomplete buffer.  The trap handler also flags the
 422  * fault so that later return-from-trap handling (for the trap that brought
 423  * this thread into the kernel in the first place) can notify the process
 424  * and reboot the system (or restart the service with Greenline/Contracts).
 425  *
 426  * Asynchronous faults (eg, uncorrectable ECC error from memory) can
 427  * result in deferred error traps - the trap is taken sometime after
 428  * the event and the trap PC may not be the PC of the faulting access.
 429  * Delivery of such pending traps can be forced by a membar #Sync, acting
 430  * as an "error barrier" in this role.  To accurately apply the user/kernel
 431  * separation described in the preceding paragraph we must force delivery
 432  * of deferred traps affecting kernel state before we install a lofault
 433  * handler (if we interpose a new lofault handler on an existing one there
 434  * is no need to repeat this), and we must force delivery of deferred
 435  * errors affecting the lofault-protected region before we clear t_lofault.
 436  * Failure to do so results in lost kernel state being interpreted as
 437  * affecting a copyin/copyout only, or of an error that really only
 438  * affects copy data being interpreted as losing kernel state.
 439  *
 440  * Since the copy operations may preserve and later restore floating
 441  * point state that does not belong to the caller (see examples above),
 442  * we must be careful in how we do this in order to prevent corruption
 443  * of another program.
 444  *
 445  * To make sure that floating point state is always saved and restored
 446  * correctly, the following "big rules" must be followed when the floating
 447  * point registers will be used:
 448  *
 449  * 1. %l6 always holds the caller's lofault handler.  Also in this register,
 450  *    Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
 451  *    use.  Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a
 452  *    lofault handler was set coming in.
 453  *
 454  * 2. The FPUSED flag indicates that all FP state has been successfully stored
 455  *    on the stack.  It should not be set until this save has been completed.
 456  *
 457  * 3. The FPUSED flag should not be cleared on exit until all FP state has
 458  *    been restored from the stack.  If an error occurs while restoring
 459  *    data from the stack, the error handler can check this flag to see if
 460  *    a restore is necessary.
 461  *
 462  * 4. Code run under the new lofault handler must be kept to a minimum.  In
 463  *    particular, any calls to FP_ALLOWMIGRATE, which could result in a call
 464  *    to kpreempt(), should not be made until after the lofault handler has
 465  *    been restored.
 466  */
 467 
 468 /*
 469  * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
 470  * to "break even" using FP/VIS-accelerated memory operations.
 471  * The FPBLK code assumes a minimum number of bytes are available
 472  * to be moved on entry.  Check that code carefully before 
 473  * reducing VIS_COPY_THRESHOLD below 256.
 474  */
 475 /*
 476  * This shadows sys/machsystm.h which can't be included due to the lack of
 477  * _ASM guards in include files it references. Change it here, change it there.
 478  */
/* Minimum copy length (bytes) below which the FP/VIS path is never used */
  479 #define VIS_COPY_THRESHOLD 256
 480 
 481 /*
 482  * TEST for very short copies
 483  * Be aware that the maximum unroll for the short unaligned case
 484  * is SHORTCOPY+1
 485  */
/* Copies of SHORTCOPY bytes or fewer take the tiny-copy finish-up path */
  486 #define SHORTCOPY 3
/* Copies longer than CHKSIZE bytes take the medium-size aligned paths */
  487 #define CHKSIZE  39
 488 
 489 /*
 490  * Indicates that we're to trampoline to the error handler.
 491  * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
 492  * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
 493  */
  494 #define FPUSED_FLAG     1       /* FP state saved on stack; must restore */
  495 #define TRAMP_FLAG      2       /* trampoline to caller's lofault handler */
  496 #define MASK_FLAGS      3       /* FPUSED_FLAG | TRAMP_FLAG */
 497 
 498 /*
 499  * Number of outstanding prefetches.
 500  * Testing with 1200 MHz Cheetah+ and Jaguar gives best results with
 501  * two prefetches, one with a reach of 8*BLOCK_SIZE+8 and one with a
  502  * reach of 5*BLOCK_SIZE.  The double prefetch gives a typical improvement
 503  * of 5% for large copies as compared to a single prefetch.  The reason
 504  * for the improvement is that with Cheetah and Jaguar, some prefetches
 505  * are dropped due to the prefetch queue being full.  The second prefetch
 506  * reduces the number of cache lines that are dropped. 
 507  * Do not remove the double prefetch or change either CHEETAH_PREFETCH
 508  * or CHEETAH_2ND_PREFETCH without extensive performance tests to prove
 509  * there is no loss of performance.
 510  */
  511 #define CHEETAH_PREFETCH        8       /* primary reach, in cache lines */
  512 #define CHEETAH_2ND_PREFETCH    5       /* backup reach, in cache lines */
  513 
  514 #define VIS_BLOCKSIZE           64      /* bytes per VIS block load/store */
 515 
 516 /*
  517  * Size of stack frame in order to accommodate a 64-byte aligned
 518  * floating-point register save area and 2 64-bit temp locations.
 519  * All copy functions use two quadrants of fp registers; to assure a
 520  * block-aligned two block buffer in which to save we must reserve
  521  * three blocks on stack.  Not all functions preserve %fprs on stack
 522  * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
 523  *
 524  *    _______________________________________ <-- %fp + STACK_BIAS
 525  *    | We may need to preserve 2 quadrants |
 526  *    | of fp regs, but since we do so with |
 527  *    | BST/BLD we need room in which to    |
 528  *    | align to VIS_BLOCKSIZE bytes.  So   |
 529  *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
 530  *    |-------------------------------------|
 531  *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
 532  *    |-------------------------------------|
 533  *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
 534  *    ---------------------------------------
 535  */
/* Frame: 3 VIS blocks of fp save area + 8 bytes for %fprs + 8 for %gsr */
  536 #define HWCOPYFRAMESIZE         ((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
/* Offsets are subtracted from %fp + STACK_BIAS (see diagram above) */
  537 #define SAVED_FPREGS_OFFSET     (VIS_BLOCKSIZE * 3)
/* Subtracted before aligning down to VIS_BLOCKSIZE in the BST/BLD macros */
  538 #define SAVED_FPREGS_ADJUST     ((VIS_BLOCKSIZE * 2) - 1)
  539 #define SAVED_FPRS_OFFSET       (SAVED_FPREGS_OFFSET + 8)
  540 #define SAVED_GSR_OFFSET        (SAVED_FPRS_OFFSET + 8)
 541 
 542 /*
 543  * Common macros used by the various versions of the block copy
 544  * routines in this file.
 545  */
 546 
 547 /*
 548  * In FP copies if we do not have preserved data to restore over
 549  * the fp regs we used then we must zero those regs to avoid
 550  * exposing portions of the data to later threads (data security).
 551  *
 552  * Copy functions use either quadrants 1 and 3 or 2 and 4.
 553  *
 554  * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
 555  * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
 556  *
 557  * The instructions below are quicker than repeated fzero instructions
 558  * since they can dispatch down two fp pipelines.
 559  */
/*
 * Zero quadrants 1 and 3 (%f0 - %f15 and %f32 - %f47).  Only %f0 and
 * %f2 are zeroed directly with fzero; the remaining even-numbered
 * double registers receive 0+0 (faddd) and 0*0 (fmuld) results so the
 * work can dispatch down both fp pipelines (see note above).
 */
  560 #define FZEROQ1Q3                       \
  561         fzero   %f0                     ;\
  562         fzero   %f2                     ;\
  563         faddd   %f0, %f2, %f4           ;\
  564         fmuld   %f0, %f2, %f6           ;\
  565         faddd   %f0, %f2, %f8           ;\
  566         fmuld   %f0, %f2, %f10          ;\
  567         faddd   %f0, %f2, %f12          ;\
  568         fmuld   %f0, %f2, %f14          ;\
  569         faddd   %f0, %f2, %f32          ;\
  570         fmuld   %f0, %f2, %f34          ;\
  571         faddd   %f0, %f2, %f36          ;\
  572         fmuld   %f0, %f2, %f38          ;\
  573         faddd   %f0, %f2, %f40          ;\
  574         fmuld   %f0, %f2, %f42          ;\
  575         faddd   %f0, %f2, %f44          ;\
  576         fmuld   %f0, %f2, %f46
 577 
/*
 * Zero quadrants 2 and 4 (%f16 - %f31 and %f48 - %f63).  Same scheme
 * as FZEROQ1Q3: %f16 and %f18 are zeroed with fzero, and the rest are
 * filled via faddd/fmuld of those zeros to use both fp pipelines.
 */
  578 #define FZEROQ2Q4                       \
  579         fzero   %f16                    ;\
  580         fzero   %f18                    ;\
  581         faddd   %f16, %f18, %f20        ;\
  582         fmuld   %f16, %f18, %f22        ;\
  583         faddd   %f16, %f18, %f24        ;\
  584         fmuld   %f16, %f18, %f26        ;\
  585         faddd   %f16, %f18, %f28        ;\
  586         fmuld   %f16, %f18, %f30        ;\
  587         faddd   %f16, %f18, %f48        ;\
  588         fmuld   %f16, %f18, %f50        ;\
  589         faddd   %f16, %f18, %f52        ;\
  590         fmuld   %f16, %f18, %f54        ;\
  591         faddd   %f16, %f18, %f56        ;\
  592         fmuld   %f16, %f18, %f58        ;\
  593         faddd   %f16, %f18, %f60        ;\
  594         fmuld   %f16, %f18, %f62
 595 
 596 /*
 597  * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
 598  * Used to save and restore in-use fp registers when we want to use FP
 599  * and find fp already in use and copy size still large enough to justify
 600  * the additional overhead of this save and restore.
 601  *
 602  * A membar #Sync is needed before save to sync fp ops initiated before
 603  * the call to the copy function (by whoever has fp in use); for example
 604  * an earlier block load to the quadrant we are about to save may still be
 605  * "in flight".  A membar #Sync is required at the end of the save to
 606  * sync our block store (the copy code is about to begin ldd's to the
 607  * first quadrant).  Note, however, that since Cheetah pipeline block load
 608  * is blocking we can omit the initial membar before saving fp state (they're
 609  * commented below in case of future porting to a chip that does not block
 610  * on block load).
 611  *
 612  * Similarly: a membar #Sync before restore allows the block stores of
 613  * the copy operation to complete before we fill the quadrants with their
 614  * original data, and a membar #Sync after restore lets the block loads
 615  * of the restore complete before we return to whoever has the fp regs
 616  * in use.  To avoid repeated membar #Sync we make it the responsibility
 617  * of the copy code to membar #Sync immediately after copy is complete
 618  * and before using the BLD_*_FROMSTACK macro.
 619  */
/*
 * Block-store quadrants 1 and 3 (%f0 - %f15 and %f32 - %f47) to the
 * VIS_BLOCKSIZE-aligned save area on the stack.  The trailing
 * membar #Sync waits for our (non-blocking) block stores to complete
 * and also syncs with block stores initiated by the fp state's owner
 * (see discussion above).  tmp1 is clobbered.
 */
  620 #define BST_FPQ1Q3_TOSTACK(tmp1)                                \
  621         /* membar #Sync */                                      ;\
  622         add     %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1     ;\
  623         and     tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */    ;\
  624         stda    %f0, [tmp1]ASI_BLK_P                            ;\
  625         add     tmp1, VIS_BLOCKSIZE, tmp1                       ;\
  626         stda    %f32, [tmp1]ASI_BLK_P                           ;\
  627         membar  #Sync
 628 
/*
 * Reload quadrants 1 and 3 (%f0 - %f15 and %f32 - %f47) from the stack
 * save area.  The copy code is responsible for a membar #Sync before
 * invoking this macro; the trailing membar #Sync lets the block loads
 * complete before the fp regs are used again.  tmp1 is clobbered.
 */
  629 #define BLD_FPQ1Q3_FROMSTACK(tmp1)                              \
  630         /* membar #Sync - provided at copy completion */        ;\
  631         add     %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1     ;\
  632         and     tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */    ;\
  633         ldda    [tmp1]ASI_BLK_P, %f0                            ;\
  634         add     tmp1, VIS_BLOCKSIZE, tmp1                       ;\
  635         ldda    [tmp1]ASI_BLK_P, %f32                           ;\
  636         membar  #Sync
 637 
/*
 * Block-store quadrants 2 and 4 (%f16 - %f31 and %f48 - %f63) to the
 * VIS_BLOCKSIZE-aligned save area on the stack.  The trailing
 * membar #Sync waits for our (non-blocking) block stores to complete
 * and also syncs with block stores initiated by the fp state's owner
 * (see discussion above).  tmp1 is clobbered.
 */
  638 #define BST_FPQ2Q4_TOSTACK(tmp1)                                \
  639         /* membar #Sync */                                      ;\
  640         add     %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1     ;\
  641         and     tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */    ;\
  642         stda    %f16, [tmp1]ASI_BLK_P                           ;\
  643         add     tmp1, VIS_BLOCKSIZE, tmp1                       ;\
  644         stda    %f48, [tmp1]ASI_BLK_P                           ;\
  645         membar  #Sync
 646 
/*
 * BLD_FPQ2Q4_FROMSTACK: as BLD_FPQ1Q3_FROMSTACK, but reloads FP
 * quadrants 2 and 4 (%f16-%f30 and %f48-%f62).  Caller provides the
 * membar #Sync at copy completion.
 */
#define BLD_FPQ2Q4_FROMSTACK(tmp1)                              \
        /* membar #Sync - provided at copy completion */        ;\
        add     %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1     ;\
        and     tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */    ;\
        ldda    [tmp1]ASI_BLK_P, %f16                           ;\
        add     tmp1, VIS_BLOCKSIZE, tmp1                       ;\
        ldda    [tmp1]ASI_BLK_P, %f48                           ;\
        membar  #Sync
 655 
 656 /*
 657  * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
 658  * prevent preemption if there is no t_lwp to save FP state to on context
 659  * switch) before commencing a FP copy, and reallow it on completion or
 660  * in error trampoline paths when we were using FP copy.
 661  *
 662  * Both macros may call other functions, so be aware that all outputs are
 663  * forfeit after using these macros.  For this reason we do not pass registers
 664  * to use - we just use any outputs we want.
 665  *
 666  * For fpRAS we need to perform the fpRAS mechanism test on the same
 667  * CPU as we use for the copy operation, both so that we validate the
 668  * CPU we perform the copy on and so that we know which CPU failed
 669  * if a failure is detected.  Hence we need to be bound to "our" CPU.
 * This could be achieved through disabling preemption (and we have to do it that
 671  * way for threads with no t_lwp) but for larger copies this may hold
 672  * higher priority threads off of cpu for too long (eg, realtime).  So we
 673  * make use of the lightweight t_nomigrate mechanism where we can (ie, when
 674  * we have a t_lwp).
 675  *
 676  * Pseudo code:
 677  *
 678  * FP_NOMIGRATE:
 679  *
 680  * if (curthread->t_lwp) {
 681  *      thread_nomigrate();
 682  * } else {
 683  *      kpreempt_disable();
 684  * }
 685  *
 686  * FP_ALLOWMIGRATE:
 687  *
 688  * if (curthread->t_lwp) {
 689  *      thread_allowmigrate();
 690  * } else {
 691  *      kpreempt_enable();
 692  * }
 693  */
 694 
/*
 * FP_NOMIGRATE: if the thread has an lwp, call thread_nomigrate() to
 * pin it to this CPU; otherwise disable preemption by bumping
 * t_preempt directly (no lwp means nowhere to save FP state across a
 * context switch).  Clobbers %o0/%o1 plus whatever thread_nomigrate
 * uses (see block comment above: all outputs are forfeit).
 * label1/label2 must be numeric local labels unused nearby.
 */
#define FP_NOMIGRATE(label1, label2)                            \
        ldn     [THREAD_REG + T_LWP], %o0                       ;\
        brz,a,pn %o0, label1/**/f                               ;\
          ldsb  [THREAD_REG + T_PREEMPT], %o1                   ;\
        call    thread_nomigrate                                ;\
          nop                                                   ;\
        ba      label2/**/f                                     ;\
          nop                                                   ;\
label1:                                                         ;\
        inc     %o1                                             ;\
        stb     %o1, [THREAD_REG + T_PREEMPT]                   ;\
label2:
 707 
/*
 * FP_ALLOWMIGRATE: undo FP_NOMIGRATE.  With an lwp, call
 * thread_allowmigrate(); otherwise decrement t_preempt and, if it
 * drops to zero while a kernel preemption is pending (cpu_kprunrun
 * set), call kpreempt() at the current %pil.  Clobbers %o0/%o1 plus
 * callee-used registers; all outputs are forfeit (see comment above).
 * label1/label2 must be numeric local labels unused nearby.
 */
#define FP_ALLOWMIGRATE(label1, label2)                 \
        ldn     [THREAD_REG + T_LWP], %o0                       ;\
        brz,a,pn %o0, label1/**/f                               ;\
          ldsb  [THREAD_REG + T_PREEMPT], %o1                   ;\
        call thread_allowmigrate                                ;\
          nop                                                   ;\
        ba      label2/**/f                                     ;\
          nop                                                   ;\
label1:                                                         ;\
        dec     %o1                                             ;\
        brnz,pn %o1, label2/**/f                                ;\
          stb   %o1, [THREAD_REG + T_PREEMPT]                   ;\
        ldn     [THREAD_REG + T_CPU], %o0                       ;\
        ldub    [%o0 + CPU_KPRUNRUN], %o0                       ;\
        brz,pt  %o0, label2/**/f                                ;\
          nop                                                   ;\
        call    kpreempt                                        ;\
          rdpr  %pil, %o0                                       ;\
label2:
 727 
 728 /*
 729  * Copy a block of storage, returning an error code if `from' or
 730  * `to' takes a kernel pagefault which cannot be resolved.
 731  * Returns errno value on pagefault error, 0 if all ok
 732  */
 733 
 734         .seg    ".text"
 735         .align  4
 736 
        ENTRY(kcopy)

        ! kcopy(from = %o0, to = %o1, count = %o2): copy with fault
        ! protection; returns 0 or errno in %o0.  Dispatch on size and
        ! on the common alignment of src/dst (low bits of src^dst).
        cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
        bleu,pt %ncc, .kcopy_small              ! go to small copy
          xor   %o0, %o1, %o3                   ! are src, dst alignable?
        btst    7, %o3                          ! src^dst gives common alignment
        bz,pt   %ncc, .kcopy_8                  ! check for longword alignment
          nop
        btst    1, %o3                          ! odd bit set => byte alignment only
        bz,pt   %ncc, .kcopy_2                  ! check for half-word
          nop
        sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_1)], %o3
        tst     %o3
        bz,pn   %icc, .kcopy_small              ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .kcopy_small              ! go to small copy
          nop
        ba,pt   %ncc, .kcopy_more               ! otherwise go to large copy
          nop
.kcopy_2:
        btst    3, %o3                          ! word-alignable?
        bz,pt   %ncc, .kcopy_4                  ! check for word alignment
          nop
        sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_2)], %o3
        tst     %o3
        bz,pn   %icc, .kcopy_small              ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .kcopy_small              ! go to small copy
          nop
        ba,pt   %ncc, .kcopy_more               ! otherwise go to large copy
          nop
.kcopy_4:
        ! already checked longword, must be word aligned
        sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_4)], %o3
        tst     %o3
        bz,pn   %icc, .kcopy_small              ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .kcopy_small              ! go to small copy
          nop
        ba,pt   %ncc, .kcopy_more               ! otherwise go to large copy
          nop
.kcopy_8:
        sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_8)], %o3
        tst     %o3
        bz,pn   %icc, .kcopy_small              ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .kcopy_small              ! go to small copy
          nop
        ba,pt   %ncc, .kcopy_more               ! otherwise go to large copy
          nop

.kcopy_small:
        ! leaf (non-FP) copy; always install .sm_copyerr as the fault
        ! handler, saving the previous one in %o4
        sethi   %hi(.sm_copyerr), %o5           ! sm_copyerr is lofault value
        or      %o5, %lo(.sm_copyerr), %o5
        ldn     [THREAD_REG + T_LOFAULT], %o4   ! save existing handler
        membar  #Sync                           ! sync error barrier
        ba,pt   %ncc, .sm_do_copy               ! common code
         stn    %o5, [THREAD_REG + T_LOFAULT]   ! set t_lofault

.kcopy_more:
        ! FP (block) copy; get a register window and install .copyerr
        save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
        sethi   %hi(.copyerr), %l7              ! copyerr is lofault value
        or      %l7, %lo(.copyerr), %l7
        ldn     [THREAD_REG + T_LOFAULT], %l6   ! save existing handler
        membar  #Sync                           ! sync error barrier
        ba,pt   %ncc, .do_copy                  ! common code
          stn   %l7, [THREAD_REG + T_LOFAULT]   ! set t_lofault


/*
 * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
 * Errno value is in %g1.  bcopy_more uses fp quadrants 1 and 3.
 */
.copyerr:
        ! protect the fp-restore sequence itself with .copyerr2
        set     .copyerr2, %l0
        membar  #Sync                           ! sync error barrier
        stn     %l0, [THREAD_REG + T_LOFAULT]   ! set t_lofault
        btst    FPUSED_FLAG, %l6                ! were fp regs in use?
        bz      %ncc, 1f                        ! no: skip fp state restore
          and   %l6, TRAMP_FLAG, %l0            ! copy trampoline flag to %l0

        ldx     [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! restore gsr
        wr      %o2, 0, %gsr

        ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
        btst    FPRS_FEF, %o3                   ! was fp enabled on entry?
        bz,pt   %icc, 4f
          nop

        ! fp was live on entry: reload caller's quadrants 1 and 3
        BLD_FPQ1Q3_FROMSTACK(%o2)

        ba,pt   %ncc, 1f
          wr    %o3, 0, %fprs           ! restore fprs

4:
        ! fp was not live on entry: just zero the quadrants we dirtied
        FZEROQ1Q3
        wr      %o3, 0, %fprs           ! restore fprs

        !
        ! Need to cater for the different expectations of kcopy
        ! and bcopy. kcopy will *always* set a t_lofault handler
        ! If it fires, we're expected to just return the error code
        ! and *not* to invoke any existing error handler. As far as
        ! bcopy is concerned, we only set t_lofault if there was an
        ! existing lofault handler. In that case we're expected to
        ! invoke the previously existing handler after resetting the
        ! t_lofault value.
        !
1:
        andn    %l6, MASK_FLAGS, %l6            ! turn trampoline flag off
        membar  #Sync                           ! sync error barrier
        stn     %l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        FP_ALLOWMIGRATE(5, 6)

        btst    TRAMP_FLAG, %l0                 ! need to trampoline (bcopy)?
        bnz,pn  %ncc, 3f
          nop
        ret
          restore       %g1, 0, %o0             ! kcopy: return errno

3:
        !
        ! We're here via bcopy. There *must* have been an error handler
        ! in place otherwise we would have died a nasty death already.
        !
        jmp     %l6                             ! goto real handler
          restore       %g0, 0, %o0             ! dispose of copy window

/*
 * We got here because of a fault in .copyerr.  We can't safely restore fp
 * state, so we panic.
 */
fp_panic_msg:
        .asciz  "Unable to restore fp state after copy operation"

        .align  4
.copyerr2:
        set     fp_panic_msg, %o0
        call    panic
          nop

/*
 * We got here because of a fault during a small kcopy or bcopy.
 * No floating point registers are used by the small copies.
 * Errno value is in %g1.
 */
.sm_copyerr:
1:
        btst    TRAMP_FLAG, %o4                 ! came from bcopy w/ prior handler?
        membar  #Sync
        andn    %o4, TRAMP_FLAG, %o4
        bnz,pn  %ncc, 3f
          stn   %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        retl
          mov   %g1, %o0                        ! kcopy: return errno
3:
        jmp     %o4                             ! goto real handler
          mov   %g0, %o0                        ! clear return value

        SET_SIZE(kcopy)
 901 
 902 
 903 /*
 904  * Copy a block of storage - must not overlap (from + len <= to).
 905  * Registers: l6 - saved t_lofault
 906  * (for short copies, o4 - saved t_lofault)
 907  *
 908  * Copy a page of memory.
 909  * Assumes double word alignment and a count >= 256.
 910  */
 911 
        ENTRY(bcopy)

        ! bcopy(from = %o0, to = %o1, count = %o2): same size/alignment
        ! dispatch as kcopy, but error handling trampolines to any
        ! pre-existing t_lofault handler rather than returning errno.
        cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
        bleu,pt %ncc, .bcopy_small              ! go to small copy
          xor   %o0, %o1, %o3                   ! are src, dst alignable?
        btst    7, %o3                          ! src^dst gives common alignment
        bz,pt   %ncc, .bcopy_8                  ! check for longword alignment
          nop
        btst    1, %o3                          ! odd bit set => byte alignment only
        bz,pt   %ncc, .bcopy_2                  ! check for half-word
          nop
        sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_1)], %o3
        tst     %o3
        bz,pn   %icc, .bcopy_small              ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .bcopy_small              ! go to small copy
          nop
        ba,pt   %ncc, .bcopy_more               ! otherwise go to large copy
          nop
.bcopy_2:
        btst    3, %o3                          ! word-alignable?
        bz,pt   %ncc, .bcopy_4                  ! check for word alignment
          nop
        sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_2)], %o3
        tst     %o3
        bz,pn   %icc, .bcopy_small              ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .bcopy_small              ! go to small copy
          nop
        ba,pt   %ncc, .bcopy_more               ! otherwise go to large copy
          nop
.bcopy_4:
        ! already checked longword, must be word aligned
        sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_4)], %o3
        tst     %o3
        bz,pn   %icc, .bcopy_small              ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .bcopy_small              ! go to small copy
          nop
        ba,pt   %ncc, .bcopy_more               ! otherwise go to large copy
          nop
.bcopy_8:
        sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_8)], %o3
        tst     %o3
        bz,pn   %icc, .bcopy_small              ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .bcopy_small              ! go to small copy
          nop
        ba,pt   %ncc, .bcopy_more               ! otherwise go to large copy
          nop

        .align  16
.bcopy_small:
        ldn     [THREAD_REG + T_LOFAULT], %o4   ! save t_lofault
        tst     %o4
        bz,pt   %icc, .sm_do_copy               ! no handler installed: run bare
          nop
        sethi   %hi(.sm_copyerr), %o5
        or      %o5, %lo(.sm_copyerr), %o5
        membar  #Sync                           ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! install new vector
        or      %o4, TRAMP_FLAG, %o4            ! error should trampoline
.sm_do_copy:
        cmp     %o2, SHORTCOPY          ! check for really short case
        bleu,pt %ncc, .bc_sm_left       ! SHORTCOPY or fewer: move bytes singly
          cmp   %o2, CHKSIZE            ! check for medium length cases
        bgu,pn  %ncc, .bc_med           ! larger than CHKSIZE: medium path
          or    %o0, %o1, %o3           ! prepare alignment check
        andcc   %o3, 0x3, %g0           ! test for alignment
        bz,pt   %ncc, .bc_sm_word       ! branch to word aligned case
.bc_sm_movebytes:
          sub   %o2, 3, %o2             ! adjust count to allow cc zero test
.bc_sm_notalign4:
        ! unrolled byte loop: 4 bytes per iteration
        ldub    [%o0], %o3              ! read byte
        stb     %o3, [%o1]              ! write byte
        subcc   %o2, 4, %o2             ! reduce count by 4
        ldub    [%o0 + 1], %o3          ! repeat for a total of 4 bytes
        add     %o0, 4, %o0             ! advance SRC by 4
        stb     %o3, [%o1 + 1]
        ldub    [%o0 - 2], %o3
        add     %o1, 4, %o1             ! advance DST by 4
        stb     %o3, [%o1 - 2]
        ldub    [%o0 - 1], %o3
        bgt,pt  %ncc, .bc_sm_notalign4  ! loop til 3 or fewer bytes remain
          stb   %o3, [%o1 - 1]
        add     %o2, 3, %o2             ! restore count
.bc_sm_left:
        ! move the final 0-3 bytes
        tst     %o2
        bz,pt   %ncc, .bc_sm_exit       ! check for zero length
          deccc %o2                     ! reduce count for cc test
        ldub    [%o0], %o3              ! move one byte
        bz,pt   %ncc, .bc_sm_exit
          stb   %o3, [%o1]
        ldub    [%o0 + 1], %o3          ! move another byte
        deccc   %o2                     ! check for more
        bz,pt   %ncc, .bc_sm_exit
          stb   %o3, [%o1 + 1]
        ldub    [%o0 + 2], %o3          ! move final byte
        stb     %o3, [%o1 + 2]
        membar  #Sync                           ! sync error barrier
        andn    %o4, TRAMP_FLAG, %o4
        stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        retl
          mov   %g0, %o0                ! return 0
        .align  16
        nop                             ! instruction alignment
                                        ! see discussion at start of file
.bc_sm_words:
        lduw    [%o0], %o3              ! read word
.bc_sm_wordx:
        subcc   %o2, 8, %o2             ! update count
        stw     %o3, [%o1]              ! write word
        add     %o0, 8, %o0             ! update SRC
        lduw    [%o0 - 4], %o3          ! read word
        add     %o1, 8, %o1             ! update DST
        bgt,pt  %ncc, .bc_sm_words      ! loop til done
          stw   %o3, [%o1 - 4]          ! write word
        addcc   %o2, 7, %o2             ! restore count
        bz,pt   %ncc, .bc_sm_exit
          deccc %o2
        bz,pt   %ncc, .bc_sm_byte
.bc_sm_half:
          subcc %o2, 2, %o2             ! reduce count by 2
        add     %o0, 2, %o0             ! advance SRC by 2
        lduh    [%o0 - 2], %o3          ! read half word
        add     %o1, 2, %o1             ! advance DST by 2
        bgt,pt  %ncc, .bc_sm_half       ! loop til done
          sth   %o3, [%o1 - 2]          ! write half word
        addcc   %o2, 1, %o2             ! restore count
        bz,pt   %ncc, .bc_sm_exit
          nop
.bc_sm_byte:
        ! exactly one byte remains
        ldub    [%o0], %o3
        stb     %o3, [%o1]
        membar  #Sync                           ! sync error barrier
        andn    %o4, TRAMP_FLAG, %o4
        stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        retl
          mov   %g0, %o0                ! return 0

.bc_sm_word:
        subcc   %o2, 4, %o2             ! update count
        bgt,pt  %ncc, .bc_sm_wordx
          lduw  [%o0], %o3              ! read word
        addcc   %o2, 3, %o2             ! restore count
        bz,pt   %ncc, .bc_sm_exit
          stw   %o3, [%o1]              ! write word
        deccc   %o2                     ! reduce count for cc test
        ldub    [%o0 + 4], %o3          ! load one byte
        bz,pt   %ncc, .bc_sm_exit
          stb   %o3, [%o1 + 4]          ! store one byte
        ldub    [%o0 + 5], %o3          ! load second byte
        deccc   %o2
        bz,pt   %ncc, .bc_sm_exit
          stb   %o3, [%o1 + 5]          ! store second byte
        ldub    [%o0 + 6], %o3          ! load third byte
        stb     %o3, [%o1 + 6]          ! store third byte
.bc_sm_exit:
        ! common success exit: restore t_lofault, return 0
        membar  #Sync                           ! sync error barrier
        andn    %o4, TRAMP_FLAG, %o4
        stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        retl
          mov   %g0, %o0                ! return 0

        .align 16
.bc_med:
        ! medium-size copy: pick the widest transfer the combined
        ! src/dst alignment allows
        xor     %o0, %o1, %o3           ! setup alignment check
        btst    1, %o3
        bnz,pt  %ncc, .bc_sm_movebytes  ! unaligned
          nop
        btst    3, %o3
        bnz,pt  %ncc, .bc_med_half      ! halfword aligned
          nop
        btst    7, %o3
        bnz,pt  %ncc, .bc_med_word      ! word aligned
          nop
.bc_med_long:
        btst    3, %o0                  ! check for
        bz,pt   %ncc, .bc_med_long1     ! word alignment
          nop
.bc_med_long0:
        ! move bytes until SRC is word aligned
        ldub    [%o0], %o3              ! load one byte
        inc     %o0
        stb     %o3,[%o1]               ! store byte
        inc     %o1
        btst    3, %o0
        bnz,pt  %ncc, .bc_med_long0
          dec   %o2
.bc_med_long1:                  ! word aligned
        btst    7, %o0                  ! check for long word
        bz,pt   %ncc, .bc_med_long2
          nop
        lduw    [%o0], %o3              ! load word
        add     %o0, 4, %o0             ! advance SRC by 4
        stw     %o3, [%o1]              ! store word
        add     %o1, 4, %o1             ! advance DST by 4
        sub     %o2, 4, %o2             ! reduce count by 4
!
!  Now long word aligned and have at least 32 bytes to move
!
.bc_med_long2:
        sub     %o2, 31, %o2            ! adjust count to allow cc zero test
.bc_med_lmove:
        ldx     [%o0], %o3              ! read long word
        stx     %o3, [%o1]              ! write long word
        subcc   %o2, 32, %o2            ! reduce count by 32
        ldx     [%o0 + 8], %o3          ! repeat for a total of 4 long words
        add     %o0, 32, %o0            ! advance SRC by 32
        stx     %o3, [%o1 + 8]
        ldx     [%o0 - 16], %o3
        add     %o1, 32, %o1            ! advance DST by 32
        stx     %o3, [%o1 - 16]
        ldx     [%o0 - 8], %o3
        bgt,pt  %ncc, .bc_med_lmove     ! loop til 31 or fewer bytes left
          stx   %o3, [%o1 - 8]
        addcc   %o2, 24, %o2            ! restore count to long word offset
        ble,pt  %ncc, .bc_med_lextra    ! check for more long words to move
          nop
.bc_med_lword:
        ldx     [%o0], %o3              ! read long word
        subcc   %o2, 8, %o2             ! reduce count by 8
        stx     %o3, [%o1]              ! write long word
        add     %o0, 8, %o0             ! advance SRC by 8
        bgt,pt  %ncc, .bc_med_lword     ! loop til 7 or fewer bytes left
          add   %o1, 8, %o1             ! advance DST by 8
.bc_med_lextra:
        addcc   %o2, 7, %o2             ! restore rest of count
        bz,pt   %ncc, .bc_sm_exit       ! if zero, then done
          deccc %o2
        bz,pt   %ncc, .bc_sm_byte
          nop
        ba,pt   %ncc, .bc_sm_half
          nop

        .align 16
.bc_med_word:
        btst    3, %o0                  ! check for
        bz,pt   %ncc, .bc_med_word1     ! word alignment
          nop
.bc_med_word0:
        ! move bytes until SRC is word aligned
        ldub    [%o0], %o3              ! load one byte
        inc     %o0
        stb     %o3,[%o1]               ! store byte
        inc     %o1
        btst    3, %o0
        bnz,pt  %ncc, .bc_med_word0
          dec   %o2
!
!  Now word aligned and have at least 36 bytes to move
!
.bc_med_word1:
        sub     %o2, 15, %o2            ! adjust count to allow cc zero test
.bc_med_wmove:
        lduw    [%o0], %o3              ! read word
        stw     %o3, [%o1]              ! write word
        subcc   %o2, 16, %o2            ! reduce count by 16
        lduw    [%o0 + 4], %o3          ! repeat for a total of 4 words
        add     %o0, 16, %o0            ! advance SRC by 16
        stw     %o3, [%o1 + 4]
        lduw    [%o0 - 8], %o3
        add     %o1, 16, %o1            ! advance DST by 16
        stw     %o3, [%o1 - 8]
        lduw    [%o0 - 4], %o3
        bgt,pt  %ncc, .bc_med_wmove     ! loop til 15 or fewer bytes left
          stw   %o3, [%o1 - 4]
        addcc   %o2, 12, %o2            ! restore count to word offset
        ble,pt  %ncc, .bc_med_wextra    ! check for more words to move
          nop
.bc_med_word2:
        lduw    [%o0], %o3              ! read word
        subcc   %o2, 4, %o2             ! reduce count by 4
        stw     %o3, [%o1]              ! write word
        add     %o0, 4, %o0             ! advance SRC by 4
        bgt,pt  %ncc, .bc_med_word2     ! loop til 3 or fewer bytes left
          add   %o1, 4, %o1             ! advance DST by 4
.bc_med_wextra:
        addcc   %o2, 3, %o2             ! restore rest of count
        bz,pt   %ncc, .bc_sm_exit       ! if zero, then done
          deccc %o2
        bz,pt   %ncc, .bc_sm_byte
          nop
        ba,pt   %ncc, .bc_sm_half
          nop

        .align 16
.bc_med_half:
        btst    1, %o0                  ! check for
        bz,pt   %ncc, .bc_med_half1     ! half word alignment
          nop
        ldub    [%o0], %o3              ! load one byte
        inc     %o0
        stb     %o3,[%o1]               ! store byte
        inc     %o1
        dec     %o2
!
!  Now half word aligned and have at least 38 bytes to move
!
.bc_med_half1:
        sub     %o2, 7, %o2             ! adjust count to allow cc zero test
.bc_med_hmove:
        lduh    [%o0], %o3              ! read half word
        sth     %o3, [%o1]              ! write half word
        subcc   %o2, 8, %o2             ! reduce count by 8
        lduh    [%o0 + 2], %o3          ! repeat for a total of 4 halfwords
        add     %o0, 8, %o0             ! advance SRC by 8
        sth     %o3, [%o1 + 2]
        lduh    [%o0 - 4], %o3
        add     %o1, 8, %o1             ! advance DST by 8
        sth     %o3, [%o1 - 4]
        lduh    [%o0 - 2], %o3
        bgt,pt  %ncc, .bc_med_hmove     ! loop til 7 or fewer bytes left
          sth   %o3, [%o1 - 2]
        addcc   %o2, 7, %o2             ! restore count
        bz,pt   %ncc, .bc_sm_exit
          deccc %o2
        bz,pt   %ncc, .bc_sm_byte
          nop
        ba,pt   %ncc, .bc_sm_half
          nop

        SET_SIZE(bcopy)
1237 
1238 /*
1239  * The _more entry points are not intended to be used directly by
1240  * any caller from outside this file.  They are provided to allow
1241  * profiling and dtrace of the portions of the copy code that uses
1242  * the floating point registers.
1243  * This entry is particularly important as DTRACE (at least as of
1244  * 4/2004) does not support leaf functions.
1245  */
1246 
1247         ENTRY(bcopy_more)
1248 .bcopy_more:            
1249         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1250         ldn     [THREAD_REG + T_LOFAULT], %l6   ! save t_lofault
1251         tst     %l6
1252         bz,pt   %ncc, .do_copy
1253           nop
1254         sethi   %hi(.copyerr), %o2
1255         or      %o2, %lo(.copyerr), %o2
1256         membar  #Sync                           ! sync error barrier
1257         stn     %o2, [THREAD_REG + T_LOFAULT]   ! install new vector
1258         !
1259         ! We've already captured whether t_lofault was zero on entry.
1260         ! We need to mark ourselves as being from bcopy since both
1261         ! kcopy and bcopy use the same code path. If TRAMP_FLAG is set
1262         ! and the saved lofault was zero, we won't reset lofault on
1263         ! returning.
1264         !
1265         or      %l6, TRAMP_FLAG, %l6
1266 
1267 /*
1268  * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes
1269  * Also, use of FP registers has been tested to be enabled
1270  */
1271 .do_copy:
1272         FP_NOMIGRATE(6, 7)
1273 
1274         rd      %fprs, %o2              ! check for unused fp
1275         st      %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
1276         btst    FPRS_FEF, %o2
1277         bz,a,pt %icc, .do_blockcopy
1278           wr    %g0, FPRS_FEF, %fprs
1279 
1280         BST_FPQ1Q3_TOSTACK(%o2)
1281 
1282 .do_blockcopy:
1283         rd      %gsr, %o2
1284         stx     %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]      ! save gsr
1285         or      %l6, FPUSED_FLAG, %l6
1286 
1287 #define REALSRC %i0
1288 #define DST     %i1
1289 #define CNT     %i2
1290 #define SRC     %i3
1291 #define TMP     %i5
1292 
1293         andcc   DST, VIS_BLOCKSIZE - 1, TMP
1294         bz,pt   %ncc, 2f
1295           neg   TMP
1296         add     TMP, VIS_BLOCKSIZE, TMP
1297 
1298         ! TMP = bytes required to align DST on FP_BLOCK boundary
1299         ! Using SRC as a tmp here
1300         cmp     TMP, 3
1301         bleu,pt %ncc, 1f
1302           sub   CNT,TMP,CNT             ! adjust main count
1303         sub     TMP, 3, TMP             ! adjust for end of loop test
1304 .bc_blkalign:
1305         ldub    [REALSRC], SRC          ! move 4 bytes per loop iteration
1306         stb     SRC, [DST]
1307         subcc   TMP, 4, TMP
1308         ldub    [REALSRC + 1], SRC
1309         add     REALSRC, 4, REALSRC
1310         stb     SRC, [DST + 1]
1311         ldub    [REALSRC - 2], SRC
1312         add     DST, 4, DST
1313         stb     SRC, [DST - 2]
1314         ldub    [REALSRC - 1], SRC
1315         bgu,pt  %ncc, .bc_blkalign
1316           stb   SRC, [DST - 1]
1317 
1318         addcc   TMP, 3, TMP             ! restore count adjustment
1319         bz,pt   %ncc, 2f                ! no bytes left?
1320           nop
1321 1:      ldub    [REALSRC], SRC
1322         inc     REALSRC
1323         inc     DST
1324         deccc   TMP
1325         bgu     %ncc, 1b
1326           stb   SRC, [DST - 1]
1327 
1328 2:
1329         andn    REALSRC, 0x7, SRC
1330         alignaddr REALSRC, %g0, %g0
1331 
1332         ! SRC - 8-byte aligned
1333         ! DST - 64-byte aligned
1334         prefetch [SRC], #one_read
1335         prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
1336         prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
1337         prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
1338         ldd     [SRC], %f0
1339 #if CHEETAH_PREFETCH > 4
1340         prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1341 #endif
1342         ldd     [SRC + 0x08], %f2
1343 #if CHEETAH_PREFETCH > 5
1344         prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
1345 #endif
1346         ldd     [SRC + 0x10], %f4
1347 #if CHEETAH_PREFETCH > 6
1348         prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
1349 #endif
1350         faligndata %f0, %f2, %f32
1351         ldd     [SRC + 0x18], %f6
1352 #if CHEETAH_PREFETCH > 7
1353         prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
1354 #endif
1355         faligndata %f2, %f4, %f34
1356         ldd     [SRC + 0x20], %f8
1357         faligndata %f4, %f6, %f36
1358         ldd     [SRC + 0x28], %f10
1359         faligndata %f6, %f8, %f38
1360         ldd     [SRC + 0x30], %f12
1361         faligndata %f8, %f10, %f40
1362         ldd     [SRC + 0x38], %f14
1363         faligndata %f10, %f12, %f42
1364         ldd     [SRC + VIS_BLOCKSIZE], %f0
1365         sub     CNT, VIS_BLOCKSIZE, CNT
1366         add     SRC, VIS_BLOCKSIZE, SRC
1367         add     REALSRC, VIS_BLOCKSIZE, REALSRC
1368         ba,a,pt %ncc, 1f
1369           nop
1370         .align  16
1371 1:
1372         ldd     [SRC + 0x08], %f2
1373         faligndata %f12, %f14, %f44
1374         ldd     [SRC + 0x10], %f4
1375         faligndata %f14, %f0, %f46
1376         stda    %f32, [DST]ASI_BLK_P
1377         ldd     [SRC + 0x18], %f6
1378         faligndata %f0, %f2, %f32
1379         ldd     [SRC + 0x20], %f8
1380         faligndata %f2, %f4, %f34
1381         ldd     [SRC + 0x28], %f10
1382         faligndata %f4, %f6, %f36
1383         ldd     [SRC + 0x30], %f12
1384         faligndata %f6, %f8, %f38
1385         ldd     [SRC + 0x38], %f14
1386         faligndata %f8, %f10, %f40
1387         sub     CNT, VIS_BLOCKSIZE, CNT
1388         ldd     [SRC + VIS_BLOCKSIZE], %f0
1389         faligndata %f10, %f12, %f42
1390         prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
1391         add     DST, VIS_BLOCKSIZE, DST
1392         prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1393         add     REALSRC, VIS_BLOCKSIZE, REALSRC
1394         cmp     CNT, VIS_BLOCKSIZE + 8
1395         bgu,pt  %ncc, 1b
1396           add   SRC, VIS_BLOCKSIZE, SRC
1397 
1398         ! only if REALSRC & 0x7 is 0
1399         cmp     CNT, VIS_BLOCKSIZE
1400         bne     %ncc, 3f
1401           andcc REALSRC, 0x7, %g0
1402         bz,pt   %ncc, 2f
1403           nop
1404 3:      
1405         faligndata %f12, %f14, %f44
1406         faligndata %f14, %f0, %f46
1407         stda    %f32, [DST]ASI_BLK_P
1408         add     DST, VIS_BLOCKSIZE, DST
1409         ba,pt   %ncc, 3f
1410           nop
1411 2:
1412         ldd     [SRC + 0x08], %f2
1413         fsrc1   %f12, %f44
1414         ldd     [SRC + 0x10], %f4
1415         fsrc1   %f14, %f46
1416         stda    %f32, [DST]ASI_BLK_P
1417         ldd     [SRC + 0x18], %f6
1418         fsrc1   %f0, %f32
1419         ldd     [SRC + 0x20], %f8
1420         fsrc1   %f2, %f34
1421         ldd     [SRC + 0x28], %f10
1422         fsrc1   %f4, %f36
1423         ldd     [SRC + 0x30], %f12
1424         fsrc1   %f6, %f38
1425         ldd     [SRC + 0x38], %f14
1426         fsrc1   %f8, %f40
1427         sub     CNT, VIS_BLOCKSIZE, CNT
1428         add     DST, VIS_BLOCKSIZE, DST
1429         add     SRC, VIS_BLOCKSIZE, SRC
1430         add     REALSRC, VIS_BLOCKSIZE, REALSRC
1431         fsrc1   %f10, %f42
1432         fsrc1   %f12, %f44
1433         fsrc1   %f14, %f46
1434         stda    %f32, [DST]ASI_BLK_P
1435         add     DST, VIS_BLOCKSIZE, DST
1436         ba,a,pt %ncc, .bcb_exit
1437           nop
1438 
1439 3:      tst     CNT
1440         bz,a,pt %ncc, .bcb_exit
1441           nop
1442 
1443 5:      ldub    [REALSRC], TMP
1444         inc     REALSRC
1445         inc     DST
1446         deccc   CNT
1447         bgu     %ncc, 5b
1448           stb   TMP, [DST - 1]
1449 .bcb_exit:
1450         membar  #Sync
1451 
1452         FPRAS_INTERVAL(FPRAS_BCOPY, 0, %l5, %o2, %o3, %o4, %o5, 8)
1453         FPRAS_REWRITE_TYPE2Q1(0, %l5, %o2, %o3, 8, 9)
1454         FPRAS_CHECK(FPRAS_BCOPY, %l5, 9)        ! outputs lost
1455 
1456         ldx     [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! restore gsr
1457         wr      %o2, 0, %gsr
1458 
1459         ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1460         btst    FPRS_FEF, %o3
1461         bz,pt   %icc, 4f
1462           nop
1463 
1464         BLD_FPQ1Q3_FROMSTACK(%o2)
1465 
1466         ba,pt   %ncc, 2f        
1467           wr    %o3, 0, %fprs           ! restore fprs
1468 4:
1469         FZEROQ1Q3
1470         wr      %o3, 0, %fprs           ! restore fprs
1471 2:
1472         membar  #Sync                           ! sync error barrier
1473         andn    %l6, MASK_FLAGS, %l6
1474         stn     %l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1475         FP_ALLOWMIGRATE(5, 6)
1476         ret
1477           restore       %g0, 0, %o0
1478 
1479         SET_SIZE(bcopy_more)
1480 
1481 /*
1482  * Block copy with possibly overlapped operands.
1483  */
1484 
	/*
	 * void ovbcopy(const void *from, const void *to, size_t count)
	 *
	 * Byte copy that tolerates overlapping source/destination.
	 * If the regions do not actually overlap (count <= |from - to|)
	 * this tail-calls the optimized bcopy; otherwise it does a
	 * byte-at-a-time copy, forwards or backwards, in whichever
	 * direction is safe for the overlap.
	 */
	ENTRY(ovbcopy)
	tst	%o2			! check count
	bgu,a	%ncc, 1f		! nothing to do or bad arguments
	  subcc	%o0, %o1, %o3		! difference of from and to address

	retl				! return
	  nop
1:
	bneg,a	%ncc, 2f
	  neg	%o3			! if < 0, make it positive
2:	cmp	%o2, %o3		! cmp size and abs(from - to)
	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
	  .empty				!   no overlap
	  cmp	%o0, %o1		! compare from and to addresses
	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
	  nop
	!
	! Copy forwards.
	!
.ov_fwd:
	ldub	[%o0], %o3		! read from address
	inc	%o0			! inc from address
	stb	%o3, [%o1]		! write to address
	deccc	%o2			! dec count
	bgu	%ncc, .ov_fwd		! loop till done
	  inc	%o1			! inc to address

	retl				! return
	  nop
	!
	! Copy backwards.
	!
.ov_bkwd:
	deccc	%o2			! dec count
	ldub	[%o0 + %o2], %o3	! get byte at end of src
	bgu	%ncc, .ov_bkwd		! loop till done
	  stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst

	retl				! return
	  nop

	SET_SIZE(ovbcopy)
1527 
1528 
1529 /*
1530  * hwblkpagecopy()
1531  *
1532  * Copies exactly one page.  This routine assumes the caller (ppcopy)
1533  * has already disabled kernel preemption and has checked
1534  * use_hw_bcopy.  Preventing preemption also prevents cpu migration.
1535  */
	ENTRY(hwblkpagecopy)
	! get another window w/space for three aligned blocks of saved fpregs
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp

	! %i0 - source address (arg)
	! %i1 - destination address (arg)
	! %i2 - length of region (not arg; CNT is set to PAGESIZE below)
	! %l0 - saved fprs
	! %l1 - pointer to saved fpregs

	rd	%fprs, %l0		! check for unused fp
	btst	FPRS_FEF, %l0
	bz,a,pt	%icc, 1f
	  wr	%g0, FPRS_FEF, %fprs	! fp was idle: just enable it

	! fp is live in this thread; spill the registers we are about
	! to use (queues 1 and 3) to the stack frame
	BST_FPQ1Q3_TOSTACK(%l1)

1:	set	PAGESIZE, CNT
	mov	REALSRC, SRC

	! Prime the software pipeline: prefetch ahead, load the first
	! 64-byte block into %f0-%f14 and stage it in %f32-%f46, which
	! is the register set the block stores write from.
	prefetch [SRC], #one_read
	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
	ldd	[SRC], %f0
#if CHEETAH_PREFETCH > 4
	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
#endif
	ldd	[SRC + 0x08], %f2
#if CHEETAH_PREFETCH > 5
	prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
#endif
	ldd	[SRC + 0x10], %f4
#if CHEETAH_PREFETCH > 6
	prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
#endif
	fsrc1	%f0, %f32
	ldd	[SRC + 0x18], %f6
#if CHEETAH_PREFETCH > 7
	prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
#endif
	fsrc1	%f2, %f34
	ldd	[SRC + 0x20], %f8
	fsrc1	%f4, %f36
	ldd	[SRC + 0x28], %f10
	fsrc1	%f6, %f38
	ldd	[SRC + 0x30], %f12
	fsrc1	%f8, %f40
	ldd	[SRC + 0x38], %f14
	fsrc1	%f10, %f42
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	ba,a,pt	%ncc, 2f
	  nop
	.align	16
2:
	! Main loop: while staging block N in %f32-%f46 / storing it with
	! stda, the loads of block N+1 into %f0-%f14 are interleaved.
	ldd	[SRC + 0x08], %f2
	fsrc1	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fsrc1	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fsrc1	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fsrc1	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fsrc1	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fsrc1	%f8, %f40
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	fsrc1	%f10, %f42
	prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	cmp	CNT, VIS_BLOCKSIZE + 8
	prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
	bgu,pt	%ncc, 2b
	  add	SRC, VIS_BLOCKSIZE, SRC

	! trailing block
	ldd	[SRC + 0x08], %f2
	fsrc1	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fsrc1	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fsrc1	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fsrc1	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fsrc1	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fsrc1	%f8, %f40
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P

	membar	#Sync

	! fpRAS: periodically verify the FP-register copy path
	FPRAS_INTERVAL(FPRAS_PGCOPY, 1, %l5, %o2, %o3, %o4, %o5, 8)
	FPRAS_REWRITE_TYPE1(1, %l5, %f32, %o2, 9)
	FPRAS_CHECK(FPRAS_PGCOPY, %l5, 9)	! lose outputs

	! Restore (or zero) the fp registers we used, then restore %fprs
	btst	FPRS_FEF, %l0
	bz,pt	%icc, 2f
	  nop

	BLD_FPQ1Q3_FROMSTACK(%l3)
	ba	3f
	  nop

2:	FZEROQ1Q3

3:	wr	%l0, 0, %fprs		! restore fprs
	ret
	  restore	%g0, 0, %o0	! return 0

	SET_SIZE(hwblkpagecopy)
1663 
1664 
1665 /*
1666  * Transfer data to and from user space -
1667  * Note that these routines can cause faults
1668  * It is assumed that the kernel has nothing at
1669  * less than KERNELBASE in the virtual address space.
1670  *
1671  * Note that copyin(9F) and copyout(9F) are part of the
1672  * DDI/DKI which specifies that they return '-1' on "errors."
1673  *
1674  * Sigh.
1675  *
 * So there are two extremely similar routines - xcopyin() and xcopyout()
1677  * which return the errno that we've faithfully computed.  This
1678  * allows other callers (e.g. uiomove(9F)) to work correctly.
1679  * Given that these are used pretty heavily, we expand the calling
1680  * sequences inline for all flavours (rather than making wrappers).
1681  *
1682  * There are also stub routines for xcopyout_little and xcopyin_little,
1683  * which currently are intended to handle requests of <= 16 bytes from
1684  * do_unaligned. Future enhancement to make them handle 8k pages efficiently
1685  * is left as an exercise...
1686  */
1687 
1688 /*
1689  * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
1690  *      
1691  * General theory of operation:
1692  *
1693  * The only difference between copy{in,out} and
1694  * xcopy{in,out} is in the error handling routine they invoke
1695  * when a memory access error occurs. xcopyOP returns the errno
1696  * while copyOP returns -1 (see above). copy{in,out}_noerr set
1697  * a special flag (by oring the TRAMP_FLAG into the fault handler address)
1698  * if they are called with a fault handler already in place. That flag
1699  * causes the default handlers to trampoline to the previous handler
1700  * upon an error.
1701  *
1702  * None of the copyops routines grab a window until it's decided that
1703  * we need to do a HW block copy operation. This saves a window
1704  * spill/fill when we're called during socket ops. The typical IO
1705  * path won't cause spill/fill traps.
1706  *
1707  * This code uses a set of 4 limits for the maximum size that will
1708  * be copied given a particular input/output address alignment.
1709  * If the value for a particular limit is zero, the copy will be performed
1710  * by the plain copy loops rather than FPBLK.
1711  *
1712  * See the description of bcopy above for more details of the
1713  * data copying algorithm and the default limits.
1714  *
1715  */
1716 
1717 /*
1718  * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
1719  */
1720 
1721 /*
1722  * We save the arguments in the following registers in case of a fault:
1723  *      kaddr - %l1
1724  *      uaddr - %l2
1725  *      count - %l3
1726  */
/* Saved (kaddr, uaddr, count) for the FP/window copy path (see above) */
#define SAVE_SRC	%l1
#define SAVE_DST	%l2
#define SAVE_COUNT	%l3

/* Saved arguments for the small (leaf, no register window) copy path */
#define SM_SAVE_SRC		%g4
#define SM_SAVE_DST		%g5
#define SM_SAVE_COUNT		%o5
/* Errno captured by copyio_fault (from %g1) */
#define ERRNO		%l5


/* The 'real' lofault handler that copyio_fault vectors to after cleanup */
#define REAL_LOFAULT	%l4
1738 /*
1739  * Generic copyio fault handler.  This is the first line of defense when a
1740  * fault occurs in (x)copyin/(x)copyout.  In order for this to function
1741  * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
1742  * This allows us to share common code for all the flavors of the copy
1743  * operations, including the _noerr versions.
1744  *
1745  * Note that this function will restore the original input parameters before
1746  * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
1747  * member of the t_copyop structure, if needed.
1748  */
	ENTRY(copyio_fault)
	membar	#Sync
	mov	%g1,ERRNO			! save errno in ERRNO
	btst	FPUSED_FLAG, %l6		! was the FP path in use?
	bz	%ncc, 1f			! no: skip FP-state cleanup
	  nop

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
	wr	%o2, 0, %gsr		! restore gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3		! did the caller own live fp state?
	bz,pt	%icc, 4f
	  nop

	! caller's fp regs were saved on the stack; reload them
	BLD_FPQ2Q4_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	  wr	%o3, 0, %fprs		! restore fprs

4:
	! fp was not in use: scrub the queues we dirtied
	FZEROQ2Q4
	wr	%o3, 0, %fprs		! restore fprs

1:
	andn	%l6, FPUSED_FLAG, %l6
	membar	#Sync
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)

	! restore original arguments and vector to the real handler
	mov	SAVE_SRC, %i0
	mov	SAVE_DST, %i1
	jmp	REAL_LOFAULT
	  mov	SAVE_COUNT, %i2

	SET_SIZE(copyio_fault)
1785 
1786 
	/*
	 * int copyout(const void *kaddr, void *uaddr, size_t count)
	 *
	 * Copy kernel data to user space; all user-side stores go
	 * through ASI_USER.  Returns 0 on success.  On fault, either
	 * vectors to an installed t_copyops CP_COPYOUT handler or
	 * returns -1 (DDI/DKI convention; see .sm_copyout_err).
	 *
	 * In:	%o0 = kernel source, %o1 = user destination, %o2 = count
	 *
	 * Dispatch: small/poorly-aligned copies (<= VIS_COPY_THRESHOLD,
	 * or below the tunable hw_copy_limit_N for N-byte mutual
	 * alignment of src/dst) use the leaf routine at .copyout_small;
	 * larger ones go to copyout_more (FP/VIS block copy).
	 */
	ENTRY(copyout)

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .copyout_small		! go to larger cases
	  xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .copyout_8		! check for longword alignment
	  nop
	btst	1, %o3				! 
	bz,pt	%ncc, .copyout_2		! check for half-word
	  nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .copyout_small		! go to small copy
	  nop
	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
	  nop
.copyout_2:
	btst	3, %o3				!
	bz,pt	%ncc, .copyout_4		! check for word alignment
	  nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .copyout_small		! go to small copy
	  nop
	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
	  nop
.copyout_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .copyout_small		! go to small copy
	  nop
	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
	  nop
.copyout_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .copyout_small		! go to small copy
	  nop
	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
	  nop

	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.copyout_small:
	! Leaf-routine path: install .sm_copyout_err as the lofault
	! handler; old handler is preserved in %o4 for the duration.
	sethi	%hi(.sm_copyout_err), %o5	! .sm_copyout_err is lofault
	or	%o5, %lo(.sm_copyout_err), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
.sm_do_copyout:
	mov	%o0, SM_SAVE_SRC
	mov	%o1, SM_SAVE_DST
	cmp	%o2, SHORTCOPY		! check for really short case
	bleu,pt	%ncc, .co_sm_left	!
	  mov	%o2, SM_SAVE_COUNT
	cmp	%o2, CHKSIZE		! check for medium length cases
	bgu,pn	%ncc, .co_med		!
	  or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .co_sm_word	! branch to word aligned case
.co_sm_movebytes:
	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
.co_sm_notalign4:
	ldub	[%o0], %o3		! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	stba	%o3, [%o1]ASI_USER	! write byte
	inc	%o1			! advance DST by 1
	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
	add	%o0, 4, %o0		! advance SRC by 4
	stba	%o3, [%o1]ASI_USER
	inc	%o1			! advance DST by 1
	ldub	[%o0 - 2], %o3
	stba	%o3, [%o1]ASI_USER
	inc	%o1			! advance DST by 1
	ldub	[%o0 - 1], %o3
	stba	%o3, [%o1]ASI_USER
	bgt,pt	%ncc, .co_sm_notalign4	! loop til 3 or fewer bytes remain
	  inc	%o1			! advance DST by 1
	add	%o2, 3, %o2		! restore count
.co_sm_left:
	! 0 to 3 bytes remain; move them one at a time
	tst	%o2
	bz,pt	%ncc, .co_sm_exit	! check for zero length
	  nop
	ldub	[%o0], %o3		! load one byte
	deccc	%o2			! reduce count for cc test
	bz,pt	%ncc, .co_sm_exit
	  stba	%o3,[%o1]ASI_USER	! store one byte
	ldub	[%o0 + 1], %o3		! load second byte
	deccc	%o2
	inc	%o1
	bz,pt	%ncc, .co_sm_exit
	  stba	%o3,[%o1]ASI_USER	! store second byte
	ldub	[%o0 + 2], %o3		! load third byte
	inc	%o1
	stba	%o3,[%o1]ASI_USER	! store third byte
	membar	#Sync				! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	  mov	%g0, %o0		! return 0
	.align	16
.co_sm_words:
	lduw	[%o0], %o3		! read word
.co_sm_wordx:
	subcc	%o2, 8, %o2		! update count
	stwa	%o3, [%o1]ASI_USER	! write word
	add	%o0, 8, %o0		! update SRC
	lduw	[%o0 - 4], %o3		! read word
	add	%o1, 4, %o1		! update DST
	stwa	%o3, [%o1]ASI_USER	! write word
	bgt,pt	%ncc, .co_sm_words	! loop til done
	  add	%o1, 4, %o1		! update DST
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .co_sm_exit
	  nop
	deccc	%o2
	bz,pt	%ncc, .co_sm_byte
.co_sm_half:
	  subcc	%o2, 2, %o2		! reduce count by 2
	lduh	[%o0], %o3		! read half word
	add	%o0, 2, %o0		! advance SRC by 2
	stha	%o3, [%o1]ASI_USER	! write half word
	bgt,pt	%ncc, .co_sm_half	! loop til done
	  add	%o1, 2, %o1		! advance DST by 2
	addcc	%o2, 1, %o2		! restore count
	bz,pt	%ncc, .co_sm_exit
	  nop
.co_sm_byte:
	ldub	[%o0], %o3
	stba	%o3, [%o1]ASI_USER
	membar	#Sync				! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	  mov	%g0, %o0		! return 0
	.align 16
.co_sm_word:
	subcc	%o2, 4, %o2		! update count
	bgt,pt	%ncc, .co_sm_wordx
	  lduw	[%o0], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .co_sm_exit
	  stwa	%o3, [%o1]ASI_USER	! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o0 + 4], %o3		! load one byte
	add	%o1, 4, %o1
	bz,pt	%ncc, .co_sm_exit
	  stba	%o3, [%o1]ASI_USER	! store one byte
	ldub	[%o0 + 5], %o3		! load second byte
	deccc	%o2
	inc	%o1
	bz,pt	%ncc, .co_sm_exit
	  stba	%o3, [%o1]ASI_USER	! store second byte
	ldub	[%o0 + 6], %o3		! load third byte
	inc	%o1
	stba	%o3, [%o1]ASI_USER	! store third byte
.co_sm_exit:
	  membar	#Sync				! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	  mov	%g0, %o0		! return 0

	.align 16
.co_med:
	! Medium-length copy: pick a loop by mutual src/dst alignment
	xor	%o0, %o1, %o3		! setup alignment check
	btst	1, %o3
	bnz,pt	%ncc, .co_sm_movebytes	! unaligned
	  nop
	btst	3, %o3
	bnz,pt	%ncc, .co_med_half	! halfword aligned
	  nop
	btst	7, %o3
	bnz,pt	%ncc, .co_med_word	! word aligned
	  nop
.co_med_long:
	btst	3, %o0			! check for
	bz,pt	%ncc, .co_med_long1	! word alignment
	  nop
.co_med_long0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stba	%o3,[%o1]ASI_USER	! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .co_med_long0
	  dec	%o2
.co_med_long1:			! word aligned
	btst	7, %o0			! check for long word
	bz,pt	%ncc, .co_med_long2
	  nop
	lduw	[%o0], %o3		! load word
	add	%o0, 4, %o0		! advance SRC by 4
	stwa	%o3, [%o1]ASI_USER	! store word
	add	%o1, 4, %o1		! advance DST by 4
	sub	%o2, 4, %o2		! reduce count by 4
!
!  Now long word aligned and have at least 32 bytes to move
!
.co_med_long2:
	sub	%o2, 31, %o2		! adjust count to allow cc zero test
	sub	%o1, 8, %o1		! adjust pointer to allow store in
					! branch delay slot instead of add
.co_med_lmove:
	add	%o1, 8, %o1		! advance DST by 8
	ldx	[%o0], %o3		! read long word
	subcc	%o2, 32, %o2		! reduce count by 32
	stxa	%o3, [%o1]ASI_USER	! write long word
	add	%o1, 8, %o1		! advance DST by 8
	ldx	[%o0 + 8], %o3		! repeat for a total for 4 long words
	add	%o0, 32, %o0		! advance SRC by 32
	stxa	%o3, [%o1]ASI_USER
	ldx	[%o0 - 16], %o3
	add	%o1, 8, %o1		! advance DST by 8
	stxa	%o3, [%o1]ASI_USER
	ldx	[%o0 - 8], %o3
	add	%o1, 8, %o1		! advance DST by 8
	bgt,pt	%ncc, .co_med_lmove	! loop til 31 or fewer bytes left
	  stxa	%o3, [%o1]ASI_USER
	add	%o1, 8, %o1		! advance DST by 8
	addcc	%o2, 24, %o2		! restore count to long word offset
	ble,pt	%ncc, .co_med_lextra	! check for more long words to move
	  nop
.co_med_lword:
	ldx	[%o0], %o3		! read long word
	subcc	%o2, 8, %o2		! reduce count by 8
	stxa	%o3, [%o1]ASI_USER	! write long word
	add	%o0, 8, %o0		! advance SRC by 8
	bgt,pt	%ncc, .co_med_lword	! loop til 7 or fewer bytes left
	  add	%o1, 8, %o1		! advance DST by 8
.co_med_lextra:
	addcc	%o2, 7, %o2		! restore rest of count
	bz,pt	%ncc, .co_sm_exit	! if zero, then done
	  deccc	%o2
	bz,pt	%ncc, .co_sm_byte
	  nop
	ba,pt	%ncc, .co_sm_half
	  nop

	.align 16
	nop				! instruction alignment
					! see discussion at start of file
.co_med_word:
	btst	3, %o0			! check for
	bz,pt	%ncc, .co_med_word1	! word alignment
	  nop
.co_med_word0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stba	%o3,[%o1]ASI_USER	! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .co_med_word0
	  dec	%o2
!
!  Now word aligned and have at least 36 bytes to move
!
.co_med_word1:
	sub	%o2, 15, %o2		! adjust count to allow cc zero test
.co_med_wmove:
	lduw	[%o0], %o3		! read word
	subcc	%o2, 16, %o2		! reduce count by 16
	stwa	%o3, [%o1]ASI_USER	! write word
	add	%o1, 4, %o1		! advance DST by 4
	lduw	[%o0 + 4], %o3		! repeat for a total for 4 words
	add	%o0, 16, %o0		! advance SRC by 16
	stwa	%o3, [%o1]ASI_USER
	add	%o1, 4, %o1		! advance DST by 4
	lduw	[%o0 - 8], %o3
	stwa	%o3, [%o1]ASI_USER
	add	%o1, 4, %o1		! advance DST by 4
	lduw	[%o0 - 4], %o3
	stwa	%o3, [%o1]ASI_USER
	bgt,pt	%ncc, .co_med_wmove	! loop til 15 or fewer bytes left
	  add	%o1, 4, %o1		! advance DST by 4
	addcc	%o2, 12, %o2		! restore count to word offset
	ble,pt	%ncc, .co_med_wextra	! check for more words to move
	  nop
.co_med_word2:
	lduw	[%o0], %o3		! read word
	subcc	%o2, 4, %o2		! reduce count by 4
	stwa	%o3, [%o1]ASI_USER	! write word
	add	%o0, 4, %o0		! advance SRC by 4
	bgt,pt	%ncc, .co_med_word2	! loop til 3 or fewer bytes left
	  add	%o1, 4, %o1		! advance DST by 4
.co_med_wextra:
	addcc	%o2, 3, %o2		! restore rest of count
	bz,pt	%ncc, .co_sm_exit	! if zero, then done
	  deccc	%o2
	bz,pt	%ncc, .co_sm_byte
	  nop
	ba,pt	%ncc, .co_sm_half
	  nop

	.align 16
	nop				! instruction alignment
	nop				! see discussion at start of file
	nop
.co_med_half:
	btst	1, %o0			! check for
	bz,pt	%ncc, .co_med_half1	! half word alignment
	  nop
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stba	%o3,[%o1]ASI_USER	! store byte
	inc	%o1
	dec	%o2
!
!  Now half word aligned and have at least 38 bytes to move
!
.co_med_half1:
	sub	%o2, 7, %o2		! adjust count to allow cc zero test
.co_med_hmove:
	lduh	[%o0], %o3		! read half word
	subcc	%o2, 8, %o2		! reduce count by 8
	stha	%o3, [%o1]ASI_USER	! write half word
	add	%o1, 2, %o1		! advance DST by 2
	lduh	[%o0 + 2], %o3		! repeat for a total for 4 halfwords
	add	%o0, 8, %o0		! advance SRC by 8
	stha	%o3, [%o1]ASI_USER
	add	%o1, 2, %o1		! advance DST by 2
	lduh	[%o0 - 4], %o3
	stha	%o3, [%o1]ASI_USER
	add	%o1, 2, %o1		! advance DST by 2
	lduh	[%o0 - 2], %o3
	stha	%o3, [%o1]ASI_USER
	bgt,pt	%ncc, .co_med_hmove	! loop til 7 or fewer bytes left
	  add	%o1, 2, %o1		! advance DST by 2
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .co_sm_exit
	  deccc	%o2
	bz,pt	%ncc, .co_sm_byte
	  nop
	ba,pt	%ncc, .co_sm_half
	  nop

/*
 * We got here because of a fault during short copyout.
 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
 */
.sm_copyout_err:
	membar	#Sync
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	mov	SM_SAVE_SRC, %o0
	mov	SM_SAVE_DST, %o1
	mov	SM_SAVE_COUNT, %o2
	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
	tst	%o3
	bz,pt	%ncc, 3f			! if not, return error
	  nop
	ldn	[%o3 + CP_COPYOUT], %o5		! if handler, invoke it with
	jmp	%o5				! original arguments
	  nop
3:
	retl
	  or	%g0, -1, %o0		! return error value

	SET_SIZE(copyout)
2157 
2158 /*
2159  * The _more entry points are not intended to be used directly by
2160  * any caller from outside this file.  They are provided to allow
2161  * profiling and dtrace of the portions of the copy code that uses
2162  * the floating point registers.
2163  * This entry is particularly important as DTRACE (at least as of
2164  * 4/2004) does not support leaf functions.
2165  */
2166 
        ENTRY(copyout_more)
.copyout_more:
        save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
        set     .copyout_err, REAL_LOFAULT      ! final error exit for copyout

/*
 * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
 * and use the floating point (VIS) registers to do 64-byte block moves
 * to user space.  Per the register convention noted at the top of this
 * file, %l6 holds the caller's t_lofault (later tagged with FPUSED_FLAG)
 * and %l7 holds the temporary fault handler, copyio_fault.
 */
.do_copyout:
        set     copyio_fault, %l7               ! copyio_fault is lofault val

        ldn     [THREAD_REG + T_LOFAULT], %l6   ! save existing handler
        membar  #Sync                           ! sync error barrier
        stn     %l7, [THREAD_REG + T_LOFAULT]   ! set t_lofault

        mov     %i0, SAVE_SRC                   ! preserve original arguments
        mov     %i1, SAVE_DST                   ! so a fault handler can hand
        mov     %i2, SAVE_COUNT                 ! them to a copyops vector

        FP_NOMIGRATE(6, 7)              ! block migration while FP regs are live

        rd      %fprs, %o2              ! check for unused fp
        st      %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
        btst    FPRS_FEF, %o2
        bz,a,pt %icc, .do_blockcopyout  ! FP was idle: just enable it via the
          wr    %g0, FPRS_FEF, %fprs    ! annulled delay slot, nothing to save

        BST_FPQ2Q4_TOSTACK(%o2)         ! FP in use: save quads 2-4 to stack

.do_blockcopyout:
        rd      %gsr, %o2
        stx     %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]      ! save gsr
        or      %l6, FPUSED_FLAG, %l6   ! mark that FP state needs restoring

        andcc   DST, VIS_BLOCKSIZE - 1, TMP
        mov     ASI_USER, %asi          ! all %asi stores target user space
        bz,pt   %ncc, 2f                ! DST already block aligned
          neg   TMP
        add     TMP, VIS_BLOCKSIZE, TMP

        ! TMP = bytes required to align DST on FP_BLOCK boundary
        ! Using SRC as a tmp here
        cmp     TMP, 3
        bleu,pt %ncc, 1f                ! fewer than 4 bytes: byte loop only
          sub   CNT,TMP,CNT             ! adjust main count
        sub     TMP, 3, TMP             ! adjust for end of loop test
.co_blkalign:
        ldub    [REALSRC], SRC          ! move 4 bytes per loop iteration
        stba    SRC, [DST]%asi
        subcc   TMP, 4, TMP
        ldub    [REALSRC + 1], SRC
        add     REALSRC, 4, REALSRC
        stba    SRC, [DST + 1]%asi
        ldub    [REALSRC - 2], SRC
        add     DST, 4, DST
        stba    SRC, [DST - 2]%asi
        ldub    [REALSRC - 1], SRC
        bgu,pt  %ncc, .co_blkalign
          stba  SRC, [DST - 1]%asi

        addcc   TMP, 3, TMP             ! restore count adjustment
        bz,pt   %ncc, 2f                ! no bytes left?
          nop
1:      ldub    [REALSRC], SRC          ! mop up the final 1-3 alignment bytes
        inc     REALSRC
        inc     DST
        deccc   TMP
        bgu     %ncc, 1b
          stba  SRC, [DST - 1]%asi

2:
        andn    REALSRC, 0x7, SRC       ! SRC = REALSRC rounded down to 8 bytes
        alignaddr REALSRC, %g0, %g0     ! set GSR.align for faligndata below

        ! SRC - 8-byte aligned
        ! DST - 64-byte aligned
        ! Prime the software pipeline: load the first 64-byte block into
        ! %f16-%f30, start realigning into %f48-%f58, and prefetch ahead.
        prefetch [SRC], #one_read
        prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
        prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
        prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
        ldd     [SRC], %f16
#if CHEETAH_PREFETCH > 4
        prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
#endif
        ldd     [SRC + 0x08], %f18
#if CHEETAH_PREFETCH > 5
        prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
#endif
        ldd     [SRC + 0x10], %f20
#if CHEETAH_PREFETCH > 6
        prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
#endif
        faligndata %f16, %f18, %f48
        ldd     [SRC + 0x18], %f22
#if CHEETAH_PREFETCH > 7
        prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
#endif
        faligndata %f18, %f20, %f50
        ldd     [SRC + 0x20], %f24
        faligndata %f20, %f22, %f52
        ldd     [SRC + 0x28], %f26
        faligndata %f22, %f24, %f54
        ldd     [SRC + 0x30], %f28
        faligndata %f24, %f26, %f56
        ldd     [SRC + 0x38], %f30
        faligndata %f26, %f28, %f58
        ldd     [SRC + VIS_BLOCKSIZE], %f16
        sub     CNT, VIS_BLOCKSIZE, CNT
        add     SRC, VIS_BLOCKSIZE, SRC
        add     REALSRC, VIS_BLOCKSIZE, REALSRC
        ba,a,pt %ncc, 1f
          nop
        .align  16
        ! Main pipelined loop: each pass realigns the previously loaded
        ! block into %f48-%f62, block-stores it to user space, and loads
        ! the next block.  Loop while more than VIS_BLOCKSIZE + 8 remain.
1:
        ldd     [SRC + 0x08], %f18
        faligndata %f28, %f30, %f60
        ldd     [SRC + 0x10], %f20
        faligndata %f30, %f16, %f62
        stda    %f48, [DST]ASI_BLK_AIUS
        ldd     [SRC + 0x18], %f22
        faligndata %f16, %f18, %f48
        ldd     [SRC + 0x20], %f24
        faligndata %f18, %f20, %f50
        ldd     [SRC + 0x28], %f26
        faligndata %f20, %f22, %f52
        ldd     [SRC + 0x30], %f28
        faligndata %f22, %f24, %f54
        ldd     [SRC + 0x38], %f30
        faligndata %f24, %f26, %f56
        sub     CNT, VIS_BLOCKSIZE, CNT
        ldd     [SRC + VIS_BLOCKSIZE], %f16
        faligndata %f26, %f28, %f58
        prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
        add     DST, VIS_BLOCKSIZE, DST
        prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
        add     REALSRC, VIS_BLOCKSIZE, REALSRC
        cmp     CNT, VIS_BLOCKSIZE + 8
        bgu,pt  %ncc, 1b
          add   SRC, VIS_BLOCKSIZE, SRC

        ! Drain the pipeline.  The fsrc1 path at 2: is usable
        ! only if REALSRC & 0x7 is 0
        cmp     CNT, VIS_BLOCKSIZE
        bne     %ncc, 3f
          andcc REALSRC, 0x7, %g0
        bz,pt   %ncc, 2f
          nop
3:      
        ! Unaligned (or short) drain: finish realigning and store the
        ! final in-flight block, then fall through to the byte loop.
        faligndata %f28, %f30, %f60
        faligndata %f30, %f16, %f62
        stda    %f48, [DST]ASI_BLK_AIUS
        add     DST, VIS_BLOCKSIZE, DST
        ba,pt   %ncc, 3f
          nop
2:
        ! REALSRC is 8-byte aligned and exactly one block remains:
        ! store the in-flight block, then copy the last block with
        ! plain fsrc1 moves (no realignment needed).
        ldd     [SRC + 0x08], %f18
        fsrc1   %f28, %f60
        ldd     [SRC + 0x10], %f20
        fsrc1   %f30, %f62
        stda    %f48, [DST]ASI_BLK_AIUS
        ldd     [SRC + 0x18], %f22
        fsrc1   %f16, %f48
        ldd     [SRC + 0x20], %f24
        fsrc1   %f18, %f50
        ldd     [SRC + 0x28], %f26
        fsrc1   %f20, %f52
        ldd     [SRC + 0x30], %f28
        fsrc1   %f22, %f54
        ldd     [SRC + 0x38], %f30
        fsrc1   %f24, %f56
        sub     CNT, VIS_BLOCKSIZE, CNT
        add     DST, VIS_BLOCKSIZE, DST
        add     SRC, VIS_BLOCKSIZE, SRC
        add     REALSRC, VIS_BLOCKSIZE, REALSRC
        fsrc1   %f26, %f58
        fsrc1   %f28, %f60
        fsrc1   %f30, %f62
        stda    %f48, [DST]ASI_BLK_AIUS
        add     DST, VIS_BLOCKSIZE, DST
        ba,a,pt %ncc, 4f
          nop

3:      tst     CNT                     ! any sub-block residue left?
        bz,a    %ncc, 4f
          nop

5:      ldub    [REALSRC], TMP          ! trailing bytes, one at a time
        inc     REALSRC
        inc     DST
        deccc   CNT
        bgu     %ncc, 5b
          stba  TMP, [DST - 1]%asi
4:

.copyout_exit:
        membar  #Sync

        ! FP-RAS: periodically verify the FP register file did not get
        ! corrupted during the copy (macros from fpras_impl.h).
        FPRAS_INTERVAL(FPRAS_COPYOUT, 0, %l5, %o2, %o3, %o4, %o5, 8)
        FPRAS_REWRITE_TYPE2Q2(0, %l5, %o2, %o3, 8, 9)
        FPRAS_CHECK(FPRAS_COPYOUT, %l5, 9)      ! lose outputs

        ldx     [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
        wr      %o2, 0, %gsr            ! restore gsr

        ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
        btst    FPRS_FEF, %o3
        bz,pt   %icc, 4f                ! FP was idle on entry: just zero regs
          nop

        BLD_FPQ2Q4_FROMSTACK(%o2)       ! reload caller's FP quads 2-4

        ba,pt   %ncc, 1f
          wr    %o3, 0, %fprs           ! restore fprs

4:
        FZEROQ2Q4                       ! don't leak kernel data in FP regs
        wr      %o3, 0, %fprs           ! restore fprs

1:
        membar  #Sync
        andn    %l6, FPUSED_FLAG, %l6   ! strip flag before restoring handler
        stn     %l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        FP_ALLOWMIGRATE(5, 6)
        ret
          restore       %g0, 0, %o0     ! return 0 (success)

/*
 * We got here because of a fault during copyout.
 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
 */
.copyout_err:
        ldn     [THREAD_REG + T_COPYOPS], %o4   ! check for copyop handler
        tst     %o4
        bz,pt   %ncc, 2f                        ! if not, return error
          nop
        ldn     [%o4 + CP_COPYOUT], %g2         ! if handler, invoke it with
        jmp     %g2                             ! original arguments
          restore %g0, 0, %g0                   ! dispose of copy window
2:
        ret
          restore %g0, -1, %o0                  ! return error value


        SET_SIZE(copyout_more)
2410 
2411 
        ENTRY(xcopyout)
        ! xcopyout(kaddr, uaddr, count): like copyout, but on fault it
        ! returns the errno value rather than -1.  Dispatch between the
        ! leaf (small) path and the FP block-copy (more) path based on
        ! count, src/dst relative alignment, and the per-alignment
        ! hw_copy_limit_* tunables (0 disables the HW/FP copy path).
        cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
        bleu,pt %ncc, .xcopyout_small           ! at or under threshold: small copy
          xor   %o0, %o1, %o3                   ! are src, dst alignable?
        btst    7, %o3                          ! check for longword alignment
        bz,pt   %ncc, .xcopyout_8               !
          nop
        btst    1, %o3                          ! check for half-word
        bz,pt   %ncc, .xcopyout_2               !
          nop
        ! byte-alignable only
        sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_1)], %o3
        tst     %o3
        bz,pn   %icc, .xcopyout_small           ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .xcopyout_small           ! go to small copy
          nop
        ba,pt   %ncc, .xcopyout_more            ! otherwise go to large copy
          nop
.xcopyout_2:
        btst    3, %o3                          !
        bz,pt   %ncc, .xcopyout_4               ! check for word alignment
          nop
        sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_2)], %o3
        tst     %o3
        bz,pn   %icc, .xcopyout_small           ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .xcopyout_small           ! go to small copy
          nop
        ba,pt   %ncc, .xcopyout_more            ! otherwise go to large copy
          nop
.xcopyout_4:
        ! already checked longword, must be word aligned
        sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_4)], %o3
        tst     %o3
        bz,pn   %icc, .xcopyout_small           ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .xcopyout_small           ! go to small copy
          nop
        ba,pt   %ncc, .xcopyout_more            ! otherwise go to large copy
          nop
.xcopyout_8:
        sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_8)], %o3
        tst     %o3
        bz,pn   %icc, .xcopyout_small           ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .xcopyout_small           ! go to small copy
          nop
        ba,pt   %ncc, .xcopyout_more            ! otherwise go to large copy
          nop

.xcopyout_small:
        ! Leaf path: install .sm_xcopyout_err as the lofault handler and
        ! share the small-copy engine with copyout (.sm_do_copyout).
        sethi   %hi(.sm_xcopyout_err), %o5      ! .sm_xcopyout_err is lofault
        or      %o5, %lo(.sm_xcopyout_err), %o5
        ldn     [THREAD_REG + T_LOFAULT], %o4   ! save existing handler
        membar  #Sync                           ! sync error barrier
        ba,pt   %ncc, .sm_do_copyout            ! common code
          stn   %o5, [THREAD_REG + T_LOFAULT]   ! set t_lofault

.xcopyout_more:
        ! FP path: get a register window, then share .do_copyout with
        ! copyout_more, but fault to .xcopyout_err (errno return).
        save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
        sethi   %hi(.xcopyout_err), REAL_LOFAULT
        ba,pt   %ncc, .do_copyout               ! common code
          or    REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT

/*
 * We got here because of fault during xcopyout
 * Errno value is in ERRNO
 */
.xcopyout_err:
        ldn     [THREAD_REG + T_COPYOPS], %o4   ! check for copyop handler
        tst     %o4
        bz,pt   %ncc, 2f                        ! if not, return error
          nop
        ldn     [%o4 + CP_XCOPYOUT], %g2        ! if handler, invoke it with
        jmp     %g2                             ! original arguments
          restore %g0, 0, %g0                   ! dispose of copy window
2:
        ret
          restore ERRNO, 0, %o0                 ! return errno value

.sm_xcopyout_err:
        ! Fault in the leaf path: restore t_lofault and the original
        ! arguments, then retry via copyops or return errno (in %g1,
        ! per this file's lofault convention).
        membar  #Sync
        stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        mov     SM_SAVE_SRC, %o0
        mov     SM_SAVE_DST, %o1
        mov     SM_SAVE_COUNT, %o2
        ldn     [THREAD_REG + T_COPYOPS], %o3   ! check for copyop handler
        tst     %o3
        bz,pt   %ncc, 3f                        ! if not, return error
          nop
        ldn     [%o3 + CP_XCOPYOUT], %o5        ! if handler, invoke it with
        jmp     %o5                             ! original arguments
          nop
3:
        retl
          or    %g1, 0, %o0             ! return errno value

        SET_SIZE(xcopyout)
2515 
        ENTRY(xcopyout_little)
        ! xcopyout_little(kaddr, uaddr, count): copy count bytes to user
        ! space with the byte order reversed, using the little-endian
        ! user ASI for the stores.  Leaf routine; returns 0 on success,
        ! faults vector to .xcopyio_err (defined elsewhere in this file).
        sethi   %hi(.xcopyio_err), %o5
        or      %o5, %lo(.xcopyio_err), %o5
        ldn     [THREAD_REG + T_LOFAULT], %o4
        membar  #Sync                           ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]
        mov     %o4, %o5                        ! keep old t_lofault in %o5

        ! Index scheme: %o3 = -count and walks up toward zero, so
        ! [%o1 + %o3] steps forward through DST.  SRC is biased so that
        ! [%o0 + %o3] starts at its last byte; each iteration does
        ! %o3 += 1 and %o0 -= 2, a net SRC step of -1 (reversed read).
        subcc   %g0, %o2, %o3
        add     %o0, %o2, %o0
        bz,pn   %ncc, 2f                ! check for zero bytes
          sub   %o2, 1, %o4
        add     %o0, %o4, %o0           ! start w/last byte
        add     %o1, %o2, %o1
        ldub    [%o0 + %o3], %o4

1:      stba    %o4, [%o1 + %o3]ASI_AIUSL
        inccc   %o3                     ! carry set when %o3 wraps to zero
        sub     %o0, 2, %o0             ! get next byte
        bcc,a,pt %ncc, 1b               ! loop while bytes remain; annulled
          ldub  [%o0 + %o3], %o4        ! delay-slot load only if looping

2:
        membar  #Sync                           ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        retl
          mov   %g0, %o0                ! return (0)

        SET_SIZE(xcopyout_little)
2545 
2546 /*
2547  * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
2548  */
2549 
2550         ENTRY(copyin)
2551         cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
2552         bleu,pt %ncc, .copyin_small             ! go to larger cases
2553           xor   %o0, %o1, %o3                   ! are src, dst alignable?
2554         btst    7, %o3                          !
2555         bz,pt   %ncc, .copyin_8                 ! check for longword alignment
2556           nop
2557         btst    1, %o3                          ! 
2558         bz,pt   %ncc, .copyin_2                 ! check for half-word
2559           nop
2560         sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
2561         ld      [%o3 + %lo(hw_copy_limit_1)], %o3
2562         tst     %o3
2563         bz,pn   %icc, .copyin_small             ! if zero, disable HW copy
2564           cmp   %o2, %o3                        ! if length <= limit
2565         bleu,pt %ncc, .copyin_small             ! go to small copy
2566           nop
2567         ba,pt   %ncc, .copyin_more              ! otherwise go to large copy
2568           nop
2569 .copyin_2:
2570         btst    3, %o3                          !
2571         bz,pt   %ncc, .copyin_4                 ! check for word alignment
2572           nop
2573         sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
2574         ld      [%o3 + %lo(hw_copy_limit_2)], %o3
2575         tst     %o3
2576         bz,pn   %icc, .copyin_small             ! if zero, disable HW copy
2577           cmp   %o2, %o3                        ! if length <= limit
2578         bleu,pt %ncc, .copyin_small             ! go to small copy
2579           nop
2580         ba,pt   %ncc, .copyin_more              ! otherwise go to large copy
2581           nop
2582 .copyin_4:
2583         ! already checked longword, must be word aligned
2584         sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
2585         ld      [%o3 + %lo(hw_copy_limit_4)], %o3
2586         tst     %o3
2587         bz,pn   %icc, .copyin_small             ! if zero, disable HW copy
2588           cmp   %o2, %o3                        ! if length <= limit
2589         bleu,pt %ncc, .copyin_small             ! go to small copy
2590           nop
2591         ba,pt   %ncc, .copyin_more              ! otherwise go to large copy
2592           nop
2593 .copyin_8:
2594         sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
2595         ld      [%o3 + %lo(hw_copy_limit_8)], %o3
2596         tst     %o3
2597         bz,pn   %icc, .copyin_small             ! if zero, disable HW copy
2598           cmp   %o2, %o3                        ! if length <= limit
2599         bleu,pt %ncc, .copyin_small             ! go to small copy
2600           nop
2601         ba,pt   %ncc, .copyin_more              ! otherwise go to large copy
2602           nop
2603 
2604         .align  16
2605         nop                             ! instruction alignment
2606                                         ! see discussion at start of file
2607 .copyin_small:
2608         sethi   %hi(.sm_copyin_err), %o5        ! .sm_copyin_err is lofault 
2609         or      %o5, %lo(.sm_copyin_err), %o5
2610         ldn     [THREAD_REG + T_LOFAULT], %o4   ! set/save t_lofault, no tramp
2611         membar  #Sync                           ! sync error barrier
2612         stn     %o5, [THREAD_REG + T_LOFAULT]
2613 .sm_do_copyin:
2614         mov     %o0, SM_SAVE_SRC
2615         mov     %o1, SM_SAVE_DST
2616         cmp     %o2, SHORTCOPY          ! check for really short case
2617         bleu,pt %ncc, .ci_sm_left       !
2618           mov   %o2, SM_SAVE_COUNT
2619         cmp     %o2, CHKSIZE            ! check for medium length cases
2620         bgu,pn  %ncc, .ci_med           !
2621           or    %o0, %o1, %o3           ! prepare alignment check
2622         andcc   %o3, 0x3, %g0           ! test for alignment
2623         bz,pt   %ncc, .ci_sm_word       ! branch to word aligned case
2624 .ci_sm_movebytes:
2625           sub   %o2, 3, %o2             ! adjust count to allow cc zero test
2626 .ci_sm_notalign4:
2627         lduba   [%o0]ASI_USER, %o3      ! read byte
2628         subcc   %o2, 4, %o2             ! reduce count by 4
2629         stb     %o3, [%o1]              ! write byte
2630         add     %o0, 1, %o0             ! advance SRC by 1
2631         lduba   [%o0]ASI_USER, %o3      ! repeat for a total of 4 bytes
2632         add     %o0, 1, %o0             ! advance SRC by 1
2633         stb     %o3, [%o1 + 1]
2634         add     %o1, 4, %o1             ! advance DST by 4
2635         lduba   [%o0]ASI_USER, %o3
2636         add     %o0, 1, %o0             ! advance SRC by 1
2637         stb     %o3, [%o1 - 2]
2638         lduba   [%o0]ASI_USER, %o3
2639         add     %o0, 1, %o0             ! advance SRC by 1
2640         bgt,pt  %ncc, .ci_sm_notalign4  ! loop til 3 or fewer bytes remain
2641           stb   %o3, [%o1 - 1]
2642         add     %o2, 3, %o2             ! restore count
2643 .ci_sm_left:
2644         tst     %o2
2645         bz,pt   %ncc, .ci_sm_exit
2646           nop
2647         lduba   [%o0]ASI_USER, %o3              ! load one byte
2648         deccc   %o2                     ! reduce count for cc test
2649         bz,pt   %ncc, .ci_sm_exit
2650           stb   %o3,[%o1]               ! store one byte
2651         inc     %o0
2652         lduba   [%o0]ASI_USER, %o3      ! load second byte
2653         deccc   %o2
2654         bz,pt   %ncc, .ci_sm_exit
2655           stb   %o3,[%o1 + 1]           ! store second byte
2656         inc     %o0
2657         lduba   [%o0]ASI_USER, %o3      ! load third byte
2658         stb     %o3,[%o1 + 2]           ! store third byte
2659         membar  #Sync                           ! sync error barrier
2660         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2661         retl
2662           mov   %g0, %o0                ! return 0
2663         .align  16
2664 .ci_sm_words:
2665         lduwa   [%o0]ASI_USER, %o3              ! read word
2666 .ci_sm_wordx:
2667         subcc   %o2, 8, %o2             ! update count
2668         stw     %o3, [%o1]              ! write word
2669         add     %o0, 4, %o0             ! update SRC
2670         add     %o1, 8, %o1             ! update DST
2671         lduwa   [%o0]ASI_USER, %o3      ! read word
2672         add     %o0, 4, %o0             ! update SRC
2673         bgt,pt  %ncc, .ci_sm_words      ! loop til done
2674           stw   %o3, [%o1 - 4]          ! write word
2675         addcc   %o2, 7, %o2             ! restore count
2676         bz,pt   %ncc, .ci_sm_exit
2677           nop
2678         deccc   %o2
2679         bz,pt   %ncc, .ci_sm_byte
2680 .ci_sm_half:
2681           subcc %o2, 2, %o2             ! reduce count by 2
2682         lduha   [%o0]ASI_USER, %o3      ! read half word
2683         add     %o0, 2, %o0             ! advance SRC by 2
2684         add     %o1, 2, %o1             ! advance DST by 2
2685         bgt,pt  %ncc, .ci_sm_half       ! loop til done
2686           sth   %o3, [%o1 - 2]          ! write half word
2687         addcc   %o2, 1, %o2             ! restore count
2688         bz,pt   %ncc, .ci_sm_exit
2689           nop
2690 .ci_sm_byte:
2691         lduba   [%o0]ASI_USER, %o3
2692         stb     %o3, [%o1]
2693         membar  #Sync                           ! sync error barrier
2694         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2695         retl
2696           mov   %g0, %o0                ! return 0
2697         .align  16
2698 .ci_sm_word:
2699         subcc   %o2, 4, %o2             ! update count
2700         bgt,pt  %ncc, .ci_sm_wordx
2701           lduwa [%o0]ASI_USER, %o3              ! read word
2702         addcc   %o2, 3, %o2             ! restore count
2703         bz,pt   %ncc, .ci_sm_exit
2704           stw   %o3, [%o1]              ! write word
2705         deccc   %o2                     ! reduce count for cc test
2706         add     %o0, 4, %o0
2707         lduba   [%o0]ASI_USER, %o3      ! load one byte
2708         bz,pt   %ncc, .ci_sm_exit
2709           stb   %o3, [%o1 + 4]          ! store one byte
2710         inc     %o0
2711         lduba   [%o0]ASI_USER, %o3      ! load second byte
2712         deccc   %o2
2713         bz,pt   %ncc, .ci_sm_exit
2714           stb   %o3, [%o1 + 5]          ! store second byte
2715         inc     %o0
2716         lduba   [%o0]ASI_USER, %o3      ! load third byte
2717         stb     %o3, [%o1 + 6]          ! store third byte
2718 .ci_sm_exit:
2719         membar  #Sync                           ! sync error barrier
2720         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2721         retl
2722           mov   %g0, %o0                ! return 0
2723 
2724         .align 16
2725 .ci_med:
2726         xor     %o0, %o1, %o3           ! setup alignment check
2727         btst    1, %o3
2728         bnz,pt  %ncc, .ci_sm_movebytes  ! unaligned
2729           nop
2730         btst    3, %o3
2731         bnz,pt  %ncc, .ci_med_half      ! halfword aligned
2732           nop
2733         btst    7, %o3
2734         bnz,pt  %ncc, .ci_med_word      ! word aligned
2735           nop
2736 .ci_med_long:
2737         btst    3, %o0                  ! check for
2738         bz,pt   %ncc, .ci_med_long1     ! word alignment
2739           nop
2740 .ci_med_long0:
2741         lduba   [%o0]ASI_USER, %o3              ! load one byte
2742         inc     %o0
2743         stb     %o3,[%o1]               ! store byte
2744         inc     %o1
2745         btst    3, %o0
2746         bnz,pt  %ncc, .ci_med_long0
2747           dec   %o2
2748 .ci_med_long1:                  ! word aligned
2749         btst    7, %o0                  ! check for long word
2750         bz,pt   %ncc, .ci_med_long2
2751           nop
2752         lduwa   [%o0]ASI_USER, %o3      ! load word
2753         add     %o0, 4, %o0             ! advance SRC by 4
2754         stw     %o3, [%o1]              ! store word
2755         add     %o1, 4, %o1             ! advance DST by 4
2756         sub     %o2, 4, %o2             ! reduce count by 4
2757 !
2758 !  Now long word aligned and have at least 32 bytes to move
2759 !
2760 .ci_med_long2:
2761         sub     %o2, 31, %o2            ! adjust count to allow cc zero test
2762 .ci_med_lmove:
2763         ldxa    [%o0]ASI_USER, %o3      ! read long word
2764         subcc   %o2, 32, %o2            ! reduce count by 32
2765         stx     %o3, [%o1]              ! write long word
2766         add     %o0, 8, %o0             ! advance SRC by 8
2767         ldxa    [%o0]ASI_USER, %o3      ! repeat for a total for 4 long words
2768         add     %o0, 8, %o0             ! advance SRC by 8
2769         stx     %o3, [%o1 + 8]
2770         add     %o1, 32, %o1            ! advance DST by 32
2771         ldxa    [%o0]ASI_USER, %o3
2772         add     %o0, 8, %o0             ! advance SRC by 8
2773         stx     %o3, [%o1 - 16]
2774         ldxa    [%o0]ASI_USER, %o3
2775         add     %o0, 8, %o0             ! advance SRC by 8
2776         bgt,pt  %ncc, .ci_med_lmove     ! loop til 31 or fewer bytes left
2777           stx   %o3, [%o1 - 8]
2778         addcc   %o2, 24, %o2            ! restore count to long word offset
2779         ble,pt  %ncc, .ci_med_lextra    ! check for more long words to move
2780           nop
2781 .ci_med_lword:
2782         ldxa    [%o0]ASI_USER, %o3      ! read long word
2783         subcc   %o2, 8, %o2             ! reduce count by 8
2784         stx     %o3, [%o1]              ! write long word
2785         add     %o0, 8, %o0             ! advance SRC by 8
2786         bgt,pt  %ncc, .ci_med_lword     ! loop til 7 or fewer bytes left
2787           add   %o1, 8, %o1             ! advance DST by 8
2788 .ci_med_lextra:
2789         addcc   %o2, 7, %o2             ! restore rest of count
2790         bz,pt   %ncc, .ci_sm_exit       ! if zero, then done
2791           deccc %o2
2792         bz,pt   %ncc, .ci_sm_byte
2793           nop
2794         ba,pt   %ncc, .ci_sm_half
2795           nop
2796 
2797         .align 16
2798         nop                             ! instruction alignment
2799                                         ! see discussion at start of file
2800 .ci_med_word:
2801         btst    3, %o0                  ! check for
2802         bz,pt   %ncc, .ci_med_word1     ! word alignment
2803           nop
2804 .ci_med_word0:
2805         lduba   [%o0]ASI_USER, %o3      ! load one byte
2806         inc     %o0
2807         stb     %o3,[%o1]               ! store byte
2808         inc     %o1
2809         btst    3, %o0
2810         bnz,pt  %ncc, .ci_med_word0
2811           dec   %o2
2812 !
2813 !  Now word aligned and have at least 36 bytes to move
2814 !
2815 .ci_med_word1:
2816         sub     %o2, 15, %o2            ! adjust count to allow cc zero test
2817 .ci_med_wmove:
2818         lduwa   [%o0]ASI_USER, %o3      ! read word
2819         subcc   %o2, 16, %o2            ! reduce count by 16
2820         stw     %o3, [%o1]              ! write word
2821         add     %o0, 4, %o0             ! advance SRC by 4
2822         lduwa   [%o0]ASI_USER, %o3      ! repeat for a total for 4 words
2823         add     %o0, 4, %o0             ! advance SRC by 4
2824         stw     %o3, [%o1 + 4]
2825         add     %o1, 16, %o1            ! advance DST by 16
2826         lduwa   [%o0]ASI_USER, %o3
2827         add     %o0, 4, %o0             ! advance SRC by 4
2828         stw     %o3, [%o1 - 8]
2829         lduwa   [%o0]ASI_USER, %o3
2830         add     %o0, 4, %o0             ! advance SRC by 4
2831         bgt,pt  %ncc, .ci_med_wmove     ! loop til 15 or fewer bytes left
2832           stw   %o3, [%o1 - 4]
2833         addcc   %o2, 12, %o2            ! restore count to word offset
2834         ble,pt  %ncc, .ci_med_wextra    ! check for more words to move
2835           nop
2836 .ci_med_word2:
2837         lduwa   [%o0]ASI_USER, %o3      ! read word
2838         subcc   %o2, 4, %o2             ! reduce count by 4
2839         stw     %o3, [%o1]              ! write word
2840         add     %o0, 4, %o0             ! advance SRC by 4
2841         bgt,pt  %ncc, .ci_med_word2     ! loop til 3 or fewer bytes left
2842           add   %o1, 4, %o1             ! advance DST by 4
2843 .ci_med_wextra:
2844         addcc   %o2, 3, %o2             ! restore rest of count
2845         bz,pt   %ncc, .ci_sm_exit       ! if zero, then done
2846           deccc %o2
2847         bz,pt   %ncc, .ci_sm_byte
2848           nop
2849         ba,pt   %ncc, .ci_sm_half
2850           nop
2851 
2852         .align 16
2853         nop                             ! instruction alignment
2854                                         ! see discussion at start of file
2855 .ci_med_half:
2856         btst    1, %o0                  ! check for
2857         bz,pt   %ncc, .ci_med_half1     ! half word alignment
2858           nop
2859         lduba   [%o0]ASI_USER, %o3      ! load one byte
2860         inc     %o0
2861         stb     %o3,[%o1]               ! store byte
2862         inc     %o1
2863         dec     %o2
2864 !
2865 !  Now half word aligned and have at least 38 bytes to move
2866 !
2867 .ci_med_half1:
2868         sub     %o2, 7, %o2             ! adjust count to allow cc zero test
2869 .ci_med_hmove:
2870         lduha   [%o0]ASI_USER, %o3      ! read half word
2871         subcc   %o2, 8, %o2             ! reduce count by 8
2872         sth     %o3, [%o1]              ! write half word
2873         add     %o0, 2, %o0             ! advance SRC by 2
2874         lduha   [%o0]ASI_USER, %o3      ! repeat for a total for 4 halfwords
2875         add     %o0, 2, %o0             ! advance SRC by 2
2876         sth     %o3, [%o1 + 2]
2877         add     %o1, 8, %o1             ! advance DST by 8
2878         lduha   [%o0]ASI_USER, %o3
2879         add     %o0, 2, %o0             ! advance SRC by 2
2880         sth     %o3, [%o1 - 4]
2881         lduha   [%o0]ASI_USER, %o3
2882         add     %o0, 2, %o0             ! advance SRC by 2
2883         bgt,pt  %ncc, .ci_med_hmove     ! loop til 7 or fewer bytes left
2884           sth   %o3, [%o1 - 2]
2885         addcc   %o2, 7, %o2             ! restore count
2886         bz,pt   %ncc, .ci_sm_exit
2887           deccc %o2
2888         bz,pt   %ncc, .ci_sm_byte
2889           nop
2890         ba,pt   %ncc, .ci_sm_half
2891           nop
2892 
2893 .sm_copyin_err:
2894         membar  #Sync
2895         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2896         mov     SM_SAVE_SRC, %o0
2897         mov     SM_SAVE_DST, %o1
2898         mov     SM_SAVE_COUNT, %o2
2899         ldn     [THREAD_REG + T_COPYOPS], %o3   ! check for copyop handler
2900         tst     %o3
2901         bz,pt   %ncc, 3f                        ! if not, return error
2902           nop
2903         ldn     [%o3 + CP_COPYIN], %o5          ! if handler, invoke it with
2904         jmp     %o5                             ! original arguments
2905           nop
2906 3:
2907         retl
2908           or    %g0, -1, %o0            ! return errno value
2909 
2910         SET_SIZE(copyin)
2911 
2912 
2913 /*
2914  * The _more entry points are not intended to be used directly by
2915  * any caller from outside this file.  They are provided to allow
2916  * profiling and dtrace of the portions of the copy code that uses
2917  * the floating point registers.
2918  * This entry is particularly important as DTRACE (at least as of
2919  * 4/2004) does not support leaf functions.
2920  */
2921 
        ENTRY(copyin_more)
.copyin_more:
        ! Heavyweight (FP/VIS) copyin path for copies larger than
        ! VIS_COPY_THRESHOLD.  Get a fresh register window with room for
        ! the HW copy frame (saved %fprs, %gsr and FP register block).
        save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
        set     .copyin_err, REAL_LOFAULT       ! where a fault finally lands

/*
 * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes
 */
.do_copyin:
        set     copyio_fault, %l7               ! copyio_fault is lofault val

        ldn     [THREAD_REG + T_LOFAULT], %l6   ! save existing handler
        membar  #Sync                           ! sync error barrier
        stn     %l7, [THREAD_REG + T_LOFAULT]   ! set t_lofault

        ! Preserve original arguments so a copyops handler (or fault
        ! reporting) can be given the untouched src/dst/count.
        mov     %i0, SAVE_SRC
        mov     %i1, SAVE_DST
        mov     %i2, SAVE_COUNT

        FP_NOMIGRATE(6, 7)                      ! pin thread; FP state is CPU-local

        rd      %fprs, %o2              ! check for unused fp
        st      %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
        btst    FPRS_FEF, %o2
        bz,a,pt %icc, .do_blockcopyin   ! FP unused: just enable it (annulled
          wr    %g0, FPRS_FEF, %fprs    !  slot skips the stack save below)

        BST_FPQ2Q4_TOSTACK(%o2)         ! FP live: spill quads 2-4 to stack

.do_blockcopyin:
        rd      %gsr, %o2
        stx     %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]      ! save gsr
        or      %l6, FPUSED_FLAG, %l6   ! note FP in use for error/exit paths

        ! Align DST to a VIS block boundary before the block-store loop.
        andcc   DST, VIS_BLOCKSIZE - 1, TMP
        mov     ASI_USER, %asi          ! all src loads come from user space
        bz,pt   %ncc, 2f                ! already block aligned
          neg   TMP
        add     TMP, VIS_BLOCKSIZE, TMP

        ! TMP = bytes required to align DST on FP_BLOCK boundary
        ! Using SRC as a tmp here
        cmp     TMP, 3
        bleu,pt %ncc, 1f                ! <= 3 bytes: byte loop at 1: below
          sub   CNT,TMP,CNT             ! adjust main count
        sub     TMP, 3, TMP             ! adjust for end of loop test
.ci_blkalign:
        lduba   [REALSRC]%asi, SRC      ! move 4 bytes per loop iteration
        stb     SRC, [DST]
        subcc   TMP, 4, TMP
        lduba   [REALSRC + 1]%asi, SRC
        add     REALSRC, 4, REALSRC
        stb     SRC, [DST + 1]
        lduba   [REALSRC - 2]%asi, SRC
        add     DST, 4, DST
        stb     SRC, [DST - 2]
        lduba   [REALSRC - 1]%asi, SRC
        bgu,pt  %ncc, .ci_blkalign
          stb   SRC, [DST - 1]

        addcc   TMP, 3, TMP             ! restore count adjustment
        bz,pt   %ncc, 2f                ! no bytes left?
          nop
1:      lduba   [REALSRC]%asi, SRC      ! trailing 1-3 alignment bytes
        inc     REALSRC
        inc     DST
        deccc   TMP
        bgu     %ncc, 1b
          stb   SRC, [DST - 1]

2:
        ! Set up for the faligndata pipeline: SRC is the 8-byte aligned
        ! read pointer, REALSRC keeps the true (possibly unaligned) source
        ! so alignaddr can program the %gsr align offset.
        andn    REALSRC, 0x7, SRC
        alignaddr REALSRC, %g0, %g0

        ! SRC - 8-byte aligned
        ! DST - 64-byte aligned
        ! Prime the software pipeline: prefetch ahead, load the first
        ! block into %f16-%f30 and start aligning into %f48-%f58.
        prefetcha [SRC]%asi, #one_read
        prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #one_read
        prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #one_read
        prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #one_read
        ldda    [SRC]%asi, %f16
#if CHEETAH_PREFETCH > 4
        prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read
#endif
        ldda    [SRC + 0x08]%asi, %f18
#if CHEETAH_PREFETCH > 5
        prefetcha [SRC + (5 * VIS_BLOCKSIZE)]%asi, #one_read
#endif
        ldda    [SRC + 0x10]%asi, %f20
#if CHEETAH_PREFETCH > 6
        prefetcha [SRC + (6 * VIS_BLOCKSIZE)]%asi, #one_read
#endif
        faligndata %f16, %f18, %f48
        ldda    [SRC + 0x18]%asi, %f22
#if CHEETAH_PREFETCH > 7
        prefetcha [SRC + (7 * VIS_BLOCKSIZE)]%asi, #one_read
#endif
        faligndata %f18, %f20, %f50
        ldda    [SRC + 0x20]%asi, %f24
        faligndata %f20, %f22, %f52
        ldda    [SRC + 0x28]%asi, %f26
        faligndata %f22, %f24, %f54
        ldda    [SRC + 0x30]%asi, %f28
        faligndata %f24, %f26, %f56
        ldda    [SRC + 0x38]%asi, %f30
        faligndata %f26, %f28, %f58
        ldda    [SRC + VIS_BLOCKSIZE]%asi, %f16 ! first dword of next block
        sub     CNT, VIS_BLOCKSIZE, CNT
        add     SRC, VIS_BLOCKSIZE, SRC
        add     REALSRC, VIS_BLOCKSIZE, REALSRC
        ba,a,pt %ncc, 1f
          nop
        .align  16
1:
        ! Steady-state loop: while block N-1 streams out via the block
        ! store, block N is loaded and aligned into %f48-%f62.
        ldda    [SRC + 0x08]%asi, %f18
        faligndata %f28, %f30, %f60
        ldda    [SRC + 0x10]%asi, %f20
        faligndata %f30, %f16, %f62
        stda    %f48, [DST]ASI_BLK_P    ! store previous aligned block
        ldda    [SRC + 0x18]%asi, %f22
        faligndata %f16, %f18, %f48
        ldda    [SRC + 0x20]%asi, %f24
        faligndata %f18, %f20, %f50
        ldda    [SRC + 0x28]%asi, %f26
        faligndata %f20, %f22, %f52
        ldda    [SRC + 0x30]%asi, %f28
        faligndata %f22, %f24, %f54
        ldda    [SRC + 0x38]%asi, %f30
        faligndata %f24, %f26, %f56
        sub     CNT, VIS_BLOCKSIZE, CNT
        ldda    [SRC + VIS_BLOCKSIZE]%asi, %f16
        faligndata %f26, %f28, %f58
        prefetcha [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8]%asi, #one_read
        add     DST, VIS_BLOCKSIZE, DST
        prefetcha [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
        add     REALSRC, VIS_BLOCKSIZE, REALSRC
        cmp     CNT, VIS_BLOCKSIZE + 8  ! +8: the pipeline reads one dword ahead
        bgu,pt  %ncc, 1b
          add   SRC, VIS_BLOCKSIZE, SRC

        ! Loop drain.  The fsrc1 path at 2: is taken
        ! only if REALSRC & 0x7 is 0
        cmp     CNT, VIS_BLOCKSIZE
        bne     %ncc, 3f                ! not exactly one block left
          andcc REALSRC, 0x7, %g0
        bz,pt   %ncc, 2f                ! aligned: no faligndata needed
          nop
3:      
        ! Unaligned drain: flush the final in-flight block.
        faligndata %f28, %f30, %f60
        faligndata %f30, %f16, %f62
        stda    %f48, [DST]ASI_BLK_P
        add     DST, VIS_BLOCKSIZE, DST
        ba,pt   %ncc, 3f                ! continue at byte-tail check below
          nop
2:
        ! Aligned drain: exactly one more whole block; copy it with fsrc1
        ! moves (no align shift) and store both pending blocks.
        ldda    [SRC + 0x08]%asi, %f18
        fsrc1   %f28, %f60
        ldda    [SRC + 0x10]%asi, %f20
        fsrc1   %f30, %f62
        stda    %f48, [DST]ASI_BLK_P
        ldda    [SRC + 0x18]%asi, %f22
        fsrc1   %f16, %f48
        ldda    [SRC + 0x20]%asi, %f24
        fsrc1   %f18, %f50
        ldda    [SRC + 0x28]%asi, %f26
        fsrc1   %f20, %f52
        ldda    [SRC + 0x30]%asi, %f28
        fsrc1   %f22, %f54
        ldda    [SRC + 0x38]%asi, %f30
        fsrc1   %f24, %f56
        sub     CNT, VIS_BLOCKSIZE, CNT
        add     DST, VIS_BLOCKSIZE, DST
        add     SRC, VIS_BLOCKSIZE, SRC
        add     REALSRC, VIS_BLOCKSIZE, REALSRC
        fsrc1   %f26, %f58
        fsrc1   %f28, %f60
        fsrc1   %f30, %f62
        stda    %f48, [DST]ASI_BLK_P
        add     DST, VIS_BLOCKSIZE, DST
        ba,a,pt %ncc, 4f                ! CNT now 0 in this path
          nop

3:      tst     CNT                     ! any sub-block tail bytes left?
        bz,a    %ncc, 4f
          nop

5:      lduba   [REALSRC]ASI_USER, TMP  ! byte loop for the final < 64 bytes
        inc     REALSRC
        inc     DST
        deccc   CNT
        bgu     %ncc, 5b
          stb   TMP, [DST - 1]
4:

.copyin_exit:
        membar  #Sync

        ! FP-RAS: periodically verify the FP datapath produced a faithful
        ! copy, and rewrite/repair if the check machinery requires it.
        FPRAS_INTERVAL(FPRAS_COPYIN, 1, %l5, %o2, %o3, %o4, %o5, 8)
        FPRAS_REWRITE_TYPE1(1, %l5, %f48, %o2, 9)
        FPRAS_CHECK(FPRAS_COPYIN, %l5, 9)       ! lose outputs

        ldx     [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! restore gsr
        wr      %o2, 0, %gsr

        ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
        btst    FPRS_FEF, %o3
        bz,pt   %icc, 4f                ! FP was unused on entry
          nop

        BLD_FPQ2Q4_FROMSTACK(%o2)       ! reload caller's FP quads 2-4

        ba,pt   %ncc, 1f
          wr    %o3, 0, %fprs           ! restore fprs

4:
        FZEROQ2Q4                       ! scrub regs we used; don't leak data
        wr      %o3, 0, %fprs           ! restore fprs

1:
        membar  #Sync                           ! sync error barrier
        andn    %l6, FPUSED_FLAG, %l6
        stn     %l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        FP_ALLOWMIGRATE(5, 6)
        ret
          restore       %g0, 0, %o0             ! return (0) - success
/*
 * We got here because of a fault during copyin
 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
 */
.copyin_err:
        ldn     [THREAD_REG + T_COPYOPS], %o4   ! check for copyop handler
        tst     %o4
        bz,pt   %ncc, 2f                        ! if not, return error
        nop
        ldn     [%o4 + CP_COPYIN], %g2          ! if handler, invoke it with
        jmp     %g2                             ! original arguments
        restore %g0, 0, %g0                     ! dispose of copy window
2:
        ret
        restore %g0, -1, %o0                    ! return error value

        SET_SIZE(copyin_more)
3164 
        ENTRY(xcopyin)
        ! xcopyin(src, dst, count): like copyin but returns the errno value
        ! on fault rather than -1.  Dispatch on copy size and on the
        ! combined src/dst alignment to either the small (leaf) copy or
        ! the FP/VIS path, gated by the per-alignment hw_copy_limit_N
        ! tunables (0 disables the HW path entirely).

        cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
        bleu,pt %ncc, .xcopyin_small            ! go to larger cases
          xor   %o0, %o1, %o3                   ! are src, dst alignable?
        btst    7, %o3                          !
        bz,pt   %ncc, .xcopyin_8                ! check for longword alignment
          nop
        btst    1, %o3                          ! 
        bz,pt   %ncc, .xcopyin_2                ! check for half-word
          nop
        sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_1)], %o3
        tst     %o3
        bz,pn   %icc, .xcopyin_small            ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .xcopyin_small            ! go to small copy
          nop
        ba,pt   %ncc, .xcopyin_more             ! otherwise go to large copy
          nop
.xcopyin_2:
        btst    3, %o3                          !
        bz,pt   %ncc, .xcopyin_4                ! check for word alignment
          nop
        sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_2)], %o3
        tst     %o3
        bz,pn   %icc, .xcopyin_small            ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .xcopyin_small            ! go to small copy
          nop
        ba,pt   %ncc, .xcopyin_more             ! otherwise go to large copy
          nop
.xcopyin_4:
        ! already checked longword, must be word aligned
        sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_4)], %o3
        tst     %o3
        bz,pn   %icc, .xcopyin_small            ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .xcopyin_small            ! go to small copy
          nop
        ba,pt   %ncc, .xcopyin_more             ! otherwise go to large copy
          nop
.xcopyin_8:
        sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_8)], %o3
        tst     %o3
        bz,pn   %icc, .xcopyin_small            ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .xcopyin_small            ! go to small copy
          nop
        ba,pt   %ncc, .xcopyin_more             ! otherwise go to large copy
          nop

.xcopyin_small:
        ! Leaf path: install the small-copy fault handler, then share the
        ! common leaf copyin code.
        sethi   %hi(.sm_xcopyin_err), %o5  ! .sm_xcopyin_err is lofault value
        or      %o5, %lo(.sm_xcopyin_err), %o5
        ldn     [THREAD_REG + T_LOFAULT], %o4   ! set/save t_lofault
        membar  #Sync                           ! sync error barrier
        ba,pt   %ncc, .sm_do_copyin             ! common code
          stn   %o5, [THREAD_REG + T_LOFAULT]
        
.xcopyin_more:
        ! FP/VIS path: build the HW copy frame and share .do_copyin,
        ! but fault to .xcopyin_err (errno return) instead of .copyin_err.
        save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
        sethi   %hi(.xcopyin_err), REAL_LOFAULT ! .xcopyin_err is lofault value
        ba,pt   %ncc, .do_copyin
          or    REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT

/*
 * We got here because of fault during xcopyin
 * Errno value is in ERRNO
 */
.xcopyin_err:
        ldn     [THREAD_REG + T_COPYOPS], %o4   ! check for copyop handler
        tst     %o4
        bz,pt   %ncc, 2f                        ! if not, return error
          nop
        ldn     [%o4 + CP_XCOPYIN], %g2         ! if handler, invoke it with
        jmp     %g2                             ! original arguments
          restore %g0, 0, %g0                   ! dispose of copy window
2:
        ret
          restore ERRNO, 0, %o0                 ! return errno value

.sm_xcopyin_err:
        ! Fault in the leaf path: undo the lofault handshake, recover the
        ! original arguments, and either tail-call the copyops handler or
        ! return the errno (left in %g1 by the trap handler).
        membar  #Sync
        stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        mov     SM_SAVE_SRC, %o0
        mov     SM_SAVE_DST, %o1
        mov     SM_SAVE_COUNT, %o2
        ldn     [THREAD_REG + T_COPYOPS], %o3   ! check for copyop handler
        tst     %o3
        bz,pt   %ncc, 3f                        ! if not, return error
          nop
        ldn     [%o3 + CP_XCOPYIN], %o5         ! if handler, invoke it with
        jmp     %o5                             ! original arguments
          nop
3:
        retl
          or    %g1, 0, %o0             ! return errno value

        SET_SIZE(xcopyin)
3269 
        ENTRY(xcopyin_little)
        ! xcopyin_little(src, dst, count): byte copy from the user address
        ! space via the little-endian user ASI (ASI_AIUSL).  The copy is
        ! walked from the last source byte toward the first while the
        ! destination index advances, and returns 0 on success or the
        ! errno value (%g1 from the trap handler) on fault.
        sethi   %hi(.xcopyio_err), %o5
        or      %o5, %lo(.xcopyio_err), %o5
        ldn     [THREAD_REG + T_LOFAULT], %o4   ! save existing handler
        membar  #Sync                           ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! install fault handler
        mov     %o4, %o5                        ! keep old handler in %o5

        subcc   %g0, %o2, %o3           ! %o3 = -count (negative index)
        add     %o0, %o2, %o0
        bz,pn   %ncc, 2f                ! check for zero bytes
          sub   %o2, 1, %o4
        add     %o0, %o4, %o0           ! start w/last byte     
        add     %o1, %o2, %o1           ! bias DST so [%o1 + %o3] = dst
        lduba   [%o0 + %o3]ASI_AIUSL, %o4

1:      stb     %o4, [%o1 + %o3]
        inccc   %o3                     ! index toward zero; sets carry at end
        sub     %o0, 2, %o0             ! get next byte
        bcc,a,pt %ncc, 1b
          lduba [%o0 + %o3]ASI_AIUSL, %o4

2:
        membar  #Sync                           ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        retl
          mov   %g0, %o0                ! return (0)

.xcopyio_err:
        membar  #Sync                           ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        retl
          mov   %g1, %o0                ! return errno value

        SET_SIZE(xcopyin_little)
3305 
3306 
3307 /*
3308  * Copy a block of storage - must not overlap (from + len <= to).
3309  * No fault handler installed (to be called under on_fault())
3310  */
        ENTRY(copyin_noerr)
        ! copyin_noerr(src, dst, count): copyin variant for use under
        ! on_fault() - it installs no final fault handler of its own.  If a
        ! t_lofault is already set, faults unwind through .copyio_noerr /
        ! .sm_copyio_noerr straight back to the caller's handler.
        ! Size/alignment dispatch mirrors copyin/xcopyin above.

        cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
        bleu,pt %ncc, .copyin_ne_small          ! go to larger cases
          xor   %o0, %o1, %o3                   ! are src, dst alignable?
        btst    7, %o3                          !
        bz,pt   %ncc, .copyin_ne_8              ! check for longword alignment
          nop
        btst    1, %o3                          ! 
        bz,pt   %ncc, .copyin_ne_2              ! check for half-word
          nop
        sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_1)], %o3
        tst     %o3
        bz,pn   %icc, .copyin_ne_small          ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .copyin_ne_small          ! go to small copy
          nop
        ba,pt   %ncc, .copyin_noerr_more        ! otherwise go to large copy
          nop
.copyin_ne_2:
        btst    3, %o3                          !
        bz,pt   %ncc, .copyin_ne_4              ! check for word alignment
          nop
        sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_2)], %o3
        tst     %o3
        bz,pn   %icc, .copyin_ne_small          ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .copyin_ne_small          ! go to small copy
          nop
        ba,pt   %ncc, .copyin_noerr_more        ! otherwise go to large copy
          nop
.copyin_ne_4:
        ! already checked longword, must be word aligned
        sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_4)], %o3
        tst     %o3
        bz,pn   %icc, .copyin_ne_small          ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .copyin_ne_small          ! go to small copy
          nop
        ba,pt   %ncc, .copyin_noerr_more        ! otherwise go to large copy
          nop
.copyin_ne_8:
        sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_8)], %o3
        tst     %o3
        bz,pn   %icc, .copyin_ne_small          ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .copyin_ne_small          ! go to small copy
          nop
        ba,pt   %ncc, .copyin_noerr_more        ! otherwise go to large copy
          nop

.copyin_ne_small:
        ldn     [THREAD_REG + T_LOFAULT], %o4
        tst     %o4
        bz,pn   %ncc, .sm_do_copyin     ! no handler set: run unprotected
          nop
        sethi   %hi(.sm_copyio_noerr), %o5
        or      %o5, %lo(.sm_copyio_noerr), %o5
        membar  #Sync                           ! sync error barrier
        ba,pt   %ncc, .sm_do_copyin
          stn   %o5, [THREAD_REG + T_LOFAULT]   ! set/save t_lofault

.copyin_noerr_more:
        save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
        sethi   %hi(.copyio_noerr), REAL_LOFAULT
        ba,pt   %ncc, .do_copyin
          or    REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT

.copyio_noerr:
        ! FP-path fault: %l6 holds the caller's saved t_lofault handler
        ! (set up in .do_copyin); drop our window and jump to it.
        jmp     %l6
          restore %g0,0,%g0

.sm_copyio_noerr:
        ! Leaf-path fault: restore and jump to the caller's handler (%o4).
        membar  #Sync
        stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore t_lofault
        jmp     %o4
          nop

        SET_SIZE(copyin_noerr)
3394 
3395 /*
3396  * Copy a block of storage - must not overlap (from + len <= to).
3397  * No fault handler installed (to be called under on_fault())
3398  */
3399 
        ENTRY(copyout_noerr)
        ! copyout_noerr(src, dst, count): copyout variant for use under
        ! on_fault() - mirror image of copyin_noerr above; faults unwind
        ! through the shared .copyio_noerr / .sm_copyio_noerr stubs to the
        ! caller's existing t_lofault handler.

        cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
        bleu,pt %ncc, .copyout_ne_small         ! go to larger cases
          xor   %o0, %o1, %o3                   ! are src, dst alignable?
        btst    7, %o3                          !
        bz,pt   %ncc, .copyout_ne_8             ! check for longword alignment
          nop
        btst    1, %o3                          ! 
        bz,pt   %ncc, .copyout_ne_2             ! check for half-word
          nop
        sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_1)], %o3
        tst     %o3
        bz,pn   %icc, .copyout_ne_small         ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .copyout_ne_small         ! go to small copy
          nop
        ba,pt   %ncc, .copyout_noerr_more       ! otherwise go to large copy
          nop
.copyout_ne_2:
        btst    3, %o3                          !
        bz,pt   %ncc, .copyout_ne_4             ! check for word alignment
          nop
        sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_2)], %o3
        tst     %o3
        bz,pn   %icc, .copyout_ne_small         ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .copyout_ne_small         ! go to small copy
          nop
        ba,pt   %ncc, .copyout_noerr_more       ! otherwise go to large copy
          nop
.copyout_ne_4:
        ! already checked longword, must be word aligned
        sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_4)], %o3
        tst     %o3
        bz,pn   %icc, .copyout_ne_small         ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .copyout_ne_small         ! go to small copy
          nop
        ba,pt   %ncc, .copyout_noerr_more       ! otherwise go to large copy
          nop
.copyout_ne_8:
        sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_8)], %o3
        tst     %o3
        bz,pn   %icc, .copyout_ne_small         ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .copyout_ne_small         ! go to small copy
          nop
        ba,pt   %ncc, .copyout_noerr_more       ! otherwise go to large copy
          nop

.copyout_ne_small:
        ldn     [THREAD_REG + T_LOFAULT], %o4
        tst     %o4
        bz,pn   %ncc, .sm_do_copyout    ! no handler set: run unprotected
          nop
        sethi   %hi(.sm_copyio_noerr), %o5
        or      %o5, %lo(.sm_copyio_noerr), %o5
        membar  #Sync                           ! sync error barrier
        ba,pt   %ncc, .sm_do_copyout
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! set/save t_lofault

.copyout_noerr_more:
        save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
        sethi   %hi(.copyio_noerr), REAL_LOFAULT
        ba,pt   %ncc, .do_copyout
          or    REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT

        SET_SIZE(copyout_noerr)
3473 
3474 
3475 /*
3476  * hwblkclr - clears block-aligned, block-multiple-sized regions that are
3477  * longer than 256 bytes in length using spitfire's block stores.  If
3478  * the criteria for using this routine are not met then it calls bzero
3479  * and returns 1.  Otherwise 0 is returned indicating success.
3480  * Caller is responsible for ensuring use_hw_bzero is true and that
3481  * kpreempt_disable() has been called.
3482  */
3483         ! %i0 - start address
3484         ! %i1 - length of region (multiple of 64)
3485         ! %l0 - saved fprs
3486         ! %l1 - pointer to saved %d0 block
3487         ! %l2 - saved curthread->t_lwp
3488 
        ENTRY(hwblkclr)
        ! get another window w/space for one aligned block of saved fpregs
        save    %sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp

        ! Must be block-aligned
        andcc   %i0, (VIS_BLOCKSIZE-1), %g0
        bnz,pn  %ncc, 1f
          nop

        ! ... and must be 256 bytes or more
        cmp     %i1, 256
        blu,pn  %ncc, 1f
          nop

        ! ... and length must be a multiple of VIS_BLOCKSIZE
        andcc   %i1, (VIS_BLOCKSIZE-1), %g0
        bz,pn   %ncc, 2f
          nop

1:      ! punt, call bzero but notify the caller that bzero was used
        mov     %i0, %o0
        call    bzero
        mov     %i1, %o1                ! (delay slot: second bzero arg)
        ret
          restore       %g0, 1, %o0 ! return (1) - did not use block operations

2:      rd      %fprs, %l0              ! check for unused fp
        btst    FPRS_FEF, %l0
        bz,pt   %icc, 1f
          nop

        ! save in-use fpregs on stack, on a VIS_BLOCKSIZE-aligned slot
        membar  #Sync
        add     %fp, STACK_BIAS - 65, %l1
        and     %l1, -VIS_BLOCKSIZE, %l1
        stda    %d0, [%l1]ASI_BLK_P

1:      membar  #StoreStore|#StoreLoad|#LoadStore
        wr      %g0, FPRS_FEF, %fprs
        wr      %g0, ASI_BLK_P, %asi

        ! Clear block: %d0-%d14 form one all-zero 64-byte store source
        fzero   %d0
        fzero   %d2
        fzero   %d4
        fzero   %d6
        fzero   %d8
        fzero   %d10
        fzero   %d12
        fzero   %d14

        mov     256, %i3                ! bytes cleared per full pass (4 blocks)
        ba,pt   %ncc, .pz_doblock
          nop

.pz_blkstart:   
      ! stda    %d0, [%i0 + 192]%asi  ! in dly slot of branch that got us here
        stda    %d0, [%i0 + 128]%asi
        stda    %d0, [%i0 + 64]%asi
        stda    %d0, [%i0]%asi
.pz_zinst:
        add     %i0, %i3, %i0
        sub     %i1, %i3, %i1
.pz_doblock:
        cmp     %i1, 256
        bgeu,a  %ncc, .pz_blkstart      ! >= 256 left: do a full 4-block pass
          stda  %d0, [%i0 + 192]%asi    ! (annulled slot is the 4th store)

        cmp     %i1, 64
        blu     %ncc, .pz_finish        ! < one block left: done
        
          andn  %i1, (64-1), %i3        ! %i3 = remaining whole-block bytes
        ! Computed jump back into the .pz_blkstart store sequence: each
        ! remaining block is one 4-byte stda instruction, and one block is
        ! 64 bytes = 16 words, hence the srl by 4 to convert bytes to an
        ! instruction offset before .pz_zinst.
        srl     %i3, 4, %i2             ! using blocks, 1 instr / 16 words
        set     .pz_zinst, %i4
        sub     %i4, %i2, %i4
        jmp     %i4
          nop

.pz_finish:
        membar  #Sync
        btst    FPRS_FEF, %l0
        bz,a    .pz_finished            ! FP was unused: nothing to reload
          wr    %l0, 0, %fprs           ! restore fprs

        ! restore fpregs from stack
        ldda    [%l1]ASI_BLK_P, %d0
        membar  #Sync
        wr      %l0, 0, %fprs           ! restore fprs

.pz_finished:
        ret
          restore       %g0, 0, %o0             ! return (0) - used block ops

        SET_SIZE(hwblkclr)
3583 
3584         /*
3585          * Copy 32 bytes of data from src (%o0) to dst (%o1)
3586          * using physical addresses.
3587          */
        ENTRY_NP(hw_pa_bcopy32)
        ! Copy 32 bytes (four 8-byte words) from physical address %o0 to
        ! physical address %o1 via ASI_MEM, with interrupts disabled for
        ! the duration.
        rdpr    %pstate, %g1
        andn    %g1, PSTATE_IE, %g2     ! clear interrupt-enable bit
        wrpr    %g0, %g2, %pstate

        rdpr    %pstate, %g0            ! discard read; presumably serializes
                                        ! the wrpr - TODO confirm intent

        ! Load all four source words before touching the destination.
        ldxa    [%o0]ASI_MEM, %o2
        add     %o0, 8, %o0
        ldxa    [%o0]ASI_MEM, %o3
        add     %o0, 8, %o0
        ldxa    [%o0]ASI_MEM, %o4
        add     %o0, 8, %o0
        ldxa    [%o0]ASI_MEM, %o5

        stxa    %g0, [%o1]ASI_DC_INVAL  ! invalidate dest line in D-cache
        membar  #Sync

        stxa    %o2, [%o1]ASI_MEM
        add     %o1, 8, %o1
        stxa    %o3, [%o1]ASI_MEM
        add     %o1, 8, %o1
        stxa    %o4, [%o1]ASI_MEM
        add     %o1, 8, %o1
        stxa    %o5, [%o1]ASI_MEM

        retl
          wrpr    %g0, %g1, %pstate     ! re-enable interrupts (old %pstate)

        SET_SIZE(hw_pa_bcopy32)
3617 
        DGDEF(use_hw_bcopy)
        .word   1                       ! tunable: nonzero enables HW bcopy
        DGDEF(use_hw_bzero)
        .word   1                       ! tunable: nonzero enables HW bzero
        DGDEF(hw_copy_limit_1)
        .word   0                       ! min size for HW copy, byte-aligned
                                        ! operands (0 disables HW path)
        DGDEF(hw_copy_limit_2)
        .word   0                       ! ... halfword-aligned operands
        DGDEF(hw_copy_limit_4)
        .word   0                       ! ... word-aligned operands
        DGDEF(hw_copy_limit_8)
        .word   0                       ! ... longword-aligned operands

        .align  64
        .section ".text"