1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 #include <sys/param.h>
  27 #include <sys/errno.h>
  28 #include <sys/asm_linkage.h>
  29 #include <sys/vtrace.h>
  30 #include <sys/machthread.h>
  31 #include <sys/clock.h>
  32 #include <sys/asi.h>
  33 #include <sys/fsr.h>
  34 #include <sys/privregs.h>
  35 
  36 #if !defined(lint)
  37 #include "assym.h"
  38 #endif  /* lint */
  39 
  40 /*
  41  * Pseudo-code to aid in understanding the control flow of the
  42  * bcopy/copyin/copyout routines.
  43  *
  44  * On entry:
  45  *
  46  *      ! Determine whether to use the FP register version
  47  *      ! or the leaf routine version depending on size
  48  *      ! of copy and flags.  Set up error handling accordingly.
  49  *      ! The transition point depends on whether the src and
  50  *      ! dst addresses can be aligned to long word, word,
  51  *      ! half word, or byte boundaries.
  52  *      !
  53  *      ! WARNING: <Register usage convention>
  54  *      ! For FP version, %l6 holds previous error handling and
  55  *      ! a flag: TRAMP_FLAG (low bits)
  56  *      ! for leaf routine version, %o4 holds those values.
  57  *      ! So either %l6 or %o4 is reserved and not available for
  58  *      ! any other use.
  59  *
  60  *      if (length <= VIS_COPY_THRESHOLD)    ! start with a quick test
  61  *              go to small_copy;               ! to speed short copies
  62  *
  63  *      if (src,dst long word alignable) {
  64  *              if (hw_copy_limit_8 == 0)       ! hw_copy disabled
  65  *                      go to small_copy;
  66  *              if (length <= hw_copy_limit_8)
  67  *                      go to small_copy;
  68  *              go to FPBLK_copy;
  69  *      }
  70  *      if (src,dst not alignable) {
  71  *              if (hw_copy_limit_1 == 0)       ! hw_copy disabled
  72  *                      go to small_copy;
  73  *              if (length <= hw_copy_limit_1)
  74  *                      go to small_copy;
  75  *              go to FPBLK_copy;
  76  *      }
  77  *      if (src,dst halfword alignable) {
  78  *              if (hw_copy_limit_2 == 0)       ! hw_copy disabled
  79  *                      go to small_copy;
  80  *              if (length <= hw_copy_limit_2)
  81  *                      go to small_copy;
  82  *              go to FPBLK_copy;
  83  *      }
  84  *      if (src,dst word alignable) {
  85  *              if (hw_copy_limit_4 == 0)       ! hw_copy disabled
  86  *                      go to small_copy;
  87  *              if (length <= hw_copy_limit_4)
  88  *                      go to small_copy;
  89  *              go to FPBLK_copy;
  90  *      }
  91  *
  92  * small_copy:
  93  *      Setup_leaf_rtn_error_handler;           ! diffs for each entry point
  94  *
  95  *      if (count <= 3)                              ! fast path for tiny copies
  96  *              go to sm_left;                  ! special finish up code
  97  *      else
  98  *              if (count > CHKSIZE)         ! medium sized copies
  99  *                      go to sm_med            ! tuned by alignment
 100  *              if(src&dst not both word aligned) {
 101  *      sm_movebytes:
 102  *                      move byte by byte in 4-way unrolled loop
 103  *                      fall into sm_left;
 104  *      sm_left:
 105  *                      move 0-3 bytes byte at a time as needed.
 106  *                      restore error handler and exit.
 107  *
 108  *              } else {        ! src&dst are word aligned
 109  *                      check for at least 8 bytes left,
 110  *                      move word at a time, unrolled by 2
 111  *                      when fewer than 8 bytes left,
 112  *      sm_half:        move half word at a time while 2 or more bytes left
 113  *      sm_byte:        move final byte if necessary
 114  *      sm_exit:
 115  *                      restore error handler and exit.
 116  *              }
 117  *
 118  * ! Medium length cases with at least CHKSIZE bytes available
 119  * ! method: line up src and dst as best possible, then
 120  * ! move data in 4-way unrolled loops.
 121  *
 122  * sm_med:
 123  *      if(src&dst unalignable)
 124  *              go to sm_movebytes
 125  *      if(src&dst halfword alignable)
 126  *              go to sm_movehalf
 127  *      if(src&dst word alignable)
 128  *              go to sm_moveword
 129  * ! fall into long word movement
 130  *      move bytes until src is word aligned
 131  *      if not long word aligned, move a word
 132  *      move long words in 4-way unrolled loop until < 32 bytes left
 133  *      move long words in 1-way unrolled loop until < 8 bytes left
 134  *      if zero bytes left, goto sm_exit
 135  *      if one byte left, go to sm_byte
 136  *      else go to sm_half
 137  *
 138  * sm_moveword:
 139  *      move bytes until src is word aligned
 140  *      move words in 4-way unrolled loop until < 16 bytes left
 141  *      move words in 1-way unrolled loop until < 4 bytes left
 142  *      if zero bytes left, goto sm_exit
 143  *      if one byte left, go to sm_byte
 144  *      else go to sm_half
 145  *
 146  * sm_movehalf:
 147  *      move a byte if needed to align src on halfword
 148  *      move halfwords in 4-way unrolled loop until < 8 bytes left
 149  *      if zero bytes left, goto sm_exit
 150  *      if one byte left, go to sm_byte
 151  *      else go to sm_half
 152  *
 153  *
 154  * FPBLK_copy:
 155  *      %l6 = curthread->t_lofault;
 156  *      if (%l6 != NULL) {
 157  *              membar #Sync
 158  *              curthread->t_lofault = .copyerr;
 159  *              caller_error_handler = TRUE             ! %l6 |= 2
 160  *      }
 161  *
 162  *      ! for FPU testing we must not migrate cpus
 163  *      if (curthread->t_lwp == NULL) {
 164  *              ! Kernel threads do not have pcb's in which to store
 165  *              ! the floating point state, so disallow preemption during
 166  *              ! the copy.  This also prevents cpu migration.
 167  *              kpreempt_disable(curthread);
 168  *      } else {
 169  *              thread_nomigrate();
 170  *      }
 171  *
 172  *      old_fprs = %fprs;
 173  *      old_gsr = %gsr;
 174  *      if (%fprs.fef) {
 175  *              %fprs.fef = 1;
 176  *              save current fpregs on stack using blockstore
 177  *      } else {
 178  *              %fprs.fef = 1;
 179  *      }
 180  *
 181  *
 182  *      do_blockcopy_here;
 183  *
 184  * In lofault handler:
 185  *      curthread->t_lofault = .copyerr2;
 186  *      Continue on with the normal exit handler
 187  *
 188  * On normal exit:
 189  *      %gsr = old_gsr;
 190  *      if (old_fprs & FPRS_FEF)
 191  *              restore fpregs from stack using blockload
 192  *      else
 193  *              zero fpregs
 194  *      %fprs = old_fprs;
 195  *      membar #Sync
 196  *      curthread->t_lofault = (%l6 & ~3);
 197  *      ! following test omitted from copyin/copyout as they
 198  *      ! will always have a current thread
 199  *      if (curthread->t_lwp == NULL)
 200  *              kpreempt_enable(curthread);
 201  *      else
 202  *              thread_allowmigrate();
 203  *      return (0)
 204  *
 205  * In second lofault handler (.copyerr2):
 206  *      We've tried to restore fp state from the stack and failed.  To
 207  *      prevent from returning with a corrupted fp state, we will panic.
 208  */
 209 
 210 /*
 211  * Comments about optimization choices
 212  *
 213  * The initial optimization decision in this code is to determine
 214  * whether to use the FP registers for a copy or not.  If we don't
 215  * use the FP registers, we can execute the copy as a leaf routine,
 216  * saving a register save and restore.  Also, less elaborate setup
 217  * is required, allowing short copies to be completed more quickly.
 218  * For longer copies, especially unaligned ones (where the src and
 219  * dst do not align to allow simple ldx,stx operation), the FP
 220  * registers allow much faster copy operations.
 221  *
 222  * The estimated extra cost of the FP path will vary depending on
 223  * src/dst alignment, dst offset from the next 64 byte FPblock store
 224  * boundary, remaining src data after the last full dst cache line is
 225  * moved, whether the FP registers need to be saved, and some other
 226  * minor issues.  The average additional overhead is estimated to be
 227  * 400 clocks.  Since each non-repeated/predicted tst and branch costs
 228  * around 10 clocks, elaborate calculation would slow down all
 229  * longer copies and only benefit a small portion of medium sized
 230  * copies.  Rather than incur such cost, we chose fixed transition
 231  * points for each of the alignment choices.
 232  *
 233  * For the inner loop, here is a comparison of the per cache line
 234  * costs for each alignment when src&dst are in cache:
 235  *
 236  * byte aligned:  108 clocks slower for non-FPBLK
 237  * half aligned:   44 clocks slower for non-FPBLK
 238  * word aligned:   12 clocks slower for non-FPBLK
 239  * long aligned:    4 clocks >>faster<< for non-FPBLK
 240  *
 241  * The long aligned loop runs faster because it does no prefetching.
 242  * That wins if the data is not in cache or there is too little
 243  * data to gain much benefit from prefetching.  But when there
 244  * is more data and that data is not in cache, failing to prefetch
 245  * can run much slower.  In addition, there is a 2 Kbyte store queue
 246  * which will cause the non-FPBLK inner loop to slow for larger copies.
 247  * The exact tradeoff is strongly load and application dependent, with
 248  * increasing risk of a customer visible performance regression if the
 249  * non-FPBLK code is used for larger copies. Studies of synthetic in-cache
 250  * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
 251  * upper limit for the non-FPBLK code.  To minimize performance regression
 252  * risk while still gaining the primary benefits of the improvements to
 253  * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
 254  * hw_copy_limit_*.  Later experimental studies using different values
 255  * of hw_copy_limit_* can be used to make further adjustments if
 256  * appropriate.
 257  *
 258  * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
 259  * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
 260  * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
 261  * hw_copy_limit_8 = src and dst are longword aligned
 262  *
 263  * To say that src and dst are word aligned means that after
 264  * some initial alignment activity of moving 0 to 3 bytes,
 265  * both the src and dst will be on word boundaries so that
 266  * word loads and stores may be used.
 267  *
 268  * Default values at May,2005 are:
 269  * hw_copy_limit_1 =  256
 270  * hw_copy_limit_2 =  512
 271  * hw_copy_limit_4 = 1024
 272  * hw_copy_limit_8 = 1024 (or 1536 on some systems)
 273  *
 274  *
 275  * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
 276  * disabled for that alignment choice.
 277  * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256)
 278  * the value of VIS_COPY_THRESHOLD is used.
 279  * It is not envisioned that hw_copy_limit_? will be changed in the field
 280  * It is provided to allow for disabling FPBLK copies and to allow
 281  * easy testing of alternate values on future HW implementations
 282  * that might have different cache sizes, clock rates or instruction
 283  * timing rules.
 284  *
 285  * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
 286  * threshold to speedup all shorter copies (less than 256).  That
 287  * saves an alignment test, memory reference, and enabling test
 288  * for all short copies, or an estimated 24 clocks.
 289  *
 290  * The order in which these limits are checked does matter since each
 291  * non-predicted tst and branch costs around 10 clocks.
 292  * If src and dst are randomly selected addresses,
 293  * 4 of 8 will not be alignable.
 294  * 2 of 8 will be half word alignable.
 295  * 1 of 8 will be word alignable.
 296  * 1 of 8 will be long word alignable.
 297  * But, tests on running kernels show that src and dst to copy code
 298  * are typically not on random alignments.  Structure copies and
 299  * copies of larger data sizes are often on long word boundaries.
 300  * So we test the long word alignment case first, then
 301  * the byte alignment, then halfword, then word alignment.
 302  *
 303  * Several times, tests for length are made to split the code
 304  * into subcases.  These tests often allow later tests to be
 305  * avoided.  For example, within the non-FPBLK copy, we first
 306  * check for tiny copies of 3 bytes or less.  That allows us
 307  * to use a 4-way unrolled loop for the general byte copy case
 308  * without a test on loop entry.
 309  * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
 310  * vs longer cases.  For the really short case, we don't attempt to
 311  * align src and dst.  We try to minimize special case tests in
 312  * the shortest loops as each test adds a significant percentage
 313  * to the total time.
 314  *
 315  * For the medium sized cases, we allow ourselves to adjust the
 316  * src and dst alignment and provide special cases for each of
 317  * the four adjusted alignment cases. The CHKSIZE that was used
 318  * to decide between short and medium size was chosen to be 39
 319  * as that allows for the worst case of 7 bytes of alignment
 320  * shift and 4 times 8 bytes for the first long word unrolling.
 321  * That knowledge saves an initial test for length on entry into
 322  * the medium cases.  If the general loop unrolling factor were
 323  * to be increased, this number would also need to be adjusted.
 324  *
 325  * For all cases in the non-FPBLK code where it is known that at
 326  * least 4 chunks of data are available for movement, the
 327  * loop is unrolled by four.  This 4-way loop runs in 8 clocks
 328  * or 2 clocks per data element.
 329  *
 330  * Instruction alignment is forced by use of .align 16 directives
 331  * and nops which are not executed in the code.  This
 332  * combination of operations shifts the alignment of following
 333  * loops to ensure that loops are aligned so that their instructions
 334  * fall within the minimum number of 4 instruction fetch groups.
 335  * If instructions are inserted or removed between the .align
 336  * instruction and the unrolled loops, then the alignment needs
 337  * to be readjusted.  Misaligned loops can add a clock per loop
 338  * iteration to the loop timing.
 339  *
 340  * In a few cases, code is duplicated to avoid a branch.  Since
 341  * a non-predicted tst and branch takes 10 clocks, this savings
 342  * is judged an appropriate time-space tradeoff.
 343  *
 344  * Within the FPBLK-code, the prefetch method in the inner
 345  * loop needs to be explained as it is not standard.  Two
 346  * prefetches are issued for each cache line instead of one.
 347  * The primary one is at the maximum reach of 8 cache lines.
 348  * Most of the time, that maximum prefetch reach gives the
 349  * cache line more time to reach the processor for systems with
 350  * higher processor clocks.  But, sometimes memory interference
 351  * can cause that prefetch to be dropped.  Putting a second
 352  * prefetch at a reach of 5 cache lines catches the drops
 353  * three iterations later and shows a measured improvement
 354  * in performance over any similar loop with a single prefetch.
 355  * The prefetches are placed in the loop so they overlap with
 356  * non-memory instructions, so that there is no extra cost
 357  * when the data is already in-cache.
 358  *
 359  */
 360 
 361 /*
 362  * Notes on preserving existing fp state and on membars.
 363  *
 364  * When a copyOP decides to use fp we may have to preserve existing
 365  * floating point state.  It is not the caller's state that we need to
 366  * preserve - the rest of the kernel does not use fp and, anyway, fp
 367  * registers are volatile across a call.  Some examples:
 368  *
 369  *      - userland has fp state and is interrupted (device interrupt
 370  *        or trap) and within the interrupt/trap handling we use
 371  *        bcopy()
 372  *      - another (higher level) interrupt or trap handler uses bcopy
 373  *        while a bcopy from an earlier interrupt is still active
 374  *      - an asynchronous error trap occurs while fp state exists (in
 375  *        userland or in kernel copy) and the tl0 component of the handling
 376  *        uses bcopy
 377  *      - a user process with fp state incurs a copy-on-write fault and
 378  *        hwblkpagecopy always uses fp
 379  *
 380  * We therefore need a per-call place in which to preserve fp state -
 381  * using our stack is ideal (and since fp copy cannot be leaf optimized
 382  * because of calls it makes, this is no hardship).
 383  *
 384  * When we have finished fp copy (with its repeated block stores)
 385  * we must membar #Sync so that our block stores may complete before
 386  * we either restore the original fp state into the fp registers or
 387  * return to a caller which may initiate other fp operations that could
 388  * modify the fp regs we used before the block stores complete.
 389  *
 390  * Synchronous faults (eg, unresolvable DMMU miss) that occur while
 391  * t_lofault is not NULL will not panic but will instead trampoline
 392  * to the registered lofault handler.  There is no need for any
 393  * membars for these - eg, our store to t_lofault will always be visible to
 394  * ourselves and it is our cpu which will take any trap.
 395  *
 396  * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
 397  * while t_lofault is not NULL will also not panic.  Since we're copying
 398  * to or from userland the extent of the damage is known - the destination
 399  * buffer is incomplete.  So trap handlers will trampoline to the lofault
 400  * handler in this case which should take some form of error action to
 401  * avoid using the incomplete buffer.  The trap handler also flags the
 402  * fault so that later return-from-trap handling (for the trap that brought
 403  * this thread into the kernel in the first place) can notify the process
 404  * and reboot the system (or restart the service with Greenline/Contracts).
 405  *
 406  * Asynchronous faults (eg, uncorrectable ECC error from memory) can
 407  * result in deferred error traps - the trap is taken sometime after
 408  * the event and the trap PC may not be the PC of the faulting access.
 409  * Delivery of such pending traps can be forced by a membar #Sync, acting
 410  * as an "error barrier" in this role.  To accurately apply the user/kernel
 411  * separation described in the preceding paragraph we must force delivery
 412  * of deferred traps affecting kernel state before we install a lofault
 413  * handler (if we interpose a new lofault handler on an existing one there
 414  * is no need to repeat this), and we must force delivery of deferred
 415  * errors affecting the lofault-protected region before we clear t_lofault.
 416  * Failure to do so results in lost kernel state being interpreted as
 417  * affecting a copyin/copyout only, or of an error that really only
 418  * affects copy data being interpreted as losing kernel state.
 419  *
 420  * Since the copy operations may preserve and later restore floating
 421  * point state that does not belong to the caller (see examples above),
 422  * we must be careful in how we do this in order to prevent corruption
 423  * of another program.
 424  *
 425  * To make sure that floating point state is always saved and restored
 426  * correctly, the following "big rules" must be followed when the floating
 427  * point registers will be used:
 428  *
 429  * 1. %l6 always holds the caller's lofault handler.  Also in this register,
 430  *    the 0x1 bit (FPUSED_FLAG) indicates that the floating point registers
 431  *    are in use.  The 0x2 bit (TRAMP_FLAG) indicates that the call was to
 432  *    bcopy, and a lofault handler was set coming in.
 433  *
 434  * 2. The FPUSED flag indicates that all FP state has been successfully stored
 435  *    on the stack.  It should not be set until this save has been completed.
 436  *
 437  * 3. The FPUSED flag should not be cleared on exit until all FP state has
 438  *    been restored from the stack.  If an error occurs while restoring
 439  *    data from the stack, the error handler can check this flag to see if
 440  *    a restore is necessary.
 441  *
 442  * 4. Code run under the new lofault handler must be kept to a minimum.  In
 443  *    particular, any calls to FP_ALLOWMIGRATE, which could result in a call
 444  *    to kpreempt(), should not be made until after the lofault handler has
 445  *    been restored.
 446  */
 447 
 448 /*
 449  * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
 450  * to "break even" using FP/VIS-accelerated memory operations.
 451  * The FPBLK code assumes a minimum number of bytes are available
 452  * to be moved on entry.  Check that code carefully before
 453  * reducing VIS_COPY_THRESHOLD below 256.
 454  */
 455 /*
 456  * This shadows sys/machsystm.h which can't be included due to the lack of
 457  * _ASM guards in include files it references. Change it here, change it there.
 458  */
 459 #define VIS_COPY_THRESHOLD 256  /* bytes; keep in sync with sys/machsystm.h */
 460 
 461 /*
 462  * TEST for very short copies
 463  * Be aware that the maximum unroll for the short unaligned case
 464  * is SHORTCOPY+1
 465  */
 466 #define SHORTCOPY 3     /* bytes; upper bound of the "very short" copy test */
 467 #define CHKSIZE  39     /* bytes; 7 (worst alignment shift) + 4 * 8 (unroll) */
 468 
 469 /*
 470  * TRAMP_FLAG indicates that we're to trampoline to the error handler.
 471  * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
 472  * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
 473  */
 474 #define FPUSED_FLAG     1       /* fp regs in use; fp state saved on stack */
 475 #define TRAMP_FLAG      2       /* trampoline to the error handler */
 476 #define MASK_FLAGS      3       /* FPUSED_FLAG | TRAMP_FLAG */
 477 
 478 /*
 479  * Number of outstanding prefetches.
 480  * first prefetch moves data from L2 to L1 (n_reads)
 481  * second prefetch moves data from memory to L2 (one_read)
 482  */
 483 #define OLYMPUS_C_PREFETCH      24  /* first prefetch; units: confirm at use */
 484 #define OLYMPUS_C_2ND_PREFETCH  12  /* second prefetch; units: confirm at use */
 485 
 486 #define VIS_BLOCKSIZE           64      /* bytes per block load/store */
 487 
 488 /*
 489  * Size of stack frame in order to accommodate a 64-byte aligned
 490  * floating-point register save area and 2 64-bit temp locations.
 491  * All copy functions use two quadrants of fp registers; to assure a
 492  * block-aligned two block buffer in which to save we must reserve
 493  * three blocks on stack.  Not all functions preserve %fprs on stack
 494  * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
 495  *
 496  *    _______________________________________ <-- %fp + STACK_BIAS
 497  *    | We may need to preserve 2 quadrants |
 498  *    | of fp regs, but since we do so with |
 499  *    | BST/BLD we need room in which to    |
 500  *    | align to VIS_BLOCKSIZE bytes.  So   |
 501  *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
 502  *    |-------------------------------------|
 503  *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
 504  *    |-------------------------------------|
 505  *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
 506  *    ---------------------------------------
 507  */
 508 #define HWCOPYFRAMESIZE         ((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8)) /* 208 */
 509 #define SAVED_FPREGS_OFFSET     (VIS_BLOCKSIZE * 3)     /* 192: fp save area */
 510 #define SAVED_FPREGS_ADJUST     ((VIS_BLOCKSIZE * 2) - 1) /* 127: pre-align bias */
 511 #define SAVED_FPRS_OFFSET       (SAVED_FPREGS_OFFSET + 8) /* 200: %fprs slot */
 512 #define SAVED_GSR_OFFSET        (SAVED_FPRS_OFFSET + 8) /* 208: %gsr slot */
 513 
 514 /*
 515  * Common macros used by the various versions of the block copy
 516  * routines in this file.
 517  */
 518 
 519 /*
 520  * In FP copies if we do not have preserved data to restore over
 521  * the fp regs we used then we must zero those regs to avoid
 522  * exposing portions of the data to later threads (data security).
 523  *
 524  * Copy functions use either quadrants 1 and 3 or 2 and 4.
 525  *
 526  * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
 527  * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
 528  *
 529  * The instructions below are quicker than repeated fzero instructions
 530  * since they can dispatch down two fp pipelines.
 531  */
 532 #define FZEROQ1Q3       /* zero %f0-%f15, %f32-%f47 */ \
 533         fzero   %f0     /* zero source for all copies below */ ;\
 534         fmovd   %f0, %f2                ;\
 535         fmovd   %f0, %f4                ;\
 536         fmovd   %f0, %f6                ;\
 537         fmovd   %f0, %f8                ;\
 538         fmovd   %f0, %f10               ;\
 539         fmovd   %f0, %f12               ;\
 540         fmovd   %f0, %f14   /* end of quadrant 1 */ ;\
 541         fmovd   %f0, %f32   /* start of quadrant 3 */ ;\
 542         fmovd   %f0, %f34               ;\
 543         fmovd   %f0, %f36               ;\
 544         fmovd   %f0, %f38               ;\
 545         fmovd   %f0, %f40               ;\
 546         fmovd   %f0, %f42               ;\
 547         fmovd   %f0, %f44               ;\
 548         fmovd   %f0, %f46
 549 
 550 #define FZEROQ2Q4       /* zero %f16-%f31, %f48-%f63 */ \
 551         fzero   %f16    /* zero source; %f0 is NOT zeroed here */ ;\
 552         fmovd   %f16, %f18              ;\
 553         fmovd   %f16, %f20              ;\
 554         fmovd   %f16, %f22              ;\
 555         fmovd   %f16, %f24              ;\
 556         fmovd   %f16, %f26              ;\
 557         fmovd   %f16, %f28              ;\
 558         fmovd   %f16, %f30  /* end of quadrant 2 */ ;\
 559         fmovd   %f16, %f48  /* start of quadrant 4 */ ;\
 560         fmovd   %f16, %f50              ;\
 561         fmovd   %f16, %f52              ;\
 562         fmovd   %f16, %f54              ;\
 563         fmovd   %f16, %f56              ;\
 564         fmovd   %f16, %f58              ;\
 565         fmovd   %f16, %f60              ;\
 566         fmovd   %f16, %f62
 567 
 568 /*
 569  * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
 570  * Used to save and restore in-use fp registers when we want to use FP
 571  * and find fp already in use and copy size still large enough to justify
 572  * the additional overhead of this save and restore.
 573  *
 574  * A membar #Sync is needed before save to sync fp ops initiated before
 575  * the call to the copy function (by whoever has fp in use); for example
 576  * an earlier block load to the quadrant we are about to save may still be
 577  * "in flight".  A membar #Sync is required at the end of the save to
 578  * sync our block store (the copy code is about to begin ldd's to the
 579  * first quadrant).
 580  *
 581  * Similarly: a membar #Sync before restore allows the block stores of
 582  * the copy operation to complete before we fill the quadrants with their
 583  * original data, and a membar #Sync after restore lets the block loads
 584  * of the restore complete before we return to whoever has the fp regs
 585  * in use.  To avoid repeated membar #Sync we make it the responsibility
 586  * of the copy code to membar #Sync immediately after copy is complete
 587  * and before using the BLD_*_FROMSTACK macro.
 588  */
 589 #if !defined(lint)
 590 #define BST_FPQ1Q3_TOSTACK(tmp1)        /* tmp1: scratch, clobbered */ \
 591         /* membar #Sync - NOTE(review): disabled; confirm callers sync */ ;\
 592         add     %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1     ;\
 593         and     tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */    ;\
 594         stda    %f0, [tmp1]ASI_BLK_P    /* Q1: %f0 - %f15 */    ;\
 595         add     tmp1, VIS_BLOCKSIZE, tmp1 /* second block */    ;\
 596         stda    %f32, [tmp1]ASI_BLK_P   /* Q3: %f32 - %f47 */   ;\
 597         membar  #Sync   /* block stores complete before copy uses fp regs */
 598 
 599 #define BLD_FPQ1Q3_FROMSTACK(tmp1)      /* tmp1: scratch, clobbered */ \
 600         /* membar #Sync - provided at copy completion */        ;\
 601         add     %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1     ;\
 602         and     tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */    ;\
 603         ldda    [tmp1]ASI_BLK_P, %f0    /* Q1: %f0 - %f15 */    ;\
 604         add     tmp1, VIS_BLOCKSIZE, tmp1 /* second block */    ;\
 605         ldda    [tmp1]ASI_BLK_P, %f32   /* Q3: %f32 - %f47 */   ;\
 606         membar  #Sync   /* block loads complete before fp owner resumes */
 607 
 608 #define BST_FPQ2Q4_TOSTACK(tmp1)        /* tmp1: scratch, clobbered */ \
 609         /* membar #Sync - NOTE(review): disabled; confirm callers sync */ ;\
 610         add     %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1     ;\
 611         and     tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */    ;\
 612         stda    %f16, [tmp1]ASI_BLK_P   /* Q2: %f16 - %f31 */   ;\
 613         add     tmp1, VIS_BLOCKSIZE, tmp1 /* second block */    ;\
 614         stda    %f48, [tmp1]ASI_BLK_P   /* Q4: %f48 - %f63 */   ;\
 615         membar  #Sync   /* block stores complete before copy uses fp regs */
 616 
 617 #define BLD_FPQ2Q4_FROMSTACK(tmp1)      /* tmp1: scratch, clobbered */ \
 618         /* membar #Sync - provided at copy completion */        ;\
 619         add     %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1     ;\
 620         and     tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */    ;\
 621         ldda    [tmp1]ASI_BLK_P, %f16   /* Q2: %f16 - %f31 */   ;\
 622         add     tmp1, VIS_BLOCKSIZE, tmp1 /* second block */    ;\
 623         ldda    [tmp1]ASI_BLK_P, %f48   /* Q4: %f48 - %f63 */   ;\
 624         membar  #Sync   /* block loads complete before fp owner resumes */
 625 #endif
 626 
 627 /*
 628  * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
 629  * prevent preemption if there is no t_lwp to save FP state to on context
 630  * switch) before commencing a FP copy, and reallow it on completion or
 631  * in error trampoline paths when we were using FP copy.
 632  *
 633  * Both macros may call other functions, so be aware that all outputs are
 634  * forfeit after using these macros.  For this reason we do not pass registers
 635  * to use - we just use any outputs we want.
 636  *
 637  * Pseudo code:
 638  *
 639  * FP_NOMIGRATE:
 640  *
 641  * if (curthread->t_lwp) {
 642  *      thread_nomigrate();
 643  * } else {
 644  *      kpreempt_disable();
 645  * }
 646  *
 647  * FP_ALLOWMIGRATE:
 648  *
 649  * if (curthread->t_lwp) {
 650  *      thread_allowmigrate();
 651  * } else {
 652  *      kpreempt_enable();
 653  * }
 654  */
 655 
 656 #define FP_NOMIGRATE(label1, label2)    /* uses %o0, %o1; may call out */ \
 657         ldn     [THREAD_REG + T_LWP], %o0       /* curthread->t_lwp */ ;\
 658         brz,a,pn %o0, label1/**/f       /* no lwp: disable preemption */ ;\
 659           ldsb  [THREAD_REG + T_PREEMPT], %o1   /* annulled unless taken */ ;\
 660         call    thread_nomigrate        /* have an lwp: just pin to cpu */ ;\
 661           nop                                                   ;\
 662         ba      label2/**/f                                     ;\
 663           nop                                                   ;\
 664 label1:                                                         ;\
 665         inc     %o1                     /* t_preempt count + 1 */ ;\
 666         stb     %o1, [THREAD_REG + T_PREEMPT]   /* kpreempt_disable() */ ;\
 667 label2:
 668 
 669 #define FP_ALLOWMIGRATE(label1, label2)                 \
 670         ldn     [THREAD_REG + T_LWP], %o0 /* %o0 = t_lwp */     ;\
 671         brz,a,pn %o0, label1/**/f /* no lwp: drop t_preempt */  ;\
 672           ldsb  [THREAD_REG + T_PREEMPT], %o1 /* annulled slot: runs only if branch taken */ ;\
 673         call thread_allowmigrate /* has lwp: unpin the thread */ ;\
 674           nop                                                   ;\
 675         ba      label2/**/f /* skip the t_preempt path */       ;\
 676           nop                                                   ;\
 677 label1:                                                         ;\
 678         dec     %o1 /* t_preempt - 1 */                         ;\
 679         brnz,pn %o1, label2/**/f /* still nonzero: done */      ;\
 680           stb   %o1, [THREAD_REG + T_PREEMPT] /* delay slot: always stores new count */ ;\
 681         ldn     [THREAD_REG + T_CPU], %o0 /* %o0 = t_cpu */     ;\
 682         ldub    [%o0 + CPU_KPRUNRUN], %o0 /* %o0 = cpu_kprunrun byte */ ;\
 683         brz,pt  %o0, label2/**/f /* zero: no kpreempt call */   ;\
 684           nop                                                   ;\
 685         call    kpreempt /* count hit 0 and kprunrun is set */  ;\
 686           rdpr  %pil, %o0 /* delay slot: argument is current %pil */ ;\
 687 label2:
 688 
 689 /*
 690  * Copy a block of storage, returning an error code if `from' or
 691  * `to' takes a kernel pagefault which cannot be resolved.
 692  * Returns errno value on pagefault error, 0 if all ok
 693  */
 694 
 695 #if defined(lint)
 696 
 697 /* ARGSUSED */
 698 int
 699 kcopy(const void *from, void *to, size_t count)
 700 { return(0); }
 701 
 702 #else   /* lint */
 703 
 704         .seg    ".text"
 705         .align  4
 706 
 707         ENTRY(kcopy)
 708         ! %o0 = from, %o1 = to, %o2 = count; %o3 is scratch for the dispatch
 709         cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
 710         bleu,pt %ncc, .kcopy_small              ! at/below threshold: small copy
 711           xor   %o0, %o1, %o3                   ! are src, dst alignable?
 712         btst    7, %o3                          ! low 3 bits of src^dst clear?
 713         bz,pt   %ncc, .kcopy_8                  ! yes: longword alignable
 714           nop
 715         btst    1, %o3                          ! low bit of src^dst clear?
 716         bz,pt   %ncc, .kcopy_2                  ! yes: at least half-word alignable
 717           nop
 718         sethi   %hi(hw_copy_limit_1), %o3       ! byte case: check copy limit
 719         ld      [%o3 + %lo(hw_copy_limit_1)], %o3
 720         tst     %o3
 721         bz,pn   %icc, .kcopy_small              ! if zero, disable HW copy
 722           cmp   %o2, %o3                        ! if length <= limit
 723         bleu,pt %ncc, .kcopy_small              ! go to small copy
 724           nop
 725         ba,pt   %ncc, .kcopy_more               ! otherwise go to large copy
 726           nop
 727 .kcopy_2:
 728         btst    3, %o3                          ! low 2 bits of src^dst clear?
 729         bz,pt   %ncc, .kcopy_4                  ! yes: word alignable
 730           nop
 731         sethi   %hi(hw_copy_limit_2), %o3       ! half-word case: check copy limit
 732         ld      [%o3 + %lo(hw_copy_limit_2)], %o3
 733         tst     %o3
 734         bz,pn   %icc, .kcopy_small              ! if zero, disable HW copy
 735           cmp   %o2, %o3                        ! if length <= limit
 736         bleu,pt %ncc, .kcopy_small              ! go to small copy
 737           nop
 738         ba,pt   %ncc, .kcopy_more               ! otherwise go to large copy
 739           nop
 740 .kcopy_4:
 741         ! already checked longword, must be word aligned
 742         sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
 743         ld      [%o3 + %lo(hw_copy_limit_4)], %o3
 744         tst     %o3
 745         bz,pn   %icc, .kcopy_small              ! if zero, disable HW copy
 746           cmp   %o2, %o3                        ! if length <= limit
 747         bleu,pt %ncc, .kcopy_small              ! go to small copy
 748           nop
 749         ba,pt   %ncc, .kcopy_more               ! otherwise go to large copy
 750           nop
 751 .kcopy_8:
 752         sethi   %hi(hw_copy_limit_8), %o3       ! longword case: check copy limit
 753         ld      [%o3 + %lo(hw_copy_limit_8)], %o3
 754         tst     %o3
 755         bz,pn   %icc, .kcopy_small              ! if zero, disable HW copy
 756           cmp   %o2, %o3                        ! if length <= limit
 757         bleu,pt %ncc, .kcopy_small              ! go to small copy
 758           nop
 759         ba,pt   %ncc, .kcopy_more               ! otherwise go to large copy
 760           nop
 761         ! Small path: stays a leaf routine; old handler is kept in %o4
 762 .kcopy_small:
 763         sethi   %hi(.sm_copyerr), %o5           ! sm_copyerr is lofault value
 764         or      %o5, %lo(.sm_copyerr), %o5
 765         ldn     [THREAD_REG + T_LOFAULT], %o4   ! save existing handler
 766         membar  #Sync                           ! sync error barrier
 767         ba,pt   %ncc, .sm_do_copy               ! common code (shared with bcopy)
 768          stn    %o5, [THREAD_REG + T_LOFAULT]   ! set t_lofault (delay slot)
 769         ! Large path: takes a register window; old handler is kept in %l6
 770 .kcopy_more:
 771         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
 772         sethi   %hi(.copyerr), %l7              ! copyerr is lofault value
 773         or      %l7, %lo(.copyerr), %l7
 774         ldn     [THREAD_REG + T_LOFAULT], %l6   ! save existing handler
 775         membar  #Sync                           ! sync error barrier
 776         ba,pt   %ncc, .do_copy                  ! common code (shared with bcopy_more)
 777           stn   %l7, [THREAD_REG + T_LOFAULT]   ! set t_lofault (delay slot)
 778 
 779 
 780 /*
 781  * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
 782  * Errno value is in %g1.  bcopy_more uses fp quadrants 1 and 3.
 783  */
 784 .copyerr:
 785         set     .copyerr2, %l0                  ! a fault in here goes to the panic path
 786         membar  #Sync                           ! sync error barrier
 787         stn     %l0, [THREAD_REG + T_LOFAULT]   ! set t_lofault
 788         btst    FPUSED_FLAG, %l6                ! was FP state touched?
 789         bz      %ncc, 1f                        ! no: nothing to restore
 790           and   %l6, TRAMP_FLAG, %l0            ! copy trampoline flag to %l0
 791 
 792         ldx     [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! restore gsr
 793         wr      %o2, 0, %gsr
 794 
 795         ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3     ! %o3 = saved %fprs
 796         btst    FPRS_FEF, %o3                   ! was FP enabled on entry?
 797         bz,pt   %icc, 4f                        ! no: just zero the registers
 798           nop
 799 
 800         BLD_FPQ1Q3_FROMSTACK(%o2)
 801 
 802         ba,pt   %ncc, 1f
 803           wr    %o3, 0, %fprs           ! restore fprs
 804 
 805 4:
 806         FZEROQ1Q3
 807         wr      %o3, 0, %fprs           ! restore fprs
 808 
 809         !
 810         ! Need to cater for the different expectations of kcopy
 811         ! and bcopy. kcopy will *always* set a t_lofault handler
 812         ! If it fires, we're expected to just return the error code
 813         ! and *not* to invoke any existing error handler. As far as
 814         ! bcopy is concerned, we only set t_lofault if there was an
 815         ! existing lofault handler. In that case we're expected to
 816         ! invoke the previously existing handler after resetting the
 817         ! t_lofault value.
 818         !
 819 1:
 820         andn    %l6, MASK_FLAGS, %l6            ! turn trampoline flag off
 821         membar  #Sync                           ! sync error barrier
 822         stn     %l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
 823         FP_ALLOWMIGRATE(5, 6)
 824         ! NOTE(review): %g1 (errno) is live across the macro above, which may call out - confirm it survives
 825         btst    TRAMP_FLAG, %l0                 ! did we come in via bcopy?
 826         bnz,pn  %ncc, 3f                        ! yes: chain to the old handler
 827           nop
 828         ret                                     ! kcopy: return errno
 829           restore       %g1, 0, %o0             ! caller's %o0 = %g1
 830 
 831 3:
 832         !
 833         ! We're here via bcopy. There *must* have been an error handler
 834         ! in place otherwise we would have died a nasty death already.
 835         !
 836         jmp     %l6                             ! goto real handler
 837           restore       %g0, 0, %o0             ! dispose of copy window
 838 
 839 /*
 840  * We got here because of a fault in .copyerr.  We can't safely restore fp
 841  * state, so we panic.
 842  */
 843 fp_panic_msg:
 844         .asciz  "Unable to restore fp state after copy operation"
 845 
 846         .align  4
 847 .copyerr2:
 848         set     fp_panic_msg, %o0               ! panic() argument: message string
 849         call    panic
 850           nop
 851 
 852 /*
 853  * We got here because of a fault during a small kcopy or bcopy.
 854  * No floating point registers are used by the small copies.
 855  * Errno value is in %g1.
 856  */
 857 .sm_copyerr:
 858 1:
 859         btst    TRAMP_FLAG, %o4                 ! set only on the bcopy path
 860         membar  #Sync
 861         andn    %o4, TRAMP_FLAG, %o4            ! strip flag to recover the handler address
 862         bnz,pn  %ncc, 3f                        ! bcopy: chain to the old handler
 863           stn   %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
 864         retl                                    ! kcopy: return errno
 865           mov   %g1, %o0
 866 3:
 867         jmp     %o4                             ! goto real handler
 868           mov   %g0, %o0                        ! with %o0 cleared
 869 
 870         SET_SIZE(kcopy)
 871 #endif  /* lint */
 872 
 873 
 874 /*
 875  * Copy a block of storage - must not overlap (from + len <= to).
 876  * Registers: l6 - saved t_lofault
 877  * (for short copies, o4 - saved t_lofault)
 878  *
 879  * No alignment or minimum count is assumed: short copies run in the leaf
 880  * code below; larger ones (see hw_copy_limit_N) are handed to bcopy_more.
 881  */
 882 #if defined(lint)
 883 
 884 /* ARGSUSED */
 885 void
 886 bcopy(const void *from, void *to, size_t count)
 887 {}
 888 
 889 #else   /* lint */
 890 
 891         ENTRY(bcopy)
 892         ! %o0 = from, %o1 = to, %o2 = count; %o3 is scratch, %o4 = saved t_lofault
 893         cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
 894         bleu,pt %ncc, .bcopy_small              ! at/below threshold: small copy
 895           xor   %o0, %o1, %o3                   ! are src, dst alignable?
 896         btst    7, %o3                          ! low 3 bits of src^dst clear?
 897         bz,pt   %ncc, .bcopy_8                  ! yes: longword alignable
 898           nop
 899         btst    1, %o3                          ! low bit of src^dst clear?
 900         bz,pt   %ncc, .bcopy_2                  ! yes: at least half-word alignable
 901           nop
 902         sethi   %hi(hw_copy_limit_1), %o3       ! byte case: check copy limit
 903         ld      [%o3 + %lo(hw_copy_limit_1)], %o3
 904         tst     %o3
 905         bz,pn   %icc, .bcopy_small              ! if zero, disable HW copy
 906           cmp   %o2, %o3                        ! if length <= limit
 907         bleu,pt %ncc, .bcopy_small              ! go to small copy
 908           nop
 909         ba,pt   %ncc, .bcopy_more               ! otherwise go to large copy
 910           nop
 911 .bcopy_2:
 912         btst    3, %o3                          ! low 2 bits of src^dst clear?
 913         bz,pt   %ncc, .bcopy_4                  ! yes: word alignable
 914           nop
 915         sethi   %hi(hw_copy_limit_2), %o3       ! half-word case: check copy limit
 916         ld      [%o3 + %lo(hw_copy_limit_2)], %o3
 917         tst     %o3
 918         bz,pn   %icc, .bcopy_small              ! if zero, disable HW copy
 919           cmp   %o2, %o3                        ! if length <= limit
 920         bleu,pt %ncc, .bcopy_small              ! go to small copy
 921           nop
 922         ba,pt   %ncc, .bcopy_more               ! otherwise go to large copy
 923           nop
 924 .bcopy_4:
 925         ! already checked longword, must be word aligned
 926         sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
 927         ld      [%o3 + %lo(hw_copy_limit_4)], %o3
 928         tst     %o3
 929         bz,pn   %icc, .bcopy_small              ! if zero, disable HW copy
 930           cmp   %o2, %o3                        ! if length <= limit
 931         bleu,pt %ncc, .bcopy_small              ! go to small copy
 932           nop
 933         ba,pt   %ncc, .bcopy_more               ! otherwise go to large copy
 934           nop
 935 .bcopy_8:
 936         sethi   %hi(hw_copy_limit_8), %o3       ! longword case: check copy limit
 937         ld      [%o3 + %lo(hw_copy_limit_8)], %o3
 938         tst     %o3
 939         bz,pn   %icc, .bcopy_small              ! if zero, disable HW copy
 940           cmp   %o2, %o3                        ! if length <= limit
 941         bleu,pt %ncc, .bcopy_small              ! go to small copy
 942           nop
 943         ba,pt   %ncc, .bcopy_more               ! otherwise go to large copy
 944           nop
 945 
 946         .align  16
 947 .bcopy_small:
 948         ldn     [THREAD_REG + T_LOFAULT], %o4   ! save t_lofault
 949         tst     %o4
 950         bz,pt   %ncc, .sm_do_copy               ! no handler (pointer-width test, as in bcopy_more)
 951           nop
 952         sethi   %hi(.sm_copyerr), %o5
 953         or      %o5, %lo(.sm_copyerr), %o5
 954         membar  #Sync                           ! sync error barrier
 955         stn     %o5, [THREAD_REG + T_LOFAULT]   ! install new vector
 956         or      %o4, TRAMP_FLAG, %o4            ! error should trampoline
 957 .sm_do_copy:
 958         cmp     %o2, SHORTCOPY          ! check for really short case
 959         bleu,pt %ncc, .bc_sm_left       ! yes: at most 3 single-byte moves
 960           cmp   %o2, CHKSIZE            ! check for medium length cases
 961         bgu,pn  %ncc, .bc_med           ! above CHKSIZE: medium copy
 962           or    %o0, %o1, %o3           ! prepare alignment check
 963         andcc   %o3, 0x3, %g0           ! test for alignment
 964         bz,pt   %ncc, .bc_sm_word       ! branch to word aligned case
 965 .bc_sm_movebytes:
 966           sub   %o2, 3, %o2             ! adjust count to allow cc zero test (delay slot: also runs when branching to .bc_sm_word)
 967 .bc_sm_notalign4:
 968         ldub    [%o0], %o3              ! read byte
 969         stb     %o3, [%o1]              ! write byte
 970         subcc   %o2, 4, %o2             ! reduce count by 4
 971         ldub    [%o0 + 1], %o3          ! repeat for a total of 4 bytes
 972         add     %o0, 4, %o0             ! advance SRC by 4
 973         stb     %o3, [%o1 + 1]
 974         ldub    [%o0 - 2], %o3
 975         add     %o1, 4, %o1             ! advance DST by 4
 976         stb     %o3, [%o1 - 2]
 977         ldub    [%o0 - 1], %o3
 978         bgt,pt  %ncc, .bc_sm_notalign4  ! loop til 3 or fewer bytes remain
 979           stb   %o3, [%o1 - 1]
 980         add     %o2, 3, %o2             ! restore count
 981 .bc_sm_left:
 982         tst     %o2
 983         bz,pt   %ncc, .bc_sm_exit       ! check for zero length
 984           deccc %o2                     ! reduce count for cc test
 985         ldub    [%o0], %o3              ! move one byte
 986         bz,pt   %ncc, .bc_sm_exit
 987           stb   %o3, [%o1]
 988         ldub    [%o0 + 1], %o3          ! move another byte
 989         deccc   %o2                     ! check for more
 990         bz,pt   %ncc, .bc_sm_exit
 991           stb   %o3, [%o1 + 1]
 992         ldub    [%o0 + 2], %o3          ! move final byte
 993         ba,pt   %ncc, .bc_sm_exit
 994           stb   %o3, [%o1 + 2]
 995         .align  16
 996         nop                             ! instruction alignment
 997                                         ! see discussion at start of file
 998 .bc_sm_words:
 999         lduw    [%o0], %o3              ! read word
1000 .bc_sm_wordx:
1001         subcc   %o2, 8, %o2             ! update count
1002         stw     %o3, [%o1]              ! write word
1003         add     %o0, 8, %o0             ! update SRC
1004         lduw    [%o0 - 4], %o3          ! read word
1005         add     %o1, 8, %o1             ! update DST
1006         bgt,pt  %ncc, .bc_sm_words      ! loop til done
1007           stw   %o3, [%o1 - 4]          ! write word
1008         addcc   %o2, 7, %o2             ! restore count
1009         bz,pt   %ncc, .bc_sm_exit
1010           deccc %o2
1011         bz,pt   %ncc, .bc_sm_byte
1012 .bc_sm_half:
1013           subcc %o2, 2, %o2             ! reduce count by 2
1014         add     %o0, 2, %o0             ! advance SRC by 2
1015         lduh    [%o0 - 2], %o3          ! read half word
1016         add     %o1, 2, %o1             ! advance DST by 2
1017         bgt,pt  %ncc, .bc_sm_half       ! loop til done
1018           sth   %o3, [%o1 - 2]          ! write half word
1019         addcc   %o2, 1, %o2             ! restore count
1020         bz,pt   %ncc, .bc_sm_exit
1021           nop
1022 .bc_sm_byte:
1023         ldub    [%o0], %o3
1024         ba,pt   %ncc, .bc_sm_exit
1025           stb   %o3, [%o1]
1026 
1027 .bc_sm_word:
1028         subcc   %o2, 4, %o2             ! update count
1029         bgt,pt  %ncc, .bc_sm_wordx
1030           lduw  [%o0], %o3              ! read word
1031         addcc   %o2, 3, %o2             ! restore count
1032         bz,pt   %ncc, .bc_sm_exit
1033           stw   %o3, [%o1]              ! write word
1034         deccc   %o2                     ! reduce count for cc test
1035         ldub    [%o0 + 4], %o3          ! load one byte
1036         bz,pt   %ncc, .bc_sm_exit
1037           stb   %o3, [%o1 + 4]          ! store one byte
1038         ldub    [%o0 + 5], %o3          ! load second byte
1039         deccc   %o2
1040         bz,pt   %ncc, .bc_sm_exit
1041           stb   %o3, [%o1 + 5]          ! store second byte
1042         ldub    [%o0 + 6], %o3          ! load third byte
1043         stb     %o3, [%o1 + 6]          ! store third byte
1044 .bc_sm_exit:
1045         ldn     [THREAD_REG + T_LOFAULT], %o3
1046         brz,pt  %o3, .bc_sm_done                ! nothing installed: nothing to restore
1047           nop
1048         membar  #Sync                           ! sync error barrier
1049         andn    %o4, TRAMP_FLAG, %o4            ! strip flag to recover the handler address
1050         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1051 .bc_sm_done:
1052         retl
1053           mov   %g0, %o0                ! return 0
1054 
1055         .align 16
1056 .bc_med:
1057         xor     %o0, %o1, %o3           ! setup alignment check
1058         btst    1, %o3
1059         bnz,pt  %ncc, .bc_sm_movebytes  ! unaligned
1060           nop
1061         btst    3, %o3
1062         bnz,pt  %ncc, .bc_med_half      ! halfword aligned
1063           nop
1064         btst    7, %o3
1065         bnz,pt  %ncc, .bc_med_word      ! word aligned
1066           nop
1067 .bc_med_long:
1068         btst    3, %o0                  ! check for
1069         bz,pt   %ncc, .bc_med_long1     ! word alignment
1070           nop
1071 .bc_med_long0:
1072         ldub    [%o0], %o3              ! load one byte
1073         inc     %o0
1074         stb     %o3,[%o1]               ! store byte
1075         inc     %o1
1076         btst    3, %o0
1077         bnz,pt  %ncc, .bc_med_long0
1078           dec   %o2
1079 .bc_med_long1:                  ! word aligned
1080         btst    7, %o0                  ! check for long word
1081         bz,pt   %ncc, .bc_med_long2
1082           nop
1083         lduw    [%o0], %o3              ! load word
1084         add     %o0, 4, %o0             ! advance SRC by 4
1085         stw     %o3, [%o1]              ! store word
1086         add     %o1, 4, %o1             ! advance DST by 4
1087         sub     %o2, 4, %o2             ! reduce count by 4
1088 !
1089 !  Now long word aligned and have at least 32 bytes to move
1090 !
1091 .bc_med_long2:
1092         sub     %o2, 31, %o2            ! adjust count to allow cc zero test
1093 .bc_med_lmove:
1094         ldx     [%o0], %o3              ! read long word
1095         stx     %o3, [%o1]              ! write long word
1096         subcc   %o2, 32, %o2            ! reduce count by 32
1097         ldx     [%o0 + 8], %o3          ! repeat for a total for 4 long words
1098         add     %o0, 32, %o0            ! advance SRC by 32
1099         stx     %o3, [%o1 + 8]
1100         ldx     [%o0 - 16], %o3
1101         add     %o1, 32, %o1            ! advance DST by 32
1102         stx     %o3, [%o1 - 16]
1103         ldx     [%o0 - 8], %o3
1104         bgt,pt  %ncc, .bc_med_lmove     ! loop til 31 or fewer bytes left
1105           stx   %o3, [%o1 - 8]
1106         addcc   %o2, 24, %o2            ! restore count to long word offset
1107         ble,pt  %ncc, .bc_med_lextra    ! check for more long words to move
1108           nop
1109 .bc_med_lword:
1110         ldx     [%o0], %o3              ! read long word
1111         subcc   %o2, 8, %o2             ! reduce count by 8
1112         stx     %o3, [%o1]              ! write long word
1113         add     %o0, 8, %o0             ! advance SRC by 8
1114         bgt,pt  %ncc, .bc_med_lword     ! loop til 7 or fewer bytes left
1115           add   %o1, 8, %o1             ! advance DST by 8
1116 .bc_med_lextra:
1117         addcc   %o2, 7, %o2             ! restore rest of count
1118         bz,pt   %ncc, .bc_sm_exit       ! if zero, then done
1119           deccc %o2
1120         bz,pt   %ncc, .bc_sm_byte
1121           nop
1122         ba,pt   %ncc, .bc_sm_half
1123           nop
1124 
1125         .align 16
1126 .bc_med_word:
1127         btst    3, %o0                  ! check for
1128         bz,pt   %ncc, .bc_med_word1     ! word alignment
1129           nop
1130 .bc_med_word0:
1131         ldub    [%o0], %o3              ! load one byte
1132         inc     %o0
1133         stb     %o3,[%o1]               ! store byte
1134         inc     %o1
1135         btst    3, %o0
1136         bnz,pt  %ncc, .bc_med_word0
1137           dec   %o2
1138 !
1139 !  Now word aligned and have at least 36 bytes to move
1140 !
1141 .bc_med_word1:
1142         sub     %o2, 15, %o2            ! adjust count to allow cc zero test
1143 .bc_med_wmove:
1144         lduw    [%o0], %o3              ! read word
1145         stw     %o3, [%o1]              ! write word
1146         subcc   %o2, 16, %o2            ! reduce count by 16
1147         lduw    [%o0 + 4], %o3          ! repeat for a total for 4 words
1148         add     %o0, 16, %o0            ! advance SRC by 16
1149         stw     %o3, [%o1 + 4]
1150         lduw    [%o0 - 8], %o3
1151         add     %o1, 16, %o1            ! advance DST by 16
1152         stw     %o3, [%o1 - 8]
1153         lduw    [%o0 - 4], %o3
1154         bgt,pt  %ncc, .bc_med_wmove     ! loop til 15 or fewer bytes left
1155           stw   %o3, [%o1 - 4]
1156         addcc   %o2, 12, %o2            ! restore count to word offset
1157         ble,pt  %ncc, .bc_med_wextra    ! check for more words to move
1158           nop
1159 .bc_med_word2:
1160         lduw    [%o0], %o3              ! read word
1161         subcc   %o2, 4, %o2             ! reduce count by 4
1162         stw     %o3, [%o1]              ! write word
1163         add     %o0, 4, %o0             ! advance SRC by 4
1164         bgt,pt  %ncc, .bc_med_word2     ! loop til 3 or fewer bytes left
1165           add   %o1, 4, %o1             ! advance DST by 4
1166 .bc_med_wextra:
1167         addcc   %o2, 3, %o2             ! restore rest of count
1168         bz,pt   %ncc, .bc_sm_exit       ! if zero, then done
1169           deccc %o2
1170         bz,pt   %ncc, .bc_sm_byte
1171           nop
1172         ba,pt   %ncc, .bc_sm_half
1173           nop
1174 
1175         .align 16
1176 .bc_med_half:
1177         btst    1, %o0                  ! check for
1178         bz,pt   %ncc, .bc_med_half1     ! half word alignment
1179           nop
1180         ldub    [%o0], %o3              ! load one byte
1181         inc     %o0
1182         stb     %o3,[%o1]               ! store byte
1183         inc     %o1
1184         dec     %o2
1185 !
1186 !  Now half word aligned and have at least 38 bytes to move
1187 !
1188 .bc_med_half1:
1189         sub     %o2, 7, %o2             ! adjust count to allow cc zero test
1190 .bc_med_hmove:
1191         lduh    [%o0], %o3              ! read half word
1192         sth     %o3, [%o1]              ! write half word
1193         subcc   %o2, 8, %o2             ! reduce count by 8
1194         lduh    [%o0 + 2], %o3          ! repeat for a total for 4 halfwords
1195         add     %o0, 8, %o0             ! advance SRC by 8
1196         sth     %o3, [%o1 + 2]
1197         lduh    [%o0 - 4], %o3
1198         add     %o1, 8, %o1             ! advance DST by 8
1199         sth     %o3, [%o1 - 4]
1200         lduh    [%o0 - 2], %o3
1201         bgt,pt  %ncc, .bc_med_hmove     ! loop til 7 or fewer bytes left
1202           sth   %o3, [%o1 - 2]
1203         addcc   %o2, 7, %o2             ! restore count
1204         bz,pt   %ncc, .bc_sm_exit
1205           deccc %o2
1206         bz,pt   %ncc, .bc_sm_byte
1207           nop
1208         ba,pt   %ncc, .bc_sm_half
1209           nop
1210 
1211         SET_SIZE(bcopy)
1212 
1213 /*
1214  * The _more entry points are not intended to be used directly by
1215  * any caller from outside this file.  They are provided to allow
1216  * profiling and dtrace of the portions of the copy code that uses
1217  * the floating point registers.
1218  * This entry is particularly important as DTRACE (at least as of
1219  * 4/2004) does not support leaf functions.
1220  */
1221 
1222         ENTRY(bcopy_more)
1223 .bcopy_more:
1224         prefetch [%o0], #n_reads
1225         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1226         ldn     [THREAD_REG + T_LOFAULT], %l6   ! save t_lofault
1227         tst     %l6
1228         bz,pt   %ncc, .do_copy                  ! no handler: leave t_lofault alone
1229           nop
1230         sethi   %hi(.copyerr), %o2
1231         or      %o2, %lo(.copyerr), %o2
1232         membar  #Sync                           ! sync error barrier
1233         stn     %o2, [THREAD_REG + T_LOFAULT]   ! install new vector
1234         !
1235         ! We've already captured whether t_lofault was zero on entry.
1236         ! We need to mark ourselves as being from bcopy since both
1237         ! kcopy and bcopy use the same code path. If TRAMP_FLAG is set
1238         ! and the saved lofault was zero, we won't reset lofault on
1239         ! returning.
1240         !
1241         or      %l6, TRAMP_FLAG, %l6
1242 
1243 /*
1244  * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes
1245  * Also, use of FP registers has been tested to be enabled
1246  */
1247 .do_copy:
1248         FP_NOMIGRATE(6, 7)
1249 
1250         rd      %fprs, %o2              ! check for unused fp
1251         st      %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
1252         btst    FPRS_FEF, %o2
1253         bz,a,pt %icc, .do_blockcopy     ! FP was off: nothing to save
1254           wr    %g0, FPRS_FEF, %fprs    ! annulled slot: enable FP only when branch taken
1255 
1256         BST_FPQ1Q3_TOSTACK(%o2)
1257 
1258 .do_blockcopy:
1259         rd      %gsr, %o2
1260         stx     %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]      ! save gsr
1261         or      %l6, FPUSED_FLAG, %l6   ! tell .copyerr that FP state needs restoring
1262 
1263 #define REALSRC %i0
1264 #define DST     %i1
1265 #define CNT     %i2
1266 #define SRC     %i3
1267 #define TMP     %i5
1268 
1269         andcc   DST, VIS_BLOCKSIZE - 1, TMP     ! TMP = DST offset within a block
1270         bz,pt   %ncc, 2f                        ! DST already block aligned
1271           neg   TMP
1272         add     TMP, VIS_BLOCKSIZE, TMP         ! TMP = VIS_BLOCKSIZE - offset
1273 
1274         ! TMP = bytes required to align DST on FP_BLOCK boundary
1275         ! Using SRC as a tmp here
1276         cmp     TMP, 3
1277         bleu,pt %ncc, 1f                ! 3 or fewer: single-byte loop only
1278           sub   CNT,TMP,CNT             ! adjust main count
1279         sub     TMP, 3, TMP             ! adjust for end of loop test
1280 .bc_blkalign:
1281         ldub    [REALSRC], SRC          ! move 4 bytes per loop iteration
1282         stb     SRC, [DST]
1283         subcc   TMP, 4, TMP
1284         ldub    [REALSRC + 1], SRC
1285         add     REALSRC, 4, REALSRC
1286         stb     SRC, [DST + 1]
1287         ldub    [REALSRC - 2], SRC
1288         add     DST, 4, DST
1289         stb     SRC, [DST - 2]
1290         ldub    [REALSRC - 1], SRC
1291         bgu,pt  %ncc, .bc_blkalign
1292           stb   SRC, [DST - 1]
1293 
1294         addcc   TMP, 3, TMP             ! restore count adjustment
1295         bz,pt   %ncc, 2f                ! no bytes left?
1296           nop
1297 1:      ldub    [REALSRC], SRC          ! one byte per iteration
1298         inc     REALSRC
1299         inc     DST
1300         deccc   TMP
1301         bgu     %ncc, 1b
1302           stb   SRC, [DST - 1]
1303 
1304 2:
1305         membar  #StoreLoad
1306         andn    REALSRC, 0x7, SRC       ! SRC = REALSRC rounded down to 8 bytes
1307 
1308         ! SRC - 8-byte aligned
1309         ! DST - 64-byte aligned
1310         ldd     [SRC], %f0
1311         prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
1312         alignaddr REALSRC, %g0, %g0     ! set GSR alignment from REALSRC for faligndata
1313         ldd     [SRC + 0x08], %f2
1314         prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
1315         faligndata %f0, %f2, %f32
1316         ldd     [SRC + 0x10], %f4
1317         prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1318         faligndata %f2, %f4, %f34
1319         ldd     [SRC + 0x18], %f6
1320         prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1321         faligndata %f4, %f6, %f36
1322         ldd     [SRC + 0x20], %f8
1323         prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
1324         faligndata %f6, %f8, %f38
1325         ldd     [SRC + 0x28], %f10
1326         prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
1327         faligndata %f8, %f10, %f40
1328         ldd     [SRC + 0x30], %f12
1329         prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
1330         faligndata %f10, %f12, %f42
1331         ldd     [SRC + 0x38], %f14
1332         ldd     [SRC + VIS_BLOCKSIZE], %f0      ! first doubleword of the next block
1333         sub     CNT, VIS_BLOCKSIZE, CNT
1334         add     SRC, VIS_BLOCKSIZE, SRC
1335         prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
1336         add     REALSRC, VIS_BLOCKSIZE, REALSRC
1337         ba,pt   %ncc, 1f
1338           prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
1339         .align  32
1340 1:                                      ! main loop: finish and store one block, load the next
1341         ldd     [SRC + 0x08], %f2
1342         faligndata %f12, %f14, %f44
1343         ldd     [SRC + 0x10], %f4
1344         faligndata %f14, %f0, %f46
1345         stda    %f32, [DST]ASI_BLK_P    ! block store starting at %f32
1346         ldd     [SRC + 0x18], %f6
1347         faligndata %f0, %f2, %f32
1348         ldd     [SRC + 0x20], %f8
1349         faligndata %f2, %f4, %f34
1350         ldd     [SRC + 0x28], %f10
1351         faligndata %f4, %f6, %f36
1352         ldd     [SRC + 0x30], %f12
1353         faligndata %f6, %f8, %f38
1354         sub     CNT, VIS_BLOCKSIZE, CNT
1355         ldd     [SRC + 0x38], %f14
1356         faligndata %f8, %f10, %f40
1357         add     DST, VIS_BLOCKSIZE, DST
1358         ldd     [SRC + VIS_BLOCKSIZE], %f0
1359         faligndata %f10, %f12, %f42
1360         add     REALSRC, VIS_BLOCKSIZE, REALSRC
1361         prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1362         add     SRC, VIS_BLOCKSIZE, SRC
1363         prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1364         cmp     CNT, VIS_BLOCKSIZE + 8
1365         bgu,pt  %ncc, 1b                ! loop while more than a block plus 8 bytes remain
1366           prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1367 
1368         ! only if REALSRC & 0x7 is 0
1369         cmp     CNT, VIS_BLOCKSIZE
1370         bne     %ncc, 3f                ! not exactly one block left
1371           andcc REALSRC, 0x7, %g0       ! delay slot: always executed
1372         bz,pt   %ncc, 2f                ! one block left and source 8-byte aligned
1373           nop
1374 3:
1375         faligndata %f12, %f14, %f44
1376         faligndata %f14, %f0, %f46
1377         stda    %f32, [DST]ASI_BLK_P    ! store the block still pending from the loop
1378         add     DST, VIS_BLOCKSIZE, DST
1379         ba,pt   %ncc, 3f                ! remaining bytes are copied singly below
1380           nop
1381 2:                                      ! aligned tail: fsrc1 moves instead of faligndata
1382         ldd     [SRC + 0x08], %f2
1383         fsrc1   %f12, %f44
1384         ldd     [SRC + 0x10], %f4
1385         fsrc1   %f14, %f46
1386         stda    %f32, [DST]ASI_BLK_P
1387         ldd     [SRC + 0x18], %f6
1388         fsrc1   %f0, %f32
1389         ldd     [SRC + 0x20], %f8
1390         fsrc1   %f2, %f34
1391         ldd     [SRC + 0x28], %f10
1392         fsrc1   %f4, %f36
1393         ldd     [SRC + 0x30], %f12
1394         fsrc1   %f6, %f38
1395         ldd     [SRC + 0x38], %f14
1396         fsrc1   %f8, %f40
1397         sub     CNT, VIS_BLOCKSIZE, CNT
1398         add     DST, VIS_BLOCKSIZE, DST
1399         add     SRC, VIS_BLOCKSIZE, SRC
1400         add     REALSRC, VIS_BLOCKSIZE, REALSRC
1401         fsrc1   %f10, %f42
1402         fsrc1   %f12, %f44
1403         fsrc1   %f14, %f46
1404         stda    %f32, [DST]ASI_BLK_P    ! store the final full block
1405         add     DST, VIS_BLOCKSIZE, DST
1406         ba,a,pt %ncc, .bcb_exit         ! annulled: the nop below is not executed
1407           nop
1408 
1409 3:      tst     CNT                     ! any trailing bytes?
1410         bz,a,pt %ncc, .bcb_exit
1411           nop
1412 
1413 5:      ldub    [REALSRC], TMP          ! copy the tail one byte at a time
1414         inc     REALSRC
1415         inc     DST
1416         deccc   CNT
1417         bgu     %ncc, 5b
1418           stb   TMP, [DST - 1]
1419 .bcb_exit:
1420         membar  #Sync
1421 
1422         ldx     [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! restore gsr
1423         wr      %o2, 0, %gsr
1424 
1425         ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3     ! %o3 = saved %fprs
1426         btst    FPRS_FEF, %o3           ! was FP enabled on entry?
1427         bz,pt   %icc, 4f                ! no: just zero the registers
1428           nop
1429 
1430         BLD_FPQ1Q3_FROMSTACK(%o2)
1431 
1432         ba,pt   %ncc, 2f
1433           wr    %o3, 0, %fprs           ! restore fprs
1434 4:
1435         FZEROQ1Q3
1436         wr      %o3, 0, %fprs           ! restore fprs
1437 2:
1438         membar  #Sync                           ! sync error barrier
1439         andn    %l6, MASK_FLAGS, %l6            ! strip flags to recover the handler address
1440         stn     %l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1441         FP_ALLOWMIGRATE(5, 6)
1442         ret
1443           restore       %g0, 0, %o0             ! return 0 in the caller's %o0
1444 
1445         SET_SIZE(bcopy_more)
1446 
1447 #endif  /* lint */
1448 
1449 /*
1450  * Block copy with possibly overlapped operands.
      *
      * void ovbcopy(const void *from, void *to, size_t count)
      *
      * On entry: %o0 = from, %o1 = to, %o2 = count.  %o3 is scratch.
      * This is a leaf routine; no register window is taken.
      *
      * A zero count returns immediately.  When count <= |from - to| the
      * two regions cannot overlap, so the routine branches (it does not
      * call, %o7 is left alone) to bcopy with the arguments still in
      * %o0-%o2.  Otherwise the bytes are moved one at a time: forwards
      * when from >= to, backwards (highest address first) when from < to,
      * so that every source byte is read before it can be overwritten.
1451  */
1452 
1453 #if defined(lint)
1454 
     /* Empty C stub so that lint sees the prototype; the real code is below. */
1455 /*ARGSUSED*/
1456 void
1457 ovbcopy(const void *from, void *to, size_t count)
1458 {}
1459 
1460 #else   /* lint */
1461 
1462         ENTRY(ovbcopy)
1463         tst     %o2                     ! check count
1464         bgu,a   %ncc, 1f                ! count != 0: go do the copy
1465           subcc %o0, %o1, %o3           ! (annulled unless taken) %o3 = from - to
1466 
1467         retl                            ! count == 0: nothing to do, return
1468           nop
1469 1:
1470         bneg,a  %ncc, 2f
1471           neg   %o3                     ! if < 0, make it positive
1472 2:      cmp     %o2, %o3                ! cmp size and abs(from - to)
1473         bleu    %ncc, bcopy             ! if size <= abs(diff): use bcopy,
1474           .empty                                !   no overlap
1475           cmp   %o0, %o1                ! compare from and to addresses
             ! The cmp above sits in the (non-annulled) delay slot of the bleu,
             ! so it is executed on the way to bcopy as well as on fall-through.
             ! .empty presumably only silences the assembler's delay-slot
             ! diagnostic - TODO confirm against the assembler manual.
1476         blu     %ncc, .ov_bkwd          ! if from < to, copy backwards
1477           nop
1478         !
1479         ! Copy forwards.
1480         !
1481 .ov_fwd:
1482         ldub    [%o0], %o3              ! read from address
1483         inc     %o0                     ! inc from address
1484         stb     %o3, [%o1]              ! write to address
1485         deccc   %o2                     ! dec count
1486         bgu     %ncc, .ov_fwd           ! loop till done
1487           inc   %o1                     ! inc to address
1488 
1489         retl                            ! return
1490           nop
1491         !
1492         ! Copy backwards.
1493         !
1494 .ov_bkwd:
1495         deccc   %o2                     ! dec count; %o2 = offset of next byte to move
1496         ldub    [%o0 + %o2], %o3        ! get byte at end of src
1497         bgu     %ncc, .ov_bkwd          ! loop till done (offset 0 is moved last)
1498           stb   %o3, [%o1 + %o2]        ! delay slot, store at end of dst
1499 
1500         retl                            ! return
1501           nop
1502 
1503         SET_SIZE(ovbcopy)
1504 
1505 #endif  /* lint */
1506 
1507 
1508 /*
1509  * hwblkpagecopy()
1510  *
1511  * Copies exactly one page.  This routine assumes the caller (ppcopy)
1512  * has already disabled kernel preemption and has checked
1513  * use_hw_bcopy.  Preventing preemption also prevents cpu migration.
      *
      * The page is moved VIS_BLOCKSIZE bytes at a time.  Eight ldd loads
      * fill %f0-%f14, the values are copied into %f32-%f46 and written
      * with one block store (stda %f32, [DST]ASI_BLK_P).  The code is
      * software pipelined: while one block is being stored the loads for
      * the next one are already issued, including the first doubleword of
      * the block after it (ldd [SRC + VIS_BLOCKSIZE], %f0).  The final
      * block is peeled off after the loop and omits that look-ahead load,
      * so nothing is read beyond the end of the source page.
      *
      * FP state: if FPRS_FEF is already set on entry, the FP registers
      * used here are saved with BST_FPQ1Q3_TOSTACK and reloaded with
      * BLD_FPQ1Q3_FROMSTACK before returning; otherwise FP is enabled for
      * the duration of the copy and the registers are cleared with
      * FZEROQ1Q3 on the way out.  The original %fprs value is restored in
      * both cases.  %o0 is set to 0 on return even though the C prototype
      * is void.
      *
      * REALSRC, SRC, DST and CNT are register aliases defined outside this
      * section; from the register list below REALSRC/DST/CNT appear to be
      * %i0/%i1/%i2 - TODO confirm against the #defines.
1514  */
1515 #ifdef lint
1516 /*ARGSUSED*/
1517 void
1518 hwblkpagecopy(const void *src, void *dst)
1519 { }
1520 #else /* lint */
1521         ENTRY(hwblkpagecopy)
1522         ! get another window w/space for three aligned blocks of saved fpregs
1523         prefetch [%o0], #n_reads
1524         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1525 
1526         ! %i0 - source address (arg)
1527         ! %i1 - destination address (arg)
1528         ! %i2 - length of region (not arg)
1529         ! %l0 - saved fprs
1530         ! %l1 - pointer to saved fpregs
1531 
1532         rd      %fprs, %l0              ! check for unused fp
1533         btst    FPRS_FEF, %l0
1534         bz,a,pt %icc, 1f                ! FEF clear: no FP state to preserve
1535           wr    %g0, FPRS_FEF, %fprs    ! (annulled unless taken) enable FP
1536 
1537         BST_FPQ1Q3_TOSTACK(%l1)         ! FEF was set: save the FP regs we use
1538 
1539 1:      set     PAGESIZE, CNT           ! CNT = bytes whose loads are still to be issued
1540         mov     REALSRC, SRC
1541 
             ! Prologue: load the first block into %f0-%f14, start moving it to
             ! %f32-%f42 and warm the prefetcher for the blocks that follow.
1542         ldd     [SRC], %f0
1543         prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
1544         ldd     [SRC + 0x08], %f2
1545         prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
1546         fmovd   %f0, %f32
1547         ldd     [SRC + 0x10], %f4
1548         prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1549         fmovd   %f2, %f34
1550         ldd     [SRC + 0x18], %f6
1551         prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1552         fmovd   %f4, %f36
1553         ldd     [SRC + 0x20], %f8
1554         prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
1555         fmovd   %f6, %f38
1556         ldd     [SRC + 0x28], %f10
1557         prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
1558         fmovd   %f8, %f40
1559         ldd     [SRC + 0x30], %f12
1560         prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
1561         fmovd   %f10, %f42
1562         ldd     [SRC + 0x38], %f14
1563         ldd     [SRC + VIS_BLOCKSIZE], %f0      ! first doubleword of the next block
1564         sub     CNT, VIS_BLOCKSIZE, CNT
1565         add     SRC, VIS_BLOCKSIZE, SRC
1566         prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
1567         ba,pt   %ncc, 2f                ! jump over the alignment padding
1568         prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
1569         .align  32
1570 2:
             ! Main loop: finish and store the block held in %f32-%f46 while
             ! loading the next one.  %f12/%f14 are moved before the store and
             ! before they are reloaded further down.
1571         ldd     [SRC + 0x08], %f2
1572         fmovd   %f12, %f44
1573         ldd     [SRC + 0x10], %f4
1574         fmovd   %f14, %f46
1575         stda    %f32, [DST]ASI_BLK_P
1576         ldd     [SRC + 0x18], %f6
1577         fmovd   %f0, %f32
1578         ldd     [SRC + 0x20], %f8
1579         fmovd   %f2, %f34
1580         ldd     [SRC + 0x28], %f10
1581         fmovd   %f4, %f36
1582         ldd     [SRC + 0x30], %f12
1583         fmovd   %f6, %f38
1584         ldd     [SRC + 0x38], %f14
1585         fmovd   %f8, %f40
1586         ldd     [SRC + VIS_BLOCKSIZE], %f0      ! look-ahead into the following block
1587         fmovd   %f10, %f42
1588         sub     CNT, VIS_BLOCKSIZE, CNT
1589         prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1590         add     DST, VIS_BLOCKSIZE, DST
1591         prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1592         add     SRC, VIS_BLOCKSIZE, SRC
1593         cmp     CNT, VIS_BLOCKSIZE + 8  ! stay in the loop while CNT > VIS_BLOCKSIZE + 8
1594         bgu,pt  %ncc, 2b
1595           prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1596 
1597         ! trailing block
             ! Stores the pending block, then loads and stores the last one.
             ! There is no ldd [SRC + VIS_BLOCKSIZE] here, so the source is not
             ! read past its final block.
1598         ldd     [SRC + 0x08], %f2
1599         fsrc1   %f12, %f44
1600         ldd     [SRC + 0x10], %f4
1601         fsrc1   %f14, %f46
1602         stda    %f32, [DST]ASI_BLK_P
1603         ldd     [SRC + 0x18], %f6
1604         fsrc1   %f0, %f32
1605         ldd     [SRC + 0x20], %f8
1606         fsrc1   %f2, %f34
1607         ldd     [SRC + 0x28], %f10
1608         fsrc1   %f4, %f36
1609         ldd     [SRC + 0x30], %f12
1610         fsrc1   %f6, %f38
1611         ldd     [SRC + 0x38], %f14
1612         fsrc1   %f8, %f40
1613         sub     CNT, VIS_BLOCKSIZE, CNT
1614         add     DST, VIS_BLOCKSIZE, DST
1615         add     SRC, VIS_BLOCKSIZE, SRC
1616         fsrc1   %f10, %f42
1617         fsrc1   %f12, %f44
1618         fsrc1   %f14, %f46
1619         stda    %f32, [DST]ASI_BLK_P    ! last block of the page
1620 
1621         membar  #Sync                   ! barrier after the block stores
1622 
1623         btst    FPRS_FEF, %l0           ! was FP in use when we were called?
1624         bz,pt   %icc, 2f
1625           nop
1626 
             ! NOTE(review): the save used %l1 and the reload uses %l3; both look
             ! like scratch registers handed to the macros - confirm in the
             ! macro definitions.
1627         BLD_FPQ1Q3_FROMSTACK(%l3)
1628         ba      3f
1629           nop
1630 
1631 2:      FZEROQ1Q3                       ! FP was unused: clear the regs we touched
1632 
1633 3:      wr      %l0, 0, %fprs           ! restore fprs
1634         ret
1635           restore       %g0, 0, %o0
1636 
1637         SET_SIZE(hwblkpagecopy)
1638 #endif  /* lint */
1639 
1640 
1641 /*
1642  * Transfer data to and from user space -
1643  * Note that these routines can cause faults
1644  * It is assumed that the kernel has nothing at
1645  * less than KERNELBASE in the virtual address space.
1646  *
1647  * Note that copyin(9F) and copyout(9F) are part of the
1648  * DDI/DKI which specifies that they return '-1' on "errors."
1649  *
1650  * Sigh.
1651  *
1652  * So there are two extremely similar routines - xcopyin() and xcopyout()
1653  * which return the errno that we've faithfully computed.  This
1654  * allows other callers (e.g. uiomove(9F)) to work correctly.
1655  * Given that these are used pretty heavily, we expand the calling
1656  * sequences inline for all flavours (rather than making wrappers).
1657  *
1658  * There are also stub routines for xcopyout_little and xcopyin_little,
1659  * which currently are intended to handle requests of <= 16 bytes from
1660  * do_unaligned. Future enhancement to make them handle 8k pages efficiently
1661  * is left as an exercise...
1662  */
1663 
1664 /*
1665  * Copy data between user and kernel space (copyOP/xcopyOP/copyOP_noerr)
1666  *
1667  * General theory of operation:
1668  *
1669  * The only difference between copy{in,out} and
1670  * xcopy{in,out} is in the error handling routine they invoke
1671  * when a memory access error occurs. xcopyOP returns the errno
1672  * while copyOP returns -1 (see above). copy{in,out}_noerr set
1673  * a special flag (by oring the TRAMP_FLAG into the fault handler address)
1674  * if they are called with a fault handler already in place. That flag
1675  * causes the default handlers to trampoline to the previous handler
1676  * upon an error.
1677  *
1678  * None of the copyops routines grab a window until it's decided that
1679  * we need to do a HW block copy operation. This saves a window
1680  * spill/fill when we're called during socket ops. The typical IO
1681  * path won't cause spill/fill traps.
1682  *
1683  * This code uses a set of 4 limits for the maximum size that will
1684  * be copied given a particular input/output address alignment.
1685  * If the value for a particular limit is zero, the copy will be performed
1686  * by the plain copy loops rather than FPBLK.
1687  *
1688  * See the description of bcopy above for more details of the
1689  * data copying algorithm and the default limits.
1690  *
1691  */
1692 
1693 /*
1694  * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
1695  */
1696 
1697 #if defined(lint)
1698 
1699 
1700 #else   /* lint */
1701 /*
1702  * We save the arguments in the following registers in case of a fault:
1703  *      kaddr - %l1
1704  *      uaddr - %l2
1705  *      count - %l3
1706  */
1707 #define SAVE_SRC        %l1
1708 #define SAVE_DST        %l2
1709 #define SAVE_COUNT      %l3
1710 
1711 #define SM_SAVE_SRC             %g4
1712 #define SM_SAVE_DST             %g5
1713 #define SM_SAVE_COUNT           %o5
1714 #define ERRNO           %l5
1715 
1716 
1717 #define REAL_LOFAULT    %l4
1718 /*
1719  * Generic copyio fault handler.  This is the first line of defense when a
1720  * fault occurs in (x)copyin/(x)copyout.  In order for this to function
1721  * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
1722  * This allows us to share common code for all the flavors of the copy
1723  * operations, including the _noerr versions.
1724  *
1725  * Note that this function will restore the original input parameters before
1726  * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
1727  * member of the t_copyop structure, if needed.
1728  */
1729         ENTRY(copyio_fault)
             /*
              * Register state expected on entry:
              *   %g1                  errno for the fault (copied to ERRNO)
              *   %l6                  previous t_lofault, possibly with
              *                        FPUSED_FLAG or'ed in
              *   REAL_LOFAULT         handler to continue in
              *   SAVE_SRC/DST/COUNT   original arguments of the copy
              * When FPUSED_FLAG is set the frame also holds the saved %gsr,
              * the saved %fprs and (if FP was live) the saved FP registers.
              */
1730         membar  #Sync
1731         mov     %g1,ERRNO                       ! save errno in ERRNO
1732         btst    FPUSED_FLAG, %l6
1733         bz      %ncc, 1f                        ! FP was not used: nothing to undo
1734           nop
1735 
1736         ldx     [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
1737         wr      %o2, 0, %gsr            ! restore gsr
1738 
1739         ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1740         btst    FPRS_FEF, %o3           ! was FP enabled before the copy began?
1741         bz,pt   %icc, 4f
1742           nop
1743 
1744         BLD_FPQ2Q4_FROMSTACK(%o2)       ! yes: reload the saved FP registers
1745 
1746         ba,pt   %ncc, 1f
1747           wr    %o3, 0, %fprs           ! restore fprs
1748 
1749 4:
1750         FZEROQ2Q4                       ! no: just clear the registers we used
1751         wr      %o3, 0, %fprs           ! restore fprs
1752 
1753 1:
             ! NOTE(review): only FPUSED_FLAG is stripped from %l6 here, while the
             ! normal exit of bcopy_more strips MASK_FLAGS - confirm that no other
             ! flag bit can still be set in %l6 on this path.
1754         andn    %l6, FPUSED_FLAG, %l6
1755         membar  #Sync
1756         stn     %l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1757         FP_ALLOWMIGRATE(5, 6)
1758 
1759         mov     SAVE_SRC, %i0           ! hand the original arguments back
1760         mov     SAVE_DST, %i1
1761         jmp     REAL_LOFAULT            ! continue in the flavor-specific handler
1762           mov   SAVE_COUNT, %i2
1763 
1764         SET_SIZE(copyio_fault)
1765 
1766 
1767 #endif
1768 
1769 #if defined(lint)
1770 
1771 /*ARGSUSED*/
1772 int
1773 copyout(const void *kaddr, void *uaddr, size_t count)
1774 { return (0); }
1775 
1776 #else   /* lint */
1777 
1778         ENTRY(copyout)
             /*
              * Entry dispatch.  %o0 = kaddr (source), %o1 = uaddr (destination),
              * %o2 = count.
              *
              * Counts of VIS_COPY_THRESHOLD or less always take the leaf path
              * (.copyout_small).  For larger counts the low bits of
              * (kaddr ^ uaddr) give the widest alignment both pointers can
              * reach together, and that selects which limit is consulted:
              *
              *   low three bits zero       hw_copy_limit_8   (.copyout_8)
              *   bit 0 set                 hw_copy_limit_1
              *   bit 0 clear, bit 1 set    hw_copy_limit_2   (.copyout_2)
              *   otherwise                 hw_copy_limit_4   (.copyout_4)
              *
              * A limit of zero, or a count that does not exceed the limit,
              * sends the request to .copyout_small; anything larger goes to
              * .copyout_more.  The limits are read with a 32-bit ld, which is
              * why the zero test branches on %icc.  In each group the
              * "cmp %o2, %o3" sits in the delay slot of the zero test and sets
              * the condition codes for the bleu that follows it.
              */
1779 
1780         cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
1781         bleu,pt %ncc, .copyout_small            ! small enough: use the leaf routine
1782           xor   %o0, %o1, %o3                   ! are src, dst alignable?
1783         btst    7, %o3                          !
1784         bz,pt   %ncc, .copyout_8                ! check for longword alignment
1785           nop
1786         btst    1, %o3                          !
1787         bz,pt   %ncc, .copyout_2                ! check for half-word
1788           nop
1789         sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
1790         ld      [%o3 + %lo(hw_copy_limit_1)], %o3
1791         tst     %o3
1792         bz,pn   %icc, .copyout_small            ! if zero, disable HW copy
1793           cmp   %o2, %o3                        ! if length <= limit
1794         bleu,pt %ncc, .copyout_small            ! go to small copy
1795           nop
1796         ba,pt   %ncc, .copyout_more             ! otherwise go to large copy
1797           nop
1798 .copyout_2:
1799         btst    3, %o3                          !
1800         bz,pt   %ncc, .copyout_4                ! check for word alignment
1801           nop
1802         sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
1803         ld      [%o3 + %lo(hw_copy_limit_2)], %o3
1804         tst     %o3
1805         bz,pn   %icc, .copyout_small            ! if zero, disable HW copy
1806           cmp   %o2, %o3                        ! if length <= limit
1807         bleu,pt %ncc, .copyout_small            ! go to small copy
1808           nop
1809         ba,pt   %ncc, .copyout_more             ! otherwise go to large copy
1810           nop
1811 .copyout_4:
1812         ! already checked longword, must be word aligned
1813         sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
1814         ld      [%o3 + %lo(hw_copy_limit_4)], %o3
1815         tst     %o3
1816         bz,pn   %icc, .copyout_small            ! if zero, disable HW copy
1817           cmp   %o2, %o3                        ! if length <= limit
1818         bleu,pt %ncc, .copyout_small            ! go to small copy
1819           nop
1820         ba,pt   %ncc, .copyout_more             ! otherwise go to large copy
1821           nop
1822 .copyout_8:
1823         sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
1824         ld      [%o3 + %lo(hw_copy_limit_8)], %o3
1825         tst     %o3
1826         bz,pn   %icc, .copyout_small            ! if zero, disable HW copy
1827           cmp   %o2, %o3                        ! if length <= limit
1828         bleu,pt %ncc, .copyout_small            ! go to small copy
1829           nop
1830         ba,pt   %ncc, .copyout_more             ! otherwise go to large copy
1831           nop
1832 
1833         .align  16
1834         nop                             ! instruction alignment
1835                                         ! see discussion at start of file
             /*
              * Leaf (no register window) copyout for short and medium lengths.
              * Loads read the kernel source normally; every store goes to the
              * user destination through ASI_USER.
              *
              * %o4 keeps the previous t_lofault for the whole path and is
              * written back on every exit.  SM_SAVE_SRC/SM_SAVE_DST/
              * SM_SAVE_COUNT keep the original arguments for .sm_copyout_err.
              *
              *   count <= SHORTCOPY            .co_sm_left (moves at most three
              *                                 bytes, so this presumably relies
              *                                 on SHORTCOPY <= 3 - confirm)
              *   count >  CHKSIZE              .co_med
              *   src and dst both word aligned .co_sm_word
              *   otherwise                     .co_sm_movebytes
              *
              * The loops keep %o2 biased low (by 3, 7 or 1) so that the subcc
              * result can drive the loop branch directly; each "restore count"
              * add undoes the bias.  Several labels below sit on an
              * instruction that is also the delay slot of the branch just
              * above it.
              */
1836 .copyout_small:
1837         sethi   %hi(.sm_copyout_err), %o5       ! .sm_copyout_err is lofault
1838         or      %o5, %lo(.sm_copyout_err), %o5
1839         ldn     [THREAD_REG + T_LOFAULT], %o4   ! save existing handler
1840         membar  #Sync                           ! sync error barrier
1841         stn     %o5, [THREAD_REG + T_LOFAULT]   ! set t_lofault
1842 .sm_do_copyout:
1843         mov     %o0, SM_SAVE_SRC
1844         mov     %o1, SM_SAVE_DST
1845         cmp     %o2, SHORTCOPY          ! check for really short case
1846         bleu,pt %ncc, .co_sm_left       !
1847           mov   %o2, SM_SAVE_COUNT
1848         cmp     %o2, CHKSIZE            ! check for medium length cases
1849         bgu,pn  %ncc, .co_med           !
1850           or    %o0, %o1, %o3           ! prepare alignment check
1851         andcc   %o3, 0x3, %g0           ! test for alignment
1852         bz,pt   %ncc, .co_sm_word       ! branch to word aligned case
1853 .co_sm_movebytes:
             ! Delay slot of the bz above (not annulled), so it runs on both
             ! paths: .co_sm_word is entered with %o2 = count - 3 as well.
1854           sub   %o2, 3, %o2             ! adjust count to allow cc zero test
1855 .co_sm_notalign4:
1856         ldub    [%o0], %o3              ! read byte
1857         subcc   %o2, 4, %o2             ! reduce count by 4
1858         stba    %o3, [%o1]ASI_USER      ! write byte
1859         inc     %o1                     ! advance DST by 1
1860         ldub    [%o0 + 1], %o3          ! repeat for a total of 4 bytes
1861         add     %o0, 4, %o0             ! advance SRC by 4
1862         stba    %o3, [%o1]ASI_USER
1863         inc     %o1                     ! advance DST by 1
1864         ldub    [%o0 - 2], %o3
1865         stba    %o3, [%o1]ASI_USER
1866         inc     %o1                     ! advance DST by 1
1867         ldub    [%o0 - 1], %o3
1868         stba    %o3, [%o1]ASI_USER
1869         bgt,pt  %ncc, .co_sm_notalign4  ! loop til 3 or fewer bytes remain
1870           inc   %o1                     ! advance DST by 1
1871         add     %o2, 3, %o2             ! restore count
1872 .co_sm_left:
             ! Move the last 0-3 bytes; %o2 holds the true remaining count here.
1873         tst     %o2
1874         bz,pt   %ncc, .co_sm_exit       ! check for zero length
1875           nop
1876         ldub    [%o0], %o3              ! load one byte
1877         deccc   %o2                     ! reduce count for cc test
1878         bz,pt   %ncc, .co_sm_exit
1879           stba  %o3,[%o1]ASI_USER       ! store one byte
1880         ldub    [%o0 + 1], %o3          ! load second byte
1881         deccc   %o2
1882         inc     %o1
1883         bz,pt   %ncc, .co_sm_exit
1884           stba  %o3,[%o1]ASI_USER       ! store second byte
1885         ldub    [%o0 + 2], %o3          ! load third byte
1886         inc     %o1
1887         stba    %o3,[%o1]ASI_USER       ! store third byte
1888         membar  #Sync                           ! sync error barrier
1889         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1890         retl
1891           mov   %g0, %o0                ! return 0
1892         .align  16
1893 .co_sm_words:
             ! Two words (8 bytes) per pass; %o2 = remaining - 7 at .co_sm_wordx,
             ! so the loop repeats while at least 8 bytes are left.
1894         lduw    [%o0], %o3              ! read word
1895 .co_sm_wordx:
1896         subcc   %o2, 8, %o2             ! update count
1897         stwa    %o3, [%o1]ASI_USER      ! write word
1898         add     %o0, 8, %o0             ! update SRC
1899         lduw    [%o0 - 4], %o3          ! read word
1900         add     %o1, 4, %o1             ! update DST
1901         stwa    %o3, [%o1]ASI_USER      ! write word
1902         bgt,pt  %ncc, .co_sm_words      ! loop til done
1903           add   %o1, 4, %o1             ! update DST
1904         addcc   %o2, 7, %o2             ! restore count
1905         bz,pt   %ncc, .co_sm_exit
1906           nop
1907         deccc   %o2                     ! exactly one byte left?
1908         bz,pt   %ncc, .co_sm_byte
1909 .co_sm_half:
             ! The subcc below is both the delay slot of the bz above and the
             ! top of the halfword loop.  Callers that branch here arrive with
             ! %o2 = remaining - 1.
1910           subcc %o2, 2, %o2             ! reduce count by 2
1911         lduh    [%o0], %o3              ! read half word
1912         add     %o0, 2, %o0             ! advance SRC by 2
1913         stha    %o3, [%o1]ASI_USER      ! write half word
1914         bgt,pt  %ncc, .co_sm_half       ! loop til done
1915           add   %o1, 2, %o1             ! advance DST by 2
1916         addcc   %o2, 1, %o2             ! restore count
1917         bz,pt   %ncc, .co_sm_exit
1918           nop
1919 .co_sm_byte:
             ! Exactly one byte left; %o2 is not consulted again.
1920         ldub    [%o0], %o3
1921         stba    %o3, [%o1]ASI_USER
1922         membar  #Sync                           ! sync error barrier
1923         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1924         retl
1925           mov   %g0, %o0                ! return 0
1926         .align 16
1927 .co_sm_word:
             ! Word-aligned source and destination.  %o2 arrives as count - 3
             ! (see the delay slot at .co_sm_movebytes); the first word is
             ! loaded in the delay slot below on both paths.
1928         subcc   %o2, 4, %o2             ! update count
1929         bgt,pt  %ncc, .co_sm_wordx      ! 8 or more bytes: use the word loop
1930           lduw  [%o0], %o3              ! read word
1931         addcc   %o2, 3, %o2             ! restore count
1932         bz,pt   %ncc, .co_sm_exit
1933           stwa  %o3, [%o1]ASI_USER      ! write word
1934         deccc   %o2                     ! reduce count for cc test
1935         ldub    [%o0 + 4], %o3          ! load one byte
1936         add     %o1, 4, %o1
1937         bz,pt   %ncc, .co_sm_exit
1938           stba  %o3, [%o1]ASI_USER      ! store one byte
1939         ldub    [%o0 + 5], %o3          ! load second byte
1940         deccc   %o2
1941         inc     %o1
1942         bz,pt   %ncc, .co_sm_exit
1943           stba  %o3, [%o1]ASI_USER      ! store second byte
1944         ldub    [%o0 + 6], %o3          ! load third byte
1945         inc     %o1
1946         stba    %o3, [%o1]ASI_USER      ! store third byte
1947 .co_sm_exit:
             ! Common successful exit.  The membar below is an ordinary
             ! instruction (it is not in a delay slot despite its indentation).
1948           membar        #Sync                           ! sync error barrier
1949         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1950         retl
1951           mov   %g0, %o0                ! return 0
1952 
1953         .align 16
1954 .co_med:
             /*
              * Medium-length copyout (reached when count > CHKSIZE), still on
              * the leaf path with %o4 holding the previous t_lofault.
              *
              * The low bits of (src ^ dst) give the widest access that both
              * pointers can be aligned to at the same time:
              *
              *   bit 0 set         bytes only     .co_sm_movebytes
              *   bits 1:0 == 2     halfwords      .co_med_half
              *   bits 2:0 == 4     words          .co_med_word
              *   bits 2:0 == 0     long words     .co_med_long
              *
              * Each variant moves leading bytes until the source pointer is
              * aligned, runs an unrolled main loop with %o2 biased low so the
              * subcc result can drive the loop branch, and then finishes
              * through .co_sm_half, .co_sm_byte or .co_sm_exit.
              */
1955         xor     %o0, %o1, %o3           ! setup alignment check
1956         btst    1, %o3
1957         bnz,pt  %ncc, .co_sm_movebytes  ! unaligned
1958           nop
1959         btst    3, %o3
1960         bnz,pt  %ncc, .co_med_half      ! halfword aligned
1961           nop
1962         btst    7, %o3
1963         bnz,pt  %ncc, .co_med_word      ! word aligned
1964           nop
1965 .co_med_long:
1966         btst    3, %o0                  ! check for
1967         bz,pt   %ncc, .co_med_long1     ! word alignment
1968           nop
1969 .co_med_long0:
1970         ldub    [%o0], %o3              ! load one byte
1971         inc     %o0
1972         stba    %o3,[%o1]ASI_USER       ! store byte
1973         inc     %o1
1974         btst    3, %o0
1975         bnz,pt  %ncc, .co_med_long0
1976           dec   %o2                     ! delay slot: one byte moved per pass
1977 .co_med_long1:                  ! word aligned
1978         btst    7, %o0                  ! check for long word
1979         bz,pt   %ncc, .co_med_long2
1980           nop
1981         lduw    [%o0], %o3              ! load word
1982         add     %o0, 4, %o0             ! advance SRC by 4
1983         stwa    %o3, [%o1]ASI_USER      ! store word
1984         add     %o1, 4, %o1             ! advance DST by 4
1985         sub     %o2, 4, %o2             ! reduce count by 4
1986 !
1987 !  Now long word aligned and have at least 32 bytes to move
1988 !
1989 .co_med_long2:
1990         sub     %o2, 31, %o2            ! adjust count to allow cc zero test
1991         sub     %o1, 8, %o1             ! adjust pointer to allow store in
1992                                         ! branch delay slot instead of add
1993 .co_med_lmove:
             ! 32 bytes per pass.  DST runs 8 behind its true value at the top of
             ! each pass so that the fourth store can occupy the delay slot.
1994         add     %o1, 8, %o1             ! advance DST by 8
1995         ldx     [%o0], %o3              ! read long word
1996         subcc   %o2, 32, %o2            ! reduce count by 32
1997         stxa    %o3, [%o1]ASI_USER      ! write long word
1998         add     %o1, 8, %o1             ! advance DST by 8
1999         ldx     [%o0 + 8], %o3          ! repeat for a total of 4 long words
2000         add     %o0, 32, %o0            ! advance SRC by 32
2001         stxa    %o3, [%o1]ASI_USER
2002         ldx     [%o0 - 16], %o3
2003         add     %o1, 8, %o1             ! advance DST by 8
2004         stxa    %o3, [%o1]ASI_USER
2005         ldx     [%o0 - 8], %o3
2006         add     %o1, 8, %o1             ! advance DST by 8
2007         bgt,pt  %ncc, .co_med_lmove     ! loop til 31 or fewer bytes left
2008           stxa  %o3, [%o1]ASI_USER
2009         add     %o1, 8, %o1             ! advance DST by 8
2010         addcc   %o2, 24, %o2            ! restore count to long word offset
2011         ble,pt  %ncc, .co_med_lextra    ! check for more long words to move
2012           nop
2013 .co_med_lword:
2014         ldx     [%o0], %o3              ! read long word
2015         subcc   %o2, 8, %o2             ! reduce count by 8
2016         stxa    %o3, [%o1]ASI_USER      ! write long word
2017         add     %o0, 8, %o0             ! advance SRC by 8
2018         bgt,pt  %ncc, .co_med_lword     ! loop til 7 or fewer bytes left
2019           add   %o1, 8, %o1             ! advance DST by 8
2020 .co_med_lextra:
2021         addcc   %o2, 7, %o2             ! restore rest of count
2022         bz,pt   %ncc, .co_sm_exit       ! if zero, then done
2023           deccc %o2                     ! delay slot: sets cc for the next bz
2024         bz,pt   %ncc, .co_sm_byte       ! exactly one byte left
2025           nop
2026         ba,pt   %ncc, .co_sm_half       ! else finish by halfwords (%o2 = left - 1)
2027           nop
2028 
2029         .align 16
2030         nop                             ! instruction alignment
2031                                         ! see discussion at start of file
2032 .co_med_word:
2033         btst    3, %o0                  ! check for
2034         bz,pt   %ncc, .co_med_word1     ! word alignment
2035           nop
2036 .co_med_word0:
2037         ldub    [%o0], %o3              ! load one byte
2038         inc     %o0
2039         stba    %o3,[%o1]ASI_USER       ! store byte
2040         inc     %o1
2041         btst    3, %o0
2042         bnz,pt  %ncc, .co_med_word0
2043           dec   %o2                     ! delay slot: one byte moved per pass
2044 !
2045 !  Now word aligned and have at least 36 bytes to move
2046 !
2047 .co_med_word1:
2048         sub     %o2, 15, %o2            ! adjust count to allow cc zero test
2049 .co_med_wmove:
2050         lduw    [%o0], %o3              ! read word
2051         subcc   %o2, 16, %o2            ! reduce count by 16
2052         stwa    %o3, [%o1]ASI_USER      ! write word
2053         add     %o1, 4, %o1             ! advance DST by 4
2054         lduw    [%o0 + 4], %o3          ! repeat for a total of 4 words
2055         add     %o0, 16, %o0            ! advance SRC by 16
2056         stwa    %o3, [%o1]ASI_USER
2057         add     %o1, 4, %o1             ! advance DST by 4
2058         lduw    [%o0 - 8], %o3
2059         stwa    %o3, [%o1]ASI_USER
2060         add     %o1, 4, %o1             ! advance DST by 4
2061         lduw    [%o0 - 4], %o3
2062         stwa    %o3, [%o1]ASI_USER
2063         bgt,pt  %ncc, .co_med_wmove     ! loop til 15 or fewer bytes left
2064           add   %o1, 4, %o1             ! advance DST by 4
2065         addcc   %o2, 12, %o2            ! restore count to word offset
2066         ble,pt  %ncc, .co_med_wextra    ! check for more words to move
2067           nop
2068 .co_med_word2:
2069         lduw    [%o0], %o3              ! read word
2070         subcc   %o2, 4, %o2             ! reduce count by 4
2071         stwa    %o3, [%o1]ASI_USER      ! write word
2072         add     %o0, 4, %o0             ! advance SRC by 4
2073         bgt,pt  %ncc, .co_med_word2     ! loop til 3 or fewer bytes left
2074           add   %o1, 4, %o1             ! advance DST by 4
2075 .co_med_wextra:
2076         addcc   %o2, 3, %o2             ! restore rest of count
2077         bz,pt   %ncc, .co_sm_exit       ! if zero, then done
2078           deccc %o2                     ! delay slot: sets cc for the next bz
2079         bz,pt   %ncc, .co_sm_byte       ! exactly one byte left
2080           nop
2081         ba,pt   %ncc, .co_sm_half       ! else finish by halfwords (%o2 = left - 1)
2082           nop
2083 
2084         .align 16
2085         nop                             ! instruction alignment
2086         nop                             ! see discussion at start of file
2087         nop
2088 .co_med_half:
2089         btst    1, %o0                  ! check for
2090         bz,pt   %ncc, .co_med_half1     ! half word alignment
2091           nop
2092         ldub    [%o0], %o3              ! load one byte
2093         inc     %o0
2094         stba    %o3,[%o1]ASI_USER       ! store byte
2095         inc     %o1
2096         dec     %o2
2097 !
2098 !  Now half word aligned and have at least 38 bytes to move
2099 !
2100 .co_med_half1:
2101         sub     %o2, 7, %o2             ! adjust count to allow cc zero test
2102 .co_med_hmove:
2103         lduh    [%o0], %o3              ! read half word
2104         subcc   %o2, 8, %o2             ! reduce count by 8
2105         stha    %o3, [%o1]ASI_USER      ! write half word
2106         add     %o1, 2, %o1             ! advance DST by 2
2107         lduh    [%o0 + 2], %o3          ! repeat for a total of 4 halfwords
2108         add     %o0, 8, %o0             ! advance SRC by 8
2109         stha    %o3, [%o1]ASI_USER
2110         add     %o1, 2, %o1             ! advance DST by 2
2111         lduh    [%o0 - 4], %o3
2112         stha    %o3, [%o1]ASI_USER
2113         add     %o1, 2, %o1             ! advance DST by 2
2114         lduh    [%o0 - 2], %o3
2115         stha    %o3, [%o1]ASI_USER
2116         bgt,pt  %ncc, .co_med_hmove     ! loop til 7 or fewer bytes left
2117           add   %o1, 2, %o1             ! advance DST by 2
2118         addcc   %o2, 7, %o2             ! restore count
2119         bz,pt   %ncc, .co_sm_exit
2120           deccc %o2                     ! delay slot: sets cc for the next bz
2121         bz,pt   %ncc, .co_sm_byte       ! exactly one byte left
2122           nop
2123         ba,pt   %ncc, .co_sm_half       ! else finish by halfwords (%o2 = left - 1)
2124           nop
2125 
2126 /*
2127  * We got here because of a fault during short copyout.
2128  * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
      *
      * The handler puts back the t_lofault value saved in %o4 and reloads
      * the original arguments from SM_SAVE_SRC/SM_SAVE_DST/SM_SAVE_COUNT.
      * If the thread has a copyops vector (t_copyops != NULL) it jumps to
      * the CP_COPYOUT entry with those arguments; the jump does not link,
      * so that handler returns directly to copyout's caller.  Otherwise
      * it returns -1.
      *
      * NOTE(review): nothing on the leaf (small copy) path writes ERRNO;
      * the errno presumably still sits in %g1, which is where copyio_fault
      * picks it up - confirm.  It is not used here either way.
2129  */
2130 .sm_copyout_err:
2131         membar  #Sync
2132         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2133         mov     SM_SAVE_SRC, %o0
2134         mov     SM_SAVE_DST, %o1
2135         mov     SM_SAVE_COUNT, %o2
2136         ldn     [THREAD_REG + T_COPYOPS], %o3   ! check for copyop handler
2137         tst     %o3
2138         bz,pt   %ncc, 3f                        ! if not, return error
2139           nop
2140         ldn     [%o3 + CP_COPYOUT], %o5         ! if handler, invoke it with
2141         jmp     %o5                             ! original arguments
2142           nop
2143 3:
2144         retl
2145           or    %g0, -1, %o0            ! return error value
2146 
2147         SET_SIZE(copyout)
2148 
2149 /*
2150  * The _more entry points are not intended to be used directly by
2151  * any caller from outside this file.  They are provided to allow
2152  * profiling and dtrace of the portions of the copy code that uses
2153  * the floating point registers.
2154  * This entry is particularly important as DTRACE (at least as of
2155  * 4/2004) does not support leaf functions.
2156  */
2157 
2158         ENTRY(copyout_more)
2159 .copyout_more:
2160         prefetch [%o0], #n_reads        ! prefetch start of source
2161         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp ! new window and frame
2162         set     .copyout_err, REAL_LOFAULT      ! REAL_LOFAULT = .copyout_err
2163 
2164 /*
2165  * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
2166  */
2167 .do_copyout:                            ! also entered from .xcopyout_more
2168         set     copyio_fault, %l7               ! copyio_fault is lofault val
2169 
2170         ldn     [THREAD_REG + T_LOFAULT], %l6   ! save existing handler
2171         membar  #Sync                           ! sync error barrier
2172         stn     %l7, [THREAD_REG + T_LOFAULT]   ! set t_lofault
2173 
2174         mov     %i0, SAVE_SRC           ! keep copies of the three
2175         mov     %i1, SAVE_DST           ! incoming arguments
2176         mov     %i2, SAVE_COUNT
2177         ! NOTE(review): the migrate macro below is defined outside this view
2178         FP_NOMIGRATE(6, 7)
2179 
2180         rd      %fprs, %o2              ! check for unused fp
2181         st      %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
2182         btst    FPRS_FEF, %o2
2183         bz,a,pt %icc, .do_blockcopyout  ! FEF clear: skip the FP save
2184           wr    %g0, FPRS_FEF, %fprs    ! (annulled slot, taken only) enable FP
2185         ! FEF set: presumably saves the FP regs used below (macro not in view)
2186         BST_FPQ2Q4_TOSTACK(%o2)
2187 
2188 .do_blockcopyout:
2189         rd      %gsr, %o2
2190         stx     %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]      ! save gsr
2191         or      %l6, FPUSED_FLAG, %l6   ! flag the saved handler value
2192 
2193         andcc   DST, VIS_BLOCKSIZE - 1, TMP     ! TMP = DST offset in block
2194         mov     ASI_USER, %asi          ! %asi is used by the stba stores below
2195         bz,pt   %ncc, 2f                ! DST already block aligned
2196           neg   TMP                     ! (delay slot, always executed)
2197         add     TMP, VIS_BLOCKSIZE, TMP ! TMP = VIS_BLOCKSIZE - offset
2198 
2199         ! TMP = bytes required to align DST on FP_BLOCK boundary
2200         ! Using SRC as a tmp here
2201         cmp     TMP, 3
2202         bleu,pt %ncc, 1f                ! 3 or fewer: single byte loop only
2203           sub   CNT,TMP,CNT             ! adjust main count
2204         sub     TMP, 3, TMP             ! adjust for end of loop test
2205 .co_blkalign:
2206         ldub    [REALSRC], SRC          ! move 4 bytes per loop iteration
2207         stba    SRC, [DST]%asi
2208         subcc   TMP, 4, TMP             ! cc from here drives the bgu below
2209         ldub    [REALSRC + 1], SRC
2210         add     REALSRC, 4, REALSRC
2211         stba    SRC, [DST + 1]%asi
2212         ldub    [REALSRC - 2], SRC
2213         add     DST, 4, DST
2214         stba    SRC, [DST - 2]%asi
2215         ldub    [REALSRC - 1], SRC
2216         bgu,pt  %ncc, .co_blkalign
2217           stba  SRC, [DST - 1]%asi      ! (delay slot) fourth byte
2218 
2219         addcc   TMP, 3, TMP             ! restore count adjustment
2220         bz,pt   %ncc, 2f                ! no bytes left?
2221           nop
2222 1:      ldub    [REALSRC], SRC          ! one byte per iteration
2223         inc     REALSRC
2224         inc     DST
2225         deccc   TMP
2226         bgu     %ncc, 1b
2227           stba  SRC, [DST - 1]%asi      ! (delay slot) store the byte
2228 
2229 2:
2230         membar  #StoreLoad
2231         andn    REALSRC, 0x7, SRC       ! SRC = REALSRC rounded down to 8
2232 
2233         ! SRC - 8-byte aligned
2234         ! DST - 64-byte aligned
2235         ldd     [SRC], %f16
2236         prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
2237         alignaddr REALSRC, %g0, %g0     ! set GSR align field; result discarded
2238         ldd     [SRC + 0x08], %f18
2239         prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
2240         faligndata %f16, %f18, %f48     ! realigned data collects in %f48-%f62
2241         ldd     [SRC + 0x10], %f20
2242         prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
2243         faligndata %f18, %f20, %f50
2244         ldd     [SRC + 0x18], %f22
2245         prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
2246         faligndata %f20, %f22, %f52
2247         ldd     [SRC + 0x20], %f24
2248         prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
2249         faligndata %f22, %f24, %f54
2250         ldd     [SRC + 0x28], %f26
2251         prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
2252         faligndata %f24, %f26, %f56
2253         ldd     [SRC + 0x30], %f28
2254         prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
2255         faligndata %f26, %f28, %f58
2256         ldd     [SRC + 0x38], %f30
2257         ldd     [SRC + VIS_BLOCKSIZE], %f16     ! first 8 bytes of next block
2258         sub     CNT, VIS_BLOCKSIZE, CNT
2259         add     SRC, VIS_BLOCKSIZE, SRC
2260         prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
2261         add     REALSRC, VIS_BLOCKSIZE, REALSRC
2262         ba,pt   %ncc, 1f                ! enter the aligned loop
2263         prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read        ! (delay slot)
2264         .align  32
2265 1:                                      ! main loop: one block per pass
2266         ldd     [SRC + 0x08], %f18
2267         faligndata %f28, %f30, %f60     ! finish the block loaded last pass
2268         ldd     [SRC + 0x10], %f20
2269         faligndata %f30, %f16, %f62
2270         stda    %f48, [DST]ASI_BLK_AIUS ! block store of %f48-%f62 to DST
2271         ldd     [SRC + 0x18], %f22
2272         faligndata %f16, %f18, %f48     ! start assembling the next block
2273         ldd     [SRC + 0x20], %f24
2274         faligndata %f18, %f20, %f50
2275         ldd     [SRC + 0x28], %f26
2276         faligndata %f20, %f22, %f52
2277         ldd     [SRC + 0x30], %f28
2278         faligndata %f22, %f24, %f54
2279         sub     CNT, VIS_BLOCKSIZE, CNT
2280         ldd     [SRC + 0x38], %f30
2281         faligndata %f24, %f26, %f56
2282         add     DST, VIS_BLOCKSIZE, DST
2283         ldd     [SRC + VIS_BLOCKSIZE], %f16     ! look ahead into next block
2284         faligndata %f26, %f28, %f58
2285         add     REALSRC, VIS_BLOCKSIZE, REALSRC
2286         prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
2287         add     SRC, VIS_BLOCKSIZE, SRC
2288         prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
2289         cmp     CNT, VIS_BLOCKSIZE + 8
2290         bgu,pt  %ncc, 1b                ! loop while CNT > one block + 8
2291           prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
2292 
2293         ! go to 2: only if CNT is exactly one block and REALSRC & 0x7 is 0
2294         cmp     CNT, VIS_BLOCKSIZE
2295         bne     %ncc, 3f
2296           andcc REALSRC, 0x7, %g0       ! (delay slot) cc used only if not taken
2297         bz,pt   %ncc, 2f
2298           nop
2299 3:                                      ! store pending block, rest by bytes
2300         faligndata %f28, %f30, %f60
2301         faligndata %f30, %f16, %f62
2302         stda    %f48, [DST]ASI_BLK_AIUS
2303         add     DST, VIS_BLOCKSIZE, DST
2304         ba,pt   %ncc, 3f                ! on to the trailing byte loop
2305           nop
2306 2:                                      ! 8-byte aligned source, one block left
2307         ldd     [SRC + 0x08], %f18
2308         fsrc1   %f28, %f60              ! plain register copies, no realign
2309         ldd     [SRC + 0x10], %f20
2310         fsrc1   %f30, %f62
2311         stda    %f48, [DST]ASI_BLK_AIUS ! store the pending block
2312         ldd     [SRC + 0x18], %f22
2313         fsrc1   %f16, %f48
2314         ldd     [SRC + 0x20], %f24
2315         fsrc1   %f18, %f50
2316         ldd     [SRC + 0x28], %f26
2317         fsrc1   %f20, %f52
2318         ldd     [SRC + 0x30], %f28
2319         fsrc1   %f22, %f54
2320         ldd     [SRC + 0x38], %f30
2321         fsrc1   %f24, %f56
2322         sub     CNT, VIS_BLOCKSIZE, CNT
2323         add     DST, VIS_BLOCKSIZE, DST
2324         add     SRC, VIS_BLOCKSIZE, SRC
2325         add     REALSRC, VIS_BLOCKSIZE, REALSRC
2326         fsrc1   %f26, %f58
2327         fsrc1   %f28, %f60
2328         fsrc1   %f30, %f62
2329         stda    %f48, [DST]ASI_BLK_AIUS ! store the final block
2330         add     DST, VIS_BLOCKSIZE, DST
2331         ba,a,pt %ncc, 4f                ! annulled: the nop below is skipped
2332           nop
2333 
2334 3:      tst     CNT                     ! any trailing bytes?
2335         bz,a    %ncc, 4f
2336           nop
2337 
2338 5:      ldub    [REALSRC], TMP          ! trailing bytes, one per iteration
2339         inc     REALSRC
2340         inc     DST
2341         deccc   CNT
2342         bgu     %ncc, 5b
2343           stba  TMP, [DST - 1]%asi      ! (delay slot) store the byte
2344 4:
2345 
2346 .copyout_exit:                          ! success path: undo the FP setup
2347         membar  #Sync
2348 
2349         ldx     [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
2350         wr      %o2, 0, %gsr            ! restore gsr
2351 
2352         ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3     ! entry %fprs
2353         btst    FPRS_FEF, %o3
2354         bz,pt   %icc, 4f                ! FEF was clear on entry
2355           nop
2356         ! FEF was set on entry: presumably reloads the regs saved at entry
2357         BLD_FPQ2Q4_FROMSTACK(%o2)
2358 
2359         ba,pt   %ncc, 1f
2360           wr    %o3, 0, %fprs           ! restore fprs
2361 
2362 4:      ! NOTE(review): macro below presumably zeroes the FP regs used - confirm
2363         FZEROQ2Q4
2364         wr      %o3, 0, %fprs           ! restore fprs
2365 
2366 1:
2367         membar  #Sync
2368         andn    %l6, FPUSED_FLAG, %l6   ! strip the flag set at entry
2369         stn     %l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2370         FP_ALLOWMIGRATE(5, 6)
2371         ret
2372           restore       %g0, 0, %o0     ! drop the window and return 0
2373 
2374 /*
2375  * We got here because of a fault during copyout.
2376  * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
2377  */
2378 .copyout_err:                           ! value loaded into REAL_LOFAULT above
2379         ldn     [THREAD_REG + T_COPYOPS], %o4   ! check for copyop handler
2380         tst     %o4
2381         bz,pt   %ncc, 2f                        ! if not, return error
2382           nop
2383         ldn     [%o4 + CP_COPYOUT], %g2         ! if handler, invoke it with
2384         jmp     %g2                             ! original arguments
2385           restore %g0, 0, %g0                   ! dispose of copy window
2386 2:      ! NOTE(review): jmp above assumes %i0-%i2 hold the original args - confirm
2387         ret
2388           restore %g0, -1, %o0                  ! return error value
2389 
2390 
2391         SET_SIZE(copyout_more)
2392 
2393 #endif  /* lint */
2394 
2395 
2396 #ifdef  lint
2397 
2398 /*ARGSUSED*/
2399 int
2400 xcopyout(const void *kaddr, void *uaddr, size_t count)
2401 { return (0); }  /* lint-only stub; the real routine is the non-lint assembly */
2402 
2403 #else   /* lint */
2404 
2405         ENTRY(xcopyout)
2406         cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
2407         bleu,pt %ncc, .xcopyout_small           ! at or below threshold: small copy
2408           xor   %o0, %o1, %o3                   ! are src, dst alignable?
2409         btst    7, %o3                          ! low three bits the same?
2410         bz,pt   %ncc, .xcopyout_8               ! check for longword alignment
2411           nop
2412         btst    1, %o3                          ! low bit the same?
2413         bz,pt   %ncc, .xcopyout_2               ! check for half-word
2414           nop
2415         sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
2416         ld      [%o3 + %lo(hw_copy_limit_1)], %o3       ! 32-bit limit value
2417         tst     %o3
2418         bz,pn   %icc, .xcopyout_small           ! if zero, disable HW copy
2419           cmp   %o2, %o3                        ! if length <= limit
2420         bleu,pt %ncc, .xcopyout_small           ! go to small copy
2421           nop
2422         ba,pt   %ncc, .xcopyout_more            ! otherwise go to large copy
2423           nop
2424 .xcopyout_2:
2425         btst    3, %o3                          ! low two bits the same?
2426         bz,pt   %ncc, .xcopyout_4               ! check for word alignment
2427           nop
2428         sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
2429         ld      [%o3 + %lo(hw_copy_limit_2)], %o3       ! 32-bit limit value
2430         tst     %o3
2431         bz,pn   %icc, .xcopyout_small           ! if zero, disable HW copy
2432           cmp   %o2, %o3                        ! if length <= limit
2433         bleu,pt %ncc, .xcopyout_small           ! go to small copy
2434           nop
2435         ba,pt   %ncc, .xcopyout_more            ! otherwise go to large copy
2436           nop
2437 .xcopyout_4:
2438         ! already checked longword, must be word aligned
2439         sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
2440         ld      [%o3 + %lo(hw_copy_limit_4)], %o3       ! 32-bit limit value
2441         tst     %o3
2442         bz,pn   %icc, .xcopyout_small           ! if zero, disable HW copy
2443           cmp   %o2, %o3                        ! if length <= limit
2444         bleu,pt %ncc, .xcopyout_small           ! go to small copy
2445           nop
2446         ba,pt   %ncc, .xcopyout_more            ! otherwise go to large copy
2447           nop
2448 .xcopyout_8:
2449         sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
2450         ld      [%o3 + %lo(hw_copy_limit_8)], %o3       ! 32-bit limit value
2451         tst     %o3
2452         bz,pn   %icc, .xcopyout_small           ! if zero, disable HW copy
2453           cmp   %o2, %o3                        ! if length <= limit
2454         bleu,pt %ncc, .xcopyout_small           ! go to small copy
2455           nop
2456         ba,pt   %ncc, .xcopyout_more            ! otherwise go to large copy
2457           nop
2458 
2459 .xcopyout_small:                        ! leaf path: no register window
2460         sethi   %hi(.sm_xcopyout_err), %o5      ! .sm_xcopyout_err is lofault
2461         or      %o5, %lo(.sm_xcopyout_err), %o5
2462         ldn     [THREAD_REG + T_LOFAULT], %o4   ! save existing handler
2463         membar  #Sync                           ! sync error barrier
2464         ba,pt   %ncc, .sm_do_copyout            ! common code (outside this view)
2465           stn   %o5, [THREAD_REG + T_LOFAULT]   ! set t_lofault
2466 
2467 .xcopyout_more:                         ! FP path: joins copyout_more code
2468         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2469         sethi   %hi(.xcopyout_err), REAL_LOFAULT
2470         ba,pt   %ncc, .do_copyout               ! common code
2471           or    REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT ! (delay slot)
2472 
2473 /*
2474  * We got here because of fault during xcopyout
2475  * Errno value is in ERRNO
2476  */
2477 .xcopyout_err:                          ! fault exit for the windowed FP path
2478         ldn     [THREAD_REG + T_COPYOPS], %o4   ! check for copyop handler
2479         tst     %o4
2480         bz,pt   %ncc, 2f                        ! if not, return error
2481           nop
2482         ldn     [%o4 + CP_XCOPYOUT], %g2        ! if handler, invoke it with
2483         jmp     %g2                             ! original arguments
2484           restore %g0, 0, %g0                   ! dispose of copy window
2485 2:
2486         ret
2487           restore ERRNO, 0, %o0                 ! return errno value
2488 
2489 .sm_xcopyout_err:                       ! fault exit for the leaf path
2490 
2491         membar  #Sync
2492         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2493         mov     SM_SAVE_SRC, %o0        ! put the saved arguments back
2494         mov     SM_SAVE_DST, %o1        ! in the argument registers
2495         mov     SM_SAVE_COUNT, %o2
2496         ldn     [THREAD_REG + T_COPYOPS], %o3   ! check for copyop handler
2497         tst     %o3
2498         bz,pt   %ncc, 3f                        ! if not, return error
2499           nop
2500         ldn     [%o3 + CP_XCOPYOUT], %o5        ! if handler, invoke it with
2501         jmp     %o5                             ! original arguments
2502           nop
2503 3:      ! NOTE(review): %g1 is presumably the ERRNO register - confirm
2504         retl
2505           or    %g1, 0, %o0             ! return errno value
2506 
2507         SET_SIZE(xcopyout)
2508 
2509 #endif  /* lint */
2510 
2511 #ifdef  lint
2512 
2513 /*ARGSUSED*/
2514 int
2515 xcopyout_little(const void *kaddr, void *uaddr, size_t count)
2516 { return (0); }  /* lint-only stub; the real routine is the non-lint assembly */
2517 
2518 #else   /* lint */
2519 
2520         ENTRY(xcopyout_little)
2521         sethi   %hi(.xcopyio_err), %o5          ! .xcopyio_err (defined elsewhere)
2522         or      %o5, %lo(.xcopyio_err), %o5     ! is the lofault handler here
2523         ldn     [THREAD_REG + T_LOFAULT], %o4   ! fetch existing handler
2524         membar  #Sync                           ! sync error barrier
2525         stn     %o5, [THREAD_REG + T_LOFAULT]   ! set t_lofault
2526         mov     %o4, %o5                        ! old handler is kept in %o5
2527         ! Byte-reversing copy: byte i of dst gets byte (count - 1 - i) of src
2528         subcc   %g0, %o2, %o3           ! %o3 = -count, index counts up to 0
2529         add     %o0, %o2, %o0           ! %o0 = src + count
2530         bz,pn   %ncc, 2f                ! check for zero bytes
2531           sub   %o2, 1, %o4             ! (delay slot) %o4 = count - 1
2532         add     %o0, %o4, %o0           ! start w/last byte
2533         add     %o1, %o2, %o1           ! %o1 = dst + count
2534         ldub    [%o0 + %o3], %o4        ! load last source byte
2535 
2536 1:      stba    %o4, [%o1 + %o3]ASI_AIUSL       ! store at dst + count + index
2537         inccc   %o3                     ! advance index; carry set at 0
2538         sub     %o0, 2, %o0             ! get next byte
2539         bcc,a,pt %ncc, 1b               ! loop until the index reaches 0
2540           ldub  [%o0 + %o3], %o4        ! (annulled slot) preceding source byte
2541 
2542 2:
2543         membar  #Sync                           ! sync error barrier
2544         stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2545         retl
2546           mov   %g0, %o0                ! return (0)
2547 
2548         SET_SIZE(xcopyout_little)
2549 
2550 #endif  /* lint */
2551 
2552 /*
2553  * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
2554  */
2555 
2556 #if defined(lint)
2557 
2558 /*ARGSUSED*/
2559 int
2560 copyin(const void *uaddr, void *kaddr, size_t count)
2561 { return (0); }  /* lint-only stub; the real routine is the non-lint assembly */
2562 
2563 #else   /* lint */
2564 
2565         ENTRY(copyin)
2566         cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
2567         bleu,pt %ncc, .copyin_small             ! at or below threshold: small copy
2568           xor   %o0, %o1, %o3                   ! are src, dst alignable?
2569         btst    7, %o3                          ! low three bits the same?
2570         bz,pt   %ncc, .copyin_8                 ! check for longword alignment
2571           nop
2572         btst    1, %o3                          ! low bit the same?
2573         bz,pt   %ncc, .copyin_2                 ! check for half-word
2574           nop
2575         sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
2576         ld      [%o3 + %lo(hw_copy_limit_1)], %o3       ! 32-bit limit value
2577         tst     %o3
2578         bz,pn   %icc, .copyin_small             ! if zero, disable HW copy
2579           cmp   %o2, %o3                        ! if length <= limit
2580         bleu,pt %ncc, .copyin_small             ! go to small copy
2581           nop
2582         ba,pt   %ncc, .copyin_more              ! otherwise go to large copy
2583           nop
2584 .copyin_2:
2585         btst    3, %o3                          ! low two bits the same?
2586         bz,pt   %ncc, .copyin_4                 ! check for word alignment
2587           nop
2588         sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
2589         ld      [%o3 + %lo(hw_copy_limit_2)], %o3       ! 32-bit limit value
2590         tst     %o3
2591         bz,pn   %icc, .copyin_small             ! if zero, disable HW copy
2592           cmp   %o2, %o3                        ! if length <= limit
2593         bleu,pt %ncc, .copyin_small             ! go to small copy
2594           nop
2595         ba,pt   %ncc, .copyin_more              ! otherwise go to large copy
2596           nop
2597 .copyin_4:
2598         ! already checked longword, must be word aligned
2599         sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
2600         ld      [%o3 + %lo(hw_copy_limit_4)], %o3       ! 32-bit limit value
2601         tst     %o3
2602         bz,pn   %icc, .copyin_small             ! if zero, disable HW copy
2603           cmp   %o2, %o3                        ! if length <= limit
2604         bleu,pt %ncc, .copyin_small             ! go to small copy
2605           nop
2606         ba,pt   %ncc, .copyin_more              ! otherwise go to large copy
2607           nop
2608 .copyin_8:
2609         sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
2610         ld      [%o3 + %lo(hw_copy_limit_8)], %o3       ! 32-bit limit value
2611         tst     %o3
2612         bz,pn   %icc, .copyin_small             ! if zero, disable HW copy
2613           cmp   %o2, %o3                        ! if length <= limit
2614         bleu,pt %ncc, .copyin_small             ! go to small copy
2615           nop
2616         ba,pt   %ncc, .copyin_more              ! otherwise go to large copy
2617           nop
2618 
2619         .align  16
2620         nop                             ! instruction alignment
2621                                         ! see discussion at start of file
2622 .copyin_small:                          ! leaf path: no register window
2623         sethi   %hi(.sm_copyin_err), %o5        ! .sm_copyin_err is lofault
2624         or      %o5, %lo(.sm_copyin_err), %o5
2625         ldn     [THREAD_REG + T_LOFAULT], %o4   ! set/save t_lofault, no tramp
2626         membar  #Sync                           ! sync error barrier
2627         stn     %o5, [THREAD_REG + T_LOFAULT]
2628 .sm_do_copyin:
2629         mov     %o0, SM_SAVE_SRC        ! keep the arguments for .sm_copyin_err
2630         mov     %o1, SM_SAVE_DST
2631         cmp     %o2, SHORTCOPY          ! check for really short case
2632         bleu,pt %ncc, .ci_sm_left       !
2633           mov   %o2, SM_SAVE_COUNT      ! (delay slot, always executed)
2634         cmp     %o2, CHKSIZE            ! check for medium length cases
2635         bgu,pn  %ncc, .ci_med           !
2636           or    %o0, %o1, %o3           ! prepare alignment check
2637         andcc   %o3, 0x3, %g0           ! test for alignment
2638         bz,pt   %ncc, .ci_sm_word       ! branch to word aligned case
2639 .ci_sm_movebytes:                       ! the sub below is also the delay slot
2640           sub   %o2, 3, %o2             ! adjust count to allow cc zero test
2641 .ci_sm_notalign4:
2642         lduba   [%o0]ASI_USER, %o3      ! read byte
2643         subcc   %o2, 4, %o2             ! reduce count by 4
2644         stb     %o3, [%o1]              ! write byte
2645         add     %o0, 1, %o0             ! advance SRC by 1
2646         lduba   [%o0]ASI_USER, %o3      ! repeat for a total of 4 bytes
2647         add     %o0, 1, %o0             ! advance SRC by 1
2648         stb     %o3, [%o1 + 1]
2649         add     %o1, 4, %o1             ! advance DST by 4
2650         lduba   [%o0]ASI_USER, %o3
2651         add     %o0, 1, %o0             ! advance SRC by 1
2652         stb     %o3, [%o1 - 2]
2653         lduba   [%o0]ASI_USER, %o3
2654         add     %o0, 1, %o0             ! advance SRC by 1
2655         bgt,pt  %ncc, .ci_sm_notalign4  ! loop til 3 or fewer bytes remain
2656           stb   %o3, [%o1 - 1]
2657         add     %o2, 3, %o2             ! restore count
2658 .ci_sm_left:                            ! this path moves at most three bytes
2659         tst     %o2
2660         bz,pt   %ncc, .ci_sm_exit
2661           nop
2662         lduba   [%o0]ASI_USER, %o3              ! load one byte
2663         deccc   %o2                     ! reduce count for cc test
2664         bz,pt   %ncc, .ci_sm_exit
2665           stb   %o3,[%o1]               ! store one byte
2666         inc     %o0
2667         lduba   [%o0]ASI_USER, %o3      ! load second byte
2668         deccc   %o2
2669         bz,pt   %ncc, .ci_sm_exit
2670           stb   %o3,[%o1 + 1]           ! store second byte
2671         inc     %o0
2672         lduba   [%o0]ASI_USER, %o3      ! load third byte
2673         stb     %o3,[%o1 + 2]           ! store third byte
2674         membar  #Sync                           ! sync error barrier
2675         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2676         retl
2677           mov   %g0, %o0                ! return 0
2678         .align  16
2679 .ci_sm_words:
2680         lduwa   [%o0]ASI_USER, %o3              ! read word
2681 .ci_sm_wordx:                           ! entry with first word already in %o3
2682         subcc   %o2, 8, %o2             ! update count
2683         stw     %o3, [%o1]              ! write word
2684         add     %o0, 4, %o0             ! update SRC
2685         add     %o1, 8, %o1             ! update DST
2686         lduwa   [%o0]ASI_USER, %o3      ! read word
2687         add     %o0, 4, %o0             ! update SRC
2688         bgt,pt  %ncc, .ci_sm_words      ! loop til done
2689           stw   %o3, [%o1 - 4]          ! write word
2690         addcc   %o2, 7, %o2             ! restore count
2691         bz,pt   %ncc, .ci_sm_exit
2692           nop
2693         deccc   %o2
2694         bz,pt   %ncc, .ci_sm_byte
2695 .ci_sm_half:                            ! the subcc below is also the delay slot
2696           subcc %o2, 2, %o2             ! reduce count by 2
2697         lduha   [%o0]ASI_USER, %o3      ! read half word
2698         add     %o0, 2, %o0             ! advance SRC by 2
2699         add     %o1, 2, %o1             ! advance DST by 2
2700         bgt,pt  %ncc, .ci_sm_half       ! loop til done
2701           sth   %o3, [%o1 - 2]          ! write half word
2702         addcc   %o2, 1, %o2             ! restore count
2703         bz,pt   %ncc, .ci_sm_exit
2704           nop
2705 .ci_sm_byte:                            ! exactly one byte left
2706         lduba   [%o0]ASI_USER, %o3
2707         stb     %o3, [%o1]
2708         membar  #Sync                           ! sync error barrier
2709         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2710         retl
2711           mov   %g0, %o0                ! return 0
2712         .align  16
2713 .ci_sm_word:                            ! count was already reduced by 3 above
2714         subcc   %o2, 4, %o2             ! update count
2715         bgt,pt  %ncc, .ci_sm_wordx
2716           lduwa [%o0]ASI_USER, %o3              ! read word
2717         addcc   %o2, 3, %o2             ! restore count
2718         bz,pt   %ncc, .ci_sm_exit
2719           stw   %o3, [%o1]              ! write word
2720         deccc   %o2                     ! reduce count for cc test
2721         add     %o0, 4, %o0
2722         lduba   [%o0]ASI_USER, %o3      ! load one byte
2723         bz,pt   %ncc, .ci_sm_exit
2724           stb   %o3, [%o1 + 4]          ! store one byte
2725         inc     %o0
2726         lduba   [%o0]ASI_USER, %o3      ! load second byte
2727         deccc   %o2
2728         bz,pt   %ncc, .ci_sm_exit
2729           stb   %o3, [%o1 + 5]          ! store second byte
2730         inc     %o0
2731         lduba   [%o0]ASI_USER, %o3      ! load third byte
2732         stb     %o3, [%o1 + 6]          ! store third byte
2733 .ci_sm_exit:                            ! common success exit of the leaf path
2734         membar  #Sync                           ! sync error barrier
2735         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2736         retl
2737           mov   %g0, %o0                ! return 0
2738 
2739         .align 16
2740 .ci_med:                                ! count above CHKSIZE: pick widest unit
2741         xor     %o0, %o1, %o3           ! setup alignment check
2742         btst    1, %o3
2743         bnz,pt  %ncc, .ci_sm_movebytes  ! unaligned
2744           nop
2745         btst    3, %o3
2746         bnz,pt  %ncc, .ci_med_half      ! halfword aligned
2747           nop
2748         btst    7, %o3
2749         bnz,pt  %ncc, .ci_med_word      ! word aligned
2750           nop
2751 .ci_med_long:
2752         btst    3, %o0                  ! check for
2753         bz,pt   %ncc, .ci_med_long1     ! word alignment
2754           nop
2755 .ci_med_long0:                          ! byte copy until SRC is word aligned
2756         lduba   [%o0]ASI_USER, %o3              ! load one byte
2757         inc     %o0
2758         stb     %o3,[%o1]               ! store byte
2759         inc     %o1
2760         btst    3, %o0
2761         bnz,pt  %ncc, .ci_med_long0
2762           dec   %o2                     ! (delay slot) one byte fewer to move
2763 .ci_med_long1:                  ! word aligned
2764         btst    7, %o0                  ! check for long word
2765         bz,pt   %ncc, .ci_med_long2
2766           nop
2767         lduwa   [%o0]ASI_USER, %o3      ! load word
2768         add     %o0, 4, %o0             ! advance SRC by 4
2769         stw     %o3, [%o1]              ! store word
2770         add     %o1, 4, %o1             ! advance DST by 4
2771         sub     %o2, 4, %o2             ! reduce count by 4
2772 !
2773 !  Now long word aligned and have at least 32 bytes to move
2774 !
2775 .ci_med_long2:
2776         sub     %o2, 31, %o2            ! adjust count to allow cc zero test
2777 .ci_med_lmove:
2778         ldxa    [%o0]ASI_USER, %o3      ! read long word
2779         subcc   %o2, 32, %o2            ! reduce count by 32
2780         stx     %o3, [%o1]              ! write long word
2781         add     %o0, 8, %o0             ! advance SRC by 8
2782         ldxa    [%o0]ASI_USER, %o3      ! repeat for a total for 4 long words
2783         add     %o0, 8, %o0             ! advance SRC by 8
2784         stx     %o3, [%o1 + 8]
2785         add     %o1, 32, %o1            ! advance DST by 32
2786         ldxa    [%o0]ASI_USER, %o3
2787         add     %o0, 8, %o0             ! advance SRC by 8
2788         stx     %o3, [%o1 - 16]
2789         ldxa    [%o0]ASI_USER, %o3
2790         add     %o0, 8, %o0             ! advance SRC by 8
2791         bgt,pt  %ncc, .ci_med_lmove     ! loop til 31 or fewer bytes left
2792           stx   %o3, [%o1 - 8]
2793         addcc   %o2, 24, %o2            ! restore count to long word offset
2794         ble,pt  %ncc, .ci_med_lextra    ! check for more long words to move
2795           nop
2796 .ci_med_lword:
2797         ldxa    [%o0]ASI_USER, %o3      ! read long word
2798         subcc   %o2, 8, %o2             ! reduce count by 8
2799         stx     %o3, [%o1]              ! write long word
2800         add     %o0, 8, %o0             ! advance SRC by 8
2801         bgt,pt  %ncc, .ci_med_lword     ! loop til 7 or fewer bytes left
2802           add   %o1, 8, %o1             ! advance DST by 8
2803 .ci_med_lextra:
2804         addcc   %o2, 7, %o2             ! restore rest of count
2805         bz,pt   %ncc, .ci_sm_exit       ! if zero, then done
2806           deccc %o2                     ! (delay slot) sets cc for the next bz
2807         bz,pt   %ncc, .ci_sm_byte
2808           nop
2809         ba,pt   %ncc, .ci_sm_half
2810           nop
2811 
2812         .align 16
2813         nop                             ! instruction alignment
2814                                         ! see discussion at start of file
2815 .ci_med_word:
2816         btst    3, %o0                  ! check for
2817         bz,pt   %ncc, .ci_med_word1     ! word alignment
2818           nop
2819 .ci_med_word0:                          ! byte copy until SRC is word aligned
2820         lduba   [%o0]ASI_USER, %o3      ! load one byte
2821         inc     %o0
2822         stb     %o3,[%o1]               ! store byte
2823         inc     %o1
2824         btst    3, %o0
2825         bnz,pt  %ncc, .ci_med_word0
2826           dec   %o2                     ! (delay slot) one byte fewer to move
2827 !
2828 !  Now word aligned and have at least 36 bytes to move
2829 !
2830 .ci_med_word1:
2831         sub     %o2, 15, %o2            ! adjust count to allow cc zero test
2832 .ci_med_wmove:
2833         lduwa   [%o0]ASI_USER, %o3      ! read word
2834         subcc   %o2, 16, %o2            ! reduce count by 16
2835         stw     %o3, [%o1]              ! write word
2836         add     %o0, 4, %o0             ! advance SRC by 4
2837         lduwa   [%o0]ASI_USER, %o3      ! repeat for a total for 4 words
2838         add     %o0, 4, %o0             ! advance SRC by 4
2839         stw     %o3, [%o1 + 4]
2840         add     %o1, 16, %o1            ! advance DST by 16
2841         lduwa   [%o0]ASI_USER, %o3
2842         add     %o0, 4, %o0             ! advance SRC by 4
2843         stw     %o3, [%o1 - 8]
2844         lduwa   [%o0]ASI_USER, %o3
2845         add     %o0, 4, %o0             ! advance SRC by 4
2846         bgt,pt  %ncc, .ci_med_wmove     ! loop til 15 or fewer bytes left
2847           stw   %o3, [%o1 - 4]
2848         addcc   %o2, 12, %o2            ! restore count to word offset
2849         ble,pt  %ncc, .ci_med_wextra    ! check for more words to move
2850           nop
2851 .ci_med_word2:
2852         lduwa   [%o0]ASI_USER, %o3      ! read word
2853         subcc   %o2, 4, %o2             ! reduce count by 4
2854         stw     %o3, [%o1]              ! write word
2855         add     %o0, 4, %o0             ! advance SRC by 4
2856         bgt,pt  %ncc, .ci_med_word2     ! loop til 3 or fewer bytes left
2857           add   %o1, 4, %o1             ! advance DST by 4
2858 .ci_med_wextra:
2859         addcc   %o2, 3, %o2             ! restore rest of count
2860         bz,pt   %ncc, .ci_sm_exit       ! if zero, then done
2861           deccc %o2                     ! (delay slot) sets cc for the next bz
2862         bz,pt   %ncc, .ci_sm_byte
2863           nop
2864         ba,pt   %ncc, .ci_sm_half
2865           nop
2866 
2867         .align 16
2868         nop                             ! instruction alignment
2869                                         ! see discussion at start of file
2870 .ci_med_half:
2871         btst    1, %o0                  ! check for
2872         bz,pt   %ncc, .ci_med_half1     ! half word alignment
2873           nop
2874         lduba   [%o0]ASI_USER, %o3      ! load one byte
2875         inc     %o0
2876         stb     %o3,[%o1]               ! store byte
2877         inc     %o1
2878         dec     %o2
2879 !
2880 !  Now half word aligned and have at least 38 bytes to move
2881 !
2882 .ci_med_half1:
2883         sub     %o2, 7, %o2             ! adjust count to allow cc zero test
2884 .ci_med_hmove:
2885         lduha   [%o0]ASI_USER, %o3      ! read half word
2886         subcc   %o2, 8, %o2             ! reduce count by 8
2887         sth     %o3, [%o1]              ! write half word
2888         add     %o0, 2, %o0             ! advance SRC by 2
2889         lduha   [%o0]ASI_USER, %o3      ! repeat for a total for 4 halfwords
2890         add     %o0, 2, %o0             ! advance SRC by 2
2891         sth     %o3, [%o1 + 2]
2892         add     %o1, 8, %o1             ! advance DST by 8
2893         lduha   [%o0]ASI_USER, %o3
2894         add     %o0, 2, %o0             ! advance SRC by 2
2895         sth     %o3, [%o1 - 4]
2896         lduha   [%o0]ASI_USER, %o3
2897         add     %o0, 2, %o0             ! advance SRC by 2
2898         bgt,pt  %ncc, .ci_med_hmove     ! loop til 7 or fewer bytes left
2899           sth   %o3, [%o1 - 2]
2900         addcc   %o2, 7, %o2             ! restore count
2901         bz,pt   %ncc, .ci_sm_exit
2902           deccc %o2                     ! (delay slot) sets cc for the next bz
2903         bz,pt   %ncc, .ci_sm_byte
2904           nop
2905         ba,pt   %ncc, .ci_sm_half
2906           nop
2907 
2908 .sm_copyin_err:                         ! lofault target of the leaf copyin path
2909         membar  #Sync
2910         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2911         mov     SM_SAVE_SRC, %o0        ! put the saved arguments back
2912         mov     SM_SAVE_DST, %o1        ! in the argument registers
2913         mov     SM_SAVE_COUNT, %o2
2914         ldn     [THREAD_REG + T_COPYOPS], %o3   ! check for copyop handler
2915         tst     %o3
2916         bz,pt   %ncc, 3f                        ! if not, return error
2917           nop
2918         ldn     [%o3 + CP_COPYIN], %o5          ! if handler, invoke it with
2919         jmp     %o5                             ! original arguments
2920           nop
2921 3:
2922         retl
2923           or    %g0, -1, %o0            ! return error value (-1)
2924 
2925         SET_SIZE(copyin)
2926 
2927 
2928 /*
2929  * The _more entry points are not intended to be used directly by
2930  * any caller from outside this file.  They are provided to allow
2931  * profiling and dtrace of the portions of the copy code that use
2932  * the floating point registers.
2933  * This entry is particularly important as DTRACE (at least as of
2934  * 4/2004) does not support leaf functions.
2935  */
2936 
2937         ENTRY(copyin_more)
             !
             ! Large-copy (FP register / block store) path of copyin.
             ! Takes a new register window, records .copyin_err in
             ! REAL_LOFAULT and falls through to the shared .do_copyin
             ! code.  xcopyin and copyin_noerr also branch to .do_copyin
             ! after loading their own error label into REAL_LOFAULT.
             !
2938 .copyin_more:
2939         prefetch [%o0], #n_reads
2940         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2941         set     .copyin_err, REAL_LOFAULT
2942
2943 /*
2944  * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes
2945  */
2946 .do_copyin:
2947         set     copyio_fault, %l7               ! copyio_fault is lofault val
2948
2949         ldn     [THREAD_REG + T_LOFAULT], %l6   ! save existing handler
2950         membar  #Sync                           ! sync error barrier
2951         stn     %l7, [THREAD_REG + T_LOFAULT]   ! set t_lofault
2952
2953         mov     %i0, SAVE_SRC                   ! remember the original arguments
2954         mov     %i1, SAVE_DST
2955         mov     %i2, SAVE_COUNT
2956
2957         FP_NOMIGRATE(6, 7)
2958
2959         rd      %fprs, %o2              ! check for unused fp
2960         st      %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
2961         btst    FPRS_FEF, %o2
2962         bz,a,pt %icc, .do_blockcopyin   ! FP not enabled: no FP regs to save
2963           wr    %g0, FPRS_FEF, %fprs    ! (annulled unless taken) enable FP
2964
             ! FP was already enabled: macro defined outside this section,
             ! presumably spills the FP registers used below to the stack.
2965         BST_FPQ2Q4_TOSTACK(%o2)
2966
2967 .do_blockcopyin:
2968         rd      %gsr, %o2
2969         stx     %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]      ! save gsr
2970         or      %l6, FPUSED_FLAG, %l6   ! tag saved handler: FP state in use
2971
2972         andcc   DST, VIS_BLOCKSIZE - 1, TMP     ! TMP = DST offset within a block
2973         mov     ASI_USER, %asi                  ! loads via %asi below use ASI_USER
2974         bz,pt   %ncc, 2f                        ! DST already block aligned
2975           neg   TMP
2976         add     TMP, VIS_BLOCKSIZE, TMP
2977
2978         ! TMP = bytes required to align DST on FP_BLOCK boundary
2979         ! Using SRC as a tmp here
2980         cmp     TMP, 3
2981         bleu,pt %ncc, 1f                ! 3 or fewer: byte loop only
2982           sub   CNT,TMP,CNT             ! adjust main count
2983         sub     TMP, 3, TMP             ! adjust for end of loop test
2984 .ci_blkalign:
2985         lduba   [REALSRC]%asi, SRC      ! move 4 bytes per loop iteration
2986         stb     SRC, [DST]
2987         subcc   TMP, 4, TMP
2988         lduba   [REALSRC + 1]%asi, SRC
2989         add     REALSRC, 4, REALSRC
2990         stb     SRC, [DST + 1]
2991         lduba   [REALSRC - 2]%asi, SRC
2992         add     DST, 4, DST
2993         stb     SRC, [DST - 2]
2994         lduba   [REALSRC - 1]%asi, SRC
2995         bgu,pt  %ncc, .ci_blkalign
2996           stb   SRC, [DST - 1]
2997
2998         addcc   TMP, 3, TMP             ! restore count adjustment
2999         bz,pt   %ncc, 2f                ! no bytes left?
3000           nop
3001 1:      lduba   [REALSRC]%asi, SRC      ! finish alignment one byte at a time
3002         inc     REALSRC
3003         inc     DST
3004         deccc   TMP
3005         bgu     %ncc, 1b
3006           stb   SRC, [DST - 1]
3007
3008 2:
3009         membar  #StoreLoad
3010         andn    REALSRC, 0x7, SRC       ! SRC = REALSRC rounded down to 8 bytes
3011
3012         ! SRC - 8-byte aligned
3013         ! DST - 64-byte aligned
             !
             ! Prime the pipeline: load the first source block as eight
             ! doublewords (%f16-%f30), start merging them with faligndata
             ! into the output registers %f48-%f58, and issue prefetches
             ! for the blocks ahead.
3014         ldda    [SRC]%asi, %f16
3015         prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #n_reads
3016         alignaddr REALSRC, %g0, %g0     ! byte offset of REALSRC for faligndata
3017         ldda    [SRC + 0x08]%asi, %f18
3018         prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #n_reads
3019         faligndata %f16, %f18, %f48
3020         ldda    [SRC + 0x10]%asi, %f20
3021         prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
3022         faligndata %f18, %f20, %f50
3023         ldda    [SRC + 0x18]%asi, %f22
3024         prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read
3025         faligndata %f20, %f22, %f52
3026         ldda    [SRC + 0x20]%asi, %f24
3027         prefetcha [SRC + (8 * VIS_BLOCKSIZE)]%asi, #one_read
3028         faligndata %f22, %f24, %f54
3029         ldda    [SRC + 0x28]%asi, %f26
3030         prefetcha [SRC + (12 * VIS_BLOCKSIZE)]%asi, #one_read
3031         faligndata %f24, %f26, %f56
3032         ldda    [SRC + 0x30]%asi, %f28
3033         prefetcha [SRC + (16 * VIS_BLOCKSIZE)]%asi, #one_read
3034         faligndata %f26, %f28, %f58
3035         ldda    [SRC + 0x38]%asi, %f30
3036         ldda    [SRC + VIS_BLOCKSIZE]%asi, %f16 ! first doubleword of next block
3037         sub     CNT, VIS_BLOCKSIZE, CNT
3038         add     SRC, VIS_BLOCKSIZE, SRC
3039         prefetcha [SRC + (19 * VIS_BLOCKSIZE)]%asi, #one_read
3040         add     REALSRC, VIS_BLOCKSIZE, REALSRC
3041         ba,pt   %ncc, 1f
3042         prefetcha [SRC + (23 * VIS_BLOCKSIZE)]%asi, #one_read  ! (delay slot)
3043         .align  32
             !
             ! Main loop.  Each pass finishes merging the previously loaded
             ! block (%f60, %f62), block-stores %f48-%f62 to DST, and loads
             ! and merges the next source block.  Runs while
             ! CNT > VIS_BLOCKSIZE + 8.
             !
3044 1:
3045         ldda    [SRC + 0x08]%asi, %f18
3046         faligndata %f28, %f30, %f60
3047         ldda    [SRC + 0x10]%asi, %f20
3048         faligndata %f30, %f16, %f62
3049         stda    %f48, [DST]ASI_BLK_P    ! store one full block to DST
3050         ldda    [SRC + 0x18]%asi, %f22
3051         faligndata %f16, %f18, %f48
3052         ldda    [SRC + 0x20]%asi, %f24
3053         faligndata %f18, %f20, %f50
3054         ldda    [SRC + 0x28]%asi, %f26
3055         faligndata %f20, %f22, %f52
3056         ldda    [SRC + 0x30]%asi, %f28
3057         faligndata %f22, %f24, %f54
3058         sub     CNT, VIS_BLOCKSIZE, CNT
3059         ldda    [SRC + 0x38]%asi, %f30
3060         faligndata %f24, %f26, %f56
3061         add     DST, VIS_BLOCKSIZE, DST
3062         ldda    [SRC + VIS_BLOCKSIZE]%asi, %f16
3063         faligndata %f26, %f28, %f58
3064         add     REALSRC, VIS_BLOCKSIZE, REALSRC
3065         prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
3066         add     SRC, VIS_BLOCKSIZE, SRC
3067         prefetcha [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
3068         cmp     CNT, VIS_BLOCKSIZE + 8
3069         bgu,pt  %ncc, 1b
3070           prefetcha [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
3071
             ! If exactly one block remains and REALSRC is 8-byte aligned,
             ! take the path at 2: which moves it with block stores;
             ! otherwise store the pending block at 3: and finish bytewise.
3072         ! only if REALSRC & 0x7 is 0
3073         cmp     CNT, VIS_BLOCKSIZE
3074         bne     %ncc, 3f
3075           andcc REALSRC, 0x7, %g0
3076         bz,pt   %ncc, 2f
3077           nop
3078 3:
3079         faligndata %f28, %f30, %f60
3080         faligndata %f30, %f16, %f62
3081         stda    %f48, [DST]ASI_BLK_P    ! flush the pending merged block
3082         add     DST, VIS_BLOCKSIZE, DST
3083         ba,pt   %ncc, 3f                ! go copy any remaining bytes
3084           nop
3085 2:
             ! Aligned source, one block left: no realignment is needed, so
             ! registers are copied with fsrc1 instead of faligndata.
3086         ldda    [SRC + 0x08]%asi, %f18
3087         fsrc1   %f28, %f60
3088         ldda    [SRC + 0x10]%asi, %f20
3089         fsrc1   %f30, %f62
3090         stda    %f48, [DST]ASI_BLK_P    ! store the pending block
3091         ldda    [SRC + 0x18]%asi, %f22
3092         fsrc1   %f16, %f48
3093         ldda    [SRC + 0x20]%asi, %f24
3094         fsrc1   %f18, %f50
3095         ldda    [SRC + 0x28]%asi, %f26
3096         fsrc1   %f20, %f52
3097         ldda    [SRC + 0x30]%asi, %f28
3098         fsrc1   %f22, %f54
3099         ldda    [SRC + 0x38]%asi, %f30
3100         fsrc1   %f24, %f56
3101         sub     CNT, VIS_BLOCKSIZE, CNT
3102         add     DST, VIS_BLOCKSIZE, DST
3103         add     SRC, VIS_BLOCKSIZE, SRC
3104         add     REALSRC, VIS_BLOCKSIZE, REALSRC
3105         fsrc1   %f26, %f58
3106         fsrc1   %f28, %f60
3107         fsrc1   %f30, %f62
3108         stda    %f48, [DST]ASI_BLK_P    ! store the final block
3109         add     DST, VIS_BLOCKSIZE, DST
3110         ba,a,pt %ncc, 4f                ! annulled: the nop below is skipped
3111           nop
3112
3113 3:      tst     CNT                     ! any trailing bytes?
3114         bz,a    %ncc, 4f
3115           nop
3116
3117 5:      lduba   [REALSRC]ASI_USER, TMP  ! copy the tail one byte at a time
3118         inc     REALSRC
3119         inc     DST
3120         deccc   CNT
3121         bgu     %ncc, 5b
3122           stb   TMP, [DST - 1]
3123 4:
3124
             ! Successful completion: restore %gsr, the FP register state
             ! and %fprs, reinstall the previous t_lofault and return 0.
3125 .copyin_exit:
3126         membar  #Sync
3127
3128         ldx     [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! restore gsr
3129         wr      %o2, 0, %gsr
3130
3131         ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
3132         btst    FPRS_FEF, %o3
3133         bz,pt   %icc, 4f                ! FP was not enabled on entry
3134           nop
3135
             ! FP was in use on entry: macro defined outside this section,
             ! presumably reloads the registers saved by BST_FPQ2Q4_TOSTACK.
3136         BLD_FPQ2Q4_FROMSTACK(%o2)
3137
3138         ba,pt   %ncc, 1f
3139           wr    %o3, 0, %fprs           ! restore fprs
3140
3141 4:
             ! Macro defined outside this section; presumably zeroes the
             ! FP registers used by the copy before FP is turned back off.
3142         FZEROQ2Q4
3143         wr      %o3, 0, %fprs           ! restore fprs
3144
3145 1:
3146         membar  #Sync                           ! sync error barrier
3147         andn    %l6, FPUSED_FLAG, %l6           ! strip the flag set at .do_blockcopyin
3148         stn     %l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
3149         FP_ALLOWMIGRATE(5, 6)
3150         ret
3151           restore       %g0, 0, %o0             ! return (0)
3152 /*
3153  * We got here because of a fault during copyin
3154  * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
3155  */
3156 .copyin_err:
3157         ldn     [THREAD_REG + T_COPYOPS], %o4   ! check for copyop handler
3158         tst     %o4
3159         bz,pt   %ncc, 2f                        ! if not, return error
3160         nop
3161         ldn     [%o4 + CP_COPYIN], %g2          ! if handler, invoke it with
3162         jmp     %g2                             ! original arguments
3163         restore %g0, 0, %g0                     ! dispose of copy window
3164 2:
3165         ret
3166         restore %g0, -1, %o0                    ! return error value
3167
3168
3169         SET_SIZE(copyin_more)
3170 
3171 #endif  /* lint */
3172 
3173 #ifdef  lint
3174
     /* Lint-only C stub; the real xcopyin is the assembly routine below. */
3175 /*ARGSUSED*/
3176 int
3177 xcopyin(const void *uaddr, void *kaddr, size_t count)
3178 { return (0); }
3179
3180 #else   /* lint */
3181
3182         ENTRY(xcopyin)
             !
             ! xcopyin(uaddr = %o0, kaddr = %o1, count = %o2)
             !
             ! Chooses between the leaf (small) copy and the FP block copy.
             ! Counts at or below VIS_COPY_THRESHOLD always use the leaf
             ! path.  Otherwise %o3 = src ^ dst shows how well the two
             ! addresses can be aligned to each other, which selects the
             ! matching hw_copy_limit_N; a limit of zero, or a count at or
             ! below the limit, also selects the leaf path.
             ! On a fault with no copyops handler the value returned is the
             ! one left in ERRNO (large path) or %g1 (small path).
             !
3183
3184         cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
3185         bleu,pt %ncc, .xcopyin_small            ! small: use leaf routine
3186           xor   %o0, %o1, %o3                   ! are src, dst alignable?
3187         btst    7, %o3                          !
3188         bz,pt   %ncc, .xcopyin_8                ! check for longword alignment
3189           nop
3190         btst    1, %o3                          !
3191         bz,pt   %ncc, .xcopyin_2                ! check for half-word
3192           nop
3193         sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
3194         ld      [%o3 + %lo(hw_copy_limit_1)], %o3
3195         tst     %o3
3196         bz,pn   %icc, .xcopyin_small            ! if zero, disable HW copy
3197           cmp   %o2, %o3                        ! if length <= limit
3198         bleu,pt %ncc, .xcopyin_small            ! go to small copy
3199           nop
3200         ba,pt   %ncc, .xcopyin_more             ! otherwise go to large copy
3201           nop
3202 .xcopyin_2:
3203         btst    3, %o3                          !
3204         bz,pt   %ncc, .xcopyin_4                ! check for word alignment
3205           nop
3206         sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
3207         ld      [%o3 + %lo(hw_copy_limit_2)], %o3
3208         tst     %o3
3209         bz,pn   %icc, .xcopyin_small            ! if zero, disable HW copy
3210           cmp   %o2, %o3                        ! if length <= limit
3211         bleu,pt %ncc, .xcopyin_small            ! go to small copy
3212           nop
3213         ba,pt   %ncc, .xcopyin_more             ! otherwise go to large copy
3214           nop
3215 .xcopyin_4:
3216         ! already checked longword, must be word aligned
3217         sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
3218         ld      [%o3 + %lo(hw_copy_limit_4)], %o3
3219         tst     %o3
3220         bz,pn   %icc, .xcopyin_small            ! if zero, disable HW copy
3221           cmp   %o2, %o3                        ! if length <= limit
3222         bleu,pt %ncc, .xcopyin_small            ! go to small copy
3223           nop
3224         ba,pt   %ncc, .xcopyin_more             ! otherwise go to large copy
3225           nop
3226 .xcopyin_8:
3227         sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
3228         ld      [%o3 + %lo(hw_copy_limit_8)], %o3
3229         tst     %o3
3230         bz,pn   %icc, .xcopyin_small            ! if zero, disable HW copy
3231           cmp   %o2, %o3                        ! if length <= limit
3232         bleu,pt %ncc, .xcopyin_small            ! go to small copy
3233           nop
3234         ba,pt   %ncc, .xcopyin_more             ! otherwise go to large copy
3235           nop
3236
             ! Leaf path: old t_lofault is kept in %o4, .sm_xcopyin_err is
             ! installed, and the copy itself is done by .sm_do_copyin
             ! (defined with copyin, earlier in this file).
3237 .xcopyin_small:
3238         sethi   %hi(.sm_xcopyin_err), %o5  ! .sm_xcopyin_err is lofault value
3239         or      %o5, %lo(.sm_xcopyin_err), %o5
3240         ldn     [THREAD_REG + T_LOFAULT], %o4   ! set/save t_lofault
3241         membar  #Sync                           ! sync error barrier
3242         ba,pt   %ncc, .sm_do_copyin             ! common code
3243           stn   %o5, [THREAD_REG + T_LOFAULT]
3244
             ! Large path: new window, .xcopyin_err goes in REAL_LOFAULT,
             ! then join the shared FP copy code at .do_copyin.
3245 .xcopyin_more:
3246         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3247         sethi   %hi(.xcopyin_err), REAL_LOFAULT ! .xcopyin_err is lofault value
3248         ba,pt   %ncc, .do_copyin
3249           or    REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
3250
3251 /*
3252  * We got here because of fault during xcopyin
3253  * Errno value is in ERRNO
3254  */
3255 .xcopyin_err:
3256         ldn     [THREAD_REG + T_COPYOPS], %o4   ! check for copyop handler
3257         tst     %o4
3258         bz,pt   %ncc, 2f                        ! if not, return error
3259           nop
3260         ldn     [%o4 + CP_XCOPYIN], %g2         ! if handler, invoke it with
3261         jmp     %g2                             ! original arguments
3262           restore %g0, 0, %g0                   ! dispose of copy window
3263 2:
3264         ret
3265           restore ERRNO, 0, %o0                 ! return errno value
3266
             ! Fault during the leaf copy: put back the old t_lofault and the
             ! original arguments, then either hand off to the thread's
             ! CP_XCOPYIN copyops handler or return the value in %g1.
3267 .sm_xcopyin_err:
3268
3269         membar  #Sync
3270         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
3271         mov     SM_SAVE_SRC, %o0
3272         mov     SM_SAVE_DST, %o1
3273         mov     SM_SAVE_COUNT, %o2
3274         ldn     [THREAD_REG + T_COPYOPS], %o3   ! check for copyop handler
3275         tst     %o3
3276         bz,pt   %ncc, 3f                        ! if not, return error
3277           nop
3278         ldn     [%o3 + CP_XCOPYIN], %o5         ! if handler, invoke it with
3279         jmp     %o5                             ! original arguments
3280           nop
3281 3:
3282         retl
3283           or    %g1, 0, %o0             ! return errno value (taken from %g1)
3284
3285         SET_SIZE(xcopyin)
3286
3287 #endif  /* lint */
3288 
3289 #ifdef  lint
3290
     /* Lint-only C stub; the real xcopyin_little is the assembly below. */
3291 /*ARGSUSED*/
3292 int
3293 xcopyin_little(const void *uaddr, void *kaddr, size_t count)
3294 { return (0); }
3295
3296 #else   /* lint */
3297
3298         ENTRY(xcopyin_little)
             !
             ! xcopyin_little(uaddr = %o0, kaddr = %o1, count = %o2)
             !
             ! Byte-at-a-time copy that reverses byte order: the source is
             ! read through ASI_AIUSL starting at its last byte and walking
             ! down, while the destination is written from its first byte
             ! up, so kaddr[i] receives uaddr[count - 1 - i].
             ! Returns 0 on success; on a fault .xcopyio_err returns the
             ! value found in %g1.  The old t_lofault is kept in %o5.
             !
             ! %o3 runs from -count up to 0 and indexes both buffers; %o0 is
             ! dropped by 2 each pass so that %o0 + %o3 moves down by one.
             !
3299         sethi   %hi(.xcopyio_err), %o5
3300         or      %o5, %lo(.xcopyio_err), %o5
3301         ldn     [THREAD_REG + T_LOFAULT], %o4
3302         membar  #Sync                           ! sync error barrier
3303         stn     %o5, [THREAD_REG + T_LOFAULT]
3304         mov     %o4, %o5                ! %o5 = previous t_lofault
3305
3306         subcc   %g0, %o2, %o3           ! %o3 = -count
3307         add     %o0, %o2, %o0
3308         bz,pn   %ncc, 2f                ! check for zero bytes
3309           sub   %o2, 1, %o4
3310         add     %o0, %o4, %o0           ! start w/last byte
3311         add     %o1, %o2, %o1           ! %o1 + %o3 = first dst byte
3312         lduba   [%o0 + %o3]ASI_AIUSL, %o4
3313
3314 1:      stb     %o4, [%o1 + %o3]
3315         inccc   %o3                     ! carry is set when %o3 reaches 0
3316         sub     %o0, 2, %o0             ! get next byte
3317         bcc,a,pt %ncc, 1b
3318           lduba [%o0 + %o3]ASI_AIUSL, %o4       ! annulled on loop exit
3319
3320 2:
3321         membar  #Sync                           ! sync error barrier
3322         stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
3323         retl
3324           mov   %g0, %o0                ! return (0)
3325
3326 .xcopyio_err:
3327         membar  #Sync                           ! sync error barrier
3328         stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
3329         retl
3330           mov   %g1, %o0                ! return the value left in %g1
3331
3332         SET_SIZE(xcopyin_little)
3333
3334 #endif  /* lint */
3335 
3336 
3337 /*
3338  * Copy a block of storage - must not overlap (from + len <= to).
3339  * No fault handler installed (to be called under on_fault())
3340  */
3341 #if defined(lint)
3342
     /* Lint-only C stub; the real copyin_noerr is the assembly below. */
3343 /* ARGSUSED */
3344 void
3345 copyin_noerr(const void *ufrom, void *kto, size_t count)
3346 {}
3347
3348 #else   /* lint */
3349         ENTRY(copyin_noerr)
             !
             ! copyin_noerr(ufrom = %o0, kto = %o1, count = %o2)
             !
             ! Same small/large selection as xcopyin: counts at or below
             ! VIS_COPY_THRESHOLD, a zero hw_copy_limit_N, or a count at or
             ! below that limit use the leaf copy; anything else uses the
             ! FP block copy at .do_copyin.  No error value is produced
             ! here: on a fault control passes to the t_lofault handler
             ! that was in place when this routine was entered.
             !
3350
3351         cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
3352         bleu,pt %ncc, .copyin_ne_small          ! small: use leaf routine
3353           xor   %o0, %o1, %o3                   ! are src, dst alignable?
3354         btst    7, %o3                          !
3355         bz,pt   %ncc, .copyin_ne_8              ! check for longword alignment
3356           nop
3357         btst    1, %o3                          !
3358         bz,pt   %ncc, .copyin_ne_2              ! check for half-word
3359           nop
3360         sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
3361         ld      [%o3 + %lo(hw_copy_limit_1)], %o3
3362         tst     %o3
3363         bz,pn   %icc, .copyin_ne_small          ! if zero, disable HW copy
3364           cmp   %o2, %o3                        ! if length <= limit
3365         bleu,pt %ncc, .copyin_ne_small          ! go to small copy
3366           nop
3367         ba,pt   %ncc, .copyin_noerr_more        ! otherwise go to large copy
3368           nop
3369 .copyin_ne_2:
3370         btst    3, %o3                          !
3371         bz,pt   %ncc, .copyin_ne_4              ! check for word alignment
3372           nop
3373         sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
3374         ld      [%o3 + %lo(hw_copy_limit_2)], %o3
3375         tst     %o3
3376         bz,pn   %icc, .copyin_ne_small          ! if zero, disable HW copy
3377           cmp   %o2, %o3                        ! if length <= limit
3378         bleu,pt %ncc, .copyin_ne_small          ! go to small copy
3379           nop
3380         ba,pt   %ncc, .copyin_noerr_more        ! otherwise go to large copy
3381           nop
3382 .copyin_ne_4:
3383         ! already checked longword, must be word aligned
3384         sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
3385         ld      [%o3 + %lo(hw_copy_limit_4)], %o3
3386         tst     %o3
3387         bz,pn   %icc, .copyin_ne_small          ! if zero, disable HW copy
3388           cmp   %o2, %o3                        ! if length <= limit
3389         bleu,pt %ncc, .copyin_ne_small          ! go to small copy
3390           nop
3391         ba,pt   %ncc, .copyin_noerr_more        ! otherwise go to large copy
3392           nop
3393 .copyin_ne_8:
3394         sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
3395         ld      [%o3 + %lo(hw_copy_limit_8)], %o3
3396         tst     %o3
3397         bz,pn   %icc, .copyin_ne_small          ! if zero, disable HW copy
3398           cmp   %o2, %o3                        ! if length <= limit
3399         bleu,pt %ncc, .copyin_ne_small          ! go to small copy
3400           nop
3401         ba,pt   %ncc, .copyin_noerr_more        ! otherwise go to large copy
3402           nop
3403
             ! Leaf path.  %o4 = current t_lofault.  If none is set, go
             ! straight to the common copy code without installing one;
             ! otherwise install .sm_copyio_noerr, which on a fault puts
             ! the saved handler back and jumps to it.
3404 .copyin_ne_small:
3405         ldn     [THREAD_REG + T_LOFAULT], %o4
3406         tst     %o4
3407         bz,pn   %ncc, .sm_do_copyin
3408           nop
3409         sethi   %hi(.sm_copyio_noerr), %o5
3410         or      %o5, %lo(.sm_copyio_noerr), %o5
3411         membar  #Sync                           ! sync error barrier
3412         ba,pt   %ncc, .sm_do_copyin
3413           stn   %o5, [THREAD_REG + T_LOFAULT]   ! set/save t_lofault
3414
             ! Large path: new window, .copyio_noerr goes in REAL_LOFAULT,
             ! then join the shared FP copy code at .do_copyin.
3415 .copyin_noerr_more:
3416         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3417         sethi   %hi(.copyio_noerr), REAL_LOFAULT
3418         ba,pt   %ncc, .do_copyin
3419           or    REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3420
             ! Fault on the large path: jump to the address held in %l6
             ! (.do_copyin loads the previous t_lofault there) and drop the
             ! copy window in the delay slot.
3421 .copyio_noerr:
3422         jmp     %l6
3423           restore %g0,0,%g0
3424
             ! Fault on the leaf path: restore and jump to the handler
             ! saved in %o4.
3425 .sm_copyio_noerr:
3426         membar  #Sync
3427         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore t_lofault
3428         jmp     %o4
3429           nop
3430
3431         SET_SIZE(copyin_noerr)
3432 #endif /* lint */
3433 
3434 /*
3435  * Copy a block of storage - must not overlap (from + len <= to).
3436  * No fault handler installed (to be called under on_fault())
3437  */
3438 
3439 #if defined(lint)
3440
     /* Lint-only C stub; the real copyout_noerr is the assembly below. */
3441 /* ARGSUSED */
3442 void
3443 copyout_noerr(const void *kfrom, void *uto, size_t count)
3444 {}
3445
3446 #else   /* lint */
3447         ENTRY(copyout_noerr)
             !
             ! copyout_noerr(kfrom = %o0, uto = %o1, count = %o2)
             !
             ! Mirror image of copyin_noerr: the same threshold and
             ! hw_copy_limit_N tests choose between the leaf copy
             ! (.sm_do_copyout) and the FP block copy (.do_copyout); both
             ! targets are defined outside this section.  The fault labels
             ! .sm_copyio_noerr and .copyio_noerr are shared with
             ! copyin_noerr.
             !
3448
3449         cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
3450         bleu,pt %ncc, .copyout_ne_small         ! small: use leaf routine
3451           xor   %o0, %o1, %o3                   ! are src, dst alignable?
3452         btst    7, %o3                          !
3453         bz,pt   %ncc, .copyout_ne_8             ! check for longword alignment
3454           nop
3455         btst    1, %o3                          !
3456         bz,pt   %ncc, .copyout_ne_2             ! check for half-word
3457           nop
3458         sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
3459         ld      [%o3 + %lo(hw_copy_limit_1)], %o3
3460         tst     %o3
3461         bz,pn   %icc, .copyout_ne_small         ! if zero, disable HW copy
3462           cmp   %o2, %o3                        ! if length <= limit
3463         bleu,pt %ncc, .copyout_ne_small         ! go to small copy
3464           nop
3465         ba,pt   %ncc, .copyout_noerr_more       ! otherwise go to large copy
3466           nop
3467 .copyout_ne_2:
3468         btst    3, %o3                          !
3469         bz,pt   %ncc, .copyout_ne_4             ! check for word alignment
3470           nop
3471         sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
3472         ld      [%o3 + %lo(hw_copy_limit_2)], %o3
3473         tst     %o3
3474         bz,pn   %icc, .copyout_ne_small         ! if zero, disable HW copy
3475           cmp   %o2, %o3                        ! if length <= limit
3476         bleu,pt %ncc, .copyout_ne_small         ! go to small copy
3477           nop
3478         ba,pt   %ncc, .copyout_noerr_more       ! otherwise go to large copy
3479           nop
3480 .copyout_ne_4:
3481         ! already checked longword, must be word aligned
3482         sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
3483         ld      [%o3 + %lo(hw_copy_limit_4)], %o3
3484         tst     %o3
3485         bz,pn   %icc, .copyout_ne_small         ! if zero, disable HW copy
3486           cmp   %o2, %o3                        ! if length <= limit
3487         bleu,pt %ncc, .copyout_ne_small         ! go to small copy
3488           nop
3489         ba,pt   %ncc, .copyout_noerr_more       ! otherwise go to large copy
3490           nop
3491 .copyout_ne_8:
3492         sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
3493         ld      [%o3 + %lo(hw_copy_limit_8)], %o3
3494         tst     %o3
3495         bz,pn   %icc, .copyout_ne_small         ! if zero, disable HW copy
3496           cmp   %o2, %o3                        ! if length <= limit
3497         bleu,pt %ncc, .copyout_ne_small         ! go to small copy
3498           nop
3499         ba,pt   %ncc, .copyout_noerr_more       ! otherwise go to large copy
3500           nop
3501
             ! Leaf path.  %o4 = current t_lofault.  If none is set, go
             ! straight to the common copy code; otherwise install
             ! .sm_copyio_noerr first (store is in the branch delay slot).
3502 .copyout_ne_small:
3503         ldn     [THREAD_REG + T_LOFAULT], %o4
3504         tst     %o4
3505         bz,pn   %ncc, .sm_do_copyout
3506           nop
3507         sethi   %hi(.sm_copyio_noerr), %o5
3508         or      %o5, %lo(.sm_copyio_noerr), %o5
3509         membar  #Sync                           ! sync error barrier
3510         ba,pt   %ncc, .sm_do_copyout
3511         stn     %o5, [THREAD_REG + T_LOFAULT]   ! set/save t_lofault
3512
             ! Large path: new window, .copyio_noerr goes in REAL_LOFAULT,
             ! then join the shared FP copy code at .do_copyout.
3513 .copyout_noerr_more:
3514         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3515         sethi   %hi(.copyio_noerr), REAL_LOFAULT
3516         ba,pt   %ncc, .do_copyout
3517           or    REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3518
3519         SET_SIZE(copyout_noerr)
3520 #endif /* lint */
3521 
3522 
3523 /*
3524  * hwblkclr - clears block-aligned, block-multiple-sized regions that are
3525  * at least 256 bytes in length using spitfire's block stores.  If
3526  * the criteria for using this routine are not met then it calls bzero
3527  * and returns 1.  Otherwise 0 is returned indicating success.
3528  * Caller is responsible for ensuring use_hw_bzero is true and that
3529  * kpreempt_disable() has been called.
3530  */
3531 #ifdef lint
     /* Lint-only C stub; the real hwblkclr is the assembly below. */
3532 /*ARGSUSED*/
3533 int
3534 hwblkclr(void *addr, size_t len)
3535 {
3536         return(0);
3537 }
3538 #else /* lint */
3539         ! %i0 - start address
3540         ! %i1 - length of region (multiple of 64)
3541         ! %l0 - saved fprs
3542         ! %l1 - pointer to saved %d0 block
3543         ! %l2 - saved curthread->t_lwp
             !       (NOTE: %l2 is not referenced anywhere in this routine)
             ! %i3 - bytes cleared by the current pass (256, or the final
             !       partial amount rounded down to a multiple of 64)
3544
3545         ENTRY(hwblkclr)
3546         ! get another window w/space for one aligned block of saved fpregs
3547         save    %sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp
3548
3549         ! Must be block-aligned
3550         andcc   %i0, (VIS_BLOCKSIZE-1), %g0
3551         bnz,pn  %ncc, 1f
3552           nop
3553
3554         ! ... and must be 256 bytes or more
3555         cmp     %i1, 256
3556         blu,pn  %ncc, 1f
3557           nop
3558
3559         ! ... and length must be a multiple of VIS_BLOCKSIZE
3560         andcc   %i1, (VIS_BLOCKSIZE-1), %g0
3561         bz,pn   %ncc, 2f
3562           nop
3563
3564 1:      ! punt, call bzero but notify the caller that bzero was used
3565         mov     %i0, %o0
3566         call    bzero
3567         mov     %i1, %o1                ! (delay slot) second bzero argument
3568         ret
3569           restore       %g0, 1, %o0 ! return (1) - did not use block operations
3570
3571 2:      rd      %fprs, %l0              ! check for unused fp
3572         btst    FPRS_FEF, %l0
3573         bz,pt   %icc, 1f                ! FP not enabled: nothing to save
3574           nop
3575
3576         ! save in-use fpregs on stack
3577         membar  #Sync
3578         add     %fp, STACK_BIAS - 65, %l1
3579         and     %l1, -VIS_BLOCKSIZE, %l1        ! %l1 = block-aligned save area
3580         stda    %d0, [%l1]ASI_BLK_P
3581
3582 1:      membar  #StoreStore|#StoreLoad|#LoadStore
3583         wr      %g0, FPRS_FEF, %fprs
3584         wr      %g0, ASI_BLK_P, %asi
3585
3586         ! Clear block
3587         fzero   %d0
3588         fzero   %d2
3589         fzero   %d4
3590         fzero   %d6
3591         fzero   %d8
3592         fzero   %d10
3593         fzero   %d12
3594         fzero   %d14
3595
3596         mov     256, %i3                ! full passes clear 256 bytes each
3597         ba,pt   %ncc, .pz_doblock
3598           nop
3599
             ! Store ladder: four block stores clear 256 bytes per pass.  The
             ! first store (offset 192) is issued from the delay slot of the
             ! branch at .pz_doblock.  For the final partial pass the code
             ! jumps into the middle of the ladder (see the jmp below).
3600 .pz_blkstart:
3601       ! stda    %d0, [%i0 + 192]%asi  ! in dly slot of branch that got us here
3602         stda    %d0, [%i0 + 128]%asi
3603         stda    %d0, [%i0 + 64]%asi
3604         stda    %d0, [%i0]%asi
3605 .pz_zinst:
3606         add     %i0, %i3, %i0           ! advance address by bytes just cleared
3607         sub     %i1, %i3, %i1           ! and reduce the remaining length
3608 .pz_doblock:
3609         cmp     %i1, 256
3610         bgeu,a  %ncc, .pz_blkstart
3611           stda  %d0, [%i0 + 192]%asi    ! annulled unless the branch is taken
3612
3613         cmp     %i1, 64
3614         blu     %ncc, .pz_finish        ! less than one block left: done
3615
3616           andn  %i1, (64-1), %i3        ! (delay slot) %i3 = whole-block bytes left
             ! %i3 / 16 = 4 bytes (one stda instruction) per 64-byte block,
             ! so jumping to .pz_zinst - %i2 runs exactly the stores needed.
3617         srl     %i3, 4, %i2             ! using blocks, 1 instr / 16 words
3618         set     .pz_zinst, %i4
3619         sub     %i4, %i2, %i4
3620         jmp     %i4
3621           nop
3622
3623 .pz_finish:
3624         membar  #Sync
3625         btst    FPRS_FEF, %l0
3626         bz,a    .pz_finished            ! FP was not in use: just restore fprs
3627           wr    %l0, 0, %fprs           ! restore fprs
3628
3629         ! restore fpregs from stack
3630         ldda    [%l1]ASI_BLK_P, %d0
3631         membar  #Sync
3632         wr      %l0, 0, %fprs           ! restore fprs
3633
3634 .pz_finished:
3635         ret
3636           restore       %g0, 0, %o0             ! return (0) - block stores were used
3637
3638         SET_SIZE(hwblkclr)
3639 #endif  /* lint */
3640 
3641 #ifdef lint
     /* Lint-only C stub; the real hw_pa_bcopy32 is the assembly below. */
3642 /*ARGSUSED*/
3643 void
3644 hw_pa_bcopy32(uint64_t src, uint64_t dst)
3645 {}
3646 #else /*!lint */
3647         /*
3648          * Copy 32 bytes of data from src (%o0) to dst (%o1)
3649          * using physical addresses.
3650          *
3651          * Interrupts are disabled (PSTATE_IE cleared) for the duration
3652          * of the copy and the original %pstate is written back in the
3653          * delay slot of the return.  All four 8-byte loads complete
3654          * (membar #Sync) before the first store is issued.
3655          * Uses %g1, %g2 and %o2-%o5 as scratch; %o0 and %o1 are advanced.
3656          */
3651         ENTRY_NP(hw_pa_bcopy32)
3652         rdpr    %pstate, %g1            ! %g1 = original pstate
3653         andn    %g1, PSTATE_IE, %g2
3654         wrpr    %g0, %g2, %pstate       ! interrupts off
3655
3656         rdpr    %pstate, %g0            ! result discarded; presumably forces
                                             ! the pstate write to take effect
3657         ldxa    [%o0]ASI_MEM, %o2       ! load 4 x 8 bytes from src
3658         add     %o0, 8, %o0
3659         ldxa    [%o0]ASI_MEM, %o3
3660         add     %o0, 8, %o0
3661         ldxa    [%o0]ASI_MEM, %o4
3662         add     %o0, 8, %o0
3663         ldxa    [%o0]ASI_MEM, %o5
3664         membar  #Sync
3665
3666         stxa    %o2, [%o1]ASI_MEM       ! store 4 x 8 bytes to dst
3667         add     %o1, 8, %o1
3668         stxa    %o3, [%o1]ASI_MEM
3669         add     %o1, 8, %o1
3670         stxa    %o4, [%o1]ASI_MEM
3671         add     %o1, 8, %o1
3672         stxa    %o5, [%o1]ASI_MEM
3673
3674         retl
3675           wrpr    %g0, %g1, %pstate     ! restore original pstate
3676
3677         SET_SIZE(hw_pa_bcopy32)
3678
3679 #endif /* lint */
3680 
3681 #if defined(lint)
3682
     /*
      * Lint-only C definitions of the tunables below.  The copy routines
      * in this file treat a hw_copy_limit_N of zero as "HW copy disabled"
      * for that alignment class.
      */
3683 int use_hw_bcopy = 1;
3684 int use_hw_bzero = 1;
3685 uint_t hw_copy_limit_1 = 0;
3686 uint_t hw_copy_limit_2 = 0;
3687 uint_t hw_copy_limit_4 = 0;
3688 uint_t hw_copy_limit_8 = 0;
3689
3690 #else /* !lint */
3691
             /*
              * Global tunables.  use_hw_bcopy and use_hw_bzero default to 1.
              * hw_copy_limit_N is the byte count above which the FP block
              * copy is used when src and dst can be mutually aligned to N
              * bytes; the routines above treat zero as "HW copy disabled".
              * The limits default to zero here and are presumably set to
              * their working values elsewhere (not visible in this file
              * section).
              */
3692         DGDEF(use_hw_bcopy)
3693         .word   1
3694         DGDEF(use_hw_bzero)
3695         .word   1
3696         DGDEF(hw_copy_limit_1)
3697         .word   0
3698         DGDEF(hw_copy_limit_2)
3699         .word   0
3700         DGDEF(hw_copy_limit_4)
3701         .word   0
3702         DGDEF(hw_copy_limit_8)
3703         .word   0
3704
3705         .align  64
3706         .section ".text"
3707 #endif /* !lint */