1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 #include <sys/param.h>
  27 #include <sys/errno.h>
  28 #include <sys/asm_linkage.h>
  29 #include <sys/vtrace.h>
  30 #include <sys/machthread.h>
  31 #include <sys/clock.h>
  32 #include <sys/asi.h>
  33 #include <sys/fsr.h>
  34 #include <sys/privregs.h>
  35 
  36 #include "assym.h"
  37 
  38 /*
  39  * Pseudo-code to aid in understanding the control flow of the
  40  * bcopy/copyin/copyout routines.
  41  *
  42  * On entry:
  43  *
  44  *      ! Determine whether to use the FP register version
  45  *      ! or the leaf routine version depending on size
  46  *      ! of copy and flags.  Set up error handling accordingly.
  47  *      ! The transition point depends on whether the src and
  48  *      ! dst addresses can be aligned to long word, word,
  49  *      ! half word, or byte boundaries.
  50  *      !
  51  *      ! WARNING: <Register usage convention>
  52  *      ! For FP version, %l6 holds previous error handling and
  53  *      ! a flag: TRAMP_FLAG (low bits)
  54  *      ! for leaf routine version, %o4 holds those values.
  55  *      ! So either %l6 or %o4 is reserved and not available for
  56  *      ! any other use.
  57  *
  58  *      if (length <= VIS_COPY_THRESHOLD)    ! start with a quick test
  59  *              go to small_copy;               ! to speed short copies
  60  *
 *      if (src,dst long word alignable) {
  62  *              if (hw_copy_limit_8 == 0)       ! hw_copy disabled
  63  *                      go to small_copy;
  64  *              if (length <= hw_copy_limit_8)
  65  *                      go to small_copy;
  66  *              go to FPBLK_copy;
  67  *      }
  68  *      if (src,dst not alignable) {
  69  *              if (hw_copy_limit_1 == 0)       ! hw_copy disabled
  70  *                      go to small_copy;
  71  *              if (length <= hw_copy_limit_1)
  72  *                      go to small_copy;
  73  *              go to FPBLK_copy;
  74  *      }
  75  *      if (src,dst halfword alignable) {
  76  *              if (hw_copy_limit_2 == 0)       ! hw_copy disabled
  77  *                      go to small_copy;
  78  *              if (length <= hw_copy_limit_2)
  79  *                      go to small_copy;
  80  *              go to FPBLK_copy;
  81  *      }
  82  *      if (src,dst word alignable) {
  83  *              if (hw_copy_limit_4 == 0)       ! hw_copy disabled
  84  *                      go to small_copy;
  85  *              if (length <= hw_copy_limit_4)
  86  *                      go to small_copy;
  87  *              go to FPBLK_copy;
  88  *      }
  89  *
  90  * small_copy:
  91  *      Setup_leaf_rtn_error_handler;           ! diffs for each entry point
  92  *
  93  *      if (count <= 3)                              ! fast path for tiny copies
  94  *              go to sm_left;                  ! special finish up code
  95  *      else
  96  *              if (count > CHKSIZE)         ! medium sized copies
  97  *                      go to sm_med            ! tuned by alignment
  98  *              if(src&dst not both word aligned) {
  99  *      sm_movebytes:
 100  *                      move byte by byte in 4-way unrolled loop
 101  *                      fall into sm_left;
 102  *      sm_left:
 103  *                      move 0-3 bytes byte at a time as needed.
 104  *                      restore error handler and exit.
 105  *
 106  *              } else {        ! src&dst are word aligned
 107  *                      check for at least 8 bytes left,
 108  *                      move word at a time, unrolled by 2
 109  *                      when fewer than 8 bytes left,
 110  *      sm_half:        move half word at a time while 2 or more bytes left
 111  *      sm_byte:        move final byte if necessary
 112  *      sm_exit:
 113  *                      restore error handler and exit.
 114  *              }
 115  *
 116  * ! Medium length cases with at least CHKSIZE bytes available
 117  * ! method: line up src and dst as best possible, then
 118  * ! move data in 4-way unrolled loops.
 119  *
 120  * sm_med:
 121  *      if(src&dst unalignable)
 122  *              go to sm_movebytes
 123  *      if(src&dst halfword alignable)
 124  *              go to sm_movehalf
 125  *      if(src&dst word alignable)
 126  *              go to sm_moveword
 127  * ! fall into long word movement
 128  *      move bytes until src is word aligned
 129  *      if not long word aligned, move a word
 130  *      move long words in 4-way unrolled loop until < 32 bytes left
 131  *      move long words in 1-way unrolled loop until < 8 bytes left
 132  *      if zero bytes left, goto sm_exit
 133  *      if one byte left, go to sm_byte
 134  *      else go to sm_half
 135  *
 136  * sm_moveword:
 137  *      move bytes until src is word aligned
 138  *      move words in 4-way unrolled loop until < 16 bytes left
 139  *      move words in 1-way unrolled loop until < 4 bytes left
 140  *      if zero bytes left, goto sm_exit
 141  *      if one byte left, go to sm_byte
 142  *      else go to sm_half
 143  *
 144  * sm_movehalf:
 145  *      move a byte if needed to align src on halfword
 146  *      move halfwords in 4-way unrolled loop until < 8 bytes left
 147  *      if zero bytes left, goto sm_exit
 148  *      if one byte left, go to sm_byte
 149  *      else go to sm_half
 150  *
 151  *
 152  * FPBLK_copy:
 153  *      %l6 = curthread->t_lofault;
 154  *      if (%l6 != NULL) {
 155  *              membar #Sync
 156  *              curthread->t_lofault = .copyerr;
 157  *              caller_error_handler = TRUE             ! %l6 |= 2
 158  *      }
 159  *
 160  *      ! for FPU testing we must not migrate cpus
 161  *      if (curthread->t_lwp == NULL) {
 162  *              ! Kernel threads do not have pcb's in which to store
 163  *              ! the floating point state, so disallow preemption during
 164  *              ! the copy.  This also prevents cpu migration.
 165  *              kpreempt_disable(curthread);
 166  *      } else {
 167  *              thread_nomigrate();
 168  *      }
 169  *
 170  *      old_fprs = %fprs;
 171  *      old_gsr = %gsr;
 172  *      if (%fprs.fef) {
 173  *              %fprs.fef = 1;
 174  *              save current fpregs on stack using blockstore
 175  *      } else {
 176  *              %fprs.fef = 1;
 177  *      }
 178  *
 179  *
 180  *      do_blockcopy_here;
 181  *
 182  * In lofault handler:
 183  *      curthread->t_lofault = .copyerr2;
 184  *      Continue on with the normal exit handler
 185  *
 186  * On normal exit:
 187  *      %gsr = old_gsr;
 188  *      if (old_fprs & FPRS_FEF)
 189  *              restore fpregs from stack using blockload
 190  *      else
 191  *              zero fpregs
 192  *      %fprs = old_fprs;
 193  *      membar #Sync
 194  *      curthread->t_lofault = (%l6 & ~3);
 195  *      ! following test omitted from copyin/copyout as they
 196  *      ! will always have a current thread
 197  *      if (curthread->t_lwp == NULL)
 198  *              kpreempt_enable(curthread);
 199  *      else
 200  *              thread_allowmigrate();
 201  *      return (0)
 202  *
 203  * In second lofault handler (.copyerr2):
 204  *      We've tried to restore fp state from the stack and failed.  To
 205  *      prevent from returning with a corrupted fp state, we will panic.
 206  */
 207 
 208 /*
 209  * Comments about optimization choices
 210  *
 211  * The initial optimization decision in this code is to determine
 212  * whether to use the FP registers for a copy or not.  If we don't
 213  * use the FP registers, we can execute the copy as a leaf routine,
 214  * saving a register save and restore.  Also, less elaborate setup
 215  * is required, allowing short copies to be completed more quickly.
 216  * For longer copies, especially unaligned ones (where the src and
 217  * dst do not align to allow simple ldx,stx operation), the FP
 218  * registers allow much faster copy operations.
 219  *
 220  * The estimated extra cost of the FP path will vary depending on
 221  * src/dst alignment, dst offset from the next 64 byte FPblock store
 222  * boundary, remaining src data after the last full dst cache line is
 223  * moved whether the FP registers need to be saved, and some other
 224  * minor issues.  The average additional overhead is estimated to be
 225  * 400 clocks.  Since each non-repeated/predicted tst and branch costs
 * around 10 clocks, elaborate calculation would slow down all
 227  * longer copies and only benefit a small portion of medium sized
 228  * copies.  Rather than incur such cost, we chose fixed transition
 229  * points for each of the alignment choices.
 230  *
 231  * For the inner loop, here is a comparison of the per cache line
 232  * costs for each alignment when src&dst are in cache:
 233  *
 234  * byte aligned:  108 clocks slower for non-FPBLK
 235  * half aligned:   44 clocks slower for non-FPBLK
 236  * word aligned:   12 clocks slower for non-FPBLK
 237  * long aligned:    4 clocks >>faster<< for non-FPBLK
 238  *
 239  * The long aligned loop runs faster because it does no prefetching.
 240  * That wins if the data is not in cache or there is too little
 241  * data to gain much benefit from prefetching.  But when there
 242  * is more data and that data is not in cache, failing to prefetch
 243  * can run much slower.  In addition, there is a 2 Kbyte store queue
 244  * which will cause the non-FPBLK inner loop to slow for larger copies.
 245  * The exact tradeoff is strongly load and application dependent, with
 246  * increasing risk of a customer visible performance regression if the
 247  * non-FPBLK code is used for larger copies. Studies of synthetic in-cache
 248  * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
 249  * upper limit for the non-FPBLK code.  To minimize performance regression
 250  * risk while still gaining the primary benefits of the improvements to
 251  * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
 252  * hw_copy_limit_*.  Later experimental studies using different values
 253  * of hw_copy_limit_* can be used to make further adjustments if
 254  * appropriate.
 255  *
 256  * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
 257  * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
 258  * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
 259  * hw_copy_limit_8 = src and dst are longword aligned
 260  *
 261  * To say that src and dst are word aligned means that after
 262  * some initial alignment activity of moving 0 to 3 bytes,
 263  * both the src and dst will be on word boundaries so that
 264  * word loads and stores may be used.
 265  *
 * Default values as of May 2005 are:
 267  * hw_copy_limit_1 =  256
 268  * hw_copy_limit_2 =  512
 269  * hw_copy_limit_4 = 1024
 270  * hw_copy_limit_8 = 1024 (or 1536 on some systems)
 271  *
 272  *
 273  * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
 274  * disabled for that alignment choice.
 275  * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256)
 276  * the value of VIS_COPY_THRESHOLD is used.
 277  * It is not envisioned that hw_copy_limit_? will be changed in the field
 278  * It is provided to allow for disabling FPBLK copies and to allow
 279  * easy testing of alternate values on future HW implementations
 280  * that might have different cache sizes, clock rates or instruction
 281  * timing rules.
 282  *
 283  * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
 284  * threshold to speedup all shorter copies (less than 256).  That
 285  * saves an alignment test, memory reference, and enabling test
 286  * for all short copies, or an estimated 24 clocks.
 287  *
 288  * The order in which these limits are checked does matter since each
 289  * non-predicted tst and branch costs around 10 clocks.
 290  * If src and dst are randomly selected addresses,
 291  * 4 of 8 will not be alignable.
 292  * 2 of 8 will be half word alignable.
 293  * 1 of 8 will be word alignable.
 294  * 1 of 8 will be long word alignable.
 295  * But, tests on running kernels show that src and dst to copy code
 296  * are typically not on random alignments.  Structure copies and
 297  * copies of larger data sizes are often on long word boundaries.
 298  * So we test the long word alignment case first, then
 299  * the byte alignment, then halfword, then word alignment.
 300  *
 301  * Several times, tests for length are made to split the code
 302  * into subcases.  These tests often allow later tests to be
 303  * avoided.  For example, within the non-FPBLK copy, we first
 304  * check for tiny copies of 3 bytes or less.  That allows us
 305  * to use a 4-way unrolled loop for the general byte copy case
 306  * without a test on loop entry.
 307  * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
 308  * vs longer cases.  For the really short case, we don't attempt
 309  * align src and dst.  We try to minimize special case tests in
 310  * the shortest loops as each test adds a significant percentage
 311  * to the total time.
 312  *
 313  * For the medium sized cases, we allow ourselves to adjust the
 314  * src and dst alignment and provide special cases for each of
 315  * the four adjusted alignment cases. The CHKSIZE that was used
 316  * to decide between short and medium size was chosen to be 39
 317  * as that allows for the worst case of 7 bytes of alignment
 318  * shift and 4 times 8 bytes for the first long word unrolling.
 319  * That knowledge saves an initial test for length on entry into
 320  * the medium cases.  If the general loop unrolling factor were
 * to be increased, this number would also need to be adjusted.
 322  *
 323  * For all cases in the non-FPBLK code where it is known that at
 324  * least 4 chunks of data are available for movement, the
 325  * loop is unrolled by four.  This 4-way loop runs in 8 clocks
 326  * or 2 clocks per data element.
 327  *
 328  * Instruction alignment is forced by used of .align 16 directives
 329  * and nops which are not executed in the code.  This
 330  * combination of operations shifts the alignment of following
 331  * loops to insure that loops are aligned so that their instructions
 332  * fall within the minimum number of 4 instruction fetch groups.
 333  * If instructions are inserted or removed between the .align
 334  * instruction and the unrolled loops, then the alignment needs
 335  * to be readjusted.  Misaligned loops can add a clock per loop
 336  * iteration to the loop timing.
 337  *
 338  * In a few cases, code is duplicated to avoid a branch.  Since
 339  * a non-predicted tst and branch takes 10 clocks, this savings
 340  * is judged an appropriate time-space tradeoff.
 341  *
 342  * Within the FPBLK-code, the prefetch method in the inner
 343  * loop needs to be explained as it is not standard.  Two
 344  * prefetches are issued for each cache line instead of one.
 345  * The primary one is at the maximum reach of 8 cache lines.
 346  * Most of the time, that maximum prefetch reach gives the
 347  * cache line more time to reach the processor for systems with
 348  * higher processor clocks.  But, sometimes memory interference
 349  * can cause that prefetch to be dropped.  Putting a second
 350  * prefetch at a reach of 5 cache lines catches the drops
 351  * three iterations later and shows a measured improvement
 352  * in performance over any similar loop with a single prefetch.
 353  * The prefetches are placed in the loop so they overlap with
 354  * non-memory instructions, so that there is no extra cost
 355  * when the data is already in-cache.
 356  *
 357  */
 358 
 359 /*
 360  * Notes on preserving existing fp state and on membars.
 361  *
 362  * When a copyOP decides to use fp we may have to preserve existing
 363  * floating point state.  It is not the caller's state that we need to
 364  * preserve - the rest of the kernel does not use fp and, anyway, fp
 365  * registers are volatile across a call.  Some examples:
 366  *
 367  *      - userland has fp state and is interrupted (device interrupt
 368  *        or trap) and within the interrupt/trap handling we use
 369  *        bcopy()
 370  *      - another (higher level) interrupt or trap handler uses bcopy
 371  *        while a bcopy from an earlier interrupt is still active
 372  *      - an asynchronous error trap occurs while fp state exists (in
 373  *        userland or in kernel copy) and the tl0 component of the handling
 374  *        uses bcopy
 375  *      - a user process with fp state incurs a copy-on-write fault and
 376  *        hwblkpagecopy always uses fp
 377  *
 378  * We therefore need a per-call place in which to preserve fp state -
 379  * using our stack is ideal (and since fp copy cannot be leaf optimized
 380  * because of calls it makes, this is no hardship).
 381  *
 * When we have finished fp copy (with its repeated block stores)
 383  * we must membar #Sync so that our block stores may complete before
 384  * we either restore the original fp state into the fp registers or
 385  * return to a caller which may initiate other fp operations that could
 386  * modify the fp regs we used before the block stores complete.
 387  *
 388  * Synchronous faults (eg, unresolvable DMMU miss) that occur while
 389  * t_lofault is not NULL will not panic but will instead trampoline
 390  * to the registered lofault handler.  There is no need for any
 391  * membars for these - eg, our store to t_lofault will always be visible to
 392  * ourselves and it is our cpu which will take any trap.
 393  *
 394  * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
 395  * while t_lofault is not NULL will also not panic.  Since we're copying
 396  * to or from userland the extent of the damage is known - the destination
 397  * buffer is incomplete.  So trap handlers will trampoline to the lofault
 398  * handler in this case which should take some form of error action to
 399  * avoid using the incomplete buffer.  The trap handler also flags the
 400  * fault so that later return-from-trap handling (for the trap that brought
 401  * this thread into the kernel in the first place) can notify the process
 402  * and reboot the system (or restart the service with Greenline/Contracts).
 403  *
 404  * Asynchronous faults (eg, uncorrectable ECC error from memory) can
 405  * result in deferred error traps - the trap is taken sometime after
 406  * the event and the trap PC may not be the PC of the faulting access.
 407  * Delivery of such pending traps can be forced by a membar #Sync, acting
 408  * as an "error barrier" in this role.  To accurately apply the user/kernel
 409  * separation described in the preceding paragraph we must force delivery
 410  * of deferred traps affecting kernel state before we install a lofault
 411  * handler (if we interpose a new lofault handler on an existing one there
 412  * is no need to repeat this), and we must force delivery of deferred
 413  * errors affecting the lofault-protected region before we clear t_lofault.
 414  * Failure to do so results in lost kernel state being interpreted as
 415  * affecting a copyin/copyout only, or of an error that really only
 416  * affects copy data being interpreted as losing kernel state.
 417  *
 418  * Since the copy operations may preserve and later restore floating
 419  * point state that does not belong to the caller (see examples above),
 420  * we must be careful in how we do this in order to prevent corruption
 421  * of another program.
 422  *
 423  * To make sure that floating point state is always saved and restored
 424  * correctly, the following "big rules" must be followed when the floating
 425  * point registers will be used:
 426  *
 427  * 1. %l6 always holds the caller's lofault handler.  Also in this register,
 *    the low-order bit (FPUSED_FLAG, value 1) indicates that the floating
 *    point registers are in use.  The next bit (TRAMP_FLAG, value 2)
 *    indicates that the call was to bcopy, and a lofault handler was set
 *    coming in.
 431  *
 432  * 2. The FPUSED flag indicates that all FP state has been successfully stored
 433  *    on the stack.  It should not be set until this save has been completed.
 434  *
 435  * 3. The FPUSED flag should not be cleared on exit until all FP state has
 436  *    been restored from the stack.  If an error occurs while restoring
 437  *    data from the stack, the error handler can check this flag to see if
 438  *    a restore is necessary.
 439  *
 440  * 4. Code run under the new lofault handler must be kept to a minimum.  In
 441  *    particular, any calls to FP_ALLOWMIGRATE, which could result in a call
 442  *    to kpreempt(), should not be made until after the lofault handler has
 443  *    been restored.
 444  */
 445 
 446 /*
 447  * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
 448  * to "break even" using FP/VIS-accelerated memory operations.
 449  * The FPBLK code assumes a minimum number of bytes are available
 450  * to be moved on entry.  Check that code carefully before
 451  * reducing VIS_COPY_THRESHOLD below 256.
 452  */
 453 /*
 454  * This shadows sys/machsystm.h which can't be included due to the lack of
 455  * _ASM guards in include files it references. Change it here, change it there.
 456  */
 457 #define VIS_COPY_THRESHOLD 256
 458 
 459 /*
 460  * TEST for very short copies
 461  * Be aware that the maximum unroll for the short unaligned case
 462  * is SHORTCOPY+1
 463  */
 464 #define SHORTCOPY 3
 465 #define CHKSIZE  39
 466 
 467 /*
 468  * Indicates that we're to trampoline to the error handler.
 469  * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
 470  * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
 471  */
 472 #define FPUSED_FLAG     1
 473 #define TRAMP_FLAG      2
 474 #define MASK_FLAGS      3
 475 
 476 /*
 477  * Number of outstanding prefetches.
 478  * first prefetch moves data from L2 to L1 (n_reads)
 479  * second prefetch moves data from memory to L2 (one_read)
 480  */
 481 #define OLYMPUS_C_PREFETCH      24
 482 #define OLYMPUS_C_2ND_PREFETCH  12
 483 
 484 #define VIS_BLOCKSIZE           64
 485 
 486 /*
 * Size of stack frame in order to accommodate a 64-byte aligned
 488  * floating-point register save area and 2 64-bit temp locations.
 489  * All copy functions use two quadrants of fp registers; to assure a
 490  * block-aligned two block buffer in which to save we must reserve
 * three blocks on stack.  Not all functions preserve %fprs on stack
 492  * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
 493  *
 494  *    _______________________________________ <-- %fp + STACK_BIAS
 495  *    | We may need to preserve 2 quadrants |
 496  *    | of fp regs, but since we do so with |
 497  *    | BST/BLD we need room in which to    |
 498  *    | align to VIS_BLOCKSIZE bytes.  So   |
 499  *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
 500  *    |-------------------------------------|
 501  *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
 502  *    |-------------------------------------|
 503  *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
 504  *    ---------------------------------------
 505  */
 506 #define HWCOPYFRAMESIZE         ((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
 507 #define SAVED_FPREGS_OFFSET     (VIS_BLOCKSIZE * 3)
 508 #define SAVED_FPREGS_ADJUST     ((VIS_BLOCKSIZE * 2) - 1)
 509 #define SAVED_FPRS_OFFSET       (SAVED_FPREGS_OFFSET + 8)
 510 #define SAVED_GSR_OFFSET        (SAVED_FPRS_OFFSET + 8)
 511 
 512 /*
 513  * Common macros used by the various versions of the block copy
 514  * routines in this file.
 515  */
 516 
 517 /*
 518  * In FP copies if we do not have preserved data to restore over
 519  * the fp regs we used then we must zero those regs to avoid
 520  * exposing portions of the data to later threads (data security).
 521  *
 522  * Copy functions use either quadrants 1 and 3 or 2 and 4.
 523  *
 524  * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
 525  * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
 526  *
 527  * The instructions below are quicker than repeated fzero instructions
 528  * since they can dispatch down two fp pipelines.
 529  */
/*
 * FZEROQ1Q3: zero fp quadrants 1 and 3 (%f0-%f15 and %f32-%f47).
 * %f0 is zeroed first and then used as the source of every fmovd,
 * so the moves can dispatch down both fp pipelines (see block
 * comment above).
 */
#define FZEROQ1Q3                       \
        fzero   %f0                     ;\
        fmovd   %f0, %f2                ;\
        fmovd   %f0, %f4                ;\
        fmovd   %f0, %f6                ;\
        fmovd   %f0, %f8                ;\
        fmovd   %f0, %f10               ;\
        fmovd   %f0, %f12               ;\
        fmovd   %f0, %f14               ;\
        fmovd   %f0, %f32               ;\
        fmovd   %f0, %f34               ;\
        fmovd   %f0, %f36               ;\
        fmovd   %f0, %f38               ;\
        fmovd   %f0, %f40               ;\
        fmovd   %f0, %f42               ;\
        fmovd   %f0, %f44               ;\
        fmovd   %f0, %f46
 547 
 548 #define FZEROQ2Q4                       \
 549         fzero   %f16                    ;\
 550         fmovd   %f0, %f18               ;\
 551         fmovd   %f0, %f20               ;\
 552         fmovd   %f0, %f22               ;\
 553         fmovd   %f0, %f24               ;\
 554         fmovd   %f0, %f26               ;\
 555         fmovd   %f0, %f28               ;\
 556         fmovd   %f0, %f30               ;\
 557         fmovd   %f0, %f48               ;\
 558         fmovd   %f0, %f50               ;\
 559         fmovd   %f0, %f52               ;\
 560         fmovd   %f0, %f54               ;\
 561         fmovd   %f0, %f56               ;\
 562         fmovd   %f0, %f58               ;\
 563         fmovd   %f0, %f60               ;\
 564         fmovd   %f0, %f62
 565 
 566 /*
 567  * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
 568  * Used to save and restore in-use fp registers when we want to use FP
 569  * and find fp already in use and copy size still large enough to justify
 570  * the additional overhead of this save and restore.
 571  *
 572  * A membar #Sync is needed before save to sync fp ops initiated before
 573  * the call to the copy function (by whoever has fp in use); for example
 574  * an earlier block load to the quadrant we are about to save may still be
 575  * "in flight".  A membar #Sync is required at the end of the save to
 576  * sync our block store (the copy code is about to begin ldd's to the
 577  * first quadrant).
 578  *
 579  * Similarly: a membar #Sync before restore allows the block stores of
 580  * the copy operation to complete before we fill the quadrants with their
 581  * original data, and a membar #Sync after restore lets the block loads
 582  * of the restore complete before we return to whoever has the fp regs
 583  * in use.  To avoid repeated membar #Sync we make it the responsibility
 584  * of the copy code to membar #Sync immediately after copy is complete
 585  * and before using the BLD_*_FROMSTACK macro.
 586  */
/*
 * BST_FPQ1Q3_TOSTACK: block-store %f0-%f15 and %f32-%f47 into the
 * VIS_BLOCKSIZE-aligned fp save area on the stack (see the
 * HWCOPYFRAMESIZE layout above).  Caller provides the leading
 * membar #Sync; the trailing membar #Sync lets our block stores
 * complete before the copy code begins ldd's from these registers.
 */
#define BST_FPQ1Q3_TOSTACK(tmp1)                                \
        /* membar #Sync */                                      ;\
        add     %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1     ;\
        and     tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */    ;\
        stda    %f0, [tmp1]ASI_BLK_P                            ;\
        add     tmp1, VIS_BLOCKSIZE, tmp1                       ;\
        stda    %f32, [tmp1]ASI_BLK_P                           ;\
        membar  #Sync
 595 
/*
 * BLD_FPQ1Q3_FROMSTACK: block-load %f0-%f15 and %f32-%f47 back from
 * the stack save area.  The copy code is responsible for a
 * membar #Sync immediately after the copy completes (see comment
 * above); the trailing membar #Sync lets the block loads complete
 * before whoever owns the fp regs uses them again.
 */
#define BLD_FPQ1Q3_FROMSTACK(tmp1)                              \
        /* membar #Sync - provided at copy completion */        ;\
        add     %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1     ;\
        and     tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */    ;\
        ldda    [tmp1]ASI_BLK_P, %f0                            ;\
        add     tmp1, VIS_BLOCKSIZE, tmp1                       ;\
        ldda    [tmp1]ASI_BLK_P, %f32                           ;\
        membar  #Sync
 604 
/*
 * BST_FPQ2Q4_TOSTACK: block-store %f16-%f31 and %f48-%f63 into the
 * VIS_BLOCKSIZE-aligned fp save area on the stack.  Same membar
 * protocol as BST_FPQ1Q3_TOSTACK: caller supplies the leading
 * membar #Sync; the trailing one orders our block stores before
 * the copy code's subsequent fp loads.
 */
#define BST_FPQ2Q4_TOSTACK(tmp1)                                \
        /* membar #Sync */                                      ;\
        add     %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1     ;\
        and     tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */    ;\
        stda    %f16, [tmp1]ASI_BLK_P                           ;\
        add     tmp1, VIS_BLOCKSIZE, tmp1                       ;\
        stda    %f48, [tmp1]ASI_BLK_P                           ;\
        membar  #Sync
 613 
/*
 * BLD_FPQ2Q4_FROMSTACK: block-load %f16-%f31 and %f48-%f63 back
 * from the stack save area.  Same membar protocol as
 * BLD_FPQ1Q3_FROMSTACK: the copy code membar #Sync's at copy
 * completion; the trailing membar #Sync lets the block loads
 * complete before the fp regs are used again.
 */
#define BLD_FPQ2Q4_FROMSTACK(tmp1)                              \
        /* membar #Sync - provided at copy completion */        ;\
        add     %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1     ;\
        and     tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */    ;\
        ldda    [tmp1]ASI_BLK_P, %f16                           ;\
        add     tmp1, VIS_BLOCKSIZE, tmp1                       ;\
        ldda    [tmp1]ASI_BLK_P, %f48                           ;\
        membar  #Sync
 622 
 623 /*
 624  * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
 625  * prevent preemption if there is no t_lwp to save FP state to on context
 626  * switch) before commencing a FP copy, and reallow it on completion or
 627  * in error trampoline paths when we were using FP copy.
 628  *
 629  * Both macros may call other functions, so be aware that all outputs are
 630  * forfeit after using these macros.  For this reason we do not pass registers
 631  * to use - we just use any outputs we want.
 632  *
 633  * Pseudo code:
 634  *
 635  * FP_NOMIGRATE:
 636  *
 637  * if (curthread->t_lwp) {
 638  *      thread_nomigrate();
 639  * } else {
 640  *      kpreempt_disable();
 641  * }
 642  *
 643  * FP_ALLOWMIGRATE:
 644  *
 645  * if (curthread->t_lwp) {
 646  *      thread_allowmigrate();
 647  * } else {
 648  *      kpreempt_enable();
 649  * }
 650  */
 651 
/*
 * FP_NOMIGRATE: see pseudo code above.  If curthread has no lwp there
 * is no pcb in which to save fp state on a context switch, so disable
 * preemption (which also prevents migration) by incrementing
 * t_preempt; otherwise call thread_nomigrate().  The ldsb in the
 * annulled delay slot executes only on the branch-taken (no-lwp) path.
 * Clobbers %o0/%o1 and, via the call, all output registers.
 * label1/label2 must be numeric labels unique at the expansion site.
 */
#define FP_NOMIGRATE(label1, label2)                            \
        ldn     [THREAD_REG + T_LWP], %o0                       ;\
        brz,a,pn %o0, label1/**/f /* kernel thread (no lwp)? */ ;\
          ldsb  [THREAD_REG + T_PREEMPT], %o1                   ;\
        call    thread_nomigrate                                ;\
          nop                                                   ;\
        ba      label2/**/f                                     ;\
          nop                                                   ;\
label1:                                                         ;\
        inc     %o1 /* kpreempt_disable(): t_preempt++ */       ;\
        stb     %o1, [THREAD_REG + T_PREEMPT]                   ;\
label2:
 664 
/*
 * FP_ALLOWMIGRATE(label1, label2): undo FP_NOMIGRATE.  With an lwp,
 * call thread_allowmigrate(); otherwise inline kpreempt_enable():
 * decrement t_preempt and, if it reached zero while cpu_kprunrun is
 * set, call kpreempt() with the current %pil.  All outputs forfeit.
 */
#define FP_ALLOWMIGRATE(label1, label2)                 \
        ldn     [THREAD_REG + T_LWP], %o0                       ;\
        brz,a,pn %o0, label1/**/f                               ;\
          ldsb  [THREAD_REG + T_PREEMPT], %o1                   ;\
        call thread_allowmigrate                                ;\
          nop                                                   ;\
        ba      label2/**/f                                     ;\
          nop                                                   ;\
label1:                                                         ;\
        dec     %o1                                             ;\
        brnz,pn %o1, label2/**/f                                ;\
          stb   %o1, [THREAD_REG + T_PREEMPT]                   ;\
        ldn     [THREAD_REG + T_CPU], %o0                       ;\
        ldub    [%o0 + CPU_KPRUNRUN], %o0                       ;\
        brz,pt  %o0, label2/**/f                                ;\
          nop                                                   ;\
        call    kpreempt                                        ;\
          rdpr  %pil, %o0                                       ;\
label2:
 684 
 685 /*
 686  * Copy a block of storage, returning an error code if `from' or
 687  * `to' takes a kernel pagefault which cannot be resolved.
 688  * Returns errno value on pagefault error, 0 if all ok
 689  */
 690 
 691         .seg    ".text"
 692         .align  4
 693 
	/*
	 * kcopy(from [%o0], to [%o1], count [%o2]): protected kernel copy.
	 * Returns 0 on success, or the errno delivered in %g1 if either
	 * address takes an unresolvable pagefault.  A t_lofault handler is
	 * ALWAYS installed (unlike bcopy).  Dispatch: copies at or below
	 * VIS_COPY_THRESHOLD use the leaf path (.kcopy_small); larger ones
	 * use the FP/VIS path (.kcopy_more) unless the per-alignment
	 * hw_copy_limit_{1,2,4,8} tunable is 0 (disabled) or count is
	 * within that limit.
	 */
	ENTRY(kcopy)

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .kcopy_small		! count <= threshold: leaf copy
	  xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .kcopy_8			! check for longword alignment
	  nop
	btst	1, %o3				!
	bz,pt	%ncc, .kcopy_2			! check for half-word
	  nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop
.kcopy_2:
	btst	3, %o3				!
	bz,pt	%ncc, .kcopy_4			! check for word alignment
	  nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop
.kcopy_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop
.kcopy_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop

.kcopy_small:
	sethi	%hi(.sm_copyerr), %o5		! sm_copyerr is lofault value
	or	%o5, %lo(.sm_copyerr), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .sm_do_copy		! common code
	 stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault

.kcopy_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
	or	%l7, %lo(.copyerr), %l7
	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .do_copy			! common code
	  stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault


/*
 * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
 * Errno value is in %g1.  bcopy_more uses fp quadrants 1 and 3.
 * While restoring FP state we run under .copyerr2 (panic) in case the
 * restore itself faults.
 */
.copyerr:
	set	.copyerr2, %l0
	membar	#Sync				! sync error barrier
	stn	%l0, [THREAD_REG + T_LOFAULT]	! set t_lofault
	btst	FPUSED_FLAG, %l6		! did we touch the FP regs?
	bz	%ncc, 1f
	  and	%l6, TRAMP_FLAG, %l0		! copy trampoline flag to %l0

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3			! was FP in use before us?
	bz,pt	%icc, 4f
	  nop

	BLD_FPQ1Q3_FROMSTACK(%o2)		! restore caller's FP regs

	ba,pt	%ncc, 1f
	  wr	%o3, 0, %fprs		! restore fprs

4:
	FZEROQ1Q3				! FP was unused: just scrub it
	wr	%o3, 0, %fprs		! restore fprs

	!
	! Need to cater for the different expectations of kcopy
	! and bcopy. kcopy will *always* set a t_lofault handler
	! If it fires, we're expected to just return the error code
	! and *not* to invoke any existing error handler. As far as
	! bcopy is concerned, we only set t_lofault if there was an
	! existing lofault handler. In that case we're expected to
	! invoke the previously existing handler after resetting the
	! t_lofault value.
	!
1:
	andn	%l6, MASK_FLAGS, %l6		! turn trampoline flag off
	membar	#Sync				! sync error barrier
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)

	btst	TRAMP_FLAG, %l0
	bnz,pn	%ncc, 3f
	  nop
	ret
	  restore	%g1, 0, %o0		! return errno from %g1

3:
	!
	! We're here via bcopy. There *must* have been an error handler
	! in place otherwise we would have died a nasty death already.
	!
	jmp	%l6				! goto real handler
	  restore	%g0, 0, %o0		! dispose of copy window

/*
 * We got here because of a fault in .copyerr.  We can't safely restore fp
 * state, so we panic.
 */
fp_panic_msg:
	.asciz	"Unable to restore fp state after copy operation"

	.align	4
.copyerr2:
	set	fp_panic_msg, %o0
	call	panic
	  nop

/*
 * We got here because of a fault during a small kcopy or bcopy.
 * No floating point registers are used by the small copies.
 * Errno value is in %g1.  %o4 holds the saved t_lofault plus,
 * possibly, TRAMP_FLAG (set only by bcopy's small path).
 */
.sm_copyerr:
1:
	btst	TRAMP_FLAG, %o4
	membar	#Sync
	andn	%o4, TRAMP_FLAG, %o4
	bnz,pn	%ncc, 3f
	  stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	  mov	%g1, %o0			! kcopy: return errno
3:
	jmp	%o4				! goto real handler
	  mov	%g0, %o0			!
	SET_SIZE(kcopy)
 858 
 859 
 860 /*
 861  * Copy a block of storage - must not overlap (from + len <= to).
 862  * Registers: l6 - saved t_lofault
 863  * (for short copies, o4 - saved t_lofault)
 864  *
 865  * Copy a page of memory.
 866  * Assumes double word alignment and a count >= 256.
 867  */
 868 
	/*
	 * bcopy(from [%o0], to [%o1], count [%o2]): unprotected copy of
	 * non-overlapping storage; returns 0.  Unlike kcopy, a t_lofault
	 * handler is installed only if the caller already had one, in
	 * which case TRAMP_FLAG is ORed into the saved value so the fault
	 * path trampolines to it.  Dispatch mirrors kcopy: small copies
	 * stay in the leaf code below, larger aligned ones fall through
	 * to bcopy_more (FP/VIS) subject to hw_copy_limit_{1,2,4,8}
	 * (0 disables the FP path).
	 */
	ENTRY(bcopy)

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .bcopy_small		! count <= threshold: leaf copy
	  xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .bcopy_8			! check for longword alignment
	  nop
	btst	1, %o3				!
	bz,pt	%ncc, .bcopy_2			! check for half-word
	  nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	  nop
.bcopy_2:
	btst	3, %o3				!
	bz,pt	%ncc, .bcopy_4			! check for word alignment
	  nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	  nop
.bcopy_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	  nop
.bcopy_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	  nop

	.align	16
.bcopy_small:
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save t_lofault
	tst	%o4
	bz,pt	%icc, .sm_do_copy		! no existing handler: install none
	  nop
	sethi	%hi(.sm_copyerr), %o5
	or	%o5, %lo(.sm_copyerr), %o5
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! install new vector
	or	%o4, TRAMP_FLAG, %o4		! error should trampoline
.sm_do_copy:
	cmp	%o2, SHORTCOPY		! check for really short case
	bleu,pt	%ncc, .bc_sm_left	!
	  cmp	%o2, CHKSIZE		! check for medium length cases
	bgu,pn	%ncc, .bc_med		!
	  or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .bc_sm_word	! branch to word aligned case
.bc_sm_movebytes:
	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
.bc_sm_notalign4:
	ldub	[%o0], %o3		! read byte
	stb	%o3, [%o1]		! write byte
	subcc	%o2, 4, %o2		! reduce count by 4
	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
	add	%o0, 4, %o0		! advance SRC by 4
	stb	%o3, [%o1 + 1]
	ldub	[%o0 - 2], %o3
	add	%o1, 4, %o1		! advance DST by 4
	stb	%o3, [%o1 - 2]
	ldub	[%o0 - 1], %o3
	bgt,pt	%ncc, .bc_sm_notalign4	! loop til 3 or fewer bytes remain
	  stb	%o3, [%o1 - 1]
	add	%o2, 3, %o2		! restore count
.bc_sm_left:
	tst	%o2
	bz,pt	%ncc, .bc_sm_exit	! check for zero length
	  deccc	%o2			! reduce count for cc test
	ldub	[%o0], %o3		! move one byte
	bz,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1]
	ldub	[%o0 + 1], %o3		! move another byte
	deccc	%o2			! check for more
	bz,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1 + 1]
	ldub	[%o0 + 2], %o3		! move final byte
	ba,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1 + 2]
	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.bc_sm_words:
	lduw	[%o0], %o3		! read word
.bc_sm_wordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o1]		! write word
	add	%o0, 8, %o0		! update SRC
	lduw	[%o0 - 4], %o3		! read word
	add	%o1, 8, %o1		! update DST
	bgt,pt	%ncc, .bc_sm_words	! loop til done
	  stw	%o3, [%o1 - 4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	  deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
.bc_sm_half:
	  subcc	%o2, 2, %o2		! reduce count by 2
	add	%o0, 2, %o0		! advance SRC by 2
	lduh	[%o0 - 2], %o3		! read half word
	add	%o1, 2, %o1		! advance DST by 2
	bgt,pt	%ncc, .bc_sm_half	! loop til done
	  sth	%o3, [%o1 - 2]		! write half word
	addcc	%o2, 1, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	  nop
.bc_sm_byte:
	ldub	[%o0], %o3
	ba,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1]

.bc_sm_word:
	subcc	%o2, 4, %o2		! update count
	bgt,pt	%ncc, .bc_sm_wordx
	  lduw	[%o0], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	  stw	%o3, [%o1]		! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o0 + 4], %o3		! load one byte
	bz,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1 + 4]		! store one byte
	ldub	[%o0 + 5], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1 + 5]		! store second byte
	ldub	[%o0 + 6], %o3		! load third byte
	stb	%o3, [%o1 + 6]		! store third byte
.bc_sm_exit:
	! restore t_lofault only if a handler was in place on entry
	ldn	[THREAD_REG + T_LOFAULT], %o3
	brz,pt	%o3, .bc_sm_done
	  nop
	membar	#Sync				! sync error barrier
	andn	%o4, TRAMP_FLAG, %o4
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
.bc_sm_done:
	retl
	  mov	%g0, %o0		! return 0

	.align 16
.bc_med:
	xor	%o0, %o1, %o3		! setup alignment check
	btst	1, %o3
	bnz,pt	%ncc, .bc_sm_movebytes	! unaligned
	  nop
	btst	3, %o3
	bnz,pt	%ncc, .bc_med_half	! halfword aligned
	  nop
	btst	7, %o3
	bnz,pt	%ncc, .bc_med_word	! word aligned
	  nop
.bc_med_long:
	btst	3, %o0			! check for
	bz,pt	%ncc, .bc_med_long1	! word alignment
	  nop
.bc_med_long0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3,[%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .bc_med_long0
	  dec	%o2
.bc_med_long1:			! word aligned
	btst	7, %o0			! check for long word
	bz,pt	%ncc, .bc_med_long2
	  nop
	lduw	[%o0], %o3		! load word
	add	%o0, 4, %o0		! advance SRC by 4
	stw	%o3, [%o1]		! store word
	add	%o1, 4, %o1		! advance DST by 4
	sub	%o2, 4, %o2		! reduce count by 4
!
!  Now long word aligned and have at least 32 bytes to move
!
.bc_med_long2:
	sub	%o2, 31, %o2		! adjust count to allow cc zero test
.bc_med_lmove:
	ldx	[%o0], %o3		! read long word
	stx	%o3, [%o1]		! write long word
	subcc	%o2, 32, %o2		! reduce count by 32
	ldx	[%o0 + 8], %o3		! repeat for a total of 4 long words
	add	%o0, 32, %o0		! advance SRC by 32
	stx	%o3, [%o1 + 8]
	ldx	[%o0 - 16], %o3
	add	%o1, 32, %o1		! advance DST by 32
	stx	%o3, [%o1 - 16]
	ldx	[%o0 - 8], %o3
	bgt,pt	%ncc, .bc_med_lmove	! loop til 31 or fewer bytes left
	  stx	%o3, [%o1 - 8]
	addcc	%o2, 24, %o2		! restore count to long word offset
	ble,pt	%ncc, .bc_med_lextra	! check for more long words to move
	  nop
.bc_med_lword:
	ldx	[%o0], %o3		! read long word
	subcc	%o2, 8, %o2		! reduce count by 8
	stx	%o3, [%o1]		! write long word
	add	%o0, 8, %o0		! advance SRC by 8
	bgt,pt	%ncc, .bc_med_lword	! loop til 7 or fewer bytes left
	  add	%o1, 8, %o1		! advance DST by 8
.bc_med_lextra:
	addcc	%o2, 7, %o2		! restore rest of count
	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
	  deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	  nop
	ba,pt	%ncc, .bc_sm_half
	  nop

	.align 16
.bc_med_word:
	btst	3, %o0			! check for
	bz,pt	%ncc, .bc_med_word1	! word alignment
	  nop
.bc_med_word0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3,[%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .bc_med_word0
	  dec	%o2
!
!  Now word aligned and have at least 36 bytes to move
!
.bc_med_word1:
	sub	%o2, 15, %o2		! adjust count to allow cc zero test
.bc_med_wmove:
	lduw	[%o0], %o3		! read word
	stw	%o3, [%o1]		! write word
	subcc	%o2, 16, %o2		! reduce count by 16
	lduw	[%o0 + 4], %o3		! repeat for a total of 4 words
	add	%o0, 16, %o0		! advance SRC by 16
	stw	%o3, [%o1 + 4]
	lduw	[%o0 - 8], %o3
	add	%o1, 16, %o1		! advance DST by 16
	stw	%o3, [%o1 - 8]
	lduw	[%o0 - 4], %o3
	bgt,pt	%ncc, .bc_med_wmove	! loop til 15 or fewer bytes left
	  stw	%o3, [%o1 - 4]
	addcc	%o2, 12, %o2		! restore count to word offset
	ble,pt	%ncc, .bc_med_wextra	! check for more words to move
	  nop
.bc_med_word2:
	lduw	[%o0], %o3		! read word
	subcc	%o2, 4, %o2		! reduce count by 4
	stw	%o3, [%o1]		! write word
	add	%o0, 4, %o0		! advance SRC by 4
	bgt,pt	%ncc, .bc_med_word2	! loop til 3 or fewer bytes left
	  add	%o1, 4, %o1		! advance DST by 4
.bc_med_wextra:
	addcc	%o2, 3, %o2		! restore rest of count
	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
	  deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	  nop
	ba,pt	%ncc, .bc_sm_half
	  nop

	.align 16
.bc_med_half:
	btst	1, %o0			! check for
	bz,pt	%ncc, .bc_med_half1	! half word alignment
	  nop
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3,[%o1]		! store byte
	inc	%o1
	dec	%o2
!
!  Now half word aligned and have at least 38 bytes to move
!
.bc_med_half1:
	sub	%o2, 7, %o2		! adjust count to allow cc zero test
.bc_med_hmove:
	lduh	[%o0], %o3		! read half word
	sth	%o3, [%o1]		! write half word
	subcc	%o2, 8, %o2		! reduce count by 8
	lduh	[%o0 + 2], %o3		! repeat for a total of 4 halfwords
	add	%o0, 8, %o0		! advance SRC by 8
	sth	%o3, [%o1 + 2]
	lduh	[%o0 - 4], %o3
	add	%o1, 8, %o1		! advance DST by 8
	sth	%o3, [%o1 - 4]
	lduh	[%o0 - 2], %o3
	bgt,pt	%ncc, .bc_med_hmove	! loop til 7 or fewer bytes left
	  sth	%o3, [%o1 - 2]
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	  deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	  nop
	ba,pt	%ncc, .bc_sm_half
	  nop

	SET_SIZE(bcopy)
1190 
1191 /*
1192  * The _more entry points are not intended to be used directly by
1193  * any caller from outside this file.  They are provided to allow
1194  * profiling and dtrace of the portions of the copy code that uses
1195  * the floating point registers.
1196  * This entry is particularly important as DTRACE (at least as of
1197  * 4/2004) does not support leaf functions.
1198  */
1199 
	/*
	 * bcopy_more: FP/VIS copy path for counts above VIS_COPY_THRESHOLD,
	 * entered from bcopy (here) or kcopy (.do_copy).  Saves the user's
	 * FP state (quadrants 1 and 3) and %gsr on the stack, pins the
	 * thread to its CPU (FP_NOMIGRATE), byte-aligns DST to a
	 * VIS_BLOCKSIZE boundary, then streams 64-byte blocks with
	 * ldd/faligndata/stda (fsrc1 when the source is already 8-byte
	 * aligned), and finishes any tail bytewise.  %l6 carries the saved
	 * t_lofault plus TRAMP_FLAG/FPUSED_FLAG for the .copyerr path.
	 */
	ENTRY(bcopy_more)
.bcopy_more:
	prefetch [%o0], #n_reads
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	ldn	[THREAD_REG + T_LOFAULT], %l6	! save t_lofault
	tst	%l6
	bz,pt	%ncc, .do_copy			! no caller handler: install none
	  nop
	sethi	%hi(.copyerr), %o2
	or	%o2, %lo(.copyerr), %o2
	membar	#Sync				! sync error barrier
	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
	!
	! We've already captured whether t_lofault was zero on entry.
	! We need to mark ourselves as being from bcopy since both
	! kcopy and bcopy use the same code path. If TRAMP_FLAG is set
	! and the saved lofault was zero, we won't reset lofault on
	! returning.
	!
	or	%l6, TRAMP_FLAG, %l6

/*
 * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes
 * Also, use of FP registers has been tested to be enabled
 */
.do_copy:
	FP_NOMIGRATE(6, 7)

	rd	%fprs, %o2		! check for unused fp
	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
	btst	FPRS_FEF, %o2
	bz,a,pt	%icc, .do_blockcopy	! FP unused: just enable it (annulled slot)
	  wr	%g0, FPRS_FEF, %fprs

	BST_FPQ1Q3_TOSTACK(%o2)		! FP live: save quadrants 1 and 3

.do_blockcopy:
	rd	%gsr, %o2
	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
	or	%l6, FPUSED_FLAG, %l6	! tell .copyerr FP state needs restoring

#define	REALSRC	%i0
#define	DST	%i1
#define	CNT	%i2
#define	SRC	%i3
#define	TMP	%i5

	andcc	DST, VIS_BLOCKSIZE - 1, TMP
	bz,pt	%ncc, 2f
	  neg	TMP
	add	TMP, VIS_BLOCKSIZE, TMP

	! TMP = bytes required to align DST on FP_BLOCK boundary
	! Using SRC as a tmp here
	cmp	TMP, 3
	bleu,pt	%ncc, 1f
	  sub	CNT,TMP,CNT		! adjust main count
	sub	TMP, 3, TMP		! adjust for end of loop test
.bc_blkalign:
	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
	stb	SRC, [DST]
	subcc	TMP, 4, TMP
	ldub	[REALSRC + 1], SRC
	add	REALSRC, 4, REALSRC
	stb	SRC, [DST + 1]
	ldub	[REALSRC - 2], SRC
	add	DST, 4, DST
	stb	SRC, [DST - 2]
	ldub	[REALSRC - 1], SRC
	bgu,pt	%ncc, .bc_blkalign
	  stb	SRC, [DST - 1]

	addcc	TMP, 3, TMP		! restore count adjustment
	bz,pt	%ncc, 2f		! no bytes left?
	  nop
1:	ldub	[REALSRC], SRC
	inc	REALSRC
	inc	DST
	deccc	TMP
	bgu	%ncc, 1b
	  stb	SRC, [DST - 1]

2:
	membar	#StoreLoad
	andn	REALSRC, 0x7, SRC	! SRC = REALSRC rounded down to 8 bytes

	! SRC - 8-byte aligned
	! DST - 64-byte aligned
	! Prime the software pipeline: load the first block, set the
	! alignaddr offset from REALSRC, and start the faligndata chain.
	ldd	[SRC], %f0
	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
	alignaddr REALSRC, %g0, %g0
	ldd	[SRC + 0x08], %f2
	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
	faligndata %f0, %f2, %f32
	ldd	[SRC + 0x10], %f4
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	faligndata %f2, %f4, %f34
	ldd	[SRC + 0x18], %f6
	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
	faligndata %f4, %f6, %f36
	ldd	[SRC + 0x20], %f8
	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
	faligndata %f6, %f8, %f38
	ldd	[SRC + 0x28], %f10
	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
	faligndata %f8, %f10, %f40
	ldd	[SRC + 0x30], %f12
	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
	faligndata %f10, %f12, %f42
	ldd	[SRC + 0x38], %f14
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	ba,pt	%ncc, 1f
	  prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
	.align	32
1:
	! Steady-state: one 64-byte block stored per iteration while the
	! next block's loads and aligns are overlapped.
	ldd	[SRC + 0x08], %f2
	faligndata %f12, %f14, %f44
	ldd	[SRC + 0x10], %f4
	faligndata %f14, %f0, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	faligndata %f0, %f2, %f32
	ldd	[SRC + 0x20], %f8
	faligndata %f2, %f4, %f34
	ldd	[SRC + 0x28], %f10
	faligndata %f4, %f6, %f36
	ldd	[SRC + 0x30], %f12
	faligndata %f6, %f8, %f38
	sub	CNT, VIS_BLOCKSIZE, CNT
	ldd	[SRC + 0x38], %f14
	faligndata %f8, %f10, %f40
	add	DST, VIS_BLOCKSIZE, DST
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	faligndata %f10, %f12, %f42
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
	cmp	CNT, VIS_BLOCKSIZE + 8
	bgu,pt	%ncc, 1b
	  prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read

	! only if REALSRC & 0x7 is 0
	cmp	CNT, VIS_BLOCKSIZE
	bne	%ncc, 3f
	  andcc	REALSRC, 0x7, %g0
	bz,pt	%ncc, 2f
	  nop
3:
	! Drain the pipeline: store the block already in flight.
	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,pt	%ncc, 3f
	  nop
2:
	! Exactly one aligned block remains: copy it with fsrc1 moves
	! (no faligndata needed when REALSRC is 8-byte aligned).
	ldd	[SRC + 0x08], %f2
	fsrc1	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fsrc1	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fsrc1	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fsrc1	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fsrc1	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fsrc1	%f8, %f40
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,a,pt	%ncc, .bcb_exit
	  nop

3:	tst	CNT
	bz,a,pt	%ncc, .bcb_exit
	  nop

5:	ldub	[REALSRC], TMP		! byte copy any remaining tail
	inc	REALSRC
	inc	DST
	deccc	CNT
	bgu	%ncc, 5b
	  stb	TMP, [DST - 1]
.bcb_exit:
	membar	#Sync

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	  nop

	BLD_FPQ1Q3_FROMSTACK(%o2)	! restore caller's FP regs

	ba,pt	%ncc, 2f
	  wr	%o3, 0, %fprs		! restore fprs
4:
	FZEROQ1Q3			! FP was unused: scrub the regs we used
	wr	%o3, 0, %fprs		! restore fprs
2:
	membar	#Sync				! sync error barrier
	andn	%l6, MASK_FLAGS, %l6
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)
	ret
	  restore	%g0, 0, %o0	! return 0

	SET_SIZE(bcopy_more)
1424 
1425 /*
1426  * Block copy with possibly overlapped operands.
1427  */
1428 
        ENTRY(ovbcopy)
        !
        ! void ovbcopy(const void *from, const void *to, size_t count)
        !
        ! Byte copy that tolerates overlapping operands.  If the regions
        ! cannot actually overlap (count <= abs(from - to)) the call is
        ! handed to the faster bcopy.  Otherwise the copy direction is
        ! chosen so overlapping bytes are read before they are written:
        ! forwards when from > to, backwards when from < to.
        !
        tst     %o2                     ! check count
        bgu,a   %ncc, 1f                ! nothing to do or bad arguments
          subcc %o0, %o1, %o3           ! difference of from and to address

        retl                            ! return
          nop
1:
        bneg,a  %ncc, 2f
          neg   %o3                     ! if < 0, make it positive
2:      cmp     %o2, %o3                ! cmp size and abs(from - to)
        bleu    %ncc, bcopy             ! if size <= abs(diff): use bcopy,
          .empty                        !   no overlap
          cmp   %o0, %o1                ! compare from and to addresses
        blu     %ncc, .ov_bkwd          ! if from < to, copy backwards
          nop
        !
        ! Copy forwards, one byte per iteration.
        !
.ov_fwd:
        ldub    [%o0], %o3              ! read from address
        inc     %o0                     ! inc from address
        stb     %o3, [%o1]              ! write to address
        deccc   %o2                     ! dec count
        bgu     %ncc, .ov_fwd           ! loop till done
          inc   %o1                     ! inc to address

        retl                            ! return
          nop
        !
        ! Copy backwards, from the last byte of each region down.
        !
.ov_bkwd:
        deccc   %o2                     ! dec count
        ldub    [%o0 + %o2], %o3        ! get byte at end of src
        bgu     %ncc, .ov_bkwd          ! loop till done
          stb   %o3, [%o1 + %o2]        ! delay slot, store at end of dst

        retl                            ! return
          nop

        SET_SIZE(ovbcopy)
1471 
1472 
1473 /*
1474  * hwblkpagecopy()
1475  *
1476  * Copies exactly one page.  This routine assumes the caller (ppcopy)
1477  * has already disabled kernel preemption and has checked
1478  * use_hw_bcopy.  Preventing preemption also prevents cpu migration.
1479  */
        ENTRY(hwblkpagecopy)
        !
        ! void hwblkpagecopy(const void *src, void *dst)
        !
        ! Copy exactly one PAGESIZE page with a software-pipelined
        ! VIS block copy: 64-bit ldd loads fill %f0-%f14, the previous
        ! block is staged into %f32-%f46 and flushed with a 64-byte
        ! stda ...ASI_BLK_P block store while the next block loads.
        ! Both addresses are assumed page (hence 8-byte and
        ! VIS_BLOCKSIZE) aligned, so plain fmovd/fsrc1 is used instead
        ! of faligndata.  Caller (ppcopy) has already disabled kernel
        ! preemption, which also prevents cpu migration, and checked
        ! use_hw_bcopy.
        !
        ! SRC/DST/CNT/REALSRC are register aliases #defined earlier in
        ! this file (not visible in this chunk).
        !
        ! get another window w/space for three aligned blocks of saved fpregs
        prefetch [%o0], #n_reads
        save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp

        ! %i0 - source address (arg)
        ! %i1 - destination address (arg)
        ! %i2 - length of region (not arg)
        ! %l0 - saved fprs
        ! %l1 - pointer to saved fpregs

        rd      %fprs, %l0              ! check for unused fp
        btst    FPRS_FEF, %l0
        bz,a,pt %icc, 1f                ! fp unused: just enable it
          wr    %g0, FPRS_FEF, %fprs

        BST_FPQ1Q3_TOSTACK(%l1)         ! fp in use: save quads 1 and 3

        ! Prime the pipeline: load the first 64-byte block into
        ! %f0-%f14, copy it to the store bank %f32-%f46, and issue the
        ! initial prefetch pattern.
1:      set     PAGESIZE, CNT
        mov     REALSRC, SRC

        ldd     [SRC], %f0
        prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
        ldd     [SRC + 0x08], %f2
        prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
        fmovd   %f0, %f32
        ldd     [SRC + 0x10], %f4
        prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
        fmovd   %f2, %f34
        ldd     [SRC + 0x18], %f6
        prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
        fmovd   %f4, %f36
        ldd     [SRC + 0x20], %f8
        prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
        fmovd   %f6, %f38
        ldd     [SRC + 0x28], %f10
        prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
        fmovd   %f8, %f40
        ldd     [SRC + 0x30], %f12
        prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
        fmovd   %f10, %f42
        ldd     [SRC + 0x38], %f14
        ldd     [SRC + VIS_BLOCKSIZE], %f0      ! first dword of next block
        sub     CNT, VIS_BLOCKSIZE, CNT
        add     SRC, VIS_BLOCKSIZE, SRC
        prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
        ba,pt   %ncc, 2f
        prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
        .align  32
        ! Main loop: store block N-1 (%f32-%f46) while loading block N.
2:
        ldd     [SRC + 0x08], %f2
        fmovd   %f12, %f44
        ldd     [SRC + 0x10], %f4
        fmovd   %f14, %f46
        stda    %f32, [DST]ASI_BLK_P
        ldd     [SRC + 0x18], %f6
        fmovd   %f0, %f32
        ldd     [SRC + 0x20], %f8
        fmovd   %f2, %f34
        ldd     [SRC + 0x28], %f10
        fmovd   %f4, %f36
        ldd     [SRC + 0x30], %f12
        fmovd   %f6, %f38
        ldd     [SRC + 0x38], %f14
        fmovd   %f8, %f40
        ldd     [SRC + VIS_BLOCKSIZE], %f0
        fmovd   %f10, %f42
        sub     CNT, VIS_BLOCKSIZE, CNT
        prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
        add     DST, VIS_BLOCKSIZE, DST
        prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
        add     SRC, VIS_BLOCKSIZE, SRC
        cmp     CNT, VIS_BLOCKSIZE + 8
        bgu,pt  %ncc, 2b
          prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read

        ! trailing block: drain the pipeline and store the final two blocks
        ldd     [SRC + 0x08], %f2
        fsrc1   %f12, %f44
        ldd     [SRC + 0x10], %f4
        fsrc1   %f14, %f46
        stda    %f32, [DST]ASI_BLK_P
        ldd     [SRC + 0x18], %f6
        fsrc1   %f0, %f32
        ldd     [SRC + 0x20], %f8
        fsrc1   %f2, %f34
        ldd     [SRC + 0x28], %f10
        fsrc1   %f4, %f36
        ldd     [SRC + 0x30], %f12
        fsrc1   %f6, %f38
        ldd     [SRC + 0x38], %f14
        fsrc1   %f8, %f40
        sub     CNT, VIS_BLOCKSIZE, CNT
        add     DST, VIS_BLOCKSIZE, DST
        add     SRC, VIS_BLOCKSIZE, SRC
        fsrc1   %f10, %f42
        fsrc1   %f12, %f44
        fsrc1   %f14, %f46
        stda    %f32, [DST]ASI_BLK_P

        membar  #Sync                   ! complete block stores before fp restore

        ! Restore the caller's fp state: reload saved quads if fp was
        ! live on entry, else zero them so no data leaks to userland.
        btst    FPRS_FEF, %l0
        bz,pt   %icc, 2f
          nop

        BLD_FPQ1Q3_FROMSTACK(%l3)
        ba      3f
          nop

2:      FZEROQ1Q3

3:      wr      %l0, 0, %fprs           ! restore fprs
        ret
          restore       %g0, 0, %o0

        SET_SIZE(hwblkpagecopy)
1597 
1598 
1599 /*
1600  * Transfer data to and from user space -
1601  * Note that these routines can cause faults
1602  * It is assumed that the kernel has nothing at
1603  * less than KERNELBASE in the virtual address space.
1604  *
1605  * Note that copyin(9F) and copyout(9F) are part of the
1606  * DDI/DKI which specifies that they return '-1' on "errors."
1607  *
1608  * Sigh.
1609  *
 * So there are two extremely similar routines - xcopyin() and xcopyout()
1611  * which return the errno that we've faithfully computed.  This
1612  * allows other callers (e.g. uiomove(9F)) to work correctly.
1613  * Given that these are used pretty heavily, we expand the calling
1614  * sequences inline for all flavours (rather than making wrappers).
1615  *
1616  * There are also stub routines for xcopyout_little and xcopyin_little,
1617  * which currently are intended to handle requests of <= 16 bytes from
1618  * do_unaligned. Future enhancement to make them handle 8k pages efficiently
1619  * is left as an exercise...
1620  */
1621 
1622 /*
1623  * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
1624  *
1625  * General theory of operation:
1626  *
1627  * The only difference between copy{in,out} and
1628  * xcopy{in,out} is in the error handling routine they invoke
1629  * when a memory access error occurs. xcopyOP returns the errno
1630  * while copyOP returns -1 (see above). copy{in,out}_noerr set
1631  * a special flag (by oring the TRAMP_FLAG into the fault handler address)
1632  * if they are called with a fault handler already in place. That flag
1633  * causes the default handlers to trampoline to the previous handler
1634  * upon an error.
1635  *
1636  * None of the copyops routines grab a window until it's decided that
1637  * we need to do a HW block copy operation. This saves a window
1638  * spill/fill when we're called during socket ops. The typical IO
1639  * path won't cause spill/fill traps.
1640  *
1641  * This code uses a set of 4 limits for the maximum size that will
1642  * be copied given a particular input/output address alignment.
1643  * If the value for a particular limit is zero, the copy will be performed
1644  * by the plain copy loops rather than FPBLK.
1645  *
1646  * See the description of bcopy above for more details of the
1647  * data copying algorithm and the default limits.
1648  *
1649  */
1650 
1651 /*
1652  * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
1653  */
1654 
1655 /*
1656  * We save the arguments in the following registers in case of a fault:
1657  *      kaddr - %l1
1658  *      uaddr - %l2
1659  *      count - %l3
1660  */
1661 #define SAVE_SRC        %l1
1662 #define SAVE_DST        %l2
1663 #define SAVE_COUNT      %l3
1664 
1665 #define SM_SAVE_SRC             %g4
1666 #define SM_SAVE_DST             %g5
1667 #define SM_SAVE_COUNT           %o5
1668 #define ERRNO           %l5
1669 
1670 
1671 #define REAL_LOFAULT    %l4
1672 /*
1673  * Generic copyio fault handler.  This is the first line of defense when a
1674  * fault occurs in (x)copyin/(x)copyout.  In order for this to function
1675  * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
1676  * This allows us to share common code for all the flavors of the copy
1677  * operations, including the _noerr versions.
1678  *
1679  * Note that this function will restore the original input parameters before
1680  * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
1681  * member of the t_copyop structure, if needed.
1682  */
        ENTRY(copyio_fault)
        !
        ! Generic copyio fault handler, reached via t_lofault when a
        ! memory access faults inside the FP-block flavors of
        ! (x)copyin/(x)copyout.  On entry %g1 holds the errno from the
        ! trap code.  If the FPUSED_FLAG bit is set in %l6 the FP/VIS
        ! state was in use: restore %gsr and either reload the saved
        ! quads from the stack or zero them (so no kernel fp data leaks),
        ! then restore %fprs.  Finally restore the previous t_lofault,
        ! re-allow migration, put the original (kaddr, uaddr, count)
        ! arguments back in %i0-%i2 and jump to the routine-specific
        ! handler in REAL_LOFAULT, which decides what to return.
        !
        membar  #Sync
        mov     %g1,ERRNO                       ! save errno in ERRNO
        btst    FPUSED_FLAG, %l6
        bz      %ncc, 1f                        ! fp not used: skip fp restore
          nop

        ldx     [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
        wr      %o2, 0, %gsr            ! restore gsr

        ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
        btst    FPRS_FEF, %o3
        bz,pt   %icc, 4f                ! fp was unused on entry: zero quads
          nop

        BLD_FPQ2Q4_FROMSTACK(%o2)

        ba,pt   %ncc, 1f
          wr    %o3, 0, %fprs           ! restore fprs

4:
        FZEROQ2Q4
        wr      %o3, 0, %fprs           ! restore fprs

1:
        andn    %l6, FPUSED_FLAG, %l6
        membar  #Sync
        stn     %l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        FP_ALLOWMIGRATE(5, 6)

        mov     SAVE_SRC, %i0           ! restore original arguments so the
        mov     SAVE_DST, %i1           ! real handler can retry via t_copyop
        jmp     REAL_LOFAULT
          mov   SAVE_COUNT, %i2

        SET_SIZE(copyio_fault)
1719 
1720 
        ENTRY(copyout)
        !
        ! int copyout(const void *kaddr, void *uaddr, size_t count)
        !
        ! Copy kernel data to user space.  Returns 0 on success; on a
        ! fault the .sm_copyout_err lofault handler returns -1 (DDI/DKI)
        ! or tail-calls the thread's t_copyops CP_COPYOUT handler with
        ! the original arguments.  Short or poorly aligned requests are
        ! copied here with leaf-routine loops using ASI_USER stores;
        ! larger requests whose src/dst are mutually alignable branch to
        ! .copyout_more (FP/VIS block copy), gated per alignment class
        ! by the tunable hw_copy_limit_{1,2,4,8} globals (0 = HW copy
        ! disabled for that class).
        !

        cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
        bleu,pt %ncc, .copyout_small            ! small copy if length <= threshold
          xor   %o0, %o1, %o3                   ! are src, dst alignable?
        btst    7, %o3                          !
        bz,pt   %ncc, .copyout_8                ! check for longword alignment
          nop
        btst    1, %o3                          !
        bz,pt   %ncc, .copyout_2                ! check for half-word
          nop
        sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_1)], %o3
        tst     %o3
        bz,pn   %icc, .copyout_small            ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .copyout_small            ! go to small copy
          nop
        ba,pt   %ncc, .copyout_more             ! otherwise go to large copy
          nop
.copyout_2:
        btst    3, %o3                          !
        bz,pt   %ncc, .copyout_4                ! check for word alignment
          nop
        sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_2)], %o3
        tst     %o3
        bz,pn   %icc, .copyout_small            ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .copyout_small            ! go to small copy
          nop
        ba,pt   %ncc, .copyout_more             ! otherwise go to large copy
          nop
.copyout_4:
        ! already checked longword, must be word aligned
        sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_4)], %o3
        tst     %o3
        bz,pn   %icc, .copyout_small            ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .copyout_small            ! go to small copy
          nop
        ba,pt   %ncc, .copyout_more             ! otherwise go to large copy
          nop
.copyout_8:
        sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_8)], %o3
        tst     %o3
        bz,pn   %icc, .copyout_small            ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .copyout_small            ! go to small copy
          nop
        ba,pt   %ncc, .copyout_more             ! otherwise go to large copy
          nop

        .align  16
        nop                             ! instruction alignment
                                        ! see discussion at start of file
.copyout_small:
        sethi   %hi(.sm_copyout_err), %o5       ! .sm_copyout_err is lofault
        or      %o5, %lo(.sm_copyout_err), %o5
        ldn     [THREAD_REG + T_LOFAULT], %o4   ! save existing handler
        membar  #Sync                           ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! set t_lofault
.sm_do_copyout:
        mov     %o0, SM_SAVE_SRC        ! save args for the fault handler
        mov     %o1, SM_SAVE_DST
        cmp     %o2, SHORTCOPY          ! check for really short case
        bleu,pt %ncc, .co_sm_left       !
          mov   %o2, SM_SAVE_COUNT
        cmp     %o2, CHKSIZE            ! check for medium length cases
        bgu,pn  %ncc, .co_med           !
          or    %o0, %o1, %o3           ! prepare alignment check
        andcc   %o3, 0x3, %g0           ! test for alignment
        bz,pt   %ncc, .co_sm_word       ! branch to word aligned case
.co_sm_movebytes:
          sub   %o2, 3, %o2             ! adjust count to allow cc zero test
.co_sm_notalign4:
        ldub    [%o0], %o3              ! read byte
        subcc   %o2, 4, %o2             ! reduce count by 4
        stba    %o3, [%o1]ASI_USER      ! write byte
        inc     %o1                     ! advance DST by 1
        ldub    [%o0 + 1], %o3          ! repeat for a total of 4 bytes
        add     %o0, 4, %o0             ! advance SRC by 4
        stba    %o3, [%o1]ASI_USER
        inc     %o1                     ! advance DST by 1
        ldub    [%o0 - 2], %o3
        stba    %o3, [%o1]ASI_USER
        inc     %o1                     ! advance DST by 1
        ldub    [%o0 - 1], %o3
        stba    %o3, [%o1]ASI_USER
        bgt,pt  %ncc, .co_sm_notalign4  ! loop til 3 or fewer bytes remain
          inc   %o1                     ! advance DST by 1
        add     %o2, 3, %o2             ! restore count
.co_sm_left:
        tst     %o2
        bz,pt   %ncc, .co_sm_exit       ! check for zero length
          nop
        ldub    [%o0], %o3              ! load one byte
        deccc   %o2                     ! reduce count for cc test
        bz,pt   %ncc, .co_sm_exit
          stba  %o3,[%o1]ASI_USER       ! store one byte
        ldub    [%o0 + 1], %o3          ! load second byte
        deccc   %o2
        inc     %o1
        bz,pt   %ncc, .co_sm_exit
          stba  %o3,[%o1]ASI_USER       ! store second byte
        ldub    [%o0 + 2], %o3          ! load third byte
        inc     %o1
        stba    %o3,[%o1]ASI_USER       ! store third byte
        membar  #Sync                           ! sync error barrier
        stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        retl
          mov   %g0, %o0                ! return 0
        .align  16
.co_sm_words:
        lduw    [%o0], %o3              ! read word
.co_sm_wordx:
        subcc   %o2, 8, %o2             ! update count
        stwa    %o3, [%o1]ASI_USER      ! write word
        add     %o0, 8, %o0             ! update SRC
        lduw    [%o0 - 4], %o3          ! read word
        add     %o1, 4, %o1             ! update DST
        stwa    %o3, [%o1]ASI_USER      ! write word
        bgt,pt  %ncc, .co_sm_words      ! loop til done
          add   %o1, 4, %o1             ! update DST
        addcc   %o2, 7, %o2             ! restore count
        bz,pt   %ncc, .co_sm_exit
          nop
        deccc   %o2
        bz,pt   %ncc, .co_sm_byte
.co_sm_half:
          subcc %o2, 2, %o2             ! reduce count by 2
        lduh    [%o0], %o3              ! read half word
        add     %o0, 2, %o0             ! advance SRC by 2
        stha    %o3, [%o1]ASI_USER      ! write half word
        bgt,pt  %ncc, .co_sm_half       ! loop til done
          add   %o1, 2, %o1             ! advance DST by 2
        addcc   %o2, 1, %o2             ! restore count
        bz,pt   %ncc, .co_sm_exit
          nop
.co_sm_byte:
        ldub    [%o0], %o3
        stba    %o3, [%o1]ASI_USER
        membar  #Sync                           ! sync error barrier
        stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        retl
          mov   %g0, %o0                ! return 0
        .align 16
.co_sm_word:
        subcc   %o2, 4, %o2             ! update count
        bgt,pt  %ncc, .co_sm_wordx
          lduw  [%o0], %o3              ! read word
        addcc   %o2, 3, %o2             ! restore count
        bz,pt   %ncc, .co_sm_exit
          stwa  %o3, [%o1]ASI_USER      ! write word
        deccc   %o2                     ! reduce count for cc test
        ldub    [%o0 + 4], %o3          ! load one byte
        add     %o1, 4, %o1
        bz,pt   %ncc, .co_sm_exit
          stba  %o3, [%o1]ASI_USER      ! store one byte
        ldub    [%o0 + 5], %o3          ! load second byte
        deccc   %o2
        inc     %o1
        bz,pt   %ncc, .co_sm_exit
          stba  %o3, [%o1]ASI_USER      ! store second byte
        ldub    [%o0 + 6], %o3          ! load third byte
        inc     %o1
        stba    %o3, [%o1]ASI_USER      ! store third byte
.co_sm_exit:
          membar        #Sync                           ! sync error barrier
        stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        retl
          mov   %g0, %o0                ! return 0

        .align 16
.co_med:
        xor     %o0, %o1, %o3           ! setup alignment check
        btst    1, %o3
        bnz,pt  %ncc, .co_sm_movebytes  ! unaligned
          nop
        btst    3, %o3
        bnz,pt  %ncc, .co_med_half      ! halfword aligned
          nop
        btst    7, %o3
        bnz,pt  %ncc, .co_med_word      ! word aligned
          nop
.co_med_long:
        btst    3, %o0                  ! check for
        bz,pt   %ncc, .co_med_long1     ! word alignment
          nop
.co_med_long0:
        ldub    [%o0], %o3              ! load one byte
        inc     %o0
        stba    %o3,[%o1]ASI_USER       ! store byte
        inc     %o1
        btst    3, %o0
        bnz,pt  %ncc, .co_med_long0
          dec   %o2
.co_med_long1:                  ! word aligned
        btst    7, %o0                  ! check for long word
        bz,pt   %ncc, .co_med_long2
          nop
        lduw    [%o0], %o3              ! load word
        add     %o0, 4, %o0             ! advance SRC by 4
        stwa    %o3, [%o1]ASI_USER      ! store word
        add     %o1, 4, %o1             ! advance DST by 4
        sub     %o2, 4, %o2             ! reduce count by 4
!
!  Now long word aligned and have at least 32 bytes to move
!
.co_med_long2:
        sub     %o2, 31, %o2            ! adjust count to allow cc zero test
        sub     %o1, 8, %o1             ! adjust pointer to allow store in
                                        ! branch delay slot instead of add
.co_med_lmove:
        add     %o1, 8, %o1             ! advance DST by 8
        ldx     [%o0], %o3              ! read long word
        subcc   %o2, 32, %o2            ! reduce count by 32
        stxa    %o3, [%o1]ASI_USER      ! write long word
        add     %o1, 8, %o1             ! advance DST by 8
        ldx     [%o0 + 8], %o3          ! repeat for a total of 4 long words
        add     %o0, 32, %o0            ! advance SRC by 32
        stxa    %o3, [%o1]ASI_USER
        ldx     [%o0 - 16], %o3
        add     %o1, 8, %o1             ! advance DST by 8
        stxa    %o3, [%o1]ASI_USER
        ldx     [%o0 - 8], %o3
        add     %o1, 8, %o1             ! advance DST by 8
        bgt,pt  %ncc, .co_med_lmove     ! loop til 31 or fewer bytes left
          stxa  %o3, [%o1]ASI_USER
        add     %o1, 8, %o1             ! advance DST by 8
        addcc   %o2, 24, %o2            ! restore count to long word offset
        ble,pt  %ncc, .co_med_lextra    ! check for more long words to move
          nop
.co_med_lword:
        ldx     [%o0], %o3              ! read long word
        subcc   %o2, 8, %o2             ! reduce count by 8
        stxa    %o3, [%o1]ASI_USER      ! write long word
        add     %o0, 8, %o0             ! advance SRC by 8
        bgt,pt  %ncc, .co_med_lword     ! loop til 7 or fewer bytes left
          add   %o1, 8, %o1             ! advance DST by 8
.co_med_lextra:
        addcc   %o2, 7, %o2             ! restore rest of count
        bz,pt   %ncc, .co_sm_exit       ! if zero, then done
          deccc %o2
        bz,pt   %ncc, .co_sm_byte
          nop
        ba,pt   %ncc, .co_sm_half
          nop

        .align 16
        nop                             ! instruction alignment
                                        ! see discussion at start of file
.co_med_word:
        btst    3, %o0                  ! check for
        bz,pt   %ncc, .co_med_word1     ! word alignment
          nop
.co_med_word0:
        ldub    [%o0], %o3              ! load one byte
        inc     %o0
        stba    %o3,[%o1]ASI_USER       ! store byte
        inc     %o1
        btst    3, %o0
        bnz,pt  %ncc, .co_med_word0
          dec   %o2
!
!  Now word aligned and have at least 36 bytes to move
!
.co_med_word1:
        sub     %o2, 15, %o2            ! adjust count to allow cc zero test
.co_med_wmove:
        lduw    [%o0], %o3              ! read word
        subcc   %o2, 16, %o2            ! reduce count by 16
        stwa    %o3, [%o1]ASI_USER      ! write word
        add     %o1, 4, %o1             ! advance DST by 4
        lduw    [%o0 + 4], %o3          ! repeat for a total of 4 words
        add     %o0, 16, %o0            ! advance SRC by 16
        stwa    %o3, [%o1]ASI_USER
        add     %o1, 4, %o1             ! advance DST by 4
        lduw    [%o0 - 8], %o3
        stwa    %o3, [%o1]ASI_USER
        add     %o1, 4, %o1             ! advance DST by 4
        lduw    [%o0 - 4], %o3
        stwa    %o3, [%o1]ASI_USER
        bgt,pt  %ncc, .co_med_wmove     ! loop til 15 or fewer bytes left
          add   %o1, 4, %o1             ! advance DST by 4
        addcc   %o2, 12, %o2            ! restore count to word offset
        ble,pt  %ncc, .co_med_wextra    ! check for more words to move
          nop
.co_med_word2:
        lduw    [%o0], %o3              ! read word
        subcc   %o2, 4, %o2             ! reduce count by 4
        stwa    %o3, [%o1]ASI_USER      ! write word
        add     %o0, 4, %o0             ! advance SRC by 4
        bgt,pt  %ncc, .co_med_word2     ! loop til 3 or fewer bytes left
          add   %o1, 4, %o1             ! advance DST by 4
.co_med_wextra:
        addcc   %o2, 3, %o2             ! restore rest of count
        bz,pt   %ncc, .co_sm_exit       ! if zero, then done
          deccc %o2
        bz,pt   %ncc, .co_sm_byte
          nop
        ba,pt   %ncc, .co_sm_half
          nop

        .align 16
        nop                             ! instruction alignment
        nop                             ! see discussion at start of file
        nop
.co_med_half:
        btst    1, %o0                  ! check for
        bz,pt   %ncc, .co_med_half1     ! half word alignment
          nop
        ldub    [%o0], %o3              ! load one byte
        inc     %o0
        stba    %o3,[%o1]ASI_USER       ! store byte
        inc     %o1
        dec     %o2
!
!  Now half word aligned and have at least 38 bytes to move
!
.co_med_half1:
        sub     %o2, 7, %o2             ! adjust count to allow cc zero test
.co_med_hmove:
        lduh    [%o0], %o3              ! read half word
        subcc   %o2, 8, %o2             ! reduce count by 8
        stha    %o3, [%o1]ASI_USER      ! write half word
        add     %o1, 2, %o1             ! advance DST by 2
        lduh    [%o0 + 2], %o3          ! repeat for a total of 4 halfwords
        add     %o0, 8, %o0             ! advance SRC by 8
        stha    %o3, [%o1]ASI_USER
        add     %o1, 2, %o1             ! advance DST by 2
        lduh    [%o0 - 4], %o3
        stha    %o3, [%o1]ASI_USER
        add     %o1, 2, %o1             ! advance DST by 2
        lduh    [%o0 - 2], %o3
        stha    %o3, [%o1]ASI_USER
        bgt,pt  %ncc, .co_med_hmove     ! loop til 7 or fewer bytes left
          add   %o1, 2, %o1             ! advance DST by 2
        addcc   %o2, 7, %o2             ! restore count
        bz,pt   %ncc, .co_sm_exit
          deccc %o2
        bz,pt   %ncc, .co_sm_byte
          nop
        ba,pt   %ncc, .co_sm_half
          nop

/*
 * We got here because of a fault during short copyout.
 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
 */
.sm_copyout_err:
        membar  #Sync
        stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        mov     SM_SAVE_SRC, %o0
        mov     SM_SAVE_DST, %o1
        mov     SM_SAVE_COUNT, %o2
        ldn     [THREAD_REG + T_COPYOPS], %o3   ! check for copyop handler
        tst     %o3
        bz,pt   %ncc, 3f                        ! if not, return error
          nop
        ldn     [%o3 + CP_COPYOUT], %o5         ! if handler, invoke it with
        jmp     %o5                             ! original arguments
          nop
3:
        retl
          or    %g0, -1, %o0            ! return error value

        SET_SIZE(copyout)
2091 
2092 /*
2093  * The _more entry points are not intended to be used directly by
2094  * any caller from outside this file.  They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
2096  * the floating point registers.
2097  * This entry is particularly important as DTRACE (at least as of
2098  * 4/2004) does not support leaf functions.
2099  */
2100 
        ENTRY(copyout_more)
.copyout_more:
        ! Non-leaf entry for large copyouts (> VIS_COPY_THRESHOLD) so
        ! profiling/DTrace can see this code (see block comment above).
        prefetch [%o0], #n_reads                ! prime cache with first src line
        save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
        set     .copyout_err, REAL_LOFAULT      ! error exit for plain copyout

/*
 * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
 */
.do_copyout:
        set     copyio_fault, %l7               ! copyio_fault is lofault val

        ldn     [THREAD_REG + T_LOFAULT], %l6   ! save existing handler
        membar  #Sync                           ! sync error barrier
        stn     %l7, [THREAD_REG + T_LOFAULT]   ! set t_lofault

        ! Preserve the original arguments so a fault handler can hand
        ! them to an installed copyops vector.
        mov     %i0, SAVE_SRC
        mov     %i1, SAVE_DST
        mov     %i2, SAVE_COUNT

        FP_NOMIGRATE(6, 7)              ! keep thread in place while FP in use

        rd      %fprs, %o2              ! check for unused fp
        st      %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
        btst    FPRS_FEF, %o2
        bz,a,pt %icc, .do_blockcopyout  ! FP idle: just enable it
          wr    %g0, FPRS_FEF, %fprs

        BST_FPQ2Q4_TOSTACK(%o2)         ! FP live: spill quads 2-4 to stack

.do_blockcopyout:
        rd      %gsr, %o2
        stx     %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]      ! save gsr
        or      %l6, FPUSED_FLAG, %l6   ! mark FP state for restore on fault

        andcc   DST, VIS_BLOCKSIZE - 1, TMP     ! DST offset within a block
        mov     ASI_USER, %asi
        bz,pt   %ncc, 2f                ! DST already block aligned
          neg   TMP
        add     TMP, VIS_BLOCKSIZE, TMP

        ! TMP = bytes required to align DST on FP_BLOCK boundary
        ! Using SRC as a tmp here
        cmp     TMP, 3
        bleu,pt %ncc, 1f                ! 3 or fewer: single-byte loop below
          sub   CNT,TMP,CNT             ! adjust main count
        sub     TMP, 3, TMP             ! adjust for end of loop test
.co_blkalign:
        ldub    [REALSRC], SRC          ! move 4 bytes per loop iteration
        stba    SRC, [DST]%asi
        subcc   TMP, 4, TMP
        ldub    [REALSRC + 1], SRC
        add     REALSRC, 4, REALSRC
        stba    SRC, [DST + 1]%asi
        ldub    [REALSRC - 2], SRC
        add     DST, 4, DST
        stba    SRC, [DST - 2]%asi
        ldub    [REALSRC - 1], SRC
        bgu,pt  %ncc, .co_blkalign
          stba  SRC, [DST - 1]%asi

        addcc   TMP, 3, TMP             ! restore count adjustment
        bz,pt   %ncc, 2f                ! no bytes left?
          nop
1:      ldub    [REALSRC], SRC          ! finish alignment one byte at a time
        inc     REALSRC
        inc     DST
        deccc   TMP
        bgu     %ncc, 1b
          stba  SRC, [DST - 1]%asi

2:
        membar  #StoreLoad
        andn    REALSRC, 0x7, SRC       ! SRC = REALSRC rounded down to 8

        ! SRC - 8-byte aligned
        ! DST - 64-byte aligned
        ! Prime the software pipeline: load the first 64-byte block into
        ! %f16-%f30 and shift it into %f48-%f62 with faligndata.  The
        ! alignaddr sets the GSR align field from the low bits of REALSRC.
        ldd     [SRC], %f16
        prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
        alignaddr REALSRC, %g0, %g0
        ldd     [SRC + 0x08], %f18
        prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
        faligndata %f16, %f18, %f48
        ldd     [SRC + 0x10], %f20
        prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
        faligndata %f18, %f20, %f50
        ldd     [SRC + 0x18], %f22
        prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
        faligndata %f20, %f22, %f52
        ldd     [SRC + 0x20], %f24
        prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
        faligndata %f22, %f24, %f54
        ldd     [SRC + 0x28], %f26
        prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
        faligndata %f24, %f26, %f56
        ldd     [SRC + 0x30], %f28
        prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
        faligndata %f26, %f28, %f58
        ldd     [SRC + 0x38], %f30
        ldd     [SRC + VIS_BLOCKSIZE], %f16
        sub     CNT, VIS_BLOCKSIZE, CNT
        add     SRC, VIS_BLOCKSIZE, SRC
        prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
        add     REALSRC, VIS_BLOCKSIZE, REALSRC
        ba,pt   %ncc, 1f
        prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
        .align  32
1:
        ! Main loop: one 64-byte block per iteration.  Loads of the next
        ! block overlap the faligndata shifts of the current one; the
        ! aligned block in %f48-%f62 goes to user space via a block store
        ! through ASI_BLK_AIUS.
        ldd     [SRC + 0x08], %f18
        faligndata %f28, %f30, %f60
        ldd     [SRC + 0x10], %f20
        faligndata %f30, %f16, %f62
        stda    %f48, [DST]ASI_BLK_AIUS
        ldd     [SRC + 0x18], %f22
        faligndata %f16, %f18, %f48
        ldd     [SRC + 0x20], %f24
        faligndata %f18, %f20, %f50
        ldd     [SRC + 0x28], %f26
        faligndata %f20, %f22, %f52
        ldd     [SRC + 0x30], %f28
        faligndata %f22, %f24, %f54
        sub     CNT, VIS_BLOCKSIZE, CNT
        ldd     [SRC + 0x38], %f30
        faligndata %f24, %f26, %f56
        add     DST, VIS_BLOCKSIZE, DST
        ldd     [SRC + VIS_BLOCKSIZE], %f16
        faligndata %f26, %f28, %f58
        add     REALSRC, VIS_BLOCKSIZE, REALSRC
        prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
        add     SRC, VIS_BLOCKSIZE, SRC
        prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
        cmp     CNT, VIS_BLOCKSIZE + 8
        bgu,pt  %ncc, 1b
          prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read

        ! only if REALSRC & 0x7 is 0
        cmp     CNT, VIS_BLOCKSIZE
        bne     %ncc, 3f
          andcc REALSRC, 0x7, %g0
        bz,pt   %ncc, 2f
          nop
3:
        ! General tail: store the block already staged in %f48-%f62,
        ! then fall into the byte loop below for any remaining bytes.
        faligndata %f28, %f30, %f60
        faligndata %f30, %f16, %f62
        stda    %f48, [DST]ASI_BLK_AIUS
        add     DST, VIS_BLOCKSIZE, DST
        ba,pt   %ncc, 3f
          nop
2:
        ! Exactly one block of count remains and the source is 8-byte
        ! aligned: no realignment needed, so drain the pipeline with
        ! fsrc1 moves and store the final blocks.
        ldd     [SRC + 0x08], %f18
        fsrc1   %f28, %f60
        ldd     [SRC + 0x10], %f20
        fsrc1   %f30, %f62
        stda    %f48, [DST]ASI_BLK_AIUS
        ldd     [SRC + 0x18], %f22
        fsrc1   %f16, %f48
        ldd     [SRC + 0x20], %f24
        fsrc1   %f18, %f50
        ldd     [SRC + 0x28], %f26
        fsrc1   %f20, %f52
        ldd     [SRC + 0x30], %f28
        fsrc1   %f22, %f54
        ldd     [SRC + 0x38], %f30
        fsrc1   %f24, %f56
        sub     CNT, VIS_BLOCKSIZE, CNT
        add     DST, VIS_BLOCKSIZE, DST
        add     SRC, VIS_BLOCKSIZE, SRC
        add     REALSRC, VIS_BLOCKSIZE, REALSRC
        fsrc1   %f26, %f58
        fsrc1   %f28, %f60
        fsrc1   %f30, %f62
        stda    %f48, [DST]ASI_BLK_AIUS
        add     DST, VIS_BLOCKSIZE, DST
        ba,a,pt %ncc, 4f                ! ",a" annuls the delay-slot nop
          nop

3:      tst     CNT                     ! any trailing bytes left?
        bz,a    %ncc, 4f
          nop

5:      ldub    [REALSRC], TMP          ! byte loop for the tail
        inc     REALSRC
        inc     DST
        deccc   CNT
        bgu     %ncc, 5b
          stba  TMP, [DST - 1]%asi
4:

.copyout_exit:
        ! Common success exit: restore GSR, FP registers (or scrub them
        ! if FP was idle on entry), %fprs, and the saved lofault handler.
        membar  #Sync

        ldx     [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
        wr      %o2, 0, %gsr            ! restore gsr

        ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
        btst    FPRS_FEF, %o3
        bz,pt   %icc, 4f                ! FP was idle on entry
          nop

        BLD_FPQ2Q4_FROMSTACK(%o2)       ! reload caller's FP quads 2-4

        ba,pt   %ncc, 1f
          wr    %o3, 0, %fprs           ! restore fprs

4:
        FZEROQ2Q4                       ! FP was idle: zero quads 2-4
        wr      %o3, 0, %fprs           ! restore fprs

1:
        membar  #Sync
        andn    %l6, FPUSED_FLAG, %l6
        stn     %l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        FP_ALLOWMIGRATE(5, 6)
        ret
          restore       %g0, 0, %o0    ! return 0 on success

/*
 * We got here because of a fault during copyout.
 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
 */
.copyout_err:
        ldn     [THREAD_REG + T_COPYOPS], %o4   ! check for copyop handler
        tst     %o4
        bz,pt   %ncc, 2f                        ! if not, return error
          nop
        ldn     [%o4 + CP_COPYOUT], %g2         ! if handler, invoke it with
        jmp     %g2                             ! original arguments
          restore %g0, 0, %g0                   ! dispose of copy window
2:
        ret
          restore %g0, -1, %o0                  ! return error value


        SET_SIZE(copyout_more)
2335 
2336 
        ENTRY(xcopyout)
        ! Like copyout, but on fault returns the errno value (ERRNO)
        ! rather than -1.  Dispatch: copies at or below the threshold,
        ! or below the per-alignment HW copy limit, use the leaf small
        ! path; larger aligned copies go to the FP/VIS path.
        cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
        bleu,pt %ncc, .xcopyout_small           ! go to small copy
          xor   %o0, %o1, %o3                   ! are src, dst alignable?
        btst    7, %o3                          !
        bz,pt   %ncc, .xcopyout_8               ! check for longword alignment
          nop
        btst    1, %o3                          !
        bz,pt   %ncc, .xcopyout_2               ! check for half-word
          nop
        sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_1)], %o3
        tst     %o3
        bz,pn   %icc, .xcopyout_small           ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .xcopyout_small           ! go to small copy
          nop
        ba,pt   %ncc, .xcopyout_more            ! otherwise go to large copy
          nop
.xcopyout_2:
        btst    3, %o3                          !
        bz,pt   %ncc, .xcopyout_4               ! check for word alignment
          nop
        sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_2)], %o3
        tst     %o3
        bz,pn   %icc, .xcopyout_small           ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .xcopyout_small           ! go to small copy
          nop
        ba,pt   %ncc, .xcopyout_more            ! otherwise go to large copy
          nop
.xcopyout_4:
        ! already checked longword, must be word aligned
        sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_4)], %o3
        tst     %o3
        bz,pn   %icc, .xcopyout_small           ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .xcopyout_small           ! go to small copy
          nop
        ba,pt   %ncc, .xcopyout_more            ! otherwise go to large copy
          nop
.xcopyout_8:
        sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_8)], %o3
        tst     %o3
        bz,pn   %icc, .xcopyout_small           ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .xcopyout_small           ! go to small copy
          nop
        ba,pt   %ncc, .xcopyout_more            ! otherwise go to large copy
          nop

.xcopyout_small:
        ! Leaf path: install .sm_xcopyout_err as lofault and fall into
        ! the shared copyout small-copy code.
        sethi   %hi(.sm_xcopyout_err), %o5      ! .sm_xcopyout_err is lofault
        or      %o5, %lo(.sm_xcopyout_err), %o5
        ldn     [THREAD_REG + T_LOFAULT], %o4   ! save existing handler
        membar  #Sync                           ! sync error barrier
        ba,pt   %ncc, .sm_do_copyout            ! common code
          stn   %o5, [THREAD_REG + T_LOFAULT]   ! set t_lofault

.xcopyout_more:
        ! FP/VIS path: same as copyout_more but with the xcopyout
        ! error exit (returns errno, not -1).
        save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
        sethi   %hi(.xcopyout_err), REAL_LOFAULT
        ba,pt   %ncc, .do_copyout               ! common code
          or    REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT

/*
 * We got here because of fault during xcopyout
 * Errno value is in ERRNO
 */
.xcopyout_err:
        ldn     [THREAD_REG + T_COPYOPS], %o4   ! check for copyop handler
        tst     %o4
        bz,pt   %ncc, 2f                        ! if not, return error
          nop
        ldn     [%o4 + CP_XCOPYOUT], %g2        ! if handler, invoke it with
        jmp     %g2                             ! original arguments
          restore %g0, 0, %g0                   ! dispose of copy window
2:
        ret
          restore ERRNO, 0, %o0                 ! return errno value

.sm_xcopyout_err:
        ! Fault in the leaf path: restore lofault, then either redispatch
        ! to an installed copyops handler with the original arguments or
        ! return the errno value held in %g1.
        membar  #Sync
        stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        mov     SM_SAVE_SRC, %o0
        mov     SM_SAVE_DST, %o1
        mov     SM_SAVE_COUNT, %o2
        ldn     [THREAD_REG + T_COPYOPS], %o3   ! check for copyop handler
        tst     %o3
        bz,pt   %ncc, 3f                        ! if not, return error
          nop
        ldn     [%o3 + CP_XCOPYOUT], %o5        ! if handler, invoke it with
        jmp     %o5                             ! original arguments
          nop
3:
        retl
          or    %g1, 0, %o0             ! return errno value

        SET_SIZE(xcopyout)
2440 
        ENTRY(xcopyout_little)
        ! Copy %o2 bytes from kernel address %o0 to user address %o1,
        ! storing through the little-endian user ASI (ASI_AIUSL).  The
        ! source is read from its last byte backward while the
        ! destination advances forward.  Faults vector to .xcopyio_err
        ! (defined elsewhere in this file); success returns 0.
        sethi   %hi(.xcopyio_err), %o5
        or      %o5, %lo(.xcopyio_err), %o5
        ldn     [THREAD_REG + T_LOFAULT], %o4
        membar  #Sync                           ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]
        mov     %o4, %o5                        ! keep old handler in %o5

        subcc   %g0, %o2, %o3           ! %o3 = -count, counts up to 0
        add     %o0, %o2, %o0
        bz,pn   %ncc, 2f                ! check for zero bytes
          sub   %o2, 1, %o4
        add     %o0, %o4, %o0           ! start w/last byte
        add     %o1, %o2, %o1
        ldub    [%o0 + %o3], %o4

1:      stba    %o4, [%o1 + %o3]ASI_AIUSL
        inccc   %o3
        sub     %o0, 2, %o0             ! get next byte (net -1 w/ inc of %o3)
        bcc,a,pt %ncc, 1b               ! loop until %o3 wraps to zero
          ldub  [%o0 + %o3], %o4

2:
        membar  #Sync                           ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        retl
          mov   %g0, %o0                ! return (0)

        SET_SIZE(xcopyout_little)
2470 
2471 /*
2472  * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
2473  */
2474 
        ENTRY(copyin)
        ! Copy %o2 bytes from user address %o0 (loads via ASI_USER) to
        ! kernel address %o1.  Returns 0 on success; on fault, -1 or the
        ! result of an installed copyops handler.  Small or unalignable
        ! copies use the leaf routines below; large aligned copies go to
        ! .copyin_more (FP/VIS path), gated by the hw_copy_limit_* tunables.
        cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
        bleu,pt %ncc, .copyin_small             ! go to small copy
          xor   %o0, %o1, %o3                   ! are src, dst alignable?
        btst    7, %o3                          !
        bz,pt   %ncc, .copyin_8                 ! check for longword alignment
          nop
        btst    1, %o3                          !
        bz,pt   %ncc, .copyin_2                 ! check for half-word
          nop
        sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_1)], %o3
        tst     %o3
        bz,pn   %icc, .copyin_small             ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .copyin_small             ! go to small copy
          nop
        ba,pt   %ncc, .copyin_more              ! otherwise go to large copy
          nop
.copyin_2:
        btst    3, %o3                          !
        bz,pt   %ncc, .copyin_4                 ! check for word alignment
          nop
        sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_2)], %o3
        tst     %o3
        bz,pn   %icc, .copyin_small             ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .copyin_small             ! go to small copy
          nop
        ba,pt   %ncc, .copyin_more              ! otherwise go to large copy
          nop
.copyin_4:
        ! already checked longword, must be word aligned
        sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_4)], %o3
        tst     %o3
        bz,pn   %icc, .copyin_small             ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .copyin_small             ! go to small copy
          nop
        ba,pt   %ncc, .copyin_more              ! otherwise go to large copy
          nop
.copyin_8:
        sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_8)], %o3
        tst     %o3
        bz,pn   %icc, .copyin_small             ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .copyin_small             ! go to small copy
          nop
        ba,pt   %ncc, .copyin_more              ! otherwise go to large copy
          nop

        .align  16
        nop                             ! instruction alignment
                                        ! see discussion at start of file
.copyin_small:
        sethi   %hi(.sm_copyin_err), %o5        ! .sm_copyin_err is lofault
        or      %o5, %lo(.sm_copyin_err), %o5
        ldn     [THREAD_REG + T_LOFAULT], %o4   ! set/save t_lofault, no tramp
        membar  #Sync                           ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]
.sm_do_copyin:
        ! Save original arguments so the fault path can redispatch to
        ! an installed copyops handler.
        mov     %o0, SM_SAVE_SRC
        mov     %o1, SM_SAVE_DST
        cmp     %o2, SHORTCOPY          ! check for really short case
        bleu,pt %ncc, .ci_sm_left       !
          mov   %o2, SM_SAVE_COUNT
        cmp     %o2, CHKSIZE            ! check for medium length cases
        bgu,pn  %ncc, .ci_med           !
          or    %o0, %o1, %o3           ! prepare alignment check
        andcc   %o3, 0x3, %g0           ! test for alignment
        bz,pt   %ncc, .ci_sm_word       ! branch to word aligned case
.ci_sm_movebytes:
          sub   %o2, 3, %o2             ! adjust count to allow cc zero test
.ci_sm_notalign4:
        ! Unaligned: move 4 bytes per iteration with byte loads/stores.
        lduba   [%o0]ASI_USER, %o3      ! read byte
        subcc   %o2, 4, %o2             ! reduce count by 4
        stb     %o3, [%o1]              ! write byte
        add     %o0, 1, %o0             ! advance SRC by 1
        lduba   [%o0]ASI_USER, %o3      ! repeat for a total of 4 bytes
        add     %o0, 1, %o0             ! advance SRC by 1
        stb     %o3, [%o1 + 1]
        add     %o1, 4, %o1             ! advance DST by 4
        lduba   [%o0]ASI_USER, %o3
        add     %o0, 1, %o0             ! advance SRC by 1
        stb     %o3, [%o1 - 2]
        lduba   [%o0]ASI_USER, %o3
        add     %o0, 1, %o0             ! advance SRC by 1
        bgt,pt  %ncc, .ci_sm_notalign4  ! loop til 3 or fewer bytes remain
          stb   %o3, [%o1 - 1]
        add     %o2, 3, %o2             ! restore count
.ci_sm_left:
        ! At most 3 bytes remain here.
        tst     %o2
        bz,pt   %ncc, .ci_sm_exit
          nop
        lduba   [%o0]ASI_USER, %o3              ! load one byte
        deccc   %o2                     ! reduce count for cc test
        bz,pt   %ncc, .ci_sm_exit
          stb   %o3,[%o1]               ! store one byte
        inc     %o0
        lduba   [%o0]ASI_USER, %o3      ! load second byte
        deccc   %o2
        bz,pt   %ncc, .ci_sm_exit
          stb   %o3,[%o1 + 1]           ! store second byte
        inc     %o0
        lduba   [%o0]ASI_USER, %o3      ! load third byte
        stb     %o3,[%o1 + 2]           ! store third byte
        membar  #Sync                           ! sync error barrier
        stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        retl
          mov   %g0, %o0                ! return 0
        .align  16
.ci_sm_words:
        ! Word aligned: move 8 bytes (two words) per iteration.
        lduwa   [%o0]ASI_USER, %o3              ! read word
.ci_sm_wordx:
        subcc   %o2, 8, %o2             ! update count
        stw     %o3, [%o1]              ! write word
        add     %o0, 4, %o0             ! update SRC
        add     %o1, 8, %o1             ! update DST
        lduwa   [%o0]ASI_USER, %o3      ! read word
        add     %o0, 4, %o0             ! update SRC
        bgt,pt  %ncc, .ci_sm_words      ! loop til done
          stw   %o3, [%o1 - 4]          ! write word
        addcc   %o2, 7, %o2             ! restore count
        bz,pt   %ncc, .ci_sm_exit
          nop
        deccc   %o2
        bz,pt   %ncc, .ci_sm_byte
.ci_sm_half:
          subcc %o2, 2, %o2             ! reduce count by 2
        lduha   [%o0]ASI_USER, %o3      ! read half word
        add     %o0, 2, %o0             ! advance SRC by 2
        add     %o1, 2, %o1             ! advance DST by 2
        bgt,pt  %ncc, .ci_sm_half       ! loop til done
          sth   %o3, [%o1 - 2]          ! write half word
        addcc   %o2, 1, %o2             ! restore count
        bz,pt   %ncc, .ci_sm_exit
          nop
.ci_sm_byte:
        ! Exactly one byte remains.
        lduba   [%o0]ASI_USER, %o3
        stb     %o3, [%o1]
        membar  #Sync                           ! sync error barrier
        stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        retl
          mov   %g0, %o0                ! return 0
        .align  16
.ci_sm_word:
        subcc   %o2, 4, %o2             ! update count
        bgt,pt  %ncc, .ci_sm_wordx
          lduwa [%o0]ASI_USER, %o3              ! read word
        addcc   %o2, 3, %o2             ! restore count
        bz,pt   %ncc, .ci_sm_exit
          stw   %o3, [%o1]              ! write word
        deccc   %o2                     ! reduce count for cc test
        add     %o0, 4, %o0
        lduba   [%o0]ASI_USER, %o3      ! load one byte
        bz,pt   %ncc, .ci_sm_exit
          stb   %o3, [%o1 + 4]          ! store one byte
        inc     %o0
        lduba   [%o0]ASI_USER, %o3      ! load second byte
        deccc   %o2
        bz,pt   %ncc, .ci_sm_exit
          stb   %o3, [%o1 + 5]          ! store second byte
        inc     %o0
        lduba   [%o0]ASI_USER, %o3      ! load third byte
        stb     %o3, [%o1 + 6]          ! store third byte
.ci_sm_exit:
        ! Common success exit for the leaf path.
        membar  #Sync                           ! sync error barrier
        stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        retl
          mov   %g0, %o0                ! return 0

        .align 16
.ci_med:
        ! Medium-length copies: pick byte/half/word/long strategy from
        ! the joint alignment of SRC and DST.
        xor     %o0, %o1, %o3           ! setup alignment check
        btst    1, %o3
        bnz,pt  %ncc, .ci_sm_movebytes  ! unaligned
          nop
        btst    3, %o3
        bnz,pt  %ncc, .ci_med_half      ! halfword aligned
          nop
        btst    7, %o3
        bnz,pt  %ncc, .ci_med_word      ! word aligned
          nop
.ci_med_long:
        btst    3, %o0                  ! check for
        bz,pt   %ncc, .ci_med_long1     ! word alignment
          nop
.ci_med_long0:
        ! Byte-copy until SRC is word aligned.
        lduba   [%o0]ASI_USER, %o3              ! load one byte
        inc     %o0
        stb     %o3,[%o1]               ! store byte
        inc     %o1
        btst    3, %o0
        bnz,pt  %ncc, .ci_med_long0
          dec   %o2
.ci_med_long1:                  ! word aligned
        btst    7, %o0                  ! check for long word
        bz,pt   %ncc, .ci_med_long2
          nop
        lduwa   [%o0]ASI_USER, %o3      ! load word
        add     %o0, 4, %o0             ! advance SRC by 4
        stw     %o3, [%o1]              ! store word
        add     %o1, 4, %o1             ! advance DST by 4
        sub     %o2, 4, %o2             ! reduce count by 4
!
!  Now long word aligned and have at least 32 bytes to move
!
.ci_med_long2:
        sub     %o2, 31, %o2            ! adjust count to allow cc zero test
.ci_med_lmove:
        ! Move 32 bytes (four long words) per iteration.
        ldxa    [%o0]ASI_USER, %o3      ! read long word
        subcc   %o2, 32, %o2            ! reduce count by 32
        stx     %o3, [%o1]              ! write long word
        add     %o0, 8, %o0             ! advance SRC by 8
        ldxa    [%o0]ASI_USER, %o3      ! repeat for a total for 4 long words
        add     %o0, 8, %o0             ! advance SRC by 8
        stx     %o3, [%o1 + 8]
        add     %o1, 32, %o1            ! advance DST by 32
        ldxa    [%o0]ASI_USER, %o3
        add     %o0, 8, %o0             ! advance SRC by 8
        stx     %o3, [%o1 - 16]
        ldxa    [%o0]ASI_USER, %o3
        add     %o0, 8, %o0             ! advance SRC by 8
        bgt,pt  %ncc, .ci_med_lmove     ! loop til 31 or fewer bytes left
          stx   %o3, [%o1 - 8]
        addcc   %o2, 24, %o2            ! restore count to long word offset
        ble,pt  %ncc, .ci_med_lextra    ! check for more long words to move
          nop
.ci_med_lword:
        ldxa    [%o0]ASI_USER, %o3      ! read long word
        subcc   %o2, 8, %o2             ! reduce count by 8
        stx     %o3, [%o1]              ! write long word
        add     %o0, 8, %o0             ! advance SRC by 8
        bgt,pt  %ncc, .ci_med_lword     ! loop til 7 or fewer bytes left
          add   %o1, 8, %o1             ! advance DST by 8
.ci_med_lextra:
        addcc   %o2, 7, %o2             ! restore rest of count
        bz,pt   %ncc, .ci_sm_exit       ! if zero, then done
          deccc %o2
        bz,pt   %ncc, .ci_sm_byte
          nop
        ba,pt   %ncc, .ci_sm_half
          nop

        .align 16
        nop                             ! instruction alignment
                                        ! see discussion at start of file
.ci_med_word:
        btst    3, %o0                  ! check for
        bz,pt   %ncc, .ci_med_word1     ! word alignment
          nop
.ci_med_word0:
        ! Byte-copy until SRC is word aligned.
        lduba   [%o0]ASI_USER, %o3      ! load one byte
        inc     %o0
        stb     %o3,[%o1]               ! store byte
        inc     %o1
        btst    3, %o0
        bnz,pt  %ncc, .ci_med_word0
          dec   %o2
!
!  Now word aligned and have at least 36 bytes to move
!
.ci_med_word1:
        sub     %o2, 15, %o2            ! adjust count to allow cc zero test
.ci_med_wmove:
        ! Move 16 bytes (four words) per iteration.
        lduwa   [%o0]ASI_USER, %o3      ! read word
        subcc   %o2, 16, %o2            ! reduce count by 16
        stw     %o3, [%o1]              ! write word
        add     %o0, 4, %o0             ! advance SRC by 4
        lduwa   [%o0]ASI_USER, %o3      ! repeat for a total for 4 words
        add     %o0, 4, %o0             ! advance SRC by 4
        stw     %o3, [%o1 + 4]
        add     %o1, 16, %o1            ! advance DST by 16
        lduwa   [%o0]ASI_USER, %o3
        add     %o0, 4, %o0             ! advance SRC by 4
        stw     %o3, [%o1 - 8]
        lduwa   [%o0]ASI_USER, %o3
        add     %o0, 4, %o0             ! advance SRC by 4
        bgt,pt  %ncc, .ci_med_wmove     ! loop til 15 or fewer bytes left
          stw   %o3, [%o1 - 4]
        addcc   %o2, 12, %o2            ! restore count to word offset
        ble,pt  %ncc, .ci_med_wextra    ! check for more words to move
          nop
.ci_med_word2:
        lduwa   [%o0]ASI_USER, %o3      ! read word
        subcc   %o2, 4, %o2             ! reduce count by 4
        stw     %o3, [%o1]              ! write word
        add     %o0, 4, %o0             ! advance SRC by 4
        bgt,pt  %ncc, .ci_med_word2     ! loop til 3 or fewer bytes left
          add   %o1, 4, %o1             ! advance DST by 4
.ci_med_wextra:
        addcc   %o2, 3, %o2             ! restore rest of count
        bz,pt   %ncc, .ci_sm_exit       ! if zero, then done
          deccc %o2
        bz,pt   %ncc, .ci_sm_byte
          nop
        ba,pt   %ncc, .ci_sm_half
          nop

        .align 16
        nop                             ! instruction alignment
                                        ! see discussion at start of file
.ci_med_half:
        btst    1, %o0                  ! check for
        bz,pt   %ncc, .ci_med_half1     ! half word alignment
          nop
        lduba   [%o0]ASI_USER, %o3      ! load one byte
        inc     %o0
        stb     %o3,[%o1]               ! store byte
        inc     %o1
        dec     %o2
!
!  Now half word aligned and have at least 38 bytes to move
!
.ci_med_half1:
        sub     %o2, 7, %o2             ! adjust count to allow cc zero test
.ci_med_hmove:
        ! Move 8 bytes (four halfwords) per iteration.
        lduha   [%o0]ASI_USER, %o3      ! read half word
        subcc   %o2, 8, %o2             ! reduce count by 8
        sth     %o3, [%o1]              ! write half word
        add     %o0, 2, %o0             ! advance SRC by 2
        lduha   [%o0]ASI_USER, %o3      ! repeat for a total for 4 halfwords
        add     %o0, 2, %o0             ! advance SRC by 2
        sth     %o3, [%o1 + 2]
        add     %o1, 8, %o1             ! advance DST by 8
        lduha   [%o0]ASI_USER, %o3
        add     %o0, 2, %o0             ! advance SRC by 2
        sth     %o3, [%o1 - 4]
        lduha   [%o0]ASI_USER, %o3
        add     %o0, 2, %o0             ! advance SRC by 2
        bgt,pt  %ncc, .ci_med_hmove     ! loop til 7 or fewer bytes left
          sth   %o3, [%o1 - 2]
        addcc   %o2, 7, %o2             ! restore count
        bz,pt   %ncc, .ci_sm_exit
          deccc %o2
        bz,pt   %ncc, .ci_sm_byte
          nop
        ba,pt   %ncc, .ci_sm_half
          nop

.sm_copyin_err:
        ! Fault in the leaf path: restore lofault, then either redispatch
        ! to an installed copyops handler with the original arguments or
        ! return -1 as DDI/DKI requires.
        membar  #Sync
        stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        mov     SM_SAVE_SRC, %o0
        mov     SM_SAVE_DST, %o1
        mov     SM_SAVE_COUNT, %o2
        ldn     [THREAD_REG + T_COPYOPS], %o3   ! check for copyop handler
        tst     %o3
        bz,pt   %ncc, 3f                        ! if not, return error
          nop
        ldn     [%o3 + CP_COPYIN], %o5          ! if handler, invoke it with
        jmp     %o5                             ! original arguments
          nop
3:
        retl
          or    %g0, -1, %o0            ! return -1 (DDI/DKI)

        SET_SIZE(copyin)
2836 
2837 
2838 /*
2839  * The _more entry points are not intended to be used directly by
2840  * any caller from outside this file.  They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
2842  * the floating point registers.
2843  * This entry is particularly important as DTRACE (at least as of
2844  * 4/2004) does not support leaf functions.
2845  */
2846 
        ENTRY(copyin_more)
.copyin_more:
        prefetch [%o0], #n_reads                ! warm the first source line
        save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
        set     .copyin_err, REAL_LOFAULT       ! error target for this entry

/*
 * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes.
 * .do_copyin is shared: xcopyin_more and copyin_noerr_more branch here
 * after installing their own REAL_LOFAULT.
 * Register roles (aliases from assym/macros): REALSRC = user src,
 * DST = kernel dst, CNT = remaining count, TMP/SRC used as scratch.
 */
.do_copyin:
        set     copyio_fault, %l7               ! copyio_fault is lofault val

        ldn     [THREAD_REG + T_LOFAULT], %l6   ! save existing handler
        membar  #Sync                           ! sync error barrier
        stn     %l7, [THREAD_REG + T_LOFAULT]   ! set t_lofault

        ! Preserve the original arguments so an error handler can be
        ! invoked with them (see .copyin_err / copyops below).
        mov     %i0, SAVE_SRC
        mov     %i1, SAVE_DST
        mov     %i2, SAVE_COUNT

        FP_NOMIGRATE(6, 7)              ! keep thread here while FP regs are live

        rd      %fprs, %o2              ! check for unused fp
        st      %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
        btst    FPRS_FEF, %o2
        bz,a,pt %icc, .do_blockcopyin   ! FP was off: just enable it
          wr    %g0, FPRS_FEF, %fprs

        BST_FPQ2Q4_TOSTACK(%o2)         ! FP in use: save the regs we clobber

.do_blockcopyin:
        rd      %gsr, %o2
        stx     %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]      ! save gsr
        or      %l6, FPUSED_FLAG, %l6   ! note FP state must be restored on fault

        andcc   DST, VIS_BLOCKSIZE - 1, TMP     ! DST already block aligned?
        mov     ASI_USER, %asi                  ! user-space loads go via %asi
        bz,pt   %ncc, 2f
          neg   TMP
        add     TMP, VIS_BLOCKSIZE, TMP

        ! TMP = bytes required to align DST on FP_BLOCK boundary
        ! Using SRC as a tmp here
        cmp     TMP, 3
        bleu,pt %ncc, 1f                ! 1-3 bytes: byte loop below suffices
          sub   CNT,TMP,CNT             ! adjust main count
        sub     TMP, 3, TMP             ! adjust for end of loop test
.ci_blkalign:
        lduba   [REALSRC]%asi, SRC      ! move 4 bytes per loop iteration
        stb     SRC, [DST]
        subcc   TMP, 4, TMP
        lduba   [REALSRC + 1]%asi, SRC
        add     REALSRC, 4, REALSRC
        stb     SRC, [DST + 1]
        lduba   [REALSRC - 2]%asi, SRC
        add     DST, 4, DST
        stb     SRC, [DST - 2]
        lduba   [REALSRC - 1]%asi, SRC
        bgu,pt  %ncc, .ci_blkalign
          stb   SRC, [DST - 1]

        addcc   TMP, 3, TMP             ! restore count adjustment
        bz,pt   %ncc, 2f                ! no bytes left?
          nop
1:      lduba   [REALSRC]%asi, SRC      ! copy the last 1-3 alignment bytes
        inc     REALSRC
        inc     DST
        deccc   TMP
        bgu     %ncc, 1b
          stb   SRC, [DST - 1]

2:
        membar  #StoreLoad
        andn    REALSRC, 0x7, SRC       ! SRC = REALSRC rounded down to 8 bytes

        ! SRC - 8-byte aligned
        ! DST - 64-byte aligned
        ! Prime the software pipeline: load the first 64-byte block into
        ! %f16-%f30, shift it into %f48-%f58 with faligndata (alignaddr
        ! set from REALSRC's low bits), and start the prefetch stream.
        ldda    [SRC]%asi, %f16
        prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #n_reads
        alignaddr REALSRC, %g0, %g0
        ldda    [SRC + 0x08]%asi, %f18
        prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #n_reads
        faligndata %f16, %f18, %f48
        ldda    [SRC + 0x10]%asi, %f20
        prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
        faligndata %f18, %f20, %f50
        ldda    [SRC + 0x18]%asi, %f22
        prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read
        faligndata %f20, %f22, %f52
        ldda    [SRC + 0x20]%asi, %f24
        prefetcha [SRC + (8 * VIS_BLOCKSIZE)]%asi, #one_read
        faligndata %f22, %f24, %f54
        ldda    [SRC + 0x28]%asi, %f26
        prefetcha [SRC + (12 * VIS_BLOCKSIZE)]%asi, #one_read
        faligndata %f24, %f26, %f56
        ldda    [SRC + 0x30]%asi, %f28
        prefetcha [SRC + (16 * VIS_BLOCKSIZE)]%asi, #one_read
        faligndata %f26, %f28, %f58
        ldda    [SRC + 0x38]%asi, %f30
        ldda    [SRC + VIS_BLOCKSIZE]%asi, %f16
        sub     CNT, VIS_BLOCKSIZE, CNT
        add     SRC, VIS_BLOCKSIZE, SRC
        prefetcha [SRC + (19 * VIS_BLOCKSIZE)]%asi, #one_read
        add     REALSRC, VIS_BLOCKSIZE, REALSRC
        ba,pt   %ncc, 1f
        prefetcha [SRC + (23 * VIS_BLOCKSIZE)]%asi, #one_read
        .align  32
1:
        ! Main loop: one VIS block (64 bytes) per iteration.  Loads run one
        ! block ahead of the faligndata/stda stream; OLYMPUS_C_PREFETCH and
        ! OLYMPUS_C_2ND_PREFETCH are named prefetch distances (in blocks).
        ! Loop while more than VIS_BLOCKSIZE + 8 bytes remain so the
        ! read-ahead [SRC + VIS_BLOCKSIZE] load stays in bounds.
        ldda    [SRC + 0x08]%asi, %f18
        faligndata %f28, %f30, %f60
        ldda    [SRC + 0x10]%asi, %f20
        faligndata %f30, %f16, %f62
        stda    %f48, [DST]ASI_BLK_P
        ldda    [SRC + 0x18]%asi, %f22
        faligndata %f16, %f18, %f48
        ldda    [SRC + 0x20]%asi, %f24
        faligndata %f18, %f20, %f50
        ldda    [SRC + 0x28]%asi, %f26
        faligndata %f20, %f22, %f52
        ldda    [SRC + 0x30]%asi, %f28
        faligndata %f22, %f24, %f54
        sub     CNT, VIS_BLOCKSIZE, CNT
        ldda    [SRC + 0x38]%asi, %f30
        faligndata %f24, %f26, %f56
        add     DST, VIS_BLOCKSIZE, DST
        ldda    [SRC + VIS_BLOCKSIZE]%asi, %f16
        faligndata %f26, %f28, %f58
        add     REALSRC, VIS_BLOCKSIZE, REALSRC
        prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
        add     SRC, VIS_BLOCKSIZE, SRC
        prefetcha [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
        cmp     CNT, VIS_BLOCKSIZE + 8
        bgu,pt  %ncc, 1b
          prefetcha [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read

        ! only if REALSRC & 0x7 is 0
        cmp     CNT, VIS_BLOCKSIZE
        bne     %ncc, 3f                ! not exactly one full block left
          andcc REALSRC, 0x7, %g0
        bz,pt   %ncc, 2f                ! aligned: take the fsrc1 path
          nop
3:
        ! Drain the pipeline: finish aligning and store the pending block;
        ! whatever is left (CNT) is copied byte-by-byte at 5: below.
        faligndata %f28, %f30, %f60
        faligndata %f30, %f16, %f62
        stda    %f48, [DST]ASI_BLK_P
        add     DST, VIS_BLOCKSIZE, DST
        ba,pt   %ncc, 3f
          nop
2:
        ! REALSRC is 8-byte aligned and exactly one block remains: no
        ! faligndata needed — fsrc1-copy it and store both the pending
        ! and the final block, leaving CNT == 0.
        ldda    [SRC + 0x08]%asi, %f18
        fsrc1   %f28, %f60
        ldda    [SRC + 0x10]%asi, %f20
        fsrc1   %f30, %f62
        stda    %f48, [DST]ASI_BLK_P
        ldda    [SRC + 0x18]%asi, %f22
        fsrc1   %f16, %f48
        ldda    [SRC + 0x20]%asi, %f24
        fsrc1   %f18, %f50
        ldda    [SRC + 0x28]%asi, %f26
        fsrc1   %f20, %f52
        ldda    [SRC + 0x30]%asi, %f28
        fsrc1   %f22, %f54
        ldda    [SRC + 0x38]%asi, %f30
        fsrc1   %f24, %f56
        sub     CNT, VIS_BLOCKSIZE, CNT
        add     DST, VIS_BLOCKSIZE, DST
        add     SRC, VIS_BLOCKSIZE, SRC
        add     REALSRC, VIS_BLOCKSIZE, REALSRC
        fsrc1   %f26, %f58
        fsrc1   %f28, %f60
        fsrc1   %f30, %f62
        stda    %f48, [DST]ASI_BLK_P
        add     DST, VIS_BLOCKSIZE, DST
        ba,a,pt %ncc, 4f
          nop

3:      tst     CNT                     ! any trailing bytes?
        bz,a    %ncc, 4f
          nop

5:      lduba   [REALSRC]ASI_USER, TMP  ! byte-copy the remainder
        inc     REALSRC
        inc     DST
        deccc   CNT
        bgu     %ncc, 5b
          stb   TMP, [DST - 1]
4:

.copyin_exit:
        membar  #Sync

        ldx     [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! restore gsr
        wr      %o2, 0, %gsr

        ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
        btst    FPRS_FEF, %o3           ! was FP live on entry?
        bz,pt   %icc, 4f
          nop

        BLD_FPQ2Q4_FROMSTACK(%o2)       ! yes: reload the saved FP registers

        ba,pt   %ncc, 1f
          wr    %o3, 0, %fprs           ! restore fprs

4:
        FZEROQ2Q4                       ! no: scrub the FP regs we dirtied
        wr      %o3, 0, %fprs           ! restore fprs

1:
        membar  #Sync                           ! sync error barrier
        andn    %l6, FPUSED_FLAG, %l6
        stn     %l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        FP_ALLOWMIGRATE(5, 6)
        ret
          restore       %g0, 0, %o0             ! return (0)
/*
 * We got here because of a fault during copyin
 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
 */
.copyin_err:
        ldn     [THREAD_REG + T_COPYOPS], %o4   ! check for copyop handler
        tst     %o4
        bz,pt   %ncc, 2f                        ! if not, return error
        nop
        ldn     [%o4 + CP_COPYIN], %g2          ! if handler, invoke it with
        jmp     %g2                             ! original arguments
        restore %g0, 0, %g0                     ! dispose of copy window
2:
        ret
        restore %g0, -1, %o0                    ! return error value


        SET_SIZE(copyin_more)
3080 
/*
 * xcopyin - copy from user (%o0) to kernel (%o1), %o2 bytes; like copyin
 * but the fault path returns the errno value rather than -1.  Chooses the
 * leaf (small) or FP-register (large) path based on length, combined
 * src/dst alignment, and the per-alignment hw_copy_limit_N tunables.
 */
        ENTRY(xcopyin)

        cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
        bleu,pt %ncc, .xcopyin_small            ! go to larger cases
          xor   %o0, %o1, %o3                   ! are src, dst alignable?
        btst    7, %o3                          !
        bz,pt   %ncc, .xcopyin_8                ! check for longword alignment
          nop
        btst    1, %o3                          !
        bz,pt   %ncc, .xcopyin_2                ! check for half-word
          nop
        sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_1)], %o3
        tst     %o3
        bz,pn   %icc, .xcopyin_small            ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .xcopyin_small            ! go to small copy
          nop
        ba,pt   %ncc, .xcopyin_more             ! otherwise go to large copy
          nop
.xcopyin_2:
        btst    3, %o3                          !
        bz,pt   %ncc, .xcopyin_4                ! check for word alignment
          nop
        sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_2)], %o3
        tst     %o3
        bz,pn   %icc, .xcopyin_small            ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .xcopyin_small            ! go to small copy
          nop
        ba,pt   %ncc, .xcopyin_more             ! otherwise go to large copy
          nop
.xcopyin_4:
        ! already checked longword, must be word aligned
        sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_4)], %o3
        tst     %o3
        bz,pn   %icc, .xcopyin_small            ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .xcopyin_small            ! go to small copy
          nop
        ba,pt   %ncc, .xcopyin_more             ! otherwise go to large copy
          nop
.xcopyin_8:
        sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_8)], %o3
        tst     %o3
        bz,pn   %icc, .xcopyin_small            ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .xcopyin_small            ! go to small copy
          nop
        ba,pt   %ncc, .xcopyin_more             ! otherwise go to large copy
          nop

.xcopyin_small:
        sethi   %hi(.sm_xcopyin_err), %o5  ! .sm_xcopyin_err is lofault value
        or      %o5, %lo(.sm_xcopyin_err), %o5
        ldn     [THREAD_REG + T_LOFAULT], %o4   ! set/save t_lofault
        membar  #Sync                           ! sync error barrier
        ba,pt   %ncc, .sm_do_copyin             ! common code
          stn   %o5, [THREAD_REG + T_LOFAULT]

.xcopyin_more:
        save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
        sethi   %hi(.xcopyin_err), REAL_LOFAULT ! .xcopyin_err is lofault value
        ba,pt   %ncc, .do_copyin                ! share FP copy path w/copyin
          or    REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT

/*
 * We got here because of fault during xcopyin
 * Errno value is in ERRNO
 */
.xcopyin_err:
        ldn     [THREAD_REG + T_COPYOPS], %o4   ! check for copyop handler
        tst     %o4
        bz,pt   %ncc, 2f                        ! if not, return error
          nop
        ldn     [%o4 + CP_XCOPYIN], %g2         ! if handler, invoke it with
        jmp     %g2                             ! original arguments
          restore %g0, 0, %g0                   ! dispose of copy window
2:
        ret
          restore ERRNO, 0, %o0                 ! return errno value

.sm_xcopyin_err:
        ! Fault in the leaf (small) copy path; errno was left in %g1.
        membar  #Sync
        stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        mov     SM_SAVE_SRC, %o0
        mov     SM_SAVE_DST, %o1
        mov     SM_SAVE_COUNT, %o2
        ldn     [THREAD_REG + T_COPYOPS], %o3   ! check for copyop handler
        tst     %o3
        bz,pt   %ncc, 3f                        ! if not, return error
          nop
        ldn     [%o3 + CP_XCOPYIN], %o5         ! if handler, invoke it with
        jmp     %o5                             ! original arguments
          nop
3:
        retl
          or    %g1, 0, %o0             ! return errno value

        SET_SIZE(xcopyin)
3185 
/*
 * xcopyin_little - copy %o2 bytes from user address %o0 into kernel
 * buffer %o1 through the little-endian user ASI (ASI_AIUSL).
 * The source is walked from its last byte down to its first while the
 * destination is filled from the front (src ptr -2, index +1 per pass),
 * so the net effect is dst[i] = src[count-1-i]: a byte-order-reversing
 * copy.  Returns 0 on success; on fault returns the value in %g1.
 */
        ENTRY(xcopyin_little)
        sethi   %hi(.xcopyio_err), %o5
        or      %o5, %lo(.xcopyio_err), %o5
        ldn     [THREAD_REG + T_LOFAULT], %o4
        membar  #Sync                           ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! install local fault handler
        mov     %o4, %o5                        ! keep old t_lofault in %o5

        subcc   %g0, %o2, %o3           ! %o3 = -count (shared loop index)
        add     %o0, %o2, %o0
        bz,pn   %ncc, 2f                ! check for zero bytes
          sub   %o2, 1, %o4
        add     %o0, %o4, %o0           ! start w/last byte
        add     %o1, %o2, %o1
        lduba   [%o0 + %o3]ASI_AIUSL, %o4

1:      stb     %o4, [%o1 + %o3]
        inccc   %o3
        sub     %o0, 2, %o0             ! get next byte
        bcc,a,pt %ncc, 1b               ! carry set when index reaches 0
          lduba [%o0 + %o3]ASI_AIUSL, %o4

2:
        membar  #Sync                           ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        retl
          mov   %g0, %o0                ! return (0)

.xcopyio_err:
        membar  #Sync                           ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        retl
          mov   %g1, %o0                ! return error value from %g1

        SET_SIZE(xcopyin_little)
3221 
3222 
3223 /*
3224  * Copy a block of storage - must not overlap (from + len <= to).
3225  * No fault handler installed (to be called under on_fault())
3226  */
/*
 * copyin_noerr - copy from user (%o0) to kernel (%o1), %o2 bytes, with
 * no fault handler of its own: the caller is expected to have set one
 * via on_fault().  Path selection (small vs. FP block copy) mirrors
 * copyin/xcopyin, driven by length, alignment and hw_copy_limit_N.
 */
        ENTRY(copyin_noerr)

        cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
        bleu,pt %ncc, .copyin_ne_small          ! go to larger cases
          xor   %o0, %o1, %o3                   ! are src, dst alignable?
        btst    7, %o3                          !
        bz,pt   %ncc, .copyin_ne_8              ! check for longword alignment
          nop
        btst    1, %o3                          !
        bz,pt   %ncc, .copyin_ne_2              ! check for half-word
          nop
        sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_1)], %o3
        tst     %o3
        bz,pn   %icc, .copyin_ne_small          ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .copyin_ne_small          ! go to small copy
          nop
        ba,pt   %ncc, .copyin_noerr_more        ! otherwise go to large copy
          nop
.copyin_ne_2:
        btst    3, %o3                          !
        bz,pt   %ncc, .copyin_ne_4              ! check for word alignment
          nop
        sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_2)], %o3
        tst     %o3
        bz,pn   %icc, .copyin_ne_small          ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .copyin_ne_small          ! go to small copy
          nop
        ba,pt   %ncc, .copyin_noerr_more        ! otherwise go to large copy
          nop
.copyin_ne_4:
        ! already checked longword, must be word aligned
        sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_4)], %o3
        tst     %o3
        bz,pn   %icc, .copyin_ne_small          ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .copyin_ne_small          ! go to small copy
          nop
        ba,pt   %ncc, .copyin_noerr_more        ! otherwise go to large copy
          nop
.copyin_ne_8:
        sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_8)], %o3
        tst     %o3
        bz,pn   %icc, .copyin_ne_small          ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .copyin_ne_small          ! go to small copy
          nop
        ba,pt   %ncc, .copyin_noerr_more        ! otherwise go to large copy
          nop

.copyin_ne_small:
        ldn     [THREAD_REG + T_LOFAULT], %o4
        tst     %o4                             ! no handler installed at all?
        bz,pn   %ncc, .sm_do_copyin
          nop
        sethi   %hi(.sm_copyio_noerr), %o5      ! else interpose, so we can
        or      %o5, %lo(.sm_copyio_noerr), %o5 ! forward faults to caller's
        membar  #Sync                           ! sync error barrier
        ba,pt   %ncc, .sm_do_copyin
          stn   %o5, [THREAD_REG + T_LOFAULT]   ! set/save t_lofault

.copyin_noerr_more:
        save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
        sethi   %hi(.copyio_noerr), REAL_LOFAULT
        ba,pt   %ncc, .do_copyin
          or    REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT

.copyio_noerr:
        ! Fault in the FP path: hand off to the handler saved in %l6 by
        ! .do_copyin, discarding this routine's register window.
        jmp     %l6
          restore %g0,0,%g0

.sm_copyio_noerr:
        ! Fault in the leaf path: restore the caller's t_lofault (%o4)
        ! and jump to it, replaying the fault to the on_fault() handler.
        membar  #Sync
        stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore t_lofault
        jmp     %o4
          nop

        SET_SIZE(copyin_noerr)
3310 
3311 /*
3312  * Copy a block of storage - must not overlap (from + len <= to).
3313  * No fault handler installed (to be called under on_fault())
3314  */
3315 
/*
 * copyout_noerr - copy from kernel (%o0) to user (%o1), %o2 bytes, with
 * no fault handler of its own: the caller is expected to have set one
 * via on_fault().  Faults are forwarded through .copyio_noerr /
 * .sm_copyio_noerr (shared with copyin_noerr).
 */
        ENTRY(copyout_noerr)

        cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
        bleu,pt %ncc, .copyout_ne_small         ! go to larger cases
          xor   %o0, %o1, %o3                   ! are src, dst alignable?
        btst    7, %o3                          !
        bz,pt   %ncc, .copyout_ne_8             ! check for longword alignment
          nop
        btst    1, %o3                          !
        bz,pt   %ncc, .copyout_ne_2             ! check for half-word
          nop
        sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_1)], %o3
        tst     %o3
        bz,pn   %icc, .copyout_ne_small         ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .copyout_ne_small         ! go to small copy
          nop
        ba,pt   %ncc, .copyout_noerr_more       ! otherwise go to large copy
          nop
.copyout_ne_2:
        btst    3, %o3                          !
        bz,pt   %ncc, .copyout_ne_4             ! check for word alignment
          nop
        sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_2)], %o3
        tst     %o3
        bz,pn   %icc, .copyout_ne_small         ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .copyout_ne_small         ! go to small copy
          nop
        ba,pt   %ncc, .copyout_noerr_more       ! otherwise go to large copy
          nop
.copyout_ne_4:
        ! already checked longword, must be word aligned
        sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_4)], %o3
        tst     %o3
        bz,pn   %icc, .copyout_ne_small         ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .copyout_ne_small         ! go to small copy
          nop
        ba,pt   %ncc, .copyout_noerr_more       ! otherwise go to large copy
          nop
.copyout_ne_8:
        sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
        ld      [%o3 + %lo(hw_copy_limit_8)], %o3
        tst     %o3
        bz,pn   %icc, .copyout_ne_small         ! if zero, disable HW copy
          cmp   %o2, %o3                        ! if length <= limit
        bleu,pt %ncc, .copyout_ne_small         ! go to small copy
          nop
        ba,pt   %ncc, .copyout_noerr_more       ! otherwise go to large copy
          nop

.copyout_ne_small:
        ldn     [THREAD_REG + T_LOFAULT], %o4
        tst     %o4                             ! no handler installed at all?
        bz,pn   %ncc, .sm_do_copyout
          nop
        sethi   %hi(.sm_copyio_noerr), %o5      ! else interpose so faults are
        or      %o5, %lo(.sm_copyio_noerr), %o5 ! forwarded to caller's handler
        membar  #Sync                           ! sync error barrier
        ba,pt   %ncc, .sm_do_copyout
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! set/save t_lofault

.copyout_noerr_more:
        save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
        sethi   %hi(.copyio_noerr), REAL_LOFAULT
        ba,pt   %ncc, .do_copyout
          or    REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT

        SET_SIZE(copyout_noerr)
3389 
3390 
3391 /*
3392  * hwblkclr - clears block-aligned, block-multiple-sized regions that are
3393  * longer than 256 bytes in length using spitfire's block stores.  If
3394  * the criteria for using this routine are not met then it calls bzero
3395  * and returns 1.  Otherwise 0 is returned indicating success.
3396  * Caller is responsible for ensuring use_hw_bzero is true and that
3397  * kpreempt_disable() has been called.
3398  */
3399         ! %i0 - start address
3400         ! %i1 - length of region (multiple of 64)
3401         ! %l0 - saved fprs
3402         ! %l1 - pointer to saved %d0 block
3403         ! %l2 - saved curthread->t_lwp
3404 
        ENTRY(hwblkclr)
        ! get another window w/space for one aligned block of saved fpregs
        save    %sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp

        ! Must be block-aligned
        andcc   %i0, (VIS_BLOCKSIZE-1), %g0
        bnz,pn  %ncc, 1f
          nop

        ! ... and must be 256 bytes or more
        cmp     %i1, 256
        blu,pn  %ncc, 1f
          nop

        ! ... and length must be a multiple of VIS_BLOCKSIZE
        andcc   %i1, (VIS_BLOCKSIZE-1), %g0
        bz,pn   %ncc, 2f
          nop

1:      ! punt, call bzero but notify the caller that bzero was used
        mov     %i0, %o0
        call    bzero
        mov     %i1, %o1
        ret
          restore       %g0, 1, %o0 ! return (1) - did not use block operations

2:      rd      %fprs, %l0              ! check for unused fp
        btst    FPRS_FEF, %l0
        bz,pt   %icc, 1f                ! FP not in use: nothing to save
          nop

        ! save in-use fpregs on stack
        membar  #Sync
        add     %fp, STACK_BIAS - 65, %l1       ! carve a VIS_BLOCKSIZE-aligned
        and     %l1, -VIS_BLOCKSIZE, %l1        ! save slot within the frame
        stda    %d0, [%l1]ASI_BLK_P

1:      membar  #StoreStore|#StoreLoad|#LoadStore
        wr      %g0, FPRS_FEF, %fprs
        wr      %g0, ASI_BLK_P, %asi

        ! Clear block
        fzero   %d0
        fzero   %d2
        fzero   %d4
        fzero   %d6
        fzero   %d8
        fzero   %d10
        fzero   %d12
        fzero   %d14

        mov     256, %i3                ! main loop clears 256 bytes/pass
        ba,pt   %ncc, .pz_doblock
          nop

        ! Each stda below clears one 64-byte block.  For the final <256-byte
        ! tail, .pz_doblock computes a jump back into the middle of this
        ! ladder so that exactly %i1/64 of the stores execute.
.pz_blkstart:
      ! stda    %d0, [%i0 + 192]%asi  ! in dly slot of branch that got us here
        stda    %d0, [%i0 + 128]%asi
        stda    %d0, [%i0 + 64]%asi
        stda    %d0, [%i0]%asi
.pz_zinst:
        add     %i0, %i3, %i0
        sub     %i1, %i3, %i1
.pz_doblock:
        cmp     %i1, 256
        bgeu,a  %ncc, .pz_blkstart      ! >= 256 left: full 4-store pass
          stda  %d0, [%i0 + 192]%asi

        cmp     %i1, 64
        blu     %ncc, .pz_finish

          andn  %i1, (64-1), %i3
        srl     %i3, 4, %i2             ! using blocks, 1 instr / 16 words
        set     .pz_zinst, %i4
        sub     %i4, %i2, %i4           ! back up %i2 bytes into the ladder
        jmp     %i4
          nop

.pz_finish:
        membar  #Sync
        btst    FPRS_FEF, %l0           ! were fpregs saved above?
        bz,a    .pz_finished
          wr    %l0, 0, %fprs           ! restore fprs

        ! restore fpregs from stack
        ldda    [%l1]ASI_BLK_P, %d0
        membar  #Sync
        wr      %l0, 0, %fprs           ! restore fprs

.pz_finished:
        ret
          restore       %g0, 0, %o0             ! return (bzero or not)

        SET_SIZE(hwblkclr)
3499 
3500         /*
3501          * Copy 32 bytes of data from src (%o0) to dst (%o1)
3502          * using physical addresses.
3503          */
        ENTRY_NP(hw_pa_bcopy32)
        ! Disable interrupts for the duration of the physical copy.
        rdpr    %pstate, %g1
        andn    %g1, PSTATE_IE, %g2
        wrpr    %g0, %g2, %pstate

        ! NOTE(review): read of %pstate into %g0 discards the value —
        ! presumably here to let the wrpr above take effect; confirm.
        rdpr    %pstate, %g0
        ! Load all 32 source bytes (4 x 8) via the physical-address ASI
        ! before any store is issued.
        ldxa    [%o0]ASI_MEM, %o2
        add     %o0, 8, %o0
        ldxa    [%o0]ASI_MEM, %o3
        add     %o0, 8, %o0
        ldxa    [%o0]ASI_MEM, %o4
        add     %o0, 8, %o0
        ldxa    [%o0]ASI_MEM, %o5
        membar  #Sync                   ! loads complete before stores start

        stxa    %o2, [%o1]ASI_MEM
        add     %o1, 8, %o1
        stxa    %o3, [%o1]ASI_MEM
        add     %o1, 8, %o1
        stxa    %o4, [%o1]ASI_MEM
        add     %o1, 8, %o1
        stxa    %o5, [%o1]ASI_MEM

        retl
          wrpr    %g0, %g1, %pstate     ! re-enable interrupts (orig %pstate)

        SET_SIZE(hw_pa_bcopy32)
3531 
        ! Global tunables.  use_hw_bcopy/use_hw_bzero (nonzero = enabled)
        ! gate the hardware block copy/clear paths; callers of hwblkclr
        ! must check use_hw_bzero themselves (see comment above hwblkclr).
        DGDEF(use_hw_bcopy)
        .word   1
        DGDEF(use_hw_bzero)
        .word   1
        ! hw_copy_limit_N: for copies whose src/dst share N-byte alignment,
        ! lengths <= the limit use the leaf (small) copy; 0 disables the
        ! HW (FP block) copy path entirely for that alignment class.
        DGDEF(hw_copy_limit_1)
        .word   0
        DGDEF(hw_copy_limit_2)
        .word   0
        DGDEF(hw_copy_limit_4)
        .word   0
        DGDEF(hw_copy_limit_8)
        .word   0

        .align  64
        .section ".text"                ! switch back to text after data