1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License, Version 1.0 only
   6  * (the "License").  You may not use this file except in compliance
   7  * with the License.
   8  *
   9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10  * or http://www.opensolaris.org/os/licensing.
  11  * See the License for the specific language governing permissions
  12  * and limitations under the License.
  13  *
  14  * When distributing Covered Code, include this CDDL HEADER in each
  15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16  * If applicable, add the following below this CDDL HEADER, with the
  17  * fields enclosed by brackets "[]" replaced with your own identifying
  18  * information: Portions Copyright [yyyy] [name of copyright owner]
  19  *
  20  * CDDL HEADER END
  21  */
  22 /*
  23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #pragma ident   "%Z%%M% %I%     %E% SMI"
  28 
  29 #include <sys/param.h>
  30 #include <sys/errno.h>
  31 #include <sys/asm_linkage.h>
  32 #include <sys/vtrace.h>
  33 #include <sys/machthread.h>
  34 #include <sys/clock.h>
  35 #include <sys/asi.h>
  36 #include <sys/fsr.h>
  37 #include <sys/privregs.h>
  38 #include <sys/fpras_impl.h>
  39 
  40 #if !defined(lint)
  41 #include "assym.h"
  42 #endif  /* lint */
  43 
  44 /*
  45  * Pseudo-code to aid in understanding the control flow of the
  46  * bcopy/copyin/copyout routines.
  47  *
  48  * On entry:
  49  *
  50  *      ! Determine whether to use the FP register version
  51  *      ! or the leaf routine version depending on size
  52  *      ! of copy and flags.  Set up error handling accordingly.
  53  *      ! The transition point depends on whether the src and
  54  *      ! dst addresses can be aligned to long word, word,
  55  *      ! half word, or byte boundaries.
  56  *      !
  57  *      ! WARNING: <Register usage convention>
  58  *      ! For FP version, %l6 holds previous error handling and
  59  *      ! a flag: TRAMP_FLAG (low bits)
  60  *      ! for leaf routine version, %o4 holds those values.
  61  *      ! So either %l6 or %o4 is reserved and not available for
  62  *      ! any other use.
  63  *
  64  *      if (length <= VIS_COPY_THRESHOLD)    ! start with a quick test
  65  *              go to small_copy;               ! to speed short copies
 *
 *      if (src,dst long word alignable) {
  68  *              if (hw_copy_limit_8 == 0)       ! hw_copy disabled
  69  *                      go to small_copy;
  70  *              if (length <= hw_copy_limit_8)
  71  *                      go to small_copy;
  72  *              go to FPBLK_copy;
  73  *      }
  74  *      if (src,dst not alignable) {
  75  *              if (hw_copy_limit_1 == 0)       ! hw_copy disabled
  76  *                      go to small_copy;
  77  *              if (length <= hw_copy_limit_1)
  78  *                      go to small_copy;
  79  *              go to FPBLK_copy;
  80  *      }
  81  *      if (src,dst halfword alignable) {
  82  *              if (hw_copy_limit_2 == 0)       ! hw_copy disabled
  83  *                      go to small_copy;
  84  *              if (length <= hw_copy_limit_2)
  85  *                      go to small_copy;
  86  *              go to FPBLK_copy;
  87  *      }
  88  *      if (src,dst word alignable) {
  89  *              if (hw_copy_limit_4 == 0)       ! hw_copy disabled
  90  *                      go to small_copy;
  91  *              if (length <= hw_copy_limit_4)
  92  *                      go to small_copy;
  93  *              go to FPBLK_copy;
  94  *      }
  95  *
  96  * small_copy:
  97  *      Setup_leaf_rtn_error_handler;           ! diffs for each entry point
  98  *      
  99  *      if (count <= 3)                              ! fast path for tiny copies
 100  *              go to sm_left;                  ! special finish up code
 101  *      else
 *              if (count > CHKSIZE)            ! medium sized copies
 *                      go to sm_med;           ! tuned by alignment
 *              if (src&dst not both word aligned) {
 105  *      sm_movebytes:
 106  *                      move byte by byte in 4-way unrolled loop
 107  *                      fall into sm_left;
 108  *      sm_left:
 109  *                      move 0-3 bytes byte at a time as needed.
 110  *                      restore error handler and exit.
 111  *
 112  *              } else {        ! src&dst are word aligned
 113  *                      check for at least 8 bytes left,
 114  *                      move word at a time, unrolled by 2
 115  *                      when fewer than 8 bytes left,
 116  *      sm_half:        move half word at a time while 2 or more bytes left
 117  *      sm_byte:        move final byte if necessary
 118  *      sm_exit:
 119  *                      restore error handler and exit.
 120  *              }
 121  *
 122  * ! Medium length cases with at least CHKSIZE bytes available
 123  * ! method: line up src and dst as best possible, then
 124  * ! move data in 4-way unrolled loops.
 125  *
 126  * sm_med:
 127  *      if(src&dst unalignable)
 128  *              go to sm_movebytes
 129  *      if(src&dst halfword alignable)
 130  *              go to sm_movehalf
 131  *      if(src&dst word alignable)
 132  *              go to sm_moveword
 133  * ! fall into long word movement
 134  *      move bytes until src is word aligned
 135  *      if not long word aligned, move a word
 136  *      move long words in 4-way unrolled loop until < 32 bytes left
 137  *      move long words in 1-way unrolled loop until < 8 bytes left
 138  *      if zero bytes left, goto sm_exit
 139  *      if one byte left, go to sm_byte
 140  *      else go to sm_half
 141  *
 142  * sm_moveword:
 143  *      move bytes until src is word aligned
 144  *      move words in 4-way unrolled loop until < 16 bytes left
 145  *      move words in 1-way unrolled loop until < 4 bytes left
 146  *      if zero bytes left, goto sm_exit
 147  *      if one byte left, go to sm_byte
 148  *      else go to sm_half
 149  *
 150  * sm_movehalf:
 151  *      move a byte if needed to align src on halfword
 152  *      move halfwords in 4-way unrolled loop until < 8 bytes left
 153  *      if zero bytes left, goto sm_exit
 154  *      if one byte left, go to sm_byte
 155  *      else go to sm_half
 156  *
 157  *
 158  * FPBLK_copy:
 159  *      %l6 = curthread->t_lofault;
 160  *      if (%l6 != NULL) {
 161  *              membar #Sync
 162  *              curthread->t_lofault = .copyerr;
 163  *              caller_error_handler = TRUE             ! %l6 |= 2
 164  *      }
 165  *
 166  *      ! for FPU testing we must not migrate cpus
 167  *      if (curthread->t_lwp == NULL) {
 168  *              ! Kernel threads do not have pcb's in which to store
 169  *              ! the floating point state, so disallow preemption during
 170  *              ! the copy.  This also prevents cpu migration.
 171  *              kpreempt_disable(curthread);
 172  *      } else {
 173  *              thread_nomigrate();
 174  *      }
 175  *
 176  *      old_fprs = %fprs;
 177  *      old_gsr = %gsr;
 178  *      if (%fprs.fef) {
 179  *              %fprs.fef = 1;
 180  *              save current fpregs on stack using blockstore
 181  *      } else {
 182  *              %fprs.fef = 1;
 183  *      }
 184  *
 185  *
 186  *      do_blockcopy_here;
 187  *
 188  * In lofault handler:
 189  *      curthread->t_lofault = .copyerr2;
 190  *      Continue on with the normal exit handler
 191  *
 192  * On normal exit:
 193  *      %gsr = old_gsr;
 194  *      if (old_fprs & FPRS_FEF)
 195  *              restore fpregs from stack using blockload
 196  *      else
 197  *              zero fpregs
 198  *      %fprs = old_fprs;
 199  *      membar #Sync
 200  *      curthread->t_lofault = (%l6 & ~3);
 201  *      ! following test omitted from copyin/copyout as they
 202  *      ! will always have a current thread
 203  *      if (curthread->t_lwp == NULL)
 204  *              kpreempt_enable(curthread);
 205  *      else
 206  *              thread_allowmigrate();
 207  *      return (0)
 208  *
 209  * In second lofault handler (.copyerr2):
 *      We've tried to restore fp state from the stack and failed.  To
 *      avoid returning with a corrupted fp state, we panic.
 212  */
 213 
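/*
 * The dispatch logic above can be summarized in C.  This is an
 * illustrative sketch only -- the real decision is made in the
 * hand-scheduled assembly that follows, and limit_for() is a
 * hypothetical helper standing in for the hw_copy_limit_* tests:
 *
 *      uint_t
 *      limit_for(uintptr_t src, uintptr_t dst)
 *      {
 *              uintptr_t x = src ^ dst;        ! low bits that differ
 *
 *              if ((x & 7) == 0)
 *                      return (hw_copy_limit_8);
 *              if (x & 1)
 *                      return (hw_copy_limit_1);
 *              if (x & 2)
 *                      return (hw_copy_limit_2);
 *              return (hw_copy_limit_4);
 *      }
 *
 *      if (len <= VIS_COPY_THRESHOLD || limit_for(src, dst) == 0 ||
 *          len <= limit_for(src, dst))
 *              small_copy();
 *      else
 *              FPBLK_copy();
 */
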
 214 /*
 215  * Comments about optimization choices
 216  *
 217  * The initial optimization decision in this code is to determine
 218  * whether to use the FP registers for a copy or not.  If we don't
 219  * use the FP registers, we can execute the copy as a leaf routine,
 220  * saving a register save and restore.  Also, less elaborate setup
 221  * is required, allowing short copies to be completed more quickly.
 222  * For longer copies, especially unaligned ones (where the src and
 223  * dst do not align to allow simple ldx,stx operation), the FP
 224  * registers allow much faster copy operations.
 225  *
 226  * The estimated extra cost of the FP path will vary depending on
 227  * src/dst alignment, dst offset from the next 64 byte FPblock store
 228  * boundary, remaining src data after the last full dst cache line is
 * moved, whether the FP registers need to be saved, and some other
 * minor issues.  The average additional overhead is estimated to be
 * 400 clocks.  Since each non-repeated/predicted tst and branch costs
 * around 10 clocks, an elaborate calculation would slow down all
 * longer copies and only benefit a small portion of medium sized
 234  * copies.  Rather than incur such cost, we chose fixed transition
 235  * points for each of the alignment choices.
 236  *
 237  * For the inner loop, here is a comparison of the per cache line
 238  * costs for each alignment when src&dst are in cache:
 239  * 
 240  * byte aligned:  108 clocks slower for non-FPBLK
 241  * half aligned:   44 clocks slower for non-FPBLK
 242  * word aligned:   12 clocks slower for non-FPBLK
 243  * long aligned:    4 clocks >>faster<< for non-FPBLK
 244  *
 245  * The long aligned loop runs faster because it does no prefetching.
 246  * That wins if the data is not in cache or there is too little
 247  * data to gain much benefit from prefetching.  But when there
 248  * is more data and that data is not in cache, failing to prefetch
 249  * can run much slower.  In addition, there is a 2 Kbyte store queue
 250  * which will cause the non-FPBLK inner loop to slow for larger copies.
 251  * The exact tradeoff is strongly load and application dependent, with
 252  * increasing risk of a customer visible performance regression if the
 253  * non-FPBLK code is used for larger copies. Studies of synthetic in-cache
 254  * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
 255  * upper limit for the non-FPBLK code.  To minimize performance regression
 256  * risk while still gaining the primary benefits of the improvements to 
 257  * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
 258  * hw_copy_limit_*.  Later experimental studies using different values 
 259  * of hw_copy_limit_* can be used to make further adjustments if 
 260  * appropriate.
 261  *
 262  * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
 263  * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
 264  * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
 265  * hw_copy_limit_8 = src and dst are longword aligned
 266  *
 267  * To say that src and dst are word aligned means that after
 268  * some initial alignment activity of moving 0 to 3 bytes,
 269  * both the src and dst will be on word boundaries so that
 270  * word loads and stores may be used.
 271  *
 * Recommended initial values as of Mar 2004, based on testing
 * on Cheetah+ (900MHz), Cheetah++ (1200MHz), and Jaguar (1050MHz):
 274  * hw_copy_limit_1 =  256
 275  * hw_copy_limit_2 =  512
 276  * hw_copy_limit_4 = 1024
 277  * hw_copy_limit_8 = 1024 (or 1536 on some systems)
 278  *
 279  *
 280  * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
 281  * disabled for that alignment choice.
 282  * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256)
 283  * the value of VIS_COPY_THRESHOLD is used.
 * It is not envisioned that hw_copy_limit_? will be changed in the field.
 285  * It is provided to allow for disabling FPBLK copies and to allow
 286  * easy testing of alternate values on future HW implementations
 287  * that might have different cache sizes, clock rates or instruction
 288  * timing rules.
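 *
 * For example (values hypothetical), a limit could be disabled or
 * retuned from /etc/system, the usual way such kernel variables
 * are patched:
 *
 *      set hw_copy_limit_8 = 0         ! force non-FPBLK for this class
 *      set hw_copy_limit_4 = 0x600     ! word-aligned limit of 1536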
 289  *
 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
 * threshold to speed up all shorter copies (less than 256).  That
 292  * saves an alignment test, memory reference, and enabling test
 293  * for all short copies, or an estimated 24 clocks.
 294  *
 295  * The order in which these limits are checked does matter since each
 296  * non-predicted tst and branch costs around 10 clocks.
 * If src and dst are randomly selected addresses, the low three
 * bits of src ^ dst are equally likely to take any of 8 values, so:
 * 4 of 8 will not be alignable.
 299  * 2 of 8 will be half word alignable.
 300  * 1 of 8 will be word alignable.
 301  * 1 of 8 will be long word alignable.
 * But, tests on running kernels show that the src and dst addresses
 * passed to the copy code are typically not random.  Structure copies and
 304  * copies of larger data sizes are often on long word boundaries.
 305  * So we test the long word alignment case first, then
 306  * the byte alignment, then halfword, then word alignment.
 307  *
 308  * Several times, tests for length are made to split the code
 309  * into subcases.  These tests often allow later tests to be
 310  * avoided.  For example, within the non-FPBLK copy, we first 
 311  * check for tiny copies of 3 bytes or less.  That allows us
 312  * to use a 4-way unrolled loop for the general byte copy case
 313  * without a test on loop entry.
 * We subdivide the non-FPBLK case further into CHKSIZE bytes or less
 * vs longer cases.  For the really short case, we don't attempt
 * to align src and dst.  We try to minimize special case tests in
 317  * the shortest loops as each test adds a significant percentage
 318  * to the total time.
 319  *
 320  * For the medium sized cases, we allow ourselves to adjust the
 321  * src and dst alignment and provide special cases for each of
 322  * the four adjusted alignment cases. The CHKSIZE that was used
 323  * to decide between short and medium size was chosen to be 39
 324  * as that allows for the worst case of 7 bytes of alignment
 325  * shift and 4 times 8 bytes for the first long word unrolling.
 326  * That knowledge saves an initial test for length on entry into
 * the medium cases.  If the general loop unrolling factor were
 * to be increased, this number would also need to be adjusted.
 329  *
 330  * For all cases in the non-FPBLK code where it is known that at
 331  * least 4 chunks of data are available for movement, the
 332  * loop is unrolled by four.  This 4-way loop runs in 8 clocks
 333  * or 2 clocks per data element.  Due to limitations of the
 334  * branch instruction on Cheetah, Jaguar, and Panther, the
 335  * minimum time for a small, tight loop is 3 clocks.  So
 336  * the 4-way loop runs 50% faster than the fastest non-unrolled
 337  * loop.
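 *
 * In C-like terms the 4-way unrolled byte loop is (sketch, mirroring
 * .bc_sm_notalign4 below):
 *
 *      count -= 3;                     ! bias count for the cc zero test
 *      do {
 *              dst[0] = src[0]; dst[1] = src[1];
 *              dst[2] = src[2]; dst[3] = src[3];
 *              src += 4; dst += 4;
 *      } while ((count -= 4) > 0);
 *      count += 3;                     ! restore remainder (0 to 3 bytes)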
 338  *
 * Instruction alignment is forced by use of .align 16 directives
 * and nops which are not executed in the code.  This
 * combination of operations shifts the alignment of following
 * loops to ensure that loops are aligned so that their instructions
 343  * fall within the minimum number of 4 instruction fetch groups. 
 * If instructions are inserted or removed between the .align
 * directive and the unrolled loops, then the alignment needs
 346  * to be readjusted.  Misaligned loops can add a clock per loop
 347  * iteration to the loop timing.
 348  *
 349  * In a few cases, code is duplicated to avoid a branch.  Since
 350  * a non-predicted tst and branch takes 10 clocks, this savings
 351  * is judged an appropriate time-space tradeoff.
 352  *
 353  * Within the FPBLK-code, the prefetch method in the inner
 354  * loop needs to be explained as it is not standard.  Two 
 355  * prefetches are issued for each cache line instead of one.
 356  * The primary one is at the maximum reach of 8 cache lines.
 357  * Most of the time, that maximum prefetch reach gives the
 358  * cache line more time to reach the processor for systems with
 359  * higher processor clocks.  But, sometimes memory interference
 360  * can cause that prefetch to be dropped.  Putting a second
 361  * prefetch at a reach of 5 cache lines catches the drops
 362  * three iterations later and shows a measured improvement
 363  * in performance over any similar loop with a single prefetch.
 364  * The prefetches are placed in the loop so they overlap with 
 365  * non-memory instructions, so that there is no extra cost 
 366  * when the data is already in-cache.
 367  *
 368  */
 369 
 370 /*
 371  * Notes on preserving existing fp state and on membars.
 372  *
 373  * When a copyOP decides to use fp we may have to preserve existing
 374  * floating point state.  It is not the caller's state that we need to
 375  * preserve - the rest of the kernel does not use fp and, anyway, fp
 376  * registers are volatile across a call.  Some examples:
 377  *
 378  *      - userland has fp state and is interrupted (device interrupt 
 379  *        or trap) and within the interrupt/trap handling we use
 380  *        bcopy()
 381  *      - another (higher level) interrupt or trap handler uses bcopy
 382  *        while a bcopy from an earlier interrupt is still active
 383  *      - an asynchronous error trap occurs while fp state exists (in
 384  *        userland or in kernel copy) and the tl0 component of the handling
 385  *        uses bcopy
 386  *      - a user process with fp state incurs a copy-on-write fault and
 387  *        hwblkpagecopy always uses fp
 388  *
 389  * We therefore need a per-call place in which to preserve fp state -
 390  * using our stack is ideal (and since fp copy cannot be leaf optimized
 391  * because of calls it makes, this is no hardship).
 392  *
 393  * The following membar BLD/BST discussion is Cheetah pipeline specific.
 394  * In Cheetah BLD is blocking, #LoadLoad/#LoadStore/#StoreStore are
 395  * nops (those semantics always apply) and #StoreLoad is implemented
 396  * as a membar #Sync.
 397  *
 398  * It is possible that the owner of the fp state has a block load or
 399  * block store still "in flight" at the time we come to preserve that
 400  * state.  Block loads are blocking in Cheetah pipelines so we do not
 401  * need to sync with them.  In preserving fp regs we will use block stores
 402  * (which are not blocking in Cheetah pipelines) so we require a membar #Sync
 403  * after storing state (so that our subsequent use of those registers
 404  * does not modify them before the block stores complete);  this membar
 405  * also serves to sync with block stores the owner of the fp state has
 406  * initiated.
 407  *
 * When we have finished the fp copy (with its repeated block stores)
 409  * we must membar #Sync so that our block stores may complete before
 410  * we either restore the original fp state into the fp registers or
 411  * return to a caller which may initiate other fp operations that could
 412  * modify the fp regs we used before the block stores complete.
 413  *
 414  * Synchronous faults (eg, unresolvable DMMU miss) that occur while
 415  * t_lofault is not NULL will not panic but will instead trampoline
 416  * to the registered lofault handler.  There is no need for any
 417  * membars for these - eg, our store to t_lofault will always be visible to
 418  * ourselves and it is our cpu which will take any trap.
 419  *
 420  * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
 421  * while t_lofault is not NULL will also not panic.  Since we're copying
 422  * to or from userland the extent of the damage is known - the destination
 423  * buffer is incomplete.  So trap handlers will trampoline to the lofault
 424  * handler in this case which should take some form of error action to
 425  * avoid using the incomplete buffer.  The trap handler also flags the
 426  * fault so that later return-from-trap handling (for the trap that brought
 427  * this thread into the kernel in the first place) can notify the process
 428  * and reboot the system (or restart the service with Greenline/Contracts).
 429  *
 430  * Asynchronous faults (eg, uncorrectable ECC error from memory) can
 431  * result in deferred error traps - the trap is taken sometime after
 432  * the event and the trap PC may not be the PC of the faulting access.
 433  * Delivery of such pending traps can be forced by a membar #Sync, acting
 434  * as an "error barrier" in this role.  To accurately apply the user/kernel
 435  * separation described in the preceding paragraph we must force delivery
 436  * of deferred traps affecting kernel state before we install a lofault
 437  * handler (if we interpose a new lofault handler on an existing one there
 438  * is no need to repeat this), and we must force delivery of deferred
 439  * errors affecting the lofault-protected region before we clear t_lofault.
 * Failure to do so results in lost kernel state being interpreted as
 * affecting a copyin/copyout only, or in an error that really only
 * affects copy data being interpreted as losing kernel state.
 443  *
 444  * Since the copy operations may preserve and later restore floating
 445  * point state that does not belong to the caller (see examples above),
 446  * we must be careful in how we do this in order to prevent corruption
 447  * of another program.
 448  *
 449  * To make sure that floating point state is always saved and restored
 450  * correctly, the following "big rules" must be followed when the floating
 451  * point registers will be used:
 452  *
 453  * 1. %l6 always holds the caller's lofault handler.  Also in this register,
 454  *    Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
 455  *    use.  Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a
 456  *    lofault handler was set coming in.
 457  *
 458  * 2. The FPUSED flag indicates that all FP state has been successfully stored
 459  *    on the stack.  It should not be set until this save has been completed.
 460  *
 461  * 3. The FPUSED flag should not be cleared on exit until all FP state has
 462  *    been restored from the stack.  If an error occurs while restoring
 463  *    data from the stack, the error handler can check this flag to see if
 464  *    a restore is necessary.
 465  *
 466  * 4. Code run under the new lofault handler must be kept to a minimum.  In
 467  *    particular, any calls to FP_ALLOWMIGRATE, which could result in a call
 468  *    to kpreempt(), should not be made until after the lofault handler has
 469  *    been restored.
 470  */
 471 
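/*
 * A sketch of rules 2 and 3 (C-like; the flag travels in %l6 beside
 * the saved lofault handler):
 *
 *      BST_FPQ1Q3_TOSTACK(tmp);        ! save must complete first ...
 *      l6 |= FPUSED_FLAG;              ! ... only then set the flag
 *      ...
 *      if (l6 & FPUSED_FLAG)           ! error path: restore only what
 *              BLD_FPQ1Q3_FROMSTACK(tmp);      ! was actually saved
 */
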
 472 /*
 473  * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
 474  * to "break even" using FP/VIS-accelerated memory operations.
 475  * The FPBLK code assumes a minimum number of bytes are available
 476  * to be moved on entry.  Check that code carefully before 
 477  * reducing VIS_COPY_THRESHOLD below 256.
 478  */
 479 /*
 480  * This shadows sys/machsystm.h which can't be included due to the lack of
 481  * _ASM guards in include files it references. Change it here, change it there.
 482  */
 483 #define VIS_COPY_THRESHOLD 256
 484 
 485 /*
 486  * TEST for very short copies
 487  * Be aware that the maximum unroll for the short unaligned case
 488  * is SHORTCOPY+1
 489  */
 490 #define SHORTCOPY 3
 491 #define CHKSIZE  39
 492 
 493 /*
 494  * Indicates that we're to trampoline to the error handler.
 495  * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
 496  * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
 497  */
 498 #define FPUSED_FLAG     1
 499 #define TRAMP_FLAG      2
 500 #define MASK_FLAGS      3
 501 
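/*
 * In C-like terms (sketch), the saved handler and the flags share a
 * single register-width value:
 *
 *      saved = old_t_lofault | TRAMP_FLAG;     ! if we must trampoline
 *      ...
 *      old_t_lofault = saved & ~MASK_FLAGS;    ! recovered on exit
 */
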
 502 /*
 503  * Number of outstanding prefetches.
 504  * Testing with 1200 MHz Cheetah+ and Jaguar gives best results with
 505  * two prefetches, one with a reach of 8*BLOCK_SIZE+8 and one with a
 * reach of 5*BLOCK_SIZE.  The double prefetch gives a typical improvement
 507  * of 5% for large copies as compared to a single prefetch.  The reason
 508  * for the improvement is that with Cheetah and Jaguar, some prefetches
 509  * are dropped due to the prefetch queue being full.  The second prefetch
 510  * reduces the number of cache lines that are dropped. 
 511  * Do not remove the double prefetch or change either CHEETAH_PREFETCH
 512  * or CHEETAH_2ND_PREFETCH without extensive performance tests to prove
 513  * there is no loss of performance.
 514  */
 515 #define CHEETAH_PREFETCH        8
 516 #define CHEETAH_2ND_PREFETCH    5
 517 
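/*
 * A C sketch of the double-prefetch pattern, using GCC-style
 * __builtin_prefetch() as a stand-in for the SPARC prefetch
 * instruction (the FPBLK loop below issues real prefetches at
 * these reaches); copy_line() is hypothetical:
 *
 *      for (i = 0; i < nlines; i++) {
 *              __builtin_prefetch(src + (i + CHEETAH_PREFETCH) * 64);
 *              __builtin_prefetch(src + (i + CHEETAH_2ND_PREFETCH) * 64);
 *              copy_line(dst + i * 64, src + i * 64);
 *      }
 */
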
 518 #define VIS_BLOCKSIZE           64
 519 
 520 /*
 * Size of stack frame in order to accommodate a 64-byte aligned
 * floating-point register save area and 2 64-bit temp locations.
 * All copy functions use two quadrants of fp registers; to assure a
 * block-aligned two block buffer in which to save we must reserve
 * three blocks on stack.  Not all functions preserve %fprs on stack
 * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
 527  *
 528  *    _______________________________________ <-- %fp + STACK_BIAS
 529  *    | We may need to preserve 2 quadrants |
 530  *    | of fp regs, but since we do so with |
 531  *    | BST/BLD we need room in which to    |
 532  *    | align to VIS_BLOCKSIZE bytes.  So   |
 533  *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
 534  *    |-------------------------------------|
 535  *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
 536  *    |-------------------------------------|
 537  *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
 538  *    ---------------------------------------
 539  */
 540 #define HWCOPYFRAMESIZE         ((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
 541 #define SAVED_FPREGS_OFFSET     (VIS_BLOCKSIZE * 3)
 542 #define SAVED_FPREGS_ADJUST     ((VIS_BLOCKSIZE * 2) - 1)
 543 #define SAVED_FPRS_OFFSET       (SAVED_FPREGS_OFFSET + 8)
 544 #define SAVED_GSR_OFFSET        (SAVED_FPRS_OFFSET + 8)
 545 
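/*
 * With VIS_BLOCKSIZE == 64 the layout above works out to:
 *      HWCOPYFRAMESIZE     = 64 * 3 + 2 * 8 = 208 bytes
 *      SAVED_FPREGS_OFFSET = 192
 *      SAVED_FPRS_OFFSET   = 200
 *      SAVED_GSR_OFFSET    = 208
 * all measured back from %fp + STACK_BIAS.
 */
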
 546 /*
 547  * Common macros used by the various versions of the block copy
 548  * routines in this file.
 549  */
 550 
 551 /*
 552  * In FP copies if we do not have preserved data to restore over
 553  * the fp regs we used then we must zero those regs to avoid
 554  * exposing portions of the data to later threads (data security).
 555  *
 556  * Copy functions use either quadrants 1 and 3 or 2 and 4.
 557  *
 558  * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
 559  * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
 560  *
 561  * The instructions below are quicker than repeated fzero instructions
 562  * since they can dispatch down two fp pipelines.
 563  */
 564 #define FZEROQ1Q3                       \
 565         fzero   %f0                     ;\
 566         fzero   %f2                     ;\
 567         faddd   %f0, %f2, %f4           ;\
 568         fmuld   %f0, %f2, %f6           ;\
 569         faddd   %f0, %f2, %f8           ;\
 570         fmuld   %f0, %f2, %f10          ;\
 571         faddd   %f0, %f2, %f12          ;\
 572         fmuld   %f0, %f2, %f14          ;\
 573         faddd   %f0, %f2, %f32          ;\
 574         fmuld   %f0, %f2, %f34          ;\
 575         faddd   %f0, %f2, %f36          ;\
 576         fmuld   %f0, %f2, %f38          ;\
 577         faddd   %f0, %f2, %f40          ;\
 578         fmuld   %f0, %f2, %f42          ;\
 579         faddd   %f0, %f2, %f44          ;\
 580         fmuld   %f0, %f2, %f46
 581 
 582 #define FZEROQ2Q4                       \
 583         fzero   %f16                    ;\
 584         fzero   %f18                    ;\
 585         faddd   %f16, %f18, %f20        ;\
 586         fmuld   %f16, %f18, %f22        ;\
 587         faddd   %f16, %f18, %f24        ;\
 588         fmuld   %f16, %f18, %f26        ;\
 589         faddd   %f16, %f18, %f28        ;\
 590         fmuld   %f16, %f18, %f30        ;\
 591         faddd   %f16, %f18, %f48        ;\
 592         fmuld   %f16, %f18, %f50        ;\
 593         faddd   %f16, %f18, %f52        ;\
 594         fmuld   %f16, %f18, %f54        ;\
 595         faddd   %f16, %f18, %f56        ;\
 596         fmuld   %f16, %f18, %f58        ;\
 597         faddd   %f16, %f18, %f60        ;\
 598         fmuld   %f16, %f18, %f62
 599 
 600 /*
 601  * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
 602  * Used to save and restore in-use fp registers when we want to use FP
 603  * and find fp already in use and copy size still large enough to justify
 604  * the additional overhead of this save and restore.
 605  *
 606  * A membar #Sync is needed before save to sync fp ops initiated before
 607  * the call to the copy function (by whoever has fp in use); for example
 608  * an earlier block load to the quadrant we are about to save may still be
 609  * "in flight".  A membar #Sync is required at the end of the save to
 610  * sync our block store (the copy code is about to begin ldd's to the
 611  * first quadrant).  Note, however, that since Cheetah pipeline block load
 612  * is blocking we can omit the initial membar before saving fp state (they're
 613  * commented below in case of future porting to a chip that does not block
 614  * on block load).
 615  *
 616  * Similarly: a membar #Sync before restore allows the block stores of
 617  * the copy operation to complete before we fill the quadrants with their
 618  * original data, and a membar #Sync after restore lets the block loads
 619  * of the restore complete before we return to whoever has the fp regs
 620  * in use.  To avoid repeated membar #Sync we make it the responsibility
 621  * of the copy code to membar #Sync immediately after copy is complete
 622  * and before using the BLD_*_FROMSTACK macro.
 623  */
 624 #if !defined(lint)
 625 #define BST_FPQ1Q3_TOSTACK(tmp1)                                \
 626         /* membar #Sync */                                      ;\
 627         add     %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1     ;\
 628         and     tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */    ;\
 629         stda    %f0, [tmp1]ASI_BLK_P                            ;\
 630         add     tmp1, VIS_BLOCKSIZE, tmp1                       ;\
 631         stda    %f32, [tmp1]ASI_BLK_P                           ;\
 632         membar  #Sync
 633 
 634 #define BLD_FPQ1Q3_FROMSTACK(tmp1)                              \
 635         /* membar #Sync - provided at copy completion */        ;\
 636         add     %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1     ;\
 637         and     tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */    ;\
 638         ldda    [tmp1]ASI_BLK_P, %f0                            ;\
 639         add     tmp1, VIS_BLOCKSIZE, tmp1                       ;\
 640         ldda    [tmp1]ASI_BLK_P, %f32                           ;\
 641         membar  #Sync
 642 
 643 #define BST_FPQ2Q4_TOSTACK(tmp1)                                \
 644         /* membar #Sync */                                      ;\
 645         add     %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1     ;\
 646         and     tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */    ;\
 647         stda    %f16, [tmp1]ASI_BLK_P                           ;\
 648         add     tmp1, VIS_BLOCKSIZE, tmp1                       ;\
 649         stda    %f48, [tmp1]ASI_BLK_P                           ;\
 650         membar  #Sync
 651 
 652 #define BLD_FPQ2Q4_FROMSTACK(tmp1)                              \
 653         /* membar #Sync - provided at copy completion */        ;\
 654         add     %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1     ;\
 655         and     tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */    ;\
 656         ldda    [tmp1]ASI_BLK_P, %f16                           ;\
 657         add     tmp1, VIS_BLOCKSIZE, tmp1                       ;\
 658         ldda    [tmp1]ASI_BLK_P, %f48                           ;\
 659         membar  #Sync
 660 #endif
 661 
 662 /*
 663  * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
 664  * prevent preemption if there is no t_lwp to save FP state to on context
 665  * switch) before commencing a FP copy, and reallow it on completion or
 666  * in error trampoline paths when we were using FP copy.
 667  *
 668  * Both macros may call other functions, so be aware that all outputs are
 669  * forfeit after using these macros.  For this reason we do not pass registers
 670  * to use - we just use any outputs we want.
 671  *
 672  * For fpRAS we need to perform the fpRAS mechanism test on the same
 673  * CPU as we use for the copy operation, both so that we validate the
 674  * CPU we perform the copy on and so that we know which CPU failed
 675  * if a failure is detected.  Hence we need to be bound to "our" CPU.
 * This could be achieved through disabling preemption (and we do it that
 * way for threads with no t_lwp) but for larger copies this may hold
 678  * higher priority threads off of cpu for too long (eg, realtime).  So we
 679  * make use of the lightweight t_nomigrate mechanism where we can (ie, when
 680  * we have a t_lwp).
 681  *
 682  * Pseudo code:
 683  *
 684  * FP_NOMIGRATE:
 685  *
 686  * if (curthread->t_lwp) {
 687  *      thread_nomigrate();
 688  * } else {
 689  *      kpreempt_disable();
 690  * }
 691  *
 692  * FP_ALLOWMIGRATE:
 693  *
 694  * if (curthread->t_lwp) {
 695  *      thread_allowmigrate();
 696  * } else {
 697  *      kpreempt_enable();
 698  * }
 699  */
 700 
 701 #define FP_NOMIGRATE(label1, label2)                            \
 702         ldn     [THREAD_REG + T_LWP], %o0                       ;\
 703         brz,a,pn %o0, label1/**/f                               ;\
 704           ldsb  [THREAD_REG + T_PREEMPT], %o1                   ;\
 705         call    thread_nomigrate                                ;\
 706           nop                                                   ;\
 707         ba      label2/**/f                                     ;\
 708           nop                                                   ;\
 709 label1:                                                         ;\
 710         inc     %o1                                             ;\
 711         stb     %o1, [THREAD_REG + T_PREEMPT]                   ;\
 712 label2:
 713 
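/*
 * Note that the label1 path in FP_NOMIGRATE is an inlined
 * kpreempt_disable() (it simply increments t_preempt); the
 * corresponding path in FP_ALLOWMIGRATE below is an inlined
 * kpreempt_enable(), including the cpu_kprunrun check so that a
 * preemption requested while preemption was off is honored via
 * kpreempt().
 */
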
 714 #define FP_ALLOWMIGRATE(label1, label2)                 \
 715         ldn     [THREAD_REG + T_LWP], %o0                       ;\
 716         brz,a,pn %o0, label1/**/f                               ;\
 717           ldsb  [THREAD_REG + T_PREEMPT], %o1                   ;\
 718         call thread_allowmigrate                                ;\
 719           nop                                                   ;\
 720         ba      label2/**/f                                     ;\
 721           nop                                                   ;\
 722 label1:                                                         ;\
 723         dec     %o1                                             ;\
 724         brnz,pn %o1, label2/**/f                                ;\
 725           stb   %o1, [THREAD_REG + T_PREEMPT]                   ;\
 726         ldn     [THREAD_REG + T_CPU], %o0                       ;\
 727         ldub    [%o0 + CPU_KPRUNRUN], %o0                       ;\
 728         brz,pt  %o0, label2/**/f                                ;\
 729           nop                                                   ;\
 730         call    kpreempt                                        ;\
 731           rdpr  %pil, %o0                                       ;\
 732 label2:
 733 
 734 /*
 735  * Copy a block of storage, returning an error code if `from' or
 736  * `to' takes a kernel pagefault which cannot be resolved.
 737  * Returns errno value on pagefault error, 0 if all ok
 738  */
 739 
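/*
 * For example (sketch; names are illustrative), a caller that must
 * survive faults on either buffer checks the return value:
 *
 *      if ((err = kcopy(src, dst, len)) != 0)
 *              return (err);           ! errno from the pagefault
 */
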
 740 #if defined(lint)
 741 
 742 /* ARGSUSED */
 743 int
 744 kcopy(const void *from, void *to, size_t count)
 745 { return(0); }
 746 
 747 #else   /* lint */
 748 
 749         .seg    ".text"
 750         .align  4
 751 
 752         ENTRY(kcopy)
 753 
 754         cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
        bleu,pt %ncc, .kcopy_small              ! go to small copy
 756           xor   %o0, %o1, %o3                   ! are src, dst alignable?
 757         btst    7, %o3                          !
 758         bz,pt   %ncc, .kcopy_8                  ! check for longword alignment
 759           nop
 760         btst    1, %o3                          ! 
 761         bz,pt   %ncc, .kcopy_2                  ! check for half-word
 762           nop
 763         sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
 764         ld      [%o3 + %lo(hw_copy_limit_1)], %o3
 765         tst     %o3
 766         bz,pn   %icc, .kcopy_small              ! if zero, disable HW copy
 767           cmp   %o2, %o3                        ! if length <= limit
 768         bleu,pt %ncc, .kcopy_small              ! go to small copy
 769           nop
 770         ba,pt   %ncc, .kcopy_more               ! otherwise go to large copy
 771           nop
 772 .kcopy_2:
 773         btst    3, %o3                          !
 774         bz,pt   %ncc, .kcopy_4                  ! check for word alignment
 775           nop
 776         sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
 777         ld      [%o3 + %lo(hw_copy_limit_2)], %o3
 778         tst     %o3
 779         bz,pn   %icc, .kcopy_small              ! if zero, disable HW copy
 780           cmp   %o2, %o3                        ! if length <= limit
 781         bleu,pt %ncc, .kcopy_small              ! go to small copy
 782           nop
 783         ba,pt   %ncc, .kcopy_more               ! otherwise go to large copy
 784           nop
 785 .kcopy_4:
 786         ! already checked longword, must be word aligned
 787         sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
 788         ld      [%o3 + %lo(hw_copy_limit_4)], %o3
 789         tst     %o3
 790         bz,pn   %icc, .kcopy_small              ! if zero, disable HW copy
 791           cmp   %o2, %o3                        ! if length <= limit
 792         bleu,pt %ncc, .kcopy_small              ! go to small copy
 793           nop
 794         ba,pt   %ncc, .kcopy_more               ! otherwise go to large copy
 795           nop
 796 .kcopy_8:
 797         sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
 798         ld      [%o3 + %lo(hw_copy_limit_8)], %o3
 799         tst     %o3
 800         bz,pn   %icc, .kcopy_small              ! if zero, disable HW copy
 801           cmp   %o2, %o3                        ! if length <= limit
 802         bleu,pt %ncc, .kcopy_small              ! go to small copy
 803           nop
 804         ba,pt   %ncc, .kcopy_more               ! otherwise go to large copy
 805           nop
 806 
 807 .kcopy_small:
 808         sethi   %hi(.sm_copyerr), %o5           ! sm_copyerr is lofault value
 809         or      %o5, %lo(.sm_copyerr), %o5
 810         ldn     [THREAD_REG + T_LOFAULT], %o4   ! save existing handler
 811         membar  #Sync                           ! sync error barrier
 812         ba,pt   %ncc, .sm_do_copy               ! common code
 813          stn    %o5, [THREAD_REG + T_LOFAULT]   ! set t_lofault
 814 
 815 .kcopy_more:
 816         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
 817         sethi   %hi(.copyerr), %l7              ! copyerr is lofault value
 818         or      %l7, %lo(.copyerr), %l7
 819         ldn     [THREAD_REG + T_LOFAULT], %l6   ! save existing handler
 820         membar  #Sync                           ! sync error barrier
 821         ba,pt   %ncc, .do_copy                  ! common code
 822           stn   %l7, [THREAD_REG + T_LOFAULT]   ! set t_lofault
 823 
 824 
 825 /*
 826  * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
 827  * Errno value is in %g1.  bcopy_more uses fp quadrants 1 and 3.
 828  */
 829 .copyerr:
 830         set     .copyerr2, %l0
 831         membar  #Sync                           ! sync error barrier
 832         stn     %l0, [THREAD_REG + T_LOFAULT]   ! set t_lofault
 833         btst    FPUSED_FLAG, %l6
 834         bz      %ncc, 1f
 835           and   %l6, TRAMP_FLAG, %l0            ! copy trampoline flag to %l0
 836 
 837         ldx     [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! restore gsr
 838         wr      %o2, 0, %gsr
 839 
 840         ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
 841         btst    FPRS_FEF, %o3
 842         bz,pt   %icc, 4f
 843           nop
 844 
 845         BLD_FPQ1Q3_FROMSTACK(%o2)
 846 
 847         ba,pt   %ncc, 1f
 848           wr    %o3, 0, %fprs           ! restore fprs
 849 
 850 4:
 851         FZEROQ1Q3
 852         wr      %o3, 0, %fprs           ! restore fprs
 853 
 854         !
 855         ! Need to cater for the different expectations of kcopy
 856         ! and bcopy. kcopy will *always* set a t_lofault handler
 857         ! If it fires, we're expected to just return the error code
 858         ! and *not* to invoke any existing error handler. As far as
 859         ! bcopy is concerned, we only set t_lofault if there was an
 860         ! existing lofault handler. In that case we're expected to
 861         ! invoke the previously existing handler after resetting the
 862         ! t_lofault value.
 863         !
 864 1:
 865         andn    %l6, MASK_FLAGS, %l6            ! turn trampoline flag off
 866         membar  #Sync                           ! sync error barrier
 867         stn     %l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
 868         FP_ALLOWMIGRATE(5, 6)
 869 
 870         btst    TRAMP_FLAG, %l0
 871         bnz,pn  %ncc, 3f
 872           nop
 873         ret
 874           restore       %g1, 0, %o0
 875 
 876 3:
 877         !
 878         ! We're here via bcopy. There *must* have been an error handler
 879         ! in place otherwise we would have died a nasty death already.
 880         !
 881         jmp     %l6                             ! goto real handler
 882           restore       %g0, 0, %o0             ! dispose of copy window
 883 
 884 /*
 885  * We got here because of a fault in .copyerr.  We can't safely restore fp
 886  * state, so we panic.
 887  */
 888 fp_panic_msg:
 889         .asciz  "Unable to restore fp state after copy operation"
 890 
 891         .align  4
 892 .copyerr2:
 893         set     fp_panic_msg, %o0
 894         call    panic
 895           nop
 896 
 897 /*
 898  * We got here because of a fault during a small kcopy or bcopy.
 899  * No floating point registers are used by the small copies.
 900  * Errno value is in %g1.
 901  */
 902 .sm_copyerr:
 903 1:
 904         btst    TRAMP_FLAG, %o4
 905         membar  #Sync
 906         andn    %o4, TRAMP_FLAG, %o4
 907         bnz,pn  %ncc, 3f
 908           stn   %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
 909         retl
 910           mov   %g1, %o0
 911 3:
 912         jmp     %o4                             ! goto real handler
 913           mov   %g0, %o0                        ! 
 914 
 915         SET_SIZE(kcopy)
 916 #endif  /* lint */
 917 
 918 
/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * Registers: l6 - saved t_lofault
 * (for short copies, o4 - saved t_lofault)
 */
 927 #if defined(lint)
 928 
 929 /* ARGSUSED */
 930 void
 931 bcopy(const void *from, void *to, size_t count)
 932 {}
 933 
 934 #else   /* lint */
 935 
 936         ENTRY(bcopy)
 937 
 938         cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
        bleu,pt %ncc, .bcopy_small              ! go to small copy
 940           xor   %o0, %o1, %o3                   ! are src, dst alignable?
 941         btst    7, %o3                          !
 942         bz,pt   %ncc, .bcopy_8                  ! check for longword alignment
 943           nop
 944         btst    1, %o3                          ! 
 945         bz,pt   %ncc, .bcopy_2                  ! check for half-word
 946           nop
 947         sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
 948         ld      [%o3 + %lo(hw_copy_limit_1)], %o3
 949         tst     %o3
 950         bz,pn   %icc, .bcopy_small              ! if zero, disable HW copy
 951           cmp   %o2, %o3                        ! if length <= limit
 952         bleu,pt %ncc, .bcopy_small              ! go to small copy
 953           nop
 954         ba,pt   %ncc, .bcopy_more               ! otherwise go to large copy
 955           nop
 956 .bcopy_2:
 957         btst    3, %o3                          !
 958         bz,pt   %ncc, .bcopy_4                  ! check for word alignment
 959           nop
 960         sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
 961         ld      [%o3 + %lo(hw_copy_limit_2)], %o3
 962         tst     %o3
 963         bz,pn   %icc, .bcopy_small              ! if zero, disable HW copy
 964           cmp   %o2, %o3                        ! if length <= limit
 965         bleu,pt %ncc, .bcopy_small              ! go to small copy
 966           nop
 967         ba,pt   %ncc, .bcopy_more               ! otherwise go to large copy
 968           nop
 969 .bcopy_4:
 970         ! already checked longword, must be word aligned
 971         sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
 972         ld      [%o3 + %lo(hw_copy_limit_4)], %o3
 973         tst     %o3
 974         bz,pn   %icc, .bcopy_small              ! if zero, disable HW copy
 975           cmp   %o2, %o3                        ! if length <= limit
 976         bleu,pt %ncc, .bcopy_small              ! go to small copy
 977           nop
 978         ba,pt   %ncc, .bcopy_more               ! otherwise go to large copy
 979           nop
 980 .bcopy_8:
 981         sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
 982         ld      [%o3 + %lo(hw_copy_limit_8)], %o3
 983         tst     %o3
 984         bz,pn   %icc, .bcopy_small              ! if zero, disable HW copy
 985           cmp   %o2, %o3                        ! if length <= limit
 986         bleu,pt %ncc, .bcopy_small              ! go to small copy
 987           nop
 988         ba,pt   %ncc, .bcopy_more               ! otherwise go to large copy
 989           nop
 990 
 991         .align  16
 992 .bcopy_small:
 993         ldn     [THREAD_REG + T_LOFAULT], %o4   ! save t_lofault
 994         tst     %o4
 995         bz,pt   %icc, .sm_do_copy
 996           nop
 997         sethi   %hi(.sm_copyerr), %o5
 998         or      %o5, %lo(.sm_copyerr), %o5
 999         membar  #Sync                           ! sync error barrier
1000         stn     %o5, [THREAD_REG + T_LOFAULT]   ! install new vector
1001         or      %o4, TRAMP_FLAG, %o4            ! error should trampoline
1002 .sm_do_copy:
1003         cmp     %o2, SHORTCOPY          ! check for really short case
1004         bleu,pt %ncc, .bc_sm_left       !
1005           cmp   %o2, CHKSIZE            ! check for medium length cases
1006         bgu,pn  %ncc, .bc_med           !
1007           or    %o0, %o1, %o3           ! prepare alignment check
1008         andcc   %o3, 0x3, %g0           ! test for alignment
1009         bz,pt   %ncc, .bc_sm_word       ! branch to word aligned case
1010 .bc_sm_movebytes:
1011           sub   %o2, 3, %o2             ! adjust count to allow cc zero test
1012 .bc_sm_notalign4:
1013         ldub    [%o0], %o3              ! read byte
1014         stb     %o3, [%o1]              ! write byte
1015         subcc   %o2, 4, %o2             ! reduce count by 4
1016         ldub    [%o0 + 1], %o3          ! repeat for a total of 4 bytes
1017         add     %o0, 4, %o0             ! advance SRC by 4
1018         stb     %o3, [%o1 + 1]
1019         ldub    [%o0 - 2], %o3
1020         add     %o1, 4, %o1             ! advance DST by 4
1021         stb     %o3, [%o1 - 2]
1022         ldub    [%o0 - 1], %o3
1023         bgt,pt  %ncc, .bc_sm_notalign4  ! loop til 3 or fewer bytes remain
1024           stb   %o3, [%o1 - 1]
1025         add     %o2, 3, %o2             ! restore count
1026 .bc_sm_left:
1027         tst     %o2
1028         bz,pt   %ncc, .bc_sm_exit       ! check for zero length
1029           deccc %o2                     ! reduce count for cc test
1030         ldub    [%o0], %o3              ! move one byte
1031         bz,pt   %ncc, .bc_sm_exit
1032           stb   %o3, [%o1]
1033         ldub    [%o0 + 1], %o3          ! move another byte
1034         deccc   %o2                     ! check for more
1035         bz,pt   %ncc, .bc_sm_exit
1036           stb   %o3, [%o1 + 1]
1037         ldub    [%o0 + 2], %o3          ! move final byte
1038         stb     %o3, [%o1 + 2]
1039         membar  #Sync                           ! sync error barrier
1040         andn    %o4, TRAMP_FLAG, %o4
1041         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1042         retl
1043           mov   %g0, %o0                ! return 0
1044         .align  16
1045         nop                             ! instruction alignment
1046                                         ! see discussion at start of file
1047 .bc_sm_words:
1048         lduw    [%o0], %o3              ! read word
1049 .bc_sm_wordx:
1050         subcc   %o2, 8, %o2             ! update count
1051         stw     %o3, [%o1]              ! write word
1052         add     %o0, 8, %o0             ! update SRC
1053         lduw    [%o0 - 4], %o3          ! read word
1054         add     %o1, 8, %o1             ! update DST
1055         bgt,pt  %ncc, .bc_sm_words      ! loop til done
1056           stw   %o3, [%o1 - 4]          ! write word
1057         addcc   %o2, 7, %o2             ! restore count
1058         bz,pt   %ncc, .bc_sm_exit
1059           deccc %o2
1060         bz,pt   %ncc, .bc_sm_byte
1061 .bc_sm_half:
1062           subcc %o2, 2, %o2             ! reduce count by 2
1063         add     %o0, 2, %o0             ! advance SRC by 2
1064         lduh    [%o0 - 2], %o3          ! read half word
1065         add     %o1, 2, %o1             ! advance DST by 2
1066         bgt,pt  %ncc, .bc_sm_half       ! loop til done
1067           sth   %o3, [%o1 - 2]          ! write half word
1068         addcc   %o2, 1, %o2             ! restore count
1069         bz,pt   %ncc, .bc_sm_exit
1070           nop
1071 .bc_sm_byte:
1072         ldub    [%o0], %o3
1073         stb     %o3, [%o1]
1074         membar  #Sync                           ! sync error barrier
1075         andn    %o4, TRAMP_FLAG, %o4
1076         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1077         retl
1078           mov   %g0, %o0                ! return 0
1079 
1080 .bc_sm_word:
1081         subcc   %o2, 4, %o2             ! update count
1082         bgt,pt  %ncc, .bc_sm_wordx
1083           lduw  [%o0], %o3              ! read word
1084         addcc   %o2, 3, %o2             ! restore count
1085         bz,pt   %ncc, .bc_sm_exit
1086           stw   %o3, [%o1]              ! write word
1087         deccc   %o2                     ! reduce count for cc test
1088         ldub    [%o0 + 4], %o3          ! load one byte
1089         bz,pt   %ncc, .bc_sm_exit
1090           stb   %o3, [%o1 + 4]          ! store one byte
1091         ldub    [%o0 + 5], %o3          ! load second byte
1092         deccc   %o2
1093         bz,pt   %ncc, .bc_sm_exit
1094           stb   %o3, [%o1 + 5]          ! store second byte
1095         ldub    [%o0 + 6], %o3          ! load third byte
1096         stb     %o3, [%o1 + 6]          ! store third byte
1097 .bc_sm_exit:
1098         membar  #Sync                           ! sync error barrier
1099         andn    %o4, TRAMP_FLAG, %o4
1100         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1101         retl
1102           mov   %g0, %o0                ! return 0
1103 
1104         .align 16
1105 .bc_med:
1106         xor     %o0, %o1, %o3           ! setup alignment check
1107         btst    1, %o3
1108         bnz,pt  %ncc, .bc_sm_movebytes  ! unaligned
1109           nop
1110         btst    3, %o3
1111         bnz,pt  %ncc, .bc_med_half      ! halfword aligned
1112           nop
1113         btst    7, %o3
1114         bnz,pt  %ncc, .bc_med_word      ! word aligned
1115           nop
1116 .bc_med_long:
1117         btst    3, %o0                  ! check for
1118         bz,pt   %ncc, .bc_med_long1     ! word alignment
1119           nop
1120 .bc_med_long0:
1121         ldub    [%o0], %o3              ! load one byte
1122         inc     %o0
1123         stb     %o3,[%o1]               ! store byte
1124         inc     %o1
1125         btst    3, %o0
1126         bnz,pt  %ncc, .bc_med_long0
1127           dec   %o2
1128 .bc_med_long1:                  ! word aligned
1129         btst    7, %o0                  ! check for long word
1130         bz,pt   %ncc, .bc_med_long2
1131           nop
1132         lduw    [%o0], %o3              ! load word
1133         add     %o0, 4, %o0             ! advance SRC by 4
1134         stw     %o3, [%o1]              ! store word
1135         add     %o1, 4, %o1             ! advance DST by 4
1136         sub     %o2, 4, %o2             ! reduce count by 4
1137 !
1138 !  Now long word aligned and have at least 32 bytes to move
1139 !
1140 .bc_med_long2:
1141         sub     %o2, 31, %o2            ! adjust count to allow cc zero test
1142 .bc_med_lmove:
1143         ldx     [%o0], %o3              ! read long word
1144         stx     %o3, [%o1]              ! write long word
1145         subcc   %o2, 32, %o2            ! reduce count by 32
        ldx     [%o0 + 8], %o3          ! repeat for a total of 4 long words
1147         add     %o0, 32, %o0            ! advance SRC by 32
1148         stx     %o3, [%o1 + 8]
1149         ldx     [%o0 - 16], %o3
1150         add     %o1, 32, %o1            ! advance DST by 32
1151         stx     %o3, [%o1 - 16]
1152         ldx     [%o0 - 8], %o3
1153         bgt,pt  %ncc, .bc_med_lmove     ! loop til 31 or fewer bytes left
1154           stx   %o3, [%o1 - 8]
1155         addcc   %o2, 24, %o2            ! restore count to long word offset
1156         ble,pt  %ncc, .bc_med_lextra    ! check for more long words to move
1157           nop
1158 .bc_med_lword:
1159         ldx     [%o0], %o3              ! read long word
1160         subcc   %o2, 8, %o2             ! reduce count by 8
1161         stx     %o3, [%o1]              ! write long word
1162         add     %o0, 8, %o0             ! advance SRC by 8
1163         bgt,pt  %ncc, .bc_med_lword     ! loop til 7 or fewer bytes left
1164           add   %o1, 8, %o1             ! advance DST by 8
1165 .bc_med_lextra:
1166         addcc   %o2, 7, %o2             ! restore rest of count
1167         bz,pt   %ncc, .bc_sm_exit       ! if zero, then done
1168           deccc %o2
1169         bz,pt   %ncc, .bc_sm_byte
1170           nop
1171         ba,pt   %ncc, .bc_sm_half
1172           nop
1173 
1174         .align 16
1175 .bc_med_word:
1176         btst    3, %o0                  ! check for
1177         bz,pt   %ncc, .bc_med_word1     ! word alignment
1178           nop
1179 .bc_med_word0:
1180         ldub    [%o0], %o3              ! load one byte
1181         inc     %o0
1182         stb     %o3,[%o1]               ! store byte
1183         inc     %o1
1184         btst    3, %o0
1185         bnz,pt  %ncc, .bc_med_word0
1186           dec   %o2
1187 !
1188 !  Now word aligned and have at least 36 bytes to move
1189 !
1190 .bc_med_word1:
1191         sub     %o2, 15, %o2            ! adjust count to allow cc zero test
1192 .bc_med_wmove:
1193         lduw    [%o0], %o3              ! read word
1194         stw     %o3, [%o1]              ! write word
1195         subcc   %o2, 16, %o2            ! reduce count by 16
1196         lduw    [%o0 + 4], %o3          ! repeat for a total of 4 words
1197         add     %o0, 16, %o0            ! advance SRC by 16
1198         stw     %o3, [%o1 + 4]
1199         lduw    [%o0 - 8], %o3
1200         add     %o1, 16, %o1            ! advance DST by 16
1201         stw     %o3, [%o1 - 8]
1202         lduw    [%o0 - 4], %o3
1203         bgt,pt  %ncc, .bc_med_wmove     ! loop til 15 or fewer bytes left
1204           stw   %o3, [%o1 - 4]
1205         addcc   %o2, 12, %o2            ! restore count to word offset
1206         ble,pt  %ncc, .bc_med_wextra    ! check for more words to move
1207           nop
1208 .bc_med_word2:
1209         lduw    [%o0], %o3              ! read word
1210         subcc   %o2, 4, %o2             ! reduce count by 4
1211         stw     %o3, [%o1]              ! write word
1212         add     %o0, 4, %o0             ! advance SRC by 4
1213         bgt,pt  %ncc, .bc_med_word2     ! loop til 3 or fewer bytes left
1214           add   %o1, 4, %o1             ! advance DST by 4
1215 .bc_med_wextra:
1216         addcc   %o2, 3, %o2             ! restore rest of count
1217         bz,pt   %ncc, .bc_sm_exit       ! if zero, then done
1218           deccc %o2
1219         bz,pt   %ncc, .bc_sm_byte
1220           nop
1221         ba,pt   %ncc, .bc_sm_half
1222           nop
1223 
1224         .align 16
1225 .bc_med_half:
1226         btst    1, %o0                  ! check for
1227         bz,pt   %ncc, .bc_med_half1     ! half word alignment
1228           nop
1229         ldub    [%o0], %o3              ! load one byte
1230         inc     %o0
1231         stb     %o3,[%o1]               ! store byte
1232         inc     %o1
1233         dec     %o2
1234 !
1235 !  Now half word aligned and have at least 38 bytes to move
1236 !
1237 .bc_med_half1:
1238         sub     %o2, 7, %o2             ! adjust count to allow cc zero test
1239 .bc_med_hmove:
1240         lduh    [%o0], %o3              ! read half word
1241         sth     %o3, [%o1]              ! write half word
1242         subcc   %o2, 8, %o2             ! reduce count by 8
1243         lduh    [%o0 + 2], %o3          ! repeat for a total of 4 halfwords
1244         add     %o0, 8, %o0             ! advance SRC by 8
1245         sth     %o3, [%o1 + 2]
1246         lduh    [%o0 - 4], %o3
1247         add     %o1, 8, %o1             ! advance DST by 8
1248         sth     %o3, [%o1 - 4]
1249         lduh    [%o0 - 2], %o3
1250         bgt,pt  %ncc, .bc_med_hmove     ! loop til 7 or fewer bytes left
1251           sth   %o3, [%o1 - 2]
1252         addcc   %o2, 7, %o2             ! restore count
1253         bz,pt   %ncc, .bc_sm_exit
1254           deccc %o2
1255         bz,pt   %ncc, .bc_sm_byte
1256           nop
1257         ba,pt   %ncc, .bc_sm_half
1258           nop
1259 
1260         SET_SIZE(bcopy)
1261 
1262 /*
1263  * The _more entry points are not intended to be used directly by
1264  * any caller from outside this file.  They are provided to allow
1265  * profiling and DTrace instrumentation of the portions of the copy
1266  * code that use the floating point registers.
1267  * This entry is particularly important, as DTrace (at least as of
1268  * 4/2004) does not support leaf functions.
1269  */
1270 
1271         ENTRY(bcopy_more)
1272 .bcopy_more:            
1273         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1274         ldn     [THREAD_REG + T_LOFAULT], %l6   ! save t_lofault
1275         tst     %l6
1276         bz,pt   %ncc, .do_copy
1277           nop
1278         sethi   %hi(.copyerr), %o2
1279         or      %o2, %lo(.copyerr), %o2
1280         membar  #Sync                           ! sync error barrier
1281         stn     %o2, [THREAD_REG + T_LOFAULT]   ! install new vector
1282         !
1283         ! We've already captured whether t_lofault was zero on entry.
1284         ! We need to mark ourselves as being from bcopy since both
1285         ! kcopy and bcopy use the same code path. If TRAMP_FLAG is set
1286         ! and the saved lofault was zero, we won't reset lofault on
1287         ! returning.
1288         !
1289         or      %l6, TRAMP_FLAG, %l6
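             !
             ! In outline, the handler setup above is (a C-ish sketch):
             !
             !       l6 = curthread->t_lofault;
             !       if (l6 != 0) {
             !               curthread->t_lofault = .copyerr;
             !               l6 |= TRAMP_FLAG;  /* trampoline to old handler */
             !       }
             !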
1290 
1291 /*
1292  * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes.
1293  * Also, the use of the FP registers has been verified to be enabled.
1294  */
1295 .do_copy:
1296         FP_NOMIGRATE(6, 7)
1297 
1298         rd      %fprs, %o2              ! check for unused fp
1299         st      %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
1300         btst    FPRS_FEF, %o2
1301         bz,a,pt %icc, .do_blockcopy
1302           wr    %g0, FPRS_FEF, %fprs
1303 
1304         BST_FPQ1Q3_TOSTACK(%o2)
1305 
1306 .do_blockcopy:
1307         rd      %gsr, %o2
1308         stx     %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]      ! save gsr
1309         or      %l6, FPUSED_FLAG, %l6
1310 
1311 #define REALSRC %i0
1312 #define DST     %i1
1313 #define CNT     %i2
1314 #define SRC     %i3
1315 #define TMP     %i5
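
             ! Register usage in the block copy below (aliases above):
             !   REALSRC - running (possibly unaligned) source address
             !   SRC     - 8-byte aligned shadow of REALSRC, used for ldd
             !   DST     - destination address, block aligned in the main loop
             !   CNT     - bytes remaining to copy
             !   TMP     - scratch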
1316 
1317         andcc   DST, VIS_BLOCKSIZE - 1, TMP
1318         bz,pt   %ncc, 2f
1319           neg   TMP
1320         add     TMP, VIS_BLOCKSIZE, TMP
1321 
1322         ! TMP = bytes required to align DST on FP_BLOCK boundary
1323         ! Using SRC as a tmp here
1324         cmp     TMP, 3
1325         bleu,pt %ncc, 1f
1326           sub   CNT,TMP,CNT             ! adjust main count
1327         sub     TMP, 3, TMP             ! adjust for end of loop test
1328 .bc_blkalign:
1329         ldub    [REALSRC], SRC          ! move 4 bytes per loop iteration
1330         stb     SRC, [DST]
1331         subcc   TMP, 4, TMP
1332         ldub    [REALSRC + 1], SRC
1333         add     REALSRC, 4, REALSRC
1334         stb     SRC, [DST + 1]
1335         ldub    [REALSRC - 2], SRC
1336         add     DST, 4, DST
1337         stb     SRC, [DST - 2]
1338         ldub    [REALSRC - 1], SRC
1339         bgu,pt  %ncc, .bc_blkalign
1340           stb   SRC, [DST - 1]
1341 
1342         addcc   TMP, 3, TMP             ! restore count adjustment
1343         bz,pt   %ncc, 2f                ! no bytes left?
1344           nop
1345 1:      ldub    [REALSRC], SRC
1346         inc     REALSRC
1347         inc     DST
1348         deccc   TMP
1349         bgu     %ncc, 1b
1350           stb   SRC, [DST - 1]
1351 
1352 2:
1353         andn    REALSRC, 0x7, SRC
1354         alignaddr REALSRC, %g0, %g0
1355 
1356         ! SRC - 8-byte aligned
1357         ! DST - 64-byte aligned
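             !
             ! The loop at 1: below is software pipelined: each pass stores
             ! the previous realigned block while loading and realigning the
             ! next one.  Per VIS_BLOCKSIZE (64-byte) pass (a sketch):
             !
             !       %f32-%f46 = faligndata of successive %f0-%f14 pairs
             !       stda %f32, [DST]ASI_BLK_P       ! store realigned block
             !       %f0-%f14  = next 64 bytes at [SRC]
             !       SRC += 64; REALSRC += 64; DST += 64; CNT -= 64
             !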
1358         prefetch [SRC], #one_read
1359         prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
1360         prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
1361         prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
1362         ldd     [SRC], %f0
1363 #if CHEETAH_PREFETCH > 4
1364         prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1365 #endif
1366         ldd     [SRC + 0x08], %f2
1367 #if CHEETAH_PREFETCH > 5
1368         prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
1369 #endif
1370         ldd     [SRC + 0x10], %f4
1371 #if CHEETAH_PREFETCH > 6
1372         prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
1373 #endif
1374         faligndata %f0, %f2, %f32
1375         ldd     [SRC + 0x18], %f6
1376 #if CHEETAH_PREFETCH > 7
1377         prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
1378 #endif
1379         faligndata %f2, %f4, %f34
1380         ldd     [SRC + 0x20], %f8
1381         faligndata %f4, %f6, %f36
1382         ldd     [SRC + 0x28], %f10
1383         faligndata %f6, %f8, %f38
1384         ldd     [SRC + 0x30], %f12
1385         faligndata %f8, %f10, %f40
1386         ldd     [SRC + 0x38], %f14
1387         faligndata %f10, %f12, %f42
1388         ldd     [SRC + VIS_BLOCKSIZE], %f0
1389         sub     CNT, VIS_BLOCKSIZE, CNT
1390         add     SRC, VIS_BLOCKSIZE, SRC
1391         add     REALSRC, VIS_BLOCKSIZE, REALSRC
1392         ba,a,pt %ncc, 1f
1393           nop
1394         .align  16
1395 1:
1396         ldd     [SRC + 0x08], %f2
1397         faligndata %f12, %f14, %f44
1398         ldd     [SRC + 0x10], %f4
1399         faligndata %f14, %f0, %f46
1400         stda    %f32, [DST]ASI_BLK_P
1401         ldd     [SRC + 0x18], %f6
1402         faligndata %f0, %f2, %f32
1403         ldd     [SRC + 0x20], %f8
1404         faligndata %f2, %f4, %f34
1405         ldd     [SRC + 0x28], %f10
1406         faligndata %f4, %f6, %f36
1407         ldd     [SRC + 0x30], %f12
1408         faligndata %f6, %f8, %f38
1409         ldd     [SRC + 0x38], %f14
1410         faligndata %f8, %f10, %f40
1411         sub     CNT, VIS_BLOCKSIZE, CNT
1412         ldd     [SRC + VIS_BLOCKSIZE], %f0
1413         faligndata %f10, %f12, %f42
1414         prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
1415         add     DST, VIS_BLOCKSIZE, DST
1416         prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1417         add     REALSRC, VIS_BLOCKSIZE, REALSRC
1418         cmp     CNT, VIS_BLOCKSIZE + 8
1419         bgu,pt  %ncc, 1b
1420           add   SRC, VIS_BLOCKSIZE, SRC
1421 
1422         ! use the fsrc1 path at 2: only if REALSRC & 0x7 is 0 (8-byte aligned)
1423         cmp     CNT, VIS_BLOCKSIZE
1424         bne     %ncc, 3f
1425           andcc REALSRC, 0x7, %g0
1426         bz,pt   %ncc, 2f
1427           nop
1428 3:      
1429         faligndata %f12, %f14, %f44
1430         faligndata %f14, %f0, %f46
1431         stda    %f32, [DST]ASI_BLK_P
1432         add     DST, VIS_BLOCKSIZE, DST
1433         ba,pt   %ncc, 3f
1434           nop
1435 2:
1436         ldd     [SRC + 0x08], %f2
1437         fsrc1   %f12, %f44
1438         ldd     [SRC + 0x10], %f4
1439         fsrc1   %f14, %f46
1440         stda    %f32, [DST]ASI_BLK_P
1441         ldd     [SRC + 0x18], %f6
1442         fsrc1   %f0, %f32
1443         ldd     [SRC + 0x20], %f8
1444         fsrc1   %f2, %f34
1445         ldd     [SRC + 0x28], %f10
1446         fsrc1   %f4, %f36
1447         ldd     [SRC + 0x30], %f12
1448         fsrc1   %f6, %f38
1449         ldd     [SRC + 0x38], %f14
1450         fsrc1   %f8, %f40
1451         sub     CNT, VIS_BLOCKSIZE, CNT
1452         add     DST, VIS_BLOCKSIZE, DST
1453         add     SRC, VIS_BLOCKSIZE, SRC
1454         add     REALSRC, VIS_BLOCKSIZE, REALSRC
1455         fsrc1   %f10, %f42
1456         fsrc1   %f12, %f44
1457         fsrc1   %f14, %f46
1458         stda    %f32, [DST]ASI_BLK_P
1459         add     DST, VIS_BLOCKSIZE, DST
1460         ba,a,pt %ncc, .bcb_exit
1461           nop
1462 
1463 3:      tst     CNT
1464         bz,a,pt %ncc, .bcb_exit
1465           nop
1466 
1467 5:      ldub    [REALSRC], TMP
1468         inc     REALSRC
1469         inc     DST
1470         deccc   CNT
1471         bgu     %ncc, 5b
1472           stb   TMP, [DST - 1]
1473 .bcb_exit:
1474         membar  #Sync
1475 
1476         FPRAS_INTERVAL(FPRAS_BCOPY, 0, %l5, %o2, %o3, %o4, %o5, 8)
1477         FPRAS_REWRITE_TYPE2Q1(0, %l5, %o2, %o3, 8, 9)
1478         FPRAS_CHECK(FPRAS_BCOPY, %l5, 9)        ! lose outputs
1479 
1480         ldx     [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! restore gsr
1481         wr      %o2, 0, %gsr
1482 
1483         ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1484         btst    FPRS_FEF, %o3
1485         bz,pt   %icc, 4f
1486           nop
1487 
1488         BLD_FPQ1Q3_FROMSTACK(%o2)
1489 
1490         ba,pt   %ncc, 2f        
1491           wr    %o3, 0, %fprs           ! restore fprs
1492 4:
1493         FZEROQ1Q3
1494         wr      %o3, 0, %fprs           ! restore fprs
1495 2:
1496         membar  #Sync                           ! sync error barrier
1497         andn    %l6, MASK_FLAGS, %l6
1498         stn     %l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1499         FP_ALLOWMIGRATE(5, 6)
1500         ret
1501           restore       %g0, 0, %o0
1502 
1503         SET_SIZE(bcopy_more)
1504 
1505 #endif  /* lint */
1506 
1507 /*
1508  * Block copy with possibly overlapped operands.
1509  */
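
     /*
      * In outline (a C sketch of the assembly below):
      *
      *      if (count != 0) {
      *              if (count <= labs((char *)from - (char *)to))
      *                      bcopy(from, to, count);    (no overlap possible)
      *              else if (from >= to)
      *                      copy forward, one byte at a time;
      *              else
      *                      copy backward, one byte at a time;
      *      }
      */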
1510 
1511 #if defined(lint)
1512 
1513 /*ARGSUSED*/
1514 void
1515 ovbcopy(const void *from, void *to, size_t count)
1516 {}
1517 
1518 #else   /* lint */
1519 
1520         ENTRY(ovbcopy)
1521         tst     %o2                     ! check count
1522         bgu,a   %ncc, 1f                ! branch if count > 0
1523           subcc %o0, %o1, %o3           ! difference of from and to address
1524 
1525         retl                            ! count == 0: nothing to do
1526           nop
1527 1:
1528         bneg,a  %ncc, 2f
1529           neg   %o3                     ! if < 0, make it positive
1530 2:      cmp     %o2, %o3                ! cmp size and abs(from - to)
1531         bleu    %ncc, bcopy             ! if size <= abs(diff): use bcopy,
1532           .empty                                !   no overlap
1533           cmp   %o0, %o1                ! compare from and to addresses
1534         blu     %ncc, .ov_bkwd          ! if from < to, copy backwards
1535           nop
1536         !
1537         ! Copy forwards.
1538         !
1539 .ov_fwd:
1540         ldub    [%o0], %o3              ! read from address
1541         inc     %o0                     ! inc from address
1542         stb     %o3, [%o1]              ! write to address
1543         deccc   %o2                     ! dec count
1544         bgu     %ncc, .ov_fwd           ! loop till done
1545           inc   %o1                     ! inc to address
1546 
1547         retl                            ! return
1548           nop
1549         !
1550         ! Copy backwards.
1551         !
1552 .ov_bkwd:
1553         deccc   %o2                     ! dec count
1554         ldub    [%o0 + %o2], %o3        ! get byte at end of src
1555         bgu     %ncc, .ov_bkwd          ! loop till done
1556           stb   %o3, [%o1 + %o2]        ! delay slot, store at end of dst
1557 
1558         retl                            ! return
1559           nop
1560 
1561         SET_SIZE(ovbcopy)
1562 
1563 #endif  /* lint */
1564 
1565 
1566 /*
1567  * hwblkpagecopy()
1568  *
1569  * Copies exactly one page.  This routine assumes the caller (ppcopy)
1570  * has already disabled kernel preemption and has checked
1571  * use_hw_bcopy.  Preventing preemption also prevents cpu migration.
1572  */
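
/*
 * In outline (a sketch; both operands are page aligned, so no faligndata
 * realignment is needed, and PAGESIZE is assumed to be a multiple of
 * VIS_BLOCKSIZE):
 *
 *      save the fp state if the fp unit is live;
 *      for (off = 0; off < PAGESIZE; off += VIS_BLOCKSIZE)
 *              move one 64-byte block through %f32-%f46 (ldd/fsrc1/stda);
 *      membar #Sync; restore the fp state;
 */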
1573 #ifdef lint
1574 /*ARGSUSED*/
1575 void
1576 hwblkpagecopy(const void *src, void *dst)
1577 { }
1578 #else /* lint */
1579         ENTRY(hwblkpagecopy)
1580         ! get another window w/space for three aligned blocks of saved fpregs
1581         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1582 
1583         ! %i0 - source address (arg)
1584         ! %i1 - destination address (arg)
1585         ! %i2 - length of region (not arg)
1586         ! %l0 - saved fprs
1587         ! %l1 - pointer to saved fpregs
1588 
1589         rd      %fprs, %l0              ! check for unused fp
1590         btst    FPRS_FEF, %l0
1591         bz,a,pt %icc, 1f
1592           wr    %g0, FPRS_FEF, %fprs
1593 
1594         BST_FPQ1Q3_TOSTACK(%l1)
1595 
1596 1:      set     PAGESIZE, CNT
1597         mov     REALSRC, SRC            ! page aligned: no realignment needed
1598 
1599         prefetch [SRC], #one_read
1600         prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
1601         prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
1602         prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
1603         ldd     [SRC], %f0
1604 #if CHEETAH_PREFETCH > 4
1605         prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1606 #endif
1607         ldd     [SRC + 0x08], %f2
1608 #if CHEETAH_PREFETCH > 5
1609         prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
1610 #endif
1611         ldd     [SRC + 0x10], %f4
1612 #if CHEETAH_PREFETCH > 6
1613         prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
1614 #endif
1615         fsrc1   %f0, %f32
1616         ldd     [SRC + 0x18], %f6
1617 #if CHEETAH_PREFETCH > 7
1618         prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
1619 #endif
1620         fsrc1   %f2, %f34
1621         ldd     [SRC + 0x20], %f8
1622         fsrc1   %f4, %f36
1623         ldd     [SRC + 0x28], %f10
1624         fsrc1   %f6, %f38
1625         ldd     [SRC + 0x30], %f12
1626         fsrc1   %f8, %f40
1627         ldd     [SRC + 0x38], %f14
1628         fsrc1   %f10, %f42
1629         ldd     [SRC + VIS_BLOCKSIZE], %f0
1630         sub     CNT, VIS_BLOCKSIZE, CNT
1631         add     SRC, VIS_BLOCKSIZE, SRC
1632         ba,a,pt %ncc, 2f
1633           nop
1634         .align  16
1635 2:
1636         ldd     [SRC + 0x08], %f2
1637         fsrc1   %f12, %f44
1638         ldd     [SRC + 0x10], %f4
1639         fsrc1   %f14, %f46
1640         stda    %f32, [DST]ASI_BLK_P
1641         ldd     [SRC + 0x18], %f6
1642         fsrc1   %f0, %f32
1643         ldd     [SRC + 0x20], %f8
1644         fsrc1   %f2, %f34
1645         ldd     [SRC + 0x28], %f10
1646         fsrc1   %f4, %f36
1647         ldd     [SRC + 0x30], %f12
1648         fsrc1   %f6, %f38
1649         ldd     [SRC + 0x38], %f14
1650         fsrc1   %f8, %f40
1651         ldd     [SRC + VIS_BLOCKSIZE], %f0
1652         fsrc1   %f10, %f42
1653         prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
1654         sub     CNT, VIS_BLOCKSIZE, CNT
1655         add     DST, VIS_BLOCKSIZE, DST
1656         cmp     CNT, VIS_BLOCKSIZE + 8
1657         prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1658         bgu,pt  %ncc, 2b
1659           add   SRC, VIS_BLOCKSIZE, SRC
1660 
1661         ! trailing block
1662         ldd     [SRC + 0x08], %f2
1663         fsrc1   %f12, %f44
1664         ldd     [SRC + 0x10], %f4
1665         fsrc1   %f14, %f46
1666         stda    %f32, [DST]ASI_BLK_P
1667         ldd     [SRC + 0x18], %f6
1668         fsrc1   %f0, %f32
1669         ldd     [SRC + 0x20], %f8
1670         fsrc1   %f2, %f34
1671         ldd     [SRC + 0x28], %f10
1672         fsrc1   %f4, %f36
1673         ldd     [SRC + 0x30], %f12
1674         fsrc1   %f6, %f38
1675         ldd     [SRC + 0x38], %f14
1676         fsrc1   %f8, %f40
1677         sub     CNT, VIS_BLOCKSIZE, CNT
1678         add     DST, VIS_BLOCKSIZE, DST
1679         add     SRC, VIS_BLOCKSIZE, SRC
1680         fsrc1   %f10, %f42
1681         fsrc1   %f12, %f44
1682         fsrc1   %f14, %f46
1683         stda    %f32, [DST]ASI_BLK_P
1684 
1685         membar  #Sync
1686 
1687         FPRAS_INTERVAL(FPRAS_PGCOPY, 1, %l5, %o2, %o3, %o4, %o5, 8)
1688         FPRAS_REWRITE_TYPE1(1, %l5, %f32, %o2, 9)
1689         FPRAS_CHECK(FPRAS_PGCOPY, %l5, 9)       ! lose outputs
1690 
1691         btst    FPRS_FEF, %l0
1692         bz,pt   %icc, 2f
1693           nop
1694 
1695         BLD_FPQ1Q3_FROMSTACK(%l3)
1696         ba      3f
1697           nop
1698 
1699 2:      FZEROQ1Q3
1700 
1701 3:      wr      %l0, 0, %fprs           ! restore fprs
1702         ret
1703           restore       %g0, 0, %o0
1704 
1705         SET_SIZE(hwblkpagecopy)
1706 #endif  /* lint */
1707 
1708 
1709 /*
1710  * Transfer data to and from user space -
1711  * Note that these routines can cause faults.
1712  * It is assumed that the kernel has nothing mapped
1713  * below KERNELBASE in the virtual address space.
1714  *
1715  * Note that copyin(9F) and copyout(9F) are part of the
1716  * DDI/DKI which specifies that they return '-1' on "errors."
1717  *
1718  * Sigh.
1719  *
1720  * So there are two extremely similar routines - xcopyin() and xcopyout() -
1721  * which return the errno that we've faithfully computed.  This
1722  * allows other callers (e.g. uiomove(9F)) to work correctly.
1723  * Given that these are used pretty heavily, we expand the calling
1724  * sequences inline for all flavours (rather than making wrappers).
1725  *
1726  * There are also stub routines for xcopyout_little and xcopyin_little,
1727  * which currently are intended to handle requests of <= 16 bytes from
1728  * do_unaligned. Future enhancement to make them handle 8k pages efficiently
1729  * is left as an exercise...
1730  */
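
/*
 * Conceptually (a sketch only; each flavour is really expanded inline
 * rather than implemented as a wrapper):
 *
 *      copyout(k, u, n) == (xcopyout(k, u, n) == 0) ? 0 : -1
 *      copyin(u, k, n)  == (xcopyin(u, k, n)  == 0) ? 0 : -1
 *      xcopy{in,out}()  == 0 on success, else the computed errno
 */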
1731 
1732 /*
1733  * Copy data between user and kernel space (copyOP/xcopyOP/copyOP_noerr)
1734  *
1735  * General theory of operation:
1736  *
1737  * The only difference between copy{in,out} and
1738  * xcopy{in,out} is in the error handling routine they invoke
1739  * when a memory access error occurs. xcopyOP returns the errno
1740  * while copyOP returns -1 (see above). copy{in,out}_noerr set
1741  * a special flag (by ORing TRAMP_FLAG into the fault handler address)
1742  * if they are called with a fault handler already in place. That flag
1743  * causes the default handlers to trampoline to the previous handler
1744  * upon an error.
1745  *
1746  * None of the copyops routines grabs a window until it's decided that
1747  * a HW block copy operation is needed. This saves a window
1748  * spill/fill when we're called during socket ops. The typical IO
1749  * path won't cause spill/fill traps.
1750  *
1751  * This code uses a set of 4 limits for the maximum size that will
1752  * be copied given a particular input/output address alignment.
1753  * If the value for a particular limit is zero, the copy will be performed
1754  * by the plain copy loops rather than FPBLK.
1755  *
1756  * See the description of bcopy above for more details of the
1757  * data copying algorithm and the default limits.
1758  *
1759  */
1760 
1761 /*
1762  * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
1763  */
1764 
1765 #if defined(lint)
1766 
1767 
1768 #else   /* lint */
1769 /*
1770  * We save the arguments in the following registers in case of a fault:
1771  *      kaddr - %l1
1772  *      uaddr - %l2
1773  *      count - %l3
1774  */
1775 #define SAVE_SRC        %l1
1776 #define SAVE_DST        %l2
1777 #define SAVE_COUNT      %l3
1778 
1779 #define SM_SAVE_SRC             %g4
1780 #define SM_SAVE_DST             %g5
1781 #define SM_SAVE_COUNT           %o5
1782 #define ERRNO           %l5
1783 
1784 
1785 #define REAL_LOFAULT    %l4
1786 /*
1787  * Generic copyio fault handler.  This is the first line of defense when a
1788  * fault occurs in (x)copyin/(x)copyout.  In order for this to function
1789  * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
1790  * This allows us to share common code for all the flavors of the copy
1791  * operations, including the _noerr versions.
1792  *
1793  * Note that this function will restore the original input parameters before
1794  * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
1795  * member of the t_copyop structure, if needed.
1796  */
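/*
 * In outline (a sketch; SAVE_SRC, SAVE_DST, SAVE_COUNT and ERRNO are the
 * register aliases defined above):
 *
 *      copyio_fault:
 *              ERRNO = %g1;                    (errno from the trap code)
 *              if (FPUSED_FLAG is set in %l6)
 *                      restore %gsr, the fp regs (or zero them), and %fprs;
 *              restore t_lofault; allow cpu migration again;
 *              (%i0, %i1, %i2) = (SAVE_SRC, SAVE_DST, SAVE_COUNT);
 *              goto REAL_LOFAULT;
 */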
1797         ENTRY(copyio_fault)
1798         membar  #Sync
1799         mov     %g1,ERRNO                       ! save errno in ERRNO
1800         btst    FPUSED_FLAG, %l6
1801         bz      %ncc, 1f
1802           nop
1803 
1804         ldx     [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
1805         wr      %o2, 0, %gsr            ! restore gsr
1806 
1807         ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1808         btst    FPRS_FEF, %o3
1809         bz,pt   %icc, 4f
1810           nop
1811 
1812         BLD_FPQ2Q4_FROMSTACK(%o2)
1813 
1814         ba,pt   %ncc, 1f
1815           wr    %o3, 0, %fprs           ! restore fprs
1816 
1817 4:
1818         FZEROQ2Q4
1819         wr      %o3, 0, %fprs           ! restore fprs
1820 
1821 1:
1822         andn    %l6, FPUSED_FLAG, %l6
1823         membar  #Sync
1824         stn     %l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1825         FP_ALLOWMIGRATE(5, 6)
1826 
1827         mov     SAVE_SRC, %i0
1828         mov     SAVE_DST, %i1
1829         jmp     REAL_LOFAULT
1830           mov   SAVE_COUNT, %i2
1831 
1832         SET_SIZE(copyio_fault)
1833 
1834 
1835 #endif
1836 
1837 #if defined(lint)
1838 
1839 /*ARGSUSED*/
1840 int
1841 copyout(const void *kaddr, void *uaddr, size_t count)
1842 { return (0); }
1843 
1844 #else   /* lint */
1845 
1846         ENTRY(copyout)
1847 
1848         cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
1849         bleu,pt %ncc, .copyout_small            ! go to small copy cases
1850           xor   %o0, %o1, %o3                   ! are src, dst alignable?
1851         btst    7, %o3                          !
1852         bz,pt   %ncc, .copyout_8                ! check for longword alignment
1853           nop
1854         btst    1, %o3                          ! 
1855         bz,pt   %ncc, .copyout_2                ! check for half-word
1856           nop
1857         sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
1858         ld      [%o3 + %lo(hw_copy_limit_1)], %o3
1859         tst     %o3
1860         bz,pn   %icc, .copyout_small            ! if zero, disable HW copy
1861           cmp   %o2, %o3                        ! if length <= limit
1862         bleu,pt %ncc, .copyout_small            ! go to small copy
1863           nop
1864         ba,pt   %ncc, .copyout_more             ! otherwise go to large copy
1865           nop
1866 .copyout_2:
1867         btst    3, %o3                          !
1868         bz,pt   %ncc, .copyout_4                ! check for word alignment
1869           nop
1870         sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
1871         ld      [%o3 + %lo(hw_copy_limit_2)], %o3
1872         tst     %o3
1873         bz,pn   %icc, .copyout_small            ! if zero, disable HW copy
1874           cmp   %o2, %o3                        ! if length <= limit
1875         bleu,pt %ncc, .copyout_small            ! go to small copy
1876           nop
1877         ba,pt   %ncc, .copyout_more             ! otherwise go to large copy
1878           nop
1879 .copyout_4:
1880         ! already checked longword, must be word aligned
1881         sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
1882         ld      [%o3 + %lo(hw_copy_limit_4)], %o3
1883         tst     %o3
1884         bz,pn   %icc, .copyout_small            ! if zero, disable HW copy
1885           cmp   %o2, %o3                        ! if length <= limit
1886         bleu,pt %ncc, .copyout_small            ! go to small copy
1887           nop
1888         ba,pt   %ncc, .copyout_more             ! otherwise go to large copy
1889           nop
1890 .copyout_8:
1891         sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
1892         ld      [%o3 + %lo(hw_copy_limit_8)], %o3
1893         tst     %o3
1894         bz,pn   %icc, .copyout_small            ! if zero, disable HW copy
1895           cmp   %o2, %o3                        ! if length <= limit
1896         bleu,pt %ncc, .copyout_small            ! go to small copy
1897           nop
1898         ba,pt   %ncc, .copyout_more             ! otherwise go to large copy
1899           nop
1900 
1901         .align  16
1902         nop                             ! instruction alignment
1903                                         ! see discussion at start of file
1904 .copyout_small:
1905         sethi   %hi(.sm_copyout_err), %o5       ! .sm_copyout_err is lofault
1906         or      %o5, %lo(.sm_copyout_err), %o5
1907         ldn     [THREAD_REG + T_LOFAULT], %o4   ! save existing handler
1908         membar  #Sync                           ! sync error barrier
1909         stn     %o5, [THREAD_REG + T_LOFAULT]   ! set t_lofault
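             !
             ! Size dispatch for the small copy (a sketch; SHORTCOPY and
             ! CHKSIZE are constants defined earlier in this file):
             !
             !       if (count <= SHORTCOPY)  goto .co_sm_left   (0-3 bytes)
             !       if (count > CHKSIZE)     goto .co_med
             !       if ((src | dst) & 3)     fall into .co_sm_movebytes
             !       else                     goto .co_sm_word
             !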
1910 .sm_do_copyout:
1911         mov     %o0, SM_SAVE_SRC
1912         mov     %o1, SM_SAVE_DST
1913         cmp     %o2, SHORTCOPY          ! check for really short case
1914         bleu,pt %ncc, .co_sm_left       !
1915           mov   %o2, SM_SAVE_COUNT
1916         cmp     %o2, CHKSIZE            ! check for medium length cases
1917         bgu,pn  %ncc, .co_med           !
1918           or    %o0, %o1, %o3           ! prepare alignment check
1919         andcc   %o3, 0x3, %g0           ! test for alignment
1920         bz,pt   %ncc, .co_sm_word       ! branch to word aligned case
1921 .co_sm_movebytes:
1922           sub   %o2, 3, %o2             ! adjust count to allow cc zero test
1923 .co_sm_notalign4:
1924         ldub    [%o0], %o3              ! read byte
1925         subcc   %o2, 4, %o2             ! reduce count by 4
1926         stba    %o3, [%o1]ASI_USER      ! write byte
1927         inc     %o1                     ! advance DST by 1
1928         ldub    [%o0 + 1], %o3          ! repeat for a total of 4 bytes
1929         add     %o0, 4, %o0             ! advance SRC by 4
1930         stba    %o3, [%o1]ASI_USER
1931         inc     %o1                     ! advance DST by 1
1932         ldub    [%o0 - 2], %o3
1933         stba    %o3, [%o1]ASI_USER
1934         inc     %o1                     ! advance DST by 1
1935         ldub    [%o0 - 1], %o3
1936         stba    %o3, [%o1]ASI_USER
1937         bgt,pt  %ncc, .co_sm_notalign4  ! loop til 3 or fewer bytes remain
1938           inc   %o1                     ! advance DST by 1
1939         add     %o2, 3, %o2             ! restore count
1940 .co_sm_left:
1941         tst     %o2
1942         bz,pt   %ncc, .co_sm_exit       ! check for zero length
1943           nop
1944         ldub    [%o0], %o3              ! load one byte
1945         deccc   %o2                     ! reduce count for cc test
1946         bz,pt   %ncc, .co_sm_exit
1947           stba  %o3,[%o1]ASI_USER       ! store one byte
1948         ldub    [%o0 + 1], %o3          ! load second byte
1949         deccc   %o2
1950         inc     %o1
1951         bz,pt   %ncc, .co_sm_exit
1952           stba  %o3,[%o1]ASI_USER       ! store second byte
1953         ldub    [%o0 + 2], %o3          ! load third byte
1954         inc     %o1
1955         stba    %o3,[%o1]ASI_USER       ! store third byte
1956         membar  #Sync                           ! sync error barrier
1957         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1958         retl
1959           mov   %g0, %o0                ! return 0
1960         .align  16
1961 .co_sm_words:
1962         lduw    [%o0], %o3              ! read word
1963 .co_sm_wordx:
1964         subcc   %o2, 8, %o2             ! update count
1965         stwa    %o3, [%o1]ASI_USER      ! write word
1966         add     %o0, 8, %o0             ! update SRC
1967         lduw    [%o0 - 4], %o3          ! read word
1968         add     %o1, 4, %o1             ! update DST
1969         stwa    %o3, [%o1]ASI_USER      ! write word
1970         bgt,pt  %ncc, .co_sm_words      ! loop til done
1971           add   %o1, 4, %o1             ! update DST
1972         addcc   %o2, 7, %o2             ! restore count
1973         bz,pt   %ncc, .co_sm_exit
1974           nop
1975         deccc   %o2
1976         bz,pt   %ncc, .co_sm_byte
1977 .co_sm_half:
1978           subcc %o2, 2, %o2             ! reduce count by 2
1979         lduh    [%o0], %o3              ! read half word
1980         add     %o0, 2, %o0             ! advance SRC by 2
1981         stha    %o3, [%o1]ASI_USER      ! write half word
1982         bgt,pt  %ncc, .co_sm_half       ! loop til done
1983           add   %o1, 2, %o1             ! advance DST by 2
1984         addcc   %o2, 1, %o2             ! restore count
1985         bz,pt   %ncc, .co_sm_exit
1986           nop
1987 .co_sm_byte:
1988         ldub    [%o0], %o3
1989         stba    %o3, [%o1]ASI_USER
1990         membar  #Sync                           ! sync error barrier
1991         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1992         retl
1993           mov   %g0, %o0                ! return 0
1994         .align 16
1995 .co_sm_word:
1996         subcc   %o2, 4, %o2             ! update count
1997         bgt,pt  %ncc, .co_sm_wordx
1998           lduw  [%o0], %o3              ! read word
1999         addcc   %o2, 3, %o2             ! restore count
2000         bz,pt   %ncc, .co_sm_exit
2001           stwa  %o3, [%o1]ASI_USER      ! write word
2002         deccc   %o2                     ! reduce count for cc test
2003         ldub    [%o0 + 4], %o3          ! load one byte
2004         add     %o1, 4, %o1
2005         bz,pt   %ncc, .co_sm_exit
2006           stba  %o3, [%o1]ASI_USER      ! store one byte
2007         ldub    [%o0 + 5], %o3          ! load second byte
2008         deccc   %o2
2009         inc     %o1
2010         bz,pt   %ncc, .co_sm_exit
2011           stba  %o3, [%o1]ASI_USER      ! store second byte
2012         ldub    [%o0 + 6], %o3          ! load third byte
2013         inc     %o1
2014         stba    %o3, [%o1]ASI_USER      ! store third byte
2015 .co_sm_exit:
2016         membar  #Sync                           ! sync error barrier
2017         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2018         retl
2019           mov   %g0, %o0                ! return 0
2020 
2021         .align 16
2022 .co_med:
2023         xor     %o0, %o1, %o3           ! setup alignment check
2024         btst    1, %o3
2025         bnz,pt  %ncc, .co_sm_movebytes  ! unaligned
2026           nop
2027         btst    3, %o3
2028         bnz,pt  %ncc, .co_med_half      ! src, dst only halfword alignable
2029           nop
2030         btst    7, %o3
2031         bnz,pt  %ncc, .co_med_word      ! src, dst only word alignable
2032           nop
2033 .co_med_long:
2034         btst    3, %o0                  ! check for
2035         bz,pt   %ncc, .co_med_long1     ! word alignment
2036           nop
2037 .co_med_long0:
2038         ldub    [%o0], %o3              ! load one byte
2039         inc     %o0
2040         stba    %o3,[%o1]ASI_USER       ! store byte
2041         inc     %o1
2042         btst    3, %o0
2043         bnz,pt  %ncc, .co_med_long0
2044           dec   %o2
2045 .co_med_long1:                  ! word aligned
2046         btst    7, %o0                  ! check for long word
2047         bz,pt   %ncc, .co_med_long2
2048           nop
2049         lduw    [%o0], %o3              ! load word
2050         add     %o0, 4, %o0             ! advance SRC by 4
2051         stwa    %o3, [%o1]ASI_USER      ! store word
2052         add     %o1, 4, %o1             ! advance DST by 4
2053         sub     %o2, 4, %o2             ! reduce count by 4
2054 !
2055 !  Now long word aligned and have at least 32 bytes to move
2056 !
2057 .co_med_long2:
2058         sub     %o2, 31, %o2            ! adjust count to allow cc zero test
2059         sub     %o1, 8, %o1             ! adjust pointer to allow store in
2060                                         ! branch delay slot instead of add
2061 .co_med_lmove:
2062         add     %o1, 8, %o1             ! advance DST by 8
2063         ldx     [%o0], %o3              ! read long word
2064         subcc   %o2, 32, %o2            ! reduce count by 32
2065         stxa    %o3, [%o1]ASI_USER      ! write long word
2066         add     %o1, 8, %o1             ! advance DST by 8
2067         ldx     [%o0 + 8], %o3          ! repeat for a total of 4 long words
2068         add     %o0, 32, %o0            ! advance SRC by 32
2069         stxa    %o3, [%o1]ASI_USER
2070         ldx     [%o0 - 16], %o3
2071         add     %o1, 8, %o1             ! advance DST by 8
2072         stxa    %o3, [%o1]ASI_USER
2073         ldx     [%o0 - 8], %o3
2074         add     %o1, 8, %o1             ! advance DST by 8
2075         bgt,pt  %ncc, .co_med_lmove     ! loop til 31 or fewer bytes left
2076           stxa  %o3, [%o1]ASI_USER
2077         add     %o1, 8, %o1             ! advance DST by 8
2078         addcc   %o2, 24, %o2            ! restore count to long word offset
2079         ble,pt  %ncc, .co_med_lextra    ! check for more long words to move
2080           nop
2081 .co_med_lword:
2082         ldx     [%o0], %o3              ! read long word
2083         subcc   %o2, 8, %o2             ! reduce count by 8
2084         stxa    %o3, [%o1]ASI_USER      ! write long word
2085         add     %o0, 8, %o0             ! advance SRC by 8
2086         bgt,pt  %ncc, .co_med_lword     ! loop til 7 or fewer bytes left
2087           add   %o1, 8, %o1             ! advance DST by 8
2088 .co_med_lextra:
2089         addcc   %o2, 7, %o2             ! restore rest of count
2090         bz,pt   %ncc, .co_sm_exit       ! if zero, then done
2091           deccc %o2
2092         bz,pt   %ncc, .co_sm_byte
2093           nop
2094         ba,pt   %ncc, .co_sm_half
2095           nop
2096 
2097         .align 16
2098         nop                             ! instruction alignment
2099                                         ! see discussion at start of file
2100 .co_med_word:
2101         btst    3, %o0                  ! check for
2102         bz,pt   %ncc, .co_med_word1     ! word alignment
2103           nop
2104 .co_med_word0:
2105         ldub    [%o0], %o3              ! load one byte
2106         inc     %o0
2107         stba    %o3,[%o1]ASI_USER       ! store byte
2108         inc     %o1
2109         btst    3, %o0
2110         bnz,pt  %ncc, .co_med_word0
2111           dec   %o2
2112 !
2113 !  Now word aligned and have at least 36 bytes to move
2114 !
2115 .co_med_word1:
2116         sub     %o2, 15, %o2            ! adjust count to allow cc zero test
2117 .co_med_wmove:
2118         lduw    [%o0], %o3              ! read word
2119         subcc   %o2, 16, %o2            ! reduce count by 16
2120         stwa    %o3, [%o1]ASI_USER      ! write word
2121         add     %o1, 4, %o1             ! advance DST by 4
2122         lduw    [%o0 + 4], %o3          ! repeat for a total of 4 words
2123         add     %o0, 16, %o0            ! advance SRC by 16
2124         stwa    %o3, [%o1]ASI_USER
2125         add     %o1, 4, %o1             ! advance DST by 4
2126         lduw    [%o0 - 8], %o3
2127         stwa    %o3, [%o1]ASI_USER
2128         add     %o1, 4, %o1             ! advance DST by 4
2129         lduw    [%o0 - 4], %o3
2130         stwa    %o3, [%o1]ASI_USER
2131         bgt,pt  %ncc, .co_med_wmove     ! loop til 15 or fewer bytes left
2132           add   %o1, 4, %o1             ! advance DST by 4
2133         addcc   %o2, 12, %o2            ! restore count to word offset
2134         ble,pt  %ncc, .co_med_wextra    ! check for more words to move
2135           nop
2136 .co_med_word2:
2137         lduw    [%o0], %o3              ! read word
2138         subcc   %o2, 4, %o2             ! reduce count by 4
2139         stwa    %o3, [%o1]ASI_USER      ! write word
2140         add     %o0, 4, %o0             ! advance SRC by 4
2141         bgt,pt  %ncc, .co_med_word2     ! loop til 3 or fewer bytes left
2142           add   %o1, 4, %o1             ! advance DST by 4
2143 .co_med_wextra:
2144         addcc   %o2, 3, %o2             ! restore rest of count
2145         bz,pt   %ncc, .co_sm_exit       ! if zero, then done
2146           deccc %o2
2147         bz,pt   %ncc, .co_sm_byte
2148           nop
2149         ba,pt   %ncc, .co_sm_half
2150           nop
2151 
2152         .align 16
2153         nop                             ! instruction alignment
2154         nop                             ! see discussion at start of file
2155         nop
2156 .co_med_half:
2157         btst    1, %o0                  ! check for
2158         bz,pt   %ncc, .co_med_half1     ! half word alignment
2159           nop
2160         ldub    [%o0], %o3              ! load one byte
2161         inc     %o0
2162         stba    %o3,[%o1]ASI_USER       ! store byte
2163         inc     %o1
2164         dec     %o2
2165 !
2166 !  Now half word aligned and have at least 38 bytes to move
2167 !
2168 .co_med_half1:
2169         sub     %o2, 7, %o2             ! adjust count to allow cc zero test
2170 .co_med_hmove:
2171         lduh    [%o0], %o3              ! read half word
2172         subcc   %o2, 8, %o2             ! reduce count by 8
2173         stha    %o3, [%o1]ASI_USER      ! write half word
2174         add     %o1, 2, %o1             ! advance DST by 2
2175         lduh    [%o0 + 2], %o3          ! repeat for a total of 4 halfwords
2176         add     %o0, 8, %o0             ! advance SRC by 8
2177         stha    %o3, [%o1]ASI_USER
2178         add     %o1, 2, %o1             ! advance DST by 2
2179         lduh    [%o0 - 4], %o3
2180         stha    %o3, [%o1]ASI_USER
2181         add     %o1, 2, %o1             ! advance DST by 2
2182         lduh    [%o0 - 2], %o3
2183         stha    %o3, [%o1]ASI_USER
2184         bgt,pt  %ncc, .co_med_hmove     ! loop til 7 or fewer bytes left
2185           add   %o1, 2, %o1             ! advance DST by 2
2186         addcc   %o2, 7, %o2             ! restore count
2187         bz,pt   %ncc, .co_sm_exit
2188           deccc %o2
2189         bz,pt   %ncc, .co_sm_byte
2190           nop
2191         ba,pt   %ncc, .co_sm_half
2192           nop
2193 
2194 /*
2195  * We got here because of a fault during short copyout.
2196  * Errno value is in %g1, but DDI/DKI says return -1 (sigh).
2197  */
2198 .sm_copyout_err:
2199         membar  #Sync
2200         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2201         mov     SM_SAVE_SRC, %o0
2202         mov     SM_SAVE_DST, %o1
2203         mov     SM_SAVE_COUNT, %o2
2204         ldn     [THREAD_REG + T_COPYOPS], %o3   ! check for copyop handler
2205         tst     %o3
2206         bz,pt   %ncc, 3f                        ! if not, return error
2207           nop
2208         ldn     [%o3 + CP_COPYOUT], %o5         ! if handler, invoke it with
2209         jmp     %o5                             ! original arguments
2210           nop
2211 3:
2212         retl
2213           or    %g0, -1, %o0            ! return error value
2214 
2215         SET_SIZE(copyout)
2216 
2217 /*
2218  * The _more entry points are not intended to be used directly by
2219  * any caller from outside this file.  They are provided to allow
2220  * profiling and DTrace instrumentation of the portions of the copy
2221  * code that use the floating point registers.
2222  * This entry is particularly important, as DTrace (at least as of
2223  * 4/2004) does not support leaf functions.
2224  */
2225 
2226         ENTRY(copyout_more)
2227 .copyout_more:
2228         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2229         set     .copyout_err, REAL_LOFAULT
2230 
2231 /*
2232  * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
2233  */
2234 .do_copyout:
2235         set     copyio_fault, %l7               ! copyio_fault is lofault val
2236 
2237         ldn     [THREAD_REG + T_LOFAULT], %l6   ! save existing handler
2238         membar  #Sync                           ! sync error barrier
2239         stn     %l7, [THREAD_REG + T_LOFAULT]   ! set t_lofault
2240 
2241         mov     %i0, SAVE_SRC
2242         mov     %i1, SAVE_DST
2243         mov     %i2, SAVE_COUNT
2244 
2245         FP_NOMIGRATE(6, 7)
2246 
2247         rd      %fprs, %o2              ! check for unused fp
2248         st      %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
2249         btst    FPRS_FEF, %o2
2250         bz,a,pt %icc, .do_blockcopyout
2251           wr    %g0, FPRS_FEF, %fprs
2252 
2253         BST_FPQ2Q4_TOSTACK(%o2)
2254 
2255 .do_blockcopyout:
2256         rd      %gsr, %o2
2257         stx     %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]      ! save gsr
2258         or      %l6, FPUSED_FLAG, %l6
2259 
2260         andcc   DST, VIS_BLOCKSIZE - 1, TMP
2261         mov     ASI_USER, %asi          ! the %asi stores below go to user space
2262         bz,pt   %ncc, 2f
2263           neg   TMP
2264         add     TMP, VIS_BLOCKSIZE, TMP
2265 
2266         ! TMP = bytes required to align DST on FP_BLOCK boundary
2267         ! Using SRC as a tmp here
2268         cmp     TMP, 3
2269         bleu,pt %ncc, 1f
2270           sub   CNT,TMP,CNT             ! adjust main count
2271         sub     TMP, 3, TMP             ! adjust for end of loop test
2272 .co_blkalign:
2273         ldub    [REALSRC], SRC          ! move 4 bytes per loop iteration
2274         stba    SRC, [DST]%asi
2275         subcc   TMP, 4, TMP
2276         ldub    [REALSRC + 1], SRC
2277         add     REALSRC, 4, REALSRC
2278         stba    SRC, [DST + 1]%asi
2279         ldub    [REALSRC - 2], SRC
2280         add     DST, 4, DST
2281         stba    SRC, [DST - 2]%asi
2282         ldub    [REALSRC - 1], SRC
2283         bgu,pt  %ncc, .co_blkalign
2284           stba  SRC, [DST - 1]%asi
2285 
2286         addcc   TMP, 3, TMP             ! restore count adjustment
2287         bz,pt   %ncc, 2f                ! no bytes left?
2288           nop
2289 1:      ldub    [REALSRC], SRC
2290         inc     REALSRC
2291         inc     DST
2292         deccc   TMP
2293         bgu     %ncc, 1b
2294           stba  SRC, [DST - 1]%asi
2295 
2296 2:
2297         andn    REALSRC, 0x7, SRC
2298         alignaddr REALSRC, %g0, %g0
2299 
2300         ! SRC - 8-byte aligned
2301         ! DST - 64-byte aligned
2302         prefetch [SRC], #one_read
2303         prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
2304         prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
2305         prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
2306         ldd     [SRC], %f16
2307 #if CHEETAH_PREFETCH > 4
2308         prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
2309 #endif
2310         ldd     [SRC + 0x08], %f18
2311 #if CHEETAH_PREFETCH > 5
2312         prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
2313 #endif
2314         ldd     [SRC + 0x10], %f20
2315 #if CHEETAH_PREFETCH > 6
2316         prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
2317 #endif
2318         faligndata %f16, %f18, %f48
2319         ldd     [SRC + 0x18], %f22
2320 #if CHEETAH_PREFETCH > 7
2321         prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
2322 #endif
2323         faligndata %f18, %f20, %f50
2324         ldd     [SRC + 0x20], %f24
2325         faligndata %f20, %f22, %f52
2326         ldd     [SRC + 0x28], %f26
2327         faligndata %f22, %f24, %f54
2328         ldd     [SRC + 0x30], %f28
2329         faligndata %f24, %f26, %f56
2330         ldd     [SRC + 0x38], %f30
2331         faligndata %f26, %f28, %f58
2332         ldd     [SRC + VIS_BLOCKSIZE], %f16
2333         sub     CNT, VIS_BLOCKSIZE, CNT
2334         add     SRC, VIS_BLOCKSIZE, SRC
2335         add     REALSRC, VIS_BLOCKSIZE, REALSRC
2336         ba,a,pt %ncc, 1f
2337           nop
2338         .align  16
2339 1:
2340         ldd     [SRC + 0x08], %f18
2341         faligndata %f28, %f30, %f60
2342         ldd     [SRC + 0x10], %f20
2343         faligndata %f30, %f16, %f62
2344         stda    %f48, [DST]ASI_BLK_AIUS
2345         ldd     [SRC + 0x18], %f22
2346         faligndata %f16, %f18, %f48
2347         ldd     [SRC + 0x20], %f24
2348         faligndata %f18, %f20, %f50
2349         ldd     [SRC + 0x28], %f26
2350         faligndata %f20, %f22, %f52
2351         ldd     [SRC + 0x30], %f28
2352         faligndata %f22, %f24, %f54
2353         ldd     [SRC + 0x38], %f30
2354         faligndata %f24, %f26, %f56
2355         sub     CNT, VIS_BLOCKSIZE, CNT
2356         ldd     [SRC + VIS_BLOCKSIZE], %f16
2357         faligndata %f26, %f28, %f58
2358         prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
2359         add     DST, VIS_BLOCKSIZE, DST
2360         prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
2361         add     REALSRC, VIS_BLOCKSIZE, REALSRC
2362         cmp     CNT, VIS_BLOCKSIZE + 8
2363         bgu,pt  %ncc, 1b
2364           add   SRC, VIS_BLOCKSIZE, SRC
2365 
2366         ! use the fsrc1 path at 2: only if REALSRC & 0x7 is 0 (8-byte aligned)
2367         cmp     CNT, VIS_BLOCKSIZE
2368         bne     %ncc, 3f
2369           andcc REALSRC, 0x7, %g0
2370         bz,pt   %ncc, 2f
2371           nop
2372 3:      
2373         faligndata %f28, %f30, %f60
2374         faligndata %f30, %f16, %f62
2375         stda    %f48, [DST]ASI_BLK_AIUS
2376         add     DST, VIS_BLOCKSIZE, DST
2377         ba,pt   %ncc, 3f
2378           nop
2379 2:
2380         ldd     [SRC + 0x08], %f18
2381         fsrc1   %f28, %f60
2382         ldd     [SRC + 0x10], %f20
2383         fsrc1   %f30, %f62
2384         stda    %f48, [DST]ASI_BLK_AIUS
2385         ldd     [SRC + 0x18], %f22
2386         fsrc1   %f16, %f48
2387         ldd     [SRC + 0x20], %f24
2388         fsrc1   %f18, %f50
2389         ldd     [SRC + 0x28], %f26
2390         fsrc1   %f20, %f52
2391         ldd     [SRC + 0x30], %f28
2392         fsrc1   %f22, %f54
2393         ldd     [SRC + 0x38], %f30
2394         fsrc1   %f24, %f56
2395         sub     CNT, VIS_BLOCKSIZE, CNT
2396         add     DST, VIS_BLOCKSIZE, DST
2397         add     SRC, VIS_BLOCKSIZE, SRC
2398         add     REALSRC, VIS_BLOCKSIZE, REALSRC
2399         fsrc1   %f26, %f58
2400         fsrc1   %f28, %f60
2401         fsrc1   %f30, %f62
2402         stda    %f48, [DST]ASI_BLK_AIUS
2403         add     DST, VIS_BLOCKSIZE, DST
2404         ba,a,pt %ncc, 4f
2405           nop
2406 
2407 3:      tst     CNT
2408         bz,a    %ncc, 4f
2409           nop
2410 
2411 5:      ldub    [REALSRC], TMP
2412         inc     REALSRC
2413         inc     DST
2414         deccc   CNT
2415         bgu     %ncc, 5b
2416           stba  TMP, [DST - 1]%asi
2417 4:
2418 
2419 .copyout_exit:
2420         membar  #Sync
2421 
2422         FPRAS_INTERVAL(FPRAS_COPYOUT, 0, %l5, %o2, %o3, %o4, %o5, 8)
2423         FPRAS_REWRITE_TYPE2Q2(0, %l5, %o2, %o3, 8, 9)
2424         FPRAS_CHECK(FPRAS_COPYOUT, %l5, 9)      ! lose outputs
2425 
2426         ldx     [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
2427         wr      %o2, 0, %gsr            ! restore gsr
2428 
2429         ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
2430         btst    FPRS_FEF, %o3
2431         bz,pt   %icc, 4f
2432           nop
2433 
2434         BLD_FPQ2Q4_FROMSTACK(%o2)
2435 
2436         ba,pt   %ncc, 1f
2437           wr    %o3, 0, %fprs           ! restore fprs
2438 
2439 4:
2440         FZEROQ2Q4
2441         wr      %o3, 0, %fprs           ! restore fprs
2442 
2443 1:
2444         membar  #Sync
2445         andn    %l6, FPUSED_FLAG, %l6
2446         stn     %l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2447         FP_ALLOWMIGRATE(5, 6)
2448         ret
2449           restore       %g0, 0, %o0
2450 
2451 /*
2452  * We got here because of a fault during copyout.
2453  * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
2454  */
2455 .copyout_err:
2456         ldn     [THREAD_REG + T_COPYOPS], %o4   ! check for copyop handler
2457         tst     %o4
2458         bz,pt   %ncc, 2f                        ! if not, return error
2459           nop
2460         ldn     [%o4 + CP_COPYOUT], %g2         ! if handler, invoke it with
2461         jmp     %g2                             ! original arguments
2462           restore %g0, 0, %g0                   ! dispose of copy window
2463 2:
2464         ret
2465           restore %g0, -1, %o0                  ! return error value
2466 
2467 
2468         SET_SIZE(copyout_more)
2469 
2470 #endif  /* lint */
2471 
2472 
2473 #ifdef  lint
2474 
2475 /*ARGSUSED*/
2476 int
2477 xcopyout(const void *kaddr, void *uaddr, size_t count)
2478 { return (0); }
2479 
2480 #else   /* lint */
2481 
2482         ENTRY(xcopyout)
2483         cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
2484         bleu,pt %ncc, .xcopyout_small           ! go to small copy cases
2485           xor   %o0, %o1, %o3                   ! are src, dst alignable?
2486         btst    7, %o3                          !
2487         bz,pt   %ncc, .xcopyout_8               ! check for longword alignment
2488           nop
2489         btst    1, %o3                          ! 
2490         bz,pt   %ncc, .xcopyout_2               ! check for half-word
2491           nop
2492         sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
2493         ld      [%o3 + %lo(hw_copy_limit_1)], %o3
2494         tst     %o3
2495         bz,pn   %icc, .xcopyout_small           ! if zero, disable HW copy
2496           cmp   %o2, %o3                        ! if length <= limit
2497         bleu,pt %ncc, .xcopyout_small           ! go to small copy
2498           nop
2499         ba,pt   %ncc, .xcopyout_more            ! otherwise go to large copy
2500           nop
2501 .xcopyout_2:
2502         btst    3, %o3                          !
2503         bz,pt   %ncc, .xcopyout_4               ! check for word alignment
2504           nop
2505         sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
2506         ld      [%o3 + %lo(hw_copy_limit_2)], %o3
2507         tst     %o3
2508         bz,pn   %icc, .xcopyout_small           ! if zero, disable HW copy
2509           cmp   %o2, %o3                        ! if length <= limit
2510         bleu,pt %ncc, .xcopyout_small           ! go to small copy
2511           nop
2512         ba,pt   %ncc, .xcopyout_more            ! otherwise go to large copy
2513           nop
2514 .xcopyout_4:
2515         ! already checked longword, must be word aligned
2516         sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
2517         ld      [%o3 + %lo(hw_copy_limit_4)], %o3
2518         tst     %o3
2519         bz,pn   %icc, .xcopyout_small           ! if zero, disable HW copy
2520           cmp   %o2, %o3                        ! if length <= limit
2521         bleu,pt %ncc, .xcopyout_small           ! go to small copy
2522           nop
2523         ba,pt   %ncc, .xcopyout_more            ! otherwise go to large copy
2524           nop
2525 .xcopyout_8:
2526         sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
2527         ld      [%o3 + %lo(hw_copy_limit_8)], %o3
2528         tst     %o3
2529         bz,pn   %icc, .xcopyout_small           ! if zero, disable HW copy
2530           cmp   %o2, %o3                        ! if length <= limit
2531         bleu,pt %ncc, .xcopyout_small           ! go to small copy
2532           nop
2533         ba,pt   %ncc, .xcopyout_more            ! otherwise go to large copy
2534           nop
2535 
2536 .xcopyout_small:
2537         sethi   %hi(.sm_xcopyout_err), %o5      ! .sm_xcopyout_err is lofault
2538         or      %o5, %lo(.sm_xcopyout_err), %o5
2539         ldn     [THREAD_REG + T_LOFAULT], %o4   ! save existing handler
2540         membar  #Sync                           ! sync error barrier
2541         ba,pt   %ncc, .sm_do_copyout            ! common code
2542           stn   %o5, [THREAD_REG + T_LOFAULT]   ! set t_lofault
2543 
2544 .xcopyout_more:
2545         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2546         sethi   %hi(.xcopyout_err), REAL_LOFAULT
2547         ba,pt   %ncc, .do_copyout               ! common code
2548           or    REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
2549 
2550 /*
2551  * We got here because of a fault during xcopyout.
2552  * Errno value is in ERRNO.
2553  */
2554 .xcopyout_err:
2555         ldn     [THREAD_REG + T_COPYOPS], %o4   ! check for copyop handler
2556         tst     %o4
2557         bz,pt   %ncc, 2f                        ! if not, return error
2558           nop
2559         ldn     [%o4 + CP_XCOPYOUT], %g2        ! if handler, invoke it with
2560         jmp     %g2                             ! original arguments
2561           restore %g0, 0, %g0                   ! dispose of copy window
2562 2:
2563         ret
2564           restore ERRNO, 0, %o0                 ! return errno value
2565 
2566 .sm_xcopyout_err:
2567 
2568         membar  #Sync
2569         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2570         mov     SM_SAVE_SRC, %o0
2571         mov     SM_SAVE_DST, %o1
2572         mov     SM_SAVE_COUNT, %o2
2573         ldn     [THREAD_REG + T_COPYOPS], %o3   ! check for copyop handler
2574         tst     %o3
2575         bz,pt   %ncc, 3f                        ! if not, return error
2576           nop
2577         ldn     [%o3 + CP_XCOPYOUT], %o5        ! if handler, invoke it with
2578         jmp     %o5                             ! original arguments
2579           nop
2580 3:
2581         retl
2582           or    %g1, 0, %o0             ! return errno value
2583 
2584         SET_SIZE(xcopyout)
2585 
2586 #endif  /* lint */
2587         
2588 #ifdef  lint
2589 
2590 /*ARGSUSED*/
2591 int
2592 xcopyout_little(const void *kaddr, void *uaddr, size_t count)
2593 { return (0); }
2594 
2595 #else   /* lint */
2596 
2597         ENTRY(xcopyout_little)
2598         sethi   %hi(.xcopyio_err), %o5
2599         or      %o5, %lo(.xcopyio_err), %o5
2600         ldn     [THREAD_REG + T_LOFAULT], %o4
2601         membar  #Sync                           ! sync error barrier
2602         stn     %o5, [THREAD_REG + T_LOFAULT]
2603         mov     %o4, %o5
2604 
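             !
             ! The loop below copies the region with its byte order
             ! reversed; in effect (a sketch):
             !
             !       for (i = 0; i < count; i++)
             !               uaddr[i] = kaddr[count - 1 - i];
             !
             ! %o3 is a negative index counting up to zero: [%o1 + %o3]
             ! walks uaddr forward while [%o0 + %o3], with %o0 stepped
             ! back each pass, walks kaddr from its last byte down.
             !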
2605         subcc   %g0, %o2, %o3
2606         add     %o0, %o2, %o0
2607         bz,pn   %ncc, 2f                ! check for zero bytes
2608           sub   %o2, 1, %o4
2609         add     %o0, %o4, %o0           ! start w/last byte
2610         add     %o1, %o2, %o1
2611         ldub    [%o0 + %o3], %o4
2612 
2613 1:      stba    %o4, [%o1 + %o3]ASI_AIUSL
2614         inccc   %o3
        sub     %o0, 2, %o0             ! %o3 increments, so net SRC advance is -1
2616         bcc,a,pt %ncc, 1b
2617           ldub  [%o0 + %o3], %o4
2618 
2619 2:
2620         membar  #Sync                           ! sync error barrier
2621         stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2622         retl
2623           mov   %g0, %o0                ! return (0)
2624 
2625         SET_SIZE(xcopyout_little)
2626 
2627 #endif  /* lint */
2628 
2629 /*
2630  * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
2631  */
2632 
2633 #if defined(lint)
2634 
2635 /*ARGSUSED*/
2636 int
2637 copyin(const void *uaddr, void *kaddr, size_t count)
2638 { return (0); }
2639 
2640 #else   /* lint */
2641 
2642         ENTRY(copyin)
2643         cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
        bleu,pt %ncc, .copyin_small             ! go to small copy
2645           xor   %o0, %o1, %o3                   ! are src, dst alignable?
2646         btst    7, %o3                          !
2647         bz,pt   %ncc, .copyin_8                 ! check for longword alignment
2648           nop
2649         btst    1, %o3                          ! 
2650         bz,pt   %ncc, .copyin_2                 ! check for half-word
2651           nop
2652         sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
2653         ld      [%o3 + %lo(hw_copy_limit_1)], %o3
2654         tst     %o3
2655         bz,pn   %icc, .copyin_small             ! if zero, disable HW copy
2656           cmp   %o2, %o3                        ! if length <= limit
2657         bleu,pt %ncc, .copyin_small             ! go to small copy
2658           nop
2659         ba,pt   %ncc, .copyin_more              ! otherwise go to large copy
2660           nop
2661 .copyin_2:
2662         btst    3, %o3                          !
2663         bz,pt   %ncc, .copyin_4                 ! check for word alignment
2664           nop
2665         sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
2666         ld      [%o3 + %lo(hw_copy_limit_2)], %o3
2667         tst     %o3
2668         bz,pn   %icc, .copyin_small             ! if zero, disable HW copy
2669           cmp   %o2, %o3                        ! if length <= limit
2670         bleu,pt %ncc, .copyin_small             ! go to small copy
2671           nop
2672         ba,pt   %ncc, .copyin_more              ! otherwise go to large copy
2673           nop
2674 .copyin_4:
2675         ! already checked longword, must be word aligned
2676         sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
2677         ld      [%o3 + %lo(hw_copy_limit_4)], %o3
2678         tst     %o3
2679         bz,pn   %icc, .copyin_small             ! if zero, disable HW copy
2680           cmp   %o2, %o3                        ! if length <= limit
2681         bleu,pt %ncc, .copyin_small             ! go to small copy
2682           nop
2683         ba,pt   %ncc, .copyin_more              ! otherwise go to large copy
2684           nop
2685 .copyin_8:
2686         sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
2687         ld      [%o3 + %lo(hw_copy_limit_8)], %o3
2688         tst     %o3
2689         bz,pn   %icc, .copyin_small             ! if zero, disable HW copy
2690           cmp   %o2, %o3                        ! if length <= limit
2691         bleu,pt %ncc, .copyin_small             ! go to small copy
2692           nop
2693         ba,pt   %ncc, .copyin_more              ! otherwise go to large copy
2694           nop
2695 
2696         .align  16
2697         nop                             ! instruction alignment
2698                                         ! see discussion at start of file
2699 .copyin_small:
        sethi   %hi(.sm_copyin_err), %o5        ! .sm_copyin_err is lofault value
2701         or      %o5, %lo(.sm_copyin_err), %o5
2702         ldn     [THREAD_REG + T_LOFAULT], %o4   ! set/save t_lofault, no tramp
2703         membar  #Sync                           ! sync error barrier
2704         stn     %o5, [THREAD_REG + T_LOFAULT]
2705 .sm_do_copyin:
2706         mov     %o0, SM_SAVE_SRC
2707         mov     %o1, SM_SAVE_DST
2708         cmp     %o2, SHORTCOPY          ! check for really short case
2709         bleu,pt %ncc, .ci_sm_left       !
2710           mov   %o2, SM_SAVE_COUNT
2711         cmp     %o2, CHKSIZE            ! check for medium length cases
2712         bgu,pn  %ncc, .ci_med           !
2713           or    %o0, %o1, %o3           ! prepare alignment check
2714         andcc   %o3, 0x3, %g0           ! test for alignment
2715         bz,pt   %ncc, .ci_sm_word       ! branch to word aligned case
2716 .ci_sm_movebytes:
2717           sub   %o2, 3, %o2             ! adjust count to allow cc zero test
2718 .ci_sm_notalign4:
2719         lduba   [%o0]ASI_USER, %o3      ! read byte
2720         subcc   %o2, 4, %o2             ! reduce count by 4
2721         stb     %o3, [%o1]              ! write byte
2722         add     %o0, 1, %o0             ! advance SRC by 1
2723         lduba   [%o0]ASI_USER, %o3      ! repeat for a total of 4 bytes
2724         add     %o0, 1, %o0             ! advance SRC by 1
2725         stb     %o3, [%o1 + 1]
2726         add     %o1, 4, %o1             ! advance DST by 4
2727         lduba   [%o0]ASI_USER, %o3
2728         add     %o0, 1, %o0             ! advance SRC by 1
2729         stb     %o3, [%o1 - 2]
2730         lduba   [%o0]ASI_USER, %o3
2731         add     %o0, 1, %o0             ! advance SRC by 1
2732         bgt,pt  %ncc, .ci_sm_notalign4  ! loop til 3 or fewer bytes remain
2733           stb   %o3, [%o1 - 1]
2734         add     %o2, 3, %o2             ! restore count
2735 .ci_sm_left:
2736         tst     %o2
2737         bz,pt   %ncc, .ci_sm_exit
2738           nop
2739         lduba   [%o0]ASI_USER, %o3              ! load one byte
2740         deccc   %o2                     ! reduce count for cc test
2741         bz,pt   %ncc, .ci_sm_exit
2742           stb   %o3,[%o1]               ! store one byte
2743         inc     %o0
2744         lduba   [%o0]ASI_USER, %o3      ! load second byte
2745         deccc   %o2
2746         bz,pt   %ncc, .ci_sm_exit
2747           stb   %o3,[%o1 + 1]           ! store second byte
2748         inc     %o0
2749         lduba   [%o0]ASI_USER, %o3      ! load third byte
2750         stb     %o3,[%o1 + 2]           ! store third byte
2751         membar  #Sync                           ! sync error barrier
2752         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2753         retl
2754           mov   %g0, %o0                ! return 0
2755         .align  16
2756 .ci_sm_words:
2757         lduwa   [%o0]ASI_USER, %o3              ! read word
2758 .ci_sm_wordx:
2759         subcc   %o2, 8, %o2             ! update count
2760         stw     %o3, [%o1]              ! write word
2761         add     %o0, 4, %o0             ! update SRC
2762         add     %o1, 8, %o1             ! update DST
2763         lduwa   [%o0]ASI_USER, %o3      ! read word
2764         add     %o0, 4, %o0             ! update SRC
2765         bgt,pt  %ncc, .ci_sm_words      ! loop til done
2766           stw   %o3, [%o1 - 4]          ! write word
2767         addcc   %o2, 7, %o2             ! restore count
2768         bz,pt   %ncc, .ci_sm_exit
2769           nop
2770         deccc   %o2
2771         bz,pt   %ncc, .ci_sm_byte
2772 .ci_sm_half:
2773           subcc %o2, 2, %o2             ! reduce count by 2
2774         lduha   [%o0]ASI_USER, %o3      ! read half word
2775         add     %o0, 2, %o0             ! advance SRC by 2
2776         add     %o1, 2, %o1             ! advance DST by 2
2777         bgt,pt  %ncc, .ci_sm_half       ! loop til done
2778           sth   %o3, [%o1 - 2]          ! write half word
2779         addcc   %o2, 1, %o2             ! restore count
2780         bz,pt   %ncc, .ci_sm_exit
2781           nop
2782 .ci_sm_byte:
2783         lduba   [%o0]ASI_USER, %o3
2784         stb     %o3, [%o1]
2785         membar  #Sync                           ! sync error barrier
2786         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2787         retl
2788           mov   %g0, %o0                ! return 0
2789         .align  16
2790 .ci_sm_word:
2791         subcc   %o2, 4, %o2             ! update count
2792         bgt,pt  %ncc, .ci_sm_wordx
2793           lduwa [%o0]ASI_USER, %o3              ! read word
2794         addcc   %o2, 3, %o2             ! restore count
2795         bz,pt   %ncc, .ci_sm_exit
2796           stw   %o3, [%o1]              ! write word
2797         deccc   %o2                     ! reduce count for cc test
2798         add     %o0, 4, %o0
2799         lduba   [%o0]ASI_USER, %o3      ! load one byte
2800         bz,pt   %ncc, .ci_sm_exit
2801           stb   %o3, [%o1 + 4]          ! store one byte
2802         inc     %o0
2803         lduba   [%o0]ASI_USER, %o3      ! load second byte
2804         deccc   %o2
2805         bz,pt   %ncc, .ci_sm_exit
2806           stb   %o3, [%o1 + 5]          ! store second byte
2807         inc     %o0
2808         lduba   [%o0]ASI_USER, %o3      ! load third byte
2809         stb     %o3, [%o1 + 6]          ! store third byte
2810 .ci_sm_exit:
2811         membar  #Sync                           ! sync error barrier
2812         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2813         retl
2814           mov   %g0, %o0                ! return 0
2815 
2816         .align 16
2817 .ci_med:
2818         xor     %o0, %o1, %o3           ! setup alignment check
2819         btst    1, %o3
2820         bnz,pt  %ncc, .ci_sm_movebytes  ! unaligned
2821           nop
2822         btst    3, %o3
2823         bnz,pt  %ncc, .ci_med_half      ! halfword aligned
2824           nop
2825         btst    7, %o3
2826         bnz,pt  %ncc, .ci_med_word      ! word aligned
2827           nop
2828 .ci_med_long:
2829         btst    3, %o0                  ! check for
2830         bz,pt   %ncc, .ci_med_long1     ! word alignment
2831           nop
2832 .ci_med_long0:
2833         lduba   [%o0]ASI_USER, %o3              ! load one byte
2834         inc     %o0
2835         stb     %o3,[%o1]               ! store byte
2836         inc     %o1
2837         btst    3, %o0
2838         bnz,pt  %ncc, .ci_med_long0
2839           dec   %o2
2840 .ci_med_long1:                  ! word aligned
2841         btst    7, %o0                  ! check for long word
2842         bz,pt   %ncc, .ci_med_long2
2843           nop
2844         lduwa   [%o0]ASI_USER, %o3      ! load word
2845         add     %o0, 4, %o0             ! advance SRC by 4
2846         stw     %o3, [%o1]              ! store word
2847         add     %o1, 4, %o1             ! advance DST by 4
2848         sub     %o2, 4, %o2             ! reduce count by 4
2849 !
2850 !  Now long word aligned and have at least 32 bytes to move
2851 !
2852 .ci_med_long2:
2853         sub     %o2, 31, %o2            ! adjust count to allow cc zero test
2854 .ci_med_lmove:
2855         ldxa    [%o0]ASI_USER, %o3      ! read long word
2856         subcc   %o2, 32, %o2            ! reduce count by 32
2857         stx     %o3, [%o1]              ! write long word
2858         add     %o0, 8, %o0             ! advance SRC by 8
        ldxa    [%o0]ASI_USER, %o3      ! repeat for a total of 4 long words
2860         add     %o0, 8, %o0             ! advance SRC by 8
2861         stx     %o3, [%o1 + 8]
2862         add     %o1, 32, %o1            ! advance DST by 32
2863         ldxa    [%o0]ASI_USER, %o3
2864         add     %o0, 8, %o0             ! advance SRC by 8
2865         stx     %o3, [%o1 - 16]
2866         ldxa    [%o0]ASI_USER, %o3
2867         add     %o0, 8, %o0             ! advance SRC by 8
2868         bgt,pt  %ncc, .ci_med_lmove     ! loop til 31 or fewer bytes left
2869           stx   %o3, [%o1 - 8]
2870         addcc   %o2, 24, %o2            ! restore count to long word offset
2871         ble,pt  %ncc, .ci_med_lextra    ! check for more long words to move
2872           nop
2873 .ci_med_lword:
2874         ldxa    [%o0]ASI_USER, %o3      ! read long word
2875         subcc   %o2, 8, %o2             ! reduce count by 8
2876         stx     %o3, [%o1]              ! write long word
2877         add     %o0, 8, %o0             ! advance SRC by 8
2878         bgt,pt  %ncc, .ci_med_lword     ! loop til 7 or fewer bytes left
2879           add   %o1, 8, %o1             ! advance DST by 8
2880 .ci_med_lextra:
2881         addcc   %o2, 7, %o2             ! restore rest of count
2882         bz,pt   %ncc, .ci_sm_exit       ! if zero, then done
2883           deccc %o2
2884         bz,pt   %ncc, .ci_sm_byte
2885           nop
2886         ba,pt   %ncc, .ci_sm_half
2887           nop
2888 
2889         .align 16
2890         nop                             ! instruction alignment
2891                                         ! see discussion at start of file
2892 .ci_med_word:
2893         btst    3, %o0                  ! check for
2894         bz,pt   %ncc, .ci_med_word1     ! word alignment
2895           nop
2896 .ci_med_word0:
2897         lduba   [%o0]ASI_USER, %o3      ! load one byte
2898         inc     %o0
2899         stb     %o3,[%o1]               ! store byte
2900         inc     %o1
2901         btst    3, %o0
2902         bnz,pt  %ncc, .ci_med_word0
2903           dec   %o2
2904 !
2905 !  Now word aligned and have at least 36 bytes to move
2906 !
2907 .ci_med_word1:
2908         sub     %o2, 15, %o2            ! adjust count to allow cc zero test
2909 .ci_med_wmove:
2910         lduwa   [%o0]ASI_USER, %o3      ! read word
2911         subcc   %o2, 16, %o2            ! reduce count by 16
2912         stw     %o3, [%o1]              ! write word
2913         add     %o0, 4, %o0             ! advance SRC by 4
        lduwa   [%o0]ASI_USER, %o3      ! repeat for a total of 4 words
2915         add     %o0, 4, %o0             ! advance SRC by 4
2916         stw     %o3, [%o1 + 4]
2917         add     %o1, 16, %o1            ! advance DST by 16
2918         lduwa   [%o0]ASI_USER, %o3
2919         add     %o0, 4, %o0             ! advance SRC by 4
2920         stw     %o3, [%o1 - 8]
2921         lduwa   [%o0]ASI_USER, %o3
2922         add     %o0, 4, %o0             ! advance SRC by 4
2923         bgt,pt  %ncc, .ci_med_wmove     ! loop til 15 or fewer bytes left
2924           stw   %o3, [%o1 - 4]
2925         addcc   %o2, 12, %o2            ! restore count to word offset
2926         ble,pt  %ncc, .ci_med_wextra    ! check for more words to move
2927           nop
2928 .ci_med_word2:
2929         lduwa   [%o0]ASI_USER, %o3      ! read word
2930         subcc   %o2, 4, %o2             ! reduce count by 4
2931         stw     %o3, [%o1]              ! write word
2932         add     %o0, 4, %o0             ! advance SRC by 4
2933         bgt,pt  %ncc, .ci_med_word2     ! loop til 3 or fewer bytes left
2934           add   %o1, 4, %o1             ! advance DST by 4
2935 .ci_med_wextra:
2936         addcc   %o2, 3, %o2             ! restore rest of count
2937         bz,pt   %ncc, .ci_sm_exit       ! if zero, then done
2938           deccc %o2
2939         bz,pt   %ncc, .ci_sm_byte
2940           nop
2941         ba,pt   %ncc, .ci_sm_half
2942           nop
2943 
2944         .align 16
2945         nop                             ! instruction alignment
2946                                         ! see discussion at start of file
2947 .ci_med_half:
2948         btst    1, %o0                  ! check for
2949         bz,pt   %ncc, .ci_med_half1     ! half word alignment
2950           nop
2951         lduba   [%o0]ASI_USER, %o3      ! load one byte
2952         inc     %o0
2953         stb     %o3,[%o1]               ! store byte
2954         inc     %o1
2955         dec     %o2
2956 !
2957 !  Now half word aligned and have at least 38 bytes to move
2958 !
2959 .ci_med_half1:
2960         sub     %o2, 7, %o2             ! adjust count to allow cc zero test
2961 .ci_med_hmove:
2962         lduha   [%o0]ASI_USER, %o3      ! read half word
2963         subcc   %o2, 8, %o2             ! reduce count by 8
2964         sth     %o3, [%o1]              ! write half word
2965         add     %o0, 2, %o0             ! advance SRC by 2
        lduha   [%o0]ASI_USER, %o3      ! repeat for a total of 4 halfwords
2967         add     %o0, 2, %o0             ! advance SRC by 2
2968         sth     %o3, [%o1 + 2]
2969         add     %o1, 8, %o1             ! advance DST by 8
2970         lduha   [%o0]ASI_USER, %o3
2971         add     %o0, 2, %o0             ! advance SRC by 2
2972         sth     %o3, [%o1 - 4]
2973         lduha   [%o0]ASI_USER, %o3
2974         add     %o0, 2, %o0             ! advance SRC by 2
2975         bgt,pt  %ncc, .ci_med_hmove     ! loop til 7 or fewer bytes left
2976           sth   %o3, [%o1 - 2]
2977         addcc   %o2, 7, %o2             ! restore count
2978         bz,pt   %ncc, .ci_sm_exit
2979           deccc %o2
2980         bz,pt   %ncc, .ci_sm_byte
2981           nop
2982         ba,pt   %ncc, .ci_sm_half
2983           nop
2984 
2985 .sm_copyin_err:
2986         membar  #Sync
2987         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2988         mov     SM_SAVE_SRC, %o0
2989         mov     SM_SAVE_DST, %o1
2990         mov     SM_SAVE_COUNT, %o2
2991         ldn     [THREAD_REG + T_COPYOPS], %o3   ! check for copyop handler
2992         tst     %o3
2993         bz,pt   %ncc, 3f                        ! if not, return error
2994           nop
2995         ldn     [%o3 + CP_COPYIN], %o5          ! if handler, invoke it with
2996         jmp     %o5                             ! original arguments
2997           nop
2998 3:
2999         retl
          or    %g0, -1, %o0            ! return -1 (DDI/DKI)
3001 
3002         SET_SIZE(copyin)
3003 
3004 
3005 /*
 * The _more entry points are not intended to be used directly by
 * any caller from outside this file.  They are provided to allow
 * profiling and DTrace of the portions of the copy code that use
 * the floating point registers.
 * This entry is particularly important as DTrace (at least as of
 * 4/2004) does not support leaf functions.
3012  */
3013 
3014         ENTRY(copyin_more)
3015 .copyin_more:
3016         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3017         set     .copyin_err, REAL_LOFAULT
3018 
3019 /*
3020  * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes
3021  */
3022 .do_copyin:
        set     copyio_fault, %l7               ! copyio_fault is lofault val
3024 
3025         ldn     [THREAD_REG + T_LOFAULT], %l6   ! save existing handler
3026         membar  #Sync                           ! sync error barrier
3027         stn     %l7, [THREAD_REG + T_LOFAULT]   ! set t_lofault
3028 
3029         mov     %i0, SAVE_SRC
3030         mov     %i1, SAVE_DST
3031         mov     %i2, SAVE_COUNT
3032 
3033         FP_NOMIGRATE(6, 7)
3034 
3035         rd      %fprs, %o2              ! check for unused fp
3036         st      %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
3037         btst    FPRS_FEF, %o2
3038         bz,a,pt %icc, .do_blockcopyin
3039           wr    %g0, FPRS_FEF, %fprs
3040 
3041         BST_FPQ2Q4_TOSTACK(%o2)
3042 
3043 .do_blockcopyin:
3044         rd      %gsr, %o2
3045         stx     %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]      ! save gsr
3046         or      %l6, FPUSED_FLAG, %l6
3047 
3048         andcc   DST, VIS_BLOCKSIZE - 1, TMP
3049         mov     ASI_USER, %asi
3050         bz,pt   %ncc, 2f
3051           neg   TMP
3052         add     TMP, VIS_BLOCKSIZE, TMP
3053 
        ! TMP = bytes required to align DST on VIS_BLOCKSIZE boundary
3055         ! Using SRC as a tmp here
3056         cmp     TMP, 3
3057         bleu,pt %ncc, 1f
3058           sub   CNT,TMP,CNT             ! adjust main count
3059         sub     TMP, 3, TMP             ! adjust for end of loop test
3060 .ci_blkalign:
3061         lduba   [REALSRC]%asi, SRC      ! move 4 bytes per loop iteration
3062         stb     SRC, [DST]
3063         subcc   TMP, 4, TMP
3064         lduba   [REALSRC + 1]%asi, SRC
3065         add     REALSRC, 4, REALSRC
3066         stb     SRC, [DST + 1]
3067         lduba   [REALSRC - 2]%asi, SRC
3068         add     DST, 4, DST
3069         stb     SRC, [DST - 2]
3070         lduba   [REALSRC - 1]%asi, SRC
3071         bgu,pt  %ncc, .ci_blkalign
3072           stb   SRC, [DST - 1]
3073 
3074         addcc   TMP, 3, TMP             ! restore count adjustment
3075         bz,pt   %ncc, 2f                ! no bytes left?
3076           nop
3077 1:      lduba   [REALSRC]%asi, SRC
3078         inc     REALSRC
3079         inc     DST
3080         deccc   TMP
3081         bgu     %ncc, 1b
3082           stb   SRC, [DST - 1]
3083 
3084 2:
3085         andn    REALSRC, 0x7, SRC
3086         alignaddr REALSRC, %g0, %g0
3087 
3088         ! SRC - 8-byte aligned
3089         ! DST - 64-byte aligned
3090         prefetcha [SRC]%asi, #one_read
3091         prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #one_read
3092         prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #one_read
3093         prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #one_read
3094         ldda    [SRC]%asi, %f16
3095 #if CHEETAH_PREFETCH > 4
3096         prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read
3097 #endif
3098         ldda    [SRC + 0x08]%asi, %f18
3099 #if CHEETAH_PREFETCH > 5
3100         prefetcha [SRC + (5 * VIS_BLOCKSIZE)]%asi, #one_read
3101 #endif
3102         ldda    [SRC + 0x10]%asi, %f20
3103 #if CHEETAH_PREFETCH > 6
3104         prefetcha [SRC + (6 * VIS_BLOCKSIZE)]%asi, #one_read
3105 #endif
3106         faligndata %f16, %f18, %f48
3107         ldda    [SRC + 0x18]%asi, %f22
3108 #if CHEETAH_PREFETCH > 7
3109         prefetcha [SRC + (7 * VIS_BLOCKSIZE)]%asi, #one_read
3110 #endif
3111         faligndata %f18, %f20, %f50
3112         ldda    [SRC + 0x20]%asi, %f24
3113         faligndata %f20, %f22, %f52
3114         ldda    [SRC + 0x28]%asi, %f26
3115         faligndata %f22, %f24, %f54
3116         ldda    [SRC + 0x30]%asi, %f28
3117         faligndata %f24, %f26, %f56
3118         ldda    [SRC + 0x38]%asi, %f30
3119         faligndata %f26, %f28, %f58
3120         ldda    [SRC + VIS_BLOCKSIZE]%asi, %f16
3121         sub     CNT, VIS_BLOCKSIZE, CNT
3122         add     SRC, VIS_BLOCKSIZE, SRC
3123         add     REALSRC, VIS_BLOCKSIZE, REALSRC
3124         ba,a,pt %ncc, 1f
3125           nop
3126         .align  16
3127 1:
3128         ldda    [SRC + 0x08]%asi, %f18
3129         faligndata %f28, %f30, %f60
3130         ldda    [SRC + 0x10]%asi, %f20
3131         faligndata %f30, %f16, %f62
3132         stda    %f48, [DST]ASI_BLK_P
3133         ldda    [SRC + 0x18]%asi, %f22
3134         faligndata %f16, %f18, %f48
3135         ldda    [SRC + 0x20]%asi, %f24
3136         faligndata %f18, %f20, %f50
3137         ldda    [SRC + 0x28]%asi, %f26
3138         faligndata %f20, %f22, %f52
3139         ldda    [SRC + 0x30]%asi, %f28
3140         faligndata %f22, %f24, %f54
3141         ldda    [SRC + 0x38]%asi, %f30
3142         faligndata %f24, %f26, %f56
3143         sub     CNT, VIS_BLOCKSIZE, CNT
3144         ldda    [SRC + VIS_BLOCKSIZE]%asi, %f16
3145         faligndata %f26, %f28, %f58
3146         prefetcha [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8]%asi, #one_read
3147         add     DST, VIS_BLOCKSIZE, DST
3148         prefetcha [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
3149         add     REALSRC, VIS_BLOCKSIZE, REALSRC
3150         cmp     CNT, VIS_BLOCKSIZE + 8
3151         bgu,pt  %ncc, 1b
3152           add   SRC, VIS_BLOCKSIZE, SRC
3153 
        ! use the fsrc1 finish (2f below) only when exactly one block
        ! remains and REALSRC & 0x7 is 0
3155         cmp     CNT, VIS_BLOCKSIZE
3156         bne     %ncc, 3f
3157           andcc REALSRC, 0x7, %g0
3158         bz,pt   %ncc, 2f
3159           nop
3160 3:      
3161         faligndata %f28, %f30, %f60
3162         faligndata %f30, %f16, %f62
3163         stda    %f48, [DST]ASI_BLK_P
3164         add     DST, VIS_BLOCKSIZE, DST
3165         ba,pt   %ncc, 3f
3166           nop
3167 2:
3168         ldda    [SRC + 0x08]%asi, %f18
3169         fsrc1   %f28, %f60
3170         ldda    [SRC + 0x10]%asi, %f20
3171         fsrc1   %f30, %f62
3172         stda    %f48, [DST]ASI_BLK_P
3173         ldda    [SRC + 0x18]%asi, %f22
3174         fsrc1   %f16, %f48
3175         ldda    [SRC + 0x20]%asi, %f24
3176         fsrc1   %f18, %f50
3177         ldda    [SRC + 0x28]%asi, %f26
3178         fsrc1   %f20, %f52
3179         ldda    [SRC + 0x30]%asi, %f28
3180         fsrc1   %f22, %f54
3181         ldda    [SRC + 0x38]%asi, %f30
3182         fsrc1   %f24, %f56
3183         sub     CNT, VIS_BLOCKSIZE, CNT
3184         add     DST, VIS_BLOCKSIZE, DST
3185         add     SRC, VIS_BLOCKSIZE, SRC
3186         add     REALSRC, VIS_BLOCKSIZE, REALSRC
3187         fsrc1   %f26, %f58
3188         fsrc1   %f28, %f60
3189         fsrc1   %f30, %f62
3190         stda    %f48, [DST]ASI_BLK_P
3191         add     DST, VIS_BLOCKSIZE, DST
3192         ba,a,pt %ncc, 4f
3193           nop
3194 
3195 3:      tst     CNT
3196         bz,a    %ncc, 4f
3197           nop
3198 
3199 5:      lduba   [REALSRC]ASI_USER, TMP
3200         inc     REALSRC
3201         inc     DST
3202         deccc   CNT
3203         bgu     %ncc, 5b
3204           stb   TMP, [DST - 1]
3205 4:
3206 
3207 .copyin_exit:
3208         membar  #Sync
3209 
3210         FPRAS_INTERVAL(FPRAS_COPYIN, 1, %l5, %o2, %o3, %o4, %o5, 8)
3211         FPRAS_REWRITE_TYPE1(1, %l5, %f48, %o2, 9)
3212         FPRAS_CHECK(FPRAS_COPYIN, %l5, 9)       ! lose outputs
3213 
3214         ldx     [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! restore gsr
3215         wr      %o2, 0, %gsr
3216 
3217         ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
3218         btst    FPRS_FEF, %o3
3219         bz,pt   %icc, 4f
3220           nop
3221 
3222         BLD_FPQ2Q4_FROMSTACK(%o2)
3223 
3224         ba,pt   %ncc, 1f
3225           wr    %o3, 0, %fprs           ! restore fprs
3226 
3227 4:
3228         FZEROQ2Q4
3229         wr      %o3, 0, %fprs           ! restore fprs
3230 
3231 1:
3232         membar  #Sync                           ! sync error barrier
3233         andn    %l6, FPUSED_FLAG, %l6
3234         stn     %l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
3235         FP_ALLOWMIGRATE(5, 6)
3236         ret
3237           restore       %g0, 0, %o0
3238 /*
3239  * We got here because of a fault during copyin
3240  * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
3241  */
3242 .copyin_err:
3243         ldn     [THREAD_REG + T_COPYOPS], %o4   ! check for copyop handler
3244         tst     %o4
        bz,pt   %ncc, 2f                        ! if not, return error
          nop
        ldn     [%o4 + CP_COPYIN], %g2          ! if handler, invoke it with
        jmp     %g2                             ! original arguments
          restore %g0, 0, %g0                   ! dispose of copy window
2:
        ret
          restore %g0, -1, %o0                  ! return error value
3253 
3254 
3255         SET_SIZE(copyin_more)
3256 
3257 #endif  /* lint */
3258 
3259 #ifdef  lint
3260 
3261 /*ARGSUSED*/
3262 int
3263 xcopyin(const void *uaddr, void *kaddr, size_t count)
3264 { return (0); }
3265 
3266 #else   /* lint */
3267 
3268         ENTRY(xcopyin)
3269 
3270         cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
        bleu,pt %ncc, .xcopyin_small            ! go to small copy
3272           xor   %o0, %o1, %o3                   ! are src, dst alignable?
3273         btst    7, %o3                          !
3274         bz,pt   %ncc, .xcopyin_8                ! check for longword alignment
3275           nop
3276         btst    1, %o3                          ! 
3277         bz,pt   %ncc, .xcopyin_2                ! check for half-word
3278           nop
3279         sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
3280         ld      [%o3 + %lo(hw_copy_limit_1)], %o3
3281         tst     %o3
3282         bz,pn   %icc, .xcopyin_small            ! if zero, disable HW copy
3283           cmp   %o2, %o3                        ! if length <= limit
3284         bleu,pt %ncc, .xcopyin_small            ! go to small copy
3285           nop
3286         ba,pt   %ncc, .xcopyin_more             ! otherwise go to large copy
3287           nop
3288 .xcopyin_2:
3289         btst    3, %o3                          !
3290         bz,pt   %ncc, .xcopyin_4                ! check for word alignment
3291           nop
3292         sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
3293         ld      [%o3 + %lo(hw_copy_limit_2)], %o3
3294         tst     %o3
3295         bz,pn   %icc, .xcopyin_small            ! if zero, disable HW copy
3296           cmp   %o2, %o3                        ! if length <= limit
3297         bleu,pt %ncc, .xcopyin_small            ! go to small copy
3298           nop
3299         ba,pt   %ncc, .xcopyin_more             ! otherwise go to large copy
3300           nop
3301 .xcopyin_4:
3302         ! already checked longword, must be word aligned
3303         sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
3304         ld      [%o3 + %lo(hw_copy_limit_4)], %o3
3305         tst     %o3
3306         bz,pn   %icc, .xcopyin_small            ! if zero, disable HW copy
3307           cmp   %o2, %o3                        ! if length <= limit
3308         bleu,pt %ncc, .xcopyin_small            ! go to small copy
3309           nop
3310         ba,pt   %ncc, .xcopyin_more             ! otherwise go to large copy
3311           nop
3312 .xcopyin_8:
3313         sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
3314         ld      [%o3 + %lo(hw_copy_limit_8)], %o3
3315         tst     %o3
3316         bz,pn   %icc, .xcopyin_small            ! if zero, disable HW copy
3317           cmp   %o2, %o3                        ! if length <= limit
3318         bleu,pt %ncc, .xcopyin_small            ! go to small copy
3319           nop
3320         ba,pt   %ncc, .xcopyin_more             ! otherwise go to large copy
3321           nop
3322 
3323 .xcopyin_small:
3324         sethi   %hi(.sm_xcopyin_err), %o5  ! .sm_xcopyin_err is lofault value
3325         or      %o5, %lo(.sm_xcopyin_err), %o5
        ldn     [THREAD_REG + T_LOFAULT], %o4   ! set/save t_lofault
3327         membar  #Sync                           ! sync error barrier
3328         ba,pt   %ncc, .sm_do_copyin             ! common code
3329           stn   %o5, [THREAD_REG + T_LOFAULT]
3330         
3331 .xcopyin_more:
3332         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3333         sethi   %hi(.xcopyin_err), REAL_LOFAULT ! .xcopyin_err is lofault value
3334         ba,pt   %ncc, .do_copyin
3335           or    REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
3336 
3337 /*
 * We got here because of a fault during xcopyin
3339  * Errno value is in ERRNO
3340  */
3341 .xcopyin_err:
3342         ldn     [THREAD_REG + T_COPYOPS], %o4   ! check for copyop handler
3343         tst     %o4
3344         bz,pt   %ncc, 2f                        ! if not, return error
3345           nop
3346         ldn     [%o4 + CP_XCOPYIN], %g2         ! if handler, invoke it with
3347         jmp     %g2                             ! original arguments
3348           restore %g0, 0, %g0                   ! dispose of copy window
3349 2:
3350         ret
3351           restore ERRNO, 0, %o0                 ! return errno value
3352 
3353 .sm_xcopyin_err:
3354 
3355         membar  #Sync
3356         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
3357         mov     SM_SAVE_SRC, %o0
3358         mov     SM_SAVE_DST, %o1
3359         mov     SM_SAVE_COUNT, %o2
3360         ldn     [THREAD_REG + T_COPYOPS], %o3   ! check for copyop handler
3361         tst     %o3
3362         bz,pt   %ncc, 3f                        ! if not, return error
3363           nop
3364         ldn     [%o3 + CP_XCOPYIN], %o5         ! if handler, invoke it with
3365         jmp     %o5                             ! original arguments
3366           nop
3367 3:
3368         retl
3369           or    %g1, 0, %o0             ! return errno value
3370 
3371         SET_SIZE(xcopyin)
3372 
3373 #endif  /* lint */
3374 
3375 #ifdef  lint
3376 
3377 /*ARGSUSED*/
3378 int
3379 xcopyin_little(const void *uaddr, void *kaddr, size_t count)
3380 { return (0); }
3381 
3382 #else   /* lint */
3383 
3384         ENTRY(xcopyin_little)
3385         sethi   %hi(.xcopyio_err), %o5
3386         or      %o5, %lo(.xcopyio_err), %o5
3387         ldn     [THREAD_REG + T_LOFAULT], %o4
3388         membar  #Sync                           ! sync error barrier
3389         stn     %o5, [THREAD_REG + T_LOFAULT]   
3390         mov     %o4, %o5
3391 
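        ! As in xcopyout_little, the loop below copies in byte-reversed
        ! order (dst[i] = src[count-1-i]), with the user loads done
        ! through the little-endian ASI.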
3392         subcc   %g0, %o2, %o3
3393         add     %o0, %o2, %o0
3394         bz,pn   %ncc, 2f                ! check for zero bytes
3395           sub   %o2, 1, %o4
3396         add     %o0, %o4, %o0           ! start w/last byte     
3397         add     %o1, %o2, %o1
3398         lduba   [%o0 + %o3]ASI_AIUSL, %o4
3399 
3400 1:      stb     %o4, [%o1 + %o3]
3401         inccc   %o3
        sub     %o0, 2, %o0             ! %o3 increments, so net SRC advance is -1
3403         bcc,a,pt %ncc, 1b
3404           lduba [%o0 + %o3]ASI_AIUSL, %o4
3405 
3406 2:
3407         membar  #Sync                           ! sync error barrier
3408         stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
3409         retl
3410           mov   %g0, %o0                ! return (0)
3411 
3412 .xcopyio_err:
3413         membar  #Sync                           ! sync error barrier
3414         stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
3415         retl
3416           mov   %g1, %o0
3417 
3418         SET_SIZE(xcopyin_little)
3419 
3420 #endif  /* lint */
3421 
3422 
3423 /*
3424  * Copy a block of storage - must not overlap (from + len <= to).
3425  * No fault handler installed (to be called under on_fault())
3426  */
3427 #if defined(lint)
3428 
3429 /* ARGSUSED */
3430 void
3431 copyin_noerr(const void *ufrom, void *kto, size_t count)
3432 {}
3433 
3434 #else   /* lint */
3435         ENTRY(copyin_noerr)
3436 
3437         cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
        bleu,pt %ncc, .copyin_ne_small          ! go to small copy
3439           xor   %o0, %o1, %o3                   ! are src, dst alignable?
3440         btst    7, %o3                          !
3441         bz,pt   %ncc, .copyin_ne_8              ! check for longword alignment
3442           nop
3443         btst    1, %o3                          ! 
3444         bz,pt   %ncc, .copyin_ne_2              ! check for half-word
3445           nop
3446         sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
3447         ld      [%o3 + %lo(hw_copy_limit_1)], %o3
3448         tst     %o3
3449         bz,pn   %icc, .copyin_ne_small          ! if zero, disable HW copy
3450           cmp   %o2, %o3                        ! if length <= limit
3451         bleu,pt %ncc, .copyin_ne_small          ! go to small copy
3452           nop
3453         ba,pt   %ncc, .copyin_noerr_more        ! otherwise go to large copy
3454           nop
3455 .copyin_ne_2:
3456         btst    3, %o3                          !
3457         bz,pt   %ncc, .copyin_ne_4              ! check for word alignment
3458           nop
3459         sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
3460         ld      [%o3 + %lo(hw_copy_limit_2)], %o3
3461         tst     %o3
3462         bz,pn   %icc, .copyin_ne_small          ! if zero, disable HW copy
3463           cmp   %o2, %o3                        ! if length <= limit
3464         bleu,pt %ncc, .copyin_ne_small          ! go to small copy
3465           nop
3466         ba,pt   %ncc, .copyin_noerr_more        ! otherwise go to large copy
3467           nop
3468 .copyin_ne_4:
3469         ! already checked longword, must be word aligned
3470         sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
3471         ld      [%o3 + %lo(hw_copy_limit_4)], %o3
3472         tst     %o3
3473         bz,pn   %icc, .copyin_ne_small          ! if zero, disable HW copy
3474           cmp   %o2, %o3                        ! if length <= limit
3475         bleu,pt %ncc, .copyin_ne_small          ! go to small copy
3476           nop
3477         ba,pt   %ncc, .copyin_noerr_more        ! otherwise go to large copy
3478           nop
3479 .copyin_ne_8:
3480         sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
3481         ld      [%o3 + %lo(hw_copy_limit_8)], %o3
3482         tst     %o3
3483         bz,pn   %icc, .copyin_ne_small          ! if zero, disable HW copy
3484           cmp   %o2, %o3                        ! if length <= limit
3485         bleu,pt %ncc, .copyin_ne_small          ! go to small copy
3486           nop
3487         ba,pt   %ncc, .copyin_noerr_more        ! otherwise go to large copy
3488           nop
3489 
3490 .copyin_ne_small:
3491         ldn     [THREAD_REG + T_LOFAULT], %o4
3492         tst     %o4
3493         bz,pn   %ncc, .sm_do_copyin
3494           nop
3495         sethi   %hi(.sm_copyio_noerr), %o5
3496         or      %o5, %lo(.sm_copyio_noerr), %o5
3497         membar  #Sync                           ! sync error barrier
3498         ba,pt   %ncc, .sm_do_copyin
3499           stn   %o5, [THREAD_REG + T_LOFAULT]   ! set/save t_lofault
3500 
3501 .copyin_noerr_more:
3502         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3503         sethi   %hi(.copyio_noerr), REAL_LOFAULT
3504         ba,pt   %ncc, .do_copyin
3505           or    REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3506 
3507 .copyio_noerr:
3508         jmp     %l6
3509           restore %g0,0,%g0
3510 
3511 .sm_copyio_noerr:
3512         membar  #Sync
3513         stn     %o4, [THREAD_REG + T_LOFAULT]   ! restore t_lofault
3514         jmp     %o4
3515           nop
3516 
3517         SET_SIZE(copyin_noerr)
3518 #endif /* lint */
3519 
3520 /*
3521  * Copy a block of storage - must not overlap (from + len <= to).
3522  * No fault handler installed (to be called under on_fault())
3523  */
3524 
3525 #if defined(lint)
3526 
3527 /* ARGSUSED */
3528 void
3529 copyout_noerr(const void *kfrom, void *uto, size_t count)
3530 {}
3531 
3532 #else   /* lint */
3533         ENTRY(copyout_noerr)
3534 
3535         cmp     %o2, VIS_COPY_THRESHOLD         ! check for leaf rtn case
        bleu,pt %ncc, .copyout_ne_small         ! go to small copy
3537           xor   %o0, %o1, %o3                   ! are src, dst alignable?
3538         btst    7, %o3                          !
3539         bz,pt   %ncc, .copyout_ne_8             ! check for longword alignment
3540           nop
3541         btst    1, %o3                          ! 
3542         bz,pt   %ncc, .copyout_ne_2             ! check for half-word
3543           nop
3544         sethi   %hi(hw_copy_limit_1), %o3       ! Check copy limit
3545         ld      [%o3 + %lo(hw_copy_limit_1)], %o3
3546         tst     %o3
3547         bz,pn   %icc, .copyout_ne_small         ! if zero, disable HW copy
3548           cmp   %o2, %o3                        ! if length <= limit
3549         bleu,pt %ncc, .copyout_ne_small         ! go to small copy
3550           nop
3551         ba,pt   %ncc, .copyout_noerr_more       ! otherwise go to large copy
3552           nop
3553 .copyout_ne_2:
3554         btst    3, %o3                          !
3555         bz,pt   %ncc, .copyout_ne_4             ! check for word alignment
3556           nop
3557         sethi   %hi(hw_copy_limit_2), %o3       ! Check copy limit
3558         ld      [%o3 + %lo(hw_copy_limit_2)], %o3
3559         tst     %o3
3560         bz,pn   %icc, .copyout_ne_small         ! if zero, disable HW copy
3561           cmp   %o2, %o3                        ! if length <= limit
3562         bleu,pt %ncc, .copyout_ne_small         ! go to small copy
3563           nop
3564         ba,pt   %ncc, .copyout_noerr_more       ! otherwise go to large copy
3565           nop
3566 .copyout_ne_4:
3567         ! already checked longword, must be word aligned
3568         sethi   %hi(hw_copy_limit_4), %o3       ! Check copy limit
3569         ld      [%o3 + %lo(hw_copy_limit_4)], %o3
3570         tst     %o3
3571         bz,pn   %icc, .copyout_ne_small         ! if zero, disable HW copy
3572           cmp   %o2, %o3                        ! if length <= limit
3573         bleu,pt %ncc, .copyout_ne_small         ! go to small copy
3574           nop
3575         ba,pt   %ncc, .copyout_noerr_more       ! otherwise go to large copy
3576           nop
3577 .copyout_ne_8:
3578         sethi   %hi(hw_copy_limit_8), %o3       ! Check copy limit
3579         ld      [%o3 + %lo(hw_copy_limit_8)], %o3
3580         tst     %o3
3581         bz,pn   %icc, .copyout_ne_small         ! if zero, disable HW copy
3582           cmp   %o2, %o3                        ! if length <= limit
3583         bleu,pt %ncc, .copyout_ne_small         ! go to small copy
3584           nop
3585         ba,pt   %ncc, .copyout_noerr_more       ! otherwise go to large copy
3586           nop
3587 
3588 .copyout_ne_small:
3589         ldn     [THREAD_REG + T_LOFAULT], %o4
3590         tst     %o4
3591         bz,pn   %ncc, .sm_do_copyout
3592           nop
3593         sethi   %hi(.sm_copyio_noerr), %o5
3594         or      %o5, %lo(.sm_copyio_noerr), %o5
3595         membar  #Sync                           ! sync error barrier
3596         ba,pt   %ncc, .sm_do_copyout
          stn   %o5, [THREAD_REG + T_LOFAULT]   ! set/save t_lofault
3598 
3599 .copyout_noerr_more:
3600         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3601         sethi   %hi(.copyio_noerr), REAL_LOFAULT
3602         ba,pt   %ncc, .do_copyout
3603           or    REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3604 
3605         SET_SIZE(copyout_noerr)
3606 #endif /* lint */
3607 
3608 
3609 /*
 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
 * at least 256 bytes long, using block stores (ASI_BLK_P).  If the
 * criteria for using this routine are not met, it calls bzero and
 * returns 1; otherwise it returns 0 to indicate success.
 * The caller is responsible for ensuring that use_hw_bzero is true and
 * that kpreempt_disable() has been called.
3616  */
3617 #ifdef lint
3618 /*ARGSUSED*/
3619 int
3620 hwblkclr(void *addr, size_t len)
3621 { 
3622         return(0);
3623 }
3624 #else /* lint */
3625         ! %i0 - start address
3626         ! %i1 - length of region (multiple of 64)
3627         ! %l0 - saved fprs
3628         ! %l1 - pointer to saved %d0 block
3629         ! %l2 - saved curthread->t_lwp
3630 
3631         ENTRY(hwblkclr)
3632         ! get another window w/space for one aligned block of saved fpregs
3633         save    %sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp
3634 
3635         ! Must be block-aligned
3636         andcc   %i0, (VIS_BLOCKSIZE-1), %g0
3637         bnz,pn  %ncc, 1f
3638           nop
3639 
3640         ! ... and must be 256 bytes or more
3641         cmp     %i1, 256
3642         blu,pn  %ncc, 1f
3643           nop
3644 
3645         ! ... and length must be a multiple of VIS_BLOCKSIZE
3646         andcc   %i1, (VIS_BLOCKSIZE-1), %g0
3647         bz,pn   %ncc, 2f
3648           nop
3649 
3650 1:      ! punt, call bzero but notify the caller that bzero was used
3651         mov     %i0, %o0
        call    bzero
          mov   %i1, %o1
3654         ret
3655           restore       %g0, 1, %o0 ! return (1) - did not use block operations
3656 
3657 2:      rd      %fprs, %l0              ! check for unused fp
3658         btst    FPRS_FEF, %l0
3659         bz,pt   %icc, 1f
3660           nop
3661 
3662         ! save in-use fpregs on stack
3663         membar  #Sync
3664         add     %fp, STACK_BIAS - 65, %l1
3665         and     %l1, -VIS_BLOCKSIZE, %l1
3666         stda    %d0, [%l1]ASI_BLK_P
3667 
3668 1:      membar  #StoreStore|#StoreLoad|#LoadStore
3669         wr      %g0, FPRS_FEF, %fprs
3670         wr      %g0, ASI_BLK_P, %asi
3671 
3672         ! Clear block
3673         fzero   %d0
3674         fzero   %d2
3675         fzero   %d4
3676         fzero   %d6
3677         fzero   %d8
3678         fzero   %d10
3679         fzero   %d12
3680         fzero   %d14
3681 
3682         mov     256, %i3
3683         ba,pt   %ncc, .pz_doblock
3684           nop
3685 
3686 .pz_blkstart:   
3687       ! stda    %d0, [%i0 + 192]%asi  ! in dly slot of branch that got us here
3688         stda    %d0, [%i0 + 128]%asi
3689         stda    %d0, [%i0 + 64]%asi
3690         stda    %d0, [%i0]%asi
3691 .pz_zinst:
3692         add     %i0, %i3, %i0
3693         sub     %i1, %i3, %i1
3694 .pz_doblock:
3695         cmp     %i1, 256
3696         bgeu,a  %ncc, .pz_blkstart
3697           stda  %d0, [%i0 + 192]%asi
3698 
3699         cmp     %i1, 64
        blu     %ncc, .pz_finish
          andn  %i1, (64-1), %i3
3703         srl     %i3, 4, %i2             ! using blocks, 1 instr / 16 words
3704         set     .pz_zinst, %i4
3705         sub     %i4, %i2, %i4
3706         jmp     %i4
3707           nop
3708 
3709 .pz_finish:
3710         membar  #Sync
3711         btst    FPRS_FEF, %l0
3712         bz,a    .pz_finished
3713           wr    %l0, 0, %fprs           ! restore fprs
3714 
3715         ! restore fpregs from stack
3716         ldda    [%l1]ASI_BLK_P, %d0
3717         membar  #Sync
3718         wr      %l0, 0, %fprs           ! restore fprs
3719 
3720 .pz_finished:
3721         ret
3722           restore       %g0, 0, %o0             ! return (bzero or not)
3723 
3724         SET_SIZE(hwblkclr)
3725 #endif  /* lint */
3726 
3727 #ifdef lint
3728 /*ARGSUSED*/
3729 void
3730 hw_pa_bcopy32(uint64_t src, uint64_t dst)
3731 {}
3732 #else /*!lint */
3733         /*
3734          * Copy 32 bytes of data from src (%o0) to dst (%o1)
3735          * using physical addresses.
3736          */
3737         ENTRY_NP(hw_pa_bcopy32)
        rdpr    %pstate, %g1            ! save current pstate
        andn    %g1, PSTATE_IE, %g2     ! clear the interrupt-enable bit
        wrpr    %g0, %g2, %pstate       ! disable interrupts during the copy

        rdpr    %pstate, %g0            ! serializing read after wrpr
3743         ldxa    [%o0]ASI_MEM, %o2
3744         add     %o0, 8, %o0
3745         ldxa    [%o0]ASI_MEM, %o3
3746         add     %o0, 8, %o0
3747         ldxa    [%o0]ASI_MEM, %o4
3748         add     %o0, 8, %o0
3749         ldxa    [%o0]ASI_MEM, %o5
3750 
        stxa    %g0, [%o1]ASI_DC_INVAL  ! invalidate dcache line for dst
3752         membar  #Sync
3753 
3754         stxa    %o2, [%o1]ASI_MEM
3755         add     %o1, 8, %o1
3756         stxa    %o3, [%o1]ASI_MEM
3757         add     %o1, 8, %o1
3758         stxa    %o4, [%o1]ASI_MEM
3759         add     %o1, 8, %o1
3760         stxa    %o5, [%o1]ASI_MEM
3761 
3762         retl
          wrpr    %g0, %g1, %pstate     ! restore saved pstate
3764 
3765         SET_SIZE(hw_pa_bcopy32)
3766 
3767 #endif /* lint */
3768 
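/*
 * The variables below are patchable tunables.  hw_copy_limit_<n> is,
 * for copies whose src/dst are <n>-byte alignable, the largest length
 * still handled by the leaf routines; zero disables the FP/VIS block
 * path for that class entirely.  use_hw_bcopy and use_hw_bzero gate
 * the block paths for bcopy and bzero.  As a hedged illustration, a
 * limit could be raised from /etc/system:
 *
 *      set hw_copy_limit_8 = 1024
 */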
3769 #if defined(lint)
3770 
3771 int use_hw_bcopy = 1;
3772 int use_hw_bzero = 1;
3773 uint_t hw_copy_limit_1 = 0;
3774 uint_t hw_copy_limit_2 = 0;
3775 uint_t hw_copy_limit_4 = 0;
3776 uint_t hw_copy_limit_8 = 0;
3777 
3778 #else /* !lint */
3779 
3780         DGDEF(use_hw_bcopy)
3781         .word   1
3782         DGDEF(use_hw_bzero)
3783         .word   1
3784         DGDEF(hw_copy_limit_1)
3785         .word   0
3786         DGDEF(hw_copy_limit_2)
3787         .word   0
3788         DGDEF(hw_copy_limit_4)
3789         .word   0
3790         DGDEF(hw_copy_limit_8)
3791         .word   0
3792 
3793         .align  64
3794         .section ".text"
3795 #endif /* !lint */