de-linting of .s files
--- old/usr/src/uts/sun4u/cpu/cheetah_copy.s
+++ new/usr/src/uts/sun4u/cpu/cheetah_copy.s
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License, Version 1.0 only
6 6 * (the "License"). You may not use this file except in compliance
7 7 * with the License.
8 8 *
9 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 10 * or http://www.opensolaris.org/os/licensing.
11 11 * See the License for the specific language governing permissions
12 12 * and limitations under the License.
13 13 *
14 14 * When distributing Covered Code, include this CDDL HEADER in each
15 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 16 * If applicable, add the following below this CDDL HEADER, with the
17 17 * fields enclosed by brackets "[]" replaced with your own identifying
18 18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 19 *
20 20 * CDDL HEADER END
21 21 */
22 22 /*
23 23 * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 */
26 26
27 -#pragma ident "%Z%%M% %I% %E% SMI"
28 -
29 27 #include <sys/param.h>
30 28 #include <sys/errno.h>
31 29 #include <sys/asm_linkage.h>
32 30 #include <sys/vtrace.h>
33 31 #include <sys/machthread.h>
34 32 #include <sys/clock.h>
35 33 #include <sys/asi.h>
36 34 #include <sys/fsr.h>
37 35 #include <sys/privregs.h>
38 36 #include <sys/fpras_impl.h>
39 37
40 -#if !defined(lint)
41 38 #include "assym.h"
42 -#endif /* lint */
43 39
44 40 /*
45 41 * Pseudo-code to aid in understanding the control flow of the
46 42 * bcopy/copyin/copyout routines.
47 43 *
48 44 * On entry:
49 45 *
50 46 * ! Determine whether to use the FP register version
51 47 * ! or the leaf routine version depending on size
52 48 * ! of copy and flags. Set up error handling accordingly.
53 49 * ! The transition point depends on whether the src and
54 50 * ! dst addresses can be aligned to long word, word,
55 51 * ! half word, or byte boundaries.
56 52 * !
57 53 * ! WARNING: <Register usage convention>
58 54 * ! For FP version, %l6 holds previous error handling and
59 55 * ! a flag: TRAMP_FLAG (low bits)
60 56 * ! for leaf routine version, %o4 holds those values.
61 57 * ! So either %l6 or %o4 is reserved and not available for
62 58 * ! any other use.
63 59 *
64 60 * if (length <= VIS_COPY_THRESHOLD) ! start with a quick test
65 61 * go to small_copy; ! to speed short copies
66 62 *
67 63 * if (src,dst long word alignable) {
68 64 * if (hw_copy_limit_8 == 0) ! hw_copy disabled
69 65 * go to small_copy;
70 66 * if (length <= hw_copy_limit_8)
71 67 * go to small_copy;
72 68 * go to FPBLK_copy;
73 69 * }
74 70 * if (src,dst not alignable) {
75 71 * if (hw_copy_limit_1 == 0) ! hw_copy disabled
76 72 * go to small_copy;
77 73 * if (length <= hw_copy_limit_1)
78 74 * go to small_copy;
79 75 * go to FPBLK_copy;
80 76 * }
81 77 * if (src,dst halfword alignable) {
82 78 * if (hw_copy_limit_2 == 0) ! hw_copy disabled
83 79 * go to small_copy;
84 80 * if (length <= hw_copy_limit_2)
85 81 * go to small_copy;
86 82 * go to FPBLK_copy;
87 83 * }
88 84 * if (src,dst word alignable) {
89 85 * if (hw_copy_limit_4 == 0) ! hw_copy disabled
90 86 * go to small_copy;
91 87 * if (length <= hw_copy_limit_4)
92 88 * go to small_copy;
93 89 * go to FPBLK_copy;
94 90 * }
95 91 *
96 92 * small_copy:
97 93 * Setup_leaf_rtn_error_handler; ! diffs for each entry point
98 94 *
99 95 * if (count <= 3) ! fast path for tiny copies
100 96 * go to sm_left; ! special finish up code
101 97 * else
102 98 * if (count > CHKSIZE) ! medium sized copies
103 99 * go to sm_med ! tuned by alignment
104 100 * if(src&dst not both word aligned) {
105 101 * sm_movebytes:
106 102 * move byte by byte in 4-way unrolled loop
107 103 * fall into sm_left;
108 104 * sm_left:
109 105 * move 0-3 bytes byte at a time as needed.
110 106 * restore error handler and exit.
111 107 *
112 108 * } else { ! src&dst are word aligned
113 109 * check for at least 8 bytes left,
114 110 * move word at a time, unrolled by 2
115 111 * when fewer than 8 bytes left,
116 112 * sm_half: move half word at a time while 2 or more bytes left
117 113 * sm_byte: move final byte if necessary
118 114 * sm_exit:
119 115 * restore error handler and exit.
120 116 * }
121 117 *
122 118 * ! Medium length cases with at least CHKSIZE bytes available
123 119 * ! method: line up src and dst as best possible, then
124 120 * ! move data in 4-way unrolled loops.
125 121 *
126 122 * sm_med:
127 123 * if(src&dst unalignable)
128 124 * go to sm_movebytes
129 125 * if(src&dst halfword alignable)
130 126 * go to sm_movehalf
131 127 * if(src&dst word alignable)
132 128 * go to sm_moveword
133 129 * ! fall into long word movement
134 130 * move bytes until src is word aligned
135 131 * if not long word aligned, move a word
136 132 * move long words in 4-way unrolled loop until < 32 bytes left
137 133 * move long words in 1-way unrolled loop until < 8 bytes left
138 134 * if zero bytes left, goto sm_exit
139 135 * if one byte left, go to sm_byte
140 136 * else go to sm_half
141 137 *
142 138 * sm_moveword:
143 139 * move bytes until src is word aligned
144 140 * move words in 4-way unrolled loop until < 16 bytes left
145 141 * move words in 1-way unrolled loop until < 4 bytes left
146 142 * if zero bytes left, goto sm_exit
147 143 * if one byte left, go to sm_byte
148 144 * else go to sm_half
149 145 *
150 146 * sm_movehalf:
151 147 * move a byte if needed to align src on halfword
152 148 * move halfwords in 4-way unrolled loop until < 8 bytes left
153 149 * if zero bytes left, goto sm_exit
154 150 * if one byte left, go to sm_byte
155 151 * else go to sm_half
156 152 *
157 153 *
158 154 * FPBLK_copy:
159 155 * %l6 = curthread->t_lofault;
160 156 * if (%l6 != NULL) {
161 157 * membar #Sync
162 158 * curthread->t_lofault = .copyerr;
163 159 * caller_error_handler = TRUE ! %l6 |= 2
164 160 * }
165 161 *
166 162 * ! for FPU testing we must not migrate cpus
167 163 * if (curthread->t_lwp == NULL) {
168 164 * ! Kernel threads do not have pcb's in which to store
169 165 * ! the floating point state, so disallow preemption during
170 166 * ! the copy. This also prevents cpu migration.
171 167 * kpreempt_disable(curthread);
172 168 * } else {
173 169 * thread_nomigrate();
174 170 * }
175 171 *
176 172 * old_fprs = %fprs;
177 173 * old_gsr = %gsr;
178 174 * if (%fprs.fef) {
179 175 * %fprs.fef = 1;
180 176 * save current fpregs on stack using blockstore
181 177 * } else {
182 178 * %fprs.fef = 1;
183 179 * }
184 180 *
185 181 *
186 182 * do_blockcopy_here;
187 183 *
188 184 * In lofault handler:
189 185 * curthread->t_lofault = .copyerr2;
190 186 * Continue on with the normal exit handler
191 187 *
192 188 * On normal exit:
193 189 * %gsr = old_gsr;
194 190 * if (old_fprs & FPRS_FEF)
195 191 * restore fpregs from stack using blockload
196 192 * else
197 193 * zero fpregs
198 194 * %fprs = old_fprs;
199 195 * membar #Sync
200 196 * curthread->t_lofault = (%l6 & ~3);
201 197 * ! following test omitted from copyin/copyout as they
202 198 * ! will always have a current thread
203 199 * if (curthread->t_lwp == NULL)
204 200 * kpreempt_enable(curthread);
205 201 * else
206 202 * thread_allowmigrate();
207 203 * return (0)
208 204 *
209 205 * In second lofault handler (.copyerr2):
210 206 * We've tried to restore fp state from the stack and failed. To
211 207 * prevent returning with a corrupted fp state, we will panic.
212 208 */
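
A minimal C model of the entry-point dispatch sketched above. It is not
part of this file; use_fpblk() is an invented name, and only the
hw_copy_limit_* variables and VIS_COPY_THRESHOLD come from the source.

    #include <stddef.h>
    #include <stdint.h>

    extern unsigned int hw_copy_limit_1, hw_copy_limit_2,
        hw_copy_limit_4, hw_copy_limit_8;
    #define VIS_COPY_THRESHOLD 256

    /* Nonzero when the FP/VIS block-copy path should be taken. */
    static int
    use_fpblk(const void *src, void *dst, size_t len)
    {
            uintptr_t d = (uintptr_t)src ^ (uintptr_t)dst;
            unsigned int limit;

            if (len <= VIS_COPY_THRESHOLD)  /* quick test speeds short copies */
                    return (0);
            if ((d & 7) == 0)               /* long word alignable: test first */
                    limit = hw_copy_limit_8;
            else if ((d & 1) != 0)          /* not even halfword alignable */
                    limit = hw_copy_limit_1;
            else if ((d & 3) != 0)          /* halfword alignable */
                    limit = hw_copy_limit_2;
            else                            /* word alignable */
                    limit = hw_copy_limit_4;
            return (limit != 0 && len > limit);     /* zero disables HW copy */
    }
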
213 209
214 210 /*
215 211 * Comments about optimization choices
216 212 *
217 213 * The initial optimization decision in this code is to determine
218 214 * whether to use the FP registers for a copy or not. If we don't
219 215 * use the FP registers, we can execute the copy as a leaf routine,
220 216 * saving a register save and restore. Also, less elaborate setup
221 217 * is required, allowing short copies to be completed more quickly.
222 218 * For longer copies, especially unaligned ones (where the src and
223 219 * dst do not align to allow simple ldx,stx operation), the FP
224 220 * registers allow much faster copy operations.
225 221 *
226 222 * The estimated extra cost of the FP path will vary depending on
227 223 * src/dst alignment, dst offset from the next 64 byte FPblock store
228 224 * boundary, remaining src data after the last full dst cache line is
229 225 * moved, whether the FP registers need to be saved, and some other
230 226 * minor issues. The average additional overhead is estimated to be
231 227 * 400 clocks. Since each non-repeated/predicted tst and branch costs
232 228 * around 10 clocks, elaborate calculation would slow down all
233 229 * longer copies and only benefit a small portion of medium sized
234 230 * copies. Rather than incur such cost, we chose fixed transition
235 231 * points for each of the alignment choices.
236 232 *
237 233 * For the inner loop, here is a comparison of the per cache line
238 234 * costs for each alignment when src&dst are in cache:
239 235 *
240 236 * byte aligned: 108 clocks slower for non-FPBLK
241 237 * half aligned: 44 clocks slower for non-FPBLK
242 238 * word aligned: 12 clocks slower for non-FPBLK
243 239 * long aligned: 4 clocks >>faster<< for non-FPBLK
244 240 *
245 241 * The long aligned loop runs faster because it does no prefetching.
246 242 * That wins if the data is not in cache or there is too little
247 243 * data to gain much benefit from prefetching. But when there
248 244 * is more data and that data is not in cache, failing to prefetch
249 245 * can run much slower. In addition, there is a 2 Kbyte store queue
250 246 * which will cause the non-FPBLK inner loop to slow for larger copies.
251 247 * The exact tradeoff is strongly load and application dependent, with
252 248 * increasing risk of a customer visible performance regression if the
253 249 * non-FPBLK code is used for larger copies. Studies of synthetic in-cache
254 250 * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
255 251 * upper limit for the non-FPBLK code. To minimize performance regression
256 252 * risk while still gaining the primary benefits of the improvements to
257 253 * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
258 254 * hw_copy_limit_*. Later experimental studies using different values
259 255 * of hw_copy_limit_* can be used to make further adjustments if
260 256 * appropriate.
261 257 *
262 258 * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
263 259 * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
264 260 * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
265 261 * hw_copy_limit_8 = src and dst are longword aligned
266 262 *
267 263 * To say that src and dst are word aligned means that after
268 264 * some initial alignment activity of moving 0 to 3 bytes,
269 265 * both the src and dst will be on word boundaries so that
270 266 * word loads and stores may be used.
271 267 *
272 268 * Recommended initial values as of Mar 2004, based on testing
273 269 * on Cheetah+ (900MHz), Cheetah++ (1200MHz), and Jaguar (1050MHz):
274 270 * hw_copy_limit_1 = 256
275 271 * hw_copy_limit_2 = 512
276 272 * hw_copy_limit_4 = 1024
277 273 * hw_copy_limit_8 = 1024 (or 1536 on some systems)
278 274 *
279 275 *
280 276 * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
281 277 * disabled for that alignment choice.
282 278 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256),
283 279 * the value of VIS_COPY_THRESHOLD is used.
284 280 * It is not envisioned that hw_copy_limit_? will be changed in the field.
285 281 * It is provided to allow for disabling FPBLK copies and to allow
286 282 * easy testing of alternate values on future HW implementations
287 283 * that might have different cache sizes, clock rates or instruction
288 284 * timing rules.
289 285 *
290 286 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
291 287 * threshold to speed up all shorter copies (less than 256). That
292 288 * saves an alignment test, memory reference, and enabling test
293 289 * for all short copies, or an estimated 24 clocks.
294 290 *
295 291 * The order in which these limits are checked does matter since each
296 292 * non-predicted tst and branch costs around 10 clocks.
297 293 * If src and dst are randomly selected addresses,
298 294 * 4 of 8 will not be alignable.
299 295 * 2 of 8 will be half word alignable.
300 296 * 1 of 8 will be word alignable.
301 297 * 1 of 8 will be long word alignable.
302 298 * But, tests on running kernels show that src and dst passed to copy code
303 299 * are typically not on random alignments. Structure copies and
304 300 * copies of larger data sizes are often on long word boundaries.
305 301 * So we test the long word alignment case first, then
306 302 * the byte alignment, then halfword, then word alignment.
307 303 *
308 304 * Several times, tests for length are made to split the code
309 305 * into subcases. These tests often allow later tests to be
310 306 * avoided. For example, within the non-FPBLK copy, we first
311 307 * check for tiny copies of 3 bytes or less. That allows us
312 308 * to use a 4-way unrolled loop for the general byte copy case
313 309 * without a test on loop entry.
314 310 * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
315 311 * vs longer cases. For the really short case, we don't attempt to
316 312 * align src and dst. We try to minimize special case tests in
317 313 * the shortest loops as each test adds a significant percentage
318 314 * to the total time.
319 315 *
320 316 * For the medium sized cases, we allow ourselves to adjust the
321 317 * src and dst alignment and provide special cases for each of
322 318 * the four adjusted alignment cases. The CHKSIZE that was used
323 319 * to decide between short and medium size was chosen to be 39
324 320 * as that allows for the worst case of 7 bytes of alignment
325 321 * shift and 4 times 8 bytes for the first long word unrolling.
326 322 * That knowledge saves an initial test for length on entry into
327 323 * the medium cases. If the general loop unrolling factor were
328 324 * to be increased, this number would also need to be adjusted.
329 325 *
330 326 * For all cases in the non-FPBLK code where it is known that at
331 327 * least 4 chunks of data are available for movement, the
332 328 * loop is unrolled by four. This 4-way loop runs in 8 clocks
333 329 * or 2 clocks per data element. Due to limitations of the
334 330 * branch instruction on Cheetah, Jaguar, and Panther, the
335 331 * minimum time for a small, tight loop is 3 clocks. So
336 332 * the 4-way loop runs 50% faster than the fastest non-unrolled
337 333 * loop.
338 334 *
339 335 * Instruction alignment is forced by use of .align 16 directives
340 336 * and nops which are not executed in the code. This
341 337 * combination of operations shifts the alignment of following
342 338 * loops to ensure that loops are aligned so that their instructions
343 339 * fall within the minimum number of 4 instruction fetch groups.
344 340 * If instructions are inserted or removed between the .align
345 341 * instruction and the unrolled loops, then the alignment needs
346 342 * to be readjusted. Misaligned loops can add a clock per loop
347 343 * iteration to the loop timing.
348 344 *
349 345 * In a few cases, code is duplicated to avoid a branch. Since
350 346 * a non-predicted tst and branch takes 10 clocks, this savings
351 347 * is judged an appropriate time-space tradeoff.
352 348 *
353 349 * Within the FPBLK-code, the prefetch method in the inner
354 350 * loop needs to be explained as it is not standard. Two
355 351 * prefetches are issued for each cache line instead of one.
356 352 * The primary one is at the maximum reach of 8 cache lines.
357 353 * Most of the time, that maximum prefetch reach gives the
358 354 * cache line more time to reach the processor for systems with
359 355 * higher processor clocks. But, sometimes memory interference
360 356 * can cause that prefetch to be dropped. Putting a second
361 357 * prefetch at a reach of 5 cache lines catches the drops
362 358 * three iterations later and shows a measured improvement
363 359 * in performance over any similar loop with a single prefetch.
364 360 * The prefetches are placed in the loop so they overlap with
365 361 * non-memory instructions, so that there is no extra cost
366 362 * when the data is already in-cache.
367 363 *
368 364 */
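
As a concrete illustration of the 4-way unrolling discussed above, here is
a C equivalent of a byte-copy loop in the style of .bc_sm_movebytes below.
This is a sketch only; the function name is invented and the kernel code is
the assembly itself.

    #include <stddef.h>

    /* 4-way unrolled byte copy: 4 elements per loop iteration. */
    static void
    copy_bytes_unrolled(const unsigned char *src, unsigned char *dst, size_t n)
    {
            while (n >= 4) {                /* main unrolled loop */
                    dst[0] = src[0];
                    dst[1] = src[1];
                    dst[2] = src[2];
                    dst[3] = src[3];
                    src += 4; dst += 4; n -= 4;
            }
            while (n--)                     /* 0-3 trailing bytes */
                    *dst++ = *src++;
    }
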
369 365
370 366 /*
371 367 * Notes on preserving existing fp state and on membars.
372 368 *
373 369 * When a copyOP decides to use fp we may have to preserve existing
374 370 * floating point state. It is not the caller's state that we need to
375 371 * preserve - the rest of the kernel does not use fp and, anyway, fp
376 372 * registers are volatile across a call. Some examples:
377 373 *
378 374 * - userland has fp state and is interrupted (device interrupt
379 375 * or trap) and within the interrupt/trap handling we use
380 376 * bcopy()
381 377 * - another (higher level) interrupt or trap handler uses bcopy
382 378 * while a bcopy from an earlier interrupt is still active
383 379 * - an asynchronous error trap occurs while fp state exists (in
384 380 * userland or in kernel copy) and the tl0 component of the handling
385 381 * uses bcopy
386 382 * - a user process with fp state incurs a copy-on-write fault and
387 383 * hwblkpagecopy always uses fp
388 384 *
389 385 * We therefore need a per-call place in which to preserve fp state -
390 386 * using our stack is ideal (and since fp copy cannot be leaf optimized
391 387 * because of calls it makes, this is no hardship).
392 388 *
393 389 * The following membar BLD/BST discussion is Cheetah pipeline specific.
394 390 * In Cheetah BLD is blocking, #LoadLoad/#LoadStore/#StoreStore are
395 391 * nops (those semantics always apply) and #StoreLoad is implemented
396 392 * as a membar #Sync.
397 393 *
398 394 * It is possible that the owner of the fp state has a block load or
399 395 * block store still "in flight" at the time we come to preserve that
400 396 * state. Block loads are blocking in Cheetah pipelines so we do not
401 397 * need to sync with them. In preserving fp regs we will use block stores
402 398 * (which are not blocking in Cheetah pipelines) so we require a membar #Sync
403 399 * after storing state (so that our subsequent use of those registers
404 400 * does not modify them before the block stores complete); this membar
405 401 * also serves to sync with block stores the owner of the fp state has
406 402 * initiated.
407 403 *
408 404 * When we have finished fp copy (with its repeated block stores)
409 405 * we must membar #Sync so that our block stores may complete before
410 406 * we either restore the original fp state into the fp registers or
411 407 * return to a caller which may initiate other fp operations that could
412 408 * modify the fp regs we used before the block stores complete.
413 409 *
414 410 * Synchronous faults (eg, unresolvable DMMU miss) that occur while
415 411 * t_lofault is not NULL will not panic but will instead trampoline
416 412 * to the registered lofault handler. There is no need for any
417 413 * membars for these - eg, our store to t_lofault will always be visible to
418 414 * ourselves and it is our cpu which will take any trap.
419 415 *
420 416 * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
421 417 * while t_lofault is not NULL will also not panic. Since we're copying
422 418 * to or from userland the extent of the damage is known - the destination
423 419 * buffer is incomplete. So trap handlers will trampoline to the lofault
424 420 * handler in this case which should take some form of error action to
425 421 * avoid using the incomplete buffer. The trap handler also flags the
426 422 * fault so that later return-from-trap handling (for the trap that brought
427 423 * this thread into the kernel in the first place) can notify the process
428 424 * and reboot the system (or restart the service with Greenline/Contracts).
429 425 *
430 426 * Asynchronous faults (eg, uncorrectable ECC error from memory) can
431 427 * result in deferred error traps - the trap is taken sometime after
432 428 * the event and the trap PC may not be the PC of the faulting access.
433 429 * Delivery of such pending traps can be forced by a membar #Sync, acting
434 430 * as an "error barrier" in this role. To accurately apply the user/kernel
435 431 * separation described in the preceding paragraph we must force delivery
436 432 * of deferred traps affecting kernel state before we install a lofault
437 433 * handler (if we interpose a new lofault handler on an existing one there
438 434 * is no need to repeat this), and we must force delivery of deferred
439 435 * errors affecting the lofault-protected region before we clear t_lofault.
440 436 * Failure to do so results in lost kernel state being interpreted as
441 437 * affecting a copyin/copyout only, or in an error that really only
442 438 * affects copy data being interpreted as losing kernel state.
443 439 *
444 440 * Since the copy operations may preserve and later restore floating
445 441 * point state that does not belong to the caller (see examples above),
446 442 * we must be careful in how we do this in order to prevent corruption
447 443 * of another program.
448 444 *
449 445 * To make sure that floating point state is always saved and restored
450 446 * correctly, the following "big rules" must be followed when the floating
451 447 * point registers will be used:
452 448 *
453 449 * 1. %l6 always holds the caller's lofault handler. Also in this register,
454 450 * Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
455 451 * use. Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a
456 452 * lofault handler was set coming in.
457 453 *
458 454 * 2. The FPUSED flag indicates that all FP state has been successfully stored
459 455 * on the stack. It should not be set until this save has been completed.
460 456 *
461 457 * 3. The FPUSED flag should not be cleared on exit until all FP state has
462 458 * been restored from the stack. If an error occurs while restoring
463 459 * data from the stack, the error handler can check this flag to see if
464 460 * a restore is necessary.
465 461 *
466 462 * 4. Code run under the new lofault handler must be kept to a minimum. In
467 463 * particular, any calls to FP_ALLOWMIGRATE, which could result in a call
468 464 * to kpreempt(), should not be made until after the lofault handler has
469 465 * been restored.
470 466 */
471 467
472 468 /*
473 469 * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
474 470 * to "break even" using FP/VIS-accelerated memory operations.
475 471 * The FPBLK code assumes a minimum number of bytes are available
476 472 * to be moved on entry. Check that code carefully before
477 473 * reducing VIS_COPY_THRESHOLD below 256.
478 474 */
479 475 /*
480 476 * This shadows sys/machsystm.h which can't be included due to the lack of
481 477 * _ASM guards in include files it references. Change it here, change it there.
482 478 */
483 479 #define VIS_COPY_THRESHOLD 256
484 480
485 481 /*
486 482 * TEST for very short copies
487 483 * Be aware that the maximum unroll for the short unaligned case
488 484 * is SHORTCOPY+1
489 485 */
490 486 #define SHORTCOPY 3
491 487 #define CHKSIZE 39
492 488
493 489 /*
494 490 * Indicates that we're to trampoline to the error handler.
495 491 * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
496 492 * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
497 493 */
498 494 #define FPUSED_FLAG 1
499 495 #define TRAMP_FLAG 2
500 496 #define MASK_FLAGS 3
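
Since lofault handler addresses are instruction addresses (at least 4-byte
aligned on SPARC), the two low bits of the saved pointer are free to carry
these flags. A hedged C illustration; everything beyond the three defines
above is invented for the example.

    #include <stdint.h>

    typedef void (*lofault_t)(void);

    /* Pack the saved handler and flags into one register-sized value. */
    static uintptr_t
    save_lofault(lofault_t prev, int trampoline)
    {
            uintptr_t v = (uintptr_t)prev;

            if (trampoline)
                    v |= TRAMP_FLAG;        /* error should trampoline to prev */
            return (v);
    }

    /* Recover the original handler, as "andn %l6, MASK_FLAGS, %l6" does. */
    static lofault_t
    restore_lofault(uintptr_t v)
    {
            return ((lofault_t)(v & ~(uintptr_t)MASK_FLAGS));
    }
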
501 497
502 498 /*
503 499 * Number of outstanding prefetches.
504 500 * Testing with 1200 MHz Cheetah+ and Jaguar gives best results with
505 501 * two prefetches, one with a reach of 8*BLOCK_SIZE+8 and one with a
506 502 * reach of 5*BLOCK_SIZE. The double prefetch gives a typical improvement
507 503 * of 5% for large copies as compared to a single prefetch. The reason
508 504 * for the improvement is that with Cheetah and Jaguar, some prefetches
509 505 * are dropped due to the prefetch queue being full. The second prefetch
510 506 * reduces the number of cache lines that are dropped.
511 507 * Do not remove the double prefetch or change either CHEETAH_PREFETCH
512 508 * or CHEETAH_2ND_PREFETCH without extensive performance tests to prove
513 509 * there is no loss of performance.
514 510 */
515 511 #define CHEETAH_PREFETCH 8
516 512 #define CHEETAH_2ND_PREFETCH 5
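
A hedged C sketch of the double-prefetch pattern described above, using
GCC's __builtin_prefetch as a stand-in for the SPARC "prefetch ... #one_read"
instruction; block_copy_sketch is an invented name and the inner byte loop
merely stands in for the FP block move.

    #include <stddef.h>

    #define SKETCH_BLOCKSIZE        64

    static void
    block_copy_sketch(const char *src, char *dst, size_t nblocks)
    {
            for (size_t i = 0; i < nblocks; i++) {
                    /* Primary prefetch at the maximum reach of 8 lines. */
                    __builtin_prefetch(src + CHEETAH_PREFETCH *
                        SKETCH_BLOCKSIZE + 8, 0);
                    /* Backup prefetch catches lines dropped from the queue. */
                    __builtin_prefetch(src + CHEETAH_2ND_PREFETCH *
                        SKETCH_BLOCKSIZE, 0);
                    for (int j = 0; j < SKETCH_BLOCKSIZE; j++)
                            dst[j] = src[j];        /* one 64-byte block */
                    src += SKETCH_BLOCKSIZE;
                    dst += SKETCH_BLOCKSIZE;
            }
    }
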
517 513
518 514 #define VIS_BLOCKSIZE 64
519 515
520 516 /*
521 517 * Size of stack frame in order to accommodate a 64-byte aligned
522 518 * floating-point register save area and 2 64-bit temp locations.
523 519 * All copy functions use two quadrants of fp registers; to ensure a
524 520 * block-aligned two block buffer in which to save we must reserve
525 521 * three blocks on stack. Not all functions preserve %fprs on stack
526 522 * or need to preserve %gsr, but we use HWCOPYFRAMESIZE for all.
527 523 *
528 524 * _______________________________________ <-- %fp + STACK_BIAS
529 525 * | We may need to preserve 2 quadrants |
530 526 * | of fp regs, but since we do so with |
531 527 * | BST/BLD we need room in which to |
532 528 * | align to VIS_BLOCKSIZE bytes. So |
533 529 * | this area is 3 * VIS_BLOCKSIZE. | <-- - SAVED_FPREGS_OFFSET
534 530 * |-------------------------------------|
535 531 * | 8 bytes to save %fprs | <-- - SAVED_FPRS_OFFSET
536 532 * |-------------------------------------|
537 533 * | 8 bytes to save %gsr | <-- - SAVED_GSR_OFFSET
538 534 * ---------------------------------------
539 535 */
540 536 #define HWCOPYFRAMESIZE ((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
541 537 #define SAVED_FPREGS_OFFSET (VIS_BLOCKSIZE * 3)
542 538 #define SAVED_FPREGS_ADJUST ((VIS_BLOCKSIZE * 2) - 1)
543 539 #define SAVED_FPRS_OFFSET (SAVED_FPREGS_OFFSET + 8)
544 540 #define SAVED_GSR_OFFSET (SAVED_FPRS_OFFSET + 8)
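
A hedged C rendering of the adjust-then-mask arithmetic that the BST/BLD
macros below use to locate the block-aligned save area; fpregs_save_area is
an invented name, and the claim in the comment assumes the usual 16-byte
alignment of %fp + STACK_BIAS.

    #include <stdint.h>

    /* Mirrors "add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST; and -VIS_BLOCKSIZE". */
    static uintptr_t
    fpregs_save_area(uintptr_t fp_plus_bias)
    {
            uintptr_t p = (fp_plus_bias - SAVED_FPREGS_ADJUST) &
                ~(uintptr_t)(VIS_BLOCKSIZE - 1);

            /*
             * For 16-byte-aligned fp_plus_bias, the 128-byte buffer
             * [p, p + 2 * VIS_BLOCKSIZE) fits inside the 192-byte
             * (3 * VIS_BLOCKSIZE) reservation below fp_plus_bias.
             */
            return (p);
    }
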
545 541
546 542 /*
547 543 * Common macros used by the various versions of the block copy
548 544 * routines in this file.
549 545 */
550 546
551 547 /*
552 548 * In FP copies, if we do not have preserved data to restore over
553 549 * the fp regs we used, then we must zero those regs to avoid
554 550 * exposing portions of the data to later threads (data security).
555 551 *
556 552 * Copy functions use either quadrants 1 and 3 or 2 and 4.
557 553 *
558 554 * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
559 555 * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
560 556 *
561 557 * The instructions below are quicker than repeated fzero instructions
562 558 * since they can dispatch down two fp pipelines.
563 559 */
564 560 #define FZEROQ1Q3 \
565 561 fzero %f0 ;\
566 562 fzero %f2 ;\
567 563 faddd %f0, %f2, %f4 ;\
568 564 fmuld %f0, %f2, %f6 ;\
569 565 faddd %f0, %f2, %f8 ;\
570 566 fmuld %f0, %f2, %f10 ;\
571 567 faddd %f0, %f2, %f12 ;\
572 568 fmuld %f0, %f2, %f14 ;\
573 569 faddd %f0, %f2, %f32 ;\
574 570 fmuld %f0, %f2, %f34 ;\
575 571 faddd %f0, %f2, %f36 ;\
576 572 fmuld %f0, %f2, %f38 ;\
577 573 faddd %f0, %f2, %f40 ;\
578 574 fmuld %f0, %f2, %f42 ;\
579 575 faddd %f0, %f2, %f44 ;\
580 576 fmuld %f0, %f2, %f46
581 577
582 578 #define FZEROQ2Q4 \
583 579 fzero %f16 ;\
584 580 fzero %f18 ;\
585 581 faddd %f16, %f18, %f20 ;\
586 582 fmuld %f16, %f18, %f22 ;\
587 583 faddd %f16, %f18, %f24 ;\
588 584 fmuld %f16, %f18, %f26 ;\
589 585 faddd %f16, %f18, %f28 ;\
590 586 fmuld %f16, %f18, %f30 ;\
591 587 faddd %f16, %f18, %f48 ;\
592 588 fmuld %f16, %f18, %f50 ;\
593 589 faddd %f16, %f18, %f52 ;\
594 590 fmuld %f16, %f18, %f54 ;\
595 591 faddd %f16, %f18, %f56 ;\
596 592 fmuld %f16, %f18, %f58 ;\
597 593 faddd %f16, %f18, %f60 ;\
598 594 fmuld %f16, %f18, %f62
599 595
600 596 /*
601 597 * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
602 598 * Used to save and restore in-use fp registers when we want to use FP
603 599 * and find fp already in use and copy size still large enough to justify
604 600 * the additional overhead of this save and restore.
605 601 *
606 602 * A membar #Sync is needed before save to sync fp ops initiated before
607 603 * the call to the copy function (by whoever has fp in use); for example
608 604 * an earlier block load to the quadrant we are about to save may still be
609 605 * "in flight". A membar #Sync is required at the end of the save to
610 606 * sync our block store (the copy code is about to begin ldd's to the
611 607 * first quadrant). Note, however, that since Cheetah pipeline block load
612 608 * is blocking we can omit the initial membar before saving fp state (they're
613 609 * commented below in case of future porting to a chip that does not block
614 610 * on block load).
615 611 *
616 612 * Similarly: a membar #Sync before restore allows the block stores of
617 613 * the copy operation to complete before we fill the quadrants with their
618 614 * original data, and a membar #Sync after restore lets the block loads
619 615 * of the restore complete before we return to whoever has the fp regs
620 616 * in use. To avoid repeated membar #Sync we make it the responsibility
621 617 * of the copy code to membar #Sync immediately after copy is complete
622 618 * and before using the BLD_*_FROMSTACK macro.
623 619 */
624 -#if !defined(lint)
625 620 #define BST_FPQ1Q3_TOSTACK(tmp1) \
626 621 /* membar #Sync */ ;\
627 622 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
628 623 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
629 624 stda %f0, [tmp1]ASI_BLK_P ;\
630 625 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
631 626 stda %f32, [tmp1]ASI_BLK_P ;\
632 627 membar #Sync
633 628
634 629 #define BLD_FPQ1Q3_FROMSTACK(tmp1) \
635 630 /* membar #Sync - provided at copy completion */ ;\
636 631 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
637 632 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
638 633 ldda [tmp1]ASI_BLK_P, %f0 ;\
639 634 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
640 635 ldda [tmp1]ASI_BLK_P, %f32 ;\
641 636 membar #Sync
642 637
643 638 #define BST_FPQ2Q4_TOSTACK(tmp1) \
644 639 /* membar #Sync */ ;\
645 640 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
646 641 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
647 642 stda %f16, [tmp1]ASI_BLK_P ;\
648 643 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
649 644 stda %f48, [tmp1]ASI_BLK_P ;\
650 645 membar #Sync
651 646
652 647 #define BLD_FPQ2Q4_FROMSTACK(tmp1) \
653 648 /* membar #Sync - provided at copy completion */ ;\
654 649 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
655 650 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
656 651 ldda [tmp1]ASI_BLK_P, %f16 ;\
657 652 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
658 653 ldda [tmp1]ASI_BLK_P, %f48 ;\
659 654 membar #Sync
660 -#endif
661 655
662 656 /*
663 657 * FP_NOMIGRATE and FP_ALLOWMIGRATE. Prevent migration (or, stronger,
664 658 * prevent preemption if there is no t_lwp to save FP state to on context
665 659 * switch) before commencing a FP copy, and reallow it on completion or
666 660 * in error trampoline paths when we were using FP copy.
667 661 *
668 662 * Both macros may call other functions, so be aware that all outputs are
669 663 * forfeit after using these macros. For this reason we do not pass registers
670 664 * to use - we just use any outputs we want.
671 665 *
672 666 * For fpRAS we need to perform the fpRAS mechanism test on the same
673 667 * CPU as we use for the copy operation, both so that we validate the
674 668 * CPU we perform the copy on and so that we know which CPU failed
675 669 * if a failure is detected. Hence we need to be bound to "our" CPU.
676 670 * This could be achieved through disabling preemption (and we do it that
677 671 * way for threads with no t_lwp) but for larger copies this may hold
678 672 * higher priority threads off of cpu for too long (eg, realtime). So we
679 673 * make use of the lightweight t_nomigrate mechanism where we can (ie, when
680 674 * we have a t_lwp).
681 675 *
682 676 * Pseudo code:
683 677 *
684 678 * FP_NOMIGRATE:
685 679 *
686 680 * if (curthread->t_lwp) {
687 681 * thread_nomigrate();
688 682 * } else {
689 683 * kpreempt_disable();
690 684 * }
691 685 *
692 686 * FP_ALLOWMIGRATE:
693 687 *
694 688 * if (curthread->t_lwp) {
695 689 * thread_allowmigrate();
696 690 * } else {
697 691 * kpreempt_enable();
698 692 * }
699 693 */
700 694
701 695 #define FP_NOMIGRATE(label1, label2) \
702 696 ldn [THREAD_REG + T_LWP], %o0 ;\
703 697 brz,a,pn %o0, label1/**/f ;\
704 698 ldsb [THREAD_REG + T_PREEMPT], %o1 ;\
705 699 call thread_nomigrate ;\
706 700 nop ;\
707 701 ba label2/**/f ;\
708 702 nop ;\
709 703 label1: ;\
710 704 inc %o1 ;\
711 705 stb %o1, [THREAD_REG + T_PREEMPT] ;\
712 706 label2:
713 707
714 708 #define FP_ALLOWMIGRATE(label1, label2) \
715 709 ldn [THREAD_REG + T_LWP], %o0 ;\
716 710 brz,a,pn %o0, label1/**/f ;\
717 711 ldsb [THREAD_REG + T_PREEMPT], %o1 ;\
718 712 call thread_allowmigrate ;\
719 713 nop ;\
720 714 ba label2/**/f ;\
721 715 nop ;\
722 716 label1: ;\
723 717 dec %o1 ;\
724 718 brnz,pn %o1, label2/**/f ;\
725 719 stb %o1, [THREAD_REG + T_PREEMPT] ;\
726 720 ldn [THREAD_REG + T_CPU], %o0 ;\
727 721 ldub [%o0 + CPU_KPRUNRUN], %o0 ;\
728 722 brz,pt %o0, label2/**/f ;\
729 723 nop ;\
730 724 call kpreempt ;\
731 725 rdpr %pil, %o0 ;\
732 726 label2:
733 727
734 728 /*
735 729 * Copy a block of storage, returning an error code if `from' or
736 730 * `to' takes a kernel pagefault which cannot be resolved.
737 731 * Returns errno value on pagefault error, 0 if all ok
738 732 */
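
A hedged caller-side sketch of the interface this comment describes. The
kcopy prototype matches the lint stub removed below; copy_record is an
invented example.

    #include <sys/types.h>
    #include <sys/errno.h>

    extern int kcopy(const void *from, void *to, size_t count);

    static int
    copy_record(const void *src, void *dst, size_t len)
    {
            int err;

            if ((err = kcopy(src, dst, len)) != 0)
                    return (err);   /* errno value, e.g. EFAULT on pagefault */
            return (0);
    }
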
739 733
740 -#if defined(lint)
741 -
742 -/* ARGSUSED */
743 -int
744 -kcopy(const void *from, void *to, size_t count)
745 -{ return(0); }
746 -
747 -#else /* lint */
748 -
749 734 .seg ".text"
750 735 .align 4
751 736
752 737 ENTRY(kcopy)
753 738
754 739 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
755 740 bleu,pt %ncc, .kcopy_small ! go to small copy cases
756 741 xor %o0, %o1, %o3 ! are src, dst alignable?
757 742 btst 7, %o3 !
758 743 bz,pt %ncc, .kcopy_8 ! check for longword alignment
759 744 nop
760 745 btst 1, %o3 !
761 746 bz,pt %ncc, .kcopy_2 ! check for half-word
762 747 nop
763 748 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
764 749 ld [%o3 + %lo(hw_copy_limit_1)], %o3
765 750 tst %o3
766 751 bz,pn %icc, .kcopy_small ! if zero, disable HW copy
767 752 cmp %o2, %o3 ! if length <= limit
768 753 bleu,pt %ncc, .kcopy_small ! go to small copy
769 754 nop
770 755 ba,pt %ncc, .kcopy_more ! otherwise go to large copy
771 756 nop
772 757 .kcopy_2:
773 758 btst 3, %o3 !
774 759 bz,pt %ncc, .kcopy_4 ! check for word alignment
775 760 nop
776 761 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
777 762 ld [%o3 + %lo(hw_copy_limit_2)], %o3
778 763 tst %o3
779 764 bz,pn %icc, .kcopy_small ! if zero, disable HW copy
780 765 cmp %o2, %o3 ! if length <= limit
781 766 bleu,pt %ncc, .kcopy_small ! go to small copy
782 767 nop
783 768 ba,pt %ncc, .kcopy_more ! otherwise go to large copy
784 769 nop
785 770 .kcopy_4:
786 771 ! already checked longword, must be word aligned
787 772 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
788 773 ld [%o3 + %lo(hw_copy_limit_4)], %o3
789 774 tst %o3
790 775 bz,pn %icc, .kcopy_small ! if zero, disable HW copy
791 776 cmp %o2, %o3 ! if length <= limit
792 777 bleu,pt %ncc, .kcopy_small ! go to small copy
793 778 nop
794 779 ba,pt %ncc, .kcopy_more ! otherwise go to large copy
795 780 nop
796 781 .kcopy_8:
797 782 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
798 783 ld [%o3 + %lo(hw_copy_limit_8)], %o3
799 784 tst %o3
800 785 bz,pn %icc, .kcopy_small ! if zero, disable HW copy
801 786 cmp %o2, %o3 ! if length <= limit
802 787 bleu,pt %ncc, .kcopy_small ! go to small copy
803 788 nop
804 789 ba,pt %ncc, .kcopy_more ! otherwise go to large copy
805 790 nop
806 791
807 792 .kcopy_small:
808 793 sethi %hi(.sm_copyerr), %o5 ! sm_copyerr is lofault value
809 794 or %o5, %lo(.sm_copyerr), %o5
810 795 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler
811 796 membar #Sync ! sync error barrier
812 797 ba,pt %ncc, .sm_do_copy ! common code
813 798 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault
814 799
815 800 .kcopy_more:
816 801 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
817 802 sethi %hi(.copyerr), %l7 ! copyerr is lofault value
818 803 or %l7, %lo(.copyerr), %l7
819 804 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler
820 805 membar #Sync ! sync error barrier
821 806 ba,pt %ncc, .do_copy ! common code
822 807 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
823 808
824 809
825 810 /*
826 811 * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
827 812 * Errno value is in %g1. bcopy_more uses fp quadrants 1 and 3.
828 813 */
829 814 .copyerr:
830 815 set .copyerr2, %l0
831 816 membar #Sync ! sync error barrier
832 817 stn %l0, [THREAD_REG + T_LOFAULT] ! set t_lofault
833 818 btst FPUSED_FLAG, %l6
834 819 bz %ncc, 1f
835 820 and %l6, TRAMP_FLAG, %l0 ! copy trampoline flag to %l0
836 821
837 822 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr
838 823 wr %o2, 0, %gsr
839 824
840 825 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
841 826 btst FPRS_FEF, %o3
842 827 bz,pt %icc, 4f
843 828 nop
844 829
845 830 BLD_FPQ1Q3_FROMSTACK(%o2)
846 831
847 832 ba,pt %ncc, 1f
848 833 wr %o3, 0, %fprs ! restore fprs
849 834
850 835 4:
851 836 FZEROQ1Q3
852 837 wr %o3, 0, %fprs ! restore fprs
853 838
854 839 !
855 840 ! Need to cater for the different expectations of kcopy
856 841 ! and bcopy. kcopy will *always* set a t_lofault handler
857 842 ! If it fires, we're expected to just return the error code
858 843 ! and *not* to invoke any existing error handler. As far as
859 844 ! bcopy is concerned, we only set t_lofault if there was an
860 845 ! existing lofault handler. In that case we're expected to
861 846 ! invoke the previously existing handler after resetting the
862 847 ! t_lofault value.
863 848 !
864 849 1:
865 850 andn %l6, MASK_FLAGS, %l6 ! turn trampoline flag off
866 851 membar #Sync ! sync error barrier
867 852 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
868 853 FP_ALLOWMIGRATE(5, 6)
869 854
870 855 btst TRAMP_FLAG, %l0
871 856 bnz,pn %ncc, 3f
872 857 nop
873 858 ret
874 859 restore %g1, 0, %o0
875 860
876 861 3:
877 862 !
878 863 ! We're here via bcopy. There *must* have been an error handler
879 864 ! in place otherwise we would have died a nasty death already.
880 865 !
881 866 jmp %l6 ! goto real handler
882 867 restore %g0, 0, %o0 ! dispose of copy window
883 868
884 869 /*
885 870 * We got here because of a fault in .copyerr. We can't safely restore fp
886 871 * state, so we panic.
887 872 */
888 873 fp_panic_msg:
889 874 .asciz "Unable to restore fp state after copy operation"
890 875
891 876 .align 4
892 877 .copyerr2:
893 878 set fp_panic_msg, %o0
894 879 call panic
895 880 nop
896 881
897 882 /*
898 883 * We got here because of a fault during a small kcopy or bcopy.
899 884 * No floating point registers are used by the small copies.
900 885 * Errno value is in %g1.
901 886 */
902 887 .sm_copyerr:
903 888 1:
904 889 btst TRAMP_FLAG, %o4
905 890 membar #Sync
906 891 andn %o4, TRAMP_FLAG, %o4
907 892 bnz,pn %ncc, 3f
908 893 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
909 894 retl
910 895 mov %g1, %o0
911 896 3:
912 897 jmp %o4 ! goto real handler
913 898 mov %g0, %o0 !
914 899
915 900 SET_SIZE(kcopy)
916 -#endif /* lint */
917 901
918 902
919 903 /*
920 904 * Copy a block of storage - must not overlap (from + len <= to).
921 905 * Registers: l6 - saved t_lofault
922 906 * (for short copies, o4 - saved t_lofault)
923 907 *
924 908 * Copy a page of memory.
925 909 * Assumes double word alignment and a count >= 256.
926 910 */
927 -#if defined(lint)
928 911
929 -/* ARGSUSED */
930 -void
931 -bcopy(const void *from, void *to, size_t count)
932 -{}
933 -
934 -#else /* lint */
935 -
936 912 ENTRY(bcopy)
937 913
938 914 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
939 915 bleu,pt %ncc, .bcopy_small ! go to small copy cases
940 916 xor %o0, %o1, %o3 ! are src, dst alignable?
941 917 btst 7, %o3 !
942 918 bz,pt %ncc, .bcopy_8 ! check for longword alignment
943 919 nop
944 920 btst 1, %o3 !
945 921 bz,pt %ncc, .bcopy_2 ! check for half-word
946 922 nop
947 923 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
948 924 ld [%o3 + %lo(hw_copy_limit_1)], %o3
949 925 tst %o3
950 926 bz,pn %icc, .bcopy_small ! if zero, disable HW copy
951 927 cmp %o2, %o3 ! if length <= limit
952 928 bleu,pt %ncc, .bcopy_small ! go to small copy
953 929 nop
954 930 ba,pt %ncc, .bcopy_more ! otherwise go to large copy
955 931 nop
956 932 .bcopy_2:
957 933 btst 3, %o3 !
958 934 bz,pt %ncc, .bcopy_4 ! check for word alignment
959 935 nop
960 936 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
961 937 ld [%o3 + %lo(hw_copy_limit_2)], %o3
962 938 tst %o3
963 939 bz,pn %icc, .bcopy_small ! if zero, disable HW copy
964 940 cmp %o2, %o3 ! if length <= limit
965 941 bleu,pt %ncc, .bcopy_small ! go to small copy
966 942 nop
967 943 ba,pt %ncc, .bcopy_more ! otherwise go to large copy
968 944 nop
969 945 .bcopy_4:
970 946 ! already checked longword, must be word aligned
971 947 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
972 948 ld [%o3 + %lo(hw_copy_limit_4)], %o3
973 949 tst %o3
974 950 bz,pn %icc, .bcopy_small ! if zero, disable HW copy
975 951 cmp %o2, %o3 ! if length <= limit
976 952 bleu,pt %ncc, .bcopy_small ! go to small copy
977 953 nop
978 954 ba,pt %ncc, .bcopy_more ! otherwise go to large copy
979 955 nop
980 956 .bcopy_8:
981 957 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
982 958 ld [%o3 + %lo(hw_copy_limit_8)], %o3
983 959 tst %o3
984 960 bz,pn %icc, .bcopy_small ! if zero, disable HW copy
985 961 cmp %o2, %o3 ! if length <= limit
986 962 bleu,pt %ncc, .bcopy_small ! go to small copy
987 963 nop
988 964 ba,pt %ncc, .bcopy_more ! otherwise go to large copy
989 965 nop
990 966
991 967 .align 16
992 968 .bcopy_small:
993 969 ldn [THREAD_REG + T_LOFAULT], %o4 ! save t_lofault
994 970 tst %o4
995 971 bz,pt %icc, .sm_do_copy
996 972 nop
997 973 sethi %hi(.sm_copyerr), %o5
998 974 or %o5, %lo(.sm_copyerr), %o5
999 975 membar #Sync ! sync error barrier
1000 976 stn %o5, [THREAD_REG + T_LOFAULT] ! install new vector
1001 977 or %o4, TRAMP_FLAG, %o4 ! error should trampoline
1002 978 .sm_do_copy:
1003 979 cmp %o2, SHORTCOPY ! check for really short case
1004 980 bleu,pt %ncc, .bc_sm_left !
1005 981 cmp %o2, CHKSIZE ! check for medium length cases
1006 982 bgu,pn %ncc, .bc_med !
1007 983 or %o0, %o1, %o3 ! prepare alignment check
1008 984 andcc %o3, 0x3, %g0 ! test for alignment
1009 985 bz,pt %ncc, .bc_sm_word ! branch to word aligned case
1010 986 .bc_sm_movebytes:
1011 987 sub %o2, 3, %o2 ! adjust count to allow cc zero test
1012 988 .bc_sm_notalign4:
1013 989 ldub [%o0], %o3 ! read byte
1014 990 stb %o3, [%o1] ! write byte
1015 991 subcc %o2, 4, %o2 ! reduce count by 4
1016 992 ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes
1017 993 add %o0, 4, %o0 ! advance SRC by 4
1018 994 stb %o3, [%o1 + 1]
1019 995 ldub [%o0 - 2], %o3
1020 996 add %o1, 4, %o1 ! advance DST by 4
1021 997 stb %o3, [%o1 - 2]
1022 998 ldub [%o0 - 1], %o3
1023 999 bgt,pt %ncc, .bc_sm_notalign4 ! loop til 3 or fewer bytes remain
1024 1000 stb %o3, [%o1 - 1]
1025 1001 add %o2, 3, %o2 ! restore count
1026 1002 .bc_sm_left:
1027 1003 tst %o2
1028 1004 bz,pt %ncc, .bc_sm_exit ! check for zero length
1029 1005 deccc %o2 ! reduce count for cc test
1030 1006 ldub [%o0], %o3 ! move one byte
1031 1007 bz,pt %ncc, .bc_sm_exit
1032 1008 stb %o3, [%o1]
1033 1009 ldub [%o0 + 1], %o3 ! move another byte
1034 1010 deccc %o2 ! check for more
1035 1011 bz,pt %ncc, .bc_sm_exit
1036 1012 stb %o3, [%o1 + 1]
1037 1013 ldub [%o0 + 2], %o3 ! move final byte
1038 1014 stb %o3, [%o1 + 2]
1039 1015 membar #Sync ! sync error barrier
1040 1016 andn %o4, TRAMP_FLAG, %o4
1041 1017 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1042 1018 retl
1043 1019 mov %g0, %o0 ! return 0
1044 1020 .align 16
1045 1021 nop ! instruction alignment
1046 1022 ! see discussion at start of file
1047 1023 .bc_sm_words:
1048 1024 lduw [%o0], %o3 ! read word
1049 1025 .bc_sm_wordx:
1050 1026 subcc %o2, 8, %o2 ! update count
1051 1027 stw %o3, [%o1] ! write word
1052 1028 add %o0, 8, %o0 ! update SRC
1053 1029 lduw [%o0 - 4], %o3 ! read word
1054 1030 add %o1, 8, %o1 ! update DST
1055 1031 bgt,pt %ncc, .bc_sm_words ! loop til done
1056 1032 stw %o3, [%o1 - 4] ! write word
1057 1033 addcc %o2, 7, %o2 ! restore count
1058 1034 bz,pt %ncc, .bc_sm_exit
1059 1035 deccc %o2
1060 1036 bz,pt %ncc, .bc_sm_byte
1061 1037 .bc_sm_half:
1062 1038 subcc %o2, 2, %o2 ! reduce count by 2
1063 1039 add %o0, 2, %o0 ! advance SRC by 2
1064 1040 lduh [%o0 - 2], %o3 ! read half word
1065 1041 add %o1, 2, %o1 ! advance DST by 2
1066 1042 bgt,pt %ncc, .bc_sm_half ! loop til done
1067 1043 sth %o3, [%o1 - 2] ! write half word
1068 1044 addcc %o2, 1, %o2 ! restore count
1069 1045 bz,pt %ncc, .bc_sm_exit
1070 1046 nop
1071 1047 .bc_sm_byte:
1072 1048 ldub [%o0], %o3
1073 1049 stb %o3, [%o1]
1074 1050 membar #Sync ! sync error barrier
1075 1051 andn %o4, TRAMP_FLAG, %o4
1076 1052 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1077 1053 retl
1078 1054 mov %g0, %o0 ! return 0
1079 1055
1080 1056 .bc_sm_word:
1081 1057 subcc %o2, 4, %o2 ! update count
1082 1058 bgt,pt %ncc, .bc_sm_wordx
1083 1059 lduw [%o0], %o3 ! read word
1084 1060 addcc %o2, 3, %o2 ! restore count
1085 1061 bz,pt %ncc, .bc_sm_exit
1086 1062 stw %o3, [%o1] ! write word
1087 1063 deccc %o2 ! reduce count for cc test
1088 1064 ldub [%o0 + 4], %o3 ! load one byte
1089 1065 bz,pt %ncc, .bc_sm_exit
1090 1066 stb %o3, [%o1 + 4] ! store one byte
1091 1067 ldub [%o0 + 5], %o3 ! load second byte
1092 1068 deccc %o2
1093 1069 bz,pt %ncc, .bc_sm_exit
1094 1070 stb %o3, [%o1 + 5] ! store second byte
1095 1071 ldub [%o0 + 6], %o3 ! load third byte
1096 1072 stb %o3, [%o1 + 6] ! store third byte
1097 1073 .bc_sm_exit:
1098 1074 membar #Sync ! sync error barrier
1099 1075 andn %o4, TRAMP_FLAG, %o4
1100 1076 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1101 1077 retl
1102 1078 mov %g0, %o0 ! return 0
1103 1079
1104 1080 .align 16
1105 1081 .bc_med:
1106 1082 xor %o0, %o1, %o3 ! setup alignment check
1107 1083 btst 1, %o3
1108 1084 bnz,pt %ncc, .bc_sm_movebytes ! unaligned
1109 1085 nop
1110 1086 btst 3, %o3
1111 1087 bnz,pt %ncc, .bc_med_half ! halfword aligned
1112 1088 nop
1113 1089 btst 7, %o3
1114 1090 bnz,pt %ncc, .bc_med_word ! word aligned
1115 1091 nop
1116 1092 .bc_med_long:
1117 1093 btst 3, %o0 ! check for
1118 1094 bz,pt %ncc, .bc_med_long1 ! word alignment
1119 1095 nop
1120 1096 .bc_med_long0:
1121 1097 ldub [%o0], %o3 ! load one byte
1122 1098 inc %o0
1123 1099 stb %o3,[%o1] ! store byte
1124 1100 inc %o1
1125 1101 btst 3, %o0
1126 1102 bnz,pt %ncc, .bc_med_long0
1127 1103 dec %o2
1128 1104 .bc_med_long1: ! word aligned
1129 1105 btst 7, %o0 ! check for long word
1130 1106 bz,pt %ncc, .bc_med_long2
1131 1107 nop
1132 1108 lduw [%o0], %o3 ! load word
1133 1109 add %o0, 4, %o0 ! advance SRC by 4
1134 1110 stw %o3, [%o1] ! store word
1135 1111 add %o1, 4, %o1 ! advance DST by 4
1136 1112 sub %o2, 4, %o2 ! reduce count by 4
1137 1113 !
1138 1114 ! Now long word aligned and have at least 32 bytes to move
1139 1115 !
1140 1116 .bc_med_long2:
1141 1117 sub %o2, 31, %o2 ! adjust count to allow cc zero test
1142 1118 .bc_med_lmove:
1143 1119 ldx [%o0], %o3 ! read long word
1144 1120 stx %o3, [%o1] ! write long word
1145 1121 subcc %o2, 32, %o2 ! reduce count by 32
1146 1122 ldx [%o0 + 8], %o3 ! repeat for a total of 4 long words
1147 1123 add %o0, 32, %o0 ! advance SRC by 32
1148 1124 stx %o3, [%o1 + 8]
1149 1125 ldx [%o0 - 16], %o3
1150 1126 add %o1, 32, %o1 ! advance DST by 32
1151 1127 stx %o3, [%o1 - 16]
1152 1128 ldx [%o0 - 8], %o3
1153 1129 bgt,pt %ncc, .bc_med_lmove ! loop til 31 or fewer bytes left
1154 1130 stx %o3, [%o1 - 8]
1155 1131 addcc %o2, 24, %o2 ! restore count to long word offset
1156 1132 ble,pt %ncc, .bc_med_lextra ! check for more long words to move
1157 1133 nop
1158 1134 .bc_med_lword:
1159 1135 ldx [%o0], %o3 ! read long word
1160 1136 subcc %o2, 8, %o2 ! reduce count by 8
1161 1137 stx %o3, [%o1] ! write long word
1162 1138 add %o0, 8, %o0 ! advance SRC by 8
1163 1139 bgt,pt %ncc, .bc_med_lword ! loop til 7 or fewer bytes left
1164 1140 add %o1, 8, %o1 ! advance DST by 8
1165 1141 .bc_med_lextra:
1166 1142 addcc %o2, 7, %o2 ! restore rest of count
1167 1143 bz,pt %ncc, .bc_sm_exit ! if zero, then done
1168 1144 deccc %o2
1169 1145 bz,pt %ncc, .bc_sm_byte
1170 1146 nop
1171 1147 ba,pt %ncc, .bc_sm_half
1172 1148 nop
1173 1149
1174 1150 .align 16
1175 1151 .bc_med_word:
1176 1152 btst 3, %o0 ! check for
1177 1153 bz,pt %ncc, .bc_med_word1 ! word alignment
1178 1154 nop
1179 1155 .bc_med_word0:
1180 1156 ldub [%o0], %o3 ! load one byte
1181 1157 inc %o0
1182 1158 stb %o3,[%o1] ! store byte
1183 1159 inc %o1
1184 1160 btst 3, %o0
1185 1161 bnz,pt %ncc, .bc_med_word0
1186 1162 dec %o2
1187 1163 !
1188 1164 ! Now word aligned and have at least 36 bytes to move
1189 1165 !
1190 1166 .bc_med_word1:
1191 1167 sub %o2, 15, %o2 ! adjust count to allow cc zero test
1192 1168 .bc_med_wmove:
1193 1169 lduw [%o0], %o3 ! read word
1194 1170 stw %o3, [%o1] ! write word
1195 1171 subcc %o2, 16, %o2 ! reduce count by 16
1196 1172 lduw [%o0 + 4], %o3 ! repeat for a total of 4 words
1197 1173 add %o0, 16, %o0 ! advance SRC by 16
1198 1174 stw %o3, [%o1 + 4]
1199 1175 lduw [%o0 - 8], %o3
1200 1176 add %o1, 16, %o1 ! advance DST by 16
1201 1177 stw %o3, [%o1 - 8]
1202 1178 lduw [%o0 - 4], %o3
1203 1179 bgt,pt %ncc, .bc_med_wmove ! loop til 15 or fewer bytes left
1204 1180 stw %o3, [%o1 - 4]
1205 1181 addcc %o2, 12, %o2 ! restore count to word offset
1206 1182 ble,pt %ncc, .bc_med_wextra ! check for more words to move
1207 1183 nop
1208 1184 .bc_med_word2:
1209 1185 lduw [%o0], %o3 ! read word
1210 1186 subcc %o2, 4, %o2 ! reduce count by 4
1211 1187 stw %o3, [%o1] ! write word
1212 1188 add %o0, 4, %o0 ! advance SRC by 4
1213 1189 bgt,pt %ncc, .bc_med_word2 ! loop til 3 or fewer bytes left
1214 1190 add %o1, 4, %o1 ! advance DST by 4
1215 1191 .bc_med_wextra:
1216 1192 addcc %o2, 3, %o2 ! restore rest of count
1217 1193 bz,pt %ncc, .bc_sm_exit ! if zero, then done
1218 1194 deccc %o2
1219 1195 bz,pt %ncc, .bc_sm_byte
1220 1196 nop
1221 1197 ba,pt %ncc, .bc_sm_half
1222 1198 nop
1223 1199
1224 1200 .align 16
1225 1201 .bc_med_half:
1226 1202 btst 1, %o0 ! check for
1227 1203 bz,pt %ncc, .bc_med_half1 ! half word alignment
1228 1204 nop
1229 1205 ldub [%o0], %o3 ! load one byte
1230 1206 inc %o0
1231 1207 stb %o3,[%o1] ! store byte
1232 1208 inc %o1
1233 1209 dec %o2
1234 1210 !
1235 1211 ! Now half word aligned and have at least 38 bytes to move
1236 1212 !
1237 1213 .bc_med_half1:
1238 1214 sub %o2, 7, %o2 ! adjust count to allow cc zero test
1239 1215 .bc_med_hmove:
1240 1216 lduh [%o0], %o3 ! read half word
1241 1217 sth %o3, [%o1] ! write half word
1242 1218 subcc %o2, 8, %o2 ! reduce count by 8
1243 1219 lduh [%o0 + 2], %o3 ! repeat for a total of 4 halfwords
1244 1220 add %o0, 8, %o0 ! advance SRC by 8
1245 1221 sth %o3, [%o1 + 2]
1246 1222 lduh [%o0 - 4], %o3
1247 1223 add %o1, 8, %o1 ! advance DST by 8
1248 1224 sth %o3, [%o1 - 4]
1249 1225 lduh [%o0 - 2], %o3
1250 1226 bgt,pt %ncc, .bc_med_hmove ! loop til 7 or fewer bytes left
1251 1227 sth %o3, [%o1 - 2]
1252 1228 addcc %o2, 7, %o2 ! restore count
1253 1229 bz,pt %ncc, .bc_sm_exit
1254 1230 deccc %o2
1255 1231 bz,pt %ncc, .bc_sm_byte
1256 1232 nop
1257 1233 ba,pt %ncc, .bc_sm_half
1258 1234 nop
1259 1235
1260 1236 SET_SIZE(bcopy)
1261 1237
1262 1238 /*
1263 1239 * The _more entry points are not intended to be used directly by
1264 1240 * any caller from outside this file. They are provided to allow
1265 1241 * profiling and dtrace of the portions of the copy code that use
1266 1242 * the floating point registers.
1267 1243 * This entry is particularly important as DTRACE (at least as of
1268 1244 * 4/2004) does not support leaf functions.
1269 1245 */
1270 1246
1271 1247 ENTRY(bcopy_more)
1272 1248 .bcopy_more:
1273 1249 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1274 1250 ldn [THREAD_REG + T_LOFAULT], %l6 ! save t_lofault
1275 1251 tst %l6
1276 1252 bz,pt %ncc, .do_copy
1277 1253 nop
1278 1254 sethi %hi(.copyerr), %o2
1279 1255 or %o2, %lo(.copyerr), %o2
1280 1256 membar #Sync ! sync error barrier
1281 1257 stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector
1282 1258 !
1283 1259 ! We've already captured whether t_lofault was zero on entry.
1284 1260 ! We need to mark ourselves as being from bcopy since both
1285 1261 ! kcopy and bcopy use the same code path. If TRAMP_FLAG is set
1286 1262 ! and the saved lofault was zero, we won't reset lofault on
1287 1263 ! returning.
1288 1264 !
1289 1265 or %l6, TRAMP_FLAG, %l6
1290 1266
1291 1267 /*
1292 1268 * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes
1293 1269 * Also, use of FP registers has been tested to be enabled
1294 1270 */
1295 1271 .do_copy:
1296 1272 FP_NOMIGRATE(6, 7)
1297 1273
1298 1274 rd %fprs, %o2 ! check for unused fp
1299 1275 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
1300 1276 btst FPRS_FEF, %o2
1301 1277 bz,a,pt %icc, .do_blockcopy
1302 1278 wr %g0, FPRS_FEF, %fprs
1303 1279
1304 1280 BST_FPQ1Q3_TOSTACK(%o2)
1305 1281
1306 1282 .do_blockcopy:
1307 1283 rd %gsr, %o2
1308 1284 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr
1309 1285 or %l6, FPUSED_FLAG, %l6
1310 1286
1311 1287 #define REALSRC %i0
1312 1288 #define DST %i1
1313 1289 #define CNT %i2
1314 1290 #define SRC %i3
1315 1291 #define TMP %i5
1316 1292
1317 1293 andcc DST, VIS_BLOCKSIZE - 1, TMP
1318 1294 bz,pt %ncc, 2f
1319 1295 neg TMP
1320 1296 add TMP, VIS_BLOCKSIZE, TMP
1321 1297
1322 1298 ! TMP = bytes required to align DST on FP_BLOCK boundary
1323 1299 ! Using SRC as a tmp here
1324 1300 cmp TMP, 3
1325 1301 bleu,pt %ncc, 1f
1326 1302 sub CNT,TMP,CNT ! adjust main count
1327 1303 sub TMP, 3, TMP ! adjust for end of loop test
1328 1304 .bc_blkalign:
1329 1305 ldub [REALSRC], SRC ! move 4 bytes per loop iteration
1330 1306 stb SRC, [DST]
1331 1307 subcc TMP, 4, TMP
1332 1308 ldub [REALSRC + 1], SRC
1333 1309 add REALSRC, 4, REALSRC
1334 1310 stb SRC, [DST + 1]
1335 1311 ldub [REALSRC - 2], SRC
1336 1312 add DST, 4, DST
1337 1313 stb SRC, [DST - 2]
1338 1314 ldub [REALSRC - 1], SRC
1339 1315 bgu,pt %ncc, .bc_blkalign
1340 1316 stb SRC, [DST - 1]
1341 1317
1342 1318 addcc TMP, 3, TMP ! restore count adjustment
1343 1319 bz,pt %ncc, 2f ! no bytes left?
1344 1320 nop
1345 1321 1: ldub [REALSRC], SRC
1346 1322 inc REALSRC
1347 1323 inc DST
1348 1324 deccc TMP
1349 1325 bgu %ncc, 1b
1350 1326 stb SRC, [DST - 1]
1351 1327
1352 1328 2:
1353 1329 andn REALSRC, 0x7, SRC
1354 1330 alignaddr REALSRC, %g0, %g0
1355 1331
1356 1332 ! SRC - 8-byte aligned
1357 1333 ! DST - 64-byte aligned
1358 1334 prefetch [SRC], #one_read
1359 1335 prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
1360 1336 prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
1361 1337 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
1362 1338 ldd [SRC], %f0
1363 1339 #if CHEETAH_PREFETCH > 4
1364 1340 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1365 1341 #endif
1366 1342 ldd [SRC + 0x08], %f2
1367 1343 #if CHEETAH_PREFETCH > 5
1368 1344 prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
1369 1345 #endif
1370 1346 ldd [SRC + 0x10], %f4
1371 1347 #if CHEETAH_PREFETCH > 6
1372 1348 prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
1373 1349 #endif
1374 1350 faligndata %f0, %f2, %f32
1375 1351 ldd [SRC + 0x18], %f6
1376 1352 #if CHEETAH_PREFETCH > 7
1377 1353 prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
1378 1354 #endif
1379 1355 faligndata %f2, %f4, %f34
1380 1356 ldd [SRC + 0x20], %f8
1381 1357 faligndata %f4, %f6, %f36
1382 1358 ldd [SRC + 0x28], %f10
1383 1359 faligndata %f6, %f8, %f38
1384 1360 ldd [SRC + 0x30], %f12
1385 1361 faligndata %f8, %f10, %f40
1386 1362 ldd [SRC + 0x38], %f14
1387 1363 faligndata %f10, %f12, %f42
1388 1364 ldd [SRC + VIS_BLOCKSIZE], %f0
1389 1365 sub CNT, VIS_BLOCKSIZE, CNT
1390 1366 add SRC, VIS_BLOCKSIZE, SRC
1391 1367 add REALSRC, VIS_BLOCKSIZE, REALSRC
1392 1368 ba,a,pt %ncc, 1f
1393 1369 nop
1394 1370 .align 16
1395 1371 1:
1396 1372 ldd [SRC + 0x08], %f2
1397 1373 faligndata %f12, %f14, %f44
1398 1374 ldd [SRC + 0x10], %f4
1399 1375 faligndata %f14, %f0, %f46
1400 1376 stda %f32, [DST]ASI_BLK_P
1401 1377 ldd [SRC + 0x18], %f6
1402 1378 faligndata %f0, %f2, %f32
1403 1379 ldd [SRC + 0x20], %f8
1404 1380 faligndata %f2, %f4, %f34
1405 1381 ldd [SRC + 0x28], %f10
1406 1382 faligndata %f4, %f6, %f36
1407 1383 ldd [SRC + 0x30], %f12
1408 1384 faligndata %f6, %f8, %f38
1409 1385 ldd [SRC + 0x38], %f14
1410 1386 faligndata %f8, %f10, %f40
1411 1387 sub CNT, VIS_BLOCKSIZE, CNT
1412 1388 ldd [SRC + VIS_BLOCKSIZE], %f0
1413 1389 faligndata %f10, %f12, %f42
1414 1390 prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
1415 1391 add DST, VIS_BLOCKSIZE, DST
1416 1392 prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1417 1393 add REALSRC, VIS_BLOCKSIZE, REALSRC
1418 1394 cmp CNT, VIS_BLOCKSIZE + 8
1419 1395 bgu,pt %ncc, 1b
1420 1396 add SRC, VIS_BLOCKSIZE, SRC
1421 1397
1422 1398 ! only if REALSRC & 0x7 is 0
1423 1399 cmp CNT, VIS_BLOCKSIZE
1424 1400 bne %ncc, 3f
1425 1401 andcc REALSRC, 0x7, %g0
1426 1402 bz,pt %ncc, 2f
1427 1403 nop
1428 1404 3:
1429 1405 faligndata %f12, %f14, %f44
1430 1406 faligndata %f14, %f0, %f46
1431 1407 stda %f32, [DST]ASI_BLK_P
1432 1408 add DST, VIS_BLOCKSIZE, DST
1433 1409 ba,pt %ncc, 3f
1434 1410 nop
1435 1411 2:
1436 1412 ldd [SRC + 0x08], %f2
1437 1413 fsrc1 %f12, %f44
1438 1414 ldd [SRC + 0x10], %f4
1439 1415 fsrc1 %f14, %f46
1440 1416 stda %f32, [DST]ASI_BLK_P
1441 1417 ldd [SRC + 0x18], %f6
1442 1418 fsrc1 %f0, %f32
1443 1419 ldd [SRC + 0x20], %f8
1444 1420 fsrc1 %f2, %f34
1445 1421 ldd [SRC + 0x28], %f10
1446 1422 fsrc1 %f4, %f36
1447 1423 ldd [SRC + 0x30], %f12
1448 1424 fsrc1 %f6, %f38
1449 1425 ldd [SRC + 0x38], %f14
1450 1426 fsrc1 %f8, %f40
1451 1427 sub CNT, VIS_BLOCKSIZE, CNT
1452 1428 add DST, VIS_BLOCKSIZE, DST
1453 1429 add SRC, VIS_BLOCKSIZE, SRC
1454 1430 add REALSRC, VIS_BLOCKSIZE, REALSRC
1455 1431 fsrc1 %f10, %f42
1456 1432 fsrc1 %f12, %f44
1457 1433 fsrc1 %f14, %f46
1458 1434 stda %f32, [DST]ASI_BLK_P
1459 1435 add DST, VIS_BLOCKSIZE, DST
1460 1436 ba,a,pt %ncc, .bcb_exit
1461 1437 nop
1462 1438
1463 1439 3: tst CNT
1464 1440 bz,a,pt %ncc, .bcb_exit
1465 1441 nop
1466 1442
1467 1443 5: ldub [REALSRC], TMP
1468 1444 inc REALSRC
1469 1445 inc DST
1470 1446 deccc CNT
1471 1447 bgu %ncc, 5b
1472 1448 stb TMP, [DST - 1]
1473 1449 .bcb_exit:
1474 1450 membar #Sync
1475 1451
1476 1452 FPRAS_INTERVAL(FPRAS_BCOPY, 0, %l5, %o2, %o3, %o4, %o5, 8)
1477 1453 FPRAS_REWRITE_TYPE2Q1(0, %l5, %o2, %o3, 8, 9)
1478 1454 FPRAS_CHECK(FPRAS_BCOPY, %l5, 9) ! outputs lost
1479 1455
1480 1456 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr
1481 1457 wr %o2, 0, %gsr
1482 1458
1483 1459 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1484 1460 btst FPRS_FEF, %o3
1485 1461 bz,pt %icc, 4f
1486 1462 nop
1487 1463
1488 1464 BLD_FPQ1Q3_FROMSTACK(%o2)
1489 1465
1490 1466 ba,pt %ncc, 2f
1491 1467 wr %o3, 0, %fprs ! restore fprs
1492 1468 4:
1493 1469 FZEROQ1Q3
1494 1470 wr %o3, 0, %fprs ! restore fprs
1495 1471 2:
1496 1472 membar #Sync ! sync error barrier
1497 1473 andn %l6, MASK_FLAGS, %l6
1498 1474 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1499 1475 FP_ALLOWMIGRATE(5, 6)
1500 1476 ret
1501 1477 restore %g0, 0, %o0
1502 1478
1503 1479 SET_SIZE(bcopy_more)
1504 1480
1505 -#endif /* lint */
1506 -
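
The large-copy path above falls into three phases: byte copies until DST
reaches a 64-byte boundary, faligndata/stda block moves, and a bytewise
tail. A C model of that shape (illustrative sketch only; the name is
hypothetical, cnt is assumed larger than VIS_COPY_THRESHOLD as on the
real path, and memcpy stands in for the VIS alignment and ASI_BLK_P
block-store mechanics, which have no C equivalent):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    #define VIS_BLOCKSIZE   64

    static void
    fp_blockcopy_model(const char *src, char *dst, size_t cnt)
    {
            /* Phase 1: byte copy until dst is 64-byte aligned. */
            size_t head = (uintptr_t)dst & (VIS_BLOCKSIZE - 1);
            if (head != 0) {
                    head = VIS_BLOCKSIZE - head;
                    memcpy(dst, src, head);
                    src += head;
                    dst += head;
                    cnt -= head;
            }
            /* Phase 2: 64-byte blocks (faligndata + stda above). */
            while (cnt >= VIS_BLOCKSIZE) {
                    memcpy(dst, src, VIS_BLOCKSIZE);
                    src += VIS_BLOCKSIZE;
                    dst += VIS_BLOCKSIZE;
                    cnt -= VIS_BLOCKSIZE;
            }
            /* Phase 3: trailing bytes (the "5:" loop above). */
            while (cnt-- > 0)
                    *dst++ = *src++;
    }
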
1507 1481 /*
1508 1482 * Block copy with possibly overlapped operands.
1509 1483 */
1510 1484
1511 -#if defined(lint)
1512 -
1513 -/*ARGSUSED*/
1514 -void
1515 -ovbcopy(const void *from, void *to, size_t count)
1516 -{}
1517 -
1518 -#else /* lint */
1519 -
1520 1485 ENTRY(ovbcopy)
1521 1486 tst %o2 ! check count
1522 1487 bgu,a %ncc, 1f ! nothing to do or bad arguments
1523 1488 subcc %o0, %o1, %o3 ! difference of from and to address
1524 1489
1525 1490 retl ! return
1526 1491 nop
1527 1492 1:
1528 1493 bneg,a %ncc, 2f
1529 1494 neg %o3 ! if < 0, make it positive
1530 1495 2: cmp %o2, %o3 ! cmp size and abs(from - to)
1531 1496 bleu %ncc, bcopy ! if size <= abs(diff): use bcopy,
1532 1497 .empty ! no overlap
1533 1498 cmp %o0, %o1 ! compare from and to addresses
1534 1499 blu %ncc, .ov_bkwd ! if from < to, copy backwards
1535 1500 nop
1536 1501 !
1537 1502 ! Copy forwards.
1538 1503 !
1539 1504 .ov_fwd:
1540 1505 ldub [%o0], %o3 ! read from address
1541 1506 inc %o0 ! inc from address
1542 1507 stb %o3, [%o1] ! write to address
1543 1508 deccc %o2 ! dec count
1544 1509 bgu %ncc, .ov_fwd ! loop till done
1545 1510 inc %o1 ! inc to address
1546 1511
1547 1512 retl ! return
1548 1513 nop
1549 1514 !
1550 1515 ! Copy backwards.
1551 1516 !
1552 1517 .ov_bkwd:
1553 1518 deccc %o2 ! dec count
1554 1519 ldub [%o0 + %o2], %o3 ! get byte at end of src
1555 1520 bgu %ncc, .ov_bkwd ! loop till done
1556 1521 stb %o3, [%o1 + %o2] ! delay slot, store at end of dst
1557 1522
1558 1523 retl ! return
1559 1524 nop
1560 1525
1561 1526 SET_SIZE(ovbcopy)
1562 1527
1563 -#endif /* lint */
1564 1528
1565 -
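
The overlap rule ovbcopy applies above is simple: if the regions are
disjoint, hand off to bcopy; otherwise copy a byte at a time, backwards
when the source lies below the destination. A C restatement (sketch;
the name is hypothetical, and cross-object pointer comparison is taken
as given, as it is in the kernel):

    #include <stddef.h>

    static void
    ovbcopy_model(const char *from, char *to, size_t count)
    {
            ptrdiff_t d = from - to;
            size_t dist = (size_t)(d < 0 ? -d : d);

            if (count == 0)
                    return;
            if (dist >= count || from > to) {
                    /* disjoint, or src above dst: forward is safe */
                    while (count-- > 0)
                            *to++ = *from++;
            } else {
                    /* overlapping with src below dst: go backwards */
                    while (count-- > 0)
                            to[count] = from[count];
            }
    }
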
1566 1529 /*
1567 1530 * hwblkpagecopy()
1568 1531 *
1569 1532 * Copies exactly one page. This routine assumes the caller (ppcopy)
1570 1533 * has already disabled kernel preemption and has checked
1571 1534 * use_hw_bcopy. Preventing preemption also prevents cpu migration.
1572 1535 */
1573 -#ifdef lint
1574 -/*ARGSUSED*/
1575 -void
1576 -hwblkpagecopy(const void *src, void *dst)
1577 -{ }
1578 -#else /* lint */
1579 1536 ENTRY(hwblkpagecopy)
1580 1537 ! get another window w/space for three aligned blocks of saved fpregs
1581 1538 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1582 1539
1583 1540 ! %i0 - source address (arg)
1584 1541 ! %i1 - destination address (arg)
1585 1542 ! %i2 - length of region (not arg)
1586 1543 ! %l0 - saved fprs
1587 1544 ! %l1 - pointer to saved fpregs
1588 1545
1589 1546 rd %fprs, %l0 ! check for unused fp
1590 1547 btst FPRS_FEF, %l0
1591 1548 bz,a,pt %icc, 1f
1592 1549 wr %g0, FPRS_FEF, %fprs
1593 1550
1594 1551 BST_FPQ1Q3_TOSTACK(%l1)
1595 1552
1596 1553 1: set PAGESIZE, CNT
1597 1554 mov REALSRC, SRC
1598 1555
1599 1556 prefetch [SRC], #one_read
1600 1557 prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
1601 1558 prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
1602 1559 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
1603 1560 ldd [SRC], %f0
1604 1561 #if CHEETAH_PREFETCH > 4
1605 1562 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1606 1563 #endif
1607 1564 ldd [SRC + 0x08], %f2
1608 1565 #if CHEETAH_PREFETCH > 5
1609 1566 prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
1610 1567 #endif
1611 1568 ldd [SRC + 0x10], %f4
1612 1569 #if CHEETAH_PREFETCH > 6
1613 1570 prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
1614 1571 #endif
1615 1572 fsrc1 %f0, %f32
1616 1573 ldd [SRC + 0x18], %f6
1617 1574 #if CHEETAH_PREFETCH > 7
1618 1575 prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
1619 1576 #endif
1620 1577 fsrc1 %f2, %f34
1621 1578 ldd [SRC + 0x20], %f8
1622 1579 fsrc1 %f4, %f36
1623 1580 ldd [SRC + 0x28], %f10
1624 1581 fsrc1 %f6, %f38
1625 1582 ldd [SRC + 0x30], %f12
1626 1583 fsrc1 %f8, %f40
1627 1584 ldd [SRC + 0x38], %f14
1628 1585 fsrc1 %f10, %f42
1629 1586 ldd [SRC + VIS_BLOCKSIZE], %f0
1630 1587 sub CNT, VIS_BLOCKSIZE, CNT
1631 1588 add SRC, VIS_BLOCKSIZE, SRC
1632 1589 ba,a,pt %ncc, 2f
1633 1590 nop
1634 1591 .align 16
1635 1592 2:
1636 1593 ldd [SRC + 0x08], %f2
1637 1594 fsrc1 %f12, %f44
1638 1595 ldd [SRC + 0x10], %f4
1639 1596 fsrc1 %f14, %f46
1640 1597 stda %f32, [DST]ASI_BLK_P
1641 1598 ldd [SRC + 0x18], %f6
1642 1599 fsrc1 %f0, %f32
1643 1600 ldd [SRC + 0x20], %f8
1644 1601 fsrc1 %f2, %f34
1645 1602 ldd [SRC + 0x28], %f10
1646 1603 fsrc1 %f4, %f36
1647 1604 ldd [SRC + 0x30], %f12
1648 1605 fsrc1 %f6, %f38
1649 1606 ldd [SRC + 0x38], %f14
1650 1607 fsrc1 %f8, %f40
1651 1608 ldd [SRC + VIS_BLOCKSIZE], %f0
1652 1609 fsrc1 %f10, %f42
1653 1610 prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
1654 1611 sub CNT, VIS_BLOCKSIZE, CNT
1655 1612 add DST, VIS_BLOCKSIZE, DST
1656 1613 cmp CNT, VIS_BLOCKSIZE + 8
1657 1614 prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1658 1615 bgu,pt %ncc, 2b
1659 1616 add SRC, VIS_BLOCKSIZE, SRC
1660 1617
1661 1618 ! trailing block
1662 1619 ldd [SRC + 0x08], %f2
1663 1620 fsrc1 %f12, %f44
1664 1621 ldd [SRC + 0x10], %f4
1665 1622 fsrc1 %f14, %f46
1666 1623 stda %f32, [DST]ASI_BLK_P
1667 1624 ldd [SRC + 0x18], %f6
1668 1625 fsrc1 %f0, %f32
1669 1626 ldd [SRC + 0x20], %f8
1670 1627 fsrc1 %f2, %f34
1671 1628 ldd [SRC + 0x28], %f10
1672 1629 fsrc1 %f4, %f36
1673 1630 ldd [SRC + 0x30], %f12
1674 1631 fsrc1 %f6, %f38
1675 1632 ldd [SRC + 0x38], %f14
1676 1633 fsrc1 %f8, %f40
1677 1634 sub CNT, VIS_BLOCKSIZE, CNT
1678 1635 add DST, VIS_BLOCKSIZE, DST
1679 1636 add SRC, VIS_BLOCKSIZE, SRC
1680 1637 fsrc1 %f10, %f42
1681 1638 fsrc1 %f12, %f44
1682 1639 fsrc1 %f14, %f46
1683 1640 stda %f32, [DST]ASI_BLK_P
1684 1641
1685 1642 membar #Sync
1686 1643
1687 1644 FPRAS_INTERVAL(FPRAS_PGCOPY, 1, %l5, %o2, %o3, %o4, %o5, 8)
1688 1645 FPRAS_REWRITE_TYPE1(1, %l5, %f32, %o2, 9)
1689 1646 FPRAS_CHECK(FPRAS_PGCOPY, %l5, 9) ! lose outputs
1690 1647
1691 1648 btst FPRS_FEF, %l0
1692 1649 bz,pt %icc, 2f
1693 1650 nop
1694 1651
1695 1652 BLD_FPQ1Q3_FROMSTACK(%l3)
1696 1653 ba 3f
1697 1654 nop
1698 1655
1699 1656 2: FZEROQ1Q3
1700 1657
1701 1658 3: wr %l0, 0, %fprs ! restore fprs
1702 1659 ret
1703 1660 restore %g0, 0, %o0
1704 1661
1705 1662 SET_SIZE(hwblkpagecopy)
1706 -#endif /* lint */
1707 1663
1708 1664
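hwblkpagecopy above has no error handling and no alignment prologue:
both pages are page-aligned, so every pass moves a full 64-byte block
and fsrc1 replaces faligndata. A sketch of the caller protocol it
relies on, assuming a ppcopy-like wrapper (hypothetical name and
simplified arguments; the real ppcopy deals in page_t pointers, and
kpreempt_disable, kpreempt_enable and the use_hw_bcopy tunable are the
pieces the comment above names):

    #include <stddef.h>

    extern int      use_hw_bcopy;
    extern void     hwblkpagecopy(const void *, void *);
    extern void     kpreempt_disable(void);
    extern void     kpreempt_enable(void);
    extern void     bcopy(const void *, void *, size_t);

    static void
    ppcopy_model(const void *src_pg, void *dst_pg, size_t pagesize)
    {
            if (use_hw_bcopy) {
                    kpreempt_disable();     /* no preemption, no migration */
                    hwblkpagecopy(src_pg, dst_pg);
                    kpreempt_enable();
            } else {
                    bcopy(src_pg, dst_pg, pagesize);
            }
    }
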
1709 1665 /*
1710 1666 * Transfer data to and from user space -
1711 1667	 * Note that these routines can cause faults.
1712 1668	 * It is assumed that the kernel maps nothing below
1713 1669	 * KERNELBASE in the virtual address space.
1714 1670 *
1715 1671 * Note that copyin(9F) and copyout(9F) are part of the
1716 1672 * DDI/DKI which specifies that they return '-1' on "errors."
1717 1673 *
1718 1674 * Sigh.
1719 1675 *
1720 1676	 * So there are two extremely similar routines - xcopyin() and xcopyout()
1721 1677 * which return the errno that we've faithfully computed. This
1722 1678 * allows other callers (e.g. uiomove(9F)) to work correctly.
1723 1679 * Given that these are used pretty heavily, we expand the calling
1724 1680 * sequences inline for all flavours (rather than making wrappers).
1725 1681 *
1726 1682 * There are also stub routines for xcopyout_little and xcopyin_little,
1727 1683 * which currently are intended to handle requests of <= 16 bytes from
1728 1684 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
1729 1685 * is left as an exercise...
1730 1686 */
1731 1687
1732 1688 /*
1733 1689	 * Copy data to and from user space (copyOP/xcopyOP/copyOP_noerr)
1734 1690 *
1735 1691 * General theory of operation:
1736 1692 *
1737 1693 * The only difference between copy{in,out} and
1738 1694 * xcopy{in,out} is in the error handling routine they invoke
1739 1695 * when a memory access error occurs. xcopyOP returns the errno
1740 1696 * while copyOP returns -1 (see above). copy{in,out}_noerr set
1741 1697 * a special flag (by oring the TRAMP_FLAG into the fault handler address)
1742 1698 * if they are called with a fault handler already in place. That flag
1743 1699 * causes the default handlers to trampoline to the previous handler
1744 1700 * upon an error.
1745 1701 *
1746 1702 * None of the copyops routines grab a window until it's decided that
1747 1703 * we need to do a HW block copy operation. This saves a window
1748 1704 * spill/fill when we're called during socket ops. The typical IO
1749 1705 * path won't cause spill/fill traps.
1750 1706 *
1751 1707 * This code uses a set of 4 limits for the maximum size that will
1752 1708 * be copied given a particular input/output address alignment.
1753 1709 * If the value for a particular limit is zero, the copy will be performed
1754 1710 * by the plain copy loops rather than FPBLK.
1755 1711 *
1756 1712 * See the description of bcopy above for more details of the
1757 1713 * data copying algorithm and the default limits.
1758 1714 *
1759 1715 */
1760 1716
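The errno-versus-(-1) split described above comes down to a one-line
difference per flavour. A sketch (do_protected_copy is a hypothetical
stand-in for the shared copy path running under t_lofault; it returns
0 or an errno):

    #include <stddef.h>

    extern int do_protected_copy(const void *, void *, size_t);

    int
    copyout_model(const void *kaddr, void *uaddr, size_t count)
    {
            int err = do_protected_copy(kaddr, uaddr, count);

            return (err == 0 ? 0 : -1);     /* DDI/DKI contract */
    }

    int
    xcopyout_model(const void *kaddr, void *uaddr, size_t count)
    {
            return (do_protected_copy(kaddr, uaddr, count));
    }
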
1761 1717 /*
1762 1718 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
1763 1719 */
1764 1720
1765 -#if defined(lint)
1766 -
1767 -
1768 -#else /* lint */
1769 1721 /*
1770 1722 * We save the arguments in the following registers in case of a fault:
1771 1723 * kaddr - %l1
1772 1724 * uaddr - %l2
1773 1725 * count - %l3
1774 1726 */
1775 1727 #define SAVE_SRC %l1
1776 1728 #define SAVE_DST %l2
1777 1729 #define SAVE_COUNT %l3
1778 1730
1779 1731 #define SM_SAVE_SRC %g4
1780 1732 #define SM_SAVE_DST %g5
1781 1733 #define SM_SAVE_COUNT %o5
1782 1734 #define ERRNO %l5
1783 1735
1784 1736
1785 1737 #define REAL_LOFAULT %l4
1786 1738 /*
1787 1739 * Generic copyio fault handler. This is the first line of defense when a
1788 1740 * fault occurs in (x)copyin/(x)copyout. In order for this to function
1789 1741 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
1790 1742 * This allows us to share common code for all the flavors of the copy
1791 1743 * operations, including the _noerr versions.
1792 1744 *
1793 1745 * Note that this function will restore the original input parameters before
1794 1746 * calling REAL_LOFAULT. So the real handler can vector to the appropriate
1795 1747 * member of the t_copyop structure, if needed.
1796 1748 */
1797 1749 ENTRY(copyio_fault)
1798 1750 membar #Sync
1799 1751 mov %g1,ERRNO ! save errno in ERRNO
1800 1752 btst FPUSED_FLAG, %l6
1801 1753 bz %ncc, 1f
1802 1754 nop
1803 1755
1804 1756 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
1805 1757 wr %o2, 0, %gsr ! restore gsr
1806 1758
1807 1759 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1808 1760 btst FPRS_FEF, %o3
1809 1761 bz,pt %icc, 4f
1810 1762 nop
1811 1763
1812 1764 BLD_FPQ2Q4_FROMSTACK(%o2)
1813 1765
1814 1766 ba,pt %ncc, 1f
1815 1767 wr %o3, 0, %fprs ! restore fprs
1816 1768
1817 1769 4:
1818 1770 FZEROQ2Q4
1819 1771 wr %o3, 0, %fprs ! restore fprs
1820 1772
1821 1773 1:
1822 1774 andn %l6, FPUSED_FLAG, %l6
1823 1775 membar #Sync
1824 1776 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1825 1777 FP_ALLOWMIGRATE(5, 6)
1826 1778
1827 1779 mov SAVE_SRC, %i0
1828 1780 mov SAVE_DST, %i1
1829 1781 jmp REAL_LOFAULT
1830 1782 mov SAVE_COUNT, %i2
1831 1783
1832 1784 SET_SIZE(copyio_fault)
1833 1785
1834 1786
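Restated in C, copyio_fault's contract is small (sketch; the FP state
unwinding is elided, the struct is a stand-in for the SAVE_SRC,
SAVE_DST and SAVE_COUNT registers, and the errno arrives in %g1 in the
real code):

    #include <stddef.h>

    struct saved_args {
            const void      *src;
            void            *dst;
            size_t          count;
    };

    static void
    copyio_fault_model(struct saved_args *sa, int err,
        void (*real_lofault)(const void *, void *, size_t, int))
    {
            /* restore %gsr, %fprs and the FP queues if FPUSED_FLAG set */
            real_lofault(sa->src, sa->dst, sa->count, err);
    }
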
1835 -#endif
1836 -
1837 -#if defined(lint)
1838 -
1839 -/*ARGSUSED*/
1840 -int
1841 -copyout(const void *kaddr, void *uaddr, size_t count)
1842 -{ return (0); }
1843 -
1844 -#else /* lint */
1845 -
1846 1787 ENTRY(copyout)
1847 1788
1848 1789 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
1849 1790 bleu,pt %ncc, .copyout_small ! go to larger cases
1850 1791 xor %o0, %o1, %o3 ! are src, dst alignable?
1851 1792 btst 7, %o3 !
1852 1793 bz,pt %ncc, .copyout_8 ! check for longword alignment
1853 1794 nop
1854 1795 btst 1, %o3 !
1855 1796 bz,pt %ncc, .copyout_2 ! check for half-word
1856 1797 nop
1857 1798 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
1858 1799 ld [%o3 + %lo(hw_copy_limit_1)], %o3
1859 1800 tst %o3
1860 1801 bz,pn %icc, .copyout_small ! if zero, disable HW copy
1861 1802 cmp %o2, %o3 ! if length <= limit
1862 1803 bleu,pt %ncc, .copyout_small ! go to small copy
1863 1804 nop
1864 1805 ba,pt %ncc, .copyout_more ! otherwise go to large copy
1865 1806 nop
1866 1807 .copyout_2:
1867 1808 btst 3, %o3 !
1868 1809 bz,pt %ncc, .copyout_4 ! check for word alignment
1869 1810 nop
1870 1811 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
1871 1812 ld [%o3 + %lo(hw_copy_limit_2)], %o3
1872 1813 tst %o3
1873 1814 bz,pn %icc, .copyout_small ! if zero, disable HW copy
1874 1815 cmp %o2, %o3 ! if length <= limit
1875 1816 bleu,pt %ncc, .copyout_small ! go to small copy
1876 1817 nop
1877 1818 ba,pt %ncc, .copyout_more ! otherwise go to large copy
1878 1819 nop
1879 1820 .copyout_4:
1880 1821 ! already checked longword, must be word aligned
1881 1822 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
1882 1823 ld [%o3 + %lo(hw_copy_limit_4)], %o3
1883 1824 tst %o3
1884 1825 bz,pn %icc, .copyout_small ! if zero, disable HW copy
1885 1826 cmp %o2, %o3 ! if length <= limit
1886 1827 bleu,pt %ncc, .copyout_small ! go to small copy
1887 1828 nop
1888 1829 ba,pt %ncc, .copyout_more ! otherwise go to large copy
1889 1830 nop
1890 1831 .copyout_8:
1891 1832 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
1892 1833 ld [%o3 + %lo(hw_copy_limit_8)], %o3
1893 1834 tst %o3
1894 1835 bz,pn %icc, .copyout_small ! if zero, disable HW copy
1895 1836 cmp %o2, %o3 ! if length <= limit
1896 1837 bleu,pt %ncc, .copyout_small ! go to small copy
1897 1838 nop
1898 1839 ba,pt %ncc, .copyout_more ! otherwise go to large copy
1899 1840 nop
1900 1841
1901 1842 .align 16
1902 1843 nop ! instruction alignment
1903 1844 ! see discussion at start of file
1904 1845 .copyout_small:
1905 1846 sethi %hi(.sm_copyout_err), %o5 ! .sm_copyout_err is lofault
1906 1847 or %o5, %lo(.sm_copyout_err), %o5
1907 1848 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler
1908 1849 membar #Sync ! sync error barrier
1909 1850 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault
1910 1851 .sm_do_copyout:
1911 1852 mov %o0, SM_SAVE_SRC
1912 1853 mov %o1, SM_SAVE_DST
1913 1854 cmp %o2, SHORTCOPY ! check for really short case
1914 1855 bleu,pt %ncc, .co_sm_left !
1915 1856 mov %o2, SM_SAVE_COUNT
1916 1857 cmp %o2, CHKSIZE ! check for medium length cases
1917 1858 bgu,pn %ncc, .co_med !
1918 1859 or %o0, %o1, %o3 ! prepare alignment check
1919 1860 andcc %o3, 0x3, %g0 ! test for alignment
1920 1861 bz,pt %ncc, .co_sm_word ! branch to word aligned case
1921 1862 .co_sm_movebytes:
1922 1863 sub %o2, 3, %o2 ! adjust count to allow cc zero test
1923 1864 .co_sm_notalign4:
1924 1865 ldub [%o0], %o3 ! read byte
1925 1866 subcc %o2, 4, %o2 ! reduce count by 4
1926 1867 stba %o3, [%o1]ASI_USER ! write byte
1927 1868 inc %o1 ! advance DST by 1
1928 1869 ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes
1929 1870 add %o0, 4, %o0 ! advance SRC by 4
1930 1871 stba %o3, [%o1]ASI_USER
1931 1872 inc %o1 ! advance DST by 1
1932 1873 ldub [%o0 - 2], %o3
1933 1874 stba %o3, [%o1]ASI_USER
1934 1875 inc %o1 ! advance DST by 1
1935 1876 ldub [%o0 - 1], %o3
1936 1877 stba %o3, [%o1]ASI_USER
1937 1878 bgt,pt %ncc, .co_sm_notalign4 ! loop til 3 or fewer bytes remain
1938 1879 inc %o1 ! advance DST by 1
1939 1880 add %o2, 3, %o2 ! restore count
1940 1881 .co_sm_left:
1941 1882 tst %o2
1942 1883 bz,pt %ncc, .co_sm_exit ! check for zero length
1943 1884 nop
1944 1885 ldub [%o0], %o3 ! load one byte
1945 1886 deccc %o2 ! reduce count for cc test
1946 1887 bz,pt %ncc, .co_sm_exit
1947 1888 stba %o3,[%o1]ASI_USER ! store one byte
1948 1889 ldub [%o0 + 1], %o3 ! load second byte
1949 1890 deccc %o2
1950 1891 inc %o1
1951 1892 bz,pt %ncc, .co_sm_exit
1952 1893 stba %o3,[%o1]ASI_USER ! store second byte
1953 1894 ldub [%o0 + 2], %o3 ! load third byte
1954 1895 inc %o1
1955 1896 stba %o3,[%o1]ASI_USER ! store third byte
1956 1897 membar #Sync ! sync error barrier
1957 1898 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1958 1899 retl
1959 1900 mov %g0, %o0 ! return 0
1960 1901 .align 16
1961 1902 .co_sm_words:
1962 1903 lduw [%o0], %o3 ! read word
1963 1904 .co_sm_wordx:
1964 1905 subcc %o2, 8, %o2 ! update count
1965 1906 stwa %o3, [%o1]ASI_USER ! write word
1966 1907 add %o0, 8, %o0 ! update SRC
1967 1908 lduw [%o0 - 4], %o3 ! read word
1968 1909 add %o1, 4, %o1 ! update DST
1969 1910 stwa %o3, [%o1]ASI_USER ! write word
1970 1911 bgt,pt %ncc, .co_sm_words ! loop til done
1971 1912 add %o1, 4, %o1 ! update DST
1972 1913 addcc %o2, 7, %o2 ! restore count
1973 1914 bz,pt %ncc, .co_sm_exit
1974 1915 nop
1975 1916 deccc %o2
1976 1917 bz,pt %ncc, .co_sm_byte
1977 1918 .co_sm_half:
1978 1919 subcc %o2, 2, %o2 ! reduce count by 2
1979 1920 lduh [%o0], %o3 ! read half word
1980 1921 add %o0, 2, %o0 ! advance SRC by 2
1981 1922 stha %o3, [%o1]ASI_USER ! write half word
1982 1923 bgt,pt %ncc, .co_sm_half ! loop til done
1983 1924 add %o1, 2, %o1 ! advance DST by 2
1984 1925 addcc %o2, 1, %o2 ! restore count
1985 1926 bz,pt %ncc, .co_sm_exit
1986 1927 nop
1987 1928 .co_sm_byte:
1988 1929 ldub [%o0], %o3
1989 1930 stba %o3, [%o1]ASI_USER
1990 1931 membar #Sync ! sync error barrier
1991 1932 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1992 1933 retl
1993 1934 mov %g0, %o0 ! return 0
1994 1935 .align 16
1995 1936 .co_sm_word:
1996 1937 subcc %o2, 4, %o2 ! update count
1997 1938 bgt,pt %ncc, .co_sm_wordx
1998 1939 lduw [%o0], %o3 ! read word
1999 1940 addcc %o2, 3, %o2 ! restore count
2000 1941 bz,pt %ncc, .co_sm_exit
2001 1942 stwa %o3, [%o1]ASI_USER ! write word
2002 1943 deccc %o2 ! reduce count for cc test
2003 1944 ldub [%o0 + 4], %o3 ! load one byte
2004 1945 add %o1, 4, %o1
2005 1946 bz,pt %ncc, .co_sm_exit
2006 1947 stba %o3, [%o1]ASI_USER ! store one byte
2007 1948 ldub [%o0 + 5], %o3 ! load second byte
2008 1949 deccc %o2
2009 1950 inc %o1
2010 1951 bz,pt %ncc, .co_sm_exit
2011 1952 stba %o3, [%o1]ASI_USER ! store second byte
2012 1953 ldub [%o0 + 6], %o3 ! load third byte
2013 1954 inc %o1
2014 1955 stba %o3, [%o1]ASI_USER ! store third byte
2015 1956 .co_sm_exit:
2016 1957 membar #Sync ! sync error barrier
2017 1958 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2018 1959 retl
2019 1960 mov %g0, %o0 ! return 0
2020 1961
2021 1962 .align 16
2022 1963 .co_med:
2023 1964 xor %o0, %o1, %o3 ! setup alignment check
2024 1965 btst 1, %o3
2025 1966 bnz,pt %ncc, .co_sm_movebytes ! unaligned
2026 1967 nop
2027 1968 btst 3, %o3
2028 1969 bnz,pt %ncc, .co_med_half ! halfword aligned
2029 1970 nop
2030 1971 btst 7, %o3
2031 1972 bnz,pt %ncc, .co_med_word ! word aligned
2032 1973 nop
2033 1974 .co_med_long:
2034 1975 btst 3, %o0 ! check for
2035 1976 bz,pt %ncc, .co_med_long1 ! word alignment
2036 1977 nop
2037 1978 .co_med_long0:
2038 1979 ldub [%o0], %o3 ! load one byte
2039 1980 inc %o0
2040 1981 stba %o3,[%o1]ASI_USER ! store byte
2041 1982 inc %o1
2042 1983 btst 3, %o0
2043 1984 bnz,pt %ncc, .co_med_long0
2044 1985 dec %o2
2045 1986 .co_med_long1: ! word aligned
2046 1987 btst 7, %o0 ! check for long word
2047 1988 bz,pt %ncc, .co_med_long2
2048 1989 nop
2049 1990 lduw [%o0], %o3 ! load word
2050 1991 add %o0, 4, %o0 ! advance SRC by 4
2051 1992 stwa %o3, [%o1]ASI_USER ! store word
2052 1993 add %o1, 4, %o1 ! advance DST by 4
2053 1994 sub %o2, 4, %o2 ! reduce count by 4
2054 1995 !
2055 1996 ! Now long word aligned and have at least 32 bytes to move
2056 1997 !
2057 1998 .co_med_long2:
2058 1999 sub %o2, 31, %o2 ! adjust count to allow cc zero test
2059 2000 sub %o1, 8, %o1 ! adjust pointer to allow store in
2060 2001 ! branch delay slot instead of add
2061 2002 .co_med_lmove:
2062 2003 add %o1, 8, %o1 ! advance DST by 8
2063 2004 ldx [%o0], %o3 ! read long word
2064 2005 subcc %o2, 32, %o2 ! reduce count by 32
2065 2006 stxa %o3, [%o1]ASI_USER ! write long word
2066 2007 add %o1, 8, %o1 ! advance DST by 8
2067 2008	ldx	[%o0 + 8], %o3	! repeat for a total of 4 long words
2068 2009 add %o0, 32, %o0 ! advance SRC by 32
2069 2010 stxa %o3, [%o1]ASI_USER
2070 2011 ldx [%o0 - 16], %o3
2071 2012 add %o1, 8, %o1 ! advance DST by 8
2072 2013 stxa %o3, [%o1]ASI_USER
2073 2014 ldx [%o0 - 8], %o3
2074 2015 add %o1, 8, %o1 ! advance DST by 8
2075 2016 bgt,pt %ncc, .co_med_lmove ! loop til 31 or fewer bytes left
2076 2017 stxa %o3, [%o1]ASI_USER
2077 2018 add %o1, 8, %o1 ! advance DST by 8
2078 2019 addcc %o2, 24, %o2 ! restore count to long word offset
2079 2020 ble,pt %ncc, .co_med_lextra ! check for more long words to move
2080 2021 nop
2081 2022 .co_med_lword:
2082 2023 ldx [%o0], %o3 ! read long word
2083 2024 subcc %o2, 8, %o2 ! reduce count by 8
2084 2025 stxa %o3, [%o1]ASI_USER ! write long word
2085 2026 add %o0, 8, %o0 ! advance SRC by 8
2086 2027 bgt,pt %ncc, .co_med_lword ! loop til 7 or fewer bytes left
2087 2028 add %o1, 8, %o1 ! advance DST by 8
2088 2029 .co_med_lextra:
2089 2030 addcc %o2, 7, %o2 ! restore rest of count
2090 2031 bz,pt %ncc, .co_sm_exit ! if zero, then done
2091 2032 deccc %o2
2092 2033 bz,pt %ncc, .co_sm_byte
2093 2034 nop
2094 2035 ba,pt %ncc, .co_sm_half
2095 2036 nop
2096 2037
2097 2038 .align 16
2098 2039 nop ! instruction alignment
2099 2040 ! see discussion at start of file
2100 2041 .co_med_word:
2101 2042 btst 3, %o0 ! check for
2102 2043 bz,pt %ncc, .co_med_word1 ! word alignment
2103 2044 nop
2104 2045 .co_med_word0:
2105 2046 ldub [%o0], %o3 ! load one byte
2106 2047 inc %o0
2107 2048 stba %o3,[%o1]ASI_USER ! store byte
2108 2049 inc %o1
2109 2050 btst 3, %o0
2110 2051 bnz,pt %ncc, .co_med_word0
2111 2052 dec %o2
2112 2053 !
2113 2054 ! Now word aligned and have at least 36 bytes to move
2114 2055 !
2115 2056 .co_med_word1:
2116 2057 sub %o2, 15, %o2 ! adjust count to allow cc zero test
2117 2058 .co_med_wmove:
2118 2059 lduw [%o0], %o3 ! read word
2119 2060 subcc %o2, 16, %o2 ! reduce count by 16
2120 2061 stwa %o3, [%o1]ASI_USER ! write word
2121 2062 add %o1, 4, %o1 ! advance DST by 4
2122 2063	lduw	[%o0 + 4], %o3	! repeat for a total of 4 words
2123 2064 add %o0, 16, %o0 ! advance SRC by 16
2124 2065 stwa %o3, [%o1]ASI_USER
2125 2066 add %o1, 4, %o1 ! advance DST by 4
2126 2067 lduw [%o0 - 8], %o3
2127 2068 stwa %o3, [%o1]ASI_USER
2128 2069 add %o1, 4, %o1 ! advance DST by 4
2129 2070 lduw [%o0 - 4], %o3
2130 2071 stwa %o3, [%o1]ASI_USER
2131 2072 bgt,pt %ncc, .co_med_wmove ! loop til 15 or fewer bytes left
2132 2073 add %o1, 4, %o1 ! advance DST by 4
2133 2074 addcc %o2, 12, %o2 ! restore count to word offset
2134 2075 ble,pt %ncc, .co_med_wextra ! check for more words to move
2135 2076 nop
2136 2077 .co_med_word2:
2137 2078 lduw [%o0], %o3 ! read word
2138 2079 subcc %o2, 4, %o2 ! reduce count by 4
2139 2080 stwa %o3, [%o1]ASI_USER ! write word
2140 2081 add %o0, 4, %o0 ! advance SRC by 4
2141 2082 bgt,pt %ncc, .co_med_word2 ! loop til 3 or fewer bytes left
2142 2083 add %o1, 4, %o1 ! advance DST by 4
2143 2084 .co_med_wextra:
2144 2085 addcc %o2, 3, %o2 ! restore rest of count
2145 2086 bz,pt %ncc, .co_sm_exit ! if zero, then done
2146 2087 deccc %o2
2147 2088 bz,pt %ncc, .co_sm_byte
2148 2089 nop
2149 2090 ba,pt %ncc, .co_sm_half
2150 2091 nop
2151 2092
2152 2093 .align 16
2153 2094 nop ! instruction alignment
2154 2095 nop ! see discussion at start of file
2155 2096 nop
2156 2097 .co_med_half:
2157 2098 btst 1, %o0 ! check for
2158 2099 bz,pt %ncc, .co_med_half1 ! half word alignment
2159 2100 nop
2160 2101 ldub [%o0], %o3 ! load one byte
2161 2102 inc %o0
2162 2103 stba %o3,[%o1]ASI_USER ! store byte
2163 2104 inc %o1
2164 2105 dec %o2
2165 2106 !
2166 2107 ! Now half word aligned and have at least 38 bytes to move
2167 2108 !
2168 2109 .co_med_half1:
2169 2110 sub %o2, 7, %o2 ! adjust count to allow cc zero test
2170 2111 .co_med_hmove:
2171 2112 lduh [%o0], %o3 ! read half word
2172 2113 subcc %o2, 8, %o2 ! reduce count by 8
2173 2114 stha %o3, [%o1]ASI_USER ! write half word
2174 2115 add %o1, 2, %o1 ! advance DST by 2
2175 2116	lduh	[%o0 + 2], %o3	! repeat for a total of 4 halfwords
2176 2117 add %o0, 8, %o0 ! advance SRC by 8
2177 2118 stha %o3, [%o1]ASI_USER
2178 2119 add %o1, 2, %o1 ! advance DST by 2
2179 2120 lduh [%o0 - 4], %o3
2180 2121 stha %o3, [%o1]ASI_USER
2181 2122 add %o1, 2, %o1 ! advance DST by 2
2182 2123 lduh [%o0 - 2], %o3
2183 2124 stha %o3, [%o1]ASI_USER
2184 2125 bgt,pt %ncc, .co_med_hmove ! loop til 7 or fewer bytes left
2185 2126 add %o1, 2, %o1 ! advance DST by 2
2186 2127 addcc %o2, 7, %o2 ! restore count
2187 2128 bz,pt %ncc, .co_sm_exit
2188 2129 deccc %o2
2189 2130 bz,pt %ncc, .co_sm_byte
2190 2131 nop
2191 2132 ba,pt %ncc, .co_sm_half
2192 2133 nop
2193 2134
2194 2135 /*
2195 2136 * We got here because of a fault during short copyout.
2196 2137 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
2197 2138 */
2198 2139 .sm_copyout_err:
2199 2140 membar #Sync
2200 2141 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2201 2142 mov SM_SAVE_SRC, %o0
2202 2143 mov SM_SAVE_DST, %o1
2203 2144 mov SM_SAVE_COUNT, %o2
2204 2145 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler
2205 2146 tst %o3
2206 2147 bz,pt %ncc, 3f ! if not, return error
2207 2148 nop
2208 2149 ldn [%o3 + CP_COPYOUT], %o5 ! if handler, invoke it with
2209 2150 jmp %o5 ! original arguments
2210 2151 nop
2211 2152 3:
2212 2153 retl
2213 2154 or %g0, -1, %o0 ! return error value
2214 2155
2215 2156 SET_SIZE(copyout)
2216 2157
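All three medium loops above (.co_med_lmove, .co_med_wmove,
.co_med_hmove) use the same counting trick: bias the count down by the
unroll size minus one so the single subcc per pass doubles as the
"enough bytes left" test, then add the bias back to size the tail. The
long-word case in C (sketch; hypothetical name, ASI_USER stores
modeled as plain stores, and the caller guarantees at least 32 bytes,
as .co_med_long2 does):

    #include <stddef.h>
    #include <stdint.h>

    static size_t
    med_lmove_model(const uint64_t **srcp, uint64_t **dstp, size_t count)
    {
            const uint64_t *src = *srcp;
            uint64_t *dst = *dstp;
            long n = (long)count - 31;      /* bias for the loop test */

            do {
                    dst[0] = src[0];        /* 4 long words per pass */
                    dst[1] = src[1];
                    dst[2] = src[2];
                    dst[3] = src[3];
                    src += 4;
                    dst += 4;
                    n -= 32;
            } while (n > 0);

            *srcp = src;
            *dstp = dst;
            return ((size_t)(n + 31));      /* 0..31 bytes for the tails */
    }
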
2217 2158 /*
2218 2159 * The _more entry points are not intended to be used directly by
2219 2160 * any caller from outside this file. They are provided to allow
2220 2161	 * profiling and dtrace of the portions of the copy code that use
2221 2162 * the floating point registers.
2222 2163 * This entry is particularly important as DTRACE (at least as of
2223 2164 * 4/2004) does not support leaf functions.
2224 2165 */
2225 2166
2226 2167 ENTRY(copyout_more)
2227 2168 .copyout_more:
2228 2169 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2229 2170 set .copyout_err, REAL_LOFAULT
2230 2171
2231 2172 /*
2232 2173 * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
2233 2174 */
2234 2175 .do_copyout:
2235 2176	set	copyio_fault, %l7		! copyio_fault is lofault val
2236 2177
2237 2178 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler
2238 2179 membar #Sync ! sync error barrier
2239 2180 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
2240 2181
2241 2182 mov %i0, SAVE_SRC
2242 2183 mov %i1, SAVE_DST
2243 2184 mov %i2, SAVE_COUNT
2244 2185
2245 2186 FP_NOMIGRATE(6, 7)
2246 2187
2247 2188 rd %fprs, %o2 ! check for unused fp
2248 2189 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
2249 2190 btst FPRS_FEF, %o2
2250 2191 bz,a,pt %icc, .do_blockcopyout
2251 2192 wr %g0, FPRS_FEF, %fprs
2252 2193
2253 2194 BST_FPQ2Q4_TOSTACK(%o2)
2254 2195
2255 2196 .do_blockcopyout:
2256 2197 rd %gsr, %o2
2257 2198 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr
2258 2199 or %l6, FPUSED_FLAG, %l6
2259 2200
2260 2201 andcc DST, VIS_BLOCKSIZE - 1, TMP
2261 2202 mov ASI_USER, %asi
2262 2203 bz,pt %ncc, 2f
2263 2204 neg TMP
2264 2205 add TMP, VIS_BLOCKSIZE, TMP
2265 2206
2266 2207 ! TMP = bytes required to align DST on FP_BLOCK boundary
2267 2208 ! Using SRC as a tmp here
2268 2209 cmp TMP, 3
2269 2210 bleu,pt %ncc, 1f
2270 2211 sub CNT,TMP,CNT ! adjust main count
2271 2212 sub TMP, 3, TMP ! adjust for end of loop test
2272 2213 .co_blkalign:
2273 2214 ldub [REALSRC], SRC ! move 4 bytes per loop iteration
2274 2215 stba SRC, [DST]%asi
2275 2216 subcc TMP, 4, TMP
2276 2217 ldub [REALSRC + 1], SRC
2277 2218 add REALSRC, 4, REALSRC
2278 2219 stba SRC, [DST + 1]%asi
2279 2220 ldub [REALSRC - 2], SRC
2280 2221 add DST, 4, DST
2281 2222 stba SRC, [DST - 2]%asi
2282 2223 ldub [REALSRC - 1], SRC
2283 2224 bgu,pt %ncc, .co_blkalign
2284 2225 stba SRC, [DST - 1]%asi
2285 2226
2286 2227 addcc TMP, 3, TMP ! restore count adjustment
2287 2228 bz,pt %ncc, 2f ! no bytes left?
2288 2229 nop
2289 2230 1: ldub [REALSRC], SRC
2290 2231 inc REALSRC
2291 2232 inc DST
2292 2233 deccc TMP
2293 2234 bgu %ncc, 1b
2294 2235 stba SRC, [DST - 1]%asi
2295 2236
2296 2237 2:
2297 2238 andn REALSRC, 0x7, SRC
2298 2239 alignaddr REALSRC, %g0, %g0
2299 2240
2300 2241 ! SRC - 8-byte aligned
2301 2242 ! DST - 64-byte aligned
2302 2243 prefetch [SRC], #one_read
2303 2244 prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
2304 2245 prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
2305 2246 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
2306 2247 ldd [SRC], %f16
2307 2248 #if CHEETAH_PREFETCH > 4
2308 2249 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
2309 2250 #endif
2310 2251 ldd [SRC + 0x08], %f18
2311 2252 #if CHEETAH_PREFETCH > 5
2312 2253 prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
2313 2254 #endif
2314 2255 ldd [SRC + 0x10], %f20
2315 2256 #if CHEETAH_PREFETCH > 6
2316 2257 prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
2317 2258 #endif
2318 2259 faligndata %f16, %f18, %f48
2319 2260 ldd [SRC + 0x18], %f22
2320 2261 #if CHEETAH_PREFETCH > 7
2321 2262 prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
2322 2263 #endif
2323 2264 faligndata %f18, %f20, %f50
2324 2265 ldd [SRC + 0x20], %f24
2325 2266 faligndata %f20, %f22, %f52
2326 2267 ldd [SRC + 0x28], %f26
2327 2268 faligndata %f22, %f24, %f54
2328 2269 ldd [SRC + 0x30], %f28
2329 2270 faligndata %f24, %f26, %f56
2330 2271 ldd [SRC + 0x38], %f30
2331 2272 faligndata %f26, %f28, %f58
2332 2273 ldd [SRC + VIS_BLOCKSIZE], %f16
2333 2274 sub CNT, VIS_BLOCKSIZE, CNT
2334 2275 add SRC, VIS_BLOCKSIZE, SRC
2335 2276 add REALSRC, VIS_BLOCKSIZE, REALSRC
2336 2277 ba,a,pt %ncc, 1f
2337 2278 nop
2338 2279 .align 16
2339 2280 1:
2340 2281 ldd [SRC + 0x08], %f18
2341 2282 faligndata %f28, %f30, %f60
2342 2283 ldd [SRC + 0x10], %f20
2343 2284 faligndata %f30, %f16, %f62
2344 2285 stda %f48, [DST]ASI_BLK_AIUS
2345 2286 ldd [SRC + 0x18], %f22
2346 2287 faligndata %f16, %f18, %f48
2347 2288 ldd [SRC + 0x20], %f24
2348 2289 faligndata %f18, %f20, %f50
2349 2290 ldd [SRC + 0x28], %f26
2350 2291 faligndata %f20, %f22, %f52
2351 2292 ldd [SRC + 0x30], %f28
2352 2293 faligndata %f22, %f24, %f54
2353 2294 ldd [SRC + 0x38], %f30
2354 2295 faligndata %f24, %f26, %f56
2355 2296 sub CNT, VIS_BLOCKSIZE, CNT
2356 2297 ldd [SRC + VIS_BLOCKSIZE], %f16
2357 2298 faligndata %f26, %f28, %f58
2358 2299 prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
2359 2300 add DST, VIS_BLOCKSIZE, DST
2360 2301 prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
2361 2302 add REALSRC, VIS_BLOCKSIZE, REALSRC
2362 2303 cmp CNT, VIS_BLOCKSIZE + 8
2363 2304 bgu,pt %ncc, 1b
2364 2305 add SRC, VIS_BLOCKSIZE, SRC
2365 2306
2366 2307 ! only if REALSRC & 0x7 is 0
2367 2308 cmp CNT, VIS_BLOCKSIZE
2368 2309 bne %ncc, 3f
2369 2310 andcc REALSRC, 0x7, %g0
2370 2311 bz,pt %ncc, 2f
2371 2312 nop
2372 2313 3:
2373 2314 faligndata %f28, %f30, %f60
2374 2315 faligndata %f30, %f16, %f62
2375 2316 stda %f48, [DST]ASI_BLK_AIUS
2376 2317 add DST, VIS_BLOCKSIZE, DST
2377 2318 ba,pt %ncc, 3f
2378 2319 nop
2379 2320 2:
2380 2321 ldd [SRC + 0x08], %f18
2381 2322 fsrc1 %f28, %f60
2382 2323 ldd [SRC + 0x10], %f20
2383 2324 fsrc1 %f30, %f62
2384 2325 stda %f48, [DST]ASI_BLK_AIUS
2385 2326 ldd [SRC + 0x18], %f22
2386 2327 fsrc1 %f16, %f48
2387 2328 ldd [SRC + 0x20], %f24
2388 2329 fsrc1 %f18, %f50
2389 2330 ldd [SRC + 0x28], %f26
2390 2331 fsrc1 %f20, %f52
2391 2332 ldd [SRC + 0x30], %f28
2392 2333 fsrc1 %f22, %f54
2393 2334 ldd [SRC + 0x38], %f30
2394 2335 fsrc1 %f24, %f56
2395 2336 sub CNT, VIS_BLOCKSIZE, CNT
2396 2337 add DST, VIS_BLOCKSIZE, DST
2397 2338 add SRC, VIS_BLOCKSIZE, SRC
2398 2339 add REALSRC, VIS_BLOCKSIZE, REALSRC
2399 2340 fsrc1 %f26, %f58
2400 2341 fsrc1 %f28, %f60
2401 2342 fsrc1 %f30, %f62
2402 2343 stda %f48, [DST]ASI_BLK_AIUS
2403 2344 add DST, VIS_BLOCKSIZE, DST
2404 2345 ba,a,pt %ncc, 4f
2405 2346 nop
2406 2347
2407 2348 3: tst CNT
2408 2349 bz,a %ncc, 4f
2409 2350 nop
2410 2351
2411 2352 5: ldub [REALSRC], TMP
2412 2353 inc REALSRC
2413 2354 inc DST
2414 2355 deccc CNT
2415 2356 bgu %ncc, 5b
2416 2357 stba TMP, [DST - 1]%asi
2417 2358 4:
2418 2359
2419 2360 .copyout_exit:
2420 2361 membar #Sync
2421 2362
2422 2363 FPRAS_INTERVAL(FPRAS_COPYOUT, 0, %l5, %o2, %o3, %o4, %o5, 8)
2423 2364 FPRAS_REWRITE_TYPE2Q2(0, %l5, %o2, %o3, 8, 9)
2424 2365 FPRAS_CHECK(FPRAS_COPYOUT, %l5, 9) ! lose outputs
2425 2366
2426 2367 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
2427 2368 wr %o2, 0, %gsr ! restore gsr
2428 2369
2429 2370 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
2430 2371 btst FPRS_FEF, %o3
2431 2372 bz,pt %icc, 4f
2432 2373 nop
2433 2374
2434 2375 BLD_FPQ2Q4_FROMSTACK(%o2)
2435 2376
2436 2377 ba,pt %ncc, 1f
2437 2378 wr %o3, 0, %fprs ! restore fprs
2438 2379
2439 2380 4:
2440 2381 FZEROQ2Q4
2441 2382 wr %o3, 0, %fprs ! restore fprs
2442 2383
2443 2384 1:
2444 2385 membar #Sync
2445 2386 andn %l6, FPUSED_FLAG, %l6
2446 2387 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2447 2388 FP_ALLOWMIGRATE(5, 6)
2448 2389 ret
2449 2390 restore %g0, 0, %o0
2450 2391
2451 2392 /*
2452 2393 * We got here because of a fault during copyout.
2453 2394 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
2454 2395 */
2455 2396 .copyout_err:
2456 2397 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
2457 2398 tst %o4
2458 2399 bz,pt %ncc, 2f ! if not, return error
2459 2400 nop
2460 2401 ldn [%o4 + CP_COPYOUT], %g2 ! if handler, invoke it with
2461 2402 jmp %g2 ! original arguments
2462 2403 restore %g0, 0, %g0 ! dispose of copy window
2463 2404 2:
2464 2405 ret
2465 2406 restore %g0, -1, %o0 ! return error value
2466 2407
2467 2408
2468 2409 SET_SIZE(copyout_more)
2469 2410
2470 -#endif /* lint */
2471 2411
2472 -
2473 -#ifdef lint
2474 -
2475 -/*ARGSUSED*/
2476 -int
2477 -xcopyout(const void *kaddr, void *uaddr, size_t count)
2478 -{ return (0); }
2479 -
2480 -#else /* lint */
2481 -
2482 2412 ENTRY(xcopyout)
2483 2413 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
2484 2414 bleu,pt %ncc, .xcopyout_small ! go to larger cases
2485 2415 xor %o0, %o1, %o3 ! are src, dst alignable?
2486 2416 btst 7, %o3 !
2487 2417 bz,pt %ncc, .xcopyout_8 !
2488 2418 nop
2489 2419 btst 1, %o3 !
2490 2420 bz,pt %ncc, .xcopyout_2 ! check for half-word
2491 2421 nop
2492 2422 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
2493 2423 ld [%o3 + %lo(hw_copy_limit_1)], %o3
2494 2424 tst %o3
2495 2425 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy
2496 2426 cmp %o2, %o3 ! if length <= limit
2497 2427 bleu,pt %ncc, .xcopyout_small ! go to small copy
2498 2428 nop
2499 2429 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy
2500 2430 nop
2501 2431 .xcopyout_2:
2502 2432 btst 3, %o3 !
2503 2433 bz,pt %ncc, .xcopyout_4 ! check for word alignment
2504 2434 nop
2505 2435 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
2506 2436 ld [%o3 + %lo(hw_copy_limit_2)], %o3
2507 2437 tst %o3
2508 2438 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy
2509 2439 cmp %o2, %o3 ! if length <= limit
2510 2440 bleu,pt %ncc, .xcopyout_small ! go to small copy
2511 2441 nop
2512 2442 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy
2513 2443 nop
2514 2444 .xcopyout_4:
2515 2445 ! already checked longword, must be word aligned
2516 2446 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
2517 2447 ld [%o3 + %lo(hw_copy_limit_4)], %o3
2518 2448 tst %o3
2519 2449 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy
2520 2450 cmp %o2, %o3 ! if length <= limit
2521 2451 bleu,pt %ncc, .xcopyout_small ! go to small copy
2522 2452 nop
2523 2453 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy
2524 2454 nop
2525 2455 .xcopyout_8:
2526 2456 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
2527 2457 ld [%o3 + %lo(hw_copy_limit_8)], %o3
2528 2458 tst %o3
2529 2459 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy
2530 2460 cmp %o2, %o3 ! if length <= limit
2531 2461 bleu,pt %ncc, .xcopyout_small ! go to small copy
2532 2462 nop
2533 2463 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy
2534 2464 nop
2535 2465
2536 2466 .xcopyout_small:
2537 2467 sethi %hi(.sm_xcopyout_err), %o5 ! .sm_xcopyout_err is lofault
2538 2468 or %o5, %lo(.sm_xcopyout_err), %o5
2539 2469 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler
2540 2470 membar #Sync ! sync error barrier
2541 2471 ba,pt %ncc, .sm_do_copyout ! common code
2542 2472 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault
2543 2473
2544 2474 .xcopyout_more:
2545 2475 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2546 2476 sethi %hi(.xcopyout_err), REAL_LOFAULT
2547 2477 ba,pt %ncc, .do_copyout ! common code
2548 2478 or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
2549 2479
2550 2480 /*
2551 2481	 * We got here because of a fault during xcopyout.
2552 2482	 * Errno value is in ERRNO.
2553 2483 */
2554 2484 .xcopyout_err:
2555 2485 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
2556 2486 tst %o4
2557 2487 bz,pt %ncc, 2f ! if not, return error
2558 2488 nop
2559 2489 ldn [%o4 + CP_XCOPYOUT], %g2 ! if handler, invoke it with
2560 2490 jmp %g2 ! original arguments
2561 2491 restore %g0, 0, %g0 ! dispose of copy window
2562 2492 2:
2563 2493 ret
2564 2494 restore ERRNO, 0, %o0 ! return errno value
2565 2495
2566 2496 .sm_xcopyout_err:
2567 2497
2568 2498 membar #Sync
2569 2499 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2570 2500 mov SM_SAVE_SRC, %o0
2571 2501 mov SM_SAVE_DST, %o1
2572 2502 mov SM_SAVE_COUNT, %o2
2573 2503 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler
2574 2504 tst %o3
2575 2505 bz,pt %ncc, 3f ! if not, return error
2576 2506 nop
2577 2507 ldn [%o3 + CP_XCOPYOUT], %o5 ! if handler, invoke it with
2578 2508 jmp %o5 ! original arguments
2579 2509 nop
2580 2510 3:
2581 2511 retl
2582 2512 or %g1, 0, %o0 ! return errno value
2583 2513
2584 2514 SET_SIZE(xcopyout)
2585 2515
2586 -#endif /* lint */
2587 -
2588 -#ifdef lint
2589 -
2590 -/*ARGSUSED*/
2591 -int
2592 -xcopyout_little(const void *kaddr, void *uaddr, size_t count)
2593 -{ return (0); }
2594 -
2595 -#else /* lint */
2596 -
2597 2516 ENTRY(xcopyout_little)
2598 2517 sethi %hi(.xcopyio_err), %o5
2599 2518 or %o5, %lo(.xcopyio_err), %o5
2600 2519 ldn [THREAD_REG + T_LOFAULT], %o4
2601 2520 membar #Sync ! sync error barrier
2602 2521 stn %o5, [THREAD_REG + T_LOFAULT]
2603 2522 mov %o4, %o5
2604 2523
2605 2524 subcc %g0, %o2, %o3
2606 2525 add %o0, %o2, %o0
2607 2526 bz,pn %ncc, 2f ! check for zero bytes
2608 2527 sub %o2, 1, %o4
2609 2528 add %o0, %o4, %o0 ! start w/last byte
2610 2529 add %o1, %o2, %o1
2611 2530 ldub [%o0 + %o3], %o4
2612 2531
2613 2532 1: stba %o4, [%o1 + %o3]ASI_AIUSL
2614 2533 inccc %o3
2615 2534 sub %o0, 2, %o0 ! get next byte
2616 2535 bcc,a,pt %ncc, 1b
2617 2536 ldub [%o0 + %o3], %o4
2618 2537
2619 2538 2:
2620 2539 membar #Sync ! sync error barrier
2621 2540 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2622 2541 retl
2623 2542 mov %g0, %o0 ! return (0)
2624 2543
2625 2544 SET_SIZE(xcopyout_little)
2626 2545
2627 -#endif /* lint */
2628 -
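
The indexing in xcopyout_little above is compact but opaque: %o3 holds
-count and counts up toward zero, the source pointer is pre-biased past
the end of the buffer so [%o0 + %o3] walks backward, and the store
index walks forward, so the net effect is a byte-reversed copy of the
region (for single-byte stores the little-endian ASI_AIUSL itself
changes nothing). In C (hypothetical name):

    #include <stddef.h>

    static void
    xcopy_little_model(const unsigned char *kaddr, unsigned char *uaddr,
        size_t count)
    {
            size_t i;

            for (i = 0; i < count; i++)
                    uaddr[i] = kaddr[count - 1 - i];
    }
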
2629 2546 /*
2630 2547 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
2631 2548 */
2632 2549
2633 -#if defined(lint)
2634 -
2635 -/*ARGSUSED*/
2636 -int
2637 -copyin(const void *uaddr, void *kaddr, size_t count)
2638 -{ return (0); }
2639 -
2640 -#else /* lint */
2641 -
2642 2550 ENTRY(copyin)
2643 2551 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
2644 2552 bleu,pt %ncc, .copyin_small ! go to larger cases
2645 2553 xor %o0, %o1, %o3 ! are src, dst alignable?
2646 2554 btst 7, %o3 !
2647 2555 bz,pt %ncc, .copyin_8 ! check for longword alignment
2648 2556 nop
2649 2557 btst 1, %o3 !
2650 2558 bz,pt %ncc, .copyin_2 ! check for half-word
2651 2559 nop
2652 2560 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
2653 2561 ld [%o3 + %lo(hw_copy_limit_1)], %o3
2654 2562 tst %o3
2655 2563 bz,pn %icc, .copyin_small ! if zero, disable HW copy
2656 2564 cmp %o2, %o3 ! if length <= limit
2657 2565 bleu,pt %ncc, .copyin_small ! go to small copy
2658 2566 nop
2659 2567 ba,pt %ncc, .copyin_more ! otherwise go to large copy
2660 2568 nop
2661 2569 .copyin_2:
2662 2570 btst 3, %o3 !
2663 2571 bz,pt %ncc, .copyin_4 ! check for word alignment
2664 2572 nop
2665 2573 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
2666 2574 ld [%o3 + %lo(hw_copy_limit_2)], %o3
2667 2575 tst %o3
2668 2576 bz,pn %icc, .copyin_small ! if zero, disable HW copy
2669 2577 cmp %o2, %o3 ! if length <= limit
2670 2578 bleu,pt %ncc, .copyin_small ! go to small copy
2671 2579 nop
2672 2580 ba,pt %ncc, .copyin_more ! otherwise go to large copy
2673 2581 nop
2674 2582 .copyin_4:
2675 2583 ! already checked longword, must be word aligned
2676 2584 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
2677 2585 ld [%o3 + %lo(hw_copy_limit_4)], %o3
2678 2586 tst %o3
2679 2587 bz,pn %icc, .copyin_small ! if zero, disable HW copy
2680 2588 cmp %o2, %o3 ! if length <= limit
2681 2589 bleu,pt %ncc, .copyin_small ! go to small copy
2682 2590 nop
2683 2591 ba,pt %ncc, .copyin_more ! otherwise go to large copy
2684 2592 nop
2685 2593 .copyin_8:
2686 2594 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
2687 2595 ld [%o3 + %lo(hw_copy_limit_8)], %o3
2688 2596 tst %o3
2689 2597 bz,pn %icc, .copyin_small ! if zero, disable HW copy
2690 2598 cmp %o2, %o3 ! if length <= limit
2691 2599 bleu,pt %ncc, .copyin_small ! go to small copy
2692 2600 nop
2693 2601 ba,pt %ncc, .copyin_more ! otherwise go to large copy
2694 2602 nop
2695 2603
2696 2604 .align 16
2697 2605 nop ! instruction alignment
2698 2606 ! see discussion at start of file
2699 2607 .copyin_small:
2700 2608 sethi %hi(.sm_copyin_err), %o5 ! .sm_copyin_err is lofault
2701 2609 or %o5, %lo(.sm_copyin_err), %o5
2702 2610 ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofault, no tramp
2703 2611 membar #Sync ! sync error barrier
2704 2612 stn %o5, [THREAD_REG + T_LOFAULT]
2705 2613 .sm_do_copyin:
2706 2614 mov %o0, SM_SAVE_SRC
2707 2615 mov %o1, SM_SAVE_DST
2708 2616 cmp %o2, SHORTCOPY ! check for really short case
2709 2617 bleu,pt %ncc, .ci_sm_left !
2710 2618 mov %o2, SM_SAVE_COUNT
2711 2619 cmp %o2, CHKSIZE ! check for medium length cases
2712 2620 bgu,pn %ncc, .ci_med !
2713 2621 or %o0, %o1, %o3 ! prepare alignment check
2714 2622 andcc %o3, 0x3, %g0 ! test for alignment
2715 2623 bz,pt %ncc, .ci_sm_word ! branch to word aligned case
2716 2624 .ci_sm_movebytes:
2717 2625 sub %o2, 3, %o2 ! adjust count to allow cc zero test
2718 2626 .ci_sm_notalign4:
2719 2627 lduba [%o0]ASI_USER, %o3 ! read byte
2720 2628 subcc %o2, 4, %o2 ! reduce count by 4
2721 2629 stb %o3, [%o1] ! write byte
2722 2630 add %o0, 1, %o0 ! advance SRC by 1
2723 2631 lduba [%o0]ASI_USER, %o3 ! repeat for a total of 4 bytes
2724 2632 add %o0, 1, %o0 ! advance SRC by 1
2725 2633 stb %o3, [%o1 + 1]
2726 2634 add %o1, 4, %o1 ! advance DST by 4
2727 2635 lduba [%o0]ASI_USER, %o3
2728 2636 add %o0, 1, %o0 ! advance SRC by 1
2729 2637 stb %o3, [%o1 - 2]
2730 2638 lduba [%o0]ASI_USER, %o3
2731 2639 add %o0, 1, %o0 ! advance SRC by 1
2732 2640 bgt,pt %ncc, .ci_sm_notalign4 ! loop til 3 or fewer bytes remain
2733 2641 stb %o3, [%o1 - 1]
2734 2642 add %o2, 3, %o2 ! restore count
2735 2643 .ci_sm_left:
2736 2644 tst %o2
2737 2645 bz,pt %ncc, .ci_sm_exit
2738 2646 nop
2739 2647 lduba [%o0]ASI_USER, %o3 ! load one byte
2740 2648 deccc %o2 ! reduce count for cc test
2741 2649 bz,pt %ncc, .ci_sm_exit
2742 2650 stb %o3,[%o1] ! store one byte
2743 2651 inc %o0
2744 2652 lduba [%o0]ASI_USER, %o3 ! load second byte
2745 2653 deccc %o2
2746 2654 bz,pt %ncc, .ci_sm_exit
2747 2655 stb %o3,[%o1 + 1] ! store second byte
2748 2656 inc %o0
2749 2657 lduba [%o0]ASI_USER, %o3 ! load third byte
2750 2658 stb %o3,[%o1 + 2] ! store third byte
2751 2659 membar #Sync ! sync error barrier
2752 2660 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2753 2661 retl
2754 2662 mov %g0, %o0 ! return 0
2755 2663 .align 16
2756 2664 .ci_sm_words:
2757 2665 lduwa [%o0]ASI_USER, %o3 ! read word
2758 2666 .ci_sm_wordx:
2759 2667 subcc %o2, 8, %o2 ! update count
2760 2668 stw %o3, [%o1] ! write word
2761 2669 add %o0, 4, %o0 ! update SRC
2762 2670 add %o1, 8, %o1 ! update DST
2763 2671 lduwa [%o0]ASI_USER, %o3 ! read word
2764 2672 add %o0, 4, %o0 ! update SRC
2765 2673 bgt,pt %ncc, .ci_sm_words ! loop til done
2766 2674 stw %o3, [%o1 - 4] ! write word
2767 2675 addcc %o2, 7, %o2 ! restore count
2768 2676 bz,pt %ncc, .ci_sm_exit
2769 2677 nop
2770 2678 deccc %o2
2771 2679 bz,pt %ncc, .ci_sm_byte
2772 2680 .ci_sm_half:
2773 2681 subcc %o2, 2, %o2 ! reduce count by 2
2774 2682 lduha [%o0]ASI_USER, %o3 ! read half word
2775 2683 add %o0, 2, %o0 ! advance SRC by 2
2776 2684 add %o1, 2, %o1 ! advance DST by 2
2777 2685 bgt,pt %ncc, .ci_sm_half ! loop til done
2778 2686 sth %o3, [%o1 - 2] ! write half word
2779 2687 addcc %o2, 1, %o2 ! restore count
2780 2688 bz,pt %ncc, .ci_sm_exit
2781 2689 nop
2782 2690 .ci_sm_byte:
2783 2691 lduba [%o0]ASI_USER, %o3
2784 2692 stb %o3, [%o1]
2785 2693 membar #Sync ! sync error barrier
2786 2694 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2787 2695 retl
2788 2696 mov %g0, %o0 ! return 0
2789 2697 .align 16
2790 2698 .ci_sm_word:
2791 2699 subcc %o2, 4, %o2 ! update count
2792 2700 bgt,pt %ncc, .ci_sm_wordx
2793 2701 lduwa [%o0]ASI_USER, %o3 ! read word
2794 2702 addcc %o2, 3, %o2 ! restore count
2795 2703 bz,pt %ncc, .ci_sm_exit
2796 2704 stw %o3, [%o1] ! write word
2797 2705 deccc %o2 ! reduce count for cc test
2798 2706 add %o0, 4, %o0
2799 2707 lduba [%o0]ASI_USER, %o3 ! load one byte
2800 2708 bz,pt %ncc, .ci_sm_exit
2801 2709 stb %o3, [%o1 + 4] ! store one byte
2802 2710 inc %o0
2803 2711 lduba [%o0]ASI_USER, %o3 ! load second byte
2804 2712 deccc %o2
2805 2713 bz,pt %ncc, .ci_sm_exit
2806 2714 stb %o3, [%o1 + 5] ! store second byte
2807 2715 inc %o0
2808 2716 lduba [%o0]ASI_USER, %o3 ! load third byte
2809 2717 stb %o3, [%o1 + 6] ! store third byte
2810 2718 .ci_sm_exit:
2811 2719 membar #Sync ! sync error barrier
2812 2720 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2813 2721 retl
2814 2722 mov %g0, %o0 ! return 0
2815 2723
2816 2724 .align 16
2817 2725 .ci_med:
2818 2726 xor %o0, %o1, %o3 ! setup alignment check
2819 2727 btst 1, %o3
2820 2728 bnz,pt %ncc, .ci_sm_movebytes ! unaligned
2821 2729 nop
2822 2730 btst 3, %o3
2823 2731 bnz,pt %ncc, .ci_med_half ! halfword aligned
2824 2732 nop
2825 2733 btst 7, %o3
2826 2734 bnz,pt %ncc, .ci_med_word ! word aligned
2827 2735 nop
2828 2736 .ci_med_long:
2829 2737 btst 3, %o0 ! check for
2830 2738 bz,pt %ncc, .ci_med_long1 ! word alignment
2831 2739 nop
2832 2740 .ci_med_long0:
2833 2741 lduba [%o0]ASI_USER, %o3 ! load one byte
2834 2742 inc %o0
2835 2743 stb %o3,[%o1] ! store byte
2836 2744 inc %o1
2837 2745 btst 3, %o0
2838 2746 bnz,pt %ncc, .ci_med_long0
2839 2747 dec %o2
2840 2748 .ci_med_long1: ! word aligned
2841 2749 btst 7, %o0 ! check for long word
2842 2750 bz,pt %ncc, .ci_med_long2
2843 2751 nop
2844 2752 lduwa [%o0]ASI_USER, %o3 ! load word
2845 2753 add %o0, 4, %o0 ! advance SRC by 4
2846 2754 stw %o3, [%o1] ! store word
2847 2755 add %o1, 4, %o1 ! advance DST by 4
2848 2756 sub %o2, 4, %o2 ! reduce count by 4
2849 2757 !
2850 2758 ! Now long word aligned and have at least 32 bytes to move
2851 2759 !
2852 2760 .ci_med_long2:
2853 2761 sub %o2, 31, %o2 ! adjust count to allow cc zero test
2854 2762 .ci_med_lmove:
2855 2763 ldxa [%o0]ASI_USER, %o3 ! read long word
2856 2764 subcc %o2, 32, %o2 ! reduce count by 32
2857 2765 stx %o3, [%o1] ! write long word
2858 2766 add %o0, 8, %o0 ! advance SRC by 8
2859 2767	ldxa	[%o0]ASI_USER, %o3	! repeat for a total of 4 long words
2860 2768 add %o0, 8, %o0 ! advance SRC by 8
2861 2769 stx %o3, [%o1 + 8]
2862 2770 add %o1, 32, %o1 ! advance DST by 32
2863 2771 ldxa [%o0]ASI_USER, %o3
2864 2772 add %o0, 8, %o0 ! advance SRC by 8
2865 2773 stx %o3, [%o1 - 16]
2866 2774 ldxa [%o0]ASI_USER, %o3
2867 2775 add %o0, 8, %o0 ! advance SRC by 8
2868 2776 bgt,pt %ncc, .ci_med_lmove ! loop til 31 or fewer bytes left
2869 2777 stx %o3, [%o1 - 8]
2870 2778 addcc %o2, 24, %o2 ! restore count to long word offset
2871 2779 ble,pt %ncc, .ci_med_lextra ! check for more long words to move
2872 2780 nop
2873 2781 .ci_med_lword:
2874 2782 ldxa [%o0]ASI_USER, %o3 ! read long word
2875 2783 subcc %o2, 8, %o2 ! reduce count by 8
2876 2784 stx %o3, [%o1] ! write long word
2877 2785 add %o0, 8, %o0 ! advance SRC by 8
2878 2786 bgt,pt %ncc, .ci_med_lword ! loop til 7 or fewer bytes left
2879 2787 add %o1, 8, %o1 ! advance DST by 8
2880 2788 .ci_med_lextra:
2881 2789 addcc %o2, 7, %o2 ! restore rest of count
2882 2790 bz,pt %ncc, .ci_sm_exit ! if zero, then done
2883 2791 deccc %o2
2884 2792 bz,pt %ncc, .ci_sm_byte
2885 2793 nop
2886 2794 ba,pt %ncc, .ci_sm_half
2887 2795 nop
2888 2796
2889 2797 .align 16
2890 2798 nop ! instruction alignment
2891 2799 ! see discussion at start of file
2892 2800 .ci_med_word:
2893 2801 btst 3, %o0 ! check for
2894 2802 bz,pt %ncc, .ci_med_word1 ! word alignment
2895 2803 nop
2896 2804 .ci_med_word0:
2897 2805 lduba [%o0]ASI_USER, %o3 ! load one byte
2898 2806 inc %o0
2899 2807 stb %o3,[%o1] ! store byte
2900 2808 inc %o1
2901 2809 btst 3, %o0
2902 2810 bnz,pt %ncc, .ci_med_word0
2903 2811 dec %o2
2904 2812 !
2905 2813 ! Now word aligned and have at least 36 bytes to move
2906 2814 !
2907 2815 .ci_med_word1:
2908 2816 sub %o2, 15, %o2 ! adjust count to allow cc zero test
2909 2817 .ci_med_wmove:
2910 2818 lduwa [%o0]ASI_USER, %o3 ! read word
2911 2819 subcc %o2, 16, %o2 ! reduce count by 16
2912 2820 stw %o3, [%o1] ! write word
2913 2821 add %o0, 4, %o0 ! advance SRC by 4
2914 2822	lduwa	[%o0]ASI_USER, %o3	! repeat for a total of 4 words
2915 2823 add %o0, 4, %o0 ! advance SRC by 4
2916 2824 stw %o3, [%o1 + 4]
2917 2825 add %o1, 16, %o1 ! advance DST by 16
2918 2826 lduwa [%o0]ASI_USER, %o3
2919 2827 add %o0, 4, %o0 ! advance SRC by 4
2920 2828 stw %o3, [%o1 - 8]
2921 2829 lduwa [%o0]ASI_USER, %o3
2922 2830 add %o0, 4, %o0 ! advance SRC by 4
2923 2831 bgt,pt %ncc, .ci_med_wmove ! loop til 15 or fewer bytes left
2924 2832 stw %o3, [%o1 - 4]
2925 2833 addcc %o2, 12, %o2 ! restore count to word offset
2926 2834 ble,pt %ncc, .ci_med_wextra ! check for more words to move
2927 2835 nop
2928 2836 .ci_med_word2:
2929 2837 lduwa [%o0]ASI_USER, %o3 ! read word
2930 2838 subcc %o2, 4, %o2 ! reduce count by 4
2931 2839 stw %o3, [%o1] ! write word
2932 2840 add %o0, 4, %o0 ! advance SRC by 4
2933 2841 bgt,pt %ncc, .ci_med_word2 ! loop til 3 or fewer bytes left
2934 2842 add %o1, 4, %o1 ! advance DST by 4
2935 2843 .ci_med_wextra:
2936 2844 addcc %o2, 3, %o2 ! restore rest of count
2937 2845 bz,pt %ncc, .ci_sm_exit ! if zero, then done
2938 2846 deccc %o2
2939 2847 bz,pt %ncc, .ci_sm_byte
2940 2848 nop
2941 2849 ba,pt %ncc, .ci_sm_half
2942 2850 nop
2943 2851
2944 2852 .align 16
2945 2853 nop ! instruction alignment
2946 2854 ! see discussion at start of file
2947 2855 .ci_med_half:
2948 2856 btst 1, %o0 ! check for
2949 2857 bz,pt %ncc, .ci_med_half1 ! half word alignment
2950 2858 nop
2951 2859 lduba [%o0]ASI_USER, %o3 ! load one byte
2952 2860 inc %o0
2953 2861 stb %o3,[%o1] ! store byte
2954 2862 inc %o1
2955 2863 dec %o2
2956 2864 !
2957 2865 ! Now half word aligned and have at least 38 bytes to move
2958 2866 !
2959 2867 .ci_med_half1:
2960 2868 sub %o2, 7, %o2 ! adjust count to allow cc zero test
2961 2869 .ci_med_hmove:
2962 2870 lduha [%o0]ASI_USER, %o3 ! read half word
2963 2871 subcc %o2, 8, %o2 ! reduce count by 8
2964 2872 sth %o3, [%o1] ! write half word
2965 2873 add %o0, 2, %o0 ! advance SRC by 2
2966 2874	lduha	[%o0]ASI_USER, %o3	! repeat for a total of 4 halfwords
2967 2875 add %o0, 2, %o0 ! advance SRC by 2
2968 2876 sth %o3, [%o1 + 2]
2969 2877 add %o1, 8, %o1 ! advance DST by 8
2970 2878 lduha [%o0]ASI_USER, %o3
2971 2879 add %o0, 2, %o0 ! advance SRC by 2
2972 2880 sth %o3, [%o1 - 4]
2973 2881 lduha [%o0]ASI_USER, %o3
2974 2882 add %o0, 2, %o0 ! advance SRC by 2
2975 2883 bgt,pt %ncc, .ci_med_hmove ! loop til 7 or fewer bytes left
2976 2884 sth %o3, [%o1 - 2]
2977 2885 addcc %o2, 7, %o2 ! restore count
2978 2886 bz,pt %ncc, .ci_sm_exit
2979 2887 deccc %o2
2980 2888 bz,pt %ncc, .ci_sm_byte
2981 2889 nop
2982 2890 ba,pt %ncc, .ci_sm_half
2983 2891 nop
2984 2892
2985 2893 .sm_copyin_err:
2986 2894 membar #Sync
2987 2895 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2988 2896 mov SM_SAVE_SRC, %o0
2989 2897 mov SM_SAVE_DST, %o1
2990 2898 mov SM_SAVE_COUNT, %o2
2991 2899 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler
2992 2900 tst %o3
2993 2901 bz,pt %ncc, 3f ! if not, return error
2994 2902 nop
2995 2903 ldn [%o3 + CP_COPYIN], %o5 ! if handler, invoke it with
2996 2904 jmp %o5 ! original arguments
2997 2905 nop
2998 2906 3:
2999 2907 retl
3000 2908 or %g0, -1, %o0 ! return errno value
3001 2909
3002 2910 SET_SIZE(copyin)
3003 2911
3004 2912
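copyin's small and medium paths above mirror copyout's with the ASI on
the other side of the move: loads come from user space (lduba, lduha,
lduwa, ldxa with ASI_USER) while the stores are ordinary kernel stores.
A byte-level sketch (user_load8 is a hypothetical stand-in for
lduba [addr]ASI_USER; a fault in it vectors through t_lofault exactly
as above):

    #include <stddef.h>
    #include <stdint.h>

    extern uint8_t user_load8(const uint8_t *uaddr);    /* may fault */

    static void
    copyin_byte_model(const uint8_t *uaddr, uint8_t *kaddr, size_t n)
    {
            while (n-- > 0)
                    *kaddr++ = user_load8(uaddr++);
    }
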
3005 2913 /*
3006 2914 * The _more entry points are not intended to be used directly by
3007 2915 * any caller from outside this file. They are provided to allow
3008 2916	 * profiling and dtrace of the portions of the copy code that use
3009 2917 * the floating point registers.
3010 2918 * This entry is particularly important as DTRACE (at least as of
3011 2919 * 4/2004) does not support leaf functions.
3012 2920 */
3013 2921
3014 2922 ENTRY(copyin_more)
3015 2923 .copyin_more:
3016 2924 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3017 2925 set .copyin_err, REAL_LOFAULT
3018 2926
3019 2927 /*
3020 2928 * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes
3021 2929 */
3022 2930 .do_copyin:
3023 2931 set copyio_fault, %l7 ! copyio_fault is lofault val
3024 2932
3025 2933 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler
3026 2934 membar #Sync ! sync error barrier
3027 2935 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
3028 2936
3029 2937 mov %i0, SAVE_SRC
3030 2938 mov %i1, SAVE_DST
3031 2939 mov %i2, SAVE_COUNT
3032 2940
3033 2941 FP_NOMIGRATE(6, 7)
3034 2942
3035 2943 rd %fprs, %o2 ! check for unused fp
3036 2944 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
3037 2945 btst FPRS_FEF, %o2
3038 2946 bz,a,pt %icc, .do_blockcopyin
3039 2947 wr %g0, FPRS_FEF, %fprs
3040 2948
3041 2949 BST_FPQ2Q4_TOSTACK(%o2)
3042 2950
3043 2951 .do_blockcopyin:
3044 2952 rd %gsr, %o2
3045 2953 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr
3046 2954 or %l6, FPUSED_FLAG, %l6
3047 2955
3048 2956 andcc DST, VIS_BLOCKSIZE - 1, TMP
3049 2957 mov ASI_USER, %asi
3050 2958 bz,pt %ncc, 2f
3051 2959 neg TMP
3052 2960 add TMP, VIS_BLOCKSIZE, TMP
3053 2961
3054 2962 ! TMP = bytes required to align DST on FP_BLOCK boundary
3055 2963 ! Using SRC as a tmp here
3056 2964 cmp TMP, 3
3057 2965 bleu,pt %ncc, 1f
3058 2966 sub CNT,TMP,CNT ! adjust main count
3059 2967 sub TMP, 3, TMP ! adjust for end of loop test
3060 2968 .ci_blkalign:
3061 2969 lduba [REALSRC]%asi, SRC ! move 4 bytes per loop iteration
3062 2970 stb SRC, [DST]
3063 2971 subcc TMP, 4, TMP
3064 2972 lduba [REALSRC + 1]%asi, SRC
3065 2973 add REALSRC, 4, REALSRC
3066 2974 stb SRC, [DST + 1]
3067 2975 lduba [REALSRC - 2]%asi, SRC
3068 2976 add DST, 4, DST
3069 2977 stb SRC, [DST - 2]
3070 2978 lduba [REALSRC - 1]%asi, SRC
3071 2979 bgu,pt %ncc, .ci_blkalign
3072 2980 stb SRC, [DST - 1]
3073 2981
3074 2982 addcc TMP, 3, TMP ! restore count adjustment
3075 2983 bz,pt %ncc, 2f ! no bytes left?
3076 2984 nop
3077 2985 1: lduba [REALSRC]%asi, SRC
3078 2986 inc REALSRC
3079 2987 inc DST
3080 2988 deccc TMP
3081 2989 bgu %ncc, 1b
3082 2990 stb SRC, [DST - 1]
3083 2991
3084 2992 2:
3085 2993 andn REALSRC, 0x7, SRC
3086 2994 alignaddr REALSRC, %g0, %g0
3087 2995
3088 2996 ! SRC - 8-byte aligned
3089 2997 ! DST - 64-byte aligned
3090 2998 prefetcha [SRC]%asi, #one_read
3091 2999 prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #one_read
3092 3000 prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #one_read
3093 3001 prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #one_read
3094 3002 ldda [SRC]%asi, %f16
3095 3003 #if CHEETAH_PREFETCH > 4
3096 3004 prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read
3097 3005 #endif
3098 3006 ldda [SRC + 0x08]%asi, %f18
3099 3007 #if CHEETAH_PREFETCH > 5
3100 3008 prefetcha [SRC + (5 * VIS_BLOCKSIZE)]%asi, #one_read
3101 3009 #endif
3102 3010 ldda [SRC + 0x10]%asi, %f20
3103 3011 #if CHEETAH_PREFETCH > 6
3104 3012 prefetcha [SRC + (6 * VIS_BLOCKSIZE)]%asi, #one_read
3105 3013 #endif
3106 3014 faligndata %f16, %f18, %f48
3107 3015 ldda [SRC + 0x18]%asi, %f22
3108 3016 #if CHEETAH_PREFETCH > 7
3109 3017 prefetcha [SRC + (7 * VIS_BLOCKSIZE)]%asi, #one_read
3110 3018 #endif
3111 3019 faligndata %f18, %f20, %f50
3112 3020 ldda [SRC + 0x20]%asi, %f24
3113 3021 faligndata %f20, %f22, %f52
3114 3022 ldda [SRC + 0x28]%asi, %f26
3115 3023 faligndata %f22, %f24, %f54
3116 3024 ldda [SRC + 0x30]%asi, %f28
3117 3025 faligndata %f24, %f26, %f56
3118 3026 ldda [SRC + 0x38]%asi, %f30
3119 3027 faligndata %f26, %f28, %f58
3120 3028 ldda [SRC + VIS_BLOCKSIZE]%asi, %f16
3121 3029 sub CNT, VIS_BLOCKSIZE, CNT
3122 3030 add SRC, VIS_BLOCKSIZE, SRC
3123 3031 add REALSRC, VIS_BLOCKSIZE, REALSRC
3124 3032 ba,a,pt %ncc, 1f
3125 3033 nop
3126 3034 .align 16
3127 3035 1:
3128 3036 ldda [SRC + 0x08]%asi, %f18
3129 3037 faligndata %f28, %f30, %f60
3130 3038 ldda [SRC + 0x10]%asi, %f20
3131 3039 faligndata %f30, %f16, %f62
3132 3040 stda %f48, [DST]ASI_BLK_P
3133 3041 ldda [SRC + 0x18]%asi, %f22
3134 3042 faligndata %f16, %f18, %f48
3135 3043 ldda [SRC + 0x20]%asi, %f24
3136 3044 faligndata %f18, %f20, %f50
3137 3045 ldda [SRC + 0x28]%asi, %f26
3138 3046 faligndata %f20, %f22, %f52
3139 3047 ldda [SRC + 0x30]%asi, %f28
3140 3048 faligndata %f22, %f24, %f54
3141 3049 ldda [SRC + 0x38]%asi, %f30
3142 3050 faligndata %f24, %f26, %f56
3143 3051 sub CNT, VIS_BLOCKSIZE, CNT
3144 3052 ldda [SRC + VIS_BLOCKSIZE]%asi, %f16
3145 3053 faligndata %f26, %f28, %f58
3146 3054 prefetcha [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8]%asi, #one_read
3147 3055 add DST, VIS_BLOCKSIZE, DST
3148 3056 prefetcha [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
3149 3057 add REALSRC, VIS_BLOCKSIZE, REALSRC
3150 3058 cmp CNT, VIS_BLOCKSIZE + 8
3151 3059 bgu,pt %ncc, 1b
3152 3060 add SRC, VIS_BLOCKSIZE, SRC
3153 3061
3154 3062 ! the fsrc1 path below may be used only if REALSRC & 0x7 is 0
3155 3063 cmp CNT, VIS_BLOCKSIZE
3156 3064 bne %ncc, 3f
3157 3065 andcc REALSRC, 0x7, %g0
3158 3066 bz,pt %ncc, 2f
3159 3067 nop
3160 3068 3:
3161 3069 faligndata %f28, %f30, %f60
3162 3070 faligndata %f30, %f16, %f62
3163 3071 stda %f48, [DST]ASI_BLK_P
3164 3072 add DST, VIS_BLOCKSIZE, DST
3165 3073 ba,pt %ncc, 3f
3166 3074 nop
3167 3075 2:
3168 3076 ldda [SRC + 0x08]%asi, %f18
3169 3077 fsrc1 %f28, %f60
3170 3078 ldda [SRC + 0x10]%asi, %f20
3171 3079 fsrc1 %f30, %f62
3172 3080 stda %f48, [DST]ASI_BLK_P
3173 3081 ldda [SRC + 0x18]%asi, %f22
3174 3082 fsrc1 %f16, %f48
3175 3083 ldda [SRC + 0x20]%asi, %f24
3176 3084 fsrc1 %f18, %f50
3177 3085 ldda [SRC + 0x28]%asi, %f26
3178 3086 fsrc1 %f20, %f52
3179 3087 ldda [SRC + 0x30]%asi, %f28
3180 3088 fsrc1 %f22, %f54
3181 3089 ldda [SRC + 0x38]%asi, %f30
3182 3090 fsrc1 %f24, %f56
3183 3091 sub CNT, VIS_BLOCKSIZE, CNT
3184 3092 add DST, VIS_BLOCKSIZE, DST
3185 3093 add SRC, VIS_BLOCKSIZE, SRC
3186 3094 add REALSRC, VIS_BLOCKSIZE, REALSRC
3187 3095 fsrc1 %f26, %f58
3188 3096 fsrc1 %f28, %f60
3189 3097 fsrc1 %f30, %f62
3190 3098 stda %f48, [DST]ASI_BLK_P
3191 3099 add DST, VIS_BLOCKSIZE, DST
3192 3100 ba,a,pt %ncc, 4f
3193 3101 nop
3194 3102
3195 3103 3: tst CNT
3196 3104 bz,a %ncc, 4f
3197 3105 nop
3198 3106
3199 3107 5: lduba [REALSRC]ASI_USER, TMP
3200 3108 inc REALSRC
3201 3109 inc DST
3202 3110 deccc CNT
3203 3111 bgu %ncc, 5b
3204 3112 stb TMP, [DST - 1]
3205 3113 4:
3206 3114
3207 3115 .copyin_exit:
3208 3116 membar #Sync
3209 3117
3210 3118 FPRAS_INTERVAL(FPRAS_COPYIN, 1, %l5, %o2, %o3, %o4, %o5, 8)
3211 3119 FPRAS_REWRITE_TYPE1(1, %l5, %f48, %o2, 9)
3212 3120 FPRAS_CHECK(FPRAS_COPYIN, %l5, 9) ! lose outputs
3213 3121
3214 3122 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr
3215 3123 wr %o2, 0, %gsr
3216 3124
3217 3125 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
3218 3126 btst FPRS_FEF, %o3
3219 3127 bz,pt %icc, 4f
3220 3128 nop
3221 3129
3222 3130 BLD_FPQ2Q4_FROMSTACK(%o2)
3223 3131
3224 3132 ba,pt %ncc, 1f
3225 3133 wr %o3, 0, %fprs ! restore fprs
3226 3134
3227 3135 4:
3228 3136 FZEROQ2Q4
3229 3137 wr %o3, 0, %fprs ! restore fprs
3230 3138
3231 3139 1:
3232 3140 membar #Sync ! sync error barrier
3233 3141 andn %l6, FPUSED_FLAG, %l6
3234 3142 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3235 3143 FP_ALLOWMIGRATE(5, 6)
3236 3144 ret
3237 3145 restore %g0, 0, %o0
3238 3146 /*
3239 3147 * We got here because of a fault during copyin
3240 3148 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
3241 3149 */
3242 3150 .copyin_err:
3243 3151 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
3244 3152 tst %o4
3245 3153 bz,pt %ncc, 2f ! if not, return error
3246 3154 nop
3247 3155 ldn [%o4 + CP_COPYIN], %g2 ! if handler, invoke it with
3248 3156 jmp %g2 ! original arguments
3249 3157 restore %g0, 0, %g0 ! dispose of copy window
3250 3158 2:
3251 3159 ret
3252 3160 restore %g0, -1, %o0 ! return error value
3253 3161
3254 3162
3255 3163 SET_SIZE(copyin_more)
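The block loop in .do_copyin leans on the VIS faligndata instruction to
realign a misaligned source. A scalar C model of its effect, assuming
big-endian byte numbering and the byte offset previously latched by
alignaddr (an illustration, not the instruction's formal definition):

        uint64_t
        faligndata_model(uint64_t hi, uint64_t lo, unsigned shift)
        {
                if (shift == 0)         /* already aligned */
                        return (hi);
                /* extract 8 bytes starting `shift' bytes into hi:lo */
                return ((hi << (8 * shift)) | (lo >> (8 * (8 - shift))));
        }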
3256 3164
3257 -#endif /* lint */
3258 -
3259 -#ifdef lint
3260 -
3261 -/*ARGSUSED*/
3262 -int
3263 -xcopyin(const void *uaddr, void *kaddr, size_t count)
3264 -{ return (0); }
3265 -
3266 -#else /* lint */
3267 -
3268 3165 ENTRY(xcopyin)
3269 3166
3270 3167 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
3271 3168 bleu,pt %ncc, .xcopyin_small ! go to small copy
3272 3169 xor %o0, %o1, %o3 ! are src, dst alignable?
3273 3170 btst 7, %o3 !
3274 3171 bz,pt %ncc, .xcopyin_8 ! check for longword alignment
3275 3172 nop
3276 3173 btst 1, %o3 !
3277 3174 bz,pt %ncc, .xcopyin_2 ! check for half-word
3278 3175 nop
3279 3176 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
3280 3177 ld [%o3 + %lo(hw_copy_limit_1)], %o3
3281 3178 tst %o3
3282 3179 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
3283 3180 cmp %o2, %o3 ! if length <= limit
3284 3181 bleu,pt %ncc, .xcopyin_small ! go to small copy
3285 3182 nop
3286 3183 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
3287 3184 nop
3288 3185 .xcopyin_2:
3289 3186 btst 3, %o3 !
3290 3187 bz,pt %ncc, .xcopyin_4 ! check for word alignment
3291 3188 nop
3292 3189 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
3293 3190 ld [%o3 + %lo(hw_copy_limit_2)], %o3
3294 3191 tst %o3
3295 3192 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
3296 3193 cmp %o2, %o3 ! if length <= limit
3297 3194 bleu,pt %ncc, .xcopyin_small ! go to small copy
3298 3195 nop
3299 3196 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
3300 3197 nop
3301 3198 .xcopyin_4:
3302 3199 ! already checked longword, must be word aligned
3303 3200 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
3304 3201 ld [%o3 + %lo(hw_copy_limit_4)], %o3
3305 3202 tst %o3
3306 3203 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
3307 3204 cmp %o2, %o3 ! if length <= limit
3308 3205 bleu,pt %ncc, .xcopyin_small ! go to small copy
3309 3206 nop
3310 3207 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
3311 3208 nop
3312 3209 .xcopyin_8:
3313 3210 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
3314 3211 ld [%o3 + %lo(hw_copy_limit_8)], %o3
3315 3212 tst %o3
3316 3213 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
3317 3214 cmp %o2, %o3 ! if length <= limit
3318 3215 bleu,pt %ncc, .xcopyin_small ! go to small copy
3319 3216 nop
3320 3217 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
3321 3218 nop
3322 3219
3323 3220 .xcopyin_small:
3324 3221 sethi %hi(.sm_xcopyin_err), %o5 ! .sm_xcopyin_err is lofault value
3325 3222 or %o5, %lo(.sm_xcopyin_err), %o5
3326 3223 ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofault
3327 3224 membar #Sync ! sync error barrier
3328 3225 ba,pt %ncc, .sm_do_copyin ! common code
3329 3226 stn %o5, [THREAD_REG + T_LOFAULT]
3330 3227
3331 3228 .xcopyin_more:
3332 3229 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3333 3230 sethi %hi(.xcopyin_err), REAL_LOFAULT ! .xcopyin_err is lofault value
3334 3231 ba,pt %ncc, .do_copyin
3335 3232 or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
3336 3233
3337 3234 /*
3338 3235 * We got here because of fault during xcopyin
3339 3236 * Errno value is in ERRNO
3340 3237 */
3341 3238 .xcopyin_err:
3342 3239 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
3343 3240 tst %o4
3344 3241 bz,pt %ncc, 2f ! if not, return error
3345 3242 nop
3346 3243 ldn [%o4 + CP_XCOPYIN], %g2 ! if handler, invoke it with
3347 3244 jmp %g2 ! original arguments
3348 3245 restore %g0, 0, %g0 ! dispose of copy window
3349 3246 2:
3350 3247 ret
3351 3248 restore ERRNO, 0, %o0 ! return errno value
3352 3249
3353 3250 .sm_xcopyin_err:
3354 3251
3355 3252 membar #Sync
3356 3253 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3357 3254 mov SM_SAVE_SRC, %o0
3358 3255 mov SM_SAVE_DST, %o1
3359 3256 mov SM_SAVE_COUNT, %o2
3360 3257 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler
3361 3258 tst %o3
3362 3259 bz,pt %ncc, 3f ! if not, return error
3363 3260 nop
3364 3261 ldn [%o3 + CP_XCOPYIN], %o5 ! if handler, invoke it with
3365 3262 jmp %o5 ! original arguments
3366 3263 nop
3367 3264 3:
3368 3265 retl
3369 3266 or %g1, 0, %o0 ! return errno value
3370 3267
3371 3268 SET_SIZE(xcopyin)
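The xcopyin entry sequence above reduces to the following C logic, where a
zero hw_copy_limit_N (defined at the end of this file) disables the FP/VIS
path for that alignment class; leaf_copy()/fp_block_copy() are hypothetical
stand-ins for the .xcopyin_small and .xcopyin_more paths (sketch only):

        uintptr_t x = (uintptr_t)uaddr ^ (uintptr_t)kaddr;
        uint_t lim;

        if ((x & 7) == 0)
                lim = hw_copy_limit_8;  /* longword alignable */
        else if (x & 1)
                lim = hw_copy_limit_1;  /* byte alignment only */
        else if (x & 3)
                lim = hw_copy_limit_2;  /* halfword alignable */
        else
                lim = hw_copy_limit_4;  /* word alignable */

        if (count <= VIS_COPY_THRESHOLD || lim == 0 || count <= lim)
                return (leaf_copy(uaddr, kaddr, count));        /* small */
        return (fp_block_copy(uaddr, kaddr, count));            /* FP/VIS */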
3372 3269
3373 -#endif /* lint */
3374 -
3375 -#ifdef lint
3376 -
3377 -/*ARGSUSED*/
3378 -int
3379 -xcopyin_little(const void *uaddr, void *kaddr, size_t count)
3380 -{ return (0); }
3381 -
3382 -#else /* lint */
3383 -
3384 3270 ENTRY(xcopyin_little)
3385 3271 sethi %hi(.xcopyio_err), %o5
3386 3272 or %o5, %lo(.xcopyio_err), %o5
3387 3273 ldn [THREAD_REG + T_LOFAULT], %o4
3388 3274 membar #Sync ! sync error barrier
3389 3275 stn %o5, [THREAD_REG + T_LOFAULT]
3390 3276 mov %o4, %o5
3391 3277
3392 3278 subcc %g0, %o2, %o3
3393 3279 add %o0, %o2, %o0
3394 3280 bz,pn %ncc, 2f ! check for zero bytes
3395 3281 sub %o2, 1, %o4
3396 3282 add %o0, %o4, %o0 ! start w/last byte
3397 3283 add %o1, %o2, %o1
3398 3284 lduba [%o0 + %o3]ASI_AIUSL, %o4
3399 3285
3400 3286 1: stb %o4, [%o1 + %o3]
3401 3287 inccc %o3
3402 3288 sub %o0, 2, %o0 ! get next byte
3403 3289 bcc,a,pt %ncc, 1b
3404 3290 lduba [%o0 + %o3]ASI_AIUSL, %o4
3405 3291
3406 3292 2:
3407 3293 membar #Sync ! sync error barrier
3408 3294 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3409 3295 retl
3410 3296 mov %g0, %o0 ! return (0)
3411 3297
3412 3298 .xcopyio_err:
3413 3299 membar #Sync ! sync error barrier
3414 3300 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3415 3301 retl
3416 3302 mov %g1, %o0
3417 3303
3418 3304 SET_SIZE(xcopyin_little)
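As read from the address arithmetic above (the source cursor starts at the
last byte and walks down while the destination walks up), xcopyin_little
performs a byte-reversing copy from user space. A sketch of the observable
effect, with user_byte() standing in for the lduba [...]ASI_AIUSL access:

        size_t i;

        for (i = 0; i < count; i++)     /* kaddr[0] gets uaddr[count - 1] */
                ((uint8_t *)kaddr)[i] =
                    user_byte((const uint8_t *)uaddr + (count - 1 - i));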
3419 3305
3420 -#endif /* lint */
3421 3306
3422 -
3423 3307 /*
3424 3308 * Copy a block of storage - must not overlap (from + len <= to).
3425 3309 * No fault handler installed (to be called under on_fault())
3426 3310 */
3427 -#if defined(lint)
3428 -
3429 -/* ARGSUSED */
3430 -void
3431 -copyin_noerr(const void *ufrom, void *kto, size_t count)
3432 -{}
3433 -
3434 -#else /* lint */
3435 3311 ENTRY(copyin_noerr)
3436 3312
3437 3313 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
3438 3314 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3439 3315 xor %o0, %o1, %o3 ! are src, dst alignable?
3440 3316 btst 7, %o3 !
3441 3317 bz,pt %ncc, .copyin_ne_8 ! check for longword alignment
3442 3318 nop
3443 3319 btst 1, %o3 !
3444 3320 bz,pt %ncc, .copyin_ne_2 ! check for half-word
3445 3321 nop
3446 3322 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
3447 3323 ld [%o3 + %lo(hw_copy_limit_1)], %o3
3448 3324 tst %o3
3449 3325 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
3450 3326 cmp %o2, %o3 ! if length <= limit
3451 3327 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3452 3328 nop
3453 3329 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
3454 3330 nop
3455 3331 .copyin_ne_2:
3456 3332 btst 3, %o3 !
3457 3333 bz,pt %ncc, .copyin_ne_4 ! check for word alignment
3458 3334 nop
3459 3335 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
3460 3336 ld [%o3 + %lo(hw_copy_limit_2)], %o3
3461 3337 tst %o3
3462 3338 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
3463 3339 cmp %o2, %o3 ! if length <= limit
3464 3340 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3465 3341 nop
3466 3342 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
3467 3343 nop
3468 3344 .copyin_ne_4:
3469 3345 ! already checked longword, must be word aligned
3470 3346 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
3471 3347 ld [%o3 + %lo(hw_copy_limit_4)], %o3
3472 3348 tst %o3
3473 3349 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
3474 3350 cmp %o2, %o3 ! if length <= limit
3475 3351 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3476 3352 nop
3477 3353 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
3478 3354 nop
3479 3355 .copyin_ne_8:
3480 3356 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
3481 3357 ld [%o3 + %lo(hw_copy_limit_8)], %o3
3482 3358 tst %o3
3483 3359 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
3484 3360 cmp %o2, %o3 ! if length <= limit
3485 3361 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3486 3362 nop
3487 3363 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
3488 3364 nop
3489 3365
3490 3366 .copyin_ne_small:
3491 3367 ldn [THREAD_REG + T_LOFAULT], %o4
3492 3368 tst %o4
3493 3369 bz,pn %ncc, .sm_do_copyin
3494 3370 nop
3495 3371 sethi %hi(.sm_copyio_noerr), %o5
3496 3372 or %o5, %lo(.sm_copyio_noerr), %o5
3497 3373 membar #Sync ! sync error barrier
3498 3374 ba,pt %ncc, .sm_do_copyin
3499 3375 stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault
3500 3376
3501 3377 .copyin_noerr_more:
3502 3378 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3503 3379 sethi %hi(.copyio_noerr), REAL_LOFAULT
3504 3380 ba,pt %ncc, .do_copyin
3505 3381 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3506 3382
3507 3383 .copyio_noerr:
3508 3384 jmp %l6
3509 3385 restore %g0,0,%g0
3510 3386
3511 3387 .sm_copyio_noerr:
3512 3388 membar #Sync
3513 3389 stn %o4, [THREAD_REG + T_LOFAULT] ! restore t_lofault
3514 3390 jmp %o4
3515 3391 nop
3516 3392
3517 3393 SET_SIZE(copyin_noerr)
3518 -#endif /* lint */
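Because copyin_noerr installs no fault handler of its own (beyond chaining
to an already-installed t_lofault), callers are expected to protect it with
on_fault(). An illustrative pattern, assuming the usual on_fault()/no_fault()
kernel interfaces:

        label_t ljb;

        if (on_fault(&ljb)) {           /* nonzero: a fault occurred */
                no_fault();
                return (EFAULT);
        }
        copyin_noerr(ufrom, kto, count);
        no_fault();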
3519 3394
3520 3395 /*
3521 3396 * Copy a block of storage - must not overlap (from + len <= to).
3522 3397 * No fault handler installed (to be called under on_fault())
3523 3398 */
3524 3399
3525 -#if defined(lint)
3526 -
3527 -/* ARGSUSED */
3528 -void
3529 -copyout_noerr(const void *kfrom, void *uto, size_t count)
3530 -{}
3531 -
3532 -#else /* lint */
3533 3400 ENTRY(copyout_noerr)
3534 3401
3535 3402 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
3536 3403 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3537 3404 xor %o0, %o1, %o3 ! are src, dst alignable?
3538 3405 btst 7, %o3 !
3539 3406 bz,pt %ncc, .copyout_ne_8 ! check for longword alignment
3540 3407 nop
3541 3408 btst 1, %o3 !
3542 3409 bz,pt %ncc, .copyout_ne_2 ! check for half-word
3543 3410 nop
3544 3411 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
3545 3412 ld [%o3 + %lo(hw_copy_limit_1)], %o3
3546 3413 tst %o3
3547 3414 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
3548 3415 cmp %o2, %o3 ! if length <= limit
3549 3416 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3550 3417 nop
3551 3418 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
3552 3419 nop
3553 3420 .copyout_ne_2:
3554 3421 btst 3, %o3 !
3555 3422 bz,pt %ncc, .copyout_ne_4 ! check for word alignment
3556 3423 nop
3557 3424 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
3558 3425 ld [%o3 + %lo(hw_copy_limit_2)], %o3
3559 3426 tst %o3
3560 3427 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
3561 3428 cmp %o2, %o3 ! if length <= limit
3562 3429 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3563 3430 nop
3564 3431 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
3565 3432 nop
3566 3433 .copyout_ne_4:
3567 3434 ! already checked longword, must be word aligned
3568 3435 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
3569 3436 ld [%o3 + %lo(hw_copy_limit_4)], %o3
3570 3437 tst %o3
3571 3438 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
3572 3439 cmp %o2, %o3 ! if length <= limit
3573 3440 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3574 3441 nop
3575 3442 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
3576 3443 nop
3577 3444 .copyout_ne_8:
3578 3445 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
3579 3446 ld [%o3 + %lo(hw_copy_limit_8)], %o3
3580 3447 tst %o3
3581 3448 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
3582 3449 cmp %o2, %o3 ! if length <= limit
3583 3450 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3584 3451 nop
3585 3452 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
3586 3453 nop
3587 3454
3588 3455 .copyout_ne_small:
3589 3456 ldn [THREAD_REG + T_LOFAULT], %o4
3590 3457 tst %o4
3591 3458 bz,pn %ncc, .sm_do_copyout
3592 3459 nop
3593 3460 sethi %hi(.sm_copyio_noerr), %o5
3594 3461 or %o5, %lo(.sm_copyio_noerr), %o5
3595 3462 membar #Sync ! sync error barrier
3596 3463 ba,pt %ncc, .sm_do_copyout
3597 3464 stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault
3598 3465
3599 3466 .copyout_noerr_more:
3600 3467 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3601 3468 sethi %hi(.copyio_noerr), REAL_LOFAULT
3602 3469 ba,pt %ncc, .do_copyout
3603 3470 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3604 3471
3605 3472 SET_SIZE(copyout_noerr)
3606 -#endif /* lint */
3607 3473
3608 3474
3609 3475 /*
3610 3476 * hwblkclr - clears block-aligned, block-multiple-sized regions that
3611 3477 * are at least 256 bytes long, using Spitfire-style block stores. If
3612 3478 * the criteria for using this routine are not met then it calls bzero
3613 3479 * and returns 1. Otherwise 0 is returned indicating success.
3614 3480 * Caller is responsible for ensuring use_hw_bzero is true and that
3615 3481 * kpreempt_disable() has been called.
3616 3482 */
3617 -#ifdef lint
3618 -/*ARGSUSED*/
3619 -int
3620 -hwblkclr(void *addr, size_t len)
3621 -{
3622 - return(0);
3623 -}
3624 -#else /* lint */
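The entry tests described in the comment above amount to the following C
check (a sketch; the real work is the block-store loop that follows):

        if (((uintptr_t)addr & (VIS_BLOCKSIZE - 1)) != 0 ||     /* aligned? */
            len < 256 ||                                        /* long enough? */
            (len & (VIS_BLOCKSIZE - 1)) != 0) {                 /* block multiple? */
                bzero(addr, len);
                return (1);             /* punted to bzero */
        }
        /* ... FP block stores ... */
        return (0);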
3625 3483 ! %i0 - start address
3626 3484 ! %i1 - length of region (multiple of 64)
3627 3485 ! %l0 - saved fprs
3628 3486 ! %l1 - pointer to saved %d0 block
3629 3487 ! %l2 - saved curthread->t_lwp
3630 3488
3631 3489 ENTRY(hwblkclr)
3632 3490 ! get another window w/space for one aligned block of saved fpregs
3633 3491 save %sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp
3634 3492
3635 3493 ! Must be block-aligned
3636 3494 andcc %i0, (VIS_BLOCKSIZE-1), %g0
3637 3495 bnz,pn %ncc, 1f
3638 3496 nop
3639 3497
3640 3498 ! ... and must be 256 bytes or more
3641 3499 cmp %i1, 256
3642 3500 blu,pn %ncc, 1f
3643 3501 nop
3644 3502
3645 3503 ! ... and length must be a multiple of VIS_BLOCKSIZE
3646 3504 andcc %i1, (VIS_BLOCKSIZE-1), %g0
3647 3505 bz,pn %ncc, 2f
3648 3506 nop
3649 3507
3650 3508 1: ! punt, call bzero but notify the caller that bzero was used
3651 3509 mov %i0, %o0
3652 3510 call bzero
3653 3511 mov %i1, %o1
3654 3512 ret
3655 3513 restore %g0, 1, %o0 ! return (1) - did not use block operations
3656 3514
3657 3515 2: rd %fprs, %l0 ! check for unused fp
3658 3516 btst FPRS_FEF, %l0
3659 3517 bz,pt %icc, 1f
3660 3518 nop
3661 3519
3662 3520 ! save in-use fpregs on stack
3663 3521 membar #Sync
3664 3522 add %fp, STACK_BIAS - 65, %l1
3665 3523 and %l1, -VIS_BLOCKSIZE, %l1
3666 3524 stda %d0, [%l1]ASI_BLK_P
3667 3525
3668 3526 1: membar #StoreStore|#StoreLoad|#LoadStore
3669 3527 wr %g0, FPRS_FEF, %fprs
3670 3528 wr %g0, ASI_BLK_P, %asi
3671 3529
3672 3530 ! Clear block
3673 3531 fzero %d0
3674 3532 fzero %d2
3675 3533 fzero %d4
3676 3534 fzero %d6
3677 3535 fzero %d8
3678 3536 fzero %d10
3679 3537 fzero %d12
3680 3538 fzero %d14
3681 3539
3682 3540 mov 256, %i3
3683 3541 ba,pt %ncc, .pz_doblock
3684 3542 nop
3685 3543
3686 3544 .pz_blkstart:
3687 3545 ! stda %d0, [%i0 + 192]%asi ! in dly slot of branch that got us here
3688 3546 stda %d0, [%i0 + 128]%asi
3689 3547 stda %d0, [%i0 + 64]%asi
3690 3548 stda %d0, [%i0]%asi
3691 3549 .pz_zinst:
3692 3550 add %i0, %i3, %i0
3693 3551 sub %i1, %i3, %i1
3694 3552 .pz_doblock:
3695 3553 cmp %i1, 256
3696 3554 bgeu,a %ncc, .pz_blkstart
3697 3555 stda %d0, [%i0 + 192]%asi
3698 3556
3699 3557 cmp %i1, 64
3700 3558 blu %ncc, .pz_finish
3701 3559
3702 3560 andn %i1, (64-1), %i3
3703 3561 srl %i3, 4, %i2 ! using blocks, 1 instr / 16 words
3704 3562 set .pz_zinst, %i4
3705 3563 sub %i4, %i2, %i4
3706 3564 jmp %i4
3707 3565 nop
3708 3566
3709 3567 .pz_finish:
3710 3568 membar #Sync
3711 3569 btst FPRS_FEF, %l0
3712 3570 bz,a .pz_finished
3713 3571 wr %l0, 0, %fprs ! restore fprs
3714 3572
3715 3573 ! restore fpregs from stack
3716 3574 ldda [%l1]ASI_BLK_P, %d0
3717 3575 membar #Sync
3718 3576 wr %l0, 0, %fprs ! restore fprs
3719 3577
3720 3578 .pz_finished:
3721 3579 ret
3722 3580 restore %g0, 0, %o0 ! return (bzero or not)
3723 3581
3724 3582 SET_SIZE(hwblkclr)
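The computed jump through .pz_zinst is a Duff's-device-style entry into the
store sequence: for a remaining length that is a multiple of 64 and less
than 256, it lands so that exactly len/64 block stores execute. In C terms,
with blkstore() as a hypothetical stand-in for stda ...ASI_BLK_P:

        switch (len / 64) {             /* len: multiple of 64, < 256 */
        case 3:
                blkstore(addr + 128);   /* FALLTHROUGH */
        case 2:
                blkstore(addr + 64);    /* FALLTHROUGH */
        case 1:
                blkstore(addr);
        }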
3725 -#endif /* lint */
3726 3583
3727 -#ifdef lint
3728 -/*ARGSUSED*/
3729 -void
3730 -hw_pa_bcopy32(uint64_t src, uint64_t dst)
3731 -{}
3732 -#else /*!lint */
3733 3584 /*
3734 3585 * Copy 32 bytes of data from src (%o0) to dst (%o1)
3735 3586 * using physical addresses.
3736 3587 */
3737 3588 ENTRY_NP(hw_pa_bcopy32)
3738 3589 rdpr %pstate, %g1
3739 3590 andn %g1, PSTATE_IE, %g2
3740 3591 wrpr %g0, %g2, %pstate
3741 3592
3742 3593 rdpr %pstate, %g0
3743 3594 ldxa [%o0]ASI_MEM, %o2
3744 3595 add %o0, 8, %o0
3745 3596 ldxa [%o0]ASI_MEM, %o3
3746 3597 add %o0, 8, %o0
3747 3598 ldxa [%o0]ASI_MEM, %o4
3748 3599 add %o0, 8, %o0
3749 3600 ldxa [%o0]ASI_MEM, %o5
3750 3601
3751 3602 stxa %g0, [%o1]ASI_DC_INVAL
3752 3603 membar #Sync
3753 3604
3754 3605 stxa %o2, [%o1]ASI_MEM
3755 3606 add %o1, 8, %o1
3756 3607 stxa %o3, [%o1]ASI_MEM
3757 3608 add %o1, 8, %o1
3758 3609 stxa %o4, [%o1]ASI_MEM
3759 3610 add %o1, 8, %o1
3760 3611 stxa %o5, [%o1]ASI_MEM
3761 3612
3762 3613 retl
3763 3614 wrpr %g0, %g1, %pstate
3764 3615
3765 3616 SET_SIZE(hw_pa_bcopy32)
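Semantically, hw_pa_bcopy32 does the following with interrupts disabled for
the duration; ldphys()/stphys() stand in here for the ldxa/stxa ASI_MEM
accesses and dcache_inval() models the stxa to ASI_DC_INVAL (sketch only):

        uint64_t buf[4];
        int i;

        for (i = 0; i < 4; i++)         /* four 8-byte physical loads */
                buf[i] = ldphys(src + 8 * i);
        dcache_inval(dst);              /* invalidate dst's D$ line */
        for (i = 0; i < 4; i++)         /* four 8-byte physical stores */
                stphys(dst + 8 * i, buf[i]);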
3766 3617
3767 -#endif /* lint */
3768 -
3769 -#if defined(lint)
3770 -
3771 -int use_hw_bcopy = 1;
3772 -int use_hw_bzero = 1;
3773 -uint_t hw_copy_limit_1 = 0;
3774 -uint_t hw_copy_limit_2 = 0;
3775 -uint_t hw_copy_limit_4 = 0;
3776 -uint_t hw_copy_limit_8 = 0;
3777 -
3778 -#else /* !lint */
3779 -
3780 3618 DGDEF(use_hw_bcopy)
3781 3619 .word 1
3782 3620 DGDEF(use_hw_bzero)
3783 3621 .word 1
3784 3622 DGDEF(hw_copy_limit_1)
3785 3623 .word 0
3786 3624 DGDEF(hw_copy_limit_2)
3787 3625 .word 0
3788 3626 DGDEF(hw_copy_limit_4)
3789 3627 .word 0
3790 3628 DGDEF(hw_copy_limit_8)
3791 3629 .word 0
3792 3630
3793 3631 .align 64
3794 3632 .section ".text"
3795 -#endif /* !lint */
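The words above back the use_hw_bcopy/use_hw_bzero switches and the
hw_copy_limit_N thresholds consulted throughout this file; the switches
default to 1 and the limits to 0 (FP/VIS copy disabled) until startup code
sets them. They can also be overridden from /etc/system; the values below
are illustrative only:

        set use_hw_bcopy=1
        set hw_copy_limit_8=0x400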