de-linting of .s files
--- old/usr/src/uts/sun4u/cpu/opl_olympus_copy.s
+++ new/usr/src/uts/sun4u/cpu/opl_olympus_copy.s
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26 #include <sys/param.h>
27 27 #include <sys/errno.h>
28 28 #include <sys/asm_linkage.h>
29 29 #include <sys/vtrace.h>
30 30 #include <sys/machthread.h>
31 31 #include <sys/clock.h>
32 32 #include <sys/asi.h>
33 33 #include <sys/fsr.h>
34 34 #include <sys/privregs.h>
35 35
36 -#if !defined(lint)
37 36 #include "assym.h"
38 -#endif /* lint */
39 37
40 38 /*
41 39 * Pseudo-code to aid in understanding the control flow of the
42 40 * bcopy/copyin/copyout routines.
43 41 *
44 42 * On entry:
45 43 *
46 44 * ! Determine whether to use the FP register version
47 45 * ! or the leaf routine version depending on size
48 46 * ! of copy and flags. Set up error handling accordingly.
49 47 * ! The transition point depends on whether the src and
50 48 * ! dst addresses can be aligned to long word, word,
51 49 * ! half word, or byte boundaries.
52 50 * !
53 51 * ! WARNING: <Register usage convention>
54 52 * ! For FP version, %l6 holds previous error handling and
55 53 * ! a flag: TRAMP_FLAG (low bits)
56 54 * ! for leaf routine version, %o4 holds those values.
57 55 * ! So either %l6 or %o4 is reserved and not available for
58 56 * ! any other use.
59 57 *
60 58 * if (length <= VIS_COPY_THRESHOLD) ! start with a quick test
61 59 * go to small_copy; ! to speed short copies
62 60 *
63 61 * if (src,dst long word alignable) {
64 62 * if (hw_copy_limit_8 == 0) ! hw_copy disabled
65 63 * go to small_copy;
66 64 * if (length <= hw_copy_limit_8)
67 65 * go to small_copy;
68 66 * go to FPBLK_copy;
69 67 * }
70 68 * if (src,dst not alignable) {
71 69 * if (hw_copy_limit_1 == 0) ! hw_copy disabled
72 70 * go to small_copy;
73 71 * if (length <= hw_copy_limit_1)
74 72 * go to small_copy;
75 73 * go to FPBLK_copy;
76 74 * }
77 75 * if (src,dst halfword alignable) {
78 76 * if (hw_copy_limit_2 == 0) ! hw_copy disabled
79 77 * go to small_copy;
80 78 * if (length <= hw_copy_limit_2)
81 79 * go to small_copy;
82 80 * go to FPBLK_copy;
83 81 * }
84 82 * if (src,dst word alignable) {
85 83 * if (hw_copy_limit_4 == 0) ! hw_copy disabled
86 84 * go to small_copy;
87 85 * if (length <= hw_copy_limit_4)
88 86 * go to small_copy;
89 87 * go to FPBLK_copy;
90 88 * }
91 89 *
92 90 * small_copy:
93 91 * Setup_leaf_rtn_error_handler; ! diffs for each entry point
94 92 *
95 93 * if (count <= 3) ! fast path for tiny copies
96 94 * go to sm_left; ! special finish up code
97 95 * else
98 96 * if (count > CHKSIZE) ! medium sized copies
99 97 * go to sm_med ! tuned by alignment
100 98 * if(src&dst not both word aligned) {
101 99 * sm_movebytes:
102 100 * move byte by byte in 4-way unrolled loop
103 101 * fall into sm_left;
104 102 * sm_left:
105 103 * move 0-3 bytes byte at a time as needed.
106 104 * restore error handler and exit.
107 105 *
108 106 * } else { ! src&dst are word aligned
109 107 * check for at least 8 bytes left,
110 108 * move word at a time, unrolled by 2
111 109 * when fewer than 8 bytes left,
112 110 * sm_half: move half word at a time while 2 or more bytes left
113 111 * sm_byte: move final byte if necessary
114 112 * sm_exit:
115 113 * restore error handler and exit.
116 114 * }
117 115 *
118 116 * ! Medium length cases with at least CHKSIZE bytes available
119 117 * ! method: line up src and dst as well as possible, then
120 118 * ! move data in 4-way unrolled loops.
121 119 *
122 120 * sm_med:
123 121 * if(src&dst unalignable)
124 122 * go to sm_movebytes
125 123 * if(src&dst halfword alignable)
126 124 * go to sm_movehalf
127 125 * if(src&dst word alignable)
128 126 * go to sm_moveword
129 127 * ! fall into long word movement
130 128 * move bytes until src is word aligned
131 129 * if not long word aligned, move a word
132 130 * move long words in 4-way unrolled loop until < 32 bytes left
133 131 * move long words in 1-way unrolled loop until < 8 bytes left
134 132 * if zero bytes left, goto sm_exit
135 133 * if one byte left, go to sm_byte
136 134 * else go to sm_half
137 135 *
138 136 * sm_moveword:
139 137 * move bytes until src is word aligned
140 138 * move words in 4-way unrolled loop until < 16 bytes left
141 139 * move words in 1-way unrolled loop until < 4 bytes left
142 140 * if zero bytes left, goto sm_exit
143 141 * if one byte left, go to sm_byte
144 142 * else go to sm_half
145 143 *
146 144 * sm_movehalf:
147 145 * move a byte if needed to align src on halfword
148 146 * move halfwords in 4-way unrolled loop until < 8 bytes left
149 147 * if zero bytes left, goto sm_exit
150 148 * if one byte left, go to sm_byte
151 149 * else go to sm_half
152 150 *
153 151 *
154 152 * FPBLK_copy:
155 153 * %l6 = curthread->t_lofault;
156 154 * if (%l6 != NULL) {
157 155 * membar #Sync
158 156 * curthread->t_lofault = .copyerr;
159 157 * caller_error_handler = TRUE ! %l6 |= 2
160 158 * }
161 159 *
162 160 * ! for FPU testing we must not migrate cpus
163 161 * if (curthread->t_lwp == NULL) {
164 162 * ! Kernel threads do not have pcb's in which to store
165 163 * ! the floating point state, so disallow preemption during
166 164 * ! the copy. This also prevents cpu migration.
167 165 * kpreempt_disable(curthread);
168 166 * } else {
169 167 * thread_nomigrate();
170 168 * }
171 169 *
172 170 * old_fprs = %fprs;
173 171 * old_gsr = %gsr;
174 172 * if (%fprs.fef) {
175 173 * %fprs.fef = 1;
176 174 * save current fpregs on stack using blockstore
177 175 * } else {
178 176 * %fprs.fef = 1;
179 177 * }
180 178 *
181 179 *
182 180 * do_blockcopy_here;
183 181 *
184 182 * In lofault handler:
185 183 * curthread->t_lofault = .copyerr2;
186 184 * Continue on with the normal exit handler
187 185 *
188 186 * On normal exit:
189 187 * %gsr = old_gsr;
190 188 * if (old_fprs & FPRS_FEF)
191 189 * restore fpregs from stack using blockload
192 190 * else
193 191 * zero fpregs
194 192 * %fprs = old_fprs;
195 193 * membar #Sync
196 194 * curthread->t_lofault = (%l6 & ~3);
197 195 * ! following test omitted from copyin/copyout as they
198 196 * ! will always have a current thread
199 197 * if (curthread->t_lwp == NULL)
200 198 * kpreempt_enable(curthread);
201 199 * else
202 200 * thread_allowmigrate();
203 201 * return (0)
204 202 *
205 203 * In second lofault handler (.copyerr2):
206 204 * We've tried to restore fp state from the stack and failed. To
207 205 * prevent returning with a corrupted fp state, we panic.
208 206 */
209 207
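The dispatch above reduces to a small amount of C. A minimal sketch, assuming the hw_copy_limit_* globals and VIS_COPY_THRESHOLD as defined later in this file (the helper name use_fpblk is hypothetical):

	#include <stddef.h>
	#include <stdint.h>

	extern int hw_copy_limit_1, hw_copy_limit_2, hw_copy_limit_4,
	    hw_copy_limit_8;
	#define	VIS_COPY_THRESHOLD	256

	/* Decide between the leaf (small_copy) path and FPBLK_copy. */
	static int
	use_fpblk(const void *src, void *dst, size_t len)
	{
		uintptr_t d = (uintptr_t)src ^ (uintptr_t)dst;
		int limit;

		if (len <= VIS_COPY_THRESHOLD)	/* quick test for short copies */
			return (0);
		if ((d & 7) == 0)		/* long word alignable */
			limit = hw_copy_limit_8;
		else if (d & 1)			/* not even halfword alignable */
			limit = hw_copy_limit_1;
		else if (d & 3)			/* halfword but not word alignable */
			limit = hw_copy_limit_2;
		else				/* word alignable */
			limit = hw_copy_limit_4;
		if (limit == 0)			/* zero disables HW copy */
			return (0);
		return (len > (size_t)limit);	/* above limit: FPBLK_copy */
	}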
210 208 /*
211 209 * Comments about optimization choices
212 210 *
213 211 * The initial optimization decision in this code is to determine
214 212 * whether to use the FP registers for a copy or not. If we don't
215 213 * use the FP registers, we can execute the copy as a leaf routine,
216 214 * saving a register save and restore. Also, less elaborate setup
217 215 * is required, allowing short copies to be completed more quickly.
218 216 * For longer copies, especially unaligned ones (where the src and
219 217 * dst do not align to allow simple ldx,stx operation), the FP
220 218 * registers allow much faster copy operations.
221 219 *
222 220 * The estimated extra cost of the FP path will vary depending on
223 221 * src/dst alignment, dst offset from the next 64 byte FPblock store
224 222 * boundary, remaining src data after the last full dst cache line is
225 223 * moved, whether the FP registers need to be saved, and some other
226 224 * minor issues. The average additional overhead is estimated to be
227 225 * 400 clocks. Since each non-repeated/predicted tst and branch costs
228 226 * around 10 clocks, elaborate calculation would slow down all
229 227 * longer copies and only benefit a small portion of medium sized
230 228 * copies. Rather than incur such cost, we chose fixed transition
231 229 * points for each of the alignment choices.
232 230 *
233 231 * For the inner loop, here is a comparison of the per cache line
234 232 * costs for each alignment when src&dst are in cache:
235 233 *
236 234 * byte aligned: 108 clocks slower for non-FPBLK
237 235 * half aligned: 44 clocks slower for non-FPBLK
238 236 * word aligned: 12 clocks slower for non-FPBLK
239 237 * long aligned: 4 clocks >>faster<< for non-FPBLK
240 238 *
241 239 * The long aligned loop runs faster because it does no prefetching.
242 240 * That wins if the data is not in cache or there is too little
243 241 * data to gain much benefit from prefetching. But when there
244 242 * is more data and that data is not in cache, failing to prefetch
245 243 * can run much slower. In addition, there is a 2 Kbyte store queue
246 244 * which will cause the non-FPBLK inner loop to slow for larger copies.
247 245 * The exact tradeoff is strongly load and application dependent, with
248 246 * increasing risk of a customer visible performance regression if the
249 247 * non-FPBLK code is used for larger copies. Studies of synthetic in-cache
250 248 * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
251 249 * upper limit for the non-FPBLK code. To minimize performance regression
252 250 * risk while still gaining the primary benefits of the improvements to
253 251 * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
254 252 * hw_copy_limit_*. Later experimental studies using different values
255 253 * of hw_copy_limit_* can be used to make further adjustments if
256 254 * appropriate.
257 255 *
258 256 * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
259 257 * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
260 258 * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
261 259 * hw_copy_limit_8 = src and dst are longword aligned
262 260 *
263 261 * To say that src and dst are word aligned means that after
264 262 * some initial alignment activity of moving 0 to 3 bytes,
265 263 * both the src and dst will be on word boundaries so that
266 264 * word loads and stores may be used.
267 265 *
268 266 * Default values as of May 2005 are:
269 267 * hw_copy_limit_1 = 256
270 268 * hw_copy_limit_2 = 512
271 269 * hw_copy_limit_4 = 1024
272 270 * hw_copy_limit_8 = 1024 (or 1536 on some systems)
273 271 *
274 272 *
275 273 * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
276 274 * disabled for that alignment choice.
277 275 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256),
278 276 * the value of VIS_COPY_THRESHOLD is used.
279 277 * It is not envisioned that hw_copy_limit_? will be changed in the field.
280 278 * It is provided to allow for disabling FPBLK copies and to allow
281 279 * easy testing of alternate values on future HW implementations
282 280 * that might have different cache sizes, clock rates or instruction
283 281 * timing rules.
284 282 *
285 283 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
286 284 * threshold to speed up all shorter copies (less than 256). That
287 285 * saves an alignment test, memory reference, and enabling test
288 286 * for all short copies, or an estimated 24 clocks.
289 287 *
290 288 * The order in which these limits are checked does matter since each
291 289 * non-predicted tst and branch costs around 10 clocks.
292 290 * If src and dst are randomly selected addresses,
293 291 * 4 of 8 will not be alignable.
294 292 * 2 of 8 will be half word alignable.
295 293 * 1 of 8 will be word alignable.
296 294 * 1 of 8 will be long word alignable.
297 295 * But, tests on running kernels show that the src and dst given to copy code
298 296 * are typically not on random alignments. Structure copies and
299 297 * copies of larger data sizes are often on long word boundaries.
300 298 * So we test the long word alignment case first, then
301 299 * the byte alignment, then halfword, then word alignment.
302 300 *
303 301 * Several times, tests for length are made to split the code
304 302 * into subcases. These tests often allow later tests to be
305 303 * avoided. For example, within the non-FPBLK copy, we first
306 304 * check for tiny copies of 3 bytes or less. That allows us
307 305 * to use a 4-way unrolled loop for the general byte copy case
308 306 * without a test on loop entry.
309 307 * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
310 308 * vs longer cases. For the really short case, we don't attempt to
311 309 * align src and dst. We try to minimize special case tests in
312 310 * the shortest loops as each test adds a significant percentage
313 311 * to the total time.
314 312 *
315 313 * For the medium sized cases, we allow ourselves to adjust the
316 314 * src and dst alignment and provide special cases for each of
317 315 * the four adjusted alignment cases. The CHKSIZE that was used
318 316 * to decide between short and medium size was chosen to be 39
319 317 * as that allows for the worst case of 7 bytes of alignment
320 318 * shift and 4 times 8 bytes for the first long word unrolling.
321 319 * That knowledge saves an initial test for length on entry into
322 320 * the medium cases. If the general loop unrolling factor were
323 321 * to be increased, this number would also need to be adjusted.
324 322 *
325 323 * For all cases in the non-FPBLK code where it is known that at
326 324 * least 4 chunks of data are available for movement, the
327 325 * loop is unrolled by four. This 4-way loop runs in 8 clocks
328 326 * or 2 clocks per data element.
329 327 *
330 328 * Instruction alignment is forced by use of .align 16 directives
331 329 * and nops which are not executed in the code. This
332 330 * combination of operations shifts the alignment of following
333 331 * loops to ensure that loops are aligned so that their instructions
334 332 * fall within the minimum number of 4-instruction fetch groups.
335 333 * If instructions are inserted or removed between the .align
336 334 * instruction and the unrolled loops, then the alignment needs
337 335 * to be readjusted. Misaligned loops can add a clock per loop
338 336 * iteration to the loop timing.
339 337 *
340 338 * In a few cases, code is duplicated to avoid a branch. Since
341 339 * a non-predicted tst and branch takes 10 clocks, this savings
342 340 * is judged an appropriate time-space tradeoff.
343 341 *
344 342 * Within the FPBLK-code, the prefetch method in the inner
345 343 * loop needs to be explained as it is not standard. Two
346 344 * prefetches are issued for each cache line instead of one.
347 345 * The primary one is at the maximum reach of 8 cache lines.
348 346 * Most of the time, that maximum prefetch reach gives the
349 347 * cache line more time to reach the processor for systems with
350 348 * higher processor clocks. But, sometimes memory interference
351 349 * can cause that prefetch to be dropped. Putting a second
352 350 * prefetch at a reach of 5 cache lines catches the drops
353 351 * three iterations later and shows a measured improvement
354 352 * in performance over any similar loop with a single prefetch.
355 353 * The prefetches are placed in the loop so they overlap with
356 354 * non-memory instructions, so that there is no extra cost
357 355 * when the data is already in-cache.
358 356 *
359 357 */
360 358
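The dual-prefetch pattern described above can be sketched in C, assuming GCC's __builtin_prefetch and a hypothetical copy_line() that moves one 64-byte line; the 8- and 5-line reaches are the ones discussed in the comment:

	#include <stddef.h>

	#define	LINE	64	/* VIS_BLOCKSIZE */

	extern void copy_line(char *dst, const char *src);	/* hypothetical */

	static void
	prefetching_copy(char *dst, const char *src, size_t cnt)
	{
		for (; cnt >= LINE; src += LINE, dst += LINE, cnt -= LINE) {
			__builtin_prefetch(src + 8 * LINE, 0, 0); /* primary */
			__builtin_prefetch(src + 5 * LINE, 0, 0); /* recovers drops */
			copy_line(dst, src);	/* overlaps with the prefetches */
		}
	}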
361 359 /*
362 360 * Notes on preserving existing fp state and on membars.
363 361 *
364 362 * When a copyOP decides to use fp we may have to preserve existing
365 363 * floating point state. It is not the caller's state that we need to
366 364 * preserve - the rest of the kernel does not use fp and, anyway, fp
367 365 * registers are volatile across a call. Some examples:
368 366 *
369 367 * - userland has fp state and is interrupted (device interrupt
370 368 * or trap) and within the interrupt/trap handling we use
371 369 * bcopy()
372 370 * - another (higher level) interrupt or trap handler uses bcopy
373 371 * while a bcopy from an earlier interrupt is still active
374 372 * - an asynchronous error trap occurs while fp state exists (in
375 373 * userland or in kernel copy) and the tl0 component of the handling
376 374 * uses bcopy
377 375 * - a user process with fp state incurs a copy-on-write fault and
378 376 * hwblkpagecopy always uses fp
379 377 *
380 378 * We therefore need a per-call place in which to preserve fp state -
381 379 * using our stack is ideal (and since fp copy cannot be leaf optimized
382 380 * because of calls it makes, this is no hardship).
383 381 *
384 382 * When we have finished fp copy (with its repeated block stores)
385 383 * we must membar #Sync so that our block stores may complete before
386 384 * we either restore the original fp state into the fp registers or
387 385 * return to a caller which may initiate other fp operations that could
388 386 * modify the fp regs we used before the block stores complete.
389 387 *
390 388 * Synchronous faults (eg, unresolvable DMMU miss) that occur while
391 389 * t_lofault is not NULL will not panic but will instead trampoline
392 390 * to the registered lofault handler. There is no need for any
393 391 * membars for these - eg, our store to t_lofault will always be visible to
394 392 * ourselves and it is our cpu which will take any trap.
395 393 *
396 394 * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
397 395 * while t_lofault is not NULL will also not panic. Since we're copying
398 396 * to or from userland the extent of the damage is known - the destination
399 397 * buffer is incomplete. So trap handlers will trampoline to the lofault
400 398 * handler in this case which should take some form of error action to
401 399 * avoid using the incomplete buffer. The trap handler also flags the
402 400 * fault so that later return-from-trap handling (for the trap that brought
403 401 * this thread into the kernel in the first place) can notify the process
404 402 * and reboot the system (or restart the service with Greenline/Contracts).
405 403 *
406 404 * Asynchronous faults (eg, uncorrectable ECC error from memory) can
407 405 * result in deferred error traps - the trap is taken sometime after
408 406 * the event and the trap PC may not be the PC of the faulting access.
409 407 * Delivery of such pending traps can be forced by a membar #Sync, acting
410 408 * as an "error barrier" in this role. To accurately apply the user/kernel
411 409 * separation described in the preceding paragraph we must force delivery
412 410 * of deferred traps affecting kernel state before we install a lofault
413 411 * handler (if we interpose a new lofault handler on an existing one there
414 412 * is no need to repeat this), and we must force delivery of deferred
415 413 * errors affecting the lofault-protected region before we clear t_lofault.
416 414 * Failure to do so results in lost kernel state being interpreted as
417 415 * affecting a copyin/copyout only, or in an error that really only
418 416 * affects copy data being interpreted as losing kernel state.
419 417 *
420 418 * Since the copy operations may preserve and later restore floating
421 419 * point state that does not belong to the caller (see examples above),
422 420 * we must be careful in how we do this in order to prevent corruption
423 421 * of another program.
424 422 *
425 423 * To make sure that floating point state is always saved and restored
426 424 * correctly, the following "big rules" must be followed when the floating
427 425 * point registers will be used:
428 426 *
429 427 * 1. %l6 always holds the caller's lofault handler. Also in this register,
430 428 * Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
431 429 * use. Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a
432 430 * lofault handler was set coming in.
433 431 *
434 432 * 2. The FPUSED flag indicates that all FP state has been successfully stored
435 433 * on the stack. It should not be set until this save has been completed.
436 434 *
437 435 * 3. The FPUSED flag should not be cleared on exit until all FP state has
438 436 * been restored from the stack. If an error occurs while restoring
439 437 * data from the stack, the error handler can check this flag to see if
440 438 * a restore is necessary.
441 439 *
442 440 * 4. Code run under the new lofault handler must be kept to a minimum. In
443 441 * particular, any calls to FP_ALLOWMIGRATE, which could result in a call
444 442 * to kpreempt(), should not be made until after the lofault handler has
445 443 * been restored.
446 444 */
447 445
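The barrier discipline above, restated as a hedged C-style sketch (membar_sync(), install_lofault(), clear_lofault(), and fp_block_copy() are stand-ins for the assembly sequences that follow):

	extern void membar_sync(void);
	extern void install_lofault(void (*handler)(void));	/* hypothetical */
	extern void clear_lofault(void);			/* hypothetical */
	extern void fp_block_copy(void);			/* hypothetical */

	static void
	lofault_protected_copy(void (*handler)(void))
	{
		/*
		 * Error barrier: force delivery of deferred traps affecting
		 * kernel state before interposing our handler (unnecessary
		 * when interposing on an existing lofault handler).
		 */
		membar_sync();
		install_lofault(handler);

		fp_block_copy();	/* repeated block stores */

		/*
		 * Error barrier again: let the block stores complete before
		 * the fp regs are restored or reused, and force delivery of
		 * deferred errors hitting the protected region before
		 * t_lofault is cleared.
		 */
		membar_sync();
		clear_lofault();
	}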
448 446 /*
449 447 * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
450 448 * to "break even" using FP/VIS-accelerated memory operations.
451 449 * The FPBLK code assumes a minimum number of bytes are available
452 450 * to be moved on entry. Check that code carefully before
453 451 * reducing VIS_COPY_THRESHOLD below 256.
454 452 */
455 453 /*
456 454 * This shadows sys/machsystm.h, which can't be included due to the lack of
457 455 * _ASM guards in include files it references. Change it here, change it there.
458 456 */
459 457 #define VIS_COPY_THRESHOLD 256
460 458
461 459 /*
462 460 * TEST for very short copies
463 461 * Be aware that the maximum unroll for the short unaligned case
464 462 * is SHORTCOPY+1
465 463 */
466 464 #define SHORTCOPY 3
467 465 #define CHKSIZE 39
468 466
469 467 /*
470 468 * Indicates that we're to trampoline to the error handler.
471 469 * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
472 470 * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
473 471 */
474 472 #define FPUSED_FLAG 1
475 473 #define TRAMP_FLAG 2
476 474 #define MASK_FLAGS 3
477 475
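Because lofault handlers are at least word aligned, these flags ride in the two low bits of the saved handler value (%l6 or %o4). A minimal C sketch of the encoding, with hypothetical helper names:

	#include <stdint.h>

	#define	FPUSED_FLAG	1
	#define	TRAMP_FLAG	2
	#define	MASK_FLAGS	3

	typedef void (*lofault_t)(void);

	static uintptr_t
	tag_lofault(lofault_t h, unsigned flags)	/* save side */
	{
		return ((uintptr_t)h | (flags & MASK_FLAGS));
	}

	static lofault_t
	untag_lofault(uintptr_t saved)	/* mirrors "andn %l6, MASK_FLAGS" */
	{
		return ((lofault_t)(saved & ~(uintptr_t)MASK_FLAGS));
	}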
478 476 /*
479 477 * Number of outstanding prefetches.
480 478 * first prefetch moves data from L2 to L1 (n_reads)
481 479 * second prefetch moves data from memory to L2 (one_read)
482 480 */
483 481 #define OLYMPUS_C_PREFETCH 24
484 482 #define OLYMPUS_C_2ND_PREFETCH 12
485 483
486 484 #define VIS_BLOCKSIZE 64
487 485
488 486 /*
489 487 * Size of stack frame in order to accommodate a 64-byte aligned
490 488 * floating-point register save area and 2 64-bit temp locations.
491 489 * All copy functions use two quadrants of fp registers; to assure a
492 490 * block-aligned two-block buffer in which to save, we must reserve
493 491 * three blocks on stack. Not all functions preserve %fprs on stack
494 492 * or need to preserve %gsr, but we use HWCOPYFRAMESIZE for all.
495 493 *
496 494 * _______________________________________ <-- %fp + STACK_BIAS
497 495 * | We may need to preserve 2 quadrants |
498 496 * | of fp regs, but since we do so with |
499 497 * | BST/BLD we need room in which to |
500 498 * | align to VIS_BLOCKSIZE bytes. So |
501 499 * | this area is 3 * VIS_BLOCKSIZE. | <-- - SAVED_FPREGS_OFFSET
502 500 * |-------------------------------------|
503 501 * | 8 bytes to save %fprs | <-- - SAVED_FPRS_OFFSET
504 502 * |-------------------------------------|
505 503 * | 8 bytes to save %gsr | <-- - SAVED_GSR_OFFSET
506 504 * ---------------------------------------
507 505 */
508 506 #define HWCOPYFRAMESIZE ((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
509 507 #define SAVED_FPREGS_OFFSET (VIS_BLOCKSIZE * 3)
510 508 #define SAVED_FPREGS_ADJUST ((VIS_BLOCKSIZE * 2) - 1)
511 509 #define SAVED_FPRS_OFFSET (SAVED_FPREGS_OFFSET + 8)
512 510 #define SAVED_GSR_OFFSET (SAVED_FPRS_OFFSET + 8)
513 511
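The layout arithmetic can be sanity-checked at compile time; a sketch using C11 static_assert with the values mirrored from the #defines above:

	#include <assert.h>

	#define	VIS_BLOCKSIZE		64
	#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
	#define	SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 3)
	#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
	#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)

	static_assert(HWCOPYFRAMESIZE == 208,
	    "three 64-byte blocks plus two 8-byte temps");
	static_assert(SAVED_GSR_OFFSET == HWCOPYFRAMESIZE,
	    "%gsr slot is the deepest item below %fp + STACK_BIAS");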
514 512 /*
515 513 * Common macros used by the various versions of the block copy
516 514 * routines in this file.
517 515 */
518 516
519 517 /*
520 518 * In FP copies if we do not have preserved data to restore over
521 519 * the fp regs we used then we must zero those regs to avoid
522 520 * exposing portions of the data to later threads (data security).
523 521 *
524 522 * Copy functions use either quadrants 1 and 3 or 2 and 4.
525 523 *
526 524 * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
527 525 * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
528 526 *
529 527 * The instructions below are quicker than repeated fzero instructions
530 528 * since they can dispatch down two fp pipelines.
531 529 */
532 530 #define FZEROQ1Q3 \
533 531 fzero %f0 ;\
534 532 fmovd %f0, %f2 ;\
535 533 fmovd %f0, %f4 ;\
536 534 fmovd %f0, %f6 ;\
537 535 fmovd %f0, %f8 ;\
538 536 fmovd %f0, %f10 ;\
539 537 fmovd %f0, %f12 ;\
540 538 fmovd %f0, %f14 ;\
541 539 fmovd %f0, %f32 ;\
542 540 fmovd %f0, %f34 ;\
543 541 fmovd %f0, %f36 ;\
544 542 fmovd %f0, %f38 ;\
545 543 fmovd %f0, %f40 ;\
546 544 fmovd %f0, %f42 ;\
547 545 fmovd %f0, %f44 ;\
548 546 fmovd %f0, %f46
549 547
550 548 #define FZEROQ2Q4 \
551 549 fzero %f16 ;\
552 550 fmovd %f0, %f18 ;\
553 551 fmovd %f0, %f20 ;\
554 552 fmovd %f0, %f22 ;\
555 553 fmovd %f0, %f24 ;\
556 554 fmovd %f0, %f26 ;\
557 555 fmovd %f0, %f28 ;\
558 556 fmovd %f0, %f30 ;\
559 557 fmovd %f0, %f48 ;\
560 558 fmovd %f0, %f50 ;\
561 559 fmovd %f0, %f52 ;\
562 560 fmovd %f0, %f54 ;\
563 561 fmovd %f0, %f56 ;\
564 562 fmovd %f0, %f58 ;\
565 563 fmovd %f0, %f60 ;\
566 564 fmovd %f0, %f62
567 565
568 566 /*
569 567 * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
570 568 * Used to save and restore in-use fp registers when we want to use FP
571 569 * and find fp already in use and copy size still large enough to justify
572 570 * the additional overhead of this save and restore.
573 571 *
574 572 * A membar #Sync is needed before save to sync fp ops initiated before
575 573 * the call to the copy function (by whoever has fp in use); for example,
576 574 * an earlier block load to the quadrant we are about to save may still be
577 575 * "in flight". A membar #Sync is required at the end of the save to
578 576 * sync our block store (the copy code is about to begin ldd's to the
579 577 * first quadrant).
580 578 *
581 579 * Similarly: a membar #Sync before restore allows the block stores of
582 580 * the copy operation to complete before we fill the quadrants with their
583 581 * original data, and a membar #Sync after restore lets the block loads
584 582 * of the restore complete before we return to whoever has the fp regs
585 583 * in use. To avoid repeated membar #Sync we make it the responsibility
586 584 * of the copy code to membar #Sync immediately after copy is complete
587 585 * and before using the BLD_*_FROMSTACK macro.
588 586 */
589 -#if !defined(lint)
590 587 #define BST_FPQ1Q3_TOSTACK(tmp1) \
591 588 /* membar #Sync */ ;\
592 589 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
593 590 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
594 591 stda %f0, [tmp1]ASI_BLK_P ;\
595 592 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
596 593 stda %f32, [tmp1]ASI_BLK_P ;\
597 594 membar #Sync
598 595
599 596 #define BLD_FPQ1Q3_FROMSTACK(tmp1) \
600 597 /* membar #Sync - provided at copy completion */ ;\
601 598 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
602 599 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
603 600 ldda [tmp1]ASI_BLK_P, %f0 ;\
604 601 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
605 602 ldda [tmp1]ASI_BLK_P, %f32 ;\
606 603 membar #Sync
607 604
608 605 #define BST_FPQ2Q4_TOSTACK(tmp1) \
609 606 /* membar #Sync */ ;\
610 607 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
611 608 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
612 609 stda %f16, [tmp1]ASI_BLK_P ;\
613 610 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
614 611 stda %f48, [tmp1]ASI_BLK_P ;\
615 612 membar #Sync
616 613
617 614 #define BLD_FPQ2Q4_FROMSTACK(tmp1) \
618 615 /* membar #Sync - provided at copy completion */ ;\
619 616 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
620 617 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
621 618 ldda [tmp1]ASI_BLK_P, %f16 ;\
622 619 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
623 620 ldda [tmp1]ASI_BLK_P, %f48 ;\
624 621 membar #Sync
625 -#endif
626 622
627 623 /*
628 624 * FP_NOMIGRATE and FP_ALLOWMIGRATE. Prevent migration (or, stronger,
629 625 * prevent preemption if there is no t_lwp to save FP state to on context
630 626 * switch) before commencing an FP copy, and reallow it on completion or
631 627 * in error trampoline paths when we were using FP copy.
632 628 *
633 629 * Both macros may call other functions, so be aware that all outputs are
634 630 * forfeit after using these macros. For this reason we do not pass registers
635 631 * to use - we just use any outputs we want.
636 632 *
637 633 * Pseudo code:
638 634 *
639 635 * FP_NOMIGRATE:
640 636 *
641 637 * if (curthread->t_lwp) {
642 638 * thread_nomigrate();
643 639 * } else {
644 640 * kpreempt_disable();
645 641 * }
646 642 *
647 643 * FP_ALLOWMIGRATE:
648 644 *
649 645 * if (curthread->t_lwp) {
650 646 * thread_allowmigrate();
651 647 * } else {
652 648 * kpreempt_enable();
653 649 * }
654 650 */
655 651
656 652 #define FP_NOMIGRATE(label1, label2) \
657 653 ldn [THREAD_REG + T_LWP], %o0 ;\
658 654 brz,a,pn %o0, label1/**/f ;\
659 655 ldsb [THREAD_REG + T_PREEMPT], %o1 ;\
660 656 call thread_nomigrate ;\
661 657 nop ;\
662 658 ba label2/**/f ;\
663 659 nop ;\
664 660 label1: ;\
665 661 inc %o1 ;\
666 662 stb %o1, [THREAD_REG + T_PREEMPT] ;\
667 663 label2:
668 664
669 665 #define FP_ALLOWMIGRATE(label1, label2) \
670 666 ldn [THREAD_REG + T_LWP], %o0 ;\
671 667 brz,a,pn %o0, label1/**/f ;\
672 668 ldsb [THREAD_REG + T_PREEMPT], %o1 ;\
673 669 call thread_allowmigrate ;\
674 670 nop ;\
675 671 ba label2/**/f ;\
676 672 nop ;\
677 673 label1: ;\
678 674 dec %o1 ;\
679 675 brnz,pn %o1, label2/**/f ;\
680 676 stb %o1, [THREAD_REG + T_PREEMPT] ;\
681 677 ldn [THREAD_REG + T_CPU], %o0 ;\
682 678 ldub [%o0 + CPU_KPRUNRUN], %o0 ;\
683 679 brz,pt %o0, label2/**/f ;\
684 680 nop ;\
685 681 call kpreempt ;\
686 682 rdpr %pil, %o0 ;\
687 683 label2:
688 684
689 685 /*
690 686 * Copy a block of storage, returning an error code if `from' or
691 687 * `to' takes a kernel pagefault which cannot be resolved.
692 688 * Returns errno value on pagefault error, 0 if all ok
693 689 */
694 690
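Seen from C, the contract is simply a checked copy; a hedged usage sketch (the prototype matches the lint stub being removed just below):

	#include <stddef.h>

	extern int kcopy(const void *from, void *to, size_t count);

	static int
	copy_in_kernel(const void *src, void *dst, size_t len)
	{
		int err;

		if ((err = kcopy(src, dst, len)) != 0)
			return (err);	/* unresolved kernel pagefault */
		/* ... continue using dst ... */
		return (0);
	}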
695 -#if defined(lint)
696 -
697 -/* ARGSUSED */
698 -int
699 -kcopy(const void *from, void *to, size_t count)
700 -{ return(0); }
701 -
702 -#else /* lint */
703 -
704 691 .seg ".text"
705 692 .align 4
706 693
707 694 ENTRY(kcopy)
708 695
709 696 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
710 697 bleu,pt %ncc, .kcopy_small ! go to small copy cases
711 698 xor %o0, %o1, %o3 ! are src, dst alignable?
712 699 btst 7, %o3 !
713 700 bz,pt %ncc, .kcopy_8 ! check for longword alignment
714 701 nop
715 702 btst 1, %o3 !
716 703 bz,pt %ncc, .kcopy_2 ! check for half-word
717 704 nop
718 705 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
719 706 ld [%o3 + %lo(hw_copy_limit_1)], %o3
720 707 tst %o3
721 708 bz,pn %icc, .kcopy_small ! if zero, disable HW copy
722 709 cmp %o2, %o3 ! if length <= limit
723 710 bleu,pt %ncc, .kcopy_small ! go to small copy
724 711 nop
725 712 ba,pt %ncc, .kcopy_more ! otherwise go to large copy
726 713 nop
727 714 .kcopy_2:
728 715 btst 3, %o3 !
729 716 bz,pt %ncc, .kcopy_4 ! check for word alignment
730 717 nop
731 718 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
732 719 ld [%o3 + %lo(hw_copy_limit_2)], %o3
733 720 tst %o3
734 721 bz,pn %icc, .kcopy_small ! if zero, disable HW copy
735 722 cmp %o2, %o3 ! if length <= limit
736 723 bleu,pt %ncc, .kcopy_small ! go to small copy
737 724 nop
738 725 ba,pt %ncc, .kcopy_more ! otherwise go to large copy
739 726 nop
740 727 .kcopy_4:
741 728 ! already checked longword, must be word aligned
742 729 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
743 730 ld [%o3 + %lo(hw_copy_limit_4)], %o3
744 731 tst %o3
745 732 bz,pn %icc, .kcopy_small ! if zero, disable HW copy
746 733 cmp %o2, %o3 ! if length <= limit
747 734 bleu,pt %ncc, .kcopy_small ! go to small copy
748 735 nop
749 736 ba,pt %ncc, .kcopy_more ! otherwise go to large copy
750 737 nop
751 738 .kcopy_8:
752 739 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
753 740 ld [%o3 + %lo(hw_copy_limit_8)], %o3
754 741 tst %o3
755 742 bz,pn %icc, .kcopy_small ! if zero, disable HW copy
756 743 cmp %o2, %o3 ! if length <= limit
757 744 bleu,pt %ncc, .kcopy_small ! go to small copy
758 745 nop
759 746 ba,pt %ncc, .kcopy_more ! otherwise go to large copy
760 747 nop
761 748
762 749 .kcopy_small:
763 750 sethi %hi(.sm_copyerr), %o5 ! sm_copyerr is lofault value
764 751 or %o5, %lo(.sm_copyerr), %o5
765 752 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler
766 753 membar #Sync ! sync error barrier
767 754 ba,pt %ncc, .sm_do_copy ! common code
768 755 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault
769 756
770 757 .kcopy_more:
771 758 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
772 759 sethi %hi(.copyerr), %l7 ! copyerr is lofault value
773 760 or %l7, %lo(.copyerr), %l7
774 761 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler
775 762 membar #Sync ! sync error barrier
776 763 ba,pt %ncc, .do_copy ! common code
777 764 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
778 765
779 766
780 767 /*
781 768 * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
782 769 * Errno value is in %g1. bcopy_more uses fp quadrants 1 and 3.
783 770 */
784 771 .copyerr:
785 772 set .copyerr2, %l0
786 773 membar #Sync ! sync error barrier
787 774 stn %l0, [THREAD_REG + T_LOFAULT] ! set t_lofault
788 775 btst FPUSED_FLAG, %l6
789 776 bz %ncc, 1f
790 777 and %l6, TRAMP_FLAG, %l0 ! copy trampoline flag to %l0
791 778
792 779 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr
793 780 wr %o2, 0, %gsr
794 781
795 782 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
796 783 btst FPRS_FEF, %o3
797 784 bz,pt %icc, 4f
798 785 nop
799 786
800 787 BLD_FPQ1Q3_FROMSTACK(%o2)
801 788
802 789 ba,pt %ncc, 1f
803 790 wr %o3, 0, %fprs ! restore fprs
804 791
805 792 4:
806 793 FZEROQ1Q3
807 794 wr %o3, 0, %fprs ! restore fprs
808 795
809 796 !
810 797 ! Need to cater for the different expectations of kcopy
811 798 ! and bcopy. kcopy will *always* set a t_lofault handler
812 799 ! If it fires, we're expected to just return the error code
813 800 ! and *not* to invoke any existing error handler. As far as
814 801 ! bcopy is concerned, we only set t_lofault if there was an
815 802 ! existing lofault handler. In that case we're expected to
816 803 ! invoke the previously existing handler after resetting the
817 804 ! t_lofault value.
818 805 !
819 806 1:
820 807 andn %l6, MASK_FLAGS, %l6 ! turn trampoline flag off
821 808 membar #Sync ! sync error barrier
822 809 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
823 810 FP_ALLOWMIGRATE(5, 6)
824 811
825 812 btst TRAMP_FLAG, %l0
826 813 bnz,pn %ncc, 3f
827 814 nop
828 815 ret
829 816 restore %g1, 0, %o0
830 817
831 818 3:
832 819 !
833 820 ! We're here via bcopy. There *must* have been an error handler
834 821 ! in place; otherwise we would have died a nasty death already.
835 822 !
836 823 jmp %l6 ! goto real handler
837 824 restore %g0, 0, %o0 ! dispose of copy window
838 825
839 826 /*
840 827 * We got here because of a fault in .copyerr. We can't safely restore fp
841 828 * state, so we panic.
842 829 */
843 830 fp_panic_msg:
844 831 .asciz "Unable to restore fp state after copy operation"
845 832
846 833 .align 4
847 834 .copyerr2:
848 835 set fp_panic_msg, %o0
849 836 call panic
850 837 nop
851 838
852 839 /*
853 840 * We got here because of a fault during a small kcopy or bcopy.
854 841 * No floating point registers are used by the small copies.
855 842 * Errno value is in %g1.
856 843 */
857 844 .sm_copyerr:
858 845 1:
859 846 btst TRAMP_FLAG, %o4
860 847 membar #Sync
861 848 andn %o4, TRAMP_FLAG, %o4
862 849 bnz,pn %ncc, 3f
863 850 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
864 851 retl
865 852 mov %g1, %o0
866 853 3:
867 854 jmp %o4 ! goto real handler
868 855 mov %g0, %o0 !
869 856
870 857 SET_SIZE(kcopy)
871 -#endif /* lint */
872 858
873 859
874 860 /*
875 861 * Copy a block of storage - must not overlap (from + len <= to).
876 862 * Registers: l6 - saved t_lofault
877 863 * (for short copies, o4 - saved t_lofault)
878 864 *
879 865 * Copy a page of memory.
880 866 * Assumes double word alignment and a count >= 256.
881 867 */
882 -#if defined(lint)
883 868
884 -/* ARGSUSED */
885 -void
886 -bcopy(const void *from, void *to, size_t count)
887 -{}
888 -
889 -#else /* lint */
890 -
891 869 ENTRY(bcopy)
892 870
893 871 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
894 872 bleu,pt %ncc, .bcopy_small ! go to small copy cases
895 873 xor %o0, %o1, %o3 ! are src, dst alignable?
896 874 btst 7, %o3 !
897 875 bz,pt %ncc, .bcopy_8 ! check for longword alignment
898 876 nop
899 877 btst 1, %o3 !
900 878 bz,pt %ncc, .bcopy_2 ! check for half-word
901 879 nop
902 880 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
903 881 ld [%o3 + %lo(hw_copy_limit_1)], %o3
904 882 tst %o3
905 883 bz,pn %icc, .bcopy_small ! if zero, disable HW copy
906 884 cmp %o2, %o3 ! if length <= limit
907 885 bleu,pt %ncc, .bcopy_small ! go to small copy
908 886 nop
909 887 ba,pt %ncc, .bcopy_more ! otherwise go to large copy
910 888 nop
911 889 .bcopy_2:
912 890 btst 3, %o3 !
913 891 bz,pt %ncc, .bcopy_4 ! check for word alignment
914 892 nop
915 893 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
916 894 ld [%o3 + %lo(hw_copy_limit_2)], %o3
917 895 tst %o3
918 896 bz,pn %icc, .bcopy_small ! if zero, disable HW copy
919 897 cmp %o2, %o3 ! if length <= limit
920 898 bleu,pt %ncc, .bcopy_small ! go to small copy
921 899 nop
922 900 ba,pt %ncc, .bcopy_more ! otherwise go to large copy
923 901 nop
924 902 .bcopy_4:
925 903 ! already checked longword, must be word aligned
926 904 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
927 905 ld [%o3 + %lo(hw_copy_limit_4)], %o3
928 906 tst %o3
929 907 bz,pn %icc, .bcopy_small ! if zero, disable HW copy
930 908 cmp %o2, %o3 ! if length <= limit
931 909 bleu,pt %ncc, .bcopy_small ! go to small copy
932 910 nop
933 911 ba,pt %ncc, .bcopy_more ! otherwise go to large copy
934 912 nop
935 913 .bcopy_8:
936 914 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
937 915 ld [%o3 + %lo(hw_copy_limit_8)], %o3
938 916 tst %o3
939 917 bz,pn %icc, .bcopy_small ! if zero, disable HW copy
940 918 cmp %o2, %o3 ! if length <= limit
941 919 bleu,pt %ncc, .bcopy_small ! go to small copy
942 920 nop
943 921 ba,pt %ncc, .bcopy_more ! otherwise go to large copy
944 922 nop
945 923
946 924 .align 16
947 925 .bcopy_small:
948 926 ldn [THREAD_REG + T_LOFAULT], %o4 ! save t_lofault
949 927 tst %o4
950 928 bz,pt %icc, .sm_do_copy
951 929 nop
952 930 sethi %hi(.sm_copyerr), %o5
953 931 or %o5, %lo(.sm_copyerr), %o5
954 932 membar #Sync ! sync error barrier
955 933 stn %o5, [THREAD_REG + T_LOFAULT] ! install new vector
956 934 or %o4, TRAMP_FLAG, %o4 ! error should trampoline
957 935 .sm_do_copy:
958 936 cmp %o2, SHORTCOPY ! check for really short case
959 937 bleu,pt %ncc, .bc_sm_left !
960 938 cmp %o2, CHKSIZE ! check for medium length cases
961 939 bgu,pn %ncc, .bc_med !
962 940 or %o0, %o1, %o3 ! prepare alignment check
963 941 andcc %o3, 0x3, %g0 ! test for alignment
964 942 bz,pt %ncc, .bc_sm_word ! branch to word aligned case
965 943 .bc_sm_movebytes:
966 944 sub %o2, 3, %o2 ! adjust count to allow cc zero test
967 945 .bc_sm_notalign4:
968 946 ldub [%o0], %o3 ! read byte
969 947 stb %o3, [%o1] ! write byte
970 948 subcc %o2, 4, %o2 ! reduce count by 4
971 949 ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes
972 950 add %o0, 4, %o0 ! advance SRC by 4
973 951 stb %o3, [%o1 + 1]
974 952 ldub [%o0 - 2], %o3
975 953 add %o1, 4, %o1 ! advance DST by 4
976 954 stb %o3, [%o1 - 2]
977 955 ldub [%o0 - 1], %o3
978 956 bgt,pt %ncc, .bc_sm_notalign4 ! loop til 3 or fewer bytes remain
979 957 stb %o3, [%o1 - 1]
980 958 add %o2, 3, %o2 ! restore count
981 959 .bc_sm_left:
982 960 tst %o2
983 961 bz,pt %ncc, .bc_sm_exit ! check for zero length
984 962 deccc %o2 ! reduce count for cc test
985 963 ldub [%o0], %o3 ! move one byte
986 964 bz,pt %ncc, .bc_sm_exit
987 965 stb %o3, [%o1]
988 966 ldub [%o0 + 1], %o3 ! move another byte
989 967 deccc %o2 ! check for more
990 968 bz,pt %ncc, .bc_sm_exit
991 969 stb %o3, [%o1 + 1]
992 970 ldub [%o0 + 2], %o3 ! move final byte
993 971 ba,pt %ncc, .bc_sm_exit
994 972 stb %o3, [%o1 + 2]
995 973 .align 16
996 974 nop ! instruction alignment
997 975 ! see discussion at start of file
998 976 .bc_sm_words:
999 977 lduw [%o0], %o3 ! read word
1000 978 .bc_sm_wordx:
1001 979 subcc %o2, 8, %o2 ! update count
1002 980 stw %o3, [%o1] ! write word
1003 981 add %o0, 8, %o0 ! update SRC
1004 982 lduw [%o0 - 4], %o3 ! read word
1005 983 add %o1, 8, %o1 ! update DST
1006 984 bgt,pt %ncc, .bc_sm_words ! loop til done
1007 985 stw %o3, [%o1 - 4] ! write word
1008 986 addcc %o2, 7, %o2 ! restore count
1009 987 bz,pt %ncc, .bc_sm_exit
1010 988 deccc %o2
1011 989 bz,pt %ncc, .bc_sm_byte
1012 990 .bc_sm_half:
1013 991 subcc %o2, 2, %o2 ! reduce count by 2
1014 992 add %o0, 2, %o0 ! advance SRC by 2
1015 993 lduh [%o0 - 2], %o3 ! read half word
1016 994 add %o1, 2, %o1 ! advance DST by 2
1017 995 bgt,pt %ncc, .bc_sm_half ! loop til done
1018 996 sth %o3, [%o1 - 2] ! write half word
1019 997 addcc %o2, 1, %o2 ! restore count
1020 998 bz,pt %ncc, .bc_sm_exit
1021 999 nop
1022 1000 .bc_sm_byte:
1023 1001 ldub [%o0], %o3
1024 1002 ba,pt %ncc, .bc_sm_exit
1025 1003 stb %o3, [%o1]
1026 1004
1027 1005 .bc_sm_word:
1028 1006 subcc %o2, 4, %o2 ! update count
1029 1007 bgt,pt %ncc, .bc_sm_wordx
1030 1008 lduw [%o0], %o3 ! read word
1031 1009 addcc %o2, 3, %o2 ! restore count
1032 1010 bz,pt %ncc, .bc_sm_exit
1033 1011 stw %o3, [%o1] ! write word
1034 1012 deccc %o2 ! reduce count for cc test
1035 1013 ldub [%o0 + 4], %o3 ! load one byte
1036 1014 bz,pt %ncc, .bc_sm_exit
1037 1015 stb %o3, [%o1 + 4] ! store one byte
1038 1016 ldub [%o0 + 5], %o3 ! load second byte
1039 1017 deccc %o2
1040 1018 bz,pt %ncc, .bc_sm_exit
1041 1019 stb %o3, [%o1 + 5] ! store second byte
1042 1020 ldub [%o0 + 6], %o3 ! load third byte
1043 1021 stb %o3, [%o1 + 6] ! store third byte
1044 1022 .bc_sm_exit:
1045 1023 ldn [THREAD_REG + T_LOFAULT], %o3
1046 1024 brz,pt %o3, .bc_sm_done
1047 1025 nop
1048 1026 membar #Sync ! sync error barrier
1049 1027 andn %o4, TRAMP_FLAG, %o4
1050 1028 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1051 1029 .bc_sm_done:
1052 1030 retl
1053 1031 mov %g0, %o0 ! return 0
1054 1032
1055 1033 .align 16
1056 1034 .bc_med:
1057 1035 xor %o0, %o1, %o3 ! setup alignment check
1058 1036 btst 1, %o3
1059 1037 bnz,pt %ncc, .bc_sm_movebytes ! unaligned
1060 1038 nop
1061 1039 btst 3, %o3
1062 1040 bnz,pt %ncc, .bc_med_half ! halfword aligned
1063 1041 nop
1064 1042 btst 7, %o3
1065 1043 bnz,pt %ncc, .bc_med_word ! word aligned
1066 1044 nop
1067 1045 .bc_med_long:
1068 1046 btst 3, %o0 ! check for
1069 1047 bz,pt %ncc, .bc_med_long1 ! word alignment
1070 1048 nop
1071 1049 .bc_med_long0:
1072 1050 ldub [%o0], %o3 ! load one byte
1073 1051 inc %o0
1074 1052 stb %o3,[%o1] ! store byte
1075 1053 inc %o1
1076 1054 btst 3, %o0
1077 1055 bnz,pt %ncc, .bc_med_long0
1078 1056 dec %o2
1079 1057 .bc_med_long1: ! word aligned
1080 1058 btst 7, %o0 ! check for long word
1081 1059 bz,pt %ncc, .bc_med_long2
1082 1060 nop
1083 1061 lduw [%o0], %o3 ! load word
1084 1062 add %o0, 4, %o0 ! advance SRC by 4
1085 1063 stw %o3, [%o1] ! store word
1086 1064 add %o1, 4, %o1 ! advance DST by 4
1087 1065 sub %o2, 4, %o2 ! reduce count by 4
1088 1066 !
1089 1067 ! Now long word aligned and have at least 32 bytes to move
1090 1068 !
1091 1069 .bc_med_long2:
1092 1070 sub %o2, 31, %o2 ! adjust count to allow cc zero test
1093 1071 .bc_med_lmove:
1094 1072 ldx [%o0], %o3 ! read long word
1095 1073 stx %o3, [%o1] ! write long word
1096 1074 subcc %o2, 32, %o2 ! reduce count by 32
1097 1075 ldx [%o0 + 8], %o3 ! repeat for a total of 4 long words
1098 1076 add %o0, 32, %o0 ! advance SRC by 32
1099 1077 stx %o3, [%o1 + 8]
1100 1078 ldx [%o0 - 16], %o3
1101 1079 add %o1, 32, %o1 ! advance DST by 32
1102 1080 stx %o3, [%o1 - 16]
1103 1081 ldx [%o0 - 8], %o3
1104 1082 bgt,pt %ncc, .bc_med_lmove ! loop til 31 or fewer bytes left
1105 1083 stx %o3, [%o1 - 8]
1106 1084 addcc %o2, 24, %o2 ! restore count to long word offset
1107 1085 ble,pt %ncc, .bc_med_lextra ! check for more long words to move
1108 1086 nop
1109 1087 .bc_med_lword:
1110 1088 ldx [%o0], %o3 ! read long word
1111 1089 subcc %o2, 8, %o2 ! reduce count by 8
1112 1090 stx %o3, [%o1] ! write long word
1113 1091 add %o0, 8, %o0 ! advance SRC by 8
1114 1092 bgt,pt %ncc, .bc_med_lword ! loop til 7 or fewer bytes left
1115 1093 add %o1, 8, %o1 ! advance DST by 8
1116 1094 .bc_med_lextra:
1117 1095 addcc %o2, 7, %o2 ! restore rest of count
1118 1096 bz,pt %ncc, .bc_sm_exit ! if zero, then done
1119 1097 deccc %o2
1120 1098 bz,pt %ncc, .bc_sm_byte
1121 1099 nop
1122 1100 ba,pt %ncc, .bc_sm_half
1123 1101 nop
1124 1102
1125 1103 .align 16
1126 1104 .bc_med_word:
1127 1105 btst 3, %o0 ! check for
1128 1106 bz,pt %ncc, .bc_med_word1 ! word alignment
1129 1107 nop
1130 1108 .bc_med_word0:
1131 1109 ldub [%o0], %o3 ! load one byte
1132 1110 inc %o0
1133 1111 stb %o3,[%o1] ! store byte
1134 1112 inc %o1
1135 1113 btst 3, %o0
1136 1114 bnz,pt %ncc, .bc_med_word0
1137 1115 dec %o2
1138 1116 !
1139 1117 ! Now word aligned and have at least 36 bytes to move
1140 1118 !
1141 1119 .bc_med_word1:
1142 1120 sub %o2, 15, %o2 ! adjust count to allow cc zero test
1143 1121 .bc_med_wmove:
1144 1122 lduw [%o0], %o3 ! read word
1145 1123 stw %o3, [%o1] ! write word
1146 1124 subcc %o2, 16, %o2 ! reduce count by 16
1147 1125 lduw [%o0 + 4], %o3 ! repeat for a total of 4 words
1148 1126 add %o0, 16, %o0 ! advance SRC by 16
1149 1127 stw %o3, [%o1 + 4]
1150 1128 lduw [%o0 - 8], %o3
1151 1129 add %o1, 16, %o1 ! advance DST by 16
1152 1130 stw %o3, [%o1 - 8]
1153 1131 lduw [%o0 - 4], %o3
1154 1132 bgt,pt %ncc, .bc_med_wmove ! loop til 15 or fewer bytes left
1155 1133 stw %o3, [%o1 - 4]
1156 1134 addcc %o2, 12, %o2 ! restore count to word offset
1157 1135 ble,pt %ncc, .bc_med_wextra ! check for more words to move
1158 1136 nop
1159 1137 .bc_med_word2:
1160 1138 lduw [%o0], %o3 ! read word
1161 1139 subcc %o2, 4, %o2 ! reduce count by 4
1162 1140 stw %o3, [%o1] ! write word
1163 1141 add %o0, 4, %o0 ! advance SRC by 4
1164 1142 bgt,pt %ncc, .bc_med_word2 ! loop til 3 or fewer bytes left
1165 1143 add %o1, 4, %o1 ! advance DST by 4
1166 1144 .bc_med_wextra:
1167 1145 addcc %o2, 3, %o2 ! restore rest of count
1168 1146 bz,pt %ncc, .bc_sm_exit ! if zero, then done
1169 1147 deccc %o2
1170 1148 bz,pt %ncc, .bc_sm_byte
1171 1149 nop
1172 1150 ba,pt %ncc, .bc_sm_half
1173 1151 nop
1174 1152
1175 1153 .align 16
1176 1154 .bc_med_half:
1177 1155 btst 1, %o0 ! check for
1178 1156 bz,pt %ncc, .bc_med_half1 ! half word alignment
1179 1157 nop
1180 1158 ldub [%o0], %o3 ! load one byte
1181 1159 inc %o0
1182 1160 stb %o3,[%o1] ! store byte
1183 1161 inc %o1
1184 1162 dec %o2
1185 1163 !
1186 1164 ! Now half word aligned and have at least 38 bytes to move
1187 1165 !
1188 1166 .bc_med_half1:
1189 1167 sub %o2, 7, %o2 ! adjust count to allow cc zero test
1190 1168 .bc_med_hmove:
1191 1169 lduh [%o0], %o3 ! read half word
1192 1170 sth %o3, [%o1] ! write half word
1193 1171 subcc %o2, 8, %o2 ! reduce count by 8
1194 1172 lduh [%o0 + 2], %o3 ! repeat for a total of 4 halfwords
1195 1173 add %o0, 8, %o0 ! advance SRC by 8
1196 1174 sth %o3, [%o1 + 2]
1197 1175 lduh [%o0 - 4], %o3
1198 1176 add %o1, 8, %o1 ! advance DST by 8
1199 1177 sth %o3, [%o1 - 4]
1200 1178 lduh [%o0 - 2], %o3
1201 1179 bgt,pt %ncc, .bc_med_hmove ! loop til 7 or fewer bytes left
1202 1180 sth %o3, [%o1 - 2]
1203 1181 addcc %o2, 7, %o2 ! restore count
1204 1182 bz,pt %ncc, .bc_sm_exit
1205 1183 deccc %o2
1206 1184 bz,pt %ncc, .bc_sm_byte
1207 1185 nop
1208 1186 ba,pt %ncc, .bc_sm_half
1209 1187 nop
1210 1188
1211 1189 SET_SIZE(bcopy)
1212 1190
1213 1191 /*
1214 1192 * The _more entry points are not intended to be used directly by
1215 1193 * any caller from outside this file. They are provided to allow
1216 1194 * profiling and dtrace of the portions of the copy code that use
1217 1195 * the floating point registers.
1218 1196 * This entry is particularly important as DTRACE (at least as of
1219 1197 * 4/2004) does not support leaf functions.
1220 1198 */
1221 1199
1222 1200 ENTRY(bcopy_more)
1223 1201 .bcopy_more:
1224 1202 prefetch [%o0], #n_reads
1225 1203 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1226 1204 ldn [THREAD_REG + T_LOFAULT], %l6 ! save t_lofault
1227 1205 tst %l6
1228 1206 bz,pt %ncc, .do_copy
1229 1207 nop
1230 1208 sethi %hi(.copyerr), %o2
1231 1209 or %o2, %lo(.copyerr), %o2
1232 1210 membar #Sync ! sync error barrier
1233 1211 stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector
1234 1212 !
1235 1213 ! We've already captured whether t_lofault was zero on entry.
1236 1214 ! We need to mark ourselves as being from bcopy since both
1237 1215 ! kcopy and bcopy use the same code path. If TRAMP_FLAG is set
1238 1216 ! and the saved lofault was zero, we won't reset lofault on
1239 1217 ! returning.
1240 1218 !
1241 1219 or %l6, TRAMP_FLAG, %l6
1242 1220
1243 1221 /*
1244 1222 * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes.
1245 1223 * Also, use of FP registers has been tested to be enabled
1246 1224 */
1247 1225 .do_copy:
1248 1226 FP_NOMIGRATE(6, 7)
1249 1227
1250 1228 rd %fprs, %o2 ! check for unused fp
1251 1229 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
1252 1230 btst FPRS_FEF, %o2
1253 1231 bz,a,pt %icc, .do_blockcopy
1254 1232 wr %g0, FPRS_FEF, %fprs
1255 1233
1256 1234 BST_FPQ1Q3_TOSTACK(%o2)
1257 1235
1258 1236 .do_blockcopy:
1259 1237 rd %gsr, %o2
1260 1238 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr
1261 1239 or %l6, FPUSED_FLAG, %l6
1262 1240
1263 1241 #define REALSRC %i0
1264 1242 #define DST %i1
1265 1243 #define CNT %i2
1266 1244 #define SRC %i3
1267 1245 #define TMP %i5
1268 1246
1269 1247 andcc DST, VIS_BLOCKSIZE - 1, TMP
1270 1248 bz,pt %ncc, 2f
1271 1249 neg TMP
1272 1250 add TMP, VIS_BLOCKSIZE, TMP
1273 1251
1274 1252 ! TMP = bytes required to align DST on FP_BLOCK boundary
1275 1253 ! Using SRC as a tmp here
1276 1254 cmp TMP, 3
1277 1255 bleu,pt %ncc, 1f
1278 1256 sub CNT,TMP,CNT ! adjust main count
1279 1257 sub TMP, 3, TMP ! adjust for end of loop test
1280 1258 .bc_blkalign:
1281 1259 ldub [REALSRC], SRC ! move 4 bytes per loop iteration
1282 1260 stb SRC, [DST]
1283 1261 subcc TMP, 4, TMP
1284 1262 ldub [REALSRC + 1], SRC
1285 1263 add REALSRC, 4, REALSRC
1286 1264 stb SRC, [DST + 1]
1287 1265 ldub [REALSRC - 2], SRC
1288 1266 add DST, 4, DST
1289 1267 stb SRC, [DST - 2]
1290 1268 ldub [REALSRC - 1], SRC
1291 1269 bgu,pt %ncc, .bc_blkalign
1292 1270 stb SRC, [DST - 1]
1293 1271
1294 1272 addcc TMP, 3, TMP ! restore count adjustment
1295 1273 bz,pt %ncc, 2f ! no bytes left?
1296 1274 nop
1297 1275 1: ldub [REALSRC], SRC
1298 1276 inc REALSRC
1299 1277 inc DST
1300 1278 deccc TMP
1301 1279 bgu %ncc, 1b
1302 1280 stb SRC, [DST - 1]
1303 1281
1304 1282 2:
1305 1283 membar #StoreLoad
1306 1284 andn REALSRC, 0x7, SRC
1307 1285
1308 1286 ! SRC - 8-byte aligned
1309 1287 ! DST - 64-byte aligned
1310 1288 ldd [SRC], %f0
1311 1289 prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
1312 1290 alignaddr REALSRC, %g0, %g0
1313 1291 ldd [SRC + 0x08], %f2
1314 1292 prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
1315 1293 faligndata %f0, %f2, %f32
1316 1294 ldd [SRC + 0x10], %f4
1317 1295 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1318 1296 faligndata %f2, %f4, %f34
1319 1297 ldd [SRC + 0x18], %f6
1320 1298 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1321 1299 faligndata %f4, %f6, %f36
1322 1300 ldd [SRC + 0x20], %f8
1323 1301 prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
1324 1302 faligndata %f6, %f8, %f38
1325 1303 ldd [SRC + 0x28], %f10
1326 1304 prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
1327 1305 faligndata %f8, %f10, %f40
1328 1306 ldd [SRC + 0x30], %f12
1329 1307 prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
1330 1308 faligndata %f10, %f12, %f42
1331 1309 ldd [SRC + 0x38], %f14
1332 1310 ldd [SRC + VIS_BLOCKSIZE], %f0
1333 1311 sub CNT, VIS_BLOCKSIZE, CNT
1334 1312 add SRC, VIS_BLOCKSIZE, SRC
1335 1313 prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
1336 1314 add REALSRC, VIS_BLOCKSIZE, REALSRC
1337 1315 ba,pt %ncc, 1f
1338 1316 prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
1339 1317 .align 32
1340 1318 1:
1341 1319 ldd [SRC + 0x08], %f2
1342 1320 faligndata %f12, %f14, %f44
1343 1321 ldd [SRC + 0x10], %f4
1344 1322 faligndata %f14, %f0, %f46
1345 1323 stda %f32, [DST]ASI_BLK_P
1346 1324 ldd [SRC + 0x18], %f6
1347 1325 faligndata %f0, %f2, %f32
1348 1326 ldd [SRC + 0x20], %f8
1349 1327 faligndata %f2, %f4, %f34
1350 1328 ldd [SRC + 0x28], %f10
1351 1329 faligndata %f4, %f6, %f36
1352 1330 ldd [SRC + 0x30], %f12
1353 1331 faligndata %f6, %f8, %f38
1354 1332 sub CNT, VIS_BLOCKSIZE, CNT
1355 1333 ldd [SRC + 0x38], %f14
1356 1334 faligndata %f8, %f10, %f40
1357 1335 add DST, VIS_BLOCKSIZE, DST
1358 1336 ldd [SRC + VIS_BLOCKSIZE], %f0
1359 1337 faligndata %f10, %f12, %f42
1360 1338 add REALSRC, VIS_BLOCKSIZE, REALSRC
1361 1339 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1362 1340 add SRC, VIS_BLOCKSIZE, SRC
1363 1341 prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1364 1342 cmp CNT, VIS_BLOCKSIZE + 8
1365 1343 bgu,pt %ncc, 1b
1366 1344 prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1367 1345
1368 1346 ! only if REALSRC & 0x7 is 0
1369 1347 cmp CNT, VIS_BLOCKSIZE
1370 1348 bne %ncc, 3f
1371 1349 andcc REALSRC, 0x7, %g0
1372 1350 bz,pt %ncc, 2f
1373 1351 nop
1374 1352 3:
1375 1353 faligndata %f12, %f14, %f44
1376 1354 faligndata %f14, %f0, %f46
1377 1355 stda %f32, [DST]ASI_BLK_P
1378 1356 add DST, VIS_BLOCKSIZE, DST
1379 1357 ba,pt %ncc, 3f
1380 1358 nop
1381 1359 2:
1382 1360 ldd [SRC + 0x08], %f2
1383 1361 fsrc1 %f12, %f44
1384 1362 ldd [SRC + 0x10], %f4
1385 1363 fsrc1 %f14, %f46
1386 1364 stda %f32, [DST]ASI_BLK_P
1387 1365 ldd [SRC + 0x18], %f6
1388 1366 fsrc1 %f0, %f32
1389 1367 ldd [SRC + 0x20], %f8
1390 1368 fsrc1 %f2, %f34
1391 1369 ldd [SRC + 0x28], %f10
1392 1370 fsrc1 %f4, %f36
1393 1371 ldd [SRC + 0x30], %f12
1394 1372 fsrc1 %f6, %f38
1395 1373 ldd [SRC + 0x38], %f14
1396 1374 fsrc1 %f8, %f40
1397 1375 sub CNT, VIS_BLOCKSIZE, CNT
1398 1376 add DST, VIS_BLOCKSIZE, DST
1399 1377 add SRC, VIS_BLOCKSIZE, SRC
1400 1378 add REALSRC, VIS_BLOCKSIZE, REALSRC
1401 1379 fsrc1 %f10, %f42
1402 1380 fsrc1 %f12, %f44
1403 1381 fsrc1 %f14, %f46
1404 1382 stda %f32, [DST]ASI_BLK_P
1405 1383 add DST, VIS_BLOCKSIZE, DST
1406 1384 ba,a,pt %ncc, .bcb_exit
1407 1385 nop
1408 1386
1409 1387 3: tst CNT
1410 1388 bz,a,pt %ncc, .bcb_exit
1411 1389 nop
1412 1390
1413 1391 5: ldub [REALSRC], TMP
1414 1392 inc REALSRC
1415 1393 inc DST
1416 1394 deccc CNT
1417 1395 bgu %ncc, 5b
1418 1396 stb TMP, [DST - 1]
1419 1397 .bcb_exit:
1420 1398 membar #Sync
1421 1399
1422 1400 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr
1423 1401 wr %o2, 0, %gsr
1424 1402
1425 1403 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1426 1404 btst FPRS_FEF, %o3
1427 1405 bz,pt %icc, 4f
1428 1406 nop
1429 1407
1430 1408 BLD_FPQ1Q3_FROMSTACK(%o2)
1431 1409
1432 1410 ba,pt %ncc, 2f
1433 1411 wr %o3, 0, %fprs ! restore fprs
1434 1412 4:
1435 1413 FZEROQ1Q3
1436 1414 wr %o3, 0, %fprs ! restore fprs
1437 1415 2:
1438 1416 membar #Sync ! sync error barrier
1439 1417 andn %l6, MASK_FLAGS, %l6
1440 1418 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1441 1419 FP_ALLOWMIGRATE(5, 6)
1442 1420 ret
1443 1421 restore %g0, 0, %o0
1444 1422
1445 1423 SET_SIZE(bcopy_more)
1446 1424
1447 -#endif /* lint */
1448 -
1449 1425 /*
1450 1426 * Block copy with possibly overlapped operands.
1451 1427 */
1452 1428
1453 -#if defined(lint)
1454 -
1455 -/*ARGSUSED*/
1456 -void
1457 -ovbcopy(const void *from, void *to, size_t count)
1458 -{}
1459 -
1460 -#else /* lint */
1461 -
1462 1429 ENTRY(ovbcopy)
1463 1430 tst %o2 ! check count
1464 1431 bgu,a %ncc, 1f ! nothing to do or bad arguments
1465 1432 subcc %o0, %o1, %o3 ! difference of from and to address
1466 1433
1467 1434 retl ! return
1468 1435 nop
1469 1436 1:
1470 1437 bneg,a %ncc, 2f
1471 1438 neg %o3 ! if < 0, make it positive
1472 1439 2: cmp %o2, %o3 ! cmp size and abs(from - to)
1473 1440 bleu %ncc, bcopy ! if size <= abs(diff): use bcopy,
1474 1441 .empty ! no overlap
1475 1442 cmp %o0, %o1 ! compare from and to addresses
1476 1443 blu %ncc, .ov_bkwd ! if from < to, copy backwards
1477 1444 nop
1478 1445 !
1479 1446 ! Copy forwards.
1480 1447 !
1481 1448 .ov_fwd:
1482 1449 ldub [%o0], %o3 ! read from address
1483 1450 inc %o0 ! inc from address
1484 1451 stb %o3, [%o1] ! write to address
1485 1452 deccc %o2 ! dec count
1486 1453 bgu %ncc, .ov_fwd ! loop till done
1487 1454 inc %o1 ! inc to address
1488 1455
1489 1456 retl ! return
1490 1457 nop
1491 1458 !
1492 1459 ! Copy backwards.
1493 1460 !
1494 1461 .ov_bkwd:
1495 1462 deccc %o2 ! dec count
1496 1463 ldub [%o0 + %o2], %o3 ! get byte at end of src
1497 1464 bgu %ncc, .ov_bkwd ! loop till done
1498 1465 stb %o3, [%o1 + %o2] ! delay slot, store at end of dst
1499 1466
1500 1467 retl ! return
1501 1468 nop
1502 1469
1503 1470 SET_SIZE(ovbcopy)
1504 1471
1505 -#endif /* lint */
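
    Reviewer note: the overlap dispatch in ovbcopy above condenses to a few
    lines of C. The following is a minimal sketch of the same decision tree
    (zero count, non-overlapping fast path, backward vs. forward byte loops),
    not the routine itself; the function name is hypothetical and memcpy
    stands in for the tail call to bcopy.

        #include <stddef.h>
        #include <string.h>

        /* hypothetical name; mirrors ovbcopy's dispatch, not its tuning */
        static void
        ovbcopy_sketch(const void *from, void *to, size_t count)
        {
                const char *s = from;
                char *d = to;
                size_t diff = (s > d) ? (size_t)(s - d) : (size_t)(d - s);

                if (count == 0)
                        return;
                if (count <= diff) {        /* size <= abs(from - to): no overlap */
                        memcpy(d, s, count);    /* stands in for the bcopy tail call */
                        return;
                }
                if (s < d) {                /* from < to: copy backwards, end first */
                        while (count-- > 0)
                                d[count] = s[count];
                } else {                    /* otherwise copy forwards */
                        size_t i;
                        for (i = 0; i < count; i++)
                                d[i] = s[i];
                }
        }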
1506 1472
1507 -
1508 1473 /*
1509 1474 * hwblkpagecopy()
1510 1475 *
1511 1476 * Copies exactly one page. This routine assumes the caller (ppcopy)
1512 1477 * has already disabled kernel preemption and has checked
1513 1478 * use_hw_bcopy. Preventing preemption also prevents cpu migration.
1514 1479 */
1515 -#ifdef lint
1516 -/*ARGSUSED*/
1517 -void
1518 -hwblkpagecopy(const void *src, void *dst)
1519 -{ }
1520 -#else /* lint */
1521 1480 ENTRY(hwblkpagecopy)
1522 1481 ! get another window w/space for three aligned blocks of saved fpregs
1523 1482 prefetch [%o0], #n_reads
1524 1483 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1525 1484
1526 1485 ! %i0 - source address (arg)
1527 1486 ! %i1 - destination address (arg)
1528 1487 ! %i2 - length of region (not arg)
1529 1488 ! %l0 - saved fprs
1530 1489 ! %l1 - pointer to saved fpregs
1531 1490
1532 1491 rd %fprs, %l0 ! check for unused fp
1533 1492 btst FPRS_FEF, %l0
1534 1493 bz,a,pt %icc, 1f
1535 1494 wr %g0, FPRS_FEF, %fprs
1536 1495
1537 1496 BST_FPQ1Q3_TOSTACK(%l1)
1538 1497
1539 1498 1: set PAGESIZE, CNT
1540 1499 mov REALSRC, SRC
1541 1500
1542 1501 ldd [SRC], %f0
1543 1502 prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
1544 1503 ldd [SRC + 0x08], %f2
1545 1504 prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
1546 1505 fmovd %f0, %f32
1547 1506 ldd [SRC + 0x10], %f4
1548 1507 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1549 1508 fmovd %f2, %f34
1550 1509 ldd [SRC + 0x18], %f6
1551 1510 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1552 1511 fmovd %f4, %f36
1553 1512 ldd [SRC + 0x20], %f8
1554 1513 prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
1555 1514 fmovd %f6, %f38
1556 1515 ldd [SRC + 0x28], %f10
1557 1516 prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
1558 1517 fmovd %f8, %f40
1559 1518 ldd [SRC + 0x30], %f12
1560 1519 prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
1561 1520 fmovd %f10, %f42
1562 1521 ldd [SRC + 0x38], %f14
1563 1522 ldd [SRC + VIS_BLOCKSIZE], %f0
1564 1523 sub CNT, VIS_BLOCKSIZE, CNT
1565 1524 add SRC, VIS_BLOCKSIZE, SRC
1566 1525 prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
1567 1526 ba,pt %ncc, 2f
1568 1527 prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
1569 1528 .align 32
1570 1529 2:
1571 1530 ldd [SRC + 0x08], %f2
1572 1531 fmovd %f12, %f44
1573 1532 ldd [SRC + 0x10], %f4
1574 1533 fmovd %f14, %f46
1575 1534 stda %f32, [DST]ASI_BLK_P
1576 1535 ldd [SRC + 0x18], %f6
1577 1536 fmovd %f0, %f32
1578 1537 ldd [SRC + 0x20], %f8
1579 1538 fmovd %f2, %f34
1580 1539 ldd [SRC + 0x28], %f10
1581 1540 fmovd %f4, %f36
1582 1541 ldd [SRC + 0x30], %f12
1583 1542 fmovd %f6, %f38
1584 1543 ldd [SRC + 0x38], %f14
1585 1544 fmovd %f8, %f40
1586 1545 ldd [SRC + VIS_BLOCKSIZE], %f0
1587 1546 fmovd %f10, %f42
1588 1547 sub CNT, VIS_BLOCKSIZE, CNT
1589 1548 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1590 1549 add DST, VIS_BLOCKSIZE, DST
1591 1550 prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1592 1551 add SRC, VIS_BLOCKSIZE, SRC
1593 1552 cmp CNT, VIS_BLOCKSIZE + 8
1594 1553 bgu,pt %ncc, 2b
1595 1554 prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1596 1555
1597 1556 ! trailing block
1598 1557 ldd [SRC + 0x08], %f2
1599 1558 fsrc1 %f12, %f44
1600 1559 ldd [SRC + 0x10], %f4
1601 1560 fsrc1 %f14, %f46
1602 1561 stda %f32, [DST]ASI_BLK_P
1603 1562 ldd [SRC + 0x18], %f6
1604 1563 fsrc1 %f0, %f32
1605 1564 ldd [SRC + 0x20], %f8
1606 1565 fsrc1 %f2, %f34
1607 1566 ldd [SRC + 0x28], %f10
1608 1567 fsrc1 %f4, %f36
1609 1568 ldd [SRC + 0x30], %f12
1610 1569 fsrc1 %f6, %f38
1611 1570 ldd [SRC + 0x38], %f14
1612 1571 fsrc1 %f8, %f40
1613 1572 sub CNT, VIS_BLOCKSIZE, CNT
1614 1573 add DST, VIS_BLOCKSIZE, DST
1615 1574 add SRC, VIS_BLOCKSIZE, SRC
1616 1575 fsrc1 %f10, %f42
1617 1576 fsrc1 %f12, %f44
1618 1577 fsrc1 %f14, %f46
1619 1578 stda %f32, [DST]ASI_BLK_P
1620 1579
1621 1580 membar #Sync
1622 1581
1623 1582 btst FPRS_FEF, %l0
1624 1583 bz,pt %icc, 2f
1625 1584 nop
1626 1585
1627 1586 BLD_FPQ1Q3_FROMSTACK(%l3)
1628 1587 ba 3f
1629 1588 nop
1630 1589
1631 1590 2: FZEROQ1Q3
1632 1591
1633 1592 3: wr %l0, 0, %fprs ! restore fprs
1634 1593 ret
1635 1594 restore %g0, 0, %o0
1636 1595
1637 1596 SET_SIZE(hwblkpagecopy)
1638 -#endif /* lint */
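
    Reviewer note: the comment above pins down hwblkpagecopy's caller
    contract. Here is a minimal sketch of a conforming caller, assuming the
    illumos kpreempt_disable()/kpreempt_enable() interfaces; the function
    name ppcopy_sketch and the software fallback are hypothetical stand-ins
    (the real ppcopy has more to it).

        extern void hwblkpagecopy(const void *, void *);
        extern void kpreempt_disable(void);
        extern void kpreempt_enable(void);
        extern int use_hw_bcopy;
        extern void software_pagecopy(const void *, void *);  /* hypothetical */

        static void
        ppcopy_sketch(const void *src_pp, void *dst_pp)
        {
                kpreempt_disable();     /* no preemption => no cpu migration */
                if (use_hw_bcopy)
                        hwblkpagecopy(src_pp, dst_pp);  /* exactly one page */
                else
                        software_pagecopy(src_pp, dst_pp);
                kpreempt_enable();
        }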
1639 1597
1640 1598
1641 1599 /*
1642 1600 * Transfer data to and from user space -
1643 1601 * Note that these routines can cause faults.
1644 1602 * It is assumed that the kernel has nothing at
1645 1603 * less than KERNELBASE in the virtual address space.
1646 1604 *
1647 1605 * Note that copyin(9F) and copyout(9F) are part of the
1648 1606 * DDI/DKI which specifies that they return '-1' on "errors."
1649 1607 *
1650 1608 * Sigh.
1651 1609 *
1652 1610 * So there are two extremely similar routines - xcopyin() and xcopyout()
1653 1611 * which return the errno that we've faithfully computed. This
1654 1612 * allows other callers (e.g. uiomove(9F)) to work correctly.
1655 1613 * Given that these are used pretty heavily, we expand the calling
1656 1614 * sequences inline for all flavours (rather than making wrappers).
1657 1615 *
1658 1616 * There are also stub routines for xcopyout_little and xcopyin_little,
1659 1617 * which currently are intended to handle requests of <= 16 bytes from
1660 1618 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
1661 1619 * is left as an exercise...
1662 1620 */
1663 1621
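    Reviewer note: as a concrete illustration of the two return conventions
    just described, a hedged C sketch follows. The wrapper names are
    hypothetical, and EFAULT's value is hard-coded only to keep the sketch
    self-contained.

        #include <stddef.h>

        #define EFAULT  14      /* assumed value, for self-containment */

        extern int copyout(const void *, void *, size_t);   /* DDI/DKI: 0 or -1 */
        extern int xcopyout(const void *, void *, size_t);  /* 0 or an errno */

        /* a copyout caller must collapse the -1 into a single errno itself */
        static int
        ddi_style_put(const void *kaddr, void *uaddr, size_t count)
        {
                return (copyout(kaddr, uaddr, count) == 0 ? 0 : EFAULT);
        }

        /* a uiomove-style caller keeps whatever errno xcopyout computed */
        static int
        errno_style_put(const void *kaddr, void *uaddr, size_t count)
        {
                return (xcopyout(kaddr, uaddr, count));
        }
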
1664 1622 /*
1665 1623 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
1666 1624 *
1667 1625 * General theory of operation:
1668 1626 *
1669 1627 * The only difference between copy{in,out} and
1670 1628 * xcopy{in,out} is in the error handling routine they invoke
1671 1629 * when a memory access error occurs. xcopyOP returns the errno
1672 1630 * while copyOP returns -1 (see above). copy{in,out}_noerr set
1673 1631 * a special flag (by oring the TRAMP_FLAG into the fault handler address)
1674 1632 * if they are called with a fault handler already in place. That flag
1675 1633 * causes the default handlers to trampoline to the previous handler
1676 1634 * upon an error.
1677 1635 *
1678 1636 * None of the copyops routines grab a window until it's decided that
1679 1637 * we need to do a HW block copy operation. This saves a window
1680 1638 * spill/fill when we're called during socket ops. The typical IO
1681 1639 * path won't cause spill/fill traps.
1682 1640 *
1683 1641 * This code uses a set of 4 limits for the maximum size that will
1684 1642 * be copied given a particular input/output address alignment.
1685 1643 * If the value for a particular limit is zero, the copy will be performed
1686 1644 * by the plain copy loops rather than FPBLK.
1687 1645 *
1688 1646 * See the description of bcopy above for more details of the
1689 1647 * data copying algorithm and the default limits.
1690 1648 *
1691 1649 */
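
    Reviewer note: the TRAMP_FLAG mechanism described above fits in one small
    sketch. The flag rides in a low-order bit of the handler address, which
    is always instruction-aligned, so the bit is otherwise unused. Everything
    below is a stand-in (the struct, the accessor, the flag value); it shows
    only the tagging scheme, not the real trap plumbing.

        #include <stdint.h>

        #define TRAMP_FLAG      0x01    /* assumed low-bit tag, per the text */

        struct cthread { uintptr_t t_lofault; };   /* stand-in thread state */

        /* copyOP_noerr-style setup: remember the old handler and, if one was
         * already installed, tag our default handler so the fault path
         * trampolines to the previous handler on error */
        static uintptr_t
        install_noerr_lofault(struct cthread *t, uintptr_t default_handler)
        {
                uintptr_t prev = t->t_lofault;

                t->t_lofault = (prev != 0) ?
                    (default_handler | TRAMP_FLAG) : default_handler;
                return (prev);  /* kept so the handler can restore/trampoline */
        }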
1692 1650
1693 1651 /*
1694 1652 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
1695 1653 */
1696 1654
1697 -#if defined(lint)
1698 -
1699 -
1700 -#else /* lint */
1701 1655 /*
1702 1656 * We save the arguments in the following registers in case of a fault:
1703 1657 * kaddr - %l1
1704 1658 * uaddr - %l2
1705 1659 * count - %l3
1706 1660 */
1707 1661 #define SAVE_SRC %l1
1708 1662 #define SAVE_DST %l2
1709 1663 #define SAVE_COUNT %l3
1710 1664
1711 1665 #define SM_SAVE_SRC %g4
1712 1666 #define SM_SAVE_DST %g5
1713 1667 #define SM_SAVE_COUNT %o5
1714 1668 #define ERRNO %l5
1715 1669
1716 1670
1717 1671 #define REAL_LOFAULT %l4
1718 1672 /*
1719 1673 * Generic copyio fault handler. This is the first line of defense when a
1720 1674 * fault occurs in (x)copyin/(x)copyout. In order for this to function
1721 1675 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
1722 1676 * This allows us to share common code for all the flavors of the copy
1723 1677 * operations, including the _noerr versions.
1724 1678 *
1725 1679 * Note that this function will restore the original input parameters before
1726 1680 * calling REAL_LOFAULT. So the real handler can vector to the appropriate
1727 1681 * member of the t_copyop structure, if needed.
1728 1682 */
1729 1683 ENTRY(copyio_fault)
1730 1684 membar #Sync
1731 1685 mov %g1,ERRNO ! save errno in ERRNO
1732 1686 btst FPUSED_FLAG, %l6
1733 1687 bz %ncc, 1f
1734 1688 nop
1735 1689
1736 1690 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
1737 1691 wr %o2, 0, %gsr ! restore gsr
1738 1692
1739 1693 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1740 1694 btst FPRS_FEF, %o3
1741 1695 bz,pt %icc, 4f
1742 1696 nop
1743 1697
1744 1698 BLD_FPQ2Q4_FROMSTACK(%o2)
1745 1699
1746 1700 ba,pt %ncc, 1f
1747 1701 wr %o3, 0, %fprs ! restore fprs
1748 1702
1749 1703 4:
1750 1704 FZEROQ2Q4
1751 1705 wr %o3, 0, %fprs ! restore fprs
1752 1706
1753 1707 1:
1754 1708 andn %l6, FPUSED_FLAG, %l6
1755 1709 membar #Sync
1756 1710 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1757 1711 FP_ALLOWMIGRATE(5, 6)
1758 1712
1759 1713 mov SAVE_SRC, %i0
1760 1714 mov SAVE_DST, %i1
1761 1715 jmp REAL_LOFAULT
1762 1716 mov SAVE_COUNT, %i2
1763 1717
1764 1718 SET_SIZE(copyio_fault)
1765 1719
1766 1720
1767 -#endif
1768 -
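    Reviewer note: restated as C-like pseudocode, copyio_fault does the
    following. All names are stand-ins for the registers called out in the
    comments (%l6, %l4, and the SAVE_* registers), and the FP restore detail
    is compressed into a single hypothetical call.

        #include <stdint.h>
        #include <stddef.h>

        #define FPUSED_FLAG     0x01            /* assumed tag bit, as above */

        struct copy_frame {                     /* stand-ins for saved regs */
                uintptr_t lofault_flags;        /* %l6: old t_lofault | flag */
                void (*real_lofault)(const void *, void *, size_t);  /* %l4 */
                const void *save_src;           /* SAVE_SRC   (%l1) */
                void *save_dst;                 /* SAVE_DST   (%l2) */
                size_t save_count;              /* SAVE_COUNT (%l3) */
        };

        extern void restore_fp_state(void);     /* %gsr, %fprs, FP quadrants */
        extern void set_t_lofault(uintptr_t);

        static void
        copyio_fault_sketch(struct copy_frame *cf)
        {
                if (cf->lofault_flags & FPUSED_FLAG)
                        restore_fp_state();
                set_t_lofault(cf->lofault_flags & ~(uintptr_t)FPUSED_FLAG);

                /* hand the ORIGINAL arguments back, so the real handler can
                 * vector to the matching t_copyop member */
                cf->real_lofault(cf->save_src, cf->save_dst, cf->save_count);
        }
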
1769 -#if defined(lint)
1770 -
1771 -/*ARGSUSED*/
1772 -int
1773 -copyout(const void *kaddr, void *uaddr, size_t count)
1774 -{ return (0); }
1775 -
1776 -#else /* lint */
1777 -
1778 1721 ENTRY(copyout)
1779 1722
1780 1723 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
1781 1724 bleu,pt %ncc, .copyout_small ! go to larger cases
1782 1725 xor %o0, %o1, %o3 ! are src, dst alignable?
1783 1726 btst 7, %o3 !
1784 1727 bz,pt %ncc, .copyout_8 ! check for longword alignment
1785 1728 nop
1786 1729 btst 1, %o3 !
1787 1730 bz,pt %ncc, .copyout_2 ! check for half-word
1788 1731 nop
1789 1732 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
1790 1733 ld [%o3 + %lo(hw_copy_limit_1)], %o3
1791 1734 tst %o3
1792 1735 bz,pn %icc, .copyout_small ! if zero, disable HW copy
1793 1736 cmp %o2, %o3 ! if length <= limit
1794 1737 bleu,pt %ncc, .copyout_small ! go to small copy
1795 1738 nop
1796 1739 ba,pt %ncc, .copyout_more ! otherwise go to large copy
1797 1740 nop
1798 1741 .copyout_2:
1799 1742 btst 3, %o3 !
1800 1743 bz,pt %ncc, .copyout_4 ! check for word alignment
1801 1744 nop
1802 1745 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
1803 1746 ld [%o3 + %lo(hw_copy_limit_2)], %o3
1804 1747 tst %o3
1805 1748 bz,pn %icc, .copyout_small ! if zero, disable HW copy
1806 1749 cmp %o2, %o3 ! if length <= limit
1807 1750 bleu,pt %ncc, .copyout_small ! go to small copy
1808 1751 nop
1809 1752 ba,pt %ncc, .copyout_more ! otherwise go to large copy
1810 1753 nop
1811 1754 .copyout_4:
1812 1755 ! already checked longword, must be word aligned
1813 1756 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
1814 1757 ld [%o3 + %lo(hw_copy_limit_4)], %o3
1815 1758 tst %o3
1816 1759 bz,pn %icc, .copyout_small ! if zero, disable HW copy
1817 1760 cmp %o2, %o3 ! if length <= limit
1818 1761 bleu,pt %ncc, .copyout_small ! go to small copy
1819 1762 nop
1820 1763 ba,pt %ncc, .copyout_more ! otherwise go to large copy
1821 1764 nop
1822 1765 .copyout_8:
1823 1766 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
1824 1767 ld [%o3 + %lo(hw_copy_limit_8)], %o3
1825 1768 tst %o3
1826 1769 bz,pn %icc, .copyout_small ! if zero, disable HW copy
1827 1770 cmp %o2, %o3 ! if length <= limit
1828 1771 bleu,pt %ncc, .copyout_small ! go to small copy
1829 1772 nop
1830 1773 ba,pt %ncc, .copyout_more ! otherwise go to large copy
1831 1774 nop
1832 1775
1833 1776 .align 16
1834 1777 nop ! instruction alignment
1835 1778 ! see discussion at start of file
1836 1779 .copyout_small:
1837 1780 sethi %hi(.sm_copyout_err), %o5 ! .sm_copyout_err is lofault
1838 1781 or %o5, %lo(.sm_copyout_err), %o5
1839 1782 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler
1840 1783 membar #Sync ! sync error barrier
1841 1784 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault
1842 1785 .sm_do_copyout:
1843 1786 mov %o0, SM_SAVE_SRC
1844 1787 mov %o1, SM_SAVE_DST
1845 1788 cmp %o2, SHORTCOPY ! check for really short case
1846 1789 bleu,pt %ncc, .co_sm_left !
1847 1790 mov %o2, SM_SAVE_COUNT
1848 1791 cmp %o2, CHKSIZE ! check for medium length cases
1849 1792 bgu,pn %ncc, .co_med !
1850 1793 or %o0, %o1, %o3 ! prepare alignment check
1851 1794 andcc %o3, 0x3, %g0 ! test for alignment
1852 1795 bz,pt %ncc, .co_sm_word ! branch to word aligned case
1853 1796 .co_sm_movebytes:
1854 1797 sub %o2, 3, %o2 ! adjust count to allow cc zero test
1855 1798 .co_sm_notalign4:
1856 1799 ldub [%o0], %o3 ! read byte
1857 1800 subcc %o2, 4, %o2 ! reduce count by 4
1858 1801 stba %o3, [%o1]ASI_USER ! write byte
1859 1802 inc %o1 ! advance DST by 1
1860 1803 ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes
1861 1804 add %o0, 4, %o0 ! advance SRC by 4
1862 1805 stba %o3, [%o1]ASI_USER
1863 1806 inc %o1 ! advance DST by 1
1864 1807 ldub [%o0 - 2], %o3
1865 1808 stba %o3, [%o1]ASI_USER
1866 1809 inc %o1 ! advance DST by 1
1867 1810 ldub [%o0 - 1], %o3
1868 1811 stba %o3, [%o1]ASI_USER
1869 1812 bgt,pt %ncc, .co_sm_notalign4 ! loop til 3 or fewer bytes remain
1870 1813 inc %o1 ! advance DST by 1
1871 1814 add %o2, 3, %o2 ! restore count
1872 1815 .co_sm_left:
1873 1816 tst %o2
1874 1817 bz,pt %ncc, .co_sm_exit ! check for zero length
1875 1818 nop
1876 1819 ldub [%o0], %o3 ! load one byte
1877 1820 deccc %o2 ! reduce count for cc test
1878 1821 bz,pt %ncc, .co_sm_exit
1879 1822 stba %o3,[%o1]ASI_USER ! store one byte
1880 1823 ldub [%o0 + 1], %o3 ! load second byte
1881 1824 deccc %o2
1882 1825 inc %o1
1883 1826 bz,pt %ncc, .co_sm_exit
1884 1827 stba %o3,[%o1]ASI_USER ! store second byte
1885 1828 ldub [%o0 + 2], %o3 ! load third byte
1886 1829 inc %o1
1887 1830 stba %o3,[%o1]ASI_USER ! store third byte
1888 1831 membar #Sync ! sync error barrier
1889 1832 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1890 1833 retl
1891 1834 mov %g0, %o0 ! return 0
1892 1835 .align 16
1893 1836 .co_sm_words:
1894 1837 lduw [%o0], %o3 ! read word
1895 1838 .co_sm_wordx:
1896 1839 subcc %o2, 8, %o2 ! update count
1897 1840 stwa %o3, [%o1]ASI_USER ! write word
1898 1841 add %o0, 8, %o0 ! update SRC
1899 1842 lduw [%o0 - 4], %o3 ! read word
1900 1843 add %o1, 4, %o1 ! update DST
1901 1844 stwa %o3, [%o1]ASI_USER ! write word
1902 1845 bgt,pt %ncc, .co_sm_words ! loop til done
1903 1846 add %o1, 4, %o1 ! update DST
1904 1847 addcc %o2, 7, %o2 ! restore count
1905 1848 bz,pt %ncc, .co_sm_exit
1906 1849 nop
1907 1850 deccc %o2
1908 1851 bz,pt %ncc, .co_sm_byte
1909 1852 .co_sm_half:
1910 1853 subcc %o2, 2, %o2 ! reduce count by 2
1911 1854 lduh [%o0], %o3 ! read half word
1912 1855 add %o0, 2, %o0 ! advance SRC by 2
1913 1856 stha %o3, [%o1]ASI_USER ! write half word
1914 1857 bgt,pt %ncc, .co_sm_half ! loop til done
1915 1858 add %o1, 2, %o1 ! advance DST by 2
1916 1859 addcc %o2, 1, %o2 ! restore count
1917 1860 bz,pt %ncc, .co_sm_exit
1918 1861 nop
1919 1862 .co_sm_byte:
1920 1863 ldub [%o0], %o3
1921 1864 stba %o3, [%o1]ASI_USER
1922 1865 membar #Sync ! sync error barrier
1923 1866 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1924 1867 retl
1925 1868 mov %g0, %o0 ! return 0
1926 1869 .align 16
1927 1870 .co_sm_word:
1928 1871 subcc %o2, 4, %o2 ! update count
1929 1872 bgt,pt %ncc, .co_sm_wordx
1930 1873 lduw [%o0], %o3 ! read word
1931 1874 addcc %o2, 3, %o2 ! restore count
1932 1875 bz,pt %ncc, .co_sm_exit
1933 1876 stwa %o3, [%o1]ASI_USER ! write word
1934 1877 deccc %o2 ! reduce count for cc test
1935 1878 ldub [%o0 + 4], %o3 ! load one byte
1936 1879 add %o1, 4, %o1
1937 1880 bz,pt %ncc, .co_sm_exit
1938 1881 stba %o3, [%o1]ASI_USER ! store one byte
1939 1882 ldub [%o0 + 5], %o3 ! load second byte
1940 1883 deccc %o2
1941 1884 inc %o1
1942 1885 bz,pt %ncc, .co_sm_exit
1943 1886 stba %o3, [%o1]ASI_USER ! store second byte
1944 1887 ldub [%o0 + 6], %o3 ! load third byte
1945 1888 inc %o1
1946 1889 stba %o3, [%o1]ASI_USER ! store third byte
1947 1890 .co_sm_exit:
1948 1891 membar #Sync ! sync error barrier
1949 1892 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
1950 1893 retl
1951 1894 mov %g0, %o0 ! return 0
1952 1895
1953 1896 .align 16
1954 1897 .co_med:
1955 1898 xor %o0, %o1, %o3 ! setup alignment check
1956 1899 btst 1, %o3
1957 1900 bnz,pt %ncc, .co_sm_movebytes ! unaligned
1958 1901 nop
1959 1902 btst 3, %o3
1960 1903 bnz,pt %ncc, .co_med_half ! halfword aligned
1961 1904 nop
1962 1905 btst 7, %o3
1963 1906 bnz,pt %ncc, .co_med_word ! word aligned
1964 1907 nop
1965 1908 .co_med_long:
1966 1909 btst 3, %o0 ! check for
1967 1910 bz,pt %ncc, .co_med_long1 ! word alignment
1968 1911 nop
1969 1912 .co_med_long0:
1970 1913 ldub [%o0], %o3 ! load one byte
1971 1914 inc %o0
1972 1915 stba %o3,[%o1]ASI_USER ! store byte
1973 1916 inc %o1
1974 1917 btst 3, %o0
1975 1918 bnz,pt %ncc, .co_med_long0
1976 1919 dec %o2
1977 1920 .co_med_long1: ! word aligned
1978 1921 btst 7, %o0 ! check for long word
1979 1922 bz,pt %ncc, .co_med_long2
1980 1923 nop
1981 1924 lduw [%o0], %o3 ! load word
1982 1925 add %o0, 4, %o0 ! advance SRC by 4
1983 1926 stwa %o3, [%o1]ASI_USER ! store word
1984 1927 add %o1, 4, %o1 ! advance DST by 4
1985 1928 sub %o2, 4, %o2 ! reduce count by 4
1986 1929 !
1987 1930 ! Now long word aligned and have at least 32 bytes to move
1988 1931 !
1989 1932 .co_med_long2:
1990 1933 sub %o2, 31, %o2 ! adjust count to allow cc zero test
1991 1934 sub %o1, 8, %o1 ! adjust pointer to allow store in
1992 1935 ! branch delay slot instead of add
1993 1936 .co_med_lmove:
1994 1937 add %o1, 8, %o1 ! advance DST by 8
1995 1938 ldx [%o0], %o3 ! read long word
1996 1939 subcc %o2, 32, %o2 ! reduce count by 32
1997 1940 stxa %o3, [%o1]ASI_USER ! write long word
1998 1941 add %o1, 8, %o1 ! advance DST by 8
1999 1942 ldx [%o0 + 8], %o3 ! repeat for a total of 4 long words
2000 1943 add %o0, 32, %o0 ! advance SRC by 32
2001 1944 stxa %o3, [%o1]ASI_USER
2002 1945 ldx [%o0 - 16], %o3
2003 1946 add %o1, 8, %o1 ! advance DST by 8
2004 1947 stxa %o3, [%o1]ASI_USER
2005 1948 ldx [%o0 - 8], %o3
2006 1949 add %o1, 8, %o1 ! advance DST by 8
2007 1950 bgt,pt %ncc, .co_med_lmove ! loop til 31 or fewer bytes left
2008 1951 stxa %o3, [%o1]ASI_USER
2009 1952 add %o1, 8, %o1 ! advance DST by 8
2010 1953 addcc %o2, 24, %o2 ! restore count to long word offset
2011 1954 ble,pt %ncc, .co_med_lextra ! check for more long words to move
2012 1955 nop
2013 1956 .co_med_lword:
2014 1957 ldx [%o0], %o3 ! read long word
2015 1958 subcc %o2, 8, %o2 ! reduce count by 8
2016 1959 stxa %o3, [%o1]ASI_USER ! write long word
2017 1960 add %o0, 8, %o0 ! advance SRC by 8
2018 1961 bgt,pt %ncc, .co_med_lword ! loop til 7 or fewer bytes left
2019 1962 add %o1, 8, %o1 ! advance DST by 8
2020 1963 .co_med_lextra:
2021 1964 addcc %o2, 7, %o2 ! restore rest of count
2022 1965 bz,pt %ncc, .co_sm_exit ! if zero, then done
2023 1966 deccc %o2
2024 1967 bz,pt %ncc, .co_sm_byte
2025 1968 nop
2026 1969 ba,pt %ncc, .co_sm_half
2027 1970 nop
2028 1971
2029 1972 .align 16
2030 1973 nop ! instruction alignment
2031 1974 ! see discussion at start of file
2032 1975 .co_med_word:
2033 1976 btst 3, %o0 ! check for
2034 1977 bz,pt %ncc, .co_med_word1 ! word alignment
2035 1978 nop
2036 1979 .co_med_word0:
2037 1980 ldub [%o0], %o3 ! load one byte
2038 1981 inc %o0
2039 1982 stba %o3,[%o1]ASI_USER ! store byte
2040 1983 inc %o1
2041 1984 btst 3, %o0
2042 1985 bnz,pt %ncc, .co_med_word0
2043 1986 dec %o2
2044 1987 !
2045 1988 ! Now word aligned and have at least 36 bytes to move
2046 1989 !
2047 1990 .co_med_word1:
2048 1991 sub %o2, 15, %o2 ! adjust count to allow cc zero test
2049 1992 .co_med_wmove:
2050 1993 lduw [%o0], %o3 ! read word
2051 1994 subcc %o2, 16, %o2 ! reduce count by 16
2052 1995 stwa %o3, [%o1]ASI_USER ! write word
2053 1996 add %o1, 4, %o1 ! advance DST by 4
2054 1997 lduw [%o0 + 4], %o3 ! repeat for a total of 4 words
2055 1998 add %o0, 16, %o0 ! advance SRC by 16
2056 1999 stwa %o3, [%o1]ASI_USER
2057 2000 add %o1, 4, %o1 ! advance DST by 4
2058 2001 lduw [%o0 - 8], %o3
2059 2002 stwa %o3, [%o1]ASI_USER
2060 2003 add %o1, 4, %o1 ! advance DST by 4
2061 2004 lduw [%o0 - 4], %o3
2062 2005 stwa %o3, [%o1]ASI_USER
2063 2006 bgt,pt %ncc, .co_med_wmove ! loop til 15 or fewer bytes left
2064 2007 add %o1, 4, %o1 ! advance DST by 4
2065 2008 addcc %o2, 12, %o2 ! restore count to word offset
2066 2009 ble,pt %ncc, .co_med_wextra ! check for more words to move
2067 2010 nop
2068 2011 .co_med_word2:
2069 2012 lduw [%o0], %o3 ! read word
2070 2013 subcc %o2, 4, %o2 ! reduce count by 4
2071 2014 stwa %o3, [%o1]ASI_USER ! write word
2072 2015 add %o0, 4, %o0 ! advance SRC by 4
2073 2016 bgt,pt %ncc, .co_med_word2 ! loop til 3 or fewer bytes left
2074 2017 add %o1, 4, %o1 ! advance DST by 4
2075 2018 .co_med_wextra:
2076 2019 addcc %o2, 3, %o2 ! restore rest of count
2077 2020 bz,pt %ncc, .co_sm_exit ! if zero, then done
2078 2021 deccc %o2
2079 2022 bz,pt %ncc, .co_sm_byte
2080 2023 nop
2081 2024 ba,pt %ncc, .co_sm_half
2082 2025 nop
2083 2026
2084 2027 .align 16
2085 2028 nop ! instruction alignment
2086 2029 nop ! see discussion at start of file
2087 2030 nop
2088 2031 .co_med_half:
2089 2032 btst 1, %o0 ! check for
2090 2033 bz,pt %ncc, .co_med_half1 ! half word alignment
2091 2034 nop
2092 2035 ldub [%o0], %o3 ! load one byte
2093 2036 inc %o0
2094 2037 stba %o3,[%o1]ASI_USER ! store byte
2095 2038 inc %o1
2096 2039 dec %o2
2097 2040 !
2098 2041 ! Now half word aligned and have at least 38 bytes to move
2099 2042 !
2100 2043 .co_med_half1:
2101 2044 sub %o2, 7, %o2 ! adjust count to allow cc zero test
2102 2045 .co_med_hmove:
2103 2046 lduh [%o0], %o3 ! read half word
2104 2047 subcc %o2, 8, %o2 ! reduce count by 8
2105 2048 stha %o3, [%o1]ASI_USER ! write half word
2106 2049 add %o1, 2, %o1 ! advance DST by 2
2107 2050 lduh [%o0 + 2], %o3 ! repeat for a total of 4 halfwords
2108 2051 add %o0, 8, %o0 ! advance SRC by 8
2109 2052 stha %o3, [%o1]ASI_USER
2110 2053 add %o1, 2, %o1 ! advance DST by 2
2111 2054 lduh [%o0 - 4], %o3
2112 2055 stha %o3, [%o1]ASI_USER
2113 2056 add %o1, 2, %o1 ! advance DST by 2
2114 2057 lduh [%o0 - 2], %o3
2115 2058 stha %o3, [%o1]ASI_USER
2116 2059 bgt,pt %ncc, .co_med_hmove ! loop til 7 or fewer bytes left
2117 2060 add %o1, 2, %o1 ! advance DST by 2
2118 2061 addcc %o2, 7, %o2 ! restore count
2119 2062 bz,pt %ncc, .co_sm_exit
2120 2063 deccc %o2
2121 2064 bz,pt %ncc, .co_sm_byte
2122 2065 nop
2123 2066 ba,pt %ncc, .co_sm_half
2124 2067 nop
2125 2068
2126 2069 /*
2127 2070 * We got here because of a fault during short copyout.
2128 2071 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
2129 2072 */
2130 2073 .sm_copyout_err:
2131 2074 membar #Sync
2132 2075 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2133 2076 mov SM_SAVE_SRC, %o0
2134 2077 mov SM_SAVE_DST, %o1
2135 2078 mov SM_SAVE_COUNT, %o2
2136 2079 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler
2137 2080 tst %o3
2138 2081 bz,pt %ncc, 3f ! if not, return error
2139 2082 nop
2140 2083 ldn [%o3 + CP_COPYOUT], %o5 ! if handler, invoke it with
2141 2084 jmp %o5 ! original arguments
2142 2085 nop
2143 2086 3:
2144 2087 retl
2145 2088 or %g0, -1, %o0 ! return error value
2146 2089
2147 2090 SET_SIZE(copyout)
2148 2091
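    Reviewer note: the .sm_copyout_err path above reduces to a small
    dispatch, sketched here with stand-in types for the thread and copyops
    structures; only cp_copyout/t_copyops correspond to the real offsets
    (CP_COPYOUT, T_COPYOPS) used by the assembly.

        #include <stddef.h>

        struct copyops { int (*cp_copyout)(const void *, void *, size_t); };
        struct cthread { struct copyops *t_copyops; };

        /* sketch: t_lofault has already been restored by this point */
        static int
        sm_copyout_err_sketch(struct cthread *t,
            const void *kaddr, void *uaddr, size_t count)
        {
                if (t->t_copyops != NULL)       /* installed copyop handler? */
                        return (t->t_copyops->cp_copyout(kaddr, uaddr, count));
                return (-1);                    /* the DDI/DKI error value */
        }
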
2149 2092 /*
2150 2093 * The _more entry points are not intended to be used directly by
2151 2094 * any caller from outside this file. They are provided to allow
2152 2095 * profiling and DTrace of the portions of the copy code that use
2153 2096 * the floating point registers.
2154 2097 * This entry is particularly important as DTrace (at least as of
2155 2098 * 4/2004) does not support leaf functions.
2156 2099 */
2157 2100
2158 2101 ENTRY(copyout_more)
2159 2102 .copyout_more:
2160 2103 prefetch [%o0], #n_reads
2161 2104 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2162 2105 set .copyout_err, REAL_LOFAULT
2163 2106
2164 2107 /*
2165 2108 * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
2166 2109 */
2167 2110 .do_copyout:
2168 2111 set copyio_fault, %l7 ! copyio_fault is lofault val
2169 2112
2170 2113 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler
2171 2114 membar #Sync ! sync error barrier
2172 2115 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
2173 2116
2174 2117 mov %i0, SAVE_SRC
2175 2118 mov %i1, SAVE_DST
2176 2119 mov %i2, SAVE_COUNT
2177 2120
2178 2121 FP_NOMIGRATE(6, 7)
2179 2122
2180 2123 rd %fprs, %o2 ! check for unused fp
2181 2124 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
2182 2125 btst FPRS_FEF, %o2
2183 2126 bz,a,pt %icc, .do_blockcopyout
2184 2127 wr %g0, FPRS_FEF, %fprs
2185 2128
2186 2129 BST_FPQ2Q4_TOSTACK(%o2)
2187 2130
2188 2131 .do_blockcopyout:
2189 2132 rd %gsr, %o2
2190 2133 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr
2191 2134 or %l6, FPUSED_FLAG, %l6
2192 2135
2193 2136 andcc DST, VIS_BLOCKSIZE - 1, TMP
2194 2137 mov ASI_USER, %asi
2195 2138 bz,pt %ncc, 2f
2196 2139 neg TMP
2197 2140 add TMP, VIS_BLOCKSIZE, TMP
2198 2141
2199 2142 ! TMP = bytes required to align DST on FP_BLOCK boundary
2200 2143 ! Using SRC as a tmp here
2201 2144 cmp TMP, 3
2202 2145 bleu,pt %ncc, 1f
2203 2146 sub CNT,TMP,CNT ! adjust main count
2204 2147 sub TMP, 3, TMP ! adjust for end of loop test
2205 2148 .co_blkalign:
2206 2149 ldub [REALSRC], SRC ! move 4 bytes per loop iteration
2207 2150 stba SRC, [DST]%asi
2208 2151 subcc TMP, 4, TMP
2209 2152 ldub [REALSRC + 1], SRC
2210 2153 add REALSRC, 4, REALSRC
2211 2154 stba SRC, [DST + 1]%asi
2212 2155 ldub [REALSRC - 2], SRC
2213 2156 add DST, 4, DST
2214 2157 stba SRC, [DST - 2]%asi
2215 2158 ldub [REALSRC - 1], SRC
2216 2159 bgu,pt %ncc, .co_blkalign
2217 2160 stba SRC, [DST - 1]%asi
2218 2161
2219 2162 addcc TMP, 3, TMP ! restore count adjustment
2220 2163 bz,pt %ncc, 2f ! no bytes left?
2221 2164 nop
2222 2165 1: ldub [REALSRC], SRC
2223 2166 inc REALSRC
2224 2167 inc DST
2225 2168 deccc TMP
2226 2169 bgu %ncc, 1b
2227 2170 stba SRC, [DST - 1]%asi
2228 2171
2229 2172 2:
2230 2173 membar #StoreLoad
2231 2174 andn REALSRC, 0x7, SRC
2232 2175
2233 2176 ! SRC - 8-byte aligned
2234 2177 ! DST - 64-byte aligned
2235 2178 ldd [SRC], %f16
2236 2179 prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
2237 2180 alignaddr REALSRC, %g0, %g0
2238 2181 ldd [SRC + 0x08], %f18
2239 2182 prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
2240 2183 faligndata %f16, %f18, %f48
2241 2184 ldd [SRC + 0x10], %f20
2242 2185 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
2243 2186 faligndata %f18, %f20, %f50
2244 2187 ldd [SRC + 0x18], %f22
2245 2188 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
2246 2189 faligndata %f20, %f22, %f52
2247 2190 ldd [SRC + 0x20], %f24
2248 2191 prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
2249 2192 faligndata %f22, %f24, %f54
2250 2193 ldd [SRC + 0x28], %f26
2251 2194 prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
2252 2195 faligndata %f24, %f26, %f56
2253 2196 ldd [SRC + 0x30], %f28
2254 2197 prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
2255 2198 faligndata %f26, %f28, %f58
2256 2199 ldd [SRC + 0x38], %f30
2257 2200 ldd [SRC + VIS_BLOCKSIZE], %f16
2258 2201 sub CNT, VIS_BLOCKSIZE, CNT
2259 2202 add SRC, VIS_BLOCKSIZE, SRC
2260 2203 prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
2261 2204 add REALSRC, VIS_BLOCKSIZE, REALSRC
2262 2205 ba,pt %ncc, 1f
2263 2206 prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
2264 2207 .align 32
2265 2208 1:
2266 2209 ldd [SRC + 0x08], %f18
2267 2210 faligndata %f28, %f30, %f60
2268 2211 ldd [SRC + 0x10], %f20
2269 2212 faligndata %f30, %f16, %f62
2270 2213 stda %f48, [DST]ASI_BLK_AIUS
2271 2214 ldd [SRC + 0x18], %f22
2272 2215 faligndata %f16, %f18, %f48
2273 2216 ldd [SRC + 0x20], %f24
2274 2217 faligndata %f18, %f20, %f50
2275 2218 ldd [SRC + 0x28], %f26
2276 2219 faligndata %f20, %f22, %f52
2277 2220 ldd [SRC + 0x30], %f28
2278 2221 faligndata %f22, %f24, %f54
2279 2222 sub CNT, VIS_BLOCKSIZE, CNT
2280 2223 ldd [SRC + 0x38], %f30
2281 2224 faligndata %f24, %f26, %f56
2282 2225 add DST, VIS_BLOCKSIZE, DST
2283 2226 ldd [SRC + VIS_BLOCKSIZE], %f16
2284 2227 faligndata %f26, %f28, %f58
2285 2228 add REALSRC, VIS_BLOCKSIZE, REALSRC
2286 2229 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
2287 2230 add SRC, VIS_BLOCKSIZE, SRC
2288 2231 prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
2289 2232 cmp CNT, VIS_BLOCKSIZE + 8
2290 2233 bgu,pt %ncc, 1b
2291 2234 prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
2292 2235
2293 2236 ! only if REALSRC & 0x7 is 0
2294 2237 cmp CNT, VIS_BLOCKSIZE
2295 2238 bne %ncc, 3f
2296 2239 andcc REALSRC, 0x7, %g0
2297 2240 bz,pt %ncc, 2f
2298 2241 nop
2299 2242 3:
2300 2243 faligndata %f28, %f30, %f60
2301 2244 faligndata %f30, %f16, %f62
2302 2245 stda %f48, [DST]ASI_BLK_AIUS
2303 2246 add DST, VIS_BLOCKSIZE, DST
2304 2247 ba,pt %ncc, 3f
2305 2248 nop
2306 2249 2:
2307 2250 ldd [SRC + 0x08], %f18
2308 2251 fsrc1 %f28, %f60
2309 2252 ldd [SRC + 0x10], %f20
2310 2253 fsrc1 %f30, %f62
2311 2254 stda %f48, [DST]ASI_BLK_AIUS
2312 2255 ldd [SRC + 0x18], %f22
2313 2256 fsrc1 %f16, %f48
2314 2257 ldd [SRC + 0x20], %f24
2315 2258 fsrc1 %f18, %f50
2316 2259 ldd [SRC + 0x28], %f26
2317 2260 fsrc1 %f20, %f52
2318 2261 ldd [SRC + 0x30], %f28
2319 2262 fsrc1 %f22, %f54
2320 2263 ldd [SRC + 0x38], %f30
2321 2264 fsrc1 %f24, %f56
2322 2265 sub CNT, VIS_BLOCKSIZE, CNT
2323 2266 add DST, VIS_BLOCKSIZE, DST
2324 2267 add SRC, VIS_BLOCKSIZE, SRC
2325 2268 add REALSRC, VIS_BLOCKSIZE, REALSRC
2326 2269 fsrc1 %f26, %f58
2327 2270 fsrc1 %f28, %f60
2328 2271 fsrc1 %f30, %f62
2329 2272 stda %f48, [DST]ASI_BLK_AIUS
2330 2273 add DST, VIS_BLOCKSIZE, DST
2331 2274 ba,a,pt %ncc, 4f
2332 2275 nop
2333 2276
2334 2277 3: tst CNT
2335 2278 bz,a %ncc, 4f
2336 2279 nop
2337 2280
2338 2281 5: ldub [REALSRC], TMP
2339 2282 inc REALSRC
2340 2283 inc DST
2341 2284 deccc CNT
2342 2285 bgu %ncc, 5b
2343 2286 stba TMP, [DST - 1]%asi
2344 2287 4:
2345 2288
2346 2289 .copyout_exit:
2347 2290 membar #Sync
2348 2291
2349 2292 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
2350 2293 wr %o2, 0, %gsr ! restore gsr
2351 2294
2352 2295 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
2353 2296 btst FPRS_FEF, %o3
2354 2297 bz,pt %icc, 4f
2355 2298 nop
2356 2299
2357 2300 BLD_FPQ2Q4_FROMSTACK(%o2)
2358 2301
2359 2302 ba,pt %ncc, 1f
2360 2303 wr %o3, 0, %fprs ! restore fprs
2361 2304
2362 2305 4:
2363 2306 FZEROQ2Q4
2364 2307 wr %o3, 0, %fprs ! restore fprs
2365 2308
2366 2309 1:
2367 2310 membar #Sync
2368 2311 andn %l6, FPUSED_FLAG, %l6
2369 2312 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2370 2313 FP_ALLOWMIGRATE(5, 6)
2371 2314 ret
2372 2315 restore %g0, 0, %o0
2373 2316
2374 2317 /*
2375 2318 * We got here because of a fault during copyout.
2376 2319 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
2377 2320 */
2378 2321 .copyout_err:
2379 2322 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
2380 2323 tst %o4
2381 2324 bz,pt %ncc, 2f ! if not, return error
2382 2325 nop
2383 2326 ldn [%o4 + CP_COPYOUT], %g2 ! if handler, invoke it with
2384 2327 jmp %g2 ! original arguments
2385 2328 restore %g0, 0, %g0 ! dispose of copy window
2386 2329 2:
2387 2330 ret
2388 2331 restore %g0, -1, %o0 ! return error value
2389 2332
2390 2333
2391 2334 SET_SIZE(copyout_more)
2392 2335
2393 -#endif /* lint */
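
    Reviewer note: copyout_more (like bcopy_more and copyin_more) brackets
    the block copy with the same %fprs discipline: save the caller's FP
    quadrants if the FPU was live, otherwise zero them on exit, presumably
    so copy data does not linger in user-visible registers. A hedged sketch,
    with every accessor a hypothetical stand-in for the macros named in the
    comments:

        #include <stdint.h>

        #define FPRS_FEF        0x4     /* assumed bit value for the sketch */

        extern uint32_t rd_fprs(void);          /* hypothetical accessors */
        extern void wr_fprs(uint32_t);
        extern void save_fp_quadrants(void *);      /* BST_FPQ2Q4_TOSTACK */
        extern void restore_fp_quadrants(void *);   /* BLD_FPQ2Q4_FROMSTACK */
        extern void zero_fp_quadrants(void);        /* FZEROQ2Q4 */
        extern void fp_block_copy_body(void);

        static void
        copy_more_fp_discipline(void *savearea)
        {
                uint32_t fprs = rd_fprs();  /* remember caller's FP state */

                if (fprs & FPRS_FEF)
                        save_fp_quadrants(savearea);  /* caller had live FP */
                else
                        wr_fprs(FPRS_FEF);  /* enable the FPU for our use */

                fp_block_copy_body();       /* the %f16-%f62 copy loops */

                if (fprs & FPRS_FEF)
                        restore_fp_quadrants(savearea);
                else
                        zero_fp_quadrants();    /* leave no copy data behind */
                wr_fprs(fprs);              /* put %fprs back as found */
        }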
2394 2336
2395 -
2396 -#ifdef lint
2397 -
2398 -/*ARGSUSED*/
2399 -int
2400 -xcopyout(const void *kaddr, void *uaddr, size_t count)
2401 -{ return (0); }
2402 -
2403 -#else /* lint */
2404 -
2405 2337 ENTRY(xcopyout)
2406 2338 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
2407 2339 bleu,pt %ncc, .xcopyout_small ! go to larger cases
2408 2340 xor %o0, %o1, %o3 ! are src, dst alignable?
2409 2341 btst 7, %o3 !
2410 2342 bz,pt %ncc, .xcopyout_8 !
2411 2343 nop
2412 2344 btst 1, %o3 !
2413 2345 bz,pt %ncc, .xcopyout_2 ! check for half-word
2414 2346 nop
2415 2347 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
2416 2348 ld [%o3 + %lo(hw_copy_limit_1)], %o3
2417 2349 tst %o3
2418 2350 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy
2419 2351 cmp %o2, %o3 ! if length <= limit
2420 2352 bleu,pt %ncc, .xcopyout_small ! go to small copy
2421 2353 nop
2422 2354 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy
2423 2355 nop
2424 2356 .xcopyout_2:
2425 2357 btst 3, %o3 !
2426 2358 bz,pt %ncc, .xcopyout_4 ! check for word alignment
2427 2359 nop
2428 2360 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
2429 2361 ld [%o3 + %lo(hw_copy_limit_2)], %o3
2430 2362 tst %o3
2431 2363 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy
2432 2364 cmp %o2, %o3 ! if length <= limit
2433 2365 bleu,pt %ncc, .xcopyout_small ! go to small copy
2434 2366 nop
2435 2367 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy
2436 2368 nop
2437 2369 .xcopyout_4:
2438 2370 ! already checked longword, must be word aligned
2439 2371 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
2440 2372 ld [%o3 + %lo(hw_copy_limit_4)], %o3
2441 2373 tst %o3
2442 2374 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy
2443 2375 cmp %o2, %o3 ! if length <= limit
2444 2376 bleu,pt %ncc, .xcopyout_small ! go to small copy
2445 2377 nop
2446 2378 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy
2447 2379 nop
2448 2380 .xcopyout_8:
2449 2381 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
2450 2382 ld [%o3 + %lo(hw_copy_limit_8)], %o3
2451 2383 tst %o3
2452 2384 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy
2453 2385 cmp %o2, %o3 ! if length <= limit
2454 2386 bleu,pt %ncc, .xcopyout_small ! go to small copy
2455 2387 nop
2456 2388 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy
2457 2389 nop
2458 2390
2459 2391 .xcopyout_small:
2460 2392 sethi %hi(.sm_xcopyout_err), %o5 ! .sm_xcopyout_err is lofault
2461 2393 or %o5, %lo(.sm_xcopyout_err), %o5
2462 2394 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler
2463 2395 membar #Sync ! sync error barrier
2464 2396 ba,pt %ncc, .sm_do_copyout ! common code
2465 2397 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault
2466 2398
2467 2399 .xcopyout_more:
2468 2400 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2469 2401 sethi %hi(.xcopyout_err), REAL_LOFAULT
2470 2402 ba,pt %ncc, .do_copyout ! common code
2471 2403 or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
2472 2404
2473 2405 /*
2474 2406 * We got here because of a fault during xcopyout.
2475 2407 * Errno value is in ERRNO.
2476 2408 */
2477 2409 .xcopyout_err:
2478 2410 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
2479 2411 tst %o4
2480 2412 bz,pt %ncc, 2f ! if not, return error
2481 2413 nop
2482 2414 ldn [%o4 + CP_XCOPYOUT], %g2 ! if handler, invoke it with
2483 2415 jmp %g2 ! original arguments
2484 2416 restore %g0, 0, %g0 ! dispose of copy window
2485 2417 2:
2486 2418 ret
2487 2419 restore ERRNO, 0, %o0 ! return errno value
2488 2420
2489 2421 .sm_xcopyout_err:
2490 2422
2491 2423 membar #Sync
2492 2424 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2493 2425 mov SM_SAVE_SRC, %o0
2494 2426 mov SM_SAVE_DST, %o1
2495 2427 mov SM_SAVE_COUNT, %o2
2496 2428 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler
2497 2429 tst %o3
2498 2430 bz,pt %ncc, 3f ! if not, return error
2499 2431 nop
2500 2432 ldn [%o3 + CP_XCOPYOUT], %o5 ! if handler, invoke it with
2501 2433 jmp %o5 ! original arguments
2502 2434 nop
2503 2435 3:
2504 2436 retl
2505 2437 or %g1, 0, %o0 ! return errno value
2506 2438
2507 2439 SET_SIZE(xcopyout)
2508 2440
2509 -#endif /* lint */
2510 -
2511 -#ifdef lint
2512 -
2513 -/*ARGSUSED*/
2514 -int
2515 -xcopyout_little(const void *kaddr, void *uaddr, size_t count)
2516 -{ return (0); }
2517 -
2518 -#else /* lint */
2519 -
2520 2441 ENTRY(xcopyout_little)
2521 2442 sethi %hi(.xcopyio_err), %o5
2522 2443 or %o5, %lo(.xcopyio_err), %o5
2523 2444 ldn [THREAD_REG + T_LOFAULT], %o4
2524 2445 membar #Sync ! sync error barrier
2525 2446 stn %o5, [THREAD_REG + T_LOFAULT]
2526 2447 mov %o4, %o5
2527 2448
2528 2449 subcc %g0, %o2, %o3
2529 2450 add %o0, %o2, %o0
2530 2451 bz,pn %ncc, 2f ! check for zero bytes
2531 2452 sub %o2, 1, %o4
2532 2453 add %o0, %o4, %o0 ! start w/last byte
2533 2454 add %o1, %o2, %o1
2534 2455 ldub [%o0 + %o3], %o4
2535 2456
2536 2457 1: stba %o4, [%o1 + %o3]ASI_AIUSL
2537 2458 inccc %o3
2538 2459 sub %o0, 2, %o0 ! get next byte
2539 2460 bcc,a,pt %ncc, 1b
2540 2461 ldub [%o0 + %o3], %o4
2541 2462
2542 2463 2:
2543 2464 membar #Sync ! sync error barrier
2544 2465 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2545 2466 retl
2546 2467 mov %g0, %o0 ! return (0)
2547 2468
2548 2469 SET_SIZE(xcopyout_little)
2549 2470
2550 -#endif /* lint */
2551 -
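
    Reviewer note on the dataflow through xcopyout_little: the source pointer
    walks backwards while the destination walks forwards through the
    little-endian user ASI (single-byte accesses themselves are order
    neutral), so for the <= 16 byte requests do_unaligned sends here the net
    effect is a byte-order reversal of one datum. A sketch of the
    computation, under that reading; the name is hypothetical and the ASI
    and fault handling are elided.

        #include <stdint.h>
        #include <stddef.h>

        /* models the loop's addressing only, not the ASI or fault checks */
        static void
        xcopy_little_sketch(const uint8_t *kaddr, uint8_t *uaddr, size_t count)
        {
                size_t i;

                /* source walks backwards, destination forwards: a
                 * byte-order reversal of the small datum transferred */
                for (i = 0; i < count; i++)
                        uaddr[i] = kaddr[count - 1 - i];
        }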
2552 2471 /*
2553 2472 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
2554 2473 */
2555 2474
2556 -#if defined(lint)
2557 -
2558 -/*ARGSUSED*/
2559 -int
2560 -copyin(const void *uaddr, void *kaddr, size_t count)
2561 -{ return (0); }
2562 -
2563 -#else /* lint */
2564 -
2565 2475 ENTRY(copyin)
2566 2476 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
2567 2477 bleu,pt %ncc, .copyin_small ! go to larger cases
2568 2478 xor %o0, %o1, %o3 ! are src, dst alignable?
2569 2479 btst 7, %o3 !
2570 2480 bz,pt %ncc, .copyin_8 ! check for longword alignment
2571 2481 nop
2572 2482 btst 1, %o3 !
2573 2483 bz,pt %ncc, .copyin_2 ! check for half-word
2574 2484 nop
2575 2485 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
2576 2486 ld [%o3 + %lo(hw_copy_limit_1)], %o3
2577 2487 tst %o3
2578 2488 bz,pn %icc, .copyin_small ! if zero, disable HW copy
2579 2489 cmp %o2, %o3 ! if length <= limit
2580 2490 bleu,pt %ncc, .copyin_small ! go to small copy
2581 2491 nop
2582 2492 ba,pt %ncc, .copyin_more ! otherwise go to large copy
2583 2493 nop
2584 2494 .copyin_2:
2585 2495 btst 3, %o3 !
2586 2496 bz,pt %ncc, .copyin_4 ! check for word alignment
2587 2497 nop
2588 2498 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
2589 2499 ld [%o3 + %lo(hw_copy_limit_2)], %o3
2590 2500 tst %o3
2591 2501 bz,pn %icc, .copyin_small ! if zero, disable HW copy
2592 2502 cmp %o2, %o3 ! if length <= limit
2593 2503 bleu,pt %ncc, .copyin_small ! go to small copy
2594 2504 nop
2595 2505 ba,pt %ncc, .copyin_more ! otherwise go to large copy
2596 2506 nop
2597 2507 .copyin_4:
2598 2508 ! already checked longword, must be word aligned
2599 2509 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
2600 2510 ld [%o3 + %lo(hw_copy_limit_4)], %o3
2601 2511 tst %o3
2602 2512 bz,pn %icc, .copyin_small ! if zero, disable HW copy
2603 2513 cmp %o2, %o3 ! if length <= limit
2604 2514 bleu,pt %ncc, .copyin_small ! go to small copy
2605 2515 nop
2606 2516 ba,pt %ncc, .copyin_more ! otherwise go to large copy
2607 2517 nop
2608 2518 .copyin_8:
2609 2519 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
2610 2520 ld [%o3 + %lo(hw_copy_limit_8)], %o3
2611 2521 tst %o3
2612 2522 bz,pn %icc, .copyin_small ! if zero, disable HW copy
2613 2523 cmp %o2, %o3 ! if length <= limit
2614 2524 bleu,pt %ncc, .copyin_small ! go to small copy
2615 2525 nop
2616 2526 ba,pt %ncc, .copyin_more ! otherwise go to large copy
2617 2527 nop
2618 2528
2619 2529 .align 16
2620 2530 nop ! instruction alignment
2621 2531 ! see discussion at start of file
2622 2532 .copyin_small:
2623 2533 sethi %hi(.sm_copyin_err), %o5 ! .sm_copyin_err is lofault
2624 2534 or %o5, %lo(.sm_copyin_err), %o5
2625 2535 ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofault, no tramp
2626 2536 membar #Sync ! sync error barrier
2627 2537 stn %o5, [THREAD_REG + T_LOFAULT]
2628 2538 .sm_do_copyin:
2629 2539 mov %o0, SM_SAVE_SRC
2630 2540 mov %o1, SM_SAVE_DST
2631 2541 cmp %o2, SHORTCOPY ! check for really short case
2632 2542 bleu,pt %ncc, .ci_sm_left !
2633 2543 mov %o2, SM_SAVE_COUNT
2634 2544 cmp %o2, CHKSIZE ! check for medium length cases
2635 2545 bgu,pn %ncc, .ci_med !
2636 2546 or %o0, %o1, %o3 ! prepare alignment check
2637 2547 andcc %o3, 0x3, %g0 ! test for alignment
2638 2548 bz,pt %ncc, .ci_sm_word ! branch to word aligned case
2639 2549 .ci_sm_movebytes:
2640 2550 sub %o2, 3, %o2 ! adjust count to allow cc zero test
2641 2551 .ci_sm_notalign4:
2642 2552 lduba [%o0]ASI_USER, %o3 ! read byte
2643 2553 subcc %o2, 4, %o2 ! reduce count by 4
2644 2554 stb %o3, [%o1] ! write byte
2645 2555 add %o0, 1, %o0 ! advance SRC by 1
2646 2556 lduba [%o0]ASI_USER, %o3 ! repeat for a total of 4 bytes
2647 2557 add %o0, 1, %o0 ! advance SRC by 1
2648 2558 stb %o3, [%o1 + 1]
2649 2559 add %o1, 4, %o1 ! advance DST by 4
2650 2560 lduba [%o0]ASI_USER, %o3
2651 2561 add %o0, 1, %o0 ! advance SRC by 1
2652 2562 stb %o3, [%o1 - 2]
2653 2563 lduba [%o0]ASI_USER, %o3
2654 2564 add %o0, 1, %o0 ! advance SRC by 1
2655 2565 bgt,pt %ncc, .ci_sm_notalign4 ! loop til 3 or fewer bytes remain
2656 2566 stb %o3, [%o1 - 1]
2657 2567 add %o2, 3, %o2 ! restore count
2658 2568 .ci_sm_left:
2659 2569 tst %o2
2660 2570 bz,pt %ncc, .ci_sm_exit
2661 2571 nop
2662 2572 lduba [%o0]ASI_USER, %o3 ! load one byte
2663 2573 deccc %o2 ! reduce count for cc test
2664 2574 bz,pt %ncc, .ci_sm_exit
2665 2575 stb %o3,[%o1] ! store one byte
2666 2576 inc %o0
2667 2577 lduba [%o0]ASI_USER, %o3 ! load second byte
2668 2578 deccc %o2
2669 2579 bz,pt %ncc, .ci_sm_exit
2670 2580 stb %o3,[%o1 + 1] ! store second byte
2671 2581 inc %o0
2672 2582 lduba [%o0]ASI_USER, %o3 ! load third byte
2673 2583 stb %o3,[%o1 + 2] ! store third byte
2674 2584 membar #Sync ! sync error barrier
2675 2585 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2676 2586 retl
2677 2587 mov %g0, %o0 ! return 0
2678 2588 .align 16
2679 2589 .ci_sm_words:
2680 2590 lduwa [%o0]ASI_USER, %o3 ! read word
2681 2591 .ci_sm_wordx:
2682 2592 subcc %o2, 8, %o2 ! update count
2683 2593 stw %o3, [%o1] ! write word
2684 2594 add %o0, 4, %o0 ! update SRC
2685 2595 add %o1, 8, %o1 ! update DST
2686 2596 lduwa [%o0]ASI_USER, %o3 ! read word
2687 2597 add %o0, 4, %o0 ! update SRC
2688 2598 bgt,pt %ncc, .ci_sm_words ! loop til done
2689 2599 stw %o3, [%o1 - 4] ! write word
2690 2600 addcc %o2, 7, %o2 ! restore count
2691 2601 bz,pt %ncc, .ci_sm_exit
2692 2602 nop
2693 2603 deccc %o2
2694 2604 bz,pt %ncc, .ci_sm_byte
2695 2605 .ci_sm_half:
2696 2606 subcc %o2, 2, %o2 ! reduce count by 2
2697 2607 lduha [%o0]ASI_USER, %o3 ! read half word
2698 2608 add %o0, 2, %o0 ! advance SRC by 2
2699 2609 add %o1, 2, %o1 ! advance DST by 2
2700 2610 bgt,pt %ncc, .ci_sm_half ! loop til done
2701 2611 sth %o3, [%o1 - 2] ! write half word
2702 2612 addcc %o2, 1, %o2 ! restore count
2703 2613 bz,pt %ncc, .ci_sm_exit
2704 2614 nop
2705 2615 .ci_sm_byte:
2706 2616 lduba [%o0]ASI_USER, %o3
2707 2617 stb %o3, [%o1]
2708 2618 membar #Sync ! sync error barrier
2709 2619 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2710 2620 retl
2711 2621 mov %g0, %o0 ! return 0
2712 2622 .align 16
2713 2623 .ci_sm_word:
2714 2624 subcc %o2, 4, %o2 ! update count
2715 2625 bgt,pt %ncc, .ci_sm_wordx
2716 2626 lduwa [%o0]ASI_USER, %o3 ! read word
2717 2627 addcc %o2, 3, %o2 ! restore count
2718 2628 bz,pt %ncc, .ci_sm_exit
2719 2629 stw %o3, [%o1] ! write word
2720 2630 deccc %o2 ! reduce count for cc test
2721 2631 add %o0, 4, %o0
2722 2632 lduba [%o0]ASI_USER, %o3 ! load one byte
2723 2633 bz,pt %ncc, .ci_sm_exit
2724 2634 stb %o3, [%o1 + 4] ! store one byte
2725 2635 inc %o0
2726 2636 lduba [%o0]ASI_USER, %o3 ! load second byte
2727 2637 deccc %o2
2728 2638 bz,pt %ncc, .ci_sm_exit
2729 2639 stb %o3, [%o1 + 5] ! store second byte
2730 2640 inc %o0
2731 2641 lduba [%o0]ASI_USER, %o3 ! load third byte
2732 2642 stb %o3, [%o1 + 6] ! store third byte
2733 2643 .ci_sm_exit:
2734 2644 membar #Sync ! sync error barrier
2735 2645 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2736 2646 retl
2737 2647 mov %g0, %o0 ! return 0
2738 2648
2739 2649 .align 16
2740 2650 .ci_med:
2741 2651 xor %o0, %o1, %o3 ! setup alignment check
2742 2652 btst 1, %o3
2743 2653 bnz,pt %ncc, .ci_sm_movebytes ! unaligned
2744 2654 nop
2745 2655 btst 3, %o3
2746 2656 bnz,pt %ncc, .ci_med_half ! halfword aligned
2747 2657 nop
2748 2658 btst 7, %o3
2749 2659 bnz,pt %ncc, .ci_med_word ! word aligned
2750 2660 nop
2751 2661 .ci_med_long:
2752 2662 btst 3, %o0 ! check for
2753 2663 bz,pt %ncc, .ci_med_long1 ! word alignment
2754 2664 nop
2755 2665 .ci_med_long0:
2756 2666 lduba [%o0]ASI_USER, %o3 ! load one byte
2757 2667 inc %o0
2758 2668 stb %o3,[%o1] ! store byte
2759 2669 inc %o1
2760 2670 btst 3, %o0
2761 2671 bnz,pt %ncc, .ci_med_long0
2762 2672 dec %o2
2763 2673 .ci_med_long1: ! word aligned
2764 2674 btst 7, %o0 ! check for long word
2765 2675 bz,pt %ncc, .ci_med_long2
2766 2676 nop
2767 2677 lduwa [%o0]ASI_USER, %o3 ! load word
2768 2678 add %o0, 4, %o0 ! advance SRC by 4
2769 2679 stw %o3, [%o1] ! store word
2770 2680 add %o1, 4, %o1 ! advance DST by 4
2771 2681 sub %o2, 4, %o2 ! reduce count by 4
2772 2682 !
2773 2683 ! Now long word aligned and have at least 32 bytes to move
2774 2684 !
2775 2685 .ci_med_long2:
2776 2686 sub %o2, 31, %o2 ! adjust count to allow cc zero test
2777 2687 .ci_med_lmove:
2778 2688 ldxa [%o0]ASI_USER, %o3 ! read long word
2779 2689 subcc %o2, 32, %o2 ! reduce count by 32
2780 2690 stx %o3, [%o1] ! write long word
2781 2691 add %o0, 8, %o0 ! advance SRC by 8
2782 2692 ldxa [%o0]ASI_USER, %o3 ! repeat for a total of 4 long words
2783 2693 add %o0, 8, %o0 ! advance SRC by 8
2784 2694 stx %o3, [%o1 + 8]
2785 2695 add %o1, 32, %o1 ! advance DST by 32
2786 2696 ldxa [%o0]ASI_USER, %o3
2787 2697 add %o0, 8, %o0 ! advance SRC by 8
2788 2698 stx %o3, [%o1 - 16]
2789 2699 ldxa [%o0]ASI_USER, %o3
2790 2700 add %o0, 8, %o0 ! advance SRC by 8
2791 2701 bgt,pt %ncc, .ci_med_lmove ! loop til 31 or fewer bytes left
2792 2702 stx %o3, [%o1 - 8]
2793 2703 addcc %o2, 24, %o2 ! restore count to long word offset
2794 2704 ble,pt %ncc, .ci_med_lextra ! check for more long words to move
2795 2705 nop
2796 2706 .ci_med_lword:
2797 2707 ldxa [%o0]ASI_USER, %o3 ! read long word
2798 2708 subcc %o2, 8, %o2 ! reduce count by 8
2799 2709 stx %o3, [%o1] ! write long word
2800 2710 add %o0, 8, %o0 ! advance SRC by 8
2801 2711 bgt,pt %ncc, .ci_med_lword ! loop til 7 or fewer bytes left
2802 2712 add %o1, 8, %o1 ! advance DST by 8
2803 2713 .ci_med_lextra:
2804 2714 addcc %o2, 7, %o2 ! restore rest of count
2805 2715 bz,pt %ncc, .ci_sm_exit ! if zero, then done
2806 2716 deccc %o2
2807 2717 bz,pt %ncc, .ci_sm_byte
2808 2718 nop
2809 2719 ba,pt %ncc, .ci_sm_half
2810 2720 nop
2811 2721
2812 2722 .align 16
2813 2723 nop ! instruction alignment
2814 2724 ! see discussion at start of file
2815 2725 .ci_med_word:
2816 2726 btst 3, %o0 ! check for
2817 2727 bz,pt %ncc, .ci_med_word1 ! word alignment
2818 2728 nop
2819 2729 .ci_med_word0:
2820 2730 lduba [%o0]ASI_USER, %o3 ! load one byte
2821 2731 inc %o0
2822 2732 stb %o3,[%o1] ! store byte
2823 2733 inc %o1
2824 2734 btst 3, %o0
2825 2735 bnz,pt %ncc, .ci_med_word0
2826 2736 dec %o2
2827 2737 !
2828 2738 ! Now word aligned and have at least 36 bytes to move
2829 2739 !
2830 2740 .ci_med_word1:
2831 2741 sub %o2, 15, %o2 ! adjust count to allow cc zero test
2832 2742 .ci_med_wmove:
2833 2743 lduwa [%o0]ASI_USER, %o3 ! read word
2834 2744 subcc %o2, 16, %o2 ! reduce count by 16
2835 2745 stw %o3, [%o1] ! write word
2836 2746 add %o0, 4, %o0 ! advance SRC by 4
2837 2747 lduwa [%o0]ASI_USER, %o3 ! repeat for a total of 4 words
2838 2748 add %o0, 4, %o0 ! advance SRC by 4
2839 2749 stw %o3, [%o1 + 4]
2840 2750 add %o1, 16, %o1 ! advance DST by 16
2841 2751 lduwa [%o0]ASI_USER, %o3
2842 2752 add %o0, 4, %o0 ! advance SRC by 4
2843 2753 stw %o3, [%o1 - 8]
2844 2754 lduwa [%o0]ASI_USER, %o3
2845 2755 add %o0, 4, %o0 ! advance SRC by 4
2846 2756 bgt,pt %ncc, .ci_med_wmove ! loop til 15 or fewer bytes left
2847 2757 stw %o3, [%o1 - 4]
2848 2758 addcc %o2, 12, %o2 ! restore count to word offset
2849 2759 ble,pt %ncc, .ci_med_wextra ! check for more words to move
2850 2760 nop
2851 2761 .ci_med_word2:
2852 2762 lduwa [%o0]ASI_USER, %o3 ! read word
2853 2763 subcc %o2, 4, %o2 ! reduce count by 4
2854 2764 stw %o3, [%o1] ! write word
2855 2765 add %o0, 4, %o0 ! advance SRC by 4
2856 2766 bgt,pt %ncc, .ci_med_word2 ! loop til 3 or fewer bytes left
2857 2767 add %o1, 4, %o1 ! advance DST by 4
2858 2768 .ci_med_wextra:
2859 2769 addcc %o2, 3, %o2 ! restore rest of count
2860 2770 bz,pt %ncc, .ci_sm_exit ! if zero, then done
2861 2771 deccc %o2
2862 2772 bz,pt %ncc, .ci_sm_byte
2863 2773 nop
2864 2774 ba,pt %ncc, .ci_sm_half
2865 2775 nop
2866 2776
2867 2777 .align 16
2868 2778 nop ! instruction alignment
2869 2779 ! see discussion at start of file
2870 2780 .ci_med_half:
2871 2781 btst 1, %o0 ! check for
2872 2782 bz,pt %ncc, .ci_med_half1 ! half word alignment
2873 2783 nop
2874 2784 lduba [%o0]ASI_USER, %o3 ! load one byte
2875 2785 inc %o0
2876 2786 stb %o3,[%o1] ! store byte
2877 2787 inc %o1
2878 2788 dec %o2
2879 2789 !
2880 2790 ! Now half word aligned and have at least 38 bytes to move
2881 2791 !
2882 2792 .ci_med_half1:
2883 2793 sub %o2, 7, %o2 ! adjust count to allow cc zero test
2884 2794 .ci_med_hmove:
2885 2795 lduha [%o0]ASI_USER, %o3 ! read half word
2886 2796 subcc %o2, 8, %o2 ! reduce count by 8
2887 2797 sth %o3, [%o1] ! write half word
2888 2798 add %o0, 2, %o0 ! advance SRC by 2
2889 2799 lduha [%o0]ASI_USER, %o3 ! repeat for a total of 4 halfwords
2890 2800 add %o0, 2, %o0 ! advance SRC by 2
2891 2801 sth %o3, [%o1 + 2]
2892 2802 add %o1, 8, %o1 ! advance DST by 8
2893 2803 lduha [%o0]ASI_USER, %o3
2894 2804 add %o0, 2, %o0 ! advance SRC by 2
2895 2805 sth %o3, [%o1 - 4]
2896 2806 lduha [%o0]ASI_USER, %o3
2897 2807 add %o0, 2, %o0 ! advance SRC by 2
2898 2808 bgt,pt %ncc, .ci_med_hmove ! loop til 7 or fewer bytes left
2899 2809 sth %o3, [%o1 - 2]
2900 2810 addcc %o2, 7, %o2 ! restore count
2901 2811 bz,pt %ncc, .ci_sm_exit
2902 2812 deccc %o2
2903 2813 bz,pt %ncc, .ci_sm_byte
2904 2814 nop
2905 2815 ba,pt %ncc, .ci_sm_half
2906 2816 nop
2907 2817
2908 2818 .sm_copyin_err:
2909 2819 membar #Sync
2910 2820 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2911 2821 mov SM_SAVE_SRC, %o0
2912 2822 mov SM_SAVE_DST, %o1
2913 2823 mov SM_SAVE_COUNT, %o2
2914 2824 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler
2915 2825 tst %o3
2916 2826 bz,pt %ncc, 3f ! if not, return error
2917 2827 nop
2918 2828 ldn [%o3 + CP_COPYIN], %o5 ! if handler, invoke it with
2919 2829 jmp %o5 ! original arguments
2920 2830 nop
2921 2831 3:
2922 2832 retl
2923 2833 or %g0, -1, %o0 ! return errno value
2924 2834
2925 2835 SET_SIZE(copyin)
2926 2836
2927 2837
2928 2838 /*
2929 2839 * The _more entry points are not intended to be used directly by
2930 2840 * any caller from outside this file. They are provided to allow
2931 2841 * profiling and DTrace of the portions of the copy code that use
2932 2842 * the floating point registers.
2933 2843 * This entry is particularly important as DTrace (at least as of
2934 2844 * 4/2004) does not support leaf functions.
2935 2845 */
2936 2846
2937 2847 ENTRY(copyin_more)
2938 2848 .copyin_more:
2939 2849 prefetch [%o0], #n_reads
2940 2850 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2941 2851 set .copyin_err, REAL_LOFAULT
2942 2852
2943 2853 /*
2944 2854 * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes
2945 2855 */
2946 2856 .do_copyin:
2947 2857 set copyio_fault, %l7 ! copyio_fault is lofault val
2948 2858
2949 2859 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler
2950 2860 membar #Sync ! sync error barrier
2951 2861 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
2952 2862
2953 2863 mov %i0, SAVE_SRC
2954 2864 mov %i1, SAVE_DST
2955 2865 mov %i2, SAVE_COUNT
2956 2866
2957 2867 FP_NOMIGRATE(6, 7)
2958 2868
2959 2869 rd %fprs, %o2 ! check for unused fp
2960 2870 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
2961 2871 btst FPRS_FEF, %o2
2962 2872 bz,a,pt %icc, .do_blockcopyin
2963 2873 wr %g0, FPRS_FEF, %fprs
2964 2874
2965 2875 BST_FPQ2Q4_TOSTACK(%o2)
2966 2876
2967 2877 .do_blockcopyin:
2968 2878 rd %gsr, %o2
2969 2879 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr
2970 2880 or %l6, FPUSED_FLAG, %l6
2971 2881
2972 2882 andcc DST, VIS_BLOCKSIZE - 1, TMP
2973 2883 mov ASI_USER, %asi
2974 2884 bz,pt %ncc, 2f
2975 2885 neg TMP
2976 2886 add TMP, VIS_BLOCKSIZE, TMP
2977 2887
2978 2888 ! TMP = bytes required to align DST on FP_BLOCK boundary
2979 2889 ! Using SRC as a tmp here
2980 2890 cmp TMP, 3
2981 2891 bleu,pt %ncc, 1f
2982 2892 sub CNT,TMP,CNT ! adjust main count
2983 2893 sub TMP, 3, TMP ! adjust for end of loop test
2984 2894 .ci_blkalign:
2985 2895 lduba [REALSRC]%asi, SRC ! move 4 bytes per loop iteration
2986 2896 stb SRC, [DST]
2987 2897 subcc TMP, 4, TMP
2988 2898 lduba [REALSRC + 1]%asi, SRC
2989 2899 add REALSRC, 4, REALSRC
2990 2900 stb SRC, [DST + 1]
2991 2901 lduba [REALSRC - 2]%asi, SRC
2992 2902 add DST, 4, DST
2993 2903 stb SRC, [DST - 2]
2994 2904 lduba [REALSRC - 1]%asi, SRC
2995 2905 bgu,pt %ncc, .ci_blkalign
2996 2906 stb SRC, [DST - 1]
2997 2907
2998 2908 addcc TMP, 3, TMP ! restore count adjustment
2999 2909 bz,pt %ncc, 2f ! no bytes left?
3000 2910 nop
3001 2911 1: lduba [REALSRC]%asi, SRC
3002 2912 inc REALSRC
3003 2913 inc DST
3004 2914 deccc TMP
3005 2915 bgu %ncc, 1b
3006 2916 stb SRC, [DST - 1]
3007 2917
3008 2918 2:
3009 2919 membar #StoreLoad
3010 2920 andn REALSRC, 0x7, SRC
3011 2921
3012 2922 ! SRC - 8-byte aligned
3013 2923 ! DST - 64-byte aligned
3014 2924 ldda [SRC]%asi, %f16
3015 2925 prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #n_reads
3016 2926 alignaddr REALSRC, %g0, %g0
3017 2927 ldda [SRC + 0x08]%asi, %f18
3018 2928 prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #n_reads
3019 2929 faligndata %f16, %f18, %f48
3020 2930 ldda [SRC + 0x10]%asi, %f20
3021 2931 prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
3022 2932 faligndata %f18, %f20, %f50
3023 2933 ldda [SRC + 0x18]%asi, %f22
3024 2934 prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read
3025 2935 faligndata %f20, %f22, %f52
3026 2936 ldda [SRC + 0x20]%asi, %f24
3027 2937 prefetcha [SRC + (8 * VIS_BLOCKSIZE)]%asi, #one_read
3028 2938 faligndata %f22, %f24, %f54
3029 2939 ldda [SRC + 0x28]%asi, %f26
3030 2940 prefetcha [SRC + (12 * VIS_BLOCKSIZE)]%asi, #one_read
3031 2941 faligndata %f24, %f26, %f56
3032 2942 ldda [SRC + 0x30]%asi, %f28
3033 2943 prefetcha [SRC + (16 * VIS_BLOCKSIZE)]%asi, #one_read
3034 2944 faligndata %f26, %f28, %f58
3035 2945 ldda [SRC + 0x38]%asi, %f30
3036 2946 ldda [SRC + VIS_BLOCKSIZE]%asi, %f16
3037 2947 sub CNT, VIS_BLOCKSIZE, CNT
3038 2948 add SRC, VIS_BLOCKSIZE, SRC
3039 2949 prefetcha [SRC + (19 * VIS_BLOCKSIZE)]%asi, #one_read
3040 2950 add REALSRC, VIS_BLOCKSIZE, REALSRC
3041 2951 ba,pt %ncc, 1f
3042 2952 prefetcha [SRC + (23 * VIS_BLOCKSIZE)]%asi, #one_read
3043 2953 .align 32
3044 2954 1:
3045 2955 ldda [SRC + 0x08]%asi, %f18
3046 2956 faligndata %f28, %f30, %f60
3047 2957 ldda [SRC + 0x10]%asi, %f20
3048 2958 faligndata %f30, %f16, %f62
3049 2959 stda %f48, [DST]ASI_BLK_P
3050 2960 ldda [SRC + 0x18]%asi, %f22
3051 2961 faligndata %f16, %f18, %f48
3052 2962 ldda [SRC + 0x20]%asi, %f24
3053 2963 faligndata %f18, %f20, %f50
3054 2964 ldda [SRC + 0x28]%asi, %f26
3055 2965 faligndata %f20, %f22, %f52
3056 2966 ldda [SRC + 0x30]%asi, %f28
3057 2967 faligndata %f22, %f24, %f54
3058 2968 sub CNT, VIS_BLOCKSIZE, CNT
3059 2969 ldda [SRC + 0x38]%asi, %f30
3060 2970 faligndata %f24, %f26, %f56
3061 2971 add DST, VIS_BLOCKSIZE, DST
3062 2972 ldda [SRC + VIS_BLOCKSIZE]%asi, %f16
3063 2973 faligndata %f26, %f28, %f58
3064 2974 add REALSRC, VIS_BLOCKSIZE, REALSRC
3065 2975 prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
3066 2976 add SRC, VIS_BLOCKSIZE, SRC
3067 2977 prefetcha [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
3068 2978 cmp CNT, VIS_BLOCKSIZE + 8
3069 2979 bgu,pt %ncc, 1b
3070 2980 prefetcha [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
3071 2981
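The loop above is software pipelined: each pass loads the next eight doublewords, faligndata merges adjacent pairs into the aligned output registers %f48-%f62, and stda flushes them with a 64-byte block store. What faligndata computes can be modeled in C for big-endian doublewords, where k is the byte offset latched earlier by alignaddr; this is a sketch of the instruction's semantics, not code from this file:

	#include <stdint.h>

	/*
	 * Model of faligndata: concatenate two big-endian doublewords
	 * and extract the eight bytes starting at byte offset k (0..7),
	 * k having been latched into %gsr by alignaddr.
	 */
	static uint64_t
	faligndata_model(uint64_t hi, uint64_t lo, unsigned int k)
	{
		if (k == 0)
			return (hi);	/* avoid an undefined 64-bit shift */
		return ((hi << (8 * k)) | (lo >> (8 * (8 - k))));
	}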
3072 2982 	! take the fsrc1 (no realignment) path only if REALSRC & 0x7 is 0
3073 2983 cmp CNT, VIS_BLOCKSIZE
3074 2984 bne %ncc, 3f
3075 2985 andcc REALSRC, 0x7, %g0
3076 2986 bz,pt %ncc, 2f
3077 2987 nop
3078 2988 3:
3079 2989 faligndata %f28, %f30, %f60
3080 2990 faligndata %f30, %f16, %f62
3081 2991 stda %f48, [DST]ASI_BLK_P
3082 2992 add DST, VIS_BLOCKSIZE, DST
3083 2993 ba,pt %ncc, 3f
3084 2994 nop
3085 2995 2:
3086 2996 ldda [SRC + 0x08]%asi, %f18
3087 2997 fsrc1 %f28, %f60
3088 2998 ldda [SRC + 0x10]%asi, %f20
3089 2999 fsrc1 %f30, %f62
3090 3000 stda %f48, [DST]ASI_BLK_P
3091 3001 ldda [SRC + 0x18]%asi, %f22
3092 3002 fsrc1 %f16, %f48
3093 3003 ldda [SRC + 0x20]%asi, %f24
3094 3004 fsrc1 %f18, %f50
3095 3005 ldda [SRC + 0x28]%asi, %f26
3096 3006 fsrc1 %f20, %f52
3097 3007 ldda [SRC + 0x30]%asi, %f28
3098 3008 fsrc1 %f22, %f54
3099 3009 ldda [SRC + 0x38]%asi, %f30
3100 3010 fsrc1 %f24, %f56
3101 3011 sub CNT, VIS_BLOCKSIZE, CNT
3102 3012 add DST, VIS_BLOCKSIZE, DST
3103 3013 add SRC, VIS_BLOCKSIZE, SRC
3104 3014 add REALSRC, VIS_BLOCKSIZE, REALSRC
3105 3015 fsrc1 %f26, %f58
3106 3016 fsrc1 %f28, %f60
3107 3017 fsrc1 %f30, %f62
3108 3018 stda %f48, [DST]ASI_BLK_P
3109 3019 add DST, VIS_BLOCKSIZE, DST
3110 3020 ba,a,pt %ncc, 4f
3111 3021 nop
3112 3022
3113 3023 3: tst CNT
3114 3024 bz,a %ncc, 4f
3115 3025 nop
3116 3026
3117 3027 5: lduba [REALSRC]ASI_USER, TMP
3118 3028 inc REALSRC
3119 3029 inc DST
3120 3030 deccc CNT
3121 3031 bgu %ncc, 5b
3122 3032 stb TMP, [DST - 1]
3123 3033 4:
3124 3034
3125 3035 .copyin_exit:
3126 3036 membar #Sync
3127 3037
3128 3038 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr
3129 3039 wr %o2, 0, %gsr
3130 3040
3131 3041 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
3132 3042 btst FPRS_FEF, %o3
3133 3043 bz,pt %icc, 4f
3134 3044 nop
3135 3045
3136 3046 BLD_FPQ2Q4_FROMSTACK(%o2)
3137 3047
3138 3048 ba,pt %ncc, 1f
3139 3049 wr %o3, 0, %fprs ! restore fprs
3140 3050
3141 3051 4:
3142 3052 FZEROQ2Q4
3143 3053 wr %o3, 0, %fprs ! restore fprs
3144 3054
3145 3055 1:
3146 3056 membar #Sync ! sync error barrier
3147 3057 andn %l6, FPUSED_FLAG, %l6
3148 3058 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3149 3059 FP_ALLOWMIGRATE(5, 6)
3150 3060 ret
3151 3061 restore %g0, 0, %o0
3152 3062 /*
3153 3063 * We got here because of a fault during copyin
3154 3064 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
3155 3065 */
3156 3066 .copyin_err:
3157 3067 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
3158 3068 tst %o4
3159 3069 bz,pt %ncc, 2f ! if not, return error
3160 3070 nop
3161 3071 ldn [%o4 + CP_COPYIN], %g2 ! if handler, invoke it with
3162 3072 jmp %g2 ! original arguments
3163 3073 restore %g0, 0, %g0 ! dispose of copy window
3164 3074 2:
3165 3075 ret
3166 3076 restore %g0, -1, %o0 ! return error value
3167 3077
3168 3078
3169 3079 SET_SIZE(copyin_more)
3170 3080
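The .copyin_err sequence above encodes the copyops convention these routines share: on a fault, if the current thread has a T_COPYOPS vector installed, the failing primitive is retried through the vector's CP_COPYIN slot with the original arguments (preserved earlier in SAVE_SRC/SAVE_DST/SAVE_COUNT); otherwise -1 is returned as DDI/DKI requires. Roughly, in C (the struct shapes below are illustrative stand-ins, not the kernel's headers):

	#include <stddef.h>

	/* Illustrative stand-ins for the kernel's copyops hook. */
	struct copyops {
		int	(*cp_copyin)(const void *, void *, size_t);
	};
	struct kthread {
		struct copyops	*t_copyops;
	};
	extern struct kthread *curthread;

	static int
	copyin_err_model(const void *uaddr, void *kaddr, size_t count)
	{
		struct copyops *cp = curthread->t_copyops;

		if (cp != NULL)			/* handler installed? */
			return (cp->cp_copyin(uaddr, kaddr, count));
		return (-1);			/* DDI/DKI error value */
	}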
3171 -#endif /* lint */
3172 -
3173 -#ifdef lint
3174 -
3175 -/*ARGSUSED*/
3176 -int
3177 -xcopyin(const void *uaddr, void *kaddr, size_t count)
3178 -{ return (0); }
3179 -
3180 -#else /* lint */
3181 -
3182 3081 ENTRY(xcopyin)
3183 3082
3184 3083 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
3185 3084 	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3186 3085 xor %o0, %o1, %o3 ! are src, dst alignable?
3187 3086 btst 7, %o3 !
3188 3087 bz,pt %ncc, .xcopyin_8 ! check for longword alignment
3189 3088 nop
3190 3089 btst 1, %o3 !
3191 3090 bz,pt %ncc, .xcopyin_2 ! check for half-word
3192 3091 nop
3193 3092 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
3194 3093 ld [%o3 + %lo(hw_copy_limit_1)], %o3
3195 3094 tst %o3
3196 3095 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
3197 3096 cmp %o2, %o3 ! if length <= limit
3198 3097 bleu,pt %ncc, .xcopyin_small ! go to small copy
3199 3098 nop
3200 3099 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
3201 3100 nop
3202 3101 .xcopyin_2:
3203 3102 btst 3, %o3 !
3204 3103 bz,pt %ncc, .xcopyin_4 ! check for word alignment
3205 3104 nop
3206 3105 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
3207 3106 ld [%o3 + %lo(hw_copy_limit_2)], %o3
3208 3107 tst %o3
3209 3108 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
3210 3109 cmp %o2, %o3 ! if length <= limit
3211 3110 bleu,pt %ncc, .xcopyin_small ! go to small copy
3212 3111 nop
3213 3112 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
3214 3113 nop
3215 3114 .xcopyin_4:
3216 3115 ! already checked longword, must be word aligned
3217 3116 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
3218 3117 ld [%o3 + %lo(hw_copy_limit_4)], %o3
3219 3118 tst %o3
3220 3119 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
3221 3120 cmp %o2, %o3 ! if length <= limit
3222 3121 bleu,pt %ncc, .xcopyin_small ! go to small copy
3223 3122 nop
3224 3123 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
3225 3124 nop
3226 3125 .xcopyin_8:
3227 3126 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
3228 3127 ld [%o3 + %lo(hw_copy_limit_8)], %o3
3229 3128 tst %o3
3230 3129 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy
3231 3130 cmp %o2, %o3 ! if length <= limit
3232 3131 bleu,pt %ncc, .xcopyin_small ! go to small copy
3233 3132 nop
3234 3133 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy
3235 3134 nop
3236 3135
3237 3136 .xcopyin_small:
3238 3137 sethi %hi(.sm_xcopyin_err), %o5 ! .sm_xcopyin_err is lofault value
3239 3138 or %o5, %lo(.sm_xcopyin_err), %o5
3240 3139 	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofault
3241 3140 membar #Sync ! sync error barrier
3242 3141 ba,pt %ncc, .sm_do_copyin ! common code
3243 3142 stn %o5, [THREAD_REG + T_LOFAULT]
3244 3143
3245 3144 .xcopyin_more:
3246 3145 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3247 3146 sethi %hi(.xcopyin_err), REAL_LOFAULT ! .xcopyin_err is lofault value
3248 3147 ba,pt %ncc, .do_copyin
3249 3148 or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
3250 3149
3251 3150 /*
3252 3151 	 * We got here because of a fault during xcopyin
3253 3152 * Errno value is in ERRNO
3254 3153 */
3255 3154 .xcopyin_err:
3256 3155 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler
3257 3156 tst %o4
3258 3157 bz,pt %ncc, 2f ! if not, return error
3259 3158 nop
3260 3159 ldn [%o4 + CP_XCOPYIN], %g2 ! if handler, invoke it with
3261 3160 jmp %g2 ! original arguments
3262 3161 restore %g0, 0, %g0 ! dispose of copy window
3263 3162 2:
3264 3163 ret
3265 3164 restore ERRNO, 0, %o0 ! return errno value
3266 3165
3267 3166 .sm_xcopyin_err:
3268 3167
3269 3168 membar #Sync
3270 3169 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3271 3170 mov SM_SAVE_SRC, %o0
3272 3171 mov SM_SAVE_DST, %o1
3273 3172 mov SM_SAVE_COUNT, %o2
3274 3173 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler
3275 3174 tst %o3
3276 3175 bz,pt %ncc, 3f ! if not, return error
3277 3176 nop
3278 3177 ldn [%o3 + CP_XCOPYIN], %o5 ! if handler, invoke it with
3279 3178 jmp %o5 ! original arguments
3280 3179 nop
3281 3180 3:
3282 3181 retl
3283 3182 or %g1, 0, %o0 ! return errno value
3284 3183
3285 3184 SET_SIZE(xcopyin)
3286 3185
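The alignment dispatch above is the pattern every entry point uses: xor the source and destination addresses so their shared low bits reveal the widest mutual alignment, then compare the length against the matching hw_copy_limit_N tunable, where zero disables the FP/VIS path. As C, assuming the limits are the DGDEF words defined at the bottom of this file and an illustrative threshold value (the real VIS_COPY_THRESHOLD is defined earlier in the file):

	#include <stddef.h>
	#include <stdint.h>

	#define	VIS_COPY_THRESHOLD	256	/* illustrative value */

	extern unsigned int hw_copy_limit_1, hw_copy_limit_2,
	    hw_copy_limit_4, hw_copy_limit_8;

	/* Model of the small-copy vs. FP-block dispatch in xcopyin. */
	static int
	use_fpblk_copy(uintptr_t src, uintptr_t dst, size_t len)
	{
		uintptr_t x = src ^ dst;  /* shared low bits => alignable */
		unsigned int limit;

		if (len <= VIS_COPY_THRESHOLD)
			return (0);			/* small copy */
		if ((x & 7) == 0)
			limit = hw_copy_limit_8;	/* longword alignable */
		else if ((x & 1) != 0)
			limit = hw_copy_limit_1;	/* byte alignment only */
		else if ((x & 3) != 0)
			limit = hw_copy_limit_2;	/* halfword alignable */
		else
			limit = hw_copy_limit_4;	/* word alignable */
		return (limit != 0 && len > limit);	/* 0 disables HW copy */
	}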
3287 -#endif /* lint */
3288 -
3289 -#ifdef lint
3290 -
3291 -/*ARGSUSED*/
3292 -int
3293 -xcopyin_little(const void *uaddr, void *kaddr, size_t count)
3294 -{ return (0); }
3295 -
3296 -#else /* lint */
3297 -
3298 3186 ENTRY(xcopyin_little)
3299 3187 sethi %hi(.xcopyio_err), %o5
3300 3188 or %o5, %lo(.xcopyio_err), %o5
3301 3189 ldn [THREAD_REG + T_LOFAULT], %o4
3302 3190 membar #Sync ! sync error barrier
3303 3191 stn %o5, [THREAD_REG + T_LOFAULT]
3304 3192 mov %o4, %o5
3305 3193
3306 3194 subcc %g0, %o2, %o3
3307 3195 add %o0, %o2, %o0
3308 3196 bz,pn %ncc, 2f ! check for zero bytes
3309 3197 sub %o2, 1, %o4
3310 3198 add %o0, %o4, %o0 ! start w/last byte
3311 3199 add %o1, %o2, %o1
3312 3200 lduba [%o0 + %o3]ASI_AIUSL, %o4
3313 3201
3314 3202 1: stb %o4, [%o1 + %o3]
3315 3203 inccc %o3
3316 3204 sub %o0, 2, %o0 ! get next byte
3317 3205 bcc,a,pt %ncc, 1b
3318 3206 lduba [%o0 + %o3]ASI_AIUSL, %o4
3319 3207
3320 3208 2:
3321 3209 membar #Sync ! sync error barrier
3322 3210 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3323 3211 retl
3324 3212 mov %g0, %o0 ! return (0)
3325 3213
3326 3214 .xcopyio_err:
3327 3215 membar #Sync ! sync error barrier
3328 3216 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3329 3217 retl
3330 3218 mov %g1, %o0
3331 3219
3332 3220 SET_SIZE(xcopyin_little)
3333 3221
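xcopyin_little above walks the user buffer from its last byte to its first through ASI_AIUSL, the little-endian user ASI, so the destination receives the source bytes in reverse order; it returns 0 on success, or the errno left in %g1 by the trap handler on a fault. Stripped of the fault handling (the t_lofault swap and error return), the data movement reduces to this sketch:

	#include <stddef.h>
	#include <stdint.h>

	/*
	 * Byte-level model of xcopyin_little: copy count bytes with the
	 * byte order reversed, so kaddr[0] receives uaddr[count - 1].
	 * The real routine performs the loads through ASI_AIUSL; that
	 * machinery is elided here.
	 */
	static void
	xcopyin_little_model(const uint8_t *uaddr, uint8_t *kaddr, size_t count)
	{
		size_t i;

		for (i = 0; i < count; i++)
			kaddr[i] = uaddr[count - 1 - i];
	}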
3334 -#endif /* lint */
3335 3222
3336 -
3337 3223 /*
3338 3224 * Copy a block of storage - must not overlap (from + len <= to).
3339 3225 * No fault handler installed (to be called under on_fault())
3340 3226 */
3341 -#if defined(lint)
3342 -
3343 -/* ARGSUSED */
3344 -void
3345 -copyin_noerr(const void *ufrom, void *kto, size_t count)
3346 -{}
3347 -
3348 -#else /* lint */
3349 3227 ENTRY(copyin_noerr)
3350 3228
3351 3229 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
3352 3230 	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3353 3231 xor %o0, %o1, %o3 ! are src, dst alignable?
3354 3232 btst 7, %o3 !
3355 3233 bz,pt %ncc, .copyin_ne_8 ! check for longword alignment
3356 3234 nop
3357 3235 btst 1, %o3 !
3358 3236 bz,pt %ncc, .copyin_ne_2 ! check for half-word
3359 3237 nop
3360 3238 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
3361 3239 ld [%o3 + %lo(hw_copy_limit_1)], %o3
3362 3240 tst %o3
3363 3241 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
3364 3242 cmp %o2, %o3 ! if length <= limit
3365 3243 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3366 3244 nop
3367 3245 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
3368 3246 nop
3369 3247 .copyin_ne_2:
3370 3248 btst 3, %o3 !
3371 3249 bz,pt %ncc, .copyin_ne_4 ! check for word alignment
3372 3250 nop
3373 3251 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
3374 3252 ld [%o3 + %lo(hw_copy_limit_2)], %o3
3375 3253 tst %o3
3376 3254 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
3377 3255 cmp %o2, %o3 ! if length <= limit
3378 3256 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3379 3257 nop
3380 3258 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
3381 3259 nop
3382 3260 .copyin_ne_4:
3383 3261 ! already checked longword, must be word aligned
3384 3262 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
3385 3263 ld [%o3 + %lo(hw_copy_limit_4)], %o3
3386 3264 tst %o3
3387 3265 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
3388 3266 cmp %o2, %o3 ! if length <= limit
3389 3267 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3390 3268 nop
3391 3269 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
3392 3270 nop
3393 3271 .copyin_ne_8:
3394 3272 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
3395 3273 ld [%o3 + %lo(hw_copy_limit_8)], %o3
3396 3274 tst %o3
3397 3275 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy
3398 3276 cmp %o2, %o3 ! if length <= limit
3399 3277 bleu,pt %ncc, .copyin_ne_small ! go to small copy
3400 3278 nop
3401 3279 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy
3402 3280 nop
3403 3281
3404 3282 .copyin_ne_small:
3405 3283 ldn [THREAD_REG + T_LOFAULT], %o4
3406 3284 tst %o4
3407 3285 bz,pn %ncc, .sm_do_copyin
3408 3286 nop
3409 3287 sethi %hi(.sm_copyio_noerr), %o5
3410 3288 or %o5, %lo(.sm_copyio_noerr), %o5
3411 3289 membar #Sync ! sync error barrier
3412 3290 ba,pt %ncc, .sm_do_copyin
3413 3291 stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault
3414 3292
3415 3293 .copyin_noerr_more:
3416 3294 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3417 3295 sethi %hi(.copyio_noerr), REAL_LOFAULT
3418 3296 ba,pt %ncc, .do_copyin
3419 3297 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3420 3298
3421 3299 .copyio_noerr:
3422 3300 jmp %l6
3423 3301 restore %g0,0,%g0
3424 3302
3425 3303 .sm_copyio_noerr:
3426 3304 membar #Sync
3427 3305 stn %o4, [THREAD_REG + T_LOFAULT] ! restore t_lofault
3428 3306 jmp %o4
3429 3307 nop
3430 3308
3431 3309 SET_SIZE(copyin_noerr)
3432 -#endif /* lint */
3433 3310
3434 3311 /*
3435 3312 * Copy a block of storage - must not overlap (from + len <= to).
3436 3313 * No fault handler installed (to be called under on_fault())
3437 3314 */
3438 3315
3439 -#if defined(lint)
3440 -
3441 -/* ARGSUSED */
3442 -void
3443 -copyout_noerr(const void *kfrom, void *uto, size_t count)
3444 -{}
3445 -
3446 -#else /* lint */
3447 3316 ENTRY(copyout_noerr)
3448 3317
3449 3318 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case
3450 3319 	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3451 3320 xor %o0, %o1, %o3 ! are src, dst alignable?
3452 3321 btst 7, %o3 !
3453 3322 bz,pt %ncc, .copyout_ne_8 ! check for longword alignment
3454 3323 nop
3455 3324 btst 1, %o3 !
3456 3325 bz,pt %ncc, .copyout_ne_2 ! check for half-word
3457 3326 nop
3458 3327 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit
3459 3328 ld [%o3 + %lo(hw_copy_limit_1)], %o3
3460 3329 tst %o3
3461 3330 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
3462 3331 cmp %o2, %o3 ! if length <= limit
3463 3332 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3464 3333 nop
3465 3334 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
3466 3335 nop
3467 3336 .copyout_ne_2:
3468 3337 btst 3, %o3 !
3469 3338 bz,pt %ncc, .copyout_ne_4 ! check for word alignment
3470 3339 nop
3471 3340 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit
3472 3341 ld [%o3 + %lo(hw_copy_limit_2)], %o3
3473 3342 tst %o3
3474 3343 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
3475 3344 cmp %o2, %o3 ! if length <= limit
3476 3345 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3477 3346 nop
3478 3347 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
3479 3348 nop
3480 3349 .copyout_ne_4:
3481 3350 ! already checked longword, must be word aligned
3482 3351 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit
3483 3352 ld [%o3 + %lo(hw_copy_limit_4)], %o3
3484 3353 tst %o3
3485 3354 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
3486 3355 cmp %o2, %o3 ! if length <= limit
3487 3356 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3488 3357 nop
3489 3358 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
3490 3359 nop
3491 3360 .copyout_ne_8:
3492 3361 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit
3493 3362 ld [%o3 + %lo(hw_copy_limit_8)], %o3
3494 3363 tst %o3
3495 3364 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy
3496 3365 cmp %o2, %o3 ! if length <= limit
3497 3366 bleu,pt %ncc, .copyout_ne_small ! go to small copy
3498 3367 nop
3499 3368 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy
3500 3369 nop
3501 3370
3502 3371 .copyout_ne_small:
3503 3372 ldn [THREAD_REG + T_LOFAULT], %o4
3504 3373 tst %o4
3505 3374 bz,pn %ncc, .sm_do_copyout
3506 3375 nop
3507 3376 sethi %hi(.sm_copyio_noerr), %o5
3508 3377 or %o5, %lo(.sm_copyio_noerr), %o5
3509 3378 membar #Sync ! sync error barrier
3510 3379 ba,pt %ncc, .sm_do_copyout
3511 3380 stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault
3512 3381
3513 3382 .copyout_noerr_more:
3514 3383 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3515 3384 sethi %hi(.copyio_noerr), REAL_LOFAULT
3516 3385 ba,pt %ncc, .do_copyout
3517 3386 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3518 3387
3519 3388 SET_SIZE(copyout_noerr)
3520 -#endif /* lint */
3521 3389
3522 3390
3523 3391 /*
3524 3392  * hwblkclr - clears block-aligned regions that are a multiple of the
3525 3393  * block size and at least 256 bytes long, using spitfire's block stores. If
3526 3394 * the criteria for using this routine are not met then it calls bzero
3527 3395 * and returns 1. Otherwise 0 is returned indicating success.
3528 3396 * Caller is responsible for ensuring use_hw_bzero is true and that
3529 3397 * kpreempt_disable() has been called.
3530 3398 */
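The entry checks described above translate directly to C: punt to bzero unless the region is block aligned, a multiple of the block size, and at least 256 bytes long. A rendering of that gatekeeping, with hwblkclr_blocks standing in for the FP block-store body (a hypothetical name used only for this sketch):

	#include <stddef.h>
	#include <stdint.h>

	extern void bzero(void *, size_t);
	extern int hwblkclr_blocks(void *, size_t);	/* hypothetical */

	#define	VIS_BLOCKSIZE	64

	/*
	 * Model of hwblkclr's entry checks: fall back to bzero and
	 * return 1 when the block-store criteria are not met, so the
	 * caller knows block operations were not used.
	 */
	static int
	hwblkclr_model(void *addr, size_t len)
	{
		if (((uintptr_t)addr & (VIS_BLOCKSIZE - 1)) != 0 ||
		    len < 256 || (len & (VIS_BLOCKSIZE - 1)) != 0) {
			bzero(addr, len);
			return (1);	/* did not use block operations */
		}
		return (hwblkclr_blocks(addr, len));	/* 0 on success */
	}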
3531 -#ifdef lint
3532 -/*ARGSUSED*/
3533 -int
3534 -hwblkclr(void *addr, size_t len)
3535 -{
3536 - return(0);
3537 -}
3538 -#else /* lint */
3539 3399 ! %i0 - start address
3540 3400 ! %i1 - length of region (multiple of 64)
3541 3401 ! %l0 - saved fprs
3542 3402 ! %l1 - pointer to saved %d0 block
3543 3403 ! %l2 - saved curthread->t_lwp
3544 3404
3545 3405 ENTRY(hwblkclr)
3546 3406 ! get another window w/space for one aligned block of saved fpregs
3547 3407 save %sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp
3548 3408
3549 3409 ! Must be block-aligned
3550 3410 andcc %i0, (VIS_BLOCKSIZE-1), %g0
3551 3411 bnz,pn %ncc, 1f
3552 3412 nop
3553 3413
3554 3414 ! ... and must be 256 bytes or more
3555 3415 cmp %i1, 256
3556 3416 blu,pn %ncc, 1f
3557 3417 nop
3558 3418
3559 3419 ! ... and length must be a multiple of VIS_BLOCKSIZE
3560 3420 andcc %i1, (VIS_BLOCKSIZE-1), %g0
3561 3421 bz,pn %ncc, 2f
3562 3422 nop
3563 3423
3564 3424 1: ! punt, call bzero but notify the caller that bzero was used
3565 3425 mov %i0, %o0
3566 3426 call bzero
3567 3427 mov %i1, %o1
3568 3428 ret
3569 3429 restore %g0, 1, %o0 ! return (1) - did not use block operations
3570 3430
3571 3431 2: rd %fprs, %l0 ! check for unused fp
3572 3432 btst FPRS_FEF, %l0
3573 3433 bz,pt %icc, 1f
3574 3434 nop
3575 3435
3576 3436 ! save in-use fpregs on stack
3577 3437 membar #Sync
3578 3438 add %fp, STACK_BIAS - 65, %l1
3579 3439 and %l1, -VIS_BLOCKSIZE, %l1
3580 3440 stda %d0, [%l1]ASI_BLK_P
3581 3441
3582 3442 1: membar #StoreStore|#StoreLoad|#LoadStore
3583 3443 wr %g0, FPRS_FEF, %fprs
3584 3444 wr %g0, ASI_BLK_P, %asi
3585 3445
3586 3446 ! Clear block
3587 3447 fzero %d0
3588 3448 fzero %d2
3589 3449 fzero %d4
3590 3450 fzero %d6
3591 3451 fzero %d8
3592 3452 fzero %d10
3593 3453 fzero %d12
3594 3454 fzero %d14
3595 3455
3596 3456 mov 256, %i3
3597 3457 ba,pt %ncc, .pz_doblock
3598 3458 nop
3599 3459
3600 3460 .pz_blkstart:
3601 3461 ! stda %d0, [%i0 + 192]%asi ! in dly slot of branch that got us here
3602 3462 stda %d0, [%i0 + 128]%asi
3603 3463 stda %d0, [%i0 + 64]%asi
3604 3464 stda %d0, [%i0]%asi
3605 3465 .pz_zinst:
3606 3466 add %i0, %i3, %i0
3607 3467 sub %i1, %i3, %i1
3608 3468 .pz_doblock:
3609 3469 cmp %i1, 256
3610 3470 bgeu,a %ncc, .pz_blkstart
3611 3471 stda %d0, [%i0 + 192]%asi
3612 3472
3613 3473 cmp %i1, 64
3614 3474 blu %ncc, .pz_finish
3615 3475
3616 3476 andn %i1, (64-1), %i3
3617 3477 srl %i3, 4, %i2 ! using blocks, 1 instr / 16 words
3618 3478 set .pz_zinst, %i4
3619 3479 sub %i4, %i2, %i4
3620 3480 jmp %i4
3621 3481 nop
3622 3482
3623 3483 .pz_finish:
3624 3484 membar #Sync
3625 3485 btst FPRS_FEF, %l0
3626 3486 bz,a .pz_finished
3627 3487 wr %l0, 0, %fprs ! restore fprs
3628 3488
3629 3489 ! restore fpregs from stack
3630 3490 ldda [%l1]ASI_BLK_P, %d0
3631 3491 membar #Sync
3632 3492 wr %l0, 0, %fprs ! restore fprs
3633 3493
3634 3494 .pz_finished:
3635 3495 ret
3636 3496 restore %g0, 0, %o0 ! return (bzero or not)
3637 3497
3638 3498 SET_SIZE(hwblkclr)
3639 -#endif /* lint */
3640 3499
3641 -#ifdef lint
3642 -/*ARGSUSED*/
3643 -void
3644 -hw_pa_bcopy32(uint64_t src, uint64_t dst)
3645 -{}
3646 -#else /*!lint */
3647 3500 /*
3648 3501 * Copy 32 bytes of data from src (%o0) to dst (%o1)
3649 3502 * using physical addresses.
3650 3503 */
3651 3504 ENTRY_NP(hw_pa_bcopy32)
3652 3505 rdpr %pstate, %g1
3653 3506 andn %g1, PSTATE_IE, %g2
3654 3507 wrpr %g0, %g2, %pstate
3655 3508
3656 3509 rdpr %pstate, %g0
3657 3510 ldxa [%o0]ASI_MEM, %o2
3658 3511 add %o0, 8, %o0
3659 3512 ldxa [%o0]ASI_MEM, %o3
3660 3513 add %o0, 8, %o0
3661 3514 ldxa [%o0]ASI_MEM, %o4
3662 3515 add %o0, 8, %o0
3663 3516 ldxa [%o0]ASI_MEM, %o5
3664 3517 membar #Sync
3665 3518
3666 3519 stxa %o2, [%o1]ASI_MEM
3667 3520 add %o1, 8, %o1
3668 3521 stxa %o3, [%o1]ASI_MEM
3669 3522 add %o1, 8, %o1
3670 3523 stxa %o4, [%o1]ASI_MEM
3671 3524 add %o1, 8, %o1
3672 3525 stxa %o5, [%o1]ASI_MEM
3673 3526
3674 3527 retl
3675 3528 wrpr %g0, %g1, %pstate
3676 3529
3677 3530 SET_SIZE(hw_pa_bcopy32)
3678 3531
3679 -#endif /* lint */
3680 -
3681 -#if defined(lint)
3682 -
3683 -int use_hw_bcopy = 1;
3684 -int use_hw_bzero = 1;
3685 -uint_t hw_copy_limit_1 = 0;
3686 -uint_t hw_copy_limit_2 = 0;
3687 -uint_t hw_copy_limit_4 = 0;
3688 -uint_t hw_copy_limit_8 = 0;
3689 -
3690 -#else /* !lint */
3691 -
3692 3532 DGDEF(use_hw_bcopy)
3693 3533 .word 1
3694 3534 DGDEF(use_hw_bzero)
3695 3535 .word 1
3696 3536 DGDEF(hw_copy_limit_1)
3697 3537 .word 0
3698 3538 DGDEF(hw_copy_limit_2)
3699 3539 .word 0
3700 3540 DGDEF(hw_copy_limit_4)
3701 3541 .word 0
3702 3542 DGDEF(hw_copy_limit_8)
3703 3543 .word 0
3704 3544
3705 3545 .align 64
3706 3546 .section ".text"
3707 -#endif /* !lint */